FIX: remove the tmp inventory files after the S3 uploads check.

FIX: remove the tmp inventory files after the S3 uploads check.

diff --git a/lib/s3_inventory.rb b/lib/s3_inventory.rb
index 501cb91..ebe40fd 100644
--- a/lib/s3_inventory.rb
+++ b/lib/s3_inventory.rb
@@ -31,50 +31,58 @@ class S3Inventory
     end
 
     DistributedMutex.synchronize("s3_inventory_list_missing_#{type}") do
-      download_inventory_files_to_tmp_directory
-      decompress_inventory_files
+      begin
+        files.each do |file|
+          next if File.exists?(file[:filename][0...-3])
 
-      multisite_prefix = "uploads/#{RailsMultisite::ConnectionManagement.current_db}/"
-      ActiveRecord::Base.transaction do
-        begin
-          connection.exec("CREATE TEMP TABLE #{table_name}(url text UNIQUE, etag text, PRIMARY KEY(etag, url))")
-          connection.copy_data("COPY #{table_name} FROM STDIN CSV") do
-            files.each do |file|
-              CSV.foreach(file[:filename][0...-3], headers: false) do |row|
-                key = row[CSV_KEY_INDEX]
-                next if Rails.configuration.multisite && key.exclude?(multisite_prefix)
-                url = File.join(Discourse.store.absolute_base_url, key)
-                connection.put_copy_data("#{url},#{row[CSV_ETAG_INDEX]}\n")
+          download_inventory_file_to_tmp_directory(file)
+          decompress_inventory_file(file)
+        end
+
+        multisite_prefix = "uploads/#{RailsMultisite::ConnectionManagement.current_db}/"
+        ActiveRecord::Base.transaction do
+          begin
+            connection.exec("CREATE TEMP TABLE #{table_name}(url text UNIQUE, etag text, PRIMARY KEY(etag, url))")
+            connection.copy_data("COPY #{table_name} FROM STDIN CSV") do
+              files.each do |file|
+                CSV.foreach(file[:filename][0...-3], headers: false) do |row|
+                  key = row[CSV_KEY_INDEX]
+                  next if Rails.configuration.multisite && key.exclude?(multisite_prefix)
+                  url = File.join(Discourse.store.absolute_base_url, key)
+                  connection.put_copy_data("#{url},#{row[CSV_ETAG_INDEX]}\n")
+                end
               end
             end
-          end
 
-          # backfilling etags
-          connection.async_exec("UPDATE #{model.table_name}
-            SET etag = #{table_name}.etag
-            FROM #{table_name}
-            WHERE #{model.table_name}.etag IS NULL
-              AND #{model.table_name}.url = #{table_name}.url")
+            # backfilling etags
+            connection.async_exec("UPDATE #{model.table_name}
+              SET etag = #{table_name}.etag
+              FROM #{table_name}
+              WHERE #{model.table_name}.etag IS NULL
+                AND #{model.table_name}.url = #{table_name}.url")
 
-          list_missing_post_uploads if type == "original"
+            list_missing_post_uploads if type == "original"
 
-          uploads = (model == Upload) ? model.by_users.where("created_at < ?", inventory_date) : model
-          missing_uploads = uploads
-            .joins("LEFT JOIN #{table_name} ON #{table_name}.etag = #{model.table_name}.etag")
-            .where("#{table_name}.etag IS NULL AND #{model.table_name}.etag IS NOT NULL")
+            uploads = (model == Upload) ? model.by_users.where("created_at < ?", inventory_date) : model
+            missing_uploads = uploads
+              .joins("LEFT JOIN #{table_name} ON #{table_name}.etag = #{model.table_name}.etag")
+              .where("#{table_name}.etag IS NULL AND #{model.table_name}.etag IS NOT NULL")
 
-          if (missing_count = missing_uploads.count) > 0
-            missing_uploads.select(:id, :url).find_each do |upload|
-              log upload.url
+            if (missing_count = missing_uploads.count) > 0
+              missing_uploads.select(:id, :url).find_each do |upload|
+                log upload.url
+              end
+
+              log "#{missing_count} of #{uploads.count} #{model.name.underscore.pluralize} are missing"
             end
 
-            log "#{missing_count} of #{uploads.count} #{model.name.underscore.pluralize} are missing"
+            Discourse.stats.set("missing_s3_#{model.table_name}", missing_count)
+          ensure
+            connection.exec("DROP TABLE #{table_name}") unless connection.nil?
           end
-
-          Discourse.stats.set("missing_s3_#{model.table_name}", missing_count)
-        ensure
-          connection.exec("DROP TABLE #{table_name}") unless connection.nil?
         end
+      ensure
+        cleanup!
       end
     end
   end
@@ -118,22 +126,18 @@ class S3Inventory
     log "#{missing[:count]} post uploads are missing."
   end
 
-  def download_inventory_files_to_tmp_directory
-    files.each do |file|
-      next if File.exists?(file[:filename])
+  def download_inventory_file_to_tmp_directory(file)
+    return if File.exists?(file[:filename])
 
-      log "Downloading inventory file '#{file[:key]}' to tmp directory..."
-      failure_message = "Failed to inventory file '#{file[:key]}' to tmp directory."
+    log "Downloading inventory file '#{file[:key]}' to tmp directory..."
+    failure_message = "Failed to inventory file '#{file[:key]}' to tmp directory."
 
-      @s3_helper.download_file(file[:key], file[:filename], failure_message)
-    end
+    @s3_helper.download_file(file[:key], file[:filename], failure_message)
   end
 
-  def decompress_inventory_files
-    files.each do |file|
-      log "Decompressing inventory file '#{file[:filename]}', this may take a while..."
-      Discourse::Utils.execute_command('gzip', '--decompress', file[:filename], failure_message: "Failed to decompress inventory file '#{file[:filename]}'.", chdir: tmp_directory)
-    end
+  def decompress_inventory_file(file)
+    log "Decompressing inventory file '#{file[:filename]}', this may take a while..."
+    Discourse::Utils.execute_command('gzip', '--decompress', file[:filename], failure_message: "Failed to decompress inventory file '#{file[:filename]}'.", chdir: tmp_directory)
   end
 
   def update_bucket_policy
@@ -173,6 +177,13 @@ class S3Inventory
 
   private
 
+  def cleanup!
+    files.each do |file|
+      File.delete(file[:filename]) if File.exists?(file[:filename])
+      File.delete(file[:filename][0...-3]) if File.exists?(file[:filename][0...-3])
+    end
+  end
+
   def connection
     @connection ||= ActiveRecord::Base.connection.raw_connection
   end
@@ -202,8 +213,7 @@ class S3Inventory
   def tmp_directory
     @tmp_directory ||= begin
       current_db = RailsMultisite::ConnectionManagement.current_db
-      timestamp = Time.now.strftime("%Y-%m-%d-%H%M%S")
-      directory = File.join(Rails.root, "tmp", INVENTORY_PREFIX, current_db, timestamp)
+      directory = File.join(Rails.root, "tmp", INVENTORY_PREFIX, current_db)
       FileUtils.mkdir_p(directory)
       directory
     end
diff --git a/spec/components/s3_inventory_spec.rb b/spec/components/s3_inventory_spec.rb
index a63913a..d29a302 100644
--- a/spec/components/s3_inventory_spec.rb
+++ b/spec/components/s3_inventory_spec.rb
@@ -48,6 +48,8 @@ describe "S3Inventory" do
         next_marker: "eyJNYXJrZXIiOiBudWxsLCAiYm90b190cnVuY2F0ZV9hbW91bnQiOiAyfQ=="
       }
     })
+
+    inventory.stubs(:cleanup!)
   end
 
   it "should raise error if an inventory file is not found" do
@@ -67,9 +69,7 @@ describe "S3Inventory" do
     Fabricate(:upload, etag: "ETag2", created_at: Time.now)
     Fabricate(:upload, created_at: 2.days.ago)
 
-    inventory.expects(:download_inventory_files_to_tmp_directory)
-    inventory.expects(:decompress_inventory_files)
-    inventory.expects(:files).returns([{ key: "Key", filename: "#{csv_filename}.gz" }]).times(2)
+    inventory.expects(:files).returns([{ key: "Key", filename: "#{csv_filename}.gz" }]).times(3)
     inventory.expects(:inventory_date).returns(Time.now)
 
     output = capture_stdout do
@@ -87,9 +87,7 @@ describe "S3Inventory" do
     ]
     files.each { |file| Fabricate(:upload, url: file[0]) }
 
-    inventory.expects(:download_inventory_files_to_tmp_directory)
-    inventory.expects(:decompress_inventory_files)

[... diff too long, it was truncated ...]

GitHub sha: 9919ee19

I recommend relying less on stubbing and mocking in these tests. In an integration test, the only thing that really needs to be mocked is the HTTP request that fetches the inventory; the test can then write the inventory to the file system and delete it afterwards.

2 Likes