FIX: strip invalid byte sequences

FIX: strip invalid byte sequences

diff --git a/lib/text_cleaner.rb b/lib/text_cleaner.rb
index 48307d3..cdbccc1 100644
--- a/lib/text_cleaner.rb
+++ b/lib/text_cleaner.rb
@@ -27,6 +27,8 @@ class TextCleaner
   end
 
   def self.clean(text, opts = {})
+    # Remove invalid byte sequences
+    text.scrub!("")
     # Replace !!!!! with a single !
     text.gsub!(/!+/, '!') if opts[:deduplicate_exclamation_marks]
     # Replace ????? with a single ?
diff --git a/spec/components/text_cleaner_spec.rb b/spec/components/text_cleaner_spec.rb
index d5cc332..e98ca72 100644
--- a/spec/components/text_cleaner_spec.rb
+++ b/spec/components/text_cleaner_spec.rb
@@ -229,4 +229,12 @@ describe TextCleaner do
     end
   end
 
+  context "invalid byte sequence" do
+    let(:with_invalid_bytes) { "abc\u3042\x81" }
+    let(:without_invalid_bytes) { "abc\u3042" }
+
+    it "removes invalid bytes" do
+      expect(TextCleaner.clean(with_invalid_bytes)).to eq(without_invalid_bytes)
+    end
+  end
 end

GitHub sha: 3d9981ac

1 Like