FIX: Add word boundaries to replace and tag watched words (#13405)

FIX: Add word boundaries to replace and tag watched words (#13405)

The generated regular expressions did not contain word boundaries (\b), so they matched any text that contained the word, even when it was only a substring of a longer word.

For example, if “art” was a watched word, a post containing the word “artist” also matched.

diff --git a/app/assets/javascripts/discourse/tests/acceptance/admin-watched-words-test.js b/app/assets/javascripts/discourse/tests/acceptance/admin-watched-words-test.js
index d04bf15..ecc2958 100644
--- a/app/assets/javascripts/discourse/tests/acceptance/admin-watched-words-test.js
+++ b/app/assets/javascripts/discourse/tests/acceptance/admin-watched-words-test.js
@@ -118,7 +118,6 @@ acceptance("Admin - Watched Words - Bad regular expressions", function (needs) {
             action: "block",
           },
         ],
-        regular_expressions: true,
         compiled_regular_expressions: {
           block: null,
           censor: null,
diff --git a/app/assets/javascripts/discourse/tests/fixtures/watched-words-fixtures.js b/app/assets/javascripts/discourse/tests/fixtures/watched-words-fixtures.js
index 8a6ead3..8a5ac4b 100644
--- a/app/assets/javascripts/discourse/tests/fixtures/watched-words-fixtures.js
+++ b/app/assets/javascripts/discourse/tests/fixtures/watched-words-fixtures.js
@@ -11,14 +11,14 @@ export default {
       {
         id: 7,
         word: "hi",
-        regexp: "hi",
+        regexp: "(hi)",
         replacement: "hello",
         action: "replace",
       },
       {
         id: 8,
         word: "hello",
-        regexp: "hello",
+        regexp: "(hello)",
         replacement: "greeting",
         action: "tag",
       },
diff --git a/app/assets/javascripts/discourse/tests/unit/lib/pretty-text-test.js b/app/assets/javascripts/discourse/tests/unit/lib/pretty-text-test.js
index f07b88b..1b8d795 100644
--- a/app/assets/javascripts/discourse/tests/unit/lib/pretty-text-test.js
+++ b/app/assets/javascripts/discourse/tests/unit/lib/pretty-text-test.js
@@ -1675,21 +1675,21 @@ var bar = 'bar';
 
   test("watched words replace", function (assert) {
     const opts = {
-      watchedWordsReplace: { fun: "times" },
+      watchedWordsReplace: { "(?:\\W|^)(fun)(?=\\W|$)": "times" },
     };
 
-    assert.cookedOptions("test fun", opts, "<p>test times</p>");
+    assert.cookedOptions("test fun funny", opts, "<p>test times funny</p>");
   });
 
   test("watched words link", function (assert) {
     const opts = {
-      watchedWordsLink: { fun: "https://discourse.org" },
+      watchedWordsLink: { "(?:\\W|^)(fun)(?=\\W|$)": "https://discourse.org" },
     };
 
     assert.cookedOptions(
-      "test fun",
+      "test fun funny",
       opts,
-      '<p>test <a href="https://discourse.org">fun</a></p>'
+      '<p>test <a href="https://discourse.org">fun</a> funny</p>'
     );
   });
 
@@ -1697,7 +1697,7 @@ var bar = 'bar';
     const maxMatches = 100; // same limit as MD watched-words-replace plugin
     const opts = {
       siteSettings: { watched_words_regular_expressions: true },
-      watchedWordsReplace: { "\\bu?\\b": "you" },
+      watchedWordsReplace: { "(\\bu?\\b)": "you" },
     };
 
     assert.cookedOptions(
diff --git a/app/assets/javascripts/pretty-text/engines/discourse-markdown/watched-words.js b/app/assets/javascripts/pretty-text/engines/discourse-markdown/watched-words.js
index c8ad08a..cce3925 100644
--- a/app/assets/javascripts/pretty-text/engines/discourse-markdown/watched-words.js
+++ b/app/assets/javascripts/pretty-text/engines/discourse-markdown/watched-words.js
@@ -20,8 +20,8 @@ function findAllMatches(text, matchers) {
       count++ < MAX_MATCHES
     ) {
       matches.push({
-        index: match.index,
-        text: match[0],
+        index: match.index + match[0].indexOf(match[1]),
+        text: match[1],
         replacement: matcher.replacement,
         link: matcher.link,
       });
diff --git a/app/serializers/watched_word_serializer.rb b/app/serializers/watched_word_serializer.rb
index 4b3b138..070da70 100644
--- a/app/serializers/watched_word_serializer.rb
+++ b/app/serializers/watched_word_serializer.rb
@@ -4,7 +4,7 @@ class WatchedWordSerializer < ApplicationSerializer
   attributes :id, :word, :regexp, :replacement, :action
 
   def regexp
-    WordWatcher.word_to_regexp(word)
+    WordWatcher.word_to_regexp(word, whole: true)
   end
 
   def action
diff --git a/app/services/word_watcher.rb b/app/services/word_watcher.rb
index 3669914..1927ac9 100644
--- a/app/services/word_watcher.rb
+++ b/app/services/word_watcher.rb
@@ -54,17 +54,26 @@ class WordWatcher
 
   def self.word_matcher_regexps(action)
     if words = get_cached_words(action)
-      words.map { |w, r| [word_to_regexp(w), r] }.to_h
+      words.map { |w, r| [word_to_regexp(w, whole: true), r] }.to_h
     end
   end
 
-  def self.word_to_regexp(word)
+  def self.word_to_regexp(word, whole: false)
     if SiteSetting.watched_words_regular_expressions?
       # Strip ruby regexp format if present, we're going to make the whole thing
       # case insensitive anyway
-      return word.start_with?("(?-mix:") ? word[7..-2] : word
+      regexp = word.start_with?("(?-mix:") ? word[7..-2] : word
+      regexp = "(#{regexp})" if whole
+      return regexp
     end
-    Regexp.escape(word).gsub("\\*", '\S*')
+
+    regexp = Regexp.escape(word).gsub("\\*", '\S*')
+
+    if whole && !SiteSetting.watched_words_regular_expressions?
+      regexp = "(?:\\W|^)(#{regexp})(?=\\W|$)"
+    end
+
+    regexp
   end
 
   def self.word_matcher_regexp_key(action)
@@ -144,6 +153,6 @@ class WordWatcher
   end
 
   def word_matches?(word)
-    Regexp.new(WordWatcher.word_to_regexp(word), Regexp::IGNORECASE).match?(@raw)
+    Regexp.new(WordWatcher.word_to_regexp(word, whole: true), Regexp::IGNORECASE).match?(@raw)
   end
 end
diff --git a/spec/components/post_creator_spec.rb b/spec/components/post_creator_spec.rb
index 5827de6..e01db2b 100644
--- a/spec/components/post_creator_spec.rb
+++ b/spec/components/post_creator_spec.rb
@@ -502,13 +502,21 @@ describe PostCreator do
             end
 
             context "without regular expressions" do
-              it "works" do
+              it "works with many tags" do
                 Fabricate(:watched_word, action: WatchedWord.actions[:tag], word: "HELLO", replacement: "greetings , hey")
 
                 @post = creator.create
                 expect(@post.topic.tags.map(&:name)).to match_array(['greetings', 'hey'])
               end
 
+              it "works with overlapping words" do
+                Fabricate(:watched_word, action: WatchedWord.actions[:tag], word: "art", replacement: "about-art")
+                Fabricate(:watched_word, action: WatchedWord.actions[:tag], word: "artist*", replacement: "about-artists")
+
+                post = PostCreator.new(user, title: "hello world topic", raw: "this is topic abour artists", archetype_id: 1).create
+                expect(post.topic.tags.map(&:name)).to match_array(['about-artists'])
+              end
+
               it "does not treat as regular expressions" do
                 Fabricate(:watched_word, action: WatchedWord.actions[:tag], word: "he(llo|y)", replacement: "greetings , hey")
 
diff --git a/spec/components/pretty_text_spec.rb b/spec/components/pretty_text_spec.rb
index 0d53880..3d31c1a 100644
--- a/spec/components/pretty_text_spec.rb
+++ b/spec/components/pretty_text_spec.rb
@@ -1420,6 +1420,10 @@ HTML
       expect(PrettyText.cook("Lorem ipsum dolor sittt amet")).to match_html(<<~HTML)
         <p>Lorem ipsum something else amet</p>
       HTML
+
+      expect(PrettyText.cook("Lorem ipsum xdolor sit amet")).to match_html(<<~HTML)
+        <p>Lorem ipsum xdolor sit amet</p>
+      HTML
     end
 
     it "replaces words with links" do

GitHub sha: 74f72956316cec70bc6eba141cf1b04fe189a49c

This commit appears in #13405 which was approved by eviltrout. It was merged by nbianca.