Don't add \b to whole-word keywords that don't start with word characters.

Ditto for ending with \b.

Consider muting the phrase "(hot take)".  I stipulate it is reasonable
to enter this with the default "match whole word" behavior.  Under the
old behavior, this would be encoded as

    \b\(hot\ take\)\b

However, if \b is before the first character in the string and the first
character in the string is not a word character, then the match will
fail.  Ditto for after.  In our example, "(" is not a word character, so
this will not match statuses containing "(hot take)", and that's a very
surprising behavior.

To address this, we only add leading and trailing \b to keywords that
start or end with word characters.
This commit is contained in:
David Yip 2017-10-22 00:24:32 -05:00
parent 19826774f0
commit 4b68e82a19
2 changed files with 34 additions and 14 deletions

View file

@ -19,35 +19,49 @@ class Glitch::KeywordMute < ApplicationRecord
after_commit :invalidate_cached_matcher after_commit :invalidate_cached_matcher
def self.matcher_for(account_id) def self.matcher_for(account_id)
Rails.cache.fetch("keyword_mutes:matcher:#{account_id}") { Matcher.new(account_id) } Matcher.new(account_id)
end end
private private
def invalidate_cached_matcher def invalidate_cached_matcher
Rails.cache.delete("keyword_mutes:matcher:#{account_id}") Rails.cache.delete("keyword_mutes:regex:#{account_id}")
end end
class Matcher class Matcher
attr_reader :account_id
attr_reader :regex attr_reader :regex
def initialize(account_id) def initialize(account_id)
re = [].tap do |arr| @account_id = account_id
Glitch::KeywordMute.where(account_id: account_id).select(:keyword, :id, :whole_word).find_each do |m| @regex = Rails.cache.fetch("keyword_mutes:regex:#{account_id}") { regex_for_account }
boundary = m.whole_word ? '\b' : '' end
arr << "#{boundary}#{Regexp.escape(m.keyword.strip)}#{boundary}"
def keywords
Glitch::KeywordMute.
where(account_id: account_id).
select(:keyword, :id, :whole_word)
end
def regex_for_account
re_text = [].tap do |arr|
keywords.find_each do |kw|
arr << (kw.whole_word ? boundary_regex_for_keyword(kw.keyword) : Regexp.escape(kw.keyword))
end end
end.join('|') end.join('|')
@regex = /#{re}/i unless re.empty? /#{re_text}/i unless re_text.empty?
end
def boundary_regex_for_keyword(keyword)
sb = keyword =~ /\A[[:word:]]/ ? '\b' : ''
eb = keyword =~ /[[:word:]]\Z/ ? '\b' : ''
"#{sb}#{Regexp.escape(keyword)}#{eb}"
end end
def =~(str) def =~(str)
regex ? regex =~ str : false regex ? regex =~ str : false
end end
def matches?(str)
!!(regex =~ str)
end
end end
end end

View file

@ -7,7 +7,7 @@ RSpec.describe Glitch::KeywordMute, type: :model do
describe '.matcher_for' do describe '.matcher_for' do
let(:matcher) { Glitch::KeywordMute.matcher_for(alice) } let(:matcher) { Glitch::KeywordMute.matcher_for(alice) }
describe 'with no Glitch::KeywordMutes for an account' do describe 'with no mutes' do
before do before do
Glitch::KeywordMute.delete_all Glitch::KeywordMute.delete_all
end end
@ -17,7 +17,7 @@ RSpec.describe Glitch::KeywordMute, type: :model do
end end
end end
describe 'with Glitch::KeywordMutes for an account' do describe 'with mutes' do
it 'does not match keywords set by a different account' do it 'does not match keywords set by a different account' do
Glitch::KeywordMute.create!(account: bob, keyword: 'take') Glitch::KeywordMute.create!(account: bob, keyword: 'take')
@ -63,7 +63,13 @@ RSpec.describe Glitch::KeywordMute, type: :model do
it 'matches keywords surrounded by non-alphanumeric ornamentation' do it 'matches keywords surrounded by non-alphanumeric ornamentation' do
Glitch::KeywordMute.create!(account: alice, keyword: 'hot') Glitch::KeywordMute.create!(account: alice, keyword: 'hot')
expect(matcher =~ 'This is a ~*HOT*~ take').to be_truthy expect(matcher =~ '(hot take)').to be_truthy
end
it 'escapes metacharacters in keywords' do
Glitch::KeywordMute.create!(account: alice, keyword: '(hot take)')
expect(matcher =~ '(hot take)').to be_truthy
end end
it 'uses case-folding rules appropriate for more than just English' do it 'uses case-folding rules appropriate for more than just English' do