From 4f6af87906175d9ea802ef0c6f050388eac890fa Mon Sep 17 00:00:00 2001 From: Eugen Rochko Date: Wed, 18 Sep 2019 12:53:13 +0200 Subject: [PATCH] Change spam check to apply to local accounts and add a threshold (#11806) Instead of detecting spam on first duplicate message, add a threshold of 5 such messages to reduce false positives --- app/lib/activitypub/activity/create.rb | 10 +----- app/lib/spam_check.rb | 46 +++++++++++++++++++----- app/services/process_mentions_service.rb | 5 +++ spec/lib/spam_check_spec.rb | 34 +++++++++++------- 4 files changed, 66 insertions(+), 29 deletions(-) diff --git a/app/lib/activitypub/activity/create.rb b/app/lib/activitypub/activity/create.rb index dea7fd43c..e69193b71 100644 --- a/app/lib/activitypub/activity/create.rb +++ b/app/lib/activitypub/activity/create.rb @@ -408,15 +408,7 @@ class ActivityPub::Activity::Create < ActivityPub::Activity end def check_for_spam - spam_check = SpamCheck.new(@status) - - return if spam_check.skip? - - if spam_check.spam? - spam_check.flag! - else - spam_check.remember! - end + SpamCheck.perform(@status) end def forward_for_reply diff --git a/app/lib/spam_check.rb b/app/lib/spam_check.rb index 0cf1b8790..441697364 100644 --- a/app/lib/spam_check.rb +++ b/app/lib/spam_check.rb @@ -4,9 +4,25 @@ class SpamCheck include Redisable include ActionView::Helpers::TextHelper + # Threshold over which two Nilsimsa values are considered + # to refer to the same text NILSIMSA_COMPARE_THRESHOLD = 95 - NILSIMSA_MIN_SIZE = 10 - EXPIRE_SET_AFTER = 1.week.seconds + + # Nilsimsa doesn't work well on small inputs, so below + # this size, we check only for exact matches with MD5 + NILSIMSA_MIN_SIZE = 10 + + # How long to keep the trail of digests between updates, + # there is no reason to store it forever + EXPIRE_SET_AFTER = 1.week.seconds + + # How many digests to keep in an account's trail. If it's + # too small, spam could rotate around different message templates + MAX_TRAIL_SIZE = 10 + + # How many detected duplicates to allow through before + # considering the message as spam + THRESHOLD = 5 def initialize(status) @account = status.account @@ -21,9 +37,9 @@ class SpamCheck if insufficient_data? false elsif nilsimsa? - any_other_digest?('nilsimsa') { |_, other_digest| nilsimsa_compare_value(digest, other_digest) >= NILSIMSA_COMPARE_THRESHOLD } + digests_over_threshold?('nilsimsa') { |_, other_digest| nilsimsa_compare_value(digest, other_digest) >= NILSIMSA_COMPARE_THRESHOLD } else - any_other_digest?('md5') { |_, other_digest| other_digest == digest } + digests_over_threshold?('md5') { |_, other_digest| other_digest == digest } end end @@ -38,7 +54,7 @@ class SpamCheck # get the correct status ID back, we have to save it in the string value redis.zadd(redis_key, @status.id, digest_with_algorithm) - redis.zremrangebyrank(redis_key, '0', '-10') + redis.zremrangebyrank(redis_key, 0, -(MAX_TRAIL_SIZE + 1)) redis.expire(redis_key, EXPIRE_SET_AFTER) end @@ -78,6 +94,20 @@ class SpamCheck end end + class << self + def perform(status) + spam_check = new(status) + + return if spam_check.skip? + + if spam_check.spam? + spam_check.flag! + else + spam_check.remember! + end + end + end + private def disabled? @@ -149,14 +179,14 @@ class SpamCheck redis.zrange(redis_key, 0, -1) end - def any_other_digest?(filter_algorithm) - other_digests.any? do |record| + def digests_over_threshold?(filter_algorithm) + other_digests.select do |record| algorithm, other_digest, status_id = record.split(':') next unless algorithm == filter_algorithm yield algorithm, other_digest, status_id - end + end.size >= THRESHOLD end def matching_status_ids diff --git a/app/services/process_mentions_service.rb b/app/services/process_mentions_service.rb index 90dca9740..2f7a9e985 100644 --- a/app/services/process_mentions_service.rb +++ b/app/services/process_mentions_service.rb @@ -33,6 +33,7 @@ class ProcessMentionsService < BaseService end status.save! + check_for_spam(status) mentions.each { |mention| create_notification(mention) } end @@ -61,4 +62,8 @@ class ProcessMentionsService < BaseService def resolve_account_service ResolveAccountService.new end + + def check_for_spam(status) + SpamCheck.perform(status) + end end diff --git a/spec/lib/spam_check_spec.rb b/spec/lib/spam_check_spec.rb index 9e0989216..4cae46111 100644 --- a/spec/lib/spam_check_spec.rb +++ b/spec/lib/spam_check_spec.rb @@ -86,23 +86,33 @@ RSpec.describe SpamCheck do end it 'returns true for duplicate statuses to the same recipient' do - status1 = status_with_html('@alice Hello') - described_class.new(status1).remember! + described_class::THRESHOLD.times do + status1 = status_with_html('@alice Hello') + described_class.new(status1).remember! + end + status2 = status_with_html('@alice Hello') expect(described_class.new(status2).spam?).to be true end it 'returns true for duplicate statuses to different recipients' do - status1 = status_with_html('@alice Hello') - described_class.new(status1).remember! + described_class::THRESHOLD.times do + status1 = status_with_html('@alice Hello') + described_class.new(status1).remember! + end + status2 = status_with_html('@bob Hello') expect(described_class.new(status2).spam?).to be true end it 'returns true for nearly identical statuses with random numbers' do source_text = 'Sodium, atomic number 11, was first isolated by Humphry Davy in 1807. A chemical component of salt, he named it Na in honor of the saltiest region on earth, North America.' - status1 = status_with_html('@alice ' + source_text + ' 1234') - described_class.new(status1).remember! + + described_class::THRESHOLD.times do + status1 = status_with_html('@alice ' + source_text + ' 1234') + described_class.new(status1).remember! + end + status2 = status_with_html('@bob ' + source_text + ' 9568') expect(described_class.new(status2).spam?).to be true end @@ -140,9 +150,9 @@ RSpec.describe SpamCheck do let(:redis_key) { spam_check.send(:redis_key) } it 'remembers' do - expect do - spam_check.remember! - end.to change { Redis.current.exists(redis_key) }.from(false).to(true) + expect(Redis.current.exists(redis_key)).to be true + spam_check.remember! + expect(Redis.current.exists(redis_key)).to be true end end @@ -156,9 +166,9 @@ RSpec.describe SpamCheck do end it 'resets' do - expect do - spam_check.reset! - end.to change { Redis.current.exists(redis_key) }.from(true).to(false) + expect(Redis.current.exists(redis_key)).to be true + spam_check.reset! + expect(Redis.current.exists(redis_key)).to be false end end