Change spam check to apply to local accounts and add a threshold (#11806)
Instead of detecting spam on first duplicate message, add a threshold of 5 such messages to reduce false positives
This commit is contained in:
parent 577706987d
commit 4f6af87906
4 changed files with 66 additions and 29 deletions
		|  | @@ -4,9 +4,25 @@ class SpamCheck | |||
|   include Redisable | ||||
|   include ActionView::Helpers::TextHelper | ||||
| 
 | ||||
|   # Threshold over which two Nilsimsa values are considered | ||||
|   # to refer to the same text | ||||
|   NILSIMSA_COMPARE_THRESHOLD = 95 | ||||
|   NILSIMSA_MIN_SIZE          = 10 | ||||
|   EXPIRE_SET_AFTER           = 1.week.seconds | ||||
| 
 | ||||
|   # Nilsimsa doesn't work well on small inputs, so below | ||||
|   # this size, we check only for exact matches with MD5 | ||||
|   NILSIMSA_MIN_SIZE = 10 | ||||
| 
 | ||||
|   # How long to keep the trail of digests between updates, | ||||
|   # there is no reason to store it forever | ||||
|   EXPIRE_SET_AFTER = 1.week.seconds | ||||
| 
 | ||||
|   # How many digests to keep in an account's trail. If it's | ||||
|   # too small, spam could rotate around different message templates | ||||
|   MAX_TRAIL_SIZE = 10 | ||||
| 
 | ||||
|   # How many detected duplicates to allow through before | ||||
|   # considering the message as spam | ||||
|   THRESHOLD = 5 | ||||
| 
 | ||||
|   def initialize(status) | ||||
|     @account = status.account | ||||
|  | @@ -21,9 +37,9 @@ class SpamCheck | |||
|     if insufficient_data? | ||||
|       false | ||||
|     elsif nilsimsa? | ||||
|       any_other_digest?('nilsimsa') { |_, other_digest| nilsimsa_compare_value(digest, other_digest) >= NILSIMSA_COMPARE_THRESHOLD } | ||||
|       digests_over_threshold?('nilsimsa') { |_, other_digest| nilsimsa_compare_value(digest, other_digest) >= NILSIMSA_COMPARE_THRESHOLD } | ||||
|     else | ||||
|       any_other_digest?('md5') { |_, other_digest| other_digest == digest } | ||||
|       digests_over_threshold?('md5') { |_, other_digest| other_digest == digest } | ||||
|     end | ||||
|   end | ||||
| 
 | ||||
|  | @@ -38,7 +54,7 @@ class SpamCheck | |||
|     # get the correct status ID back, we have to save it in the string value | ||||
| 
 | ||||
|     redis.zadd(redis_key, @status.id, digest_with_algorithm) | ||||
|     redis.zremrangebyrank(redis_key, '0', '-10') | ||||
|     redis.zremrangebyrank(redis_key, 0, -(MAX_TRAIL_SIZE + 1)) | ||||
|     redis.expire(redis_key, EXPIRE_SET_AFTER) | ||||
|   end | ||||
| 
 | ||||
|  | @@ -78,6 +94,20 @@ class SpamCheck | |||
|     end | ||||
|   end | ||||
| 
 | ||||
|   class << self | ||||
|     def perform(status) | ||||
|       spam_check = new(status) | ||||
| 
 | ||||
|       return if spam_check.skip? | ||||
| 
 | ||||
|       if spam_check.spam? | ||||
|         spam_check.flag! | ||||
|       else | ||||
|         spam_check.remember! | ||||
|       end | ||||
|     end | ||||
|   end | ||||
| 
 | ||||
|   private | ||||
| 
 | ||||
|   def disabled? | ||||
|  | @@ -149,14 +179,14 @@ class SpamCheck | |||
|     redis.zrange(redis_key, 0, -1) | ||||
|   end | ||||
| 
 | ||||
|   def any_other_digest?(filter_algorithm) | ||||
|     other_digests.any? do |record| | ||||
|   def digests_over_threshold?(filter_algorithm) | ||||
|     other_digests.select do |record| | ||||
|       algorithm, other_digest, status_id = record.split(':') | ||||
| 
 | ||||
|       next unless algorithm == filter_algorithm | ||||
| 
 | ||||
|       yield algorithm, other_digest, status_id | ||||
|     end | ||||
|     end.size >= THRESHOLD | ||||
|   end | ||||
| 
 | ||||
|   def matching_status_ids | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue