Remove usernames and hashtags from language detection (#3503)
* Add failing specs for hashtag and username extraction in language detector
* Remove usernames and hashtags from text before language detection
* Handle multiple instances of special case, and reduce whitespace
This commit is contained in:
		
					parent
					
						
							
								d1e08bd38c
							
						
					
				
			
			
				commit
				
					
						d010e270e6
					
				
			
		
					 2 changed files with 47 additions and 2 deletions
				
			
		|  | @ -13,6 +13,10 @@ class LanguageDetector | ||||||
|     detected_language_code || default_locale.to_sym |     detected_language_code || default_locale.to_sym | ||||||
|   end |   end | ||||||
| 
 | 
 | ||||||
|  |   def prepared_text | ||||||
|  |     simplified_text.strip | ||||||
|  |   end | ||||||
|  | 
 | ||||||
|   private |   private | ||||||
| 
 | 
 | ||||||
|   def detected_language_code |   def detected_language_code | ||||||
|  | @ -20,18 +24,21 @@ class LanguageDetector | ||||||
|   end |   end | ||||||
| 
 | 
 | ||||||
|   def result |   def result | ||||||
|     @result ||= @identifier.find_language(text_without_urls) |     @result ||= @identifier.find_language(prepared_text) | ||||||
|   end |   end | ||||||
| 
 | 
 | ||||||
|   def detected_language_reliable? |   def detected_language_reliable? | ||||||
|     result.reliable? |     result.reliable? | ||||||
|   end |   end | ||||||
| 
 | 
 | ||||||
|   def text_without_urls |   def simplified_text | ||||||
|     text.dup.tap do |new_text| |     text.dup.tap do |new_text| | ||||||
|       URI.extract(new_text).each do |url| |       URI.extract(new_text).each do |url| | ||||||
|         new_text.gsub!(url, '') |         new_text.gsub!(url, '') | ||||||
|       end |       end | ||||||
|  |       new_text.gsub!(Account::MENTION_RE, '') | ||||||
|  |       new_text.gsub!(Tag::HASHTAG_RE, '') | ||||||
|  |       new_text.gsub!(/\s+/, ' ') | ||||||
|     end |     end | ||||||
|   end |   end | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -1,7 +1,45 @@ | ||||||
| # frozen_string_literal: true | # frozen_string_literal: true | ||||||
|  | 
 | ||||||
| require 'rails_helper' | require 'rails_helper' | ||||||
| 
 | 
 | ||||||
| describe LanguageDetector do | describe LanguageDetector do | ||||||
|  |   describe 'prepared_text' do | ||||||
|  |     it 'returns unmodified string without special cases' do | ||||||
|  |       string = 'just a regular string' | ||||||
|  |       result = described_class.new(string).prepared_text | ||||||
|  | 
 | ||||||
|  |       expect(result).to eq string | ||||||
|  |     end | ||||||
|  | 
 | ||||||
|  |     it 'collapses spacing in strings' do | ||||||
|  |       string = 'The formatting   in    this is very        odd' | ||||||
|  | 
 | ||||||
|  |       result = described_class.new(string).prepared_text | ||||||
|  |       expect(result).to eq 'The formatting in this is very odd' | ||||||
|  |     end | ||||||
|  | 
 | ||||||
|  |     it 'strips usernames from strings before detection' do | ||||||
|  |       string = '@username Yeah, very surreal...! also @friend' | ||||||
|  | 
 | ||||||
|  |       result = described_class.new(string).prepared_text | ||||||
|  |       expect(result).to eq 'Yeah, very surreal...! also' | ||||||
|  |     end | ||||||
|  | 
 | ||||||
|  |     it 'strips URLs from strings before detection' do | ||||||
|  |       string = 'Our website is https://example.com and also http://localhost.dev' | ||||||
|  | 
 | ||||||
|  |       result = described_class.new(string).prepared_text | ||||||
|  |       expect(result).to eq 'Our website is and also' | ||||||
|  |     end | ||||||
|  | 
 | ||||||
|  |     it 'strips #hashtags from strings before detection' do | ||||||
|  |       string = 'Hey look at all the #animals and #fish' | ||||||
|  | 
 | ||||||
|  |       result = described_class.new(string).prepared_text | ||||||
|  |       expect(result).to eq 'Hey look at all the and' | ||||||
|  |     end | ||||||
|  |   end | ||||||
|  | 
 | ||||||
|   describe 'to_iso_s' do |   describe 'to_iso_s' do | ||||||
|     it 'detects english language for basic strings' do |     it 'detects english language for basic strings' do | ||||||
|       strings = [ |       strings = [ | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue