Use charlock_holmes instead of nkf at FetchLinkCardService (#4080)
* Specs for language detection * Use CharlockHolmes instead of NKF * Correct mistakes * Correct style * Set hint_enc instead of falling back and strip_tags * Improve specs * Add dependencies
This commit is contained in:
		
					parent
					
						
							
								794781d121
							
						
					
				
			
			
				commit
				
					
						007ab330e6
					
				
			
		
					 11 changed files with 78 additions and 4 deletions
				
			
		|  | @ -32,6 +32,7 @@ addons: | |||
|     - g++-6 | ||||
|     - libprotobuf-dev | ||||
|     - protobuf-compiler | ||||
|     - libicu-dev | ||||
| 
 | ||||
| rvm: | ||||
|   - 2.3.4 | ||||
|  |  | |||
							
								
								
									
										1
									
								
								Aptfile
									
										
									
									
									
								
							
							
						
						
									
										1
									
								
								Aptfile
									
										
									
									
									
								
							|  | @ -3,3 +3,4 @@ libprotobuf-dev | |||
| ffmpeg | ||||
| libxdamage1 | ||||
| libxfixes3 | ||||
| libicu-dev | ||||
|  |  | |||
|  | @ -25,6 +25,7 @@ RUN echo "@edge https://nl.alpinelinux.org/alpine/edge/main" >> /etc/apk/reposit | |||
|     ffmpeg \ | ||||
|     file \ | ||||
|     git \ | ||||
|     icu-dev \ | ||||
|     imagemagick@edge \ | ||||
|     libpq \ | ||||
|     libxml2 \ | ||||
|  |  | |||
							
								
								
									
										1
									
								
								Gemfile
									
										
									
									
									
								
							
							
						
						
									
										1
									
								
								Gemfile
									
										
									
									
									
								
							|  | @ -22,6 +22,7 @@ gem 'active_model_serializers', '~> 0.10' | |||
| gem 'addressable', '~> 2.5' | ||||
| gem 'bootsnap' | ||||
| gem 'browser' | ||||
| gem 'charlock_holmes', '~> 0.7.3' | ||||
| gem 'cld3', '~> 3.1' | ||||
| gem 'devise', '~> 4.2' | ||||
| gem 'devise-two-factor', '~> 3.0' | ||||
|  |  | |||
|  | @ -106,6 +106,7 @@ GEM | |||
|       rack (>= 1.0.0) | ||||
|       rack-test (>= 0.5.4) | ||||
|       xpath (~> 2.0) | ||||
|     charlock_holmes (0.7.3) | ||||
|     case_transform (0.2) | ||||
|       activesupport | ||||
|     chunky_png (1.3.8) | ||||
|  | @ -501,6 +502,7 @@ DEPENDENCIES | |||
|   capistrano-rbenv (~> 2.1) | ||||
|   capistrano-yarn (~> 2.0) | ||||
|   capybara (~> 2.14) | ||||
|   charlock_holmes (~> 0.7.3) | ||||
|   cld3 (~> 3.1) | ||||
|   climate_control (~> 0.2) | ||||
|   devise (~> 4.2) | ||||
|  |  | |||
							
								
								
									
										1
									
								
								Vagrantfile
									
										
									
									
										vendored
									
									
								
							
							
						
						
									
										1
									
								
								Vagrantfile
									
										
									
									
										vendored
									
									
								
							|  | @ -37,6 +37,7 @@ sudo apt-get install \ | |||
|   yarn \ | ||||
|   libprotobuf-dev \ | ||||
|   libreadline-dev \ | ||||
|   libicu-dev \ | ||||
|   -y | ||||
| 
 | ||||
| # Install rvm | ||||
|  |  | |||
|  | @ -1,5 +1,4 @@ | |||
| # frozen_string_literal: true | ||||
| require 'nkf' | ||||
| 
 | ||||
| class FetchLinkCardService < BaseService | ||||
|   include HttpHelper | ||||
|  | @ -86,7 +85,12 @@ class FetchLinkCardService < BaseService | |||
|     return if response.code != 200 || response.mime_type != 'text/html' | ||||
| 
 | ||||
|     html = response.to_s | ||||
|     page = Nokogiri::HTML(html, nil, NKF.guess(html).to_s) | ||||
| 
 | ||||
|     detector = CharlockHolmes::EncodingDetector.new | ||||
|     detector.strip_tags = true | ||||
| 
 | ||||
|     guess = detector.detect(html, response.charset) | ||||
|     page = Nokogiri::HTML(html, nil, guess&.fetch(:encoding)) | ||||
| 
 | ||||
|     card.type             = :link | ||||
|     card.title            = meta_property(page, 'og:title') || page.at_xpath('//title')&.content | ||||
|  |  | |||
							
								
								
									
										20
									
								
								spec/fixtures/requests/koi8-r.txt
									
										
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										20
									
								
								spec/fixtures/requests/koi8-r.txt
									
										
									
									
										vendored
									
									
										Normal file
									
								
							|  | @ -0,0 +1,20 @@ | |||
| HTTP/1.1 200 OK | ||||
| Server: nginx/1.11.10 | ||||
| Date: Tue, 04 Jul 2017 16:43:39 GMT | ||||
| Content-Type: text/html | ||||
| Content-Length: 273 | ||||
| Connection: keep-alive | ||||
| Last-Modified: Tue, 04 Jul 2017 16:41:34 GMT | ||||
| Accept-Ranges: bytes | ||||
| 
 | ||||
| <HTML> | ||||
| <HEAD> | ||||
|   <META NAME="GENERATOR" CONTENT="Adobe PageMill 3.0J Mac"> | ||||
|   <META HTTP-EQUIV="Content-Type" CONTENT="text/html;CHARSET=koi8-r"> | ||||
|   <TITLE>Московя начинаетъ только въ XVI ст. привлекать внимане иностранцевъ.</TITLE> | ||||
| </HEAD> | ||||
| <BODY> | ||||
| <P><CENTER><B><FONT SIZE="+2">Московя начинаетъ только въ XVI ст. привлекать внимане иностранцевъ.</FONT></B><BR> | ||||
| <HR><BR> | ||||
| </BODY> | ||||
| </HTML> | ||||
							
								
								
									
										4
									
								
								spec/fixtures/requests/sjis.txt
									
										
									
									
										vendored
									
									
								
							
							
						
						
									
										4
									
								
								spec/fixtures/requests/sjis.txt
									
										
									
									
										vendored
									
									
								
							|  | @ -11,10 +11,10 @@ Accept-Ranges: bytes | |||
| <HEAD> | ||||
|   <META NAME="GENERATOR" CONTENT="Adobe PageMill 3.0J Mac"> | ||||
|   <META HTTP-EQUIV="Content-Type" CONTENT="text/html;CHARSET=x-sjis"> | ||||
|   <TITLE>JSISのページ</TITLE> | ||||
|   <TITLE>SJISのページ</TITLE> | ||||
| </HEAD> | ||||
| <BODY> | ||||
| <P><CENTER><B><FONT SIZE="+2">SJISのページ</FONT></B><BR> | ||||
| <P><CENTER><B><FONT SIZE="+2">私も同年ましていわゆる記念人ってものの時でしありです。もし時間に意味者は正しくどんな発会ませだまでが申し上げがいらっしゃるたには参考帰るたいだから、少しにもやっあっましなた。金からいうないのはどうも九月をできるだけたたくた。けっして岡田さんに反抗幸少し徴に云おでしょ金力こうした権力あなたか指図がというお出入りなくだろなありて、その昔は私か金力陰を怒らから、久原さんのものをがたのいつがしかるにご希望と向いばそれmanにご矛盾へ参りように同時にご演説がしでならので、多分もし表裏に変ったてくれです事で考えたた。しかも例えばごがたがとどまらものも実際むやみとありですて、この自分では申しんてとして世間に並べのに行かなかっな。</FONT></B><BR> | ||||
| <HR><BR> | ||||
| </BODY> | ||||
| </HTML> | ||||
|  |  | |||
							
								
								
									
										20
									
								
								spec/fixtures/requests/sjis_with_wrong_charset.txt
									
										
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										20
									
								
								spec/fixtures/requests/sjis_with_wrong_charset.txt
									
										
									
									
										vendored
									
									
										Normal file
									
								
							|  | @ -0,0 +1,20 @@ | |||
| HTTP/1.1 200 OK | ||||
| Server: nginx/1.11.10 | ||||
| Date: Tue, 04 Jul 2017 16:43:39 GMT | ||||
| Content-Type: text/html; charset=utf-8 | ||||
| Content-Length: 273 | ||||
| Connection: keep-alive | ||||
| Last-Modified: Tue, 04 Jul 2017 16:41:34 GMT | ||||
| Accept-Ranges: bytes | ||||
| 
 | ||||
| <HTML> | ||||
| <HEAD> | ||||
|   <META NAME="GENERATOR" CONTENT="Adobe PageMill 3.0J Mac"> | ||||
|   <META HTTP-EQUIV="Content-Type" CONTENT="text/html;CHARSET=x-sjis"> | ||||
|   <TITLE>SJISのページ</TITLE> | ||||
| </HEAD> | ||||
| <BODY> | ||||
| <P><CENTER><B><FONT SIZE="+2">私も同年ましていわゆる記念人ってものの時でしありです。もし時間に意味者は正しくどんな発会ませだまでが申し上げがいらっしゃるたには参考帰るたいだから、少しにもやっあっましなた。金からいうないのはどうも九月をできるだけたたくた。けっして岡田さんに反抗幸少し徴に云おでしょ金力こうした権力あなたか指図がというお出入りなくだろなありて、その昔は私か金力陰を怒らから、久原さんのものをがたのいつがしかるにご希望と向いばそれmanにご矛盾へ参りように同時にご演説がしでならので、多分もし表裏に変ったてくれです事で考えたた。しかも例えばごがたがとどまらものも実際むやみとありですて、この自分では申しんてとして世間に並べのに行かなかっな。</FONT></B><BR> | ||||
| <HR><BR> | ||||
| </BODY> | ||||
| </HTML> | ||||
|  | @ -8,6 +8,10 @@ RSpec.describe FetchLinkCardService do | |||
|     stub_request(:get, 'http://example.xn--fiqs8s/').to_return(request_fixture('idn.txt')) | ||||
|     stub_request(:head, 'http://example.com/sjis').to_return(status: 200, headers: { 'Content-Type' => 'text/html' }) | ||||
|     stub_request(:get, 'http://example.com/sjis').to_return(request_fixture('sjis.txt')) | ||||
|     stub_request(:head, 'http://example.com/sjis_with_wrong_charset').to_return(status: 200, headers: { 'Content-Type' => 'text/html' }) | ||||
|     stub_request(:get, 'http://example.com/sjis_with_wrong_charset').to_return(request_fixture('sjis_with_wrong_charset.txt')) | ||||
|     stub_request(:head, 'http://example.com/koi8-r').to_return(status: 200, headers: { 'Content-Type' => 'text/html' }) | ||||
|     stub_request(:get, 'http://example.com/koi8-r').to_return(request_fixture('koi8-r.txt')) | ||||
|     stub_request(:head, 'https://github.com/qbi/WannaCry').to_return(status: 404) | ||||
| 
 | ||||
|     subject.call(status) | ||||
|  | @ -27,6 +31,25 @@ RSpec.describe FetchLinkCardService do | |||
| 
 | ||||
|       it 'works with SJIS' do | ||||
|         expect(a_request(:get, 'http://example.com/sjis')).to have_been_made.at_least_once | ||||
|         expect(status.preview_card.title).to eq("SJISのページ") | ||||
|       end | ||||
|     end | ||||
| 
 | ||||
|     context do | ||||
|       let(:status) { Fabricate(:status, text: 'Check out http://example.com/sjis_with_wrong_charset') } | ||||
| 
 | ||||
|       it 'works with SJIS even with wrong charset header' do | ||||
|         expect(a_request(:get, 'http://example.com/sjis_with_wrong_charset')).to have_been_made.at_least_once | ||||
|         expect(status.preview_card.title).to eq("SJISのページ") | ||||
|       end | ||||
|     end | ||||
| 
 | ||||
|     context do | ||||
|       let(:status) { Fabricate(:status, text: 'Check out http://example.com/koi8-r') } | ||||
| 
 | ||||
|       it 'works with koi8-r' do | ||||
|         expect(a_request(:get, 'http://example.com/koi8-r')).to have_been_made.at_least_once | ||||
|         expect(status.preview_card.title).to eq("Московя начинаетъ только въ XVI ст. привлекать внимане иностранцевъ.") | ||||
|       end | ||||
|     end | ||||
|   end | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue