From 6cf83a2a64bf2f0f4848fb0fabae4d5f181ba587 Mon Sep 17 00:00:00 2001 From: David Roetzel Date: Fri, 21 Jun 2024 14:51:10 +0200 Subject: [PATCH] Improve encoding detection for link cards (#30780) --- app/lib/link_details_extractor.rb | 15 ++++++++++----- .../fixtures/requests/low_confidence_latin1.txt | 17 +++++++++++++++++ spec/services/fetch_link_card_service_spec.rb | 11 ++++++++++- 3 files changed, 37 insertions(+), 6 deletions(-) create mode 100644 spec/fixtures/requests/low_confidence_latin1.txt diff --git a/app/lib/link_details_extractor.rb b/app/lib/link_details_extractor.rb index 9d7748425..de354b109 100644 --- a/app/lib/link_details_extractor.rb +++ b/app/lib/link_details_extractor.rb @@ -255,16 +255,21 @@ class LinkDetailsExtractor end def document - @document ||= Nokogiri::HTML(@html, nil, encoding) + @document ||= detect_encoding_and_parse_document end - def encoding - @encoding ||= begin - guess = detector.detect(@html, @html_charset) - guess&.fetch(:confidence, 0).to_i > 60 ? guess&.fetch(:encoding, nil) : nil + def detect_encoding_and_parse_document + [detect_encoding, nil, @html_charset, 'UTF-8'].uniq.each do |encoding| + document = Nokogiri::HTML(@html, nil, encoding) + return document if document.to_s.valid_encoding? end end + def detect_encoding + guess = detector.detect(@html, @html_charset) + guess&.fetch(:confidence, 0).to_i > 60 ? guess&.fetch(:encoding, nil) : nil + end + def detector @detector ||= CharlockHolmes::EncodingDetector.new.tap do |detector| detector.strip_tags = true diff --git a/spec/fixtures/requests/low_confidence_latin1.txt b/spec/fixtures/requests/low_confidence_latin1.txt new file mode 100644 index 000000000..39c3e23d6 --- /dev/null +++ b/spec/fixtures/requests/low_confidence_latin1.txt @@ -0,0 +1,17 @@ +HTTP/1.1 200 OK +server: nginx +date: Thu, 13 Jun 2024 14:33:13 GMT +content-type: text/html; charset=ISO-8859-1 +content-length: 158 +accept-ranges: bytes + + + + + + Tofu á l'orange + + +

Tofu á l'orange

+ + diff --git a/spec/services/fetch_link_card_service_spec.rb b/spec/services/fetch_link_card_service_spec.rb index d8f4b8e23..c50eff90e 100644 --- a/spec/services/fetch_link_card_service_spec.rb +++ b/spec/services/fetch_link_card_service_spec.rb @@ -13,6 +13,7 @@ RSpec.describe FetchLinkCardService, type: :service do stub_request(:get, 'http://example.com/test?data=file.gpx%5E1').to_return(status: 200) stub_request(:get, 'http://example.com/test-').to_return(request_fixture('idn.txt')) stub_request(:get, 'http://example.com/windows-1251').to_return(request_fixture('windows-1251.txt')) + stub_request(:get, 'http://example.com/low_confidence_latin1').to_return(request_fixture('low_confidence_latin1.txt')) subject.call(status) end @@ -62,7 +63,15 @@ RSpec.describe FetchLinkCardService, type: :service do end end - context do + context 'with a URL of a page in ISO-8859-1 encoding, that charlock_holmes cannot detect' do + let(:status) { Fabricate(:status, text: 'Check out http://example.com/low_confidence_latin1') } + + it 'decodes the HTML' do + expect(status.preview_card.title).to eq("Tofu á l'orange") + end + end + + context 'with a Japanese path URL' do let(:status) { Fabricate(:status, text: 'テストhttp://example.com/日本語') } it 'works with Japanese path string' do