From f1f6ddd5362f40e287857750f5e102206bd0e169 Mon Sep 17 00:00:00 2001 From: Eugen Rochko Date: Mon, 7 Feb 2022 18:16:31 +0100 Subject: [PATCH] Fix structured data parsing from links choking on bad data (#17403) * Fix structured data parsing from links choking on bad data - Fix og:url meta tag being prioritized over canonical link tag - Fix structured data parsing choking on commented-out CDATA declarations - Fix HTML entities in title, description, provider_name, author_name - Change structured data parsing to attempt every JSON-LD script tag * Remove unnecessary slash escapes from CDATA regex pattern --- app/lib/link_details_extractor.rb | 53 ++++++++-- spec/lib/link_details_extractor_spec.rb | 122 ++++++++++++++++++++++++ 2 files changed, 166 insertions(+), 9 deletions(-) diff --git a/app/lib/link_details_extractor.rb b/app/lib/link_details_extractor.rb index 56ad0717b..d2bcf0c25 100644 --- a/app/lib/link_details_extractor.rb +++ b/app/lib/link_details_extractor.rb @@ -3,6 +3,19 @@ class LinkDetailsExtractor include ActionView::Helpers::TagHelper + # Some publications wrap their JSON-LD data in their + + + HTML + + describe '#title' do + it 'returns the title from structured data' do + expect(subject.title).to eq 'Foo' + end + end + + describe '#description' do + it 'returns the description from structured data' do + expect(subject.description).to eq 'Bar' + end + end + + describe '#provider_name' do + it 'returns the provider name from structured data' do + expect(subject.provider_name).to eq 'Baz' + end + end + + describe '#author_name' do + it 'returns the author name from structured data' do + expect(subject.author_name).to eq 'Hoge' + end + end + end + + context 'but the first tag is invalid JSON' do + let(:html) { <<-HTML } + + + + + + + + HTML + + describe '#title' do + it 'returns the title from structured data' do + expect(subject.title).to eq 'Foo' + end + end + + describe '#description' do + it 'returns the description from structured data' do + expect(subject.description).to eq 'Bar' + end + end + + describe '#provider_name' do + it 'returns the provider name from structured data' do + expect(subject.provider_name).to eq 'Baz' + end + end + + describe '#author_name' do + it 'returns the author name from structured data' do + expect(subject.author_name).to eq 'Hoge' + end + end + end + end end