2023-02-22 11:55:31 +11:00
|
|
|
# frozen_string_literal: true
|
|
|
|
|
2021-11-06 09:23:05 +11:00
|
|
|
require 'rails_helper'
|
|
|
|
|
|
|
|
RSpec.describe LinkDetailsExtractor do
|
2023-08-03 23:41:51 +10:00
|
|
|
# Extractor under test. Every example group below supplies its own `html`;
# the third constructor argument is passed as nil throughout this spec.
subject do
  described_class.new(original_url, html, nil)
end

# URL handed to the extractor as the page's original address.
let(:original_url) do
  'https://example.com/dog.html?tracking=123'
end
|
2021-11-06 09:23:05 +11:00
|
|
|
|
|
|
|
# Exercises how `canonical_url` treats a <link rel="canonical"> tag depending
# on the href it carries. `url` is defined by each nested context.
describe '#canonical_url' do
  let(:html) { "<!doctype html><link rel='canonical' href='#{url}'>" }

  context 'when canonical URL points to the same host' do
    let(:url) { 'https://example.com/dog.html' }

    # The expected value is the canonical href (without the tracking query
    # string), not original_url, so the canonical URL is honoured here.
    it 'uses the canonical URL' do
      expect(subject.canonical_url).to eq url
    end
  end

  context 'when canonical URL points to another host' do
    let(:url) { 'https://different.example.net/dog.html' }

    it 'ignores the canonical URLs' do
      expect(subject.canonical_url).to eq original_url
    end
  end

  # The href is the literal string "null" rather than a URL.
  context 'when canonical URL is set to "null"' do
    let(:url) { 'null' }

    it 'ignores the canonical URLs' do
      expect(subject.canonical_url).to eq original_url
    end
  end

  # The href is the literal string "undefined" rather than a URL.
  context 'when canonical URL is set to "undefined"' do
    let(:url) { 'undefined' }

    it 'ignores the canonical URLs' do
      expect(subject.canonical_url).to eq original_url
    end
  end
end
|
2022-02-08 04:16:31 +11:00
|
|
|
|
2023-08-03 23:41:51 +10:00
|
|
|
# Page that offers only a <title>, a meta description and a `lang` attribute.
context 'when only basic metadata is present' do
  # Heredoc lines are kept flush with one another so the markup is unchanged.
  let(:html) do
    <<~HTML
      <!doctype html>
      <html lang="en">
      <head>
      <title>Man bites dog</title>
      <meta name="description" content="A dog's tale">
      </head>
      </html>
    HTML
  end

  it 'extracts the expected values from html metadata' do
    expected_attributes = {
      title: eq('Man bites dog'),
      description: eq("A dog's tale"),
      language: eq('en'),
    }

    expect(subject).to have_attributes(expected_attributes)
  end
end
|
|
|
|
|
|
|
|
context 'when structured data is present' do
|
|
|
|
# JSON-encoded schema.org NewsArticle that the nested contexts interpolate
# into their markup. Key order is kept as-is so the encoded string is the same.
let(:ld_json) do
  author = {
    '@type' => 'Organization',
    'name' => 'Charlie Brown',
  }

  publisher = {
    '@type' => 'NewsMediaOrganization',
    'name' => 'Pet News',
    'url' => 'https://example.com',
  }

  language = {
    name: 'English',
    alternateName: 'en',
  }

  {
    '@context' => 'https://schema.org',
    '@type' => 'NewsArticle',
    'headline' => 'Man bites dog',
    'description' => "A dog's tale",
    'datePublished' => '2022-01-31T19:53:00+00:00',
    'author' => author,
    'publisher' => publisher,
    'inLanguage' => language,
  }.to_json
end
|
2022-02-08 04:16:31 +11:00
|
|
|
|
2023-08-03 23:41:51 +10:00
|
|
|
# Shared expectation for every context whose markup embeds `ld_json`:
# the extractor must surface the values carried by that payload.
shared_examples 'structured data' do
  it 'extracts the expected values from structured data' do
    expected_attributes = {
      title: eq('Man bites dog'),
      description: eq("A dog's tale"),
      published_at: eq('2022-01-31T19:53:00+00:00'),
      author_name: eq('Charlie Brown'),
      provider_name: eq('Pet News'),
      language: eq('en'),
    }

    expect(subject).to have_attributes(expected_attributes)
  end
end
|
|
|
|
|
2023-08-03 23:41:51 +10:00
|
|
|
# The LD+JSON payload sits between `//<![CDATA[` and `//]]>` markers inside
# the script tag.
context 'when is wrapped in CDATA tags' do
  # Heredoc lines are kept flush with one another so the markup is unchanged.
  let(:html) do
    <<~HTML
      <!doctype html>
      <html>
      <head>
      <script type="application/ld+json">
      //<![CDATA[
      #{ld_json}
      //]]>
      </script>
      </head>
      </html>
    HTML
  end

  include_examples 'structured data'
end
|
|
|
|
|
2023-05-04 13:49:08 +10:00
|
|
|
# The first ld+json script holds text that is not JSON; the real payload
# follows in a second script tag.
context 'with the first tag is invalid JSON' do
  # Heredoc lines are kept flush with one another so the markup is unchanged.
  let(:html) do
    <<~HTML
      <!doctype html>
      <html>
      <body>
      <script type="application/ld+json">
      invalid LD+JSON
      </script>
      <script type="application/ld+json">
      #{ld_json}
      </script>
      </body>
      </html>
    HTML
  end

  include_examples 'structured data'
end
|
|
|
|
|
2024-07-26 00:51:44 +10:00
|
|
|
# The first ld+json script contains only the JSON literal `null`; the real
# payload follows in a second script tag.
context 'with the first tag is null' do
  # Heredoc lines are kept flush with one another so the markup is unchanged.
  let(:html) do
    <<~HTML
      <!doctype html>
      <html>
      <body>
      <script type="application/ld+json">
      null
      </script>
      <script type="application/ld+json">
      #{ld_json}
      </script>
      </body>
      </html>
    HTML
  end

  include_examples 'structured data'
end
|
|
|
|
|
2023-08-03 23:41:51 +10:00
|
|
|
# A first ld+json script carries an array of ItemList / BreadcrumbList
# objects; the NewsArticle payload comes in a separate, later script tag.
context 'with preceding block of unsupported LD+JSON' do
  # Heredoc lines are kept flush with one another so the markup is unchanged.
  let(:html) do
    <<~HTML
      <!doctype html>
      <html>
      <body>
      <script type="application/ld+json">
      [
      {
      "@context": "https://schema.org",
      "@type": "ItemList",
      "url": "https://example.com/cat.html",
      "name": "Man bites cat",
      "description": "A cat's tale"
      },
      {
      "@context": "https://schema.org",
      "@type": "BreadcrumbList",
      "itemListElement":[
      {
      "@type": "ListItem",
      "position": 1,
      "item": {
      "@id": "https://www.example.com",
      "name": "Cat News"
      }
      }
      ]
      }
      ]
      </script>
      <script type="application/ld+json">
      #{ld_json}
      </script>
      </body>
      </html>
    HTML
  end

  include_examples 'structured data'
end
|
|
|
|
|
|
|
|
# A single ld+json script holds an array whose first element is an ItemList
# and whose second element is the NewsArticle payload.
context 'with unsupported in same block LD+JSON' do
  # Heredoc lines are kept flush with one another so the markup is unchanged.
  let(:html) do
    <<~HTML
      <!doctype html>
      <html>
      <body>
      <script type="application/ld+json">
      [
      {
      "@context": "https://schema.org",
      "@type": "ItemList",
      "url": "https://example.com/cat.html",
      "name": "Man bites cat",
      "description": "A cat's tale"
      },
      #{ld_json}
      ]
      </script>
      </body>
      </html>
    HTML
  end

  include_examples 'structured data'
end
|
2024-07-09 02:04:36 +10:00
|
|
|
|
|
|
|
# Overrides `ld_json` with an article whose author `name` is an array of two
# strings instead of a single string.
context 'with author names as array' do
  let(:ld_json) do
    author = {
      '@type' => 'Person',
      'name' => ['Author 1', 'Author 2'],
    }

    {
      '@context' => 'https://schema.org',
      '@type' => 'NewsArticle',
      'headline' => 'A lot of authors',
      'description' => 'But we decided to cram them into one',
      'author' => author,
    }.to_json
  end

  # Heredoc lines are kept flush with one another so the markup is unchanged.
  let(:html) do
    <<~HTML
      <!doctype html>
      <html>
      <body>
      <script type="application/ld+json">
      #{ld_json}
      </script>
      </body>
      </html>
    HTML
  end

  it 'joins author names' do
    joined_names = 'Author 1, Author 2'

    expect(subject.author_name).to eq joined_names
  end
end
|
2023-08-03 23:41:51 +10:00
|
|
|
end
|
|
|
|
|
|
|
|
# Page whose <head> carries only Open Graph / article <meta property> tags.
context 'when Open Graph protocol data is present' do
  # Heredoc lines are kept flush with one another so the markup is unchanged.
  let(:html) do
    <<~HTML
      <!doctype html>
      <html>
      <head>
      <meta property="og:url" content="https://example.com/dog.html">
      <meta property="og:title" content="Man bites dog">
      <meta property="og:description" content="A dog's tale">
      <meta property="article:published_time" content="2022-01-31T19:53:00+00:00">
      <meta property="og:author" content="Charlie Brown">
      <meta property="og:locale" content="en">
      <meta property="og:image" content="https://example.com/snoopy.jpg">
      <meta property="og:image:alt" content="A good boy">
      <meta property="og:site_name" content="Pet News">
      </head>
      </html>
    HTML
  end

  it 'extracts the expected values from open graph data' do
    expected_attributes = {
      canonical_url: eq('https://example.com/dog.html'),
      title: eq('Man bites dog'),
      description: eq("A dog's tale"),
      published_at: eq('2022-01-31T19:53:00+00:00'),
      author_name: eq('Charlie Brown'),
      language: eq('en'),
      image: eq('https://example.com/snoopy.jpg'),
      image_alt: eq('A good boy'),
      provider_name: eq('Pet News'),
    }

    expect(subject).to have_attributes(expected_attributes)
  end
end
|
2021-11-06 09:23:05 +11:00
|
|
|
end
|