Skip to content

Commit 59b3c75

Browse files
committed
Load all JSON-LD data
The LinkedData class was only grabbing the first application/ld+json script tag found in the DOM. It now finds them all and makes them all available via the getAll method. Additionally, it inspects the mainEntityOfPage property of the json nodes and if a match is found with the request URI, that node is assigned as the default node rather than the first node found.
1 parent 4913408 commit 59b3c75

File tree

78 files changed

+3141
-85
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

78 files changed

+3141
-85
lines changed

src/LinkedData.php

Lines changed: 52 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
namespace Embed;
55

66
use Exception;
7+
use ML\JsonLD\JsonLD;
78
use ML\JsonLD\Document as LdDocument;
89
use ML\JsonLD\DocumentInterface;
910
use ML\JsonLD\GraphInterface;
@@ -16,7 +17,9 @@ class LinkedData
1617

1718
private ?DocumentInterface $document;
1819

19-
private function get(string ...$keys)
20+
private array $allData;
21+
22+
public function get(string ...$keys)
2023
{
2124
$graph = $this->getGraph();
2225

@@ -39,6 +42,15 @@ private function get(string ...$keys)
3942
return null;
4043
}
4144

45+
/**
 * Returns every JSON-LD entry collected for the page.
 *
 * The collection is loaded on first use: when the allData property has
 * not been initialised yet, fetchData() is invoked to populate it.
 *
 * @return array
 */
public function getAll()
{
    // Already loaded: hand back the stored collection as-is.
    if (isset($this->allData)) {
        return $this->allData;
    }

    // fetchData() initialises and fills $this->allData as a side effect;
    // its own return value is not needed here.
    $this->fetchData();

    return $this->allData;
}
53+
4254
private function getGraph(string $name = null): ?GraphInterface
4355
{
4456
if (!isset($this->document)) {
@@ -50,20 +62,55 @@ private function getGraph(string $name = null): ?GraphInterface
5062
}
5163
}
5264

53-
return $this->document->getGraph();
65+
return $this->document->getGraph($name);
5466
}
5567

5668
protected function fetchData(): array
5769
{
70+
$this->allData = [];
71+
5872
$document = $this->extractor->getDocument();
59-
$content = $document->select('.//script', ['type' => 'application/ld+json'])->str();
73+
$nodes = $document->select('.//script', ['type' => 'application/ld+json'])->strAll();
6074

61-
if (empty($content)) {
75+
if (empty($nodes)) {
6276
return [];
6377
}
6478

6579
try {
66-
return json_decode($content, true) ?: [];
80+
$data = [];
81+
$request_uri = (string)$this->extractor->getUri();
82+
foreach ($nodes as $node) {
83+
$ldjson = json_decode($node, true);
84+
if (!empty($ldjson)) {
85+
86+
if (empty($data)) {
87+
$data = $ldjson;
88+
} elseif (isset($ldjson['mainEntityOfPage'])) {
89+
$url = '';
90+
if (is_string($ldjson['mainEntityOfPage'])) {
91+
$url = $ldjson['mainEntityOfPage'];
92+
} elseif (isset($ldjson['mainEntityOfPage']['@id'])) {
93+
$url = $ldjson['mainEntityOfPage']['@id'];
94+
}
95+
if (!empty($url) && $url == $request_uri) {
96+
$data = $ldjson;
97+
}
98+
}
99+
100+
// some pages with multiple ld+json blocks will put
101+
// each block into an array (Flickr does this). Most
102+
// appear to put an object in each ld+json block. To
103+
// prevent them from stepping on one another, the ones
104+
// that are not arrays will be put into an array.
105+
if (!array_is_list($ldjson)) {
106+
$ldjson = [$ldjson];
107+
}
108+
109+
$this->allData = array_merge($this->allData, $ldjson);
110+
}
111+
}
112+
113+
return $data;
67114
} catch (Exception $exception) {
68115
return [];
69116
}

src/functions.php

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,3 +154,25 @@ function isEmpty(mixed ...$values): bool
154154

155155
return false;
156156
}
157+
158+
if (!function_exists("array_is_list")) {
    /**
     * Polyfill for https://www.php.net/manual/en/function.array-is-list.php
     * which is only available in PHP 8.1+
     *
     * An array is a list when its keys are exactly 0, 1, 2, ... in that
     * order. An empty array is a list.
     *
     * @param array $array The array to inspect
     *
     * @return bool True when the keys are consecutive integers starting at 0
     */
    function array_is_list(array $array): bool
    {
        // Index the next key must be identical to (strict comparison,
        // so the string key "a" or an out-of-order integer fails).
        $expected = 0;

        foreach ($array as $key => $unused) {
            if ($key !== $expected++) {
                return false;
            }
        }

        return true;
    }
}

tests/PagesTest.php

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -188,8 +188,8 @@ public function testSoundCloud()
188188

189189
public function testSpotify()
190190
{
191-
$this->assertEmbed('https://play.spotify.com/album/7s66wU1XJ2NsUuWM2NKiUV');
192191
$this->assertEmbed('https://open.spotify.com/album/7s66wU1XJ2NsUuWM2NKiUV');
192+
$this->assertEmbed('https://play.spotify.com/album/7s66wU1XJ2NsUuWM2NKiUV');
193193
}
194194

195195
public function testTwitch()

tests/PagesTestCase.php

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,7 @@ private static function getData(Extractor $extractor): array
107107
if (method_exists($extractor, 'getApi')) {
108108
$data['api'] = $extractor->getApi()->all();
109109
}
110+
$data['allLinkedData'] = $extractor->getLinkedData()->getAll();
110111

111112
return $data;
112113
}
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
<?php
2+
declare(strict_types = 1);
3+
4+
return [
5+
'headers' => [
6+
'content-type' => [
7+
'text/html; charset=utf-8'
8+
],
9+
'cache-control' => [
10+
'no-cache, no-store, max-age=0, must-revalidate'
11+
],
12+
'pragma' => [
13+
'no-cache'
14+
],
15+
'expires' => [
16+
'Mon, 01 Jan 1990 00:00:00 GMT'
17+
],
18+
'date' => [
19+
'Sun, 04 Apr 2021 15:20:47 GMT'
20+
],
21+
'p3p' => [
22+
'CP="This is not a P3P policy! See g.co/p3phelp for more info."'
23+
],
24+
'content-security-policy' => [
25+
'script-src \'nonce-g9/eaYJePAYsVh50Jyl0EQ\' \'unsafe-inline\';object-src \'none\';base-uri \'self\';report-uri /_/IdentityNotFoundHttp/cspreport;worker-src \'self\'',
26+
'script-src \'nonce-g9/eaYJePAYsVh50Jyl0EQ\' \'self\' https://apis.google.com https://ssl.gstatic.com https://www.google.com https://www.gstatic.com https://www.google-analytics.com;report-uri /_/IdentityNotFoundHttp/cspreport'
27+
],
28+
'content-encoding' => [
29+
'gzip'
30+
],
31+
'server' => [
32+
'ESF'
33+
],
34+
'x-xss-protection' => [
35+
'0'
36+
],
37+
'x-content-type-options' => [
38+
'nosniff'
39+
],
40+
'set-cookie' => [
41+
'NID=212=dAlC8GKROGQ6cWC5EeQ92vga0m4ReROz1kMl9BrboOg7GfaE3zvV7pmmgCmsXsJ7vya8tJGI4jioBfUTai-FbFjJPm264-_PY9-GEu66UJhCsRvBiDJVz3O5Ckjox4e0LsT9RZ2vuLADiJTrbw7nzwn4qwyWUF3duIq6_ZUnLhA; expires=Mon, 04-Oct-2021 15:20:47 GMT; path=/; domain=.google.es; Secure; HttpOnly; SameSite=none'
42+
],
43+
'alt-svc' => [
44+
'h3-29=":443"; ma=2592000,h3-T051=":443"; ma=2592000,h3-Q050=":443"; ma=2592000,h3-Q046=":443"; ma=2592000,h3-Q043=":443"; ma=2592000,quic=":443"; ma=2592000; v="46,43"'
45+
],
46+
'Content-Location' => [
47+
'https://consent.google.es/\'https:/consent.google.es/ml?continue=https://www.google.es/maps/place/Tordoia,%2BA%2BCoru%25C3%25B1a/@43.0871207,-8.5710004,12z/data%3D!3m1!4b1!4m2!3m1!1s0xd2ef4006f1ef489:0x404f58273ca55a0&gl=ES&hl=es&pc=m&src=1&rffu=true\''
48+
],
49+
'X-Request-Time' => [
50+
'0.112 ms'
51+
]
52+
],
53+
'statusCode' => 404,
54+
'reasonPhrase' => 'Not Found',
55+
'body' => '<html lang=en><meta charset=utf-8><meta name=viewport content="initial-scale=1, minimum-scale=1, width=device-width"><title>Error 404 (No se ha encontrado.)!!1</title><style nonce="g9/eaYJePAYsVh50Jyl0EQ">*{margin:0;padding:0}html,code{font:15px/22px arial,sans-serif}html{background:#fff;color:#222;padding:15px}body{color:#222;text-align:unset;margin:7% auto 0;max-width:390px;min-height:180px;padding:30px 0 15px;}* > body{background:url(//www.google.com/images/errors/robot.png) 100% 5px no-repeat;padding-right:205px}p{margin:11px 0 22px;overflow:hidden}pre{white-space:pre-wrap;}ins{color:#777;text-decoration:none}a img{border:0}@media screen and (max-width:772px){body{background:none;margin-top:0;max-width:none;padding-right:0}}#logo{background:url(//www.google.com/images/branding/googlelogo/1x/googlelogo_color_150x54dp.png) no-repeat;margin-left:-5px}@media only screen and (min-resolution:192dpi){#logo{background:url(//www.google.com/images/branding/googlelogo/2x/googlelogo_color_150x54dp.png) no-repeat 0% 0%/100% 100%;-moz-border-image:url(//www.google.com/images/branding/googlelogo/2x/googlelogo_color_150x54dp.png) 0}}@media only screen and (-webkit-min-device-pixel-ratio:2){#logo{background:url(//www.google.com/images/branding/googlelogo/2x/googlelogo_color_150x54dp.png) no-repeat;-webkit-background-size:100% 100%}}#logo{display:inline-block;height:54px;width:150px}</style><div id="af-error-container"><a href=//www.google.com><span id=logo aria-label=Google></span></a><p><b>404.</b> <ins>Se trata de un error.</ins><p>No se ha encontrado la URL solicitada en este servidor. <ins>Esa es toda la información de la que disponemos.</ins></div>'
56+
];
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
<?php
2+
declare(strict_types = 1);
3+
4+
return [
5+
'headers' => [
6+
'content-type' => [
7+
'text/html; charset=utf-8'
8+
],
9+
'cache-control' => [
10+
'no-cache, no-store, max-age=0, must-revalidate'
11+
],
12+
'pragma' => [
13+
'no-cache'
14+
],
15+
'expires' => [
16+
'Mon, 01 Jan 1990 00:00:00 GMT'
17+
],
18+
'date' => [
19+
'Sun, 04 Apr 2021 15:23:17 GMT'
20+
],
21+
'content-security-policy' => [
22+
'script-src \'nonce-vsx63Xooae1XpA8j6kEGNA\' \'unsafe-inline\';object-src \'none\';base-uri \'self\';report-uri /_/IdentityNotFoundHttp/cspreport;worker-src \'self\'',
23+
'script-src \'nonce-vsx63Xooae1XpA8j6kEGNA\' \'self\' https://apis.google.com https://ssl.gstatic.com https://www.google.com https://www.gstatic.com https://www.google-analytics.com;report-uri /_/IdentityNotFoundHttp/cspreport'
24+
],
25+
'content-encoding' => [
26+
'gzip'
27+
],
28+
'server' => [
29+
'ESF'
30+
],
31+
'x-xss-protection' => [
32+
'0'
33+
],
34+
'x-content-type-options' => [
35+
'nosniff'
36+
],
37+
'alt-svc' => [
38+
'h3-29=":443"; ma=2592000,h3-T051=":443"; ma=2592000,h3-Q050=":443"; ma=2592000,h3-Q046=":443"; ma=2592000,h3-Q043=":443"; ma=2592000,quic=":443"; ma=2592000; v="46,43"'
39+
],
40+
'Content-Location' => [
41+
'https://consent.youtube.com/\'https:/consent.youtube.com/ml?continue=https://www.youtube.com/channel/UCuZeHD5SGecQomz2pVDHGzg&gl=ES&hl=es&pc=yt&uxe=23983172&src=1&rffu=true\''
42+
],
43+
'X-Request-Time' => [
44+
'0.127 ms'
45+
]
46+
],
47+
'statusCode' => 404,
48+
'reasonPhrase' => 'Not Found',
49+
'body' => '<html lang=en><meta charset=utf-8><meta name=viewport content="initial-scale=1, minimum-scale=1, width=device-width"><title>Error 404 (No se ha encontrado.)!!1</title><style nonce="vsx63Xooae1XpA8j6kEGNA">*{margin:0;padding:0}html,code{font:15px/22px arial,sans-serif}html{background:#fff;color:#222;padding:15px}body{color:#222;text-align:unset;margin:7% auto 0;max-width:390px;min-height:180px;padding:30px 0 15px;}* > body{background:url(//www.google.com/images/errors/robot.png) 100% 5px no-repeat;padding-right:205px}p{margin:11px 0 22px;overflow:hidden}pre{white-space:pre-wrap;}ins{color:#777;text-decoration:none}a img{border:0}@media screen and (max-width:772px){body{background:none;margin-top:0;max-width:none;padding-right:0}}#logo{background:url(//www.google.com/images/branding/googlelogo/1x/googlelogo_color_150x54dp.png) no-repeat;margin-left:-5px}@media only screen and (min-resolution:192dpi){#logo{background:url(//www.google.com/images/branding/googlelogo/2x/googlelogo_color_150x54dp.png) no-repeat 0% 0%/100% 100%;-moz-border-image:url(//www.google.com/images/branding/googlelogo/2x/googlelogo_color_150x54dp.png) 0}}@media only screen and (-webkit-min-device-pixel-ratio:2){#logo{background:url(//www.google.com/images/branding/googlelogo/2x/googlelogo_color_150x54dp.png) no-repeat;-webkit-background-size:100% 100%}}#logo{display:inline-block;height:54px;width:150px}</style><div id="af-error-container"><a href=//www.google.com><span id=logo aria-label=Google></span></a><p><b>404.</b> <ins>Se trata de un error.</ins><p>No se ha encontrado la URL solicitada en este servidor. <ins>Esa es toda la información de la que disponemos.</ins></div>'
50+
];
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
<?php
2+
declare(strict_types = 1);
3+
4+
return [
5+
'headers' => [
6+
'content-type' => [
7+
'text/html; charset=utf-8'
8+
],
9+
'cache-control' => [
10+
'no-cache, no-store, max-age=0, must-revalidate'
11+
],
12+
'pragma' => [
13+
'no-cache'
14+
],
15+
'expires' => [
16+
'Mon, 01 Jan 1990 00:00:00 GMT'
17+
],
18+
'date' => [
19+
'Sun, 04 Apr 2021 15:23:17 GMT'
20+
],
21+
'content-security-policy' => [
22+
'script-src \'nonce-vsx63Xooae1XpA8j6kEGNA\' \'unsafe-inline\';object-src \'none\';base-uri \'self\';report-uri /_/IdentityNotFoundHttp/cspreport;worker-src \'self\'',
23+
'script-src \'nonce-vsx63Xooae1XpA8j6kEGNA\' \'self\' https://apis.google.com https://ssl.gstatic.com https://www.google.com https://www.gstatic.com https://www.google-analytics.com;report-uri /_/IdentityNotFoundHttp/cspreport'
24+
],
25+
'content-encoding' => [
26+
'gzip'
27+
],
28+
'server' => [
29+
'ESF'
30+
],
31+
'x-xss-protection' => [
32+
'0'
33+
],
34+
'x-content-type-options' => [
35+
'nosniff'
36+
],
37+
'alt-svc' => [
38+
'h3-29=":443"; ma=2592000,h3-T051=":443"; ma=2592000,h3-Q050=":443"; ma=2592000,h3-Q046=":443"; ma=2592000,h3-Q043=":443"; ma=2592000,quic=":443"; ma=2592000; v="46,43"'
39+
],
40+
'Content-Location' => [
41+
'https://consent.youtube.com/\'https:/consent.youtube.com/ml?continue=https://www.youtube.com/channel/UCuZeHD5SGecQomz2pVDHGzg&gl=ES&hl=es&pc=yt&uxe=23983172&src=1&rffu=true\''
42+
],
43+
'X-Request-Time' => [
44+
'0.127 ms'
45+
]
46+
],
47+
'statusCode' => 404,
48+
'reasonPhrase' => 'Not Found',
49+
'body' => '<html lang=en><meta charset=utf-8><meta name=viewport content="initial-scale=1, minimum-scale=1, width=device-width"><title>Error 404 (No se ha encontrado.)!!1</title><style nonce="vsx63Xooae1XpA8j6kEGNA">*{margin:0;padding:0}html,code{font:15px/22px arial,sans-serif}html{background:#fff;color:#222;padding:15px}body{color:#222;text-align:unset;margin:7% auto 0;max-width:390px;min-height:180px;padding:30px 0 15px;}* > body{background:url(//www.google.com/images/errors/robot.png) 100% 5px no-repeat;padding-right:205px}p{margin:11px 0 22px;overflow:hidden}pre{white-space:pre-wrap;}ins{color:#777;text-decoration:none}a img{border:0}@media screen and (max-width:772px){body{background:none;margin-top:0;max-width:none;padding-right:0}}#logo{background:url(//www.google.com/images/branding/googlelogo/1x/googlelogo_color_150x54dp.png) no-repeat;margin-left:-5px}@media only screen and (min-resolution:192dpi){#logo{background:url(//www.google.com/images/branding/googlelogo/2x/googlelogo_color_150x54dp.png) no-repeat 0% 0%/100% 100%;-moz-border-image:url(//www.google.com/images/branding/googlelogo/2x/googlelogo_color_150x54dp.png) 0}}@media only screen and (-webkit-min-device-pixel-ratio:2){#logo{background:url(//www.google.com/images/branding/googlelogo/2x/googlelogo_color_150x54dp.png) no-repeat;-webkit-background-size:100% 100%}}#logo{display:inline-block;height:54px;width:150px}</style><div id="af-error-container"><a href=//www.google.com><span id=logo aria-label=Google></span></a><p><b>404.</b> <ins>Se trata de un error.</ins><p>No se ha encontrado la URL solicitada en este servidor. <ins>Esa es toda la información de la que disponemos.</ins></div>'
50+
];

tests/fixtures/500px.com.photo-138251239-taganay-park-by-daniel-kordan.php

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,5 +22,6 @@
2222
'title' => '500px',
2323
'url' => 'https://500px.com/photo/138251239/taganay-park-by-daniel-kordan',
2424
'linkedData' => [],
25-
'oEmbed' => []
25+
'oEmbed' => [],
26+
'allLinkedData' => []
2627
];

tests/fixtures/animoto.com.play-gjsj1gu0wdrfr4pgw12xzq.php

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,5 +46,6 @@
4646
'cache_age' => 604800,
4747
'video_url' => 'https://d150hyw1dtprld.cloudfront.net/swf/w.swf?w=swf/production/vp1&e=1617549702&f=GjsJ1gu0WDRfr4pGw12xZQ&d=0&m=p&r=360p&i=m&asset_domain=s3-p.animoto.com&animoto_domain=animoto.com&options=start_hq',
4848
'html' => '<iframe id="vp1GjsJ1" title="Video Player" width="640" height="360" frameborder="0" src="https://s3.amazonaws.com/embed.animoto.com/play.html?w=swf/production/vp1&e=1617549702&f=GjsJ1gu0WDRfr4pGw12xZQ&d=0&m=p&r=360p&i=m&asset_domain=s3-p.animoto.com&animoto_domain=animoto.com&options=start_hq" allowfullscreen></iframe>'
49-
]
49+
],
50+
'allLinkedData' => []
5051
];

tests/fixtures/archive.org.details-dn2015-0220_vid.php

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -811,5 +811,22 @@
811811
'collection_files_count' => null,
812812
'collection_size' => null
813813
]
814+
],
815+
'allLinkedData' => [
816+
[
817+
'@context' => 'http://schema.org',
818+
'@type' => 'BreadcrumbList',
819+
'itemListElement' => [
820+
[
821+
'@type' => 'ListItem',
822+
'position' => 1,
823+
'item' => [
824+
'@id' => 'https://archive.org/details/movies',
825+
'name' => 'Videos',
826+
'image' => 'https://archive.org/services/img/movies'
827+
]
828+
]
829+
]
830+
]
814831
]
815832
];

0 commit comments

Comments
 (0)