File: /home/bt667/www/wp-content/plugins/accelerated-mobile-pages/includes/vendor/tool/Dom/Document.php
<?php
namespace AmpProject\Dom;
use AmpProject\Amp;
use AmpProject\Attribute;
use AmpProject\DevMode;
use AmpProject\Dom\Document\Encoding;
use AmpProject\Dom\Document\Option;
use AmpProject\Exception\FailedToRetrieveRequiredDomElement;
use AmpProject\Exception\MaxCssByteCountExceeded;
use AmpProject\Optimizer\CssRule;
use AmpProject\Tag;
use DOMAttr;
use DOMComment;
use DOMDocument;
use DOMElement;
use DOMNode;
use DOMNodeList;
use DOMText;
use DOMXPath;
/**
* Abstract away some of the difficulties of working with PHP's DOMDocument.
*
* @property DOMXPath $xpath XPath query object for this document.
* @property Element $html The document's <html> element.
* @property Element $head The document's <head> element.
* @property Element $body The document's <body> element.
* @property Element|null $viewport The document's viewport meta element.
* @property DOMNodeList $ampElements The document's <amp-*> elements.
* @property Element $ampCustomStyle The document's <style amp-custom> element.
* @property int $ampCustomStyleByteCount Count of bytes of the CSS in the <style amp-custom> tag.
* @property int $inlineStyleByteCount Count of bytes of the CSS in all of the inline style attributes.
*
* @package ampproject/amp-toolbox
*/
final class Document extends DOMDocument
{
/**
* Default document type to use.
*
* @var string
*/
const DEFAULT_DOCTYPE = '<!DOCTYPE html>';
/**
* Regular expression to match the HTML doctype.
*
* @var string
*/
const HTML_DOCTYPE_REGEX_PATTERN = '#<!doctype\s+html[^>]+?>#si';
/**
* Attribute prefix for AMP-bind data attributes.
*
* @var string
*/
const AMP_BIND_DATA_ATTR_PREFIX = 'data-amp-bind-';
/**
* Pattern for HTML attribute accounting for binding attr name in square brackets syntax, boolean attribute,
* single/double-quoted attribute value, and unquoted attribute values.
*
* @var string
*/
const AMP_BIND_SQUARE_BRACKETS_ATTR_PATTERN = '#^\s+(?P<name>\[?[a-zA-Z0-9_\-]+\]?)'
. '(?P<value>=(?>"[^"]*+"|\'[^\']*+\'|[^\'"\s]+))?#';
/**
* Pattern for HTML attribute accounting for binding attr name in data attribute syntax, boolean attribute,
* single/double-quoted attribute value, and unquoted attribute values.
*
* @var string
*/
const AMP_BIND_DATA_ATTRIBUTE_ATTR_PATTERN = '#^\s+(?P<name>(?:'
. self::AMP_BIND_DATA_ATTR_PREFIX
. ')?[a-zA-Z0-9_\-]+)'
. '(?P<value>=(?>"[^"]*+"|\'[^\']*+\'|[^\'"\s]+))?#';
/**
* Match all start tags that contain a binding attribute in square brackets syntax.
*
* @var string
*/
const AMP_BIND_SQUARE_START_PATTERN = '#<'
. '(?P<name>[a-zA-Z0-9_\-]+)' // Tag name.
. '(?P<attrs>\s+' // Attributes.
. '(?>[^>"\'\[\]]+|"[^"]*+"|\'[^\']*+\')*+' // Non-binding attributes tokens.
. '\[[a-zA-Z0-9_\-]+\]' // One binding attribute key.
. '(?>[^>"\']+|"[^"]*+"|\'[^\']*+\')*+' // Any attribute tokens, including
// binding ones.
. ')>#s';
/**
* Match all start tags that contain a binding attribute in data attribute syntax.
*
* @var string
*/
const AMP_BIND_DATA_START_PATTERN = '#<'
. '(?P<name>[a-zA-Z0-9_\-]+)' // Tag name.
. '(?P<attrs>\s+' // Attributes.
. '(?>' // Match at least one attribute
. '(?>' // prefixed with "data-amp-bind-".
. '(?![a-zA-Z0-9_\-\s]*'
. self::AMP_BIND_DATA_ATTR_PREFIX
. '[a-zA-Z0-9_\-]+="[^"]*+"|\'[^\']*+\')'
. '[^>"\']+|"[^"]*+"|\'[^\']*+\''
. ')*+'
. '(?>[a-zA-Z0-9_\-\s]*'
. self::AMP_BIND_DATA_ATTR_PREFIX
. '[a-zA-Z0-9_\-]+'
. ')'
. ')+'
. '(?>[^>"\']+|"[^"]*+"|\'[^\']*+\')*+' // Any attribute tokens, including
// binding ones.
. ')>#is';
/*
* Regular expressions to fetch the individual structural tags.
* These patterns were optimized to avoid extreme backtracking on large documents.
*/
const HTML_STRUCTURE_DOCTYPE_PATTERN = '/^(?<doctype>[^<]*(?>\s*<!--.*?-->\s*)*<!doctype(?>\s+[^>]+)?>)/is';
const HTML_STRUCTURE_HTML_START_TAG = '/^(?<html_start>[^<]*(?>\s*<!--.*?-->\s*)*<html(?>\s+[^>]*)?>)/is';
const HTML_STRUCTURE_HTML_END_TAG = '/(?<html_end><\/html(?>\s+[^>]*)?>.*)$/is';
const HTML_STRUCTURE_HEAD_START_TAG = '/^[^<]*(?><!--.*?-->\s*)*(?><head(?>\s+[^>]*)?>)/is';
const HTML_STRUCTURE_BODY_START_TAG = '/^[^<]*(?><!--.*-->\s*)*(?><body(?>\s+[^>]*)?>)/is';
const HTML_STRUCTURE_BODY_END_TAG = '/(?><\/body(?>\s+[^>]*)?>.*)$/is';
const HTML_STRUCTURE_HEAD_TAG = '/^(?>[^<]*(?><head(?>\s+[^>]*)?>).*?<\/head(?>\s+[^>]*)?>)/is';
// Regex patterns used for securing and restoring the doctype node.
const HTML_SECURE_DOCTYPE_IF_NOT_FIRST_PATTERN = '/(^[^<]*(?>\s*<!--[^>]*>\s*)+<)(!)(doctype)(\s+[^>]+?)(>)/i';
const HTML_RESTORE_DOCTYPE_PATTERN = '/(^[^<]*(?>\s*<!--[^>]*>\s*)+<)'
. '(!--amp-)(doctype)(\s+[^>]+?)(-->)/i';
// Regex pattern used for removing Internet Explorer conditional comments.
const HTML_IE_CONDITIONAL_COMMENTS_PATTERN = '/<!--(?>\[if\s|<!\[endif)(?>[^>]+(?<!--)>)*(?>[^>]+(?<=--)>)/i';
/**
* Xpath query to fetch the attributes that are being URL-encoded by saveHTML().
*
* @var string
*/
const XPATH_URL_ENCODED_ATTRIBUTES_QUERY = './/*/@src|.//*/@href|.//*/@action';
/**
* Xpath query to fetch the elements containing Mustache templates (both <template type=amp-mustache> and
* <script type=text/plain template=amp-mustache>).
*
* @var string
*/
const XPATH_MUSTACHE_TEMPLATE_ELEMENTS_QUERY = './/self::template[ @type = "amp-mustache" ]'
. '|//self::script[ @type = "text/plain" '
. 'and @template = "amp-mustache" ]';
/**
* Error message to use when the __get() is triggered for an unknown property.
*
* @var string
*/
const PROPERTY_GETTER_ERROR_MESSAGE = 'Undefined property: AmpProject\\Dom\\Document::';
/**
* Charset compatibility tag for making DOMDocument behave.
*
* See: http://php.net/manual/en/domdocument.loadhtml.php#78243.
*
* @var string
*/
const HTTP_EQUIV_META_TAG = '<meta http-equiv="content-type" content="text/html; charset=utf-8">';
// Regex patterns and values used for adding and removing http-equiv charsets for compatibility.
// The opening tag pattern contains a comment to make sure we don't match a <head> tag within a comment.
const HTML_GET_HEAD_OPENING_TAG_PATTERN = '/(?><!--.*?-->\s*)*<head(?>\s+[^>]*)?>/is';
const HTML_GET_HEAD_OPENING_TAG_REPLACEMENT = '$0' . self::HTTP_EQUIV_META_TAG;
const HTML_GET_HTML_OPENING_TAG_PATTERN = '/(?><!--.*?-->\s*)*<html(?>\s+[^>]*)?>/is';
const HTML_GET_HTML_OPENING_TAG_REPLACEMENT = '$0<head>' . self::HTTP_EQUIV_META_TAG . '</head>';
const HTML_GET_HTTP_EQUIV_TAG_PATTERN = '#<meta http-equiv=([\'"])content-type\1 '
. 'content=([\'"])text/html; '
. 'charset=utf-8\2>#i';
const HTML_HTTP_EQUIV_VALUE = 'content-type';
const HTML_HTTP_EQUIV_CONTENT_VALUE = 'text/html; charset=utf-8';
// Regex patterns used for finding tags or extracting attribute values in an HTML string.
const HTML_FIND_TAG_WITHOUT_ATTRIBUTE_PATTERN = '/<%1$s[^>]*?>[^<]*(?><\/%1$s>)?/i';
const HTML_FIND_TAG_WITH_ATTRIBUTE_PATTERN = '/<%1$s [^>]*?\s*%2$s\s*=[^>]*?>[^<]*(?><\/%1$s>)?/i';
const HTML_EXTRACT_ATTRIBUTE_VALUE_PATTERN = '/%s=(?>([\'"])(?<full>.*)?\1|(?<partial>[^ \'";]+))/';
const HTML_FIND_TAG_DELIMITER = '/';
/**
* Pattern to match an AMP emoji together with its variant (amp4ads, amp4email, ...).
*
* @var string
*/
const AMP_EMOJI_ATTRIBUTE_PATTERN = '/<html\s([^>]*?(?:'
. Attribute::AMP_EMOJI_ALT
. '|'
. Attribute::AMP_EMOJI
. ')(4(?:ads|email))?[^>]*?)>/i';
// Attribute to use as a placeholder to move the emoji AMP symbol (⚡) over to DOM.
const EMOJI_AMP_ATTRIBUTE_PLACEHOLDER = 'emoji-amp';
// Patterns used for fixing the mangled encoding of src attributes with SVG data.
const I_AMPHTML_SIZER_REGEX_PATTERN = '/(?<before_src><i-amphtml-sizer\s+[^>]*>\s*<img\s+[^>]*?\s+src=([\'"]))'
. '(?<src>.*?)'
. '(?<after_src>\2><\/i-amphtml-sizer>)/i';
const SRC_SVG_REGEX_PATTERN = '/^\s*(?<type>[^<]+)(?<value><svg[^>]+>)\s*$/i';
/**
* XPath query to retrieve all <amp-*> tags, relative to the <body> node.
*
* @var string
*/
const XPATH_AMP_ELEMENTS_QUERY = ".//*[starts-with(name(), 'amp-')]";
/**
* XPath query to retrieve the <style amp-custom> tag, relative to the <head> node.
*
* @var string
*/
const XPATH_AMP_CUSTOM_STYLE_QUERY = './/style[@amp-custom]';
/**
* XPath query to fetch the inline style attributes, relative to the <body> node.
*
* @var string
*/
const XPATH_INLINE_STYLE_ATTRIBUTES_QUERY = './/@style';
/**
* Associative array of options to configure the behavior of the DOM document abstraction.
*
* @see Option::DEFAULTS For a list of available options.
*
* @var array
*/
private $options;
/**
* Whether `data-ampdevmode` was initially set on the the document element.
*
* @var bool
*/
private $hasInitialAmpDevMode = false;
/**
* The original encoding of how the AmpProject\Dom\Document was created.
*
* This is stored to do an automatic conversion to UTF-8, which is
* a requirement for AMP.
*
* @var string
*/
private $originalEncoding;
/**
* Store the <noscript> markup that was extracted to preserve it during parsing.
*
* The array keys are the element IDs for placeholder <meta> tags.
*
* @see maybeReplaceNoscriptElements()
* @see maybeRestoreNoscriptElements()
*
* @var string[]
*/
private $noscriptPlaceholderComments = [];
/**
* Store whether mustache template tags were replaced and need to be restored.
*
* @see replaceMustacheTemplateTokens()
*
* @var bool
*/
private $mustacheTagsReplaced = false;
/**
* Whether we had secured a doctype that needs restoring or not.
*
* This is an int as it receives the $count from the preg_replace().
*
* @var int
*/
private $securedDoctype = 0;
/**
* Whether the self-closing tags were transformed and need to be restored.
*
* This avoids duplicating this effort (maybe corrupting the DOM) on multiple calls to saveHTML().
*
* @var bool
*/
private $selfClosingTagsTransformed = false;
/**
* Store the emoji that was used to represent the AMP attribute.
*
* There are a few variations, so we want to keep track of this.
*
* @see https://github.com/ampproject/amphtml/issues/25990
*
* @var string
*/
private $usedAmpEmoji;
/**
* Store the current index by prefix.
*
* This is used to generate unique-per-prefix IDs.
*
* @var int[]
*/
private $indexCounter = [];
/**
* The maximum number of bytes of CSS that is enforced.
*
* A negative number will disable the byte count limit.
*
* @var int
*/
private $cssMaxByteCountEnforced = -1;
/**
* Store the names of the amp-bind attributes that were converted so that we can restore them later on.
*
* @var array<string>
*/
private $convertedAmpBindAttributes = [];
/**
* Creates a new AmpProject\Dom\Document object
*
* @link https://php.net/manual/domdocument.construct.php
*
* @param string $version Optional. The version number of the document as part of the XML declaration.
* @param string $encoding Optional. The encoding of the document as part of the XML declaration.
*/
public function __construct($version = '', $encoding = null)
{
$this->originalEncoding = (string)$encoding ?: Encoding::UNKNOWN;
parent::__construct($version ?: '1.0', Encoding::AMP);
$this->registerNodeClass(DOMElement::class, Element::class);
$this->options = Option::DEFAULTS;
}
/**
* Named constructor to provide convenient way of transforming HTML into DOM.
*
* @param string $html HTML to turn into a DOM.
* @param array|string $options Optional. Array of options to configure the document. Used as encoding if a string
* is passed. Defaults to an empty array.
* @return Document|false DOM generated from provided HTML, or false if the transformation failed.
*/
public static function fromHtml($html, $options = [])
{
// Assume options are the encoding if a string is passed, for BC reasons.
if (is_string($options)) {
$options = [Option::ENCODING => $options];
}
$encoding = isset($options[Option::ENCODING]) ? $options[Option::ENCODING] : null;
$dom = new self('', $encoding);
if (! $dom->loadHTML($html, $options)) {
return false;
}
return $dom;
}
/**
* Named constructor to provide convenient way of transforming a HTML fragment into DOM.
*
* The difference to Document::fromHtml() is that fragments are not normalized as to their structure.
*
* @param string $html HTML to turn into a DOM.
* @param array|string $options Optional. Array of options to configure the document. Used as encoding if a string
* is passed. Defaults to an empty array.
* @return Document|false DOM generated from provided HTML, or false if the transformation failed.
*/
public static function fromHtmlFragment($html, $options = [])
{
// Assume options are the encoding if a string is passed, for BC reasons.
if (is_string($options)) {
$options = [Option::ENCODING => $options];
}
$encoding = isset($options[Option::ENCODING]) ? $options[Option::ENCODING] : null;
$dom = new self('', $encoding);
if (! $dom->loadHTMLFragment($html, $options)) {
return false;
}
return $dom;
}
/**
* Named constructor to provide convenient way of retrieving the DOM from a node.
*
* @param DOMNode $node Node to retrieve the DOM from. This is being modified by reference (!).
* @return Document DOM generated from provided HTML, or false if the transformation failed.
*/
public static function fromNode(DOMNode &$node)
{
/**
* Document of the node.
*
* If the node->ownerDocument returns null, the node is the document.
*
* @var DOMDocument
*/
$root = $node->ownerDocument === null ? $node : $node->ownerDocument;
if ($root instanceof self) {
return $root;
}
$dom = new self();
// We replace the $node by reference, to make sure the next lines of code will
// work as expected with the new document.
// Otherwise $dom and $node would refer to two different DOMDocuments.
$node = $dom->importNode($node, true);
$dom->appendChild($node);
$dom->hasInitialAmpDevMode = $dom->documentElement->hasAttribute(DevMode::DEV_MODE_ATTRIBUTE);
return $dom;
}
/**
* Reset the internal optimizations of the Document object.
*
* This might be needed if you are doing an operation that causes the cached
* nodes and XPath objects to point to the wrong document.
*
* @return self Reset version of the Document object.
*/
private function reset()
{
// Drop references to old DOM document.
unset($this->xpath, $this->head, $this->body);
// Reference of the document itself doesn't change here, but might need to change in the future.
return $this;
}
/**
* Load HTML from a string.
*
* @link https://php.net/manual/domdocument.loadhtml.php
*
* @param string $source The HTML string.
* @param array|int|string $options Optional. Array of options to configure the document. Used as additional Libxml
* parameters if an int or string is passed. Defaults to an empty array.
* @return bool true on success or false on failure.
*/
public function loadHTML($source, $options = [])
{
$source = $this->normalizeDocumentStructure($source);
$success = $this->loadHTMLFragment($source, $options);
if ($success) {
$this->insertMissingCharset();
// Do some further clean-up.
$this->deduplicateTag(Tag::HEAD);
$this->deduplicateTag(Tag::BODY);
$this->moveInvalidHeadNodesToBody();
$this->movePostBodyNodesToBody();
$this->convertHeadProfileToLink();
}
return $success;
}
/**
* Load a HTML fragment from a string.
*
* @param string $source The HTML fragment string.
* @param array|int|string $options Optional. Array of options to configure the document. Used as additional Libxml
* parameters if an int or string is passed. Defaults to an empty array.
* @return bool true on success or false on failure.
*/
public function loadHTMLFragment($source, $options = [])
{
// Assume options are the additional libxml flags if a string or int is passed, for BC reasons.
if (is_string($options)) {
$options = (int) $options;
}
if (is_int($options)) {
$options = [Option::LIBXML_FLAGS => $options];
}
$this->options = array_merge($this->options, $options);
$this->reset();
$source = $this->convertAmpEmojiAttribute($source);
$source = $this->convertAmpBindAttributes($source);
$source = $this->replaceSelfClosingTags($source);
$source = $this->maybeReplaceNoscriptElements($source);
$source = $this->secureMustacheScriptTemplates($source);
$source = $this->secureDoctypeNode($source);
list($source, $this->originalEncoding) = $this->detectAndStripEncoding($source);
if (Encoding::AMP !== strtolower($this->originalEncoding)) {
$source = $this->adaptEncoding($source);
}
$source = $this->addHttpEquivCharset($source);
$libxml_previous_state = libxml_use_internal_errors(true);
$this->options[Option::LIBXML_FLAGS] |= LIBXML_COMPACT;
/*
* LIBXML_HTML_NODEFDTD is only available for libxml 2.7.8+.
* This should be the case for PHP 5.4+, but some systems seem to compile against a custom libxml version that
* is lower than expected.
*/
if (defined('LIBXML_HTML_NODEFDTD')) {
$this->options[Option::LIBXML_FLAGS] |= constant('LIBXML_HTML_NODEFDTD');
}
$success = parent::loadHTML($source, $this->options[Option::LIBXML_FLAGS]);
libxml_clear_errors();
libxml_use_internal_errors($libxml_previous_state);
if ($success) {
$this->normalizeHtmlAttributes();
$this->restoreMustacheScriptTemplates();
$this->maybeRestoreNoscriptElements();
// Remove http-equiv charset again.
$meta = $this->head->firstChild;
// We might have leading comments we need to preserve here.
while ($meta instanceof DOMComment) {
$meta = $meta->nextSibling;
}
if (
$meta instanceof Element
&& Tag::META === $meta->tagName
&& self::HTML_HTTP_EQUIV_VALUE === $meta->getAttribute(Attribute::HTTP_EQUIV)
&& (self::HTML_HTTP_EQUIV_CONTENT_VALUE) === $meta->getAttribute(Attribute::CONTENT)
) {
$this->head->removeChild($meta);
}
$this->hasInitialAmpDevMode = $this->documentElement->hasAttribute(DevMode::DEV_MODE_ATTRIBUTE);
}
return $success;
}
/**
* Dumps the internal document into a string using HTML formatting.
*
* @link https://php.net/manual/domdocument.savehtml.php
*
* @param DOMNode|null $node Optional. Parameter to output a subset of the document.
* @return string The HTML, or false if an error occurred.
*/
public function saveHTML(DOMNode $node = null)
{
return $this->saveHTMLFragment($node);
}
/**
* Dumps the internal document fragment into a string using HTML formatting.
*
* @param DOMNode|null $node Optional. Parameter to output a subset of the document.
* @return string The HTML fragment, or false if an error occurred.
*/
public function saveHTMLFragment(DOMNode $node = null)
{
$this->replaceMustacheTemplateTokens();
// Force-add http-equiv charset to make DOMDocument behave as it should.
// See: http://php.net/manual/en/domdocument.loadhtml.php#78243.
$charset = $this->createElement(Tag::META);
$charset->setAttribute(Attribute::HTTP_EQUIV, self::HTML_HTTP_EQUIV_VALUE);
$charset->setAttribute(Attribute::CONTENT, self::HTML_HTTP_EQUIV_CONTENT_VALUE);
$this->head->insertBefore($charset, $this->head->firstChild);
if (null === $node || PHP_VERSION_ID >= 70300) {
$html = parent::saveHTML($node);
} else {
$html = $this->extractNodeViaFragmentBoundaries($node);
}
// Remove http-equiv charset again.
// It is also removed from the DOM again in case saveHTML() is used multiple times.
$this->head->removeChild($charset);
$html = preg_replace(self::HTML_GET_HTTP_EQUIV_TAG_PATTERN, '', $html, 1);
$html = $this->restoreDoctypeNode($html);
$html = $this->restoreMustacheTemplateTokens($html);
$html = $this->restoreSelfClosingTags($html);
$html = $this->restoreAmpBindAttributes($html);
$html = $this->restoreAmpEmojiAttribute($html);
$html = $this->fixSvgSourceAttributeEncoding($html);
// Whitespace just causes unit tests to fail... so whitespace begone.
if ('' === trim($html)) {
return '';
}
return $html;
}
/**
* Get the current options of the Document instance.
*
* @return array
*/
public function getOptions()
{
return $this->options;
}
/**
* Add the required utf-8 meta charset tag if it is still missing.
*/
private function insertMissingCharset()
{
// Bail if a charset tag is already present.
if ($this->xpath->query('.//meta[ @charset ]')->item(0)) {
return;
}
$charset = $this->createElement(Tag::META);
$charset->setAttribute(Attribute::CHARSET, Encoding::AMP);
$this->head->insertBefore($charset, $this->head->firstChild);
}
/**
* Extract a node's HTML via fragment boundaries.
*
* Temporarily adds fragment boundary comments in order to locate the desired node to extract from
* the given HTML document. This is required because libxml seems to only preserve whitespace when
* serializing when calling DOMDocument::saveHTML() on the entire document. If you pass the element
* to DOMDocument::saveHTML() then formatting whitespace gets added unexpectedly. This is seen to
* be fixed in PHP 7.3, but for older versions of PHP the following workaround is needed.
*
* @param DOMNode $node Node to extract the HTML for.
* @return string Extracted HTML string.
*/
private function extractNodeViaFragmentBoundaries(DOMNode $node)
{
$boundary = $this->getUniqueId('fragment_boundary');
$startBoundary = $boundary . ':start';
$endBoundary = $boundary . ':end';
$commentStart = $this->createComment($startBoundary);
$commentEnd = $this->createComment($endBoundary);
$node->parentNode->insertBefore($commentStart, $node);
$node->parentNode->insertBefore($commentEnd, $node->nextSibling);
$pattern = '/^.*?'
. preg_quote("<!--{$startBoundary}-->", '/')
. '(.*)'
. preg_quote("<!--{$endBoundary}-->", '/')
. '.*?\s*$/s';
$html = preg_replace($pattern, '$1', parent::saveHTML());
$node->parentNode->removeChild($commentStart);
$node->parentNode->removeChild($commentEnd);
return $html;
}
/**
* Normalize the document structure.
*
* This makes sure the document adheres to the general structure that AMP requires:
* ```
* <!DOCTYPE html>
* <html>
* <head>
* <meta charset="utf-8">
* </head>
* <body>
* </body>
* </html>
* ```
*
* @param string $content Content to normalize the structure of.
* @return string Normalized content.
*/
private function normalizeDocumentStructure($content)
{
$matches = [];
$doctype = self::DEFAULT_DOCTYPE;
$htmlStart = '<html>';
$htmlEnd = '</html>';
// Strip IE conditional comments, which are supported by IE 5-9 only (which AMP doesn't support).
$content = preg_replace(self::HTML_IE_CONDITIONAL_COMMENTS_PATTERN, '', $content);
// Detect and strip <!doctype> tags.
if (preg_match(self::HTML_STRUCTURE_DOCTYPE_PATTERN, $content, $matches)) {
$doctype = $matches['doctype'];
$content = preg_replace(self::HTML_STRUCTURE_DOCTYPE_PATTERN, '', $content, 1);
}
// Detect and strip <html> tags.
if (preg_match(self::HTML_STRUCTURE_HTML_START_TAG, $content, $matches)) {
$htmlStart = $matches['html_start'];
$content = preg_replace(self::HTML_STRUCTURE_HTML_START_TAG, '', $content, 1);
preg_match(self::HTML_STRUCTURE_HTML_END_TAG, $content, $matches);
$htmlEnd = isset($matches['html_end']) ? $matches['html_end'] : $htmlEnd;
$content = preg_replace(self::HTML_STRUCTURE_HTML_END_TAG, '', $content, 1);
}
// Detect <head> and <body> tags and add as needed.
if (! preg_match(self::HTML_STRUCTURE_HEAD_START_TAG, $content, $matches)) {
if (! preg_match(self::HTML_STRUCTURE_BODY_START_TAG, $content, $matches)) {
// Both <head> and <body> missing.
$content = "<head></head><body>{$content}</body>";
} else {
// Only <head> missing.
$content = "<head></head>{$content}";
}
} elseif (! preg_match(self::HTML_STRUCTURE_BODY_END_TAG, $content, $matches)) {
// Only <body> missing.
// @todo This is an expensive regex operation, look into further optimization.
$content = preg_replace(self::HTML_STRUCTURE_HEAD_TAG, '$0<body>', $content, 1);
$content .= '</body>';
}
$content = "{$htmlStart}{$content}{$htmlEnd}";
// Reinsert a standard doctype (while preserving any potentially leading comments).
$doctype = preg_replace(self::HTML_DOCTYPE_REGEX_PATTERN, self::DEFAULT_DOCTYPE, $doctype);
$content = "{$doctype}{$content}";
return $content;
}
/**
* Normalize the structure of the document if it was already provided as a DOM.
*/
public function normalizeDomStructure()
{
if (! $this->documentElement) {
$this->appendChild($this->createElement(Tag::HTML));
}
if (Tag::HTML !== $this->documentElement->nodeName) {
$nextSibling = $this->documentElement->nextSibling;
/**
* The old document element that we need to remove and replace as we cannot just move it around.
*
* @var Element
*/
$oldDocumentElement = $this->removeChild($this->documentElement);
$html = $this->createElement(Tag::HTML);
$this->insertBefore($html, $nextSibling);
if ($oldDocumentElement->nodeName === Tag::HEAD) {
$head = $oldDocumentElement;
} else {
$head = $this->getElementsByTagName(Tag::HEAD)->item(0);
if (!$head) {
$head = $this->createElement(Tag::HEAD);
}
}
if (!$head instanceof Element) {
throw FailedToRetrieveRequiredDomElement::forHeadElement($head);
}
$this->head = $head;
$html->appendChild($this->head);
if ($oldDocumentElement->nodeName === Tag::BODY) {
$body = $oldDocumentElement;
} else {
$body = $this->getElementsByTagName(Tag::BODY)->item(0);
if (!$body) {
$body = $this->createElement(Tag::BODY);
}
}
if (!$body instanceof Element) {
throw FailedToRetrieveRequiredDomElement::forBodyElement($body);
}
$this->body = $body;
$html->appendChild($this->body);
if ($oldDocumentElement !== $this->body && $oldDocumentElement !== $this->head) {
$this->body->appendChild($oldDocumentElement);
}
} else {
$head = $this->getElementsByTagName(Tag::HEAD)->item(0);
if (!$head) {
$this->head = $this->createElement(Tag::HEAD);
$this->documentElement->insertBefore($this->head, $this->documentElement->firstChild);
}
$body = $this->getElementsByTagName(Tag::BODY)->item(0);
if (!$body) {
$this->body = $this->createElement(Tag::BODY);
$this->documentElement->appendChild($this->body);
}
}
$this->moveInvalidHeadNodesToBody();
$this->movePostBodyNodesToBody();
}
/**
* Normalizes HTML attributes to be HTML5 compatible.
*
* Conditionally removes html[xmlns], and converts html[xml:lang] to html[lang].
*/
private function normalizeHtmlAttributes()
{
if (! $this->html->hasAttributes()) {
return;
}
$xmlns = $this->html->attributes->getNamedItem('xmlns');
if ($xmlns instanceof DOMAttr && 'http://www.w3.org/1999/xhtml' === $xmlns->nodeValue) {
$this->html->removeAttributeNode($xmlns);
}
$xml_lang = $this->html->attributes->getNamedItem('xml:lang');
if ($xml_lang instanceof DOMAttr) {
$lang_node = $this->html->attributes->getNamedItem('lang');
if ((! $lang_node || ! $lang_node->nodeValue) && $xml_lang->nodeValue) {
// Move the html[xml:lang] value to html[lang].
$this->html->setAttribute('lang', $xml_lang->nodeValue);
}
$this->html->removeAttributeNode($xml_lang);
}
}
/**
* Move invalid head nodes back to the body.
*/
private function moveInvalidHeadNodesToBody()
{
// Walking backwards makes it easier to move elements in the expected order.
$node = $this->head->lastChild;
while ($node) {
$nextSibling = $node->previousSibling;
if (! $this->isValidHeadNode($node)) {
$this->body->insertBefore($this->head->removeChild($node), $this->body->firstChild);
}
$node = $nextSibling;
}
}
/**
* Converts a possible head[profile] attribute to link[rel=profile].
*
* The head[profile] attribute is only valid in HTML4, not HTML5.
* So if it exists and isn't empty, add it to the <head> as a link[rel=profile] and strip the attribute.
*/
private function convertHeadProfileToLink()
{
if (! $this->head->hasAttribute(Attribute::PROFILE)) {
return;
}
$profile = $this->head->getAttribute(Attribute::PROFILE);
if ($profile) {
$link = $this->createElement(Tag::LINK);
$link->setAttribute(Attribute::REL, Attribute::PROFILE);
$link->setAttribute(Attribute::HREF, $profile);
$this->head->appendChild($link);
}
$this->head->removeAttribute(Attribute::PROFILE);
}
/**
* Move any nodes appearing after </body> or </html> to be appended to the <body>.
*
* This accounts for markup that is output at shutdown, such markup from Query Monitor. Not only is elements after
* the </body> not valid in AMP, but trailing elements after </html> will get wrapped in additional <html> elements.
* While comment nodes would be allowed in AMP, everything is moved regardless so that source stack comments will
* retain their relative position with the element nodes they annotate.
*/
private function movePostBodyNodesToBody()
{
// Move nodes (likely comments) from after the </body>.
while ($this->body->nextSibling) {
$this->body->appendChild($this->body->nextSibling);
}
// Move nodes from after the </html>.
while ($this->documentElement->nextSibling) {
$nextSibling = $this->documentElement->nextSibling;
if ($nextSibling instanceof Element && Tag::HTML === $nextSibling->nodeName) {
// Handle trailing elements getting wrapped in implicit duplicate <html>.
while ($nextSibling->firstChild) {
$this->body->appendChild($nextSibling->firstChild);
}
$nextSibling->parentNode->removeChild($nextSibling); // Discard now-empty implicit <html>.
} else {
$this->body->appendChild($this->documentElement->nextSibling);
}
}
}
/**
* Force all self-closing tags to have closing tags.
*
* This is needed because DOMDocument isn't fully aware of these.
*
* @param string $html HTML string to adapt.
* @return string Adapted HTML string.
* @see restoreSelfClosingTags() Reciprocal function.
*
*/
private function replaceSelfClosingTags($html)
{
static $regexPattern = null;
if (null === $regexPattern) {
$regexPattern = '#<(' . implode('|', Tag::SELF_CLOSING_TAGS) . ')([^>]*?)(?>\s*(?<!\\\\)\/)?>(?!</\1>)#';
}
$this->selfClosingTagsTransformed = true;
return preg_replace($regexPattern, '<$1$2></$1>', $html);
}
/**
* Restore all self-closing tags again.
*
* @param string $html HTML string to adapt.
* @return string Adapted HTML string.
* @see replaceSelfClosingTags Reciprocal function.
*
*/
private function restoreSelfClosingTags($html)
{
static $regexPattern = null;
if (! $this->selfClosingTagsTransformed) {
return $html;
}
if (null === $regexPattern) {
$regexPattern = '#</(' . implode('|', Tag::SELF_CLOSING_TAGS) . ')>#i';
}
$this->selfClosingTagsTransformed = false;
return preg_replace($regexPattern, '', $html);
}
/**
* Maybe replace noscript elements with placeholders.
*
* This is done because libxml<2.8 might parse them incorrectly.
* When appearing in the head element, a noscript can cause the head to close prematurely
* and the noscript gets moved to the body and anything after it which was in the head.
* See <https://stackoverflow.com/questions/39013102/why-does-noscript-move-into-body-tag-instead-of-head-tag>.
* This is limited to only running in the head element because this is where the problem lies,
* and it is important for the AMP_Script_Sanitizer to be able to access the noscript elements
* in the body otherwise.
*
* @param string $html HTML string to adapt.
* @return string Adapted HTML string.
* @see maybeRestoreNoscriptElements() Reciprocal function.
*/
private function maybeReplaceNoscriptElements($html)
{
if (version_compare(LIBXML_DOTTED_VERSION, '2.8', '<')) {
$html = preg_replace_callback(
'#^.+?(?=<body)#is',
function ($headMatches) {
return preg_replace_callback(
'#<noscript[^>]*>.*?</noscript>#si',
function ($noscriptMatches) {
$id = $this->getUniqueId('noscript');
$this->noscriptPlaceholderComments[$id] = $noscriptMatches[0];
return sprintf('<meta class="noscript-placeholder" id="%s">', $id);
},
$headMatches[0]
);
},
$html
);
}
return $html;
}
/**
* Maybe restore noscript elements with placeholders.
*
* This is done because libxml<2.8 might parse them incorrectly.
* When appearing in the head element, a noscript can cause the head to close prematurely
* and the noscript gets moved to the body and anything after it which was in the head.
* See <https://stackoverflow.com/questions/39013102/why-does-noscript-move-into-body-tag-instead-of-head-tag>.
* This is limited to only running in the head element because this is where the problem lies,
* and it is important for the AMP_Script_Sanitizer to be able to access the noscript elements
* in the body otherwise.
*
* @see maybeReplaceNoscriptElements() Reciprocal function.
*/
private function maybeRestoreNoscriptElements()
{
foreach ($this->noscriptPlaceholderComments as $id => $noscriptHtmlFragment) {
$placeholderElement = $this->getElementById($id);
if (!$placeholderElement || !$placeholderElement->parentNode) {
continue;
}
$noscriptFragmentDocument = self::fromHtmlFragment($noscriptHtmlFragment);
if (!$noscriptFragmentDocument) {
continue;
}
$exportBody = $noscriptFragmentDocument->getElementsByTagName(Tag::BODY)->item(0);
if (!$exportBody) {
continue;
}
$importFragment = $this->createDocumentFragment();
while ($exportBody->firstChild) {
$importNode = $exportBody->removeChild($exportBody->firstChild);
$importNode = $this->importNode($importNode, true);
$importFragment->appendChild($importNode);
}
$placeholderElement->parentNode->replaceChild($importFragment, $placeholderElement);
}
}
/**
* Secures instances of script[template="amp-mustache"] by renaming element to tmp-script, as a workaround to a
* libxml parsing issue.
*
* This script can have closing tags of its children table and td stripped.
* So this changes its name from script to tmp-script to avoid this.
*
* @link https://github.com/ampproject/amp-wp/issues/4254
* @see restoreMustacheScriptTemplates() Reciprocal function.
*
* @param string $html To replace the tag name that contains the mustache templates.
* @return string The HTML, with the tag name of the mustache templates replaced.
*/
private function secureMustacheScriptTemplates($html)
{
return preg_replace(
'#<script(\s[^>]*?template=(["\']?)amp-mustache\2[^>]*)>(.*?)</script\s*?>#is',
'<tmp-script$1>$3</tmp-script>',
$html
);
}
/**
* Restores the tag names of script[template="amp-mustache"] elements that were replaced earlier.
*
* @see secureMustacheScriptTemplates() Reciprocal function.
*/
private function restoreMustacheScriptTemplates()
{
$tmp_script_elements = iterator_to_array($this->getElementsByTagName('tmp-script'));
foreach ($tmp_script_elements as $tmp_script_element) {
$script = $this->createElement(Tag::SCRIPT);
foreach ($tmp_script_element->attributes as $attr) {
$script->setAttribute($attr->nodeName, $attr->nodeValue);
}
while ($tmp_script_element->firstChild) {
$script->appendChild($tmp_script_element->firstChild);
}
$tmp_script_element->parentNode->replaceChild($script, $tmp_script_element);
}
}
/**
* Replace AMP binding attributes with something that libxml can parse (as HTML5 data-* attributes).
*
* This is necessary because attributes in square brackets are not understood in PHP and
* get dropped with an error raised:
* > Warning: DOMDocument::loadHTML(): error parsing attribute name
*
* @see restoreAmpBindAttributes() Reciprocal function.
* @link https://www.ampproject.org/docs/reference/components/amp-bind
*
* @param string $html HTML containing amp-bind attributes.
* @return string HTML with AMP binding attributes replaced with HTML5 data-* attributes.
*/
private function convertAmpBindAttributes($html)
{
/**
* Replace callback.
*
* @param array $tagMatches Tag matches.
* @return string Replacement.
*/
$replaceCallback = function ($tagMatches) {
// Strip the self-closing slash as long as it is not an attribute value, like for the href attribute.
$oldAttrs = rtrim(preg_replace('#(?<!=)/$#', '', $tagMatches['attrs']));
$newAttrs = '';
$offset = 0;
while (preg_match(self::AMP_BIND_SQUARE_BRACKETS_ATTR_PATTERN, substr($oldAttrs, $offset), $attrMatches)) {
$offset += strlen($attrMatches[0]);
if ('[' === $attrMatches['name'][0]) {
$attrName = trim($attrMatches['name'], '[]');
$newAttrs .= ' ' . self::AMP_BIND_DATA_ATTR_PREFIX . $attrName;
if (isset($attrMatches['value'])) {
$newAttrs .= $attrMatches['value'];
}
$this->convertedAmpBindAttributes[] = $attrName;
} else {
$newAttrs .= $attrMatches[0];
}
}
// Bail on parse error which occurs when the regex isn't able to consume the entire $newAttrs string.
if (strlen($oldAttrs) !== $offset) {
return $tagMatches[0];
}
return '<' . $tagMatches['name'] . $newAttrs . '>';
};
$converted = preg_replace_callback(
self::AMP_BIND_SQUARE_START_PATTERN,
$replaceCallback,
$html
);
/*
* If the regex engine incurred an error during processing, for example exceeding the backtrack
* limit, $converted will be null. In this case we return the originally passed document to allow
* DOMDocument to attempt to load it. If the AMP HTML doesn't make use of amp-bind or similar
* attributes, then everything should still work.
*
* See https://github.com/ampproject/amp-wp/issues/993 for additional context on this issue.
* See http://php.net/manual/en/pcre.constants.php for additional info on PCRE errors.
*/
return (null !== $converted) ? $converted : $html;
}
/**
* Convert AMP bind-attributes back to their original syntax.
*
* This is not guaranteed to produce the exact same result as the initial markup, as it is more of a best guess.
* It can end up replacing the wrong attributes if the initial markup had inconsistent styling, mixing both syntaxes
* for the same attribute. In either case, it will always produce working markup, so this is not that big of a deal.
*
* @see convertAmpBindAttributes() Reciprocal function.
* @link https://www.ampproject.org/docs/reference/components/amp-bind
*
* @param string $html HTML with amp-bind attributes converted.
* @return string HTML with amp-bind attributes restored.
*/
public function restoreAmpBindAttributes($html)
{
if ($this->options[Option::AMP_BIND_SYNTAX] === Option::AMP_BIND_SYNTAX_DATA_ATTRIBUTE) {
// All amp-bind attributes should remain in their converted data attribute form.
return $html;
}
if (
$this->options[Option::AMP_BIND_SYNTAX] === Option::AMP_BIND_SYNTAX_AUTO
&&
empty($this->convertedAmpBindAttributes)
) {
// Only previously converted amp-bind attributes should be restored, but none were converted.
return $html;
}
/**
* Replace callback.
*
* @param array $tagMatches Tag matches.
* @return string Replacement.
*/
$replaceCallback = function ($tagMatches) {
// Strip the self-closing slash as long as it is not an attribute value, like for the href attribute.
$oldAttrs = rtrim(preg_replace('#(?<!=)/$#', '', $tagMatches['attrs']));
$newAttrs = '';
$offset = 0;
while (preg_match(self::AMP_BIND_DATA_ATTRIBUTE_ATTR_PATTERN, substr($oldAttrs, $offset), $attrMatches)) {
$offset += strlen($attrMatches[0]);
$attrName = substr($attrMatches['name'], strlen(self::AMP_BIND_DATA_ATTR_PREFIX));
if (
$this->options[Option::AMP_BIND_SYNTAX] === Option::AMP_BIND_SYNTAX_SQUARE_BRACKETS
||
in_array($attrName, $this->convertedAmpBindAttributes, true)
) {
$attrValue = isset($attrMatches['value']) ? $attrMatches['value'] : '=""';
$newAttrs .= " [{$attrName}]{$attrValue}";
} else {
$newAttrs .= $attrMatches[0];
}
}
// Bail on parse error which occurs when the regex isn't able to consume the entire $newAttrs string.
if (strlen($oldAttrs) !== $offset) {
return $tagMatches[0];
}
return '<' . $tagMatches['name'] . $newAttrs . '>';
};
$converted = preg_replace_callback(
self::AMP_BIND_DATA_START_PATTERN,
$replaceCallback,
$html
);
/*
* If the regex engine incurred an error during processing, for example exceeding the backtrack
* limit, $converted will be null. In this case we return the originally passed document to allow
* DOMDocument to attempt to load it. If the AMP HTML doesn't make use of amp-bind or similar
* attributes, then everything should still work.
*
* See https://github.com/ampproject/amp-wp/issues/993 for additional context on this issue.
* See http://php.net/manual/en/pcre.constants.php for additional info on PCRE errors.
*/
return (null !== $converted) ? $converted : $html;
}
/**
* Adapt the encoding of the content.
*
* @param string $source Source content to adapt the encoding of.
* @return string Adapted content.
*/
private function adaptEncoding($source)
{
// No encoding was provided, so we need to guess.
if (Encoding::UNKNOWN === $this->originalEncoding && function_exists('mb_detect_encoding')) {
$this->originalEncoding = mb_detect_encoding($source, Encoding::DETECTION_ORDER, true);
}
// Guessing the encoding seems to have failed, so we assume UTF-8 instead.
// In my testing, this was not possible as long as one ISO-8859-x is in the detection order.
if (empty($this->originalEncoding)) {
$this->originalEncoding = Encoding::AMP; // @codeCoverageIgnore
}
$this->originalEncoding = $this->sanitizeEncoding($this->originalEncoding);
// Sanitization failed, so we do a last effort to auto-detect.
if (Encoding::UNKNOWN === $this->originalEncoding && function_exists('mb_detect_encoding')) {
$detectedEncoding = mb_detect_encoding($source, Encoding::DETECTION_ORDER, true);
if ($detectedEncoding !== false) {
$this->originalEncoding = $detectedEncoding;
}
}
$target = false;
if (Encoding::AMP !== strtolower($this->originalEncoding)) {
$target = function_exists('mb_convert_encoding')
? mb_convert_encoding($source, Encoding::AMP, $this->originalEncoding)
: false;
}
return false !== $target ? $target : $source;
}
/**
* Detect the encoding of the document.
*
* @param string $content Content of which to detect the encoding.
* @return array {
* Detected encoding of the document, or false if none.
*
* @type string $content Potentially modified content.
* @type string|false $encoding Encoding of the content, or false if not detected.
* }
*/
private function detectAndStripEncoding($content)
{
$encoding = $this->originalEncoding;
// Check for HTML 4 http-equiv meta tags.
foreach ($this->findTags($content, Tag::META, Attribute::HTTP_EQUIV) as $potentialHttpEquivTag) {
$encoding = $this->extractValue($potentialHttpEquivTag, Attribute::CHARSET);
if (false !== $encoding) {
$httpEquivTag = $potentialHttpEquivTag;
}
}
// Strip all charset tags.
if (isset($httpEquivTag)) {
$content = str_replace($httpEquivTag, '', $content);
}
// Check for HTML 5 charset meta tag. This overrides the HTML 4 charset.
$charsetTag = $this->findTag($content, Tag::META, Attribute::CHARSET);
if ($charsetTag) {
$encoding = $this->extractValue($charsetTag, Attribute::CHARSET);
// Strip the encoding if it is not the required one.
if (strtolower($encoding) !== Encoding::AMP) {
$content = str_replace($charsetTag, '', $content);
}
}
return [$content, $encoding];
}
/**
* Find a given tag with a given attribute.
*
* If multiple tags match, this method will only return the first one.
*
* @param string $content Content in which to find the tag.
* @param string $element Element of the tag.
* @param string $attribute Attribute that the tag contains.
* @return string[] The requested tags. Returns an empty array if none found.
*/
private function findTags($content, $element, $attribute = null)
{
$matches = [];
$pattern = empty($attribute)
? sprintf(
self::HTML_FIND_TAG_WITHOUT_ATTRIBUTE_PATTERN,
preg_quote($element, self::HTML_FIND_TAG_DELIMITER)
)
: sprintf(
self::HTML_FIND_TAG_WITH_ATTRIBUTE_PATTERN,
preg_quote($element, self::HTML_FIND_TAG_DELIMITER),
preg_quote($attribute, self::HTML_FIND_TAG_DELIMITER)
);
if (preg_match($pattern, $content, $matches)) {
return $matches;
}
return [];
}
/**
* Find a given tag with a given attribute.
*
* If multiple tags match, this method will only return the first one.
*
* @param string $content Content in which to find the tag.
* @param string $element Element of the tag.
* @param string $attribute Attribute that the tag contains.
* @return string|false The requested tag, or false if not found.
*/
private function findTag($content, $element, $attribute = null)
{
$matches = $this->findTags($content, $element, $attribute);
if (empty($matches)) {
return false;
}
return $matches[0];
}
/**
* Extract an attribute value from an HTML tag.
*
* @param string $tag Tag from which to extract the attribute.
* @param string $attribute Attribute of which to extract the value.
* @return string|false Extracted attribute value, false if not found.
*/
private function extractValue($tag, $attribute)
{
$matches = [];
$pattern = sprintf(
self::HTML_EXTRACT_ATTRIBUTE_VALUE_PATTERN,
preg_quote($attribute, self::HTML_FIND_TAG_DELIMITER)
);
if (preg_match($pattern, $tag, $matches)) {
return empty($matches['full']) ? $matches['partial'] : $matches['full'];
}
return false;
}
/**
* Sanitize the encoding that was detected.
*
* If sanitization fails, it will return 'auto', letting the conversion
* logic try to figure it out itself.
*
* @param string $encoding Encoding to sanitize.
* @return string Sanitized encoding. Falls back to 'auto' on failure.
*/
private function sanitizeEncoding($encoding)
{
$encoding = strtolower($encoding);
if ($encoding === Encoding::AMP) {
return $encoding;
}
if (! function_exists('mb_list_encodings')) {
return $encoding;
}
static $knownEncodings = null;
if (null === $knownEncodings) {
$knownEncodings = array_map('strtolower', mb_list_encodings());
}
if (array_key_exists($encoding, Encoding::MAPPINGS)) {
$encoding = Encoding::MAPPINGS[$encoding];
}
if (! in_array($encoding, $knownEncodings, true)) {
return Encoding::UNKNOWN;
}
return $encoding;
}
/**
* Replace Mustache template tokens to safeguard them from turning into HTML entities.
*
* Prevents amp-mustache syntax from getting URL-encoded in attributes when saveHTML is done.
* While this is applying to the entire document, it only really matters inside of <template>
* elements, since URL-encoding of curly braces in href attributes would not normally matter.
* But when this is done inside of a <template> then it breaks Mustache. Since Mustache
* is logic-less and curly braces are not unsafe for HTML, we can do a global replacement.
* The replacement is done on the entire HTML document instead of just inside of the <template>
* elements since it is faster and wouldn't change the outcome.
*
* @see restoreMustacheTemplateTokens() Reciprocal function.
*/
private function replaceMustacheTemplateTokens()
{
$templates = $this->xpath->query(self::XPATH_MUSTACHE_TEMPLATE_ELEMENTS_QUERY, $this->body);
if (0 === $templates->length) {
return;
}
$mustacheTagPlaceholders = $this->getMustacheTagPlaceholders();
foreach ($templates as $template) {
foreach ($this->xpath->query(self::XPATH_URL_ENCODED_ATTRIBUTES_QUERY, $template) as $attribute) {
$value = preg_replace_callback(
$this->getMustacheTagPattern(),
static function ($matches) use ($mustacheTagPlaceholders) {
return $mustacheTagPlaceholders[trim($matches[0])];
},
$attribute->nodeValue,
-1,
$count
);
if ($count) {
// Note we cannot do `$attribute->nodeValue = $value` because the PHP DOM will try to parse any
// entities. In the case of a URL value like '/foo/?bar=1&baz=2' the result is a warning for an
// unterminated entity reference "baz". When the attribute value is updated via setAttribute() this
// same problem does not occur, so that is why the following is used.
$attribute->parentNode->setAttribute($attribute->nodeName, $value);
$this->mustacheTagsReplaced = true;
}
}
}
}
/**
* Restore Mustache template tokens that were previously replaced.
*
* @param string $html HTML string to adapt.
* @return string Adapted HTML string.
* @see replaceMustacheTemplateTokens() Reciprocal function.
*
*/
private function restoreMustacheTemplateTokens($html)
{
if (! $this->mustacheTagsReplaced) {
return $html;
}
$mustacheTagPlaceholders = $this->getMustacheTagPlaceholders();
return str_replace(
$mustacheTagPlaceholders,
array_keys($mustacheTagPlaceholders),
$html
);
}
/**
* Get amp-mustache tag/placeholder mappings.
*
* @return string[] Mapping of mustache tag token to its placeholder.
* @see \wpdb::placeholder_escape()
*/
private function getMustacheTagPlaceholders()
{
static $placeholders = null;
if (null === $placeholders) {
$placeholders = [];
// Note: The order of these tokens is important, as it determines the order of the replacements.
$tokens = [
'{{{',
'}}}',
'{{#',
'{{^',
'{{/',
'{{',
'}}',
];
foreach ($tokens as $token) {
$placeholders[$token] = '_amp_mustache_' . md5(uniqid($token));
}
}
return $placeholders;
}
/**
* Get a regular expression that matches all amp-mustache tags while consuming whitespace.
*
* Removing whitespace is needed to avoid DOMDocument turning whitespace into entities, like %20 for spaces.
*
* @return string Regex pattern to match amp-mustache tags with whitespace.
*/
private function getMustacheTagPattern()
{
static $tagPattern = null;
if (null === $tagPattern) {
$delimiter = ':';
$tags = [];
foreach (array_keys($this->getMustacheTagPlaceholders()) as $token) {
if ('{' === $token[0]) {
$tags[] = preg_quote($token, $delimiter) . '\s*';
} else {
$tags[] = '\s*' . preg_quote($token, $delimiter);
}
}
$tagPattern = $delimiter . implode('|', $tags) . $delimiter;
}
return $tagPattern;
}
/**
* Covert the emoji AMP symbol (⚡) into pure text.
*
* The emoji symbol gets stripped by DOMDocument::loadHTML().
*
* @param string $source Source HTML string to convert the emoji AMP symbol in.
* @return string Adapted source HTML string.
*/
private function convertAmpEmojiAttribute($source)
{
$this->usedAmpEmoji = '';
return preg_replace_callback(
self::AMP_EMOJI_ATTRIBUTE_PATTERN,
function ($matches) {
// Split into individual attributes.
$attributes = array_map(
'trim',
array_filter(
preg_split(
'#(\s+[^"\'\s=]+(?:=(?:"[^"]+"|\'[^\']+\'|[^"\'\s]+))?)#',
$matches[1],
-1,
PREG_SPLIT_DELIM_CAPTURE
)
)
);
foreach ($attributes as $index => $attribute) {
$attributeMatches = [];
if (
preg_match(
'/^(' . Attribute::AMP_EMOJI_ALT . '|' . Attribute::AMP_EMOJI . ')(4(?:ads|email))?$/i',
$attribute,
$attributeMatches
)
) {
$this->usedAmpEmoji = $attributeMatches[1];
$variant = ! empty($attributeMatches[2]) ? $attributeMatches[2] : '';
$attributes[$index] = self::EMOJI_AMP_ATTRIBUTE_PLACEHOLDER . "=\"{$variant}\"";
break;
}
}
return '<html ' . implode(' ', $attributes) . '>';
},
$source,
1
);
}
/**
* Restore the emoji AMP symbol (⚡) from its pure text placeholder.
*
* @param string $html HTML string to restore the AMP emoji symbol in.
* @return string Adapted HTML string.
*/
private function restoreAmpEmojiAttribute($html)
{
if (empty($this->usedAmpEmoji)) {
return $html;
}
return preg_replace(
'/(<html\s[^>]*?)' . preg_quote(self::EMOJI_AMP_ATTRIBUTE_PLACEHOLDER, '/') . '="([^"]*)"/i',
'\1' . $this->usedAmpEmoji . '\2',
$html,
1
);
}
/**
* Secure the original doctype node.
*
* We need to keep elements around that were prepended to the doctype, like comment node used for source-tracking.
* As DOM_Document prepends a new doctype node and removes the old one if the first element is not the doctype, we
* need to ensure the original one is not stripped (by changing its node type) and restore it later on.
*
* @param string $html HTML string to adapt.
* @return string Adapted HTML string.
* @see restoreDoctypeNode() Reciprocal function.
*
*/
private function secureDoctypeNode($html)
{
return preg_replace(
self::HTML_SECURE_DOCTYPE_IF_NOT_FIRST_PATTERN,
'\1!--amp-\3\4-->',
$html,
1,
$this->securedDoctype
);
}
/**
* Restore the original doctype node.
*
* @param string $html HTML string to adapt.
* @return string Adapted HTML string.
* @see secureDoctypeNode() Reciprocal function.
*
*/
private function restoreDoctypeNode($html)
{
if (! $this->securedDoctype) {
return $html;
}
return preg_replace(self::HTML_RESTORE_DOCTYPE_PATTERN, '\1!\3\4>', $html, 1);
}
/**
* Process the HTML output string and tweak it as needed.
*
* @param string $html HTML output string to tweak.
* @return string Tweaked HTML output string.
*/
public function fixSvgSourceAttributeEncoding($html)
{
return preg_replace_callback(self::I_AMPHTML_SIZER_REGEX_PATTERN, [$this, 'adaptSizer'], $html);
}
/**
* Adapt the sizer element so that it validates against the AMP spec.
*
* @param array $matches Matches that the regular expression collected.
* @return string Adapted string to use as replacement.
*/
private function adaptSizer($matches)
{
$src = $matches['src'];
$src = htmlspecialchars_decode($src, ENT_NOQUOTES);
$src = preg_replace_callback(self::SRC_SVG_REGEX_PATTERN, [$this, 'adaptSvg'], $src);
return $matches['before_src'] . $src . $matches['after_src'];
}
/**
* Adapt the SVG syntax within the sizer element so that it validates against the AMP spec.
*
* @param array $matches Matches that the regular expression collected.
* @return string Adapted string to use as replacement.
*/
private function adaptSvg($matches)
{
return $matches['type'] . urldecode($matches['value']);
}
/**
* Deduplicate a given tag.
*
* This keeps the first tag as the main tag and moves over all child nodes and attribute nodes from any subsequent
* same tags over to remove them.
*
* @param string $tagName Name of the tag to deduplicate.
*/
public function deduplicateTag($tagName)
{
$tags = $this->getElementsByTagName($tagName);
/**
* Main tag to keep.
*
* @var Element|null $mainTag
*/
$mainTag = $tags->item(0);
if (null === $mainTag) {
return;
}
while ($tags->length > 1) {
/**
* Tag to remove.
*
* @var Element $tagToRemove
*/
$tagToRemove = $tags->item(1);
foreach ($tagToRemove->childNodes as $childNode) {
$mainTag->appendChild($childNode->parentNode->removeChild($childNode));
}
while ($tagToRemove->hasAttributes()) {
/**
* Attribute node to move over to the main tag.
*
* @var DOMAttr $attribute
*/
$attribute = $tagToRemove->attributes->item(0);
$tagToRemove->removeAttributeNode($attribute);
// @TODO This doesn't deal properly with attributes present on both tags. Maybe overkill to add?
// We could move over the copy_attributes from AMP_DOM_Utils to do this.
$mainTag->setAttributeNode($attribute);
}
$tagToRemove->parentNode->removeChild($tagToRemove);
}
// Avoid doing the above query again if possible.
if (in_array($tagName, [Tag::HEAD, Tag::BODY], true)) {
$this->$tagName = $mainTag;
}
}
/**
* Determine whether a node can be in the head.
*
* @link https://github.com/ampproject/amphtml/blob/445d6e3be8a5063e2738c6f90fdcd57f2b6208be/validator/engine/htmlparser.js#L83-L100
* @link https://www.w3.org/TR/html5/document-metadata.html
*
* @param DOMNode $node Node.
* @return bool Whether valid head node.
*/
public function isValidHeadNode(DOMNode $node)
{
return (
($node instanceof Element && in_array($node->nodeName, Tag::ELEMENTS_ALLOWED_IN_HEAD, true))
||
($node instanceof DOMText && preg_match('/^\s*$/', $node->nodeValue)) // Whitespace text nodes are OK.
||
$node instanceof DOMComment
);
}
/**
* Get auto-incremented ID unique to this class's instantiation.
*
* @param string $prefix Prefix.
* @return string ID.
*/
private function getUniqueId($prefix = '')
{
if (array_key_exists($prefix, $this->indexCounter)) {
++$this->indexCounter[$prefix];
} else {
$this->indexCounter[$prefix] = 0;
}
$uniqueId = (string)$this->indexCounter[$prefix];
if ($prefix) {
$uniqueId = "{$prefix}-{$uniqueId}";
}
return $uniqueId;
}
/**
* Get the ID for an element.
*
* If the element does not have an ID, create one first.
*
* @param Element $element Element to get the ID for.
* @param string $prefix Optional. The prefix to use (should not have a trailing dash). Defaults to 'i-amp-id'.
* @return string ID to use.
*/
public function getElementId(Element $element, $prefix = 'i-amp')
{
if ($element->hasAttribute(Attribute::ID)) {
return $element->getAttribute(Attribute::ID);
}
$id = $this->getUniqueId($prefix);
while ($this->getElementById($id) instanceof Element) {
$id = $this->getUniqueId($prefix);
}
$element->setAttribute(Attribute::ID, $id);
return $id;
}
/**
* Determine whether `data-ampdevmode` was initially set on the document element.
*
* @return bool
*/
public function hasInitialAmpDevMode()
{
return $this->hasInitialAmpDevMode;
}
/**
* Add style(s) to the <style amp-custom> tag.
*
* @param string $style Style to add.
* @throws MaxCssByteCountExceeded If the allowed max byte count is exceeded.
*/
public function addAmpCustomStyle($style)
{
$style = trim($style, CssRule::CSS_TRIM_CHARACTERS);
$existingStyle = (string)$this->ampCustomStyle->textContent;
// Inject new styles before any potential source map annotation comment like: /*# sourceURL=amp-custom.css */.
// If not present, then just put it at the end of the stylesheet. This isn't strictly required, but putting the
// source map comments at the end is the convention.
$newStyle = preg_replace(
':(?=\s+/\*#[^*]+?\*/\s*$|$):s',
$style,
$existingStyle,
1
);
$newByteCount = strlen($newStyle);
if ($this->getRemainingCustomCssSpace() < ($newByteCount - $this->ampCustomStyleByteCount)) {
throw MaxCssByteCountExceeded::forAmpCustom($newStyle);
}
$this->ampCustomStyle->textContent = $newStyle;
$this->ampCustomStyleByteCount = $newByteCount;
}
/**
* Add the given number of bytes ot the total inline style byte count.
*
* @param int $byteCount Bytes to add.
*/
public function addInlineStyleByteCount($byteCount)
{
$this->inlineStyleByteCount += $byteCount;
}
/**
* Get the remaining number bytes allowed for custom CSS.
*
* @return int
*/
public function getRemainingCustomCssSpace()
{
if ($this->cssMaxByteCountEnforced < 0) {
// No CSS byte count limit is being enforced, so return the next best thing to +∞.
return PHP_INT_MAX;
}
return max(
0,
$this->cssMaxByteCountEnforced - (int)$this->ampCustomStyleByteCount - (int)$this->inlineStyleByteCount
);
}
/**
* Magic getter to implement lazily-created, cached properties for the document.
*
* @param string $name Name of the property to get.
* @return mixed Value of the property, or null if unknown property was requested.
*/
public function __get($name)
{
switch ($name) {
case 'xpath':
$this->xpath = new DOMXPath($this);
return $this->xpath;
case Tag::HTML:
$html = $this->getElementsByTagName(Tag::HTML)->item(0);
if ($html === null) {
// Document was assembled manually and bypassed normalisation.
$this->normalizeDomStructure();
$html = $this->getElementsByTagName(Tag::HTML)->item(0);
}
if (!$html instanceof Element) {
throw FailedToRetrieveRequiredDomElement::forHtmlElement($html);
}
$this->html = $html;
return $this->html;
case Tag::HEAD:
$head = $this->getElementsByTagName(Tag::HEAD)->item(0);
if ($head === null) {
// Document was assembled manually and bypassed normalisation.
$this->normalizeDomStructure();
$head = $this->getElementsByTagName(Tag::HEAD)->item(0);
}
if (!$head instanceof Element) {
throw FailedToRetrieveRequiredDomElement::forHeadElement($head);
}
$this->head = $head;
return $this->head;
case Tag::BODY:
$body = $this->getElementsByTagName(Tag::BODY)->item(0);
if ($body === null) {
// Document was assembled manually and bypassed normalisation.
$this->normalizeDomStructure();
$body = $this->getElementsByTagName(Tag::BODY)->item(0);
}
if (!$body instanceof Element) {
throw FailedToRetrieveRequiredDomElement::forBodyElement($body);
}
$this->body = $body;
return $this->body;
case Attribute::VIEWPORT:
// This is not cached as it could potentially be requested too early, before the viewport was added, and
// the cache would then store null without rechecking later on after the viewport has been added.
for ($node = $this->head->firstChild; $node !== null; $node = $node->nextSibling) {
if (
$node instanceof Element
&& $node->tagName === Tag::META
&& $node->getAttribute(Attribute::NAME) === Attribute::VIEWPORT
) {
return $node;
}
}
return null;
case 'ampElements':
// This is not cached as we clone some elements during SSR transformations to avoid ending up with
// partially transformed, broken elements.
return $this->xpath->query(self::XPATH_AMP_ELEMENTS_QUERY, $this->body)
?: new DOMNodeList();
case 'ampCustomStyle':
$ampCustomStyle = $this->xpath->query(self::XPATH_AMP_CUSTOM_STYLE_QUERY, $this->head)->item(0);
if (!$ampCustomStyle instanceof Element) {
$ampCustomStyle = $this->createElement(Tag::STYLE);
$ampCustomStyle->appendChild($this->createAttribute(Attribute::AMP_CUSTOM));
$this->head->appendChild($ampCustomStyle);
}
$this->ampCustomStyle = $ampCustomStyle;
return $this->ampCustomStyle;
case 'ampCustomStyleByteCount':
if (!isset($this->ampCustomStyle)) {
$ampCustomStyle = $this->xpath->query(self::XPATH_AMP_CUSTOM_STYLE_QUERY, $this->head)->item(0);
if (!$ampCustomStyle instanceof Element) {
return 0;
} else {
$this->ampCustomStyle = $ampCustomStyle;
}
}
if (!isset($this->ampCustomStyleByteCount)) {
$this->ampCustomStyleByteCount = strlen($this->ampCustomStyle->textContent);
}
return $this->ampCustomStyleByteCount;
case 'inlineStyleByteCount':
if (!isset($this->inlineStyleByteCount)) {
$this->inlineStyleByteCount = 0;
$attributes = $this->xpath->query(self::XPATH_INLINE_STYLE_ATTRIBUTES_QUERY, $this->body);
foreach ($attributes as $attribute) {
$this->inlineStyleByteCount += strlen($attribute->textContent);
}
}
return $this->inlineStyleByteCount;
}
// Mimic regular PHP behavior for missing notices.
trigger_error(self::PROPERTY_GETTER_ERROR_MESSAGE . $name, E_USER_NOTICE);
return null;
}
/**
* Make sure we properly reinitialize on clone.
*
* @return void
*/
public function __clone()
{
$this->reset();
}
/**
* Create new element node.
*
* @link https://php.net/manual/domdocument.createelement.php
*
* This override only serves to provide the correct object type-hint for our extended Dom/Element class.
*
* @param string $name The tag name of the element.
* @param string $value Optional. The value of the element. By default, an empty element will be created.
* You can also set the value later with Element->nodeValue.
* @return Element|false A new instance of class Element or false if an error occurred.
*/
public function createElement($name, $value = null)
{
$element = parent::createElement($name, $value);
if (!$element instanceof Element) {
return false;
}
return $element;
}
/**
* Create new element node.
*
* @link https://php.net/manual/domdocument.createelement.php
*
* This override only serves to provide the correct object type-hint for our extended Dom/Element class.
*
* @param string $name The tag name of the element.
* @param array $attributes Attributes to add to the newly created element.
* @param string $value Optional. The value of the element. By default, an empty element will be created.
* You can also set the value later with Element->nodeValue.
* @return Element|false A new instance of class Element or false if an error occurred.
*/
public function createElementWithAttributes($name, $attributes, $value = null)
{
$element = parent::createElement($name, $value);
if (!$element instanceof Element) {
return false;
}
$element->setAttributes($attributes);
return $element;
}
/**
* Check whether the CSS maximum byte count is enforced.
*
* @return bool Whether the CSS maximum byte count is enforced.
*/
public function isCssMaxByteCountEnforced()
{
return $this->cssMaxByteCountEnforced >= 0;
}
/**
* Enforce a maximum number of bytes for the CSS.
*
* @param int $maxByteCount Maximum number of bytes to limit the CSS to. A negative number disables the limit.
*/
public function enforceCssMaxByteCount($maxByteCount = Amp::MAX_CSS_BYTE_COUNT)
{
$this->cssMaxByteCountEnforced = $maxByteCount;
}
/**
* Add a http-equiv charset meta tag to the document's <head> node.
*
* This is needed to make the DOMDocument behave as it should in terms of encoding.
* See: http://php.net/manual/en/domdocument.loadhtml.php#78243.
*
* @param string $html HTML string to add the http-equiv charset to.
* @return string Adapted string of HTML.
*/
private function addHttpEquivCharset($html)
{
$count = 0;
// We try first to detect an existing <head> node.
$html = preg_replace(
self::HTML_GET_HEAD_OPENING_TAG_PATTERN,
self::HTML_GET_HEAD_OPENING_TAG_REPLACEMENT,
$html,
1,
$count
);
// If no <head> was found, we look for the <html> tag instead.
if ($count < 1) {
$html = preg_replace(
self::HTML_GET_HTML_OPENING_TAG_PATTERN,
self::HTML_GET_HTML_OPENING_TAG_REPLACEMENT,
$html,
1,
$count
);
}
// Finally, we just prepend the head with the required http-equiv charset.
if ($count < 1) {
$html = '<head>' . self::HTTP_EQUIV_META_TAG . '</head>' . $html;
}
return $html;
}
}