dom = $dom; $this->contextnode = null; */
trait DomQuery
{
    /**
     * Find exactly one node matching a simple CSS-like selector.
     *
     * Delegates to findAll() and insists on a single result. When the
     * PARSER_DEBUG constant (expected to be defined elsewhere) is truthy,
     * diagnostic output is echoed before the exception is thrown.
     *
     * @param string $pat Selector; see findAll() for the supported forms.
     * @return Node The single matching node.
     * @throws \Exception When nothing matches, when more than one node
     *                    matches, or when the selector is not understood.
     */
    public function find(string $pat) : Node
    {
        $els = $this->findAll($pat);
        $count = count($els);

        if ($count === 0) {
            if (PARSER_DEBUG) {
                echo "---- match failed; context: ---\n";
                echo $this->toXml() ."\n";
            }
            throw new \Exception("No match: $pat");
        }

        if ($count > 1) {
            if (PARSER_DEBUG) {
                echo "Query results:\n";
                foreach ($els as $el) {
                    echo $el->toXml()."\n";
                }
            }
            throw new \Exception("Multiple match ({$count}x): $pat");
        }

        return $els[0];
    }

    /**
     * Find all nodes matching a simple CSS-like selector.
     *
     * Supported forms (each translated to an XPath query run through x()):
     *   - node
     *   - .class, node.class
     *   - #id
     *   - [attr=value], node[attr=value]  (value may be quoted)
     *   - [attr^=value]  attribute starts with value
     *   - [attr$=value]  attribute ends with value
     *   - [attr*=value], [attr~=value]  attribute contains value
     *     (note: "~=" is treated as a plain substring match here, not as
     *     CSS whitespace-separated word matching)
     *   - [attr], node[attr]
     *
     * @param string $pat
     * @return array|Node[]
     * @throws \Exception When the selector matches none of the supported forms.
     */
    public function findAll(string $pat) : array
    {
        // node
        if (preg_match('/^([a-z0-9_-]+)$/i', $pat, $matches)) {
            return $this->x("//{$matches[1]}");
        }

        // .class, node.class
        if (preg_match('/^(?P<elem>[a-z0-9_-]*)\.(?P<cls>[a-z0-9_-]+)$/i', $pat, $matches)) {
            $elem = $matches['elem'] ?: '*';
            // Pad with spaces so only whole class tokens match.
            return $this->x("//{$elem}[contains(concat(' ',normalize-space(@class),' '),' {$matches['cls']} ')]");
        }

        // #id
        if (preg_match('/^#([\w-]+)$/', $pat, $matches)) {
            // "@id" selects the attribute; a bare "id" would test a child element named "id".
            return $this->x("//*[@id='{$matches[1]}']");
        }

        // [attr=value], node[attr=value] and the ^= $= *= ~= variants (allows quotes)
        if (preg_match('/^(?P<elem>[a-z0-9_-]*)\[(?P<attr>[a-z0-9_-]+)(?P<op>[$*~^]|)=[\'"]?(?P<val>[^\'"\]]+)[\'"]?\]$/', $pat, $matches)) {
            $elem = $matches['elem'] ?: '*';
            $attr = $matches['attr'];
            $val = $matches['val']; // the regex excludes quotes, so it is safe inside '...'

            switch ($matches['op']) {
                case '':
                    return $this->x("//{$elem}[@{$attr}='{$val}']");

                case '^':
                    return $this->x("//{$elem}[starts-with(@{$attr}, '{$val}')]");

                case '$':
                    // XPath 1.0 has no ends-with(); substring() is 1-based, hence the "+ 1".
                    // Lengths are computed inside XPath so they are counted in characters, not bytes.
                    return $this->x("//{$elem}[substring(@{$attr}, string-length(@{$attr}) - string-length('{$val}') + 1) = '{$val}']");

                case '*':
                case '~':
                    return $this->x("//{$elem}[contains(@{$attr}, '{$val}')]");
            }
        }

        // [attr], node[attr]
        if (preg_match('/^(?P<elem>[a-z0-9_-]*)\[(?P<attr>[a-z0-9_-]+)\]$/', $pat, $matches)) {
            $elem = $matches['elem'] ?: '*';
            return $this->x("//{$elem}[@{$matches['attr']}]");
        }

        throw new \Exception("Unknown pattern: $pat");
    }

    /**
     * Run a raw XPath expression and wrap every result in a Node.
     *
     * When a context node is set and the expression starts with "//", a "."
     * is prepended so the search stays below the context node instead of
     * covering the whole document. When the XPATH_DEBUG constant (expected
     * to be defined elsewhere) is truthy, the final expression is echoed.
     *
     * @param string $x XPath expression.
     * @return array|Node[] Matching nodes; empty when nothing matches or the
     *                      expression could not be evaluated.
     */
    public function x(string $x) : array
    {
        $xpath = new DOMXPath($this->dom);

        if (strpos($x, '//') === 0 && $this->contextnode) {
            $x = '.' . $x;
        }

        if (XPATH_DEBUG) {
            echo "\nxpath is: $x\n";
        }

        $elements = $xpath->query($x, $this->contextnode);
        if ($elements === false) {
            // query() signals a bad expression/context with false, which "??" would not catch.
            return [];
        }

        $elems = [];
        foreach ($elements as $e) {
            $elems[] = new Node($this->dom, $e);
        }

        return $elems;
    }
}

/**
 * Entry point: parses an HTML string and exposes the DomQuery selectors
 * over the whole document (no context node).
 */
class Html
{
    use DomQuery;

    /** @var DOMDocument Parsed document that queries run against. */
    public $dom;

    /** @var DOMNode|null Always null here: queries cover the whole document. */
    public $contextnode;

    /**
     * @param string $html HTML markup to parse. An empty string yields an
     *                     empty document rather than an error.
     */
    public function __construct(string $html)
    {
        $dom = new DOMDocument();

        if (PARSER_DEBUG) {
            echo "Creating HTML parser from:\n" . $html . "\n\n";
        }

        if ($html !== '') {
            // Collect libxml's spammy markup warnings internally instead of
            // blanket-suppressing every error with "@".
            $previous = libxml_use_internal_errors(true);
            $dom->loadHTML($html);
            libxml_clear_errors();
            libxml_use_internal_errors($previous);
        }

        $this->dom = $dom;
        $this->contextnode = null;
    }

    /**
     * @return string The whole document serialized via DOMDocument::saveXML().
     */
    public function toXml() : string
    {
        return $this->dom->saveXml();
    }
}

/**
 * Wrapper around a single DOM node. Queries made through DomQuery are
 * evaluated relative to this node.
 */
class Node
{
    use DomQuery;

    /** @var DOMDocument Document the wrapped node is queried against. */
    public $dom;

    /** @var DOMNode Same node as $element; used by DomQuery as the query context. */
    public $contextnode;

    /** @var DOMNode The wrapped node. */
    public $element;

    public function __construct(DOMDocument $dom, DOMNode $element)
    {
        $this->dom = $dom;
        $this->element = $element;
        $this->contextnode = $element;
    }

    /**
     * Magic attribute access: $node->href reads the "href" attribute.
     *
     * @param string $name Attribute name.
     * @return string Attribute value; '' when the wrapped node is not an
     *                element (e.g. a text node returned by childNodes()).
     */
    public function __get($name)
    {
        if (!$this->element instanceof \DOMElement) {
            // Only elements have getAttribute(); calling it on other node types would be fatal.
            return '';
        }

        return $this->element->getAttribute($name);
    }

    /**
     * @return string The node's nodeValue, or '' when it is null.
     */
    public function text() : string
    {
        return $this->element->nodeValue ?? '';
    }

    /**
     * Wrap every direct child node (of any node type) in a Node.
     *
     * @return array|Node[]
     */
    public function childNodes() : array
    {
        $elems = [];
        foreach ($this->element->childNodes as $e) {
            $elems[] = new Node($this->dom, $e);
        }

        return $elems;
    }

    /**
     * Return the only child node.
     *
     * @return Node
     * @throws \Exception When the node has no children or more than one child.
     */
    public function childNode() : Node
    {
        $cn = $this->childNodes();

        if (count($cn) > 1) {
            throw new \Exception("More than one childnode.");
        }

        if (count($cn) === 0) {
            // Without this guard $cn[0] is undefined and the Node return type fails.
            throw new \Exception("No childnode.");
        }

        return $cn[0];
    }

    /**
     * @return string This node serialized through its owner document.
     */
    public function toXml() : string
    {
        return $this->element->ownerDocument->saveXml($this->element);
    }
}