// Debug switches; set to a truthy value to echo diagnostics to standard output.
// PARSER_DEBUG: the Html constructor dumps the raw HTML it is about to parse.
const PARSER_DEBUG = 0;
// XPATH_DEBUG: DomQuery::x() prints every XPath expression before evaluating it.
const XPATH_DEBUG = 0;
/**
 * Trait DomQuery
 *
 * Small CSS-selector-like lookup layer on top of DOMXPath.
 *
 * The using class must provide:
 *   $this->dom          the DOMDocument that queries are evaluated against
 *   $this->contextnode  the DOMNode queries are relative to, or null to search
 *                       the whole document
 *   toXml(): string     used by find() to dump the search context on failure
 */
trait DomQuery
{
    /**
     * Find exactly one node matching the pattern.
     *
     * On failure, diagnostic output (the search context, or every match) is
     * echoed before the exception is thrown.
     *
     * @param string $pat selector pattern, see findAll()
     * @return Node the single matching node
     * @throws \Exception when nothing matches, when more than one node matches,
     *                    or when the pattern is not supported
     */
    public function find(string $pat) : Node
    {
        $els = $this->findAll($pat);
        $count = count($els);

        if ($count === 0) {
            echo "---- match failed; context: ---\n";
            echo $this->toXml() ."\n";
            throw new \Exception("No match: $pat");
        }

        if ($count > 1) {
            echo "Query results:\n";
            foreach ($els as $el) {
                echo $el->toXml()."\n";
            }
            throw new \Exception("Multiple match ({$count}x): $pat");
        }

        return $els[0];
    }

    /**
     * Find all nodes matching a simple selector pattern.
     *
     * Supported patterns:
     *   node                      element name
     *   .class, node.class        whitespace-separated class token
     *   #id                       id attribute
     *   [attr=v],  node[attr=v]   attribute equals v (quotes optional)
     *   [attr^=v], node[attr^=v]  attribute starts with v
     *   [attr$=v], node[attr$=v]  attribute ends with v
     *   [attr*=v], node[attr*=v]  attribute contains v
     *   [attr~=v], node[attr~=v]  treated as "contains v" (substring match,
     *                             not the CSS whitespace-separated word match)
     *   [attr],    node[attr]     attribute is present
     *
     * @param string $pat
     * @return array|Node[]
     * @throws \Exception when the pattern is not one of the supported forms
     */
    public function findAll(string $pat) : array
    {
        // node
        if (preg_match('/^([a-z0-9_-]+)$/i', $pat, $matches)) {
            return $this->x("//$matches[1]");
        }

        // .class, node.class
        if (preg_match('/^(?P<elem>[a-z0-9_-]*)\.(?P<cls>[a-z0-9_-]+)$/i', $pat, $matches)) {
            $elem = $matches['elem'] ?: '*';
            $cls = $matches['cls'];
            // Pad both sides with spaces so only whole class tokens match.
            return $this->x("//{$elem}[contains(concat(' ',normalize-space(@class),' '),' {$cls} ')]");
        }

        // #id (hyphens allowed; the test is on the @id attribute, not a child element named "id")
        if (preg_match('/^#([\w-]+)$/', $pat, $matches)) {
            return $this->x("//*[@id='$matches[1]']");
        }

        // [attr=value], node[attr=value] and the ^= $= *= ~= variants (allows quotes)
        if (preg_match('/^(?P<elem>[a-z0-9_-]*)\[(?P<attr>[a-z0-9_-]+)(?P<op>[$*~^]|)=[\'"]?(?P<val>[^\'"\]]+)[\'"]?\]$/', $pat, $matches)) {
            $elem = $matches['elem'] ?: '*';
            $attr = '@' . $matches['attr'];
            // The regex excludes quotes from the value, so it is safe inside an XPath string literal.
            $val = "'" . $matches['val'] . "'";

            switch ($matches['op']) {
                case '':
                    return $this->x("//{$elem}[{$attr}={$val}]");
                case '^':
                    return $this->x("//{$elem}[starts-with({$attr}, {$val})]");
                case '$':
                    // XPath 1.0 has no ends-with(); compare the trailing substring instead.
                    // substring() positions are 1-based, hence the "+ 1".
                    return $this->x("//{$elem}[substring({$attr}, string-length({$attr}) - string-length({$val}) + 1)={$val}]");
                case '*':
                case '~':
                    return $this->x("//{$elem}[contains({$attr}, {$val})]");
            }
        }

        // [attr], node[attr]
        if (preg_match('/^(?P<elem>[a-z0-9_-]*)\[(?P<attr>[a-z0-9_-]+)\]$/', $pat, $matches)) {
            $elem = $matches['elem'] ?: '*';
            return $this->x("//{$elem}[@$matches[attr]]");
        }

        throw new \Exception("Unknown pattern: $pat");
    }

    /**
     * Evaluate a raw XPath expression and wrap every result in a Node.
     *
     * An expression starting with "//" is made relative to the context node
     * (by prefixing ".") when a context node is set.
     *
     * @param string $x XPath expression
     * @return array|Node[] matching nodes; empty when nothing matches or the
     *                      expression could not be evaluated
     */
    public function x(string $x) : array
    {
        $xpath = new \DOMXPath($this->dom);

        if (strpos($x, '//') === 0 && $this->contextnode) {
            $x = '.' . $x;
        }

        if (XPATH_DEBUG) {
            echo "\nxpath is: $x\n";
        }

        $elements = $xpath->query($x, $this->contextnode);
        if ($elements === false) {
            // query() returns false (not null) for a malformed expression or
            // invalid context node; treat that as "no results".
            return [];
        }

        $elems = [];
        foreach ($elements as $e) {
            $elems[] = new Node($this->dom, $e);
        }
        return $elems;
    }
}
/**
 * An HTML document parsed into a DOMDocument, queryable through DomQuery.
 */
class Html
{
    use DomQuery;

    /** @var \DOMDocument the parsed document */
    public $dom;

    /** @var \DOMNode|null always null here, so queries search the whole document */
    public $contextnode;

    /**
     * Parse an HTML string.
     *
     * Parser diagnostics are collected by libxml and discarded instead of being
     * emitted as PHP warnings. An empty string yields an empty document.
     *
     * @param string $html HTML markup to parse
     */
    public function __construct(string $html)
    {
        $dom = new \DOMDocument();
        if (PARSER_DEBUG) {
            echo "Creating HTML parser from:\n" . $html . "\n\n";
        }

        // loadHTML() rejects an empty source, so skip parsing in that case.
        if ($html !== '') {
            $previous = libxml_use_internal_errors(true); // suppress spammy warnings
            try {
                $dom->loadHTML($html);
            } finally {
                // Drop the collected diagnostics and restore the caller's setting.
                libxml_clear_errors();
                libxml_use_internal_errors($previous);
            }
        }

        $this->dom = $dom;
        $this->contextnode = null;
    }

    /**
     * Serialize the whole document as XML.
     *
     * @return string the XML, or an empty string if serialization fails
     */
    public function toXml() : string
    {
        $xml = $this->dom->saveXML();
        return $xml === false ? '' : $xml;
    }
}
/**
 * Wrapper around a single DOMNode; DomQuery lookups are relative to that node.
 */
class Node
{
    use DomQuery;

    /** @var \DOMDocument document passed to the constructor */
    public $dom;

    /** @var \DOMNode the wrapped node */
    public $element;

    /** @var \DOMNode same node as $element; DomQuery searches relative to it */
    public $contextnode;

    /**
     * @param \DOMDocument $dom     document used for XPath evaluation
     * @param \DOMNode     $element node to wrap
     */
    public function __construct(\DOMDocument $dom, \DOMNode $element)
    {
        $this->dom = $dom;
        $this->element = $element;
        $this->contextnode = $element;
    }

    /**
     * Read an attribute of the wrapped node as if it were a property.
     *
     * @param string $name attribute name
     * @return string the attribute value; an empty string when the wrapped
     *                node is not an element (e.g. a text node from childNodes())
     */
    public function __get($name)
    {
        // Only DOMElement has getAttribute(); other node types would be a fatal error.
        if (!$this->element instanceof \DOMElement) {
            return '';
        }
        return $this->element->getAttribute($name);
    }

    /**
     * @return string the node's nodeValue, or an empty string when it is null
     */
    public function text() : string
    {
        return $this->element->nodeValue ?? '';
    }

    /**
     * Wrap every direct child node (whatever its node type) in a Node.
     *
     * @return array|Node[]
     */
    public function childNodes() : array
    {
        $elems = [];
        foreach ($this->element->childNodes as $e) {
            $elems[] = new Node($this->dom, $e);
        }
        return $elems;
    }

    /**
     * Return the only child node.
     *
     * @return Node
     * @throws \Exception when the node has no children or more than one child
     */
    public function childNode() : Node
    {
        $cn = $this->childNodes();
        $count = count($cn);

        if ($count > 1) {
            throw new \Exception("More than one childnode.");
        }
        if ($count === 0) {
            // Without this guard $cn[0] is undefined and the Node return type fails.
            throw new \Exception("No childnode.");
        }

        return $cn[0];
    }

    /**
     * Serialize the wrapped node as XML.
     *
     * @return string the XML, or an empty string if serialization fails
     */
    public function toXml() : string
    {
        // ownerDocument can be null; fall back to the document given at construction.
        $doc = $this->element->ownerDocument ?? $this->dom;
        $xml = $doc->saveXML($this->element);
        return $xml === false ? '' : $xml;
    }
}