You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
187 lines
4.0 KiB
187 lines
4.0 KiB
5 years ago
|
<?php
|
||
|
|
||
|
/**
 * Debug switch for the parser. When truthy, Html::__construct() echoes the raw
 * HTML it was given, and DomQuery::find() echoes the search context (no match)
 * or every matched node (multiple matches) before throwing.
 */
const PARSER_DEBUG = 0;
|
||
|
/**
 * Debug switch for XPath. When truthy, DomQuery::x() echoes each XPath
 * expression right before it is evaluated.
 */
const XPATH_DEBUG = 0;
|
||
|
|
||
|
/**
 * Trait DomQuery
 *
 * Small CSS-selector-like query layer on top of DOMXPath.
 *
 * The using class must provide:
 *
 *   $this->dom         - the DOMDocument that queries run against
 *   $this->contextnode - DOMNode that scopes the search, or null to search
 *                        the whole document
 *   toXml()            - string dump of the context (only called when
 *                        PARSER_DEBUG is on)
 */
trait DomQuery
{
    /**
     * Find exactly one node matching the pattern.
     *
     * @param string $pat selector, see findAll() for the supported forms
     * @return Node the single match
     * @throws \Exception when nothing matches, when more than one node
     *                    matches, or when the pattern is not recognised
     */
    public function find(string $pat) : Node
    {
        $els = $this->findAll($pat);
        $total = count($els);

        if ($total === 0) {
            if (PARSER_DEBUG) {
                echo "---- match failed; context: ---\n";
                echo $this->toXml() ."\n";
            }

            throw new \Exception("No match: $pat");
        }

        if ($total > 1) {
            if (PARSER_DEBUG) {
                echo "Query results:\n";
                foreach ($els as $el) {
                    echo $el->toXml()."\n";
                }
            }

            throw new \Exception("Multiple match (".$total."x): $pat");
        }

        return $els[0];
    }

    /**
     * Find every node matching a simple selector.
     *
     * Supported forms (a leading element name is optional where shown):
     *
     *   node                 element name
     *   .class, node.class   whitespace-separated class token
     *   #id                  id attribute
     *   [attr=v]             exact attribute value (value may be quoted)
     *   [attr^=v]            attribute starts with v
     *   [attr$=v]            attribute ends with v
     *   [attr*=v], [attr~=v] attribute contains v (note: ~= is treated as a
     *                        plain substring test, not as a word match)
     *   [attr]               attribute is present
     *
     * @param string $pat
     * @return array|Node[]
     * @throws \Exception when the pattern matches none of the forms above
     */
    public function findAll(string $pat) : array
    {
        // node
        if (preg_match('/^([a-z0-9_-]+)$/i', $pat, $matches)) {
            return $this->x("//{$matches[1]}");
        }

        // .class, node.class
        if (preg_match('/^(?P<elem>[a-z0-9_-]*)\.(?P<cls>[a-z0-9_-]+)$/i', $pat, $matches)) {
            $elem = $matches['elem'] ?: '*';
            // Pad both sides with spaces so only whole class tokens match.
            return $this->x("//{$elem}[contains(concat(' ',normalize-space(@class),' '),' {$matches['cls']} ')]");
        }

        // #id  (dashes are allowed; the character class keeps quotes out of the XPath literal)
        if (preg_match('/^#([\w-]+)$/', $pat, $matches)) {
            // "@id" tests the attribute; a bare "id" would test a child element named <id>.
            return $this->x("//*[@id='{$matches[1]}']");
        }

        // [attr=value], node[attr=value] with optional ^ $ * ~ operator (allows quotes)
        if (preg_match('/^(?P<elem>[a-z0-9_-]*)\[(?P<attr>[a-z0-9_-]+)(?P<op>[$*~^]|)=[\'"]?(?P<val>[^\'"\]]+)[\'"]?\]$/', $pat, $matches)) {
            $elem = $matches['elem'] ?: '*';
            $attr = $matches['attr'];
            // The regex excludes quotes from the value, so it is safe inside '...'.
            $val = $matches['val'];

            switch ($matches['op']) {
                case '':
                    return $this->x("//{$elem}[@{$attr}='{$val}']");
                case '^':
                    return $this->x("//{$elem}[starts-with(@{$attr}, '{$val}')]");
                case '$':
                    // XPath 1.0 has no ends-with(); compare the tail of the attribute.
                    // substring() is 1-based, hence the "+ 1". Lengths are computed
                    // inside XPath so they are counted in characters, not bytes.
                    return $this->x("//{$elem}[substring(@{$attr}, string-length(@{$attr}) - string-length('{$val}') + 1) = '{$val}']");
                case '*':
                case '~':
                    return $this->x("//{$elem}[contains(@{$attr}, '{$val}')]");
            }
        }

        // [attr], node[attr]
        if (preg_match('/^(?P<elem>[a-z0-9_-]*)\[(?P<attr>[a-z0-9_-]+)\]$/', $pat, $matches)) {
            $elem = $matches['elem'] ?: '*';
            return $this->x("//{$elem}[@{$matches['attr']}]");
        }

        throw new \Exception("Unknown pattern: $pat");
    }

    /**
     * Run a raw XPath expression and wrap each resulting node in a Node.
     *
     * @param string $x XPath expression; when it starts with "//" and a
     *                  context node is set, it is rewritten to ".//" so the
     *                  search stays inside that node
     * @return array|Node[] empty when nothing matches or the query fails
     */
    public function x(string $x) : array
    {
        $xpath = new \DOMXPath($this->dom);

        // "//foo" always searches from the document root, even with a context
        // node; ".//foo" restricts it to the context node's descendants.
        if (strpos($x, '//') === 0 && $this->contextnode) {
            $x = '.' . $x;
        }

        if (XPATH_DEBUG) echo "\nxpath is: $x\n";

        $elements = $xpath->query($x, $this->contextnode);

        // query() yields false (not null) on a malformed expression or invalid
        // context node, so test the type instead of relying on "??".
        if (!($elements instanceof \DOMNodeList)) {
            return [];
        }

        $elems = [];
        foreach ($elements as $e) {
            $elems[] = new Node($this->dom, $e);
        }
        return $elems;
    }
}
|
||
|
|
||
|
|
||
|
/**
 * Parses an HTML string into a DOMDocument and exposes the DomQuery search
 * methods over the whole document (no context node).
 */
class Html
{
    use DomQuery;

    /**
     * Parsed document; read by DomQuery. Public because it was previously a
     * public dynamic property.
     *
     * @var DOMDocument
     */
    public $dom;

    /**
     * Always null for Html, so DomQuery searches the entire document.
     *
     * @var DOMNode|null
     */
    public $contextnode;

    /**
     * @param string $html markup to parse; an empty string yields an empty document
     */
    public function __construct(string $html)
    {
        $dom = new DOMDocument();

        if (PARSER_DEBUG) echo "Creating HTML parser from:\n" . $html . "\n\n";

        // loadHTML() rejects an empty string (warning or ValueError depending
        // on the PHP version), so leave the document empty in that case.
        if ($html !== '') {
            // Collect libxml's spammy parse warnings internally instead of
            // silencing every error on the line with "@".
            $previous = libxml_use_internal_errors(true);
            try {
                $dom->loadHTML($html);
            } finally {
                if (!$previous) {
                    // We turned buffering on ourselves, so drop what we collected.
                    libxml_clear_errors();
                }
                libxml_use_internal_errors($previous);
            }
        }

        $this->dom = $dom;
        $this->contextnode = null;
    }

    /**
     * Serialise the whole document.
     *
     * @return string
     */
    public function toXml() : string
    {
        return $this->dom->saveXML();
    }
}
|
||
|
|
||
|
|
||
|
/**
 * Wrapper around a single DOMNode. DomQuery searches started from a Node are
 * scoped to that node, and element attributes can be read as properties
 * (e.g. $node->href).
 */
class Node
{
    use DomQuery;

    // These three are declared public because they used to be public dynamic
    // properties; keeping them public also stops __get() from intercepting them.

    /** @var DOMDocument document passed to the constructor; read by DomQuery */
    public $dom;

    /** @var DOMNode the wrapped node */
    public $element;

    /** @var DOMNode same node as $element; read by DomQuery to scope searches */
    public $contextnode;

    /**
     * @param DOMDocument $dom     document used for XPath queries
     * @param DOMNode     $element node to wrap
     */
    public function __construct(DOMDocument $dom, DOMNode $element)
    {
        $this->dom = $dom;
        $this->element = $element;
        $this->contextnode = $element;
    }

    /**
     * Read an attribute of the wrapped element via property syntax.
     *
     * @param string $name attribute name
     * @return string attribute value; '' when the wrapped node is not an
     *                element (e.g. a text node from childNodes())
     */
    public function __get($name)
    {
        // Only DOMElement has getAttribute(); text and comment nodes do not.
        if (!($this->element instanceof DOMElement)) {
            return '';
        }
        return $this->element->getAttribute($name);
    }

    /**
     * @return string the node's nodeValue, or '' when it is null
     */
    public function text() : string
    {
        return $this->element->nodeValue ?? '';
    }

    /**
     * Wrap every direct child (including text nodes) in a Node.
     *
     * @return array|Node[]
     */
    public function childNodes() : array
    {
        $elems = [];
        foreach ($this->element->childNodes as $e) {
            $elems[] = new Node($this->dom, $e);
        }
        return $elems;
    }

    /**
     * Return the only direct child.
     *
     * @return Node
     * @throws \Exception when the node has no children or more than one
     */
    public function childNode() : Node
    {
        $cn = $this->childNodes();
        if (count($cn) > 1) {
            throw new \Exception("More than one childnode.");
        }
        if (count($cn) === 0) {
            // Without this guard $cn[0] is undefined and the return type fails.
            throw new \Exception("No childnode.");
        }
        return $cn[0];
    }

    /**
     * Serialise just this node using its owner document.
     *
     * @return string
     */
    public function toXml() : string
    {
        return $this->element->ownerDocument->saveXML($this->element);
    }
}
|