Stahovač archivu článků časopisu Vesmír (vesmir.cz). Vyžaduje aktivní předplatné a jméno/heslo, jinak budou některé články neúplné.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
vesmir-scraper/parse.inc

187 lines
4.0 KiB

5 years ago
<?php
/**
 * Debug switches for the HTML parsing helpers in this file.
 * Both are off (0) by default; set to a truthy value to enable the output.
 */
const
    // When truthy: echo the HTML handed to the parser and dump context/results
    // whenever a find() call fails to produce exactly one match.
    PARSER_DEBUG = 0,
    // When truthy: echo every XPath expression right before it is evaluated.
    XPATH_DEBUG = 0;
/**
 * Trait DomQuery
 *
 * Adds a small CSS-like selector API (find / findAll) and raw XPath access (x)
 * to a class that wraps a DOM document or a single node of one.
 *
 * The using class must provide:
 *   $this->dom         - the DOMDocument that is queried
 *   $this->contextnode - DOMNode that queries are scoped to, or null for the whole document
 *   toXml() : string   - serialisation used for debug output when PARSER_DEBUG is on
 */
trait DomQuery
{
    /**
     * Return the single node matching a selector.
     *
     * @param string $pat selector in any form accepted by findAll()
     * @return Node the one and only matching node
     * @throws \Exception when nothing matches, when more than one node matches,
     *                    or when the selector syntax is not supported
     */
    public function find(string $pat) : Node
    {
        $els = $this->findAll($pat);
        $count = count($els);

        if ($count === 0) {
            if (PARSER_DEBUG) {
                echo "---- match failed; context: ---\n";
                echo $this->toXml() ."\n";
            }
            throw new \Exception("No match: $pat");
        }

        if ($count > 1) {
            if (PARSER_DEBUG) {
                echo "Query results:\n";
                foreach ($els as $el) {
                    echo $el->toXml()."\n";
                }
            }
            throw new \Exception("Multiple match ({$count}x): $pat");
        }

        return $els[0];
    }

    /**
     * Return every descendant node matching a simple CSS-like selector.
     *
     * Supported forms (exactly one simple selector, no combinators):
     *   node                          element name
     *   .class, node.class            whitespace-separated class token
     *   #id                           id attribute
     *   [attr=value], node[attr=value]    exact attribute value (quotes optional)
     *   [attr^=value]                 attribute value starts with value
     *   [attr$=value]                 attribute value ends with value
     *   [attr*=value], [attr~=value]  attribute value contains value
     *                                 (note: ~= is a plain substring test here,
     *                                 not the CSS word match)
     *   [attr], node[attr]            attribute is present
     *
     * @param string $pat selector
     * @return array|Node[]
     * @throws \Exception when the selector matches none of the supported forms
     */
    public function findAll(string $pat) : array
    {
        // node
        if (preg_match('/^([a-z0-9_-]+)$/i', $pat, $matches)) {
            return $this->x("//$matches[1]");
        }

        // .class, node.class
        if (preg_match('/^(?P<elem>[a-z0-9_-]*)\.(?P<cls>[a-z0-9_-]+)$/i', $pat, $matches)) {
            $elem = $matches['elem'] ?: '*';
            // Pad the normalised class list with spaces so that only whole tokens match.
            return $this->x("//{$elem}[contains(concat(' ',normalize-space(@class),' '),' $matches[cls] ')]");
        }

        // #id (the attribute must be addressed as @id; a bare "id" would test a child element)
        if (preg_match('/^#([\w-]+)$/', $pat, $matches)) {
            return $this->x("//*[@id='$matches[1]']");
        }

        // [attr=value], node[attr=value] with optional operator prefix (allows quotes)
        if (preg_match('/^(?P<elem>[a-z0-9_-]*)\[(?P<attr>[a-z0-9_-]+)(?P<op>[$*~^]|)=[\'"]?(?P<val>[^\'"\]]+)[\'"]?\]$/', $pat, $matches)) {
            $elem = $matches['elem'] ?: '*';
            $attr = $matches['attr'];
            // The regex forbids quotes in the value, so it is safe inside '...' in XPath.
            $val = $matches['val'];

            switch ($matches['op']) {
                case '':
                    return $this->x("//{$elem}[@{$attr}='{$val}']");

                case '^':
                    return $this->x("//{$elem}[starts-with(@{$attr}, '{$val}')]");

                case '$':
                    // XPath 1.0 has no ends-with(); compare the trailing substring instead.
                    // substring() is 1-based, hence the "+ 1". Lengths are computed by
                    // XPath itself so multi-byte characters are counted consistently.
                    return $this->x("//{$elem}[substring(@{$attr}, string-length(@{$attr}) - string-length('{$val}') + 1) = '{$val}']");

                case '*':
                case '~':
                    return $this->x("//{$elem}[contains(@{$attr}, '{$val}')]");
            }
        }

        // [attr], node[attr]
        if (preg_match('/^(?P<elem>[a-z0-9_-]*)\[(?P<attr>[a-z0-9_-]+)\]$/', $pat, $matches)) {
            $elem = $matches['elem'] ?: '*';
            return $this->x("//{$elem}[@$matches[attr]]");
        }

        throw new \Exception("Unknown pattern: $pat");
    }

    /**
     * Evaluate a raw XPath expression and wrap the resulting nodes.
     *
     * When a context node is set, an expression starting with "//" is rewritten
     * to ".//" so that it searches only below that node instead of the whole document.
     *
     * @param string $x XPath 1.0 expression
     * @return array|Node[] matching nodes; empty when nothing matches or when
     *                      the expression is rejected by DOMXPath::query()
     */
    public function x(string $x) : array
    {
        $xpath = new DOMXPath($this->dom);

        if ($this->contextnode && strpos($x, '//') === 0) {
            $x = '.' . $x;
        }

        if (XPATH_DEBUG) echo "\nxpath is: $x\n";

        $elements = $xpath->query($x, $this->contextnode);
        if ($elements === false) {
            // query() returns false (not null) for a malformed expression or an
            // invalid context node; treat that as "no results".
            return [];
        }

        $elems = [];
        foreach ($elements as $e) {
            $elems[] = new Node($this->dom, $e);
        }
        return $elems;
    }
}
/**
 * A parsed HTML document that can be queried through the DomQuery selectors.
 * Queries are not scoped to any node, i.e. they cover the whole document.
 */
class Html
{
    use DomQuery;

    /** @var DOMDocument the parsed document all queries run against */
    public $dom;

    /** @var DOMNode|null always null here: there is no context node at document level */
    public $contextnode;

    /**
     * Parse an HTML string.
     *
     * Markup problems reported by libxml are discarded. An empty string yields
     * an empty document (on which every query simply finds nothing) rather than
     * being passed to DOMDocument::loadHTML(), which rejects empty input.
     *
     * @param string $html HTML source
     */
    public function __construct(string $html)
    {
        if (PARSER_DEBUG) echo "Creating HTML parser from:\n" . $html . "\n\n";

        $dom = new DOMDocument();
        if ($html !== '') {
            // Collect libxml's spammy markup warnings internally instead of
            // emitting them, then drop them and restore the previous setting.
            $previous = libxml_use_internal_errors(true);
            $dom->loadHTML($html);
            libxml_clear_errors();
            libxml_use_internal_errors($previous);
        }

        $this->dom = $dom;
        $this->contextnode = null;
    }

    /**
     * Serialise the whole document as XML.
     *
     * @return string
     */
    public function toXml() : string
    {
        return $this->dom->saveXml();
    }
}
/**
 * Wrapper around a single DOM node. DomQuery selectors called on it are scoped
 * to the wrapped node, and attributes can be read as properties ($node->href).
 */
class Node
{
    use DomQuery;

    /** @var DOMDocument document handed to the constructor; used to build XPath queries */
    public $dom;

    /** @var DOMNode the wrapped node */
    public $element;

    /** @var DOMNode same node as $element; DomQuery scopes its queries to it */
    public $contextnode;

    /**
     * @param DOMDocument $dom     document used for XPath queries
     * @param DOMNode     $element node to wrap
     */
    public function __construct(DOMDocument $dom, DOMNode $element)
    {
        $this->dom = $dom;
        $this->element = $element;
        $this->contextnode = $element;
    }

    /**
     * Magic read access: any property that is not declared on this class is
     * looked up as an attribute of the wrapped element.
     *
     * @param string $name attribute name
     * @return string attribute value as returned by DOMElement::getAttribute();
     *                '' when the wrapped node is not an element (e.g. a text
     *                node obtained through childNodes()) and so has no attributes
     */
    public function __get($name)
    {
        if (!($this->element instanceof DOMElement)) {
            return '';
        }
        return $this->element->getAttribute($name);
    }

    /**
     * Text value of the node (DOM nodeValue), or '' when it has none.
     *
     * @return string
     */
    public function text() : string
    {
        return $this->element->nodeValue ?? '';
    }

    /**
     * Wrap every direct child reported by the DOM (elements as well as other
     * node types) in a Node.
     *
     * @return array|Node[]
     */
    public function childNodes() : array
    {
        $elems = [];
        foreach ($this->element->childNodes as $e) {
            $elems[] = new Node($this->dom, $e);
        }
        return $elems;
    }

    /**
     * Return the only child of this node.
     *
     * @return Node
     * @throws \Exception when the node has no children or more than one child
     */
    public function childNode() : Node
    {
        $cn = $this->childNodes();
        if (count($cn) > 1) {
            throw new \Exception("More than one childnode.");
        }
        if (!$cn) {
            // Without this guard $cn[0] would be undefined and the return type
            // check would fail with a TypeError instead of a catchable Exception.
            throw new \Exception("No childnode.");
        }
        return $cn[0];
    }

    /**
     * Serialise this node (and its subtree) as XML.
     *
     * @return string
     */
    public function toXml() : string
    {
        return $this->element->ownerDocument->saveXml($this->element);
    }
}