commit 4760eaf987667c00328b9397b89a1a362e0813c8 Author: Ondřej Hruška Date: Mon Jun 10 11:53:03 2019 +0200 Initial commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..46be262 --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +.idea/ +out/ +cookie.txt +cookie-wget.txt +vesmir.cz diff --git a/http.inc b/http.inc new file mode 100644 index 0000000..f0dec4a --- /dev/null +++ b/http.inc @@ -0,0 +1,97 @@ +content); +} + +function get_file($url) { + return get($url)->content; +} + +function get_or_post($url, $mergeoptions) { + $options = array( + CURLOPT_USERAGENT => UA, //set user agent + CURLOPT_COOKIEFILE => "cookie.txt", //set cookie file + CURLOPT_COOKIEJAR => "cookie.txt", //set cookie jar + CURLOPT_COOKIESESSION => false, + CURLOPT_RETURNTRANSFER => true, // return web page + CURLOPT_HEADER => false, // don't return headers + CURLOPT_FOLLOWLOCATION => true, // follow redirects + CURLOPT_ENCODING => "", // handle all encodings + CURLOPT_AUTOREFERER => true, // set referer on redirect + CURLOPT_CONNECTTIMEOUT => 120, // timeout on connect + CURLOPT_TIMEOUT => 120, // timeout on response + CURLOPT_MAXREDIRS => 10, // stop after 10 redirects + ); + + foreach ($mergeoptions as $k => $v) { + $options[$k] = $v; + } + + $ch = curl_init($url); + curl_setopt_array($ch, $options); + + // this function is called by curl for each header received + $response_headers = []; + curl_setopt($ch, CURLOPT_HEADERFUNCTION, + function($curl, $header) use (&$response_headers) + { + $len = strlen($header); + $header = explode(':', $header, 2); + if (count($header) < 2) // ignore invalid headers + return $len; + + $name = strtolower(trim($header[0])); + if (!array_key_exists($name, $response_headers)) + $response_headers[$name] = [trim($header[1])]; + else + $response_headers[$name][] = trim($header[1]); + + return $len; + } + ); + + $content = curl_exec($ch); + $err = curl_errno($ch); + $errmsg = curl_error($ch); + $header = curl_getinfo($ch); + curl_close($ch); + + $header['errno'] = $err; + $header['errmsg'] = $errmsg; + $header['headers'] = $response_headers; + +// echo "Result:\n"; +// print_r($header); + + $header['content'] = $content; + + if ($header['http_code'] != 200) { + print_r($header); + throw new \Exception("Error status: $header[http_code]"); + } + + return (object) $header; +} + +function get($url) +{ + echo "Sending GET to: $url\n"; + return get_or_post($url, [ + CURLOPT_CUSTOMREQUEST => "GET", //set request type post or get + CURLOPT_POST => false, //set to GET + ]); +} + +function post($url, $fields) +{ + echo "Sending POST to: $url\n"; + print_r($fields); + return get_or_post($url, [ + CURLOPT_CUSTOMREQUEST => "POST", //set request type post or get + CURLOPT_POST => true, //set to GET + CURLOPT_POSTFIELDS => $fields, + ]); +} diff --git a/parse.inc b/parse.inc new file mode 100644 index 0000000..6875401 --- /dev/null +++ b/parse.inc @@ -0,0 +1,186 @@ +dom = $dom; + $this->contextnode = null; + */ +trait DomQuery { + public function find(string $pat) : Node + { + $els = $this->findAll($pat); + if (!count($els)) { + if (PARSER_DEBUG) { + echo "---- match failed; context: ---\n"; + echo $this->toXml() ."\n"; + } + + throw new \Exception("No match: $pat"); + } + if (count($els) > 1) { + if (PARSER_DEBUG) { + echo "Query results:\n"; + foreach ($els as $el) { + echo $el->toXml()."\n"; + } + } + throw new \Exception("Multiple match (".count($els)."x): $pat"); + } + return $els[0]; + } + + /** + * @param $pat + * @return array|Node[] + * @throws Exception + */ + public function findAll(string $pat) : array + { + # node + if (preg_match('/^([a-z0-9_-]+)$/i', $pat, $matches)) { + return $this->x("//$matches[1]"); + } + + # .class, node.class + if (preg_match('/^(?P[a-z0-9_-]*)\.(?P[a-z0-9_-]+)$/i', $pat, $matches)) { + $elem = $matches['elem'] ?: '*'; + return $this->x("//{$elem}[contains(concat(' ',normalize-space(@class),' '),' $matches[cls] ')]"); + } + + // #id + if (preg_match('/^#(\w+)$/', $pat, $matches)) { + return $this->x("//*[id='$matches[1]']"); + } + + # [attr=value], node[attr=value] (allows quotes) + if (preg_match('/^(?P[a-z0-9_-]*)\[(?P[a-z0-9_-]+)(?P[$*~^]|)=[\'"]?(?P[^\'"\]]+)[\'"]?\]$/', $pat, $matches)) { + $elem = $matches['elem'] ?: '*'; + $op = $matches['op']; + + switch ($op) { + case '': + return $this->x("//{$elem}[@$matches[attr]='$matches[val]']"); + case '^': + return $this->x("//{$elem}[starts-with(@$matches[attr], '$matches[val]')]"); + // this doesnt work.. +// case '$': +// $vlen = strlen($matches['val']); +// return $this->x("//{$elem}['$matches[val]' = substring(@$matches[attr], string-length(@$matches[attr]) - $vlen)]"); + case '*': + case '~': + return $this->x("//{$elem}[contains(@$matches[attr], '$matches[val]')]"); + } + } + + # [attr^=value], node[attr^=value] (allows quotes) + if (preg_match('/^(?P[a-z0-9_-]*)\[(?P[a-z0-9_-]+)^=[\'"]?(?P[^\'"\]]+)[\'"]?\]$/', $pat, $matches)) { + $elem = $matches['elem'] ?: '*'; + return $this->x("//{$elem}[@$matches[attr]='$matches[val]']"); + } + + # [attr], node[attr] + if (preg_match('/^(?P[a-z0-9_-]*)\[(?P[a-z0-9_-]+)\]$/', $pat, $matches)) { + $elem = $matches['elem'] ?: '*'; + return $this->x("//{$elem}[@$matches[attr]]"); + } + + throw new \Exception("Unknown pattern: $pat"); + } + + public function x(string $x) : array + { + $xpath = new DOMXpath($this->dom); + if (strpos($x, '//') === 0 && $this->contextnode) { + $x = '.' . $x; + } + + if (XPATH_DEBUG) echo "\nxpath is: $x\n"; + + $elements = $xpath->query($x, $this->contextnode) ?? []; + $elems = []; + foreach($elements as $e) { + $elems[] = new Node($this->dom, $e); + } + return $elems; + } +} + + +class Html +{ + use DomQuery; + + public function __construct(string $html) + { + $dom = new DomDocument(); + + if (PARSER_DEBUG) echo "Creating HTML parser from:\n" . $html . "\n\n"; + + @$dom->loadHTML($html); // suppress spammy warnings + $this->dom = $dom; + $this->contextnode = null; + } + + public function toXml() : string + { + return $this->dom->saveXml(); + } +} + + +class Node +{ + use DomQuery; + + public function __construct(DOMDocument $dom, DOMNode $element) + { + $this->dom = $dom; + $this->element = $element; + $this->contextnode = $element; + } + + public function __get($name) + { + return $this->element->getAttribute($name); + } + + public function text() : string + { + return $this->element->nodeValue ?? ''; + } + + /** + * @return array|Node[] + */ + public function childNodes() : array + { + $elems = []; + foreach($this->element->childNodes as $e) { + $elems[] = new Node($this->dom, $e); + } + return $elems; + } + + /** + * @return Node + */ + public function childNode() : Node + { + $cn = $this->childNodes(); + if (count($cn) > 1) { + throw new \Exception("More than one childnode."); + } + return $cn[0]; + } + + public function toXml() : string + { + return $this->element->ownerDocument->saveXml($this->element); + } +} diff --git a/run.php b/run.php new file mode 100644 index 0000000..fa20100 --- /dev/null +++ b/run.php @@ -0,0 +1,279 @@ +find('.clanky'); + $clankyItems = $n_clanky->findAll('.row'); + + $cl_num = 0; + $aktualni_h4 = null; + + foreach ($clankyItems as $row) { + try { + $hh = $row->find('h4'); + $aktualni_h4 = $hh->text(); + echo "\n~ Skupina clanku: $aktualni_h4 ~\n"; + continue; + } catch(Exception $e) { + /* ok.. */ + } + + if ($row->class != 'clankyItem row') { + echo "Skip non-article\n"; + continue; + } + + try { + //echo $row->toXml(); + $num = ++$cl_num; // zvysit pocitadlo... + + $h3 = $row->find('h3'); + $a = $h3->find('a'); + $clanek_url = VESMIR_CZ . $a->href; + $clanek_nazev = $a->text(); + + // Get slug + preg_match('|/([^./]+)\.html$|', $clanek_url, $m); + $slug = $m[1]; + + // Get dirname + $fname = $num . ' - ' . $clanek_nazev; + $fname = mb_ereg_replace("([^\w\s\d\-_~,;\[\]\(\). ])", '', $fname); + $fname = mb_ereg_replace("([\.]{2,})", '', $fname); + + if (strlen($fname) > MAX_DIR_NAME_LEN) { + $fname = substr($fname, 0, strrpos($fname, ' ', -(strlen($fname) - MAX_DIR_NAME_LEN))); + } + + // Ensure dir exists + $clanek_dir = $cislo_dir . '/' . $fname; + if (!file_exists($clanek_dir)) { + mkdir($clanek_dir); + } + + echo "\n- $rocnik/$cislo -> Clanek #$num: $clanek_nazev -\nUrl: $clanek_url\n"; + + $perex = null; + try { + $perex = $row->find('.perex')->text(); + } catch (Exception $e) { + echo "No perex. ".$e->getMessage()."\n"; + } + + $thumbfile = null; + try { + if (file_exists($clanek_dir . '/thumb.jpg')) { + $thumbfile = 'thumb.jpg'; + } else { + $thumb = $row->find('img.img-responsive'); + + $f = get_file(VESMIR_CZ . $thumb->src); + file_put_contents($clanek_dir . '/thumb.jpg', $f); + $thumbfile = 'thumb.jpg'; + } + } catch (Exception $e) { + echo "No thumb. ".$e->getMessage()."\n"; + } + + $author_names = []; + try { + $authors = $row->find('.authors'); + $author_links = $authors->findAll('a'); + + foreach ($author_links as $al) { + $author_names[] = $al->text(); + } + } catch (Exception $e) { + echo "!! No .authors div\n"; + } + + $merged_authors = implode(', ', $author_names); + + if(SKIP_EXISTING && file_exists($clanek_dir . '/clanek.json')) { + echo "ARTICLE ALREADY DL'D, SKIP\n"; + continue; + } + + $resp = get_file($clanek_url); + file_put_contents($clanek_dir . '/orig.html', $resp); + + $article_doc = new Html($resp); + + $attachments = []; + + // Try to download attachments (pdf version...) + try { + $dmedia = $article_doc->find('.media'); + foreach ($dmedia->findAll('a[href]') as $item) { + $href = VESMIR_CZ . $item->href; + echo "> Downloading: " . $item->text() . "\n" . $href; + + $fname = uniqid() . '.pdf'; // it's probably a pdf + if ($item->text() == 'článek ve formátu pdf') { + $isarticlepdf = true; + $fname = $slug . '.pdf'; + } + + $resp = get($href); + + if (isset($resp->headers['content-disposition'])) { + $first = $resp->headers['content-disposition'][0]; + list(, $orig_fname) = explode('filename=', $first); + } + if (!$isarticlepdf) { + $fname = $orig_fname; + } + + file_put_contents($clanek_dir . '/' . $fname, $resp->content); + unset($resp->content); + + $attachments[] = [ + 'url' => $href, + 'popis' => $item->text(), + 'nazev' => $orig_fname, + 'soubor' => $fname, + ]; + } + + } catch(Exception $e) { + echo "Error finding media links: ".$e->getMessage()."\n"; + } + + $adiv = $article_doc->find('div.article'); + $body = $adiv->toXml(); // serialize the body div + $body = str_replace(' ', '', $body); + + $picnum = 0; + $body = preg_replace_callback('|src="(/images/[^"]+)"|', function($m) use ($clanek_dir, &$picnum) { + $uri = $m[1]; + $url = VESMIR_CZ . $uri; + + preg_match('|/([^/]+)$|', $uri, $m); + $img_slug = $m[1]; + $img_fname = 'img_' . ($picnum++) . '_' . $img_slug; + + try { + $f = get_file($url); + file_put_contents($clanek_dir . '/' . $img_fname, $f); + return "src=\"".htmlspecialchars($img_fname)."\""; + } catch(\Exception $e) { + echo "Error getting img $uri\n"; + echo $e->getMessage(); + echo $e->getTraceAsString(); + return $m[0]; // no subst. + } + }, $body); + + $nazev_e = htmlspecialchars($clanek_nazev); + $merged_authors_e = htmlspecialchars($merged_authors); + + $cleaned = << + + + +$nazev_e + + + +

$nazev_e

+

$merged_authors_e

+ + +$body + + + + + +DOC; + + file_put_contents($clanek_dir . '/clanek.html', $cleaned); + + $metadata = [ + 'nazev' => $clanek_nazev, + 'slug' => $slug, + 'url' => $clanek_url, + 'autori' => $author_names, + 'rocnik' => $rocnik, + 'cislo' => $cislo, + 'poradi' => $cl_num, + 'prilohy' => $attachments, + 'thumb' => $thumbfile, + 'perex' => $perex, + ]; + file_put_contents($clanek_dir . '/clanek.json', json_encode($metadata, 128|JSON_UNESCAPED_UNICODE|JSON_UNESCAPED_SLASHES)); + + } catch (Exception $e) { + echo $e->getMessage() . "\n" . $e->getTraceAsString() . "\n"; + } + } +} + +function scrape_year($year) { + $doc = get_doc(VESMIR_CZ . "/cz/casopis/archiv-casopisu/$year/"); + $obalky = $doc->findAll('.vesmirObalka'); + + $rocnik_dir = __DIR__ . '/out/' . $year; + if (!file_exists($rocnik_dir)) { + mkdir($rocnik_dir); + } + + foreach ($obalky as $obalka) { + $a = $obalka->childNode(); + $url_cislo = $a->href; + + echo $url_cislo.PHP_EOL; + + if (!preg_match('|/(\d+)/cislo-(\d+)/$|', $url_cislo, $m)) { + die("weird format $url_cislo"); + } + echo "== Rocnik $m[1], cislo $m[2] ==\n"; + $rocnik = $m[1]; + $cislo = $m[2]; + $ident = "$rocnik-$cislo"; + + $i = $a->childNode(); + $url_thumb = $i->src; + + $url_thumb = str_replace("?h=180", "?h=1800", $url_thumb); + + echo "Casopis URL: $url_cislo\nObalka URL: $url_thumb\n\n"; + $obalka_file = $rocnik_dir . "/$ident.jpg"; + + if (!file_exists($obalka_file)) { + echo "Stahuji obalku...\n"; + $c = get_file(VESMIR_CZ . $url_thumb); + file_put_contents($obalka_file, $c); + } + + $c = get_doc(VESMIR_CZ . $url_cislo); + + scrape_issue($rocnik_dir, $rocnik, $cislo, $c); + } +} + + +ensure_logged_in(); +//scrape_year(2019); + +for ($i = 2019; $i >= 1994; $i--) { + ensure_logged_in(); + scrape_year($i); +} diff --git a/session.inc b/session.inc new file mode 100644 index 0000000..32f65d0 --- /dev/null +++ b/session.inc @@ -0,0 +1,87 @@ +find('[for=spamProtectionDisableResult]')->text(); + echo "Challenge is: $challenge\n"; + if (preg_match('/(\d+) (plus|mínus) (\d+)/', $challenge, $m)) { + $a = +$m[1]; + $op = $m[2] == 'plus' ? 1 : -1; + $b = +$m[3]; + $r = $a + $op * $b; + echo "Result: $r\n"; + return $r; + } else { + throw new Exception("Unexpected challenge: $challenge"); + } +} + +function login() +{ + echo "----- attempting to login -----\n"; + + return post( "https://vesmir.cz/usrlogon.do", [ + "username" => VESMIR_LOGIN, + "password" => VESMIR_PASSWORD, + "docId" => 9573, + "doShowdocAction" => "/usrlogon.do", + "emailLogon" => false, + "origDocId" => 9573, + ]); +} + +function disableAntispam() +{ + echo "----- disabling antispam -----\n"; + + $r = get("https://vesmir.cz/components/form/spamprotectiondisable.jsp?backurl=%2Fcz%2Fuzivatel.html"); + + $doc = new Html($r->content); + $solved = solveChallenge($doc); + + $result = post( "https://vesmir.cz/components/form/spamprotectiondisable.jsp", [ + "result" => $solved, + "__token" => $doc->find('[name=__token]')->value, + "backurl" => "/cz/uzivatel.html", + "hash" => $doc->find('[name=hash]')->value, + ]); + + if (preg_match("/Zadaný výsledek je správný/", $result->content)) { + echo "Anti-spam succeeded.\n"; + return $result; + } else { + print_r($result); + + throw new Exception("Failed to disable antispam."); + } +} + +function dump_cookie_file_for_wget() { +// echo "Exporting cookie for WGET\n"; +// $c = file_get_contents("cookie.txt"); +// $c = str_replace('#HttpOnly_', '', $c); +// file_put_contents('cookie-wget.txt', $c); +} + +function ensure_logged_in() { + // get a session cookie + $r = get("https://vesmir.cz/cz/uzivatel.html"); + if (strpos($r->content, '/logoff.do?forward=/cz/') !== false) { + echo "Already logged in!\n"; + dump_cookie_file_for_wget(); + return true; + } else { + echo "Need login!\n"; + + disableAntispam(); + get("https://vesmir.cz/cz/uzivatel.html"); + $result = login(); + if (strpos($r->content, '/logoff.do?forward=/cz/') !== false) { + echo "Logged in!\n"; + dump_cookie_file_for_wget(); + return true; + } else { + print_r($result); + throw new Exception("--- LOGIN FAILED! ---"); + } + } +}