Initial commit

master
Ondřej Hruška 6 years ago
commit 4760eaf987
Signed by: MightyPork
GPG Key ID: 2C5FD5035250423D
  1. 5
      .gitignore
  2. 97
      http.inc
  3. 186
      parse.inc
  4. 279
      run.php
  5. 87
      session.inc

5
.gitignore vendored

@ -0,0 +1,5 @@
.idea/
out/
cookie.txt
cookie-wget.txt
vesmir.cz

@ -0,0 +1,97 @@
<?php
// User-Agent string sent with every request issued through get_or_post().
const UA = 'Mozilla/5.0 (Windows NT 6.1; rv:8.0) Gecko/20100101 Firefox/8.0';
/**
 * Download a page with GET and hand its body to the HTML parser.
 *
 * @param string $url address to fetch
 * @return Html parser wrapping the response body
 * @throws Exception propagated from get() when the request does not end with HTTP 200
 */
function get_doc($url) {
    $response = get($url);
    $markup = $response->content;

    return new Html($markup);
}
/**
 * Download a resource with GET and return only its raw body.
 *
 * @param string $url address to fetch
 * @return string response body as delivered by cURL
 * @throws Exception propagated from get() when the request does not end with HTTP 200
 */
function get_file($url) {
    $response = get($url);

    return $response->content;
}
/**
 * Perform one cURL request and return everything known about the response.
 *
 * Cookies are read from and written back to "cookie.txt" (relative to the
 * current working directory), so consecutive calls share one session.
 *
 * @param string $url          address to request
 * @param array  $mergeoptions CURLOPT_* => value pairs; they override the defaults below
 * @return object curl_getinfo() fields plus:
 *                errno, errmsg - cURL error code and text,
 *                headers       - lower-cased header name => list of values,
 *                content       - response body
 * @throws Exception when the final HTTP status is not 200 (the collected
 *                   response data is printed first)
 */
function get_or_post($url, $mergeoptions) {
    $options = [
        CURLOPT_USERAGENT => UA,             // set user agent
        CURLOPT_COOKIEFILE => "cookie.txt",  // set cookie file
        CURLOPT_COOKIEJAR => "cookie.txt",   // set cookie jar
        CURLOPT_COOKIESESSION => false,
        CURLOPT_RETURNTRANSFER => true,      // return web page
        CURLOPT_HEADER => false,             // don't return headers
        CURLOPT_FOLLOWLOCATION => true,      // follow redirects
        CURLOPT_ENCODING => "",              // handle all encodings
        CURLOPT_AUTOREFERER => true,         // set referer on redirect
        CURLOPT_CONNECTTIMEOUT => 120,       // timeout on connect
        CURLOPT_TIMEOUT => 120,              // timeout on response
        CURLOPT_MAXREDIRS => 10,             // stop after 10 redirects
    ];
    // Caller-supplied options replace defaults that use the same key.
    foreach ($mergeoptions as $option => $value) {
        $options[$option] = $value;
    }

    $ch = curl_init($url);
    curl_setopt_array($ch, $options);

    // cURL invokes this once per received header line; it must return the
    // number of bytes it consumed.
    $response_headers = [];
    $collect_header = function ($curl, $line) use (&$response_headers) {
        $consumed = strlen($line);
        $parts = explode(':', $line, 2);
        if (count($parts) >= 2) {
            // lines without a colon (status line, blank line) are skipped
            $response_headers[strtolower(trim($parts[0]))][] = trim($parts[1]);
        }
        return $consumed;
    };
    curl_setopt($ch, CURLOPT_HEADERFUNCTION, $collect_header);

    $content = curl_exec($ch);
    $err = curl_errno($ch);
    $errmsg = curl_error($ch);
    $info = curl_getinfo($ch);
    curl_close($ch);

    $info['errno'] = $err;
    $info['errmsg'] = $errmsg;
    $info['headers'] = $response_headers;
    $info['content'] = $content;

    if ($info['http_code'] != 200) {
        print_r($info);
        throw new \Exception("Error status: " . $info['http_code']);
    }

    return (object) $info;
}
/**
 * Send a GET request, announcing the URL on standard output first.
 *
 * @param string $url address to request
 * @return object response object built by get_or_post()
 * @throws Exception propagated from get_or_post()
 */
function get($url)
{
    echo "Sending GET to: $url\n";

    $method_options = [
        CURLOPT_CUSTOMREQUEST => "GET", // request method
        CURLOPT_POST => false,          // make sure POST mode is off
    ];

    return get_or_post($url, $method_options);
}
/**
 * Send a POST request, announcing the URL and the submitted fields first.
 *
 * Note: the fields are printed verbatim, including any credentials in them.
 *
 * @param string $url    address to request
 * @param array  $fields form fields, passed to cURL as CURLOPT_POSTFIELDS
 * @return object response object built by get_or_post()
 * @throws Exception propagated from get_or_post()
 */
function post($url, $fields)
{
    echo "Sending POST to: $url\n";
    print_r($fields);

    $method_options = [
        CURLOPT_CUSTOMREQUEST => "POST", // request method
        CURLOPT_POST => true,            // switch cURL to POST mode
        CURLOPT_POSTFIELDS => $fields,
    ];

    return get_or_post($url, $method_options);
}

@ -0,0 +1,186 @@
<?php
// When truthy, the parser echoes its HTML input and dumps the context /
// candidate nodes whenever find() fails or matches more than once.
const PARSER_DEBUG = 0;
// When truthy, every XPath expression is echoed before it is evaluated.
const XPATH_DEBUG = 0;
/**
 * Trait DomQuery
 *
 * Small CSS-like selector layer on top of DOMXPath.
 *
 * The using class must provide:
 *   $this->dom          DOMDocument that is queried
 *   $this->contextnode  DOMNode the search is restricted to, or null for the whole document
 *   toXml()             used for debug output when PARSER_DEBUG is on
 */
trait DomQuery {
    /**
     * Find exactly one node matching the pattern.
     *
     * @param string $pat selector, see findAll() for the supported forms
     * @return Node the single match
     * @throws Exception when nothing matches, when more than one node matches,
     *                   or when the pattern is not supported
     */
    public function find(string $pat) : Node
    {
        $els = $this->findAll($pat);
        if (!count($els)) {
            if (PARSER_DEBUG) {
                echo "---- match failed; context: ---\n";
                echo $this->toXml() ."\n";
            }
            throw new \Exception("No match: $pat");
        }
        if (count($els) > 1) {
            if (PARSER_DEBUG) {
                echo "Query results:\n";
                foreach ($els as $el) {
                    echo $el->toXml()."\n";
                }
            }
            throw new \Exception("Multiple match (".count($els)."x): $pat");
        }
        return $els[0];
    }

    /**
     * Find all descendant nodes matching a simple selector.
     *
     * Supported forms (one simple selector only, no combinators):
     *   node, .class, node.class, #id,
     *   [attr], node[attr],
     *   [attr=value], [attr^=value], [attr$=value], [attr*=value], [attr~=value]
     *   (optionally prefixed by a node name; the value may be quoted).
     * "~=" is treated like "*=" (substring match), not as a word match.
     *
     * @param string $pat selector
     * @return array|Node[]
     * @throws Exception when the pattern has none of the supported forms
     */
    public function findAll(string $pat) : array
    {
        # node
        if (preg_match('/^([a-z0-9_-]+)$/i', $pat, $matches)) {
            return $this->x("//$matches[1]");
        }

        # .class, node.class
        if (preg_match('/^(?P<elem>[a-z0-9_-]*)\.(?P<cls>[a-z0-9_-]+)$/i', $pat, $matches)) {
            $elem = $matches['elem'] ?: '*';
            $cls = $matches['cls'];
            return $this->x("//{$elem}[contains(concat(' ',normalize-space(@class),' '),' $cls ')]");
        }

        # #id -- must test the id *attribute*; a bare "id" would test a child element named <id>
        if (preg_match('/^#(\w+)$/', $pat, $matches)) {
            return $this->x("//*[@id='$matches[1]']");
        }

        # [attr=value], node[attr=value] and the ^= $= *= ~= variants (allows quotes)
        # The value can never contain a quote (excluded by the regex), so it is
        # safe to embed it in a single-quoted XPath literal.
        if (preg_match('/^(?P<elem>[a-z0-9_-]*)\[(?P<attr>[a-z0-9_-]+)(?P<op>[$*~^]|)=[\'"]?(?P<val>[^\'"\]]+)[\'"]?\]$/', $pat, $matches)) {
            $elem = $matches['elem'] ?: '*';
            $attr = $matches['attr'];
            $val = $matches['val'];

            switch ($matches['op']) {
                case '':
                    return $this->x("//{$elem}[@$attr='$val']");

                case '^':
                    return $this->x("//{$elem}[starts-with(@$attr, '$val')]");

                case '$':
                    // XPath 1.0 has no ends-with(); compare the tail of the
                    // attribute instead. substring() is 1-based, hence "+ 1".
                    return $this->x("//{$elem}[substring(@$attr, string-length(@$attr) - string-length('$val') + 1) = '$val']");

                case '*':
                case '~':
                    return $this->x("//{$elem}[contains(@$attr, '$val')]");
            }
        }

        # [attr], node[attr]
        if (preg_match('/^(?P<elem>[a-z0-9_-]*)\[(?P<attr>[a-z0-9_-]+)\]$/', $pat, $matches)) {
            $elem = $matches['elem'] ?: '*';
            return $this->x("//{$elem}[@{$matches['attr']}]");
        }

        throw new \Exception("Unknown pattern: $pat");
    }

    /**
     * Evaluate a raw XPath expression and wrap each resulting node.
     *
     * An expression starting with "//" is made relative to the context node
     * (when there is one) so that it searches only below that node.
     *
     * @param string $x XPath expression
     * @return array|Node[]
     * @throws Exception when the expression cannot be evaluated
     */
    public function x(string $x) : array
    {
        $xpath = new DOMXpath($this->dom);

        if (strpos($x, '//') === 0 && $this->contextnode) {
            $x = '.' . $x;
        }

        if (XPATH_DEBUG) echo "\nxpath is: $x\n";

        // query() reports failure with false, which "??" would not catch.
        $elements = $xpath->query($x, $this->contextnode);
        if ($elements === false) {
            throw new \Exception("Invalid XPath expression: $x");
        }

        $elems = [];
        foreach ($elements as $e) {
            $elems[] = new Node($this->dom, $e);
        }
        return $elems;
    }
}
/**
 * A parsed HTML document that can be searched with the DomQuery selectors.
 */
class Html
{
    use DomQuery;

    // Declared explicitly: creating these as dynamic properties is deprecated
    // since PHP 8.2. Kept public because they were publicly reachable before.

    /** @var DOMDocument the parsed document */
    public $dom;

    /** @var DOMNode|null always null here, so queries cover the whole document */
    public $contextnode;

    /**
     * @param string $html markup to parse; an empty string yields an empty document
     */
    public function __construct(string $html)
    {
        $dom = new DomDocument();

        if (PARSER_DEBUG) echo "Creating HTML parser from:\n" . $html . "\n\n";

        // loadHTML() refuses an empty string (ValueError on PHP 8), which the
        // "@" operator cannot silence, so skip parsing in that case.
        if ($html !== '') {
            // Collect the spammy markup warnings internally instead of printing them.
            $previous = libxml_use_internal_errors(true);
            $dom->loadHTML($html);
            libxml_clear_errors();
            libxml_use_internal_errors($previous);
        }

        $this->dom = $dom;
        $this->contextnode = null;
    }

    /**
     * @return string the whole document serialized as XML
     */
    public function toXml() : string
    {
        return $this->dom->saveXml();
    }
}
/**
 * Wrapper around one DOM node: attribute access via properties
 * ($node->href), text extraction and DomQuery searches below the node.
 */
class Node
{
    use DomQuery;

    // Declared explicitly: creating these as dynamic properties is deprecated
    // since PHP 8.2. They stay public so that reading them never falls through
    // to __get() and is mistaken for an attribute lookup.

    /** @var DOMDocument document the node belongs to */
    public $dom;

    /** @var DOMNode the wrapped node */
    public $element;

    /** @var DOMNode search root used by DomQuery; same as $element */
    public $contextnode;

    public function __construct(DOMDocument $dom, DOMNode $element)
    {
        $this->dom = $dom;
        $this->element = $element;
        $this->contextnode = $element;
    }

    /**
     * Read an attribute of the wrapped element, e.g. $node->href.
     *
     * @param string $name attribute name
     * @return string attribute value; '' when the attribute is missing or the
     *                node is not an element (text, comment, ...)
     */
    public function __get($name)
    {
        // childNodes() also wraps text nodes, which have no getAttribute().
        if (!$this->element instanceof DOMElement) {
            return '';
        }
        return $this->element->getAttribute($name);
    }

    /**
     * @return string text content of the node, '' when it has none
     */
    public function text() : string
    {
        return $this->element->nodeValue ?? '';
    }

    /**
     * All direct children (elements and other node types alike).
     *
     * @return array|Node[]
     */
    public function childNodes() : array
    {
        $elems = [];
        foreach ($this->element->childNodes as $e) {
            $elems[] = new Node($this->dom, $e);
        }
        return $elems;
    }

    /**
     * The only direct child of this node.
     *
     * @return Node
     * @throws Exception when the node has no child or more than one
     */
    public function childNode() : Node
    {
        $cn = $this->childNodes();
        if (count($cn) > 1) {
            throw new \Exception("More than one childnode.");
        }
        // Without this guard an empty node would return null and trigger a
        // TypeError, which callers catching Exception cannot handle.
        if (!$cn) {
            throw new \Exception("No childnode.");
        }
        return $cn[0];
    }

    /**
     * @return string this node (with its subtree) serialized as XML
     */
    public function toXml() : string
    {
        return $this->element->ownerDocument->saveXml($this->element);
    }
}

@ -0,0 +1,279 @@
<?php
// Article directory names longer than this many bytes are shortened at a space.
const MAX_DIR_NAME_LEN = 40;
// When true, an article that already has a clanek.json is not downloaded again.
const SKIP_EXISTING = true;
// Site root; hrefs scraped from pages are appended to it to form absolute URLs.
const VESMIR_CZ = 'https://vesmir.cz';
// Credentials posted by login(). Empty here -- presumably filled in locally before a run.
const VESMIR_LOGIN = "";
const VESMIR_PASSWORD = "";
require_once "http.inc";
require_once "parse.inc";
require_once "session.inc";
/**
 * Download every article of one magazine issue into "$rocnik_dir/$cislo/".
 *
 * Each article row of the issue page gets a directory named
 * "<order> - <sanitized title>" holding: thumb.jpg (when the row has a
 * thumbnail), orig.html (raw article page), the attachments linked from the
 * article's ".media" box, the inline images, clanek.html (cleaned standalone
 * page) and clanek.json (metadata). A failure inside one article is printed
 * and the loop continues with the next article.
 *
 * @param string     $rocnik_dir output directory of the year
 * @param string|int $rocnik     year, stored in the metadata
 * @param string|int $cislo      issue number, also used as sub-directory name
 * @param Html       $doc        parsed issue page
 * @throws Exception when the page has no unique ".clanky" container
 */
function scrape_issue($rocnik_dir, $rocnik, $cislo, Html $doc) {
    $cislo_dir = $rocnik_dir . '/' . $cislo;
    if (!file_exists($cislo_dir)) {
        mkdir($cislo_dir, 0777, true);
    }

    echo "\nStahuji cislo $rocnik/$cislo\n\n";

    $n_clanky = $doc->find('.clanky');
    $clankyItems = $n_clanky->findAll('.row');

    $cl_num = 0;
    $aktualni_h4 = null;
    foreach ($clankyItems as $row) {
        // A row containing exactly one <h4> is a section heading, not an article.
        try {
            $hh = $row->find('h4');
            $aktualni_h4 = $hh->text();
            echo "\n~ Skupina clanku: $aktualni_h4 ~\n";
            continue;
        } catch(Exception $e) {
            /* ok.. */
        }

        if ($row->class != 'clankyItem row') {
            echo "Skip non-article\n";
            continue;
        }

        try {
            $num = ++$cl_num; // position of the article within the issue

            $h3 = $row->find('h3');
            $a = $h3->find('a');
            $clanek_url = VESMIR_CZ . $a->href;
            $clanek_nazev = $a->text();

            // Slug = file name of the article URL without ".html"; fall back to
            // a generated one when the URL has an unexpected shape.
            $slug = preg_match('|/([^./]+)\.html$|', $clanek_url, $m) ? $m[1] : 'clanek-' . $num;

            // Directory name: "<order> - <title>" reduced to file-system-safe characters.
            $fname = $num . ' - ' . $clanek_nazev;
            $fname = mb_ereg_replace("([^\w\s\d\-_~,;\[\]\(\). ])", '', $fname);
            $fname = mb_ereg_replace("([\.]{2,})", '', $fname);
            if (strlen($fname) > MAX_DIR_NAME_LEN) {
                // Cut at the last space within the limit. The arithmetic is
                // byte-wise, but cutting at an ASCII space never splits a
                // multi-byte character.
                $fname = substr($fname, 0, strrpos($fname, ' ', -(strlen($fname) - MAX_DIR_NAME_LEN)));
            }

            // Ensure dir exists
            $clanek_dir = $cislo_dir . '/' . $fname;
            if (!file_exists($clanek_dir)) {
                mkdir($clanek_dir, 0777, true);
            }

            echo "\n- $rocnik/$cislo -> Clanek #$num: $clanek_nazev -\nUrl: $clanek_url\n";

            $perex = null;
            try {
                $perex = $row->find('.perex')->text();
            } catch (Exception $e) {
                echo "No perex. ".$e->getMessage()."\n";
            }

            $thumbfile = null;
            try {
                if (!file_exists($clanek_dir . '/thumb.jpg')) {
                    $thumb = $row->find('img.img-responsive');
                    $f = get_file(VESMIR_CZ . $thumb->src);
                    file_put_contents($clanek_dir . '/thumb.jpg', $f);
                }
                $thumbfile = 'thumb.jpg';
            } catch (Exception $e) {
                echo "No thumb. ".$e->getMessage()."\n";
            }

            $author_names = [];
            try {
                $authors = $row->find('.authors');
                foreach ($authors->findAll('a') as $al) {
                    $author_names[] = $al->text();
                }
            } catch (Exception $e) {
                echo "!! No .authors div\n";
            }
            $merged_authors = implode(', ', $author_names);

            // clanek.json is written last, so its presence marks a finished article.
            if (SKIP_EXISTING && file_exists($clanek_dir . '/clanek.json')) {
                echo "ARTICLE ALREADY DL'D, SKIP\n";
                continue;
            }

            $resp = get_file($clanek_url);
            file_put_contents($clanek_dir . '/orig.html', $resp);
            $article_doc = new Html($resp);

            // Try to download attachments (pdf version...)
            $attachments = scrape_attachments($article_doc, $clanek_dir, $slug);

            $adiv = $article_doc->find('div.article');
            $body = $adiv->toXml(); // serialize the body div
            $body = str_replace('&#13;', '', $body);
            $body = scrape_images($body, $clanek_dir);

            file_put_contents(
                $clanek_dir . '/clanek.html',
                render_article_html($clanek_nazev, $merged_authors, $body)
            );

            $metadata = [
                'nazev' => $clanek_nazev,
                'slug' => $slug,
                'url' => $clanek_url,
                'autori' => $author_names,
                'rocnik' => $rocnik,
                'cislo' => $cislo,
                'poradi' => $cl_num,
                'prilohy' => $attachments,
                'thumb' => $thumbfile,
                'perex' => $perex,
            ];
            file_put_contents(
                $clanek_dir . '/clanek.json',
                json_encode($metadata, JSON_PRETTY_PRINT|JSON_UNESCAPED_UNICODE|JSON_UNESCAPED_SLASHES)
            );
        } catch (Exception $e) {
            echo $e->getMessage() . "\n" . $e->getTraceAsString() . "\n";
        }
    }
}

/**
 * Download the files linked from the article's ".media" box.
 *
 * Best effort: the first error stops the download and is printed; attachments
 * saved up to that point are still returned.
 *
 * @param Html   $article_doc parsed article page
 * @param string $clanek_dir  directory the files are written to
 * @param string $slug        article slug, used to name the article PDF
 * @return array list of ['url', 'popis', 'nazev', 'soubor'] records
 */
function scrape_attachments(Html $article_doc, $clanek_dir, $slug) {
    $attachments = [];
    try {
        $dmedia = $article_doc->find('.media');
        foreach ($dmedia->findAll('a[href]') as $item) {
            $href = VESMIR_CZ . $item->href;
            $popis = $item->text();
            echo "> Downloading: " . $popis . "\n" . $href . "\n";

            // Decided per link, so one article PDF does not affect the
            // naming of the links that follow it.
            $isarticlepdf = ($popis == 'článek ve formátu pdf');

            $resp = get($href);

            // File name suggested by the server, if it sent one.
            $orig_fname = null;
            if (isset($resp->headers['content-disposition'])) {
                $first = $resp->headers['content-disposition'][0];
                if (preg_match('/filename=\s*"?([^";]+)/i', $first, $fm)) {
                    // basename(): a server-supplied name must not leave $clanek_dir
                    $candidate = basename(trim($fm[1]));
                    if ($candidate !== '') {
                        $orig_fname = $candidate;
                    }
                }
            }

            if ($isarticlepdf) {
                $soubor = $slug . '.pdf';
            } elseif ($orig_fname !== null) {
                $soubor = $orig_fname;
            } else {
                $soubor = uniqid() . '.pdf'; // it's probably a pdf
            }

            file_put_contents($clanek_dir . '/' . $soubor, $resp->content);
            unset($resp->content);

            $attachments[] = [
                'url' => $href,
                'popis' => $popis,
                'nazev' => $orig_fname,
                'soubor' => $soubor,
            ];
        }
    } catch(Exception $e) {
        echo "Error finding media links: ".$e->getMessage()."\n";
    }
    return $attachments;
}

/**
 * Download the images referenced as src="/images/..." and point the markup at
 * the local copies.
 *
 * @param string $body       serialized article markup
 * @param string $clanek_dir directory the images are written to
 * @return string markup with rewritten src attributes; an image that could not
 *                be downloaded keeps its original src
 */
function scrape_images($body, $clanek_dir) {
    $picnum = 0;
    return preg_replace_callback('|src="(/images/[^"]+)"|', function($m) use ($clanek_dir, &$picnum) {
        // $body is serialized XML, so "&" inside the URL arrives as "&amp;".
        $uri = html_entity_decode($m[1], ENT_QUOTES | ENT_XML1, 'UTF-8');
        $url = VESMIR_CZ . $uri;

        // Use a separate match array: $m[0] is still needed on failure.
        preg_match('|/([^/]+)$|', $uri, $name_match);
        $img_slug = $name_match[1] ?? '';
        $img_fname = 'img_' . ($picnum++) . '_' . $img_slug;

        try {
            $f = get_file($url);
            file_put_contents($clanek_dir . '/' . $img_fname, $f);
            return "src=\"".htmlspecialchars($img_fname)."\"";
        } catch(\Exception $e) {
            echo "Error getting img $uri\n";
            echo $e->getMessage();
            echo $e->getTraceAsString();
            return $m[0]; // no subst.
        }
    }, $body);
}

/**
 * Wrap the article body in a minimal standalone HTML page.
 *
 * @param string $clanek_nazev   article title (escaped here)
 * @param string $merged_authors comma separated author names (escaped here)
 * @param string $body           article markup, inserted as is
 * @return string complete HTML document
 */
function render_article_html($clanek_nazev, $merged_authors, $body) {
    $nazev_e = htmlspecialchars($clanek_nazev);
    $merged_authors_e = htmlspecialchars($merged_authors);

    // The closing marker stays in column 0 so the heredoc also parses before PHP 7.3.
    return <<<DOC
<!DOCTYPE html>
<html lang="cs">
<head>
<meta charset="utf-8">
<title>$nazev_e</title>
<link href="../../../style.css" rel="stylesheet" type="text/css" />
</head>
<body>
<h1 class="article-name">$nazev_e</h1>
<p class="authors">$merged_authors_e</p>
<!-- article begin -->
$body
<!-- article end -->
</body>
</html>
DOC;
}
/**
 * Download all issues of one year into "<script dir>/out/<year>/".
 *
 * For every cover found on the year's archive page this saves a large version
 * of the cover image as "<rocnik>-<cislo>.jpg" (unless it already exists) and
 * then scrapes the issue's articles with scrape_issue().
 *
 * Terminates the script via die() when an issue link does not end with
 * "/<year>/cislo-<number>/".
 *
 * @param int|string $year year to download
 * @throws Exception propagated from the HTTP and parsing helpers
 */
function scrape_year($year) {
    $doc = get_doc(VESMIR_CZ . "/cz/casopis/archiv-casopisu/$year/");
    $obalky = $doc->findAll('.vesmirObalka');

    $rocnik_dir = __DIR__ . '/out/' . $year;
    if (!file_exists($rocnik_dir)) {
        // Recursive: "out/" itself is git-ignored and absent on a fresh checkout.
        mkdir($rocnik_dir, 0777, true);
    }

    foreach ($obalky as $obalka) {
        // The cover element wraps a single link ...
        $a = $obalka->childNode();
        $url_cislo = $a->href;
        echo $url_cislo.PHP_EOL;

        if (!preg_match('|/(\d+)/cislo-(\d+)/$|', $url_cislo, $m)) {
            die("weird format $url_cislo");
        }
        echo "== Rocnik $m[1], cislo $m[2] ==\n";
        $rocnik = $m[1];
        $cislo = $m[2];
        $ident = "$rocnik-$cislo";

        // ... and the link wraps a single thumbnail image.
        $img = $a->childNode();
        // Ask for a ten times taller rendition than the 180px thumbnail.
        $url_thumb = str_replace("?h=180", "?h=1800", $img->src);

        echo "Casopis URL: $url_cislo\nObalka URL: $url_thumb\n\n";

        $obalka_file = $rocnik_dir . "/$ident.jpg";
        if (!file_exists($obalka_file)) {
            echo "Stahuji obalku...\n";
            $cover = get_file(VESMIR_CZ . $url_thumb);
            file_put_contents($obalka_file, $cover);
        }

        $issue_doc = get_doc(VESMIR_CZ . $url_cislo);
        scrape_issue($rocnik_dir, $rocnik, $cislo, $issue_doc);
    }
}
// Entry point: establish a session, then walk the archive from the newest
// year (2019) back to the oldest (1994). The session is re-validated before
// every year.
ensure_logged_in();
//scrape_year(2019);
$year = 2019;
while ($year >= 1994) {
    ensure_logged_in();
    scrape_year($year);
    $year--;
}

@ -0,0 +1,87 @@
<?php
/**
 * Solve the arithmetic anti-spam question shown on the page.
 *
 * The question is read from the element whose "for" attribute is
 * "spamProtectionDisableResult" and must contain "<number> plus <number>" or
 * "<number> mínus <number>".
 *
 * @param Html $doc parsed anti-spam page
 * @return int result of the addition or subtraction
 * @throws Exception when the question has a different form, or when the
 *                   element cannot be found
 */
function solveChallenge(Html $doc) : int {
    $challenge = $doc->find('[for=spamProtectionDisableResult]')->text();
    echo "Challenge is: $challenge\n";

    if (!preg_match('/(\d+) (plus|mínus) (\d+)/', $challenge, $parts)) {
        throw new Exception("Unexpected challenge: $challenge");
    }

    // Unary plus converts the numeric strings to numbers.
    $left = +$parts[1];
    $right = +$parts[3];
    $sign = ($parts[2] == 'plus') ? 1 : -1;

    $r = $left + $sign * $right;
    echo "Result: $r\n";
    return $r;
}
/**
 * Submit the login form with the configured credentials.
 *
 * @return object response object returned by post()
 * @throws Exception propagated from post() when the request fails
 */
function login()
{
    echo "----- attempting to login -----\n";

    $form = [
        "username" => VESMIR_LOGIN,
        "password" => VESMIR_PASSWORD,
        "docId" => 9573,
        "doShowdocAction" => "/usrlogon.do",
        "emailLogon" => false,
        "origDocId" => 9573,
    ];

    return post( "https://vesmir.cz/usrlogon.do", $form);
}
/**
 * Pass the site's anti-spam check for the current session.
 *
 * Loads the challenge page, solves the arithmetic question and posts the
 * answer together with the hidden "__token" and "hash" form values.
 *
 * @return object response to the submitted answer
 * @throws Exception when the response does not confirm a correct answer, or
 *                   when a request or a page lookup fails
 */
function disableAntispam()
{
    echo "----- disabling antispam -----\n";

    $r = get("https://vesmir.cz/components/form/spamprotectiondisable.jsp?backurl=%2Fcz%2Fuzivatel.html");
    $doc = new Html($r->content);

    $solved = solveChallenge($doc);
    $token = $doc->find('[name=__token]')->value;
    $hash = $doc->find('[name=hash]')->value;

    $result = post( "https://vesmir.cz/components/form/spamprotectiondisable.jsp", [
        "result" => $solved,
        "__token" => $token,
        "backurl" => "/cz/uzivatel.html",
        "hash" => $hash,
    ]);

    // The confirmation text is the only success signal that is checked.
    if (!preg_match("/Zadaný výsledek je správný/", $result->content)) {
        print_r($result);
        throw new Exception("Failed to disable antispam.");
    }

    echo "Anti-spam succeeded.\n";
    return $result;
}
/**
 * Placeholder for exporting the cookie jar in a form usable by wget.
 *
 * The whole body is commented out, so calling this currently does nothing.
 * The disabled code would copy cookie.txt to cookie-wget.txt with the
 * "#HttpOnly_" prefixes removed.
 */
function dump_cookie_file_for_wget() {
// echo "Exporting cookie for WGET\n";
// $c = file_get_contents("cookie.txt");
// $c = str_replace('#HttpOnly_', '', $c);
// file_put_contents('cookie-wget.txt', $c);
}
/**
 * Make sure the cookie jar holds an authenticated session, logging in if needed.
 *
 * The presence of the logoff link in a page is treated as proof of being
 * logged in.
 *
 * @return bool always true; failure is reported by exception
 * @throws Exception when the login attempt does not produce a logged-in page,
 *                   or when any of the requests fails
 */
function ensure_logged_in() {
    $logoff_marker = '/logoff.do?forward=/cz/';

    // get a session cookie
    $r = get("https://vesmir.cz/cz/uzivatel.html");
    if (strpos($r->content, $logoff_marker) !== false) {
        echo "Already logged in!\n";
        dump_cookie_file_for_wget();
        return true;
    }

    echo "Need login!\n";
    disableAntispam();
    get("https://vesmir.cz/cz/uzivatel.html");
    $result = login();

    // Inspect the response to the login itself. Re-checking $r (fetched
    // before logging in) could never succeed in this branch.
    if (strpos($result->content, $logoff_marker) === false) {
        print_r($result);
        throw new Exception("--- LOGIN FAILED! ---");
    }

    echo "Logged in!\n";
    dump_cookie_file_for_wget();
    return true;
}
Loading…
Cancel
Save