Initial commit

7 years ago · 4760eaf987
commit 4760eaf987
5 changed files with 654 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,5 @@
+.idea/
+out/
+cookie.txt
+cookie-wget.txt
+vesmir.cz
--- a/http.inc
+++ b/http.inc
@ -0,0 +1,97 @@
+<?php
+
+// User-Agent string sent with every request (passed as CURLOPT_USERAGENT in get_or_post()).
+const UA = 'Mozilla/5.0 (Windows NT 6.1; rv:8.0) Gecko/20100101 Firefox/8.0';
+
+/**
+ * Fetch $url with a GET request and wrap the response body in an Html parser.
+ *
+ * @param string $url
+ * @return Html
+ */
+function get_doc($url) {
+	$response = get($url);
+	return new Html($response->content);
+}
+
+/**
+ * Fetch $url with a GET request and return the raw response body.
+ *
+ * @param string $url
+ * @return string
+ */
+function get_file($url) {
+	$response = get($url);
+	return $response->content;
+}
+
+/**
+ * Perform one HTTP request with cURL, keeping cookies in cookie.txt.
+ *
+ * @param string $url          Address to request.
+ * @param array  $mergeoptions CURLOPT_* => value pairs; they override the defaults below.
+ * @return object The curl_getinfo() fields plus:
+ *                errno, errmsg - cURL error code and message,
+ *                headers       - lower-cased header name => list of values,
+ *                content       - the response body.
+ * @throws \Exception when the transfer itself fails or the final status is not 200.
+ */
+function get_or_post($url, $mergeoptions) {
+	$defaults = [
+		CURLOPT_USERAGENT => UA, //set user agent
+		CURLOPT_COOKIEFILE => "cookie.txt", //set cookie file
+		CURLOPT_COOKIEJAR => "cookie.txt", //set cookie jar
+		CURLOPT_COOKIESESSION => false,
+		CURLOPT_RETURNTRANSFER => true,     // return web page
+		CURLOPT_HEADER => false,    // don't return headers
+		CURLOPT_FOLLOWLOCATION => true,     // follow redirects
+		CURLOPT_ENCODING => "",       // handle all encodings
+		CURLOPT_AUTOREFERER => true,     // set referer on redirect
+		CURLOPT_CONNECTTIMEOUT => 120,      // timeout on connect
+		CURLOPT_TIMEOUT => 120,      // timeout on response
+		CURLOPT_MAXREDIRS => 10,       // stop after 10 redirects
+	];
+
+	// array_replace() keeps the integer CURLOPT_* keys; values from the caller win.
+	$options = array_replace($defaults, $mergeoptions);
+
+	$ch = curl_init($url);
+	if ($ch === false) {
+		throw new \Exception("Could not initialise cURL for: $url");
+	}
+	curl_setopt_array($ch, $options);
+
+	// this function is called by curl for each header received
+	$response_headers = [];
+	curl_setopt($ch, CURLOPT_HEADERFUNCTION,
+		function($curl, $header) use (&$response_headers)
+		{
+			// curl expects the number of bytes handled as the return value
+			$len = strlen($header);
+			$parts = explode(':', $header, 2);
+			if (count($parts) < 2) // ignore invalid headers
+				return $len;
+
+			// repeated headers are collected under the same lower-cased name
+			$response_headers[strtolower(trim($parts[0]))][] = trim($parts[1]);
+
+			return $len;
+		}
+	);
+
+	$content = curl_exec($ch);
+	$err = curl_errno($ch);
+	$errmsg = curl_error($ch);
+	$header = curl_getinfo($ch);
+	curl_close($ch);
+
+	$header['errno'] = $err;
+	$header['errmsg'] = $errmsg;
+	$header['headers'] = $response_headers;
+	$header['content'] = $content;
+
+	// A failed transfer (DNS, timeout, TLS, ...) has no meaningful HTTP status;
+	// report the cURL error instead of a bare "Error status: 0".
+	if ($content === false) {
+		print_r($header);
+		throw new \Exception("Request to $url failed (curl error $err): $errmsg");
+	}
+
+	if ($header['http_code'] != 200) {
+		print_r($header);
+		throw new \Exception("Error status: $header[http_code]");
+	}
+
+	return (object) $header;
+}
+
+/**
+ * Log and send a GET request.
+ *
+ * @param string $url
+ * @return object response object built by get_or_post()
+ */
+function get($url)
+{
+	echo "Sending GET to: $url\n";
+
+	$get_options = [
+		CURLOPT_CUSTOMREQUEST => "GET",   // request method
+		CURLOPT_POST => false,            // not a POST
+	];
+
+	return get_or_post($url, $get_options);
+}
+
+/**
+ * Log and send a POST request.
+ *
+ * NOTE: $fields is printed verbatim, so whatever the caller passes
+ * (login() passes the password) ends up in the script output.
+ *
+ * @param string $url
+ * @param array  $fields form fields, handed to cURL as CURLOPT_POSTFIELDS
+ * @return object response object built by get_or_post()
+ */
+function post($url, $fields)
+{
+	echo "Sending POST to: $url\n";
+	print_r($fields);
+
+	$post_options = [
+		CURLOPT_CUSTOMREQUEST => "POST",   // request method
+		CURLOPT_POST => true,              // send as a POST
+		CURLOPT_POSTFIELDS => $fields,     // request body
+	];
+
+	return get_or_post($url, $post_options);
+}
--- a/parse.inc
+++ b/parse.inc
@ -0,0 +1,186 @@
+<?php
+
+// Truthy: Html::__construct() prints its input and find() dumps the context / results when a lookup fails.
+const PARSER_DEBUG = 0;
+// Truthy: x() prints every XPath expression before running it.
+const XPATH_DEBUG = 0;
+
+/**
+ * Trait DomQuery
+ *
+ * Small CSS-like selector layer on top of DOMXPath.
+ *
+ * The using class must provide:
+ *
+ *	$this->dom          DOMDocument the queries run against
+ *	$this->contextnode  DOMNode the queries are relative to, or null for the whole document
+ *	toXml()             serialisation used for the PARSER_DEBUG output
+ */
+trait DomQuery {
+	/**
+	 * Return the single node matching $pat.
+	 *
+	 * @param string $pat selector, see findAll()
+	 * @return Node
+	 * @throws Exception when nothing matches, when more than one node matches,
+	 *                   or when the selector is not supported
+	 */
+	public function find(string $pat) : Node
+	{
+		$els = $this->findAll($pat);
+		$found = count($els);
+
+		if ($found === 0) {
+			if (PARSER_DEBUG) {
+				echo "---- match failed; context: ---\n";
+				echo $this->toXml() ."\n";
+			}
+
+			throw new \Exception("No match: $pat");
+		}
+
+		if ($found > 1) {
+			if (PARSER_DEBUG) {
+				echo "Query results:\n";
+				foreach ($els as $el) {
+					echo $el->toXml()."\n";
+				}
+			}
+			throw new \Exception("Multiple match (".$found."x): $pat");
+		}
+
+		return $els[0];
+	}
+
+	/**
+	 * Return all descendants matching a simple selector.
+	 *
+	 * Supported forms (one simple selector only, no combinators):
+	 *   node                     element name
+	 *   .class, node.class       class token
+	 *   #id                      id attribute
+	 *   [attr], node[attr]       attribute present
+	 *   [attr=v]                 attribute equals v (v may be quoted)
+	 *   [attr^=v]                attribute starts with v
+	 *   [attr$=v]                attribute ends with v
+	 *   [attr*=v], [attr~=v]     attribute contains v as a substring
+	 *                            (note: "~=" is NOT a whole-word match here)
+	 *
+	 * @param string $pat
+	 * @return array|Node[]
+	 * @throws Exception when the selector form is not recognised
+	 */
+	public function findAll(string $pat) : array
+	{
+		# node
+		if (preg_match('/^([a-z0-9_-]+)$/i', $pat, $matches)) {
+			return $this->x("//$matches[1]");
+		}
+
+		# .class, node.class
+		if (preg_match('/^(?P<elem>[a-z0-9_-]*)\.(?P<cls>[a-z0-9_-]+)$/i', $pat, $matches)) {
+			$elem = $matches['elem'] ?: '*';
+			// pad with spaces so only a whole class token matches
+			return $this->x("//{$elem}[contains(concat(' ',normalize-space(@class),' '),' $matches[cls] ')]");
+		}
+
+		# #id -- must test the id ATTRIBUTE (@id); a bare "id" would test a child element named id
+		if (preg_match('/^#(\w+)$/', $pat, $matches)) {
+			return $this->x("//*[@id='$matches[1]']");
+		}
+
+		# [attr=value], node[attr=value] and the ^= $= *= ~= variants (allows quotes)
+		if (preg_match('/^(?P<elem>[a-z0-9_-]*)\[(?P<attr>[a-z0-9_-]+)(?P<op>[$*~^]|)=[\'"]?(?P<val>[^\'"\]]+)[\'"]?\]$/', $pat, $matches)) {
+			$elem = $matches['elem'] ?: '*';
+			$attr = $matches['attr'];
+			// the regex forbids quotes inside the value, so it is safe inside '...'
+			$val = $matches['val'];
+
+			switch ($matches['op']) {
+				case '':
+					return $this->x("//{$elem}[@{$attr}='{$val}']");
+				case '^':
+					return $this->x("//{$elem}[starts-with(@{$attr}, '{$val}')]");
+				case '$':
+					// XPath 1.0 has no ends-with(): compare the tail of the attribute.
+					// substring() is 1-based, hence the "+ 1".
+					return $this->x("//{$elem}[substring(@{$attr}, string-length(@{$attr}) - string-length('{$val}') + 1)='{$val}']");
+				case '*':
+				case '~':
+					return $this->x("//{$elem}[contains(@{$attr}, '{$val}')]");
+			}
+		}
+
+		# [attr], node[attr]
+		if (preg_match('/^(?P<elem>[a-z0-9_-]*)\[(?P<attr>[a-z0-9_-]+)\]$/', $pat, $matches)) {
+			$elem = $matches['elem'] ?: '*';
+			return $this->x("//{$elem}[@{$matches['attr']}]");
+		}
+
+		throw new \Exception("Unknown pattern: $pat");
+	}
+
+	/**
+	 * Run a raw XPath expression and wrap every hit in a Node.
+	 *
+	 * An expression starting with "//" is made relative ("./" + expression)
+	 * when a context node is set, so it only searches below that node.
+	 *
+	 * @param string $x XPath expression
+	 * @return array|Node[] empty when nothing matches or the expression is rejected
+	 */
+	public function x(string $x) : array
+	{
+		$xpath = new DOMXpath($this->dom);
+		if (strpos($x, '//') === 0 && $this->contextnode) {
+			$x = '.' . $x;
+		}
+
+		if (XPATH_DEBUG) echo "\nxpath is: $x\n";
+
+		$elements = $xpath->query($x, $this->contextnode);
+		if ($elements === false) {
+			// query() signals a malformed expression with false, not null
+			return [];
+		}
+
+		$elems = [];
+		foreach($elements as $e) {
+			$elems[] = new Node($this->dom, $e);
+		}
+		return $elems;
+	}
+}
+
+
+/**
+ * A parsed HTML document that can be searched through the DomQuery methods.
+ */
+class Html
+{
+	use DomQuery;
+
+	/** @var DOMDocument document the DomQuery methods run against */
+	public $dom;
+
+	/** @var DOMNode|null always null here, so queries cover the whole document */
+	public $contextnode;
+
+	/**
+	 * @param string $html markup to parse; an empty string yields an empty document
+	 */
+	public function __construct(string $html)
+	{
+		$dom = new DomDocument();
+
+		if (PARSER_DEBUG) echo "Creating HTML parser from:\n" . $html . "\n\n";
+
+		// loadHTML() rejects an empty string outright (a ValueError on PHP 8),
+		// so leave the document empty and let the lookups report "No match".
+		if ($html !== '') {
+			// collect the parser's complaints internally instead of printing
+			// a warning for every piece of sloppy markup
+			$previous = libxml_use_internal_errors(true);
+			$dom->loadHTML($html);
+			libxml_clear_errors();
+			libxml_use_internal_errors($previous);
+		}
+
+		$this->dom = $dom;
+		$this->contextnode = null;
+	}
+
+	/**
+	 * @return string the whole document serialised with saveXml()
+	 */
+	public function toXml() : string
+	{
+		return $this->dom->saveXml();
+	}
+}
+
+
+/**
+ * One DOM node; DomQuery lookups on it are relative to the node itself.
+ *
+ * Attributes are read as properties: $node->href returns the href attribute.
+ * The declared properties (dom, element, contextnode) take precedence over
+ * that magic access.
+ */
+class Node
+{
+	use DomQuery;
+
+	/** @var DOMDocument document passed in by the creator of this node */
+	public $dom;
+
+	/** @var DOMNode the wrapped node */
+	public $element;
+
+	/** @var DOMNode same node, used by DomQuery as the query context */
+	public $contextnode;
+
+	public function __construct(DOMDocument $dom, DOMNode $element)
+	{
+		$this->dom = $dom;
+		$this->element = $element;
+		$this->contextnode = $element;
+	}
+
+	/**
+	 * Read an attribute of the wrapped node.
+	 *
+	 * @param string $name attribute name
+	 * @return string the value, or '' when the attribute is missing or the
+	 *                node is not an element (e.g. a text node from childNodes())
+	 */
+	public function __get($name)
+	{
+		// only DOMElement has getAttribute(); text and comment nodes do not
+		if (!($this->element instanceof DOMElement)) {
+			return '';
+		}
+		return $this->element->getAttribute($name);
+	}
+
+	/**
+	 * @return string the node's nodeValue, or '' when it is null
+	 */
+	public function text() : string
+	{
+		return $this->element->nodeValue ?? '';
+	}
+
+	/**
+	 * All direct children, including non-element nodes such as text.
+	 *
+	 * @return array|Node[]
+	 */
+	public function childNodes() : array
+	{
+		$elems = [];
+		foreach($this->element->childNodes as $e) {
+			$elems[] = new Node($this->dom, $e);
+		}
+		return $elems;
+	}
+
+	/**
+	 * The only direct child of this node.
+	 *
+	 * @return Node
+	 * @throws Exception when the node has no child or more than one
+	 */
+	public function childNode() : Node
+	{
+		$cn = $this->childNodes();
+		if (count($cn) > 1) {
+			throw new \Exception("More than one childnode.");
+		}
+		if (count($cn) === 0) {
+			// without this, $cn[0] is undefined and the return type is violated
+			throw new \Exception("No childnode.");
+		}
+		return $cn[0];
+	}
+
+	/**
+	 * @return string this node serialised through its owner document
+	 */
+	public function toXml() : string
+	{
+		return $this->element->ownerDocument->saveXml($this->element);
+	}
+}
--- a/run.php
+++ b/run.php
@ -0,0 +1,279 @@
+<?php
+
+// Article directory names longer than this many bytes are cut back to a preceding space.
+const MAX_DIR_NAME_LEN = 40;
+// When true, an article whose directory already holds clanek.json is not downloaded again.
+const SKIP_EXISTING = true;
+// Site root; hrefs and image paths taken from the pages are appended to it.
+const VESMIR_CZ = 'https://vesmir.cz';
+// Credentials posted by login(); fill in before running.
+const VESMIR_LOGIN = "";
+const VESMIR_PASSWORD = "";
+
+require_once "http.inc";
+require_once "parse.inc";
+require_once "session.inc";
+
+/**
+ * Download every article listed on one issue page.
+ *
+ * Each article gets its own directory "$rocnik_dir/$cislo/<n> - <title>"
+ * holding, as far as they are available: thumb.jpg, orig.html, attachments
+ * from the article's .media block, images referenced from /images/
+ * (img_<n>_<name>), a cleaned clanek.html and the metadata file clanek.json.
+ * A failure in one article is printed and the loop moves on to the next one.
+ *
+ * @param string $rocnik_dir directory of the volume (year)
+ * @param string $rocnik     volume, used for log output and metadata
+ * @param string $cislo      issue number, also the sub-directory name
+ * @param Html   $doc        parsed issue page
+ * @throws Exception when the page does not contain exactly one .clanky element
+ */
+function scrape_issue($rocnik_dir, $rocnik, $cislo, Html $doc) {
+	$cislo_dir = $rocnik_dir . '/' . $cislo;
+	if (!file_exists($cislo_dir)) {
+		mkdir($cislo_dir);
+	}
+
+	echo "\nStahuji cislo $rocnik/$cislo\n\n";
+
+	$n_clanky = $doc->find('.clanky');
+	$clankyItems = $n_clanky->findAll('.row');
+
+	$cl_num = 0;
+	$aktualni_h4 = null;
+
+	foreach ($clankyItems as $row) {
+		// A row with exactly one h4 is a section heading, not an article.
+		try {
+			$hh = $row->find('h4');
+			$aktualni_h4 = $hh->text();
+			echo "\n~ Skupina clanku: $aktualni_h4 ~\n";
+			continue;
+		} catch(Exception $e) {
+			/* ok.. */
+		}
+
+		if ($row->class != 'clankyItem row') {
+			echo "Skip non-article\n";
+			continue;
+		}
+
+		try {
+			//echo $row->toXml();
+			$num = ++$cl_num; // advance the article counter
+
+			$h3 = $row->find('h3');
+			$a = $h3->find('a');
+			$clanek_url = VESMIR_CZ . $a->href;
+			$clanek_nazev = $a->text();
+
+			// Get slug (null when the URL does not end in "<slug>.html")
+			$slug = preg_match('|/([^./]+)\.html$|', $clanek_url, $m) ? $m[1] : null;
+
+			// Get dirname
+			$fname = $num . ' - ' . $clanek_nazev;
+			$fname = mb_ereg_replace("([^\w\s\d\-_~,;\[\]\(\). ])", '', $fname);
+			$fname = mb_ereg_replace("([\.]{2,})", '', $fname);
+
+			if (strlen($fname) > MAX_DIR_NAME_LEN) {
+				// cut at the last space at or before the limit
+				$fname = substr($fname, 0, strrpos($fname, ' ', -(strlen($fname) - MAX_DIR_NAME_LEN)));
+			}
+
+			// Ensure dir exists
+			$clanek_dir = $cislo_dir . '/' . $fname;
+			if (!file_exists($clanek_dir)) {
+				mkdir($clanek_dir);
+			}
+
+			echo "\n- $rocnik/$cislo -> Clanek #$num: $clanek_nazev -\nUrl: $clanek_url\n";
+
+			$perex = null;
+			try {
+				$perex = $row->find('.perex')->text();
+			} catch (Exception $e) {
+				echo "No perex. ".$e->getMessage()."\n";
+			}
+
+			$thumbfile = null;
+			try {
+				if (file_exists($clanek_dir . '/thumb.jpg')) {
+					$thumbfile = 'thumb.jpg';
+				} else {
+					$thumb = $row->find('img.img-responsive');
+
+					$f = get_file(VESMIR_CZ . $thumb->src);
+					file_put_contents($clanek_dir . '/thumb.jpg', $f);
+					$thumbfile = 'thumb.jpg';
+				}
+			} catch (Exception $e) {
+				echo "No thumb. ".$e->getMessage()."\n";
+			}
+
+			$author_names = [];
+			try {
+				$authors = $row->find('.authors');
+				$author_links = $authors->findAll('a');
+
+				foreach ($author_links as $al) {
+					$author_names[] = $al->text();
+				}
+			} catch (Exception $e) {
+				echo "!! No .authors div\n";
+			}
+
+			$merged_authors = implode(', ', $author_names);
+
+			if(SKIP_EXISTING && file_exists($clanek_dir . '/clanek.json')) {
+				echo "ARTICLE ALREADY DL'D, SKIP\n";
+				continue;
+			}
+
+			$resp = get_file($clanek_url);
+			file_put_contents($clanek_dir . '/orig.html', $resp);
+
+			$article_doc = new Html($resp);
+
+			$attachments = [];
+
+			// Try to download attachments (pdf version...)
+			try {
+				$dmedia = $article_doc->find('.media');
+				foreach ($dmedia->findAll('a[href]') as $item) {
+					$href = VESMIR_CZ . $item->href;
+					echo "> Downloading: " . $item->text() . "\n" . $href;
+
+					$att = get($href);
+
+					// Everything below is per attachment; nothing may leak over
+					// from the previous iteration.
+					$orig_fname = null;
+					if (isset($att->headers['content-disposition'])) {
+						$first = $att->headers['content-disposition'][0];
+						$cd_parts = explode('filename=', $first, 2);
+						if (count($cd_parts) === 2) {
+							// drop a following parameter and the surrounding quotes
+							$orig_fname = trim(explode(';', $cd_parts[1])[0], " \t\"'");
+						}
+					}
+
+					// The name comes from the server: never let it carry a directory part.
+					$safe_fname = $orig_fname === null ? '' : basename($orig_fname);
+
+					$isarticlepdf = $item->text() == 'článek ve formátu pdf';
+					if ($isarticlepdf && $slug !== null) {
+						$att_fname = $slug . '.pdf';
+					} elseif (!$isarticlepdf && $safe_fname !== '') {
+						$att_fname = $safe_fname;
+					} else {
+						$att_fname = uniqid() . '.pdf'; // it's probably a pdf
+					}
+
+					file_put_contents($clanek_dir . '/' . $att_fname, $att->content);
+					unset($att->content);
+
+					$attachments[] = [
+						'url' => $href,
+						'popis' => $item->text(),
+						'nazev' => $orig_fname,
+						'soubor' => $att_fname,
+					];
+				}
+
+			} catch(Exception $e) {
+				echo "Error finding media links: ".$e->getMessage()."\n";
+			}
+
+			$adiv = $article_doc->find('div.article');
+			$body = $adiv->toXml(); // serialize the body div
+			$body = str_replace('&#13;', '', $body);
+
+			// Download images referenced from /images/ and point src at the local copy.
+			$picnum = 0;
+			$body = preg_replace_callback('|src="(/images/[^"]+)"|', function($m) use ($clanek_dir, &$picnum) {
+				$uri = $m[1];
+				$url = VESMIR_CZ . $uri;
+
+				// Use a separate match array: $m[0] must stay the original
+				// src="..." text, it is what gets returned when the download fails.
+				$img_slug = preg_match('|/([^/]+)$|', $uri, $name_m) ? $name_m[1] : '';
+				$img_fname = 'img_' . ($picnum++) . '_' . $img_slug;
+
+				try {
+					$f = get_file($url);
+					file_put_contents($clanek_dir . '/' . $img_fname, $f);
+					return "src=\"".htmlspecialchars($img_fname)."\"";
+				} catch(\Exception $e) {
+					echo "Error getting img $uri\n";
+					echo $e->getMessage();
+					echo $e->getTraceAsString();
+					return $m[0]; // no subst.
+				}
+			}, $body);
+
+			$nazev_e = htmlspecialchars($clanek_nazev);
+			$merged_authors_e = htmlspecialchars($merged_authors);
+
+			$cleaned = <<<DOC
+<!DOCTYPE html>
+<html lang="cs">
+<head>
+<meta charset="utf-8">
+<title>$nazev_e</title>
+<link href="../../../style.css" rel="stylesheet" type="text/css" />
+</head>
+<body>
+<h1 class="article-name">$nazev_e</h1>
+<p class="authors">$merged_authors_e</p>
+<!-- article begin -->
+
+$body
+
+<!-- article end -->
+</body>
+</html>
+
+DOC;
+
+			file_put_contents($clanek_dir . '/clanek.html', $cleaned);
+
+			$metadata = [
+				'nazev' => $clanek_nazev,
+				'slug' => $slug,
+				'url' => $clanek_url,
+				'autori' => $author_names,
+				'rocnik' => $rocnik,
+				'cislo' => $cislo,
+				'poradi' => $cl_num,
+				'prilohy' => $attachments,
+				'thumb' => $thumbfile,
+				'perex' => $perex,
+			];
+			// clanek.json doubles as the "article finished" marker checked by SKIP_EXISTING
+			file_put_contents($clanek_dir . '/clanek.json', json_encode($metadata, JSON_PRETTY_PRINT|JSON_UNESCAPED_UNICODE|JSON_UNESCAPED_SLASHES));
+
+		} catch (Exception $e) {
+			echo $e->getMessage() . "\n" . $e->getTraceAsString() . "\n";
+		}
+	}
+}
+
+/**
+ * Download all issues of one volume (year) into out/<year>/.
+ *
+ * For every cover found on the archive page the cover image is stored as
+ * "<rocnik>-<cislo>.jpg" (unless it already exists) and the issue page is
+ * handed to scrape_issue(). The script dies when an issue link does not end
+ * in "/<digits>/cislo-<digits>/".
+ *
+ * @param int|string $year year as it appears in the archive URL
+ */
+function scrape_year($year) {
+	$doc = get_doc(VESMIR_CZ . "/cz/casopis/archiv-casopisu/$year/");
+	$obalky = $doc->findAll('.vesmirObalka');
+
+	$rocnik_dir = __DIR__ . '/out/' . $year;
+	if (!file_exists($rocnik_dir)) {
+		// recursive: out/ is git-ignored, so it does not exist on a fresh checkout
+		mkdir($rocnik_dir, 0777, true);
+	}
+
+	foreach ($obalky as $obalka) {
+		// the cover block is expected to hold a single link ...
+		$a = $obalka->childNode();
+		$url_cislo = $a->href;
+
+		echo $url_cislo.PHP_EOL;
+
+		if (!preg_match('|/(\d+)/cislo-(\d+)/$|', $url_cislo, $m)) {
+			die("weird format $url_cislo");
+		}
+		echo "== Rocnik $m[1], cislo $m[2] ==\n";
+		$rocnik = $m[1];
+		$cislo = $m[2];
+		$ident = "$rocnik-$cislo";
+
+		// ... and the link a single image
+		$i = $a->childNode();
+		$url_thumb = $i->src;
+
+		// ask for a ten times taller rendition than the one used on the page
+		$url_thumb = str_replace("?h=180", "?h=1800", $url_thumb);
+
+		echo "Casopis URL: $url_cislo\nObalka URL: $url_thumb\n\n";
+		$obalka_file = $rocnik_dir . "/$ident.jpg";
+
+		if (!file_exists($obalka_file)) {
+			echo "Stahuji obalku...\n";
+			$c = get_file(VESMIR_CZ . $url_thumb);
+			file_put_contents($obalka_file, $c);
+		}
+
+		$c = get_doc(VESMIR_CZ . $url_cislo);
+
+		scrape_issue($rocnik_dir, $rocnik, $cislo, $c);
+	}
+}
+
+
+// Make sure the session is authenticated before any scraping starts.
+ensure_logged_in();
+
+// Walk the archive from 2019 back to 1994, re-checking the login before each year.
+foreach (range(2019, 1994) as $i) {
+	ensure_logged_in();
+	scrape_year($i);
+}
--- a/session.inc
+++ b/session.inc
@ -0,0 +1,87 @@
+<?php
+
+/**
+ * Solve the arithmetic anti-spam question shown on the page.
+ *
+ * The question is read from the element whose "for" attribute is
+ * spamProtectionDisableResult and must contain "<number> plus <number>"
+ * or "<number> mínus <number>".
+ *
+ * @param Html $doc parsed challenge page
+ * @return int the sum or the difference
+ * @throws Exception when the question has a different form
+ */
+function solveChallenge(Html $doc) : int {
+	$challenge = $doc->find('[for=spamProtectionDisableResult]')->text();
+	echo "Challenge is: $challenge\n";
+
+	if (!preg_match('/(\d+) (plus|mínus) (\d+)/', $challenge, $m)) {
+		throw new Exception("Unexpected challenge: $challenge");
+	}
+
+	// unary plus turns the matched digit strings into numbers
+	$left = +$m[1];
+	$right = +$m[3];
+	$sign = ($m[2] == 'plus') ? 1 : -1;
+
+	$r = $left + $sign * $right;
+	echo "Result: $r\n";
+	return $r;
+}
+
+/**
+ * Post the login form with the credentials from VESMIR_LOGIN / VESMIR_PASSWORD.
+ *
+ * @return object response object of the login request
+ */
+function login()
+{
+	echo "----- attempting to login -----\n";
+
+	$form = [
+		"username" => VESMIR_LOGIN,
+		"password" => VESMIR_PASSWORD,
+		"docId" => 9573,
+		"doShowdocAction" => "/usrlogon.do",
+		"emailLogon" => false,
+		"origDocId" => 9573,
+	];
+
+	return post( "https://vesmir.cz/usrlogon.do", $form);
+}
+
+/**
+ * Fetch the anti-spam form, solve its arithmetic question and submit the answer.
+ *
+ * @return object response object of the submit request
+ * @throws Exception when the response does not contain the confirmation text
+ */
+function disableAntispam()
+{
+	echo "----- disabling antispam -----\n";
+
+	$form_page = get("https://vesmir.cz/components/form/spamprotectiondisable.jsp?backurl=%2Fcz%2Fuzivatel.html");
+
+	$doc = new Html($form_page->content);
+	$solved = solveChallenge($doc);
+
+	// the hidden fields of the form have to be sent back together with the answer
+	$token = $doc->find('[name=__token]')->value;
+	$hash = $doc->find('[name=hash]')->value;
+
+	$result = post( "https://vesmir.cz/components/form/spamprotectiondisable.jsp", [
+		"result" => $solved,
+		"__token" => $token,
+		"backurl" => "/cz/uzivatel.html",
+		"hash" => $hash,
+	]);
+
+	if (!preg_match("/Zadaný výsledek je správný/", $result->content)) {
+		print_r($result);
+
+		throw new Exception("Failed to disable antispam.");
+	}
+
+	echo "Anti-spam succeeded.\n";
+	return $result;
+}
+
+/**
+ * Currently a no-op: the whole body is commented out.
+ *
+ * The disabled code copies cookie.txt to cookie-wget.txt with every
+ * "#HttpOnly_" prefix removed.
+ */
+function dump_cookie_file_for_wget() {
+//	echo "Exporting cookie for WGET\n";
+//	$c = file_get_contents("cookie.txt");
+//	$c = str_replace('#HttpOnly_', '', $c);
+//	file_put_contents('cookie-wget.txt', $c);
+}
+
+/**
+ * Make sure the cookie jar holds a logged-in session, logging in when needed.
+ *
+ * A page counts as "logged in" when it contains the logoff link.
+ *
+ * @return bool always true; failure is reported by exception
+ * @throws Exception when the session is still not logged in after the login attempt
+ */
+function ensure_logged_in() {
+	$user_page = "https://vesmir.cz/cz/uzivatel.html";
+	$logoff_marker = '/logoff.do?forward=/cz/';
+
+	// get a session cookie
+	$r = get($user_page);
+	if (strpos($r->content, $logoff_marker) !== false) {
+		echo "Already logged in!\n";
+		dump_cookie_file_for_wget();
+		return true;
+	}
+
+	echo "Need login!\n";
+
+	disableAntispam();
+	get($user_page);
+	$result = login();
+
+	// Judge the attempt by what came back AFTER the login request; $r is the
+	// page fetched before it and is already known not to contain the link.
+	$logged_in = strpos($result->content, $logoff_marker) !== false;
+	if (!$logged_in) {
+		// The login response may not be the user page itself (TODO confirm
+		// against the site), so look at the user page once more before giving up.
+		$check = get($user_page);
+		$logged_in = strpos($check->content, $logoff_marker) !== false;
+	}
+
+	if (!$logged_in) {
+		print_r($result);
+		throw new Exception("--- LOGIN FAILED! ---");
+	}
+
+	echo "Logged in!\n";
+	dump_cookie_file_for_wget();
+	return true;
+}