Initial commit

6 years ago · 4760eaf987
commit 4760eaf987
5 changed files with 654 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,5 @@
 .idea/
 out/
 cookie.txt
 cookie-wget.txt
 vesmir.cz
--- a/http.inc
+++ b/http.inc
@@ -0,0 +1,97 @@
 <?php
 const UA = 'Mozilla/5.0 (Windows NT 6.1; rv:8.0) Gecko/20100101 Firefox/8.0';
 /**
  * Fetch a page with GET and wrap the response body in a queryable Html document.
  *
  * @param string $url URL to download
  * @return Html document parsed from the response body
  * @throws Exception propagated from get() when the request does not end with status 200
  */
 function get_doc($url) {
 	$response = get($url);
 	$markup = $response->content;
 	return new Html($markup);
 }
 /**
  * Fetch a URL with GET and hand back only the response body.
  *
  * @param string $url URL to download
  * @return string response body as returned by curl
  * @throws Exception propagated from get() when the request does not end with status 200
  */
 function get_file($url) {
 	$response = get($url);
 	return $response->content;
 }
 /**
  * Run one HTTP request through curl and return the response as an object.
  *
  * Cookies are read from and written to "cookie.txt" (relative to the working directory),
  * redirects are followed (at most 10) and the response body is returned rather than printed.
  *
  * @param string $url          URL to request
  * @param array  $mergeoptions CURLOPT_* => value pairs that override or extend the defaults
  * @return object the curl_getinfo() fields plus: errno, errmsg, headers (lower-cased header
  *                name => list of values, from the final response only) and content (the body)
  * @throws Exception when curl cannot be initialised, reports a transport error, or the final
  *                   HTTP status is not 200
  */
 function get_or_post($url, $mergeoptions) {
 	$options = array(
 		CURLOPT_USERAGENT => UA, //set user agent
 		CURLOPT_COOKIEFILE => "cookie.txt", //set cookie file
 		CURLOPT_COOKIEJAR => "cookie.txt", //set cookie jar
 		CURLOPT_COOKIESESSION => false,
 		CURLOPT_RETURNTRANSFER => true,     // return web page
 		CURLOPT_HEADER => false,    // don't return headers
 		CURLOPT_FOLLOWLOCATION => true,     // follow redirects
 		CURLOPT_ENCODING => "",       // handle all encodings
 		CURLOPT_AUTOREFERER => true,     // set referer on redirect
 		CURLOPT_CONNECTTIMEOUT => 120,      // timeout on connect
 		CURLOPT_TIMEOUT => 120,      // timeout on response
 		CURLOPT_MAXREDIRS => 10,       // stop after 10 redirects
 	);
 	// array_replace() keeps the integer CURLOPT_* keys intact (array_merge() would renumber them)
 	$options = array_replace($options, $mergeoptions);
 	$ch = curl_init($url);
 	if ($ch === false) {
 		throw new \Exception("Could not initialise curl for: $url");
 	}
 	curl_setopt_array($ch, $options);
 	// this function is called by curl for each header line received
 	$response_headers = [];
 	curl_setopt($ch, CURLOPT_HEADERFUNCTION,
 		function($curl, $header) use (&$response_headers)
 		{
 			$len = strlen($header);
 			// A status line starts a new response (redirect hop, 100-continue, ...):
 			// forget the headers of the previous one so only the final response is reported.
 			if (strncasecmp($header, 'HTTP/', 5) === 0) {
 				$response_headers = [];
 				return $len;
 			}
 			$parts = explode(':', $header, 2);
 			if (count($parts) < 2) // ignore invalid headers
 				return $len;
 			$name = strtolower(trim($parts[0]));
 			$response_headers[$name][] = trim($parts[1]);
 			return $len;
 		}
 	);
 	$content = curl_exec($ch);
 	$err = curl_errno($ch);
 	$errmsg = curl_error($ch);
 	$header = curl_getinfo($ch);
 	curl_close($ch);
 	$header['errno'] = $err;
 	$header['errmsg'] = $errmsg;
 	$header['headers'] = $response_headers;
 	$header['content'] = $content;
 	if ($err !== 0) {
 		// Transport-level failure (DNS, timeout, truncated transfer, ...): report curl's own
 		// message instead of a meaningless "status 0".
 		print_r($header);
 		throw new \Exception("Request failed (curl error $err): $errmsg");
 	}
 	if ($header['http_code'] != 200) {
 		print_r($header);
 		throw new \Exception("Error status: $header[http_code]");
 	}
 	return (object) $header;
 }
 /**
  * Announce and send a GET request.
  *
  * @param string $url URL to request
  * @return object response object produced by get_or_post()
  * @throws Exception propagated from get_or_post()
  */
 function get($url)
 {
 	echo "Sending GET to: $url\n";
 	$curl_options = [
 		CURLOPT_CUSTOMREQUEST => "GET", // force the request verb
 		CURLOPT_POST => false,          // make sure POST mode is off
 	];
 	return get_or_post($url, $curl_options);
 }
 /**
  * Announce and send a POST request, dumping the submitted fields to stdout first.
  *
  * @param string $url    URL to post to
  * @param mixed  $fields value handed to CURLOPT_POSTFIELDS (callers in this project pass arrays)
  * @return object response object produced by get_or_post()
  * @throws Exception propagated from get_or_post()
  */
 function post($url, $fields)
 {
 	echo "Sending POST to: $url\n";
 	print_r($fields);
 	$curl_options = [
 		CURLOPT_CUSTOMREQUEST => "POST", // force the request verb
 		CURLOPT_POST => true,            // switch curl into POST mode
 		CURLOPT_POSTFIELDS => $fields,   // request body
 	];
 	return get_or_post($url, $curl_options);
 }
--- a/parse.inc
+++ b/parse.inc
@@ -0,0 +1,186 @@
 <?php
 const PARSER_DEBUG = 0;
 const XPATH_DEBUG = 0;
 /**
  * Trait DomQuery
  *
  * Small CSS-like selector layer implemented on top of DOMXPath.
  *
  * The using class must set these properties before any query is run:
  *
  *	$this->dom = $dom;           // DOMDocument that is queried
  *	$this->contextnode = null;   // DOMNode the queries are relative to, or null for the whole document
  *
  * and must provide toXml(), which find() uses for its debug output.
  */
 trait DomQuery {
 	/**
 	 * Return the single element matching $pat.
 	 *
 	 * @param string $pat selector in one of the forms accepted by findAll()
 	 * @return Node
 	 * @throws Exception when nothing matches or when more than one element matches
 	 */
 	public function find(string $pat) : Node
 	{
 		$els = $this->findAll($pat);
 		if (!count($els)) {
 			if (PARSER_DEBUG) {
 				echo "---- match failed; context: ---\n";
 				echo $this->toXml() ."\n";
 			}
 			throw new \Exception("No match: $pat");
 		}
 		if (count($els) > 1) {
 			if (PARSER_DEBUG) {
 				echo "Query results:\n";
 				foreach ($els as $el) {
 					echo $el->toXml()."\n";
 				}
 			}
 			throw new \Exception("Multiple match (".count($els)."x): $pat");
 		}
 		return $els[0];
 	}
 	/**
 	 * Return every descendant matching a simple selector.
 	 *
 	 * Supported forms: "node", ".class", "node.class", "#id", "[attr]", "node[attr]" and
 	 * "[attr=value]" with the optional operators ^= (prefix), $= (suffix), *= and ~= (both
 	 * treated as substring match). The value may be wrapped in quotes.
 	 *
 	 * @param string $pat
 	 * @return array|Node[]
 	 * @throws Exception when the pattern has none of the supported forms
 	 */
 	public function findAll(string $pat) : array
 	{
 		# node
 		if (preg_match('/^([a-z0-9_-]+)$/i', $pat, $matches)) {
 			return $this->x("//$matches[1]");
 		}
 		# .class, node.class
 		if (preg_match('/^(?P<elem>[a-z0-9_-]*)\.(?P<cls>[a-z0-9_-]+)$/i', $pat, $matches)) {
 			$elem = $matches['elem'] ?: '*';
 			return $this->x("//{$elem}[contains(concat(' ',normalize-space(@class),' '),' $matches[cls] ')]");
 		}
 		# #id -- note the "@": without it XPath looks for a child element named "id"
 		if (preg_match('/^#(\w+)$/', $pat, $matches)) {
 			return $this->x("//*[@id='$matches[1]']");
 		}
 		# [attr=value], node[attr=value], also with ^= $= *= ~= (allows quotes)
 		if (preg_match('/^(?P<elem>[a-z0-9_-]*)\[(?P<attr>[a-z0-9_-]+)(?P<op>[$*~^]|)=[\'"]?(?P<val>[^\'"\]]+)[\'"]?\]$/', $pat, $matches)) {
 			$elem = $matches['elem'] ?: '*';
 			$attr = $matches['attr'];
 			$val = $matches['val']; // cannot contain quotes (excluded by the regex)
 			switch ($matches['op']) {
 				case '':
 					return $this->x("//{$elem}[@{$attr}='{$val}']");
 				case '^':
 					return $this->x("//{$elem}[starts-with(@{$attr}, '{$val}')]");
 				case '$':
 					// XPath 1.0 has no ends-with(); compare the tail of the attribute instead.
 					// substring() is 1-based, hence the "+ 1". Lengths are computed inside XPath
 					// so they are counted in characters on both sides.
 					return $this->x("//{$elem}[substring(@{$attr}, string-length(@{$attr}) - string-length('{$val}') + 1) = '{$val}']");
 				case '*':
 				case '~':
 					return $this->x("//{$elem}[contains(@{$attr}, '{$val}')]");
 			}
 		}
 		# [attr], node[attr]
 		if (preg_match('/^(?P<elem>[a-z0-9_-]*)\[(?P<attr>[a-z0-9_-]+)\]$/', $pat, $matches)) {
 			$elem = $matches['elem'] ?: '*';
 			return $this->x("//{$elem}[@$matches[attr]]");
 		}
 		throw new \Exception("Unknown pattern: $pat");
 	}
 	/**
 	 * Run a raw XPath expression and wrap each resulting node in a Node.
 	 *
 	 * An expression starting with "//" is made relative to the context node (if one is set)
 	 * by prefixing it with ".".
 	 *
 	 * @param string $x XPath expression
 	 * @return array|Node[] empty when nothing matches or the expression is rejected by DOMXPath
 	 */
 	public function x(string $x) : array
 	{
 		$xpath = new DOMXpath($this->dom);
 		if (strpos($x, '//') === 0 && $this->contextnode) {
 			$x = '.' . $x;
 		}
 		if (XPATH_DEBUG) echo "\nxpath is: $x\n";
 		$elements = $xpath->query($x, $this->contextnode);
 		if ($elements === false) {
 			// query() signals a bad expression with false (not null), so "??" would not catch it
 			return [];
 		}
 		$elems = [];
 		foreach($elements as $e) {
 			$elems[] = new Node($this->dom, $e);
 		}
 		return $elems;
 	}
 }
 /**
  * A parsed HTML document that can be searched with the DomQuery selectors.
  */
 class Html
 {
 	use DomQuery;
 	/** @var DOMDocument document every query runs against */
 	public $dom;
 	/** @var DOMNode|null stays null: queries on an Html object cover the whole document */
 	public $contextnode = null;
 	/**
 	 * Parse an HTML string; markup problems reported by libxml are discarded.
 	 *
 	 * @param string $html markup to parse; an empty string yields an empty document
 	 */
 	public function __construct(string $html)
 	{
 		$dom = new DomDocument();
 		if (PARSER_DEBUG) echo "Creating HTML parser from:\n" . $html . "\n\n";
 		// loadHTML() rejects an empty string (ValueError on PHP 8), so skip it in that case
 		if ($html !== '') {
 			// collect the spammy parser warnings internally instead of silencing with "@"
 			$previous = libxml_use_internal_errors(true);
 			$dom->loadHTML($html);
 			libxml_clear_errors();
 			libxml_use_internal_errors($previous);
 		}
 		$this->dom = $dom;
 		$this->contextnode = null;
 	}
 	/**
 	 * @return string the whole document serialized with DOMDocument::saveXml()
 	 */
 	public function toXml() : string
 	{
 		return $this->dom->saveXml();
 	}
 }
 /**
  * One DOM node of a parsed document; selector queries run relative to this node.
  *
  * Attributes are readable with property syntax, e.g. $a->href.
  */
 class Node
 {
 	use DomQuery;
 	/** @var DOMDocument document the node belongs to */
 	public $dom;
 	/** @var DOMNode the wrapped node */
 	public $element;
 	/** @var DOMNode context for DomQuery; same object as $element */
 	public $contextnode;
 	/**
 	 * @param DOMDocument $dom     owning document
 	 * @param DOMNode     $element node to wrap
 	 */
 	public function __construct(DOMDocument $dom, DOMNode $element)
 	{
 		$this->dom = $dom;
 		$this->element = $element;
 		$this->contextnode = $element;
 	}
 	/**
 	 * Read an attribute of the wrapped element.
 	 *
 	 * @param string $name attribute name
 	 * @return string attribute value; '' when the node is not an element (text, comment, ...)
 	 */
 	public function __get($name)
 	{
 		// only DOMElement has getAttribute(); calling it on a text node would be a fatal error
 		if (!($this->element instanceof DOMElement)) {
 			return '';
 		}
 		return $this->element->getAttribute($name);
 	}
 	/**
 	 * @return string the node's nodeValue, or '' when it is null
 	 */
 	public function text() : string
 	{
 		return $this->element->nodeValue ?? '';
 	}
 	/**
 	 * @return array|Node[] all direct children (of any node type), each wrapped in a Node
 	 */
 	public function childNodes() : array
 	{
 		$elems = [];
 		foreach($this->element->childNodes as $e) {
 			$elems[] = new Node($this->dom, $e);
 		}
 		return $elems;
 	}
 	/**
 	 * Return the only child of this node.
 	 *
 	 * @return Node
 	 * @throws Exception when the node has no children or more than one
 	 */
 	public function childNode() : Node
 	{
 		$cn = $this->childNodes();
 		if (count($cn) > 1) {
 			throw new \Exception("More than one childnode.");
 		}
 		if (!count($cn)) {
 			// without this the undefined $cn[0] would violate the return type (TypeError)
 			throw new \Exception("No childnode.");
 		}
 		return $cn[0];
 	}
 	/**
 	 * @return string this node serialized through its owner document's saveXml()
 	 */
 	public function toXml() : string
 	{
 		return $this->element->ownerDocument->saveXml($this->element);
 	}
 }
--- a/run.php
+++ b/run.php
@@ -0,0 +1,279 @@
 <?php
 const MAX_DIR_NAME_LEN = 40;
 const SKIP_EXISTING = true;
 const VESMIR_CZ = 'https://vesmir.cz';
 const VESMIR_LOGIN = "";
 const VESMIR_PASSWORD = "";
 require_once "http.inc";
 require_once "parse.inc";
 require_once "session.inc";
 /**
  * Download every article listed on one issue page.
  *
  * For each article row a directory "<rocnik_dir>/<cislo>/<n> - <title>" is created and filled with:
  * thumb.jpg (when the row has a thumbnail), orig.html (raw article page), the linked attachments,
  * the article images (img_<n>_<name>), clanek.html (cleaned article) and clanek.json (metadata).
  * Articles that already have clanek.json are skipped when SKIP_EXISTING is on. A failure inside one
  * article is printed and the loop moves on to the next row.
  *
  * @param string     $rocnik_dir directory of the volume the issue belongs to
  * @param string|int $rocnik     volume identifier (used for output and metadata)
  * @param string|int $cislo      issue identifier (also the issue directory name)
  * @param Html       $doc        parsed issue page
  * @throws Exception when the page has no unique ".clanky" container
  */
 function scrape_issue($rocnik_dir, $rocnik, $cislo, Html $doc) {
 	$cislo_dir = $rocnik_dir . '/' . $cislo;
 	if (!file_exists($cislo_dir)) {
 		// recursive, so a missing parent directory does not make every later write fail
 		mkdir($cislo_dir, 0777, true);
 	}
 	echo "\nStahuji cislo $rocnik/$cislo\n\n";
 	$n_clanky = $doc->find('.clanky');
 	$clankyItems = $n_clanky->findAll('.row');
 	$cl_num = 0;
 	$aktualni_h4 = null;
 	foreach ($clankyItems as $row) {
 		// A row holding exactly one <h4> is a section heading, not an article
 		try {
 			$hh = $row->find('h4');
 			$aktualni_h4 = $hh->text();
 			echo "\n~ Skupina clanku: $aktualni_h4 ~\n";
 			continue;
 		} catch(Exception $e) {
 			/* ok.. */
 		}
 		if ($row->class != 'clankyItem row') {
 			echo "Skip non-article\n";
 			continue;
 		}
 		try {
 			//echo $row->toXml();
 			$num = ++$cl_num; // increment the article counter
 			$h3 = $row->find('h3');
 			$a = $h3->find('a');
 			$clanek_url = VESMIR_CZ . $a->href;
 			$clanek_nazev = $a->text();
 			// Get slug (last path segment without ".html"); fall back to a numbered name
 			// when the URL has a different shape instead of reading an undefined match
 			$slug = preg_match('|/([^./]+)\.html$|', $clanek_url, $m) ? $m[1] : "clanek-$num";
 			// Get dirname
 			$fname = $num . ' - ' . $clanek_nazev;
 			$fname = mb_ereg_replace("([^\w\s\d\-_~,;\[\]\(\). ])", '', $fname);
 			$fname = mb_ereg_replace("([\.]{2,})", '', $fname);
 			if (strlen($fname) > MAX_DIR_NAME_LEN) {
 				// cut at the last space at or before the length limit
 				$fname = substr($fname, 0, strrpos($fname, ' ', -(strlen($fname) - MAX_DIR_NAME_LEN)));
 			}
 			// Ensure dir exists
 			$clanek_dir = $cislo_dir . '/' . $fname;
 			if (!file_exists($clanek_dir)) {
 				mkdir($clanek_dir);
 			}
 			echo "\n- $rocnik/$cislo -> Clanek #$num: $clanek_nazev -\nUrl: $clanek_url\n";
 			$perex = null;
 			try {
 				$perex = $row->find('.perex')->text();
 			} catch (Exception $e) {
 				echo "No perex. ".$e->getMessage()."\n";
 			}
 			$thumbfile = null;
 			try {
 				if (file_exists($clanek_dir . '/thumb.jpg')) {
 					$thumbfile = 'thumb.jpg';
 				} else {
 					$thumb = $row->find('img.img-responsive');
 					$f = get_file(VESMIR_CZ . $thumb->src);
 					file_put_contents($clanek_dir . '/thumb.jpg', $f);
 					$thumbfile = 'thumb.jpg';
 				}
 			} catch (Exception $e) {
 				echo "No thumb. ".$e->getMessage()."\n";
 			}
 			$author_names = [];
 			try {
 				$authors = $row->find('.authors');
 				$author_links = $authors->findAll('a');
 				foreach ($author_links as $al) {
 					$author_names[] = $al->text();
 				}
 			} catch (Exception $e) {
 				echo "!! No .authors div\n";
 			}
 			$merged_authors = implode(', ', $author_names);
 			if(SKIP_EXISTING && file_exists($clanek_dir . '/clanek.json')) {
 				echo "ARTICLE ALREADY DL'D, SKIP\n";
 				continue;
 			}
 			$resp = get_file($clanek_url);
 			file_put_contents($clanek_dir . '/orig.html', $resp);
 			$article_doc = new Html($resp);
 			$attachments = [];
 			// Try to download attachments (pdf version...)
 			try {
 				$dmedia = $article_doc->find('.media');
 				foreach ($dmedia->findAll('a[href]') as $item) {
 					$href = VESMIR_CZ . $item->href;
 					echo "> Downloading: " . $item->text() . "\n" . $href;
 					// Per-attachment state: must be reset here, otherwise the values of the
 					// previous attachment (or article) leak into this one.
 					$isarticlepdf = ($item->text() == 'článek ve formátu pdf');
 					$orig_fname = null;
 					// the article pdf is named after the slug; anything else is probably a pdf too
 					$att_fname = $isarticlepdf ? $slug . '.pdf' : uniqid() . '.pdf';
 					$att_resp = get($href);
 					if (isset($att_resp->headers['content-disposition'])) {
 						$first = $att_resp->headers['content-disposition'][0];
 						if (preg_match('/filename="?([^";]+)"?/i', $first, $fm)) {
 							// basename() stops a server-supplied name from escaping the article dir
 							$candidate = basename(trim($fm[1]));
 							if ($candidate !== '' && $candidate !== '.' && $candidate !== '..') {
 								$orig_fname = $candidate;
 							}
 						}
 					}
 					// keep the generated name when the server did not announce one
 					if (!$isarticlepdf && $orig_fname !== null) {
 						$att_fname = $orig_fname;
 					}
 					file_put_contents($clanek_dir . '/' . $att_fname, $att_resp->content);
 					unset($att_resp->content);
 					$attachments[] = [
 						'url' => $href,
 						'popis' => $item->text(),
 						'nazev' => $orig_fname,
 						'soubor' => $att_fname,
 					];
 				}
 			} catch(Exception $e) {
 				echo "Error finding media links: ".$e->getMessage()."\n";
 			}
 			$adiv = $article_doc->find('div.article');
 			$body = $adiv->toXml(); // serialize the body div
 			$body = str_replace('&#13;', '', $body);
 			$picnum = 0;
 			// Download every site-hosted image and point its src at the local copy
 			$body = preg_replace_callback('|src="(/images/[^"]+)"|', function($m) use ($clanek_dir, &$picnum) {
 				$uri = $m[1];
 				$url = VESMIR_CZ . $uri;
 				// use a separate match array: $m[0] is still needed for the failure path
 				$img_slug = preg_match('|/([^/]+)$|', $uri, $um) ? $um[1] : 'image';
 				$img_fname = 'img_' . ($picnum++) . '_' . $img_slug;
 				try {
 					$f = get_file($url);
 					file_put_contents($clanek_dir . '/' . $img_fname, $f);
 					return "src=\"".htmlspecialchars($img_fname)."\"";
 				} catch(\Exception $e) {
 					echo "Error getting img $uri\n";
 					echo $e->getMessage();
 					echo $e->getTraceAsString();
 					return $m[0]; // no subst.
 				}
 			}, $body);
 			$nazev_e = htmlspecialchars($clanek_nazev);
 			$merged_authors_e = htmlspecialchars($merged_authors);
 			$cleaned = <<<DOC
<!DOCTYPE html>
<html lang="cs">
<head>
<meta charset="utf-8">
<title>$nazev_e</title>
<link href="../../../style.css" rel="stylesheet" type="text/css" />
</head>
<body>
<h1 class="article-name">$nazev_e</h1>
<p class="authors">$merged_authors_e</p>
<!-- article begin -->
$body
<!-- article end -->
</body>
</html>
DOC;
 			file_put_contents($clanek_dir . '/clanek.html', $cleaned);
 			$metadata = [
 				'nazev' => $clanek_nazev,
 				'slug' => $slug,
 				'url' => $clanek_url,
 				'autori' => $author_names,
 				'rocnik' => $rocnik,
 				'cislo' => $cislo,
 				'poradi' => $cl_num,
 				'prilohy' => $attachments,
 				'thumb' => $thumbfile,
 				'perex' => $perex,
 			];
 			file_put_contents($clanek_dir . '/clanek.json', json_encode($metadata, JSON_PRETTY_PRINT|JSON_UNESCAPED_UNICODE|JSON_UNESCAPED_SLASHES));
 		} catch (Exception $e) {
 			echo $e->getMessage() . "\n" . $e->getTraceAsString() . "\n";
 		}
 	}
 }
 /**
  * Download all issues listed on the archive page of one year.
  *
  * For every ".vesmirObalka" entry the cover image is stored as out/<year>/<rocnik>-<cislo>.jpg
  * (unless the file already exists) and the issue page is handed to scrape_issue().
  *
  * @param int|string $year year segment of the archive URL and name of the output directory
  * @throws Exception when the output directory cannot be created, or propagated from the
  *                   HTTP / parsing helpers
  */
 function scrape_year($year) {
 	$doc = get_doc(VESMIR_CZ . "/cz/casopis/archiv-casopisu/$year/");
 	$obalky = $doc->findAll('.vesmirObalka');
 	$rocnik_dir = __DIR__ . '/out/' . $year;
 	// "out/" is git-ignored and therefore absent on a fresh checkout: create the whole path.
 	// The trailing is_dir() covers another process creating it in the meantime.
 	if (!is_dir($rocnik_dir) && !mkdir($rocnik_dir, 0777, true) && !is_dir($rocnik_dir)) {
 		throw new Exception("Cannot create output directory: $rocnik_dir");
 	}
 	foreach ($obalky as $obalka) {
 		// the cover entry is expected to contain exactly one node: the link to the issue
 		$a = $obalka->childNode();
 		$url_cislo = $a->href;
 		echo $url_cislo.PHP_EOL;
 		if (!preg_match('|/(\d+)/cislo-(\d+)/$|', $url_cislo, $m)) {
 			die("weird format $url_cislo");
 		}
 		echo "== Rocnik $m[1], cislo $m[2] ==\n";
 		$rocnik = $m[1];
 		$cislo = $m[2];
 		$ident = "$rocnik-$cislo";
 		// ...and the link is expected to contain exactly one node: the cover thumbnail
 		$i = $a->childNode();
 		$url_thumb = $i->src;
 		// request the cover at height 1800 instead of the 180 used on the archive page
 		$url_thumb = str_replace("?h=180", "?h=1800", $url_thumb);
 		echo "Casopis URL: $url_cislo\nObalka URL: $url_thumb\n\n";
 		$obalka_file = $rocnik_dir . "/$ident.jpg";
 		if (!file_exists($obalka_file)) {
 			echo "Stahuji obalku...\n";
 			$c = get_file(VESMIR_CZ . $url_thumb);
 			file_put_contents($obalka_file, $c);
 		}
 		$c = get_doc(VESMIR_CZ . $url_cislo);
 		scrape_issue($rocnik_dir, $rocnik, $cislo, $c);
 	}
 }
 // Authenticate once up front.
 ensure_logged_in();
 // To fetch a single year only, call scrape_year(2019) instead of running the loop below.
 // Walk the archive from 2019 back to 1994, re-checking the login before every year.
 foreach (range(2019, 1994) as $year) {
 	ensure_logged_in();
 	scrape_year($year);
 }
--- a/session.inc
+++ b/session.inc
@@ -0,0 +1,87 @@
 <?php
 /**
  * Solve the arithmetic anti-spam question found on the given page.
  *
  * The question text is read from the element with for="spamProtectionDisableResult" and must
  * contain "<number> plus <number>" or "<number> mínus <number>".
  *
  * @param Html $doc page containing the challenge
  * @return int the computed answer
  * @throws Exception when the challenge element is missing or its text has another shape
  */
 function solveChallenge(Html $doc) : int {
 	$label = $doc->find('[for=spamProtectionDisableResult]');
 	$challenge = $label->text();
 	echo "Challenge is: $challenge\n";
 	if (!preg_match('/(\d+) (plus|mínus) (\d+)/', $challenge, $m)) {
 		throw new Exception("Unexpected challenge: $challenge");
 	}
 	// unary plus converts the captured digit strings to numbers
 	$left = +$m[1];
 	$right = +$m[3];
 	if ($m[2] == 'plus') {
 		$r = $left + $right;
 	} else {
 		$r = $left - $right;
 	}
 	echo "Result: $r\n";
 	return $r;
 }
 /**
  * Submit the login form with the configured credentials.
  *
  * @return object response object of the POST to /usrlogon.do
  * @throws Exception propagated from post()
  */
 function login()
 {
 	echo "----- attempting to login -----\n";
 	$form = [
 		"username" => VESMIR_LOGIN,
 		"password" => VESMIR_PASSWORD,
 		"docId" => 9573,
 		"doShowdocAction" => "/usrlogon.do",
 		"emailLogon" => false,
 		"origDocId" => 9573,
 	];
 	$response = post( "https://vesmir.cz/usrlogon.do", $form);
 	return $response;
 }
 /**
  * Pass the site's anti-spam challenge: load the challenge form, solve it and post the answer
  * together with the form's __token and hash fields.
  *
  * @return object response object of the answer POST
  * @throws Exception when the response lacks the success message, or propagated from the
  *                   HTTP / parsing helpers
  */
 function disableAntispam()
 {
 	echo "----- disabling antispam -----\n";
 	$page = get("https://vesmir.cz/components/form/spamprotectiondisable.jsp?backurl=%2Fcz%2Fuzivatel.html");
 	$doc = new Html($page->content);
 	$solved = solveChallenge($doc);
 	$token = $doc->find('[name=__token]')->value;
 	$hash = $doc->find('[name=hash]')->value;
 	$result = post( "https://vesmir.cz/components/form/spamprotectiondisable.jsp", [
 		"result" => $solved,
 		"__token" => $token,
 		"backurl" => "/cz/uzivatel.html",
 		"hash" => $hash,
 	]);
 	// the site confirms a correct answer with this sentence
 	if (!preg_match("/Zadaný výsledek je správný/", $result->content)) {
 		print_r($result);
 		throw new Exception("Failed to disable antispam.");
 	}
 	echo "Anti-spam succeeded.\n";
 	return $result;
 }
 /**
  * Hook for exporting the curl cookie jar for use with wget.
  *
  * The implementation is commented out, so calling this currently does nothing. When enabled
  * it would copy cookie.txt to cookie-wget.txt with every "#HttpOnly_" prefix removed.
  */
 function dump_cookie_file_for_wget() {
 //	echo "Exporting cookie for WGET\n";
 //	$c = file_get_contents("cookie.txt");
 //	$c = str_replace('#HttpOnly_', '', $c);
 //	file_put_contents('cookie-wget.txt', $c);
 }
 /**
  * Make sure the shared cookie jar holds an authenticated session, logging in when needed.
  *
  * A session counts as logged in when a page contains the logoff link.
  *
  * @return bool always true (failure is reported by exception)
  * @throws Exception when the login does not produce an authenticated session, or propagated
  *                   from the HTTP helpers / disableAntispam()
  */
 function ensure_logged_in() {
 	$account_url = "https://vesmir.cz/cz/uzivatel.html";
 	$logged_in_marker = '/logoff.do?forward=/cz/';
 	// get a session cookie
 	$r = get($account_url);
 	if (strpos($r->content, $logged_in_marker) !== false) {
 		echo "Already logged in!\n";
 		dump_cookie_file_for_wget();
 		return true;
 	}
 	echo "Need login!\n";
 	disableAntispam();
 	get($account_url);
 	$result = login();
 	// Inspect the page returned by the login itself. (Re-checking $r here could never
 	// succeed: it is the pre-login page that was just found to lack the marker.)
 	$ok = strpos($result->content, $logged_in_marker) !== false;
 	if (!$ok) {
 		// NOTE(review): the login response may be a page without the logoff link;
 		// confirm the session state against the account page before giving up.
 		$check = get($account_url);
 		$ok = strpos($check->content, $logged_in_marker) !== false;
 	}
 	if (!$ok) {
 		print_r($result);
 		throw new Exception("--- LOGIN FAILED! ---");
 	}
 	echo "Logged in!\n";
 	dump_cookie_file_for_wget();
 	return true;
 }