commit
						4760eaf987
					
				@ -0,0 +1,5 @@ | 
				
			||||
.idea/ | 
				
			||||
out/ | 
				
			||||
cookie.txt | 
				
			||||
cookie-wget.txt | 
				
			||||
vesmir.cz | 
				
			||||
@ -0,0 +1,97 @@ | 
				
			||||
<?php | 
				
			||||
 | 
				
			||||
const UA = 'Mozilla/5.0 (Windows NT 6.1; rv:8.0) Gecko/20100101 Firefox/8.0'; | 
				
			||||
 | 
				
			||||
/**
 * Fetch a page with GET and parse its body into a queryable document.
 *
 * @param string $url address to download
 * @return Html document built from the response body
 */
function get_doc($url) {
	$response = get($url);

	return new Html($response->content);
}
				
			||||
 | 
				
			||||
/**
 * Fetch a resource with GET and hand back only its raw body.
 *
 * @param string $url address to download
 * @return string response body as returned by get()
 */
function get_file($url) {
	$response = get($url);

	return $response->content;
}
				
			||||
 | 
				
			||||
/**
 * Run a single HTTP request through cURL and return everything known about the response.
 *
 * Cookies are read from and written to "cookie.txt" so that consecutive calls share a session.
 *
 * @param string $url target address
 * @param array $mergeoptions cURL options (CURLOPT_* => value) overriding/extending the defaults
 * @return object curl_getinfo() fields plus: errno, errmsg, headers (lower-cased name => list of values), content
 * @throws Exception when cURL cannot be initialised, the transfer fails, or the final status is not 200
 */
function get_or_post($url, $mergeoptions) {
	$defaults = [
		CURLOPT_USERAGENT => UA, //set user agent
		CURLOPT_COOKIEFILE => "cookie.txt", //set cookie file
		CURLOPT_COOKIEJAR => "cookie.txt", //set cookie jar
		CURLOPT_COOKIESESSION => false,
		CURLOPT_RETURNTRANSFER => true,     // return web page
		CURLOPT_HEADER => false,    // don't return headers
		CURLOPT_FOLLOWLOCATION => true,     // follow redirects
		CURLOPT_ENCODING => "",       // handle all encodings
		CURLOPT_AUTOREFERER => true,     // set referer on redirect
		CURLOPT_CONNECTTIMEOUT => 120,      // timeout on connect
		CURLOPT_TIMEOUT => 120,      // timeout on response
		CURLOPT_MAXREDIRS => 10,       // stop after 10 redirects
	];

	// array_replace keeps the integer CURLOPT_* keys (array_merge would renumber them)
	// and lets the caller's values win, with new options appended after the defaults.
	$options = array_replace($defaults, $mergeoptions);

	$ch = curl_init($url);
	if ($ch === false) {
		throw new \Exception("Failed to initialise cURL for: $url");
	}
	curl_setopt_array($ch, $options);

	// this function is called by curl for each header received
	$response_headers = [];
	curl_setopt($ch, CURLOPT_HEADERFUNCTION,
		function($curl, $header) use (&$response_headers)
		{
			// curl requires the callback to report how many bytes it consumed
			$len = strlen($header);
			$parts = explode(':', $header, 2);
			if (count($parts) < 2) // ignore invalid headers (status line, blank separator)
				return $len;

			// a header may repeat, so every name maps to a list of values
			$name = strtolower(trim($parts[0]));
			$response_headers[$name][] = trim($parts[1]);

			return $len;
		}
	);

	$content = curl_exec($ch);
	$err = curl_errno($ch);
	$errmsg = curl_error($ch);
	$header = curl_getinfo($ch);
	curl_close($ch);

	$header['errno'] = $err;
	$header['errmsg'] = $errmsg;
	$header['headers'] = $response_headers;

//	echo "Result:\n";
//	print_r($header);

	$header['content'] = $content;

	// Transport-level failure (DNS, timeout, TLS...): there is no meaningful HTTP status,
	// so report the cURL error instead of a misleading "Error status: 0".
	if ($content === false) {
		print_r($header);
		throw new \Exception("cURL error $err: $errmsg");
	}

	if ($header['http_code'] != 200) {
		print_r($header);
		throw new \Exception("Error status: $header[http_code]");
	}

	return (object) $header;
}
				
			||||
 | 
				
			||||
/**
 * Send a GET request, announcing the URL on standard output first.
 *
 * @param string $url target address
 * @return object response object produced by get_or_post()
 */
function get($url)
{
	echo "Sending GET to: $url\n";

	$overrides = [
		CURLOPT_CUSTOMREQUEST => "GET", // force the GET verb
		CURLOPT_POST => false,          // make sure POST mode is off
	];

	return get_or_post($url, $overrides);
}
				
			||||
 | 
				
			||||
/**
 * Send a POST request, printing the URL and the submitted fields first.
 *
 * @param string $url target address
 * @param mixed $fields payload handed to CURLOPT_POSTFIELDS unchanged
 * @return object response object produced by get_or_post()
 */
function post($url, $fields)
{
	echo "Sending POST to: $url\n";
	print_r($fields);

	$overrides = [
		CURLOPT_CUSTOMREQUEST => "POST", // force the POST verb
		CURLOPT_POST => true,            // switch cURL into POST mode
		CURLOPT_POSTFIELDS => $fields,
	];

	return get_or_post($url, $overrides);
}
				
			||||
@ -0,0 +1,186 @@ | 
				
			||||
<?php | 
				
			||||
 | 
				
			||||
const PARSER_DEBUG = 0; | 
				
			||||
const XPATH_DEBUG = 0; | 
				
			||||
 | 
				
			||||
/**
 * Trait DomQuery
 *
 * Small CSS-like selector layer on top of DOMXPath.
 *
 * The class using this trait must provide:
 *  - $this->dom          the DOMDocument that is queried
 *  - $this->contextnode  the DOMNode queries are relative to, or null to search the whole document
 *  - toXml()             serialisation used for debug output when PARSER_DEBUG is on
 */
trait DomQuery {
	/**
	 * Find the single node matching a selector.
	 *
	 * @param string $pat selector, see findAll() for the supported forms
	 * @return Node the only match
	 * @throws Exception when nothing matches, when more than one node matches, or the selector is unsupported
	 */
	public function find(string $pat) : Node
	{
		$els = $this->findAll($pat);
		$found = count($els);

		if ($found === 0) {
			if (PARSER_DEBUG) {
				echo "---- match failed; context: ---\n";
				echo $this->toXml() ."\n";
			}

			throw new \Exception("No match: $pat");
		}

		if ($found > 1) {
			if (PARSER_DEBUG) {
				echo "Query results:\n";
				foreach ($els as $el) {
					echo $el->toXml()."\n";
				}
			}
			throw new \Exception("Multiple match (".$found."x): $pat");
		}

		return $els[0];
	}

	/**
	 * Find every node matching a selector.
	 *
	 * Supported forms: `node`, `.class`, `node.class`, `#id`, `[attr]`, `node[attr]` and
	 * `[attr=value]` / `node[attr=value]` with the optional operators `^=`, `$=`, `*=` and `~=`
	 * (the value may be quoted).
	 *
	 * @param string $pat selector
	 * @return array|Node[]
	 * @throws Exception when the selector form is not supported
	 */
	public function findAll(string $pat) : array
	{
		# node
		if (preg_match('/^([a-z0-9_-]+)$/i', $pat, $matches)) {
			return $this->x("//$matches[1]");
		}

		# .class, node.class
		if (preg_match('/^(?P<elem>[a-z0-9_-]*)\.(?P<cls>[a-z0-9_-]+)$/i', $pat, $matches)) {
			$elem = $matches['elem'] ?: '*';
			// pad with spaces so that only whole class tokens match
			return $this->x("//{$elem}[contains(concat(' ',normalize-space(@class),' '),' $matches[cls] ')]");
		}

		# #id  (must test the attribute: a bare "id" would look for a child element named id)
		if (preg_match('/^#(\w+)$/', $pat, $matches)) {
			return $this->x("//*[@id='$matches[1]']");
		}

		# [attr=value], node[attr=value], also with ^= $= *= ~= (allows quotes)
		if (preg_match('/^(?P<elem>[a-z0-9_-]*)\[(?P<attr>[a-z0-9_-]+)(?P<op>[$*~^]|)=[\'"]?(?P<val>[^\'"\]]+)[\'"]?\]$/', $pat, $matches)) {
			$elem = $matches['elem'] ?: '*';
			$attr = $matches['attr'];
			$val = $matches['val'];

			switch ($matches['op']) {
				case '':
					return $this->x("//{$elem}[@{$attr}='{$val}']");
				case '^':
					return $this->x("//{$elem}[starts-with(@{$attr}, '{$val}')]");
				case '$':
					// XPath 1.0 has no ends-with(); compare the tail of the attribute instead.
					// substring() is 1-based, hence the "+ 1".
					return $this->x("//{$elem}[substring(@{$attr}, string-length(@{$attr}) - string-length('{$val}') + 1) = '{$val}']");
				case '*':
				case '~':
					// NOTE: "~=" is treated as a plain substring test here, not as a whole-word match
					return $this->x("//{$elem}[contains(@{$attr}, '{$val}')]");
			}
		}

		# [attr], node[attr]
		if (preg_match('/^(?P<elem>[a-z0-9_-]*)\[(?P<attr>[a-z0-9_-]+)\]$/', $pat, $matches)) {
			$elem = $matches['elem'] ?: '*';
			return $this->x("//{$elem}[@$matches[attr]]");
		}

		throw new \Exception("Unknown pattern: $pat");
	}

	/**
	 * Run a raw XPath expression and wrap every result in a Node.
	 *
	 * @param string $x XPath expression; a leading "//" is made relative when a context node is set
	 * @return array|Node[] matches in document order, empty when nothing matches or the expression is invalid
	 */
	public function x(string $x) : array
	{
		$xpath = new DOMXpath($this->dom);
		if (strpos($x, '//') === 0 && $this->contextnode) {
			// "//foo" would search the whole document; ".//foo" stays below the context node
			$x = '.' . $x;
		}

		if (XPATH_DEBUG) echo "\nxpath is: $x\n";

		$elements = $xpath->query($x, $this->contextnode);
		if ($elements === false || $elements === null) {
			// query() signals a malformed expression with false; treat it as "no results"
			return [];
		}

		$elems = [];
		foreach ($elements as $e) {
			$elems[] = new Node($this->dom, $e);
		}
		return $elems;
	}
}
				
			||||
 | 
				
			||||
 | 
				
			||||
/**
 * A parsed HTML document that can be searched with the DomQuery selectors.
 */
class Html
{
	use DomQuery;

	/** @var DOMDocument parsed document all queries run against */
	public $dom;

	/** @var DOMNode|null always null for a document: queries cover the whole tree */
	public $contextnode;

	/**
	 * Parse an HTML string; markup problems reported by libxml are discarded.
	 *
	 * @param string $html markup to parse; an empty string yields an empty document
	 */
	public function __construct(string $html)
	{
		$dom = new DomDocument();

		if (PARSER_DEBUG) echo "Creating HTML parser from:\n" . $html . "\n\n";

		// loadHTML() rejects an empty string outright on PHP 8, so skip parsing in that case
		if ($html !== '') {
			// collect libxml's spammy parse warnings internally instead of silencing with "@"
			$previous = libxml_use_internal_errors(true);
			try {
				$dom->loadHTML($html);
			} finally {
				libxml_clear_errors();
				libxml_use_internal_errors($previous);
			}
		}

		$this->dom = $dom;
		$this->contextnode = null;
	}

	/**
	 * Serialise the whole document.
	 *
	 * @return string XML produced by DOMDocument::saveXML()
	 */
	public function toXml() : string
	{
		return $this->dom->saveXml();
	}
}
				
			||||
 | 
				
			||||
 | 
				
			||||
/**
 * A single DOM node that can itself be searched with the DomQuery selectors.
 *
 * Attributes of the wrapped element are readable as properties, e.g. $node->href.
 */
class Node
{
	use DomQuery;

	/** @var DOMDocument document the node belongs to */
	public $dom;

	/** @var DOMNode the wrapped node */
	public $element;

	/** @var DOMNode same node, used by DomQuery as the root for relative queries */
	public $contextnode;

	/**
	 * @param DOMDocument $dom document the node belongs to
	 * @param DOMNode $element node to wrap
	 */
	public function __construct(DOMDocument $dom, DOMNode $element)
	{
		$this->dom = $dom;
		$this->element = $element;
		$this->contextnode = $element;
	}

	/**
	 * Read an attribute of the wrapped element.
	 *
	 * @param string $name attribute name
	 * @return string attribute value; empty when the attribute is missing or the node is not an element
	 */
	public function __get($name)
	{
		// text/comment nodes have no getAttribute(); answer like a missing attribute instead of crashing
		if (!($this->element instanceof DOMElement)) {
			return '';
		}
		return $this->element->getAttribute($name);
	}

	/**
	 * @return string the node's nodeValue, or an empty string when it is null
	 */
	public function text() : string
	{
		return $this->element->nodeValue ?? '';
	}

	/**
	 * Wrap every direct child (elements as well as text and other node types).
	 *
	 * @return array|Node[]
	 */
	public function childNodes() : array
	{
		$elems = [];
		foreach ($this->element->childNodes as $e) {
			$elems[] = new Node($this->dom, $e);
		}
		return $elems;
	}

	/**
	 * Return the one and only direct child.
	 *
	 * @return Node
	 * @throws Exception when the node has no children or more than one child
	 */
	public function childNode() : Node
	{
		$cn = $this->childNodes();
		if (count($cn) > 1) {
			throw new \Exception("More than one childnode.");
		}
		if (count($cn) === 0) {
			// without this guard $cn[0] is undefined and the return type check fails with a TypeError
			throw new \Exception("No childnode.");
		}
		return $cn[0];
	}

	/**
	 * Serialise this node (and its subtree).
	 *
	 * @return string XML produced by the owner document's saveXML()
	 */
	public function toXml() : string
	{
		return $this->element->ownerDocument->saveXml($this->element);
	}
}
				
			||||
@ -0,0 +1,279 @@ | 
				
			||||
<?php | 
				
			||||
 | 
				
			||||
const MAX_DIR_NAME_LEN = 40; | 
				
			||||
const SKIP_EXISTING = true; | 
				
			||||
const VESMIR_CZ = 'https://vesmir.cz'; | 
				
			||||
const VESMIR_LOGIN = ""; | 
				
			||||
const VESMIR_PASSWORD = ""; | 
				
			||||
 | 
				
			||||
require_once "http.inc"; | 
				
			||||
require_once "parse.inc"; | 
				
			||||
require_once "session.inc"; | 
				
			||||
 | 
				
			||||
/**
 * Download every article listed on one issue page.
 *
 * For each article a directory "<order> - <title>" is created below the issue directory and filled
 * with: orig.html (raw page), clanek.html (cleaned article with local images), clanek.json
 * (metadata), thumb.jpg (when the listing has a thumbnail) and any attachments linked from the
 * article's ".media" block. A failure inside one article is printed and the loop moves on.
 *
 * @param string $rocnik_dir directory of the volume (year); the issue directory is created inside it
 * @param string $rocnik volume identifier, used for logging and metadata
 * @param string $cislo issue number, used as directory name, for logging and metadata
 * @param Html $doc parsed issue page
 * @throws Exception when the page has no unique ".clanky" container
 */
function scrape_issue($rocnik_dir, $rocnik, $cislo, Html $doc) {
	$cislo_dir = $rocnik_dir . '/' . $cislo;
	if (!file_exists($cislo_dir)) {
		mkdir($cislo_dir, 0777, true);
	}

	echo "\nStahuji cislo $rocnik/$cislo\n\n";

	$n_clanky = $doc->find('.clanky');
	$clankyItems = $n_clanky->findAll('.row');

	$cl_num = 0;
	$aktualni_h4 = null;

	foreach ($clankyItems as $row) {
		// A row holding a single <h4> is a section heading, not an article.
		try {
			$hh = $row->find('h4');
			$aktualni_h4 = $hh->text();
			echo "\n~ Skupina clanku: $aktualni_h4 ~\n";
			continue;
		} catch(Exception $e) {
			/* ok.. */
		}

		if ($row->class != 'clankyItem row') {
			echo "Skip non-article\n";
			continue;
		}

		try {
			//echo $row->toXml();
			$num = ++$cl_num; // bump the article counter

			$h3 = $row->find('h3');
			$a = $h3->find('a');
			$clanek_url = VESMIR_CZ . $a->href;
			$clanek_nazev = $a->text();

			// Get slug (last path segment without ".html"); null when the URL has another shape
			$slug = preg_match('|/([^./]+)\.html$|', $clanek_url, $m) ? $m[1] : null;

			// Get dirname: strip characters that are unsafe in file names, then collapse "..".
			$fname = $num . ' - ' . $clanek_nazev;
			$fname = mb_ereg_replace("([^\w\s\d\-_~,;\[\]\(\). ])", '', $fname);
			$fname = mb_ereg_replace("([\.]{2,})", '', $fname);

			if (strlen($fname) > MAX_DIR_NAME_LEN) {
				// cut at the last space that still fits into the limit
				$cut = strrpos($fname, ' ', -(strlen($fname) - MAX_DIR_NAME_LEN));
				if ($cut !== false) {
					$fname = substr($fname, 0, $cut);
				}
			}

			// Ensure dir exists
			$clanek_dir = $cislo_dir . '/' . $fname;
			if (!file_exists($clanek_dir)) {
				mkdir($clanek_dir, 0777, true);
			}

			echo "\n- $rocnik/$cislo -> Clanek #$num: $clanek_nazev -\nUrl: $clanek_url\n";

			$perex = null;
			try {
				$perex = $row->find('.perex')->text();
			} catch (Exception $e) {
				echo "No perex. ".$e->getMessage()."\n";
			}

			$thumbfile = null;
			try {
				if (file_exists($clanek_dir . '/thumb.jpg')) {
					$thumbfile = 'thumb.jpg';
				} else {
					$thumb = $row->find('img.img-responsive');

					$f = get_file(VESMIR_CZ . $thumb->src);
					file_put_contents($clanek_dir . '/thumb.jpg', $f);
					$thumbfile = 'thumb.jpg';
				}
			} catch (Exception $e) {
				echo "No thumb. ".$e->getMessage()."\n";
			}

			$author_names = [];
			try {
				$authors = $row->find('.authors');
				$author_links = $authors->findAll('a');

				foreach ($author_links as $al) {
					$author_names[] = $al->text();
				}
			} catch (Exception $e) {
				echo "!! No .authors div\n";
			}

			$merged_authors = implode(', ', $author_names);

			if(SKIP_EXISTING && file_exists($clanek_dir . '/clanek.json')) {
				echo "ARTICLE ALREADY DL'D, SKIP\n";
				continue;
			}

			$resp = get_file($clanek_url);
			file_put_contents($clanek_dir . '/orig.html', $resp);

			$article_doc = new Html($resp);

			$attachments = [];

			// Try to download attachments (pdf version...)
			try {
				$dmedia = $article_doc->find('.media');
				foreach ($dmedia->findAll('a[href]') as $item) {
					$href = VESMIR_CZ . $item->href;
					$popis = $item->text();
					echo "> Downloading: " . $popis . "\n" . $href;

					$att = get($href);

					// File name announced by the server, if any. It is reset for every attachment
					// and reduced to a bare, unquoted name because it ends up in a local path.
					$orig_fname = null;
					if (isset($att->headers['content-disposition'])) {
						$first = $att->headers['content-disposition'][0];
						$parts = explode('filename=', $first, 2);
						if (isset($parts[1])) {
							$announced = basename(trim($parts[1], " \t\"'"));
							if ($announced !== '') {
								$orig_fname = $announced;
							}
						}
					}

					if ($popis == 'článek ve formátu pdf') {
						// the article's own PDF is stored under the article slug
						$att_fname = ($slug ?? 'clanek') . '.pdf';
					} elseif ($orig_fname !== null) {
						$att_fname = $orig_fname;
					} else {
						$att_fname = uniqid() . '.pdf'; // it's probably a pdf
					}

					file_put_contents($clanek_dir . '/' . $att_fname, $att->content);
					unset($att->content); // release the (possibly large) body

					$attachments[] = [
						'url' => $href,
						'popis' => $popis,
						'nazev' => $orig_fname,
						'soubor' => $att_fname,
					];
				}

			} catch(Exception $e) {
				echo "Error finding media links: ".$e->getMessage()."\n";
			}

			$adiv = $article_doc->find('div.article');
			$body = $adiv->toXml(); // serialize the body div
			// NOTE(review): the search literal here arrived as a bare line break, which looks like a
			// mangled carriage-return entity; strip CRs in both raw and "&#13;" form - confirm.
			$body = str_replace(["\r", '&#13;'], '', $body);

			// Download every image referenced from /images/ and point the markup at the local copy.
			$picnum = 0;
			$body = preg_replace_callback('|src="(/images/[^"]+)"|', function($m) use ($clanek_dir, &$picnum) {
				$uri = $m[1];
				$url = VESMIR_CZ . $uri;

				// use a separate match array: $m[0] must survive for the fallback below
				preg_match('|/([^/]+)$|', $uri, $name_match);
				$img_slug = $name_match[1];
				$img_fname = 'img_' . ($picnum++) . '_' . $img_slug;

				try {
					$f = get_file($url);
					file_put_contents($clanek_dir . '/' . $img_fname, $f);
					return "src=\"".htmlspecialchars($img_fname)."\"";
				} catch(\Exception $e) {
					echo "Error getting img $uri\n";
					echo $e->getMessage();
					echo $e->getTraceAsString();
					return $m[0]; // no subst.
				}
			}, $body);

			$nazev_e = htmlspecialchars($clanek_nazev);
			$merged_authors_e = htmlspecialchars($merged_authors);

			$cleaned = <<<DOC
<!DOCTYPE html>
<html lang="cs">
<head>
<meta charset="utf-8">
<title>$nazev_e</title>
<link href="../../../style.css" rel="stylesheet" type="text/css" />
</head>
<body>
<h1 class="article-name">$nazev_e</h1>
<p class="authors">$merged_authors_e</p>
<!-- article begin -->

$body

<!-- article end -->
</body>
</html>

DOC;

			file_put_contents($clanek_dir . '/clanek.html', $cleaned);

			$metadata = [
				'nazev' => $clanek_nazev,
				'slug' => $slug,
				'url' => $clanek_url,
				'autori' => $author_names,
				'rocnik' => $rocnik,
				'cislo' => $cislo,
				'poradi' => $cl_num,
				'prilohy' => $attachments,
				'thumb' => $thumbfile,
				'perex' => $perex,
			];
			// clanek.json is written last: its presence marks the article as fully downloaded
			file_put_contents($clanek_dir . '/clanek.json', json_encode($metadata, JSON_PRETTY_PRINT|JSON_UNESCAPED_UNICODE|JSON_UNESCAPED_SLASHES));

		} catch (Exception $e) {
			echo $e->getMessage() . "\n" . $e->getTraceAsString() . "\n";
		}
	}
}
				
			||||
 | 
				
			||||
/**
 * Download all issues of one year from the magazine archive.
 *
 * For every cover found on the year's archive page the enlarged cover image is saved as
 * "out/<year>/<volume>-<issue>.jpg" (unless already present) and the issue itself is handed
 * to scrape_issue(). The script is terminated when an issue link has an unexpected format.
 *
 * @param int|string $year year as used in the archive URL and as output directory name
 */
function scrape_year($year) {
	$doc = get_doc(VESMIR_CZ . "/cz/casopis/archiv-casopisu/$year/");
	$obalky = $doc->findAll('.vesmirObalka');

	$rocnik_dir = __DIR__ . '/out/' . $year;
	if (!file_exists($rocnik_dir)) {
		// recursive: on a fresh checkout the parent "out" directory does not exist yet
		mkdir($rocnik_dir, 0777, true);
	}

	foreach ($obalky as $obalka) {
		// the cover block wraps a single link ...
		$a = $obalka->childNode();
		$url_cislo = $a->href;

		echo $url_cislo.PHP_EOL;

		if (!preg_match('|/(\d+)/cislo-(\d+)/$|', $url_cislo, $m)) {
			die("weird format $url_cislo");
		}
		echo "== Rocnik $m[1], cislo $m[2] ==\n";
		$rocnik = $m[1];
		$cislo = $m[2];
		$ident = "$rocnik-$cislo";

		// ... and the link wraps a single image
		$i = $a->childNode();
		$url_thumb = $i->src;

		// ask for a ten times taller rendition than the listing thumbnail
		$url_thumb = str_replace("?h=180", "?h=1800", $url_thumb);

		echo "Casopis URL: $url_cislo\nObalka URL: $url_thumb\n\n";
		$obalka_file = $rocnik_dir . "/$ident.jpg";

		if (!file_exists($obalka_file)) {
			echo "Stahuji obalku...\n";
			$c = get_file(VESMIR_CZ . $url_thumb);
			file_put_contents($obalka_file, $c);
		}

		$c = get_doc(VESMIR_CZ . $url_cislo);

		scrape_issue($rocnik_dir, $rocnik, $cislo, $c);
	}
}
				
			||||
 | 
				
			||||
 | 
				
			||||
// Establish a session before anything is downloaded.
ensure_logged_in();
//scrape_year(2019);

// Walk the archive from the newest year down to the oldest; the session is
// re-validated before each year.
$i = 2019;
while ($i >= 1994) {
	ensure_logged_in();
	scrape_year($i);
	$i--;
}
				
			||||
@ -0,0 +1,87 @@ | 
				
			||||
<?php | 
				
			||||
 | 
				
			||||
/**
 * Solve the arithmetic anti-spam question shown on the page.
 *
 * The question is read from the label belonging to the "spamProtectionDisableResult" field
 * and must contain "<number> plus <number>" or "<number> mínus <number>".
 *
 * @param Html $doc parsed anti-spam page
 * @return int result of the addition or subtraction
 * @throws Exception when the label is missing/ambiguous or the question has another form
 */
function solveChallenge(Html $doc) : int {
	$label = $doc->find('[for=spamProtectionDisableResult]');
	$challenge = $label->text();
	echo "Challenge is: $challenge\n";

	if (!preg_match('/(\d+) (plus|mínus) (\d+)/', $challenge, $m)) {
		throw new Exception("Unexpected challenge: $challenge");
	}

	// unary plus turns the captured digit strings into numbers
	$a = +$m[1];
	$b = +$m[3];
	$sign = ($m[2] == 'plus') ? 1 : -1;

	$r = $a + $sign * $b;
	echo "Result: $r\n";
	return $r;
}
				
			||||
 | 
				
			||||
/**
 * Submit the login form using the configured credentials.
 *
 * @return object response of the login POST as produced by post()
 */
function login()
{
	echo "----- attempting to login -----\n";

	$form = [
		"username" => VESMIR_LOGIN,
		"password" => VESMIR_PASSWORD,
		"docId" => 9573,
		"doShowdocAction" => "/usrlogon.do",
		"emailLogon" => false,
		"origDocId" => 9573,
	];

	return post( "https://vesmir.cz/usrlogon.do", $form);
}
				
			||||
 | 
				
			||||
/**
 * Pass the site's arithmetic anti-spam check.
 *
 * Loads the challenge form, solves it and posts the answer together with the form's hidden
 * "__token" and "hash" fields.
 *
 * @return object response of the answer POST
 * @throws Exception when the response does not contain the success message
 */
function disableAntispam()
{
	echo "----- disabling antispam -----\n";

	$r = get("https://vesmir.cz/components/form/spamprotectiondisable.jsp?backurl=%2Fcz%2Fuzivatel.html");

	$doc = new Html($r->content);
	$solved = solveChallenge($doc);

	// hidden fields that have to be echoed back with the answer
	$token = $doc->find('[name=__token]')->value;
	$hash = $doc->find('[name=hash]')->value;

	$form = [
		"result" => $solved,
		"__token" => $token,
		"backurl" => "/cz/uzivatel.html",
		"hash" => $hash,
	];
	$result = post( "https://vesmir.cz/components/form/spamprotectiondisable.jsp", $form);

	if (!preg_match("/Zadaný výsledek je správný/", $result->content)) {
		print_r($result);

		throw new Exception("Failed to disable antispam.");
	}

	echo "Anti-spam succeeded.\n";
	return $result;
}
				
			||||
 | 
				
			||||
/**
 * Currently a no-op: the cookie export for wget is disabled.
 *
 * The disabled code below copies cookie.txt to cookie-wget.txt with every
 * "#HttpOnly_" marker removed.
 */
function dump_cookie_file_for_wget() {
	// echo "Exporting cookie for WGET\n";
	// $c = file_get_contents("cookie.txt");
	// $c = str_replace('#HttpOnly_', '', $c);
	// file_put_contents('cookie-wget.txt', $c);
}
				
			||||
 | 
				
			||||
/**
 * Make sure the shared cookie session is logged in, logging in when necessary.
 *
 * A session counts as logged in when a page contains the log-off link. When it does not,
 * the anti-spam check is passed, the login form is submitted and the outcome is verified.
 *
 * @return bool true once the session is logged in
 * @throws Exception when the login did not succeed (or any request along the way failed)
 */
function ensure_logged_in() {
	$profile_url = "https://vesmir.cz/cz/uzivatel.html";
	$logoff_marker = '/logoff.do?forward=/cz/';

	// get a session cookie
	$r = get($profile_url);
	if (strpos($r->content, $logoff_marker) !== false) {
		echo "Already logged in!\n";
		dump_cookie_file_for_wget();
		return true;
	}

	echo "Need login!\n";

	disableAntispam();
	get($profile_url);
	$result = login();

	// Judge the login by what came back AFTER it: the login response first and,
	// if that page does not carry the log-off link, a fresh copy of the profile page.
	// (Re-testing the pre-login response $r could never succeed.)
	$logged_in = strpos($result->content, $logoff_marker) !== false;
	if (!$logged_in) {
		$check = get($profile_url);
		$logged_in = strpos($check->content, $logoff_marker) !== false;
	}

	if (!$logged_in) {
		print_r($result);
		throw new Exception("--- LOGIN FAILED! ---");
	}

	echo "Logged in!\n";
	dump_cookie_file_for_wget();
	return true;
}
				
			||||
					Loading…
					
					
				
		Reference in new issue