You cannot select more than 25 topics.
			Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
		
		
		
		
		
			
		
			
				
					
					
						
							279 lines
						
					
					
						
							6.7 KiB
						
					
					
				
			
		
		
	
	
							279 lines
						
					
					
						
							6.7 KiB
						
					
					
				<?php
 | 
						|
 | 
						|
/** Upper bound (in bytes) for an article directory name; longer names are cut at a space. */
const MAX_DIR_NAME_LEN = 40;

/** When true, an article whose clanek.json already exists is not downloaded again. */
const SKIP_EXISTING = true;

/** Site root; relative hrefs/srcs found in the pages are appended to it. */
const VESMIR_CZ = 'https://vesmir.cz';

/** Account name — not read in this file; presumably consumed by session.inc (verify). */
const VESMIR_LOGIN = '';

/** Account password — not read in this file; presumably consumed by session.inc (verify). */
const VESMIR_PASSWORD = '';
 | 
						|
 | 
						|
require_once "http.inc";
 | 
						|
require_once "parse.inc";
 | 
						|
require_once "session.inc";
 | 
						|
 | 
						|
/**
 * Download all articles of one magazine issue into "$rocnik_dir/$cislo".
 *
 * Iterates the `.row` elements found inside `.clanky` on the issue page.
 * A row containing an <h4> is treated as a group heading and only logged;
 * a row whose class is exactly "clankyItem row" is treated as an article.
 * For every article a numbered directory is created and filled with:
 *   - thumb.jpg   (when a thumbnail image is found),
 *   - orig.html   (the article page as downloaded),
 *   - attachments linked from the `.media` element,
 *   - img_N_*     (images referenced from the article body),
 *   - clanek.html (cleaned stand-alone page),
 *   - clanek.json (metadata).
 *
 * A failure inside one article is printed and the loop moves on to the next
 * row. Errors raised while locating `.clanky` itself are not caught here.
 *
 * @param string $rocnik_dir Directory of the volume; the issue directory is created below it.
 * @param mixed  $rocnik     Volume label, used for log output and metadata only.
 * @param mixed  $cislo      Issue label, used as directory name, in logs and metadata.
 * @param Html   $doc        Parsed issue page.
 * @return void
 */
function scrape_issue($rocnik_dir, $rocnik, $cislo, Html $doc) {
	$cislo_dir = $rocnik_dir . '/' . $cislo;
	if (!file_exists($cislo_dir)) {
		mkdir($cislo_dir, 0777, true);
	}

	echo "\nStahuji cislo $rocnik/$cislo\n\n";

	$n_clanky = $doc->find('.clanky');
	$clankyItems = $n_clanky->findAll('.row');

	$cl_num = 0;
	$aktualni_h4 = null;

	foreach ($clankyItems as $row) {
		// Group heading rows: find() is relied upon to throw when there is no <h4>.
		try {
			$hh = $row->find('h4');
			$aktualni_h4 = $hh->text();
			echo "\n~ Skupina clanku: $aktualni_h4 ~\n";
			continue;
		} catch(Exception $e) {
			/* not a heading row, carry on */
		}

		if ($row->class != 'clankyItem row') {
			echo "Skip non-article\n";
			continue;
		}

		try {
			$num = ++$cl_num; // running article number within the issue

			$h3 = $row->find('h3');
			$a = $h3->find('a');
			$clanek_url = VESMIR_CZ . $a->href;
			$clanek_nazev = $a->text();

			// Slug = last path segment without ".html"; null when the URL has another shape.
			$slug = preg_match('|/([^./]+)\.html$|', $clanek_url, $slug_m) === 1 ? $slug_m[1] : null;

			// Directory name: "<num> - <title>" stripped of characters unsafe in file names.
			$fname = $num . ' - ' . $clanek_nazev;
			$fname = mb_ereg_replace("([^\w\s\d\-_~,;\[\]\(\). ])", '', $fname);
			$fname = mb_ereg_replace("([\.]{2,})", '', $fname);

			if (strlen($fname) > MAX_DIR_NAME_LEN) {
				// Cut at the last space at or before the byte limit. A space is a
				// single ASCII byte, so this never splits a multi-byte character.
				$cut = strrpos($fname, ' ', -(strlen($fname) - MAX_DIR_NAME_LEN));
				if ($cut !== false) {
					$fname = substr($fname, 0, $cut);
				}
			}

			// Ensure dir exists
			$clanek_dir = $cislo_dir . '/' . $fname;
			if (!file_exists($clanek_dir)) {
				mkdir($clanek_dir, 0777, true);
			}

			echo "\n- $rocnik/$cislo -> Clanek #$num: $clanek_nazev -\nUrl: $clanek_url\n";

			// Perex (lead paragraph) is optional.
			$perex = null;
			try {
				$perex = $row->find('.perex')->text();
			} catch (Exception $e) {
				echo "No perex. ".$e->getMessage()."\n";
			}

			// Thumbnail is optional; an already downloaded one is reused.
			$thumbfile = null;
			try {
				if (file_exists($clanek_dir . '/thumb.jpg')) {
					$thumbfile = 'thumb.jpg';
				} else {
					$thumb = $row->find('img.img-responsive');

					$f = get_file(VESMIR_CZ . $thumb->src);
					file_put_contents($clanek_dir . '/thumb.jpg', $f);
					$thumbfile = 'thumb.jpg';
				}
			} catch (Exception $e) {
				echo "No thumb. ".$e->getMessage()."\n";
			}

			$author_names = [];
			try {
				$authors = $row->find('.authors');
				$author_links = $authors->findAll('a');

				foreach ($author_links as $al) {
					$author_names[] = $al->text();
				}
			} catch (Exception $e) {
				echo "!! No .authors div\n";
			}

			$merged_authors = implode(', ', $author_names);

			// clanek.json is written last, so its presence marks a finished article.
			if(SKIP_EXISTING && file_exists($clanek_dir . '/clanek.json')) {
				echo "ARTICLE ALREADY DL'D, SKIP\n";
				continue;
			}

			$resp = get_file($clanek_url);
			file_put_contents($clanek_dir . '/orig.html', $resp);

			$article_doc = new Html($resp);

			$attachments = [];

			// Try to download attachments (pdf version...)
			try {
				$dmedia = $article_doc->find('.media');
				foreach ($dmedia->findAll('a[href]') as $item) {
					$href = VESMIR_CZ . $item->href;
					$popis = $item->text();
					echo "> Downloading: " . $popis . "\n" . $href;

					// Both values are recomputed for every link. Previously they leaked
					// from one iteration into the next (and were undefined on the first),
					// so attachments after the article PDF got wrong file names.
					$isarticlepdf = ($popis == 'článek ve formátu pdf');
					$orig_fname = null;

					$att_resp = get($href);

					if (isset($att_resp->headers['content-disposition'])) {
						$first = $att_resp->headers['content-disposition'][0];
						$parts = explode('filename=', $first);
						if (isset($parts[1])) {
							$orig_fname = $parts[1];
						}
					}

					// Name on disk: slug for the article PDF, otherwise the server-provided
					// name; a unique ".pdf" name is the fallback (it's probably a pdf).
					$att_fname = uniqid() . '.pdf';
					if ($isarticlepdf) {
						if ($slug !== null) {
							$att_fname = $slug . '.pdf';
						}
					} elseif (is_string($orig_fname) && $orig_fname !== '') {
						// The name comes from a remote header: keep it inside $clanek_dir.
						$att_fname = str_replace(['/', '\\'], '_', $orig_fname);
					}

					file_put_contents($clanek_dir . '/' . $att_fname, $att_resp->content);
					unset($att_resp->content); // drop the payload as soon as it is on disk

					$attachments[] = [
						'url' => $href,
						'popis' => $popis,
						'nazev' => $orig_fname,
						'soubor' => $att_fname,
					];
				}

			} catch(Exception $e) {
				echo "Error finding media links: ".$e->getMessage()."\n";
			}

			$adiv = $article_doc->find('div.article');
			$body = $adiv->toXml(); // serialize the body div
			// NOTE(review): in the reviewed copy the search string appeared as a raw
			// line-break character inside the quotes (most likely a carriage return or
			// its "&#13;" entity). Both forms are stripped here — confirm against the
			// original file.
			$body = str_replace(["\r", '&#13;'], '', $body);

			// Download every site-local image referenced by the body and point the
			// src attribute at the local copy.
			$picnum = 0;
			$body = preg_replace_callback('|src="(/images/[^"]+)"|', function($m) use ($clanek_dir, &$picnum) {
				$uri = $m[1];
				$url = VESMIR_CZ . $uri;

				// Separate match array: $m must stay intact for the fallback below.
				$img_slug = preg_match('|/([^/]+)$|', $uri, $name_m) === 1 ? $name_m[1] : 'image';
				$img_fname = 'img_' . ($picnum++) . '_' . $img_slug;

				try {
					$f = get_file($url);
					file_put_contents($clanek_dir . '/' . $img_fname, $f);
					return "src=\"".htmlspecialchars($img_fname)."\"";
				} catch(\Exception $e) {
					echo "Error getting img $uri\n";
					echo $e->getMessage();
					echo $e->getTraceAsString();
					return $m[0]; // no subst. — keep the original src="…" text
				}
			}, $body);

			$nazev_e = htmlspecialchars($clanek_nazev);
			$merged_authors_e = htmlspecialchars($merged_authors);

			$cleaned = <<<DOC
<!DOCTYPE html>
<html lang="cs">
<head>
<meta charset="utf-8">
<title>$nazev_e</title>
<link href="../../../style.css" rel="stylesheet" type="text/css" />
</head>
<body>
<h1 class="article-name">$nazev_e</h1>
<p class="authors">$merged_authors_e</p>
<!-- article begin -->

$body

<!-- article end -->
</body>
</html>

DOC;

			file_put_contents($clanek_dir . '/clanek.html', $cleaned);

			$metadata = [
				'nazev' => $clanek_nazev,
				'slug' => $slug,
				'url' => $clanek_url,
				'autori' => $author_names,
				'rocnik' => $rocnik,
				'cislo' => $cislo,
				'poradi' => $cl_num,
				'prilohy' => $attachments,
				'thumb' => $thumbfile,
				'perex' => $perex,
			];
			file_put_contents($clanek_dir . '/clanek.json', json_encode($metadata, JSON_PRETTY_PRINT|JSON_UNESCAPED_UNICODE|JSON_UNESCAPED_SLASHES));

		} catch (Exception $e) {
			// One broken article must not abort the rest of the issue.
			echo $e->getMessage() . "\n" . $e->getTraceAsString() . "\n";
		}
	}
}
 | 
						|
 | 
						|
/**
 * Download every issue listed on the archive page of one year.
 *
 * Fetches "/cz/casopis/archiv-casopisu/$year/", and for each `.vesmirObalka`
 * element reads the issue link and cover image from its child nodes, stores
 * the cover as "out/$year/<rocnik>-<cislo>.jpg" (skipped when the file already
 * exists) and passes the issue page to scrape_issue().
 *
 * The script is terminated with die() when an issue URL does not end in
 * "/<digits>/cislo-<digits>/".
 *
 * @param mixed $year Year as used in the archive URL and in the output directory name.
 * @return void
 */
function scrape_year($year) {
	$doc = get_doc(VESMIR_CZ . "/cz/casopis/archiv-casopisu/$year/");
	$obalky = $doc->findAll('.vesmirObalka');

	$rocnik_dir = __DIR__ . '/out/' . $year;
	if (!file_exists($rocnik_dir)) {
		// Recursive: on a fresh checkout the parent "out" directory does not exist either.
		mkdir($rocnik_dir, 0777, true);
	}

	foreach ($obalky as $obalka) {
		$a = $obalka->childNode();
		$url_cislo = $a->href;

		echo $url_cislo.PHP_EOL;

		// Volume and issue numbers are taken from the tail of the issue URL.
		if (!preg_match('|/(\d+)/cislo-(\d+)/$|', $url_cislo, $m)) {
			die("weird format $url_cislo");
		}
		echo "== Rocnik $m[1], cislo $m[2] ==\n";
		$rocnik = $m[1];
		$cislo = $m[2];
		$ident = "$rocnik-$cislo";

		$i = $a->childNode();
		$url_thumb = $i->src;

		// Rewrite the height query parameter to request a larger cover image.
		$url_thumb = str_replace("?h=180", "?h=1800", $url_thumb);

		echo "Casopis URL: $url_cislo\nObalka URL: $url_thumb\n\n";
		$obalka_file = $rocnik_dir . "/$ident.jpg";

		if (!file_exists($obalka_file)) {
			echo "Stahuji obalku...\n";
			$c = get_file(VESMIR_CZ . $url_thumb);
			file_put_contents($obalka_file, $c);
		}

		$c = get_doc(VESMIR_CZ . $url_cislo);

		scrape_issue($rocnik_dir, $rocnik, $cislo, $c);
	}
}
 | 
						|
 | 
						|
 | 
						|
// Entry point: log in once up front, then walk the archive from 2019 back
// to 1994, re-checking the login before each year.
ensure_logged_in();
// Single-year run, kept for debugging: scrape_year(2019);

$i = 2019;
while ($i >= 1994) {
	ensure_logged_in();
	scrape_year($i);
	$i--;
}
 | 
						|
 |