vesmir-scraper/run.php

<?php

// Upper bound, in bytes (it is compared against strlen()), for an article
// directory name; scrape_issue() cuts longer names back to a preceding space.
const MAX_DIR_NAME_LEN = 40;
// When true, scrape_issue() skips articles whose directory already holds clanek.json.
const SKIP_EXISTING = true;
// Site origin; prepended to the site-relative links found in the scraped pages.
const VESMIR_CZ = 'https://vesmir.cz';
// Account credentials, empty in this file. They are not read anywhere in this
// script - presumably consumed by session.inc (ensure_logged_in()); TODO confirm.
const VESMIR_LOGIN = "";
const VESMIR_PASSWORD = "";

require_once "http.inc";
require_once "parse.inc";
require_once "session.inc";

/**
 * Download every article listed on one issue page into "$rocnik_dir/$cislo/".
 *
 * For each row with class "clankyItem row" inside the ".clanky" element a
 * directory "<order> - <sanitized title>" is created, holding:
 *   - thumb.jpg    the listing thumbnail (when the row has one),
 *   - orig.html    the article page exactly as downloaded,
 *   - attachments  linked from the article's ".media" element,
 *   - img_N_*      images the article body references through /images/ URLs,
 *   - clanek.html  the article body wrapped in a minimal HTML page,
 *   - clanek.json  metadata; written last, so its presence marks the article
 *                  as finished (this is what SKIP_EXISTING checks).
 *
 * A failure while processing one article is printed and does not stop the
 * remaining articles of the issue.
 *
 * @param string     $rocnik_dir Output directory of the volume (year).
 * @param string|int $rocnik     Volume identifier; used for logging and metadata.
 * @param string|int $cislo      Issue number; also the issue directory name.
 * @param Html       $doc        Parsed issue page.
 * @return void
 */
function scrape_issue($rocnik_dir, $rocnik, $cislo, Html $doc) {
	$cislo_dir = $rocnik_dir . '/' . $cislo;
	if (!is_dir($cislo_dir)) {
		// Recursive, so a missing parent directory does not break the run.
		mkdir($cislo_dir, 0777, true);
	}

	echo "\nStahuji cislo $rocnik/$cislo\n\n";

	$n_clanky = $doc->find('.clanky');
	$clankyItems = $n_clanky->findAll('.row');

	$cl_num = 0;

	foreach ($clankyItems as $row) {
		// A row containing an <h4> is a section heading, not an article.
		// A missing element surfaces as an exception here, hence the empty catch.
		try {
			$hh = $row->find('h4');
			$group_title = $hh->text();
			echo "\n~ Skupina clanku: $group_title ~\n";
			continue;
		} catch(Exception $e) {
			/* no heading in this row - treat it as a possible article */
		}

		if ($row->class != 'clankyItem row') {
			echo "Skip non-article\n";
			continue;
		}

		try {
			$num = ++$cl_num; // increment the article counter

			$h3 = $row->find('h3');
			$a = $h3->find('a');
			$clanek_url = VESMIR_CZ . $a->href;
			$clanek_nazev = $a->text();

			// Slug = last path segment of the URL without ".html".
			// Stays null when the URL has a different shape.
			$slug = null;
			if (preg_match('|/([^./]+)\.html$|', $clanek_url, $slug_match)) {
				$slug = $slug_match[1];
			}

			// Directory name: "<order> - <title>" with unsafe characters removed.
			// NOTE: the naming rule must stay stable, otherwise SKIP_EXISTING
			// would no longer recognise articles downloaded by earlier runs.
			$fname = $num . ' - ' . $clanek_nazev;
			$fname = mb_ereg_replace("([^\w\s\d\-_~,;\[\]\(\). ])", '', $fname);
			$fname = mb_ereg_replace("([\.]{2,})", '', $fname);

			if (strlen($fname) > MAX_DIR_NAME_LEN) {
				// Cut at the last space at or before byte MAX_DIR_NAME_LEN.
				// Cutting at an ASCII space cannot split a multi-byte UTF-8
				// character, and the "<order> - " prefix guarantees that such
				// a space exists.
				$fname = substr($fname, 0, strrpos($fname, ' ', -(strlen($fname) - MAX_DIR_NAME_LEN)));
			}

			// Ensure dir exists
			$clanek_dir = $cislo_dir . '/' . $fname;
			if (!is_dir($clanek_dir)) {
				mkdir($clanek_dir, 0777, true);
			}

			echo "\n- $rocnik/$cislo -> Clanek #$num: $clanek_nazev -\nUrl: $clanek_url\n";

			// Teaser text from the listing row (optional).
			$perex = null;
			try {
				$perex = $row->find('.perex')->text();
			} catch (Exception $e) {
				echo "No perex. ".$e->getMessage()."\n";
			}

			// Listing thumbnail (optional); an already downloaded one is reused.
			$thumbfile = null;
			try {
				if (file_exists($clanek_dir . '/thumb.jpg')) {
					$thumbfile = 'thumb.jpg';
				} else {
					$thumb = $row->find('img.img-responsive');

					$f = get_file(VESMIR_CZ . $thumb->src);
					file_put_contents($clanek_dir . '/thumb.jpg', $f);
					$thumbfile = 'thumb.jpg';
				}
			} catch (Exception $e) {
				echo "No thumb. ".$e->getMessage()."\n";
			}

			// Author names are the link texts inside ".authors" (optional).
			$author_names = [];
			try {
				$authors = $row->find('.authors');
				$author_links = $authors->findAll('a');

				foreach ($author_links as $al) {
					$author_names[] = $al->text();
				}
			} catch (Exception $e) {
				echo "!! No .authors div\n";
			}

			$merged_authors = implode(', ', $author_names);

			if(SKIP_EXISTING && file_exists($clanek_dir . '/clanek.json')) {
				echo "ARTICLE ALREADY DL'D, SKIP\n";
				continue;
			}

			$article_html = get_file($clanek_url);
			file_put_contents($clanek_dir . '/orig.html', $article_html);

			$article_doc = new Html($article_html);

			$attachments = [];

			// Try to download attachments (pdf version...)
			try {
				$dmedia = $article_doc->find('.media');
				foreach ($dmedia->findAll('a[href]') as $item) {
					$href = VESMIR_CZ . $item->href;
					$popis = $item->text();
					echo "> Downloading: " . $popis . "\n" . $href;

					// Decided separately for every link, so one article PDF
					// does not make all following attachments reuse its name.
					$is_article_pdf = ($popis == 'článek ve formátu pdf');

					$att = get($href);

					// File name suggested by the server, if any.
					$orig_fname = null;
					if (isset($att->headers['content-disposition'])) {
						$first = $att->headers['content-disposition'][0];
						if (preg_match('/filename="?([^";]+)"?/i', $first, $disp_match)) {
							$orig_fname = trim($disp_match[1]);
						}
					}

					if ($is_article_pdf) {
						$att_fname = ($slug !== null ? $slug : 'clanek') . '.pdf';
					} else {
						// The header is remote input: keep only the last path
						// segment so it cannot escape the article directory.
						$safe_name = '';
						if ($orig_fname !== null) {
							$safe_name = basename(str_replace('\\', '/', $orig_fname));
						}
						if ($safe_name === '' || $safe_name === '.' || $safe_name === '..') {
							$att_fname = uniqid() . '.pdf'; // it's probably a pdf
						} else {
							$att_fname = $safe_name;
						}
					}

					file_put_contents($clanek_dir . '/' . $att_fname, $att->content);
					unset($att->content); // release the downloaded bytes

					$attachments[] = [
						'url' => $href,
						'popis' => $popis,
						'nazev' => $orig_fname,
						'soubor' => $att_fname,
					];
				}

			} catch(Exception $e) {
				echo "Error finding media links: ".$e->getMessage()."\n";
			}

			$adiv = $article_doc->find('div.article');
			$body = $adiv->toXml(); // serialize the body div
			$body = str_replace('&#13;', '', $body);

			// Download every /images/... picture next to the article and point
			// the src attribute at the local copy.
			$picnum = 0;
			$body = preg_replace_callback('|src="(/images/[^"]+)"|', function($m) use ($clanek_dir, &$picnum) {
				// The body is serialized markup, so the attribute value may
				// carry entities (e.g. &amp;) that are not part of the real URL.
				$uri = html_entity_decode($m[1], ENT_QUOTES, 'UTF-8');
				$url = VESMIR_CZ . $uri;

				// Uses its own match variable: $m has to stay intact for the
				// "no substitution" fallback below.
				$img_slug = 'image';
				if (preg_match('|/([^/]+)$|', $uri, $name_match)) {
					$img_slug = $name_match[1];
				}
				$img_fname = 'img_' . ($picnum++) . '_' . $img_slug;

				try {
					$f = get_file($url);
					file_put_contents($clanek_dir . '/' . $img_fname, $f);
					return "src=\"".htmlspecialchars($img_fname)."\"";
				} catch(\Exception $e) {
					echo "Error getting img $uri\n";
					echo $e->getMessage();
					echo $e->getTraceAsString();
					return $m[0]; // no subst.
				}
			}, $body);

			$nazev_e = htmlspecialchars($clanek_nazev);
			$merged_authors_e = htmlspecialchars($merged_authors);

			$cleaned = <<<DOC
<!DOCTYPE html>
<html lang="cs">
<head>
<meta charset="utf-8">
<title>$nazev_e</title>
<link href="../../../style.css" rel="stylesheet" type="text/css" />
</head>
<body>
<h1 class="article-name">$nazev_e</h1>
<p class="authors">$merged_authors_e</p>
<!-- article begin -->

$body

<!-- article end -->
</body>
</html>

DOC;

			file_put_contents($clanek_dir . '/clanek.html', $cleaned);

			$metadata = [
				'nazev' => $clanek_nazev,
				'slug' => $slug,
				'url' => $clanek_url,
				'autori' => $author_names,
				'rocnik' => $rocnik,
				'cislo' => $cislo,
				'poradi' => $cl_num,
				'prilohy' => $attachments,
				'thumb' => $thumbfile,
				'perex' => $perex,
			];

			// clanek.json doubles as the "article finished" marker, so it must
			// never be written empty when encoding fails.
			$json = json_encode($metadata, JSON_PRETTY_PRINT|JSON_UNESCAPED_UNICODE|JSON_UNESCAPED_SLASHES);
			if ($json === false) {
				throw new Exception('Cannot encode article metadata: ' . json_last_error_msg());
			}
			file_put_contents($clanek_dir . '/clanek.json', $json);

		} catch (Exception $e) {
			echo $e->getMessage() . "\n" . $e->getTraceAsString() . "\n";
		}
	}
}

/**
 * Scrape all issues listed on the archive page of one year.
 *
 * For every ".vesmirObalka" element on the archive page the cover image is
 * saved as "out/<year>/<rocnik>-<cislo>.jpg" (skipped when the file already
 * exists) and the issue page is handed to scrape_issue().
 *
 * The script terminates (die) when an issue link does not end in
 * "/<digits>/cislo-<digits>/".
 *
 * @param int|string $year Year whose archive page is fetched; also the name
 *                         of the output directory under "out/".
 * @return void
 */
function scrape_year($year) {
	$doc = get_doc(VESMIR_CZ . "/cz/casopis/archiv-casopisu/$year/");
	$obalky = $doc->findAll('.vesmirObalka');

	$rocnik_dir = __DIR__ . '/out/' . $year;
	if (!is_dir($rocnik_dir)) {
		// Recursive: on a fresh checkout the "out" directory itself is missing.
		mkdir($rocnik_dir, 0777, true);
	}

	foreach ($obalky as $obalka) {
		// The first child of the cover element is the link to the issue.
		$a = $obalka->childNode();
		$url_cislo = $a->href;

		echo $url_cislo.PHP_EOL;

		if (!preg_match('|/(\d+)/cislo-(\d+)/$|', $url_cislo, $m)) {
			die("weird format $url_cislo");
		}
		echo "== Rocnik $m[1], cislo $m[2] ==\n";
		$rocnik = $m[1];
		$cislo = $m[2];
		$ident = "$rocnik-$cislo";

		// The link wraps the cover image.
		$img = $a->childNode();
		$url_thumb = $img->src;

		// Ask for h=1800 instead of h=180 - presumably a larger rendition of
		// the cover; TODO confirm what the "h" parameter controls.
		$url_thumb = str_replace("?h=180", "?h=1800", $url_thumb);

		echo "Casopis URL: $url_cislo\nObalka URL: $url_thumb\n\n";
		$obalka_file = $rocnik_dir . "/$ident.jpg";

		if (!file_exists($obalka_file)) {
			echo "Stahuji obalku...\n";
			$cover = get_file(VESMIR_CZ . $url_thumb);
			file_put_contents($obalka_file, $cover);
		}

		$issue_doc = get_doc(VESMIR_CZ . $url_cislo);

		scrape_issue($rocnik_dir, $rocnik, $cislo, $issue_doc);
	}
}


// Entry point: log in, then walk the archive from the newest year (2019)
// back to the oldest one (1994). To fetch a single year only, call
// scrape_year() with that year instead of running the loop.
ensure_logged_in();

foreach (range(2019, 1994) as $year) {
	// Re-check the login before each year.
	ensure_logged_in();
	scrape_year($year);
}
Initial commit 6 years ago			`<?php`

			`const MAX_DIR_NAME_LEN = 40;`
			`const SKIP_EXISTING = true;`
			`const VESMIR_CZ = 'https://vesmir.cz';`
			`const VESMIR_LOGIN = "";`
			`const VESMIR_PASSWORD = "";`

			`require_once "http.inc";`
			`require_once "parse.inc";`
			`require_once "session.inc";`

			`function scrape_issue($rocnik_dir, $rocnik, $cislo, Html $doc) {`
			`$cislo_dir = $rocnik_dir . '/' . $cislo;`
			`if (!file_exists($cislo_dir)) {`
			`mkdir($cislo_dir);`
			`}`

			`echo "\nStahuji cislo $rocnik/$cislo\n\n";`

			`$n_clanky = $doc->find('.clanky');`
			`$clankyItems = $n_clanky->findAll('.row');`

			`$cl_num = 0;`
			`$aktualni_h4 = null;`

			`foreach ($clankyItems as $row) {`
			`try {`
			`$hh = $row->find('h4');`
			`$aktualni_h4 = $hh->text();`
			`echo "\n~ Skupina clanku: $aktualni_h4 ~\n";`
			`continue;`
			`} catch(Exception $e) {`
			`/* ok.. */`
			`}`

			`if ($row->class != 'clankyItem row') {`
			`echo "Skip non-article\n";`
			`continue;`
			`}`

			`try {`
			`//echo $row->toXml();`
			`$num = ++$cl_num; // zvysit pocitadlo...`

			`$h3 = $row->find('h3');`
			`$a = $h3->find('a');`
			`$clanek_url = VESMIR_CZ . $a->href;`
			`$clanek_nazev = $a->text();`

			`// Get slug`
			`preg_match('\|/([^./]+)\.html$\|', $clanek_url, $m);`
			`$slug = $m[1];`

			`// Get dirname`
			`$fname = $num . ' - ' . $clanek_nazev;`
			`$fname = mb_ereg_replace("([^\w\s\d\-_~,;\[\]\(\). ])", '', $fname);`
			`$fname = mb_ereg_replace("([\.]{2,})", '', $fname);`

			`if (strlen($fname) > MAX_DIR_NAME_LEN) {`
			`$fname = substr($fname, 0, strrpos($fname, ' ', -(strlen($fname) - MAX_DIR_NAME_LEN)));`
			`}`

			`// Ensure dir exists`
			`$clanek_dir = $cislo_dir . '/' . $fname;`
			`if (!file_exists($clanek_dir)) {`
			`mkdir($clanek_dir);`
			`}`

			`echo "\n- $rocnik/$cislo -> Clanek #$num: $clanek_nazev -\nUrl: $clanek_url\n";`

			`$perex = null;`
			`try {`
			`$perex = $row->find('.perex')->text();`
			`} catch (Exception $e) {`
			`echo "No perex. ".$e->getMessage()."\n";`
			`}`

			`$thumbfile = null;`
			`try {`
			`if (file_exists($clanek_dir . '/thumb.jpg')) {`
			`$thumbfile = 'thumb.jpg';`
			`} else {`
			`$thumb = $row->find('img.img-responsive');`

			`$f = get_file(VESMIR_CZ . $thumb->src);`
			`file_put_contents($clanek_dir . '/thumb.jpg', $f);`
			`$thumbfile = 'thumb.jpg';`
			`}`
			`} catch (Exception $e) {`
			`echo "No thumb. ".$e->getMessage()."\n";`
			`}`

			`$author_names = [];`
			`try {`
			`$authors = $row->find('.authors');`
			`$author_links = $authors->findAll('a');`

			`foreach ($author_links as $al) {`
			`$author_names[] = $al->text();`
			`}`
			`} catch (Exception $e) {`
			`echo "!! No .authors div\n";`
			`}`

			`$merged_authors = implode(', ', $author_names);`

			`if(SKIP_EXISTING && file_exists($clanek_dir . '/clanek.json')) {`
			`echo "ARTICLE ALREADY DL'D, SKIP\n";`
			`continue;`
			`}`

			`$resp = get_file($clanek_url);`
			`file_put_contents($clanek_dir . '/orig.html', $resp);`

			`$article_doc = new Html($resp);`

			`$attachments = [];`

			`// Try to download attachments (pdf version...)`
			`try {`
			`$dmedia = $article_doc->find('.media');`
			`foreach ($dmedia->findAll('a[href]') as $item) {`
			`$href = VESMIR_CZ . $item->href;`
			`echo "> Downloading: " . $item->text() . "\n" . $href;`

			`$fname = uniqid() . '.pdf'; // it's probably a pdf`
			`if ($item->text() == 'článek ve formátu pdf') {`
			`$isarticlepdf = true;`
			`$fname = $slug . '.pdf';`
			`}`

			`$resp = get($href);`

			`if (isset($resp->headers['content-disposition'])) {`
			`$first = $resp->headers['content-disposition'][0];`
			`list(, $orig_fname) = explode('filename=', $first);`
			`}`
			`if (!$isarticlepdf) {`
			`$fname = $orig_fname;`
			`}`

			`file_put_contents($clanek_dir . '/' . $fname, $resp->content);`
			`unset($resp->content);`

			`$attachments[] = [`
			`'url' => $href,`
			`'popis' => $item->text(),`
			`'nazev' => $orig_fname,`
			`'soubor' => $fname,`
			`];`
			`}`

			`} catch(Exception $e) {`
			`echo "Error finding media links: ".$e->getMessage()."\n";`
			`}`

			`$adiv = $article_doc->find('div.article');`
			`$body = $adiv->toXml(); // serialize the body div`
			`$body = str_replace(' ', '', $body);`

			`$picnum = 0;`
			`$body = preg_replace_callback('\|src="(/images/[^"]+)"\|', function($m) use ($clanek_dir, &$picnum) {`
			`$uri = $m[1];`
			`$url = VESMIR_CZ . $uri;`

			`preg_match('\|/([^/]+)$\|', $uri, $m);`
			`$img_slug = $m[1];`
			`$img_fname = 'img_' . ($picnum++) . '_' . $img_slug;`

			`try {`
			`$f = get_file($url);`
			`file_put_contents($clanek_dir . '/' . $img_fname, $f);`
			`return "src=\"".htmlspecialchars($img_fname)."\"";`
			`} catch(\Exception $e) {`
			`echo "Error getting img $uri\n";`
			`echo $e->getMessage();`
			`echo $e->getTraceAsString();`
			`return $m[0]; // no subst.`
			`}`
			`}, $body);`

			`$nazev_e = htmlspecialchars($clanek_nazev);`
			`$merged_authors_e = htmlspecialchars($merged_authors);`

			`$cleaned = <<<DOC`
			`<!DOCTYPE html>`
			`<html lang="cs">`
			`<head>`
			`<meta charset="utf-8">`
			`<title>$nazev_e</title>`
			`<link href="../../../style.css" rel="stylesheet" type="text/css" />`
			`</head>`
			`<body>`
			`<h1 class="article-name">$nazev_e</h1>`
			`<p class="authors">$merged_authors_e</p>`
			`<!-- article begin -->`

			`$body`

			`<!-- article end -->`
			`</body>`
			`</html>`

			`DOC;`

			`file_put_contents($clanek_dir . '/clanek.html', $cleaned);`

			`$metadata = [`
			`'nazev' => $clanek_nazev,`
			`'slug' => $slug,`
			`'url' => $clanek_url,`
			`'autori' => $author_names,`
			`'rocnik' => $rocnik,`
			`'cislo' => $cislo,`
			`'poradi' => $cl_num,`
			`'prilohy' => $attachments,`
			`'thumb' => $thumbfile,`
			`'perex' => $perex,`
			`];`
			`file_put_contents($clanek_dir . '/clanek.json', json_encode($metadata, 128\|JSON_UNESCAPED_UNICODE\|JSON_UNESCAPED_SLASHES));`

			`} catch (Exception $e) {`
			`echo $e->getMessage() . "\n" . $e->getTraceAsString() . "\n";`
			`}`
			`}`
			`}`

			`function scrape_year($year) {`
			`$doc = get_doc(VESMIR_CZ . "/cz/casopis/archiv-casopisu/$year/");`
			`$obalky = $doc->findAll('.vesmirObalka');`

			`$rocnik_dir = __DIR__ . '/out/' . $year;`
			`if (!file_exists($rocnik_dir)) {`
			`mkdir($rocnik_dir);`
			`}`

			`foreach ($obalky as $obalka) {`
			`$a = $obalka->childNode();`
			`$url_cislo = $a->href;`

			`echo $url_cislo.PHP_EOL;`

			`if (!preg_match('\|/(\d+)/cislo-(\d+)/$\|', $url_cislo, $m)) {`
			`die("weird format $url_cislo");`
			`}`
			`echo "== Rocnik $m[1], cislo $m[2] ==\n";`
			`$rocnik = $m[1];`
			`$cislo = $m[2];`
			`$ident = "$rocnik-$cislo";`

			`$i = $a->childNode();`
			`$url_thumb = $i->src;`

			`$url_thumb = str_replace("?h=180", "?h=1800", $url_thumb);`

			`echo "Casopis URL: $url_cislo\nObalka URL: $url_thumb\n\n";`
			`$obalka_file = $rocnik_dir . "/$ident.jpg";`

			`if (!file_exists($obalka_file)) {`
			`echo "Stahuji obalku...\n";`
			`$c = get_file(VESMIR_CZ . $url_thumb);`
			`file_put_contents($obalka_file, $c);`
			`}`

			`$c = get_doc(VESMIR_CZ . $url_cislo);`

			`scrape_issue($rocnik_dir, $rocnik, $cislo, $c);`
			`}`
			`}`


			`ensure_logged_in();`
			`//scrape_year(2019);`

			`for ($i = 2019; $i >= 1994; $i--) {`
			`ensure_logged_in();`
			`scrape_year($i);`
			`}`