find('.clanky'); $clankyItems = $n_clanky->findAll('.row'); $cl_num = 0; $aktualni_h4 = null; foreach ($clankyItems as $row) { try { $hh = $row->find('h4'); $aktualni_h4 = $hh->text(); echo "\n~ Skupina clanku: $aktualni_h4 ~\n"; continue; } catch(Exception $e) { /* ok.. */ } if ($row->class != 'clankyItem row') { echo "Skip non-article\n"; continue; } try { //echo $row->toXml(); $num = ++$cl_num; // zvysit pocitadlo... $h3 = $row->find('h3'); $a = $h3->find('a'); $clanek_url = VESMIR_CZ . $a->href; $clanek_nazev = $a->text(); // Get slug preg_match('|/([^./]+)\.html$|', $clanek_url, $m); $slug = $m[1]; // Get dirname $fname = $num . ' - ' . $clanek_nazev; $fname = mb_ereg_replace("([^\w\s\d\-_~,;\[\]\(\). ])", '', $fname); $fname = mb_ereg_replace("([\.]{2,})", '', $fname); if (strlen($fname) > MAX_DIR_NAME_LEN) { $fname = substr($fname, 0, strrpos($fname, ' ', -(strlen($fname) - MAX_DIR_NAME_LEN))); } // Ensure dir exists $clanek_dir = $cislo_dir . '/' . $fname; if (!file_exists($clanek_dir)) { mkdir($clanek_dir); } echo "\n- $rocnik/$cislo -> Clanek #$num: $clanek_nazev -\nUrl: $clanek_url\n"; $perex = null; try { $perex = $row->find('.perex')->text(); } catch (Exception $e) { echo "No perex. ".$e->getMessage()."\n"; } $thumbfile = null; try { if (file_exists($clanek_dir . '/thumb.jpg')) { $thumbfile = 'thumb.jpg'; } else { $thumb = $row->find('img.img-responsive'); $f = get_file(VESMIR_CZ . $thumb->src); file_put_contents($clanek_dir . '/thumb.jpg', $f); $thumbfile = 'thumb.jpg'; } } catch (Exception $e) { echo "No thumb. ".$e->getMessage()."\n"; } $author_names = []; try { $authors = $row->find('.authors'); $author_links = $authors->findAll('a'); foreach ($author_links as $al) { $author_names[] = $al->text(); } } catch (Exception $e) { echo "!! No .authors div\n"; } $merged_authors = implode(', ', $author_names); if(SKIP_EXISTING && file_exists($clanek_dir . 
'/clanek.json')) { echo "ARTICLE ALREADY DL'D, SKIP\n"; continue; } $resp = get_file($clanek_url); file_put_contents($clanek_dir . '/orig.html', $resp); $article_doc = new Html($resp); $attachments = []; // Try to download attachments (pdf version...) try { $dmedia = $article_doc->find('.media'); foreach ($dmedia->findAll('a[href]') as $item) { $href = VESMIR_CZ . $item->href; echo "> Downloading: " . $item->text() . "\n" . $href; $fname = uniqid() . '.pdf'; // it's probably a pdf if ($item->text() == 'článek ve formátu pdf') { $isarticlepdf = true; $fname = $slug . '.pdf'; } $resp = get($href); if (isset($resp->headers['content-disposition'])) { $first = $resp->headers['content-disposition'][0]; list(, $orig_fname) = explode('filename=', $first); } if (!$isarticlepdf) { $fname = $orig_fname; } file_put_contents($clanek_dir . '/' . $fname, $resp->content); unset($resp->content); $attachments[] = [ 'url' => $href, 'popis' => $item->text(), 'nazev' => $orig_fname, 'soubor' => $fname, ]; } } catch(Exception $e) { echo "Error finding media links: ".$e->getMessage()."\n"; } $adiv = $article_doc->find('div.article'); $body = $adiv->toXml(); // serialize the body div $body = str_replace(' ', '', $body); $picnum = 0; $body = preg_replace_callback('|src="(/images/[^"]+)"|', function($m) use ($clanek_dir, &$picnum) { $uri = $m[1]; $url = VESMIR_CZ . $uri; preg_match('|/([^/]+)$|', $uri, $m); $img_slug = $m[1]; $img_fname = 'img_' . ($picnum++) . '_' . $img_slug; try { $f = get_file($url); file_put_contents($clanek_dir . '/' . $img_fname, $f); return "src=\"".htmlspecialchars($img_fname)."\""; } catch(\Exception $e) { echo "Error getting img $uri\n"; echo $e->getMessage(); echo $e->getTraceAsString(); return $m[0]; // no subst. } }, $body); $nazev_e = htmlspecialchars($clanek_nazev); $merged_authors_e = htmlspecialchars($merged_authors); $cleaned = << $nazev_e

$nazev_e

$merged_authors_e

$body DOC; file_put_contents($clanek_dir . '/clanek.html', $cleaned); $metadata = [ 'nazev' => $clanek_nazev, 'slug' => $slug, 'url' => $clanek_url, 'autori' => $author_names, 'rocnik' => $rocnik, 'cislo' => $cislo, 'poradi' => $cl_num, 'prilohy' => $attachments, 'thumb' => $thumbfile, 'perex' => $perex, ]; file_put_contents($clanek_dir . '/clanek.json', json_encode($metadata, 128|JSON_UNESCAPED_UNICODE|JSON_UNESCAPED_SLASHES)); } catch (Exception $e) { echo $e->getMessage() . "\n" . $e->getTraceAsString() . "\n"; } } }

/**
 * Scrape all issues listed on the archive page of a single year.
 *
 * Loads the archive page for $year, and for every `.vesmirObalka` element:
 *  - reads the issue URL from the element's child node and extracts the
 *    volume ("rocnik") and issue ("cislo") numbers from it,
 *  - downloads the cover image to out/<year>/<rocnik>-<cislo>.jpg unless that
 *    file already exists,
 *  - loads the issue page and hands it to scrape_issue().
 *
 * The script is terminated via die() when an issue URL does not end in
 * "/<digits>/cislo-<digits>/".
 *
 * @param int|string $year Year segment used in the archive URL and as the
 *                         name of the output sub-directory.
 */
function scrape_year($year) {
    $doc = get_doc(VESMIR_CZ . "/cz/casopis/archiv-casopisu/$year/");
    $obalky = $doc->findAll('.vesmirObalka');

    $rocnik_dir = __DIR__ . '/out/' . $year;
    if (!is_dir($rocnik_dir)) {
        // Recursive, so that a missing parent "out" directory is created as
        // well; a plain mkdir() would fail on a fresh checkout.
        mkdir($rocnik_dir, 0777, true);
    }

    foreach ($obalky as $obalka) {
        // NOTE(review): the first child of .vesmirObalka is presumably the
        // <a> linking to the issue - confirm against the site markup.
        $a = $obalka->childNode();
        $url_cislo = $a->href;
        echo $url_cislo.PHP_EOL;

        if (!preg_match('|/(\d+)/cislo-(\d+)/$|', $url_cislo, $m)) {
            die("weird format $url_cislo");
        }

        echo "== Rocnik $m[1], cislo $m[2] ==\n";

        $rocnik = $m[1];
        $cislo = $m[2];
        $ident = "$rocnik-$cislo";

        // NOTE(review): the link's child is presumably the cover <img>.
        $i = $a->childNode();
        // Swap the height query parameter to request a larger rendition
        // of the cover instead of the thumbnail.
        $url_thumb = str_replace("?h=180", "?h=1800", $i->src);

        echo "Casopis URL: $url_cislo\nObalka URL: $url_thumb\n\n";

        // Download the cover only once; later runs reuse the file on disk.
        $obalka_file = $rocnik_dir . "/$ident.jpg";
        if (!file_exists($obalka_file)) {
            echo "Stahuji obalku...\n";
            $c = get_file(VESMIR_CZ . $url_thumb);
            file_put_contents($obalka_file, $c);
        }

        $c = get_doc(VESMIR_CZ . $url_cislo);
        scrape_issue($rocnik_dir, $rocnik, $cislo, $c);
    }
}

ensure_logged_in();

// Single-year run, handy for debugging:
//scrape_year(2019);

// Walk the archive from the newest year down to the oldest one, re-checking
// the login before every year.
for ($i = 2019; $i >= 1994; $i--) {
    ensure_logged_in();
    scrape_year($i);
}