Stahovač archivu článků časopisu Vesmír (vesmir.cz). Vyžaduje aktivní předplatné a jméno/heslo, jinak budou některé články neúplné.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
vesmir-scraper/http.inc

97 lines
2.5 KiB

<?php
// User-Agent string that get_or_post() passes to cURL as CURLOPT_USERAGENT.
const UA = 'Mozilla/5.0 (Windows NT 6.1; rv:8.0) Gecko/20100101 Firefox/8.0';
/**
 * Downloads $url with get() and wraps the response body in an Html object.
 *
 * @param string $url Address to fetch.
 * @return Html Object constructed from the downloaded body.
 */
function get_doc($url) {
    $response = get($url);
    $body = $response->content;
    return new Html($body);
}
/**
 * Downloads $url with get() and hands back only the response body.
 *
 * @param string $url Address to fetch.
 * @return mixed The `content` field of the response object produced by get().
 */
function get_file($url) {
    $response = get($url);
    return $response->content;
}
/**
 * Performs one HTTP request through cURL and returns the outcome as an object.
 *
 * A fixed set of default options (user agent, cookie file/jar "cookie.txt",
 * redirect following, 120 s timeouts, ...) is applied first; every entry of
 * $mergeoptions then overrides or extends those defaults.
 *
 * The returned object carries all fields of curl_getinfo() plus:
 *   - errno:   value of curl_errno()
 *   - errmsg:  value of curl_error()
 *   - headers: map of lower-cased header name => list of trimmed values
 *   - content: the response body returned by curl_exec()
 *
 * @param string $url          Address to request.
 * @param array  $mergeoptions CURLOPT_* => value pairs layered over the defaults.
 * @return object Response description as listed above.
 * @throws \Exception When the cURL handle cannot be created, when the transfer
 *                    itself fails, or when the final HTTP status is not 200.
 */
function get_or_post($url, $mergeoptions) {
    $defaults = [
        CURLOPT_USERAGENT => UA, // set user agent
        CURLOPT_COOKIEFILE => "cookie.txt", // read cookies from this file (relative path)
        CURLOPT_COOKIEJAR => "cookie.txt", // write cookies back to the same file
        CURLOPT_COOKIESESSION => false,
        CURLOPT_RETURNTRANSFER => true, // return the body instead of printing it
        CURLOPT_HEADER => false, // don't include headers in the returned body
        CURLOPT_FOLLOWLOCATION => true, // follow redirects
        CURLOPT_ENCODING => "", // handle all encodings
        CURLOPT_AUTOREFERER => true, // set referer on redirect
        CURLOPT_CONNECTTIMEOUT => 120, // timeout on connect
        CURLOPT_TIMEOUT => 120, // timeout on response
        CURLOPT_MAXREDIRS => 10, // stop after 10 redirects
    ];
    // array_replace() keeps the integer CURLOPT_* keys intact (array_merge()
    // would renumber them) and lets the caller's values win over the defaults.
    $options = array_replace($defaults, $mergeoptions);

    $ch = curl_init($url);
    if ($ch === false) {
        throw new \Exception("Could not initialise cURL for: $url");
    }
    curl_setopt_array($ch, $options);

    // cURL invokes this callback once per received header line and expects
    // the number of bytes consumed as the return value.
    // NOTE(review): with redirects being followed, header lines of
    // intermediate responses presumably end up in the same map - confirm.
    $response_headers = [];
    curl_setopt($ch, CURLOPT_HEADERFUNCTION,
        function ($curl, $header) use (&$response_headers) {
            $len = strlen($header);
            $parts = explode(':', $header, 2);
            if (count($parts) < 2) { // status line, blank line or malformed header
                return $len;
            }
            $name = strtolower(trim($parts[0]));
            // Appending auto-creates the list on the first occurrence of a name.
            $response_headers[$name][] = trim($parts[1]);
            return $len;
        }
    );

    $content = curl_exec($ch);
    $err = curl_errno($ch);
    $errmsg = curl_error($ch);
    $header = curl_getinfo($ch);
    curl_close($ch);

    $header['errno'] = $err;
    $header['errmsg'] = $errmsg;
    $header['headers'] = $response_headers;
    $header['content'] = $content;

    // curl_exec() returns false when the transfer itself failed (DNS, connect,
    // timeout, aborted body, ...). That can happen even after a 200 status
    // line was received, so it has to be checked independently of http_code.
    if ($content === false) {
        print_r($header);
        throw new \Exception("cURL error $err: $errmsg");
    }

    if ((int) $header['http_code'] !== 200) {
        print_r($header);
        throw new \Exception("Error status: $header[http_code]");
    }

    return (object) $header;
}
/**
 * Issues an HTTP GET request for $url via get_or_post().
 *
 * Prints a one-line progress message before sending.
 *
 * @param string $url Address to request.
 * @return object Whatever get_or_post() returns for the request.
 */
function get($url)
{
    echo "Sending GET to: $url\n";

    // Force the GET verb and make sure POST mode is switched off.
    $overrides = [
        CURLOPT_CUSTOMREQUEST => "GET",
        CURLOPT_POST => false,
    ];

    return get_or_post($url, $overrides);
}
/**
 * Issues an HTTP POST request to $url via get_or_post().
 *
 * Prints a progress message followed by a dump of $fields before sending.
 *
 * @param string       $url    Address to request.
 * @param array|string $fields Value handed to cURL as CURLOPT_POSTFIELDS.
 * @return object Whatever get_or_post() returns for the request.
 */
function post($url, $fields)
{
    echo "Sending POST to: $url\n";
    // NOTE(review): this prints every submitted field verbatim, which
    // includes any credentials passed in $fields.
    print_r($fields);

    // CURLOPT_POST alone selects the POST method. CURLOPT_CUSTOMREQUEST is
    // deliberately not set: a custom verb is reused for every request on the
    // handle, so a redirect answered to this POST would be followed with a
    // body-less "POST" instead of the regular GET.
    return get_or_post($url, [
        CURLOPT_POST => true, // send as POST
        CURLOPT_POSTFIELDS => $fields,
    ]);
}