src/Job.php
<?php
namespace Nadar\Crawler;
use CurlHandle;
/**
* Job for a URL.
*
* The job class is the main class which combines handlers and parsers for a given URL.
*
* @author Basil Suter <git@nadar.io>
* @since 1.0.0
*/
class Job
{
    /**
     * @var Url The URL this job is going to crawl.
     */
    public $url;

    /**
     * @var Url The referrer URL, i.e. the URL which triggered this crawl job
     * (or on which the URL of this job was found).
     */
    public $referrerUrl;

    /**
     * Constructor
     *
     * @param Url $url The URL to crawl.
     * @param Url $referrerUrl The URL which led to this job.
     */
    public function __construct(Url $url, Url $referrerUrl)
    {
        $this->referrerUrl = $referrerUrl;
        $this->url = $url;
    }

    /**
     * Check whether this job should be processed any further.
     *
     * The job is considered valid as soon as one of the parsers registered in the
     * crawler accepts the job URL through its `validateUrl()` method.
     *
     * @param Crawler $crawler The crawler providing the parsers.
     * @return bool True when at least one parser accepts the URL, false otherwise.
     */
    public function validate(Crawler $crawler): bool
    {
        $accepted = false;

        foreach ($crawler->getParsers() as $parser) {
            if ($parser->validateUrl($this->url)) {
                // one accepting parser is enough, no need to ask the remaining ones
                $accepted = true;
                break;
            }
        }

        return $accepted;
    }

    /**
     * Create a cURL handle configured to fetch the job URL.
     *
     * The handle issues a GET request against the normalized URL, returns the body
     * as a string instead of printing it and does not include response headers in
     * that body. The request itself is not executed here.
     *
     * @return CurlHandle The configured handle (a cURL resource on PHP versions before 8.0).
     */
    public function generateCurl()
    {
        $handle = curl_init();

        // kept as an ordered map so the options are applied in exactly this sequence
        $options = [
            CURLOPT_HEADER => false,
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_URL => $this->url->getNormalized(),
            CURLOPT_HTTPGET => true,
            // no dedicated connect timeout; the total timeout below still caps the request
            CURLOPT_CONNECTTIMEOUT => 0,
            // timeout after 5 seconds
            CURLOPT_TIMEOUT => 5,
        ];

        foreach ($options as $option => $value) {
            curl_setopt($handle, $option, $value);
        }

        return $handle;
    }

    /**
     * Run the crawl job.
     *
     * Every parser of the crawler which accepts the given response is executed
     * for this job; parsers rejecting the response are skipped.
     *
     * @param RequestResponse $requestResponse The response fetched for the job URL.
     * @param Crawler $crawler The crawler providing parsers, handlers and the job queue.
     */
    public function run(RequestResponse $requestResponse, Crawler $crawler)
    {
        foreach ($crawler->getParsers() as $parser) {
            if (!$parser->validateRequestResponse($requestResponse)) {
                continue;
            }

            $this->processParser($parser, $requestResponse, $crawler);
        }
    }

    /**
     * Execute a single parser for this job.
     *
     * First, all links reported by the parser are turned into new jobs and pushed
     * to the crawler. This happens even when the parser flags its result as ignored.
     * Afterwards, unless ignored, a Result is assembled and passed to the
     * `afterRun()` method of every handler of the crawler.
     *
     * @param mixed $parser A parser which already accepted the response.
     * @param RequestResponse $requestResponse The response fetched for the job URL.
     * @param Crawler $crawler The crawler providing handlers and the job queue.
     */
    private function processParser($parser, RequestResponse $requestResponse, Crawler $crawler)
    {
        $parsed = $parser->run($this, $requestResponse);

        foreach ($parsed->links as $href => $linkTitle) {
            // the current url is the referrer of every link found on it, merging it
            // into the new url object ensures relative paths are resolved correctly
            $target = new Url($href);
            $target->merge($this->url);

            // only valid urls located on the host of the crawler base url are followed
            if (!$target->isValid() || !$crawler->baseUrl->sameHost($target)) {
                continue;
            }

            $crawler->push(new self($target, $this->url));
        }

        if ($parsed->ignore) {
            // for whatever reason the parser ignores this url, nothing to report
            return;
        }

        $result = new Result();
        $result->refererUrl = $this->referrerUrl;
        $result->contentType = $requestResponse->getContentType();
        $result->parser = $parser;
        $result->parserResult = $parsed;
        $result->checksum = $requestResponse->getChecksum();
        $result->url = $this->url;
        $result->language = $parsed->language;
        $result->title = $parsed->title;
        $result->content = $parsed->content;
        $result->keywords = $parsed->keywords;
        $result->description = $parsed->description;
        $result->group = $parsed->group;

        // hand the result over to the handlers
        foreach ($crawler->getHandlers() as $handler) {
            $handler->afterRun($result);
        }
    }
}