src/Client/Google.php
<?php
namespace RubtsovAV\Serps\Client;
use RubtsovAV\Serps\Core\Client\Client;
use RubtsovAV\Serps\Core\Query\Query;
use RubtsovAV\Serps\Core\Guise\Guise;
use RubtsovAV\Serps\Core\ItemPosition;
use RubtsovAV\Serps\Core\Exception\RegionException;
use RubtsovAV\Serps\Core\Exception\BadProxyException;
use RubtsovAV\Serps\Core\Exception\BannedProxyException;
use Serps\SearchEngine\Google\GoogleClient;
use Serps\SearchEngine\Google\NaturalResultType;
use Serps\SearchEngine\Google\GoogleUrl;
use Serps\SearchEngine\Google\Page\GoogleSerp;
use Serps\SearchEngine\Google\Page\GoogleCaptcha;
use Serps\SearchEngine\Google\Exception\GoogleCaptchaException;
use Serps\SearchEngine\Google\Exception\InvalidDOMException;
use Serps\HttpClient\CurlClient;
use Serps\HttpClient\CurlClient\CurlException;
use Serps\Core\Http\SearchEngineResponse;
use Serps\Core\Http\Proxy;
use Serps\Core\Cookie\Cookie;
use Serps\Exception as SerpsException;
use Serps\Exception\RequestError\InvalidResponseException;
use Zend\Diactoros\Uri;
/**
 * Google search client built on top of the Serps library.
 *
 * Builds a Google search URL from a query, pages through the result pages
 * with the Serps GoogleClient, converts natural results into ItemPosition
 * objects and, when a captcha solver is configured, tries to solve captchas.
 *
 * The properties $config, $logger, $guise, $query and $queryResult as well as
 * canSolveCaptcha() and createDump() are not defined in this class; they are
 * presumably provided by the parent Client (verify against the parent class).
 */
class Google extends Client
{
    /** @var CurlClient HTTP transport used for search, captcha image and captcha confirm requests. */
    protected $httpClient;

    /** @var GoogleClient Serps client that performs the search queries. */
    protected $googleClient;

    /** @var GoogleUrl URL of the current search; its page is advanced by setNextGooglePage(). */
    protected $googleUrl;

    /** @var int Number of the result page $googleUrl currently points to (starts at 1). */
    protected $googlePageNumber;

    /**
     * Merges the default options into the configuration and creates the HTTP clients.
     *
     * Options given in $this->config win over the defaults listed here.
     * 'captchaMaxAttempts' limits how many times one sendQuery() call may try
     * to solve a captcha; null (the default) means "no limit".
     */
    protected function init()
    {
        $this->config = array_replace_recursive(
            [
                'domainZone' => 'com',
                'countryCode' => null,
                'googleHost' => 'google.%domainZone%',
                'httpOnly' => false,
                'httpClientOptions' => [],
                'captchaMaxAttempts' => null,
                'pathDump' => null,
                'dumpSerp' => false,
                'dumpInvalidResponse' => false,
                'dumpSerpDomError' => false,
                'dumpCaptchaPage' => false,
                'dumpCaptchaImage' => false,
                'dumpCaptchaConfirmResponse' => false,
            ],
            $this->config
        );

        $this->httpClient = new CurlClient();
        $this->googleClient = new GoogleClient($this->httpClient);

        if (!empty($this->config['httpClientOptions'])) {
            foreach ($this->config['httpClientOptions'] as $option => $value) {
                $this->httpClient->getCurl()->setOption($option, $value);
            }
        }
    }

    /**
     * Prepares the client for a new query: builds the Google URL and resets paging.
     *
     * @param Query $query
     */
    public function prepareQuery(Query $query)
    {
        parent::prepareQuery($query);

        $this->googleUrl = $this->createGoogleUrl($query);
        $this->googlePageNumber = 1;
    }

    /**
     * Requests result pages one after another until the query result is complete.
     *
     * The result is also marked complete when a page yields no items.
     *
     * @return mixed the query result object held in $this->queryResult
     *
     * @throws BadProxyException when sending the query fails with a CurlException
     *                           or a Serps exception, or the SERP DOM is invalid
     * @throws RegionException   when the region coordinates are invalid
     */
    public function executeQuery()
    {
        $this->logger->debug('Client\Google->executeQuery');
        $this->setGeoLocationCookie();

        $queryResult = $this->queryResult;
        do {
            try {
                $this->logger->debug('try Client\Google->sendQuery()');
                $serp = $this->sendQuery();
            } catch (CurlException $ex) {
                $this->logger->debug(
                    'Client\Google->sendQuery() throw CurlException #' . $ex->getCurlErrCode()
                );
                throw new BadProxyException('', null, $ex);
            } catch (SerpsException $ex) {
                $this->logger->debug('Client\Google->sendQuery() throw SerpsException');
                $this->logger->debug("$ex");
                throw new BadProxyException('', null, $ex);
            }

            $items = $this->getItemsFromSerp($serp);
            foreach ($items as $item) {
                $queryResult->addItem($item);
                if ($queryResult->isComplete()) {
                    break;
                }
            }

            $this->setNextGooglePage();

            // An empty page means there is nothing more to fetch.
            if (empty($items)) {
                $this->logger->info('items list is empty');
                $queryResult->complete();
            }
        } while (!$queryResult->isComplete());

        $this->logger->info('query executed');

        return $queryResult;
    }

    /**
     * Builds the Google search URL for the given query.
     *
     * 'domainZone' and 'countryCode' of the search region override the
     * configured values. The URL always asks for 100 results per page.
     *
     * @param Query $query
     *
     * @return GoogleUrl
     */
    protected function createGoogleUrl(Query $query)
    {
        $searchTerm = $query->getSearchTerm();
        $region = $query->getSearchRegion();

        $domainZone = $this->config['domainZone'];
        $googleHost = $this->config['googleHost'];
        $countryCode = $this->config['countryCode'];

        if (isset($region['domainZone'])) {
            $domainZone = $region['domainZone'];
        }
        if (isset($region['countryCode'])) {
            $countryCode = $region['countryCode'];
        }

        $googleHost = str_replace('%domainZone%', $domainZone, $googleHost);

        $url = new GoogleUrl();
        $url->setHost($googleHost);

        if ($this->config['httpOnly']) {
            $url->setScheme('http');
            $url->setParam('gws_rd', 'ssl');
        }
        if (isset($countryCode)) {
            $url->setParam('cr', 'country' . $countryCode);
        }

        $url->setResultsPerPage(100);
        $url->setSearchTerm($searchTerm);

        return $url;
    }

    /**
     * Sends the current Google URL and returns the parsed result page.
     *
     * When Google answers with a captcha and a solver is available, the
     * captcha is solved and the query is repeated. The number of such
     * attempts is bounded by the 'captchaMaxAttempts' option when it is set.
     *
     * @return GoogleSerp
     *
     * @throws GoogleCaptchaException when a captcha is received and it cannot be
     *                                solved or the attempts limit is reached
     * @throws BannedProxyException   when the invalid response is the Google "Sorry..." page
     * @throws BadProxyException      on any other invalid response
     */
    protected function sendQuery()
    {
        $this->logger->debug('Client\Google->sendQuery');

        $proxy = $this->getProxy();
        $cookieStorage = $this->getCookieStorage();
        $httpHeaders = $this->getHttpHeaders();

        if ($proxy) {
            $proxyString = $proxy->getIP() . ':' . $proxy->getPort();
            $this->logger->info("use proxy $proxyString");
        }

        if (isset($httpHeaders['User-Agent'])) {
            $this->logger->info("set User-Agent: {$httpHeaders['User-Agent']}");
            $this->googleClient->request->setUserAgent($httpHeaders['User-Agent']);
        }
        if (isset($httpHeaders['Accept-Language'])) {
            $this->logger->info("set Accept-Language: {$httpHeaders['Accept-Language']}");
            $this->googleClient->request->setDefaultAcceptLanguage(
                $httpHeaders['Accept-Language']
            );
        }

        // null keeps the historical behaviour: retry for as long as captchas keep coming.
        $maxCaptchaAttempts = isset($this->config['captchaMaxAttempts'])
            ? (int) $this->config['captchaMaxAttempts']
            : null;
        $captchaAttempts = 0;

        while (true) {
            try {
                $this->logger->debug('try googleClient->query()');
                $this->logger->info('send query');
                $serp = $this->googleClient->query(
                    $this->googleUrl,
                    $proxy,
                    $cookieStorage
                );
                if ($this->config['dumpSerp']) {
                    $this->createDumpSerp($serp);
                }
                break;
            } catch (GoogleCaptchaException $ex) {
                $this->logger->debug('googleClient->query() throw GoogleCaptchaException');
                $this->logger->info('CAPTCHA received');
                if (!$this->canSolveCaptcha()) {
                    throw $ex;
                }
                if ($maxCaptchaAttempts !== null && $captchaAttempts >= $maxCaptchaAttempts) {
                    $this->logger->notice("CAPTCHA attempts limit ($maxCaptchaAttempts) reached");
                    throw $ex;
                }
                $captchaAttempts++;

                $this->solveCaptcha($ex->getCaptcha());
                $this->logger->info('CAPTCHA is solved');
            } catch (InvalidResponseException $ex) {
                $this->logger->debug('googleClient->query() throw InvalidResponseException');
                $response = $ex->getResponse();
                if ($this->config['dumpInvalidResponse']) {
                    $this->createDumpInvalidResponse($response);
                }
                if ($this->isBannedResponse($response)) {
                    $this->logger->info('proxy was banned in the Google');
                    throw new BannedProxyException('', null, $ex);
                }

                $statusCode = $response->getHttpResponseStatus();
                $this->logger->notice(
                    "received undefined response with the status code $statusCode"
                );
                throw new BadProxyException('', null, $ex);
            }
        }

        $this->logger->info('response received');

        return $serp;
    }

    /**
     * Tells whether the response is the Google "Sorry..." page.
     *
     * @param SearchEngineResponse $response
     *
     * @return bool
     */
    protected function isBannedResponse(SearchEngineResponse $response)
    {
        $pageContent = $response->getPageContent() . '';

        return strpos($pageContent, '<title>Sorry...</title>') !== false;
    }

    /**
     * Converts the proxy of the guise into a Serps proxy.
     *
     * @return Proxy|null null when the guise has no proxy
     */
    protected function getProxy()
    {
        $guiseProxy = $this->guise->getProxy();
        if (!$guiseProxy) {
            return null;
        }

        return new Proxy($guiseProxy->ip, $guiseProxy->port);
    }

    /**
     * Returns the cookie storage of the guise.
     */
    protected function getCookieStorage()
    {
        return $this->guise->getCookieStorage();
    }

    /**
     * Returns the HTTP headers of the guise; sendQuery() reads the
     * 'User-Agent' and 'Accept-Language' keys from it.
     */
    protected function getHttpHeaders()
    {
        return $this->guise->getHttpHeaders();
    }

    /**
     * Stores a UULE cookie with the coordinates of the search region.
     *
     * Does nothing when the region has no 'coordinates'. Latitude must be a
     * number within [-90, 90] and longitude a number within [-180, 180];
     * zero is a valid value for both.
     *
     * @throws RegionException when a coordinate is missing, not numeric or out of range
     */
    protected function setGeoLocationCookie()
    {
        $this->logger->debug('Client\Google->setGeoLocationCookie');

        $region = $this->query->getSearchRegion();
        if (!isset($region['coordinates'])) {
            $this->logger->info('coordinates is not set for the region');
            return;
        }

        $coordinates = $region['coordinates'];
        $latitude = isset($coordinates['latitude']) ? $coordinates['latitude'] : null;
        $longitude = isset($coordinates['longitude']) ? $coordinates['longitude'] : null;

        $latitude_e7 = $this->coordinateToE7($latitude, 'latitude', 90);
        $longitude_e7 = $this->coordinateToE7($longitude, 'longitude', 180);

        $googleHost = $this->googleUrl->getHost();

        $value = "role:1\n";
        $value .= "producer:12\n";
        $value .= "provenance:6\n";
        $value .= 'timestamp:'. (time() * 1000). "\n";
        $value .= "latlng{\n";
        $value .= "latitude_e7:{$latitude_e7}\n";
        $value .= "longitude_e7:{$longitude_e7}\n";
        $value .= "}\n";
        $value .= 'radius:7150000';
        $value = 'a+' . base64_encode($value);

        $cookie = new Cookie('UULE', $value, ['domain' => ".$googleHost"]);
        $cookieStorage = $this->guise->getCookieStorage();
        $cookieStorage->set($cookie);

        $this->logger->info("set geolocation coordinates in latitude $latitude and longitude $longitude");
    }

    /**
     * Validates a coordinate and converts it to an integer number of 1e-7 degrees.
     *
     * @param mixed  $value coordinate in degrees (number or numeric string)
     * @param string $name  'latitude' or 'longitude', used in the error message
     * @param int    $limit maximum allowed absolute value in degrees
     *
     * @return int
     *
     * @throws RegionException when the value is not a finite number within the limit
     */
    private function coordinateToE7($value, $name, $limit)
    {
        $number = is_numeric($value) ? (float) $value : null;

        if ($number === null || !is_finite($number) || abs($number) > $limit) {
            // Non-scalar values cannot be interpolated safely, show their type instead.
            $shown = is_scalar($value) ? $value : gettype($value);
            $this->logger->error("wrong $name $shown");
            throw new RegionException("wrong $name $shown");
        }

        // Cast to int so the cookie text never contains a float representation.
        return (int) round($number * pow(10, 7));
    }

    /**
     * Extracts the result items from a result page.
     *
     * Classical results produce one item each. For an "in the news" result
     * every news entry becomes an item and all of them share the position of
     * the enclosing result.
     *
     * @param GoogleSerp $serp
     *
     * @return ItemPosition[]
     *
     * @throws BadProxyException when the natural results cannot be parsed
     */
    protected function getItemsFromSerp(GoogleSerp $serp)
    {
        try {
            $serpItems = $serp->getNaturalResults();
        } catch (InvalidDOMException $ex) {
            if ($this->config['dumpSerpDomError']) {
                $this->createDumpSerpDomError($serp, $ex);
            }
            throw new BadProxyException('invalid dom of the serp', null, $ex);
        }

        $items = [];
        foreach ($serpItems as $result) {
            // NOTE(review): getRealPosition() is presumably zero-based, hence "+ 1" — confirm.
            if ($result->is(NaturalResultType::CLASSICAL)) {
                $items[] = new ItemPosition([
                    'position' => $result->getRealPosition() + 1,
                    'title' => $result->title,
                    'url' => $result->url,
                ]);
            } elseif ($result->is(NaturalResultType::IN_THE_NEWS)) {
                foreach ($result->news as $newsResult) {
                    $items[] = new ItemPosition([
                        'position' => $result->getRealPosition() + 1,
                        'title' => $newsResult->title,
                        'url' => $newsResult->url,
                    ]);
                }
            }
        }

        return $items;
    }

    /**
     * Moves the Google URL to the next result page.
     */
    protected function setNextGooglePage()
    {
        $this->logger->debug('Client\Google->setNextGooglePage');
        $this->googleUrl->setPage(++$this->googlePageNumber);
        $this->logger->info("page number changed to {$this->googlePageNumber}");
    }

    /**
     * Downloads the captcha image, passes it to the configured solver and
     * submits the answer.
     *
     * The solver is the callable stored in $this->config['captchaSolver']; it
     * receives the raw image data and returns the answer.
     *
     * @param GoogleCaptcha $captcha
     *
     * @throws BadProxyException when the image or the captcha form is unusable
     */
    protected function solveCaptcha(GoogleCaptcha $captcha)
    {
        $this->logger->debug('Client\Google->solveCaptcha');
        if ($this->config['dumpCaptchaPage']) {
            $this->createDumpCaptchaPage($captcha);
        }

        $imageUrl = $captcha->getImageUrl();
        $this->logger->info("download captcha image from $imageUrl");
        $imageData = $this->downloadImageData($imageUrl);
        if ($this->config['dumpCaptchaImage']) {
            $this->createDumpCaptchaImage($imageData);
        }

        $this->logger->info('solve captcha');
        $captchaSolver = $this->config['captchaSolver'];
        $captchaAnswer = $captchaSolver($imageData);

        $this->logger->info("confirm captcha by answer $captchaAnswer");
        $response = $this->confirmCaptcha($captcha, $captchaAnswer);
        if ($this->config['dumpCaptchaConfirmResponse']) {
            $this->createDumpCaptchaConfirmResponse($response);
        }
    }

    /**
     * Downloads the captcha image through the same proxy and cookie storage
     * as the search requests.
     *
     * @param string $imageUrl
     *
     * @return string raw image data
     *
     * @throws BadProxyException when the response is empty or is not an image
     */
    protected function downloadImageData($imageUrl)
    {
        $this->logger->debug('Client\Google->downloadImageData');

        // Start from a regular search request and only replace its URI.
        $request = $this->googleClient->request->buildRequest($this->googleUrl);
        $imageUri = new Uri($imageUrl);
        // fix port
        $imageUri = $imageUri->withPort($request->getUri()->getPort());
        $request = $request->withUri($imageUri);

        $proxy = $this->getProxy();
        $cookieStorage = $this->getCookieStorage();
        $response = $this->httpClient->sendRequest(
            $request,
            $proxy,
            $cookieStorage
        );

        $imageData = $response->getPageContent();
        if (!$imageData || !@getimagesizefromstring($imageData)) {
            throw new BadProxyException();
        }

        $this->logger->debug('imageData downloaded');

        return $imageData;
    }

    /**
     * Submits the captcha answer to the URL of the captcha form.
     *
     * @param GoogleCaptcha $captcha
     * @param string        $captchaAnswer
     *
     * @return SearchEngineResponse response of the confirm request
     *
     * @throws BadProxyException when the captcha form is not found on the page
     */
    protected function confirmCaptcha(GoogleCaptcha $captcha, $captchaAnswer)
    {
        $this->logger->debug('Client\Google->confirmCaptcha');

        $params = $this->getCaptchaConfirmParams($captcha, $captchaAnswer);
        $confirmUri = new Uri($params['url']);

        $request = $this->googleClient->request->buildRequest($this->googleUrl);
        // fix port
        $confirmUri = $confirmUri->withPort($request->getUri()->getPort());
        $request = $request->withUri($confirmUri);

        // NOTE(review): the form fields are attached only for GET forms; for any
        // other method the request is sent without them — confirm whether POST
        // forms need to be supported.
        if (strcasecmp($params['method'], 'get') === 0) {
            $uri = $request->getUri();
            $query = $uri->getQuery();
            if (!empty($query)) {
                $query .= '&';
            }
            $query .= http_build_query($params['fields']);
            $uri = $uri->withQuery($query);
            $request = $request->withUri($uri);
        }

        $this->logger->info('confirm request uri is ' . $request->getUri());

        $proxy = $this->getProxy();
        $cookieStorage = $this->getCookieStorage();
        $response = $this->httpClient->sendRequest(
            $request,
            $proxy,
            $cookieStorage
        );

        return $response;
    }

    /**
     * Collects the target URL, the method and the fields of the captcha form.
     *
     * The fields are taken from every named input of the page, and the
     * 'captcha' field is set to the given answer.
     *
     * @param GoogleCaptcha $captcha
     * @param string        $captchaAnswer
     *
     * @return array with the keys 'url' (string), 'method' (string) and 'fields' (array)
     *
     * @throws BadProxyException when the captcha form is not found on the page
     */
    protected function getCaptchaConfirmParams(GoogleCaptcha $captcha, $captchaAnswer)
    {
        $xpath = $captcha->getErrorPage()->getXpath();

        $form = $xpath->query('//body/div/form')->item(0);
        if ($form === null) {
            // Without the form there is nothing to submit; fail like other unusable captcha pages.
            throw new BadProxyException('captcha form is not found');
        }

        $action = $form->getAttribute('action');
        $method = $form->getAttribute('method');
        if (empty($method)) {
            $method = 'get';
        }

        $fields = [];
        $inputs = $xpath->query('//input[@name]');
        foreach ($inputs as $input) {
            $name = $input->getAttribute('name');
            if (empty($name)) {
                continue;
            }
            $value = $input->getAttribute('value');
            $fields[$name] = $value;
        }
        $fields['captcha'] = $captchaAnswer;

        $url = $captcha->getErrorPage()->getUrl();
        if (!empty($action)) {
            // A relative action is resolved against the directory of the page path.
            // Square brackets are used because "$action{0}" is a parse error since PHP 8.0.
            if ($action[0] != '/') {
                $path = $url->getPath();
                $path = substr($path, 0, strrpos($path, '/'));
                $action = $path . '/' . $action;
            }
            $url = $url->resolve($action);
        }

        return [
            'url' => $url->buildUrl(),
            'method' => $method,
            'fields' => $fields,
        ];
    }

    /**
     * Dumps the URL and the canonicalized DOM of a result page.
     *
     * @param GoogleSerp $serp
     */
    protected function createDumpSerp(GoogleSerp $serp)
    {
        $this->logger->debug('Client\Google->createDumpSerp');

        $dumpString = '';
        $dumpString .= 'URL: ' . $serp->getUrl() . "\n";
        $dumpString .= "Page content:\n". $serp->getDom()->C14N();

        $this->logger->info('create dump of the SERP');
        $this->createDump('Serp', $dumpString);
    }

    /**
     * Dumps a result page whose DOM could not be parsed, together with the error.
     *
     * @param GoogleSerp     $serp
     * @param SerpsException $error
     */
    protected function createDumpSerpDomError(GoogleSerp $serp, SerpsException $error)
    {
        $this->logger->debug('Client\Google->createDumpSerpDomError');

        $dumpString = '';
        $dumpString .= 'Error: ' . $error . "\n";
        $dumpString .= 'URL: ' . $serp->getUrl() . "\n";
        $dumpString .= "Page content:\n". $serp->getDom()->C14N();

        $this->logger->info('create dump of the SERP');
        $this->createDump('SerpDomError', $dumpString);
    }

    /**
     * Dumps a response that Serps rejected as invalid.
     *
     * @param SearchEngineResponse $response
     */
    protected function createDumpInvalidResponse(SearchEngineResponse $response)
    {
        $this->logger->debug('Client\Google->createDumpInvalidResponse');

        $dumpString = $this->formatResponseDump($response);

        $this->logger->notice('create dump of the invalid response');
        $this->createDump('InvalidResponse', $dumpString);
    }

    /**
     * Dumps the URL and the canonicalized DOM of a captcha page.
     *
     * @param GoogleCaptcha $captcha
     */
    protected function createDumpCaptchaPage(GoogleCaptcha $captcha)
    {
        $this->logger->debug('Client\Google->createDumpCaptchaPage');

        $dumpString = '';
        $dumpString .= 'Url: ' . $captcha->getErrorPage()->getUrl() . "\n";
        $dumpString .= "Page content:\n" . $captcha->getErrorPage()->getDom()->C14N();

        $this->logger->info('create dump of the captcha page');
        $this->createDump('CaptchaPage', $dumpString);
    }

    /**
     * Dumps the raw data of a captcha image.
     *
     * @param string $imageData
     */
    protected function createDumpCaptchaImage($imageData)
    {
        $this->logger->debug('Client\Google->createDumpCaptchaImage');
        $this->logger->info('create dump of the captcha image');
        $this->createDump('CaptchaImage', $imageData);
    }

    /**
     * Dumps the response received after submitting a captcha answer.
     *
     * @param SearchEngineResponse $response
     */
    protected function createDumpCaptchaConfirmResponse(SearchEngineResponse $response)
    {
        $this->logger->debug('Client\Google->createDumpCaptchaConfirmResponse');

        $dumpString = $this->formatResponseDump($response);

        $this->logger->info('create dump of the captcha confirm response');
        $this->createDump('CaptchaConfirmResponse', $dumpString);
    }

    /**
     * Renders the URLs, status, headers and body of a response as dump text.
     *
     * @param SearchEngineResponse $response
     *
     * @return string
     */
    private function formatResponseDump(SearchEngineResponse $response)
    {
        $dumpString = '';
        $dumpString .= 'Initial URL: ' . $response->getInitialUrl() . "\n";
        $dumpString .= 'Effective URL: ' . $response->getEffectiveUrl() . "\n";
        $dumpString .= 'HTTP Status: ' . $response->getHttpResponseStatus() . "\n";
        $dumpString .= 'HTTP Headers: '. var_export($response->getHeaders(), true) . "\n";
        $dumpString .= "Page content:\n". $response->getPageContent();

        return $dumpString;
    }
}