GetDKAN/dkan

View on GitHub
modules/harvest/src/Commands/HarvestCommands.php

Summary

Maintainability
A
1 hr
Test Coverage
F
0%
<?php

namespace Drupal\harvest\Commands;

use Drupal\Core\Logger\LoggerChannelInterface;
use Drupal\harvest\HarvestUtility;
use Drupal\harvest\Load\Dataset;
use Drupal\harvest\HarvestService;
use Drush\Commands\DrushCommands;
use Drush\Exceptions\UserAbortException;
use Harvest\ETL\Extract\DataJson;
use Symfony\Component\Console\Helper\Table;
use Symfony\Component\Console\Output\ConsoleOutput;

/**
 * Drush commands for registering, running and managing harvests.
 *
 * @codeCoverageIgnore
 */
class HarvestCommands extends DrushCommands {
  use Helper;

  /**
   * Harvest service.
   *
   * @var \Drupal\harvest\HarvestService
   */
  protected HarvestService $harvestService;

  /**
   * Harvest utility service.
   *
   * @var \Drupal\harvest\HarvestUtility
   */
  protected HarvestUtility $harvestUtility;

  /**
   * Constructs the harvest Drush command handler.
   *
   * @param \Drupal\harvest\HarvestService $service
   *   Harvest service the commands delegate to.
   * @param \Drupal\Core\Logger\LoggerChannelInterface $logger
   *   Logger channel, stored in the logger property.
   * @param \Drupal\harvest\HarvestUtility $harvestUtility
   *   Harvest utility service, used by the cleanup and update commands.
   */
  public function __construct(
    HarvestService $service,
    LoggerChannelInterface $logger,
    HarvestUtility $harvestUtility
  ) {
    parent::__construct();
    // @todo Passing these via arguments doesn't seem to play well with
    //   drush.services.yml.
    $this->harvestUtility = $harvestUtility;
    $this->logger = $logger;
    $this->harvestService = $service;
  }

  /**
   * List available harvests.
   *
   * @command dkan:harvest:list
   *
   * @usage dkan:harvest:list
   *   List available harvests.
   */
  public function index() {
    // The console table wants every row to be an array of cells, so each
    // plan ID is wrapped in a one-element array.
    $rows = array_map(
      static fn ($planId) => [$planId],
      $this->harvestService->getAllHarvestIds()
    );

    if (!$rows) {
      $this->logger->notice('No harvests registered.');
      return;
    }

    $table = new Table(new ConsoleOutput());
    $table->setHeaders(['plan id']);
    $table->setRows($rows);
    $table->render();
  }

  /**
   * Register a new harvest.
   *
   * You may supply a full Harvest plan in JSON or provide configuration via
   * individual options. For a simple data.json harvest, pass only an
   * identifier and extract-uri.
   *
   * Harvest plans are validated against the schema at:
   * https://github.com/GetDKAN/harvest/blob/master/schema/schema.json
   *
   * @param string $plan_json
   *   Harvest plan configuration as JSON string. Example: '{"identifier":"example","extract":{"type":"\\Harvest\\ETL\\Extract\\DataJson","uri":"https://source/data.json"},"transforms":[],"load":{"type":"\\Drupal\\harvest\\Load\\Dataset"}}'.
   * @param array $opts
   *   Options array.
   *
   * @option identifier Identifier
   * @option extract-type Extract type
   * @option extract-uri Extract URI
   * @option transform A transform class to apply. You may pass multiple transforms.
   * @option load-type Load class
   *
   * @command dkan:harvest:register
   *
   * @usage dkan:harvest:register --identifier=myHarvestId --extract-uri=http://example.com/data.json
   */
  public function register(string $plan_json = '', array $opts = [
    'identifier' => '',
    'extract-type' => DataJson::class,
    'extract-uri' => '',
    'transform' => [],
    'load-type' => Dataset::class,
  ]) {
    try {
      if ($plan_json !== '') {
        // JSON_THROW_ON_ERROR turns malformed JSON into a \JsonException, so
        // it is reported by the catch block below instead of silently
        // becoming NULL and being handed to the harvest service.
        $plan = json_decode($plan_json, FALSE, 512, JSON_THROW_ON_ERROR);
        // Valid JSON can still be a scalar or a list; a plan has to be an
        // object.
        if (!is_object($plan)) {
          throw new \InvalidArgumentException('The harvest plan JSON must decode to an object.');
        }
      }
      else {
        // No JSON plan supplied: assemble one from the individual options.
        $plan = $this->buildPlanFromOpts($opts);
      }

      $identifier = $this->harvestService->registerHarvest($plan);
      $this->logger->notice('Successfully registered the ' . $identifier . ' harvest.');
    }
    catch (\Exception $e) {
      // Failures are logged rather than rethrown; the trace is only emitted
      // at debug level.
      $this->logger->error($e->getMessage());
      $this->logger->debug($e->getTraceAsString());
    }
  }

  /**
   * Assemble a harvest plan object from the register command's options.
   *
   * @param mixed $opts
   *   Options array from the register method. The keys read are identifier,
   *   extract-type, extract-uri, transform and load-type.
   *
   * @return object
   *   A plan object with identifier, extract, transforms and load
   *   properties.
   */
  protected function buildPlanFromOpts($opts) {
    $plan = new \stdClass();
    $plan->identifier = $opts['identifier'];

    // Empty extract settings are normalised to NULL.
    $plan->extract = new \stdClass();
    $plan->extract->type = $opts['extract-type'] ?: NULL;
    $plan->extract->uri = $opts['extract-uri'] ?: NULL;

    $plan->transforms = $opts['transform'];

    $plan->load = new \stdClass();
    $plan->load->type = $opts['load-type'];

    return $plan;
  }

  /**
   * Deregister a harvest.
   *
   * @command dkan:harvest:deregister
   */
  public function deregister($id) {
    // Built by concatenation so the logged text carries no embedded line
    // break or source indentation.
    $this->logger->warning(
      'If you deregister a harvest with published datasets, you will '
      . 'not be able to bulk revert the datasets connected to this harvest.'
    );

    // Declining the confirmation aborts the command.
    if (!$this->io()->confirm("Deregister harvest {$id}")) {
      throw new UserAbortException();
    }

    if ($this->harvestService->deregisterHarvest($id)) {
      $this->logger->notice('Successfully deregistered the ' . $id . ' harvest.');
      return;
    }

    // A failed deregistration is reported as an error, not as a notice.
    $this->logger->error('Could not deregister the ' . $id . ' harvest.');
  }

  /**
   * Run a harvest.
   *
   * @param string $plan_id
   *   The harvest id.
   *
   * @command dkan:harvest:run
   *
   * @usage dkan:harvest:run
   *   Runs a harvest.
   */
  public function run($plan_id) {
    // The renderer is given a list of run results, so the single result is
    // wrapped in an array.
    $runs = [$this->harvestService->runHarvest($plan_id)];
    $this->renderHarvestRunsInfo($runs);
  }

  /**
   * Run all harvests.
   *
   * @option new Run only harvests which haven't run before.
   *
   * @command dkan:harvest:run-all
   *
   * @usage dkan:harvest:run-all
   *   Runs all harvests.
   */
  public function runAll($options = ['new' => FALSE]) {
    $service = $this->harvestService;

    $pending = $service->getAllHarvestIds(FALSE);
    if ($options['new']) {
      // NOTE(review): presumably getAllHarvestIds(TRUE) lists the harvests
      // that already have runs, leaving only never-run plans - confirm
      // against HarvestService.
      $pending = array_diff($pending, $service->getAllHarvestIds(TRUE));
    }

    $results = [];
    foreach ($pending as $id) {
      $results[] = $service->runHarvest($id);
      // Since run IDs are also one-second-resolution timestamps, we must wait
      // one second before running the next harvest.
      // @todo Remove this sleep when we've switched to a better system for
      //   timestamps.
      sleep(1);
    }

    $this->renderHarvestRunsInfo($results);
  }

  /**
   * Give information about a previous harvest run.
   *
   * @param string $harvestId
   *   The harvest id.
   * @param string $runId
   *   The run's id.
   *
   * @command dkan:harvest:info
   */
  public function info($harvestId, $runId = NULL) {
    $this->validateHarvestPlan($harvestId);

    // Report on the requested run only, or on every run of the harvest when
    // no run ID was given.
    if ($runId) {
      $wanted = [$runId];
    }
    else {
      $wanted = $this->harvestService->getRunIdsForHarvest($harvestId);
    }

    // Run info is decoded from JSON into an associative array.
    $decoded = [];
    foreach ($wanted as $currentId) {
      $decoded[] = json_decode(
        $this->harvestService->getHarvestRunInfo($harvestId, $currentId),
        TRUE
      );
    }

    $this->renderHarvestRunsInfo($decoded);
  }

  /**
   * Revert a harvest, i.e. remove all of its harvested entities.
   *
   * @param string $harvestId
   *   The source to revert.
   *
   * @command dkan:harvest:revert
   *
   * @usage dkan:harvest:revert
   *   Removes harvested entities.
   */
  public function revert($harvestId) {
    // Throws for an unknown harvest ID before anything is reverted.
    $this->validateHarvestPlan($harvestId);

    $result = $this->harvestService->revertHarvest($harvestId);

    $output = new ConsoleOutput();
    $output->write("{$result} items reverted for the '{$harvestId}' harvest plan." . PHP_EOL);
  }

  /**
   * Archive all harvested datasets for a single harvest.
   *
   * @param string $harvestId
   *   The source to archive harvests for.
   *
   * @command dkan:harvest:archive
   *
   * @usage dkan:harvest:archive
   *   Archives harvested entities.
   */
  public function archive($harvestId) {
    // Throws for an unknown harvest ID before anything is archived.
    $this->validateHarvestPlan($harvestId);

    $result = $this->harvestService->archive($harvestId);

    // One console writer is shared by every message below.
    $output = new ConsoleOutput();
    if (empty($result)) {
      $output->write("No items available to archive for the '{$harvestId}' harvest plan." . PHP_EOL);
    }
    // One line is written per dataset ID in the service's result.
    foreach ($result as $id) {
      $output->write("Archived dataset {$id} from harvest '{$harvestId}'." . PHP_EOL);
    }
  }

  /**
   * Publish all harvested datasets for a single harvest.
   *
   * @param string $harvestId
   *   The source to publish harvests for.
   *
   * @command dkan:harvest:publish
   *
   * @usage dkan:harvest:publish
   *   Publishes harvested entities.
   */
  public function publish($harvestId) {
    // Throws for an unknown harvest ID before anything is published.
    $this->validateHarvestPlan($harvestId);

    $result = $this->harvestService->publish($harvestId);

    // One console writer is shared by every message below.
    $output = new ConsoleOutput();
    if (empty($result)) {
      $output->write("No items available to publish for the '{$harvestId}' harvest plan." . PHP_EOL);
    }
    // One line is written per dataset ID in the service's result.
    foreach ($result as $id) {
      $output->write("Published dataset {$id} from harvest '{$harvestId}'." . PHP_EOL);
    }
  }

  /**
   * Show status of a particular harvest run.
   *
   * @param string $harvestId
   *   The id of the harvest source.
   * @param string $runId
   *   The run's id. Optional. Show the status for the latest run if not
   *   provided.
   *
   * @return int
   *   Exit code.
   *
   * @command dkan:harvest:status
   *
   * @usage dkan:harvest:status test 1599157120
   *   Show the status of run 1599157120 of the harvest with id test.
   */
  public function status($harvestId, $runId = NULL) {
    $this->validateHarvestPlan($harvestId);

    // The full list of runs is needed both to pick a default run and to
    // validate an explicitly requested one.
    $allRunIds = $this->harvestService->getRunIdsForHarvest($harvestId);

    if (empty($allRunIds)) {
      $this->logger()->error('No Run IDs found for harvest id ' . $harvestId);
      return DrushCommands::EXIT_FAILURE;
    }

    if (empty($runId)) {
      // No run ID given: fall back to the last entry of the list.
      $runId = end($allRunIds);
      reset($allRunIds);
    }

    // Loose comparison on purpose: the run ID arrives from the command line
    // as a string.
    if (array_search($runId, $allRunIds) === FALSE) {
      $this->logger()->error("Run ID $runId not found for harvest id $harvestId");
      return DrushCommands::EXIT_FAILURE;
    }

    $run = $this->harvestService->getHarvestRunInfo($harvestId, $runId);

    if (empty($run)) {
      $this->logger()->error("No status found for harvest id $harvestId and run id $runId");
      return DrushCommands::EXIT_FAILURE;
    }

    // Run info is decoded from JSON into an associative array for rendering.
    $this->renderStatusTable($harvestId, $runId, json_decode($run, TRUE));
    return DrushCommands::EXIT_SUCCESS;
  }

  /**
   * Orphan datasets from every run of a harvest.
   *
   * @param string $harvestId
   *   Harvest identifier.
   *
   * @return int
   *   Exit code.
   *
   * @command dkan:harvest:orphan-datasets
   * @aliases dkan:harvest:orphan
   */
  public function orphanDatasets(string $harvestId) : int {
    // Throws for an unknown harvest ID; this happens outside the try block,
    // so that exception is not converted into an exit code here.
    $this->validateHarvestPlan($harvestId);

    try {
      $orphans = $this->harvestService->getOrphanIdsFromCompleteHarvest($harvestId);
      $this->harvestService->processOrphanIds($orphans);
      $this->logger()->notice("Orphaned ids from harvest {$harvestId}: " . implode(', ', $orphans));
      return DrushCommands::EXIT_SUCCESS;
    }
    catch (\Exception $e) {
      // Any failure while finding or processing orphans is logged and
      // reported through the exit code.
      $this->logger()->error('Error in orphaning datasets of harvest %harvest: %error', [
        '%harvest' => $harvestId,
        '%error' => $e->getMessage(),
      ]);
      return DrushCommands::EXIT_FAILURE;
    }
  }

  /**
   * Report and cleanup harvest data which may be cluttering your database.
   *
   * Will print a report. Add -y or --no-interaction to automatically perform
   * this cleanup.
   *
   * @command dkan:harvest:cleanup
   *
   * @return int
   *   Bash status code.
   *
   * @bootstrap full
   */
  public function harvestCleanup(): int {
    $log = $this->logger();
    $planIds = $this->harvestUtility->findOrphanedHarvestDataIds();

    if (!$planIds) {
      $log->notice('No leftover harvest data detected.');
      return DrushCommands::EXIT_SUCCESS;
    }

    $log->notice('Detected leftover harvest data for these plans: ' . implode(', ', $planIds));
    // FALSE is passed as the default answer for the confirmation prompt.
    if ($this->io()->confirm('Do you want to remove this data?', FALSE)) {
      $this->cleanupHarvestDataTables($planIds);
    }

    // The command reports success whether or not anything was removed.
    return DrushCommands::EXIT_SUCCESS;
  }

  /**
   * Remove leftover harvest data for each of the given plans.
   *
   * Logs a notice per plan, then hands the plan ID to the harvest utility's
   * destructOrphanTables() method.
   *
   * @param array $plan_ids
   *   An array of plan identifiers to clean up.
   */
  protected function cleanupHarvestDataTables(array $plan_ids) : void {
    $utility = $this->harvestUtility;
    $log = $this->logger();
    foreach ($plan_ids as $orphanedPlanId) {
      $log->notice('Cleaning up: ' . $orphanedPlanId);
      $utility->destructOrphanTables($orphanedPlanId);
    }
  }

  /**
   * Ensure a harvest plan with the given ID is registered.
   *
   * @param string $harvest_plan_id
   *   The Harvest ID.
   *
   * @throws \InvalidArgumentException
   *   When the ID is not among the IDs returned by the harvest service.
   */
  private function validateHarvestPlan($harvest_plan_id) {
    $knownIds = $this->harvestService->getAllHarvestIds();
    // Loose comparison, as in_array() is called without the strict flag.
    if (in_array($harvest_plan_id, $knownIds)) {
      return;
    }
    throw new \InvalidArgumentException('Harvest id ' . $harvest_plan_id . ' not found.');
  }

  /**
   * Update all harvest-related database tables to the latest version.
   *
   * This command is meant to aid in updating databases where the update hook
   * has already run, but the database still has old-style hash tables, with
   * names like harvest_PLANID_hash.
   *
   * This will move all harvest hash information to the updated schema,
   * including data which does not have a corresponding hash plan ID.
   *
   * Outdated tables will be removed.
   *
   * @command dkan:harvest:update
   *
   * @return int
   *   Bash status code.
   *
   * @bootstrap full
   */
  public function harvestUpdate(): int {
    $utility = $this->harvestUtility;
    // The hash update runs first, then the runs update.
    $utility->harvestHashUpdate();
    $utility->harvestRunsUpdate();

    // NOTE(review): success() is not part of LoggerChannelInterface, the type
    // injected by the constructor - confirm the logger in use at runtime
    // provides it.
    $this->logger()->success('Converted!');

    return DrushCommands::EXIT_SUCCESS;
  }

}