lib/tasks/elibrary/citations_cop_importer.rb from unepwcmc/SAPI

lib/tasks/elibrary/citations_cop_importer.rb
Summary

Maintainability

2 hrs
Test Coverage

Issues
require Rails.root.join('lib/tasks/elibrary/importable.rb')
require Rails.root.join('lib/tasks/elibrary/citations_importer.rb')

class Elibrary::CitationsCopImporter < Elibrary::CitationsImporter

  def columns_with_type
    super() + [
      ['ProposalNo', 'TEXT'],
      ['ProposalNature', 'TEXT'],
      ['ProposalOutcome', 'TEXT'],
      ['ProposalAdditionalComments', 'TEXT'],
      ['ProposalHardCopy', 'TEXT'],
      ['ProposalRepresentation', 'TEXT'],
      ['ProposalOtherTaxonName', 'TEXT']
    ]
  end

  def run_preparatory_queries
    super()
    ApplicationRecord.connection.execute("UPDATE #{table_name} SET ProposalNature = NULL WHERE ProposalNature='NULL'")
    ApplicationRecord.connection.execute("UPDATE #{table_name} SET ProposalOutcome = NULL WHERE ProposalOutcome='NULL'")
    ApplicationRecord.connection.execute("UPDATE #{table_name} SET ProposalRepresentation = NULL WHERE ProposalRepresentation='NULL'")

    # revert any previous CoP document duplication
    sql = <<-SQL
      WITH new_docs AS (
        SELECT id, original_id FROM documents
        WHERE type = 'Document::Proposal' AND original_id IS NOT NULL
      ), proposals AS (
        UPDATE proposal_details
        SET document_id = d.original_id
        FROM proposal_details pd
        JOIN new_docs d ON pd.document_id = d.id
        WHERE proposal_details.id = pd.id
      )
      DELETE FROM documents
      WHERE original_id IS NOT NULL;
    SQL
    ApplicationRecord.connection.execute(sql)
  end

  def run_final_queries
    # need to duplicate CoP documents, which are linked to more than one proposal
    # that applies to old documents
    sql = <<-SQL
      WITH proposal_details AS (
        SELECT proposal_details.*, ROW_NUMBER(*) OVER(PARTITION BY document_id)
        FROM proposal_details
      ), proposal_details_to_split AS (
        SELECT *
        FROM proposal_details
        WHERE row_number > 1
      ), documents_to_split AS (
        SELECT d.*
        FROM proposal_details_to_split pd
        JOIN documents d
        ON pd.document_id = d.id
      ), inserted_documents AS (
        INSERT INTO documents (
          event_id,
          sort_index,
          type,
          elib_legacy_id,
          title,
          date,
          filename,
          elib_legacy_file_name,
          is_public,
          language_id,
          created_at,
          updated_at,
          original_id
        )
        SELECT
          event_id,
          sort_index,
          type,
          elib_legacy_id,
          title,
          date,
          filename,
          elib_legacy_file_name,
          is_public,
          language_id,
          created_at,
          updated_at,
          id
        FROM documents_to_split d
        RETURNING *
      ), inserted_documents_with_rowno AS (
        SELECT *, ROW_NUMBER(*) OVER(PARTITION BY original_id)
        FROM inserted_documents
      ), proposal_details_to_update AS (
        SELECT pd.*, d.id AS new_document_id
        FROM proposal_details_to_split pd
        JOIN inserted_documents_with_rowno d
        ON d.original_id = pd.document_id AND (d.row_number + 1) = pd.row_number
      )
      UPDATE proposal_details
      SET document_id = new_document_id
      FROM proposal_details_to_update pd
      WHERE proposal_details.id = pd.id;
    SQL
    ApplicationRecord.connection.execute(sql)
    # in case you need to revert
    # WITH new_docs AS (
    #   SELECT * FROM documents WHERE original_id IS NOT NULL
    # ), proposals AS (
    #   UPDATE proposal_details
    #   SET document_id = d.original_id
    #   FROM proposal_details pd
    #   JOIN new_docs d ON pd.document_id = d.id
    #   WHERE proposal_details.id = pd.id
    # )
    # DELETE FROM documents
    # WHERE original_id IS NOT NULL;

    sql = <<-SQL
      UPDATE documents
      SET title = COALESCE(pd.proposal_nature, title)
      FROM
      proposal_details pd
      WHERE documents.id = pd.document_id;
    SQL
    ApplicationRecord.connection.execute(sql)
  end

  def run_queries
    super()
    sql = <<-SQL
      WITH rows_to_insert AS (
        #{proposal_details_rows_to_insert_sql}
      ), rows_to_insert_resolved AS (
        SELECT *, outcomes.id AS proposal_outcome_id, documents.id AS document_id
        FROM rows_to_insert
        JOIN documents ON DocumentID = documents.elib_legacy_id
        LEFT JOIN document_tags outcomes ON BTRIM(UPPER(outcomes.name)) = BTRIM(UPPER(ProposalOutcome))
      )
      INSERT INTO proposal_details(document_id, proposal_outcome_id, proposal_nature, representation, proposal_number, created_at, updated_at)
      SELECT document_id, proposal_outcome_id, ProposalNature, ProposalRepresentation, ProposalNo, NOW(), NOW()
      FROM rows_to_insert_resolved
    SQL
    ApplicationRecord.connection.execute(sql)
    run_final_queries
  end

  # this performs grouping, the proposal meta data used to be citation-level
  # but in the new system it is document-level
  def all_proposal_details_rows_sql
    <<-SQL
      SELECT CAST(DocumentID AS INT), ProposalNature, ProposalOutcome, ProposalRepresentation, ProposalNo
      FROM #{table_name}
      GROUP BY DocumentID, ProposalNature, ProposalOutcome, ProposalRepresentation, ProposalNo
    SQL
  end

  # this might return more than 1 row per DocumentID
  # it will lead to inserting multiple proposal_details records per document
  # that is not expected in the new structure; to work around the problem
  # after the import documents with multiple details will need to be duplicated
  def proposal_details_rows_to_insert_sql
    sql = <<-SQL
      SELECT * FROM (
        #{all_proposal_details_rows_sql}
      ) all_rows_in_table_name
      WHERE ProposalNature IS NOT NULL
        OR ProposalOutcome IS NOT NULL
        OR ProposalRepresentation IS NOT NULL
      EXCEPT
      SELECT
        d.elib_legacy_id,
        dd.proposal_nature,
        outcomes.name,
        dd.representation,
        dd.proposal_number
      FROM (
        #{all_rows_sql}
      ) nc
      JOIN documents d ON d.elib_legacy_id = nc.DocumentID
      JOIN proposal_details dd ON d.id = dd.document_id
      JOIN document_tags outcomes ON dd.proposal_outcome_id = outcomes.id
    SQL
  end

end