autotelik/datashift

View on GitHub
lib/datashift/loaders/excel_loader.rb

Summary

Maintainability
C
1 day
Test Coverage
# Copyright:: (c) Autotelik Media Ltd 2015
# Author ::   Tom Statter
# License::   MIT
#
# Details::   Specific loader to support Excel files.
#             Note this only requires JRuby, Excel not required, nor Win OLE.
#
#             Maps column headings to operations on the model.
#             Iterates over all the rows using mapped operations to assign row data to a database object,
#             i.e pulls data from each column and sends to object.
#
require_relative 'file_loader'

module DataShift

  class ExcelLoader < LoaderBase

    include DataShift::ExcelBase
    include DataShift::FileLoader

    def initialize
      super
    end

    #  Options
    #
    #   [:sheet_name]      : Create a new worksheet assign to @sheet. Default is class.name
    #   [:sheet_number]    : Default is 0. The index of the Excel Worksheet to use.
    #
    def perform_load( options = {} )

      allow_empty_rows = DataShift::Loaders::Configuration.call.allow_empty_rows

      logger.info "Starting bulk load from Excel : #{file_name}"

      start(file_name, options)

      # maps list of headers into suitable calls on the Active Record class
      bind_headers(headers)

      is_dummy_run = DataShift::Configuration.call.dummy_run

      begin
        puts 'Dummy Run - Changes will be rolled back' if is_dummy_run

        load_object_class.transaction do
          sheet.each_with_index do |row, current_row_idx|

            next if current_row_idx == headers.idx

            # Excel num_rows seems to return all 'visible' rows, which appears to be greater than the actual data rows
            # (TODO - write spec to process .xls with a huge number of rows)
            #
            # manually have to detect when actual data ends, this isn't very smart but
            # got no better idea than ending once we hit the first completely empty row
            break if !allow_empty_rows && (row.nil? || row.empty?)

            logger.info "Processing Row #{current_row_idx}"

            contains_data = false

            doc_context.progress_monitor.start_monitoring

            # Iterate over the bindings,
            # For each column bound to a model operator, create a context from data in associated Excel column

            @binder.bindings.each do |method_binding|

              # TODO: - how does this get inserted - bind headers ?? ignore if no index

              next if method_binding.index.nil?

              unless method_binding.valid?
                logger.warn("No binding was found for column (#{current_row_idx})")
                next
              end

              # If binding to a column, get the value from the cell (bindings can be to internal methods)
              #
              value = row[method_binding.index]

              context = doc_context.create_node_context(method_binding, current_row_idx, value)

              contains_data ||= context.contains_data?

              logger.info "Processing Column #{method_binding.index} (#{method_binding.pp})"

              begin
                context.process
              rescue StandardError => x
                if doc_context.all_or_nothing?
                  logger.error('All or nothing set and Current Column failed so complete Row aborted')
                  doc_context.failed!
                  break
                end
              end

            end

            doc_context.reset and next if doc_context.errors?

            # Excel data rows not accurate, seems to have to manually detect when actual Excel data rows end
            break if !allow_empty_rows && contains_data == false

            doc_context.save_and_monitor_progress

            # unless next operation is update, reset the loader object
            doc_context.reset unless doc_context.node_context.next_update?
          end # all rows processed

          if is_dummy_run
            puts 'Excel loading stage done - Dummy run so Rolling Back.'
            raise ActiveRecord::Rollback # Don't actually create/upload to DB if we are doing dummy run
          end
        end # TRANSACTION N.B ActiveRecord::Rollback does not propagate outside of the containing transaction block
      rescue StandardError => e
        puts "ERROR: Excel loading failed : #{e.inspect}"
        raise e
      ensure
        report
      end

      puts 'Excel loading stage Complete.'
    end

    private

    #  Options  :
    #
    #   [:sheet_name]      : Create a new worksheet assign to @sheet. Default is class.name
    #   [:sheet_number]    : Default is 0. The index of the Excel Worksheet to use.
    #
    def start( file_name, options = {} )
      open_excel(file_name, options)

      header_row = DataShift::Loaders::Configuration.call.header_row

      set_headers( parse_headers(sheet, header_row) )

      if headers.empty?
        raise MissingHeadersError, "No headers found - Check Sheet #{sheet} is complete and Row #{headers.idx} contains headers"
      end

      excel
    end

  end
end