pollandroll/daru_lite

View on GitHub
lib/daru_lite/io/io.rb

Summary

Maintainability
A
0 mins
Test Coverage
B
83%
module DaruLite
  require_relative 'csv/converters'
  module IOHelpers
    class << self
      def process_row(row, empty)
        row.to_a.map do |c|
          if empty.include?(c)
            # FIXME: As far as I can guess, it will never work.
            # It is called only inside `from_plaintext`, and there
            # data is splitted by `\s+` -- there is no chance that
            # "empty" (currently just '') will be between data?..
            nil
          else
            try_string_to_number(c)
          end
        end
      end

      def open_local_or_remote_file(path)
        uri = URI.parse(path)
        uri.is_a?(URI::HTTP) || uri.is_a?(URI::HTTPS) ? uri.open : File.open(uri.path)
      end

      private

      INT_PATTERN = /^[-+]?\d+$/.freeze
      FLOAT_PATTERN = /^[-+]?\d+[,.]?\d*(e-?\d+)?$/.freeze

      def try_string_to_number(s)
        case s
        when INT_PATTERN
          s.to_i
        when FLOAT_PATTERN
          s.tr(',', '.').to_f
        else
          s
        end
      end
    end
  end

  module IO
    class << self
      # Functions for loading/writing Excel files.

      def from_excel(path, opts = {})
        opts = {
          worksheet_id: 0,
          row_id: 0
        }.merge opts

        worksheet, headers = read_from_excel(path, opts)
        df = DaruLite::DataFrame.new({})
        headers.each_with_index do |h, i|
          col = worksheet.column(i).to_a
          col.delete_at 0
          df[h] = col
        end

        df
      end

      def read_from_excel(path, opts)
        optional_gem 'spreadsheet', '~>1.3.0'

        worksheet_id = opts[:worksheet_id]
        row_id       = opts[:row_id]
        book         = Spreadsheet.open path
        worksheet    = book.worksheet worksheet_id
        headers      = ArrayHelper.recode_repeated(worksheet.row(row_id)).map(&:to_sym)

        [worksheet, headers]
      end

      def dataframe_write_excel(dataframe, path, _opts = {})
        book   = Spreadsheet::Workbook.new
        sheet  = book.create_worksheet
        format = Spreadsheet::Format.new color: :blue, weight: :bold

        sheet.row(0).concat(dataframe.vectors.to_a.map(&:to_s)) # Unfreeze strings
        sheet.row(0).default_format = format
        i = 1
        dataframe.each_row do |row|
          sheet.row(i).concat(row.to_a)
          i += 1
        end

        book.write(path)
      end

      # Functions for loading/writing CSV files
      def from_csv(path, opts = {})
        daru_options, opts = from_csv_prepare_opts opts
        # Preprocess headers for detecting and correcting repetition in
        # case the :headers option is not specified.
        hsh =
          if opts[:headers]
            from_csv_hash_with_headers(path, opts)
          else
            from_csv_hash(path, opts)
              .tap { |hash| daru_options[:order] = hash.keys }
          end
        DaruLite::DataFrame.new(hsh, daru_options)
      end

      def dataframe_write_csv(dataframe, path, opts = {})
        options = {
          converters: :numeric
        }.merge(opts)

        writer = ::CSV.open(path, 'w', **options)
        writer << dataframe.vectors.to_a unless options[:headers] == false

        dataframe.each_row do |row|
          writer << if options[:convert_comma]
                      row.map { |v| v.to_s.tr('.', ',') }
                    else
                      row.to_a
                    end
        end

        writer.close
      end

      # Execute a query and create a data frame from the result
      #
      # @param db [DBI::DatabaseHandle, String] A DBI connection OR Path to a SQlite3 database.
      # @param query [String] The query to be executed
      #
      # @return A dataframe containing the data resulting from the query
      def from_sql(db, query)
        require 'daru_lite/io/sql_data_source'
        SqlDataSource.make_dataframe(db, query)
      end

      def dataframe_write_sql(ds, dbh, table)
        require 'dbi'
        query = "INSERT INTO #{table} (#{ds.vectors.to_a.join(',')}) VALUES (#{(['?'] * ds.vectors.size).join(',')})"
        sth   = dbh.prepare(query)
        ds.each_row { |c| sth.execute(*c.to_a) }
        true
      end

      # Load dataframe from AR::Relation
      #
      # @param relation [ActiveRecord::Relation] A relation to be used to load the contents of dataframe
      #
      # @return A dataframe containing the data in the given relation
      def from_activerecord(relation, *fields)
        fields = relation.klass.column_names if fields.empty?
        fields = fields.map(&:to_sym)

        result = relation.pluck(*fields).transpose
        DaruLite::DataFrame.new(result, order: fields).tap(&:update)
      end

      # Loading data from plain text files

      def from_plaintext(filename, fields)
        ds = DaruLite::DataFrame.new({}, order: fields)
        fp = File.open(filename, 'r')
        fp.each_line do |line|
          row = DaruLite::IOHelpers.process_row(line.strip.split(/\s+/), [''])
          next if row == ["\x1A"]

          ds.add_row(row)
        end
        ds.update
        fields.each { |f| ds[f].rename f }
        ds
      end

      # Loading and writing Marshalled DataFrame/Vector
      def save(klass, filename)
        fp = File.open(filename, 'w')
        Marshal.dump(klass, fp)
        fp.close
      end

      def load(filename)
        if File.exist? filename
          o = false
          File.open(filename, 'r') { |fp| o = Marshal.load(fp) }
          o
        else
          false
        end
      end

      private

      def optional_gem(name, version)
        gem name, version
        require name
      rescue LoadError
        DaruLite.error "\nInstall the #{name} gem version #{version} for using  #{name} functions."
      end

      DARU_OPT_KEYS = %i[clone order index name].freeze

      def from_csv_prepare_opts(opts)
        opts[:col_sep]           ||= ','
        opts[:skip_blanks]       ||= true
        opts[:converters]        ||= [:numeric]

        opts[:converters] = from_csv_prepare_converters(opts[:converters])

        daru_options = opts.keys.each_with_object({}) do |k, hash|
          hash[k] = opts.delete(k) if DARU_OPT_KEYS.include?(k)
        end
        [daru_options, opts]
      end

      def from_csv_prepare_converters(converters)
        Array(converters).flat_map do |c|
          if ::CSV::Converters[c]
            ::CSV::Converters[c]
          elsif DaruLite::IO::CSV::CONVERTERS[c]
            DaruLite::IO::CSV::CONVERTERS[c]
          else
            c
          end
        end
      end

      def from_csv_hash_with_headers(path, opts)
        opts[:header_converters] ||= :symbol
        ::CSV
          .parse(DaruLite::IOHelpers.open_local_or_remote_file(path), **opts)
          .tap { |c| yield c if block_given? }
          .by_col.to_h { |col_name, values| [col_name, values] }
      end

      def from_csv_hash(path, opts)
        csv_as_arrays =
          ::CSV
          .parse(DaruLite::IOHelpers.open_local_or_remote_file(path), **opts)
          .tap { |c| yield c if block_given? }
          .to_a
        headers       = ArrayHelper.recode_repeated(csv_as_arrays.shift)
        csv_as_arrays = csv_as_arrays.transpose
        headers.each_with_index.to_h { |h, i| [h, csv_as_arrays[i]] }
      end

      def html_parse_table(table)
        headers, headers_size = html_scrape_tag(table, 'th')
        data, size = html_scrape_tag(table, 'td')
        data = data.keep_if { |x| x.count == size }
        order, indice = html_parse_hash(headers, size, headers_size) if headers_size >= size
        return unless (indice.nil? || indice.count == data.count) && !order.nil? && order.count.positive?

        { data: data.compact, index: indice, order: order }
      end

      def html_scrape_tag(table, tag)
        arr  = table.search('tr').map { |row| row.search(tag).map { |val| val.text.strip } }
        size = arr.map(&:count).max
        [arr, size]
      end

      # Splits headers (all th tags) into order and index. Wherein,
      # Order : All <th> tags on first proper row of HTML table
      # index : All <th> tags on first proper column of HTML table
      def html_parse_hash(headers, size, headers_size)
        headers_index = headers.find_index { |x| x.count == headers_size }
        order = headers[headers_index]
        order_index = order.count - size
        order = order[order_index..]
        indice = headers[headers_index + 1..].flatten
        indice = nil if indice.to_a.empty?
        [order, indice]
      end

      def html_search(table, match = nil)
        match.nil? ? true : (table.to_s.include? match)
      end

      # Allows user to override the scraped order / index / data
      def html_decide_values(scraped_val = {}, user_val = {})
        %I[data index name order].each do |key|
          user_val[key] ||= scraped_val[key]
        end
        user_val
      end

      def html_table_to_dataframe(table)
        DaruLite::DataFrame.rows table[:data],
                                 index: table[:index],
                                 order: table[:order],
                                 name: table[:name]
      end
    end
  end
end