roo-rb/roo-xls

View on GitHub
lib/roo/xls/excel.rb

Summary

Maintainability
D
2 days
Test Coverage
require 'roo/xls/version'
require 'roo/base'
require 'spreadsheet'
require 'tmpdir'

module Roo
  # Class for handling Excel-Spreadsheets
  class Excel < Roo::Base
    FORMULAS_MESSAGE = 'the spreadsheet gem does not support formulas, so roo can not.'
    CHARGUESS =
      begin
        require 'charguess'
        true
      rescue LoadError
        false
      end

    attr_reader :workbook

    # Creates a new Excel spreadsheet object.
    # Parameter packed: :zip - File is a zip-file
    def initialize(filename, options = {})
      packed = options[:packed]
      file_warning = options[:file_warning] || :error
      mode = options[:mode] || 'rb+'

      if is_stream?(filename)
        @workbook = ::Spreadsheet.open(filename, mode)
      else
        file_type_check(filename, '.xls', 'an Excel', file_warning, packed)
        Dir.mktmpdir do |tmpdir|
          filename = download_uri(filename, tmpdir) if uri?(filename)
          if filename.is_a?(::String) && filename[0, 7] == 'stream:'
            filename = open_from_stream(filename[7..-1], tmpdir)
          end
          filename = unzip(filename, tmpdir) if packed == :zip

          @filename = filename
          unless File.file?(@filename)
            raise IOError, "file #{@filename} does not exist"
          end
          @workbook = ::Spreadsheet.open(filename, mode)
        end
      end

      super(filename, options)
      @formula = {}
      @fonts = {}
    end

    def worksheets
      @worksheets ||= workbook.worksheets
    end

    def encoding=(codepage)
      @workbook.encoding = codepage
    end

    # returns an array of sheet names in the spreadsheet
    def sheets
      @sheets ||= worksheets.collect { |worksheet| normalize_string(worksheet.name) }
    end

    # this method lets you find the worksheet with the most data
    def longest_sheet
      sheet(worksheets.inject do |m, o|
        o.row_count > m.row_count ? o : m
      end.name)
    end

    # returns the content of a cell. The upper left corner is (1,1) or ('A',1)
    def cell(row, col, sheet = default_sheet)
      validate_sheet!(sheet)

      read_cells(sheet)
      raise 'should be read' unless @cells_read[sheet]
      row, col = normalize(row, col)
      if celltype(row, col, sheet) == :date
        yyyy, mm, dd = @cell[sheet][[row, col]].split('-')
        return Date.new(yyyy.to_i, mm.to_i, dd.to_i)
      end
      if celltype(row, col, sheet) == :string
        platform_specific_encoding(@cell[sheet][[row, col]])
      else
        @cell[sheet] && @cell[sheet][[row, col]]
      end
    end

    # returns the type of a cell:
    # * :float
    # * :string,
    # * :date
    # * :percentage
    # * :formula
    # * :time
    # * :datetime
    def celltype(row, col, sheet = default_sheet)
      read_cells(sheet)
      row, col = normalize(row, col)
      begin
        if @formula[sheet] && @formula[sheet][[row, col]]
          :formula
        elsif @cell_type[sheet]
          @cell_type[sheet][[row, col]]
        end
      rescue
        puts "Error in sheet #{sheet}, row #{row}, col #{col}"
        raise
      end
    end

    # returns NO formula in excel spreadsheets
    def formula(_row, _col, _sheet = nil)
      raise NotImplementedError, FORMULAS_MESSAGE
    end
    alias_method :formula?, :formula

    # returns NO formulas in excel spreadsheets
    def formulas(_sheet = nil)
      raise NotImplementedError, FORMULAS_MESSAGE
    end

    # Given a cell, return the cell's font
    def font(row, col, sheet = default_sheet)
      read_cells(sheet)
      row, col = normalize(row, col)
      @fonts[sheet][[row, col]]
    end

    # shows the internal representation of all cells
    # mainly for debugging purposes
    def to_s(sheet = default_sheet)
      read_cells(sheet)
      @cell[sheet].inspect
    end

    private

    # converts name of a sheet to index (0,1,2,..)
    def sheet_no(name)
      return name - 1 if name.is_a?(Integer)
      i = 0
      worksheets.each do |worksheet|
        return i if name == normalize_string(worksheet.name)
        i += 1
      end
      raise StandardError, "sheet '#{name}' not found"
    end

    def normalize_string(value)
      value = every_second_null?(value) ? remove_every_second_null(value) : value
      if CHARGUESS && (encoding = CharGuess.guess(value))
        encoding.encode Encoding::UTF_8
      else
        platform_specific_encoding(value)
      end
    end

    def platform_specific_encoding(value)
      result =
        case RUBY_PLATFORM.downcase
        when /darwin|solaris/
          value.encode Encoding::UTF_8
        when /mswin32/
          value.encode Encoding::ISO_8859_1
        else
          value
        end
      result = remove_every_second_null(result) if every_second_null?(result)
      result
    end

    def every_second_null?(str)
      result = true
      return false if str.length < 2
      0.upto(str.length / 2 - 1) do |i|
        if str[i * 2 + 1, 1] != "\000"
          result = false
          break
        end
      end
      result
    end

    def remove_every_second_null(str)
      result = ''
      0.upto(str.length / 2 - 1) do |i|
        c = str[i * 2, 1]
        result += c
      end
      result
    end

    # helper function to set the internal representation of cells
    def set_cell_values(sheet, row, col, i, v, value_type, formula, _tr, font)
      # key = "#{y},#{x+i}"
      key = [row, col + i]
      @cell_type[sheet] = {} unless @cell_type[sheet]
      @cell_type[sheet][key] = value_type
      @formula[sheet] = {} unless @formula[sheet]
      @formula[sheet][key] = formula if formula
      @cell[sheet] = {} unless @cell[sheet]
      @fonts[sheet] = {} unless @fonts[sheet]
      @fonts[sheet][key] = font

      @cell[sheet][key] =
        case value_type
        when :float
          v.to_f
        when :string
          v
        when :date
          v
        when :datetime
          @cell[sheet][key] = DateTime.new(v.year, v.month, v.day, v.hour, v.min, v.sec)
        when :percentage
          v.to_f
        when :time
          v
        else
          v
        end
    end

    # ruby-spreadsheet has a font object so we're extending it
    # with our own functionality but still providing full access
    # to the user for other font information
    module ExcelFontExtensions
      def bold?(*_args)
        # From ruby-spreadsheet doc: 100 <= weight <= 1000, bold => 700, normal => 400
        weight == 700
      end

      def italic?
        italic
      end

      def underline?
        underline != :none
      end
    end

    # read all cells in the selected sheet
    def read_cells(sheet = default_sheet)
      validate_sheet!(sheet)
      return if @cells_read[sheet]

      worksheet = @workbook.worksheet(sheet_no(sheet))
      row_index = 1
      worksheet.each(0) do |row|
        (0..row.size).each do |cell_index|
          cell = row.at(cell_index)
          next if cell.nil? # skip empty cells
          next if cell.class == ::Spreadsheet::Formula && cell.value.nil? # skip empty formula cells
          value_type, v =
            if date_or_time?(row, cell_index)
              read_cell_date_or_time(row, cell_index)
            else
              read_cell(row, cell_index)
            end
          formula = tr = nil # TODO:???
          col_index = cell_index + 1
          font = row.format(cell_index).font
          font.extend(ExcelFontExtensions)
          set_cell_values(sheet, row_index, col_index, 0, v, value_type, formula, tr, font)
        end # row
        row_index += 1
      end # worksheet
      @cells_read[sheet] = true
    end

    # Get the contents of a cell, accounting for the
    # way formula stores the value
    def read_cell_content(row, idx)
      cell = row.at(idx)
      cell = row[idx] if row[idx].class == ::Spreadsheet::Link
      cell = cell.value if cell.class == ::Spreadsheet::Formula
      cell
    end

    # Test the cell to see if it's a valid date/time.
    def date_or_time?(row, idx)
      format = row.format(idx)
      if format.date_or_time?
        cell = read_cell_content(row, idx)
        true if Float(cell) > 0 rescue false
      else
        false
      end
    end

    # Read the date-time cell and convert to,
    # the date-time values for Roo
    def read_cell_date_or_time(row, idx)
      cell = read_cell_content(row, idx)
      cell = cell.to_s.to_f
      if cell < 1.0
        value_type = :time
        f = cell * 24.0 * 60.0 * 60.0
        secs = f.round
        h = (secs / 3600.0).floor
        secs -= 3600 * h
        m = (secs / 60.0).floor
        secs -= 60 * m
        s = secs
        value = h * 3600 + m * 60 + s
      else
        datetime =
          if row.at(idx).class == ::Spreadsheet::Formula
            row.send(:_datetime, cell)
          else
            row.datetime(idx)
          end
        if datetime.hour != 0 ||
           datetime.min != 0 ||
           datetime.sec != 0
          value_type = :datetime
          value = datetime
        else
          value_type = :date
          value =
            if row.at(idx).class == ::Spreadsheet::Formula
              row.send(:_date, cell)
            else
              row.date(idx)
            end
          value = sprintf('%04d-%02d-%02d', value.year, value.month, value.day)
        end
      end
      [value_type, value]
    end

    # Read the cell and based on the class,
    # return the values for Roo
    def read_cell(row, idx)
      cell = read_cell_content(row, idx)
      case cell
      when Float, Integer
        value_type = :float
        value = cell.to_f
      when ::Spreadsheet::Link
        value_type = :link
        value = cell
      when String, TrueClass, FalseClass
        value_type = :string
        value = cell.to_s
      else
        value_type = cell.class.to_s.downcase.to_sym
        value = nil
      end # case
      [value_type, value]
    end
  end
end