roo-rb/roo-xls

View on GitHub
lib/roo/xls/excel_2003_xml.rb

Summary

Maintainability
C
1 day
Test Coverage
require 'date'
require 'base64'
require 'nokogiri'
require 'tmpdir'

class Roo::Excel2003XML < Roo::Base
  # initialization and opening of a spreadsheet file
  # values for packed: :zip
  def initialize(filename, options = {})
    packed = options[:packed]
    file_warning = options[:file_warning] || :error

    Dir.mktmpdir do |tmpdir|
      filename = download_uri(filename, tmpdir) if uri?(filename)
      filename = unzip(filename, tmpdir) if packed == :zip

      file_type_check(filename, '.xml', 'an Excel 2003 XML', file_warning)
      @filename = filename
      unless File.file?(@filename)
        raise IOError, "file #{@filename} does not exist"
      end
      @doc = ::Roo::Utils.load_xml(@filename)
    end
    namespace = @doc.namespaces.select { |_, urn| urn == 'urn:schemas-microsoft-com:office:spreadsheet' }.keys.last
    @namespace = (namespace.nil? || namespace.empty?) ? 'ss' : namespace.split(':').last
    super(filename, options)
    @formula = {}
    @style = {}
    @style_defaults = Hash.new { |h, k| h[k] = [] }
    @style_definitions = {}
    read_styles
  end

  # Returns the content of a spreadsheet-cell.
  # (1,1) is the upper left corner.
  # (1,1), (1,'A'), ('A',1), ('a',1) all refers to the
  # cell at the first line and first row.
  def cell(row, col, sheet = nil)
    sheet ||= @default_sheet
    read_cells(sheet)
    row, col = normalize(row, col)
    if celltype(row, col, sheet) == :date
      yyyy, mm, dd = @cell[sheet][[row, col]].split('-')
      return Date.new(yyyy.to_i, mm.to_i, dd.to_i)
    end
    @cell[sheet][[row, col]]
  end

  # Returns the formula at (row,col).
  # Returns nil if there is no formula.
  # The method #formula? checks if there is a formula.
  def formula(row, col, sheet = nil)
    sheet ||= @default_sheet
    read_cells(sheet)
    row, col = normalize(row, col)
    @formula[sheet][[row, col]] && @formula[sheet][[row, col]]['oooc:'.length..-1]
  end
  alias_method :formula?, :formula

  class Font
    attr_accessor :bold, :italic, :underline, :color, :name, :size

    def bold?
      @bold == '1'
    end

    def italic?
      @italic == '1'
    end

    def underline?
      @underline != nil
    end
  end

  # Given a cell, return the cell's style
  def font(row, col, sheet = nil)
    sheet ||= @default_sheet
    read_cells(sheet)
    row, col = normalize(row, col)
    style_name = @style[sheet][[row, col]] || @style_defaults[sheet][col - 1] || 'Default'
    @style_definitions[style_name]
  end

  # returns the type of a cell:
  # * :float
  # * :string
  # * :date
  # * :percentage
  # * :formula
  # * :time
  # * :datetime
  def celltype(row, col, sheet = nil)
    sheet ||= @default_sheet
    read_cells(sheet)
    row, col = normalize(row, col)
    if @formula[sheet][[row, col]]
      return :formula
    else
      @cell_type[sheet][[row, col]]
    end
  end

  def sheets
    @doc.xpath("/#{@namespace}:Workbook/#{@namespace}:Worksheet").map do |sheet|
      sheet["#{@namespace}:Name"]
    end
  end

  # version of the openoffice document
  # at 2007 this is always "1.0"
  def officeversion
    oo_version
    @officeversion
  end

  # shows the internal representation of all cells
  # mainly for debugging purposes
  def to_s(sheet = nil)
    sheet ||= @default_sheet
    read_cells(sheet)
    @cell[sheet].inspect
  end

  # save spreadsheet
  def save #:nodoc:
    42
  end

  # returns each formula in the selected sheet as an array of elements
  # [row, col, formula]
  def formulas(sheet = nil)
    theformulas = []
    sheet ||= @default_sheet
    read_cells(sheet)
    first_row(sheet).upto(last_row(sheet)) do|row|
      first_column(sheet).upto(last_column(sheet)) do|col|
        if formula?(row, col, sheet)
          f = [row, col, formula(row, col, sheet)]
          theformulas << f
        end
      end
    end
    theformulas
  end

  private

  # read the version of the OO-Version
  def oo_version
    @doc.find("//*[local-name()='document-content']").each do |office|
      @officeversion = office['version']
    end
  end

  # helper function to set the internal representation of cells
  def set_cell_values(sheet, x, y, i, v, value_type, formula, _table_cell, str_v, style_name)
    key = [y, x + i]
    @cell_type[sheet] = {} unless @cell_type[sheet]
    @cell_type[sheet][key] = value_type
    @formula[sheet] = {} unless @formula[sheet]
    @formula[sheet][key] = formula  if formula
    @cell[sheet]    = {} unless @cell[sheet]
    @style[sheet] = {} unless @style[sheet]
    @style[sheet][key] = style_name
    @cell[sheet][key] =
      case @cell_type[sheet][key]
      when :float
        v.to_f
      when :string
        str_v
      when :datetime
        DateTime.parse(v)
      when :percentage
        v.to_f
      # when :time
      #   hms = v.split(':')
      #   hms[0].to_i*3600 + hms[1].to_i*60 + hms[2].to_i
      else
        v
      end
  end

  # read all cells in the selected sheet
  #--
  # the following construct means '4 blanks'
  # some content <text:s text:c="3"/>
  #++
  def read_cells(sheet = nil)
    sheet ||= @default_sheet
    validate_sheet!(sheet)
    return if @cells_read[sheet]
    sheet_found = false

    @doc.xpath("/#{@namespace}:Workbook/#{@namespace}:Worksheet[@#{@namespace}:Name='#{sheet}']").each do |ws|
      sheet_found = true
      column_styles = {}

      # Column Styles
      col = 1
      ws.xpath("./#{@namespace}:Table/#{@namespace}:Column").each do |c|
        skip_to_col = c["#{@namespace}:Index"].to_i
        col = skip_to_col if skip_to_col > 0
        col_style_name = c["#{@namespace}:StyleID"]
        column_styles[col] = col_style_name unless col_style_name.nil?
        col += 1
      end

      # Rows
      row = 1
      ws.xpath("./#{@namespace}:Table/#{@namespace}:Row").each do |r|
        skip_to_row = r["#{@namespace}:Index"].to_i
        row = skip_to_row if skip_to_row > 0

        # Excel uses a 'Span' attribute on a 'Row' to indicate the presence of
        # empty rows to skip.
        skip_next_rows = r["#{@namespace}:Span"].to_i

        row_style_name = r["#{@namespace}:StyleID"]

        # Cells
        col = 1
        r.xpath("./#{@namespace}:Cell").each do |c|
          skip_to_col = c["#{@namespace}:Index"].to_i
          col = skip_to_col if skip_to_col > 0

          skip_next_cols = c["#{@namespace}:MergeAcross"].to_i

          cell_style_name = c["#{@namespace}:StyleID"]
          style_name = cell_style_name || row_style_name || column_styles[col]

          # Cell Data
          c.xpath("./#{@namespace}:Data").each do |cell|
            formula = cell['Formula']
            value_type = cell["#{@namespace}:Type"].downcase.to_sym
            v = cell.content
            str_v = v
            case value_type
            when :number
              v = v.to_f
              value_type = :float
            when :datetime
              if v =~ /^1899-12-31T(\d{2}:\d{2}:\d{2})/
                v = Regexp.last_match[1]
                value_type = :time
              elsif v =~ /([^T]+)T00:00:00.000/
                v = Regexp.last_match[1]
                value_type = :date
              end
            when :boolean
              v = cell['boolean-value']
            end
            set_cell_values(sheet, col, row, 0, v, value_type, formula, cell, str_v, style_name)
          end
          col += (skip_next_cols + 1)
        end
        row += (skip_next_rows + 1)
      end
    end
    unless sheet_found
      raise RangeError, "Unable to find sheet #{sheet} for reading"
    end
    @cells_read[sheet] = true
  end

  def read_styles
    @doc.xpath("/#{@namespace}:Workbook/#{@namespace}:Styles/#{@namespace}:Style").each do |style|
      style_id = style["#{@namespace}:ID"]
      font = style.at_xpath("./#{@namespace}:Font")
      unless font.nil?
        @style_definitions[style_id] = Roo::Excel2003XML::Font.new
        @style_definitions[style_id].bold      = font["#{@namespace}:Bold"]
        @style_definitions[style_id].italic    = font["#{@namespace}:Italic"]
        @style_definitions[style_id].underline = font["#{@namespace}:Underline"]
        @style_definitions[style_id].color     = font["#{@namespace}:Color"]
        @style_definitions[style_id].name      = font["#{@namespace}:FontName"]
        @style_definitions[style_id].size      = font["#{@namespace}:Size"]
      end
    end
  end

  A_ROO_TYPE = {
    'float'      => :float,
    'string'     => :string,
    'date'       => :date,
    'percentage' => :percentage,
    'time'       => :time
  }

  def self.oo_type_2_roo_type(ootype)
    A_ROO_TYPE[ootype]
  end

  # helper method to convert compressed spaces and other elements within
  # an text into a string
  def children_to_string(children)
    result = ''
    children.each do|child|
      if child.text?
        result = result + child.content
      else
        if child.name == 's'
          compressed_spaces = child['c'].to_i
          # no explicit number means a count of 1:
          if compressed_spaces == 0
            compressed_spaces = 1
          end
          result = result + ' ' * compressed_spaces
        else
          result = result + child.content
        end
      end
    end
    result
  end
end # class