ClusterLabs/hawk

View on GitHub
hawk/app/lib/cibtools.rb

Summary

Maintainability
C
1 day
Test Coverage
# Copyright (c) 2009-2015 Tim Serong <tserong@suse.com>
# Copyright (c) 2015 Kristoffer Gronlund <kgronlund@suse.com>
# See COPYING for license.

# Tools for manipulating the CIB XML.
require 'util'


module Cibtools

  # call-id value carried by an op that is (allegedly) still pending.
  PENDING_CALL_ID = -1

  # rc-code values that describe resource state rather than an error:
  # 0 (success/running), 7 (not running) and 8 (running master).
  RC_SUCCESS        = 0
  RC_NOT_RUNNING    = 7
  RC_RUNNING_MASTER = 8

  # rc-codes a probe may report without that being treated as a failure.
  PROBE_STATE_RCS = [RC_SUCCESS, RC_NOT_RUNNING, RC_RUNNING_MASTER].freeze

  # Operations that never contribute to the computed resource state.
  SKIPPED_OPS = %w[notify delete cancel].freeze

  # Roughly equivalent to crm_element_value() in Pacemaker.
  #
  # Looks up attribute +name+ on +elem+ and hands it, together with
  # +default+, to Util.unstring.  Returns nil when +elem+ itself is nil
  # (+default+ is not applied in that case).
  def get_xml_attr(elem, name, default = nil)
    return nil if elem.nil?
    Util.unstring(elem.attributes[name], default)
  end
  module_function :get_xml_attr


  # Format the epoch string "admin_epoch:epoch:num_updates".
  # Each missing attribute falls back to '0'.
  def epoch_string(elem)
    %w[admin_epoch epoch num_updates].map { |attr| get_xml_attr(elem, attr, '0').to_s }.join(':')
  end
  module_function :epoch_string


  # transliteration of pacemaker/lib/pengine/unpack.c:determine_online_status_fencing()
  # ns is node_state element from CIB
  #
  # Returns one of :online, :offline, :pending, :standby or :unclean.
  def determine_online_status_fencing(ns)
    in_ccm      = get_xml_attr(ns, 'in_ccm')
    crm_state   = get_xml_attr(ns, 'crmd')
    join_state  = get_xml_attr(ns, 'join')
    exp_state   = get_xml_attr(ns, 'expected')

    # expect it to be up (more or less) if 'shutdown' is '0' or unspecified
    # NOTE(review): the default is the String '0' but the comparison is made
    # against the Integer 0; unless Util.unstring converts numeric strings
    # this is always false.  Deliberately left untouched because changing it
    # alters which nodes get reported as unclean - confirm intended behaviour.
    expected_up = get_xml_attr(ns, 'shutdown', '0') == 0

    if in_ccm && crm_state == 'online'
      # The order of the branches matters: 'member' is checked before the
      # comparison against the expected state.
      case join_state
      when 'member'  then :online   # rock 'n' roll (online)
      when exp_state then :offline  # coming up (!online)
      when 'pending' then :pending  # technically online, but not ready to run resources
      when 'banned'  then :standby  # not allowed to be part of the cluster
      else                :unclean  # unexpectedly down (unclean)
      end
    elsif !in_ccm && crm_state == 'offline' && !expected_up
      :offline                      # not online, but cleanly
    elsif expected_up
      :unclean                      # expected to be up, mark it unclean
    else
      :offline                      # offline
    end
  end
  module_function :determine_online_status_fencing

  # transliteration of pacemaker/lib/pengine/unpack.c:determine_online_status_no_fencing()
  # ns is node_state element from CIB
  # TODO(could): can we consolidate this with determine_online_status_fencing?
  #
  # Returns one of :online, :offline or :unclean.
  def determine_online_status_no_fencing(ns)
    in_ccm      = get_xml_attr(ns, 'in_ccm')
    crm_state   = get_xml_attr(ns, 'crmd')
    join_state  = get_xml_attr(ns, 'join')

    # expect it to be up (more or less) if 'shutdown' is '0' or unspecified
    # NOTE(review): String '0' default compared against Integer 0, same
    # caveat as in determine_online_status_fencing - confirm before changing.
    expected_up = get_xml_attr(ns, 'shutdown', '0') == 0

    if !in_ccm
      :offline
    elsif crm_state == 'online'
      # Anything other than 'member' is "not ready yet" (should this break
      # down to pending/banned like determine_online_status_fencing?  It
      # doesn't in unpack.c...)
      join_state == 'member' ? :online : :offline
    elsif !expected_up
      :offline
    else
      :unclean
    end
  end
  module_function :determine_online_status_no_fencing

  # Dispatches to the fencing or no-fencing variant depending on
  # +stonith_enabled+ and returns that variant's state symbol.
  def determine_online_status(ns, stonith_enabled)
    if stonith_enabled
      determine_online_status_fencing(ns)
    else
      determine_online_status_no_fencing(ns)
    end
  end
  module_function :determine_online_status

  # Translated, human readable description for rc codes 0-9; any other
  # value yields the translation of 'other'.  The literal _('...') calls
  # are kept one per line so the message strings stay directly visible.
  def rc_desc(rc)
    case rc
    when 0 then _('success')
    when 1 then _('generic error')
    when 2 then _('incorrect arguments')
    when 3 then _('unimplemented action')
    when 4 then _('insufficient permissions')
    when 5 then _('installation error')
    when 6 then _('configuration error')
    when 7 then _('not running')
    when 8 then _('promoted')
    when 9 then _('failed (promoted)')
    else        _('other')
    end
  end
  module_function :rc_desc

  # Comparator (-1/0/1) for two lrm_rsc_op elements, ordering them from
  # oldest to most recent.  +a+ and +b+ only need to respond to
  # +attributes+ with 'operation', 'call-id' and 'transition-key' entries.
  # Missing 'operation' or 'transition-key' attributes are tolerated.
  def sort_ops(a, b)
    a_call_id = a.attributes['call-id'].to_i
    b_call_id = b.attributes['call-id'].to_i

    # Normal case, neither op is pending, call-id wins
    return a_call_id <=> b_call_id unless a_call_id == PENDING_CALL_ID || b_call_id == PENDING_CALL_ID

    a_op = a.attributes['operation'].to_s
    b_op = b.attributes['operation'].to_s
    # Read the keys up front: they are needed by the migrate special case
    # *and* by the "same operation, same transition key" rule below.
    a_key = a.attributes['transition-key']
    b_key = b.attributes['transition-key']

    if a_op.start_with?('migrate_') || b_op.start_with?('migrate_')
      # Special case for pending migrate ops, because stale ops hang around
      # in the CIB (see lf#2481).  There's a couple of things to do here:
      a_key_split = a_key.to_s.split(':')
      b_key_split = b_key.to_s.split(':')
      if a_key == b_key
        # 1) if the transition keys match, newer call-id wins (ensures bogus
        # pending ops lose to immediately subsequent start/stop).
        a_call_id <=> b_call_id
      elsif a_key_split[3] == b_key_split[3]
        # 2) if the transition keys don't match but the transitioner UUIDs
        # *do* match, the migrate is either old (predating a start/stop that
        # occurred after a migrate's regular start/stop), or new (the current
        # pending op), in which case we assume the larger graph number is the
        # most recent op (this will break if uint64_t ever wraps).
        a_key_split[1].to_i <=> b_key_split[1].to_i
      else
        # If the transitioner UUIDs *don't* match (different instances
        # of crmd), we make the pending op most recent (reverse sort
        # call id), because experiment seems to indicate this is the
        # least-worst choice.  Pending migrate ops for a node evaporate
        # if Pacemaker is stopped on that node, so after a UUID change,
        # there should be at most one outstanding pending migrate op
        # that doesn't hit one of the other rules above - if this is
        # the case, this pending migrate op is what's happening right
        # *now*
        b_call_id <=> a_call_id
      end
    elsif a_op == b_op && a_key == b_key
      # Same operation, same transition key, and one op is allegedly pending.
      # This is a lie (see bnc#879034), so make newer call-id win and have
      # bogus pending op lose (similar to above special case for migrate ops)
      a_call_id <=> b_call_id
    elsif a_call_id == PENDING_CALL_ID
      1                                         # make pending start/stop op most recent
    else
      # At least one op is pending by the time we get here, so if it is
      # not a it has to be b.
      -1                                        # likewise
    end
  end
  module_function :sort_ops

  # The operation and RC code tells us the state of the resource on this node
  # when rc=0, anything other than a stop means we're running
  # (might be slave after a demote)
  # TODO(must): verify this demote business
  #
  # Returns :stopped, :master or :started when +rc+ is one of the state
  # bearing codes, otherwise hands back the +state+ passed in.
  def op_rc_to_state(operation, rc, state)
    case rc
    when RC_NOT_RUNNING
      :stopped
    when RC_RUNNING_MASTER
      :master
    when RC_SUCCESS
      case operation
      when 'stop', 'migrate_to' then :stopped
      when 'promote'            then :master
      else                           :started
      end
    else
      state
    end
  end
  module_function :op_rc_to_state

  # Walks the recorded ops of resource +rsc_id+ on node +node_uname+ (oldest
  # first, as ordered by sort_ops) and derives the resource state from them.
  # Only the first matching lrm_resource element is evaluated.  Returns
  # :unknown when there is no match or no op changes the state; otherwise
  # :pending, :failed or whatever op_rc_to_state yields.
  def rsc_state_from_lrm_rsc_op(xml, node_uname, rsc_id)
    xml.elements.each("cib/status/node_state[@uname='#{node_uname}']/lrm/lrm_resources/lrm_resource[@id='#{rsc_id}']") do |lrm_resource|
      # logic derived somewhat from pacemaker/lib/pengine/unpack.c:unpack_rsc_op()
      state = :unknown
      ops = []
      lrm_resource.elements.each('lrm_rsc_op') { |op| ops << op }

      ops.sort { |a, b| sort_ops(a, b) }.each do |op|
        attrs = op.attributes
        operation = attrs['operation']

        # skip notifies, deletes, cancels
        next if SKIPPED_OPS.include?(operation)

        if attrs['call-id'].to_i == PENDING_CALL_ID
          # skip allegedly pending "last_failure" ops (hack to fix bnc#706755)
          # TODO(should): see if we can remove this in future
          id = attrs['id']
          next if !id.nil? && id.end_with?('_last_failure_0')

          # Don't do any further processing for pending ops, but only set
          # resource state to "pending" if it's not a pending monitor
          # TODO(should): Look at doing this by "whitelist"? i.e. explicitly
          # notice pending start, stop, promote, demote, migrate_*..?
          # This would allow us to say "Starting", "Stopping", etc. in the UI.
          state = :pending if operation != 'monitor'
          next
        end

        rc_code = attrs['rc-code'].to_i
        # Cope with missing transition key (e.g.: in multi1.xml CIB from pengine/test10)
        # TODO(should): Can we handle this better?  When is it valid for the transition
        # key to not be there?
        expected = rc_code
        # The third field of the transition key is the expected rc.
        expected = attrs['transition-key'].split(':')[2].to_i if attrs.key?('transition-key')

        # Report failure if rc_code != expected, unless it's a probe,
        # in which case we only report failure when rc_code is not
        # 0 (running), 7 (not running) or 8 (running master), i.e. is
        # some error value.
        is_probe = operation == 'monitor' && attrs['interval'].to_i.zero?
        harmless_probe = is_probe && PROBE_STATE_RCS.include?(rc_code)
        if rc_code != expected && !harmless_probe
          if on_fail_ignored?(xml, rsc_id, operation)
            # on-fail == ignore for this op: pretend it succeeded for the
            # purposes of state calculation
            rc_code = expected
          elsif operation == 'stop'
            # We have a failed stop, the resource is failed (bnc#879034)
            state = :failed
          end
        end

        state = op_rc_to_state(operation, rc_code, state)
      end

      return state
    end
    :unknown
  end
  module_function :rsc_state_from_lrm_rsc_op

  # True when the primitive behind +rsc_id+ (the part before the first ':')
  # configures on-fail="ignore" for an op named +operation+.
  # TODO(must): Verify interval is correct
  def on_fail_ignored?(xml, rsc_id, operation)
    primitive_id = rsc_id.split(':')[0]
    ignored = false
    xml.elements.each("cib/configuration//primitive[@id='#{primitive_id}']/operations/op[@name='#{operation}']") do |e|
      ignored = true if e.attributes['on-fail'] == 'ignore'
    end
    ignored
  end
  module_function :on_fail_ignored?
  private_class_method :on_fail_ignored?
end