ManageIQ/manageiq

View on GitHub
app/models/miq_server/server_monitor.rb

Summary

Maintainability
B
5 hrs
Test Coverage
# Watches the heartbeats of the MiqServers in this server's region.
#
# Depending on whether +my_server+ is currently flagged as master, it either
# monitors every other started server (master mode) or monitors only the
# master and attempts a takeover when the master is missing or has stopped
# heartbeating (non-master mode).
#
# State kept between calls to #monitor_servers:
#   @last_servers - Hash of server id => {:last_hb_change => Time, :record => server},
#                   used in master mode.
#   @last_master  - Hash {:last_hb_change => Time, :record => server} describing the
#                   last observed master, used in non-master mode.
class MiqServer::ServerMonitor
  include Vmdb::Logging

  attr_reader :my_server

  # my_server - the server record on whose behalf monitoring is performed.
  def initialize(my_server)
    @my_server = my_server
  end

  # Entry point: reloads my_server and dispatches to the master or non-master
  # monitoring routine based on its current is_master? flag.
  def monitor_servers
    my_server.reload.is_master? ? monitor_servers_as_master : monitor_servers_as_non_master
  end

  private

  # Attempts to make my_server the master server of its region.
  #
  # last_master - the master record this server last observed, or nil if no
  #               master had been observed.
  #
  # The work is done inside the region's lock. If, once inside the lock, an
  # active master exists that is not the one we believed had died, the takeover
  # is aborted and nil is returned. Callers in this class do not rely on the
  # return value; they re-check my_server.reload.is_master? instead.
  def make_master_server(last_master)
    _log.info("Master server has #{last_master.nil? ? "not been set" : "died, #{last_master.name}"}.  Attempting takeover as new master server, #{my_server.name}.")
    parent = MiqRegion.my_region(true) # NOTE(review): `true` presumably bypasses a cached region - confirm
    parent.lock do
      # See if an ACTIVE server has already taken over
      active_servers = parent.active_miq_servers

      _log.debug("Double checking that nothing has changed")
      master = active_servers.detect(&:is_master?)

      # Abort when some active master exists and it is not the one we are replacing
      # (either we knew of no master at all, or a different server now holds the flag).
      if !master.nil? && (last_master.nil? || last_master.id != master.id)
        _log.info("Aborting master server takeover as another server, #{master.name}, has taken control first.")
        return nil
      end

      _log.debug("Setting this server, #{my_server.name}, as master server")

      # Set is_master on self, reset every other server in the region, including
      # inactive ones.
      parent.miq_servers.each do |s|
        s.is_master = (my_server.id == s.id)
        s.save!
      end
    end
    _log.info("This server #{my_server.name} is now set as the master server, last_master: #{last_master.try(:name)}")
  end

  # Heartbeat timeout read from Settings.server.heartbeat_timeout; callers in
  # this class treat the result as a number of seconds.
  def miq_server_time_threshold
    ::Settings.server.heartbeat_timeout.to_i_with_method
  end

  # Master-mode monitoring: tracks every other started server in the region.
  #
  # - Servers no longer returned are dropped from tracking and have all their
  #   roles deactivated.
  # - Newly seen servers start being tracked; if one of them also claims to be
  #   master, it is demoted.
  # - Already tracked servers whose last_heartbeat has not changed for longer
  #   than the threshold are dropped from tracking and marked as not responding.
  def monitor_servers_as_master
    _log.debug("Checking other servers as master server")
    @last_master = nil
    @last_servers ||= {}

    # Check all of the other servers and see if we have new servers, servers have stopped, or servers have stopped responding
    all_servers = my_server.find_other_started_servers_in_region

    current_ids = all_servers.collect(&:id)
    last_ids    = @last_servers.keys
    added       = current_ids - last_ids
    removed     = last_ids - current_ids
    # unchanged = current_ids & last_ids

    removed.each do |id|
      last_server = @last_servers.delete(id)
      rec = last_server[:record]
      _log.info("#{rec.format_full_log_msg} has been stopped or removed, and will no longer be monitored.")
      rec.deactivate_all_roles
    end

    all_servers.each do |s|
      if added.include?(s.id)
        _log.info("#{s.format_full_log_msg} has been started or added, and will now be monitored.")
        @last_servers[s.id] = {
          :last_hb_change => Time.now.utc,
          :record         => s
        }

        # We are running as master, so any other server that also carries the
        # master flag is the duplicate: demote that server, not ourselves.
        if s.is_master?
          _log.info("#{s.format_short_log_msg} has been detected as a second master and is being demoted.")
          s.update(:is_master => false)
        end

      else # unchanged
        last_server = @last_servers[s.id]
        rec = last_server[:record]
        _log.debug("Checking #{s.format_full_log_msg}. time_threshold [#{miq_server_time_threshold.seconds.ago.utc}] last_heartbeat changed [#{rec.last_heartbeat}] last_heartbeat [#{s.last_heartbeat}]")
        # Check if the server has updated or has not passed the threshold
        if rec.last_heartbeat != s.last_heartbeat || miq_server_time_threshold.seconds.ago.utc <= last_server[:last_hb_change]
          # Only move the change timestamp when the heartbeat value actually changed.
          last_server[:last_hb_change] = Time.now.utc if rec.last_heartbeat != s.last_heartbeat
          last_server[:record] = s
        else
          @last_servers.delete(s.id)
          s.mark_as_not_responding
        end
      end
    end
  end

  # Non-master-mode monitoring: tracks the region's master server and attempts
  # to take over when there is no master or the master's heartbeat has not
  # changed within the threshold.
  #
  # On a successful takeover the previous master (if any) is marked as not
  # responding, the other started servers are seeded into @last_servers, an
  # "evm_server_is_master" event is queued, and master-mode monitoring runs
  # immediately. On an unsuccessful takeover, tracking restarts against
  # whichever master the region now reports.
  def monitor_servers_as_non_master
    @last_servers  = {}
    @last_master ||= {}
    rec = @last_master[:record]

    parent = MiqRegion.my_region
    master = parent.find_master_server

    # Unary plus yields a mutable string so the appends below keep working
    # even if string literals are frozen.
    msg = +"Checking master MiqServer."
    msg << " There is no master server." if master.nil?
    msg << " time_threshold [#{miq_server_time_threshold.seconds.ago.utc}] last_heartbeat changed [#{@last_master[:last_hb_change]}] last_heartbeat [#{rec.last_heartbeat}]" unless master.nil? || rec.nil?
    _log.debug(msg)

    # Check if master is found; and has never been set, has changed, has heartbeated,
    #   or has not passed the threshold since the last heartbeat should have changed
    if !master.nil? && (@last_master.empty? || rec != master || rec.last_heartbeat != master.last_heartbeat || miq_server_time_threshold.seconds.ago.utc <= @last_master[:last_hb_change])
      @last_master[:last_hb_change] = Time.now.utc if rec.nil? || rec.last_heartbeat != master.last_heartbeat
      @last_master[:record] = master
    else
      _log.info("Master #{master.format_full_log_msg} has not responded in #{miq_server_time_threshold} seconds.") unless master.nil?
      make_master_server(@last_master.empty? ? nil : @last_master[:record])
      # The takeover may have been aborted, so re-read our own record to find out.
      if my_server.reload.is_master?
        master.mark_as_not_responding unless master.nil?
        @last_master = nil

        # Seed master-mode tracking with every other started server in the region.
        parent.miq_servers.each do |s|
          next unless s.status == 'started'
          next if     s.is_master?
          @last_servers[s.id] = {:last_hb_change => Time.now.utc, :record => s}
        end

        # Raise miq_server_is_master event
        master_msg = master && " from #{master.format_short_log_msg}"
        msg = "#{my_server.format_short_log_msg} has taken over master#{master_msg}"
        MiqEvent.raise_evm_event_queue_in_region(my_server, "evm_server_is_master", :event_details => msg)

        monitor_servers_as_master
      else
        # Another server won the takeover; restart the threshold clock against it.
        @last_master[:last_hb_change] = Time.now.utc
        @last_master[:record] = parent.find_master_server
      end
    end
  end
end