cloudamatic/mu

View on GitHub
modules/mu/mommacat/daemon.rb

Summary

Maintainability
A
3 hrs
Test Coverage
# Copyright:: Copyright (c) 2020 eGlobalTech, Inc., all rights reserved
#
# Licensed under the BSD-3 license (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License in the root of the project or at
#
#     http://egt-labs.com/mu/LICENSE.html
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

module MU

  # MommaCat is in charge of managing metadata about resources we've created,
  # as well as orchestrating amongst them and bootstrapping nodes outside of
  # the normal synchronous deploy sequence invoked by *mu-deploy*.
  class MommaCat

    # Check a provided deploy key against our stored version. The instance has
    # in theory accessed a secret via S3 and encrypted it with the deploy's
    # public key. If it decrypts correctly, we assume this instance is indeed
    # one of ours.
    # @param ciphertext [String]: The text to decrypt.
    # return [Boolean]: Whether the provided text was encrypted with the correct key
    def authKey(ciphertext)
      if @private_key.nil? or @deploy_secret.nil?
        MU.log "Missing auth metadata, can't authorize node in authKey", MU::ERR
        return false
      end
      my_key = OpenSSL::PKey::RSA.new(@private_key)

      begin
        if my_key.private_decrypt(ciphertext).force_encoding("UTF-8").chomp == @deploy_secret.force_encoding("UTF-8").chomp
          MU.log "Matched ciphertext for #{MU.deploy_id}", MU::INFO
          return true
        else
          MU.log "Mis-matched ciphertext for #{MU.deploy_id}", MU::ERR
          return false
        end
      rescue OpenSSL::PKey::RSAError => e
        MU.log "Error decrypting provided ciphertext using private key from #{deploy_dir}/private_key: #{e.message}", MU::ERR, details: ciphertext
        return false
      end
    end

    # Run {MU::Cloud::Server#postBoot} and {MU::Cloud::Server#groom} on a node.
    # @param cloud_id [OpenStruct]: The cloud provider's identifier for this node.
    # @param name [String]: The MU resource name of the node being created.
    # @param mu_name [String]: The full #{MU::MommaCat.getResourceName} name of the server we're grooming, if it's been initialized already.
    # @param type [String]: The type of resource that created this node (either *server* or *serverpool*).
    def groomNode(cloud_id, name, type, mu_name: nil, reraise_fail: false, sync_wait: true)
      if cloud_id.nil?
        raise GroomError, "MU::MommaCat.groomNode requires a {MU::Cloud::Server} object"
      end
      if name.nil? or name.empty?
        raise GroomError, "MU::MommaCat.groomNode requires a resource name"
      end
      if type.nil? or type.empty?
        raise GroomError, "MU::MommaCat.groomNode requires a resource type"
      end

      if !MU::MommaCat.lock(cloud_id+"-mommagroom", true)
        MU.log "Instance #{cloud_id} on #{MU.deploy_id} (#{type}: #{name}) is already being groomed, ignoring this extra request.", MU::NOTICE
        MU::MommaCat.unlockAll
        if !MU::MommaCat.locks.nil? and MU::MommaCat.locks.size > 0
          puts "------------------------------"
          puts "Open flock() locks:"
          pp MU::MommaCat.locks
          puts "------------------------------"
        end
        return
      end
      loadDeploy

      # XXX this is to stop Net::SSH from killing our entire stack when it
      # throws an exception. See ECAP-139 in JIRA. Far as we can tell, it's
      # just not entirely thread safe.
      Thread.handle_interrupt(Net::SSH::Disconnect => :never) {
        begin
          Thread.handle_interrupt(Net::SSH::Disconnect => :immediate) {
            MU.log "(Probably harmless) Caught a Net::SSH::Disconnect in #{Thread.current.inspect}", MU::DEBUG, details: Thread.current.backtrace
          }
        ensure
        end
      }

      if @original_config[type+"s"].nil?
        raise GroomError, "I see no configured resources of type #{type} (bootstrap request for #{name} on #{@deploy_id})"
      end
      kitten = nil

      kitten = findLitterMate(type: "server", name: name, mu_name: mu_name, cloud_id: cloud_id)
      if !kitten.nil?
        MU.log "Re-grooming #{mu_name}", details: kitten.deploydata
      else
        first_groom = true
        @original_config[type+"s"].each { |svr|
          if svr['name'] == name
            svr["instance_id"] = cloud_id

            # This will almost always be true in server pools, but lets be safe. Somewhat problematic because we are only
            # looking at deploy_id, but we still know this is our DNS record and not a custom one.
            if svr['dns_records'] && !svr['dns_records'].empty?
              svr['dns_records'].each { |dnsrec|
                if dnsrec.has_key?("name") && dnsrec['name'].start_with?(MU.deploy_id.downcase)
                  MU.log "DNS record for #{MU.deploy_id.downcase}, #{name} is probably wrong, deleting", MU::WARN, details: dnsrec
                  dnsrec.delete('name')
                  dnsrec.delete('target')
                end
              }
            end

            kitten = MU::Cloud::Server.new(mommacat: self, kitten_cfg: svr, cloud_id: cloud_id)
            mu_name = kitten.mu_name if mu_name.nil?
            MU.log "Grooming #{mu_name} for the first time", details: svr
            break
          end
        }
      end

      begin
        # This is a shared lock with MU::Cloud::AWS::Server.create, to keep from
        # stomping on synchronous deploys that are still running. This
        # means we're going to wait here if this instance is still being
        # bootstrapped by "regular" means.
        if !MU::MommaCat.lock(cloud_id+"-create", true)
          MU.log "#{mu_name} is still in mid-creation, skipping", MU::NOTICE
          MU::MommaCat.unlockAll
          if !MU::MommaCat.locks.nil? and MU::MommaCat.locks.size > 0
            puts "------------------------------"
            puts "Open flock() locks:"
            pp MU::MommaCat.locks
            puts "------------------------------"
          end
          return
        end
        MU::MommaCat.unlock(cloud_id+"-create")

        if !kitten.postBoot(cloud_id)
          MU.log "#{mu_name} is already being groomed, skipping", MU::NOTICE
          MU::MommaCat.unlockAll
          if !MU::MommaCat.locks.nil? and MU::MommaCat.locks.size > 0
            puts "------------------------------"
            puts "Open flock() locks:"
            pp MU::MommaCat.locks
            puts "------------------------------"
          end
          return
        end

        # This is a shared lock with MU::Deploy.createResources, simulating the
        # thread logic that tells MU::Cloud::AWS::Server.deploy to wait until
        # its dependencies are ready. We don't, for example, want to start
        # deploying if we rely on an RDS instance that isn't ready yet. We can
        # release this immediately, once we successfully grab it.
        MU::MommaCat.lock("#{kitten.cloudclass.name}_#{kitten.config["name"]}-dependencies")
        MU::MommaCat.unlock("#{kitten.cloudclass.name}_#{kitten.config["name"]}-dependencies")

        kitten.groom
      rescue StandardError => e
        MU::MommaCat.unlockAll
        if e.class.name != "MU::Cloud::AWS::Server::BootstrapTempFail" and !File.exist?(deploy_dir+"/.cleanup."+cloud_id) and !File.exist?(deploy_dir+"/.cleanup")
          MU.log "Grooming FAILED for #{kitten.mu_name} (#{e.inspect})", MU::ERR, details: e.backtrace
          sendAdminSlack("Grooming FAILED for `#{kitten.mu_name}` with `#{e.message}` :crying_cat_face:", msg: e.backtrace.join("\n"))
          sendAdminMail("Grooming FAILED for #{kitten.mu_name} on #{MU.appname} \"#{MU.handle}\" (#{MU.deploy_id})",
            msg: e.inspect,
            data: e.backtrace,
            debug: true
          )
          raise e if reraise_fail
        else
          MU.log "Grooming of #{kitten.mu_name} interrupted by cleanup or planned reboot"
        end
        return
      end

      if !@deployment['servers'].nil? and !sync_wait
        syncLitter(@deployment["servers"].keys, triggering_node: kitten)
      end
      MU::MommaCat.unlock(cloud_id+"-mommagroom")
      if MU.myCloud == "AWS"
        MU::Cloud::AWS.openFirewallForClients # XXX add the other clouds, or abstract
      end
      MU::MommaCat.getLitter(MU.deploy_id)
      MU::Master.syncMonitoringConfig(false)
      MU.log "Grooming complete for '#{name}' mu_name on \"#{MU.handle}\" (#{MU.deploy_id})"
      FileUtils.touch(MU.dataDir+"/deployments/#{MU.deploy_id}/#{name}_done.txt")
      MU::MommaCat.unlockAll
      if first_groom
        sendAdminSlack("Grooming complete for #{mu_name} :heart_eyes_cat:")
        sendAdminMail("Grooming complete for '#{name}' (#{mu_name}) on deploy \"#{MU.handle}\" (#{MU.deploy_id})")
      end
      return
    end

    @cleanup_threads = []

    # Iterate over all known deployments and look for instances that have been
    # terminated, but not yet cleaned up, then clean them up.
    def self.cleanTerminatedInstances(debug = false)
      loglevel = debug ? MU::NOTICE : MU::DEBUG
      MU::MommaCat.lock("clean-terminated-instances", false, true)
      MU.log "Checking for harvested instances in need of cleanup", loglevel
      parent_thread_id = Thread.current.object_id
      purged = 0

      MU::MommaCat.listDeploys.each { |deploy_id|
        next if File.exist?(deploy_dir(deploy_id)+"/.cleanup")
        MU.log "Checking for dead wood in #{deploy_id}", loglevel
        need_reload = false
        @cleanup_threads << Thread.new {
          MU.dupGlobals(parent_thread_id)
          deploy = MU::MommaCat.getLitter(deploy_id, set_context_to_me: true)
          purged_this_deploy = 0
            MU.log "#{deploy_id} has some kittens in it", loglevel, details: deploy.kittens.keys
          if deploy.kittens.has_key?("servers")
            MU.log "#{deploy_id} has some servers declared", loglevel, details: deploy.object_id
            deploy.kittens["servers"].values.each { |nodeclasses|
              nodeclasses.each_pair { |nodeclass, servers|
                deletia = []
                MU.log "Checking status of servers under '#{nodeclass}'", loglevel, details: servers.keys
                servers.each_pair { |mu_name, server|
                  server.describe
                  if !server.cloud_id
                    MU.log "Checking for presence of #{mu_name}, but unable to fetch its cloud_id", MU::WARN, details: server
                  elsif !server.active?
                    next if File.exist?(deploy_dir(deploy_id)+"/.cleanup-"+server.cloud_id)
                    deletia << mu_name
                    need_reload = true
                    MU.log "Cleaning up metadata for #{server} (#{nodeclass}), formerly #{server.cloud_id}, which appears to have been terminated", MU::NOTICE
                    begin
                      server.destroy
                      deploy.sendAdminMail("Retired metadata for terminated node #{mu_name}")
                      deploy.sendAdminSlack("Retired metadata for terminated node `#{mu_name}`")
                    rescue StandardError => e
                      MU.log "Saw #{e.message} while retiring #{mu_name}", MU::ERR, details: e.backtrace
                      next
                    end
                    MU.log "Cleanup of metadata for #{server} (#{nodeclass}), formerly #{server.cloud_id} complete", MU::NOTICE
                    purged = purged + 1
                    purged_this_deploy = purged_this_deploy + 1
                  end
                }
                deletia.each { |mu_name|
                  servers.delete(mu_name)
                }
                if purged_this_deploy > 0
                  # XXX triggering_node needs to take more than one node name
                  deploy.syncLitter(servers.keys, triggering_node: deletia.first)
                end
              }
            }
          end
          if need_reload
            MU.log "Saving modified deploy #{deploy_id}", loglevel
            deploy.save!
            MU::MommaCat.getLitter(deploy_id)
          end
          MU.purgeGlobals
        }
      }
      @cleanup_threads.each { |t|
        t.join
      }
      MU.log "cleanTerminatedInstances threads complete", loglevel
      MU::MommaCat.unlock("clean-terminated-instances", true)
      @cleanup_threads = []

      if purged > 0
        if MU.myCloud == "AWS"
          MU::Cloud::AWS.openFirewallForClients # XXX add the other clouds, or abstract
        end
        MU::Master.syncMonitoringConfig
        GC.start
      end
      MU.log "cleanTerminatedInstances returning", loglevel
    end

    # Path to the log file used by the Momma Cat daemon
    # @return [String]
    def self.daemonLogFile
      base = (Process.uid == 0 and !MU.localOnly) ? "/var" : MU.dataDir
      "#{base}/log/mu-momma-cat.log"
    end

    # Path to the PID file used by the Momma Cat daemon
    # @return [String]
    def self.daemonPidFile(root = false)
      base = ((Process.uid == 0 or root) and !MU.localOnly) ? "/var" : MU.dataDir
      "#{base}/run/mommacat.pid"
    end

        # Start the Momma Cat daemon and return the exit status of the command used
    # @return [Integer]
    def self.start
      if MU.inGem? and MU.muCfg['disable_mommacat']
        return
      end
      base = (Process.uid == 0 and !MU.localOnly) ? "/var" : MU.dataDir
      [base, "#{base}/log", "#{base}/run"].each { |dir|
       if !Dir.exist?(dir)
          MU.log "Creating #{dir}"
          Dir.mkdir(dir)
        end
      }
      if (Process.uid != 0 and
           (!$MU_CFG['overridden_keys'] or !$MU_CFG['overridden_keys'].include?("mommacat_port")) and
            status(true)
         ) or status
        return 0
      end
    
      File.unlink(daemonPidFile) if File.exists?(daemonPidFile)
      MU.log "Starting Momma Cat on port #{MU.mommaCatPort}, logging to #{daemonLogFile}, PID file #{daemonPidFile}"
      origdir = Dir.getwd
      Dir.chdir(MU.myRoot+"/modules")

      # XXX what's the safest way to find the 'bundle' executable in both gem and non-gem installs?
      if MU.inGem?
        cmd = %Q{thin --threaded --daemonize --port #{MU.mommaCatPort} --pid #{daemonPidFile} --log #{daemonLogFile} --ssl --ssl-key-file #{MU.muCfg['ssl']['key']} --ssl-cert-file #{MU.muCfg['ssl']['cert']} --ssl-disable-verify --tag mu-momma-cat -R mommacat.ru start}
      else
        cmd = %Q{bundle exec thin --threaded --daemonize --port #{MU.mommaCatPort} --pid #{daemonPidFile} --log #{daemonLogFile} --ssl --ssl-key-file #{MU.muCfg['ssl']['key']} --ssl-cert-file #{MU.muCfg['ssl']['cert']} --ssl-disable-verify --tag mu-momma-cat -R mommacat.ru start}
      end

      MU.log cmd, MU::NOTICE

      retries = 0
      begin
        output = %x{#{cmd}}
        sleep 1
        retries += 1
        if retries >= 10
          MU.log "MommaCat failed to start (command was #{cmd}, working directory #{MU.myRoot}/modules)", MU::WARN, details: output
          pp caller
          return $?.exitstatus
        end
      end while !status

      Dir.chdir(origdir)
    
      if $?.exitstatus != 0
        exit 1
      end

      return $?.exitstatus
    end

    @@notified_on_pid = {}

    # Return true if the Momma Cat daemon appears to be running
    # @return [Boolean]
    def self.status(root = false)
      if MU.inGem? and MU.muCfg['disable_mommacat']
        return true
      end
      if File.exist?(daemonPidFile(root))
        pid = File.read(daemonPidFile(root)).chomp.to_i
        begin
          Process.getpgid(pid)
          MU.log "Momma Cat running with pid #{pid.to_s}", (@@notified_on_pid[pid] ? MU::DEBUG : MU::INFO) # shush
          @@notified_on_pid[pid] = true
          return true
        rescue Errno::ESRCH
        end
      end
      MU.log "Momma Cat daemon not running", MU::NOTICE, details: daemonPidFile(root)
      false
    end
    
        # Stop the Momma Cat daemon, if it's running
    def self.stop
      if File.exist?(daemonPidFile)
        pid = File.read(daemonPidFile).chomp.to_i
        MU.log "Stopping Momma Cat with pid #{pid.to_s}"
        Process.kill("INT", pid)
        killed = false
        begin
          Process.getpgid(pid)
          sleep 1
        rescue Errno::ESRCH
          killed = true
        end while killed
        MU.log "Momma Cat with pid #{pid.to_s} stopped", MU::DEBUG, details: daemonPidFile
    
        begin
          File.unlink(daemonPidFile)
        rescue Errno::ENOENT
        end
      end
    end

        # (Re)start the Momma Cat daemon and return the exit status of the start command
    # @return [Integer]
    def self.restart
      stop
      start
    end

  end #class
end #module