bin/mu-node-manage
#!/usr/local/ruby-current/bin/ruby
# Copyright:: Copyright (c) 2014 eGlobalTech, Inc., all rights reserved
#
# Licensed under the BSD-3 license (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License in the root of the project or at
#
# http://egt-labs.com/mu/LICENSE.html
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
require 'optimist'
require 'json'
require File.realpath(File.expand_path(File.dirname(__FILE__)+"/mu-load-config.rb"))
require 'mu'
# Command-line interface definition. The parsed flags are stored in the global
# $opts hash, which the rest of this script consults.
$opts = Optimist.options do
  banner <<-EOS
Usage:
#{$0} [-c] [-w] [-l] [-d] [-a] [-e <environment>] [-p <platform>] [-m <mode>] [-o <chefopts>] [-x <command>] [ deploy_id|node_name [ ... ] ]
  EOS

  # Execution tuning
  opt :concurrent, "Max number of processes to run concurrently when invoking Chef or MommaCat on multiple nodes.", require: false, default: 10, type: :integer

  # Target selection
  opt :list, "Perform no action, but instead return a list of matching hosts. Default behavior with no other flags.", require: false, default: false, type: :boolean
  opt :deploys, "Operate on matching deploy IDs instead of node names.", require: false, default: false, type: :boolean
  opt :all, "Operate on all nodes/deploys. Use with caution.", require: false, default: false, type: :boolean
  opt :platform, "Operate exclusively on one nodes of a particular operating system. Can be used in conjunction with -a or -d. Valid platforms: linux, windows", require: false, type: :string
  opt :environment, "Operate exclusively on one nodes with a particular environment (e.g. dev, prod). Can be used in conjunction with -a or -d.", require: false, type: :string

  # What to do to the selected targets
  opt :override_chef_runlist, "An alternate runlist to pass to Chef, in groomeronly mode.", require: false, type: :string
  opt :xecute, "Run a shell command on matching nodes. Overrides --mode and suppresses some informational output in favor of scriptability.", require: false, type: :string
  opt :mode, "Action to perform on matching nodes. Valid actions: groom, groomeronly, awsmeta, vaults, certs, chefupgrade", require: false, default: "groomeronly", type: :string

  # Output and transport
  opt :verbose, "Show output from Chef runs, etc", require: false, default: false, type: :boolean
  opt :winrm, "Force WinRM connection. Disable SSH fallback", require: false, default: false, type: :boolean
  opt :info, "List a particular node attribute", require: false, default: 'nodename', type: :string
end
# Post-process and sanity-check the parsed command-line options.
MU.setLogging(MU::Logger::LOUD) if $opts[:verbose]

# "chefrun" is accepted as another spelling of "groomeronly".
$opts[:mode] = "groomeronly" if $opts[:mode] == "chefrun"

valid_modes = %w[groom groomeronly vaults userdata awsmeta certs chefupgrade]
unless valid_modes.include?($opts[:mode])
  Optimist.die(:mode, "--mode must be one of: groom, groomeronly, awsmeta, vaults, certs, chefupgrade")
end

if $opts[:platform] && !%w[linux windows].include?($opts[:platform])
  Optimist.die(:platform, "--platform must be one of: linux, windows")
end

# When no action was asked for (no command, no runlist, no explicit --mode),
# fall back to simply listing the matching nodes.
action_requested = $opts[:xecute] || $opts[:override_chef_runlist] || $opts[:mode_given]
$opts[:list] = true unless action_requested

# Supplying an alternate runlist counts as having chosen a mode.
$opts[:mode_given] = true if $opts[:override_chef_runlist] && !$opts[:mode_given]

# Nothing to select on and not listing: show usage and bail out.
nothing_selected = ARGV.empty? && !$opts[:all] && !$opts[:platform] && !$opts[:environment] && !$opts[:list]
if nothing_selected
  Optimist.educate
  exit 1
end
# Process-wide signal plumbing. The master process relays signals to the
# children it has forked; forked children inherit these handlers but do nothing
# in them because their pid differs from master_pid.
Thread.abort_on_exception = true

master_pid = Process.pid
# pid => node name for every child process forked by this script.
$children = {}

# Ruby 2.3 doesn't want to trap these
untrappable = %w[ILL FPE KILL BUS SEGV STOP VTALRM]

Signal.list.each_pair { |sig, signum|
  next if untrappable.include?(sig)

  begin
    Signal.trap(signum) do
      next unless Process.pid == master_pid

      # Terminating signals become a hard KILL for the children (aka --dammit);
      # anything else is passed through unchanged.
      relay = %w[INT TERM EXIT ABRT].include?(sig) ? "KILL" : sig

      # Iterate over a snapshot of the pids: the main program may be adding or
      # removing entries at the moment the signal arrives.
      $children.keys.each { |pid|
        begin
          Process.kill(relay, pid)
        rescue Errno::ESRCH
          # The child has already exited; there is nothing left to signal.
        end
      }

      Process.waitall if %w[INT TERM EXIT].include?(sig)
    end
  rescue ArgumentError, Errno::EINVAL
    # This Ruby/OS refuses to let us trap this signal; leave its default
    # behaviour in place instead of aborting at startup.
  end
}
# Run through our filters so we can pass flat lists into our methods that
# actually do things.
#
# Produces two top-level lists:
#   do_deploys - deploy IDs to load
#   do_nodes   - sorted node names to act on
# In --list mode the matches are printed and the script exits here.
avail_deploys = MU::MommaCat.listDeploys
do_deploys = []
do_nodes = []
ok = true

# A node name that starts with something shaped like a deploy ID lets us load
# only that deploy instead of all of them.
deploy_id_prefix = /^(.*?-[^\-]+?-\d{10}-[A-Z]{2})-.*/

if $opts[:all] or (ARGV.size == 0 and !$opts[:deploys])
  do_deploys = avail_deploys
elsif $opts[:deploys]
  # Arguments are (case-insensitive, substring) deploy ID patterns.
  ARGV.each { |arg|
    matched = avail_deploys.select { |deploy| deploy.match(/#{Regexp.quote(arg)}/i) }
    if matched.size == 0
      MU.log "Deploy pattern '#{arg}' doesn't appear to match anything", MU::ERR
    end
    do_deploys = do_deploys.concat(matched).uniq
  }
else
  # Arguments are node name patterns. Just load the deploys we need, when the
  # patterns tell us which ones those are; otherwise search every deploy.
  do_nodes = ARGV
  do_deploys = do_nodes.select { |node|
    node.match(deploy_id_prefix)
  }.map { |node|
    node.sub(deploy_id_prefix, '\1')
  }.uniq
  do_deploys = avail_deploys if do_deploys.empty?
end

# Remember whether the caller narrowed the selection. The action methods treat
# an empty node list as "every node in these deploys", so a filter that matches
# nothing must not be allowed to reach them.
filters_applied = (do_nodes.size > 0 or !$opts[:platform].nil? or !$opts[:environment].nil?)

linux_platforms = %w{centos centos6 centos7 ubuntu ubuntu14 rhel rhel7 rhel71 linux amazon}
windows_platforms = %w{win2k12r2 win2k12 win2k8 win2k8r2}

avail_nodes = []
@avail_node_attributes = []
do_deploys.each { |muid|
  mommacat = MU::MommaCat.new(muid, skip_resource_objects: true)
  mommacat.listNodes.each_pair { |nodename, server|
    next if server.nil? or server['conf'].nil?
    conf = server['conf']
    # Collapse specific OS names into the two families --platform understands.
    conf["platform"] = "linux" if !conf.has_key?("platform") or linux_platforms.include?(conf["platform"])
    conf["platform"] = "windows" if windows_platforms.include?(conf["platform"])
    next if !$opts[:platform].nil? and conf["platform"] != $opts[:platform]
    next if !$opts[:environment].nil? and MU.environment.upcase != $opts[:environment].upcase
    avail_nodes << nodename
    @avail_node_attributes << server
  }
}

if do_nodes.size > 0
  # Case-insensitive substring match of each requested pattern.
  patterns = do_nodes.map { |pattern| /#{Regexp.quote(pattern)}/i }
  do_nodes = avail_nodes.select { |node|
    patterns.any? { |pattern| node.match(pattern) }
  }
else
  do_nodes = avail_nodes
end
do_nodes.sort!

if $opts[:list]
  if $opts[:info].eql? 'nodename'
    puts do_nodes
  else
    do_nodes.each do |node|
      @avail_node_attributes.each do |attr|
        if attr['nodename'].eql? node
          puts "#{attr['nodename']}: #{attr[$opts[:info]]}"
        end
      end
    end
  end
  exit
end

if filters_applied and do_nodes.empty?
  MU.log "No nodes matched the requested node names, platform or environment; not performing any action", MU::ERR
  ok = false
end
exit 1 if !ok
# Re-run grooming for the servers of the given deploys, one forked child
# process per server.
#
# deploys     - deploy IDs to load (each is passed to MU::MommaCat.new).
# nodes       - node names to restrict the run to; an empty list means every
#               server found in the deploys.
# vaults_only - only grant each server access to the vault items listed under
#               "vault_access" in its config.
# groomeronly - only invoke the server's groomer, rather than a full
#               MommaCat#groomNode.
#
# Firewall rules of every selected server are groomed in the parent before the
# child is forked. Nodes whose child exits non-zero are reported in a single
# warning at the end.
def reGroom(deploys = MU::MommaCat.listDeploys, nodes = [], vaults_only: false, groomeronly: false)
  badnodes = []
  # Same ceiling as before (--concurrent minus one), but never below one slot:
  # with a ceiling of zero the throttle loop would wait on children that do
  # not exist and die with Errno::ECHILD.
  max_children = [$opts[:concurrent].to_i - 1, 1].max

  deploys.each { |muid|
    mommacat = MU::MommaCat.new(muid)
    next if mommacat.kittens.nil? or mommacat.kittens['servers'].nil?
    mommacat.kittens['servers'].each_pair { |_habitat, nodeclasses|
      nodeclasses.each_pair { |nodeclass, servers|
        servers.each_pair { |mu_name, server|
          next if nodes.size > 0 and !nodes.include?(mu_name)
          server.myFirewallRules.each { |fw|
            fw.groom
          }

          child = Process.fork {
            begin
              type = server.config.has_key?("basis") ? "server_pool" : "server"
              if vaults_only
                if server.config.has_key?("vault_access")
                  server.config["vault_access"].each { |v|
                    MU::Groomer::Chef.grantSecretAccess(mu_name, v['vault'], v['item'])
                  }
                end
              elsif groomeronly
                server.groomer.run
              else
                mommacat.groomNode(server.cloud_id, nodeclass, type, mu_name: mu_name)
              end
            rescue Exception => e
              # Deliberate catch-all: this is the top of the child process, and
              # any failure must surface as a logged error and non-zero exit.
              MU.log e.inspect, MU::ERR, details: e.backtrace
              exit 1
            end
          }
          $children[child] = mu_name

          # Throttle after every fork, so a node class with many servers
          # cannot exceed the concurrency ceiling.
          while $children.size >= max_children
            begin
              finished = Process.wait
            rescue Errno::ECHILD
              # Nothing left to wait for (e.g. a signal handler already reaped
              # the children); drop the stale bookkeeping and carry on.
              $children.clear
              break
            end
            badnodes << $children[finished] if !$?.success?
            $children.delete(finished)
          end
        }
      }
    }
  }

  Process.waitall.each { |finished, status|
    badnodes << $children[finished] if !status.success?
    # Forget reaped pids so the signal handlers never signal a recycled pid.
    $children.delete(finished)
  }

  if badnodes.size > 0
    MU.log "Not all Momma Cat runs exited cleanly", MU::WARN, details: badnodes
  end
end
# Run a shell command on the nodes of the given deploys, one forked child
# process per node.
#
# deploys      - deploy IDs to load (each is passed to MU::MommaCat.new).
# nodes        - node names to restrict the run to; an empty list means every
#                node found in the deploys.
# cmd          - the command line to execute remotely.
# print_output - print each node's output to stdout rather than attaching it to
#                log messages.
# noop         - accepted for interface compatibility; not used here.
#
# Windows nodes are tried over WinRM first, falling back to SSH unless --winrm
# was given; all other nodes use SSH. Each child exits with the remote exit
# code on failure, and the parent reports the failed nodes in one warning.
def runCommand(deploys = MU::MommaCat.listDeploys, nodes = [], cmd = nil, print_output: $opts[:verbose], noop: false)
  badnodes = []
  count = 0
  # Same ceiling as before (--concurrent minus one), but never below one slot:
  # with a ceiling of zero the throttle loop would wait on children that do
  # not exist and die with Errno::ECHILD.
  max_children = [$opts[:concurrent].to_i - 1, 1].max
  # Upper bound on retries of the transient Windows/Chef ConnectServer error,
  # which was previously retried forever.
  max_transient_retries = 5

  deploys.each { |muid|
    mommacat = MU::MommaCat.new(muid)
    mommacat.listNodes.each_pair { |nodename, server|
      next if server.nil? or server['conf'].nil?
      server['conf']["platform"] = "linux" if !server['conf'].has_key?("platform")
      next if nodes.size > 0 and !nodes.include?(nodename)
      count = count + 1

      child = Process.fork {
        serverobj = mommacat.findLitterMate(type: "server", mu_name: nodename)
        if !serverobj
          MU.log "Failed to load server object for #{nodename}", MU::ERR
          # Exit non-zero so the parent counts this node as failed.
          exit 1
        end

        output = nil
        exec_protocol = nil
        transient_failures = 0
        done = false
        until done
          MU.log "Running '#{cmd}' on #{nodename} (##{count})" if !print_output

          # Set Variables to catch the output and exit code of the execution
          output = nil
          exitcode = -1

          # Determine which protocols to attempt
          attempt_winrm = serverobj.windows? ? true : false
          attempt_ssh = !(attempt_winrm and $opts[:winrm])

          # Attempt WinRM Connection, and Fall back to SSH
          if attempt_winrm
            exec_protocol = 'WinRM'
            resp = nil
            begin
              shell = serverobj.getWinRMSession(0, timeout: 10, winrm_retries: 1)
              resp = shell.run(cmd)
            rescue MU::MuError
              # Could not connect or execute; resp stays nil and is treated as
              # a WinRM failure below.
            end

            if resp
              output = resp.stdout if resp.stdout
              exitcode = resp.exitcode if resp.exitcode
              if exitcode.eql? 0
                # WINRM CONNECTION AND EXECUTION SUCCESS
                attempt_ssh = false
              else
                puts resp.stderr if resp.stderr
                puts output
              end
            end

            if exitcode != 0
              if attempt_ssh
                MU.log "#{nodename} WinRM exec failed, trying SSH", MU::NOTICE
              else
                MU.log "#{nodename} WinRM exec failed, NOT trying SSH", MU::WARN
              end
            end
          end

          if attempt_ssh
            exec_protocol = 'SSH'
            # this should use getSSHSession, for the sake of symmetry
            # NOTE(review): cmd is interpolated inside double quotes, so it is
            # first expanded by the local shell and breaks on embedded double
            # quotes. Left unchanged because callers may rely on it.
            output = `ssh -q #{nodename} "#{cmd}" 2>&1 < /dev/null`
            exitcode = $?.exitstatus
          end

          if exitcode == 0
            MU.log "#{nodename} complete via #{exec_protocol}"
            done = true
          else
            transient = (output and serverobj.windows? and output.match(/NoMethodError: unknown property or method: `ConnectServer'/))
            if transient and transient_failures < max_transient_retries
              transient_failures += 1
              MU.log "#{nodename} encountered transient Windows/Chef ConnectServer error, retrying", MU::WARN
            else
              if output and print_output
                puts "#{nodename} - #{output}" if output.match(/[^\s]/)
                MU.log "#{nodename} did not exit cleanly", MU::WARN
              elsif output
                # Attach at most the last 2000 characters. A plain
                # slice(-2000, 2000) returns nil for shorter output.
                MU.log "#{nodename} did not exit cleanly", MU::WARN, details: (output[-2000..-1] || output)
              else
                MU.log "#{nodename} did not exit cleanly", MU::WARN
              end
              # exitstatus is nil when ssh died from a signal.
              exit(exitcode || 1)
            end
          end
        end

        # output is nil when the command succeeded without producing any.
        puts "#{nodename} - #{output}" if print_output and output and output.match(/[^\s]/)
      }
      $children[child] = nodename

      while $children.size >= max_children
        begin
          finished = Process.wait
        rescue Errno::ECHILD
          # Nothing left to wait for (e.g. a signal handler already reaped the
          # children); drop the stale bookkeeping and carry on.
          $children.clear
          break
        end
        badnodes << $children[finished] if !$?.success?
        $children.delete(finished)
      end
    }
  }

  Process.waitall.each { |finished, status|
    badnodes << $children[finished] if !status.success?
    # Forget reaped pids so the signal handlers never signal a recycled pid.
    $children.delete(finished)
  }

  if badnodes.size > 0
    # Only call it "Chef" when this really was a Chef run; with --xecute the
    # default mode is still "groomeronly" but an arbitrary command was run.
    label = cmd
    label = "Chef" if $opts[:mode] == "groomeronly" and !$opts[:xecute]
    if !print_output
      MU.log "Not all `#{label}` runs exited cleanly", MU::WARN, details: badnodes
    else
      MU.log "Not all `#{label}` runs exited cleanly", MU::WARN
    end
  end
end
# Bring the AWS-side userdata of the given deploys in line with what Mu would
# generate today.
#
# deploys - deploy IDs to load (each is passed to MU::MommaCat.new).
# nodes   - node names to restrict the work to; an empty list means no
#           restriction.
#
# For every deploy this does two passes:
#   1. For each "server_pools" entry in the deploy's original config, groom the
#      pool, regenerate its userdata, and replace the Autoscale group's Launch
#      Configuration if the userdata, AMI, EBS optimization, instance size or
#      monitoring setting differ.
#   2. For each node of the deploy, regenerate its userdata and, if it differs
#      and the instance is stopped, write it back with modify_instance_attribute.
# Deploys without original config data are skipped with a warning.
def updateAWSMetaData(deploys = MU::MommaCat.listDeploys, nodes = [])
deploys.each { |muid|
mommacat = MU::MommaCat.new(muid)
if mommacat.original_config.nil?
MU.log "Failed to locate original config data for #{muid}", MU::WARN
next
end
# Clean up the userdata of matching Autoscale groups by replacing their
# Launch Configurations with new ones,
if mommacat.original_config.has_key?("server_pools")
mommacat.original_config['server_pools'].each { |server|
svr_class = server['name']
server["platform"] = "linux" if !server.has_key?("platform")
pool_name = mommacat.getResourceName(svr_class)
# When a node list was given, only touch pools that at least one requested
# node name belongs to, i.e. names of the form "<pool_name>-xxx" with a
# three-character alphanumeric suffix.
if nodes.size > 0
matched = false
nodes.each { |n|
if n.match(/^#{Regexp.quote(pool_name)}-[a-z0-9]{3}$/i)
matched = true
end
}
next if !matched
end
# MU::Cloud::AWS::Server.createIAMProfile(pool_name, base_profile: server['iam_role'], extra_policies: server['iam_policies'])
pool_obj = mommacat.findLitterMate(type: "server_pool", mu_name: pool_name)
pool_obj.groom
resp = MU::Cloud::AWS.autoscale.describe_auto_scaling_groups(
auto_scaling_group_names: [pool_name]
)
if resp.nil?
MU.log "Failed to locate any Autoscale Groups named #{pool_name}", MU::WARN
next
end
resp.auto_scaling_groups.each { |asg|
# Current launch configuration, used both for comparison and as the source
# of the settings that are carried over unchanged (key, security groups,
# IAM profile, public IP association).
launch = MU::Cloud::AWS.autoscale.describe_launch_configurations(
launch_configuration_names: [asg.launch_configuration_name]
).launch_configurations.first
olduserdata = Base64.decode64(launch.user_data)
userdata = MU::Cloud::AWS::Server.fetchUserdata(
platform: server["platform"],
template_variables: {
"deployKey" => Base64.urlsafe_encode64(mommacat.public_key),
"deploySSHKey" => mommacat.ssh_public_key,
"muID" => muid,
"muUser" => MU.chef_user,
"mommaCatPort" => MU.mommaCatPort,
"publicIP" => MU.mu_public_ip,
"resourceName" => svr_class,
"windowsAdminName" => server['windows_admin_username'],
"skipApplyUpdates" => server['skipinitialupdates'],
"resourceType" => "server_pool"
},
custom_append: server['userdata_script']
)
# Figure out which devices are embedded in the AMI already.
image = MU::Cloud::AWS.ec2.describe_images(image_ids: [server["basis"]["launch_config"]["ami_id"]]).images.first
if image.nil?
MU.log "#{server["basis"]["launch_config"]["ami_id"]} does not exist, skipping launch config #{asg.launch_configuration_name}", MU::ERR
next
end
# device name => EBS settings of the disks baked into the AMI. The
# :encrypted flag is dropped for disks that come from a snapshot.
ext_disks = {}
if !image.block_device_mappings.nil?
image.block_device_mappings.each { |disk|
if !disk.device_name.nil? and !disk.device_name.empty? and !disk.ebs.nil? and !disk.ebs.empty?
ext_disks[disk.device_name] = MU.structToHash(disk.ebs)
if ext_disks[disk.device_name].has_key?(:snapshot_id)
ext_disks[disk.device_name].delete(:encrypted)
end
end
}
end
# Block device mappings for the new launch configuration: the configured
# volumes (minus "encrypted" where the AMI's disk is snapshot-backed),
# followed by the ephemeral mappings.
storage = []
if !server["basis"]["launch_config"]["storage"].nil?
server["basis"]["launch_config"]["storage"].each { |vol|
if ext_disks.has_key?(vol["device"])
if ext_disks[vol["device"]].has_key?(:snapshot_id)
vol.delete("encrypted")
end
end
mapping, cfm_mapping = MU::Cloud::AWS::Server.convertBlockDeviceMapping(vol)
storage << mapping
}
end
storage.concat(MU::Cloud::AWS::Server.ephemeral_mappings)
# need_update is only ever assigned inside this condition, so it is nil
# (falsy) when nothing differs.
if userdata != olduserdata or
launch.image_id != server["basis"]["launch_config"]["ami_id"] or
launch.ebs_optimized != server["basis"]["launch_config"]["ebs_optimized"] or
launch.instance_type != server["basis"]["launch_config"]["size"] or
launch.instance_monitoring.enabled != server["basis"]["launch_config"]["monitoring"]
# launch.block_device_mappings != storage
# XXX block device comparison isn't this simple
need_update = true
end
next if !need_update
# Put our Autoscale group onto a temporary launch config
begin
MU::Cloud::AWS.autoscale.create_launch_configuration(
launch_configuration_name: pool_name+"-TMP",
user_data: Base64.encode64(userdata),
image_id: server["basis"]["launch_config"]["ami_id"],
key_name: launch.key_name,
security_groups: launch.security_groups,
instance_type: server["basis"]["launch_config"]["size"],
block_device_mappings: storage,
instance_monitoring: {:enabled => server["basis"]["launch_config"]["monitoring"]},
iam_instance_profile: launch.iam_instance_profile,
ebs_optimized: server["basis"]["launch_config"]["ebs_optimized"],
associate_public_ip_address: launch.associate_public_ip_address
)
rescue ::Aws::AutoScaling::Errors::ValidationError => e
if e.message.match(/Member must have length less than or equal to (\d+)/)
MU.log "Userdata script too long updating #{pool_name} Launch Config (#{Base64.encode64(userdata).size.to_s}/#{Regexp.last_match[1]} bytes)", MU::ERR
else
MU.log "Error updating #{pool_name} Launch Config", MU::ERR, details: e.message
end
next
end
MU::Cloud::AWS.autoscale.update_auto_scaling_group(
auto_scaling_group_name: pool_name,
launch_configuration_name: pool_name+"-TMP"
)
# ...now back to an identical one with the "real" name
# NOTE(review): only the first create call is wrapped in a rescue; a failure
# in any of the following calls propagates and would leave the group on the
# "-TMP" launch configuration — confirm that is acceptable.
MU::Cloud::AWS.autoscale.delete_launch_configuration(
launch_configuration_name: pool_name
)
MU::Cloud::AWS.autoscale.create_launch_configuration(
launch_configuration_name: pool_name,
user_data: Base64.encode64(userdata),
image_id: server["basis"]["launch_config"]["ami_id"],
key_name: launch.key_name,
security_groups: launch.security_groups,
instance_type: server["basis"]["launch_config"]["size"],
block_device_mappings: storage,
instance_monitoring: {:enabled => server["basis"]["launch_config"]["monitoring"]},
iam_instance_profile: launch.iam_instance_profile,
ebs_optimized: server["basis"]["launch_config"]["ebs_optimized"],
associate_public_ip_address: launch.associate_public_ip_address
)
MU::Cloud::AWS.autoscale.update_auto_scaling_group(
auto_scaling_group_name: pool_name,
launch_configuration_name: pool_name
)
MU::Cloud::AWS.autoscale.delete_launch_configuration(
launch_configuration_name: pool_name+"-TMP"
)
MU.log "Launch Configuration #{asg.launch_configuration_name} replaced"
}
}
end
# Update the userdata of live nodes. They must be in the Stopped state for
# us to do so.
mommacat.listNodes.each_pair { |nodename, server|
if server['conf'].nil?
MU.log "Failed to find config data for server #{nodename}", MU::WARN
next
end
# Prefer 'cloud_id', falling back to 'instance_id' when it is absent.
id = server['cloud_id']
id = server['instance_id'] if id.nil?
# NOTE(review): this EC2 lookup runs before the node filter two lines below,
# so it is also made for nodes that are then skipped — confirm whether the
# filter should come first.
desc = MU::Cloud::AWS.ec2(region: server['region']).describe_instances(instance_ids: [id]).reservations.first.instances.first
server['conf']["platform"] = "linux" if !server['conf'].has_key?("platform")
next if nodes.size > 0 and !nodes.include?(nodename)
# Pool members are only classified here; standalone servers are also groomed.
mytype = "server"
if server['conf'].has_key?("basis") or
server['conf']['#TYPENAME'] == "ServerPool" or
server['conf']["#MU_CLASS"] == "MU::Cloud::AWS::ServerPool"
mytype = "server_pool"
else
server_obj = mommacat.findLitterMate(type: "server", mu_name: nodename)
server_obj.groom
end
olduserdata = Base64.decode64(MU::Cloud::AWS.ec2(region: server['region']).describe_instance_attribute(
instance_id: id,
attribute: "userData"
).user_data.value)
# NOTE(review): unlike the server pool pass above, no "mommaCatPort" template
# variable is passed here, and custom_append is read from the node record
# itself rather than from server['conf'] like the other settings — confirm
# both are intentional.
userdata = MU::Cloud::AWS::Server.fetchUserdata(
platform: server['conf']["platform"],
template_variables: {
"deployKey" => Base64.urlsafe_encode64(mommacat.public_key),
"deploySSHKey" => mommacat.ssh_public_key,
"muID" => muid,
"muUser" => MU.chef_user,
"publicIP" => MU.mu_public_ip,
"resourceName" => server['conf']['name'],
"windowsAdminName" => server['conf']['windows_admin_username'],
"skipApplyUpdates" => server['conf']['skipinitialupdates'],
"resourceType" => mytype
},
custom_append: server['userdata_script']
)
if userdata == olduserdata
MU.log "#{nodename} has up-to-date userdata, skipping", MU::DEBUG
next
end
if desc.state.name != "stopped"
MU.log "#{nodename} needs a userdata update, but is not in Stopped state", MU::NOTICE
if mytype == "server_pool"
pool_name = mommacat.getResourceName(server['conf']['name'])
MU.log "Note: Be sure to pause Autoscaling for this group before stopping this instance, e.g. with: aws autoscaling suspend-processes --auto-scaling-group-name #{pool_name}", MU::WARN
end
next
end
MU.log "Updating #{nodename} userdata (#{server["conf"]["platform"]})"
begin
MU::Cloud::AWS.ec2(region: server['region']).modify_instance_attribute(
instance_id: id,
attribute: "userData",
value: Base64.encode64(userdata)
)
rescue ::Aws::EC2::Errors::InvalidParameterValue => e
# NOTE(review): the size reported here is that of the raw userdata, whereas
# the launch configuration error above reports the Base64-encoded size.
if e.message.match(/User data is limited to (\d+)/)
MU.log "Userdata script too long updating #{nodename} (#{userdata.size.to_s}/#{Regexp.last_match[1]} bytes)", MU::ERR
else
MU.log "Error replacing userData on #{nodename}", MU::ERR, details: e.message
end
end
}
}
end
# Ask Momma Cat to (re)generate SSL certificates for the nodes of the given
# deploys, via MommaCat#nodeSSLCerts.
#
# deploys     - deploy IDs to load (each is passed to MU::MommaCat.new).
# nodes       - node names to restrict the run to; an empty list means every
#               node found in the deploys.
# vaults_only - accepted for interface compatibility; not used here.
#
# Selected nodes without config data, or whose server object cannot be loaded,
# are skipped with a log message.
def sslCerts(deploys = MU::MommaCat.listDeploys, nodes = [], vaults_only: false)
  deploys.each { |muid|
    mommacat = MU::MommaCat.new(muid)
    mommacat.listNodes.each_pair { |nodename, server|
      next if nodes.size > 0 and !nodes.include?(nodename)

      # This warning used to sit behind an earlier silent skip and could never
      # be reached.
      if server.nil? or server['conf'].nil?
        MU.log "Failed to find config data for server #{nodename}", MU::WARN
        next
      end
      server['conf']["platform"] = "linux" if !server['conf'].has_key?("platform")

      server_obj = mommacat.findLitterMate(type: "server", mu_name: nodename)
      if server_obj.nil?
        MU.log "Failed to load server object for #{nodename}", MU::ERR
        next
      end
      mommacat.nodeSSLCerts(server_obj)
    }
  }
end
# Reinstall the groomer (Chef) client on the nodes of the given deploys, one
# forked child process per node, via the server object's groomer.reinstall.
#
# deploys - deploy IDs to load (each is passed to MU::MommaCat.new).
# nodes   - node names to restrict the run to; an empty list means every node
#           found in the deploys.
#
# Nodes whose child exits non-zero are reported in a single warning at the end.
def chefUpgrade(deploys = MU::MommaCat.listDeploys, nodes = [])
  badnodes = []
  # Same ceiling as before (--concurrent minus one), but never below one slot:
  # with a ceiling of zero the throttle loop would wait on children that do
  # not exist and die with Errno::ECHILD.
  max_children = [$opts[:concurrent].to_i - 1, 1].max

  deploys.each { |muid|
    mommacat = MU::MommaCat.new(muid)
    mommacat.listNodes.each_pair { |nodename, server|
      next if nodes.size > 0 and !nodes.include?(nodename)

      # This warning used to sit behind an earlier silent skip and could never
      # be reached.
      if server.nil? or server['conf'].nil?
        MU.log "Failed to find config data for server #{nodename}", MU::WARN
        next
      end
      server['conf']["platform"] = "linux" if !server['conf'].has_key?("platform")

      child = Process.fork {
        server_obj = mommacat.findLitterMate(type: "server", mu_name: nodename)
        if server_obj.nil?
          MU.log "Failed to load server object for #{nodename}", MU::ERR
          exit 1
        end
        begin
          server_obj.groomer.reinstall
        rescue Exception => e
          # Failures used to be swallowed here, so every child exited 0 and the
          # summary below could never fire. Report them the way reGroom does.
          MU.log e.inspect, MU::ERR, details: e.backtrace
          exit 1
        end
      }
      $children[child] = nodename

      while $children.size >= max_children
        begin
          finished = Process.wait
        rescue Errno::ECHILD
          # Nothing left to wait for (e.g. a signal handler already reaped the
          # children); drop the stale bookkeeping and carry on.
          $children.clear
          break
        end
        badnodes << $children[finished] if !$?.success?
        $children.delete(finished)
      end
    }
  }

  Process.waitall.each { |finished, status|
    badnodes << $children[finished] if !status.success?
    # Forget reaped pids so the signal handlers never signal a recycled pid.
    $children.delete(finished)
  }

  if badnodes.size > 0
    MU.log "Not all Chef upgrades exited cleanly", MU::WARN, details: badnodes
  end
end
# Dispatch: perform the requested action on the selected deploys and nodes.
# --xecute takes precedence over whatever --mode says.
if $opts[:xecute]
  runCommand(do_deploys, do_nodes, $opts[:xecute], print_output: true)
else
  case $opts[:mode]
  when "certs"
    sslCerts(do_deploys, do_nodes)
  when "groom"
    reGroom(do_deploys, do_nodes)
  when "vaults"
    reGroom(do_deploys, do_nodes, vaults_only: true)
  when "chefupgrade"
    chefUpgrade(do_deploys, do_nodes)
  when "groomeronly"
    if $opts[:override_chef_runlist]
      # The runCommand-based implementation of this option is disabled:
      #   runCommand(do_deploys, do_nodes, chef_runlist: $opts[:override_chef_runlist], groomeronly: true, print_output: print_output)
      # Say so and fail, rather than exiting successfully having done nothing.
      MU.log "--override-chef-runlist is not currently supported; no Chef run was performed", MU::ERR
      exit 1
    else
      reGroom(do_deploys, do_nodes, groomeronly: true)
    end
  when "userdata", "awsmeta"
    # Need Google equiv and to select nodes correctly based on what cloud they're in
    updateAWSMetaData(do_deploys, do_nodes)
  end
end