crowbar_framework/app/models/api/upgrade.rb
#
# Copyright 2016, SUSE LINUX GmbH
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
require "open3"
module Api
class Upgrade < Tableless
class << self
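# Timeout values used when waiting for the individual upgrade scripts and actions on the nodes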
def timeouts
::Crowbar::UpgradeTimeouts.new.values
end
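# Overall progress of the upgrade as tracked by the upgrade status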
def status
::Crowbar::UpgradeStatus.new.progress
end
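# Lists of non-admin nodes that have already been upgraded and of those that have not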
def node_status
not_upgraded = []
upgraded = []
if ::Crowbar::UpgradeStatus.new.finished?
upgraded = ::Node.all.reject(&:admin?).map(&:name)
elsif ::Crowbar::UpgradeStatus.new.passed?(:services)
::Node.all.reject(&:admin?).each do |n|
if n.upgraded?
upgraded << n.name
else
not_upgraded << n.name
end
end
else
not_upgraded = ::Node.all.reject(&:admin?).map(&:name)
end
{
upgraded: upgraded,
not_upgraded: not_upgraded
}
end
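# Currently selected upgrade mode (normal or non_disruptive)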
def upgrade_mode
::Crowbar::UpgradeStatus.new.upgrade_mode
end
def upgrade_mode=(mode)
unless ["normal", "non_disruptive"].include? mode
raise ::Crowbar::Error::SaveUpgradeModeError, "Invalid upgrade mode #{mode}. " \
"Valid upgrade modes are: 'normal' and 'non_disruptive'."
end
Rails.logger.debug("Setting upgrade mode #{mode}")
::Crowbar::UpgradeStatus.new.save_selected_upgrade_mode(mode.to_sym)
end
#
# prechecks
#
def checks
upgrade_status = ::Crowbar::UpgradeStatus.new
upgrade_status.start_step(:prechecks)
{}.tap do |ret|
ret[:checks] = {}
network = ::Crowbar::Sanity.check
ret[:checks][:network_checks] = {
required: true,
passed: network.empty?,
errors: network.empty? ? {} : sanity_check_errors(network)
}
health_check = Api::Crowbar.health_check
ret[:checks][:cloud_healthy] = {
required: true,
passed: health_check.empty?,
errors: health_check.empty? ? {} : health_check_errors(health_check)
}
deployment = Api::Crowbar.deployment_check
ret[:checks][:cloud_deployment] = {
required: true,
passed: deployment.empty?,
errors: deployment.empty? ? {} : deployment_errors(deployment)
}
maintenance_updates = ::Crowbar::Checks::Maintenance.updates_status
ret[:checks][:maintenance_updates_installed] = {
required: true,
passed: maintenance_updates.empty?,
errors: maintenance_updates.empty? ? {} : maintenance_updates_check_errors(
maintenance_updates
)
}
compute = Api::Crowbar.compute_status
ret[:checks][:compute_status] = {
required: false,
passed: compute.empty?,
errors: compute.empty? ? {} : compute_status_errors(compute)
}
ceph_status = Api::Crowbar.ceph_status
ret[:checks][:ceph_status] = {
required: true,
passed: ceph_status.empty?,
errors: ceph_status.empty? ? {} : ceph_errors(ceph_status)
}
ha_config = Api::Pacemaker.ha_presence_check.merge(
Api::Crowbar.ha_config_check
)
ret[:checks][:ha_configured] = {
required: false,
passed: ha_config.empty?,
errors: ha_config.empty? ? {} : ha_config_errors(ha_config)
}
if Api::Crowbar.addons.include?("ha")
clusters_health = Api::Pacemaker.health_report
ret[:checks][:clusters_healthy] = {
required: true,
passed: clusters_health.empty?,
errors: clusters_health.empty? ? {} : clusters_health_report_errors(clusters_health)
}
end
ret[:best_method] = if ret[:checks].any? { |_id, c| c[:required] && !c[:passed] }
# no upgrade if any of the required prechecks failed
:none
elsif !ret[:checks].any? { |_id, c| !c[:required] && !c[:passed] }
# allow non-disruptive when all prechecks succeeded
:non_disruptive
else
# otherwise choose the disruptive upgrade path (i.e. the required
# checks succeeded and some of the non-required ones failed)
:normal
end
::Crowbar::UpgradeStatus.new.save_suggested_upgrade_mode(ret[:best_method])
return ret unless upgrade_status.current_step == :prechecks
# transform from this:
# ret[:clusters_healthy][:errors] = {
# clusters_health_crm_failures: { data: "123", help: "abc" },
# another_error: { ... }
# }
# ret[:maintenance_updates_installed][:errors] = {
# maintenance_updates_installed: { data: "987", help: "xyz" }
# }
# to this:
# errors = {
# clusters_health_crm_failures: { data: "123", ... },
# another_error: { ... },
# maintenance_updates_installed: { data: "987", ... }
# }
errors = ret[:checks].select { |_k, v| v[:required] && v[:errors].any? }.
map { |_k, v| v[:errors] }.
reduce({}, :merge)
if errors.any?
::Crowbar::UpgradeStatus.new.end_step(false, errors)
else
::Crowbar::UpgradeStatus.new.end_step
end
end
rescue ::Crowbar::Error::StartStepRunningError,
::Crowbar::Error::StartStepOrderError,
::Crowbar::Error::SaveUpgradeStatusError => e
raise ::Crowbar::Error::UpgradeError.new(e.message)
rescue StandardError => e
# we need to check if it is actually running, as prechecks can be called at any time
if ::Crowbar::UpgradeStatus.new.running?(:prechecks)
::Crowbar::UpgradeStatus.new.end_step(
false,
prechecks: {
data: e.message,
help: "Crowbar has failed. Check /var/log/crowbar/production.log for details."
}
)
end
raise e
end
#
# prepare upgrade
#
def prepare(options = {})
background = options.fetch(:background, false)
if background
prepare_nodes_for_crowbar_upgrade_background
else
prepare_nodes_for_crowbar_upgrade
end
rescue StandardError => e
::Crowbar::UpgradeStatus.new.end_step(
false,
prepare: {
data: e.message,
help: "Crowbar has failed. Check /var/log/crowbar/production.log for details."
}
)
raise e
end
#
# repocheck
#
def adminrepocheck
upgrade_status = ::Crowbar::UpgradeStatus.new
upgrade_status.start_step(:repocheck_crowbar)
zypper_stream = Hash.from_xml(
`sudo /usr/bin/zypper-retry --xmlout products`
)["stream"]
{}.tap do |ret|
if zypper_stream["message"] =~ /^System management is locked/
upgrade_status.end_step(
false,
repocheck_crowbar: {
data: zypper_stream["message"],
help: "Make sure zypper is not running and try again."
}
)
return {
status: :service_unavailable,
error: I18n.t(
"api.crowbar.zypper_locked", zypper_locked_message: zypper_stream["message"]
)
}
end
prompt = zypper_stream["prompt"]
unless prompt.nil?
# keep only first prompt for easier formatting
prompt = prompt.first if prompt.is_a?(Array)
message_text = zypper_stream["message"]
message_text = message_text.join("\n") if message_text.is_a?(Array)
upgrade_status.end_step(
false,
repocheck_crowbar: {
data: [message_text, prompt["text"]].join("\n"),
help: "Make sure you complete the required action and try again."
}
)
return {
status: :service_unavailable,
error: I18n.t(
"api.crowbar.zypper_prompt",
zypper_prompt_text: prompt["text"],
zypper_message: message_text
)
}
end
products = zypper_stream["product_list"]["product"]
os_available = repo_version_available?(products, "SLES", "12.4")
ret[:os] = {
available: os_available,
repos: [
"SLES12-SP4-Pool",
"SLES12-SP4-Updates"
],
errors: {}
}
unless os_available
ret[:os][:errors][admin_architecture.to_sym] = {
missing: ret[:os][:repos]
}
end
cloud_available = repo_version_available?(products, "suse-openstack-cloud-crowbar", "9")
ret[:openstack] = {
available: cloud_available,
repos: [
"SUSE-OpenStack-Cloud-Crowbar-9-Pool",
"SUSE-OpenStack-Cloud-Crowbar-9-Updates"
],
errors: {}
}
unless cloud_available
ret[:openstack][:errors][admin_architecture.to_sym] = {
missing: ret[:openstack][:repos]
}
end
if ret.any? { |_k, v| !v[:available] }
missing_repos = ret.collect do |k, v|
next if v[:errors].empty?
missing_repo_arch = v[:errors].keys.first.to_sym
v[:errors][missing_repo_arch][:missing]
end.flatten.compact.join(", ")
::Crowbar::UpgradeStatus.new.end_step(
false,
repocheck_crowbar: {
data: "Missing repositories: #{missing_repos}",
help: "Fix the repository setup for the Admin server before " \
"you continue with the upgrade"
}
)
else
upgrade_status.end_step
end
end
rescue ::Crowbar::Error::StartStepRunningError,
::Crowbar::Error::StartStepOrderError,
::Crowbar::Error::SaveUpgradeStatusError => e
raise ::Crowbar::Error::UpgradeError.new(e.message)
rescue StandardError => e
::Crowbar::UpgradeStatus.new.end_step(
false,
repocheck_crowbar: {
data: e.message,
help: "Crowbar has failed. Check /var/log/crowbar/production.log for details."
}
)
raise e
end
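# Check that the repositories required for upgrading the nodes are available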
def noderepocheck
upgrade_status = ::Crowbar::UpgradeStatus.new
upgrade_status.start_step(:repocheck_nodes)
response = {}
addons = Api::Crowbar.addons
addons.push("os", "openstack").each do |addon|
response.merge!(Api::Node.repocheck(addon: addon))
end
unavailable_repos = response.select { |_k, v| !v["available"] }
if unavailable_repos.any?
::Crowbar::UpgradeStatus.new.end_step(
false,
repocheck_nodes: {
data: "These repositories are missing: " \
"#{unavailable_repos.keys.join(', ')}.",
help: "Fix the repository setup for the cloud nodes before " \
"you continue with the upgrade."
}
)
else
upgrade_status.end_step
end
response
rescue ::Crowbar::Error::StartStepRunningError,
::Crowbar::Error::StartStepOrderError,
::Crowbar::Error::SaveUpgradeStatusError => e
raise ::Crowbar::Error::UpgradeError.new(e.message)
rescue StandardError => e
::Crowbar::UpgradeStatus.new.end_step(
false,
repocheck_nodes: {
data: e.message,
help: "Crowbar has failed. Check /var/log/crowbar/production.log for details."
}
)
raise e
end
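# Target platform for the upgraded nodes; Ceph nodes can be an exception and get the SES platform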
def target_platform(options = {})
platform_exception = options.fetch(:platform_exception, nil)
if ENV["CROWBAR_VERSION"] == "4.0" && platform_exception == :ceph
::Crowbar::Product.ses_platform
else
::Node.admin_node.target_platform
end
end
# Check if there are any instances running.
# This is necessary for the disruptive upgrade, where the user has to shut them down
# manually before proceeding with the services shutdown.
def check_for_running_instances
controller = ::Node.find("run_list_map:nova-controller").first
if controller.nil?
Rails.logger.info("No nova-controller node found.")
return
end
out = controller.run_ssh_cmd("systemctl status openstack-nova-api")
unless out[:exit_code].zero?
Rails.logger.info("nova-api is not running: check for running instances not possible")
return
end
cmd = "openstack server list --insecure --all-projects --long " \
"--status active -f value -c Host"
out = controller.run_ssh_cmd("source /root/.openrc; #{cmd}", "60s")
unless out[:exit_code].zero?
raise_services_error(
"Error happened when trying to list running instances.\n" \
"Command '#{cmd}' failed at node #{controller.name} " \
"with #{out[:exit_code]}."
)
end
return if out[:stdout].nil? || out[:stdout].empty?
hosts = out[:stdout].split.uniq.join("\n")
raise_services_error(
"Following compute nodes still have instances running:\n#{hosts}."
)
end
# Do another pacemaker cluster health check at the start of services step
# If the cluster is unhealthy, it could lead to errors during upgrade.
# At this point all services are still running so it should be possible to
# fix the problems.
def cluster_health_check
return true unless Api::Crowbar.addons.include?("ha")
clusters_health = Api::Pacemaker.health_report
return true if clusters_health.empty?
Rails.logger.error "Pacemaker cluster is not healthy."
::Crowbar::UpgradeStatus.new.end_step(
false,
clusters_health_report_errors(clusters_health)
)
false
end
# Make sure the crowbar migrations are OK
#
# They were executed at the end of the admin upgrade step and if there was a failure,
# there's a big chance the services step would fail (because chef templates might not be
# rendered). We have to do the check here at the start of the 'services' step and not
# during the admin upgrade, because the admin upgrade step is not repeatable.
def check_schema_migrations
schema_migrate = run_cmd(
"cd /opt/dell/crowbar_framework/; " \
"sudo -u crowbar RAILS_ENV=production bin/rake crowbar:schema_migrate_prod"
)
return true if schema_migrate[:exit_code].zero?
msg = "There was an error during crowbar schema migrations."
Rails.logger.error msg
::Crowbar::UpgradeStatus.new.end_step(
false,
services: {
data: msg,
help: "Check the admin server upgrade log /var/log/crowbar/admin-server-upgrade.log " \
"and /var/log/crowbar/component_install.log for possible hints. " \
"After fixing the problem, run the crowbar migrations manually and repeat this step."
}
)
false
end
# Check the version of the installed SLES to verify that the admin server upgrade went well.
# We have to do the check here at the start of the 'services' step and not during the admin
# upgrade, because the admin upgrade step is not repeatable.
def check_product_version
zypper_stream = Hash.from_xml(
`sudo /usr/bin/zypper-retry --xmlout products -i`
)["stream"]
products = zypper_stream["product_list"]["product"]
return true if products.any? do |product|
product["name"] == "SLES" && product["version"] == "12.4"
end
Rails.logger.error "Incorrect SLES version present"
::Crowbar::UpgradeStatus.new.end_step(
false,
services: {
data: "It appears that incorrect version of SLES product is installed.",
help: "Check the admin server upgrade log /var/log/crowbar/admin-server-upgrade.log " \
"for possible hints of what went wrong during admin server upgrade. " \
"For fixing the problem, it might be needed to check the state of admin server " \
"repositories, upgrade the admin server manually (using 'zypper dup' command) and " \
"rebooting it. Once done, repeat this upgrade step."
}
)
false
end
#
# service shutdown
#
def services
return unless cluster_health_check
return unless check_schema_migrations
return unless check_product_version
begin
# prepare the scripts for various actions necessary for the upgrade
service_object = CrowbarService.new
service_object.prepare_nodes_for_os_upgrade
rescue => e
msg = e.message
Rails.logger.error msg
::Crowbar::UpgradeStatus.new.end_step(
false,
services: {
data: msg,
help: "Check /var/log/crowbar/production.log on admin server."
}
)
return
end
check_for_running_instances if upgrade_mode == :normal
# For all nodes in cluster, set the pre-upgrade attribute
upgrade_nodes = ::Node.find("state:crowbar_upgrade")
cinder_node = nil
horizon_node = nil
nova_node = nil
monasca_node = nil
upgrade_nodes.each do |node|
node.set_pre_upgrade_attribute
if node.roles.include?("cinder-controller") &&
(!node.roles.include?("pacemaker-cluster-member") ||
node["pacemaker"]["founder"] == node[:fqdn])
cinder_node = node
end
if node.roles.include?("horizon-server") &&
(!node.roles.include?("pacemaker-cluster-member") ||
node["pacemaker"]["founder"] == node[:fqdn])
horizon_node = node
end
if node.roles.include?("nova-controller") &&
(!node.roles.include?("pacemaker-cluster-member") ||
node["pacemaker"]["founder"] == node[:fqdn])
nova_node = node
end
monasca_node = node if node.roles.include?("monasca-server")
end
begin
unless monasca_node.nil?
monasca_node.wait_for_script_to_finish(
"/usr/sbin/crowbar-monasca-cleanups.sh",
timeouts[:monasca_cleanups]
)
Rails.logger.info("Removing Java based Monasca persister was successful.")
end
rescue StandardError => e
::Crowbar::UpgradeStatus.new.end_step(
false,
services: {
data: "Problem while removing Java based Monasca persister: " + e.message,
help: "Check /var/log/crowbar/production.log on admin server " \
"and /var/log/crowbar/node-upgrade.log at #{monasca_node.name}."
}
)
# Stop here and error out
return
end
begin
unless horizon_node.nil? || monasca_node.nil?
horizon_node.wait_for_script_to_finish(
"/usr/sbin/crowbar-dump-monasca-db.sh",
timeouts[:dump_grafana_db]
)
end
rescue StandardError => e
::Crowbar::UpgradeStatus.new.end_step(
false,
services: {
data: "Problem while dumping Grafana database: " + e.message,
help: "Check /var/log/crowbar/production.log on admin server, " \
"and /var/log/crowbar/node-upgrade.log on #{horizon_node.name}."
}
)
# Stop here and error out
return
end
begin
unless monasca_node.nil?
monasca_node.wait_for_script_to_finish(
"/usr/sbin/crowbar-dump-monasca-db.sh",
timeouts[:dump_monasca_db]
)
Rails.logger.info("Dump and shutdown of Monasca database sucessful.")
end
rescue StandardError => e
::Crowbar::UpgradeStatus.new.end_step(
false,
services: {
data: "Problem while dumping/shutting down monasca databases: " + e.message,
help: "Check /var/log/crowbar/production.log on admin server, " \
"and /var/log/crowbar/node-upgrade.log on #{monasca_node.name}."
}
)
# Stop here and error out
return
end
begin
unless nova_node.nil?
nova_node.wait_for_script_to_finish(
"/usr/sbin/crowbar-delete-unknown-nova-services.sh",
timeouts[:delete_nova_services]
)
Rails.logger.info("Deleting of nova services was successful.")
end
rescue StandardError => e
::Crowbar::UpgradeStatus.new.end_step(
false,
services: {
data: "Problem while deleting unknown nova services: " + e.message,
help: "Check /var/log/crowbar/production.log on admin server " \
"and /var/log/crowbar/node-upgrade.log at #{nova_node.name}."
}
)
# Stop here and error out
return
end
# Initiate the services shutdown for all nodes
execute_scripts_and_wait_for_finish(
upgrade_nodes,
"/usr/sbin/crowbar-shutdown-services-before-upgrade.sh",
timeouts[:shutdown_services]
)
Rails.logger.info("Services were shut down on all nodes.")
begin
unless cinder_node.nil?
cinder_node.wait_for_script_to_finish(
"/usr/sbin/crowbar-delete-cinder-services-before-upgrade.sh",
timeouts[:delete_cinder_services]
)
Rails.logger.info("Deleting of cinder services was successful.")
end
rescue StandardError => e
::Crowbar::UpgradeStatus.new.end_step(
false,
services: {
data: "Problem while deleting cinder services: " + e.message,
help: "Check /var/log/crowbar/production.log on admin server. " \
"If the action failed at a specific node, " \
"check /var/log/crowbar/node-upgrade.log at the node."
}
)
# Stop here and error out
return
end
::Crowbar::UpgradeStatus.new.end_step
rescue ::Crowbar::Error::Upgrade::ServicesError => e
::Crowbar::UpgradeStatus.new.end_step(
false,
running_instances: {
data: e.message,
help: "Suspend or stop all instances before you continue with the upgrade."
}
)
rescue StandardError => e
::Crowbar::UpgradeStatus.new.end_step(
false,
services: {
data: e.message,
help: "Check /var/log/crowbar/production.log on admin server for details. " \
"If the action failed on a specific node, " \
"check /var/log/crowbar/node-upgrade.log on that node."
}
)
end
handle_asynchronously :services
def openstackbackup
crowbar_lib_dir = "/var/lib/crowbar"
dump_path = "#{crowbar_lib_dir}/backup/8-to-9-openstack_dump.sql.gz"
if File.exist?(dump_path)
Rails.logger.warn("OpenStack backup already exists. Skipping...")
::Crowbar::UpgradeStatus.new.end_step
return
end
db_node = openstack_db_node
if db_node.nil?
# This can happen if only the crowbar node was deployed and will be upgraded
Rails.logger.warn("Can not get database parameters for OpenStack backup. Skipping...")
::Crowbar::UpgradeStatus.new.end_step
return
end
db_user = "root"
db_password = db_node["database"]["mysql"]["server_root_password"]
# Note: Trying to (gu)estimate the size of the (compressed) database
# SQL dump from the sizes report via the below query is probably a
# bit far fetched. Is there a more realistic way for this?
size_query = "SELECT SUM(data_length + index_length) FROM information_schema.tables ;"
cmd = "echo \"#{size_query}\" | mysql -N -u #{db_user} -p#{db_password}"
Rails.logger.debug("Checking size of OpenStack database")
# Note: We need to run this on a database node since the mysql root
# user doesn't have remote access to the mysql server.
db_size = db_node.run_ssh_cmd(cmd, "60s")
unless db_size[:exit_code].zero?
Rails.logger.error(
"Failed to check size of OpenStack database: \n" \
" stdout: #{db_size[:stdout]} \n" \
" stderr: #{db_size[:stderr]}"
)
raise ::Crowbar::Error::Upgrade::DatabaseSizeError.new(
"stdout: #{db_size[:stdout]}\n stderr:#{db_size[:stderr]}"
)
end
free_space = run_cmd(
"LANG=C df -x 'tmpfs' -x 'devtmpfs' -B1 -l --output='avail' #{crowbar_lib_dir} | tail -n1"
)
unless free_space[:exit_code].zero?
Rails.logger.error("Cannot determine free disk space: #{free_space[:stdout_and_stderr]}")
raise ::Crowbar::Error::Upgrade::FreeDiskSpaceError.new(
free_space[:stdout_and_stderr]
)
end
if free_space[:stdout_and_stderr].strip.to_i < db_size[:stdout].strip.to_i
Rails.logger.error("Not enough free disk space to create the OpenStack database dump")
raise ::Crowbar::Error::Upgrade::NotEnoughDiskSpaceError.new("#{crowbar_lib_dir}/backup")
end
Rails.logger.debug("Creating OpenStack database dump")
# Note: We need to run this on a database node since the mysql root user
# doesn't have remote access to the mysql server. But we can't
# use Node.run_ssh_cmd here because we want to redirect to a file
# on the crowbar node.
db_dump = run_cmd("sudo ssh -o ConnectTimeout=10 root@#{db_node.name} " \
"\'mysqldump -u #{db_user} -p#{db_password} --all-databases | gzip\' " \
"> #{dump_path}"
)
unless db_dump[:exit_code].zero?
Rails.logger.error(
"Failed to create OpenStack database dump: #{db_dump[:stdout_and_stderr]}"
)
FileUtils.rm_f(dump_path)
raise ::Crowbar::Error::Upgrade::DatabaseDumpError.new(
db_dump[:stdout_and_stderr]
)
end
::Crowbar::UpgradeStatus.new.save_openstack_backup dump_path
::Crowbar::UpgradeStatus.new.end_step
rescue ::Crowbar::Error::Upgrade::NotEnoughDiskSpaceError => e
::Crowbar::UpgradeStatus.new.end_step(
false,
backup_openstack: {
data: e.message,
help: "Make sure you have enough disk space to store the OpenStack database dump."
}
)
rescue ::Crowbar::Error::Upgrade::FreeDiskSpaceError,
::Crowbar::Error::Upgrade::DatabaseSizeError,
::Crowbar::Error::Upgrade::DatabaseDumpError => e
::Crowbar::UpgradeStatus.new.end_step(
false,
backup_openstack: {
data: e.message
}
)
rescue StandardError => e
::Crowbar::UpgradeStatus.new.end_step(
false,
backup_openstack: {
data: e.message,
help: "Crowbar has failed. Check /var/log/crowbar/production.log for details."
}
)
raise e
end
handle_asynchronously :openstackbackup
#
# cancel upgrade
#
def cancel
upgrade_status = ::Crowbar::UpgradeStatus.new
unless upgrade_status.cancel_allowed?
Rails.logger.error(
"Not possible to cancel the upgrade at the step #{upgrade_status.current_step}"
)
raise ::Crowbar::Error::Upgrade::CancelError.new(upgrade_status.current_step)
end
provisioner_service = ProvisionerService.new
provisioner_service.enable_all_repositories
crowbar_service = CrowbarService.new
crowbar_service.revert_nodes_from_crowbar_upgrade
upgrade_status.initialize_state
end
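# Upgrade the cluster identified by its pacemaker config environment, starting with its founder node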
def upgrade_pacemaker_cluster(cluster_env)
founder_name = ::Node.find(
"pacemaker_config_environment:#{cluster_env}"
).first[:pacemaker][:founder]
founder = ::Node.find_by_name(founder_name)
upgrade_cluster founder, cluster_env
end
# Take the list of elements as argument (that could be both nodes and clusters)
def upgrade_nodes_disruptive(elements_to_upgrade)
elements_to_upgrade.each do |element|
# If the role has a cluster assigned, upgrade that cluster (reuse the non-disruptive code here)
if ServiceObject.is_cluster?(element)
cluster = ServiceObject.cluster_name element
cluster_env = "pacemaker-config-#{cluster}"
upgrade_pacemaker_cluster cluster_env
else
node = ::Node.find_by_name(element)
if node["run_list_map"].key? "pacemaker-cluster-member"
# if the node is part of some cluster, upgrade the whole cluster
upgrade_pacemaker_cluster node[:pacemaker][:config][:environment]
else
# if role has single node(s) assigned upgrade those nodes (optionally all at once)
upgrade_one_node element
end
end
end
# and handle those as part of the compute node upgrade
end
# Go through active proposals and return elements with assigned roles in the right order
def upgradable_elements_of_proposals(proposals)
to_upgrade = []
# First find out the nodes with compute role, for later checks
compute_nodes = {}
nova_prop = proposals["nova"] || nil
unless nova_prop.nil?
nova_prop["deployment"]["nova"]["elements"].each do |role, elements|
next unless role.start_with?("nova-compute")
elements.each do |el|
if ServiceObject.is_remotes?(el)
PacemakerServiceObject.expand_remote_nodes(el).each { |n| compute_nodes[n] = 1 }
else
compute_nodes[el] = 1
end
end
end
end
# For each active proposal go through the roles in element order
proposals.each do |name, proposal|
elements = proposal["deployment"][name]["elements"]
proposal["deployment"][name]["element_order"].flatten.each do |el|
# skip nova compute nodes, they will be upgraded separately
next if el.start_with?("nova-compute") || elements[el].nil?
# skip some roles if they are assigned to compute nodes
if ["cinder-volume", "ceilometer-agent", "swift-storage"].include? el
# expanding elements so we catch the case of cinder-volume in cluster
ServiceObject.expand_nodes_for_all(elements[el]).flatten.each do |node|
next if compute_nodes[node]
to_upgrade |= [node]
end
else
to_upgrade |= elements[el]
end
end
end
to_upgrade
end
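# Disruptive upgrade of controller nodes: walk the active OpenStack proposals in run order
# and upgrade their assigned elements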
def upgrade_controllers_disruptive
Rails.logger.info("Entering disruptive upgrade of controller nodes")
# Go through all OpenStack barclamps ordered by run_order
proposals = {}
BarclampCatalog.barclamps.map do |name, attrs|
next if BarclampCatalog.category(name) != "OpenStack"
next if ["pacemaker", "openstack"].include? name
prop = Proposal.where(barclamp: name).first
next if prop.nil? || !prop.active?
proposals[name] = prop
end
proposals = proposals.sort_by { |key, _v| BarclampCatalog.run_order(key) }.to_h
elements_to_upgrade = upgradable_elements_of_proposals proposals
upgrade_nodes_disruptive elements_to_upgrade
save_nodes_state([], "", "")
Rails.logger.info("Successfully finished disruptive upgrade of controller nodes")
end
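# Run the controller_nodes substep: upgrade the controller clusters and the remaining
# non-compute nodes, then prepare the compute nodes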
def do_controllers_substep(substep)
if substep == :controller_nodes
upgrade_controller_clusters
upgrade_non_compute_nodes
prepare_all_compute_nodes
# Stop nova services on all nodes to prevent any and all RPC API trouble
if upgrade_mode != :normal
stop_nova_services
reload_nova_services
end
::Crowbar::UpgradeStatus.new.save_substep(substep, :finished)
end
end
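# Number of nodes still waiting for the upgrade (in the crowbar_upgrade state)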
def remaining_nodes
::Node.find("state:crowbar_upgrade").size
end
#
# nodes upgrade
#
def nodes(component = "all")
status = ::Crowbar::UpgradeStatus.new
if component == "postpone"
status.postpone
return
elsif component == "resume"
status.resume
return
end
substep = status.current_substep
substep_status = status.current_substep_status
# initialize progress info about nodes upgrade
if status.progress[:remaining_nodes].nil?
status.save_nodes(0, remaining_nodes)
end
if substep.nil? || substep.empty?
substep = :controller_nodes
end
if substep == :controller_nodes && substep_status == :finished
substep = :compute_nodes
end
status.save_substep(substep, :running)
# decide what needs to be upgraded
case component
when "all"
# Upgrade everything
do_controllers_substep(substep)
substep = :compute_nodes
upgrade_all_compute_nodes
when "controllers"
# Upgrade controller clusters only
do_controllers_substep(substep)
# Finalize only upgraded nodes (compute nodes might be postponed)
::Node.all.each do |node|
finalize_node_upgrade(node) if !node.admin? && node.upgraded?
end
else
# Upgrade given compute node
names = component.split(/[\s,;]/)
if names.size == 1
upgrade_one_compute_node component
else
non_disruptive_upgrade_compute_nodes(names)
end
::Crowbar::UpgradeStatus.new.save_substep(substep, :node_finished)
end
# if all nodes are upgraded, cleanup and end the whole step
status = ::Crowbar::UpgradeStatus.new
if status.progress[:remaining_nodes].zero?
status.save_substep(substep, :finished)
if upgrade_mode == :normal
status.save_substep(:reload_nova_services, :running)
stop_nova_services
reload_nova_services
status.save_substep(:reload_nova_services, :finished)
end
status.save_substep(:run_online_migrations, :running)
run_online_migrations
status.save_substep(:run_online_migrations, :finished)
unlock_crowbar_ui_package
finalize_nodes_upgrade
cleanup_crowbar_proposal
status.end_step
end
rescue ::Crowbar::Error::Upgrade::NodeError => e
substep = ::Crowbar::UpgradeStatus.new.current_substep
::Crowbar::UpgradeStatus.new.save_substep(substep, :failed)
::Crowbar::UpgradeStatus.new.end_step(
false,
nodes: {
data: e.message,
help: "Check the log files at the node that has failed to find possible cause."
}
)
rescue ::Crowbar::Error::Upgrade::LiveMigrationError => e
substep = ::Crowbar::UpgradeStatus.new.current_substep
::Crowbar::UpgradeStatus.new.save_substep(substep, :failed)
::Crowbar::UpgradeStatus.new.end_step(
false,
nodes: {
data: e.message,
help:
"Log files at controller node should indicate the reason why " \
"the live migration has failed. " \
"Try to migrate the instances from the compute node that is " \
"about to be upgraded and then restart the upgrade step."
}
)
rescue StandardError => e
# end the step even for non-upgrade error, so we are not stuck with 'running'
substep = ::Crowbar::UpgradeStatus.new.current_substep
::Crowbar::UpgradeStatus.new.save_substep(substep, :failed)
::Crowbar::UpgradeStatus.new.end_step(
false,
nodes: {
data: e.message,
help: "Crowbar has failed. Check /var/log/crowbar/production.log for details."
}
)
raise e
end
handle_asynchronously :nodes
def raise_node_upgrade_error(message = "")
Rails.logger.error(message)
raise ::Crowbar::Error::Upgrade::NodeError.new(message)
end
def raise_live_migration_error(message = "")
Rails.logger.error(message)
raise ::Crowbar::Error::Upgrade::LiveMigrationError.new(message)
end
def raise_services_error(message = "")
Rails.logger.error(message)
raise ::Crowbar::Error::Upgrade::ServicesError.new(message)
end
protected
# If there's a separate network cluster, we have to touch it before we start the upgrade of
# other nodes; specifically we need to evacuate the network routers from the first network node.
def prepare_network_node(network_node)
return if network_node.upgraded?
evacuate_network_node(network_node, network_node)
delete_pacemaker_resources network_node
end
#
# controller nodes upgrade
#
def upgrade_controller_clusters
::Crowbar::UpgradeStatus.new.save_substep(:controller_nodes, :running)
return upgrade_controllers_disruptive if upgrade_mode == :normal
network_cluster_members = ::Node.find(
"run_list_map:pacemaker-cluster-member AND " \
"run_list_map:neutron-network AND NOT " \
"run_list_map:neutron-server"
)
network_node = if network_cluster_members.empty?
nil
else
::Node.find_by_name(network_cluster_members.first[:pacemaker][:founder])
end
prepare_network_node(network_node) unless network_node.nil?
# Now we must upgrade the clusters in the correct order:
# 1. data, 2. API, 3. network
# search in all the cluster members to get a unique list of founders for all clusters
cluster_founders_names = ::Node.find("run_list_map:pacemaker-cluster-member").map! do |node|
node[:pacemaker][:founder]
end.uniq
cluster_founders = cluster_founders_names.map { |name| ::Node.find_by_name(name) }
sorted_founders = cluster_founders.sort do |n1, n2|
first_data = n1[:run_list_map].key? "database-server"
first_api = n1[:run_list_map].key? "keystone-server"
second_net = n2[:run_list_map].key? "neutron-network"
first_data || (first_api && second_net) ? -1 : 1
end
sorted_founders.each do |founder|
cluster_env = founder[:pacemaker][:config][:environment]
upgrade_cluster founder, cluster_env
end
save_nodes_state([], "", "")
end
# crowbar_upgrade_step will not be needed after node is upgraded
def finalize_node_upgrade(node)
return unless node.crowbar.key? "crowbar_upgrade_step"
Rails.logger.info("Finalizing upgrade of node #{node.name}")
node.crowbar.delete "crowbar_upgrade_step"
node.crowbar.delete "node_upgrade_state"
node.save
Rails.logger.info("Starting chef-client service on #{node.name}")
node.ssh_cmd("systemctl start chef-client")
end
# After the controller upgrade we need to stop Nova services on all nodes
# to ensure that no old services remain running and cause restarted Nova
# services to autonegotiate the old RPC API version.
# However, if this is executed in 'normal' mode, compute nodes are fully
# upgraded and rebooted and this action is only required for controller ones.
def stop_nova_services
nova_nodes = if upgrade_mode == :normal
::Node.find("roles:nova-controller")
else
::Node.find("roles:nova-*").sort do |n|
n.roles.include?("nova-controller") ? -1 : 1
end
end
nova_nodes.each do |node|
save_node_action("Stopping nova services at #{node.name}")
begin
node.wait_for_script_to_finish(
"/usr/sbin/crowbar-stop-nova-services.sh",
timeouts[:stop_nova_services]
)
rescue StandardError => e
raise_node_upgrade_error(
"Stopping of nova services has failed on node #{node.name}: " \
"#{e.message} Check /var/log/crowbar/node-upgrade.log for details."
)
end
Rails.logger.info("Nova services stopped on #{node.name}.")
save_node_action("Nova services stop finished.")
end
end
# Once all nova services are upgraded to the current version (Rocky for the current SOC)
# we need to tell the services to start using the latest RPC. To this end we restart all of them.
# However, if this is executed in 'normal' mode, compute nodes are fully
# upgraded and rebooted and this action is only required for controller ones.
def reload_nova_services
nova_nodes = if upgrade_mode == :normal
::Node.find("roles:nova-controller")
else
::Node.find("roles:nova-*").sort do |n|
n.roles.include?("nova-controller") ? -1 : 1
end
end
nova_nodes.each do |node|
save_node_action("Reloading nova services at #{node.name}")
begin
node.wait_for_script_to_finish(
"/usr/sbin/crowbar-reload-nova-after-upgrade.sh",
timeouts[:reload_nova_services]
)
rescue StandardError => e
raise_node_upgrade_error(
"Reloading of some nova service has failed on node #{node.name}: " \
"#{e.message} Check /var/log/crowbar/node-upgrade.log for details."
)
end
Rails.logger.info("Nova services reloaded at #{node.name}. Sleeping for 10s.")
save_node_action("Nova services reload finished.")
# This is needed to stagger the nova-compute reload across the whole
# cloud. If all compute nodes are restarted simultaneously, they will
# DDoS RabbitMQ with message queue chatter on large clouds.
sleep(10)
end
end
# Once all nova services are running the new version, it's possible to execute
# online db migrations
def run_online_migrations
nova_nodes = ::Node.find("roles:nova-controller")
heat_nodes = ::Node.find("roles:heat-server")
return if nova_nodes.empty? && heat_nodes.empty?
nova_nodes.select! { |n| n[:fqdn] == n[:pacemaker][:founder] } if nova_nodes.size > 1
heat_nodes.select! { |n| n[:fqdn] == n[:pacemaker][:founder] } if heat_nodes.size > 1
if nova_nodes.any?
node = nova_nodes.first
save_node_action("Executing nova online migrations on #{node.name}...")
node.wait_for_script_to_finish(
"/usr/sbin/crowbar-nova-migrations-after-upgrade.sh",
timeouts[:online_migrations]
)
Rails.logger.info("Nova online migrations finished.")
end
if heat_nodes.any?
node = heat_nodes.first
save_node_action("Executing heat migrations on #{node.name}...")
node.wait_for_script_to_finish(
"/usr/sbin/crowbar-heat-migrations-after-upgrade.sh",
timeouts[:online_migrations]
)
Rails.logger.info("Heat migrations finished.")
end
save_node_action("Data migrations finished.")
rescue StandardError => e
raise_node_upgrade_error(
"Problem while running online migrations on node #{node.name}: " \
"#{e.message} Check /var/log/crowbar/node-upgrade.log for details."
)
end
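# Remove the helper upgrade scripts from the node once they are no longer needed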
def delete_upgrade_scripts(node)
return if node.admin?
scripts_to_delete = [
"prepare-repositories",
"upgrade-os",
"shutdown-services-before-upgrade",
"delete-cinder-services-before-upgrade",
"dump-monasca-db",
"monasca-cleanups",
"evacuate-host",
"pre-upgrade",
"delete-pacemaker-resources",
"dhcp-migration",
"router-migration",
"lbaas-evacuation",
"post-upgrade",
"chef-upgraded",
"reload-nova-after-upgrade",
"run-nova-online-migrations",
"delete-unknown-nova-services",
"heat-migrations-after-upgrade",
"migrate-keystone-and-start",
"nova-migrations-after-upgrade",
"set-network-agents-state",
"shutdown-keystone",
"shutdown-remaining-services",
"stop-nova-services",
"run-nova-online-migrations"
].map { |f| "/usr/sbin/crowbar-#{f}.sh" }.join(" ")
scripts_to_delete << " /etc/neutron/lbaas-connection.conf"
node.run_ssh_cmd("rm -f #{scripts_to_delete}")
end
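# Finalize the upgrade of every node and remove the upgrade scripts from it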
def finalize_nodes_upgrade
::Node.find_all_nodes.each do |node|
finalize_node_upgrade node
delete_upgrade_scripts node
end
end
# Remove the assignment of the crowbar-upgrade role from the nodes in the crowbar barclamp
def cleanup_crowbar_proposal
proposal = Proposal.find_by(barclamp: "crowbar", name: "default")
# remove the nodes from upgrade role
proposal["deployment"]["crowbar"]["elements"]["crowbar-upgrade"] = []
proposal.save
end
#
# upgrade of controller nodes in given cluster
#
def upgrade_cluster(founder, cluster)
Rails.logger.info("Upgrading controller nodes in cluster #{cluster}")
non_founder_nodes = ::Node.find(
"pacemaker_config_environment:#{cluster} AND " \
"run_list_map:pacemaker-cluster-member AND " \
"NOT fqdn:#{founder[:fqdn]}"
)
non_founder_nodes.reject!(&:upgraded?)
if founder.upgraded? && non_founder_nodes.empty?
Rails.logger.info("All nodes in cluster #{cluster} have already been upgraded.")
return
end
upgrade_first_cluster_node founder, non_founder_nodes.first
# if we started upgrade of some node before, let's continue with it
non_founder_nodes.sort_by! { |n| n.upgrading? ? 0 : 1 }
# upgrade the rest of nodes in the same cluster
non_founder_nodes.each do |node|
upgrade_next_cluster_node node, founder
end
Rails.logger.info("Nodes in cluster #{cluster} successfully upgraded")
end
# Method for upgrading the first node of the cluster
# The other_node argument is any other node in the same cluster
def upgrade_first_cluster_node(node, other_node)
return true if node.upgraded?
node_api = Api::Node.new node.name
other_node_api = Api::Node.new other_node.name
node_api.save_node_state("controller", "upgrading")
Rails.logger.info("Starting the upgrade of node #{node.name}")
evacuate_network_node(node, node)
# upgrade the first node
node_api.upgrade
# Explicitly mark the first node as cluster founder
# and in case of DRBD setup, adapt DRBD config accordingly.
save_node_action("marking node as cluster founder")
unless Api::Pacemaker.set_node_as_founder node.name
raise_node_upgrade_error("Changing the cluster founder to #{node.name} has failed.")
return false
end
# remove pre-upgrade attribute, so the services can start
other_node_api.disable_pre_upgrade_attribute_for node.name
# delete old pacemaker resources (from the node where old pacemaker is running)
delete_pacemaker_resources other_node
shutdown_all_services_in_cluster node
if node[:run_list_map].key?("keystone-server")
# prevent the first upgraded node from starting keystone during the
# crowbar join call
node = ::Node.find_by_name(node.name)
node["keystone"]["disable_vhost"] = true
node.save
end
# start crowbar-join at the first node
node_api.post_upgrade
node_api.join_and_chef
if node[:run_list_map].key?("keystone-server")
# stop keystone on non-upgraded nodes, run db migrations
# and start it on the upgraded node
keystone_migrate_and_restart node
# allow keystone to be handled by chef again
node = ::Node.find_by_name(node.name)
node["keystone"].delete "disable_vhost"
node.save
end
re_enable_network_agents(node, node)
node_api.save_node_state("controller", "upgraded")
end
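# Upgrade one of the remaining (non-founder) nodes of a cluster after the founder has been upgraded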
def upgrade_next_cluster_node(node, founder)
return true if node.upgraded?
node_api = Api::Node.new node.name
node_api.save_node_state("controller", "upgrading")
unless node.ready?
evacuate_network_node(founder, node, true)
Rails.logger.info("Starting the upgrade of node #{node.name}")
node_api.upgrade
node_api.post_upgrade
node_api.join_and_chef
end
# Remove the pre-upgrade attribute _after_ the chef-client run because pacemaker is already
# running and we want the configuration to be updated first
# (disabling the attribute causes the services on the node to start)
node_api.disable_pre_upgrade_attribute_for node.name
re_enable_network_agents(founder, node)
node_api.save_node_state("controller", "upgraded")
end
# Delete existing pacemaker resources from the other node in the cluster
def delete_pacemaker_resources(node)
save_node_action("deleting old pacemaker resources")
node.wait_for_script_to_finish(
"/usr/sbin/crowbar-delete-pacemaker-resources.sh",
timeouts[:delete_pacemaker_resources]
)
Rails.logger.info("Deleting pacemaker resources was successful.")
rescue StandardError => e
raise_node_upgrade_error(
e.message +
" Check /var/log/crowbar/node-upgrade.log for details."
)
end
# Shutdown all remaining services on the nodes that are not being upgraded
# This would be solved by delete_pacemaker_resources, however since pacemaker
# does not manage all services anymore, it's necessary to do an extra shutdown
# step
#
# The argument is the node that is currently being upgraded.
# There's no need to care about this one; we only need to shut down services
# on the remaining nodes of the cluster.
def shutdown_all_services_in_cluster(node)
save_node_action("Making sure that OpenStack services are stopped")
cluster = node[:pacemaker][:config][:environment]
cluster_nodes = ::Node.find(
"pacemaker_config_environment:#{cluster} AND " \
"run_list_map:pacemaker-cluster-member AND " \
"NOT fqdn:#{node[:fqdn]}"
)
begin
execute_scripts_and_wait_for_finish(
cluster_nodes,
"/usr/sbin/crowbar-shutdown-remaining-services.sh",
timeouts[:shutdown_remaining_services]
)
Rails.logger.info("All OpenStack services were shut down.")
rescue StandardError => e
raise_node_upgrade_error(
"Error while shutting down ervices. " + e.message
)
end
end
# "shutdown_all_services_in_cluster" kept the keystone vhost running
# on the non-upgraded nodes. This method will stop the vhost on those
# nodes, run the db migrations and start the upgraded keystone instance
# on "node".
def keystone_migrate_and_restart(node)
save_node_action("Making sure that Keystone is stopped on non-upgraded nodes")
cluster = node[:pacemaker][:config][:environment]
cluster_nodes = ::Node.find(
"pacemaker_config_environment:#{cluster} AND " \
"run_list_map:pacemaker-cluster-member AND " \
"NOT fqdn:#{node[:fqdn]}"
)
begin
execute_scripts_and_wait_for_finish(
cluster_nodes,
"/usr/sbin/crowbar-shutdown-keystone.sh",
timeouts[:shutdown_remaining_services]
)
Rails.logger.info("All non-upgraded keystone instances were shut down.")
rescue StandardError => e
raise_node_upgrade_error(
"Error while shutting down keystone. " + e.message
)
end
save_node_action("Running keystone db migrations and starting keystone on upgraded node")
begin
node.wait_for_script_to_finish(
"/usr/sbin/crowbar-migrate-keystone-and-start.sh",
timeouts[:shutdown_remaining_services]
)
Rails.logger.info("Keystone DB migrated and service started.")
rescue StandardError => e
raise_node_upgrade_error(
"Error while migrating and restarting keystone. " + e.message
)
end
end
# Evacuate all dhcp networks, routers and loadbalancers away from the specified
# network node to other available network nodes. The evacuation procedure is
# started on the specified controller node
def evacuate_network_node(controller, network_node, delete_namespaces = false)
save_node_action("evacuating dhcp networks")
hostname = network_node["hostname"]
migrated_file = "/var/lib/crowbar/upgrade/crowbar-network-evacuated"
if network_node.file_exist? migrated_file
Rails.logger.info("Network node #{hostname} was already evacuated.")
return
end
unless network_node[:run_list_map].key? "neutron-network"
Rails.logger.info(
"Node #{hostname} does not have 'neutron-network' role. Nothing to evacuate."
)
return
end
args = [hostname]
controller.wait_for_script_to_finish(
"/usr/sbin/crowbar-dhcp-migration.sh",
timeouts[:dhcp_migration],
args
)
Rails.logger.info("Migrating dhcp networks away from #{hostname} was successful.")
save_node_action("evacuating routers")
args = [hostname]
args << "delete-ns" if delete_namespaces
controller.wait_for_script_to_finish(
"/usr/sbin/crowbar-router-migration.sh",
timeouts[:router_migration],
args
)
Rails.logger.info("Migrating routers away from #{hostname} was successful.")
save_node_action("evacuating loadbalancers")
controller.wait_for_script_to_finish(
"/usr/sbin/crowbar-lbaas-evacuation.sh",
timeouts[:lbaas_evacuation],
args
)
Rails.logger.info("Migrating loadbalancers away from #{hostname} was successful.")
args = [hostname, "disable"]
save_node_action("disabling network agents")
controller.wait_for_script_to_finish(
"/usr/sbin/crowbar-set-network-agents-state.sh",
timeouts[:set_network_agents_state],
args
)
Rails.logger.info("Disabling network agents on #{hostname} was successful.")
network_node.run_ssh_cmd("mkdir -p /var/lib/crowbar/upgrade; touch #{migrated_file}")
# Clean up the ok/failed state files, as we likely need to
# run the script again on this node (to evacuate other nodes)
controller.delete_script_exit_files("/usr/sbin/crowbar-dhcp-migration.sh")
controller.delete_script_exit_files("/usr/sbin/crowbar-router-migration.sh")
controller.delete_script_exit_files("/usr/sbin/crowbar-lbaas-evacuation.sh")
controller.delete_script_exit_files("/usr/sbin/crowbar-set-network-agents-state.sh")
rescue StandardError => e
raise_node_upgrade_error(
e.message +
" Check /var/log/crowbar/node-upgrade.log at #{controller.name} for details."
)
end
# Re-enable neutron network agents on a just upgraded node
def re_enable_network_agents(controller, network_node)
hostname = network_node["hostname"]
save_node_action("Enabling neutron networking agents on #{hostname}")
args = [hostname, "enable"]
save_node_action("re-enabling network agents")
controller.wait_for_script_to_finish(
"/usr/sbin/crowbar-set-network-agents-state.sh",
timeouts[:set_network_agents_state],
args
)
Rails.logger.info("Enabling network agents on #{hostname} was successful.")
controller.delete_script_exit_files("/usr/sbin/crowbar-set-network-agents-state.sh")
rescue StandardError => e
raise_node_upgrade_error(
e.message +
" Check /var/log/crowbar/node-upgrade.log at #{controller.name} for details."
)
end
#
# upgrade the given node, regardless of its role
#
def upgrade_one_node(name)
node = ::Node.find_node_by_name_or_alias(name)
return if node.upgraded?
node_api = Api::Node.new name
roles = []
roles.push "cinder" if node["run_list_map"].key? "cinder-volume"
roles.push "swift" if node["run_list_map"].key? "swift-storage"
role = roles.join("+")
role = "controller" if role.empty?
node_api.save_node_state(role, "upgrading")
if node.ready_after_upgrade?
Rails.logger.info(
"Node #{node.name} is ready after the initial chef-client run."
)
else
node_api.upgrade
node_api.post_upgrade
node_api.join_and_chef
end
node_api.save_node_state(role, "upgraded")
end
# Upgrade the nodes that are neither controllers nor compute-kvm ones
# (e.g. standalone cinder-volume or swift-storage nodes, or other compute nodes)
# This has to be done before compute nodes upgrade, so the live migration
# can use fully upgraded cinder stack.
def upgrade_non_compute_nodes
non_compute_nodes = ::Node.find("NOT run_list_map:nova-compute-*")
# this should also remove all controllers and the crowbar node from the list
non_compute_nodes.reject!(&:upgraded?)
# if we started upgrade of some node before, let's continue with it
non_compute_nodes.sort_by! { |n| n.upgrading? ? 0 : 1 }
non_compute_nodes.each do |node|
upgrade_one_node node.name
end
save_nodes_state([], "", "")
end
def prepare_all_compute_nodes
# We do not support any non-kvm kind of compute, but in future we might...
type = upgrade_mode == :normal ? "*" : "kvm"
prepare_compute_nodes type
end
# Restart remote resources at target node from cluster node
# This is needed for services (like nova-compute) managed by pacemaker.
def restart_remote_resources(controller, node)
hostname = node[:hostname]
save_node_action("restarting services at remote node #{hostname}")
out = controller.run_ssh_cmd(
"crm --wait node standby remote-#{hostname}",
"60s"
)
unless out[:exit_code].zero?
raise_node_upgrade_error(
"Moving remote node '#{hostname}' to standby state has failed. " \
"Check /var/log/pacemaker.log at '#{controller.name}' for possible causes."
)
end
out = controller.run_ssh_cmd(
"crm --wait node online remote-#{hostname}",
"60s"
)
return if out[:exit_code].zero?
raise_node_upgrade_error(
"Bringing remote node '#{hostname}' from standby state has failed. " \
"Check /var/log/pacemaker.log at '#{controller.name}' for possible causes."
)
end
# Move the upgraded node to ready state and wait until nova-compute resource
# is reported as running.
def start_remote_resources(controller, hostname)
save_node_action("starting services at remote node #{hostname}")
out = controller.run_ssh_cmd(
"crm --wait node ready remote-#{hostname}",
"60s"
)
unless out[:exit_code].zero?
raise_node_upgrade_error(
"Starting remote services at '#{hostname}' node has failed. " \
"Check /var/log/pacemaker.log at '#{controller.name}' and " \
"'#{hostname}' nodes for possible causes."
)
end
seconds = timeouts[:wait_until_compute_started]
begin
Timeout.timeout(seconds) do
loop do
out = controller.run_ssh_cmd(
"source /root/.openrc; " \
"openstack --insecure compute service list " \
"--host #{hostname} --service nova-compute -f value -c State"
)
break if !out[:stdout].nil? && out[:stdout].chomp == "up"
sleep 1
end
end
rescue Timeout::Error
raise_node_upgrade_error(
"Service 'nova-compute' at '#{hostname}' node did not start " \
"after #{seconds} seconds. " \
"Check /var/log/pacemaker.log at '#{controller.name}' and " \
"'#{hostname}' nodes for possible causes."
)
end
end
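# Restart the pacemaker remote resources of every remote cluster from its founder node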
def prepare_remote_nodes
# iterate over remote clusters
ServiceObject.available_remotes.each do |cluster, role|
# find the controller in this cluster
cluster_name = ServiceObject.cluster_name(cluster)
cluster_env = "pacemaker-config-#{cluster_name}"
founder_name = ::Node.find(
"pacemaker_config_environment:#{cluster_env}"
).first[:pacemaker][:founder]
controller = ::Node.find_by_name(founder_name)
# restart remote resources for each node
role.cluster_remote_nodes.each do |node|
restart_remote_resources(controller, node)
end
end
end
# Prepare the compute nodes for upgrade by upgrading necessary packages
def prepare_compute_nodes(virt)
Rails.logger.info("Preparing #{virt} compute nodes for upgrade... ")
compute_nodes = ::Node.find("roles:nova-compute-#{virt}")
if compute_nodes.empty?
Rails.logger.info("There are no compute nodes of #{virt} type.")
return
end
# remove upgraded compute nodes
compute_nodes.reject!(&:upgraded?)
if compute_nodes.empty?
Rails.logger.info(
"All compute nodes of #{virt} type are already upgraded."
)
return
end
save_node_action("preparing compute nodes before the upgrade")
# This batch of actions can be executed in parallel for all compute nodes
begin
execute_scripts_and_wait_for_finish(
compute_nodes,
"/usr/sbin/crowbar-prepare-repositories.sh",
timeouts[:prepare_repositories]
)
Rails.logger.info("Repositories prepared successfully.")
if upgrade_mode == :non_disruptive
execute_scripts_and_wait_for_finish(
compute_nodes,
"/usr/sbin/crowbar-pre-upgrade.sh",
timeouts[:pre_upgrade]
)
Rails.logger.info("Services on compute nodes upgraded and prepared.")
end
rescue StandardError => e
raise_node_upgrade_error(
"Error while preparing services on compute nodes. " + e.message
)
end
prepare_remote_nodes if upgrade_mode == :non_disruptive
save_node_action("compute nodes prepared")
end
#
# compute nodes upgrade
#
def upgrade_all_compute_nodes
::Crowbar::UpgradeStatus.new.save_substep(:compute_nodes, :running)
type = upgrade_mode == :normal ? "*" : "kvm"
upgrade_compute_nodes type
end
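# Upgrade the given compute nodes in parallel: upgrade the packages, reboot the nodes
# and re-join them to crowbar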
def parallel_upgrade_compute_nodes(compute_nodes, controller = nil)
Rails.logger.info("Entering parallel upgrade of compute nodes, #{upgrade_mode} mode")
Rails.logger.info("Nodes for upgrade: #{compute_nodes.map(&:name).join(', ')}")
save_nodes_state(compute_nodes, "compute", "upgrading")
save_node_action("upgrading the packages")
begin
execute_scripts_and_wait_for_finish(
compute_nodes,
"/usr/sbin/crowbar-upgrade-os.sh",
timeouts[:upgrade_os]
)
Rails.logger.info("Packages upgraded successfully.")
rescue StandardError => e
raise_node_upgrade_error(
"Error while upgrading compute nodes. " + e.message
)
end
# reboot block
compute_nodes.each do |node|
next if node.upgraded?
node_api = Api::Node.new node.name
node_api.reboot
end
# wait block (we have to wait for each node to be back)
compute_nodes.each do |node|
next if node.upgraded?
node_api = Api::Node.new node.name
node_api.wait_after_reboot
end
# crowbar_join preparations
compute_nodes.each do |node|
next if node.upgraded?
node_api = Api::Node.new node.name
node_api.prepare_join
end
# Now, run time consuming crowbar_join action in parallel
save_node_action("upgrading configuration and re-joining the crowbar environment")
begin
execute_scripts_and_wait_for_finish(
compute_nodes,
"/usr/sbin/crowbar-chef-upgraded.sh",
timeouts[:chef_upgraded]
)
Rails.logger.info("Crowbar-join executed successfully on all nodes in this set.")
rescue StandardError => e
raise_node_upgrade_error(
"Error while running the initial chef-client. #{e.message} " \
"Important information might be found under /var/log/crowbar/crowbar_join/."
)
end
# post crowbar_join actions
compute_nodes.each do |node|
next if node.upgraded?
node_api = Api::Node.new node.name
node_api.post_join_cleanup
end
# enable nova-compute services for upgraded nodes.
unless controller.nil?
compute_nodes.each do |node|
next if node.upgraded?
enable_compute_service(controller, node)
end
end
# mark the finish states
compute_nodes.each do |node|
next if node.upgraded?
node_api = Api::Node.new node.name
node_api.save_node_state("compute", "upgraded")
end
save_nodes_state([], "", "") if upgrade_mode == :normal
end
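# Upgrade the compute nodes of the given virtualization type: in parallel in normal mode,
# one by one otherwise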
def upgrade_compute_nodes(virt)
Rails.logger.info("Upgrading #{virt} compute nodes... ")
compute_nodes = ::Node.find("roles:nova-compute-#{virt}")
if compute_nodes.empty?
Rails.logger.info("There are no compute nodes of #{virt} type.")
return
end
# remove upgraded compute nodes
compute_nodes.reject!(&:upgraded?)
if compute_nodes.empty?
Rails.logger.info(
"All compute nodes of #{virt} type are already upgraded."
)
return
end
return parallel_upgrade_compute_nodes(compute_nodes) if upgrade_mode == :normal
controller = fetch_nova_controller
# If there's a compute node which we already started to upgrade,
# (and the upgrade process was restarted due to the failure)
# continue with that one.
compute_nodes.sort_by! { |n| n.upgrading? ? 0 : 1 }
# This part must be done sequentially, only one compute node can be upgraded at a time
compute_nodes.each do |n|
upgrade_compute_node(controller, n)
end
end
# Non-disruptive way of upgrading more compute nodes at once.
# Does live-evacuation of each compute node first, then starts parallel upgrade.
# The argument is a list of node names.
def non_disruptive_upgrade_compute_nodes(names)
compute_nodes = names.map do |name|
::Node.find_node_by_name_or_alias(name)
end
# make sure we upgrade each node only once
compute_nodes.uniq!(&:name)
# remove nodes which were already upgraded
compute_nodes.reject!(&:upgraded?)
controller = fetch_nova_controller
# If there's a compute node which we already started to upgrade,
# (and the upgrade process was restarted due to the failure)
# continue with that one.
compute_nodes.sort_by! { |n| n.upgrading? ? 0 : 1 }
while compute_nodes.any?
# 1. Evacuate as many as possible, create a subset of nodes without instances.
nodes_to_upgrade = []
already_upgrading = false
compute_nodes.each do |node|
# if nodes are already in upgrading state, we can skip the live-migration part
already_upgrading = true if node.upgrading?
break if already_upgrading && !node.upgrading?
hostname = node[:hostname]
begin
live_evacuate_compute_node(controller, hostname) unless already_upgrading
nodes_to_upgrade << node
rescue StandardError => e
# We could safely ignore the error when the execution failed with a timeout,
# raise it otherwise.
failed_file = "/var/lib/crowbar/upgrade/crowbar-evacuate-host-failed"
raise e if controller.file_exist? failed_file
Rails.logger.info("attempt to live evacuate #{hostname} took too long, exiting loop")
# We need to enable the nova-compute service on the failed node again so its compute
# capacity can be used when upgrading the next batch of compute nodes
enable_compute_service(controller, node, true)
break
end
end
if nodes_to_upgrade.empty?
raise_node_upgrade_error(
"There was a problem during live evacuation of #{compute_nodes.first[:name]}. " \
"Cannot proceed with upgrade of compute nodes. " \
"Check /var/log/crowbar/production.log and nova logs for details."
)
end
compute_nodes -= nodes_to_upgrade
save_nodes_state(nodes_to_upgrade, "compute", "upgrading")
# 2. Use original parallel upgrade method for nodes that are free of instances.
parallel_upgrade_compute_nodes(nodes_to_upgrade, controller)
end
save_nodes_state([], "", "")
end
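# Find a nova-controller node that drives the compute node upgrade actions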
def fetch_nova_controller
controller = ::Node.find("roles:nova-controller").first
if controller.nil?
raise_node_upgrade_error(
"No node with 'nova-controller' role node was found. " \
"Cannot proceed with upgrade of compute nodes."
)
end
controller
end
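# Upgrade a single compute node identified by its name or alias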
def upgrade_one_compute_node(name)
controller = fetch_nova_controller
node = ::Node.find_node_by_name_or_alias(name)
if node.nil?
raise_node_upgrade_error(
"No node with '#{name}' name or alias was found. "
)
end
upgrade_compute_node(controller, node)
end
# After a compute node upgrade, the nova-compute service needs to be enabled
# so that the node's compute capacity can be used when live-migrating
# instances from other nodes.
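# With only_enable set to true, the pacemaker remote resources are left
# untouched; this is used when re-enabling a node after a failed
# live-evacuation attempt.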
def enable_compute_service(controller, node, only_enable = false)
hostname = node[:hostname]
controller.run_ssh_cmd(
"source /root/.openrc; " \
"openstack --insecure compute service set --enable #{hostname} nova-compute"
)
out = controller.run_ssh_cmd(
"source /root/.openrc; " \
"openstack --insecure compute service list --service nova-compute " \
"--host #{hostname} -f value -c Status",
"60s"
)
if out[:stdout].nil? || out[:stdout].chomp != "enabled"
raise_node_upgrade_error(
"Enabling nova-compute service for '#{hostname}' has failed. " \
"Check nova log files at '#{controller.name}' and '#{hostname}'."
)
end
return if only_enable
return unless node[:pacemaker] && node[:pacemaker][:is_remote]
start_remote_resources(controller, hostname)
end
# Fully upgrade one compute node
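# The sequence is: mark the node as upgrading, live-evacuate it (non-disruptive
# mode only), upgrade the OS, reboot, run the post-upgrade tasks, re-join chef,
# and finally re-enable its nova-compute service (again non-disruptive mode only).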
def upgrade_compute_node(controller, node)
return if node.upgraded?
node_api = Api::Node.new node.name
node_api.save_node_state("compute", "upgrading")
hostname = node[:hostname]
if node.ready_after_upgrade?
Rails.logger.info(
"Node #{node.name} is ready after the initial chef-client run."
)
else
live_evacuate_compute_node(controller, hostname) if upgrade_mode == :non_disruptive
node_api.os_upgrade
node_api.reboot_and_wait
node_api.post_upgrade
node_api.join_and_chef
end
if upgrade_mode == :normal
node_api.save_node_state("compute", "upgraded")
return
end
enable_compute_service(controller, node)
node_api.save_node_state("compute", "upgraded")
end
# Live-migrate all instances from the specified node
# to other available hosts.
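# This runs /usr/sbin/crowbar-evacuate-host.sh on the controller and waits
# up to the evacuate_host timeout for the script to finish.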
def live_evacuate_compute_node(controller, compute)
save_node_action("live-migrating nova instances from #{compute}")
controller.wait_for_script_to_finish(
"/usr/sbin/crowbar-evacuate-host.sh",
timeouts[:evacuate_host],
[compute],
true
)
Rails.logger.info(
"Migrating instances from node #{compute} was successful."
)
# Clean up the ok/failed state files, as we will likely need to
# run the script again on this node (to live-evacuate other compute nodes).
controller.delete_script_exit_files("/usr/sbin/crowbar-evacuate-host.sh")
rescue StandardError => e
raise_live_migration_error(
e.message +
"Check /var/log/crowbar/node-upgrade.log at #{controller.name} " \
"or nova-compute logs at #{compute} for details."
)
end
def save_node_action(action)
::Crowbar::UpgradeStatus.new.save_current_node_action(action)
end
# Save the state of multiple nodes being upgraded in parallel.
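# Each entry recorded via save_current_nodes looks roughly like this
# (values are illustrative):
#   { name: "compute-1.example.com", alias: "compute-1",
#     ip: "192.168.124.81", state: "upgrading", role: "compute" }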
def save_nodes_state(nodes, role, state)
status = ::Crowbar::UpgradeStatus.new
current_nodes = nodes.map do |node|
node.crowbar["node_upgrade_state"] = state
node.save
{
name: node.name,
alias: node.alias,
ip: node.public_ip,
state: state,
role: role
}
end
status.save_current_nodes current_nodes
end
# Take a list of nodes and execute the given script on each node in the background.
# Wait until the script finishes successfully on every node, or until an error
# or timeout is detected.
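# Illustrative call (the script path and timeout are hypothetical):
#   execute_scripts_and_wait_for_finish(nodes, "/usr/sbin/crowbar-prepare-step.sh", 600)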
def execute_scripts_and_wait_for_finish(nodes, script, seconds)
nodes.each do |node|
Rails.logger.info("Executing script '#{script}' at #{node.name}")
ssh_status = node.ssh_cmd(script).first
if ssh_status != 200
raise "Execution of script #{script} has failed on node #{node.name}."
end
end
scripts_status = {}
begin
Timeout.timeout(seconds) do
nodes.each do |node|
# wait until the script on this node finishes, then move on to check the next one
loop do
status = node.script_status(script)
scripts_status[node.name] = status
break if status != "running"
sleep 1
end
end
end
failed = scripts_status.select { |_, v| v == "failed" }.keys
unless failed.empty?
raise "Execution of script #{script} has failed at node(s) " \
"#{failed.join(", ")}. " \
"Check /var/log/crowbar/node-upgrade.log for details."
end
rescue Timeout::Error
running = scripts_status.select { |_, v| v == "running" }.keys
raise "Possible error during execution of #{script} at node(s) " \
"#{running.join(", ")}. " \
"Action did not finish after #{seconds} seconds."
end
end
# remove crowbar-ui package lock for future upgrades
def unlock_crowbar_ui_package
ui_unlock = run_cmd("sudo zypper-retry removelock 'crowbar-ui*'")
unless ui_unlock[:exit_code].zero?
raise_node_upgrade_error(
"Removing crowbar-ui package lock has failed. " \
"Please re-try current step to finalize the upgrade."
)
end
zypper_refresh = run_cmd("sudo zypper-retry -n refresh")
unless zypper_refresh[:exit_code].zero?
raise_node_upgrade_error(
"Refreshing repos for crowbar-ui update has failed. " \
"Please re-try current step to finalize the upgrade."
)
end
ui_update = run_cmd("sudo zypper-retry -n update crowbar-ui")
unless ui_update[:exit_code].zero?
raise_node_upgrade_error(
"Updating crowbar-ui package has failed. " \
"Please re-try current step to finalize the upgrade."
)
end
end
#
# prechecks helpers
#
# All of the error helpers below return a hash with the following schema:
# code: {
# data: ... whatever data type ...,
# help: String # "this is how you might fix the error"
# }
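# Illustrative result (the key and messages are hypothetical):
#   {
#     nodes_not_ready: {
#       data: "Nodes compute-1, compute-2 are not in ready state.",
#       help: "Make the nodes ready and re-run chef-client before upgrading."
#     }
#   }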
def sanity_check_errors(check)
{
network_checks: {
data: check,
help: I18n.t("api.upgrade.prechecks.network_checks.help.default")
}
}
end
def deployment_errors(check)
ret = {}
if check[:controller_roles]
ret[:controller_roles] = {
data: I18n.t("api.upgrade.prechecks.controller_roles.error",
node: check[:controller_roles][:node],
roles: check[:controller_roles][:roles]),
help: I18n.t("api.upgrade.prechecks.controller_roles.help")
}
end
if check[:wrong_sql_engine]
ret[:wrong_sql_engine] = {
data: I18n.t("api.upgrade.prechecks.wrong_sql_engine.error"),
help: I18n.t("api.upgrade.prechecks.wrong_sql_engine.help")
}
end
ret
end
def health_check_errors(check)
ret = {}
if check[:nodes_not_ready]
ret[:nodes_not_ready] = {
data: I18n.t("api.upgrade.prechecks.not_ready.error",
nodes: check[:nodes_not_ready].join(", ")),
help: I18n.t("api.upgrade.prechecks.not_ready.help")
}
end
if check[:failed_proposals]
ret[:failed_proposals] = {
data: I18n.t("api.upgrade.prechecks.failed_proposals.error",
proposals: check[:failed_proposals].join(", ")),
help: I18n.t("api.upgrade.prechecks.failed_proposals.help")
}
end
ret
end
def maintenance_updates_check_errors(check)
{
maintenance_updates_installed: {
data: check[:errors],
help: I18n.t("api.upgrade.prechecks.maintenance_updates_check.help.default")
}
}
end
def ceph_errors(check)
ret = {}
if check[:crowbar_ceph_nodes]
ret[:crowbar_ceph_nodes] = {
data: I18n.t("api.upgrade.prechecks.crowbar_ceph_present.error"),
help: I18n.t("api.upgrade.prechecks.crowbar_ceph_present.help")
}
end
ret
end
def ha_config_errors(check)
ret = {}
if check[:errors]
ret[:ha_configured] = {
data: check[:errors],
help: I18n.t("api.upgrade.prechecks.ha_configured.help.default")
}
end
if check[:cinder_wrong_backend]
ret[:cinder_wrong_backend] = {
data: I18n.t("api.upgrade.prechecks.cinder_wrong_backend.error"),
help: I18n.t("api.upgrade.prechecks.cinder_wrong_backend.help")
}
end
if check[:roles_not_ha]
ret[:roles_not_ha] = {
data: I18n.t("api.upgrade.prechecks.roles_not_ha.error",
roles: check[:roles_not_ha].join(", ")),
help: I18n.t("api.upgrade.prechecks.roles_not_ha.help")
}
end
if check[:role_conflicts]
nodes = check[:role_conflicts].map do |node, roles|
"#{node}: " + roles.join(", ")
end
ret[:role_conflicts] = {
data: I18n.t("api.upgrade.prechecks.role_conflicts.error",
nodes: nodes.join("\n")),
help: I18n.t("api.upgrade.prechecks.role_conflicts.help")
}
end
if check[:unsupported_cluster_setup]
ret[:unsupported_cluster_setup] = {
data: I18n.t("api.upgrade.prechecks.unsupported_cluster_setup.error"),
help: I18n.t("api.upgrade.prechecks.unsupported_cluster_setup.help")
}
end
ret
end
def clusters_health_report_errors(check)
ret = {}
crm_failures = check["crm_failures"]
failed_actions = check["failed_actions"]
unready_nodes = check["unready_nodes"]
if crm_failures
ret[:clusters_health_crm_failures] = {
data: crm_failures.values,
help: I18n.t("api.upgrade.prechecks.clusters_health.crm_failures")
}
end
if failed_actions
ret[:clusters_health_failed_actions] = {
data: failed_actions.values,
help: I18n.t("api.upgrade.prechecks.clusters_health.failed_actions")
}
end
if unready_nodes
ret[:clusters_health_unready_nodes] = {
data: unready_nodes.values,
help: I18n.t("api.upgrade.prechecks.clusters_health.unready_nodes")
}
end
ret
end
def compute_status_errors(check)
ret = {}
if check[:no_resources]
ret[:no_resources] = {
data: check[:no_resources],
help: I18n.t("api.upgrade.prechecks.no_resources.help")
}
end
if check[:non_kvm_computes]
ret[:non_kvm_computes] = {
data: I18n.t("api.upgrade.prechecks.non_kvm_computes.error",
nodes: check[:non_kvm_computes].join(", ")),
help: I18n.t("api.upgrade.prechecks.non_kvm_computes.help")
}
end
if check[:no_live_migration]
ret[:no_live_migration] = {
data: I18n.t("api.upgrade.prechecks.no_live_migration.error"),
help: I18n.t("api.upgrade.prechecks.no_resources.help")
}
end
ret
end
#
# prepare upgrade helpers
#
def prepare_nodes_for_crowbar_upgrade_background
@thread = Thread.new do
Rails.logger.debug("Started prepare in a background thread")
prepare_nodes_for_crowbar_upgrade
end
@thread.alive?
end
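# Prepare all nodes for the crowbar (admin server) upgrade, disable all
# provisioner repositories, and end the current upgrade step on success.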
def prepare_nodes_for_crowbar_upgrade
crowbar_service = CrowbarService.new
crowbar_service.prepare_nodes_for_crowbar_upgrade
provisioner_service = ProvisionerService.new
provisioner_service.disable_all_repositories
::Crowbar::UpgradeStatus.new.end_step
true
rescue => e
message = e.message
::Crowbar::UpgradeStatus.new.end_step(
false,
prepare: {
data: message,
help: "Check /var/log/crowbar/production.log on admin server."
}
)
Rails.logger.error message
false
end
#
# repocheck helpers
#
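# Check whether a product with the given name and version is present in the
# given product list.
# Illustrative call (the product name and version are hypothetical):
#   repo_version_available?(products, "suse-openstack-cloud", "8")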
def repo_version_available?(products, product, version)
products.any? do |p|
p["version"] == version && p["name"] == product
end
end
def admin_architecture
::Node.admin_node.architecture
end
#
# openstackbackup helpers
#
def openstack_db_node
db_node = ::Node.find("roles:database-config-default").first
if db_node.nil?
Rails.logger.warn("No node with role 'database-config-default' found")
return nil
end
db_node
end
#
# general helpers
#
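# Run a local command and capture its combined stdout/stderr along with the
# exit code, e.g. (illustrative):
#   run_cmd("sudo zypper-retry -n refresh")
#   # => { stdout_and_stderr: "...", exit_code: 0 }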
def run_cmd(*args)
Open3.popen2e(*args) do |_stdin, stdout_and_stderr, wait_thr|
{
stdout_and_stderr: stdout_and_stderr.gets(nil),
exit_code: wait_thr.value.exitstatus
}
end
end
end
end
end