# chef/cookbooks/rabbitmq/recipes/ha_cluster.rb
# Copyright 2017 SUSE
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Names shared by the pacemaker resources declared further down in this recipe.
service_name = "rabbitmq"
ms_name = "ms-#{service_name}"
agent_name = "ocf:rabbitmq:rabbitmq-server-ha"
pid_file = "/var/run/rabbitmq/pid"

# Render the file that the OCF resource agent sources on promote; the only
# value handed to the template is the configured cluster name.
template "/etc/rabbitmq/ocf-promote" do
  source "ocf-promote.erb"
  variables clustername: node[:rabbitmq][:clustername]
  mode 0o644
  group "root"
  owner "root"
end
# Wait (up to 360 seconds overall) until the master/slave rabbitmq resource is
# usable from this node. The checks run in order:
#   1. skip everything if the node still carries the pre-upgrade attribute
#   2. the resource has a master somewhere in the cluster
#   3. the resource is running on this node
#   4. the rabbit application is reported as running 5 times in a row
#   5. the resource has no pending pacemaker operations
# The block is declared with action :nothing, so it only runs when notified.
# Raises a RuntimeError (after logging it as fatal) when the timeout expires.
ruby_block "wait for #{ms_name} to be started" do
  block do
    require "timeout"

    retry_delay = 2
    show_resource = "crm resource show #{ms_name} 2> /dev/null "

    begin
      Timeout.timeout(360) do
        # Do not check if rabbitmq is running when it is not supposed to run.
        # The pre-upgrade attribute is set to true while the upgrade of the node
        # has not been finished yet. In that case services cannot start on the
        # node (there's a location constraint that prevents that).
        # See disable_pre_upgrade_attribute_for method in models/api/node.rb for more info
        upgrade_pending = "crm_attribute --node #{node[:hostname]} --name pre-upgrade --query --quiet"
        upgrade_pending << "| grep -q true"
        break if ::Kernel.system(upgrade_pending)

        # Check that the service has a master
        has_master = show_resource + "| grep \"is running on\" | grep -q \"Master\""
        until ::Kernel.system(has_master)
          Chef::Log.info("#{ms_name} still without master")
          sleep(retry_delay)
        end

        # Check that the service is running on this node. The hostname must be
        # followed by a space or the end of the line so that a hostname which is
        # a prefix of another one (node1 / node10) is not matched by mistake.
        runs_locally = show_resource +
          "| grep -qE 'is running on: #{node[:hostname]}( |$)'"
        until ::Kernel.system(runs_locally)
          Chef::Log.info("#{ms_name} still not running locally")
          sleep(retry_delay)
        end

        # The sed command grabs everything between '{running_applications'
        # and ']}', and what we want is that the rabbit application is
        # running.
        # The check has to succeed 5 times in a row, so that we do not continue
        # while rabbit is not stable enough yet; any failure resets the counter.
        rabbit_app_running = "rabbitmqctl -q status 2> /dev/null "
        rabbit_app_running << "| sed -n '/{running_applications/,/\]}/p' | grep -q '{rabbit,'"
        consecutive_ok = 0
        until consecutive_ok == 5
          if ::Kernel.system(rabbit_app_running)
            consecutive_ok += 1
          else
            Chef::Log.info("rabbit application not (stably) running yet")
            consecutive_ok = 0
          end
          # Pause after failures as well, so that a stopped rabbit does not make
          # us spawn rabbitmqctl back to back until the timeout hits.
          sleep(retry_delay)
        end

        # Check that we don't have any pending pacemaker resource operations
        has_pending_ops = "crm resource operations #{ms_name} 2> /dev/null "
        has_pending_ops << "| grep -q \"pending\""
        while ::Kernel.system(has_pending_ops)
          Chef::Log.info("resource #{ms_name} still has pending operations")
          sleep(retry_delay)
        end
      end
    rescue Timeout::Error
      message = "The #{ms_name} pacemaker resource is not started or doesn't have a master yet."
      message << " Please manually check for an error."
      Chef::Log.fatal(message)
      raise message
    end
  end
  action :nothing
end
# Synchronization point: every node has to get here before the pacemaker
# resources are created, so that all of them already have the required
# packages installed.
crowbar_pacemaker_sync_mark("sync-rabbitmq_before_ha")

crowbar_pacemaker_sync_mark("wait-rabbitmq_ha_resources") { timeout 300 }
# Collects the CIB objects that get committed together by the
# "rabbitmq service" transaction.
transaction_objects = []

# Primitive driven by the rabbitmq-server-ha OCF agent; it is only updated on
# the cluster founder.
pacemaker_primitive service_name do
  clustered_features = node[:rabbitmq][:ha][:clustered_rmq_features]

  agent agent_name
  # nodename is empty so that we explicitly depend on the config files
  params({
    "erlang_cookie" => node[:rabbitmq][:erlang_cookie],
    "pid_file" => pid_file,
    "policy_file" => "/etc/rabbitmq/ocf-promote",
    "rmq_feature_health_check" => clustered_features,
    "rmq_feature_local_list_queues" => clustered_features,
    "default_vhost" => node[:rabbitmq][:vhost]
  })
  op node[:rabbitmq][:ha][:clustered_op]
  meta({
    "migration-threshold" => "10",
    "failure-timeout" => "30s",
    "resource-stickiness" => "100"
  })
  action :update
  only_if { CrowbarPacemakerHelper.is_cluster_founder?(node) }
end
transaction_objects << "pacemaker_primitive[#{service_name}]"
# Master/slave wrapper around the rabbitmq primitive, with one clone per
# cluster node and a single master.
# No location on the role here: the ms resource will have this constraint.
pacemaker_ms ms_name do
  clone_count = CrowbarPacemakerHelper.cluster_nodes(node).size

  rsc service_name
  meta({
    "master-max" => "1",
    "clone-max" => clone_count,
    "master-node-max" => "1",
    "ordered" => "false",
    "interleave" => "false",
    "notify" => "true"
  })
  action :update
  only_if { CrowbarPacemakerHelper.is_cluster_founder?(node) }
end
transaction_objects << "pacemaker_ms[#{ms_name}]"
# Location constraint for the ms resource; the helper returns the name of the
# pacemaker_location resource, which joins the transaction as well.
ms_location_name = openstack_pacemaker_controller_only_location_for(ms_name)
transaction_objects << "pacemaker_location[#{ms_location_name}]"

# Commit primitive, ms resource and location in one go on the cluster founder,
# then immediately trigger the wait block declared earlier in this recipe.
pacemaker_transaction "rabbitmq service" do
  cib_objects transaction_objects
  # note that this will also automatically start the resources
  action :commit_new
  notifies :create, resources(ruby_block: "wait for #{ms_name} to be started"), :immediately
  only_if { CrowbarPacemakerHelper.is_cluster_founder?(node) }
end

crowbar_pacemaker_sync_mark("create-rabbitmq_ha_resources")
# Names of the port-blocker primitive, its clone and its location constraint.
name = "rabbitmq-port-blocker"
clone_name = "cl-#{name}"
location_name = "l-#{name}-controller"

# Operation and parameters used for the ClusterMon primitive.
clustermon_op = { "monitor" => [{ "interval" => "10s" }] }
clustermon_params = { "extra_options" => "-E /usr/bin/rabbitmq-alert-handler.sh --watch-fencing" }

node_upgrading = CrowbarPacemakerHelper.being_upgraded?(node)

# Shell checks that succeed while the clone / the primitive is reported as
# running somewhere.
clone_running = %(crm resource show #{clone_name} | grep -q "is running on:")
primitive_running = %(crm resource show #{name} | grep -q "is running on:")

# Ports handed over to the port-blocker script template.
port = node[:rabbitmq][:port]
ssl_port = node[:rabbitmq][:ssl][:port]

crowbar_pacemaker_sync_mark("wait-rabbitmq_alert_resources")
# Alert handler / port blocker setup.
# With more than two cluster nodes, and unless this node is being upgraded, the
# two scripts are deployed and a cloned ClusterMon primitive (plus its location
# constraint) is created on the cluster founder. In every other case those
# resources, the scripts and a leftover iptables rule are removed again.
rabbitmq_cluster_nodes = CrowbarPacemakerHelper.cluster_nodes(node)

if rabbitmq_cluster_nodes.size > 2 && !node_upgrading
  template "/usr/bin/rabbitmq-alert-handler.sh" do
    source "rabbitmq-alert-handler.erb"
    owner "root"
    group "root"
    mode "0755"
    variables(node: node, nodes: rabbitmq_cluster_nodes)
  end

  template "/usr/bin/#{name}.sh" do
    source "#{name}.erb"
    owner "root"
    group "root"
    mode "0755"
    variables(total_nodes: rabbitmq_cluster_nodes.size,
              port: port, ssl_port: ssl_port)
  end

  pacemaker_primitive name do
    agent "ocf:pacemaker:ClusterMon"
    op clustermon_op
    params clustermon_params
    action :update
    only_if { CrowbarPacemakerHelper.is_cluster_founder?(node) }
  end

  pacemaker_clone clone_name do
    rsc name
    meta CrowbarPacemakerHelper.clone_meta(node)
    action :update
    only_if { CrowbarPacemakerHelper.is_cluster_founder?(node) }
  end

  pacemaker_location location_name do
    definition OpenStackHAHelper.controller_only_location(location_name, clone_name)
    action :update
    only_if { CrowbarPacemakerHelper.is_cluster_founder?(node) }
  end

  pacemaker_transaction name do
    cib_objects [
      "pacemaker_primitive[#{name}]",
      "pacemaker_clone[#{clone_name}]",
      "pacemaker_location[#{location_name}]"
    ]
    # note that this will also automatically start the resources
    action :commit_new
    only_if { CrowbarPacemakerHelper.is_cluster_founder?(node) }
  end
else
  pacemaker_location location_name do
    definition OpenStackHAHelper.controller_only_location(location_name, clone_name)
    action :delete
    only_if { CrowbarPacemakerHelper.is_cluster_founder?(node) }
  end

  # Stop, then delete the clone; both only on the founder and only while the
  # clone is reported as running.
  pacemaker_clone "#{clone_name}_stop" do
    name clone_name
    rsc name
    meta CrowbarPacemakerHelper.clone_meta(node)
    action :stop
    only_if do
      running = system(clone_running, err: File::NULL)
      CrowbarPacemakerHelper.is_cluster_founder?(node) && running
    end
  end

  # NOTE(review): this guard is evaluated after the stop resource above has been
  # processed; if the clone is already reported as stopped by then, the delete
  # is skipped. Confirm that this is the intended behaviour.
  pacemaker_clone "#{clone_name}_delete" do
    name clone_name
    rsc name
    meta CrowbarPacemakerHelper.clone_meta(node)
    action :delete
    only_if do
      running = system(clone_running, err: File::NULL)
      CrowbarPacemakerHelper.is_cluster_founder?(node) && running
    end
  end

  # Same stop / delete sequence for the ClusterMon primitive itself.
  pacemaker_primitive "#{name}_stop" do
    agent "ocf:pacemaker:ClusterMon"
    name name
    op clustermon_op
    params clustermon_params
    action :stop
    only_if do
      running = system(primitive_running, err: File::NULL)
      CrowbarPacemakerHelper.is_cluster_founder?(node) && running
    end
  end

  # NOTE(review): same remark as for the clone, the guard runs after the stop.
  pacemaker_primitive "#{name}_delete" do
    agent "ocf:pacemaker:ClusterMon"
    name name
    op clustermon_op
    params clustermon_params
    action :delete
    only_if do
      running = system(primitive_running, err: File::NULL)
      CrowbarPacemakerHelper.is_cluster_founder?(node) && running
    end
  end

  file "/usr/bin/rabbitmq-alert-handler.sh" do
    action :delete
  end

  file "/usr/bin/#{name}.sh" do
    action :delete
  end

  # In case the script was already deployed and its rule is still stored, clean
  # it up so that nothing is left behind. The rule is looked up for the
  # configured rabbitmq port rather than a hard-coded 5672.
  # NOTE(review): assumes the port-blocker script installs its rule for the
  # port passed to its template, confirm against rabbitmq-port-blocker.erb.
  # ssl_port is handed to that template too, but no cleanup exists for it here.
  bash "Remove existent rabbitmq blocking rules" do
    code "iptables -D INPUT -p tcp --destination-port #{port} "\
         "-m comment --comment \"rabbitmq port blocker (no quorum)\" -j DROP"
    only_if do
      # check for the rule; -q keeps the matched line out of the chef output
      rule = "tcp dpt:#{port} /* rabbitmq port blocker (no quorum) */"
      system("iptables -L -n | grep -qF \"#{rule}\"")
    end
  end
end

crowbar_pacemaker_sync_mark "create-rabbitmq_alert_resources"