[Asterisk-code-review] res_corosync: Fix crash in huge distributed environment. (asterisk[13])

Università di Bologna - CESIA VoIP asteriskteam at digium.com
Wed Jun 3 07:01:49 CDT 2020


Università di Bologna - CESIA VoIP has uploaded this change for review. ( https://gerrit.asterisk.org/c/asterisk/+/14455 )


Change subject: res_corosync: Fix crash in huge distributed environment.
......................................................................

res_corosync: Fix crash in huge distributed environment.

1) Fix memory leaks
   Added code to release the ast_event structures extracted from corosync and
   stasis messages.

2) Clean the stasis cache when a member of the corosync cluster leaves the group
   Added code that removes, from the stasis cache of the members remaining in
   the group, all the messages carrying the EID of the member that left.
   If the device states of the departed member remain in the stasis cache of
   the other members, they will never be updated again, and high-priority
   cached values, like BUSY, will take precedence over current device states.

3) Stop corosync event propagation when the node is not joined to the group
   Updated the dispatch_thread_handler code to detect when Asterisk is not
   joined to the corosync group, and added a condition in
   publish_event_to_corosync so that corosync messages are sent only when the
   node is joined.
   When a node is not joined, its corosync daemon cannot send messages:
   the cpg_mcast_joined function appends new messages to the FIFO buffer until
   it is full and then blocks indefinitely.
   In this scenario, if the stasis_message_cb callback (registered by
   res_corosync to handle stasis messages) tries to send a corosync message,
   the stasis thread-pool thread running it stays blocked until the node joins
   the corosync cluster.

ASTERISK-28888
Reported by: Università di Bologna - CESIA VoIP

Change-Id: Ie8e99bc23f141a73c13ae6fb1948d148d4de17f2
---
M res/res_corosync.c
1 file changed, 533 insertions(+), 275 deletions(-)



  git pull ssh://gerrit.asterisk.org:29418/asterisk refs/changes/55/14455/1

diff --git a/res/res_corosync.c b/res/res_corosync.c
index 4e96bff..04b13c9 100644
--- a/res/res_corosync.c
+++ b/res/res_corosync.c
@@ -48,13 +48,21 @@
 #include "asterisk/stasis.h"
 #include "asterisk/stasis_message_router.h"
 #include "asterisk/stasis_system.h"
+#include "asterisk/taskprocessor.h"
 
 AST_RWLOCK_DEFINE_STATIC(event_types_lock);
+AST_RWLOCK_DEFINE_STATIC(init_cpg_lock);
+
+/*! \brief Timeout for Corosync's poll process */
+#define COROSYNC_POLL_TIMEOUT				(10 * 1000)
 
 static void publish_mwi_to_stasis(struct ast_event *event);
 static void publish_device_state_to_stasis(struct ast_event *event);
 static void publish_cluster_discovery_to_stasis(struct ast_event *event);
 
+/*! \brief Join to corosync */
+static int corosync_node_joined = 0;
+
 /*! \brief All the nodes that we're aware of */
 static struct ao2_container *nodes;
 
@@ -97,9 +105,11 @@
 		return NULL;
 	}
 
-	memcpy(&node->eid, (struct ast_eid *)ast_event_get_ie_raw(event, AST_EVENT_IE_EID), sizeof(node->eid));
+	memcpy(&node->eid, (struct ast_eid *) ast_event_get_ie_raw(event, AST_EVENT_IE_EID),
+		   sizeof(node->eid));
 	node->id = ast_event_get_ie_uint(event, AST_EVENT_IE_NODE_ID);
-	ast_sockaddr_parse(&node->addr, ast_event_get_ie_str(event, AST_EVENT_IE_LOCAL_ADDR), PARSE_PORT_IGNORE);
+	ast_sockaddr_parse(&node->addr, ast_event_get_ie_str(event, AST_EVENT_IE_LOCAL_ADDR),
+					   PARSE_PORT_IGNORE);
 
 	return node;
 }
@@ -182,23 +192,24 @@
 		return NULL;
 	}
 
-	event_eid = (struct ast_eid *)ast_event_get_ie_raw(payload->event, AST_EVENT_IE_EID);
+	event_eid = (struct ast_eid *) ast_event_get_ie_raw(payload->event, AST_EVENT_IE_EID);
 
 	event = ast_event_new(AST_EVENT_PING,
-				AST_EVENT_IE_EID, AST_EVENT_IE_PLTYPE_RAW, event_eid, sizeof(*event_eid),
-				AST_EVENT_IE_END);
+						  AST_EVENT_IE_EID, AST_EVENT_IE_PLTYPE_RAW, event_eid,
+						  sizeof(*event_eid), AST_EVENT_IE_END);
 
 	return event;
 }
 
-STASIS_MESSAGE_TYPE_DEFN_LOCAL(corosync_ping_message_type,
-	.to_event = corosync_ping_to_event, );
+STASIS_MESSAGE_TYPE_DEFN_LOCAL(corosync_ping_message_type,.to_event =
+							   corosync_ping_to_event,);
 
 /*! \brief Publish a Corosync ping to \ref stasis */
 static void publish_corosync_ping_to_stasis(struct ast_event *event)
 {
 	struct corosync_ping_payload *payload;
 	struct stasis_message *message;
+	struct ast_eid *event_eid;
 
 	ast_assert(ast_event_get_type(event) == AST_EVENT_PING);
 	ast_assert(event != NULL);
@@ -207,11 +218,15 @@
 		return;
 	}
 
-	payload = ao2_t_alloc(sizeof(*payload), corosync_ping_payload_dtor, "Create ping payload");
+	payload =
+		ao2_t_alloc(sizeof(*payload), corosync_ping_payload_dtor, "Create ping payload");
 	if (!payload) {
 		return;
 	}
-	payload->event = event;
+	event_eid = (struct ast_eid *) ast_event_get_ie_raw(event, AST_EVENT_IE_EID);
+	payload->event = ast_event_new(AST_EVENT_PING,
+								   AST_EVENT_IE_EID, AST_EVENT_IE_PLTYPE_RAW, event_eid,
+								   sizeof(*event_eid), AST_EVENT_IE_END);
 
 	message = stasis_message_create(corosync_ping_message_type(), payload);
 	if (!message) {
@@ -232,33 +247,35 @@
 	unsigned char publish_default;
 	unsigned char subscribe;
 	unsigned char subscribe_default;
-	struct stasis_topic *(* topic_fn)(void);
-	struct stasis_cache *(* cache_fn)(void);
-	struct stasis_message_type *(* message_type_fn)(void);
-	void (* publish_to_stasis)(struct ast_event *);
+	struct stasis_topic *(*topic_fn) (void);
+	struct stasis_cache *(*cache_fn) (void);
+	struct stasis_message_type *(*message_type_fn) (void);
+	void (*publish_to_stasis)(struct ast_event *);
 } event_types[] = {
-	[AST_EVENT_MWI] = { .name = "mwi",
-	                    .topic_fn = ast_mwi_topic_all,
-	                    .cache_fn = ast_mwi_state_cache,
-	                    .message_type_fn = ast_mwi_state_type,
-	                    .publish_to_stasis = publish_mwi_to_stasis, },
-	[AST_EVENT_DEVICE_STATE_CHANGE] = { .name = "device_state",
-	                                    .topic_fn = ast_device_state_topic_all,
-	                                    .cache_fn = ast_device_state_cache,
-	                                    .message_type_fn = ast_device_state_message_type,
-	                                    .publish_to_stasis = publish_device_state_to_stasis, },
-	[AST_EVENT_PING] = { .name = "ping",
-	                     .publish_default = 1,
-	                     .subscribe_default = 1,
-	                     .topic_fn = corosync_topic,
-	                     .message_type_fn = corosync_ping_message_type,
-	                     .publish_to_stasis = publish_corosync_ping_to_stasis, },
-	[AST_EVENT_CLUSTER_DISCOVERY] = { .name = "cluster_discovery",
-	                                  .publish_default = 1,
-	                                  .subscribe_default = 1,
-	                                  .topic_fn = ast_system_topic,
-	                                  .message_type_fn = ast_cluster_discovery_type,
-	                                  .publish_to_stasis = publish_cluster_discovery_to_stasis, },
+	[AST_EVENT_MWI] = {.name = "mwi",
+					   .topic_fn = ast_mwi_topic_all,
+					   .cache_fn = ast_mwi_state_cache,
+					   .message_type_fn = ast_mwi_state_type,
+					   .publish_to_stasis = publish_mwi_to_stasis,},
+	[AST_EVENT_DEVICE_STATE_CHANGE] = {.name = "device_state",
+									   .topic_fn = ast_device_state_topic_all,
+									   .cache_fn = ast_device_state_cache,
+									   .message_type_fn = ast_device_state_message_type,
+									   .publish_to_stasis =
+									   publish_device_state_to_stasis,},
+	[AST_EVENT_PING] = {.name = "ping",
+						.publish_default = 1,
+						.subscribe_default = 1,
+						.topic_fn = corosync_topic,
+						.message_type_fn = corosync_ping_message_type,
+						.publish_to_stasis = publish_corosync_ping_to_stasis,},
+	[AST_EVENT_CLUSTER_DISCOVERY] = {.name = "cluster_discovery",
+									 .publish_default = 1,
+									 .subscribe_default = 1,
+									 .topic_fn = ast_system_topic,
+									 .message_type_fn = ast_cluster_discovery_type,
+									 .publish_to_stasis =
+									 publish_cluster_discovery_to_stasis,},
 };
 
 static struct {
@@ -267,20 +284,19 @@
 	unsigned int stop:1;
 } dispatch_thread = {
 	.id = AST_PTHREADT_NULL,
-	.alert_pipe = { -1, -1 },
+	.alert_pipe = {-1, -1},
 };
 
 static cpg_handle_t cpg_handle;
 static corosync_cfg_handle_t cfg_handle;
 
 #ifdef HAVE_COROSYNC_CFG_STATE_TRACK
-static void cfg_state_track_cb(
-		corosync_cfg_state_notification_buffer_t *notification_buffer,
-		cs_error_t error);
+static void cfg_state_track_cb(corosync_cfg_state_notification_buffer_t *
+							   notification_buffer, cs_error_t error);
 #endif /* HAVE_COROSYNC_CFG_STATE_TRACK */
 
 static void cfg_shutdown_cb(corosync_cfg_handle_t cfg_handle,
-		corosync_cfg_shutdown_flags_t flags);
+							corosync_cfg_shutdown_flags_t flags);
 
 static corosync_cfg_callbacks_t cfg_callbacks = {
 #ifdef HAVE_COROSYNC_CFG_STATE_TRACK
@@ -290,7 +306,8 @@
 };
 
 /*! \brief Publish cluster discovery to \ref stasis */
-static void publish_cluster_discovery_to_stasis_full(struct corosync_node *node, int joined)
+static void publish_cluster_discovery_to_stasis_full(struct corosync_node *node,
+													 int joined)
 {
 	struct ast_json *json;
 	struct ast_json_payload *payload;
@@ -302,16 +319,11 @@
 	addr = ast_sockaddr_stringify_addr(&node->addr);
 
 	ast_log(AST_LOG_NOTICE, "Node %u (%s) at %s %s the cluster\n",
-		node->id,
-		eid,
-		addr,
-		joined ? "joined" : "left");
+			node->id, eid, addr, joined ? "joined" : "left");
 
 	json = ast_json_pack("{s: s, s: i, s: s, s: i}",
-		"address", addr,
-		"node_id", node->id,
-		"eid", eid,
-		"joined", joined);
+						 "address", addr,
+						 "node_id", node->id, "eid", eid, "joined", joined);
 	if (!json) {
 		return;
 	}
@@ -346,8 +358,8 @@
 
 	ast_assert(ast_event_get_type(event) == AST_EVENT_CLUSTER_DISCOVERY);
 
-	event_eid = (struct ast_eid *)ast_event_get_ie_raw(event, AST_EVENT_IE_EID);
-	if (!ast_eid_cmp(&ast_eid_default, event_eid)) {
+	event_eid = (struct ast_eid *) ast_event_get_ie_raw(event, AST_EVENT_IE_EID);
+	if (!event_eid || !ast_eid_cmp(&ast_eid_default, event_eid)) {
 		/* Don't feed events back in that originated locally. */
 		return;
 	}
@@ -395,7 +407,7 @@
 	context = ast_event_get_ie_str(event, AST_EVENT_IE_CONTEXT);
 	new_msgs = ast_event_get_ie_uint(event, AST_EVENT_IE_NEWMSGS);
 	old_msgs = ast_event_get_ie_uint(event, AST_EVENT_IE_OLDMSGS);
-	event_eid = (struct ast_eid *)ast_event_get_ie_raw(event, AST_EVENT_IE_EID);
+	event_eid = (struct ast_eid *) ast_event_get_ie_raw(event, AST_EVENT_IE_EID);
 
 	if (ast_strlen_zero(mailbox) || ast_strlen_zero(context)) {
 		return;
@@ -409,12 +421,12 @@
 		old_msgs = INT_MAX;
 	}
 
-	if (ast_publish_mwi_state_full(mailbox, context, (int)new_msgs,
-	                               (int)old_msgs, NULL, event_eid)) {
+	if (ast_publish_mwi_state_full(mailbox, context, (int) new_msgs,
+								   (int) old_msgs, NULL, event_eid)) {
 		char eid[18];
 		ast_eid_to_str(eid, sizeof(eid), event_eid);
 		ast_log(LOG_WARNING, "Failed to publish MWI message for %s@%s from %s\n",
-			mailbox, context, eid);
+				mailbox, context, eid);
 	}
 }
 
@@ -431,7 +443,7 @@
 	device = ast_event_get_ie_str(event, AST_EVENT_IE_DEVICE);
 	state = ast_event_get_ie_uint(event, AST_EVENT_IE_STATE);
 	cachable = ast_event_get_ie_uint(event, AST_EVENT_IE_CACHABLE);
-	event_eid = (struct ast_eid *)ast_event_get_ie_raw(event, AST_EVENT_IE_EID);
+	event_eid = (struct ast_eid *) ast_event_get_ie_raw(event, AST_EVENT_IE_EID);
 
 	if (ast_strlen_zero(device)) {
 		return;
@@ -441,17 +453,19 @@
 		char eid[18];
 		ast_eid_to_str(eid, sizeof(eid), event_eid);
 		ast_log(LOG_WARNING, "Failed to publish device state message for %s from %s\n",
-			device, eid);
+				device, eid);
 	}
 }
 
 static void cpg_deliver_cb(cpg_handle_t handle, const struct cpg_name *group_name,
-		uint32_t nodeid, uint32_t pid, void *msg, size_t msg_len);
+						   uint32_t nodeid, uint32_t pid, void *msg, size_t msg_len);
 
 static void cpg_confchg_cb(cpg_handle_t handle, const struct cpg_name *group_name,
-		const struct cpg_address *member_list, size_t member_list_entries,
-		const struct cpg_address *left_list, size_t left_list_entries,
-		const struct cpg_address *joined_list, size_t joined_list_entries);
+						   const struct cpg_address *member_list,
+						   size_t member_list_entries,
+						   const struct cpg_address *left_list, size_t left_list_entries,
+						   const struct cpg_address *joined_list,
+						   size_t joined_list_entries);
 
 static cpg_callbacks_t cpg_callbacks = {
 	.cpg_deliver_fn = cpg_deliver_cb,
@@ -459,33 +473,33 @@
 };
 
 #ifdef HAVE_COROSYNC_CFG_STATE_TRACK
-static void cfg_state_track_cb(
-		corosync_cfg_state_notification_buffer_t *notification_buffer,
-		cs_error_t error)
+static void cfg_state_track_cb(corosync_cfg_state_notification_buffer_t *
+							   notification_buffer, cs_error_t error)
 {
 }
 #endif /* HAVE_COROSYNC_CFG_STATE_TRACK */
 
 static void cfg_shutdown_cb(corosync_cfg_handle_t cfg_handle,
-		corosync_cfg_shutdown_flags_t flags)
+							corosync_cfg_shutdown_flags_t flags)
 {
 }
 
 static void cpg_deliver_cb(cpg_handle_t handle, const struct cpg_name *group_name,
-		uint32_t nodeid, uint32_t pid, void *msg, size_t msg_len)
+						   uint32_t nodeid, uint32_t pid, void *msg, size_t msg_len)
 {
 	struct ast_event *event;
 	void (*publish_handler)(struct ast_event *) = NULL;
 	enum ast_event_type event_type;
+	struct ast_eid *event_eid;
 
 	if (msg_len < ast_event_minimum_length()) {
 		ast_debug(1, "Ignoring event that's too small. %u < %u\n",
-			(unsigned int) msg_len,
-			(unsigned int) ast_event_minimum_length());
+				  (unsigned int) msg_len, (unsigned int) ast_event_minimum_length());
 		return;
 	}
 
-	if (!ast_eid_cmp(&ast_eid_default, ast_event_get_ie_raw(msg, AST_EVENT_IE_EID))) {
+	event_eid = (struct ast_eid *) ast_event_get_ie_raw(msg, AST_EVENT_IE_EID);
+	if (!event_eid || !ast_eid_cmp(&ast_eid_default, event_eid)) {
 		/* Don't feed events back in that originated locally. */
 		return;
 	}
@@ -497,14 +511,17 @@
 	}
 
 	ast_rwlock_rdlock(&event_types_lock);
+	ast_debug(5, "cpg_deliver_cb rdlock\n");
 	publish_handler = event_types[event_type].publish_to_stasis;
 	if (!event_types[event_type].subscribe || !publish_handler) {
 		/* We are not configured to subscribe to these events or
 		   we have no way to publish it internally. */
 		ast_rwlock_unlock(&event_types_lock);
+		ast_debug(5, "cpg_deliver_cb unlock\n");
 		return;
 	}
 	ast_rwlock_unlock(&event_types_lock);
+	ast_debug(5, "cpg_deliver_cb unlock\n");
 
 	if (!(event = ast_malloc(msg_len))) {
 		return;
@@ -516,13 +533,14 @@
 		const struct ast_eid *eid;
 		char buf[128] = "";
 
-		eid = ast_event_get_ie_raw(event, AST_EVENT_IE_EID);
+		eid = (struct ast_eid *) ast_event_get_ie_raw(event, AST_EVENT_IE_EID);
 		ast_eid_to_str(buf, sizeof(buf), (struct ast_eid *) eid);
 		ast_log(LOG_NOTICE, "Got event PING from server with EID: '%s'\n", buf);
 	}
 	ast_debug(5, "Publishing event %s (%u) to stasis\n",
-		ast_event_get_type_name(event), event_type);
+			  ast_event_get_type_name(event), event_type);
 	publish_handler(event);
+	ast_free(event);
 }
 
 static void publish_event_to_corosync(struct ast_event *event)
@@ -530,30 +548,41 @@
 	cs_error_t cs_err;
 	struct iovec iov;
 
-	iov.iov_base = (void *)event;
+	iov.iov_base = (void *) event;
 	iov.iov_len = ast_event_get_size(event);
 
 	ast_debug(5, "Publishing event %s (%u) to corosync\n",
-		ast_event_get_type_name(event), ast_event_get_type(event));
+			  ast_event_get_type_name(event), ast_event_get_type(event));
 
 	/* The stasis subscription will only exist if we are configured to publish
 	 * these events, so just send away. */
-	if ((cs_err = cpg_mcast_joined(cpg_handle, CPG_TYPE_FIFO, &iov, 1)) != CS_OK) {
-		ast_log(LOG_WARNING, "CPG mcast failed (%u) for event %s (%u)\n",
-			cs_err, ast_event_get_type_name(event), ast_event_get_type(event));
+	if (corosync_node_joined && !ast_rwlock_tryrdlock(&init_cpg_lock)) {
+		ast_debug(5, "publish_event_to_corosync rdlock\n");
+		if ((cs_err = cpg_mcast_joined(cpg_handle, CPG_TYPE_FIFO, &iov, 1)) != CS_OK) {
+			ast_log(LOG_WARNING, "CPG mcast failed (%u) for event %s (%u)\n",
+					cs_err, ast_event_get_type_name(event), ast_event_get_type(event));
+		}
+		ast_rwlock_unlock(&init_cpg_lock);
+		ast_debug(5, "publish_event_to_corosync unlock\n");
+	} else {
+		ast_log(LOG_WARNING,
+				"CPG mcast not executed for event %s (%u): initializing CPG.\n",
+				ast_event_get_type_name(event), ast_event_get_type(event));
 	}
 }
 
 static void publish_to_corosync(struct stasis_message *message)
 {
 	struct ast_event *event;
+	struct ast_eid *event_eid;
 
 	event = stasis_message_to_event(message);
 	if (!event) {
 		return;
 	}
 
-	if (ast_eid_cmp(&ast_eid_default, ast_event_get_ie_raw(event, AST_EVENT_IE_EID))) {
+	event_eid = (struct ast_eid *) ast_event_get_ie_raw(event, AST_EVENT_IE_EID);
+	if (!event_eid || ast_eid_cmp(&ast_eid_default, event_eid)) {
 		/* If the event didn't originate from this server, don't send it back out. */
 		ast_event_destroy(event);
 		return;
@@ -563,15 +592,17 @@
 		const struct ast_eid *eid;
 		char buf[128] = "";
 
-		eid = ast_event_get_ie_raw(event, AST_EVENT_IE_EID);
+		eid = (struct ast_eid *) ast_event_get_ie_raw(event, AST_EVENT_IE_EID);
 		ast_eid_to_str(buf, sizeof(buf), (struct ast_eid *) eid);
 		ast_log(LOG_NOTICE, "Sending event PING from this server with EID: '%s'\n", buf);
 	}
 
 	publish_event_to_corosync(event);
+	ast_event_destroy(event);
 }
 
-static void stasis_message_cb(void *data, struct stasis_subscription *sub, struct stasis_message *message)
+static void stasis_message_cb(void *data, struct stasis_subscription *sub,
+							  struct stasis_message *message)
 {
 	if (!message) {
 		return;
@@ -593,23 +624,84 @@
 	return 0;
 }
 
+static int clear_node_cache(void *obj, void *arg, int flags)
+{
+	struct stasis_message *cached_msg = obj;
+	struct stasis_topic *topic = arg;
+	struct stasis_message *msg;
+	struct ast_eid *msg_eid;
+
+	if (!cached_msg) {
+		return 0;
+	}
+
+	msg_eid = (struct ast_eid *) stasis_message_eid(cached_msg);
+	if (msg_eid && ast_eid_cmp(&ast_eid_default, msg_eid)) {
+		msg = stasis_cache_clear_create(cached_msg);
+		if (msg) {
+			stasis_publish(topic, msg);
+			ao2_cleanup(msg);
+		}
+	}
+
+	return 0;
+}
+
 static void cpg_confchg_cb(cpg_handle_t handle, const struct cpg_name *group_name,
-		const struct cpg_address *member_list, size_t member_list_entries,
-		const struct cpg_address *left_list, size_t left_list_entries,
-		const struct cpg_address *joined_list, size_t joined_list_entries)
+						   const struct cpg_address *member_list,
+						   size_t member_list_entries,
+						   const struct cpg_address *left_list, size_t left_list_entries,
+						   const struct cpg_address *joined_list,
+						   size_t joined_list_entries)
 {
 	unsigned int i;
 
 
 	for (i = 0; i < left_list_entries; i++) {
 		const struct cpg_address *cpg_node = &left_list[i];
-		struct corosync_node* node;
+		struct corosync_node *node;
 
 		node = ao2_find(nodes, &cpg_node->nodeid, OBJ_UNLINK | OBJ_SEARCH_KEY);
 		if (!node) {
 			continue;
 		}
 
+		unsigned int j;
+		for (j = 0; j < ARRAY_LEN(event_types); j++) {
+			struct ao2_container *messages;
+
+			ast_rwlock_rdlock(&event_types_lock);
+			ast_debug(5, "cpg_confchg_cb rdlock\n");
+			if (!event_types[j].subscribe) {
+				ast_rwlock_unlock(&event_types_lock);
+				ast_debug(5, "cpg_confchg_cb unlock\n");
+				continue;
+			}
+
+			if (!event_types[j].cache_fn || !event_types[j].message_type_fn) {
+				ast_rwlock_unlock(&event_types_lock);
+				ast_debug(5, "cpg_confchg_cb unlock\n");
+				continue;
+			}
+			ast_rwlock_unlock(&event_types_lock);
+			ast_debug(5, "cpg_confchg_cb unlock\n");
+
+			messages =
+				stasis_cache_dump_by_eid(event_types[j].cache_fn(),
+										 event_types[j].message_type_fn(), &node->eid);
+
+			int messages_count = ao2_container_count(messages);
+			ast_log(LOG_NOTICE,
+					"Clearing %i events of type %s of node %i from stasis cache.\n",
+					messages_count, event_types[j].name, node->id);
+			ao2_callback(messages, OBJ_NODATA, clear_node_cache,
+						 event_types[j].topic_fn());
+			ast_log(LOG_NOTICE, "Cleared events of type %s from stasis cache.\n",
+					event_types[j].name);
+
+			ao2_t_ref(messages, -1, "Dispose of flushed cache");
+		}
+
 		publish_cluster_discovery_to_stasis_full(node, 0);
 		ao2_ref(node, -1);
 	}
@@ -624,22 +716,30 @@
 		struct ao2_container *messages;
 
 		ast_rwlock_rdlock(&event_types_lock);
+		ast_debug(5, "cpg_confchg_cb rdlock\n");
 		if (!event_types[i].publish) {
 			ast_rwlock_unlock(&event_types_lock);
+			ast_debug(5, "cpg_confchg_cb unlock\n");
 			continue;
 		}
 
 		if (!event_types[i].cache_fn || !event_types[i].message_type_fn) {
 			ast_rwlock_unlock(&event_types_lock);
+			ast_debug(5, "cpg_confchg_cb unlock\n");
 			continue;
 		}
-
-		messages = stasis_cache_dump_by_eid(event_types[i].cache_fn(),
-			event_types[i].message_type_fn(),
-			&ast_eid_default);
 		ast_rwlock_unlock(&event_types_lock);
+		ast_debug(5, "cpg_confchg_cb unlock\n");
 
+		messages =
+			stasis_cache_dump_by_eid(event_types[i].cache_fn(),
+									 event_types[i].message_type_fn(), &ast_eid_default);
+
+		int messages_count = ao2_container_count(messages);
+		ast_log(LOG_NOTICE, "Sending %i events of type %s to corosync.\n", messages_count,
+				event_types[i].name);
 		ao2_callback(messages, OBJ_NODATA, dump_cache_cb, NULL);
+		ast_log(LOG_NOTICE, "Sent events of type %s to corosync.\n", event_types[i].name);
 
 		ao2_t_ref(messages, -1, "Dispose of dumped cache");
 	}
@@ -658,53 +758,77 @@
 	char buf[128];
 	int res;
 
-	if ((cs_err = corosync_cfg_local_get(cfg_handle, &node_id)) != CS_OK) {
-		ast_log(LOG_WARNING, "Failed to extract Corosync node ID for this node. Not informing cluster of existance.\n");
-		return;
+	if (!ast_rwlock_tryrdlock(&init_cpg_lock)) {
+		ast_debug(5, "send_cluster_notify rdlock\n");
+
+		if ((cs_err = corosync_cfg_local_get(cfg_handle, &node_id)) != CS_OK) {
+			ast_log(LOG_WARNING,
+					"Failed to extract Corosync node ID for this node. Not informing cluster of existance.\n");
+			return;
+		}
+
+		if (((cs_err =
+			  corosync_cfg_get_node_addrs(cfg_handle, node_id, 1, &num_addrs,
+										  &corosync_addr)) != CS_OK) || (num_addrs < 1)) {
+			ast_log(LOG_WARNING,
+					"Failed to get local Corosync address. Not informing cluster of existance.\n");
+			return;
+		}
+
+		ast_rwlock_unlock(&init_cpg_lock);
+		ast_debug(5, "send_cluster_notify unlock\n");
 	}
 
-	if (((cs_err = corosync_cfg_get_node_addrs(cfg_handle, node_id, 1, &num_addrs, &corosync_addr)) != CS_OK) || (num_addrs < 1)) {
-		ast_log(LOG_WARNING, "Failed to get local Corosync address. Not informing cluster of existance.\n");
-		return;
-	}
-
-	sa = (struct sockaddr *)corosync_addr.address;
-	sa_len = (size_t)corosync_addr.address_length;
+	sa = (struct sockaddr *) corosync_addr.address;
+	sa_len = (size_t) corosync_addr.address_length;
 	if ((res = getnameinfo(sa, sa_len, buf, sizeof(buf), NULL, 0, NI_NUMERICHOST))) {
-		ast_log(LOG_WARNING, "Failed to determine name of local Corosync address: %s (%d). Not informing cluster of existance.\n",
-			gai_strerror(res), res);
+		ast_log(LOG_WARNING,
+				"Failed to determine name of local Corosync address: %s (%d). Not informing cluster of existance.\n",
+				gai_strerror(res), res);
 		return;
 	}
 
 	event = ast_event_new(AST_EVENT_CLUSTER_DISCOVERY,
-				    AST_EVENT_IE_NODE_ID, AST_EVENT_IE_PLTYPE_UINT, node_id,
-				    AST_EVENT_IE_LOCAL_ADDR, AST_EVENT_IE_PLTYPE_STR, buf,
-				    AST_EVENT_IE_END);
+						  AST_EVENT_IE_NODE_ID, AST_EVENT_IE_PLTYPE_UINT, node_id,
+						  AST_EVENT_IE_LOCAL_ADDR, AST_EVENT_IE_PLTYPE_STR, buf,
+						  AST_EVENT_IE_END);
 	publish_event_to_corosync(event);
-	ast_free(event);
+	ast_event_destroy(event);
 }
 
 static void *dispatch_thread_handler(void *data)
 {
 	cs_error_t cs_err;
 	struct pollfd pfd[3] = {
-		{ .events = POLLIN, },
-		{ .events = POLLIN, },
-		{ .events = POLLIN, },
+		{.events = POLLIN,},
+		{.events = POLLIN,},
+		{.events = POLLIN,},
 	};
 
-	if ((cs_err = cpg_fd_get(cpg_handle, &pfd[0].fd)) != CS_OK) {
-		ast_log(LOG_ERROR, "Failed to get CPG fd.  This module is now broken.\n");
+	if (!ast_rwlock_tryrdlock(&init_cpg_lock)) {
+		ast_debug(5, "dispatch_thread_handler rdlock\n");
+		if ((cs_err = cpg_fd_get(cpg_handle, &pfd[0].fd)) != CS_OK) {
+			ast_log(LOG_ERROR, "Failed to get CPG fd.  This module is now broken.\n");
+			ast_rwlock_unlock(&init_cpg_lock);
+			ast_debug(5, "dispatch_thread_handler unlock\n");
+			return NULL;
+		}
+
+		if ((cs_err = corosync_cfg_fd_get(cfg_handle, &pfd[1].fd)) != CS_OK) {
+			ast_log(LOG_ERROR, "Failed to get CFG fd.  This module is now broken.\n");
+			ast_rwlock_unlock(&init_cpg_lock);
+			ast_debug(5, "dispatch_thread_handler unlock\n");
+			return NULL;
+		}
+
+		pfd[2].fd = dispatch_thread.alert_pipe[0];
+		ast_rwlock_unlock(&init_cpg_lock);
+		ast_debug(5, "dispatch_thread_handler unlock\n");
+	} else {
+		ast_log(LOG_ERROR,
+				"Failed to get fd: initiliazing CPG.  This module is now broken.\n");
 		return NULL;
 	}
-
-	if ((cs_err = corosync_cfg_fd_get(cfg_handle, &pfd[1].fd)) != CS_OK) {
-		ast_log(LOG_ERROR, "Failed to get CFG fd.  This module is now broken.\n");
-		return NULL;
-	}
-
-	pfd[2].fd = dispatch_thread.alert_pipe[0];
-
 	send_cluster_notify();
 	while (!dispatch_thread.stop) {
 		int res;
@@ -715,72 +839,156 @@
 		pfd[1].revents = 0;
 		pfd[2].revents = 0;
 
-		res = ast_poll(pfd, ARRAY_LEN(pfd), -1);
+		res = ast_poll(pfd, ARRAY_LEN(pfd), COROSYNC_POLL_TIMEOUT);
 		if (res == -1 && errno != EINTR && errno != EAGAIN) {
 			ast_log(LOG_ERROR, "poll() error: %s (%d)\n", strerror(errno), errno);
-			continue;
-		}
+			cs_err = CS_ERR_BAD_HANDLE;
+		} else if (res == 0) {
+			unsigned int local_nodeid;
 
-		if (pfd[0].revents & POLLIN) {
-			if ((cs_err = cpg_dispatch(cpg_handle, CS_DISPATCH_ALL)) != CS_OK) {
-				ast_log(LOG_WARNING, "Failed CPG dispatch: %u\n", cs_err);
+			if (!ast_rwlock_tryrdlock(&init_cpg_lock)) {
+				ast_debug(5, "dispatch_thread_handler rdlock\n");
+				if ((cs_err = cpg_local_get(cpg_handle, &local_nodeid)) == CS_OK) {
+					struct cpg_name name;
+					struct cpg_address address[CPG_MEMBERS_MAX];
+					int entries = CPG_MEMBERS_MAX;
+
+					ast_copy_string(name.value, "asterisk", sizeof(name.value));
+					name.length = strlen(name.value);
+					if ((cs_err =
+						 cpg_membership_get(cpg_handle, &name, address,
+											&entries)) == CS_OK) {
+						int i;
+						int found = 0;
+
+						ast_debug(1, "CPG group has %i node membership\n", entries);
+						for (i = 0; (i < entries) && !found; i++) {
+							if (address[i].nodeid == local_nodeid)
+								found = 1;
+						}
+						if (!found) {
+							ast_log(LOG_WARNING, "Failed to check CPG node membership\n");
+							corosync_node_joined = 0;
+							cs_err = CS_ERR_BAD_HANDLE;
+						}
+					} else {
+						ast_log(LOG_WARNING, "Failed to get CPG node membership: %u\n",
+								cs_err);
+						corosync_node_joined = 0;
+						cs_err = CS_ERR_BAD_HANDLE;
+					}
+				} else {
+					ast_log(LOG_WARNING, "Failed to get CPG local node id: %u\n", cs_err);
+					corosync_node_joined = 0;
+					cs_err = CS_ERR_BAD_HANDLE;
+				}
+				ast_rwlock_unlock(&init_cpg_lock);
+				ast_debug(5, "dispatch_thread_handler unlock\n");
+			} else {
+				ast_log(LOG_WARNING,
+						"Failed to check CPG node membership: initializing CPG.\n");
+				corosync_node_joined = 0;
+				cs_err = CS_ERR_BAD_HANDLE;
+			}
+		} else {
+			if (!ast_rwlock_tryrdlock(&init_cpg_lock)) {
+				ast_debug(5, "dispatch_thread_handler rdlock\n");
+				if (pfd[0].revents & POLLIN) {
+					if ((cs_err = cpg_dispatch(cpg_handle, CS_DISPATCH_ALL)) != CS_OK) {
+						ast_log(LOG_WARNING, "Failed CPG dispatch: %u\n", cs_err);
+					}
+				}
+
+				if (pfd[1].revents & POLLIN) {
+					if ((cs_err =
+						 corosync_cfg_dispatch(cfg_handle, CS_DISPATCH_ALL)) != CS_OK) {
+						ast_log(LOG_WARNING, "Failed CFG dispatch: %u\n", cs_err);
+					}
+				}
+				ast_rwlock_unlock(&init_cpg_lock);
+				ast_debug(5, "dispatch_thread_handler unlock\n");
+			} else {
+				ast_log(LOG_WARNING, "Failed to dispatch: initializing CPG.\n");
 			}
 		}
-
-		if (pfd[1].revents & POLLIN) {
-			if ((cs_err = corosync_cfg_dispatch(cfg_handle, CS_DISPATCH_ALL)) != CS_OK) {
-				ast_log(LOG_WARNING, "Failed CFG dispatch: %u\n", cs_err);
-			}
-		}
-
 		if (cs_err == CS_ERR_LIBRARY || cs_err == CS_ERR_BAD_HANDLE) {
-			struct cpg_name name;
 
 			/* If corosync gets restarted out from under Asterisk, try to recover. */
 
 			ast_log(LOG_NOTICE, "Attempting to recover from corosync failure.\n");
 
-			if ((cs_err = corosync_cfg_initialize(&cfg_handle, &cfg_callbacks)) != CS_OK) {
-				ast_log(LOG_ERROR, "Failed to initialize cfg (%d)\n", (int) cs_err);
-				sleep(5);
-				continue;
-			}
+			if (!ast_rwlock_trywrlock(&init_cpg_lock)) {
+				ast_debug(5, "dispatch_thread_handler wrlock\n");
+				struct cpg_name name;
 
-			if ((cs_err = cpg_initialize(&cpg_handle, &cpg_callbacks) != CS_OK)) {
-				ast_log(LOG_ERROR, "Failed to initialize cpg (%d)\n", (int) cs_err);
-				sleep(5);
-				continue;
-			}
+				corosync_node_joined = 0;
+				if (cpg_handle && (cs_err = cpg_finalize(cpg_handle)) != CS_OK) {
+					ast_log(LOG_ERROR, "Failed to finalize cpg (%d)\n", (int) cs_err);
+				}
 
-			if ((cs_err = cpg_fd_get(cpg_handle, &pfd[0].fd)) != CS_OK) {
-				ast_log(LOG_ERROR, "Failed to get CPG fd.\n");
-				sleep(5);
-				continue;
-			}
+				if (cfg_handle && (cs_err = corosync_cfg_finalize(cfg_handle)) != CS_OK) {
+					ast_log(LOG_ERROR, "Failed to finalize cfg (%d)\n", (int) cs_err);
+				}
 
-			if ((cs_err = corosync_cfg_fd_get(cfg_handle, &pfd[1].fd)) != CS_OK) {
-				ast_log(LOG_ERROR, "Failed to get CFG fd.\n");
-				sleep(5);
-				continue;
-			}
+				if ((cs_err =
+					 corosync_cfg_initialize(&cfg_handle, &cfg_callbacks)) != CS_OK) {
+					ast_log(LOG_ERROR, "Failed to initialize cfg (%d)\n", (int) cs_err);
+					ast_rwlock_unlock(&init_cpg_lock);
+					ast_debug(5, "dispatch_thread_handler unlock\n");
+					sleep(5);
+					continue;
+				}
 
-			ast_copy_string(name.value, "asterisk", sizeof(name.value));
-			name.length = strlen(name.value);
-			if ((cs_err = cpg_join(cpg_handle, &name)) != CS_OK) {
-				ast_log(LOG_ERROR, "Failed to join cpg (%d)\n", (int) cs_err);
-				sleep(5);
-				continue;
-			}
+				if ((cs_err = cpg_initialize(&cpg_handle, &cpg_callbacks) != CS_OK)) {
+					ast_log(LOG_ERROR, "Failed to initialize cpg (%d)\n", (int) cs_err);
+					ast_rwlock_unlock(&init_cpg_lock);
+					ast_debug(5, "dispatch_thread_handler unlock\n");
+					sleep(5);
+					continue;
+				}
 
-			ast_log(LOG_NOTICE, "Corosync recovery complete.\n");
-			send_cluster_notify();
+				if ((cs_err = cpg_fd_get(cpg_handle, &pfd[0].fd)) != CS_OK) {
+					ast_log(LOG_ERROR, "Failed to get CPG fd.\n");
+					ast_rwlock_unlock(&init_cpg_lock);
+					ast_debug(5, "dispatch_thread_handler unlock\n");
+					sleep(5);
+					continue;
+				}
+
+				if ((cs_err = corosync_cfg_fd_get(cfg_handle, &pfd[1].fd)) != CS_OK) {
+					ast_log(LOG_ERROR, "Failed to get CFG fd.\n");
+					ast_rwlock_unlock(&init_cpg_lock);
+					ast_debug(5, "dispatch_thread_handler unlock\n");
+					sleep(5);
+					continue;
+				}
+
+				ast_copy_string(name.value, "asterisk", sizeof(name.value));
+				name.length = strlen(name.value);
+				if ((cs_err = cpg_join(cpg_handle, &name)) != CS_OK) {
+					ast_log(LOG_ERROR, "Failed to join cpg (%d)\n", (int) cs_err);
+					ast_rwlock_unlock(&init_cpg_lock);
+					ast_debug(5, "dispatch_thread_handler unlock\n");
+					sleep(5);
+					continue;
+				}
+				corosync_node_joined = 1;
+				ast_rwlock_unlock(&init_cpg_lock);
+				ast_debug(5, "dispatch_thread_handler unlock\n");
+				ast_log(LOG_NOTICE, "Corosync recovery complete.\n");
+				send_cluster_notify();
+			} else {
+				ast_log(LOG_NOTICE,
+						"Failed to recover from corosync failure: initializing CPG.\n");
+			}
 		}
 	}
 
 	return NULL;
 }
 
-static char *corosync_show_members(struct ast_cli_entry *e, int cmd, struct ast_cli_args *a)
+static char *corosync_show_members(struct ast_cli_entry *e, int cmd,
+								   struct ast_cli_args *a)
 {
 	cs_error_t cs_err;
 	cpg_iteration_handle_t cpg_iter;
@@ -791,76 +999,83 @@
 	case CLI_INIT:
 		e->command = "corosync show members";
 		e->usage =
-			"Usage: corosync show members\n"
-			"       Show corosync cluster members\n";
+			"Usage: corosync show members\n" "       Show corosync cluster members\n";
 		return NULL;
 
 	case CLI_GENERATE:
-		return NULL;	/* no completion */
+		return NULL;			/* no completion */
 	}
 
 	if (a->argc != e->args) {
 		return CLI_SHOWUSAGE;
 	}
 
-	cs_err = cpg_iteration_initialize(cpg_handle, CPG_ITERATION_ALL, NULL, &cpg_iter);
+	if (!ast_rwlock_tryrdlock(&init_cpg_lock)) {
+		ast_debug(5, "corosync_show_members rdlock\n");
+		cs_err = cpg_iteration_initialize(cpg_handle, CPG_ITERATION_ALL, NULL, &cpg_iter);
 
-	if (cs_err != CS_OK) {
-		ast_cli(a->fd, "Failed to initialize CPG iterator.\n");
-		return CLI_FAILURE;
-	}
-
-	ast_cli(a->fd, "\n"
-	            "=============================================================\n"
-	            "=== Cluster members =========================================\n"
-	            "=============================================================\n"
-	            "===\n");
-
-	for (i = 1, cs_err = cpg_iteration_next(cpg_iter, &cpg_desc);
-			cs_err == CS_OK;
-			cs_err = cpg_iteration_next(cpg_iter, &cpg_desc), i++) {
-#ifdef HAVE_COROSYNC_CFG_STATE_TRACK
-		corosync_cfg_node_address_t addrs[8];
-		int num_addrs = 0;
-		unsigned int j;
-#endif
-
-		ast_cli(a->fd, "=== Node %u\n", i);
-		ast_cli(a->fd, "=== --> Group: %s\n", cpg_desc.group.value);
-
-#ifdef HAVE_COROSYNC_CFG_STATE_TRACK
-		/*
-		 * Corosync 2.x cfg lib needs to allocate 1M on stack after calling
-		 * corosync_cfg_get_node_addrs. netconsole thread has allocated only 0.5M
-		 * resulting in crash.
-		 */
-		cs_err = corosync_cfg_get_node_addrs(cfg_handle, cpg_desc.nodeid,
-				ARRAY_LEN(addrs), &num_addrs, addrs);
 		if (cs_err != CS_OK) {
-			ast_log(LOG_WARNING, "Failed to get node addresses\n");
-			continue;
+			ast_cli(a->fd, "Failed to initialize CPG iterator: %u.\n", cs_err);
+			cpg_iteration_finalize(cpg_iter);
+			ast_rwlock_unlock(&init_cpg_lock);
+			ast_debug(5, "corosync_show_members unlock\n");
+			return CLI_FAILURE;
 		}
 
-		for (j = 0; j < num_addrs; j++) {
-			struct sockaddr *sa = (struct sockaddr *) addrs[j].address;
-			size_t sa_len = (size_t) addrs[j].address_length;
-			char buf[128];
+		ast_cli(a->fd, "\n"
+				"=============================================================\n"
+				"=== Cluster members =========================================\n"
+				"=============================================================\n"
+				"===\n");
 
-			getnameinfo(sa, sa_len, buf, sizeof(buf), NULL, 0, NI_NUMERICHOST);
-
-			ast_cli(a->fd, "=== --> Address %u: %s\n", j + 1, buf);
-		}
-#else
-		ast_cli(a->fd, "=== --> Nodeid: %"PRIu32"\n", cpg_desc.nodeid);
+		for (i = 1, cs_err = cpg_iteration_next(cpg_iter, &cpg_desc);
+			 cs_err == CS_OK; cs_err = cpg_iteration_next(cpg_iter, &cpg_desc), i++) {
+#ifdef HAVE_COROSYNC_CFG_STATE_TRACK
+			corosync_cfg_node_address_t addrs[8];
+			int num_addrs = 0;
+			unsigned int j;
 #endif
+
+			ast_cli(a->fd, "=== Node %u\n", i);
+			ast_cli(a->fd, "=== --> Group: %s\n", cpg_desc.group.value);
+
+#ifdef HAVE_COROSYNC_CFG_STATE_TRACK
+			/*
+			 * Corosync 2.x cfg lib needs to allocate 1M on stack after calling
+			 * corosync_cfg_get_node_addrs. netconsole thread has allocated only 0.5M
+			 * resulting in crash.
+			 */
+			cs_err = corosync_cfg_get_node_addrs(cfg_handle, cpg_desc.nodeid,
+												 ARRAY_LEN(addrs), &num_addrs, addrs);
+			if (cs_err != CS_OK) {
+				ast_log(LOG_WARNING, "Failed to get node addresses\n");
+				continue;
+			}
+
+			for (j = 0; j < num_addrs; j++) {
+				struct sockaddr *sa = (struct sockaddr *) addrs[j].address;
+				size_t sa_len = (size_t) addrs[j].address_length;
+				char buf[128];
+
+				getnameinfo(sa, sa_len, buf, sizeof(buf), NULL, 0, NI_NUMERICHOST);
+
+				ast_cli(a->fd, "=== --> Address %u: %s\n", j + 1, buf);
+			}
+#else
+			ast_cli(a->fd, "=== --> Nodeid: %" PRIu32 "\n", cpg_desc.nodeid);
+#endif
+		}
+
+		ast_cli(a->fd, "===\n"
+				"=============================================================\n" "\n");
+
+		cpg_iteration_finalize(cpg_iter);
+		ast_rwlock_unlock(&init_cpg_lock);
+		ast_debug(5, "corosync_show_members unlock\n");
+	} else {
+		ast_cli(a->fd, "Failed to initialize CPG iterator: initializing CPG.\n");
 	}
 
-	ast_cli(a->fd, "===\n"
-	               "=============================================================\n"
-	               "\n");
-
-	cpg_iteration_finalize(cpg_iter);
-
 	return CLI_SUCCESS;
 }
 
@@ -880,7 +1095,7 @@
 		return NULL;
 
 	case CLI_GENERATE:
-		return NULL;	/* no completion */
+		return NULL;			/* no completion */
 	}
 
 	if (a->argc != e->args) {
@@ -893,14 +1108,14 @@
 		return CLI_FAILURE;
 	}
 
-	ast_rwlock_rdlock(&event_types_lock);
 	event_types[AST_EVENT_PING].publish_to_stasis(event);
-	ast_rwlock_unlock(&event_types_lock);
 
+	ast_event_destroy(event);
 	return CLI_SUCCESS;
 }
 
-static char *corosync_show_config(struct ast_cli_entry *e, int cmd, struct ast_cli_args *a)
+static char *corosync_show_config(struct ast_cli_entry *e, int cmd,
+								  struct ast_cli_args *a)
 {
 	unsigned int i;
 
@@ -913,7 +1128,7 @@
 		return NULL;
 
 	case CLI_GENERATE:
-		return NULL;	/* no completion */
+		return NULL;			/* no completion */
 	}
 
 	if (a->argc != e->args) {
@@ -921,16 +1136,15 @@
 	}
 
 	ast_cli(a->fd, "\n"
-	            "=============================================================\n"
-	            "=== res_corosync config =====================================\n"
-	            "=============================================================\n"
-	            "===\n");
+			"=============================================================\n"
+			"=== res_corosync config =====================================\n"
+			"=============================================================\n" "===\n");
 
 	ast_rwlock_rdlock(&event_types_lock);
+	ast_debug(5, "corosync_show_config rdlock\n");
 	for (i = 0; i < ARRAY_LEN(event_types); i++) {
 		if (event_types[i].publish) {
-			ast_cli(a->fd, "=== ==> Publishing Event Type: %s\n",
-					event_types[i].name);
+			ast_cli(a->fd, "=== ==> Publishing Event Type: %s\n", event_types[i].name);
 		}
 		if (event_types[i].subscribe) {
 			ast_cli(a->fd, "=== ==> Subscribing to Event Type: %s\n",
@@ -938,10 +1152,10 @@
 		}
 	}
 	ast_rwlock_unlock(&event_types_lock);
+	ast_debug(5, "corosync_show_config unlock\n");
 
 	ast_cli(a->fd, "===\n"
-	               "=============================================================\n"
-	               "\n");
+			"=============================================================\n" "\n");
 
 	return CLI_SUCCESS;
 }
@@ -988,6 +1202,7 @@
 	unsigned int i;
 
 	ast_rwlock_wrlock(&event_types_lock);
+	ast_debug(5, "load_general_config wrlock\n");
 
 	for (i = 0; i < ARRAY_LEN(event_types); i++) {
 		event_types[i].publish = event_types[i].publish_default;
@@ -1009,17 +1224,16 @@
 			event_types[i].sub = stasis_forward_all(event_types[i].topic_fn(),
 													corosync_topic());
 			stasis_message_router_add(stasis_router,
-			                          event_types[i].message_type_fn(),
-			                          stasis_message_cb,
-			                          NULL);
+									  event_types[i].message_type_fn(),
+									  stasis_message_cb, NULL);
 		} else if (!event_types[i].publish && event_types[i].sub) {
 			event_types[i].sub = stasis_forward_cancel(event_types[i].sub);
-			stasis_message_router_remove(stasis_router,
-			                             event_types[i].message_type_fn());
+			stasis_message_router_remove(stasis_router, event_types[i].message_type_fn());
 		}
 	}
 
 	ast_rwlock_unlock(&event_types_lock);
+	ast_debug(5, "load_general_config unlock\n");
 
 	return res;
 }
@@ -1059,17 +1273,39 @@
 	if (stasis_router) {
 
 		/* Unsubscribe all topic forwards and cancel all message routes */
-		ast_rwlock_wrlock(&event_types_lock);
 		for (i = 0; i < ARRAY_LEN(event_types); i++) {
+			struct ao2_container *messages = NULL;
+			unsigned char subscribe = 0;
+
+			ast_rwlock_wrlock(&event_types_lock);
+			ast_debug(5, "cleanup_module wrlock\n");
+			subscribe = event_types[i].subscribe;
+
 			if (event_types[i].sub) {
 				event_types[i].sub = stasis_forward_cancel(event_types[i].sub);
 				stasis_message_router_remove(stasis_router,
-				                             event_types[i].message_type_fn());
+											 event_types[i].message_type_fn());
 			}
 			event_types[i].publish = 0;
 			event_types[i].subscribe = 0;
+			ast_rwlock_unlock(&event_types_lock);
+			ast_debug(5, "cleanup_module unlock\n");
+
+			if (subscribe && event_types[i].cache_fn && event_types[i].message_type_fn) {
+				messages =
+					stasis_cache_dump_all(event_types[i].cache_fn(),
+										  event_types[i].message_type_fn());
+				int messages_count = ao2_container_count(messages);
+				ast_log(LOG_NOTICE,
+						"Clearing %i events of type %s of other nodes from stasis cache.\n",
+						messages_count, event_types[i].name);
+				ao2_callback(messages, OBJ_NODATA, clear_node_cache,
+							 event_types[i].topic_fn());
+				ast_log(LOG_NOTICE, "Cleared events of type %s from stasis cache.\n",
+						event_types[i].name);
+				ao2_t_ref(messages, -1, "Dispose of flushed cache");
+			}
 		}
-		ast_rwlock_unlock(&event_types_lock);
 
 		stasis_message_router_unsubscribe_and_join(stasis_router);
 		stasis_router = NULL;
@@ -1085,8 +1321,7 @@
 	if (dispatch_thread.id != AST_PTHREADT_NULL) {
 		char meepmeep = 'x';
 		dispatch_thread.stop = 1;
-		if (ast_carefulwrite(dispatch_thread.alert_pipe[1], &meepmeep, 1,
-					5000) == -1) {
+		if (ast_carefulwrite(dispatch_thread.alert_pipe[1], &meepmeep, 1, 5000) == -1) {
 			ast_log(LOG_ERROR, "Failed to write to pipe: %s (%d)\n",
 					strerror(errno), errno);
 		}
@@ -1103,16 +1338,21 @@
 		dispatch_thread.alert_pipe[1] = -1;
 	}
 
-	if (cpg_handle && (cs_err = cpg_finalize(cpg_handle)) != CS_OK) {
-		ast_log(LOG_ERROR, "Failed to finalize cpg (%d)\n", (int) cs_err);
-	}
-	cpg_handle = 0;
+	if (!ast_rwlock_trywrlock(&init_cpg_lock)) {
+		ast_debug(5, "cleanup_module wrlock\n");
+		if (cpg_handle && (cs_err = cpg_finalize(cpg_handle)) != CS_OK) {
+			ast_log(LOG_ERROR, "Failed to finalize cpg (%d)\n", (int) cs_err);
+		}
+		cpg_handle = 0;
 
-	if (cfg_handle && (cs_err = corosync_cfg_finalize(cfg_handle)) != CS_OK) {
-		ast_log(LOG_ERROR, "Failed to finalize cfg (%d)\n", (int) cs_err);
+		if (cfg_handle && (cs_err = corosync_cfg_finalize(cfg_handle)) != CS_OK) {
+			ast_log(LOG_ERROR, "Failed to finalize cfg (%d)\n", (int) cs_err);
+		}
+		cfg_handle = 0;
+		corosync_node_joined = 0;
+		ast_rwlock_unlock(&init_cpg_lock);
+		ast_debug(5, "cleanup_module unlock\n");
 	}
-	cfg_handle = 0;
-
 	ao2_cleanup(nodes);
 	nodes = NULL;
 }
@@ -1144,6 +1384,8 @@
 		ast_log(AST_LOG_ERROR, "Failed to create message router for corosync topic\n");
 		goto failed;
 	}
+	stasis_message_router_set_congestion_limits(stasis_router, -1,
+												10 * AST_TASKPROCESSOR_HIGH_WATER_LEVEL);
 
 	if (STASIS_MESSAGE_TYPE_INIT(corosync_ping_message_type) != 0) {
 		ast_log(AST_LOG_ERROR, "Failed to initialize corosync ping message type\n");
@@ -1155,42 +1397,58 @@
 		goto failed;
 	}
 
-	if ((cs_err = corosync_cfg_initialize(&cfg_handle, &cfg_callbacks)) != CS_OK) {
-		ast_log(LOG_ERROR, "Failed to initialize cfg: (%d)\n", (int) cs_err);
+	if (!ast_rwlock_trywrlock(&init_cpg_lock)) {
+		corosync_node_joined = 0;
+		ast_debug(5, "load_module wrlock\n");
+		if ((cs_err = corosync_cfg_initialize(&cfg_handle, &cfg_callbacks)) != CS_OK) {
+			ast_log(LOG_ERROR, "Failed to initialize cfg: (%d)\n", (int) cs_err);
+			ast_rwlock_unlock(&init_cpg_lock);
+			ast_debug(5, "load_module unlock\n");
+			goto failed;
+		}
+
+		if ((cs_err = cpg_initialize(&cpg_handle, &cpg_callbacks)) != CS_OK) {
+			ast_log(LOG_ERROR, "Failed to initialize cpg: (%d)\n", (int) cs_err);
+			ast_rwlock_unlock(&init_cpg_lock);
+			ast_debug(5, "load_module unlock\n");
+			goto failed;
+		}
+
+		ast_copy_string(name.value, "asterisk", sizeof(name.value));
+		name.length = strlen(name.value);
+
+		if ((cs_err = cpg_join(cpg_handle, &name)) != CS_OK) {
+			ast_log(LOG_ERROR, "Failed to join: (%d)\n", (int) cs_err);
+			ast_rwlock_unlock(&init_cpg_lock);
+			ast_debug(5, "load_module unlock\n");
+			goto failed;
+		}
+
+		if (pipe(dispatch_thread.alert_pipe) == -1) {
+			ast_log(LOG_ERROR, "Failed to create alert pipe: %s (%d)\n",
+					strerror(errno), errno);
+			ast_rwlock_unlock(&init_cpg_lock);
+			ast_debug(5, "load_module unlock\n");
+			goto failed;
+		}
+		corosync_node_joined = 1;
+
+		ast_rwlock_unlock(&init_cpg_lock);
+		ast_debug(5, "load_module unlock\n");
+		if (corosync_pthread_create_background(&dispatch_thread.id, NULL,
+											   dispatch_thread_handler, NULL)) {
+			ast_log(LOG_ERROR, "Error starting CPG dispatch thread.\n");
+			goto failed;
+		}
+
+		ast_cli_register_multiple(corosync_cli, ARRAY_LEN(corosync_cli));
+	} else {
 		goto failed;
 	}
 
-	if ((cs_err = cpg_initialize(&cpg_handle, &cpg_callbacks)) != CS_OK) {
-		ast_log(LOG_ERROR, "Failed to initialize cpg: (%d)\n", (int) cs_err);
-		goto failed;
-	}
-
-	ast_copy_string(name.value, "asterisk", sizeof(name.value));
-	name.length = strlen(name.value);
-
-	if ((cs_err = cpg_join(cpg_handle, &name)) != CS_OK) {
-		ast_log(LOG_ERROR, "Failed to join: (%d)\n", (int) cs_err);
-		goto failed;
-	}
-
-	if (pipe(dispatch_thread.alert_pipe) == -1) {
-		ast_log(LOG_ERROR, "Failed to create alert pipe: %s (%d)\n",
-				strerror(errno), errno);
-		goto failed;
-	}
-
-	if (corosync_pthread_create_background(&dispatch_thread.id, NULL,
-			dispatch_thread_handler, NULL)) {
-		ast_log(LOG_ERROR, "Error starting CPG dispatch thread.\n");
-		goto failed;
-	}
-
-	ast_cli_register_multiple(corosync_cli, ARRAY_LEN(corosync_cli));
-
-
 	return AST_MODULE_LOAD_SUCCESS;
 
-failed:
+  failed:
 	cleanup_module();
 
 	return AST_MODULE_LOAD_DECLINE;

-- 
To view, visit https://gerrit.asterisk.org/c/asterisk/+/14455
To unsubscribe, or for help writing mail filters, visit https://gerrit.asterisk.org/settings

Gerrit-Project: asterisk
Gerrit-Branch: 13
Gerrit-Change-Id: Ie8e99bc23f141a73c13ae6fb1948d148d4de17f2
Gerrit-Change-Number: 14455
Gerrit-PatchSet: 1
Gerrit-Owner: Università di Bologna - CESIA VoIP <cesia.voip at unibo.it>
Gerrit-MessageType: newchange
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.digium.com/pipermail/asterisk-code-review/attachments/20200603/05d11de7/attachment-0001.html>


More information about the asterisk-code-review mailing list