[Asterisk-code-review] res pjsip pubsub: Add more locking and checking around termi... (asterisk[master])

George Joseph asteriskteam at digium.com
Sun Jun 12 13:33:05 CDT 2016


George Joseph has uploaded a new change for review.

  https://gerrit.asterisk.org/3019

Change subject: res_pjsip_pubsub: Add more locking and checking around terminates
......................................................................

res_pjsip_pubsub: Add more locking and checking around terminates

A subsciption can be terminated by either a SUBSCRIBE with Expires = 0
or by the subscription timer expiring.  In heavily loaded systems
it's possible for the timer to fire at the same time we get a SUBSCRIBE
from the client either to terminate or refresh the subscription.

Client --> on_evsub_state     push --|
                                     serialized_pubsub_on_server_timeout
Timer  --> on_server_timeout  push --|

Both paths result in the pushing of a task to the subscription's
serializer, but on_server_timeout wasn't checking the last_notify flag
to see if a client-requested terminate was already in progress and
although on_evsub_state was checking it, it wasn't setting it.
This could result in 2 terminate tasks being placed on the serializer.
Since the last_notify flag was set only after the NOTIFY was sent
and the terminate task wasn't checking the flag either, both tasks
could run.  Additionally, some of the other pubsub tasks weren't
locking the dialog or checking last_notify so they could also be
running while pjproject was running one of the callbacks in another
thread.

In the end, there just wasn't enough locking and state checking and
we were seeing SEGVs in the pjproject code usually related to the dialog
being invalid.

Some of the checking in this patch is probably overkill but they're
simple boolean or null pointer checks so they're cheap.  Also some of
the debug levels in existing code were updated.

Changes:
 * Debug levels were changed so 1 is infrequent possible errors and
   3 is high-volume informational messages.
 * The evsub_state and server_timeout pjproject callbacks now increment
   last_notify while holding the dialog lock but before pushing the
   terminate task on the serializer.
 * The server_timeout callback now checks last_notify before pushing
   the terminate task.
 * The other callbacks and tasks now check that evsub, dlg and
   last_notify are valid before doing anything and the tasks now lock
   the dialog.
 * The serialized_pubsub_on_server_timeout task was renamed to
   serialized_pubsub_terminate and now checks the last_notify flag.  If
   it's > 1, then somehow 2 terminate tasks were pushed so this task
   decrements the flag and returns. The second task will now see the
   flag as 1 and will process.

ASTERISK-26099 #close
Reported-by: Ross Beer.

Change-Id: I779d11802cf672a51392e62a74a1216596075ba1
---
M res/res_pjsip_pubsub.c
1 file changed, 121 insertions(+), 46 deletions(-)


  git pull ssh://gerrit.asterisk.org:29418/asterisk refs/changes/19/3019/1

diff --git a/res/res_pjsip_pubsub.c b/res/res_pjsip_pubsub.c
index 06a1b52..3d0e471 100644
--- a/res/res_pjsip_pubsub.c
+++ b/res/res_pjsip_pubsub.c
@@ -879,7 +879,7 @@
 							"allocation error afterwards\n", resource);
 					continue;
 				}
-				ast_debug(1, "Subscription to leaf resource %s resulted in success. Adding to parent %s\n",
+				ast_debug(3, "Subscription to leaf resource %s resulted in success. Adding to parent %s\n",
 						resource, parent->resource);
 				AST_VECTOR_APPEND(&parent->children, current);
 			} else {
@@ -887,7 +887,7 @@
 						resource, resp);
 			}
 		} else {
-			ast_debug(1, "Resource %s (child of %s) is a list\n", resource, parent->resource);
+			ast_debug(3, "Resource %s (child of %s) is a list\n", resource, parent->resource);
 			current = tree_node_alloc(resource, visited, child_list->full_state);
 			if (!current) {
 				ast_debug(1, "Cannot build children of resource %s due to allocation failure\n", resource);
@@ -895,10 +895,10 @@
 			}
 			build_node_children(endpoint, handler, child_list, current, visited);
 			if (AST_VECTOR_SIZE(&current->children) > 0) {
-				ast_debug(1, "List %s had no successful children.\n", resource);
+				ast_debug(3, "List %s had no successful children.\n", resource);
 				AST_VECTOR_APPEND(&parent->children, current);
 			} else {
-				ast_debug(1, "List %s had successful children. Adding to parent %s\n",
+				ast_debug(3, "List %s had successful children. Adding to parent %s\n",
 						resource, parent->resource);
 				tree_node_destroy(current);
 			}
@@ -970,7 +970,7 @@
 	struct resources visited;
 
 	if (!has_eventlist_support || !(list = retrieve_resource_list(resource, handler->event_name))) {
-		ast_debug(1, "Subscription to resource %s is not to a list\n", resource);
+		ast_debug(3, "Subscription to resource %s is not to a list\n", resource);
 		tree->root = tree_node_alloc(resource, NULL, 0);
 		if (!tree->root) {
 			return 500;
@@ -978,7 +978,7 @@
 		return handler->notifier->new_subscribe(endpoint, resource);
 	}
 
-	ast_debug(1, "Subscription to resource %s is a list\n", resource);
+	ast_debug(3, "Subscription to resource %s is a list\n", resource);
 	if (AST_VECTOR_INIT(&visited, AST_VECTOR_SIZE(&list->items))) {
 		return 500;
 	}
@@ -1015,7 +1015,7 @@
 		if (i == obj) {
 			AST_RWLIST_REMOVE_CURRENT(next);
 			if (i->root) {
-				ast_debug(1, "Removing subscription to resource %s from list of subscriptions\n",
+				ast_debug(3, "Removing subscription to resource %s from list of subscriptions\n",
 						ast_sip_subscription_get_resource_name(i->root));
 			}
 			break;
@@ -2307,6 +2307,11 @@
 		return 0;
 	}
 
+	if (sub->tree->last_notify) {
+		pjsip_dlg_dec_lock(dlg);
+		return 0;
+	}
+
 	if (ast_sip_pubsub_generate_body_content(ast_sip_subscription_get_body_type(sub),
 				ast_sip_subscription_get_body_subtype(sub), notify_data, &sub->body_text)) {
 		pjsip_dlg_dec_lock(dlg);
@@ -3269,24 +3274,35 @@
 }
 
 /* XXX This function and serialized_pubsub_on_rx_refresh are nearly identical */
-static int serialized_pubsub_on_server_timeout(void *userdata)
+static int serialized_pubsub_terminate(void *userdata)
 {
 	struct sip_subscription_tree *sub_tree = userdata;
-	pjsip_dialog *dlg = sub_tree->dlg;
 
-	pjsip_dlg_inc_lock(dlg);
-	if (!sub_tree->evsub) {
-		pjsip_dlg_dec_lock(dlg);
+	if (!(sub_tree->evsub && sub_tree->dlg)) {
+		ast_debug(1, "sub_tree is already destroyed\n");
 		return 0;
 	}
+
+	pjsip_dlg_inc_lock(sub_tree->dlg);
+
+	if (sub_tree->last_notify > 1) {
+		ast_debug(1, "Possible terminate conflict. evsub: %p  Last Notify: %d\n",
+			sub_tree->evsub, sub_tree->last_notify);
+		sub_tree->last_notify--;
+		pjsip_dlg_dec_lock(sub_tree->dlg);
+		ao2_ref(sub_tree, -1);
+		return 0;
+	}
+
 	set_state_terminated(sub_tree->root);
 	send_notify(sub_tree, 1);
 	ast_test_suite_event_notify("SUBSCRIPTION_TERMINATED",
 			"Resource: %s",
 			sub_tree->root->resource);
 
-	pjsip_dlg_dec_lock(dlg);
+	pjsip_dlg_dec_lock(sub_tree->dlg);
 	ao2_cleanup(sub_tree);
+
 	return 0;
 }
 
@@ -3345,12 +3361,14 @@
 	}
 
 	sub_tree = pjsip_evsub_get_mod_data(evsub, pubsub_module.id);
-	if (!sub_tree) {
+	if (!(sub_tree && sub_tree->evsub && sub_tree->dlg)) {
+		ast_debug(1, "sub_tree is already destroyed\n");
 		return;
 	}
 
 	if (!sub_tree->last_notify) {
-		if (ast_sip_push_task(sub_tree->serializer, serialized_pubsub_on_server_timeout, ao2_bump(sub_tree))) {
+		sub_tree->last_notify++;
+		if (ast_sip_push_task(sub_tree->serializer, serialized_pubsub_terminate, ao2_bump(sub_tree))) {
 			ast_log(LOG_ERROR, "Failed to push task to send final NOTIFY.\n");
 			ao2_ref(sub_tree, -1);
 		} else {
@@ -3373,11 +3391,19 @@
 static int serialized_pubsub_on_rx_refresh(void *userdata)
 {
 	struct sip_subscription_tree *sub_tree = userdata;
-	pjsip_dialog *dlg = sub_tree->dlg;
 
-	pjsip_dlg_inc_lock(dlg);
-	if (!sub_tree->evsub) {
-		pjsip_dlg_dec_lock(dlg);
+	if (!(sub_tree->evsub && sub_tree->dlg)) {
+		ast_debug(1, "sub_tree is already destroyed\n");
+		return 0;
+	}
+
+	pjsip_dlg_inc_lock(sub_tree->dlg);
+
+	if (sub_tree->last_notify) {
+		ast_debug(1, "Possible terminate conflict. evsub: %p  Last Notify: %d\n",
+			sub_tree->evsub, sub_tree->last_notify);
+		pjsip_dlg_dec_lock(sub_tree->dlg);
+		ao2_ref(sub_tree, -1);
 		return 0;
 	}
 
@@ -3391,8 +3417,9 @@
 			"SUBSCRIPTION_TERMINATED" : "SUBSCRIPTION_REFRESHED",
 			"Resource: %s", sub_tree->root->resource);
 
-	pjsip_dlg_dec_lock(dlg);
+	pjsip_dlg_dec_lock(sub_tree->dlg);
 	ao2_cleanup(sub_tree);
+
 	return 0;
 }
 
@@ -3413,16 +3440,24 @@
 	struct sip_subscription_tree *sub_tree;
 
 	sub_tree = pjsip_evsub_get_mod_data(evsub, pubsub_module.id);
-	if (!sub_tree) {
+	if (!(sub_tree && sub_tree->evsub && sub_tree->dlg)) {
+		ast_debug(1, "sub_tree is already destroyed\n");
+		return;
+	}
+
+	if (sub_tree->last_notify) {
+		ast_debug(1, "Possible terminate conflict. evsub: %p  Last Notify: %d\n",
+			sub_tree->evsub, sub_tree->last_notify);
 		return;
 	}
 
 	/* PJSIP will set the evsub's state to terminated before calling into this function
 	 * if the Expires value of the incoming SUBSCRIBE is 0.
 	 */
+
 	if (pjsip_evsub_get_state(sub_tree->evsub) != PJSIP_EVSUB_STATE_TERMINATED) {
 		if (ast_sip_push_task(sub_tree->serializer, serialized_pubsub_on_rx_refresh, ao2_bump(sub_tree))) {
-			/* If we can't push the NOTIFY refreshing task...we'll just go with it. */
+			ast_log(LOG_ERROR, "Failed to push task\n");
 			ao2_ref(sub_tree, -1);
 		}
 	}
@@ -3438,6 +3473,7 @@
 	struct ast_sip_subscription *sub = pjsip_evsub_get_mod_data(evsub, pubsub_module.id);
 
 	if (!sub) {
+		ast_debug(1, "sub is already destroyed\n");
 		return;
 	}
 
@@ -3450,12 +3486,30 @@
 	struct sip_subscription_tree *sub_tree = userdata;
 	pjsip_tx_data *tdata;
 
+	if (!(sub_tree->evsub && sub_tree->dlg)) {
+		ast_debug(1, "sub_tree is already destroyed\n");
+		return 0;
+	}
+
+	pjsip_dlg_inc_lock(sub_tree->dlg);
+
+	if (sub_tree->last_notify) {
+		ast_debug(1, "Possible terminate conflict. evsub: %p  Last Notify: %d\n",
+			sub_tree->evsub, sub_tree->last_notify);
+		pjsip_dlg_dec_lock(sub_tree->dlg);
+		ao2_ref(sub_tree, -1);
+		return 0;
+	}
+
 	if (pjsip_evsub_initiate(sub_tree->evsub, NULL, -1, &tdata) == PJ_SUCCESS) {
 		pjsip_evsub_send_request(sub_tree->evsub, tdata);
 	} else {
 		pjsip_evsub_terminate(sub_tree->evsub, PJ_TRUE);
 	}
+
+	pjsip_dlg_dec_lock(sub_tree->dlg);
 	ao2_cleanup(sub_tree);
+
 	return 0;
 }
 
@@ -3463,32 +3517,53 @@
 {
 	struct sip_subscription_tree *sub_tree = pjsip_evsub_get_mod_data(evsub, pubsub_module.id);
 
-	ao2_ref(sub_tree, +1);
-	ast_sip_push_task(sub_tree->serializer, serialized_pubsub_on_client_refresh, sub_tree);
-}
-
-static void pubsub_on_server_timeout(pjsip_evsub *evsub)
-{
-
-	struct sip_subscription_tree *sub_tree = pjsip_evsub_get_mod_data(evsub, pubsub_module.id);
-	if (!sub_tree) {
-		/* PJSIP does not terminate the server timeout timer when a SUBSCRIBE
-		 * with Expires: 0 arrives to end a subscription, nor does it terminate
-		 * this timer when we send a NOTIFY request in response to receiving such
-		 * a SUBSCRIBE. PJSIP does not stop the server timeout timer until the
-		 * NOTIFY transaction has finished (either through receiving a response
-		 * or through a transaction timeout).
-		 *
-		 * Therefore, it is possible that we can be told that a server timeout
-		 * occurred after we already thought that the subscription had been
-		 * terminated. In such a case, we will have already removed the sub_tree
-		 * from the evsub's mod_data array.
-		 */
-        return;
+	if (!(sub_tree && sub_tree->evsub && sub_tree->dlg)) {
+		ast_debug(1, "sub_tree is already destroyed\n");
+		return;
 	}
 
-	ao2_ref(sub_tree, +1);
-	ast_sip_push_task(sub_tree->serializer, serialized_pubsub_on_server_timeout, sub_tree);
+	if (!sub_tree->last_notify) {
+		if (ast_sip_push_task(sub_tree->serializer, serialized_pubsub_on_client_refresh, ao2_bump(sub_tree))) {
+			ast_log(LOG_ERROR, "Failed to push task\n");
+			ao2_ref(sub_tree, -1);
+		}
+	} else {
+		ast_debug(1, "Possible terminate conflict. evsub: %p  Last Notify: %d\n",
+			sub_tree->evsub, sub_tree->last_notify);
+	}
+}
+
+/*!
+ * PJSIP does not terminate the server timeout timer when a SUBSCRIBE
+ * with Expires: 0 arrives to end a subscription, nor does it terminate
+ * this timer when we send a NOTIFY request in response to receiving such
+ * a SUBSCRIBE. PJSIP does not stop the server timeout timer until the
+ * NOTIFY transaction has finished (either through receiving a response
+ * or through a transaction timeout).
+ *
+ * Therefore, it is possible that we can be told that a server timeout
+ * occurred after we already thought that the subscription had been
+ * terminated. In such a case, we will have already removed the sub_tree
+ * from the evsub's mod_data array.
+ */
+static void pubsub_on_server_timeout(pjsip_evsub *evsub)
+{
+	struct sip_subscription_tree *sub_tree = pjsip_evsub_get_mod_data(evsub, pubsub_module.id);
+
+	if (!(sub_tree && sub_tree->evsub && sub_tree->dlg)) {
+		ast_debug(1, "sub_tree is already destroyed\n");
+		return;
+	}
+
+	if (!sub_tree->last_notify) {
+		sub_tree->last_notify++;
+		if (ast_sip_push_task(sub_tree->serializer, serialized_pubsub_terminate, ao2_bump(sub_tree))) {
+			ast_log(LOG_ERROR, "Failed to push task to send final NOTIFY.\n");
+			ao2_ref(sub_tree, -1);
+		}
+	} else {
+		ast_debug(1, "Possible terminate conflict. evsub: %p  Last Notify: %d\n", sub_tree->evsub, sub_tree->last_notify);
+	}
 }
 
 static int ami_subscription_detail(struct sip_subscription_tree *sub_tree,

-- 
To view, visit https://gerrit.asterisk.org/3019
To unsubscribe, visit https://gerrit.asterisk.org/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I779d11802cf672a51392e62a74a1216596075ba1
Gerrit-PatchSet: 1
Gerrit-Project: asterisk
Gerrit-Branch: master
Gerrit-Owner: George Joseph <gjoseph at digium.com>



More information about the asterisk-code-review mailing list