[Asterisk-code-review] chan sip.c: Fix packet retransid deadlock potential. (asterisk[13])

Anonymous Coward asteriskteam at digium.com
Thu Mar 17 10:27:48 CDT 2016


Anonymous Coward #1000019 has submitted this change and it was merged.

Change subject: chan_sip.c: Fix packet retransid deadlock potential.
......................................................................


chan_sip.c: Fix packet retransid deadlock potential.

This patch is part of a series to resolve deadlocks in chan_sip.c.

Stopping a scheduled event can result in a deadlock if the scheduled event
is running when you try to stop the event.  If you hold a lock needed by
the scheduled event while trying to stop the scheduled event then a
deadlock can happen.  The general strategy for resolving the deadlock
potential is to push the actual starting and stopping of the scheduled
events off onto the scheduler/do_monitor() thread by scheduling an
immediate one shot scheduled event.  Some restructuring may be needed
because the code may assume that the start/stop of the scheduled events is
immediate.

* Fix retrans_pkt() to call check_pendings() with both the owner channel
and the private objects locked as required.

* Refactor dialog retransmission packet list to safely remove packet
nodes.  The list nodes are now ao2 objects.  The list has a ref and the
scheduled entry has a ref.

ASTERISK-25023

Change-Id: I50926d81be53f4cd3d572a3292cd25f563f59641
---
M channels/chan_sip.c
1 file changed, 96 insertions(+), 79 deletions(-)

Approvals:
  Anonymous Coward #1000019: Verified
  Joshua Colp: Looks good to me, approved



diff --git a/channels/chan_sip.c b/channels/chan_sip.c
index ea137bb..c7cab1c 100644
--- a/channels/chan_sip.c
+++ b/channels/chan_sip.c
@@ -1511,6 +1511,7 @@
 static void stop_provisional_keepalive(struct sip_pvt *pvt);
 static void do_stop_session_timer(struct sip_pvt *pvt);
 static void stop_reinvite_retry(struct sip_pvt *pvt);
+static void stop_retrans_pkt(struct sip_pkt *pkt);
 
 /*! \brief Definition of this channel for PBX channel registration */
 struct ast_channel_tech sip_tech = {
@@ -3251,14 +3252,11 @@
 	/* remove all current packets in this dialog */
 	sip_pvt_lock(dialog);
 	while ((cp = dialog->packets)) {
-		/* Unlink the node from the list. */
+		/* Unlink and destroy the packet object. */
 		dialog->packets = dialog->packets->next;
-
-		/* Destroy the packet object. */
-		AST_SCHED_DEL(sched, cp->retransid);
-		dialog_unref(cp->owner, "Unlink dialog removing all retransmitable packets");
-		ast_free(cp->data);
-		ast_free(cp);
+		AST_SCHED_DEL_UNREF(sched, cp->retransid,
+			ao2_t_ref(cp, -1, "Stop scheduled packet retransmission"));
+		ao2_t_ref(cp, -1, "Packet retransmission list");
 	}
 	sip_pvt_unlock(dialog);
 
@@ -3953,10 +3951,17 @@
 	return;
 }
 
-/*! \brief Retransmit SIP message if no answer (Called from scheduler) */
+/*!
+ * \brief Retransmit SIP message if no answer
+ *
+ * \note Run by the sched thread.
+ */
 static int retrans_pkt(const void *data)
 {
-	struct sip_pkt *pkt = (struct sip_pkt *)data, *prev, *cur = NULL;
+	struct sip_pkt *pkt = (struct sip_pkt *) data;
+	struct sip_pkt *prev;
+	struct sip_pkt *cur;
+	struct ast_channel *owner_chan;
 	int reschedule = DEFAULT_RETRANS;
 	int xmitres = 0;
 	/* how many ms until retrans timeout is reached */
@@ -4021,6 +4026,7 @@
 
 		if (sip_debug_test_pvt(pkt->owner)) {
 			const struct ast_sockaddr *dst = sip_real_dst(pkt->owner);
+
 			ast_verbose("Retransmitting #%d (%s) to %s:\n%s\n---\n",
 				pkt->retrans, sip_nat_mode(pkt->owner),
 				ast_sockaddr_stringify(dst),
@@ -4041,7 +4047,7 @@
 				reschedule = diff;
 			}
 			sip_pvt_unlock(pkt->owner);
-			return  reschedule;
+			return reschedule;
 		}
 	}
 
@@ -4071,16 +4077,11 @@
 		append_history(pkt->owner, "MaxRetries", "%s", pkt->is_fatal ? "(Critical)" : "(Non-critical)");
 	}
 
+	sip_pvt_unlock(pkt->owner);	/* SIP_PVT, not channel */
+	owner_chan = sip_pvt_lock_full(pkt->owner);
+
 	if (pkt->is_fatal) {
-		while(pkt->owner->owner && ast_channel_trylock(pkt->owner->owner)) {
-			sip_pvt_unlock(pkt->owner);	/* SIP_PVT, not channel */
-			usleep(1);
-			sip_pvt_lock(pkt->owner);
-		}
-		if (pkt->owner->owner && !ast_channel_hangupcause(pkt->owner->owner)) {
-			ast_channel_hangupcause_set(pkt->owner->owner, AST_CAUSE_NO_USER_RESPONSE);
-		}
-		if (pkt->owner->owner) {
+		if (owner_chan) {
 			ast_log(LOG_WARNING, "Hanging up call %s - no reply to our critical packet (see https://wiki.asterisk.org/wiki/display/AST/SIP+Retransmissions).\n", pkt->owner->callid);
 
 			if (pkt->is_resp &&
@@ -4101,8 +4102,10 @@
 				/* there is nothing left to do, mark the dialog as gone */
 				sip_alreadygone(pkt->owner);
 			}
-			ast_queue_hangup_with_cause(pkt->owner->owner, AST_CAUSE_NO_USER_RESPONSE);
-			ast_channel_unlock(pkt->owner->owner);
+			if (!ast_channel_hangupcause(owner_chan)) {
+				ast_channel_hangupcause_set(owner_chan, AST_CAUSE_NO_USER_RESPONSE);
+			}
+			ast_queue_hangup_with_cause(owner_chan, AST_CAUSE_NO_USER_RESPONSE);
 		} else {
 			/* If no channel owner, destroy now */
 
@@ -4120,36 +4123,66 @@
 	       check_pendings(pkt->owner);
 	}
 
+	if (owner_chan) {
+		ast_channel_unlock(owner_chan);
+		ast_channel_unref(owner_chan);
+	}
+
 	if (pkt->method == SIP_BYE) {
 		/* We're not getting answers on SIP BYE's.  Tear down the call anyway. */
 		sip_alreadygone(pkt->owner);
-		if (pkt->owner->owner) {
-			ast_channel_unlock(pkt->owner->owner);
-		}
 		append_history(pkt->owner, "ByeFailure", "Remote peer doesn't respond to bye. Destroying call anyway.");
 		pvt_set_needdestroy(pkt->owner, "no response to BYE");
 	}
 
-	/* Remove the packet */
+	/* Unlink and destroy the packet object. */
 	for (prev = NULL, cur = pkt->owner->packets; cur; prev = cur, cur = cur->next) {
 		if (cur == pkt) {
+			/* Unlink the node from the list. */
 			UNLINK(cur, pkt->owner->packets, prev);
-			sip_pvt_unlock(pkt->owner);
-			if (pkt->owner) {
-				pkt->owner = dialog_unref(pkt->owner,"pkt is being freed, its dialog ref is dead now");
-			}
-			if (pkt->data) {
-				ast_free(pkt->data);
-			}
-			pkt->data = NULL;
-			ast_free(pkt);
-			return 0;
+			ao2_t_ref(pkt, -1, "Packet retransmission list (retransmission complete)");
+			break;
 		}
 	}
-	/* error case */
-	ast_log(LOG_WARNING, "Weird, couldn't find packet owner!\n");
+
+	/*
+	 * If the object was not in the list then we were in the process of
+	 * stopping retransmisions while we were sending this retransmission.
+	 */
+
 	sip_pvt_unlock(pkt->owner);
+	ao2_t_ref(pkt, -1, "Scheduled packet retransmission complete");
 	return 0;
+}
+
+/* Run by the sched thread. */
+static int __stop_retrans_pkt(const void *data)
+{
+	struct sip_pkt *pkt = (void *) data;
+
+	AST_SCHED_DEL_UNREF(sched, pkt->retransid,
+		ao2_t_ref(pkt, -1, "Stop scheduled packet retransmission"));
+	ao2_t_ref(pkt, -1, "Stop packet retransmission action");
+	return 0;
+}
+
+static void stop_retrans_pkt(struct sip_pkt *pkt)
+{
+	ao2_t_ref(pkt, +1, "Stop packet retransmission action");
+	if (ast_sched_add(sched, 0, __stop_retrans_pkt, pkt) < 0) {
+		/* Uh Oh.  Expect bad behavior. */
+		ao2_t_ref(pkt, -1, "Failed to schedule stop packet retransmission action");
+	}
+}
+
+static void sip_pkt_dtor(void *vdoomed)
+{
+	struct sip_pkt *pkt = (void *) vdoomed;
+
+	if (pkt->owner) {
+		dialog_unref(pkt->owner, "Retransmission packet is being destroyed");
+	}
+	ast_free(pkt->data);
 }
 
 /*!
@@ -4182,12 +4215,14 @@
 		}
 	}
 
-	if (!(pkt = ast_calloc(1, sizeof(*pkt)))) {
+	pkt = ao2_alloc_options(sizeof(*pkt), sip_pkt_dtor, AO2_ALLOC_OPT_LOCK_NOLOCK);
+	if (!pkt) {
 		return AST_FAILURE;
 	}
 	/* copy data, add a terminator and save length */
-	if (!(pkt->data = ast_str_create(ast_str_strlen(data)))) {
-		ast_free(pkt);
+	pkt->data = ast_str_create(ast_str_strlen(data));
+	if (!pkt->data) {
+		ao2_t_ref(pkt, -1, "Failed to initialize");
 		return AST_FAILURE;
 	}
 	ast_str_set(&pkt->data, 0, "%s%s", ast_str_buffer(data), "\0");
@@ -4197,8 +4232,11 @@
 	pkt->is_resp = resp;
 	pkt->is_fatal = fatal;
 	pkt->owner = dialog_ref(p, "__sip_reliable_xmit: setting pkt->owner");
+
+	/* The retransmission list owns a pkt ref */
 	pkt->next = p->packets;
 	p->packets = pkt;	/* Add it to the queue */
+
 	if (resp) {
 		/* Parse out the response code */
 		if (sscanf(ast_str_buffer(pkt->data), "SIP/2.0 %30u", &respid) == 1) {
@@ -4206,7 +4244,6 @@
 		}
 	}
 	pkt->timer_t1 = p->timer_t1;	/* Set SIP timer T1 */
-	pkt->retransid = -1;
 	if (pkt->timer_t1) {
 		siptimer_a = pkt->timer_t1;
 	}
@@ -4215,7 +4252,12 @@
 	pkt->retrans_stop_time = 64 * (pkt->timer_t1 ? pkt->timer_t1 : DEFAULT_TIMER_T1); /* time in ms after pkt->time_sent to stop retransmission */
 
 	/* Schedule retransmission */
-	AST_SCHED_REPLACE_VARIABLE(pkt->retransid, sched, siptimer_a, retrans_pkt, pkt, 1);
+	ao2_t_ref(pkt, +1, "Schedule packet retransmission");
+	pkt->retransid = ast_sched_add_variable(sched, siptimer_a, retrans_pkt, pkt, 1);
+	if (pkt->retransid < 0) {
+		ao2_t_ref(pkt, -1, "Failed to schedule packet retransmission");
+	}
+
 	if (sipdebug) {
 		ast_debug(4, "*** SIP TIMER: Initializing retransmit timer on packet: Id  #%d\n", pkt->retransid);
 	}
@@ -4225,11 +4267,11 @@
 	if (xmitres == XMIT_ERROR) {	/* Serious network trouble, no need to try again */
 		append_history(pkt->owner, "XmitErr", "%s", pkt->is_fatal ? "(Critical)" : "(Non-critical)");
 		ast_log(LOG_ERROR, "Serious Network Trouble; __sip_xmit returns error for pkt data\n");
-		AST_SCHED_DEL(sched, pkt->retransid);
+
+		/* Unlink and destroy the packet object. */
 		p->packets = pkt->next;
-		pkt->owner = dialog_unref(pkt->owner,"pkt is being freed, its dialog ref is dead now");
-		ast_free(pkt->data);
-		ast_free(pkt);
+		stop_retrans_pkt(pkt);
+		ao2_t_ref(pkt, -1, "Packet retransmission list");
 		return AST_FAILURE;
 	} else {
 		/* This is odd, but since the retrans timer starts at 500ms and the do_monitor thread
@@ -4479,33 +4521,11 @@
 				if (sipdebug)
 					ast_debug(4, "** SIP TIMER: Cancelling retransmit of packet (reply received) Retransid #%d\n", cur->retransid);
 			}
-			/* This odd section is designed to thwart a
-			 * race condition in the packet scheduler. There are
-			 * two conditions under which deleting the packet from the
-			 * scheduler can fail.
-			 *
-			 * 1. The packet has been removed from the scheduler because retransmission
-			 * is being attempted. The problem is that if the packet is currently attempting
-			 * retransmission and we are at this point in the code, then that MUST mean
-			 * that retrans_pkt is waiting on p's lock. Therefore we will relinquish the
-			 * lock temporarily to allow retransmission.
-			 *
-			 * 2. The packet has reached its maximum number of retransmissions and has
-			 * been permanently removed from the packet scheduler. If this is the case, then
-			 * the packet's retransid will be set to -1. The atomicity of the setting and checking
-			 * of the retransid to -1 is ensured since in both cases p's lock is held.
-			 */
-			while (cur->retransid > -1 && ast_sched_del(sched, cur->retransid)) {
-				sip_pvt_unlock(p);
-				usleep(1);
-				sip_pvt_lock(p);
-			}
+
+			/* Unlink and destroy the packet object. */
 			UNLINK(cur, p->packets, prev);
-			dialog_unref(cur->owner, "unref pkt cur->owner dialog from sip ack before freeing pkt");
-			if (cur->data) {
-				ast_free(cur->data);
-			}
-			ast_free(cur);
+			stop_retrans_pkt(cur);
+			ao2_t_ref(cur, -1, "Packet retransmission list");
 			break;
 		}
 	}
@@ -4546,7 +4566,7 @@
 				if (sipdebug)
 					ast_debug(4, "*** SIP TIMER: Cancelling retransmission #%d - %s (got response)\n", cur->retransid, sip_methods[sipmethod].text);
 			}
-			AST_SCHED_DEL(sched, cur->retransid);
+			stop_retrans_pkt(cur);
 			res = TRUE;
 			break;
 		}
@@ -26707,13 +26727,10 @@
 		 */
 		for (pkt = p->packets, prev_pkt = NULL; pkt; prev_pkt = pkt, pkt = pkt->next) {
 			if (pkt->seqno == p->lastinvite && pkt->response_code == 487) {
-				AST_SCHED_DEL(sched, pkt->retransid);
+				/* Unlink and destroy the packet object. */
 				UNLINK(pkt, p->packets, prev_pkt);
-				dialog_unref(pkt->owner, "unref packet->owner from dialog");
-				if (pkt->data) {
-					ast_free(pkt->data);
-				}
-				ast_free(pkt);
+				stop_retrans_pkt(pkt);
+				ao2_t_ref(pkt, -1, "Packet retransmission list");
 				break;
 			}
 		}

-- 
To view, visit https://gerrit.asterisk.org/2397
To unsubscribe, visit https://gerrit.asterisk.org/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I50926d81be53f4cd3d572a3292cd25f563f59641
Gerrit-PatchSet: 3
Gerrit-Project: asterisk
Gerrit-Branch: 13
Gerrit-Owner: Richard Mudgett <rmudgett at digium.com>
Gerrit-Reviewer: Anonymous Coward #1000019
Gerrit-Reviewer: Joshua Colp <jcolp at digium.com>
Gerrit-Reviewer: Mark Michelson <mmichelson at digium.com>



More information about the asterisk-code-review mailing list