dccp: Refine the wait-for-ccid mechanism

This extends the existing wait-for-ccid routine so that it may be used with different types of CCID. It further addresses the problems listed below.

The code checks whether the write queue is non-empty and, if so, grants the TX CCID up to `timeout' jiffies to drain the queue. It will instead purge that queue if
 * the delay suggested by the CCID exceeds the time budget;
 * a socket error occurred while waiting for the CCID;
 * there is a signal pending (e.g. annoyed user pressed Control-C);
 * the CCID does not support delays (we don't know how long it will take).

Details [can be removed]
------------------------
DCCP's sending mechanism functions a bit like non-blocking I/O: dccp_sendmsg() will enqueue up to net.dccp.default.tx_qlen packets (default=5), without waiting for them to be released to the network.

Rate-based CCIDs, such as CCID3/4, can impose sending delays of up to 64 seconds (t_mbi in RFC 3448). Hence the write queue may still contain packets when the application closes. Since the write queue is congestion-controlled by the CCID, draining the queue is also under control of the CCID.

There are several problems that needed to be addressed:

1) The queue-drain mechanism only works with rate-based CCIDs. If CCID2, for example, has a full TX queue and becomes network-limited just as the application wants to close, then waiting for CCID2 to become unblocked could lead to an indefinite delay (i.e., application "hangs").
2) Since each TX CCID in turn uses a feedback mechanism, there may be changes in its sending policy while the queue is being drained. This can lead to further delays during which the application will not be able to terminate.
3) The minimum wait time for CCID3/4 can be expected to be the queue length times the current inter-packet delay. For example if tx_qlen=100 and a delay of 15 ms is used for each packet, then the application would have to wait for a minimum of 1.5 seconds before being allowed to exit.
4) There is no way for the user/application to control this behaviour. It would be good to use the timeout argument of dccp_close() as an upper bound. Then the maximum time that an application is willing to wait for its CCIDs can be set via the SO_LINGER option.

These problems are addressed by giving the CCID a grace period of up to the `timeout' value.

The wait-for-ccid function is, as before, used when (a) the application has read all the data in its receive buffer and (b) SO_LINGER was set with a non-zero linger time, or (c) the socket is either in the OPEN (active close) or in the PASSIVE_CLOSEREQ state (client application closes after receiving CloseReq).

In addition, there is a catch-all case: __skb_queue_purge() is called after waiting for the CCID. This is necessary since the write queue may still have data when (a) the host has been passively closed, (b) the connection is terminated abnormally (unread data, zero linger time), or (c) wait-for-ccid could not finish within the given time limit.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
author: Gerrit Renker <gerrit@erg.abdn.ac.uk> 2008-09-04 01:30:19 -0400
committer: Gerrit Renker <gerrit@erg.abdn.ac.uk> 2008-09-04 01:45:38 -0400
commit: 146993cf5174472644ed11bd5fb539f0af8bfa49 (patch)
tree: b2c5343ad610fe113425a3663f0dc3ddb478911b /net
parent: e7937772d7a2b0127cc4cbc67bc594e139fdaf63 (diff)
4 files changed, 82 insertions, 53 deletions
diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
index 1e65378eea3f..74c90cd27677 100644
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -234,8 +234,9 @@ extern void dccp_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
 extern void dccp_send_sync(struct sock *sk, const u64 seq,
                           const enum dccp_pkt_type pkt_type);
-extern void dccp_write_xmit(struct sock *sk, int block);
+extern void dccp_write_xmit(struct sock *sk);
 extern void dccp_write_space(struct sock *sk);
+extern void dccp_flush_write_queue(struct sock *sk, long *time_budget);
 extern void dccp_init_xmit_timers(struct sock *sk);
 static inline void dccp_clear_xmit_timers(struct sock *sk)
diff --git a/net/dccp/output.c b/net/dccp/output.c
index 9afd58e39e23..39056dc61355 100644
--- a/net/dccp/output.c
+++ b/net/dccp/output.c
@@ -206,49 +206,29 @@ void dccp_write_space(struct sock *sk)
 }
 /**
- * dccp_wait_for_ccid - Wait for ccid to tell us we can send a packet
+ * dccp_wait_for_ccid  -  Await CCID send permission
 * @sk:    socket to wait for
- * @skb:   current skb to pass on for waiting
+ * @delay: timeout in jiffies
- * @delay: sleep timeout in milliseconds (> 0)
+ * This is used by CCIDs which need to delay the send time in process context.
- * This function is called by default when the socket is closed, and
- * when a non-zero linger time is set on the socket. For consistency
 */
-static int dccp_wait_for_ccid(struct sock *sk, struct sk_buff *skb, int delay)
+static int dccp_wait_for_ccid(struct sock *sk, unsigned long delay)
 {
-        struct dccp_sock *dp = dccp_sk(sk);
        DEFINE_WAIT(wait);
-        unsigned long jiffdelay;
+        long remaining;
-        int rc;
-        do {
-                dccp_pr_debug("delayed send by %d msec\n", delay);
-                jiffdelay = msecs_to_jiffies(delay);
-                prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
-                sk->sk_write_pending++;
+        prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
-                release_sock(sk);
+        sk->sk_write_pending++;
-                schedule_timeout(jiffdelay);
+        release_sock(sk);
-                lock_sock(sk);
-                sk->sk_write_pending--;
-                if (sk->sk_err)
+        remaining = schedule_timeout(delay);
-                        goto do_error;
-                if (signal_pending(current))
-                        goto do_interrupted;
-                rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb);
+        lock_sock(sk);
-        } while ((delay = rc) > 0);
+        sk->sk_write_pending--;
-out:
        finish_wait(sk->sk_sleep, &wait);
-        return rc;
+        if (signal_pending(current) || sk->sk_err)
-do_error:
+                return -1;
-        rc = -EPIPE;
+        return remaining;
-        goto out;
-do_interrupted:
-        rc = -EINTR;
-        goto out;
 }
 /**
@@ -311,7 +291,53 @@ static void dccp_xmit_packet(struct sock *sk)
                dccp_send_sync(sk, dp->dccps_gsr, DCCP_PKT_SYNC);
 }
-void dccp_write_xmit(struct sock *sk, int block)
+/**
+ * dccp_flush_write_queue  -  Drain queue at end of connection
+ * Since dccp_sendmsg queues packets without waiting for them to be sent, it may
+ * happen that the TX queue is not empty at the end of a connection. We give the
+ * HC-sender CCID a grace period of up to @time_budget jiffies. If this function
+ * returns with a non-empty write queue, it will be purged later.
+ */
+void dccp_flush_write_queue(struct sock *sk, long *time_budget)
+{
+        struct dccp_sock *dp = dccp_sk(sk);
+        struct sk_buff *skb;
+        long delay, rc;
+        while (*time_budget > 0 && (skb = skb_peek(&sk->sk_write_queue))) {
+                rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb);
+                switch (ccid_packet_dequeue_eval(rc)) {
+                case CCID_PACKET_WILL_DEQUEUE_LATER:
+                        /*
+                         * If the CCID determines when to send, the next sending
+                         * time is unknown or the CCID may not even send again
+                         * (e.g. remote host crashes or lost Ack packets).
+                         */
+                        DCCP_WARN("CCID did not manage to send all packets\n");
+                        return;
+                case CCID_PACKET_DELAY:
+                        delay = msecs_to_jiffies(rc);
+                        if (delay > *time_budget)
+                                return;
+                        rc = dccp_wait_for_ccid(sk, delay);
+                        if (rc < 0)
+                                return;
+                        *time_budget -= (delay - rc);
+                        /* check again if we can send now */
+                        break;
+                case CCID_PACKET_SEND_AT_ONCE:
+                        dccp_xmit_packet(sk);
+                        break;
+                case CCID_PACKET_ERR:
+                        skb_dequeue(&sk->sk_write_queue);
+                        kfree_skb(skb);
+                        dccp_pr_debug("packet discarded due to err=%ld\n", rc);
+                }
+        }
+}
+void dccp_write_xmit(struct sock *sk)
 {
        struct dccp_sock *dp = dccp_sk(sk);
        struct sk_buff *skb;
@@ -323,19 +349,9 @@ void dccp_write_xmit(struct sock *sk, int block)
                case CCID_PACKET_WILL_DEQUEUE_LATER:
                        return;
                case CCID_PACKET_DELAY:
-                        if (!block) {
+                        sk_reset_timer(sk, &dp->dccps_xmit_timer,
-                                sk_reset_timer(sk, &dp->dccps_xmit_timer,
+                                       jiffies + msecs_to_jiffies(rc));
-                                                msecs_to_jiffies(rc)+jiffies);
+                        return;
-                                return;
-                        }
-                        rc = dccp_wait_for_ccid(sk, skb, rc);
-                        if (rc && rc != -EINTR) {
-                                DCCP_BUG("err=%d after dccp_wait_for_ccid", rc);
-                                skb_dequeue(&sk->sk_write_queue);
-                                kfree_skb(skb);
-                                break;
-                        }
-                        /* fall through */
                case CCID_PACKET_SEND_AT_ONCE:
                        dccp_xmit_packet(sk);
                        break;
@@ -660,7 +676,6 @@ void dccp_send_close(struct sock *sk, const int active)
                DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_CLOSE;
        if (active) {
-                dccp_write_xmit(sk, 1);
                dccp_skb_entail(sk, skb);
                dccp_transmit_skb(sk, skb_clone(skb, prio));
                /*
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 11905e0cf8f7..8c125ffab1c5 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -735,7 +735,7 @@ int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
                goto out_discard;
        skb_queue_tail(&sk->sk_write_queue, skb);
-        dccp_write_xmit(sk,0);
+        dccp_write_xmit(sk);
 out_release:
        release_sock(sk);
        return rc ? : len;
@@ -958,9 +958,22 @@ void dccp_close(struct sock *sk, long timeout)
                /* Check zero linger _after_ checking for unread data. */
                sk->sk_prot->disconnect(sk, 0);
        } else if (sk->sk_state != DCCP_CLOSED) {
+                /*
+                 * Normal connection termination. May need to wait if there are
+                 * still packets in the TX queue that are delayed by the CCID.
+                 */
+                dccp_flush_write_queue(sk, &timeout);
                dccp_terminate_connection(sk);
        }
+        /*
+         * Flush write queue. This may be necessary in several cases:
+         * - we have been closed by the peer but still have application data;
+         * - abortive termination (unread data or zero linger time),
+         * - normal termination but queue could not be flushed within time limit
+         */
+        __skb_queue_purge(&sk->sk_write_queue);
        sk_stream_wait_close(sk, timeout);
 adjudge_to_death:
diff --git a/net/dccp/timer.c b/net/dccp/timer.c
index 9369aca4b0e9..e02d5a94f4c0 100644
--- a/net/dccp/timer.c
+++ b/net/dccp/timer.c
@@ -249,7 +249,7 @@ static void dccp_write_xmitlet(unsigned long data)
        if (sock_owned_by_user(sk))
                sk_reset_timer(sk, &dccp_sk(sk)->dccps_xmit_timer, jiffies + 1);
        else
-                dccp_write_xmit(sk, 0);
+                dccp_write_xmit(sk);
        bh_unlock_sock(sk);
 }
author	Gerrit Renker <gerrit@erg.abdn.ac.uk>	2008-09-04 01:30:19 -0400
committer	Gerrit Renker <gerrit@erg.abdn.ac.uk>	2008-09-04 01:45:38 -0400
commit	146993cf5174472644ed11bd5fb539f0af8bfa49 (patch)
tree	b2c5343ad610fe113425a3663f0dc3ddb478911b /net
parent	e7937772d7a2b0127cc4cbc67bc594e139fdaf63 (diff)

diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 1e65378eea3f..74c90cd27677 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h
@@ -234,8 +234,9 @@ extern void dccp_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
234	extern void dccp_send_sync(struct sock *sk, const u64 seq,	234	extern void dccp_send_sync(struct sock *sk, const u64 seq,
235	const enum dccp_pkt_type pkt_type);	235	const enum dccp_pkt_type pkt_type);
236		236
237	extern void dccp_write_xmit(struct sock *sk, int block);	237	extern void dccp_write_xmit(struct sock *sk);
238	extern void dccp_write_space(struct sock *sk);	238	extern void dccp_write_space(struct sock *sk);
		239	extern void dccp_flush_write_queue(struct sock *sk, long *time_budget);
239		240
240	extern void dccp_init_xmit_timers(struct sock *sk);	241	extern void dccp_init_xmit_timers(struct sock *sk);
241	static inline void dccp_clear_xmit_timers(struct sock *sk)	242	static inline void dccp_clear_xmit_timers(struct sock *sk)


diff --git a/net/dccp/output.c b/net/dccp/output.c index 9afd58e39e23..39056dc61355 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c
@@ -206,49 +206,29 @@ void dccp_write_space(struct sock *sk)
206	}	206	}
207		207
208	/**	208	/**
209	* dccp_wait_for_ccid - Wait for ccid to tell us we can send a packet	209	* dccp_wait_for_ccid - Await CCID send permission
210	* @sk: socket to wait for	210	* @sk: socket to wait for
211	* @skb: current skb to pass on for waiting	211	* @delay: timeout in jiffies
212	* @delay: sleep timeout in milliseconds (> 0)	212	* This is used by CCIDs which need to delay the send time in process context.
213	* This function is called by default when the socket is closed, and
214	* when a non-zero linger time is set on the socket. For consistency
215	*/	213	*/
216	static int dccp_wait_for_ccid(struct sock *sk, struct sk_buff *skb, int delay)	214	static int dccp_wait_for_ccid(struct sock *sk, unsigned long delay)
217	{	215	{
218	struct dccp_sock *dp = dccp_sk(sk);
219	DEFINE_WAIT(wait);	216	DEFINE_WAIT(wait);
220	unsigned long jiffdelay;	217	long remaining;
221	int rc;
222
223	do {
224	dccp_pr_debug("delayed send by %d msec\n", delay);
225	jiffdelay = msecs_to_jiffies(delay);
226
227	prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
228		218
229	sk->sk_write_pending++;	219	prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
230	release_sock(sk);	220	sk->sk_write_pending++;
231	schedule_timeout(jiffdelay);	221	release_sock(sk);
232	lock_sock(sk);
233	sk->sk_write_pending--;
234		222
235	if (sk->sk_err)	223	remaining = schedule_timeout(delay);
236	goto do_error;
237	if (signal_pending(current))
238	goto do_interrupted;
239		224
240	rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb);	225	lock_sock(sk);
241	} while ((delay = rc) > 0);	226	sk->sk_write_pending--;
242	out:
243	finish_wait(sk->sk_sleep, &wait);	227	finish_wait(sk->sk_sleep, &wait);
244	return rc;	228
245		229	if (signal_pending(current) || sk->sk_err)
246	do_error:	230	return -1;
247	rc = -EPIPE;	231	return remaining;
248	goto out;
249	do_interrupted:
250	rc = -EINTR;
251	goto out;
252	}	232	}
253		233
254	/**	234	/**
@@ -311,7 +291,53 @@ static void dccp_xmit_packet(struct sock *sk)
311	dccp_send_sync(sk, dp->dccps_gsr, DCCP_PKT_SYNC);	291	dccp_send_sync(sk, dp->dccps_gsr, DCCP_PKT_SYNC);
312	}	292	}
313		293
314	void dccp_write_xmit(struct sock *sk, int block)	294	/**
		295	* dccp_flush_write_queue - Drain queue at end of connection
		296	* Since dccp_sendmsg queues packets without waiting for them to be sent, it may
		297	* happen that the TX queue is not empty at the end of a connection. We give the
		298	* HC-sender CCID a grace period of up to @time_budget jiffies. If this function
		299	* returns with a non-empty write queue, it will be purged later.
		300	*/
		301	void dccp_flush_write_queue(struct sock *sk, long *time_budget)
		302	{
		303	struct dccp_sock *dp = dccp_sk(sk);
		304	struct sk_buff *skb;
		305	long delay, rc;
		306
		307	while (*time_budget > 0 && (skb = skb_peek(&sk->sk_write_queue))) {
		308	rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb);
		309
		310	switch (ccid_packet_dequeue_eval(rc)) {
		311	case CCID_PACKET_WILL_DEQUEUE_LATER:
		312	/*
		313	* If the CCID determines when to send, the next sending
		314	* time is unknown or the CCID may not even send again
		315	* (e.g. remote host crashes or lost Ack packets).
		316	*/
		317	DCCP_WARN("CCID did not manage to send all packets\n");
		318	return;
		319	case CCID_PACKET_DELAY:
		320	delay = msecs_to_jiffies(rc);
		321	if (delay > *time_budget)
		322	return;
		323	rc = dccp_wait_for_ccid(sk, delay);
		324	if (rc < 0)
		325	return;
		326	*time_budget -= (delay - rc);
		327	/* check again if we can send now */
		328	break;
		329	case CCID_PACKET_SEND_AT_ONCE:
		330	dccp_xmit_packet(sk);
		331	break;
		332	case CCID_PACKET_ERR:
		333	skb_dequeue(&sk->sk_write_queue);
		334	kfree_skb(skb);
		335	dccp_pr_debug("packet discarded due to err=%ld\n", rc);
		336	}
		337	}
		338	}
		339
		340	void dccp_write_xmit(struct sock *sk)
315	{	341	{
316	struct dccp_sock *dp = dccp_sk(sk);	342	struct dccp_sock *dp = dccp_sk(sk);
317	struct sk_buff *skb;	343	struct sk_buff *skb;
@@ -323,19 +349,9 @@ void dccp_write_xmit(struct sock *sk, int block)
323	case CCID_PACKET_WILL_DEQUEUE_LATER:	349	case CCID_PACKET_WILL_DEQUEUE_LATER:
324	return;	350	return;
325	case CCID_PACKET_DELAY:	351	case CCID_PACKET_DELAY:
326	if (!block) {	352	sk_reset_timer(sk, &dp->dccps_xmit_timer,
327	sk_reset_timer(sk, &dp->dccps_xmit_timer,	353	jiffies + msecs_to_jiffies(rc));
328	msecs_to_jiffies(rc)+jiffies);	354	return;
329	return;
330	}
331	rc = dccp_wait_for_ccid(sk, skb, rc);
332	if (rc && rc != -EINTR) {
333	DCCP_BUG("err=%d after dccp_wait_for_ccid", rc);
334	skb_dequeue(&sk->sk_write_queue);
335	kfree_skb(skb);
336	break;
337	}
338	/* fall through */
339	case CCID_PACKET_SEND_AT_ONCE:	355	case CCID_PACKET_SEND_AT_ONCE:
340	dccp_xmit_packet(sk);	356	dccp_xmit_packet(sk);
341	break;	357	break;
@@ -660,7 +676,6 @@ void dccp_send_close(struct sock *sk, const int active)
660	DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_CLOSE;	676	DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_CLOSE;
661		677
662	if (active) {	678	if (active) {
663	dccp_write_xmit(sk, 1);
664	dccp_skb_entail(sk, skb);	679	dccp_skb_entail(sk, skb);
665	dccp_transmit_skb(sk, skb_clone(skb, prio));	680	dccp_transmit_skb(sk, skb_clone(skb, prio));
666	/*	681	/*


diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 11905e0cf8f7..8c125ffab1c5 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c
@@ -735,7 +735,7 @@ int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
735	goto out_discard;	735	goto out_discard;
736		736
737	skb_queue_tail(&sk->sk_write_queue, skb);	737	skb_queue_tail(&sk->sk_write_queue, skb);
738	dccp_write_xmit(sk,0);	738	dccp_write_xmit(sk);
739	out_release:	739	out_release:
740	release_sock(sk);	740	release_sock(sk);
741	return rc ? : len;	741	return rc ? : len;
@@ -958,9 +958,22 @@ void dccp_close(struct sock *sk, long timeout)
958	/* Check zero linger _after_ checking for unread data. */	958	/* Check zero linger _after_ checking for unread data. */
959	sk->sk_prot->disconnect(sk, 0);	959	sk->sk_prot->disconnect(sk, 0);
960	} else if (sk->sk_state != DCCP_CLOSED) {	960	} else if (sk->sk_state != DCCP_CLOSED) {
		961	/*
		962	* Normal connection termination. May need to wait if there are
		963	* still packets in the TX queue that are delayed by the CCID.
		964	*/
		965	dccp_flush_write_queue(sk, &timeout);
961	dccp_terminate_connection(sk);	966	dccp_terminate_connection(sk);
962	}	967	}
963		968
		969	/*
		970	* Flush write queue. This may be necessary in several cases:
		971	* - we have been closed by the peer but still have application data;
		972	* - abortive termination (unread data or zero linger time),
		973	* - normal termination but queue could not be flushed within time limit
		974	*/
		975	__skb_queue_purge(&sk->sk_write_queue);
		976
964	sk_stream_wait_close(sk, timeout);	977	sk_stream_wait_close(sk, timeout);
965		978
966	adjudge_to_death:	979	adjudge_to_death:


diff --git a/net/dccp/timer.c b/net/dccp/timer.c index 9369aca4b0e9..e02d5a94f4c0 100644 --- a/net/dccp/timer.c +++ b/net/dccp/timer.c
@@ -249,7 +249,7 @@ static void dccp_write_xmitlet(unsigned long data)
249	if (sock_owned_by_user(sk))	249	if (sock_owned_by_user(sk))
250	sk_reset_timer(sk, &dccp_sk(sk)->dccps_xmit_timer, jiffies + 1);	250	sk_reset_timer(sk, &dccp_sk(sk)->dccps_xmit_timer, jiffies + 1);
251	else	251	else
252	dccp_write_xmit(sk, 0);	252	dccp_write_xmit(sk);
253	bh_unlock_sock(sk);	253	bh_unlock_sock(sk);
254	}	254	}
255		255