From b36930dd508e00f0c5083bcd57d25de6d0375c76 Mon Sep 17 00:00:00 2001
From: David Miller <davem@davemloft.net>
Date: Wed, 10 Nov 2010 21:56:39 -0800
Subject: dlm: Handle application limited situations properly.

In the normal regime where an application uses non-blocking I/O
writes on a socket, they will handle -EAGAIN and use poll() to
wait for send space.

They don't actually sleep on the socket I/O write.

But kernel level RPC layers that do socket I/O operations directly
and key off of -EAGAIN on the write() to "try again later" don't
use poll(), they instead have their own sleeping mechanism and
rely upon ->sk_write_space() to trigger the wakeup.

So they do effectively sleep on the write(), but this mechanism
alone does not let the socket layers know what's going on.

Therefore they must emulate what would have happened, otherwise
TCP cannot possibly see that the connection is application window
size limited.

Handle this, therefore, like SUNRPC by setting SOCK_NOSPACE and
bumping the ->sk_write_count as needed when we hit the send buffer
limits.

This should make TCP send buffer size auto-tuning and the
->sk_write_space() callback invocations actually happen.

Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/lowcomms.c | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 37a34c2c622a..77720f89c879 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -108,6 +108,7 @@ struct connection {
 #define CF_INIT_PENDING 4
 #define CF_IS_OTHERCON 5
 #define CF_CLOSE 6
+#define CF_APP_LIMITED 7
 	struct list_head writequeue;  /* List of outgoing writequeue_entries */
 	spinlock_t writequeue_lock;
 	int (*rx_action) (struct connection *);	/* What to do when active */
@@ -295,7 +296,17 @@ static void lowcomms_write_space(struct sock *sk)
 {
 	struct connection *con = sock2con(sk);
 
-	if (con && !test_and_set_bit(CF_WRITE_PENDING, &con->flags))
+	if (!con)
+		return;
+
+	clear_bit(SOCK_NOSPACE, &con->sock->flags);
+
+	if (test_and_clear_bit(CF_APP_LIMITED, &con->flags)) {
+		con->sock->sk->sk_write_pending--;
+		clear_bit(SOCK_ASYNC_NOSPACE, &con->sock->flags);
+	}
+
+	if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags))
 		queue_work(send_workqueue, &con->swork);
 }
 
@@ -1319,6 +1330,15 @@ static void send_to_sock(struct connection *con)
 			ret = kernel_sendpage(con->sock, e->page, offset, len,
 					      msg_flags);
 			if (ret == -EAGAIN || ret == 0) {
+				if (ret == -EAGAIN &&
+				    test_bit(SOCK_ASYNC_NOSPACE, &con->sock->flags) &&
+				    !test_and_set_bit(CF_APP_LIMITED, &con->flags)) {
+					/* Notify TCP that we're limited by the
+					 * application window size.
+					 */
+					set_bit(SOCK_NOSPACE, &con->sock->flags);
+					con->sock->sk->sk_write_pending++;
+				}
 				cond_resched();
 				goto out;
 			}
-- 
cgit v1.2.2


From dcce240ead802d42b1e45ad2fcb2ed4a399cb255 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Fri, 12 Nov 2010 12:12:29 +0000
Subject: dlm: Use cmwq for send and receive workqueues

So far as I can tell, there is no reason to use a single-threaded
send workqueue for dlm, since it may need to send to several sockets
concurrently. Both workqueues are set to WQ_MEM_RECLAIM to avoid
any possible deadlocks, WQ_HIGHPRI since locking traffic is highly
latency sensitive (and to avoid a priority inversion wrt GFS2's
glock_workqueue) and WQ_FREEZABLE just in case someone needs to do
that (even though with current cluster infrastructure, it doesn't
make sense as the node will most likely land up ejected from the
cluster) in the future.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/lowcomms.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 77720f89c879..1d4e644c6589 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -1451,14 +1451,16 @@ static void work_stop(void)
 static int work_start(void)
 {
 	int error;
-	recv_workqueue = create_workqueue("dlm_recv");
+	recv_workqueue = alloc_workqueue("dlm_recv", WQ_MEM_RECLAIM |
+					 WQ_HIGHPRI | WQ_FREEZEABLE, 0);
 	error = IS_ERR(recv_workqueue);
 	if (error) {
 		log_print("can't start dlm_recv %d", error);
 		return error;
 	}
 
-	send_workqueue = create_singlethread_workqueue("dlm_send");
+	send_workqueue = alloc_workqueue("dlm_send", WQ_MEM_RECLAIM |
+					 WQ_HIGHPRI | WQ_FREEZEABLE, 0);
 	error = IS_ERR(send_workqueue);
 	if (error) {
 		log_print("can't start dlm_send %d", error);
-- 
cgit v1.2.2


From cb2d45da81c86d5191b19d0f67732a854bc0253c Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Fri, 12 Nov 2010 11:12:55 -0600
Subject: dlm: use TCP_NODELAY

Nagling doesn't help and can sometimes hurt dlm comms.

Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/lowcomms.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'fs')

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 1d4e644c6589..2bedb0ac5f92 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -926,6 +926,7 @@ static void tcp_connect_to_sock(struct connection *con)
 	struct sockaddr_storage saddr, src_addr;
 	int addr_len;
 	struct socket *sock = NULL;
+	int one = 1;
 
 	if (con->nodeid == 0) {
 		log_print("attempt to connect sock 0 foiled");
@@ -971,6 +972,11 @@ static void tcp_connect_to_sock(struct connection *con)
 	make_sockaddr(&saddr, dlm_config.ci_tcp_port, &addr_len);
 
 	log_print("connecting to %d", con->nodeid);
+
+	/* Turn off Nagle's algorithm */
+	kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (char *)&one,
+			  sizeof(one));
+
 	result =
 		sock->ops->connect(sock, (struct sockaddr *)&saddr, addr_len,
 				   O_NONBLOCK);
@@ -1022,6 +1028,10 @@ static struct socket *tcp_create_listen_sock(struct connection *con,
 		goto create_out;
 	}
 
+	/* Turn off Nagle's algorithm */
+	kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (char *)&one,
+			  sizeof(one));
+
 	result = kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR,
 				   (char *)&one, sizeof(one));
 
-- 
cgit v1.2.2


From f92c8dd7a0eb18124521e2b549f88422e17f707b Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Fri, 12 Nov 2010 11:15:20 -0600
Subject: dlm: reduce cond_resched during send

Calling cond_resched() after every send can unnecessarily
degrade performance.  Go back to an old method of scheduling
after 25 messages.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/lowcomms.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 2bedb0ac5f92..0e75f152eac2 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -63,6 +63,9 @@
 #define NEEDED_RMEM (4*1024*1024)
 #define CONN_HASH_SIZE 32
 
+/* Number of messages to send before rescheduling */
+#define MAX_SEND_MSG_COUNT 25
+
 struct cbuf {
 	unsigned int base;
 	unsigned int len;
@@ -1318,6 +1321,7 @@ static void send_to_sock(struct connection *con)
 	const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
 	struct writequeue_entry *e;
 	int len, offset;
+	int count = 0;
 
 	mutex_lock(&con->sock_mutex);
 	if (con->sock == NULL)
@@ -1355,8 +1359,12 @@ static void send_to_sock(struct connection *con)
 			if (ret <= 0)
 				goto send_error;
 		}
-			/* Don't starve people filling buffers */
+
+		/* Don't starve people filling buffers */
+		if (++count >= MAX_SEND_MSG_COUNT) {
 			cond_resched();
+			count = 0;
+		}
 
 		spin_lock(&con->writequeue_lock);
 		e->offset += ret;
-- 
cgit v1.2.2


From b9d41052794385f9d47ebb7acf4a772f3ad02398 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@gmail.com>
Date: Mon, 13 Dec 2010 13:42:24 -0600
Subject: dlm: sanitize work_start() in lowcomms.c

The create_workqueue() returns NULL if failed rather than ERR_PTR().
Fix error checking and remove unnecessary variable 'error'.

Signed-off-by: Namhyung Kim <namhyung@gmail.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/lowcomms.c | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 0e75f152eac2..9c64ae9e4c1a 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -1468,22 +1468,19 @@ static void work_stop(void)
 
 static int work_start(void)
 {
-	int error;
 	recv_workqueue = alloc_workqueue("dlm_recv", WQ_MEM_RECLAIM |
 					 WQ_HIGHPRI | WQ_FREEZEABLE, 0);
-	error = IS_ERR(recv_workqueue);
-	if (error) {
-		log_print("can't start dlm_recv %d", error);
-		return error;
+	if (!recv_workqueue) {
+		log_print("can't start dlm_recv");
+		return -ENOMEM;
 	}
 
 	send_workqueue = alloc_workqueue("dlm_send", WQ_MEM_RECLAIM |
 					 WQ_HIGHPRI | WQ_FREEZEABLE, 0);
-	error = IS_ERR(send_workqueue);
-	if (error) {
-		log_print("can't start dlm_send %d", error);
+	if (!send_workqueue) {
+		log_print("can't start dlm_send");
 		destroy_workqueue(recv_workqueue);
-		return error;
+		return -ENOMEM;
 	}
 
 	return 0;
-- 
cgit v1.2.2