aboutsummaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
authorPablo Neira Ayuso <pablo@netfilter.org>2009-03-24 19:37:55 -0400
committerDavid S. Miller <davem@davemloft.net>2009-03-24 19:37:55 -0400
commit38938bfe3489394e2eed5e40c9bb8f66a2ce1405 (patch)
treecbe1cafdaa990d31f0297d306e3adc61bb1e95a2 /net
parent7f649269c318c41030e492fc35f03d38c6e3b39b (diff)
netlink: add NETLINK_NO_ENOBUFS socket flag
This patch adds the NETLINK_NO_ENOBUFS socket flag. This flag can be used by unicast and broadcast listeners to avoid receiving ENOBUFS errors. Generally speaking, ENOBUFS errors are useful to notify the listener of two things: a) You may increase the receiver buffer size via setsockopt(). b) You have lost messages; you may be out of sync. In some cases, ignoring ENOBUFS errors can be useful. For example: a) nfnetlink_queue: this subsystem does not have any sort of resync method and you can decide to ignore ENOBUFS once you have set a given buffer size. b) ctnetlink: you can use this together with the socket flag NETLINK_BROADCAST_SEND_ERROR to stop getting ENOBUFS errors as you do not need to resync (packets whose events are not delivered are dropped to provide reliable logging and state-synchronization). Moreover, the use of NETLINK_NO_ENOBUFS also reduces a "go up, go down" effect in terms of performance which is due to the netlink congestion control when the listener cannot back off. The effect is the following: 1) throughput rate goes up and netlink messages are inserted in the receiver buffer. 2) Then, the netlink buffer fills and overruns (nlk->state bit 0 is set). 3) While the listener empties the receiver buffer, netlink keeps dropping messages. Thus, throughput goes dramatically down. 4) Then, once the listener has emptied the buffer (nlk->state bit 0 is cleared), go to step 1. This effect is easy to trigger with netlink broadcast under heavy load, and it is more noticeable when using a big receiver buffer. You can find some results in [1] that show this problem. [1] http://1984.lsi.us.es/linux/netlink/ This patch also includes the use of sk_drops to account for the number of netlink messages dropped due to overrun. This value is shown in /proc/net/netlink. Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net')
-rw-r--r--net/netlink/af_netlink.c38
1 files changed, 32 insertions, 6 deletions
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index b73d4e61c5ac..8b6bbb3032b0 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -86,6 +86,7 @@ struct netlink_sock {
86#define NETLINK_KERNEL_SOCKET 0x1 86#define NETLINK_KERNEL_SOCKET 0x1
87#define NETLINK_RECV_PKTINFO 0x2 87#define NETLINK_RECV_PKTINFO 0x2
88#define NETLINK_BROADCAST_SEND_ERROR 0x4 88#define NETLINK_BROADCAST_SEND_ERROR 0x4
89#define NETLINK_RECV_NO_ENOBUFS 0x8
89 90
90static inline struct netlink_sock *nlk_sk(struct sock *sk) 91static inline struct netlink_sock *nlk_sk(struct sock *sk)
91{ 92{
@@ -717,10 +718,15 @@ static int netlink_getname(struct socket *sock, struct sockaddr *addr,
717 718
718static void netlink_overrun(struct sock *sk) 719static void netlink_overrun(struct sock *sk)
719{ 720{
720 if (!test_and_set_bit(0, &nlk_sk(sk)->state)) { 721 struct netlink_sock *nlk = nlk_sk(sk);
721 sk->sk_err = ENOBUFS; 722
722 sk->sk_error_report(sk); 723 if (!(nlk->flags & NETLINK_RECV_NO_ENOBUFS)) {
724 if (!test_and_set_bit(0, &nlk_sk(sk)->state)) {
725 sk->sk_err = ENOBUFS;
726 sk->sk_error_report(sk);
727 }
723 } 728 }
729 atomic_inc(&sk->sk_drops);
724} 730}
725 731
726static struct sock *netlink_getsockbypid(struct sock *ssk, u32 pid) 732static struct sock *netlink_getsockbypid(struct sock *ssk, u32 pid)
@@ -1182,6 +1188,15 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname,
1182 nlk->flags &= ~NETLINK_BROADCAST_SEND_ERROR; 1188 nlk->flags &= ~NETLINK_BROADCAST_SEND_ERROR;
1183 err = 0; 1189 err = 0;
1184 break; 1190 break;
1191 case NETLINK_NO_ENOBUFS:
1192 if (val) {
1193 nlk->flags |= NETLINK_RECV_NO_ENOBUFS;
1194 clear_bit(0, &nlk->state);
1195 wake_up_interruptible(&nlk->wait);
1196 } else
1197 nlk->flags &= ~NETLINK_RECV_NO_ENOBUFS;
1198 err = 0;
1199 break;
1185 default: 1200 default:
1186 err = -ENOPROTOOPT; 1201 err = -ENOPROTOOPT;
1187 } 1202 }
@@ -1224,6 +1239,16 @@ static int netlink_getsockopt(struct socket *sock, int level, int optname,
1224 return -EFAULT; 1239 return -EFAULT;
1225 err = 0; 1240 err = 0;
1226 break; 1241 break;
1242 case NETLINK_NO_ENOBUFS:
1243 if (len < sizeof(int))
1244 return -EINVAL;
1245 len = sizeof(int);
1246 val = nlk->flags & NETLINK_RECV_NO_ENOBUFS ? 1 : 0;
1247 if (put_user(len, optlen) ||
1248 put_user(val, optval))
1249 return -EFAULT;
1250 err = 0;
1251 break;
1227 default: 1252 default:
1228 err = -ENOPROTOOPT; 1253 err = -ENOPROTOOPT;
1229 } 1254 }
@@ -1879,12 +1904,12 @@ static int netlink_seq_show(struct seq_file *seq, void *v)
1879 if (v == SEQ_START_TOKEN) 1904 if (v == SEQ_START_TOKEN)
1880 seq_puts(seq, 1905 seq_puts(seq,
1881 "sk Eth Pid Groups " 1906 "sk Eth Pid Groups "
1882 "Rmem Wmem Dump Locks\n"); 1907 "Rmem Wmem Dump Locks Drops\n");
1883 else { 1908 else {
1884 struct sock *s = v; 1909 struct sock *s = v;
1885 struct netlink_sock *nlk = nlk_sk(s); 1910 struct netlink_sock *nlk = nlk_sk(s);
1886 1911
1887 seq_printf(seq, "%p %-3d %-6d %08x %-8d %-8d %p %d\n", 1912 seq_printf(seq, "%p %-3d %-6d %08x %-8d %-8d %p %-8d %-8d\n",
1888 s, 1913 s,
1889 s->sk_protocol, 1914 s->sk_protocol,
1890 nlk->pid, 1915 nlk->pid,
@@ -1892,7 +1917,8 @@ static int netlink_seq_show(struct seq_file *seq, void *v)
1892 atomic_read(&s->sk_rmem_alloc), 1917 atomic_read(&s->sk_rmem_alloc),
1893 atomic_read(&s->sk_wmem_alloc), 1918 atomic_read(&s->sk_wmem_alloc),
1894 nlk->cb, 1919 nlk->cb,
1895 atomic_read(&s->sk_refcnt) 1920 atomic_read(&s->sk_refcnt),
1921 atomic_read(&s->sk_drops)
1896 ); 1922 );
1897 1923
1898 } 1924 }