aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorHerbert Xu <herbert@gondor.apana.org.au>2010-05-24 03:12:34 -0400
committerDavid S. Miller <davem@davemloft.net>2010-05-24 03:12:34 -0400
commitf845172531fb7410c7fb7780b1a6e51ee6df7d52 (patch)
treeef1030d0ad9d9dbc8fe800a145c587f04be50ade
parenteda6e6f86b5f95b982ac7ebf7cf5be2a29a291e9 (diff)
cls_cgroup: Store classid in struct sock
Up until now cls_cgroup has relied on fetching the classid out of the current executing thread. This runs into trouble when a packet processing is delayed in which case it may execute out of another thread's context. Furthermore, even when a packet is not delayed we may fail to classify it if soft IRQs have been disabled, because this scenario is indistinguishable from one where a packet unrelated to the current thread is processed by a real soft IRQ. In fact, the current semantics is inherently broken, as a single skb may be constructed out of the writes of two different tasks. A different manifestation of this problem is when the TCP stack transmits in response of an incoming ACK. This is currently unclassified. As we already have a concept of packet ownership for accounting purposes in the skb->sk pointer, this is a natural place to store the classid in a persistent manner. This patch adds the cls_cgroup classid in struct sock, filling up an existing hole on 64-bit :) The value is set at socket creation time. So all sockets created via socket(2) automatically gains the ID of the thread creating it. Whenever another process touches the socket by either reading or writing to it, we will change the socket classid to that of the process if it has a valid (non-zero) classid. For sockets created on inbound connections through accept(2), we inherit the classid of the original listening socket through sk_clone, possibly preceding the actual accept(2) call. In order to minimise risks, I have not made this the authoritative classid. For now it is only used as a backup when we execute with soft IRQs disabled. Once we're completely happy with its semantics we can use it as the sole classid. Footnote: I have rearranged the error path on cls_group module creation. If we didn't do this, then there is a window where someone could create a tc rule using cls_group before the cgroup subsystem has been registered. Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--include/net/cls_cgroup.h63
-rw-r--r--include/net/sock.h10
-rw-r--r--net/core/sock.c18
-rw-r--r--net/sched/cls_cgroup.c50
-rw-r--r--net/socket.c9
5 files changed, 133 insertions, 17 deletions
diff --git a/include/net/cls_cgroup.h b/include/net/cls_cgroup.h
new file mode 100644
index 000000000000..ef2df1475b51
--- /dev/null
+++ b/include/net/cls_cgroup.h
@@ -0,0 +1,63 @@
1/*
2 * cls_cgroup.h Control Group Classifier
3 *
4 * Authors: Thomas Graf <tgraf@suug.ch>
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License as published by the Free
8 * Software Foundation; either version 2 of the License, or (at your option)
9 * any later version.
10 *
11 */
12
13#ifndef _NET_CLS_CGROUP_H
14#define _NET_CLS_CGROUP_H
15
16#include <linux/cgroup.h>
17#include <linux/hardirq.h>
18#include <linux/rcupdate.h>
19
20#ifdef CONFIG_CGROUPS
21struct cgroup_cls_state
22{
23 struct cgroup_subsys_state css;
24 u32 classid;
25};
26
27#ifdef CONFIG_NET_CLS_CGROUP
28static inline u32 task_cls_classid(struct task_struct *p)
29{
30 if (in_interrupt())
31 return 0;
32
33 return container_of(task_subsys_state(p, net_cls_subsys_id),
34 struct cgroup_cls_state, css).classid;
35}
36#else
37extern int net_cls_subsys_id;
38
39static inline u32 task_cls_classid(struct task_struct *p)
40{
41 int id;
42 u32 classid;
43
44 if (in_interrupt())
45 return 0;
46
47 rcu_read_lock();
48 id = rcu_dereference(net_cls_subsys_id);
49 if (id >= 0)
50 classid = container_of(task_subsys_state(p, id),
51 struct cgroup_cls_state, css)->classid;
52 rcu_read_unlock();
53
54 return classid;
55}
56#endif
57#else
58static inline u32 task_cls_classid(struct task_struct *p)
59{
60 return 0;
61}
62#endif
63#endif /* _NET_CLS_CGROUP_H */
diff --git a/include/net/sock.h b/include/net/sock.h
index 5697caf8cc76..d24f382cb712 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -312,7 +312,7 @@ struct sock {
312 void *sk_security; 312 void *sk_security;
313#endif 313#endif
314 __u32 sk_mark; 314 __u32 sk_mark;
315 /* XXX 4 bytes hole on 64 bit */ 315 u32 sk_classid;
316 void (*sk_state_change)(struct sock *sk); 316 void (*sk_state_change)(struct sock *sk);
317 void (*sk_data_ready)(struct sock *sk, int bytes); 317 void (*sk_data_ready)(struct sock *sk, int bytes);
318 void (*sk_write_space)(struct sock *sk); 318 void (*sk_write_space)(struct sock *sk);
@@ -1074,6 +1074,14 @@ extern void *sock_kmalloc(struct sock *sk, int size,
1074extern void sock_kfree_s(struct sock *sk, void *mem, int size); 1074extern void sock_kfree_s(struct sock *sk, void *mem, int size);
1075extern void sk_send_sigurg(struct sock *sk); 1075extern void sk_send_sigurg(struct sock *sk);
1076 1076
1077#ifdef CONFIG_CGROUPS
1078extern void sock_update_classid(struct sock *sk);
1079#else
1080static inline void sock_update_classid(struct sock *sk)
1081{
1082}
1083#endif
1084
1077/* 1085/*
1078 * Functions to fill in entries in struct proto_ops when a protocol 1086 * Functions to fill in entries in struct proto_ops when a protocol
1079 * does not implement a particular function. 1087 * does not implement a particular function.
diff --git a/net/core/sock.c b/net/core/sock.c
index bf88a167c8f2..a05ae7f9771e 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -123,6 +123,7 @@
123#include <linux/net_tstamp.h> 123#include <linux/net_tstamp.h>
124#include <net/xfrm.h> 124#include <net/xfrm.h>
125#include <linux/ipsec.h> 125#include <linux/ipsec.h>
126#include <net/cls_cgroup.h>
126 127
127#include <linux/filter.h> 128#include <linux/filter.h>
128 129
@@ -217,6 +218,11 @@ __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
217int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512); 218int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
218EXPORT_SYMBOL(sysctl_optmem_max); 219EXPORT_SYMBOL(sysctl_optmem_max);
219 220
221#if defined(CONFIG_CGROUPS) && !defined(CONFIG_NET_CLS_CGROUP)
222int net_cls_subsys_id = -1;
223EXPORT_SYMBOL_GPL(net_cls_subsys_id);
224#endif
225
220static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen) 226static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
221{ 227{
222 struct timeval tv; 228 struct timeval tv;
@@ -1050,6 +1056,16 @@ static void sk_prot_free(struct proto *prot, struct sock *sk)
1050 module_put(owner); 1056 module_put(owner);
1051} 1057}
1052 1058
1059#ifdef CONFIG_CGROUPS
1060void sock_update_classid(struct sock *sk)
1061{
1062 u32 classid = task_cls_classid(current);
1063
1064 if (classid && classid != sk->sk_classid)
1065 sk->sk_classid = classid;
1066}
1067#endif
1068
1053/** 1069/**
1054 * sk_alloc - All socket objects are allocated here 1070 * sk_alloc - All socket objects are allocated here
1055 * @net: the applicable net namespace 1071 * @net: the applicable net namespace
@@ -1073,6 +1089,8 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1073 sock_lock_init(sk); 1089 sock_lock_init(sk);
1074 sock_net_set(sk, get_net(net)); 1090 sock_net_set(sk, get_net(net));
1075 atomic_set(&sk->sk_wmem_alloc, 1); 1091 atomic_set(&sk->sk_wmem_alloc, 1);
1092
1093 sock_update_classid(sk);
1076 } 1094 }
1077 1095
1078 return sk; 1096 return sk;
diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c
index 221180384fd7..78ef2c5e130b 100644
--- a/net/sched/cls_cgroup.c
+++ b/net/sched/cls_cgroup.c
@@ -16,14 +16,11 @@
16#include <linux/errno.h> 16#include <linux/errno.h>
17#include <linux/skbuff.h> 17#include <linux/skbuff.h>
18#include <linux/cgroup.h> 18#include <linux/cgroup.h>
19#include <linux/rcupdate.h>
19#include <net/rtnetlink.h> 20#include <net/rtnetlink.h>
20#include <net/pkt_cls.h> 21#include <net/pkt_cls.h>
21 22#include <net/sock.h>
22struct cgroup_cls_state 23#include <net/cls_cgroup.h>
23{
24 struct cgroup_subsys_state css;
25 u32 classid;
26};
27 24
28static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss, 25static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss,
29 struct cgroup *cgrp); 26 struct cgroup *cgrp);
@@ -112,6 +109,10 @@ static int cls_cgroup_classify(struct sk_buff *skb, struct tcf_proto *tp,
112 struct cls_cgroup_head *head = tp->root; 109 struct cls_cgroup_head *head = tp->root;
113 u32 classid; 110 u32 classid;
114 111
112 rcu_read_lock();
113 classid = task_cls_state(current)->classid;
114 rcu_read_unlock();
115
115 /* 116 /*
116 * Due to the nature of the classifier it is required to ignore all 117 * Due to the nature of the classifier it is required to ignore all
117 * packets originating from softirq context as accessing `current' 118 * packets originating from softirq context as accessing `current'
@@ -122,12 +123,12 @@ static int cls_cgroup_classify(struct sk_buff *skb, struct tcf_proto *tp,
122 * calls by looking at the number of nested bh disable calls because 123 * calls by looking at the number of nested bh disable calls because
123 * softirqs always disables bh. 124 * softirqs always disables bh.
124 */ 125 */
125 if (softirq_count() != SOFTIRQ_OFFSET) 126 if (softirq_count() != SOFTIRQ_OFFSET) {
126 return -1; 127 /* If there is an sk_classid we'll use that. */
127 128 if (!skb->sk)
128 rcu_read_lock(); 129 return -1;
129 classid = task_cls_state(current)->classid; 130 classid = skb->sk->sk_classid;
130 rcu_read_unlock(); 131 }
131 132
132 if (!classid) 133 if (!classid)
133 return -1; 134 return -1;
@@ -289,18 +290,35 @@ static struct tcf_proto_ops cls_cgroup_ops __read_mostly = {
289 290
290static int __init init_cgroup_cls(void) 291static int __init init_cgroup_cls(void)
291{ 292{
292 int ret = register_tcf_proto_ops(&cls_cgroup_ops); 293 int ret;
293 if (ret) 294
294 return ret;
295 ret = cgroup_load_subsys(&net_cls_subsys); 295 ret = cgroup_load_subsys(&net_cls_subsys);
296 if (ret) 296 if (ret)
297 unregister_tcf_proto_ops(&cls_cgroup_ops); 297 goto out;
298
299#ifndef CONFIG_NET_CLS_CGROUP
300 /* We can't use rcu_assign_pointer because this is an int. */
301 smp_wmb();
302 net_cls_subsys_id = net_cls_subsys.subsys_id;
303#endif
304
305 ret = register_tcf_proto_ops(&cls_cgroup_ops);
306 if (ret)
307 cgroup_unload_subsys(&net_cls_subsys);
308
309out:
298 return ret; 310 return ret;
299} 311}
300 312
301static void __exit exit_cgroup_cls(void) 313static void __exit exit_cgroup_cls(void)
302{ 314{
303 unregister_tcf_proto_ops(&cls_cgroup_ops); 315 unregister_tcf_proto_ops(&cls_cgroup_ops);
316
317#ifndef CONFIG_NET_CLS_CGROUP
318 net_cls_subsys_id = -1;
319 synchronize_rcu();
320#endif
321
304 cgroup_unload_subsys(&net_cls_subsys); 322 cgroup_unload_subsys(&net_cls_subsys);
305} 323}
306 324
diff --git a/net/socket.c b/net/socket.c
index f9f7d0872cac..367d5477d00f 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -94,6 +94,7 @@
94 94
95#include <net/compat.h> 95#include <net/compat.h>
96#include <net/wext.h> 96#include <net/wext.h>
97#include <net/cls_cgroup.h>
97 98
98#include <net/sock.h> 99#include <net/sock.h>
99#include <linux/netfilter.h> 100#include <linux/netfilter.h>
@@ -558,6 +559,8 @@ static inline int __sock_sendmsg(struct kiocb *iocb, struct socket *sock,
558 struct sock_iocb *si = kiocb_to_siocb(iocb); 559 struct sock_iocb *si = kiocb_to_siocb(iocb);
559 int err; 560 int err;
560 561
562 sock_update_classid(sock->sk);
563
561 si->sock = sock; 564 si->sock = sock;
562 si->scm = NULL; 565 si->scm = NULL;
563 si->msg = msg; 566 si->msg = msg;
@@ -684,6 +687,8 @@ static inline int __sock_recvmsg_nosec(struct kiocb *iocb, struct socket *sock,
684{ 687{
685 struct sock_iocb *si = kiocb_to_siocb(iocb); 688 struct sock_iocb *si = kiocb_to_siocb(iocb);
686 689
690 sock_update_classid(sock->sk);
691
687 si->sock = sock; 692 si->sock = sock;
688 si->scm = NULL; 693 si->scm = NULL;
689 si->msg = msg; 694 si->msg = msg;
@@ -777,6 +782,8 @@ static ssize_t sock_splice_read(struct file *file, loff_t *ppos,
777 if (unlikely(!sock->ops->splice_read)) 782 if (unlikely(!sock->ops->splice_read))
778 return -EINVAL; 783 return -EINVAL;
779 784
785 sock_update_classid(sock->sk);
786
780 return sock->ops->splice_read(sock, ppos, pipe, len, flags); 787 return sock->ops->splice_read(sock, ppos, pipe, len, flags);
781} 788}
782 789
@@ -3069,6 +3076,8 @@ int kernel_setsockopt(struct socket *sock, int level, int optname,
3069int kernel_sendpage(struct socket *sock, struct page *page, int offset, 3076int kernel_sendpage(struct socket *sock, struct page *page, int offset,
3070 size_t size, int flags) 3077 size_t size, int flags)
3071{ 3078{
3079 sock_update_classid(sock->sk);
3080
3072 if (sock->ops->sendpage) 3081 if (sock->ops->sendpage)
3073 return sock->ops->sendpage(sock, page, offset, size, flags); 3082 return sock->ops->sendpage(sock, page, offset, size, flags);
3074 3083