author		Pablo Neira Ayuso <pablo@netfilter.org>	2009-06-13 06:30:52 -0400
committer	Patrick McHardy <kaber@trash.net>	2009-06-13 06:30:52 -0400
commit		dd7669a92c6066b2b31bae7e04cd787092920883 (patch)
tree		d06a9e18aec99c5a34a191cb3391e74ba8a8ec59 /net
parent		d219dce76c64f2c883dad0537fa09a56d5ff0a10 (diff)
netfilter: conntrack: optional reliable conntrack event delivery
This patch improves ctnetlink event reliability if one broadcast listener has set the NETLINK_BROADCAST_ERROR socket option.

The logic is the following: if an event delivery fails, we keep the undelivered events in the missed event cache. Once the next packet arrives, we add the new events (if any) to the missed events in the cache and we try a new delivery, and so on. Thus, if ctnetlink fails to deliver an event, we try to deliver it again once we see a new packet. Therefore, we may lose state transitions, but the userspace process gets in sync at some point.

In the worst case, if no events were delivered to userspace, we make sure that destroy events are successfully delivered. Basically, if ctnetlink fails to deliver the destroy event, we remove the conntrack entry from the hashes and insert it into the dying list, which contains inactive entries. Then, the conntrack timer is added with an extra grace timeout of random32() % 15 seconds to trigger the event again (this grace timeout is tunable via /proc). The use of a limited random timeout value spreads out the "destroy" resends and so avoids accumulating lots of "destroy" events at the same time. Events may be delivered out of order, but they can be identified by means of the tuple plus the conntrack ID.

The maximum number of conntrack entries (active or inactive) is still handled by nf_conntrack_max. Thus, we may start dropping packets at some point if we accumulate a lot of inactive conntrack entries that did not successfully report the destroy event to userspace.

During my stress tests, consisting of setting a very small buffer of 2048 bytes for conntrackd and the NETLINK_BROADCAST_ERROR socket flag, and generating lots of very small connections, I noticed very few destroy entries on the fly waiting to be resent.

A simple way to test this patch consists of creating a lot of entries, setting a very small Netlink buffer in conntrackd (+ a patch which is not in the git tree to set the BROADCAST_ERROR flag) and invoking `conntrack -F'.

For expectations, no changes are introduced in this patch. Currently, event delivery is only done for new expectations (no events from expectation expiration, removal and confirmation). Supporting those would require a per-expectation event cache to implement the same idea that is exposed in this patch.

This patch can be useful to provide reliable flow-accounting. We still have to add a new conntrack extension to store the creation and destroy time.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Patrick McHardy <kaber@trash.net>
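[Editor's illustration, not part of this patch: a minimal sketch of the kind of listener this change targets. conntrackd with the out-of-tree BROADCAST_ERROR patch mentioned above plays this role in the tests; here a bare NETLINK_NETFILTER socket subscribes to the conntrack destroy group, shrinks its receive buffer, and enables NETLINK_BROADCAST_ERROR, which is what makes a failed broadcast surface as -ENOBUFS in the kernel's nfnetlink_send() and arm the retry logic below. All identifiers are standard uapi/libc names; error handling is abbreviated.]

/* Sketch only: a destroy-event listener with reliable delivery enabled. */
#include <stdio.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/netfilter/nfnetlink.h>

#ifndef SOL_NETLINK
#define SOL_NETLINK 270		/* from linux/socket.h */
#endif

int main(void)
{
	struct sockaddr_nl addr = {
		.nl_family = AF_NETLINK,
		.nl_groups = NF_NETLINK_CONNTRACK_DESTROY,
	};
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_NETFILTER);
	int on = 1;
	int rcvbuf = 2048;	/* tiny buffer, as in the stress tests above */
	char buf[8192];

	/* report failed broadcasts back to the kernel-side sender, so
	 * ctnetlink sees -ENOBUFS and retries the destroy event */
	setsockopt(fd, SOL_NETLINK, NETLINK_BROADCAST_ERROR, &on, sizeof(on));
	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf, sizeof(rcvbuf));
	bind(fd, (struct sockaddr *)&addr, sizeof(addr));

	for (;;) {
		ssize_t len = recv(fd, buf, sizeof(buf), 0);
		if (len < 0)	/* e.g. ENOBUFS: we overran; resync and
				 * rely on the kernel-side resend */
			continue;
		printf("%zd bytes of destroy event messages\n", len);
	}
	return 0;
}

[Under this setup, `conntrack -F' floods the destroy group; entries whose events could not be delivered land on the new dying list and are retried by death_by_event() in the diff below.]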
Diffstat (limited to 'net')
-rw-r--r--	net/netfilter/nf_conntrack_core.c	89
-rw-r--r--	net/netfilter/nf_conntrack_ecache.c	28
-rw-r--r--	net/netfilter/nf_conntrack_netlink.c	19
3 files changed, 118 insertions(+), 18 deletions(-)
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 14235b144cb5..5f72b94b4918 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -183,10 +183,6 @@ destroy_conntrack(struct nf_conntrack *nfct)
 	NF_CT_ASSERT(atomic_read(&nfct->use) == 0);
 	NF_CT_ASSERT(!timer_pending(&ct->timeout));
 
-	if (!test_bit(IPS_DYING_BIT, &ct->status))
-		nf_conntrack_event(IPCT_DESTROY, ct);
-	set_bit(IPS_DYING_BIT, &ct->status);
-
 	/* To make sure we don't get any weird locking issues here:
 	 * destroy_conntrack() MUST NOT be called with a write lock
 	 * to nf_conntrack_lock!!! -HW */
@@ -220,9 +216,8 @@ destroy_conntrack(struct nf_conntrack *nfct)
 	nf_conntrack_free(ct);
 }
 
-static void death_by_timeout(unsigned long ul_conntrack)
+void nf_ct_delete_from_lists(struct nf_conn *ct)
 {
-	struct nf_conn *ct = (void *)ul_conntrack;
 	struct net *net = nf_ct_net(ct);
 
 	nf_ct_helper_destroy(ct);
@@ -232,6 +227,59 @@ static void death_by_timeout(unsigned long ul_conntrack)
 	NF_CT_STAT_INC(net, delete_list);
 	clean_from_lists(ct);
 	spin_unlock_bh(&nf_conntrack_lock);
+}
+EXPORT_SYMBOL_GPL(nf_ct_delete_from_lists);
+
+static void death_by_event(unsigned long ul_conntrack)
+{
+	struct nf_conn *ct = (void *)ul_conntrack;
+	struct net *net = nf_ct_net(ct);
+
+	if (nf_conntrack_event(IPCT_DESTROY, ct) < 0) {
+		/* bad luck, let's retry again */
+		ct->timeout.expires = jiffies +
+			(random32() % net->ct.sysctl_events_retry_timeout);
+		add_timer(&ct->timeout);
+		return;
+	}
+	/* we've got the event delivered, now it's dying */
+	set_bit(IPS_DYING_BIT, &ct->status);
+	spin_lock(&nf_conntrack_lock);
+	hlist_nulls_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
+	spin_unlock(&nf_conntrack_lock);
+	nf_ct_put(ct);
+}
+
+void nf_ct_insert_dying_list(struct nf_conn *ct)
+{
+	struct net *net = nf_ct_net(ct);
+
+	/* add this conntrack to the dying list */
+	spin_lock_bh(&nf_conntrack_lock);
+	hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
+			     &net->ct.dying);
+	spin_unlock_bh(&nf_conntrack_lock);
+	/* set a new timer to retry event delivery */
+	setup_timer(&ct->timeout, death_by_event, (unsigned long)ct);
+	ct->timeout.expires = jiffies +
+		(random32() % net->ct.sysctl_events_retry_timeout);
+	add_timer(&ct->timeout);
+}
+EXPORT_SYMBOL_GPL(nf_ct_insert_dying_list);
+
+static void death_by_timeout(unsigned long ul_conntrack)
+{
+	struct nf_conn *ct = (void *)ul_conntrack;
+
+	if (!test_bit(IPS_DYING_BIT, &ct->status) &&
+	    unlikely(nf_conntrack_event(IPCT_DESTROY, ct) < 0)) {
+		/* destroy event was not delivered */
+		nf_ct_delete_from_lists(ct);
+		nf_ct_insert_dying_list(ct);
+		return;
+	}
+	set_bit(IPS_DYING_BIT, &ct->status);
+	nf_ct_delete_from_lists(ct);
 	nf_ct_put(ct);
 }
 
@@ -982,11 +1030,13 @@ static int kill_report(struct nf_conn *i, void *data)
 {
 	struct __nf_ct_flush_report *fr = (struct __nf_ct_flush_report *)data;
 
-	/* get_next_corpse sets the dying bit for us */
-	nf_conntrack_event_report(IPCT_DESTROY,
-				  i,
-				  fr->pid,
-				  fr->report);
+	/* If we fail to deliver the event, death_by_timeout() will retry */
+	if (nf_conntrack_event_report(IPCT_DESTROY, i,
+				      fr->pid, fr->report) < 0)
+		return 1;
+
+	/* Avoid the delivery of the destroy event in death_by_timeout(). */
+	set_bit(IPS_DYING_BIT, &i->status);
 	return 1;
 }
 
@@ -1015,6 +1065,21 @@ void nf_conntrack_flush_report(struct net *net, u32 pid, int report)
 }
 EXPORT_SYMBOL_GPL(nf_conntrack_flush_report);
 
+static void nf_ct_release_dying_list(void)
+{
+	struct nf_conntrack_tuple_hash *h;
+	struct nf_conn *ct;
+	struct hlist_nulls_node *n;
+
+	spin_lock_bh(&nf_conntrack_lock);
+	hlist_nulls_for_each_entry(h, n, &init_net.ct.dying, hnnode) {
+		ct = nf_ct_tuplehash_to_ctrack(h);
+		/* never fails to remove them, no listeners at this point */
+		nf_ct_kill(ct);
+	}
+	spin_unlock_bh(&nf_conntrack_lock);
+}
+
 static void nf_conntrack_cleanup_init_net(void)
 {
 	nf_conntrack_helper_fini();
@@ -1026,6 +1091,7 @@ static void nf_conntrack_cleanup_net(struct net *net)
 {
   i_see_dead_people:
 	nf_ct_iterate_cleanup(net, kill_all, NULL);
+	nf_ct_release_dying_list();
 	if (atomic_read(&net->ct.count) != 0) {
 		schedule();
 		goto i_see_dead_people;
@@ -1207,6 +1273,7 @@ static int nf_conntrack_init_net(struct net *net)
 
 	atomic_set(&net->ct.count, 0);
 	INIT_HLIST_NULLS_HEAD(&net->ct.unconfirmed, 0);
+	INIT_HLIST_NULLS_HEAD(&net->ct.dying, 0);
 	net->ct.stat = alloc_percpu(struct ip_conntrack_stat);
 	if (!net->ct.stat) {
 		ret = -ENOMEM;
diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c
index 683281b78047..aee560b4768d 100644
--- a/net/netfilter/nf_conntrack_ecache.c
+++ b/net/netfilter/nf_conntrack_ecache.c
@@ -56,8 +56,21 @@ void nf_ct_deliver_cached_events(struct nf_conn *ct)
 			.pid = 0,
 			.report = 0
 		};
-
-		notify->fcn(events, &item);
+		int ret;
+		/* We make a copy of the missed event cache without taking
+		 * the lock, thus we may send missed events twice. However,
+		 * this does not harm and it happens very rarely. */
+		unsigned long missed = e->missed;
+
+		ret = notify->fcn(events | missed, &item);
+		if (unlikely(ret < 0 || missed)) {
+			spin_lock_bh(&ct->lock);
+			if (ret < 0)
+				e->missed |= events;
+			else
+				e->missed &= ~missed;
+			spin_unlock_bh(&ct->lock);
+		}
 	}
 
 out_unlock:
@@ -133,6 +146,7 @@ EXPORT_SYMBOL_GPL(nf_ct_expect_unregister_notifier);
 
 #define NF_CT_EVENTS_DEFAULT 1
 static int nf_ct_events __read_mostly = NF_CT_EVENTS_DEFAULT;
+static int nf_ct_events_retry_timeout __read_mostly = 15*HZ;
 
 #ifdef CONFIG_SYSCTL
 static struct ctl_table event_sysctl_table[] = {
@@ -144,6 +158,14 @@ static struct ctl_table event_sysctl_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "nf_conntrack_events_retry_timeout",
+		.data		= &init_net.ct.sysctl_events_retry_timeout,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
 	{}
 };
 #endif /* CONFIG_SYSCTL */
@@ -165,6 +187,7 @@ static int nf_conntrack_event_init_sysctl(struct net *net)
 		goto out;
 
 	table[0].data = &net->ct.sysctl_events;
+	table[1].data = &net->ct.sysctl_events_retry_timeout;
 
 	net->ct.event_sysctl_header =
 		register_net_sysctl_table(net,
@@ -205,6 +228,7 @@ int nf_conntrack_ecache_init(struct net *net)
 	int ret;
 
 	net->ct.sysctl_events = nf_ct_events;
+	net->ct.sysctl_events_retry_timeout = nf_ct_events_retry_timeout;
 
 	if (net_eq(net, &init_net)) {
 		ret = nf_ct_extend_register(&event_extend);
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 19706eff1647..49479d194570 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -463,6 +463,7 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
 	struct sk_buff *skb;
 	unsigned int type;
 	unsigned int flags = 0, group;
+	int err;
 
 	/* ignore our fake conntrack entry */
 	if (ct == &nf_conntrack_untracked)
@@ -558,7 +559,10 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
 	rcu_read_unlock();
 
 	nlmsg_end(skb, nlh);
-	nfnetlink_send(skb, item->pid, group, item->report, GFP_ATOMIC);
+	err = nfnetlink_send(skb, item->pid, group, item->report, GFP_ATOMIC);
+	if (err == -ENOBUFS || err == -EAGAIN)
+		return -ENOBUFS;
+
 	return 0;
 
 nla_put_failure:
@@ -798,10 +802,15 @@ ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb,
 		}
 	}
 
-	nf_conntrack_event_report(IPCT_DESTROY,
-				  ct,
-				  NETLINK_CB(skb).pid,
-				  nlmsg_report(nlh));
+	if (nf_conntrack_event_report(IPCT_DESTROY, ct,
+				      NETLINK_CB(skb).pid,
+				      nlmsg_report(nlh)) < 0) {
+		nf_ct_delete_from_lists(ct);
+		/* we failed to report the event, try later */
+		nf_ct_insert_dying_list(ct);
+		nf_ct_put(ct);
+		return 0;
+	}
 
 	/* death_by_timeout would report the event again */
 	set_bit(IPS_DYING_BIT, &ct->status);