aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorEric Dumazet <eric.dumazet@gmail.com>2011-04-04 11:04:03 -0400
committerPatrick McHardy <kaber@trash.net>2011-04-04 11:04:03 -0400
commit7f5c6d4f665bb57a19a34ce1fb16cc708c04f219 (patch)
treee804faa506bbf9edcfd1fdadb2ab3749f58836cd
parent8f7b01a178b8e6a7b663a1bbaa1710756d67b69b (diff)
netfilter: get rid of atomic ops in fast path
We currently use a percpu spinlock to 'protect' rule bytes/packets counters, after various attempts to use RCU instead. Lately we added a seqlock so that get_counters() can run without blocking BH or 'writers'. But we really only need the seqcount in it. Spinlock itself is only locked by the current/owner cpu, so we can remove it completely. This cleanups api, using correct 'writer' vs 'reader' semantic. At replace time, the get_counters() call makes sure all cpus are done using the old table. Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com> Cc: Jan Engelhardt <jengelh@medozas.de> Signed-off-by: Patrick McHardy <kaber@trash.net>
-rw-r--r--include/linux/netfilter/x_tables.h96
-rw-r--r--net/ipv4/netfilter/arp_tables.c18
-rw-r--r--net/ipv4/netfilter/ip_tables.c28
-rw-r--r--net/ipv6/netfilter/ip6_tables.c19
-rw-r--r--net/netfilter/x_tables.c9
5 files changed, 80 insertions, 90 deletions
diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 37219525ff6f..32cddf78b13e 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -456,72 +456,60 @@ extern void xt_proto_fini(struct net *net, u_int8_t af);
456extern struct xt_table_info *xt_alloc_table_info(unsigned int size); 456extern struct xt_table_info *xt_alloc_table_info(unsigned int size);
457extern void xt_free_table_info(struct xt_table_info *info); 457extern void xt_free_table_info(struct xt_table_info *info);
458 458
459/* 459/**
460 * Per-CPU spinlock associated with per-cpu table entries, and 460 * xt_recseq - recursive seqcount for netfilter use
461 * with a counter for the "reading" side that allows a recursive 461 *
462 * reader to avoid taking the lock and deadlocking. 462 * Packet processing changes the seqcount only if no recursion happened
463 * 463 * get_counters() can use read_seqcount_begin()/read_seqcount_retry(),
464 * "reading" is used by ip/arp/ip6 tables rule processing which runs per-cpu. 464 * because we use the normal seqcount convention :
465 * It needs to ensure that the rules are not being changed while the packet 465 * Low order bit set to 1 if a writer is active.
466 * is being processed. In some cases, the read lock will be acquired
467 * twice on the same CPU; this is okay because of the count.
468 *
469 * "writing" is used when reading counters.
470 * During replace any readers that are using the old tables have to complete
471 * before freeing the old table. This is handled by the write locking
472 * necessary for reading the counters.
473 */ 466 */
474struct xt_info_lock { 467DECLARE_PER_CPU(seqcount_t, xt_recseq);
475 seqlock_t lock;
476 unsigned char readers;
477};
478DECLARE_PER_CPU(struct xt_info_lock, xt_info_locks);
479 468
480/* 469/**
481 * Note: we need to ensure that preemption is disabled before acquiring 470 * xt_write_recseq_begin - start of a write section
482 * the per-cpu-variable, so we do it as a two step process rather than
483 * using "spin_lock_bh()".
484 *
485 * We _also_ need to disable bottom half processing before updating our
486 * nesting count, to make sure that the only kind of re-entrancy is this
487 * code being called by itself: since the count+lock is not an atomic
488 * operation, we can allow no races.
489 * 471 *
490 * _Only_ that special combination of being per-cpu and never getting 472 * Begin packet processing : all readers must wait the end
491 * re-entered asynchronously means that the count is safe. 473 * 1) Must be called with preemption disabled
474 * 2) softirqs must be disabled too (or we should use irqsafe_cpu_add())
475 * Returns :
476 * 1 if no recursion on this cpu
477 * 0 if recursion detected
492 */ 478 */
493static inline void xt_info_rdlock_bh(void) 479static inline unsigned int xt_write_recseq_begin(void)
494{ 480{
495 struct xt_info_lock *lock; 481 unsigned int addend;
496 482
497 local_bh_disable(); 483 /*
498 lock = &__get_cpu_var(xt_info_locks); 484 * Low order bit of sequence is set if we already
499 if (likely(!lock->readers++)) 485 * called xt_write_recseq_begin().
500 write_seqlock(&lock->lock); 486 */
501} 487 addend = (__this_cpu_read(xt_recseq.sequence) + 1) & 1;
502 488
503static inline void xt_info_rdunlock_bh(void) 489 /*
504{ 490 * This is kind of a write_seqcount_begin(), but addend is 0 or 1
505 struct xt_info_lock *lock = &__get_cpu_var(xt_info_locks); 491 * We dont check addend value to avoid a test and conditional jump,
492 * since addend is most likely 1
493 */
494 __this_cpu_add(xt_recseq.sequence, addend);
495 smp_wmb();
506 496
507 if (likely(!--lock->readers)) 497 return addend;
508 write_sequnlock(&lock->lock);
509 local_bh_enable();
510} 498}
511 499
512/* 500/**
513 * The "writer" side needs to get exclusive access to the lock, 501 * xt_write_recseq_end - end of a write section
514 * regardless of readers. This must be called with bottom half 502 * @addend: return value from previous xt_write_recseq_begin()
515 * processing (and thus also preemption) disabled. 503 *
504 * End packet processing : all readers can proceed
505 * 1) Must be called with preemption disabled
506 * 2) softirqs must be disabled too (or we should use irqsafe_cpu_add())
516 */ 507 */
517static inline void xt_info_wrlock(unsigned int cpu) 508static inline void xt_write_recseq_end(unsigned int addend)
518{
519 write_seqlock(&per_cpu(xt_info_locks, cpu).lock);
520}
521
522static inline void xt_info_wrunlock(unsigned int cpu)
523{ 509{
524 write_sequnlock(&per_cpu(xt_info_locks, cpu).lock); 510 /* this is kind of a write_seqcount_end(), but addend is 0 or 1 */
511 smp_wmb();
512 __this_cpu_add(xt_recseq.sequence, addend);
525} 513}
526 514
527/* 515/*
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 4b5d457c2d76..2ea743336836 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -260,6 +260,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
260 void *table_base; 260 void *table_base;
261 const struct xt_table_info *private; 261 const struct xt_table_info *private;
262 struct xt_action_param acpar; 262 struct xt_action_param acpar;
263 unsigned int addend;
263 264
264 if (!pskb_may_pull(skb, arp_hdr_len(skb->dev))) 265 if (!pskb_may_pull(skb, arp_hdr_len(skb->dev)))
265 return NF_DROP; 266 return NF_DROP;
@@ -267,7 +268,8 @@ unsigned int arpt_do_table(struct sk_buff *skb,
267 indev = in ? in->name : nulldevname; 268 indev = in ? in->name : nulldevname;
268 outdev = out ? out->name : nulldevname; 269 outdev = out ? out->name : nulldevname;
269 270
270 xt_info_rdlock_bh(); 271 local_bh_disable();
272 addend = xt_write_recseq_begin();
271 private = table->private; 273 private = table->private;
272 table_base = private->entries[smp_processor_id()]; 274 table_base = private->entries[smp_processor_id()];
273 275
@@ -338,7 +340,8 @@ unsigned int arpt_do_table(struct sk_buff *skb,
338 /* Verdict */ 340 /* Verdict */
339 break; 341 break;
340 } while (!acpar.hotdrop); 342 } while (!acpar.hotdrop);
341 xt_info_rdunlock_bh(); 343 xt_write_recseq_end(addend);
344 local_bh_enable();
342 345
343 if (acpar.hotdrop) 346 if (acpar.hotdrop)
344 return NF_DROP; 347 return NF_DROP;
@@ -712,7 +715,7 @@ static void get_counters(const struct xt_table_info *t,
712 unsigned int i; 715 unsigned int i;
713 716
714 for_each_possible_cpu(cpu) { 717 for_each_possible_cpu(cpu) {
715 seqlock_t *lock = &per_cpu(xt_info_locks, cpu).lock; 718 seqcount_t *s = &per_cpu(xt_recseq, cpu);
716 719
717 i = 0; 720 i = 0;
718 xt_entry_foreach(iter, t->entries[cpu], t->size) { 721 xt_entry_foreach(iter, t->entries[cpu], t->size) {
@@ -720,10 +723,10 @@ static void get_counters(const struct xt_table_info *t,
720 unsigned int start; 723 unsigned int start;
721 724
722 do { 725 do {
723 start = read_seqbegin(lock); 726 start = read_seqcount_begin(s);
724 bcnt = iter->counters.bcnt; 727 bcnt = iter->counters.bcnt;
725 pcnt = iter->counters.pcnt; 728 pcnt = iter->counters.pcnt;
726 } while (read_seqretry(lock, start)); 729 } while (read_seqcount_retry(s, start));
727 730
728 ADD_COUNTER(counters[i], bcnt, pcnt); 731 ADD_COUNTER(counters[i], bcnt, pcnt);
729 ++i; 732 ++i;
@@ -1115,6 +1118,7 @@ static int do_add_counters(struct net *net, const void __user *user,
1115 int ret = 0; 1118 int ret = 0;
1116 void *loc_cpu_entry; 1119 void *loc_cpu_entry;
1117 struct arpt_entry *iter; 1120 struct arpt_entry *iter;
1121 unsigned int addend;
1118#ifdef CONFIG_COMPAT 1122#ifdef CONFIG_COMPAT
1119 struct compat_xt_counters_info compat_tmp; 1123 struct compat_xt_counters_info compat_tmp;
1120 1124
@@ -1171,12 +1175,12 @@ static int do_add_counters(struct net *net, const void __user *user,
1171 /* Choose the copy that is on our node */ 1175 /* Choose the copy that is on our node */
1172 curcpu = smp_processor_id(); 1176 curcpu = smp_processor_id();
1173 loc_cpu_entry = private->entries[curcpu]; 1177 loc_cpu_entry = private->entries[curcpu];
1174 xt_info_wrlock(curcpu); 1178 addend = xt_write_recseq_begin();
1175 xt_entry_foreach(iter, loc_cpu_entry, private->size) { 1179 xt_entry_foreach(iter, loc_cpu_entry, private->size) {
1176 ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt); 1180 ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt);
1177 ++i; 1181 ++i;
1178 } 1182 }
1179 xt_info_wrunlock(curcpu); 1183 xt_write_recseq_end(addend);
1180 unlock_up_free: 1184 unlock_up_free:
1181 local_bh_enable(); 1185 local_bh_enable();
1182 xt_table_unlock(t); 1186 xt_table_unlock(t);
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index ffcea0d1678e..2b6b700949eb 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -68,15 +68,6 @@ void *ipt_alloc_initial_table(const struct xt_table *info)
68} 68}
69EXPORT_SYMBOL_GPL(ipt_alloc_initial_table); 69EXPORT_SYMBOL_GPL(ipt_alloc_initial_table);
70 70
71/*
72 We keep a set of rules for each CPU, so we can avoid write-locking
73 them in the softirq when updating the counters and therefore
74 only need to read-lock in the softirq; doing a write_lock_bh() in user
75 context stops packets coming through and allows user context to read
76 the counters or update the rules.
77
78 Hence the start of any table is given by get_table() below. */
79
80/* Returns whether matches rule or not. */ 71/* Returns whether matches rule or not. */
81/* Performance critical - called for every packet */ 72/* Performance critical - called for every packet */
82static inline bool 73static inline bool
@@ -311,6 +302,7 @@ ipt_do_table(struct sk_buff *skb,
311 unsigned int *stackptr, origptr, cpu; 302 unsigned int *stackptr, origptr, cpu;
312 const struct xt_table_info *private; 303 const struct xt_table_info *private;
313 struct xt_action_param acpar; 304 struct xt_action_param acpar;
305 unsigned int addend;
314 306
315 /* Initialization */ 307 /* Initialization */
316 ip = ip_hdr(skb); 308 ip = ip_hdr(skb);
@@ -331,7 +323,8 @@ ipt_do_table(struct sk_buff *skb,
331 acpar.hooknum = hook; 323 acpar.hooknum = hook;
332 324
333 IP_NF_ASSERT(table->valid_hooks & (1 << hook)); 325 IP_NF_ASSERT(table->valid_hooks & (1 << hook));
334 xt_info_rdlock_bh(); 326 local_bh_disable();
327 addend = xt_write_recseq_begin();
335 private = table->private; 328 private = table->private;
336 cpu = smp_processor_id(); 329 cpu = smp_processor_id();
337 table_base = private->entries[cpu]; 330 table_base = private->entries[cpu];
@@ -430,7 +423,9 @@ ipt_do_table(struct sk_buff *skb,
430 pr_debug("Exiting %s; resetting sp from %u to %u\n", 423 pr_debug("Exiting %s; resetting sp from %u to %u\n",
431 __func__, *stackptr, origptr); 424 __func__, *stackptr, origptr);
432 *stackptr = origptr; 425 *stackptr = origptr;
433 xt_info_rdunlock_bh(); 426 xt_write_recseq_end(addend);
427 local_bh_enable();
428
434#ifdef DEBUG_ALLOW_ALL 429#ifdef DEBUG_ALLOW_ALL
435 return NF_ACCEPT; 430 return NF_ACCEPT;
436#else 431#else
@@ -886,7 +881,7 @@ get_counters(const struct xt_table_info *t,
886 unsigned int i; 881 unsigned int i;
887 882
888 for_each_possible_cpu(cpu) { 883 for_each_possible_cpu(cpu) {
889 seqlock_t *lock = &per_cpu(xt_info_locks, cpu).lock; 884 seqcount_t *s = &per_cpu(xt_recseq, cpu);
890 885
891 i = 0; 886 i = 0;
892 xt_entry_foreach(iter, t->entries[cpu], t->size) { 887 xt_entry_foreach(iter, t->entries[cpu], t->size) {
@@ -894,10 +889,10 @@ get_counters(const struct xt_table_info *t,
894 unsigned int start; 889 unsigned int start;
895 890
896 do { 891 do {
897 start = read_seqbegin(lock); 892 start = read_seqcount_begin(s);
898 bcnt = iter->counters.bcnt; 893 bcnt = iter->counters.bcnt;
899 pcnt = iter->counters.pcnt; 894 pcnt = iter->counters.pcnt;
900 } while (read_seqretry(lock, start)); 895 } while (read_seqcount_retry(s, start));
901 896
902 ADD_COUNTER(counters[i], bcnt, pcnt); 897 ADD_COUNTER(counters[i], bcnt, pcnt);
903 ++i; /* macro does multi eval of i */ 898 ++i; /* macro does multi eval of i */
@@ -1312,6 +1307,7 @@ do_add_counters(struct net *net, const void __user *user,
1312 int ret = 0; 1307 int ret = 0;
1313 void *loc_cpu_entry; 1308 void *loc_cpu_entry;
1314 struct ipt_entry *iter; 1309 struct ipt_entry *iter;
1310 unsigned int addend;
1315#ifdef CONFIG_COMPAT 1311#ifdef CONFIG_COMPAT
1316 struct compat_xt_counters_info compat_tmp; 1312 struct compat_xt_counters_info compat_tmp;
1317 1313
@@ -1368,12 +1364,12 @@ do_add_counters(struct net *net, const void __user *user,
1368 /* Choose the copy that is on our node */ 1364 /* Choose the copy that is on our node */
1369 curcpu = smp_processor_id(); 1365 curcpu = smp_processor_id();
1370 loc_cpu_entry = private->entries[curcpu]; 1366 loc_cpu_entry = private->entries[curcpu];
1371 xt_info_wrlock(curcpu); 1367 addend = xt_write_recseq_begin();
1372 xt_entry_foreach(iter, loc_cpu_entry, private->size) { 1368 xt_entry_foreach(iter, loc_cpu_entry, private->size) {
1373 ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt); 1369 ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt);
1374 ++i; 1370 ++i;
1375 } 1371 }
1376 xt_info_wrunlock(curcpu); 1372 xt_write_recseq_end(addend);
1377 unlock_up_free: 1373 unlock_up_free:
1378 local_bh_enable(); 1374 local_bh_enable();
1379 xt_table_unlock(t); 1375 xt_table_unlock(t);
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 0b2af9b85cec..ec7cf579cdd4 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -340,6 +340,7 @@ ip6t_do_table(struct sk_buff *skb,
340 unsigned int *stackptr, origptr, cpu; 340 unsigned int *stackptr, origptr, cpu;
341 const struct xt_table_info *private; 341 const struct xt_table_info *private;
342 struct xt_action_param acpar; 342 struct xt_action_param acpar;
343 unsigned int addend;
343 344
344 /* Initialization */ 345 /* Initialization */
345 indev = in ? in->name : nulldevname; 346 indev = in ? in->name : nulldevname;
@@ -358,7 +359,8 @@ ip6t_do_table(struct sk_buff *skb,
358 359
359 IP_NF_ASSERT(table->valid_hooks & (1 << hook)); 360 IP_NF_ASSERT(table->valid_hooks & (1 << hook));
360 361
361 xt_info_rdlock_bh(); 362 local_bh_disable();
363 addend = xt_write_recseq_begin();
362 private = table->private; 364 private = table->private;
363 cpu = smp_processor_id(); 365 cpu = smp_processor_id();
364 table_base = private->entries[cpu]; 366 table_base = private->entries[cpu];
@@ -442,7 +444,9 @@ ip6t_do_table(struct sk_buff *skb,
442 } while (!acpar.hotdrop); 444 } while (!acpar.hotdrop);
443 445
444 *stackptr = origptr; 446 *stackptr = origptr;
445 xt_info_rdunlock_bh(); 447
448 xt_write_recseq_end(addend);
449 local_bh_enable();
446 450
447#ifdef DEBUG_ALLOW_ALL 451#ifdef DEBUG_ALLOW_ALL
448 return NF_ACCEPT; 452 return NF_ACCEPT;
@@ -899,7 +903,7 @@ get_counters(const struct xt_table_info *t,
899 unsigned int i; 903 unsigned int i;
900 904
901 for_each_possible_cpu(cpu) { 905 for_each_possible_cpu(cpu) {
902 seqlock_t *lock = &per_cpu(xt_info_locks, cpu).lock; 906 seqcount_t *s = &per_cpu(xt_recseq, cpu);
903 907
904 i = 0; 908 i = 0;
905 xt_entry_foreach(iter, t->entries[cpu], t->size) { 909 xt_entry_foreach(iter, t->entries[cpu], t->size) {
@@ -907,10 +911,10 @@ get_counters(const struct xt_table_info *t,
907 unsigned int start; 911 unsigned int start;
908 912
909 do { 913 do {
910 start = read_seqbegin(lock); 914 start = read_seqcount_begin(s);
911 bcnt = iter->counters.bcnt; 915 bcnt = iter->counters.bcnt;
912 pcnt = iter->counters.pcnt; 916 pcnt = iter->counters.pcnt;
913 } while (read_seqretry(lock, start)); 917 } while (read_seqcount_retry(s, start));
914 918
915 ADD_COUNTER(counters[i], bcnt, pcnt); 919 ADD_COUNTER(counters[i], bcnt, pcnt);
916 ++i; 920 ++i;
@@ -1325,6 +1329,7 @@ do_add_counters(struct net *net, const void __user *user, unsigned int len,
1325 int ret = 0; 1329 int ret = 0;
1326 const void *loc_cpu_entry; 1330 const void *loc_cpu_entry;
1327 struct ip6t_entry *iter; 1331 struct ip6t_entry *iter;
1332 unsigned int addend;
1328#ifdef CONFIG_COMPAT 1333#ifdef CONFIG_COMPAT
1329 struct compat_xt_counters_info compat_tmp; 1334 struct compat_xt_counters_info compat_tmp;
1330 1335
@@ -1381,13 +1386,13 @@ do_add_counters(struct net *net, const void __user *user, unsigned int len,
1381 i = 0; 1386 i = 0;
1382 /* Choose the copy that is on our node */ 1387 /* Choose the copy that is on our node */
1383 curcpu = smp_processor_id(); 1388 curcpu = smp_processor_id();
1384 xt_info_wrlock(curcpu); 1389 addend = xt_write_recseq_begin();
1385 loc_cpu_entry = private->entries[curcpu]; 1390 loc_cpu_entry = private->entries[curcpu];
1386 xt_entry_foreach(iter, loc_cpu_entry, private->size) { 1391 xt_entry_foreach(iter, loc_cpu_entry, private->size) {
1387 ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt); 1392 ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt);
1388 ++i; 1393 ++i;
1389 } 1394 }
1390 xt_info_wrunlock(curcpu); 1395 xt_write_recseq_end(addend);
1391 1396
1392 unlock_up_free: 1397 unlock_up_free:
1393 local_bh_enable(); 1398 local_bh_enable();
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index a9adf4c6b299..52959efca858 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -762,8 +762,8 @@ void xt_compat_unlock(u_int8_t af)
762EXPORT_SYMBOL_GPL(xt_compat_unlock); 762EXPORT_SYMBOL_GPL(xt_compat_unlock);
763#endif 763#endif
764 764
765DEFINE_PER_CPU(struct xt_info_lock, xt_info_locks); 765DEFINE_PER_CPU(seqcount_t, xt_recseq);
766EXPORT_PER_CPU_SYMBOL_GPL(xt_info_locks); 766EXPORT_PER_CPU_SYMBOL_GPL(xt_recseq);
767 767
768static int xt_jumpstack_alloc(struct xt_table_info *i) 768static int xt_jumpstack_alloc(struct xt_table_info *i)
769{ 769{
@@ -1362,10 +1362,7 @@ static int __init xt_init(void)
1362 int rv; 1362 int rv;
1363 1363
1364 for_each_possible_cpu(i) { 1364 for_each_possible_cpu(i) {
1365 struct xt_info_lock *lock = &per_cpu(xt_info_locks, i); 1365 seqcount_init(&per_cpu(xt_recseq, i));
1366
1367 seqlock_init(&lock->lock);
1368 lock->readers = 0;
1369 } 1366 }
1370 1367
1371 xt = kmalloc(sizeof(struct xt_af) * NFPROTO_NUMPROTO, GFP_KERNEL); 1368 xt = kmalloc(sizeof(struct xt_af) * NFPROTO_NUMPROTO, GFP_KERNEL);