diff options
| author | Eric Dumazet <eric.dumazet@gmail.com> | 2011-04-04 11:04:03 -0400 |
|---|---|---|
| committer | Patrick McHardy <kaber@trash.net> | 2011-04-04 11:04:03 -0400 |
| commit | 7f5c6d4f665bb57a19a34ce1fb16cc708c04f219 (patch) | |
| tree | e804faa506bbf9edcfd1fdadb2ab3749f58836cd /include/linux | |
| parent | 8f7b01a178b8e6a7b663a1bbaa1710756d67b69b (diff) | |
netfilter: get rid of atomic ops in fast path
We currently use a percpu spinlock to 'protect' rule bytes/packets
counters, after various attempts to use RCU instead.
Lately we added a seqlock so that get_counters() can run without
blocking BH or 'writers'. But we really only need the seqcount in it.
Spinlock itself is only locked by the current/owner cpu, so we can
remove it completely.
This cleans up the API, using the correct 'writer' vs 'reader' semantics.
At replace time, the get_counters() call makes sure all cpus are done
using the old table.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Jan Engelhardt <jengelh@medozas.de>
Signed-off-by: Patrick McHardy <kaber@trash.net>
Diffstat (limited to 'include/linux')
| -rw-r--r-- | include/linux/netfilter/x_tables.h | 96 |
1 files changed, 42 insertions, 54 deletions
diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 37219525ff6..32cddf78b13 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h | |||
| @@ -456,72 +456,60 @@ extern void xt_proto_fini(struct net *net, u_int8_t af); | |||
| 456 | extern struct xt_table_info *xt_alloc_table_info(unsigned int size); | 456 | extern struct xt_table_info *xt_alloc_table_info(unsigned int size); |
| 457 | extern void xt_free_table_info(struct xt_table_info *info); | 457 | extern void xt_free_table_info(struct xt_table_info *info); |
| 458 | 458 | ||
| 459 | /* | 459 | /** |
| 460 | * Per-CPU spinlock associated with per-cpu table entries, and | 460 | * xt_recseq - recursive seqcount for netfilter use |
| 461 | * with a counter for the "reading" side that allows a recursive | 461 | * |
| 462 | * reader to avoid taking the lock and deadlocking. | 462 | * Packet processing changes the seqcount only if no recursion happened |
| 463 | * | 463 | * get_counters() can use read_seqcount_begin()/read_seqcount_retry(), |
| 464 | * "reading" is used by ip/arp/ip6 tables rule processing which runs per-cpu. | 464 | * because we use the normal seqcount convention : |
| 465 | * It needs to ensure that the rules are not being changed while the packet | 465 | * Low order bit set to 1 if a writer is active. |
| 466 | * is being processed. In some cases, the read lock will be acquired | ||
| 467 | * twice on the same CPU; this is okay because of the count. | ||
| 468 | * | ||
| 469 | * "writing" is used when reading counters. | ||
| 470 | * During replace any readers that are using the old tables have to complete | ||
| 471 | * before freeing the old table. This is handled by the write locking | ||
| 472 | * necessary for reading the counters. | ||
| 473 | */ | 466 | */ |
| 474 | struct xt_info_lock { | 467 | DECLARE_PER_CPU(seqcount_t, xt_recseq); |
| 475 | seqlock_t lock; | ||
| 476 | unsigned char readers; | ||
| 477 | }; | ||
| 478 | DECLARE_PER_CPU(struct xt_info_lock, xt_info_locks); | ||
| 479 | 468 | ||
| 480 | /* | 469 | /** |
| 481 | * Note: we need to ensure that preemption is disabled before acquiring | 470 | * xt_write_recseq_begin - start of a write section |
| 482 | * the per-cpu-variable, so we do it as a two step process rather than | ||
| 483 | * using "spin_lock_bh()". | ||
| 484 | * | ||
| 485 | * We _also_ need to disable bottom half processing before updating our | ||
| 486 | * nesting count, to make sure that the only kind of re-entrancy is this | ||
| 487 | * code being called by itself: since the count+lock is not an atomic | ||
| 488 | * operation, we can allow no races. | ||
| 489 | * | 471 | * |
| 490 | * _Only_ that special combination of being per-cpu and never getting | 472 | * Begin packet processing : all readers must wait the end |
| 491 | * re-entered asynchronously means that the count is safe. | 473 | * 1) Must be called with preemption disabled |
| 474 | * 2) softirqs must be disabled too (or we should use irqsafe_cpu_add()) | ||
| 475 | * Returns : | ||
| 476 | * 1 if no recursion on this cpu | ||
| 477 | * 0 if recursion detected | ||
| 492 | */ | 478 | */ |
| 493 | static inline void xt_info_rdlock_bh(void) | 479 | static inline unsigned int xt_write_recseq_begin(void) |
| 494 | { | 480 | { |
| 495 | struct xt_info_lock *lock; | 481 | unsigned int addend; |
| 496 | 482 | ||
| 497 | local_bh_disable(); | 483 | /* |
| 498 | lock = &__get_cpu_var(xt_info_locks); | 484 | * Low order bit of sequence is set if we already |
| 499 | if (likely(!lock->readers++)) | 485 | * called xt_write_recseq_begin(). |
| 500 | write_seqlock(&lock->lock); | 486 | */ |
| 501 | } | 487 | addend = (__this_cpu_read(xt_recseq.sequence) + 1) & 1; |
| 502 | 488 | ||
| 503 | static inline void xt_info_rdunlock_bh(void) | 489 | /* |
| 504 | { | 490 | * This is kind of a write_seqcount_begin(), but addend is 0 or 1 |
| 505 | struct xt_info_lock *lock = &__get_cpu_var(xt_info_locks); | 491 | * We dont check addend value to avoid a test and conditional jump, |
| 492 | * since addend is most likely 1 | ||
| 493 | */ | ||
| 494 | __this_cpu_add(xt_recseq.sequence, addend); | ||
| 495 | smp_wmb(); | ||
| 506 | 496 | ||
| 507 | if (likely(!--lock->readers)) | 497 | return addend; |
| 508 | write_sequnlock(&lock->lock); | ||
| 509 | local_bh_enable(); | ||
| 510 | } | 498 | } |
| 511 | 499 | ||
| 512 | /* | 500 | /** |
| 513 | * The "writer" side needs to get exclusive access to the lock, | 501 | * xt_write_recseq_end - end of a write section |
| 514 | * regardless of readers. This must be called with bottom half | 502 | * @addend: return value from previous xt_write_recseq_begin() |
| 515 | * processing (and thus also preemption) disabled. | 503 | * |
| 504 | * End packet processing : all readers can proceed | ||
| 505 | * 1) Must be called with preemption disabled | ||
| 506 | * 2) softirqs must be disabled too (or we should use irqsafe_cpu_add()) | ||
| 516 | */ | 507 | */ |
| 517 | static inline void xt_info_wrlock(unsigned int cpu) | 508 | static inline void xt_write_recseq_end(unsigned int addend) |
| 518 | { | ||
| 519 | write_seqlock(&per_cpu(xt_info_locks, cpu).lock); | ||
| 520 | } | ||
| 521 | |||
| 522 | static inline void xt_info_wrunlock(unsigned int cpu) | ||
| 523 | { | 509 | { |
| 524 | write_sequnlock(&per_cpu(xt_info_locks, cpu).lock); | 510 | /* this is kind of a write_seqcount_end(), but addend is 0 or 1 */ |
| 511 | smp_wmb(); | ||
| 512 | __this_cpu_add(xt_recseq.sequence, addend); | ||
| 525 | } | 513 | } |
| 526 | 514 | ||
| 527 | /* | 515 | /* |
