| author | Eric Dumazet <eric.dumazet@gmail.com> | 2011-01-10 14:11:38 -0500 |
|---|---|---|
| committer | Pablo Neira Ayuso <pablo@netfilter.org> | 2011-01-10 14:11:38 -0500 |
| commit | 83723d60717f8da0f53f91cf42a845ed56c09662 (patch) | |
| tree | 9d32edc2c6dc4849e63d422f8dad42606b2f984f | |
| parent | 45b9f509b7f5d2d792b3c03b78ddc8ec543e921b (diff) | |
netfilter: x_tables: don't block BH while reading counters
Using "iptables -L" with a lot of rules have a too big BH latency.
Jesper mentioned ~6 ms and worried of frame drops.
Switch to a per_cpu seqlock scheme, so that taking a snapshot of
counters doesnt need to block BH (for this cpu, but also other cpus).
This adds two increments on seqlock sequence per ipt_do_table() call,
its a reasonable cost for allowing "iptables -L" not block BH
processing.
Reported-by: Jesper Dangaard Brouer <hawk@comx.dk>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
CC: Patrick McHardy <kaber@trash.net>
Acked-by: Stephen Hemminger <shemminger@vyatta.com>
Acked-by: Jesper Dangaard Brouer <hawk@comx.dk>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | include/linux/netfilter/x_tables.h | 10 |
| -rw-r--r-- | net/ipv4/netfilter/arp_tables.c | 45 |
| -rw-r--r-- | net/ipv4/netfilter/ip_tables.c | 45 |
| -rw-r--r-- | net/ipv6/netfilter/ip6_tables.c | 45 |
| -rw-r--r-- | net/netfilter/x_tables.c | 3 |
5 files changed, 49 insertions, 99 deletions
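
Before the patch itself, here is a rough, self-contained userspace sketch of the counter-snapshot pattern the commit message describes: the update path bumps a sequence twice around each counter update, and the dump path retries its read of the (bcnt, pcnt) pair until no update raced with it, instead of blocking BH. This is an illustration only, not kernel code: the names (struct pcpu_counter, counter_add, counter_snapshot) are invented here, a single updater thread stands in for the per-cpu packet path, and C11 atomics stand in for the kernel's seqlock_t primitives.

```c
/* Userspace sketch of the seqcount snapshot scheme (illustrative names only).
 * Build with: cc -pthread sketch.c
 */
#include <inttypes.h>
#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct pcpu_counter {
	atomic_uint seq;        /* even = stable, odd = update in progress */
	_Atomic uint64_t bcnt;  /* byte count   */
	_Atomic uint64_t pcnt;  /* packet count */
};

static struct pcpu_counter ctr;

/* Update path (stand-in for the packet path): two sequence increments per
 * update, like the write_seqlock()/write_sequnlock() pair in the patch.
 * Only one thread ever updates a given counter, mirroring per-cpu data. */
static void counter_add(struct pcpu_counter *c, uint64_t bytes)
{
	atomic_fetch_add(&c->seq, 1);                    /* sequence goes odd  */
	atomic_store(&c->bcnt, atomic_load(&c->bcnt) + bytes);
	atomic_store(&c->pcnt, atomic_load(&c->pcnt) + 1);
	atomic_fetch_add(&c->seq, 1);                    /* sequence goes even */
}

/* Snapshot path (stand-in for "iptables -L"): never blocks the updater,
 * just retries until it reads a (bcnt, pcnt) pair no update raced with,
 * like the read_seqbegin()/read_seqretry() loop in get_counters(). */
static void counter_snapshot(struct pcpu_counter *c, uint64_t *bcnt, uint64_t *pcnt)
{
	unsigned int start;

	do {
		while ((start = atomic_load(&c->seq)) & 1)
			;                                /* update in progress */
		*bcnt = atomic_load(&c->bcnt);
		*pcnt = atomic_load(&c->pcnt);
	} while (atomic_load(&c->seq) != start);
}

static void *updater(void *arg)
{
	(void)arg;
	for (int i = 0; i < 1000000; i++)
		counter_add(&ctr, 1500);
	return NULL;
}

int main(void)
{
	pthread_t t;
	uint64_t bcnt, pcnt;

	pthread_create(&t, NULL, updater, NULL);
	counter_snapshot(&ctr, &bcnt, &pcnt);   /* consistent pair even mid-run */
	printf("racing snapshot: pcnt=%" PRIu64 " bcnt=%" PRIu64 "\n", pcnt, bcnt);
	pthread_join(&t, NULL);
	counter_snapshot(&ctr, &bcnt, &pcnt);
	printf("final snapshot:  pcnt=%" PRIu64 " bcnt=%" PRIu64 "\n", pcnt, bcnt);
	return 0;
}
```

The trade-off mirrored here is the one the changelog calls out: the hot path pays only two sequence increments per update, while the slow dump-side reader may have to retry a read but never disables BH or takes a lock.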
diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 742bec05144..6712e713b29 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -472,7 +472,7 @@ extern void xt_free_table_info(struct xt_table_info *info);
  * necessary for reading the counters.
  */
 struct xt_info_lock {
-	spinlock_t lock;
+	seqlock_t lock;
 	unsigned char readers;
 };
 DECLARE_PER_CPU(struct xt_info_lock, xt_info_locks);
@@ -497,7 +497,7 @@ static inline void xt_info_rdlock_bh(void)
 	local_bh_disable();
 	lock = &__get_cpu_var(xt_info_locks);
 	if (likely(!lock->readers++))
-		spin_lock(&lock->lock);
+		write_seqlock(&lock->lock);
 }
 
 static inline void xt_info_rdunlock_bh(void)
@@ -505,7 +505,7 @@ static inline void xt_info_rdunlock_bh(void)
 	struct xt_info_lock *lock = &__get_cpu_var(xt_info_locks);
 
 	if (likely(!--lock->readers))
-		spin_unlock(&lock->lock);
+		write_sequnlock(&lock->lock);
 	local_bh_enable();
 }
 
@@ -516,12 +516,12 @@ static inline void xt_info_rdunlock_bh(void)
  */
 static inline void xt_info_wrlock(unsigned int cpu)
 {
-	spin_lock(&per_cpu(xt_info_locks, cpu).lock);
+	write_seqlock(&per_cpu(xt_info_locks, cpu).lock);
 }
 
 static inline void xt_info_wrunlock(unsigned int cpu)
 {
-	spin_unlock(&per_cpu(xt_info_locks, cpu).lock);
+	write_sequnlock(&per_cpu(xt_info_locks, cpu).lock);
 }
 
 /*
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 3fac340a28d..e855fffaed9 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -710,42 +710,25 @@ static void get_counters(const struct xt_table_info *t,
 	struct arpt_entry *iter;
 	unsigned int cpu;
 	unsigned int i;
-	unsigned int curcpu = get_cpu();
-
-	/* Instead of clearing (by a previous call to memset())
-	 * the counters and using adds, we set the counters
-	 * with data used by 'current' CPU
-	 *
-	 * Bottom half has to be disabled to prevent deadlock
-	 * if new softirq were to run and call ipt_do_table
-	 */
-	local_bh_disable();
-	i = 0;
-	xt_entry_foreach(iter, t->entries[curcpu], t->size) {
-		SET_COUNTER(counters[i], iter->counters.bcnt,
-			    iter->counters.pcnt);
-		++i;
-	}
-	local_bh_enable();
-	/* Processing counters from other cpus, we can let bottom half enabled,
-	 * (preemption is disabled)
-	 */
 
 	for_each_possible_cpu(cpu) {
-		if (cpu == curcpu)
-			continue;
+		seqlock_t *lock = &per_cpu(xt_info_locks, cpu).lock;
+
 		i = 0;
-		local_bh_disable();
-		xt_info_wrlock(cpu);
 		xt_entry_foreach(iter, t->entries[cpu], t->size) {
-			ADD_COUNTER(counters[i], iter->counters.bcnt,
-				    iter->counters.pcnt);
+			u64 bcnt, pcnt;
+			unsigned int start;
+
+			do {
+				start = read_seqbegin(lock);
+				bcnt = iter->counters.bcnt;
+				pcnt = iter->counters.pcnt;
+			} while (read_seqretry(lock, start));
+
+			ADD_COUNTER(counters[i], bcnt, pcnt);
 			++i;
 		}
-		xt_info_wrunlock(cpu);
-		local_bh_enable();
 	}
-	put_cpu();
 }
 
 static struct xt_counters *alloc_counters(const struct xt_table *table)
@@ -759,7 +742,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table)
 	 * about).
 	 */
 	countersize = sizeof(struct xt_counters) * private->number;
-	counters = vmalloc(countersize);
+	counters = vzalloc(countersize);
 
 	if (counters == NULL)
 		return ERR_PTR(-ENOMEM);
@@ -1007,7 +990,7 @@ static int __do_replace(struct net *net, const char *name,
 	struct arpt_entry *iter;
 
 	ret = 0;
-	counters = vmalloc(num_counters * sizeof(struct xt_counters));
+	counters = vzalloc(num_counters * sizeof(struct xt_counters));
 	if (!counters) {
 		ret = -ENOMEM;
 		goto out;
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index a846d633b3b..652efea013d 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -884,42 +884,25 @@ get_counters(const struct xt_table_info *t,
 	struct ipt_entry *iter;
 	unsigned int cpu;
 	unsigned int i;
-	unsigned int curcpu = get_cpu();
-
-	/* Instead of clearing (by a previous call to memset())
-	 * the counters and using adds, we set the counters
-	 * with data used by 'current' CPU.
-	 *
-	 * Bottom half has to be disabled to prevent deadlock
-	 * if new softirq were to run and call ipt_do_table
-	 */
-	local_bh_disable();
-	i = 0;
-	xt_entry_foreach(iter, t->entries[curcpu], t->size) {
-		SET_COUNTER(counters[i], iter->counters.bcnt,
-			    iter->counters.pcnt);
-		++i;
-	}
-	local_bh_enable();
-	/* Processing counters from other cpus, we can let bottom half enabled,
-	 * (preemption is disabled)
-	 */
 
 	for_each_possible_cpu(cpu) {
-		if (cpu == curcpu)
-			continue;
+		seqlock_t *lock = &per_cpu(xt_info_locks, cpu).lock;
+
 		i = 0;
-		local_bh_disable();
-		xt_info_wrlock(cpu);
 		xt_entry_foreach(iter, t->entries[cpu], t->size) {
-			ADD_COUNTER(counters[i], iter->counters.bcnt,
-				    iter->counters.pcnt);
+			u64 bcnt, pcnt;
+			unsigned int start;
+
+			do {
+				start = read_seqbegin(lock);
+				bcnt = iter->counters.bcnt;
+				pcnt = iter->counters.pcnt;
+			} while (read_seqretry(lock, start));
+
+			ADD_COUNTER(counters[i], bcnt, pcnt);
 			++i; /* macro does multi eval of i */
 		}
-		xt_info_wrunlock(cpu);
-		local_bh_enable();
 	}
-	put_cpu();
 }
 
 static struct xt_counters *alloc_counters(const struct xt_table *table)
@@ -932,7 +915,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table)
 	   (other than comefrom, which userspace doesn't care
 	   about). */
 	countersize = sizeof(struct xt_counters) * private->number;
-	counters = vmalloc(countersize);
+	counters = vzalloc(countersize);
 
 	if (counters == NULL)
 		return ERR_PTR(-ENOMEM);
@@ -1203,7 +1186,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
 	struct ipt_entry *iter;
 
 	ret = 0;
-	counters = vmalloc(num_counters * sizeof(struct xt_counters));
+	counters = vzalloc(num_counters * sizeof(struct xt_counters));
 	if (!counters) {
 		ret = -ENOMEM;
 		goto out;
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 455582384ec..7d227c644f7 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -897,42 +897,25 @@ get_counters(const struct xt_table_info *t,
 	struct ip6t_entry *iter;
 	unsigned int cpu;
 	unsigned int i;
-	unsigned int curcpu = get_cpu();
-
-	/* Instead of clearing (by a previous call to memset())
-	 * the counters and using adds, we set the counters
-	 * with data used by 'current' CPU
-	 *
-	 * Bottom half has to be disabled to prevent deadlock
-	 * if new softirq were to run and call ipt_do_table
-	 */
-	local_bh_disable();
-	i = 0;
-	xt_entry_foreach(iter, t->entries[curcpu], t->size) {
-		SET_COUNTER(counters[i], iter->counters.bcnt,
-			    iter->counters.pcnt);
-		++i;
-	}
-	local_bh_enable();
-	/* Processing counters from other cpus, we can let bottom half enabled,
-	 * (preemption is disabled)
-	 */
 
 	for_each_possible_cpu(cpu) {
-		if (cpu == curcpu)
-			continue;
+		seqlock_t *lock = &per_cpu(xt_info_locks, cpu).lock;
+
 		i = 0;
-		local_bh_disable();
-		xt_info_wrlock(cpu);
 		xt_entry_foreach(iter, t->entries[cpu], t->size) {
-			ADD_COUNTER(counters[i], iter->counters.bcnt,
-				    iter->counters.pcnt);
+			u64 bcnt, pcnt;
+			unsigned int start;
+
+			do {
+				start = read_seqbegin(lock);
+				bcnt = iter->counters.bcnt;
+				pcnt = iter->counters.pcnt;
+			} while (read_seqretry(lock, start));
+
+			ADD_COUNTER(counters[i], bcnt, pcnt);
 			++i;
 		}
-		xt_info_wrunlock(cpu);
-		local_bh_enable();
 	}
-	put_cpu();
 }
 
 static struct xt_counters *alloc_counters(const struct xt_table *table)
@@ -945,7 +928,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table)
 	   (other than comefrom, which userspace doesn't care
 	   about). */
 	countersize = sizeof(struct xt_counters) * private->number;
-	counters = vmalloc(countersize);
+	counters = vzalloc(countersize);
 
 	if (counters == NULL)
 		return ERR_PTR(-ENOMEM);
@@ -1216,7 +1199,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
 	struct ip6t_entry *iter;
 
 	ret = 0;
-	counters = vmalloc(num_counters * sizeof(struct xt_counters));
+	counters = vzalloc(num_counters * sizeof(struct xt_counters));
 	if (!counters) {
 		ret = -ENOMEM;
 		goto out;
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 80463507420..c9423763107 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -1325,7 +1325,8 @@ static int __init xt_init(void)
 
 	for_each_possible_cpu(i) {
 		struct xt_info_lock *lock = &per_cpu(xt_info_locks, i);
-		spin_lock_init(&lock->lock);
+
+		seqlock_init(&lock->lock);
 		lock->readers = 0;
 	}
 
