author		Stephen Hemminger <shemminger@vyatta.com>	2009-02-20 04:35:32 -0500
committer	Patrick McHardy <kaber@trash.net>	2009-02-20 04:35:32 -0500
commit		784544739a25c30637397ace5489eeb6e15d7d49
tree		c48bbf30f3eb753858de9a03b74e81925cf39018 /net/ipv6
parent		323dbf96382f057d035afce0237f08e18571ac1d
netfilter: iptables: lock free counters
The reader/writer lock in ip_tables is acquired in the critical path of
processing packets and is one of the reasons that merely loading iptables can
cause a 20% performance loss. The rwlock serves two functions:
1) it prevents changes to table state (xt_replace) while the table is in use.
   This is now handled by using RCU on the xt_table. When a table is
   replaced, the new table(s) are put in place and the old table(s) are freed
   after an RCU grace period.
2) it provides synchronization when accessing the counter values.
   This is now handled by swapping in new table_info entries for each cpu,
   then summing the old values and putting the result back onto one cpu.
   On a busy system this may cause sampling to occur at different times on
   each cpu, but no packet/byte counts are lost in the process.
Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Successfully tested on my dual quad core machine too, but with iptables only (no ipv6 here).
BTW, my new "tbench 8" result is 2450 MB/s (it was 2150 MB/s not so long ago).
Acked-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
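For orientation, the snapshot sequence described in (2) corresponds to the new
alloc_counters() path in the diff below. The following is only a simplified
sketch (error handling trimmed; the function name snapshot_counters and the
local name shadow are illustrative, not part of the patch), built from the
helpers the patch adds or relies on:

/* Simplified sketch of the RCU-based counter snapshot; not the literal patch code. */
static struct xt_counters *snapshot_counters(struct xt_table *table)
{
	struct xt_table_info *private = table->private;
	struct xt_table_info *shadow;
	struct xt_counters *counters;

	/* Buffer that will hold the summed counters handed to user space. */
	counters = vmalloc(sizeof(struct xt_counters) * private->number);

	/* Per-cpu copy of the ruleset with every counter zeroed. */
	shadow = xt_alloc_table_info(private->size);
	clone_counters(shadow, private);

	mutex_lock(&table->lock);
	xt_table_entry_swap_rcu(private, shadow);	/* readers now update the zeroed copy */
	synchronize_net();				/* wait for readers still on the old entries */

	get_counters(shadow, counters);			/* sum the values accumulated so far */
	put_counters(private, counters);		/* fold the totals back onto one cpu */
	mutex_unlock(&table->lock);

	xt_free_table_info(shadow);
	return counters;
}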
Diffstat (limited to 'net/ipv6')
-rw-r--r--	net/ipv6/netfilter/ip6_tables.c | 119
1 file changed, 84 insertions, 35 deletions
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index d64594b6c061..34af7bb8df5f 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -382,10 +382,12 @@ ip6t_do_table(struct sk_buff *skb,
 	mtpar.family = tgpar.family = NFPROTO_IPV6;
 	tgpar.hooknum = hook;
 
-	read_lock_bh(&table->lock);
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
-	private = table->private;
-	table_base = (void *)private->entries[smp_processor_id()];
+
+	rcu_read_lock();
+	private = rcu_dereference(table->private);
+	table_base = rcu_dereference(private->entries[smp_processor_id()]);
+
 	e = get_entry(table_base, private->hook_entry[hook]);
 
 	/* For return from builtin chain */
@@ -483,7 +485,7 @@ ip6t_do_table(struct sk_buff *skb,
 #ifdef CONFIG_NETFILTER_DEBUG
 	((struct ip6t_entry *)table_base)->comefrom = NETFILTER_LINK_POISON;
 #endif
-	read_unlock_bh(&table->lock);
+	rcu_read_unlock();
 
 #ifdef DEBUG_ALLOW_ALL
 	return NF_ACCEPT;
@@ -964,11 +966,64 @@ get_counters(const struct xt_table_info *t,
 	}
 }
 
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct ip6t_entry *e,
+		     const struct xt_counters addme[],
+		     unsigned int *i)
+{
+	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
+
+/* Take values from counters and add them back onto the current cpu */
+static void put_counters(struct xt_table_info *t,
+			 const struct xt_counters counters[])
+{
+	unsigned int i, cpu;
+
+	local_bh_disable();
+	cpu = smp_processor_id();
+	i = 0;
+	IP6T_ENTRY_ITERATE(t->entries[cpu],
+			   t->size,
+			   add_counter_to_entry,
+			   counters,
+			   &i);
+	local_bh_enable();
+}
+
+static inline int
+zero_entry_counter(struct ip6t_entry *e, void *arg)
+{
+	e->counters.bcnt = 0;
+	e->counters.pcnt = 0;
+	return 0;
+}
+
+static void
+clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
+{
+	unsigned int cpu;
+	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
+
+	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
+	for_each_possible_cpu(cpu) {
+		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
+		IP6T_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
+				   zero_entry_counter, NULL);
+	}
+}
+
 static struct xt_counters *alloc_counters(struct xt_table *table)
 {
 	unsigned int countersize;
 	struct xt_counters *counters;
-	const struct xt_table_info *private = table->private;
+	struct xt_table_info *private = table->private;
+	struct xt_table_info *info;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	   (other than comefrom, which userspace doesn't care
@@ -977,14 +1032,28 @@ static struct xt_counters *alloc_counters(struct xt_table *table)
 	counters = vmalloc_node(countersize, numa_node_id());
 
 	if (counters == NULL)
-		return ERR_PTR(-ENOMEM);
+		goto nomem;
+
+	info = xt_alloc_table_info(private->size);
+	if (!info)
+		goto free_counters;
+
+	clone_counters(info, private);
+
+	mutex_lock(&table->lock);
+	xt_table_entry_swap_rcu(private, info);
+	synchronize_net();	/* Wait until smoke has cleared */
+
+	get_counters(info, counters);
+	put_counters(private, counters);
+	mutex_unlock(&table->lock);
 
-	/* First, sum counters... */
-	write_lock_bh(&table->lock);
-	get_counters(private, counters);
-	write_unlock_bh(&table->lock);
+	xt_free_table_info(info);
 
-	return counters;
+ free_counters:
+	vfree(counters);
+ nomem:
+	return ERR_PTR(-ENOMEM);
 }
 
 static int
@@ -1351,28 +1420,6 @@ do_replace(struct net *net, void __user *user, unsigned int len)
 	return ret;
 }
 
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static inline int
-add_counter_to_entry(struct ip6t_entry *e,
-		     const struct xt_counters addme[],
-		     unsigned int *i)
-{
-#if 0
-	duprintf("add_counter: Entry %u %lu/%lu + %lu/%lu\n",
-		 *i,
-		 (long unsigned int)e->counters.pcnt,
-		 (long unsigned int)e->counters.bcnt,
-		 (long unsigned int)addme[*i].pcnt,
-		 (long unsigned int)addme[*i].bcnt);
-#endif
-
-	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-	(*i)++;
-	return 0;
-}
-
 static int
 do_add_counters(struct net *net, void __user *user, unsigned int len,
 		int compat)
@@ -1433,13 +1480,14 @@ do_add_counters(struct net *net, void __user *user, unsigned int len,
 		goto free;
 	}
 
-	write_lock_bh(&t->lock);
+	mutex_lock(&t->lock);
 	private = t->private;
 	if (private->number != num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
 
+	preempt_disable();
 	i = 0;
 	/* Choose the copy that is on our node */
 	loc_cpu_entry = private->entries[raw_smp_processor_id()];
@@ -1448,8 +1496,9 @@ do_add_counters(struct net *net, void __user *user, unsigned int len,
 			  add_counter_to_entry,
 			  paddc,
 			  &i);
+	preempt_enable();
  unlock_up_free:
-	write_unlock_bh(&t->lock);
+	mutex_unlock(&t->lock);
 	xt_table_unlock(t);
 	module_put(t->me);
  free: