diff options
author | Stephen Hemminger <shemminger@vyatta.com> | 2009-04-29 01:36:33 -0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2009-04-29 01:36:33 -0400 |
commit | 942e4a2bd680c606af0211e64eb216be2e19bf61 (patch) | |
tree | a83af49242d4a8d53aa0f3b5814eb17da72edc09 /net/ipv6 | |
parent | bf368e4e70cd4e0f880923c44e95a4273d725ab4 (diff) |
netfilter: revised locking for x_tables
The x_tables are organized with a table structure and a per-cpu copies
of the counters and rules. On older kernels there was a reader/writer
lock per table which was a performance bottleneck. In 2.6.30-rc, this
was converted to use RCU and the counters/rules which solved the performance
problems for do_table but made replacing rules much slower because of
the necessary RCU grace period.
This version uses a per-cpu set of spinlocks and counters to allow to
table processing to proceed without the cache thrashing of a global
reader lock and keeps the same performance for table updates.
Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv6')
-rw-r--r-- | net/ipv6/netfilter/ip6_tables.c | 123 |
1 files changed, 37 insertions, 86 deletions
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 800ae8542471..219e165aea10 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c | |||
@@ -365,9 +365,9 @@ ip6t_do_table(struct sk_buff *skb, | |||
365 | 365 | ||
366 | IP_NF_ASSERT(table->valid_hooks & (1 << hook)); | 366 | IP_NF_ASSERT(table->valid_hooks & (1 << hook)); |
367 | 367 | ||
368 | rcu_read_lock_bh(); | 368 | xt_info_rdlock_bh(); |
369 | private = rcu_dereference(table->private); | 369 | private = table->private; |
370 | table_base = rcu_dereference(private->entries[smp_processor_id()]); | 370 | table_base = private->entries[smp_processor_id()]; |
371 | 371 | ||
372 | e = get_entry(table_base, private->hook_entry[hook]); | 372 | e = get_entry(table_base, private->hook_entry[hook]); |
373 | 373 | ||
@@ -466,7 +466,7 @@ ip6t_do_table(struct sk_buff *skb, | |||
466 | #ifdef CONFIG_NETFILTER_DEBUG | 466 | #ifdef CONFIG_NETFILTER_DEBUG |
467 | ((struct ip6t_entry *)table_base)->comefrom = NETFILTER_LINK_POISON; | 467 | ((struct ip6t_entry *)table_base)->comefrom = NETFILTER_LINK_POISON; |
468 | #endif | 468 | #endif |
469 | rcu_read_unlock_bh(); | 469 | xt_info_rdunlock_bh(); |
470 | 470 | ||
471 | #ifdef DEBUG_ALLOW_ALL | 471 | #ifdef DEBUG_ALLOW_ALL |
472 | return NF_ACCEPT; | 472 | return NF_ACCEPT; |
@@ -926,9 +926,12 @@ get_counters(const struct xt_table_info *t, | |||
926 | /* Instead of clearing (by a previous call to memset()) | 926 | /* Instead of clearing (by a previous call to memset()) |
927 | * the counters and using adds, we set the counters | 927 | * the counters and using adds, we set the counters |
928 | * with data used by 'current' CPU | 928 | * with data used by 'current' CPU |
929 | * We dont care about preemption here. | 929 | * |
930 | * Bottom half has to be disabled to prevent deadlock | ||
931 | * if new softirq were to run and call ipt_do_table | ||
930 | */ | 932 | */ |
931 | curcpu = raw_smp_processor_id(); | 933 | local_bh_disable(); |
934 | curcpu = smp_processor_id(); | ||
932 | 935 | ||
933 | i = 0; | 936 | i = 0; |
934 | IP6T_ENTRY_ITERATE(t->entries[curcpu], | 937 | IP6T_ENTRY_ITERATE(t->entries[curcpu], |
@@ -941,72 +944,22 @@ get_counters(const struct xt_table_info *t, | |||
941 | if (cpu == curcpu) | 944 | if (cpu == curcpu) |
942 | continue; | 945 | continue; |
943 | i = 0; | 946 | i = 0; |
947 | xt_info_wrlock(cpu); | ||
944 | IP6T_ENTRY_ITERATE(t->entries[cpu], | 948 | IP6T_ENTRY_ITERATE(t->entries[cpu], |
945 | t->size, | 949 | t->size, |
946 | add_entry_to_counter, | 950 | add_entry_to_counter, |
947 | counters, | 951 | counters, |
948 | &i); | 952 | &i); |
953 | xt_info_wrunlock(cpu); | ||
949 | } | 954 | } |
950 | } | ||
951 | |||
952 | /* We're lazy, and add to the first CPU; overflow works its fey magic | ||
953 | * and everything is OK. */ | ||
954 | static int | ||
955 | add_counter_to_entry(struct ip6t_entry *e, | ||
956 | const struct xt_counters addme[], | ||
957 | unsigned int *i) | ||
958 | { | ||
959 | ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); | ||
960 | |||
961 | (*i)++; | ||
962 | return 0; | ||
963 | } | ||
964 | |||
965 | /* Take values from counters and add them back onto the current cpu */ | ||
966 | static void put_counters(struct xt_table_info *t, | ||
967 | const struct xt_counters counters[]) | ||
968 | { | ||
969 | unsigned int i, cpu; | ||
970 | |||
971 | local_bh_disable(); | ||
972 | cpu = smp_processor_id(); | ||
973 | i = 0; | ||
974 | IP6T_ENTRY_ITERATE(t->entries[cpu], | ||
975 | t->size, | ||
976 | add_counter_to_entry, | ||
977 | counters, | ||
978 | &i); | ||
979 | local_bh_enable(); | 955 | local_bh_enable(); |
980 | } | 956 | } |
981 | 957 | ||
982 | static inline int | ||
983 | zero_entry_counter(struct ip6t_entry *e, void *arg) | ||
984 | { | ||
985 | e->counters.bcnt = 0; | ||
986 | e->counters.pcnt = 0; | ||
987 | return 0; | ||
988 | } | ||
989 | |||
990 | static void | ||
991 | clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info) | ||
992 | { | ||
993 | unsigned int cpu; | ||
994 | const void *loc_cpu_entry = info->entries[raw_smp_processor_id()]; | ||
995 | |||
996 | memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); | ||
997 | for_each_possible_cpu(cpu) { | ||
998 | memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size); | ||
999 | IP6T_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size, | ||
1000 | zero_entry_counter, NULL); | ||
1001 | } | ||
1002 | } | ||
1003 | |||
1004 | static struct xt_counters *alloc_counters(struct xt_table *table) | 958 | static struct xt_counters *alloc_counters(struct xt_table *table) |
1005 | { | 959 | { |
1006 | unsigned int countersize; | 960 | unsigned int countersize; |
1007 | struct xt_counters *counters; | 961 | struct xt_counters *counters; |
1008 | struct xt_table_info *private = table->private; | 962 | struct xt_table_info *private = table->private; |
1009 | struct xt_table_info *info; | ||
1010 | 963 | ||
1011 | /* We need atomic snapshot of counters: rest doesn't change | 964 | /* We need atomic snapshot of counters: rest doesn't change |
1012 | (other than comefrom, which userspace doesn't care | 965 | (other than comefrom, which userspace doesn't care |
@@ -1015,30 +968,11 @@ static struct xt_counters *alloc_counters(struct xt_table *table) | |||
1015 | counters = vmalloc_node(countersize, numa_node_id()); | 968 | counters = vmalloc_node(countersize, numa_node_id()); |
1016 | 969 | ||
1017 | if (counters == NULL) | 970 | if (counters == NULL) |
1018 | goto nomem; | 971 | return ERR_PTR(-ENOMEM); |
1019 | 972 | ||
1020 | info = xt_alloc_table_info(private->size); | 973 | get_counters(private, counters); |
1021 | if (!info) | ||
1022 | goto free_counters; | ||
1023 | |||
1024 | clone_counters(info, private); | ||
1025 | |||
1026 | mutex_lock(&table->lock); | ||
1027 | xt_table_entry_swap_rcu(private, info); | ||
1028 | synchronize_net(); /* Wait until smoke has cleared */ | ||
1029 | |||
1030 | get_counters(info, counters); | ||
1031 | put_counters(private, counters); | ||
1032 | mutex_unlock(&table->lock); | ||
1033 | |||
1034 | xt_free_table_info(info); | ||
1035 | 974 | ||
1036 | return counters; | 975 | return counters; |
1037 | |||
1038 | free_counters: | ||
1039 | vfree(counters); | ||
1040 | nomem: | ||
1041 | return ERR_PTR(-ENOMEM); | ||
1042 | } | 976 | } |
1043 | 977 | ||
1044 | static int | 978 | static int |
@@ -1334,8 +1268,9 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, | |||
1334 | (newinfo->number <= oldinfo->initial_entries)) | 1268 | (newinfo->number <= oldinfo->initial_entries)) |
1335 | module_put(t->me); | 1269 | module_put(t->me); |
1336 | 1270 | ||
1337 | /* Get the old counters. */ | 1271 | /* Get the old counters, and synchronize with replace */ |
1338 | get_counters(oldinfo, counters); | 1272 | get_counters(oldinfo, counters); |
1273 | |||
1339 | /* Decrease module usage counts and free resource */ | 1274 | /* Decrease module usage counts and free resource */ |
1340 | loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; | 1275 | loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; |
1341 | IP6T_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry, | 1276 | IP6T_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry, |
@@ -1405,11 +1340,24 @@ do_replace(struct net *net, void __user *user, unsigned int len) | |||
1405 | return ret; | 1340 | return ret; |
1406 | } | 1341 | } |
1407 | 1342 | ||
1343 | /* We're lazy, and add to the first CPU; overflow works its fey magic | ||
1344 | * and everything is OK. */ | ||
1345 | static int | ||
1346 | add_counter_to_entry(struct ip6t_entry *e, | ||
1347 | const struct xt_counters addme[], | ||
1348 | unsigned int *i) | ||
1349 | { | ||
1350 | ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); | ||
1351 | |||
1352 | (*i)++; | ||
1353 | return 0; | ||
1354 | } | ||
1355 | |||
1408 | static int | 1356 | static int |
1409 | do_add_counters(struct net *net, void __user *user, unsigned int len, | 1357 | do_add_counters(struct net *net, void __user *user, unsigned int len, |
1410 | int compat) | 1358 | int compat) |
1411 | { | 1359 | { |
1412 | unsigned int i; | 1360 | unsigned int i, curcpu; |
1413 | struct xt_counters_info tmp; | 1361 | struct xt_counters_info tmp; |
1414 | struct xt_counters *paddc; | 1362 | struct xt_counters *paddc; |
1415 | unsigned int num_counters; | 1363 | unsigned int num_counters; |
@@ -1465,25 +1413,28 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, | |||
1465 | goto free; | 1413 | goto free; |
1466 | } | 1414 | } |
1467 | 1415 | ||
1468 | mutex_lock(&t->lock); | 1416 | |
1417 | local_bh_disable(); | ||
1469 | private = t->private; | 1418 | private = t->private; |
1470 | if (private->number != num_counters) { | 1419 | if (private->number != num_counters) { |
1471 | ret = -EINVAL; | 1420 | ret = -EINVAL; |
1472 | goto unlock_up_free; | 1421 | goto unlock_up_free; |
1473 | } | 1422 | } |
1474 | 1423 | ||
1475 | preempt_disable(); | ||
1476 | i = 0; | 1424 | i = 0; |
1477 | /* Choose the copy that is on our node */ | 1425 | /* Choose the copy that is on our node */ |
1478 | loc_cpu_entry = private->entries[raw_smp_processor_id()]; | 1426 | curcpu = smp_processor_id(); |
1427 | xt_info_wrlock(curcpu); | ||
1428 | loc_cpu_entry = private->entries[curcpu]; | ||
1479 | IP6T_ENTRY_ITERATE(loc_cpu_entry, | 1429 | IP6T_ENTRY_ITERATE(loc_cpu_entry, |
1480 | private->size, | 1430 | private->size, |
1481 | add_counter_to_entry, | 1431 | add_counter_to_entry, |
1482 | paddc, | 1432 | paddc, |
1483 | &i); | 1433 | &i); |
1484 | preempt_enable(); | 1434 | xt_info_wrunlock(curcpu); |
1435 | |||
1485 | unlock_up_free: | 1436 | unlock_up_free: |
1486 | mutex_unlock(&t->lock); | 1437 | local_bh_enable(); |
1487 | xt_table_unlock(t); | 1438 | xt_table_unlock(t); |
1488 | module_put(t->me); | 1439 | module_put(t->me); |
1489 | free: | 1440 | free: |