aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorStephen Hemminger <shemminger@vyatta.com>2009-04-29 01:36:33 -0400
committerDavid S. Miller <davem@davemloft.net>2009-04-29 01:36:33 -0400
commit942e4a2bd680c606af0211e64eb216be2e19bf61 (patch)
treea83af49242d4a8d53aa0f3b5814eb17da72edc09
parentbf368e4e70cd4e0f880923c44e95a4273d725ab4 (diff)
netfilter: revised locking for x_tables
The x_tables are organized with a table structure and a per-cpu copies of the counters and rules. On older kernels there was a reader/writer lock per table which was a performance bottleneck. In 2.6.30-rc, this was converted to use RCU and the counters/rules which solved the performance problems for do_table but made replacing rules much slower because of the necessary RCU grace period. This version uses a per-cpu set of spinlocks and counters to allow to table processing to proceed without the cache thrashing of a global reader lock and keeps the same performance for table updates. Signed-off-by: Stephen Hemminger <shemminger@vyatta.com> Acked-by: Linus Torvalds <torvalds@linux-foundation.org> Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--include/linux/netfilter/x_tables.h73
-rw-r--r--net/ipv4/netfilter/arp_tables.c125
-rw-r--r--net/ipv4/netfilter/ip_tables.c126
-rw-r--r--net/ipv6/netfilter/ip6_tables.c123
-rw-r--r--net/netfilter/x_tables.c53
5 files changed, 204 insertions, 296 deletions
diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 7b1a652066c0..1b2e43502ef7 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -354,9 +354,6 @@ struct xt_table
354 /* What hooks you will enter on */ 354 /* What hooks you will enter on */
355 unsigned int valid_hooks; 355 unsigned int valid_hooks;
356 356
357 /* Lock for the curtain */
358 struct mutex lock;
359
360 /* Man behind the curtain... */ 357 /* Man behind the curtain... */
361 struct xt_table_info *private; 358 struct xt_table_info *private;
362 359
@@ -434,8 +431,74 @@ extern void xt_proto_fini(struct net *net, u_int8_t af);
434 431
435extern struct xt_table_info *xt_alloc_table_info(unsigned int size); 432extern struct xt_table_info *xt_alloc_table_info(unsigned int size);
436extern void xt_free_table_info(struct xt_table_info *info); 433extern void xt_free_table_info(struct xt_table_info *info);
437extern void xt_table_entry_swap_rcu(struct xt_table_info *old, 434
438 struct xt_table_info *new); 435/*
436 * Per-CPU spinlock associated with per-cpu table entries, and
437 * with a counter for the "reading" side that allows a recursive
438 * reader to avoid taking the lock and deadlocking.
439 *
440 * "reading" is used by ip/arp/ip6 tables rule processing which runs per-cpu.
441 * It needs to ensure that the rules are not being changed while the packet
442 * is being processed. In some cases, the read lock will be acquired
443 * twice on the same CPU; this is okay because of the count.
444 *
445 * "writing" is used when reading counters.
446 * During replace any readers that are using the old tables have to complete
447 * before freeing the old table. This is handled by the write locking
448 * necessary for reading the counters.
449 */
450struct xt_info_lock {
451 spinlock_t lock;
452 unsigned char readers;
453};
454DECLARE_PER_CPU(struct xt_info_lock, xt_info_locks);
455
456/*
457 * Note: we need to ensure that preemption is disabled before acquiring
458 * the per-cpu-variable, so we do it as a two step process rather than
459 * using "spin_lock_bh()".
460 *
461 * We _also_ need to disable bottom half processing before updating our
462 * nesting count, to make sure that the only kind of re-entrancy is this
463 * code being called by itself: since the count+lock is not an atomic
464 * operation, we can allow no races.
465 *
466 * _Only_ that special combination of being per-cpu and never getting
467 * re-entered asynchronously means that the count is safe.
468 */
469static inline void xt_info_rdlock_bh(void)
470{
471 struct xt_info_lock *lock;
472
473 local_bh_disable();
474 lock = &__get_cpu_var(xt_info_locks);
475 if (!lock->readers++)
476 spin_lock(&lock->lock);
477}
478
479static inline void xt_info_rdunlock_bh(void)
480{
481 struct xt_info_lock *lock = &__get_cpu_var(xt_info_locks);
482
483 if (!--lock->readers)
484 spin_unlock(&lock->lock);
485 local_bh_enable();
486}
487
488/*
489 * The "writer" side needs to get exclusive access to the lock,
490 * regardless of readers. This must be called with bottom half
491 * processing (and thus also preemption) disabled.
492 */
493static inline void xt_info_wrlock(unsigned int cpu)
494{
495 spin_lock(&per_cpu(xt_info_locks, cpu).lock);
496}
497
498static inline void xt_info_wrunlock(unsigned int cpu)
499{
500 spin_unlock(&per_cpu(xt_info_locks, cpu).lock);
501}
439 502
440/* 503/*
441 * This helper is performance critical and must be inlined 504 * This helper is performance critical and must be inlined
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 5ba533d234db..831fe1879dc0 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -253,9 +253,9 @@ unsigned int arpt_do_table(struct sk_buff *skb,
253 indev = in ? in->name : nulldevname; 253 indev = in ? in->name : nulldevname;
254 outdev = out ? out->name : nulldevname; 254 outdev = out ? out->name : nulldevname;
255 255
256 rcu_read_lock_bh(); 256 xt_info_rdlock_bh();
257 private = rcu_dereference(table->private); 257 private = table->private;
258 table_base = rcu_dereference(private->entries[smp_processor_id()]); 258 table_base = private->entries[smp_processor_id()];
259 259
260 e = get_entry(table_base, private->hook_entry[hook]); 260 e = get_entry(table_base, private->hook_entry[hook]);
261 back = get_entry(table_base, private->underflow[hook]); 261 back = get_entry(table_base, private->underflow[hook]);
@@ -273,6 +273,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
273 273
274 hdr_len = sizeof(*arp) + (2 * sizeof(struct in_addr)) + 274 hdr_len = sizeof(*arp) + (2 * sizeof(struct in_addr)) +
275 (2 * skb->dev->addr_len); 275 (2 * skb->dev->addr_len);
276
276 ADD_COUNTER(e->counters, hdr_len, 1); 277 ADD_COUNTER(e->counters, hdr_len, 1);
277 278
278 t = arpt_get_target(e); 279 t = arpt_get_target(e);
@@ -328,8 +329,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
328 e = (void *)e + e->next_offset; 329 e = (void *)e + e->next_offset;
329 } 330 }
330 } while (!hotdrop); 331 } while (!hotdrop);
331 332 xt_info_rdunlock_bh();
332 rcu_read_unlock_bh();
333 333
334 if (hotdrop) 334 if (hotdrop)
335 return NF_DROP; 335 return NF_DROP;
@@ -711,9 +711,12 @@ static void get_counters(const struct xt_table_info *t,
711 /* Instead of clearing (by a previous call to memset()) 711 /* Instead of clearing (by a previous call to memset())
712 * the counters and using adds, we set the counters 712 * the counters and using adds, we set the counters
713 * with data used by 'current' CPU 713 * with data used by 'current' CPU
714 * We dont care about preemption here. 714 *
715 * Bottom half has to be disabled to prevent deadlock
716 * if new softirq were to run and call ipt_do_table
715 */ 717 */
716 curcpu = raw_smp_processor_id(); 718 local_bh_disable();
719 curcpu = smp_processor_id();
717 720
718 i = 0; 721 i = 0;
719 ARPT_ENTRY_ITERATE(t->entries[curcpu], 722 ARPT_ENTRY_ITERATE(t->entries[curcpu],
@@ -726,73 +729,22 @@ static void get_counters(const struct xt_table_info *t,
726 if (cpu == curcpu) 729 if (cpu == curcpu)
727 continue; 730 continue;
728 i = 0; 731 i = 0;
732 xt_info_wrlock(cpu);
729 ARPT_ENTRY_ITERATE(t->entries[cpu], 733 ARPT_ENTRY_ITERATE(t->entries[cpu],
730 t->size, 734 t->size,
731 add_entry_to_counter, 735 add_entry_to_counter,
732 counters, 736 counters,
733 &i); 737 &i);
738 xt_info_wrunlock(cpu);
734 } 739 }
735}
736
737
738/* We're lazy, and add to the first CPU; overflow works its fey magic
739 * and everything is OK. */
740static int
741add_counter_to_entry(struct arpt_entry *e,
742 const struct xt_counters addme[],
743 unsigned int *i)
744{
745 ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
746
747 (*i)++;
748 return 0;
749}
750
751/* Take values from counters and add them back onto the current cpu */
752static void put_counters(struct xt_table_info *t,
753 const struct xt_counters counters[])
754{
755 unsigned int i, cpu;
756
757 local_bh_disable();
758 cpu = smp_processor_id();
759 i = 0;
760 ARPT_ENTRY_ITERATE(t->entries[cpu],
761 t->size,
762 add_counter_to_entry,
763 counters,
764 &i);
765 local_bh_enable(); 740 local_bh_enable();
766} 741}
767 742
768static inline int
769zero_entry_counter(struct arpt_entry *e, void *arg)
770{
771 e->counters.bcnt = 0;
772 e->counters.pcnt = 0;
773 return 0;
774}
775
776static void
777clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
778{
779 unsigned int cpu;
780 const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
781
782 memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
783 for_each_possible_cpu(cpu) {
784 memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
785 ARPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
786 zero_entry_counter, NULL);
787 }
788}
789
790static struct xt_counters *alloc_counters(struct xt_table *table) 743static struct xt_counters *alloc_counters(struct xt_table *table)
791{ 744{
792 unsigned int countersize; 745 unsigned int countersize;
793 struct xt_counters *counters; 746 struct xt_counters *counters;
794 struct xt_table_info *private = table->private; 747 struct xt_table_info *private = table->private;
795 struct xt_table_info *info;
796 748
797 /* We need atomic snapshot of counters: rest doesn't change 749 /* We need atomic snapshot of counters: rest doesn't change
798 * (other than comefrom, which userspace doesn't care 750 * (other than comefrom, which userspace doesn't care
@@ -802,30 +754,11 @@ static struct xt_counters *alloc_counters(struct xt_table *table)
802 counters = vmalloc_node(countersize, numa_node_id()); 754 counters = vmalloc_node(countersize, numa_node_id());
803 755
804 if (counters == NULL) 756 if (counters == NULL)
805 goto nomem; 757 return ERR_PTR(-ENOMEM);
806
807 info = xt_alloc_table_info(private->size);
808 if (!info)
809 goto free_counters;
810
811 clone_counters(info, private);
812
813 mutex_lock(&table->lock);
814 xt_table_entry_swap_rcu(private, info);
815 synchronize_net(); /* Wait until smoke has cleared */
816 758
817 get_counters(info, counters); 759 get_counters(private, counters);
818 put_counters(private, counters);
819 mutex_unlock(&table->lock);
820
821 xt_free_table_info(info);
822 760
823 return counters; 761 return counters;
824
825 free_counters:
826 vfree(counters);
827 nomem:
828 return ERR_PTR(-ENOMEM);
829} 762}
830 763
831static int copy_entries_to_user(unsigned int total_size, 764static int copy_entries_to_user(unsigned int total_size,
@@ -1094,8 +1027,9 @@ static int __do_replace(struct net *net, const char *name,
1094 (newinfo->number <= oldinfo->initial_entries)) 1027 (newinfo->number <= oldinfo->initial_entries))
1095 module_put(t->me); 1028 module_put(t->me);
1096 1029
1097 /* Get the old counters. */ 1030 /* Get the old counters, and synchronize with replace */
1098 get_counters(oldinfo, counters); 1031 get_counters(oldinfo, counters);
1032
1099 /* Decrease module usage counts and free resource */ 1033 /* Decrease module usage counts and free resource */
1100 loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; 1034 loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
1101 ARPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry, 1035 ARPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,
@@ -1165,10 +1099,23 @@ static int do_replace(struct net *net, void __user *user, unsigned int len)
1165 return ret; 1099 return ret;
1166} 1100}
1167 1101
1102/* We're lazy, and add to the first CPU; overflow works its fey magic
1103 * and everything is OK. */
1104static int
1105add_counter_to_entry(struct arpt_entry *e,
1106 const struct xt_counters addme[],
1107 unsigned int *i)
1108{
1109 ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
1110
1111 (*i)++;
1112 return 0;
1113}
1114
1168static int do_add_counters(struct net *net, void __user *user, unsigned int len, 1115static int do_add_counters(struct net *net, void __user *user, unsigned int len,
1169 int compat) 1116 int compat)
1170{ 1117{
1171 unsigned int i; 1118 unsigned int i, curcpu;
1172 struct xt_counters_info tmp; 1119 struct xt_counters_info tmp;
1173 struct xt_counters *paddc; 1120 struct xt_counters *paddc;
1174 unsigned int num_counters; 1121 unsigned int num_counters;
@@ -1224,26 +1171,26 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len,
1224 goto free; 1171 goto free;
1225 } 1172 }
1226 1173
1227 mutex_lock(&t->lock); 1174 local_bh_disable();
1228 private = t->private; 1175 private = t->private;
1229 if (private->number != num_counters) { 1176 if (private->number != num_counters) {
1230 ret = -EINVAL; 1177 ret = -EINVAL;
1231 goto unlock_up_free; 1178 goto unlock_up_free;
1232 } 1179 }
1233 1180
1234 preempt_disable();
1235 i = 0; 1181 i = 0;
1236 /* Choose the copy that is on our node */ 1182 /* Choose the copy that is on our node */
1237 loc_cpu_entry = private->entries[smp_processor_id()]; 1183 curcpu = smp_processor_id();
1184 loc_cpu_entry = private->entries[curcpu];
1185 xt_info_wrlock(curcpu);
1238 ARPT_ENTRY_ITERATE(loc_cpu_entry, 1186 ARPT_ENTRY_ITERATE(loc_cpu_entry,
1239 private->size, 1187 private->size,
1240 add_counter_to_entry, 1188 add_counter_to_entry,
1241 paddc, 1189 paddc,
1242 &i); 1190 &i);
1243 preempt_enable(); 1191 xt_info_wrunlock(curcpu);
1244 unlock_up_free: 1192 unlock_up_free:
1245 mutex_unlock(&t->lock); 1193 local_bh_enable();
1246
1247 xt_table_unlock(t); 1194 xt_table_unlock(t);
1248 module_put(t->me); 1195 module_put(t->me);
1249 free: 1196 free:
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 810c0b62c7d4..2ec8d7290c40 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -338,10 +338,9 @@ ipt_do_table(struct sk_buff *skb,
338 tgpar.hooknum = hook; 338 tgpar.hooknum = hook;
339 339
340 IP_NF_ASSERT(table->valid_hooks & (1 << hook)); 340 IP_NF_ASSERT(table->valid_hooks & (1 << hook));
341 341 xt_info_rdlock_bh();
342 rcu_read_lock_bh(); 342 private = table->private;
343 private = rcu_dereference(table->private); 343 table_base = private->entries[smp_processor_id()];
344 table_base = rcu_dereference(private->entries[smp_processor_id()]);
345 344
346 e = get_entry(table_base, private->hook_entry[hook]); 345 e = get_entry(table_base, private->hook_entry[hook]);
347 346
@@ -436,8 +435,7 @@ ipt_do_table(struct sk_buff *skb,
436 e = (void *)e + e->next_offset; 435 e = (void *)e + e->next_offset;
437 } 436 }
438 } while (!hotdrop); 437 } while (!hotdrop);
439 438 xt_info_rdunlock_bh();
440 rcu_read_unlock_bh();
441 439
442#ifdef DEBUG_ALLOW_ALL 440#ifdef DEBUG_ALLOW_ALL
443 return NF_ACCEPT; 441 return NF_ACCEPT;
@@ -896,10 +894,13 @@ get_counters(const struct xt_table_info *t,
896 894
897 /* Instead of clearing (by a previous call to memset()) 895 /* Instead of clearing (by a previous call to memset())
898 * the counters and using adds, we set the counters 896 * the counters and using adds, we set the counters
899 * with data used by 'current' CPU 897 * with data used by 'current' CPU.
900 * We dont care about preemption here. 898 *
899 * Bottom half has to be disabled to prevent deadlock
900 * if new softirq were to run and call ipt_do_table
901 */ 901 */
902 curcpu = raw_smp_processor_id(); 902 local_bh_disable();
903 curcpu = smp_processor_id();
903 904
904 i = 0; 905 i = 0;
905 IPT_ENTRY_ITERATE(t->entries[curcpu], 906 IPT_ENTRY_ITERATE(t->entries[curcpu],
@@ -912,74 +913,22 @@ get_counters(const struct xt_table_info *t,
912 if (cpu == curcpu) 913 if (cpu == curcpu)
913 continue; 914 continue;
914 i = 0; 915 i = 0;
916 xt_info_wrlock(cpu);
915 IPT_ENTRY_ITERATE(t->entries[cpu], 917 IPT_ENTRY_ITERATE(t->entries[cpu],
916 t->size, 918 t->size,
917 add_entry_to_counter, 919 add_entry_to_counter,
918 counters, 920 counters,
919 &i); 921 &i);
922 xt_info_wrunlock(cpu);
920 } 923 }
921
922}
923
924/* We're lazy, and add to the first CPU; overflow works its fey magic
925 * and everything is OK. */
926static int
927add_counter_to_entry(struct ipt_entry *e,
928 const struct xt_counters addme[],
929 unsigned int *i)
930{
931 ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
932
933 (*i)++;
934 return 0;
935}
936
937/* Take values from counters and add them back onto the current cpu */
938static void put_counters(struct xt_table_info *t,
939 const struct xt_counters counters[])
940{
941 unsigned int i, cpu;
942
943 local_bh_disable();
944 cpu = smp_processor_id();
945 i = 0;
946 IPT_ENTRY_ITERATE(t->entries[cpu],
947 t->size,
948 add_counter_to_entry,
949 counters,
950 &i);
951 local_bh_enable(); 924 local_bh_enable();
952} 925}
953 926
954
955static inline int
956zero_entry_counter(struct ipt_entry *e, void *arg)
957{
958 e->counters.bcnt = 0;
959 e->counters.pcnt = 0;
960 return 0;
961}
962
963static void
964clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
965{
966 unsigned int cpu;
967 const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
968
969 memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
970 for_each_possible_cpu(cpu) {
971 memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
972 IPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
973 zero_entry_counter, NULL);
974 }
975}
976
977static struct xt_counters * alloc_counters(struct xt_table *table) 927static struct xt_counters * alloc_counters(struct xt_table *table)
978{ 928{
979 unsigned int countersize; 929 unsigned int countersize;
980 struct xt_counters *counters; 930 struct xt_counters *counters;
981 struct xt_table_info *private = table->private; 931 struct xt_table_info *private = table->private;
982 struct xt_table_info *info;
983 932
984 /* We need atomic snapshot of counters: rest doesn't change 933 /* We need atomic snapshot of counters: rest doesn't change
985 (other than comefrom, which userspace doesn't care 934 (other than comefrom, which userspace doesn't care
@@ -988,30 +937,11 @@ static struct xt_counters * alloc_counters(struct xt_table *table)
988 counters = vmalloc_node(countersize, numa_node_id()); 937 counters = vmalloc_node(countersize, numa_node_id());
989 938
990 if (counters == NULL) 939 if (counters == NULL)
991 goto nomem; 940 return ERR_PTR(-ENOMEM);
992 941
993 info = xt_alloc_table_info(private->size); 942 get_counters(private, counters);
994 if (!info)
995 goto free_counters;
996
997 clone_counters(info, private);
998
999 mutex_lock(&table->lock);
1000 xt_table_entry_swap_rcu(private, info);
1001 synchronize_net(); /* Wait until smoke has cleared */
1002
1003 get_counters(info, counters);
1004 put_counters(private, counters);
1005 mutex_unlock(&table->lock);
1006
1007 xt_free_table_info(info);
1008 943
1009 return counters; 944 return counters;
1010
1011 free_counters:
1012 vfree(counters);
1013 nomem:
1014 return ERR_PTR(-ENOMEM);
1015} 945}
1016 946
1017static int 947static int
@@ -1306,8 +1236,9 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
1306 (newinfo->number <= oldinfo->initial_entries)) 1236 (newinfo->number <= oldinfo->initial_entries))
1307 module_put(t->me); 1237 module_put(t->me);
1308 1238
1309 /* Get the old counters. */ 1239 /* Get the old counters, and synchronize with replace */
1310 get_counters(oldinfo, counters); 1240 get_counters(oldinfo, counters);
1241
1311 /* Decrease module usage counts and free resource */ 1242 /* Decrease module usage counts and free resource */
1312 loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; 1243 loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
1313 IPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry, 1244 IPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,
@@ -1377,11 +1308,23 @@ do_replace(struct net *net, void __user *user, unsigned int len)
1377 return ret; 1308 return ret;
1378} 1309}
1379 1310
1311/* We're lazy, and add to the first CPU; overflow works its fey magic
1312 * and everything is OK. */
1313static int
1314add_counter_to_entry(struct ipt_entry *e,
1315 const struct xt_counters addme[],
1316 unsigned int *i)
1317{
1318 ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
1319
1320 (*i)++;
1321 return 0;
1322}
1380 1323
1381static int 1324static int
1382do_add_counters(struct net *net, void __user *user, unsigned int len, int compat) 1325do_add_counters(struct net *net, void __user *user, unsigned int len, int compat)
1383{ 1326{
1384 unsigned int i; 1327 unsigned int i, curcpu;
1385 struct xt_counters_info tmp; 1328 struct xt_counters_info tmp;
1386 struct xt_counters *paddc; 1329 struct xt_counters *paddc;
1387 unsigned int num_counters; 1330 unsigned int num_counters;
@@ -1437,25 +1380,26 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, int compat
1437 goto free; 1380 goto free;
1438 } 1381 }
1439 1382
1440 mutex_lock(&t->lock); 1383 local_bh_disable();
1441 private = t->private; 1384 private = t->private;
1442 if (private->number != num_counters) { 1385 if (private->number != num_counters) {
1443 ret = -EINVAL; 1386 ret = -EINVAL;
1444 goto unlock_up_free; 1387 goto unlock_up_free;
1445 } 1388 }
1446 1389
1447 preempt_disable();
1448 i = 0; 1390 i = 0;
1449 /* Choose the copy that is on our node */ 1391 /* Choose the copy that is on our node */
1450 loc_cpu_entry = private->entries[raw_smp_processor_id()]; 1392 curcpu = smp_processor_id();
1393 loc_cpu_entry = private->entries[curcpu];
1394 xt_info_wrlock(curcpu);
1451 IPT_ENTRY_ITERATE(loc_cpu_entry, 1395 IPT_ENTRY_ITERATE(loc_cpu_entry,
1452 private->size, 1396 private->size,
1453 add_counter_to_entry, 1397 add_counter_to_entry,
1454 paddc, 1398 paddc,
1455 &i); 1399 &i);
1456 preempt_enable(); 1400 xt_info_wrunlock(curcpu);
1457 unlock_up_free: 1401 unlock_up_free:
1458 mutex_unlock(&t->lock); 1402 local_bh_enable();
1459 xt_table_unlock(t); 1403 xt_table_unlock(t);
1460 module_put(t->me); 1404 module_put(t->me);
1461 free: 1405 free:
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 800ae8542471..219e165aea10 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -365,9 +365,9 @@ ip6t_do_table(struct sk_buff *skb,
365 365
366 IP_NF_ASSERT(table->valid_hooks & (1 << hook)); 366 IP_NF_ASSERT(table->valid_hooks & (1 << hook));
367 367
368 rcu_read_lock_bh(); 368 xt_info_rdlock_bh();
369 private = rcu_dereference(table->private); 369 private = table->private;
370 table_base = rcu_dereference(private->entries[smp_processor_id()]); 370 table_base = private->entries[smp_processor_id()];
371 371
372 e = get_entry(table_base, private->hook_entry[hook]); 372 e = get_entry(table_base, private->hook_entry[hook]);
373 373
@@ -466,7 +466,7 @@ ip6t_do_table(struct sk_buff *skb,
466#ifdef CONFIG_NETFILTER_DEBUG 466#ifdef CONFIG_NETFILTER_DEBUG
467 ((struct ip6t_entry *)table_base)->comefrom = NETFILTER_LINK_POISON; 467 ((struct ip6t_entry *)table_base)->comefrom = NETFILTER_LINK_POISON;
468#endif 468#endif
469 rcu_read_unlock_bh(); 469 xt_info_rdunlock_bh();
470 470
471#ifdef DEBUG_ALLOW_ALL 471#ifdef DEBUG_ALLOW_ALL
472 return NF_ACCEPT; 472 return NF_ACCEPT;
@@ -926,9 +926,12 @@ get_counters(const struct xt_table_info *t,
926 /* Instead of clearing (by a previous call to memset()) 926 /* Instead of clearing (by a previous call to memset())
927 * the counters and using adds, we set the counters 927 * the counters and using adds, we set the counters
928 * with data used by 'current' CPU 928 * with data used by 'current' CPU
929 * We dont care about preemption here. 929 *
930 * Bottom half has to be disabled to prevent deadlock
931 * if new softirq were to run and call ipt_do_table
930 */ 932 */
931 curcpu = raw_smp_processor_id(); 933 local_bh_disable();
934 curcpu = smp_processor_id();
932 935
933 i = 0; 936 i = 0;
934 IP6T_ENTRY_ITERATE(t->entries[curcpu], 937 IP6T_ENTRY_ITERATE(t->entries[curcpu],
@@ -941,72 +944,22 @@ get_counters(const struct xt_table_info *t,
941 if (cpu == curcpu) 944 if (cpu == curcpu)
942 continue; 945 continue;
943 i = 0; 946 i = 0;
947 xt_info_wrlock(cpu);
944 IP6T_ENTRY_ITERATE(t->entries[cpu], 948 IP6T_ENTRY_ITERATE(t->entries[cpu],
945 t->size, 949 t->size,
946 add_entry_to_counter, 950 add_entry_to_counter,
947 counters, 951 counters,
948 &i); 952 &i);
953 xt_info_wrunlock(cpu);
949 } 954 }
950}
951
952/* We're lazy, and add to the first CPU; overflow works its fey magic
953 * and everything is OK. */
954static int
955add_counter_to_entry(struct ip6t_entry *e,
956 const struct xt_counters addme[],
957 unsigned int *i)
958{
959 ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
960
961 (*i)++;
962 return 0;
963}
964
965/* Take values from counters and add them back onto the current cpu */
966static void put_counters(struct xt_table_info *t,
967 const struct xt_counters counters[])
968{
969 unsigned int i, cpu;
970
971 local_bh_disable();
972 cpu = smp_processor_id();
973 i = 0;
974 IP6T_ENTRY_ITERATE(t->entries[cpu],
975 t->size,
976 add_counter_to_entry,
977 counters,
978 &i);
979 local_bh_enable(); 955 local_bh_enable();
980} 956}
981 957
982static inline int
983zero_entry_counter(struct ip6t_entry *e, void *arg)
984{
985 e->counters.bcnt = 0;
986 e->counters.pcnt = 0;
987 return 0;
988}
989
990static void
991clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
992{
993 unsigned int cpu;
994 const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
995
996 memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
997 for_each_possible_cpu(cpu) {
998 memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
999 IP6T_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
1000 zero_entry_counter, NULL);
1001 }
1002}
1003
1004static struct xt_counters *alloc_counters(struct xt_table *table) 958static struct xt_counters *alloc_counters(struct xt_table *table)
1005{ 959{
1006 unsigned int countersize; 960 unsigned int countersize;
1007 struct xt_counters *counters; 961 struct xt_counters *counters;
1008 struct xt_table_info *private = table->private; 962 struct xt_table_info *private = table->private;
1009 struct xt_table_info *info;
1010 963
1011 /* We need atomic snapshot of counters: rest doesn't change 964 /* We need atomic snapshot of counters: rest doesn't change
1012 (other than comefrom, which userspace doesn't care 965 (other than comefrom, which userspace doesn't care
@@ -1015,30 +968,11 @@ static struct xt_counters *alloc_counters(struct xt_table *table)
1015 counters = vmalloc_node(countersize, numa_node_id()); 968 counters = vmalloc_node(countersize, numa_node_id());
1016 969
1017 if (counters == NULL) 970 if (counters == NULL)
1018 goto nomem; 971 return ERR_PTR(-ENOMEM);
1019 972
1020 info = xt_alloc_table_info(private->size); 973 get_counters(private, counters);
1021 if (!info)
1022 goto free_counters;
1023
1024 clone_counters(info, private);
1025
1026 mutex_lock(&table->lock);
1027 xt_table_entry_swap_rcu(private, info);
1028 synchronize_net(); /* Wait until smoke has cleared */
1029
1030 get_counters(info, counters);
1031 put_counters(private, counters);
1032 mutex_unlock(&table->lock);
1033
1034 xt_free_table_info(info);
1035 974
1036 return counters; 975 return counters;
1037
1038 free_counters:
1039 vfree(counters);
1040 nomem:
1041 return ERR_PTR(-ENOMEM);
1042} 976}
1043 977
1044static int 978static int
@@ -1334,8 +1268,9 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
1334 (newinfo->number <= oldinfo->initial_entries)) 1268 (newinfo->number <= oldinfo->initial_entries))
1335 module_put(t->me); 1269 module_put(t->me);
1336 1270
1337 /* Get the old counters. */ 1271 /* Get the old counters, and synchronize with replace */
1338 get_counters(oldinfo, counters); 1272 get_counters(oldinfo, counters);
1273
1339 /* Decrease module usage counts and free resource */ 1274 /* Decrease module usage counts and free resource */
1340 loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; 1275 loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
1341 IP6T_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry, 1276 IP6T_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,
@@ -1405,11 +1340,24 @@ do_replace(struct net *net, void __user *user, unsigned int len)
1405 return ret; 1340 return ret;
1406} 1341}
1407 1342
1343/* We're lazy, and add to the first CPU; overflow works its fey magic
1344 * and everything is OK. */
1345static int
1346add_counter_to_entry(struct ip6t_entry *e,
1347 const struct xt_counters addme[],
1348 unsigned int *i)
1349{
1350 ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
1351
1352 (*i)++;
1353 return 0;
1354}
1355
1408static int 1356static int
1409do_add_counters(struct net *net, void __user *user, unsigned int len, 1357do_add_counters(struct net *net, void __user *user, unsigned int len,
1410 int compat) 1358 int compat)
1411{ 1359{
1412 unsigned int i; 1360 unsigned int i, curcpu;
1413 struct xt_counters_info tmp; 1361 struct xt_counters_info tmp;
1414 struct xt_counters *paddc; 1362 struct xt_counters *paddc;
1415 unsigned int num_counters; 1363 unsigned int num_counters;
@@ -1465,25 +1413,28 @@ do_add_counters(struct net *net, void __user *user, unsigned int len,
1465 goto free; 1413 goto free;
1466 } 1414 }
1467 1415
1468 mutex_lock(&t->lock); 1416
1417 local_bh_disable();
1469 private = t->private; 1418 private = t->private;
1470 if (private->number != num_counters) { 1419 if (private->number != num_counters) {
1471 ret = -EINVAL; 1420 ret = -EINVAL;
1472 goto unlock_up_free; 1421 goto unlock_up_free;
1473 } 1422 }
1474 1423
1475 preempt_disable();
1476 i = 0; 1424 i = 0;
1477 /* Choose the copy that is on our node */ 1425 /* Choose the copy that is on our node */
1478 loc_cpu_entry = private->entries[raw_smp_processor_id()]; 1426 curcpu = smp_processor_id();
1427 xt_info_wrlock(curcpu);
1428 loc_cpu_entry = private->entries[curcpu];
1479 IP6T_ENTRY_ITERATE(loc_cpu_entry, 1429 IP6T_ENTRY_ITERATE(loc_cpu_entry,
1480 private->size, 1430 private->size,
1481 add_counter_to_entry, 1431 add_counter_to_entry,
1482 paddc, 1432 paddc,
1483 &i); 1433 &i);
1484 preempt_enable(); 1434 xt_info_wrunlock(curcpu);
1435
1485 unlock_up_free: 1436 unlock_up_free:
1486 mutex_unlock(&t->lock); 1437 local_bh_enable();
1487 xt_table_unlock(t); 1438 xt_table_unlock(t);
1488 module_put(t->me); 1439 module_put(t->me);
1489 free: 1440 free:
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 509a95621f9f..150e5cf62f85 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -625,20 +625,6 @@ void xt_free_table_info(struct xt_table_info *info)
625} 625}
626EXPORT_SYMBOL(xt_free_table_info); 626EXPORT_SYMBOL(xt_free_table_info);
627 627
628void xt_table_entry_swap_rcu(struct xt_table_info *oldinfo,
629 struct xt_table_info *newinfo)
630{
631 unsigned int cpu;
632
633 for_each_possible_cpu(cpu) {
634 void *p = oldinfo->entries[cpu];
635 rcu_assign_pointer(oldinfo->entries[cpu], newinfo->entries[cpu]);
636 newinfo->entries[cpu] = p;
637 }
638
639}
640EXPORT_SYMBOL_GPL(xt_table_entry_swap_rcu);
641
642/* Find table by name, grabs mutex & ref. Returns ERR_PTR() on error. */ 628/* Find table by name, grabs mutex & ref. Returns ERR_PTR() on error. */
643struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af, 629struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af,
644 const char *name) 630 const char *name)
@@ -676,32 +662,43 @@ void xt_compat_unlock(u_int8_t af)
676EXPORT_SYMBOL_GPL(xt_compat_unlock); 662EXPORT_SYMBOL_GPL(xt_compat_unlock);
677#endif 663#endif
678 664
665DEFINE_PER_CPU(struct xt_info_lock, xt_info_locks);
666EXPORT_PER_CPU_SYMBOL_GPL(xt_info_locks);
667
668
679struct xt_table_info * 669struct xt_table_info *
680xt_replace_table(struct xt_table *table, 670xt_replace_table(struct xt_table *table,
681 unsigned int num_counters, 671 unsigned int num_counters,
682 struct xt_table_info *newinfo, 672 struct xt_table_info *newinfo,
683 int *error) 673 int *error)
684{ 674{
685 struct xt_table_info *oldinfo, *private; 675 struct xt_table_info *private;
686 676
687 /* Do the substitution. */ 677 /* Do the substitution. */
688 mutex_lock(&table->lock); 678 local_bh_disable();
689 private = table->private; 679 private = table->private;
680
690 /* Check inside lock: is the old number correct? */ 681 /* Check inside lock: is the old number correct? */
691 if (num_counters != private->number) { 682 if (num_counters != private->number) {
692 duprintf("num_counters != table->private->number (%u/%u)\n", 683 duprintf("num_counters != table->private->number (%u/%u)\n",
693 num_counters, private->number); 684 num_counters, private->number);
694 mutex_unlock(&table->lock); 685 local_bh_enable();
695 *error = -EAGAIN; 686 *error = -EAGAIN;
696 return NULL; 687 return NULL;
697 } 688 }
698 oldinfo = private;
699 rcu_assign_pointer(table->private, newinfo);
700 newinfo->initial_entries = oldinfo->initial_entries;
701 mutex_unlock(&table->lock);
702 689
703 synchronize_net(); 690 table->private = newinfo;
704 return oldinfo; 691 newinfo->initial_entries = private->initial_entries;
692
693 /*
694 * Even though table entries have now been swapped, other CPU's
695 * may still be using the old entries. This is okay, because
696 * resynchronization happens because of the locking done
697 * during the get_counters() routine.
698 */
699 local_bh_enable();
700
701 return private;
705} 702}
706EXPORT_SYMBOL_GPL(xt_replace_table); 703EXPORT_SYMBOL_GPL(xt_replace_table);
707 704
@@ -734,7 +731,6 @@ struct xt_table *xt_register_table(struct net *net, struct xt_table *table,
734 731
735 /* Simplifies replace_table code. */ 732 /* Simplifies replace_table code. */
736 table->private = bootstrap; 733 table->private = bootstrap;
737 mutex_init(&table->lock);
738 734
739 if (!xt_replace_table(table, 0, newinfo, &ret)) 735 if (!xt_replace_table(table, 0, newinfo, &ret))
740 goto unlock; 736 goto unlock;
@@ -1147,7 +1143,14 @@ static struct pernet_operations xt_net_ops = {
1147 1143
1148static int __init xt_init(void) 1144static int __init xt_init(void)
1149{ 1145{
1150 int i, rv; 1146 unsigned int i;
1147 int rv;
1148
1149 for_each_possible_cpu(i) {
1150 struct xt_info_lock *lock = &per_cpu(xt_info_locks, i);
1151 spin_lock_init(&lock->lock);
1152 lock->readers = 0;
1153 }
1151 1154
1152 xt = kmalloc(sizeof(struct xt_af) * NFPROTO_NUMPROTO, GFP_KERNEL); 1155 xt = kmalloc(sizeof(struct xt_af) * NFPROTO_NUMPROTO, GFP_KERNEL);
1153 if (!xt) 1156 if (!xt)