author	Stephen Hemminger <shemminger@vyatta.com>	2009-04-29 01:36:33 -0400
committer	David S. Miller <davem@davemloft.net>	2009-04-29 01:36:33 -0400
commit	942e4a2bd680c606af0211e64eb216be2e19bf61 (patch)
tree	a83af49242d4a8d53aa0f3b5814eb17da72edc09 /net/ipv4
parent	bf368e4e70cd4e0f880923c44e95a4273d725ab4 (diff)
netfilter: revised locking for x_tables
The x_tables are organized with a table structure and per-cpu copies of the counters and rules. On older kernels there was a reader/writer lock per table, which was a performance bottleneck. In 2.6.30-rc this was converted to RCU for the counters/rules, which solved the performance problem for do_table but made replacing rules much slower because of the necessary RCU grace period.

This version uses a per-cpu set of spinlocks and counters to allow table processing to proceed without the cache thrashing of a global reader lock, while keeping the same performance for table updates.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
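For readers unfamiliar with the helpers used in the diff below (xt_info_rdlock_bh, xt_info_rdunlock_bh, xt_info_wrlock, xt_info_wrunlock), here is a minimal sketch of the per-cpu locking scheme the message describes. This is an illustration only, assuming one plain spinlock per CPU; the actual helpers added by this patch live in include/linux/netfilter/x_tables.h and may differ in detail:

/* Sketch only: one spinlock per CPU protecting that CPU's private
 * copy of the rules and counters. */
DEFINE_PER_CPU(spinlock_t, xt_info_locks);

/* Packet path (arpt_do_table/ipt_do_table): take only the local CPU's
 * lock, with bottom halves disabled so a softirq cannot re-enter and
 * deadlock on the same lock. */
static inline void xt_info_rdlock_bh(void)
{
	local_bh_disable();
	spin_lock(&__get_cpu_var(xt_info_locks));
}

static inline void xt_info_rdunlock_bh(void)
{
	spin_unlock(&__get_cpu_var(xt_info_locks));
	local_bh_enable();
}

/* Counter readout and rule replacement: take the target CPU's lock so
 * that CPU's counters can be read or adjusted without stopping every
 * CPU at once. */
static inline void xt_info_wrlock(unsigned int cpu)
{
	spin_lock(&per_cpu(xt_info_locks, cpu));
}

static inline void xt_info_wrunlock(unsigned int cpu)
{
	spin_unlock(&per_cpu(xt_info_locks, cpu));
}

The idea is that the hot path only ever touches a CPU-local lock, while the rare update path visits each CPU's lock in turn, avoiding both the shared reader/writer lock and the RCU grace-period wait.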
Diffstat (limited to 'net/ipv4')
-rw-r--r--	net/ipv4/netfilter/arp_tables.c	125
-rw-r--r--	net/ipv4/netfilter/ip_tables.c	126
2 files changed, 71 insertions, 180 deletions
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 5ba533d234db..831fe1879dc0 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -253,9 +253,9 @@ unsigned int arpt_do_table(struct sk_buff *skb,
 	indev = in ? in->name : nulldevname;
 	outdev = out ? out->name : nulldevname;
 
-	rcu_read_lock_bh();
-	private = rcu_dereference(table->private);
-	table_base = rcu_dereference(private->entries[smp_processor_id()]);
+	xt_info_rdlock_bh();
+	private = table->private;
+	table_base = private->entries[smp_processor_id()];
 
 	e = get_entry(table_base, private->hook_entry[hook]);
 	back = get_entry(table_base, private->underflow[hook]);
@@ -273,6 +273,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
 
 		hdr_len = sizeof(*arp) + (2 * sizeof(struct in_addr)) +
 			(2 * skb->dev->addr_len);
+
 		ADD_COUNTER(e->counters, hdr_len, 1);
 
 		t = arpt_get_target(e);
@@ -328,8 +329,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
 			e = (void *)e + e->next_offset;
 		}
 	} while (!hotdrop);
-
-	rcu_read_unlock_bh();
+	xt_info_rdunlock_bh();
 
 	if (hotdrop)
 		return NF_DROP;
@@ -711,9 +711,12 @@ static void get_counters(const struct xt_table_info *t,
 	/* Instead of clearing (by a previous call to memset())
 	 * the counters and using adds, we set the counters
 	 * with data used by 'current' CPU
-	 * We dont care about preemption here.
+	 *
+	 * Bottom half has to be disabled to prevent deadlock
+	 * if new softirq were to run and call ipt_do_table
 	 */
-	curcpu = raw_smp_processor_id();
+	local_bh_disable();
+	curcpu = smp_processor_id();
 
 	i = 0;
 	ARPT_ENTRY_ITERATE(t->entries[curcpu],
@@ -726,73 +729,22 @@ static void get_counters(const struct xt_table_info *t,
 		if (cpu == curcpu)
 			continue;
 		i = 0;
+		xt_info_wrlock(cpu);
 		ARPT_ENTRY_ITERATE(t->entries[cpu],
 				   t->size,
 				   add_entry_to_counter,
 				   counters,
 				   &i);
+		xt_info_wrunlock(cpu);
 	}
-}
-
-
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static int
-add_counter_to_entry(struct arpt_entry *e,
-		     const struct xt_counters addme[],
-		     unsigned int *i)
-{
-	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-	(*i)++;
-	return 0;
-}
-
-/* Take values from counters and add them back onto the current cpu */
-static void put_counters(struct xt_table_info *t,
-			 const struct xt_counters counters[])
-{
-	unsigned int i, cpu;
-
-	local_bh_disable();
-	cpu = smp_processor_id();
-	i = 0;
-	ARPT_ENTRY_ITERATE(t->entries[cpu],
-			   t->size,
-			   add_counter_to_entry,
-			   counters,
-			   &i);
 	local_bh_enable();
 }
 
-static inline int
-zero_entry_counter(struct arpt_entry *e, void *arg)
-{
-	e->counters.bcnt = 0;
-	e->counters.pcnt = 0;
-	return 0;
-}
-
-static void
-clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
-{
-	unsigned int cpu;
-	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
-
-	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
-	for_each_possible_cpu(cpu) {
-		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
-		ARPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
-				   zero_entry_counter, NULL);
-	}
-}
-
 static struct xt_counters *alloc_counters(struct xt_table *table)
 {
 	unsigned int countersize;
 	struct xt_counters *counters;
 	struct xt_table_info *private = table->private;
-	struct xt_table_info *info;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	 * (other than comefrom, which userspace doesn't care
@@ -802,30 +754,11 @@ static struct xt_counters *alloc_counters(struct xt_table *table)
 	counters = vmalloc_node(countersize, numa_node_id());
 
 	if (counters == NULL)
-		goto nomem;
-
-	info = xt_alloc_table_info(private->size);
-	if (!info)
-		goto free_counters;
-
-	clone_counters(info, private);
-
-	mutex_lock(&table->lock);
-	xt_table_entry_swap_rcu(private, info);
-	synchronize_net();	/* Wait until smoke has cleared */
+		return ERR_PTR(-ENOMEM);
 
-	get_counters(info, counters);
-	put_counters(private, counters);
-	mutex_unlock(&table->lock);
-
-	xt_free_table_info(info);
+	get_counters(private, counters);
 
 	return counters;
-
- free_counters:
-	vfree(counters);
- nomem:
-	return ERR_PTR(-ENOMEM);
 }
 
 static int copy_entries_to_user(unsigned int total_size,
@@ -1094,8 +1027,9 @@ static int __do_replace(struct net *net, const char *name,
 	    (newinfo->number <= oldinfo->initial_entries))
 		module_put(t->me);
 
-	/* Get the old counters. */
+	/* Get the old counters, and synchronize with replace */
 	get_counters(oldinfo, counters);
+
 	/* Decrease module usage counts and free resource */
 	loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
 	ARPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,
@@ -1165,10 +1099,23 @@ static int do_replace(struct net *net, void __user *user, unsigned int len)
 	return ret;
 }
 
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct arpt_entry *e,
+		     const struct xt_counters addme[],
+		     unsigned int *i)
+{
+	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
+
 static int do_add_counters(struct net *net, void __user *user, unsigned int len,
 			   int compat)
 {
-	unsigned int i;
+	unsigned int i, curcpu;
 	struct xt_counters_info tmp;
 	struct xt_counters *paddc;
 	unsigned int num_counters;
@@ -1224,26 +1171,26 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len,
 		goto free;
 	}
 
-	mutex_lock(&t->lock);
+	local_bh_disable();
 	private = t->private;
 	if (private->number != num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
 
-	preempt_disable();
 	i = 0;
 	/* Choose the copy that is on our node */
-	loc_cpu_entry = private->entries[smp_processor_id()];
+	curcpu = smp_processor_id();
+	loc_cpu_entry = private->entries[curcpu];
+	xt_info_wrlock(curcpu);
 	ARPT_ENTRY_ITERATE(loc_cpu_entry,
 			   private->size,
 			   add_counter_to_entry,
 			   paddc,
 			   &i);
-	preempt_enable();
+	xt_info_wrunlock(curcpu);
  unlock_up_free:
-	mutex_unlock(&t->lock);
-
+	local_bh_enable();
 	xt_table_unlock(t);
 	module_put(t->me);
  free:
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 810c0b62c7d4..2ec8d7290c40 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -338,10 +338,9 @@ ipt_do_table(struct sk_buff *skb,
 	tgpar.hooknum = hook;
 
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
-
-	rcu_read_lock_bh();
-	private = rcu_dereference(table->private);
-	table_base = rcu_dereference(private->entries[smp_processor_id()]);
+	xt_info_rdlock_bh();
+	private = table->private;
+	table_base = private->entries[smp_processor_id()];
 
 	e = get_entry(table_base, private->hook_entry[hook]);
 
@@ -436,8 +435,7 @@ ipt_do_table(struct sk_buff *skb,
 			e = (void *)e + e->next_offset;
 		}
 	} while (!hotdrop);
-
-	rcu_read_unlock_bh();
+	xt_info_rdunlock_bh();
 
 #ifdef DEBUG_ALLOW_ALL
 	return NF_ACCEPT;
@@ -896,10 +894,13 @@ get_counters(const struct xt_table_info *t,
 
 	/* Instead of clearing (by a previous call to memset())
 	 * the counters and using adds, we set the counters
-	 * with data used by 'current' CPU
-	 * We dont care about preemption here.
+	 * with data used by 'current' CPU.
+	 *
+	 * Bottom half has to be disabled to prevent deadlock
+	 * if new softirq were to run and call ipt_do_table
 	 */
-	curcpu = raw_smp_processor_id();
+	local_bh_disable();
+	curcpu = smp_processor_id();
 
 	i = 0;
 	IPT_ENTRY_ITERATE(t->entries[curcpu],
@@ -912,74 +913,22 @@ get_counters(const struct xt_table_info *t,
 		if (cpu == curcpu)
 			continue;
 		i = 0;
+		xt_info_wrlock(cpu);
 		IPT_ENTRY_ITERATE(t->entries[cpu],
 				  t->size,
 				  add_entry_to_counter,
 				  counters,
 				  &i);
+		xt_info_wrunlock(cpu);
 	}
-
-}
-
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static int
-add_counter_to_entry(struct ipt_entry *e,
-		     const struct xt_counters addme[],
-		     unsigned int *i)
-{
-	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-	(*i)++;
-	return 0;
-}
-
-/* Take values from counters and add them back onto the current cpu */
-static void put_counters(struct xt_table_info *t,
-			 const struct xt_counters counters[])
-{
-	unsigned int i, cpu;
-
-	local_bh_disable();
-	cpu = smp_processor_id();
-	i = 0;
-	IPT_ENTRY_ITERATE(t->entries[cpu],
-			  t->size,
-			  add_counter_to_entry,
-			  counters,
-			  &i);
 	local_bh_enable();
 }
 
-
-static inline int
-zero_entry_counter(struct ipt_entry *e, void *arg)
-{
-	e->counters.bcnt = 0;
-	e->counters.pcnt = 0;
-	return 0;
-}
-
-static void
-clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
-{
-	unsigned int cpu;
-	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
-
-	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
-	for_each_possible_cpu(cpu) {
-		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
-		IPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
-				  zero_entry_counter, NULL);
-	}
-}
-
 static struct xt_counters * alloc_counters(struct xt_table *table)
 {
 	unsigned int countersize;
 	struct xt_counters *counters;
 	struct xt_table_info *private = table->private;
-	struct xt_table_info *info;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	   (other than comefrom, which userspace doesn't care
@@ -988,30 +937,11 @@ static struct xt_counters * alloc_counters(struct xt_table *table)
 	counters = vmalloc_node(countersize, numa_node_id());
 
 	if (counters == NULL)
-		goto nomem;
+		return ERR_PTR(-ENOMEM);
 
-	info = xt_alloc_table_info(private->size);
-	if (!info)
-		goto free_counters;
-
-	clone_counters(info, private);
-
-	mutex_lock(&table->lock);
-	xt_table_entry_swap_rcu(private, info);
-	synchronize_net();	/* Wait until smoke has cleared */
-
-	get_counters(info, counters);
-	put_counters(private, counters);
-	mutex_unlock(&table->lock);
-
-	xt_free_table_info(info);
+	get_counters(private, counters);
 
 	return counters;
-
- free_counters:
-	vfree(counters);
- nomem:
-	return ERR_PTR(-ENOMEM);
 }
 
 static int
@@ -1306,8 +1236,9 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
 	    (newinfo->number <= oldinfo->initial_entries))
 		module_put(t->me);
 
-	/* Get the old counters. */
+	/* Get the old counters, and synchronize with replace */
 	get_counters(oldinfo, counters);
+
 	/* Decrease module usage counts and free resource */
 	loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
 	IPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,
@@ -1377,11 +1308,23 @@ do_replace(struct net *net, void __user *user, unsigned int len)
 	return ret;
 }
 
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct ipt_entry *e,
+		     const struct xt_counters addme[],
+		     unsigned int *i)
+{
+	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
+
 static int
 do_add_counters(struct net *net, void __user *user, unsigned int len, int compat)
 {
-	unsigned int i;
+	unsigned int i, curcpu;
 	struct xt_counters_info tmp;
 	struct xt_counters *paddc;
 	unsigned int num_counters;
@@ -1437,25 +1380,26 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, int compat
 		goto free;
 	}
 
-	mutex_lock(&t->lock);
+	local_bh_disable();
 	private = t->private;
 	if (private->number != num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
 
-	preempt_disable();
 	i = 0;
 	/* Choose the copy that is on our node */
-	loc_cpu_entry = private->entries[raw_smp_processor_id()];
+	curcpu = smp_processor_id();
+	loc_cpu_entry = private->entries[curcpu];
+	xt_info_wrlock(curcpu);
 	IPT_ENTRY_ITERATE(loc_cpu_entry,
 			  private->size,
 			  add_counter_to_entry,
 			  paddc,
 			  &i);
-	preempt_enable();
+	xt_info_wrunlock(curcpu);
  unlock_up_free:
-	mutex_unlock(&t->lock);
+	local_bh_enable();
 	xt_table_unlock(t);
 	module_put(t->me);
  free: