author	Stephen Hemminger <shemminger@vyatta.com>	2009-02-20 04:35:32 -0500
committer	Patrick McHardy <kaber@trash.net>	2009-02-20 04:35:32 -0500
commit	784544739a25c30637397ace5489eeb6e15d7d49 (patch)
tree	c48bbf30f3eb753858de9a03b74e81925cf39018 /net/ipv4
parent	323dbf96382f057d035afce0237f08e18571ac1d (diff)
netfilter: iptables: lock free counters
The reader/writer lock in ip_tables is acquired in the critical path of processing packets and is one of the reasons just loading iptables can cause a 20% performance loss. The rwlock serves two functions:

1) it prevents changes to table state (xt_replace) while the table is in use. This is now handled by using RCU on the xt_table. When a table is replaced, the new table(s) are put in place and the old table(s) are freed after an RCU grace period.

2) it provides synchronization when accessing the counter values. This is now handled by swapping in new table_info entries for each cpu, then summing the old values and putting the result back onto one cpu. On a busy system it may cause sampling to occur at different times on each cpu, but no packet/byte counts are lost in the process.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

Successfully tested on my dual quad core machine too, but iptables only (no ipv6 here). BTW, my new "tbench 8" result is 2450 MB/s (it was 2150 MB/s not so long ago).

Acked-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
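The counter scheme in (2) is easiest to see in miniature. What follows is a hedged, userspace-only sketch of the idea, not kernel code: the types are simplified, the RCU publish/grace-period steps (xt_table_entry_swap_rcu(), synchronize_net()) are reduced to a plain pointer exchange with comments, and the helper names clone_zeroed(), sum_counters() and put_back() are invented stand-ins that merely mirror the roles of clone_counters(), get_counters() and put_counters() in the diff below.

/* Minimal model of the snapshot scheme used by alloc_counters() after this
 * change.  Assumptions: fixed cpu/rule counts, no real RCU, no locking. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define NCPUS    4
#define NENTRIES 3

struct counter { unsigned long pcnt, bcnt; };

struct table_info {
	struct counter entries[NCPUS][NENTRIES];	/* per-cpu rule counters */
};

struct table {
	struct table_info *private;			/* swapped under RCU in the kernel */
};

/* Step 1: build a zeroed copy that packet processing can keep hitting. */
static struct table_info *clone_zeroed(const struct table_info *info)
{
	(void)info;	/* the real clone_counters() also copies the rule blobs */
	return calloc(1, sizeof(struct table_info));
}

/* Step 2: after the swap, sum the retired per-cpu copies into one snapshot. */
static void sum_counters(const struct table_info *info, struct counter *out)
{
	for (int cpu = 0; cpu < NCPUS; cpu++)
		for (int i = 0; i < NENTRIES; i++) {
			out[i].pcnt += info->entries[cpu][i].pcnt;
			out[i].bcnt += info->entries[cpu][i].bcnt;
		}
}

/* Step 3: add the snapshot back onto one cpu of the live table so no
 * packet/byte counts are lost (what put_counters() does in the patch). */
static void put_back(struct table_info *live, int cpu, const struct counter *snap)
{
	for (int i = 0; i < NENTRIES; i++) {
		live->entries[cpu][i].pcnt += snap[i].pcnt;
		live->entries[cpu][i].bcnt += snap[i].bcnt;
	}
}

int main(void)
{
	struct table t = { .private = calloc(1, sizeof(struct table_info)) };

	/* pretend some packets were counted on different cpus */
	t.private->entries[0][1] = (struct counter){ .pcnt = 5, .bcnt = 500 };
	t.private->entries[2][1] = (struct counter){ .pcnt = 7, .bcnt = 700 };

	struct table_info *old = t.private;
	t.private = clone_zeroed(old);		/* kernel: xt_table_entry_swap_rcu() */
	/* kernel: synchronize_net() waits here for readers of "old" to finish */

	struct counter snap[NENTRIES];
	memset(snap, 0, sizeof(snap));
	sum_counters(old, snap);		/* kernel: get_counters() */
	put_back(t.private, 0, snap);		/* kernel: put_counters() */
	free(old);

	printf("entry 1: %lu packets, %lu bytes\n", snap[1].pcnt, snap[1].bcnt);
	free(t.private);
	return 0;
}

The point is that readers never block: packets keep bumping whichever per-cpu copy is currently published, and the retired copy's totals are folded back onto one cpu afterwards, so the snapshot handed to userspace is complete even though it is taken lock-free.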
Diffstat (limited to 'net/ipv4')
-rw-r--r--	net/ipv4/netfilter/arp_tables.c	115
-rw-r--r--	net/ipv4/netfilter/ip_tables.c	120
2 files changed, 175 insertions, 60 deletions
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index b5db46342614..64a7c6ce0b98 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -261,9 +261,10 @@ unsigned int arpt_do_table(struct sk_buff *skb,
 	indev = in ? in->name : nulldevname;
 	outdev = out ? out->name : nulldevname;
 
-	read_lock_bh(&table->lock);
-	private = table->private;
-	table_base = (void *)private->entries[smp_processor_id()];
+	rcu_read_lock();
+	private = rcu_dereference(table->private);
+	table_base = rcu_dereference(private->entries[smp_processor_id()]);
+
 	e = get_entry(table_base, private->hook_entry[hook]);
 	back = get_entry(table_base, private->underflow[hook]);
 
@@ -335,7 +336,8 @@ unsigned int arpt_do_table(struct sk_buff *skb,
 			e = (void *)e + e->next_offset;
 		}
 	} while (!hotdrop);
-	read_unlock_bh(&table->lock);
+
+	rcu_read_unlock();
 
 	if (hotdrop)
 		return NF_DROP;
@@ -738,11 +740,65 @@ static void get_counters(const struct xt_table_info *t,
 	}
 }
 
-static inline struct xt_counters *alloc_counters(struct xt_table *table)
+
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct arpt_entry *e,
+		     const struct xt_counters addme[],
+		     unsigned int *i)
+{
+	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
+
+/* Take values from counters and add them back onto the current cpu */
+static void put_counters(struct xt_table_info *t,
+			 const struct xt_counters counters[])
+{
+	unsigned int i, cpu;
+
+	local_bh_disable();
+	cpu = smp_processor_id();
+	i = 0;
+	ARPT_ENTRY_ITERATE(t->entries[cpu],
+			   t->size,
+			   add_counter_to_entry,
+			   counters,
+			   &i);
+	local_bh_enable();
+}
+
+static inline int
+zero_entry_counter(struct arpt_entry *e, void *arg)
+{
+	e->counters.bcnt = 0;
+	e->counters.pcnt = 0;
+	return 0;
+}
+
+static void
+clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
+{
+	unsigned int cpu;
+	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
+
+	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
+	for_each_possible_cpu(cpu) {
+		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
+		ARPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
+				   zero_entry_counter, NULL);
+	}
+}
+
+static struct xt_counters *alloc_counters(struct xt_table *table)
 {
 	unsigned int countersize;
 	struct xt_counters *counters;
-	const struct xt_table_info *private = table->private;
+	struct xt_table_info *private = table->private;
+	struct xt_table_info *info;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	 * (other than comefrom, which userspace doesn't care
@@ -752,14 +808,30 @@ static inline struct xt_counters *alloc_counters(struct xt_table *table)
 	counters = vmalloc_node(countersize, numa_node_id());
 
 	if (counters == NULL)
-		return ERR_PTR(-ENOMEM);
+		goto nomem;
+
+	info = xt_alloc_table_info(private->size);
+	if (!info)
+		goto free_counters;
 
-	/* First, sum counters... */
-	write_lock_bh(&table->lock);
-	get_counters(private, counters);
-	write_unlock_bh(&table->lock);
+	clone_counters(info, private);
+
+	mutex_lock(&table->lock);
+	xt_table_entry_swap_rcu(private, info);
+	synchronize_net();	/* Wait until smoke has cleared */
+
+	get_counters(info, counters);
+	put_counters(private, counters);
+	mutex_unlock(&table->lock);
+
+	xt_free_table_info(info);
 
 	return counters;
+
+ free_counters:
+	vfree(counters);
+ nomem:
+	return ERR_PTR(-ENOMEM);
 }
 
 static int copy_entries_to_user(unsigned int total_size,
@@ -1099,20 +1171,6 @@ static int do_replace(struct net *net, void __user *user, unsigned int len)
 	return ret;
 }
 
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK.
- */
-static inline int add_counter_to_entry(struct arpt_entry *e,
-				       const struct xt_counters addme[],
-				       unsigned int *i)
-{
-
-	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-	(*i)++;
-	return 0;
-}
-
 static int do_add_counters(struct net *net, void __user *user, unsigned int len,
 			   int compat)
 {
@@ -1172,13 +1230,14 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len,
 		goto free;
 	}
 
-	write_lock_bh(&t->lock);
+	mutex_lock(&t->lock);
 	private = t->private;
 	if (private->number != num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
 
+	preempt_disable();
 	i = 0;
 	/* Choose the copy that is on our node */
 	loc_cpu_entry = private->entries[smp_processor_id()];
@@ -1187,8 +1246,10 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len,
 				  add_counter_to_entry,
 				  paddc,
 				  &i);
+	preempt_enable();
 unlock_up_free:
-	write_unlock_bh(&t->lock);
+	mutex_unlock(&t->lock);
+
 	xt_table_unlock(t);
 	module_put(t->me);
 free:
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index ef8b6ca068b2..08cde5bd70a5 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -347,10 +347,12 @@ ipt_do_table(struct sk_buff *skb,
 	mtpar.family = tgpar.family = NFPROTO_IPV4;
 	tgpar.hooknum = hook;
 
-	read_lock_bh(&table->lock);
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
-	private = table->private;
-	table_base = (void *)private->entries[smp_processor_id()];
+
+	rcu_read_lock();
+	private = rcu_dereference(table->private);
+	table_base = rcu_dereference(private->entries[smp_processor_id()]);
+
 	e = get_entry(table_base, private->hook_entry[hook]);
 
 	/* For return from builtin chain */
@@ -445,7 +447,7 @@ ipt_do_table(struct sk_buff *skb,
 		}
 	} while (!hotdrop);
 
-	read_unlock_bh(&table->lock);
+	rcu_read_unlock();
 
 #ifdef DEBUG_ALLOW_ALL
 	return NF_ACCEPT;
@@ -924,13 +926,68 @@ get_counters(const struct xt_table_info *t,
 				  counters,
 				  &i);
 	}
+
+}
+
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct ipt_entry *e,
+		     const struct xt_counters addme[],
+		     unsigned int *i)
+{
+	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
+
+/* Take values from counters and add them back onto the current cpu */
+static void put_counters(struct xt_table_info *t,
+			 const struct xt_counters counters[])
+{
+	unsigned int i, cpu;
+
+	local_bh_disable();
+	cpu = smp_processor_id();
+	i = 0;
+	IPT_ENTRY_ITERATE(t->entries[cpu],
+			  t->size,
+			  add_counter_to_entry,
+			  counters,
+			  &i);
+	local_bh_enable();
+}
+
+
+static inline int
+zero_entry_counter(struct ipt_entry *e, void *arg)
+{
+	e->counters.bcnt = 0;
+	e->counters.pcnt = 0;
+	return 0;
+}
+
+static void
+clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
+{
+	unsigned int cpu;
+	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
+
+	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
+	for_each_possible_cpu(cpu) {
+		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
+		IPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
+				  zero_entry_counter, NULL);
+	}
 }
 
 static struct xt_counters * alloc_counters(struct xt_table *table)
 {
 	unsigned int countersize;
 	struct xt_counters *counters;
-	const struct xt_table_info *private = table->private;
+	struct xt_table_info *private = table->private;
+	struct xt_table_info *info;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	   (other than comefrom, which userspace doesn't care
@@ -939,14 +996,30 @@ static struct xt_counters * alloc_counters(struct xt_table *table)
 	counters = vmalloc_node(countersize, numa_node_id());
 
 	if (counters == NULL)
-		return ERR_PTR(-ENOMEM);
+		goto nomem;
 
-	/* First, sum counters... */
-	write_lock_bh(&table->lock);
-	get_counters(private, counters);
-	write_unlock_bh(&table->lock);
+	info = xt_alloc_table_info(private->size);
+	if (!info)
+		goto free_counters;
+
+	clone_counters(info, private);
+
+	mutex_lock(&table->lock);
+	xt_table_entry_swap_rcu(private, info);
+	synchronize_net();	/* Wait until smoke has cleared */
+
+	get_counters(info, counters);
+	put_counters(private, counters);
+	mutex_unlock(&table->lock);
+
+	xt_free_table_info(info);
 
 	return counters;
+
+ free_counters:
+	vfree(counters);
+ nomem:
+	return ERR_PTR(-ENOMEM);
 }
 
 static int
@@ -1312,27 +1385,6 @@ do_replace(struct net *net, void __user *user, unsigned int len)
 	return ret;
 }
 
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static int
-add_counter_to_entry(struct ipt_entry *e,
-		     const struct xt_counters addme[],
-		     unsigned int *i)
-{
-#if 0
-	duprintf("add_counter: Entry %u %lu/%lu + %lu/%lu\n",
-		 *i,
-		 (long unsigned int)e->counters.pcnt,
-		 (long unsigned int)e->counters.bcnt,
-		 (long unsigned int)addme[*i].pcnt,
-		 (long unsigned int)addme[*i].bcnt);
-#endif
-
-	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-	(*i)++;
-	return 0;
-}
 
 static int
 do_add_counters(struct net *net, void __user *user, unsigned int len, int compat)
@@ -1393,13 +1445,14 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, int compat
 		goto free;
 	}
 
-	write_lock_bh(&t->lock);
+	mutex_lock(&t->lock);
 	private = t->private;
 	if (private->number != num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
 
+	preempt_disable();
 	i = 0;
 	/* Choose the copy that is on our node */
 	loc_cpu_entry = private->entries[raw_smp_processor_id()];
@@ -1408,8 +1461,9 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, int compat
 				  add_counter_to_entry,
 				  paddc,
 				  &i);
+	preempt_enable();
 unlock_up_free:
-	write_unlock_bh(&t->lock);
+	mutex_unlock(&t->lock);
 	xt_table_unlock(t);
 	module_put(t->me);
 free: