diff options
author | Stephen Hemminger <shemminger@vyatta.com> | 2009-04-29 01:36:33 -0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2009-04-29 01:36:33 -0400 |
commit | 942e4a2bd680c606af0211e64eb216be2e19bf61 (patch) | |
tree | a83af49242d4a8d53aa0f3b5814eb17da72edc09 /net | |
parent | bf368e4e70cd4e0f880923c44e95a4273d725ab4 (diff) |
netfilter: revised locking for x_tables
The x_tables are organized with a table structure and per-cpu copies
of the counters and rules. On older kernels there was a reader/writer
lock per table which was a performance bottleneck. In 2.6.30-rc, this
was converted to use RCU and the counters/rules which solved the performance
problems for do_table but made replacing rules much slower because of
the necessary RCU grace period.
This version uses a per-cpu set of spinlocks and counters to allow
table processing to proceed without the cache thrashing of a global
reader lock and keeps the same performance for table updates.
Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net')
-rw-r--r-- | net/ipv4/netfilter/arp_tables.c | 125 | ||||
-rw-r--r-- | net/ipv4/netfilter/ip_tables.c | 126 | ||||
-rw-r--r-- | net/ipv6/netfilter/ip6_tables.c | 123 | ||||
-rw-r--r-- | net/netfilter/x_tables.c | 53 |
4 files changed, 136 insertions, 291 deletions
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 5ba533d234db..831fe1879dc0 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c | |||
@@ -253,9 +253,9 @@ unsigned int arpt_do_table(struct sk_buff *skb, | |||
253 | indev = in ? in->name : nulldevname; | 253 | indev = in ? in->name : nulldevname; |
254 | outdev = out ? out->name : nulldevname; | 254 | outdev = out ? out->name : nulldevname; |
255 | 255 | ||
256 | rcu_read_lock_bh(); | 256 | xt_info_rdlock_bh(); |
257 | private = rcu_dereference(table->private); | 257 | private = table->private; |
258 | table_base = rcu_dereference(private->entries[smp_processor_id()]); | 258 | table_base = private->entries[smp_processor_id()]; |
259 | 259 | ||
260 | e = get_entry(table_base, private->hook_entry[hook]); | 260 | e = get_entry(table_base, private->hook_entry[hook]); |
261 | back = get_entry(table_base, private->underflow[hook]); | 261 | back = get_entry(table_base, private->underflow[hook]); |
@@ -273,6 +273,7 @@ unsigned int arpt_do_table(struct sk_buff *skb, | |||
273 | 273 | ||
274 | hdr_len = sizeof(*arp) + (2 * sizeof(struct in_addr)) + | 274 | hdr_len = sizeof(*arp) + (2 * sizeof(struct in_addr)) + |
275 | (2 * skb->dev->addr_len); | 275 | (2 * skb->dev->addr_len); |
276 | |||
276 | ADD_COUNTER(e->counters, hdr_len, 1); | 277 | ADD_COUNTER(e->counters, hdr_len, 1); |
277 | 278 | ||
278 | t = arpt_get_target(e); | 279 | t = arpt_get_target(e); |
@@ -328,8 +329,7 @@ unsigned int arpt_do_table(struct sk_buff *skb, | |||
328 | e = (void *)e + e->next_offset; | 329 | e = (void *)e + e->next_offset; |
329 | } | 330 | } |
330 | } while (!hotdrop); | 331 | } while (!hotdrop); |
331 | 332 | xt_info_rdunlock_bh(); | |
332 | rcu_read_unlock_bh(); | ||
333 | 333 | ||
334 | if (hotdrop) | 334 | if (hotdrop) |
335 | return NF_DROP; | 335 | return NF_DROP; |
@@ -711,9 +711,12 @@ static void get_counters(const struct xt_table_info *t, | |||
711 | /* Instead of clearing (by a previous call to memset()) | 711 | /* Instead of clearing (by a previous call to memset()) |
712 | * the counters and using adds, we set the counters | 712 | * the counters and using adds, we set the counters |
713 | * with data used by 'current' CPU | 713 | * with data used by 'current' CPU |
714 | * We dont care about preemption here. | 714 | * |
715 | * Bottom half has to be disabled to prevent deadlock | ||
716 | * if new softirq were to run and call ipt_do_table | ||
715 | */ | 717 | */ |
716 | curcpu = raw_smp_processor_id(); | 718 | local_bh_disable(); |
719 | curcpu = smp_processor_id(); | ||
717 | 720 | ||
718 | i = 0; | 721 | i = 0; |
719 | ARPT_ENTRY_ITERATE(t->entries[curcpu], | 722 | ARPT_ENTRY_ITERATE(t->entries[curcpu], |
@@ -726,73 +729,22 @@ static void get_counters(const struct xt_table_info *t, | |||
726 | if (cpu == curcpu) | 729 | if (cpu == curcpu) |
727 | continue; | 730 | continue; |
728 | i = 0; | 731 | i = 0; |
732 | xt_info_wrlock(cpu); | ||
729 | ARPT_ENTRY_ITERATE(t->entries[cpu], | 733 | ARPT_ENTRY_ITERATE(t->entries[cpu], |
730 | t->size, | 734 | t->size, |
731 | add_entry_to_counter, | 735 | add_entry_to_counter, |
732 | counters, | 736 | counters, |
733 | &i); | 737 | &i); |
738 | xt_info_wrunlock(cpu); | ||
734 | } | 739 | } |
735 | } | ||
736 | |||
737 | |||
738 | /* We're lazy, and add to the first CPU; overflow works its fey magic | ||
739 | * and everything is OK. */ | ||
740 | static int | ||
741 | add_counter_to_entry(struct arpt_entry *e, | ||
742 | const struct xt_counters addme[], | ||
743 | unsigned int *i) | ||
744 | { | ||
745 | ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); | ||
746 | |||
747 | (*i)++; | ||
748 | return 0; | ||
749 | } | ||
750 | |||
751 | /* Take values from counters and add them back onto the current cpu */ | ||
752 | static void put_counters(struct xt_table_info *t, | ||
753 | const struct xt_counters counters[]) | ||
754 | { | ||
755 | unsigned int i, cpu; | ||
756 | |||
757 | local_bh_disable(); | ||
758 | cpu = smp_processor_id(); | ||
759 | i = 0; | ||
760 | ARPT_ENTRY_ITERATE(t->entries[cpu], | ||
761 | t->size, | ||
762 | add_counter_to_entry, | ||
763 | counters, | ||
764 | &i); | ||
765 | local_bh_enable(); | 740 | local_bh_enable(); |
766 | } | 741 | } |
767 | 742 | ||
768 | static inline int | ||
769 | zero_entry_counter(struct arpt_entry *e, void *arg) | ||
770 | { | ||
771 | e->counters.bcnt = 0; | ||
772 | e->counters.pcnt = 0; | ||
773 | return 0; | ||
774 | } | ||
775 | |||
776 | static void | ||
777 | clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info) | ||
778 | { | ||
779 | unsigned int cpu; | ||
780 | const void *loc_cpu_entry = info->entries[raw_smp_processor_id()]; | ||
781 | |||
782 | memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); | ||
783 | for_each_possible_cpu(cpu) { | ||
784 | memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size); | ||
785 | ARPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size, | ||
786 | zero_entry_counter, NULL); | ||
787 | } | ||
788 | } | ||
789 | |||
790 | static struct xt_counters *alloc_counters(struct xt_table *table) | 743 | static struct xt_counters *alloc_counters(struct xt_table *table) |
791 | { | 744 | { |
792 | unsigned int countersize; | 745 | unsigned int countersize; |
793 | struct xt_counters *counters; | 746 | struct xt_counters *counters; |
794 | struct xt_table_info *private = table->private; | 747 | struct xt_table_info *private = table->private; |
795 | struct xt_table_info *info; | ||
796 | 748 | ||
797 | /* We need atomic snapshot of counters: rest doesn't change | 749 | /* We need atomic snapshot of counters: rest doesn't change |
798 | * (other than comefrom, which userspace doesn't care | 750 | * (other than comefrom, which userspace doesn't care |
@@ -802,30 +754,11 @@ static struct xt_counters *alloc_counters(struct xt_table *table) | |||
802 | counters = vmalloc_node(countersize, numa_node_id()); | 754 | counters = vmalloc_node(countersize, numa_node_id()); |
803 | 755 | ||
804 | if (counters == NULL) | 756 | if (counters == NULL) |
805 | goto nomem; | 757 | return ERR_PTR(-ENOMEM); |
806 | |||
807 | info = xt_alloc_table_info(private->size); | ||
808 | if (!info) | ||
809 | goto free_counters; | ||
810 | |||
811 | clone_counters(info, private); | ||
812 | |||
813 | mutex_lock(&table->lock); | ||
814 | xt_table_entry_swap_rcu(private, info); | ||
815 | synchronize_net(); /* Wait until smoke has cleared */ | ||
816 | 758 | ||
817 | get_counters(info, counters); | 759 | get_counters(private, counters); |
818 | put_counters(private, counters); | ||
819 | mutex_unlock(&table->lock); | ||
820 | |||
821 | xt_free_table_info(info); | ||
822 | 760 | ||
823 | return counters; | 761 | return counters; |
824 | |||
825 | free_counters: | ||
826 | vfree(counters); | ||
827 | nomem: | ||
828 | return ERR_PTR(-ENOMEM); | ||
829 | } | 762 | } |
830 | 763 | ||
831 | static int copy_entries_to_user(unsigned int total_size, | 764 | static int copy_entries_to_user(unsigned int total_size, |
@@ -1094,8 +1027,9 @@ static int __do_replace(struct net *net, const char *name, | |||
1094 | (newinfo->number <= oldinfo->initial_entries)) | 1027 | (newinfo->number <= oldinfo->initial_entries)) |
1095 | module_put(t->me); | 1028 | module_put(t->me); |
1096 | 1029 | ||
1097 | /* Get the old counters. */ | 1030 | /* Get the old counters, and synchronize with replace */ |
1098 | get_counters(oldinfo, counters); | 1031 | get_counters(oldinfo, counters); |
1032 | |||
1099 | /* Decrease module usage counts and free resource */ | 1033 | /* Decrease module usage counts and free resource */ |
1100 | loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; | 1034 | loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; |
1101 | ARPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry, | 1035 | ARPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry, |
@@ -1165,10 +1099,23 @@ static int do_replace(struct net *net, void __user *user, unsigned int len) | |||
1165 | return ret; | 1099 | return ret; |
1166 | } | 1100 | } |
1167 | 1101 | ||
1102 | /* We're lazy, and add to the first CPU; overflow works its fey magic | ||
1103 | * and everything is OK. */ | ||
1104 | static int | ||
1105 | add_counter_to_entry(struct arpt_entry *e, | ||
1106 | const struct xt_counters addme[], | ||
1107 | unsigned int *i) | ||
1108 | { | ||
1109 | ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); | ||
1110 | |||
1111 | (*i)++; | ||
1112 | return 0; | ||
1113 | } | ||
1114 | |||
1168 | static int do_add_counters(struct net *net, void __user *user, unsigned int len, | 1115 | static int do_add_counters(struct net *net, void __user *user, unsigned int len, |
1169 | int compat) | 1116 | int compat) |
1170 | { | 1117 | { |
1171 | unsigned int i; | 1118 | unsigned int i, curcpu; |
1172 | struct xt_counters_info tmp; | 1119 | struct xt_counters_info tmp; |
1173 | struct xt_counters *paddc; | 1120 | struct xt_counters *paddc; |
1174 | unsigned int num_counters; | 1121 | unsigned int num_counters; |
@@ -1224,26 +1171,26 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len, | |||
1224 | goto free; | 1171 | goto free; |
1225 | } | 1172 | } |
1226 | 1173 | ||
1227 | mutex_lock(&t->lock); | 1174 | local_bh_disable(); |
1228 | private = t->private; | 1175 | private = t->private; |
1229 | if (private->number != num_counters) { | 1176 | if (private->number != num_counters) { |
1230 | ret = -EINVAL; | 1177 | ret = -EINVAL; |
1231 | goto unlock_up_free; | 1178 | goto unlock_up_free; |
1232 | } | 1179 | } |
1233 | 1180 | ||
1234 | preempt_disable(); | ||
1235 | i = 0; | 1181 | i = 0; |
1236 | /* Choose the copy that is on our node */ | 1182 | /* Choose the copy that is on our node */ |
1237 | loc_cpu_entry = private->entries[smp_processor_id()]; | 1183 | curcpu = smp_processor_id(); |
1184 | loc_cpu_entry = private->entries[curcpu]; | ||
1185 | xt_info_wrlock(curcpu); | ||
1238 | ARPT_ENTRY_ITERATE(loc_cpu_entry, | 1186 | ARPT_ENTRY_ITERATE(loc_cpu_entry, |
1239 | private->size, | 1187 | private->size, |
1240 | add_counter_to_entry, | 1188 | add_counter_to_entry, |
1241 | paddc, | 1189 | paddc, |
1242 | &i); | 1190 | &i); |
1243 | preempt_enable(); | 1191 | xt_info_wrunlock(curcpu); |
1244 | unlock_up_free: | 1192 | unlock_up_free: |
1245 | mutex_unlock(&t->lock); | 1193 | local_bh_enable(); |
1246 | |||
1247 | xt_table_unlock(t); | 1194 | xt_table_unlock(t); |
1248 | module_put(t->me); | 1195 | module_put(t->me); |
1249 | free: | 1196 | free: |
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 810c0b62c7d4..2ec8d7290c40 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c | |||
@@ -338,10 +338,9 @@ ipt_do_table(struct sk_buff *skb, | |||
338 | tgpar.hooknum = hook; | 338 | tgpar.hooknum = hook; |
339 | 339 | ||
340 | IP_NF_ASSERT(table->valid_hooks & (1 << hook)); | 340 | IP_NF_ASSERT(table->valid_hooks & (1 << hook)); |
341 | 341 | xt_info_rdlock_bh(); | |
342 | rcu_read_lock_bh(); | 342 | private = table->private; |
343 | private = rcu_dereference(table->private); | 343 | table_base = private->entries[smp_processor_id()]; |
344 | table_base = rcu_dereference(private->entries[smp_processor_id()]); | ||
345 | 344 | ||
346 | e = get_entry(table_base, private->hook_entry[hook]); | 345 | e = get_entry(table_base, private->hook_entry[hook]); |
347 | 346 | ||
@@ -436,8 +435,7 @@ ipt_do_table(struct sk_buff *skb, | |||
436 | e = (void *)e + e->next_offset; | 435 | e = (void *)e + e->next_offset; |
437 | } | 436 | } |
438 | } while (!hotdrop); | 437 | } while (!hotdrop); |
439 | 438 | xt_info_rdunlock_bh(); | |
440 | rcu_read_unlock_bh(); | ||
441 | 439 | ||
442 | #ifdef DEBUG_ALLOW_ALL | 440 | #ifdef DEBUG_ALLOW_ALL |
443 | return NF_ACCEPT; | 441 | return NF_ACCEPT; |
@@ -896,10 +894,13 @@ get_counters(const struct xt_table_info *t, | |||
896 | 894 | ||
897 | /* Instead of clearing (by a previous call to memset()) | 895 | /* Instead of clearing (by a previous call to memset()) |
898 | * the counters and using adds, we set the counters | 896 | * the counters and using adds, we set the counters |
899 | * with data used by 'current' CPU | 897 | * with data used by 'current' CPU. |
900 | * We dont care about preemption here. | 898 | * |
899 | * Bottom half has to be disabled to prevent deadlock | ||
900 | * if new softirq were to run and call ipt_do_table | ||
901 | */ | 901 | */ |
902 | curcpu = raw_smp_processor_id(); | 902 | local_bh_disable(); |
903 | curcpu = smp_processor_id(); | ||
903 | 904 | ||
904 | i = 0; | 905 | i = 0; |
905 | IPT_ENTRY_ITERATE(t->entries[curcpu], | 906 | IPT_ENTRY_ITERATE(t->entries[curcpu], |
@@ -912,74 +913,22 @@ get_counters(const struct xt_table_info *t, | |||
912 | if (cpu == curcpu) | 913 | if (cpu == curcpu) |
913 | continue; | 914 | continue; |
914 | i = 0; | 915 | i = 0; |
916 | xt_info_wrlock(cpu); | ||
915 | IPT_ENTRY_ITERATE(t->entries[cpu], | 917 | IPT_ENTRY_ITERATE(t->entries[cpu], |
916 | t->size, | 918 | t->size, |
917 | add_entry_to_counter, | 919 | add_entry_to_counter, |
918 | counters, | 920 | counters, |
919 | &i); | 921 | &i); |
922 | xt_info_wrunlock(cpu); | ||
920 | } | 923 | } |
921 | |||
922 | } | ||
923 | |||
924 | /* We're lazy, and add to the first CPU; overflow works its fey magic | ||
925 | * and everything is OK. */ | ||
926 | static int | ||
927 | add_counter_to_entry(struct ipt_entry *e, | ||
928 | const struct xt_counters addme[], | ||
929 | unsigned int *i) | ||
930 | { | ||
931 | ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); | ||
932 | |||
933 | (*i)++; | ||
934 | return 0; | ||
935 | } | ||
936 | |||
937 | /* Take values from counters and add them back onto the current cpu */ | ||
938 | static void put_counters(struct xt_table_info *t, | ||
939 | const struct xt_counters counters[]) | ||
940 | { | ||
941 | unsigned int i, cpu; | ||
942 | |||
943 | local_bh_disable(); | ||
944 | cpu = smp_processor_id(); | ||
945 | i = 0; | ||
946 | IPT_ENTRY_ITERATE(t->entries[cpu], | ||
947 | t->size, | ||
948 | add_counter_to_entry, | ||
949 | counters, | ||
950 | &i); | ||
951 | local_bh_enable(); | 924 | local_bh_enable(); |
952 | } | 925 | } |
953 | 926 | ||
954 | |||
955 | static inline int | ||
956 | zero_entry_counter(struct ipt_entry *e, void *arg) | ||
957 | { | ||
958 | e->counters.bcnt = 0; | ||
959 | e->counters.pcnt = 0; | ||
960 | return 0; | ||
961 | } | ||
962 | |||
963 | static void | ||
964 | clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info) | ||
965 | { | ||
966 | unsigned int cpu; | ||
967 | const void *loc_cpu_entry = info->entries[raw_smp_processor_id()]; | ||
968 | |||
969 | memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); | ||
970 | for_each_possible_cpu(cpu) { | ||
971 | memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size); | ||
972 | IPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size, | ||
973 | zero_entry_counter, NULL); | ||
974 | } | ||
975 | } | ||
976 | |||
977 | static struct xt_counters * alloc_counters(struct xt_table *table) | 927 | static struct xt_counters * alloc_counters(struct xt_table *table) |
978 | { | 928 | { |
979 | unsigned int countersize; | 929 | unsigned int countersize; |
980 | struct xt_counters *counters; | 930 | struct xt_counters *counters; |
981 | struct xt_table_info *private = table->private; | 931 | struct xt_table_info *private = table->private; |
982 | struct xt_table_info *info; | ||
983 | 932 | ||
984 | /* We need atomic snapshot of counters: rest doesn't change | 933 | /* We need atomic snapshot of counters: rest doesn't change |
985 | (other than comefrom, which userspace doesn't care | 934 | (other than comefrom, which userspace doesn't care |
@@ -988,30 +937,11 @@ static struct xt_counters * alloc_counters(struct xt_table *table) | |||
988 | counters = vmalloc_node(countersize, numa_node_id()); | 937 | counters = vmalloc_node(countersize, numa_node_id()); |
989 | 938 | ||
990 | if (counters == NULL) | 939 | if (counters == NULL) |
991 | goto nomem; | 940 | return ERR_PTR(-ENOMEM); |
992 | 941 | ||
993 | info = xt_alloc_table_info(private->size); | 942 | get_counters(private, counters); |
994 | if (!info) | ||
995 | goto free_counters; | ||
996 | |||
997 | clone_counters(info, private); | ||
998 | |||
999 | mutex_lock(&table->lock); | ||
1000 | xt_table_entry_swap_rcu(private, info); | ||
1001 | synchronize_net(); /* Wait until smoke has cleared */ | ||
1002 | |||
1003 | get_counters(info, counters); | ||
1004 | put_counters(private, counters); | ||
1005 | mutex_unlock(&table->lock); | ||
1006 | |||
1007 | xt_free_table_info(info); | ||
1008 | 943 | ||
1009 | return counters; | 944 | return counters; |
1010 | |||
1011 | free_counters: | ||
1012 | vfree(counters); | ||
1013 | nomem: | ||
1014 | return ERR_PTR(-ENOMEM); | ||
1015 | } | 945 | } |
1016 | 946 | ||
1017 | static int | 947 | static int |
@@ -1306,8 +1236,9 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, | |||
1306 | (newinfo->number <= oldinfo->initial_entries)) | 1236 | (newinfo->number <= oldinfo->initial_entries)) |
1307 | module_put(t->me); | 1237 | module_put(t->me); |
1308 | 1238 | ||
1309 | /* Get the old counters. */ | 1239 | /* Get the old counters, and synchronize with replace */ |
1310 | get_counters(oldinfo, counters); | 1240 | get_counters(oldinfo, counters); |
1241 | |||
1311 | /* Decrease module usage counts and free resource */ | 1242 | /* Decrease module usage counts and free resource */ |
1312 | loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; | 1243 | loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; |
1313 | IPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry, | 1244 | IPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry, |
@@ -1377,11 +1308,23 @@ do_replace(struct net *net, void __user *user, unsigned int len) | |||
1377 | return ret; | 1308 | return ret; |
1378 | } | 1309 | } |
1379 | 1310 | ||
1311 | /* We're lazy, and add to the first CPU; overflow works its fey magic | ||
1312 | * and everything is OK. */ | ||
1313 | static int | ||
1314 | add_counter_to_entry(struct ipt_entry *e, | ||
1315 | const struct xt_counters addme[], | ||
1316 | unsigned int *i) | ||
1317 | { | ||
1318 | ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); | ||
1319 | |||
1320 | (*i)++; | ||
1321 | return 0; | ||
1322 | } | ||
1380 | 1323 | ||
1381 | static int | 1324 | static int |
1382 | do_add_counters(struct net *net, void __user *user, unsigned int len, int compat) | 1325 | do_add_counters(struct net *net, void __user *user, unsigned int len, int compat) |
1383 | { | 1326 | { |
1384 | unsigned int i; | 1327 | unsigned int i, curcpu; |
1385 | struct xt_counters_info tmp; | 1328 | struct xt_counters_info tmp; |
1386 | struct xt_counters *paddc; | 1329 | struct xt_counters *paddc; |
1387 | unsigned int num_counters; | 1330 | unsigned int num_counters; |
@@ -1437,25 +1380,26 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, int compat | |||
1437 | goto free; | 1380 | goto free; |
1438 | } | 1381 | } |
1439 | 1382 | ||
1440 | mutex_lock(&t->lock); | 1383 | local_bh_disable(); |
1441 | private = t->private; | 1384 | private = t->private; |
1442 | if (private->number != num_counters) { | 1385 | if (private->number != num_counters) { |
1443 | ret = -EINVAL; | 1386 | ret = -EINVAL; |
1444 | goto unlock_up_free; | 1387 | goto unlock_up_free; |
1445 | } | 1388 | } |
1446 | 1389 | ||
1447 | preempt_disable(); | ||
1448 | i = 0; | 1390 | i = 0; |
1449 | /* Choose the copy that is on our node */ | 1391 | /* Choose the copy that is on our node */ |
1450 | loc_cpu_entry = private->entries[raw_smp_processor_id()]; | 1392 | curcpu = smp_processor_id(); |
1393 | loc_cpu_entry = private->entries[curcpu]; | ||
1394 | xt_info_wrlock(curcpu); | ||
1451 | IPT_ENTRY_ITERATE(loc_cpu_entry, | 1395 | IPT_ENTRY_ITERATE(loc_cpu_entry, |
1452 | private->size, | 1396 | private->size, |
1453 | add_counter_to_entry, | 1397 | add_counter_to_entry, |
1454 | paddc, | 1398 | paddc, |
1455 | &i); | 1399 | &i); |
1456 | preempt_enable(); | 1400 | xt_info_wrunlock(curcpu); |
1457 | unlock_up_free: | 1401 | unlock_up_free: |
1458 | mutex_unlock(&t->lock); | 1402 | local_bh_enable(); |
1459 | xt_table_unlock(t); | 1403 | xt_table_unlock(t); |
1460 | module_put(t->me); | 1404 | module_put(t->me); |
1461 | free: | 1405 | free: |
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 800ae8542471..219e165aea10 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c | |||
@@ -365,9 +365,9 @@ ip6t_do_table(struct sk_buff *skb, | |||
365 | 365 | ||
366 | IP_NF_ASSERT(table->valid_hooks & (1 << hook)); | 366 | IP_NF_ASSERT(table->valid_hooks & (1 << hook)); |
367 | 367 | ||
368 | rcu_read_lock_bh(); | 368 | xt_info_rdlock_bh(); |
369 | private = rcu_dereference(table->private); | 369 | private = table->private; |
370 | table_base = rcu_dereference(private->entries[smp_processor_id()]); | 370 | table_base = private->entries[smp_processor_id()]; |
371 | 371 | ||
372 | e = get_entry(table_base, private->hook_entry[hook]); | 372 | e = get_entry(table_base, private->hook_entry[hook]); |
373 | 373 | ||
@@ -466,7 +466,7 @@ ip6t_do_table(struct sk_buff *skb, | |||
466 | #ifdef CONFIG_NETFILTER_DEBUG | 466 | #ifdef CONFIG_NETFILTER_DEBUG |
467 | ((struct ip6t_entry *)table_base)->comefrom = NETFILTER_LINK_POISON; | 467 | ((struct ip6t_entry *)table_base)->comefrom = NETFILTER_LINK_POISON; |
468 | #endif | 468 | #endif |
469 | rcu_read_unlock_bh(); | 469 | xt_info_rdunlock_bh(); |
470 | 470 | ||
471 | #ifdef DEBUG_ALLOW_ALL | 471 | #ifdef DEBUG_ALLOW_ALL |
472 | return NF_ACCEPT; | 472 | return NF_ACCEPT; |
@@ -926,9 +926,12 @@ get_counters(const struct xt_table_info *t, | |||
926 | /* Instead of clearing (by a previous call to memset()) | 926 | /* Instead of clearing (by a previous call to memset()) |
927 | * the counters and using adds, we set the counters | 927 | * the counters and using adds, we set the counters |
928 | * with data used by 'current' CPU | 928 | * with data used by 'current' CPU |
929 | * We dont care about preemption here. | 929 | * |
930 | * Bottom half has to be disabled to prevent deadlock | ||
931 | * if new softirq were to run and call ipt_do_table | ||
930 | */ | 932 | */ |
931 | curcpu = raw_smp_processor_id(); | 933 | local_bh_disable(); |
934 | curcpu = smp_processor_id(); | ||
932 | 935 | ||
933 | i = 0; | 936 | i = 0; |
934 | IP6T_ENTRY_ITERATE(t->entries[curcpu], | 937 | IP6T_ENTRY_ITERATE(t->entries[curcpu], |
@@ -941,72 +944,22 @@ get_counters(const struct xt_table_info *t, | |||
941 | if (cpu == curcpu) | 944 | if (cpu == curcpu) |
942 | continue; | 945 | continue; |
943 | i = 0; | 946 | i = 0; |
947 | xt_info_wrlock(cpu); | ||
944 | IP6T_ENTRY_ITERATE(t->entries[cpu], | 948 | IP6T_ENTRY_ITERATE(t->entries[cpu], |
945 | t->size, | 949 | t->size, |
946 | add_entry_to_counter, | 950 | add_entry_to_counter, |
947 | counters, | 951 | counters, |
948 | &i); | 952 | &i); |
953 | xt_info_wrunlock(cpu); | ||
949 | } | 954 | } |
950 | } | ||
951 | |||
952 | /* We're lazy, and add to the first CPU; overflow works its fey magic | ||
953 | * and everything is OK. */ | ||
954 | static int | ||
955 | add_counter_to_entry(struct ip6t_entry *e, | ||
956 | const struct xt_counters addme[], | ||
957 | unsigned int *i) | ||
958 | { | ||
959 | ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); | ||
960 | |||
961 | (*i)++; | ||
962 | return 0; | ||
963 | } | ||
964 | |||
965 | /* Take values from counters and add them back onto the current cpu */ | ||
966 | static void put_counters(struct xt_table_info *t, | ||
967 | const struct xt_counters counters[]) | ||
968 | { | ||
969 | unsigned int i, cpu; | ||
970 | |||
971 | local_bh_disable(); | ||
972 | cpu = smp_processor_id(); | ||
973 | i = 0; | ||
974 | IP6T_ENTRY_ITERATE(t->entries[cpu], | ||
975 | t->size, | ||
976 | add_counter_to_entry, | ||
977 | counters, | ||
978 | &i); | ||
979 | local_bh_enable(); | 955 | local_bh_enable(); |
980 | } | 956 | } |
981 | 957 | ||
982 | static inline int | ||
983 | zero_entry_counter(struct ip6t_entry *e, void *arg) | ||
984 | { | ||
985 | e->counters.bcnt = 0; | ||
986 | e->counters.pcnt = 0; | ||
987 | return 0; | ||
988 | } | ||
989 | |||
990 | static void | ||
991 | clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info) | ||
992 | { | ||
993 | unsigned int cpu; | ||
994 | const void *loc_cpu_entry = info->entries[raw_smp_processor_id()]; | ||
995 | |||
996 | memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); | ||
997 | for_each_possible_cpu(cpu) { | ||
998 | memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size); | ||
999 | IP6T_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size, | ||
1000 | zero_entry_counter, NULL); | ||
1001 | } | ||
1002 | } | ||
1003 | |||
1004 | static struct xt_counters *alloc_counters(struct xt_table *table) | 958 | static struct xt_counters *alloc_counters(struct xt_table *table) |
1005 | { | 959 | { |
1006 | unsigned int countersize; | 960 | unsigned int countersize; |
1007 | struct xt_counters *counters; | 961 | struct xt_counters *counters; |
1008 | struct xt_table_info *private = table->private; | 962 | struct xt_table_info *private = table->private; |
1009 | struct xt_table_info *info; | ||
1010 | 963 | ||
1011 | /* We need atomic snapshot of counters: rest doesn't change | 964 | /* We need atomic snapshot of counters: rest doesn't change |
1012 | (other than comefrom, which userspace doesn't care | 965 | (other than comefrom, which userspace doesn't care |
@@ -1015,30 +968,11 @@ static struct xt_counters *alloc_counters(struct xt_table *table) | |||
1015 | counters = vmalloc_node(countersize, numa_node_id()); | 968 | counters = vmalloc_node(countersize, numa_node_id()); |
1016 | 969 | ||
1017 | if (counters == NULL) | 970 | if (counters == NULL) |
1018 | goto nomem; | 971 | return ERR_PTR(-ENOMEM); |
1019 | 972 | ||
1020 | info = xt_alloc_table_info(private->size); | 973 | get_counters(private, counters); |
1021 | if (!info) | ||
1022 | goto free_counters; | ||
1023 | |||
1024 | clone_counters(info, private); | ||
1025 | |||
1026 | mutex_lock(&table->lock); | ||
1027 | xt_table_entry_swap_rcu(private, info); | ||
1028 | synchronize_net(); /* Wait until smoke has cleared */ | ||
1029 | |||
1030 | get_counters(info, counters); | ||
1031 | put_counters(private, counters); | ||
1032 | mutex_unlock(&table->lock); | ||
1033 | |||
1034 | xt_free_table_info(info); | ||
1035 | 974 | ||
1036 | return counters; | 975 | return counters; |
1037 | |||
1038 | free_counters: | ||
1039 | vfree(counters); | ||
1040 | nomem: | ||
1041 | return ERR_PTR(-ENOMEM); | ||
1042 | } | 976 | } |
1043 | 977 | ||
1044 | static int | 978 | static int |
@@ -1334,8 +1268,9 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, | |||
1334 | (newinfo->number <= oldinfo->initial_entries)) | 1268 | (newinfo->number <= oldinfo->initial_entries)) |
1335 | module_put(t->me); | 1269 | module_put(t->me); |
1336 | 1270 | ||
1337 | /* Get the old counters. */ | 1271 | /* Get the old counters, and synchronize with replace */ |
1338 | get_counters(oldinfo, counters); | 1272 | get_counters(oldinfo, counters); |
1273 | |||
1339 | /* Decrease module usage counts and free resource */ | 1274 | /* Decrease module usage counts and free resource */ |
1340 | loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; | 1275 | loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; |
1341 | IP6T_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry, | 1276 | IP6T_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry, |
@@ -1405,11 +1340,24 @@ do_replace(struct net *net, void __user *user, unsigned int len) | |||
1405 | return ret; | 1340 | return ret; |
1406 | } | 1341 | } |
1407 | 1342 | ||
1343 | /* We're lazy, and add to the first CPU; overflow works its fey magic | ||
1344 | * and everything is OK. */ | ||
1345 | static int | ||
1346 | add_counter_to_entry(struct ip6t_entry *e, | ||
1347 | const struct xt_counters addme[], | ||
1348 | unsigned int *i) | ||
1349 | { | ||
1350 | ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); | ||
1351 | |||
1352 | (*i)++; | ||
1353 | return 0; | ||
1354 | } | ||
1355 | |||
1408 | static int | 1356 | static int |
1409 | do_add_counters(struct net *net, void __user *user, unsigned int len, | 1357 | do_add_counters(struct net *net, void __user *user, unsigned int len, |
1410 | int compat) | 1358 | int compat) |
1411 | { | 1359 | { |
1412 | unsigned int i; | 1360 | unsigned int i, curcpu; |
1413 | struct xt_counters_info tmp; | 1361 | struct xt_counters_info tmp; |
1414 | struct xt_counters *paddc; | 1362 | struct xt_counters *paddc; |
1415 | unsigned int num_counters; | 1363 | unsigned int num_counters; |
@@ -1465,25 +1413,28 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, | |||
1465 | goto free; | 1413 | goto free; |
1466 | } | 1414 | } |
1467 | 1415 | ||
1468 | mutex_lock(&t->lock); | 1416 | |
1417 | local_bh_disable(); | ||
1469 | private = t->private; | 1418 | private = t->private; |
1470 | if (private->number != num_counters) { | 1419 | if (private->number != num_counters) { |
1471 | ret = -EINVAL; | 1420 | ret = -EINVAL; |
1472 | goto unlock_up_free; | 1421 | goto unlock_up_free; |
1473 | } | 1422 | } |
1474 | 1423 | ||
1475 | preempt_disable(); | ||
1476 | i = 0; | 1424 | i = 0; |
1477 | /* Choose the copy that is on our node */ | 1425 | /* Choose the copy that is on our node */ |
1478 | loc_cpu_entry = private->entries[raw_smp_processor_id()]; | 1426 | curcpu = smp_processor_id(); |
1427 | xt_info_wrlock(curcpu); | ||
1428 | loc_cpu_entry = private->entries[curcpu]; | ||
1479 | IP6T_ENTRY_ITERATE(loc_cpu_entry, | 1429 | IP6T_ENTRY_ITERATE(loc_cpu_entry, |
1480 | private->size, | 1430 | private->size, |
1481 | add_counter_to_entry, | 1431 | add_counter_to_entry, |
1482 | paddc, | 1432 | paddc, |
1483 | &i); | 1433 | &i); |
1484 | preempt_enable(); | 1434 | xt_info_wrunlock(curcpu); |
1435 | |||
1485 | unlock_up_free: | 1436 | unlock_up_free: |
1486 | mutex_unlock(&t->lock); | 1437 | local_bh_enable(); |
1487 | xt_table_unlock(t); | 1438 | xt_table_unlock(t); |
1488 | module_put(t->me); | 1439 | module_put(t->me); |
1489 | free: | 1440 | free: |
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 509a95621f9f..150e5cf62f85 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c | |||
@@ -625,20 +625,6 @@ void xt_free_table_info(struct xt_table_info *info) | |||
625 | } | 625 | } |
626 | EXPORT_SYMBOL(xt_free_table_info); | 626 | EXPORT_SYMBOL(xt_free_table_info); |
627 | 627 | ||
628 | void xt_table_entry_swap_rcu(struct xt_table_info *oldinfo, | ||
629 | struct xt_table_info *newinfo) | ||
630 | { | ||
631 | unsigned int cpu; | ||
632 | |||
633 | for_each_possible_cpu(cpu) { | ||
634 | void *p = oldinfo->entries[cpu]; | ||
635 | rcu_assign_pointer(oldinfo->entries[cpu], newinfo->entries[cpu]); | ||
636 | newinfo->entries[cpu] = p; | ||
637 | } | ||
638 | |||
639 | } | ||
640 | EXPORT_SYMBOL_GPL(xt_table_entry_swap_rcu); | ||
641 | |||
642 | /* Find table by name, grabs mutex & ref. Returns ERR_PTR() on error. */ | 628 | /* Find table by name, grabs mutex & ref. Returns ERR_PTR() on error. */ |
643 | struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af, | 629 | struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af, |
644 | const char *name) | 630 | const char *name) |
@@ -676,32 +662,43 @@ void xt_compat_unlock(u_int8_t af) | |||
676 | EXPORT_SYMBOL_GPL(xt_compat_unlock); | 662 | EXPORT_SYMBOL_GPL(xt_compat_unlock); |
677 | #endif | 663 | #endif |
678 | 664 | ||
665 | DEFINE_PER_CPU(struct xt_info_lock, xt_info_locks); | ||
666 | EXPORT_PER_CPU_SYMBOL_GPL(xt_info_locks); | ||
667 | |||
668 | |||
679 | struct xt_table_info * | 669 | struct xt_table_info * |
680 | xt_replace_table(struct xt_table *table, | 670 | xt_replace_table(struct xt_table *table, |
681 | unsigned int num_counters, | 671 | unsigned int num_counters, |
682 | struct xt_table_info *newinfo, | 672 | struct xt_table_info *newinfo, |
683 | int *error) | 673 | int *error) |
684 | { | 674 | { |
685 | struct xt_table_info *oldinfo, *private; | 675 | struct xt_table_info *private; |
686 | 676 | ||
687 | /* Do the substitution. */ | 677 | /* Do the substitution. */ |
688 | mutex_lock(&table->lock); | 678 | local_bh_disable(); |
689 | private = table->private; | 679 | private = table->private; |
680 | |||
690 | /* Check inside lock: is the old number correct? */ | 681 | /* Check inside lock: is the old number correct? */ |
691 | if (num_counters != private->number) { | 682 | if (num_counters != private->number) { |
692 | duprintf("num_counters != table->private->number (%u/%u)\n", | 683 | duprintf("num_counters != table->private->number (%u/%u)\n", |
693 | num_counters, private->number); | 684 | num_counters, private->number); |
694 | mutex_unlock(&table->lock); | 685 | local_bh_enable(); |
695 | *error = -EAGAIN; | 686 | *error = -EAGAIN; |
696 | return NULL; | 687 | return NULL; |
697 | } | 688 | } |
698 | oldinfo = private; | ||
699 | rcu_assign_pointer(table->private, newinfo); | ||
700 | newinfo->initial_entries = oldinfo->initial_entries; | ||
701 | mutex_unlock(&table->lock); | ||
702 | 689 | ||
703 | synchronize_net(); | 690 | table->private = newinfo; |
704 | return oldinfo; | 691 | newinfo->initial_entries = private->initial_entries; |
692 | |||
693 | /* | ||
694 | * Even though table entries have now been swapped, other CPUs | ||
695 | * may still be using the old entries. This is okay, because | ||
696 | * resynchronization happens because of the locking done | ||
697 | * during the get_counters() routine. | ||
698 | */ | ||
699 | local_bh_enable(); | ||
700 | |||
701 | return private; | ||
705 | } | 702 | } |
706 | EXPORT_SYMBOL_GPL(xt_replace_table); | 703 | EXPORT_SYMBOL_GPL(xt_replace_table); |
707 | 704 | ||
@@ -734,7 +731,6 @@ struct xt_table *xt_register_table(struct net *net, struct xt_table *table, | |||
734 | 731 | ||
735 | /* Simplifies replace_table code. */ | 732 | /* Simplifies replace_table code. */ |
736 | table->private = bootstrap; | 733 | table->private = bootstrap; |
737 | mutex_init(&table->lock); | ||
738 | 734 | ||
739 | if (!xt_replace_table(table, 0, newinfo, &ret)) | 735 | if (!xt_replace_table(table, 0, newinfo, &ret)) |
740 | goto unlock; | 736 | goto unlock; |
@@ -1147,7 +1143,14 @@ static struct pernet_operations xt_net_ops = { | |||
1147 | 1143 | ||
1148 | static int __init xt_init(void) | 1144 | static int __init xt_init(void) |
1149 | { | 1145 | { |
1150 | int i, rv; | 1146 | unsigned int i; |
1147 | int rv; | ||
1148 | |||
1149 | for_each_possible_cpu(i) { | ||
1150 | struct xt_info_lock *lock = &per_cpu(xt_info_locks, i); | ||
1151 | spin_lock_init(&lock->lock); | ||
1152 | lock->readers = 0; | ||
1153 | } | ||
1151 | 1154 | ||
1152 | xt = kmalloc(sizeof(struct xt_af) * NFPROTO_NUMPROTO, GFP_KERNEL); | 1155 | xt = kmalloc(sizeof(struct xt_af) * NFPROTO_NUMPROTO, GFP_KERNEL); |
1153 | if (!xt) | 1156 | if (!xt) |