aboutsummaryrefslogtreecommitdiffstats
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/Kconfig4
-rw-r--r--net/ipv4/fib_trie.c6
-rw-r--r--net/ipv4/ipconfig.c12
-rw-r--r--net/ipv4/netfilter/arp_tables.c125
-rw-r--r--net/ipv4/netfilter/ip_tables.c126
-rw-r--r--net/ipv4/netfilter/nf_nat_core.c3
-rw-r--r--net/ipv4/route.c62
-rw-r--r--net/ipv4/tcp.c11
-rw-r--r--net/ipv4/tcp_input.c10
-rw-r--r--net/ipv4/tcp_output.c2
-rw-r--r--net/ipv4/tcp_vegas.c11
11 files changed, 131 insertions, 241 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index b2cf91e4ccaa..5b919f7b45db 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -407,8 +407,8 @@ config INET_XFRM_MODE_BEET
407 If unsure, say Y. 407 If unsure, say Y.
408 408
409config INET_LRO 409config INET_LRO
410 tristate "Large Receive Offload (ipv4/tcp)" 410 bool "Large Receive Offload (ipv4/tcp)"
411 411 default y
412 ---help--- 412 ---help---
413 Support for Large Receive Offload (ipv4/tcp). 413 Support for Large Receive Offload (ipv4/tcp).
414 414
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index ec0ae490f0b6..33c7c85dfe40 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -986,9 +986,12 @@ fib_find_node(struct trie *t, u32 key)
986static struct node *trie_rebalance(struct trie *t, struct tnode *tn) 986static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
987{ 987{
988 int wasfull; 988 int wasfull;
989 t_key cindex, key = tn->key; 989 t_key cindex, key;
990 struct tnode *tp; 990 struct tnode *tp;
991 991
992 preempt_disable();
993 key = tn->key;
994
992 while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) { 995 while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) {
993 cindex = tkey_extract_bits(key, tp->pos, tp->bits); 996 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
994 wasfull = tnode_full(tp, tnode_get_child(tp, cindex)); 997 wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
@@ -1007,6 +1010,7 @@ static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
1007 if (IS_TNODE(tn)) 1010 if (IS_TNODE(tn))
1008 tn = (struct tnode *)resize(t, (struct tnode *)tn); 1011 tn = (struct tnode *)resize(t, (struct tnode *)tn);
1009 1012
1013 preempt_enable();
1010 return (struct node *)tn; 1014 return (struct node *)tn;
1011} 1015}
1012 1016
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 90d22ae0a419..88bf051d0cbb 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -139,6 +139,8 @@ __be32 ic_servaddr = NONE; /* Boot server IP address */
139__be32 root_server_addr = NONE; /* Address of NFS server */ 139__be32 root_server_addr = NONE; /* Address of NFS server */
140u8 root_server_path[256] = { 0, }; /* Path to mount as root */ 140u8 root_server_path[256] = { 0, }; /* Path to mount as root */
141 141
142u32 ic_dev_xid; /* Device under configuration */
143
142/* vendor class identifier */ 144/* vendor class identifier */
143static char vendor_class_identifier[253] __initdata; 145static char vendor_class_identifier[253] __initdata;
144 146
@@ -932,6 +934,13 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str
932 goto drop_unlock; 934 goto drop_unlock;
933 } 935 }
934 936
937 /* Is it a reply for the device we are configuring? */
938 if (b->xid != ic_dev_xid) {
939 if (net_ratelimit())
940 printk(KERN_ERR "DHCP/BOOTP: Ignoring delayed packet \n");
941 goto drop_unlock;
942 }
943
935 /* Parse extensions */ 944 /* Parse extensions */
936 if (ext_len >= 4 && 945 if (ext_len >= 4 &&
937 !memcmp(b->exten, ic_bootp_cookie, 4)) { /* Check magic cookie */ 946 !memcmp(b->exten, ic_bootp_cookie, 4)) { /* Check magic cookie */
@@ -1115,6 +1124,9 @@ static int __init ic_dynamic(void)
1115 get_random_bytes(&timeout, sizeof(timeout)); 1124 get_random_bytes(&timeout, sizeof(timeout));
1116 timeout = CONF_BASE_TIMEOUT + (timeout % (unsigned) CONF_TIMEOUT_RANDOM); 1125 timeout = CONF_BASE_TIMEOUT + (timeout % (unsigned) CONF_TIMEOUT_RANDOM);
1117 for (;;) { 1126 for (;;) {
1127 /* Track the device we are configuring */
1128 ic_dev_xid = d->xid;
1129
1118#ifdef IPCONFIG_BOOTP 1130#ifdef IPCONFIG_BOOTP
1119 if (do_bootp && (d->able & IC_BOOTP)) 1131 if (do_bootp && (d->able & IC_BOOTP))
1120 ic_bootp_send_if(d, jiffies - start_jiffies); 1132 ic_bootp_send_if(d, jiffies - start_jiffies);
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 5ba533d234db..831fe1879dc0 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -253,9 +253,9 @@ unsigned int arpt_do_table(struct sk_buff *skb,
253 indev = in ? in->name : nulldevname; 253 indev = in ? in->name : nulldevname;
254 outdev = out ? out->name : nulldevname; 254 outdev = out ? out->name : nulldevname;
255 255
256 rcu_read_lock_bh(); 256 xt_info_rdlock_bh();
257 private = rcu_dereference(table->private); 257 private = table->private;
258 table_base = rcu_dereference(private->entries[smp_processor_id()]); 258 table_base = private->entries[smp_processor_id()];
259 259
260 e = get_entry(table_base, private->hook_entry[hook]); 260 e = get_entry(table_base, private->hook_entry[hook]);
261 back = get_entry(table_base, private->underflow[hook]); 261 back = get_entry(table_base, private->underflow[hook]);
@@ -273,6 +273,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
273 273
274 hdr_len = sizeof(*arp) + (2 * sizeof(struct in_addr)) + 274 hdr_len = sizeof(*arp) + (2 * sizeof(struct in_addr)) +
275 (2 * skb->dev->addr_len); 275 (2 * skb->dev->addr_len);
276
276 ADD_COUNTER(e->counters, hdr_len, 1); 277 ADD_COUNTER(e->counters, hdr_len, 1);
277 278
278 t = arpt_get_target(e); 279 t = arpt_get_target(e);
@@ -328,8 +329,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
328 e = (void *)e + e->next_offset; 329 e = (void *)e + e->next_offset;
329 } 330 }
330 } while (!hotdrop); 331 } while (!hotdrop);
331 332 xt_info_rdunlock_bh();
332 rcu_read_unlock_bh();
333 333
334 if (hotdrop) 334 if (hotdrop)
335 return NF_DROP; 335 return NF_DROP;
@@ -711,9 +711,12 @@ static void get_counters(const struct xt_table_info *t,
711 /* Instead of clearing (by a previous call to memset()) 711 /* Instead of clearing (by a previous call to memset())
712 * the counters and using adds, we set the counters 712 * the counters and using adds, we set the counters
713 * with data used by 'current' CPU 713 * with data used by 'current' CPU
714 * We dont care about preemption here. 714 *
715 * Bottom half has to be disabled to prevent deadlock
716 * if new softirq were to run and call ipt_do_table
715 */ 717 */
716 curcpu = raw_smp_processor_id(); 718 local_bh_disable();
719 curcpu = smp_processor_id();
717 720
718 i = 0; 721 i = 0;
719 ARPT_ENTRY_ITERATE(t->entries[curcpu], 722 ARPT_ENTRY_ITERATE(t->entries[curcpu],
@@ -726,73 +729,22 @@ static void get_counters(const struct xt_table_info *t,
726 if (cpu == curcpu) 729 if (cpu == curcpu)
727 continue; 730 continue;
728 i = 0; 731 i = 0;
732 xt_info_wrlock(cpu);
729 ARPT_ENTRY_ITERATE(t->entries[cpu], 733 ARPT_ENTRY_ITERATE(t->entries[cpu],
730 t->size, 734 t->size,
731 add_entry_to_counter, 735 add_entry_to_counter,
732 counters, 736 counters,
733 &i); 737 &i);
738 xt_info_wrunlock(cpu);
734 } 739 }
735}
736
737
738/* We're lazy, and add to the first CPU; overflow works its fey magic
739 * and everything is OK. */
740static int
741add_counter_to_entry(struct arpt_entry *e,
742 const struct xt_counters addme[],
743 unsigned int *i)
744{
745 ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
746
747 (*i)++;
748 return 0;
749}
750
751/* Take values from counters and add them back onto the current cpu */
752static void put_counters(struct xt_table_info *t,
753 const struct xt_counters counters[])
754{
755 unsigned int i, cpu;
756
757 local_bh_disable();
758 cpu = smp_processor_id();
759 i = 0;
760 ARPT_ENTRY_ITERATE(t->entries[cpu],
761 t->size,
762 add_counter_to_entry,
763 counters,
764 &i);
765 local_bh_enable(); 740 local_bh_enable();
766} 741}
767 742
768static inline int
769zero_entry_counter(struct arpt_entry *e, void *arg)
770{
771 e->counters.bcnt = 0;
772 e->counters.pcnt = 0;
773 return 0;
774}
775
776static void
777clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
778{
779 unsigned int cpu;
780 const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
781
782 memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
783 for_each_possible_cpu(cpu) {
784 memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
785 ARPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
786 zero_entry_counter, NULL);
787 }
788}
789
790static struct xt_counters *alloc_counters(struct xt_table *table) 743static struct xt_counters *alloc_counters(struct xt_table *table)
791{ 744{
792 unsigned int countersize; 745 unsigned int countersize;
793 struct xt_counters *counters; 746 struct xt_counters *counters;
794 struct xt_table_info *private = table->private; 747 struct xt_table_info *private = table->private;
795 struct xt_table_info *info;
796 748
797 /* We need atomic snapshot of counters: rest doesn't change 749 /* We need atomic snapshot of counters: rest doesn't change
798 * (other than comefrom, which userspace doesn't care 750 * (other than comefrom, which userspace doesn't care
@@ -802,30 +754,11 @@ static struct xt_counters *alloc_counters(struct xt_table *table)
802 counters = vmalloc_node(countersize, numa_node_id()); 754 counters = vmalloc_node(countersize, numa_node_id());
803 755
804 if (counters == NULL) 756 if (counters == NULL)
805 goto nomem; 757 return ERR_PTR(-ENOMEM);
806
807 info = xt_alloc_table_info(private->size);
808 if (!info)
809 goto free_counters;
810
811 clone_counters(info, private);
812
813 mutex_lock(&table->lock);
814 xt_table_entry_swap_rcu(private, info);
815 synchronize_net(); /* Wait until smoke has cleared */
816 758
817 get_counters(info, counters); 759 get_counters(private, counters);
818 put_counters(private, counters);
819 mutex_unlock(&table->lock);
820
821 xt_free_table_info(info);
822 760
823 return counters; 761 return counters;
824
825 free_counters:
826 vfree(counters);
827 nomem:
828 return ERR_PTR(-ENOMEM);
829} 762}
830 763
831static int copy_entries_to_user(unsigned int total_size, 764static int copy_entries_to_user(unsigned int total_size,
@@ -1094,8 +1027,9 @@ static int __do_replace(struct net *net, const char *name,
1094 (newinfo->number <= oldinfo->initial_entries)) 1027 (newinfo->number <= oldinfo->initial_entries))
1095 module_put(t->me); 1028 module_put(t->me);
1096 1029
1097 /* Get the old counters. */ 1030 /* Get the old counters, and synchronize with replace */
1098 get_counters(oldinfo, counters); 1031 get_counters(oldinfo, counters);
1032
1099 /* Decrease module usage counts and free resource */ 1033 /* Decrease module usage counts and free resource */
1100 loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; 1034 loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
1101 ARPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry, 1035 ARPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,
@@ -1165,10 +1099,23 @@ static int do_replace(struct net *net, void __user *user, unsigned int len)
1165 return ret; 1099 return ret;
1166} 1100}
1167 1101
1102/* We're lazy, and add to the first CPU; overflow works its fey magic
1103 * and everything is OK. */
1104static int
1105add_counter_to_entry(struct arpt_entry *e,
1106 const struct xt_counters addme[],
1107 unsigned int *i)
1108{
1109 ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
1110
1111 (*i)++;
1112 return 0;
1113}
1114
1168static int do_add_counters(struct net *net, void __user *user, unsigned int len, 1115static int do_add_counters(struct net *net, void __user *user, unsigned int len,
1169 int compat) 1116 int compat)
1170{ 1117{
1171 unsigned int i; 1118 unsigned int i, curcpu;
1172 struct xt_counters_info tmp; 1119 struct xt_counters_info tmp;
1173 struct xt_counters *paddc; 1120 struct xt_counters *paddc;
1174 unsigned int num_counters; 1121 unsigned int num_counters;
@@ -1224,26 +1171,26 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len,
1224 goto free; 1171 goto free;
1225 } 1172 }
1226 1173
1227 mutex_lock(&t->lock); 1174 local_bh_disable();
1228 private = t->private; 1175 private = t->private;
1229 if (private->number != num_counters) { 1176 if (private->number != num_counters) {
1230 ret = -EINVAL; 1177 ret = -EINVAL;
1231 goto unlock_up_free; 1178 goto unlock_up_free;
1232 } 1179 }
1233 1180
1234 preempt_disable();
1235 i = 0; 1181 i = 0;
1236 /* Choose the copy that is on our node */ 1182 /* Choose the copy that is on our node */
1237 loc_cpu_entry = private->entries[smp_processor_id()]; 1183 curcpu = smp_processor_id();
1184 loc_cpu_entry = private->entries[curcpu];
1185 xt_info_wrlock(curcpu);
1238 ARPT_ENTRY_ITERATE(loc_cpu_entry, 1186 ARPT_ENTRY_ITERATE(loc_cpu_entry,
1239 private->size, 1187 private->size,
1240 add_counter_to_entry, 1188 add_counter_to_entry,
1241 paddc, 1189 paddc,
1242 &i); 1190 &i);
1243 preempt_enable(); 1191 xt_info_wrunlock(curcpu);
1244 unlock_up_free: 1192 unlock_up_free:
1245 mutex_unlock(&t->lock); 1193 local_bh_enable();
1246
1247 xt_table_unlock(t); 1194 xt_table_unlock(t);
1248 module_put(t->me); 1195 module_put(t->me);
1249 free: 1196 free:
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 810c0b62c7d4..2ec8d7290c40 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -338,10 +338,9 @@ ipt_do_table(struct sk_buff *skb,
338 tgpar.hooknum = hook; 338 tgpar.hooknum = hook;
339 339
340 IP_NF_ASSERT(table->valid_hooks & (1 << hook)); 340 IP_NF_ASSERT(table->valid_hooks & (1 << hook));
341 341 xt_info_rdlock_bh();
342 rcu_read_lock_bh(); 342 private = table->private;
343 private = rcu_dereference(table->private); 343 table_base = private->entries[smp_processor_id()];
344 table_base = rcu_dereference(private->entries[smp_processor_id()]);
345 344
346 e = get_entry(table_base, private->hook_entry[hook]); 345 e = get_entry(table_base, private->hook_entry[hook]);
347 346
@@ -436,8 +435,7 @@ ipt_do_table(struct sk_buff *skb,
436 e = (void *)e + e->next_offset; 435 e = (void *)e + e->next_offset;
437 } 436 }
438 } while (!hotdrop); 437 } while (!hotdrop);
439 438 xt_info_rdunlock_bh();
440 rcu_read_unlock_bh();
441 439
442#ifdef DEBUG_ALLOW_ALL 440#ifdef DEBUG_ALLOW_ALL
443 return NF_ACCEPT; 441 return NF_ACCEPT;
@@ -896,10 +894,13 @@ get_counters(const struct xt_table_info *t,
896 894
897 /* Instead of clearing (by a previous call to memset()) 895 /* Instead of clearing (by a previous call to memset())
898 * the counters and using adds, we set the counters 896 * the counters and using adds, we set the counters
899 * with data used by 'current' CPU 897 * with data used by 'current' CPU.
900 * We dont care about preemption here. 898 *
899 * Bottom half has to be disabled to prevent deadlock
900 * if new softirq were to run and call ipt_do_table
901 */ 901 */
902 curcpu = raw_smp_processor_id(); 902 local_bh_disable();
903 curcpu = smp_processor_id();
903 904
904 i = 0; 905 i = 0;
905 IPT_ENTRY_ITERATE(t->entries[curcpu], 906 IPT_ENTRY_ITERATE(t->entries[curcpu],
@@ -912,74 +913,22 @@ get_counters(const struct xt_table_info *t,
912 if (cpu == curcpu) 913 if (cpu == curcpu)
913 continue; 914 continue;
914 i = 0; 915 i = 0;
916 xt_info_wrlock(cpu);
915 IPT_ENTRY_ITERATE(t->entries[cpu], 917 IPT_ENTRY_ITERATE(t->entries[cpu],
916 t->size, 918 t->size,
917 add_entry_to_counter, 919 add_entry_to_counter,
918 counters, 920 counters,
919 &i); 921 &i);
922 xt_info_wrunlock(cpu);
920 } 923 }
921
922}
923
924/* We're lazy, and add to the first CPU; overflow works its fey magic
925 * and everything is OK. */
926static int
927add_counter_to_entry(struct ipt_entry *e,
928 const struct xt_counters addme[],
929 unsigned int *i)
930{
931 ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
932
933 (*i)++;
934 return 0;
935}
936
937/* Take values from counters and add them back onto the current cpu */
938static void put_counters(struct xt_table_info *t,
939 const struct xt_counters counters[])
940{
941 unsigned int i, cpu;
942
943 local_bh_disable();
944 cpu = smp_processor_id();
945 i = 0;
946 IPT_ENTRY_ITERATE(t->entries[cpu],
947 t->size,
948 add_counter_to_entry,
949 counters,
950 &i);
951 local_bh_enable(); 924 local_bh_enable();
952} 925}
953 926
954
955static inline int
956zero_entry_counter(struct ipt_entry *e, void *arg)
957{
958 e->counters.bcnt = 0;
959 e->counters.pcnt = 0;
960 return 0;
961}
962
963static void
964clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
965{
966 unsigned int cpu;
967 const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
968
969 memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
970 for_each_possible_cpu(cpu) {
971 memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
972 IPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
973 zero_entry_counter, NULL);
974 }
975}
976
977static struct xt_counters * alloc_counters(struct xt_table *table) 927static struct xt_counters * alloc_counters(struct xt_table *table)
978{ 928{
979 unsigned int countersize; 929 unsigned int countersize;
980 struct xt_counters *counters; 930 struct xt_counters *counters;
981 struct xt_table_info *private = table->private; 931 struct xt_table_info *private = table->private;
982 struct xt_table_info *info;
983 932
984 /* We need atomic snapshot of counters: rest doesn't change 933 /* We need atomic snapshot of counters: rest doesn't change
985 (other than comefrom, which userspace doesn't care 934 (other than comefrom, which userspace doesn't care
@@ -988,30 +937,11 @@ static struct xt_counters * alloc_counters(struct xt_table *table)
988 counters = vmalloc_node(countersize, numa_node_id()); 937 counters = vmalloc_node(countersize, numa_node_id());
989 938
990 if (counters == NULL) 939 if (counters == NULL)
991 goto nomem; 940 return ERR_PTR(-ENOMEM);
992 941
993 info = xt_alloc_table_info(private->size); 942 get_counters(private, counters);
994 if (!info)
995 goto free_counters;
996
997 clone_counters(info, private);
998
999 mutex_lock(&table->lock);
1000 xt_table_entry_swap_rcu(private, info);
1001 synchronize_net(); /* Wait until smoke has cleared */
1002
1003 get_counters(info, counters);
1004 put_counters(private, counters);
1005 mutex_unlock(&table->lock);
1006
1007 xt_free_table_info(info);
1008 943
1009 return counters; 944 return counters;
1010
1011 free_counters:
1012 vfree(counters);
1013 nomem:
1014 return ERR_PTR(-ENOMEM);
1015} 945}
1016 946
1017static int 947static int
@@ -1306,8 +1236,9 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
1306 (newinfo->number <= oldinfo->initial_entries)) 1236 (newinfo->number <= oldinfo->initial_entries))
1307 module_put(t->me); 1237 module_put(t->me);
1308 1238
1309 /* Get the old counters. */ 1239 /* Get the old counters, and synchronize with replace */
1310 get_counters(oldinfo, counters); 1240 get_counters(oldinfo, counters);
1241
1311 /* Decrease module usage counts and free resource */ 1242 /* Decrease module usage counts and free resource */
1312 loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; 1243 loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
1313 IPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry, 1244 IPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,
@@ -1377,11 +1308,23 @@ do_replace(struct net *net, void __user *user, unsigned int len)
1377 return ret; 1308 return ret;
1378} 1309}
1379 1310
1311/* We're lazy, and add to the first CPU; overflow works its fey magic
1312 * and everything is OK. */
1313static int
1314add_counter_to_entry(struct ipt_entry *e,
1315 const struct xt_counters addme[],
1316 unsigned int *i)
1317{
1318 ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
1319
1320 (*i)++;
1321 return 0;
1322}
1380 1323
1381static int 1324static int
1382do_add_counters(struct net *net, void __user *user, unsigned int len, int compat) 1325do_add_counters(struct net *net, void __user *user, unsigned int len, int compat)
1383{ 1326{
1384 unsigned int i; 1327 unsigned int i, curcpu;
1385 struct xt_counters_info tmp; 1328 struct xt_counters_info tmp;
1386 struct xt_counters *paddc; 1329 struct xt_counters *paddc;
1387 unsigned int num_counters; 1330 unsigned int num_counters;
@@ -1437,25 +1380,26 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, int compat
1437 goto free; 1380 goto free;
1438 } 1381 }
1439 1382
1440 mutex_lock(&t->lock); 1383 local_bh_disable();
1441 private = t->private; 1384 private = t->private;
1442 if (private->number != num_counters) { 1385 if (private->number != num_counters) {
1443 ret = -EINVAL; 1386 ret = -EINVAL;
1444 goto unlock_up_free; 1387 goto unlock_up_free;
1445 } 1388 }
1446 1389
1447 preempt_disable();
1448 i = 0; 1390 i = 0;
1449 /* Choose the copy that is on our node */ 1391 /* Choose the copy that is on our node */
1450 loc_cpu_entry = private->entries[raw_smp_processor_id()]; 1392 curcpu = smp_processor_id();
1393 loc_cpu_entry = private->entries[curcpu];
1394 xt_info_wrlock(curcpu);
1451 IPT_ENTRY_ITERATE(loc_cpu_entry, 1395 IPT_ENTRY_ITERATE(loc_cpu_entry,
1452 private->size, 1396 private->size,
1453 add_counter_to_entry, 1397 add_counter_to_entry,
1454 paddc, 1398 paddc,
1455 &i); 1399 &i);
1456 preempt_enable(); 1400 xt_info_wrunlock(curcpu);
1457 unlock_up_free: 1401 unlock_up_free:
1458 mutex_unlock(&t->lock); 1402 local_bh_enable();
1459 xt_table_unlock(t); 1403 xt_table_unlock(t);
1460 module_put(t->me); 1404 module_put(t->me);
1461 free: 1405 free:
diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
index fe65187810f0..3229e0a81ba6 100644
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ b/net/ipv4/netfilter/nf_nat_core.c
@@ -211,7 +211,8 @@ find_best_ips_proto(struct nf_conntrack_tuple *tuple,
211 minip = ntohl(range->min_ip); 211 minip = ntohl(range->min_ip);
212 maxip = ntohl(range->max_ip); 212 maxip = ntohl(range->max_ip);
213 j = jhash_2words((__force u32)tuple->src.u3.ip, 213 j = jhash_2words((__force u32)tuple->src.u3.ip,
214 (__force u32)tuple->dst.u3.ip, 0); 214 range->flags & IP_NAT_RANGE_PERSISTENT ?
215 (__force u32)tuple->dst.u3.ip : 0, 0);
215 j = ((u64)j * (maxip - minip + 1)) >> 32; 216 j = ((u64)j * (maxip - minip + 1)) >> 32;
216 *var_ipp = htonl(minip + j); 217 *var_ipp = htonl(minip + j);
217} 218}
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index c40debe51b38..28205e5bfa9b 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -784,8 +784,8 @@ static void rt_check_expire(void)
784{ 784{
785 static unsigned int rover; 785 static unsigned int rover;
786 unsigned int i = rover, goal; 786 unsigned int i = rover, goal;
787 struct rtable *rth, **rthp; 787 struct rtable *rth, *aux, **rthp;
788 unsigned long length = 0, samples = 0; 788 unsigned long samples = 0;
789 unsigned long sum = 0, sum2 = 0; 789 unsigned long sum = 0, sum2 = 0;
790 u64 mult; 790 u64 mult;
791 791
@@ -795,9 +795,9 @@ static void rt_check_expire(void)
795 goal = (unsigned int)mult; 795 goal = (unsigned int)mult;
796 if (goal > rt_hash_mask) 796 if (goal > rt_hash_mask)
797 goal = rt_hash_mask + 1; 797 goal = rt_hash_mask + 1;
798 length = 0;
799 for (; goal > 0; goal--) { 798 for (; goal > 0; goal--) {
800 unsigned long tmo = ip_rt_gc_timeout; 799 unsigned long tmo = ip_rt_gc_timeout;
800 unsigned long length;
801 801
802 i = (i + 1) & rt_hash_mask; 802 i = (i + 1) & rt_hash_mask;
803 rthp = &rt_hash_table[i].chain; 803 rthp = &rt_hash_table[i].chain;
@@ -809,8 +809,10 @@ static void rt_check_expire(void)
809 809
810 if (*rthp == NULL) 810 if (*rthp == NULL)
811 continue; 811 continue;
812 length = 0;
812 spin_lock_bh(rt_hash_lock_addr(i)); 813 spin_lock_bh(rt_hash_lock_addr(i));
813 while ((rth = *rthp) != NULL) { 814 while ((rth = *rthp) != NULL) {
815 prefetch(rth->u.dst.rt_next);
814 if (rt_is_expired(rth)) { 816 if (rt_is_expired(rth)) {
815 *rthp = rth->u.dst.rt_next; 817 *rthp = rth->u.dst.rt_next;
816 rt_free(rth); 818 rt_free(rth);
@@ -819,33 +821,30 @@ static void rt_check_expire(void)
819 if (rth->u.dst.expires) { 821 if (rth->u.dst.expires) {
820 /* Entry is expired even if it is in use */ 822 /* Entry is expired even if it is in use */
821 if (time_before_eq(jiffies, rth->u.dst.expires)) { 823 if (time_before_eq(jiffies, rth->u.dst.expires)) {
824nofree:
822 tmo >>= 1; 825 tmo >>= 1;
823 rthp = &rth->u.dst.rt_next; 826 rthp = &rth->u.dst.rt_next;
824 /* 827 /*
825 * Only bump our length if the hash 828 * We only count entries on
826 * inputs on entries n and n+1 are not
827 * the same, we only count entries on
828 * a chain with equal hash inputs once 829 * a chain with equal hash inputs once
829 * so that entries for different QOS 830 * so that entries for different QOS
830 * levels, and other non-hash input 831 * levels, and other non-hash input
831 * attributes don't unfairly skew 832 * attributes don't unfairly skew
832 * the length computation 833 * the length computation
833 */ 834 */
834 if ((*rthp == NULL) || 835 for (aux = rt_hash_table[i].chain;;) {
835 !compare_hash_inputs(&(*rthp)->fl, 836 if (aux == rth) {
836 &rth->fl)) 837 length += ONE;
837 length += ONE; 838 break;
839 }
840 if (compare_hash_inputs(&aux->fl, &rth->fl))
841 break;
842 aux = aux->u.dst.rt_next;
843 }
838 continue; 844 continue;
839 } 845 }
840 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) { 846 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
841 tmo >>= 1; 847 goto nofree;
842 rthp = &rth->u.dst.rt_next;
843 if ((*rthp == NULL) ||
844 !compare_hash_inputs(&(*rthp)->fl,
845 &rth->fl))
846 length += ONE;
847 continue;
848 }
849 848
850 /* Cleanup aged off entries. */ 849 /* Cleanup aged off entries. */
851 *rthp = rth->u.dst.rt_next; 850 *rthp = rth->u.dst.rt_next;
@@ -1068,7 +1067,6 @@ out: return 0;
1068static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp) 1067static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
1069{ 1068{
1070 struct rtable *rth, **rthp; 1069 struct rtable *rth, **rthp;
1071 struct rtable *rthi;
1072 unsigned long now; 1070 unsigned long now;
1073 struct rtable *cand, **candp; 1071 struct rtable *cand, **candp;
1074 u32 min_score; 1072 u32 min_score;
@@ -1088,7 +1086,6 @@ restart:
1088 } 1086 }
1089 1087
1090 rthp = &rt_hash_table[hash].chain; 1088 rthp = &rt_hash_table[hash].chain;
1091 rthi = NULL;
1092 1089
1093 spin_lock_bh(rt_hash_lock_addr(hash)); 1090 spin_lock_bh(rt_hash_lock_addr(hash));
1094 while ((rth = *rthp) != NULL) { 1091 while ((rth = *rthp) != NULL) {
@@ -1134,17 +1131,6 @@ restart:
1134 chain_length++; 1131 chain_length++;
1135 1132
1136 rthp = &rth->u.dst.rt_next; 1133 rthp = &rth->u.dst.rt_next;
1137
1138 /*
1139 * check to see if the next entry in the chain
1140 * contains the same hash input values as rt. If it does
1141 * This is where we will insert into the list, instead of
1142 * at the head. This groups entries that differ by aspects not
1143 * relvant to the hash function together, which we use to adjust
1144 * our chain length
1145 */
1146 if (*rthp && compare_hash_inputs(&(*rthp)->fl, &rt->fl))
1147 rthi = rth;
1148 } 1134 }
1149 1135
1150 if (cand) { 1136 if (cand) {
@@ -1205,10 +1191,7 @@ restart:
1205 } 1191 }
1206 } 1192 }
1207 1193
1208 if (rthi) 1194 rt->u.dst.rt_next = rt_hash_table[hash].chain;
1209 rt->u.dst.rt_next = rthi->u.dst.rt_next;
1210 else
1211 rt->u.dst.rt_next = rt_hash_table[hash].chain;
1212 1195
1213#if RT_CACHE_DEBUG >= 2 1196#if RT_CACHE_DEBUG >= 2
1214 if (rt->u.dst.rt_next) { 1197 if (rt->u.dst.rt_next) {
@@ -1224,10 +1207,7 @@ restart:
1224 * previous writes to rt are comitted to memory 1207 * previous writes to rt are comitted to memory
1225 * before making rt visible to other CPUS. 1208 * before making rt visible to other CPUS.
1226 */ 1209 */
1227 if (rthi) 1210 rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1228 rcu_assign_pointer(rthi->u.dst.rt_next, rt);
1229 else
1230 rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1231 1211
1232 spin_unlock_bh(rt_hash_lock_addr(hash)); 1212 spin_unlock_bh(rt_hash_lock_addr(hash));
1233 *rp = rt; 1213 *rp = rt;
@@ -3397,7 +3377,7 @@ int __init ip_rt_init(void)
3397 0, 3377 0,
3398 &rt_hash_log, 3378 &rt_hash_log,
3399 &rt_hash_mask, 3379 &rt_hash_mask,
3400 0); 3380 rhash_entries ? 0 : 512 * 1024);
3401 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket)); 3381 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3402 rt_hash_lock_init(); 3382 rt_hash_lock_init();
3403 3383
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index fafbec8b073e..7a0f0b27bf1f 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1321,6 +1321,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1321 struct task_struct *user_recv = NULL; 1321 struct task_struct *user_recv = NULL;
1322 int copied_early = 0; 1322 int copied_early = 0;
1323 struct sk_buff *skb; 1323 struct sk_buff *skb;
1324 u32 urg_hole = 0;
1324 1325
1325 lock_sock(sk); 1326 lock_sock(sk);
1326 1327
@@ -1532,7 +1533,8 @@ do_prequeue:
1532 } 1533 }
1533 } 1534 }
1534 } 1535 }
1535 if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) { 1536 if ((flags & MSG_PEEK) &&
1537 (peek_seq - copied - urg_hole != tp->copied_seq)) {
1536 if (net_ratelimit()) 1538 if (net_ratelimit())
1537 printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n", 1539 printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
1538 current->comm, task_pid_nr(current)); 1540 current->comm, task_pid_nr(current));
@@ -1553,6 +1555,7 @@ do_prequeue:
1553 if (!urg_offset) { 1555 if (!urg_offset) {
1554 if (!sock_flag(sk, SOCK_URGINLINE)) { 1556 if (!sock_flag(sk, SOCK_URGINLINE)) {
1555 ++*seq; 1557 ++*seq;
1558 urg_hole++;
1556 offset++; 1559 offset++;
1557 used--; 1560 used--;
1558 if (!used) 1561 if (!used)
@@ -2511,6 +2514,7 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2511 struct sk_buff *p; 2514 struct sk_buff *p;
2512 struct tcphdr *th; 2515 struct tcphdr *th;
2513 struct tcphdr *th2; 2516 struct tcphdr *th2;
2517 unsigned int len;
2514 unsigned int thlen; 2518 unsigned int thlen;
2515 unsigned int flags; 2519 unsigned int flags;
2516 unsigned int mss = 1; 2520 unsigned int mss = 1;
@@ -2531,6 +2535,7 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2531 2535
2532 skb_gro_pull(skb, thlen); 2536 skb_gro_pull(skb, thlen);
2533 2537
2538 len = skb_gro_len(skb);
2534 flags = tcp_flag_word(th); 2539 flags = tcp_flag_word(th);
2535 2540
2536 for (; (p = *head); head = &p->next) { 2541 for (; (p = *head); head = &p->next) {
@@ -2561,7 +2566,7 @@ found:
2561 2566
2562 mss = skb_shinfo(p)->gso_size; 2567 mss = skb_shinfo(p)->gso_size;
2563 2568
2564 flush |= (skb_gro_len(skb) > mss) | !skb_gro_len(skb); 2569 flush |= (len > mss) | !len;
2565 flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq); 2570 flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq);
2566 2571
2567 if (flush || skb_gro_receive(head, skb)) { 2572 if (flush || skb_gro_receive(head, skb)) {
@@ -2574,7 +2579,7 @@ found:
2574 tcp_flag_word(th2) |= flags & (TCP_FLAG_FIN | TCP_FLAG_PSH); 2579 tcp_flag_word(th2) |= flags & (TCP_FLAG_FIN | TCP_FLAG_PSH);
2575 2580
2576out_check_final: 2581out_check_final:
2577 flush = skb_gro_len(skb) < mss; 2582 flush = len < mss;
2578 flush |= flags & (TCP_FLAG_URG | TCP_FLAG_PSH | TCP_FLAG_RST | 2583 flush |= flags & (TCP_FLAG_URG | TCP_FLAG_PSH | TCP_FLAG_RST |
2579 TCP_FLAG_SYN | TCP_FLAG_FIN); 2584 TCP_FLAG_SYN | TCP_FLAG_FIN);
2580 2585
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index c96a6bb25430..eec3e6f9956c 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -597,16 +597,6 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
597 tcp_grow_window(sk, skb); 597 tcp_grow_window(sk, skb);
598} 598}
599 599
600static u32 tcp_rto_min(struct sock *sk)
601{
602 struct dst_entry *dst = __sk_dst_get(sk);
603 u32 rto_min = TCP_RTO_MIN;
604
605 if (dst && dst_metric_locked(dst, RTAX_RTO_MIN))
606 rto_min = dst_metric_rtt(dst, RTAX_RTO_MIN);
607 return rto_min;
608}
609
610/* Called to compute a smoothed rtt estimate. The data fed to this 600/* Called to compute a smoothed rtt estimate. The data fed to this
611 * routine either comes from timestamps, or from segments that were 601 * routine either comes from timestamps, or from segments that were
612 * known _not_ to have been retransmitted [see Karn/Partridge 602 * known _not_ to have been retransmitted [see Karn/Partridge
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 53300fa2359f..59aec609cec6 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -778,7 +778,7 @@ static void tcp_adjust_pcount(struct sock *sk, struct sk_buff *skb, int decr)
778 778
779 if (tp->lost_skb_hint && 779 if (tp->lost_skb_hint &&
780 before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->lost_skb_hint)->seq) && 780 before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->lost_skb_hint)->seq) &&
781 (tcp_is_fack(tp) || TCP_SKB_CB(skb)->sacked)) 781 (tcp_is_fack(tp) || (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)))
782 tp->lost_cnt_hint -= decr; 782 tp->lost_cnt_hint -= decr;
783 783
784 tcp_verify_left_out(tp); 784 tcp_verify_left_out(tp);
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index a453aac91bd3..c6743eec9b7d 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -158,6 +158,11 @@ void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event)
158} 158}
159EXPORT_SYMBOL_GPL(tcp_vegas_cwnd_event); 159EXPORT_SYMBOL_GPL(tcp_vegas_cwnd_event);
160 160
161static inline u32 tcp_vegas_ssthresh(struct tcp_sock *tp)
162{
163 return min(tp->snd_ssthresh, tp->snd_cwnd-1);
164}
165
161static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) 166static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
162{ 167{
163 struct tcp_sock *tp = tcp_sk(sk); 168 struct tcp_sock *tp = tcp_sk(sk);
@@ -221,11 +226,10 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
221 */ 226 */
222 diff = tp->snd_cwnd * (rtt-vegas->baseRTT) / vegas->baseRTT; 227 diff = tp->snd_cwnd * (rtt-vegas->baseRTT) / vegas->baseRTT;
223 228
224 if (diff > gamma && tp->snd_ssthresh > 2 ) { 229 if (diff > gamma && tp->snd_cwnd <= tp->snd_ssthresh) {
225 /* Going too fast. Time to slow down 230 /* Going too fast. Time to slow down
226 * and switch to congestion avoidance. 231 * and switch to congestion avoidance.
227 */ 232 */
228 tp->snd_ssthresh = 2;
229 233
230 /* Set cwnd to match the actual rate 234 /* Set cwnd to match the actual rate
231 * exactly: 235 * exactly:
@@ -235,6 +239,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
235 * utilization. 239 * utilization.
236 */ 240 */
237 tp->snd_cwnd = min(tp->snd_cwnd, (u32)target_cwnd+1); 241 tp->snd_cwnd = min(tp->snd_cwnd, (u32)target_cwnd+1);
242 tp->snd_ssthresh = tcp_vegas_ssthresh(tp);
238 243
239 } else if (tp->snd_cwnd <= tp->snd_ssthresh) { 244 } else if (tp->snd_cwnd <= tp->snd_ssthresh) {
240 /* Slow start. */ 245 /* Slow start. */
@@ -250,6 +255,8 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
250 * we slow down. 255 * we slow down.
251 */ 256 */
252 tp->snd_cwnd--; 257 tp->snd_cwnd--;
258 tp->snd_ssthresh
259 = tcp_vegas_ssthresh(tp);
253 } else if (diff < alpha) { 260 } else if (diff < alpha) {
254 /* We don't have enough extra packets 261 /* We don't have enough extra packets
255 * in the network, so speed up. 262 * in the network, so speed up.