Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/af_inet.c            2
-rw-r--r--  net/ipv4/cipso_ipv4.c        51
-rw-r--r--  net/ipv4/devinet.c           16
-rw-r--r--  net/ipv4/fib_frontend.c      29
-rw-r--r--  net/ipv4/fib_lookup.h         1
-rw-r--r--  net/ipv4/fib_rules.c         22
-rw-r--r--  net/ipv4/fib_semantics.c     35
-rw-r--r--  net/ipv4/fib_trie.c        1960
-rw-r--r--  net/ipv4/fou.c               32
-rw-r--r--  net/ipv4/geneve.c           211
-rw-r--r--  net/ipv4/icmp.c              17
-rw-r--r--  net/ipv4/inet_diag.c          9
-rw-r--r--  net/ipv4/ip_forward.c         3
-rw-r--r--  net/ipv4/ip_gre.c            15
-rw-r--r--  net/ipv4/ip_output.c         35
-rw-r--r--  net/ipv4/ip_sockglue.c      123
-rw-r--r--  net/ipv4/ip_tunnel.c          8
-rw-r--r--  net/ipv4/ip_vti.c             1
-rw-r--r--  net/ipv4/ipconfig.c           6
-rw-r--r--  net/ipv4/ipip.c              13
-rw-r--r--  net/ipv4/ipmr.c               3
-rw-r--r--  net/ipv4/ping.c              22
-rw-r--r--  net/ipv4/proc.c               6
-rw-r--r--  net/ipv4/raw.c                7
-rw-r--r--  net/ipv4/route.c             63
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c   35
-rw-r--r--  net/ipv4/tcp.c              233
-rw-r--r--  net/ipv4/tcp_bic.c            2
-rw-r--r--  net/ipv4/tcp_cong.c         153
-rw-r--r--  net/ipv4/tcp_cubic.c         39
-rw-r--r--  net/ipv4/tcp_fastopen.c      13
-rw-r--r--  net/ipv4/tcp_input.c         88
-rw-r--r--  net/ipv4/tcp_ipv4.c          40
-rw-r--r--  net/ipv4/tcp_memcontrol.c     2
-rw-r--r--  net/ipv4/tcp_metrics.c        3
-rw-r--r--  net/ipv4/tcp_minisocks.c     66
-rw-r--r--  net/ipv4/tcp_output.c        50
-rw-r--r--  net/ipv4/tcp_scalable.c       3
-rw-r--r--  net/ipv4/tcp_timer.c          7
-rw-r--r--  net/ipv4/tcp_veno.c           2
-rw-r--r--  net/ipv4/tcp_yeah.c           2
-rw-r--r--  net/ipv4/udp.c                4
-rw-r--r--  net/ipv4/udp_diag.c           4
-rw-r--r--  net/ipv4/udp_offload.c        7
-rw-r--r--  net/ipv4/udp_tunnel.c        14
45 files changed, 1841 insertions, 1616 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index a44773c8346c..d2e49baaff63 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -395,8 +395,6 @@ int inet_release(struct socket *sock)
395 if (sk) { 395 if (sk) {
396 long timeout; 396 long timeout;
397 397
398 sock_rps_reset_flow(sk);
399
400 /* Applications forget to leave groups before exiting */ 398 /* Applications forget to leave groups before exiting */
401 ip_mc_drop_socket(sk); 399 ip_mc_drop_socket(sk);
402 400
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 5160c710f2eb..e361ea6f3fc8 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -378,20 +378,18 @@ static int cipso_v4_cache_check(const unsigned char *key,
378 * negative values on failure. 378 * negative values on failure.
379 * 379 *
380 */ 380 */
381int cipso_v4_cache_add(const struct sk_buff *skb, 381int cipso_v4_cache_add(const unsigned char *cipso_ptr,
382 const struct netlbl_lsm_secattr *secattr) 382 const struct netlbl_lsm_secattr *secattr)
383{ 383{
384 int ret_val = -EPERM; 384 int ret_val = -EPERM;
385 u32 bkt; 385 u32 bkt;
386 struct cipso_v4_map_cache_entry *entry = NULL; 386 struct cipso_v4_map_cache_entry *entry = NULL;
387 struct cipso_v4_map_cache_entry *old_entry = NULL; 387 struct cipso_v4_map_cache_entry *old_entry = NULL;
388 unsigned char *cipso_ptr;
389 u32 cipso_ptr_len; 388 u32 cipso_ptr_len;
390 389
391 if (!cipso_v4_cache_enabled || cipso_v4_cache_bucketsize <= 0) 390 if (!cipso_v4_cache_enabled || cipso_v4_cache_bucketsize <= 0)
392 return 0; 391 return 0;
393 392
394 cipso_ptr = CIPSO_V4_OPTPTR(skb);
395 cipso_ptr_len = cipso_ptr[1]; 393 cipso_ptr_len = cipso_ptr[1];
396 394
397 entry = kzalloc(sizeof(*entry), GFP_ATOMIC); 395 entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
@@ -1579,6 +1577,33 @@ static int cipso_v4_parsetag_loc(const struct cipso_v4_doi *doi_def,
1579} 1577}
1580 1578
1581/** 1579/**
1580 * cipso_v4_optptr - Find the CIPSO option in the packet
1581 * @skb: the packet
1582 *
1583 * Description:
1584 * Parse the packet's IP header looking for a CIPSO option. Returns a pointer
 1585 * to the start of the CIPSO option on success, NULL if one is not found.
1586 *
1587 */
1588unsigned char *cipso_v4_optptr(const struct sk_buff *skb)
1589{
1590 const struct iphdr *iph = ip_hdr(skb);
1591 unsigned char *optptr = (unsigned char *)&(ip_hdr(skb)[1]);
1592 int optlen;
1593 int taglen;
1594
1595 for (optlen = iph->ihl*4 - sizeof(struct iphdr); optlen > 0; ) {
1596 if (optptr[0] == IPOPT_CIPSO)
1597 return optptr;
1598 taglen = optptr[1];
1599 optlen -= taglen;
1600 optptr += taglen;
1601 }
1602
1603 return NULL;
1604}
1605
1606/**
1582 * cipso_v4_validate - Validate a CIPSO option 1607 * cipso_v4_validate - Validate a CIPSO option
1583 * @option: the start of the option, on error it is set to point to the error 1608 * @option: the start of the option, on error it is set to point to the error
1584 * 1609 *
@@ -2119,8 +2144,8 @@ void cipso_v4_req_delattr(struct request_sock *req)
2119 * on success and negative values on failure. 2144 * on success and negative values on failure.
2120 * 2145 *
2121 */ 2146 */
2122static int cipso_v4_getattr(const unsigned char *cipso, 2147int cipso_v4_getattr(const unsigned char *cipso,
2123 struct netlbl_lsm_secattr *secattr) 2148 struct netlbl_lsm_secattr *secattr)
2124{ 2149{
2125 int ret_val = -ENOMSG; 2150 int ret_val = -ENOMSG;
2126 u32 doi; 2151 u32 doi;
@@ -2305,22 +2330,6 @@ int cipso_v4_skbuff_delattr(struct sk_buff *skb)
2305 return 0; 2330 return 0;
2306} 2331}
2307 2332
2308/**
2309 * cipso_v4_skbuff_getattr - Get the security attributes from the CIPSO option
2310 * @skb: the packet
2311 * @secattr: the security attributes
2312 *
2313 * Description:
2314 * Parse the given packet's CIPSO option and return the security attributes.
2315 * Returns zero on success and negative values on failure.
2316 *
2317 */
2318int cipso_v4_skbuff_getattr(const struct sk_buff *skb,
2319 struct netlbl_lsm_secattr *secattr)
2320{
2321 return cipso_v4_getattr(CIPSO_V4_OPTPTR(skb), secattr);
2322}
2323
2324/* 2333/*
2325 * Setup Functions 2334 * Setup Functions
2326 */ 2335 */
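
A minimal sketch, not part of the patch, of how a caller such as netlabel might combine the
newly exported cipso_v4_optptr()/cipso_v4_getattr() pair that replaces the removed
cipso_v4_skbuff_getattr(); the wrapper name and error codes below are illustrative:

	/* hypothetical wrapper: locate the CIPSO option once, then reuse the pointer */
	static int example_skb_getattr(const struct sk_buff *skb,
				       struct netlbl_lsm_secattr *secattr)
	{
		unsigned char *ptr = cipso_v4_optptr(skb);   /* NULL if no CIPSO option */

		if (!ptr)
			return -ENOMSG;
		if (cipso_v4_getattr(ptr, secattr) != 0)     /* fill secattr from the option */
			return -ENOMSG;
		cipso_v4_cache_add(ptr, secattr);            /* optionally warm the mapping cache */
		return 0;
	}
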
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 214882e7d6de..f0b4a31d7bd6 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1522,7 +1522,8 @@ static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
1522 preferred, valid)) 1522 preferred, valid))
1523 goto nla_put_failure; 1523 goto nla_put_failure;
1524 1524
1525 return nlmsg_end(skb, nlh); 1525 nlmsg_end(skb, nlh);
1526 return 0;
1526 1527
1527nla_put_failure: 1528nla_put_failure:
1528 nlmsg_cancel(skb, nlh); 1529 nlmsg_cancel(skb, nlh);
@@ -1566,7 +1567,7 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
1566 if (inet_fill_ifaddr(skb, ifa, 1567 if (inet_fill_ifaddr(skb, ifa,
1567 NETLINK_CB(cb->skb).portid, 1568 NETLINK_CB(cb->skb).portid,
1568 cb->nlh->nlmsg_seq, 1569 cb->nlh->nlmsg_seq,
1569 RTM_NEWADDR, NLM_F_MULTI) <= 0) { 1570 RTM_NEWADDR, NLM_F_MULTI) < 0) {
1570 rcu_read_unlock(); 1571 rcu_read_unlock();
1571 goto done; 1572 goto done;
1572 } 1573 }
@@ -1749,7 +1750,8 @@ static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex,
1749 IPV4_DEVCONF(*devconf, PROXY_ARP)) < 0) 1750 IPV4_DEVCONF(*devconf, PROXY_ARP)) < 0)
1750 goto nla_put_failure; 1751 goto nla_put_failure;
1751 1752
1752 return nlmsg_end(skb, nlh); 1753 nlmsg_end(skb, nlh);
1754 return 0;
1753 1755
1754nla_put_failure: 1756nla_put_failure:
1755 nlmsg_cancel(skb, nlh); 1757 nlmsg_cancel(skb, nlh);
@@ -1881,7 +1883,7 @@ static int inet_netconf_dump_devconf(struct sk_buff *skb,
1881 cb->nlh->nlmsg_seq, 1883 cb->nlh->nlmsg_seq,
1882 RTM_NEWNETCONF, 1884 RTM_NEWNETCONF,
1883 NLM_F_MULTI, 1885 NLM_F_MULTI,
1884 -1) <= 0) { 1886 -1) < 0) {
1885 rcu_read_unlock(); 1887 rcu_read_unlock();
1886 goto done; 1888 goto done;
1887 } 1889 }
@@ -1897,7 +1899,7 @@ cont:
1897 NETLINK_CB(cb->skb).portid, 1899 NETLINK_CB(cb->skb).portid,
1898 cb->nlh->nlmsg_seq, 1900 cb->nlh->nlmsg_seq,
1899 RTM_NEWNETCONF, NLM_F_MULTI, 1901 RTM_NEWNETCONF, NLM_F_MULTI,
1900 -1) <= 0) 1902 -1) < 0)
1901 goto done; 1903 goto done;
1902 else 1904 else
1903 h++; 1905 h++;
@@ -1908,7 +1910,7 @@ cont:
1908 NETLINK_CB(cb->skb).portid, 1910 NETLINK_CB(cb->skb).portid,
1909 cb->nlh->nlmsg_seq, 1911 cb->nlh->nlmsg_seq,
1910 RTM_NEWNETCONF, NLM_F_MULTI, 1912 RTM_NEWNETCONF, NLM_F_MULTI,
1911 -1) <= 0) 1913 -1) < 0)
1912 goto done; 1914 goto done;
1913 else 1915 else
1914 h++; 1916 h++;
@@ -2320,7 +2322,7 @@ static __net_initdata struct pernet_operations devinet_ops = {
2320 .exit = devinet_exit_net, 2322 .exit = devinet_exit_net,
2321}; 2323};
2322 2324
2323static struct rtnl_af_ops inet_af_ops = { 2325static struct rtnl_af_ops inet_af_ops __read_mostly = {
2324 .family = AF_INET, 2326 .family = AF_INET,
2325 .fill_link_af = inet_fill_link_af, 2327 .fill_link_af = inet_fill_link_af,
2326 .get_link_af_size = inet_get_link_af_size, 2328 .get_link_af_size = inet_get_link_af_size,
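
A minimal sketch (illustrative names and attribute) of the netlink convention this file is
moved to: fill functions now report only success or failure, so dump loops test for "< 0"
instead of "<= 0":

	static int example_fill(struct sk_buff *skb, struct nlmsghdr *nlh, u32 flags)
	{
		if (nla_put_u32(skb, IFA_FLAGS, flags))
			goto nla_put_failure;
		nlmsg_end(skb, nlh);      /* finalize; its length is no longer propagated */
		return 0;                 /* success is always 0 */
	nla_put_failure:
		nlmsg_cancel(skb, nlh);
		return -EMSGSIZE;
	}

On the dump side a full skb now surfaces as a negative return value, which is why the
"<= 0" checks above become "< 0".
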
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 23104a3f2924..57be71dd6a9e 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -67,7 +67,7 @@ static int __net_init fib4_rules_init(struct net *net)
67 return 0; 67 return 0;
68 68
69fail: 69fail:
70 kfree(local_table); 70 fib_free_table(local_table);
71 return -ENOMEM; 71 return -ENOMEM;
72} 72}
73#else 73#else
@@ -109,6 +109,7 @@ struct fib_table *fib_new_table(struct net *net, u32 id)
109 return tb; 109 return tb;
110} 110}
111 111
112/* caller must hold either rtnl or rcu read lock */
112struct fib_table *fib_get_table(struct net *net, u32 id) 113struct fib_table *fib_get_table(struct net *net, u32 id)
113{ 114{
114 struct fib_table *tb; 115 struct fib_table *tb;
@@ -119,15 +120,11 @@ struct fib_table *fib_get_table(struct net *net, u32 id)
119 id = RT_TABLE_MAIN; 120 id = RT_TABLE_MAIN;
120 h = id & (FIB_TABLE_HASHSZ - 1); 121 h = id & (FIB_TABLE_HASHSZ - 1);
121 122
122 rcu_read_lock();
123 head = &net->ipv4.fib_table_hash[h]; 123 head = &net->ipv4.fib_table_hash[h];
124 hlist_for_each_entry_rcu(tb, head, tb_hlist) { 124 hlist_for_each_entry_rcu(tb, head, tb_hlist) {
125 if (tb->tb_id == id) { 125 if (tb->tb_id == id)
126 rcu_read_unlock();
127 return tb; 126 return tb;
128 }
129 } 127 }
130 rcu_read_unlock();
131 return NULL; 128 return NULL;
132} 129}
133#endif /* CONFIG_IP_MULTIPLE_TABLES */ 130#endif /* CONFIG_IP_MULTIPLE_TABLES */
@@ -167,16 +164,18 @@ static inline unsigned int __inet_dev_addr_type(struct net *net,
167 if (ipv4_is_multicast(addr)) 164 if (ipv4_is_multicast(addr))
168 return RTN_MULTICAST; 165 return RTN_MULTICAST;
169 166
167 rcu_read_lock();
168
170 local_table = fib_get_table(net, RT_TABLE_LOCAL); 169 local_table = fib_get_table(net, RT_TABLE_LOCAL);
171 if (local_table) { 170 if (local_table) {
172 ret = RTN_UNICAST; 171 ret = RTN_UNICAST;
173 rcu_read_lock();
174 if (!fib_table_lookup(local_table, &fl4, &res, FIB_LOOKUP_NOREF)) { 172 if (!fib_table_lookup(local_table, &fl4, &res, FIB_LOOKUP_NOREF)) {
175 if (!dev || dev == res.fi->fib_dev) 173 if (!dev || dev == res.fi->fib_dev)
176 ret = res.type; 174 ret = res.type;
177 } 175 }
178 rcu_read_unlock();
179 } 176 }
177
178 rcu_read_unlock();
180 return ret; 179 return ret;
181} 180}
182 181
@@ -919,7 +918,7 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
919#undef BRD1_OK 918#undef BRD1_OK
920} 919}
921 920
922static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb) 921static void nl_fib_lookup(struct net *net, struct fib_result_nl *frn)
923{ 922{
924 923
925 struct fib_result res; 924 struct fib_result res;
@@ -929,6 +928,11 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb)
929 .flowi4_tos = frn->fl_tos, 928 .flowi4_tos = frn->fl_tos,
930 .flowi4_scope = frn->fl_scope, 929 .flowi4_scope = frn->fl_scope,
931 }; 930 };
931 struct fib_table *tb;
932
933 rcu_read_lock();
934
935 tb = fib_get_table(net, frn->tb_id_in);
932 936
933 frn->err = -ENOENT; 937 frn->err = -ENOENT;
934 if (tb) { 938 if (tb) {
@@ -945,6 +949,8 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb)
945 } 949 }
946 local_bh_enable(); 950 local_bh_enable();
947 } 951 }
952
953 rcu_read_unlock();
948} 954}
949 955
950static void nl_fib_input(struct sk_buff *skb) 956static void nl_fib_input(struct sk_buff *skb)
@@ -952,7 +958,6 @@ static void nl_fib_input(struct sk_buff *skb)
952 struct net *net; 958 struct net *net;
953 struct fib_result_nl *frn; 959 struct fib_result_nl *frn;
954 struct nlmsghdr *nlh; 960 struct nlmsghdr *nlh;
955 struct fib_table *tb;
956 u32 portid; 961 u32 portid;
957 962
958 net = sock_net(skb->sk); 963 net = sock_net(skb->sk);
@@ -967,9 +972,7 @@ static void nl_fib_input(struct sk_buff *skb)
967 nlh = nlmsg_hdr(skb); 972 nlh = nlmsg_hdr(skb);
968 973
969 frn = (struct fib_result_nl *) nlmsg_data(nlh); 974 frn = (struct fib_result_nl *) nlmsg_data(nlh);
970 tb = fib_get_table(net, frn->tb_id_in); 975 nl_fib_lookup(net, frn);
971
972 nl_fib_lookup(frn, tb);
973 976
974 portid = NETLINK_CB(skb).portid; /* netlink portid */ 977 portid = NETLINK_CB(skb).portid; /* netlink portid */
975 NETLINK_CB(skb).portid = 0; /* from kernel */ 978 NETLINK_CB(skb).portid = 0; /* from kernel */
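
A minimal sketch, assuming net and fl4 are already set up, of the caller contract after this
change: fib_get_table() no longer takes the RCU read lock itself, so a single RCU read-side
section now covers both the table lookup and the route lookup:

	struct fib_table *tb;
	struct fib_result res;
	int err = -ENETUNREACH;                        /* illustrative default */

	rcu_read_lock();
	tb = fib_get_table(net, RT_TABLE_MAIN);        /* caller must hold rcu or rtnl */
	if (tb)
		err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
	rcu_read_unlock();
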
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index 1e4f6600b31d..825981b1049a 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -32,7 +32,6 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, u32 tb_id,
32 unsigned int); 32 unsigned int);
33void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, int dst_len, 33void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, int dst_len,
34 u32 tb_id, const struct nl_info *info, unsigned int nlm_flags); 34 u32 tb_id, const struct nl_info *info, unsigned int nlm_flags);
35struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio);
36 35
37static inline void fib_result_assign(struct fib_result *res, 36static inline void fib_result_assign(struct fib_result *res,
38 struct fib_info *fi) 37 struct fib_info *fi)
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 8f7bd56955b0..d3db718be51d 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -81,27 +81,25 @@ static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp,
81 break; 81 break;
82 82
83 case FR_ACT_UNREACHABLE: 83 case FR_ACT_UNREACHABLE:
84 err = -ENETUNREACH; 84 return -ENETUNREACH;
85 goto errout;
86 85
87 case FR_ACT_PROHIBIT: 86 case FR_ACT_PROHIBIT:
88 err = -EACCES; 87 return -EACCES;
89 goto errout;
90 88
91 case FR_ACT_BLACKHOLE: 89 case FR_ACT_BLACKHOLE:
92 default: 90 default:
93 err = -EINVAL; 91 return -EINVAL;
94 goto errout;
95 } 92 }
96 93
94 rcu_read_lock();
95
97 tbl = fib_get_table(rule->fr_net, rule->table); 96 tbl = fib_get_table(rule->fr_net, rule->table);
98 if (!tbl) 97 if (tbl)
99 goto errout; 98 err = fib_table_lookup(tbl, &flp->u.ip4,
99 (struct fib_result *)arg->result,
100 arg->flags);
100 101
101 err = fib_table_lookup(tbl, &flp->u.ip4, (struct fib_result *) arg->result, arg->flags); 102 rcu_read_unlock();
102 if (err > 0)
103 err = -EAGAIN;
104errout:
105 return err; 103 return err;
106} 104}
107 105
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index f99f41bd15b8..1e2090ea663e 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -360,7 +360,8 @@ static inline size_t fib_nlmsg_size(struct fib_info *fi)
360 + nla_total_size(4) /* RTA_TABLE */ 360 + nla_total_size(4) /* RTA_TABLE */
361 + nla_total_size(4) /* RTA_DST */ 361 + nla_total_size(4) /* RTA_DST */
362 + nla_total_size(4) /* RTA_PRIORITY */ 362 + nla_total_size(4) /* RTA_PRIORITY */
363 + nla_total_size(4); /* RTA_PREFSRC */ 363 + nla_total_size(4) /* RTA_PREFSRC */
364 + nla_total_size(TCP_CA_NAME_MAX); /* RTAX_CC_ALGO */
364 365
365 /* space for nested metrics */ 366 /* space for nested metrics */
366 payload += nla_total_size((RTAX_MAX * nla_total_size(4))); 367 payload += nla_total_size((RTAX_MAX * nla_total_size(4)));
@@ -410,24 +411,6 @@ errout:
410 rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err); 411 rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err);
411} 412}
412 413
413/* Return the first fib alias matching TOS with
414 * priority less than or equal to PRIO.
415 */
416struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio)
417{
418 if (fah) {
419 struct fib_alias *fa;
420 list_for_each_entry(fa, fah, fa_list) {
421 if (fa->fa_tos > tos)
422 continue;
423 if (fa->fa_info->fib_priority >= prio ||
424 fa->fa_tos < tos)
425 return fa;
426 }
427 }
428 return NULL;
429}
430
431static int fib_detect_death(struct fib_info *fi, int order, 414static int fib_detect_death(struct fib_info *fi, int order,
432 struct fib_info **last_resort, int *last_idx, 415 struct fib_info **last_resort, int *last_idx,
433 int dflt) 416 int dflt)
@@ -859,7 +842,16 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
859 842
860 if (type > RTAX_MAX) 843 if (type > RTAX_MAX)
861 goto err_inval; 844 goto err_inval;
862 val = nla_get_u32(nla); 845 if (type == RTAX_CC_ALGO) {
846 char tmp[TCP_CA_NAME_MAX];
847
848 nla_strlcpy(tmp, nla, sizeof(tmp));
849 val = tcp_ca_get_key_by_name(tmp);
850 if (val == TCP_CA_UNSPEC)
851 goto err_inval;
852 } else {
853 val = nla_get_u32(nla);
854 }
863 if (type == RTAX_ADVMSS && val > 65535 - 40) 855 if (type == RTAX_ADVMSS && val > 65535 - 40)
864 val = 65535 - 40; 856 val = 65535 - 40;
865 if (type == RTAX_MTU && val > 65535 - 15) 857 if (type == RTAX_MTU && val > 65535 - 15)
@@ -1081,7 +1073,8 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
1081 nla_nest_end(skb, mp); 1073 nla_nest_end(skb, mp);
1082 } 1074 }
1083#endif 1075#endif
1084 return nlmsg_end(skb, nlh); 1076 nlmsg_end(skb, nlh);
1077 return 0;
1085 1078
1086nla_put_failure: 1079nla_put_failure:
1087 nlmsg_cancel(skb, nlh); 1080 nlmsg_cancel(skb, nlh);
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 18bcaf2ff2fd..3daf0224ff2e 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -83,28 +83,33 @@
83 83
84#define MAX_STAT_DEPTH 32 84#define MAX_STAT_DEPTH 32
85 85
86#define KEYLENGTH (8*sizeof(t_key)) 86#define KEYLENGTH (8*sizeof(t_key))
87#define KEY_MAX ((t_key)~0)
87 88
88typedef unsigned int t_key; 89typedef unsigned int t_key;
89 90
90#define T_TNODE 0 91#define IS_TNODE(n) ((n)->bits)
91#define T_LEAF 1 92#define IS_LEAF(n) (!(n)->bits)
92#define NODE_TYPE_MASK 0x1UL
93#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
94 93
95#define IS_TNODE(n) (!(n->parent & T_LEAF)) 94#define get_index(_key, _kv) (((_key) ^ (_kv)->key) >> (_kv)->pos)
96#define IS_LEAF(n) (n->parent & T_LEAF)
97 95
98struct rt_trie_node { 96struct tnode {
99 unsigned long parent;
100 t_key key;
101};
102
103struct leaf {
104 unsigned long parent;
105 t_key key; 97 t_key key;
106 struct hlist_head list; 98 unsigned char bits; /* 2log(KEYLENGTH) bits needed */
99 unsigned char pos; /* 2log(KEYLENGTH) bits needed */
100 unsigned char slen;
101 struct tnode __rcu *parent;
107 struct rcu_head rcu; 102 struct rcu_head rcu;
103 union {
104 /* The fields in this struct are valid if bits > 0 (TNODE) */
105 struct {
106 t_key empty_children; /* KEYLENGTH bits needed */
107 t_key full_children; /* KEYLENGTH bits needed */
108 struct tnode __rcu *child[0];
109 };
 110 /* This list pointer is valid if bits == 0 (LEAF) */
111 struct hlist_head list;
112 };
108}; 113};
109 114
110struct leaf_info { 115struct leaf_info {
@@ -115,20 +120,6 @@ struct leaf_info {
115 struct rcu_head rcu; 120 struct rcu_head rcu;
116}; 121};
117 122
118struct tnode {
119 unsigned long parent;
120 t_key key;
121 unsigned char pos; /* 2log(KEYLENGTH) bits needed */
122 unsigned char bits; /* 2log(KEYLENGTH) bits needed */
123 unsigned int full_children; /* KEYLENGTH bits needed */
124 unsigned int empty_children; /* KEYLENGTH bits needed */
125 union {
126 struct rcu_head rcu;
127 struct tnode *tnode_free;
128 };
129 struct rt_trie_node __rcu *child[0];
130};
131
132#ifdef CONFIG_IP_FIB_TRIE_STATS 123#ifdef CONFIG_IP_FIB_TRIE_STATS
133struct trie_use_stats { 124struct trie_use_stats {
134 unsigned int gets; 125 unsigned int gets;
@@ -151,19 +142,13 @@ struct trie_stat {
151}; 142};
152 143
153struct trie { 144struct trie {
154 struct rt_trie_node __rcu *trie; 145 struct tnode __rcu *trie;
155#ifdef CONFIG_IP_FIB_TRIE_STATS 146#ifdef CONFIG_IP_FIB_TRIE_STATS
156 struct trie_use_stats stats; 147 struct trie_use_stats __percpu *stats;
157#endif 148#endif
158}; 149};
159 150
160static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n, 151static void resize(struct trie *t, struct tnode *tn);
161 int wasfull);
162static struct rt_trie_node *resize(struct trie *t, struct tnode *tn);
163static struct tnode *inflate(struct trie *t, struct tnode *tn);
164static struct tnode *halve(struct trie *t, struct tnode *tn);
165/* tnodes to free after resize(); protected by RTNL */
166static struct tnode *tnode_free_head;
167static size_t tnode_free_size; 152static size_t tnode_free_size;
168 153
169/* 154/*
@@ -176,170 +161,101 @@ static const int sync_pages = 128;
176static struct kmem_cache *fn_alias_kmem __read_mostly; 161static struct kmem_cache *fn_alias_kmem __read_mostly;
177static struct kmem_cache *trie_leaf_kmem __read_mostly; 162static struct kmem_cache *trie_leaf_kmem __read_mostly;
178 163
179/* 164/* caller must hold RTNL */
180 * caller must hold RTNL 165#define node_parent(n) rtnl_dereference((n)->parent)
181 */
182static inline struct tnode *node_parent(const struct rt_trie_node *node)
183{
184 unsigned long parent;
185
186 parent = rcu_dereference_index_check(node->parent, lockdep_rtnl_is_held());
187 166
188 return (struct tnode *)(parent & ~NODE_TYPE_MASK); 167/* caller must hold RCU read lock or RTNL */
189} 168#define node_parent_rcu(n) rcu_dereference_rtnl((n)->parent)
190 169
191/* 170/* wrapper for rcu_assign_pointer */
192 * caller must hold RCU read lock or RTNL 171static inline void node_set_parent(struct tnode *n, struct tnode *tp)
193 */
194static inline struct tnode *node_parent_rcu(const struct rt_trie_node *node)
195{ 172{
196 unsigned long parent; 173 if (n)
197 174 rcu_assign_pointer(n->parent, tp);
198 parent = rcu_dereference_index_check(node->parent, rcu_read_lock_held() ||
199 lockdep_rtnl_is_held());
200
201 return (struct tnode *)(parent & ~NODE_TYPE_MASK);
202} 175}
203 176
204/* Same as rcu_assign_pointer 177#define NODE_INIT_PARENT(n, p) RCU_INIT_POINTER((n)->parent, p)
205 * but that macro() assumes that value is a pointer. 178
179/* This provides us with the number of children in this node, in the case of a
180 * leaf this will return 0 meaning none of the children are accessible.
206 */ 181 */
207static inline void node_set_parent(struct rt_trie_node *node, struct tnode *ptr) 182static inline unsigned long tnode_child_length(const struct tnode *tn)
208{ 183{
209 smp_wmb(); 184 return (1ul << tn->bits) & ~(1ul);
210 node->parent = (unsigned long)ptr | NODE_TYPE(node);
211} 185}
212 186
213/* 187/* caller must hold RTNL */
214 * caller must hold RTNL 188static inline struct tnode *tnode_get_child(const struct tnode *tn,
215 */ 189 unsigned long i)
216static inline struct rt_trie_node *tnode_get_child(const struct tnode *tn, unsigned int i)
217{ 190{
218 BUG_ON(i >= 1U << tn->bits);
219
220 return rtnl_dereference(tn->child[i]); 191 return rtnl_dereference(tn->child[i]);
221} 192}
222 193
223/* 194/* caller must hold RCU read lock or RTNL */
224 * caller must hold RCU read lock or RTNL 195static inline struct tnode *tnode_get_child_rcu(const struct tnode *tn,
225 */ 196 unsigned long i)
226static inline struct rt_trie_node *tnode_get_child_rcu(const struct tnode *tn, unsigned int i)
227{ 197{
228 BUG_ON(i >= 1U << tn->bits);
229
230 return rcu_dereference_rtnl(tn->child[i]); 198 return rcu_dereference_rtnl(tn->child[i]);
231} 199}
232 200
233static inline int tnode_child_length(const struct tnode *tn) 201/* To understand this stuff, an understanding of keys and all their bits is
234{ 202 * necessary. Every node in the trie has a key associated with it, but not
235 return 1 << tn->bits; 203 * all of the bits in that key are significant.
236} 204 *
237 205 * Consider a node 'n' and its parent 'tp'.
238static inline t_key mask_pfx(t_key k, unsigned int l) 206 *
239{ 207 * If n is a leaf, every bit in its key is significant. Its presence is
240 return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l); 208 * necessitated by path compression, since during a tree traversal (when
241} 209 * searching for a leaf - unless we are doing an insertion) we will completely
242 210 * ignore all skipped bits we encounter. Thus we need to verify, at the end of
243static inline t_key tkey_extract_bits(t_key a, unsigned int offset, unsigned int bits) 211 * a potentially successful search, that we have indeed been walking the
244{ 212 * correct key path.
245 if (offset < KEYLENGTH) 213 *
246 return ((t_key)(a << offset)) >> (KEYLENGTH - bits); 214 * Note that we can never "miss" the correct key in the tree if present by
247 else 215 * following the wrong path. Path compression ensures that segments of the key
248 return 0; 216 * that are the same for all keys with a given prefix are skipped, but the
249} 217 * skipped part *is* identical for each node in the subtrie below the skipped
250 218 * bit! trie_insert() in this implementation takes care of that.
251static inline int tkey_equals(t_key a, t_key b) 219 *
252{ 220 * if n is an internal node - a 'tnode' here, the various parts of its key
253 return a == b; 221 * have many different meanings.
254} 222 *
255 223 * Example:
256static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b) 224 * _________________________________________________________________
257{ 225 * | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
258 if (bits == 0 || offset >= KEYLENGTH) 226 * -----------------------------------------------------------------
259 return 1; 227 * 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16
260 bits = bits > KEYLENGTH ? KEYLENGTH : bits; 228 *
261 return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0; 229 * _________________________________________________________________
262} 230 * | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
263 231 * -----------------------------------------------------------------
264static inline int tkey_mismatch(t_key a, int offset, t_key b) 232 * 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
265{ 233 *
266 t_key diff = a ^ b; 234 * tp->pos = 22
267 int i = offset; 235 * tp->bits = 3
268 236 * n->pos = 13
269 if (!diff) 237 * n->bits = 4
270 return 0; 238 *
271 while ((diff << i) >> (KEYLENGTH-1) == 0) 239 * First, let's just ignore the bits that come before the parent tp, that is
272 i++; 240 * the bits from (tp->pos + tp->bits) to 31. They are *known* but at this
273 return i; 241 * point we do not use them for anything.
274} 242 *
275 243 * The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
276/* 244 * index into the parent's child array. That is, they will be used to find
277 To understand this stuff, an understanding of keys and all their bits is 245 * 'n' among tp's children.
278 necessary. Every node in the trie has a key associated with it, but not 246 *
279 all of the bits in that key are significant. 247 * The bits from (n->pos + n->bits) to (tn->pos - 1) - "S" - are skipped bits
280 248 * for the node n.
281 Consider a node 'n' and its parent 'tp'. 249 *
282 250 * All the bits we have seen so far are significant to the node n. The rest
283 If n is a leaf, every bit in its key is significant. Its presence is 251 * of the bits are really not needed or indeed known in n->key.
284 necessitated by path compression, since during a tree traversal (when 252 *
285 searching for a leaf - unless we are doing an insertion) we will completely 253 * The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
286 ignore all skipped bits we encounter. Thus we need to verify, at the end of 254 * n's child array, and will of course be different for each child.
287 a potentially successful search, that we have indeed been walking the 255 *
288 correct key path. 256 * The rest of the bits, from 0 to (n->pos + n->bits), are completely unknown
289 257 * at this point.
290 Note that we can never "miss" the correct key in the tree if present by 258 */
291 following the wrong path. Path compression ensures that segments of the key
292 that are the same for all keys with a given prefix are skipped, but the
293 skipped part *is* identical for each node in the subtrie below the skipped
294 bit! trie_insert() in this implementation takes care of that - note the
295 call to tkey_sub_equals() in trie_insert().
296
297 if n is an internal node - a 'tnode' here, the various parts of its key
298 have many different meanings.
299
300 Example:
301 _________________________________________________________________
302 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
303 -----------------------------------------------------------------
304 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
305
306 _________________________________________________________________
307 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
308 -----------------------------------------------------------------
309 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
310
311 tp->pos = 7
312 tp->bits = 3
313 n->pos = 15
314 n->bits = 4
315
316 First, let's just ignore the bits that come before the parent tp, that is
317 the bits from 0 to (tp->pos-1). They are *known* but at this point we do
318 not use them for anything.
319
320 The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
321 index into the parent's child array. That is, they will be used to find
322 'n' among tp's children.
323
324 The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
325 for the node n.
326
327 All the bits we have seen so far are significant to the node n. The rest
328 of the bits are really not needed or indeed known in n->key.
329
330 The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
331 n's child array, and will of course be different for each child.
332
333
334 The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
335 at this point.
336
337*/
338
339static inline void check_tnode(const struct tnode *tn)
340{
341 WARN_ON(tn && tn->pos+tn->bits > 32);
342}
343 259
344static const int halve_threshold = 25; 260static const int halve_threshold = 25;
345static const int inflate_threshold = 50; 261static const int inflate_threshold = 50;
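
The index computation described in the rewritten comment can be restated as a small
stand-alone helper; the function name is hypothetical and the constants are the example
values from the comment above (tp->pos = 22, tp->bits = 3):

	/* restatement of get_index(): the child index is the key bits above tp->pos */
	static unsigned long example_index(u32 key, u32 tp_key)
	{
		unsigned long index = (key ^ tp_key) >> 22;   /* tp->pos */

		/* a key that belongs under tp leaves only the three "N" bits set,
		 * i.e. index < (1 << 3); any higher set bit means the skipped
		 * prefix did not match and the search has to backtrack
		 */
		return index;
	}
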
@@ -357,17 +273,23 @@ static inline void alias_free_mem_rcu(struct fib_alias *fa)
357 call_rcu(&fa->rcu, __alias_free_mem); 273 call_rcu(&fa->rcu, __alias_free_mem);
358} 274}
359 275
360static void __leaf_free_rcu(struct rcu_head *head) 276#define TNODE_KMALLOC_MAX \
361{ 277 ilog2((PAGE_SIZE - sizeof(struct tnode)) / sizeof(struct tnode *))
362 struct leaf *l = container_of(head, struct leaf, rcu);
363 kmem_cache_free(trie_leaf_kmem, l);
364}
365 278
366static inline void free_leaf(struct leaf *l) 279static void __node_free_rcu(struct rcu_head *head)
367{ 280{
368 call_rcu(&l->rcu, __leaf_free_rcu); 281 struct tnode *n = container_of(head, struct tnode, rcu);
282
283 if (IS_LEAF(n))
284 kmem_cache_free(trie_leaf_kmem, n);
285 else if (n->bits <= TNODE_KMALLOC_MAX)
286 kfree(n);
287 else
288 vfree(n);
369} 289}
370 290
291#define node_free(n) call_rcu(&n->rcu, __node_free_rcu)
292
371static inline void free_leaf_info(struct leaf_info *leaf) 293static inline void free_leaf_info(struct leaf_info *leaf)
372{ 294{
373 kfree_rcu(leaf, rcu); 295 kfree_rcu(leaf, rcu);
@@ -381,56 +303,31 @@ static struct tnode *tnode_alloc(size_t size)
381 return vzalloc(size); 303 return vzalloc(size);
382} 304}
383 305
384static void __tnode_free_rcu(struct rcu_head *head) 306static inline void empty_child_inc(struct tnode *n)
385{
386 struct tnode *tn = container_of(head, struct tnode, rcu);
387 size_t size = sizeof(struct tnode) +
388 (sizeof(struct rt_trie_node *) << tn->bits);
389
390 if (size <= PAGE_SIZE)
391 kfree(tn);
392 else
393 vfree(tn);
394}
395
396static inline void tnode_free(struct tnode *tn)
397{
398 if (IS_LEAF(tn))
399 free_leaf((struct leaf *) tn);
400 else
401 call_rcu(&tn->rcu, __tnode_free_rcu);
402}
403
404static void tnode_free_safe(struct tnode *tn)
405{ 307{
406 BUG_ON(IS_LEAF(tn)); 308 ++n->empty_children ? : ++n->full_children;
407 tn->tnode_free = tnode_free_head;
408 tnode_free_head = tn;
409 tnode_free_size += sizeof(struct tnode) +
410 (sizeof(struct rt_trie_node *) << tn->bits);
411} 309}
412 310
413static void tnode_free_flush(void) 311static inline void empty_child_dec(struct tnode *n)
414{ 312{
415 struct tnode *tn; 313 n->empty_children-- ? : n->full_children--;
416
417 while ((tn = tnode_free_head)) {
418 tnode_free_head = tn->tnode_free;
419 tn->tnode_free = NULL;
420 tnode_free(tn);
421 }
422
423 if (tnode_free_size >= PAGE_SIZE * sync_pages) {
424 tnode_free_size = 0;
425 synchronize_rcu();
426 }
427} 314}
428 315
429static struct leaf *leaf_new(void) 316static struct tnode *leaf_new(t_key key)
430{ 317{
431 struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL); 318 struct tnode *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
432 if (l) { 319 if (l) {
433 l->parent = T_LEAF; 320 l->parent = NULL;
321 /* set key and pos to reflect full key value
322 * any trailing zeros in the key should be ignored
323 * as the nodes are searched
324 */
325 l->key = key;
326 l->slen = 0;
327 l->pos = 0;
328 /* set bits to 0 indicating we are not a tnode */
329 l->bits = 0;
330
434 INIT_HLIST_HEAD(&l->list); 331 INIT_HLIST_HEAD(&l->list);
435 } 332 }
436 return l; 333 return l;
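
The empty_child_inc()/empty_child_dec() helpers above depend on the counters being exactly
KEYLENGTH bits wide: a tnode with bits == KEYLENGTH has 1UL << 32 slots, which cannot be
represented in empty_children, so the overflow is carried into full_children. A rough
expansion of the increment (illustrative only):

	/* ++n->empty_children ? : ++n->full_children;  behaves like: */
	n->empty_children++;
	if (n->empty_children == 0)       /* 32-bit counter wrapped around */
		n->full_children++;       /* park the carry in full_children */

empty_child_dec() borrows the carry back the same way when empty_children would drop
below zero.
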
@@ -449,462 +346,530 @@ static struct leaf_info *leaf_info_new(int plen)
449 346
450static struct tnode *tnode_new(t_key key, int pos, int bits) 347static struct tnode *tnode_new(t_key key, int pos, int bits)
451{ 348{
452 size_t sz = sizeof(struct tnode) + (sizeof(struct rt_trie_node *) << bits); 349 size_t sz = offsetof(struct tnode, child[1ul << bits]);
453 struct tnode *tn = tnode_alloc(sz); 350 struct tnode *tn = tnode_alloc(sz);
351 unsigned int shift = pos + bits;
352
 353 /* verify that bits and pos have their msb bits clear and the values are valid */
354 BUG_ON(!bits || (shift > KEYLENGTH));
454 355
455 if (tn) { 356 if (tn) {
456 tn->parent = T_TNODE; 357 tn->parent = NULL;
358 tn->slen = pos;
457 tn->pos = pos; 359 tn->pos = pos;
458 tn->bits = bits; 360 tn->bits = bits;
459 tn->key = key; 361 tn->key = (shift < KEYLENGTH) ? (key >> shift) << shift : 0;
460 tn->full_children = 0; 362 if (bits == KEYLENGTH)
461 tn->empty_children = 1<<bits; 363 tn->full_children = 1;
364 else
365 tn->empty_children = 1ul << bits;
462 } 366 }
463 367
464 pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode), 368 pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
465 sizeof(struct rt_trie_node *) << bits); 369 sizeof(struct tnode *) << bits);
466 return tn; 370 return tn;
467} 371}
468 372
469/* 373/* Check whether a tnode 'n' is "full", i.e. it is an internal node
470 * Check whether a tnode 'n' is "full", i.e. it is an internal node
471 * and no bits are skipped. See discussion in dyntree paper p. 6 374 * and no bits are skipped. See discussion in dyntree paper p. 6
472 */ 375 */
473 376static inline int tnode_full(const struct tnode *tn, const struct tnode *n)
474static inline int tnode_full(const struct tnode *tn, const struct rt_trie_node *n)
475{ 377{
476 if (n == NULL || IS_LEAF(n)) 378 return n && ((n->pos + n->bits) == tn->pos) && IS_TNODE(n);
477 return 0;
478
479 return ((struct tnode *) n)->pos == tn->pos + tn->bits;
480} 379}
481 380
482static inline void put_child(struct tnode *tn, int i, 381/* Add a child at position i overwriting the old value.
483 struct rt_trie_node *n) 382 * Update the value of full_children and empty_children.
484{ 383 */
485 tnode_put_child_reorg(tn, i, n, -1); 384static void put_child(struct tnode *tn, unsigned long i, struct tnode *n)
486}
487
488 /*
489 * Add a child at position i overwriting the old value.
490 * Update the value of full_children and empty_children.
491 */
492
493static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
494 int wasfull)
495{ 385{
496 struct rt_trie_node *chi = rtnl_dereference(tn->child[i]); 386 struct tnode *chi = tnode_get_child(tn, i);
497 int isfull; 387 int isfull, wasfull;
498 388
499 BUG_ON(i >= 1<<tn->bits); 389 BUG_ON(i >= tnode_child_length(tn));
500 390
501 /* update emptyChildren */ 391 /* update emptyChildren, overflow into fullChildren */
502 if (n == NULL && chi != NULL) 392 if (n == NULL && chi != NULL)
503 tn->empty_children++; 393 empty_child_inc(tn);
504 else if (n != NULL && chi == NULL) 394 if (n != NULL && chi == NULL)
505 tn->empty_children--; 395 empty_child_dec(tn);
506 396
507 /* update fullChildren */ 397 /* update fullChildren */
508 if (wasfull == -1) 398 wasfull = tnode_full(tn, chi);
509 wasfull = tnode_full(tn, chi);
510
511 isfull = tnode_full(tn, n); 399 isfull = tnode_full(tn, n);
400
512 if (wasfull && !isfull) 401 if (wasfull && !isfull)
513 tn->full_children--; 402 tn->full_children--;
514 else if (!wasfull && isfull) 403 else if (!wasfull && isfull)
515 tn->full_children++; 404 tn->full_children++;
516 405
517 if (n) 406 if (n && (tn->slen < n->slen))
518 node_set_parent(n, tn); 407 tn->slen = n->slen;
519 408
520 rcu_assign_pointer(tn->child[i], n); 409 rcu_assign_pointer(tn->child[i], n);
521} 410}
522 411
523#define MAX_WORK 10 412static void update_children(struct tnode *tn)
524static struct rt_trie_node *resize(struct trie *t, struct tnode *tn)
525{ 413{
526 int i; 414 unsigned long i;
527 struct tnode *old_tn;
528 int inflate_threshold_use;
529 int halve_threshold_use;
530 int max_work;
531 415
532 if (!tn) 416 /* update all of the child parent pointers */
533 return NULL; 417 for (i = tnode_child_length(tn); i;) {
418 struct tnode *inode = tnode_get_child(tn, --i);
534 419
535 pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n", 420 if (!inode)
536 tn, inflate_threshold, halve_threshold); 421 continue;
537 422
538 /* No children */ 423 /* Either update the children of a tnode that
539 if (tn->empty_children == tnode_child_length(tn)) { 424 * already belongs to us or update the child
540 tnode_free_safe(tn); 425 * to point to ourselves.
541 return NULL; 426 */
427 if (node_parent(inode) == tn)
428 update_children(inode);
429 else
430 node_set_parent(inode, tn);
542 } 431 }
543 /* One child */ 432}
544 if (tn->empty_children == tnode_child_length(tn) - 1)
545 goto one_child;
546 /*
547 * Double as long as the resulting node has a number of
548 * nonempty nodes that are above the threshold.
549 */
550
551 /*
552 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
553 * the Helsinki University of Technology and Matti Tikkanen of Nokia
554 * Telecommunications, page 6:
555 * "A node is doubled if the ratio of non-empty children to all
556 * children in the *doubled* node is at least 'high'."
557 *
558 * 'high' in this instance is the variable 'inflate_threshold'. It
559 * is expressed as a percentage, so we multiply it with
560 * tnode_child_length() and instead of multiplying by 2 (since the
561 * child array will be doubled by inflate()) and multiplying
562 * the left-hand side by 100 (to handle the percentage thing) we
563 * multiply the left-hand side by 50.
564 *
565 * The left-hand side may look a bit weird: tnode_child_length(tn)
566 * - tn->empty_children is of course the number of non-null children
567 * in the current node. tn->full_children is the number of "full"
568 * children, that is non-null tnodes with a skip value of 0.
569 * All of those will be doubled in the resulting inflated tnode, so
570 * we just count them one extra time here.
571 *
572 * A clearer way to write this would be:
573 *
574 * to_be_doubled = tn->full_children;
575 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
576 * tn->full_children;
577 *
578 * new_child_length = tnode_child_length(tn) * 2;
579 *
580 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
581 * new_child_length;
582 * if (new_fill_factor >= inflate_threshold)
583 *
584 * ...and so on, tho it would mess up the while () loop.
585 *
586 * anyway,
587 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
588 * inflate_threshold
589 *
590 * avoid a division:
591 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
592 * inflate_threshold * new_child_length
593 *
594 * expand not_to_be_doubled and to_be_doubled, and shorten:
595 * 100 * (tnode_child_length(tn) - tn->empty_children +
596 * tn->full_children) >= inflate_threshold * new_child_length
597 *
598 * expand new_child_length:
599 * 100 * (tnode_child_length(tn) - tn->empty_children +
600 * tn->full_children) >=
601 * inflate_threshold * tnode_child_length(tn) * 2
602 *
603 * shorten again:
604 * 50 * (tn->full_children + tnode_child_length(tn) -
605 * tn->empty_children) >= inflate_threshold *
606 * tnode_child_length(tn)
607 *
608 */
609 433
610 check_tnode(tn); 434static inline void put_child_root(struct tnode *tp, struct trie *t,
435 t_key key, struct tnode *n)
436{
437 if (tp)
438 put_child(tp, get_index(key, tp), n);
439 else
440 rcu_assign_pointer(t->trie, n);
441}
611 442
612 /* Keep root node larger */ 443static inline void tnode_free_init(struct tnode *tn)
444{
445 tn->rcu.next = NULL;
446}
613 447
614 if (!node_parent((struct rt_trie_node *)tn)) { 448static inline void tnode_free_append(struct tnode *tn, struct tnode *n)
615 inflate_threshold_use = inflate_threshold_root; 449{
616 halve_threshold_use = halve_threshold_root; 450 n->rcu.next = tn->rcu.next;
617 } else { 451 tn->rcu.next = &n->rcu;
618 inflate_threshold_use = inflate_threshold; 452}
619 halve_threshold_use = halve_threshold;
620 }
621 453
622 max_work = MAX_WORK; 454static void tnode_free(struct tnode *tn)
623 while ((tn->full_children > 0 && max_work-- && 455{
624 50 * (tn->full_children + tnode_child_length(tn) 456 struct callback_head *head = &tn->rcu;
625 - tn->empty_children)
626 >= inflate_threshold_use * tnode_child_length(tn))) {
627 457
628 old_tn = tn; 458 while (head) {
629 tn = inflate(t, tn); 459 head = head->next;
460 tnode_free_size += offsetof(struct tnode, child[1 << tn->bits]);
461 node_free(tn);
630 462
631 if (IS_ERR(tn)) { 463 tn = container_of(head, struct tnode, rcu);
632 tn = old_tn;
633#ifdef CONFIG_IP_FIB_TRIE_STATS
634 t->stats.resize_node_skipped++;
635#endif
636 break;
637 }
638 } 464 }
639 465
640 check_tnode(tn); 466 if (tnode_free_size >= PAGE_SIZE * sync_pages) {
641 467 tnode_free_size = 0;
642 /* Return if at least one inflate is run */ 468 synchronize_rcu();
643 if (max_work != MAX_WORK)
644 return (struct rt_trie_node *) tn;
645
646 /*
647 * Halve as long as the number of empty children in this
648 * node is above threshold.
649 */
650
651 max_work = MAX_WORK;
652 while (tn->bits > 1 && max_work-- &&
653 100 * (tnode_child_length(tn) - tn->empty_children) <
654 halve_threshold_use * tnode_child_length(tn)) {
655
656 old_tn = tn;
657 tn = halve(t, tn);
658 if (IS_ERR(tn)) {
659 tn = old_tn;
660#ifdef CONFIG_IP_FIB_TRIE_STATS
661 t->stats.resize_node_skipped++;
662#endif
663 break;
664 }
665 } 469 }
470}
666 471
472static void replace(struct trie *t, struct tnode *oldtnode, struct tnode *tn)
473{
474 struct tnode *tp = node_parent(oldtnode);
475 unsigned long i;
667 476
668 /* Only one child remains */ 477 /* setup the parent pointer out of and back into this node */
669 if (tn->empty_children == tnode_child_length(tn) - 1) { 478 NODE_INIT_PARENT(tn, tp);
670one_child: 479 put_child_root(tp, t, tn->key, tn);
671 for (i = 0; i < tnode_child_length(tn); i++) {
672 struct rt_trie_node *n;
673
674 n = rtnl_dereference(tn->child[i]);
675 if (!n)
676 continue;
677
678 /* compress one level */
679 480
680 node_set_parent(n, NULL); 481 /* update all of the child parent pointers */
681 tnode_free_safe(tn); 482 update_children(tn);
682 return n;
683 }
684 }
685 return (struct rt_trie_node *) tn;
686}
687 483
484 /* all pointers should be clean so we are done */
485 tnode_free(oldtnode);
688 486
689static void tnode_clean_free(struct tnode *tn) 487 /* resize children now that oldtnode is freed */
690{ 488 for (i = tnode_child_length(tn); i;) {
691 int i; 489 struct tnode *inode = tnode_get_child(tn, --i);
692 struct tnode *tofree;
693 490
694 for (i = 0; i < tnode_child_length(tn); i++) { 491 /* resize child node */
695 tofree = (struct tnode *)rtnl_dereference(tn->child[i]); 492 if (tnode_full(tn, inode))
696 if (tofree) 493 resize(t, inode);
697 tnode_free(tofree);
698 } 494 }
699 tnode_free(tn);
700} 495}
701 496
702static struct tnode *inflate(struct trie *t, struct tnode *tn) 497static int inflate(struct trie *t, struct tnode *oldtnode)
703{ 498{
704 struct tnode *oldtnode = tn; 499 struct tnode *tn;
705 int olen = tnode_child_length(tn); 500 unsigned long i;
706 int i; 501 t_key m;
707 502
708 pr_debug("In inflate\n"); 503 pr_debug("In inflate\n");
709 504
710 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1); 505 tn = tnode_new(oldtnode->key, oldtnode->pos - 1, oldtnode->bits + 1);
711
712 if (!tn) 506 if (!tn)
713 return ERR_PTR(-ENOMEM); 507 return -ENOMEM;
714
715 /*
716 * Preallocate and store tnodes before the actual work so we
717 * don't get into an inconsistent state if memory allocation
718 * fails. In case of failure we return the oldnode and inflate
719 * of tnode is ignored.
720 */
721
722 for (i = 0; i < olen; i++) {
723 struct tnode *inode;
724
725 inode = (struct tnode *) tnode_get_child(oldtnode, i);
726 if (inode &&
727 IS_TNODE(inode) &&
728 inode->pos == oldtnode->pos + oldtnode->bits &&
729 inode->bits > 1) {
730 struct tnode *left, *right;
731 t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
732
733 left = tnode_new(inode->key&(~m), inode->pos + 1,
734 inode->bits - 1);
735 if (!left)
736 goto nomem;
737
738 right = tnode_new(inode->key|m, inode->pos + 1,
739 inode->bits - 1);
740
741 if (!right) {
742 tnode_free(left);
743 goto nomem;
744 }
745 508
746 put_child(tn, 2*i, (struct rt_trie_node *) left); 509 /* prepare oldtnode to be freed */
747 put_child(tn, 2*i+1, (struct rt_trie_node *) right); 510 tnode_free_init(oldtnode);
748 }
749 }
750 511
751 for (i = 0; i < olen; i++) { 512 /* Assemble all of the pointers in our cluster, in this case that
752 struct tnode *inode; 513 * represents all of the pointers out of our allocated nodes that
753 struct rt_trie_node *node = tnode_get_child(oldtnode, i); 514 * point to existing tnodes and the links between our allocated
754 struct tnode *left, *right; 515 * nodes.
755 int size, j; 516 */
517 for (i = tnode_child_length(oldtnode), m = 1u << tn->pos; i;) {
518 struct tnode *inode = tnode_get_child(oldtnode, --i);
519 struct tnode *node0, *node1;
520 unsigned long j, k;
756 521
757 /* An empty child */ 522 /* An empty child */
758 if (node == NULL) 523 if (inode == NULL)
759 continue; 524 continue;
760 525
761 /* A leaf or an internal node with skipped bits */ 526 /* A leaf or an internal node with skipped bits */
762 527 if (!tnode_full(oldtnode, inode)) {
763 if (IS_LEAF(node) || ((struct tnode *) node)->pos > 528 put_child(tn, get_index(inode->key, tn), inode);
764 tn->pos + tn->bits - 1) {
765 put_child(tn,
766 tkey_extract_bits(node->key, oldtnode->pos, oldtnode->bits + 1),
767 node);
768 continue; 529 continue;
769 } 530 }
770 531
771 /* An internal node with two children */ 532 /* drop the node in the old tnode free list */
772 inode = (struct tnode *) node; 533 tnode_free_append(oldtnode, inode);
773 534
535 /* An internal node with two children */
774 if (inode->bits == 1) { 536 if (inode->bits == 1) {
775 put_child(tn, 2*i, rtnl_dereference(inode->child[0])); 537 put_child(tn, 2 * i + 1, tnode_get_child(inode, 1));
776 put_child(tn, 2*i+1, rtnl_dereference(inode->child[1])); 538 put_child(tn, 2 * i, tnode_get_child(inode, 0));
777
778 tnode_free_safe(inode);
779 continue; 539 continue;
780 } 540 }
781 541
782 /* An internal node with more than two children */
783
784 /* We will replace this node 'inode' with two new 542 /* We will replace this node 'inode' with two new
785 * ones, 'left' and 'right', each with half of the 543 * ones, 'node0' and 'node1', each with half of the
786 * original children. The two new nodes will have 544 * original children. The two new nodes will have
787 * a position one bit further down the key and this 545 * a position one bit further down the key and this
788 * means that the "significant" part of their keys 546 * means that the "significant" part of their keys
789 * (see the discussion near the top of this file) 547 * (see the discussion near the top of this file)
790 * will differ by one bit, which will be "0" in 548 * will differ by one bit, which will be "0" in
791 * left's key and "1" in right's key. Since we are 549 * node0's key and "1" in node1's key. Since we are
792 * moving the key position by one step, the bit that 550 * moving the key position by one step, the bit that
793 * we are moving away from - the bit at position 551 * we are moving away from - the bit at position
794 * (inode->pos) - is the one that will differ between 552 * (tn->pos) - is the one that will differ between
795 * left and right. So... we synthesize that bit in the 553 * node0 and node1. So... we synthesize that bit in the
796 * two new keys. 554 * two new keys.
797 * The mask 'm' below will be a single "one" bit at
798 * the position (inode->pos)
799 */ 555 */
556 node1 = tnode_new(inode->key | m, inode->pos, inode->bits - 1);
557 if (!node1)
558 goto nomem;
559 node0 = tnode_new(inode->key, inode->pos, inode->bits - 1);
560
561 tnode_free_append(tn, node1);
562 if (!node0)
563 goto nomem;
564 tnode_free_append(tn, node0);
565
566 /* populate child pointers in new nodes */
567 for (k = tnode_child_length(inode), j = k / 2; j;) {
568 put_child(node1, --j, tnode_get_child(inode, --k));
569 put_child(node0, j, tnode_get_child(inode, j));
570 put_child(node1, --j, tnode_get_child(inode, --k));
571 put_child(node0, j, tnode_get_child(inode, j));
572 }
800 573
801 /* Use the old key, but set the new significant 574 /* link new nodes to parent */
802 * bit to zero. 575 NODE_INIT_PARENT(node1, tn);
803 */ 576 NODE_INIT_PARENT(node0, tn);
577
578 /* link parent to nodes */
579 put_child(tn, 2 * i + 1, node1);
580 put_child(tn, 2 * i, node0);
581 }
582
583 /* setup the parent pointers into and out of this node */
584 replace(t, oldtnode, tn);
585
586 return 0;
587nomem:
588 /* all pointers should be clean so we are done */
589 tnode_free(tn);
590 return -ENOMEM;
591}
592
593static int halve(struct trie *t, struct tnode *oldtnode)
594{
595 struct tnode *tn;
596 unsigned long i;
597
598 pr_debug("In halve\n");
804 599
805 left = (struct tnode *) tnode_get_child(tn, 2*i); 600 tn = tnode_new(oldtnode->key, oldtnode->pos + 1, oldtnode->bits - 1);
806 put_child(tn, 2*i, NULL); 601 if (!tn)
602 return -ENOMEM;
807 603
808 BUG_ON(!left); 604 /* prepare oldtnode to be freed */
605 tnode_free_init(oldtnode);
809 606
810 right = (struct tnode *) tnode_get_child(tn, 2*i+1); 607 /* Assemble all of the pointers in our cluster, in this case that
811 put_child(tn, 2*i+1, NULL); 608 * represents all of the pointers out of our allocated nodes that
609 * point to existing tnodes and the links between our allocated
610 * nodes.
611 */
612 for (i = tnode_child_length(oldtnode); i;) {
613 struct tnode *node1 = tnode_get_child(oldtnode, --i);
614 struct tnode *node0 = tnode_get_child(oldtnode, --i);
615 struct tnode *inode;
812 616
813 BUG_ON(!right); 617 /* At least one of the children is empty */
618 if (!node1 || !node0) {
619 put_child(tn, i / 2, node1 ? : node0);
620 continue;
621 }
814 622
815 size = tnode_child_length(left); 623 /* Two nonempty children */
816 for (j = 0; j < size; j++) { 624 inode = tnode_new(node0->key, oldtnode->pos, 1);
817 put_child(left, j, rtnl_dereference(inode->child[j])); 625 if (!inode) {
818 put_child(right, j, rtnl_dereference(inode->child[j + size])); 626 tnode_free(tn);
627 return -ENOMEM;
819 } 628 }
820 put_child(tn, 2*i, resize(t, left)); 629 tnode_free_append(tn, inode);
821 put_child(tn, 2*i+1, resize(t, right)); 630
631 /* initialize pointers out of node */
632 put_child(inode, 1, node1);
633 put_child(inode, 0, node0);
634 NODE_INIT_PARENT(inode, tn);
822 635
823 tnode_free_safe(inode); 636 /* link parent to node */
637 put_child(tn, i / 2, inode);
824 } 638 }
825 tnode_free_safe(oldtnode); 639
826 return tn; 640 /* setup the parent pointers into and out of this node */
827nomem: 641 replace(t, oldtnode, tn);
828 tnode_clean_free(tn); 642
829 return ERR_PTR(-ENOMEM); 643 return 0;
830} 644}
831 645
832static struct tnode *halve(struct trie *t, struct tnode *tn) 646static void collapse(struct trie *t, struct tnode *oldtnode)
833{ 647{
834 struct tnode *oldtnode = tn; 648 struct tnode *n, *tp;
835 struct rt_trie_node *left, *right; 649 unsigned long i;
836 int i;
837 int olen = tnode_child_length(tn);
838 650
839 pr_debug("In halve\n"); 651 /* scan the tnode looking for that one child that might still exist */
652 for (n = NULL, i = tnode_child_length(oldtnode); !n && i;)
653 n = tnode_get_child(oldtnode, --i);
840 654
841 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1); 655 /* compress one level */
656 tp = node_parent(oldtnode);
657 put_child_root(tp, t, oldtnode->key, n);
658 node_set_parent(n, tp);
842 659
843 if (!tn) 660 /* drop dead node */
844 return ERR_PTR(-ENOMEM); 661 node_free(oldtnode);
662}
845 663
846 /* 664static unsigned char update_suffix(struct tnode *tn)
847 * Preallocate and store tnodes before the actual work so we 665{
848 * don't get into an inconsistent state if memory allocation 666 unsigned char slen = tn->pos;
849 * fails. In case of failure we return the oldnode and halve 667 unsigned long stride, i;
850 * of tnode is ignored. 668
669 /* search though the list of children looking for nodes that might
670 * have a suffix greater than the one we currently have. This is
671 * why we start with a stride of 2 since a stride of 1 would
672 * represent the nodes with suffix length equal to tn->pos
851 */ 673 */
674 for (i = 0, stride = 0x2ul ; i < tnode_child_length(tn); i += stride) {
675 struct tnode *n = tnode_get_child(tn, i);
852 676
853 for (i = 0; i < olen; i += 2) { 677 if (!n || (n->slen <= slen))
854 left = tnode_get_child(oldtnode, i); 678 continue;
855 right = tnode_get_child(oldtnode, i+1);
856 679
857 /* Two nonempty children */ 680 /* update stride and slen based on new value */
858 if (left && right) { 681 stride <<= (n->slen - slen);
859 struct tnode *newn; 682 slen = n->slen;
683 i &= ~(stride - 1);
860 684
861 newn = tnode_new(left->key, tn->pos + tn->bits, 1); 685 /* if slen covers all but the last bit we can stop here
686 * there will be nothing longer than that since only node
687 * 0 and 1 << (bits - 1) could have that as their suffix
688 * length.
689 */
690 if ((slen + 1) >= (tn->pos + tn->bits))
691 break;
692 }
862 693
863 if (!newn) 694 tn->slen = slen;
864 goto nomem;
865 695
866 put_child(tn, i/2, (struct rt_trie_node *)newn); 696 return slen;
867 } 697}
868 698
869 } 699/* From "Implementing a dynamic compressed trie" by Stefan Nilsson of
700 * the Helsinki University of Technology and Matti Tikkanen of Nokia
701 * Telecommunications, page 6:
702 * "A node is doubled if the ratio of non-empty children to all
703 * children in the *doubled* node is at least 'high'."
704 *
705 * 'high' in this instance is the variable 'inflate_threshold'. It
706 * is expressed as a percentage, so we multiply it with
707 * tnode_child_length() and instead of multiplying by 2 (since the
708 * child array will be doubled by inflate()) and multiplying
709 * the left-hand side by 100 (to handle the percentage thing) we
710 * multiply the left-hand side by 50.
711 *
712 * The left-hand side may look a bit weird: tnode_child_length(tn)
713 * - tn->empty_children is of course the number of non-null children
714 * in the current node. tn->full_children is the number of "full"
715 * children, that is non-null tnodes with a skip value of 0.
716 * All of those will be doubled in the resulting inflated tnode, so
717 * we just count them one extra time here.
718 *
719 * A clearer way to write this would be:
720 *
721 * to_be_doubled = tn->full_children;
722 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
723 * tn->full_children;
724 *
725 * new_child_length = tnode_child_length(tn) * 2;
726 *
727 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
728 * new_child_length;
729 * if (new_fill_factor >= inflate_threshold)
730 *
731 * ...and so on, tho it would mess up the while () loop.
732 *
733 * anyway,
734 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
735 * inflate_threshold
736 *
737 * avoid a division:
738 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
739 * inflate_threshold * new_child_length
740 *
741 * expand not_to_be_doubled and to_be_doubled, and shorten:
742 * 100 * (tnode_child_length(tn) - tn->empty_children +
743 * tn->full_children) >= inflate_threshold * new_child_length
744 *
745 * expand new_child_length:
746 * 100 * (tnode_child_length(tn) - tn->empty_children +
747 * tn->full_children) >=
748 * inflate_threshold * tnode_child_length(tn) * 2
749 *
750 * shorten again:
751 * 50 * (tn->full_children + tnode_child_length(tn) -
752 * tn->empty_children) >= inflate_threshold *
753 * tnode_child_length(tn)
754 *
755 */
756static bool should_inflate(const struct tnode *tp, const struct tnode *tn)
757{
758 unsigned long used = tnode_child_length(tn);
759 unsigned long threshold = used;
870 760
871 for (i = 0; i < olen; i += 2) { 761 /* Keep root node larger */
872 struct tnode *newBinNode; 762 threshold *= tp ? inflate_threshold : inflate_threshold_root;
763 used -= tn->empty_children;
764 used += tn->full_children;
873 765
874 left = tnode_get_child(oldtnode, i); 766 /* if bits == KEYLENGTH then pos = 0, and will fail below */
875 right = tnode_get_child(oldtnode, i+1);
876 767
877 /* At least one of the children is empty */ 768 return (used > 1) && tn->pos && ((50 * used) >= threshold);
878 if (left == NULL) { 769}
879 if (right == NULL) /* Both are empty */ 770
880 continue; 771static bool should_halve(const struct tnode *tp, const struct tnode *tn)
881 put_child(tn, i/2, right); 772{
882 continue; 773 unsigned long used = tnode_child_length(tn);
774 unsigned long threshold = used;
775
776 /* Keep root node larger */
777 threshold *= tp ? halve_threshold : halve_threshold_root;
778 used -= tn->empty_children;
779
780 /* if bits == KEYLENGTH then used = 100% on wrap, and will fail below */
781
782 return (used > 1) && (tn->bits > 1) && ((100 * used) < threshold);
783}
784
785static bool should_collapse(const struct tnode *tn)
786{
787 unsigned long used = tnode_child_length(tn);
788
789 used -= tn->empty_children;
790
791 /* account for bits == KEYLENGTH case */
792 if ((tn->bits == KEYLENGTH) && tn->full_children)
793 used -= KEY_MAX;
794
795 /* One child or none, time to drop us from the trie */
796 return used < 2;
797}
798
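The inflate/halve tests above reduce to plain integer arithmetic. A minimal userspace sketch, not part of this commit (the child counts are invented and 50 is assumed to be the default inflate_threshold):

	#include <stdio.h>

	/* Toy version of should_inflate(): child_length slots, "empty" NULL
	 * children, "full" children with a skip value of 0, threshold in %. */
	static int toy_should_inflate(unsigned long child_length,
				      unsigned long empty, unsigned long full,
				      unsigned long threshold_pct)
	{
		unsigned long used = child_length - empty + full;
		unsigned long threshold = child_length * threshold_pct;

		return (used > 1) && ((50 * used) >= threshold);
	}

	int main(void)
	{
		/* bits = 4 -> 16 slots, 2 empty, 5 full: the doubled node would be
		 * (9 + 2*5)/32 ~ 59% non-empty, so 50*19 = 950 >= 16*50 = 800. */
		printf("inflate? %d\n", toy_should_inflate(16, 2, 5, 50));
		return 0;
	}

should_halve() is the mirror image: no full_children credit, and 100 * used has to fall below the threshold for the node to shrink.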
799#define MAX_WORK 10
800static void resize(struct trie *t, struct tnode *tn)
801{
802 struct tnode *tp = node_parent(tn);
803 struct tnode __rcu **cptr;
804 int max_work = MAX_WORK;
805
806 pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
807 tn, inflate_threshold, halve_threshold);
808
809 /* track the tnode via the pointer from the parent instead of
810 * doing it ourselves. This way we can let RCU fully do its
811 * thing without us interfering
812 */
813 cptr = tp ? &tp->child[get_index(tn->key, tp)] : &t->trie;
814 BUG_ON(tn != rtnl_dereference(*cptr));
815
816 /* Double as long as the resulting node has a number of
817 * nonempty nodes that are above the threshold.
818 */
819 while (should_inflate(tp, tn) && max_work) {
820 if (inflate(t, tn)) {
821#ifdef CONFIG_IP_FIB_TRIE_STATS
822 this_cpu_inc(t->stats->resize_node_skipped);
823#endif
824 break;
883 } 825 }
884 826
885 if (right == NULL) { 827 max_work--;
886 put_child(tn, i/2, left); 828 tn = rtnl_dereference(*cptr);
887 continue; 829 }
830
831 /* Return if at least one inflate is run */
832 if (max_work != MAX_WORK)
833 return;
834
835 /* Halve as long as the number of empty children in this
836 * node is above threshold.
837 */
838 while (should_halve(tp, tn) && max_work) {
839 if (halve(t, tn)) {
840#ifdef CONFIG_IP_FIB_TRIE_STATS
841 this_cpu_inc(t->stats->resize_node_skipped);
842#endif
843 break;
888 } 844 }
889 845
890 /* Two nonempty children */ 846 max_work--;
891 newBinNode = (struct tnode *) tnode_get_child(tn, i/2); 847 tn = rtnl_dereference(*cptr);
892 put_child(tn, i/2, NULL); 848 }
893 put_child(newBinNode, 0, left); 849
894 put_child(newBinNode, 1, right); 850 /* Only one child remains */
895 put_child(tn, i/2, resize(t, newBinNode)); 851 if (should_collapse(tn)) {
852 collapse(t, tn);
853 return;
854 }
855
856 /* Return if at least one deflate was run */
857 if (max_work != MAX_WORK)
858 return;
859
860 /* push the suffix length to the parent node */
861 if (tn->slen > tn->pos) {
862 unsigned char slen = update_suffix(tn);
863
864 if (tp && (slen > tp->slen))
865 tp->slen = slen;
896 } 866 }
897 tnode_free_safe(oldtnode);
898 return tn;
899nomem:
900 tnode_clean_free(tn);
901 return ERR_PTR(-ENOMEM);
902} 867}
903 868
904/* readside must use rcu_read_lock currently dump routines 869/* readside must use rcu_read_lock currently dump routines
905 via get_fa_head and dump */ 870 via get_fa_head and dump */
906 871
907static struct leaf_info *find_leaf_info(struct leaf *l, int plen) 872static struct leaf_info *find_leaf_info(struct tnode *l, int plen)
908{ 873{
909 struct hlist_head *head = &l->list; 874 struct hlist_head *head = &l->list;
910 struct leaf_info *li; 875 struct leaf_info *li;
@@ -916,7 +881,7 @@ static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
916 return NULL; 881 return NULL;
917} 882}
918 883
919static inline struct list_head *get_fa_head(struct leaf *l, int plen) 884static inline struct list_head *get_fa_head(struct tnode *l, int plen)
920{ 885{
921 struct leaf_info *li = find_leaf_info(l, plen); 886 struct leaf_info *li = find_leaf_info(l, plen);
922 887
@@ -926,8 +891,51 @@ static inline struct list_head *get_fa_head(struct leaf *l, int plen)
926 return &li->falh; 891 return &li->falh;
927} 892}
928 893
929static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new) 894static void leaf_pull_suffix(struct tnode *l)
895{
896 struct tnode *tp = node_parent(l);
897
898 while (tp && (tp->slen > tp->pos) && (tp->slen > l->slen)) {
899 if (update_suffix(tp) > l->slen)
900 break;
901 tp = node_parent(tp);
902 }
903}
904
905static void leaf_push_suffix(struct tnode *l)
906{
907 struct tnode *tn = node_parent(l);
908
909 /* if this is a new leaf then tn will be NULL and we can sort
910 * out parent suffix lengths as a part of trie_rebalance
911 */
912 while (tn && (tn->slen < l->slen)) {
913 tn->slen = l->slen;
914 tn = node_parent(tn);
915 }
916}
917
918static void remove_leaf_info(struct tnode *l, struct leaf_info *old)
930{ 919{
 920	 /* record the location of the previous leaf_info entry */
921 struct hlist_node **pprev = old->hlist.pprev;
922 struct leaf_info *li = hlist_entry(pprev, typeof(*li), hlist.next);
923
924 /* remove the leaf info from the list */
925 hlist_del_rcu(&old->hlist);
926
927 /* only access li if it is pointing at the last valid hlist_node */
928 if (hlist_empty(&l->list) || (*pprev))
929 return;
930
931 /* update the trie with the latest suffix length */
932 l->slen = KEYLENGTH - li->plen;
933 leaf_pull_suffix(l);
934}
935
936static void insert_leaf_info(struct tnode *l, struct leaf_info *new)
937{
938 struct hlist_head *head = &l->list;
931 struct leaf_info *li = NULL, *last = NULL; 939 struct leaf_info *li = NULL, *last = NULL;
932 940
933 if (hlist_empty(head)) { 941 if (hlist_empty(head)) {
@@ -944,218 +952,174 @@ static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
944 else 952 else
945 hlist_add_before_rcu(&new->hlist, &li->hlist); 953 hlist_add_before_rcu(&new->hlist, &li->hlist);
946 } 954 }
955
956 /* if we added to the tail node then we need to update slen */
957 if (l->slen < (KEYLENGTH - new->plen)) {
958 l->slen = KEYLENGTH - new->plen;
959 leaf_push_suffix(l);
960 }
947} 961}
948 962
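The suffix-length bookkeeping above is max-propagation of KEYLENGTH - plen up the tree. A small userspace sketch, not part of this commit (the prefix lengths and ancestor values are invented):

	#include <stdio.h>

	#define KEYLENGTH 32

	int main(void)
	{
		/* slen of the ancestors, root first; the leaf stores a /24 and a /16 */
		unsigned char ancestor_slen[3] = { 20, 4, 0 };
		unsigned char leaf_slen = 0;
		int plens[] = { 24, 16 };

		for (int i = 0; i < 2; i++)
			if (leaf_slen < KEYLENGTH - plens[i])
				leaf_slen = KEYLENGTH - plens[i];	/* ends up 16 */

		/* leaf_push_suffix(): raise ancestors until one already covers us */
		for (int i = 2; i >= 0 && ancestor_slen[i] < leaf_slen; i--)
			ancestor_slen[i] = leaf_slen;

		printf("leaf slen=%u ancestors=%u,%u,%u\n", leaf_slen,
		       ancestor_slen[0], ancestor_slen[1], ancestor_slen[2]);
		return 0;
	}

leaf_pull_suffix() runs the same walk after a removal, recomputing each ancestor with update_suffix() and stopping once one no longer shrinks below the leaf's value.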
949/* rcu_read_lock needs to be hold by caller from readside */ 963/* rcu_read_lock needs to be hold by caller from readside */
964static struct tnode *fib_find_node(struct trie *t, u32 key)
965{
966 struct tnode *n = rcu_dereference_rtnl(t->trie);
967
968 while (n) {
969 unsigned long index = get_index(key, n);
970
971 /* This bit of code is a bit tricky but it combines multiple
972 * checks into a single check. The prefix consists of the
973 * prefix plus zeros for the bits in the cindex. The index
974 * is the difference between the key and this value. From
975 * this we can actually derive several pieces of data.
976 * if (index & (~0ul << bits))
977 * we have a mismatch in skip bits and failed
978 * else
979 * we know the value is cindex
980 */
981 if (index & (~0ul << n->bits))
982 return NULL;
950 983
951static struct leaf * 984 /* we have found a leaf. Prefixes have already been compared */
952fib_find_node(struct trie *t, u32 key) 985 if (IS_LEAF(n))
953{ 986 break;
954 int pos;
955 struct tnode *tn;
956 struct rt_trie_node *n;
957 987
958 pos = 0; 988 n = tnode_get_child_rcu(n, index);
959 n = rcu_dereference_rtnl(t->trie); 989 }
960 990
961 while (n != NULL && NODE_TYPE(n) == T_TNODE) { 991 return n;
962 tn = (struct tnode *) n; 992}
963 993
964 check_tnode(tn); 994/* Return the first fib alias matching TOS with
995 * priority less than or equal to PRIO.
996 */
997static struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio)
998{
999 struct fib_alias *fa;
965 1000
966 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) { 1001 if (!fah)
967 pos = tn->pos + tn->bits; 1002 return NULL;
968 n = tnode_get_child_rcu(tn,
969 tkey_extract_bits(key,
970 tn->pos,
971 tn->bits));
972 } else
973 break;
974 }
975 /* Case we have found a leaf. Compare prefixes */
976 1003
977 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) 1004 list_for_each_entry(fa, fah, fa_list) {
978 return (struct leaf *)n; 1005 if (fa->fa_tos > tos)
1006 continue;
1007 if (fa->fa_info->fib_priority >= prio || fa->fa_tos < tos)
1008 return fa;
1009 }
979 1010
980 return NULL; 1011 return NULL;
981} 1012}
982 1013
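The combined index/skip-bits check used in fib_find_node() (and again in the lookup below) is easiest to see with concrete numbers. A userspace sketch, not part of this commit; it assumes get_index() is (key ^ n->key) >> n->pos, as defined earlier in this file:

	#include <stdio.h>

	int main(void)
	{
		unsigned long node_key = 0xC0A80000UL;	/* 192.168.0.0 */
		unsigned long pos = 8, bits = 4;	/* child index lives in bits 8..11 */
		unsigned long keys[] = { 0xC0A80500UL, 0xC0A90000UL };

		for (int i = 0; i < 2; i++) {
			unsigned long index = (keys[i] ^ node_key) >> pos;

			if (index & (~0UL << bits))
				printf("0x%08lx: mismatch in skipped bits\n", keys[i]);
			else
				printf("0x%08lx: descend via child %lu\n", keys[i], index);
		}
		return 0;
	}

The first key lands in child slot 5; the second differs above pos + bits, so the single test reports the skip-bit mismatch without a separate prefix compare.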
983static void trie_rebalance(struct trie *t, struct tnode *tn) 1014static void trie_rebalance(struct trie *t, struct tnode *tn)
984{ 1015{
985 int wasfull;
986 t_key cindex, key;
987 struct tnode *tp; 1016 struct tnode *tp;
988 1017
989 key = tn->key; 1018 while ((tp = node_parent(tn)) != NULL) {
990 1019 resize(t, tn);
991 while (tn != NULL && (tp = node_parent((struct rt_trie_node *)tn)) != NULL) {
992 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
993 wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
994 tn = (struct tnode *)resize(t, tn);
995
996 tnode_put_child_reorg(tp, cindex,
997 (struct rt_trie_node *)tn, wasfull);
998
999 tp = node_parent((struct rt_trie_node *) tn);
1000 if (!tp)
1001 rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1002
1003 tnode_free_flush();
1004 if (!tp)
1005 break;
1006 tn = tp; 1020 tn = tp;
1007 } 1021 }
1008 1022
1009 /* Handle last (top) tnode */ 1023 /* Handle last (top) tnode */
1010 if (IS_TNODE(tn)) 1024 if (IS_TNODE(tn))
1011 tn = (struct tnode *)resize(t, tn); 1025 resize(t, tn);
1012
1013 rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1014 tnode_free_flush();
1015} 1026}
1016 1027
1017/* only used from updater-side */ 1028/* only used from updater-side */
1018 1029
1019static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) 1030static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1020{ 1031{
1021 int pos, newpos;
1022 struct tnode *tp = NULL, *tn = NULL;
1023 struct rt_trie_node *n;
1024 struct leaf *l;
1025 int missbit;
1026 struct list_head *fa_head = NULL; 1032 struct list_head *fa_head = NULL;
1033 struct tnode *l, *n, *tp = NULL;
1027 struct leaf_info *li; 1034 struct leaf_info *li;
1028 t_key cindex;
1029 1035
1030 pos = 0; 1036 li = leaf_info_new(plen);
1037 if (!li)
1038 return NULL;
1039 fa_head = &li->falh;
1040
1031 n = rtnl_dereference(t->trie); 1041 n = rtnl_dereference(t->trie);
1032 1042
1033 /* If we point to NULL, stop. Either the tree is empty and we should 1043 /* If we point to NULL, stop. Either the tree is empty and we should
1034 * just put a new leaf in if, or we have reached an empty child slot, 1044 * just put a new leaf in if, or we have reached an empty child slot,
1035 * and we should just put our new leaf in that. 1045 * and we should just put our new leaf in that.
1036 * If we point to a T_TNODE, check if it matches our key. Note that
1037 * a T_TNODE might be skipping any number of bits - its 'pos' need
1038 * not be the parent's 'pos'+'bits'!
1039 *
1040 * If it does match the current key, get pos/bits from it, extract
1041 * the index from our key, push the T_TNODE and walk the tree.
1042 *
1043 * If it doesn't, we have to replace it with a new T_TNODE.
1044 * 1046 *
1045	 * If we point to a T_LEAF, it might or might not have the same key	1047	 * If we hit a node with a key that doesn't match then we should stop
1046 * as we do. If it does, just change the value, update the T_LEAF's 1048 * and create a new tnode to replace that node and insert ourselves
1047 * value, and return it. 1049 * and the other node into the new tnode.
1048 * If it doesn't, we need to replace it with a T_TNODE.
1049 */ 1050 */
1050 1051 while (n) {
1051 while (n != NULL && NODE_TYPE(n) == T_TNODE) { 1052 unsigned long index = get_index(key, n);
1052 tn = (struct tnode *) n; 1053
1053 1054 /* This bit of code is a bit tricky but it combines multiple
1054 check_tnode(tn); 1055 * checks into a single check. The prefix consists of the
1055 1056 * prefix plus zeros for the "bits" in the prefix. The index
1056 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) { 1057 * is the difference between the key and this value. From
1057 tp = tn; 1058 * this we can actually derive several pieces of data.
1058 pos = tn->pos + tn->bits; 1059 * if !(index >> bits)
1059 n = tnode_get_child(tn, 1060 * we know the value is child index
1060 tkey_extract_bits(key, 1061 * else
1061 tn->pos, 1062 * we have a mismatch in skip bits and failed
1062 tn->bits)); 1063 */
1063 1064 if (index >> n->bits)
1064 BUG_ON(n && node_parent(n) != tn);
1065 } else
1066 break; 1065 break;
1067 }
1068 1066
1069 /* 1067 /* we have found a leaf. Prefixes have already been compared */
1070 * n ----> NULL, LEAF or TNODE 1068 if (IS_LEAF(n)) {
1071 * 1069 /* Case 1: n is a leaf, and prefixes match*/
1072 * tp is n's (parent) ----> NULL or TNODE 1070 insert_leaf_info(n, li);
1073 */ 1071 return fa_head;
1074 1072 }
1075 BUG_ON(tp && IS_LEAF(tp));
1076
1077 /* Case 1: n is a leaf. Compare prefixes */
1078
1079 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1080 l = (struct leaf *) n;
1081 li = leaf_info_new(plen);
1082
1083 if (!li)
1084 return NULL;
1085 1073
1086 fa_head = &li->falh; 1074 tp = n;
1087 insert_leaf_info(&l->list, li); 1075 n = tnode_get_child_rcu(n, index);
1088 goto done;
1089 } 1076 }
1090 l = leaf_new();
1091 1077
1092 if (!l) 1078 l = leaf_new(key);
1093 return NULL; 1079 if (!l) {
1094 1080 free_leaf_info(li);
1095 l->key = key;
1096 li = leaf_info_new(plen);
1097
1098 if (!li) {
1099 free_leaf(l);
1100 return NULL; 1081 return NULL;
1101 } 1082 }
1102 1083
1103 fa_head = &li->falh; 1084 insert_leaf_info(l, li);
1104 insert_leaf_info(&l->list, li);
1105
1106 if (t->trie && n == NULL) {
1107 /* Case 2: n is NULL, and will just insert a new leaf */
1108 1085
1109 node_set_parent((struct rt_trie_node *)l, tp); 1086 /* Case 2: n is a LEAF or a TNODE and the key doesn't match.
1110 1087 *
1111 cindex = tkey_extract_bits(key, tp->pos, tp->bits); 1088 * Add a new tnode here
1112 put_child(tp, cindex, (struct rt_trie_node *)l); 1089 * first tnode need some special handling
1113 } else { 1090 * leaves us in position for handling as case 3
1114 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */ 1091 */
1115 /* 1092 if (n) {
1116 * Add a new tnode here 1093 struct tnode *tn;
1117 * first tnode need some special handling
1118 */
1119
1120 if (n) {
1121 pos = tp ? tp->pos+tp->bits : 0;
1122 newpos = tkey_mismatch(key, pos, n->key);
1123 tn = tnode_new(n->key, newpos, 1);
1124 } else {
1125 newpos = 0;
1126 tn = tnode_new(key, newpos, 1); /* First tnode */
1127 }
1128 1094
1095 tn = tnode_new(key, __fls(key ^ n->key), 1);
1129 if (!tn) { 1096 if (!tn) {
1130 free_leaf_info(li); 1097 free_leaf_info(li);
1131 free_leaf(l); 1098 node_free(l);
1132 return NULL; 1099 return NULL;
1133 } 1100 }
1134 1101
1135 node_set_parent((struct rt_trie_node *)tn, tp); 1102 /* initialize routes out of node */
1103 NODE_INIT_PARENT(tn, tp);
1104 put_child(tn, get_index(key, tn) ^ 1, n);
1136 1105
1137 missbit = tkey_extract_bits(key, newpos, 1); 1106 /* start adding routes into the node */
1138 put_child(tn, missbit, (struct rt_trie_node *)l); 1107 put_child_root(tp, t, key, tn);
1139 put_child(tn, 1-missbit, n); 1108 node_set_parent(n, tn);
1140
1141 if (tp) {
1142 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1143 put_child(tp, cindex, (struct rt_trie_node *)tn);
1144 } else {
1145 rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1146 }
1147 1109
1110 /* parent now has a NULL spot where the leaf can go */
1148 tp = tn; 1111 tp = tn;
1149 } 1112 }
1150 1113
1151 if (tp && tp->pos + tp->bits > 32) 1114 /* Case 3: n is NULL, and will just insert a new leaf */
1152 pr_warn("fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n", 1115 if (tp) {
1153 tp, tp->pos, tp->bits, key, plen); 1116 NODE_INIT_PARENT(l, tp);
1154 1117 put_child(tp, get_index(key, tp), l);
1155 /* Rebalance the trie */ 1118 trie_rebalance(t, tp);
1119 } else {
1120 rcu_assign_pointer(t->trie, l);
1121 }
1156 1122
1157 trie_rebalance(t, tp);
1158done:
1159 return fa_head; 1123 return fa_head;
1160} 1124}
1161 1125
@@ -1172,7 +1136,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
1172 u8 tos = cfg->fc_tos; 1136 u8 tos = cfg->fc_tos;
1173 u32 key, mask; 1137 u32 key, mask;
1174 int err; 1138 int err;
1175 struct leaf *l; 1139 struct tnode *l;
1176 1140
1177 if (plen > 32) 1141 if (plen > 32)
1178 return -EINVAL; 1142 return -EINVAL;
@@ -1329,18 +1293,130 @@ err:
1329 return err; 1293 return err;
1330} 1294}
1331 1295
1296static inline t_key prefix_mismatch(t_key key, struct tnode *n)
1297{
1298 t_key prefix = n->key;
1299
1300 return (key ^ prefix) & (prefix | -prefix);
1301}
1302
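prefix | -prefix is a mask of every bit at or above the least-significant set bit of the node key, so only differences in that region count as a mismatch. A quick userspace check, not part of this commit:

	#include <stdio.h>

	int main(void)
	{
		unsigned int prefix = 0xC0A80000;	/* 192.168.0.0 */
		unsigned int mask = prefix | -prefix;	/* 0xFFF80000 */

		printf("mask = 0x%08x\n", mask);
		/* difference below the mask: ignored */
		printf("192.168.18.52 -> 0x%08x\n", (0xC0A81234 ^ prefix) & mask);
		/* difference inside the mask: reported as a mismatch */
		printf("208.168.0.0   -> 0x%08x\n", (0xD0A80000 ^ prefix) & mask);
		return 0;
	}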
1332/* should be called with rcu_read_lock */ 1303/* should be called with rcu_read_lock */
1333static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l, 1304int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
1334 t_key key, const struct flowi4 *flp, 1305 struct fib_result *res, int fib_flags)
1335 struct fib_result *res, int fib_flags)
1336{ 1306{
1307 struct trie *t = (struct trie *)tb->tb_data;
1308#ifdef CONFIG_IP_FIB_TRIE_STATS
1309 struct trie_use_stats __percpu *stats = t->stats;
1310#endif
1311 const t_key key = ntohl(flp->daddr);
1312 struct tnode *n, *pn;
1337 struct leaf_info *li; 1313 struct leaf_info *li;
1338 struct hlist_head *hhead = &l->list; 1314 t_key cindex;
1315
1316 n = rcu_dereference(t->trie);
1317 if (!n)
1318 return -EAGAIN;
1319
1320#ifdef CONFIG_IP_FIB_TRIE_STATS
1321 this_cpu_inc(stats->gets);
1322#endif
1323
1324 pn = n;
1325 cindex = 0;
1326
1327 /* Step 1: Travel to the longest prefix match in the trie */
1328 for (;;) {
1329 unsigned long index = get_index(key, n);
1330
1331 /* This bit of code is a bit tricky but it combines multiple
1332 * checks into a single check. The prefix consists of the
1333 * prefix plus zeros for the "bits" in the prefix. The index
1334 * is the difference between the key and this value. From
1335 * this we can actually derive several pieces of data.
1336 * if (index & (~0ul << bits))
1337 * we have a mismatch in skip bits and failed
1338 * else
1339 * we know the value is cindex
1340 */
1341 if (index & (~0ul << n->bits))
1342 break;
1343
1344 /* we have found a leaf. Prefixes have already been compared */
1345 if (IS_LEAF(n))
1346 goto found;
1347
1348 /* only record pn and cindex if we are going to be chopping
1349 * bits later. Otherwise we are just wasting cycles.
1350 */
1351 if (n->slen > n->pos) {
1352 pn = n;
1353 cindex = index;
1354 }
1355
1356 n = tnode_get_child_rcu(n, index);
1357 if (unlikely(!n))
1358 goto backtrace;
1359 }
1360
1361 /* Step 2: Sort out leaves and begin backtracing for longest prefix */
1362 for (;;) {
1363 /* record the pointer where our next node pointer is stored */
1364 struct tnode __rcu **cptr = n->child;
1365
1366 /* This test verifies that none of the bits that differ
1367 * between the key and the prefix exist in the region of
1368 * the lsb and higher in the prefix.
1369 */
1370 if (unlikely(prefix_mismatch(key, n)) || (n->slen == n->pos))
1371 goto backtrace;
1372
1373 /* exit out and process leaf */
1374 if (unlikely(IS_LEAF(n)))
1375 break;
1376
1377 /* Don't bother recording parent info. Since we are in
1378 * prefix match mode we will have to come back to wherever
1379 * we started this traversal anyway
1380 */
1381
1382 while ((n = rcu_dereference(*cptr)) == NULL) {
1383backtrace:
1384#ifdef CONFIG_IP_FIB_TRIE_STATS
1385 if (!n)
1386 this_cpu_inc(stats->null_node_hit);
1387#endif
1388 /* If we are at cindex 0 there are no more bits for
1389 * us to strip at this level so we must ascend back
1390 * up one level to see if there are any more bits to
1391 * be stripped there.
1392 */
1393 while (!cindex) {
1394 t_key pkey = pn->key;
1395
1396 pn = node_parent_rcu(pn);
1397 if (unlikely(!pn))
1398 return -EAGAIN;
1399#ifdef CONFIG_IP_FIB_TRIE_STATS
1400 this_cpu_inc(stats->backtrack);
1401#endif
1402 /* Get Child's index */
1403 cindex = get_index(pkey, pn);
1404 }
1405
1406 /* strip the least significant bit from the cindex */
1407 cindex &= cindex - 1;
1408
1409 /* grab pointer for next child node */
1410 cptr = &pn->child[cindex];
1411 }
1412 }
1339 1413
1340 hlist_for_each_entry_rcu(li, hhead, hlist) { 1414found:
1415 /* Step 3: Process the leaf, if that fails fall back to backtracing */
1416 hlist_for_each_entry_rcu(li, &n->list, hlist) {
1341 struct fib_alias *fa; 1417 struct fib_alias *fa;
1342 1418
1343 if (l->key != (key & li->mask_plen)) 1419 if ((key ^ n->key) & li->mask_plen)
1344 continue; 1420 continue;
1345 1421
1346 list_for_each_entry_rcu(fa, &li->falh, fa_list) { 1422 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
@@ -1355,9 +1431,9 @@ static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l,
1355 continue; 1431 continue;
1356 fib_alias_accessed(fa); 1432 fib_alias_accessed(fa);
1357 err = fib_props[fa->fa_type].error; 1433 err = fib_props[fa->fa_type].error;
1358 if (err) { 1434 if (unlikely(err < 0)) {
1359#ifdef CONFIG_IP_FIB_TRIE_STATS 1435#ifdef CONFIG_IP_FIB_TRIE_STATS
1360 t->stats.semantic_match_passed++; 1436 this_cpu_inc(stats->semantic_match_passed);
1361#endif 1437#endif
1362 return err; 1438 return err;
1363 } 1439 }
@@ -1371,241 +1447,48 @@ static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l,
1371 if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif) 1447 if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
1372 continue; 1448 continue;
1373 1449
1374#ifdef CONFIG_IP_FIB_TRIE_STATS 1450 if (!(fib_flags & FIB_LOOKUP_NOREF))
1375 t->stats.semantic_match_passed++; 1451 atomic_inc(&fi->fib_clntref);
1376#endif 1452
1377 res->prefixlen = li->plen; 1453 res->prefixlen = li->plen;
1378 res->nh_sel = nhsel; 1454 res->nh_sel = nhsel;
1379 res->type = fa->fa_type; 1455 res->type = fa->fa_type;
1380 res->scope = fa->fa_info->fib_scope; 1456 res->scope = fi->fib_scope;
1381 res->fi = fi; 1457 res->fi = fi;
1382 res->table = tb; 1458 res->table = tb;
1383 res->fa_head = &li->falh; 1459 res->fa_head = &li->falh;
1384 if (!(fib_flags & FIB_LOOKUP_NOREF))
1385 atomic_inc(&fi->fib_clntref);
1386 return 0;
1387 }
1388 }
1389
1390#ifdef CONFIG_IP_FIB_TRIE_STATS
1391 t->stats.semantic_match_miss++;
1392#endif
1393 }
1394
1395 return 1;
1396}
1397
1398int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
1399 struct fib_result *res, int fib_flags)
1400{
1401 struct trie *t = (struct trie *) tb->tb_data;
1402 int ret;
1403 struct rt_trie_node *n;
1404 struct tnode *pn;
1405 unsigned int pos, bits;
1406 t_key key = ntohl(flp->daddr);
1407 unsigned int chopped_off;
1408 t_key cindex = 0;
1409 unsigned int current_prefix_length = KEYLENGTH;
1410 struct tnode *cn;
1411 t_key pref_mismatch;
1412
1413 rcu_read_lock();
1414
1415 n = rcu_dereference(t->trie);
1416 if (!n)
1417 goto failed;
1418
1419#ifdef CONFIG_IP_FIB_TRIE_STATS 1460#ifdef CONFIG_IP_FIB_TRIE_STATS
1420 t->stats.gets++; 1461 this_cpu_inc(stats->semantic_match_passed);
1421#endif 1462#endif
1422 1463 return err;
1423 /* Just a leaf? */ 1464 }
1424 if (IS_LEAF(n)) {
1425 ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
1426 goto found;
1427 }
1428
1429 pn = (struct tnode *) n;
1430 chopped_off = 0;
1431
1432 while (pn) {
1433 pos = pn->pos;
1434 bits = pn->bits;
1435
1436 if (!chopped_off)
1437 cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
1438 pos, bits);
1439
1440 n = tnode_get_child_rcu(pn, cindex);
1441
1442 if (n == NULL) {
1443#ifdef CONFIG_IP_FIB_TRIE_STATS
1444 t->stats.null_node_hit++;
1445#endif
1446 goto backtrace;
1447 }
1448
1449 if (IS_LEAF(n)) {
1450 ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
1451 if (ret > 0)
1452 goto backtrace;
1453 goto found;
1454 }
1455
1456 cn = (struct tnode *)n;
1457
1458 /*
1459 * It's a tnode, and we can do some extra checks here if we
1460 * like, to avoid descending into a dead-end branch.
1461 * This tnode is in the parent's child array at index
1462 * key[p_pos..p_pos+p_bits] but potentially with some bits
1463 * chopped off, so in reality the index may be just a
1464 * subprefix, padded with zero at the end.
1465 * We can also take a look at any skipped bits in this
1466 * tnode - everything up to p_pos is supposed to be ok,
1467 * and the non-chopped bits of the index (se previous
1468 * paragraph) are also guaranteed ok, but the rest is
1469 * considered unknown.
1470 *
1471 * The skipped bits are key[pos+bits..cn->pos].
1472 */
1473
1474 /* If current_prefix_length < pos+bits, we are already doing
1475 * actual prefix matching, which means everything from
1476 * pos+(bits-chopped_off) onward must be zero along some
1477 * branch of this subtree - otherwise there is *no* valid
1478 * prefix present. Here we can only check the skipped
1479 * bits. Remember, since we have already indexed into the
1480 * parent's child array, we know that the bits we chopped of
1481 * *are* zero.
1482 */
1483
1484 /* NOTA BENE: Checking only skipped bits
1485 for the new node here */
1486
1487 if (current_prefix_length < pos+bits) {
1488 if (tkey_extract_bits(cn->key, current_prefix_length,
1489 cn->pos - current_prefix_length)
1490 || !(cn->child[0]))
1491 goto backtrace;
1492 }
1493
1494 /*
1495 * If chopped_off=0, the index is fully validated and we
1496 * only need to look at the skipped bits for this, the new,
1497 * tnode. What we actually want to do is to find out if
1498 * these skipped bits match our key perfectly, or if we will
1499 * have to count on finding a matching prefix further down,
1500 * because if we do, we would like to have some way of
1501 * verifying the existence of such a prefix at this point.
1502 */
1503
1504 /* The only thing we can do at this point is to verify that
1505 * any such matching prefix can indeed be a prefix to our
1506 * key, and if the bits in the node we are inspecting that
1507 * do not match our key are not ZERO, this cannot be true.
1508 * Thus, find out where there is a mismatch (before cn->pos)
1509 * and verify that all the mismatching bits are zero in the
1510 * new tnode's key.
1511 */
1512
1513 /*
1514 * Note: We aren't very concerned about the piece of
1515 * the key that precede pn->pos+pn->bits, since these
1516 * have already been checked. The bits after cn->pos
1517 * aren't checked since these are by definition
1518 * "unknown" at this point. Thus, what we want to see
1519 * is if we are about to enter the "prefix matching"
1520 * state, and in that case verify that the skipped
1521 * bits that will prevail throughout this subtree are
1522 * zero, as they have to be if we are to find a
1523 * matching prefix.
1524 */
1525
1526 pref_mismatch = mask_pfx(cn->key ^ key, cn->pos);
1527
1528 /*
1529 * In short: If skipped bits in this node do not match
1530 * the search key, enter the "prefix matching"
1531 * state.directly.
1532 */
1533 if (pref_mismatch) {
1534 /* fls(x) = __fls(x) + 1 */
1535 int mp = KEYLENGTH - __fls(pref_mismatch) - 1;
1536
1537 if (tkey_extract_bits(cn->key, mp, cn->pos - mp) != 0)
1538 goto backtrace;
1539
1540 if (current_prefix_length >= cn->pos)
1541 current_prefix_length = mp;
1542 } 1465 }
1543 1466
1544 pn = (struct tnode *)n; /* Descend */
1545 chopped_off = 0;
1546 continue;
1547
1548backtrace:
1549 chopped_off++;
1550
1551 /* As zero don't change the child key (cindex) */
1552 while ((chopped_off <= pn->bits)
1553 && !(cindex & (1<<(chopped_off-1))))
1554 chopped_off++;
1555
1556 /* Decrease current_... with bits chopped off */
1557 if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1558 current_prefix_length = pn->pos + pn->bits
1559 - chopped_off;
1560
1561 /*
1562 * Either we do the actual chop off according or if we have
1563 * chopped off all bits in this tnode walk up to our parent.
1564 */
1565
1566 if (chopped_off <= pn->bits) {
1567 cindex &= ~(1 << (chopped_off-1));
1568 } else {
1569 struct tnode *parent = node_parent_rcu((struct rt_trie_node *) pn);
1570 if (!parent)
1571 goto failed;
1572
1573 /* Get Child's index */
1574 cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
1575 pn = parent;
1576 chopped_off = 0;
1577
1578#ifdef CONFIG_IP_FIB_TRIE_STATS 1467#ifdef CONFIG_IP_FIB_TRIE_STATS
1579 t->stats.backtrack++; 1468 this_cpu_inc(stats->semantic_match_miss);
1580#endif 1469#endif
1581 goto backtrace;
1582 }
1583 } 1470 }
1584failed: 1471 goto backtrace;
1585 ret = 1;
1586found:
1587 rcu_read_unlock();
1588 return ret;
1589} 1472}
1590EXPORT_SYMBOL_GPL(fib_table_lookup); 1473EXPORT_SYMBOL_GPL(fib_table_lookup);
1591 1474
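The backtracking in the lookup above walks shorter and shorter prefixes by clearing one low-order bit of the child index at a time, and only climbs to the parent when the index reaches zero. A tiny sketch of that visiting order, not part of this commit:

	#include <stdio.h>

	int main(void)
	{
		unsigned long cindex = 0x6;	/* 0b0110: the slot we descended through */

		do {
			cindex &= cindex - 1;	/* strip the least significant bit */
			printf("re-check pn->child[0x%lx]\n", cindex);
		} while (cindex);
		printf("child 0 exhausted: ascend and recompute cindex from the parent key\n");
		return 0;
	}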
1592/* 1475/*
1593 * Remove the leaf and return parent. 1476 * Remove the leaf and return parent.
1594 */ 1477 */
1595static void trie_leaf_remove(struct trie *t, struct leaf *l) 1478static void trie_leaf_remove(struct trie *t, struct tnode *l)
1596{ 1479{
1597 struct tnode *tp = node_parent((struct rt_trie_node *) l); 1480 struct tnode *tp = node_parent(l);
1598 1481
1599 pr_debug("entering trie_leaf_remove(%p)\n", l); 1482 pr_debug("entering trie_leaf_remove(%p)\n", l);
1600 1483
1601 if (tp) { 1484 if (tp) {
1602 t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits); 1485 put_child(tp, get_index(l->key, tp), NULL);
1603 put_child(tp, cindex, NULL);
1604 trie_rebalance(t, tp); 1486 trie_rebalance(t, tp);
1605 } else 1487 } else {
1606 RCU_INIT_POINTER(t->trie, NULL); 1488 RCU_INIT_POINTER(t->trie, NULL);
1489 }
1607 1490
1608 free_leaf(l); 1491 node_free(l);
1609} 1492}
1610 1493
1611/* 1494/*
@@ -1619,7 +1502,7 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
1619 u8 tos = cfg->fc_tos; 1502 u8 tos = cfg->fc_tos;
1620 struct fib_alias *fa, *fa_to_delete; 1503 struct fib_alias *fa, *fa_to_delete;
1621 struct list_head *fa_head; 1504 struct list_head *fa_head;
1622 struct leaf *l; 1505 struct tnode *l;
1623 struct leaf_info *li; 1506 struct leaf_info *li;
1624 1507
1625 if (plen > 32) 1508 if (plen > 32)
@@ -1684,7 +1567,7 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
1684 tb->tb_num_default--; 1567 tb->tb_num_default--;
1685 1568
1686 if (list_empty(fa_head)) { 1569 if (list_empty(fa_head)) {
1687 hlist_del_rcu(&li->hlist); 1570 remove_leaf_info(l, li);
1688 free_leaf_info(li); 1571 free_leaf_info(li);
1689 } 1572 }
1690 1573
@@ -1717,12 +1600,13 @@ static int trie_flush_list(struct list_head *head)
1717 return found; 1600 return found;
1718} 1601}
1719 1602
1720static int trie_flush_leaf(struct leaf *l) 1603static int trie_flush_leaf(struct tnode *l)
1721{ 1604{
1722 int found = 0; 1605 int found = 0;
1723 struct hlist_head *lih = &l->list; 1606 struct hlist_head *lih = &l->list;
1724 struct hlist_node *tmp; 1607 struct hlist_node *tmp;
1725 struct leaf_info *li = NULL; 1608 struct leaf_info *li = NULL;
1609 unsigned char plen = KEYLENGTH;
1726 1610
1727 hlist_for_each_entry_safe(li, tmp, lih, hlist) { 1611 hlist_for_each_entry_safe(li, tmp, lih, hlist) {
1728 found += trie_flush_list(&li->falh); 1612 found += trie_flush_list(&li->falh);
@@ -1730,8 +1614,14 @@ static int trie_flush_leaf(struct leaf *l)
1730 if (list_empty(&li->falh)) { 1614 if (list_empty(&li->falh)) {
1731 hlist_del_rcu(&li->hlist); 1615 hlist_del_rcu(&li->hlist);
1732 free_leaf_info(li); 1616 free_leaf_info(li);
1617 continue;
1733 } 1618 }
1619
1620 plen = li->plen;
1734 } 1621 }
1622
1623 l->slen = KEYLENGTH - plen;
1624
1735 return found; 1625 return found;
1736} 1626}
1737 1627
@@ -1739,63 +1629,57 @@ static int trie_flush_leaf(struct leaf *l)
1739 * Scan for the next right leaf starting at node p->child[idx] 1629 * Scan for the next right leaf starting at node p->child[idx]
1740 * Since we have back pointer, no recursion necessary. 1630 * Since we have back pointer, no recursion necessary.
1741 */ 1631 */
1742static struct leaf *leaf_walk_rcu(struct tnode *p, struct rt_trie_node *c) 1632static struct tnode *leaf_walk_rcu(struct tnode *p, struct tnode *c)
1743{ 1633{
1744 do { 1634 do {
1745	 	t_key idx;	1635		unsigned long idx = c ? get_index(c->key, p) + 1 : 0;
1746 1636
1747 if (c) 1637 while (idx < tnode_child_length(p)) {
1748 idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
1749 else
1750 idx = 0;
1751
1752 while (idx < 1u << p->bits) {
1753 c = tnode_get_child_rcu(p, idx++); 1638 c = tnode_get_child_rcu(p, idx++);
1754 if (!c) 1639 if (!c)
1755 continue; 1640 continue;
1756 1641
1757 if (IS_LEAF(c)) 1642 if (IS_LEAF(c))
1758 return (struct leaf *) c; 1643 return c;
1759 1644
1760 /* Rescan start scanning in new node */ 1645 /* Rescan start scanning in new node */
1761 p = (struct tnode *) c; 1646 p = c;
1762 idx = 0; 1647 idx = 0;
1763 } 1648 }
1764 1649
1765 /* Node empty, walk back up to parent */ 1650 /* Node empty, walk back up to parent */
1766 c = (struct rt_trie_node *) p; 1651 c = p;
1767 } while ((p = node_parent_rcu(c)) != NULL); 1652 } while ((p = node_parent_rcu(c)) != NULL);
1768 1653
1769 return NULL; /* Root of trie */ 1654 return NULL; /* Root of trie */
1770} 1655}
1771 1656
1772static struct leaf *trie_firstleaf(struct trie *t) 1657static struct tnode *trie_firstleaf(struct trie *t)
1773{ 1658{
1774 struct tnode *n = (struct tnode *)rcu_dereference_rtnl(t->trie); 1659 struct tnode *n = rcu_dereference_rtnl(t->trie);
1775 1660
1776 if (!n) 1661 if (!n)
1777 return NULL; 1662 return NULL;
1778 1663
1779 if (IS_LEAF(n)) /* trie is just a leaf */ 1664 if (IS_LEAF(n)) /* trie is just a leaf */
1780 return (struct leaf *) n; 1665 return n;
1781 1666
1782 return leaf_walk_rcu(n, NULL); 1667 return leaf_walk_rcu(n, NULL);
1783} 1668}
1784 1669
1785static struct leaf *trie_nextleaf(struct leaf *l) 1670static struct tnode *trie_nextleaf(struct tnode *l)
1786{ 1671{
1787 struct rt_trie_node *c = (struct rt_trie_node *) l; 1672 struct tnode *p = node_parent_rcu(l);
1788 struct tnode *p = node_parent_rcu(c);
1789 1673
1790 if (!p) 1674 if (!p)
1791 return NULL; /* trie with just one leaf */ 1675 return NULL; /* trie with just one leaf */
1792 1676
1793 return leaf_walk_rcu(p, c); 1677 return leaf_walk_rcu(p, l);
1794} 1678}
1795 1679
1796static struct leaf *trie_leafindex(struct trie *t, int index) 1680static struct tnode *trie_leafindex(struct trie *t, int index)
1797{ 1681{
1798 struct leaf *l = trie_firstleaf(t); 1682 struct tnode *l = trie_firstleaf(t);
1799 1683
1800 while (l && index-- > 0) 1684 while (l && index-- > 0)
1801 l = trie_nextleaf(l); 1685 l = trie_nextleaf(l);
@@ -1810,19 +1694,28 @@ static struct leaf *trie_leafindex(struct trie *t, int index)
1810int fib_table_flush(struct fib_table *tb) 1694int fib_table_flush(struct fib_table *tb)
1811{ 1695{
1812 struct trie *t = (struct trie *) tb->tb_data; 1696 struct trie *t = (struct trie *) tb->tb_data;
1813 struct leaf *l, *ll = NULL; 1697 struct tnode *l, *ll = NULL;
1814 int found = 0; 1698 int found = 0;
1815 1699
1816 for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) { 1700 for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
1817 found += trie_flush_leaf(l); 1701 found += trie_flush_leaf(l);
1818 1702
1819 if (ll && hlist_empty(&ll->list)) 1703 if (ll) {
1820 trie_leaf_remove(t, ll); 1704 if (hlist_empty(&ll->list))
1705 trie_leaf_remove(t, ll);
1706 else
1707 leaf_pull_suffix(ll);
1708 }
1709
1821 ll = l; 1710 ll = l;
1822 } 1711 }
1823 1712
1824 if (ll && hlist_empty(&ll->list)) 1713 if (ll) {
1825 trie_leaf_remove(t, ll); 1714 if (hlist_empty(&ll->list))
1715 trie_leaf_remove(t, ll);
1716 else
1717 leaf_pull_suffix(ll);
1718 }
1826 1719
1827 pr_debug("trie_flush found=%d\n", found); 1720 pr_debug("trie_flush found=%d\n", found);
1828 return found; 1721 return found;
@@ -1830,6 +1723,11 @@ int fib_table_flush(struct fib_table *tb)
1830 1723
1831void fib_free_table(struct fib_table *tb) 1724void fib_free_table(struct fib_table *tb)
1832{ 1725{
1726#ifdef CONFIG_IP_FIB_TRIE_STATS
1727 struct trie *t = (struct trie *)tb->tb_data;
1728
1729 free_percpu(t->stats);
1730#endif /* CONFIG_IP_FIB_TRIE_STATS */
1833 kfree(tb); 1731 kfree(tb);
1834} 1732}
1835 1733
@@ -1870,7 +1768,7 @@ static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
1870 return skb->len; 1768 return skb->len;
1871} 1769}
1872 1770
1873static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb, 1771static int fn_trie_dump_leaf(struct tnode *l, struct fib_table *tb,
1874 struct sk_buff *skb, struct netlink_callback *cb) 1772 struct sk_buff *skb, struct netlink_callback *cb)
1875{ 1773{
1876 struct leaf_info *li; 1774 struct leaf_info *li;
@@ -1906,7 +1804,7 @@ static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
1906int fib_table_dump(struct fib_table *tb, struct sk_buff *skb, 1804int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
1907 struct netlink_callback *cb) 1805 struct netlink_callback *cb)
1908{ 1806{
1909 struct leaf *l; 1807 struct tnode *l;
1910 struct trie *t = (struct trie *) tb->tb_data; 1808 struct trie *t = (struct trie *) tb->tb_data;
1911 t_key key = cb->args[2]; 1809 t_key key = cb->args[2];
1912 int count = cb->args[3]; 1810 int count = cb->args[3];
@@ -1952,7 +1850,7 @@ void __init fib_trie_init(void)
1952 0, SLAB_PANIC, NULL); 1850 0, SLAB_PANIC, NULL);
1953 1851
1954 trie_leaf_kmem = kmem_cache_create("ip_fib_trie", 1852 trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
1955 max(sizeof(struct leaf), 1853 max(sizeof(struct tnode),
1956 sizeof(struct leaf_info)), 1854 sizeof(struct leaf_info)),
1957 0, SLAB_PANIC, NULL); 1855 0, SLAB_PANIC, NULL);
1958} 1856}
@@ -1973,7 +1871,14 @@ struct fib_table *fib_trie_table(u32 id)
1973 tb->tb_num_default = 0; 1871 tb->tb_num_default = 0;
1974 1872
1975 t = (struct trie *) tb->tb_data; 1873 t = (struct trie *) tb->tb_data;
1976 memset(t, 0, sizeof(*t)); 1874 RCU_INIT_POINTER(t->trie, NULL);
1875#ifdef CONFIG_IP_FIB_TRIE_STATS
1876 t->stats = alloc_percpu(struct trie_use_stats);
1877 if (!t->stats) {
1878 kfree(tb);
1879 tb = NULL;
1880 }
1881#endif
1977 1882
1978 return tb; 1883 return tb;
1979} 1884}
@@ -1988,10 +1893,10 @@ struct fib_trie_iter {
1988 unsigned int depth; 1893 unsigned int depth;
1989}; 1894};
1990 1895
1991static struct rt_trie_node *fib_trie_get_next(struct fib_trie_iter *iter) 1896static struct tnode *fib_trie_get_next(struct fib_trie_iter *iter)
1992{ 1897{
1898 unsigned long cindex = iter->index;
1993 struct tnode *tn = iter->tnode; 1899 struct tnode *tn = iter->tnode;
1994 unsigned int cindex = iter->index;
1995 struct tnode *p; 1900 struct tnode *p;
1996 1901
1997 /* A single entry routing table */ 1902 /* A single entry routing table */
@@ -2001,8 +1906,8 @@ static struct rt_trie_node *fib_trie_get_next(struct fib_trie_iter *iter)
2001 pr_debug("get_next iter={node=%p index=%d depth=%d}\n", 1906 pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
2002 iter->tnode, iter->index, iter->depth); 1907 iter->tnode, iter->index, iter->depth);
2003rescan: 1908rescan:
2004 while (cindex < (1<<tn->bits)) { 1909 while (cindex < tnode_child_length(tn)) {
2005 struct rt_trie_node *n = tnode_get_child_rcu(tn, cindex); 1910 struct tnode *n = tnode_get_child_rcu(tn, cindex);
2006 1911
2007 if (n) { 1912 if (n) {
2008 if (IS_LEAF(n)) { 1913 if (IS_LEAF(n)) {
@@ -2010,7 +1915,7 @@ rescan:
2010 iter->index = cindex + 1; 1915 iter->index = cindex + 1;
2011 } else { 1916 } else {
2012 /* push down one level */ 1917 /* push down one level */
2013 iter->tnode = (struct tnode *) n; 1918 iter->tnode = n;
2014 iter->index = 0; 1919 iter->index = 0;
2015 ++iter->depth; 1920 ++iter->depth;
2016 } 1921 }
@@ -2021,9 +1926,9 @@ rescan:
2021 } 1926 }
2022 1927
2023 /* Current node exhausted, pop back up */ 1928 /* Current node exhausted, pop back up */
2024 p = node_parent_rcu((struct rt_trie_node *)tn); 1929 p = node_parent_rcu(tn);
2025 if (p) { 1930 if (p) {
2026 cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1; 1931 cindex = get_index(tn->key, p) + 1;
2027 tn = p; 1932 tn = p;
2028 --iter->depth; 1933 --iter->depth;
2029 goto rescan; 1934 goto rescan;
@@ -2033,10 +1938,10 @@ rescan:
2033 return NULL; 1938 return NULL;
2034} 1939}
2035 1940
2036static struct rt_trie_node *fib_trie_get_first(struct fib_trie_iter *iter, 1941static struct tnode *fib_trie_get_first(struct fib_trie_iter *iter,
2037 struct trie *t) 1942 struct trie *t)
2038{ 1943{
2039 struct rt_trie_node *n; 1944 struct tnode *n;
2040 1945
2041 if (!t) 1946 if (!t)
2042 return NULL; 1947 return NULL;
@@ -2046,7 +1951,7 @@ static struct rt_trie_node *fib_trie_get_first(struct fib_trie_iter *iter,
2046 return NULL; 1951 return NULL;
2047 1952
2048 if (IS_TNODE(n)) { 1953 if (IS_TNODE(n)) {
2049 iter->tnode = (struct tnode *) n; 1954 iter->tnode = n;
2050 iter->index = 0; 1955 iter->index = 0;
2051 iter->depth = 1; 1956 iter->depth = 1;
2052 } else { 1957 } else {
@@ -2060,7 +1965,7 @@ static struct rt_trie_node *fib_trie_get_first(struct fib_trie_iter *iter,
2060 1965
2061static void trie_collect_stats(struct trie *t, struct trie_stat *s) 1966static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2062{ 1967{
2063 struct rt_trie_node *n; 1968 struct tnode *n;
2064 struct fib_trie_iter iter; 1969 struct fib_trie_iter iter;
2065 1970
2066 memset(s, 0, sizeof(*s)); 1971 memset(s, 0, sizeof(*s));
@@ -2068,7 +1973,6 @@ static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2068 rcu_read_lock(); 1973 rcu_read_lock();
2069 for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) { 1974 for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
2070 if (IS_LEAF(n)) { 1975 if (IS_LEAF(n)) {
2071 struct leaf *l = (struct leaf *)n;
2072 struct leaf_info *li; 1976 struct leaf_info *li;
2073 1977
2074 s->leaves++; 1978 s->leaves++;
@@ -2076,19 +1980,13 @@ static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2076 if (iter.depth > s->maxdepth) 1980 if (iter.depth > s->maxdepth)
2077 s->maxdepth = iter.depth; 1981 s->maxdepth = iter.depth;
2078 1982
2079 hlist_for_each_entry_rcu(li, &l->list, hlist) 1983 hlist_for_each_entry_rcu(li, &n->list, hlist)
2080 ++s->prefixes; 1984 ++s->prefixes;
2081 } else { 1985 } else {
2082 const struct tnode *tn = (const struct tnode *) n;
2083 int i;
2084
2085 s->tnodes++; 1986 s->tnodes++;
2086 if (tn->bits < MAX_STAT_DEPTH) 1987 if (n->bits < MAX_STAT_DEPTH)
2087 s->nodesizes[tn->bits]++; 1988 s->nodesizes[n->bits]++;
2088 1989 s->nullpointers += n->empty_children;
2089 for (i = 0; i < (1<<tn->bits); i++)
2090 if (!tn->child[i])
2091 s->nullpointers++;
2092 } 1990 }
2093 } 1991 }
2094 rcu_read_unlock(); 1992 rcu_read_unlock();
@@ -2111,7 +2009,7 @@ static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2111 seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth); 2009 seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth);
2112 2010
2113 seq_printf(seq, "\tLeaves: %u\n", stat->leaves); 2011 seq_printf(seq, "\tLeaves: %u\n", stat->leaves);
2114 bytes = sizeof(struct leaf) * stat->leaves; 2012 bytes = sizeof(struct tnode) * stat->leaves;
2115 2013
2116 seq_printf(seq, "\tPrefixes: %u\n", stat->prefixes); 2014 seq_printf(seq, "\tPrefixes: %u\n", stat->prefixes);
2117 bytes += sizeof(struct leaf_info) * stat->prefixes; 2015 bytes += sizeof(struct leaf_info) * stat->prefixes;
@@ -2132,25 +2030,38 @@ static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2132 seq_putc(seq, '\n'); 2030 seq_putc(seq, '\n');
2133 seq_printf(seq, "\tPointers: %u\n", pointers); 2031 seq_printf(seq, "\tPointers: %u\n", pointers);
2134 2032
2135 bytes += sizeof(struct rt_trie_node *) * pointers; 2033 bytes += sizeof(struct tnode *) * pointers;
2136 seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers); 2034 seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2137 seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024); 2035 seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024);
2138} 2036}
2139 2037
2140#ifdef CONFIG_IP_FIB_TRIE_STATS 2038#ifdef CONFIG_IP_FIB_TRIE_STATS
2141static void trie_show_usage(struct seq_file *seq, 2039static void trie_show_usage(struct seq_file *seq,
2142 const struct trie_use_stats *stats) 2040 const struct trie_use_stats __percpu *stats)
2143{ 2041{
2042 struct trie_use_stats s = { 0 };
2043 int cpu;
2044
2045 /* loop through all of the CPUs and gather up the stats */
2046 for_each_possible_cpu(cpu) {
2047 const struct trie_use_stats *pcpu = per_cpu_ptr(stats, cpu);
2048
2049 s.gets += pcpu->gets;
2050 s.backtrack += pcpu->backtrack;
2051 s.semantic_match_passed += pcpu->semantic_match_passed;
2052 s.semantic_match_miss += pcpu->semantic_match_miss;
2053 s.null_node_hit += pcpu->null_node_hit;
2054 s.resize_node_skipped += pcpu->resize_node_skipped;
2055 }
2056
2144 seq_printf(seq, "\nCounters:\n---------\n"); 2057 seq_printf(seq, "\nCounters:\n---------\n");
2145 seq_printf(seq, "gets = %u\n", stats->gets); 2058 seq_printf(seq, "gets = %u\n", s.gets);
2146 seq_printf(seq, "backtracks = %u\n", stats->backtrack); 2059 seq_printf(seq, "backtracks = %u\n", s.backtrack);
2147 seq_printf(seq, "semantic match passed = %u\n", 2060 seq_printf(seq, "semantic match passed = %u\n",
2148 stats->semantic_match_passed); 2061 s.semantic_match_passed);
2149 seq_printf(seq, "semantic match miss = %u\n", 2062 seq_printf(seq, "semantic match miss = %u\n", s.semantic_match_miss);
2150 stats->semantic_match_miss); 2063 seq_printf(seq, "null node hit= %u\n", s.null_node_hit);
2151 seq_printf(seq, "null node hit= %u\n", stats->null_node_hit); 2064 seq_printf(seq, "skipped node resize = %u\n\n", s.resize_node_skipped);
2152 seq_printf(seq, "skipped node resize = %u\n\n",
2153 stats->resize_node_skipped);
2154} 2065}
2155#endif /* CONFIG_IP_FIB_TRIE_STATS */ 2066#endif /* CONFIG_IP_FIB_TRIE_STATS */
2156 2067
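The stats handling above follows the usual per-cpu counter pattern: one lock-free counter per CPU on the fast path, summed only when the /proc file is read. A condensed kernel-style sketch, not part of this commit (the struct and function names are invented):

	#include <linux/errno.h>
	#include <linux/percpu.h>

	struct hit_stats { unsigned int gets; };

	static struct hit_stats __percpu *stats;

	static int stats_init(void)
	{
		stats = alloc_percpu(struct hit_stats);
		return stats ? 0 : -ENOMEM;
	}

	static void stats_hit(void)		/* fast path */
	{
		this_cpu_inc(stats->gets);
	}

	static unsigned int stats_sum(void)	/* slow path, e.g. the seq_file show */
	{
		unsigned int sum = 0;
		int cpu;

		for_each_possible_cpu(cpu)
			sum += per_cpu_ptr(stats, cpu)->gets;
		return sum;
	}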
@@ -2173,7 +2084,7 @@ static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2173 seq_printf(seq, 2084 seq_printf(seq,
2174 "Basic info: size of leaf:" 2085 "Basic info: size of leaf:"
2175 " %Zd bytes, size of tnode: %Zd bytes.\n", 2086 " %Zd bytes, size of tnode: %Zd bytes.\n",
2176 sizeof(struct leaf), sizeof(struct tnode)); 2087 sizeof(struct tnode), sizeof(struct tnode));
2177 2088
2178 for (h = 0; h < FIB_TABLE_HASHSZ; h++) { 2089 for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2179 struct hlist_head *head = &net->ipv4.fib_table_hash[h]; 2090 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
@@ -2191,7 +2102,7 @@ static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2191 trie_collect_stats(t, &stat); 2102 trie_collect_stats(t, &stat);
2192 trie_show_stats(seq, &stat); 2103 trie_show_stats(seq, &stat);
2193#ifdef CONFIG_IP_FIB_TRIE_STATS 2104#ifdef CONFIG_IP_FIB_TRIE_STATS
2194 trie_show_usage(seq, &t->stats); 2105 trie_show_usage(seq, t->stats);
2195#endif 2106#endif
2196 } 2107 }
2197 } 2108 }
@@ -2212,7 +2123,7 @@ static const struct file_operations fib_triestat_fops = {
2212 .release = single_release_net, 2123 .release = single_release_net,
2213}; 2124};
2214 2125
2215static struct rt_trie_node *fib_trie_get_idx(struct seq_file *seq, loff_t pos) 2126static struct tnode *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
2216{ 2127{
2217 struct fib_trie_iter *iter = seq->private; 2128 struct fib_trie_iter *iter = seq->private;
2218 struct net *net = seq_file_net(seq); 2129 struct net *net = seq_file_net(seq);
@@ -2224,7 +2135,7 @@ static struct rt_trie_node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
2224 struct fib_table *tb; 2135 struct fib_table *tb;
2225 2136
2226 hlist_for_each_entry_rcu(tb, head, tb_hlist) { 2137 hlist_for_each_entry_rcu(tb, head, tb_hlist) {
2227 struct rt_trie_node *n; 2138 struct tnode *n;
2228 2139
2229 for (n = fib_trie_get_first(iter, 2140 for (n = fib_trie_get_first(iter,
2230 (struct trie *) tb->tb_data); 2141 (struct trie *) tb->tb_data);
@@ -2253,7 +2164,7 @@ static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2253 struct fib_table *tb = iter->tb; 2164 struct fib_table *tb = iter->tb;
2254 struct hlist_node *tb_node; 2165 struct hlist_node *tb_node;
2255 unsigned int h; 2166 unsigned int h;
2256 struct rt_trie_node *n; 2167 struct tnode *n;
2257 2168
2258 ++*pos; 2169 ++*pos;
2259 /* next node in same table */ 2170 /* next node in same table */
@@ -2339,29 +2250,26 @@ static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
2339static int fib_trie_seq_show(struct seq_file *seq, void *v) 2250static int fib_trie_seq_show(struct seq_file *seq, void *v)
2340{ 2251{
2341 const struct fib_trie_iter *iter = seq->private; 2252 const struct fib_trie_iter *iter = seq->private;
2342 struct rt_trie_node *n = v; 2253 struct tnode *n = v;
2343 2254
2344 if (!node_parent_rcu(n)) 2255 if (!node_parent_rcu(n))
2345 fib_table_print(seq, iter->tb); 2256 fib_table_print(seq, iter->tb);
2346 2257
2347 if (IS_TNODE(n)) { 2258 if (IS_TNODE(n)) {
2348 struct tnode *tn = (struct tnode *) n; 2259 __be32 prf = htonl(n->key);
2349 __be32 prf = htonl(mask_pfx(tn->key, tn->pos));
2350 2260
2351 seq_indent(seq, iter->depth-1); 2261 seq_indent(seq, iter->depth-1);
2352 seq_printf(seq, " +-- %pI4/%d %d %d %d\n", 2262 seq_printf(seq, " +-- %pI4/%zu %u %u %u\n",
2353 &prf, tn->pos, tn->bits, tn->full_children, 2263 &prf, KEYLENGTH - n->pos - n->bits, n->bits,
2354 tn->empty_children); 2264 n->full_children, n->empty_children);
2355
2356 } else { 2265 } else {
2357 struct leaf *l = (struct leaf *) n;
2358 struct leaf_info *li; 2266 struct leaf_info *li;
2359 __be32 val = htonl(l->key); 2267 __be32 val = htonl(n->key);
2360 2268
2361 seq_indent(seq, iter->depth); 2269 seq_indent(seq, iter->depth);
2362 seq_printf(seq, " |-- %pI4\n", &val); 2270 seq_printf(seq, " |-- %pI4\n", &val);
2363 2271
2364 hlist_for_each_entry_rcu(li, &l->list, hlist) { 2272 hlist_for_each_entry_rcu(li, &n->list, hlist) {
2365 struct fib_alias *fa; 2273 struct fib_alias *fa;
2366 2274
2367 list_for_each_entry_rcu(fa, &li->falh, fa_list) { 2275 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
@@ -2411,9 +2319,9 @@ struct fib_route_iter {
2411 t_key key; 2319 t_key key;
2412}; 2320};
2413 2321
2414static struct leaf *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos) 2322static struct tnode *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
2415{ 2323{
2416 struct leaf *l = NULL; 2324 struct tnode *l = NULL;
2417 struct trie *t = iter->main_trie; 2325 struct trie *t = iter->main_trie;
2418 2326
2419 /* use cache location of last found key */ 2327 /* use cache location of last found key */
@@ -2458,7 +2366,7 @@ static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
2458static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2366static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2459{ 2367{
2460 struct fib_route_iter *iter = seq->private; 2368 struct fib_route_iter *iter = seq->private;
2461 struct leaf *l = v; 2369 struct tnode *l = v;
2462 2370
2463 ++*pos; 2371 ++*pos;
2464 if (v == SEQ_START_TOKEN) { 2372 if (v == SEQ_START_TOKEN) {
@@ -2504,7 +2412,7 @@ static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info
2504 */ 2412 */
2505static int fib_route_seq_show(struct seq_file *seq, void *v) 2413static int fib_route_seq_show(struct seq_file *seq, void *v)
2506{ 2414{
2507 struct leaf *l = v; 2415 struct tnode *l = v;
2508 struct leaf_info *li; 2416 struct leaf_info *li;
2509 2417
2510 if (v == SEQ_START_TOKEN) { 2418 if (v == SEQ_START_TOKEN) {
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index b986298a7ba3..92ddea1e6457 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -70,7 +70,6 @@ static struct guehdr *gue_remcsum(struct sk_buff *skb, struct guehdr *guehdr,
70 size_t start = ntohs(pd[0]); 70 size_t start = ntohs(pd[0]);
71 size_t offset = ntohs(pd[1]); 71 size_t offset = ntohs(pd[1]);
72 size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start); 72 size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start);
73 __wsum delta;
74 73
75 if (skb->remcsum_offload) { 74 if (skb->remcsum_offload) {
76 /* Already processed in GRO path */ 75 /* Already processed in GRO path */
@@ -82,14 +81,7 @@ static struct guehdr *gue_remcsum(struct sk_buff *skb, struct guehdr *guehdr,
82 return NULL; 81 return NULL;
83 guehdr = (struct guehdr *)&udp_hdr(skb)[1]; 82 guehdr = (struct guehdr *)&udp_hdr(skb)[1];
84 83
85 if (unlikely(skb->ip_summed != CHECKSUM_COMPLETE)) 84 skb_remcsum_process(skb, (void *)guehdr + hdrlen, start, offset);
86 __skb_checksum_complete(skb);
87
88 delta = remcsum_adjust((void *)guehdr + hdrlen,
89 skb->csum, start, offset);
90
91 /* Adjust skb->csum since we changed the packet */
92 skb->csum = csum_add(skb->csum, delta);
93 85
94 return guehdr; 86 return guehdr;
95} 87}
@@ -174,7 +166,8 @@ drop:
174} 166}
175 167
176static struct sk_buff **fou_gro_receive(struct sk_buff **head, 168static struct sk_buff **fou_gro_receive(struct sk_buff **head,
177 struct sk_buff *skb) 169 struct sk_buff *skb,
170 struct udp_offload *uoff)
178{ 171{
179 const struct net_offload *ops; 172 const struct net_offload *ops;
180 struct sk_buff **pp = NULL; 173 struct sk_buff **pp = NULL;
@@ -195,7 +188,8 @@ out_unlock:
195 return pp; 188 return pp;
196} 189}
197 190
198static int fou_gro_complete(struct sk_buff *skb, int nhoff) 191static int fou_gro_complete(struct sk_buff *skb, int nhoff,
192 struct udp_offload *uoff)
199{ 193{
200 const struct net_offload *ops; 194 const struct net_offload *ops;
201 u8 proto = NAPI_GRO_CB(skb)->proto; 195 u8 proto = NAPI_GRO_CB(skb)->proto;
@@ -226,7 +220,6 @@ static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off,
226 size_t start = ntohs(pd[0]); 220 size_t start = ntohs(pd[0]);
227 size_t offset = ntohs(pd[1]); 221 size_t offset = ntohs(pd[1]);
228 size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start); 222 size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start);
229 __wsum delta;
230 223
231 if (skb->remcsum_offload) 224 if (skb->remcsum_offload)
232 return guehdr; 225 return guehdr;
@@ -241,12 +234,7 @@ static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off,
241 return NULL; 234 return NULL;
242 } 235 }
243 236
244 delta = remcsum_adjust((void *)guehdr + hdrlen, 237 skb_gro_remcsum_process(skb, (void *)guehdr + hdrlen, start, offset);
245 NAPI_GRO_CB(skb)->csum, start, offset);
246
247 /* Adjust skb->csum since we changed the packet */
248 skb->csum = csum_add(skb->csum, delta);
249 NAPI_GRO_CB(skb)->csum = csum_add(NAPI_GRO_CB(skb)->csum, delta);
250 238
251 skb->remcsum_offload = 1; 239 skb->remcsum_offload = 1;
252 240
@@ -254,7 +242,8 @@ static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off,
254} 242}
255 243
256static struct sk_buff **gue_gro_receive(struct sk_buff **head, 244static struct sk_buff **gue_gro_receive(struct sk_buff **head,
257 struct sk_buff *skb) 245 struct sk_buff *skb,
246 struct udp_offload *uoff)
258{ 247{
259 const struct net_offload **offloads; 248 const struct net_offload **offloads;
260 const struct net_offload *ops; 249 const struct net_offload *ops;
@@ -360,7 +349,8 @@ out:
360 return pp; 349 return pp;
361} 350}
362 351
363static int gue_gro_complete(struct sk_buff *skb, int nhoff) 352static int gue_gro_complete(struct sk_buff *skb, int nhoff,
353 struct udp_offload *uoff)
364{ 354{
365 const struct net_offload **offloads; 355 const struct net_offload **offloads;
366 struct guehdr *guehdr = (struct guehdr *)(skb->data + nhoff); 356 struct guehdr *guehdr = (struct guehdr *)(skb->data + nhoff);
@@ -490,7 +480,7 @@ static int fou_create(struct net *net, struct fou_cfg *cfg,
490 sk->sk_user_data = fou; 480 sk->sk_user_data = fou;
491 fou->sock = sock; 481 fou->sock = sock;
492 482
493 udp_set_convert_csum(sk, true); 483 inet_inc_convert_csum(sk);
494 484
495 sk->sk_allocation = GFP_ATOMIC; 485 sk->sk_allocation = GFP_ATOMIC;
496 486
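
The fou hunks above replace the open-coded remote-checksum handling with the skb_remcsum_process()/skb_gro_remcsum_process() helpers and add a struct udp_offload * argument to the GRO callbacks. A minimal sketch of what the non-GRO helper has to cover, reconstructed from the lines removed above (the function name is illustrative, not a kernel symbol):

static void remcsum_process_sketch(struct sk_buff *skb, void *hdr_end,
                                   size_t start, size_t offset)
{
        __wsum delta;

        /* make skb->csum valid before computing the adjustment */
        if (unlikely(skb->ip_summed != CHECKSUM_COMPLETE))
                __skb_checksum_complete(skb);

        /* patch the checksum field inside the packet ... */
        delta = remcsum_adjust(hdr_end, skb->csum, start, offset);

        /* ... and fold the change back into skb->csum, since the
         * packet contents were rewritten */
        skb->csum = csum_add(skb->csum, delta);
}
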
diff --git a/net/ipv4/geneve.c b/net/ipv4/geneve.c
index 394a200f93c1..5a4828ba05ad 100644
--- a/net/ipv4/geneve.c
+++ b/net/ipv4/geneve.c
@@ -17,7 +17,7 @@
17#include <linux/errno.h> 17#include <linux/errno.h>
18#include <linux/slab.h> 18#include <linux/slab.h>
19#include <linux/skbuff.h> 19#include <linux/skbuff.h>
20#include <linux/rculist.h> 20#include <linux/list.h>
21#include <linux/netdevice.h> 21#include <linux/netdevice.h>
22#include <linux/in.h> 22#include <linux/in.h>
23#include <linux/ip.h> 23#include <linux/ip.h>
@@ -26,8 +26,8 @@
26#include <linux/etherdevice.h> 26#include <linux/etherdevice.h>
27#include <linux/if_ether.h> 27#include <linux/if_ether.h>
28#include <linux/if_vlan.h> 28#include <linux/if_vlan.h>
29#include <linux/hash.h>
30#include <linux/ethtool.h> 29#include <linux/ethtool.h>
30#include <linux/mutex.h>
31#include <net/arp.h> 31#include <net/arp.h>
32#include <net/ndisc.h> 32#include <net/ndisc.h>
33#include <net/ip.h> 33#include <net/ip.h>
@@ -50,38 +50,30 @@
50#include <net/ip6_checksum.h> 50#include <net/ip6_checksum.h>
51#endif 51#endif
52 52
53#define PORT_HASH_BITS 8 53/* Protects sock_list and refcounts. */
54#define PORT_HASH_SIZE (1<<PORT_HASH_BITS) 54static DEFINE_MUTEX(geneve_mutex);
55 55
56/* per-network namespace private data for this module */ 56/* per-network namespace private data for this module */
57struct geneve_net { 57struct geneve_net {
58 struct hlist_head sock_list[PORT_HASH_SIZE]; 58 struct list_head sock_list;
59 spinlock_t sock_lock; /* Protects sock_list */
60}; 59};
61 60
62static int geneve_net_id; 61static int geneve_net_id;
63 62
64static struct workqueue_struct *geneve_wq;
65
66static inline struct genevehdr *geneve_hdr(const struct sk_buff *skb) 63static inline struct genevehdr *geneve_hdr(const struct sk_buff *skb)
67{ 64{
68 return (struct genevehdr *)(udp_hdr(skb) + 1); 65 return (struct genevehdr *)(udp_hdr(skb) + 1);
69} 66}
70 67
71static struct hlist_head *gs_head(struct net *net, __be16 port) 68static struct geneve_sock *geneve_find_sock(struct net *net,
69 sa_family_t family, __be16 port)
72{ 70{
73 struct geneve_net *gn = net_generic(net, geneve_net_id); 71 struct geneve_net *gn = net_generic(net, geneve_net_id);
74
75 return &gn->sock_list[hash_32(ntohs(port), PORT_HASH_BITS)];
76}
77
78/* Find geneve socket based on network namespace and UDP port */
79static struct geneve_sock *geneve_find_sock(struct net *net, __be16 port)
80{
81 struct geneve_sock *gs; 72 struct geneve_sock *gs;
82 73
83 hlist_for_each_entry_rcu(gs, gs_head(net, port), hlist) { 74 list_for_each_entry(gs, &gn->sock_list, list) {
84 if (inet_sk(gs->sock->sk)->inet_sport == port) 75 if (inet_sk(gs->sock->sk)->inet_sport == port &&
76 inet_sk(gs->sock->sk)->sk.sk_family == family)
85 return gs; 77 return gs;
86 } 78 }
87 79
@@ -115,19 +107,19 @@ int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt,
115 struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos, 107 struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos,
116 __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port, 108 __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port,
117 __be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt, 109 __be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt,
118 bool xnet) 110 bool csum, bool xnet)
119{ 111{
120 struct genevehdr *gnvh; 112 struct genevehdr *gnvh;
121 int min_headroom; 113 int min_headroom;
122 int err; 114 int err;
123 115
124 skb = udp_tunnel_handle_offloads(skb, !gs->sock->sk->sk_no_check_tx); 116 skb = udp_tunnel_handle_offloads(skb, csum);
125 if (IS_ERR(skb)) 117 if (IS_ERR(skb))
126 return PTR_ERR(skb); 118 return PTR_ERR(skb);
127 119
128 min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len 120 min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
129 + GENEVE_BASE_HLEN + opt_len + sizeof(struct iphdr) 121 + GENEVE_BASE_HLEN + opt_len + sizeof(struct iphdr)
130 + (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0); 122 + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0);
131 123
132 err = skb_cow_head(skb, min_headroom); 124 err = skb_cow_head(skb, min_headroom);
133 if (unlikely(err)) { 125 if (unlikely(err)) {
@@ -144,11 +136,107 @@ int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt,
144 136
145 skb_set_inner_protocol(skb, htons(ETH_P_TEB)); 137 skb_set_inner_protocol(skb, htons(ETH_P_TEB));
146 138
147 return udp_tunnel_xmit_skb(gs->sock, rt, skb, src, dst, 139 return udp_tunnel_xmit_skb(rt, skb, src, dst,
148 tos, ttl, df, src_port, dst_port, xnet); 140 tos, ttl, df, src_port, dst_port, xnet,
141 !csum);
149} 142}
150EXPORT_SYMBOL_GPL(geneve_xmit_skb); 143EXPORT_SYMBOL_GPL(geneve_xmit_skb);
151 144
145static int geneve_hlen(struct genevehdr *gh)
146{
147 return sizeof(*gh) + gh->opt_len * 4;
148}
149
150static struct sk_buff **geneve_gro_receive(struct sk_buff **head,
151 struct sk_buff *skb,
152 struct udp_offload *uoff)
153{
154 struct sk_buff *p, **pp = NULL;
155 struct genevehdr *gh, *gh2;
156 unsigned int hlen, gh_len, off_gnv;
157 const struct packet_offload *ptype;
158 __be16 type;
159 int flush = 1;
160
161 off_gnv = skb_gro_offset(skb);
162 hlen = off_gnv + sizeof(*gh);
163 gh = skb_gro_header_fast(skb, off_gnv);
164 if (skb_gro_header_hard(skb, hlen)) {
165 gh = skb_gro_header_slow(skb, hlen, off_gnv);
166 if (unlikely(!gh))
167 goto out;
168 }
169
170 if (gh->ver != GENEVE_VER || gh->oam)
171 goto out;
172 gh_len = geneve_hlen(gh);
173
174 hlen = off_gnv + gh_len;
175 if (skb_gro_header_hard(skb, hlen)) {
176 gh = skb_gro_header_slow(skb, hlen, off_gnv);
177 if (unlikely(!gh))
178 goto out;
179 }
180
181 flush = 0;
182
183 for (p = *head; p; p = p->next) {
184 if (!NAPI_GRO_CB(p)->same_flow)
185 continue;
186
187 gh2 = (struct genevehdr *)(p->data + off_gnv);
188 if (gh->opt_len != gh2->opt_len ||
189 memcmp(gh, gh2, gh_len)) {
190 NAPI_GRO_CB(p)->same_flow = 0;
191 continue;
192 }
193 }
194
195 type = gh->proto_type;
196
197 rcu_read_lock();
198 ptype = gro_find_receive_by_type(type);
199 if (ptype == NULL) {
200 flush = 1;
201 goto out_unlock;
202 }
203
204 skb_gro_pull(skb, gh_len);
205 skb_gro_postpull_rcsum(skb, gh, gh_len);
206 pp = ptype->callbacks.gro_receive(head, skb);
207
208out_unlock:
209 rcu_read_unlock();
210out:
211 NAPI_GRO_CB(skb)->flush |= flush;
212
213 return pp;
214}
215
216static int geneve_gro_complete(struct sk_buff *skb, int nhoff,
217 struct udp_offload *uoff)
218{
219 struct genevehdr *gh;
220 struct packet_offload *ptype;
221 __be16 type;
222 int gh_len;
223 int err = -ENOSYS;
224
225 udp_tunnel_gro_complete(skb, nhoff);
226
227 gh = (struct genevehdr *)(skb->data + nhoff);
228 gh_len = geneve_hlen(gh);
229 type = gh->proto_type;
230
231 rcu_read_lock();
232 ptype = gro_find_complete_by_type(type);
233 if (ptype != NULL)
234 err = ptype->callbacks.gro_complete(skb, nhoff + gh_len);
235
236 rcu_read_unlock();
237 return err;
238}
239
152static void geneve_notify_add_rx_port(struct geneve_sock *gs) 240static void geneve_notify_add_rx_port(struct geneve_sock *gs)
153{ 241{
154 struct sock *sk = gs->sock->sk; 242 struct sock *sk = gs->sock->sk;
@@ -214,15 +302,6 @@ error:
214 return 1; 302 return 1;
215} 303}
216 304
217static void geneve_del_work(struct work_struct *work)
218{
219 struct geneve_sock *gs = container_of(work, struct geneve_sock,
220 del_work);
221
222 udp_tunnel_sock_release(gs->sock);
223 kfree_rcu(gs, rcu);
224}
225
226static struct socket *geneve_create_sock(struct net *net, bool ipv6, 305static struct socket *geneve_create_sock(struct net *net, bool ipv6,
227 __be16 port) 306 __be16 port)
228{ 307{
@@ -263,8 +342,6 @@ static struct geneve_sock *geneve_socket_create(struct net *net, __be16 port,
263 if (!gs) 342 if (!gs)
264 return ERR_PTR(-ENOMEM); 343 return ERR_PTR(-ENOMEM);
265 344
266 INIT_WORK(&gs->del_work, geneve_del_work);
267
268 sock = geneve_create_sock(net, ipv6, port); 345 sock = geneve_create_sock(net, ipv6, port);
269 if (IS_ERR(sock)) { 346 if (IS_ERR(sock)) {
270 kfree(gs); 347 kfree(gs);
@@ -272,19 +349,15 @@ static struct geneve_sock *geneve_socket_create(struct net *net, __be16 port,
272 } 349 }
273 350
274 gs->sock = sock; 351 gs->sock = sock;
275 atomic_set(&gs->refcnt, 1); 352 gs->refcnt = 1;
276 gs->rcv = rcv; 353 gs->rcv = rcv;
277 gs->rcv_data = data; 354 gs->rcv_data = data;
278 355
279 /* Initialize the geneve udp offloads structure */ 356 /* Initialize the geneve udp offloads structure */
280 gs->udp_offloads.port = port; 357 gs->udp_offloads.port = port;
281 gs->udp_offloads.callbacks.gro_receive = NULL; 358 gs->udp_offloads.callbacks.gro_receive = geneve_gro_receive;
282 gs->udp_offloads.callbacks.gro_complete = NULL; 359 gs->udp_offloads.callbacks.gro_complete = geneve_gro_complete;
283
284 spin_lock(&gn->sock_lock);
285 hlist_add_head_rcu(&gs->hlist, gs_head(net, port));
286 geneve_notify_add_rx_port(gs); 360 geneve_notify_add_rx_port(gs);
287 spin_unlock(&gn->sock_lock);
288 361
289 /* Mark socket as an encapsulation socket */ 362 /* Mark socket as an encapsulation socket */
290 tunnel_cfg.sk_user_data = gs; 363 tunnel_cfg.sk_user_data = gs;
@@ -293,6 +366,8 @@ static struct geneve_sock *geneve_socket_create(struct net *net, __be16 port,
293 tunnel_cfg.encap_destroy = NULL; 366 tunnel_cfg.encap_destroy = NULL;
294 setup_udp_tunnel_sock(net, sock, &tunnel_cfg); 367 setup_udp_tunnel_sock(net, sock, &tunnel_cfg);
295 368
369 list_add(&gs->list, &gn->sock_list);
370
296 return gs; 371 return gs;
297} 372}
298 373
@@ -300,25 +375,21 @@ struct geneve_sock *geneve_sock_add(struct net *net, __be16 port,
300 geneve_rcv_t *rcv, void *data, 375 geneve_rcv_t *rcv, void *data,
301 bool no_share, bool ipv6) 376 bool no_share, bool ipv6)
302{ 377{
303 struct geneve_net *gn = net_generic(net, geneve_net_id);
304 struct geneve_sock *gs; 378 struct geneve_sock *gs;
305 379
306 gs = geneve_socket_create(net, port, rcv, data, ipv6); 380 mutex_lock(&geneve_mutex);
307 if (!IS_ERR(gs))
308 return gs;
309
310 if (no_share) /* Return error if sharing is not allowed. */
311 return ERR_PTR(-EINVAL);
312 381
313 spin_lock(&gn->sock_lock); 382 gs = geneve_find_sock(net, ipv6 ? AF_INET6 : AF_INET, port);
314 gs = geneve_find_sock(net, port); 383 if (gs) {
315 if (gs && ((gs->rcv != rcv) || 384 if (!no_share && gs->rcv == rcv)
316 !atomic_add_unless(&gs->refcnt, 1, 0))) 385 gs->refcnt++;
386 else
317 gs = ERR_PTR(-EBUSY); 387 gs = ERR_PTR(-EBUSY);
318 spin_unlock(&gn->sock_lock); 388 } else {
389 gs = geneve_socket_create(net, port, rcv, data, ipv6);
390 }
319 391
320 if (!gs) 392 mutex_unlock(&geneve_mutex);
321 gs = ERR_PTR(-EINVAL);
322 393
323 return gs; 394 return gs;
324} 395}
@@ -326,37 +397,32 @@ EXPORT_SYMBOL_GPL(geneve_sock_add);
326 397
327void geneve_sock_release(struct geneve_sock *gs) 398void geneve_sock_release(struct geneve_sock *gs)
328{ 399{
329 struct net *net = sock_net(gs->sock->sk); 400 mutex_lock(&geneve_mutex);
330 struct geneve_net *gn = net_generic(net, geneve_net_id);
331 401
332 if (!atomic_dec_and_test(&gs->refcnt)) 402 if (--gs->refcnt)
333 return; 403 goto unlock;
334 404
335 spin_lock(&gn->sock_lock); 405 list_del(&gs->list);
336 hlist_del_rcu(&gs->hlist);
337 geneve_notify_del_rx_port(gs); 406 geneve_notify_del_rx_port(gs);
338 spin_unlock(&gn->sock_lock); 407 udp_tunnel_sock_release(gs->sock);
408 kfree_rcu(gs, rcu);
339 409
340 queue_work(geneve_wq, &gs->del_work); 410unlock:
411 mutex_unlock(&geneve_mutex);
341} 412}
342EXPORT_SYMBOL_GPL(geneve_sock_release); 413EXPORT_SYMBOL_GPL(geneve_sock_release);
343 414
344static __net_init int geneve_init_net(struct net *net) 415static __net_init int geneve_init_net(struct net *net)
345{ 416{
346 struct geneve_net *gn = net_generic(net, geneve_net_id); 417 struct geneve_net *gn = net_generic(net, geneve_net_id);
347 unsigned int h;
348 418
349 spin_lock_init(&gn->sock_lock); 419 INIT_LIST_HEAD(&gn->sock_list);
350
351 for (h = 0; h < PORT_HASH_SIZE; ++h)
352 INIT_HLIST_HEAD(&gn->sock_list[h]);
353 420
354 return 0; 421 return 0;
355} 422}
356 423
357static struct pernet_operations geneve_net_ops = { 424static struct pernet_operations geneve_net_ops = {
358 .init = geneve_init_net, 425 .init = geneve_init_net,
359 .exit = NULL,
360 .id = &geneve_net_id, 426 .id = &geneve_net_id,
361 .size = sizeof(struct geneve_net), 427 .size = sizeof(struct geneve_net),
362}; 428};
@@ -365,10 +431,6 @@ static int __init geneve_init_module(void)
365{ 431{
366 int rc; 432 int rc;
367 433
368 geneve_wq = alloc_workqueue("geneve", 0, 0);
369 if (!geneve_wq)
370 return -ENOMEM;
371
372 rc = register_pernet_subsys(&geneve_net_ops); 434 rc = register_pernet_subsys(&geneve_net_ops);
373 if (rc) 435 if (rc)
374 return rc; 436 return rc;
@@ -377,11 +439,10 @@ static int __init geneve_init_module(void)
377 439
378 return 0; 440 return 0;
379} 441}
380late_initcall(geneve_init_module); 442module_init(geneve_init_module);
381 443
382static void __exit geneve_cleanup_module(void) 444static void __exit geneve_cleanup_module(void)
383{ 445{
384 destroy_workqueue(geneve_wq);
385 unregister_pernet_subsys(&geneve_net_ops); 446 unregister_pernet_subsys(&geneve_net_ops);
386} 447}
387module_exit(geneve_cleanup_module); 448module_exit(geneve_cleanup_module);
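
Taken together, the geneve hunks drop the per-namespace hash table, sock_lock spinlock, RCU list walk and deletion workqueue in favour of a single sock_list protected by geneve_mutex with a plain integer refcount, and they register the new geneve_gro_receive()/geneve_gro_complete() callbacks. The resulting lookup-or-create flow in geneve_sock_add(), condensed from the hunks above:

        mutex_lock(&geneve_mutex);
        gs = geneve_find_sock(net, ipv6 ? AF_INET6 : AF_INET, port);
        if (gs) {
                if (!no_share && gs->rcv == rcv)
                        gs->refcnt++;           /* share the existing socket */
                else
                        gs = ERR_PTR(-EBUSY);
        } else {
                gs = geneve_socket_create(net, port, rcv, data, ipv6);
        }
        mutex_unlock(&geneve_mutex);

geneve_sock_release() mirrors this under the same mutex: it decrements the refcount and, only when it reaches zero, unlinks the socket, notifies the offload path and releases the underlying UDP socket.
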
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 36f5584d93c5..5e564014a0b7 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -205,7 +205,7 @@ static const struct icmp_control icmp_pointers[NR_ICMP_TYPES+1];
205 */ 205 */
206static struct sock *icmp_sk(struct net *net) 206static struct sock *icmp_sk(struct net *net)
207{ 207{
208 return net->ipv4.icmp_sk[smp_processor_id()]; 208 return *this_cpu_ptr(net->ipv4.icmp_sk);
209} 209}
210 210
211static inline struct sock *icmp_xmit_lock(struct net *net) 211static inline struct sock *icmp_xmit_lock(struct net *net)
@@ -1140,8 +1140,8 @@ static void __net_exit icmp_sk_exit(struct net *net)
1140 int i; 1140 int i;
1141 1141
1142 for_each_possible_cpu(i) 1142 for_each_possible_cpu(i)
1143 inet_ctl_sock_destroy(net->ipv4.icmp_sk[i]); 1143 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.icmp_sk, i));
1144 kfree(net->ipv4.icmp_sk); 1144 free_percpu(net->ipv4.icmp_sk);
1145 net->ipv4.icmp_sk = NULL; 1145 net->ipv4.icmp_sk = NULL;
1146} 1146}
1147 1147
@@ -1149,9 +1149,8 @@ static int __net_init icmp_sk_init(struct net *net)
1149{ 1149{
1150 int i, err; 1150 int i, err;
1151 1151
1152 net->ipv4.icmp_sk = 1152 net->ipv4.icmp_sk = alloc_percpu(struct sock *);
1153 kzalloc(nr_cpu_ids * sizeof(struct sock *), GFP_KERNEL); 1153 if (!net->ipv4.icmp_sk)
1154 if (net->ipv4.icmp_sk == NULL)
1155 return -ENOMEM; 1154 return -ENOMEM;
1156 1155
1157 for_each_possible_cpu(i) { 1156 for_each_possible_cpu(i) {
@@ -1162,7 +1161,7 @@ static int __net_init icmp_sk_init(struct net *net)
1162 if (err < 0) 1161 if (err < 0)
1163 goto fail; 1162 goto fail;
1164 1163
1165 net->ipv4.icmp_sk[i] = sk; 1164 *per_cpu_ptr(net->ipv4.icmp_sk, i) = sk;
1166 1165
1167 /* Enough space for 2 64K ICMP packets, including 1166 /* Enough space for 2 64K ICMP packets, including
1168 * sk_buff/skb_shared_info struct overhead. 1167 * sk_buff/skb_shared_info struct overhead.
@@ -1203,8 +1202,8 @@ static int __net_init icmp_sk_init(struct net *net)
1203 1202
1204fail: 1203fail:
1205 for_each_possible_cpu(i) 1204 for_each_possible_cpu(i)
1206 inet_ctl_sock_destroy(net->ipv4.icmp_sk[i]); 1205 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.icmp_sk, i));
1207 kfree(net->ipv4.icmp_sk); 1206 free_percpu(net->ipv4.icmp_sk);
1208 return err; 1207 return err;
1209} 1208}
1210 1209
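
The icmp.c change converts the per-namespace array of control sockets (a kzalloc'd array of nr_cpu_ids pointers indexed by smp_processor_id()) into a genuine per-cpu allocation. A minimal sketch of the alloc_percpu() pattern being adopted; the variable and the make_ctl_sock()/destroy_ctl_sock() helpers are placeholders, not kernel symbols:

        struct sock * __percpu *sk_ptrs;        /* one struct sock * slot per CPU */
        struct sock *sk;
        int cpu;

        sk_ptrs = alloc_percpu(struct sock *);
        if (!sk_ptrs)
                return -ENOMEM;

        for_each_possible_cpu(cpu)
                *per_cpu_ptr(sk_ptrs, cpu) = make_ctl_sock(cpu);        /* placeholder */

        /* fast path: the current CPU's socket, no array indexing */
        sk = *this_cpu_ptr(sk_ptrs);

        /* teardown */
        for_each_possible_cpu(cpu)
                destroy_ctl_sock(*per_cpu_ptr(sk_ptrs, cpu));           /* placeholder */
        free_percpu(sk_ptrs);
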
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index e34dccbc4d70..81751f12645f 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -203,7 +203,8 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
203 icsk->icsk_ca_ops->get_info(sk, ext, skb); 203 icsk->icsk_ca_ops->get_info(sk, ext, skb);
204 204
205out: 205out:
206 return nlmsg_end(skb, nlh); 206 nlmsg_end(skb, nlh);
207 return 0;
207 208
208errout: 209errout:
209 nlmsg_cancel(skb, nlh); 210 nlmsg_cancel(skb, nlh);
@@ -271,7 +272,8 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
271 } 272 }
272#endif 273#endif
273 274
274 return nlmsg_end(skb, nlh); 275 nlmsg_end(skb, nlh);
276 return 0;
275} 277}
276 278
277static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, 279static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
@@ -758,7 +760,8 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
758 } 760 }
759#endif 761#endif
760 762
761 return nlmsg_end(skb, nlh); 763 nlmsg_end(skb, nlh);
764 return 0;
762} 765}
763 766
764static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, 767static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index 3a83ce5efa80..787b3c294ce6 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -129,7 +129,8 @@ int ip_forward(struct sk_buff *skb)
129 * We now generate an ICMP HOST REDIRECT giving the route 129 * We now generate an ICMP HOST REDIRECT giving the route
130 * we calculated. 130 * we calculated.
131 */ 131 */
132 if (rt->rt_flags&RTCF_DOREDIRECT && !opt->srr && !skb_sec_path(skb)) 132 if (IPCB(skb)->flags & IPSKB_DOREDIRECT && !opt->srr &&
133 !skb_sec_path(skb))
133 ip_rt_send_redirect(skb); 134 ip_rt_send_redirect(skb);
134 135
135 skb->priority = rt_tos2priority(iph->tos); 136 skb->priority = rt_tos2priority(iph->tos);
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 4f4bf5b99686..6207275fc749 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -659,12 +659,12 @@ static bool ipgre_netlink_encap_parms(struct nlattr *data[],
659 659
660 if (data[IFLA_GRE_ENCAP_SPORT]) { 660 if (data[IFLA_GRE_ENCAP_SPORT]) {
661 ret = true; 661 ret = true;
662 ipencap->sport = nla_get_u16(data[IFLA_GRE_ENCAP_SPORT]); 662 ipencap->sport = nla_get_be16(data[IFLA_GRE_ENCAP_SPORT]);
663 } 663 }
664 664
665 if (data[IFLA_GRE_ENCAP_DPORT]) { 665 if (data[IFLA_GRE_ENCAP_DPORT]) {
666 ret = true; 666 ret = true;
667 ipencap->dport = nla_get_u16(data[IFLA_GRE_ENCAP_DPORT]); 667 ipencap->dport = nla_get_be16(data[IFLA_GRE_ENCAP_DPORT]);
668 } 668 }
669 669
670 return ret; 670 return ret;
@@ -673,6 +673,7 @@ static bool ipgre_netlink_encap_parms(struct nlattr *data[],
673static int gre_tap_init(struct net_device *dev) 673static int gre_tap_init(struct net_device *dev)
674{ 674{
675 __gre_tunnel_init(dev); 675 __gre_tunnel_init(dev);
676 dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
676 677
677 return ip_tunnel_init(dev); 678 return ip_tunnel_init(dev);
678} 679}
@@ -785,10 +786,10 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
785 786
786 if (nla_put_u16(skb, IFLA_GRE_ENCAP_TYPE, 787 if (nla_put_u16(skb, IFLA_GRE_ENCAP_TYPE,
787 t->encap.type) || 788 t->encap.type) ||
788 nla_put_u16(skb, IFLA_GRE_ENCAP_SPORT, 789 nla_put_be16(skb, IFLA_GRE_ENCAP_SPORT,
789 t->encap.sport) || 790 t->encap.sport) ||
790 nla_put_u16(skb, IFLA_GRE_ENCAP_DPORT, 791 nla_put_be16(skb, IFLA_GRE_ENCAP_DPORT,
791 t->encap.dport) || 792 t->encap.dport) ||
792 nla_put_u16(skb, IFLA_GRE_ENCAP_FLAGS, 793 nla_put_u16(skb, IFLA_GRE_ENCAP_FLAGS,
793 t->encap.flags)) 794 t->encap.flags))
794 goto nla_put_failure; 795 goto nla_put_failure;
@@ -828,6 +829,7 @@ static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
828 .dellink = ip_tunnel_dellink, 829 .dellink = ip_tunnel_dellink,
829 .get_size = ipgre_get_size, 830 .get_size = ipgre_get_size,
830 .fill_info = ipgre_fill_info, 831 .fill_info = ipgre_fill_info,
832 .get_link_net = ip_tunnel_get_link_net,
831}; 833};
832 834
833static struct rtnl_link_ops ipgre_tap_ops __read_mostly = { 835static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
@@ -842,6 +844,7 @@ static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
842 .dellink = ip_tunnel_dellink, 844 .dellink = ip_tunnel_dellink,
843 .get_size = ipgre_get_size, 845 .get_size = ipgre_get_size,
844 .fill_info = ipgre_fill_info, 846 .fill_info = ipgre_fill_info,
847 .get_link_net = ip_tunnel_get_link_net,
845}; 848};
846 849
847static int __net_init ipgre_tap_init_net(struct net *net) 850static int __net_init ipgre_tap_init_net(struct net *net)
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index b50861b22b6b..d68199d9b2b0 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -755,13 +755,11 @@ ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk
755 struct msghdr *msg = from; 755 struct msghdr *msg = from;
756 756
757 if (skb->ip_summed == CHECKSUM_PARTIAL) { 757 if (skb->ip_summed == CHECKSUM_PARTIAL) {
758 /* XXX: stripping const */ 758 if (copy_from_iter(to, len, &msg->msg_iter) != len)
759 if (memcpy_fromiovecend(to, (struct iovec *)msg->msg_iter.iov, offset, len) < 0)
760 return -EFAULT; 759 return -EFAULT;
761 } else { 760 } else {
762 __wsum csum = 0; 761 __wsum csum = 0;
763 /* XXX: stripping const */ 762 if (csum_and_copy_from_iter(to, len, &csum, &msg->msg_iter) != len)
764 if (csum_partial_copy_fromiovecend(to, (struct iovec *)msg->msg_iter.iov, offset, len, &csum) < 0)
765 return -EFAULT; 763 return -EFAULT;
766 skb->csum = csum_block_add(skb->csum, csum, odd); 764 skb->csum = csum_block_add(skb->csum, csum, odd);
767 } 765 }
@@ -1506,23 +1504,8 @@ static int ip_reply_glue_bits(void *dptr, char *to, int offset,
1506/* 1504/*
1507 * Generic function to send a packet as reply to another packet. 1505 * Generic function to send a packet as reply to another packet.
1508 * Used to send some TCP resets/acks so far. 1506 * Used to send some TCP resets/acks so far.
1509 *
1510 * Use a fake percpu inet socket to avoid false sharing and contention.
1511 */ 1507 */
1512static DEFINE_PER_CPU(struct inet_sock, unicast_sock) = { 1508void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
1513 .sk = {
1514 .__sk_common = {
1515 .skc_refcnt = ATOMIC_INIT(1),
1516 },
1517 .sk_wmem_alloc = ATOMIC_INIT(1),
1518 .sk_allocation = GFP_ATOMIC,
1519 .sk_flags = (1UL << SOCK_USE_WRITE_QUEUE),
1520 },
1521 .pmtudisc = IP_PMTUDISC_WANT,
1522 .uc_ttl = -1,
1523};
1524
1525void ip_send_unicast_reply(struct net *net, struct sk_buff *skb,
1526 const struct ip_options *sopt, 1509 const struct ip_options *sopt,
1527 __be32 daddr, __be32 saddr, 1510 __be32 daddr, __be32 saddr,
1528 const struct ip_reply_arg *arg, 1511 const struct ip_reply_arg *arg,
@@ -1532,9 +1515,8 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb,
1532 struct ipcm_cookie ipc; 1515 struct ipcm_cookie ipc;
1533 struct flowi4 fl4; 1516 struct flowi4 fl4;
1534 struct rtable *rt = skb_rtable(skb); 1517 struct rtable *rt = skb_rtable(skb);
1518 struct net *net = sock_net(sk);
1535 struct sk_buff *nskb; 1519 struct sk_buff *nskb;
1536 struct sock *sk;
1537 struct inet_sock *inet;
1538 int err; 1520 int err;
1539 1521
1540 if (__ip_options_echo(&replyopts.opt.opt, skb, sopt)) 1522 if (__ip_options_echo(&replyopts.opt.opt, skb, sopt))
@@ -1565,15 +1547,11 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb,
1565 if (IS_ERR(rt)) 1547 if (IS_ERR(rt))
1566 return; 1548 return;
1567 1549
1568 inet = &get_cpu_var(unicast_sock); 1550 inet_sk(sk)->tos = arg->tos;
1569 1551
1570 inet->tos = arg->tos;
1571 sk = &inet->sk;
1572 sk->sk_priority = skb->priority; 1552 sk->sk_priority = skb->priority;
1573 sk->sk_protocol = ip_hdr(skb)->protocol; 1553 sk->sk_protocol = ip_hdr(skb)->protocol;
1574 sk->sk_bound_dev_if = arg->bound_dev_if; 1554 sk->sk_bound_dev_if = arg->bound_dev_if;
1575 sock_net_set(sk, net);
1576 __skb_queue_head_init(&sk->sk_write_queue);
1577 sk->sk_sndbuf = sysctl_wmem_default; 1555 sk->sk_sndbuf = sysctl_wmem_default;
1578 err = ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, 1556 err = ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base,
1579 len, 0, &ipc, &rt, MSG_DONTWAIT); 1557 len, 0, &ipc, &rt, MSG_DONTWAIT);
@@ -1589,13 +1567,10 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb,
1589 arg->csumoffset) = csum_fold(csum_add(nskb->csum, 1567 arg->csumoffset) = csum_fold(csum_add(nskb->csum,
1590 arg->csum)); 1568 arg->csum));
1591 nskb->ip_summed = CHECKSUM_NONE; 1569 nskb->ip_summed = CHECKSUM_NONE;
1592 skb_orphan(nskb);
1593 skb_set_queue_mapping(nskb, skb_get_queue_mapping(skb)); 1570 skb_set_queue_mapping(nskb, skb_get_queue_mapping(skb));
1594 ip_push_pending_frames(sk, &fl4); 1571 ip_push_pending_frames(sk, &fl4);
1595 } 1572 }
1596out: 1573out:
1597 put_cpu_var(unicast_sock);
1598
1599 ip_rt_put(rt); 1574 ip_rt_put(rt);
1600} 1575}
1601 1576
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 8a89c738b7a3..31d8c71986b4 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -37,6 +37,7 @@
37#include <net/route.h> 37#include <net/route.h>
38#include <net/xfrm.h> 38#include <net/xfrm.h>
39#include <net/compat.h> 39#include <net/compat.h>
40#include <net/checksum.h>
40#if IS_ENABLED(CONFIG_IPV6) 41#if IS_ENABLED(CONFIG_IPV6)
41#include <net/transp_v6.h> 42#include <net/transp_v6.h>
42#endif 43#endif
@@ -45,14 +46,6 @@
45#include <linux/errqueue.h> 46#include <linux/errqueue.h>
46#include <asm/uaccess.h> 47#include <asm/uaccess.h>
47 48
48#define IP_CMSG_PKTINFO 1
49#define IP_CMSG_TTL 2
50#define IP_CMSG_TOS 4
51#define IP_CMSG_RECVOPTS 8
52#define IP_CMSG_RETOPTS 16
53#define IP_CMSG_PASSSEC 32
54#define IP_CMSG_ORIGDSTADDR 64
55
56/* 49/*
57 * SOL_IP control messages. 50 * SOL_IP control messages.
58 */ 51 */
@@ -104,6 +97,20 @@ static void ip_cmsg_recv_retopts(struct msghdr *msg, struct sk_buff *skb)
104 put_cmsg(msg, SOL_IP, IP_RETOPTS, opt->optlen, opt->__data); 97 put_cmsg(msg, SOL_IP, IP_RETOPTS, opt->optlen, opt->__data);
105} 98}
106 99
100static void ip_cmsg_recv_checksum(struct msghdr *msg, struct sk_buff *skb,
101 int offset)
102{
103 __wsum csum = skb->csum;
104
105 if (skb->ip_summed != CHECKSUM_COMPLETE)
106 return;
107
108 if (offset != 0)
109 csum = csum_sub(csum, csum_partial(skb->data, offset, 0));
110
111 put_cmsg(msg, SOL_IP, IP_CHECKSUM, sizeof(__wsum), &csum);
112}
113
107static void ip_cmsg_recv_security(struct msghdr *msg, struct sk_buff *skb) 114static void ip_cmsg_recv_security(struct msghdr *msg, struct sk_buff *skb)
108{ 115{
109 char *secdata; 116 char *secdata;
@@ -144,47 +151,73 @@ static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb)
144 put_cmsg(msg, SOL_IP, IP_ORIGDSTADDR, sizeof(sin), &sin); 151 put_cmsg(msg, SOL_IP, IP_ORIGDSTADDR, sizeof(sin), &sin);
145} 152}
146 153
147void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb) 154void ip_cmsg_recv_offset(struct msghdr *msg, struct sk_buff *skb,
155 int offset)
148{ 156{
149 struct inet_sock *inet = inet_sk(skb->sk); 157 struct inet_sock *inet = inet_sk(skb->sk);
150 unsigned int flags = inet->cmsg_flags; 158 unsigned int flags = inet->cmsg_flags;
151 159
152 /* Ordered by supposed usage frequency */ 160 /* Ordered by supposed usage frequency */
153 if (flags & 1) 161 if (flags & IP_CMSG_PKTINFO) {
154 ip_cmsg_recv_pktinfo(msg, skb); 162 ip_cmsg_recv_pktinfo(msg, skb);
155 if ((flags >>= 1) == 0)
156 return;
157 163
158 if (flags & 1) 164 flags &= ~IP_CMSG_PKTINFO;
165 if (!flags)
166 return;
167 }
168
169 if (flags & IP_CMSG_TTL) {
159 ip_cmsg_recv_ttl(msg, skb); 170 ip_cmsg_recv_ttl(msg, skb);
160 if ((flags >>= 1) == 0)
161 return;
162 171
163 if (flags & 1) 172 flags &= ~IP_CMSG_TTL;
173 if (!flags)
174 return;
175 }
176
177 if (flags & IP_CMSG_TOS) {
164 ip_cmsg_recv_tos(msg, skb); 178 ip_cmsg_recv_tos(msg, skb);
165 if ((flags >>= 1) == 0)
166 return;
167 179
168 if (flags & 1) 180 flags &= ~IP_CMSG_TOS;
181 if (!flags)
182 return;
183 }
184
185 if (flags & IP_CMSG_RECVOPTS) {
169 ip_cmsg_recv_opts(msg, skb); 186 ip_cmsg_recv_opts(msg, skb);
170 if ((flags >>= 1) == 0)
171 return;
172 187
173 if (flags & 1) 188 flags &= ~IP_CMSG_RECVOPTS;
189 if (!flags)
190 return;
191 }
192
193 if (flags & IP_CMSG_RETOPTS) {
174 ip_cmsg_recv_retopts(msg, skb); 194 ip_cmsg_recv_retopts(msg, skb);
175 if ((flags >>= 1) == 0)
176 return;
177 195
178 if (flags & 1) 196 flags &= ~IP_CMSG_RETOPTS;
197 if (!flags)
198 return;
199 }
200
201 if (flags & IP_CMSG_PASSSEC) {
179 ip_cmsg_recv_security(msg, skb); 202 ip_cmsg_recv_security(msg, skb);
180 203
181 if ((flags >>= 1) == 0) 204 flags &= ~IP_CMSG_PASSSEC;
182 return; 205 if (!flags)
183 if (flags & 1) 206 return;
207 }
208
209 if (flags & IP_CMSG_ORIGDSTADDR) {
184 ip_cmsg_recv_dstaddr(msg, skb); 210 ip_cmsg_recv_dstaddr(msg, skb);
185 211
212 flags &= ~IP_CMSG_ORIGDSTADDR;
213 if (!flags)
214 return;
215 }
216
217 if (flags & IP_CMSG_CHECKSUM)
218 ip_cmsg_recv_checksum(msg, skb, offset);
186} 219}
187EXPORT_SYMBOL(ip_cmsg_recv); 220EXPORT_SYMBOL(ip_cmsg_recv_offset);
188 221
189int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc, 222int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc,
190 bool allow_ipv6) 223 bool allow_ipv6)
@@ -450,7 +483,7 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
450 483
451 serr = SKB_EXT_ERR(skb); 484 serr = SKB_EXT_ERR(skb);
452 485
453 if (sin) { 486 if (sin && skb->len) {
454 sin->sin_family = AF_INET; 487 sin->sin_family = AF_INET;
455 sin->sin_addr.s_addr = *(__be32 *)(skb_network_header(skb) + 488 sin->sin_addr.s_addr = *(__be32 *)(skb_network_header(skb) +
456 serr->addr_offset); 489 serr->addr_offset);
@@ -461,17 +494,14 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
461 494
462 memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err)); 495 memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err));
463 sin = &errhdr.offender; 496 sin = &errhdr.offender;
464 sin->sin_family = AF_UNSPEC; 497 memset(sin, 0, sizeof(*sin));
465
466 if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP ||
467 ipv4_pktinfo_prepare_errqueue(sk, skb, serr->ee.ee_origin)) {
468 struct inet_sock *inet = inet_sk(sk);
469 498
499 if (skb->len &&
500 (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP ||
501 ipv4_pktinfo_prepare_errqueue(sk, skb, serr->ee.ee_origin))) {
470 sin->sin_family = AF_INET; 502 sin->sin_family = AF_INET;
471 sin->sin_addr.s_addr = ip_hdr(skb)->saddr; 503 sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
472 sin->sin_port = 0; 504 if (inet_sk(sk)->cmsg_flags)
473 memset(&sin->sin_zero, 0, sizeof(sin->sin_zero));
474 if (inet->cmsg_flags)
475 ip_cmsg_recv(msg, skb); 505 ip_cmsg_recv(msg, skb);
476 } 506 }
477 507
@@ -522,6 +552,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
522 case IP_MULTICAST_ALL: 552 case IP_MULTICAST_ALL:
523 case IP_MULTICAST_LOOP: 553 case IP_MULTICAST_LOOP:
524 case IP_RECVORIGDSTADDR: 554 case IP_RECVORIGDSTADDR:
555 case IP_CHECKSUM:
525 if (optlen >= sizeof(int)) { 556 if (optlen >= sizeof(int)) {
526 if (get_user(val, (int __user *) optval)) 557 if (get_user(val, (int __user *) optval))
527 return -EFAULT; 558 return -EFAULT;
@@ -619,6 +650,19 @@ static int do_ip_setsockopt(struct sock *sk, int level,
619 else 650 else
620 inet->cmsg_flags &= ~IP_CMSG_ORIGDSTADDR; 651 inet->cmsg_flags &= ~IP_CMSG_ORIGDSTADDR;
621 break; 652 break;
653 case IP_CHECKSUM:
654 if (val) {
655 if (!(inet->cmsg_flags & IP_CMSG_CHECKSUM)) {
656 inet_inc_convert_csum(sk);
657 inet->cmsg_flags |= IP_CMSG_CHECKSUM;
658 }
659 } else {
660 if (inet->cmsg_flags & IP_CMSG_CHECKSUM) {
661 inet_dec_convert_csum(sk);
662 inet->cmsg_flags &= ~IP_CMSG_CHECKSUM;
663 }
664 }
665 break;
622 case IP_TOS: /* This sets both TOS and Precedence */ 666 case IP_TOS: /* This sets both TOS and Precedence */
623 if (sk->sk_type == SOCK_STREAM) { 667 if (sk->sk_type == SOCK_STREAM) {
624 val &= ~INET_ECN_MASK; 668 val &= ~INET_ECN_MASK;
@@ -1222,6 +1266,9 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
1222 case IP_RECVORIGDSTADDR: 1266 case IP_RECVORIGDSTADDR:
1223 val = (inet->cmsg_flags & IP_CMSG_ORIGDSTADDR) != 0; 1267 val = (inet->cmsg_flags & IP_CMSG_ORIGDSTADDR) != 0;
1224 break; 1268 break;
1269 case IP_CHECKSUM:
1270 val = (inet->cmsg_flags & IP_CMSG_CHECKSUM) != 0;
1271 break;
1225 case IP_TOS: 1272 case IP_TOS:
1226 val = inet->tos; 1273 val = inet->tos;
1227 break; 1274 break;
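
ip_sockglue.c gains the IP_CHECKSUM socket option: when enabled, ip_cmsg_recv_checksum() reports the 32-bit partial checksum computed over the received payload (with any leading bytes subtracted out) as an IP_CHECKSUM control message. A userspace sketch of how the option would be consumed, assuming IP_CHECKSUM is exported by the installed uapi headers; the fallback define is an assumption for older headers:

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>

#ifndef IP_CHECKSUM
#define IP_CHECKSUM 23          /* assumed uapi value for this option */
#endif

static int enable_rx_checksum(int fd)
{
        int on = 1;

        return setsockopt(fd, IPPROTO_IP, IP_CHECKSUM, &on, sizeof(on));
}

static void print_rx_checksum(const struct msghdr *msg)
{
        struct cmsghdr *cm;

        for (cm = CMSG_FIRSTHDR(msg); cm; cm = CMSG_NXTHDR((struct msghdr *)msg, cm)) {
                if (cm->cmsg_level == IPPROTO_IP && cm->cmsg_type == IP_CHECKSUM) {
                        uint32_t csum;  /* the __wsum put_cmsg()'d by the kernel */

                        memcpy(&csum, CMSG_DATA(cm), sizeof(csum));
                        printf("rx partial checksum: 0x%08x\n", csum);
                }
        }
}
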
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index d3e447936720..2cd08280c77b 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -972,6 +972,14 @@ void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
972} 972}
973EXPORT_SYMBOL_GPL(ip_tunnel_dellink); 973EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
974 974
975struct net *ip_tunnel_get_link_net(const struct net_device *dev)
976{
977 struct ip_tunnel *tunnel = netdev_priv(dev);
978
979 return tunnel->net;
980}
981EXPORT_SYMBOL(ip_tunnel_get_link_net);
982
975int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id, 983int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
976 struct rtnl_link_ops *ops, char *devname) 984 struct rtnl_link_ops *ops, char *devname)
977{ 985{
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 1a7e979e80ba..94efe148181c 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -531,6 +531,7 @@ static struct rtnl_link_ops vti_link_ops __read_mostly = {
531 .dellink = ip_tunnel_dellink, 531 .dellink = ip_tunnel_dellink,
532 .get_size = vti_get_size, 532 .get_size = vti_get_size,
533 .fill_info = vti_fill_info, 533 .fill_info = vti_fill_info,
534 .get_link_net = ip_tunnel_get_link_net,
534}; 535};
535 536
536static int __init vti_init(void) 537static int __init vti_init(void)
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 7fa18bc7e47f..b26376ef87f6 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -209,9 +209,9 @@ static int __init ic_open_devs(void)
209 last = &ic_first_dev; 209 last = &ic_first_dev;
210 rtnl_lock(); 210 rtnl_lock();
211 211
212 /* bring loopback device up first */ 212 /* bring loopback and DSA master network devices up first */
213 for_each_netdev(&init_net, dev) { 213 for_each_netdev(&init_net, dev) {
214 if (!(dev->flags & IFF_LOOPBACK)) 214 if (!(dev->flags & IFF_LOOPBACK) && !netdev_uses_dsa(dev))
215 continue; 215 continue;
216 if (dev_change_flags(dev, dev->flags | IFF_UP) < 0) 216 if (dev_change_flags(dev, dev->flags | IFF_UP) < 0)
217 pr_err("IP-Config: Failed to open %s\n", dev->name); 217 pr_err("IP-Config: Failed to open %s\n", dev->name);
@@ -306,7 +306,7 @@ static void __init ic_close_devs(void)
306 while ((d = next)) { 306 while ((d = next)) {
307 next = d->next; 307 next = d->next;
308 dev = d->dev; 308 dev = d->dev;
309 if (dev != ic_dev) { 309 if (dev != ic_dev && !netdev_uses_dsa(dev)) {
310 DBG(("IP-Config: Downing %s\n", dev->name)); 310 DBG(("IP-Config: Downing %s\n", dev->name));
311 dev_change_flags(dev, d->flags); 311 dev_change_flags(dev, d->flags);
312 } 312 }
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 40403114f00a..915d215a7d14 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -366,12 +366,12 @@ static bool ipip_netlink_encap_parms(struct nlattr *data[],
366 366
367 if (data[IFLA_IPTUN_ENCAP_SPORT]) { 367 if (data[IFLA_IPTUN_ENCAP_SPORT]) {
368 ret = true; 368 ret = true;
369 ipencap->sport = nla_get_u16(data[IFLA_IPTUN_ENCAP_SPORT]); 369 ipencap->sport = nla_get_be16(data[IFLA_IPTUN_ENCAP_SPORT]);
370 } 370 }
371 371
372 if (data[IFLA_IPTUN_ENCAP_DPORT]) { 372 if (data[IFLA_IPTUN_ENCAP_DPORT]) {
373 ret = true; 373 ret = true;
374 ipencap->dport = nla_get_u16(data[IFLA_IPTUN_ENCAP_DPORT]); 374 ipencap->dport = nla_get_be16(data[IFLA_IPTUN_ENCAP_DPORT]);
375 } 375 }
376 376
377 return ret; 377 return ret;
@@ -460,10 +460,10 @@ static int ipip_fill_info(struct sk_buff *skb, const struct net_device *dev)
460 460
461 if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE, 461 if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE,
462 tunnel->encap.type) || 462 tunnel->encap.type) ||
463 nla_put_u16(skb, IFLA_IPTUN_ENCAP_SPORT, 463 nla_put_be16(skb, IFLA_IPTUN_ENCAP_SPORT,
464 tunnel->encap.sport) || 464 tunnel->encap.sport) ||
465 nla_put_u16(skb, IFLA_IPTUN_ENCAP_DPORT, 465 nla_put_be16(skb, IFLA_IPTUN_ENCAP_DPORT,
466 tunnel->encap.dport) || 466 tunnel->encap.dport) ||
467 nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS, 467 nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS,
468 tunnel->encap.flags)) 468 tunnel->encap.flags))
469 goto nla_put_failure; 469 goto nla_put_failure;
@@ -498,6 +498,7 @@ static struct rtnl_link_ops ipip_link_ops __read_mostly = {
498 .dellink = ip_tunnel_dellink, 498 .dellink = ip_tunnel_dellink,
499 .get_size = ipip_get_size, 499 .get_size = ipip_get_size,
500 .fill_info = ipip_fill_info, 500 .fill_info = ipip_fill_info,
501 .get_link_net = ip_tunnel_get_link_net,
501}; 502};
502 503
503static struct xfrm_tunnel ipip_handler __read_mostly = { 504static struct xfrm_tunnel ipip_handler __read_mostly = {
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index c8034587859d..9d78427652d2 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -2290,7 +2290,8 @@ static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
2290 if (err < 0 && err != -ENOENT) 2290 if (err < 0 && err != -ENOENT)
2291 goto nla_put_failure; 2291 goto nla_put_failure;
2292 2292
2293 return nlmsg_end(skb, nlh); 2293 nlmsg_end(skb, nlh);
2294 return 0;
2294 2295
2295nla_put_failure: 2296nla_put_failure:
2296 nlmsg_cancel(skb, nlh); 2297 nlmsg_cancel(skb, nlh);
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index c0d82f78d364..e9f66e1cda50 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -599,18 +599,18 @@ int ping_getfrag(void *from, char *to,
599 struct pingfakehdr *pfh = (struct pingfakehdr *)from; 599 struct pingfakehdr *pfh = (struct pingfakehdr *)from;
600 600
601 if (offset == 0) { 601 if (offset == 0) {
602 if (fraglen < sizeof(struct icmphdr)) 602 fraglen -= sizeof(struct icmphdr);
603 if (fraglen < 0)
603 BUG(); 604 BUG();
604 if (csum_partial_copy_fromiovecend(to + sizeof(struct icmphdr), 605 if (csum_and_copy_from_iter(to + sizeof(struct icmphdr),
605 pfh->iov, 0, fraglen - sizeof(struct icmphdr), 606 fraglen, &pfh->wcheck,
606 &pfh->wcheck)) 607 &pfh->msg->msg_iter) != fraglen)
607 return -EFAULT; 608 return -EFAULT;
608 } else if (offset < sizeof(struct icmphdr)) { 609 } else if (offset < sizeof(struct icmphdr)) {
609 BUG(); 610 BUG();
610 } else { 611 } else {
611 if (csum_partial_copy_fromiovecend 612 if (csum_and_copy_from_iter(to, fraglen, &pfh->wcheck,
612 (to, pfh->iov, offset - sizeof(struct icmphdr), 613 &pfh->msg->msg_iter) != fraglen)
613 fraglen, &pfh->wcheck))
614 return -EFAULT; 614 return -EFAULT;
615 } 615 }
616 616
@@ -811,8 +811,7 @@ back_from_confirm:
811 pfh.icmph.checksum = 0; 811 pfh.icmph.checksum = 0;
812 pfh.icmph.un.echo.id = inet->inet_sport; 812 pfh.icmph.un.echo.id = inet->inet_sport;
813 pfh.icmph.un.echo.sequence = user_icmph.un.echo.sequence; 813 pfh.icmph.un.echo.sequence = user_icmph.un.echo.sequence;
814 /* XXX: stripping const */ 814 pfh.msg = msg;
815 pfh.iov = (struct iovec *)msg->msg_iter.iov;
816 pfh.wcheck = 0; 815 pfh.wcheck = 0;
817 pfh.family = AF_INET; 816 pfh.family = AF_INET;
818 817
@@ -966,8 +965,11 @@ bool ping_rcv(struct sk_buff *skb)
966 965
967 sk = ping_lookup(net, skb, ntohs(icmph->un.echo.id)); 966 sk = ping_lookup(net, skb, ntohs(icmph->un.echo.id));
968 if (sk != NULL) { 967 if (sk != NULL) {
968 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
969
969 pr_debug("rcv on socket %p\n", sk); 970 pr_debug("rcv on socket %p\n", sk);
970 ping_queue_rcv_skb(sk, skb_get(skb)); 971 if (skb2)
972 ping_queue_rcv_skb(sk, skb2);
971 sock_put(sk); 973 sock_put(sk);
972 return true; 974 return true;
973 } 975 }
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 8f9cd200ce20..d8953ef0770c 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -292,6 +292,12 @@ static const struct snmp_mib snmp4_net_list[] = {
292 SNMP_MIB_ITEM("TCPHystartTrainCwnd", LINUX_MIB_TCPHYSTARTTRAINCWND), 292 SNMP_MIB_ITEM("TCPHystartTrainCwnd", LINUX_MIB_TCPHYSTARTTRAINCWND),
293 SNMP_MIB_ITEM("TCPHystartDelayDetect", LINUX_MIB_TCPHYSTARTDELAYDETECT), 293 SNMP_MIB_ITEM("TCPHystartDelayDetect", LINUX_MIB_TCPHYSTARTDELAYDETECT),
294 SNMP_MIB_ITEM("TCPHystartDelayCwnd", LINUX_MIB_TCPHYSTARTDELAYCWND), 294 SNMP_MIB_ITEM("TCPHystartDelayCwnd", LINUX_MIB_TCPHYSTARTDELAYCWND),
295 SNMP_MIB_ITEM("TCPACKSkippedSynRecv", LINUX_MIB_TCPACKSKIPPEDSYNRECV),
296 SNMP_MIB_ITEM("TCPACKSkippedPAWS", LINUX_MIB_TCPACKSKIPPEDPAWS),
297 SNMP_MIB_ITEM("TCPACKSkippedSeq", LINUX_MIB_TCPACKSKIPPEDSEQ),
298 SNMP_MIB_ITEM("TCPACKSkippedFinWait2", LINUX_MIB_TCPACKSKIPPEDFINWAIT2),
299 SNMP_MIB_ITEM("TCPACKSkippedTimeWait", LINUX_MIB_TCPACKSKIPPEDTIMEWAIT),
300 SNMP_MIB_ITEM("TCPACKSkippedChallenge", LINUX_MIB_TCPACKSKIPPEDCHALLENGE),
295 SNMP_MIB_SENTINEL 301 SNMP_MIB_SENTINEL
296}; 302};
297 303
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 0bb68df5055d..f027a708b7e0 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -337,7 +337,7 @@ int raw_rcv(struct sock *sk, struct sk_buff *skb)
337} 337}
338 338
339static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, 339static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
340 void *from, size_t length, 340 struct msghdr *msg, size_t length,
341 struct rtable **rtp, 341 struct rtable **rtp,
342 unsigned int flags) 342 unsigned int flags)
343{ 343{
@@ -382,7 +382,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
382 382
383 skb->transport_header = skb->network_header; 383 skb->transport_header = skb->network_header;
384 err = -EFAULT; 384 err = -EFAULT;
385 if (memcpy_fromiovecend((void *)iph, from, 0, length)) 385 if (memcpy_from_msg(iph, msg, length))
386 goto error_free; 386 goto error_free;
387 387
388 iphlen = iph->ihl * 4; 388 iphlen = iph->ihl * 4;
@@ -625,8 +625,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
625back_from_confirm: 625back_from_confirm:
626 626
627 if (inet->hdrincl) 627 if (inet->hdrincl)
628 /* XXX: stripping const */ 628 err = raw_send_hdrinc(sk, &fl4, msg, len,
629 err = raw_send_hdrinc(sk, &fl4, (struct iovec *)msg->msg_iter.iov, len,
630 &rt, msg->msg_flags); 629 &rt, msg->msg_flags);
631 630
632 else { 631 else {
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 6a2155b02602..ad5064362c5c 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -966,6 +966,9 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
966 if (dst->dev->mtu < mtu) 966 if (dst->dev->mtu < mtu)
967 return; 967 return;
968 968
969 if (rt->rt_pmtu && rt->rt_pmtu < mtu)
970 return;
971
969 if (mtu < ip_rt_min_pmtu) 972 if (mtu < ip_rt_min_pmtu)
970 mtu = ip_rt_min_pmtu; 973 mtu = ip_rt_min_pmtu;
971 974
@@ -1325,14 +1328,22 @@ static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1325 return ret; 1328 return ret;
1326} 1329}
1327 1330
1328static DEFINE_SPINLOCK(rt_uncached_lock); 1331struct uncached_list {
1329static LIST_HEAD(rt_uncached_list); 1332 spinlock_t lock;
1333 struct list_head head;
1334};
1335
1336static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1330 1337
1331static void rt_add_uncached_list(struct rtable *rt) 1338static void rt_add_uncached_list(struct rtable *rt)
1332{ 1339{
1333 spin_lock_bh(&rt_uncached_lock); 1340 struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1334 list_add_tail(&rt->rt_uncached, &rt_uncached_list); 1341
1335 spin_unlock_bh(&rt_uncached_lock); 1342 rt->rt_uncached_list = ul;
1343
1344 spin_lock_bh(&ul->lock);
1345 list_add_tail(&rt->rt_uncached, &ul->head);
1346 spin_unlock_bh(&ul->lock);
1336} 1347}
1337 1348
1338static void ipv4_dst_destroy(struct dst_entry *dst) 1349static void ipv4_dst_destroy(struct dst_entry *dst)
@@ -1340,27 +1351,32 @@ static void ipv4_dst_destroy(struct dst_entry *dst)
1340 struct rtable *rt = (struct rtable *) dst; 1351 struct rtable *rt = (struct rtable *) dst;
1341 1352
1342 if (!list_empty(&rt->rt_uncached)) { 1353 if (!list_empty(&rt->rt_uncached)) {
1343 spin_lock_bh(&rt_uncached_lock); 1354 struct uncached_list *ul = rt->rt_uncached_list;
1355
1356 spin_lock_bh(&ul->lock);
1344 list_del(&rt->rt_uncached); 1357 list_del(&rt->rt_uncached);
1345 spin_unlock_bh(&rt_uncached_lock); 1358 spin_unlock_bh(&ul->lock);
1346 } 1359 }
1347} 1360}
1348 1361
1349void rt_flush_dev(struct net_device *dev) 1362void rt_flush_dev(struct net_device *dev)
1350{ 1363{
1351 if (!list_empty(&rt_uncached_list)) { 1364 struct net *net = dev_net(dev);
1352 struct net *net = dev_net(dev); 1365 struct rtable *rt;
1353 struct rtable *rt; 1366 int cpu;
1367
1368 for_each_possible_cpu(cpu) {
1369 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1354 1370
1355 spin_lock_bh(&rt_uncached_lock); 1371 spin_lock_bh(&ul->lock);
1356 list_for_each_entry(rt, &rt_uncached_list, rt_uncached) { 1372 list_for_each_entry(rt, &ul->head, rt_uncached) {
1357 if (rt->dst.dev != dev) 1373 if (rt->dst.dev != dev)
1358 continue; 1374 continue;
1359 rt->dst.dev = net->loopback_dev; 1375 rt->dst.dev = net->loopback_dev;
1360 dev_hold(rt->dst.dev); 1376 dev_hold(rt->dst.dev);
1361 dev_put(dev); 1377 dev_put(dev);
1362 } 1378 }
1363 spin_unlock_bh(&rt_uncached_lock); 1379 spin_unlock_bh(&ul->lock);
1364 } 1380 }
1365} 1381}
1366 1382
@@ -1554,11 +1570,10 @@ static int __mkroute_input(struct sk_buff *skb,
1554 1570
1555 do_cache = res->fi && !itag; 1571 do_cache = res->fi && !itag;
1556 if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) && 1572 if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1573 skb->protocol == htons(ETH_P_IP) &&
1557 (IN_DEV_SHARED_MEDIA(out_dev) || 1574 (IN_DEV_SHARED_MEDIA(out_dev) ||
1558 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) { 1575 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1559 flags |= RTCF_DOREDIRECT; 1576 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1560 do_cache = false;
1561 }
1562 1577
1563 if (skb->protocol != htons(ETH_P_IP)) { 1578 if (skb->protocol != htons(ETH_P_IP)) {
1564 /* Not IP (i.e. ARP). Do not create route, if it is 1579 /* Not IP (i.e. ARP). Do not create route, if it is
@@ -2303,6 +2318,8 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2303 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED; 2318 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2304 if (rt->rt_flags & RTCF_NOTIFY) 2319 if (rt->rt_flags & RTCF_NOTIFY)
2305 r->rtm_flags |= RTM_F_NOTIFY; 2320 r->rtm_flags |= RTM_F_NOTIFY;
2321 if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2322 r->rtm_flags |= RTCF_DOREDIRECT;
2306 2323
2307 if (nla_put_be32(skb, RTA_DST, dst)) 2324 if (nla_put_be32(skb, RTA_DST, dst))
2308 goto nla_put_failure; 2325 goto nla_put_failure;
@@ -2377,7 +2394,8 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2377 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0) 2394 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2378 goto nla_put_failure; 2395 goto nla_put_failure;
2379 2396
2380 return nlmsg_end(skb, nlh); 2397 nlmsg_end(skb, nlh);
2398 return 0;
2381 2399
2382nla_put_failure: 2400nla_put_failure:
2383 nlmsg_cancel(skb, nlh); 2401 nlmsg_cancel(skb, nlh);
@@ -2469,7 +2487,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2469 err = rt_fill_info(net, dst, src, &fl4, skb, 2487 err = rt_fill_info(net, dst, src, &fl4, skb,
2470 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 2488 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2471 RTM_NEWROUTE, 0, 0); 2489 RTM_NEWROUTE, 0, 0);
2472 if (err <= 0) 2490 if (err < 0)
2473 goto errout_free; 2491 goto errout_free;
2474 2492
2475 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); 2493 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
@@ -2717,6 +2735,7 @@ struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
2717int __init ip_rt_init(void) 2735int __init ip_rt_init(void)
2718{ 2736{
2719 int rc = 0; 2737 int rc = 0;
2738 int cpu;
2720 2739
2721 ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL); 2740 ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
2722 if (!ip_idents) 2741 if (!ip_idents)
@@ -2724,6 +2743,12 @@ int __init ip_rt_init(void)
2724 2743
2725 prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents)); 2744 prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
2726 2745
2746 for_each_possible_cpu(cpu) {
2747 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
2748
2749 INIT_LIST_HEAD(&ul->head);
2750 spin_lock_init(&ul->lock);
2751 }
2727#ifdef CONFIG_IP_ROUTE_CLASSID 2752#ifdef CONFIG_IP_ROUTE_CLASSID
2728 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct)); 2753 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2729 if (!ip_rt_acct) 2754 if (!ip_rt_acct)
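
The rt_uncached_list rework above replaces the single global list and spinlock with one {lock, head} pair per CPU: additions go to the local CPU's list via raw_cpu_ptr(), each rtable remembers which list it joined (rt->rt_uncached_list) so ipv4_dst_destroy() can unlink it from the right list even when the dst is freed on a different CPU, and rt_flush_dev() simply walks every CPU's list. Condensed from the hunks above, the add/remove lifecycle is:

        /* add: always onto the local CPU's list */
        struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);

        rt->rt_uncached_list = ul;
        spin_lock_bh(&ul->lock);
        list_add_tail(&rt->rt_uncached, &ul->head);
        spin_unlock_bh(&ul->lock);

        /* remove: possibly on another CPU, so use the recorded list */
        ul = rt->rt_uncached_list;
        spin_lock_bh(&ul->lock);
        list_del(&rt->rt_uncached);
        spin_unlock_bh(&ul->lock);
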
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index e0ee384a448f..d151539da8e6 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -604,20 +604,6 @@ static struct ctl_table ipv4_table[] = {
604 .proc_handler = proc_tcp_congestion_control, 604 .proc_handler = proc_tcp_congestion_control,
605 }, 605 },
606 { 606 {
607 .procname = "tcp_mtu_probing",
608 .data = &sysctl_tcp_mtu_probing,
609 .maxlen = sizeof(int),
610 .mode = 0644,
611 .proc_handler = proc_dointvec,
612 },
613 {
614 .procname = "tcp_base_mss",
615 .data = &sysctl_tcp_base_mss,
616 .maxlen = sizeof(int),
617 .mode = 0644,
618 .proc_handler = proc_dointvec,
619 },
620 {
621 .procname = "tcp_workaround_signed_windows", 607 .procname = "tcp_workaround_signed_windows",
622 .data = &sysctl_tcp_workaround_signed_windows, 608 .data = &sysctl_tcp_workaround_signed_windows,
623 .maxlen = sizeof(int), 609 .maxlen = sizeof(int),
@@ -729,6 +715,13 @@ static struct ctl_table ipv4_table[] = {
729 .extra2 = &one, 715 .extra2 = &one,
730 }, 716 },
731 { 717 {
718 .procname = "tcp_invalid_ratelimit",
719 .data = &sysctl_tcp_invalid_ratelimit,
720 .maxlen = sizeof(int),
721 .mode = 0644,
722 .proc_handler = proc_dointvec_ms_jiffies,
723 },
724 {
732 .procname = "icmp_msgs_per_sec", 725 .procname = "icmp_msgs_per_sec",
733 .data = &sysctl_icmp_msgs_per_sec, 726 .data = &sysctl_icmp_msgs_per_sec,
734 .maxlen = sizeof(int), 727 .maxlen = sizeof(int),
@@ -876,6 +869,20 @@ static struct ctl_table ipv4_net_table[] = {
876 .mode = 0644, 869 .mode = 0644,
877 .proc_handler = proc_dointvec, 870 .proc_handler = proc_dointvec,
878 }, 871 },
872 {
873 .procname = "tcp_mtu_probing",
874 .data = &init_net.ipv4.sysctl_tcp_mtu_probing,
875 .maxlen = sizeof(int),
876 .mode = 0644,
877 .proc_handler = proc_dointvec,
878 },
879 {
880 .procname = "tcp_base_mss",
881 .data = &init_net.ipv4.sysctl_tcp_base_mss,
882 .maxlen = sizeof(int),
883 .mode = 0644,
884 .proc_handler = proc_dointvec,
885 },
879 { } 886 { }
880}; 887};
881 888
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 3075723c729b..9d72a0fcd928 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1067,11 +1067,10 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
1067int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, 1067int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1068 size_t size) 1068 size_t size)
1069{ 1069{
1070 const struct iovec *iov;
1071 struct tcp_sock *tp = tcp_sk(sk); 1070 struct tcp_sock *tp = tcp_sk(sk);
1072 struct sk_buff *skb; 1071 struct sk_buff *skb;
1073 int iovlen, flags, err, copied = 0; 1072 int flags, err, copied = 0;
1074 int mss_now = 0, size_goal, copied_syn = 0, offset = 0; 1073 int mss_now = 0, size_goal, copied_syn = 0;
1075 bool sg; 1074 bool sg;
1076 long timeo; 1075 long timeo;
1077 1076
@@ -1084,7 +1083,6 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1084 goto out; 1083 goto out;
1085 else if (err) 1084 else if (err)
1086 goto out_err; 1085 goto out_err;
1087 offset = copied_syn;
1088 } 1086 }
1089 1087
1090 timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); 1088 timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
@@ -1118,8 +1116,6 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1118 mss_now = tcp_send_mss(sk, &size_goal, flags); 1116 mss_now = tcp_send_mss(sk, &size_goal, flags);
1119 1117
1120 /* Ok commence sending. */ 1118 /* Ok commence sending. */
1121 iovlen = msg->msg_iter.nr_segs;
1122 iov = msg->msg_iter.iov;
1123 copied = 0; 1119 copied = 0;
1124 1120
1125 err = -EPIPE; 1121 err = -EPIPE;
@@ -1128,151 +1124,134 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1128 1124
1129 sg = !!(sk->sk_route_caps & NETIF_F_SG); 1125 sg = !!(sk->sk_route_caps & NETIF_F_SG);
1130 1126
1131 while (--iovlen >= 0) { 1127 while (iov_iter_count(&msg->msg_iter)) {
1132 size_t seglen = iov->iov_len; 1128 int copy = 0;
1133 unsigned char __user *from = iov->iov_base; 1129 int max = size_goal;
1134 1130
1135 iov++; 1131 skb = tcp_write_queue_tail(sk);
1136 if (unlikely(offset > 0)) { /* Skip bytes copied in SYN */ 1132 if (tcp_send_head(sk)) {
1137 if (offset >= seglen) { 1133 if (skb->ip_summed == CHECKSUM_NONE)
1138 offset -= seglen; 1134 max = mss_now;
1139 continue; 1135 copy = max - skb->len;
1140 }
1141 seglen -= offset;
1142 from += offset;
1143 offset = 0;
1144 } 1136 }
1145 1137
1146 while (seglen > 0) { 1138 if (copy <= 0) {
1147 int copy = 0;
1148 int max = size_goal;
1149
1150 skb = tcp_write_queue_tail(sk);
1151 if (tcp_send_head(sk)) {
1152 if (skb->ip_summed == CHECKSUM_NONE)
1153 max = mss_now;
1154 copy = max - skb->len;
1155 }
1156
1157 if (copy <= 0) {
1158new_segment: 1139new_segment:
1159 /* Allocate new segment. If the interface is SG, 1140 /* Allocate new segment. If the interface is SG,
1160 * allocate skb fitting to single page. 1141 * allocate skb fitting to single page.
1161 */ 1142 */
1162 if (!sk_stream_memory_free(sk)) 1143 if (!sk_stream_memory_free(sk))
1163 goto wait_for_sndbuf; 1144 goto wait_for_sndbuf;
1164 1145
1165 skb = sk_stream_alloc_skb(sk, 1146 skb = sk_stream_alloc_skb(sk,
1166 select_size(sk, sg), 1147 select_size(sk, sg),
1167 sk->sk_allocation); 1148 sk->sk_allocation);
1168 if (!skb) 1149 if (!skb)
1169 goto wait_for_memory; 1150 goto wait_for_memory;
1170 1151
1171 /* 1152 /*
1172 * Check whether we can use HW checksum. 1153 * Check whether we can use HW checksum.
1173 */ 1154 */
1174 if (sk->sk_route_caps & NETIF_F_ALL_CSUM) 1155 if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
1175 skb->ip_summed = CHECKSUM_PARTIAL; 1156 skb->ip_summed = CHECKSUM_PARTIAL;
1176 1157
1177 skb_entail(sk, skb); 1158 skb_entail(sk, skb);
1178 copy = size_goal; 1159 copy = size_goal;
1179 max = size_goal; 1160 max = size_goal;
1180 1161
1181 /* All packets are restored as if they have 1162 /* All packets are restored as if they have
1182 * already been sent. skb_mstamp isn't set to 1163 * already been sent. skb_mstamp isn't set to
1183 * avoid wrong rtt estimation. 1164 * avoid wrong rtt estimation.
1184 */ 1165 */
1185 if (tp->repair) 1166 if (tp->repair)
1186 TCP_SKB_CB(skb)->sacked |= TCPCB_REPAIRED; 1167 TCP_SKB_CB(skb)->sacked |= TCPCB_REPAIRED;
1187 } 1168 }
1188 1169
1189 /* Try to append data to the end of skb. */ 1170 /* Try to append data to the end of skb. */
1190 if (copy > seglen) 1171 if (copy > iov_iter_count(&msg->msg_iter))
1191 copy = seglen; 1172 copy = iov_iter_count(&msg->msg_iter);
1192 1173
1193 /* Where to copy to? */ 1174 /* Where to copy to? */
1194 if (skb_availroom(skb) > 0) { 1175 if (skb_availroom(skb) > 0) {
1195 /* We have some space in skb head. Superb! */ 1176 /* We have some space in skb head. Superb! */
1196 copy = min_t(int, copy, skb_availroom(skb)); 1177 copy = min_t(int, copy, skb_availroom(skb));
1197 err = skb_add_data_nocache(sk, skb, from, copy); 1178 err = skb_add_data_nocache(sk, skb, &msg->msg_iter, copy);
1198 if (err) 1179 if (err)
1199 goto do_fault; 1180 goto do_fault;
1200 } else { 1181 } else {
1201 bool merge = true; 1182 bool merge = true;
1202 int i = skb_shinfo(skb)->nr_frags; 1183 int i = skb_shinfo(skb)->nr_frags;
1203 struct page_frag *pfrag = sk_page_frag(sk); 1184 struct page_frag *pfrag = sk_page_frag(sk);
1204 1185
1205 if (!sk_page_frag_refill(sk, pfrag)) 1186 if (!sk_page_frag_refill(sk, pfrag))
1206 goto wait_for_memory; 1187 goto wait_for_memory;
1207
1208 if (!skb_can_coalesce(skb, i, pfrag->page,
1209 pfrag->offset)) {
1210 if (i == MAX_SKB_FRAGS || !sg) {
1211 tcp_mark_push(tp, skb);
1212 goto new_segment;
1213 }
1214 merge = false;
1215 }
1216 1188
1217 copy = min_t(int, copy, pfrag->size - pfrag->offset); 1189 if (!skb_can_coalesce(skb, i, pfrag->page,
1218 1190 pfrag->offset)) {
1219 if (!sk_wmem_schedule(sk, copy)) 1191 if (i == MAX_SKB_FRAGS || !sg) {
1220 goto wait_for_memory; 1192 tcp_mark_push(tp, skb);
1221 1193 goto new_segment;
1222 err = skb_copy_to_page_nocache(sk, from, skb,
1223 pfrag->page,
1224 pfrag->offset,
1225 copy);
1226 if (err)
1227 goto do_error;
1228
1229 /* Update the skb. */
1230 if (merge) {
1231 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1232 } else {
1233 skb_fill_page_desc(skb, i, pfrag->page,
1234 pfrag->offset, copy);
1235 get_page(pfrag->page);
1236 } 1194 }
1237 pfrag->offset += copy; 1195 merge = false;
1238 } 1196 }
1239 1197
1240 if (!copied) 1198 copy = min_t(int, copy, pfrag->size - pfrag->offset);
1241 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
1242 1199
1243 tp->write_seq += copy; 1200 if (!sk_wmem_schedule(sk, copy))
1244 TCP_SKB_CB(skb)->end_seq += copy; 1201 goto wait_for_memory;
1245 tcp_skb_pcount_set(skb, 0);
1246 1202
1247 from += copy; 1203 err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb,
1248 copied += copy; 1204 pfrag->page,
1249 if ((seglen -= copy) == 0 && iovlen == 0) { 1205 pfrag->offset,
1250 tcp_tx_timestamp(sk, skb); 1206 copy);
1251 goto out; 1207 if (err)
1208 goto do_error;
1209
1210 /* Update the skb. */
1211 if (merge) {
1212 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1213 } else {
1214 skb_fill_page_desc(skb, i, pfrag->page,
1215 pfrag->offset, copy);
1216 get_page(pfrag->page);
1252 } 1217 }
1218 pfrag->offset += copy;
1219 }
1253 1220
1254 if (skb->len < max || (flags & MSG_OOB) || unlikely(tp->repair)) 1221 if (!copied)
1255 continue; 1222 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
1223
1224 tp->write_seq += copy;
1225 TCP_SKB_CB(skb)->end_seq += copy;
1226 tcp_skb_pcount_set(skb, 0);
1227
1228 copied += copy;
1229 if (!iov_iter_count(&msg->msg_iter)) {
1230 tcp_tx_timestamp(sk, skb);
1231 goto out;
1232 }
1256 1233
1257 if (forced_push(tp)) { 1234 if (skb->len < max || (flags & MSG_OOB) || unlikely(tp->repair))
1258 tcp_mark_push(tp, skb);
1259 __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
1260 } else if (skb == tcp_send_head(sk))
1261 tcp_push_one(sk, mss_now);
1262 continue; 1235 continue;
1263 1236
1237 if (forced_push(tp)) {
1238 tcp_mark_push(tp, skb);
1239 __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
1240 } else if (skb == tcp_send_head(sk))
1241 tcp_push_one(sk, mss_now);
1242 continue;
1243
1264wait_for_sndbuf: 1244wait_for_sndbuf:
1265 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 1245 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1266wait_for_memory: 1246wait_for_memory:
1267 if (copied) 1247 if (copied)
1268 tcp_push(sk, flags & ~MSG_MORE, mss_now, 1248 tcp_push(sk, flags & ~MSG_MORE, mss_now,
1269 TCP_NAGLE_PUSH, size_goal); 1249 TCP_NAGLE_PUSH, size_goal);
1270 1250
1271 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) 1251 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
1272 goto do_error; 1252 goto do_error;
1273 1253
1274 mss_now = tcp_send_mss(sk, &size_goal, flags); 1254 mss_now = tcp_send_mss(sk, &size_goal, flags);
1275 }
1276 } 1255 }
1277 1256
1278out: 1257out:
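
The tcp_sendmsg() hunk above replaces the hand-rolled iovec walk (from, seglen, iovlen) with msg->msg_iter: data is copied via skb_add_data_nocache() / skb_copy_to_page_nocache() taking the iterator directly, and the bytes still to send are queried with iov_iter_count(), so the explicit pointer/offset bookkeeping disappears. A rough userspace analogue of that idea follows; it is an illustration only, with demo-only names (struct iter, copy_from_iter_demo), not the kernel iov_iter API.

/* Userspace analogue (demo only): a self-advancing cursor over an
 * iovec array. Each copy consumes data and updates the remaining
 * byte count, mirroring how msg->msg_iter is used in the hunk above.
 */
#include <stdio.h>
#include <string.h>
#include <sys/uio.h>

struct iter {
	const struct iovec *iov;	/* segments to drain */
	size_t seg;			/* current segment */
	size_t off;			/* offset inside current segment */
	size_t count;			/* total bytes left */
};

static size_t iter_count(const struct iter *it)
{
	return it->count;		/* analogous to iov_iter_count() */
}

/* Copy up to 'len' bytes out of the iterator, advancing it. */
static size_t copy_from_iter_demo(void *dst, size_t len, struct iter *it)
{
	size_t done = 0;

	while (done < len && it->count) {
		const struct iovec *v = &it->iov[it->seg];
		size_t avail = v->iov_len - it->off;
		size_t chunk = avail < len - done ? avail : len - done;

		memcpy((char *)dst + done, (char *)v->iov_base + it->off, chunk);
		done += chunk;
		it->off += chunk;
		it->count -= chunk;
		if (it->off == v->iov_len) {	/* segment exhausted */
			it->seg++;
			it->off = 0;
		}
	}
	return done;
}

int main(void)
{
	char a[] = "Hello, ", b[] = "iov_iter";
	struct iovec iov[2] = {
		{ .iov_base = a, .iov_len = strlen(a) },
		{ .iov_base = b, .iov_len = strlen(b) },
	};
	struct iter it = { .iov = iov, .count = strlen(a) + strlen(b) };
	char buf[8];

	while (iter_count(&it)) {	/* mirrors the sendmsg copy loop */
		size_t n = copy_from_iter_demo(buf, sizeof(buf), &it);
		printf("copied %zu bytes, %zu left\n", n, iter_count(&it));
	}
	return 0;
}

The caller never touches a segment pointer or per-segment length; that is exactly the bookkeeping the removed from/seglen/iovlen lines used to do by hand.
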
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index bb395d46a389..c037644eafb7 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -150,7 +150,7 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
150 tcp_slow_start(tp, acked); 150 tcp_slow_start(tp, acked);
151 else { 151 else {
152 bictcp_update(ca, tp->snd_cwnd); 152 bictcp_update(ca, tp->snd_cwnd);
153 tcp_cong_avoid_ai(tp, ca->cnt); 153 tcp_cong_avoid_ai(tp, ca->cnt, 1);
154 } 154 }
155} 155}
156 156
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 27ead0dd16bc..d694088214cd 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -13,6 +13,7 @@
13#include <linux/types.h> 13#include <linux/types.h>
14#include <linux/list.h> 14#include <linux/list.h>
15#include <linux/gfp.h> 15#include <linux/gfp.h>
16#include <linux/jhash.h>
16#include <net/tcp.h> 17#include <net/tcp.h>
17 18
18static DEFINE_SPINLOCK(tcp_cong_list_lock); 19static DEFINE_SPINLOCK(tcp_cong_list_lock);
@@ -31,6 +32,34 @@ static struct tcp_congestion_ops *tcp_ca_find(const char *name)
31 return NULL; 32 return NULL;
32} 33}
33 34
35/* Must be called with rcu lock held */
36static const struct tcp_congestion_ops *__tcp_ca_find_autoload(const char *name)
37{
38 const struct tcp_congestion_ops *ca = tcp_ca_find(name);
39#ifdef CONFIG_MODULES
40 if (!ca && capable(CAP_NET_ADMIN)) {
41 rcu_read_unlock();
42 request_module("tcp_%s", name);
43 rcu_read_lock();
44 ca = tcp_ca_find(name);
45 }
46#endif
47 return ca;
48}
49
50/* Simple linear search, not much in here. */
51struct tcp_congestion_ops *tcp_ca_find_key(u32 key)
52{
53 struct tcp_congestion_ops *e;
54
55 list_for_each_entry_rcu(e, &tcp_cong_list, list) {
56 if (e->key == key)
57 return e;
58 }
59
60 return NULL;
61}
62
34/* 63/*
35 * Attach new congestion control algorithm to the list 64 * Attach new congestion control algorithm to the list
36 * of available options. 65 * of available options.
@@ -45,9 +74,12 @@ int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
45 return -EINVAL; 74 return -EINVAL;
46 } 75 }
47 76
77 ca->key = jhash(ca->name, sizeof(ca->name), strlen(ca->name));
78
48 spin_lock(&tcp_cong_list_lock); 79 spin_lock(&tcp_cong_list_lock);
49 if (tcp_ca_find(ca->name)) { 80 if (ca->key == TCP_CA_UNSPEC || tcp_ca_find_key(ca->key)) {
50 pr_notice("%s already registered\n", ca->name); 81 pr_notice("%s already registered or non-unique key\n",
82 ca->name);
51 ret = -EEXIST; 83 ret = -EEXIST;
52 } else { 84 } else {
53 list_add_tail_rcu(&ca->list, &tcp_cong_list); 85 list_add_tail_rcu(&ca->list, &tcp_cong_list);
@@ -70,9 +102,50 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
70 spin_lock(&tcp_cong_list_lock); 102 spin_lock(&tcp_cong_list_lock);
71 list_del_rcu(&ca->list); 103 list_del_rcu(&ca->list);
72 spin_unlock(&tcp_cong_list_lock); 104 spin_unlock(&tcp_cong_list_lock);
105
106 /* Wait for outstanding readers to complete before the
107 * module gets removed entirely.
108 *
109 * A try_module_get() should fail by now as our module is
110 * in "going" state since no refs are held anymore and
111 * module_exit() handler being called.
112 */
113 synchronize_rcu();
73} 114}
74EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control); 115EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);
75 116
117u32 tcp_ca_get_key_by_name(const char *name)
118{
119 const struct tcp_congestion_ops *ca;
120 u32 key;
121
122 might_sleep();
123
124 rcu_read_lock();
125 ca = __tcp_ca_find_autoload(name);
126 key = ca ? ca->key : TCP_CA_UNSPEC;
127 rcu_read_unlock();
128
129 return key;
130}
131EXPORT_SYMBOL_GPL(tcp_ca_get_key_by_name);
132
133char *tcp_ca_get_name_by_key(u32 key, char *buffer)
134{
135 const struct tcp_congestion_ops *ca;
136 char *ret = NULL;
137
138 rcu_read_lock();
139 ca = tcp_ca_find_key(key);
140 if (ca)
141 ret = strncpy(buffer, ca->name,
142 TCP_CA_NAME_MAX);
143 rcu_read_unlock();
144
145 return ret;
146}
147EXPORT_SYMBOL_GPL(tcp_ca_get_name_by_key);
148
76/* Assign choice of congestion control. */ 149/* Assign choice of congestion control. */
77void tcp_assign_congestion_control(struct sock *sk) 150void tcp_assign_congestion_control(struct sock *sk)
78{ 151{
@@ -107,6 +180,18 @@ void tcp_init_congestion_control(struct sock *sk)
107 icsk->icsk_ca_ops->init(sk); 180 icsk->icsk_ca_ops->init(sk);
108} 181}
109 182
183static void tcp_reinit_congestion_control(struct sock *sk,
184 const struct tcp_congestion_ops *ca)
185{
186 struct inet_connection_sock *icsk = inet_csk(sk);
187
188 tcp_cleanup_congestion_control(sk);
189 icsk->icsk_ca_ops = ca;
190
191 if (sk->sk_state != TCP_CLOSE && icsk->icsk_ca_ops->init)
192 icsk->icsk_ca_ops->init(sk);
193}
194
110/* Manage refcounts on socket close. */ 195/* Manage refcounts on socket close. */
111void tcp_cleanup_congestion_control(struct sock *sk) 196void tcp_cleanup_congestion_control(struct sock *sk)
112{ 197{
@@ -241,42 +326,26 @@ out:
241int tcp_set_congestion_control(struct sock *sk, const char *name) 326int tcp_set_congestion_control(struct sock *sk, const char *name)
242{ 327{
243 struct inet_connection_sock *icsk = inet_csk(sk); 328 struct inet_connection_sock *icsk = inet_csk(sk);
244 struct tcp_congestion_ops *ca; 329 const struct tcp_congestion_ops *ca;
245 int err = 0; 330 int err = 0;
246 331
247 rcu_read_lock(); 332 if (icsk->icsk_ca_dst_locked)
248 ca = tcp_ca_find(name); 333 return -EPERM;
249 334
250 /* no change asking for existing value */ 335 rcu_read_lock();
336 ca = __tcp_ca_find_autoload(name);
337 /* No change asking for existing value */
251 if (ca == icsk->icsk_ca_ops) 338 if (ca == icsk->icsk_ca_ops)
252 goto out; 339 goto out;
253
254#ifdef CONFIG_MODULES
255 /* not found attempt to autoload module */
256 if (!ca && capable(CAP_NET_ADMIN)) {
257 rcu_read_unlock();
258 request_module("tcp_%s", name);
259 rcu_read_lock();
260 ca = tcp_ca_find(name);
261 }
262#endif
263 if (!ca) 340 if (!ca)
264 err = -ENOENT; 341 err = -ENOENT;
265
266 else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || 342 else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) ||
267 ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))) 343 ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)))
268 err = -EPERM; 344 err = -EPERM;
269
270 else if (!try_module_get(ca->owner)) 345 else if (!try_module_get(ca->owner))
271 err = -EBUSY; 346 err = -EBUSY;
272 347 else
273 else { 348 tcp_reinit_congestion_control(sk, ca);
274 tcp_cleanup_congestion_control(sk);
275 icsk->icsk_ca_ops = ca;
276
277 if (sk->sk_state != TCP_CLOSE && icsk->icsk_ca_ops->init)
278 icsk->icsk_ca_ops->init(sk);
279 }
280 out: 349 out:
281 rcu_read_unlock(); 350 rcu_read_unlock();
282 return err; 351 return err;
@@ -291,26 +360,32 @@ int tcp_set_congestion_control(struct sock *sk, const char *name)
291 * ABC caps N to 2. Slow start exits when cwnd grows over ssthresh and 360 * ABC caps N to 2. Slow start exits when cwnd grows over ssthresh and
292 * returns the leftover acks to adjust cwnd in congestion avoidance mode. 361 * returns the leftover acks to adjust cwnd in congestion avoidance mode.
293 */ 362 */
294void tcp_slow_start(struct tcp_sock *tp, u32 acked) 363u32 tcp_slow_start(struct tcp_sock *tp, u32 acked)
295{ 364{
296 u32 cwnd = tp->snd_cwnd + acked; 365 u32 cwnd = tp->snd_cwnd + acked;
297 366
298 if (cwnd > tp->snd_ssthresh) 367 if (cwnd > tp->snd_ssthresh)
299 cwnd = tp->snd_ssthresh + 1; 368 cwnd = tp->snd_ssthresh + 1;
369 acked -= cwnd - tp->snd_cwnd;
300 tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp); 370 tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp);
371
372 return acked;
301} 373}
302EXPORT_SYMBOL_GPL(tcp_slow_start); 374EXPORT_SYMBOL_GPL(tcp_slow_start);
303 375
304/* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternative w) */ 376/* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternative w),
305void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w) 377 * for every packet that was ACKed.
378 */
379void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked)
306{ 380{
381 tp->snd_cwnd_cnt += acked;
307 if (tp->snd_cwnd_cnt >= w) { 382 if (tp->snd_cwnd_cnt >= w) {
308 if (tp->snd_cwnd < tp->snd_cwnd_clamp) 383 u32 delta = tp->snd_cwnd_cnt / w;
309 tp->snd_cwnd++; 384
310 tp->snd_cwnd_cnt = 0; 385 tp->snd_cwnd_cnt -= delta * w;
311 } else { 386 tp->snd_cwnd += delta;
312 tp->snd_cwnd_cnt++;
313 } 387 }
388 tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_cwnd_clamp);
314} 389}
315EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai); 390EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai);
316 391
@@ -329,11 +404,13 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
329 return; 404 return;
330 405
331 /* In "safe" area, increase. */ 406 /* In "safe" area, increase. */
332 if (tp->snd_cwnd <= tp->snd_ssthresh) 407 if (tp->snd_cwnd <= tp->snd_ssthresh) {
333 tcp_slow_start(tp, acked); 408 acked = tcp_slow_start(tp, acked);
409 if (!acked)
410 return;
411 }
334 /* In dangerous area, increase slowly. */ 412 /* In dangerous area, increase slowly. */
335 else 413 tcp_cong_avoid_ai(tp, tp->snd_cwnd, acked);
336 tcp_cong_avoid_ai(tp, tp->snd_cwnd);
337} 414}
338EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid); 415EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid);
339 416
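
The tcp_cong.c hunks above rework the congestion-avoidance helpers to operate on batches of newly ACKed segments: tcp_slow_start() now returns the segments left over once cwnd reaches ssthresh, and tcp_cong_avoid_ai() accumulates them in snd_cwnd_cnt, advancing cwnd by delta = snd_cwnd_cnt / w, which can be more than one step per call. Below is a standalone model of that arithmetic lifted from the diff; it is an illustration only, using a plain struct in place of struct tcp_sock.

/* Standalone model (demo only) of the reworked slow start and
 * additive-increase arithmetic from the hunks above.
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

struct tp {
	uint32_t snd_cwnd;
	uint32_t snd_cwnd_cnt;
	uint32_t snd_ssthresh;
	uint32_t snd_cwnd_clamp;
};

/* Returns the ACKed segments left over after exponential growth. */
static uint32_t slow_start(struct tp *tp, uint32_t acked)
{
	uint32_t cwnd = tp->snd_cwnd + acked;

	if (cwnd > tp->snd_ssthresh)
		cwnd = tp->snd_ssthresh + 1;
	acked -= cwnd - tp->snd_cwnd;		/* leftover segments */
	tp->snd_cwnd = cwnd < tp->snd_cwnd_clamp ? cwnd : tp->snd_cwnd_clamp;
	return acked;
}

/* Additive increase: one cwnd step per w accumulated segments,
 * possibly several steps at once for a large 'acked'.
 */
static void cong_avoid_ai(struct tp *tp, uint32_t w, uint32_t acked)
{
	tp->snd_cwnd_cnt += acked;
	if (tp->snd_cwnd_cnt >= w) {
		uint32_t delta = tp->snd_cwnd_cnt / w;

		tp->snd_cwnd_cnt -= delta * w;
		tp->snd_cwnd += delta;
	}
	if (tp->snd_cwnd > tp->snd_cwnd_clamp)
		tp->snd_cwnd = tp->snd_cwnd_clamp;
}

/* Reno-style cong_avoid as rewritten above: slow-start leftovers are
 * fed into additive increase instead of being thrown away.
 */
static void reno_cong_avoid(struct tp *tp, uint32_t acked)
{
	if (tp->snd_cwnd <= tp->snd_ssthresh) {
		acked = slow_start(tp, acked);
		if (!acked)
			return;
	}
	cong_avoid_ai(tp, tp->snd_cwnd, acked);
}

int main(void)
{
	/* One stretch ACK covering 12 segments while crossing ssthresh. */
	struct tp tp = { .snd_cwnd = 8, .snd_cwnd_cnt = 0,
			 .snd_ssthresh = 10, .snd_cwnd_clamp = UINT32_MAX };

	reno_cong_avoid(&tp, 12);
	printf("cwnd=%" PRIu32 " cnt=%" PRIu32 "\n",
	       tp.snd_cwnd, tp.snd_cwnd_cnt);
	return 0;
}

With these values the single ACK takes cwnd from 8 to ssthresh + 1 = 11 and banks the remaining 9 segments in snd_cwnd_cnt, which is what plumbing 'acked' through buys for GRO-style stretch ACKs.
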
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index 6b6002416a73..4b276d1ed980 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -93,9 +93,7 @@ struct bictcp {
93 u32 epoch_start; /* beginning of an epoch */ 93 u32 epoch_start; /* beginning of an epoch */
94 u32 ack_cnt; /* number of acks */ 94 u32 ack_cnt; /* number of acks */
95 u32 tcp_cwnd; /* estimated tcp cwnd */ 95 u32 tcp_cwnd; /* estimated tcp cwnd */
96#define ACK_RATIO_SHIFT 4 96 u16 unused;
97#define ACK_RATIO_LIMIT (32u << ACK_RATIO_SHIFT)
98 u16 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */
99 u8 sample_cnt; /* number of samples to decide curr_rtt */ 97 u8 sample_cnt; /* number of samples to decide curr_rtt */
100 u8 found; /* the exit point is found? */ 98 u8 found; /* the exit point is found? */
101 u32 round_start; /* beginning of each round */ 99 u32 round_start; /* beginning of each round */
@@ -114,7 +112,6 @@ static inline void bictcp_reset(struct bictcp *ca)
114 ca->bic_K = 0; 112 ca->bic_K = 0;
115 ca->delay_min = 0; 113 ca->delay_min = 0;
116 ca->epoch_start = 0; 114 ca->epoch_start = 0;
117 ca->delayed_ack = 2 << ACK_RATIO_SHIFT;
118 ca->ack_cnt = 0; 115 ca->ack_cnt = 0;
119 ca->tcp_cwnd = 0; 116 ca->tcp_cwnd = 0;
120 ca->found = 0; 117 ca->found = 0;
@@ -205,23 +202,30 @@ static u32 cubic_root(u64 a)
205/* 202/*
206 * Compute congestion window to use. 203 * Compute congestion window to use.
207 */ 204 */
208static inline void bictcp_update(struct bictcp *ca, u32 cwnd) 205static inline void bictcp_update(struct bictcp *ca, u32 cwnd, u32 acked)
209{ 206{
210 u32 delta, bic_target, max_cnt; 207 u32 delta, bic_target, max_cnt;
211 u64 offs, t; 208 u64 offs, t;
212 209
213 ca->ack_cnt++; /* count the number of ACKs */ 210 ca->ack_cnt += acked; /* count the number of ACKed packets */
214 211
215 if (ca->last_cwnd == cwnd && 212 if (ca->last_cwnd == cwnd &&
216 (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32) 213 (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32)
217 return; 214 return;
218 215
216 /* The CUBIC function can update ca->cnt at most once per jiffy.
217 * On all cwnd reduction events, ca->epoch_start is set to 0,
218 * which will force a recalculation of ca->cnt.
219 */
220 if (ca->epoch_start && tcp_time_stamp == ca->last_time)
221 goto tcp_friendliness;
222
219 ca->last_cwnd = cwnd; 223 ca->last_cwnd = cwnd;
220 ca->last_time = tcp_time_stamp; 224 ca->last_time = tcp_time_stamp;
221 225
222 if (ca->epoch_start == 0) { 226 if (ca->epoch_start == 0) {
223 ca->epoch_start = tcp_time_stamp; /* record beginning */ 227 ca->epoch_start = tcp_time_stamp; /* record beginning */
224 ca->ack_cnt = 1; /* start counting */ 228 ca->ack_cnt = acked; /* start counting */
225 ca->tcp_cwnd = cwnd; /* syn with cubic */ 229 ca->tcp_cwnd = cwnd; /* syn with cubic */
226 230
227 if (ca->last_max_cwnd <= cwnd) { 231 if (ca->last_max_cwnd <= cwnd) {
@@ -283,6 +287,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
283 if (ca->last_max_cwnd == 0 && ca->cnt > 20) 287 if (ca->last_max_cwnd == 0 && ca->cnt > 20)
284 ca->cnt = 20; /* increase cwnd 5% per RTT */ 288 ca->cnt = 20; /* increase cwnd 5% per RTT */
285 289
290tcp_friendliness:
286 /* TCP Friendly */ 291 /* TCP Friendly */
287 if (tcp_friendliness) { 292 if (tcp_friendliness) {
288 u32 scale = beta_scale; 293 u32 scale = beta_scale;
@@ -301,7 +306,6 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
301 } 306 }
302 } 307 }
303 308
304 ca->cnt = (ca->cnt << ACK_RATIO_SHIFT) / ca->delayed_ack;
305 if (ca->cnt == 0) /* cannot be zero */ 309 if (ca->cnt == 0) /* cannot be zero */
306 ca->cnt = 1; 310 ca->cnt = 1;
307} 311}
@@ -317,11 +321,12 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
317 if (tp->snd_cwnd <= tp->snd_ssthresh) { 321 if (tp->snd_cwnd <= tp->snd_ssthresh) {
318 if (hystart && after(ack, ca->end_seq)) 322 if (hystart && after(ack, ca->end_seq))
319 bictcp_hystart_reset(sk); 323 bictcp_hystart_reset(sk);
320 tcp_slow_start(tp, acked); 324 acked = tcp_slow_start(tp, acked);
321 } else { 325 if (!acked)
322 bictcp_update(ca, tp->snd_cwnd); 326 return;
323 tcp_cong_avoid_ai(tp, ca->cnt);
324 } 327 }
328 bictcp_update(ca, tp->snd_cwnd, acked);
329 tcp_cong_avoid_ai(tp, ca->cnt, acked);
325} 330}
326 331
327static u32 bictcp_recalc_ssthresh(struct sock *sk) 332static u32 bictcp_recalc_ssthresh(struct sock *sk)
@@ -411,20 +416,10 @@ static void hystart_update(struct sock *sk, u32 delay)
411 */ 416 */
412static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us) 417static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us)
413{ 418{
414 const struct inet_connection_sock *icsk = inet_csk(sk);
415 const struct tcp_sock *tp = tcp_sk(sk); 419 const struct tcp_sock *tp = tcp_sk(sk);
416 struct bictcp *ca = inet_csk_ca(sk); 420 struct bictcp *ca = inet_csk_ca(sk);
417 u32 delay; 421 u32 delay;
418 422
419 if (icsk->icsk_ca_state == TCP_CA_Open) {
420 u32 ratio = ca->delayed_ack;
421
422 ratio -= ca->delayed_ack >> ACK_RATIO_SHIFT;
423 ratio += cnt;
424
425 ca->delayed_ack = clamp(ratio, 1U, ACK_RATIO_LIMIT);
426 }
427
 428	/* Some calls are for duplicates without timestamps */ 423
429 if (rtt_us < 0) 424 if (rtt_us < 0)
430 return; 425 return;
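
The CUBIC changes above follow the same stretch-ACK theme: ca->ack_cnt now grows by the number of segments each ACK newly covers instead of by one per ACK, so the delayed_ack ratio machinery (ACK_RATIO_SHIFT and the scaling of ca->cnt in bictcp_acked()) can be removed. As a worked example, when GRO delivers one ACK covering 10 segments, the old code added 1 to ack_cnt and leaned on the ratio estimate to compensate, while the new code simply adds 10. The added epoch_start/last_time check only skips recomputing the cubic increase more than once per jiffy; it jumps to the new tcp_friendliness label, so the Reno-friendliness bookkeeping still runs on every call.
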
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index 815c85e3b1e0..53db2c309572 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -255,6 +255,9 @@ bool tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
255 struct tcp_fastopen_cookie valid_foc = { .len = -1 }; 255 struct tcp_fastopen_cookie valid_foc = { .len = -1 };
256 bool syn_data = TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1; 256 bool syn_data = TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1;
257 257
258 if (foc->len == 0) /* Client requests a cookie */
259 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENCOOKIEREQD);
260
258 if (!((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) && 261 if (!((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) &&
259 (syn_data || foc->len >= 0) && 262 (syn_data || foc->len >= 0) &&
260 tcp_fastopen_queue_check(sk))) { 263 tcp_fastopen_queue_check(sk))) {
@@ -265,7 +268,8 @@ bool tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
265 if (syn_data && (sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD)) 268 if (syn_data && (sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD))
266 goto fastopen; 269 goto fastopen;
267 270
268 if (tcp_fastopen_cookie_gen(req, skb, &valid_foc) && 271 if (foc->len >= 0 && /* Client presents or requests a cookie */
272 tcp_fastopen_cookie_gen(req, skb, &valid_foc) &&
269 foc->len == TCP_FASTOPEN_COOKIE_SIZE && 273 foc->len == TCP_FASTOPEN_COOKIE_SIZE &&
270 foc->len == valid_foc.len && 274 foc->len == valid_foc.len &&
271 !memcmp(foc->val, valid_foc.val, foc->len)) { 275 !memcmp(foc->val, valid_foc.val, foc->len)) {
@@ -284,11 +288,10 @@ fastopen:
284 LINUX_MIB_TCPFASTOPENPASSIVE); 288 LINUX_MIB_TCPFASTOPENPASSIVE);
285 return true; 289 return true;
286 } 290 }
287 } 291 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
292 } else if (foc->len > 0) /* Client presents an invalid cookie */
293 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
288 294
289 NET_INC_STATS_BH(sock_net(sk), foc->len ?
290 LINUX_MIB_TCPFASTOPENPASSIVEFAIL :
291 LINUX_MIB_TCPFASTOPENCOOKIEREQD);
292 *foc = valid_foc; 295 *foc = valid_foc;
293 return false; 296 return false;
294} 297}
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 075ab4d5af5e..8fdd27b17306 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -100,6 +100,7 @@ int sysctl_tcp_thin_dupack __read_mostly;
100 100
101int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; 101int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
102int sysctl_tcp_early_retrans __read_mostly = 3; 102int sysctl_tcp_early_retrans __read_mostly = 3;
103int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
103 104
104#define FLAG_DATA 0x01 /* Incoming frame contained data. */ 105#define FLAG_DATA 0x01 /* Incoming frame contained data. */
105#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ 106#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
@@ -3183,8 +3184,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3183 3184
3184 tp->fackets_out -= min(pkts_acked, tp->fackets_out); 3185 tp->fackets_out -= min(pkts_acked, tp->fackets_out);
3185 3186
3186 if (ca_ops->pkts_acked) 3187 if (ca_ops->pkts_acked) {
3187 ca_ops->pkts_acked(sk, pkts_acked, ca_seq_rtt_us); 3188 long rtt_us = min_t(ulong, ca_seq_rtt_us, sack_rtt_us);
3189 ca_ops->pkts_acked(sk, pkts_acked, rtt_us);
3190 }
3188 3191
3189 } else if (skb && rtt_update && sack_rtt_us >= 0 && 3192 } else if (skb && rtt_update && sack_rtt_us >= 0 &&
3190 sack_rtt_us > skb_mstamp_us_delta(&now, &skb->skb_mstamp)) { 3193 sack_rtt_us > skb_mstamp_us_delta(&now, &skb->skb_mstamp)) {
@@ -3319,13 +3322,22 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32
3319} 3322}
3320 3323
3321/* RFC 5961 7 [ACK Throttling] */ 3324/* RFC 5961 7 [ACK Throttling] */
3322static void tcp_send_challenge_ack(struct sock *sk) 3325static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb)
3323{ 3326{
3324 /* unprotected vars, we dont care of overwrites */ 3327 /* unprotected vars, we dont care of overwrites */
3325 static u32 challenge_timestamp; 3328 static u32 challenge_timestamp;
3326 static unsigned int challenge_count; 3329 static unsigned int challenge_count;
3327 u32 now = jiffies / HZ; 3330 struct tcp_sock *tp = tcp_sk(sk);
3331 u32 now;
3332
3333 /* First check our per-socket dupack rate limit. */
3334 if (tcp_oow_rate_limited(sock_net(sk), skb,
3335 LINUX_MIB_TCPACKSKIPPEDCHALLENGE,
3336 &tp->last_oow_ack_time))
3337 return;
3328 3338
 3339	/* Then check the host-wide RFC 5961 rate limit. */
3340 now = jiffies / HZ;
3329 if (now != challenge_timestamp) { 3341 if (now != challenge_timestamp) {
3330 challenge_timestamp = now; 3342 challenge_timestamp = now;
3331 challenge_count = 0; 3343 challenge_count = 0;
@@ -3358,34 +3370,34 @@ static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
3358} 3370}
3359 3371
3360/* This routine deals with acks during a TLP episode. 3372/* This routine deals with acks during a TLP episode.
3373 * We mark the end of a TLP episode on receiving TLP dupack or when
3374 * ack is after tlp_high_seq.
3361 * Ref: loss detection algorithm in draft-dukkipati-tcpm-tcp-loss-probe. 3375 * Ref: loss detection algorithm in draft-dukkipati-tcpm-tcp-loss-probe.
3362 */ 3376 */
3363static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) 3377static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
3364{ 3378{
3365 struct tcp_sock *tp = tcp_sk(sk); 3379 struct tcp_sock *tp = tcp_sk(sk);
3366 bool is_tlp_dupack = (ack == tp->tlp_high_seq) &&
3367 !(flag & (FLAG_SND_UNA_ADVANCED |
3368 FLAG_NOT_DUP | FLAG_DATA_SACKED));
3369 3380
3370 /* Mark the end of TLP episode on receiving TLP dupack or when 3381 if (before(ack, tp->tlp_high_seq))
3371 * ack is after tlp_high_seq.
3372 */
3373 if (is_tlp_dupack) {
3374 tp->tlp_high_seq = 0;
3375 return; 3382 return;
3376 }
3377 3383
3378 if (after(ack, tp->tlp_high_seq)) { 3384 if (flag & FLAG_DSACKING_ACK) {
3385 /* This DSACK means original and TLP probe arrived; no loss */
3386 tp->tlp_high_seq = 0;
3387 } else if (after(ack, tp->tlp_high_seq)) {
3388 /* ACK advances: there was a loss, so reduce cwnd. Reset
3389 * tlp_high_seq in tcp_init_cwnd_reduction()
3390 */
3391 tcp_init_cwnd_reduction(sk);
3392 tcp_set_ca_state(sk, TCP_CA_CWR);
3393 tcp_end_cwnd_reduction(sk);
3394 tcp_try_keep_open(sk);
3395 NET_INC_STATS_BH(sock_net(sk),
3396 LINUX_MIB_TCPLOSSPROBERECOVERY);
3397 } else if (!(flag & (FLAG_SND_UNA_ADVANCED |
3398 FLAG_NOT_DUP | FLAG_DATA_SACKED))) {
3399 /* Pure dupack: original and TLP probe arrived; no loss */
3379 tp->tlp_high_seq = 0; 3400 tp->tlp_high_seq = 0;
3380 /* Don't reduce cwnd if DSACK arrives for TLP retrans. */
3381 if (!(flag & FLAG_DSACKING_ACK)) {
3382 tcp_init_cwnd_reduction(sk);
3383 tcp_set_ca_state(sk, TCP_CA_CWR);
3384 tcp_end_cwnd_reduction(sk);
3385 tcp_try_keep_open(sk);
3386 NET_INC_STATS_BH(sock_net(sk),
3387 LINUX_MIB_TCPLOSSPROBERECOVERY);
3388 }
3389 } 3401 }
3390} 3402}
3391 3403
@@ -3421,7 +3433,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3421 if (before(ack, prior_snd_una)) { 3433 if (before(ack, prior_snd_una)) {
3422 /* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */ 3434 /* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */
3423 if (before(ack, prior_snd_una - tp->max_window)) { 3435 if (before(ack, prior_snd_una - tp->max_window)) {
3424 tcp_send_challenge_ack(sk); 3436 tcp_send_challenge_ack(sk, skb);
3425 return -1; 3437 return -1;
3426 } 3438 }
3427 goto old_ack; 3439 goto old_ack;
@@ -4990,7 +5002,10 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
4990 tcp_paws_discard(sk, skb)) { 5002 tcp_paws_discard(sk, skb)) {
4991 if (!th->rst) { 5003 if (!th->rst) {
4992 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); 5004 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
4993 tcp_send_dupack(sk, skb); 5005 if (!tcp_oow_rate_limited(sock_net(sk), skb,
5006 LINUX_MIB_TCPACKSKIPPEDPAWS,
5007 &tp->last_oow_ack_time))
5008 tcp_send_dupack(sk, skb);
4994 goto discard; 5009 goto discard;
4995 } 5010 }
4996 /* Reset is accepted even if it did not pass PAWS. */ 5011 /* Reset is accepted even if it did not pass PAWS. */
@@ -5007,7 +5022,10 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
5007 if (!th->rst) { 5022 if (!th->rst) {
5008 if (th->syn) 5023 if (th->syn)
5009 goto syn_challenge; 5024 goto syn_challenge;
5010 tcp_send_dupack(sk, skb); 5025 if (!tcp_oow_rate_limited(sock_net(sk), skb,
5026 LINUX_MIB_TCPACKSKIPPEDSEQ,
5027 &tp->last_oow_ack_time))
5028 tcp_send_dupack(sk, skb);
5011 } 5029 }
5012 goto discard; 5030 goto discard;
5013 } 5031 }
@@ -5023,7 +5041,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
5023 if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) 5041 if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt)
5024 tcp_reset(sk); 5042 tcp_reset(sk);
5025 else 5043 else
5026 tcp_send_challenge_ack(sk); 5044 tcp_send_challenge_ack(sk, skb);
5027 goto discard; 5045 goto discard;
5028 } 5046 }
5029 5047
@@ -5037,7 +5055,7 @@ syn_challenge:
5037 if (syn_inerr) 5055 if (syn_inerr)
5038 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); 5056 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
5039 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE); 5057 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE);
5040 tcp_send_challenge_ack(sk); 5058 tcp_send_challenge_ack(sk, skb);
5041 goto discard; 5059 goto discard;
5042 } 5060 }
5043 5061
@@ -5870,10 +5888,9 @@ static inline void pr_drop_req(struct request_sock *req, __u16 port, int family)
5870 * TCP ECN negotiation. 5888 * TCP ECN negotiation.
5871 * 5889 *
5872 * Exception: tcp_ca wants ECN. This is required for DCTCP 5890 * Exception: tcp_ca wants ECN. This is required for DCTCP
5873 * congestion control; it requires setting ECT on all packets, 5891 * congestion control: Linux DCTCP asserts ECT on all packets,
5874 * including SYN. We inverse the test in this case: If our 5892 * including SYN, which is most optimal solution; however,
5875 * local socket wants ECN, but peer only set ece/cwr (but not 5893 * others, such as FreeBSD do not.
5876 * ECT in IP header) its probably a non-DCTCP aware sender.
5877 */ 5894 */
5878static void tcp_ecn_create_request(struct request_sock *req, 5895static void tcp_ecn_create_request(struct request_sock *req,
5879 const struct sk_buff *skb, 5896 const struct sk_buff *skb,
@@ -5883,18 +5900,15 @@ static void tcp_ecn_create_request(struct request_sock *req,
5883 const struct tcphdr *th = tcp_hdr(skb); 5900 const struct tcphdr *th = tcp_hdr(skb);
5884 const struct net *net = sock_net(listen_sk); 5901 const struct net *net = sock_net(listen_sk);
5885 bool th_ecn = th->ece && th->cwr; 5902 bool th_ecn = th->ece && th->cwr;
5886 bool ect, need_ecn, ecn_ok; 5903 bool ect, ecn_ok;
5887 5904
5888 if (!th_ecn) 5905 if (!th_ecn)
5889 return; 5906 return;
5890 5907
5891 ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield); 5908 ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield);
5892 need_ecn = tcp_ca_needs_ecn(listen_sk);
5893 ecn_ok = net->ipv4.sysctl_tcp_ecn || dst_feature(dst, RTAX_FEATURE_ECN); 5909 ecn_ok = net->ipv4.sysctl_tcp_ecn || dst_feature(dst, RTAX_FEATURE_ECN);
5894 5910
5895 if (!ect && !need_ecn && ecn_ok) 5911 if ((!ect && ecn_ok) || tcp_ca_needs_ecn(listen_sk))
5896 inet_rsk(req)->ecn_ok = 1;
5897 else if (ect && need_ecn)
5898 inet_rsk(req)->ecn_ok = 1; 5912 inet_rsk(req)->ecn_ok = 1;
5899} 5913}
5900 5914
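
Several tcp_input.c hunks above gate dupacks and challenge ACKs behind tcp_oow_rate_limited(), driven by the per-socket last_oow_ack_time field and the new sysctl_tcp_invalid_ratelimit default of HZ/2 (500 ms). The sketch below models only the timing core, on the assumption that the helper essentially compares the time since the last out-of-window ACK against the sysctl; the real helper also qualifies which packets count and bumps the new TCPACKSKIPPED* counters.

/* Userspace sketch (demo only) of a per-socket rate limit on ACKs sent
 * in response to out-of-window segments: at most one per
 * sysctl_tcp_invalid_ratelimit interval.
 */
#include <inttypes.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define HZ 1000u			/* demo "jiffies" per second */
static uint32_t sysctl_tcp_invalid_ratelimit = HZ / 2;

/* Returns true if the caller should skip (rate-limit) this ACK. */
static bool oow_rate_limited(uint32_t now, uint32_t *last_oow_ack_time)
{
	if (*last_oow_ack_time &&
	    now - *last_oow_ack_time < sysctl_tcp_invalid_ratelimit)
		return true;		/* answered too recently, stay quiet */

	*last_oow_ack_time = now;	/* remember when we last answered */
	return false;
}

int main(void)
{
	uint32_t last = 0;
	uint32_t times[] = { 10, 200, 520, 600, 1100 };	/* arrival times */

	for (unsigned int i = 0; i < sizeof(times) / sizeof(times[0]); i++)
		printf("t=%4" PRIu32 " -> %s\n", times[i],
		       oow_rate_limited(times[i], &last) ? "skip" : "ack");
	return 0;
}

Bounding these replies per connection keeps a misbehaving or spoofed peer from driving an unbounded exchange of out-of-window ACKs, while the host-wide RFC 5961 challenge-ACK limit above it remains as a second line of defence.
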
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index a3f72d7fc06c..5a2dfed4783b 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -683,7 +683,8 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
683 arg.bound_dev_if = sk->sk_bound_dev_if; 683 arg.bound_dev_if = sk->sk_bound_dev_if;
684 684
685 arg.tos = ip_hdr(skb)->tos; 685 arg.tos = ip_hdr(skb)->tos;
686 ip_send_unicast_reply(net, skb, &TCP_SKB_CB(skb)->header.h4.opt, 686 ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
687 skb, &TCP_SKB_CB(skb)->header.h4.opt,
687 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 688 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
688 &arg, arg.iov[0].iov_len); 689 &arg, arg.iov[0].iov_len);
689 690
@@ -767,7 +768,8 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
767 if (oif) 768 if (oif)
768 arg.bound_dev_if = oif; 769 arg.bound_dev_if = oif;
769 arg.tos = tos; 770 arg.tos = tos;
770 ip_send_unicast_reply(net, skb, &TCP_SKB_CB(skb)->header.h4.opt, 771 ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
772 skb, &TCP_SKB_CB(skb)->header.h4.opt,
771 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 773 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
772 &arg, arg.iov[0].iov_len); 774 &arg, arg.iov[0].iov_len);
773 775
@@ -1340,6 +1342,8 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1340 } 1342 }
1341 sk_setup_caps(newsk, dst); 1343 sk_setup_caps(newsk, dst);
1342 1344
1345 tcp_ca_openreq_child(newsk, dst);
1346
1343 tcp_sync_mss(newsk, dst_mtu(dst)); 1347 tcp_sync_mss(newsk, dst_mtu(dst));
1344 newtp->advmss = dst_metric_advmss(dst); 1348 newtp->advmss = dst_metric_advmss(dst);
1345 if (tcp_sk(sk)->rx_opt.user_mss && 1349 if (tcp_sk(sk)->rx_opt.user_mss &&
@@ -2428,14 +2432,40 @@ struct proto tcp_prot = {
2428}; 2432};
2429EXPORT_SYMBOL(tcp_prot); 2433EXPORT_SYMBOL(tcp_prot);
2430 2434
2435static void __net_exit tcp_sk_exit(struct net *net)
2436{
2437 int cpu;
2438
2439 for_each_possible_cpu(cpu)
2440 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2441 free_percpu(net->ipv4.tcp_sk);
2442}
2443
2431static int __net_init tcp_sk_init(struct net *net) 2444static int __net_init tcp_sk_init(struct net *net)
2432{ 2445{
2446 int res, cpu;
2447
2448 net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2449 if (!net->ipv4.tcp_sk)
2450 return -ENOMEM;
2451
2452 for_each_possible_cpu(cpu) {
2453 struct sock *sk;
2454
2455 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2456 IPPROTO_TCP, net);
2457 if (res)
2458 goto fail;
2459 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2460 }
2433 net->ipv4.sysctl_tcp_ecn = 2; 2461 net->ipv4.sysctl_tcp_ecn = 2;
2462 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2434 return 0; 2463 return 0;
2435}
2436 2464
2437static void __net_exit tcp_sk_exit(struct net *net) 2465fail:
2438{ 2466 tcp_sk_exit(net);
2467
2468 return res;
2439} 2469}
2440 2470
2441static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) 2471static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c
index 272327134a1b..c2a75c6957a1 100644
--- a/net/ipv4/tcp_memcontrol.c
+++ b/net/ipv4/tcp_memcontrol.c
@@ -120,7 +120,7 @@ static ssize_t tcp_cgroup_write(struct kernfs_open_file *of,
120 switch (of_cft(of)->private) { 120 switch (of_cft(of)->private) {
121 case RES_LIMIT: 121 case RES_LIMIT:
122 /* see memcontrol.c */ 122 /* see memcontrol.c */
123 ret = page_counter_memparse(buf, &nr_pages); 123 ret = page_counter_memparse(buf, "-1", &nr_pages);
124 if (ret) 124 if (ret)
125 break; 125 break;
126 mutex_lock(&tcp_limit_mutex); 126 mutex_lock(&tcp_limit_mutex);
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index ed9c9a91851c..e5f41bd5ec1b 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -886,7 +886,8 @@ static int tcp_metrics_dump_info(struct sk_buff *skb,
886 if (tcp_metrics_fill_info(skb, tm) < 0) 886 if (tcp_metrics_fill_info(skb, tm) < 0)
887 goto nla_put_failure; 887 goto nla_put_failure;
888 888
889 return genlmsg_end(skb, hdr); 889 genlmsg_end(skb, hdr);
890 return 0;
890 891
891nla_put_failure: 892nla_put_failure:
892 genlmsg_cancel(skb, hdr); 893 genlmsg_cancel(skb, hdr);
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 63d2680b65db..dd11ac7798c6 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -58,6 +58,25 @@ static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
58 return seq == e_win && seq == end_seq; 58 return seq == e_win && seq == end_seq;
59} 59}
60 60
61static enum tcp_tw_status
62tcp_timewait_check_oow_rate_limit(struct inet_timewait_sock *tw,
63 const struct sk_buff *skb, int mib_idx)
64{
65 struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
66
67 if (!tcp_oow_rate_limited(twsk_net(tw), skb, mib_idx,
68 &tcptw->tw_last_oow_ack_time)) {
69 /* Send ACK. Note, we do not put the bucket,
70 * it will be released by caller.
71 */
72 return TCP_TW_ACK;
73 }
74
75 /* We are rate-limiting, so just release the tw sock and drop skb. */
76 inet_twsk_put(tw);
77 return TCP_TW_SUCCESS;
78}
79
61/* 80/*
62 * * Main purpose of TIME-WAIT state is to close connection gracefully, 81 * * Main purpose of TIME-WAIT state is to close connection gracefully,
63 * when one of ends sits in LAST-ACK or CLOSING retransmitting FIN 82 * when one of ends sits in LAST-ACK or CLOSING retransmitting FIN
@@ -116,7 +135,8 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
116 !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, 135 !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
117 tcptw->tw_rcv_nxt, 136 tcptw->tw_rcv_nxt,
118 tcptw->tw_rcv_nxt + tcptw->tw_rcv_wnd)) 137 tcptw->tw_rcv_nxt + tcptw->tw_rcv_wnd))
119 return TCP_TW_ACK; 138 return tcp_timewait_check_oow_rate_limit(
139 tw, skb, LINUX_MIB_TCPACKSKIPPEDFINWAIT2);
120 140
121 if (th->rst) 141 if (th->rst)
122 goto kill; 142 goto kill;
@@ -250,10 +270,8 @@ kill:
250 inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN, 270 inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
251 TCP_TIMEWAIT_LEN); 271 TCP_TIMEWAIT_LEN);
252 272
253 /* Send ACK. Note, we do not put the bucket, 273 return tcp_timewait_check_oow_rate_limit(
254 * it will be released by caller. 274 tw, skb, LINUX_MIB_TCPACKSKIPPEDTIMEWAIT);
255 */
256 return TCP_TW_ACK;
257 } 275 }
258 inet_twsk_put(tw); 276 inet_twsk_put(tw);
259 return TCP_TW_SUCCESS; 277 return TCP_TW_SUCCESS;
@@ -289,6 +307,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
289 tcptw->tw_ts_recent = tp->rx_opt.ts_recent; 307 tcptw->tw_ts_recent = tp->rx_opt.ts_recent;
290 tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; 308 tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
291 tcptw->tw_ts_offset = tp->tsoffset; 309 tcptw->tw_ts_offset = tp->tsoffset;
310 tcptw->tw_last_oow_ack_time = 0;
292 311
293#if IS_ENABLED(CONFIG_IPV6) 312#if IS_ENABLED(CONFIG_IPV6)
294 if (tw->tw_family == PF_INET6) { 313 if (tw->tw_family == PF_INET6) {
@@ -399,6 +418,32 @@ static void tcp_ecn_openreq_child(struct tcp_sock *tp,
399 tp->ecn_flags = inet_rsk(req)->ecn_ok ? TCP_ECN_OK : 0; 418 tp->ecn_flags = inet_rsk(req)->ecn_ok ? TCP_ECN_OK : 0;
400} 419}
401 420
421void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst)
422{
423 struct inet_connection_sock *icsk = inet_csk(sk);
424 u32 ca_key = dst_metric(dst, RTAX_CC_ALGO);
425 bool ca_got_dst = false;
426
427 if (ca_key != TCP_CA_UNSPEC) {
428 const struct tcp_congestion_ops *ca;
429
430 rcu_read_lock();
431 ca = tcp_ca_find_key(ca_key);
432 if (likely(ca && try_module_get(ca->owner))) {
433 icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst);
434 icsk->icsk_ca_ops = ca;
435 ca_got_dst = true;
436 }
437 rcu_read_unlock();
438 }
439
440 if (!ca_got_dst && !try_module_get(icsk->icsk_ca_ops->owner))
441 tcp_assign_congestion_control(sk);
442
443 tcp_set_ca_state(sk, TCP_CA_Open);
444}
445EXPORT_SYMBOL_GPL(tcp_ca_openreq_child);
446
402/* This is not only more efficient than what we used to do, it eliminates 447/* This is not only more efficient than what we used to do, it eliminates
403 * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM 448 * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
404 * 449 *
@@ -441,6 +486,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
441 tcp_enable_early_retrans(newtp); 486 tcp_enable_early_retrans(newtp);
442 newtp->tlp_high_seq = 0; 487 newtp->tlp_high_seq = 0;
443 newtp->lsndtime = treq->snt_synack; 488 newtp->lsndtime = treq->snt_synack;
489 newtp->last_oow_ack_time = 0;
444 newtp->total_retrans = req->num_retrans; 490 newtp->total_retrans = req->num_retrans;
445 491
446 /* So many TCP implementations out there (incorrectly) count the 492 /* So many TCP implementations out there (incorrectly) count the
@@ -451,10 +497,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
451 newtp->snd_cwnd = TCP_INIT_CWND; 497 newtp->snd_cwnd = TCP_INIT_CWND;
452 newtp->snd_cwnd_cnt = 0; 498 newtp->snd_cwnd_cnt = 0;
453 499
454 if (!try_module_get(newicsk->icsk_ca_ops->owner))
455 tcp_assign_congestion_control(newsk);
456
457 tcp_set_ca_state(newsk, TCP_CA_Open);
458 tcp_init_xmit_timers(newsk); 500 tcp_init_xmit_timers(newsk);
459 __skb_queue_head_init(&newtp->out_of_order_queue); 501 __skb_queue_head_init(&newtp->out_of_order_queue);
460 newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1; 502 newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1;
@@ -583,7 +625,11 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
583 * Reset timer after retransmitting SYNACK, similar to 625 * Reset timer after retransmitting SYNACK, similar to
584 * the idea of fast retransmit in recovery. 626 * the idea of fast retransmit in recovery.
585 */ 627 */
586 if (!inet_rtx_syn_ack(sk, req)) 628 if (!tcp_oow_rate_limited(sock_net(sk), skb,
629 LINUX_MIB_TCPACKSKIPPEDSYNRECV,
630 &tcp_rsk(req)->last_oow_ack_time) &&
631
632 !inet_rtx_syn_ack(sk, req))
587 req->expires = min(TCP_TIMEOUT_INIT << req->num_timeout, 633 req->expires = min(TCP_TIMEOUT_INIT << req->num_timeout,
588 TCP_RTO_MAX) + jiffies; 634 TCP_RTO_MAX) + jiffies;
589 return NULL; 635 return NULL;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 65caf8b95e17..a2a796c5536b 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -59,9 +59,6 @@ int sysctl_tcp_limit_output_bytes __read_mostly = 131072;
59 */ 59 */
60int sysctl_tcp_tso_win_divisor __read_mostly = 3; 60int sysctl_tcp_tso_win_divisor __read_mostly = 3;
61 61
62int sysctl_tcp_mtu_probing __read_mostly = 0;
63int sysctl_tcp_base_mss __read_mostly = TCP_BASE_MSS;
64
65/* By default, RFC2861 behavior. */ 62/* By default, RFC2861 behavior. */
66int sysctl_tcp_slow_start_after_idle __read_mostly = 1; 63int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
67 64
@@ -948,7 +945,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
948 945
949 skb_orphan(skb); 946 skb_orphan(skb);
950 skb->sk = sk; 947 skb->sk = sk;
951 skb->destructor = tcp_wfree; 948 skb->destructor = skb_is_tcp_pure_ack(skb) ? sock_wfree : tcp_wfree;
952 skb_set_hash_from_sk(skb, sk); 949 skb_set_hash_from_sk(skb, sk);
953 atomic_add(skb->truesize, &sk->sk_wmem_alloc); 950 atomic_add(skb->truesize, &sk->sk_wmem_alloc);
954 951
@@ -1350,11 +1347,12 @@ void tcp_mtup_init(struct sock *sk)
1350{ 1347{
1351 struct tcp_sock *tp = tcp_sk(sk); 1348 struct tcp_sock *tp = tcp_sk(sk);
1352 struct inet_connection_sock *icsk = inet_csk(sk); 1349 struct inet_connection_sock *icsk = inet_csk(sk);
1350 struct net *net = sock_net(sk);
1353 1351
1354 icsk->icsk_mtup.enabled = sysctl_tcp_mtu_probing > 1; 1352 icsk->icsk_mtup.enabled = net->ipv4.sysctl_tcp_mtu_probing > 1;
1355 icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) + 1353 icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) +
1356 icsk->icsk_af_ops->net_header_len; 1354 icsk->icsk_af_ops->net_header_len;
1357 icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, sysctl_tcp_base_mss); 1355 icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, net->ipv4.sysctl_tcp_base_mss);
1358 icsk->icsk_mtup.probe_size = 0; 1356 icsk->icsk_mtup.probe_size = 0;
1359} 1357}
1360EXPORT_SYMBOL(tcp_mtup_init); 1358EXPORT_SYMBOL(tcp_mtup_init);
@@ -2939,6 +2937,25 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2939} 2937}
2940EXPORT_SYMBOL(tcp_make_synack); 2938EXPORT_SYMBOL(tcp_make_synack);
2941 2939
2940static void tcp_ca_dst_init(struct sock *sk, const struct dst_entry *dst)
2941{
2942 struct inet_connection_sock *icsk = inet_csk(sk);
2943 const struct tcp_congestion_ops *ca;
2944 u32 ca_key = dst_metric(dst, RTAX_CC_ALGO);
2945
2946 if (ca_key == TCP_CA_UNSPEC)
2947 return;
2948
2949 rcu_read_lock();
2950 ca = tcp_ca_find_key(ca_key);
2951 if (likely(ca && try_module_get(ca->owner))) {
2952 module_put(icsk->icsk_ca_ops->owner);
2953 icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst);
2954 icsk->icsk_ca_ops = ca;
2955 }
2956 rcu_read_unlock();
2957}
2958
2942/* Do all connect socket setups that can be done AF independent. */ 2959/* Do all connect socket setups that can be done AF independent. */
2943static void tcp_connect_init(struct sock *sk) 2960static void tcp_connect_init(struct sock *sk)
2944{ 2961{
@@ -2964,6 +2981,8 @@ static void tcp_connect_init(struct sock *sk)
2964 tcp_mtup_init(sk); 2981 tcp_mtup_init(sk);
2965 tcp_sync_mss(sk, dst_mtu(dst)); 2982 tcp_sync_mss(sk, dst_mtu(dst));
2966 2983
2984 tcp_ca_dst_init(sk, dst);
2985
2967 if (!tp->window_clamp) 2986 if (!tp->window_clamp)
2968 tp->window_clamp = dst_metric(dst, RTAX_WINDOW); 2987 tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
2969 tp->advmss = dst_metric_advmss(dst); 2988 tp->advmss = dst_metric_advmss(dst);
@@ -3034,7 +3053,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
3034{ 3053{
3035 struct tcp_sock *tp = tcp_sk(sk); 3054 struct tcp_sock *tp = tcp_sk(sk);
3036 struct tcp_fastopen_request *fo = tp->fastopen_req; 3055 struct tcp_fastopen_request *fo = tp->fastopen_req;
3037 int syn_loss = 0, space, err = 0; 3056 int syn_loss = 0, space, err = 0, copied;
3038 unsigned long last_syn_loss = 0; 3057 unsigned long last_syn_loss = 0;
3039 struct sk_buff *syn_data; 3058 struct sk_buff *syn_data;
3040 3059
@@ -3072,11 +3091,16 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
3072 goto fallback; 3091 goto fallback;
3073 syn_data->ip_summed = CHECKSUM_PARTIAL; 3092 syn_data->ip_summed = CHECKSUM_PARTIAL;
3074 memcpy(syn_data->cb, syn->cb, sizeof(syn->cb)); 3093 memcpy(syn_data->cb, syn->cb, sizeof(syn->cb));
3075 if (unlikely(memcpy_fromiovecend(skb_put(syn_data, space), 3094 copied = copy_from_iter(skb_put(syn_data, space), space,
3076 fo->data->msg_iter.iov, 0, space))) { 3095 &fo->data->msg_iter);
3096 if (unlikely(!copied)) {
3077 kfree_skb(syn_data); 3097 kfree_skb(syn_data);
3078 goto fallback; 3098 goto fallback;
3079 } 3099 }
3100 if (copied != space) {
3101 skb_trim(syn_data, copied);
3102 space = copied;
3103 }
3080 3104
3081 /* No more data pending in inet_wait_for_connect() */ 3105 /* No more data pending in inet_wait_for_connect() */
3082 if (space == fo->size) 3106 if (space == fo->size)
@@ -3244,6 +3268,14 @@ void tcp_send_ack(struct sock *sk)
3244 skb_reserve(buff, MAX_TCP_HEADER); 3268 skb_reserve(buff, MAX_TCP_HEADER);
3245 tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK); 3269 tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK);
3246 3270
3271 /* We do not want pure acks influencing TCP Small Queues or fq/pacing
3272 * too much.
3273 * SKB_TRUESIZE(max(1 .. 66, MAX_TCP_HEADER)) is unfortunately ~784
3274 * We also avoid tcp_wfree() overhead (cache line miss accessing
3275 * tp->tsq_flags) by using regular sock_wfree()
3276 */
3277 skb_set_tcp_pure_ack(buff);
3278
3247 /* Send it off, this clears delayed acks for us. */ 3279 /* Send it off, this clears delayed acks for us. */
3248 skb_mstamp_get(&buff->skb_mstamp); 3280 skb_mstamp_get(&buff->skb_mstamp);
3249 tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC)); 3281 tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC));
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
index 6824afb65d93..333bcb2415ff 100644
--- a/net/ipv4/tcp_scalable.c
+++ b/net/ipv4/tcp_scalable.c
@@ -25,7 +25,8 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked)
25 if (tp->snd_cwnd <= tp->snd_ssthresh) 25 if (tp->snd_cwnd <= tp->snd_ssthresh)
26 tcp_slow_start(tp, acked); 26 tcp_slow_start(tp, acked);
27 else 27 else
28 tcp_cong_avoid_ai(tp, min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)); 28 tcp_cong_avoid_ai(tp, min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT),
29 1);
29} 30}
30 31
31static u32 tcp_scalable_ssthresh(struct sock *sk) 32static u32 tcp_scalable_ssthresh(struct sock *sk)
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 1829c7fbc77e..0732b787904e 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -101,17 +101,20 @@ static int tcp_orphan_retries(struct sock *sk, int alive)
101 101
102static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk) 102static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
103{ 103{
104 struct net *net = sock_net(sk);
105
104 /* Black hole detection */ 106 /* Black hole detection */
105 if (sysctl_tcp_mtu_probing) { 107 if (net->ipv4.sysctl_tcp_mtu_probing) {
106 if (!icsk->icsk_mtup.enabled) { 108 if (!icsk->icsk_mtup.enabled) {
107 icsk->icsk_mtup.enabled = 1; 109 icsk->icsk_mtup.enabled = 1;
108 tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); 110 tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
109 } else { 111 } else {
112 struct net *net = sock_net(sk);
110 struct tcp_sock *tp = tcp_sk(sk); 113 struct tcp_sock *tp = tcp_sk(sk);
111 int mss; 114 int mss;
112 115
113 mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low) >> 1; 116 mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low) >> 1;
114 mss = min(sysctl_tcp_base_mss, mss); 117 mss = min(net->ipv4.sysctl_tcp_base_mss, mss);
115 mss = max(mss, 68 - tp->tcp_header_len); 118 mss = max(mss, 68 - tp->tcp_header_len);
116 icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss); 119 icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
117 tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); 120 tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
index a4d2d2d88dca..112151eeee45 100644
--- a/net/ipv4/tcp_veno.c
+++ b/net/ipv4/tcp_veno.c
@@ -159,7 +159,7 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
159 /* In the "non-congestive state", increase cwnd 159 /* In the "non-congestive state", increase cwnd
160 * every rtt. 160 * every rtt.
161 */ 161 */
162 tcp_cong_avoid_ai(tp, tp->snd_cwnd); 162 tcp_cong_avoid_ai(tp, tp->snd_cwnd, 1);
163 } else { 163 } else {
164 /* In the "congestive state", increase cwnd 164 /* In the "congestive state", increase cwnd
165 * every other rtt. 165 * every other rtt.
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c
index cd7273218598..17d35662930d 100644
--- a/net/ipv4/tcp_yeah.c
+++ b/net/ipv4/tcp_yeah.c
@@ -92,7 +92,7 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked)
92 92
93 } else { 93 } else {
94 /* Reno */ 94 /* Reno */
95 tcp_cong_avoid_ai(tp, tp->snd_cwnd); 95 tcp_cong_avoid_ai(tp, tp->snd_cwnd, 1);
96 } 96 }
97 97
98 /* The key players are v_vegas.beg_snd_una and v_beg_snd_nxt. 98 /* The key players are v_vegas.beg_snd_una and v_beg_snd_nxt.
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 13b4dcf86ef6..97ef1f8b7be8 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1329,7 +1329,7 @@ try_again:
1329 *addr_len = sizeof(*sin); 1329 *addr_len = sizeof(*sin);
1330 } 1330 }
1331 if (inet->cmsg_flags) 1331 if (inet->cmsg_flags)
1332 ip_cmsg_recv(msg, skb); 1332 ip_cmsg_recv_offset(msg, skb, sizeof(struct udphdr));
1333 1333
1334 err = copied; 1334 err = copied;
1335 if (flags & MSG_TRUNC) 1335 if (flags & MSG_TRUNC)
@@ -1806,7 +1806,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
1806 if (sk != NULL) { 1806 if (sk != NULL) {
1807 int ret; 1807 int ret;
1808 1808
1809 if (udp_sk(sk)->convert_csum && uh->check && !IS_UDPLITE(sk)) 1809 if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk))
1810 skb_checksum_try_convert(skb, IPPROTO_UDP, uh->check, 1810 skb_checksum_try_convert(skb, IPPROTO_UDP, uh->check,
1811 inet_compute_pseudo); 1811 inet_compute_pseudo);
1812 1812
diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c
index 7927db0a9279..4a000f1dd757 100644
--- a/net/ipv4/udp_diag.c
+++ b/net/ipv4/udp_diag.c
@@ -99,11 +99,13 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb, struct netlin
99 s_slot = cb->args[0]; 99 s_slot = cb->args[0];
100 num = s_num = cb->args[1]; 100 num = s_num = cb->args[1];
101 101
102 for (slot = s_slot; slot <= table->mask; num = s_num = 0, slot++) { 102 for (slot = s_slot; slot <= table->mask; s_num = 0, slot++) {
103 struct sock *sk; 103 struct sock *sk;
104 struct hlist_nulls_node *node; 104 struct hlist_nulls_node *node;
105 struct udp_hslot *hslot = &table->hash[slot]; 105 struct udp_hslot *hslot = &table->hash[slot];
106 106
107 num = 0;
108
107 if (hlist_nulls_empty(&hslot->head)) 109 if (hlist_nulls_empty(&hslot->head))
108 continue; 110 continue;
109 111
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index d3e537ef6b7f..d10f6f4ead27 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -339,7 +339,8 @@ unflush:
339 skb_gro_pull(skb, sizeof(struct udphdr)); /* pull encapsulating udp header */ 339 skb_gro_pull(skb, sizeof(struct udphdr)); /* pull encapsulating udp header */
340 skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr)); 340 skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr));
341 NAPI_GRO_CB(skb)->proto = uo_priv->offload->ipproto; 341 NAPI_GRO_CB(skb)->proto = uo_priv->offload->ipproto;
342 pp = uo_priv->offload->callbacks.gro_receive(head, skb); 342 pp = uo_priv->offload->callbacks.gro_receive(head, skb,
343 uo_priv->offload);
343 344
344out_unlock: 345out_unlock:
345 rcu_read_unlock(); 346 rcu_read_unlock();
@@ -395,7 +396,9 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff)
395 396
396 if (uo_priv != NULL) { 397 if (uo_priv != NULL) {
397 NAPI_GRO_CB(skb)->proto = uo_priv->offload->ipproto; 398 NAPI_GRO_CB(skb)->proto = uo_priv->offload->ipproto;
398 err = uo_priv->offload->callbacks.gro_complete(skb, nhoff + sizeof(struct udphdr)); 399 err = uo_priv->offload->callbacks.gro_complete(skb,
400 nhoff + sizeof(struct udphdr),
401 uo_priv->offload);
399 } 402 }
400 403
401 rcu_read_unlock(); 404 rcu_read_unlock();
diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c
index 1671263e5fa0..c83b35485056 100644
--- a/net/ipv4/udp_tunnel.c
+++ b/net/ipv4/udp_tunnel.c
@@ -63,7 +63,7 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock,
63 inet_sk(sk)->mc_loop = 0; 63 inet_sk(sk)->mc_loop = 0;
64 64
65 /* Enable CHECKSUM_UNNECESSARY to CHECKSUM_COMPLETE conversion */ 65 /* Enable CHECKSUM_UNNECESSARY to CHECKSUM_COMPLETE conversion */
66 udp_set_convert_csum(sk, true); 66 inet_inc_convert_csum(sk);
67 67
68 rcu_assign_sk_user_data(sk, cfg->sk_user_data); 68 rcu_assign_sk_user_data(sk, cfg->sk_user_data);
69 69
@@ -75,10 +75,10 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock,
75} 75}
76EXPORT_SYMBOL_GPL(setup_udp_tunnel_sock); 76EXPORT_SYMBOL_GPL(setup_udp_tunnel_sock);
77 77
78int udp_tunnel_xmit_skb(struct socket *sock, struct rtable *rt, 78int udp_tunnel_xmit_skb(struct rtable *rt, struct sk_buff *skb,
79 struct sk_buff *skb, __be32 src, __be32 dst, 79 __be32 src, __be32 dst, __u8 tos, __u8 ttl,
80 __u8 tos, __u8 ttl, __be16 df, __be16 src_port, 80 __be16 df, __be16 src_port, __be16 dst_port,
81 __be16 dst_port, bool xnet) 81 bool xnet, bool nocheck)
82{ 82{
83 struct udphdr *uh; 83 struct udphdr *uh;
84 84
@@ -90,9 +90,9 @@ int udp_tunnel_xmit_skb(struct socket *sock, struct rtable *rt,
90 uh->source = src_port; 90 uh->source = src_port;
91 uh->len = htons(skb->len); 91 uh->len = htons(skb->len);
92 92
93 udp_set_csum(sock->sk->sk_no_check_tx, skb, src, dst, skb->len); 93 udp_set_csum(nocheck, skb, src, dst, skb->len);
94 94
95 return iptunnel_xmit(sock->sk, rt, skb, src, dst, IPPROTO_UDP, 95 return iptunnel_xmit(skb->sk, rt, skb, src, dst, IPPROTO_UDP,
96 tos, ttl, df, xnet); 96 tos, ttl, df, xnet);
97} 97}
98EXPORT_SYMBOL_GPL(udp_tunnel_xmit_skb); 98EXPORT_SYMBOL_GPL(udp_tunnel_xmit_skb);