Diffstat (limited to 'net/ipv4')
45 files changed, 1841 insertions, 1616 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index a44773c8346c..d2e49baaff63 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c | |||
@@ -395,8 +395,6 @@ int inet_release(struct socket *sock) | |||
395 | if (sk) { | 395 | if (sk) { |
396 | long timeout; | 396 | long timeout; |
397 | 397 | ||
398 | sock_rps_reset_flow(sk); | ||
399 | |||
400 | /* Applications forget to leave groups before exiting */ | 398 | /* Applications forget to leave groups before exiting */ |
401 | ip_mc_drop_socket(sk); | 399 | ip_mc_drop_socket(sk); |
402 | 400 | ||
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 5160c710f2eb..e361ea6f3fc8 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c | |||
@@ -378,20 +378,18 @@ static int cipso_v4_cache_check(const unsigned char *key, | |||
378 | * negative values on failure. | 378 | * negative values on failure. |
379 | * | 379 | * |
380 | */ | 380 | */ |
381 | int cipso_v4_cache_add(const struct sk_buff *skb, | 381 | int cipso_v4_cache_add(const unsigned char *cipso_ptr, |
382 | const struct netlbl_lsm_secattr *secattr) | 382 | const struct netlbl_lsm_secattr *secattr) |
383 | { | 383 | { |
384 | int ret_val = -EPERM; | 384 | int ret_val = -EPERM; |
385 | u32 bkt; | 385 | u32 bkt; |
386 | struct cipso_v4_map_cache_entry *entry = NULL; | 386 | struct cipso_v4_map_cache_entry *entry = NULL; |
387 | struct cipso_v4_map_cache_entry *old_entry = NULL; | 387 | struct cipso_v4_map_cache_entry *old_entry = NULL; |
388 | unsigned char *cipso_ptr; | ||
389 | u32 cipso_ptr_len; | 388 | u32 cipso_ptr_len; |
390 | 389 | ||
391 | if (!cipso_v4_cache_enabled || cipso_v4_cache_bucketsize <= 0) | 390 | if (!cipso_v4_cache_enabled || cipso_v4_cache_bucketsize <= 0) |
392 | return 0; | 391 | return 0; |
393 | 392 | ||
394 | cipso_ptr = CIPSO_V4_OPTPTR(skb); | ||
395 | cipso_ptr_len = cipso_ptr[1]; | 393 | cipso_ptr_len = cipso_ptr[1]; |
396 | 394 | ||
397 | entry = kzalloc(sizeof(*entry), GFP_ATOMIC); | 395 | entry = kzalloc(sizeof(*entry), GFP_ATOMIC); |
@@ -1579,6 +1577,33 @@ static int cipso_v4_parsetag_loc(const struct cipso_v4_doi *doi_def, | |||
1579 | } | 1577 | } |
1580 | 1578 | ||
1581 | /** | 1579 | /** |
1580 | * cipso_v4_optptr - Find the CIPSO option in the packet | ||
1581 | * @skb: the packet | ||
1582 | * | ||
1583 | * Description: | ||
1584 | * Parse the packet's IP header looking for a CIPSO option. Returns a pointer | ||
1585 | * to the start of the CIPSO option on success, NULL if one is not found. | ||
1586 | * | ||
1587 | */ | ||
1588 | unsigned char *cipso_v4_optptr(const struct sk_buff *skb) | ||
1589 | { | ||
1590 | const struct iphdr *iph = ip_hdr(skb); | ||
1591 | unsigned char *optptr = (unsigned char *)&(ip_hdr(skb)[1]); | ||
1592 | int optlen; | ||
1593 | int taglen; | ||
1594 | |||
1595 | for (optlen = iph->ihl*4 - sizeof(struct iphdr); optlen > 0; ) { | ||
1596 | if (optptr[0] == IPOPT_CIPSO) | ||
1597 | return optptr; | ||
1598 | taglen = optptr[1]; | ||
1599 | optlen -= taglen; | ||
1600 | optptr += taglen; | ||
1601 | } | ||
1602 | |||
1603 | return NULL; | ||
1604 | } | ||
1605 | |||
1606 | /** | ||
1582 | * cipso_v4_validate - Validate a CIPSO option | 1607 | * cipso_v4_validate - Validate a CIPSO option |
1583 | * @option: the start of the option, on error it is set to point to the error | 1608 | * @option: the start of the option, on error it is set to point to the error |
1584 | * | 1609 | * |
@@ -2119,8 +2144,8 @@ void cipso_v4_req_delattr(struct request_sock *req) | |||
2119 | * on success and negative values on failure. | 2144 | * on success and negative values on failure. |
2120 | * | 2145 | * |
2121 | */ | 2146 | */ |
2122 | static int cipso_v4_getattr(const unsigned char *cipso, | 2147 | int cipso_v4_getattr(const unsigned char *cipso, |
2123 | struct netlbl_lsm_secattr *secattr) | 2148 | struct netlbl_lsm_secattr *secattr) |
2124 | { | 2149 | { |
2125 | int ret_val = -ENOMSG; | 2150 | int ret_val = -ENOMSG; |
2126 | u32 doi; | 2151 | u32 doi; |
@@ -2305,22 +2330,6 @@ int cipso_v4_skbuff_delattr(struct sk_buff *skb) | |||
2305 | return 0; | 2330 | return 0; |
2306 | } | 2331 | } |
2307 | 2332 | ||
2308 | /** | ||
2309 | * cipso_v4_skbuff_getattr - Get the security attributes from the CIPSO option | ||
2310 | * @skb: the packet | ||
2311 | * @secattr: the security attributes | ||
2312 | * | ||
2313 | * Description: | ||
2314 | * Parse the given packet's CIPSO option and return the security attributes. | ||
2315 | * Returns zero on success and negative values on failure. | ||
2316 | * | ||
2317 | */ | ||
2318 | int cipso_v4_skbuff_getattr(const struct sk_buff *skb, | ||
2319 | struct netlbl_lsm_secattr *secattr) | ||
2320 | { | ||
2321 | return cipso_v4_getattr(CIPSO_V4_OPTPTR(skb), secattr); | ||
2322 | } | ||
2323 | |||
2324 | /* | 2333 | /* |
2325 | * Setup Functions | 2334 | * Setup Functions |
2326 | */ | 2335 | */ |
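
With cipso_v4_skbuff_getattr() removed, cipso_v4_optptr() and cipso_v4_getattr() become the exported building blocks: a caller locates the CIPSO option once and hands the raw option pointer to the attribute parser (and, on the cache side, to cipso_v4_cache_add()). A minimal caller-side sketch of that pattern follows; example_skb_getattr() is a hypothetical wrapper for illustration, not code from this patch.

/* Hypothetical wrapper showing the caller-side pattern implied by the
 * removal of cipso_v4_skbuff_getattr(): find the option, then parse it.
 */
static int example_skb_getattr(const struct sk_buff *skb,
			       struct netlbl_lsm_secattr *secattr)
{
	unsigned char *ptr;

	ptr = cipso_v4_optptr(skb);
	if (!ptr)
		return -ENOMSG;		/* no CIPSO option in this packet */

	return cipso_v4_getattr(ptr, secattr);
}
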
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 214882e7d6de..f0b4a31d7bd6 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c | |||
@@ -1522,7 +1522,8 @@ static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa, | |||
1522 | preferred, valid)) | 1522 | preferred, valid)) |
1523 | goto nla_put_failure; | 1523 | goto nla_put_failure; |
1524 | 1524 | ||
1525 | return nlmsg_end(skb, nlh); | 1525 | nlmsg_end(skb, nlh); |
1526 | return 0; | ||
1526 | 1527 | ||
1527 | nla_put_failure: | 1528 | nla_put_failure: |
1528 | nlmsg_cancel(skb, nlh); | 1529 | nlmsg_cancel(skb, nlh); |
@@ -1566,7 +1567,7 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) | |||
1566 | if (inet_fill_ifaddr(skb, ifa, | 1567 | if (inet_fill_ifaddr(skb, ifa, |
1567 | NETLINK_CB(cb->skb).portid, | 1568 | NETLINK_CB(cb->skb).portid, |
1568 | cb->nlh->nlmsg_seq, | 1569 | cb->nlh->nlmsg_seq, |
1569 | RTM_NEWADDR, NLM_F_MULTI) <= 0) { | 1570 | RTM_NEWADDR, NLM_F_MULTI) < 0) { |
1570 | rcu_read_unlock(); | 1571 | rcu_read_unlock(); |
1571 | goto done; | 1572 | goto done; |
1572 | } | 1573 | } |
@@ -1749,7 +1750,8 @@ static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex, | |||
1749 | IPV4_DEVCONF(*devconf, PROXY_ARP)) < 0) | 1750 | IPV4_DEVCONF(*devconf, PROXY_ARP)) < 0) |
1750 | goto nla_put_failure; | 1751 | goto nla_put_failure; |
1751 | 1752 | ||
1752 | return nlmsg_end(skb, nlh); | 1753 | nlmsg_end(skb, nlh); |
1754 | return 0; | ||
1753 | 1755 | ||
1754 | nla_put_failure: | 1756 | nla_put_failure: |
1755 | nlmsg_cancel(skb, nlh); | 1757 | nlmsg_cancel(skb, nlh); |
@@ -1881,7 +1883,7 @@ static int inet_netconf_dump_devconf(struct sk_buff *skb, | |||
1881 | cb->nlh->nlmsg_seq, | 1883 | cb->nlh->nlmsg_seq, |
1882 | RTM_NEWNETCONF, | 1884 | RTM_NEWNETCONF, |
1883 | NLM_F_MULTI, | 1885 | NLM_F_MULTI, |
1884 | -1) <= 0) { | 1886 | -1) < 0) { |
1885 | rcu_read_unlock(); | 1887 | rcu_read_unlock(); |
1886 | goto done; | 1888 | goto done; |
1887 | } | 1889 | } |
@@ -1897,7 +1899,7 @@ cont: | |||
1897 | NETLINK_CB(cb->skb).portid, | 1899 | NETLINK_CB(cb->skb).portid, |
1898 | cb->nlh->nlmsg_seq, | 1900 | cb->nlh->nlmsg_seq, |
1899 | RTM_NEWNETCONF, NLM_F_MULTI, | 1901 | RTM_NEWNETCONF, NLM_F_MULTI, |
1900 | -1) <= 0) | 1902 | -1) < 0) |
1901 | goto done; | 1903 | goto done; |
1902 | else | 1904 | else |
1903 | h++; | 1905 | h++; |
@@ -1908,7 +1910,7 @@ cont: | |||
1908 | NETLINK_CB(cb->skb).portid, | 1910 | NETLINK_CB(cb->skb).portid, |
1909 | cb->nlh->nlmsg_seq, | 1911 | cb->nlh->nlmsg_seq, |
1910 | RTM_NEWNETCONF, NLM_F_MULTI, | 1912 | RTM_NEWNETCONF, NLM_F_MULTI, |
1911 | -1) <= 0) | 1913 | -1) < 0) |
1912 | goto done; | 1914 | goto done; |
1913 | else | 1915 | else |
1914 | h++; | 1916 | h++; |
@@ -2320,7 +2322,7 @@ static __net_initdata struct pernet_operations devinet_ops = { | |||
2320 | .exit = devinet_exit_net, | 2322 | .exit = devinet_exit_net, |
2321 | }; | 2323 | }; |
2322 | 2324 | ||
2323 | static struct rtnl_af_ops inet_af_ops = { | 2325 | static struct rtnl_af_ops inet_af_ops __read_mostly = { |
2324 | .family = AF_INET, | 2326 | .family = AF_INET, |
2325 | .fill_link_af = inet_fill_link_af, | 2327 | .fill_link_af = inet_fill_link_af, |
2326 | .get_link_af_size = inet_get_link_af_size, | 2328 | .get_link_af_size = inet_get_link_af_size, |
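
The devinet changes follow the tree-wide netlink return-convention switch: nlmsg_end() no longer returns a message length, fill functions return 0 on success or a negative errno, and dump loops therefore stop only when the result is < 0 rather than <= 0. A sketch of the resulting fill-function shape, under those assumptions (example_fill() and the attribute chosen are illustrative, not taken from the patch):

static int example_fill(struct sk_buff *skb, u32 portid, u32 seq, int flags)
{
	struct nlmsghdr *nlh;

	nlh = nlmsg_put(skb, portid, seq, RTM_NEWADDR,
			sizeof(struct ifaddrmsg), flags);
	if (!nlh)
		return -EMSGSIZE;

	/* ... fill nlmsg_data(nlh) (struct ifaddrmsg) here ... */

	if (nla_put_string(skb, IFA_LABEL, "eth0"))	/* illustrative attribute */
		goto nla_put_failure;

	nlmsg_end(skb, nlh);	/* success path now returns 0, not a length */
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
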
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 23104a3f2924..57be71dd6a9e 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c | |||
@@ -67,7 +67,7 @@ static int __net_init fib4_rules_init(struct net *net) | |||
67 | return 0; | 67 | return 0; |
68 | 68 | ||
69 | fail: | 69 | fail: |
70 | kfree(local_table); | 70 | fib_free_table(local_table); |
71 | return -ENOMEM; | 71 | return -ENOMEM; |
72 | } | 72 | } |
73 | #else | 73 | #else |
@@ -109,6 +109,7 @@ struct fib_table *fib_new_table(struct net *net, u32 id) | |||
109 | return tb; | 109 | return tb; |
110 | } | 110 | } |
111 | 111 | ||
112 | /* caller must hold either rtnl or rcu read lock */ | ||
112 | struct fib_table *fib_get_table(struct net *net, u32 id) | 113 | struct fib_table *fib_get_table(struct net *net, u32 id) |
113 | { | 114 | { |
114 | struct fib_table *tb; | 115 | struct fib_table *tb; |
@@ -119,15 +120,11 @@ struct fib_table *fib_get_table(struct net *net, u32 id) | |||
119 | id = RT_TABLE_MAIN; | 120 | id = RT_TABLE_MAIN; |
120 | h = id & (FIB_TABLE_HASHSZ - 1); | 121 | h = id & (FIB_TABLE_HASHSZ - 1); |
121 | 122 | ||
122 | rcu_read_lock(); | ||
123 | head = &net->ipv4.fib_table_hash[h]; | 123 | head = &net->ipv4.fib_table_hash[h]; |
124 | hlist_for_each_entry_rcu(tb, head, tb_hlist) { | 124 | hlist_for_each_entry_rcu(tb, head, tb_hlist) { |
125 | if (tb->tb_id == id) { | 125 | if (tb->tb_id == id) |
126 | rcu_read_unlock(); | ||
127 | return tb; | 126 | return tb; |
128 | } | ||
129 | } | 127 | } |
130 | rcu_read_unlock(); | ||
131 | return NULL; | 128 | return NULL; |
132 | } | 129 | } |
133 | #endif /* CONFIG_IP_MULTIPLE_TABLES */ | 130 | #endif /* CONFIG_IP_MULTIPLE_TABLES */ |
@@ -167,16 +164,18 @@ static inline unsigned int __inet_dev_addr_type(struct net *net, | |||
167 | if (ipv4_is_multicast(addr)) | 164 | if (ipv4_is_multicast(addr)) |
168 | return RTN_MULTICAST; | 165 | return RTN_MULTICAST; |
169 | 166 | ||
167 | rcu_read_lock(); | ||
168 | |||
170 | local_table = fib_get_table(net, RT_TABLE_LOCAL); | 169 | local_table = fib_get_table(net, RT_TABLE_LOCAL); |
171 | if (local_table) { | 170 | if (local_table) { |
172 | ret = RTN_UNICAST; | 171 | ret = RTN_UNICAST; |
173 | rcu_read_lock(); | ||
174 | if (!fib_table_lookup(local_table, &fl4, &res, FIB_LOOKUP_NOREF)) { | 172 | if (!fib_table_lookup(local_table, &fl4, &res, FIB_LOOKUP_NOREF)) { |
175 | if (!dev || dev == res.fi->fib_dev) | 173 | if (!dev || dev == res.fi->fib_dev) |
176 | ret = res.type; | 174 | ret = res.type; |
177 | } | 175 | } |
178 | rcu_read_unlock(); | ||
179 | } | 176 | } |
177 | |||
178 | rcu_read_unlock(); | ||
180 | return ret; | 179 | return ret; |
181 | } | 180 | } |
182 | 181 | ||
@@ -919,7 +918,7 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim) | |||
919 | #undef BRD1_OK | 918 | #undef BRD1_OK |
920 | } | 919 | } |
921 | 920 | ||
922 | static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb) | 921 | static void nl_fib_lookup(struct net *net, struct fib_result_nl *frn) |
923 | { | 922 | { |
924 | 923 | ||
925 | struct fib_result res; | 924 | struct fib_result res; |
@@ -929,6 +928,11 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb) | |||
929 | .flowi4_tos = frn->fl_tos, | 928 | .flowi4_tos = frn->fl_tos, |
930 | .flowi4_scope = frn->fl_scope, | 929 | .flowi4_scope = frn->fl_scope, |
931 | }; | 930 | }; |
931 | struct fib_table *tb; | ||
932 | |||
933 | rcu_read_lock(); | ||
934 | |||
935 | tb = fib_get_table(net, frn->tb_id_in); | ||
932 | 936 | ||
933 | frn->err = -ENOENT; | 937 | frn->err = -ENOENT; |
934 | if (tb) { | 938 | if (tb) { |
@@ -945,6 +949,8 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb) | |||
945 | } | 949 | } |
946 | local_bh_enable(); | 950 | local_bh_enable(); |
947 | } | 951 | } |
952 | |||
953 | rcu_read_unlock(); | ||
948 | } | 954 | } |
949 | 955 | ||
950 | static void nl_fib_input(struct sk_buff *skb) | 956 | static void nl_fib_input(struct sk_buff *skb) |
@@ -952,7 +958,6 @@ static void nl_fib_input(struct sk_buff *skb) | |||
952 | struct net *net; | 958 | struct net *net; |
953 | struct fib_result_nl *frn; | 959 | struct fib_result_nl *frn; |
954 | struct nlmsghdr *nlh; | 960 | struct nlmsghdr *nlh; |
955 | struct fib_table *tb; | ||
956 | u32 portid; | 961 | u32 portid; |
957 | 962 | ||
958 | net = sock_net(skb->sk); | 963 | net = sock_net(skb->sk); |
@@ -967,9 +972,7 @@ static void nl_fib_input(struct sk_buff *skb) | |||
967 | nlh = nlmsg_hdr(skb); | 972 | nlh = nlmsg_hdr(skb); |
968 | 973 | ||
969 | frn = (struct fib_result_nl *) nlmsg_data(nlh); | 974 | frn = (struct fib_result_nl *) nlmsg_data(nlh); |
970 | tb = fib_get_table(net, frn->tb_id_in); | 975 | nl_fib_lookup(net, frn); |
971 | |||
972 | nl_fib_lookup(frn, tb); | ||
973 | 976 | ||
974 | portid = NETLINK_CB(skb).portid; /* netlink portid */ | 977 | portid = NETLINK_CB(skb).portid; /* netlink portid */ |
975 | NETLINK_CB(skb).portid = 0; /* from kernel */ | 978 | NETLINK_CB(skb).portid = 0; /* from kernel */ |
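
fib_get_table() now relies on the caller holding either the RTNL or the RCU read lock instead of taking rcu_read_lock() itself, which is why nl_fib_lookup() and __inet_dev_addr_type() grow their own lock/unlock pairs above. A caller-side sketch of the new rule, under those assumptions (example_lookup() itself is illustrative):

static int example_lookup(struct net *net, struct flowi4 *fl4,
			  struct fib_result *res)
{
	struct fib_table *tb;
	int err = -ENOENT;

	rcu_read_lock();		/* one lock covers table get + route lookup */
	tb = fib_get_table(net, RT_TABLE_MAIN);
	if (tb)
		err = fib_table_lookup(tb, fl4, res, FIB_LOOKUP_NOREF);
	rcu_read_unlock();

	return err;
}
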
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h index 1e4f6600b31d..825981b1049a 100644 --- a/net/ipv4/fib_lookup.h +++ b/net/ipv4/fib_lookup.h | |||
@@ -32,7 +32,6 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, u32 tb_id, | |||
32 | unsigned int); | 32 | unsigned int); |
33 | void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, int dst_len, | 33 | void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, int dst_len, |
34 | u32 tb_id, const struct nl_info *info, unsigned int nlm_flags); | 34 | u32 tb_id, const struct nl_info *info, unsigned int nlm_flags); |
35 | struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio); | ||
36 | 35 | ||
37 | static inline void fib_result_assign(struct fib_result *res, | 36 | static inline void fib_result_assign(struct fib_result *res, |
38 | struct fib_info *fi) | 37 | struct fib_info *fi) |
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 8f7bd56955b0..d3db718be51d 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c | |||
@@ -81,27 +81,25 @@ static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp, | |||
81 | break; | 81 | break; |
82 | 82 | ||
83 | case FR_ACT_UNREACHABLE: | 83 | case FR_ACT_UNREACHABLE: |
84 | err = -ENETUNREACH; | 84 | return -ENETUNREACH; |
85 | goto errout; | ||
86 | 85 | ||
87 | case FR_ACT_PROHIBIT: | 86 | case FR_ACT_PROHIBIT: |
88 | err = -EACCES; | 87 | return -EACCES; |
89 | goto errout; | ||
90 | 88 | ||
91 | case FR_ACT_BLACKHOLE: | 89 | case FR_ACT_BLACKHOLE: |
92 | default: | 90 | default: |
93 | err = -EINVAL; | 91 | return -EINVAL; |
94 | goto errout; | ||
95 | } | 92 | } |
96 | 93 | ||
94 | rcu_read_lock(); | ||
95 | |||
97 | tbl = fib_get_table(rule->fr_net, rule->table); | 96 | tbl = fib_get_table(rule->fr_net, rule->table); |
98 | if (!tbl) | 97 | if (tbl) |
99 | goto errout; | 98 | err = fib_table_lookup(tbl, &flp->u.ip4, |
99 | (struct fib_result *)arg->result, | ||
100 | arg->flags); | ||
100 | 101 | ||
101 | err = fib_table_lookup(tbl, &flp->u.ip4, (struct fib_result *) arg->result, arg->flags); | 102 | rcu_read_unlock(); |
102 | if (err > 0) | ||
103 | err = -EAGAIN; | ||
104 | errout: | ||
105 | return err; | 103 | return err; |
106 | } | 104 | } |
107 | 105 | ||
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index f99f41bd15b8..1e2090ea663e 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c | |||
@@ -360,7 +360,8 @@ static inline size_t fib_nlmsg_size(struct fib_info *fi) | |||
360 | + nla_total_size(4) /* RTA_TABLE */ | 360 | + nla_total_size(4) /* RTA_TABLE */ |
361 | + nla_total_size(4) /* RTA_DST */ | 361 | + nla_total_size(4) /* RTA_DST */ |
362 | + nla_total_size(4) /* RTA_PRIORITY */ | 362 | + nla_total_size(4) /* RTA_PRIORITY */ |
363 | + nla_total_size(4); /* RTA_PREFSRC */ | 363 | + nla_total_size(4) /* RTA_PREFSRC */ |
364 | + nla_total_size(TCP_CA_NAME_MAX); /* RTAX_CC_ALGO */ | ||
364 | 365 | ||
365 | /* space for nested metrics */ | 366 | /* space for nested metrics */ |
366 | payload += nla_total_size((RTAX_MAX * nla_total_size(4))); | 367 | payload += nla_total_size((RTAX_MAX * nla_total_size(4))); |
@@ -410,24 +411,6 @@ errout: | |||
410 | rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err); | 411 | rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err); |
411 | } | 412 | } |
412 | 413 | ||
413 | /* Return the first fib alias matching TOS with | ||
414 | * priority less than or equal to PRIO. | ||
415 | */ | ||
416 | struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio) | ||
417 | { | ||
418 | if (fah) { | ||
419 | struct fib_alias *fa; | ||
420 | list_for_each_entry(fa, fah, fa_list) { | ||
421 | if (fa->fa_tos > tos) | ||
422 | continue; | ||
423 | if (fa->fa_info->fib_priority >= prio || | ||
424 | fa->fa_tos < tos) | ||
425 | return fa; | ||
426 | } | ||
427 | } | ||
428 | return NULL; | ||
429 | } | ||
430 | |||
431 | static int fib_detect_death(struct fib_info *fi, int order, | 414 | static int fib_detect_death(struct fib_info *fi, int order, |
432 | struct fib_info **last_resort, int *last_idx, | 415 | struct fib_info **last_resort, int *last_idx, |
433 | int dflt) | 416 | int dflt) |
@@ -859,7 +842,16 @@ struct fib_info *fib_create_info(struct fib_config *cfg) | |||
859 | 842 | ||
860 | if (type > RTAX_MAX) | 843 | if (type > RTAX_MAX) |
861 | goto err_inval; | 844 | goto err_inval; |
862 | val = nla_get_u32(nla); | 845 | if (type == RTAX_CC_ALGO) { |
846 | char tmp[TCP_CA_NAME_MAX]; | ||
847 | |||
848 | nla_strlcpy(tmp, nla, sizeof(tmp)); | ||
849 | val = tcp_ca_get_key_by_name(tmp); | ||
850 | if (val == TCP_CA_UNSPEC) | ||
851 | goto err_inval; | ||
852 | } else { | ||
853 | val = nla_get_u32(nla); | ||
854 | } | ||
863 | if (type == RTAX_ADVMSS && val > 65535 - 40) | 855 | if (type == RTAX_ADVMSS && val > 65535 - 40) |
864 | val = 65535 - 40; | 856 | val = 65535 - 40; |
865 | if (type == RTAX_MTU && val > 65535 - 15) | 857 | if (type == RTAX_MTU && val > 65535 - 15) |
@@ -1081,7 +1073,8 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, | |||
1081 | nla_nest_end(skb, mp); | 1073 | nla_nest_end(skb, mp); |
1082 | } | 1074 | } |
1083 | #endif | 1075 | #endif |
1084 | return nlmsg_end(skb, nlh); | 1076 | nlmsg_end(skb, nlh); |
1077 | return 0; | ||
1085 | 1078 | ||
1086 | nla_put_failure: | 1079 | nla_put_failure: |
1087 | nlmsg_cancel(skb, nlh); | 1080 | nlmsg_cancel(skb, nlh); |
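
fib_create_info() now accepts RTAX_CC_ALGO as a string metric and stores it as a congestion-control key, and fib_nlmsg_size() reserves TCP_CA_NAME_MAX bytes for it. The conversion step, pulled out into a hypothetical helper for clarity (the helper name is illustrative; the calls mirror the hunk above):

static int cc_key_from_nlattr(const struct nlattr *nla, u32 *key)
{
	char tmp[TCP_CA_NAME_MAX];

	nla_strlcpy(tmp, nla, sizeof(tmp));
	*key = tcp_ca_get_key_by_name(tmp);
	if (*key == TCP_CA_UNSPEC)
		return -EINVAL;		/* unknown congestion control algorithm */
	return 0;
}
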
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 18bcaf2ff2fd..3daf0224ff2e 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c | |||
@@ -83,28 +83,33 @@ | |||
83 | 83 | ||
84 | #define MAX_STAT_DEPTH 32 | 84 | #define MAX_STAT_DEPTH 32 |
85 | 85 | ||
86 | #define KEYLENGTH (8*sizeof(t_key)) | 86 | #define KEYLENGTH (8*sizeof(t_key)) |
87 | #define KEY_MAX ((t_key)~0) | ||
87 | 88 | ||
88 | typedef unsigned int t_key; | 89 | typedef unsigned int t_key; |
89 | 90 | ||
90 | #define T_TNODE 0 | 91 | #define IS_TNODE(n) ((n)->bits) |
91 | #define T_LEAF 1 | 92 | #define IS_LEAF(n) (!(n)->bits) |
92 | #define NODE_TYPE_MASK 0x1UL | ||
93 | #define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK) | ||
94 | 93 | ||
95 | #define IS_TNODE(n) (!(n->parent & T_LEAF)) | 94 | #define get_index(_key, _kv) (((_key) ^ (_kv)->key) >> (_kv)->pos) |
96 | #define IS_LEAF(n) (n->parent & T_LEAF) | ||
97 | 95 | ||
98 | struct rt_trie_node { | 96 | struct tnode { |
99 | unsigned long parent; | ||
100 | t_key key; | ||
101 | }; | ||
102 | |||
103 | struct leaf { | ||
104 | unsigned long parent; | ||
105 | t_key key; | 97 | t_key key; |
106 | struct hlist_head list; | 98 | unsigned char bits; /* 2log(KEYLENGTH) bits needed */ |
99 | unsigned char pos; /* 2log(KEYLENGTH) bits needed */ | ||
100 | unsigned char slen; | ||
101 | struct tnode __rcu *parent; | ||
107 | struct rcu_head rcu; | 102 | struct rcu_head rcu; |
103 | union { | ||
104 | /* The fields in this struct are valid if bits > 0 (TNODE) */ | ||
105 | struct { | ||
106 | t_key empty_children; /* KEYLENGTH bits needed */ | ||
107 | t_key full_children; /* KEYLENGTH bits needed */ | ||
108 | struct tnode __rcu *child[0]; | ||
109 | }; | ||
110 | /* This list pointer if valid if bits == 0 (LEAF) */ | ||
111 | struct hlist_head list; | ||
112 | }; | ||
108 | }; | 113 | }; |
109 | 114 | ||
110 | struct leaf_info { | 115 | struct leaf_info { |
@@ -115,20 +120,6 @@ struct leaf_info { | |||
115 | struct rcu_head rcu; | 120 | struct rcu_head rcu; |
116 | }; | 121 | }; |
117 | 122 | ||
118 | struct tnode { | ||
119 | unsigned long parent; | ||
120 | t_key key; | ||
121 | unsigned char pos; /* 2log(KEYLENGTH) bits needed */ | ||
122 | unsigned char bits; /* 2log(KEYLENGTH) bits needed */ | ||
123 | unsigned int full_children; /* KEYLENGTH bits needed */ | ||
124 | unsigned int empty_children; /* KEYLENGTH bits needed */ | ||
125 | union { | ||
126 | struct rcu_head rcu; | ||
127 | struct tnode *tnode_free; | ||
128 | }; | ||
129 | struct rt_trie_node __rcu *child[0]; | ||
130 | }; | ||
131 | |||
132 | #ifdef CONFIG_IP_FIB_TRIE_STATS | 123 | #ifdef CONFIG_IP_FIB_TRIE_STATS |
133 | struct trie_use_stats { | 124 | struct trie_use_stats { |
134 | unsigned int gets; | 125 | unsigned int gets; |
@@ -151,19 +142,13 @@ struct trie_stat { | |||
151 | }; | 142 | }; |
152 | 143 | ||
153 | struct trie { | 144 | struct trie { |
154 | struct rt_trie_node __rcu *trie; | 145 | struct tnode __rcu *trie; |
155 | #ifdef CONFIG_IP_FIB_TRIE_STATS | 146 | #ifdef CONFIG_IP_FIB_TRIE_STATS |
156 | struct trie_use_stats stats; | 147 | struct trie_use_stats __percpu *stats; |
157 | #endif | 148 | #endif |
158 | }; | 149 | }; |
159 | 150 | ||
160 | static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n, | 151 | static void resize(struct trie *t, struct tnode *tn); |
161 | int wasfull); | ||
162 | static struct rt_trie_node *resize(struct trie *t, struct tnode *tn); | ||
163 | static struct tnode *inflate(struct trie *t, struct tnode *tn); | ||
164 | static struct tnode *halve(struct trie *t, struct tnode *tn); | ||
165 | /* tnodes to free after resize(); protected by RTNL */ | ||
166 | static struct tnode *tnode_free_head; | ||
167 | static size_t tnode_free_size; | 152 | static size_t tnode_free_size; |
168 | 153 | ||
169 | /* | 154 | /* |
@@ -176,170 +161,101 @@ static const int sync_pages = 128; | |||
176 | static struct kmem_cache *fn_alias_kmem __read_mostly; | 161 | static struct kmem_cache *fn_alias_kmem __read_mostly; |
177 | static struct kmem_cache *trie_leaf_kmem __read_mostly; | 162 | static struct kmem_cache *trie_leaf_kmem __read_mostly; |
178 | 163 | ||
179 | /* | 164 | /* caller must hold RTNL */ |
180 | * caller must hold RTNL | 165 | #define node_parent(n) rtnl_dereference((n)->parent) |
181 | */ | ||
182 | static inline struct tnode *node_parent(const struct rt_trie_node *node) | ||
183 | { | ||
184 | unsigned long parent; | ||
185 | |||
186 | parent = rcu_dereference_index_check(node->parent, lockdep_rtnl_is_held()); | ||
187 | 166 | ||
188 | return (struct tnode *)(parent & ~NODE_TYPE_MASK); | 167 | /* caller must hold RCU read lock or RTNL */ |
189 | } | 168 | #define node_parent_rcu(n) rcu_dereference_rtnl((n)->parent) |
190 | 169 | ||
191 | /* | 170 | /* wrapper for rcu_assign_pointer */ |
192 | * caller must hold RCU read lock or RTNL | 171 | static inline void node_set_parent(struct tnode *n, struct tnode *tp) |
193 | */ | ||
194 | static inline struct tnode *node_parent_rcu(const struct rt_trie_node *node) | ||
195 | { | 172 | { |
196 | unsigned long parent; | 173 | if (n) |
197 | 174 | rcu_assign_pointer(n->parent, tp); | |
198 | parent = rcu_dereference_index_check(node->parent, rcu_read_lock_held() || | ||
199 | lockdep_rtnl_is_held()); | ||
200 | |||
201 | return (struct tnode *)(parent & ~NODE_TYPE_MASK); | ||
202 | } | 175 | } |
203 | 176 | ||
204 | /* Same as rcu_assign_pointer | 177 | #define NODE_INIT_PARENT(n, p) RCU_INIT_POINTER((n)->parent, p) |
205 | * but that macro() assumes that value is a pointer. | 178 | |
179 | /* This provides us with the number of children in this node, in the case of a | ||
180 | * leaf this will return 0 meaning none of the children are accessible. | ||
206 | */ | 181 | */ |
207 | static inline void node_set_parent(struct rt_trie_node *node, struct tnode *ptr) | 182 | static inline unsigned long tnode_child_length(const struct tnode *tn) |
208 | { | 183 | { |
209 | smp_wmb(); | 184 | return (1ul << tn->bits) & ~(1ul); |
210 | node->parent = (unsigned long)ptr | NODE_TYPE(node); | ||
211 | } | 185 | } |
212 | 186 | ||
213 | /* | 187 | /* caller must hold RTNL */ |
214 | * caller must hold RTNL | 188 | static inline struct tnode *tnode_get_child(const struct tnode *tn, |
215 | */ | 189 | unsigned long i) |
216 | static inline struct rt_trie_node *tnode_get_child(const struct tnode *tn, unsigned int i) | ||
217 | { | 190 | { |
218 | BUG_ON(i >= 1U << tn->bits); | ||
219 | |||
220 | return rtnl_dereference(tn->child[i]); | 191 | return rtnl_dereference(tn->child[i]); |
221 | } | 192 | } |
222 | 193 | ||
223 | /* | 194 | /* caller must hold RCU read lock or RTNL */ |
224 | * caller must hold RCU read lock or RTNL | 195 | static inline struct tnode *tnode_get_child_rcu(const struct tnode *tn, |
225 | */ | 196 | unsigned long i) |
226 | static inline struct rt_trie_node *tnode_get_child_rcu(const struct tnode *tn, unsigned int i) | ||
227 | { | 197 | { |
228 | BUG_ON(i >= 1U << tn->bits); | ||
229 | |||
230 | return rcu_dereference_rtnl(tn->child[i]); | 198 | return rcu_dereference_rtnl(tn->child[i]); |
231 | } | 199 | } |
232 | 200 | ||
233 | static inline int tnode_child_length(const struct tnode *tn) | 201 | /* To understand this stuff, an understanding of keys and all their bits is |
234 | { | 202 | * necessary. Every node in the trie has a key associated with it, but not |
235 | return 1 << tn->bits; | 203 | * all of the bits in that key are significant. |
236 | } | 204 | * |
237 | 205 | * Consider a node 'n' and its parent 'tp'. | |
238 | static inline t_key mask_pfx(t_key k, unsigned int l) | 206 | * |
239 | { | 207 | * If n is a leaf, every bit in its key is significant. Its presence is |
240 | return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l); | 208 | * necessitated by path compression, since during a tree traversal (when |
241 | } | 209 | * searching for a leaf - unless we are doing an insertion) we will completely |
242 | 210 | * ignore all skipped bits we encounter. Thus we need to verify, at the end of | |
243 | static inline t_key tkey_extract_bits(t_key a, unsigned int offset, unsigned int bits) | 211 | * a potentially successful search, that we have indeed been walking the |
244 | { | 212 | * correct key path. |
245 | if (offset < KEYLENGTH) | 213 | * |
246 | return ((t_key)(a << offset)) >> (KEYLENGTH - bits); | 214 | * Note that we can never "miss" the correct key in the tree if present by |
247 | else | 215 | * following the wrong path. Path compression ensures that segments of the key |
248 | return 0; | 216 | * that are the same for all keys with a given prefix are skipped, but the |
249 | } | 217 | * skipped part *is* identical for each node in the subtrie below the skipped |
250 | 218 | * bit! trie_insert() in this implementation takes care of that. | |
251 | static inline int tkey_equals(t_key a, t_key b) | 219 | * |
252 | { | 220 | * if n is an internal node - a 'tnode' here, the various parts of its key |
253 | return a == b; | 221 | * have many different meanings. |
254 | } | 222 | * |
255 | 223 | * Example: | |
256 | static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b) | 224 | * _________________________________________________________________ |
257 | { | 225 | * | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C | |
258 | if (bits == 0 || offset >= KEYLENGTH) | 226 | * ----------------------------------------------------------------- |
259 | return 1; | 227 | * 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 |
260 | bits = bits > KEYLENGTH ? KEYLENGTH : bits; | 228 | * |
261 | return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0; | 229 | * _________________________________________________________________ |
262 | } | 230 | * | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u | |
263 | 231 | * ----------------------------------------------------------------- | |
264 | static inline int tkey_mismatch(t_key a, int offset, t_key b) | 232 | * 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 |
265 | { | 233 | * |
266 | t_key diff = a ^ b; | 234 | * tp->pos = 22 |
267 | int i = offset; | 235 | * tp->bits = 3 |
268 | 236 | * n->pos = 13 | |
269 | if (!diff) | 237 | * n->bits = 4 |
270 | return 0; | 238 | * |
271 | while ((diff << i) >> (KEYLENGTH-1) == 0) | 239 | * First, let's just ignore the bits that come before the parent tp, that is |
272 | i++; | 240 | * the bits from (tp->pos + tp->bits) to 31. They are *known* but at this |
273 | return i; | 241 | * point we do not use them for anything. |
274 | } | 242 | * |
275 | 243 | * The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the | |
276 | /* | 244 | * index into the parent's child array. That is, they will be used to find |
277 | To understand this stuff, an understanding of keys and all their bits is | 245 | * 'n' among tp's children. |
278 | necessary. Every node in the trie has a key associated with it, but not | 246 | * |
279 | all of the bits in that key are significant. | 247 | * The bits from (n->pos + n->bits) to (tp->pos - 1) - "S" - are skipped bits |
280 | 248 | * for the node n. | |
281 | Consider a node 'n' and its parent 'tp'. | 249 | * |
282 | 250 | * All the bits we have seen so far are significant to the node n. The rest | |
283 | If n is a leaf, every bit in its key is significant. Its presence is | 251 | * of the bits are really not needed or indeed known in n->key. |
284 | necessitated by path compression, since during a tree traversal (when | 252 | * |
285 | searching for a leaf - unless we are doing an insertion) we will completely | 253 | * The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into |
286 | ignore all skipped bits we encounter. Thus we need to verify, at the end of | 254 | * n's child array, and will of course be different for each child. |
287 | a potentially successful search, that we have indeed been walking the | 255 | * |
288 | correct key path. | 256 | * The rest of the bits, from 0 to (n->pos + n->bits), are completely unknown |
289 | 257 | * at this point. | |
290 | Note that we can never "miss" the correct key in the tree if present by | 258 | */ |
291 | following the wrong path. Path compression ensures that segments of the key | ||
292 | that are the same for all keys with a given prefix are skipped, but the | ||
293 | skipped part *is* identical for each node in the subtrie below the skipped | ||
294 | bit! trie_insert() in this implementation takes care of that - note the | ||
295 | call to tkey_sub_equals() in trie_insert(). | ||
296 | |||
297 | if n is an internal node - a 'tnode' here, the various parts of its key | ||
298 | have many different meanings. | ||
299 | |||
300 | Example: | ||
301 | _________________________________________________________________ | ||
302 | | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C | | ||
303 | ----------------------------------------------------------------- | ||
304 | 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | ||
305 | |||
306 | _________________________________________________________________ | ||
307 | | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u | | ||
308 | ----------------------------------------------------------------- | ||
309 | 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 | ||
310 | |||
311 | tp->pos = 7 | ||
312 | tp->bits = 3 | ||
313 | n->pos = 15 | ||
314 | n->bits = 4 | ||
315 | |||
316 | First, let's just ignore the bits that come before the parent tp, that is | ||
317 | the bits from 0 to (tp->pos-1). They are *known* but at this point we do | ||
318 | not use them for anything. | ||
319 | |||
320 | The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the | ||
321 | index into the parent's child array. That is, they will be used to find | ||
322 | 'n' among tp's children. | ||
323 | |||
324 | The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits | ||
325 | for the node n. | ||
326 | |||
327 | All the bits we have seen so far are significant to the node n. The rest | ||
328 | of the bits are really not needed or indeed known in n->key. | ||
329 | |||
330 | The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into | ||
331 | n's child array, and will of course be different for each child. | ||
332 | |||
333 | |||
334 | The rest of the bits, from (n->pos + n->bits) onward, are completely unknown | ||
335 | at this point. | ||
336 | |||
337 | */ | ||
338 | |||
339 | static inline void check_tnode(const struct tnode *tn) | ||
340 | { | ||
341 | WARN_ON(tn && tn->pos+tn->bits > 32); | ||
342 | } | ||
343 | 259 | ||
344 | static const int halve_threshold = 25; | 260 | static const int halve_threshold = 25; |
345 | static const int inflate_threshold = 50; | 261 | static const int inflate_threshold = 50; |
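
The rewritten tnode layout replaces tkey_extract_bits() with the get_index() macro, and the key-layout comment above defines which bit ranges act as child indices. A small worked sketch using the example values from that comment (parent tp with pos = 22, bits = 3); example_key_below() is illustrative only, assuming the tnode fields introduced in this patch:

/* Worked sketch of get_index(): with tnode_new() zeroing the key bits
 * below (pos + bits), (key ^ tp->key) >> tp->pos yields the child index
 * in its low tp->bits bits and a prefix-mismatch indicator above them.
 */
static inline bool example_key_below(t_key key, const struct tnode *tp)
{
	unsigned long index = get_index(key, tp);

	/* With tp->pos = 22 and tp->bits = 3, bits 22..24 of the XOR pick
	 * one of tp's 8 children.  If any bit above that range differs,
	 * index >= (1ul << tp->bits) and the key cannot sit below tp.
	 */
	return index < (1ul << tp->bits);
}
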
@@ -357,17 +273,23 @@ static inline void alias_free_mem_rcu(struct fib_alias *fa) | |||
357 | call_rcu(&fa->rcu, __alias_free_mem); | 273 | call_rcu(&fa->rcu, __alias_free_mem); |
358 | } | 274 | } |
359 | 275 | ||
360 | static void __leaf_free_rcu(struct rcu_head *head) | 276 | #define TNODE_KMALLOC_MAX \ |
361 | { | 277 | ilog2((PAGE_SIZE - sizeof(struct tnode)) / sizeof(struct tnode *)) |
362 | struct leaf *l = container_of(head, struct leaf, rcu); | ||
363 | kmem_cache_free(trie_leaf_kmem, l); | ||
364 | } | ||
365 | 278 | ||
366 | static inline void free_leaf(struct leaf *l) | 279 | static void __node_free_rcu(struct rcu_head *head) |
367 | { | 280 | { |
368 | call_rcu(&l->rcu, __leaf_free_rcu); | 281 | struct tnode *n = container_of(head, struct tnode, rcu); |
282 | |||
283 | if (IS_LEAF(n)) | ||
284 | kmem_cache_free(trie_leaf_kmem, n); | ||
285 | else if (n->bits <= TNODE_KMALLOC_MAX) | ||
286 | kfree(n); | ||
287 | else | ||
288 | vfree(n); | ||
369 | } | 289 | } |
370 | 290 | ||
291 | #define node_free(n) call_rcu(&n->rcu, __node_free_rcu) | ||
292 | |||
371 | static inline void free_leaf_info(struct leaf_info *leaf) | 293 | static inline void free_leaf_info(struct leaf_info *leaf) |
372 | { | 294 | { |
373 | kfree_rcu(leaf, rcu); | 295 | kfree_rcu(leaf, rcu); |
@@ -381,56 +303,31 @@ static struct tnode *tnode_alloc(size_t size) | |||
381 | return vzalloc(size); | 303 | return vzalloc(size); |
382 | } | 304 | } |
383 | 305 | ||
384 | static void __tnode_free_rcu(struct rcu_head *head) | 306 | static inline void empty_child_inc(struct tnode *n) |
385 | { | ||
386 | struct tnode *tn = container_of(head, struct tnode, rcu); | ||
387 | size_t size = sizeof(struct tnode) + | ||
388 | (sizeof(struct rt_trie_node *) << tn->bits); | ||
389 | |||
390 | if (size <= PAGE_SIZE) | ||
391 | kfree(tn); | ||
392 | else | ||
393 | vfree(tn); | ||
394 | } | ||
395 | |||
396 | static inline void tnode_free(struct tnode *tn) | ||
397 | { | ||
398 | if (IS_LEAF(tn)) | ||
399 | free_leaf((struct leaf *) tn); | ||
400 | else | ||
401 | call_rcu(&tn->rcu, __tnode_free_rcu); | ||
402 | } | ||
403 | |||
404 | static void tnode_free_safe(struct tnode *tn) | ||
405 | { | 307 | { |
406 | BUG_ON(IS_LEAF(tn)); | 308 | ++n->empty_children ? : ++n->full_children; |
407 | tn->tnode_free = tnode_free_head; | ||
408 | tnode_free_head = tn; | ||
409 | tnode_free_size += sizeof(struct tnode) + | ||
410 | (sizeof(struct rt_trie_node *) << tn->bits); | ||
411 | } | 309 | } |
412 | 310 | ||
413 | static void tnode_free_flush(void) | 311 | static inline void empty_child_dec(struct tnode *n) |
414 | { | 312 | { |
415 | struct tnode *tn; | 313 | n->empty_children-- ? : n->full_children--; |
416 | |||
417 | while ((tn = tnode_free_head)) { | ||
418 | tnode_free_head = tn->tnode_free; | ||
419 | tn->tnode_free = NULL; | ||
420 | tnode_free(tn); | ||
421 | } | ||
422 | |||
423 | if (tnode_free_size >= PAGE_SIZE * sync_pages) { | ||
424 | tnode_free_size = 0; | ||
425 | synchronize_rcu(); | ||
426 | } | ||
427 | } | 314 | } |
428 | 315 | ||
429 | static struct leaf *leaf_new(void) | 316 | static struct tnode *leaf_new(t_key key) |
430 | { | 317 | { |
431 | struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL); | 318 | struct tnode *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL); |
432 | if (l) { | 319 | if (l) { |
433 | l->parent = T_LEAF; | 320 | l->parent = NULL; |
321 | /* set key and pos to reflect full key value | ||
322 | * any trailing zeros in the key should be ignored | ||
323 | * as the nodes are searched | ||
324 | */ | ||
325 | l->key = key; | ||
326 | l->slen = 0; | ||
327 | l->pos = 0; | ||
328 | /* set bits to 0 indicating we are not a tnode */ | ||
329 | l->bits = 0; | ||
330 | |||
434 | INIT_HLIST_HEAD(&l->list); | 331 | INIT_HLIST_HEAD(&l->list); |
435 | } | 332 | } |
436 | return l; | 333 | return l; |
@@ -449,462 +346,530 @@ static struct leaf_info *leaf_info_new(int plen) | |||
449 | 346 | ||
450 | static struct tnode *tnode_new(t_key key, int pos, int bits) | 347 | static struct tnode *tnode_new(t_key key, int pos, int bits) |
451 | { | 348 | { |
452 | size_t sz = sizeof(struct tnode) + (sizeof(struct rt_trie_node *) << bits); | 349 | size_t sz = offsetof(struct tnode, child[1ul << bits]); |
453 | struct tnode *tn = tnode_alloc(sz); | 350 | struct tnode *tn = tnode_alloc(sz); |
351 | unsigned int shift = pos + bits; | ||
352 | |||
353 | /* verify bits and pos have their msb bits clear and values are valid */ ||
354 | BUG_ON(!bits || (shift > KEYLENGTH)); | ||
454 | 355 | ||
455 | if (tn) { | 356 | if (tn) { |
456 | tn->parent = T_TNODE; | 357 | tn->parent = NULL; |
358 | tn->slen = pos; | ||
457 | tn->pos = pos; | 359 | tn->pos = pos; |
458 | tn->bits = bits; | 360 | tn->bits = bits; |
459 | tn->key = key; | 361 | tn->key = (shift < KEYLENGTH) ? (key >> shift) << shift : 0; |
460 | tn->full_children = 0; | 362 | if (bits == KEYLENGTH) |
461 | tn->empty_children = 1<<bits; | 363 | tn->full_children = 1; |
364 | else | ||
365 | tn->empty_children = 1ul << bits; | ||
462 | } | 366 | } |
463 | 367 | ||
464 | pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode), | 368 | pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode), |
465 | sizeof(struct rt_trie_node *) << bits); | 369 | sizeof(struct tnode *) << bits); |
466 | return tn; | 370 | return tn; |
467 | } | 371 | } |
468 | 372 | ||
469 | /* | 373 | /* Check whether a tnode 'n' is "full", i.e. it is an internal node |
470 | * Check whether a tnode 'n' is "full", i.e. it is an internal node | ||
471 | * and no bits are skipped. See discussion in dyntree paper p. 6 | 374 | * and no bits are skipped. See discussion in dyntree paper p. 6 |
472 | */ | 375 | */ |
473 | 376 | static inline int tnode_full(const struct tnode *tn, const struct tnode *n) | |
474 | static inline int tnode_full(const struct tnode *tn, const struct rt_trie_node *n) | ||
475 | { | 377 | { |
476 | if (n == NULL || IS_LEAF(n)) | 378 | return n && ((n->pos + n->bits) == tn->pos) && IS_TNODE(n); |
477 | return 0; | ||
478 | |||
479 | return ((struct tnode *) n)->pos == tn->pos + tn->bits; | ||
480 | } | 379 | } |
481 | 380 | ||
482 | static inline void put_child(struct tnode *tn, int i, | 381 | /* Add a child at position i overwriting the old value. |
483 | struct rt_trie_node *n) | 382 | * Update the value of full_children and empty_children. |
484 | { | 383 | */ |
485 | tnode_put_child_reorg(tn, i, n, -1); | 384 | static void put_child(struct tnode *tn, unsigned long i, struct tnode *n) |
486 | } | ||
487 | |||
488 | /* | ||
489 | * Add a child at position i overwriting the old value. | ||
490 | * Update the value of full_children and empty_children. | ||
491 | */ | ||
492 | |||
493 | static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n, | ||
494 | int wasfull) | ||
495 | { | 385 | { |
496 | struct rt_trie_node *chi = rtnl_dereference(tn->child[i]); | 386 | struct tnode *chi = tnode_get_child(tn, i); |
497 | int isfull; | 387 | int isfull, wasfull; |
498 | 388 | ||
499 | BUG_ON(i >= 1<<tn->bits); | 389 | BUG_ON(i >= tnode_child_length(tn)); |
500 | 390 | ||
501 | /* update emptyChildren */ | 391 | /* update emptyChildren, overflow into fullChildren */ |
502 | if (n == NULL && chi != NULL) | 392 | if (n == NULL && chi != NULL) |
503 | tn->empty_children++; | 393 | empty_child_inc(tn); |
504 | else if (n != NULL && chi == NULL) | 394 | if (n != NULL && chi == NULL) |
505 | tn->empty_children--; | 395 | empty_child_dec(tn); |
506 | 396 | ||
507 | /* update fullChildren */ | 397 | /* update fullChildren */ |
508 | if (wasfull == -1) | 398 | wasfull = tnode_full(tn, chi); |
509 | wasfull = tnode_full(tn, chi); | ||
510 | |||
511 | isfull = tnode_full(tn, n); | 399 | isfull = tnode_full(tn, n); |
400 | |||
512 | if (wasfull && !isfull) | 401 | if (wasfull && !isfull) |
513 | tn->full_children--; | 402 | tn->full_children--; |
514 | else if (!wasfull && isfull) | 403 | else if (!wasfull && isfull) |
515 | tn->full_children++; | 404 | tn->full_children++; |
516 | 405 | ||
517 | if (n) | 406 | if (n && (tn->slen < n->slen)) |
518 | node_set_parent(n, tn); | 407 | tn->slen = n->slen; |
519 | 408 | ||
520 | rcu_assign_pointer(tn->child[i], n); | 409 | rcu_assign_pointer(tn->child[i], n); |
521 | } | 410 | } |
522 | 411 | ||
523 | #define MAX_WORK 10 | 412 | static void update_children(struct tnode *tn) |
524 | static struct rt_trie_node *resize(struct trie *t, struct tnode *tn) | ||
525 | { | 413 | { |
526 | int i; | 414 | unsigned long i; |
527 | struct tnode *old_tn; | ||
528 | int inflate_threshold_use; | ||
529 | int halve_threshold_use; | ||
530 | int max_work; | ||
531 | 415 | ||
532 | if (!tn) | 416 | /* update all of the child parent pointers */ |
533 | return NULL; | 417 | for (i = tnode_child_length(tn); i;) { |
418 | struct tnode *inode = tnode_get_child(tn, --i); | ||
534 | 419 | ||
535 | pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n", | 420 | if (!inode) |
536 | tn, inflate_threshold, halve_threshold); | 421 | continue; |
537 | 422 | ||
538 | /* No children */ | 423 | /* Either update the children of a tnode that |
539 | if (tn->empty_children == tnode_child_length(tn)) { | 424 | * already belongs to us or update the child |
540 | tnode_free_safe(tn); | 425 | * to point to ourselves. |
541 | return NULL; | 426 | */ |
427 | if (node_parent(inode) == tn) | ||
428 | update_children(inode); | ||
429 | else | ||
430 | node_set_parent(inode, tn); | ||
542 | } | 431 | } |
543 | /* One child */ | 432 | } |
544 | if (tn->empty_children == tnode_child_length(tn) - 1) | ||
545 | goto one_child; | ||
546 | /* | ||
547 | * Double as long as the resulting node has a number of | ||
548 | * nonempty nodes that are above the threshold. | ||
549 | */ | ||
550 | |||
551 | /* | ||
552 | * From "Implementing a dynamic compressed trie" by Stefan Nilsson of | ||
553 | * the Helsinki University of Technology and Matti Tikkanen of Nokia | ||
554 | * Telecommunications, page 6: | ||
555 | * "A node is doubled if the ratio of non-empty children to all | ||
556 | * children in the *doubled* node is at least 'high'." | ||
557 | * | ||
558 | * 'high' in this instance is the variable 'inflate_threshold'. It | ||
559 | * is expressed as a percentage, so we multiply it with | ||
560 | * tnode_child_length() and instead of multiplying by 2 (since the | ||
561 | * child array will be doubled by inflate()) and multiplying | ||
562 | * the left-hand side by 100 (to handle the percentage thing) we | ||
563 | * multiply the left-hand side by 50. | ||
564 | * | ||
565 | * The left-hand side may look a bit weird: tnode_child_length(tn) | ||
566 | * - tn->empty_children is of course the number of non-null children | ||
567 | * in the current node. tn->full_children is the number of "full" | ||
568 | * children, that is non-null tnodes with a skip value of 0. | ||
569 | * All of those will be doubled in the resulting inflated tnode, so | ||
570 | * we just count them one extra time here. | ||
571 | * | ||
572 | * A clearer way to write this would be: | ||
573 | * | ||
574 | * to_be_doubled = tn->full_children; | ||
575 | * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children - | ||
576 | * tn->full_children; | ||
577 | * | ||
578 | * new_child_length = tnode_child_length(tn) * 2; | ||
579 | * | ||
580 | * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) / | ||
581 | * new_child_length; | ||
582 | * if (new_fill_factor >= inflate_threshold) | ||
583 | * | ||
584 | * ...and so on, tho it would mess up the while () loop. | ||
585 | * | ||
586 | * anyway, | ||
587 | * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >= | ||
588 | * inflate_threshold | ||
589 | * | ||
590 | * avoid a division: | ||
591 | * 100 * (not_to_be_doubled + 2*to_be_doubled) >= | ||
592 | * inflate_threshold * new_child_length | ||
593 | * | ||
594 | * expand not_to_be_doubled and to_be_doubled, and shorten: | ||
595 | * 100 * (tnode_child_length(tn) - tn->empty_children + | ||
596 | * tn->full_children) >= inflate_threshold * new_child_length | ||
597 | * | ||
598 | * expand new_child_length: | ||
599 | * 100 * (tnode_child_length(tn) - tn->empty_children + | ||
600 | * tn->full_children) >= | ||
601 | * inflate_threshold * tnode_child_length(tn) * 2 | ||
602 | * | ||
603 | * shorten again: | ||
604 | * 50 * (tn->full_children + tnode_child_length(tn) - | ||
605 | * tn->empty_children) >= inflate_threshold * | ||
606 | * tnode_child_length(tn) | ||
607 | * | ||
608 | */ | ||
609 | 433 | ||
610 | check_tnode(tn); | 434 | static inline void put_child_root(struct tnode *tp, struct trie *t, |
435 | t_key key, struct tnode *n) | ||
436 | { | ||
437 | if (tp) | ||
438 | put_child(tp, get_index(key, tp), n); | ||
439 | else | ||
440 | rcu_assign_pointer(t->trie, n); | ||
441 | } | ||
611 | 442 | ||
612 | /* Keep root node larger */ | 443 | static inline void tnode_free_init(struct tnode *tn) |
444 | { | ||
445 | tn->rcu.next = NULL; | ||
446 | } | ||
613 | 447 | ||
614 | if (!node_parent((struct rt_trie_node *)tn)) { | 448 | static inline void tnode_free_append(struct tnode *tn, struct tnode *n) |
615 | inflate_threshold_use = inflate_threshold_root; | 449 | { |
616 | halve_threshold_use = halve_threshold_root; | 450 | n->rcu.next = tn->rcu.next; |
617 | } else { | 451 | tn->rcu.next = &n->rcu; |
618 | inflate_threshold_use = inflate_threshold; | 452 | } |
619 | halve_threshold_use = halve_threshold; | ||
620 | } | ||
621 | 453 | ||
622 | max_work = MAX_WORK; | 454 | static void tnode_free(struct tnode *tn) |
623 | while ((tn->full_children > 0 && max_work-- && | 455 | { |
624 | 50 * (tn->full_children + tnode_child_length(tn) | 456 | struct callback_head *head = &tn->rcu; |
625 | - tn->empty_children) | ||
626 | >= inflate_threshold_use * tnode_child_length(tn))) { | ||
627 | 457 | ||
628 | old_tn = tn; | 458 | while (head) { |
629 | tn = inflate(t, tn); | 459 | head = head->next; |
460 | tnode_free_size += offsetof(struct tnode, child[1 << tn->bits]); | ||
461 | node_free(tn); | ||
630 | 462 | ||
631 | if (IS_ERR(tn)) { | 463 | tn = container_of(head, struct tnode, rcu); |
632 | tn = old_tn; | ||
633 | #ifdef CONFIG_IP_FIB_TRIE_STATS | ||
634 | t->stats.resize_node_skipped++; | ||
635 | #endif | ||
636 | break; | ||
637 | } | ||
638 | } | 464 | } |
639 | 465 | ||
640 | check_tnode(tn); | 466 | if (tnode_free_size >= PAGE_SIZE * sync_pages) { |
641 | 467 | tnode_free_size = 0; | |
642 | /* Return if at least one inflate is run */ | 468 | synchronize_rcu(); |
643 | if (max_work != MAX_WORK) | ||
644 | return (struct rt_trie_node *) tn; | ||
645 | |||
646 | /* | ||
647 | * Halve as long as the number of empty children in this | ||
648 | * node is above threshold. | ||
649 | */ | ||
650 | |||
651 | max_work = MAX_WORK; | ||
652 | while (tn->bits > 1 && max_work-- && | ||
653 | 100 * (tnode_child_length(tn) - tn->empty_children) < | ||
654 | halve_threshold_use * tnode_child_length(tn)) { | ||
655 | |||
656 | old_tn = tn; | ||
657 | tn = halve(t, tn); | ||
658 | if (IS_ERR(tn)) { | ||
659 | tn = old_tn; | ||
660 | #ifdef CONFIG_IP_FIB_TRIE_STATS | ||
661 | t->stats.resize_node_skipped++; | ||
662 | #endif | ||
663 | break; | ||
664 | } | ||
665 | } | 469 | } |
470 | } | ||
666 | 471 | ||
472 | static void replace(struct trie *t, struct tnode *oldtnode, struct tnode *tn) | ||
473 | { | ||
474 | struct tnode *tp = node_parent(oldtnode); | ||
475 | unsigned long i; | ||
667 | 476 | ||
668 | /* Only one child remains */ | 477 | /* setup the parent pointer out of and back into this node */ |
669 | if (tn->empty_children == tnode_child_length(tn) - 1) { | 478 | NODE_INIT_PARENT(tn, tp); |
670 | one_child: | 479 | put_child_root(tp, t, tn->key, tn); |
671 | for (i = 0; i < tnode_child_length(tn); i++) { | ||
672 | struct rt_trie_node *n; | ||
673 | |||
674 | n = rtnl_dereference(tn->child[i]); | ||
675 | if (!n) | ||
676 | continue; | ||
677 | |||
678 | /* compress one level */ | ||
679 | 480 | ||
680 | node_set_parent(n, NULL); | 481 | /* update all of the child parent pointers */ |
681 | tnode_free_safe(tn); | 482 | update_children(tn); |
682 | return n; | ||
683 | } | ||
684 | } | ||
685 | return (struct rt_trie_node *) tn; | ||
686 | } | ||
687 | 483 | ||
484 | /* all pointers should be clean so we are done */ | ||
485 | tnode_free(oldtnode); | ||
688 | 486 | ||
689 | static void tnode_clean_free(struct tnode *tn) | 487 | /* resize children now that oldtnode is freed */ |
690 | { | 488 | for (i = tnode_child_length(tn); i;) { |
691 | int i; | 489 | struct tnode *inode = tnode_get_child(tn, --i); |
692 | struct tnode *tofree; | ||
693 | 490 | ||
694 | for (i = 0; i < tnode_child_length(tn); i++) { | 491 | /* resize child node */ |
695 | tofree = (struct tnode *)rtnl_dereference(tn->child[i]); | 492 | if (tnode_full(tn, inode)) |
696 | if (tofree) | 493 | resize(t, inode); |
697 | tnode_free(tofree); | ||
698 | } | 494 | } |
699 | tnode_free(tn); | ||
700 | } | 495 | } |
701 | 496 | ||
702 | static struct tnode *inflate(struct trie *t, struct tnode *tn) | 497 | static int inflate(struct trie *t, struct tnode *oldtnode) |
703 | { | 498 | { |
704 | struct tnode *oldtnode = tn; | 499 | struct tnode *tn; |
705 | int olen = tnode_child_length(tn); | 500 | unsigned long i; |
706 | int i; | 501 | t_key m; |
707 | 502 | ||
708 | pr_debug("In inflate\n"); | 503 | pr_debug("In inflate\n"); |
709 | 504 | ||
710 | tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1); | 505 | tn = tnode_new(oldtnode->key, oldtnode->pos - 1, oldtnode->bits + 1); |
711 | |||
712 | if (!tn) | 506 | if (!tn) |
713 | return ERR_PTR(-ENOMEM); | 507 | return -ENOMEM; |
714 | |||
715 | /* | ||
716 | * Preallocate and store tnodes before the actual work so we | ||
717 | * don't get into an inconsistent state if memory allocation | ||
718 | * fails. In case of failure we return the oldnode and inflate | ||
719 | * of tnode is ignored. | ||
720 | */ | ||
721 | |||
722 | for (i = 0; i < olen; i++) { | ||
723 | struct tnode *inode; | ||
724 | |||
725 | inode = (struct tnode *) tnode_get_child(oldtnode, i); | ||
726 | if (inode && | ||
727 | IS_TNODE(inode) && | ||
728 | inode->pos == oldtnode->pos + oldtnode->bits && | ||
729 | inode->bits > 1) { | ||
730 | struct tnode *left, *right; | ||
731 | t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos; | ||
732 | |||
733 | left = tnode_new(inode->key&(~m), inode->pos + 1, | ||
734 | inode->bits - 1); | ||
735 | if (!left) | ||
736 | goto nomem; | ||
737 | |||
738 | right = tnode_new(inode->key|m, inode->pos + 1, | ||
739 | inode->bits - 1); | ||
740 | |||
741 | if (!right) { | ||
742 | tnode_free(left); | ||
743 | goto nomem; | ||
744 | } | ||
745 | 508 | ||
746 | put_child(tn, 2*i, (struct rt_trie_node *) left); | 509 | /* prepare oldtnode to be freed */ |
747 | put_child(tn, 2*i+1, (struct rt_trie_node *) right); | 510 | tnode_free_init(oldtnode); |
748 | } | ||
749 | } | ||
750 | 511 | ||
751 | for (i = 0; i < olen; i++) { | 512 | /* Assemble all of the pointers in our cluster, in this case that |
752 | struct tnode *inode; | 513 | * represents all of the pointers out of our allocated nodes that |
753 | struct rt_trie_node *node = tnode_get_child(oldtnode, i); | 514 | * point to existing tnodes and the links between our allocated |
754 | struct tnode *left, *right; | 515 | * nodes. |
755 | int size, j; | 516 | */ |
517 | for (i = tnode_child_length(oldtnode), m = 1u << tn->pos; i;) { | ||
518 | struct tnode *inode = tnode_get_child(oldtnode, --i); | ||
519 | struct tnode *node0, *node1; | ||
520 | unsigned long j, k; | ||
756 | 521 | ||
757 | /* An empty child */ | 522 | /* An empty child */ |
758 | if (node == NULL) | 523 | if (inode == NULL) |
759 | continue; | 524 | continue; |
760 | 525 | ||
761 | /* A leaf or an internal node with skipped bits */ | 526 | /* A leaf or an internal node with skipped bits */ |
762 | 527 | if (!tnode_full(oldtnode, inode)) { | |
763 | if (IS_LEAF(node) || ((struct tnode *) node)->pos > | 528 | put_child(tn, get_index(inode->key, tn), inode); |
764 | tn->pos + tn->bits - 1) { | ||
765 | put_child(tn, | ||
766 | tkey_extract_bits(node->key, oldtnode->pos, oldtnode->bits + 1), | ||
767 | node); | ||
768 | continue; | 529 | continue; |
769 | } | 530 | } |
770 | 531 | ||
771 | /* An internal node with two children */ | 532 | /* drop the node in the old tnode free list */ |
772 | inode = (struct tnode *) node; | 533 | tnode_free_append(oldtnode, inode); |
773 | 534 | ||
535 | /* An internal node with two children */ | ||
774 | if (inode->bits == 1) { | 536 | if (inode->bits == 1) { |
775 | put_child(tn, 2*i, rtnl_dereference(inode->child[0])); | 537 | put_child(tn, 2 * i + 1, tnode_get_child(inode, 1)); |
776 | put_child(tn, 2*i+1, rtnl_dereference(inode->child[1])); | 538 | put_child(tn, 2 * i, tnode_get_child(inode, 0)); |
777 | |||
778 | tnode_free_safe(inode); | ||
779 | continue; | 539 | continue; |
780 | } | 540 | } |
781 | 541 | ||
782 | /* An internal node with more than two children */ | ||
783 | |||
784 | /* We will replace this node 'inode' with two new | 542 | /* We will replace this node 'inode' with two new |
785 | * ones, 'left' and 'right', each with half of the | 543 | * ones, 'node0' and 'node1', each with half of the |
786 | * original children. The two new nodes will have | 544 | * original children. The two new nodes will have |
787 | * a position one bit further down the key and this | 545 | * a position one bit further down the key and this |
788 | * means that the "significant" part of their keys | 546 | * means that the "significant" part of their keys |
789 | * (see the discussion near the top of this file) | 547 | * (see the discussion near the top of this file) |
790 | * will differ by one bit, which will be "0" in | 548 | * will differ by one bit, which will be "0" in |
791 | * left's key and "1" in right's key. Since we are | 549 | * node0's key and "1" in node1's key. Since we are |
792 | * moving the key position by one step, the bit that | 550 | * moving the key position by one step, the bit that |
793 | * we are moving away from - the bit at position | 551 | * we are moving away from - the bit at position |
794 | * (inode->pos) - is the one that will differ between | 552 | * (tn->pos) - is the one that will differ between |
795 | * left and right. So... we synthesize that bit in the | 553 | * node0 and node1. So... we synthesize that bit in the |
796 | * two new keys. | 554 | * two new keys. |
797 | * The mask 'm' below will be a single "one" bit at | ||
798 | * the position (inode->pos) | ||
799 | */ | 555 | */ |
556 | node1 = tnode_new(inode->key | m, inode->pos, inode->bits - 1); | ||
557 | if (!node1) | ||
558 | goto nomem; | ||
559 | node0 = tnode_new(inode->key, inode->pos, inode->bits - 1); | ||
560 | |||
561 | tnode_free_append(tn, node1); | ||
562 | if (!node0) | ||
563 | goto nomem; | ||
564 | tnode_free_append(tn, node0); | ||
565 | |||
566 | /* populate child pointers in new nodes */ | ||
567 | for (k = tnode_child_length(inode), j = k / 2; j;) { | ||
568 | put_child(node1, --j, tnode_get_child(inode, --k)); | ||
569 | put_child(node0, j, tnode_get_child(inode, j)); | ||
570 | put_child(node1, --j, tnode_get_child(inode, --k)); | ||
571 | put_child(node0, j, tnode_get_child(inode, j)); | ||
572 | } | ||
800 | 573 | ||
801 | /* Use the old key, but set the new significant | 574 | /* link new nodes to parent */ |
802 | * bit to zero. | 575 | NODE_INIT_PARENT(node1, tn); |
803 | */ | 576 | NODE_INIT_PARENT(node0, tn); |
577 | |||
578 | /* link parent to nodes */ | ||
579 | put_child(tn, 2 * i + 1, node1); | ||
580 | put_child(tn, 2 * i, node0); | ||
581 | } | ||
582 | |||
583 | /* setup the parent pointers into and out of this node */ | ||
584 | replace(t, oldtnode, tn); | ||
585 | |||
586 | return 0; | ||
587 | nomem: | ||
588 | /* all pointers should be clean so we are done */ | ||
589 | tnode_free(tn); | ||
590 | return -ENOMEM; | ||
591 | } | ||
592 | |||
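The key arithmetic used when inflate() splits a full child is easier to follow with concrete values. The sketch below is a stand-alone user-space illustration, not the kernel code: toy_tnode and split_full_child() are invented names, and the absorbed bit is derived from the child's own pos/bits rather than from the mask m defined earlier in inflate() (not visible in this hunk); for a full child the two are the same bit.

#include <stdio.h>
#include <stdint.h>

typedef uint32_t t_key;

/* toy stand-in for the kernel tnode: only the fields the split needs */
struct toy_tnode {
	t_key key;
	unsigned char pos;	/* lowest bit covered by this node's index */
	unsigned char bits;	/* number of index bits */
};

/* Split 'inode' the way inflate() does: node0 keeps the old key, node1
 * gets the same key with the absorbed bit set, and both cover one index
 * bit less than the original child. */
static void split_full_child(const struct toy_tnode *inode,
			     struct toy_tnode *node0, struct toy_tnode *node1)
{
	/* for a full child this is the bit the parent absorbs */
	t_key m = (t_key)1 << (inode->pos + inode->bits - 1);

	node0->key = inode->key;
	node1->key = inode->key | m;
	node0->pos = node1->pos = inode->pos;
	node0->bits = node1->bits = inode->bits - 1;
}

int main(void)
{
	struct toy_tnode inode = { .key = 0xc0a80000, .pos = 13, .bits = 3 };
	struct toy_tnode n0, n1;

	split_full_child(&inode, &n0, &n1);
	printf("node0 key=%08x pos=%u bits=%u\n", n0.key, n0.pos, n0.bits);
	printf("node1 key=%08x pos=%u bits=%u\n", n1.key, n1.pos, n1.bits);
	/* node0 key=c0a80000, node1 key=c0a88000: they differ only in bit 15 */
	return 0;
}

Running it shows node0 keeping the original key and node1 getting the same key with the absorbed bit set, each indexing one bit fewer than the original child, which is exactly what the loop above relies on when it hands the lower and upper halves of the child array to node0 and node1.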
593 | static int halve(struct trie *t, struct tnode *oldtnode) | ||
594 | { | ||
595 | struct tnode *tn; | ||
596 | unsigned long i; | ||
597 | |||
598 | pr_debug("In halve\n"); | ||
804 | 599 | ||
805 | left = (struct tnode *) tnode_get_child(tn, 2*i); | 600 | tn = tnode_new(oldtnode->key, oldtnode->pos + 1, oldtnode->bits - 1); |
806 | put_child(tn, 2*i, NULL); | 601 | if (!tn) |
602 | return -ENOMEM; | ||
807 | 603 | ||
808 | BUG_ON(!left); | 604 | /* prepare oldtnode to be freed */ |
605 | tnode_free_init(oldtnode); | ||
809 | 606 | ||
810 | right = (struct tnode *) tnode_get_child(tn, 2*i+1); | 607 | /* Assemble all of the pointers in our cluster, in this case that |
811 | put_child(tn, 2*i+1, NULL); | 608 | * represents all of the pointers out of our allocated nodes that |
609 | * point to existing tnodes and the links between our allocated | ||
610 | * nodes. | ||
611 | */ | ||
612 | for (i = tnode_child_length(oldtnode); i;) { | ||
613 | struct tnode *node1 = tnode_get_child(oldtnode, --i); | ||
614 | struct tnode *node0 = tnode_get_child(oldtnode, --i); | ||
615 | struct tnode *inode; | ||
812 | 616 | ||
813 | BUG_ON(!right); | 617 | /* At least one of the children is empty */ |
618 | if (!node1 || !node0) { | ||
619 | put_child(tn, i / 2, node1 ? : node0); | ||
620 | continue; | ||
621 | } | ||
814 | 622 | ||
815 | size = tnode_child_length(left); | 623 | /* Two nonempty children */ |
816 | for (j = 0; j < size; j++) { | 624 | inode = tnode_new(node0->key, oldtnode->pos, 1); |
817 | put_child(left, j, rtnl_dereference(inode->child[j])); | 625 | if (!inode) { |
818 | put_child(right, j, rtnl_dereference(inode->child[j + size])); | 626 | tnode_free(tn); |
627 | return -ENOMEM; | ||
819 | } | 628 | } |
820 | put_child(tn, 2*i, resize(t, left)); | 629 | tnode_free_append(tn, inode); |
821 | put_child(tn, 2*i+1, resize(t, right)); | 630 | |
631 | /* initialize pointers out of node */ | ||
632 | put_child(inode, 1, node1); | ||
633 | put_child(inode, 0, node0); | ||
634 | NODE_INIT_PARENT(inode, tn); | ||
822 | 635 | ||
823 | tnode_free_safe(inode); | 636 | /* link parent to node */ |
637 | put_child(tn, i / 2, inode); | ||
824 | } | 638 | } |
825 | tnode_free_safe(oldtnode); | 639 | |
826 | return tn; | 640 | /* setup the parent pointers into and out of this node */ |
827 | nomem: | 641 | replace(t, oldtnode, tn); |
828 | tnode_clean_free(tn); | 642 | |
829 | return ERR_PTR(-ENOMEM); | 643 | return 0; |
830 | } | 644 | } |
831 | 645 | ||
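halve() is the mirror image: every pair of slots in the old node collapses into one slot of the new node, and only pairs where both children exist need a freshly allocated one-bit inner node. Below is a minimal user-space sketch of that pairing rule; toy_node and halve_children() are invented names and allocation failure is ignored, unlike the real code which frees tn and returns -ENOMEM.

#include <stdio.h>
#include <stdlib.h>

/* toy stand-in: only a label, since the point is the pairing logic */
struct toy_node {
	char name[8];
};

/* Pair up children the way halve() does: slot i/2 of the new array gets
 * the only nonempty child of the pair, or a new binary parent holding
 * both when both are nonempty. */
static void halve_children(struct toy_node **old_children, unsigned long n,
			   struct toy_node **new_children)
{
	unsigned long i;

	for (i = 0; i < n; i += 2) {
		struct toy_node *node0 = old_children[i];
		struct toy_node *node1 = old_children[i + 1];
		struct toy_node *inode;

		if (!node0 || !node1) {
			new_children[i / 2] = node0 ? node0 : node1;
			continue;
		}

		/* both nonempty: allocate a one-bit inner node for the pair */
		inode = malloc(sizeof(*inode));
		snprintf(inode->name, sizeof(inode->name), "(%s%s)",
			 node0->name, node1->name);
		new_children[i / 2] = inode;
	}
}

int main(void)
{
	struct toy_node a = { "A" }, b = { "B" }, c = { "C" };
	struct toy_node *old_children[4] = { &a, &b, NULL, &c };
	struct toy_node *new_children[2] = { NULL };
	unsigned long i;

	halve_children(old_children, 4, new_children);
	for (i = 0; i < 2; i++)
		printf("slot %lu: %s\n", i,
		       new_children[i] ? new_children[i]->name : "empty");
	/* prints: slot 0: (AB)  then  slot 1: C */
	return 0;
}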
832 | static struct tnode *halve(struct trie *t, struct tnode *tn) | 646 | static void collapse(struct trie *t, struct tnode *oldtnode) |
833 | { | 647 | { |
834 | struct tnode *oldtnode = tn; | 648 | struct tnode *n, *tp; |
835 | struct rt_trie_node *left, *right; | 649 | unsigned long i; |
836 | int i; | ||
837 | int olen = tnode_child_length(tn); | ||
838 | 650 | ||
839 | pr_debug("In halve\n"); | 651 | /* scan the tnode looking for that one child that might still exist */ |
652 | for (n = NULL, i = tnode_child_length(oldtnode); !n && i;) | ||
653 | n = tnode_get_child(oldtnode, --i); | ||
840 | 654 | ||
841 | tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1); | 655 | /* compress one level */ |
656 | tp = node_parent(oldtnode); | ||
657 | put_child_root(tp, t, oldtnode->key, n); | ||
658 | node_set_parent(n, tp); | ||
842 | 659 | ||
843 | if (!tn) | 660 | /* drop dead node */ |
844 | return ERR_PTR(-ENOMEM); | 661 | node_free(oldtnode); |
662 | } | ||
845 | 663 | ||
846 | /* | 664 | static unsigned char update_suffix(struct tnode *tn) |
847 | * Preallocate and store tnodes before the actual work so we | 665 | { |
848 | * don't get into an inconsistent state if memory allocation | 666 | unsigned char slen = tn->pos; |
849 | * fails. In case of failure we return the oldnode and the halving | 667 | unsigned long stride, i; |
850 | * of the tnode is ignored. | 668 | |
669 | /* search through the list of children looking for nodes that might | ||
670 | * have a suffix greater than the one we currently have. This is | ||
671 | * why we start with a stride of 2 since a stride of 1 would | ||
672 | * represent the nodes with suffix length equal to tn->pos | ||
851 | */ | 673 | */ |
674 | for (i = 0, stride = 0x2ul ; i < tnode_child_length(tn); i += stride) { | ||
675 | struct tnode *n = tnode_get_child(tn, i); | ||
852 | 676 | ||
853 | for (i = 0; i < olen; i += 2) { | 677 | if (!n || (n->slen <= slen)) |
854 | left = tnode_get_child(oldtnode, i); | 678 | continue; |
855 | right = tnode_get_child(oldtnode, i+1); | ||
856 | 679 | ||
857 | /* Two nonempty children */ | 680 | /* update stride and slen based on new value */ |
858 | if (left && right) { | 681 | stride <<= (n->slen - slen); |
859 | struct tnode *newn; | 682 | slen = n->slen; |
683 | i &= ~(stride - 1); | ||
860 | 684 | ||
861 | newn = tnode_new(left->key, tn->pos + tn->bits, 1); | 685 | /* if slen covers all but the last bit we can stop here |
686 | * there will be nothing longer than that since only node | ||
687 | * 0 and 1 << (bits - 1) could have that as their suffix | ||
688 | * length. | ||
689 | */ | ||
690 | if ((slen + 1) >= (tn->pos + tn->bits)) | ||
691 | break; | ||
692 | } | ||
862 | 693 | ||
863 | if (!newn) | 694 | tn->slen = slen; |
864 | goto nomem; | ||
865 | 695 | ||
866 | put_child(tn, i/2, (struct rt_trie_node *)newn); | 696 | return slen; |
867 | } | 697 | } |
868 | 698 | ||
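The stride trick in update_suffix() relies on where long suffixes can live: because routes are stored with their host bits cleared, a prefix whose suffix length s exceeds the node's pos can only hang off child indices that are multiples of 2^(s - pos), which is why the scan may grow its stride as slen grows. The stand-alone sketch below mimics the loop on that assumption; scan_suffix() is an invented name and the per-child slen values are supplied as a plain array (0 meaning an empty slot).

#include <stdio.h>

#define KEYLENGTH 32

/* User-space sketch of the stride-doubling scan in update_suffix().
 * child_slen[i] stands for the slen of child i (0 = empty); pos and
 * bits play the same role as tn->pos and tn->bits above.  Only child
 * indices that are multiples of 2^(s + 1 - pos) can hold a suffix
 * longer than s, which is what lets the stride grow as slen grows. */
static unsigned char scan_suffix(const unsigned char *child_slen,
				 unsigned char pos, unsigned char bits)
{
	unsigned char slen = pos;
	unsigned long stride, i;

	for (i = 0, stride = 0x2ul; i < (1ul << bits); i += stride) {
		if (!child_slen[i] || child_slen[i] <= slen)
			continue;

		/* update stride and slen, then realign i to the new stride */
		stride <<= (child_slen[i] - slen);
		slen = child_slen[i];
		i &= ~(stride - 1);

		/* nothing longer can exist past this point */
		if ((slen + 1) >= (pos + bits))
			break;
	}

	return slen;
}

int main(void)
{
	/* 8 children of a node with pos = 8, bits = 3; child 0 holds the
	 * shortest prefix (largest suffix), child 4 a medium one. */
	unsigned char child_slen[8] = { 16, 8, 0, 8, 12, 0, 8, 8 };

	printf("slen = %u\n", scan_suffix(child_slen, 8, 3)); /* prints 16 */
	return 0;
}

With this sample array the very first child already holds the longest suffix, so after one update the early-exit test ends the scan without visiting the remaining slots.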
869 | } | 699 | /* From "Implementing a dynamic compressed trie" by Stefan Nilsson of |
700 | * the Helsinki University of Technology and Matti Tikkanen of Nokia | ||
701 | * Telecommunications, page 6: | ||
702 | * "A node is doubled if the ratio of non-empty children to all | ||
703 | * children in the *doubled* node is at least 'high'." | ||
704 | * | ||
705 | * 'high' in this instance is the variable 'inflate_threshold'. It | ||
706 | * is expressed as a percentage, so we multiply it with | ||
707 | * tnode_child_length() and instead of multiplying by 2 (since the | ||
708 | * child array will be doubled by inflate()) and multiplying | ||
709 | * the left-hand side by 100 (to handle the percentage thing) we | ||
710 | * multiply the left-hand side by 50. | ||
711 | * | ||
712 | * The left-hand side may look a bit weird: tnode_child_length(tn) | ||
713 | * - tn->empty_children is of course the number of non-null children | ||
714 | * in the current node. tn->full_children is the number of "full" | ||
715 | * children, that is non-null tnodes with a skip value of 0. | ||
716 | * All of those will be doubled in the resulting inflated tnode, so | ||
717 | * we just count them one extra time here. | ||
718 | * | ||
719 | * A clearer way to write this would be: | ||
720 | * | ||
721 | * to_be_doubled = tn->full_children; | ||
722 | * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children - | ||
723 | * tn->full_children; | ||
724 | * | ||
725 | * new_child_length = tnode_child_length(tn) * 2; | ||
726 | * | ||
727 | * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) / | ||
728 | * new_child_length; | ||
729 | * if (new_fill_factor >= inflate_threshold) | ||
730 | * | ||
731 | * ...and so on, tho it would mess up the while () loop. | ||
732 | * | ||
733 | * anyway, | ||
734 | * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >= | ||
735 | * inflate_threshold | ||
736 | * | ||
737 | * avoid a division: | ||
738 | * 100 * (not_to_be_doubled + 2*to_be_doubled) >= | ||
739 | * inflate_threshold * new_child_length | ||
740 | * | ||
741 | * expand not_to_be_doubled and to_be_doubled, and shorten: | ||
742 | * 100 * (tnode_child_length(tn) - tn->empty_children + | ||
743 | * tn->full_children) >= inflate_threshold * new_child_length | ||
744 | * | ||
745 | * expand new_child_length: | ||
746 | * 100 * (tnode_child_length(tn) - tn->empty_children + | ||
747 | * tn->full_children) >= | ||
748 | * inflate_threshold * tnode_child_length(tn) * 2 | ||
749 | * | ||
750 | * shorten again: | ||
751 | * 50 * (tn->full_children + tnode_child_length(tn) - | ||
752 | * tn->empty_children) >= inflate_threshold * | ||
753 | * tnode_child_length(tn) | ||
754 | * | ||
755 | */ | ||
756 | static bool should_inflate(const struct tnode *tp, const struct tnode *tn) | ||
757 | { | ||
758 | unsigned long used = tnode_child_length(tn); | ||
759 | unsigned long threshold = used; | ||
870 | 760 | ||
871 | for (i = 0; i < olen; i += 2) { | 761 | /* Keep root node larger */ |
872 | struct tnode *newBinNode; | 762 | threshold *= tp ? inflate_threshold : inflate_threshold_root; |
763 | used -= tn->empty_children; | ||
764 | used += tn->full_children; | ||
873 | 765 | ||
874 | left = tnode_get_child(oldtnode, i); | 766 | /* if bits == KEYLENGTH then pos = 0, and will fail below */ |
875 | right = tnode_get_child(oldtnode, i+1); | ||
876 | 767 | ||
877 | /* At least one of the children is empty */ | 768 | return (used > 1) && tn->pos && ((50 * used) >= threshold); |
878 | if (left == NULL) { | 769 | } |
879 | if (right == NULL) /* Both are empty */ | 770 | |
880 | continue; | 771 | static bool should_halve(const struct tnode *tp, const struct tnode *tn) |
881 | put_child(tn, i/2, right); | 772 | { |
882 | continue; | 773 | unsigned long used = tnode_child_length(tn); |
774 | unsigned long threshold = used; | ||
775 | |||
776 | /* Keep root node larger */ | ||
777 | threshold *= tp ? halve_threshold : halve_threshold_root; | ||
778 | used -= tn->empty_children; | ||
779 | |||
780 | /* if bits == KEYLENGTH then used = 100% on wrap, and will fail below */ | ||
781 | |||
782 | return (used > 1) && (tn->bits > 1) && ((100 * used) < threshold); | ||
783 | } | ||
784 | |||
785 | static bool should_collapse(const struct tnode *tn) | ||
786 | { | ||
787 | unsigned long used = tnode_child_length(tn); | ||
788 | |||
789 | used -= tn->empty_children; | ||
790 | |||
791 | /* account for bits == KEYLENGTH case */ | ||
792 | if ((tn->bits == KEYLENGTH) && tn->full_children) | ||
793 | used -= KEY_MAX; | ||
794 | |||
795 | /* One child or none, time to drop us from the trie */ | ||
796 | return used < 2; | ||
797 | } | ||
798 | |||
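The inequality derived in the long comment above can be sanity-checked with plain numbers. The sketch below re-implements the two predicates in user space on a toy structure; toy_tnode and the toy_* helpers are invented, and the constants 50 and 25 are assumed to be the usual non-root values of inflate_threshold and halve_threshold, which are defined elsewhere in this file.

#include <stdbool.h>
#include <stdio.h>

/* toy counters mirroring the fields used by should_inflate()/should_halve() */
struct toy_tnode {
	unsigned char pos;
	unsigned char bits;
	unsigned long empty_children;
	unsigned long full_children;
};

static bool toy_should_inflate(const struct toy_tnode *tn)
{
	unsigned long used = 1ul << tn->bits;	/* child slots */
	unsigned long threshold = used * 50;	/* inflate_threshold */

	used -= tn->empty_children;
	used += tn->full_children;		/* these double on inflate */

	return (used > 1) && tn->pos && ((50 * used) >= threshold);
}

static bool toy_should_halve(const struct toy_tnode *tn)
{
	unsigned long used = 1ul << tn->bits;
	unsigned long threshold = used * 25;	/* halve_threshold */

	used -= tn->empty_children;

	return (used > 1) && (tn->bits > 1) && ((100 * used) < threshold);
}

int main(void)
{
	/* 16 slots, 6 empty, 4 full: 50*(16-6+4)=700 < 50*16=800 -> keep   */
	struct toy_tnode a = { .pos = 4, .bits = 4, .empty_children = 6,  .full_children = 4 };
	/* 16 slots, 2 empty, 4 full: 50*(16-2+4)=900 >= 800     -> inflate */
	struct toy_tnode b = { .pos = 4, .bits = 4, .empty_children = 2,  .full_children = 4 };
	/* 16 slots, 13 empty:        100*3=300 < 25*16=400      -> halve   */
	struct toy_tnode c = { .pos = 4, .bits = 4, .empty_children = 13, .full_children = 0 };

	printf("a: inflate=%d halve=%d\n", toy_should_inflate(&a), toy_should_halve(&a));
	printf("b: inflate=%d halve=%d\n", toy_should_inflate(&b), toy_should_halve(&b));
	printf("c: inflate=%d halve=%d\n", toy_should_inflate(&c), toy_should_halve(&c));
	return 0;
}

Node b crosses the bound precisely because its four full children are counted twice, matching the "to_be_doubled" term in the derivation: 100 * (10 + 2*4) / 32 = 56.25 >= 50.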
799 | #define MAX_WORK 10 | ||
800 | static void resize(struct trie *t, struct tnode *tn) | ||
801 | { | ||
802 | struct tnode *tp = node_parent(tn); | ||
803 | struct tnode __rcu **cptr; | ||
804 | int max_work = MAX_WORK; | ||
805 | |||
806 | pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n", | ||
807 | tn, inflate_threshold, halve_threshold); | ||
808 | |||
809 | /* track the tnode via the pointer from the parent instead of | ||
810 | * doing it ourselves. This way we can let RCU fully do its | ||
811 | * thing without us interfering | ||
812 | */ | ||
813 | cptr = tp ? &tp->child[get_index(tn->key, tp)] : &t->trie; | ||
814 | BUG_ON(tn != rtnl_dereference(*cptr)); | ||
815 | |||
816 | /* Double as long as the resulting node has a number of | ||
817 | * nonempty nodes that are above the threshold. | ||
818 | */ | ||
819 | while (should_inflate(tp, tn) && max_work) { | ||
820 | if (inflate(t, tn)) { | ||
821 | #ifdef CONFIG_IP_FIB_TRIE_STATS | ||
822 | this_cpu_inc(t->stats->resize_node_skipped); | ||
823 | #endif | ||
824 | break; | ||
883 | } | 825 | } |
884 | 826 | ||
885 | if (right == NULL) { | 827 | max_work--; |
886 | put_child(tn, i/2, left); | 828 | tn = rtnl_dereference(*cptr); |
887 | continue; | 829 | } |
830 | |||
831 | /* Return if at least one inflate is run */ | ||
832 | if (max_work != MAX_WORK) | ||
833 | return; | ||
834 | |||
835 | /* Halve as long as the number of empty children in this | ||
836 | * node is above threshold. | ||
837 | */ | ||
838 | while (should_halve(tp, tn) && max_work) { | ||
839 | if (halve(t, tn)) { | ||
840 | #ifdef CONFIG_IP_FIB_TRIE_STATS | ||
841 | this_cpu_inc(t->stats->resize_node_skipped); | ||
842 | #endif | ||
843 | break; | ||
888 | } | 844 | } |
889 | 845 | ||
890 | /* Two nonempty children */ | 846 | max_work--; |
891 | newBinNode = (struct tnode *) tnode_get_child(tn, i/2); | 847 | tn = rtnl_dereference(*cptr); |
892 | put_child(tn, i/2, NULL); | 848 | } |
893 | put_child(newBinNode, 0, left); | 849 | |
894 | put_child(newBinNode, 1, right); | 850 | /* Only one child remains */ |
895 | put_child(tn, i/2, resize(t, newBinNode)); | 851 | if (should_collapse(tn)) { |
852 | collapse(t, tn); | ||
853 | return; | ||
854 | } | ||
855 | |||
856 | /* Return if at least one deflate was run */ | ||
857 | if (max_work != MAX_WORK) | ||
858 | return; | ||
859 | |||
860 | /* push the suffix length to the parent node */ | ||
861 | if (tn->slen > tn->pos) { | ||
862 | unsigned char slen = update_suffix(tn); | ||
863 | |||
864 | if (tp && (slen > tp->slen)) | ||
865 | tp->slen = slen; | ||
896 | } | 866 | } |
897 | tnode_free_safe(oldtnode); | ||
898 | return tn; | ||
899 | nomem: | ||
900 | tnode_clean_free(tn); | ||
901 | return ERR_PTR(-ENOMEM); | ||
902 | } | 867 | } |
903 | 868 | ||
904 | /* readside must use rcu_read_lock; currently the dump routines | 869 | /* readside must use rcu_read_lock; currently the dump routines |
905 | via get_fa_head and dump */ | 870 | via get_fa_head and dump */ |
906 | 871 | ||
907 | static struct leaf_info *find_leaf_info(struct leaf *l, int plen) | 872 | static struct leaf_info *find_leaf_info(struct tnode *l, int plen) |
908 | { | 873 | { |
909 | struct hlist_head *head = &l->list; | 874 | struct hlist_head *head = &l->list; |
910 | struct leaf_info *li; | 875 | struct leaf_info *li; |
@@ -916,7 +881,7 @@ static struct leaf_info *find_leaf_info(struct leaf *l, int plen) | |||
916 | return NULL; | 881 | return NULL; |
917 | } | 882 | } |
918 | 883 | ||
919 | static inline struct list_head *get_fa_head(struct leaf *l, int plen) | 884 | static inline struct list_head *get_fa_head(struct tnode *l, int plen) |
920 | { | 885 | { |
921 | struct leaf_info *li = find_leaf_info(l, plen); | 886 | struct leaf_info *li = find_leaf_info(l, plen); |
922 | 887 | ||
@@ -926,8 +891,51 @@ static inline struct list_head *get_fa_head(struct leaf *l, int plen) | |||
926 | return &li->falh; | 891 | return &li->falh; |
927 | } | 892 | } |
928 | 893 | ||
929 | static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new) | 894 | static void leaf_pull_suffix(struct tnode *l) |
895 | { | ||
896 | struct tnode *tp = node_parent(l); | ||
897 | |||
898 | while (tp && (tp->slen > tp->pos) && (tp->slen > l->slen)) { | ||
899 | if (update_suffix(tp) > l->slen) | ||
900 | break; | ||
901 | tp = node_parent(tp); | ||
902 | } | ||
903 | } | ||
904 | |||
905 | static void leaf_push_suffix(struct tnode *l) | ||
906 | { | ||
907 | struct tnode *tn = node_parent(l); | ||
908 | |||
909 | /* if this is a new leaf then tn will be NULL and we can sort | ||
910 | * out parent suffix lengths as a part of trie_rebalance | ||
911 | */ | ||
912 | while (tn && (tn->slen < l->slen)) { | ||
913 | tn->slen = l->slen; | ||
914 | tn = node_parent(tn); | ||
915 | } | ||
916 | } | ||
917 | |||
918 | static void remove_leaf_info(struct tnode *l, struct leaf_info *old) | ||
930 | { | 919 | { |
920 | /* record the location of the previous leaf_info entry */ | ||
921 | struct hlist_node **pprev = old->hlist.pprev; | ||
922 | struct leaf_info *li = hlist_entry(pprev, typeof(*li), hlist.next); | ||
923 | |||
924 | /* remove the leaf info from the list */ | ||
925 | hlist_del_rcu(&old->hlist); | ||
926 | |||
927 | /* only access li if it is pointing at the last valid hlist_node */ | ||
928 | if (hlist_empty(&l->list) || (*pprev)) | ||
929 | return; | ||
930 | |||
931 | /* update the trie with the latest suffix length */ | ||
932 | l->slen = KEYLENGTH - li->plen; | ||
933 | leaf_pull_suffix(l); | ||
934 | } | ||
935 | |||
936 | static void insert_leaf_info(struct tnode *l, struct leaf_info *new) | ||
937 | { | ||
938 | struct hlist_head *head = &l->list; | ||
931 | struct leaf_info *li = NULL, *last = NULL; | 939 | struct leaf_info *li = NULL, *last = NULL; |
932 | 940 | ||
933 | if (hlist_empty(head)) { | 941 | if (hlist_empty(head)) { |
@@ -944,218 +952,174 @@ static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new) | |||
944 | else | 952 | else |
945 | hlist_add_before_rcu(&new->hlist, &li->hlist); | 953 | hlist_add_before_rcu(&new->hlist, &li->hlist); |
946 | } | 954 | } |
955 | |||
956 | /* if we added to the tail node then we need to update slen */ | ||
957 | if (l->slen < (KEYLENGTH - new->plen)) { | ||
958 | l->slen = KEYLENGTH - new->plen; | ||
959 | leaf_push_suffix(l); | ||
960 | } | ||
947 | } | 961 | } |
948 | 962 | ||
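The slen bookkeeping added here boils down to simple arithmetic: a leaf's suffix length is KEYLENGTH minus the shortest prefix length it stores, and leaf_push_suffix()/leaf_pull_suffix() keep every ancestor's slen at least as large as the longest slen below it. A user-space sketch with a toy parent chain (a plain array instead of node_parent() walking; push_suffix() is an invented name):

#include <stdio.h>

#define KEYLENGTH 32

/* Toy parent chain: slen[0] is the leaf, slen[1..depth-1] its ancestors
 * up to the root. */
static void push_suffix(unsigned char *slen, int depth, unsigned char leaf_slen)
{
	int i;

	if (slen[0] < leaf_slen)
		slen[0] = leaf_slen;

	/* same loop shape as leaf_push_suffix(): stop at the first
	 * ancestor that already covers the new suffix length */
	for (i = 1; i < depth && slen[i] < slen[0]; i++)
		slen[i] = slen[0];
}

int main(void)
{
	/* leaf currently holds a /24 (slen 8); ancestors have slen 8 and 16 */
	unsigned char slen[3] = { 8, 8, 16 };

	/* adding a /16 route to the leaf raises its slen to 32 - 16 = 16 */
	push_suffix(slen, 3, KEYLENGTH - 16);

	printf("leaf=%u parent=%u grandparent=%u\n", slen[0], slen[1], slen[2]);
	/* prints: leaf=16 parent=16 grandparent=16 */
	return 0;
}

Adding the /16 lifts the leaf's slen to 16, and the push stops at the first ancestor that already covers it, just as the loop in leaf_push_suffix() does.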
949 | /* rcu_read_lock needs to be held by caller from readside */ | 963 | |
964 | static struct tnode *fib_find_node(struct trie *t, u32 key) | ||
965 | { | ||
966 | struct tnode *n = rcu_dereference_rtnl(t->trie); | ||
967 | |||
968 | while (n) { | ||
969 | unsigned long index = get_index(key, n); | ||
970 | |||
971 | /* This bit of code is a bit tricky but it combines multiple | ||
972 | * checks into a single check. The prefix consists of the | ||
973 | * prefix plus zeros for the bits in the cindex. The index | ||
974 | * is the difference between the key and this value. From | ||
975 | * this we can actually derive several pieces of data. | ||
976 | * if (index & (~0ul << bits)) | ||
977 | * we have a mismatch in skip bits and failed | ||
978 | * else | ||
979 | * we know the value is cindex | ||
980 | */ | ||
981 | if (index & (~0ul << n->bits)) | ||
982 | return NULL; | ||
950 | 983 | ||
951 | static struct leaf * | 984 | /* we have found a leaf. Prefixes have already been compared */ |
952 | fib_find_node(struct trie *t, u32 key) | 985 | if (IS_LEAF(n)) |
953 | { | 986 | break; |
954 | int pos; | ||
955 | struct tnode *tn; | ||
956 | struct rt_trie_node *n; | ||
957 | 987 | ||
958 | pos = 0; | 988 | n = tnode_get_child_rcu(n, index); |
959 | n = rcu_dereference_rtnl(t->trie); | 989 | } |
960 | 990 | ||
961 | while (n != NULL && NODE_TYPE(n) == T_TNODE) { | 991 | return n; |
962 | tn = (struct tnode *) n; | 992 | } |
963 | 993 | ||
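The combined index/skip-bits check used by fib_find_node() (and again in the lookup path further down) can be tried on concrete addresses. The sketch below is user space only; toy_tnode and toy_get_index() are invented stand-ins for the kernel structures and for get_index(), and the addresses are arbitrary examples.

#include <stdio.h>
#include <stdint.h>

typedef uint32_t t_key;

struct toy_tnode {
	t_key key;
	unsigned char pos;
	unsigned char bits;
};

/* Same expression as get_index() in the code above: xor out the common
 * prefix and shift the result down to the child index bits. */
static unsigned long toy_get_index(t_key key, const struct toy_tnode *n)
{
	return (unsigned long)(key ^ n->key) >> n->pos;
}

int main(void)
{
	/* node covering 192.168.0.0 with pos = 14, bits = 2: children are
	 * indexed by key bits 15..14 */
	struct toy_tnode n = { .key = 0xc0a80000, .pos = 14, .bits = 2 };

	t_key match    = 0xc0a8c101;	/* 192.168.193.1: differs only in bits 15..14 and below */
	t_key mismatch = 0xc0a00101;	/* 192.160.1.1: differs in a skipped bit above pos+bits  */

	unsigned long i1 = toy_get_index(match, &n);
	unsigned long i2 = toy_get_index(mismatch, &n);

	printf("match:    index=%lu  skip-bit mismatch=%d\n", i1, !!(i1 >> n.bits));
	printf("mismatch: index=%lu  skip-bit mismatch=%d\n", i2, !!(i2 >> n.bits));
	return 0;
}

For the first key the index (3) fits in the node's two bits, so the search descends into child 3; for the second the shifted difference overflows the index width, signalling a mismatch in the skipped bits with no extra comparison.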
964 | check_tnode(tn); | 994 | /* Return the first fib alias matching TOS with |
995 | * priority less than or equal to PRIO. | ||
996 | */ | ||
997 | static struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio) | ||
998 | { | ||
999 | struct fib_alias *fa; | ||
965 | 1000 | ||
966 | if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) { | 1001 | if (!fah) |
967 | pos = tn->pos + tn->bits; | 1002 | return NULL; |
968 | n = tnode_get_child_rcu(tn, | ||
969 | tkey_extract_bits(key, | ||
970 | tn->pos, | ||
971 | tn->bits)); | ||
972 | } else | ||
973 | break; | ||
974 | } | ||
975 | /* Case we have found a leaf. Compare prefixes */ | ||
976 | 1003 | ||
977 | if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) | 1004 | list_for_each_entry(fa, fah, fa_list) { |
978 | return (struct leaf *)n; | 1005 | if (fa->fa_tos > tos) |
1006 | continue; | ||
1007 | if (fa->fa_info->fib_priority >= prio || fa->fa_tos < tos) | ||
1008 | return fa; | ||
1009 | } | ||
979 | 1010 | ||
980 | return NULL; | 1011 | return NULL; |
981 | } | 1012 | } |
982 | 1013 | ||
983 | static void trie_rebalance(struct trie *t, struct tnode *tn) | 1014 | static void trie_rebalance(struct trie *t, struct tnode *tn) |
984 | { | 1015 | { |
985 | int wasfull; | ||
986 | t_key cindex, key; | ||
987 | struct tnode *tp; | 1016 | struct tnode *tp; |
988 | 1017 | ||
989 | key = tn->key; | 1018 | while ((tp = node_parent(tn)) != NULL) { |
990 | 1019 | resize(t, tn); | |
991 | while (tn != NULL && (tp = node_parent((struct rt_trie_node *)tn)) != NULL) { | ||
992 | cindex = tkey_extract_bits(key, tp->pos, tp->bits); | ||
993 | wasfull = tnode_full(tp, tnode_get_child(tp, cindex)); | ||
994 | tn = (struct tnode *)resize(t, tn); | ||
995 | |||
996 | tnode_put_child_reorg(tp, cindex, | ||
997 | (struct rt_trie_node *)tn, wasfull); | ||
998 | |||
999 | tp = node_parent((struct rt_trie_node *) tn); | ||
1000 | if (!tp) | ||
1001 | rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn); | ||
1002 | |||
1003 | tnode_free_flush(); | ||
1004 | if (!tp) | ||
1005 | break; | ||
1006 | tn = tp; | 1020 | tn = tp; |
1007 | } | 1021 | } |
1008 | 1022 | ||
1009 | /* Handle last (top) tnode */ | 1023 | /* Handle last (top) tnode */ |
1010 | if (IS_TNODE(tn)) | 1024 | if (IS_TNODE(tn)) |
1011 | tn = (struct tnode *)resize(t, tn); | 1025 | resize(t, tn); |
1012 | |||
1013 | rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn); | ||
1014 | tnode_free_flush(); | ||
1015 | } | 1026 | } |
1016 | 1027 | ||
1017 | /* only used from updater-side */ | 1028 | /* only used from updater-side */ |
1018 | 1029 | ||
1019 | static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) | 1030 | static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) |
1020 | { | 1031 | { |
1021 | int pos, newpos; | ||
1022 | struct tnode *tp = NULL, *tn = NULL; | ||
1023 | struct rt_trie_node *n; | ||
1024 | struct leaf *l; | ||
1025 | int missbit; | ||
1026 | struct list_head *fa_head = NULL; | 1032 | struct list_head *fa_head = NULL; |
1033 | struct tnode *l, *n, *tp = NULL; | ||
1027 | struct leaf_info *li; | 1034 | struct leaf_info *li; |
1028 | t_key cindex; | ||
1029 | 1035 | ||
1030 | pos = 0; | 1036 | li = leaf_info_new(plen); |
1037 | if (!li) | ||
1038 | return NULL; | ||
1039 | fa_head = &li->falh; | ||
1040 | |||
1031 | n = rtnl_dereference(t->trie); | 1041 | n = rtnl_dereference(t->trie); |
1032 | 1042 | ||
1033 | /* If we point to NULL, stop. Either the tree is empty and we should | 1043 | /* If we point to NULL, stop. Either the tree is empty and we should |
1034 | * just put a new leaf in it, or we have reached an empty child slot, | 1044 | * just put a new leaf in it, or we have reached an empty child slot, |
1035 | * and we should just put our new leaf in that. | 1045 | * and we should just put our new leaf in that. |
1036 | * If we point to a T_TNODE, check if it matches our key. Note that | ||
1037 | * a T_TNODE might be skipping any number of bits - its 'pos' need | ||
1038 | * not be the parent's 'pos'+'bits'! | ||
1039 | * | ||
1040 | * If it does match the current key, get pos/bits from it, extract | ||
1041 | * the index from our key, push the T_TNODE and walk the tree. | ||
1042 | * | ||
1043 | * If it doesn't, we have to replace it with a new T_TNODE. | ||
1044 | * | 1046 | * |
1045 | * If we point to a T_LEAF, it might or might not have the same key | 1047 | * If we hit a node with a key that doesn't match then we should stop |
1046 | * as we do. If it does, just change the value, update the T_LEAF's | 1048 | * and create a new tnode to replace that node and insert ourselves |
1047 | * value, and return it. | 1049 | * and the other node into the new tnode. |
1048 | * If it doesn't, we need to replace it with a T_TNODE. | ||
1049 | */ | 1050 | */ |
1050 | 1051 | while (n) { | |
1051 | while (n != NULL && NODE_TYPE(n) == T_TNODE) { | 1052 | unsigned long index = get_index(key, n); |
1052 | tn = (struct tnode *) n; | 1053 | |
1053 | 1054 | /* This bit of code is a bit tricky but it combines multiple | |
1054 | check_tnode(tn); | 1055 | * checks into a single check. The prefix consists of the |
1055 | 1056 | * prefix plus zeros for the "bits" in the prefix. The index | |
1056 | if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) { | 1057 | * is the difference between the key and this value. From |
1057 | tp = tn; | 1058 | * this we can actually derive several pieces of data. |
1058 | pos = tn->pos + tn->bits; | 1059 | * if !(index >> bits) |
1059 | n = tnode_get_child(tn, | 1060 | * we know the value is child index |
1060 | tkey_extract_bits(key, | 1061 | * else |
1061 | tn->pos, | 1062 | * we have a mismatch in skip bits and failed |
1062 | tn->bits)); | 1063 | */ |
1063 | 1064 | if (index >> n->bits) | |
1064 | BUG_ON(n && node_parent(n) != tn); | ||
1065 | } else | ||
1066 | break; | 1065 | break; |
1067 | } | ||
1068 | 1066 | ||
1069 | /* | 1067 | /* we have found a leaf. Prefixes have already been compared */ |
1070 | * n ----> NULL, LEAF or TNODE | 1068 | if (IS_LEAF(n)) { |
1071 | * | 1069 | /* Case 1: n is a leaf, and prefixes match*/ |
1072 | * tp is n's (parent) ----> NULL or TNODE | 1070 | insert_leaf_info(n, li); |
1073 | */ | 1071 | return fa_head; |
1074 | 1072 | } | |
1075 | BUG_ON(tp && IS_LEAF(tp)); | ||
1076 | |||
1077 | /* Case 1: n is a leaf. Compare prefixes */ | ||
1078 | |||
1079 | if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) { | ||
1080 | l = (struct leaf *) n; | ||
1081 | li = leaf_info_new(plen); | ||
1082 | |||
1083 | if (!li) | ||
1084 | return NULL; | ||
1085 | 1073 | ||
1086 | fa_head = &li->falh; | 1074 | tp = n; |
1087 | insert_leaf_info(&l->list, li); | 1075 | n = tnode_get_child_rcu(n, index); |
1088 | goto done; | ||
1089 | } | 1076 | } |
1090 | l = leaf_new(); | ||
1091 | 1077 | ||
1092 | if (!l) | 1078 | l = leaf_new(key); |
1093 | return NULL; | 1079 | if (!l) { |
1094 | 1080 | free_leaf_info(li); | |
1095 | l->key = key; | ||
1096 | li = leaf_info_new(plen); | ||
1097 | |||
1098 | if (!li) { | ||
1099 | free_leaf(l); | ||
1100 | return NULL; | 1081 | return NULL; |
1101 | } | 1082 | } |
1102 | 1083 | ||
1103 | fa_head = &li->falh; | 1084 | insert_leaf_info(l, li); |
1104 | insert_leaf_info(&l->list, li); | ||
1105 | |||
1106 | if (t->trie && n == NULL) { | ||
1107 | /* Case 2: n is NULL, and will just insert a new leaf */ | ||
1108 | 1085 | ||
1109 | node_set_parent((struct rt_trie_node *)l, tp); | 1086 | /* Case 2: n is a LEAF or a TNODE and the key doesn't match. |
1110 | 1087 | * | |
1111 | cindex = tkey_extract_bits(key, tp->pos, tp->bits); | 1088 | * Add a new tnode here |
1112 | put_child(tp, cindex, (struct rt_trie_node *)l); | 1089 | * first tnode need some special handling |
1113 | } else { | 1090 | * leaves us in position for handling as case 3 |
1114 | /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */ | 1091 | */ |
1115 | /* | 1092 | if (n) { |
1116 | * Add a new tnode here | 1093 | struct tnode *tn; |
1117 | * first tnode need some special handling | ||
1118 | */ | ||
1119 | |||
1120 | if (n) { | ||
1121 | pos = tp ? tp->pos+tp->bits : 0; | ||
1122 | newpos = tkey_mismatch(key, pos, n->key); | ||
1123 | tn = tnode_new(n->key, newpos, 1); | ||
1124 | } else { | ||
1125 | newpos = 0; | ||
1126 | tn = tnode_new(key, newpos, 1); /* First tnode */ | ||
1127 | } | ||
1128 | 1094 | ||
1095 | tn = tnode_new(key, __fls(key ^ n->key), 1); | ||
1129 | if (!tn) { | 1096 | if (!tn) { |
1130 | free_leaf_info(li); | 1097 | free_leaf_info(li); |
1131 | free_leaf(l); | 1098 | node_free(l); |
1132 | return NULL; | 1099 | return NULL; |
1133 | } | 1100 | } |
1134 | 1101 | ||
1135 | node_set_parent((struct rt_trie_node *)tn, tp); | 1102 | /* initialize routes out of node */ |
1103 | NODE_INIT_PARENT(tn, tp); | ||
1104 | put_child(tn, get_index(key, tn) ^ 1, n); | ||
1136 | 1105 | ||
1137 | missbit = tkey_extract_bits(key, newpos, 1); | 1106 | /* start adding routes into the node */ |
1138 | put_child(tn, missbit, (struct rt_trie_node *)l); | 1107 | put_child_root(tp, t, key, tn); |
1139 | put_child(tn, 1-missbit, n); | 1108 | node_set_parent(n, tn); |
1140 | |||
1141 | if (tp) { | ||
1142 | cindex = tkey_extract_bits(key, tp->pos, tp->bits); | ||
1143 | put_child(tp, cindex, (struct rt_trie_node *)tn); | ||
1144 | } else { | ||
1145 | rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn); | ||
1146 | } | ||
1147 | 1109 | ||
1110 | /* parent now has a NULL spot where the leaf can go */ | ||
1148 | tp = tn; | 1111 | tp = tn; |
1149 | } | 1112 | } |
1150 | 1113 | ||
1151 | if (tp && tp->pos + tp->bits > 32) | 1114 | /* Case 3: n is NULL, and will just insert a new leaf */ |
1152 | pr_warn("fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n", | 1115 | if (tp) { |
1153 | tp, tp->pos, tp->bits, key, plen); | 1116 | NODE_INIT_PARENT(l, tp); |
1154 | 1117 | put_child(tp, get_index(key, tp), l); | |
1155 | /* Rebalance the trie */ | 1118 | trie_rebalance(t, tp); |
1119 | } else { | ||
1120 | rcu_assign_pointer(t->trie, l); | ||
1121 | } | ||
1156 | 1122 | ||
1157 | trie_rebalance(t, tp); | ||
1158 | done: | ||
1159 | return fa_head; | 1123 | return fa_head; |
1160 | } | 1124 | } |
1161 | 1125 | ||
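When the descent stops at a node whose key disagrees with the one being inserted, the new one-bit tnode is positioned at the highest bit where the two keys differ, so its two slots separate them. A small sketch of that computation; toy_fls() is a user-space stand-in for __fls() (valid only for a nonzero argument, which holds here because the keys differ) and the addresses are arbitrary examples.

#include <stdio.h>
#include <stdint.h>

typedef uint32_t t_key;

/* index of the highest set bit, assuming x != 0 */
static unsigned int toy_fls(t_key x)
{
	unsigned int pos = 0;

	while (x >>= 1)
		pos++;
	return pos;
}

int main(void)
{
	t_key existing = 0xc0a80100;	/* 192.168.1.0, already in the trie  */
	t_key new_key  = 0xc0a84200;	/* 192.168.66.0, the key being added */

	/* the new one-bit tnode sits at the highest differing bit, so that
	 * its children 0 and 1 separate the two keys */
	unsigned int pos = toy_fls(existing ^ new_key);

	printf("diverge at bit %u, new key goes to child %u\n",
	       pos, (unsigned int)((new_key >> pos) & 1));
	return 0;
}

In the code above, put_child(tn, get_index(key, tn) ^ 1, n) then parks the existing node in the opposite slot from the one the new leaf will take.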
@@ -1172,7 +1136,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) | |||
1172 | u8 tos = cfg->fc_tos; | 1136 | u8 tos = cfg->fc_tos; |
1173 | u32 key, mask; | 1137 | u32 key, mask; |
1174 | int err; | 1138 | int err; |
1175 | struct leaf *l; | 1139 | struct tnode *l; |
1176 | 1140 | ||
1177 | if (plen > 32) | 1141 | if (plen > 32) |
1178 | return -EINVAL; | 1142 | return -EINVAL; |
@@ -1329,18 +1293,130 @@ err: | |||
1329 | return err; | 1293 | return err; |
1330 | } | 1294 | } |
1331 | 1295 | ||
1296 | static inline t_key prefix_mismatch(t_key key, struct tnode *n) | ||
1297 | { | ||
1298 | t_key prefix = n->key; | ||
1299 | |||
1300 | return (key ^ prefix) & (prefix | -prefix); | ||
1301 | } | ||
1302 | |||
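Because routes are stored with their host bits cleared, (prefix | -prefix) is a mask running from the lowest set bit of the stored key up to the top of the word, so prefix_mismatch() is nonzero exactly when the lookup key disagrees with the node somewhere in that region. A user-space illustration, assuming 32-bit keys; toy_prefix_mismatch() is an invented name.

#include <stdio.h>
#include <stdint.h>

typedef uint32_t t_key;

/* Same expression as prefix_mismatch() above, on plain values. */
static t_key toy_prefix_mismatch(t_key key, t_key prefix)
{
	return (key ^ prefix) & (prefix | -prefix);
}

int main(void)
{
	t_key prefix = 0xc0a80000;	/* 192.168.0.0 stored with host bits zero */

	/* mask covers bit 19 (lowest set bit of the prefix) and everything above */
	printf("mask         = %08x\n", prefix | -prefix);	/* fff80000 */

	/* differs only below the mask: not rejected here */
	printf("192.168.1.5  -> %08x\n", toy_prefix_mismatch(0xc0a80105, prefix));

	/* differs inside the mask: nothing under this node can match */
	printf("193.168.0.0  -> %08x\n", toy_prefix_mismatch(0xc1a80000, prefix));
	return 0;
}

A difference below the mask is deliberately not rejected at this point; the exact per-prefix test against li->mask_plen in the leaf handling decides whether any individual route actually matches.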
1332 | /* should be called with rcu_read_lock */ | 1303 | /* should be called with rcu_read_lock */ |
1333 | static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l, | 1304 | int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, |
1334 | t_key key, const struct flowi4 *flp, | 1305 | struct fib_result *res, int fib_flags) |
1335 | struct fib_result *res, int fib_flags) | ||
1336 | { | 1306 | { |
1307 | struct trie *t = (struct trie *)tb->tb_data; | ||
1308 | #ifdef CONFIG_IP_FIB_TRIE_STATS | ||
1309 | struct trie_use_stats __percpu *stats = t->stats; | ||
1310 | #endif | ||
1311 | const t_key key = ntohl(flp->daddr); | ||
1312 | struct tnode *n, *pn; | ||
1337 | struct leaf_info *li; | 1313 | struct leaf_info *li; |
1338 | struct hlist_head *hhead = &l->list; | 1314 | t_key cindex; |
1315 | |||
1316 | n = rcu_dereference(t->trie); | ||
1317 | if (!n) | ||
1318 | return -EAGAIN; | ||
1319 | |||
1320 | #ifdef CONFIG_IP_FIB_TRIE_STATS | ||
1321 | this_cpu_inc(stats->gets); | ||
1322 | #endif | ||
1323 | |||
1324 | pn = n; | ||
1325 | cindex = 0; | ||
1326 | |||
1327 | /* Step 1: Travel to the longest prefix match in the trie */ | ||
1328 | for (;;) { | ||
1329 | unsigned long index = get_index(key, n); | ||
1330 | |||
1331 | /* This bit of code is a bit tricky but it combines multiple | ||
1332 | * checks into a single check. The prefix consists of the | ||
1333 | * prefix plus zeros for the "bits" in the prefix. The index | ||
1334 | * is the difference between the key and this value. From | ||
1335 | * this we can actually derive several pieces of data. | ||
1336 | * if (index & (~0ul << bits)) | ||
1337 | * we have a mismatch in skip bits and failed | ||
1338 | * else | ||
1339 | * we know the value is cindex | ||
1340 | */ | ||
1341 | if (index & (~0ul << n->bits)) | ||
1342 | break; | ||
1343 | |||
1344 | /* we have found a leaf. Prefixes have already been compared */ | ||
1345 | if (IS_LEAF(n)) | ||
1346 | goto found; | ||
1347 | |||
1348 | /* only record pn and cindex if we are going to be chopping | ||
1349 | * bits later. Otherwise we are just wasting cycles. | ||
1350 | */ | ||
1351 | if (n->slen > n->pos) { | ||
1352 | pn = n; | ||
1353 | cindex = index; | ||
1354 | } | ||
1355 | |||
1356 | n = tnode_get_child_rcu(n, index); | ||
1357 | if (unlikely(!n)) | ||
1358 | goto backtrace; | ||
1359 | } | ||
1360 | |||
1361 | /* Step 2: Sort out leaves and begin backtracing for longest prefix */ | ||
1362 | for (;;) { | ||
1363 | /* record the pointer where our next node pointer is stored */ | ||
1364 | struct tnode __rcu **cptr = n->child; | ||
1365 | |||
1366 | /* This test verifies that none of the bits that differ | ||
1367 | * between the key and the prefix exist in the region of | ||
1368 | * the lsb and higher in the prefix. | ||
1369 | */ | ||
1370 | if (unlikely(prefix_mismatch(key, n)) || (n->slen == n->pos)) | ||
1371 | goto backtrace; | ||
1372 | |||
1373 | /* exit out and process leaf */ | ||
1374 | if (unlikely(IS_LEAF(n))) | ||
1375 | break; | ||
1376 | |||
1377 | /* Don't bother recording parent info. Since we are in | ||
1378 | * prefix match mode we will have to come back to wherever | ||
1379 | * we started this traversal anyway | ||
1380 | */ | ||
1381 | |||
1382 | while ((n = rcu_dereference(*cptr)) == NULL) { | ||
1383 | backtrace: | ||
1384 | #ifdef CONFIG_IP_FIB_TRIE_STATS | ||
1385 | if (!n) | ||
1386 | this_cpu_inc(stats->null_node_hit); | ||
1387 | #endif | ||
1388 | /* If we are at cindex 0 there are no more bits for | ||
1389 | * us to strip at this level so we must ascend back | ||
1390 | * up one level to see if there are any more bits to | ||
1391 | * be stripped there. | ||
1392 | */ | ||
1393 | while (!cindex) { | ||
1394 | t_key pkey = pn->key; | ||
1395 | |||
1396 | pn = node_parent_rcu(pn); | ||
1397 | if (unlikely(!pn)) | ||
1398 | return -EAGAIN; | ||
1399 | #ifdef CONFIG_IP_FIB_TRIE_STATS | ||
1400 | this_cpu_inc(stats->backtrack); | ||
1401 | #endif | ||
1402 | /* Get Child's index */ | ||
1403 | cindex = get_index(pkey, pn); | ||
1404 | } | ||
1405 | |||
1406 | /* strip the least significant bit from the cindex */ | ||
1407 | cindex &= cindex - 1; | ||
1408 | |||
1409 | /* grab pointer for next child node */ | ||
1410 | cptr = &pn->child[cindex]; | ||
1411 | } | ||
1412 | } | ||
1339 | 1413 | ||
1340 | hlist_for_each_entry_rcu(li, hhead, hlist) { | 1414 | found: |
1415 | /* Step 3: Process the leaf, if that fails fall back to backtracing */ | ||
1416 | hlist_for_each_entry_rcu(li, &n->list, hlist) { | ||
1341 | struct fib_alias *fa; | 1417 | struct fib_alias *fa; |
1342 | 1418 | ||
1343 | if (l->key != (key & li->mask_plen)) | 1419 | if ((key ^ n->key) & li->mask_plen) |
1344 | continue; | 1420 | continue; |
1345 | 1421 | ||
1346 | list_for_each_entry_rcu(fa, &li->falh, fa_list) { | 1422 | list_for_each_entry_rcu(fa, &li->falh, fa_list) { |
@@ -1355,9 +1431,9 @@ static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l, | |||
1355 | continue; | 1431 | continue; |
1356 | fib_alias_accessed(fa); | 1432 | fib_alias_accessed(fa); |
1357 | err = fib_props[fa->fa_type].error; | 1433 | err = fib_props[fa->fa_type].error; |
1358 | if (err) { | 1434 | if (unlikely(err < 0)) { |
1359 | #ifdef CONFIG_IP_FIB_TRIE_STATS | 1435 | #ifdef CONFIG_IP_FIB_TRIE_STATS |
1360 | t->stats.semantic_match_passed++; | 1436 | this_cpu_inc(stats->semantic_match_passed); |
1361 | #endif | 1437 | #endif |
1362 | return err; | 1438 | return err; |
1363 | } | 1439 | } |
@@ -1371,241 +1447,48 @@ static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l, | |||
1371 | if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif) | 1447 | if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif) |
1372 | continue; | 1448 | continue; |
1373 | 1449 | ||
1374 | #ifdef CONFIG_IP_FIB_TRIE_STATS | 1450 | if (!(fib_flags & FIB_LOOKUP_NOREF)) |
1375 | t->stats.semantic_match_passed++; | 1451 | atomic_inc(&fi->fib_clntref); |
1376 | #endif | 1452 | |
1377 | res->prefixlen = li->plen; | 1453 | res->prefixlen = li->plen; |
1378 | res->nh_sel = nhsel; | 1454 | res->nh_sel = nhsel; |
1379 | res->type = fa->fa_type; | 1455 | res->type = fa->fa_type; |
1380 | res->scope = fa->fa_info->fib_scope; | 1456 | res->scope = fi->fib_scope; |
1381 | res->fi = fi; | 1457 | res->fi = fi; |
1382 | res->table = tb; | 1458 | res->table = tb; |
1383 | res->fa_head = &li->falh; | 1459 | res->fa_head = &li->falh; |
1384 | if (!(fib_flags & FIB_LOOKUP_NOREF)) | ||
1385 | atomic_inc(&fi->fib_clntref); | ||
1386 | return 0; | ||
1387 | } | ||
1388 | } | ||
1389 | |||
1390 | #ifdef CONFIG_IP_FIB_TRIE_STATS | ||
1391 | t->stats.semantic_match_miss++; | ||
1392 | #endif | ||
1393 | } | ||
1394 | |||
1395 | return 1; | ||
1396 | } | ||
1397 | |||
1398 | int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, | ||
1399 | struct fib_result *res, int fib_flags) | ||
1400 | { | ||
1401 | struct trie *t = (struct trie *) tb->tb_data; | ||
1402 | int ret; | ||
1403 | struct rt_trie_node *n; | ||
1404 | struct tnode *pn; | ||
1405 | unsigned int pos, bits; | ||
1406 | t_key key = ntohl(flp->daddr); | ||
1407 | unsigned int chopped_off; | ||
1408 | t_key cindex = 0; | ||
1409 | unsigned int current_prefix_length = KEYLENGTH; | ||
1410 | struct tnode *cn; | ||
1411 | t_key pref_mismatch; | ||
1412 | |||
1413 | rcu_read_lock(); | ||
1414 | |||
1415 | n = rcu_dereference(t->trie); | ||
1416 | if (!n) | ||
1417 | goto failed; | ||
1418 | |||
1419 | #ifdef CONFIG_IP_FIB_TRIE_STATS | 1460 | #ifdef CONFIG_IP_FIB_TRIE_STATS |
1420 | t->stats.gets++; | 1461 | this_cpu_inc(stats->semantic_match_passed); |
1421 | #endif | 1462 | #endif |
1422 | 1463 | return err; | |
1423 | /* Just a leaf? */ | 1464 | } |
1424 | if (IS_LEAF(n)) { | ||
1425 | ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags); | ||
1426 | goto found; | ||
1427 | } | ||
1428 | |||
1429 | pn = (struct tnode *) n; | ||
1430 | chopped_off = 0; | ||
1431 | |||
1432 | while (pn) { | ||
1433 | pos = pn->pos; | ||
1434 | bits = pn->bits; | ||
1435 | |||
1436 | if (!chopped_off) | ||
1437 | cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length), | ||
1438 | pos, bits); | ||
1439 | |||
1440 | n = tnode_get_child_rcu(pn, cindex); | ||
1441 | |||
1442 | if (n == NULL) { | ||
1443 | #ifdef CONFIG_IP_FIB_TRIE_STATS | ||
1444 | t->stats.null_node_hit++; | ||
1445 | #endif | ||
1446 | goto backtrace; | ||
1447 | } | ||
1448 | |||
1449 | if (IS_LEAF(n)) { | ||
1450 | ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags); | ||
1451 | if (ret > 0) | ||
1452 | goto backtrace; | ||
1453 | goto found; | ||
1454 | } | ||
1455 | |||
1456 | cn = (struct tnode *)n; | ||
1457 | |||
1458 | /* | ||
1459 | * It's a tnode, and we can do some extra checks here if we | ||
1460 | * like, to avoid descending into a dead-end branch. | ||
1461 | * This tnode is in the parent's child array at index | ||
1462 | * key[p_pos..p_pos+p_bits] but potentially with some bits | ||
1463 | * chopped off, so in reality the index may be just a | ||
1464 | * subprefix, padded with zero at the end. | ||
1465 | * We can also take a look at any skipped bits in this | ||
1466 | * tnode - everything up to p_pos is supposed to be ok, | ||
1467 | * and the non-chopped bits of the index (see previous | ||
1468 | * paragraph) are also guaranteed ok, but the rest is | ||
1469 | * considered unknown. | ||
1470 | * | ||
1471 | * The skipped bits are key[pos+bits..cn->pos]. | ||
1472 | */ | ||
1473 | |||
1474 | /* If current_prefix_length < pos+bits, we are already doing | ||
1475 | * actual prefix matching, which means everything from | ||
1476 | * pos+(bits-chopped_off) onward must be zero along some | ||
1477 | * branch of this subtree - otherwise there is *no* valid | ||
1478 | * prefix present. Here we can only check the skipped | ||
1479 | * bits. Remember, since we have already indexed into the | ||
1480 | * parent's child array, we know that the bits we chopped off | ||
1481 | * *are* zero. | ||
1482 | */ | ||
1483 | |||
1484 | /* NOTA BENE: Checking only skipped bits | ||
1485 | for the new node here */ | ||
1486 | |||
1487 | if (current_prefix_length < pos+bits) { | ||
1488 | if (tkey_extract_bits(cn->key, current_prefix_length, | ||
1489 | cn->pos - current_prefix_length) | ||
1490 | || !(cn->child[0])) | ||
1491 | goto backtrace; | ||
1492 | } | ||
1493 | |||
1494 | /* | ||
1495 | * If chopped_off=0, the index is fully validated and we | ||
1496 | * only need to look at the skipped bits for this, the new, | ||
1497 | * tnode. What we actually want to do is to find out if | ||
1498 | * these skipped bits match our key perfectly, or if we will | ||
1499 | * have to count on finding a matching prefix further down, | ||
1500 | * because if we do, we would like to have some way of | ||
1501 | * verifying the existence of such a prefix at this point. | ||
1502 | */ | ||
1503 | |||
1504 | /* The only thing we can do at this point is to verify that | ||
1505 | * any such matching prefix can indeed be a prefix to our | ||
1506 | * key, and if the bits in the node we are inspecting that | ||
1507 | * do not match our key are not ZERO, this cannot be true. | ||
1508 | * Thus, find out where there is a mismatch (before cn->pos) | ||
1509 | * and verify that all the mismatching bits are zero in the | ||
1510 | * new tnode's key. | ||
1511 | */ | ||
1512 | |||
1513 | /* | ||
1514 | * Note: We aren't very concerned about the piece of | ||
1515 | * the key that precedes pn->pos+pn->bits, since these | ||
1516 | * have already been checked. The bits after cn->pos | ||
1517 | * aren't checked since these are by definition | ||
1518 | * "unknown" at this point. Thus, what we want to see | ||
1519 | * is if we are about to enter the "prefix matching" | ||
1520 | * state, and in that case verify that the skipped | ||
1521 | * bits that will prevail throughout this subtree are | ||
1522 | * zero, as they have to be if we are to find a | ||
1523 | * matching prefix. | ||
1524 | */ | ||
1525 | |||
1526 | pref_mismatch = mask_pfx(cn->key ^ key, cn->pos); | ||
1527 | |||
1528 | /* | ||
1529 | * In short: If skipped bits in this node do not match | ||
1530 | * the search key, enter the "prefix matching" | ||
1531 | * state.directly. | ||
1532 | */ | ||
1533 | if (pref_mismatch) { | ||
1534 | /* fls(x) = __fls(x) + 1 */ | ||
1535 | int mp = KEYLENGTH - __fls(pref_mismatch) - 1; | ||
1536 | |||
1537 | if (tkey_extract_bits(cn->key, mp, cn->pos - mp) != 0) | ||
1538 | goto backtrace; | ||
1539 | |||
1540 | if (current_prefix_length >= cn->pos) | ||
1541 | current_prefix_length = mp; | ||
1542 | } | 1465 | } |
1543 | 1466 | ||
1544 | pn = (struct tnode *)n; /* Descend */ | ||
1545 | chopped_off = 0; | ||
1546 | continue; | ||
1547 | |||
1548 | backtrace: | ||
1549 | chopped_off++; | ||
1550 | |||
1551 | /* As zero don't change the child key (cindex) */ | ||
1552 | while ((chopped_off <= pn->bits) | ||
1553 | && !(cindex & (1<<(chopped_off-1)))) | ||
1554 | chopped_off++; | ||
1555 | |||
1556 | /* Decrease current_... with bits chopped off */ | ||
1557 | if (current_prefix_length > pn->pos + pn->bits - chopped_off) | ||
1558 | current_prefix_length = pn->pos + pn->bits | ||
1559 | - chopped_off; | ||
1560 | |||
1561 | /* | ||
1562 | * Either we do the actual chop off accordingly or, if we have | ||
1563 | * chopped off all bits in this tnode, walk up to our parent. | ||
1564 | */ | ||
1565 | |||
1566 | if (chopped_off <= pn->bits) { | ||
1567 | cindex &= ~(1 << (chopped_off-1)); | ||
1568 | } else { | ||
1569 | struct tnode *parent = node_parent_rcu((struct rt_trie_node *) pn); | ||
1570 | if (!parent) | ||
1571 | goto failed; | ||
1572 | |||
1573 | /* Get Child's index */ | ||
1574 | cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits); | ||
1575 | pn = parent; | ||
1576 | chopped_off = 0; | ||
1577 | |||
1578 | #ifdef CONFIG_IP_FIB_TRIE_STATS | 1467 | #ifdef CONFIG_IP_FIB_TRIE_STATS |
1579 | t->stats.backtrack++; | 1468 | this_cpu_inc(stats->semantic_match_miss); |
1580 | #endif | 1469 | #endif |
1581 | goto backtrace; | ||
1582 | } | ||
1583 | } | 1470 | } |
1584 | failed: | 1471 | goto backtrace; |
1585 | ret = 1; | ||
1586 | found: | ||
1587 | rcu_read_unlock(); | ||
1588 | return ret; | ||
1589 | } | 1472 | } |
1590 | EXPORT_SYMBOL_GPL(fib_table_lookup); | 1473 | EXPORT_SYMBOL_GPL(fib_table_lookup); |
1591 | 1474 | ||
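The order in which the backtracking code revisits child slots falls out of cindex &= cindex - 1: clearing the lowest set bit visits, from most to least specific, exactly the indices obtained by zeroing trailing bits of the key's index, which are the only slots a shorter, zero-padded prefix can occupy; once the index reaches zero the walk ascends to the parent. A tiny stand-alone demonstration of that sequence:

#include <stdio.h>

int main(void)
{
	unsigned long cindex = 0xb;	/* child index 1011b picked by the key */

	printf("try %#lx\n", cindex);
	while (cindex) {
		cindex &= cindex - 1;	/* strip the least significant bit */
		printf("try %#lx\n", cindex);
	}
	/* prints: try 0xb, try 0xa, try 0x8, try 0 -- after index 0 the
	 * lookup above climbs to the parent node and repeats */
	return 0;
}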
1592 | /* | 1475 | /* |
1593 | * Remove the leaf and return parent. | 1476 | * Remove the leaf and return parent. |
1594 | */ | 1477 | */ |
1595 | static void trie_leaf_remove(struct trie *t, struct leaf *l) | 1478 | static void trie_leaf_remove(struct trie *t, struct tnode *l) |
1596 | { | 1479 | { |
1597 | struct tnode *tp = node_parent((struct rt_trie_node *) l); | 1480 | struct tnode *tp = node_parent(l); |
1598 | 1481 | ||
1599 | pr_debug("entering trie_leaf_remove(%p)\n", l); | 1482 | pr_debug("entering trie_leaf_remove(%p)\n", l); |
1600 | 1483 | ||
1601 | if (tp) { | 1484 | if (tp) { |
1602 | t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits); | 1485 | put_child(tp, get_index(l->key, tp), NULL); |
1603 | put_child(tp, cindex, NULL); | ||
1604 | trie_rebalance(t, tp); | 1486 | trie_rebalance(t, tp); |
1605 | } else | 1487 | } else { |
1606 | RCU_INIT_POINTER(t->trie, NULL); | 1488 | RCU_INIT_POINTER(t->trie, NULL); |
1489 | } | ||
1607 | 1490 | ||
1608 | free_leaf(l); | 1491 | node_free(l); |
1609 | } | 1492 | } |
1610 | 1493 | ||
1611 | /* | 1494 | /* |
@@ -1619,7 +1502,7 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) | |||
1619 | u8 tos = cfg->fc_tos; | 1502 | u8 tos = cfg->fc_tos; |
1620 | struct fib_alias *fa, *fa_to_delete; | 1503 | struct fib_alias *fa, *fa_to_delete; |
1621 | struct list_head *fa_head; | 1504 | struct list_head *fa_head; |
1622 | struct leaf *l; | 1505 | struct tnode *l; |
1623 | struct leaf_info *li; | 1506 | struct leaf_info *li; |
1624 | 1507 | ||
1625 | if (plen > 32) | 1508 | if (plen > 32) |
@@ -1684,7 +1567,7 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) | |||
1684 | tb->tb_num_default--; | 1567 | tb->tb_num_default--; |
1685 | 1568 | ||
1686 | if (list_empty(fa_head)) { | 1569 | if (list_empty(fa_head)) { |
1687 | hlist_del_rcu(&li->hlist); | 1570 | remove_leaf_info(l, li); |
1688 | free_leaf_info(li); | 1571 | free_leaf_info(li); |
1689 | } | 1572 | } |
1690 | 1573 | ||
@@ -1717,12 +1600,13 @@ static int trie_flush_list(struct list_head *head) | |||
1717 | return found; | 1600 | return found; |
1718 | } | 1601 | } |
1719 | 1602 | ||
1720 | static int trie_flush_leaf(struct leaf *l) | 1603 | static int trie_flush_leaf(struct tnode *l) |
1721 | { | 1604 | { |
1722 | int found = 0; | 1605 | int found = 0; |
1723 | struct hlist_head *lih = &l->list; | 1606 | struct hlist_head *lih = &l->list; |
1724 | struct hlist_node *tmp; | 1607 | struct hlist_node *tmp; |
1725 | struct leaf_info *li = NULL; | 1608 | struct leaf_info *li = NULL; |
1609 | unsigned char plen = KEYLENGTH; | ||
1726 | 1610 | ||
1727 | hlist_for_each_entry_safe(li, tmp, lih, hlist) { | 1611 | hlist_for_each_entry_safe(li, tmp, lih, hlist) { |
1728 | found += trie_flush_list(&li->falh); | 1612 | found += trie_flush_list(&li->falh); |
@@ -1730,8 +1614,14 @@ static int trie_flush_leaf(struct leaf *l) | |||
1730 | if (list_empty(&li->falh)) { | 1614 | if (list_empty(&li->falh)) { |
1731 | hlist_del_rcu(&li->hlist); | 1615 | hlist_del_rcu(&li->hlist); |
1732 | free_leaf_info(li); | 1616 | free_leaf_info(li); |
1617 | continue; | ||
1733 | } | 1618 | } |
1619 | |||
1620 | plen = li->plen; | ||
1734 | } | 1621 | } |
1622 | |||
1623 | l->slen = KEYLENGTH - plen; | ||
1624 | |||
1735 | return found; | 1625 | return found; |
1736 | } | 1626 | } |
1737 | 1627 | ||
@@ -1739,63 +1629,57 @@ static int trie_flush_leaf(struct leaf *l) | |||
1739 | * Scan for the next right leaf starting at node p->child[idx] | 1629 | * Scan for the next right leaf starting at node p->child[idx] |
1740 | * Since we have back pointer, no recursion necessary. | 1630 | * Since we have back pointer, no recursion necessary. |
1741 | */ | 1631 | */ |
1742 | static struct leaf *leaf_walk_rcu(struct tnode *p, struct rt_trie_node *c) | 1632 | static struct tnode *leaf_walk_rcu(struct tnode *p, struct tnode *c) |
1743 | { | 1633 | { |
1744 | do { | 1634 | do { |
1745 | t_key idx; | 1635 | unsigned long idx = c ? get_index(c->key, p) + 1 : 0; |
1746 | 1636 | ||
1747 | if (c) | 1637 | while (idx < tnode_child_length(p)) { |
1748 | idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1; | ||
1749 | else | ||
1750 | idx = 0; | ||
1751 | |||
1752 | while (idx < 1u << p->bits) { | ||
1753 | c = tnode_get_child_rcu(p, idx++); | 1638 | c = tnode_get_child_rcu(p, idx++); |
1754 | if (!c) | 1639 | if (!c) |
1755 | continue; | 1640 | continue; |
1756 | 1641 | ||
1757 | if (IS_LEAF(c)) | 1642 | if (IS_LEAF(c)) |
1758 | return (struct leaf *) c; | 1643 | return c; |
1759 | 1644 | ||
1760 | /* Rescan: start scanning in the new node */ | 1645 | /* Rescan: start scanning in the new node */ |
1761 | p = (struct tnode *) c; | 1646 | p = c; |
1762 | idx = 0; | 1647 | idx = 0; |
1763 | } | 1648 | } |
1764 | 1649 | ||
1765 | /* Node empty, walk back up to parent */ | 1650 | /* Node empty, walk back up to parent */ |
1766 | c = (struct rt_trie_node *) p; | 1651 | c = p; |
1767 | } while ((p = node_parent_rcu(c)) != NULL); | 1652 | } while ((p = node_parent_rcu(c)) != NULL); |
1768 | 1653 | ||
1769 | return NULL; /* Root of trie */ | 1654 | return NULL; /* Root of trie */ |
1770 | } | 1655 | } |
1771 | 1656 | ||
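The no-recursion walk in leaf_walk_rcu() translates directly to any tree with parent back pointers. The sketch below mirrors its loop structure on a toy structure: toy_node, next_leaf() and index_in_parent are invented, the kernel derives the child index from get_index(c->key, p) instead, and the root-is-a-leaf case is handled separately in trie_firstleaf() just as it is above.

#include <stdio.h>

/* toy stand-in node: an inner node owns 'nchildren' slots, a leaf none */
struct toy_node {
	struct toy_node *parent;
	struct toy_node **child;
	unsigned long nchildren;	/* 0 for a leaf */
	unsigned long index_in_parent;	/* replaces get_index(c->key, p) */
	int value;			/* leaf payload for the demo */
};

/* Same shape as leaf_walk_rcu(): scan to the right of c under p, descend
 * into inner nodes, and pop back up when a node is exhausted. */
static struct toy_node *next_leaf(struct toy_node *p, struct toy_node *c)
{
	do {
		unsigned long idx = c ? c->index_in_parent + 1 : 0;

		while (idx < p->nchildren) {
			c = p->child[idx++];
			if (!c)
				continue;
			if (!c->nchildren)	/* found the next leaf */
				return c;
			p = c;			/* rescan inside the inner node */
			idx = 0;
		}
		c = p;				/* node exhausted, walk back up */
	} while ((p = c->parent) != NULL);

	return NULL;				/* back at the root: done */
}

int main(void)
{
	struct toy_node *slots[4] = { NULL }, *inner_slots[2] = { NULL };
	struct toy_node root = { .child = slots, .nchildren = 4 };
	struct toy_node a = { .parent = &root, .index_in_parent = 0, .value = 1 };
	struct toy_node inner = { .parent = &root, .index_in_parent = 2,
				  .child = inner_slots, .nchildren = 2 };
	struct toy_node b = { .parent = &inner, .index_in_parent = 1, .value = 2 };
	struct toy_node *l;

	slots[0] = &a;
	slots[2] = &inner;
	inner_slots[1] = &b;

	/* visits the leaves left to right, skipping empty slots, no recursion */
	for (l = next_leaf(&root, NULL); l; l = next_leaf(l->parent, l))
		printf("leaf %d\n", l->value);	/* leaf 1, then leaf 2 */
	return 0;
}

The driver loop at the bottom plays the role of trie_firstleaf()/trie_nextleaf() above: restart the walk from the current leaf's parent each time, and stop when the walk climbs past the root.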
1772 | static struct leaf *trie_firstleaf(struct trie *t) | 1657 | static struct tnode *trie_firstleaf(struct trie *t) |
1773 | { | 1658 | { |
1774 | struct tnode *n = (struct tnode *)rcu_dereference_rtnl(t->trie); | 1659 | struct tnode *n = rcu_dereference_rtnl(t->trie); |
1775 | 1660 | ||
1776 | if (!n) | 1661 | if (!n) |
1777 | return NULL; | 1662 | return NULL; |
1778 | 1663 | ||
1779 | if (IS_LEAF(n)) /* trie is just a leaf */ | 1664 | if (IS_LEAF(n)) /* trie is just a leaf */ |
1780 | return (struct leaf *) n; | 1665 | return n; |
1781 | 1666 | ||
1782 | return leaf_walk_rcu(n, NULL); | 1667 | return leaf_walk_rcu(n, NULL); |
1783 | } | 1668 | } |
1784 | 1669 | ||
1785 | static struct leaf *trie_nextleaf(struct leaf *l) | 1670 | static struct tnode *trie_nextleaf(struct tnode *l) |
1786 | { | 1671 | { |
1787 | struct rt_trie_node *c = (struct rt_trie_node *) l; | 1672 | struct tnode *p = node_parent_rcu(l); |
1788 | struct tnode *p = node_parent_rcu(c); | ||
1789 | 1673 | ||
1790 | if (!p) | 1674 | if (!p) |
1791 | return NULL; /* trie with just one leaf */ | 1675 | return NULL; /* trie with just one leaf */ |
1792 | 1676 | ||
1793 | return leaf_walk_rcu(p, c); | 1677 | return leaf_walk_rcu(p, l); |
1794 | } | 1678 | } |
1795 | 1679 | ||
1796 | static struct leaf *trie_leafindex(struct trie *t, int index) | 1680 | static struct tnode *trie_leafindex(struct trie *t, int index) |
1797 | { | 1681 | { |
1798 | struct leaf *l = trie_firstleaf(t); | 1682 | struct tnode *l = trie_firstleaf(t); |
1799 | 1683 | ||
1800 | while (l && index-- > 0) | 1684 | while (l && index-- > 0) |
1801 | l = trie_nextleaf(l); | 1685 | l = trie_nextleaf(l); |
@@ -1810,19 +1694,28 @@ static struct leaf *trie_leafindex(struct trie *t, int index) | |||
1810 | int fib_table_flush(struct fib_table *tb) | 1694 | int fib_table_flush(struct fib_table *tb) |
1811 | { | 1695 | { |
1812 | struct trie *t = (struct trie *) tb->tb_data; | 1696 | struct trie *t = (struct trie *) tb->tb_data; |
1813 | struct leaf *l, *ll = NULL; | 1697 | struct tnode *l, *ll = NULL; |
1814 | int found = 0; | 1698 | int found = 0; |
1815 | 1699 | ||
1816 | for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) { | 1700 | for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) { |
1817 | found += trie_flush_leaf(l); | 1701 | found += trie_flush_leaf(l); |
1818 | 1702 | ||
1819 | if (ll && hlist_empty(&ll->list)) | 1703 | if (ll) { |
1820 | trie_leaf_remove(t, ll); | 1704 | if (hlist_empty(&ll->list)) |
1705 | trie_leaf_remove(t, ll); | ||
1706 | else | ||
1707 | leaf_pull_suffix(ll); | ||
1708 | } | ||
1709 | |||
1821 | ll = l; | 1710 | ll = l; |
1822 | } | 1711 | } |
1823 | 1712 | ||
1824 | if (ll && hlist_empty(&ll->list)) | 1713 | if (ll) { |
1825 | trie_leaf_remove(t, ll); | 1714 | if (hlist_empty(&ll->list)) |
1715 | trie_leaf_remove(t, ll); | ||
1716 | else | ||
1717 | leaf_pull_suffix(ll); | ||
1718 | } | ||
1826 | 1719 | ||
1827 | pr_debug("trie_flush found=%d\n", found); | 1720 | pr_debug("trie_flush found=%d\n", found); |
1828 | return found; | 1721 | return found; |
@@ -1830,6 +1723,11 @@ int fib_table_flush(struct fib_table *tb) | |||
1830 | 1723 | ||
1831 | void fib_free_table(struct fib_table *tb) | 1724 | void fib_free_table(struct fib_table *tb) |
1832 | { | 1725 | { |
1726 | #ifdef CONFIG_IP_FIB_TRIE_STATS | ||
1727 | struct trie *t = (struct trie *)tb->tb_data; | ||
1728 | |||
1729 | free_percpu(t->stats); | ||
1730 | #endif /* CONFIG_IP_FIB_TRIE_STATS */ | ||
1833 | kfree(tb); | 1731 | kfree(tb); |
1834 | } | 1732 | } |
1835 | 1733 | ||
@@ -1870,7 +1768,7 @@ static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, | |||
1870 | return skb->len; | 1768 | return skb->len; |
1871 | } | 1769 | } |
1872 | 1770 | ||
1873 | static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb, | 1771 | static int fn_trie_dump_leaf(struct tnode *l, struct fib_table *tb, |
1874 | struct sk_buff *skb, struct netlink_callback *cb) | 1772 | struct sk_buff *skb, struct netlink_callback *cb) |
1875 | { | 1773 | { |
1876 | struct leaf_info *li; | 1774 | struct leaf_info *li; |
@@ -1906,7 +1804,7 @@ static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb, | |||
1906 | int fib_table_dump(struct fib_table *tb, struct sk_buff *skb, | 1804 | int fib_table_dump(struct fib_table *tb, struct sk_buff *skb, |
1907 | struct netlink_callback *cb) | 1805 | struct netlink_callback *cb) |
1908 | { | 1806 | { |
1909 | struct leaf *l; | 1807 | struct tnode *l; |
1910 | struct trie *t = (struct trie *) tb->tb_data; | 1808 | struct trie *t = (struct trie *) tb->tb_data; |
1911 | t_key key = cb->args[2]; | 1809 | t_key key = cb->args[2]; |
1912 | int count = cb->args[3]; | 1810 | int count = cb->args[3]; |
@@ -1952,7 +1850,7 @@ void __init fib_trie_init(void) | |||
1952 | 0, SLAB_PANIC, NULL); | 1850 | 0, SLAB_PANIC, NULL); |
1953 | 1851 | ||
1954 | trie_leaf_kmem = kmem_cache_create("ip_fib_trie", | 1852 | trie_leaf_kmem = kmem_cache_create("ip_fib_trie", |
1955 | max(sizeof(struct leaf), | 1853 | max(sizeof(struct tnode), |
1956 | sizeof(struct leaf_info)), | 1854 | sizeof(struct leaf_info)), |
1957 | 0, SLAB_PANIC, NULL); | 1855 | 0, SLAB_PANIC, NULL); |
1958 | } | 1856 | } |
@@ -1973,7 +1871,14 @@ struct fib_table *fib_trie_table(u32 id) | |||
1973 | tb->tb_num_default = 0; | 1871 | tb->tb_num_default = 0; |
1974 | 1872 | ||
1975 | t = (struct trie *) tb->tb_data; | 1873 | t = (struct trie *) tb->tb_data; |
1976 | memset(t, 0, sizeof(*t)); | 1874 | RCU_INIT_POINTER(t->trie, NULL); |
1875 | #ifdef CONFIG_IP_FIB_TRIE_STATS | ||
1876 | t->stats = alloc_percpu(struct trie_use_stats); | ||
1877 | if (!t->stats) { | ||
1878 | kfree(tb); | ||
1879 | tb = NULL; | ||
1880 | } | ||
1881 | #endif | ||
1977 | 1882 | ||
1978 | return tb; | 1883 | return tb; |
1979 | } | 1884 | } |
@@ -1988,10 +1893,10 @@ struct fib_trie_iter { | |||
1988 | unsigned int depth; | 1893 | unsigned int depth; |
1989 | }; | 1894 | }; |
1990 | 1895 | ||
1991 | static struct rt_trie_node *fib_trie_get_next(struct fib_trie_iter *iter) | 1896 | static struct tnode *fib_trie_get_next(struct fib_trie_iter *iter) |
1992 | { | 1897 | { |
1898 | unsigned long cindex = iter->index; | ||
1993 | struct tnode *tn = iter->tnode; | 1899 | struct tnode *tn = iter->tnode; |
1994 | unsigned int cindex = iter->index; | ||
1995 | struct tnode *p; | 1900 | struct tnode *p; |
1996 | 1901 | ||
1997 | /* A single entry routing table */ | 1902 | /* A single entry routing table */ |
@@ -2001,8 +1906,8 @@ static struct rt_trie_node *fib_trie_get_next(struct fib_trie_iter *iter) | |||
2001 | pr_debug("get_next iter={node=%p index=%d depth=%d}\n", | 1906 | pr_debug("get_next iter={node=%p index=%d depth=%d}\n", |
2002 | iter->tnode, iter->index, iter->depth); | 1907 | iter->tnode, iter->index, iter->depth); |
2003 | rescan: | 1908 | rescan: |
2004 | while (cindex < (1<<tn->bits)) { | 1909 | while (cindex < tnode_child_length(tn)) { |
2005 | struct rt_trie_node *n = tnode_get_child_rcu(tn, cindex); | 1910 | struct tnode *n = tnode_get_child_rcu(tn, cindex); |
2006 | 1911 | ||
2007 | if (n) { | 1912 | if (n) { |
2008 | if (IS_LEAF(n)) { | 1913 | if (IS_LEAF(n)) { |
@@ -2010,7 +1915,7 @@ rescan: | |||
2010 | iter->index = cindex + 1; | 1915 | iter->index = cindex + 1; |
2011 | } else { | 1916 | } else { |
2012 | /* push down one level */ | 1917 | /* push down one level */ |
2013 | iter->tnode = (struct tnode *) n; | 1918 | iter->tnode = n; |
2014 | iter->index = 0; | 1919 | iter->index = 0; |
2015 | ++iter->depth; | 1920 | ++iter->depth; |
2016 | } | 1921 | } |
@@ -2021,9 +1926,9 @@ rescan: | |||
2021 | } | 1926 | } |
2022 | 1927 | ||
2023 | /* Current node exhausted, pop back up */ | 1928 | /* Current node exhausted, pop back up */ |
2024 | p = node_parent_rcu((struct rt_trie_node *)tn); | 1929 | p = node_parent_rcu(tn); |
2025 | if (p) { | 1930 | if (p) { |
2026 | cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1; | 1931 | cindex = get_index(tn->key, p) + 1; |
2027 | tn = p; | 1932 | tn = p; |
2028 | --iter->depth; | 1933 | --iter->depth; |
2029 | goto rescan; | 1934 | goto rescan; |
@@ -2033,10 +1938,10 @@ rescan: | |||
2033 | return NULL; | 1938 | return NULL; |
2034 | } | 1939 | } |
2035 | 1940 | ||
2036 | static struct rt_trie_node *fib_trie_get_first(struct fib_trie_iter *iter, | 1941 | static struct tnode *fib_trie_get_first(struct fib_trie_iter *iter, |
2037 | struct trie *t) | 1942 | struct trie *t) |
2038 | { | 1943 | { |
2039 | struct rt_trie_node *n; | 1944 | struct tnode *n; |
2040 | 1945 | ||
2041 | if (!t) | 1946 | if (!t) |
2042 | return NULL; | 1947 | return NULL; |
@@ -2046,7 +1951,7 @@ static struct rt_trie_node *fib_trie_get_first(struct fib_trie_iter *iter, | |||
2046 | return NULL; | 1951 | return NULL; |
2047 | 1952 | ||
2048 | if (IS_TNODE(n)) { | 1953 | if (IS_TNODE(n)) { |
2049 | iter->tnode = (struct tnode *) n; | 1954 | iter->tnode = n; |
2050 | iter->index = 0; | 1955 | iter->index = 0; |
2051 | iter->depth = 1; | 1956 | iter->depth = 1; |
2052 | } else { | 1957 | } else { |
@@ -2060,7 +1965,7 @@ static struct rt_trie_node *fib_trie_get_first(struct fib_trie_iter *iter, | |||
2060 | 1965 | ||
2061 | static void trie_collect_stats(struct trie *t, struct trie_stat *s) | 1966 | static void trie_collect_stats(struct trie *t, struct trie_stat *s) |
2062 | { | 1967 | { |
2063 | struct rt_trie_node *n; | 1968 | struct tnode *n; |
2064 | struct fib_trie_iter iter; | 1969 | struct fib_trie_iter iter; |
2065 | 1970 | ||
2066 | memset(s, 0, sizeof(*s)); | 1971 | memset(s, 0, sizeof(*s)); |
@@ -2068,7 +1973,6 @@ static void trie_collect_stats(struct trie *t, struct trie_stat *s) | |||
2068 | rcu_read_lock(); | 1973 | rcu_read_lock(); |
2069 | for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) { | 1974 | for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) { |
2070 | if (IS_LEAF(n)) { | 1975 | if (IS_LEAF(n)) { |
2071 | struct leaf *l = (struct leaf *)n; | ||
2072 | struct leaf_info *li; | 1976 | struct leaf_info *li; |
2073 | 1977 | ||
2074 | s->leaves++; | 1978 | s->leaves++; |
@@ -2076,19 +1980,13 @@ static void trie_collect_stats(struct trie *t, struct trie_stat *s) | |||
2076 | if (iter.depth > s->maxdepth) | 1980 | if (iter.depth > s->maxdepth) |
2077 | s->maxdepth = iter.depth; | 1981 | s->maxdepth = iter.depth; |
2078 | 1982 | ||
2079 | hlist_for_each_entry_rcu(li, &l->list, hlist) | 1983 | hlist_for_each_entry_rcu(li, &n->list, hlist) |
2080 | ++s->prefixes; | 1984 | ++s->prefixes; |
2081 | } else { | 1985 | } else { |
2082 | const struct tnode *tn = (const struct tnode *) n; | ||
2083 | int i; | ||
2084 | |||
2085 | s->tnodes++; | 1986 | s->tnodes++; |
2086 | if (tn->bits < MAX_STAT_DEPTH) | 1987 | if (n->bits < MAX_STAT_DEPTH) |
2087 | s->nodesizes[tn->bits]++; | 1988 | s->nodesizes[n->bits]++; |
2088 | 1989 | s->nullpointers += n->empty_children; | |
2089 | for (i = 0; i < (1<<tn->bits); i++) | ||
2090 | if (!tn->child[i]) | ||
2091 | s->nullpointers++; | ||
2092 | } | 1990 | } |
2093 | } | 1991 | } |
2094 | rcu_read_unlock(); | 1992 | rcu_read_unlock(); |
@@ -2111,7 +2009,7 @@ static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat) | |||
2111 | seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth); | 2009 | seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth); |
2112 | 2010 | ||
2113 | seq_printf(seq, "\tLeaves: %u\n", stat->leaves); | 2011 | seq_printf(seq, "\tLeaves: %u\n", stat->leaves); |
2114 | bytes = sizeof(struct leaf) * stat->leaves; | 2012 | bytes = sizeof(struct tnode) * stat->leaves; |
2115 | 2013 | ||
2116 | seq_printf(seq, "\tPrefixes: %u\n", stat->prefixes); | 2014 | seq_printf(seq, "\tPrefixes: %u\n", stat->prefixes); |
2117 | bytes += sizeof(struct leaf_info) * stat->prefixes; | 2015 | bytes += sizeof(struct leaf_info) * stat->prefixes; |
@@ -2132,25 +2030,38 @@ static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat) | |||
2132 | seq_putc(seq, '\n'); | 2030 | seq_putc(seq, '\n'); |
2133 | seq_printf(seq, "\tPointers: %u\n", pointers); | 2031 | seq_printf(seq, "\tPointers: %u\n", pointers); |
2134 | 2032 | ||
2135 | bytes += sizeof(struct rt_trie_node *) * pointers; | 2033 | bytes += sizeof(struct tnode *) * pointers; |
2136 | seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers); | 2034 | seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers); |
2137 | seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024); | 2035 | seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024); |
2138 | } | 2036 | } |
2139 | 2037 | ||
2140 | #ifdef CONFIG_IP_FIB_TRIE_STATS | 2038 | #ifdef CONFIG_IP_FIB_TRIE_STATS |
2141 | static void trie_show_usage(struct seq_file *seq, | 2039 | static void trie_show_usage(struct seq_file *seq, |
2142 | const struct trie_use_stats *stats) | 2040 | const struct trie_use_stats __percpu *stats) |
2143 | { | 2041 | { |
2042 | struct trie_use_stats s = { 0 }; | ||
2043 | int cpu; | ||
2044 | |||
2045 | /* loop through all of the CPUs and gather up the stats */ | ||
2046 | for_each_possible_cpu(cpu) { | ||
2047 | const struct trie_use_stats *pcpu = per_cpu_ptr(stats, cpu); | ||
2048 | |||
2049 | s.gets += pcpu->gets; | ||
2050 | s.backtrack += pcpu->backtrack; | ||
2051 | s.semantic_match_passed += pcpu->semantic_match_passed; | ||
2052 | s.semantic_match_miss += pcpu->semantic_match_miss; | ||
2053 | s.null_node_hit += pcpu->null_node_hit; | ||
2054 | s.resize_node_skipped += pcpu->resize_node_skipped; | ||
2055 | } | ||
2056 | |||
2144 | seq_printf(seq, "\nCounters:\n---------\n"); | 2057 | seq_printf(seq, "\nCounters:\n---------\n"); |
2145 | seq_printf(seq, "gets = %u\n", stats->gets); | 2058 | seq_printf(seq, "gets = %u\n", s.gets); |
2146 | seq_printf(seq, "backtracks = %u\n", stats->backtrack); | 2059 | seq_printf(seq, "backtracks = %u\n", s.backtrack); |
2147 | seq_printf(seq, "semantic match passed = %u\n", | 2060 | seq_printf(seq, "semantic match passed = %u\n", |
2148 | stats->semantic_match_passed); | 2061 | s.semantic_match_passed); |
2149 | seq_printf(seq, "semantic match miss = %u\n", | 2062 | seq_printf(seq, "semantic match miss = %u\n", s.semantic_match_miss); |
2150 | stats->semantic_match_miss); | 2063 | seq_printf(seq, "null node hit= %u\n", s.null_node_hit); |
2151 | seq_printf(seq, "null node hit= %u\n", stats->null_node_hit); | 2064 | seq_printf(seq, "skipped node resize = %u\n\n", s.resize_node_skipped); |
2152 | seq_printf(seq, "skipped node resize = %u\n\n", | ||
2153 | stats->resize_node_skipped); | ||
2154 | } | 2065 | } |
2155 | #endif /* CONFIG_IP_FIB_TRIE_STATS */ | 2066 | #endif /* CONFIG_IP_FIB_TRIE_STATS */ |
2156 | 2067 | ||
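
For reference, a minimal sketch of the read-side aggregation pattern that trie_show_usage() now follows: each CPU owns its own counter struct, and the reader sums all of them into a local copy before printing. The struct and function names below are invented for illustration and are not part of the patch.

#include <linux/percpu.h>
#include <linux/seq_file.h>

struct foo_stats {
	unsigned int gets;
	unsigned int misses;
};

static void foo_show(struct seq_file *seq,
		     const struct foo_stats __percpu *stats)
{
	struct foo_stats s = { 0 };
	int cpu;

	/* each CPU only ever bumps its own copy; the reader walks every
	 * possible CPU and folds the counts together without locking
	 */
	for_each_possible_cpu(cpu) {
		const struct foo_stats *pcpu = per_cpu_ptr(stats, cpu);

		s.gets += pcpu->gets;
		s.misses += pcpu->misses;
	}

	seq_printf(seq, "gets = %u\nmisses = %u\n", s.gets, s.misses);
}

Writers only touch their own CPU's copy, so the summation loop needs no lock; for_each_possible_cpu() is used so counts from currently offline CPUs are not lost.
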
@@ -2173,7 +2084,7 @@ static int fib_triestat_seq_show(struct seq_file *seq, void *v) | |||
2173 | seq_printf(seq, | 2084 | seq_printf(seq, |
2174 | "Basic info: size of leaf:" | 2085 | "Basic info: size of leaf:" |
2175 | " %Zd bytes, size of tnode: %Zd bytes.\n", | 2086 | " %Zd bytes, size of tnode: %Zd bytes.\n", |
2176 | sizeof(struct leaf), sizeof(struct tnode)); | 2087 | sizeof(struct tnode), sizeof(struct tnode)); |
2177 | 2088 | ||
2178 | for (h = 0; h < FIB_TABLE_HASHSZ; h++) { | 2089 | for (h = 0; h < FIB_TABLE_HASHSZ; h++) { |
2179 | struct hlist_head *head = &net->ipv4.fib_table_hash[h]; | 2090 | struct hlist_head *head = &net->ipv4.fib_table_hash[h]; |
@@ -2191,7 +2102,7 @@ static int fib_triestat_seq_show(struct seq_file *seq, void *v) | |||
2191 | trie_collect_stats(t, &stat); | 2102 | trie_collect_stats(t, &stat); |
2192 | trie_show_stats(seq, &stat); | 2103 | trie_show_stats(seq, &stat); |
2193 | #ifdef CONFIG_IP_FIB_TRIE_STATS | 2104 | #ifdef CONFIG_IP_FIB_TRIE_STATS |
2194 | trie_show_usage(seq, &t->stats); | 2105 | trie_show_usage(seq, t->stats); |
2195 | #endif | 2106 | #endif |
2196 | } | 2107 | } |
2197 | } | 2108 | } |
@@ -2212,7 +2123,7 @@ static const struct file_operations fib_triestat_fops = { | |||
2212 | .release = single_release_net, | 2123 | .release = single_release_net, |
2213 | }; | 2124 | }; |
2214 | 2125 | ||
2215 | static struct rt_trie_node *fib_trie_get_idx(struct seq_file *seq, loff_t pos) | 2126 | static struct tnode *fib_trie_get_idx(struct seq_file *seq, loff_t pos) |
2216 | { | 2127 | { |
2217 | struct fib_trie_iter *iter = seq->private; | 2128 | struct fib_trie_iter *iter = seq->private; |
2218 | struct net *net = seq_file_net(seq); | 2129 | struct net *net = seq_file_net(seq); |
@@ -2224,7 +2135,7 @@ static struct rt_trie_node *fib_trie_get_idx(struct seq_file *seq, loff_t pos) | |||
2224 | struct fib_table *tb; | 2135 | struct fib_table *tb; |
2225 | 2136 | ||
2226 | hlist_for_each_entry_rcu(tb, head, tb_hlist) { | 2137 | hlist_for_each_entry_rcu(tb, head, tb_hlist) { |
2227 | struct rt_trie_node *n; | 2138 | struct tnode *n; |
2228 | 2139 | ||
2229 | for (n = fib_trie_get_first(iter, | 2140 | for (n = fib_trie_get_first(iter, |
2230 | (struct trie *) tb->tb_data); | 2141 | (struct trie *) tb->tb_data); |
@@ -2253,7 +2164,7 @@ static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos) | |||
2253 | struct fib_table *tb = iter->tb; | 2164 | struct fib_table *tb = iter->tb; |
2254 | struct hlist_node *tb_node; | 2165 | struct hlist_node *tb_node; |
2255 | unsigned int h; | 2166 | unsigned int h; |
2256 | struct rt_trie_node *n; | 2167 | struct tnode *n; |
2257 | 2168 | ||
2258 | ++*pos; | 2169 | ++*pos; |
2259 | /* next node in same table */ | 2170 | /* next node in same table */ |
@@ -2339,29 +2250,26 @@ static inline const char *rtn_type(char *buf, size_t len, unsigned int t) | |||
2339 | static int fib_trie_seq_show(struct seq_file *seq, void *v) | 2250 | static int fib_trie_seq_show(struct seq_file *seq, void *v) |
2340 | { | 2251 | { |
2341 | const struct fib_trie_iter *iter = seq->private; | 2252 | const struct fib_trie_iter *iter = seq->private; |
2342 | struct rt_trie_node *n = v; | 2253 | struct tnode *n = v; |
2343 | 2254 | ||
2344 | if (!node_parent_rcu(n)) | 2255 | if (!node_parent_rcu(n)) |
2345 | fib_table_print(seq, iter->tb); | 2256 | fib_table_print(seq, iter->tb); |
2346 | 2257 | ||
2347 | if (IS_TNODE(n)) { | 2258 | if (IS_TNODE(n)) { |
2348 | struct tnode *tn = (struct tnode *) n; | 2259 | __be32 prf = htonl(n->key); |
2349 | __be32 prf = htonl(mask_pfx(tn->key, tn->pos)); | ||
2350 | 2260 | ||
2351 | seq_indent(seq, iter->depth-1); | 2261 | seq_indent(seq, iter->depth-1); |
2352 | seq_printf(seq, " +-- %pI4/%d %d %d %d\n", | 2262 | seq_printf(seq, " +-- %pI4/%zu %u %u %u\n", |
2353 | &prf, tn->pos, tn->bits, tn->full_children, | 2263 | &prf, KEYLENGTH - n->pos - n->bits, n->bits, |
2354 | tn->empty_children); | 2264 | n->full_children, n->empty_children); |
2355 | |||
2356 | } else { | 2265 | } else { |
2357 | struct leaf *l = (struct leaf *) n; | ||
2358 | struct leaf_info *li; | 2266 | struct leaf_info *li; |
2359 | __be32 val = htonl(l->key); | 2267 | __be32 val = htonl(n->key); |
2360 | 2268 | ||
2361 | seq_indent(seq, iter->depth); | 2269 | seq_indent(seq, iter->depth); |
2362 | seq_printf(seq, " |-- %pI4\n", &val); | 2270 | seq_printf(seq, " |-- %pI4\n", &val); |
2363 | 2271 | ||
2364 | hlist_for_each_entry_rcu(li, &l->list, hlist) { | 2272 | hlist_for_each_entry_rcu(li, &n->list, hlist) { |
2365 | struct fib_alias *fa; | 2273 | struct fib_alias *fa; |
2366 | 2274 | ||
2367 | list_for_each_entry_rcu(fa, &li->falh, fa_list) { | 2275 | list_for_each_entry_rcu(fa, &li->falh, fa_list) { |
@@ -2411,9 +2319,9 @@ struct fib_route_iter { | |||
2411 | t_key key; | 2319 | t_key key; |
2412 | }; | 2320 | }; |
2413 | 2321 | ||
2414 | static struct leaf *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos) | 2322 | static struct tnode *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos) |
2415 | { | 2323 | { |
2416 | struct leaf *l = NULL; | 2324 | struct tnode *l = NULL; |
2417 | struct trie *t = iter->main_trie; | 2325 | struct trie *t = iter->main_trie; |
2418 | 2326 | ||
2419 | /* use cache location of last found key */ | 2327 | /* use cache location of last found key */ |
@@ -2458,7 +2366,7 @@ static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos) | |||
2458 | static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) | 2366 | static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) |
2459 | { | 2367 | { |
2460 | struct fib_route_iter *iter = seq->private; | 2368 | struct fib_route_iter *iter = seq->private; |
2461 | struct leaf *l = v; | 2369 | struct tnode *l = v; |
2462 | 2370 | ||
2463 | ++*pos; | 2371 | ++*pos; |
2464 | if (v == SEQ_START_TOKEN) { | 2372 | if (v == SEQ_START_TOKEN) { |
@@ -2504,7 +2412,7 @@ static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info | |||
2504 | */ | 2412 | */ |
2505 | static int fib_route_seq_show(struct seq_file *seq, void *v) | 2413 | static int fib_route_seq_show(struct seq_file *seq, void *v) |
2506 | { | 2414 | { |
2507 | struct leaf *l = v; | 2415 | struct tnode *l = v; |
2508 | struct leaf_info *li; | 2416 | struct leaf_info *li; |
2509 | 2417 | ||
2510 | if (v == SEQ_START_TOKEN) { | 2418 | if (v == SEQ_START_TOKEN) { |
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index b986298a7ba3..92ddea1e6457 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c | |||
@@ -70,7 +70,6 @@ static struct guehdr *gue_remcsum(struct sk_buff *skb, struct guehdr *guehdr, | |||
70 | size_t start = ntohs(pd[0]); | 70 | size_t start = ntohs(pd[0]); |
71 | size_t offset = ntohs(pd[1]); | 71 | size_t offset = ntohs(pd[1]); |
72 | size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start); | 72 | size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start); |
73 | __wsum delta; | ||
74 | 73 | ||
75 | if (skb->remcsum_offload) { | 74 | if (skb->remcsum_offload) { |
76 | /* Already processed in GRO path */ | 75 | /* Already processed in GRO path */ |
@@ -82,14 +81,7 @@ static struct guehdr *gue_remcsum(struct sk_buff *skb, struct guehdr *guehdr, | |||
82 | return NULL; | 81 | return NULL; |
83 | guehdr = (struct guehdr *)&udp_hdr(skb)[1]; | 82 | guehdr = (struct guehdr *)&udp_hdr(skb)[1]; |
84 | 83 | ||
85 | if (unlikely(skb->ip_summed != CHECKSUM_COMPLETE)) | 84 | skb_remcsum_process(skb, (void *)guehdr + hdrlen, start, offset); |
86 | __skb_checksum_complete(skb); | ||
87 | |||
88 | delta = remcsum_adjust((void *)guehdr + hdrlen, | ||
89 | skb->csum, start, offset); | ||
90 | |||
91 | /* Adjust skb->csum since we changed the packet */ | ||
92 | skb->csum = csum_add(skb->csum, delta); | ||
93 | 85 | ||
94 | return guehdr; | 86 | return guehdr; |
95 | } | 87 | } |
@@ -174,7 +166,8 @@ drop: | |||
174 | } | 166 | } |
175 | 167 | ||
176 | static struct sk_buff **fou_gro_receive(struct sk_buff **head, | 168 | static struct sk_buff **fou_gro_receive(struct sk_buff **head, |
177 | struct sk_buff *skb) | 169 | struct sk_buff *skb, |
170 | struct udp_offload *uoff) | ||
178 | { | 171 | { |
179 | const struct net_offload *ops; | 172 | const struct net_offload *ops; |
180 | struct sk_buff **pp = NULL; | 173 | struct sk_buff **pp = NULL; |
@@ -195,7 +188,8 @@ out_unlock: | |||
195 | return pp; | 188 | return pp; |
196 | } | 189 | } |
197 | 190 | ||
198 | static int fou_gro_complete(struct sk_buff *skb, int nhoff) | 191 | static int fou_gro_complete(struct sk_buff *skb, int nhoff, |
192 | struct udp_offload *uoff) | ||
199 | { | 193 | { |
200 | const struct net_offload *ops; | 194 | const struct net_offload *ops; |
201 | u8 proto = NAPI_GRO_CB(skb)->proto; | 195 | u8 proto = NAPI_GRO_CB(skb)->proto; |
@@ -226,7 +220,6 @@ static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off, | |||
226 | size_t start = ntohs(pd[0]); | 220 | size_t start = ntohs(pd[0]); |
227 | size_t offset = ntohs(pd[1]); | 221 | size_t offset = ntohs(pd[1]); |
228 | size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start); | 222 | size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start); |
229 | __wsum delta; | ||
230 | 223 | ||
231 | if (skb->remcsum_offload) | 224 | if (skb->remcsum_offload) |
232 | return guehdr; | 225 | return guehdr; |
@@ -241,12 +234,7 @@ static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off, | |||
241 | return NULL; | 234 | return NULL; |
242 | } | 235 | } |
243 | 236 | ||
244 | delta = remcsum_adjust((void *)guehdr + hdrlen, | 237 | skb_gro_remcsum_process(skb, (void *)guehdr + hdrlen, start, offset); |
245 | NAPI_GRO_CB(skb)->csum, start, offset); | ||
246 | |||
247 | /* Adjust skb->csum since we changed the packet */ | ||
248 | skb->csum = csum_add(skb->csum, delta); | ||
249 | NAPI_GRO_CB(skb)->csum = csum_add(NAPI_GRO_CB(skb)->csum, delta); | ||
250 | 238 | ||
251 | skb->remcsum_offload = 1; | 239 | skb->remcsum_offload = 1; |
252 | 240 | ||
@@ -254,7 +242,8 @@ static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off, | |||
254 | } | 242 | } |
255 | 243 | ||
256 | static struct sk_buff **gue_gro_receive(struct sk_buff **head, | 244 | static struct sk_buff **gue_gro_receive(struct sk_buff **head, |
257 | struct sk_buff *skb) | 245 | struct sk_buff *skb, |
246 | struct udp_offload *uoff) | ||
258 | { | 247 | { |
259 | const struct net_offload **offloads; | 248 | const struct net_offload **offloads; |
260 | const struct net_offload *ops; | 249 | const struct net_offload *ops; |
@@ -360,7 +349,8 @@ out: | |||
360 | return pp; | 349 | return pp; |
361 | } | 350 | } |
362 | 351 | ||
363 | static int gue_gro_complete(struct sk_buff *skb, int nhoff) | 352 | static int gue_gro_complete(struct sk_buff *skb, int nhoff, |
353 | struct udp_offload *uoff) | ||
364 | { | 354 | { |
365 | const struct net_offload **offloads; | 355 | const struct net_offload **offloads; |
366 | struct guehdr *guehdr = (struct guehdr *)(skb->data + nhoff); | 356 | struct guehdr *guehdr = (struct guehdr *)(skb->data + nhoff); |
@@ -490,7 +480,7 @@ static int fou_create(struct net *net, struct fou_cfg *cfg, | |||
490 | sk->sk_user_data = fou; | 480 | sk->sk_user_data = fou; |
491 | fou->sock = sock; | 481 | fou->sock = sock; |
492 | 482 | ||
493 | udp_set_convert_csum(sk, true); | 483 | inet_inc_convert_csum(sk); |
494 | 484 | ||
495 | sk->sk_allocation = GFP_ATOMIC; | 485 | sk->sk_allocation = GFP_ATOMIC; |
496 | 486 | ||
diff --git a/net/ipv4/geneve.c b/net/ipv4/geneve.c index 394a200f93c1..5a4828ba05ad 100644 --- a/net/ipv4/geneve.c +++ b/net/ipv4/geneve.c | |||
@@ -17,7 +17,7 @@ | |||
17 | #include <linux/errno.h> | 17 | #include <linux/errno.h> |
18 | #include <linux/slab.h> | 18 | #include <linux/slab.h> |
19 | #include <linux/skbuff.h> | 19 | #include <linux/skbuff.h> |
20 | #include <linux/rculist.h> | 20 | #include <linux/list.h> |
21 | #include <linux/netdevice.h> | 21 | #include <linux/netdevice.h> |
22 | #include <linux/in.h> | 22 | #include <linux/in.h> |
23 | #include <linux/ip.h> | 23 | #include <linux/ip.h> |
@@ -26,8 +26,8 @@ | |||
26 | #include <linux/etherdevice.h> | 26 | #include <linux/etherdevice.h> |
27 | #include <linux/if_ether.h> | 27 | #include <linux/if_ether.h> |
28 | #include <linux/if_vlan.h> | 28 | #include <linux/if_vlan.h> |
29 | #include <linux/hash.h> | ||
30 | #include <linux/ethtool.h> | 29 | #include <linux/ethtool.h> |
30 | #include <linux/mutex.h> | ||
31 | #include <net/arp.h> | 31 | #include <net/arp.h> |
32 | #include <net/ndisc.h> | 32 | #include <net/ndisc.h> |
33 | #include <net/ip.h> | 33 | #include <net/ip.h> |
@@ -50,38 +50,30 @@ | |||
50 | #include <net/ip6_checksum.h> | 50 | #include <net/ip6_checksum.h> |
51 | #endif | 51 | #endif |
52 | 52 | ||
53 | #define PORT_HASH_BITS 8 | 53 | /* Protects sock_list and refcounts. */ |
54 | #define PORT_HASH_SIZE (1<<PORT_HASH_BITS) | 54 | static DEFINE_MUTEX(geneve_mutex); |
55 | 55 | ||
56 | /* per-network namespace private data for this module */ | 56 | /* per-network namespace private data for this module */ |
57 | struct geneve_net { | 57 | struct geneve_net { |
58 | struct hlist_head sock_list[PORT_HASH_SIZE]; | 58 | struct list_head sock_list; |
59 | spinlock_t sock_lock; /* Protects sock_list */ | ||
60 | }; | 59 | }; |
61 | 60 | ||
62 | static int geneve_net_id; | 61 | static int geneve_net_id; |
63 | 62 | ||
64 | static struct workqueue_struct *geneve_wq; | ||
65 | |||
66 | static inline struct genevehdr *geneve_hdr(const struct sk_buff *skb) | 63 | static inline struct genevehdr *geneve_hdr(const struct sk_buff *skb) |
67 | { | 64 | { |
68 | return (struct genevehdr *)(udp_hdr(skb) + 1); | 65 | return (struct genevehdr *)(udp_hdr(skb) + 1); |
69 | } | 66 | } |
70 | 67 | ||
71 | static struct hlist_head *gs_head(struct net *net, __be16 port) | 68 | static struct geneve_sock *geneve_find_sock(struct net *net, |
69 | sa_family_t family, __be16 port) | ||
72 | { | 70 | { |
73 | struct geneve_net *gn = net_generic(net, geneve_net_id); | 71 | struct geneve_net *gn = net_generic(net, geneve_net_id); |
74 | |||
75 | return &gn->sock_list[hash_32(ntohs(port), PORT_HASH_BITS)]; | ||
76 | } | ||
77 | |||
78 | /* Find geneve socket based on network namespace and UDP port */ | ||
79 | static struct geneve_sock *geneve_find_sock(struct net *net, __be16 port) | ||
80 | { | ||
81 | struct geneve_sock *gs; | 72 | struct geneve_sock *gs; |
82 | 73 | ||
83 | hlist_for_each_entry_rcu(gs, gs_head(net, port), hlist) { | 74 | list_for_each_entry(gs, &gn->sock_list, list) { |
84 | if (inet_sk(gs->sock->sk)->inet_sport == port) | 75 | if (inet_sk(gs->sock->sk)->inet_sport == port && |
76 | inet_sk(gs->sock->sk)->sk.sk_family == family) | ||
85 | return gs; | 77 | return gs; |
86 | } | 78 | } |
87 | 79 | ||
@@ -115,19 +107,19 @@ int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt, | |||
115 | struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos, | 107 | struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos, |
116 | __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port, | 108 | __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port, |
117 | __be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt, | 109 | __be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt, |
118 | bool xnet) | 110 | bool csum, bool xnet) |
119 | { | 111 | { |
120 | struct genevehdr *gnvh; | 112 | struct genevehdr *gnvh; |
121 | int min_headroom; | 113 | int min_headroom; |
122 | int err; | 114 | int err; |
123 | 115 | ||
124 | skb = udp_tunnel_handle_offloads(skb, !gs->sock->sk->sk_no_check_tx); | 116 | skb = udp_tunnel_handle_offloads(skb, csum); |
125 | if (IS_ERR(skb)) | 117 | if (IS_ERR(skb)) |
126 | return PTR_ERR(skb); | 118 | return PTR_ERR(skb); |
127 | 119 | ||
128 | min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len | 120 | min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len |
129 | + GENEVE_BASE_HLEN + opt_len + sizeof(struct iphdr) | 121 | + GENEVE_BASE_HLEN + opt_len + sizeof(struct iphdr) |
130 | + (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0); | 122 | + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0); |
131 | 123 | ||
132 | err = skb_cow_head(skb, min_headroom); | 124 | err = skb_cow_head(skb, min_headroom); |
133 | if (unlikely(err)) { | 125 | if (unlikely(err)) { |
@@ -144,11 +136,107 @@ int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt, | |||
144 | 136 | ||
145 | skb_set_inner_protocol(skb, htons(ETH_P_TEB)); | 137 | skb_set_inner_protocol(skb, htons(ETH_P_TEB)); |
146 | 138 | ||
147 | return udp_tunnel_xmit_skb(gs->sock, rt, skb, src, dst, | 139 | return udp_tunnel_xmit_skb(rt, skb, src, dst, |
148 | tos, ttl, df, src_port, dst_port, xnet); | 140 | tos, ttl, df, src_port, dst_port, xnet, |
141 | !csum); | ||
149 | } | 142 | } |
150 | EXPORT_SYMBOL_GPL(geneve_xmit_skb); | 143 | EXPORT_SYMBOL_GPL(geneve_xmit_skb); |
151 | 144 | ||
145 | static int geneve_hlen(struct genevehdr *gh) | ||
146 | { | ||
147 | return sizeof(*gh) + gh->opt_len * 4; | ||
148 | } | ||
149 | |||
150 | static struct sk_buff **geneve_gro_receive(struct sk_buff **head, | ||
151 | struct sk_buff *skb, | ||
152 | struct udp_offload *uoff) | ||
153 | { | ||
154 | struct sk_buff *p, **pp = NULL; | ||
155 | struct genevehdr *gh, *gh2; | ||
156 | unsigned int hlen, gh_len, off_gnv; | ||
157 | const struct packet_offload *ptype; | ||
158 | __be16 type; | ||
159 | int flush = 1; | ||
160 | |||
161 | off_gnv = skb_gro_offset(skb); | ||
162 | hlen = off_gnv + sizeof(*gh); | ||
163 | gh = skb_gro_header_fast(skb, off_gnv); | ||
164 | if (skb_gro_header_hard(skb, hlen)) { | ||
165 | gh = skb_gro_header_slow(skb, hlen, off_gnv); | ||
166 | if (unlikely(!gh)) | ||
167 | goto out; | ||
168 | } | ||
169 | |||
170 | if (gh->ver != GENEVE_VER || gh->oam) | ||
171 | goto out; | ||
172 | gh_len = geneve_hlen(gh); | ||
173 | |||
174 | hlen = off_gnv + gh_len; | ||
175 | if (skb_gro_header_hard(skb, hlen)) { | ||
176 | gh = skb_gro_header_slow(skb, hlen, off_gnv); | ||
177 | if (unlikely(!gh)) | ||
178 | goto out; | ||
179 | } | ||
180 | |||
181 | flush = 0; | ||
182 | |||
183 | for (p = *head; p; p = p->next) { | ||
184 | if (!NAPI_GRO_CB(p)->same_flow) | ||
185 | continue; | ||
186 | |||
187 | gh2 = (struct genevehdr *)(p->data + off_gnv); | ||
188 | if (gh->opt_len != gh2->opt_len || | ||
189 | memcmp(gh, gh2, gh_len)) { | ||
190 | NAPI_GRO_CB(p)->same_flow = 0; | ||
191 | continue; | ||
192 | } | ||
193 | } | ||
194 | |||
195 | type = gh->proto_type; | ||
196 | |||
197 | rcu_read_lock(); | ||
198 | ptype = gro_find_receive_by_type(type); | ||
199 | if (ptype == NULL) { | ||
200 | flush = 1; | ||
201 | goto out_unlock; | ||
202 | } | ||
203 | |||
204 | skb_gro_pull(skb, gh_len); | ||
205 | skb_gro_postpull_rcsum(skb, gh, gh_len); | ||
206 | pp = ptype->callbacks.gro_receive(head, skb); | ||
207 | |||
208 | out_unlock: | ||
209 | rcu_read_unlock(); | ||
210 | out: | ||
211 | NAPI_GRO_CB(skb)->flush |= flush; | ||
212 | |||
213 | return pp; | ||
214 | } | ||
215 | |||
216 | static int geneve_gro_complete(struct sk_buff *skb, int nhoff, | ||
217 | struct udp_offload *uoff) | ||
218 | { | ||
219 | struct genevehdr *gh; | ||
220 | struct packet_offload *ptype; | ||
221 | __be16 type; | ||
222 | int gh_len; | ||
223 | int err = -ENOSYS; | ||
224 | |||
225 | udp_tunnel_gro_complete(skb, nhoff); | ||
226 | |||
227 | gh = (struct genevehdr *)(skb->data + nhoff); | ||
228 | gh_len = geneve_hlen(gh); | ||
229 | type = gh->proto_type; | ||
230 | |||
231 | rcu_read_lock(); | ||
232 | ptype = gro_find_complete_by_type(type); | ||
233 | if (ptype != NULL) | ||
234 | err = ptype->callbacks.gro_complete(skb, nhoff + gh_len); | ||
235 | |||
236 | rcu_read_unlock(); | ||
237 | return err; | ||
238 | } | ||
239 | |||
152 | static void geneve_notify_add_rx_port(struct geneve_sock *gs) | 240 | static void geneve_notify_add_rx_port(struct geneve_sock *gs) |
153 | { | 241 | { |
154 | struct sock *sk = gs->sock->sk; | 242 | struct sock *sk = gs->sock->sk; |
@@ -214,15 +302,6 @@ error: | |||
214 | return 1; | 302 | return 1; |
215 | } | 303 | } |
216 | 304 | ||
217 | static void geneve_del_work(struct work_struct *work) | ||
218 | { | ||
219 | struct geneve_sock *gs = container_of(work, struct geneve_sock, | ||
220 | del_work); | ||
221 | |||
222 | udp_tunnel_sock_release(gs->sock); | ||
223 | kfree_rcu(gs, rcu); | ||
224 | } | ||
225 | |||
226 | static struct socket *geneve_create_sock(struct net *net, bool ipv6, | 305 | static struct socket *geneve_create_sock(struct net *net, bool ipv6, |
227 | __be16 port) | 306 | __be16 port) |
228 | { | 307 | { |
@@ -263,8 +342,6 @@ static struct geneve_sock *geneve_socket_create(struct net *net, __be16 port, | |||
263 | if (!gs) | 342 | if (!gs) |
264 | return ERR_PTR(-ENOMEM); | 343 | return ERR_PTR(-ENOMEM); |
265 | 344 | ||
266 | INIT_WORK(&gs->del_work, geneve_del_work); | ||
267 | |||
268 | sock = geneve_create_sock(net, ipv6, port); | 345 | sock = geneve_create_sock(net, ipv6, port); |
269 | if (IS_ERR(sock)) { | 346 | if (IS_ERR(sock)) { |
270 | kfree(gs); | 347 | kfree(gs); |
@@ -272,19 +349,15 @@ static struct geneve_sock *geneve_socket_create(struct net *net, __be16 port, | |||
272 | } | 349 | } |
273 | 350 | ||
274 | gs->sock = sock; | 351 | gs->sock = sock; |
275 | atomic_set(&gs->refcnt, 1); | 352 | gs->refcnt = 1; |
276 | gs->rcv = rcv; | 353 | gs->rcv = rcv; |
277 | gs->rcv_data = data; | 354 | gs->rcv_data = data; |
278 | 355 | ||
279 | /* Initialize the geneve udp offloads structure */ | 356 | /* Initialize the geneve udp offloads structure */ |
280 | gs->udp_offloads.port = port; | 357 | gs->udp_offloads.port = port; |
281 | gs->udp_offloads.callbacks.gro_receive = NULL; | 358 | gs->udp_offloads.callbacks.gro_receive = geneve_gro_receive; |
282 | gs->udp_offloads.callbacks.gro_complete = NULL; | 359 | gs->udp_offloads.callbacks.gro_complete = geneve_gro_complete; |
283 | |||
284 | spin_lock(&gn->sock_lock); | ||
285 | hlist_add_head_rcu(&gs->hlist, gs_head(net, port)); | ||
286 | geneve_notify_add_rx_port(gs); | 360 | geneve_notify_add_rx_port(gs); |
287 | spin_unlock(&gn->sock_lock); | ||
288 | 361 | ||
289 | /* Mark socket as an encapsulation socket */ | 362 | /* Mark socket as an encapsulation socket */ |
290 | tunnel_cfg.sk_user_data = gs; | 363 | tunnel_cfg.sk_user_data = gs; |
@@ -293,6 +366,8 @@ static struct geneve_sock *geneve_socket_create(struct net *net, __be16 port, | |||
293 | tunnel_cfg.encap_destroy = NULL; | 366 | tunnel_cfg.encap_destroy = NULL; |
294 | setup_udp_tunnel_sock(net, sock, &tunnel_cfg); | 367 | setup_udp_tunnel_sock(net, sock, &tunnel_cfg); |
295 | 368 | ||
369 | list_add(&gs->list, &gn->sock_list); | ||
370 | |||
296 | return gs; | 371 | return gs; |
297 | } | 372 | } |
298 | 373 | ||
@@ -300,25 +375,21 @@ struct geneve_sock *geneve_sock_add(struct net *net, __be16 port, | |||
300 | geneve_rcv_t *rcv, void *data, | 375 | geneve_rcv_t *rcv, void *data, |
301 | bool no_share, bool ipv6) | 376 | bool no_share, bool ipv6) |
302 | { | 377 | { |
303 | struct geneve_net *gn = net_generic(net, geneve_net_id); | ||
304 | struct geneve_sock *gs; | 378 | struct geneve_sock *gs; |
305 | 379 | ||
306 | gs = geneve_socket_create(net, port, rcv, data, ipv6); | 380 | mutex_lock(&geneve_mutex); |
307 | if (!IS_ERR(gs)) | ||
308 | return gs; | ||
309 | |||
310 | if (no_share) /* Return error if sharing is not allowed. */ | ||
311 | return ERR_PTR(-EINVAL); | ||
312 | 381 | ||
313 | spin_lock(&gn->sock_lock); | 382 | gs = geneve_find_sock(net, ipv6 ? AF_INET6 : AF_INET, port); |
314 | gs = geneve_find_sock(net, port); | 383 | if (gs) { |
315 | if (gs && ((gs->rcv != rcv) || | 384 | if (!no_share && gs->rcv == rcv) |
316 | !atomic_add_unless(&gs->refcnt, 1, 0))) | 385 | gs->refcnt++; |
386 | else | ||
317 | gs = ERR_PTR(-EBUSY); | 387 | gs = ERR_PTR(-EBUSY); |
318 | spin_unlock(&gn->sock_lock); | 388 | } else { |
389 | gs = geneve_socket_create(net, port, rcv, data, ipv6); | ||
390 | } | ||
319 | 391 | ||
320 | if (!gs) | 392 | mutex_unlock(&geneve_mutex); |
321 | gs = ERR_PTR(-EINVAL); | ||
322 | 393 | ||
323 | return gs; | 394 | return gs; |
324 | } | 395 | } |
@@ -326,37 +397,32 @@ EXPORT_SYMBOL_GPL(geneve_sock_add); | |||
326 | 397 | ||
327 | void geneve_sock_release(struct geneve_sock *gs) | 398 | void geneve_sock_release(struct geneve_sock *gs) |
328 | { | 399 | { |
329 | struct net *net = sock_net(gs->sock->sk); | 400 | mutex_lock(&geneve_mutex); |
330 | struct geneve_net *gn = net_generic(net, geneve_net_id); | ||
331 | 401 | ||
332 | if (!atomic_dec_and_test(&gs->refcnt)) | 402 | if (--gs->refcnt) |
333 | return; | 403 | goto unlock; |
334 | 404 | ||
335 | spin_lock(&gn->sock_lock); | 405 | list_del(&gs->list); |
336 | hlist_del_rcu(&gs->hlist); | ||
337 | geneve_notify_del_rx_port(gs); | 406 | geneve_notify_del_rx_port(gs); |
338 | spin_unlock(&gn->sock_lock); | 407 | udp_tunnel_sock_release(gs->sock); |
408 | kfree_rcu(gs, rcu); | ||
339 | 409 | ||
340 | queue_work(geneve_wq, &gs->del_work); | 410 | unlock: |
411 | mutex_unlock(&geneve_mutex); | ||
341 | } | 412 | } |
342 | EXPORT_SYMBOL_GPL(geneve_sock_release); | 413 | EXPORT_SYMBOL_GPL(geneve_sock_release); |
343 | 414 | ||
344 | static __net_init int geneve_init_net(struct net *net) | 415 | static __net_init int geneve_init_net(struct net *net) |
345 | { | 416 | { |
346 | struct geneve_net *gn = net_generic(net, geneve_net_id); | 417 | struct geneve_net *gn = net_generic(net, geneve_net_id); |
347 | unsigned int h; | ||
348 | 418 | ||
349 | spin_lock_init(&gn->sock_lock); | 419 | INIT_LIST_HEAD(&gn->sock_list); |
350 | |||
351 | for (h = 0; h < PORT_HASH_SIZE; ++h) | ||
352 | INIT_HLIST_HEAD(&gn->sock_list[h]); | ||
353 | 420 | ||
354 | return 0; | 421 | return 0; |
355 | } | 422 | } |
356 | 423 | ||
357 | static struct pernet_operations geneve_net_ops = { | 424 | static struct pernet_operations geneve_net_ops = { |
358 | .init = geneve_init_net, | 425 | .init = geneve_init_net, |
359 | .exit = NULL, | ||
360 | .id = &geneve_net_id, | 426 | .id = &geneve_net_id, |
361 | .size = sizeof(struct geneve_net), | 427 | .size = sizeof(struct geneve_net), |
362 | }; | 428 | }; |
@@ -365,10 +431,6 @@ static int __init geneve_init_module(void) | |||
365 | { | 431 | { |
366 | int rc; | 432 | int rc; |
367 | 433 | ||
368 | geneve_wq = alloc_workqueue("geneve", 0, 0); | ||
369 | if (!geneve_wq) | ||
370 | return -ENOMEM; | ||
371 | |||
372 | rc = register_pernet_subsys(&geneve_net_ops); | 434 | rc = register_pernet_subsys(&geneve_net_ops); |
373 | if (rc) | 435 | if (rc) |
374 | return rc; | 436 | return rc; |
@@ -377,11 +439,10 @@ static int __init geneve_init_module(void) | |||
377 | 439 | ||
378 | return 0; | 440 | return 0; |
379 | } | 441 | } |
380 | late_initcall(geneve_init_module); | 442 | module_init(geneve_init_module); |
381 | 443 | ||
382 | static void __exit geneve_cleanup_module(void) | 444 | static void __exit geneve_cleanup_module(void) |
383 | { | 445 | { |
384 | destroy_workqueue(geneve_wq); | ||
385 | unregister_pernet_subsys(&geneve_net_ops); | 446 | unregister_pernet_subsys(&geneve_net_ops); |
386 | } | 447 | } |
387 | module_exit(geneve_cleanup_module); | 448 | module_exit(geneve_cleanup_module); |
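
A hedged sketch of the share-or-create idiom the geneve socket code switches to here, reduced to its core: one mutex guards a plain list and the per-entry reference counts, a lookup either bumps the count of an existing entry or creates a new one, and the last release tears the entry down. Every identifier below (foo_sock, foo_create, foo_destroy, ...) is a placeholder, not a kernel symbol.

#include <linux/err.h>
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/types.h>

struct foo_sock {
	struct list_head list;
	int		 refcnt;
	__be16		 port;
};

static DEFINE_MUTEX(foo_mutex);		/* protects foo_list and refcnt */
static LIST_HEAD(foo_list);

/* hypothetical helpers, not kernel symbols: allocate/link and unlink/free */
struct foo_sock *foo_create(__be16 port);
void foo_destroy(struct foo_sock *fs);

struct foo_sock *foo_sock_add(__be16 port, bool no_share)
{
	struct foo_sock *fs;

	mutex_lock(&foo_mutex);
	list_for_each_entry(fs, &foo_list, list) {
		if (fs->port != port)
			continue;
		if (no_share)
			fs = ERR_PTR(-EBUSY);	/* in use, sharing refused */
		else
			fs->refcnt++;		/* share the existing entry */
		goto out;
	}
	fs = foo_create(port);			/* none found: create one */
out:
	mutex_unlock(&foo_mutex);
	return fs;
}

void foo_sock_release(struct foo_sock *fs)
{
	mutex_lock(&foo_mutex);
	if (--fs->refcnt == 0) {
		list_del(&fs->list);
		foo_destroy(fs);		/* last user: tear it down */
	}
	mutex_unlock(&foo_mutex);
}
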
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 36f5584d93c5..5e564014a0b7 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c | |||
@@ -205,7 +205,7 @@ static const struct icmp_control icmp_pointers[NR_ICMP_TYPES+1]; | |||
205 | */ | 205 | */ |
206 | static struct sock *icmp_sk(struct net *net) | 206 | static struct sock *icmp_sk(struct net *net) |
207 | { | 207 | { |
208 | return net->ipv4.icmp_sk[smp_processor_id()]; | 208 | return *this_cpu_ptr(net->ipv4.icmp_sk); |
209 | } | 209 | } |
210 | 210 | ||
211 | static inline struct sock *icmp_xmit_lock(struct net *net) | 211 | static inline struct sock *icmp_xmit_lock(struct net *net) |
@@ -1140,8 +1140,8 @@ static void __net_exit icmp_sk_exit(struct net *net) | |||
1140 | int i; | 1140 | int i; |
1141 | 1141 | ||
1142 | for_each_possible_cpu(i) | 1142 | for_each_possible_cpu(i) |
1143 | inet_ctl_sock_destroy(net->ipv4.icmp_sk[i]); | 1143 | inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.icmp_sk, i)); |
1144 | kfree(net->ipv4.icmp_sk); | 1144 | free_percpu(net->ipv4.icmp_sk); |
1145 | net->ipv4.icmp_sk = NULL; | 1145 | net->ipv4.icmp_sk = NULL; |
1146 | } | 1146 | } |
1147 | 1147 | ||
@@ -1149,9 +1149,8 @@ static int __net_init icmp_sk_init(struct net *net) | |||
1149 | { | 1149 | { |
1150 | int i, err; | 1150 | int i, err; |
1151 | 1151 | ||
1152 | net->ipv4.icmp_sk = | 1152 | net->ipv4.icmp_sk = alloc_percpu(struct sock *); |
1153 | kzalloc(nr_cpu_ids * sizeof(struct sock *), GFP_KERNEL); | 1153 | if (!net->ipv4.icmp_sk) |
1154 | if (net->ipv4.icmp_sk == NULL) | ||
1155 | return -ENOMEM; | 1154 | return -ENOMEM; |
1156 | 1155 | ||
1157 | for_each_possible_cpu(i) { | 1156 | for_each_possible_cpu(i) { |
@@ -1162,7 +1161,7 @@ static int __net_init icmp_sk_init(struct net *net) | |||
1162 | if (err < 0) | 1161 | if (err < 0) |
1163 | goto fail; | 1162 | goto fail; |
1164 | 1163 | ||
1165 | net->ipv4.icmp_sk[i] = sk; | 1164 | *per_cpu_ptr(net->ipv4.icmp_sk, i) = sk; |
1166 | 1165 | ||
1167 | /* Enough space for 2 64K ICMP packets, including | 1166 | /* Enough space for 2 64K ICMP packets, including |
1168 | * sk_buff/skb_shared_info struct overhead. | 1167 | * sk_buff/skb_shared_info struct overhead. |
@@ -1203,8 +1202,8 @@ static int __net_init icmp_sk_init(struct net *net) | |||
1203 | 1202 | ||
1204 | fail: | 1203 | fail: |
1205 | for_each_possible_cpu(i) | 1204 | for_each_possible_cpu(i) |
1206 | inet_ctl_sock_destroy(net->ipv4.icmp_sk[i]); | 1205 | inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.icmp_sk, i)); |
1207 | kfree(net->ipv4.icmp_sk); | 1206 | free_percpu(net->ipv4.icmp_sk); |
1208 | return err; | 1207 | return err; |
1209 | } | 1208 | } |
1210 | 1209 | ||
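
A compact sketch of the percpu-pointer idiom icmp_sk moves to: allocate one struct sock * slot per CPU with alloc_percpu(), fill each slot via per_cpu_ptr(), read the local CPU's slot with this_cpu_ptr(), and release everything with free_percpu(). The foo_* names are illustrative only.

#include <linux/errno.h>
#include <linux/percpu.h>
#include <net/sock.h>

static struct sock * __percpu *foo_sk;	/* illustrative, not a kernel symbol */

static int foo_alloc(void)
{
	foo_sk = alloc_percpu(struct sock *);	/* one pointer slot per CPU */
	return foo_sk ? 0 : -ENOMEM;
}

static void foo_set(int cpu, struct sock *sk)
{
	*per_cpu_ptr(foo_sk, cpu) = sk;		/* fill a given CPU's slot */
}

static struct sock *foo_local(void)
{
	return *this_cpu_ptr(foo_sk);		/* the calling CPU's socket */
}

static void foo_free(void)
{
	free_percpu(foo_sk);			/* releases all per-CPU slots */
}
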
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index e34dccbc4d70..81751f12645f 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c | |||
@@ -203,7 +203,8 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, | |||
203 | icsk->icsk_ca_ops->get_info(sk, ext, skb); | 203 | icsk->icsk_ca_ops->get_info(sk, ext, skb); |
204 | 204 | ||
205 | out: | 205 | out: |
206 | return nlmsg_end(skb, nlh); | 206 | nlmsg_end(skb, nlh); |
207 | return 0; | ||
207 | 208 | ||
208 | errout: | 209 | errout: |
209 | nlmsg_cancel(skb, nlh); | 210 | nlmsg_cancel(skb, nlh); |
@@ -271,7 +272,8 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw, | |||
271 | } | 272 | } |
272 | #endif | 273 | #endif |
273 | 274 | ||
274 | return nlmsg_end(skb, nlh); | 275 | nlmsg_end(skb, nlh); |
276 | return 0; | ||
275 | } | 277 | } |
276 | 278 | ||
277 | static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, | 279 | static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, |
@@ -758,7 +760,8 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk, | |||
758 | } | 760 | } |
759 | #endif | 761 | #endif |
760 | 762 | ||
761 | return nlmsg_end(skb, nlh); | 763 | nlmsg_end(skb, nlh); |
764 | return 0; | ||
762 | } | 765 | } |
763 | 766 | ||
764 | static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, | 767 | static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, |
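
The inet_diag hunks above settle on the common netlink fill convention: build the message, call nlmsg_end() to patch up the length, and return 0, with nlmsg_cancel() on the error path. A generic, made-up fill function showing that shape (the message and attribute types are placeholders, not real netlink constants):

#include <net/netlink.h>

static int foo_fill(struct sk_buff *skb, u32 portid, u32 seq,
		    u16 type, u16 flags)
{
	struct nlmsghdr *nlh;

	nlh = nlmsg_put(skb, portid, seq, type, 0, flags);
	if (!nlh)
		return -EMSGSIZE;

	/* attribute type and value are placeholders */
	if (nla_put_u32(skb, 1 /* FOO_ATTR_EXAMPLE */, 42))
		goto nla_put_failure;

	nlmsg_end(skb, nlh);	/* finalize nlmsg_len, then report success */
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);	/* roll the partial message back out */
	return -EMSGSIZE;
}
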
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 3a83ce5efa80..787b3c294ce6 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c | |||
@@ -129,7 +129,8 @@ int ip_forward(struct sk_buff *skb) | |||
129 | * We now generate an ICMP HOST REDIRECT giving the route | 129 | * We now generate an ICMP HOST REDIRECT giving the route |
130 | * we calculated. | 130 | * we calculated. |
131 | */ | 131 | */ |
132 | if (rt->rt_flags&RTCF_DOREDIRECT && !opt->srr && !skb_sec_path(skb)) | 132 | if (IPCB(skb)->flags & IPSKB_DOREDIRECT && !opt->srr && |
133 | !skb_sec_path(skb)) | ||
133 | ip_rt_send_redirect(skb); | 134 | ip_rt_send_redirect(skb); |
134 | 135 | ||
135 | skb->priority = rt_tos2priority(iph->tos); | 136 | skb->priority = rt_tos2priority(iph->tos); |
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 4f4bf5b99686..6207275fc749 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c | |||
@@ -659,12 +659,12 @@ static bool ipgre_netlink_encap_parms(struct nlattr *data[], | |||
659 | 659 | ||
660 | if (data[IFLA_GRE_ENCAP_SPORT]) { | 660 | if (data[IFLA_GRE_ENCAP_SPORT]) { |
661 | ret = true; | 661 | ret = true; |
662 | ipencap->sport = nla_get_u16(data[IFLA_GRE_ENCAP_SPORT]); | 662 | ipencap->sport = nla_get_be16(data[IFLA_GRE_ENCAP_SPORT]); |
663 | } | 663 | } |
664 | 664 | ||
665 | if (data[IFLA_GRE_ENCAP_DPORT]) { | 665 | if (data[IFLA_GRE_ENCAP_DPORT]) { |
666 | ret = true; | 666 | ret = true; |
667 | ipencap->dport = nla_get_u16(data[IFLA_GRE_ENCAP_DPORT]); | 667 | ipencap->dport = nla_get_be16(data[IFLA_GRE_ENCAP_DPORT]); |
668 | } | 668 | } |
669 | 669 | ||
670 | return ret; | 670 | return ret; |
@@ -673,6 +673,7 @@ static bool ipgre_netlink_encap_parms(struct nlattr *data[], | |||
673 | static int gre_tap_init(struct net_device *dev) | 673 | static int gre_tap_init(struct net_device *dev) |
674 | { | 674 | { |
675 | __gre_tunnel_init(dev); | 675 | __gre_tunnel_init(dev); |
676 | dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; | ||
676 | 677 | ||
677 | return ip_tunnel_init(dev); | 678 | return ip_tunnel_init(dev); |
678 | } | 679 | } |
@@ -785,10 +786,10 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev) | |||
785 | 786 | ||
786 | if (nla_put_u16(skb, IFLA_GRE_ENCAP_TYPE, | 787 | if (nla_put_u16(skb, IFLA_GRE_ENCAP_TYPE, |
787 | t->encap.type) || | 788 | t->encap.type) || |
788 | nla_put_u16(skb, IFLA_GRE_ENCAP_SPORT, | 789 | nla_put_be16(skb, IFLA_GRE_ENCAP_SPORT, |
789 | t->encap.sport) || | 790 | t->encap.sport) || |
790 | nla_put_u16(skb, IFLA_GRE_ENCAP_DPORT, | 791 | nla_put_be16(skb, IFLA_GRE_ENCAP_DPORT, |
791 | t->encap.dport) || | 792 | t->encap.dport) || |
792 | nla_put_u16(skb, IFLA_GRE_ENCAP_FLAGS, | 793 | nla_put_u16(skb, IFLA_GRE_ENCAP_FLAGS, |
793 | t->encap.flags)) | 794 | t->encap.flags)) |
794 | goto nla_put_failure; | 795 | goto nla_put_failure; |
@@ -828,6 +829,7 @@ static struct rtnl_link_ops ipgre_link_ops __read_mostly = { | |||
828 | .dellink = ip_tunnel_dellink, | 829 | .dellink = ip_tunnel_dellink, |
829 | .get_size = ipgre_get_size, | 830 | .get_size = ipgre_get_size, |
830 | .fill_info = ipgre_fill_info, | 831 | .fill_info = ipgre_fill_info, |
832 | .get_link_net = ip_tunnel_get_link_net, | ||
831 | }; | 833 | }; |
832 | 834 | ||
833 | static struct rtnl_link_ops ipgre_tap_ops __read_mostly = { | 835 | static struct rtnl_link_ops ipgre_tap_ops __read_mostly = { |
@@ -842,6 +844,7 @@ static struct rtnl_link_ops ipgre_tap_ops __read_mostly = { | |||
842 | .dellink = ip_tunnel_dellink, | 844 | .dellink = ip_tunnel_dellink, |
843 | .get_size = ipgre_get_size, | 845 | .get_size = ipgre_get_size, |
844 | .fill_info = ipgre_fill_info, | 846 | .fill_info = ipgre_fill_info, |
847 | .get_link_net = ip_tunnel_get_link_net, | ||
845 | }; | 848 | }; |
846 | 849 | ||
847 | static int __net_init ipgre_tap_init_net(struct net *net) | 850 | static int __net_init ipgre_tap_init_net(struct net *net) |
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index b50861b22b6b..d68199d9b2b0 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c | |||
@@ -755,13 +755,11 @@ ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk | |||
755 | struct msghdr *msg = from; | 755 | struct msghdr *msg = from; |
756 | 756 | ||
757 | if (skb->ip_summed == CHECKSUM_PARTIAL) { | 757 | if (skb->ip_summed == CHECKSUM_PARTIAL) { |
758 | /* XXX: stripping const */ | 758 | if (copy_from_iter(to, len, &msg->msg_iter) != len) |
759 | if (memcpy_fromiovecend(to, (struct iovec *)msg->msg_iter.iov, offset, len) < 0) | ||
760 | return -EFAULT; | 759 | return -EFAULT; |
761 | } else { | 760 | } else { |
762 | __wsum csum = 0; | 761 | __wsum csum = 0; |
763 | /* XXX: stripping const */ | 762 | if (csum_and_copy_from_iter(to, len, &csum, &msg->msg_iter) != len) |
764 | if (csum_partial_copy_fromiovecend(to, (struct iovec *)msg->msg_iter.iov, offset, len, &csum) < 0) | ||
765 | return -EFAULT; | 763 | return -EFAULT; |
766 | skb->csum = csum_block_add(skb->csum, csum, odd); | 764 | skb->csum = csum_block_add(skb->csum, csum, odd); |
767 | } | 765 | } |
@@ -1506,23 +1504,8 @@ static int ip_reply_glue_bits(void *dptr, char *to, int offset, | |||
1506 | /* | 1504 | /* |
1507 | * Generic function to send a packet as reply to another packet. | 1505 | * Generic function to send a packet as reply to another packet. |
1508 | * Used to send some TCP resets/acks so far. | 1506 | * Used to send some TCP resets/acks so far. |
1509 | * | ||
1510 | * Use a fake percpu inet socket to avoid false sharing and contention. | ||
1511 | */ | 1507 | */ |
1512 | static DEFINE_PER_CPU(struct inet_sock, unicast_sock) = { | 1508 | void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, |
1513 | .sk = { | ||
1514 | .__sk_common = { | ||
1515 | .skc_refcnt = ATOMIC_INIT(1), | ||
1516 | }, | ||
1517 | .sk_wmem_alloc = ATOMIC_INIT(1), | ||
1518 | .sk_allocation = GFP_ATOMIC, | ||
1519 | .sk_flags = (1UL << SOCK_USE_WRITE_QUEUE), | ||
1520 | }, | ||
1521 | .pmtudisc = IP_PMTUDISC_WANT, | ||
1522 | .uc_ttl = -1, | ||
1523 | }; | ||
1524 | |||
1525 | void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, | ||
1526 | const struct ip_options *sopt, | 1509 | const struct ip_options *sopt, |
1527 | __be32 daddr, __be32 saddr, | 1510 | __be32 daddr, __be32 saddr, |
1528 | const struct ip_reply_arg *arg, | 1511 | const struct ip_reply_arg *arg, |
@@ -1532,9 +1515,8 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, | |||
1532 | struct ipcm_cookie ipc; | 1515 | struct ipcm_cookie ipc; |
1533 | struct flowi4 fl4; | 1516 | struct flowi4 fl4; |
1534 | struct rtable *rt = skb_rtable(skb); | 1517 | struct rtable *rt = skb_rtable(skb); |
1518 | struct net *net = sock_net(sk); | ||
1535 | struct sk_buff *nskb; | 1519 | struct sk_buff *nskb; |
1536 | struct sock *sk; | ||
1537 | struct inet_sock *inet; | ||
1538 | int err; | 1520 | int err; |
1539 | 1521 | ||
1540 | if (__ip_options_echo(&replyopts.opt.opt, skb, sopt)) | 1522 | if (__ip_options_echo(&replyopts.opt.opt, skb, sopt)) |
@@ -1565,15 +1547,11 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, | |||
1565 | if (IS_ERR(rt)) | 1547 | if (IS_ERR(rt)) |
1566 | return; | 1548 | return; |
1567 | 1549 | ||
1568 | inet = &get_cpu_var(unicast_sock); | 1550 | inet_sk(sk)->tos = arg->tos; |
1569 | 1551 | ||
1570 | inet->tos = arg->tos; | ||
1571 | sk = &inet->sk; | ||
1572 | sk->sk_priority = skb->priority; | 1552 | sk->sk_priority = skb->priority; |
1573 | sk->sk_protocol = ip_hdr(skb)->protocol; | 1553 | sk->sk_protocol = ip_hdr(skb)->protocol; |
1574 | sk->sk_bound_dev_if = arg->bound_dev_if; | 1554 | sk->sk_bound_dev_if = arg->bound_dev_if; |
1575 | sock_net_set(sk, net); | ||
1576 | __skb_queue_head_init(&sk->sk_write_queue); | ||
1577 | sk->sk_sndbuf = sysctl_wmem_default; | 1555 | sk->sk_sndbuf = sysctl_wmem_default; |
1578 | err = ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, | 1556 | err = ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, |
1579 | len, 0, &ipc, &rt, MSG_DONTWAIT); | 1557 | len, 0, &ipc, &rt, MSG_DONTWAIT); |
@@ -1589,13 +1567,10 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, | |||
1589 | arg->csumoffset) = csum_fold(csum_add(nskb->csum, | 1567 | arg->csumoffset) = csum_fold(csum_add(nskb->csum, |
1590 | arg->csum)); | 1568 | arg->csum)); |
1591 | nskb->ip_summed = CHECKSUM_NONE; | 1569 | nskb->ip_summed = CHECKSUM_NONE; |
1592 | skb_orphan(nskb); | ||
1593 | skb_set_queue_mapping(nskb, skb_get_queue_mapping(skb)); | 1570 | skb_set_queue_mapping(nskb, skb_get_queue_mapping(skb)); |
1594 | ip_push_pending_frames(sk, &fl4); | 1571 | ip_push_pending_frames(sk, &fl4); |
1595 | } | 1572 | } |
1596 | out: | 1573 | out: |
1597 | put_cpu_var(unicast_sock); | ||
1598 | |||
1599 | ip_rt_put(rt); | 1574 | ip_rt_put(rt); |
1600 | } | 1575 | } |
1601 | 1576 | ||
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 8a89c738b7a3..31d8c71986b4 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c | |||
@@ -37,6 +37,7 @@ | |||
37 | #include <net/route.h> | 37 | #include <net/route.h> |
38 | #include <net/xfrm.h> | 38 | #include <net/xfrm.h> |
39 | #include <net/compat.h> | 39 | #include <net/compat.h> |
40 | #include <net/checksum.h> | ||
40 | #if IS_ENABLED(CONFIG_IPV6) | 41 | #if IS_ENABLED(CONFIG_IPV6) |
41 | #include <net/transp_v6.h> | 42 | #include <net/transp_v6.h> |
42 | #endif | 43 | #endif |
@@ -45,14 +46,6 @@ | |||
45 | #include <linux/errqueue.h> | 46 | #include <linux/errqueue.h> |
46 | #include <asm/uaccess.h> | 47 | #include <asm/uaccess.h> |
47 | 48 | ||
48 | #define IP_CMSG_PKTINFO 1 | ||
49 | #define IP_CMSG_TTL 2 | ||
50 | #define IP_CMSG_TOS 4 | ||
51 | #define IP_CMSG_RECVOPTS 8 | ||
52 | #define IP_CMSG_RETOPTS 16 | ||
53 | #define IP_CMSG_PASSSEC 32 | ||
54 | #define IP_CMSG_ORIGDSTADDR 64 | ||
55 | |||
56 | /* | 49 | /* |
57 | * SOL_IP control messages. | 50 | * SOL_IP control messages. |
58 | */ | 51 | */ |
@@ -104,6 +97,20 @@ static void ip_cmsg_recv_retopts(struct msghdr *msg, struct sk_buff *skb) | |||
104 | put_cmsg(msg, SOL_IP, IP_RETOPTS, opt->optlen, opt->__data); | 97 | put_cmsg(msg, SOL_IP, IP_RETOPTS, opt->optlen, opt->__data); |
105 | } | 98 | } |
106 | 99 | ||
100 | static void ip_cmsg_recv_checksum(struct msghdr *msg, struct sk_buff *skb, | ||
101 | int offset) | ||
102 | { | ||
103 | __wsum csum = skb->csum; | ||
104 | |||
105 | if (skb->ip_summed != CHECKSUM_COMPLETE) | ||
106 | return; | ||
107 | |||
108 | if (offset != 0) | ||
109 | csum = csum_sub(csum, csum_partial(skb->data, offset, 0)); | ||
110 | |||
111 | put_cmsg(msg, SOL_IP, IP_CHECKSUM, sizeof(__wsum), &csum); | ||
112 | } | ||
113 | |||
107 | static void ip_cmsg_recv_security(struct msghdr *msg, struct sk_buff *skb) | 114 | static void ip_cmsg_recv_security(struct msghdr *msg, struct sk_buff *skb) |
108 | { | 115 | { |
109 | char *secdata; | 116 | char *secdata; |
@@ -144,47 +151,73 @@ static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb) | |||
144 | put_cmsg(msg, SOL_IP, IP_ORIGDSTADDR, sizeof(sin), &sin); | 151 | put_cmsg(msg, SOL_IP, IP_ORIGDSTADDR, sizeof(sin), &sin); |
145 | } | 152 | } |
146 | 153 | ||
147 | void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb) | 154 | void ip_cmsg_recv_offset(struct msghdr *msg, struct sk_buff *skb, |
155 | int offset) | ||
148 | { | 156 | { |
149 | struct inet_sock *inet = inet_sk(skb->sk); | 157 | struct inet_sock *inet = inet_sk(skb->sk); |
150 | unsigned int flags = inet->cmsg_flags; | 158 | unsigned int flags = inet->cmsg_flags; |
151 | 159 | ||
152 | /* Ordered by supposed usage frequency */ | 160 | /* Ordered by supposed usage frequency */ |
153 | if (flags & 1) | 161 | if (flags & IP_CMSG_PKTINFO) { |
154 | ip_cmsg_recv_pktinfo(msg, skb); | 162 | ip_cmsg_recv_pktinfo(msg, skb); |
155 | if ((flags >>= 1) == 0) | ||
156 | return; | ||
157 | 163 | ||
158 | if (flags & 1) | 164 | flags &= ~IP_CMSG_PKTINFO; |
165 | if (!flags) | ||
166 | return; | ||
167 | } | ||
168 | |||
169 | if (flags & IP_CMSG_TTL) { | ||
159 | ip_cmsg_recv_ttl(msg, skb); | 170 | ip_cmsg_recv_ttl(msg, skb); |
160 | if ((flags >>= 1) == 0) | ||
161 | return; | ||
162 | 171 | ||
163 | if (flags & 1) | 172 | flags &= ~IP_CMSG_TTL; |
173 | if (!flags) | ||
174 | return; | ||
175 | } | ||
176 | |||
177 | if (flags & IP_CMSG_TOS) { | ||
164 | ip_cmsg_recv_tos(msg, skb); | 178 | ip_cmsg_recv_tos(msg, skb); |
165 | if ((flags >>= 1) == 0) | ||
166 | return; | ||
167 | 179 | ||
168 | if (flags & 1) | 180 | flags &= ~IP_CMSG_TOS; |
181 | if (!flags) | ||
182 | return; | ||
183 | } | ||
184 | |||
185 | if (flags & IP_CMSG_RECVOPTS) { | ||
169 | ip_cmsg_recv_opts(msg, skb); | 186 | ip_cmsg_recv_opts(msg, skb); |
170 | if ((flags >>= 1) == 0) | ||
171 | return; | ||
172 | 187 | ||
173 | if (flags & 1) | 188 | flags &= ~IP_CMSG_RECVOPTS; |
189 | if (!flags) | ||
190 | return; | ||
191 | } | ||
192 | |||
193 | if (flags & IP_CMSG_RETOPTS) { | ||
174 | ip_cmsg_recv_retopts(msg, skb); | 194 | ip_cmsg_recv_retopts(msg, skb); |
175 | if ((flags >>= 1) == 0) | ||
176 | return; | ||
177 | 195 | ||
178 | if (flags & 1) | 196 | flags &= ~IP_CMSG_RETOPTS; |
197 | if (!flags) | ||
198 | return; | ||
199 | } | ||
200 | |||
201 | if (flags & IP_CMSG_PASSSEC) { | ||
179 | ip_cmsg_recv_security(msg, skb); | 202 | ip_cmsg_recv_security(msg, skb); |
180 | 203 | ||
181 | if ((flags >>= 1) == 0) | 204 | flags &= ~IP_CMSG_PASSSEC; |
182 | return; | 205 | if (!flags) |
183 | if (flags & 1) | 206 | return; |
207 | } | ||
208 | |||
209 | if (flags & IP_CMSG_ORIGDSTADDR) { | ||
184 | ip_cmsg_recv_dstaddr(msg, skb); | 210 | ip_cmsg_recv_dstaddr(msg, skb); |
185 | 211 | ||
212 | flags &= ~IP_CMSG_ORIGDSTADDR; | ||
213 | if (!flags) | ||
214 | return; | ||
215 | } | ||
216 | |||
217 | if (flags & IP_CMSG_CHECKSUM) | ||
218 | ip_cmsg_recv_checksum(msg, skb, offset); | ||
186 | } | 219 | } |
187 | EXPORT_SYMBOL(ip_cmsg_recv); | 220 | EXPORT_SYMBOL(ip_cmsg_recv_offset); |
188 | 221 | ||
189 | int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc, | 222 | int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc, |
190 | bool allow_ipv6) | 223 | bool allow_ipv6) |
@@ -450,7 +483,7 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) | |||
450 | 483 | ||
451 | serr = SKB_EXT_ERR(skb); | 484 | serr = SKB_EXT_ERR(skb); |
452 | 485 | ||
453 | if (sin) { | 486 | if (sin && skb->len) { |
454 | sin->sin_family = AF_INET; | 487 | sin->sin_family = AF_INET; |
455 | sin->sin_addr.s_addr = *(__be32 *)(skb_network_header(skb) + | 488 | sin->sin_addr.s_addr = *(__be32 *)(skb_network_header(skb) + |
456 | serr->addr_offset); | 489 | serr->addr_offset); |
@@ -461,17 +494,14 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) | |||
461 | 494 | ||
462 | memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err)); | 495 | memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err)); |
463 | sin = &errhdr.offender; | 496 | sin = &errhdr.offender; |
464 | sin->sin_family = AF_UNSPEC; | 497 | memset(sin, 0, sizeof(*sin)); |
465 | |||
466 | if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP || | ||
467 | ipv4_pktinfo_prepare_errqueue(sk, skb, serr->ee.ee_origin)) { | ||
468 | struct inet_sock *inet = inet_sk(sk); | ||
469 | 498 | ||
499 | if (skb->len && | ||
500 | (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP || | ||
501 | ipv4_pktinfo_prepare_errqueue(sk, skb, serr->ee.ee_origin))) { | ||
470 | sin->sin_family = AF_INET; | 502 | sin->sin_family = AF_INET; |
471 | sin->sin_addr.s_addr = ip_hdr(skb)->saddr; | 503 | sin->sin_addr.s_addr = ip_hdr(skb)->saddr; |
472 | sin->sin_port = 0; | 504 | if (inet_sk(sk)->cmsg_flags) |
473 | memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); | ||
474 | if (inet->cmsg_flags) | ||
475 | ip_cmsg_recv(msg, skb); | 505 | ip_cmsg_recv(msg, skb); |
476 | } | 506 | } |
477 | 507 | ||
@@ -522,6 +552,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, | |||
522 | case IP_MULTICAST_ALL: | 552 | case IP_MULTICAST_ALL: |
523 | case IP_MULTICAST_LOOP: | 553 | case IP_MULTICAST_LOOP: |
524 | case IP_RECVORIGDSTADDR: | 554 | case IP_RECVORIGDSTADDR: |
555 | case IP_CHECKSUM: | ||
525 | if (optlen >= sizeof(int)) { | 556 | if (optlen >= sizeof(int)) { |
526 | if (get_user(val, (int __user *) optval)) | 557 | if (get_user(val, (int __user *) optval)) |
527 | return -EFAULT; | 558 | return -EFAULT; |
@@ -619,6 +650,19 @@ static int do_ip_setsockopt(struct sock *sk, int level, | |||
619 | else | 650 | else |
620 | inet->cmsg_flags &= ~IP_CMSG_ORIGDSTADDR; | 651 | inet->cmsg_flags &= ~IP_CMSG_ORIGDSTADDR; |
621 | break; | 652 | break; |
653 | case IP_CHECKSUM: | ||
654 | if (val) { | ||
655 | if (!(inet->cmsg_flags & IP_CMSG_CHECKSUM)) { | ||
656 | inet_inc_convert_csum(sk); | ||
657 | inet->cmsg_flags |= IP_CMSG_CHECKSUM; | ||
658 | } | ||
659 | } else { | ||
660 | if (inet->cmsg_flags & IP_CMSG_CHECKSUM) { | ||
661 | inet_dec_convert_csum(sk); | ||
662 | inet->cmsg_flags &= ~IP_CMSG_CHECKSUM; | ||
663 | } | ||
664 | } | ||
665 | break; | ||
622 | case IP_TOS: /* This sets both TOS and Precedence */ | 666 | case IP_TOS: /* This sets both TOS and Precedence */ |
623 | if (sk->sk_type == SOCK_STREAM) { | 667 | if (sk->sk_type == SOCK_STREAM) { |
624 | val &= ~INET_ECN_MASK; | 668 | val &= ~INET_ECN_MASK; |
@@ -1222,6 +1266,9 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, | |||
1222 | case IP_RECVORIGDSTADDR: | 1266 | case IP_RECVORIGDSTADDR: |
1223 | val = (inet->cmsg_flags & IP_CMSG_ORIGDSTADDR) != 0; | 1267 | val = (inet->cmsg_flags & IP_CMSG_ORIGDSTADDR) != 0; |
1224 | break; | 1268 | break; |
1269 | case IP_CHECKSUM: | ||
1270 | val = (inet->cmsg_flags & IP_CMSG_CHECKSUM) != 0; | ||
1271 | break; | ||
1225 | case IP_TOS: | 1272 | case IP_TOS: |
1226 | val = inet->tos; | 1273 | val = inet->tos; |
1227 | break; | 1274 | break; |
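
On the user-space side, the new IP_CHECKSUM option is a boolean socket option that makes the kernel attach a 32-bit checksum ancillary message to received datagrams when one is available. A minimal UDP receiver sketch is below; it assumes IP_CHECKSUM is 23 (matching the uapi definition) in case the local headers do not yet provide it, and the port number is arbitrary.

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <netinet/in.h>
#include <arpa/inet.h>

#ifndef IP_CHECKSUM
#define IP_CHECKSUM 23			/* assumed value from linux/in.h */
#endif

int main(void)
{
	int one = 1;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	struct sockaddr_in addr = {
		.sin_family = AF_INET,
		.sin_port = htons(5555),	/* arbitrary test port */
		.sin_addr.s_addr = htonl(INADDR_ANY),
	};
	char data[2048], cbuf[256];
	struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
	};
	struct cmsghdr *cm;
	ssize_t n;

	if (fd < 0 || bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0)
		return 1;

	/* ask for an IP_CHECKSUM ancillary message on every datagram */
	setsockopt(fd, IPPROTO_IP, IP_CHECKSUM, &one, sizeof(one));

	n = recvmsg(fd, &msg, 0);
	for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
		if (cm->cmsg_level == IPPROTO_IP &&
		    cm->cmsg_type == IP_CHECKSUM) {
			uint32_t csum;

			memcpy(&csum, CMSG_DATA(cm), sizeof(csum));
			printf("got %zd bytes, csum 0x%08x\n", n, csum);
		}
	}
	return 0;
}

The cmsg is only delivered when the receive path has a CHECKSUM_COMPLETE value to hand over, so it may be absent depending on the NIC and path taken.
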
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index d3e447936720..2cd08280c77b 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c | |||
@@ -972,6 +972,14 @@ void ip_tunnel_dellink(struct net_device *dev, struct list_head *head) | |||
972 | } | 972 | } |
973 | EXPORT_SYMBOL_GPL(ip_tunnel_dellink); | 973 | EXPORT_SYMBOL_GPL(ip_tunnel_dellink); |
974 | 974 | ||
975 | struct net *ip_tunnel_get_link_net(const struct net_device *dev) | ||
976 | { | ||
977 | struct ip_tunnel *tunnel = netdev_priv(dev); | ||
978 | |||
979 | return tunnel->net; | ||
980 | } | ||
981 | EXPORT_SYMBOL(ip_tunnel_get_link_net); | ||
982 | |||
975 | int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id, | 983 | int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id, |
976 | struct rtnl_link_ops *ops, char *devname) | 984 | struct rtnl_link_ops *ops, char *devname) |
977 | { | 985 | { |
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 1a7e979e80ba..94efe148181c 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c | |||
@@ -531,6 +531,7 @@ static struct rtnl_link_ops vti_link_ops __read_mostly = { | |||
531 | .dellink = ip_tunnel_dellink, | 531 | .dellink = ip_tunnel_dellink, |
532 | .get_size = vti_get_size, | 532 | .get_size = vti_get_size, |
533 | .fill_info = vti_fill_info, | 533 | .fill_info = vti_fill_info, |
534 | .get_link_net = ip_tunnel_get_link_net, | ||
534 | }; | 535 | }; |
535 | 536 | ||
536 | static int __init vti_init(void) | 537 | static int __init vti_init(void) |
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 7fa18bc7e47f..b26376ef87f6 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c | |||
@@ -209,9 +209,9 @@ static int __init ic_open_devs(void) | |||
209 | last = &ic_first_dev; | 209 | last = &ic_first_dev; |
210 | rtnl_lock(); | 210 | rtnl_lock(); |
211 | 211 | ||
212 | /* bring loopback device up first */ | 212 | /* bring loopback and DSA master network devices up first */ |
213 | for_each_netdev(&init_net, dev) { | 213 | for_each_netdev(&init_net, dev) { |
214 | if (!(dev->flags & IFF_LOOPBACK)) | 214 | if (!(dev->flags & IFF_LOOPBACK) && !netdev_uses_dsa(dev)) |
215 | continue; | 215 | continue; |
216 | if (dev_change_flags(dev, dev->flags | IFF_UP) < 0) | 216 | if (dev_change_flags(dev, dev->flags | IFF_UP) < 0) |
217 | pr_err("IP-Config: Failed to open %s\n", dev->name); | 217 | pr_err("IP-Config: Failed to open %s\n", dev->name); |
@@ -306,7 +306,7 @@ static void __init ic_close_devs(void) | |||
306 | while ((d = next)) { | 306 | while ((d = next)) { |
307 | next = d->next; | 307 | next = d->next; |
308 | dev = d->dev; | 308 | dev = d->dev; |
309 | if (dev != ic_dev) { | 309 | if (dev != ic_dev && !netdev_uses_dsa(dev)) { |
310 | DBG(("IP-Config: Downing %s\n", dev->name)); | 310 | DBG(("IP-Config: Downing %s\n", dev->name)); |
311 | dev_change_flags(dev, d->flags); | 311 | dev_change_flags(dev, d->flags); |
312 | } | 312 | } |
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 40403114f00a..915d215a7d14 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c | |||
@@ -366,12 +366,12 @@ static bool ipip_netlink_encap_parms(struct nlattr *data[], | |||
366 | 366 | ||
367 | if (data[IFLA_IPTUN_ENCAP_SPORT]) { | 367 | if (data[IFLA_IPTUN_ENCAP_SPORT]) { |
368 | ret = true; | 368 | ret = true; |
369 | ipencap->sport = nla_get_u16(data[IFLA_IPTUN_ENCAP_SPORT]); | 369 | ipencap->sport = nla_get_be16(data[IFLA_IPTUN_ENCAP_SPORT]); |
370 | } | 370 | } |
371 | 371 | ||
372 | if (data[IFLA_IPTUN_ENCAP_DPORT]) { | 372 | if (data[IFLA_IPTUN_ENCAP_DPORT]) { |
373 | ret = true; | 373 | ret = true; |
374 | ipencap->dport = nla_get_u16(data[IFLA_IPTUN_ENCAP_DPORT]); | 374 | ipencap->dport = nla_get_be16(data[IFLA_IPTUN_ENCAP_DPORT]); |
375 | } | 375 | } |
376 | 376 | ||
377 | return ret; | 377 | return ret; |
@@ -460,10 +460,10 @@ static int ipip_fill_info(struct sk_buff *skb, const struct net_device *dev) | |||
460 | 460 | ||
461 | if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE, | 461 | if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE, |
462 | tunnel->encap.type) || | 462 | tunnel->encap.type) || |
463 | nla_put_u16(skb, IFLA_IPTUN_ENCAP_SPORT, | 463 | nla_put_be16(skb, IFLA_IPTUN_ENCAP_SPORT, |
464 | tunnel->encap.sport) || | 464 | tunnel->encap.sport) || |
465 | nla_put_u16(skb, IFLA_IPTUN_ENCAP_DPORT, | 465 | nla_put_be16(skb, IFLA_IPTUN_ENCAP_DPORT, |
466 | tunnel->encap.dport) || | 466 | tunnel->encap.dport) || |
467 | nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS, | 467 | nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS, |
468 | tunnel->encap.flags)) | 468 | tunnel->encap.flags)) |
469 | goto nla_put_failure; | 469 | goto nla_put_failure; |
@@ -498,6 +498,7 @@ static struct rtnl_link_ops ipip_link_ops __read_mostly = { | |||
498 | .dellink = ip_tunnel_dellink, | 498 | .dellink = ip_tunnel_dellink, |
499 | .get_size = ipip_get_size, | 499 | .get_size = ipip_get_size, |
500 | .fill_info = ipip_fill_info, | 500 | .fill_info = ipip_fill_info, |
501 | .get_link_net = ip_tunnel_get_link_net, | ||
501 | }; | 502 | }; |
502 | 503 | ||
503 | static struct xfrm_tunnel ipip_handler __read_mostly = { | 504 | static struct xfrm_tunnel ipip_handler __read_mostly = { |
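
Editor's note: the ipip.c hunk above switches the encap sport/dport attributes to nla_put_be16()/nla_get_be16() because the stored values are __be16, already in network byte order; the u16 accessors would invite a spurious byte swap and sparse warnings. A hedged sketch of the idiom, using a hypothetical attribute rather than the real IFLA_IPTUN_* ones:

/* Sketch: putting and reading a __be16 port attribute.  MY_ATTR_PORT and
 * the surrounding functions are hypothetical, not part of this patch. */
#include <net/netlink.h>

enum { MY_ATTR_UNSPEC, MY_ATTR_PORT, __MY_ATTR_MAX };

static int my_fill_info(struct sk_buff *skb, __be16 sport)
{
	/* __be16 stays in network byte order end to end; no swap occurs */
	if (nla_put_be16(skb, MY_ATTR_PORT, sport))
		return -EMSGSIZE;
	return 0;
}

static __be16 my_parse_port(struct nlattr *tb[])
{
	return tb[MY_ATTR_PORT] ? nla_get_be16(tb[MY_ATTR_PORT]) : 0;
}
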
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index c8034587859d..9d78427652d2 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c | |||
@@ -2290,7 +2290,8 @@ static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, | |||
2290 | if (err < 0 && err != -ENOENT) | 2290 | if (err < 0 && err != -ENOENT) |
2291 | goto nla_put_failure; | 2291 | goto nla_put_failure; |
2292 | 2292 | ||
2293 | return nlmsg_end(skb, nlh); | 2293 | nlmsg_end(skb, nlh); |
2294 | return 0; | ||
2294 | 2295 | ||
2295 | nla_put_failure: | 2296 | nla_put_failure: |
2296 | nlmsg_cancel(skb, nlh); | 2297 | nlmsg_cancel(skb, nlh); |
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index c0d82f78d364..e9f66e1cda50 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c | |||
@@ -599,18 +599,18 @@ int ping_getfrag(void *from, char *to, | |||
599 | struct pingfakehdr *pfh = (struct pingfakehdr *)from; | 599 | struct pingfakehdr *pfh = (struct pingfakehdr *)from; |
600 | 600 | ||
601 | if (offset == 0) { | 601 | if (offset == 0) { |
602 | if (fraglen < sizeof(struct icmphdr)) | 602 | fraglen -= sizeof(struct icmphdr); |
603 | if (fraglen < 0) | ||
603 | BUG(); | 604 | BUG(); |
604 | if (csum_partial_copy_fromiovecend(to + sizeof(struct icmphdr), | 605 | if (csum_and_copy_from_iter(to + sizeof(struct icmphdr), |
605 | pfh->iov, 0, fraglen - sizeof(struct icmphdr), | 606 | fraglen, &pfh->wcheck, |
606 | &pfh->wcheck)) | 607 | &pfh->msg->msg_iter) != fraglen) |
607 | return -EFAULT; | 608 | return -EFAULT; |
608 | } else if (offset < sizeof(struct icmphdr)) { | 609 | } else if (offset < sizeof(struct icmphdr)) { |
609 | BUG(); | 610 | BUG(); |
610 | } else { | 611 | } else { |
611 | if (csum_partial_copy_fromiovecend | 612 | if (csum_and_copy_from_iter(to, fraglen, &pfh->wcheck, |
612 | (to, pfh->iov, offset - sizeof(struct icmphdr), | 613 | &pfh->msg->msg_iter) != fraglen) |
613 | fraglen, &pfh->wcheck)) | ||
614 | return -EFAULT; | 614 | return -EFAULT; |
615 | } | 615 | } |
616 | 616 | ||
@@ -811,8 +811,7 @@ back_from_confirm: | |||
811 | pfh.icmph.checksum = 0; | 811 | pfh.icmph.checksum = 0; |
812 | pfh.icmph.un.echo.id = inet->inet_sport; | 812 | pfh.icmph.un.echo.id = inet->inet_sport; |
813 | pfh.icmph.un.echo.sequence = user_icmph.un.echo.sequence; | 813 | pfh.icmph.un.echo.sequence = user_icmph.un.echo.sequence; |
814 | /* XXX: stripping const */ | 814 | pfh.msg = msg; |
815 | pfh.iov = (struct iovec *)msg->msg_iter.iov; | ||
816 | pfh.wcheck = 0; | 815 | pfh.wcheck = 0; |
817 | pfh.family = AF_INET; | 816 | pfh.family = AF_INET; |
818 | 817 | ||
@@ -966,8 +965,11 @@ bool ping_rcv(struct sk_buff *skb) | |||
966 | 965 | ||
967 | sk = ping_lookup(net, skb, ntohs(icmph->un.echo.id)); | 966 | sk = ping_lookup(net, skb, ntohs(icmph->un.echo.id)); |
968 | if (sk != NULL) { | 967 | if (sk != NULL) { |
968 | struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); | ||
969 | |||
969 | pr_debug("rcv on socket %p\n", sk); | 970 | pr_debug("rcv on socket %p\n", sk); |
970 | ping_queue_rcv_skb(sk, skb_get(skb)); | 971 | if (skb2) |
972 | ping_queue_rcv_skb(sk, skb2); | ||
971 | sock_put(sk); | 973 | sock_put(sk); |
972 | return true; | 974 | return true; |
973 | } | 975 | } |
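
Editor's note: with the msghdr now carried in pingfakehdr, the fragment copy above pulls payload straight from msg->msg_iter, which tracks its own position, so the old per-segment offset juggling disappears. A rough kernel-style sketch of the pattern (simplified, not the actual ping code):

/* Sketch: copy 'fraglen' payload bytes from a msghdr's iterator while
 * folding them into a running checksum.  'struct frag_src' only loosely
 * mirrors struct pingfakehdr. */
#include <linux/errno.h>
#include <linux/socket.h>
#include <linux/uio.h>

struct frag_src {
	struct msghdr *msg;
	__wsum wcheck;
};

static int copy_frag(struct frag_src *src, char *to, int fraglen)
{
	/* the iterator advances itself; callers never pass an offset */
	if (csum_and_copy_from_iter(to, fraglen, &src->wcheck,
				    &src->msg->msg_iter) != fraglen)
		return -EFAULT;
	return 0;
}
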
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 8f9cd200ce20..d8953ef0770c 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c | |||
@@ -292,6 +292,12 @@ static const struct snmp_mib snmp4_net_list[] = { | |||
292 | SNMP_MIB_ITEM("TCPHystartTrainCwnd", LINUX_MIB_TCPHYSTARTTRAINCWND), | 292 | SNMP_MIB_ITEM("TCPHystartTrainCwnd", LINUX_MIB_TCPHYSTARTTRAINCWND), |
293 | SNMP_MIB_ITEM("TCPHystartDelayDetect", LINUX_MIB_TCPHYSTARTDELAYDETECT), | 293 | SNMP_MIB_ITEM("TCPHystartDelayDetect", LINUX_MIB_TCPHYSTARTDELAYDETECT), |
294 | SNMP_MIB_ITEM("TCPHystartDelayCwnd", LINUX_MIB_TCPHYSTARTDELAYCWND), | 294 | SNMP_MIB_ITEM("TCPHystartDelayCwnd", LINUX_MIB_TCPHYSTARTDELAYCWND), |
295 | SNMP_MIB_ITEM("TCPACKSkippedSynRecv", LINUX_MIB_TCPACKSKIPPEDSYNRECV), | ||
296 | SNMP_MIB_ITEM("TCPACKSkippedPAWS", LINUX_MIB_TCPACKSKIPPEDPAWS), | ||
297 | SNMP_MIB_ITEM("TCPACKSkippedSeq", LINUX_MIB_TCPACKSKIPPEDSEQ), | ||
298 | SNMP_MIB_ITEM("TCPACKSkippedFinWait2", LINUX_MIB_TCPACKSKIPPEDFINWAIT2), | ||
299 | SNMP_MIB_ITEM("TCPACKSkippedTimeWait", LINUX_MIB_TCPACKSKIPPEDTIMEWAIT), | ||
300 | SNMP_MIB_ITEM("TCPACKSkippedChallenge", LINUX_MIB_TCPACKSKIPPEDCHALLENGE), | ||
295 | SNMP_MIB_SENTINEL | 301 | SNMP_MIB_SENTINEL |
296 | }; | 302 | }; |
297 | 303 | ||
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 0bb68df5055d..f027a708b7e0 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c | |||
@@ -337,7 +337,7 @@ int raw_rcv(struct sock *sk, struct sk_buff *skb) | |||
337 | } | 337 | } |
338 | 338 | ||
339 | static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, | 339 | static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, |
340 | void *from, size_t length, | 340 | struct msghdr *msg, size_t length, |
341 | struct rtable **rtp, | 341 | struct rtable **rtp, |
342 | unsigned int flags) | 342 | unsigned int flags) |
343 | { | 343 | { |
@@ -382,7 +382,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, | |||
382 | 382 | ||
383 | skb->transport_header = skb->network_header; | 383 | skb->transport_header = skb->network_header; |
384 | err = -EFAULT; | 384 | err = -EFAULT; |
385 | if (memcpy_fromiovecend((void *)iph, from, 0, length)) | 385 | if (memcpy_from_msg(iph, msg, length)) |
386 | goto error_free; | 386 | goto error_free; |
387 | 387 | ||
388 | iphlen = iph->ihl * 4; | 388 | iphlen = iph->ihl * 4; |
@@ -625,8 +625,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | |||
625 | back_from_confirm: | 625 | back_from_confirm: |
626 | 626 | ||
627 | if (inet->hdrincl) | 627 | if (inet->hdrincl) |
628 | /* XXX: stripping const */ | 628 | err = raw_send_hdrinc(sk, &fl4, msg, len, |
629 | err = raw_send_hdrinc(sk, &fl4, (struct iovec *)msg->msg_iter.iov, len, | ||
630 | &rt, msg->msg_flags); | 629 | &rt, msg->msg_flags); |
631 | 630 | ||
632 | else { | 631 | else { |
diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 6a2155b02602..ad5064362c5c 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c | |||
@@ -966,6 +966,9 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) | |||
966 | if (dst->dev->mtu < mtu) | 966 | if (dst->dev->mtu < mtu) |
967 | return; | 967 | return; |
968 | 968 | ||
969 | if (rt->rt_pmtu && rt->rt_pmtu < mtu) | ||
970 | return; | ||
971 | |||
969 | if (mtu < ip_rt_min_pmtu) | 972 | if (mtu < ip_rt_min_pmtu) |
970 | mtu = ip_rt_min_pmtu; | 973 | mtu = ip_rt_min_pmtu; |
971 | 974 | ||
@@ -1325,14 +1328,22 @@ static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt) | |||
1325 | return ret; | 1328 | return ret; |
1326 | } | 1329 | } |
1327 | 1330 | ||
1328 | static DEFINE_SPINLOCK(rt_uncached_lock); | 1331 | struct uncached_list { |
1329 | static LIST_HEAD(rt_uncached_list); | 1332 | spinlock_t lock; |
1333 | struct list_head head; | ||
1334 | }; | ||
1335 | |||
1336 | static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list); | ||
1330 | 1337 | ||
1331 | static void rt_add_uncached_list(struct rtable *rt) | 1338 | static void rt_add_uncached_list(struct rtable *rt) |
1332 | { | 1339 | { |
1333 | spin_lock_bh(&rt_uncached_lock); | 1340 | struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list); |
1334 | list_add_tail(&rt->rt_uncached, &rt_uncached_list); | 1341 | |
1335 | spin_unlock_bh(&rt_uncached_lock); | 1342 | rt->rt_uncached_list = ul; |
1343 | |||
1344 | spin_lock_bh(&ul->lock); | ||
1345 | list_add_tail(&rt->rt_uncached, &ul->head); | ||
1346 | spin_unlock_bh(&ul->lock); | ||
1336 | } | 1347 | } |
1337 | 1348 | ||
1338 | static void ipv4_dst_destroy(struct dst_entry *dst) | 1349 | static void ipv4_dst_destroy(struct dst_entry *dst) |
@@ -1340,27 +1351,32 @@ static void ipv4_dst_destroy(struct dst_entry *dst) | |||
1340 | struct rtable *rt = (struct rtable *) dst; | 1351 | struct rtable *rt = (struct rtable *) dst; |
1341 | 1352 | ||
1342 | if (!list_empty(&rt->rt_uncached)) { | 1353 | if (!list_empty(&rt->rt_uncached)) { |
1343 | spin_lock_bh(&rt_uncached_lock); | 1354 | struct uncached_list *ul = rt->rt_uncached_list; |
1355 | |||
1356 | spin_lock_bh(&ul->lock); | ||
1344 | list_del(&rt->rt_uncached); | 1357 | list_del(&rt->rt_uncached); |
1345 | spin_unlock_bh(&rt_uncached_lock); | 1358 | spin_unlock_bh(&ul->lock); |
1346 | } | 1359 | } |
1347 | } | 1360 | } |
1348 | 1361 | ||
1349 | void rt_flush_dev(struct net_device *dev) | 1362 | void rt_flush_dev(struct net_device *dev) |
1350 | { | 1363 | { |
1351 | if (!list_empty(&rt_uncached_list)) { | 1364 | struct net *net = dev_net(dev); |
1352 | struct net *net = dev_net(dev); | 1365 | struct rtable *rt; |
1353 | struct rtable *rt; | 1366 | int cpu; |
1367 | |||
1368 | for_each_possible_cpu(cpu) { | ||
1369 | struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu); | ||
1354 | 1370 | ||
1355 | spin_lock_bh(&rt_uncached_lock); | 1371 | spin_lock_bh(&ul->lock); |
1356 | list_for_each_entry(rt, &rt_uncached_list, rt_uncached) { | 1372 | list_for_each_entry(rt, &ul->head, rt_uncached) { |
1357 | if (rt->dst.dev != dev) | 1373 | if (rt->dst.dev != dev) |
1358 | continue; | 1374 | continue; |
1359 | rt->dst.dev = net->loopback_dev; | 1375 | rt->dst.dev = net->loopback_dev; |
1360 | dev_hold(rt->dst.dev); | 1376 | dev_hold(rt->dst.dev); |
1361 | dev_put(dev); | 1377 | dev_put(dev); |
1362 | } | 1378 | } |
1363 | spin_unlock_bh(&rt_uncached_lock); | 1379 | spin_unlock_bh(&ul->lock); |
1364 | } | 1380 | } |
1365 | } | 1381 | } |
1366 | 1382 | ||
@@ -1554,11 +1570,10 @@ static int __mkroute_input(struct sk_buff *skb, | |||
1554 | 1570 | ||
1555 | do_cache = res->fi && !itag; | 1571 | do_cache = res->fi && !itag; |
1556 | if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) && | 1572 | if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) && |
1573 | skb->protocol == htons(ETH_P_IP) && | ||
1557 | (IN_DEV_SHARED_MEDIA(out_dev) || | 1574 | (IN_DEV_SHARED_MEDIA(out_dev) || |
1558 | inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) { | 1575 | inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) |
1559 | flags |= RTCF_DOREDIRECT; | 1576 | IPCB(skb)->flags |= IPSKB_DOREDIRECT; |
1560 | do_cache = false; | ||
1561 | } | ||
1562 | 1577 | ||
1563 | if (skb->protocol != htons(ETH_P_IP)) { | 1578 | if (skb->protocol != htons(ETH_P_IP)) { |
1564 | /* Not IP (i.e. ARP). Do not create route, if it is | 1579 | /* Not IP (i.e. ARP). Do not create route, if it is |
@@ -2303,6 +2318,8 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, | |||
2303 | r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED; | 2318 | r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED; |
2304 | if (rt->rt_flags & RTCF_NOTIFY) | 2319 | if (rt->rt_flags & RTCF_NOTIFY) |
2305 | r->rtm_flags |= RTM_F_NOTIFY; | 2320 | r->rtm_flags |= RTM_F_NOTIFY; |
2321 | if (IPCB(skb)->flags & IPSKB_DOREDIRECT) | ||
2322 | r->rtm_flags |= RTCF_DOREDIRECT; | ||
2306 | 2323 | ||
2307 | if (nla_put_be32(skb, RTA_DST, dst)) | 2324 | if (nla_put_be32(skb, RTA_DST, dst)) |
2308 | goto nla_put_failure; | 2325 | goto nla_put_failure; |
@@ -2377,7 +2394,8 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, | |||
2377 | if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0) | 2394 | if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0) |
2378 | goto nla_put_failure; | 2395 | goto nla_put_failure; |
2379 | 2396 | ||
2380 | return nlmsg_end(skb, nlh); | 2397 | nlmsg_end(skb, nlh); |
2398 | return 0; | ||
2381 | 2399 | ||
2382 | nla_put_failure: | 2400 | nla_put_failure: |
2383 | nlmsg_cancel(skb, nlh); | 2401 | nlmsg_cancel(skb, nlh); |
@@ -2469,7 +2487,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) | |||
2469 | err = rt_fill_info(net, dst, src, &fl4, skb, | 2487 | err = rt_fill_info(net, dst, src, &fl4, skb, |
2470 | NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, | 2488 | NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, |
2471 | RTM_NEWROUTE, 0, 0); | 2489 | RTM_NEWROUTE, 0, 0); |
2472 | if (err <= 0) | 2490 | if (err < 0) |
2473 | goto errout_free; | 2491 | goto errout_free; |
2474 | 2492 | ||
2475 | err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); | 2493 | err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); |
@@ -2717,6 +2735,7 @@ struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; | |||
2717 | int __init ip_rt_init(void) | 2735 | int __init ip_rt_init(void) |
2718 | { | 2736 | { |
2719 | int rc = 0; | 2737 | int rc = 0; |
2738 | int cpu; | ||
2720 | 2739 | ||
2721 | ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL); | 2740 | ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL); |
2722 | if (!ip_idents) | 2741 | if (!ip_idents) |
@@ -2724,6 +2743,12 @@ int __init ip_rt_init(void) | |||
2724 | 2743 | ||
2725 | prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents)); | 2744 | prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents)); |
2726 | 2745 | ||
2746 | for_each_possible_cpu(cpu) { | ||
2747 | struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu); | ||
2748 | |||
2749 | INIT_LIST_HEAD(&ul->head); | ||
2750 | spin_lock_init(&ul->lock); | ||
2751 | } | ||
2727 | #ifdef CONFIG_IP_ROUTE_CLASSID | 2752 | #ifdef CONFIG_IP_ROUTE_CLASSID |
2728 | ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct)); | 2753 | ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct)); |
2729 | if (!ip_rt_acct) | 2754 | if (!ip_rt_acct) |
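
Editor's note: the uncached-route rework above replaces one global spinlock/list pair with a per-CPU pair, so concurrent dst teardown on different CPUs no longer serializes on a single lock. Stripped of the routing specifics, the pattern looks roughly like this (names are illustrative only):

/* Sketch of the per-CPU list pattern used above; 'struct item' and the
 * function names are placeholders. */
#include <linux/list.h>
#include <linux/percpu.h>
#include <linux/spinlock.h>

struct pcpu_list {
	spinlock_t lock;
	struct list_head head;
};

static DEFINE_PER_CPU_ALIGNED(struct pcpu_list, pcpu_lists);

struct item {
	struct list_head node;
	struct pcpu_list *owner;	/* remember which CPU's list we joined */
};

static void item_add(struct item *it)
{
	struct pcpu_list *pl = raw_cpu_ptr(&pcpu_lists);

	it->owner = pl;			/* removal must take the same lock */
	spin_lock_bh(&pl->lock);
	list_add_tail(&it->node, &pl->head);
	spin_unlock_bh(&pl->lock);
}

static void item_del(struct item *it)
{
	spin_lock_bh(&it->owner->lock);
	list_del(&it->node);
	spin_unlock_bh(&it->owner->lock);
}

static void pcpu_lists_init(void)
{
	int cpu;

	for_each_possible_cpu(cpu) {
		struct pcpu_list *pl = &per_cpu(pcpu_lists, cpu);

		INIT_LIST_HEAD(&pl->head);
		spin_lock_init(&pl->lock);
	}
}
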
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index e0ee384a448f..d151539da8e6 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c | |||
@@ -604,20 +604,6 @@ static struct ctl_table ipv4_table[] = { | |||
604 | .proc_handler = proc_tcp_congestion_control, | 604 | .proc_handler = proc_tcp_congestion_control, |
605 | }, | 605 | }, |
606 | { | 606 | { |
607 | .procname = "tcp_mtu_probing", | ||
608 | .data = &sysctl_tcp_mtu_probing, | ||
609 | .maxlen = sizeof(int), | ||
610 | .mode = 0644, | ||
611 | .proc_handler = proc_dointvec, | ||
612 | }, | ||
613 | { | ||
614 | .procname = "tcp_base_mss", | ||
615 | .data = &sysctl_tcp_base_mss, | ||
616 | .maxlen = sizeof(int), | ||
617 | .mode = 0644, | ||
618 | .proc_handler = proc_dointvec, | ||
619 | }, | ||
620 | { | ||
621 | .procname = "tcp_workaround_signed_windows", | 607 | .procname = "tcp_workaround_signed_windows", |
622 | .data = &sysctl_tcp_workaround_signed_windows, | 608 | .data = &sysctl_tcp_workaround_signed_windows, |
623 | .maxlen = sizeof(int), | 609 | .maxlen = sizeof(int), |
@@ -729,6 +715,13 @@ static struct ctl_table ipv4_table[] = { | |||
729 | .extra2 = &one, | 715 | .extra2 = &one, |
730 | }, | 716 | }, |
731 | { | 717 | { |
718 | .procname = "tcp_invalid_ratelimit", | ||
719 | .data = &sysctl_tcp_invalid_ratelimit, | ||
720 | .maxlen = sizeof(int), | ||
721 | .mode = 0644, | ||
722 | .proc_handler = proc_dointvec_ms_jiffies, | ||
723 | }, | ||
724 | { | ||
732 | .procname = "icmp_msgs_per_sec", | 725 | .procname = "icmp_msgs_per_sec", |
733 | .data = &sysctl_icmp_msgs_per_sec, | 726 | .data = &sysctl_icmp_msgs_per_sec, |
734 | .maxlen = sizeof(int), | 727 | .maxlen = sizeof(int), |
@@ -876,6 +869,20 @@ static struct ctl_table ipv4_net_table[] = { | |||
876 | .mode = 0644, | 869 | .mode = 0644, |
877 | .proc_handler = proc_dointvec, | 870 | .proc_handler = proc_dointvec, |
878 | }, | 871 | }, |
872 | { | ||
873 | .procname = "tcp_mtu_probing", | ||
874 | .data = &init_net.ipv4.sysctl_tcp_mtu_probing, | ||
875 | .maxlen = sizeof(int), | ||
876 | .mode = 0644, | ||
877 | .proc_handler = proc_dointvec, | ||
878 | }, | ||
879 | { | ||
880 | .procname = "tcp_base_mss", | ||
881 | .data = &init_net.ipv4.sysctl_tcp_base_mss, | ||
882 | .maxlen = sizeof(int), | ||
883 | .mode = 0644, | ||
884 | .proc_handler = proc_dointvec, | ||
885 | }, | ||
879 | { } | 886 | { } |
880 | }; | 887 | }; |
881 | 888 | ||
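
Editor's note: tcp_mtu_probing and tcp_base_mss move from the global ipv4_table into the per-namespace ipv4_net_table, and a new tcp_invalid_ratelimit knob appears (HZ/2 jiffies by default per the tcp_input.c hunk below, exposed in milliseconds by proc_dointvec_ms_jiffies). From user space these remain plain /proc/sys files; a trivial reader, with the path assumed rather than quoted from this diff:

/* Sketch: read the new rate-limit knob from procfs. */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/net/ipv4/tcp_invalid_ratelimit", "r");
	int ms;

	if (!f)
		return 1;
	if (fscanf(f, "%d", &ms) != 1) {
		fclose(f);
		return 1;
	}
	printf("tcp_invalid_ratelimit = %d ms\n", ms);
	fclose(f);
	return 0;
}
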
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 3075723c729b..9d72a0fcd928 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c | |||
@@ -1067,11 +1067,10 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, | |||
1067 | int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | 1067 | int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, |
1068 | size_t size) | 1068 | size_t size) |
1069 | { | 1069 | { |
1070 | const struct iovec *iov; | ||
1071 | struct tcp_sock *tp = tcp_sk(sk); | 1070 | struct tcp_sock *tp = tcp_sk(sk); |
1072 | struct sk_buff *skb; | 1071 | struct sk_buff *skb; |
1073 | int iovlen, flags, err, copied = 0; | 1072 | int flags, err, copied = 0; |
1074 | int mss_now = 0, size_goal, copied_syn = 0, offset = 0; | 1073 | int mss_now = 0, size_goal, copied_syn = 0; |
1075 | bool sg; | 1074 | bool sg; |
1076 | long timeo; | 1075 | long timeo; |
1077 | 1076 | ||
@@ -1084,7 +1083,6 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | |||
1084 | goto out; | 1083 | goto out; |
1085 | else if (err) | 1084 | else if (err) |
1086 | goto out_err; | 1085 | goto out_err; |
1087 | offset = copied_syn; | ||
1088 | } | 1086 | } |
1089 | 1087 | ||
1090 | timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); | 1088 | timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); |
@@ -1118,8 +1116,6 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | |||
1118 | mss_now = tcp_send_mss(sk, &size_goal, flags); | 1116 | mss_now = tcp_send_mss(sk, &size_goal, flags); |
1119 | 1117 | ||
1120 | /* Ok commence sending. */ | 1118 | /* Ok commence sending. */ |
1121 | iovlen = msg->msg_iter.nr_segs; | ||
1122 | iov = msg->msg_iter.iov; | ||
1123 | copied = 0; | 1119 | copied = 0; |
1124 | 1120 | ||
1125 | err = -EPIPE; | 1121 | err = -EPIPE; |
@@ -1128,151 +1124,134 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | |||
1128 | 1124 | ||
1129 | sg = !!(sk->sk_route_caps & NETIF_F_SG); | 1125 | sg = !!(sk->sk_route_caps & NETIF_F_SG); |
1130 | 1126 | ||
1131 | while (--iovlen >= 0) { | 1127 | while (iov_iter_count(&msg->msg_iter)) { |
1132 | size_t seglen = iov->iov_len; | 1128 | int copy = 0; |
1133 | unsigned char __user *from = iov->iov_base; | 1129 | int max = size_goal; |
1134 | 1130 | ||
1135 | iov++; | 1131 | skb = tcp_write_queue_tail(sk); |
1136 | if (unlikely(offset > 0)) { /* Skip bytes copied in SYN */ | 1132 | if (tcp_send_head(sk)) { |
1137 | if (offset >= seglen) { | 1133 | if (skb->ip_summed == CHECKSUM_NONE) |
1138 | offset -= seglen; | 1134 | max = mss_now; |
1139 | continue; | 1135 | copy = max - skb->len; |
1140 | } | ||
1141 | seglen -= offset; | ||
1142 | from += offset; | ||
1143 | offset = 0; | ||
1144 | } | 1136 | } |
1145 | 1137 | ||
1146 | while (seglen > 0) { | 1138 | if (copy <= 0) { |
1147 | int copy = 0; | ||
1148 | int max = size_goal; | ||
1149 | |||
1150 | skb = tcp_write_queue_tail(sk); | ||
1151 | if (tcp_send_head(sk)) { | ||
1152 | if (skb->ip_summed == CHECKSUM_NONE) | ||
1153 | max = mss_now; | ||
1154 | copy = max - skb->len; | ||
1155 | } | ||
1156 | |||
1157 | if (copy <= 0) { | ||
1158 | new_segment: | 1139 | new_segment: |
1159 | /* Allocate new segment. If the interface is SG, | 1140 | /* Allocate new segment. If the interface is SG, |
1160 | * allocate skb fitting to single page. | 1141 | * allocate skb fitting to single page. |
1161 | */ | 1142 | */ |
1162 | if (!sk_stream_memory_free(sk)) | 1143 | if (!sk_stream_memory_free(sk)) |
1163 | goto wait_for_sndbuf; | 1144 | goto wait_for_sndbuf; |
1164 | 1145 | ||
1165 | skb = sk_stream_alloc_skb(sk, | 1146 | skb = sk_stream_alloc_skb(sk, |
1166 | select_size(sk, sg), | 1147 | select_size(sk, sg), |
1167 | sk->sk_allocation); | 1148 | sk->sk_allocation); |
1168 | if (!skb) | 1149 | if (!skb) |
1169 | goto wait_for_memory; | 1150 | goto wait_for_memory; |
1170 | 1151 | ||
1171 | /* | 1152 | /* |
1172 | * Check whether we can use HW checksum. | 1153 | * Check whether we can use HW checksum. |
1173 | */ | 1154 | */ |
1174 | if (sk->sk_route_caps & NETIF_F_ALL_CSUM) | 1155 | if (sk->sk_route_caps & NETIF_F_ALL_CSUM) |
1175 | skb->ip_summed = CHECKSUM_PARTIAL; | 1156 | skb->ip_summed = CHECKSUM_PARTIAL; |
1176 | 1157 | ||
1177 | skb_entail(sk, skb); | 1158 | skb_entail(sk, skb); |
1178 | copy = size_goal; | 1159 | copy = size_goal; |
1179 | max = size_goal; | 1160 | max = size_goal; |
1180 | 1161 | ||
1181 | /* All packets are restored as if they have | 1162 | /* All packets are restored as if they have |
1182 | * already been sent. skb_mstamp isn't set to | 1163 | * already been sent. skb_mstamp isn't set to |
1183 | * avoid wrong rtt estimation. | 1164 | * avoid wrong rtt estimation. |
1184 | */ | 1165 | */ |
1185 | if (tp->repair) | 1166 | if (tp->repair) |
1186 | TCP_SKB_CB(skb)->sacked |= TCPCB_REPAIRED; | 1167 | TCP_SKB_CB(skb)->sacked |= TCPCB_REPAIRED; |
1187 | } | 1168 | } |
1188 | 1169 | ||
1189 | /* Try to append data to the end of skb. */ | 1170 | /* Try to append data to the end of skb. */ |
1190 | if (copy > seglen) | 1171 | if (copy > iov_iter_count(&msg->msg_iter)) |
1191 | copy = seglen; | 1172 | copy = iov_iter_count(&msg->msg_iter); |
1192 | 1173 | ||
1193 | /* Where to copy to? */ | 1174 | /* Where to copy to? */ |
1194 | if (skb_availroom(skb) > 0) { | 1175 | if (skb_availroom(skb) > 0) { |
1195 | /* We have some space in skb head. Superb! */ | 1176 | /* We have some space in skb head. Superb! */ |
1196 | copy = min_t(int, copy, skb_availroom(skb)); | 1177 | copy = min_t(int, copy, skb_availroom(skb)); |
1197 | err = skb_add_data_nocache(sk, skb, from, copy); | 1178 | err = skb_add_data_nocache(sk, skb, &msg->msg_iter, copy); |
1198 | if (err) | 1179 | if (err) |
1199 | goto do_fault; | 1180 | goto do_fault; |
1200 | } else { | 1181 | } else { |
1201 | bool merge = true; | 1182 | bool merge = true; |
1202 | int i = skb_shinfo(skb)->nr_frags; | 1183 | int i = skb_shinfo(skb)->nr_frags; |
1203 | struct page_frag *pfrag = sk_page_frag(sk); | 1184 | struct page_frag *pfrag = sk_page_frag(sk); |
1204 | 1185 | ||
1205 | if (!sk_page_frag_refill(sk, pfrag)) | 1186 | if (!sk_page_frag_refill(sk, pfrag)) |
1206 | goto wait_for_memory; | 1187 | goto wait_for_memory; |
1207 | |||
1208 | if (!skb_can_coalesce(skb, i, pfrag->page, | ||
1209 | pfrag->offset)) { | ||
1210 | if (i == MAX_SKB_FRAGS || !sg) { | ||
1211 | tcp_mark_push(tp, skb); | ||
1212 | goto new_segment; | ||
1213 | } | ||
1214 | merge = false; | ||
1215 | } | ||
1216 | 1188 | ||
1217 | copy = min_t(int, copy, pfrag->size - pfrag->offset); | 1189 | if (!skb_can_coalesce(skb, i, pfrag->page, |
1218 | 1190 | pfrag->offset)) { | |
1219 | if (!sk_wmem_schedule(sk, copy)) | 1191 | if (i == MAX_SKB_FRAGS || !sg) { |
1220 | goto wait_for_memory; | 1192 | tcp_mark_push(tp, skb); |
1221 | 1193 | goto new_segment; | |
1222 | err = skb_copy_to_page_nocache(sk, from, skb, | ||
1223 | pfrag->page, | ||
1224 | pfrag->offset, | ||
1225 | copy); | ||
1226 | if (err) | ||
1227 | goto do_error; | ||
1228 | |||
1229 | /* Update the skb. */ | ||
1230 | if (merge) { | ||
1231 | skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); | ||
1232 | } else { | ||
1233 | skb_fill_page_desc(skb, i, pfrag->page, | ||
1234 | pfrag->offset, copy); | ||
1235 | get_page(pfrag->page); | ||
1236 | } | 1194 | } |
1237 | pfrag->offset += copy; | 1195 | merge = false; |
1238 | } | 1196 | } |
1239 | 1197 | ||
1240 | if (!copied) | 1198 | copy = min_t(int, copy, pfrag->size - pfrag->offset); |
1241 | TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH; | ||
1242 | 1199 | ||
1243 | tp->write_seq += copy; | 1200 | if (!sk_wmem_schedule(sk, copy)) |
1244 | TCP_SKB_CB(skb)->end_seq += copy; | 1201 | goto wait_for_memory; |
1245 | tcp_skb_pcount_set(skb, 0); | ||
1246 | 1202 | ||
1247 | from += copy; | 1203 | err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb, |
1248 | copied += copy; | 1204 | pfrag->page, |
1249 | if ((seglen -= copy) == 0 && iovlen == 0) { | 1205 | pfrag->offset, |
1250 | tcp_tx_timestamp(sk, skb); | 1206 | copy); |
1251 | goto out; | 1207 | if (err) |
1208 | goto do_error; | ||
1209 | |||
1210 | /* Update the skb. */ | ||
1211 | if (merge) { | ||
1212 | skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); | ||
1213 | } else { | ||
1214 | skb_fill_page_desc(skb, i, pfrag->page, | ||
1215 | pfrag->offset, copy); | ||
1216 | get_page(pfrag->page); | ||
1252 | } | 1217 | } |
1218 | pfrag->offset += copy; | ||
1219 | } | ||
1253 | 1220 | ||
1254 | if (skb->len < max || (flags & MSG_OOB) || unlikely(tp->repair)) | 1221 | if (!copied) |
1255 | continue; | 1222 | TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH; |
1223 | |||
1224 | tp->write_seq += copy; | ||
1225 | TCP_SKB_CB(skb)->end_seq += copy; | ||
1226 | tcp_skb_pcount_set(skb, 0); | ||
1227 | |||
1228 | copied += copy; | ||
1229 | if (!iov_iter_count(&msg->msg_iter)) { | ||
1230 | tcp_tx_timestamp(sk, skb); | ||
1231 | goto out; | ||
1232 | } | ||
1256 | 1233 | ||
1257 | if (forced_push(tp)) { | 1234 | if (skb->len < max || (flags & MSG_OOB) || unlikely(tp->repair)) |
1258 | tcp_mark_push(tp, skb); | ||
1259 | __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH); | ||
1260 | } else if (skb == tcp_send_head(sk)) | ||
1261 | tcp_push_one(sk, mss_now); | ||
1262 | continue; | 1235 | continue; |
1263 | 1236 | ||
1237 | if (forced_push(tp)) { | ||
1238 | tcp_mark_push(tp, skb); | ||
1239 | __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH); | ||
1240 | } else if (skb == tcp_send_head(sk)) | ||
1241 | tcp_push_one(sk, mss_now); | ||
1242 | continue; | ||
1243 | |||
1264 | wait_for_sndbuf: | 1244 | wait_for_sndbuf: |
1265 | set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); | 1245 | set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); |
1266 | wait_for_memory: | 1246 | wait_for_memory: |
1267 | if (copied) | 1247 | if (copied) |
1268 | tcp_push(sk, flags & ~MSG_MORE, mss_now, | 1248 | tcp_push(sk, flags & ~MSG_MORE, mss_now, |
1269 | TCP_NAGLE_PUSH, size_goal); | 1249 | TCP_NAGLE_PUSH, size_goal); |
1270 | 1250 | ||
1271 | if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) | 1251 | if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) |
1272 | goto do_error; | 1252 | goto do_error; |
1273 | 1253 | ||
1274 | mss_now = tcp_send_mss(sk, &size_goal, flags); | 1254 | mss_now = tcp_send_mss(sk, &size_goal, flags); |
1275 | } | ||
1276 | } | 1255 | } |
1277 | 1256 | ||
1278 | out: | 1257 | out: |
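
Editor's note: the tcp_sendmsg() rewrite above drops the nested per-iovec loop. msg->msg_iter tracks how much user data remains and where, so a single loop keyed on iov_iter_count() replaces the seglen/from/offset bookkeeping, including the skip over bytes already consumed as Fast Open SYN data. A sketch of the consumption pattern, detached from the skb handling:

/* Sketch of the iterator-driven copy loop; the destination buffer stands
 * in for skb space and error handling is reduced to the essentials. */
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/socket.h>
#include <linux/uio.h>

static int drain_to_buf(struct msghdr *msg, char *buf, size_t buflen)
{
	size_t done = 0;

	while (iov_iter_count(&msg->msg_iter) && done < buflen) {
		size_t chunk = min_t(size_t, iov_iter_count(&msg->msg_iter),
				     buflen - done);

		/* copy_from_iter() advances the iterator by the amount copied */
		if (copy_from_iter(buf + done, chunk, &msg->msg_iter) != chunk)
			return -EFAULT;
		done += chunk;
	}
	return done;
}
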
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index bb395d46a389..c037644eafb7 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c | |||
@@ -150,7 +150,7 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) | |||
150 | tcp_slow_start(tp, acked); | 150 | tcp_slow_start(tp, acked); |
151 | else { | 151 | else { |
152 | bictcp_update(ca, tp->snd_cwnd); | 152 | bictcp_update(ca, tp->snd_cwnd); |
153 | tcp_cong_avoid_ai(tp, ca->cnt); | 153 | tcp_cong_avoid_ai(tp, ca->cnt, 1); |
154 | } | 154 | } |
155 | } | 155 | } |
156 | 156 | ||
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 27ead0dd16bc..d694088214cd 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c | |||
@@ -13,6 +13,7 @@ | |||
13 | #include <linux/types.h> | 13 | #include <linux/types.h> |
14 | #include <linux/list.h> | 14 | #include <linux/list.h> |
15 | #include <linux/gfp.h> | 15 | #include <linux/gfp.h> |
16 | #include <linux/jhash.h> | ||
16 | #include <net/tcp.h> | 17 | #include <net/tcp.h> |
17 | 18 | ||
18 | static DEFINE_SPINLOCK(tcp_cong_list_lock); | 19 | static DEFINE_SPINLOCK(tcp_cong_list_lock); |
@@ -31,6 +32,34 @@ static struct tcp_congestion_ops *tcp_ca_find(const char *name) | |||
31 | return NULL; | 32 | return NULL; |
32 | } | 33 | } |
33 | 34 | ||
35 | /* Must be called with rcu lock held */ | ||
36 | static const struct tcp_congestion_ops *__tcp_ca_find_autoload(const char *name) | ||
37 | { | ||
38 | const struct tcp_congestion_ops *ca = tcp_ca_find(name); | ||
39 | #ifdef CONFIG_MODULES | ||
40 | if (!ca && capable(CAP_NET_ADMIN)) { | ||
41 | rcu_read_unlock(); | ||
42 | request_module("tcp_%s", name); | ||
43 | rcu_read_lock(); | ||
44 | ca = tcp_ca_find(name); | ||
45 | } | ||
46 | #endif | ||
47 | return ca; | ||
48 | } | ||
49 | |||
50 | /* Simple linear search, not much in here. */ | ||
51 | struct tcp_congestion_ops *tcp_ca_find_key(u32 key) | ||
52 | { | ||
53 | struct tcp_congestion_ops *e; | ||
54 | |||
55 | list_for_each_entry_rcu(e, &tcp_cong_list, list) { | ||
56 | if (e->key == key) | ||
57 | return e; | ||
58 | } | ||
59 | |||
60 | return NULL; | ||
61 | } | ||
62 | |||
34 | /* | 63 | /* |
35 | * Attach new congestion control algorithm to the list | 64 | * Attach new congestion control algorithm to the list |
36 | * of available options. | 65 | * of available options. |
@@ -45,9 +74,12 @@ int tcp_register_congestion_control(struct tcp_congestion_ops *ca) | |||
45 | return -EINVAL; | 74 | return -EINVAL; |
46 | } | 75 | } |
47 | 76 | ||
77 | ca->key = jhash(ca->name, sizeof(ca->name), strlen(ca->name)); | ||
78 | |||
48 | spin_lock(&tcp_cong_list_lock); | 79 | spin_lock(&tcp_cong_list_lock); |
49 | if (tcp_ca_find(ca->name)) { | 80 | if (ca->key == TCP_CA_UNSPEC || tcp_ca_find_key(ca->key)) { |
50 | pr_notice("%s already registered\n", ca->name); | 81 | pr_notice("%s already registered or non-unique key\n", |
82 | ca->name); | ||
51 | ret = -EEXIST; | 83 | ret = -EEXIST; |
52 | } else { | 84 | } else { |
53 | list_add_tail_rcu(&ca->list, &tcp_cong_list); | 85 | list_add_tail_rcu(&ca->list, &tcp_cong_list); |
@@ -70,9 +102,50 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca) | |||
70 | spin_lock(&tcp_cong_list_lock); | 102 | spin_lock(&tcp_cong_list_lock); |
71 | list_del_rcu(&ca->list); | 103 | list_del_rcu(&ca->list); |
72 | spin_unlock(&tcp_cong_list_lock); | 104 | spin_unlock(&tcp_cong_list_lock); |
105 | |||
106 | /* Wait for outstanding readers to complete before the | ||
107 | * module gets removed entirely. | ||
108 | * | ||
109 | * A try_module_get() should fail by now as our module is | ||
110 | * in "going" state since no refs are held anymore and the | ||
111 | * module_exit() handler is being called. | ||

112 | */ | ||
113 | synchronize_rcu(); | ||
73 | } | 114 | } |
74 | EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control); | 115 | EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control); |
75 | 116 | ||
117 | u32 tcp_ca_get_key_by_name(const char *name) | ||
118 | { | ||
119 | const struct tcp_congestion_ops *ca; | ||
120 | u32 key; | ||
121 | |||
122 | might_sleep(); | ||
123 | |||
124 | rcu_read_lock(); | ||
125 | ca = __tcp_ca_find_autoload(name); | ||
126 | key = ca ? ca->key : TCP_CA_UNSPEC; | ||
127 | rcu_read_unlock(); | ||
128 | |||
129 | return key; | ||
130 | } | ||
131 | EXPORT_SYMBOL_GPL(tcp_ca_get_key_by_name); | ||
132 | |||
133 | char *tcp_ca_get_name_by_key(u32 key, char *buffer) | ||
134 | { | ||
135 | const struct tcp_congestion_ops *ca; | ||
136 | char *ret = NULL; | ||
137 | |||
138 | rcu_read_lock(); | ||
139 | ca = tcp_ca_find_key(key); | ||
140 | if (ca) | ||
141 | ret = strncpy(buffer, ca->name, | ||
142 | TCP_CA_NAME_MAX); | ||
143 | rcu_read_unlock(); | ||
144 | |||
145 | return ret; | ||
146 | } | ||
147 | EXPORT_SYMBOL_GPL(tcp_ca_get_name_by_key); | ||
148 | |||
76 | /* Assign choice of congestion control. */ | 149 | /* Assign choice of congestion control. */ |
77 | void tcp_assign_congestion_control(struct sock *sk) | 150 | void tcp_assign_congestion_control(struct sock *sk) |
78 | { | 151 | { |
@@ -107,6 +180,18 @@ void tcp_init_congestion_control(struct sock *sk) | |||
107 | icsk->icsk_ca_ops->init(sk); | 180 | icsk->icsk_ca_ops->init(sk); |
108 | } | 181 | } |
109 | 182 | ||
183 | static void tcp_reinit_congestion_control(struct sock *sk, | ||
184 | const struct tcp_congestion_ops *ca) | ||
185 | { | ||
186 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
187 | |||
188 | tcp_cleanup_congestion_control(sk); | ||
189 | icsk->icsk_ca_ops = ca; | ||
190 | |||
191 | if (sk->sk_state != TCP_CLOSE && icsk->icsk_ca_ops->init) | ||
192 | icsk->icsk_ca_ops->init(sk); | ||
193 | } | ||
194 | |||
110 | /* Manage refcounts on socket close. */ | 195 | /* Manage refcounts on socket close. */ |
111 | void tcp_cleanup_congestion_control(struct sock *sk) | 196 | void tcp_cleanup_congestion_control(struct sock *sk) |
112 | { | 197 | { |
@@ -241,42 +326,26 @@ out: | |||
241 | int tcp_set_congestion_control(struct sock *sk, const char *name) | 326 | int tcp_set_congestion_control(struct sock *sk, const char *name) |
242 | { | 327 | { |
243 | struct inet_connection_sock *icsk = inet_csk(sk); | 328 | struct inet_connection_sock *icsk = inet_csk(sk); |
244 | struct tcp_congestion_ops *ca; | 329 | const struct tcp_congestion_ops *ca; |
245 | int err = 0; | 330 | int err = 0; |
246 | 331 | ||
247 | rcu_read_lock(); | 332 | if (icsk->icsk_ca_dst_locked) |
248 | ca = tcp_ca_find(name); | 333 | return -EPERM; |
249 | 334 | ||
250 | /* no change asking for existing value */ | 335 | rcu_read_lock(); |
336 | ca = __tcp_ca_find_autoload(name); | ||
337 | /* No change asking for existing value */ | ||
251 | if (ca == icsk->icsk_ca_ops) | 338 | if (ca == icsk->icsk_ca_ops) |
252 | goto out; | 339 | goto out; |
253 | |||
254 | #ifdef CONFIG_MODULES | ||
255 | /* not found attempt to autoload module */ | ||
256 | if (!ca && capable(CAP_NET_ADMIN)) { | ||
257 | rcu_read_unlock(); | ||
258 | request_module("tcp_%s", name); | ||
259 | rcu_read_lock(); | ||
260 | ca = tcp_ca_find(name); | ||
261 | } | ||
262 | #endif | ||
263 | if (!ca) | 340 | if (!ca) |
264 | err = -ENOENT; | 341 | err = -ENOENT; |
265 | |||
266 | else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || | 342 | else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || |
267 | ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))) | 343 | ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))) |
268 | err = -EPERM; | 344 | err = -EPERM; |
269 | |||
270 | else if (!try_module_get(ca->owner)) | 345 | else if (!try_module_get(ca->owner)) |
271 | err = -EBUSY; | 346 | err = -EBUSY; |
272 | 347 | else | |
273 | else { | 348 | tcp_reinit_congestion_control(sk, ca); |
274 | tcp_cleanup_congestion_control(sk); | ||
275 | icsk->icsk_ca_ops = ca; | ||
276 | |||
277 | if (sk->sk_state != TCP_CLOSE && icsk->icsk_ca_ops->init) | ||
278 | icsk->icsk_ca_ops->init(sk); | ||
279 | } | ||
280 | out: | 349 | out: |
281 | rcu_read_unlock(); | 350 | rcu_read_unlock(); |
282 | return err; | 351 | return err; |
@@ -291,26 +360,32 @@ int tcp_set_congestion_control(struct sock *sk, const char *name) | |||
291 | * ABC caps N to 2. Slow start exits when cwnd grows over ssthresh and | 360 | * ABC caps N to 2. Slow start exits when cwnd grows over ssthresh and |
292 | * returns the leftover acks to adjust cwnd in congestion avoidance mode. | 361 | * returns the leftover acks to adjust cwnd in congestion avoidance mode. |
293 | */ | 362 | */ |
294 | void tcp_slow_start(struct tcp_sock *tp, u32 acked) | 363 | u32 tcp_slow_start(struct tcp_sock *tp, u32 acked) |
295 | { | 364 | { |
296 | u32 cwnd = tp->snd_cwnd + acked; | 365 | u32 cwnd = tp->snd_cwnd + acked; |
297 | 366 | ||
298 | if (cwnd > tp->snd_ssthresh) | 367 | if (cwnd > tp->snd_ssthresh) |
299 | cwnd = tp->snd_ssthresh + 1; | 368 | cwnd = tp->snd_ssthresh + 1; |
369 | acked -= cwnd - tp->snd_cwnd; | ||
300 | tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp); | 370 | tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp); |
371 | |||
372 | return acked; | ||
301 | } | 373 | } |
302 | EXPORT_SYMBOL_GPL(tcp_slow_start); | 374 | EXPORT_SYMBOL_GPL(tcp_slow_start); |
303 | 375 | ||
304 | /* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternative w) */ | 376 | /* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternative w), |
305 | void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w) | 377 | * for every packet that was ACKed. |
378 | */ | ||
379 | void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked) | ||
306 | { | 380 | { |
381 | tp->snd_cwnd_cnt += acked; | ||
307 | if (tp->snd_cwnd_cnt >= w) { | 382 | if (tp->snd_cwnd_cnt >= w) { |
308 | if (tp->snd_cwnd < tp->snd_cwnd_clamp) | 383 | u32 delta = tp->snd_cwnd_cnt / w; |
309 | tp->snd_cwnd++; | 384 | |
310 | tp->snd_cwnd_cnt = 0; | 385 | tp->snd_cwnd_cnt -= delta * w; |
311 | } else { | 386 | tp->snd_cwnd += delta; |
312 | tp->snd_cwnd_cnt++; | ||
313 | } | 387 | } |
388 | tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_cwnd_clamp); | ||
314 | } | 389 | } |
315 | EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai); | 390 | EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai); |
316 | 391 | ||
@@ -329,11 +404,13 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked) | |||
329 | return; | 404 | return; |
330 | 405 | ||
331 | /* In "safe" area, increase. */ | 406 | /* In "safe" area, increase. */ |
332 | if (tp->snd_cwnd <= tp->snd_ssthresh) | 407 | if (tp->snd_cwnd <= tp->snd_ssthresh) { |
333 | tcp_slow_start(tp, acked); | 408 | acked = tcp_slow_start(tp, acked); |
409 | if (!acked) | ||
410 | return; | ||
411 | } | ||
334 | /* In dangerous area, increase slowly. */ | 412 | /* In dangerous area, increase slowly. */ |
335 | else | 413 | tcp_cong_avoid_ai(tp, tp->snd_cwnd, acked); |
336 | tcp_cong_avoid_ai(tp, tp->snd_cwnd); | ||
337 | } | 414 | } |
338 | EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid); | 415 | EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid); |
339 | 416 | ||
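
Editor's note: tcp_slow_start() now returns the ACKed packets left over once cwnd reaches ssthresh, and tcp_cong_avoid_ai() credits the whole acked count to snd_cwnd_cnt instead of one unit per call, growing cwnd by delta = cnt / w when the counter overflows the window. A small self-contained simulation of the new arithmetic (user-space C mirroring the helpers above, not reusing them):

/* Toy model of the reworked AI step: w is the current cwnd used as the
 * additive-increase period, 'acked' the per-ACK packet count. */
#include <stdio.h>

struct cc { unsigned cwnd, cnt; };

static void cong_avoid_ai(struct cc *c, unsigned w, unsigned acked)
{
	c->cnt += acked;
	if (c->cnt >= w) {
		unsigned delta = c->cnt / w;

		c->cnt -= delta * w;
		c->cwnd += delta;	/* ~1 segment per window's worth of ACKs */
	}
}

int main(void)
{
	struct cc c = { .cwnd = 10, .cnt = 0 };
	int i;

	/* 12 ACKs of 2 packets each: 24 packets acked, cwnd 10 -> 12 */
	for (i = 0; i < 12; i++)
		cong_avoid_ai(&c, c.cwnd, 2);
	printf("cwnd=%u cnt=%u\n", c.cwnd, c.cnt);
	return 0;
}
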
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 6b6002416a73..4b276d1ed980 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c | |||
@@ -93,9 +93,7 @@ struct bictcp { | |||
93 | u32 epoch_start; /* beginning of an epoch */ | 93 | u32 epoch_start; /* beginning of an epoch */ |
94 | u32 ack_cnt; /* number of acks */ | 94 | u32 ack_cnt; /* number of acks */ |
95 | u32 tcp_cwnd; /* estimated tcp cwnd */ | 95 | u32 tcp_cwnd; /* estimated tcp cwnd */ |
96 | #define ACK_RATIO_SHIFT 4 | 96 | u16 unused; |
97 | #define ACK_RATIO_LIMIT (32u << ACK_RATIO_SHIFT) | ||
98 | u16 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */ | ||
99 | u8 sample_cnt; /* number of samples to decide curr_rtt */ | 97 | u8 sample_cnt; /* number of samples to decide curr_rtt */ |
100 | u8 found; /* the exit point is found? */ | 98 | u8 found; /* the exit point is found? */ |
101 | u32 round_start; /* beginning of each round */ | 99 | u32 round_start; /* beginning of each round */ |
@@ -114,7 +112,6 @@ static inline void bictcp_reset(struct bictcp *ca) | |||
114 | ca->bic_K = 0; | 112 | ca->bic_K = 0; |
115 | ca->delay_min = 0; | 113 | ca->delay_min = 0; |
116 | ca->epoch_start = 0; | 114 | ca->epoch_start = 0; |
117 | ca->delayed_ack = 2 << ACK_RATIO_SHIFT; | ||
118 | ca->ack_cnt = 0; | 115 | ca->ack_cnt = 0; |
119 | ca->tcp_cwnd = 0; | 116 | ca->tcp_cwnd = 0; |
120 | ca->found = 0; | 117 | ca->found = 0; |
@@ -205,23 +202,30 @@ static u32 cubic_root(u64 a) | |||
205 | /* | 202 | /* |
206 | * Compute congestion window to use. | 203 | * Compute congestion window to use. |
207 | */ | 204 | */ |
208 | static inline void bictcp_update(struct bictcp *ca, u32 cwnd) | 205 | static inline void bictcp_update(struct bictcp *ca, u32 cwnd, u32 acked) |
209 | { | 206 | { |
210 | u32 delta, bic_target, max_cnt; | 207 | u32 delta, bic_target, max_cnt; |
211 | u64 offs, t; | 208 | u64 offs, t; |
212 | 209 | ||
213 | ca->ack_cnt++; /* count the number of ACKs */ | 210 | ca->ack_cnt += acked; /* count the number of ACKed packets */ |
214 | 211 | ||
215 | if (ca->last_cwnd == cwnd && | 212 | if (ca->last_cwnd == cwnd && |
216 | (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32) | 213 | (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32) |
217 | return; | 214 | return; |
218 | 215 | ||
216 | /* The CUBIC function can update ca->cnt at most once per jiffy. | ||
217 | * On all cwnd reduction events, ca->epoch_start is set to 0, | ||
218 | * which will force a recalculation of ca->cnt. | ||
219 | */ | ||
220 | if (ca->epoch_start && tcp_time_stamp == ca->last_time) | ||
221 | goto tcp_friendliness; | ||
222 | |||
219 | ca->last_cwnd = cwnd; | 223 | ca->last_cwnd = cwnd; |
220 | ca->last_time = tcp_time_stamp; | 224 | ca->last_time = tcp_time_stamp; |
221 | 225 | ||
222 | if (ca->epoch_start == 0) { | 226 | if (ca->epoch_start == 0) { |
223 | ca->epoch_start = tcp_time_stamp; /* record beginning */ | 227 | ca->epoch_start = tcp_time_stamp; /* record beginning */ |
224 | ca->ack_cnt = 1; /* start counting */ | 228 | ca->ack_cnt = acked; /* start counting */ |
225 | ca->tcp_cwnd = cwnd; /* syn with cubic */ | 229 | ca->tcp_cwnd = cwnd; /* syn with cubic */ |
226 | 230 | ||
227 | if (ca->last_max_cwnd <= cwnd) { | 231 | if (ca->last_max_cwnd <= cwnd) { |
@@ -283,6 +287,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) | |||
283 | if (ca->last_max_cwnd == 0 && ca->cnt > 20) | 287 | if (ca->last_max_cwnd == 0 && ca->cnt > 20) |
284 | ca->cnt = 20; /* increase cwnd 5% per RTT */ | 288 | ca->cnt = 20; /* increase cwnd 5% per RTT */ |
285 | 289 | ||
290 | tcp_friendliness: | ||
286 | /* TCP Friendly */ | 291 | /* TCP Friendly */ |
287 | if (tcp_friendliness) { | 292 | if (tcp_friendliness) { |
288 | u32 scale = beta_scale; | 293 | u32 scale = beta_scale; |
@@ -301,7 +306,6 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) | |||
301 | } | 306 | } |
302 | } | 307 | } |
303 | 308 | ||
304 | ca->cnt = (ca->cnt << ACK_RATIO_SHIFT) / ca->delayed_ack; | ||
305 | if (ca->cnt == 0) /* cannot be zero */ | 309 | if (ca->cnt == 0) /* cannot be zero */ |
306 | ca->cnt = 1; | 310 | ca->cnt = 1; |
307 | } | 311 | } |
@@ -317,11 +321,12 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) | |||
317 | if (tp->snd_cwnd <= tp->snd_ssthresh) { | 321 | if (tp->snd_cwnd <= tp->snd_ssthresh) { |
318 | if (hystart && after(ack, ca->end_seq)) | 322 | if (hystart && after(ack, ca->end_seq)) |
319 | bictcp_hystart_reset(sk); | 323 | bictcp_hystart_reset(sk); |
320 | tcp_slow_start(tp, acked); | 324 | acked = tcp_slow_start(tp, acked); |
321 | } else { | 325 | if (!acked) |
322 | bictcp_update(ca, tp->snd_cwnd); | 326 | return; |
323 | tcp_cong_avoid_ai(tp, ca->cnt); | ||
324 | } | 327 | } |
328 | bictcp_update(ca, tp->snd_cwnd, acked); | ||
329 | tcp_cong_avoid_ai(tp, ca->cnt, acked); | ||
325 | } | 330 | } |
326 | 331 | ||
327 | static u32 bictcp_recalc_ssthresh(struct sock *sk) | 332 | static u32 bictcp_recalc_ssthresh(struct sock *sk) |
@@ -411,20 +416,10 @@ static void hystart_update(struct sock *sk, u32 delay) | |||
411 | */ | 416 | */ |
412 | static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us) | 417 | static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us) |
413 | { | 418 | { |
414 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
415 | const struct tcp_sock *tp = tcp_sk(sk); | 419 | const struct tcp_sock *tp = tcp_sk(sk); |
416 | struct bictcp *ca = inet_csk_ca(sk); | 420 | struct bictcp *ca = inet_csk_ca(sk); |
417 | u32 delay; | 421 | u32 delay; |
418 | 422 | ||
419 | if (icsk->icsk_ca_state == TCP_CA_Open) { | ||
420 | u32 ratio = ca->delayed_ack; | ||
421 | |||
422 | ratio -= ca->delayed_ack >> ACK_RATIO_SHIFT; | ||
423 | ratio += cnt; | ||
424 | |||
425 | ca->delayed_ack = clamp(ratio, 1U, ACK_RATIO_LIMIT); | ||
426 | } | ||
427 | |||
428 | /* Some calls are for duplicates without timestamps */ | 423 | /* Some calls are for duplicates without timestamps */ |
429 | if (rtt_us < 0) | 424 | if (rtt_us < 0) |
430 | return; | 425 | return; |
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 815c85e3b1e0..53db2c309572 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c | |||
@@ -255,6 +255,9 @@ bool tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, | |||
255 | struct tcp_fastopen_cookie valid_foc = { .len = -1 }; | 255 | struct tcp_fastopen_cookie valid_foc = { .len = -1 }; |
256 | bool syn_data = TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1; | 256 | bool syn_data = TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1; |
257 | 257 | ||
258 | if (foc->len == 0) /* Client requests a cookie */ | ||
259 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENCOOKIEREQD); | ||
260 | |||
258 | if (!((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) && | 261 | if (!((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) && |
259 | (syn_data || foc->len >= 0) && | 262 | (syn_data || foc->len >= 0) && |
260 | tcp_fastopen_queue_check(sk))) { | 263 | tcp_fastopen_queue_check(sk))) { |
@@ -265,7 +268,8 @@ bool tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, | |||
265 | if (syn_data && (sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD)) | 268 | if (syn_data && (sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD)) |
266 | goto fastopen; | 269 | goto fastopen; |
267 | 270 | ||
268 | if (tcp_fastopen_cookie_gen(req, skb, &valid_foc) && | 271 | if (foc->len >= 0 && /* Client presents or requests a cookie */ |
272 | tcp_fastopen_cookie_gen(req, skb, &valid_foc) && | ||
269 | foc->len == TCP_FASTOPEN_COOKIE_SIZE && | 273 | foc->len == TCP_FASTOPEN_COOKIE_SIZE && |
270 | foc->len == valid_foc.len && | 274 | foc->len == valid_foc.len && |
271 | !memcmp(foc->val, valid_foc.val, foc->len)) { | 275 | !memcmp(foc->val, valid_foc.val, foc->len)) { |
@@ -284,11 +288,10 @@ fastopen: | |||
284 | LINUX_MIB_TCPFASTOPENPASSIVE); | 288 | LINUX_MIB_TCPFASTOPENPASSIVE); |
285 | return true; | 289 | return true; |
286 | } | 290 | } |
287 | } | 291 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL); |
292 | } else if (foc->len > 0) /* Client presents an invalid cookie */ | ||
293 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL); | ||
288 | 294 | ||
289 | NET_INC_STATS_BH(sock_net(sk), foc->len ? | ||
290 | LINUX_MIB_TCPFASTOPENPASSIVEFAIL : | ||
291 | LINUX_MIB_TCPFASTOPENCOOKIEREQD); | ||
292 | *foc = valid_foc; | 295 | *foc = valid_foc; |
293 | return false; | 296 | return false; |
294 | } | 297 | } |
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 075ab4d5af5e..8fdd27b17306 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c | |||
@@ -100,6 +100,7 @@ int sysctl_tcp_thin_dupack __read_mostly; | |||
100 | 100 | ||
101 | int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; | 101 | int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; |
102 | int sysctl_tcp_early_retrans __read_mostly = 3; | 102 | int sysctl_tcp_early_retrans __read_mostly = 3; |
103 | int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2; | ||
103 | 104 | ||
104 | #define FLAG_DATA 0x01 /* Incoming frame contained data. */ | 105 | #define FLAG_DATA 0x01 /* Incoming frame contained data. */ |
105 | #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ | 106 | #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ |
@@ -3183,8 +3184,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, | |||
3183 | 3184 | ||
3184 | tp->fackets_out -= min(pkts_acked, tp->fackets_out); | 3185 | tp->fackets_out -= min(pkts_acked, tp->fackets_out); |
3185 | 3186 | ||
3186 | if (ca_ops->pkts_acked) | 3187 | if (ca_ops->pkts_acked) { |
3187 | ca_ops->pkts_acked(sk, pkts_acked, ca_seq_rtt_us); | 3188 | long rtt_us = min_t(ulong, ca_seq_rtt_us, sack_rtt_us); |
3189 | ca_ops->pkts_acked(sk, pkts_acked, rtt_us); | ||
3190 | } | ||
3188 | 3191 | ||
3189 | } else if (skb && rtt_update && sack_rtt_us >= 0 && | 3192 | } else if (skb && rtt_update && sack_rtt_us >= 0 && |
3190 | sack_rtt_us > skb_mstamp_us_delta(&now, &skb->skb_mstamp)) { | 3193 | sack_rtt_us > skb_mstamp_us_delta(&now, &skb->skb_mstamp)) { |
@@ -3319,13 +3322,22 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 | |||
3319 | } | 3322 | } |
3320 | 3323 | ||
3321 | /* RFC 5961 7 [ACK Throttling] */ | 3324 | /* RFC 5961 7 [ACK Throttling] */ |
3322 | static void tcp_send_challenge_ack(struct sock *sk) | 3325 | static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb) |
3323 | { | 3326 | { |
3324 | /* unprotected vars, we dont care of overwrites */ | 3327 | /* unprotected vars, we dont care of overwrites */ |
3325 | static u32 challenge_timestamp; | 3328 | static u32 challenge_timestamp; |
3326 | static unsigned int challenge_count; | 3329 | static unsigned int challenge_count; |
3327 | u32 now = jiffies / HZ; | 3330 | struct tcp_sock *tp = tcp_sk(sk); |
3331 | u32 now; | ||
3332 | |||
3333 | /* First check our per-socket dupack rate limit. */ | ||
3334 | if (tcp_oow_rate_limited(sock_net(sk), skb, | ||
3335 | LINUX_MIB_TCPACKSKIPPEDCHALLENGE, | ||
3336 | &tp->last_oow_ack_time)) | ||
3337 | return; | ||
3328 | 3338 | ||
3339 | /* Then check host-wide RFC 5961 rate limit. */ | ||
3340 | now = jiffies / HZ; | ||
3329 | if (now != challenge_timestamp) { | 3341 | if (now != challenge_timestamp) { |
3330 | challenge_timestamp = now; | 3342 | challenge_timestamp = now; |
3331 | challenge_count = 0; | 3343 | challenge_count = 0; |
@@ -3358,34 +3370,34 @@ static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) | |||
3358 | } | 3370 | } |
3359 | 3371 | ||
3360 | /* This routine deals with acks during a TLP episode. | 3372 | /* This routine deals with acks during a TLP episode. |
3373 | * We mark the end of a TLP episode on receiving TLP dupack or when | ||
3374 | * ack is after tlp_high_seq. | ||
3361 | * Ref: loss detection algorithm in draft-dukkipati-tcpm-tcp-loss-probe. | 3375 | * Ref: loss detection algorithm in draft-dukkipati-tcpm-tcp-loss-probe. |
3362 | */ | 3376 | */ |
3363 | static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) | 3377 | static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) |
3364 | { | 3378 | { |
3365 | struct tcp_sock *tp = tcp_sk(sk); | 3379 | struct tcp_sock *tp = tcp_sk(sk); |
3366 | bool is_tlp_dupack = (ack == tp->tlp_high_seq) && | ||
3367 | !(flag & (FLAG_SND_UNA_ADVANCED | | ||
3368 | FLAG_NOT_DUP | FLAG_DATA_SACKED)); | ||
3369 | 3380 | ||
3370 | /* Mark the end of TLP episode on receiving TLP dupack or when | 3381 | if (before(ack, tp->tlp_high_seq)) |
3371 | * ack is after tlp_high_seq. | ||
3372 | */ | ||
3373 | if (is_tlp_dupack) { | ||
3374 | tp->tlp_high_seq = 0; | ||
3375 | return; | 3382 | return; |
3376 | } | ||
3377 | 3383 | ||
3378 | if (after(ack, tp->tlp_high_seq)) { | 3384 | if (flag & FLAG_DSACKING_ACK) { |
3385 | /* This DSACK means original and TLP probe arrived; no loss */ | ||
3386 | tp->tlp_high_seq = 0; | ||
3387 | } else if (after(ack, tp->tlp_high_seq)) { | ||
3388 | /* ACK advances: there was a loss, so reduce cwnd. Reset | ||
3389 | * tlp_high_seq in tcp_init_cwnd_reduction() | ||
3390 | */ | ||
3391 | tcp_init_cwnd_reduction(sk); | ||
3392 | tcp_set_ca_state(sk, TCP_CA_CWR); | ||
3393 | tcp_end_cwnd_reduction(sk); | ||
3394 | tcp_try_keep_open(sk); | ||
3395 | NET_INC_STATS_BH(sock_net(sk), | ||
3396 | LINUX_MIB_TCPLOSSPROBERECOVERY); | ||
3397 | } else if (!(flag & (FLAG_SND_UNA_ADVANCED | | ||
3398 | FLAG_NOT_DUP | FLAG_DATA_SACKED))) { | ||
3399 | /* Pure dupack: original and TLP probe arrived; no loss */ | ||
3379 | tp->tlp_high_seq = 0; | 3400 | tp->tlp_high_seq = 0; |
3380 | /* Don't reduce cwnd if DSACK arrives for TLP retrans. */ | ||
3381 | if (!(flag & FLAG_DSACKING_ACK)) { | ||
3382 | tcp_init_cwnd_reduction(sk); | ||
3383 | tcp_set_ca_state(sk, TCP_CA_CWR); | ||
3384 | tcp_end_cwnd_reduction(sk); | ||
3385 | tcp_try_keep_open(sk); | ||
3386 | NET_INC_STATS_BH(sock_net(sk), | ||
3387 | LINUX_MIB_TCPLOSSPROBERECOVERY); | ||
3388 | } | ||
3389 | } | 3401 | } |
3390 | } | 3402 | } |
3391 | 3403 | ||
@@ -3421,7 +3433,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) | |||
3421 | if (before(ack, prior_snd_una)) { | 3433 | if (before(ack, prior_snd_una)) { |
3422 | /* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */ | 3434 | /* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */ |
3423 | if (before(ack, prior_snd_una - tp->max_window)) { | 3435 | if (before(ack, prior_snd_una - tp->max_window)) { |
3424 | tcp_send_challenge_ack(sk); | 3436 | tcp_send_challenge_ack(sk, skb); |
3425 | return -1; | 3437 | return -1; |
3426 | } | 3438 | } |
3427 | goto old_ack; | 3439 | goto old_ack; |
@@ -4990,7 +5002,10 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, | |||
4990 | tcp_paws_discard(sk, skb)) { | 5002 | tcp_paws_discard(sk, skb)) { |
4991 | if (!th->rst) { | 5003 | if (!th->rst) { |
4992 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); | 5004 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); |
4993 | tcp_send_dupack(sk, skb); | 5005 | if (!tcp_oow_rate_limited(sock_net(sk), skb, |
5006 | LINUX_MIB_TCPACKSKIPPEDPAWS, | ||
5007 | &tp->last_oow_ack_time)) | ||
5008 | tcp_send_dupack(sk, skb); | ||
4994 | goto discard; | 5009 | goto discard; |
4995 | } | 5010 | } |
4996 | /* Reset is accepted even if it did not pass PAWS. */ | 5011 | /* Reset is accepted even if it did not pass PAWS. */ |
@@ -5007,7 +5022,10 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, | |||
5007 | if (!th->rst) { | 5022 | if (!th->rst) { |
5008 | if (th->syn) | 5023 | if (th->syn) |
5009 | goto syn_challenge; | 5024 | goto syn_challenge; |
5010 | tcp_send_dupack(sk, skb); | 5025 | if (!tcp_oow_rate_limited(sock_net(sk), skb, |
5026 | LINUX_MIB_TCPACKSKIPPEDSEQ, | ||
5027 | &tp->last_oow_ack_time)) | ||
5028 | tcp_send_dupack(sk, skb); | ||
5011 | } | 5029 | } |
5012 | goto discard; | 5030 | goto discard; |
5013 | } | 5031 | } |
@@ -5023,7 +5041,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, | |||
5023 | if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) | 5041 | if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) |
5024 | tcp_reset(sk); | 5042 | tcp_reset(sk); |
5025 | else | 5043 | else |
5026 | tcp_send_challenge_ack(sk); | 5044 | tcp_send_challenge_ack(sk, skb); |
5027 | goto discard; | 5045 | goto discard; |
5028 | } | 5046 | } |
5029 | 5047 | ||
@@ -5037,7 +5055,7 @@ syn_challenge: | |||
5037 | if (syn_inerr) | 5055 | if (syn_inerr) |
5038 | TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); | 5056 | TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); |
5039 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE); | 5057 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE); |
5040 | tcp_send_challenge_ack(sk); | 5058 | tcp_send_challenge_ack(sk, skb); |
5041 | goto discard; | 5059 | goto discard; |
5042 | } | 5060 | } |
5043 | 5061 | ||
@@ -5870,10 +5888,9 @@ static inline void pr_drop_req(struct request_sock *req, __u16 port, int family) | |||
5870 | * TCP ECN negotiation. | 5888 | * TCP ECN negotiation. |
5871 | * | 5889 | * |
5872 | * Exception: tcp_ca wants ECN. This is required for DCTCP | 5890 | * Exception: tcp_ca wants ECN. This is required for DCTCP |
5873 | * congestion control; it requires setting ECT on all packets, | 5891 | * congestion control: Linux DCTCP asserts ECT on all packets, |
5874 | * including SYN. We inverse the test in this case: If our | 5892 | * including SYN, which is the most optimal solution; however, |
5875 | * local socket wants ECN, but peer only set ece/cwr (but not | 5893 | * others, such as FreeBSD, do not. |
5876 | * ECT in IP header) its probably a non-DCTCP aware sender. | ||
5877 | */ | 5894 | */ |
5878 | static void tcp_ecn_create_request(struct request_sock *req, | 5895 | static void tcp_ecn_create_request(struct request_sock *req, |
5879 | const struct sk_buff *skb, | 5896 | const struct sk_buff *skb, |
@@ -5883,18 +5900,15 @@ static void tcp_ecn_create_request(struct request_sock *req, | |||
5883 | const struct tcphdr *th = tcp_hdr(skb); | 5900 | const struct tcphdr *th = tcp_hdr(skb); |
5884 | const struct net *net = sock_net(listen_sk); | 5901 | const struct net *net = sock_net(listen_sk); |
5885 | bool th_ecn = th->ece && th->cwr; | 5902 | bool th_ecn = th->ece && th->cwr; |
5886 | bool ect, need_ecn, ecn_ok; | 5903 | bool ect, ecn_ok; |
5887 | 5904 | ||
5888 | if (!th_ecn) | 5905 | if (!th_ecn) |
5889 | return; | 5906 | return; |
5890 | 5907 | ||
5891 | ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield); | 5908 | ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield); |
5892 | need_ecn = tcp_ca_needs_ecn(listen_sk); | ||
5893 | ecn_ok = net->ipv4.sysctl_tcp_ecn || dst_feature(dst, RTAX_FEATURE_ECN); | 5909 | ecn_ok = net->ipv4.sysctl_tcp_ecn || dst_feature(dst, RTAX_FEATURE_ECN); |
5894 | 5910 | ||
5895 | if (!ect && !need_ecn && ecn_ok) | 5911 | if ((!ect && ecn_ok) || tcp_ca_needs_ecn(listen_sk)) |
5896 | inet_rsk(req)->ecn_ok = 1; | ||
5897 | else if (ect && need_ecn) | ||
5898 | inet_rsk(req)->ecn_ok = 1; | 5912 | inet_rsk(req)->ecn_ok = 1; |
5899 | } | 5913 | } |
5900 | 5914 | ||
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a3f72d7fc06c..5a2dfed4783b 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c | |||
@@ -683,7 +683,8 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) | |||
683 | arg.bound_dev_if = sk->sk_bound_dev_if; | 683 | arg.bound_dev_if = sk->sk_bound_dev_if; |
684 | 684 | ||
685 | arg.tos = ip_hdr(skb)->tos; | 685 | arg.tos = ip_hdr(skb)->tos; |
686 | ip_send_unicast_reply(net, skb, &TCP_SKB_CB(skb)->header.h4.opt, | 686 | ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk), |
687 | skb, &TCP_SKB_CB(skb)->header.h4.opt, | ||
687 | ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, | 688 | ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, |
688 | &arg, arg.iov[0].iov_len); | 689 | &arg, arg.iov[0].iov_len); |
689 | 690 | ||
@@ -767,7 +768,8 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, | |||
767 | if (oif) | 768 | if (oif) |
768 | arg.bound_dev_if = oif; | 769 | arg.bound_dev_if = oif; |
769 | arg.tos = tos; | 770 | arg.tos = tos; |
770 | ip_send_unicast_reply(net, skb, &TCP_SKB_CB(skb)->header.h4.opt, | 771 | ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk), |
772 | skb, &TCP_SKB_CB(skb)->header.h4.opt, | ||
771 | ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, | 773 | ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, |
772 | &arg, arg.iov[0].iov_len); | 774 | &arg, arg.iov[0].iov_len); |
773 | 775 | ||
@@ -1340,6 +1342,8 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, | |||
1340 | } | 1342 | } |
1341 | sk_setup_caps(newsk, dst); | 1343 | sk_setup_caps(newsk, dst); |
1342 | 1344 | ||
1345 | tcp_ca_openreq_child(newsk, dst); | ||
1346 | |||
1343 | tcp_sync_mss(newsk, dst_mtu(dst)); | 1347 | tcp_sync_mss(newsk, dst_mtu(dst)); |
1344 | newtp->advmss = dst_metric_advmss(dst); | 1348 | newtp->advmss = dst_metric_advmss(dst); |
1345 | if (tcp_sk(sk)->rx_opt.user_mss && | 1349 | if (tcp_sk(sk)->rx_opt.user_mss && |
@@ -2428,14 +2432,40 @@ struct proto tcp_prot = { | |||
2428 | }; | 2432 | }; |
2429 | EXPORT_SYMBOL(tcp_prot); | 2433 | EXPORT_SYMBOL(tcp_prot); |
2430 | 2434 | ||
2435 | static void __net_exit tcp_sk_exit(struct net *net) | ||
2436 | { | ||
2437 | int cpu; | ||
2438 | |||
2439 | for_each_possible_cpu(cpu) | ||
2440 | inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu)); | ||
2441 | free_percpu(net->ipv4.tcp_sk); | ||
2442 | } | ||
2443 | |||
2431 | static int __net_init tcp_sk_init(struct net *net) | 2444 | static int __net_init tcp_sk_init(struct net *net) |
2432 | { | 2445 | { |
2446 | int res, cpu; | ||
2447 | |||
2448 | net->ipv4.tcp_sk = alloc_percpu(struct sock *); | ||
2449 | if (!net->ipv4.tcp_sk) | ||
2450 | return -ENOMEM; | ||
2451 | |||
2452 | for_each_possible_cpu(cpu) { | ||
2453 | struct sock *sk; | ||
2454 | |||
2455 | res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW, | ||
2456 | IPPROTO_TCP, net); | ||
2457 | if (res) | ||
2458 | goto fail; | ||
2459 | *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk; | ||
2460 | } | ||
2433 | net->ipv4.sysctl_tcp_ecn = 2; | 2461 | net->ipv4.sysctl_tcp_ecn = 2; |
2462 | net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; | ||
2434 | return 0; | 2463 | return 0; |
2435 | } | ||
2436 | 2464 | ||
2437 | static void __net_exit tcp_sk_exit(struct net *net) | 2465 | fail: |
2438 | { | 2466 | tcp_sk_exit(net); |
2467 | |||
2468 | return res; | ||
2439 | } | 2469 | } |
2440 | 2470 | ||
2441 | static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) | 2471 | static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) |
diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index 272327134a1b..c2a75c6957a1 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c | |||
@@ -120,7 +120,7 @@ static ssize_t tcp_cgroup_write(struct kernfs_open_file *of, | |||
120 | switch (of_cft(of)->private) { | 120 | switch (of_cft(of)->private) { |
121 | case RES_LIMIT: | 121 | case RES_LIMIT: |
122 | /* see memcontrol.c */ | 122 | /* see memcontrol.c */ |
123 | ret = page_counter_memparse(buf, &nr_pages); | 123 | ret = page_counter_memparse(buf, "-1", &nr_pages); |
124 | if (ret) | 124 | if (ret) |
125 | break; | 125 | break; |
126 | mutex_lock(&tcp_limit_mutex); | 126 | mutex_lock(&tcp_limit_mutex); |
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index ed9c9a91851c..e5f41bd5ec1b 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c | |||
@@ -886,7 +886,8 @@ static int tcp_metrics_dump_info(struct sk_buff *skb, | |||
886 | if (tcp_metrics_fill_info(skb, tm) < 0) | 886 | if (tcp_metrics_fill_info(skb, tm) < 0) |
887 | goto nla_put_failure; | 887 | goto nla_put_failure; |
888 | 888 | ||
889 | return genlmsg_end(skb, hdr); | 889 | genlmsg_end(skb, hdr); |
890 | return 0; | ||
890 | 891 | ||
891 | nla_put_failure: | 892 | nla_put_failure: |
892 | genlmsg_cancel(skb, hdr); | 893 | genlmsg_cancel(skb, hdr); |
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 63d2680b65db..dd11ac7798c6 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c | |||
@@ -58,6 +58,25 @@ static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) | |||
58 | return seq == e_win && seq == end_seq; | 58 | return seq == e_win && seq == end_seq; |
59 | } | 59 | } |
60 | 60 | ||
61 | static enum tcp_tw_status | ||
62 | tcp_timewait_check_oow_rate_limit(struct inet_timewait_sock *tw, | ||
63 | const struct sk_buff *skb, int mib_idx) | ||
64 | { | ||
65 | struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); | ||
66 | |||
67 | if (!tcp_oow_rate_limited(twsk_net(tw), skb, mib_idx, | ||
68 | &tcptw->tw_last_oow_ack_time)) { | ||
69 | /* Send ACK. Note, we do not put the bucket, | ||
70 | * it will be released by caller. | ||
71 | */ | ||
72 | return TCP_TW_ACK; | ||
73 | } | ||
74 | |||
75 | /* We are rate-limiting, so just release the tw sock and drop skb. */ | ||
76 | inet_twsk_put(tw); | ||
77 | return TCP_TW_SUCCESS; | ||
78 | } | ||
79 | |||
61 | /* | 80 | /* |
62 | * * Main purpose of TIME-WAIT state is to close connection gracefully, | 81 | * * Main purpose of TIME-WAIT state is to close connection gracefully, |
63 | * when one of ends sits in LAST-ACK or CLOSING retransmitting FIN | 82 | * when one of ends sits in LAST-ACK or CLOSING retransmitting FIN |
@@ -116,7 +135,8 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, | |||
116 | !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, | 135 | !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, |
117 | tcptw->tw_rcv_nxt, | 136 | tcptw->tw_rcv_nxt, |
118 | tcptw->tw_rcv_nxt + tcptw->tw_rcv_wnd)) | 137 | tcptw->tw_rcv_nxt + tcptw->tw_rcv_wnd)) |
119 | return TCP_TW_ACK; | 138 | return tcp_timewait_check_oow_rate_limit( |
139 | tw, skb, LINUX_MIB_TCPACKSKIPPEDFINWAIT2); | ||
120 | 140 | ||
121 | if (th->rst) | 141 | if (th->rst) |
122 | goto kill; | 142 | goto kill; |
@@ -250,10 +270,8 @@ kill: | |||
250 | inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN, | 270 | inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN, |
251 | TCP_TIMEWAIT_LEN); | 271 | TCP_TIMEWAIT_LEN); |
252 | 272 | ||
253 | /* Send ACK. Note, we do not put the bucket, | 273 | return tcp_timewait_check_oow_rate_limit( |
254 | * it will be released by caller. | 274 | tw, skb, LINUX_MIB_TCPACKSKIPPEDTIMEWAIT); |
255 | */ | ||
256 | return TCP_TW_ACK; | ||
257 | } | 275 | } |
258 | inet_twsk_put(tw); | 276 | inet_twsk_put(tw); |
259 | return TCP_TW_SUCCESS; | 277 | return TCP_TW_SUCCESS; |
@@ -289,6 +307,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) | |||
289 | tcptw->tw_ts_recent = tp->rx_opt.ts_recent; | 307 | tcptw->tw_ts_recent = tp->rx_opt.ts_recent; |
290 | tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; | 308 | tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; |
291 | tcptw->tw_ts_offset = tp->tsoffset; | 309 | tcptw->tw_ts_offset = tp->tsoffset; |
310 | tcptw->tw_last_oow_ack_time = 0; | ||
292 | 311 | ||
293 | #if IS_ENABLED(CONFIG_IPV6) | 312 | #if IS_ENABLED(CONFIG_IPV6) |
294 | if (tw->tw_family == PF_INET6) { | 313 | if (tw->tw_family == PF_INET6) { |
@@ -399,6 +418,32 @@ static void tcp_ecn_openreq_child(struct tcp_sock *tp, | |||
399 | tp->ecn_flags = inet_rsk(req)->ecn_ok ? TCP_ECN_OK : 0; | 418 | tp->ecn_flags = inet_rsk(req)->ecn_ok ? TCP_ECN_OK : 0; |
400 | } | 419 | } |
401 | 420 | ||
421 | void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) | ||
422 | { | ||
423 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
424 | u32 ca_key = dst_metric(dst, RTAX_CC_ALGO); | ||
425 | bool ca_got_dst = false; | ||
426 | |||
427 | if (ca_key != TCP_CA_UNSPEC) { | ||
428 | const struct tcp_congestion_ops *ca; | ||
429 | |||
430 | rcu_read_lock(); | ||
431 | ca = tcp_ca_find_key(ca_key); | ||
432 | if (likely(ca && try_module_get(ca->owner))) { | ||
433 | icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst); | ||
434 | icsk->icsk_ca_ops = ca; | ||
435 | ca_got_dst = true; | ||
436 | } | ||
437 | rcu_read_unlock(); | ||
438 | } | ||
439 | |||
440 | if (!ca_got_dst && !try_module_get(icsk->icsk_ca_ops->owner)) | ||
441 | tcp_assign_congestion_control(sk); | ||
442 | |||
443 | tcp_set_ca_state(sk, TCP_CA_Open); | ||
444 | } | ||
445 | EXPORT_SYMBOL_GPL(tcp_ca_openreq_child); | ||
446 | |||
402 | /* This is not only more efficient than what we used to do, it eliminates | 447 | /* This is not only more efficient than what we used to do, it eliminates |
403 | * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM | 448 | * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM |
404 | * | 449 | * |
@@ -441,6 +486,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, | |||
441 | tcp_enable_early_retrans(newtp); | 486 | tcp_enable_early_retrans(newtp); |
442 | newtp->tlp_high_seq = 0; | 487 | newtp->tlp_high_seq = 0; |
443 | newtp->lsndtime = treq->snt_synack; | 488 | newtp->lsndtime = treq->snt_synack; |
489 | newtp->last_oow_ack_time = 0; | ||
444 | newtp->total_retrans = req->num_retrans; | 490 | newtp->total_retrans = req->num_retrans; |
445 | 491 | ||
446 | /* So many TCP implementations out there (incorrectly) count the | 492 | /* So many TCP implementations out there (incorrectly) count the |
@@ -451,10 +497,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, | |||
451 | newtp->snd_cwnd = TCP_INIT_CWND; | 497 | newtp->snd_cwnd = TCP_INIT_CWND; |
452 | newtp->snd_cwnd_cnt = 0; | 498 | newtp->snd_cwnd_cnt = 0; |
453 | 499 | ||
454 | if (!try_module_get(newicsk->icsk_ca_ops->owner)) | ||
455 | tcp_assign_congestion_control(newsk); | ||
456 | |||
457 | tcp_set_ca_state(newsk, TCP_CA_Open); | ||
458 | tcp_init_xmit_timers(newsk); | 500 | tcp_init_xmit_timers(newsk); |
459 | __skb_queue_head_init(&newtp->out_of_order_queue); | 501 | __skb_queue_head_init(&newtp->out_of_order_queue); |
460 | newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1; | 502 | newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1; |
@@ -583,7 +625,11 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, | |||
583 | * Reset timer after retransmitting SYNACK, similar to | 625 | * Reset timer after retransmitting SYNACK, similar to |
584 | * the idea of fast retransmit in recovery. | 626 | * the idea of fast retransmit in recovery. |
585 | */ | 627 | */ |
586 | if (!inet_rtx_syn_ack(sk, req)) | 628 | if (!tcp_oow_rate_limited(sock_net(sk), skb, |
629 | LINUX_MIB_TCPACKSKIPPEDSYNRECV, | ||
630 | &tcp_rsk(req)->last_oow_ack_time) && | ||
631 | |||
632 | !inet_rtx_syn_ack(sk, req)) | ||
587 | req->expires = min(TCP_TIMEOUT_INIT << req->num_timeout, | 633 | req->expires = min(TCP_TIMEOUT_INIT << req->num_timeout, |
588 | TCP_RTO_MAX) + jiffies; | 634 | TCP_RTO_MAX) + jiffies; |
589 | return NULL; | 635 | return NULL; |
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 65caf8b95e17..a2a796c5536b 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c | |||
@@ -59,9 +59,6 @@ int sysctl_tcp_limit_output_bytes __read_mostly = 131072; | |||
59 | */ | 59 | */ |
60 | int sysctl_tcp_tso_win_divisor __read_mostly = 3; | 60 | int sysctl_tcp_tso_win_divisor __read_mostly = 3; |
61 | 61 | ||
62 | int sysctl_tcp_mtu_probing __read_mostly = 0; | ||
63 | int sysctl_tcp_base_mss __read_mostly = TCP_BASE_MSS; | ||
64 | |||
65 | /* By default, RFC2861 behavior. */ | 62 | /* By default, RFC2861 behavior. */ |
66 | int sysctl_tcp_slow_start_after_idle __read_mostly = 1; | 63 | int sysctl_tcp_slow_start_after_idle __read_mostly = 1; |
67 | 64 | ||
@@ -948,7 +945,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, | |||
948 | 945 | ||
949 | skb_orphan(skb); | 946 | skb_orphan(skb); |
950 | skb->sk = sk; | 947 | skb->sk = sk; |
951 | skb->destructor = tcp_wfree; | 948 | skb->destructor = skb_is_tcp_pure_ack(skb) ? sock_wfree : tcp_wfree; |
952 | skb_set_hash_from_sk(skb, sk); | 949 | skb_set_hash_from_sk(skb, sk); |
953 | atomic_add(skb->truesize, &sk->sk_wmem_alloc); | 950 | atomic_add(skb->truesize, &sk->sk_wmem_alloc); |
954 | 951 | ||
@@ -1350,11 +1347,12 @@ void tcp_mtup_init(struct sock *sk) | |||
1350 | { | 1347 | { |
1351 | struct tcp_sock *tp = tcp_sk(sk); | 1348 | struct tcp_sock *tp = tcp_sk(sk); |
1352 | struct inet_connection_sock *icsk = inet_csk(sk); | 1349 | struct inet_connection_sock *icsk = inet_csk(sk); |
1350 | struct net *net = sock_net(sk); | ||
1353 | 1351 | ||
1354 | icsk->icsk_mtup.enabled = sysctl_tcp_mtu_probing > 1; | 1352 | icsk->icsk_mtup.enabled = net->ipv4.sysctl_tcp_mtu_probing > 1; |
1355 | icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) + | 1353 | icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) + |
1356 | icsk->icsk_af_ops->net_header_len; | 1354 | icsk->icsk_af_ops->net_header_len; |
1357 | icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, sysctl_tcp_base_mss); | 1355 | icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, net->ipv4.sysctl_tcp_base_mss); |
1358 | icsk->icsk_mtup.probe_size = 0; | 1356 | icsk->icsk_mtup.probe_size = 0; |
1359 | } | 1357 | } |
1360 | EXPORT_SYMBOL(tcp_mtup_init); | 1358 | EXPORT_SYMBOL(tcp_mtup_init); |
@@ -2939,6 +2937,25 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, | |||
2939 | } | 2937 | } |
2940 | EXPORT_SYMBOL(tcp_make_synack); | 2938 | EXPORT_SYMBOL(tcp_make_synack); |
2941 | 2939 | ||
2940 | static void tcp_ca_dst_init(struct sock *sk, const struct dst_entry *dst) | ||
2941 | { | ||
2942 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
2943 | const struct tcp_congestion_ops *ca; | ||
2944 | u32 ca_key = dst_metric(dst, RTAX_CC_ALGO); | ||
2945 | |||
2946 | if (ca_key == TCP_CA_UNSPEC) | ||
2947 | return; | ||
2948 | |||
2949 | rcu_read_lock(); | ||
2950 | ca = tcp_ca_find_key(ca_key); | ||
2951 | if (likely(ca && try_module_get(ca->owner))) { | ||
2952 | module_put(icsk->icsk_ca_ops->owner); | ||
2953 | icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst); | ||
2954 | icsk->icsk_ca_ops = ca; | ||
2955 | } | ||
2956 | rcu_read_unlock(); | ||
2957 | } | ||
2958 | |||
2942 | /* Do all connect socket setups that can be done AF independent. */ | 2959 | /* Do all connect socket setups that can be done AF independent. */ |
2943 | static void tcp_connect_init(struct sock *sk) | 2960 | static void tcp_connect_init(struct sock *sk) |
2944 | { | 2961 | { |
@@ -2964,6 +2981,8 @@ static void tcp_connect_init(struct sock *sk) | |||
2964 | tcp_mtup_init(sk); | 2981 | tcp_mtup_init(sk); |
2965 | tcp_sync_mss(sk, dst_mtu(dst)); | 2982 | tcp_sync_mss(sk, dst_mtu(dst)); |
2966 | 2983 | ||
2984 | tcp_ca_dst_init(sk, dst); | ||
2985 | |||
2967 | if (!tp->window_clamp) | 2986 | if (!tp->window_clamp) |
2968 | tp->window_clamp = dst_metric(dst, RTAX_WINDOW); | 2987 | tp->window_clamp = dst_metric(dst, RTAX_WINDOW); |
2969 | tp->advmss = dst_metric_advmss(dst); | 2988 | tp->advmss = dst_metric_advmss(dst); |
@@ -3034,7 +3053,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) | |||
3034 | { | 3053 | { |
3035 | struct tcp_sock *tp = tcp_sk(sk); | 3054 | struct tcp_sock *tp = tcp_sk(sk); |
3036 | struct tcp_fastopen_request *fo = tp->fastopen_req; | 3055 | struct tcp_fastopen_request *fo = tp->fastopen_req; |
3037 | int syn_loss = 0, space, err = 0; | 3056 | int syn_loss = 0, space, err = 0, copied; |
3038 | unsigned long last_syn_loss = 0; | 3057 | unsigned long last_syn_loss = 0; |
3039 | struct sk_buff *syn_data; | 3058 | struct sk_buff *syn_data; |
3040 | 3059 | ||
@@ -3072,11 +3091,16 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) | |||
3072 | goto fallback; | 3091 | goto fallback; |
3073 | syn_data->ip_summed = CHECKSUM_PARTIAL; | 3092 | syn_data->ip_summed = CHECKSUM_PARTIAL; |
3074 | memcpy(syn_data->cb, syn->cb, sizeof(syn->cb)); | 3093 | memcpy(syn_data->cb, syn->cb, sizeof(syn->cb)); |
3075 | if (unlikely(memcpy_fromiovecend(skb_put(syn_data, space), | 3094 | copied = copy_from_iter(skb_put(syn_data, space), space, |
3076 | fo->data->msg_iter.iov, 0, space))) { | 3095 | &fo->data->msg_iter); |
3096 | if (unlikely(!copied)) { | ||
3077 | kfree_skb(syn_data); | 3097 | kfree_skb(syn_data); |
3078 | goto fallback; | 3098 | goto fallback; |
3079 | } | 3099 | } |
3100 | if (copied != space) { | ||
3101 | skb_trim(syn_data, copied); | ||
3102 | space = copied; | ||
3103 | } | ||
3080 | 3104 | ||
3081 | /* No more data pending in inet_wait_for_connect() */ | 3105 | /* No more data pending in inet_wait_for_connect() */ |
3082 | if (space == fo->size) | 3106 | if (space == fo->size) |
@@ -3244,6 +3268,14 @@ void tcp_send_ack(struct sock *sk) | |||
3244 | skb_reserve(buff, MAX_TCP_HEADER); | 3268 | skb_reserve(buff, MAX_TCP_HEADER); |
3245 | tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK); | 3269 | tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK); |
3246 | 3270 | ||
3271 | /* We do not want pure acks influencing TCP Small Queues or fq/pacing | ||
3272 | * too much. | ||
3273 | * SKB_TRUESIZE(max(1 .. 66, MAX_TCP_HEADER)) is unfortunately ~784 | ||
3274 | * We also avoid tcp_wfree() overhead (cache line miss accessing | ||
3275 | * tp->tsq_flags) by using regular sock_wfree() | ||
3276 | */ | ||
3277 | skb_set_tcp_pure_ack(buff); | ||
3278 | |||
3247 | /* Send it off, this clears delayed acks for us. */ | 3279 | /* Send it off, this clears delayed acks for us. */ |
3248 | skb_mstamp_get(&buff->skb_mstamp); | 3280 | skb_mstamp_get(&buff->skb_mstamp); |
3249 | tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC)); | 3281 | tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC)); |
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c index 6824afb65d93..333bcb2415ff 100644 --- a/net/ipv4/tcp_scalable.c +++ b/net/ipv4/tcp_scalable.c | |||
@@ -25,7 +25,8 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked) | |||
25 | if (tp->snd_cwnd <= tp->snd_ssthresh) | 25 | if (tp->snd_cwnd <= tp->snd_ssthresh) |
26 | tcp_slow_start(tp, acked); | 26 | tcp_slow_start(tp, acked); |
27 | else | 27 | else |
28 | tcp_cong_avoid_ai(tp, min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)); | 28 | tcp_cong_avoid_ai(tp, min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT), |
29 | 1); | ||
29 | } | 30 | } |
30 | 31 | ||
31 | static u32 tcp_scalable_ssthresh(struct sock *sk) | 32 | static u32 tcp_scalable_ssthresh(struct sock *sk) |
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 1829c7fbc77e..0732b787904e 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c | |||
@@ -101,17 +101,20 @@ static int tcp_orphan_retries(struct sock *sk, int alive) | |||
101 | 101 | ||
102 | static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk) | 102 | static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk) |
103 | { | 103 | { |
104 | struct net *net = sock_net(sk); | ||
105 | |||
104 | /* Black hole detection */ | 106 | /* Black hole detection */ |
105 | if (sysctl_tcp_mtu_probing) { | 107 | if (net->ipv4.sysctl_tcp_mtu_probing) { |
106 | if (!icsk->icsk_mtup.enabled) { | 108 | if (!icsk->icsk_mtup.enabled) { |
107 | icsk->icsk_mtup.enabled = 1; | 109 | icsk->icsk_mtup.enabled = 1; |
108 | tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); | 110 | tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); |
109 | } else { | 111 | } else { |
112 | struct net *net = sock_net(sk); | ||
110 | struct tcp_sock *tp = tcp_sk(sk); | 113 | struct tcp_sock *tp = tcp_sk(sk); |
111 | int mss; | 114 | int mss; |
112 | 115 | ||
113 | mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low) >> 1; | 116 | mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low) >> 1; |
114 | mss = min(sysctl_tcp_base_mss, mss); | 117 | mss = min(net->ipv4.sysctl_tcp_base_mss, mss); |
115 | mss = max(mss, 68 - tp->tcp_header_len); | 118 | mss = max(mss, 68 - tp->tcp_header_len); |
116 | icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss); | 119 | icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss); |
117 | tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); | 120 | tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); |
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c index a4d2d2d88dca..112151eeee45 100644 --- a/net/ipv4/tcp_veno.c +++ b/net/ipv4/tcp_veno.c | |||
@@ -159,7 +159,7 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked) | |||
159 | /* In the "non-congestive state", increase cwnd | 159 | /* In the "non-congestive state", increase cwnd |
160 | * every rtt. | 160 | * every rtt. |
161 | */ | 161 | */ |
162 | tcp_cong_avoid_ai(tp, tp->snd_cwnd); | 162 | tcp_cong_avoid_ai(tp, tp->snd_cwnd, 1); |
163 | } else { | 163 | } else { |
164 | /* In the "congestive state", increase cwnd | 164 | /* In the "congestive state", increase cwnd |
165 | * every other rtt. | 165 | * every other rtt. |
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c index cd7273218598..17d35662930d 100644 --- a/net/ipv4/tcp_yeah.c +++ b/net/ipv4/tcp_yeah.c | |||
@@ -92,7 +92,7 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked) | |||
92 | 92 | ||
93 | } else { | 93 | } else { |
94 | /* Reno */ | 94 | /* Reno */ |
95 | tcp_cong_avoid_ai(tp, tp->snd_cwnd); | 95 | tcp_cong_avoid_ai(tp, tp->snd_cwnd, 1); |
96 | } | 96 | } |
97 | 97 | ||
98 | /* The key players are v_vegas.beg_snd_una and v_beg_snd_nxt. | 98 | /* The key players are v_vegas.beg_snd_una and v_beg_snd_nxt. |
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 13b4dcf86ef6..97ef1f8b7be8 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c | |||
@@ -1329,7 +1329,7 @@ try_again: | |||
1329 | *addr_len = sizeof(*sin); | 1329 | *addr_len = sizeof(*sin); |
1330 | } | 1330 | } |
1331 | if (inet->cmsg_flags) | 1331 | if (inet->cmsg_flags) |
1332 | ip_cmsg_recv(msg, skb); | 1332 | ip_cmsg_recv_offset(msg, skb, sizeof(struct udphdr)); |
1333 | 1333 | ||
1334 | err = copied; | 1334 | err = copied; |
1335 | if (flags & MSG_TRUNC) | 1335 | if (flags & MSG_TRUNC) |
@@ -1806,7 +1806,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, | |||
1806 | if (sk != NULL) { | 1806 | if (sk != NULL) { |
1807 | int ret; | 1807 | int ret; |
1808 | 1808 | ||
1809 | if (udp_sk(sk)->convert_csum && uh->check && !IS_UDPLITE(sk)) | 1809 | if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk)) |
1810 | skb_checksum_try_convert(skb, IPPROTO_UDP, uh->check, | 1810 | skb_checksum_try_convert(skb, IPPROTO_UDP, uh->check, |
1811 | inet_compute_pseudo); | 1811 | inet_compute_pseudo); |
1812 | 1812 | ||
diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index 7927db0a9279..4a000f1dd757 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c | |||
@@ -99,11 +99,13 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb, struct netlin | |||
99 | s_slot = cb->args[0]; | 99 | s_slot = cb->args[0]; |
100 | num = s_num = cb->args[1]; | 100 | num = s_num = cb->args[1]; |
101 | 101 | ||
102 | for (slot = s_slot; slot <= table->mask; num = s_num = 0, slot++) { | 102 | for (slot = s_slot; slot <= table->mask; s_num = 0, slot++) { |
103 | struct sock *sk; | 103 | struct sock *sk; |
104 | struct hlist_nulls_node *node; | 104 | struct hlist_nulls_node *node; |
105 | struct udp_hslot *hslot = &table->hash[slot]; | 105 | struct udp_hslot *hslot = &table->hash[slot]; |
106 | 106 | ||
107 | num = 0; | ||
108 | |||
107 | if (hlist_nulls_empty(&hslot->head)) | 109 | if (hlist_nulls_empty(&hslot->head)) |
108 | continue; | 110 | continue; |
109 | 111 | ||
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index d3e537ef6b7f..d10f6f4ead27 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c | |||
@@ -339,7 +339,8 @@ unflush: | |||
339 | skb_gro_pull(skb, sizeof(struct udphdr)); /* pull encapsulating udp header */ | 339 | skb_gro_pull(skb, sizeof(struct udphdr)); /* pull encapsulating udp header */ |
340 | skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr)); | 340 | skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr)); |
341 | NAPI_GRO_CB(skb)->proto = uo_priv->offload->ipproto; | 341 | NAPI_GRO_CB(skb)->proto = uo_priv->offload->ipproto; |
342 | pp = uo_priv->offload->callbacks.gro_receive(head, skb); | 342 | pp = uo_priv->offload->callbacks.gro_receive(head, skb, |
343 | uo_priv->offload); | ||
343 | 344 | ||
344 | out_unlock: | 345 | out_unlock: |
345 | rcu_read_unlock(); | 346 | rcu_read_unlock(); |
@@ -395,7 +396,9 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff) | |||
395 | 396 | ||
396 | if (uo_priv != NULL) { | 397 | if (uo_priv != NULL) { |
397 | NAPI_GRO_CB(skb)->proto = uo_priv->offload->ipproto; | 398 | NAPI_GRO_CB(skb)->proto = uo_priv->offload->ipproto; |
398 | err = uo_priv->offload->callbacks.gro_complete(skb, nhoff + sizeof(struct udphdr)); | 399 | err = uo_priv->offload->callbacks.gro_complete(skb, |
400 | nhoff + sizeof(struct udphdr), | ||
401 | uo_priv->offload); | ||
399 | } | 402 | } |
400 | 403 | ||
401 | rcu_read_unlock(); | 404 | rcu_read_unlock(); |
diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c index 1671263e5fa0..c83b35485056 100644 --- a/net/ipv4/udp_tunnel.c +++ b/net/ipv4/udp_tunnel.c | |||
@@ -63,7 +63,7 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock, | |||
63 | inet_sk(sk)->mc_loop = 0; | 63 | inet_sk(sk)->mc_loop = 0; |
64 | 64 | ||
65 | /* Enable CHECKSUM_UNNECESSARY to CHECKSUM_COMPLETE conversion */ | 65 | /* Enable CHECKSUM_UNNECESSARY to CHECKSUM_COMPLETE conversion */ |
66 | udp_set_convert_csum(sk, true); | 66 | inet_inc_convert_csum(sk); |
67 | 67 | ||
68 | rcu_assign_sk_user_data(sk, cfg->sk_user_data); | 68 | rcu_assign_sk_user_data(sk, cfg->sk_user_data); |
69 | 69 | ||
@@ -75,10 +75,10 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock, | |||
75 | } | 75 | } |
76 | EXPORT_SYMBOL_GPL(setup_udp_tunnel_sock); | 76 | EXPORT_SYMBOL_GPL(setup_udp_tunnel_sock); |
77 | 77 | ||
78 | int udp_tunnel_xmit_skb(struct socket *sock, struct rtable *rt, | 78 | int udp_tunnel_xmit_skb(struct rtable *rt, struct sk_buff *skb, |
79 | struct sk_buff *skb, __be32 src, __be32 dst, | 79 | __be32 src, __be32 dst, __u8 tos, __u8 ttl, |
80 | __u8 tos, __u8 ttl, __be16 df, __be16 src_port, | 80 | __be16 df, __be16 src_port, __be16 dst_port, |
81 | __be16 dst_port, bool xnet) | 81 | bool xnet, bool nocheck) |
82 | { | 82 | { |
83 | struct udphdr *uh; | 83 | struct udphdr *uh; |
84 | 84 | ||
@@ -90,9 +90,9 @@ int udp_tunnel_xmit_skb(struct socket *sock, struct rtable *rt, | |||
90 | uh->source = src_port; | 90 | uh->source = src_port; |
91 | uh->len = htons(skb->len); | 91 | uh->len = htons(skb->len); |
92 | 92 | ||
93 | udp_set_csum(sock->sk->sk_no_check_tx, skb, src, dst, skb->len); | 93 | udp_set_csum(nocheck, skb, src, dst, skb->len); |
94 | 94 | ||
95 | return iptunnel_xmit(sock->sk, rt, skb, src, dst, IPPROTO_UDP, | 95 | return iptunnel_xmit(skb->sk, rt, skb, src, dst, IPPROTO_UDP, |
96 | tos, ttl, df, xnet); | 96 | tos, ttl, df, xnet); |
97 | } | 97 | } |
98 | EXPORT_SYMBOL_GPL(udp_tunnel_xmit_skb); | 98 | EXPORT_SYMBOL_GPL(udp_tunnel_xmit_skb); |