path: root/net
Diffstat (limited to 'net')
-rw-r--r--  net/9p/trans_virtio.c | 24
-rw-r--r--  net/bridge/br_if.c | 2
-rw-r--r--  net/caif/caif_socket.c | 2
-rw-r--r--  net/can/af_can.c | 3
-rw-r--r--  net/compat.c | 7
-rw-r--r--  net/core/rtnetlink.c | 26
-rw-r--r--  net/core/skbuff.c | 10
-rw-r--r--  net/core/sock.c | 4
-rw-r--r--  net/core/sysctl_net_core.c | 10
-rw-r--r--  net/ipv4/inet_connection_sock.c | 1
-rw-r--r--  net/ipv4/inet_diag.c | 18
-rw-r--r--  net/ipv4/ip_forward.c | 1
-rw-r--r--  net/ipv4/ip_fragment.c | 11
-rw-r--r--  net/ipv4/ip_sockglue.c | 33
-rw-r--r--  net/ipv4/netfilter/ip_tables.c | 6
-rw-r--r--  net/ipv4/ping.c | 12
-rw-r--r--  net/ipv4/tcp.c | 10
-rw-r--r--  net/ipv4/tcp_cong.c | 6
-rw-r--r--  net/ipv4/tcp_cubic.c | 6
-rw-r--r--  net/ipv4/tcp_output.c | 6
-rw-r--r--  net/ipv4/xfrm4_output.c | 2
-rw-r--r--  net/ipv6/datagram.c | 39
-rw-r--r--  net/ipv6/fib6_rules.c | 1
-rw-r--r--  net/ipv6/ip6_output.c | 1
-rw-r--r--  net/ipv6/ip6_tunnel.c | 33
-rw-r--r--  net/ipv6/netfilter/ip6_tables.c | 6
-rw-r--r--  net/ipv6/ping.c | 5
-rw-r--r--  net/ipv6/udp_offload.c | 8
-rw-r--r--  net/ipv6/xfrm6_output.c | 2
-rw-r--r--  net/ipv6/xfrm6_policy.c | 1
-rw-r--r--  net/mac80211/ieee80211_i.h | 24
-rw-r--r--  net/mac80211/mlme.c | 16
-rw-r--r--  net/mac80211/rx.c | 3
-rw-r--r--  net/mac80211/util.c | 2
-rw-r--r--  net/netfilter/ipvs/ip_vs_sync.c | 3
-rw-r--r--  net/netfilter/nf_log.c | 24
-rw-r--r--  net/netfilter/nf_tables_api.c | 66
-rw-r--r--  net/netfilter/nf_tables_core.c | 8
-rw-r--r--  net/netfilter/nfnetlink_cthelper.c | 3
-rw-r--r--  net/netfilter/nft_compat.c | 20
-rw-r--r--  net/netfilter/nft_hash.c | 2
-rw-r--r--  net/netfilter/xt_TPROXY.c | 4
-rw-r--r--  net/packet/af_packet.c | 22
-rw-r--r--  net/rds/iw_rdma.c | 40
-rw-r--r--  net/rxrpc/ar-error.c | 4
-rw-r--r--  net/rxrpc/ar-recvmsg.c | 2
-rw-r--r--  net/sched/act_bpf.c | 36
-rw-r--r--  net/sched/cls_u32.c | 5
-rw-r--r--  net/socket.c | 4
-rw-r--r--  net/sunrpc/xprtrdma/Makefile | 3
-rw-r--r--  net/sunrpc/xprtrdma/fmr_ops.c | 208
-rw-r--r--  net/sunrpc/xprtrdma/frwr_ops.c | 353
-rw-r--r--  net/sunrpc/xprtrdma/physical_ops.c | 94
-rw-r--r--  net/sunrpc/xprtrdma/rpc_rdma.c | 87
-rw-r--r--  net/sunrpc/xprtrdma/transport.c | 61
-rw-r--r--  net/sunrpc/xprtrdma/verbs.c | 699
-rw-r--r--  net/sunrpc/xprtrdma/xprt_rdma.h | 90
-rw-r--r--  net/tipc/link.c | 7
-rw-r--r--  net/wireless/nl80211.c | 10
-rw-r--r--  net/xfrm/xfrm_policy.c | 12
60 files changed, 1294 insertions, 914 deletions
diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c
index d8e376a5f0f1..36a1a739ad68 100644
--- a/net/9p/trans_virtio.c
+++ b/net/9p/trans_virtio.c
@@ -658,14 +658,30 @@ p9_virtio_create(struct p9_client *client, const char *devname, char *args)
 static void p9_virtio_remove(struct virtio_device *vdev)
 {
 	struct virtio_chan *chan = vdev->priv;
-
-	if (chan->inuse)
-		p9_virtio_close(chan->client);
-	vdev->config->del_vqs(vdev);
+	unsigned long warning_time;
 
 	mutex_lock(&virtio_9p_lock);
+
+	/* Remove self from list so we don't get new users. */
 	list_del(&chan->chan_list);
+	warning_time = jiffies;
+
+	/* Wait for existing users to close. */
+	while (chan->inuse) {
+		mutex_unlock(&virtio_9p_lock);
+		msleep(250);
+		if (time_after(jiffies, warning_time + 10 * HZ)) {
+			dev_emerg(&vdev->dev,
+				  "p9_virtio_remove: waiting for device in use.\n");
+			warning_time = jiffies;
+		}
+		mutex_lock(&virtio_9p_lock);
+	}
+
 	mutex_unlock(&virtio_9p_lock);
+
+	vdev->config->del_vqs(vdev);
+
 	sysfs_remove_file(&(vdev->dev.kobj), &dev_attr_mount_tag.attr);
 	kobject_uevent(&(vdev->dev.kobj), KOBJ_CHANGE);
 	kfree(chan->tag);
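A note on the loop above: the 10-second warning period is computed with time_after() rather than a plain ">" so the comparison stays correct when jiffies wraps around. Below is a minimal userspace sketch of that subtraction trick; the macro body mirrors include/linux/jiffies.h, while the counter values are invented for the demonstration.

#include <stdio.h>
#include <limits.h>

/* wrap-safe "is a later than b?" -- same arithmetic as the kernel macro */
#define time_after(a, b)	((long)((b) - (a)) < 0)

int main(void)
{
	unsigned long warning_time = ULONG_MAX - 15;	/* counter about to wrap */
	unsigned long now = 16;				/* counter after the wrap */

	printf("naive now > warning_time: %d\n", now > warning_time);	/* 0 */
	printf("time_after(now, warning_time): %d\n",
	       time_after(now, warning_time));				/* 1 */
	return 0;
}
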
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index b087d278c679..1849d96b3c91 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -563,6 +563,8 @@ int br_del_if(struct net_bridge *br, struct net_device *dev)
 	 */
 	del_nbp(p);
 
+	dev_set_mtu(br->dev, br_min_mtu(br));
+
 	spin_lock_bh(&br->lock);
 	changed_addr = br_stp_recalculate_bridge_id(br);
 	spin_unlock_bh(&br->lock);
diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c
index 769b185fefbd..a6e2da0bc718 100644
--- a/net/caif/caif_socket.c
+++ b/net/caif/caif_socket.c
@@ -281,7 +281,7 @@ static int caif_seqpkt_recvmsg(struct kiocb *iocb, struct socket *sock,
 	int copylen;
 
 	ret = -EOPNOTSUPP;
-	if (m->msg_flags&MSG_OOB)
+	if (flags & MSG_OOB)
 		goto read_error;
 
 	skb = skb_recv_datagram(sk, flags, 0 , &ret);
diff --git a/net/can/af_can.c b/net/can/af_can.c
index 66e08040ced7..32d710eaf1fc 100644
--- a/net/can/af_can.c
+++ b/net/can/af_can.c
@@ -259,6 +259,9 @@ int can_send(struct sk_buff *skb, int loop)
 		goto inval_skb;
 	}
 
+	skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+	skb_reset_mac_header(skb);
 	skb_reset_network_header(skb);
 	skb_reset_transport_header(skb);
 
diff --git a/net/compat.c b/net/compat.c
index 94d3d5e97883..f7bd286a8280 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -49,6 +49,13 @@ ssize_t get_compat_msghdr(struct msghdr *kmsg,
 	    __get_user(kmsg->msg_controllen, &umsg->msg_controllen) ||
 	    __get_user(kmsg->msg_flags, &umsg->msg_flags))
 		return -EFAULT;
+
+	if (!uaddr)
+		kmsg->msg_namelen = 0;
+
+	if (kmsg->msg_namelen < 0)
+		return -EINVAL;
+
 	if (kmsg->msg_namelen > sizeof(struct sockaddr_storage))
 		kmsg->msg_namelen = sizeof(struct sockaddr_storage);
 	kmsg->msg_control = compat_ptr(tmp3);
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 25b4b5d23485..ee0608bb3bc0 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -2166,28 +2166,28 @@ replay:
 			}
 		}
 		err = rtnl_configure_link(dev, ifm);
-		if (err < 0) {
-			if (ops->newlink) {
-				LIST_HEAD(list_kill);
-
-				ops->dellink(dev, &list_kill);
-				unregister_netdevice_many(&list_kill);
-			} else {
-				unregister_netdevice(dev);
-			}
-			goto out;
-		}
-
+		if (err < 0)
+			goto out_unregister;
 		if (link_net) {
 			err = dev_change_net_namespace(dev, dest_net, ifname);
 			if (err < 0)
-				unregister_netdevice(dev);
+				goto out_unregister;
 		}
 out:
 		if (link_net)
 			put_net(link_net);
 		put_net(dest_net);
 		return err;
+out_unregister:
+		if (ops->newlink) {
+			LIST_HEAD(list_kill);
+
+			ops->dellink(dev, &list_kill);
+			unregister_netdevice_many(&list_kill);
+		} else {
+			unregister_netdevice(dev);
+		}
+		goto out;
 	}
 }
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index f80507823531..8e4ac97c8477 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3733,9 +3733,13 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
 		     struct sock *sk, int tstype)
 {
 	struct sk_buff *skb;
-	bool tsonly = sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TSONLY;
+	bool tsonly;
 
-	if (!sk || !skb_may_tx_timestamp(sk, tsonly))
+	if (!sk)
+		return;
+
+	tsonly = sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TSONLY;
+	if (!skb_may_tx_timestamp(sk, tsonly))
 		return;
 
 	if (tsonly)
@@ -4173,7 +4177,7 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet)
 	skb->ignore_df = 0;
 	skb_dst_drop(skb);
 	skb->mark = 0;
-	skb->sender_cpu = 0;
+	skb_sender_cpu_clear(skb);
 	skb_init_secmark(skb);
 	secpath_reset(skb);
 	nf_reset(skb);
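The __skb_tstamp_tx() hunk above fixes a use-before-NULL-check: the old initializer read sk->sk_tsflags before "!sk" was evaluated. A compact userspace sketch of the bug class, with the kernel structures reduced to a stub:

#include <stdio.h>
#include <stdbool.h>

struct sock { unsigned int sk_tsflags; };

static bool buggy(const struct sock *sk)
{
	bool tsonly = sk->sk_tsflags & 1;	/* dereferences before the check */

	return sk && tsonly;			/* too late if sk == NULL */
}

static bool fixed(const struct sock *sk)
{
	bool tsonly;

	if (!sk)				/* test first... */
		return false;

	tsonly = sk->sk_tsflags & 1;		/* ...dereference second */
	return tsonly;
}

int main(void)
{
	printf("fixed(NULL) = %d\n", fixed(NULL));	/* safe, prints 0 */
	/* buggy(NULL) would dereference a null pointer */
	return 0;
}
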
diff --git a/net/core/sock.c b/net/core/sock.c
index 93c8b20c91e4..78e89eb7eb70 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1655,6 +1655,10 @@ void sock_rfree(struct sk_buff *skb)
 }
 EXPORT_SYMBOL(sock_rfree);
 
+/*
+ * Buffer destructor for skbs that are not used directly in read or write
+ * path, e.g. for error handler skbs. Automatically called from kfree_skb.
+ */
 void sock_efree(struct sk_buff *skb)
 {
 	sock_put(skb->sk);
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 433424804284..8ce351ffceb1 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -25,6 +25,8 @@
 static int zero = 0;
 static int one = 1;
 static int ushort_max = USHRT_MAX;
+static int min_sndbuf = SOCK_MIN_SNDBUF;
+static int min_rcvbuf = SOCK_MIN_RCVBUF;
 
 static int net_msg_warn;	/* Unused, but still a sysctl */
 
@@ -237,7 +239,7 @@ static struct ctl_table net_core_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &one,
+		.extra1		= &min_sndbuf,
 	},
 	{
 		.procname	= "rmem_max",
@@ -245,7 +247,7 @@ static struct ctl_table net_core_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &one,
+		.extra1		= &min_rcvbuf,
 	},
 	{
 		.procname	= "wmem_default",
@@ -253,7 +255,7 @@ static struct ctl_table net_core_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &one,
+		.extra1		= &min_sndbuf,
 	},
 	{
 		.procname	= "rmem_default",
@@ -261,7 +263,7 @@ static struct ctl_table net_core_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &one,
+		.extra1		= &min_rcvbuf,
 	},
 	{
 		.procname	= "dev_weight",
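With extra1 now pointing at min_sndbuf/min_rcvbuf, proc_dointvec_minmax() rejects writes below the socket-buffer floor instead of accepting any value >= 1. A simplified model of that range check follows; the floor constant is a placeholder (the real SOCK_MIN_SNDBUF is derived from skb truesize overhead), and the real handler lives in kernel/sysctl.c.

#include <stdio.h>
#include <errno.h>

static const int min_sndbuf = 4608;	/* placeholder for SOCK_MIN_SNDBUF */

/* toy model of proc_dointvec_minmax(): reject out-of-range writes */
static int dointvec_minmax(int *val, int newval, const int *min, const int *max)
{
	if ((min && newval < *min) || (max && newval > *max))
		return -EINVAL;
	*val = newval;
	return 0;
}

int main(void)
{
	int wmem_max = 212992;

	/* "echo 1 > /proc/sys/net/core/wmem_max" now fails... */
	printf("%d\n", dointvec_minmax(&wmem_max, 1, &min_sndbuf, NULL));
	/* ...while sane values are still accepted */
	printf("%d\n", dointvec_minmax(&wmem_max, 65536, &min_sndbuf, NULL));
	return 0;
}
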
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 14d02ea905b6..3e44b9b0b78e 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -268,6 +268,7 @@ static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
 		release_sock(sk);
 		if (reqsk_queue_empty(&icsk->icsk_accept_queue))
 			timeo = schedule_timeout(timeo);
+		sched_annotate_sleep();
 		lock_sock(sk);
 		err = 0;
 		if (!reqsk_queue_empty(&icsk->icsk_accept_queue))
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 81751f12645f..592aff37366b 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -71,6 +71,20 @@ static inline void inet_diag_unlock_handler(
 	mutex_unlock(&inet_diag_table_mutex);
 }
 
+static size_t inet_sk_attr_size(void)
+{
+	return	  nla_total_size(sizeof(struct tcp_info))
+		+ nla_total_size(1) /* INET_DIAG_SHUTDOWN */
+		+ nla_total_size(1) /* INET_DIAG_TOS */
+		+ nla_total_size(1) /* INET_DIAG_TCLASS */
+		+ nla_total_size(sizeof(struct inet_diag_meminfo))
+		+ nla_total_size(sizeof(struct inet_diag_msg))
+		+ nla_total_size(SK_MEMINFO_VARS * sizeof(u32))
+		+ nla_total_size(TCP_CA_NAME_MAX)
+		+ nla_total_size(sizeof(struct tcpvegas_info))
+		+ 64;
+}
+
 int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
 		      struct sk_buff *skb, struct inet_diag_req_v2 *req,
 		      struct user_namespace *user_ns,
@@ -326,9 +340,7 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_s
 	if (err)
 		goto out;
 
-	rep = nlmsg_new(sizeof(struct inet_diag_msg) +
-			sizeof(struct inet_diag_meminfo) +
-			sizeof(struct tcp_info) + 64, GFP_KERNEL);
+	rep = nlmsg_new(inet_sk_attr_size(), GFP_KERNEL);
 	if (!rep) {
 		err = -ENOMEM;
 		goto out;
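inet_sk_attr_size() adds one nla_total_size() term per attribute the reply may carry; each term covers the attribute header plus 4-byte alignment padding, so the sum is a safe upper bound for nlmsg_new(). The arithmetic, reduced to a standalone sketch (the definitions mirror include/uapi/linux/netlink.h):

#include <stdio.h>

struct nlattr {
	unsigned short nla_len;
	unsigned short nla_type;
};

#define NLA_ALIGNTO	4
#define NLA_ALIGN(len)	(((len) + NLA_ALIGNTO - 1) & ~(NLA_ALIGNTO - 1))
#define NLA_HDRLEN	((int)NLA_ALIGN(sizeof(struct nlattr)))

static int nla_total_size(int payload)
{
	return NLA_ALIGN(NLA_HDRLEN + payload);	/* header + payload, padded */
}

int main(void)
{
	/* a 1-byte attribute such as INET_DIAG_TOS still costs 8 bytes */
	printf("nla_total_size(1) = %d\n", nla_total_size(1));
	/* counting raw payload sizes, as the old nlmsg_new() call
	 * effectively did, undercounts the needed space */
	printf("three 1-byte attrs: %d vs raw 3\n", 3 * nla_total_size(1));
	return 0;
}
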
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index 787b3c294ce6..d9bc28ac5d1b 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -67,6 +67,7 @@ static int ip_forward_finish(struct sk_buff *skb)
 	if (unlikely(opt->optlen))
 		ip_forward_options(skb);
 
+	skb_sender_cpu_clear(skb);
 	return dst_output(skb);
 }
 
72 73
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 2c8d98e728c0..145a50c4d566 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -659,27 +659,30 @@ EXPORT_SYMBOL(ip_defrag);
 struct sk_buff *ip_check_defrag(struct sk_buff *skb, u32 user)
 {
 	struct iphdr iph;
+	int netoff;
 	u32 len;
 
 	if (skb->protocol != htons(ETH_P_IP))
 		return skb;
 
-	if (skb_copy_bits(skb, 0, &iph, sizeof(iph)) < 0)
+	netoff = skb_network_offset(skb);
+
+	if (skb_copy_bits(skb, netoff, &iph, sizeof(iph)) < 0)
 		return skb;
 
 	if (iph.ihl < 5 || iph.version != 4)
 		return skb;
 
 	len = ntohs(iph.tot_len);
-	if (skb->len < len || len < (iph.ihl * 4))
+	if (skb->len < netoff + len || len < (iph.ihl * 4))
 		return skb;
 
 	if (ip_is_fragment(&iph)) {
 		skb = skb_share_check(skb, GFP_ATOMIC);
 		if (skb) {
-			if (!pskb_may_pull(skb, iph.ihl*4))
+			if (!pskb_may_pull(skb, netoff + iph.ihl * 4))
 				return skb;
-			if (pskb_trim_rcsum(skb, len))
+			if (pskb_trim_rcsum(skb, netoff + len))
 				return skb;
 			memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
 			if (ip_defrag(skb, user))
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 31d8c71986b4..5cd99271d3a6 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -432,17 +432,32 @@ void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 port, u32 inf
 	kfree_skb(skb);
 }
 
-static bool ipv4_pktinfo_prepare_errqueue(const struct sock *sk,
-					  const struct sk_buff *skb,
-					  int ee_origin)
+/* IPv4 supports cmsg on all icmp errors and some timestamps
+ *
+ * Timestamp code paths do not initialize the fields expected by cmsg:
+ * the PKTINFO fields in skb->cb[]. Fill those in here.
+ */
+static bool ipv4_datagram_support_cmsg(const struct sock *sk,
+				       struct sk_buff *skb,
+				       int ee_origin)
 {
-	struct in_pktinfo *info = PKTINFO_SKB_CB(skb);
+	struct in_pktinfo *info;
+
+	if (ee_origin == SO_EE_ORIGIN_ICMP)
+		return true;
 
-	if ((ee_origin != SO_EE_ORIGIN_TIMESTAMPING) ||
-	    (!(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_CMSG)) ||
+	if (ee_origin == SO_EE_ORIGIN_LOCAL)
+		return false;
+
+	/* Support IP_PKTINFO on tstamp packets if requested, to correlate
+	 * timestamp with egress dev. Not possible for packets without dev
+	 * or without payload (SOF_TIMESTAMPING_OPT_TSONLY).
+	 */
+	if ((!(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_CMSG)) ||
 	    (!skb->dev))
 		return false;
 
+	info = PKTINFO_SKB_CB(skb);
 	info->ipi_spec_dst.s_addr = ip_hdr(skb)->saddr;
 	info->ipi_ifindex = skb->dev->ifindex;
 	return true;
@@ -483,7 +498,7 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
 
 	serr = SKB_EXT_ERR(skb);
 
-	if (sin && skb->len) {
+	if (sin && serr->port) {
 		sin->sin_family = AF_INET;
 		sin->sin_addr.s_addr = *(__be32 *)(skb_network_header(skb) +
 						   serr->addr_offset);
@@ -496,9 +511,7 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
 	sin = &errhdr.offender;
 	memset(sin, 0, sizeof(*sin));
 
-	if (skb->len &&
-	    (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP ||
-	     ipv4_pktinfo_prepare_errqueue(sk, skb, serr->ee.ee_origin))) {
+	if (ipv4_datagram_support_cmsg(sk, skb, serr->ee.ee_origin)) {
 		sin->sin_family = AF_INET;
 		sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
 		if (inet_sk(sk)->cmsg_flags)
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 99e810f84671..cf5e82f39d3b 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -272,9 +272,9 @@ static void trace_packet(const struct sk_buff *skb,
 		    &chainname, &comment, &rulenum) != 0)
 			break;
 
-	nf_log_packet(net, AF_INET, hook, skb, in, out, &trace_loginfo,
-		      "TRACE: %s:%s:%s:%u ",
-		      tablename, chainname, comment, rulenum);
+	nf_log_trace(net, AF_INET, hook, skb, in, out, &trace_loginfo,
+		     "TRACE: %s:%s:%s:%u ",
+		     tablename, chainname, comment, rulenum);
 }
 #endif
 
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index e9f66e1cda50..208d5439e59b 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -259,6 +259,9 @@ int ping_init_sock(struct sock *sk)
 	kgid_t low, high;
 	int ret = 0;
 
+	if (sk->sk_family == AF_INET6)
+		sk->sk_ipv6only = 1;
+
 	inet_get_ping_group_range_net(net, &low, &high);
 	if (gid_lte(low, group) && gid_lte(group, high))
 		return 0;
@@ -305,6 +308,11 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk,
 		if (addr_len < sizeof(*addr))
 			return -EINVAL;
 
+		if (addr->sin_family != AF_INET &&
+		    !(addr->sin_family == AF_UNSPEC &&
+		      addr->sin_addr.s_addr == htonl(INADDR_ANY)))
+			return -EAFNOSUPPORT;
+
 		pr_debug("ping_check_bind_addr(sk=%p,addr=%pI4,port=%d)\n",
 			 sk, &addr->sin_addr.s_addr, ntohs(addr->sin_port));
 
@@ -330,7 +338,7 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk,
 			return -EINVAL;
 
 		if (addr->sin6_family != AF_INET6)
-			return -EINVAL;
+			return -EAFNOSUPPORT;
 
 		pr_debug("ping_check_bind_addr(sk=%p,addr=%pI6c,port=%d)\n",
 			 sk, addr->sin6_addr.s6_addr, ntohs(addr->sin6_port));
@@ -716,7 +724,7 @@ static int ping_v4_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *m
 		if (msg->msg_namelen < sizeof(*usin))
 			return -EINVAL;
 		if (usin->sin_family != AF_INET)
-			return -EINVAL;
+			return -EAFNOSUPPORT;
 		daddr = usin->sin_addr.s_addr;
 		/* no remote port */
 	} else {
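The errno change above matters to applications: a wrong address family on a ping socket now reports the POSIX-specified EAFNOSUPPORT instead of the generic EINVAL. A hypothetical probe follows; note that the unprivileged ICMP socket only opens when net.ipv4.ping_group_range covers the caller's group, which is an assumption here.

#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <sys/socket.h>
#include <netinet/in.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_ICMP);
	struct sockaddr_in sa;

	if (fd < 0) {
		perror("socket");	/* likely EACCES without the sysctl */
		return 1;
	}

	memset(&sa, 0, sizeof(sa));
	sa.sin_family = AF_INET6;	/* deliberately wrong family */
	if (bind(fd, (struct sockaddr *)&sa, sizeof(sa)) < 0)
		perror("bind");		/* now: Address family not supported */
	return 0;
}
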
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 9d72a0fcd928..995a2259bcfc 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -835,17 +835,13 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
 					int large_allowed)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	u32 new_size_goal, size_goal, hlen;
+	u32 new_size_goal, size_goal;
 
 	if (!large_allowed || !sk_can_gso(sk))
 		return mss_now;
 
-	/* Maybe we should/could use sk->sk_prot->max_header here ? */
-	hlen = inet_csk(sk)->icsk_af_ops->net_header_len +
-	       inet_csk(sk)->icsk_ext_hdr_len +
-	       tp->tcp_header_len;
-
-	new_size_goal = sk->sk_gso_max_size - 1 - hlen;
+	/* Note : tcp_tso_autosize() will eventually split this later */
+	new_size_goal = sk->sk_gso_max_size - 1 - MAX_TCP_HEADER;
 	new_size_goal = tcp_bound_to_half_wnd(tp, new_size_goal);
 
 	/* We try hard to avoid divides here */
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index d694088214cd..62856e185a93 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -378,6 +378,12 @@ EXPORT_SYMBOL_GPL(tcp_slow_start);
  */
 void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked)
 {
+	/* If credits accumulated at a higher w, apply them gently now. */
+	if (tp->snd_cwnd_cnt >= w) {
+		tp->snd_cwnd_cnt = 0;
+		tp->snd_cwnd++;
+	}
+
 	tp->snd_cwnd_cnt += acked;
 	if (tp->snd_cwnd_cnt >= w) {
 		u32 delta = tp->snd_cwnd_cnt / w;
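The new pre-check drains credit that accumulated while the window unit w was larger, one segment at a time, instead of letting the division below convert it into a burst. A standalone sketch; the tail of the function (the delta arithmetic) is reconstructed from context and may differ in detail from the tree:

#include <stdio.h>

struct tcp_sock { unsigned int snd_cwnd, snd_cwnd_cnt; };

static void cong_avoid_ai(struct tcp_sock *tp, unsigned int w, unsigned int acked)
{
	/* If credits accumulated at a higher w, apply them gently now. */
	if (tp->snd_cwnd_cnt >= w) {
		tp->snd_cwnd_cnt = 0;
		tp->snd_cwnd++;
	}

	tp->snd_cwnd_cnt += acked;
	if (tp->snd_cwnd_cnt >= w) {
		unsigned int delta = tp->snd_cwnd_cnt / w;

		tp->snd_cwnd_cnt -= delta * w;
		tp->snd_cwnd += delta;
	}
}

int main(void)
{
	/* credit left over from an earlier phase where w was large */
	struct tcp_sock tp = { .snd_cwnd = 10, .snd_cwnd_cnt = 500 };

	cong_avoid_ai(&tp, 1, 1);
	/* grows to 12, not to 511 as it would without the pre-check */
	printf("cwnd=%u cnt=%u\n", tp.snd_cwnd, tp.snd_cwnd_cnt);
	return 0;
}
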
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index 4b276d1ed980..06d3d665a9fd 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -306,8 +306,10 @@ tcp_friendliness:
 		}
 	}
 
-	if (ca->cnt == 0)			/* cannot be zero */
-		ca->cnt = 1;
+	/* The maximum rate of cwnd increase CUBIC allows is 1 packet per
+	 * 2 packets ACKed, meaning cwnd grows at 1.5x per RTT.
+	 */
+	ca->cnt = max(ca->cnt, 2U);
 }
 
 static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index a2a796c5536b..1db253e36045 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2773,15 +2773,11 @@ void tcp_send_fin(struct sock *sk)
 	} else {
 		/* Socket is locked, keep trying until memory is available. */
 		for (;;) {
-			skb = alloc_skb_fclone(MAX_TCP_HEADER,
-					       sk->sk_allocation);
+			skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation);
 			if (skb)
 				break;
 			yield();
 		}
-
-		/* Reserve space for headers and prepare control bits. */
-		skb_reserve(skb, MAX_TCP_HEADER);
 		/* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
 		tcp_init_nondata_skb(skb, tp->write_seq,
 				     TCPHDR_ACK | TCPHDR_FIN);
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
index d5f6bd9a210a..dab73813cb92 100644
--- a/net/ipv4/xfrm4_output.c
+++ b/net/ipv4/xfrm4_output.c
@@ -63,6 +63,7 @@ int xfrm4_prepare_output(struct xfrm_state *x, struct sk_buff *skb)
 		return err;
 
 	IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE;
+	skb->protocol = htons(ETH_P_IP);
 
 	return x->outer_mode->output2(x, skb);
 }
@@ -71,7 +72,6 @@ EXPORT_SYMBOL(xfrm4_prepare_output);
 int xfrm4_output_finish(struct sk_buff *skb)
 {
 	memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
-	skb->protocol = htons(ETH_P_IP);
 
 #ifdef CONFIG_NETFILTER
 	IPCB(skb)->flags |= IPSKB_XFRM_TRANSFORMED;
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index c215be70cac0..ace8daca5c83 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -325,14 +325,34 @@ void ipv6_local_rxpmtu(struct sock *sk, struct flowi6 *fl6, u32 mtu)
 	kfree_skb(skb);
 }
 
-static void ip6_datagram_prepare_pktinfo_errqueue(struct sk_buff *skb)
+/* IPv6 supports cmsg on all origins aside from SO_EE_ORIGIN_LOCAL.
+ *
+ * At one point, excluding local errors was a quick test to identify icmp/icmp6
+ * errors. This is no longer true, but the test remained, so the v6 stack,
+ * unlike v4, also honors cmsg requests on all wifi and timestamp errors.
+ *
+ * Timestamp code paths do not initialize the fields expected by cmsg:
+ * the PKTINFO fields in skb->cb[]. Fill those in here.
+ */
+static bool ip6_datagram_support_cmsg(struct sk_buff *skb,
+				      struct sock_exterr_skb *serr)
 {
-	int ifindex = skb->dev ? skb->dev->ifindex : -1;
+	if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP ||
+	    serr->ee.ee_origin == SO_EE_ORIGIN_ICMP6)
+		return true;
+
+	if (serr->ee.ee_origin == SO_EE_ORIGIN_LOCAL)
+		return false;
+
+	if (!skb->dev)
+		return false;
 
 	if (skb->protocol == htons(ETH_P_IPV6))
-		IP6CB(skb)->iif = ifindex;
+		IP6CB(skb)->iif = skb->dev->ifindex;
 	else
-		PKTINFO_SKB_CB(skb)->ipi_ifindex = ifindex;
+		PKTINFO_SKB_CB(skb)->ipi_ifindex = skb->dev->ifindex;
+
+	return true;
 }
 
 /*
@@ -369,7 +389,7 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
 
 	serr = SKB_EXT_ERR(skb);
 
-	if (sin && skb->len) {
+	if (sin && serr->port) {
 		const unsigned char *nh = skb_network_header(skb);
 		sin->sin6_family = AF_INET6;
 		sin->sin6_flowinfo = 0;
@@ -394,14 +414,11 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
 	memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err));
 	sin = &errhdr.offender;
 	memset(sin, 0, sizeof(*sin));
-	if (serr->ee.ee_origin != SO_EE_ORIGIN_LOCAL && skb->len) {
+
+	if (ip6_datagram_support_cmsg(skb, serr)) {
 		sin->sin6_family = AF_INET6;
-		if (np->rxopt.all) {
-			if (serr->ee.ee_origin != SO_EE_ORIGIN_ICMP &&
-			    serr->ee.ee_origin != SO_EE_ORIGIN_ICMP6)
-				ip6_datagram_prepare_pktinfo_errqueue(skb);
+		if (np->rxopt.all)
 			ip6_datagram_recv_common_ctl(sk, msg, skb);
-		}
 		if (skb->protocol == htons(ETH_P_IPV6)) {
 			sin->sin6_addr = ipv6_hdr(skb)->saddr;
 			if (np->rxopt.all)
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index b4d5e1d97c1b..27ca79682efb 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -104,6 +104,7 @@ static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
 				goto again;
 			flp6->saddr = saddr;
 		}
+		err = rt->dst.error;
 		goto out;
 	}
 again:
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 0a04a37305d5..7e80b61b51ff 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -318,6 +318,7 @@ static int ip6_forward_proxy_check(struct sk_buff *skb)
 
 static inline int ip6_forward_finish(struct sk_buff *skb)
 {
+	skb_sender_cpu_clear(skb);
 	return dst_output(skb);
 }
 
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 266a264ec212..ddd94eca19b3 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -314,7 +314,7 @@ out:
  * Create tunnel matching given parameters.
  *
  * Return:
- *   created tunnel or NULL
+ *   created tunnel or error pointer
  **/
 
 static struct ip6_tnl *ip6_tnl_create(struct net *net, struct __ip6_tnl_parm *p)
@@ -322,7 +322,7 @@ static struct ip6_tnl *ip6_tnl_create(struct net *net, struct __ip6_tnl_parm *p)
 	struct net_device *dev;
 	struct ip6_tnl *t;
 	char name[IFNAMSIZ];
-	int err;
+	int err = -ENOMEM;
 
 	if (p->name[0])
 		strlcpy(name, p->name, IFNAMSIZ);
@@ -348,7 +348,7 @@ static struct ip6_tnl *ip6_tnl_create(struct net *net, struct __ip6_tnl_parm *p)
 failed_free:
 	ip6_dev_free(dev);
 failed:
-	return NULL;
+	return ERR_PTR(err);
 }
 
 /**
@@ -362,7 +362,7 @@ failed:
  *   tunnel device is created and registered for use.
  *
  * Return:
- *   matching tunnel or NULL
+ *   matching tunnel or error pointer
  **/
 
 static struct ip6_tnl *ip6_tnl_locate(struct net *net,
@@ -380,13 +380,13 @@ static struct ip6_tnl *ip6_tnl_locate(struct net *net,
 		if (ipv6_addr_equal(local, &t->parms.laddr) &&
 		    ipv6_addr_equal(remote, &t->parms.raddr)) {
 			if (create)
-				return NULL;
+				return ERR_PTR(-EEXIST);
 
 			return t;
 		}
 	}
 	if (!create)
-		return NULL;
+		return ERR_PTR(-ENODEV);
 	return ip6_tnl_create(net, p);
 }
 
@@ -1420,7 +1420,7 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 			}
 			ip6_tnl_parm_from_user(&p1, &p);
 			t = ip6_tnl_locate(net, &p1, 0);
-			if (t == NULL)
+			if (IS_ERR(t))
 				t = netdev_priv(dev);
 		} else {
 			memset(&p, 0, sizeof(p));
@@ -1445,7 +1445,7 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 		ip6_tnl_parm_from_user(&p1, &p);
 		t = ip6_tnl_locate(net, &p1, cmd == SIOCADDTUNNEL);
 		if (cmd == SIOCCHGTUNNEL) {
-			if (t != NULL) {
+			if (!IS_ERR(t)) {
 				if (t->dev != dev) {
 					err = -EEXIST;
 					break;
@@ -1457,14 +1457,15 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 			else
 				err = ip6_tnl_update(t, &p1);
 		}
-		if (t) {
+		if (!IS_ERR(t)) {
 			err = 0;
 			ip6_tnl_parm_to_user(&p, &t->parms);
 			if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
 				err = -EFAULT;
 
-		} else
-			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
+		} else {
+			err = PTR_ERR(t);
+		}
 		break;
 	case SIOCDELTUNNEL:
 		err = -EPERM;
@@ -1478,7 +1479,7 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 		err = -ENOENT;
 		ip6_tnl_parm_from_user(&p1, &p);
 		t = ip6_tnl_locate(net, &p1, 0);
-		if (t == NULL)
+		if (IS_ERR(t))
 			break;
 		err = -EPERM;
 		if (t->dev == ip6n->fb_tnl_dev)
@@ -1672,12 +1673,13 @@ static int ip6_tnl_newlink(struct net *src_net, struct net_device *dev,
 			   struct nlattr *tb[], struct nlattr *data[])
 {
 	struct net *net = dev_net(dev);
-	struct ip6_tnl *nt;
+	struct ip6_tnl *nt, *t;
 
 	nt = netdev_priv(dev);
 	ip6_tnl_netlink_parms(data, &nt->parms);
 
-	if (ip6_tnl_locate(net, &nt->parms, 0))
+	t = ip6_tnl_locate(net, &nt->parms, 0);
+	if (!IS_ERR(t))
 		return -EEXIST;
 
 	return ip6_tnl_create2(dev);
@@ -1697,8 +1699,7 @@ static int ip6_tnl_changelink(struct net_device *dev, struct nlattr *tb[],
 	ip6_tnl_netlink_parms(data, &p);
 
 	t = ip6_tnl_locate(net, &p, 0);
-
-	if (t) {
+	if (!IS_ERR(t)) {
 		if (t->dev != dev)
 			return -EEXIST;
 	} else
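The NULL-to-ERR_PTR conversion above lets ip6_tnl_ioctl() report why a lookup failed (-EEXIST vs -ENODEV) rather than guessing from a bare NULL. A minimal userspace rendition of the idiom, mirroring include/linux/err.h:

#include <stdio.h>
#include <errno.h>

#define MAX_ERRNO	4095

static inline void *ERR_PTR(long error)
{
	return (void *)error;		/* encode -errno in the pointer */
}

static inline long PTR_ERR(const void *ptr)
{
	return (long)ptr;		/* decode it again */
}

static inline int IS_ERR(const void *ptr)
{
	/* the top 4095 addresses are never valid, so they flag errors */
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

static void *locate(int create)
{
	return create ? ERR_PTR(-EEXIST) : ERR_PTR(-ENODEV);
}

int main(void)
{
	void *t = locate(1);

	if (IS_ERR(t))
		printf("lookup failed: errno %ld\n", -PTR_ERR(t));
	return 0;
}
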
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index e080fbbbc0e5..bb00c6f2a885 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -298,9 +298,9 @@ static void trace_packet(const struct sk_buff *skb,
 		    &chainname, &comment, &rulenum) != 0)
 			break;
 
-	nf_log_packet(net, AF_INET6, hook, skb, in, out, &trace_loginfo,
-		      "TRACE: %s:%s:%s:%u ",
-		      tablename, chainname, comment, rulenum);
+	nf_log_trace(net, AF_INET6, hook, skb, in, out, &trace_loginfo,
+		     "TRACE: %s:%s:%s:%u ",
+		     tablename, chainname, comment, rulenum);
 }
 #endif
 
306 306
diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c
index bd46f736f61d..a2dfff6ff227 100644
--- a/net/ipv6/ping.c
+++ b/net/ipv6/ping.c
@@ -102,9 +102,10 @@ int ping_v6_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 
 	if (msg->msg_name) {
 		DECLARE_SOCKADDR(struct sockaddr_in6 *, u, msg->msg_name);
-		if (msg->msg_namelen < sizeof(struct sockaddr_in6) ||
-		    u->sin6_family != AF_INET6) {
+		if (msg->msg_namelen < sizeof(*u))
 			return -EINVAL;
+		if (u->sin6_family != AF_INET6) {
+			return -EAFNOSUPPORT;
 		}
 		if (sk->sk_bound_dev_if &&
 		    sk->sk_bound_dev_if != u->sin6_scope_id) {
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index ab889bb16b3c..be2c0ba82c85 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -112,11 +112,9 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
 	fptr = (struct frag_hdr *)(skb_network_header(skb) + unfrag_ip6hlen);
 	fptr->nexthdr = nexthdr;
 	fptr->reserved = 0;
-	if (skb_shinfo(skb)->ip6_frag_id)
-		fptr->identification = skb_shinfo(skb)->ip6_frag_id;
-	else
-		ipv6_select_ident(fptr,
-				  (struct rt6_info *)skb_dst(skb));
+	if (!skb_shinfo(skb)->ip6_frag_id)
+		ipv6_proxy_select_ident(skb);
+	fptr->identification = skb_shinfo(skb)->ip6_frag_id;
 
 	/* Fragment the skb. ipv6 header and the remaining fields of the
 	 * fragment header are updated in ipv6_gso_segment()
diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c
index ca3f29b98ae5..010f8bd2d577 100644
--- a/net/ipv6/xfrm6_output.c
+++ b/net/ipv6/xfrm6_output.c
@@ -114,6 +114,7 @@ int xfrm6_prepare_output(struct xfrm_state *x, struct sk_buff *skb)
 		return err;
 
 	skb->ignore_df = 1;
+	skb->protocol = htons(ETH_P_IPV6);
 
 	return x->outer_mode->output2(x, skb);
 }
@@ -122,7 +123,6 @@ EXPORT_SYMBOL(xfrm6_prepare_output);
 int xfrm6_output_finish(struct sk_buff *skb)
 {
 	memset(IP6CB(skb), 0, sizeof(*IP6CB(skb)));
-	skb->protocol = htons(ETH_P_IPV6);
 
 #ifdef CONFIG_NETFILTER
 	IP6CB(skb)->flags |= IP6SKB_XFRM_TRANSFORMED;
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index 48bf5a06847b..8d2d01b4800a 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -200,6 +200,7 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse)
 
 #if IS_ENABLED(CONFIG_IPV6_MIP6)
 		case IPPROTO_MH:
+			offset += ipv6_optlen(exthdr);
 			if (!onlyproto && pskb_may_pull(skb, nh + offset + 3 - skb->data)) {
 				struct ip6_mh *mh;
 
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 3afe36824703..8d53d65bd2ab 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -58,13 +58,24 @@ struct ieee80211_local;
 #define IEEE80211_UNSET_POWER_LEVEL	INT_MIN
 
 /*
- * Some APs experience problems when working with U-APSD. Decrease the
- * probability of that happening by using legacy mode for all ACs but VO.
- * The AP that caused us trouble was a Cisco 4410N. It ignores our
- * setting, and always treats non-VO ACs as legacy.
+ * Some APs experience problems when working with U-APSD. Decreasing the
+ * probability of that happening by using legacy mode for all ACs but VO isn't
+ * enough.
+ *
+ * Cisco 4410N originally forced us to enable VO by default only because it
+ * treated non-VO ACs as legacy.
+ *
+ * However some APs (notably Netgear R7000) silently reclassify packets to
+ * different ACs. Since u-APSD ACs require trigger frames for frame retrieval
+ * clients would never see some frames (e.g. ARP responses) or would fetch them
+ * accidentally after a long time.
+ *
+ * It makes little sense to enable u-APSD queues by default because it needs
+ * userspace applications to be aware of it to actually take advantage of the
+ * possible additional powersavings. Implicitly depending on driver autotrigger
+ * frame support doesn't make much sense.
  */
-#define IEEE80211_DEFAULT_UAPSD_QUEUES \
-	IEEE80211_WMM_IE_STA_QOSINFO_AC_VO
+#define IEEE80211_DEFAULT_UAPSD_QUEUES 0
 
 #define IEEE80211_DEFAULT_MAX_SP_LEN		\
 	IEEE80211_WMM_IE_STA_QOSINFO_SP_ALL
@@ -453,6 +464,7 @@ struct ieee80211_if_managed {
 	unsigned int flags;
 
 	bool csa_waiting_bcn;
+	bool csa_ignored_same_chan;
 
 	bool beacon_crc_valid;
 	u32 beacon_crc;
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 10ac6324c1d0..142f66aece18 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -1150,6 +1150,17 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata,
 		return;
 	}
 
+	if (cfg80211_chandef_identical(&csa_ie.chandef,
+				       &sdata->vif.bss_conf.chandef)) {
+		if (ifmgd->csa_ignored_same_chan)
+			return;
+		sdata_info(sdata,
+			   "AP %pM tries to chanswitch to same channel, ignore\n",
+			   ifmgd->associated->bssid);
+		ifmgd->csa_ignored_same_chan = true;
+		return;
+	}
+
 	mutex_lock(&local->mtx);
 	mutex_lock(&local->chanctx_mtx);
 	conf = rcu_dereference_protected(sdata->vif.chanctx_conf,
@@ -1210,6 +1221,7 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata,
 	sdata->vif.csa_active = true;
 	sdata->csa_chandef = csa_ie.chandef;
 	sdata->csa_block_tx = csa_ie.mode;
+	ifmgd->csa_ignored_same_chan = false;
 
 	if (sdata->csa_block_tx)
 		ieee80211_stop_vif_queues(local, sdata,
@@ -2090,6 +2102,7 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
 
 	sdata->vif.csa_active = false;
 	ifmgd->csa_waiting_bcn = false;
+	ifmgd->csa_ignored_same_chan = false;
 	if (sdata->csa_block_tx) {
 		ieee80211_wake_vif_queues(local, sdata,
 					  IEEE80211_QUEUE_STOP_REASON_CSA);
@@ -3204,7 +3217,8 @@ static const u64 care_about_ies =
 	(1ULL << WLAN_EID_CHANNEL_SWITCH) |
 	(1ULL << WLAN_EID_PWR_CONSTRAINT) |
 	(1ULL << WLAN_EID_HT_CAPABILITY) |
-	(1ULL << WLAN_EID_HT_OPERATION);
+	(1ULL << WLAN_EID_HT_OPERATION) |
+	(1ULL << WLAN_EID_EXT_CHANSWITCH_ANN);
 
 static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
 				     struct ieee80211_mgmt *mgmt, size_t len,
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 1101563357ea..944bdc04e913 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -2214,6 +2214,9 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx)
 	hdr = (struct ieee80211_hdr *) skb->data;
 	mesh_hdr = (struct ieee80211s_hdr *) (skb->data + hdrlen);
 
+	if (ieee80211_drop_unencrypted(rx, hdr->frame_control))
+		return RX_DROP_MONITOR;
+
 	/* frame is in RMC, don't forward */
 	if (ieee80211_is_data(hdr->frame_control) &&
 	    is_multicast_ether_addr(hdr->addr1) &&
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 8428f4a95479..747bdcf72e92 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -3178,7 +3178,7 @@ int ieee80211_check_combinations(struct ieee80211_sub_if_data *sdata,
 		wdev_iter = &sdata_iter->wdev;
 
 		if (sdata_iter == sdata ||
-		    rcu_access_pointer(sdata_iter->vif.chanctx_conf) == NULL ||
+		    !ieee80211_sdata_running(sdata_iter) ||
 		    local->hw.wiphy->software_iftypes & BIT(wdev_iter->iftype))
 			continue;
 
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index c47ffd7a0a70..d93ceeb3ef04 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -896,6 +896,8 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param,
 			IP_VS_DBG(2, "BACKUP, add new conn. failed\n");
 			return;
 		}
+		if (!(flags & IP_VS_CONN_F_TEMPLATE))
+			kfree(param->pe_data);
 	}
 
 	if (opt)
@@ -1169,6 +1171,7 @@ static inline int ip_vs_proc_sync_conn(struct net *net, __u8 *p, __u8 *msg_end)
 			       (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL)
 			       );
 #endif
+	ip_vs_pe_put(param.pe);
 	return 0;
 	/* Error exit */
 out:
diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
index 0d8448f19dfe..675d12c69e32 100644
--- a/net/netfilter/nf_log.c
+++ b/net/netfilter/nf_log.c
@@ -212,6 +212,30 @@ void nf_log_packet(struct net *net,
 }
 EXPORT_SYMBOL(nf_log_packet);
 
+void nf_log_trace(struct net *net,
+		  u_int8_t pf,
+		  unsigned int hooknum,
+		  const struct sk_buff *skb,
+		  const struct net_device *in,
+		  const struct net_device *out,
+		  const struct nf_loginfo *loginfo, const char *fmt, ...)
+{
+	va_list args;
+	char prefix[NF_LOG_PREFIXLEN];
+	const struct nf_logger *logger;
+
+	rcu_read_lock();
+	logger = rcu_dereference(net->nf.nf_loggers[pf]);
+	if (logger) {
+		va_start(args, fmt);
+		vsnprintf(prefix, sizeof(prefix), fmt, args);
+		va_end(args);
+		logger->logfn(net, pf, hooknum, skb, in, out, loginfo, prefix);
+	}
+	rcu_read_unlock();
+}
+EXPORT_SYMBOL(nf_log_trace);
+
 #define S_SIZE (1024 - (sizeof(unsigned int) + 1))
 
 struct nf_log_buf {
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 199fd0f27b0e..ac1a9528dbf2 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -227,7 +227,7 @@ nft_rule_deactivate_next(struct net *net, struct nft_rule *rule)
 
 static inline void nft_rule_clear(struct net *net, struct nft_rule *rule)
 {
-	rule->genmask = 0;
+	rule->genmask &= ~(1 << gencursor_next(net));
 }
 
 static int
@@ -1225,7 +1225,10 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb,
 
 	if (nla[NFTA_CHAIN_POLICY]) {
 		if ((chain != NULL &&
-		    !(chain->flags & NFT_BASE_CHAIN)) ||
+		    !(chain->flags & NFT_BASE_CHAIN)))
+			return -EOPNOTSUPP;
+
+		if (chain == NULL &&
 		    nla[NFTA_CHAIN_HOOK] == NULL)
 			return -EOPNOTSUPP;
 
@@ -1711,9 +1714,12 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net,
 	}
 	nla_nest_end(skb, list);
 
-	if (rule->ulen &&
-	    nla_put(skb, NFTA_RULE_USERDATA, rule->ulen, nft_userdata(rule)))
-		goto nla_put_failure;
+	if (rule->udata) {
+		struct nft_userdata *udata = nft_userdata(rule);
+		if (nla_put(skb, NFTA_RULE_USERDATA, udata->len + 1,
+			    udata->data) < 0)
+			goto nla_put_failure;
+	}
 
 	nlmsg_end(skb, nlh);
 	return 0;
@@ -1896,11 +1902,12 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb,
 	struct nft_table *table;
 	struct nft_chain *chain;
 	struct nft_rule *rule, *old_rule = NULL;
+	struct nft_userdata *udata;
 	struct nft_trans *trans = NULL;
 	struct nft_expr *expr;
 	struct nft_ctx ctx;
 	struct nlattr *tmp;
-	unsigned int size, i, n, ulen = 0;
+	unsigned int size, i, n, ulen = 0, usize = 0;
 	int err, rem;
 	bool create;
 	u64 handle, pos_handle;
@@ -1968,12 +1975,19 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb,
 			n++;
 		}
 	}
+	/* Check for overflow of dlen field */
+	err = -EFBIG;
+	if (size >= 1 << 12)
+		goto err1;
 
-	if (nla[NFTA_RULE_USERDATA])
+	if (nla[NFTA_RULE_USERDATA]) {
 		ulen = nla_len(nla[NFTA_RULE_USERDATA]);
+		if (ulen > 0)
+			usize = sizeof(struct nft_userdata) + ulen;
+	}
 
 	err = -ENOMEM;
-	rule = kzalloc(sizeof(*rule) + size + ulen, GFP_KERNEL);
+	rule = kzalloc(sizeof(*rule) + size + usize, GFP_KERNEL);
 	if (rule == NULL)
 		goto err1;
 
@@ -1981,10 +1995,13 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb,
 
 	rule->handle = handle;
 	rule->dlen = size;
-	rule->ulen = ulen;
+	rule->udata = ulen ? 1 : 0;
 
-	if (ulen)
-		nla_memcpy(nft_userdata(rule), nla[NFTA_RULE_USERDATA], ulen);
+	if (ulen) {
+		udata = nft_userdata(rule);
+		udata->len = ulen - 1;
+		nla_memcpy(udata->data, nla[NFTA_RULE_USERDATA], ulen);
+	}
 
 	expr = nft_expr_first(rule);
 	for (i = 0; i < n; i++) {
@@ -2031,12 +2048,6 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb,
 
 err3:
 	list_del_rcu(&rule->list);
-	if (trans) {
-		list_del_rcu(&nft_trans_rule(trans)->list);
-		nft_rule_clear(net, nft_trans_rule(trans));
-		nft_trans_destroy(trans);
-		chain->use++;
-	}
 err2:
 	nf_tables_rule_destroy(&ctx, rule);
 err1:
@@ -3612,12 +3623,11 @@ static int nf_tables_commit(struct sk_buff *skb)
 						      &te->elem,
 						      NFT_MSG_DELSETELEM, 0);
 			te->set->ops->get(te->set, &te->elem);
-			te->set->ops->remove(te->set, &te->elem);
 			nft_data_uninit(&te->elem.key, NFT_DATA_VALUE);
-			if (te->elem.flags & NFT_SET_MAP) {
-				nft_data_uninit(&te->elem.data,
-						te->set->dtype);
-			}
+			if (te->set->flags & NFT_SET_MAP &&
+			    !(te->elem.flags & NFT_SET_ELEM_INTERVAL_END))
+				nft_data_uninit(&te->elem.data, te->set->dtype);
+			te->set->ops->remove(te->set, &te->elem);
 			nft_trans_destroy(trans);
 			break;
 		}
@@ -3658,7 +3668,7 @@ static int nf_tables_abort(struct sk_buff *skb)
 {
 	struct net *net = sock_net(skb->sk);
 	struct nft_trans *trans, *next;
-	struct nft_set *set;
+	struct nft_trans_elem *te;
 
 	list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) {
 		switch (trans->msg_type) {
@@ -3719,9 +3729,13 @@ static int nf_tables_abort(struct sk_buff *skb)
 			break;
 		case NFT_MSG_NEWSETELEM:
 			nft_trans_elem_set(trans)->nelems--;
-			set = nft_trans_elem_set(trans);
-			set->ops->get(set, &nft_trans_elem(trans));
-			set->ops->remove(set, &nft_trans_elem(trans));
+			te = (struct nft_trans_elem *)trans->data;
+			te->set->ops->get(te->set, &te->elem);
+			nft_data_uninit(&te->elem.key, NFT_DATA_VALUE);
+			if (te->set->flags & NFT_SET_MAP &&
+			    !(te->elem.flags & NFT_SET_ELEM_INTERVAL_END))
+				nft_data_uninit(&te->elem.data, te->set->dtype);
+			te->set->ops->remove(te->set, &te->elem);
 			nft_trans_destroy(trans);
 			break;
 		case NFT_MSG_DELSETELEM:
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index 3b90eb2b2c55..2d298dccb6dd 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -94,10 +94,10 @@ static void nft_trace_packet(const struct nft_pktinfo *pkt,
 {
 	struct net *net = dev_net(pkt->in ? pkt->in : pkt->out);
 
-	nf_log_packet(net, pkt->xt.family, pkt->ops->hooknum, pkt->skb, pkt->in,
-		      pkt->out, &trace_loginfo, "TRACE: %s:%s:%s:%u ",
-		      chain->table->name, chain->name, comments[type],
-		      rulenum);
+	nf_log_trace(net, pkt->xt.family, pkt->ops->hooknum, pkt->skb, pkt->in,
+		     pkt->out, &trace_loginfo, "TRACE: %s:%s:%s:%u ",
+		     chain->table->name, chain->name, comments[type],
+		     rulenum);
 }
 
 unsigned int
diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c
index a5599fc51a6f..54330fb5efaf 100644
--- a/net/netfilter/nfnetlink_cthelper.c
+++ b/net/netfilter/nfnetlink_cthelper.c
@@ -77,6 +77,9 @@ nfnl_cthelper_parse_tuple(struct nf_conntrack_tuple *tuple,
 	if (!tb[NFCTH_TUPLE_L3PROTONUM] || !tb[NFCTH_TUPLE_L4PROTONUM])
 		return -EINVAL;
 
+	/* Not all fields are initialized so first zero the tuple */
+	memset(tuple, 0, sizeof(struct nf_conntrack_tuple));
+
 	tuple->src.l3num = ntohs(nla_get_be16(tb[NFCTH_TUPLE_L3PROTONUM]));
 	tuple->dst.protonum = nla_get_u8(tb[NFCTH_TUPLE_L4PROTONUM]);
 
diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
index 1279cd85663e..65f3e2b6be44 100644
--- a/net/netfilter/nft_compat.c
+++ b/net/netfilter/nft_compat.c
@@ -123,7 +123,7 @@ static void
123nft_target_set_tgchk_param(struct xt_tgchk_param *par, 123nft_target_set_tgchk_param(struct xt_tgchk_param *par,
124 const struct nft_ctx *ctx, 124 const struct nft_ctx *ctx,
125 struct xt_target *target, void *info, 125 struct xt_target *target, void *info,
126 union nft_entry *entry, u8 proto, bool inv) 126 union nft_entry *entry, u16 proto, bool inv)
127{ 127{
128 par->net = ctx->net; 128 par->net = ctx->net;
129 par->table = ctx->table->name; 129 par->table = ctx->table->name;
@@ -133,11 +133,14 @@ nft_target_set_tgchk_param(struct xt_tgchk_param *par,
133 entry->e4.ip.invflags = inv ? IPT_INV_PROTO : 0; 133 entry->e4.ip.invflags = inv ? IPT_INV_PROTO : 0;
134 break; 134 break;
135 case AF_INET6: 135 case AF_INET6:
136 if (proto)
137 entry->e6.ipv6.flags |= IP6T_F_PROTO;
138
136 entry->e6.ipv6.proto = proto; 139 entry->e6.ipv6.proto = proto;
137 entry->e6.ipv6.invflags = inv ? IP6T_INV_PROTO : 0; 140 entry->e6.ipv6.invflags = inv ? IP6T_INV_PROTO : 0;
138 break; 141 break;
139 case NFPROTO_BRIDGE: 142 case NFPROTO_BRIDGE:
140 entry->ebt.ethproto = proto; 143 entry->ebt.ethproto = (__force __be16)proto;
141 entry->ebt.invflags = inv ? EBT_IPROTO : 0; 144 entry->ebt.invflags = inv ? EBT_IPROTO : 0;
142 break; 145 break;
143 } 146 }
@@ -171,7 +174,7 @@ static const struct nla_policy nft_rule_compat_policy[NFTA_RULE_COMPAT_MAX + 1]
171 [NFTA_RULE_COMPAT_FLAGS] = { .type = NLA_U32 }, 174 [NFTA_RULE_COMPAT_FLAGS] = { .type = NLA_U32 },
172}; 175};
173 176
174static int nft_parse_compat(const struct nlattr *attr, u8 *proto, bool *inv) 177static int nft_parse_compat(const struct nlattr *attr, u16 *proto, bool *inv)
175{ 178{
176 struct nlattr *tb[NFTA_RULE_COMPAT_MAX+1]; 179 struct nlattr *tb[NFTA_RULE_COMPAT_MAX+1];
177 u32 flags; 180 u32 flags;
@@ -203,7 +206,7 @@ nft_target_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
203 struct xt_target *target = expr->ops->data; 206 struct xt_target *target = expr->ops->data;
204 struct xt_tgchk_param par; 207 struct xt_tgchk_param par;
205 size_t size = XT_ALIGN(nla_len(tb[NFTA_TARGET_INFO])); 208 size_t size = XT_ALIGN(nla_len(tb[NFTA_TARGET_INFO]));
206 u8 proto = 0; 209 u16 proto = 0;
207 bool inv = false; 210 bool inv = false;
208 union nft_entry e = {}; 211 union nft_entry e = {};
209 int ret; 212 int ret;
@@ -334,7 +337,7 @@ static const struct nla_policy nft_match_policy[NFTA_MATCH_MAX + 1] = {
334static void 337static void
335nft_match_set_mtchk_param(struct xt_mtchk_param *par, const struct nft_ctx *ctx, 338nft_match_set_mtchk_param(struct xt_mtchk_param *par, const struct nft_ctx *ctx,
336 struct xt_match *match, void *info, 339 struct xt_match *match, void *info,
337 union nft_entry *entry, u8 proto, bool inv) 340 union nft_entry *entry, u16 proto, bool inv)
338{ 341{
339 par->net = ctx->net; 342 par->net = ctx->net;
340 par->table = ctx->table->name; 343 par->table = ctx->table->name;
@@ -344,11 +347,14 @@ nft_match_set_mtchk_param(struct xt_mtchk_param *par, const struct nft_ctx *ctx,
344 entry->e4.ip.invflags = inv ? IPT_INV_PROTO : 0; 347 entry->e4.ip.invflags = inv ? IPT_INV_PROTO : 0;
345 break; 348 break;
346 case AF_INET6: 349 case AF_INET6:
350 if (proto)
351 entry->e6.ipv6.flags |= IP6T_F_PROTO;
352
347 entry->e6.ipv6.proto = proto; 353 entry->e6.ipv6.proto = proto;
348 entry->e6.ipv6.invflags = inv ? IP6T_INV_PROTO : 0; 354 entry->e6.ipv6.invflags = inv ? IP6T_INV_PROTO : 0;
349 break; 355 break;
350 case NFPROTO_BRIDGE: 356 case NFPROTO_BRIDGE:
351 entry->ebt.ethproto = proto; 357 entry->ebt.ethproto = (__force __be16)proto;
352 entry->ebt.invflags = inv ? EBT_IPROTO : 0; 358 entry->ebt.invflags = inv ? EBT_IPROTO : 0;
353 break; 359 break;
354 } 360 }
@@ -385,7 +391,7 @@ nft_match_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
385 struct xt_match *match = expr->ops->data; 391 struct xt_match *match = expr->ops->data;
386 struct xt_mtchk_param par; 392 struct xt_mtchk_param par;
387 size_t size = XT_ALIGN(nla_len(tb[NFTA_MATCH_INFO])); 393 size_t size = XT_ALIGN(nla_len(tb[NFTA_MATCH_INFO]));
388 u8 proto = 0; 394 u16 proto = 0;
389 bool inv = false; 395 bool inv = false;
390 union nft_entry e = {}; 396 union nft_entry e = {};
391 int ret; 397 int ret;
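The proto argument is widened because for NFPROTO_BRIDGE it carries an Ethernet type such as ETH_P_IP (0x0800), a 16-bit big-endian value that a u8 silently truncates; on IPv6 the IP6T_F_PROTO flag must also be set or ip6_tables ignores the protocol field entirely. The truncation in two lines:

#include <stdio.h>

int main(void)
{
	unsigned short ethproto = 0x0800;              /* ETH_P_IP */
	unsigned char as_u8 = (unsigned char)ethproto;

	printf("u16 0x%04x truncates to u8 0x%02x\n", ethproto, as_u8);
	return 0;
}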
diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c
index c82df0a48fcd..37c15e674884 100644
--- a/net/netfilter/nft_hash.c
+++ b/net/netfilter/nft_hash.c
@@ -153,6 +153,8 @@ static void nft_hash_walk(const struct nft_ctx *ctx, const struct nft_set *set,
153 iter->err = err; 153 iter->err = err;
154 goto out; 154 goto out;
155 } 155 }
156
157 continue;
156 } 158 }
157 159
158 if (iter->count < iter->skip) 160 if (iter->count < iter->skip)
diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c
index ef8a926752a9..50e1e5aaf4ce 100644
--- a/net/netfilter/xt_TPROXY.c
+++ b/net/netfilter/xt_TPROXY.c
@@ -513,8 +513,8 @@ static int tproxy_tg6_check(const struct xt_tgchk_param *par)
513{ 513{
514 const struct ip6t_ip6 *i = par->entryinfo; 514 const struct ip6t_ip6 *i = par->entryinfo;
515 515
516 if ((i->proto == IPPROTO_TCP || i->proto == IPPROTO_UDP) 516 if ((i->proto == IPPROTO_TCP || i->proto == IPPROTO_UDP) &&
517 && !(i->flags & IP6T_INV_PROTO)) 517 !(i->invflags & IP6T_INV_PROTO))
518 return 0; 518 return 0;
519 519
520 pr_info("Can be used only in combination with " 520 pr_info("Can be used only in combination with "
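The old test read i->flags, but IP6T_INV_PROTO (0x10) is defined against the invflags bitfield; no IP6T_F_* flag uses that bit, so the "not inverted" clause was always true and rules like "! tcp" slipped past the check. A hedged recreation of the predicate:

#include <stdio.h>

#define IP6T_INV_PROTO 0x10 /* value from the ip6_tables uapi header */

struct rule { unsigned char flags, invflags; };

static int ok_buggy(const struct rule *r) { return !(r->flags    & IP6T_INV_PROTO); }
static int ok_fixed(const struct rule *r) { return !(r->invflags & IP6T_INV_PROTO); }

int main(void)
{
	struct rule inverted = { .flags = 0x01, .invflags = IP6T_INV_PROTO };

	printf("inverted rule passes: buggy=%d fixed=%d\n",
	       ok_buggy(&inverted), ok_fixed(&inverted));
	return 0;
}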
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 5bf1e968a728..f8db7064d81c 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -3123,11 +3123,18 @@ static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
3123 return 0; 3123 return 0;
3124} 3124}
3125 3125
3126static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what) 3126static void packet_dev_mclist_delete(struct net_device *dev,
3127 struct packet_mclist **mlp)
3127{ 3128{
3128 for ( ; i; i = i->next) { 3129 struct packet_mclist *ml;
3129 if (i->ifindex == dev->ifindex) 3130
3130 packet_dev_mc(dev, i, what); 3131 while ((ml = *mlp) != NULL) {
3132 if (ml->ifindex == dev->ifindex) {
3133 packet_dev_mc(dev, ml, -1);
3134 *mlp = ml->next;
3135 kfree(ml);
3136 } else
3137 mlp = &ml->next;
3131 } 3138 }
3132} 3139}
3133 3140
@@ -3204,12 +3211,11 @@ static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
3204 packet_dev_mc(dev, ml, -1); 3211 packet_dev_mc(dev, ml, -1);
3205 kfree(ml); 3212 kfree(ml);
3206 } 3213 }
3207 rtnl_unlock(); 3214 break;
3208 return 0;
3209 } 3215 }
3210 } 3216 }
3211 rtnl_unlock(); 3217 rtnl_unlock();
3212 return -EADDRNOTAVAIL; 3218 return 0;
3213} 3219}
3214 3220
3215static void packet_flush_mclist(struct sock *sk) 3221static void packet_flush_mclist(struct sock *sk)
@@ -3559,7 +3565,7 @@ static int packet_notifier(struct notifier_block *this,
3559 switch (msg) { 3565 switch (msg) {
3560 case NETDEV_UNREGISTER: 3566 case NETDEV_UNREGISTER:
3561 if (po->mclist) 3567 if (po->mclist)
3562 packet_dev_mclist(dev, po->mclist, -1); 3568 packet_dev_mclist_delete(dev, &po->mclist);
3563 /* fallthrough */ 3569 /* fallthrough */
3564 3570
3565 case NETDEV_DOWN: 3571 case NETDEV_DOWN:
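The rewritten helper walks the list through a pointer to each link field, so every matching entry can be unlinked and freed in a single pass with no "previous node" bookkeeping (the old loop only issued the device callback and left the entries in place). A standalone demo of the idiom:

#include <stdio.h>
#include <stdlib.h>

struct node { int ifindex; struct node *next; };

static void delete_matching(struct node **np, int ifindex)
{
	struct node *n;

	while ((n = *np) != NULL) {
		if (n->ifindex == ifindex) {
			*np = n->next;   /* unlink without a prev pointer */
			free(n);
		} else {
			np = &n->next;   /* advance the link cursor */
		}
	}
}

int main(void)
{
	struct node *head = NULL;
	int i;

	for (i = 0; i < 4; i++) {
		struct node *n = malloc(sizeof(*n));
		n->ifindex = i & 1;
		n->next = head;
		head = n;
	}
	delete_matching(&head, 1);
	for (; head; head = head->next)
		printf("kept ifindex %d\n", head->ifindex);
	return 0;
}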
diff --git a/net/rds/iw_rdma.c b/net/rds/iw_rdma.c
index a817705ce2d0..dba8d0864f18 100644
--- a/net/rds/iw_rdma.c
+++ b/net/rds/iw_rdma.c
@@ -88,7 +88,9 @@ static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool,
88 int *unpinned); 88 int *unpinned);
89static void rds_iw_destroy_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr); 89static void rds_iw_destroy_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr);
90 90
91static int rds_iw_get_device(struct rds_sock *rs, struct rds_iw_device **rds_iwdev, struct rdma_cm_id **cm_id) 91static int rds_iw_get_device(struct sockaddr_in *src, struct sockaddr_in *dst,
92 struct rds_iw_device **rds_iwdev,
93 struct rdma_cm_id **cm_id)
92{ 94{
93 struct rds_iw_device *iwdev; 95 struct rds_iw_device *iwdev;
94 struct rds_iw_cm_id *i_cm_id; 96 struct rds_iw_cm_id *i_cm_id;
@@ -112,15 +114,15 @@ static int rds_iw_get_device(struct rds_sock *rs, struct rds_iw_device **rds_iwd
112 src_addr->sin_port, 114 src_addr->sin_port,
113 dst_addr->sin_addr.s_addr, 115 dst_addr->sin_addr.s_addr,
114 dst_addr->sin_port, 116 dst_addr->sin_port,
115 rs->rs_bound_addr, 117 src->sin_addr.s_addr,
116 rs->rs_bound_port, 118 src->sin_port,
117 rs->rs_conn_addr, 119 dst->sin_addr.s_addr,
118 rs->rs_conn_port); 120 dst->sin_port);
119#ifdef WORKING_TUPLE_DETECTION 121#ifdef WORKING_TUPLE_DETECTION
120 if (src_addr->sin_addr.s_addr == rs->rs_bound_addr && 122 if (src_addr->sin_addr.s_addr == src->sin_addr.s_addr &&
121 src_addr->sin_port == rs->rs_bound_port && 123 src_addr->sin_port == src->sin_port &&
122 dst_addr->sin_addr.s_addr == rs->rs_conn_addr && 124 dst_addr->sin_addr.s_addr == dst->sin_addr.s_addr &&
123 dst_addr->sin_port == rs->rs_conn_port) { 125 dst_addr->sin_port == dst->sin_port) {
124#else 126#else
125 /* FIXME - needs to compare the local and remote 127 /* FIXME - needs to compare the local and remote
126 * ipaddr/port tuple, but the ipaddr is the only 128 * ipaddr/port tuple, but the ipaddr is the only
@@ -128,7 +130,7 @@ static int rds_iw_get_device(struct rds_sock *rs, struct rds_iw_device **rds_iwd
128 * zero'ed. It doesn't appear to be properly populated 130 * zero'ed. It doesn't appear to be properly populated
129 * during connection setup... 131 * during connection setup...
130 */ 132 */
131 if (src_addr->sin_addr.s_addr == rs->rs_bound_addr) { 133 if (src_addr->sin_addr.s_addr == src->sin_addr.s_addr) {
132#endif 134#endif
133 spin_unlock_irq(&iwdev->spinlock); 135 spin_unlock_irq(&iwdev->spinlock);
134 *rds_iwdev = iwdev; 136 *rds_iwdev = iwdev;
@@ -180,19 +182,13 @@ int rds_iw_update_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_i
180{ 182{
181 struct sockaddr_in *src_addr, *dst_addr; 183 struct sockaddr_in *src_addr, *dst_addr;
182 struct rds_iw_device *rds_iwdev_old; 184 struct rds_iw_device *rds_iwdev_old;
183 struct rds_sock rs;
184 struct rdma_cm_id *pcm_id; 185 struct rdma_cm_id *pcm_id;
185 int rc; 186 int rc;
186 187
187 src_addr = (struct sockaddr_in *)&cm_id->route.addr.src_addr; 188 src_addr = (struct sockaddr_in *)&cm_id->route.addr.src_addr;
188 dst_addr = (struct sockaddr_in *)&cm_id->route.addr.dst_addr; 189 dst_addr = (struct sockaddr_in *)&cm_id->route.addr.dst_addr;
189 190
190 rs.rs_bound_addr = src_addr->sin_addr.s_addr; 191 rc = rds_iw_get_device(src_addr, dst_addr, &rds_iwdev_old, &pcm_id);
191 rs.rs_bound_port = src_addr->sin_port;
192 rs.rs_conn_addr = dst_addr->sin_addr.s_addr;
193 rs.rs_conn_port = dst_addr->sin_port;
194
195 rc = rds_iw_get_device(&rs, &rds_iwdev_old, &pcm_id);
196 if (rc) 192 if (rc)
197 rds_iw_remove_cm_id(rds_iwdev, cm_id); 193 rds_iw_remove_cm_id(rds_iwdev, cm_id);
198 194
@@ -598,9 +594,17 @@ void *rds_iw_get_mr(struct scatterlist *sg, unsigned long nents,
598 struct rds_iw_device *rds_iwdev; 594 struct rds_iw_device *rds_iwdev;
599 struct rds_iw_mr *ibmr = NULL; 595 struct rds_iw_mr *ibmr = NULL;
600 struct rdma_cm_id *cm_id; 596 struct rdma_cm_id *cm_id;
597 struct sockaddr_in src = {
598 .sin_addr.s_addr = rs->rs_bound_addr,
599 .sin_port = rs->rs_bound_port,
600 };
601 struct sockaddr_in dst = {
602 .sin_addr.s_addr = rs->rs_conn_addr,
603 .sin_port = rs->rs_conn_port,
604 };
601 int ret; 605 int ret;
602 606
603 ret = rds_iw_get_device(rs, &rds_iwdev, &cm_id); 607 ret = rds_iw_get_device(&src, &dst, &rds_iwdev, &cm_id);
604 if (ret || !cm_id) { 608 if (ret || !cm_id) {
605 ret = -ENODEV; 609 ret = -ENODEV;
606 goto out; 610 goto out;
diff --git a/net/rxrpc/ar-error.c b/net/rxrpc/ar-error.c
index 5394b6be46ec..0610efa83d72 100644
--- a/net/rxrpc/ar-error.c
+++ b/net/rxrpc/ar-error.c
@@ -42,7 +42,8 @@ void rxrpc_UDP_error_report(struct sock *sk)
42 _leave("UDP socket errqueue empty"); 42 _leave("UDP socket errqueue empty");
43 return; 43 return;
44 } 44 }
45 if (!skb->len) { 45 serr = SKB_EXT_ERR(skb);
46 if (!skb->len && serr->ee.ee_origin == SO_EE_ORIGIN_TIMESTAMPING) {
46 _leave("UDP empty message"); 47 _leave("UDP empty message");
47 kfree_skb(skb); 48 kfree_skb(skb);
48 return; 49 return;
@@ -50,7 +51,6 @@ void rxrpc_UDP_error_report(struct sock *sk)
50 51
51 rxrpc_new_skb(skb); 52 rxrpc_new_skb(skb);
52 53
53 serr = SKB_EXT_ERR(skb);
54 addr = *(__be32 *)(skb_network_header(skb) + serr->addr_offset); 54 addr = *(__be32 *)(skb_network_header(skb) + serr->addr_offset);
55 port = serr->port; 55 port = serr->port;
56 56
diff --git a/net/rxrpc/ar-recvmsg.c b/net/rxrpc/ar-recvmsg.c
index 4575485ad1b4..19a560626dc4 100644
--- a/net/rxrpc/ar-recvmsg.c
+++ b/net/rxrpc/ar-recvmsg.c
@@ -87,7 +87,7 @@ int rxrpc_recvmsg(struct kiocb *iocb, struct socket *sock,
87 if (!skb) { 87 if (!skb) {
88 /* nothing remains on the queue */ 88 /* nothing remains on the queue */
89 if (copied && 89 if (copied &&
90 (msg->msg_flags & MSG_PEEK || timeo == 0)) 90 (flags & MSG_PEEK || timeo == 0))
91 goto out; 91 goto out;
92 92
93 /* wait for a message to turn up */ 93 /* wait for a message to turn up */
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index 82c5d7fc1988..5f6288fa3f12 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -25,21 +25,41 @@ static int tcf_bpf(struct sk_buff *skb, const struct tc_action *a,
25 struct tcf_result *res) 25 struct tcf_result *res)
26{ 26{
27 struct tcf_bpf *b = a->priv; 27 struct tcf_bpf *b = a->priv;
28 int action; 28 int action, filter_res;
29 int filter_res;
30 29
31 spin_lock(&b->tcf_lock); 30 spin_lock(&b->tcf_lock);
31
32 b->tcf_tm.lastuse = jiffies; 32 b->tcf_tm.lastuse = jiffies;
33 bstats_update(&b->tcf_bstats, skb); 33 bstats_update(&b->tcf_bstats, skb);
34 action = b->tcf_action;
35 34
36 filter_res = BPF_PROG_RUN(b->filter, skb); 35 filter_res = BPF_PROG_RUN(b->filter, skb);
37 if (filter_res == 0) { 36
38 /* Return code 0 from the BPF program 37 /* A BPF program may overwrite the default action opcode.
 39 * is being interpreted as a drop here. 38 * As in cls_bpf, if filter_res == -1 we use the
40 */ 39 * default action specified from tc.
41 action = TC_ACT_SHOT; 40 *
41 * In case a different well-known TC_ACT opcode has been
42 * returned, it will overwrite the default one.
43 *
 44 * For everything else that is unknown, TC_ACT_UNSPEC is
45 * returned.
46 */
47 switch (filter_res) {
48 case TC_ACT_PIPE:
49 case TC_ACT_RECLASSIFY:
50 case TC_ACT_OK:
51 action = filter_res;
52 break;
53 case TC_ACT_SHOT:
54 action = filter_res;
42 b->tcf_qstats.drops++; 55 b->tcf_qstats.drops++;
56 break;
57 case TC_ACT_UNSPEC:
58 action = b->tcf_action;
59 break;
60 default:
61 action = TC_ACT_UNSPEC;
62 break;
43 } 63 }
44 64
45 spin_unlock(&b->tcf_lock); 65 spin_unlock(&b->tcf_lock);
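With this change the program's return value is read as a tc verdict: known TC_ACT_* opcodes are honored, TC_ACT_UNSPEC (-1) falls back to the action's configured default, and anything unrecognized is normalized to TC_ACT_UNSPEC. A hypothetical eBPF-style action body (loader and section details omitted, API availability assumed):

#include <linux/bpf.h>
#include <linux/pkt_cls.h>

int bpf_prog(struct __sk_buff *skb)
{
	if (skb->len & 1)
		return TC_ACT_SHOT;   /* drop; tcf_bpf() bumps qstats.drops */
	return TC_ACT_UNSPEC;         /* -1: use the default action from tc */
}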
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index 09487afbfd51..95fdf4e40051 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -78,8 +78,11 @@ struct tc_u_hnode {
78 struct tc_u_common *tp_c; 78 struct tc_u_common *tp_c;
79 int refcnt; 79 int refcnt;
80 unsigned int divisor; 80 unsigned int divisor;
81 struct tc_u_knode __rcu *ht[1];
82 struct rcu_head rcu; 81 struct rcu_head rcu;
82 /* The 'ht' field MUST be the last field in structure to allow for
83 * more entries allocated at end of structure.
84 */
85 struct tc_u_knode __rcu *ht[1];
83}; 86};
84 87
85struct tc_u_common { 88struct tc_u_common {
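Keeping ht[] last lets the hash table and its header come from one allocation, kzalloc(sizeof(*ht) + divisor * sizeof(pointer)); with rcu (or any member) placed after it, slots beyond ht[0] would scribble over that member. A userspace demo of the trailing-array layout:

#include <stdio.h>
#include <stdlib.h>

struct hnode {
	unsigned int divisor;
	void *ht[1];   /* MUST be last: slots 1..divisor-1 follow the struct */
};

int main(void)
{
	unsigned int divisor = 8;
	size_t bytes = sizeof(struct hnode) + (divisor - 1) * sizeof(void *);
	struct hnode *h = calloc(1, bytes);

	if (!h)
		return 1;
	h->divisor = divisor;
	h->ht[divisor - 1] = h;   /* legal only because ht is the last member */
	printf("one block of %zu bytes holds header + %u slots\n", bytes, divisor);
	free(h);
	return 0;
}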
diff --git a/net/socket.c b/net/socket.c
index bbedbfcb42c2..245330ca0015 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -1702,6 +1702,8 @@ SYSCALL_DEFINE6(sendto, int, fd, void __user *, buff, size_t, len,
1702 1702
1703 if (len > INT_MAX) 1703 if (len > INT_MAX)
1704 len = INT_MAX; 1704 len = INT_MAX;
1705 if (unlikely(!access_ok(VERIFY_READ, buff, len)))
1706 return -EFAULT;
1705 sock = sockfd_lookup_light(fd, &err, &fput_needed); 1707 sock = sockfd_lookup_light(fd, &err, &fput_needed);
1706 if (!sock) 1708 if (!sock)
1707 goto out; 1709 goto out;
@@ -1760,6 +1762,8 @@ SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, ubuf, size_t, size,
1760 1762
1761 if (size > INT_MAX) 1763 if (size > INT_MAX)
1762 size = INT_MAX; 1764 size = INT_MAX;
1765 if (unlikely(!access_ok(VERIFY_WRITE, ubuf, size)))
1766 return -EFAULT;
1763 sock = sockfd_lookup_light(fd, &err, &fput_needed); 1767 sock = sockfd_lookup_light(fd, &err, &fput_needed);
1764 if (!sock) 1768 if (!sock)
1765 goto out; 1769 goto out;
diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile
index da5136fd5694..579f72bbcf4b 100644
--- a/net/sunrpc/xprtrdma/Makefile
+++ b/net/sunrpc/xprtrdma/Makefile
@@ -1,6 +1,7 @@
1obj-$(CONFIG_SUNRPC_XPRT_RDMA_CLIENT) += xprtrdma.o 1obj-$(CONFIG_SUNRPC_XPRT_RDMA_CLIENT) += xprtrdma.o
2 2
3xprtrdma-y := transport.o rpc_rdma.o verbs.o 3xprtrdma-y := transport.o rpc_rdma.o verbs.o \
4 fmr_ops.o frwr_ops.o physical_ops.o
4 5
5obj-$(CONFIG_SUNRPC_XPRT_RDMA_SERVER) += svcrdma.o 6obj-$(CONFIG_SUNRPC_XPRT_RDMA_SERVER) += svcrdma.o
6 7
diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
new file mode 100644
index 000000000000..a91ba2c8ef1e
--- /dev/null
+++ b/net/sunrpc/xprtrdma/fmr_ops.c
@@ -0,0 +1,208 @@
1/*
2 * Copyright (c) 2015 Oracle. All rights reserved.
3 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
4 */
5
6/* Lightweight memory registration using Fast Memory Regions (FMR).
7 * Referred to sometimes as MTHCAFMR mode.
8 *
9 * FMR uses synchronous memory registration and deregistration.
10 * FMR registration is known to be fast, but FMR deregistration
11 * can take tens of usecs to complete.
12 */
13
14#include "xprt_rdma.h"
15
16#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
17# define RPCDBG_FACILITY RPCDBG_TRANS
18#endif
19
20/* Maximum scatter/gather per FMR */
21#define RPCRDMA_MAX_FMR_SGES (64)
22
23static int
24fmr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
25 struct rpcrdma_create_data_internal *cdata)
26{
27 return 0;
28}
29
30/* FMR mode conveys up to 64 pages of payload per chunk segment.
31 */
32static size_t
33fmr_op_maxpages(struct rpcrdma_xprt *r_xprt)
34{
35 return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
36 rpcrdma_max_segments(r_xprt) * RPCRDMA_MAX_FMR_SGES);
37}
38
39static int
40fmr_op_init(struct rpcrdma_xprt *r_xprt)
41{
42 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
43 int mr_access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ;
44 struct ib_fmr_attr fmr_attr = {
45 .max_pages = RPCRDMA_MAX_FMR_SGES,
46 .max_maps = 1,
47 .page_shift = PAGE_SHIFT
48 };
49 struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
50 struct rpcrdma_mw *r;
51 int i, rc;
52
53 INIT_LIST_HEAD(&buf->rb_mws);
54 INIT_LIST_HEAD(&buf->rb_all);
55
56 i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
 57 dprintk("RPC: %s: initializing %d FMRs\n", __func__, i);
58
59 while (i--) {
60 r = kzalloc(sizeof(*r), GFP_KERNEL);
61 if (!r)
62 return -ENOMEM;
63
64 r->r.fmr = ib_alloc_fmr(pd, mr_access_flags, &fmr_attr);
65 if (IS_ERR(r->r.fmr))
66 goto out_fmr_err;
67
68 list_add(&r->mw_list, &buf->rb_mws);
69 list_add(&r->mw_all, &buf->rb_all);
70 }
71 return 0;
72
73out_fmr_err:
74 rc = PTR_ERR(r->r.fmr);
75 dprintk("RPC: %s: ib_alloc_fmr status %i\n", __func__, rc);
76 kfree(r);
77 return rc;
78}
79
80/* Use the ib_map_phys_fmr() verb to register a memory region
81 * for remote access via RDMA READ or RDMA WRITE.
82 */
83static int
84fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
85 int nsegs, bool writing)
86{
87 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
88 struct ib_device *device = ia->ri_id->device;
89 enum dma_data_direction direction = rpcrdma_data_dir(writing);
90 struct rpcrdma_mr_seg *seg1 = seg;
91 struct rpcrdma_mw *mw = seg1->rl_mw;
92 u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
93 int len, pageoff, i, rc;
94
95 pageoff = offset_in_page(seg1->mr_offset);
96 seg1->mr_offset -= pageoff; /* start of page */
97 seg1->mr_len += pageoff;
98 len = -pageoff;
99 if (nsegs > RPCRDMA_MAX_FMR_SGES)
100 nsegs = RPCRDMA_MAX_FMR_SGES;
101 for (i = 0; i < nsegs;) {
102 rpcrdma_map_one(device, seg, direction);
103 physaddrs[i] = seg->mr_dma;
104 len += seg->mr_len;
105 ++seg;
106 ++i;
107 /* Check for holes */
108 if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
109 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
110 break;
111 }
112
113 rc = ib_map_phys_fmr(mw->r.fmr, physaddrs, i, seg1->mr_dma);
114 if (rc)
115 goto out_maperr;
116
117 seg1->mr_rkey = mw->r.fmr->rkey;
118 seg1->mr_base = seg1->mr_dma + pageoff;
119 seg1->mr_nsegs = i;
120 seg1->mr_len = len;
121 return i;
122
123out_maperr:
124 dprintk("RPC: %s: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n",
125 __func__, len, (unsigned long long)seg1->mr_dma,
126 pageoff, i, rc);
127 while (i--)
128 rpcrdma_unmap_one(device, --seg);
129 return rc;
130}
131
132/* Use the ib_unmap_fmr() verb to prevent further remote
133 * access via RDMA READ or RDMA WRITE.
134 */
135static int
136fmr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
137{
138 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
139 struct rpcrdma_mr_seg *seg1 = seg;
140 struct ib_device *device;
141 int rc, nsegs = seg->mr_nsegs;
142 LIST_HEAD(l);
143
144 list_add(&seg1->rl_mw->r.fmr->list, &l);
145 rc = ib_unmap_fmr(&l);
146 read_lock(&ia->ri_qplock);
147 device = ia->ri_id->device;
148 while (seg1->mr_nsegs--)
149 rpcrdma_unmap_one(device, seg++);
150 read_unlock(&ia->ri_qplock);
151 if (rc)
152 goto out_err;
153 return nsegs;
154
155out_err:
156 dprintk("RPC: %s: ib_unmap_fmr status %i\n", __func__, rc);
157 return nsegs;
158}
159
160/* After a disconnect, unmap all FMRs.
161 *
162 * This is invoked only in the transport connect worker in order
163 * to serialize with rpcrdma_register_fmr_external().
164 */
165static void
166fmr_op_reset(struct rpcrdma_xprt *r_xprt)
167{
168 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
169 struct rpcrdma_mw *r;
170 LIST_HEAD(list);
171 int rc;
172
173 list_for_each_entry(r, &buf->rb_all, mw_all)
174 list_add(&r->r.fmr->list, &list);
175
176 rc = ib_unmap_fmr(&list);
177 if (rc)
178 dprintk("RPC: %s: ib_unmap_fmr failed %i\n",
179 __func__, rc);
180}
181
182static void
183fmr_op_destroy(struct rpcrdma_buffer *buf)
184{
185 struct rpcrdma_mw *r;
186 int rc;
187
188 while (!list_empty(&buf->rb_all)) {
189 r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
190 list_del(&r->mw_all);
191 rc = ib_dealloc_fmr(r->r.fmr);
192 if (rc)
193 dprintk("RPC: %s: ib_dealloc_fmr failed %i\n",
194 __func__, rc);
195 kfree(r);
196 }
197}
198
199const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = {
200 .ro_map = fmr_op_map,
201 .ro_unmap = fmr_op_unmap,
202 .ro_open = fmr_op_open,
203 .ro_maxpages = fmr_op_maxpages,
204 .ro_init = fmr_op_init,
205 .ro_reset = fmr_op_reset,
206 .ro_destroy = fmr_op_destroy,
207 .ro_displayname = "fmr",
208};
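This series replaces switch statements on ri_memreg_strategy with a const ops structure selected once when the transport is set up; every caller then dispatches through ia->ri_ops. A simplified sketch of the pattern (structure pared down from xprt_rdma.h, names abbreviated):

#include <stdio.h>

struct memreg_ops {
	int         (*ro_map)(int nsegs);
	const char  *ro_displayname;
};

static int fmr_map(int nsegs)
{
	return nsegs > 64 ? 64 : nsegs;   /* cap at RPCRDMA_MAX_FMR_SGES */
}

static const struct memreg_ops fmr_ops = {
	.ro_map         = fmr_map,
	.ro_displayname = "fmr",
};

int main(void)
{
	const struct memreg_ops *ops = &fmr_ops;   /* chosen at setup time */

	printf("%s mapped %d segments\n", ops->ro_displayname, ops->ro_map(100));
	return 0;
}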
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
new file mode 100644
index 000000000000..0a7b9df70133
--- /dev/null
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -0,0 +1,353 @@
1/*
2 * Copyright (c) 2015 Oracle. All rights reserved.
3 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
4 */
5
6/* Lightweight memory registration using Fast Registration Work
7 * Requests (FRWR). Also referred to sometimes as FRMR mode.
8 *
9 * FRWR features ordered asynchronous registration and deregistration
10 * of arbitrarily sized memory regions. This is the fastest and safest
11 * but most complex memory registration mode.
12 */
13
14#include "xprt_rdma.h"
15
16#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
17# define RPCDBG_FACILITY RPCDBG_TRANS
18#endif
19
20static int
21__frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, struct ib_device *device,
22 unsigned int depth)
23{
24 struct rpcrdma_frmr *f = &r->r.frmr;
25 int rc;
26
27 f->fr_mr = ib_alloc_fast_reg_mr(pd, depth);
28 if (IS_ERR(f->fr_mr))
29 goto out_mr_err;
30 f->fr_pgl = ib_alloc_fast_reg_page_list(device, depth);
31 if (IS_ERR(f->fr_pgl))
32 goto out_list_err;
33 return 0;
34
35out_mr_err:
36 rc = PTR_ERR(f->fr_mr);
37 dprintk("RPC: %s: ib_alloc_fast_reg_mr status %i\n",
38 __func__, rc);
39 return rc;
40
41out_list_err:
42 rc = PTR_ERR(f->fr_pgl);
43 dprintk("RPC: %s: ib_alloc_fast_reg_page_list status %i\n",
44 __func__, rc);
45 ib_dereg_mr(f->fr_mr);
46 return rc;
47}
48
49static void
50__frwr_release(struct rpcrdma_mw *r)
51{
52 int rc;
53
54 rc = ib_dereg_mr(r->r.frmr.fr_mr);
55 if (rc)
56 dprintk("RPC: %s: ib_dereg_mr status %i\n",
57 __func__, rc);
58 ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
59}
60
61static int
62frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
63 struct rpcrdma_create_data_internal *cdata)
64{
65 struct ib_device_attr *devattr = &ia->ri_devattr;
66 int depth, delta;
67
68 ia->ri_max_frmr_depth =
69 min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
70 devattr->max_fast_reg_page_list_len);
71 dprintk("RPC: %s: device's max FR page list len = %u\n",
72 __func__, ia->ri_max_frmr_depth);
73
74 /* Add room for frmr register and invalidate WRs.
75 * 1. FRMR reg WR for head
76 * 2. FRMR invalidate WR for head
77 * 3. N FRMR reg WRs for pagelist
78 * 4. N FRMR invalidate WRs for pagelist
79 * 5. FRMR reg WR for tail
80 * 6. FRMR invalidate WR for tail
81 * 7. The RDMA_SEND WR
82 */
83 depth = 7;
84
85 /* Calculate N if the device max FRMR depth is smaller than
86 * RPCRDMA_MAX_DATA_SEGS.
87 */
88 if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) {
89 delta = RPCRDMA_MAX_DATA_SEGS - ia->ri_max_frmr_depth;
90 do {
91 depth += 2; /* FRMR reg + invalidate */
92 delta -= ia->ri_max_frmr_depth;
93 } while (delta > 0);
94 }
95
96 ep->rep_attr.cap.max_send_wr *= depth;
97 if (ep->rep_attr.cap.max_send_wr > devattr->max_qp_wr) {
98 cdata->max_requests = devattr->max_qp_wr / depth;
99 if (!cdata->max_requests)
100 return -EINVAL;
101 ep->rep_attr.cap.max_send_wr = cdata->max_requests *
102 depth;
103 }
104
105 return 0;
106}
107
108/* FRWR mode conveys a list of pages per chunk segment. The
109 * maximum length of that list is the FRWR page list depth.
110 */
111static size_t
112frwr_op_maxpages(struct rpcrdma_xprt *r_xprt)
113{
114 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
115
116 return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
117 rpcrdma_max_segments(r_xprt) * ia->ri_max_frmr_depth);
118}
119
120/* If FAST_REG or LOCAL_INV failed, indicate the frmr needs to be reset. */
121static void
122frwr_sendcompletion(struct ib_wc *wc)
123{
124 struct rpcrdma_mw *r;
125
126 if (likely(wc->status == IB_WC_SUCCESS))
127 return;
128
129 /* WARNING: Only wr_id and status are reliable at this point */
130 r = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
131 dprintk("RPC: %s: frmr %p (stale), status %d\n",
132 __func__, r, wc->status);
133 r->r.frmr.fr_state = FRMR_IS_STALE;
134}
135
136static int
137frwr_op_init(struct rpcrdma_xprt *r_xprt)
138{
139 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
140 struct ib_device *device = r_xprt->rx_ia.ri_id->device;
141 unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth;
142 struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
143 int i;
144
145 INIT_LIST_HEAD(&buf->rb_mws);
146 INIT_LIST_HEAD(&buf->rb_all);
147
148 i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
 149 dprintk("RPC: %s: initializing %d FRMRs\n", __func__, i);
150
151 while (i--) {
152 struct rpcrdma_mw *r;
153 int rc;
154
155 r = kzalloc(sizeof(*r), GFP_KERNEL);
156 if (!r)
157 return -ENOMEM;
158
159 rc = __frwr_init(r, pd, device, depth);
160 if (rc) {
161 kfree(r);
162 return rc;
163 }
164
165 list_add(&r->mw_list, &buf->rb_mws);
166 list_add(&r->mw_all, &buf->rb_all);
167 r->mw_sendcompletion = frwr_sendcompletion;
168 }
169
170 return 0;
171}
172
173/* Post a FAST_REG Work Request to register a memory region
174 * for remote access via RDMA READ or RDMA WRITE.
175 */
176static int
177frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
178 int nsegs, bool writing)
179{
180 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
181 struct ib_device *device = ia->ri_id->device;
182 enum dma_data_direction direction = rpcrdma_data_dir(writing);
183 struct rpcrdma_mr_seg *seg1 = seg;
184 struct rpcrdma_mw *mw = seg1->rl_mw;
185 struct rpcrdma_frmr *frmr = &mw->r.frmr;
186 struct ib_mr *mr = frmr->fr_mr;
187 struct ib_send_wr fastreg_wr, *bad_wr;
188 u8 key;
189 int len, pageoff;
190 int i, rc;
191 int seg_len;
192 u64 pa;
193 int page_no;
194
195 pageoff = offset_in_page(seg1->mr_offset);
196 seg1->mr_offset -= pageoff; /* start of page */
197 seg1->mr_len += pageoff;
198 len = -pageoff;
199 if (nsegs > ia->ri_max_frmr_depth)
200 nsegs = ia->ri_max_frmr_depth;
201 for (page_no = i = 0; i < nsegs;) {
202 rpcrdma_map_one(device, seg, direction);
203 pa = seg->mr_dma;
204 for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) {
205 frmr->fr_pgl->page_list[page_no++] = pa;
206 pa += PAGE_SIZE;
207 }
208 len += seg->mr_len;
209 ++seg;
210 ++i;
211 /* Check for holes */
212 if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
213 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
214 break;
215 }
216 dprintk("RPC: %s: Using frmr %p to map %d segments (%d bytes)\n",
217 __func__, mw, i, len);
218
219 frmr->fr_state = FRMR_IS_VALID;
220
221 memset(&fastreg_wr, 0, sizeof(fastreg_wr));
222 fastreg_wr.wr_id = (unsigned long)(void *)mw;
223 fastreg_wr.opcode = IB_WR_FAST_REG_MR;
224 fastreg_wr.wr.fast_reg.iova_start = seg1->mr_dma + pageoff;
225 fastreg_wr.wr.fast_reg.page_list = frmr->fr_pgl;
226 fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
227 fastreg_wr.wr.fast_reg.page_list_len = page_no;
228 fastreg_wr.wr.fast_reg.length = len;
229 fastreg_wr.wr.fast_reg.access_flags = writing ?
230 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
231 IB_ACCESS_REMOTE_READ;
232 key = (u8)(mr->rkey & 0x000000FF);
233 ib_update_fast_reg_key(mr, ++key);
234 fastreg_wr.wr.fast_reg.rkey = mr->rkey;
235
236 DECR_CQCOUNT(&r_xprt->rx_ep);
237 rc = ib_post_send(ia->ri_id->qp, &fastreg_wr, &bad_wr);
238 if (rc)
239 goto out_senderr;
240
241 seg1->mr_rkey = mr->rkey;
242 seg1->mr_base = seg1->mr_dma + pageoff;
243 seg1->mr_nsegs = i;
244 seg1->mr_len = len;
245 return i;
246
247out_senderr:
248 dprintk("RPC: %s: ib_post_send status %i\n", __func__, rc);
249 ib_update_fast_reg_key(mr, --key);
250 frmr->fr_state = FRMR_IS_INVALID;
251 while (i--)
252 rpcrdma_unmap_one(device, --seg);
253 return rc;
254}
255
256/* Post a LOCAL_INV Work Request to prevent further remote access
257 * via RDMA READ or RDMA WRITE.
258 */
259static int
260frwr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
261{
262 struct rpcrdma_mr_seg *seg1 = seg;
263 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
264 struct ib_send_wr invalidate_wr, *bad_wr;
265 int rc, nsegs = seg->mr_nsegs;
266 struct ib_device *device;
267
268 seg1->rl_mw->r.frmr.fr_state = FRMR_IS_INVALID;
269
270 memset(&invalidate_wr, 0, sizeof(invalidate_wr));
271 invalidate_wr.wr_id = (unsigned long)(void *)seg1->rl_mw;
272 invalidate_wr.opcode = IB_WR_LOCAL_INV;
273 invalidate_wr.ex.invalidate_rkey = seg1->rl_mw->r.frmr.fr_mr->rkey;
274 DECR_CQCOUNT(&r_xprt->rx_ep);
275
276 read_lock(&ia->ri_qplock);
277 device = ia->ri_id->device;
278 while (seg1->mr_nsegs--)
279 rpcrdma_unmap_one(device, seg++);
280 rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
281 read_unlock(&ia->ri_qplock);
282 if (rc)
283 goto out_err;
284 return nsegs;
285
286out_err:
287 /* Force rpcrdma_buffer_get() to retry */
288 seg1->rl_mw->r.frmr.fr_state = FRMR_IS_STALE;
289 dprintk("RPC: %s: ib_post_send status %i\n", __func__, rc);
290 return nsegs;
291}
292
293/* After a disconnect, a flushed FAST_REG_MR can leave an FRMR in
294 * an unusable state. Find FRMRs in this state and dereg / reg
295 * each. FRMRs that are VALID and attached to an rpcrdma_req are
296 * also torn down.
297 *
298 * This gives all in-use FRMRs a fresh rkey and leaves them INVALID.
299 *
300 * This is invoked only in the transport connect worker in order
301 * to serialize with rpcrdma_register_frmr_external().
302 */
303static void
304frwr_op_reset(struct rpcrdma_xprt *r_xprt)
305{
306 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
307 struct ib_device *device = r_xprt->rx_ia.ri_id->device;
308 unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth;
309 struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
310 struct rpcrdma_mw *r;
311 int rc;
312
313 list_for_each_entry(r, &buf->rb_all, mw_all) {
314 if (r->r.frmr.fr_state == FRMR_IS_INVALID)
315 continue;
316
317 __frwr_release(r);
318 rc = __frwr_init(r, pd, device, depth);
319 if (rc) {
320 dprintk("RPC: %s: mw %p left %s\n",
321 __func__, r,
322 (r->r.frmr.fr_state == FRMR_IS_STALE ?
323 "stale" : "valid"));
324 continue;
325 }
326
327 r->r.frmr.fr_state = FRMR_IS_INVALID;
328 }
329}
330
331static void
332frwr_op_destroy(struct rpcrdma_buffer *buf)
333{
334 struct rpcrdma_mw *r;
335
336 while (!list_empty(&buf->rb_all)) {
337 r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
338 list_del(&r->mw_all);
339 __frwr_release(r);
340 kfree(r);
341 }
342}
343
344const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = {
345 .ro_map = frwr_op_map,
346 .ro_unmap = frwr_op_unmap,
347 .ro_open = frwr_op_open,
348 .ro_maxpages = frwr_op_maxpages,
349 .ro_init = frwr_op_init,
350 .ro_reset = frwr_op_reset,
351 .ro_destroy = frwr_op_destroy,
352 .ro_displayname = "frwr",
353};
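A worked example of the send-queue sizing in frwr_op_open(), using assumed device limits (not taken from the patch): with RPCRDMA_MAX_DATA_SEGS = 64 and max_fast_reg_page_list_len = 30, each RPC may need up to 11 work requests.

#include <stdio.h>

int main(void)
{
	int max_data_segs = 64, max_frmr_depth = 30;   /* assumed values */
	int depth = 7;   /* head/tail reg + invalidate pairs, plus the SEND */
	int delta = max_data_segs - max_frmr_depth;

	while (delta > 0) {   /* one extra reg + invalidate pair per chunk */
		depth += 2;
		delta -= max_frmr_depth;
	}
	printf("worst-case WRs per RPC: %d\n", depth);   /* prints 11 */
	return 0;
}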
diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c
new file mode 100644
index 000000000000..ba518af16787
--- /dev/null
+++ b/net/sunrpc/xprtrdma/physical_ops.c
@@ -0,0 +1,94 @@
1/*
2 * Copyright (c) 2015 Oracle. All rights reserved.
3 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
4 */
5
6/* No-op chunk preparation. All client memory is pre-registered.
7 * Sometimes referred to as ALLPHYSICAL mode.
8 *
9 * Physical registration is simple because all client memory is
10 * pre-registered and never deregistered. This mode is good for
11 * adapter bring up, but is considered not safe: the server is
12 * trusted not to abuse its access to client memory not involved
13 * in RDMA I/O.
14 */
15
16#include "xprt_rdma.h"
17
18#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
19# define RPCDBG_FACILITY RPCDBG_TRANS
20#endif
21
22static int
23physical_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
24 struct rpcrdma_create_data_internal *cdata)
25{
26 return 0;
27}
28
29/* PHYSICAL memory registration conveys one page per chunk segment.
30 */
31static size_t
32physical_op_maxpages(struct rpcrdma_xprt *r_xprt)
33{
34 return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
35 rpcrdma_max_segments(r_xprt));
36}
37
38static int
39physical_op_init(struct rpcrdma_xprt *r_xprt)
40{
41 return 0;
42}
43
44/* The client's physical memory is already exposed for
45 * remote access via RDMA READ or RDMA WRITE.
46 */
47static int
48physical_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
49 int nsegs, bool writing)
50{
51 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
52
53 rpcrdma_map_one(ia->ri_id->device, seg,
54 rpcrdma_data_dir(writing));
55 seg->mr_rkey = ia->ri_bind_mem->rkey;
56 seg->mr_base = seg->mr_dma;
57 seg->mr_nsegs = 1;
58 return 1;
59}
60
61/* Unmap a memory region, but leave it registered.
62 */
63static int
64physical_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
65{
66 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
67
68 read_lock(&ia->ri_qplock);
69 rpcrdma_unmap_one(ia->ri_id->device, seg);
70 read_unlock(&ia->ri_qplock);
71
72 return 1;
73}
74
75static void
76physical_op_reset(struct rpcrdma_xprt *r_xprt)
77{
78}
79
80static void
81physical_op_destroy(struct rpcrdma_buffer *buf)
82{
83}
84
85const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = {
86 .ro_map = physical_op_map,
87 .ro_unmap = physical_op_unmap,
88 .ro_open = physical_op_open,
89 .ro_maxpages = physical_op_maxpages,
90 .ro_init = physical_op_init,
91 .ro_reset = physical_op_reset,
92 .ro_destroy = physical_op_destroy,
93 .ro_displayname = "physical",
94};
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 91ffde82fa0c..2c53ea9e1b83 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -53,6 +53,14 @@
53# define RPCDBG_FACILITY RPCDBG_TRANS 53# define RPCDBG_FACILITY RPCDBG_TRANS
54#endif 54#endif
55 55
56enum rpcrdma_chunktype {
57 rpcrdma_noch = 0,
58 rpcrdma_readch,
59 rpcrdma_areadch,
60 rpcrdma_writech,
61 rpcrdma_replych
62};
63
56#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) 64#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
57static const char transfertypes[][12] = { 65static const char transfertypes[][12] = {
58 "pure inline", /* no chunks */ 66 "pure inline", /* no chunks */
@@ -179,6 +187,7 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
179 struct rpcrdma_write_array *warray = NULL; 187 struct rpcrdma_write_array *warray = NULL;
180 struct rpcrdma_write_chunk *cur_wchunk = NULL; 188 struct rpcrdma_write_chunk *cur_wchunk = NULL;
181 __be32 *iptr = headerp->rm_body.rm_chunks; 189 __be32 *iptr = headerp->rm_body.rm_chunks;
190 int (*map)(struct rpcrdma_xprt *, struct rpcrdma_mr_seg *, int, bool);
182 191
183 if (type == rpcrdma_readch || type == rpcrdma_areadch) { 192 if (type == rpcrdma_readch || type == rpcrdma_areadch) {
184 /* a read chunk - server will RDMA Read our memory */ 193 /* a read chunk - server will RDMA Read our memory */
@@ -201,9 +210,9 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
201 if (nsegs < 0) 210 if (nsegs < 0)
202 return nsegs; 211 return nsegs;
203 212
213 map = r_xprt->rx_ia.ri_ops->ro_map;
204 do { 214 do {
205 n = rpcrdma_register_external(seg, nsegs, 215 n = map(r_xprt, seg, nsegs, cur_wchunk != NULL);
206 cur_wchunk != NULL, r_xprt);
207 if (n <= 0) 216 if (n <= 0)
208 goto out; 217 goto out;
209 if (cur_rchunk) { /* read */ 218 if (cur_rchunk) { /* read */
@@ -275,34 +284,13 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
275 return (unsigned char *)iptr - (unsigned char *)headerp; 284 return (unsigned char *)iptr - (unsigned char *)headerp;
276 285
277out: 286out:
278 if (r_xprt->rx_ia.ri_memreg_strategy != RPCRDMA_FRMR) { 287 if (r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_FRMR)
279 for (pos = 0; nchunks--;) 288 return n;
280 pos += rpcrdma_deregister_external(
281 &req->rl_segments[pos], r_xprt);
282 }
283 return n;
284}
285 289
286/* 290 for (pos = 0; nchunks--;)
287 * Marshal chunks. This routine returns the header length 291 pos += r_xprt->rx_ia.ri_ops->ro_unmap(r_xprt,
288 * consumed by marshaling. 292 &req->rl_segments[pos]);
289 * 293 return n;
290 * Returns positive RPC/RDMA header size, or negative errno.
291 */
292
293ssize_t
294rpcrdma_marshal_chunks(struct rpc_rqst *rqst, ssize_t result)
295{
296 struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
297 struct rpcrdma_msg *headerp = rdmab_to_msg(req->rl_rdmabuf);
298
299 if (req->rl_rtype != rpcrdma_noch)
300 result = rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf,
301 headerp, req->rl_rtype);
302 else if (req->rl_wtype != rpcrdma_noch)
303 result = rpcrdma_create_chunks(rqst, &rqst->rq_rcv_buf,
304 headerp, req->rl_wtype);
305 return result;
306} 294}
307 295
308/* 296/*
@@ -397,6 +385,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
397 char *base; 385 char *base;
398 size_t rpclen, padlen; 386 size_t rpclen, padlen;
399 ssize_t hdrlen; 387 ssize_t hdrlen;
388 enum rpcrdma_chunktype rtype, wtype;
400 struct rpcrdma_msg *headerp; 389 struct rpcrdma_msg *headerp;
401 390
402 /* 391 /*
@@ -433,13 +422,13 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
433 * into pages; otherwise use reply chunks. 422 * into pages; otherwise use reply chunks.
434 */ 423 */
435 if (rqst->rq_rcv_buf.buflen <= RPCRDMA_INLINE_READ_THRESHOLD(rqst)) 424 if (rqst->rq_rcv_buf.buflen <= RPCRDMA_INLINE_READ_THRESHOLD(rqst))
436 req->rl_wtype = rpcrdma_noch; 425 wtype = rpcrdma_noch;
437 else if (rqst->rq_rcv_buf.page_len == 0) 426 else if (rqst->rq_rcv_buf.page_len == 0)
438 req->rl_wtype = rpcrdma_replych; 427 wtype = rpcrdma_replych;
439 else if (rqst->rq_rcv_buf.flags & XDRBUF_READ) 428 else if (rqst->rq_rcv_buf.flags & XDRBUF_READ)
440 req->rl_wtype = rpcrdma_writech; 429 wtype = rpcrdma_writech;
441 else 430 else
442 req->rl_wtype = rpcrdma_replych; 431 wtype = rpcrdma_replych;
443 432
444 /* 433 /*
445 * Chunks needed for arguments? 434 * Chunks needed for arguments?
@@ -456,16 +445,16 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
456 * TBD check NFSv4 setacl 445 * TBD check NFSv4 setacl
457 */ 446 */
458 if (rqst->rq_snd_buf.len <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst)) 447 if (rqst->rq_snd_buf.len <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst))
459 req->rl_rtype = rpcrdma_noch; 448 rtype = rpcrdma_noch;
460 else if (rqst->rq_snd_buf.page_len == 0) 449 else if (rqst->rq_snd_buf.page_len == 0)
461 req->rl_rtype = rpcrdma_areadch; 450 rtype = rpcrdma_areadch;
462 else 451 else
463 req->rl_rtype = rpcrdma_readch; 452 rtype = rpcrdma_readch;
464 453
465 /* The following simplification is not true forever */ 454 /* The following simplification is not true forever */
466 if (req->rl_rtype != rpcrdma_noch && req->rl_wtype == rpcrdma_replych) 455 if (rtype != rpcrdma_noch && wtype == rpcrdma_replych)
467 req->rl_wtype = rpcrdma_noch; 456 wtype = rpcrdma_noch;
468 if (req->rl_rtype != rpcrdma_noch && req->rl_wtype != rpcrdma_noch) { 457 if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) {
469 dprintk("RPC: %s: cannot marshal multiple chunk lists\n", 458 dprintk("RPC: %s: cannot marshal multiple chunk lists\n",
470 __func__); 459 __func__);
471 return -EIO; 460 return -EIO;
@@ -479,7 +468,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
479 * When padding is in use and applies to the transfer, insert 468 * When padding is in use and applies to the transfer, insert
480 * it and change the message type. 469 * it and change the message type.
481 */ 470 */
482 if (req->rl_rtype == rpcrdma_noch) { 471 if (rtype == rpcrdma_noch) {
483 472
484 padlen = rpcrdma_inline_pullup(rqst, 473 padlen = rpcrdma_inline_pullup(rqst,
485 RPCRDMA_INLINE_PAD_VALUE(rqst)); 474 RPCRDMA_INLINE_PAD_VALUE(rqst));
@@ -494,7 +483,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
494 headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero; 483 headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero;
495 headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero; 484 headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero;
496 hdrlen += 2 * sizeof(u32); /* extra words in padhdr */ 485 hdrlen += 2 * sizeof(u32); /* extra words in padhdr */
497 if (req->rl_wtype != rpcrdma_noch) { 486 if (wtype != rpcrdma_noch) {
498 dprintk("RPC: %s: invalid chunk list\n", 487 dprintk("RPC: %s: invalid chunk list\n",
499 __func__); 488 __func__);
500 return -EIO; 489 return -EIO;
@@ -515,18 +504,26 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
515 * on receive. Therefore, we request a reply chunk 504 * on receive. Therefore, we request a reply chunk
516 * for non-writes wherever feasible and efficient. 505 * for non-writes wherever feasible and efficient.
517 */ 506 */
518 if (req->rl_wtype == rpcrdma_noch) 507 if (wtype == rpcrdma_noch)
519 req->rl_wtype = rpcrdma_replych; 508 wtype = rpcrdma_replych;
520 } 509 }
521 } 510 }
522 511
523 hdrlen = rpcrdma_marshal_chunks(rqst, hdrlen); 512 if (rtype != rpcrdma_noch) {
513 hdrlen = rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf,
514 headerp, rtype);
515 wtype = rtype; /* simplify dprintk */
516
517 } else if (wtype != rpcrdma_noch) {
518 hdrlen = rpcrdma_create_chunks(rqst, &rqst->rq_rcv_buf,
519 headerp, wtype);
520 }
524 if (hdrlen < 0) 521 if (hdrlen < 0)
525 return hdrlen; 522 return hdrlen;
526 523
527 dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd" 524 dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd"
528 " headerp 0x%p base 0x%p lkey 0x%x\n", 525 " headerp 0x%p base 0x%p lkey 0x%x\n",
529 __func__, transfertypes[req->rl_wtype], hdrlen, rpclen, padlen, 526 __func__, transfertypes[wtype], hdrlen, rpclen, padlen,
530 headerp, base, rdmab_lkey(req->rl_rdmabuf)); 527 headerp, base, rdmab_lkey(req->rl_rdmabuf));
531 528
532 /* 529 /*
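With the chunk types demoted to locals, the reply-side choice reduces to a small pure function. A hedged distillation of the wtype logic above, with thresholds and flags abstracted into parameters:

enum chunktype { noch, writech, replych };

static enum chunktype choose_wtype(unsigned int buflen, unsigned int page_len,
				   int xdrbuf_read, unsigned int inline_max)
{
	if (buflen <= inline_max)
		return noch;       /* whole reply fits inline */
	if (page_len == 0)
		return replych;    /* no page data: ask for a reply chunk */
	return xdrbuf_read ? writech : replych;
}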
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 2e192baa59f3..54f23b1be986 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -157,12 +157,47 @@ static struct ctl_table sunrpc_table[] = {
157static struct rpc_xprt_ops xprt_rdma_procs; /* forward reference */ 157static struct rpc_xprt_ops xprt_rdma_procs; /* forward reference */
158 158
159static void 159static void
160xprt_rdma_format_addresses4(struct rpc_xprt *xprt, struct sockaddr *sap)
161{
162 struct sockaddr_in *sin = (struct sockaddr_in *)sap;
163 char buf[20];
164
165 snprintf(buf, sizeof(buf), "%08x", ntohl(sin->sin_addr.s_addr));
166 xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = kstrdup(buf, GFP_KERNEL);
167
168 xprt->address_strings[RPC_DISPLAY_NETID] = RPCBIND_NETID_RDMA;
169}
170
171static void
172xprt_rdma_format_addresses6(struct rpc_xprt *xprt, struct sockaddr *sap)
173{
174 struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
175 char buf[40];
176
177 snprintf(buf, sizeof(buf), "%pi6", &sin6->sin6_addr);
178 xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = kstrdup(buf, GFP_KERNEL);
179
180 xprt->address_strings[RPC_DISPLAY_NETID] = RPCBIND_NETID_RDMA6;
181}
182
183static void
160xprt_rdma_format_addresses(struct rpc_xprt *xprt) 184xprt_rdma_format_addresses(struct rpc_xprt *xprt)
161{ 185{
162 struct sockaddr *sap = (struct sockaddr *) 186 struct sockaddr *sap = (struct sockaddr *)
163 &rpcx_to_rdmad(xprt).addr; 187 &rpcx_to_rdmad(xprt).addr;
164 struct sockaddr_in *sin = (struct sockaddr_in *)sap; 188 char buf[128];
165 char buf[64]; 189
190 switch (sap->sa_family) {
191 case AF_INET:
192 xprt_rdma_format_addresses4(xprt, sap);
193 break;
194 case AF_INET6:
195 xprt_rdma_format_addresses6(xprt, sap);
196 break;
197 default:
198 pr_err("rpcrdma: Unrecognized address family\n");
199 return;
200 }
166 201
167 (void)rpc_ntop(sap, buf, sizeof(buf)); 202 (void)rpc_ntop(sap, buf, sizeof(buf));
168 xprt->address_strings[RPC_DISPLAY_ADDR] = kstrdup(buf, GFP_KERNEL); 203 xprt->address_strings[RPC_DISPLAY_ADDR] = kstrdup(buf, GFP_KERNEL);
@@ -170,16 +205,10 @@ xprt_rdma_format_addresses(struct rpc_xprt *xprt)
170 snprintf(buf, sizeof(buf), "%u", rpc_get_port(sap)); 205 snprintf(buf, sizeof(buf), "%u", rpc_get_port(sap));
171 xprt->address_strings[RPC_DISPLAY_PORT] = kstrdup(buf, GFP_KERNEL); 206 xprt->address_strings[RPC_DISPLAY_PORT] = kstrdup(buf, GFP_KERNEL);
172 207
173 xprt->address_strings[RPC_DISPLAY_PROTO] = "rdma";
174
175 snprintf(buf, sizeof(buf), "%08x", ntohl(sin->sin_addr.s_addr));
176 xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = kstrdup(buf, GFP_KERNEL);
177
178 snprintf(buf, sizeof(buf), "%4hx", rpc_get_port(sap)); 208 snprintf(buf, sizeof(buf), "%4hx", rpc_get_port(sap));
179 xprt->address_strings[RPC_DISPLAY_HEX_PORT] = kstrdup(buf, GFP_KERNEL); 209 xprt->address_strings[RPC_DISPLAY_HEX_PORT] = kstrdup(buf, GFP_KERNEL);
180 210
181 /* netid */ 211 xprt->address_strings[RPC_DISPLAY_PROTO] = "rdma";
182 xprt->address_strings[RPC_DISPLAY_NETID] = "rdma";
183} 212}
184 213
185static void 214static void
@@ -377,7 +406,10 @@ xprt_setup_rdma(struct xprt_create *args)
377 xprt_rdma_connect_worker); 406 xprt_rdma_connect_worker);
378 407
379 xprt_rdma_format_addresses(xprt); 408 xprt_rdma_format_addresses(xprt);
380 xprt->max_payload = rpcrdma_max_payload(new_xprt); 409 xprt->max_payload = new_xprt->rx_ia.ri_ops->ro_maxpages(new_xprt);
410 if (xprt->max_payload == 0)
411 goto out4;
412 xprt->max_payload <<= PAGE_SHIFT;
381 dprintk("RPC: %s: transport data payload maximum: %zu bytes\n", 413 dprintk("RPC: %s: transport data payload maximum: %zu bytes\n",
382 __func__, xprt->max_payload); 414 __func__, xprt->max_payload);
383 415
@@ -552,8 +584,8 @@ xprt_rdma_free(void *buffer)
552 584
553 for (i = 0; req->rl_nchunks;) { 585 for (i = 0; req->rl_nchunks;) {
554 --req->rl_nchunks; 586 --req->rl_nchunks;
555 i += rpcrdma_deregister_external( 587 i += r_xprt->rx_ia.ri_ops->ro_unmap(r_xprt,
556 &req->rl_segments[i], r_xprt); 588 &req->rl_segments[i]);
557 } 589 }
558 590
559 rpcrdma_buffer_put(req); 591 rpcrdma_buffer_put(req);
@@ -579,10 +611,7 @@ xprt_rdma_send_request(struct rpc_task *task)
579 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); 611 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
580 int rc = 0; 612 int rc = 0;
581 613
582 if (req->rl_niovs == 0) 614 rc = rpcrdma_marshal_req(rqst);
583 rc = rpcrdma_marshal_req(rqst);
584 else if (r_xprt->rx_ia.ri_memreg_strategy != RPCRDMA_ALLPHYSICAL)
585 rc = rpcrdma_marshal_chunks(rqst, 0);
586 if (rc < 0) 615 if (rc < 0)
587 goto failed_marshal; 616 goto failed_marshal;
588 617
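A userspace analogue (illustrative only, not the kernel code) of the per-family split above: IPv4 keeps its legacy 8-hex-digit form, while IPv6 gets a real textual address instead of having sin_addr read out of a sockaddr_in6.

#include <arpa/inet.h>
#include <stdio.h>

static void format_hex_addr(const struct sockaddr *sap, char *buf, size_t len)
{
	if (sap->sa_family == AF_INET) {
		const struct sockaddr_in *sin = (const void *)sap;

		snprintf(buf, len, "%08x", ntohl(sin->sin_addr.s_addr));
	} else {
		const struct sockaddr_in6 *sin6 = (const void *)sap;

		inet_ntop(AF_INET6, &sin6->sin6_addr, buf, len);
	}
}

int main(void)
{
	struct sockaddr_in sin = { .sin_family = AF_INET };
	char buf[INET6_ADDRSTRLEN];

	inet_pton(AF_INET, "192.0.2.1", &sin.sin_addr);
	format_hex_addr((struct sockaddr *)&sin, buf, sizeof(buf));
	printf("%s\n", buf);   /* c0000201 */
	return 0;
}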
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 124676c13780..4870d272e006 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -50,6 +50,7 @@
50#include <linux/interrupt.h> 50#include <linux/interrupt.h>
51#include <linux/slab.h> 51#include <linux/slab.h>
52#include <linux/prefetch.h> 52#include <linux/prefetch.h>
53#include <linux/sunrpc/addr.h>
53#include <asm/bitops.h> 54#include <asm/bitops.h>
54 55
55#include "xprt_rdma.h" 56#include "xprt_rdma.h"
@@ -62,9 +63,6 @@
62# define RPCDBG_FACILITY RPCDBG_TRANS 63# define RPCDBG_FACILITY RPCDBG_TRANS
63#endif 64#endif
64 65
65static void rpcrdma_reset_frmrs(struct rpcrdma_ia *);
66static void rpcrdma_reset_fmrs(struct rpcrdma_ia *);
67
68/* 66/*
69 * internal functions 67 * internal functions
70 */ 68 */
@@ -188,7 +186,7 @@ static const char * const wc_status[] = {
188 "remote access error", 186 "remote access error",
189 "remote operation error", 187 "remote operation error",
190 "transport retry counter exceeded", 188 "transport retry counter exceeded",
191 "RNR retrycounter exceeded", 189 "RNR retry counter exceeded",
192 "local RDD violation error", 190 "local RDD violation error",
193 "remove invalid RD request", 191 "remove invalid RD request",
194 "operation aborted", 192 "operation aborted",
@@ -206,21 +204,17 @@ static const char * const wc_status[] = {
206static void 204static void
207rpcrdma_sendcq_process_wc(struct ib_wc *wc) 205rpcrdma_sendcq_process_wc(struct ib_wc *wc)
208{ 206{
209 if (likely(wc->status == IB_WC_SUCCESS))
210 return;
211
212 /* WARNING: Only wr_id and status are reliable at this point */ 207 /* WARNING: Only wr_id and status are reliable at this point */
213 if (wc->wr_id == 0ULL) { 208 if (wc->wr_id == RPCRDMA_IGNORE_COMPLETION) {
214 if (wc->status != IB_WC_WR_FLUSH_ERR) 209 if (wc->status != IB_WC_SUCCESS &&
210 wc->status != IB_WC_WR_FLUSH_ERR)
215 pr_err("RPC: %s: SEND: %s\n", 211 pr_err("RPC: %s: SEND: %s\n",
216 __func__, COMPLETION_MSG(wc->status)); 212 __func__, COMPLETION_MSG(wc->status));
217 } else { 213 } else {
218 struct rpcrdma_mw *r; 214 struct rpcrdma_mw *r;
219 215
220 r = (struct rpcrdma_mw *)(unsigned long)wc->wr_id; 216 r = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
221 r->r.frmr.fr_state = FRMR_IS_STALE; 217 r->mw_sendcompletion(wc);
222 pr_err("RPC: %s: frmr %p (stale): %s\n",
223 __func__, r, COMPLETION_MSG(wc->status));
224 } 218 }
225} 219}
226 220
@@ -424,7 +418,7 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
424 struct rpcrdma_ia *ia = &xprt->rx_ia; 418 struct rpcrdma_ia *ia = &xprt->rx_ia;
425 struct rpcrdma_ep *ep = &xprt->rx_ep; 419 struct rpcrdma_ep *ep = &xprt->rx_ep;
426#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) 420#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
427 struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr; 421 struct sockaddr *sap = (struct sockaddr *)&ep->rep_remote_addr;
428#endif 422#endif
429 struct ib_qp_attr *attr = &ia->ri_qp_attr; 423 struct ib_qp_attr *attr = &ia->ri_qp_attr;
430 struct ib_qp_init_attr *iattr = &ia->ri_qp_init_attr; 424 struct ib_qp_init_attr *iattr = &ia->ri_qp_init_attr;
@@ -480,9 +474,8 @@ connected:
480 wake_up_all(&ep->rep_connect_wait); 474 wake_up_all(&ep->rep_connect_wait);
481 /*FALLTHROUGH*/ 475 /*FALLTHROUGH*/
482 default: 476 default:
483 dprintk("RPC: %s: %pI4:%u (ep 0x%p): %s\n", 477 dprintk("RPC: %s: %pIS:%u (ep 0x%p): %s\n",
484 __func__, &addr->sin_addr.s_addr, 478 __func__, sap, rpc_get_port(sap), ep,
485 ntohs(addr->sin_port), ep,
486 CONNECTION_MSG(event->event)); 479 CONNECTION_MSG(event->event));
487 break; 480 break;
488 } 481 }
@@ -491,19 +484,16 @@ connected:
491 if (connstate == 1) { 484 if (connstate == 1) {
492 int ird = attr->max_dest_rd_atomic; 485 int ird = attr->max_dest_rd_atomic;
493 int tird = ep->rep_remote_cma.responder_resources; 486 int tird = ep->rep_remote_cma.responder_resources;
494 printk(KERN_INFO "rpcrdma: connection to %pI4:%u " 487
495 "on %s, memreg %d slots %d ird %d%s\n", 488 pr_info("rpcrdma: connection to %pIS:%u on %s, memreg '%s', %d credits, %d responders%s\n",
496 &addr->sin_addr.s_addr, 489 sap, rpc_get_port(sap),
497 ntohs(addr->sin_port),
498 ia->ri_id->device->name, 490 ia->ri_id->device->name,
499 ia->ri_memreg_strategy, 491 ia->ri_ops->ro_displayname,
500 xprt->rx_buf.rb_max_requests, 492 xprt->rx_buf.rb_max_requests,
501 ird, ird < 4 && ird < tird / 2 ? " (low!)" : ""); 493 ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
502 } else if (connstate < 0) { 494 } else if (connstate < 0) {
503 printk(KERN_INFO "rpcrdma: connection to %pI4:%u closed (%d)\n", 495 pr_info("rpcrdma: connection to %pIS:%u closed (%d)\n",
504 &addr->sin_addr.s_addr, 496 sap, rpc_get_port(sap), connstate);
505 ntohs(addr->sin_port),
506 connstate);
507 } 497 }
508#endif 498#endif
509 499
@@ -621,17 +611,13 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
621 611
622 if (memreg == RPCRDMA_FRMR) { 612 if (memreg == RPCRDMA_FRMR) {
623 /* Requires both frmr reg and local dma lkey */ 613 /* Requires both frmr reg and local dma lkey */
624 if ((devattr->device_cap_flags & 614 if (((devattr->device_cap_flags &
625 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) != 615 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
626 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) { 616 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) ||
617 (devattr->max_fast_reg_page_list_len == 0)) {
627 dprintk("RPC: %s: FRMR registration " 618 dprintk("RPC: %s: FRMR registration "
628 "not supported by HCA\n", __func__); 619 "not supported by HCA\n", __func__);
629 memreg = RPCRDMA_MTHCAFMR; 620 memreg = RPCRDMA_MTHCAFMR;
630 } else {
631 /* Mind the ia limit on FRMR page list depth */
632 ia->ri_max_frmr_depth = min_t(unsigned int,
633 RPCRDMA_MAX_DATA_SEGS,
634 devattr->max_fast_reg_page_list_len);
635 } 621 }
636 } 622 }
637 if (memreg == RPCRDMA_MTHCAFMR) { 623 if (memreg == RPCRDMA_MTHCAFMR) {
@@ -652,13 +638,16 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
652 */ 638 */
653 switch (memreg) { 639 switch (memreg) {
654 case RPCRDMA_FRMR: 640 case RPCRDMA_FRMR:
641 ia->ri_ops = &rpcrdma_frwr_memreg_ops;
655 break; 642 break;
656 case RPCRDMA_ALLPHYSICAL: 643 case RPCRDMA_ALLPHYSICAL:
644 ia->ri_ops = &rpcrdma_physical_memreg_ops;
657 mem_priv = IB_ACCESS_LOCAL_WRITE | 645 mem_priv = IB_ACCESS_LOCAL_WRITE |
658 IB_ACCESS_REMOTE_WRITE | 646 IB_ACCESS_REMOTE_WRITE |
659 IB_ACCESS_REMOTE_READ; 647 IB_ACCESS_REMOTE_READ;
660 goto register_setup; 648 goto register_setup;
661 case RPCRDMA_MTHCAFMR: 649 case RPCRDMA_MTHCAFMR:
650 ia->ri_ops = &rpcrdma_fmr_memreg_ops;
662 if (ia->ri_have_dma_lkey) 651 if (ia->ri_have_dma_lkey)
663 break; 652 break;
664 mem_priv = IB_ACCESS_LOCAL_WRITE; 653 mem_priv = IB_ACCESS_LOCAL_WRITE;
@@ -678,8 +667,8 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
678 rc = -ENOMEM; 667 rc = -ENOMEM;
679 goto out3; 668 goto out3;
680 } 669 }
681 dprintk("RPC: %s: memory registration strategy is %d\n", 670 dprintk("RPC: %s: memory registration strategy is '%s'\n",
682 __func__, memreg); 671 __func__, ia->ri_ops->ro_displayname);
683 672
684 /* Else will do memory reg/dereg for each chunk */ 673 /* Else will do memory reg/dereg for each chunk */
685 ia->ri_memreg_strategy = memreg; 674 ia->ri_memreg_strategy = memreg;
@@ -743,49 +732,11 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
743 732
744 ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall; 733 ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
745 ep->rep_attr.qp_context = ep; 734 ep->rep_attr.qp_context = ep;
746 /* send_cq and recv_cq initialized below */
747 ep->rep_attr.srq = NULL; 735 ep->rep_attr.srq = NULL;
748 ep->rep_attr.cap.max_send_wr = cdata->max_requests; 736 ep->rep_attr.cap.max_send_wr = cdata->max_requests;
749 switch (ia->ri_memreg_strategy) { 737 rc = ia->ri_ops->ro_open(ia, ep, cdata);
750 case RPCRDMA_FRMR: { 738 if (rc)
751 int depth = 7; 739 return rc;
752
753 /* Add room for frmr register and invalidate WRs.
754 * 1. FRMR reg WR for head
755 * 2. FRMR invalidate WR for head
756 * 3. N FRMR reg WRs for pagelist
757 * 4. N FRMR invalidate WRs for pagelist
758 * 5. FRMR reg WR for tail
759 * 6. FRMR invalidate WR for tail
760 * 7. The RDMA_SEND WR
761 */
762
763 /* Calculate N if the device max FRMR depth is smaller than
764 * RPCRDMA_MAX_DATA_SEGS.
765 */
766 if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) {
767 int delta = RPCRDMA_MAX_DATA_SEGS -
768 ia->ri_max_frmr_depth;
769
770 do {
771 depth += 2; /* FRMR reg + invalidate */
772 delta -= ia->ri_max_frmr_depth;
773 } while (delta > 0);
774
775 }
776 ep->rep_attr.cap.max_send_wr *= depth;
777 if (ep->rep_attr.cap.max_send_wr > devattr->max_qp_wr) {
778 cdata->max_requests = devattr->max_qp_wr / depth;
779 if (!cdata->max_requests)
780 return -EINVAL;
781 ep->rep_attr.cap.max_send_wr = cdata->max_requests *
782 depth;
783 }
784 break;
785 }
786 default:
787 break;
788 }
789 ep->rep_attr.cap.max_recv_wr = cdata->max_requests; 740 ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
790 ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2); 741 ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
791 ep->rep_attr.cap.max_recv_sge = 1; 742 ep->rep_attr.cap.max_recv_sge = 1;
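The switch deleted above is worth a closer look: it sized the send queue so one RPC could carry every FRMR registration and invalidation work request it might need, and that arithmetic now lives behind ro_open in frwr_ops.c. A minimal sketch of the calculation (the function name is the editor's, not the patch's; RPCRDMA_MAX_DATA_SEGS is assumed to be in scope):

static unsigned int frwr_send_depth_sketch(unsigned int max_frmr_depth)
{
	/* Base depth of 7: FRMR reg + invalidate for the head,
	 * one reg + invalidate pair for the pagelist, reg +
	 * invalidate for the tail, and the RDMA_SEND itself.
	 */
	unsigned int depth = 7;

	/* If one FRMR cannot cover RPCRDMA_MAX_DATA_SEGS, each
	 * additional FRMR costs another reg/invalidate WR pair.
	 */
	if (max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) {
		int delta = RPCRDMA_MAX_DATA_SEGS - max_frmr_depth;

		do {
			depth += 2;	/* FRMR reg + invalidate */
			delta -= max_frmr_depth;
		} while (delta > 0);
	}

	/* The caller then sets max_send_wr = cdata->max_requests * depth,
	 * clamped to the device's max_qp_wr, as the removed code did.
	 */
	return depth;
}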
@@ -944,21 +895,9 @@ retry:
944 rpcrdma_ep_disconnect(ep, ia); 895 rpcrdma_ep_disconnect(ep, ia);
945 rpcrdma_flush_cqs(ep); 896 rpcrdma_flush_cqs(ep);
946 897
947 switch (ia->ri_memreg_strategy) {
948 case RPCRDMA_FRMR:
949 rpcrdma_reset_frmrs(ia);
950 break;
951 case RPCRDMA_MTHCAFMR:
952 rpcrdma_reset_fmrs(ia);
953 break;
954 case RPCRDMA_ALLPHYSICAL:
955 break;
956 default:
957 rc = -EIO;
958 goto out;
959 }
960
961 xprt = container_of(ia, struct rpcrdma_xprt, rx_ia); 898 xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
899 ia->ri_ops->ro_reset(xprt);
900
962 id = rpcrdma_create_id(xprt, ia, 901 id = rpcrdma_create_id(xprt, ia,
963 (struct sockaddr *)&xprt->rx_data.addr); 902 (struct sockaddr *)&xprt->rx_data.addr);
964 if (IS_ERR(id)) { 903 if (IS_ERR(id)) {
@@ -1123,91 +1062,6 @@ out:
1123 return ERR_PTR(rc); 1062 return ERR_PTR(rc);
1124} 1063}
1125 1064
1126static int
1127rpcrdma_init_fmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf)
1128{
1129 int mr_access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ;
1130 struct ib_fmr_attr fmr_attr = {
1131 .max_pages = RPCRDMA_MAX_DATA_SEGS,
1132 .max_maps = 1,
1133 .page_shift = PAGE_SHIFT
1134 };
1135 struct rpcrdma_mw *r;
1136 int i, rc;
1137
1138 i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
1139 dprintk("RPC: %s: initalizing %d FMRs\n", __func__, i);
1140
1141 while (i--) {
1142 r = kzalloc(sizeof(*r), GFP_KERNEL);
1143 if (r == NULL)
1144 return -ENOMEM;
1145
1146 r->r.fmr = ib_alloc_fmr(ia->ri_pd, mr_access_flags, &fmr_attr);
1147 if (IS_ERR(r->r.fmr)) {
1148 rc = PTR_ERR(r->r.fmr);
1149 dprintk("RPC: %s: ib_alloc_fmr failed %i\n",
1150 __func__, rc);
1151 goto out_free;
1152 }
1153
1154 list_add(&r->mw_list, &buf->rb_mws);
1155 list_add(&r->mw_all, &buf->rb_all);
1156 }
1157 return 0;
1158
1159out_free:
1160 kfree(r);
1161 return rc;
1162}
1163
1164static int
1165rpcrdma_init_frmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf)
1166{
1167 struct rpcrdma_frmr *f;
1168 struct rpcrdma_mw *r;
1169 int i, rc;
1170
1171 i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
1172 dprintk("RPC: %s: initalizing %d FRMRs\n", __func__, i);
1173
1174 while (i--) {
1175 r = kzalloc(sizeof(*r), GFP_KERNEL);
1176 if (r == NULL)
1177 return -ENOMEM;
1178 f = &r->r.frmr;
1179
1180 f->fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
1181 ia->ri_max_frmr_depth);
1182 if (IS_ERR(f->fr_mr)) {
1183 rc = PTR_ERR(f->fr_mr);
1184 dprintk("RPC: %s: ib_alloc_fast_reg_mr "
1185 "failed %i\n", __func__, rc);
1186 goto out_free;
1187 }
1188
1189 f->fr_pgl = ib_alloc_fast_reg_page_list(ia->ri_id->device,
1190 ia->ri_max_frmr_depth);
1191 if (IS_ERR(f->fr_pgl)) {
1192 rc = PTR_ERR(f->fr_pgl);
1193 dprintk("RPC: %s: ib_alloc_fast_reg_page_list "
1194 "failed %i\n", __func__, rc);
1195
1196 ib_dereg_mr(f->fr_mr);
1197 goto out_free;
1198 }
1199
1200 list_add(&r->mw_list, &buf->rb_mws);
1201 list_add(&r->mw_all, &buf->rb_all);
1202 }
1203
1204 return 0;
1205
1206out_free:
1207 kfree(r);
1208 return rc;
1209}
1210
1211int 1065int
1212rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) 1066rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
1213{ 1067{
@@ -1244,22 +1098,9 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
1244 buf->rb_recv_bufs = (struct rpcrdma_rep **) p; 1098 buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
1245 p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests]; 1099 p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];
1246 1100
1247 INIT_LIST_HEAD(&buf->rb_mws); 1101 rc = ia->ri_ops->ro_init(r_xprt);
1248 INIT_LIST_HEAD(&buf->rb_all); 1102 if (rc)
1249 switch (ia->ri_memreg_strategy) { 1103 goto out;
1250 case RPCRDMA_FRMR:
1251 rc = rpcrdma_init_frmrs(ia, buf);
1252 if (rc)
1253 goto out;
1254 break;
1255 case RPCRDMA_MTHCAFMR:
1256 rc = rpcrdma_init_fmrs(ia, buf);
1257 if (rc)
1258 goto out;
1259 break;
1260 default:
1261 break;
1262 }
1263 1104
1264 for (i = 0; i < buf->rb_max_requests; i++) { 1105 for (i = 0; i < buf->rb_max_requests; i++) {
1265 struct rpcrdma_req *req; 1106 struct rpcrdma_req *req;
@@ -1311,47 +1152,6 @@ rpcrdma_destroy_req(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
1311 kfree(req); 1152 kfree(req);
1312} 1153}
1313 1154
1314static void
1315rpcrdma_destroy_fmrs(struct rpcrdma_buffer *buf)
1316{
1317 struct rpcrdma_mw *r;
1318 int rc;
1319
1320 while (!list_empty(&buf->rb_all)) {
1321 r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
1322 list_del(&r->mw_all);
1323 list_del(&r->mw_list);
1324
1325 rc = ib_dealloc_fmr(r->r.fmr);
1326 if (rc)
1327 dprintk("RPC: %s: ib_dealloc_fmr failed %i\n",
1328 __func__, rc);
1329
1330 kfree(r);
1331 }
1332}
1333
1334static void
1335rpcrdma_destroy_frmrs(struct rpcrdma_buffer *buf)
1336{
1337 struct rpcrdma_mw *r;
1338 int rc;
1339
1340 while (!list_empty(&buf->rb_all)) {
1341 r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
1342 list_del(&r->mw_all);
1343 list_del(&r->mw_list);
1344
1345 rc = ib_dereg_mr(r->r.frmr.fr_mr);
1346 if (rc)
1347 dprintk("RPC: %s: ib_dereg_mr failed %i\n",
1348 __func__, rc);
1349 ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
1350
1351 kfree(r);
1352 }
1353}
1354
1355void 1155void
1356rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) 1156rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
1357{ 1157{
@@ -1372,104 +1172,11 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
1372 rpcrdma_destroy_req(ia, buf->rb_send_bufs[i]); 1172 rpcrdma_destroy_req(ia, buf->rb_send_bufs[i]);
1373 } 1173 }
1374 1174
1375 switch (ia->ri_memreg_strategy) { 1175 ia->ri_ops->ro_destroy(buf);
1376 case RPCRDMA_FRMR:
1377 rpcrdma_destroy_frmrs(buf);
1378 break;
1379 case RPCRDMA_MTHCAFMR:
1380 rpcrdma_destroy_fmrs(buf);
1381 break;
1382 default:
1383 break;
1384 }
1385 1176
1386 kfree(buf->rb_pool); 1177 kfree(buf->rb_pool);
1387} 1178}
1388 1179
1389/* After a disconnect, unmap all FMRs.
1390 *
1391 * This is invoked only in the transport connect worker in order
1392 * to serialize with rpcrdma_register_fmr_external().
1393 */
1394static void
1395rpcrdma_reset_fmrs(struct rpcrdma_ia *ia)
1396{
1397 struct rpcrdma_xprt *r_xprt =
1398 container_of(ia, struct rpcrdma_xprt, rx_ia);
1399 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
1400 struct list_head *pos;
1401 struct rpcrdma_mw *r;
1402 LIST_HEAD(l);
1403 int rc;
1404
1405 list_for_each(pos, &buf->rb_all) {
1406 r = list_entry(pos, struct rpcrdma_mw, mw_all);
1407
1408 INIT_LIST_HEAD(&l);
1409 list_add(&r->r.fmr->list, &l);
1410 rc = ib_unmap_fmr(&l);
1411 if (rc)
1412 dprintk("RPC: %s: ib_unmap_fmr failed %i\n",
1413 __func__, rc);
1414 }
1415}
1416
1417/* After a disconnect, a flushed FAST_REG_MR can leave an FRMR in
1418 * an unusable state. Find FRMRs in this state and dereg / reg
1419 * each. FRMRs that are VALID and attached to an rpcrdma_req are
1420 * also torn down.
1421 *
1422 * This gives all in-use FRMRs a fresh rkey and leaves them INVALID.
1423 *
1424 * This is invoked only in the transport connect worker in order
1425 * to serialize with rpcrdma_register_frmr_external().
1426 */
1427static void
1428rpcrdma_reset_frmrs(struct rpcrdma_ia *ia)
1429{
1430 struct rpcrdma_xprt *r_xprt =
1431 container_of(ia, struct rpcrdma_xprt, rx_ia);
1432 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
1433 struct list_head *pos;
1434 struct rpcrdma_mw *r;
1435 int rc;
1436
1437 list_for_each(pos, &buf->rb_all) {
1438 r = list_entry(pos, struct rpcrdma_mw, mw_all);
1439
1440 if (r->r.frmr.fr_state == FRMR_IS_INVALID)
1441 continue;
1442
1443 rc = ib_dereg_mr(r->r.frmr.fr_mr);
1444 if (rc)
1445 dprintk("RPC: %s: ib_dereg_mr failed %i\n",
1446 __func__, rc);
1447 ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
1448
1449 r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
1450 ia->ri_max_frmr_depth);
1451 if (IS_ERR(r->r.frmr.fr_mr)) {
1452 rc = PTR_ERR(r->r.frmr.fr_mr);
1453 dprintk("RPC: %s: ib_alloc_fast_reg_mr"
1454 " failed %i\n", __func__, rc);
1455 continue;
1456 }
1457 r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list(
1458 ia->ri_id->device,
1459 ia->ri_max_frmr_depth);
1460 if (IS_ERR(r->r.frmr.fr_pgl)) {
1461 rc = PTR_ERR(r->r.frmr.fr_pgl);
1462 dprintk("RPC: %s: "
1463 "ib_alloc_fast_reg_page_list "
1464 "failed %i\n", __func__, rc);
1465
1466 ib_dereg_mr(r->r.frmr.fr_mr);
1467 continue;
1468 }
1469 r->r.frmr.fr_state = FRMR_IS_INVALID;
1470 }
1471}
1472
1473/* "*mw" can be NULL when rpcrdma_buffer_get_mrs() fails, leaving 1180/* "*mw" can be NULL when rpcrdma_buffer_get_mrs() fails, leaving
1474 * some req segments uninitialized. 1181 * some req segments uninitialized.
1475 */ 1182 */
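One detail of the removed FRMR paths deserves a note: an rkey carries an 8-bit key in its low-order byte, and the registration side bumps that key before posting each FAST_REG_MR so every new registration presents a fresh handle. A hedged sketch of that bump, lifted from the registration code removed later in this patch (the helper name is the editor's; ib_update_fast_reg_key() is the verbs API it relies on):

static void frwr_bump_rkey_sketch(struct ib_mr *mr)
{
	/* Replace only the low-order key byte of the rkey */
	u8 key = (u8)(mr->rkey & 0x000000FF);

	ib_update_fast_reg_key(mr, ++key);
	/* mr->rkey now reflects the refreshed key */
}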
@@ -1509,7 +1216,7 @@ rpcrdma_buffer_put_sendbuf(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
1509 } 1216 }
1510} 1217}
1511 1218
1512/* rpcrdma_unmap_one() was already done by rpcrdma_deregister_frmr_external(). 1219/* rpcrdma_unmap_one() was already done during deregistration.
1513 * Redo only the ib_post_send(). 1220 * Redo only the ib_post_send().
1514 */ 1221 */
1515static void 1222static void
@@ -1729,6 +1436,14 @@ rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
1729 * Wrappers for internal-use kmalloc memory registration, used by buffer code. 1436 * Wrappers for internal-use kmalloc memory registration, used by buffer code.
1730 */ 1437 */
1731 1438
1439void
1440rpcrdma_mapping_error(struct rpcrdma_mr_seg *seg)
1441{
1442 dprintk("RPC: map_one: offset %p iova %llx len %zu\n",
1443 seg->mr_offset,
1444 (unsigned long long)seg->mr_dma, seg->mr_dmalen);
1445}
1446
1732static int 1447static int
1733rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len, 1448rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
1734 struct ib_mr **mrp, struct ib_sge *iov) 1449 struct ib_mr **mrp, struct ib_sge *iov)
@@ -1854,287 +1569,6 @@ rpcrdma_free_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb)
1854} 1569}
1855 1570
1856/* 1571/*
1857 * Wrappers for chunk registration, shared by read/write chunk code.
1858 */
1859
1860static void
1861rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing)
1862{
1863 seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
1864 seg->mr_dmalen = seg->mr_len;
1865 if (seg->mr_page)
1866 seg->mr_dma = ib_dma_map_page(ia->ri_id->device,
1867 seg->mr_page, offset_in_page(seg->mr_offset),
1868 seg->mr_dmalen, seg->mr_dir);
1869 else
1870 seg->mr_dma = ib_dma_map_single(ia->ri_id->device,
1871 seg->mr_offset,
1872 seg->mr_dmalen, seg->mr_dir);
1873 if (ib_dma_mapping_error(ia->ri_id->device, seg->mr_dma)) {
1874 dprintk("RPC: %s: mr_dma %llx mr_offset %p mr_dma_len %zu\n",
1875 __func__,
1876 (unsigned long long)seg->mr_dma,
1877 seg->mr_offset, seg->mr_dmalen);
1878 }
1879}
1880
1881static void
1882rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
1883{
1884 if (seg->mr_page)
1885 ib_dma_unmap_page(ia->ri_id->device,
1886 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1887 else
1888 ib_dma_unmap_single(ia->ri_id->device,
1889 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1890}
1891
1892static int
1893rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
1894 int *nsegs, int writing, struct rpcrdma_ia *ia,
1895 struct rpcrdma_xprt *r_xprt)
1896{
1897 struct rpcrdma_mr_seg *seg1 = seg;
1898 struct rpcrdma_mw *mw = seg1->rl_mw;
1899 struct rpcrdma_frmr *frmr = &mw->r.frmr;
1900 struct ib_mr *mr = frmr->fr_mr;
1901 struct ib_send_wr fastreg_wr, *bad_wr;
1902 u8 key;
1903 int len, pageoff;
1904 int i, rc;
1905 int seg_len;
1906 u64 pa;
1907 int page_no;
1908
1909 pageoff = offset_in_page(seg1->mr_offset);
1910 seg1->mr_offset -= pageoff; /* start of page */
1911 seg1->mr_len += pageoff;
1912 len = -pageoff;
1913 if (*nsegs > ia->ri_max_frmr_depth)
1914 *nsegs = ia->ri_max_frmr_depth;
1915 for (page_no = i = 0; i < *nsegs;) {
1916 rpcrdma_map_one(ia, seg, writing);
1917 pa = seg->mr_dma;
1918 for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) {
1919 frmr->fr_pgl->page_list[page_no++] = pa;
1920 pa += PAGE_SIZE;
1921 }
1922 len += seg->mr_len;
1923 ++seg;
1924 ++i;
1925 /* Check for holes */
1926 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1927 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
1928 break;
1929 }
1930 dprintk("RPC: %s: Using frmr %p to map %d segments\n",
1931 __func__, mw, i);
1932
1933 frmr->fr_state = FRMR_IS_VALID;
1934
1935 memset(&fastreg_wr, 0, sizeof(fastreg_wr));
1936 fastreg_wr.wr_id = (unsigned long)(void *)mw;
1937 fastreg_wr.opcode = IB_WR_FAST_REG_MR;
1938 fastreg_wr.wr.fast_reg.iova_start = seg1->mr_dma;
1939 fastreg_wr.wr.fast_reg.page_list = frmr->fr_pgl;
1940 fastreg_wr.wr.fast_reg.page_list_len = page_no;
1941 fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
1942 fastreg_wr.wr.fast_reg.length = page_no << PAGE_SHIFT;
1943 if (fastreg_wr.wr.fast_reg.length < len) {
1944 rc = -EIO;
1945 goto out_err;
1946 }
1947
1948 /* Bump the key */
1949 key = (u8)(mr->rkey & 0x000000FF);
1950 ib_update_fast_reg_key(mr, ++key);
1951
1952 fastreg_wr.wr.fast_reg.access_flags = (writing ?
1953 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
1954 IB_ACCESS_REMOTE_READ);
1955 fastreg_wr.wr.fast_reg.rkey = mr->rkey;
1956 DECR_CQCOUNT(&r_xprt->rx_ep);
1957
1958 rc = ib_post_send(ia->ri_id->qp, &fastreg_wr, &bad_wr);
1959 if (rc) {
1960 dprintk("RPC: %s: failed ib_post_send for register,"
1961 " status %i\n", __func__, rc);
1962 ib_update_fast_reg_key(mr, --key);
1963 goto out_err;
1964 } else {
1965 seg1->mr_rkey = mr->rkey;
1966 seg1->mr_base = seg1->mr_dma + pageoff;
1967 seg1->mr_nsegs = i;
1968 seg1->mr_len = len;
1969 }
1970 *nsegs = i;
1971 return 0;
1972out_err:
1973 frmr->fr_state = FRMR_IS_INVALID;
1974 while (i--)
1975 rpcrdma_unmap_one(ia, --seg);
1976 return rc;
1977}
1978
1979static int
1980rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
1981 struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt)
1982{
1983 struct rpcrdma_mr_seg *seg1 = seg;
1984 struct ib_send_wr invalidate_wr, *bad_wr;
1985 int rc;
1986
1987 seg1->rl_mw->r.frmr.fr_state = FRMR_IS_INVALID;
1988
1989 memset(&invalidate_wr, 0, sizeof invalidate_wr);
1990 invalidate_wr.wr_id = (unsigned long)(void *)seg1->rl_mw;
1991 invalidate_wr.opcode = IB_WR_LOCAL_INV;
1992 invalidate_wr.ex.invalidate_rkey = seg1->rl_mw->r.frmr.fr_mr->rkey;
1993 DECR_CQCOUNT(&r_xprt->rx_ep);
1994
1995 read_lock(&ia->ri_qplock);
1996 while (seg1->mr_nsegs--)
1997 rpcrdma_unmap_one(ia, seg++);
1998 rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
1999 read_unlock(&ia->ri_qplock);
2000 if (rc) {
2001 /* Force rpcrdma_buffer_get() to retry */
2002 seg1->rl_mw->r.frmr.fr_state = FRMR_IS_STALE;
2003 dprintk("RPC: %s: failed ib_post_send for invalidate,"
2004 " status %i\n", __func__, rc);
2005 }
2006 return rc;
2007}
2008
2009static int
2010rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg,
2011 int *nsegs, int writing, struct rpcrdma_ia *ia)
2012{
2013 struct rpcrdma_mr_seg *seg1 = seg;
2014 u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
2015 int len, pageoff, i, rc;
2016
2017 pageoff = offset_in_page(seg1->mr_offset);
2018 seg1->mr_offset -= pageoff; /* start of page */
2019 seg1->mr_len += pageoff;
2020 len = -pageoff;
2021 if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
2022 *nsegs = RPCRDMA_MAX_DATA_SEGS;
2023 for (i = 0; i < *nsegs;) {
2024 rpcrdma_map_one(ia, seg, writing);
2025 physaddrs[i] = seg->mr_dma;
2026 len += seg->mr_len;
2027 ++seg;
2028 ++i;
2029 /* Check for holes */
2030 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
2031 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
2032 break;
2033 }
2034 rc = ib_map_phys_fmr(seg1->rl_mw->r.fmr, physaddrs, i, seg1->mr_dma);
2035 if (rc) {
2036 dprintk("RPC: %s: failed ib_map_phys_fmr "
2037 "%u@0x%llx+%i (%d)... status %i\n", __func__,
2038 len, (unsigned long long)seg1->mr_dma,
2039 pageoff, i, rc);
2040 while (i--)
2041 rpcrdma_unmap_one(ia, --seg);
2042 } else {
2043 seg1->mr_rkey = seg1->rl_mw->r.fmr->rkey;
2044 seg1->mr_base = seg1->mr_dma + pageoff;
2045 seg1->mr_nsegs = i;
2046 seg1->mr_len = len;
2047 }
2048 *nsegs = i;
2049 return rc;
2050}
2051
2052static int
2053rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
2054 struct rpcrdma_ia *ia)
2055{
2056 struct rpcrdma_mr_seg *seg1 = seg;
2057 LIST_HEAD(l);
2058 int rc;
2059
2060 list_add(&seg1->rl_mw->r.fmr->list, &l);
2061 rc = ib_unmap_fmr(&l);
2062 read_lock(&ia->ri_qplock);
2063 while (seg1->mr_nsegs--)
2064 rpcrdma_unmap_one(ia, seg++);
2065 read_unlock(&ia->ri_qplock);
2066 if (rc)
2067 dprintk("RPC: %s: failed ib_unmap_fmr,"
2068 " status %i\n", __func__, rc);
2069 return rc;
2070}
2071
2072int
2073rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
2074 int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
2075{
2076 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
2077 int rc = 0;
2078
2079 switch (ia->ri_memreg_strategy) {
2080
2081 case RPCRDMA_ALLPHYSICAL:
2082 rpcrdma_map_one(ia, seg, writing);
2083 seg->mr_rkey = ia->ri_bind_mem->rkey;
2084 seg->mr_base = seg->mr_dma;
2085 seg->mr_nsegs = 1;
2086 nsegs = 1;
2087 break;
2088
2089 /* Registration using frmr registration */
2090 case RPCRDMA_FRMR:
2091 rc = rpcrdma_register_frmr_external(seg, &nsegs, writing, ia, r_xprt);
2092 break;
2093
2094 /* Registration using fmr memory registration */
2095 case RPCRDMA_MTHCAFMR:
2096 rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia);
2097 break;
2098
2099 default:
2100 return -EIO;
2101 }
2102 if (rc)
2103 return rc;
2104
2105 return nsegs;
2106}
2107
2108int
2109rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
2110 struct rpcrdma_xprt *r_xprt)
2111{
2112 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
2113 int nsegs = seg->mr_nsegs, rc;
2114
2115 switch (ia->ri_memreg_strategy) {
2116
2117 case RPCRDMA_ALLPHYSICAL:
2118 read_lock(&ia->ri_qplock);
2119 rpcrdma_unmap_one(ia, seg);
2120 read_unlock(&ia->ri_qplock);
2121 break;
2122
2123 case RPCRDMA_FRMR:
2124 rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt);
2125 break;
2126
2127 case RPCRDMA_MTHCAFMR:
2128 rc = rpcrdma_deregister_fmr_external(seg, ia);
2129 break;
2130
2131 default:
2132 break;
2133 }
2134 return nsegs;
2135}
2136
2137/*
2138 * Prepost any receive buffer, then post send. 1572 * Prepost any receive buffer, then post send.
2139 * 1573 *
2140 * Receive buffer is donated to hardware, reclaimed upon recv completion. 1574 * Receive buffer is donated to hardware, reclaimed upon recv completion.
@@ -2156,7 +1590,7 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
2156 } 1590 }
2157 1591
2158 send_wr.next = NULL; 1592 send_wr.next = NULL;
2159 send_wr.wr_id = 0ULL; /* no send cookie */ 1593 send_wr.wr_id = RPCRDMA_IGNORE_COMPLETION;
2160 send_wr.sg_list = req->rl_send_iov; 1594 send_wr.sg_list = req->rl_send_iov;
2161 send_wr.num_sge = req->rl_niovs; 1595 send_wr.num_sge = req->rl_niovs;
2162 send_wr.opcode = IB_WR_SEND; 1596 send_wr.opcode = IB_WR_SEND;
@@ -2215,43 +1649,24 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
2215 return rc; 1649 return rc;
2216} 1650}
2217 1651
2218/* Physical mapping means one Read/Write list entry per-page. 1652/* How many chunk list items fit within our inline buffers?
2219 * All list entries must fit within an inline buffer
2220 *
2221 * NB: The server must return a Write list for NFS READ,
2222 * which has the same constraint. Factor in the inline
2223 * rsize as well.
2224 */ 1653 */
2225static size_t 1654unsigned int
2226rpcrdma_physical_max_payload(struct rpcrdma_xprt *r_xprt) 1655rpcrdma_max_segments(struct rpcrdma_xprt *r_xprt)
2227{ 1656{
2228 struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data; 1657 struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
2229 unsigned int inline_size, pages; 1658 int bytes, segments;
2230 1659
2231 inline_size = min_t(unsigned int, 1660 bytes = min_t(unsigned int, cdata->inline_wsize, cdata->inline_rsize);
2232 cdata->inline_wsize, cdata->inline_rsize); 1661 bytes -= RPCRDMA_HDRLEN_MIN;
2233 inline_size -= RPCRDMA_HDRLEN_MIN; 1662 if (bytes < sizeof(struct rpcrdma_segment) * 2) {
2234 pages = inline_size / sizeof(struct rpcrdma_segment); 1663 pr_warn("RPC: %s: inline threshold too small\n",
2235 return pages << PAGE_SHIFT; 1664 __func__);
2236} 1665 return 0;
2237
2238static size_t
2239rpcrdma_mr_max_payload(struct rpcrdma_xprt *r_xprt)
2240{
2241 return RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT;
2242}
2243
2244size_t
2245rpcrdma_max_payload(struct rpcrdma_xprt *r_xprt)
2246{
2247 size_t result;
2248
2249 switch (r_xprt->rx_ia.ri_memreg_strategy) {
2250 case RPCRDMA_ALLPHYSICAL:
2251 result = rpcrdma_physical_max_payload(r_xprt);
2252 break;
2253 default:
2254 result = rpcrdma_mr_max_payload(r_xprt);
2255 } 1666 }
2256 return result; 1667
1668 segments = 1 << (fls(bytes / sizeof(struct rpcrdma_segment)) - 1);
1669 dprintk("RPC: %s: max chunk list size = %d segments\n",
1670 __func__, segments);
1671 return segments;
2257} 1672}
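To make the new rpcrdma_max_segments() concrete, a worked example with assumed defaults (a 1024-byte inline threshold, RPCRDMA_HDRLEN_MIN of 28, and a 16-byte struct rpcrdma_segment; none of these values appear in the hunk itself):

	bytes    = 1024 - 28 = 996
	996 / 16 = 62 chunk list items fit inline
	fls(62)  = 6, so segments = 1 << (6 - 1) = 32

The function therefore rounds the raw item count down to a power of two, here 32 segments, rather than converting it into a byte payload the way the removed rpcrdma_max_payload() did.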
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 0a16fb6f0885..78e0b8beaa36 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -60,6 +60,7 @@
60 * Interface Adapter -- one per transport instance 60 * Interface Adapter -- one per transport instance
61 */ 61 */
62struct rpcrdma_ia { 62struct rpcrdma_ia {
63 const struct rpcrdma_memreg_ops *ri_ops;
63 rwlock_t ri_qplock; 64 rwlock_t ri_qplock;
64 struct rdma_cm_id *ri_id; 65 struct rdma_cm_id *ri_id;
65 struct ib_pd *ri_pd; 66 struct ib_pd *ri_pd;
@@ -105,6 +106,10 @@ struct rpcrdma_ep {
105#define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit) 106#define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit)
106#define DECR_CQCOUNT(ep) atomic_sub_return(1, &(ep)->rep_cqcount) 107#define DECR_CQCOUNT(ep) atomic_sub_return(1, &(ep)->rep_cqcount)
107 108
109/* Force completion handler to ignore the signal
110 */
111#define RPCRDMA_IGNORE_COMPLETION (0ULL)
112
108/* Registered buffer -- registered kmalloc'd memory for RDMA SEND/RECV 113/* Registered buffer -- registered kmalloc'd memory for RDMA SEND/RECV
109 * 114 *
110 * The below structure appears at the front of a large region of kmalloc'd 115 * The below structure appears at the front of a large region of kmalloc'd
@@ -143,14 +148,6 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb)
143 return (struct rpcrdma_msg *)rb->rg_base; 148 return (struct rpcrdma_msg *)rb->rg_base;
144} 149}
145 150
146enum rpcrdma_chunktype {
147 rpcrdma_noch = 0,
148 rpcrdma_readch,
149 rpcrdma_areadch,
150 rpcrdma_writech,
151 rpcrdma_replych
152};
153
154/* 151/*
155 * struct rpcrdma_rep -- this structure encapsulates state required to recv 152 * struct rpcrdma_rep -- this structure encapsulates state required to recv
156 * and complete a reply, asynchronously. It needs several pieces of 153 * and complete a reply, asynchronously. It needs several pieces of
@@ -213,6 +210,7 @@ struct rpcrdma_mw {
213 struct ib_fmr *fmr; 210 struct ib_fmr *fmr;
214 struct rpcrdma_frmr frmr; 211 struct rpcrdma_frmr frmr;
215 } r; 212 } r;
213 void (*mw_sendcompletion)(struct ib_wc *);
216 struct list_head mw_list; 214 struct list_head mw_list;
217 struct list_head mw_all; 215 struct list_head mw_all;
218}; 216};
@@ -258,7 +256,6 @@ struct rpcrdma_req {
258 unsigned int rl_niovs; /* 0, 2 or 4 */ 256 unsigned int rl_niovs; /* 0, 2 or 4 */
259 unsigned int rl_nchunks; /* non-zero if chunks */ 257 unsigned int rl_nchunks; /* non-zero if chunks */
260 unsigned int rl_connect_cookie; /* retry detection */ 258 unsigned int rl_connect_cookie; /* retry detection */
261 enum rpcrdma_chunktype rl_rtype, rl_wtype;
262 struct rpcrdma_buffer *rl_buffer; /* home base for this structure */ 259 struct rpcrdma_buffer *rl_buffer; /* home base for this structure */
263 struct rpcrdma_rep *rl_reply;/* holder for reply buffer */ 260 struct rpcrdma_rep *rl_reply;/* holder for reply buffer */
264 struct ib_sge rl_send_iov[4]; /* for active requests */ 261 struct ib_sge rl_send_iov[4]; /* for active requests */
@@ -340,6 +337,29 @@ struct rpcrdma_stats {
340}; 337};
341 338
342/* 339/*
340 * Per-registration mode operations
341 */
342struct rpcrdma_xprt;
343struct rpcrdma_memreg_ops {
344 int (*ro_map)(struct rpcrdma_xprt *,
345 struct rpcrdma_mr_seg *, int, bool);
346 int (*ro_unmap)(struct rpcrdma_xprt *,
347 struct rpcrdma_mr_seg *);
348 int (*ro_open)(struct rpcrdma_ia *,
349 struct rpcrdma_ep *,
350 struct rpcrdma_create_data_internal *);
351 size_t (*ro_maxpages)(struct rpcrdma_xprt *);
352 int (*ro_init)(struct rpcrdma_xprt *);
353 void (*ro_reset)(struct rpcrdma_xprt *);
354 void (*ro_destroy)(struct rpcrdma_buffer *);
355 const char *ro_displayname;
356};
357
358extern const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops;
359extern const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops;
360extern const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops;
361
362/*
343 * RPCRDMA transport -- encapsulates the structures above for 363 * RPCRDMA transport -- encapsulates the structures above for
344 * integration with RPC. 364 * integration with RPC.
345 * 365 *
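The rpcrdma_memreg_ops table above is the heart of this refactoring: every switch on ri_memreg_strategy in verbs.c collapses into an indirect call through ia->ri_ops. A minimal sketch of a call site, modeled on the connect worker's reset step shown earlier in this patch (the function name is the editor's):

static void reset_mrs_sketch(struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_ia *ia = &r_xprt->rx_ia;

	/* Previously: switch (ia->ri_memreg_strategy) { ... } */
	ia->ri_ops->ro_reset(r_xprt);
}

Each registration mode exports one const instance of the table (rpcrdma_fmr_memreg_ops, rpcrdma_frwr_memreg_ops, rpcrdma_physical_memreg_ops), and rpcrdma_ia_open() installs the matching one when the transport is created.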
@@ -398,16 +418,56 @@ void rpcrdma_buffer_put(struct rpcrdma_req *);
398void rpcrdma_recv_buffer_get(struct rpcrdma_req *); 418void rpcrdma_recv_buffer_get(struct rpcrdma_req *);
399void rpcrdma_recv_buffer_put(struct rpcrdma_rep *); 419void rpcrdma_recv_buffer_put(struct rpcrdma_rep *);
400 420
401int rpcrdma_register_external(struct rpcrdma_mr_seg *,
402 int, int, struct rpcrdma_xprt *);
403int rpcrdma_deregister_external(struct rpcrdma_mr_seg *,
404 struct rpcrdma_xprt *);
405
406struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(struct rpcrdma_ia *, 421struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(struct rpcrdma_ia *,
407 size_t, gfp_t); 422 size_t, gfp_t);
408void rpcrdma_free_regbuf(struct rpcrdma_ia *, 423void rpcrdma_free_regbuf(struct rpcrdma_ia *,
409 struct rpcrdma_regbuf *); 424 struct rpcrdma_regbuf *);
410 425
426unsigned int rpcrdma_max_segments(struct rpcrdma_xprt *);
427
428/*
429 * Wrappers for chunk registration, shared by read/write chunk code.
430 */
431
432void rpcrdma_mapping_error(struct rpcrdma_mr_seg *);
433
434static inline enum dma_data_direction
435rpcrdma_data_dir(bool writing)
436{
437 return writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
438}
439
440static inline void
441rpcrdma_map_one(struct ib_device *device, struct rpcrdma_mr_seg *seg,
442 enum dma_data_direction direction)
443{
444 seg->mr_dir = direction;
445 seg->mr_dmalen = seg->mr_len;
446
447 if (seg->mr_page)
448 seg->mr_dma = ib_dma_map_page(device,
449 seg->mr_page, offset_in_page(seg->mr_offset),
450 seg->mr_dmalen, seg->mr_dir);
451 else
452 seg->mr_dma = ib_dma_map_single(device,
453 seg->mr_offset,
454 seg->mr_dmalen, seg->mr_dir);
455
456 if (ib_dma_mapping_error(device, seg->mr_dma))
457 rpcrdma_mapping_error(seg);
458}
459
460static inline void
461rpcrdma_unmap_one(struct ib_device *device, struct rpcrdma_mr_seg *seg)
462{
463 if (seg->mr_page)
464 ib_dma_unmap_page(device,
465 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
466 else
467 ib_dma_unmap_single(device,
468 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
469}
470
411/* 471/*
412 * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c 472 * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c
413 */ 473 */
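The inlined rpcrdma_map_one()/rpcrdma_unmap_one() pair above is meant to be driven through rpcrdma_data_dir(), so each new ro_map method can translate its writing flag into a DMA direction. A hedged usage sketch (hypothetical caller, not part of the patch):

static void map_unmap_sketch(struct ib_device *device,
			     struct rpcrdma_mr_seg *seg, bool writing)
{
	/* DMA_FROM_DEVICE when the server will write into this
	 * segment, DMA_TO_DEVICE when it will read from it.
	 */
	rpcrdma_map_one(device, seg, rpcrdma_data_dir(writing));

	/* ... registration work would happen here ... */

	rpcrdma_unmap_one(device, seg);
}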
@@ -418,9 +478,7 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *);
418/* 478/*
419 * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c 479 * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c
420 */ 480 */
421ssize_t rpcrdma_marshal_chunks(struct rpc_rqst *, ssize_t);
422int rpcrdma_marshal_req(struct rpc_rqst *); 481int rpcrdma_marshal_req(struct rpc_rqst *);
423size_t rpcrdma_max_payload(struct rpcrdma_xprt *);
424 482
425/* Temporary NFS request map cache. Created in svc_rdma.c */ 483/* Temporary NFS request map cache. Created in svc_rdma.c */
426extern struct kmem_cache *svc_rdma_map_cachep; 484extern struct kmem_cache *svc_rdma_map_cachep;
diff --git a/net/tipc/link.c b/net/tipc/link.c
index a4cf364316de..14f09b3cb87c 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -464,10 +464,11 @@ void tipc_link_reset(struct tipc_link *l_ptr)
464 /* Clean up all queues, except inputq: */ 464 /* Clean up all queues, except inputq: */
465 __skb_queue_purge(&l_ptr->outqueue); 465 __skb_queue_purge(&l_ptr->outqueue);
466 __skb_queue_purge(&l_ptr->deferred_queue); 466 __skb_queue_purge(&l_ptr->deferred_queue);
467 skb_queue_splice_init(&l_ptr->wakeupq, &l_ptr->inputq); 467 if (!owner->inputq)
468 if (!skb_queue_empty(&l_ptr->inputq)) 468 owner->inputq = &l_ptr->inputq;
469 skb_queue_splice_init(&l_ptr->wakeupq, owner->inputq);
470 if (!skb_queue_empty(owner->inputq))
469 owner->action_flags |= TIPC_MSG_EVT; 471 owner->action_flags |= TIPC_MSG_EVT;
470 owner->inputq = &l_ptr->inputq;
471 l_ptr->next_out = NULL; 472 l_ptr->next_out = NULL;
472 l_ptr->unacked_window = 0; 473 l_ptr->unacked_window = 0;
473 l_ptr->checkpoint = 1; 474 l_ptr->checkpoint = 1;
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index be2501538011..b6f84f6a2a09 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -4400,6 +4400,16 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info)
4400 if (parse_station_flags(info, dev->ieee80211_ptr->iftype, &params)) 4400 if (parse_station_flags(info, dev->ieee80211_ptr->iftype, &params))
4401 return -EINVAL; 4401 return -EINVAL;
4402 4402
4403 /* HT/VHT requires QoS, but if we don't have that just ignore HT/VHT
4404 * as userspace might just pass through the capabilities from the IEs
4405 * directly, rather than enforcing this restriction and returning an
4406 * error in this case.
4407 */
4408 if (!(params.sta_flags_set & BIT(NL80211_STA_FLAG_WME))) {
4409 params.ht_capa = NULL;
4410 params.vht_capa = NULL;
4411 }
4412
4403 /* When you run into this, adjust the code below for the new flag */ 4413 /* When you run into this, adjust the code below for the new flag */
4404 BUILD_BUG_ON(NL80211_STA_FLAG_MAX != 7); 4414 BUILD_BUG_ON(NL80211_STA_FLAG_MAX != 7);
4405 4415
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index cee479bc655c..638af0655aaf 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -2269,11 +2269,9 @@ struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig,
2269 * have the xfrm_state's. We need to wait for KM to 2269 * have the xfrm_state's. We need to wait for KM to
2270 * negotiate new SA's or bail out with error.*/ 2270 * negotiate new SA's or bail out with error.*/
2271 if (net->xfrm.sysctl_larval_drop) { 2271 if (net->xfrm.sysctl_larval_drop) {
2272 dst_release(dst);
2273 xfrm_pols_put(pols, drop_pols);
2274 XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES); 2272 XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES);
2275 2273 err = -EREMOTE;
2276 return ERR_PTR(-EREMOTE); 2274 goto error;
2277 } 2275 }
2278 2276
2279 err = -EAGAIN; 2277 err = -EAGAIN;
@@ -2324,7 +2322,8 @@ nopol:
2324error: 2322error:
2325 dst_release(dst); 2323 dst_release(dst);
2326dropdst: 2324dropdst:
2327 dst_release(dst_orig); 2325 if (!(flags & XFRM_LOOKUP_KEEP_DST_REF))
2326 dst_release(dst_orig);
2328 xfrm_pols_put(pols, drop_pols); 2327 xfrm_pols_put(pols, drop_pols);
2329 return ERR_PTR(err); 2328 return ERR_PTR(err);
2330} 2329}
@@ -2338,7 +2337,8 @@ struct dst_entry *xfrm_lookup_route(struct net *net, struct dst_entry *dst_orig,
2338 struct sock *sk, int flags) 2337 struct sock *sk, int flags)
2339{ 2338{
2340 struct dst_entry *dst = xfrm_lookup(net, dst_orig, fl, sk, 2339 struct dst_entry *dst = xfrm_lookup(net, dst_orig, fl, sk,
2341 flags | XFRM_LOOKUP_QUEUE); 2340 flags | XFRM_LOOKUP_QUEUE |
2341 XFRM_LOOKUP_KEEP_DST_REF);
2342 2342
2343 if (IS_ERR(dst) && PTR_ERR(dst) == -EREMOTE) 2343 if (IS_ERR(dst) && PTR_ERR(dst) == -EREMOTE)
2344 return make_blackhole(net, dst_orig->ops->family, dst_orig); 2344 return make_blackhole(net, dst_orig->ops->family, dst_orig);