Diffstat (limited to 'net')
-rw-r--r--  net/9p/protocol.c | 6
-rw-r--r--  net/9p/trans_fd.c | 3
-rw-r--r--  net/9p/trans_rdma.c | 52
-rw-r--r--  net/9p/trans_virtio.c | 5
-rw-r--r--  net/bluetooth/hci_core.c | 3
-rw-r--r--  net/bluetooth/hidp/core.c | 3
-rw-r--r--  net/bridge/br_mdb.c | 2
-rw-r--r--  net/bridge/br_netlink.c | 4
-rw-r--r--  net/bridge/br_private.h | 2
-rw-r--r--  net/ceph/ceph_common.c | 37
-rw-r--r--  net/ceph/crush/crush.c | 14
-rw-r--r--  net/ceph/crush/crush_ln_table.h | 166
-rw-r--r--  net/ceph/crush/mapper.c | 118
-rw-r--r--  net/ceph/debugfs.c | 24
-rw-r--r--  net/ceph/messenger.c | 25
-rw-r--r--  net/ceph/osdmap.c | 25
-rw-r--r--  net/core/dev.c | 16
-rw-r--r--  net/core/filter.c | 41
-rw-r--r--  net/core/net_namespace.c | 3
-rw-r--r--  net/core/rtnetlink.c | 12
-rw-r--r--  net/core/skbuff.c | 40
-rw-r--r--  net/core/sock.c | 2
-rw-r--r--  net/dccp/ipv4.c | 3
-rw-r--r--  net/dccp/ipv6.c | 3
-rw-r--r--  net/dccp/minisocks.c | 3
-rw-r--r--  net/dsa/dsa.c | 8
-rw-r--r--  net/ieee802154/Makefile | 4
-rw-r--r--  net/ieee802154/nl-phy.c | 5
-rw-r--r--  net/ieee802154/nl802154.c | 2
-rw-r--r--  net/ieee802154/rdev-ops.h | 85
-rw-r--r--  net/ieee802154/trace.c | 7
-rw-r--r--  net/ieee802154/trace.h | 247
-rw-r--r--  net/ipv4/fou.c | 3
-rw-r--r--  net/ipv4/inet_connection_sock.c | 34
-rw-r--r--  net/ipv4/inet_diag.c | 30
-rw-r--r--  net/ipv4/ip_forward.c | 3
-rw-r--r--  net/ipv4/ping.c | 1
-rw-r--r--  net/ipv4/route.c | 5
-rw-r--r--  net/ipv4/tcp.c | 42
-rw-r--r--  net/ipv4/tcp_dctcp.c | 21
-rw-r--r--  net/ipv4/tcp_fastopen.c | 1
-rw-r--r--  net/ipv4/tcp_illinois.c | 23
-rw-r--r--  net/ipv4/tcp_input.c | 38
-rw-r--r--  net/ipv4/tcp_ipv4.c | 3
-rw-r--r--  net/ipv4/tcp_minisocks.c | 7
-rw-r--r--  net/ipv4/tcp_output.c | 64
-rw-r--r--  net/ipv4/tcp_vegas.c | 20
-rw-r--r--  net/ipv4/tcp_vegas.h | 3
-rw-r--r--  net/ipv4/tcp_westwood.c | 17
-rw-r--r--  net/ipv6/ip6_gre.c | 9
-rw-r--r--  net/ipv6/ip6_output.c | 39
-rw-r--r--  net/ipv6/route.c | 5
-rw-r--r--  net/ipv6/tcp_ipv6.c | 3
-rw-r--r--  net/mac80211/iface.c | 12
-rw-r--r--  net/mac80211/sta_info.c | 19
-rw-r--r--  net/mac802154/cfg.c | 9
-rw-r--r--  net/mac802154/ieee802154_i.h | 3
-rw-r--r--  net/mac802154/iface.c | 5
-rw-r--r--  net/mac802154/llsec.c | 4
-rw-r--r--  net/mac802154/main.c | 7
-rw-r--r--  net/mpls/af_mpls.c | 141
-rw-r--r--  net/mpls/internal.h | 16
-rw-r--r--  net/netfilter/nf_tables_api.c | 3
-rw-r--r--  net/netfilter/nft_reject.c | 2
-rw-r--r--  net/netfilter/nft_reject_inet.c | 2
-rw-r--r--  net/netlink/af_netlink.c | 7
-rw-r--r--  net/packet/af_packet.c | 9
-rw-r--r--  net/rds/connection.c | 17
-rw-r--r--  net/rds/ib_cm.c | 13
-rw-r--r--  net/rds/tcp_connect.c | 1
-rw-r--r--  net/rds/tcp_listen.c | 46
-rw-r--r--  net/sched/act_bpf.c | 3
-rw-r--r--  net/sched/act_connmark.c | 2
-rw-r--r--  net/sched/act_mirred.c | 2
-rw-r--r--  net/sched/cls_api.c | 7
-rw-r--r--  net/sched/cls_bpf.c | 3
-rw-r--r--  net/sched/sch_codel.c | 2
-rw-r--r--  net/sched/sch_fq_codel.c | 2
-rw-r--r--  net/sched/sch_gred.c | 4
-rw-r--r--  net/socket.c | 6
-rw-r--r--  net/sunrpc/auth_gss/gss_rpc_xdr.c | 23
-rw-r--r--  net/sunrpc/rpc_pipe.c | 32
-rw-r--r--  net/sunrpc/sched.c | 4
-rw-r--r--  net/sunrpc/xprt.c | 22
-rw-r--r--  net/sunrpc/xprtrdma/Makefile | 3
-rw-r--r--  net/sunrpc/xprtrdma/fmr_ops.c | 208
-rw-r--r--  net/sunrpc/xprtrdma/frwr_ops.c | 353
-rw-r--r--  net/sunrpc/xprtrdma/physical_ops.c | 94
-rw-r--r--  net/sunrpc/xprtrdma/rpc_rdma.c | 87
-rw-r--r--  net/sunrpc/xprtrdma/transport.c | 61
-rw-r--r--  net/sunrpc/xprtrdma/verbs.c | 699
-rw-r--r--  net/sunrpc/xprtrdma/xprt_rdma.h | 90
-rw-r--r--  net/tipc/bearer.c | 17
-rw-r--r--  net/tipc/link.c | 16
-rw-r--r--  net/tipc/server.c | 9
-rw-r--r--  net/tipc/socket.c | 3
-rw-r--r--  net/unix/af_unix.c | 8
-rw-r--r--  net/unix/diag.c | 2
-rw-r--r--  net/unix/garbage.c | 70
99 files changed, 2392 insertions(+), 1097 deletions(-)
diff --git a/net/9p/protocol.c b/net/9p/protocol.c
index e9d0f0c1a048..16d287565987 100644
--- a/net/9p/protocol.c
+++ b/net/9p/protocol.c
@@ -275,7 +275,7 @@ p9pdu_vreadf(struct p9_fcall *pdu, int proto_version, const char *fmt,
 				}
 				break;
 			case 'R':{
-				int16_t *nwqid = va_arg(ap, int16_t *);
+				uint16_t *nwqid = va_arg(ap, uint16_t *);
 				struct p9_qid **wqids =
 				    va_arg(ap, struct p9_qid **);
 
@@ -440,7 +440,7 @@ p9pdu_vwritef(struct p9_fcall *pdu, int proto_version, const char *fmt,
 						stbuf->n_gid, stbuf->n_muid);
 			} break;
 		case 'V':{
-				int32_t count = va_arg(ap, int32_t);
+				uint32_t count = va_arg(ap, uint32_t);
 				struct iov_iter *from =
 				    va_arg(ap, struct iov_iter *);
 				errcode = p9pdu_writef(pdu, proto_version, "d",
@@ -471,7 +471,7 @@ p9pdu_vwritef(struct p9_fcall *pdu, int proto_version, const char *fmt,
 			}
 			break;
 		case 'R':{
-				int16_t nwqid = va_arg(ap, int);
+				uint16_t nwqid = va_arg(ap, int);
 				struct p9_qid *wqids =
 				    va_arg(ap, struct p9_qid *);
 
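
The protocol.c hunks above switch the 9p count fields to unsigned types. A minimal user-space sketch of why the signedness matters (illustrative only, not kernel code): varargs promote short counts to int, and a 16-bit wire count with the high bit set turns negative when read back as int16_t, so loops bounded by it silently never run.

    /* Hypothetical illustration: a wire count of 0x8001 read back as
     * int16_t becomes negative, so a loop bounded by it never executes;
     * as uint16_t it keeps its full 16-bit value. */
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
    	uint16_t wire = 0x8001;			/* count as encoded on the wire */
    	int16_t as_signed = (int16_t)wire;

    	for (int i = 0; i < as_signed; i++)	/* never runs: bound is negative */
    		puts("signed iteration");
    	for (int i = 0; i < wire; i++)		/* runs 0x8001 times */
    		;
    	printf("signed=%d unsigned=%u\n", as_signed, wire);
    	return 0;
    }
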
diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c
index 3e3d82d8ff70..bced8c074c12 100644
--- a/net/9p/trans_fd.c
+++ b/net/9p/trans_fd.c
@@ -734,6 +734,7 @@ static int parse_opts(char *params, struct p9_fd_opts *opts)
 	opts->port = P9_PORT;
 	opts->rfd = ~0;
 	opts->wfd = ~0;
+	opts->privport = 0;
 
 	if (!params)
 		return 0;
@@ -1013,7 +1014,6 @@ p9_fd_create(struct p9_client *client, const char *addr, char *args)
 {
 	int err;
 	struct p9_fd_opts opts;
-	struct p9_trans_fd *p;
 
 	parse_opts(args, &opts);
 
@@ -1026,7 +1026,6 @@ p9_fd_create(struct p9_client *client, const char *addr, char *args)
 	if (err < 0)
 		return err;
 
-	p = (struct p9_trans_fd *) client->trans;
 	p9_conn_create(client);
 
 	return 0;
diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c
index 14ad43b5cf89..3533d2a53ab6 100644
--- a/net/9p/trans_rdma.c
+++ b/net/9p/trans_rdma.c
@@ -139,6 +139,7 @@ struct p9_rdma_opts {
 	int sq_depth;
 	int rq_depth;
 	long timeout;
+	int privport;
 };
 
 /*
@@ -146,7 +147,10 @@ struct p9_rdma_opts {
  */
 enum {
 	/* Options that take integer arguments */
-	Opt_port, Opt_rq_depth, Opt_sq_depth, Opt_timeout, Opt_err,
+	Opt_port, Opt_rq_depth, Opt_sq_depth, Opt_timeout,
+	/* Options that take no argument */
+	Opt_privport,
+	Opt_err,
 };
 
 static match_table_t tokens = {
@@ -154,6 +158,7 @@ static match_table_t tokens = {
 	{Opt_sq_depth, "sq=%u"},
 	{Opt_rq_depth, "rq=%u"},
 	{Opt_timeout, "timeout=%u"},
+	{Opt_privport, "privport"},
 	{Opt_err, NULL},
 };
 
@@ -175,6 +180,7 @@ static int parse_opts(char *params, struct p9_rdma_opts *opts)
 	opts->sq_depth = P9_RDMA_SQ_DEPTH;
 	opts->rq_depth = P9_RDMA_RQ_DEPTH;
 	opts->timeout = P9_RDMA_TIMEOUT;
+	opts->privport = 0;
 
 	if (!params)
 		return 0;
@@ -193,13 +199,13 @@ static int parse_opts(char *params, struct p9_rdma_opts *opts)
 		if (!*p)
 			continue;
 		token = match_token(p, tokens, args);
-		if (token == Opt_err)
-			continue;
-		r = match_int(&args[0], &option);
-		if (r < 0) {
-			p9_debug(P9_DEBUG_ERROR,
-				 "integer field, but no integer?\n");
-			continue;
+		if ((token != Opt_err) && (token != Opt_privport)) {
+			r = match_int(&args[0], &option);
+			if (r < 0) {
+				p9_debug(P9_DEBUG_ERROR,
+					 "integer field, but no integer?\n");
+				continue;
+			}
 		}
 		switch (token) {
 		case Opt_port:
@@ -214,6 +220,9 @@ static int parse_opts(char *params, struct p9_rdma_opts *opts)
 		case Opt_timeout:
 			opts->timeout = option;
 			break;
+		case Opt_privport:
+			opts->privport = 1;
+			break;
 		default:
 			continue;
 		}
@@ -607,6 +616,23 @@ static int rdma_cancelled(struct p9_client *client, struct p9_req_t *req)
 	return 0;
 }
 
+static int p9_rdma_bind_privport(struct p9_trans_rdma *rdma)
+{
+	struct sockaddr_in cl = {
+		.sin_family = AF_INET,
+		.sin_addr.s_addr = htonl(INADDR_ANY),
+	};
+	int port, err = -EINVAL;
+
+	for (port = P9_DEF_MAX_RESVPORT; port >= P9_DEF_MIN_RESVPORT; port--) {
+		cl.sin_port = htons((ushort)port);
+		err = rdma_bind_addr(rdma->cm_id, (struct sockaddr *)&cl);
+		if (err != -EADDRINUSE)
+			break;
+	}
+	return err;
+}
+
 /**
  * trans_create_rdma - Transport method for creating atransport instance
  * @client: client instance
@@ -642,6 +668,16 @@ rdma_create_trans(struct p9_client *client, const char *addr, char *args)
 	/* Associate the client with the transport */
 	client->trans = rdma;
 
+	/* Bind to a privileged port if we need to */
+	if (opts.privport) {
+		err = p9_rdma_bind_privport(rdma);
+		if (err < 0) {
+			pr_err("%s (%d): problem binding to privport: %d\n",
+			       __func__, task_pid_nr(current), -err);
+			goto error;
+		}
+	}
+
 	/* Resolve the server's address */
 	rdma->addr.sin_family = AF_INET;
 	rdma->addr.sin_addr.s_addr = in_aton(addr);
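
privport support for the RDMA transport reuses the reserved-port convention of the TCP transport: scan the privileged range from the top down until a bind succeeds or fails with something other than EADDRINUSE. A stand-alone sketch of the same pattern over a plain socket (the exact P9_DEF_*_RESVPORT bounds used below are assumptions, not taken from this patch):

    /* Illustrative user-space analogue of p9_rdma_bind_privport(): try
     * privileged ports from high to low, stopping on success or on any
     * error other than "address in use". Port bounds are assumptions. */
    #include <arpa/inet.h>
    #include <errno.h>
    #include <netinet/in.h>
    #include <string.h>
    #include <sys/socket.h>

    static int bind_privport(int fd)
    {
    	struct sockaddr_in cl;
    	int port, err = -1;

    	memset(&cl, 0, sizeof(cl));
    	cl.sin_family = AF_INET;
    	cl.sin_addr.s_addr = htonl(INADDR_ANY);

    	for (port = 1023; port >= 665; port--) {	/* assumed range */
    		cl.sin_port = htons((unsigned short)port);
    		err = bind(fd, (struct sockaddr *)&cl, sizeof(cl));
    		if (err == 0 || errno != EADDRINUSE)
    			break;
    	}
    	return err;
    }
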
diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c
index e62bcbbabb5e..9dd49ca67dbc 100644
--- a/net/9p/trans_virtio.c
+++ b/net/9p/trans_virtio.c
@@ -525,7 +525,10 @@ static ssize_t p9_mount_tag_show(struct device *dev,
 	vdev = dev_to_virtio(dev);
 	chan = vdev->priv;
 
-	return snprintf(buf, chan->tag_len + 1, "%s", chan->tag);
+	memcpy(buf, chan->tag, chan->tag_len);
+	buf[chan->tag_len] = 0;
+
+	return chan->tag_len + 1;
 }
 
 static DEVICE_ATTR(mount_tag, 0444, p9_mount_tag_show, NULL);
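
The mount_tag change replaces snprintf() with memcpy() because chan->tag is a length-counted buffer with no guaranteed NUL terminator, so the "%s" conversion could read past its end. A small sketch of the difference (illustrative, not kernel code):

    /* Illustrative only: 'tag' models chan->tag, which carries tag_len
     * bytes and no trailing '\0'. */
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
    	char tag[3] = { 'v', 'f', 's' };	/* no terminator */
    	size_t tag_len = sizeof(tag);
    	char buf[16];

    	/* snprintf(buf, tag_len + 1, "%s", tag) would scan 'tag' for a
    	 * NUL that may not exist; the bounded copy cannot overrun. */
    	memcpy(buf, tag, tag_len);
    	buf[tag_len] = 0;
    	printf("%s (%zu+1 bytes)\n", buf, tag_len);
    	return 0;
    }
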
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index 476709bd068a..4663c3dad3f5 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -1557,7 +1557,8 @@ static int hci_dev_do_close(struct hci_dev *hdev)
 {
 	BT_DBG("%s %p", hdev->name, hdev);
 
-	if (!hci_dev_test_flag(hdev, HCI_UNREGISTER)) {
+	if (!hci_dev_test_flag(hdev, HCI_UNREGISTER) &&
+	    test_bit(HCI_UP, &hdev->flags)) {
 		/* Execute vendor specific shutdown routine */
 		if (hdev->shutdown)
 			hdev->shutdown(hdev);
diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c
index a05b9dbf14c9..9070dfd6b4ad 100644
--- a/net/bluetooth/hidp/core.c
+++ b/net/bluetooth/hidp/core.c
@@ -1313,7 +1313,8 @@ int hidp_connection_add(struct hidp_connadd_req *req,
 			struct socket *ctrl_sock,
 			struct socket *intr_sock)
 {
-	u32 valid_flags = 0;
+	u32 valid_flags = BIT(HIDP_VIRTUAL_CABLE_UNPLUG) |
+			  BIT(HIDP_BOOT_PROTOCOL_MODE);
 	struct hidp_session *session;
 	struct l2cap_conn *conn;
 	struct l2cap_chan *chan;
diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index 409608960899..e29ad70b3000 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -170,7 +170,7 @@ static int nlmsg_populate_mdb_fill(struct sk_buff *skb,
 	struct br_port_msg *bpm;
 	struct nlattr *nest, *nest2;
 
-	nlh = nlmsg_put(skb, pid, seq, type, sizeof(*bpm), NLM_F_MULTI);
+	nlh = nlmsg_put(skb, pid, seq, type, sizeof(*bpm), 0);
 	if (!nlh)
 		return -EMSGSIZE;
 
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index 0e4ddb81610d..4b5c236998ff 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -394,7 +394,7 @@ errout:
  * Dump information about all ports, in response to GETLINK
  */
 int br_getlink(struct sk_buff *skb, u32 pid, u32 seq,
-	       struct net_device *dev, u32 filter_mask)
+	       struct net_device *dev, u32 filter_mask, int nlflags)
 {
 	struct net_bridge_port *port = br_port_get_rtnl(dev);
 
@@ -402,7 +402,7 @@ int br_getlink(struct sk_buff *skb, u32 pid, u32 seq,
 	    !(filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED))
 		return 0;
 
-	return br_fill_ifinfo(skb, port, pid, seq, RTM_NEWLINK, NLM_F_MULTI,
+	return br_fill_ifinfo(skb, port, pid, seq, RTM_NEWLINK, nlflags,
 			      filter_mask, dev);
 }
 
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 6ca0251cb478..3362c29400f1 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -828,7 +828,7 @@ void br_ifinfo_notify(int event, struct net_bridge_port *port);
 int br_setlink(struct net_device *dev, struct nlmsghdr *nlmsg, u16 flags);
 int br_dellink(struct net_device *dev, struct nlmsghdr *nlmsg, u16 flags);
 int br_getlink(struct sk_buff *skb, u32 pid, u32 seq, struct net_device *dev,
-	       u32 filter_mask);
+	       u32 filter_mask, int nlflags);
 
 #ifdef CONFIG_SYSFS
 /* br_sysfs_if.c */
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index ec565508e904..79e8f71aef5b 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -490,6 +490,43 @@ out:
 }
 EXPORT_SYMBOL(ceph_parse_options);
 
+int ceph_print_client_options(struct seq_file *m, struct ceph_client *client)
+{
+	struct ceph_options *opt = client->options;
+	size_t pos = m->count;
+
+	if (opt->name)
+		seq_printf(m, "name=%s,", opt->name);
+	if (opt->key)
+		seq_puts(m, "secret=<hidden>,");
+
+	if (opt->flags & CEPH_OPT_FSID)
+		seq_printf(m, "fsid=%pU,", &opt->fsid);
+	if (opt->flags & CEPH_OPT_NOSHARE)
+		seq_puts(m, "noshare,");
+	if (opt->flags & CEPH_OPT_NOCRC)
+		seq_puts(m, "nocrc,");
+	if (opt->flags & CEPH_OPT_NOMSGAUTH)
+		seq_puts(m, "nocephx_require_signatures,");
+	if ((opt->flags & CEPH_OPT_TCP_NODELAY) == 0)
+		seq_puts(m, "notcp_nodelay,");
+
+	if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
+		seq_printf(m, "mount_timeout=%d,", opt->mount_timeout);
+	if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
+		seq_printf(m, "osd_idle_ttl=%d,", opt->osd_idle_ttl);
+	if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
+		seq_printf(m, "osdkeepalivetimeout=%d,",
+			   opt->osd_keepalive_timeout);
+
+	/* drop redundant comma */
+	if (m->count != pos)
+		m->count--;
+
+	return 0;
+}
+EXPORT_SYMBOL(ceph_print_client_options);
+
 u64 ceph_client_id(struct ceph_client *client)
 {
 	return client->monc.auth->global_id;
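
ceph_print_client_options() appends every option with a trailing comma and then simply rewinds the seq_file count by one byte, instead of threading "is this the first item" state through every branch. The same trick over a plain buffer (an illustrative sketch, not the kernel seq_file API):

    /* Illustrative version of the trailing-comma trick: append "key,"
     * for each set option, then drop the final comma if anything was
     * written. */
    #include <stdio.h>

    int main(void)
    {
    	char buf[64];
    	size_t count = 0, pos;

    	pos = count;	/* mirrors: size_t pos = m->count; */
    	count += snprintf(buf + count, sizeof(buf) - count, "noshare,");
    	count += snprintf(buf + count, sizeof(buf) - count, "nocrc,");

    	if (count != pos)	/* drop redundant comma */
    		count--;
    	buf[count] = '\0';
    	printf("%s\n", buf);	/* prints "noshare,nocrc" */
    	return 0;
    }
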
diff --git a/net/ceph/crush/crush.c b/net/ceph/crush/crush.c
index 16bc199d9a62..9d84ce4ea0df 100644
--- a/net/ceph/crush/crush.c
+++ b/net/ceph/crush/crush.c
@@ -17,6 +17,7 @@ const char *crush_bucket_alg_name(int alg)
 	case CRUSH_BUCKET_LIST: return "list";
 	case CRUSH_BUCKET_TREE: return "tree";
 	case CRUSH_BUCKET_STRAW: return "straw";
+	case CRUSH_BUCKET_STRAW2: return "straw2";
 	default: return "unknown";
 	}
 }
@@ -40,6 +41,8 @@ int crush_get_bucket_item_weight(const struct crush_bucket *b, int p)
 		return ((struct crush_bucket_tree *)b)->node_weights[crush_calc_tree_node(p)];
 	case CRUSH_BUCKET_STRAW:
 		return ((struct crush_bucket_straw *)b)->item_weights[p];
+	case CRUSH_BUCKET_STRAW2:
+		return ((struct crush_bucket_straw2 *)b)->item_weights[p];
 	}
 	return 0;
 }
@@ -77,6 +80,14 @@ void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
 	kfree(b);
 }
 
+void crush_destroy_bucket_straw2(struct crush_bucket_straw2 *b)
+{
+	kfree(b->item_weights);
+	kfree(b->h.perm);
+	kfree(b->h.items);
+	kfree(b);
+}
+
 void crush_destroy_bucket(struct crush_bucket *b)
 {
 	switch (b->alg) {
@@ -92,6 +103,9 @@ void crush_destroy_bucket(struct crush_bucket *b)
 	case CRUSH_BUCKET_STRAW:
 		crush_destroy_bucket_straw((struct crush_bucket_straw *)b);
 		break;
+	case CRUSH_BUCKET_STRAW2:
+		crush_destroy_bucket_straw2((struct crush_bucket_straw2 *)b);
+		break;
 	}
 }
 
diff --git a/net/ceph/crush/crush_ln_table.h b/net/ceph/crush/crush_ln_table.h
new file mode 100644
index 000000000000..6192c7fc958c
--- /dev/null
+++ b/net/ceph/crush/crush_ln_table.h
@@ -0,0 +1,166 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Intel Corporation All Rights Reserved
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#if defined(__linux__)
+#include <linux/types.h>
+#elif defined(__FreeBSD__)
+#include <sys/types.h>
+#endif
+
+#ifndef CEPH_CRUSH_LN_H
+#define CEPH_CRUSH_LN_H
+
+
+// RH_LH_tbl[2*k] = 2^48/(1.0+k/128.0)
+// RH_LH_tbl[2*k+1] = 2^48*log2(1.0+k/128.0)
+
+static int64_t __RH_LH_tbl[128*2+2] = {
+  0x0001000000000000ll, 0x0000000000000000ll, 0x0000fe03f80fe040ll, 0x000002dfca16dde1ll,
+  0x0000fc0fc0fc0fc1ll, 0x000005b9e5a170b4ll, 0x0000fa232cf25214ll, 0x0000088e68ea899all,
+  0x0000f83e0f83e0f9ll, 0x00000b5d69bac77ell, 0x0000f6603d980f67ll, 0x00000e26fd5c8555ll,
+  0x0000f4898d5f85bcll, 0x000010eb389fa29fll, 0x0000f2b9d6480f2cll, 0x000013aa2fdd27f1ll,
+  0x0000f0f0f0f0f0f1ll, 0x00001663f6fac913ll, 0x0000ef2eb71fc435ll, 0x00001918a16e4633ll,
+  0x0000ed7303b5cc0fll, 0x00001bc84240adabll, 0x0000ebbdb2a5c162ll, 0x00001e72ec117fa5ll,
+  0x0000ea0ea0ea0ea1ll, 0x00002118b119b4f3ll, 0x0000e865ac7b7604ll, 0x000023b9a32eaa56ll,
+  0x0000e6c2b4481cd9ll, 0x00002655d3c4f15cll, 0x0000e525982af70dll, 0x000028ed53f307eell,
+  0x0000e38e38e38e39ll, 0x00002b803473f7adll, 0x0000e1fc780e1fc8ll, 0x00002e0e85a9de04ll,
+  0x0000e070381c0e08ll, 0x0000309857a05e07ll, 0x0000dee95c4ca038ll, 0x0000331dba0efce1ll,
+  0x0000dd67c8a60dd7ll, 0x0000359ebc5b69d9ll, 0x0000dbeb61eed19dll, 0x0000381b6d9bb29bll,
+  0x0000da740da740dbll, 0x00003a93dc9864b2ll, 0x0000d901b2036407ll, 0x00003d0817ce9cd4ll,
+  0x0000d79435e50d7all, 0x00003f782d7204d0ll, 0x0000d62b80d62b81ll, 0x000041e42b6ec0c0ll,
+  0x0000d4c77b03531ell, 0x0000444c1f6b4c2dll, 0x0000d3680d3680d4ll, 0x000046b016ca47c1ll,
+  0x0000d20d20d20d21ll, 0x000049101eac381cll, 0x0000d0b69fcbd259ll, 0x00004b6c43f1366all,
+  0x0000cf6474a8819fll, 0x00004dc4933a9337ll, 0x0000ce168a772509ll, 0x0000501918ec6c11ll,
+  0x0000cccccccccccdll, 0x00005269e12f346ell, 0x0000cb8727c065c4ll, 0x000054b6f7f1325all,
+  0x0000ca4587e6b750ll, 0x0000570068e7ef5all, 0x0000c907da4e8712ll, 0x000059463f919deell,
+  0x0000c7ce0c7ce0c8ll, 0x00005b8887367433ll, 0x0000c6980c6980c7ll, 0x00005dc74ae9fbecll,
+  0x0000c565c87b5f9ell, 0x00006002958c5871ll, 0x0000c4372f855d83ll, 0x0000623a71cb82c8ll,
+  0x0000c30c30c30c31ll, 0x0000646eea247c5cll, 0x0000c1e4bbd595f7ll, 0x000066a008e4788cll,
+  0x0000c0c0c0c0c0c1ll, 0x000068cdd829fd81ll, 0x0000bfa02fe80bfbll, 0x00006af861e5fc7dll,
+  0x0000be82fa0be830ll, 0x00006d1fafdce20all, 0x0000bd6910470767ll, 0x00006f43cba79e40ll,
+  0x0000bc52640bc527ll, 0x00007164beb4a56dll, 0x0000bb3ee721a54ell, 0x000073829248e961ll,
+  0x0000ba2e8ba2e8bbll, 0x0000759d4f80cba8ll, 0x0000b92143fa36f6ll, 0x000077b4ff5108d9ll,
+  0x0000b81702e05c0cll, 0x000079c9aa879d53ll, 0x0000b70fbb5a19bfll, 0x00007bdb59cca388ll,
+  0x0000b60b60b60b61ll, 0x00007dea15a32c1bll, 0x0000b509e68a9b95ll, 0x00007ff5e66a0ffell,
+  0x0000b40b40b40b41ll, 0x000081fed45cbccbll, 0x0000b30f63528918ll, 0x00008404e793fb81ll,
+  0x0000b21642c8590cll, 0x000086082806b1d5ll, 0x0000b11fd3b80b12ll, 0x000088089d8a9e47ll,
+  0x0000b02c0b02c0b1ll, 0x00008a064fd50f2all, 0x0000af3addc680b0ll, 0x00008c01467b94bbll,
+  0x0000ae4c415c9883ll, 0x00008df988f4ae80ll, 0x0000ad602b580ad7ll, 0x00008fef1e987409ll,
+  0x0000ac7691840ac8ll, 0x000091e20ea1393ell, 0x0000ab8f69e2835all, 0x000093d2602c2e5fll,
+  0x0000aaaaaaaaaaabll, 0x000095c01a39fbd6ll, 0x0000a9c84a47a080ll, 0x000097ab43af59f9ll,
+  0x0000a8e83f5717c1ll, 0x00009993e355a4e5ll, 0x0000a80a80a80a81ll, 0x00009b79ffdb6c8bll,
+  0x0000a72f0539782all, 0x00009d5d9fd5010bll, 0x0000a655c4392d7cll, 0x00009f3ec9bcfb80ll,
+  0x0000a57eb50295fbll, 0x0000a11d83f4c355ll, 0x0000a4a9cf1d9684ll, 0x0000a2f9d4c51039ll,
+  0x0000a3d70a3d70a4ll, 0x0000a4d3c25e68dcll, 0x0000a3065e3fae7dll, 0x0000a6ab52d99e76ll,
+  0x0000a237c32b16d0ll, 0x0000a8808c384547ll, 0x0000a16b312ea8fdll, 0x0000aa5374652a1cll,
+  0x0000a0a0a0a0a0a1ll, 0x0000ac241134c4e9ll, 0x00009fd809fd80a0ll, 0x0000adf26865a8a1ll,
+  0x00009f1165e72549ll, 0x0000afbe7fa0f04dll, 0x00009e4cad23dd60ll, 0x0000b1885c7aa982ll,
+  0x00009d89d89d89d9ll, 0x0000b35004723c46ll, 0x00009cc8e160c3fcll, 0x0000b5157cf2d078ll,
+  0x00009c09c09c09c1ll, 0x0000b6d8cb53b0call, 0x00009b4c6f9ef03bll, 0x0000b899f4d8ab63ll,
+  0x00009a90e7d95bc7ll, 0x0000ba58feb2703all, 0x000099d722dabde6ll, 0x0000bc15edfeed32ll,
+  0x0000991f1a515886ll, 0x0000bdd0c7c9a817ll, 0x00009868c809868dll, 0x0000bf89910c1678ll,
+  0x000097b425ed097cll, 0x0000c1404eadf383ll, 0x000097012e025c05ll, 0x0000c2f5058593d9ll,
+  0x0000964fda6c0965ll, 0x0000c4a7ba58377cll, 0x000095a02568095bll, 0x0000c65871da59ddll,
+  0x000094f2094f2095ll, 0x0000c80730b00016ll, 0x0000944580944581ll, 0x0000c9b3fb6d0559ll,
+  0x0000939a85c4093all, 0x0000cb5ed69565afll, 0x000092f113840498ll, 0x0000cd07c69d8702ll,
+  0x0000924924924925ll, 0x0000ceaecfea8085ll, 0x000091a2b3c4d5e7ll, 0x0000d053f6d26089ll,
+  0x000090fdbc090fdcll, 0x0000d1f73f9c70c0ll, 0x0000905a38633e07ll, 0x0000d398ae817906ll,
+  0x00008fb823ee08fcll, 0x0000d53847ac00a6ll, 0x00008f1779d9fdc4ll, 0x0000d6d60f388e41ll,
+  0x00008e78356d1409ll, 0x0000d8720935e643ll, 0x00008dda5202376all, 0x0000da0c39a54804ll,
+  0x00008d3dcb08d3ddll, 0x0000dba4a47aa996ll, 0x00008ca29c046515ll, 0x0000dd3b4d9cf24bll,
+  0x00008c08c08c08c1ll, 0x0000ded038e633f3ll, 0x00008b70344a139cll, 0x0000e0636a23e2eell,
+  0x00008ad8f2fba939ll, 0x0000e1f4e5170d02ll, 0x00008a42f870566all, 0x0000e384ad748f0ell,
+  0x000089ae4089ae41ll, 0x0000e512c6e54998ll, 0x0000891ac73ae982ll, 0x0000e69f35065448ll,
+  0x0000888888888889ll, 0x0000e829fb693044ll, 0x000087f78087f781ll, 0x0000e9b31d93f98ell,
+  0x00008767ab5f34e5ll, 0x0000eb3a9f019750ll, 0x000086d905447a35ll, 0x0000ecc08321eb30ll,
+  0x0000864b8a7de6d2ll, 0x0000ee44cd59ffabll, 0x000085bf37612cefll, 0x0000efc781043579ll,
+  0x0000853408534086ll, 0x0000f148a170700all, 0x000084a9f9c8084bll, 0x0000f2c831e44116ll,
+  0x0000842108421085ll, 0x0000f446359b1353ll, 0x0000839930523fbfll, 0x0000f5c2afc65447ll,
+  0x000083126e978d50ll, 0x0000f73da38d9d4all, 0x0000828cbfbeb9a1ll, 0x0000f8b7140edbb1ll,
+  0x0000820820820821ll, 0x0000fa2f045e7832ll, 0x000081848da8faf1ll, 0x0000fba577877d7dll,
+  0x0000810204081021ll, 0x0000fd1a708bbe11ll, 0x0000808080808081ll, 0x0000fe8df263f957ll,
+  0x0000800000000000ll, 0x0000ffff00000000ll,
+  };
+
+
+// LL_tbl[k] = 2^48*log2(1.0+k/2^15);
+static int64_t __LL_tbl[256] = {
+  0x0000000000000000ull, 0x00000002e2a60a00ull, 0x000000070cb64ec5ull, 0x00000009ef50ce67ull,
+  0x0000000cd1e588fdull, 0x0000000fb4747e9cull, 0x0000001296fdaf5eull, 0x0000001579811b58ull,
+  0x000000185bfec2a1ull, 0x0000001b3e76a552ull, 0x0000001e20e8c380ull, 0x0000002103551d43ull,
+  0x00000023e5bbb2b2ull, 0x00000026c81c83e4ull, 0x00000029aa7790f0ull, 0x0000002c8cccd9edull,
+  0x0000002f6f1c5ef2ull, 0x0000003251662017ull, 0x0000003533aa1d71ull, 0x0000003815e8571aull,
+  0x0000003af820cd26ull, 0x0000003dda537faeull, 0x00000040bc806ec8ull, 0x000000439ea79a8cull,
+  0x0000004680c90310ull, 0x0000004962e4a86cull, 0x0000004c44fa8ab6ull, 0x0000004f270aaa06ull,
+  0x0000005209150672ull, 0x00000054eb19a013ull, 0x00000057cd1876fdull, 0x0000005aaf118b4aull,
+  0x0000005d9104dd0full, 0x0000006072f26c64ull, 0x0000006354da3960ull, 0x0000006636bc441aull,
+  0x0000006918988ca8ull, 0x0000006bfa6f1322ull, 0x0000006edc3fd79full, 0x00000071be0ada35ull,
+  0x000000749fd01afdull, 0x00000077818f9a0cull, 0x0000007a6349577aull, 0x0000007d44fd535eull,
+  0x0000008026ab8dceull, 0x00000083085406e3ull, 0x00000085e9f6beb2ull, 0x00000088cb93b552ull,
+  0x0000008bad2aeadcull, 0x0000008e8ebc5f65ull, 0x0000009170481305ull, 0x0000009451ce05d3ull,
+  0x00000097334e37e5ull, 0x0000009a14c8a953ull, 0x0000009cf63d5a33ull, 0x0000009fd7ac4a9dull,
+  0x000000a2b07f3458ull, 0x000000a59a78ea6aull, 0x000000a87bd699fbull, 0x000000ab5d2e8970ull,
+  0x000000ae3e80b8e3ull, 0x000000b11fcd2869ull, 0x000000b40113d818ull, 0x000000b6e254c80aull,
+  0x000000b9c38ff853ull, 0x000000bca4c5690cull, 0x000000bf85f51a4aull, 0x000000c2671f0c26ull,
+  0x000000c548433eb6ull, 0x000000c82961b211ull, 0x000000cb0a7a664dull, 0x000000cdeb8d5b82ull,
+  0x000000d0cc9a91c8ull, 0x000000d3ada20933ull, 0x000000d68ea3c1ddull, 0x000000d96f9fbbdbull,
+  0x000000dc5095f744ull, 0x000000df31867430ull, 0x000000e2127132b5ull, 0x000000e4f35632eaull,
+  0x000000e7d43574e6ull, 0x000000eab50ef8c1ull, 0x000000ed95e2be90ull, 0x000000f076b0c66cull,
+  0x000000f35779106aull, 0x000000f6383b9ca2ull, 0x000000f918f86b2aull, 0x000000fbf9af7c1aull,
+  0x000000feda60cf88ull, 0x00000101bb0c658cull, 0x000001049bb23e3cull, 0x000001077c5259afull,
+  0x0000010a5cecb7fcull, 0x0000010d3d81593aull, 0x000001101e103d7full, 0x00000112fe9964e4ull,
+  0x00000115df1ccf7eull, 0x00000118bf9a7d64ull, 0x0000011ba0126eadull, 0x0000011e8084a371ull,
+  0x0000012160f11bc6ull, 0x000001244157d7c3ull, 0x0000012721b8d77full, 0x0000012a02141b10ull,
+  0x0000012ce269a28eull, 0x0000012fc2b96e0full, 0x00000132a3037daaull, 0x000001358347d177ull,
+  0x000001386386698cull, 0x0000013b43bf45ffull, 0x0000013e23f266e9ull, 0x00000141041fcc5eull,
+  0x00000143e4477678ull, 0x00000146c469654bull, 0x00000149a48598f0ull, 0x0000014c849c117cull,
+  0x0000014f64accf08ull, 0x0000015244b7d1a9ull, 0x0000015524bd1976ull, 0x0000015804bca687ull,
+  0x0000015ae4b678f2ull, 0x0000015dc4aa90ceull, 0x00000160a498ee31ull, 0x0000016384819134ull,
+  0x00000166646479ecull, 0x000001694441a870ull, 0x0000016c24191cd7ull, 0x0000016df6ca19bdull,
+  0x00000171e3b6d7aaull, 0x00000174c37d1e44ull, 0x00000177a33dab1cull, 0x0000017a82f87e49ull,
+  0x0000017d62ad97e2ull, 0x00000180425cf7feull, 0x00000182b07f3458ull, 0x0000018601aa8c19ull,
+  0x00000188e148c046ull, 0x0000018bc0e13b52ull, 0x0000018ea073fd52ull, 0x000001918001065dull,
+  0x000001945f88568bull, 0x000001973f09edf2ull, 0x0000019a1e85ccaaull, 0x0000019cfdfbf2c8ull,
+  0x0000019fdd6c6063ull, 0x000001a2bcd71593ull, 0x000001a59c3c126eull, 0x000001a87b9b570bull,
+  0x000001ab5af4e380ull, 0x000001ae3a48b7e5ull, 0x000001b11996d450ull, 0x000001b3f8df38d9ull,
+  0x000001b6d821e595ull, 0x000001b9b75eda9bull, 0x000001bc96961803ull, 0x000001bf75c79de3ull,
+  0x000001c254f36c51ull, 0x000001c534198365ull, 0x000001c81339e336ull, 0x000001caf2548bd9ull,
+  0x000001cdd1697d67ull, 0x000001d0b078b7f5ull, 0x000001d38f823b9aull, 0x000001d66e86086dull,
+  0x000001d94d841e86ull, 0x000001dc2c7c7df9ull, 0x000001df0b6f26dfull, 0x000001e1ea5c194eull,
+  0x000001e4c943555dull, 0x000001e7a824db23ull, 0x000001ea8700aab5ull, 0x000001ed65d6c42bull,
+  0x000001f044a7279dull, 0x000001f32371d51full, 0x000001f60236cccaull, 0x000001f8e0f60eb3ull,
+  0x000001fbbfaf9af3ull, 0x000001fe9e63719eull, 0x000002017d1192ccull, 0x000002045bb9fe94ull,
+  0x000002073a5cb50dull, 0x00000209c06e6212ull, 0x0000020cf791026aull, 0x0000020fd622997cull,
+  0x00000212b07f3458ull, 0x000002159334a8d8ull, 0x0000021871b52150ull, 0x0000021b502fe517ull,
+  0x0000021d6a73a78full, 0x000002210d144eeeull, 0x00000223eb7df52cull, 0x00000226c9e1e713ull,
+  0x00000229a84024bbull, 0x0000022c23679b4eull, 0x0000022f64eb83a8ull, 0x000002324338a51bull,
+  0x00000235218012a9ull, 0x00000237ffc1cc69ull, 0x0000023a2c3b0ea4ull, 0x0000023d13ee805bull,
+  0x0000024035e9221full, 0x00000243788faf25ull, 0x0000024656b4e735ull, 0x00000247ed646bfeull,
+  0x0000024c12ee3d98ull, 0x0000024ef1025c1aull, 0x00000251cf10c799ull, 0x0000025492644d65ull,
+  0x000002578b1c85eeull, 0x0000025a6919d8f0ull, 0x0000025d13ee805bull, 0x0000026025036716ull,
+  0x0000026296453882ull, 0x00000265e0d62b53ull, 0x00000268beb701f3ull, 0x0000026b9c92265eull,
+  0x0000026d32f798a9ull, 0x00000271583758ebull, 0x000002743601673bull, 0x0000027713c5c3b0ull,
+  0x00000279f1846e5full, 0x0000027ccf3d6761ull, 0x0000027e6580aecbull, 0x000002828a9e44b3ull,
+  0x0000028568462932ull, 0x00000287bdbf5255ull, 0x0000028b2384de4aull, 0x0000028d13ee805bull,
+  0x0000029035e9221full, 0x0000029296453882ull, 0x0000029699bdfb61ull, 0x0000029902a37aabull,
+  0x0000029c54b864c9ull, 0x0000029deabd1083ull, 0x000002a20f9c0bb5ull, 0x000002a4c7605d61ull,
+  0x000002a7bdbf5255ull, 0x000002a96056dafcull, 0x000002ac3daf14efull, 0x000002af1b019ecaull,
+  0x000002b296453882ull, 0x000002b5d022d80full, 0x000002b8fa471cb3ull, 0x000002ba9012e713ull,
+  0x000002bd6d4901ccull, 0x000002c04a796cf6ull, 0x000002c327a428a6ull, 0x000002c61a5e8f4cull,
+  0x000002c8e1e891f6ull, 0x000002cbbf023fc2ull, 0x000002ce9c163e6eull, 0x000002d179248e13ull,
+  0x000002d4562d2ec6ull, 0x000002d73330209dull, 0x000002da102d63b0ull, 0x000002dced24f814ull,
+};
+
+
+
+
+#endif
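
The two tables are what let crush_ln() run without floating point: reciprocal and base-2 log values pre-scaled by 2^48. Outside the kernel they can be sanity-checked against the formulas in the header comments, e.g. (a sketch; compile with -lm, and expect small last-digit rounding differences):

    /* Illustrative check of the table formulas:
     *   __RH_LH_tbl[2k]   = 2^48 / (1 + k/128)
     *   __RH_LH_tbl[2k+1] = 2^48 * log2(1 + k/128)
     *   __LL_tbl[k]       = 2^48 * log2(1 + k/2^15) */
    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
    	int k = 1;
    	double two48 = ldexp(1.0, 48);

    	printf("RH[2] ~ %.0f (table: 0x0000fe03f80fe040)\n",
    	       two48 / (1.0 + k / 128.0));
    	printf("LH[3] ~ %.0f (table: 0x000002dfca16dde1)\n",
    	       two48 * log2(1.0 + k / 128.0));
    	printf("LL[1] ~ %.0f (table: 0x00000002e2a60a00)\n",
    	       two48 * log2(1.0 + k / 32768.0));
    	return 0;
    }
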
diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
index a1ef53c04415..5b47736d27d9 100644
--- a/net/ceph/crush/mapper.c
+++ b/net/ceph/crush/mapper.c
@@ -20,7 +20,7 @@
 
 #include <linux/crush/crush.h>
 #include <linux/crush/hash.h>
-#include <linux/crush/mapper.h>
+#include "crush_ln_table.h"
 
 /*
  * Implement the core CRUSH mapping algorithm.
@@ -238,6 +238,102 @@ static int bucket_straw_choose(struct crush_bucket_straw *bucket,
 	return bucket->h.items[high];
 }
 
+// compute 2^44*log2(input+1)
+uint64_t crush_ln(unsigned xin)
+{
+	unsigned x=xin, x1;
+	int iexpon, index1, index2;
+	uint64_t RH, LH, LL, xl64, result;
+
+	x++;
+
+	// normalize input
+	iexpon = 15;
+	while(!(x&0x18000)) { x<<=1; iexpon--; }
+
+	index1 = (x>>8)<<1;
+	// RH ~ 2^56/index1
+	RH = __RH_LH_tbl[index1 - 256];
+	// LH ~ 2^48 * log2(index1/256)
+	LH = __RH_LH_tbl[index1 + 1 - 256];
+
+	// RH*x ~ 2^48 * (2^15 + xf), xf<2^8
+	xl64 = (int64_t)x * RH;
+	xl64 >>= 48;
+	x1 = xl64;
+
+	result = iexpon;
+	result <<= (12 + 32);
+
+	index2 = x1 & 0xff;
+	// LL ~ 2^48*log2(1.0+index2/2^15)
+	LL = __LL_tbl[index2];
+
+	LH = LH + LL;
+
+	LH >>= (48-12 - 32);
+	result += LH;
+
+	return result;
+}
+
+
+/*
+ * straw2
+ *
+ * for reference, see:
+ *
+ * http://en.wikipedia.org/wiki/Exponential_distribution#Distribution_of_the_minimum_of_exponential_random_variables
+ *
+ */
+
+static int bucket_straw2_choose(struct crush_bucket_straw2 *bucket,
+				int x, int r)
+{
+	unsigned i, high = 0;
+	unsigned u;
+	unsigned w;
+	__s64 ln, draw, high_draw = 0;
+
+	for (i = 0; i < bucket->h.size; i++) {
+		w = bucket->item_weights[i];
+		if (w) {
+			u = crush_hash32_3(bucket->h.hash, x,
+					   bucket->h.items[i], r);
+			u &= 0xffff;
+
+			/*
+			 * for some reason slightly less than 0x10000 produces
+			 * a slightly more accurate distribution... probably a
+			 * rounding effect.
+			 *
+			 * the natural log lookup table maps [0,0xffff]
+			 * (corresponding to real numbers [1/0x10000, 1] to
+			 * [0, 0xffffffffffff] (corresponding to real numbers
+			 * [-11.090355,0]).
+			 */
+			ln = crush_ln(u) - 0x1000000000000ll;
+
+			/*
+			 * divide by 16.16 fixed-point weight. note
+			 * that the ln value is negative, so a larger
+			 * weight means a larger (less negative) value
+			 * for draw.
+			 */
+			draw = div64_s64(ln, w);
+		} else {
+			draw = S64_MIN;
+		}
+
+		if (i == 0 || draw > high_draw) {
+			high = i;
+			high_draw = draw;
+		}
+	}
+	return bucket->h.items[high];
+}
+
+
 static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
 {
 	dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r);
@@ -255,12 +351,16 @@ static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
 	case CRUSH_BUCKET_STRAW:
 		return bucket_straw_choose((struct crush_bucket_straw *)in,
 					   x, r);
+	case CRUSH_BUCKET_STRAW2:
+		return bucket_straw2_choose((struct crush_bucket_straw2 *)in,
+					    x, r);
 	default:
 		dprintk("unknown bucket %d alg %d\n", in->id, in->alg);
 		return in->items[0];
 	}
 }
 
+
 /*
  * true if device is marked "out" (failed, fully offloaded)
  * of the cluster
@@ -290,6 +390,7 @@ static int is_out(const struct crush_map *map,
  * @type: the type of item to choose
  * @out: pointer to output vector
  * @outpos: our position in that vector
+ * @out_size: size of the out vector
  * @tries: number of attempts to make
  * @recurse_tries: number of attempts to have recursive chooseleaf make
  * @local_retries: localized retries
@@ -304,6 +405,7 @@ static int crush_choose_firstn(const struct crush_map *map,
 			       const __u32 *weight, int weight_max,
 			       int x, int numrep, int type,
 			       int *out, int outpos,
+			       int out_size,
 			       unsigned int tries,
 			       unsigned int recurse_tries,
 			       unsigned int local_retries,
@@ -322,6 +424,7 @@ static int crush_choose_firstn(const struct crush_map *map,
 	int item = 0;
 	int itemtype;
 	int collide, reject;
+	int count = out_size;
 
 	dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d\n",
 		recurse_to_leaf ? "_LEAF" : "",
@@ -329,7 +432,7 @@ static int crush_choose_firstn(const struct crush_map *map,
 		tries, recurse_tries, local_retries, local_fallback_retries,
 		parent_r);
 
-	for (rep = outpos; rep < numrep; rep++) {
+	for (rep = outpos; rep < numrep && count > 0 ; rep++) {
 		/* keep trying until we get a non-out, non-colliding item */
 		ftotal = 0;
 		skip_rep = 0;
@@ -403,7 +506,7 @@ static int crush_choose_firstn(const struct crush_map *map,
 						map->buckets[-1-item],
 						weight, weight_max,
 						x, outpos+1, 0,
-						out2, outpos,
+						out2, outpos, count,
 						recurse_tries, 0,
 						local_retries,
 						local_fallback_retries,
@@ -463,6 +566,7 @@ reject:
 		dprintk("CHOOSE got %d\n", item);
 		out[outpos] = item;
 		outpos++;
+		count--;
 	}
 
 	dprintk("CHOOSE returns %d\n", outpos);
@@ -654,6 +758,7 @@ int crush_do_rule(const struct crush_map *map,
 	__u32 step;
 	int i, j;
 	int numrep;
+	int out_size;
 	/*
 	 * the original choose_total_tries value was off by one (it
 	 * counted "retries" and not "tries"). add one.
@@ -761,6 +866,7 @@ int crush_do_rule(const struct crush_map *map,
 						x, numrep,
 						curstep->arg2,
 						o+osize, j,
+						result_max-osize,
 						choose_tries,
 						recurse_tries,
 						choose_local_retries,
@@ -770,11 +876,13 @@ int crush_do_rule(const struct crush_map *map,
 						c+osize,
 						0);
 			} else {
+				out_size = ((numrep < (result_max-osize)) ?
+					    numrep : (result_max-osize));
 				crush_choose_indep(
 					map,
 					map->buckets[-1-w[i]],
 					weight, weight_max,
-					x, numrep, numrep,
+					x, out_size, numrep,
 					curstep->arg2,
 					o+osize, j,
 					choose_tries,
@@ -783,7 +891,7 @@ int crush_do_rule(const struct crush_map *map,
 					recurse_to_leaf,
 					c+osize,
 					0);
-				osize += numrep;
+				osize += out_size;
 			}
 		}
 
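
bucket_straw2_choose() is the fixed-point form of a classic weighted-sampling scheme: for u uniform in (0,1], ln(u)/w behaves like the negative of an Exp(w) variate, and taking the item with the maximum (least negative) draw selects each item with probability proportional to its weight, per the Wikipedia reference quoted in the patch. A floating-point reference version (illustrative; the kernel derives u deterministically from crush_hash32_3() rather than rand(), and uses the crush_ln() table instead of log()):

    /* Illustrative floating-point analogue of the straw2 draw. */
    #include <math.h>
    #include <stdlib.h>

    static int straw2_choose_ref(const double *weights, int n)
    {
    	int i, high = 0;
    	double draw, high_draw = -INFINITY;

    	for (i = 0; i < n; i++) {
    		if (weights[i] <= 0)
    			continue;	/* kernel uses S64_MIN instead */
    		/* u in (0,1]; deterministic per (x, item, r) in CRUSH */
    		double u = (rand() + 1.0) / ((double)RAND_MAX + 1.0);
    		/* log(u) is negative; a larger weight gives a larger
    		 * (less negative) draw, exactly as the kernel comment says */
    		draw = log(u) / weights[i];
    		if (draw > high_draw) {
    			high = i;
    			high_draw = draw;
    		}
    	}
    	return high;
    }
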
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index 14d9995097cc..593dc2eabcc8 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -22,6 +22,7 @@
  *      .../monmap      - current monmap
  *      .../osdc        - active osd requests
  *      .../monc        - mon client state
+ *      .../client_options - libceph-only (i.e. not rbd or cephfs) options
  *      .../dentry_lru  - dump contents of dentry lru
  *      .../caps        - expose cap (reservation) stats
  *      .../bdi         - symlink to ../../bdi/something
@@ -177,10 +178,24 @@ static int osdc_show(struct seq_file *s, void *pp)
 	return 0;
 }
 
+static int client_options_show(struct seq_file *s, void *p)
+{
+	struct ceph_client *client = s->private;
+	int ret;
+
+	ret = ceph_print_client_options(s, client);
+	if (ret)
+		return ret;
+
+	seq_putc(s, '\n');
+	return 0;
+}
+
 CEPH_DEFINE_SHOW_FUNC(monmap_show)
 CEPH_DEFINE_SHOW_FUNC(osdmap_show)
 CEPH_DEFINE_SHOW_FUNC(monc_show)
 CEPH_DEFINE_SHOW_FUNC(osdc_show)
+CEPH_DEFINE_SHOW_FUNC(client_options_show)
 
 int ceph_debugfs_init(void)
 {
@@ -242,6 +257,14 @@ int ceph_debugfs_client_init(struct ceph_client *client)
 	if (!client->debugfs_osdmap)
 		goto out;
 
+	client->debugfs_options = debugfs_create_file("client_options",
+						      0600,
+						      client->debugfs_dir,
+						      client,
+						      &client_options_show_fops);
+	if (!client->debugfs_options)
+		goto out;
+
 	return 0;
 
 out:
@@ -252,6 +275,7 @@ out:
 void ceph_debugfs_client_cleanup(struct ceph_client *client)
 {
 	dout("ceph_debugfs_client_cleanup %p\n", client);
+	debugfs_remove(client->debugfs_options);
 	debugfs_remove(client->debugfs_osdmap);
 	debugfs_remove(client->debugfs_monmap);
 	debugfs_remove(client->osdc.debugfs_file);
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index a9f4ae45b7fb..967080a9f043 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -505,8 +505,6 @@ static int ceph_tcp_connect(struct ceph_connection *con)
 		pr_err("connect %s error %d\n",
 		       ceph_pr_addr(&con->peer_addr.in_addr), ret);
 		sock_release(sock);
-		con->error_msg = "connect error";
-
 		return ret;
 	}
 
@@ -2145,12 +2143,10 @@ static int process_connect(struct ceph_connection *con)
 		 * to WAIT. This shouldn't happen if we are the
 		 * client.
 		 */
-		pr_err("process_connect got WAIT as client\n");
 		con->error_msg = "protocol error, got WAIT as client";
 		return -1;
 
 	default:
-		pr_err("connect protocol error, will retry\n");
 		con->error_msg = "protocol error, garbage tag during connect";
 		return -1;
 	}
@@ -2282,8 +2278,7 @@ static int read_partial_message(struct ceph_connection *con)
 
 	crc = crc32c(0, &con->in_hdr, offsetof(struct ceph_msg_header, crc));
 	if (cpu_to_le32(crc) != con->in_hdr.crc) {
-		pr_err("read_partial_message bad hdr "
-		       " crc %u != expected %u\n",
+		pr_err("read_partial_message bad hdr crc %u != expected %u\n",
 		       crc, con->in_hdr.crc);
 		return -EBADMSG;
 	}
@@ -2313,7 +2308,7 @@ static int read_partial_message(struct ceph_connection *con)
 		pr_err("read_partial_message bad seq %lld expected %lld\n",
 		       seq, con->in_seq + 1);
 		con->error_msg = "bad message sequence # for incoming message";
-		return -EBADMSG;
+		return -EBADE;
 	}
 
 	/* allocate message? */
@@ -2660,6 +2655,8 @@ more:
 		switch (ret) {
 		case -EBADMSG:
 			con->error_msg = "bad crc";
+			/* fall through */
+		case -EBADE:
 			ret = -EIO;
 			break;
 		case -EIO:
@@ -2838,7 +2835,8 @@ static void con_work(struct work_struct *work)
 		if (ret < 0) {
 			if (ret == -EAGAIN)
 				continue;
-			con->error_msg = "socket error on read";
+			if (!con->error_msg)
+				con->error_msg = "socket error on read";
 			fault = true;
 			break;
 		}
@@ -2847,7 +2845,8 @@ static void con_work(struct work_struct *work)
 		if (ret < 0) {
 			if (ret == -EAGAIN)
 				continue;
-			con->error_msg = "socket error on write";
+			if (!con->error_msg)
+				con->error_msg = "socket error on write";
 			fault = true;
 		}
 
@@ -2869,11 +2868,13 @@ static void con_work(struct work_struct *work)
  */
 static void con_fault(struct ceph_connection *con)
 {
-	pr_warn("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
-		ceph_pr_addr(&con->peer_addr.in_addr), con->error_msg);
 	dout("fault %p state %lu to peer %s\n",
 	     con, con->state, ceph_pr_addr(&con->peer_addr.in_addr));
 
+	pr_warn("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
+		ceph_pr_addr(&con->peer_addr.in_addr), con->error_msg);
+	con->error_msg = NULL;
+
 	WARN_ON(con->state != CON_STATE_CONNECTING &&
 		con->state != CON_STATE_NEGOTIATING &&
 		con->state != CON_STATE_OPEN);
@@ -3295,8 +3296,8 @@ static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip)
 		 */
 		if (*skip)
 			return 0;
-		con->error_msg = "error allocating memory for incoming message";
 
+		con->error_msg = "error allocating memory for incoming message";
 		return -ENOMEM;
 	}
 	memcpy(&con->in_msg->hdr, &con->in_hdr, sizeof(con->in_hdr));
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index b8c3fde5b04f..15796696d64e 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -122,6 +122,22 @@ bad:
 	return -EINVAL;
 }
 
+static int crush_decode_straw2_bucket(void **p, void *end,
+				      struct crush_bucket_straw2 *b)
+{
+	int j;
+	dout("crush_decode_straw2_bucket %p to %p\n", *p, end);
+	b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
+	if (b->item_weights == NULL)
+		return -ENOMEM;
+	ceph_decode_need(p, end, b->h.size * sizeof(u32), bad);
+	for (j = 0; j < b->h.size; j++)
+		b->item_weights[j] = ceph_decode_32(p);
+	return 0;
+bad:
+	return -EINVAL;
+}
+
 static int skip_name_map(void **p, void *end)
 {
 	int len;
@@ -204,6 +220,9 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
 		case CRUSH_BUCKET_STRAW:
 			size = sizeof(struct crush_bucket_straw);
 			break;
+		case CRUSH_BUCKET_STRAW2:
+			size = sizeof(struct crush_bucket_straw2);
+			break;
 		default:
 			err = -EINVAL;
 			goto bad;
@@ -261,6 +280,12 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
 			if (err < 0)
 				goto bad;
 			break;
+		case CRUSH_BUCKET_STRAW2:
+			err = crush_decode_straw2_bucket(p, end,
+				(struct crush_bucket_straw2 *)b);
+			if (err < 0)
+				goto bad;
+			break;
 		}
 	}
 
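
crush_decode_straw2_bucket() follows the usual libceph decode shape: ceph_decode_need() verifies that the buffer still holds the whole weight array (jumping to the bad label otherwise) before the values are consumed one by one. A hedged user-space sketch of that bounds-check-then-read pattern:

    /* Illustrative decode pattern: confirm 'n' values remain before
     * consuming them, bailing to an error label otherwise. Ceph wire
     * data is little-endian; the le32 swap is omitted here for brevity. */
    #include <stdint.h>
    #include <string.h>

    static int decode_weights(const uint8_t **p, const uint8_t *end,
    			  uint32_t *weights, size_t n)
    {
    	size_t j;

    	if ((size_t)(end - *p) < n * sizeof(uint32_t))
    		goto bad;	/* mirrors ceph_decode_need(..., bad) */
    	for (j = 0; j < n; j++) {
    		memcpy(&weights[j], *p, sizeof(uint32_t));
    		*p += sizeof(uint32_t);
    	}
    	return 0;
    bad:
    	return -1;
    }
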
diff --git a/net/core/dev.c b/net/core/dev.c
index af4a1b0adc10..2c1c67fad64d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2713,7 +2713,7 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device
 	if (unlikely(!skb))
 		goto out_null;
 
-	if (netif_needs_gso(dev, skb, features)) {
+	if (netif_needs_gso(skb, features)) {
 		struct sk_buff *segs;
 
 		segs = skb_gso_segment(skb, features);
@@ -3079,7 +3079,7 @@ static struct rps_dev_flow *
 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 	    struct rps_dev_flow *rflow, u16 next_cpu)
 {
-	if (next_cpu != RPS_NO_CPU) {
+	if (next_cpu < nr_cpu_ids) {
 #ifdef CONFIG_RFS_ACCEL
 		struct netdev_rx_queue *rxqueue;
 		struct rps_dev_flow_table *flow_table;
@@ -3184,7 +3184,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 		 * If the desired CPU (where last recvmsg was done) is
 		 * different from current CPU (one in the rx-queue flow
 		 * table entry), switch if one of the following holds:
-		 *   - Current CPU is unset (equal to RPS_NO_CPU).
+		 *   - Current CPU is unset (>= nr_cpu_ids).
 		 *   - Current CPU is offline.
 		 *   - The current CPU's queue tail has advanced beyond the
 		 *     last packet that was enqueued using this table entry.
@@ -3192,14 +3192,14 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 		 *     have been dequeued, thus preserving in order delivery.
 		 */
 		if (unlikely(tcpu != next_cpu) &&
-		    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
+		    (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
 		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
 		      rflow->last_qtail)) >= 0)) {
 			tcpu = next_cpu;
 			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
 		}
 
-		if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
+		if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
 			*rflowp = rflow;
 			cpu = tcpu;
 			goto done;
@@ -3240,14 +3240,14 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
 	struct rps_dev_flow_table *flow_table;
 	struct rps_dev_flow *rflow;
 	bool expire = true;
-	int cpu;
+	unsigned int cpu;
 
 	rcu_read_lock();
 	flow_table = rcu_dereference(rxqueue->rps_flow_table);
 	if (flow_table && flow_id <= flow_table->mask) {
 		rflow = &flow_table->flows[flow_id];
 		cpu = ACCESS_ONCE(rflow->cpu);
-		if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
+		if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
 		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
 			   rflow->last_qtail) <
 		     (int)(10 * flow_table->mask)))
@@ -5209,7 +5209,7 @@ static int __netdev_upper_dev_link(struct net_device *dev,
 	if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper))
 		return -EBUSY;
 
-	if (__netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper))
+	if (__netdev_find_adj(dev, upper_dev, &dev->adj_list.upper))
 		return -EEXIST;
 
 	if (master && netdev_master_upper_dev_get(dev))
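
The RPS hunks retire the RPS_NO_CPU sentinel in favor of treating any value >= nr_cpu_ids as "unset"; presumably this keeps the u16 flow-table field unambiguous however it is initialized, since every out-of-range value now reads the same way. The convention, as a sketch (nr_cpu_ids is stubbed here; in the kernel it is the number of possible CPU ids):

    /* Illustrative sentinel convention after the change. */
    #include <stdbool.h>

    static unsigned int nr_cpu_ids = 8;	/* stand-in value */

    static inline bool rps_cpu_is_set(unsigned int cpu)
    {
    	/* any out-of-range value (e.g. the 0xffff a u16 field holds
    	 * when unfilled) now reads as "no CPU recorded" */
    	return cpu < nr_cpu_ids;
    }
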
diff --git a/net/core/filter.c b/net/core/filter.c
index b669e75d2b36..bf831a85c315 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1175,12 +1175,27 @@ int sk_attach_bpf(u32 ufd, struct sock *sk)
 	return 0;
 }
 
+/**
+ *	bpf_skb_clone_not_writable - is the header of a clone not writable
+ *	@skb: buffer to check
+ *	@len: length up to which to write, can be negative
+ *
+ *	Returns true if modifying the header part of the cloned buffer
+ *	does require the data to be copied. I.e. this version works with
+ *	negative lengths needed for eBPF case!
+ */
+static bool bpf_skb_clone_unwritable(const struct sk_buff *skb, int len)
+{
+	return skb_header_cloned(skb) ||
+	       (int) skb_headroom(skb) + len > skb->hdr_len;
+}
+
 #define BPF_RECOMPUTE_CSUM(flags)	((flags) & 1)
 
 static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags)
 {
 	struct sk_buff *skb = (struct sk_buff *) (long) r1;
-	unsigned int offset = (unsigned int) r2;
+	int offset = (int) r2;
 	void *from = (void *) (long) r3;
 	unsigned int len = (unsigned int) r4;
 	char buf[16];
@@ -1194,10 +1209,12 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags)
 	 *
 	 * so check for invalid 'offset' and too large 'len'
 	 */
-	if (unlikely(offset > 0xffff || len > sizeof(buf)))
+	if (unlikely((u32) offset > 0xffff || len > sizeof(buf)))
 		return -EFAULT;
 
-	if (skb_cloned(skb) && !skb_clone_writable(skb, offset + len))
+	offset -= skb->data - skb_mac_header(skb);
+	if (unlikely(skb_cloned(skb) &&
+		     bpf_skb_clone_unwritable(skb, offset + len)))
 		return -EFAULT;
 
 	ptr = skb_header_pointer(skb, offset, len, buf);
@@ -1232,15 +1249,18 @@ const struct bpf_func_proto bpf_skb_store_bytes_proto = {
 #define BPF_HEADER_FIELD_SIZE(flags)	((flags) & 0x0f)
 #define BPF_IS_PSEUDO_HEADER(flags)	((flags) & 0x10)
 
-static u64 bpf_l3_csum_replace(u64 r1, u64 offset, u64 from, u64 to, u64 flags)
+static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
 {
 	struct sk_buff *skb = (struct sk_buff *) (long) r1;
+	int offset = (int) r2;
 	__sum16 sum, *ptr;
 
-	if (unlikely(offset > 0xffff))
+	if (unlikely((u32) offset > 0xffff))
 		return -EFAULT;
 
-	if (skb_cloned(skb) && !skb_clone_writable(skb, offset + sizeof(sum)))
+	offset -= skb->data - skb_mac_header(skb);
+	if (unlikely(skb_cloned(skb) &&
+		     bpf_skb_clone_unwritable(skb, offset + sizeof(sum))))
 		return -EFAULT;
 
 	ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum);
@@ -1276,16 +1296,19 @@ const struct bpf_func_proto bpf_l3_csum_replace_proto = {
1276 .arg5_type = ARG_ANYTHING, 1296 .arg5_type = ARG_ANYTHING,
1277}; 1297};
1278 1298
1279static u64 bpf_l4_csum_replace(u64 r1, u64 offset, u64 from, u64 to, u64 flags) 1299static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
1280{ 1300{
1281 struct sk_buff *skb = (struct sk_buff *) (long) r1; 1301 struct sk_buff *skb = (struct sk_buff *) (long) r1;
1282 u32 is_pseudo = BPF_IS_PSEUDO_HEADER(flags); 1302 u32 is_pseudo = BPF_IS_PSEUDO_HEADER(flags);
1303 int offset = (int) r2;
1283 __sum16 sum, *ptr; 1304 __sum16 sum, *ptr;
1284 1305
1285 if (unlikely(offset > 0xffff)) 1306 if (unlikely((u32) offset > 0xffff))
1286 return -EFAULT; 1307 return -EFAULT;
1287 1308
1288 if (skb_cloned(skb) && !skb_clone_writable(skb, offset + sizeof(sum))) 1309 offset -= skb->data - skb_mac_header(skb);
1310 if (unlikely(skb_cloned(skb) &&
1311 bpf_skb_clone_unwritable(skb, offset + sizeof(sum))))
1289 return -EFAULT; 1312 return -EFAULT;
1290 1313
1291 ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum); 1314 ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum);
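The three helper hunks above share one conversion: eBPF programs now pass offsets relative to skb->mac_header, and each helper rebases them onto skb->data before use. A minimal sketch of that rebasing, with a hypothetical helper name (the kernel open-codes it as "offset -= skb->data - skb_mac_header(skb)"):

#include <linux/skbuff.h>

/* Sketch only: rebase a MAC-header-relative offset onto skb->data.
 * On ingress, skb->data typically points at the network header, so
 * (skb->data - skb_mac_header(skb)) is positive and the rebased
 * offset can go negative while still addressing valid header bytes;
 * that is why the helpers switched from unsigned to signed offsets.
 */
static int rebase_mac_offset(const struct sk_buff *skb, int mac_off)
{
	return mac_off - (int)(skb->data - skb_mac_header(skb));
}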
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index a3abb719221f..572af0011997 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -16,7 +16,6 @@
16#include <linux/export.h> 16#include <linux/export.h>
17#include <linux/user_namespace.h> 17#include <linux/user_namespace.h>
18#include <linux/net_namespace.h> 18#include <linux/net_namespace.h>
19#include <linux/rtnetlink.h>
20#include <net/sock.h> 19#include <net/sock.h>
21#include <net/netlink.h> 20#include <net/netlink.h>
22#include <net/net_namespace.h> 21#include <net/net_namespace.h>
@@ -602,7 +601,7 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh)
602 } 601 }
603 602
604 err = rtnl_net_fill(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, 603 err = rtnl_net_fill(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
605 RTM_GETNSID, net, peer, -1); 604 RTM_NEWNSID, net, peer, -1);
606 if (err < 0) 605 if (err < 0)
607 goto err_out; 606 goto err_out;
608 607
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 358d52a38533..666e0928ba40 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -2854,7 +2854,7 @@ static int brport_nla_put_flag(struct sk_buff *skb, u32 flags, u32 mask,
2854 2854
2855int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, 2855int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
2856 struct net_device *dev, u16 mode, 2856 struct net_device *dev, u16 mode,
2857 u32 flags, u32 mask) 2857 u32 flags, u32 mask, int nlflags)
2858{ 2858{
2859 struct nlmsghdr *nlh; 2859 struct nlmsghdr *nlh;
2860 struct ifinfomsg *ifm; 2860 struct ifinfomsg *ifm;
@@ -2863,7 +2863,7 @@ int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
2863 u8 operstate = netif_running(dev) ? dev->operstate : IF_OPER_DOWN; 2863 u8 operstate = netif_running(dev) ? dev->operstate : IF_OPER_DOWN;
2864 struct net_device *br_dev = netdev_master_upper_dev_get(dev); 2864 struct net_device *br_dev = netdev_master_upper_dev_get(dev);
2865 2865
2866 nlh = nlmsg_put(skb, pid, seq, RTM_NEWLINK, sizeof(*ifm), NLM_F_MULTI); 2866 nlh = nlmsg_put(skb, pid, seq, RTM_NEWLINK, sizeof(*ifm), nlflags);
2867 if (nlh == NULL) 2867 if (nlh == NULL)
2868 return -EMSGSIZE; 2868 return -EMSGSIZE;
2869 2869
@@ -2969,7 +2969,8 @@ static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb)
2969 if (br_dev && br_dev->netdev_ops->ndo_bridge_getlink) { 2969 if (br_dev && br_dev->netdev_ops->ndo_bridge_getlink) {
2970 if (idx >= cb->args[0] && 2970 if (idx >= cb->args[0] &&
2971 br_dev->netdev_ops->ndo_bridge_getlink( 2971 br_dev->netdev_ops->ndo_bridge_getlink(
2972 skb, portid, seq, dev, filter_mask) < 0) 2972 skb, portid, seq, dev, filter_mask,
2973 NLM_F_MULTI) < 0)
2973 break; 2974 break;
2974 idx++; 2975 idx++;
2975 } 2976 }
@@ -2977,7 +2978,8 @@ static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb)
2977 if (ops->ndo_bridge_getlink) { 2978 if (ops->ndo_bridge_getlink) {
2978 if (idx >= cb->args[0] && 2979 if (idx >= cb->args[0] &&
2979 ops->ndo_bridge_getlink(skb, portid, seq, dev, 2980 ops->ndo_bridge_getlink(skb, portid, seq, dev,
2980 filter_mask) < 0) 2981 filter_mask,
2982 NLM_F_MULTI) < 0)
2981 break; 2983 break;
2982 idx++; 2984 idx++;
2983 } 2985 }
@@ -3018,7 +3020,7 @@ static int rtnl_bridge_notify(struct net_device *dev)
3018 goto errout; 3020 goto errout;
3019 } 3021 }
3020 3022
3021 err = dev->netdev_ops->ndo_bridge_getlink(skb, 0, 0, dev, 0); 3023 err = dev->netdev_ops->ndo_bridge_getlink(skb, 0, 0, dev, 0, 0);
3022 if (err < 0) 3024 if (err < 0)
3023 goto errout; 3025 goto errout;
3024 3026
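The new nlflags argument lets callers distinguish a multi-part dump (NLM_F_MULTI, as in rtnl_bridge_getlink()) from a one-shot notification (0, as in rtnl_bridge_notify()). As a hedged sketch, a driver that delegates to the default helper would simply pass the flag through; foo_ndo_bridge_getlink() is a hypothetical name:

#include <linux/netdevice.h>
#include <linux/if_bridge.h>

static int foo_ndo_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
				  struct net_device *dev, u32 filter_mask,
				  int nlflags)
{
	/* forward nlflags so dump replies carry NLM_F_MULTI and
	 * notifications do not */
	return ndo_dflt_bridge_getlink(skb, pid, seq, dev,
				       BRIDGE_MODE_UNDEF, 0, 0, nlflags);
}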
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 3b6e5830256e..3cfff2a3d651 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -280,13 +280,14 @@ nodata:
280EXPORT_SYMBOL(__alloc_skb); 280EXPORT_SYMBOL(__alloc_skb);
281 281
282/** 282/**
283 * build_skb - build a network buffer 283 * __build_skb - build a network buffer
284 * @data: data buffer provided by caller 284 * @data: data buffer provided by caller
285 * @frag_size: size of fragment, or 0 if head was kmalloced 285 * @frag_size: size of data, or 0 if head was kmalloced
286 * 286 *
287 * Allocate a new &sk_buff. Caller provides space holding head and 287 * Allocate a new &sk_buff. Caller provides space holding head and
288 * skb_shared_info. @data must have been allocated by kmalloc() only if 288 * skb_shared_info. @data must have been allocated by kmalloc() only if
289 * @frag_size is 0, otherwise data should come from the page allocator. 289 * @frag_size is 0, otherwise data should come from the page allocator
290 * or vmalloc()
290 * The return is the new skb buffer. 291 * The return is the new skb buffer.
291 * On a failure the return is %NULL, and @data is not freed. 292 * On a failure the return is %NULL, and @data is not freed.
292 * Notes : 293 * Notes :
@@ -297,7 +298,7 @@ EXPORT_SYMBOL(__alloc_skb);
297 * before giving packet to stack. 298 * before giving packet to stack.
298 * RX rings only contains data buffers, not full skbs. 299 * RX rings only contains data buffers, not full skbs.
299 */ 300 */
300struct sk_buff *build_skb(void *data, unsigned int frag_size) 301struct sk_buff *__build_skb(void *data, unsigned int frag_size)
301{ 302{
302 struct skb_shared_info *shinfo; 303 struct skb_shared_info *shinfo;
303 struct sk_buff *skb; 304 struct sk_buff *skb;
@@ -311,7 +312,6 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size)
311 312
312 memset(skb, 0, offsetof(struct sk_buff, tail)); 313 memset(skb, 0, offsetof(struct sk_buff, tail));
313 skb->truesize = SKB_TRUESIZE(size); 314 skb->truesize = SKB_TRUESIZE(size);
314 skb->head_frag = frag_size != 0;
315 atomic_set(&skb->users, 1); 315 atomic_set(&skb->users, 1);
316 skb->head = data; 316 skb->head = data;
317 skb->data = data; 317 skb->data = data;
@@ -328,6 +328,23 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size)
328 328
329 return skb; 329 return skb;
330} 330}
331
 332/* build_skb() is a wrapper over __build_skb() that additionally
 333 * sets skb->head_frag and skb->pfmemalloc.
334 * This means that if @frag_size is not zero, then @data must be backed
335 * by a page fragment, not kmalloc() or vmalloc()
336 */
337struct sk_buff *build_skb(void *data, unsigned int frag_size)
338{
339 struct sk_buff *skb = __build_skb(data, frag_size);
340
341 if (skb && frag_size) {
342 skb->head_frag = 1;
343 if (virt_to_head_page(data)->pfmemalloc)
344 skb->pfmemalloc = 1;
345 }
346 return skb;
347}
331EXPORT_SYMBOL(build_skb); 348EXPORT_SYMBOL(build_skb);
332 349
333struct netdev_alloc_cache { 350struct netdev_alloc_cache {
@@ -348,7 +365,8 @@ static struct page *__page_frag_refill(struct netdev_alloc_cache *nc,
348 gfp_t gfp = gfp_mask; 365 gfp_t gfp = gfp_mask;
349 366
350 if (order) { 367 if (order) {
351 gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY; 368 gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
369 __GFP_NOMEMALLOC;
352 page = alloc_pages_node(NUMA_NO_NODE, gfp_mask, order); 370 page = alloc_pages_node(NUMA_NO_NODE, gfp_mask, order);
353 nc->frag.size = PAGE_SIZE << (page ? order : 0); 371 nc->frag.size = PAGE_SIZE << (page ? order : 0);
354 } 372 }
@@ -4124,19 +4142,21 @@ EXPORT_SYMBOL(skb_try_coalesce);
4124 */ 4142 */
4125void skb_scrub_packet(struct sk_buff *skb, bool xnet) 4143void skb_scrub_packet(struct sk_buff *skb, bool xnet)
4126{ 4144{
4127 if (xnet)
4128 skb_orphan(skb);
4129 skb->tstamp.tv64 = 0; 4145 skb->tstamp.tv64 = 0;
4130 skb->pkt_type = PACKET_HOST; 4146 skb->pkt_type = PACKET_HOST;
4131 skb->skb_iif = 0; 4147 skb->skb_iif = 0;
4132 skb->ignore_df = 0; 4148 skb->ignore_df = 0;
4133 skb_dst_drop(skb); 4149 skb_dst_drop(skb);
4134 skb->mark = 0;
4135 skb_sender_cpu_clear(skb); 4150 skb_sender_cpu_clear(skb);
4136 skb_init_secmark(skb);
4137 secpath_reset(skb); 4151 secpath_reset(skb);
4138 nf_reset(skb); 4152 nf_reset(skb);
4139 nf_reset_trace(skb); 4153 nf_reset_trace(skb);
4154
4155 if (!xnet)
4156 return;
4157
4158 skb_orphan(skb);
4159 skb->mark = 0;
4140} 4160}
4141EXPORT_SYMBOL_GPL(skb_scrub_packet); 4161EXPORT_SYMBOL_GPL(skb_scrub_packet);
4142 4162
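After the split, __build_skb() no longer assumes the head is a page fragment, so callers that kmalloc() the head can use it directly, while drivers keep build_skb() for page-fragment heads. A short usage sketch under those assumptions:

#include <linux/skbuff.h>

static struct sk_buff *wrap_kmalloced_head(void *data)
{
	/* head was kmalloc()ed: frag_size == 0, head_frag stays clear */
	return __build_skb(data, 0);
}

static struct sk_buff *wrap_page_frag_head(void *frag, unsigned int size)
{
	/* head is a page fragment: build_skb() sets head_frag and
	 * propagates pfmemalloc from the backing page */
	return build_skb(frag, size);
}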
diff --git a/net/core/sock.c b/net/core/sock.c
index e891bcf325ca..292f42228bfb 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1474,8 +1474,8 @@ void sk_release_kernel(struct sock *sk)
1474 return; 1474 return;
1475 1475
1476 sock_hold(sk); 1476 sock_hold(sk);
1477 sock_net_set(sk, get_net(&init_net));
1478 sock_release(sk->sk_socket); 1477 sock_release(sk->sk_socket);
1478 sock_net_set(sk, get_net(&init_net));
1479 sock_put(sk); 1479 sock_put(sk);
1480} 1480}
1481EXPORT_SYMBOL(sk_release_kernel); 1481EXPORT_SYMBOL(sk_release_kernel);
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 2b4f21d34df6..ccf4c5629b3c 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -453,7 +453,8 @@ static struct sock *dccp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
453 iph->saddr, iph->daddr); 453 iph->saddr, iph->daddr);
454 if (req) { 454 if (req) {
455 nsk = dccp_check_req(sk, skb, req); 455 nsk = dccp_check_req(sk, skb, req);
456 reqsk_put(req); 456 if (!nsk)
457 reqsk_put(req);
457 return nsk; 458 return nsk;
458 } 459 }
459 nsk = inet_lookup_established(sock_net(sk), &dccp_hashinfo, 460 nsk = inet_lookup_established(sock_net(sk), &dccp_hashinfo,
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 9d0551092c6c..5165571f397a 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -301,7 +301,8 @@ static struct sock *dccp_v6_hnd_req(struct sock *sk,struct sk_buff *skb)
301 &iph->daddr, inet6_iif(skb)); 301 &iph->daddr, inet6_iif(skb));
302 if (req) { 302 if (req) {
303 nsk = dccp_check_req(sk, skb, req); 303 nsk = dccp_check_req(sk, skb, req);
304 reqsk_put(req); 304 if (!nsk)
305 reqsk_put(req);
305 return nsk; 306 return nsk;
306 } 307 }
307 nsk = __inet6_lookup_established(sock_net(sk), &dccp_hashinfo, 308 nsk = __inet6_lookup_established(sock_net(sk), &dccp_hashinfo,
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
index 5f566663e47f..30addee2dd03 100644
--- a/net/dccp/minisocks.c
+++ b/net/dccp/minisocks.c
@@ -186,8 +186,7 @@ struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb,
186 if (child == NULL) 186 if (child == NULL)
187 goto listen_overflow; 187 goto listen_overflow;
188 188
189 inet_csk_reqsk_queue_unlink(sk, req); 189 inet_csk_reqsk_queue_drop(sk, req);
190 inet_csk_reqsk_queue_removed(sk, req);
191 inet_csk_reqsk_queue_add(sk, req, child); 190 inet_csk_reqsk_queue_add(sk, req, child);
192out: 191out:
193 return child; 192 return child;
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index 5eaadabe23a1..e6f6cc3a1bcf 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -124,7 +124,7 @@ static ssize_t temp1_max_store(struct device *dev,
124 124
125 return count; 125 return count;
126} 126}
127static DEVICE_ATTR(temp1_max, S_IRUGO, temp1_max_show, temp1_max_store); 127static DEVICE_ATTR_RW(temp1_max);
128 128
129static ssize_t temp1_max_alarm_show(struct device *dev, 129static ssize_t temp1_max_alarm_show(struct device *dev,
130 struct device_attribute *attr, char *buf) 130 struct device_attribute *attr, char *buf)
@@ -159,8 +159,8 @@ static umode_t dsa_hwmon_attrs_visible(struct kobject *kobj,
159 if (index == 1) { 159 if (index == 1) {
160 if (!drv->get_temp_limit) 160 if (!drv->get_temp_limit)
161 mode = 0; 161 mode = 0;
162 else if (drv->set_temp_limit) 162 else if (!drv->set_temp_limit)
163 mode |= S_IWUSR; 163 mode &= ~S_IWUSR;
164 } else if (index == 2 && !drv->get_temp_alarm) { 164 } else if (index == 2 && !drv->get_temp_alarm) {
165 mode = 0; 165 mode = 0;
166 } 166 }
@@ -633,7 +633,7 @@ static int dsa_of_probe(struct device *dev)
633 if (cd->sw_addr > PHY_MAX_ADDR) 633 if (cd->sw_addr > PHY_MAX_ADDR)
634 continue; 634 continue;
635 635
636 if (!of_property_read_u32(np, "eeprom-length", &eeprom_len)) 636 if (!of_property_read_u32(child, "eeprom-length", &eeprom_len))
637 cd->eeprom_len = eeprom_len; 637 cd->eeprom_len = eeprom_len;
638 638
639 for_each_available_child_of_node(child, port) { 639 for_each_available_child_of_node(child, port) {
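The attribute is now declared RW up front (DEVICE_ATTR_RW() gives 0644), so the visibility hook only takes the write bit away when the driver lacks a setter; the old code started read-only and tried to add S_IWUSR, which, as far as I can tell, sysfs is_visible() callbacks are not guaranteed to support. A condensed sketch of the corrected rule, with a hypothetical helper name:

static umode_t temp1_max_visible(struct dsa_switch_driver *drv, umode_t mode)
{
	if (!drv->get_temp_limit)
		return 0;		/* no limit support: hide the file */
	if (!drv->set_temp_limit)
		mode &= ~S_IWUSR;	/* readable but not writable */
	return mode;
}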
diff --git a/net/ieee802154/Makefile b/net/ieee802154/Makefile
index 05dab2957cd4..4adfd4d5471b 100644
--- a/net/ieee802154/Makefile
+++ b/net/ieee802154/Makefile
@@ -3,7 +3,9 @@ obj-$(CONFIG_IEEE802154_SOCKET) += ieee802154_socket.o
3obj-y += 6lowpan/ 3obj-y += 6lowpan/
4 4
5ieee802154-y := netlink.o nl-mac.o nl-phy.o nl_policy.o core.o \ 5ieee802154-y := netlink.o nl-mac.o nl-phy.o nl_policy.o core.o \
6 header_ops.o sysfs.o nl802154.o 6 header_ops.o sysfs.o nl802154.o trace.o
7ieee802154_socket-y := socket.o 7ieee802154_socket-y := socket.o
8 8
9CFLAGS_trace.o := -I$(src)
10
9ccflags-y += -D__CHECK_ENDIAN__ 11ccflags-y += -D__CHECK_ENDIAN__
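CFLAGS_trace.o := -I$(src) is what makes the tracepoint boilerplate work: define_trace.h re-includes TRACE_INCLUDE_FILE relative to TRACE_INCLUDE_PATH, which the new trace.h (below) sets to the current directory. Reduced to its moving parts, the pairing looks like this:

/* tail of trace.h -- see the full header later in this patch */
#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH .	/* resolved via CFLAGS_trace.o := -I$(src) */
#undef TRACE_INCLUDE_FILE
#define TRACE_INCLUDE_FILE trace	/* re-reads ./trace.h */
#include <trace/define_trace.h>

/* trace.c -- exactly one object defines the tracepoint bodies */
#define CREATE_TRACE_POINTS
#include "trace.h"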
diff --git a/net/ieee802154/nl-phy.c b/net/ieee802154/nl-phy.c
index 1b9d25f6e898..346c6665d25e 100644
--- a/net/ieee802154/nl-phy.c
+++ b/net/ieee802154/nl-phy.c
@@ -175,6 +175,7 @@ int ieee802154_add_iface(struct sk_buff *skb, struct genl_info *info)
175 int rc = -ENOBUFS; 175 int rc = -ENOBUFS;
176 struct net_device *dev; 176 struct net_device *dev;
177 int type = __IEEE802154_DEV_INVALID; 177 int type = __IEEE802154_DEV_INVALID;
178 unsigned char name_assign_type;
178 179
179 pr_debug("%s\n", __func__); 180 pr_debug("%s\n", __func__);
180 181
@@ -190,8 +191,10 @@ int ieee802154_add_iface(struct sk_buff *skb, struct genl_info *info)
190 if (devname[nla_len(info->attrs[IEEE802154_ATTR_DEV_NAME]) - 1] 191 if (devname[nla_len(info->attrs[IEEE802154_ATTR_DEV_NAME]) - 1]
191 != '\0') 192 != '\0')
192 return -EINVAL; /* phy name should be null-terminated */ 193 return -EINVAL; /* phy name should be null-terminated */
194 name_assign_type = NET_NAME_USER;
193 } else { 195 } else {
194 devname = "wpan%d"; 196 devname = "wpan%d";
197 name_assign_type = NET_NAME_ENUM;
195 } 198 }
196 199
197 if (strlen(devname) >= IFNAMSIZ) 200 if (strlen(devname) >= IFNAMSIZ)
@@ -221,7 +224,7 @@ int ieee802154_add_iface(struct sk_buff *skb, struct genl_info *info)
221 } 224 }
222 225
223 dev = rdev_add_virtual_intf_deprecated(wpan_phy_to_rdev(phy), devname, 226 dev = rdev_add_virtual_intf_deprecated(wpan_phy_to_rdev(phy), devname,
224 type); 227 name_assign_type, type);
225 if (IS_ERR(dev)) { 228 if (IS_ERR(dev)) {
226 rc = PTR_ERR(dev); 229 rc = PTR_ERR(dev);
227 goto nla_put_failure; 230 goto nla_put_failure;
diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c
index a4daf91b8d0a..f3c12f6a4a39 100644
--- a/net/ieee802154/nl802154.c
+++ b/net/ieee802154/nl802154.c
@@ -589,7 +589,7 @@ static int nl802154_new_interface(struct sk_buff *skb, struct genl_info *info)
589 589
590 return rdev_add_virtual_intf(rdev, 590 return rdev_add_virtual_intf(rdev,
591 nla_data(info->attrs[NL802154_ATTR_IFNAME]), 591 nla_data(info->attrs[NL802154_ATTR_IFNAME]),
592 type, extended_addr); 592 NET_NAME_USER, type, extended_addr);
593} 593}
594 594
595static int nl802154_del_interface(struct sk_buff *skb, struct genl_info *info) 595static int nl802154_del_interface(struct sk_buff *skb, struct genl_info *info)
diff --git a/net/ieee802154/rdev-ops.h b/net/ieee802154/rdev-ops.h
index 7c46732fad2b..7b5a9dd94fe5 100644
--- a/net/ieee802154/rdev-ops.h
+++ b/net/ieee802154/rdev-ops.h
@@ -4,13 +4,16 @@
4#include <net/cfg802154.h> 4#include <net/cfg802154.h>
5 5
6#include "core.h" 6#include "core.h"
7#include "trace.h"
7 8
8static inline struct net_device * 9static inline struct net_device *
9rdev_add_virtual_intf_deprecated(struct cfg802154_registered_device *rdev, 10rdev_add_virtual_intf_deprecated(struct cfg802154_registered_device *rdev,
10 const char *name, int type) 11 const char *name,
12 unsigned char name_assign_type,
13 int type)
11{ 14{
12 return rdev->ops->add_virtual_intf_deprecated(&rdev->wpan_phy, name, 15 return rdev->ops->add_virtual_intf_deprecated(&rdev->wpan_phy, name,
13 type); 16 name_assign_type, type);
14} 17}
15 18
16static inline void 19static inline void
@@ -22,75 +25,131 @@ rdev_del_virtual_intf_deprecated(struct cfg802154_registered_device *rdev,
22 25
23static inline int 26static inline int
24rdev_add_virtual_intf(struct cfg802154_registered_device *rdev, char *name, 27rdev_add_virtual_intf(struct cfg802154_registered_device *rdev, char *name,
28 unsigned char name_assign_type,
25 enum nl802154_iftype type, __le64 extended_addr) 29 enum nl802154_iftype type, __le64 extended_addr)
26{ 30{
27 return rdev->ops->add_virtual_intf(&rdev->wpan_phy, name, type, 31 int ret;
32
33 trace_802154_rdev_add_virtual_intf(&rdev->wpan_phy, name, type,
28 extended_addr); 34 extended_addr);
35 ret = rdev->ops->add_virtual_intf(&rdev->wpan_phy, name,
36 name_assign_type, type,
37 extended_addr);
38 trace_802154_rdev_return_int(&rdev->wpan_phy, ret);
39 return ret;
29} 40}
30 41
31static inline int 42static inline int
32rdev_del_virtual_intf(struct cfg802154_registered_device *rdev, 43rdev_del_virtual_intf(struct cfg802154_registered_device *rdev,
33 struct wpan_dev *wpan_dev) 44 struct wpan_dev *wpan_dev)
34{ 45{
35 return rdev->ops->del_virtual_intf(&rdev->wpan_phy, wpan_dev); 46 int ret;
47
48 trace_802154_rdev_del_virtual_intf(&rdev->wpan_phy, wpan_dev);
49 ret = rdev->ops->del_virtual_intf(&rdev->wpan_phy, wpan_dev);
50 trace_802154_rdev_return_int(&rdev->wpan_phy, ret);
51 return ret;
36} 52}
37 53
38static inline int 54static inline int
39rdev_set_channel(struct cfg802154_registered_device *rdev, u8 page, u8 channel) 55rdev_set_channel(struct cfg802154_registered_device *rdev, u8 page, u8 channel)
40{ 56{
41 return rdev->ops->set_channel(&rdev->wpan_phy, page, channel); 57 int ret;
58
59 trace_802154_rdev_set_channel(&rdev->wpan_phy, page, channel);
60 ret = rdev->ops->set_channel(&rdev->wpan_phy, page, channel);
61 trace_802154_rdev_return_int(&rdev->wpan_phy, ret);
62 return ret;
42} 63}
43 64
44static inline int 65static inline int
45rdev_set_cca_mode(struct cfg802154_registered_device *rdev, 66rdev_set_cca_mode(struct cfg802154_registered_device *rdev,
46 const struct wpan_phy_cca *cca) 67 const struct wpan_phy_cca *cca)
47{ 68{
48 return rdev->ops->set_cca_mode(&rdev->wpan_phy, cca); 69 int ret;
70
71 trace_802154_rdev_set_cca_mode(&rdev->wpan_phy, cca);
72 ret = rdev->ops->set_cca_mode(&rdev->wpan_phy, cca);
73 trace_802154_rdev_return_int(&rdev->wpan_phy, ret);
74 return ret;
49} 75}
50 76
51static inline int 77static inline int
52rdev_set_pan_id(struct cfg802154_registered_device *rdev, 78rdev_set_pan_id(struct cfg802154_registered_device *rdev,
53 struct wpan_dev *wpan_dev, __le16 pan_id) 79 struct wpan_dev *wpan_dev, __le16 pan_id)
54{ 80{
55 return rdev->ops->set_pan_id(&rdev->wpan_phy, wpan_dev, pan_id); 81 int ret;
82
83 trace_802154_rdev_set_pan_id(&rdev->wpan_phy, wpan_dev, pan_id);
84 ret = rdev->ops->set_pan_id(&rdev->wpan_phy, wpan_dev, pan_id);
85 trace_802154_rdev_return_int(&rdev->wpan_phy, ret);
86 return ret;
56} 87}
57 88
58static inline int 89static inline int
59rdev_set_short_addr(struct cfg802154_registered_device *rdev, 90rdev_set_short_addr(struct cfg802154_registered_device *rdev,
60 struct wpan_dev *wpan_dev, __le16 short_addr) 91 struct wpan_dev *wpan_dev, __le16 short_addr)
61{ 92{
62 return rdev->ops->set_short_addr(&rdev->wpan_phy, wpan_dev, short_addr); 93 int ret;
94
95 trace_802154_rdev_set_short_addr(&rdev->wpan_phy, wpan_dev, short_addr);
96 ret = rdev->ops->set_short_addr(&rdev->wpan_phy, wpan_dev, short_addr);
97 trace_802154_rdev_return_int(&rdev->wpan_phy, ret);
98 return ret;
63} 99}
64 100
65static inline int 101static inline int
66rdev_set_backoff_exponent(struct cfg802154_registered_device *rdev, 102rdev_set_backoff_exponent(struct cfg802154_registered_device *rdev,
67 struct wpan_dev *wpan_dev, u8 min_be, u8 max_be) 103 struct wpan_dev *wpan_dev, u8 min_be, u8 max_be)
68{ 104{
69 return rdev->ops->set_backoff_exponent(&rdev->wpan_phy, wpan_dev, 105 int ret;
106
107 trace_802154_rdev_set_backoff_exponent(&rdev->wpan_phy, wpan_dev,
70 min_be, max_be); 108 min_be, max_be);
109 ret = rdev->ops->set_backoff_exponent(&rdev->wpan_phy, wpan_dev,
110 min_be, max_be);
111 trace_802154_rdev_return_int(&rdev->wpan_phy, ret);
112 return ret;
71} 113}
72 114
73static inline int 115static inline int
74rdev_set_max_csma_backoffs(struct cfg802154_registered_device *rdev, 116rdev_set_max_csma_backoffs(struct cfg802154_registered_device *rdev,
75 struct wpan_dev *wpan_dev, u8 max_csma_backoffs) 117 struct wpan_dev *wpan_dev, u8 max_csma_backoffs)
76{ 118{
77 return rdev->ops->set_max_csma_backoffs(&rdev->wpan_phy, wpan_dev, 119 int ret;
78 max_csma_backoffs); 120
121 trace_802154_rdev_set_csma_backoffs(&rdev->wpan_phy, wpan_dev,
122 max_csma_backoffs);
123 ret = rdev->ops->set_max_csma_backoffs(&rdev->wpan_phy, wpan_dev,
124 max_csma_backoffs);
125 trace_802154_rdev_return_int(&rdev->wpan_phy, ret);
126 return ret;
79} 127}
80 128
81static inline int 129static inline int
82rdev_set_max_frame_retries(struct cfg802154_registered_device *rdev, 130rdev_set_max_frame_retries(struct cfg802154_registered_device *rdev,
83 struct wpan_dev *wpan_dev, s8 max_frame_retries) 131 struct wpan_dev *wpan_dev, s8 max_frame_retries)
84{ 132{
85 return rdev->ops->set_max_frame_retries(&rdev->wpan_phy, wpan_dev, 133 int ret;
134
135 trace_802154_rdev_set_max_frame_retries(&rdev->wpan_phy, wpan_dev,
86 max_frame_retries); 136 max_frame_retries);
137 ret = rdev->ops->set_max_frame_retries(&rdev->wpan_phy, wpan_dev,
138 max_frame_retries);
139 trace_802154_rdev_return_int(&rdev->wpan_phy, ret);
140 return ret;
87} 141}
88 142
89static inline int 143static inline int
90rdev_set_lbt_mode(struct cfg802154_registered_device *rdev, 144rdev_set_lbt_mode(struct cfg802154_registered_device *rdev,
91 struct wpan_dev *wpan_dev, bool mode) 145 struct wpan_dev *wpan_dev, bool mode)
92{ 146{
93 return rdev->ops->set_lbt_mode(&rdev->wpan_phy, wpan_dev, mode); 147 int ret;
148
149 trace_802154_rdev_set_lbt_mode(&rdev->wpan_phy, wpan_dev, mode);
150 ret = rdev->ops->set_lbt_mode(&rdev->wpan_phy, wpan_dev, mode);
151 trace_802154_rdev_return_int(&rdev->wpan_phy, ret);
152 return ret;
94} 153}
95 154
96#endif /* __CFG802154_RDEV_OPS */ 155#endif /* __CFG802154_RDEV_OPS */
diff --git a/net/ieee802154/trace.c b/net/ieee802154/trace.c
new file mode 100644
index 000000000000..95f997fad755
--- /dev/null
+++ b/net/ieee802154/trace.c
@@ -0,0 +1,7 @@
1#include <linux/module.h>
2
3#ifndef __CHECKER__
4#define CREATE_TRACE_POINTS
5#include "trace.h"
6
7#endif
diff --git a/net/ieee802154/trace.h b/net/ieee802154/trace.h
new file mode 100644
index 000000000000..5ac25eb6ed17
--- /dev/null
+++ b/net/ieee802154/trace.h
@@ -0,0 +1,247 @@
 1/* Based on net/wireless/trace.h */
2
3#undef TRACE_SYSTEM
4#define TRACE_SYSTEM cfg802154
5
6#if !defined(__RDEV_CFG802154_OPS_TRACE) || defined(TRACE_HEADER_MULTI_READ)
7#define __RDEV_CFG802154_OPS_TRACE
8
9#include <linux/tracepoint.h>
10
11#include <net/cfg802154.h>
12
13#define MAXNAME 32
14#define WPAN_PHY_ENTRY __array(char, wpan_phy_name, MAXNAME)
15#define WPAN_PHY_ASSIGN strlcpy(__entry->wpan_phy_name, \
16 wpan_phy_name(wpan_phy), \
17 MAXNAME)
18#define WPAN_PHY_PR_FMT "%s"
19#define WPAN_PHY_PR_ARG __entry->wpan_phy_name
20
21#define WPAN_DEV_ENTRY __field(u32, identifier)
22#define WPAN_DEV_ASSIGN (__entry->identifier) = (!IS_ERR_OR_NULL(wpan_dev) \
23 ? wpan_dev->identifier : 0)
24#define WPAN_DEV_PR_FMT "wpan_dev(%u)"
25#define WPAN_DEV_PR_ARG (__entry->identifier)
26
27#define WPAN_CCA_ENTRY __field(enum nl802154_cca_modes, cca_mode) \
28 __field(enum nl802154_cca_opts, cca_opt)
29#define WPAN_CCA_ASSIGN \
30 do { \
31 (__entry->cca_mode) = cca->mode; \
32 (__entry->cca_opt) = cca->opt; \
33 } while (0)
34#define WPAN_CCA_PR_FMT "cca_mode: %d, cca_opt: %d"
35#define WPAN_CCA_PR_ARG __entry->cca_mode, __entry->cca_opt
36
37#define BOOL_TO_STR(bo) (bo) ? "true" : "false"
38
39/*************************************************************
40 * rdev->ops traces *
41 *************************************************************/
42
43TRACE_EVENT(802154_rdev_add_virtual_intf,
44 TP_PROTO(struct wpan_phy *wpan_phy, char *name,
45 enum nl802154_iftype type, __le64 extended_addr),
46 TP_ARGS(wpan_phy, name, type, extended_addr),
47 TP_STRUCT__entry(
48 WPAN_PHY_ENTRY
49 __string(vir_intf_name, name ? name : "<noname>")
50 __field(enum nl802154_iftype, type)
51 __field(__le64, extended_addr)
52 ),
53 TP_fast_assign(
54 WPAN_PHY_ASSIGN;
55 __assign_str(vir_intf_name, name ? name : "<noname>");
56 __entry->type = type;
57 __entry->extended_addr = extended_addr;
58 ),
59 TP_printk(WPAN_PHY_PR_FMT ", virtual intf name: %s, type: %d, ea %llx",
60 WPAN_PHY_PR_ARG, __get_str(vir_intf_name), __entry->type,
61 __le64_to_cpu(__entry->extended_addr))
62);
63
64TRACE_EVENT(802154_rdev_del_virtual_intf,
65 TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev),
66 TP_ARGS(wpan_phy, wpan_dev),
67 TP_STRUCT__entry(
68 WPAN_PHY_ENTRY
69 WPAN_DEV_ENTRY
70 ),
71 TP_fast_assign(
72 WPAN_PHY_ASSIGN;
73 WPAN_DEV_ASSIGN;
74 ),
75 TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT, WPAN_PHY_PR_ARG,
76 WPAN_DEV_PR_ARG)
77);
78
79TRACE_EVENT(802154_rdev_set_channel,
80 TP_PROTO(struct wpan_phy *wpan_phy, u8 page, u8 channel),
81 TP_ARGS(wpan_phy, page, channel),
82 TP_STRUCT__entry(
83 WPAN_PHY_ENTRY
84 __field(u8, page)
85 __field(u8, channel)
86 ),
87 TP_fast_assign(
88 WPAN_PHY_ASSIGN;
89 __entry->page = page;
90 __entry->channel = channel;
91 ),
92 TP_printk(WPAN_PHY_PR_FMT ", page: %d, channel: %d", WPAN_PHY_PR_ARG,
93 __entry->page, __entry->channel)
94);
95
96TRACE_EVENT(802154_rdev_set_cca_mode,
97 TP_PROTO(struct wpan_phy *wpan_phy, const struct wpan_phy_cca *cca),
98 TP_ARGS(wpan_phy, cca),
99 TP_STRUCT__entry(
100 WPAN_PHY_ENTRY
101 WPAN_CCA_ENTRY
102 ),
103 TP_fast_assign(
104 WPAN_PHY_ASSIGN;
105 WPAN_CCA_ASSIGN;
106 ),
107 TP_printk(WPAN_PHY_PR_FMT ", " WPAN_CCA_PR_FMT, WPAN_PHY_PR_ARG,
108 WPAN_CCA_PR_ARG)
109);
110
111DECLARE_EVENT_CLASS(802154_le16_template,
112 TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
113 __le16 le16arg),
114 TP_ARGS(wpan_phy, wpan_dev, le16arg),
115 TP_STRUCT__entry(
116 WPAN_PHY_ENTRY
117 WPAN_DEV_ENTRY
118 __field(__le16, le16arg)
119 ),
120 TP_fast_assign(
121 WPAN_PHY_ASSIGN;
122 WPAN_DEV_ASSIGN;
123 __entry->le16arg = le16arg;
124 ),
125 TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT ", pan id: 0x%04x",
126 WPAN_PHY_PR_ARG, WPAN_DEV_PR_ARG,
127 __le16_to_cpu(__entry->le16arg))
128);
129
130DEFINE_EVENT(802154_le16_template, 802154_rdev_set_pan_id,
131 TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
132 __le16 le16arg),
133 TP_ARGS(wpan_phy, wpan_dev, le16arg)
134);
135
136DEFINE_EVENT_PRINT(802154_le16_template, 802154_rdev_set_short_addr,
137 TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
138 __le16 le16arg),
139 TP_ARGS(wpan_phy, wpan_dev, le16arg),
140 TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT ", sa: 0x%04x",
141 WPAN_PHY_PR_ARG, WPAN_DEV_PR_ARG,
142 __le16_to_cpu(__entry->le16arg))
143);
144
145TRACE_EVENT(802154_rdev_set_backoff_exponent,
146 TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
147 u8 min_be, u8 max_be),
148 TP_ARGS(wpan_phy, wpan_dev, min_be, max_be),
149 TP_STRUCT__entry(
150 WPAN_PHY_ENTRY
151 WPAN_DEV_ENTRY
152 __field(u8, min_be)
153 __field(u8, max_be)
154 ),
155 TP_fast_assign(
156 WPAN_PHY_ASSIGN;
157 WPAN_DEV_ASSIGN;
158 __entry->min_be = min_be;
159 __entry->max_be = max_be;
160 ),
161
162 TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT
163 ", min be: %d, max_be: %d", WPAN_PHY_PR_ARG,
164 WPAN_DEV_PR_ARG, __entry->min_be, __entry->max_be)
165);
166
167TRACE_EVENT(802154_rdev_set_csma_backoffs,
168 TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
169 u8 max_csma_backoffs),
170 TP_ARGS(wpan_phy, wpan_dev, max_csma_backoffs),
171 TP_STRUCT__entry(
172 WPAN_PHY_ENTRY
173 WPAN_DEV_ENTRY
174 __field(u8, max_csma_backoffs)
175 ),
176 TP_fast_assign(
177 WPAN_PHY_ASSIGN;
178 WPAN_DEV_ASSIGN;
179 __entry->max_csma_backoffs = max_csma_backoffs;
180 ),
181
182 TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT
183 ", max csma backoffs: %d", WPAN_PHY_PR_ARG,
184 WPAN_DEV_PR_ARG, __entry->max_csma_backoffs)
185);
186
187TRACE_EVENT(802154_rdev_set_max_frame_retries,
188 TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
189 s8 max_frame_retries),
190 TP_ARGS(wpan_phy, wpan_dev, max_frame_retries),
191 TP_STRUCT__entry(
192 WPAN_PHY_ENTRY
193 WPAN_DEV_ENTRY
194 __field(s8, max_frame_retries)
195 ),
196 TP_fast_assign(
197 WPAN_PHY_ASSIGN;
198 WPAN_DEV_ASSIGN;
199 __entry->max_frame_retries = max_frame_retries;
200 ),
201
202 TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT
203 ", max frame retries: %d", WPAN_PHY_PR_ARG,
204 WPAN_DEV_PR_ARG, __entry->max_frame_retries)
205);
206
207TRACE_EVENT(802154_rdev_set_lbt_mode,
208 TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
209 bool mode),
210 TP_ARGS(wpan_phy, wpan_dev, mode),
211 TP_STRUCT__entry(
212 WPAN_PHY_ENTRY
213 WPAN_DEV_ENTRY
214 __field(bool, mode)
215 ),
216 TP_fast_assign(
217 WPAN_PHY_ASSIGN;
218 WPAN_DEV_ASSIGN;
219 __entry->mode = mode;
220 ),
221 TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT
222 ", lbt mode: %s", WPAN_PHY_PR_ARG,
223 WPAN_DEV_PR_ARG, BOOL_TO_STR(__entry->mode))
224);
225
226TRACE_EVENT(802154_rdev_return_int,
227 TP_PROTO(struct wpan_phy *wpan_phy, int ret),
228 TP_ARGS(wpan_phy, ret),
229 TP_STRUCT__entry(
230 WPAN_PHY_ENTRY
231 __field(int, ret)
232 ),
233 TP_fast_assign(
234 WPAN_PHY_ASSIGN;
235 __entry->ret = ret;
236 ),
237 TP_printk(WPAN_PHY_PR_FMT ", returned: %d", WPAN_PHY_PR_ARG,
238 __entry->ret)
239);
240
241#endif /* !__RDEV_CFG802154_OPS_TRACE || TRACE_HEADER_MULTI_READ */
242
243#undef TRACE_INCLUDE_PATH
244#define TRACE_INCLUDE_PATH .
245#undef TRACE_INCLUDE_FILE
246#define TRACE_INCLUDE_FILE trace
247#include <trace/define_trace.h>
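The 802154_le16_template class keeps the per-event cost low: events that only differ in how the __le16 is printed share one TP_STRUCT/TP_fast_assign pair, and DEFINE_EVENT_PRINT overrides just the format string. As a sketch, a hypothetical third __le16-carrying op would need only:

DEFINE_EVENT(802154_le16_template, 802154_rdev_set_some_le16_op,
	TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
		 __le16 le16arg),
	TP_ARGS(wpan_phy, wpan_dev, le16arg)
);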
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index af150b43b214..34968cd5c146 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -711,11 +711,10 @@ static int fou_nl_dump(struct sk_buff *skb, struct netlink_callback *cb)
711 cb->nlh->nlmsg_seq, NLM_F_MULTI, 711 cb->nlh->nlmsg_seq, NLM_F_MULTI,
712 skb, FOU_CMD_GET); 712 skb, FOU_CMD_GET);
713 if (ret) 713 if (ret)
714 goto done; 714 break;
715 } 715 }
716 mutex_unlock(&fn->fou_lock); 716 mutex_unlock(&fn->fou_lock);
717 717
718done:
719 cb->args[0] = idx; 718 cb->args[0] = idx;
720 return skb->len; 719 return skb->len;
721} 720}
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 5c3dd6267ed3..8976ca423a07 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -564,6 +564,40 @@ int inet_rtx_syn_ack(struct sock *parent, struct request_sock *req)
564} 564}
565EXPORT_SYMBOL(inet_rtx_syn_ack); 565EXPORT_SYMBOL(inet_rtx_syn_ack);
566 566
567/* return true if req was found in the syn_table[] */
568static bool reqsk_queue_unlink(struct request_sock_queue *queue,
569 struct request_sock *req)
570{
571 struct listen_sock *lopt = queue->listen_opt;
572 struct request_sock **prev;
573 bool found = false;
574
575 spin_lock(&queue->syn_wait_lock);
576
577 for (prev = &lopt->syn_table[req->rsk_hash]; *prev != NULL;
578 prev = &(*prev)->dl_next) {
579 if (*prev == req) {
580 *prev = req->dl_next;
581 found = true;
582 break;
583 }
584 }
585
586 spin_unlock(&queue->syn_wait_lock);
587 if (del_timer(&req->rsk_timer))
588 reqsk_put(req);
589 return found;
590}
591
592void inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req)
593{
594 if (reqsk_queue_unlink(&inet_csk(sk)->icsk_accept_queue, req)) {
595 reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req);
596 reqsk_put(req);
597 }
598}
599EXPORT_SYMBOL(inet_csk_reqsk_queue_drop);
600
567static void reqsk_timer_handler(unsigned long data) 601static void reqsk_timer_handler(unsigned long data)
568{ 602{
569 struct request_sock *req = (struct request_sock *)data; 603 struct request_sock *req = (struct request_sock *)data;
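inet_csk_reqsk_queue_drop() bundles the hash-table unlink, the timer cancel, the listener accounting and the reference drop, and only puts the queue's reference if the request was actually still hashed, so concurrent callers cannot double-free. A hedged caller-side sketch of the contract:

static void drop_req_example(struct sock *listener, struct request_sock *req)
{
	inet_csk_reqsk_queue_drop(listener, req);
	/* req may already be freed here unless the caller holds its
	 * own reference (compare the tcp_check_req() call sites) */
}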
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 70e8b3c308ec..4d32262c7502 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -111,6 +111,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
111 const struct nlmsghdr *unlh) 111 const struct nlmsghdr *unlh)
112{ 112{
113 const struct inet_sock *inet = inet_sk(sk); 113 const struct inet_sock *inet = inet_sk(sk);
114 const struct tcp_congestion_ops *ca_ops;
114 const struct inet_diag_handler *handler; 115 const struct inet_diag_handler *handler;
115 int ext = req->idiag_ext; 116 int ext = req->idiag_ext;
116 struct inet_diag_msg *r; 117 struct inet_diag_msg *r;
@@ -208,16 +209,33 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
208 info = nla_data(attr); 209 info = nla_data(attr);
209 } 210 }
210 211
211 if ((ext & (1 << (INET_DIAG_CONG - 1))) && icsk->icsk_ca_ops) 212 if (ext & (1 << (INET_DIAG_CONG - 1))) {
212 if (nla_put_string(skb, INET_DIAG_CONG, 213 int err = 0;
213 icsk->icsk_ca_ops->name) < 0) 214
215 rcu_read_lock();
216 ca_ops = READ_ONCE(icsk->icsk_ca_ops);
217 if (ca_ops)
218 err = nla_put_string(skb, INET_DIAG_CONG, ca_ops->name);
219 rcu_read_unlock();
220 if (err < 0)
214 goto errout; 221 goto errout;
222 }
215 223
216 handler->idiag_get_info(sk, r, info); 224 handler->idiag_get_info(sk, r, info);
217 225
218 if (sk->sk_state < TCP_TIME_WAIT && 226 if (sk->sk_state < TCP_TIME_WAIT) {
219 icsk->icsk_ca_ops && icsk->icsk_ca_ops->get_info) 227 union tcp_cc_info info;
220 icsk->icsk_ca_ops->get_info(sk, ext, skb); 228 size_t sz = 0;
229 int attr;
230
231 rcu_read_lock();
232 ca_ops = READ_ONCE(icsk->icsk_ca_ops);
233 if (ca_ops && ca_ops->get_info)
234 sz = ca_ops->get_info(sk, ext, &attr, &info);
235 rcu_read_unlock();
236 if (sz && nla_put(skb, attr, sz, &info) < 0)
237 goto errout;
238 }
221 239
222out: 240out:
223 nlmsg_end(skb, nlh); 241 nlmsg_end(skb, nlh);
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index 939992c456f3..3674484946a5 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -82,6 +82,9 @@ int ip_forward(struct sk_buff *skb)
82 if (skb->pkt_type != PACKET_HOST) 82 if (skb->pkt_type != PACKET_HOST)
83 goto drop; 83 goto drop;
84 84
85 if (unlikely(skb->sk))
86 goto drop;
87
85 if (skb_warn_if_lro(skb)) 88 if (skb_warn_if_lro(skb))
86 goto drop; 89 goto drop;
87 90
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index a93f260cf24c..05ff44b758df 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -158,6 +158,7 @@ void ping_unhash(struct sock *sk)
158 if (sk_hashed(sk)) { 158 if (sk_hashed(sk)) {
159 write_lock_bh(&ping_table.lock); 159 write_lock_bh(&ping_table.lock);
160 hlist_nulls_del(&sk->sk_nulls_node); 160 hlist_nulls_del(&sk->sk_nulls_node);
161 sk_nulls_node_init(&sk->sk_nulls_node);
161 sock_put(sk); 162 sock_put(sk);
162 isk->inet_num = 0; 163 isk->inet_num = 0;
163 isk->inet_sport = 0; 164 isk->inet_sport = 0;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index a78540f28276..bff62fc87b8e 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -962,10 +962,7 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
962 if (dst_metric_locked(dst, RTAX_MTU)) 962 if (dst_metric_locked(dst, RTAX_MTU))
963 return; 963 return;
964 964
965 if (dst->dev->mtu < mtu) 965 if (ipv4_mtu(dst) < mtu)
966 return;
967
968 if (rt->rt_pmtu && rt->rt_pmtu < mtu)
969 return; 966 return;
970 967
971 if (mtu < ip_rt_min_pmtu) 968 if (mtu < ip_rt_min_pmtu)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 18e3a12eb1b2..46efa03d2b11 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -252,6 +252,7 @@
252#include <linux/types.h> 252#include <linux/types.h>
253#include <linux/fcntl.h> 253#include <linux/fcntl.h>
254#include <linux/poll.h> 254#include <linux/poll.h>
255#include <linux/inet_diag.h>
255#include <linux/init.h> 256#include <linux/init.h>
256#include <linux/fs.h> 257#include <linux/fs.h>
257#include <linux/skbuff.h> 258#include <linux/skbuff.h>
@@ -520,8 +521,10 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
520 521
521 /* Race breaker. If space is freed after 522 /* Race breaker. If space is freed after
522 * wspace test but before the flags are set, 523 * wspace test but before the flags are set,
523 * IO signal will be lost. 524 * IO signal will be lost. Memory barrier
525 * pairs with the input side.
524 */ 526 */
527 smp_mb__after_atomic();
525 if (sk_stream_is_writeable(sk)) 528 if (sk_stream_is_writeable(sk))
526 mask |= POLLOUT | POLLWRNORM; 529 mask |= POLLOUT | POLLWRNORM;
527 } 530 }
@@ -2590,11 +2593,12 @@ EXPORT_SYMBOL(compat_tcp_setsockopt);
2590#endif 2593#endif
2591 2594
2592/* Return information about state of tcp endpoint in API format. */ 2595/* Return information about state of tcp endpoint in API format. */
2593void tcp_get_info(const struct sock *sk, struct tcp_info *info) 2596void tcp_get_info(struct sock *sk, struct tcp_info *info)
2594{ 2597{
2595 const struct tcp_sock *tp = tcp_sk(sk); 2598 const struct tcp_sock *tp = tcp_sk(sk);
2596 const struct inet_connection_sock *icsk = inet_csk(sk); 2599 const struct inet_connection_sock *icsk = inet_csk(sk);
2597 u32 now = tcp_time_stamp; 2600 u32 now = tcp_time_stamp;
2601 u32 rate;
2598 2602
2599 memset(info, 0, sizeof(*info)); 2603 memset(info, 0, sizeof(*info));
2600 2604
@@ -2655,10 +2659,16 @@ void tcp_get_info(const struct sock *sk, struct tcp_info *info)
2655 2659
2656 info->tcpi_total_retrans = tp->total_retrans; 2660 info->tcpi_total_retrans = tp->total_retrans;
2657 2661
2658 info->tcpi_pacing_rate = sk->sk_pacing_rate != ~0U ? 2662 rate = READ_ONCE(sk->sk_pacing_rate);
2659 sk->sk_pacing_rate : ~0ULL; 2663 info->tcpi_pacing_rate = rate != ~0U ? rate : ~0ULL;
2660 info->tcpi_max_pacing_rate = sk->sk_max_pacing_rate != ~0U ? 2664
2661 sk->sk_max_pacing_rate : ~0ULL; 2665 rate = READ_ONCE(sk->sk_max_pacing_rate);
2666 info->tcpi_max_pacing_rate = rate != ~0U ? rate : ~0ULL;
2667
2668 spin_lock_bh(&sk->sk_lock.slock);
2669 info->tcpi_bytes_acked = tp->bytes_acked;
2670 info->tcpi_bytes_received = tp->bytes_received;
2671 spin_unlock_bh(&sk->sk_lock.slock);
2662} 2672}
2663EXPORT_SYMBOL_GPL(tcp_get_info); 2673EXPORT_SYMBOL_GPL(tcp_get_info);
2664 2674
@@ -2730,6 +2740,26 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
2730 return -EFAULT; 2740 return -EFAULT;
2731 return 0; 2741 return 0;
2732 } 2742 }
2743 case TCP_CC_INFO: {
2744 const struct tcp_congestion_ops *ca_ops;
2745 union tcp_cc_info info;
2746 size_t sz = 0;
2747 int attr;
2748
2749 if (get_user(len, optlen))
2750 return -EFAULT;
2751
2752 ca_ops = icsk->icsk_ca_ops;
2753 if (ca_ops && ca_ops->get_info)
2754 sz = ca_ops->get_info(sk, ~0U, &attr, &info);
2755
2756 len = min_t(unsigned int, len, sz);
2757 if (put_user(len, optlen))
2758 return -EFAULT;
2759 if (copy_to_user(optval, &info, len))
2760 return -EFAULT;
2761 return 0;
2762 }
2733 case TCP_QUICKACK: 2763 case TCP_QUICKACK:
2734 val = !icsk->icsk_ack.pingpong; 2764 val = !icsk->icsk_ack.pingpong;
2735 break; 2765 break;
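TCP_CC_INFO exposes the same union the diag path fills, directly over getsockopt(). A hedged user-space sketch; the fallback define assumes the value added by this series, and the vegas fields are only meaningful when the active congestion-control module reports them:

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <linux/inet_diag.h>	/* union tcp_cc_info, struct tcpvegas_info */

#ifndef TCP_CC_INFO
#define TCP_CC_INFO 26		/* assumption: uapi value from this series */
#endif

static void dump_cc_info(int fd)
{
	union tcp_cc_info info;
	socklen_t len = sizeof(info);

	memset(&info, 0, sizeof(info));
	if (getsockopt(fd, IPPROTO_TCP, TCP_CC_INFO, &info, &len) == 0 &&
	    len >= sizeof(struct tcpvegas_info))
		printf("rtt=%u min_rtt=%u\n",
		       info.vegas.tcpv_rtt, info.vegas.tcpv_minrtt);
}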
diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
index b504371af742..4c41c1287197 100644
--- a/net/ipv4/tcp_dctcp.c
+++ b/net/ipv4/tcp_dctcp.c
@@ -277,7 +277,8 @@ static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev)
277 } 277 }
278} 278}
279 279
280static void dctcp_get_info(struct sock *sk, u32 ext, struct sk_buff *skb) 280static size_t dctcp_get_info(struct sock *sk, u32 ext, int *attr,
281 union tcp_cc_info *info)
281{ 282{
282 const struct dctcp *ca = inet_csk_ca(sk); 283 const struct dctcp *ca = inet_csk_ca(sk);
283 284
@@ -286,19 +287,19 @@ static void dctcp_get_info(struct sock *sk, u32 ext, struct sk_buff *skb)
286 */ 287 */
287 if (ext & (1 << (INET_DIAG_DCTCPINFO - 1)) || 288 if (ext & (1 << (INET_DIAG_DCTCPINFO - 1)) ||
288 ext & (1 << (INET_DIAG_VEGASINFO - 1))) { 289 ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
289 struct tcp_dctcp_info info; 290 memset(info, 0, sizeof(struct tcp_dctcp_info));
290
291 memset(&info, 0, sizeof(info));
292 if (inet_csk(sk)->icsk_ca_ops != &dctcp_reno) { 291 if (inet_csk(sk)->icsk_ca_ops != &dctcp_reno) {
293 info.dctcp_enabled = 1; 292 info->dctcp.dctcp_enabled = 1;
294 info.dctcp_ce_state = (u16) ca->ce_state; 293 info->dctcp.dctcp_ce_state = (u16) ca->ce_state;
295 info.dctcp_alpha = ca->dctcp_alpha; 294 info->dctcp.dctcp_alpha = ca->dctcp_alpha;
296 info.dctcp_ab_ecn = ca->acked_bytes_ecn; 295 info->dctcp.dctcp_ab_ecn = ca->acked_bytes_ecn;
297 info.dctcp_ab_tot = ca->acked_bytes_total; 296 info->dctcp.dctcp_ab_tot = ca->acked_bytes_total;
298 } 297 }
299 298
300 nla_put(skb, INET_DIAG_DCTCPINFO, sizeof(info), &info); 299 *attr = INET_DIAG_DCTCPINFO;
300 return sizeof(*info);
301 } 301 }
302 return 0;
302} 303}
303 304
304static struct tcp_congestion_ops dctcp __read_mostly = { 305static struct tcp_congestion_ops dctcp __read_mostly = {
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index e3d87aca6be8..3c673d5e6cff 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -206,6 +206,7 @@ static bool tcp_fastopen_create_child(struct sock *sk,
206 skb_set_owner_r(skb2, child); 206 skb_set_owner_r(skb2, child);
207 __skb_queue_tail(&child->sk_receive_queue, skb2); 207 __skb_queue_tail(&child->sk_receive_queue, skb2);
208 tp->syn_data_acked = 1; 208 tp->syn_data_acked = 1;
209 tp->bytes_received = end_seq - TCP_SKB_CB(skb)->seq - 1;
209 } else { 210 } else {
210 end_seq = TCP_SKB_CB(skb)->seq + 1; 211 end_seq = TCP_SKB_CB(skb)->seq + 1;
211 } 212 }
diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c
index 1d5a30a90adf..f71002e4db0b 100644
--- a/net/ipv4/tcp_illinois.c
+++ b/net/ipv4/tcp_illinois.c
@@ -300,26 +300,27 @@ static u32 tcp_illinois_ssthresh(struct sock *sk)
300} 300}
301 301
302/* Extract info for Tcp socket info provided via netlink. */ 302/* Extract info for Tcp socket info provided via netlink. */
303static void tcp_illinois_info(struct sock *sk, u32 ext, 303static size_t tcp_illinois_info(struct sock *sk, u32 ext, int *attr,
304 struct sk_buff *skb) 304 union tcp_cc_info *info)
305{ 305{
306 const struct illinois *ca = inet_csk_ca(sk); 306 const struct illinois *ca = inet_csk_ca(sk);
307 307
308 if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { 308 if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
309 struct tcpvegas_info info = { 309 info->vegas.tcpv_enabled = 1;
310 .tcpv_enabled = 1, 310 info->vegas.tcpv_rttcnt = ca->cnt_rtt;
311 .tcpv_rttcnt = ca->cnt_rtt, 311 info->vegas.tcpv_minrtt = ca->base_rtt;
312 .tcpv_minrtt = ca->base_rtt, 312 info->vegas.tcpv_rtt = 0;
313 };
314 313
315 if (info.tcpv_rttcnt > 0) { 314 if (info->vegas.tcpv_rttcnt > 0) {
316 u64 t = ca->sum_rtt; 315 u64 t = ca->sum_rtt;
317 316
318 do_div(t, info.tcpv_rttcnt); 317 do_div(t, info->vegas.tcpv_rttcnt);
319 info.tcpv_rtt = t; 318 info->vegas.tcpv_rtt = t;
320 } 319 }
321 nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info); 320 *attr = INET_DIAG_VEGASINFO;
321 return sizeof(struct tcpvegas_info);
322 } 322 }
323 return 0;
323} 324}
324 325
325static struct tcp_congestion_ops tcp_illinois __read_mostly = { 326static struct tcp_congestion_ops tcp_illinois __read_mostly = {
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index a7ef679dd3ea..bc790ea9960f 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1820,14 +1820,12 @@ advance_sp:
1820 for (j = 0; j < used_sacks; j++) 1820 for (j = 0; j < used_sacks; j++)
1821 tp->recv_sack_cache[i++] = sp[j]; 1821 tp->recv_sack_cache[i++] = sp[j];
1822 1822
1823 tcp_mark_lost_retrans(sk);
1824
1825 tcp_verify_left_out(tp);
1826
1827 if ((state.reord < tp->fackets_out) && 1823 if ((state.reord < tp->fackets_out) &&
1828 ((inet_csk(sk)->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker)) 1824 ((inet_csk(sk)->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker))
1829 tcp_update_reordering(sk, tp->fackets_out - state.reord, 0); 1825 tcp_update_reordering(sk, tp->fackets_out - state.reord, 0);
1830 1826
1827 tcp_mark_lost_retrans(sk);
1828 tcp_verify_left_out(tp);
1831out: 1829out:
1832 1830
1833#if FASTRETRANS_DEBUG > 0 1831#if FASTRETRANS_DEBUG > 0
@@ -3280,6 +3278,24 @@ static inline bool tcp_may_update_window(const struct tcp_sock *tp,
3280 (ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd); 3278 (ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd);
3281} 3279}
3282 3280
3281/* If we update tp->snd_una, also update tp->bytes_acked */
3282static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack)
3283{
3284 u32 delta = ack - tp->snd_una;
3285
3286 tp->bytes_acked += delta;
3287 tp->snd_una = ack;
3288}
3289
3290/* If we update tp->rcv_nxt, also update tp->bytes_received */
3291static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq)
3292{
3293 u32 delta = seq - tp->rcv_nxt;
3294
3295 tp->bytes_received += delta;
3296 tp->rcv_nxt = seq;
3297}
3298
3283/* Update our send window. 3299/* Update our send window.
3284 * 3300 *
3285 * Window update algorithm, described in RFC793/RFC1122 (used in linux-2.2 3301 * Window update algorithm, described in RFC793/RFC1122 (used in linux-2.2
@@ -3315,7 +3331,7 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32
3315 } 3331 }
3316 } 3332 }
3317 3333
3318 tp->snd_una = ack; 3334 tcp_snd_una_update(tp, ack);
3319 3335
3320 return flag; 3336 return flag;
3321} 3337}
@@ -3497,7 +3513,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3497 * Note, we use the fact that SND.UNA>=SND.WL2. 3513 * Note, we use the fact that SND.UNA>=SND.WL2.
3498 */ 3514 */
3499 tcp_update_wl(tp, ack_seq); 3515 tcp_update_wl(tp, ack_seq);
3500 tp->snd_una = ack; 3516 tcp_snd_una_update(tp, ack);
3501 flag |= FLAG_WIN_UPDATE; 3517 flag |= FLAG_WIN_UPDATE;
3502 3518
3503 tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE); 3519 tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE);
@@ -4236,7 +4252,7 @@ static void tcp_ofo_queue(struct sock *sk)
4236 4252
4237 tail = skb_peek_tail(&sk->sk_receive_queue); 4253 tail = skb_peek_tail(&sk->sk_receive_queue);
4238 eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen); 4254 eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen);
4239 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; 4255 tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
4240 if (!eaten) 4256 if (!eaten)
4241 __skb_queue_tail(&sk->sk_receive_queue, skb); 4257 __skb_queue_tail(&sk->sk_receive_queue, skb);
4242 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) 4258 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
@@ -4404,7 +4420,7 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int
4404 __skb_pull(skb, hdrlen); 4420 __skb_pull(skb, hdrlen);
4405 eaten = (tail && 4421 eaten = (tail &&
4406 tcp_try_coalesce(sk, tail, skb, fragstolen)) ? 1 : 0; 4422 tcp_try_coalesce(sk, tail, skb, fragstolen)) ? 1 : 0;
4407 tcp_sk(sk)->rcv_nxt = TCP_SKB_CB(skb)->end_seq; 4423 tcp_rcv_nxt_update(tcp_sk(sk), TCP_SKB_CB(skb)->end_seq);
4408 if (!eaten) { 4424 if (!eaten) {
4409 __skb_queue_tail(&sk->sk_receive_queue, skb); 4425 __skb_queue_tail(&sk->sk_receive_queue, skb);
4410 skb_set_owner_r(skb, sk); 4426 skb_set_owner_r(skb, sk);
@@ -4497,7 +4513,7 @@ queue_and_out:
4497 4513
4498 eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen); 4514 eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
4499 } 4515 }
4500 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; 4516 tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
4501 if (skb->len) 4517 if (skb->len)
4502 tcp_event_data_recv(sk, skb); 4518 tcp_event_data_recv(sk, skb);
4503 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) 4519 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
@@ -4845,6 +4861,8 @@ static void tcp_check_space(struct sock *sk)
4845{ 4861{
4846 if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) { 4862 if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) {
4847 sock_reset_flag(sk, SOCK_QUEUE_SHRUNK); 4863 sock_reset_flag(sk, SOCK_QUEUE_SHRUNK);
4864 /* pairs with tcp_poll() */
4865 smp_mb__after_atomic();
4848 if (sk->sk_socket && 4866 if (sk->sk_socket &&
4849 test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) 4867 test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
4850 tcp_new_space(sk); 4868 tcp_new_space(sk);
@@ -5243,7 +5261,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
5243 tcp_rcv_rtt_measure_ts(sk, skb); 5261 tcp_rcv_rtt_measure_ts(sk, skb);
5244 5262
5245 __skb_pull(skb, tcp_header_len); 5263 __skb_pull(skb, tcp_header_len);
5246 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; 5264 tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
5247 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITSTOUSER); 5265 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITSTOUSER);
5248 eaten = 1; 5266 eaten = 1;
5249 } 5267 }
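Both helpers lean on unsigned 32-bit wrap-around: ack - tp->snd_una is the forward distance even across a sequence-number wrap, and accumulating those deltas into the u64 counters gives the totals that tcp_get_info() reads under the socket spinlock. A one-assert sanity sketch of the wrap case:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint32_t snd_una = 0xfffffff0u;	/* just below the wrap point */
	uint32_t ack     = 0x00000010u;	/* 0x20 bytes later, after wrap */

	assert((uint32_t)(ack - snd_una) == 0x20);
	return 0;
}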
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 3571f2be4470..fc1c658ec6c1 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1348,7 +1348,8 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1348 req = inet_csk_search_req(sk, th->source, iph->saddr, iph->daddr); 1348 req = inet_csk_search_req(sk, th->source, iph->saddr, iph->daddr);
1349 if (req) { 1349 if (req) {
1350 nsk = tcp_check_req(sk, skb, req, false); 1350 nsk = tcp_check_req(sk, skb, req, false);
1351 reqsk_put(req); 1351 if (!nsk)
1352 reqsk_put(req);
1352 return nsk; 1353 return nsk;
1353 } 1354 }
1354 1355
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 63d6311b5365..e5d7649136fc 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -755,10 +755,11 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
755 if (!child) 755 if (!child)
756 goto listen_overflow; 756 goto listen_overflow;
757 757
758 inet_csk_reqsk_queue_unlink(sk, req); 758 inet_csk_reqsk_queue_drop(sk, req);
759 inet_csk_reqsk_queue_removed(sk, req);
760
761 inet_csk_reqsk_queue_add(sk, req, child); 759 inet_csk_reqsk_queue_add(sk, req, child);
760 /* Warning: caller must not call reqsk_put(req);
761 * child stole last reference on it.
762 */
762 return child; 763 return child;
763 764
764listen_overflow: 765listen_overflow:
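Taken together with the tcp_v4_hnd_req() and dccp hunks earlier, the new convention is: tcp_check_req()/dccp_check_req() consume the caller's reference when they hand back a child (the request moved to the accept queue), so the caller drops its reference only on the NULL return. A caller-side sketch:

static struct sock *hnd_req_sketch(struct sock *sk, struct sk_buff *skb,
				   struct request_sock *req)
{
	struct sock *nsk = tcp_check_req(sk, skb, req, false);

	if (!nsk)		/* req is still ours: release it */
		reqsk_put(req);
	/* else: the child stole the last reference; req is off limits */
	return nsk;
}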
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 8c8d7e06b72f..a369e8a70b2c 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2812,39 +2812,65 @@ begin_fwd:
2812 } 2812 }
2813} 2813}
2814 2814
 2815/* Send a fin. The caller locks the socket for us. This cannot be 2815/* We allow FIN packets to exceed memory limits to expedite
2816 * allowed to fail queueing a FIN frame under any circumstances. 2816 * connection tear down and (memory) recovery.
2817 * Otherwise tcp_send_fin() could be tempted to either delay FIN
2818 * or even be forced to close flow without any FIN.
2819 */
2820static void sk_forced_wmem_schedule(struct sock *sk, int size)
2821{
2822 int amt, status;
2823
2824 if (size <= sk->sk_forward_alloc)
2825 return;
2826 amt = sk_mem_pages(size);
2827 sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
2828 sk_memory_allocated_add(sk, amt, &status);
2829}
2830
2831/* Send a FIN. The caller locks the socket for us.
2832 * We should try to send a FIN packet really hard, but eventually give up.
2817 */ 2833 */
2818void tcp_send_fin(struct sock *sk) 2834void tcp_send_fin(struct sock *sk)
2819{ 2835{
2836 struct sk_buff *skb, *tskb = tcp_write_queue_tail(sk);
2820 struct tcp_sock *tp = tcp_sk(sk); 2837 struct tcp_sock *tp = tcp_sk(sk);
2821 struct sk_buff *skb = tcp_write_queue_tail(sk);
2822 int mss_now;
2823 2838
 2824 /* Optimization, tack on the FIN if we have a queue of 2839 /* Optimization, tack on the FIN if we have one skb in the write queue and
2825 * unsent frames. But be careful about outgoing SACKS 2840 * this skb was not yet sent, or we are under memory pressure.
2826 * and IP options. 2841 * Note: in the latter case, FIN packet will be sent after a timeout,
2842 * as TCP stack thinks it has already been transmitted.
2827 */ 2843 */
2828 mss_now = tcp_current_mss(sk); 2844 if (tskb && (tcp_send_head(sk) || sk_under_memory_pressure(sk))) {
2829 2845coalesce:
2830 if (tcp_send_head(sk)) { 2846 TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN;
2831 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN; 2847 TCP_SKB_CB(tskb)->end_seq++;
2832 TCP_SKB_CB(skb)->end_seq++;
2833 tp->write_seq++; 2848 tp->write_seq++;
2849 if (!tcp_send_head(sk)) {
2850 /* This means tskb was already sent.
2851 * Pretend we included the FIN on previous transmit.
2852 * We need to set tp->snd_nxt to the value it would have
2853 * if FIN had been sent. This is because retransmit path
2854 * does not change tp->snd_nxt.
2855 */
2856 tp->snd_nxt++;
2857 return;
2858 }
2834 } else { 2859 } else {
2835 /* Socket is locked, keep trying until memory is available. */ 2860 skb = alloc_skb_fclone(MAX_TCP_HEADER, sk->sk_allocation);
2836 for (;;) { 2861 if (unlikely(!skb)) {
2837 skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation); 2862 if (tskb)
2838 if (skb) 2863 goto coalesce;
2839 break; 2864 return;
2840 yield();
2841 } 2865 }
2866 skb_reserve(skb, MAX_TCP_HEADER);
2867 sk_forced_wmem_schedule(sk, skb->truesize);
2842 /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */ 2868 /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
2843 tcp_init_nondata_skb(skb, tp->write_seq, 2869 tcp_init_nondata_skb(skb, tp->write_seq,
2844 TCPHDR_ACK | TCPHDR_FIN); 2870 TCPHDR_ACK | TCPHDR_FIN);
2845 tcp_queue_skb(sk, skb); 2871 tcp_queue_skb(sk, skb);
2846 } 2872 }
2847 __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_OFF); 2873 __tcp_push_pending_frames(sk, tcp_current_mss(sk), TCP_NAGLE_OFF);
2848} 2874}
2849 2875
2850/* We get here when a process closes a file descriptor (either due to 2876/* We get here when a process closes a file descriptor (either due to
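Condensing the rewrite, a comment-style sketch of the order in which tcp_send_fin() now tries its options:

/* 1. Tail skb exists and is either unsent or we are under memory
 *    pressure: piggyback -- set TCPHDR_FIN on tskb, bump end_seq and
 *    write_seq; if tskb was already transmitted, also bump snd_nxt so
 *    the retransmit path (which never changes snd_nxt) stays coherent.
 * 2. Otherwise allocate a fresh fclone skb. On failure, fall back to
 *    piggybacking on any existing tail skb instead of the old
 *    unbounded yield() retry loop with the socket locked.
 * 3. Allocation succeeded: force-charge it via
 *    sk_forced_wmem_schedule() so memory limits cannot delay the FIN,
 *    then queue and push it as a normal FIN|ACK segment.
 */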
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index a6afde666ab1..a6cea1d5e20d 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -286,19 +286,21 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked)
286} 286}
287 287
288/* Extract info for Tcp socket info provided via netlink. */ 288/* Extract info for Tcp socket info provided via netlink. */
289void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb) 289size_t tcp_vegas_get_info(struct sock *sk, u32 ext, int *attr,
290 union tcp_cc_info *info)
290{ 291{
291 const struct vegas *ca = inet_csk_ca(sk); 292 const struct vegas *ca = inet_csk_ca(sk);
293
292 if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { 294 if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
293 struct tcpvegas_info info = { 295 info->vegas.tcpv_enabled = ca->doing_vegas_now,
294 .tcpv_enabled = ca->doing_vegas_now, 296 info->vegas.tcpv_rttcnt = ca->cntRTT,
295 .tcpv_rttcnt = ca->cntRTT, 297 info->vegas.tcpv_rtt = ca->baseRTT,
296 .tcpv_rtt = ca->baseRTT, 298 info->vegas.tcpv_minrtt = ca->minRTT,
297 .tcpv_minrtt = ca->minRTT, 299
298 }; 300 *attr = INET_DIAG_VEGASINFO;
299 301 return sizeof(struct tcpvegas_info);
300 nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info);
301 } 302 }
303 return 0;
302} 304}
303EXPORT_SYMBOL_GPL(tcp_vegas_get_info); 305EXPORT_SYMBOL_GPL(tcp_vegas_get_info);
304 306
diff --git a/net/ipv4/tcp_vegas.h b/net/ipv4/tcp_vegas.h
index 0531b99d8637..ef9da5306c68 100644
--- a/net/ipv4/tcp_vegas.h
+++ b/net/ipv4/tcp_vegas.h
@@ -19,6 +19,7 @@ void tcp_vegas_init(struct sock *sk);
19void tcp_vegas_state(struct sock *sk, u8 ca_state); 19void tcp_vegas_state(struct sock *sk, u8 ca_state);
20void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us); 20void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us);
21void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event); 21void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event);
-void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb);
+size_t tcp_vegas_get_info(struct sock *sk, u32 ext, int *attr,
+			  union tcp_cc_info *info);
23 24
24#endif /* __TCP_VEGAS_H */ 25#endif /* __TCP_VEGAS_H */
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
index bb63fba47d47..c10732e39837 100644
--- a/net/ipv4/tcp_westwood.c
+++ b/net/ipv4/tcp_westwood.c
@@ -256,20 +256,21 @@ static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event)
 }
 
 /* Extract info for Tcp socket info provided via netlink. */
-static void tcp_westwood_info(struct sock *sk, u32 ext,
-			      struct sk_buff *skb)
+static size_t tcp_westwood_info(struct sock *sk, u32 ext, int *attr,
+				union tcp_cc_info *info)
 {
 	const struct westwood *ca = inet_csk_ca(sk);
 
 	if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
-		struct tcpvegas_info info = {
-			.tcpv_enabled = 1,
-			.tcpv_rtt = jiffies_to_usecs(ca->rtt),
-			.tcpv_minrtt = jiffies_to_usecs(ca->rtt_min),
-		};
+		info->vegas.tcpv_enabled = 1;
+		info->vegas.tcpv_rttcnt = 0;
+		info->vegas.tcpv_rtt = jiffies_to_usecs(ca->rtt),
+		info->vegas.tcpv_minrtt = jiffies_to_usecs(ca->rtt_min),
 
-		nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info);
+		*attr = INET_DIAG_VEGASINFO;
+		return sizeof(struct tcpvegas_info);
 	}
+	return 0;
 }
274 275
275static struct tcp_congestion_ops tcp_westwood __read_mostly = { 276static struct tcp_congestion_ops tcp_westwood __read_mostly = {
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index b5e6cc1d4a73..a38d3ac0f18f 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -1246,7 +1246,6 @@ static void ip6gre_tunnel_setup(struct net_device *dev)
1246static int ip6gre_tunnel_init(struct net_device *dev) 1246static int ip6gre_tunnel_init(struct net_device *dev)
1247{ 1247{
1248 struct ip6_tnl *tunnel; 1248 struct ip6_tnl *tunnel;
1249 int i;
1250 1249
1251 tunnel = netdev_priv(dev); 1250 tunnel = netdev_priv(dev);
1252 1251
@@ -1260,16 +1259,10 @@ static int ip6gre_tunnel_init(struct net_device *dev)
1260 if (ipv6_addr_any(&tunnel->parms.raddr)) 1259 if (ipv6_addr_any(&tunnel->parms.raddr))
1261 dev->header_ops = &ip6gre_header_ops; 1260 dev->header_ops = &ip6gre_header_ops;
1262 1261
-	dev->tstats = alloc_percpu(struct pcpu_sw_netstats);
+	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
1264 if (!dev->tstats) 1263 if (!dev->tstats)
1265 return -ENOMEM; 1264 return -ENOMEM;
1266 1265
1267 for_each_possible_cpu(i) {
1268 struct pcpu_sw_netstats *ip6gre_tunnel_stats;
1269 ip6gre_tunnel_stats = per_cpu_ptr(dev->tstats, i);
1270 u64_stats_init(&ip6gre_tunnel_stats->syncp);
1271 }
1272
1273 return 0; 1266 return 0;
1274} 1267}
1275 1268
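netdev_alloc_pcpu_stats() absorbs exactly the boilerplate deleted above: it allocates the per-CPU stats structure and runs u64_stats_init() on each possible CPU's syncp. Approximately what the helper expands to, paraphrased (not verbatim) from include/linux/netdevice.h of this era:

#define netdev_alloc_pcpu_stats(type)                           \
({                                                              \
        typeof(type) __percpu *pcpu_stats = alloc_percpu(type); \
        if (pcpu_stats) {                                       \
                int __cpu;                                      \
                for_each_possible_cpu(__cpu) {                  \
                        typeof(type) *stat;                     \
                        stat = per_cpu_ptr(pcpu_stats, __cpu);  \
                        u64_stats_init(&stat->syncp);           \
                }                                               \
        }                                                       \
        pcpu_stats;                                             \
})

Using the helper removes the local loop variable and keeps every driver's stats setup consistent.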
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 7fde1f265c90..c21777565c58 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -886,22 +886,45 @@ static int ip6_dst_lookup_tail(struct sock *sk,
 #endif
 	int err;
 
-	if (!*dst)
-		*dst = ip6_route_output(net, sk, fl6);
-
-	err = (*dst)->error;
-	if (err)
-		goto out_err_release;
+	/* The correct way to handle this would be to do
+	 * ip6_route_get_saddr, and then ip6_route_output; however,
+	 * the route-specific preferred source forces the
+	 * ip6_route_output call _before_ ip6_route_get_saddr.
+	 *
+	 * In source specific routing (no src=any default route),
+	 * ip6_route_output will fail given src=any saddr, though, so
+	 * that's why we try it again later.
+	 */
+	if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
+		struct rt6_info *rt;
+		bool had_dst = *dst != NULL;
 
-	if (ipv6_addr_any(&fl6->saddr)) {
-		struct rt6_info *rt = (struct rt6_info *) *dst;
+		if (!had_dst)
+			*dst = ip6_route_output(net, sk, fl6);
+		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
 		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
 					  sk ? inet6_sk(sk)->srcprefs : 0,
 					  &fl6->saddr);
 		if (err)
 			goto out_err_release;
+
+		/* If we had an erroneous initial result, pretend it
+		 * never existed and let the SA-enabled version take
+		 * over.
+		 */
+		if (!had_dst && (*dst)->error) {
+			dst_release(*dst);
+			*dst = NULL;
+		}
 	}
 
+	if (!*dst)
+		*dst = ip6_route_output(net, sk, fl6);
+
+	err = (*dst)->error;
+	if (err)
+		goto out_err_release;
+
 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
 	/*
 	 * Here if the dst entry we've looked up
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 5c48293ff062..d3588885f097 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2245,9 +2245,10 @@ int ip6_route_get_saddr(struct net *net,
2245 unsigned int prefs, 2245 unsigned int prefs,
2246 struct in6_addr *saddr) 2246 struct in6_addr *saddr)
2247{ 2247{
-	struct inet6_dev *idev = ip6_dst_idev((struct dst_entry *)rt);
+	struct inet6_dev *idev =
+			rt ? ip6_dst_idev((struct dst_entry *)rt) : NULL;
 	int err = 0;
-	if (rt->rt6i_prefsrc.plen)
+	if (rt && rt->rt6i_prefsrc.plen)
2251 *saddr = rt->rt6i_prefsrc.addr; 2252 *saddr = rt->rt6i_prefsrc.addr;
2252 else 2253 else
2253 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL, 2254 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index ad51df85aa00..b6575d665568 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -946,7 +946,8 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb)
946 &ipv6_hdr(skb)->daddr, tcp_v6_iif(skb)); 946 &ipv6_hdr(skb)->daddr, tcp_v6_iif(skb));
947 if (req) { 947 if (req) {
948 nsk = tcp_check_req(sk, skb, req, false); 948 nsk = tcp_check_req(sk, skb, req, false);
-		reqsk_put(req);
+		if (!nsk)
+			reqsk_put(req);
950 return nsk; 951 return nsk;
951 } 952 }
952 nsk = __inet6_lookup_established(sock_net(sk), &tcp_hashinfo, 953 nsk = __inet6_lookup_established(sock_net(sk), &tcp_hashinfo,
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index b4ac596a7cb7..bab5c63c0bad 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -819,13 +819,15 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata,
819 * (because if we remove a STA after ops->remove_interface() 819 * (because if we remove a STA after ops->remove_interface()
820 * the driver will have removed the vif info already!) 820 * the driver will have removed the vif info already!)
821 * 821 *
-	 * This is relevant only in WDS mode, in all other modes we've
-	 * already removed all stations when disconnecting or similar,
-	 * so warn otherwise.
+	 * In WDS mode a station must exist here and be flushed, for
+	 * AP_VLANs stations may exist since there's nothing else that
+	 * would have removed them, but in other modes there shouldn't
+	 * be any stations.
 	 */
 	flushed = sta_info_flush(sdata);
-	WARN_ON_ONCE((sdata->vif.type != NL80211_IFTYPE_WDS && flushed > 0) ||
-		     (sdata->vif.type == NL80211_IFTYPE_WDS && flushed != 1));
+	WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_AP_VLAN &&
+		     ((sdata->vif.type != NL80211_IFTYPE_WDS && flushed > 0) ||
+		      (sdata->vif.type == NL80211_IFTYPE_WDS && flushed != 1)));
829 831
830 /* don't count this interface for promisc/allmulti while it is down */ 832 /* don't count this interface for promisc/allmulti while it is down */
831 if (sdata->flags & IEEE80211_SDATA_ALLMULTI) 833 if (sdata->flags & IEEE80211_SDATA_ALLMULTI)
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 12971b71d0fa..2880f2ae99ab 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -66,6 +66,7 @@
66 66
67static const struct rhashtable_params sta_rht_params = { 67static const struct rhashtable_params sta_rht_params = {
68 .nelem_hint = 3, /* start small */ 68 .nelem_hint = 3, /* start small */
69 .automatic_shrinking = true,
69 .head_offset = offsetof(struct sta_info, hash_node), 70 .head_offset = offsetof(struct sta_info, hash_node),
70 .key_offset = offsetof(struct sta_info, sta.addr), 71 .key_offset = offsetof(struct sta_info, sta.addr),
71 .key_len = ETH_ALEN, 72 .key_len = ETH_ALEN,
@@ -157,8 +158,24 @@ struct sta_info *sta_info_get(struct ieee80211_sub_if_data *sdata,
157 const u8 *addr) 158 const u8 *addr)
158{ 159{
159 struct ieee80211_local *local = sdata->local; 160 struct ieee80211_local *local = sdata->local;
161 struct sta_info *sta;
162 struct rhash_head *tmp;
163 const struct bucket_table *tbl;
164
165 rcu_read_lock();
166 tbl = rht_dereference_rcu(local->sta_hash.tbl, &local->sta_hash);
160 167
-	return rhashtable_lookup_fast(&local->sta_hash, addr, sta_rht_params);
+	for_each_sta_info(local, tbl, addr, sta, tmp) {
169 if (sta->sdata == sdata) {
170 rcu_read_unlock();
171 /* this is safe as the caller must already hold
172 * another rcu read section or the mutex
173 */
174 return sta;
175 }
176 }
177 rcu_read_unlock();
178 return NULL;
162} 179}
163 180
164/* 181/*
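sta_info_get() drops rhashtable_lookup_fast() because the station hash is keyed on the MAC address alone, and the same address may legitimately exist once per interface; the lookup therefore has to walk every entry for the key and filter on sta->sdata. A self-contained sketch of the walk-and-filter idea over a plain chain (names are illustrative; the kernel's for_each_sta_info() iterates an rhashtable bucket instead):

#include <stdio.h>
#include <string.h>

struct sta {
        const char *owner;              /* stands in for sta->sdata */
        unsigned char addr[6];          /* the (non-unique) hash key */
        struct sta *next;               /* stands in for the hash chain */
};

/* visit every entry whose key matches, keep the one with our owner */
static struct sta *sta_get(struct sta *chain, const unsigned char *addr,
                           const char *owner)
{
        for (; chain; chain = chain->next)
                if (!memcmp(chain->addr, addr, 6) &&
                    !strcmp(chain->owner, owner))
                        return chain;
        return NULL;
}

int main(void)
{
        unsigned char mac[6] = { 2, 0, 0, 0, 0, 1 };
        struct sta b = { "wlan1", { 2, 0, 0, 0, 0, 1 }, NULL };
        struct sta a = { "wlan0", { 2, 0, 0, 0, 0, 1 }, &b };

        printf("%s\n", sta_get(&a, mac, "wlan1")->owner);
        return 0;
}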
diff --git a/net/mac802154/cfg.c b/net/mac802154/cfg.c
index 5d9f68c75e5f..70be9c799f8a 100644
--- a/net/mac802154/cfg.c
+++ b/net/mac802154/cfg.c
@@ -22,13 +22,14 @@
22 22
23static struct net_device * 23static struct net_device *
24ieee802154_add_iface_deprecated(struct wpan_phy *wpan_phy, 24ieee802154_add_iface_deprecated(struct wpan_phy *wpan_phy,
-				const char *name, int type)
+				const char *name,
+				unsigned char name_assign_type, int type)
26{ 27{
27 struct ieee802154_local *local = wpan_phy_priv(wpan_phy); 28 struct ieee802154_local *local = wpan_phy_priv(wpan_phy);
28 struct net_device *dev; 29 struct net_device *dev;
29 30
30 rtnl_lock(); 31 rtnl_lock();
-	dev = ieee802154_if_add(local, name, type,
+	dev = ieee802154_if_add(local, name, name_assign_type, type,
32 cpu_to_le64(0x0000000000000000ULL)); 33 cpu_to_le64(0x0000000000000000ULL));
33 rtnl_unlock(); 34 rtnl_unlock();
34 35
@@ -45,12 +46,14 @@ static void ieee802154_del_iface_deprecated(struct wpan_phy *wpan_phy,
45 46
46static int 47static int
47ieee802154_add_iface(struct wpan_phy *phy, const char *name, 48ieee802154_add_iface(struct wpan_phy *phy, const char *name,
49 unsigned char name_assign_type,
48 enum nl802154_iftype type, __le64 extended_addr) 50 enum nl802154_iftype type, __le64 extended_addr)
49{ 51{
50 struct ieee802154_local *local = wpan_phy_priv(phy); 52 struct ieee802154_local *local = wpan_phy_priv(phy);
51 struct net_device *err; 53 struct net_device *err;
52 54
-	err = ieee802154_if_add(local, name, type, extended_addr);
+	err = ieee802154_if_add(local, name, name_assign_type, type,
+				extended_addr);
54 return PTR_ERR_OR_ZERO(err); 57 return PTR_ERR_OR_ZERO(err);
55} 58}
56 59
diff --git a/net/mac802154/ieee802154_i.h b/net/mac802154/ieee802154_i.h
index bebd70ffc7a3..127ba18386fc 100644
--- a/net/mac802154/ieee802154_i.h
+++ b/net/mac802154/ieee802154_i.h
@@ -182,7 +182,8 @@ void ieee802154_iface_exit(void);
182void ieee802154_if_remove(struct ieee802154_sub_if_data *sdata); 182void ieee802154_if_remove(struct ieee802154_sub_if_data *sdata);
183struct net_device * 183struct net_device *
184ieee802154_if_add(struct ieee802154_local *local, const char *name, 184ieee802154_if_add(struct ieee802154_local *local, const char *name,
-		  enum nl802154_iftype type, __le64 extended_addr);
+		  unsigned char name_assign_type, enum nl802154_iftype type,
+		  __le64 extended_addr);
186void ieee802154_remove_interfaces(struct ieee802154_local *local); 187void ieee802154_remove_interfaces(struct ieee802154_local *local);
187 188
188#endif /* __IEEE802154_I_H */ 189#endif /* __IEEE802154_I_H */
diff --git a/net/mac802154/iface.c b/net/mac802154/iface.c
index 38b56f9d9386..91b75abbd1a1 100644
--- a/net/mac802154/iface.c
+++ b/net/mac802154/iface.c
@@ -522,7 +522,8 @@ ieee802154_setup_sdata(struct ieee802154_sub_if_data *sdata,
522 522
523struct net_device * 523struct net_device *
524ieee802154_if_add(struct ieee802154_local *local, const char *name, 524ieee802154_if_add(struct ieee802154_local *local, const char *name,
-		  enum nl802154_iftype type, __le64 extended_addr)
+		  unsigned char name_assign_type, enum nl802154_iftype type,
+		  __le64 extended_addr)
526{ 527{
527 struct net_device *ndev = NULL; 528 struct net_device *ndev = NULL;
528 struct ieee802154_sub_if_data *sdata = NULL; 529 struct ieee802154_sub_if_data *sdata = NULL;
@@ -531,7 +532,7 @@ ieee802154_if_add(struct ieee802154_local *local, const char *name,
531 ASSERT_RTNL(); 532 ASSERT_RTNL();
532 533
533 ndev = alloc_netdev(sizeof(*sdata) + local->hw.vif_data_size, name, 534 ndev = alloc_netdev(sizeof(*sdata) + local->hw.vif_data_size, name,
-			    NET_NAME_UNKNOWN, ieee802154_if_setup);
+			    name_assign_type, ieee802154_if_setup);
535 if (!ndev) 536 if (!ndev)
536 return ERR_PTR(-ENOMEM); 537 return ERR_PTR(-ENOMEM);
537 538
diff --git a/net/mac802154/llsec.c b/net/mac802154/llsec.c
index dcf73958133a..5b2be12832e6 100644
--- a/net/mac802154/llsec.c
+++ b/net/mac802154/llsec.c
@@ -134,7 +134,7 @@ llsec_key_alloc(const struct ieee802154_llsec_key *template)
134 for (i = 0; i < ARRAY_SIZE(key->tfm); i++) { 134 for (i = 0; i < ARRAY_SIZE(key->tfm); i++) {
135 key->tfm[i] = crypto_alloc_aead("ccm(aes)", 0, 135 key->tfm[i] = crypto_alloc_aead("ccm(aes)", 0,
136 CRYPTO_ALG_ASYNC); 136 CRYPTO_ALG_ASYNC);
-		if (!key->tfm[i])
+		if (IS_ERR(key->tfm[i]))
138 goto err_tfm; 138 goto err_tfm;
139 if (crypto_aead_setkey(key->tfm[i], template->key, 139 if (crypto_aead_setkey(key->tfm[i], template->key,
140 IEEE802154_LLSEC_KEY_SIZE)) 140 IEEE802154_LLSEC_KEY_SIZE))
@@ -144,7 +144,7 @@ llsec_key_alloc(const struct ieee802154_llsec_key *template)
144 } 144 }
145 145
146 key->tfm0 = crypto_alloc_blkcipher("ctr(aes)", 0, CRYPTO_ALG_ASYNC); 146 key->tfm0 = crypto_alloc_blkcipher("ctr(aes)", 0, CRYPTO_ALG_ASYNC);
-	if (!key->tfm0)
+	if (IS_ERR(key->tfm0))
148 goto err_tfm; 148 goto err_tfm;
149 149
150 if (crypto_blkcipher_setkey(key->tfm0, template->key, 150 if (crypto_blkcipher_setkey(key->tfm0, template->key,
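The llsec change fixes the classic ERR_PTR pitfall: crypto_alloc_aead() and crypto_alloc_blkcipher() report failure as an error value encoded in the pointer itself, never as NULL, so an `if (!ptr)` test always passes and the failure leaks through. A runnable userspace sketch of the encoding, with constants simplified from the kernel's err.h:

#include <stdio.h>

/* errors live in the top 4095 values of the address space, so a
 * failed allocation is non-NULL and only IS_ERR() can spot it
 */
#define MAX_ERRNO       4095
#define ERR_PTR(err)    ((void *)(long)(err))
#define IS_ERR(ptr)     ((unsigned long)(ptr) >= (unsigned long)-MAX_ERRNO)
#define PTR_ERR(ptr)    ((long)(ptr))

static void *alloc_tfm(int fail)
{
        return fail ? ERR_PTR(-12 /* ENOMEM */) : (void *)0x1000;
}

int main(void)
{
        void *tfm = alloc_tfm(1);

        if (!tfm)               /* never true for ERR_PTR returns: the bug */
                printf("NULL check caught it\n");
        if (IS_ERR(tfm))        /* the fixed check */
                printf("IS_ERR caught error %ld\n", PTR_ERR(tfm));
        return 0;
}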
diff --git a/net/mac802154/main.c b/net/mac802154/main.c
index 8500378c8318..08cb32dc8fd3 100644
--- a/net/mac802154/main.c
+++ b/net/mac802154/main.c
@@ -161,18 +161,21 @@ int ieee802154_register_hw(struct ieee802154_hw *hw)
161 161
162 rtnl_lock(); 162 rtnl_lock();
163 163
-	dev = ieee802154_if_add(local, "wpan%d", NL802154_IFTYPE_NODE,
+	dev = ieee802154_if_add(local, "wpan%d", NET_NAME_ENUM,
+				NL802154_IFTYPE_NODE,
165 cpu_to_le64(0x0000000000000000ULL)); 166 cpu_to_le64(0x0000000000000000ULL));
166 if (IS_ERR(dev)) { 167 if (IS_ERR(dev)) {
167 rtnl_unlock(); 168 rtnl_unlock();
168 rc = PTR_ERR(dev); 169 rc = PTR_ERR(dev);
-		goto out_wq;
+		goto out_phy;
170 } 171 }
171 172
172 rtnl_unlock(); 173 rtnl_unlock();
173 174
174 return 0; 175 return 0;
175 176
177out_phy:
178 wpan_phy_unregister(local->phy);
176out_wq: 179out_wq:
177 destroy_workqueue(local->workqueue); 180 destroy_workqueue(local->workqueue);
178out: 181out:
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index db8a2ea6d4de..7b3f732269e4 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -53,6 +53,11 @@ static struct mpls_route *mpls_route_input_rcu(struct net *net, unsigned index)
53 return rt; 53 return rt;
54} 54}
55 55
56static inline struct mpls_dev *mpls_dev_get(const struct net_device *dev)
57{
58 return rcu_dereference_rtnl(dev->mpls_ptr);
59}
60
56static bool mpls_output_possible(const struct net_device *dev) 61static bool mpls_output_possible(const struct net_device *dev)
57{ 62{
58 return dev && (dev->flags & IFF_UP) && netif_carrier_ok(dev); 63 return dev && (dev->flags & IFF_UP) && netif_carrier_ok(dev);
@@ -136,6 +141,7 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
136 struct mpls_route *rt; 141 struct mpls_route *rt;
137 struct mpls_entry_decoded dec; 142 struct mpls_entry_decoded dec;
138 struct net_device *out_dev; 143 struct net_device *out_dev;
144 struct mpls_dev *mdev;
139 unsigned int hh_len; 145 unsigned int hh_len;
140 unsigned int new_header_size; 146 unsigned int new_header_size;
141 unsigned int mtu; 147 unsigned int mtu;
@@ -143,6 +149,10 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
143 149
144 /* Careful this entire function runs inside of an rcu critical section */ 150 /* Careful this entire function runs inside of an rcu critical section */
145 151
152 mdev = mpls_dev_get(dev);
153 if (!mdev || !mdev->input_enabled)
154 goto drop;
155
146 if (skb->pkt_type != PACKET_HOST) 156 if (skb->pkt_type != PACKET_HOST)
147 goto drop; 157 goto drop;
148 158
@@ -352,9 +362,9 @@ static int mpls_route_add(struct mpls_route_config *cfg)
352 if (!dev) 362 if (!dev)
353 goto errout; 363 goto errout;
354 364
-	/* For now just support ethernet devices */
+	/* Ensure this is a supported device */
 	err = -EINVAL;
-	if ((dev->type != ARPHRD_ETHER) && (dev->type != ARPHRD_LOOPBACK))
+	if (!mpls_dev_get(dev))
358 goto errout; 368 goto errout;
359 369
360 err = -EINVAL; 370 err = -EINVAL;
@@ -428,10 +438,89 @@ errout:
428 return err; 438 return err;
429} 439}
430 440
441#define MPLS_PERDEV_SYSCTL_OFFSET(field) \
442 (&((struct mpls_dev *)0)->field)
443
444static const struct ctl_table mpls_dev_table[] = {
445 {
446 .procname = "input",
447 .maxlen = sizeof(int),
448 .mode = 0644,
449 .proc_handler = proc_dointvec,
450 .data = MPLS_PERDEV_SYSCTL_OFFSET(input_enabled),
451 },
452 { }
453};
454
455static int mpls_dev_sysctl_register(struct net_device *dev,
456 struct mpls_dev *mdev)
457{
458 char path[sizeof("net/mpls/conf/") + IFNAMSIZ];
459 struct ctl_table *table;
460 int i;
461
462 table = kmemdup(&mpls_dev_table, sizeof(mpls_dev_table), GFP_KERNEL);
463 if (!table)
464 goto out;
465
466 /* Table data contains only offsets relative to the base of
467 * the mdev at this point, so make them absolute.
468 */
469 for (i = 0; i < ARRAY_SIZE(mpls_dev_table); i++)
470 table[i].data = (char *)mdev + (uintptr_t)table[i].data;
471
472 snprintf(path, sizeof(path), "net/mpls/conf/%s", dev->name);
473
474 mdev->sysctl = register_net_sysctl(dev_net(dev), path, table);
475 if (!mdev->sysctl)
476 goto free;
477
478 return 0;
479
480free:
481 kfree(table);
482out:
483 return -ENOBUFS;
484}
485
486static void mpls_dev_sysctl_unregister(struct mpls_dev *mdev)
487{
488 struct ctl_table *table;
489
490 table = mdev->sysctl->ctl_table_arg;
491 unregister_net_sysctl_table(mdev->sysctl);
492 kfree(table);
493}
494
495static struct mpls_dev *mpls_add_dev(struct net_device *dev)
496{
497 struct mpls_dev *mdev;
498 int err = -ENOMEM;
499
500 ASSERT_RTNL();
501
502 mdev = kzalloc(sizeof(*mdev), GFP_KERNEL);
503 if (!mdev)
504 return ERR_PTR(err);
505
506 err = mpls_dev_sysctl_register(dev, mdev);
507 if (err)
508 goto free;
509
510 rcu_assign_pointer(dev->mpls_ptr, mdev);
511
512 return mdev;
513
514free:
515 kfree(mdev);
516 return ERR_PTR(err);
517}
518
431static void mpls_ifdown(struct net_device *dev) 519static void mpls_ifdown(struct net_device *dev)
432{ 520{
433 struct mpls_route __rcu **platform_label; 521 struct mpls_route __rcu **platform_label;
434 struct net *net = dev_net(dev); 522 struct net *net = dev_net(dev);
523 struct mpls_dev *mdev;
435 unsigned index; 524 unsigned index;
436 525
437 platform_label = rtnl_dereference(net->mpls.platform_label); 526 platform_label = rtnl_dereference(net->mpls.platform_label);
@@ -443,14 +532,35 @@ static void mpls_ifdown(struct net_device *dev)
443 continue; 532 continue;
444 rt->rt_dev = NULL; 533 rt->rt_dev = NULL;
445 } 534 }
535
536 mdev = mpls_dev_get(dev);
537 if (!mdev)
538 return;
539
540 mpls_dev_sysctl_unregister(mdev);
541
542 RCU_INIT_POINTER(dev->mpls_ptr, NULL);
543
544 kfree(mdev);
446} 545}
447 546
448static int mpls_dev_notify(struct notifier_block *this, unsigned long event, 547static int mpls_dev_notify(struct notifier_block *this, unsigned long event,
449 void *ptr) 548 void *ptr)
450{ 549{
451 struct net_device *dev = netdev_notifier_info_to_dev(ptr); 550 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
551 struct mpls_dev *mdev;
452 552
453 switch(event) { 553 switch(event) {
554 case NETDEV_REGISTER:
555 /* For now just support ethernet devices */
556 if ((dev->type == ARPHRD_ETHER) ||
557 (dev->type == ARPHRD_LOOPBACK)) {
558 mdev = mpls_add_dev(dev);
559 if (IS_ERR(mdev))
560 return notifier_from_errno(PTR_ERR(mdev));
561 }
562 break;
563
454 case NETDEV_UNREGISTER: 564 case NETDEV_UNREGISTER:
455 mpls_ifdown(dev); 565 mpls_ifdown(dev);
456 break; 566 break;
@@ -536,6 +646,15 @@ int nla_get_labels(const struct nlattr *nla,
536 if ((dec.bos != bos) || dec.ttl || dec.tc) 646 if ((dec.bos != bos) || dec.ttl || dec.tc)
537 return -EINVAL; 647 return -EINVAL;
538 648
649 switch (dec.label) {
650 case MPLS_LABEL_IMPLNULL:
651 /* RFC3032: This is a label that an LSR may
652 * assign and distribute, but which never
653 * actually appears in the encapsulation.
654 */
655 return -EINVAL;
656 }
657
539 label[i] = dec.label; 658 label[i] = dec.label;
540 } 659 }
541 *labels = nla_labels; 660 *labels = nla_labels;
@@ -816,7 +935,7 @@ static int resize_platform_label_table(struct net *net, size_t limit)
816 } 935 }
817 936
818 /* In case the predefined labels need to be populated */ 937 /* In case the predefined labels need to be populated */
-	if (limit > LABEL_IPV4_EXPLICIT_NULL) {
+	if (limit > MPLS_LABEL_IPV4NULL) {
820 struct net_device *lo = net->loopback_dev; 939 struct net_device *lo = net->loopback_dev;
821 rt0 = mpls_rt_alloc(lo->addr_len); 940 rt0 = mpls_rt_alloc(lo->addr_len);
822 if (!rt0) 941 if (!rt0)
@@ -826,7 +945,7 @@ static int resize_platform_label_table(struct net *net, size_t limit)
826 rt0->rt_via_table = NEIGH_LINK_TABLE; 945 rt0->rt_via_table = NEIGH_LINK_TABLE;
827 memcpy(rt0->rt_via, lo->dev_addr, lo->addr_len); 946 memcpy(rt0->rt_via, lo->dev_addr, lo->addr_len);
828 } 947 }
-	if (limit > LABEL_IPV6_EXPLICIT_NULL) {
+	if (limit > MPLS_LABEL_IPV6NULL) {
830 struct net_device *lo = net->loopback_dev; 949 struct net_device *lo = net->loopback_dev;
831 rt2 = mpls_rt_alloc(lo->addr_len); 950 rt2 = mpls_rt_alloc(lo->addr_len);
832 if (!rt2) 951 if (!rt2)
@@ -854,15 +973,15 @@ static int resize_platform_label_table(struct net *net, size_t limit)
854 memcpy(labels, old, cp_size); 973 memcpy(labels, old, cp_size);
855 974
856 /* If needed set the predefined labels */ 975 /* If needed set the predefined labels */
-	if ((old_limit <= LABEL_IPV6_EXPLICIT_NULL) &&
-	    (limit > LABEL_IPV6_EXPLICIT_NULL)) {
-		RCU_INIT_POINTER(labels[LABEL_IPV6_EXPLICIT_NULL], rt2);
+	if ((old_limit <= MPLS_LABEL_IPV6NULL) &&
+	    (limit > MPLS_LABEL_IPV6NULL)) {
+		RCU_INIT_POINTER(labels[MPLS_LABEL_IPV6NULL], rt2);
860 rt2 = NULL; 979 rt2 = NULL;
861 } 980 }
862 981
-	if ((old_limit <= LABEL_IPV4_EXPLICIT_NULL) &&
-	    (limit > LABEL_IPV4_EXPLICIT_NULL)) {
-		RCU_INIT_POINTER(labels[LABEL_IPV4_EXPLICIT_NULL], rt0);
+	if ((old_limit <= MPLS_LABEL_IPV4NULL) &&
+	    (limit > MPLS_LABEL_IPV4NULL)) {
+		RCU_INIT_POINTER(labels[MPLS_LABEL_IPV4NULL], rt0);
866 rt0 = NULL; 985 rt0 = NULL;
867 } 986 }
868 987
@@ -912,7 +1031,7 @@ static int mpls_platform_labels(struct ctl_table *table, int write,
912 return ret; 1031 return ret;
913} 1032}
914 1033
-static struct ctl_table mpls_table[] = {
+static const struct ctl_table mpls_table[] = {
916 { 1035 {
917 .procname = "platform_labels", 1036 .procname = "platform_labels",
918 .data = NULL, 1037 .data = NULL,
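Two details of the per-device MPLS plumbing above are worth spelling out. Registration now happens from the NETDEV_REGISTER notifier, so mpls_route_add() only needs to ask whether the device has an mpls_dev attached instead of re-checking ARPHRD types, and forwarding stays off until the new per-device sysctl (for example `sysctl -w net.mpls.conf.eth0.input=1`) enables it. The sysctl template stores member offsets in .data via MPLS_PERDEV_SYSCTL_OFFSET and rebases them onto each device's mdev at registration, which is what lets one static table serve every device. A runnable userspace sketch of that offset-rebasing trick (struct and field names are illustrative):

#include <stdio.h>
#include <stdint.h>
#include <string.h>

struct mpls_dev_demo { int input_enabled; };
struct ctl_entry { const char *name; void *data; };

/* stash the member offset in .data, as MPLS_PERDEV_SYSCTL_OFFSET does */
#define FIELD_OFFSET(field) ((void *)&((struct mpls_dev_demo *)0)->field)

static const struct ctl_entry template[] = {
        { "input", FIELD_OFFSET(input_enabled) },
};

int main(void)
{
        struct mpls_dev_demo mdev = { 0 };
        struct ctl_entry tbl[1];

        /* rebase the relative offset onto this device's private struct */
        memcpy(tbl, template, sizeof(template));
        tbl[0].data = (char *)&mdev + (uintptr_t)template[0].data;

        *(int *)tbl[0].data = 1;        /* the sysctl write lands here */
        printf("input_enabled = %d\n", mdev.input_enabled);
        return 0;
}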
diff --git a/net/mpls/internal.h b/net/mpls/internal.h
index fb6de92052c4..b064c345042c 100644
--- a/net/mpls/internal.h
+++ b/net/mpls/internal.h
@@ -1,16 +1,6 @@
1#ifndef MPLS_INTERNAL_H 1#ifndef MPLS_INTERNAL_H
2#define MPLS_INTERNAL_H 2#define MPLS_INTERNAL_H
3 3
4#define LABEL_IPV4_EXPLICIT_NULL 0 /* RFC3032 */
5#define LABEL_ROUTER_ALERT_LABEL 1 /* RFC3032 */
6#define LABEL_IPV6_EXPLICIT_NULL 2 /* RFC3032 */
7#define LABEL_IMPLICIT_NULL 3 /* RFC3032 */
8#define LABEL_ENTROPY_INDICATOR 7 /* RFC6790 */
9#define LABEL_GAL 13 /* RFC5586 */
10#define LABEL_OAM_ALERT 14 /* RFC3429 */
11#define LABEL_EXTENSION 15 /* RFC7274 */
12
13
14struct mpls_shim_hdr { 4struct mpls_shim_hdr {
15 __be32 label_stack_entry; 5 __be32 label_stack_entry;
16}; 6};
@@ -22,6 +12,12 @@ struct mpls_entry_decoded {
22 u8 bos; 12 u8 bos;
23}; 13};
24 14
15struct mpls_dev {
16 int input_enabled;
17
18 struct ctl_table_header *sysctl;
19};
20
25struct sk_buff; 21struct sk_buff;
26 22
27static inline struct mpls_shim_hdr *mpls_hdr(const struct sk_buff *skb) 23static inline struct mpls_shim_hdr *mpls_hdr(const struct sk_buff *skb)
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 78af83bc9c8e..ad9d11fb29fd 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -4340,7 +4340,6 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data,
4340 case NFT_CONTINUE: 4340 case NFT_CONTINUE:
4341 case NFT_BREAK: 4341 case NFT_BREAK:
4342 case NFT_RETURN: 4342 case NFT_RETURN:
4343 desc->len = sizeof(data->verdict);
4344 break; 4343 break;
4345 case NFT_JUMP: 4344 case NFT_JUMP:
4346 case NFT_GOTO: 4345 case NFT_GOTO:
@@ -4355,10 +4354,10 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data,
4355 4354
4356 chain->use++; 4355 chain->use++;
4357 data->verdict.chain = chain; 4356 data->verdict.chain = chain;
4358 desc->len = sizeof(data);
4359 break; 4357 break;
4360 } 4358 }
4361 4359
4360 desc->len = sizeof(data->verdict);
4362 desc->type = NFT_DATA_VERDICT; 4361 desc->type = NFT_DATA_VERDICT;
4363 return 0; 4362 return 0;
4364} 4363}
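The nf_tables hunk also quietly fixes a sizeof bug: the removed NFT_JUMP/NFT_GOTO branch set desc->len = sizeof(data), which measures the struct nft_data *pointer* (4 or 8 bytes) rather than the verdict payload; hoisting one desc->len = sizeof(data->verdict) below the switch gives every verdict the correct length. A minimal runnable demonstration of the difference (struct layout is illustrative):

#include <stdio.h>

struct verdict { unsigned int code; void *chain; };
struct nft_data_demo { struct verdict verdict; unsigned char buf[16]; };

int main(void)
{
        struct nft_data_demo d, *data = &d;

        /* sizeof(data) measures the pointer, not the payload */
        printf("sizeof(data)          = %zu\n", sizeof(data));
        printf("sizeof(data->verdict) = %zu\n", sizeof(data->verdict));
        return 0;
}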
diff --git a/net/netfilter/nft_reject.c b/net/netfilter/nft_reject.c
index 57d3e1af5630..0522fc9bfb0a 100644
--- a/net/netfilter/nft_reject.c
+++ b/net/netfilter/nft_reject.c
@@ -63,6 +63,8 @@ int nft_reject_dump(struct sk_buff *skb, const struct nft_expr *expr)
63 if (nla_put_u8(skb, NFTA_REJECT_ICMP_CODE, priv->icmp_code)) 63 if (nla_put_u8(skb, NFTA_REJECT_ICMP_CODE, priv->icmp_code))
64 goto nla_put_failure; 64 goto nla_put_failure;
65 break; 65 break;
66 default:
67 break;
66 } 68 }
67 69
68 return 0; 70 return 0;
diff --git a/net/netfilter/nft_reject_inet.c b/net/netfilter/nft_reject_inet.c
index 62cabee42fbe..635dbba93d01 100644
--- a/net/netfilter/nft_reject_inet.c
+++ b/net/netfilter/nft_reject_inet.c
@@ -108,6 +108,8 @@ static int nft_reject_inet_dump(struct sk_buff *skb,
108 if (nla_put_u8(skb, NFTA_REJECT_ICMP_CODE, priv->icmp_code)) 108 if (nla_put_u8(skb, NFTA_REJECT_ICMP_CODE, priv->icmp_code))
109 goto nla_put_failure; 109 goto nla_put_failure;
110 break; 110 break;
111 default:
112 break;
111 } 113 }
112 114
113 return 0; 115 return 0;
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 19909d0786a2..daa0b818174b 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -1629,13 +1629,11 @@ static struct sk_buff *netlink_alloc_large_skb(unsigned int size,
1629 if (data == NULL) 1629 if (data == NULL)
1630 return NULL; 1630 return NULL;
1631 1631
-	skb = build_skb(data, size);
+	skb = __build_skb(data, size);
 	if (skb == NULL)
 		vfree(data);
-	else {
-		skb->head_frag = 0;
+	else
 		skb->destructor = netlink_skb_destructor;
-	}
1639 1637
1640 return skb; 1638 return skb;
1641} 1639}
@@ -3141,7 +3139,6 @@ static const struct rhashtable_params netlink_rhashtable_params = {
3141 .key_len = netlink_compare_arg_len, 3139 .key_len = netlink_compare_arg_len,
3142 .obj_hashfn = netlink_hash, 3140 .obj_hashfn = netlink_hash,
3143 .obj_cmpfn = netlink_compare, 3141 .obj_cmpfn = netlink_compare,
3144 .max_size = 65536,
3145 .automatic_shrinking = true, 3142 .automatic_shrinking = true,
3146}; 3143};
3147 3144
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 5102c3cc4eec..b5989c6ee551 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -2311,11 +2311,14 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
2311 tlen = dev->needed_tailroom; 2311 tlen = dev->needed_tailroom;
2312 skb = sock_alloc_send_skb(&po->sk, 2312 skb = sock_alloc_send_skb(&po->sk,
2313 hlen + tlen + sizeof(struct sockaddr_ll), 2313 hlen + tlen + sizeof(struct sockaddr_ll),
-					  0, &err);
+					  !need_wait, &err);
 
-		if (unlikely(skb == NULL))
+		if (unlikely(skb == NULL)) {
+			/* we assume the socket was initially writeable ... */
+			if (likely(len_sum > 0))
+				err = len_sum;
 			goto out_status;
-
+		}
2319 tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto, 2322 tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
2320 addr, hlen); 2323 addr, hlen);
2321 if (tp_len > dev->mtu + dev->hard_header_len) { 2324 if (tp_len > dev->mtu + dev->hard_header_len) {
diff --git a/net/rds/connection.c b/net/rds/connection.c
index 14f041398ca1..da6da57e5f36 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -126,7 +126,10 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
126 struct rds_transport *loop_trans; 126 struct rds_transport *loop_trans;
127 unsigned long flags; 127 unsigned long flags;
128 int ret; 128 int ret;
129 struct rds_transport *otrans = trans;
129 130
131 if (!is_outgoing && otrans->t_type == RDS_TRANS_TCP)
132 goto new_conn;
130 rcu_read_lock(); 133 rcu_read_lock();
131 conn = rds_conn_lookup(head, laddr, faddr, trans); 134 conn = rds_conn_lookup(head, laddr, faddr, trans);
132 if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport && 135 if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport &&
@@ -142,6 +145,7 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
142 if (conn) 145 if (conn)
143 goto out; 146 goto out;
144 147
148new_conn:
145 conn = kmem_cache_zalloc(rds_conn_slab, gfp); 149 conn = kmem_cache_zalloc(rds_conn_slab, gfp);
146 if (!conn) { 150 if (!conn) {
147 conn = ERR_PTR(-ENOMEM); 151 conn = ERR_PTR(-ENOMEM);
@@ -230,13 +234,22 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
230 /* Creating normal conn */ 234 /* Creating normal conn */
231 struct rds_connection *found; 235 struct rds_connection *found;
232 236
-		found = rds_conn_lookup(head, laddr, faddr, trans);
+		if (!is_outgoing && otrans->t_type == RDS_TRANS_TCP)
+			found = NULL;
+		else
+			found = rds_conn_lookup(head, laddr, faddr, trans);
234 if (found) { 241 if (found) {
235 trans->conn_free(conn->c_transport_data); 242 trans->conn_free(conn->c_transport_data);
236 kmem_cache_free(rds_conn_slab, conn); 243 kmem_cache_free(rds_conn_slab, conn);
237 conn = found; 244 conn = found;
238 } else { 245 } else {
-			hlist_add_head_rcu(&conn->c_hash_node, head);
+			if ((is_outgoing && otrans->t_type == RDS_TRANS_TCP) ||
+			    (otrans->t_type != RDS_TRANS_TCP)) {
+				/* Only the active side should be added to
+				 * reconnect list for TCP.
+				 */
+				hlist_add_head_rcu(&conn->c_hash_node, head);
+			}
240 rds_cong_add_conn(conn); 253 rds_cong_add_conn(conn);
241 rds_conn_count++; 254 rds_conn_count++;
242 } 255 }
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index 31b74f5e61ad..8a09ee7db3c1 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -183,8 +183,17 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
183 183
184 /* If the peer gave us the last packet it saw, process this as if 184 /* If the peer gave us the last packet it saw, process this as if
185 * we had received a regular ACK. */ 185 * we had received a regular ACK. */
-	if (dp && dp->dp_ack_seq)
-		rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL);
+	if (dp) {
+		/* dp structure start is not guaranteed to be 8 bytes aligned.
+		 * Since dp_ack_seq is 64-bit extended load operations can be
+		 * used so go through get_unaligned to avoid unaligned errors.
+		 */
+		__be64 dp_ack_seq = get_unaligned(&dp->dp_ack_seq);
+
+		if (dp_ack_seq)
+			rds_send_drop_acked(conn, be64_to_cpu(dp_ack_seq),
+					    NULL);
+	}
188 197
189 rds_connect_complete(conn); 198 rds_connect_complete(conn);
190} 199}
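get_unaligned() is the kernel's portable way to load a value whose address may not satisfy the type's natural alignment; on strict-alignment CPUs a direct 64-bit load of dp->dp_ack_seq could trap. The userspace equivalent is a memcpy into an aligned local, as in this runnable sketch:

#include <stdio.h>
#include <string.h>
#include <stdint.h>

/* memcpy lets the compiler emit a safe load even when p is misaligned */
static uint64_t load_unaligned_u64(const void *p)
{
        uint64_t v;

        memcpy(&v, p, sizeof(v));
        return v;
}

int main(void)
{
        unsigned char buf[12] = { 0 };
        uint64_t seq = 0x1122334455667788ULL;

        memcpy(buf + 1, &seq, sizeof(seq));     /* deliberately misaligned */
        printf("0x%llx\n", (unsigned long long)load_unaligned_u64(buf + 1));
        return 0;
}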
diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c
index f9f564a6c960..973109c7b8e8 100644
--- a/net/rds/tcp_connect.c
+++ b/net/rds/tcp_connect.c
@@ -62,6 +62,7 @@ void rds_tcp_state_change(struct sock *sk)
62 case TCP_ESTABLISHED: 62 case TCP_ESTABLISHED:
63 rds_connect_complete(conn); 63 rds_connect_complete(conn);
64 break; 64 break;
65 case TCP_CLOSE_WAIT:
65 case TCP_CLOSE: 66 case TCP_CLOSE:
66 rds_conn_drop(conn); 67 rds_conn_drop(conn);
67 default: 68 default:
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index 23ab4dcd1d9f..0da49e34495f 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -45,12 +45,45 @@ static void rds_tcp_accept_worker(struct work_struct *work);
45static DECLARE_WORK(rds_tcp_listen_work, rds_tcp_accept_worker); 45static DECLARE_WORK(rds_tcp_listen_work, rds_tcp_accept_worker);
46static struct socket *rds_tcp_listen_sock; 46static struct socket *rds_tcp_listen_sock;
47 47
48static int rds_tcp_keepalive(struct socket *sock)
49{
50 /* values below based on xs_udp_default_timeout */
51 int keepidle = 5; /* send a probe 'keepidle' secs after last data */
52 int keepcnt = 5; /* number of unack'ed probes before declaring dead */
53 int keepalive = 1;
54 int ret = 0;
55
56 ret = kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
57 (char *)&keepalive, sizeof(keepalive));
58 if (ret < 0)
59 goto bail;
60
61 ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPCNT,
62 (char *)&keepcnt, sizeof(keepcnt));
63 if (ret < 0)
64 goto bail;
65
66 ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPIDLE,
67 (char *)&keepidle, sizeof(keepidle));
68 if (ret < 0)
69 goto bail;
70
71 /* KEEPINTVL is the interval between successive probes. We follow
72 * the model in xs_tcp_finish_connecting() and re-use keepidle.
73 */
74 ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPINTVL,
75 (char *)&keepidle, sizeof(keepidle));
76bail:
77 return ret;
78}
79
48static int rds_tcp_accept_one(struct socket *sock) 80static int rds_tcp_accept_one(struct socket *sock)
49{ 81{
50 struct socket *new_sock = NULL; 82 struct socket *new_sock = NULL;
51 struct rds_connection *conn; 83 struct rds_connection *conn;
52 int ret; 84 int ret;
53 struct inet_sock *inet; 85 struct inet_sock *inet;
86 struct rds_tcp_connection *rs_tcp;
54 87
55 ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type, 88 ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type,
56 sock->sk->sk_protocol, &new_sock); 89 sock->sk->sk_protocol, &new_sock);
@@ -63,6 +96,10 @@ static int rds_tcp_accept_one(struct socket *sock)
63 if (ret < 0) 96 if (ret < 0)
64 goto out; 97 goto out;
65 98
99 ret = rds_tcp_keepalive(new_sock);
100 if (ret < 0)
101 goto out;
102
66 rds_tcp_tune(new_sock); 103 rds_tcp_tune(new_sock);
67 104
68 inet = inet_sk(new_sock->sk); 105 inet = inet_sk(new_sock->sk);
@@ -77,6 +114,15 @@ static int rds_tcp_accept_one(struct socket *sock)
77 ret = PTR_ERR(conn); 114 ret = PTR_ERR(conn);
78 goto out; 115 goto out;
79 } 116 }
117 /* An incoming SYN request came in, and TCP just accepted it.
118 * We always create a new conn for listen side of TCP, and do not
119 * add it to the c_hash_list.
120 *
121 * If the client reboots, this conn will need to be cleaned up.
122 * rds_tcp_state_change() will do that cleanup
123 */
124 rs_tcp = (struct rds_tcp_connection *)conn->c_transport_data;
125 WARN_ON(!rs_tcp || rs_tcp->t_sock);
80 126
81 /* 127 /*
82 * see the comment above rds_queue_delayed_reconnect() 128 * see the comment above rds_queue_delayed_reconnect()
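rds_tcp_keepalive() drives the standard TCP keepalive knobs, only from kernel context via kernel_setsockopt(). The same configuration from userspace, as a runnable sketch using the diff's values (first probe after 5 idle seconds, 5 unacknowledged probes, probe interval reusing the idle time):

#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

int main(void)
{
        int fd = socket(AF_INET, SOCK_STREAM, 0);
        int keepalive = 1, keepidle = 5, keepcnt = 5;

        if (fd < 0)
                return 1;
        setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &keepalive, sizeof(keepalive));
        setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &keepcnt, sizeof(keepcnt));
        setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &keepidle, sizeof(keepidle));
        /* interval between probes re-uses keepidle, as the kernel code does */
        setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &keepidle, sizeof(keepidle));
        printf("keepalive configured\n");
        close(fd);
        return 0;
}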
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index 4d2cede17468..dc6a2d324bd8 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -38,6 +38,9 @@ static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act,
38 struct tcf_bpf *prog = act->priv; 38 struct tcf_bpf *prog = act->priv;
39 int action, filter_res; 39 int action, filter_res;
40 40
41 if (unlikely(!skb_mac_header_was_set(skb)))
42 return TC_ACT_UNSPEC;
43
41 spin_lock(&prog->tcf_lock); 44 spin_lock(&prog->tcf_lock);
42 45
43 prog->tcf_tm.lastuse = jiffies; 46 prog->tcf_tm.lastuse = jiffies;
diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c
index 8e472518f9f6..295d14bd6c67 100644
--- a/net/sched/act_connmark.c
+++ b/net/sched/act_connmark.c
@@ -63,7 +63,6 @@ static int tcf_connmark(struct sk_buff *skb, const struct tc_action *a,
63 skb->mark = c->mark; 63 skb->mark = c->mark;
64 /* using overlimits stats to count how many packets marked */ 64 /* using overlimits stats to count how many packets marked */
65 ca->tcf_qstats.overlimits++; 65 ca->tcf_qstats.overlimits++;
66 nf_ct_put(c);
67 goto out; 66 goto out;
68 } 67 }
69 68
@@ -82,7 +81,6 @@ static int tcf_connmark(struct sk_buff *skb, const struct tc_action *a,
82 nf_ct_put(c); 81 nf_ct_put(c);
83 82
84out: 83out:
85 skb->nfct = NULL;
86 spin_unlock(&ca->tcf_lock); 84 spin_unlock(&ca->tcf_lock);
87 return ca->tcf_action; 85 return ca->tcf_action;
88} 86}
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 5953517ec059..3f63ceac8e01 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -157,7 +157,7 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a,
157 157
158 if (!(at & AT_EGRESS)) { 158 if (!(at & AT_EGRESS)) {
159 if (m->tcfm_ok_push) 159 if (m->tcfm_ok_push)
-			skb_push(skb2, skb2->dev->hard_header_len);
+			skb_push(skb2, skb->mac_len);
161 } 161 }
162 162
163 /* mirror is always swallowed */ 163 /* mirror is always swallowed */
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 8b0470e418dc..b6ef9a04de06 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -308,12 +308,11 @@ replay:
308 case RTM_DELTFILTER: 308 case RTM_DELTFILTER:
309 err = tp->ops->delete(tp, fh); 309 err = tp->ops->delete(tp, fh);
310 if (err == 0) { 310 if (err == 0) {
-			tfilter_notify(net, skb, n, tp, fh, RTM_DELTFILTER);
-			if (tcf_destroy(tp, false)) {
-				struct tcf_proto *next = rtnl_dereference(tp->next);
+			struct tcf_proto *next = rtnl_dereference(tp->next);
 
+			tfilter_notify(net, skb, n, tp, fh, RTM_DELTFILTER);
+			if (tcf_destroy(tp, false))
 				RCU_INIT_POINTER(*back, next);
-			}
317 } 316 }
318 goto errout; 317 goto errout;
319 case RTM_GETTFILTER: 318 case RTM_GETTFILTER:
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index 5c4171c5d2bd..91bd9c19471d 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -66,6 +66,9 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
66 struct cls_bpf_prog *prog; 66 struct cls_bpf_prog *prog;
67 int ret = -1; 67 int ret = -1;
68 68
69 if (unlikely(!skb_mac_header_was_set(skb)))
70 return -1;
71
69 /* Needed here for accessing maps. */ 72 /* Needed here for accessing maps. */
70 rcu_read_lock(); 73 rcu_read_lock();
71 list_for_each_entry_rcu(prog, &head->plist, link) { 74 list_for_each_entry_rcu(prog, &head->plist, link) {
diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c
index de28f8e968e8..7a0bdb16ac92 100644
--- a/net/sched/sch_codel.c
+++ b/net/sched/sch_codel.c
@@ -164,7 +164,7 @@ static int codel_init(struct Qdisc *sch, struct nlattr *opt)
164 164
165 sch->limit = DEFAULT_CODEL_LIMIT; 165 sch->limit = DEFAULT_CODEL_LIMIT;
166 166
-	codel_params_init(&q->params);
+	codel_params_init(&q->params, sch);
168 codel_vars_init(&q->vars); 168 codel_vars_init(&q->vars);
169 codel_stats_init(&q->stats); 169 codel_stats_init(&q->stats);
170 170
diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
index 1e52decb7b59..c244c45b78d7 100644
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -391,7 +391,7 @@ static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt)
391 q->perturbation = prandom_u32(); 391 q->perturbation = prandom_u32();
392 INIT_LIST_HEAD(&q->new_flows); 392 INIT_LIST_HEAD(&q->new_flows);
393 INIT_LIST_HEAD(&q->old_flows); 393 INIT_LIST_HEAD(&q->old_flows);
-	codel_params_init(&q->cparams);
+	codel_params_init(&q->cparams, sch);
395 codel_stats_init(&q->cstats); 395 codel_stats_init(&q->cstats);
396 q->cparams.ecn = true; 396 q->cparams.ecn = true;
397 397
diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c
index a4ca4517cdc8..634529e0ce6b 100644
--- a/net/sched/sch_gred.c
+++ b/net/sched/sch_gred.c
@@ -229,7 +229,7 @@ static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch)
229 break; 229 break;
230 } 230 }
231 231
-	if (q->backlog + qdisc_pkt_len(skb) <= q->limit) {
+	if (gred_backlog(t, q, sch) + qdisc_pkt_len(skb) <= q->limit) {
233 q->backlog += qdisc_pkt_len(skb); 233 q->backlog += qdisc_pkt_len(skb);
234 return qdisc_enqueue_tail(skb, sch); 234 return qdisc_enqueue_tail(skb, sch);
235 } 235 }
@@ -553,7 +553,7 @@ static int gred_dump(struct Qdisc *sch, struct sk_buff *skb)
553 553
554 opt.limit = q->limit; 554 opt.limit = q->limit;
555 opt.DP = q->DP; 555 opt.DP = q->DP;
-		opt.backlog = q->backlog;
+		opt.backlog = gred_backlog(table, q, sch);
557 opt.prio = q->prio; 557 opt.prio = q->prio;
558 opt.qth_min = q->parms.qth_min >> q->parms.Wlog; 558 opt.qth_min = q->parms.qth_min >> q->parms.Wlog;
559 opt.qth_max = q->parms.qth_max >> q->parms.Wlog; 559 opt.qth_max = q->parms.qth_max >> q->parms.Wlog;
diff --git a/net/socket.c b/net/socket.c
index 3e33959f3ce5..884e32997698 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -312,7 +312,7 @@ static const struct super_operations sockfs_ops = {
312static char *sockfs_dname(struct dentry *dentry, char *buffer, int buflen) 312static char *sockfs_dname(struct dentry *dentry, char *buffer, int buflen)
313{ 313{
314 return dynamic_dname(dentry, buffer, buflen, "socket:[%lu]", 314 return dynamic_dname(dentry, buffer, buflen, "socket:[%lu]",
-				dentry->d_inode->i_ino);
+				d_inode(dentry)->i_ino);
316} 316}
317 317
318static const struct dentry_operations sockfs_dentry_operations = { 318static const struct dentry_operations sockfs_dentry_operations = {
@@ -375,7 +375,7 @@ struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname)
375 &socket_file_ops); 375 &socket_file_ops);
376 if (unlikely(IS_ERR(file))) { 376 if (unlikely(IS_ERR(file))) {
377 /* drop dentry, keep inode */ 377 /* drop dentry, keep inode */
-		ihold(path.dentry->d_inode);
+		ihold(d_inode(path.dentry));
379 path_put(&path); 379 path_put(&path);
380 return file; 380 return file;
381 } 381 }
@@ -497,7 +497,7 @@ static ssize_t sockfs_listxattr(struct dentry *dentry, char *buffer,
497 ssize_t len; 497 ssize_t len;
498 ssize_t used = 0; 498 ssize_t used = 0;
499 499
-	len = security_inode_listsecurity(dentry->d_inode, buffer, size);
+	len = security_inode_listsecurity(d_inode(dentry), buffer, size);
501 if (len < 0) 501 if (len < 0)
502 return len; 502 return len;
503 used += len; 503 used += len;
diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.c b/net/sunrpc/auth_gss/gss_rpc_xdr.c
index 1ec19f6f0c2b..eeeba5adee6d 100644
--- a/net/sunrpc/auth_gss/gss_rpc_xdr.c
+++ b/net/sunrpc/auth_gss/gss_rpc_xdr.c
@@ -793,20 +793,26 @@ int gssx_dec_accept_sec_context(struct rpc_rqst *rqstp,
793{ 793{
794 u32 value_follows; 794 u32 value_follows;
795 int err; 795 int err;
796 struct page *scratch;
797
798 scratch = alloc_page(GFP_KERNEL);
799 if (!scratch)
800 return -ENOMEM;
801 xdr_set_scratch_buffer(xdr, page_address(scratch), PAGE_SIZE);
796 802
797 /* res->status */ 803 /* res->status */
798 err = gssx_dec_status(xdr, &res->status); 804 err = gssx_dec_status(xdr, &res->status);
799 if (err) 805 if (err)
-		return err;
+		goto out_free;
801 807
802 /* res->context_handle */ 808 /* res->context_handle */
803 err = gssx_dec_bool(xdr, &value_follows); 809 err = gssx_dec_bool(xdr, &value_follows);
804 if (err) 810 if (err)
-		return err;
+		goto out_free;
806 if (value_follows) { 812 if (value_follows) {
807 err = gssx_dec_ctx(xdr, res->context_handle); 813 err = gssx_dec_ctx(xdr, res->context_handle);
808 if (err) 814 if (err)
-			return err;
+			goto out_free;
810 } else { 816 } else {
811 res->context_handle = NULL; 817 res->context_handle = NULL;
812 } 818 }
@@ -814,11 +820,11 @@ int gssx_dec_accept_sec_context(struct rpc_rqst *rqstp,
814 /* res->output_token */ 820 /* res->output_token */
815 err = gssx_dec_bool(xdr, &value_follows); 821 err = gssx_dec_bool(xdr, &value_follows);
816 if (err) 822 if (err)
-		return err;
+		goto out_free;
818 if (value_follows) { 824 if (value_follows) {
819 err = gssx_dec_buffer(xdr, res->output_token); 825 err = gssx_dec_buffer(xdr, res->output_token);
820 if (err) 826 if (err)
-			return err;
+			goto out_free;
822 } else { 828 } else {
823 res->output_token = NULL; 829 res->output_token = NULL;
824 } 830 }
@@ -826,14 +832,17 @@ int gssx_dec_accept_sec_context(struct rpc_rqst *rqstp,
826 /* res->delegated_cred_handle */ 832 /* res->delegated_cred_handle */
827 err = gssx_dec_bool(xdr, &value_follows); 833 err = gssx_dec_bool(xdr, &value_follows);
828 if (err) 834 if (err)
-		return err;
+		goto out_free;
830 if (value_follows) { 836 if (value_follows) {
831 /* we do not support upcall servers sending this data. */ 837 /* we do not support upcall servers sending this data. */
-		return -EINVAL;
+		err = -EINVAL;
+		goto out_free;
833 } 840 }
834 841
835 /* res->options */ 842 /* res->options */
836 err = gssx_dec_option_array(xdr, &res->options); 843 err = gssx_dec_option_array(xdr, &res->options);
837 844
845out_free:
846 __free_page(scratch);
838 return err; 847 return err;
839} 848}
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index 2d12b76b5a64..d81186d34558 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -94,7 +94,7 @@ rpc_timeout_upcall_queue(struct work_struct *work)
94 } 94 }
95 dentry = dget(pipe->dentry); 95 dentry = dget(pipe->dentry);
96 spin_unlock(&pipe->lock); 96 spin_unlock(&pipe->lock);
-	rpc_purge_list(dentry ? &RPC_I(dentry->d_inode)->waitq : NULL,
+	rpc_purge_list(dentry ? &RPC_I(d_inode(dentry))->waitq : NULL,
98 &free_list, destroy_msg, -ETIMEDOUT); 98 &free_list, destroy_msg, -ETIMEDOUT);
99 dput(dentry); 99 dput(dentry);
100} 100}
@@ -152,7 +152,7 @@ rpc_queue_upcall(struct rpc_pipe *pipe, struct rpc_pipe_msg *msg)
152 dentry = dget(pipe->dentry); 152 dentry = dget(pipe->dentry);
153 spin_unlock(&pipe->lock); 153 spin_unlock(&pipe->lock);
154 if (dentry) { 154 if (dentry) {
-		wake_up(&RPC_I(dentry->d_inode)->waitq);
+		wake_up(&RPC_I(d_inode(dentry))->waitq);
156 dput(dentry); 156 dput(dentry);
157 } 157 }
158 return res; 158 return res;
@@ -591,7 +591,7 @@ static int __rpc_mkpipe_dentry(struct inode *dir, struct dentry *dentry,
591 err = __rpc_create_common(dir, dentry, S_IFIFO | mode, i_fop, private); 591 err = __rpc_create_common(dir, dentry, S_IFIFO | mode, i_fop, private);
592 if (err) 592 if (err)
593 return err; 593 return err;
-	rpci = RPC_I(dentry->d_inode);
+	rpci = RPC_I(d_inode(dentry));
595 rpci->private = private; 595 rpci->private = private;
596 rpci->pipe = pipe; 596 rpci->pipe = pipe;
597 fsnotify_create(dir, dentry); 597 fsnotify_create(dir, dentry);
@@ -616,7 +616,7 @@ int rpc_rmdir(struct dentry *dentry)
616 int error; 616 int error;
617 617
618 parent = dget_parent(dentry); 618 parent = dget_parent(dentry);
-	dir = parent->d_inode;
+	dir = d_inode(parent);
620 mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); 620 mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
621 error = __rpc_rmdir(dir, dentry); 621 error = __rpc_rmdir(dir, dentry);
622 mutex_unlock(&dir->i_mutex); 622 mutex_unlock(&dir->i_mutex);
@@ -638,7 +638,7 @@ static int __rpc_unlink(struct inode *dir, struct dentry *dentry)
638 638
639static int __rpc_rmpipe(struct inode *dir, struct dentry *dentry) 639static int __rpc_rmpipe(struct inode *dir, struct dentry *dentry)
640{ 640{
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode = d_inode(dentry);
642 642
643 rpc_close_pipes(inode); 643 rpc_close_pipes(inode);
644 return __rpc_unlink(dir, dentry); 644 return __rpc_unlink(dir, dentry);
@@ -654,7 +654,7 @@ static struct dentry *__rpc_lookup_create_exclusive(struct dentry *parent,
654 if (!dentry) 654 if (!dentry)
655 return ERR_PTR(-ENOMEM); 655 return ERR_PTR(-ENOMEM);
656 } 656 }
-	if (dentry->d_inode == NULL)
+	if (d_really_is_negative(dentry))
658 return dentry; 658 return dentry;
659 dput(dentry); 659 dput(dentry);
660 return ERR_PTR(-EEXIST); 660 return ERR_PTR(-EEXIST);
@@ -667,7 +667,7 @@ static void __rpc_depopulate(struct dentry *parent,
667 const struct rpc_filelist *files, 667 const struct rpc_filelist *files,
668 int start, int eof) 668 int start, int eof)
669{ 669{
-	struct inode *dir = parent->d_inode;
+	struct inode *dir = d_inode(parent);
671 struct dentry *dentry; 671 struct dentry *dentry;
672 struct qstr name; 672 struct qstr name;
673 int i; 673 int i;
@@ -679,9 +679,9 @@ static void __rpc_depopulate(struct dentry *parent,
679 679
680 if (dentry == NULL) 680 if (dentry == NULL)
681 continue; 681 continue;
-		if (dentry->d_inode == NULL)
+		if (d_really_is_negative(dentry))
683 goto next; 683 goto next;
-		switch (dentry->d_inode->i_mode & S_IFMT) {
+		switch (d_inode(dentry)->i_mode & S_IFMT) {
685 default: 685 default:
686 BUG(); 686 BUG();
687 case S_IFREG: 687 case S_IFREG:
@@ -699,7 +699,7 @@ static void rpc_depopulate(struct dentry *parent,
699 const struct rpc_filelist *files, 699 const struct rpc_filelist *files,
700 int start, int eof) 700 int start, int eof)
701{ 701{
-	struct inode *dir = parent->d_inode;
+	struct inode *dir = d_inode(parent);
703 703
704 mutex_lock_nested(&dir->i_mutex, I_MUTEX_CHILD); 704 mutex_lock_nested(&dir->i_mutex, I_MUTEX_CHILD);
705 __rpc_depopulate(parent, files, start, eof); 705 __rpc_depopulate(parent, files, start, eof);
@@ -711,7 +711,7 @@ static int rpc_populate(struct dentry *parent,
711 int start, int eof, 711 int start, int eof,
712 void *private) 712 void *private)
713{ 713{
-	struct inode *dir = parent->d_inode;
+	struct inode *dir = d_inode(parent);
715 struct dentry *dentry; 715 struct dentry *dentry;
716 int i, err; 716 int i, err;
717 717
@@ -754,7 +754,7 @@ static struct dentry *rpc_mkdir_populate(struct dentry *parent,
754 int (*populate)(struct dentry *, void *), void *args_populate) 754 int (*populate)(struct dentry *, void *), void *args_populate)
755{ 755{
756 struct dentry *dentry; 756 struct dentry *dentry;
-	struct inode *dir = parent->d_inode;
+	struct inode *dir = d_inode(parent);
758 int error; 758 int error;
759 759
760 mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); 760 mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
@@ -787,7 +787,7 @@ static int rpc_rmdir_depopulate(struct dentry *dentry,
787 int error; 787 int error;
788 788
789 parent = dget_parent(dentry); 789 parent = dget_parent(dentry);
-	dir = parent->d_inode;
+	dir = d_inode(parent);
791 mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); 791 mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
792 if (depopulate != NULL) 792 if (depopulate != NULL)
793 depopulate(dentry); 793 depopulate(dentry);
@@ -819,7 +819,7 @@ struct dentry *rpc_mkpipe_dentry(struct dentry *parent, const char *name,
819 void *private, struct rpc_pipe *pipe) 819 void *private, struct rpc_pipe *pipe)
820{ 820{
821 struct dentry *dentry; 821 struct dentry *dentry;
-	struct inode *dir = parent->d_inode;
+	struct inode *dir = d_inode(parent);
823 umode_t umode = S_IFIFO | S_IRUSR | S_IWUSR; 823 umode_t umode = S_IFIFO | S_IRUSR | S_IWUSR;
824 int err; 824 int err;
825 825
@@ -864,7 +864,7 @@ rpc_unlink(struct dentry *dentry)
864 int error = 0; 864 int error = 0;
865 865
866 parent = dget_parent(dentry); 866 parent = dget_parent(dentry);
-	dir = parent->d_inode;
+	dir = d_inode(parent);
868 mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); 868 mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
869 error = __rpc_rmpipe(dir, dentry); 869 error = __rpc_rmpipe(dir, dentry);
870 mutex_unlock(&dir->i_mutex); 870 mutex_unlock(&dir->i_mutex);
@@ -1375,7 +1375,7 @@ rpc_gssd_dummy_depopulate(struct dentry *pipe_dentry)
1375 struct dentry *clnt_dir = pipe_dentry->d_parent; 1375 struct dentry *clnt_dir = pipe_dentry->d_parent;
1376 struct dentry *gssd_dir = clnt_dir->d_parent; 1376 struct dentry *gssd_dir = clnt_dir->d_parent;
1377 1377
-	__rpc_rmpipe(clnt_dir->d_inode, pipe_dentry);
+	__rpc_rmpipe(d_inode(clnt_dir), pipe_dentry);
1379 __rpc_depopulate(clnt_dir, gssd_dummy_info_file, 0, 1); 1379 __rpc_depopulate(clnt_dir, gssd_dummy_info_file, 0, 1);
1380 __rpc_depopulate(gssd_dir, gssd_dummy_clnt_dir, 0, 1); 1380 __rpc_depopulate(gssd_dir, gssd_dummy_clnt_dir, 0, 1);
1381 dput(pipe_dentry); 1381 dput(pipe_dentry);
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index b91fd9c597b4..337ca851a350 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -89,8 +89,8 @@ __rpc_add_timer(struct rpc_wait_queue *queue, struct rpc_task *task)
89 if (!task->tk_timeout) 89 if (!task->tk_timeout)
90 return; 90 return;
91 91
-	dprintk("RPC: %5u setting alarm for %lu ms\n",
-			task->tk_pid, task->tk_timeout * 1000 / HZ);
+	dprintk("RPC: %5u setting alarm for %u ms\n",
+			task->tk_pid, jiffies_to_msecs(task->tk_timeout));
94 94
95 task->u.tk_wait.expires = jiffies + task->tk_timeout; 95 task->u.tk_wait.expires = jiffies + task->tk_timeout;
96 if (list_empty(&queue->timer_list.list) || time_before(task->u.tk_wait.expires, queue->timer_list.expires)) 96 if (list_empty(&queue->timer_list.list) || time_before(task->u.tk_wait.expires, queue->timer_list.expires))
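The dprintk fix pairs a format correction (%lu to %u, since jiffies_to_msecs() returns unsigned int) with an overflow fix: the old expression multiplies before dividing, so on a 32-bit system tk_timeout * 1000 wraps once the timeout exceeds about 4.29 million jiffies. A runnable demonstration, simulating the 32-bit intermediate with unsigned int:

#include <stdio.h>

#define HZ 1000

int main(void)
{
        unsigned int timeout = 5000000; /* ~83 minutes of jiffies at HZ=1000 */

        printf("naive:   %u ms\n", timeout * 1000 / HZ);        /* wrapped */
        printf("correct: %llu ms\n",
               (unsigned long long)timeout * 1000 / HZ);
        return 0;
}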
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 9949722d99ce..1d4fe24af06a 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -326,6 +326,15 @@ out_unlock:
326 xprt_clear_locked(xprt); 326 xprt_clear_locked(xprt);
327} 327}
328 328
329static void xprt_task_clear_bytes_sent(struct rpc_task *task)
330{
331 if (task != NULL) {
332 struct rpc_rqst *req = task->tk_rqstp;
333 if (req != NULL)
334 req->rq_bytes_sent = 0;
335 }
336}
337
329/** 338/**
330 * xprt_release_xprt - allow other requests to use a transport 339 * xprt_release_xprt - allow other requests to use a transport
331 * @xprt: transport with other tasks potentially waiting 340 * @xprt: transport with other tasks potentially waiting
@@ -336,11 +345,7 @@ out_unlock:
336void xprt_release_xprt(struct rpc_xprt *xprt, struct rpc_task *task) 345void xprt_release_xprt(struct rpc_xprt *xprt, struct rpc_task *task)
337{ 346{
338 if (xprt->snd_task == task) { 347 if (xprt->snd_task == task) {
-		if (task != NULL) {
-			struct rpc_rqst *req = task->tk_rqstp;
-			if (req != NULL)
-				req->rq_bytes_sent = 0;
-		}
+		xprt_task_clear_bytes_sent(task);
344 xprt_clear_locked(xprt); 349 xprt_clear_locked(xprt);
345 __xprt_lock_write_next(xprt); 350 __xprt_lock_write_next(xprt);
346 } 351 }
@@ -358,11 +363,7 @@ EXPORT_SYMBOL_GPL(xprt_release_xprt);
358void xprt_release_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task) 363void xprt_release_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task)
359{ 364{
360 if (xprt->snd_task == task) { 365 if (xprt->snd_task == task) {
-		if (task != NULL) {
-			struct rpc_rqst *req = task->tk_rqstp;
-			if (req != NULL)
-				req->rq_bytes_sent = 0;
-		}
+		xprt_task_clear_bytes_sent(task);
366 xprt_clear_locked(xprt); 367 xprt_clear_locked(xprt);
367 __xprt_lock_write_next_cong(xprt); 368 __xprt_lock_write_next_cong(xprt);
368 } 369 }
@@ -700,6 +701,7 @@ bool xprt_lock_connect(struct rpc_xprt *xprt,
700 goto out; 701 goto out;
701 if (xprt->snd_task != task) 702 if (xprt->snd_task != task)
702 goto out; 703 goto out;
704 xprt_task_clear_bytes_sent(task);
703 xprt->snd_task = cookie; 705 xprt->snd_task = cookie;
704 ret = true; 706 ret = true;
705out: 707out:
diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile
index da5136fd5694..579f72bbcf4b 100644
--- a/net/sunrpc/xprtrdma/Makefile
+++ b/net/sunrpc/xprtrdma/Makefile
@@ -1,6 +1,7 @@
1obj-$(CONFIG_SUNRPC_XPRT_RDMA_CLIENT) += xprtrdma.o 1obj-$(CONFIG_SUNRPC_XPRT_RDMA_CLIENT) += xprtrdma.o
2 2
-xprtrdma-y := transport.o rpc_rdma.o verbs.o
+xprtrdma-y := transport.o rpc_rdma.o verbs.o \
+	fmr_ops.o frwr_ops.o physical_ops.o
4 5
5obj-$(CONFIG_SUNRPC_XPRT_RDMA_SERVER) += svcrdma.o 6obj-$(CONFIG_SUNRPC_XPRT_RDMA_SERVER) += svcrdma.o
6 7
diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
new file mode 100644
index 000000000000..302d4ebf6fbf
--- /dev/null
+++ b/net/sunrpc/xprtrdma/fmr_ops.c
@@ -0,0 +1,208 @@
1/*
2 * Copyright (c) 2015 Oracle. All rights reserved.
3 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
4 */
5
6/* Lightweight memory registration using Fast Memory Regions (FMR).
7 * Sometimes referred to as MTHCAFMR mode.
8 *
9 * FMR uses synchronous memory registration and deregistration.
10 * FMR registration is known to be fast, but FMR deregistration
11 * can take tens of usecs to complete.
12 */
13
14#include "xprt_rdma.h"
15
16#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
17# define RPCDBG_FACILITY RPCDBG_TRANS
18#endif
19
20/* Maximum scatter/gather per FMR */
21#define RPCRDMA_MAX_FMR_SGES (64)
22
23static int
24fmr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
25 struct rpcrdma_create_data_internal *cdata)
26{
27 return 0;
28}
29
30/* FMR mode conveys up to 64 pages of payload per chunk segment.
31 */
32static size_t
33fmr_op_maxpages(struct rpcrdma_xprt *r_xprt)
34{
35 return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
36 rpcrdma_max_segments(r_xprt) * RPCRDMA_MAX_FMR_SGES);
37}
38
39static int
40fmr_op_init(struct rpcrdma_xprt *r_xprt)
41{
42 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
43 int mr_access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ;
44 struct ib_fmr_attr fmr_attr = {
45 .max_pages = RPCRDMA_MAX_FMR_SGES,
46 .max_maps = 1,
47 .page_shift = PAGE_SHIFT
48 };
49 struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
50 struct rpcrdma_mw *r;
51 int i, rc;
52
53 INIT_LIST_HEAD(&buf->rb_mws);
54 INIT_LIST_HEAD(&buf->rb_all);
55
56 i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
57 dprintk("RPC: %s: initializing %d FMRs\n", __func__, i);
58
59 while (i--) {
60 r = kzalloc(sizeof(*r), GFP_KERNEL);
61 if (!r)
62 return -ENOMEM;
63
64 r->r.fmr = ib_alloc_fmr(pd, mr_access_flags, &fmr_attr);
65 if (IS_ERR(r->r.fmr))
66 goto out_fmr_err;
67
68 list_add(&r->mw_list, &buf->rb_mws);
69 list_add(&r->mw_all, &buf->rb_all);
70 }
71 return 0;
72
73out_fmr_err:
74 rc = PTR_ERR(r->r.fmr);
75 dprintk("RPC: %s: ib_alloc_fmr status %i\n", __func__, rc);
76 kfree(r);
77 return rc;
78}
79
80/* Use the ib_map_phys_fmr() verb to register a memory region
81 * for remote access via RDMA READ or RDMA WRITE.
82 */
83static int
84fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
85 int nsegs, bool writing)
86{
87 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
88 struct ib_device *device = ia->ri_id->device;
89 enum dma_data_direction direction = rpcrdma_data_dir(writing);
90 struct rpcrdma_mr_seg *seg1 = seg;
91 struct rpcrdma_mw *mw = seg1->rl_mw;
92 u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
93 int len, pageoff, i, rc;
94
95 pageoff = offset_in_page(seg1->mr_offset);
96 seg1->mr_offset -= pageoff; /* start of page */
97 seg1->mr_len += pageoff;
98 len = -pageoff;
99 if (nsegs > RPCRDMA_MAX_FMR_SGES)
100 nsegs = RPCRDMA_MAX_FMR_SGES;
101 for (i = 0; i < nsegs;) {
102 rpcrdma_map_one(device, seg, direction);
103 physaddrs[i] = seg->mr_dma;
104 len += seg->mr_len;
105 ++seg;
106 ++i;
107 /* Check for holes */
108 if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
109 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
110 break;
111 }
112
113 rc = ib_map_phys_fmr(mw->r.fmr, physaddrs, i, seg1->mr_dma);
114 if (rc)
115 goto out_maperr;
116
117 seg1->mr_rkey = mw->r.fmr->rkey;
118 seg1->mr_base = seg1->mr_dma + pageoff;
119 seg1->mr_nsegs = i;
120 seg1->mr_len = len;
121 return i;
122
123out_maperr:
124 dprintk("RPC: %s: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n",
125 __func__, len, (unsigned long long)seg1->mr_dma,
126 pageoff, i, rc);
127 while (i--)
128 rpcrdma_unmap_one(device, --seg);
129 return rc;
130}
131
132/* Use the ib_unmap_fmr() verb to prevent further remote
133 * access via RDMA READ or RDMA WRITE.
134 */
135static int
136fmr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
137{
138 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
139 struct rpcrdma_mr_seg *seg1 = seg;
140 struct ib_device *device;
141 int rc, nsegs = seg->mr_nsegs;
142 LIST_HEAD(l);
143
144 list_add(&seg1->rl_mw->r.fmr->list, &l);
145 rc = ib_unmap_fmr(&l);
146 read_lock(&ia->ri_qplock);
147 device = ia->ri_id->device;
148 while (seg1->mr_nsegs--)
149 rpcrdma_unmap_one(device, seg++);
150 read_unlock(&ia->ri_qplock);
151 if (rc)
152 goto out_err;
153 return nsegs;
154
155out_err:
156 dprintk("RPC: %s: ib_unmap_fmr status %i\n", __func__, rc);
157 return nsegs;
158}
159
160/* After a disconnect, unmap all FMRs.
161 *
162 * This is invoked only in the transport connect worker in order
163 * to serialize with fmr_op_map().
164 */
165static void
166fmr_op_reset(struct rpcrdma_xprt *r_xprt)
167{
168 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
169 struct rpcrdma_mw *r;
170 LIST_HEAD(list);
171 int rc;
172
173 list_for_each_entry(r, &buf->rb_all, mw_all)
174 list_add(&r->r.fmr->list, &list);
175
176 rc = ib_unmap_fmr(&list);
177 if (rc)
178 dprintk("RPC: %s: ib_unmap_fmr failed %i\n",
179 __func__, rc);
180}
181
182static void
183fmr_op_destroy(struct rpcrdma_buffer *buf)
184{
185 struct rpcrdma_mw *r;
186 int rc;
187
188 while (!list_empty(&buf->rb_all)) {
189 r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
190 list_del(&r->mw_all);
191 rc = ib_dealloc_fmr(r->r.fmr);
192 if (rc)
193 dprintk("RPC: %s: ib_dealloc_fmr failed %i\n",
194 __func__, rc);
195 kfree(r);
196 }
197}
198
199const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = {
200 .ro_map = fmr_op_map,
201 .ro_unmap = fmr_op_unmap,
202 .ro_open = fmr_op_open,
203 .ro_maxpages = fmr_op_maxpages,
204 .ro_init = fmr_op_init,
205 .ro_reset = fmr_op_reset,
206 .ro_destroy = fmr_op_destroy,
207 .ro_displayname = "fmr",
208};
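fmr_op_map() folds consecutive segments into a single FMR only while they tile whole pages: after the first segment, a segment that does not start on a page boundary, or a non-final segment that does not end on one, is a hole that terminates the scan. A userspace sketch of just that predicate, with the segment layout simplified to offset/length pairs:

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define offset_in_page(p) ((unsigned long)(p) & (PAGE_SIZE - 1))

struct seg { unsigned long offset; unsigned long len; };

/* Count how many leading segments can share one FMR mapping,
 * mirroring the "Check for holes" loop in fmr_op_map(). */
static int coalescible(const struct seg *seg, int nsegs)
{
	int i;

	for (i = 0; i < nsegs;) {
		++i;
		/* stop if the next segment would leave a gap in the
		 * physical page run */
		if ((i < nsegs && offset_in_page(seg[i].offset)) ||
		    offset_in_page(seg[i - 1].offset + seg[i - 1].len))
			break;
	}
	return i;
}

int main(void)
{
	struct seg ok[] = {
		{ 0x10000, 4096 }, { 0x20000, 4096 }, { 0x30000, 100 },
	};
	struct seg hole[] = {
		{ 0x10000, 100 }, { 0x20000, 4096 },	/* gap after seg 0 */
	};

	printf("contiguous run: %d segments\n", coalescible(ok, 3));   /* 3 */
	printf("run with hole:  %d segments\n", coalescible(hole, 2)); /* 1 */
	return 0;
}
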
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
new file mode 100644
index 000000000000..dff0481dbcf8
--- /dev/null
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -0,0 +1,353 @@
1/*
2 * Copyright (c) 2015 Oracle. All rights reserved.
3 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
4 */
5
6/* Lightweight memory registration using Fast Registration Work
7 * Requests (FRWR). Also sometimes referred to as FRMR mode.
8 *
9 * FRWR features ordered asynchronous registration and deregistration
10 * of arbitrarily sized memory regions. This is the fastest and safest
11 * but most complex memory registration mode.
12 */
13
14#include "xprt_rdma.h"
15
16#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
17# define RPCDBG_FACILITY RPCDBG_TRANS
18#endif
19
20static int
21__frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, struct ib_device *device,
22 unsigned int depth)
23{
24 struct rpcrdma_frmr *f = &r->r.frmr;
25 int rc;
26
27 f->fr_mr = ib_alloc_fast_reg_mr(pd, depth);
28 if (IS_ERR(f->fr_mr))
29 goto out_mr_err;
30 f->fr_pgl = ib_alloc_fast_reg_page_list(device, depth);
31 if (IS_ERR(f->fr_pgl))
32 goto out_list_err;
33 return 0;
34
35out_mr_err:
36 rc = PTR_ERR(f->fr_mr);
37 dprintk("RPC: %s: ib_alloc_fast_reg_mr status %i\n",
38 __func__, rc);
39 return rc;
40
41out_list_err:
42 rc = PTR_ERR(f->fr_pgl);
43 dprintk("RPC: %s: ib_alloc_fast_reg_page_list status %i\n",
44 __func__, rc);
45 ib_dereg_mr(f->fr_mr);
46 return rc;
47}
48
49static void
50__frwr_release(struct rpcrdma_mw *r)
51{
52 int rc;
53
54 rc = ib_dereg_mr(r->r.frmr.fr_mr);
55 if (rc)
56 dprintk("RPC: %s: ib_dereg_mr status %i\n",
57 __func__, rc);
58 ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
59}
60
61static int
62frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
63 struct rpcrdma_create_data_internal *cdata)
64{
65 struct ib_device_attr *devattr = &ia->ri_devattr;
66 int depth, delta;
67
68 ia->ri_max_frmr_depth =
69 min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
70 devattr->max_fast_reg_page_list_len);
71 dprintk("RPC: %s: device's max FR page list len = %u\n",
72 __func__, ia->ri_max_frmr_depth);
73
74 /* Add room for frmr register and invalidate WRs.
75 * 1. FRMR reg WR for head
76 * 2. FRMR invalidate WR for head
77 * 3. N FRMR reg WRs for pagelist
78 * 4. N FRMR invalidate WRs for pagelist
79 * 5. FRMR reg WR for tail
80 * 6. FRMR invalidate WR for tail
81 * 7. The RDMA_SEND WR
82 */
83 depth = 7;
84
85 /* Calculate N if the device max FRMR depth is smaller than
86 * RPCRDMA_MAX_DATA_SEGS.
87 */
88 if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) {
89 delta = RPCRDMA_MAX_DATA_SEGS - ia->ri_max_frmr_depth;
90 do {
91 depth += 2; /* FRMR reg + invalidate */
92 delta -= ia->ri_max_frmr_depth;
93 } while (delta > 0);
94 }
95
96 ep->rep_attr.cap.max_send_wr *= depth;
97 if (ep->rep_attr.cap.max_send_wr > devattr->max_qp_wr) {
98 cdata->max_requests = devattr->max_qp_wr / depth;
99 if (!cdata->max_requests)
100 return -EINVAL;
101 ep->rep_attr.cap.max_send_wr = cdata->max_requests *
102 depth;
103 }
104
105 return 0;
106}
107
108/* FRWR mode conveys a list of pages per chunk segment. The
109 * maximum length of that list is the FRWR page list depth.
110 */
111static size_t
112frwr_op_maxpages(struct rpcrdma_xprt *r_xprt)
113{
114 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
115
116 return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
117 rpcrdma_max_segments(r_xprt) * ia->ri_max_frmr_depth);
118}
119
120/* If FAST_REG or LOCAL_INV failed, indicate the frmr needs to be reset. */
121static void
122frwr_sendcompletion(struct ib_wc *wc)
123{
124 struct rpcrdma_mw *r;
125
126 if (likely(wc->status == IB_WC_SUCCESS))
127 return;
128
129 /* WARNING: Only wr_id and status are reliable at this point */
130 r = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
131 dprintk("RPC: %s: frmr %p (stale), status %d\n",
132 __func__, r, wc->status);
133 r->r.frmr.fr_state = FRMR_IS_STALE;
134}
135
136static int
137frwr_op_init(struct rpcrdma_xprt *r_xprt)
138{
139 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
140 struct ib_device *device = r_xprt->rx_ia.ri_id->device;
141 unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth;
142 struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
143 int i;
144
145 INIT_LIST_HEAD(&buf->rb_mws);
146 INIT_LIST_HEAD(&buf->rb_all);
147
148 i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
149 dprintk("RPC: %s: initializing %d FRMRs\n", __func__, i);
150
151 while (i--) {
152 struct rpcrdma_mw *r;
153 int rc;
154
155 r = kzalloc(sizeof(*r), GFP_KERNEL);
156 if (!r)
157 return -ENOMEM;
158
159 rc = __frwr_init(r, pd, device, depth);
160 if (rc) {
161 kfree(r);
162 return rc;
163 }
164
165 list_add(&r->mw_list, &buf->rb_mws);
166 list_add(&r->mw_all, &buf->rb_all);
167 r->mw_sendcompletion = frwr_sendcompletion;
168 }
169
170 return 0;
171}
172
173/* Post a FAST_REG Work Request to register a memory region
174 * for remote access via RDMA READ or RDMA WRITE.
175 */
176static int
177frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
178 int nsegs, bool writing)
179{
180 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
181 struct ib_device *device = ia->ri_id->device;
182 enum dma_data_direction direction = rpcrdma_data_dir(writing);
183 struct rpcrdma_mr_seg *seg1 = seg;
184 struct rpcrdma_mw *mw = seg1->rl_mw;
185 struct rpcrdma_frmr *frmr = &mw->r.frmr;
186 struct ib_mr *mr = frmr->fr_mr;
187 struct ib_send_wr fastreg_wr, *bad_wr;
188 u8 key;
189 int len, pageoff;
190 int i, rc;
191 int seg_len;
192 u64 pa;
193 int page_no;
194
195 pageoff = offset_in_page(seg1->mr_offset);
196 seg1->mr_offset -= pageoff; /* start of page */
197 seg1->mr_len += pageoff;
198 len = -pageoff;
199 if (nsegs > ia->ri_max_frmr_depth)
200 nsegs = ia->ri_max_frmr_depth;
201 for (page_no = i = 0; i < nsegs;) {
202 rpcrdma_map_one(device, seg, direction);
203 pa = seg->mr_dma;
204 for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) {
205 frmr->fr_pgl->page_list[page_no++] = pa;
206 pa += PAGE_SIZE;
207 }
208 len += seg->mr_len;
209 ++seg;
210 ++i;
211 /* Check for holes */
212 if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
213 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
214 break;
215 }
216 dprintk("RPC: %s: Using frmr %p to map %d segments (%d bytes)\n",
217 __func__, mw, i, len);
218
219 frmr->fr_state = FRMR_IS_VALID;
220
221 memset(&fastreg_wr, 0, sizeof(fastreg_wr));
222 fastreg_wr.wr_id = (unsigned long)(void *)mw;
223 fastreg_wr.opcode = IB_WR_FAST_REG_MR;
224 fastreg_wr.wr.fast_reg.iova_start = seg1->mr_dma + pageoff;
225 fastreg_wr.wr.fast_reg.page_list = frmr->fr_pgl;
226 fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
227 fastreg_wr.wr.fast_reg.page_list_len = page_no;
228 fastreg_wr.wr.fast_reg.length = len;
229 fastreg_wr.wr.fast_reg.access_flags = writing ?
230 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
231 IB_ACCESS_REMOTE_READ;
232 key = (u8)(mr->rkey & 0x000000FF);
233 ib_update_fast_reg_key(mr, ++key);
234 fastreg_wr.wr.fast_reg.rkey = mr->rkey;
235
236 DECR_CQCOUNT(&r_xprt->rx_ep);
237 rc = ib_post_send(ia->ri_id->qp, &fastreg_wr, &bad_wr);
238 if (rc)
239 goto out_senderr;
240
241 seg1->mr_rkey = mr->rkey;
242 seg1->mr_base = seg1->mr_dma + pageoff;
243 seg1->mr_nsegs = i;
244 seg1->mr_len = len;
245 return i;
246
247out_senderr:
248 dprintk("RPC: %s: ib_post_send status %i\n", __func__, rc);
249 ib_update_fast_reg_key(mr, --key);
250 frmr->fr_state = FRMR_IS_INVALID;
251 while (i--)
252 rpcrdma_unmap_one(device, --seg);
253 return rc;
254}
255
256/* Post a LOCAL_INV Work Request to prevent further remote access
257 * via RDMA READ or RDMA WRITE.
258 */
259static int
260frwr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
261{
262 struct rpcrdma_mr_seg *seg1 = seg;
263 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
264 struct ib_send_wr invalidate_wr, *bad_wr;
265 int rc, nsegs = seg->mr_nsegs;
266 struct ib_device *device;
267
268 seg1->rl_mw->r.frmr.fr_state = FRMR_IS_INVALID;
269
270 memset(&invalidate_wr, 0, sizeof(invalidate_wr));
271 invalidate_wr.wr_id = (unsigned long)(void *)seg1->rl_mw;
272 invalidate_wr.opcode = IB_WR_LOCAL_INV;
273 invalidate_wr.ex.invalidate_rkey = seg1->rl_mw->r.frmr.fr_mr->rkey;
274 DECR_CQCOUNT(&r_xprt->rx_ep);
275
276 read_lock(&ia->ri_qplock);
277 device = ia->ri_id->device;
278 while (seg1->mr_nsegs--)
279 rpcrdma_unmap_one(device, seg++);
280 rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
281 read_unlock(&ia->ri_qplock);
282 if (rc)
283 goto out_err;
284 return nsegs;
285
286out_err:
287 /* Force rpcrdma_buffer_get() to retry */
288 seg1->rl_mw->r.frmr.fr_state = FRMR_IS_STALE;
289 dprintk("RPC: %s: ib_post_send status %i\n", __func__, rc);
290 return nsegs;
291}
292
293/* After a disconnect, a flushed FAST_REG_MR can leave an FRMR in
294 * an unusable state. Find FRMRs in this state and dereg / reg
295 * each. FRMRs that are VALID and attached to an rpcrdma_req are
296 * also torn down.
297 *
298 * This gives all in-use FRMRs a fresh rkey and leaves them INVALID.
299 *
300 * This is invoked only in the transport connect worker in order
301 * to serialize with frwr_op_map().
302 */
303static void
304frwr_op_reset(struct rpcrdma_xprt *r_xprt)
305{
306 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
307 struct ib_device *device = r_xprt->rx_ia.ri_id->device;
308 unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth;
309 struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
310 struct rpcrdma_mw *r;
311 int rc;
312
313 list_for_each_entry(r, &buf->rb_all, mw_all) {
314 if (r->r.frmr.fr_state == FRMR_IS_INVALID)
315 continue;
316
317 __frwr_release(r);
318 rc = __frwr_init(r, pd, device, depth);
319 if (rc) {
320 dprintk("RPC: %s: mw %p left %s\n",
321 __func__, r,
322 (r->r.frmr.fr_state == FRMR_IS_STALE ?
323 "stale" : "valid"));
324 continue;
325 }
326
327 r->r.frmr.fr_state = FRMR_IS_INVALID;
328 }
329}
330
331static void
332frwr_op_destroy(struct rpcrdma_buffer *buf)
333{
334 struct rpcrdma_mw *r;
335
336 while (!list_empty(&buf->rb_all)) {
337 r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
338 list_del(&r->mw_all);
339 __frwr_release(r);
340 kfree(r);
341 }
342}
343
344const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = {
345 .ro_map = frwr_op_map,
346 .ro_unmap = frwr_op_unmap,
347 .ro_open = frwr_op_open,
348 .ro_maxpages = frwr_op_maxpages,
349 .ro_init = frwr_op_init,
350 .ro_reset = frwr_op_reset,
351 .ro_destroy = frwr_op_destroy,
352 .ro_displayname = "frwr",
353};
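frwr_op_open() budgets send-queue slots per RPC: seven WRs cover the head, the tail, and the SEND itself, plus one register/invalidate pair for every additional FRMR needed when the device page-list depth cannot cover RPCRDMA_MAX_DATA_SEGS in one region. A worked standalone model of that arithmetic (input values chosen for illustration):

#include <stdio.h>

/* Model of the send-WR budget computed in frwr_op_open().
 * max_data_segs and max_frmr_depth are illustrative inputs. */
static int frwr_send_wr_depth(int max_data_segs, int max_frmr_depth)
{
	/* reg+inv for head, reg+inv for tail, N reg+inv pairs for the
	 * pagelist, plus the RDMA SEND itself */
	int depth = 7;
	int delta = max_data_segs - max_frmr_depth;

	while (delta > 0) {		/* extra FRMRs for the pagelist */
		depth += 2;		/* one reg + one invalidate each */
		delta -= max_frmr_depth;
	}
	return depth;
}

int main(void)
{
	/* device covers all 64 segments in one fast_reg: budget stays 7 */
	printf("depth(64, 64) = %d\n", frwr_send_wr_depth(64, 64));
	/* depth 16 needs ceil((64-16)/16) = 3 extra FRMRs: 7 + 3*2 = 13 */
	printf("depth(64, 16) = %d\n", frwr_send_wr_depth(64, 16));
	return 0;
}
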
diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c
new file mode 100644
index 000000000000..ba518af16787
--- /dev/null
+++ b/net/sunrpc/xprtrdma/physical_ops.c
@@ -0,0 +1,94 @@
1/*
2 * Copyright (c) 2015 Oracle. All rights reserved.
3 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
4 */
5
6/* No-op chunk preparation. All client memory is pre-registered.
7 * Sometimes referred to as ALLPHYSICAL mode.
8 *
9 * Physical registration is simple because all client memory is
10 * pre-registered and never deregistered. This mode is good for
11 * adapter bring-up, but is not considered safe: the server is
12 * trusted not to abuse its access to client memory not involved
13 * in RDMA I/O.
14 */
15
16#include "xprt_rdma.h"
17
18#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
19# define RPCDBG_FACILITY RPCDBG_TRANS
20#endif
21
22static int
23physical_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
24 struct rpcrdma_create_data_internal *cdata)
25{
26 return 0;
27}
28
29/* PHYSICAL memory registration conveys one page per chunk segment.
30 */
31static size_t
32physical_op_maxpages(struct rpcrdma_xprt *r_xprt)
33{
34 return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
35 rpcrdma_max_segments(r_xprt));
36}
37
38static int
39physical_op_init(struct rpcrdma_xprt *r_xprt)
40{
41 return 0;
42}
43
44/* The client's physical memory is already exposed for
45 * remote access via RDMA READ or RDMA WRITE.
46 */
47static int
48physical_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
49 int nsegs, bool writing)
50{
51 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
52
53 rpcrdma_map_one(ia->ri_id->device, seg,
54 rpcrdma_data_dir(writing));
55 seg->mr_rkey = ia->ri_bind_mem->rkey;
56 seg->mr_base = seg->mr_dma;
57 seg->mr_nsegs = 1;
58 return 1;
59}
60
61/* Unmap a memory region, but leave it registered.
62 */
63static int
64physical_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
65{
66 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
67
68 read_lock(&ia->ri_qplock);
69 rpcrdma_unmap_one(ia->ri_id->device, seg);
70 read_unlock(&ia->ri_qplock);
71
72 return 1;
73}
74
75static void
76physical_op_reset(struct rpcrdma_xprt *r_xprt)
77{
78}
79
80static void
81physical_op_destroy(struct rpcrdma_buffer *buf)
82{
83}
84
85const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = {
86 .ro_map = physical_op_map,
87 .ro_unmap = physical_op_unmap,
88 .ro_open = physical_op_open,
89 .ro_maxpages = physical_op_maxpages,
90 .ro_init = physical_op_init,
91 .ro_reset = physical_op_reset,
92 .ro_destroy = physical_op_destroy,
93 .ro_displayname = "physical",
94};
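With the three *_ops.c files in place, each registration strategy is one instance of rpcrdma_memreg_ops and callers dispatch through ia->ri_ops; the verbs.c hunks below delete the per-call switches on ri_memreg_strategy. A minimal sketch of the dispatch pattern, with the vtable reduced to two members and the values made up:

#include <stddef.h>
#include <stdio.h>

/* Reduced model of the rpcrdma_memreg_ops vtable introduced here;
 * the real table carries ro_map/ro_unmap/ro_open/... callbacks. */
struct memreg_ops {
	size_t (*ro_maxpages)(void);
	const char *ro_displayname;
};

static size_t fmr_maxpages(void)  { return 64;  }
static size_t frwr_maxpages(void) { return 256; }

static const struct memreg_ops fmr_ops  = { fmr_maxpages,  "fmr"  };
static const struct memreg_ops frwr_ops = { frwr_maxpages, "frwr" };

int main(void)
{
	/* strategy chosen once at connect time... */
	const struct memreg_ops *ri_ops = &frwr_ops;

	/* ...then every call site dispatches indirectly, with no
	 * per-call switch on the strategy */
	printf("memreg '%s', maxpages %zu\n",
	       ri_ops->ro_displayname, ri_ops->ro_maxpages());
	return 0;
}
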
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 91ffde82fa0c..2c53ea9e1b83 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -53,6 +53,14 @@
53# define RPCDBG_FACILITY RPCDBG_TRANS 53# define RPCDBG_FACILITY RPCDBG_TRANS
54#endif 54#endif
55 55
56enum rpcrdma_chunktype {
57 rpcrdma_noch = 0,
58 rpcrdma_readch,
59 rpcrdma_areadch,
60 rpcrdma_writech,
61 rpcrdma_replych
62};
63
56#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) 64#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
57static const char transfertypes[][12] = { 65static const char transfertypes[][12] = {
58 "pure inline", /* no chunks */ 66 "pure inline", /* no chunks */
@@ -179,6 +187,7 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
179 struct rpcrdma_write_array *warray = NULL; 187 struct rpcrdma_write_array *warray = NULL;
180 struct rpcrdma_write_chunk *cur_wchunk = NULL; 188 struct rpcrdma_write_chunk *cur_wchunk = NULL;
181 __be32 *iptr = headerp->rm_body.rm_chunks; 189 __be32 *iptr = headerp->rm_body.rm_chunks;
190 int (*map)(struct rpcrdma_xprt *, struct rpcrdma_mr_seg *, int, bool);
182 191
183 if (type == rpcrdma_readch || type == rpcrdma_areadch) { 192 if (type == rpcrdma_readch || type == rpcrdma_areadch) {
184 /* a read chunk - server will RDMA Read our memory */ 193 /* a read chunk - server will RDMA Read our memory */
@@ -201,9 +210,9 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
201 if (nsegs < 0) 210 if (nsegs < 0)
202 return nsegs; 211 return nsegs;
203 212
213 map = r_xprt->rx_ia.ri_ops->ro_map;
204 do { 214 do {
205 n = rpcrdma_register_external(seg, nsegs, 215 n = map(r_xprt, seg, nsegs, cur_wchunk != NULL);
206 cur_wchunk != NULL, r_xprt);
207 if (n <= 0) 216 if (n <= 0)
208 goto out; 217 goto out;
209 if (cur_rchunk) { /* read */ 218 if (cur_rchunk) { /* read */
@@ -275,34 +284,13 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
275 return (unsigned char *)iptr - (unsigned char *)headerp; 284 return (unsigned char *)iptr - (unsigned char *)headerp;
276 285
277out: 286out:
278 if (r_xprt->rx_ia.ri_memreg_strategy != RPCRDMA_FRMR) { 287 if (r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_FRMR)
279 for (pos = 0; nchunks--;) 288 return n;
280 pos += rpcrdma_deregister_external(
281 &req->rl_segments[pos], r_xprt);
282 }
283 return n;
284}
285 289
286/* 290 for (pos = 0; nchunks--;)
287 * Marshal chunks. This routine returns the header length 291 pos += r_xprt->rx_ia.ri_ops->ro_unmap(r_xprt,
288 * consumed by marshaling. 292 &req->rl_segments[pos]);
289 * 293 return n;
290 * Returns positive RPC/RDMA header size, or negative errno.
291 */
292
293ssize_t
294rpcrdma_marshal_chunks(struct rpc_rqst *rqst, ssize_t result)
295{
296 struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
297 struct rpcrdma_msg *headerp = rdmab_to_msg(req->rl_rdmabuf);
298
299 if (req->rl_rtype != rpcrdma_noch)
300 result = rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf,
301 headerp, req->rl_rtype);
302 else if (req->rl_wtype != rpcrdma_noch)
303 result = rpcrdma_create_chunks(rqst, &rqst->rq_rcv_buf,
304 headerp, req->rl_wtype);
305 return result;
306} 294}
307 295
308/* 296/*
@@ -397,6 +385,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
397 char *base; 385 char *base;
398 size_t rpclen, padlen; 386 size_t rpclen, padlen;
399 ssize_t hdrlen; 387 ssize_t hdrlen;
388 enum rpcrdma_chunktype rtype, wtype;
400 struct rpcrdma_msg *headerp; 389 struct rpcrdma_msg *headerp;
401 390
402 /* 391 /*
@@ -433,13 +422,13 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
433 * into pages; otherwise use reply chunks. 422 * into pages; otherwise use reply chunks.
434 */ 423 */
435 if (rqst->rq_rcv_buf.buflen <= RPCRDMA_INLINE_READ_THRESHOLD(rqst)) 424 if (rqst->rq_rcv_buf.buflen <= RPCRDMA_INLINE_READ_THRESHOLD(rqst))
436 req->rl_wtype = rpcrdma_noch; 425 wtype = rpcrdma_noch;
437 else if (rqst->rq_rcv_buf.page_len == 0) 426 else if (rqst->rq_rcv_buf.page_len == 0)
438 req->rl_wtype = rpcrdma_replych; 427 wtype = rpcrdma_replych;
439 else if (rqst->rq_rcv_buf.flags & XDRBUF_READ) 428 else if (rqst->rq_rcv_buf.flags & XDRBUF_READ)
440 req->rl_wtype = rpcrdma_writech; 429 wtype = rpcrdma_writech;
441 else 430 else
442 req->rl_wtype = rpcrdma_replych; 431 wtype = rpcrdma_replych;
443 432
444 /* 433 /*
445 * Chunks needed for arguments? 434 * Chunks needed for arguments?
@@ -456,16 +445,16 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
456 * TBD check NFSv4 setacl 445 * TBD check NFSv4 setacl
457 */ 446 */
458 if (rqst->rq_snd_buf.len <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst)) 447 if (rqst->rq_snd_buf.len <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst))
459 req->rl_rtype = rpcrdma_noch; 448 rtype = rpcrdma_noch;
460 else if (rqst->rq_snd_buf.page_len == 0) 449 else if (rqst->rq_snd_buf.page_len == 0)
461 req->rl_rtype = rpcrdma_areadch; 450 rtype = rpcrdma_areadch;
462 else 451 else
463 req->rl_rtype = rpcrdma_readch; 452 rtype = rpcrdma_readch;
464 453
465 /* The following simplification is not true forever */ 454 /* The following simplification is not true forever */
466 if (req->rl_rtype != rpcrdma_noch && req->rl_wtype == rpcrdma_replych) 455 if (rtype != rpcrdma_noch && wtype == rpcrdma_replych)
467 req->rl_wtype = rpcrdma_noch; 456 wtype = rpcrdma_noch;
468 if (req->rl_rtype != rpcrdma_noch && req->rl_wtype != rpcrdma_noch) { 457 if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) {
469 dprintk("RPC: %s: cannot marshal multiple chunk lists\n", 458 dprintk("RPC: %s: cannot marshal multiple chunk lists\n",
470 __func__); 459 __func__);
471 return -EIO; 460 return -EIO;
@@ -479,7 +468,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
479 * When padding is in use and applies to the transfer, insert 468 * When padding is in use and applies to the transfer, insert
480 * it and change the message type. 469 * it and change the message type.
481 */ 470 */
482 if (req->rl_rtype == rpcrdma_noch) { 471 if (rtype == rpcrdma_noch) {
483 472
484 padlen = rpcrdma_inline_pullup(rqst, 473 padlen = rpcrdma_inline_pullup(rqst,
485 RPCRDMA_INLINE_PAD_VALUE(rqst)); 474 RPCRDMA_INLINE_PAD_VALUE(rqst));
@@ -494,7 +483,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
494 headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero; 483 headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero;
495 headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero; 484 headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero;
496 hdrlen += 2 * sizeof(u32); /* extra words in padhdr */ 485 hdrlen += 2 * sizeof(u32); /* extra words in padhdr */
497 if (req->rl_wtype != rpcrdma_noch) { 486 if (wtype != rpcrdma_noch) {
498 dprintk("RPC: %s: invalid chunk list\n", 487 dprintk("RPC: %s: invalid chunk list\n",
499 __func__); 488 __func__);
500 return -EIO; 489 return -EIO;
@@ -515,18 +504,26 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
515 * on receive. Therefore, we request a reply chunk 504 * on receive. Therefore, we request a reply chunk
516 * for non-writes wherever feasible and efficient. 505 * for non-writes wherever feasible and efficient.
517 */ 506 */
518 if (req->rl_wtype == rpcrdma_noch) 507 if (wtype == rpcrdma_noch)
519 req->rl_wtype = rpcrdma_replych; 508 wtype = rpcrdma_replych;
520 } 509 }
521 } 510 }
522 511
523 hdrlen = rpcrdma_marshal_chunks(rqst, hdrlen); 512 if (rtype != rpcrdma_noch) {
513 hdrlen = rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf,
514 headerp, rtype);
515 wtype = rtype; /* simplify dprintk */
516
517 } else if (wtype != rpcrdma_noch) {
518 hdrlen = rpcrdma_create_chunks(rqst, &rqst->rq_rcv_buf,
519 headerp, wtype);
520 }
524 if (hdrlen < 0) 521 if (hdrlen < 0)
525 return hdrlen; 522 return hdrlen;
526 523
527 dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd" 524 dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd"
528 " headerp 0x%p base 0x%p lkey 0x%x\n", 525 " headerp 0x%p base 0x%p lkey 0x%x\n",
529 __func__, transfertypes[req->rl_wtype], hdrlen, rpclen, padlen, 526 __func__, transfertypes[wtype], hdrlen, rpclen, padlen,
530 headerp, base, rdmab_lkey(req->rl_rdmabuf)); 527 headerp, base, rdmab_lkey(req->rl_rdmabuf));
531 528
532 /* 529 /*
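rpcrdma_marshal_req() now chooses the chunk types in local rtype/wtype variables, since marshaling is no longer split between rpcrdma_marshal_req() and the removed rpcrdma_marshal_chunks(). The selection itself is a pair of threshold chains; a standalone sketch with illustrative inline thresholds (the kernel derives the real ones from the mount's inline rsize/wsize):

#include <stdio.h>

enum chunktype { noch, readch, areadch, writech, replych };

/* Illustrative inline thresholds; assumptions for this sketch only. */
#define INLINE_READ_THRESHOLD  1024
#define INLINE_WRITE_THRESHOLD 1024

/* Mirrors the wtype selection in rpcrdma_marshal_req(): small replies
 * fit inline, page-less replies use a reply chunk, XDRBUF_READ-flagged
 * buffers use a write chunk, and everything else a reply chunk. */
static enum chunktype choose_wtype(size_t buflen, size_t page_len,
				   int read_flagged)
{
	if (buflen <= INLINE_READ_THRESHOLD)
		return noch;
	if (page_len == 0)
		return replych;
	return read_flagged ? writech : replych;
}

/* Mirrors the rtype selection: small calls go inline, page-less calls
 * use an all-inclusive read chunk, others a plain read chunk. */
static enum chunktype choose_rtype(size_t sndlen, size_t page_len)
{
	if (sndlen <= INLINE_WRITE_THRESHOLD)
		return noch;
	return page_len == 0 ? areadch : readch;
}

int main(void)
{
	printf("small call:  rtype=%d\n", choose_rtype(512, 0));        /* noch */
	printf("large read:  wtype=%d\n", choose_wtype(65536, 4096, 1)); /* writech */
	return 0;
}
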
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 2e192baa59f3..54f23b1be986 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -157,12 +157,47 @@ static struct ctl_table sunrpc_table[] = {
157static struct rpc_xprt_ops xprt_rdma_procs; /* forward reference */ 157static struct rpc_xprt_ops xprt_rdma_procs; /* forward reference */
158 158
159static void 159static void
160xprt_rdma_format_addresses4(struct rpc_xprt *xprt, struct sockaddr *sap)
161{
162 struct sockaddr_in *sin = (struct sockaddr_in *)sap;
163 char buf[20];
164
165 snprintf(buf, sizeof(buf), "%08x", ntohl(sin->sin_addr.s_addr));
166 xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = kstrdup(buf, GFP_KERNEL);
167
168 xprt->address_strings[RPC_DISPLAY_NETID] = RPCBIND_NETID_RDMA;
169}
170
171static void
172xprt_rdma_format_addresses6(struct rpc_xprt *xprt, struct sockaddr *sap)
173{
174 struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
175 char buf[40];
176
177 snprintf(buf, sizeof(buf), "%pi6", &sin6->sin6_addr);
178 xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = kstrdup(buf, GFP_KERNEL);
179
180 xprt->address_strings[RPC_DISPLAY_NETID] = RPCBIND_NETID_RDMA6;
181}
182
183static void
160xprt_rdma_format_addresses(struct rpc_xprt *xprt) 184xprt_rdma_format_addresses(struct rpc_xprt *xprt)
161{ 185{
162 struct sockaddr *sap = (struct sockaddr *) 186 struct sockaddr *sap = (struct sockaddr *)
163 &rpcx_to_rdmad(xprt).addr; 187 &rpcx_to_rdmad(xprt).addr;
164 struct sockaddr_in *sin = (struct sockaddr_in *)sap; 188 char buf[128];
165 char buf[64]; 189
190 switch (sap->sa_family) {
191 case AF_INET:
192 xprt_rdma_format_addresses4(xprt, sap);
193 break;
194 case AF_INET6:
195 xprt_rdma_format_addresses6(xprt, sap);
196 break;
197 default:
198 pr_err("rpcrdma: Unrecognized address family\n");
199 return;
200 }
166 201
167 (void)rpc_ntop(sap, buf, sizeof(buf)); 202 (void)rpc_ntop(sap, buf, sizeof(buf));
168 xprt->address_strings[RPC_DISPLAY_ADDR] = kstrdup(buf, GFP_KERNEL); 203 xprt->address_strings[RPC_DISPLAY_ADDR] = kstrdup(buf, GFP_KERNEL);
@@ -170,16 +205,10 @@ xprt_rdma_format_addresses(struct rpc_xprt *xprt)
170 snprintf(buf, sizeof(buf), "%u", rpc_get_port(sap)); 205 snprintf(buf, sizeof(buf), "%u", rpc_get_port(sap));
171 xprt->address_strings[RPC_DISPLAY_PORT] = kstrdup(buf, GFP_KERNEL); 206 xprt->address_strings[RPC_DISPLAY_PORT] = kstrdup(buf, GFP_KERNEL);
172 207
173 xprt->address_strings[RPC_DISPLAY_PROTO] = "rdma";
174
175 snprintf(buf, sizeof(buf), "%08x", ntohl(sin->sin_addr.s_addr));
176 xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = kstrdup(buf, GFP_KERNEL);
177
178 snprintf(buf, sizeof(buf), "%4hx", rpc_get_port(sap)); 208 snprintf(buf, sizeof(buf), "%4hx", rpc_get_port(sap));
179 xprt->address_strings[RPC_DISPLAY_HEX_PORT] = kstrdup(buf, GFP_KERNEL); 209 xprt->address_strings[RPC_DISPLAY_HEX_PORT] = kstrdup(buf, GFP_KERNEL);
180 210
181 /* netid */ 211 xprt->address_strings[RPC_DISPLAY_PROTO] = "rdma";
182 xprt->address_strings[RPC_DISPLAY_NETID] = "rdma";
183} 212}
184 213
185static void 214static void
@@ -377,7 +406,10 @@ xprt_setup_rdma(struct xprt_create *args)
377 xprt_rdma_connect_worker); 406 xprt_rdma_connect_worker);
378 407
379 xprt_rdma_format_addresses(xprt); 408 xprt_rdma_format_addresses(xprt);
380 xprt->max_payload = rpcrdma_max_payload(new_xprt); 409 xprt->max_payload = new_xprt->rx_ia.ri_ops->ro_maxpages(new_xprt);
410 if (xprt->max_payload == 0)
411 goto out4;
412 xprt->max_payload <<= PAGE_SHIFT;
381 dprintk("RPC: %s: transport data payload maximum: %zu bytes\n", 413 dprintk("RPC: %s: transport data payload maximum: %zu bytes\n",
382 __func__, xprt->max_payload); 414 __func__, xprt->max_payload);
383 415
@@ -552,8 +584,8 @@ xprt_rdma_free(void *buffer)
552 584
553 for (i = 0; req->rl_nchunks;) { 585 for (i = 0; req->rl_nchunks;) {
554 --req->rl_nchunks; 586 --req->rl_nchunks;
555 i += rpcrdma_deregister_external( 587 i += r_xprt->rx_ia.ri_ops->ro_unmap(r_xprt,
556 &req->rl_segments[i], r_xprt); 588 &req->rl_segments[i]);
557 } 589 }
558 590
559 rpcrdma_buffer_put(req); 591 rpcrdma_buffer_put(req);
@@ -579,10 +611,7 @@ xprt_rdma_send_request(struct rpc_task *task)
579 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); 611 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
580 int rc = 0; 612 int rc = 0;
581 613
582 if (req->rl_niovs == 0) 614 rc = rpcrdma_marshal_req(rqst);
583 rc = rpcrdma_marshal_req(rqst);
584 else if (r_xprt->rx_ia.ri_memreg_strategy != RPCRDMA_ALLPHYSICAL)
585 rc = rpcrdma_marshal_chunks(rqst, 0);
586 if (rc < 0) 615 if (rc < 0)
587 goto failed_marshal; 616 goto failed_marshal;
588 617
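xprt_rdma_format_addresses() previously cast every peer address to sockaddr_in, so IPv6 peers produced garbage HEX_ADDR strings; it now branches per address family. A userspace sketch of the same branching; the kernel's %pi6 printk format has no libc equivalent, so inet_ntop() stands in for the IPv6 case:

#include <arpa/inet.h>
#include <stdio.h>

static void format_hex_addr(const struct sockaddr *sap)
{
	char buf[64];

	switch (sap->sa_family) {
	case AF_INET: {
		const struct sockaddr_in *sin =
			(const struct sockaddr_in *)sap;

		/* same "%08x" rendering as xprt_rdma_format_addresses4() */
		snprintf(buf, sizeof(buf), "%08x",
			 ntohl(sin->sin_addr.s_addr));
		break;
	}
	case AF_INET6: {
		const struct sockaddr_in6 *sin6 =
			(const struct sockaddr_in6 *)sap;

		/* stand-in for the kernel's "%pi6" format */
		inet_ntop(AF_INET6, &sin6->sin6_addr, buf, sizeof(buf));
		break;
	}
	default:
		fprintf(stderr, "unrecognized address family\n");
		return;
	}
	printf("hex addr: %s\n", buf);
}

int main(void)
{
	struct sockaddr_in sin = { .sin_family = AF_INET };

	inet_pton(AF_INET, "192.168.1.10", &sin.sin_addr);
	format_hex_addr((struct sockaddr *)&sin);	/* prints c0a8010a */
	return 0;
}
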
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index e28909fddd30..4870d272e006 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -50,6 +50,7 @@
50#include <linux/interrupt.h> 50#include <linux/interrupt.h>
51#include <linux/slab.h> 51#include <linux/slab.h>
52#include <linux/prefetch.h> 52#include <linux/prefetch.h>
53#include <linux/sunrpc/addr.h>
53#include <asm/bitops.h> 54#include <asm/bitops.h>
54 55
55#include "xprt_rdma.h" 56#include "xprt_rdma.h"
@@ -62,9 +63,6 @@
62# define RPCDBG_FACILITY RPCDBG_TRANS 63# define RPCDBG_FACILITY RPCDBG_TRANS
63#endif 64#endif
64 65
65static void rpcrdma_reset_frmrs(struct rpcrdma_ia *);
66static void rpcrdma_reset_fmrs(struct rpcrdma_ia *);
67
68/* 66/*
69 * internal functions 67 * internal functions
70 */ 68 */
@@ -188,7 +186,7 @@ static const char * const wc_status[] = {
188 "remote access error", 186 "remote access error",
189 "remote operation error", 187 "remote operation error",
190 "transport retry counter exceeded", 188 "transport retry counter exceeded",
191 "RNR retrycounter exceeded", 189 "RNR retry counter exceeded",
192 "local RDD violation error", 190 "local RDD violation error",
193 "remove invalid RD request", 191 "remove invalid RD request",
194 "operation aborted", 192 "operation aborted",
@@ -206,21 +204,17 @@ static const char * const wc_status[] = {
206static void 204static void
207rpcrdma_sendcq_process_wc(struct ib_wc *wc) 205rpcrdma_sendcq_process_wc(struct ib_wc *wc)
208{ 206{
209 if (likely(wc->status == IB_WC_SUCCESS))
210 return;
211
212 /* WARNING: Only wr_id and status are reliable at this point */ 207 /* WARNING: Only wr_id and status are reliable at this point */
213 if (wc->wr_id == 0ULL) { 208 if (wc->wr_id == RPCRDMA_IGNORE_COMPLETION) {
214 if (wc->status != IB_WC_WR_FLUSH_ERR) 209 if (wc->status != IB_WC_SUCCESS &&
210 wc->status != IB_WC_WR_FLUSH_ERR)
215 pr_err("RPC: %s: SEND: %s\n", 211 pr_err("RPC: %s: SEND: %s\n",
216 __func__, COMPLETION_MSG(wc->status)); 212 __func__, COMPLETION_MSG(wc->status));
217 } else { 213 } else {
218 struct rpcrdma_mw *r; 214 struct rpcrdma_mw *r;
219 215
220 r = (struct rpcrdma_mw *)(unsigned long)wc->wr_id; 216 r = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
221 r->r.frmr.fr_state = FRMR_IS_STALE; 217 r->mw_sendcompletion(wc);
222 pr_err("RPC: %s: frmr %p (stale): %s\n",
223 __func__, r, COMPLETION_MSG(wc->status));
224 } 218 }
225} 219}
226 220
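Send completions now carry either the RPCRDMA_IGNORE_COMPLETION sentinel (plain SENDs need no success handling) or a pointer to an rpcrdma_mw, whose per-strategy mw_sendcompletion handler runs; frwr_op_init() installs frwr_sendcompletion() so flushed registrations mark the FRMR stale. A reduced sketch of that dispatch, with the IB types and status codes simplified and the handler given an explicit mw argument for brevity:

#include <stdio.h>

#define IGNORE_COMPLETION 0xFFFFFFFFULL
enum { WC_SUCCESS, WC_FLUSH_ERR, WC_OTHER_ERR };

struct wc { unsigned long long wr_id; int status; };
struct mw {
	void (*mw_sendcompletion)(struct wc *, struct mw *);
	int stale;
};

/* Per-strategy handler, like frwr_sendcompletion(): only failed
 * completions mark the MR stale. */
static void frwr_sendcompletion(struct wc *wc, struct mw *r)
{
	if (wc->status == WC_SUCCESS)
		return;
	r->stale = 1;
}

/* Mirrors rpcrdma_sendcq_process_wc(): sentinel wr_ids are logged only
 * on real errors; otherwise dispatch through the MW's handler. */
static void sendcq_process_wc(struct wc *wc)
{
	if (wc->wr_id == IGNORE_COMPLETION) {
		if (wc->status != WC_SUCCESS && wc->status != WC_FLUSH_ERR)
			fprintf(stderr, "SEND failed: %d\n", wc->status);
		return;
	}
	struct mw *r = (struct mw *)(unsigned long)wc->wr_id;
	r->mw_sendcompletion(wc, r);
}

int main(void)
{
	struct mw r = { frwr_sendcompletion, 0 };
	struct wc bad = {
		(unsigned long long)(unsigned long)&r, WC_OTHER_ERR
	};

	sendcq_process_wc(&bad);
	printf("frmr stale: %d\n", r.stale);	/* 1 */
	return 0;
}
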
@@ -424,7 +418,7 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
424 struct rpcrdma_ia *ia = &xprt->rx_ia; 418 struct rpcrdma_ia *ia = &xprt->rx_ia;
425 struct rpcrdma_ep *ep = &xprt->rx_ep; 419 struct rpcrdma_ep *ep = &xprt->rx_ep;
426#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) 420#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
427 struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr; 421 struct sockaddr *sap = (struct sockaddr *)&ep->rep_remote_addr;
428#endif 422#endif
429 struct ib_qp_attr *attr = &ia->ri_qp_attr; 423 struct ib_qp_attr *attr = &ia->ri_qp_attr;
430 struct ib_qp_init_attr *iattr = &ia->ri_qp_init_attr; 424 struct ib_qp_init_attr *iattr = &ia->ri_qp_init_attr;
@@ -480,9 +474,8 @@ connected:
480 wake_up_all(&ep->rep_connect_wait); 474 wake_up_all(&ep->rep_connect_wait);
481 /*FALLTHROUGH*/ 475 /*FALLTHROUGH*/
482 default: 476 default:
483 dprintk("RPC: %s: %pI4:%u (ep 0x%p): %s\n", 477 dprintk("RPC: %s: %pIS:%u (ep 0x%p): %s\n",
484 __func__, &addr->sin_addr.s_addr, 478 __func__, sap, rpc_get_port(sap), ep,
485 ntohs(addr->sin_port), ep,
486 CONNECTION_MSG(event->event)); 479 CONNECTION_MSG(event->event));
487 break; 480 break;
488 } 481 }
@@ -491,19 +484,16 @@ connected:
491 if (connstate == 1) { 484 if (connstate == 1) {
492 int ird = attr->max_dest_rd_atomic; 485 int ird = attr->max_dest_rd_atomic;
493 int tird = ep->rep_remote_cma.responder_resources; 486 int tird = ep->rep_remote_cma.responder_resources;
494 printk(KERN_INFO "rpcrdma: connection to %pI4:%u " 487
495 "on %s, memreg %d slots %d ird %d%s\n", 488 pr_info("rpcrdma: connection to %pIS:%u on %s, memreg '%s', %d credits, %d responders%s\n",
496 &addr->sin_addr.s_addr, 489 sap, rpc_get_port(sap),
497 ntohs(addr->sin_port),
498 ia->ri_id->device->name, 490 ia->ri_id->device->name,
499 ia->ri_memreg_strategy, 491 ia->ri_ops->ro_displayname,
500 xprt->rx_buf.rb_max_requests, 492 xprt->rx_buf.rb_max_requests,
501 ird, ird < 4 && ird < tird / 2 ? " (low!)" : ""); 493 ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
502 } else if (connstate < 0) { 494 } else if (connstate < 0) {
503 printk(KERN_INFO "rpcrdma: connection to %pI4:%u closed (%d)\n", 495 pr_info("rpcrdma: connection to %pIS:%u closed (%d)\n",
504 &addr->sin_addr.s_addr, 496 sap, rpc_get_port(sap), connstate);
505 ntohs(addr->sin_port),
506 connstate);
507 } 497 }
508#endif 498#endif
509 499
@@ -621,17 +611,13 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
621 611
622 if (memreg == RPCRDMA_FRMR) { 612 if (memreg == RPCRDMA_FRMR) {
623 /* Requires both frmr reg and local dma lkey */ 613 /* Requires both frmr reg and local dma lkey */
624 if ((devattr->device_cap_flags & 614 if (((devattr->device_cap_flags &
625 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) != 615 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
626 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) { 616 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) ||
617 (devattr->max_fast_reg_page_list_len == 0)) {
627 dprintk("RPC: %s: FRMR registration " 618 dprintk("RPC: %s: FRMR registration "
628 "not supported by HCA\n", __func__); 619 "not supported by HCA\n", __func__);
629 memreg = RPCRDMA_MTHCAFMR; 620 memreg = RPCRDMA_MTHCAFMR;
630 } else {
631 /* Mind the ia limit on FRMR page list depth */
632 ia->ri_max_frmr_depth = min_t(unsigned int,
633 RPCRDMA_MAX_DATA_SEGS,
634 devattr->max_fast_reg_page_list_len);
635 } 621 }
636 } 622 }
637 if (memreg == RPCRDMA_MTHCAFMR) { 623 if (memreg == RPCRDMA_MTHCAFMR) {
@@ -652,13 +638,16 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
652 */ 638 */
653 switch (memreg) { 639 switch (memreg) {
654 case RPCRDMA_FRMR: 640 case RPCRDMA_FRMR:
641 ia->ri_ops = &rpcrdma_frwr_memreg_ops;
655 break; 642 break;
656 case RPCRDMA_ALLPHYSICAL: 643 case RPCRDMA_ALLPHYSICAL:
644 ia->ri_ops = &rpcrdma_physical_memreg_ops;
657 mem_priv = IB_ACCESS_LOCAL_WRITE | 645 mem_priv = IB_ACCESS_LOCAL_WRITE |
658 IB_ACCESS_REMOTE_WRITE | 646 IB_ACCESS_REMOTE_WRITE |
659 IB_ACCESS_REMOTE_READ; 647 IB_ACCESS_REMOTE_READ;
660 goto register_setup; 648 goto register_setup;
661 case RPCRDMA_MTHCAFMR: 649 case RPCRDMA_MTHCAFMR:
650 ia->ri_ops = &rpcrdma_fmr_memreg_ops;
662 if (ia->ri_have_dma_lkey) 651 if (ia->ri_have_dma_lkey)
663 break; 652 break;
664 mem_priv = IB_ACCESS_LOCAL_WRITE; 653 mem_priv = IB_ACCESS_LOCAL_WRITE;
@@ -678,8 +667,8 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
678 rc = -ENOMEM; 667 rc = -ENOMEM;
679 goto out3; 668 goto out3;
680 } 669 }
681 dprintk("RPC: %s: memory registration strategy is %d\n", 670 dprintk("RPC: %s: memory registration strategy is '%s'\n",
682 __func__, memreg); 671 __func__, ia->ri_ops->ro_displayname);
683 672
684 /* Else will do memory reg/dereg for each chunk */ 673 /* Else will do memory reg/dereg for each chunk */
685 ia->ri_memreg_strategy = memreg; 674 ia->ri_memreg_strategy = memreg;
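FRWR is now selected only when the HCA advertises both MEM_MGT_EXTENSIONS and a local DMA lkey and reports a nonzero fast-reg page-list depth; otherwise the code quietly falls back to FMR. A sketch of the flag test, with hypothetical capability-bit values standing in for the ib_device_attr constants:

#include <stdio.h>

/* Hypothetical stand-ins for the ib_device_attr capability bits */
#define DEV_MEM_MGT_EXTENSIONS (1u << 0)
#define DEV_LOCAL_DMA_LKEY     (1u << 1)

enum memreg { MEMREG_FRMR, MEMREG_MTHCAFMR };

/* Mirrors the fallback logic in rpcrdma_ia_open(): both capability
 * bits and a usable page-list depth are required for FRWR. */
static enum memreg pick_memreg(unsigned int cap_flags,
			       unsigned int max_fast_reg_page_list_len)
{
	const unsigned int need = DEV_MEM_MGT_EXTENSIONS |
				  DEV_LOCAL_DMA_LKEY;

	if ((cap_flags & need) != need || max_fast_reg_page_list_len == 0)
		return MEMREG_MTHCAFMR;	/* fall back to FMR */
	return MEMREG_FRMR;
}

int main(void)
{
	unsigned int caps = DEV_MEM_MGT_EXTENSIONS | DEV_LOCAL_DMA_LKEY;

	printf("%d\n", pick_memreg(caps, 256));	/* 0: FRWR */
	printf("%d\n", pick_memreg(caps, 0));	/* 1: falls back to FMR */
	return 0;
}
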
@@ -743,49 +732,11 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
743 732
744 ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall; 733 ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
745 ep->rep_attr.qp_context = ep; 734 ep->rep_attr.qp_context = ep;
746 /* send_cq and recv_cq initialized below */
747 ep->rep_attr.srq = NULL; 735 ep->rep_attr.srq = NULL;
748 ep->rep_attr.cap.max_send_wr = cdata->max_requests; 736 ep->rep_attr.cap.max_send_wr = cdata->max_requests;
749 switch (ia->ri_memreg_strategy) { 737 rc = ia->ri_ops->ro_open(ia, ep, cdata);
750 case RPCRDMA_FRMR: { 738 if (rc)
751 int depth = 7; 739 return rc;
752
753 /* Add room for frmr register and invalidate WRs.
754 * 1. FRMR reg WR for head
755 * 2. FRMR invalidate WR for head
756 * 3. N FRMR reg WRs for pagelist
757 * 4. N FRMR invalidate WRs for pagelist
758 * 5. FRMR reg WR for tail
759 * 6. FRMR invalidate WR for tail
760 * 7. The RDMA_SEND WR
761 */
762
763 /* Calculate N if the device max FRMR depth is smaller than
764 * RPCRDMA_MAX_DATA_SEGS.
765 */
766 if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) {
767 int delta = RPCRDMA_MAX_DATA_SEGS -
768 ia->ri_max_frmr_depth;
769
770 do {
771 depth += 2; /* FRMR reg + invalidate */
772 delta -= ia->ri_max_frmr_depth;
773 } while (delta > 0);
774
775 }
776 ep->rep_attr.cap.max_send_wr *= depth;
777 if (ep->rep_attr.cap.max_send_wr > devattr->max_qp_wr) {
778 cdata->max_requests = devattr->max_qp_wr / depth;
779 if (!cdata->max_requests)
780 return -EINVAL;
781 ep->rep_attr.cap.max_send_wr = cdata->max_requests *
782 depth;
783 }
784 break;
785 }
786 default:
787 break;
788 }
789 ep->rep_attr.cap.max_recv_wr = cdata->max_requests; 740 ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
790 ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2); 741 ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
791 ep->rep_attr.cap.max_recv_sge = 1; 742 ep->rep_attr.cap.max_recv_sge = 1;
@@ -944,21 +895,9 @@ retry:
944 rpcrdma_ep_disconnect(ep, ia); 895 rpcrdma_ep_disconnect(ep, ia);
945 rpcrdma_flush_cqs(ep); 896 rpcrdma_flush_cqs(ep);
946 897
947 switch (ia->ri_memreg_strategy) {
948 case RPCRDMA_FRMR:
949 rpcrdma_reset_frmrs(ia);
950 break;
951 case RPCRDMA_MTHCAFMR:
952 rpcrdma_reset_fmrs(ia);
953 break;
954 case RPCRDMA_ALLPHYSICAL:
955 break;
956 default:
957 rc = -EIO;
958 goto out;
959 }
960
961 xprt = container_of(ia, struct rpcrdma_xprt, rx_ia); 898 xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
899 ia->ri_ops->ro_reset(xprt);
900
962 id = rpcrdma_create_id(xprt, ia, 901 id = rpcrdma_create_id(xprt, ia,
963 (struct sockaddr *)&xprt->rx_data.addr); 902 (struct sockaddr *)&xprt->rx_data.addr);
964 if (IS_ERR(id)) { 903 if (IS_ERR(id)) {
@@ -1123,91 +1062,6 @@ out:
1123 return ERR_PTR(rc); 1062 return ERR_PTR(rc);
1124} 1063}
1125 1064
1126static int
1127rpcrdma_init_fmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf)
1128{
1129 int mr_access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ;
1130 struct ib_fmr_attr fmr_attr = {
1131 .max_pages = RPCRDMA_MAX_DATA_SEGS,
1132 .max_maps = 1,
1133 .page_shift = PAGE_SHIFT
1134 };
1135 struct rpcrdma_mw *r;
1136 int i, rc;
1137
1138 i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
1139 dprintk("RPC: %s: initializing %d FMRs\n", __func__, i);
1140
1141 while (i--) {
1142 r = kzalloc(sizeof(*r), GFP_KERNEL);
1143 if (r == NULL)
1144 return -ENOMEM;
1145
1146 r->r.fmr = ib_alloc_fmr(ia->ri_pd, mr_access_flags, &fmr_attr);
1147 if (IS_ERR(r->r.fmr)) {
1148 rc = PTR_ERR(r->r.fmr);
1149 dprintk("RPC: %s: ib_alloc_fmr failed %i\n",
1150 __func__, rc);
1151 goto out_free;
1152 }
1153
1154 list_add(&r->mw_list, &buf->rb_mws);
1155 list_add(&r->mw_all, &buf->rb_all);
1156 }
1157 return 0;
1158
1159out_free:
1160 kfree(r);
1161 return rc;
1162}
1163
1164static int
1165rpcrdma_init_frmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf)
1166{
1167 struct rpcrdma_frmr *f;
1168 struct rpcrdma_mw *r;
1169 int i, rc;
1170
1171 i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
1172 dprintk("RPC: %s: initializing %d FRMRs\n", __func__, i);
1173
1174 while (i--) {
1175 r = kzalloc(sizeof(*r), GFP_KERNEL);
1176 if (r == NULL)
1177 return -ENOMEM;
1178 f = &r->r.frmr;
1179
1180 f->fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
1181 ia->ri_max_frmr_depth);
1182 if (IS_ERR(f->fr_mr)) {
1183 rc = PTR_ERR(f->fr_mr);
1184 dprintk("RPC: %s: ib_alloc_fast_reg_mr "
1185 "failed %i\n", __func__, rc);
1186 goto out_free;
1187 }
1188
1189 f->fr_pgl = ib_alloc_fast_reg_page_list(ia->ri_id->device,
1190 ia->ri_max_frmr_depth);
1191 if (IS_ERR(f->fr_pgl)) {
1192 rc = PTR_ERR(f->fr_pgl);
1193 dprintk("RPC: %s: ib_alloc_fast_reg_page_list "
1194 "failed %i\n", __func__, rc);
1195
1196 ib_dereg_mr(f->fr_mr);
1197 goto out_free;
1198 }
1199
1200 list_add(&r->mw_list, &buf->rb_mws);
1201 list_add(&r->mw_all, &buf->rb_all);
1202 }
1203
1204 return 0;
1205
1206out_free:
1207 kfree(r);
1208 return rc;
1209}
1210
1211int 1065int
1212rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) 1066rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
1213{ 1067{
@@ -1244,22 +1098,9 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
1244 buf->rb_recv_bufs = (struct rpcrdma_rep **) p; 1098 buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
1245 p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests]; 1099 p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];
1246 1100
1247 INIT_LIST_HEAD(&buf->rb_mws); 1101 rc = ia->ri_ops->ro_init(r_xprt);
1248 INIT_LIST_HEAD(&buf->rb_all); 1102 if (rc)
1249 switch (ia->ri_memreg_strategy) { 1103 goto out;
1250 case RPCRDMA_FRMR:
1251 rc = rpcrdma_init_frmrs(ia, buf);
1252 if (rc)
1253 goto out;
1254 break;
1255 case RPCRDMA_MTHCAFMR:
1256 rc = rpcrdma_init_fmrs(ia, buf);
1257 if (rc)
1258 goto out;
1259 break;
1260 default:
1261 break;
1262 }
1263 1104
1264 for (i = 0; i < buf->rb_max_requests; i++) { 1105 for (i = 0; i < buf->rb_max_requests; i++) {
1265 struct rpcrdma_req *req; 1106 struct rpcrdma_req *req;
@@ -1311,47 +1152,6 @@ rpcrdma_destroy_req(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
1311 kfree(req); 1152 kfree(req);
1312} 1153}
1313 1154
1314static void
1315rpcrdma_destroy_fmrs(struct rpcrdma_buffer *buf)
1316{
1317 struct rpcrdma_mw *r;
1318 int rc;
1319
1320 while (!list_empty(&buf->rb_all)) {
1321 r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
1322 list_del(&r->mw_all);
1323 list_del(&r->mw_list);
1324
1325 rc = ib_dealloc_fmr(r->r.fmr);
1326 if (rc)
1327 dprintk("RPC: %s: ib_dealloc_fmr failed %i\n",
1328 __func__, rc);
1329
1330 kfree(r);
1331 }
1332}
1333
1334static void
1335rpcrdma_destroy_frmrs(struct rpcrdma_buffer *buf)
1336{
1337 struct rpcrdma_mw *r;
1338 int rc;
1339
1340 while (!list_empty(&buf->rb_all)) {
1341 r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
1342 list_del(&r->mw_all);
1343 list_del(&r->mw_list);
1344
1345 rc = ib_dereg_mr(r->r.frmr.fr_mr);
1346 if (rc)
1347 dprintk("RPC: %s: ib_dereg_mr failed %i\n",
1348 __func__, rc);
1349 ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
1350
1351 kfree(r);
1352 }
1353}
1354
1355void 1155void
1356rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) 1156rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
1357{ 1157{
@@ -1372,104 +1172,11 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
1372 rpcrdma_destroy_req(ia, buf->rb_send_bufs[i]); 1172 rpcrdma_destroy_req(ia, buf->rb_send_bufs[i]);
1373 } 1173 }
1374 1174
1375 switch (ia->ri_memreg_strategy) { 1175 ia->ri_ops->ro_destroy(buf);
1376 case RPCRDMA_FRMR:
1377 rpcrdma_destroy_frmrs(buf);
1378 break;
1379 case RPCRDMA_MTHCAFMR:
1380 rpcrdma_destroy_fmrs(buf);
1381 break;
1382 default:
1383 break;
1384 }
1385 1176
1386 kfree(buf->rb_pool); 1177 kfree(buf->rb_pool);
1387} 1178}
1388 1179
1389/* After a disconnect, unmap all FMRs.
1390 *
1391 * This is invoked only in the transport connect worker in order
1392 * to serialize with rpcrdma_register_fmr_external().
1393 */
1394static void
1395rpcrdma_reset_fmrs(struct rpcrdma_ia *ia)
1396{
1397 struct rpcrdma_xprt *r_xprt =
1398 container_of(ia, struct rpcrdma_xprt, rx_ia);
1399 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
1400 struct list_head *pos;
1401 struct rpcrdma_mw *r;
1402 LIST_HEAD(l);
1403 int rc;
1404
1405 list_for_each(pos, &buf->rb_all) {
1406 r = list_entry(pos, struct rpcrdma_mw, mw_all);
1407
1408 INIT_LIST_HEAD(&l);
1409 list_add(&r->r.fmr->list, &l);
1410 rc = ib_unmap_fmr(&l);
1411 if (rc)
1412 dprintk("RPC: %s: ib_unmap_fmr failed %i\n",
1413 __func__, rc);
1414 }
1415}
1416
1417/* After a disconnect, a flushed FAST_REG_MR can leave an FRMR in
1418 * an unusable state. Find FRMRs in this state and dereg / reg
1419 * each. FRMRs that are VALID and attached to an rpcrdma_req are
1420 * also torn down.
1421 *
1422 * This gives all in-use FRMRs a fresh rkey and leaves them INVALID.
1423 *
1424 * This is invoked only in the transport connect worker in order
1425 * to serialize with rpcrdma_register_frmr_external().
1426 */
1427static void
1428rpcrdma_reset_frmrs(struct rpcrdma_ia *ia)
1429{
1430 struct rpcrdma_xprt *r_xprt =
1431 container_of(ia, struct rpcrdma_xprt, rx_ia);
1432 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
1433 struct list_head *pos;
1434 struct rpcrdma_mw *r;
1435 int rc;
1436
1437 list_for_each(pos, &buf->rb_all) {
1438 r = list_entry(pos, struct rpcrdma_mw, mw_all);
1439
1440 if (r->r.frmr.fr_state == FRMR_IS_INVALID)
1441 continue;
1442
1443 rc = ib_dereg_mr(r->r.frmr.fr_mr);
1444 if (rc)
1445 dprintk("RPC: %s: ib_dereg_mr failed %i\n",
1446 __func__, rc);
1447 ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
1448
1449 r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
1450 ia->ri_max_frmr_depth);
1451 if (IS_ERR(r->r.frmr.fr_mr)) {
1452 rc = PTR_ERR(r->r.frmr.fr_mr);
1453 dprintk("RPC: %s: ib_alloc_fast_reg_mr"
1454 " failed %i\n", __func__, rc);
1455 continue;
1456 }
1457 r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list(
1458 ia->ri_id->device,
1459 ia->ri_max_frmr_depth);
1460 if (IS_ERR(r->r.frmr.fr_pgl)) {
1461 rc = PTR_ERR(r->r.frmr.fr_pgl);
1462 dprintk("RPC: %s: "
1463 "ib_alloc_fast_reg_page_list "
1464 "failed %i\n", __func__, rc);
1465
1466 ib_dereg_mr(r->r.frmr.fr_mr);
1467 continue;
1468 }
1469 r->r.frmr.fr_state = FRMR_IS_INVALID;
1470 }
1471}
1472
1473/* "*mw" can be NULL when rpcrdma_buffer_get_mrs() fails, leaving 1180/* "*mw" can be NULL when rpcrdma_buffer_get_mrs() fails, leaving
1474 * some req segments uninitialized. 1181 * some req segments uninitialized.
1475 */ 1182 */
@@ -1509,7 +1216,7 @@ rpcrdma_buffer_put_sendbuf(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
1509 } 1216 }
1510} 1217}
1511 1218
1512/* rpcrdma_unmap_one() was already done by rpcrdma_deregister_frmr_external(). 1219/* rpcrdma_unmap_one() was already done during deregistration.
1513 * Redo only the ib_post_send(). 1220 * Redo only the ib_post_send().
1514 */ 1221 */
1515static void 1222static void
@@ -1729,6 +1436,14 @@ rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
1729 * Wrappers for internal-use kmalloc memory registration, used by buffer code. 1436 * Wrappers for internal-use kmalloc memory registration, used by buffer code.
1730 */ 1437 */
1731 1438
1439void
1440rpcrdma_mapping_error(struct rpcrdma_mr_seg *seg)
1441{
1442 dprintk("RPC: map_one: offset %p iova %llx len %zu\n",
1443 seg->mr_offset,
1444 (unsigned long long)seg->mr_dma, seg->mr_dmalen);
1445}
1446
1732static int 1447static int
1733rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len, 1448rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
1734 struct ib_mr **mrp, struct ib_sge *iov) 1449 struct ib_mr **mrp, struct ib_sge *iov)
@@ -1854,287 +1569,6 @@ rpcrdma_free_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb)
1854} 1569}
1855 1570
1856/* 1571/*
1857 * Wrappers for chunk registration, shared by read/write chunk code.
1858 */
1859
1860static void
1861rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing)
1862{
1863 seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
1864 seg->mr_dmalen = seg->mr_len;
1865 if (seg->mr_page)
1866 seg->mr_dma = ib_dma_map_page(ia->ri_id->device,
1867 seg->mr_page, offset_in_page(seg->mr_offset),
1868 seg->mr_dmalen, seg->mr_dir);
1869 else
1870 seg->mr_dma = ib_dma_map_single(ia->ri_id->device,
1871 seg->mr_offset,
1872 seg->mr_dmalen, seg->mr_dir);
1873 if (ib_dma_mapping_error(ia->ri_id->device, seg->mr_dma)) {
1874 dprintk("RPC: %s: mr_dma %llx mr_offset %p mr_dma_len %zu\n",
1875 __func__,
1876 (unsigned long long)seg->mr_dma,
1877 seg->mr_offset, seg->mr_dmalen);
1878 }
1879}
1880
1881static void
1882rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
1883{
1884 if (seg->mr_page)
1885 ib_dma_unmap_page(ia->ri_id->device,
1886 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1887 else
1888 ib_dma_unmap_single(ia->ri_id->device,
1889 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1890}
1891
1892static int
1893rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
1894 int *nsegs, int writing, struct rpcrdma_ia *ia,
1895 struct rpcrdma_xprt *r_xprt)
1896{
1897 struct rpcrdma_mr_seg *seg1 = seg;
1898 struct rpcrdma_mw *mw = seg1->rl_mw;
1899 struct rpcrdma_frmr *frmr = &mw->r.frmr;
1900 struct ib_mr *mr = frmr->fr_mr;
1901 struct ib_send_wr fastreg_wr, *bad_wr;
1902 u8 key;
1903 int len, pageoff;
1904 int i, rc;
1905 int seg_len;
1906 u64 pa;
1907 int page_no;
1908
1909 pageoff = offset_in_page(seg1->mr_offset);
1910 seg1->mr_offset -= pageoff; /* start of page */
1911 seg1->mr_len += pageoff;
1912 len = -pageoff;
1913 if (*nsegs > ia->ri_max_frmr_depth)
1914 *nsegs = ia->ri_max_frmr_depth;
1915 for (page_no = i = 0; i < *nsegs;) {
1916 rpcrdma_map_one(ia, seg, writing);
1917 pa = seg->mr_dma;
1918 for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) {
1919 frmr->fr_pgl->page_list[page_no++] = pa;
1920 pa += PAGE_SIZE;
1921 }
1922 len += seg->mr_len;
1923 ++seg;
1924 ++i;
1925 /* Check for holes */
1926 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1927 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
1928 break;
1929 }
1930 dprintk("RPC: %s: Using frmr %p to map %d segments\n",
1931 __func__, mw, i);
1932
1933 frmr->fr_state = FRMR_IS_VALID;
1934
1935 memset(&fastreg_wr, 0, sizeof(fastreg_wr));
1936 fastreg_wr.wr_id = (unsigned long)(void *)mw;
1937 fastreg_wr.opcode = IB_WR_FAST_REG_MR;
1938 fastreg_wr.wr.fast_reg.iova_start = seg1->mr_dma;
1939 fastreg_wr.wr.fast_reg.page_list = frmr->fr_pgl;
1940 fastreg_wr.wr.fast_reg.page_list_len = page_no;
1941 fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
1942 fastreg_wr.wr.fast_reg.length = page_no << PAGE_SHIFT;
1943 if (fastreg_wr.wr.fast_reg.length < len) {
1944 rc = -EIO;
1945 goto out_err;
1946 }
1947
1948 /* Bump the key */
1949 key = (u8)(mr->rkey & 0x000000FF);
1950 ib_update_fast_reg_key(mr, ++key);
1951
1952 fastreg_wr.wr.fast_reg.access_flags = (writing ?
1953 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
1954 IB_ACCESS_REMOTE_READ);
1955 fastreg_wr.wr.fast_reg.rkey = mr->rkey;
1956 DECR_CQCOUNT(&r_xprt->rx_ep);
1957
1958 rc = ib_post_send(ia->ri_id->qp, &fastreg_wr, &bad_wr);
1959 if (rc) {
1960 dprintk("RPC: %s: failed ib_post_send for register,"
1961 " status %i\n", __func__, rc);
1962 ib_update_fast_reg_key(mr, --key);
1963 goto out_err;
1964 } else {
1965 seg1->mr_rkey = mr->rkey;
1966 seg1->mr_base = seg1->mr_dma + pageoff;
1967 seg1->mr_nsegs = i;
1968 seg1->mr_len = len;
1969 }
1970 *nsegs = i;
1971 return 0;
1972out_err:
1973 frmr->fr_state = FRMR_IS_INVALID;
1974 while (i--)
1975 rpcrdma_unmap_one(ia, --seg);
1976 return rc;
1977}
1978
1979static int
1980rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
1981 struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt)
1982{
1983 struct rpcrdma_mr_seg *seg1 = seg;
1984 struct ib_send_wr invalidate_wr, *bad_wr;
1985 int rc;
1986
1987 seg1->rl_mw->r.frmr.fr_state = FRMR_IS_INVALID;
1988
1989 memset(&invalidate_wr, 0, sizeof invalidate_wr);
1990 invalidate_wr.wr_id = (unsigned long)(void *)seg1->rl_mw;
1991 invalidate_wr.opcode = IB_WR_LOCAL_INV;
1992 invalidate_wr.ex.invalidate_rkey = seg1->rl_mw->r.frmr.fr_mr->rkey;
1993 DECR_CQCOUNT(&r_xprt->rx_ep);
1994
1995 read_lock(&ia->ri_qplock);
1996 while (seg1->mr_nsegs--)
1997 rpcrdma_unmap_one(ia, seg++);
1998 rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
1999 read_unlock(&ia->ri_qplock);
2000 if (rc) {
2001 /* Force rpcrdma_buffer_get() to retry */
2002 seg1->rl_mw->r.frmr.fr_state = FRMR_IS_STALE;
2003 dprintk("RPC: %s: failed ib_post_send for invalidate,"
2004 " status %i\n", __func__, rc);
2005 }
2006 return rc;
2007}
-
-static int
-rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg,
-			int *nsegs, int writing, struct rpcrdma_ia *ia)
-{
-	struct rpcrdma_mr_seg *seg1 = seg;
-	u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
-	int len, pageoff, i, rc;
-
-	pageoff = offset_in_page(seg1->mr_offset);
-	seg1->mr_offset -= pageoff;	/* start of page */
-	seg1->mr_len += pageoff;
-	len = -pageoff;
-	if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
-		*nsegs = RPCRDMA_MAX_DATA_SEGS;
-	for (i = 0; i < *nsegs;) {
-		rpcrdma_map_one(ia, seg, writing);
-		physaddrs[i] = seg->mr_dma;
-		len += seg->mr_len;
-		++seg;
-		++i;
-		/* Check for holes */
-		if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
-		    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
-			break;
-	}
-	rc = ib_map_phys_fmr(seg1->rl_mw->r.fmr, physaddrs, i, seg1->mr_dma);
-	if (rc) {
-		dprintk("RPC:       %s: failed ib_map_phys_fmr "
-			"%u@0x%llx+%i (%d)... status %i\n", __func__,
-			len, (unsigned long long)seg1->mr_dma,
-			pageoff, i, rc);
-		while (i--)
-			rpcrdma_unmap_one(ia, --seg);
-	} else {
-		seg1->mr_rkey = seg1->rl_mw->r.fmr->rkey;
-		seg1->mr_base = seg1->mr_dma + pageoff;
-		seg1->mr_nsegs = i;
-		seg1->mr_len = len;
-	}
-	*nsegs = i;
-	return rc;
-}
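
The "Check for holes" test above is what limits how many segments can share one FMR: every interior boundary must fall exactly on a page edge, since ib_map_phys_fmr() maps a run of page-sized blocks. A standalone sketch of that rule (all names and PAGE_SIZE_ are illustrative, not kernel identifiers):

#include <stdbool.h>
#include <stddef.h>

#define PAGE_SIZE_ 4096UL	/* illustrative; the kernel uses PAGE_SIZE */

struct seg_sketch {
	unsigned long offset;	/* byte offset of the segment */
	size_t len;
};

/* Two adjacent segments may stay in the same FMR only if the first
 * ends on a page boundary and the second starts on one.
 */
static bool can_coalesce(const struct seg_sketch *prev,
			 const struct seg_sketch *next)
{
	return (next->offset % PAGE_SIZE_) == 0 &&
	       ((prev->offset + prev->len) % PAGE_SIZE_) == 0;
}
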
-
-static int
-rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
-			struct rpcrdma_ia *ia)
-{
-	struct rpcrdma_mr_seg *seg1 = seg;
-	LIST_HEAD(l);
-	int rc;
-
-	list_add(&seg1->rl_mw->r.fmr->list, &l);
-	rc = ib_unmap_fmr(&l);
-	read_lock(&ia->ri_qplock);
-	while (seg1->mr_nsegs--)
-		rpcrdma_unmap_one(ia, seg++);
-	read_unlock(&ia->ri_qplock);
-	if (rc)
-		dprintk("RPC:       %s: failed ib_unmap_fmr,"
-			" status %i\n", __func__, rc);
-	return rc;
-}
2071
2072int
2073rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
2074 int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
2075{
2076 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
2077 int rc = 0;
2078
2079 switch (ia->ri_memreg_strategy) {
2080
2081 case RPCRDMA_ALLPHYSICAL:
2082 rpcrdma_map_one(ia, seg, writing);
2083 seg->mr_rkey = ia->ri_bind_mem->rkey;
2084 seg->mr_base = seg->mr_dma;
2085 seg->mr_nsegs = 1;
2086 nsegs = 1;
2087 break;
2088
2089 /* Registration using frmr registration */
2090 case RPCRDMA_FRMR:
2091 rc = rpcrdma_register_frmr_external(seg, &nsegs, writing, ia, r_xprt);
2092 break;
2093
2094 /* Registration using fmr memory registration */
2095 case RPCRDMA_MTHCAFMR:
2096 rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia);
2097 break;
2098
2099 default:
2100 return -EIO;
2101 }
2102 if (rc)
2103 return rc;
2104
2105 return nsegs;
2106}
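
This switch is the dispatch point that the rest of the series replaces with an ops vector: the xprt_rdma.h changes further down add struct rpcrdma_memreg_ops plus three per-mode implementations. A hedged sketch of what the call site becomes once ri_ops is populated (the wrapper name is hypothetical, not the kernel's):

static int
rpcrdma_register_sketch(struct rpcrdma_xprt *r_xprt,
			struct rpcrdma_mr_seg *seg, int nsegs, bool writing)
{
	/* one indirect call instead of a per-strategy switch */
	return r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, writing);
}
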
-
-int
-rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
-		struct rpcrdma_xprt *r_xprt)
-{
-	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
-	int nsegs = seg->mr_nsegs, rc;
-
-	switch (ia->ri_memreg_strategy) {
-
-	case RPCRDMA_ALLPHYSICAL:
-		read_lock(&ia->ri_qplock);
-		rpcrdma_unmap_one(ia, seg);
-		read_unlock(&ia->ri_qplock);
-		break;
-
-	case RPCRDMA_FRMR:
-		rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt);
-		break;
-
-	case RPCRDMA_MTHCAFMR:
-		rc = rpcrdma_deregister_fmr_external(seg, ia);
-		break;
-
-	default:
-		break;
-	}
-	return nsegs;
-}
-
 /*
  * Prepost any receive buffer, then post send.
  *
  * Receive buffer is donated to hardware, reclaimed upon recv completion.
@@ -2156,7 +1590,7 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
 	}
 
 	send_wr.next = NULL;
-	send_wr.wr_id = 0ULL;	/* no send cookie */
+	send_wr.wr_id = RPCRDMA_IGNORE_COMPLETION;
 	send_wr.sg_list = req->rl_send_iov;
 	send_wr.num_sge = req->rl_niovs;
 	send_wr.opcode = IB_WR_SEND;
@@ -2215,43 +1649,24 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
 	return rc;
 }
 
-/* Physical mapping means one Read/Write list entry per-page.
- * All list entries must fit within an inline buffer
- *
- * NB: The server must return a Write list for NFS READ,
- * which has the same constraint. Factor in the inline
- * rsize as well.
+/* How many chunk list items fit within our inline buffers?
  */
-static size_t
-rpcrdma_physical_max_payload(struct rpcrdma_xprt *r_xprt)
+unsigned int
+rpcrdma_max_segments(struct rpcrdma_xprt *r_xprt)
 {
 	struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
-	unsigned int inline_size, pages;
+	int bytes, segments;
 
-	inline_size = min_t(unsigned int,
-			    cdata->inline_wsize, cdata->inline_rsize);
-	inline_size -= RPCRDMA_HDRLEN_MIN;
-	pages = inline_size / sizeof(struct rpcrdma_segment);
-	return pages << PAGE_SHIFT;
-}
-
-static size_t
-rpcrdma_mr_max_payload(struct rpcrdma_xprt *r_xprt)
-{
-	return RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT;
-}
-
-size_t
-rpcrdma_max_payload(struct rpcrdma_xprt *r_xprt)
-{
-	size_t result;
-
-	switch (r_xprt->rx_ia.ri_memreg_strategy) {
-	case RPCRDMA_ALLPHYSICAL:
-		result = rpcrdma_physical_max_payload(r_xprt);
-		break;
-	default:
-		result = rpcrdma_mr_max_payload(r_xprt);
+	bytes = min_t(unsigned int, cdata->inline_wsize, cdata->inline_rsize);
+	bytes -= RPCRDMA_HDRLEN_MIN;
+	if (bytes < sizeof(struct rpcrdma_segment) * 2) {
+		pr_warn("RPC:       %s: inline threshold too small\n",
+			__func__);
+		return 0;
 	}
-	return result;
+
+	segments = 1 << (fls(bytes / sizeof(struct rpcrdma_segment)) - 1);
+	dprintk("RPC:       %s: max chunk list size = %d segments\n",
+		__func__, segments);
+	return segments;
 }
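
rpcrdma_max_segments() above turns the inline threshold into a segment count, rounding down to a power of two. A worked example in plain C, assuming an illustrative 1024-byte inline threshold, a 28-byte minimum header, and 16-byte XDR segments (check RPCRDMA_HDRLEN_MIN and struct rpcrdma_segment for the real values):

#include <stdio.h>

static int fls_sketch(unsigned int x)	/* stand-in for the kernel's fls() */
{
	int r = 0;

	while (x) {
		x >>= 1;
		r++;
	}
	return r;
}

int main(void)
{
	unsigned int bytes = 1024 - 28;	/* 996 bytes left for segments */
	unsigned int segsz = 16;	/* assumed segment size on the wire */

	/* 996 / 16 = 62; rounded down to a power of two -> 32 */
	printf("max segments: %u\n", 1u << (fls_sketch(bytes / segsz) - 1));
	return 0;
}
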
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 0a16fb6f0885..78e0b8beaa36 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -60,6 +60,7 @@
  * Interface Adapter -- one per transport instance
  */
 struct rpcrdma_ia {
+	const struct rpcrdma_memreg_ops	*ri_ops;
 	rwlock_t		ri_qplock;
 	struct rdma_cm_id 	*ri_id;
 	struct ib_pd		*ri_pd;
@@ -105,6 +106,10 @@ struct rpcrdma_ep {
 #define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit)
 #define DECR_CQCOUNT(ep) atomic_sub_return(1, &(ep)->rep_cqcount)
 
+/* Force completion handler to ignore the signal
+ */
+#define RPCRDMA_IGNORE_COMPLETION	(0ULL)
+
 /* Registered buffer -- registered kmalloc'd memory for RDMA SEND/RECV
  *
  * The below structure appears at the front of a large region of kmalloc'd
@@ -143,14 +148,6 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb)
 	return (struct rpcrdma_msg *)rb->rg_base;
 }
 
-enum rpcrdma_chunktype {
-	rpcrdma_noch = 0,
-	rpcrdma_readch,
-	rpcrdma_areadch,
-	rpcrdma_writech,
-	rpcrdma_replych
-};
-
 /*
  * struct rpcrdma_rep -- this structure encapsulates state required to recv
  * and complete a reply, asychronously. It needs several pieces of
@@ -213,6 +210,7 @@ struct rpcrdma_mw {
 		struct ib_fmr		*fmr;
 		struct rpcrdma_frmr	frmr;
 	} r;
+	void			(*mw_sendcompletion)(struct ib_wc *);
 	struct list_head	mw_list;
 	struct list_head	mw_all;
 };
@@ -258,7 +256,6 @@ struct rpcrdma_req {
 	unsigned int	rl_niovs;	/* 0, 2 or 4 */
 	unsigned int	rl_nchunks;	/* non-zero if chunks */
 	unsigned int	rl_connect_cookie;	/* retry detection */
-	enum rpcrdma_chunktype	rl_rtype, rl_wtype;
 	struct rpcrdma_buffer *rl_buffer; /* home base for this structure */
 	struct rpcrdma_rep	*rl_reply;/* holder for reply buffer */
 	struct ib_sge	rl_send_iov[4];	/* for active requests */
@@ -340,6 +337,29 @@ struct rpcrdma_stats {
 };
 
 /*
+ * Per-registration mode operations
+ */
+struct rpcrdma_xprt;
+struct rpcrdma_memreg_ops {
+	int		(*ro_map)(struct rpcrdma_xprt *,
+				  struct rpcrdma_mr_seg *, int, bool);
+	int		(*ro_unmap)(struct rpcrdma_xprt *,
+				    struct rpcrdma_mr_seg *);
+	int		(*ro_open)(struct rpcrdma_ia *,
+				   struct rpcrdma_ep *,
+				   struct rpcrdma_create_data_internal *);
+	size_t		(*ro_maxpages)(struct rpcrdma_xprt *);
+	int		(*ro_init)(struct rpcrdma_xprt *);
+	void		(*ro_reset)(struct rpcrdma_xprt *);
+	void		(*ro_destroy)(struct rpcrdma_buffer *);
+	const char	*ro_displayname;
+};
+
+extern const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops;
+extern const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops;
+extern const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops;
+
+/*
  * RPCRDMA transport -- encapsulates the structures above for
  * integration with RPC.
  *
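
Three implementations of this ops vector are declared just above; a transport binds exactly one at setup time. A sketch (not the verbatim kernel code) of how the selection might look, keyed off the old ri_memreg_strategy values:

static void
rpcrdma_pick_memreg_sketch(struct rpcrdma_ia *ia, int strategy)
{
	switch (strategy) {
	case RPCRDMA_FRMR:		/* FRWR: FAST_REG work requests */
		ia->ri_ops = &rpcrdma_frwr_memreg_ops;
		break;
	case RPCRDMA_MTHCAFMR:		/* FMR: ib_map_phys_fmr() */
		ia->ri_ops = &rpcrdma_fmr_memreg_ops;
		break;
	default:			/* premapped physical addresses */
		ia->ri_ops = &rpcrdma_physical_memreg_ops;
		break;
	}
}
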
@@ -398,16 +418,56 @@ void rpcrdma_buffer_put(struct rpcrdma_req *);
 void rpcrdma_recv_buffer_get(struct rpcrdma_req *);
 void rpcrdma_recv_buffer_put(struct rpcrdma_rep *);
 
-int rpcrdma_register_external(struct rpcrdma_mr_seg *,
-				int, int, struct rpcrdma_xprt *);
-int rpcrdma_deregister_external(struct rpcrdma_mr_seg *,
-				struct rpcrdma_xprt *);
-
 struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(struct rpcrdma_ia *,
 					    size_t, gfp_t);
 void rpcrdma_free_regbuf(struct rpcrdma_ia *,
 			 struct rpcrdma_regbuf *);
 
+unsigned int rpcrdma_max_segments(struct rpcrdma_xprt *);
+
+/*
+ * Wrappers for chunk registration, shared by read/write chunk code.
+ */
+
+void rpcrdma_mapping_error(struct rpcrdma_mr_seg *);
+
+static inline enum dma_data_direction
+rpcrdma_data_dir(bool writing)
+{
+	return writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
+}
+
+static inline void
+rpcrdma_map_one(struct ib_device *device, struct rpcrdma_mr_seg *seg,
+		enum dma_data_direction direction)
+{
+	seg->mr_dir = direction;
+	seg->mr_dmalen = seg->mr_len;
+
+	if (seg->mr_page)
+		seg->mr_dma = ib_dma_map_page(device,
+				seg->mr_page, offset_in_page(seg->mr_offset),
+				seg->mr_dmalen, seg->mr_dir);
+	else
+		seg->mr_dma = ib_dma_map_single(device,
+				seg->mr_offset,
+				seg->mr_dmalen, seg->mr_dir);
+
+	if (ib_dma_mapping_error(device, seg->mr_dma))
+		rpcrdma_mapping_error(seg);
+}
+
+static inline void
+rpcrdma_unmap_one(struct ib_device *device, struct rpcrdma_mr_seg *seg)
+{
+	if (seg->mr_page)
+		ib_dma_unmap_page(device,
+				  seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
+	else
+		ib_dma_unmap_single(device,
+				    seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
+}
+
 /*
  * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c
  */
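
rpcrdma_data_dir() above picks the DMA direction from the RPC's point of view: when the peer will RDMA-write into this memory (data headed toward the CPU) the mapping is DMA_FROM_DEVICE, otherwise DMA_TO_DEVICE. An illustrative caller (the function name here is hypothetical):

static void
rpcrdma_map_for_rpc_sketch(struct ib_device *device,
			   struct rpcrdma_mr_seg *seg, bool writing)
{
	/* writing == true: the remote peer writes into this buffer */
	rpcrdma_map_one(device, seg, rpcrdma_data_dir(writing));
}
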
@@ -418,9 +478,7 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *);
 /*
  * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c
  */
-ssize_t rpcrdma_marshal_chunks(struct rpc_rqst *, ssize_t);
 int rpcrdma_marshal_req(struct rpc_rqst *);
-size_t rpcrdma_max_payload(struct rpcrdma_xprt *);
 
 /* Temporary NFS request map cache. Created in svc_rdma.c */
 extern struct kmem_cache *svc_rdma_map_cachep;
diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index 3613e72e858e..70e3dacbf84a 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -591,14 +591,14 @@ void tipc_bearer_stop(struct net *net)
 
 /* Caller should hold rtnl_lock to protect the bearer */
 static int __tipc_nl_add_bearer(struct tipc_nl_msg *msg,
-				struct tipc_bearer *bearer)
+				struct tipc_bearer *bearer, int nlflags)
 {
 	void *hdr;
 	struct nlattr *attrs;
 	struct nlattr *prop;
 
 	hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family,
-			  NLM_F_MULTI, TIPC_NL_BEARER_GET);
+			  nlflags, TIPC_NL_BEARER_GET);
 	if (!hdr)
 		return -EMSGSIZE;
 
@@ -657,7 +657,7 @@ int tipc_nl_bearer_dump(struct sk_buff *skb, struct netlink_callback *cb)
 		if (!bearer)
 			continue;
 
-		err = __tipc_nl_add_bearer(&msg, bearer);
+		err = __tipc_nl_add_bearer(&msg, bearer, NLM_F_MULTI);
 		if (err)
 			break;
 	}
@@ -705,7 +705,7 @@ int tipc_nl_bearer_get(struct sk_buff *skb, struct genl_info *info)
 		goto err_out;
 	}
 
-	err = __tipc_nl_add_bearer(&msg, bearer);
+	err = __tipc_nl_add_bearer(&msg, bearer, 0);
 	if (err)
 		goto err_out;
 	rtnl_unlock();
@@ -857,14 +857,14 @@ int tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info)
 }
 
 static int __tipc_nl_add_media(struct tipc_nl_msg *msg,
-			       struct tipc_media *media)
+			       struct tipc_media *media, int nlflags)
 {
 	void *hdr;
 	struct nlattr *attrs;
 	struct nlattr *prop;
 
 	hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family,
-			  NLM_F_MULTI, TIPC_NL_MEDIA_GET);
+			  nlflags, TIPC_NL_MEDIA_GET);
 	if (!hdr)
 		return -EMSGSIZE;
 
@@ -916,7 +916,8 @@ int tipc_nl_media_dump(struct sk_buff *skb, struct netlink_callback *cb)
 
 	rtnl_lock();
 	for (; media_info_array[i] != NULL; i++) {
-		err = __tipc_nl_add_media(&msg, media_info_array[i]);
+		err = __tipc_nl_add_media(&msg, media_info_array[i],
+					  NLM_F_MULTI);
 		if (err)
 			break;
 	}
@@ -963,7 +964,7 @@ int tipc_nl_media_get(struct sk_buff *skb, struct genl_info *info)
 		goto err_out;
 	}
 
-	err = __tipc_nl_add_media(&msg, media);
+	err = __tipc_nl_add_media(&msg, media, 0);
 	if (err)
 		goto err_out;
 	rtnl_unlock();
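
The nlflags parameter threaded through both helpers encodes a general netlink convention: replies that are part of a dump are multipart messages and must carry NLM_F_MULTI (the dump is later terminated by NLMSG_DONE), while a single-object GET reply carries no flags. The two call patterns, compressed (fill_one() is a hypothetical stand-in for __tipc_nl_add_bearer()):

/* dump path: one NLM_F_MULTI message per object */
err = fill_one(&msg, obj, NLM_F_MULTI);

/* single-object get: a lone, unflagged reply */
err = fill_one(&msg, obj, 0);
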
diff --git a/net/tipc/link.c b/net/tipc/link.c
index a6b30df6ec02..43a515dc97b0 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -1145,11 +1145,8 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b_ptr)
 		}
 		/* Synchronize with parallel link if applicable */
 		if (unlikely((l_ptr->flags & LINK_SYNCHING) && !msg_dup(msg))) {
-			link_handle_out_of_seq_msg(l_ptr, skb);
-			if (link_synch(l_ptr))
-				link_retrieve_defq(l_ptr, &head);
-			skb = NULL;
-			goto unlock;
+			if (!link_synch(l_ptr))
+				goto unlock;
 		}
 		l_ptr->next_in_no++;
 		if (unlikely(!skb_queue_empty(&l_ptr->deferdq)))
@@ -2013,7 +2010,7 @@ msg_full:
 
 /* Caller should hold appropriate locks to protect the link */
 static int __tipc_nl_add_link(struct net *net, struct tipc_nl_msg *msg,
-			      struct tipc_link *link)
+			      struct tipc_link *link, int nlflags)
 {
 	int err;
 	void *hdr;
@@ -2022,7 +2019,7 @@ static int __tipc_nl_add_link(struct net *net, struct tipc_nl_msg *msg,
 	struct tipc_net *tn = net_generic(net, tipc_net_id);
 
 	hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family,
-			  NLM_F_MULTI, TIPC_NL_LINK_GET);
+			  nlflags, TIPC_NL_LINK_GET);
 	if (!hdr)
 		return -EMSGSIZE;
 
@@ -2095,7 +2092,7 @@ static int __tipc_nl_add_node_links(struct net *net, struct tipc_nl_msg *msg,
 		if (!node->links[i])
 			continue;
 
-		err = __tipc_nl_add_link(net, msg, node->links[i]);
+		err = __tipc_nl_add_link(net, msg, node->links[i], NLM_F_MULTI);
 		if (err)
 			return err;
 	}
@@ -2143,7 +2140,6 @@ int tipc_nl_link_dump(struct sk_buff *skb, struct netlink_callback *cb)
 			err = __tipc_nl_add_node_links(net, &msg, node,
 						       &prev_link);
 			tipc_node_unlock(node);
-			tipc_node_put(node);
 			if (err)
 				goto out;
 
@@ -2210,7 +2206,7 @@ int tipc_nl_link_get(struct sk_buff *skb, struct genl_info *info)
 		goto err_out;
 	}
 
-	err = __tipc_nl_add_link(net, &msg, link);
+	err = __tipc_nl_add_link(net, &msg, link, 0);
 	if (err)
 		goto err_out;
 
diff --git a/net/tipc/server.c b/net/tipc/server.c
index ab6183cdb121..77ff03ed1e18 100644
--- a/net/tipc/server.c
+++ b/net/tipc/server.c
@@ -102,7 +102,7 @@ static void tipc_conn_kref_release(struct kref *kref)
 		}
 		saddr->scope = -TIPC_NODE_SCOPE;
 		kernel_bind(sock, (struct sockaddr *)saddr, sizeof(*saddr));
-		sk_release_kernel(sk);
+		sock_release(sock);
 		con->sock = NULL;
 	}
 
@@ -321,12 +321,9 @@ static struct socket *tipc_create_listen_sock(struct tipc_conn *con)
 	struct socket *sock = NULL;
 	int ret;
 
-	ret = sock_create_kern(AF_TIPC, SOCK_SEQPACKET, 0, &sock);
+	ret = __sock_create(s->net, AF_TIPC, SOCK_SEQPACKET, 0, &sock, 1);
 	if (ret < 0)
 		return NULL;
-
-	sk_change_net(sock->sk, s->net);
-
 	ret = kernel_setsockopt(sock, SOL_TIPC, TIPC_IMPORTANCE,
 				(char *)&s->imp, sizeof(s->imp));
 	if (ret < 0)
@@ -376,7 +373,7 @@ static struct socket *tipc_create_listen_sock(struct tipc_conn *con)
 
 create_err:
 	kernel_sock_shutdown(sock, SHUT_RDWR);
-	sk_release_kernel(sock->sk);
+	sock_release(sock);
 	return NULL;
 }
 
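
Passing 1 as the final kern argument of __sock_create() creates the socket directly in the target namespace as a kernel socket, which removes the old create-then-sk_change_net() two-step and lets the error paths use plain sock_release(). A minimal sketch, not TIPC's code:

static struct socket *listen_sock_in_net_sketch(struct net *net)
{
	struct socket *sock;

	/* last argument: kern = 1, a kernel-internal socket */
	if (__sock_create(net, AF_TIPC, SOCK_SEQPACKET, 0, &sock, 1) < 0)
		return NULL;
	return sock;
}
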
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index ee90d74d7516..9074b5cede38 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -1764,13 +1764,14 @@ static int tipc_sk_enqueue(struct sk_buff_head *inputq, struct sock *sk,
 int tipc_sk_rcv(struct net *net, struct sk_buff_head *inputq)
 {
 	u32 dnode, dport = 0;
-	int err = -TIPC_ERR_NO_PORT;
+	int err;
 	struct sk_buff *skb;
 	struct tipc_sock *tsk;
 	struct tipc_net *tn;
 	struct sock *sk;
 
 	while (skb_queue_len(inputq)) {
+		err = -TIPC_ERR_NO_PORT;
 		skb = NULL;
 		dport = tipc_skb_peek_port(inputq, dport);
 		tsk = tipc_sk_lookup(net, dport);
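
Moving the initialization of err inside the loop fixes a classic stale-value bug: with one iteration's code left behind in err, a later iteration that never reassigns it would be judged by the previous packet's result. The pattern in miniature (illustrative C, not TIPC code):

#include <errno.h>

static int handle_item(int v)
{
	return v < 0 ? -EINVAL : 0;	/* toy handler */
}

int process_all_sketch(const int *items, int n)
{
	int i, err;

	for (i = 0; i < n; i++) {
		err = -ENOENT;		/* reset per item, as the fix does */
		if (items[i] != 0)	/* skipped items keep the fresh reset value */
			err = handle_item(items[i]);
		if (err)
			return err;	/* this item's code, never a stale one */
	}
	return 0;
}
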
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 433f287ee548..5266ea7b922b 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -305,7 +305,7 @@ static struct sock *unix_find_socket_byinode(struct inode *i)
 		    &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
 		struct dentry *dentry = unix_sk(s)->path.dentry;
 
-		if (dentry && dentry->d_inode == i) {
+		if (dentry && d_backing_inode(dentry) == i) {
 			sock_hold(s);
 			goto found;
 		}
@@ -778,7 +778,7 @@ static struct sock *unix_find_other(struct net *net,
 		err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path);
 		if (err)
 			goto fail;
-		inode = path.dentry->d_inode;
+		inode = d_backing_inode(path.dentry);
 		err = inode_permission(inode, MAY_WRITE);
 		if (err)
 			goto put_fail;
@@ -839,7 +839,7 @@ static int unix_mknod(const char *sun_path, umode_t mode, struct path *res)
 	 */
 	err = security_path_mknod(&path, dentry, mode, 0);
 	if (!err) {
-		err = vfs_mknod(path.dentry->d_inode, dentry, mode, 0);
+		err = vfs_mknod(d_inode(path.dentry), dentry, mode, 0);
 		if (!err) {
 			res->mnt = mntget(path.mnt);
 			res->dentry = dget(dentry);
@@ -905,7 +905,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 			goto out_up;
 		}
 		addr->hash = UNIX_HASH_SIZE;
-		hash = path.dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1);
+		hash = d_backing_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE-1);
 		spin_lock(&unix_table_lock);
 		u->path = path;
 		list = &unix_socket_table[hash];
diff --git a/net/unix/diag.c b/net/unix/diag.c
index ef542fbca9fe..c512f64d5287 100644
--- a/net/unix/diag.c
+++ b/net/unix/diag.c
@@ -25,7 +25,7 @@ static int sk_diag_dump_vfs(struct sock *sk, struct sk_buff *nlskb)
 
 	if (dentry) {
 		struct unix_diag_vfs uv = {
-			.udiag_vfs_ino = dentry->d_inode->i_ino,
+			.udiag_vfs_ino = d_backing_inode(dentry)->i_ino,
 			.udiag_vfs_dev = dentry->d_sb->s_dev,
 		};
 
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index 99f7012b23b9..a73a226f2d33 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -95,39 +95,36 @@ static DECLARE_WAIT_QUEUE_HEAD(unix_gc_wait);
 
 unsigned int unix_tot_inflight;
 
-
 struct sock *unix_get_socket(struct file *filp)
 {
 	struct sock *u_sock = NULL;
 	struct inode *inode = file_inode(filp);
 
-	/*
-	 * Socket ?
-	 */
+	/* Socket ? */
 	if (S_ISSOCK(inode->i_mode) && !(filp->f_mode & FMODE_PATH)) {
 		struct socket *sock = SOCKET_I(inode);
 		struct sock *s = sock->sk;
 
-		/*
-		 * PF_UNIX ?
-		 */
+		/* PF_UNIX ? */
 		if (s && sock->ops && sock->ops->family == PF_UNIX)
 			u_sock = s;
 	}
 	return u_sock;
 }
 
-/*
- * Keep the number of times in flight count for the file
- * descriptor if it is for an AF_UNIX socket.
+/* Keep the number of times in flight count for the file
+ * descriptor if it is for an AF_UNIX socket.
  */
 
 void unix_inflight(struct file *fp)
 {
 	struct sock *s = unix_get_socket(fp);
+
 	if (s) {
 		struct unix_sock *u = unix_sk(s);
+
 		spin_lock(&unix_gc_lock);
+
 		if (atomic_long_inc_return(&u->inflight) == 1) {
 			BUG_ON(!list_empty(&u->link));
 			list_add_tail(&u->link, &gc_inflight_list);
@@ -142,10 +139,13 @@ void unix_inflight(struct file *fp)
 void unix_notinflight(struct file *fp)
 {
 	struct sock *s = unix_get_socket(fp);
+
 	if (s) {
 		struct unix_sock *u = unix_sk(s);
+
 		spin_lock(&unix_gc_lock);
 		BUG_ON(list_empty(&u->link));
+
 		if (atomic_long_dec_and_test(&u->inflight))
 			list_del_init(&u->link);
 		unix_tot_inflight--;
@@ -161,32 +161,27 @@ static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *),
 
 	spin_lock(&x->sk_receive_queue.lock);
 	skb_queue_walk_safe(&x->sk_receive_queue, skb, next) {
-		/*
-		 * Do we have file descriptors ?
-		 */
+		/* Do we have file descriptors ? */
 		if (UNIXCB(skb).fp) {
 			bool hit = false;
-			/*
-			 * Process the descriptors of this socket
-			 */
+			/* Process the descriptors of this socket */
 			int nfd = UNIXCB(skb).fp->count;
 			struct file **fp = UNIXCB(skb).fp->fp;
+
 			while (nfd--) {
-				/*
-				 * Get the socket the fd matches
-				 * if it indeed does so
-				 */
+				/* Get the socket the fd matches if it indeed does so */
 				struct sock *sk = unix_get_socket(*fp++);
+
 				if (sk) {
 					struct unix_sock *u = unix_sk(sk);
 
-					/*
-					 * Ignore non-candidates, they could
+					/* Ignore non-candidates, they could
 					 * have been added to the queues after
 					 * starting the garbage collection
 					 */
 					if (test_bit(UNIX_GC_CANDIDATE, &u->gc_flags)) {
 						hit = true;
+
 						func(u);
 					}
 				}
@@ -203,24 +198,22 @@ static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *),
 static void scan_children(struct sock *x, void (*func)(struct unix_sock *),
 			  struct sk_buff_head *hitlist)
 {
-	if (x->sk_state != TCP_LISTEN)
+	if (x->sk_state != TCP_LISTEN) {
 		scan_inflight(x, func, hitlist);
-	else {
+	} else {
 		struct sk_buff *skb;
 		struct sk_buff *next;
 		struct unix_sock *u;
 		LIST_HEAD(embryos);
 
-		/*
-		 * For a listening socket collect the queued embryos
+		/* For a listening socket collect the queued embryos
 		 * and perform a scan on them as well.
 		 */
 		spin_lock(&x->sk_receive_queue.lock);
 		skb_queue_walk_safe(&x->sk_receive_queue, skb, next) {
 			u = unix_sk(skb->sk);
 
-			/*
-			 * An embryo cannot be in-flight, so it's safe
+			/* An embryo cannot be in-flight, so it's safe
 			 * to use the list link.
 			 */
 			BUG_ON(!list_empty(&u->link));
@@ -249,8 +242,7 @@ static void inc_inflight(struct unix_sock *usk)
 static void inc_inflight_move_tail(struct unix_sock *u)
 {
 	atomic_long_inc(&u->inflight);
-	/*
-	 * If this still might be part of a cycle, move it to the end
+	/* If this still might be part of a cycle, move it to the end
 	 * of the list, so that it's checked even if it was already
 	 * passed over
 	 */
@@ -263,8 +255,7 @@ static bool gc_in_progress;
 
 void wait_for_unix_gc(void)
 {
-	/*
-	 * If number of inflight sockets is insane,
+	/* If number of inflight sockets is insane,
 	 * force a garbage collect right now.
 	 */
 	if (unix_tot_inflight > UNIX_INFLIGHT_TRIGGER_GC && !gc_in_progress)
@@ -288,8 +279,7 @@ void unix_gc(void)
 		goto out;
 
 	gc_in_progress = true;
-	/*
-	 * First, select candidates for garbage collection. Only
+	/* First, select candidates for garbage collection. Only
 	 * in-flight sockets are considered, and from those only ones
 	 * which don't have any external reference.
 	 *
@@ -320,15 +310,13 @@ void unix_gc(void)
 		}
 	}
 
-	/*
-	 * Now remove all internal in-flight reference to children of
+	/* Now remove all internal in-flight reference to children of
 	 * the candidates.
 	 */
 	list_for_each_entry(u, &gc_candidates, link)
 		scan_children(&u->sk, dec_inflight, NULL);
 
-	/*
-	 * Restore the references for children of all candidates,
+	/* Restore the references for children of all candidates,
 	 * which have remaining references. Do this recursively, so
 	 * only those remain, which form cyclic references.
 	 *
@@ -350,8 +338,7 @@ void unix_gc(void)
 	}
 	list_del(&cursor);
 
-	/*
-	 * not_cycle_list contains those sockets which do not make up a
+	/* not_cycle_list contains those sockets which do not make up a
 	 * cycle. Restore these to the inflight list.
 	 */
 	while (!list_empty(&not_cycle_list)) {
@@ -360,8 +347,7 @@ void unix_gc(void)
 		list_move_tail(&u->link, &gc_inflight_list);
 	}
 
-	/*
-	 * Now gc_candidates contains only garbage. Restore original
+	/* Now gc_candidates contains only garbage. Restore original
 	 * inflight counters for these as well, and remove the skbuffs
 	 * which are creating the cycle(s).
 	 */
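
The comments reflowed above compactly describe the collector's core invariant. A toy model (not kernel code) of the candidate test that drives it: a socket can only be garbage while every reference to it is itself riding in an in-flight SCM_RIGHTS message:

#include <stdbool.h>

struct toy_unix_sock {
	long refs;	/* all references to the socket */
	long inflight;	/* references queued inside unread SCM_RIGHTS skbs */
};

/* Externally unreachable: nothing but in-flight messages refer to it.
 * Such sockets can still keep each other alive in a cycle, which is
 * what the scan_children()/dec_inflight passes above untangle.
 */
static bool gc_candidate_sketch(const struct toy_unix_sock *s)
{
	return s->inflight > 0 && s->inflight == s->refs;
}
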