aboutsummaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
Diffstat (limited to 'net')
-rw-r--r--net/8021q/vlan_core.c14
-rw-r--r--net/9p/client.c7
-rw-r--r--net/9p/trans_rdma.c29
-rw-r--r--net/9p/trans_virtio.c3
-rw-r--r--net/Kconfig3
-rw-r--r--net/Makefile1
-rw-r--r--net/atm/br2684.c12
-rw-r--r--net/atm/mpc.c2
-rw-r--r--net/atm/proc.c1
-rw-r--r--net/bluetooth/l2cap.c62
-rw-r--r--net/bluetooth/rfcomm/sock.c4
-rw-r--r--net/bluetooth/rfcomm/tty.c4
-rw-r--r--net/bridge/br_netfilter.c6
-rw-r--r--net/caif/caif_socket.c21
-rw-r--r--net/ceph/Kconfig28
-rw-r--r--net/ceph/Makefile37
-rw-r--r--net/ceph/armor.c103
-rw-r--r--net/ceph/auth.c259
-rw-r--r--net/ceph/auth_none.c132
-rw-r--r--net/ceph/auth_none.h29
-rw-r--r--net/ceph/auth_x.c688
-rw-r--r--net/ceph/auth_x.h50
-rw-r--r--net/ceph/auth_x_protocol.h90
-rw-r--r--net/ceph/buffer.c68
-rw-r--r--net/ceph/ceph_common.c529
-rw-r--r--net/ceph/ceph_fs.c75
-rw-r--r--net/ceph/ceph_hash.c118
-rw-r--r--net/ceph/ceph_strings.c84
-rw-r--r--net/ceph/crush/crush.c151
-rw-r--r--net/ceph/crush/hash.c149
-rw-r--r--net/ceph/crush/mapper.c609
-rw-r--r--net/ceph/crypto.c412
-rw-r--r--net/ceph/crypto.h48
-rw-r--r--net/ceph/debugfs.c267
-rw-r--r--net/ceph/messenger.c2453
-rw-r--r--net/ceph/mon_client.c1027
-rw-r--r--net/ceph/msgpool.c64
-rw-r--r--net/ceph/osd_client.c1773
-rw-r--r--net/ceph/osdmap.c1128
-rw-r--r--net/ceph/pagelist.c154
-rw-r--r--net/ceph/pagevec.c223
-rw-r--r--net/core/datagram.c1
-rw-r--r--net/core/dev.c26
-rw-r--r--net/core/ethtool.c8
-rw-r--r--net/core/gen_estimator.c12
-rw-r--r--net/core/iovec.c5
-rw-r--r--net/core/net-traces.c1
-rw-r--r--net/core/skbuff.c7
-rw-r--r--net/core/sock.c13
-rw-r--r--net/core/stream.c8
-rw-r--r--net/dccp/probe.c1
-rw-r--r--net/ipv4/Kconfig5
-rw-r--r--net/ipv4/datagram.c5
-rw-r--r--net/ipv4/fib_frontend.c15
-rw-r--r--net/ipv4/fib_trie.c8
-rw-r--r--net/ipv4/igmp.c12
-rw-r--r--net/ipv4/ip_gre.c8
-rw-r--r--net/ipv4/ip_output.c19
-rw-r--r--net/ipv4/ip_sockglue.c3
-rw-r--r--net/ipv4/netfilter/ipt_REJECT.c1
-rw-r--r--net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c28
-rw-r--r--net/ipv4/netfilter/nf_defrag_ipv4.c4
-rw-r--r--net/ipv4/netfilter/nf_nat_core.c2
-rw-r--r--net/ipv4/netfilter/nf_nat_snmp_basic.c6
-rw-r--r--net/ipv4/route.c9
-rw-r--r--net/ipv4/tcp.c9
-rw-r--r--net/ipv4/tcp_input.c5
-rw-r--r--net/ipv4/tcp_probe.c1
-rw-r--r--net/ipv4/tcp_timer.c24
-rw-r--r--net/ipv4/udp.c44
-rw-r--r--net/ipv4/xfrm4_policy.c2
-rw-r--r--net/ipv4/xfrm4_state.c33
-rw-r--r--net/ipv6/addrconf.c11
-rw-r--r--net/ipv6/addrlabel.c5
-rw-r--r--net/ipv6/datagram.c7
-rw-r--r--net/ipv6/ip6_output.c18
-rw-r--r--net/ipv6/netfilter/nf_conntrack_reasm.c80
-rw-r--r--net/ipv6/reassembly.c71
-rw-r--r--net/ipv6/route.c30
-rw-r--r--net/ipv6/udp.c10
-rw-r--r--net/ipv6/xfrm6_state.c33
-rw-r--r--net/ipx/Kconfig1
-rw-r--r--net/irda/af_irda.c4
-rw-r--r--net/irda/irlan/irlan_common.c2
-rw-r--r--net/llc/af_llc.c3
-rw-r--r--net/llc/llc_station.c2
-rw-r--r--net/mac80211/agg-tx.c2
-rw-r--r--net/mac80211/debugfs.c21
-rw-r--r--net/mac80211/debugfs_key.c2
-rw-r--r--net/mac80211/debugfs_netdev.c1
-rw-r--r--net/mac80211/debugfs_sta.c2
-rw-r--r--net/mac80211/main.c6
-rw-r--r--net/mac80211/rate.c1
-rw-r--r--net/mac80211/rc80211_minstrel_debugfs.c1
-rw-r--r--net/mac80211/rc80211_minstrel_ht_debugfs.c3
-rw-r--r--net/mac80211/rc80211_pid_debugfs.c1
-rw-r--r--net/mac80211/rx.c4
-rw-r--r--net/mac80211/status.c4
-rw-r--r--net/netfilter/core.c2
-rw-r--r--net/netfilter/ipvs/ip_vs_core.c1
-rw-r--r--net/netfilter/ipvs/ip_vs_ftp.c9
-rw-r--r--net/netfilter/ipvs/ip_vs_xmit.c18
-rw-r--r--net/netfilter/nf_conntrack_ecache.c4
-rw-r--r--net/netfilter/nf_conntrack_extend.c6
-rw-r--r--net/netfilter/nf_conntrack_netlink.c44
-rw-r--r--net/netfilter/nf_conntrack_proto.c4
-rw-r--r--net/netfilter/nf_conntrack_sip.c2
-rw-r--r--net/netfilter/nf_conntrack_standalone.c28
-rw-r--r--net/netfilter/nf_log.c2
-rw-r--r--net/netfilter/nf_queue.c2
-rw-r--r--net/netfilter/nf_tproxy_core.c6
-rw-r--r--net/netfilter/xt_CT.c1
-rw-r--r--net/netfilter/xt_SECMARK.c35
-rw-r--r--net/netfilter/xt_recent.c1
-rw-r--r--net/netlink/af_netlink.c22
-rw-r--r--net/nonet.c1
-rw-r--r--net/phonet/pep.c3
-rw-r--r--net/rds/page.c27
-rw-r--r--net/rds/tcp_connect.c4
-rw-r--r--net/rds/tcp_listen.c4
-rw-r--r--net/rds/tcp_recv.c4
-rw-r--r--net/rds/tcp_send.c4
-rw-r--r--net/rfkill/core.c1
-rw-r--r--net/rose/af_rose.c4
-rw-r--r--net/sched/act_police.c21
-rw-r--r--net/sched/cls_cgroup.c2
-rw-r--r--net/sched/cls_u32.c2
-rw-r--r--net/sched/sch_atm.c4
-rw-r--r--net/sched/sch_hfsc.c2
-rw-r--r--net/sctp/auth.c8
-rw-r--r--net/sctp/output.c1
-rw-r--r--net/sctp/probe.c1
-rw-r--r--net/sctp/sm_statefuns.c46
-rw-r--r--net/sctp/socket.c13
-rw-r--r--net/socket.c1
-rw-r--r--net/sunrpc/auth.c2
-rw-r--r--net/sunrpc/auth_gss/auth_gss.c9
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_mech.c10
-rw-r--r--net/sunrpc/auth_gss/gss_spkm3_mech.c5
-rw-r--r--net/sunrpc/cache.c17
-rw-r--r--net/sunrpc/clnt.c116
-rw-r--r--net/sunrpc/rpc_pipe.c46
-rw-r--r--net/sunrpc/xprtsock.c28
-rw-r--r--net/unix/af_unix.c15
-rw-r--r--net/wireless/core.c21
-rw-r--r--net/wireless/debugfs.c2
-rw-r--r--net/wireless/wext-compat.c3
-rw-r--r--net/wireless/wext-core.c16
-rw-r--r--net/wireless/wext-priv.c2
-rw-r--r--net/x25/Kconfig1
-rw-r--r--net/xfrm/xfrm_output.c2
-rw-r--r--net/xfrm/xfrm_policy.c5
-rw-r--r--net/xfrm/xfrm_state.c45
-rw-r--r--net/xfrm/xfrm_user.c2
154 files changed, 11640 insertions, 614 deletions
diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c
index 01ddb0472f86..0eb96f7e44be 100644
--- a/net/8021q/vlan_core.c
+++ b/net/8021q/vlan_core.c
@@ -24,8 +24,11 @@ int __vlan_hwaccel_rx(struct sk_buff *skb, struct vlan_group *grp,
24 24
25 if (vlan_dev) 25 if (vlan_dev)
26 skb->dev = vlan_dev; 26 skb->dev = vlan_dev;
27 else if (vlan_id) 27 else if (vlan_id) {
28 goto drop; 28 if (!(skb->dev->flags & IFF_PROMISC))
29 goto drop;
30 skb->pkt_type = PACKET_OTHERHOST;
31 }
29 32
30 return (polling ? netif_receive_skb(skb) : netif_rx(skb)); 33 return (polling ? netif_receive_skb(skb) : netif_rx(skb));
31 34
@@ -102,8 +105,11 @@ vlan_gro_common(struct napi_struct *napi, struct vlan_group *grp,
102 105
103 if (vlan_dev) 106 if (vlan_dev)
104 skb->dev = vlan_dev; 107 skb->dev = vlan_dev;
105 else if (vlan_id) 108 else if (vlan_id) {
106 goto drop; 109 if (!(skb->dev->flags & IFF_PROMISC))
110 goto drop;
111 skb->pkt_type = PACKET_OTHERHOST;
112 }
107 113
108 for (p = napi->gro_list; p; p = p->next) { 114 for (p = napi->gro_list; p; p = p->next) {
109 NAPI_GRO_CB(p)->same_flow = 115 NAPI_GRO_CB(p)->same_flow =
diff --git a/net/9p/client.c b/net/9p/client.c
index dc6f2f26d023..9eb72505308f 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -331,8 +331,10 @@ static void p9_tag_cleanup(struct p9_client *c)
331 } 331 }
332 } 332 }
333 333
334 if (c->tagpool) 334 if (c->tagpool) {
335 p9_idpool_put(0, c->tagpool); /* free reserved tag 0 */
335 p9_idpool_destroy(c->tagpool); 336 p9_idpool_destroy(c->tagpool);
337 }
336 338
337 /* free requests associated with tags */ 339 /* free requests associated with tags */
338 for (row = 0; row < (c->max_tag/P9_ROW_MAXTAG); row++) { 340 for (row = 0; row < (c->max_tag/P9_ROW_MAXTAG); row++) {
@@ -944,6 +946,7 @@ struct p9_fid *p9_client_walk(struct p9_fid *oldfid, int nwname, char **wnames,
944 int16_t nwqids, count; 946 int16_t nwqids, count;
945 947
946 err = 0; 948 err = 0;
949 wqids = NULL;
947 clnt = oldfid->clnt; 950 clnt = oldfid->clnt;
948 if (clone) { 951 if (clone) {
949 fid = p9_fid_create(clnt); 952 fid = p9_fid_create(clnt);
@@ -994,9 +997,11 @@ struct p9_fid *p9_client_walk(struct p9_fid *oldfid, int nwname, char **wnames,
994 else 997 else
995 fid->qid = oldfid->qid; 998 fid->qid = oldfid->qid;
996 999
1000 kfree(wqids);
997 return fid; 1001 return fid;
998 1002
999clunk_fid: 1003clunk_fid:
1004 kfree(wqids);
1000 p9_client_clunk(fid); 1005 p9_client_clunk(fid);
1001 fid = NULL; 1006 fid = NULL;
1002 1007
diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c
index 0ea20c30466c..17c5ba7551a5 100644
--- a/net/9p/trans_rdma.c
+++ b/net/9p/trans_rdma.c
@@ -426,8 +426,10 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req)
426 426
427 /* Allocate an fcall for the reply */ 427 /* Allocate an fcall for the reply */
428 rpl_context = kmalloc(sizeof *rpl_context, GFP_KERNEL); 428 rpl_context = kmalloc(sizeof *rpl_context, GFP_KERNEL);
429 if (!rpl_context) 429 if (!rpl_context) {
430 err = -ENOMEM;
430 goto err_close; 431 goto err_close;
432 }
431 433
432 /* 434 /*
433 * If the request has a buffer, steal it, otherwise 435 * If the request has a buffer, steal it, otherwise
@@ -445,8 +447,8 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req)
445 } 447 }
446 rpl_context->rc = req->rc; 448 rpl_context->rc = req->rc;
447 if (!rpl_context->rc) { 449 if (!rpl_context->rc) {
448 kfree(rpl_context); 450 err = -ENOMEM;
449 goto err_close; 451 goto err_free2;
450 } 452 }
451 453
452 /* 454 /*
@@ -458,11 +460,8 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req)
458 */ 460 */
459 if (atomic_inc_return(&rdma->rq_count) <= rdma->rq_depth) { 461 if (atomic_inc_return(&rdma->rq_count) <= rdma->rq_depth) {
460 err = post_recv(client, rpl_context); 462 err = post_recv(client, rpl_context);
461 if (err) { 463 if (err)
462 kfree(rpl_context->rc); 464 goto err_free1;
463 kfree(rpl_context);
464 goto err_close;
465 }
466 } else 465 } else
467 atomic_dec(&rdma->rq_count); 466 atomic_dec(&rdma->rq_count);
468 467
@@ -471,8 +470,10 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req)
471 470
472 /* Post the request */ 471 /* Post the request */
473 c = kmalloc(sizeof *c, GFP_KERNEL); 472 c = kmalloc(sizeof *c, GFP_KERNEL);
474 if (!c) 473 if (!c) {
475 goto err_close; 474 err = -ENOMEM;
475 goto err_free1;
476 }
476 c->req = req; 477 c->req = req;
477 478
478 c->busa = ib_dma_map_single(rdma->cm_id->device, 479 c->busa = ib_dma_map_single(rdma->cm_id->device,
@@ -499,9 +500,15 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req)
499 return ib_post_send(rdma->qp, &wr, &bad_wr); 500 return ib_post_send(rdma->qp, &wr, &bad_wr);
500 501
501 error: 502 error:
503 kfree(c);
504 kfree(rpl_context->rc);
505 kfree(rpl_context);
502 P9_DPRINTK(P9_DEBUG_ERROR, "EIO\n"); 506 P9_DPRINTK(P9_DEBUG_ERROR, "EIO\n");
503 return -EIO; 507 return -EIO;
504 508 err_free1:
509 kfree(rpl_context->rc);
510 err_free2:
511 kfree(rpl_context);
505 err_close: 512 err_close:
506 spin_lock_irqsave(&rdma->req_lock, flags); 513 spin_lock_irqsave(&rdma->req_lock, flags);
507 if (rdma->state < P9_RDMA_CLOSING) { 514 if (rdma->state < P9_RDMA_CLOSING) {
diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c
index dcfbe99ff81c..b88515936e4b 100644
--- a/net/9p/trans_virtio.c
+++ b/net/9p/trans_virtio.c
@@ -329,7 +329,8 @@ p9_virtio_create(struct p9_client *client, const char *devname, char *args)
329 329
330 mutex_lock(&virtio_9p_lock); 330 mutex_lock(&virtio_9p_lock);
331 list_for_each_entry(chan, &virtio_chan_list, chan_list) { 331 list_for_each_entry(chan, &virtio_chan_list, chan_list) {
332 if (!strncmp(devname, chan->tag, chan->tag_len)) { 332 if (!strncmp(devname, chan->tag, chan->tag_len) &&
333 strlen(devname) == chan->tag_len) {
333 if (!chan->inuse) { 334 if (!chan->inuse) {
334 chan->inuse = true; 335 chan->inuse = true;
335 found = 1; 336 found = 1;
diff --git a/net/Kconfig b/net/Kconfig
index e330594d3709..55fd82e9ffd9 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -217,7 +217,7 @@ source "net/dns_resolver/Kconfig"
217 217
218config RPS 218config RPS
219 boolean 219 boolean
220 depends on SMP && SYSFS 220 depends on SMP && SYSFS && USE_GENERIC_SMP_HELPERS
221 default y 221 default y
222 222
223menu "Network testing" 223menu "Network testing"
@@ -293,6 +293,7 @@ source "net/wimax/Kconfig"
293source "net/rfkill/Kconfig" 293source "net/rfkill/Kconfig"
294source "net/9p/Kconfig" 294source "net/9p/Kconfig"
295source "net/caif/Kconfig" 295source "net/caif/Kconfig"
296source "net/ceph/Kconfig"
296 297
297 298
298endif # if NET 299endif # if NET
diff --git a/net/Makefile b/net/Makefile
index ea60fbce9b1b..6b7bfd7f1416 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -68,3 +68,4 @@ obj-$(CONFIG_SYSCTL) += sysctl_net.o
68endif 68endif
69obj-$(CONFIG_WIMAX) += wimax/ 69obj-$(CONFIG_WIMAX) += wimax/
70obj-$(CONFIG_DNS_RESOLVER) += dns_resolver/ 70obj-$(CONFIG_DNS_RESOLVER) += dns_resolver/
71obj-$(CONFIG_CEPH_LIB) += ceph/
diff --git a/net/atm/br2684.c b/net/atm/br2684.c
index 651babdfab38..ad2b232a2055 100644
--- a/net/atm/br2684.c
+++ b/net/atm/br2684.c
@@ -399,12 +399,6 @@ static void br2684_push(struct atm_vcc *atmvcc, struct sk_buff *skb)
399 unregister_netdev(net_dev); 399 unregister_netdev(net_dev);
400 free_netdev(net_dev); 400 free_netdev(net_dev);
401 } 401 }
402 read_lock_irq(&devs_lock);
403 if (list_empty(&br2684_devs)) {
404 /* last br2684 device */
405 unregister_atmdevice_notifier(&atm_dev_notifier);
406 }
407 read_unlock_irq(&devs_lock);
408 return; 402 return;
409 } 403 }
410 404
@@ -675,7 +669,6 @@ static int br2684_create(void __user *arg)
675 669
676 if (list_empty(&br2684_devs)) { 670 if (list_empty(&br2684_devs)) {
677 /* 1st br2684 device */ 671 /* 1st br2684 device */
678 register_atmdevice_notifier(&atm_dev_notifier);
679 brdev->number = 1; 672 brdev->number = 1;
680 } else 673 } else
681 brdev->number = BRPRIV(list_entry_brdev(br2684_devs.prev))->number + 1; 674 brdev->number = BRPRIV(list_entry_brdev(br2684_devs.prev))->number + 1;
@@ -815,6 +808,7 @@ static int __init br2684_init(void)
815 return -ENOMEM; 808 return -ENOMEM;
816#endif 809#endif
817 register_atm_ioctl(&br2684_ioctl_ops); 810 register_atm_ioctl(&br2684_ioctl_ops);
811 register_atmdevice_notifier(&atm_dev_notifier);
818 return 0; 812 return 0;
819} 813}
820 814
@@ -830,9 +824,7 @@ static void __exit br2684_exit(void)
830#endif 824#endif
831 825
832 826
833 /* if not already empty */ 827 unregister_atmdevice_notifier(&atm_dev_notifier);
834 if (!list_empty(&br2684_devs))
835 unregister_atmdevice_notifier(&atm_dev_notifier);
836 828
837 while (!list_empty(&br2684_devs)) { 829 while (!list_empty(&br2684_devs)) {
838 net_dev = list_entry_brdev(br2684_devs.next); 830 net_dev = list_entry_brdev(br2684_devs.next);
diff --git a/net/atm/mpc.c b/net/atm/mpc.c
index 622b471e14e0..74bcc662c3dd 100644
--- a/net/atm/mpc.c
+++ b/net/atm/mpc.c
@@ -778,7 +778,7 @@ static void mpc_push(struct atm_vcc *vcc, struct sk_buff *skb)
778 eg->packets_rcvd++; 778 eg->packets_rcvd++;
779 mpc->eg_ops->put(eg); 779 mpc->eg_ops->put(eg);
780 780
781 memset(ATM_SKB(skb), 0, sizeof(struct atm_skb_data)); 781 memset(ATM_SKB(new_skb), 0, sizeof(struct atm_skb_data));
782 netif_rx(new_skb); 782 netif_rx(new_skb);
783} 783}
784 784
diff --git a/net/atm/proc.c b/net/atm/proc.c
index 6262aeae398e..f85da0779e5e 100644
--- a/net/atm/proc.c
+++ b/net/atm/proc.c
@@ -38,6 +38,7 @@ static ssize_t proc_dev_atm_read(struct file *file, char __user *buf,
38static const struct file_operations proc_atm_dev_ops = { 38static const struct file_operations proc_atm_dev_ops = {
39 .owner = THIS_MODULE, 39 .owner = THIS_MODULE,
40 .read = proc_dev_atm_read, 40 .read = proc_dev_atm_read,
41 .llseek = noop_llseek,
41}; 42};
42 43
43static void add_stats(struct seq_file *seq, const char *aal, 44static void add_stats(struct seq_file *seq, const char *aal,
diff --git a/net/bluetooth/l2cap.c b/net/bluetooth/l2cap.c
index fadf26b4ed7c..0b54b7dd8401 100644
--- a/net/bluetooth/l2cap.c
+++ b/net/bluetooth/l2cap.c
@@ -1441,33 +1441,23 @@ static inline void l2cap_do_send(struct sock *sk, struct sk_buff *skb)
1441 1441
1442static void l2cap_streaming_send(struct sock *sk) 1442static void l2cap_streaming_send(struct sock *sk)
1443{ 1443{
1444 struct sk_buff *skb, *tx_skb; 1444 struct sk_buff *skb;
1445 struct l2cap_pinfo *pi = l2cap_pi(sk); 1445 struct l2cap_pinfo *pi = l2cap_pi(sk);
1446 u16 control, fcs; 1446 u16 control, fcs;
1447 1447
1448 while ((skb = sk->sk_send_head)) { 1448 while ((skb = skb_dequeue(TX_QUEUE(sk)))) {
1449 tx_skb = skb_clone(skb, GFP_ATOMIC); 1449 control = get_unaligned_le16(skb->data + L2CAP_HDR_SIZE);
1450
1451 control = get_unaligned_le16(tx_skb->data + L2CAP_HDR_SIZE);
1452 control |= pi->next_tx_seq << L2CAP_CTRL_TXSEQ_SHIFT; 1450 control |= pi->next_tx_seq << L2CAP_CTRL_TXSEQ_SHIFT;
1453 put_unaligned_le16(control, tx_skb->data + L2CAP_HDR_SIZE); 1451 put_unaligned_le16(control, skb->data + L2CAP_HDR_SIZE);
1454 1452
1455 if (pi->fcs == L2CAP_FCS_CRC16) { 1453 if (pi->fcs == L2CAP_FCS_CRC16) {
1456 fcs = crc16(0, (u8 *)tx_skb->data, tx_skb->len - 2); 1454 fcs = crc16(0, (u8 *)skb->data, skb->len - 2);
1457 put_unaligned_le16(fcs, tx_skb->data + tx_skb->len - 2); 1455 put_unaligned_le16(fcs, skb->data + skb->len - 2);
1458 } 1456 }
1459 1457
1460 l2cap_do_send(sk, tx_skb); 1458 l2cap_do_send(sk, skb);
1461 1459
1462 pi->next_tx_seq = (pi->next_tx_seq + 1) % 64; 1460 pi->next_tx_seq = (pi->next_tx_seq + 1) % 64;
1463
1464 if (skb_queue_is_last(TX_QUEUE(sk), skb))
1465 sk->sk_send_head = NULL;
1466 else
1467 sk->sk_send_head = skb_queue_next(TX_QUEUE(sk), skb);
1468
1469 skb = skb_dequeue(TX_QUEUE(sk));
1470 kfree_skb(skb);
1471 } 1461 }
1472} 1462}
1473 1463
@@ -1960,6 +1950,11 @@ static int l2cap_sock_setsockopt_old(struct socket *sock, int optname, char __us
1960 1950
1961 switch (optname) { 1951 switch (optname) {
1962 case L2CAP_OPTIONS: 1952 case L2CAP_OPTIONS:
1953 if (sk->sk_state == BT_CONNECTED) {
1954 err = -EINVAL;
1955 break;
1956 }
1957
1963 opts.imtu = l2cap_pi(sk)->imtu; 1958 opts.imtu = l2cap_pi(sk)->imtu;
1964 opts.omtu = l2cap_pi(sk)->omtu; 1959 opts.omtu = l2cap_pi(sk)->omtu;
1965 opts.flush_to = l2cap_pi(sk)->flush_to; 1960 opts.flush_to = l2cap_pi(sk)->flush_to;
@@ -2771,10 +2766,10 @@ static int l2cap_parse_conf_rsp(struct sock *sk, void *rsp, int len, void *data,
2771 case L2CAP_CONF_MTU: 2766 case L2CAP_CONF_MTU:
2772 if (val < L2CAP_DEFAULT_MIN_MTU) { 2767 if (val < L2CAP_DEFAULT_MIN_MTU) {
2773 *result = L2CAP_CONF_UNACCEPT; 2768 *result = L2CAP_CONF_UNACCEPT;
2774 pi->omtu = L2CAP_DEFAULT_MIN_MTU; 2769 pi->imtu = L2CAP_DEFAULT_MIN_MTU;
2775 } else 2770 } else
2776 pi->omtu = val; 2771 pi->imtu = val;
2777 l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, pi->omtu); 2772 l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, pi->imtu);
2778 break; 2773 break;
2779 2774
2780 case L2CAP_CONF_FLUSH_TO: 2775 case L2CAP_CONF_FLUSH_TO:
@@ -3071,6 +3066,17 @@ static inline int l2cap_connect_rsp(struct l2cap_conn *conn, struct l2cap_cmd_hd
3071 return 0; 3066 return 0;
3072} 3067}
3073 3068
3069static inline void set_default_fcs(struct l2cap_pinfo *pi)
3070{
3071 /* FCS is enabled only in ERTM or streaming mode, if one or both
3072 * sides request it.
3073 */
3074 if (pi->mode != L2CAP_MODE_ERTM && pi->mode != L2CAP_MODE_STREAMING)
3075 pi->fcs = L2CAP_FCS_NONE;
3076 else if (!(pi->conf_state & L2CAP_CONF_NO_FCS_RECV))
3077 pi->fcs = L2CAP_FCS_CRC16;
3078}
3079
3074static inline int l2cap_config_req(struct l2cap_conn *conn, struct l2cap_cmd_hdr *cmd, u16 cmd_len, u8 *data) 3080static inline int l2cap_config_req(struct l2cap_conn *conn, struct l2cap_cmd_hdr *cmd, u16 cmd_len, u8 *data)
3075{ 3081{
3076 struct l2cap_conf_req *req = (struct l2cap_conf_req *) data; 3082 struct l2cap_conf_req *req = (struct l2cap_conf_req *) data;
@@ -3088,14 +3094,8 @@ static inline int l2cap_config_req(struct l2cap_conn *conn, struct l2cap_cmd_hdr
3088 if (!sk) 3094 if (!sk)
3089 return -ENOENT; 3095 return -ENOENT;
3090 3096
3091 if (sk->sk_state != BT_CONFIG) { 3097 if (sk->sk_state == BT_DISCONN)
3092 struct l2cap_cmd_rej rej;
3093
3094 rej.reason = cpu_to_le16(0x0002);
3095 l2cap_send_cmd(conn, cmd->ident, L2CAP_COMMAND_REJ,
3096 sizeof(rej), &rej);
3097 goto unlock; 3098 goto unlock;
3098 }
3099 3099
3100 /* Reject if config buffer is too small. */ 3100 /* Reject if config buffer is too small. */
3101 len = cmd_len - sizeof(*req); 3101 len = cmd_len - sizeof(*req);
@@ -3135,9 +3135,7 @@ static inline int l2cap_config_req(struct l2cap_conn *conn, struct l2cap_cmd_hdr
3135 goto unlock; 3135 goto unlock;
3136 3136
3137 if (l2cap_pi(sk)->conf_state & L2CAP_CONF_INPUT_DONE) { 3137 if (l2cap_pi(sk)->conf_state & L2CAP_CONF_INPUT_DONE) {
3138 if (!(l2cap_pi(sk)->conf_state & L2CAP_CONF_NO_FCS_RECV) || 3138 set_default_fcs(l2cap_pi(sk));
3139 l2cap_pi(sk)->fcs != L2CAP_FCS_NONE)
3140 l2cap_pi(sk)->fcs = L2CAP_FCS_CRC16;
3141 3139
3142 sk->sk_state = BT_CONNECTED; 3140 sk->sk_state = BT_CONNECTED;
3143 3141
@@ -3225,9 +3223,7 @@ static inline int l2cap_config_rsp(struct l2cap_conn *conn, struct l2cap_cmd_hdr
3225 l2cap_pi(sk)->conf_state |= L2CAP_CONF_INPUT_DONE; 3223 l2cap_pi(sk)->conf_state |= L2CAP_CONF_INPUT_DONE;
3226 3224
3227 if (l2cap_pi(sk)->conf_state & L2CAP_CONF_OUTPUT_DONE) { 3225 if (l2cap_pi(sk)->conf_state & L2CAP_CONF_OUTPUT_DONE) {
3228 if (!(l2cap_pi(sk)->conf_state & L2CAP_CONF_NO_FCS_RECV) || 3226 set_default_fcs(l2cap_pi(sk));
3229 l2cap_pi(sk)->fcs != L2CAP_FCS_NONE)
3230 l2cap_pi(sk)->fcs = L2CAP_FCS_CRC16;
3231 3227
3232 sk->sk_state = BT_CONNECTED; 3228 sk->sk_state = BT_CONNECTED;
3233 l2cap_pi(sk)->next_tx_seq = 0; 3229 l2cap_pi(sk)->next_tx_seq = 0;
diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c
index 44a623275951..194b3a04cfd3 100644
--- a/net/bluetooth/rfcomm/sock.c
+++ b/net/bluetooth/rfcomm/sock.c
@@ -82,11 +82,14 @@ static void rfcomm_sk_data_ready(struct rfcomm_dlc *d, struct sk_buff *skb)
82static void rfcomm_sk_state_change(struct rfcomm_dlc *d, int err) 82static void rfcomm_sk_state_change(struct rfcomm_dlc *d, int err)
83{ 83{
84 struct sock *sk = d->owner, *parent; 84 struct sock *sk = d->owner, *parent;
85 unsigned long flags;
86
85 if (!sk) 87 if (!sk)
86 return; 88 return;
87 89
88 BT_DBG("dlc %p state %ld err %d", d, d->state, err); 90 BT_DBG("dlc %p state %ld err %d", d, d->state, err);
89 91
92 local_irq_save(flags);
90 bh_lock_sock(sk); 93 bh_lock_sock(sk);
91 94
92 if (err) 95 if (err)
@@ -108,6 +111,7 @@ static void rfcomm_sk_state_change(struct rfcomm_dlc *d, int err)
108 } 111 }
109 112
110 bh_unlock_sock(sk); 113 bh_unlock_sock(sk);
114 local_irq_restore(flags);
111 115
112 if (parent && sock_flag(sk, SOCK_ZAPPED)) { 116 if (parent && sock_flag(sk, SOCK_ZAPPED)) {
113 /* We have to drop DLC lock here, otherwise 117 /* We have to drop DLC lock here, otherwise
diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c
index befc3a52aa04..84c2a4d013c6 100644
--- a/net/bluetooth/rfcomm/tty.c
+++ b/net/bluetooth/rfcomm/tty.c
@@ -844,10 +844,6 @@ static int rfcomm_tty_ioctl(struct tty_struct *tty, struct file *filp, unsigned
844 BT_DBG("TIOCMIWAIT"); 844 BT_DBG("TIOCMIWAIT");
845 break; 845 break;
846 846
847 case TIOCGICOUNT:
848 BT_DBG("TIOCGICOUNT");
849 break;
850
851 case TIOCGSERIAL: 847 case TIOCGSERIAL:
852 BT_ERR("TIOCGSERIAL is not supported"); 848 BT_ERR("TIOCGSERIAL is not supported");
853 return -ENOIOCTLCMD; 849 return -ENOIOCTLCMD;
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index 5ed00bd7009f..137f23259a93 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -761,9 +761,11 @@ static int br_nf_dev_queue_xmit(struct sk_buff *skb)
761{ 761{
762 if (skb->nfct != NULL && skb->protocol == htons(ETH_P_IP) && 762 if (skb->nfct != NULL && skb->protocol == htons(ETH_P_IP) &&
763 skb->len + nf_bridge_mtu_reduction(skb) > skb->dev->mtu && 763 skb->len + nf_bridge_mtu_reduction(skb) > skb->dev->mtu &&
764 !skb_is_gso(skb)) 764 !skb_is_gso(skb)) {
765 /* BUG: Should really parse the IP options here. */
766 memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
765 return ip_fragment(skb, br_dev_queue_push_xmit); 767 return ip_fragment(skb, br_dev_queue_push_xmit);
766 else 768 } else
767 return br_dev_queue_push_xmit(skb); 769 return br_dev_queue_push_xmit(skb);
768} 770}
769#else 771#else
diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c
index 8ce904786116..4bf28f25f368 100644
--- a/net/caif/caif_socket.c
+++ b/net/caif/caif_socket.c
@@ -827,6 +827,7 @@ static int caif_connect(struct socket *sock, struct sockaddr *uaddr,
827 long timeo; 827 long timeo;
828 int err; 828 int err;
829 int ifindex, headroom, tailroom; 829 int ifindex, headroom, tailroom;
830 unsigned int mtu;
830 struct net_device *dev; 831 struct net_device *dev;
831 832
832 lock_sock(sk); 833 lock_sock(sk);
@@ -896,15 +897,23 @@ static int caif_connect(struct socket *sock, struct sockaddr *uaddr,
896 cf_sk->sk.sk_state = CAIF_DISCONNECTED; 897 cf_sk->sk.sk_state = CAIF_DISCONNECTED;
897 goto out; 898 goto out;
898 } 899 }
899 dev = dev_get_by_index(sock_net(sk), ifindex); 900
901 err = -ENODEV;
902 rcu_read_lock();
903 dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
904 if (!dev) {
905 rcu_read_unlock();
906 goto out;
907 }
900 cf_sk->headroom = LL_RESERVED_SPACE_EXTRA(dev, headroom); 908 cf_sk->headroom = LL_RESERVED_SPACE_EXTRA(dev, headroom);
909 mtu = dev->mtu;
910 rcu_read_unlock();
911
901 cf_sk->tailroom = tailroom; 912 cf_sk->tailroom = tailroom;
902 cf_sk->maxframe = dev->mtu - (headroom + tailroom); 913 cf_sk->maxframe = mtu - (headroom + tailroom);
903 dev_put(dev);
904 if (cf_sk->maxframe < 1) { 914 if (cf_sk->maxframe < 1) {
905 pr_warning("CAIF: %s(): CAIF Interface MTU too small (%d)\n", 915 pr_warning("CAIF: %s(): CAIF Interface MTU too small (%u)\n",
906 __func__, dev->mtu); 916 __func__, mtu);
907 err = -ENODEV;
908 goto out; 917 goto out;
909 } 918 }
910 919
diff --git a/net/ceph/Kconfig b/net/ceph/Kconfig
new file mode 100644
index 000000000000..ad424049b0cf
--- /dev/null
+++ b/net/ceph/Kconfig
@@ -0,0 +1,28 @@
1config CEPH_LIB
2 tristate "Ceph core library (EXPERIMENTAL)"
3 depends on INET && EXPERIMENTAL
4 select LIBCRC32C
5 select CRYPTO_AES
6 select CRYPTO
7 default n
8 help
9 Choose Y or M here to include cephlib, which provides the
10 common functionality to both the Ceph filesystem and
11 to the rados block device (rbd).
12
13 More information at http://ceph.newdream.net/.
14
15 If unsure, say N.
16
17config CEPH_LIB_PRETTYDEBUG
18 bool "Include file:line in ceph debug output"
19 depends on CEPH_LIB
20 default n
21 help
22 If you say Y here, debug output will include a filename and
23 line to aid debugging. This increases kernel size and slows
24 execution slightly when debug call sites are enabled (e.g.,
25 via CONFIG_DYNAMIC_DEBUG).
26
27 If unsure, say N.
28
diff --git a/net/ceph/Makefile b/net/ceph/Makefile
new file mode 100644
index 000000000000..aab1cabb8035
--- /dev/null
+++ b/net/ceph/Makefile
@@ -0,0 +1,37 @@
1#
2# Makefile for CEPH filesystem.
3#
4
5ifneq ($(KERNELRELEASE),)
6
7obj-$(CONFIG_CEPH_LIB) += libceph.o
8
9libceph-objs := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \
10 mon_client.o \
11 osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \
12 debugfs.o \
13 auth.o auth_none.o \
14 crypto.o armor.o \
15 auth_x.o \
16 ceph_fs.o ceph_strings.o ceph_hash.o \
17 pagevec.o
18
19else
20#Otherwise we were called directly from the command
21# line; invoke the kernel build system.
22
23KERNELDIR ?= /lib/modules/$(shell uname -r)/build
24PWD := $(shell pwd)
25
26default: all
27
28all:
29 $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_LIB=m modules
30
31modules_install:
32 $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_LIB=m modules_install
33
34clean:
35 $(MAKE) -C $(KERNELDIR) M=$(PWD) clean
36
37endif
diff --git a/net/ceph/armor.c b/net/ceph/armor.c
new file mode 100644
index 000000000000..eb2a666b0be7
--- /dev/null
+++ b/net/ceph/armor.c
@@ -0,0 +1,103 @@
1
2#include <linux/errno.h>
3
int ceph_armor(char *dst, const char *src, const char *end);
int ceph_unarmor(char *dst, const char *src, const char *end);

/*
 * base64 encode/decode.
 */

static const char *pem_key =
	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

/* Map a 6-bit value (0..63) to its character in the base64 alphabet. */
static int encode_bits(int c)
{
	return pem_key[c];
}

/*
 * Map a base64 character back to its 6-bit value.
 *
 * The padding character '=' decodes to 0 so callers can test the
 * result for "negative means invalid" without special-casing padding.
 * Returns -EINVAL for any character outside the alphabet.
 */
static int decode_bits(char c)
{
	if (c >= 'A' && c <= 'Z')
		return c - 'A';
	if (c >= 'a' && c <= 'z')
		return c - 'a' + 26;
	if (c >= '0' && c <= '9')
		return c - '0' + 52;
	if (c == '+')
		return 62;
	if (c == '/')
		return 63;
	if (c == '=')
		return 0; /* just non-negative, please */
	return -EINVAL;
}

/*
 * Base64-encode the bytes in [src, end) into dst.
 *
 * Every group of up to three input bytes produces four output
 * characters ('='-padded for a short final group), and a '\n' is
 * emitted after every 64 output characters.  The caller must size dst
 * accordingly; no NUL terminator is written.
 *
 * Returns the number of bytes written to dst.
 */
int ceph_armor(char *dst, const char *src, const char *end)
{
	int olen = 0;
	int line = 0;

	while (src < end) {
		unsigned char a, b, c;

		a = *src++;
		*dst++ = encode_bits(a >> 2);
		if (src < end) {
			b = *src++;
			*dst++ = encode_bits(((a & 3) << 4) | (b >> 4));
			if (src < end) {
				c = *src++;
				*dst++ = encode_bits(((b & 15) << 2) |
						     (c >> 6));
				*dst++ = encode_bits(c & 63);
			} else {
				*dst++ = encode_bits((b & 15) << 2);
				*dst++ = '=';
			}
		} else {
			*dst++ = encode_bits(((a & 3) << 4));
			*dst++ = '=';
			*dst++ = '=';
		}
		olen += 4;
		line += 4;
		if (line == 64) {
			line = 0;
			*(dst++) = '\n';
			olen++;
		}
	}
	return olen;
}

/*
 * Decode the base64 text in [src, end) into dst.
 *
 * Newlines between four-character groups are skipped, including a
 * trailing newline: ceph_armor() ends its output with '\n' whenever
 * the encoded length is a multiple of 64, and that output must decode
 * cleanly.  Decoding stops at the first group containing '=' padding.
 *
 * Returns the number of bytes written to dst, or -EINVAL if the input
 * contains a character outside the base64 alphabet or ends in the
 * middle of a four-character group.
 */
int ceph_unarmor(char *dst, const char *src, const char *end)
{
	int olen = 0;

	while (src < end) {
		int a, b, c, d;

		if (src[0] == '\n') {
			/* re-check src < end: the newline may be the last byte */
			src++;
			continue;
		}
		if (end - src < 4)
			return -EINVAL;
		a = decode_bits(src[0]);
		b = decode_bits(src[1]);
		c = decode_bits(src[2]);
		d = decode_bits(src[3]);
		if (a < 0 || b < 0 || c < 0 || d < 0)
			return -EINVAL;

		*dst++ = (a << 2) | (b >> 4);
		if (src[2] == '=')
			return olen + 1;
		*dst++ = ((b & 15) << 4) | (c >> 2);
		if (src[3] == '=')
			return olen + 2;
		*dst++ = ((c & 3) << 6) | d;
		olen += 3;
		src += 4;
	}
	return olen;
}
diff --git a/net/ceph/auth.c b/net/ceph/auth.c
new file mode 100644
index 000000000000..549c1f43e1d5
--- /dev/null
+++ b/net/ceph/auth.c
@@ -0,0 +1,259 @@
1#include <linux/ceph/ceph_debug.h>
2
3#include <linux/module.h>
4#include <linux/err.h>
5#include <linux/slab.h>
6
7#include <linux/ceph/types.h>
8#include <linux/ceph/decode.h>
9#include <linux/ceph/libceph.h>
10#include <linux/ceph/messenger.h>
11#include "auth_none.h"
12#include "auth_x.h"
13
14
15/*
16 * get protocol handler
17 */
18static u32 supported_protocols[] = {
19 CEPH_AUTH_NONE,
20 CEPH_AUTH_CEPHX
21};
22
23static int ceph_auth_init_protocol(struct ceph_auth_client *ac, int protocol)
24{
25 switch (protocol) {
26 case CEPH_AUTH_NONE:
27 return ceph_auth_none_init(ac);
28 case CEPH_AUTH_CEPHX:
29 return ceph_x_init(ac);
30 default:
31 return -ENOENT;
32 }
33}
34
35/*
36 * setup, teardown.
37 */
38struct ceph_auth_client *ceph_auth_init(const char *name, const char *secret)
39{
40 struct ceph_auth_client *ac;
41 int ret;
42
43 dout("auth_init name '%s' secret '%s'\n", name, secret);
44
45 ret = -ENOMEM;
46 ac = kzalloc(sizeof(*ac), GFP_NOFS);
47 if (!ac)
48 goto out;
49
50 ac->negotiating = true;
51 if (name)
52 ac->name = name;
53 else
54 ac->name = CEPH_AUTH_NAME_DEFAULT;
55 dout("auth_init name %s secret %s\n", ac->name, secret);
56 ac->secret = secret;
57 return ac;
58
59out:
60 return ERR_PTR(ret);
61}
62
63void ceph_auth_destroy(struct ceph_auth_client *ac)
64{
65 dout("auth_destroy %p\n", ac);
66 if (ac->ops)
67 ac->ops->destroy(ac);
68 kfree(ac);
69}
70
71/*
72 * Reset occurs when reconnecting to the monitor.
73 */
74void ceph_auth_reset(struct ceph_auth_client *ac)
75{
76 dout("auth_reset %p\n", ac);
77 if (ac->ops && !ac->negotiating)
78 ac->ops->reset(ac);
79 ac->negotiating = true;
80}
81
82int ceph_entity_name_encode(const char *name, void **p, void *end)
83{
84 int len = strlen(name);
85
86 if (*p + 2*sizeof(u32) + len > end)
87 return -ERANGE;
88 ceph_encode_32(p, CEPH_ENTITY_TYPE_CLIENT);
89 ceph_encode_32(p, len);
90 ceph_encode_copy(p, name, len);
91 return 0;
92}
93
94/*
95 * Initiate protocol negotiation with monitor. Include entity name
96 * and list supported protocols.
97 */
98int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len)
99{
100 struct ceph_mon_request_header *monhdr = buf;
101 void *p = monhdr + 1, *end = buf + len, *lenp;
102 int i, num;
103 int ret;
104
105 dout("auth_build_hello\n");
106 monhdr->have_version = 0;
107 monhdr->session_mon = cpu_to_le16(-1);
108 monhdr->session_mon_tid = 0;
109
110 ceph_encode_32(&p, 0); /* no protocol, yet */
111
112 lenp = p;
113 p += sizeof(u32);
114
115 ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
116 ceph_encode_8(&p, 1);
117 num = ARRAY_SIZE(supported_protocols);
118 ceph_encode_32(&p, num);
119 ceph_decode_need(&p, end, num * sizeof(u32), bad);
120 for (i = 0; i < num; i++)
121 ceph_encode_32(&p, supported_protocols[i]);
122
123 ret = ceph_entity_name_encode(ac->name, &p, end);
124 if (ret < 0)
125 return ret;
126 ceph_decode_need(&p, end, sizeof(u64), bad);
127 ceph_encode_64(&p, ac->global_id);
128
129 ceph_encode_32(&lenp, p - lenp - sizeof(u32));
130 return p - buf;
131
132bad:
133 return -ERANGE;
134}
135
136static int ceph_build_auth_request(struct ceph_auth_client *ac,
137 void *msg_buf, size_t msg_len)
138{
139 struct ceph_mon_request_header *monhdr = msg_buf;
140 void *p = monhdr + 1;
141 void *end = msg_buf + msg_len;
142 int ret;
143
144 monhdr->have_version = 0;
145 monhdr->session_mon = cpu_to_le16(-1);
146 monhdr->session_mon_tid = 0;
147
148 ceph_encode_32(&p, ac->protocol);
149
150 ret = ac->ops->build_request(ac, p + sizeof(u32), end);
151 if (ret < 0) {
152 pr_err("error %d building auth method %s request\n", ret,
153 ac->ops->name);
154 return ret;
155 }
156 dout(" built request %d bytes\n", ret);
157 ceph_encode_32(&p, ret);
158 return p + ret - msg_buf;
159}
160
161/*
162 * Handle auth message from monitor.
163 */
164int ceph_handle_auth_reply(struct ceph_auth_client *ac,
165 void *buf, size_t len,
166 void *reply_buf, size_t reply_len)
167{
168 void *p = buf;
169 void *end = buf + len;
170 int protocol;
171 s32 result;
172 u64 global_id;
173 void *payload, *payload_end;
174 int payload_len;
175 char *result_msg;
176 int result_msg_len;
177 int ret = -EINVAL;
178
179 dout("handle_auth_reply %p %p\n", p, end);
180 ceph_decode_need(&p, end, sizeof(u32) * 3 + sizeof(u64), bad);
181 protocol = ceph_decode_32(&p);
182 result = ceph_decode_32(&p);
183 global_id = ceph_decode_64(&p);
184 payload_len = ceph_decode_32(&p);
185 payload = p;
186 p += payload_len;
187 ceph_decode_need(&p, end, sizeof(u32), bad);
188 result_msg_len = ceph_decode_32(&p);
189 result_msg = p;
190 p += result_msg_len;
191 if (p != end)
192 goto bad;
193
194 dout(" result %d '%.*s' gid %llu len %d\n", result, result_msg_len,
195 result_msg, global_id, payload_len);
196
197 payload_end = payload + payload_len;
198
199 if (global_id && ac->global_id != global_id) {
200 dout(" set global_id %lld -> %lld\n", ac->global_id, global_id);
201 ac->global_id = global_id;
202 }
203
204 if (ac->negotiating) {
205 /* server does not support our protocols? */
206 if (!protocol && result < 0) {
207 ret = result;
208 goto out;
209 }
210 /* set up (new) protocol handler? */
211 if (ac->protocol && ac->protocol != protocol) {
212 ac->ops->destroy(ac);
213 ac->protocol = 0;
214 ac->ops = NULL;
215 }
216 if (ac->protocol != protocol) {
217 ret = ceph_auth_init_protocol(ac, protocol);
218 if (ret) {
219 pr_err("error %d on auth protocol %d init\n",
220 ret, protocol);
221 goto out;
222 }
223 }
224
225 ac->negotiating = false;
226 }
227
228 ret = ac->ops->handle_reply(ac, result, payload, payload_end);
229 if (ret == -EAGAIN) {
230 return ceph_build_auth_request(ac, reply_buf, reply_len);
231 } else if (ret) {
232 pr_err("auth method '%s' error %d\n", ac->ops->name, ret);
233 return ret;
234 }
235 return 0;
236
237bad:
238 pr_err("failed to decode auth msg\n");
239out:
240 return ret;
241}
242
243int ceph_build_auth(struct ceph_auth_client *ac,
244 void *msg_buf, size_t msg_len)
245{
246 if (!ac->protocol)
247 return ceph_auth_build_hello(ac, msg_buf, msg_len);
248 BUG_ON(!ac->ops);
249 if (ac->ops->should_authenticate(ac))
250 return ceph_build_auth_request(ac, msg_buf, msg_len);
251 return 0;
252}
253
254int ceph_auth_is_authenticated(struct ceph_auth_client *ac)
255{
256 if (!ac->ops)
257 return 0;
258 return ac->ops->is_authenticated(ac);
259}
diff --git a/net/ceph/auth_none.c b/net/ceph/auth_none.c
new file mode 100644
index 000000000000..214c2bb43d62
--- /dev/null
+++ b/net/ceph/auth_none.c
@@ -0,0 +1,132 @@
1
2#include <linux/ceph/ceph_debug.h>
3
4#include <linux/err.h>
5#include <linux/module.h>
6#include <linux/random.h>
7#include <linux/slab.h>
8
9#include <linux/ceph/decode.h>
10#include <linux/ceph/auth.h>
11
12#include "auth_none.h"
13
14static void reset(struct ceph_auth_client *ac)
15{
16 struct ceph_auth_none_info *xi = ac->private;
17
18 xi->starting = true;
19 xi->built_authorizer = false;
20}
21
22static void destroy(struct ceph_auth_client *ac)
23{
24 kfree(ac->private);
25 ac->private = NULL;
26}
27
28static int is_authenticated(struct ceph_auth_client *ac)
29{
30 struct ceph_auth_none_info *xi = ac->private;
31
32 return !xi->starting;
33}
34
35static int should_authenticate(struct ceph_auth_client *ac)
36{
37 struct ceph_auth_none_info *xi = ac->private;
38
39 return xi->starting;
40}
41
42/*
43 * the generic auth code decode the global_id, and we carry no actual
44 * authenticate state, so nothing happens here.
45 */
46static int handle_reply(struct ceph_auth_client *ac, int result,
47 void *buf, void *end)
48{
49 struct ceph_auth_none_info *xi = ac->private;
50
51 xi->starting = false;
52 return result;
53}
54
55/*
56 * build an 'authorizer' with our entity_name and global_id. we can
57 * reuse a single static copy since it is identical for all services
58 * we connect to.
59 */
60static int ceph_auth_none_create_authorizer(
61 struct ceph_auth_client *ac, int peer_type,
62 struct ceph_authorizer **a,
63 void **buf, size_t *len,
64 void **reply_buf, size_t *reply_len)
65{
66 struct ceph_auth_none_info *ai = ac->private;
67 struct ceph_none_authorizer *au = &ai->au;
68 void *p, *end;
69 int ret;
70
71 if (!ai->built_authorizer) {
72 p = au->buf;
73 end = p + sizeof(au->buf);
74 ceph_encode_8(&p, 1);
75 ret = ceph_entity_name_encode(ac->name, &p, end - 8);
76 if (ret < 0)
77 goto bad;
78 ceph_decode_need(&p, end, sizeof(u64), bad2);
79 ceph_encode_64(&p, ac->global_id);
80 au->buf_len = p - (void *)au->buf;
81 ai->built_authorizer = true;
82 dout("built authorizer len %d\n", au->buf_len);
83 }
84
85 *a = (struct ceph_authorizer *)au;
86 *buf = au->buf;
87 *len = au->buf_len;
88 *reply_buf = au->reply_buf;
89 *reply_len = sizeof(au->reply_buf);
90 return 0;
91
92bad2:
93 ret = -ERANGE;
94bad:
95 return ret;
96}
97
/*
 * The authorizer lives inside the protocol's private state and is freed
 * together with it in destroy(), so there is nothing to release here.
 */
static void ceph_auth_none_destroy_authorizer(struct ceph_auth_client *ac,
					      struct ceph_authorizer *a)
{
}
103
104static const struct ceph_auth_client_ops ceph_auth_none_ops = {
105 .name = "none",
106 .reset = reset,
107 .destroy = destroy,
108 .is_authenticated = is_authenticated,
109 .should_authenticate = should_authenticate,
110 .handle_reply = handle_reply,
111 .create_authorizer = ceph_auth_none_create_authorizer,
112 .destroy_authorizer = ceph_auth_none_destroy_authorizer,
113};
114
115int ceph_auth_none_init(struct ceph_auth_client *ac)
116{
117 struct ceph_auth_none_info *xi;
118
119 dout("ceph_auth_none_init %p\n", ac);
120 xi = kzalloc(sizeof(*xi), GFP_NOFS);
121 if (!xi)
122 return -ENOMEM;
123
124 xi->starting = true;
125 xi->built_authorizer = false;
126
127 ac->protocol = CEPH_AUTH_NONE;
128 ac->private = xi;
129 ac->ops = &ceph_auth_none_ops;
130 return 0;
131}
132
diff --git a/net/ceph/auth_none.h b/net/ceph/auth_none.h
new file mode 100644
index 000000000000..ed7d088b1bc9
--- /dev/null
+++ b/net/ceph/auth_none.h
@@ -0,0 +1,29 @@
1#ifndef _FS_CEPH_AUTH_NONE_H
2#define _FS_CEPH_AUTH_NONE_H
3
4#include <linux/slab.h>
5#include <linux/ceph/auth.h>
6
7/*
8 * null security mode.
9 *
10 * we use a single static authorizer that simply encodes our entity name
11 * and global id.
12 */
13
/* The authorizer handed out by ceph_auth_none_create_authorizer(). */
14struct ceph_none_authorizer {
15 char buf[128]; /* encoded struct version, entity name, global_id */
16 int buf_len; /* bytes of buf actually used */
17 char reply_buf[0]; /* zero-length, so the reported reply size is 0 */
18};
19
/* Private state of the "none" protocol, stored in ac->private. */
20struct ceph_auth_none_info {
21 bool starting; /* true until the first reply has been handled */
22 bool built_authorizer; /* au.buf already holds the encoded authorizer */
23 struct ceph_none_authorizer au; /* we only need one; it's static */
24};
25
26extern int ceph_auth_none_init(struct ceph_auth_client *ac);
27
28#endif
29
diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c
new file mode 100644
index 000000000000..7fd5dfcf6e18
--- /dev/null
+++ b/net/ceph/auth_x.c
@@ -0,0 +1,688 @@
1
2#include <linux/ceph/ceph_debug.h>
3
4#include <linux/err.h>
5#include <linux/module.h>
6#include <linux/random.h>
7#include <linux/slab.h>
8
9#include <linux/ceph/decode.h>
10#include <linux/ceph/auth.h>
11
12#include "crypto.h"
13#include "auth_x.h"
14#include "auth_x_protocol.h"
15
16#define TEMP_TICKET_BUF_LEN 256
17
18static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed);
19
20static int ceph_x_is_authenticated(struct ceph_auth_client *ac)
21{
22 struct ceph_x_info *xi = ac->private;
23 int need;
24
25 ceph_x_validate_tickets(ac, &need);
26 dout("ceph_x_is_authenticated want=%d need=%d have=%d\n",
27 ac->want_keys, need, xi->have_keys);
28 return (ac->want_keys & xi->have_keys) == ac->want_keys;
29}
30
31static int ceph_x_should_authenticate(struct ceph_auth_client *ac)
32{
33 struct ceph_x_info *xi = ac->private;
34 int need;
35
36 ceph_x_validate_tickets(ac, &need);
37 dout("ceph_x_should_authenticate want=%d need=%d have=%d\n",
38 ac->want_keys, need, xi->have_keys);
39 return need != 0;
40}
41
42static int ceph_x_encrypt_buflen(int ilen)
43{
44 return sizeof(struct ceph_x_encrypt_header) + ilen + 16 +
45 sizeof(u32);
46}
47
48static int ceph_x_encrypt(struct ceph_crypto_key *secret,
49 void *ibuf, int ilen, void *obuf, size_t olen)
50{
51 struct ceph_x_encrypt_header head = {
52 .struct_v = 1,
53 .magic = cpu_to_le64(CEPHX_ENC_MAGIC)
54 };
55 size_t len = olen - sizeof(u32);
56 int ret;
57
58 ret = ceph_encrypt2(secret, obuf + sizeof(u32), &len,
59 &head, sizeof(head), ibuf, ilen);
60 if (ret)
61 return ret;
62 ceph_encode_32(&obuf, len);
63 return len + sizeof(u32);
64}
65
66static int ceph_x_decrypt(struct ceph_crypto_key *secret,
67 void **p, void *end, void *obuf, size_t olen)
68{
69 struct ceph_x_encrypt_header head;
70 size_t head_len = sizeof(head);
71 int len, ret;
72
73 len = ceph_decode_32(p);
74 if (*p + len > end)
75 return -EINVAL;
76
77 dout("ceph_x_decrypt len %d\n", len);
78 ret = ceph_decrypt2(secret, &head, &head_len, obuf, &olen,
79 *p, len);
80 if (ret)
81 return ret;
82 if (head.struct_v != 1 || le64_to_cpu(head.magic) != CEPHX_ENC_MAGIC)
83 return -EPERM;
84 *p += len;
85 return olen;
86}
87
88/*
89 * get existing (or insert new) ticket handler
90 */
91static struct ceph_x_ticket_handler *
92get_ticket_handler(struct ceph_auth_client *ac, int service)
93{
94 struct ceph_x_ticket_handler *th;
95 struct ceph_x_info *xi = ac->private;
96 struct rb_node *parent = NULL, **p = &xi->ticket_handlers.rb_node;
97
98 while (*p) {
99 parent = *p;
100 th = rb_entry(parent, struct ceph_x_ticket_handler, node);
101 if (service < th->service)
102 p = &(*p)->rb_left;
103 else if (service > th->service)
104 p = &(*p)->rb_right;
105 else
106 return th;
107 }
108
109 /* add it */
110 th = kzalloc(sizeof(*th), GFP_NOFS);
111 if (!th)
112 return ERR_PTR(-ENOMEM);
113 th->service = service;
114 rb_link_node(&th->node, parent, p);
115 rb_insert_color(&th->node, &xi->ticket_handlers);
116 return th;
117}
118
119static void remove_ticket_handler(struct ceph_auth_client *ac,
120 struct ceph_x_ticket_handler *th)
121{
122 struct ceph_x_info *xi = ac->private;
123
124 dout("remove_ticket_handler %p %d\n", th, th->service);
125 rb_erase(&th->node, &xi->ticket_handlers);
126 ceph_crypto_key_destroy(&th->session_key);
127 if (th->ticket_blob)
128 ceph_buffer_put(th->ticket_blob);
129 kfree(th);
130}
131
132static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
133 struct ceph_crypto_key *secret,
134 void *buf, void *end)
135{
136 struct ceph_x_info *xi = ac->private;
137 int num;
138 void *p = buf;
139 int ret;
140 char *dbuf;
141 char *ticket_buf;
142 u8 reply_struct_v;
143
144 dbuf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS);
145 if (!dbuf)
146 return -ENOMEM;
147
148 ret = -ENOMEM;
149 ticket_buf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS);
150 if (!ticket_buf)
151 goto out_dbuf;
152
153 ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
154 reply_struct_v = ceph_decode_8(&p);
155 if (reply_struct_v != 1)
156 goto bad;
157 num = ceph_decode_32(&p);
158 dout("%d tickets\n", num);
159 while (num--) {
160 int type;
161 u8 tkt_struct_v, blob_struct_v;
162 struct ceph_x_ticket_handler *th;
163 void *dp, *dend;
164 int dlen;
165 char is_enc;
166 struct timespec validity;
167 struct ceph_crypto_key old_key;
168 void *tp, *tpend;
169 struct ceph_timespec new_validity;
170 struct ceph_crypto_key new_session_key;
171 struct ceph_buffer *new_ticket_blob;
172 unsigned long new_expires, new_renew_after;
173 u64 new_secret_id;
174
175 ceph_decode_need(&p, end, sizeof(u32) + 1, bad);
176
177 type = ceph_decode_32(&p);
178 dout(" ticket type %d %s\n", type, ceph_entity_type_name(type));
179
180 tkt_struct_v = ceph_decode_8(&p);
181 if (tkt_struct_v != 1)
182 goto bad;
183
184 th = get_ticket_handler(ac, type);
185 if (IS_ERR(th)) {
186 ret = PTR_ERR(th);
187 goto out;
188 }
189
190 /* blob for me */
191 dlen = ceph_x_decrypt(secret, &p, end, dbuf,
192 TEMP_TICKET_BUF_LEN);
193 if (dlen <= 0) {
194 ret = dlen;
195 goto out;
196 }
197 dout(" decrypted %d bytes\n", dlen);
198 dend = dbuf + dlen;
199 dp = dbuf;
200
201 tkt_struct_v = ceph_decode_8(&dp);
202 if (tkt_struct_v != 1)
203 goto bad;
204
205 memcpy(&old_key, &th->session_key, sizeof(old_key));
206 ret = ceph_crypto_key_decode(&new_session_key, &dp, dend);
207 if (ret)
208 goto out;
209
210 ceph_decode_copy(&dp, &new_validity, sizeof(new_validity));
211 ceph_decode_timespec(&validity, &new_validity);
212 new_expires = get_seconds() + validity.tv_sec;
213 new_renew_after = new_expires - (validity.tv_sec / 4);
214 dout(" expires=%lu renew_after=%lu\n", new_expires,
215 new_renew_after);
216
217 /* ticket blob for service */
218 ceph_decode_8_safe(&p, end, is_enc, bad);
219 tp = ticket_buf;
220 if (is_enc) {
221 /* encrypted */
222 dout(" encrypted ticket\n");
223 dlen = ceph_x_decrypt(&old_key, &p, end, ticket_buf,
224 TEMP_TICKET_BUF_LEN);
225 if (dlen < 0) {
226 ret = dlen;
227 goto out;
228 }
229 dlen = ceph_decode_32(&tp);
230 } else {
231 /* unencrypted */
232 ceph_decode_32_safe(&p, end, dlen, bad);
233 ceph_decode_need(&p, end, dlen, bad);
234 ceph_decode_copy(&p, ticket_buf, dlen);
235 }
236 tpend = tp + dlen;
237 dout(" ticket blob is %d bytes\n", dlen);
238 ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad);
239 blob_struct_v = ceph_decode_8(&tp);
240 new_secret_id = ceph_decode_64(&tp);
241 ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend);
242 if (ret)
243 goto out;
244
245 /* all is well, update our ticket */
246 ceph_crypto_key_destroy(&th->session_key);
247 if (th->ticket_blob)
248 ceph_buffer_put(th->ticket_blob);
249 th->session_key = new_session_key;
250 th->ticket_blob = new_ticket_blob;
251 th->validity = new_validity;
252 th->secret_id = new_secret_id;
253 th->expires = new_expires;
254 th->renew_after = new_renew_after;
255 dout(" got ticket service %d (%s) secret_id %lld len %d\n",
256 type, ceph_entity_type_name(type), th->secret_id,
257 (int)th->ticket_blob->vec.iov_len);
258 xi->have_keys |= th->service;
259 }
260
261 ret = 0;
262out:
263 kfree(ticket_buf);
264out_dbuf:
265 kfree(dbuf);
266 return ret;
267
268bad:
269 ret = -EINVAL;
270 goto out;
271}
272
273static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
274 struct ceph_x_ticket_handler *th,
275 struct ceph_x_authorizer *au)
276{
277 int maxlen;
278 struct ceph_x_authorize_a *msg_a;
279 struct ceph_x_authorize_b msg_b;
280 void *p, *end;
281 int ret;
282 int ticket_blob_len =
283 (th->ticket_blob ? th->ticket_blob->vec.iov_len : 0);
284
285 dout("build_authorizer for %s %p\n",
286 ceph_entity_type_name(th->service), au);
287
288 maxlen = sizeof(*msg_a) + sizeof(msg_b) +
289 ceph_x_encrypt_buflen(ticket_blob_len);
290 dout(" need len %d\n", maxlen);
291 if (au->buf && au->buf->alloc_len < maxlen) {
292 ceph_buffer_put(au->buf);
293 au->buf = NULL;
294 }
295 if (!au->buf) {
296 au->buf = ceph_buffer_new(maxlen, GFP_NOFS);
297 if (!au->buf)
298 return -ENOMEM;
299 }
300 au->service = th->service;
301
302 msg_a = au->buf->vec.iov_base;
303 msg_a->struct_v = 1;
304 msg_a->global_id = cpu_to_le64(ac->global_id);
305 msg_a->service_id = cpu_to_le32(th->service);
306 msg_a->ticket_blob.struct_v = 1;
307 msg_a->ticket_blob.secret_id = cpu_to_le64(th->secret_id);
308 msg_a->ticket_blob.blob_len = cpu_to_le32(ticket_blob_len);
309 if (ticket_blob_len) {
310 memcpy(msg_a->ticket_blob.blob, th->ticket_blob->vec.iov_base,
311 th->ticket_blob->vec.iov_len);
312 }
313 dout(" th %p secret_id %lld %lld\n", th, th->secret_id,
314 le64_to_cpu(msg_a->ticket_blob.secret_id));
315
316 p = msg_a + 1;
317 p += ticket_blob_len;
318 end = au->buf->vec.iov_base + au->buf->vec.iov_len;
319
320 get_random_bytes(&au->nonce, sizeof(au->nonce));
321 msg_b.struct_v = 1;
322 msg_b.nonce = cpu_to_le64(au->nonce);
323 ret = ceph_x_encrypt(&th->session_key, &msg_b, sizeof(msg_b),
324 p, end - p);
325 if (ret < 0)
326 goto out_buf;
327 p += ret;
328 au->buf->vec.iov_len = p - au->buf->vec.iov_base;
329 dout(" built authorizer nonce %llx len %d\n", au->nonce,
330 (int)au->buf->vec.iov_len);
331 BUG_ON(au->buf->vec.iov_len > maxlen);
332 return 0;
333
334out_buf:
335 ceph_buffer_put(au->buf);
336 au->buf = NULL;
337 return ret;
338}
339
340static int ceph_x_encode_ticket(struct ceph_x_ticket_handler *th,
341 void **p, void *end)
342{
343 ceph_decode_need(p, end, 1 + sizeof(u64), bad);
344 ceph_encode_8(p, 1);
345 ceph_encode_64(p, th->secret_id);
346 if (th->ticket_blob) {
347 const char *buf = th->ticket_blob->vec.iov_base;
348 u32 len = th->ticket_blob->vec.iov_len;
349
350 ceph_encode_32_safe(p, end, len, bad);
351 ceph_encode_copy_safe(p, end, buf, len, bad);
352 } else {
353 ceph_encode_32_safe(p, end, 0, bad);
354 }
355
356 return 0;
357bad:
358 return -ERANGE;
359}
360
361static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed)
362{
363 int want = ac->want_keys;
364 struct ceph_x_info *xi = ac->private;
365 int service;
366
367 *pneed = ac->want_keys & ~(xi->have_keys);
368
369 for (service = 1; service <= want; service <<= 1) {
370 struct ceph_x_ticket_handler *th;
371
372 if (!(ac->want_keys & service))
373 continue;
374
375 if (*pneed & service)
376 continue;
377
378 th = get_ticket_handler(ac, service);
379
380 if (IS_ERR(th)) {
381 *pneed |= service;
382 continue;
383 }
384
385 if (get_seconds() >= th->renew_after)
386 *pneed |= service;
387 if (get_seconds() >= th->expires)
388 xi->have_keys &= ~service;
389 }
390}
391
392
393static int ceph_x_build_request(struct ceph_auth_client *ac,
394 void *buf, void *end)
395{
396 struct ceph_x_info *xi = ac->private;
397 int need;
398 struct ceph_x_request_header *head = buf;
399 int ret;
400 struct ceph_x_ticket_handler *th =
401 get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
402
403 if (IS_ERR(th))
404 return PTR_ERR(th);
405
406 ceph_x_validate_tickets(ac, &need);
407
408 dout("build_request want %x have %x need %x\n",
409 ac->want_keys, xi->have_keys, need);
410
411 if (need & CEPH_ENTITY_TYPE_AUTH) {
412 struct ceph_x_authenticate *auth = (void *)(head + 1);
413 void *p = auth + 1;
414 struct ceph_x_challenge_blob tmp;
415 char tmp_enc[40];
416 u64 *u;
417
418 if (p > end)
419 return -ERANGE;
420
421 dout(" get_auth_session_key\n");
422 head->op = cpu_to_le16(CEPHX_GET_AUTH_SESSION_KEY);
423
424 /* encrypt and hash */
425 get_random_bytes(&auth->client_challenge, sizeof(u64));
426 tmp.client_challenge = auth->client_challenge;
427 tmp.server_challenge = cpu_to_le64(xi->server_challenge);
428 ret = ceph_x_encrypt(&xi->secret, &tmp, sizeof(tmp),
429 tmp_enc, sizeof(tmp_enc));
430 if (ret < 0)
431 return ret;
432
433 auth->struct_v = 1;
434 auth->key = 0;
435 for (u = (u64 *)tmp_enc; u + 1 <= (u64 *)(tmp_enc + ret); u++)
436 auth->key ^= *(__le64 *)u;
437 dout(" server_challenge %llx client_challenge %llx key %llx\n",
438 xi->server_challenge, le64_to_cpu(auth->client_challenge),
439 le64_to_cpu(auth->key));
440
441 /* now encode the old ticket if exists */
442 ret = ceph_x_encode_ticket(th, &p, end);
443 if (ret < 0)
444 return ret;
445
446 return p - buf;
447 }
448
449 if (need) {
450 void *p = head + 1;
451 struct ceph_x_service_ticket_request *req;
452
453 if (p > end)
454 return -ERANGE;
455 head->op = cpu_to_le16(CEPHX_GET_PRINCIPAL_SESSION_KEY);
456
457 ret = ceph_x_build_authorizer(ac, th, &xi->auth_authorizer);
458 if (ret)
459 return ret;
460 ceph_encode_copy(&p, xi->auth_authorizer.buf->vec.iov_base,
461 xi->auth_authorizer.buf->vec.iov_len);
462
463 req = p;
464 req->keys = cpu_to_le32(need);
465 p += sizeof(*req);
466 return p - buf;
467 }
468
469 return 0;
470}
471
472static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
473 void *buf, void *end)
474{
475 struct ceph_x_info *xi = ac->private;
476 struct ceph_x_reply_header *head = buf;
477 struct ceph_x_ticket_handler *th;
478 int len = end - buf;
479 int op;
480 int ret;
481
482 if (result)
483 return result; /* XXX hmm? */
484
485 if (xi->starting) {
486 /* it's a hello */
487 struct ceph_x_server_challenge *sc = buf;
488
489 if (len != sizeof(*sc))
490 return -EINVAL;
491 xi->server_challenge = le64_to_cpu(sc->server_challenge);
492 dout("handle_reply got server challenge %llx\n",
493 xi->server_challenge);
494 xi->starting = false;
495 xi->have_keys &= ~CEPH_ENTITY_TYPE_AUTH;
496 return -EAGAIN;
497 }
498
499 op = le16_to_cpu(head->op);
500 result = le32_to_cpu(head->result);
501 dout("handle_reply op %d result %d\n", op, result);
502 switch (op) {
503 case CEPHX_GET_AUTH_SESSION_KEY:
504 /* verify auth key */
505 ret = ceph_x_proc_ticket_reply(ac, &xi->secret,
506 buf + sizeof(*head), end);
507 break;
508
509 case CEPHX_GET_PRINCIPAL_SESSION_KEY:
510 th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
511 if (IS_ERR(th))
512 return PTR_ERR(th);
513 ret = ceph_x_proc_ticket_reply(ac, &th->session_key,
514 buf + sizeof(*head), end);
515 break;
516
517 default:
518 return -EINVAL;
519 }
520 if (ret)
521 return ret;
522 if (ac->want_keys == xi->have_keys)
523 return 0;
524 return -EAGAIN;
525}
526
527static int ceph_x_create_authorizer(
528 struct ceph_auth_client *ac, int peer_type,
529 struct ceph_authorizer **a,
530 void **buf, size_t *len,
531 void **reply_buf, size_t *reply_len)
532{
533 struct ceph_x_authorizer *au;
534 struct ceph_x_ticket_handler *th;
535 int ret;
536
537 th = get_ticket_handler(ac, peer_type);
538 if (IS_ERR(th))
539 return PTR_ERR(th);
540
541 au = kzalloc(sizeof(*au), GFP_NOFS);
542 if (!au)
543 return -ENOMEM;
544
545 ret = ceph_x_build_authorizer(ac, th, au);
546 if (ret) {
547 kfree(au);
548 return ret;
549 }
550
551 *a = (struct ceph_authorizer *)au;
552 *buf = au->buf->vec.iov_base;
553 *len = au->buf->vec.iov_len;
554 *reply_buf = au->reply_buf;
555 *reply_len = sizeof(au->reply_buf);
556 return 0;
557}
558
559static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac,
560 struct ceph_authorizer *a, size_t len)
561{
562 struct ceph_x_authorizer *au = (void *)a;
563 struct ceph_x_ticket_handler *th;
564 int ret = 0;
565 struct ceph_x_authorize_reply reply;
566 void *p = au->reply_buf;
567 void *end = p + sizeof(au->reply_buf);
568
569 th = get_ticket_handler(ac, au->service);
570 if (IS_ERR(th))
571 return PTR_ERR(th);
572 ret = ceph_x_decrypt(&th->session_key, &p, end, &reply, sizeof(reply));
573 if (ret < 0)
574 return ret;
575 if (ret != sizeof(reply))
576 return -EPERM;
577
578 if (au->nonce + 1 != le64_to_cpu(reply.nonce_plus_one))
579 ret = -EPERM;
580 else
581 ret = 0;
582 dout("verify_authorizer_reply nonce %llx got %llx ret %d\n",
583 au->nonce, le64_to_cpu(reply.nonce_plus_one), ret);
584 return ret;
585}
586
587static void ceph_x_destroy_authorizer(struct ceph_auth_client *ac,
588 struct ceph_authorizer *a)
589{
590 struct ceph_x_authorizer *au = (void *)a;
591
592 ceph_buffer_put(au->buf);
593 kfree(au);
594}
595
596
597static void ceph_x_reset(struct ceph_auth_client *ac)
598{
599 struct ceph_x_info *xi = ac->private;
600
601 dout("reset\n");
602 xi->starting = true;
603 xi->server_challenge = 0;
604}
605
606static void ceph_x_destroy(struct ceph_auth_client *ac)
607{
608 struct ceph_x_info *xi = ac->private;
609 struct rb_node *p;
610
611 dout("ceph_x_destroy %p\n", ac);
612 ceph_crypto_key_destroy(&xi->secret);
613
614 while ((p = rb_first(&xi->ticket_handlers)) != NULL) {
615 struct ceph_x_ticket_handler *th =
616 rb_entry(p, struct ceph_x_ticket_handler, node);
617 remove_ticket_handler(ac, th);
618 }
619
620 if (xi->auth_authorizer.buf)
621 ceph_buffer_put(xi->auth_authorizer.buf);
622
623 kfree(ac->private);
624 ac->private = NULL;
625}
626
/*
 * Discard the ticket handler for @peer_type along with its ticket.
 * Nothing happens if the handler cannot be obtained.
 */
static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
					 int peer_type)
{
	struct ceph_x_ticket_handler *th = get_ticket_handler(ac, peer_type);

	if (IS_ERR(th))
		return;

	remove_ticket_handler(ac, th);
}
636
637
638static const struct ceph_auth_client_ops ceph_x_ops = {
639 .name = "x",
640 .is_authenticated = ceph_x_is_authenticated,
641 .should_authenticate = ceph_x_should_authenticate,
642 .build_request = ceph_x_build_request,
643 .handle_reply = ceph_x_handle_reply,
644 .create_authorizer = ceph_x_create_authorizer,
645 .verify_authorizer_reply = ceph_x_verify_authorizer_reply,
646 .destroy_authorizer = ceph_x_destroy_authorizer,
647 .invalidate_authorizer = ceph_x_invalidate_authorizer,
648 .reset = ceph_x_reset,
649 .destroy = ceph_x_destroy,
650};
651
652
653int ceph_x_init(struct ceph_auth_client *ac)
654{
655 struct ceph_x_info *xi;
656 int ret;
657
658 dout("ceph_x_init %p\n", ac);
659 ret = -ENOMEM;
660 xi = kzalloc(sizeof(*xi), GFP_NOFS);
661 if (!xi)
662 goto out;
663
664 ret = -EINVAL;
665 if (!ac->secret) {
666 pr_err("no secret set (for auth_x protocol)\n");
667 goto out_nomem;
668 }
669
670 ret = ceph_crypto_key_unarmor(&xi->secret, ac->secret);
671 if (ret)
672 goto out_nomem;
673
674 xi->starting = true;
675 xi->ticket_handlers = RB_ROOT;
676
677 ac->protocol = CEPH_AUTH_CEPHX;
678 ac->private = xi;
679 ac->ops = &ceph_x_ops;
680 return 0;
681
682out_nomem:
683 kfree(xi);
684out:
685 return ret;
686}
687
688
diff --git a/net/ceph/auth_x.h b/net/ceph/auth_x.h
new file mode 100644
index 000000000000..e02da7a5c5a1
--- /dev/null
+++ b/net/ceph/auth_x.h
@@ -0,0 +1,50 @@
1#ifndef _FS_CEPH_AUTH_X_H
2#define _FS_CEPH_AUTH_X_H
3
4#include <linux/rbtree.h>
5
6#include <linux/ceph/auth.h>
7
8#include "crypto.h"
9#include "auth_x_protocol.h"
10
11/*
12 * Handle ticket for a single service.
13 */
14struct ceph_x_ticket_handler {
15 struct rb_node node; /* links into ceph_x_info.ticket_handlers */
16 unsigned service; /* tree key: the service this ticket is for */
17
18 struct ceph_crypto_key session_key; /* key decoded from the ticket reply */
19 struct ceph_timespec validity; /* validity period as received */
20
21 u64 secret_id; /* sent along with the ticket blob */
22 struct ceph_buffer *ticket_blob; /* opaque ticket; NULL until received */
23
24 unsigned long renew_after, expires; /* compared against get_seconds() */
25};
26
27
/* An authorizer built for one service by ceph_x_build_authorizer(). */
28struct ceph_x_authorizer {
29 struct ceph_buffer *buf; /* encoded authorizer */
30 unsigned service; /* service the authorizer was built for */
31 u64 nonce; /* random; the reply must carry nonce + 1 */
32 char reply_buf[128]; /* big enough for encrypted blob */
33};
34
/* Private state of the cephx protocol, stored in ac->private. */
35struct ceph_x_info {
36 struct ceph_crypto_key secret; /* decoded from ac->secret at init */
37
38 bool starting; /* true until the server challenge arrives */
39 u64 server_challenge; /* from the monitor's first reply */
40
41 unsigned have_keys; /* bitmask of services we hold tickets for */
42 struct rb_root ticket_handlers; /* ceph_x_ticket_handler, by service */
43
44 struct ceph_x_authorizer auth_authorizer; /* used in ticket requests */
45};
46
47extern int ceph_x_init(struct ceph_auth_client *ac);
48
49#endif
50
diff --git a/net/ceph/auth_x_protocol.h b/net/ceph/auth_x_protocol.h
new file mode 100644
index 000000000000..671d30576c4f
--- /dev/null
+++ b/net/ceph/auth_x_protocol.h
@@ -0,0 +1,90 @@
1#ifndef __FS_CEPH_AUTH_X_PROTOCOL
2#define __FS_CEPH_AUTH_X_PROTOCOL
3
4#define CEPHX_GET_AUTH_SESSION_KEY 0x0100
5#define CEPHX_GET_PRINCIPAL_SESSION_KEY 0x0200
6#define CEPHX_GET_ROTATING_KEY 0x0400
7
8/* common bits */
9struct ceph_x_ticket_blob {
10 __u8 struct_v; /* encoders in this code always write 1 */
11 __le64 secret_id;
12 __le32 blob_len; /* number of bytes in blob[] */
13 char blob[]; /* opaque ticket bytes */
14} __attribute__ ((packed));
15
16
17/* common request/reply headers */
18struct ceph_x_request_header {
19 __le16 op; /* one of the CEPHX_GET_* values */
20} __attribute__ ((packed));
21
22struct ceph_x_reply_header {
23 __le16 op; /* selects how the rest of the reply is decoded */
24 __le32 result;
25} __attribute__ ((packed));
26
27
28/* authenticate handshake */
29
30/* initial hello (no reply header) */
31struct ceph_x_server_challenge {
32 __u8 struct_v;
33 __le64 server_challenge; /* saved and mixed into the authenticate key */
34} __attribute__ ((packed));
35
/* Body of a CEPHX_GET_AUTH_SESSION_KEY request. */
36struct ceph_x_authenticate {
37 __u8 struct_v;
38 __le64 client_challenge; /* random, chosen per request */
39 __le64 key; /* XOR-fold of the encrypted challenge blob */
40 /* ticket blob */
41} __attribute__ ((packed));
42
/* Trailer of a CEPHX_GET_PRINCIPAL_SESSION_KEY request. */
43struct ceph_x_service_ticket_request {
44 __u8 struct_v;
45 __le32 keys; /* bitmask of the service tickets being requested */
46} __attribute__ ((packed));
47
/* Plaintext that is encrypted with our secret to derive the key. */
48struct ceph_x_challenge_blob {
49 __le64 server_challenge;
50 __le64 client_challenge;
51} __attribute__ ((packed));
52
53
54
55/* authorize handshake */
56
57/*
58 * The authorizer consists of two pieces:
59 * a - service id, ticket blob
60 * b - encrypted with session key
61 */
62struct ceph_x_authorize_a {
63 __u8 struct_v;
64 __le64 global_id; /* our global_id */
65 __le32 service_id; /* service the ticket belongs to */
66 struct ceph_x_ticket_blob ticket_blob; /* variable length; must stay last */
67} __attribute__ ((packed));
68
69struct ceph_x_authorize_b {
70 __u8 struct_v;
71 __le64 nonce; /* random; sent encrypted with the session key */
72} __attribute__ ((packed));
73
74struct ceph_x_authorize_reply {
75 __u8 struct_v;
76 __le64 nonce_plus_one; /* must equal the nonce we sent, plus one */
77} __attribute__ ((packed));
78
79
80/*
81 * encryption bundle
82 */
83#define CEPHX_ENC_MAGIC 0xff009cad8826aa55ull
84
/* Placed in front of every plaintext; verified again after decryption. */
85struct ceph_x_encrypt_header {
86 __u8 struct_v; /* must be 1 */
87 __le64 magic; /* must be CEPHX_ENC_MAGIC */
88} __attribute__ ((packed));
89
90#endif
diff --git a/net/ceph/buffer.c b/net/ceph/buffer.c
new file mode 100644
index 000000000000..53d8abfa25d5
--- /dev/null
+++ b/net/ceph/buffer.c
@@ -0,0 +1,68 @@
1
2#include <linux/ceph/ceph_debug.h>
3
4#include <linux/module.h>
5#include <linux/slab.h>
6
7#include <linux/ceph/buffer.h>
8#include <linux/ceph/decode.h>
9
10struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp)
11{
12 struct ceph_buffer *b;
13
14 b = kmalloc(sizeof(*b), gfp);
15 if (!b)
16 return NULL;
17
18 b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN);
19 if (b->vec.iov_base) {
20 b->is_vmalloc = false;
21 } else {
22 b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL);
23 if (!b->vec.iov_base) {
24 kfree(b);
25 return NULL;
26 }
27 b->is_vmalloc = true;
28 }
29
30 kref_init(&b->kref);
31 b->alloc_len = len;
32 b->vec.iov_len = len;
33 dout("buffer_new %p\n", b);
34 return b;
35}
36EXPORT_SYMBOL(ceph_buffer_new);
37
38void ceph_buffer_release(struct kref *kref)
39{
40 struct ceph_buffer *b = container_of(kref, struct ceph_buffer, kref);
41
42 dout("buffer_release %p\n", b);
43 if (b->vec.iov_base) {
44 if (b->is_vmalloc)
45 vfree(b->vec.iov_base);
46 else
47 kfree(b->vec.iov_base);
48 }
49 kfree(b);
50}
51EXPORT_SYMBOL(ceph_buffer_release);
52
/*
 * Decode a length-prefixed byte string at *p into a freshly allocated
 * ceph_buffer stored in *b.
 *
 * The 32-bit length is read first, then that many bytes are copied into
 * the new buffer.
 * NOTE(review): ceph_decode_need() presumably jumps to 'bad' when fewer
 * than the requested bytes remain before @end, and the decode helpers
 * presumably advance *p -- confirm in linux/ceph/decode.h.
 *
 * Returns 0 on success, -ENOMEM if the buffer cannot be allocated, or
 * -EINVAL via the 'bad' label.
 */
int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end)
{
	size_t len;

	ceph_decode_need(p, end, sizeof(u32), bad);
	len = ceph_decode_32(p);
	dout("decode_buffer len %d\n", (int)len);
	ceph_decode_need(p, end, len, bad);
	*b = ceph_buffer_new(len, GFP_NOFS);
	if (!*b)
		return -ENOMEM;
	ceph_decode_copy(p, (*b)->vec.iov_base, len);
	return 0;
bad:
	return -EINVAL;
}
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
new file mode 100644
index 000000000000..f3e4a13fea0c
--- /dev/null
+++ b/net/ceph/ceph_common.c
@@ -0,0 +1,529 @@
1
2#include <linux/ceph/ceph_debug.h>
3#include <linux/backing-dev.h>
4#include <linux/ctype.h>
5#include <linux/fs.h>
6#include <linux/inet.h>
7#include <linux/in6.h>
8#include <linux/module.h>
9#include <linux/mount.h>
10#include <linux/parser.h>
11#include <linux/sched.h>
12#include <linux/seq_file.h>
13#include <linux/slab.h>
14#include <linux/statfs.h>
15#include <linux/string.h>
16
17
18#include <linux/ceph/libceph.h>
19#include <linux/ceph/debugfs.h>
20#include <linux/ceph/decode.h>
21#include <linux/ceph/mon_client.h>
22#include <linux/ceph/auth.h>
23
24
25
/*
 * Return a pointer to the last path component of the @len-byte string
 * @s (/foo/bar/baz -> baz).  If there is no '/', @s itself is returned;
 * if the string ends in '/', the result points at s + len.
 */
const char *ceph_file_part(const char *s, int len)
{
	const char *p;

	/* walk backwards until the byte just before p is a separator */
	for (p = s + len; p != s; p--) {
		if (p[-1] == '/')
			break;
	}
	return p;
}
37EXPORT_SYMBOL(ceph_file_part);
38
39const char *ceph_msg_type_name(int type)
40{
41 switch (type) {
42 case CEPH_MSG_SHUTDOWN: return "shutdown";
43 case CEPH_MSG_PING: return "ping";
44 case CEPH_MSG_AUTH: return "auth";
45 case CEPH_MSG_AUTH_REPLY: return "auth_reply";
46 case CEPH_MSG_MON_MAP: return "mon_map";
47 case CEPH_MSG_MON_GET_MAP: return "mon_get_map";
48 case CEPH_MSG_MON_SUBSCRIBE: return "mon_subscribe";
49 case CEPH_MSG_MON_SUBSCRIBE_ACK: return "mon_subscribe_ack";
50 case CEPH_MSG_STATFS: return "statfs";
51 case CEPH_MSG_STATFS_REPLY: return "statfs_reply";
52 case CEPH_MSG_MDS_MAP: return "mds_map";
53 case CEPH_MSG_CLIENT_SESSION: return "client_session";
54 case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect";
55 case CEPH_MSG_CLIENT_REQUEST: return "client_request";
56 case CEPH_MSG_CLIENT_REQUEST_FORWARD: return "client_request_forward";
57 case CEPH_MSG_CLIENT_REPLY: return "client_reply";
58 case CEPH_MSG_CLIENT_CAPS: return "client_caps";
59 case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release";
60 case CEPH_MSG_CLIENT_SNAP: return "client_snap";
61 case CEPH_MSG_CLIENT_LEASE: return "client_lease";
62 case CEPH_MSG_OSD_MAP: return "osd_map";
63 case CEPH_MSG_OSD_OP: return "osd_op";
64 case CEPH_MSG_OSD_OPREPLY: return "osd_opreply";
65 default: return "unknown";
66 }
67}
68EXPORT_SYMBOL(ceph_msg_type_name);
69
70/*
71 * Initially learn our fsid, or verify an fsid matches.
72 */
73int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid)
74{
75 if (client->have_fsid) {
76 if (ceph_fsid_compare(&client->fsid, fsid)) {
77 pr_err("bad fsid, had %pU got %pU",
78 &client->fsid, fsid);
79 return -1;
80 }
81 } else {
82 pr_info("client%lld fsid %pU\n", ceph_client_id(client), fsid);
83 memcpy(&client->fsid, fsid, sizeof(*fsid));
84 ceph_debugfs_client_init(client);
85 client->have_fsid = true;
86 }
87 return 0;
88}
89EXPORT_SYMBOL(ceph_check_fsid);
90
/*
 * strcmp() that tolerates NULL arguments: two NULLs compare equal,
 * a non-NULL first argument against NULL gives -1, and NULL against a
 * non-NULL second argument gives 1.
 */
static int strcmp_null(const char *s1, const char *s2)
{
	if (s1 && s2)
		return strcmp(s1, s2);
	if (s1)
		return -1;
	return s2 ? 1 : 0;
}
101
102int ceph_compare_options(struct ceph_options *new_opt,
103 struct ceph_client *client)
104{
105 struct ceph_options *opt1 = new_opt;
106 struct ceph_options *opt2 = client->options;
107 int ofs = offsetof(struct ceph_options, mon_addr);
108 int i;
109 int ret;
110
111 ret = memcmp(opt1, opt2, ofs);
112 if (ret)
113 return ret;
114
115 ret = strcmp_null(opt1->name, opt2->name);
116 if (ret)
117 return ret;
118
119 ret = strcmp_null(opt1->secret, opt2->secret);
120 if (ret)
121 return ret;
122
123 /* any matching mon ip implies a match */
124 for (i = 0; i < opt1->num_mon; i++) {
125 if (ceph_monmap_contains(client->monc.monmap,
126 &opt1->mon_addr[i]))
127 return 0;
128 }
129 return -1;
130}
131EXPORT_SYMBOL(ceph_compare_options);
132
133
134static int parse_fsid(const char *str, struct ceph_fsid *fsid)
135{
136 int i = 0;
137 char tmp[3];
138 int err = -EINVAL;
139 int d;
140
141 dout("parse_fsid '%s'\n", str);
142 tmp[2] = 0;
143 while (*str && i < 16) {
144 if (ispunct(*str)) {
145 str++;
146 continue;
147 }
148 if (!isxdigit(str[0]) || !isxdigit(str[1]))
149 break;
150 tmp[0] = str[0];
151 tmp[1] = str[1];
152 if (sscanf(tmp, "%x", &d) < 1)
153 break;
154 fsid->fsid[i] = d & 0xff;
155 i++;
156 str += 2;
157 }
158
159 if (i == 16)
160 err = 0;
161 dout("parse_fsid ret %d got fsid %pU", err, fsid);
162 return err;
163}
164
/*
 * ceph options
 *
 * Token ids for opt_tokens.  The ordering is significant:
 * ceph_parse_options() treats every token below Opt_last_int as taking
 * an integer argument and every token between Opt_last_int and
 * Opt_last_string as taking a string argument; the remaining tokens
 * take no argument.
 */
enum {
	Opt_osdtimeout,
	Opt_osdkeepalivetimeout,
	Opt_mount_timeout,
	Opt_osd_idle_ttl,
	Opt_last_int,
	/* int args above */
	Opt_fsid,
	Opt_name,
	Opt_secret,
	Opt_ip,
	Opt_last_string,
	/* string args above */
	Opt_noshare,
	Opt_nocrc,
};
184
/*
 * Pattern table handed to match_token() by ceph_parse_options().
 * The final {-1, NULL} entry terminates the table.
 */
static match_table_t opt_tokens = {
	{Opt_osdtimeout, "osdtimeout=%d"},
	{Opt_osdkeepalivetimeout, "osdkeepalive=%d"},
	{Opt_mount_timeout, "mount_timeout=%d"},
	{Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
	/* int args above */
	{Opt_fsid, "fsid=%s"},
	{Opt_name, "name=%s"},
	{Opt_secret, "secret=%s"},
	{Opt_ip, "ip=%s"},
	/* string args above */
	{Opt_noshare, "noshare"},
	{Opt_nocrc, "nocrc"},
	{-1, NULL}
};
200
201void ceph_destroy_options(struct ceph_options *opt)
202{
203 dout("destroy_options %p\n", opt);
204 kfree(opt->name);
205 kfree(opt->secret);
206 kfree(opt);
207}
208EXPORT_SYMBOL(ceph_destroy_options);
209
210int ceph_parse_options(struct ceph_options **popt, char *options,
211 const char *dev_name, const char *dev_name_end,
212 int (*parse_extra_token)(char *c, void *private),
213 void *private)
214{
215 struct ceph_options *opt;
216 const char *c;
217 int err = -ENOMEM;
218 substring_t argstr[MAX_OPT_ARGS];
219
220 opt = kzalloc(sizeof(*opt), GFP_KERNEL);
221 if (!opt)
222 return err;
223 opt->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*opt->mon_addr),
224 GFP_KERNEL);
225 if (!opt->mon_addr)
226 goto out;
227
228 dout("parse_options %p options '%s' dev_name '%s'\n", opt, options,
229 dev_name);
230
231 /* start with defaults */
232 opt->flags = CEPH_OPT_DEFAULT;
233 opt->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT;
234 opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
235 opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */
236 opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; /* seconds */
237
238 /* get mon ip(s) */
239 /* ip1[:port1][,ip2[:port2]...] */
240 err = ceph_parse_ips(dev_name, dev_name_end, opt->mon_addr,
241 CEPH_MAX_MON, &opt->num_mon);
242 if (err < 0)
243 goto out;
244
245 /* parse mount options */
246 while ((c = strsep(&options, ",")) != NULL) {
247 int token, intval, ret;
248 if (!*c)
249 continue;
250 err = -EINVAL;
251 token = match_token((char *)c, opt_tokens, argstr);
252 if (token < 0 && parse_extra_token) {
253 /* extra? */
254 err = parse_extra_token((char *)c, private);
255 if (err < 0) {
256 pr_err("bad option at '%s'\n", c);
257 goto out;
258 }
259 continue;
260 }
261 if (token < Opt_last_int) {
262 ret = match_int(&argstr[0], &intval);
263 if (ret < 0) {
264 pr_err("bad mount option arg (not int) "
265 "at '%s'\n", c);
266 continue;
267 }
268 dout("got int token %d val %d\n", token, intval);
269 } else if (token > Opt_last_int && token < Opt_last_string) {
270 dout("got string token %d val %s\n", token,
271 argstr[0].from);
272 } else {
273 dout("got token %d\n", token);
274 }
275 switch (token) {
276 case Opt_ip:
277 err = ceph_parse_ips(argstr[0].from,
278 argstr[0].to,
279 &opt->my_addr,
280 1, NULL);
281 if (err < 0)
282 goto out;
283 opt->flags |= CEPH_OPT_MYIP;
284 break;
285
286 case Opt_fsid:
287 err = parse_fsid(argstr[0].from, &opt->fsid);
288 if (err == 0)
289 opt->flags |= CEPH_OPT_FSID;
290 break;
291 case Opt_name:
292 opt->name = kstrndup(argstr[0].from,
293 argstr[0].to-argstr[0].from,
294 GFP_KERNEL);
295 break;
296 case Opt_secret:
297 opt->secret = kstrndup(argstr[0].from,
298 argstr[0].to-argstr[0].from,
299 GFP_KERNEL);
300 break;
301
302 /* misc */
303 case Opt_osdtimeout:
304 opt->osd_timeout = intval;
305 break;
306 case Opt_osdkeepalivetimeout:
307 opt->osd_keepalive_timeout = intval;
308 break;
309 case Opt_osd_idle_ttl:
310 opt->osd_idle_ttl = intval;
311 break;
312 case Opt_mount_timeout:
313 opt->mount_timeout = intval;
314 break;
315
316 case Opt_noshare:
317 opt->flags |= CEPH_OPT_NOSHARE;
318 break;
319
320 case Opt_nocrc:
321 opt->flags |= CEPH_OPT_NOCRC;
322 break;
323
324 default:
325 BUG_ON(token);
326 }
327 }
328
329 /* success */
330 *popt = opt;
331 return 0;
332
333out:
334 ceph_destroy_options(opt);
335 return err;
336}
337EXPORT_SYMBOL(ceph_parse_options);
338
/*
 * Return the global_id stored in the auth state of the client's
 * monitor client.  monc.auth is dereferenced without a NULL check.
 */
u64 ceph_client_id(struct ceph_client *client)
{
	return client->monc.auth->global_id;
}
343EXPORT_SYMBOL(ceph_client_id);
344
345/*
346 * create a fresh client instance
347 */
348struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private)
349{
350 struct ceph_client *client;
351 int err = -ENOMEM;
352
353 client = kzalloc(sizeof(*client), GFP_KERNEL);
354 if (client == NULL)
355 return ERR_PTR(-ENOMEM);
356
357 client->private = private;
358 client->options = opt;
359
360 mutex_init(&client->mount_mutex);
361 init_waitqueue_head(&client->auth_wq);
362 client->auth_err = 0;
363
364 client->extra_mon_dispatch = NULL;
365 client->supported_features = CEPH_FEATURE_SUPPORTED_DEFAULT;
366 client->required_features = CEPH_FEATURE_REQUIRED_DEFAULT;
367
368 client->msgr = NULL;
369
370 /* subsystems */
371 err = ceph_monc_init(&client->monc, client);
372 if (err < 0)
373 goto fail;
374 err = ceph_osdc_init(&client->osdc, client);
375 if (err < 0)
376 goto fail_monc;
377
378 return client;
379
380fail_monc:
381 ceph_monc_stop(&client->monc);
382fail:
383 kfree(client);
384 return ERR_PTR(err);
385}
386EXPORT_SYMBOL(ceph_create_client);
387
/*
 * Tear down a client: stop the OSD client, flush the messenger, stop
 * the monitor client, remove the debugfs entries, destroy the messenger
 * if one was created, then free the options and the client itself.
 */
void ceph_destroy_client(struct ceph_client *client)
{
	dout("destroy_client %p\n", client);

	/* unmount */
	ceph_osdc_stop(&client->osdc);

	/*
	 * make sure mds and osd connections close out before destroying
	 * the auth module, which is needed to free those connections'
	 * ceph_authorizers.
	 */
	ceph_msgr_flush();

	ceph_monc_stop(&client->monc);

	ceph_debugfs_client_cleanup(client);

	/* msgr is only set once a session has been opened */
	if (client->msgr)
		ceph_messenger_destroy(client->msgr);

	ceph_destroy_options(client->options);

	kfree(client);
	/* only the pointer value is printed here; client is not dereferenced */
	dout("destroy_client %p done\n", client);
}
414EXPORT_SYMBOL(ceph_destroy_client);
415
416/*
417 * true if we have the mon map (and have thus joined the cluster)
418 */
419static int have_mon_and_osd_map(struct ceph_client *client)
420{
421 return client->monc.monmap && client->monc.monmap->epoch &&
422 client->osdc.osdmap && client->osdc.osdmap->epoch;
423}
424
425/*
426 * mount: join the ceph cluster, and open root directory.
427 */
428int __ceph_open_session(struct ceph_client *client, unsigned long started)
429{
430 struct ceph_entity_addr *myaddr = NULL;
431 int err;
432 unsigned long timeout = client->options->mount_timeout * HZ;
433
434 /* initialize the messenger */
435 if (client->msgr == NULL) {
436 if (ceph_test_opt(client, MYIP))
437 myaddr = &client->options->my_addr;
438 client->msgr = ceph_messenger_create(myaddr,
439 client->supported_features,
440 client->required_features);
441 if (IS_ERR(client->msgr)) {
442 client->msgr = NULL;
443 return PTR_ERR(client->msgr);
444 }
445 client->msgr->nocrc = ceph_test_opt(client, NOCRC);
446 }
447
448 /* open session, and wait for mon and osd maps */
449 err = ceph_monc_open_session(&client->monc);
450 if (err < 0)
451 return err;
452
453 while (!have_mon_and_osd_map(client)) {
454 err = -EIO;
455 if (timeout && time_after_eq(jiffies, started + timeout))
456 return err;
457
458 /* wait */
459 dout("mount waiting for mon_map\n");
460 err = wait_event_interruptible_timeout(client->auth_wq,
461 have_mon_and_osd_map(client) || (client->auth_err < 0),
462 timeout);
463 if (err == -EINTR || err == -ERESTARTSYS)
464 return err;
465 if (client->auth_err < 0)
466 return client->auth_err;
467 }
468
469 return 0;
470}
471EXPORT_SYMBOL(__ceph_open_session);
472
473
474int ceph_open_session(struct ceph_client *client)
475{
476 int ret;
477 unsigned long started = jiffies; /* note the start time */
478
479 dout("open_session start\n");
480 mutex_lock(&client->mount_mutex);
481
482 ret = __ceph_open_session(client, started);
483
484 mutex_unlock(&client->mount_mutex);
485 return ret;
486}
487EXPORT_SYMBOL(ceph_open_session);
488
489
490static int __init init_ceph_lib(void)
491{
492 int ret = 0;
493
494 ret = ceph_debugfs_init();
495 if (ret < 0)
496 goto out;
497
498 ret = ceph_msgr_init();
499 if (ret < 0)
500 goto out_debugfs;
501
502 pr_info("loaded (mon/osd proto %d/%d, osdmap %d/%d %d/%d)\n",
503 CEPH_MONC_PROTOCOL, CEPH_OSDC_PROTOCOL,
504 CEPH_OSDMAP_VERSION, CEPH_OSDMAP_VERSION_EXT,
505 CEPH_OSDMAP_INC_VERSION, CEPH_OSDMAP_INC_VERSION_EXT);
506
507 return 0;
508
509out_debugfs:
510 ceph_debugfs_cleanup();
511out:
512 return ret;
513}
514
/*
 * Module exit: undo init_ceph_lib() in reverse order (messenger first,
 * then debugfs).
 */
static void __exit exit_ceph_lib(void)
{
	dout("exit_ceph_lib\n");
	ceph_msgr_exit();
	ceph_debugfs_cleanup();
}
521
522module_init(init_ceph_lib);
523module_exit(exit_ceph_lib);
524
525MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
526MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
527MODULE_AUTHOR("Patience Warnick <patience@newdream.net>");
528MODULE_DESCRIPTION("Ceph filesystem for Linux");
529MODULE_LICENSE("GPL");
diff --git a/net/ceph/ceph_fs.c b/net/ceph/ceph_fs.c
new file mode 100644
index 000000000000..a3a3a31d3c37
--- /dev/null
+++ b/net/ceph/ceph_fs.c
@@ -0,0 +1,75 @@
1/*
2 * Some non-inline ceph helpers
3 */
4#include <linux/module.h>
5#include <linux/ceph/types.h>
6
7/*
8 * return true if @layout appears to be valid
9 */
10int ceph_file_layout_is_valid(const struct ceph_file_layout *layout)
11{
12 __u32 su = le32_to_cpu(layout->fl_stripe_unit);
13 __u32 sc = le32_to_cpu(layout->fl_stripe_count);
14 __u32 os = le32_to_cpu(layout->fl_object_size);
15
16 /* stripe unit, object size must be non-zero, 64k increment */
17 if (!su || (su & (CEPH_MIN_STRIPE_UNIT-1)))
18 return 0;
19 if (!os || (os & (CEPH_MIN_STRIPE_UNIT-1)))
20 return 0;
21 /* object size must be a multiple of stripe unit */
22 if (os < su || os % su)
23 return 0;
24 /* stripe count must be non-zero */
25 if (!sc)
26 return 0;
27 return 1;
28}
29
30
31int ceph_flags_to_mode(int flags)
32{
33 int mode;
34
35#ifdef O_DIRECTORY /* fixme */
36 if ((flags & O_DIRECTORY) == O_DIRECTORY)
37 return CEPH_FILE_MODE_PIN;
38#endif
39 if ((flags & O_APPEND) == O_APPEND)
40 flags |= O_WRONLY;
41
42 if ((flags & O_ACCMODE) == O_RDWR)
43 mode = CEPH_FILE_MODE_RDWR;
44 else if ((flags & O_ACCMODE) == O_WRONLY)
45 mode = CEPH_FILE_MODE_WR;
46 else
47 mode = CEPH_FILE_MODE_RD;
48
49#ifdef O_LAZY
50 if (flags & O_LAZY)
51 mode |= CEPH_FILE_MODE_LAZY;
52#endif
53
54 return mode;
55}
56EXPORT_SYMBOL(ceph_flags_to_mode);
57
58int ceph_caps_for_mode(int mode)
59{
60 int caps = CEPH_CAP_PIN;
61
62 if (mode & CEPH_FILE_MODE_RD)
63 caps |= CEPH_CAP_FILE_SHARED |
64 CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE;
65 if (mode & CEPH_FILE_MODE_WR)
66 caps |= CEPH_CAP_FILE_EXCL |
67 CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
68 CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
69 CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
70 if (mode & CEPH_FILE_MODE_LAZY)
71 caps |= CEPH_CAP_FILE_LAZYIO;
72
73 return caps;
74}
75EXPORT_SYMBOL(ceph_caps_for_mode);
diff --git a/net/ceph/ceph_hash.c b/net/ceph/ceph_hash.c
new file mode 100644
index 000000000000..815ef8826796
--- /dev/null
+++ b/net/ceph/ceph_hash.c
@@ -0,0 +1,118 @@
1
2#include <linux/ceph/types.h>
3
/*
 * Robert Jenkins' hash function.
 * http://burtleburtle.net/bob/hash/evahash.html
 * This is in the public domain.
 */
#define mix(a, b, c)						\
	do {							\
		a -= b; a -= c; a ^= (c >> 13);			\
		b -= c; b -= a; b ^= (a << 8);			\
		c -= a; c -= b; c ^= (b >> 13);			\
		a -= b; a -= c; a ^= (c >> 12);			\
		b -= c; b -= a; b ^= (a << 16);			\
		c -= a; c -= b; c ^= (b >> 5);			\
		a -= b; a -= c; a ^= (c >> 3);			\
		b -= c; b -= a; b ^= (a << 10);			\
		c -= a; c -= b; c ^= (b >> 15);			\
	} while (0)

/*
 * Hash the @length bytes at @str.  The key is consumed in 12-byte
 * groups, each folded into the three state words as little-endian
 * 32-bit values; the remaining 0..11 bytes and the total length are
 * folded in before the final mix.
 */
unsigned ceph_str_hash_rjenkins(const char *str, unsigned length)
{
	const unsigned char *k = (const unsigned char *)str;
	__u32 a = 0x9e3779b9;	/* the golden ratio; an arbitrary value */
	__u32 b = 0x9e3779b9;
	__u32 c = 0;
	__u32 remaining;	/* how many key bytes still need mixing */

	/* handle most of the key */
	for (remaining = length; remaining >= 12; remaining -= 12, k += 12) {
		a += k[0] + ((__u32)k[1] << 8) + ((__u32)k[2] << 16) +
		     ((__u32)k[3] << 24);
		b += k[4] + ((__u32)k[5] << 8) + ((__u32)k[6] << 16) +
		     ((__u32)k[7] << 24);
		c += k[8] + ((__u32)k[9] << 8) + ((__u32)k[10] << 16) +
		     ((__u32)k[11] << 24);
		mix(a, b, c);
	}

	/* handle the last 11 bytes */
	c += length;
	switch (remaining) {
	case 11:
		c += (__u32)k[10] << 24;
		/* fall through */
	case 10:
		c += (__u32)k[9] << 16;
		/* fall through */
	case 9:
		c += (__u32)k[8] << 8;
		/* the first byte of c is reserved for the length */
		/* fall through */
	case 8:
		b += (__u32)k[7] << 24;
		/* fall through */
	case 7:
		b += (__u32)k[6] << 16;
		/* fall through */
	case 6:
		b += (__u32)k[5] << 8;
		/* fall through */
	case 5:
		b += k[4];
		/* fall through */
	case 4:
		a += (__u32)k[3] << 24;
		/* fall through */
	case 3:
		a += (__u32)k[2] << 16;
		/* fall through */
	case 2:
		a += (__u32)k[1] << 8;
		/* fall through */
	case 1:
		a += k[0];
		/* case 0: nothing left to add */
	}
	mix(a, b, c);

	return c;
}
79
/*
 * linux dcache hash
 *
 * Each byte contributes its nibble-swapped components (c << 4) and
 * (c >> 4) to the running value, which is then multiplied by 11.  The
 * result is truncated to unsigned on return.
 */
unsigned ceph_str_hash_linux(const char *str, unsigned length)
{
	unsigned long acc = 0;
	unsigned i;

	for (i = 0; i < length; i++) {
		unsigned char ch = str[i];

		acc += (ch << 4) + (ch >> 4);
		acc *= 11;
	}
	return acc;
}
94
95
96unsigned ceph_str_hash(int type, const char *s, unsigned len)
97{
98 switch (type) {
99 case CEPH_STR_HASH_LINUX:
100 return ceph_str_hash_linux(s, len);
101 case CEPH_STR_HASH_RJENKINS:
102 return ceph_str_hash_rjenkins(s, len);
103 default:
104 return -1;
105 }
106}
107
108const char *ceph_str_hash_name(int type)
109{
110 switch (type) {
111 case CEPH_STR_HASH_LINUX:
112 return "linux";
113 case CEPH_STR_HASH_RJENKINS:
114 return "rjenkins";
115 default:
116 return "unknown";
117 }
118}
diff --git a/net/ceph/ceph_strings.c b/net/ceph/ceph_strings.c
new file mode 100644
index 000000000000..3fbda04de29c
--- /dev/null
+++ b/net/ceph/ceph_strings.c
@@ -0,0 +1,84 @@
1/*
2 * Ceph string constants
3 */
4#include <linux/module.h>
5#include <linux/ceph/types.h>
6
7const char *ceph_entity_type_name(int type)
8{
9 switch (type) {
10 case CEPH_ENTITY_TYPE_MDS: return "mds";
11 case CEPH_ENTITY_TYPE_OSD: return "osd";
12 case CEPH_ENTITY_TYPE_MON: return "mon";
13 case CEPH_ENTITY_TYPE_CLIENT: return "client";
14 case CEPH_ENTITY_TYPE_AUTH: return "auth";
15 default: return "unknown";
16 }
17}
18
19const char *ceph_osd_op_name(int op)
20{
21 switch (op) {
22 case CEPH_OSD_OP_READ: return "read";
23 case CEPH_OSD_OP_STAT: return "stat";
24
25 case CEPH_OSD_OP_MASKTRUNC: return "masktrunc";
26
27 case CEPH_OSD_OP_WRITE: return "write";
28 case CEPH_OSD_OP_DELETE: return "delete";
29 case CEPH_OSD_OP_TRUNCATE: return "truncate";
30 case CEPH_OSD_OP_ZERO: return "zero";
31 case CEPH_OSD_OP_WRITEFULL: return "writefull";
32 case CEPH_OSD_OP_ROLLBACK: return "rollback";
33
34 case CEPH_OSD_OP_APPEND: return "append";
35 case CEPH_OSD_OP_STARTSYNC: return "startsync";
36 case CEPH_OSD_OP_SETTRUNC: return "settrunc";
37 case CEPH_OSD_OP_TRIMTRUNC: return "trimtrunc";
38
39 case CEPH_OSD_OP_TMAPUP: return "tmapup";
40 case CEPH_OSD_OP_TMAPGET: return "tmapget";
41 case CEPH_OSD_OP_TMAPPUT: return "tmapput";
42
43 case CEPH_OSD_OP_GETXATTR: return "getxattr";
44 case CEPH_OSD_OP_GETXATTRS: return "getxattrs";
45 case CEPH_OSD_OP_SETXATTR: return "setxattr";
46 case CEPH_OSD_OP_SETXATTRS: return "setxattrs";
47 case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs";
48 case CEPH_OSD_OP_RMXATTR: return "rmxattr";
49 case CEPH_OSD_OP_CMPXATTR: return "cmpxattr";
50
51 case CEPH_OSD_OP_PULL: return "pull";
52 case CEPH_OSD_OP_PUSH: return "push";
53 case CEPH_OSD_OP_BALANCEREADS: return "balance-reads";
54 case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads";
55 case CEPH_OSD_OP_SCRUB: return "scrub";
56
57 case CEPH_OSD_OP_WRLOCK: return "wrlock";
58 case CEPH_OSD_OP_WRUNLOCK: return "wrunlock";
59 case CEPH_OSD_OP_RDLOCK: return "rdlock";
60 case CEPH_OSD_OP_RDUNLOCK: return "rdunlock";
61 case CEPH_OSD_OP_UPLOCK: return "uplock";
62 case CEPH_OSD_OP_DNLOCK: return "dnlock";
63
64 case CEPH_OSD_OP_CALL: return "call";
65
66 case CEPH_OSD_OP_PGLS: return "pgls";
67 }
68 return "???";
69}
70
71
72const char *ceph_pool_op_name(int op)
73{
74 switch (op) {
75 case POOL_OP_CREATE: return "create";
76 case POOL_OP_DELETE: return "delete";
77 case POOL_OP_AUID_CHANGE: return "auid change";
78 case POOL_OP_CREATE_SNAP: return "create snap";
79 case POOL_OP_DELETE_SNAP: return "delete snap";
80 case POOL_OP_CREATE_UNMANAGED_SNAP: return "create unmanaged snap";
81 case POOL_OP_DELETE_UNMANAGED_SNAP: return "delete unmanaged snap";
82 }
83 return "???";
84}
diff --git a/net/ceph/crush/crush.c b/net/ceph/crush/crush.c
new file mode 100644
index 000000000000..d6ebb13a18a4
--- /dev/null
+++ b/net/ceph/crush/crush.c
@@ -0,0 +1,151 @@
1
2#ifdef __KERNEL__
3# include <linux/slab.h>
4#else
5# include <stdlib.h>
6# include <assert.h>
7# define kfree(x) do { if (x) free(x); } while (0)
8# define BUG_ON(x) assert(!(x))
9#endif
10
11#include <linux/crush/crush.h>
12
13const char *crush_bucket_alg_name(int alg)
14{
15 switch (alg) {
16 case CRUSH_BUCKET_UNIFORM: return "uniform";
17 case CRUSH_BUCKET_LIST: return "list";
18 case CRUSH_BUCKET_TREE: return "tree";
19 case CRUSH_BUCKET_STRAW: return "straw";
20 default: return "unknown";
21 }
22}
23
24/**
25 * crush_get_bucket_item_weight - Get weight of an item in given bucket
26 * @b: bucket pointer
27 * @p: item index in bucket
28 */
29int crush_get_bucket_item_weight(struct crush_bucket *b, int p)
30{
31 if (p >= b->size)
32 return 0;
33
34 switch (b->alg) {
35 case CRUSH_BUCKET_UNIFORM:
36 return ((struct crush_bucket_uniform *)b)->item_weight;
37 case CRUSH_BUCKET_LIST:
38 return ((struct crush_bucket_list *)b)->item_weights[p];
39 case CRUSH_BUCKET_TREE:
40 if (p & 1)
41 return ((struct crush_bucket_tree *)b)->node_weights[p];
42 return 0;
43 case CRUSH_BUCKET_STRAW:
44 return ((struct crush_bucket_straw *)b)->item_weights[p];
45 }
46 return 0;
47}
48
49/**
50 * crush_calc_parents - Calculate parent vectors for the given crush map.
51 * @map: crush_map pointer
52 */
53void crush_calc_parents(struct crush_map *map)
54{
55 int i, b, c;
56
57 for (b = 0; b < map->max_buckets; b++) {
58 if (map->buckets[b] == NULL)
59 continue;
60 for (i = 0; i < map->buckets[b]->size; i++) {
61 c = map->buckets[b]->items[i];
62 BUG_ON(c >= map->max_devices ||
63 c < -map->max_buckets);
64 if (c >= 0)
65 map->device_parents[c] = map->buckets[b]->id;
66 else
67 map->bucket_parents[-1-c] = map->buckets[b]->id;
68 }
69 }
70}
71
/* Free a uniform bucket: the common perm and items arrays, then the bucket. */
void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b)
{
	kfree(b->h.perm);
	kfree(b->h.items);
	kfree(b);
}
78
/*
 * Free a list bucket: its weight arrays, the common perm and items
 * arrays, then the bucket.
 */
void crush_destroy_bucket_list(struct crush_bucket_list *b)
{
	kfree(b->item_weights);
	kfree(b->sum_weights);
	kfree(b->h.perm);
	kfree(b->h.items);
	kfree(b);
}
87
88void crush_destroy_bucket_tree(struct crush_bucket_tree *b)
89{
90 kfree(b->node_weights);
91 kfree(b);
92}
93
/*
 * Free a straw bucket: its straws and item weights, the common perm and
 * items arrays, then the bucket.
 */
void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
{
	kfree(b->straws);
	kfree(b->item_weights);
	kfree(b->h.perm);
	kfree(b->h.items);
	kfree(b);
}
102
103void crush_destroy_bucket(struct crush_bucket *b)
104{
105 switch (b->alg) {
106 case CRUSH_BUCKET_UNIFORM:
107 crush_destroy_bucket_uniform((struct crush_bucket_uniform *)b);
108 break;
109 case CRUSH_BUCKET_LIST:
110 crush_destroy_bucket_list((struct crush_bucket_list *)b);
111 break;
112 case CRUSH_BUCKET_TREE:
113 crush_destroy_bucket_tree((struct crush_bucket_tree *)b);
114 break;
115 case CRUSH_BUCKET_STRAW:
116 crush_destroy_bucket_straw((struct crush_bucket_straw *)b);
117 break;
118 }
119}
120
121/**
122 * crush_destroy - Destroy a crush_map
123 * @map: crush_map pointer
124 */
125void crush_destroy(struct crush_map *map)
126{
127 int b;
128
129 /* buckets */
130 if (map->buckets) {
131 for (b = 0; b < map->max_buckets; b++) {
132 if (map->buckets[b] == NULL)
133 continue;
134 crush_destroy_bucket(map->buckets[b]);
135 }
136 kfree(map->buckets);
137 }
138
139 /* rules */
140 if (map->rules) {
141 for (b = 0; b < map->max_rules; b++)
142 kfree(map->rules[b]);
143 kfree(map->rules);
144 }
145
146 kfree(map->bucket_parents);
147 kfree(map->device_parents);
148 kfree(map);
149}
150
151
diff --git a/net/ceph/crush/hash.c b/net/ceph/crush/hash.c
new file mode 100644
index 000000000000..5bb63e37a8a1
--- /dev/null
+++ b/net/ceph/crush/hash.c
@@ -0,0 +1,149 @@
1
2#include <linux/types.h>
3#include <linux/crush/hash.h>
4
/*
 * Robert Jenkins' function for mixing 32-bit values
 * http://burtleburtle.net/bob/hash/evahash.html
 * a, b = random bits, c = input and output
 *
 * Note that all three arguments are modified.
 */
#define crush_hashmix(a, b, c) do {			\
		a -= b; a -= c; a ^= (c >> 13);		\
		b -= c; b -= a; b ^= (a << 8);		\
		c -= a; c -= b; c ^= (b >> 13);		\
		a -= b; a -= c; a ^= (c >> 12);		\
		b -= c; b -= a; b ^= (a << 16);		\
		c -= a; c -= b; c ^= (b >> 5);		\
		a -= b; a -= c; a ^= (c >> 3);		\
		b -= c; b -= a; b ^= (a << 10);		\
		c -= a; c -= b; c ^= (b >> 15);		\
	} while (0)

#define crush_hash_seed 1315423911

/* one-input variant: mix @a with two fixed constants */
static __u32 crush_hash32_rjenkins1(__u32 a)
{
	__u32 salt_x = 231232, salt_y = 1232;
	__u32 a_copy = a;
	__u32 hv = crush_hash_seed;

	hv ^= a;
	crush_hashmix(a_copy, salt_x, hv);
	crush_hashmix(salt_y, a, hv);
	return hv;
}
34
35static __u32 crush_hash32_rjenkins1_2(__u32 a, __u32 b)
36{
37 __u32 hash = crush_hash_seed ^ a ^ b;
38 __u32 x = 231232;
39 __u32 y = 1232;
40 crush_hashmix(a, b, hash);
41 crush_hashmix(x, a, hash);
42 crush_hashmix(b, y, hash);
43 return hash;
44}
45
46static __u32 crush_hash32_rjenkins1_3(__u32 a, __u32 b, __u32 c)
47{
48 __u32 hash = crush_hash_seed ^ a ^ b ^ c;
49 __u32 x = 231232;
50 __u32 y = 1232;
51 crush_hashmix(a, b, hash);
52 crush_hashmix(c, x, hash);
53 crush_hashmix(y, a, hash);
54 crush_hashmix(b, x, hash);
55 crush_hashmix(y, c, hash);
56 return hash;
57}
58
59static __u32 crush_hash32_rjenkins1_4(__u32 a, __u32 b, __u32 c, __u32 d)
60{
61 __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d;
62 __u32 x = 231232;
63 __u32 y = 1232;
64 crush_hashmix(a, b, hash);
65 crush_hashmix(c, d, hash);
66 crush_hashmix(a, x, hash);
67 crush_hashmix(y, b, hash);
68 crush_hashmix(c, x, hash);
69 crush_hashmix(y, d, hash);
70 return hash;
71}
72
73static __u32 crush_hash32_rjenkins1_5(__u32 a, __u32 b, __u32 c, __u32 d,
74 __u32 e)
75{
76 __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d ^ e;
77 __u32 x = 231232;
78 __u32 y = 1232;
79 crush_hashmix(a, b, hash);
80 crush_hashmix(c, d, hash);
81 crush_hashmix(e, x, hash);
82 crush_hashmix(y, a, hash);
83 crush_hashmix(b, x, hash);
84 crush_hashmix(y, c, hash);
85 crush_hashmix(d, x, hash);
86 crush_hashmix(y, e, hash);
87 return hash;
88}
89
90
91__u32 crush_hash32(int type, __u32 a)
92{
93 switch (type) {
94 case CRUSH_HASH_RJENKINS1:
95 return crush_hash32_rjenkins1(a);
96 default:
97 return 0;
98 }
99}
100
101__u32 crush_hash32_2(int type, __u32 a, __u32 b)
102{
103 switch (type) {
104 case CRUSH_HASH_RJENKINS1:
105 return crush_hash32_rjenkins1_2(a, b);
106 default:
107 return 0;
108 }
109}
110
111__u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c)
112{
113 switch (type) {
114 case CRUSH_HASH_RJENKINS1:
115 return crush_hash32_rjenkins1_3(a, b, c);
116 default:
117 return 0;
118 }
119}
120
121__u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d)
122{
123 switch (type) {
124 case CRUSH_HASH_RJENKINS1:
125 return crush_hash32_rjenkins1_4(a, b, c, d);
126 default:
127 return 0;
128 }
129}
130
131__u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d, __u32 e)
132{
133 switch (type) {
134 case CRUSH_HASH_RJENKINS1:
135 return crush_hash32_rjenkins1_5(a, b, c, d, e);
136 default:
137 return 0;
138 }
139}
140
141const char *crush_hash_name(int type)
142{
143 switch (type) {
144 case CRUSH_HASH_RJENKINS1:
145 return "rjenkins1";
146 default:
147 return "unknown";
148 }
149}
diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
new file mode 100644
index 000000000000..42599e31dcad
--- /dev/null
+++ b/net/ceph/crush/mapper.c
@@ -0,0 +1,609 @@
1
2#ifdef __KERNEL__
3# include <linux/string.h>
4# include <linux/slab.h>
5# include <linux/bug.h>
6# include <linux/kernel.h>
7# ifndef dprintk
8# define dprintk(args...)
9# endif
10#else
11# include <string.h>
12# include <stdio.h>
13# include <stdlib.h>
14# include <assert.h>
15# define BUG_ON(x) assert(!(x))
16# define dprintk(args...) /* printf(args) */
17# define kmalloc(x, f) malloc(x)
18# define kfree(x) free(x)
19#endif
20
21#include <linux/crush/crush.h>
22#include <linux/crush/hash.h>
23
24/*
25 * Implement the core CRUSH mapping algorithm.
26 */
27
28/**
29 * crush_find_rule - find a crush_rule id for a given ruleset, type, and size.
30 * @map: the crush_map
31 * @ruleset: the storage ruleset id (user defined)
32 * @type: storage ruleset type (user defined)
33 * @size: output set size
34 */
35int crush_find_rule(struct crush_map *map, int ruleset, int type, int size)
36{
37 int i;
38
39 for (i = 0; i < map->max_rules; i++) {
40 if (map->rules[i] &&
41 map->rules[i]->mask.ruleset == ruleset &&
42 map->rules[i]->mask.type == type &&
43 map->rules[i]->mask.min_size <= size &&
44 map->rules[i]->mask.max_size >= size)
45 return i;
46 }
47 return -1;
48}
49
50
51/*
52 * bucket choose methods
53 *
54 * For each bucket algorithm, we have a "choose" method that, given a
55 * crush input @x and replica position (usually, position in output set) @r,
56 * will produce an item in the bucket.
57 */
58
59/*
60 * Choose based on a random permutation of the bucket.
61 *
62 * We used to use some prime number arithmetic to do this, but it
63 * wasn't very random, and had some other bad behaviors. Instead, we
64 * calculate an actual random permutation of the bucket members.
65 * Since this is expensive, we optimize for the r=0 case, which
66 * captures the vast majority of calls.
67 */
68static int bucket_perm_choose(struct crush_bucket *bucket,
69 int x, int r)
70{
71 unsigned pr = r % bucket->size;
72 unsigned i, s;
73
74 /* start a new permutation if @x has changed */
75 if (bucket->perm_x != x || bucket->perm_n == 0) {
76 dprintk("bucket %d new x=%d\n", bucket->id, x);
77 bucket->perm_x = x;
78
79 /* optimize common r=0 case */
80 if (pr == 0) {
81 s = crush_hash32_3(bucket->hash, x, bucket->id, 0) %
82 bucket->size;
83 bucket->perm[0] = s;
84 bucket->perm_n = 0xffff; /* magic value, see below */
85 goto out;
86 }
87
88 for (i = 0; i < bucket->size; i++)
89 bucket->perm[i] = i;
90 bucket->perm_n = 0;
91 } else if (bucket->perm_n == 0xffff) {
92 /* clean up after the r=0 case above */
93 for (i = 1; i < bucket->size; i++)
94 bucket->perm[i] = i;
95 bucket->perm[bucket->perm[0]] = 0;
96 bucket->perm_n = 1;
97 }
98
99 /* calculate permutation up to pr */
100 for (i = 0; i < bucket->perm_n; i++)
101 dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]);
102 while (bucket->perm_n <= pr) {
103 unsigned p = bucket->perm_n;
104 /* no point in swapping the final entry */
105 if (p < bucket->size - 1) {
106 i = crush_hash32_3(bucket->hash, x, bucket->id, p) %
107 (bucket->size - p);
108 if (i) {
109 unsigned t = bucket->perm[p + i];
110 bucket->perm[p + i] = bucket->perm[p];
111 bucket->perm[p] = t;
112 }
113 dprintk(" perm_choose swap %d with %d\n", p, p+i);
114 }
115 bucket->perm_n++;
116 }
117 for (i = 0; i < bucket->size; i++)
118 dprintk(" perm_choose %d: %d\n", i, bucket->perm[i]);
119
120 s = bucket->perm[pr];
121out:
122 dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id,
123 bucket->size, x, r, pr, s);
124 return bucket->items[s];
125}
126
127/* uniform */
128static int bucket_uniform_choose(struct crush_bucket_uniform *bucket,
129 int x, int r)
130{
131 return bucket_perm_choose(&bucket->h, x, r);
132}
133
134/* list */
135static int bucket_list_choose(struct crush_bucket_list *bucket,
136 int x, int r)
137{
138 int i;
139
140 for (i = bucket->h.size-1; i >= 0; i--) {
141 __u64 w = crush_hash32_4(bucket->h.hash,x, bucket->h.items[i],
142 r, bucket->h.id);
143 w &= 0xffff;
144 dprintk("list_choose i=%d x=%d r=%d item %d weight %x "
145 "sw %x rand %llx",
146 i, x, r, bucket->h.items[i], bucket->item_weights[i],
147 bucket->sum_weights[i], w);
148 w *= bucket->sum_weights[i];
149 w = w >> 16;
150 /*dprintk(" scaled %llx\n", w);*/
151 if (w < bucket->item_weights[i])
152 return bucket->h.items[i];
153 }
154
155 BUG_ON(1);
156 return 0;
157}
158
159
160/* (binary) tree */
/*
 * Number of trailing zero bits in @n, i.e. how many times it can be
 * halved before it becomes odd.  Does not terminate for n == 0.
 */
static int height(int n)
{
	int h;

	for (h = 0; !(n & 1); n >>= 1)
		h++;

	return h;
}
170
/*
 * Left child of tree node @x: step back by half of the node's
 * power-of-two span.  Only meaningful for non-terminal (even) nodes.
 */
static int left(int x)
{
	int half = 1 << (height(x) - 1);

	return x - half;
}
176
/*
 * Right child of tree node @x: step forward by half of the node's
 * power-of-two span.  Only meaningful for non-terminal (even) nodes.
 */
static int right(int x)
{
	int half = 1 << (height(x) - 1);

	return x + half;
}
182
/* A tree node is terminal when its low bit is set (odd node number). */
static int terminal(int x)
{
	return (x & 1) ? 1 : 0;
}
187
188static int bucket_tree_choose(struct crush_bucket_tree *bucket,
189 int x, int r)
190{
191 int n, l;
192 __u32 w;
193 __u64 t;
194
195 /* start at root */
196 n = bucket->num_nodes >> 1;
197
198 while (!terminal(n)) {
199 /* pick point in [0, w) */
200 w = bucket->node_weights[n];
201 t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r,
202 bucket->h.id) * (__u64)w;
203 t = t >> 32;
204
205 /* descend to the left or right? */
206 l = left(n);
207 if (t < bucket->node_weights[l])
208 n = l;
209 else
210 n = right(n);
211 }
212
213 return bucket->h.items[n >> 1];
214}
215
216
217/* straw */
218
219static int bucket_straw_choose(struct crush_bucket_straw *bucket,
220 int x, int r)
221{
222 int i;
223 int high = 0;
224 __u64 high_draw = 0;
225 __u64 draw;
226
227 for (i = 0; i < bucket->h.size; i++) {
228 draw = crush_hash32_3(bucket->h.hash, x, bucket->h.items[i], r);
229 draw &= 0xffff;
230 draw *= bucket->straws[i];
231 if (i == 0 || draw > high_draw) {
232 high = i;
233 high_draw = draw;
234 }
235 }
236 return bucket->h.items[high];
237}
238
239static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
240{
241 dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r);
242 switch (in->alg) {
243 case CRUSH_BUCKET_UNIFORM:
244 return bucket_uniform_choose((struct crush_bucket_uniform *)in,
245 x, r);
246 case CRUSH_BUCKET_LIST:
247 return bucket_list_choose((struct crush_bucket_list *)in,
248 x, r);
249 case CRUSH_BUCKET_TREE:
250 return bucket_tree_choose((struct crush_bucket_tree *)in,
251 x, r);
252 case CRUSH_BUCKET_STRAW:
253 return bucket_straw_choose((struct crush_bucket_straw *)in,
254 x, r);
255 default:
256 BUG_ON(1);
257 return in->items[0];
258 }
259}
260
261/*
262 * true if device is marked "out" (failed, fully offloaded)
263 * of the cluster
264 */
265static int is_out(struct crush_map *map, __u32 *weight, int item, int x)
266{
267 if (weight[item] >= 0x10000)
268 return 0;
269 if (weight[item] == 0)
270 return 1;
271 if ((crush_hash32_2(CRUSH_HASH_RJENKINS1, x, item) & 0xffff)
272 < weight[item])
273 return 0;
274 return 1;
275}
276
277/**
278 * crush_choose - choose numrep distinct items of given type
279 * @map: the crush_map
280 * @bucket: the bucket we are choose an item from
281 * @x: crush input value
282 * @numrep: the number of items to choose
283 * @type: the type of item to choose
284 * @out: pointer to output vector
285 * @outpos: our position in that vector
286 * @firstn: true if choosing "first n" items, false if choosing "indep"
287 * @recurse_to_leaf: true if we want one device under each item of given type
288 * @out2: second output vector for leaf items (if @recurse_to_leaf)
289 */
290static int crush_choose(struct crush_map *map,
291 struct crush_bucket *bucket,
292 __u32 *weight,
293 int x, int numrep, int type,
294 int *out, int outpos,
295 int firstn, int recurse_to_leaf,
296 int *out2)
297{
298 int rep;
299 int ftotal, flocal;
300 int retry_descent, retry_bucket, skip_rep;
301 struct crush_bucket *in = bucket;
302 int r;
303 int i;
304 int item = 0;
305 int itemtype;
306 int collide, reject;
307 const int orig_tries = 5; /* attempts before we fall back to search */
308
309 dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "",
310 bucket->id, x, outpos, numrep);
311
312 for (rep = outpos; rep < numrep; rep++) {
313 /* keep trying until we get a non-out, non-colliding item */
314 ftotal = 0;
315 skip_rep = 0;
316 do {
317 retry_descent = 0;
318 in = bucket; /* initial bucket */
319
320 /* choose through intervening buckets */
321 flocal = 0;
322 do {
323 collide = 0;
324 retry_bucket = 0;
325 r = rep;
326 if (in->alg == CRUSH_BUCKET_UNIFORM) {
327 /* be careful */
328 if (firstn || numrep >= in->size)
329 /* r' = r + f_total */
330 r += ftotal;
331 else if (in->size % numrep == 0)
332 /* r'=r+(n+1)*f_local */
333 r += (numrep+1) *
334 (flocal+ftotal);
335 else
336 /* r' = r + n*f_local */
337 r += numrep * (flocal+ftotal);
338 } else {
339 if (firstn)
340 /* r' = r + f_total */
341 r += ftotal;
342 else
343 /* r' = r + n*f_local */
344 r += numrep * (flocal+ftotal);
345 }
346
347 /* bucket choose */
348 if (in->size == 0) {
349 reject = 1;
350 goto reject;
351 }
352 if (flocal >= (in->size>>1) &&
353 flocal > orig_tries)
354 item = bucket_perm_choose(in, x, r);
355 else
356 item = crush_bucket_choose(in, x, r);
357 BUG_ON(item >= map->max_devices);
358
359 /* desired type? */
360 if (item < 0)
361 itemtype = map->buckets[-1-item]->type;
362 else
363 itemtype = 0;
364 dprintk(" item %d type %d\n", item, itemtype);
365
366 /* keep going? */
367 if (itemtype != type) {
368 BUG_ON(item >= 0 ||
369 (-1-item) >= map->max_buckets);
370 in = map->buckets[-1-item];
371 retry_bucket = 1;
372 continue;
373 }
374
375 /* collision? */
376 for (i = 0; i < outpos; i++) {
377 if (out[i] == item) {
378 collide = 1;
379 break;
380 }
381 }
382
383 reject = 0;
384 if (recurse_to_leaf) {
385 if (item < 0) {
386 if (crush_choose(map,
387 map->buckets[-1-item],
388 weight,
389 x, outpos+1, 0,
390 out2, outpos,
391 firstn, 0,
392 NULL) <= outpos)
393 /* didn't get leaf */
394 reject = 1;
395 } else {
396 /* we already have a leaf! */
397 out2[outpos] = item;
398 }
399 }
400
401 if (!reject) {
402 /* out? */
403 if (itemtype == 0)
404 reject = is_out(map, weight,
405 item, x);
406 else
407 reject = 0;
408 }
409
410reject:
411 if (reject || collide) {
412 ftotal++;
413 flocal++;
414
415 if (collide && flocal < 3)
416 /* retry locally a few times */
417 retry_bucket = 1;
418 else if (flocal < in->size + orig_tries)
419 /* exhaustive bucket search */
420 retry_bucket = 1;
421 else if (ftotal < 20)
422 /* then retry descent */
423 retry_descent = 1;
424 else
425 /* else give up */
426 skip_rep = 1;
427 dprintk(" reject %d collide %d "
428 "ftotal %d flocal %d\n",
429 reject, collide, ftotal,
430 flocal);
431 }
432 } while (retry_bucket);
433 } while (retry_descent);
434
435 if (skip_rep) {
436 dprintk("skip rep\n");
437 continue;
438 }
439
440 dprintk("CHOOSE got %d\n", item);
441 out[outpos] = item;
442 outpos++;
443 }
444
445 dprintk("CHOOSE returns %d\n", outpos);
446 return outpos;
447}
448
449
450/**
451 * crush_do_rule - calculate a mapping with the given input and rule
452 * @map: the crush_map
453 * @ruleno: the rule id
454 * @x: hash input
455 * @result: pointer to result vector
456 * @result_max: maximum result size
457 * @force: force initial replica choice; -1 for none
458 */
459int crush_do_rule(struct crush_map *map,
460 int ruleno, int x, int *result, int result_max,
461 int force, __u32 *weight)
462{
463 int result_len;
464 int force_context[CRUSH_MAX_DEPTH];
465 int force_pos = -1;
466 int a[CRUSH_MAX_SET];
467 int b[CRUSH_MAX_SET];
468 int c[CRUSH_MAX_SET];
469 int recurse_to_leaf;
470 int *w;
471 int wsize = 0;
472 int *o;
473 int osize;
474 int *tmp;
475 struct crush_rule *rule;
476 int step;
477 int i, j;
478 int numrep;
479 int firstn;
480 int rc = -1;
481
482 BUG_ON(ruleno >= map->max_rules);
483
484 rule = map->rules[ruleno];
485 result_len = 0;
486 w = a;
487 o = b;
488
489 /*
490 * determine hierarchical context of force, if any. note
491 * that this may or may not correspond to the specific types
492 * referenced by the crush rule.
493 */
494 if (force >= 0) {
495 if (force >= map->max_devices ||
496 map->device_parents[force] == 0) {
497 /*dprintk("CRUSH: forcefed device dne\n");*/
498 rc = -1; /* force fed device dne */
499 goto out;
500 }
501 if (!is_out(map, weight, force, x)) {
502 while (1) {
503 force_context[++force_pos] = force;
504 if (force >= 0)
505 force = map->device_parents[force];
506 else
507 force = map->bucket_parents[-1-force];
508 if (force == 0)
509 break;
510 }
511 }
512 }
513
514 for (step = 0; step < rule->len; step++) {
515 firstn = 0;
516 switch (rule->steps[step].op) {
517 case CRUSH_RULE_TAKE:
518 w[0] = rule->steps[step].arg1;
519 if (force_pos >= 0) {
520 BUG_ON(force_context[force_pos] != w[0]);
521 force_pos--;
522 }
523 wsize = 1;
524 break;
525
526 case CRUSH_RULE_CHOOSE_LEAF_FIRSTN:
527 case CRUSH_RULE_CHOOSE_FIRSTN:
528 firstn = 1;
529 case CRUSH_RULE_CHOOSE_LEAF_INDEP:
530 case CRUSH_RULE_CHOOSE_INDEP:
531 BUG_ON(wsize == 0);
532
533 recurse_to_leaf =
534 rule->steps[step].op ==
535 CRUSH_RULE_CHOOSE_LEAF_FIRSTN ||
536 rule->steps[step].op ==
537 CRUSH_RULE_CHOOSE_LEAF_INDEP;
538
539 /* reset output */
540 osize = 0;
541
542 for (i = 0; i < wsize; i++) {
543 /*
544 * see CRUSH_N, CRUSH_N_MINUS macros.
545 * basically, numrep <= 0 means relative to
546 * the provided result_max
547 */
548 numrep = rule->steps[step].arg1;
549 if (numrep <= 0) {
550 numrep += result_max;
551 if (numrep <= 0)
552 continue;
553 }
554 j = 0;
555 if (osize == 0 && force_pos >= 0) {
556 /* skip any intermediate types */
557 while (force_pos &&
558 force_context[force_pos] < 0 &&
559 rule->steps[step].arg2 !=
560 map->buckets[-1 -
561 force_context[force_pos]]->type)
562 force_pos--;
563 o[osize] = force_context[force_pos];
564 if (recurse_to_leaf)
565 c[osize] = force_context[0];
566 j++;
567 force_pos--;
568 }
569 osize += crush_choose(map,
570 map->buckets[-1-w[i]],
571 weight,
572 x, numrep,
573 rule->steps[step].arg2,
574 o+osize, j,
575 firstn,
576 recurse_to_leaf, c+osize);
577 }
578
579 if (recurse_to_leaf)
580 /* copy final _leaf_ values to output set */
581 memcpy(o, c, osize*sizeof(*o));
582
583 /* swap t and w arrays */
584 tmp = o;
585 o = w;
586 w = tmp;
587 wsize = osize;
588 break;
589
590
591 case CRUSH_RULE_EMIT:
592 for (i = 0; i < wsize && result_len < result_max; i++) {
593 result[result_len] = w[i];
594 result_len++;
595 }
596 wsize = 0;
597 break;
598
599 default:
600 BUG_ON(1);
601 }
602 }
603 rc = result_len;
604
605out:
606 return rc;
607}
608
609
diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c
new file mode 100644
index 000000000000..7b505b0c983f
--- /dev/null
+++ b/net/ceph/crypto.c
@@ -0,0 +1,412 @@
1
2#include <linux/ceph/ceph_debug.h>
3
4#include <linux/err.h>
5#include <linux/scatterlist.h>
6#include <linux/slab.h>
7#include <crypto/hash.h>
8
9#include <linux/ceph/decode.h>
10#include "crypto.h"
11
12int ceph_crypto_key_encode(struct ceph_crypto_key *key, void **p, void *end)
13{
14 if (*p + sizeof(u16) + sizeof(key->created) +
15 sizeof(u16) + key->len > end)
16 return -ERANGE;
17 ceph_encode_16(p, key->type);
18 ceph_encode_copy(p, &key->created, sizeof(key->created));
19 ceph_encode_16(p, key->len);
20 ceph_encode_copy(p, key->key, key->len);
21 return 0;
22}
23
24int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end)
25{
26 ceph_decode_need(p, end, 2*sizeof(u16) + sizeof(key->created), bad);
27 key->type = ceph_decode_16(p);
28 ceph_decode_copy(p, &key->created, sizeof(key->created));
29 key->len = ceph_decode_16(p);
30 ceph_decode_need(p, end, key->len, bad);
31 key->key = kmalloc(key->len, GFP_NOFS);
32 if (!key->key)
33 return -ENOMEM;
34 ceph_decode_copy(p, key->key, key->len);
35 return 0;
36
37bad:
38 dout("failed to decode crypto key\n");
39 return -EINVAL;
40}
41
42int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *inkey)
43{
44 int inlen = strlen(inkey);
45 int blen = inlen * 3 / 4;
46 void *buf, *p;
47 int ret;
48
49 dout("crypto_key_unarmor %s\n", inkey);
50 buf = kmalloc(blen, GFP_NOFS);
51 if (!buf)
52 return -ENOMEM;
53 blen = ceph_unarmor(buf, inkey, inkey+inlen);
54 if (blen < 0) {
55 kfree(buf);
56 return blen;
57 }
58
59 p = buf;
60 ret = ceph_crypto_key_decode(key, &p, p + blen);
61 kfree(buf);
62 if (ret)
63 return ret;
64 dout("crypto_key_unarmor key %p type %d len %d\n", key,
65 key->type, key->len);
66 return 0;
67}
68
69
70
71#define AES_KEY_SIZE 16
72
73static struct crypto_blkcipher *ceph_crypto_alloc_cipher(void)
74{
75 return crypto_alloc_blkcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC);
76}
77
78static const u8 *aes_iv = (u8 *)CEPH_AES_IV;
79
80static int ceph_aes_encrypt(const void *key, int key_len,
81 void *dst, size_t *dst_len,
82 const void *src, size_t src_len)
83{
84 struct scatterlist sg_in[2], sg_out[1];
85 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
86 struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
87 int ret;
88 void *iv;
89 int ivsize;
90 size_t zero_padding = (0x10 - (src_len & 0x0f));
91 char pad[16];
92
93 if (IS_ERR(tfm))
94 return PTR_ERR(tfm);
95
96 memset(pad, zero_padding, zero_padding);
97
98 *dst_len = src_len + zero_padding;
99
100 crypto_blkcipher_setkey((void *)tfm, key, key_len);
101 sg_init_table(sg_in, 2);
102 sg_set_buf(&sg_in[0], src, src_len);
103 sg_set_buf(&sg_in[1], pad, zero_padding);
104 sg_init_table(sg_out, 1);
105 sg_set_buf(sg_out, dst, *dst_len);
106 iv = crypto_blkcipher_crt(tfm)->iv;
107 ivsize = crypto_blkcipher_ivsize(tfm);
108
109 memcpy(iv, aes_iv, ivsize);
110 /*
111 print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1,
112 key, key_len, 1);
113 print_hex_dump(KERN_ERR, "enc src: ", DUMP_PREFIX_NONE, 16, 1,
114 src, src_len, 1);
115 print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1,
116 pad, zero_padding, 1);
117 */
118 ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in,
119 src_len + zero_padding);
120 crypto_free_blkcipher(tfm);
121 if (ret < 0)
122 pr_err("ceph_aes_crypt failed %d\n", ret);
123 /*
124 print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1,
125 dst, *dst_len, 1);
126 */
127 return 0;
128}
129
130static int ceph_aes_encrypt2(const void *key, int key_len, void *dst,
131 size_t *dst_len,
132 const void *src1, size_t src1_len,
133 const void *src2, size_t src2_len)
134{
135 struct scatterlist sg_in[3], sg_out[1];
136 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
137 struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
138 int ret;
139 void *iv;
140 int ivsize;
141 size_t zero_padding = (0x10 - ((src1_len + src2_len) & 0x0f));
142 char pad[16];
143
144 if (IS_ERR(tfm))
145 return PTR_ERR(tfm);
146
147 memset(pad, zero_padding, zero_padding);
148
149 *dst_len = src1_len + src2_len + zero_padding;
150
151 crypto_blkcipher_setkey((void *)tfm, key, key_len);
152 sg_init_table(sg_in, 3);
153 sg_set_buf(&sg_in[0], src1, src1_len);
154 sg_set_buf(&sg_in[1], src2, src2_len);
155 sg_set_buf(&sg_in[2], pad, zero_padding);
156 sg_init_table(sg_out, 1);
157 sg_set_buf(sg_out, dst, *dst_len);
158 iv = crypto_blkcipher_crt(tfm)->iv;
159 ivsize = crypto_blkcipher_ivsize(tfm);
160
161 memcpy(iv, aes_iv, ivsize);
162 /*
163 print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1,
164 key, key_len, 1);
165 print_hex_dump(KERN_ERR, "enc src1: ", DUMP_PREFIX_NONE, 16, 1,
166 src1, src1_len, 1);
167 print_hex_dump(KERN_ERR, "enc src2: ", DUMP_PREFIX_NONE, 16, 1,
168 src2, src2_len, 1);
169 print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1,
170 pad, zero_padding, 1);
171 */
172 ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in,
173 src1_len + src2_len + zero_padding);
174 crypto_free_blkcipher(tfm);
175 if (ret < 0)
176 pr_err("ceph_aes_crypt2 failed %d\n", ret);
177 /*
178 print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1,
179 dst, *dst_len, 1);
180 */
181 return 0;
182}
183
184static int ceph_aes_decrypt(const void *key, int key_len,
185 void *dst, size_t *dst_len,
186 const void *src, size_t src_len)
187{
188 struct scatterlist sg_in[1], sg_out[2];
189 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
190 struct blkcipher_desc desc = { .tfm = tfm };
191 char pad[16];
192 void *iv;
193 int ivsize;
194 int ret;
195 int last_byte;
196
197 if (IS_ERR(tfm))
198 return PTR_ERR(tfm);
199
200 crypto_blkcipher_setkey((void *)tfm, key, key_len);
201 sg_init_table(sg_in, 1);
202 sg_init_table(sg_out, 2);
203 sg_set_buf(sg_in, src, src_len);
204 sg_set_buf(&sg_out[0], dst, *dst_len);
205 sg_set_buf(&sg_out[1], pad, sizeof(pad));
206
207 iv = crypto_blkcipher_crt(tfm)->iv;
208 ivsize = crypto_blkcipher_ivsize(tfm);
209
210 memcpy(iv, aes_iv, ivsize);
211
212 /*
213 print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1,
214 key, key_len, 1);
215 print_hex_dump(KERN_ERR, "dec in: ", DUMP_PREFIX_NONE, 16, 1,
216 src, src_len, 1);
217 */
218
219 ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len);
220 crypto_free_blkcipher(tfm);
221 if (ret < 0) {
222 pr_err("ceph_aes_decrypt failed %d\n", ret);
223 return ret;
224 }
225
226 if (src_len <= *dst_len)
227 last_byte = ((char *)dst)[src_len - 1];
228 else
229 last_byte = pad[src_len - *dst_len - 1];
230 if (last_byte <= 16 && src_len >= last_byte) {
231 *dst_len = src_len - last_byte;
232 } else {
233 pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n",
234 last_byte, (int)src_len);
235 return -EPERM; /* bad padding */
236 }
237 /*
238 print_hex_dump(KERN_ERR, "dec out: ", DUMP_PREFIX_NONE, 16, 1,
239 dst, *dst_len, 1);
240 */
241 return 0;
242}
243
244static int ceph_aes_decrypt2(const void *key, int key_len,
245 void *dst1, size_t *dst1_len,
246 void *dst2, size_t *dst2_len,
247 const void *src, size_t src_len)
248{
249 struct scatterlist sg_in[1], sg_out[3];
250 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
251 struct blkcipher_desc desc = { .tfm = tfm };
252 char pad[16];
253 void *iv;
254 int ivsize;
255 int ret;
256 int last_byte;
257
258 if (IS_ERR(tfm))
259 return PTR_ERR(tfm);
260
261 sg_init_table(sg_in, 1);
262 sg_set_buf(sg_in, src, src_len);
263 sg_init_table(sg_out, 3);
264 sg_set_buf(&sg_out[0], dst1, *dst1_len);
265 sg_set_buf(&sg_out[1], dst2, *dst2_len);
266 sg_set_buf(&sg_out[2], pad, sizeof(pad));
267
268 crypto_blkcipher_setkey((void *)tfm, key, key_len);
269 iv = crypto_blkcipher_crt(tfm)->iv;
270 ivsize = crypto_blkcipher_ivsize(tfm);
271
272 memcpy(iv, aes_iv, ivsize);
273
274 /*
275 print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1,
276 key, key_len, 1);
277 print_hex_dump(KERN_ERR, "dec in: ", DUMP_PREFIX_NONE, 16, 1,
278 src, src_len, 1);
279 */
280
281 ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len);
282 crypto_free_blkcipher(tfm);
283 if (ret < 0) {
284 pr_err("ceph_aes_decrypt failed %d\n", ret);
285 return ret;
286 }
287
288 if (src_len <= *dst1_len)
289 last_byte = ((char *)dst1)[src_len - 1];
290 else if (src_len <= *dst1_len + *dst2_len)
291 last_byte = ((char *)dst2)[src_len - *dst1_len - 1];
292 else
293 last_byte = pad[src_len - *dst1_len - *dst2_len - 1];
294 if (last_byte <= 16 && src_len >= last_byte) {
295 src_len -= last_byte;
296 } else {
297 pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n",
298 last_byte, (int)src_len);
299 return -EPERM; /* bad padding */
300 }
301
302 if (src_len < *dst1_len) {
303 *dst1_len = src_len;
304 *dst2_len = 0;
305 } else {
306 *dst2_len = src_len - *dst1_len;
307 }
308 /*
309 print_hex_dump(KERN_ERR, "dec out1: ", DUMP_PREFIX_NONE, 16, 1,
310 dst1, *dst1_len, 1);
311 print_hex_dump(KERN_ERR, "dec out2: ", DUMP_PREFIX_NONE, 16, 1,
312 dst2, *dst2_len, 1);
313 */
314
315 return 0;
316}
317
318
319int ceph_decrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
320 const void *src, size_t src_len)
321{
322 switch (secret->type) {
323 case CEPH_CRYPTO_NONE:
324 if (*dst_len < src_len)
325 return -ERANGE;
326 memcpy(dst, src, src_len);
327 *dst_len = src_len;
328 return 0;
329
330 case CEPH_CRYPTO_AES:
331 return ceph_aes_decrypt(secret->key, secret->len, dst,
332 dst_len, src, src_len);
333
334 default:
335 return -EINVAL;
336 }
337}
338
339int ceph_decrypt2(struct ceph_crypto_key *secret,
340 void *dst1, size_t *dst1_len,
341 void *dst2, size_t *dst2_len,
342 const void *src, size_t src_len)
343{
344 size_t t;
345
346 switch (secret->type) {
347 case CEPH_CRYPTO_NONE:
348 if (*dst1_len + *dst2_len < src_len)
349 return -ERANGE;
350 t = min(*dst1_len, src_len);
351 memcpy(dst1, src, t);
352 *dst1_len = t;
353 src += t;
354 src_len -= t;
355 if (src_len) {
356 t = min(*dst2_len, src_len);
357 memcpy(dst2, src, t);
358 *dst2_len = t;
359 }
360 return 0;
361
362 case CEPH_CRYPTO_AES:
363 return ceph_aes_decrypt2(secret->key, secret->len,
364 dst1, dst1_len, dst2, dst2_len,
365 src, src_len);
366
367 default:
368 return -EINVAL;
369 }
370}
371
372int ceph_encrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
373 const void *src, size_t src_len)
374{
375 switch (secret->type) {
376 case CEPH_CRYPTO_NONE:
377 if (*dst_len < src_len)
378 return -ERANGE;
379 memcpy(dst, src, src_len);
380 *dst_len = src_len;
381 return 0;
382
383 case CEPH_CRYPTO_AES:
384 return ceph_aes_encrypt(secret->key, secret->len, dst,
385 dst_len, src, src_len);
386
387 default:
388 return -EINVAL;
389 }
390}
391
392int ceph_encrypt2(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
393 const void *src1, size_t src1_len,
394 const void *src2, size_t src2_len)
395{
396 switch (secret->type) {
397 case CEPH_CRYPTO_NONE:
398 if (*dst_len < src1_len + src2_len)
399 return -ERANGE;
400 memcpy(dst, src1, src1_len);
401 memcpy(dst + src1_len, src2, src2_len);
402 *dst_len = src1_len + src2_len;
403 return 0;
404
405 case CEPH_CRYPTO_AES:
406 return ceph_aes_encrypt2(secret->key, secret->len, dst, dst_len,
407 src1, src1_len, src2, src2_len);
408
409 default:
410 return -EINVAL;
411 }
412}
diff --git a/net/ceph/crypto.h b/net/ceph/crypto.h
new file mode 100644
index 000000000000..f9eccace592b
--- /dev/null
+++ b/net/ceph/crypto.h
@@ -0,0 +1,48 @@
1#ifndef _FS_CEPH_CRYPTO_H
2#define _FS_CEPH_CRYPTO_H
3
4#include <linux/ceph/types.h>
5#include <linux/ceph/buffer.h>
6
7/*
8 * cryptographic secret
9 */
10struct ceph_crypto_key {
11 int type;
12 struct ceph_timespec created;
13 int len;
14 void *key;
15};
16
17static inline void ceph_crypto_key_destroy(struct ceph_crypto_key *key)
18{
19 kfree(key->key);
20}
21
22extern int ceph_crypto_key_encode(struct ceph_crypto_key *key,
23 void **p, void *end);
24extern int ceph_crypto_key_decode(struct ceph_crypto_key *key,
25 void **p, void *end);
26extern int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *in);
27
28/* crypto.c */
29extern int ceph_decrypt(struct ceph_crypto_key *secret,
30 void *dst, size_t *dst_len,
31 const void *src, size_t src_len);
32extern int ceph_encrypt(struct ceph_crypto_key *secret,
33 void *dst, size_t *dst_len,
34 const void *src, size_t src_len);
35extern int ceph_decrypt2(struct ceph_crypto_key *secret,
36 void *dst1, size_t *dst1_len,
37 void *dst2, size_t *dst2_len,
38 const void *src, size_t src_len);
39extern int ceph_encrypt2(struct ceph_crypto_key *secret,
40 void *dst, size_t *dst_len,
41 const void *src1, size_t src1_len,
42 const void *src2, size_t src2_len);
43
44/* armor.c */
45extern int ceph_armor(char *dst, const char *src, const char *end);
46extern int ceph_unarmor(char *dst, const char *src, const char *end);
47
48#endif
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
new file mode 100644
index 000000000000..27d4ea315d12
--- /dev/null
+++ b/net/ceph/debugfs.c
@@ -0,0 +1,267 @@
1#include <linux/ceph/ceph_debug.h>
2
3#include <linux/device.h>
4#include <linux/slab.h>
5#include <linux/module.h>
6#include <linux/ctype.h>
7#include <linux/debugfs.h>
8#include <linux/seq_file.h>
9
10#include <linux/ceph/libceph.h>
11#include <linux/ceph/mon_client.h>
12#include <linux/ceph/auth.h>
13#include <linux/ceph/debugfs.h>
14
15#ifdef CONFIG_DEBUG_FS
16
17/*
18 * Implement /sys/kernel/debug/ceph fun
19 *
20 * /sys/kernel/debug/ceph/client* - an instance of the ceph client
21 * .../osdmap - current osdmap
22 * .../monmap - current monmap
23 * .../osdc - active osd requests
24 * .../monc - mon client state
25 * .../dentry_lru - dump contents of dentry lru
26 * .../caps - expose cap (reservation) stats
27 * .../bdi - symlink to ../../bdi/something
28 */
29
30static struct dentry *ceph_debugfs_dir;
31
32static int monmap_show(struct seq_file *s, void *p)
33{
34 int i;
35 struct ceph_client *client = s->private;
36
37 if (client->monc.monmap == NULL)
38 return 0;
39
40 seq_printf(s, "epoch %d\n", client->monc.monmap->epoch);
41 for (i = 0; i < client->monc.monmap->num_mon; i++) {
42 struct ceph_entity_inst *inst =
43 &client->monc.monmap->mon_inst[i];
44
45 seq_printf(s, "\t%s%lld\t%s\n",
46 ENTITY_NAME(inst->name),
47 ceph_pr_addr(&inst->addr.in_addr));
48 }
49 return 0;
50}
51
52static int osdmap_show(struct seq_file *s, void *p)
53{
54 int i;
55 struct ceph_client *client = s->private;
56 struct rb_node *n;
57
58 if (client->osdc.osdmap == NULL)
59 return 0;
60 seq_printf(s, "epoch %d\n", client->osdc.osdmap->epoch);
61 seq_printf(s, "flags%s%s\n",
62 (client->osdc.osdmap->flags & CEPH_OSDMAP_NEARFULL) ?
63 " NEARFULL" : "",
64 (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ?
65 " FULL" : "");
66 for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) {
67 struct ceph_pg_pool_info *pool =
68 rb_entry(n, struct ceph_pg_pool_info, node);
69 seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n",
70 pool->id, pool->v.pg_num, pool->pg_num_mask,
71 pool->v.lpg_num, pool->lpg_num_mask);
72 }
73 for (i = 0; i < client->osdc.osdmap->max_osd; i++) {
74 struct ceph_entity_addr *addr =
75 &client->osdc.osdmap->osd_addr[i];
76 int state = client->osdc.osdmap->osd_state[i];
77 char sb[64];
78
79 seq_printf(s, "\tosd%d\t%s\t%3d%%\t(%s)\n",
80 i, ceph_pr_addr(&addr->in_addr),
81 ((client->osdc.osdmap->osd_weight[i]*100) >> 16),
82 ceph_osdmap_state_str(sb, sizeof(sb), state));
83 }
84 return 0;
85}
86
87static int monc_show(struct seq_file *s, void *p)
88{
89 struct ceph_client *client = s->private;
90 struct ceph_mon_generic_request *req;
91 struct ceph_mon_client *monc = &client->monc;
92 struct rb_node *rp;
93
94 mutex_lock(&monc->mutex);
95
96 if (monc->have_mdsmap)
97 seq_printf(s, "have mdsmap %u\n", (unsigned)monc->have_mdsmap);
98 if (monc->have_osdmap)
99 seq_printf(s, "have osdmap %u\n", (unsigned)monc->have_osdmap);
100 if (monc->want_next_osdmap)
101 seq_printf(s, "want next osdmap\n");
102
103 for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) {
104 __u16 op;
105 req = rb_entry(rp, struct ceph_mon_generic_request, node);
106 op = le16_to_cpu(req->request->hdr.type);
107 if (op == CEPH_MSG_STATFS)
108 seq_printf(s, "%lld statfs\n", req->tid);
109 else
110 seq_printf(s, "%lld unknown\n", req->tid);
111 }
112
113 mutex_unlock(&monc->mutex);
114 return 0;
115}
116
117static int osdc_show(struct seq_file *s, void *pp)
118{
119 struct ceph_client *client = s->private;
120 struct ceph_osd_client *osdc = &client->osdc;
121 struct rb_node *p;
122
123 mutex_lock(&osdc->request_mutex);
124 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
125 struct ceph_osd_request *req;
126 struct ceph_osd_request_head *head;
127 struct ceph_osd_op *op;
128 int num_ops;
129 int opcode, olen;
130 int i;
131
132 req = rb_entry(p, struct ceph_osd_request, r_node);
133
134 seq_printf(s, "%lld\tosd%d\t%d.%x\t", req->r_tid,
135 req->r_osd ? req->r_osd->o_osd : -1,
136 le32_to_cpu(req->r_pgid.pool),
137 le16_to_cpu(req->r_pgid.ps));
138
139 head = req->r_request->front.iov_base;
140 op = (void *)(head + 1);
141
142 num_ops = le16_to_cpu(head->num_ops);
143 olen = le32_to_cpu(head->object_len);
144 seq_printf(s, "%.*s", olen,
145 (const char *)(head->ops + num_ops));
146
147 if (req->r_reassert_version.epoch)
148 seq_printf(s, "\t%u'%llu",
149 (unsigned)le32_to_cpu(req->r_reassert_version.epoch),
150 le64_to_cpu(req->r_reassert_version.version));
151 else
152 seq_printf(s, "\t");
153
154 for (i = 0; i < num_ops; i++) {
155 opcode = le16_to_cpu(op->op);
156 seq_printf(s, "\t%s", ceph_osd_op_name(opcode));
157 op++;
158 }
159
160 seq_printf(s, "\n");
161 }
162 mutex_unlock(&osdc->request_mutex);
163 return 0;
164}
165
166CEPH_DEFINE_SHOW_FUNC(monmap_show)
167CEPH_DEFINE_SHOW_FUNC(osdmap_show)
168CEPH_DEFINE_SHOW_FUNC(monc_show)
169CEPH_DEFINE_SHOW_FUNC(osdc_show)
170
171int ceph_debugfs_init(void)
172{
173 ceph_debugfs_dir = debugfs_create_dir("ceph", NULL);
174 if (!ceph_debugfs_dir)
175 return -ENOMEM;
176 return 0;
177}
178
179void ceph_debugfs_cleanup(void)
180{
181 debugfs_remove(ceph_debugfs_dir);
182}
183
184int ceph_debugfs_client_init(struct ceph_client *client)
185{
186 int ret = -ENOMEM;
187 char name[80];
188
189 snprintf(name, sizeof(name), "%pU.client%lld", &client->fsid,
190 client->monc.auth->global_id);
191
192 client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir);
193 if (!client->debugfs_dir)
194 goto out;
195
196 client->monc.debugfs_file = debugfs_create_file("monc",
197 0600,
198 client->debugfs_dir,
199 client,
200 &monc_show_fops);
201 if (!client->monc.debugfs_file)
202 goto out;
203
204 client->osdc.debugfs_file = debugfs_create_file("osdc",
205 0600,
206 client->debugfs_dir,
207 client,
208 &osdc_show_fops);
209 if (!client->osdc.debugfs_file)
210 goto out;
211
212 client->debugfs_monmap = debugfs_create_file("monmap",
213 0600,
214 client->debugfs_dir,
215 client,
216 &monmap_show_fops);
217 if (!client->debugfs_monmap)
218 goto out;
219
220 client->debugfs_osdmap = debugfs_create_file("osdmap",
221 0600,
222 client->debugfs_dir,
223 client,
224 &osdmap_show_fops);
225 if (!client->debugfs_osdmap)
226 goto out;
227
228 return 0;
229
230out:
231 ceph_debugfs_client_cleanup(client);
232 return ret;
233}
234
235void ceph_debugfs_client_cleanup(struct ceph_client *client)
236{
237 debugfs_remove(client->debugfs_osdmap);
238 debugfs_remove(client->debugfs_monmap);
239 debugfs_remove(client->osdc.debugfs_file);
240 debugfs_remove(client->monc.debugfs_file);
241 debugfs_remove(client->debugfs_dir);
242}
243
244#else /* CONFIG_DEBUG_FS */
245
/* Stub used when CONFIG_DEBUG_FS is off: nothing to set up. */
int ceph_debugfs_init(void)
{
	return 0;
}
250
/* Stub used when CONFIG_DEBUG_FS is off: nothing to tear down. */
void ceph_debugfs_cleanup(void)
{
}
254
/* Stub used when CONFIG_DEBUG_FS is off: always succeeds. */
int ceph_debugfs_client_init(struct ceph_client *client)
{
	return 0;
}
259
/* Stub used when CONFIG_DEBUG_FS is off: nothing to remove. */
void ceph_debugfs_client_cleanup(struct ceph_client *client)
{
}
263
264#endif /* CONFIG_DEBUG_FS */
265
266EXPORT_SYMBOL(ceph_debugfs_init);
267EXPORT_SYMBOL(ceph_debugfs_cleanup);
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
new file mode 100644
index 000000000000..0e8157ee5d43
--- /dev/null
+++ b/net/ceph/messenger.c
@@ -0,0 +1,2453 @@
1#include <linux/ceph/ceph_debug.h>
2
3#include <linux/crc32c.h>
4#include <linux/ctype.h>
5#include <linux/highmem.h>
6#include <linux/inet.h>
7#include <linux/kthread.h>
8#include <linux/net.h>
9#include <linux/slab.h>
10#include <linux/socket.h>
11#include <linux/string.h>
12#include <linux/bio.h>
13#include <linux/blkdev.h>
14#include <net/tcp.h>
15
16#include <linux/ceph/libceph.h>
17#include <linux/ceph/messenger.h>
18#include <linux/ceph/decode.h>
19#include <linux/ceph/pagelist.h>
20
21/*
22 * Ceph uses the messenger to exchange ceph_msg messages with other
23 * hosts in the system. The messenger provides ordered and reliable
24 * delivery. We tolerate TCP disconnects by reconnecting (with
25 * exponential backoff) in the case of a fault (disconnection, bad
26 * crc, protocol error). Acks allow sent messages to be discarded by
27 * the sender.
28 */
29
30/* static tag bytes (protocol control messages) */
/*
 * The tag constants are kept in addressable variables because the write
 * path points kvec iov_base entries at them (&tag_msg, &tag_ack,
 * &tag_keepalive).
 */
31static char tag_msg = CEPH_MSGR_TAG_MSG;
32static char tag_ack = CEPH_MSGR_TAG_ACK;
33static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE;
34
35#ifdef CONFIG_LOCKDEP
/* lock class assigned to the sk_lock of sockets created by ceph_tcp_connect() */
36static struct lock_class_key socket_class;
37#endif
38
39
/* forward declarations for helpers used before their definition */
40static void queue_con(struct ceph_connection *con);
41static void con_work(struct work_struct *);
42static void ceph_fault(struct ceph_connection *con);
43
44/*
45 * nicely render a sockaddr as a string.
46 */
47#define MAX_ADDR_STR 20
48#define MAX_ADDR_STR_LEN 60
49static char addr_str[MAX_ADDR_STR][MAX_ADDR_STR_LEN];
50static DEFINE_SPINLOCK(addr_str_lock);
51static int last_addr_str;
52
53const char *ceph_pr_addr(const struct sockaddr_storage *ss)
54{
55 int i;
56 char *s;
57 struct sockaddr_in *in4 = (void *)ss;
58 struct sockaddr_in6 *in6 = (void *)ss;
59
60 spin_lock(&addr_str_lock);
61 i = last_addr_str++;
62 if (last_addr_str == MAX_ADDR_STR)
63 last_addr_str = 0;
64 spin_unlock(&addr_str_lock);
65 s = addr_str[i];
66
67 switch (ss->ss_family) {
68 case AF_INET:
69 snprintf(s, MAX_ADDR_STR_LEN, "%pI4:%u", &in4->sin_addr,
70 (unsigned int)ntohs(in4->sin_port));
71 break;
72
73 case AF_INET6:
74 snprintf(s, MAX_ADDR_STR_LEN, "[%pI6c]:%u", &in6->sin6_addr,
75 (unsigned int)ntohs(in6->sin6_port));
76 break;
77
78 default:
79 sprintf(s, "(unknown sockaddr family %d)", (int)ss->ss_family);
80 }
81
82 return s;
83}
84EXPORT_SYMBOL(ceph_pr_addr);
85
86static void encode_my_addr(struct ceph_messenger *msgr)
87{
88 memcpy(&msgr->my_enc_addr, &msgr->inst.addr, sizeof(msgr->my_enc_addr));
89 ceph_encode_addr(&msgr->my_enc_addr);
90}
91
92/*
93 * work queue for all reading and writing to/from the socket.
94 */
95struct workqueue_struct *ceph_msgr_wq;
96
97int ceph_msgr_init(void)
98{
99 ceph_msgr_wq = create_workqueue("ceph-msgr");
100 if (IS_ERR(ceph_msgr_wq)) {
101 int ret = PTR_ERR(ceph_msgr_wq);
102 pr_err("msgr_init failed to create workqueue: %d\n", ret);
103 ceph_msgr_wq = NULL;
104 return ret;
105 }
106 return 0;
107}
108EXPORT_SYMBOL(ceph_msgr_init);
109
110void ceph_msgr_exit(void)
111{
112 destroy_workqueue(ceph_msgr_wq);
113}
114EXPORT_SYMBOL(ceph_msgr_exit);
115
/* Flush the messenger workqueue (ceph_msgr_wq) via flush_workqueue(). */
116void ceph_msgr_flush(void)
117{
118 flush_workqueue(ceph_msgr_wq);
119}
120EXPORT_SYMBOL(ceph_msgr_flush);
121
122
123/*
124 * socket callback functions
125 */
126
127/* data available on socket, or listen socket received a connect */
128static void ceph_data_ready(struct sock *sk, int count_unused)
129{
130 struct ceph_connection *con =
131 (struct ceph_connection *)sk->sk_user_data;
132 if (sk->sk_state != TCP_CLOSE_WAIT) {
133 dout("ceph_data_ready on %p state = %lu, queueing work\n",
134 con, con->state);
135 queue_con(con);
136 }
137}
138
139/* socket has buffer space for writing */
140static void ceph_write_space(struct sock *sk)
141{
142 struct ceph_connection *con =
143 (struct ceph_connection *)sk->sk_user_data;
144
145 /* only queue to workqueue if there is data we want to write. */
146 if (test_bit(WRITE_PENDING, &con->state)) {
147 dout("ceph_write_space %p queueing write work\n", con);
148 queue_con(con);
149 } else {
150 dout("ceph_write_space %p nothing to write\n", con);
151 }
152
153 /* since we have our own write_space, clear the SOCK_NOSPACE flag */
154 clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
155}
156
157/* socket's state has changed */
/*
 * sk_state_change callback.  Does nothing once the connection has CLOSED
 * set.  On TCP_CLOSE/TCP_CLOSE_WAIT it records an error message and
 * queues work the first time SOCK_CLOSED is set; on TCP_ESTABLISHED it
 * queues work.  Other socket states are ignored.
 */
158static void ceph_state_change(struct sock *sk)
159{
160 struct ceph_connection *con =
161 (struct ceph_connection *)sk->sk_user_data;
162
163 dout("ceph_state_change %p state = %lu sk_state = %u\n",
164 con, con->state, sk->sk_state);
165
166 if (test_bit(CLOSED, &con->state))
167 return;
168
169 switch (sk->sk_state) {
170 case TCP_CLOSE:
171 dout("ceph_state_change TCP_CLOSE\n");
/* fall through: TCP_CLOSE is handled exactly like TCP_CLOSE_WAIT */
172 case TCP_CLOSE_WAIT:
173 dout("ceph_state_change TCP_CLOSE_WAIT\n");
/* only the caller that flips SOCK_CLOSED from 0 to 1 reports and queues */
174 if (test_and_set_bit(SOCK_CLOSED, &con->state) == 0) {
175 if (test_bit(CONNECTING, &con->state))
176 con->error_msg = "connection failed";
177 else
178 con->error_msg = "socket closed";
179 queue_con(con);
180 }
181 break;
182 case TCP_ESTABLISHED:
183 dout("ceph_state_change TCP_ESTABLISHED\n");
184 queue_con(con);
185 break;
186 }
187}
188
189/*
190 * set up socket callbacks
191 */
192static void set_sock_callbacks(struct socket *sock,
193 struct ceph_connection *con)
194{
195 struct sock *sk = sock->sk;
196 sk->sk_user_data = (void *)con;
197 sk->sk_data_ready = ceph_data_ready;
198 sk->sk_write_space = ceph_write_space;
199 sk->sk_state_change = ceph_state_change;
200}
201
202
203/*
204 * socket helpers
205 */
206
207/*
208 * initiate connection to a remote socket.
209 */
210static struct socket *ceph_tcp_connect(struct ceph_connection *con)
211{
212 struct sockaddr_storage *paddr = &con->peer_addr.in_addr;
213 struct socket *sock;
214 int ret;
215
216 BUG_ON(con->sock);
217 ret = sock_create_kern(con->peer_addr.in_addr.ss_family, SOCK_STREAM,
218 IPPROTO_TCP, &sock);
219 if (ret)
220 return ERR_PTR(ret);
221 con->sock = sock;
222 sock->sk->sk_allocation = GFP_NOFS;
223
224#ifdef CONFIG_LOCKDEP
225 lockdep_set_class(&sock->sk->sk_lock, &socket_class);
226#endif
227
228 set_sock_callbacks(sock, con);
229
230 dout("connect %s\n", ceph_pr_addr(&con->peer_addr.in_addr));
231
232 ret = sock->ops->connect(sock, (struct sockaddr *)paddr, sizeof(*paddr),
233 O_NONBLOCK);
234 if (ret == -EINPROGRESS) {
235 dout("connect %s EINPROGRESS sk_state = %u\n",
236 ceph_pr_addr(&con->peer_addr.in_addr),
237 sock->sk->sk_state);
238 ret = 0;
239 }
240 if (ret < 0) {
241 pr_err("connect %s error %d\n",
242 ceph_pr_addr(&con->peer_addr.in_addr), ret);
243 sock_release(sock);
244 con->sock = NULL;
245 con->error_msg = "connect error";
246 }
247
248 if (ret < 0)
249 return ERR_PTR(ret);
250 return sock;
251}
252
253static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len)
254{
255 struct kvec iov = {buf, len};
256 struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
257
258 return kernel_recvmsg(sock, &msg, &iov, 1, len, msg.msg_flags);
259}
260
261/*
262 * write something. @more is true if caller will be sending more data
263 * shortly.
264 */
265static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov,
266 size_t kvlen, size_t len, int more)
267{
268 struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
269
270 if (more)
271 msg.msg_flags |= MSG_MORE;
272 else
273 msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */
274
275 return kernel_sendmsg(sock, &msg, iov, kvlen, len);
276}
277
278
279/*
280 * Shutdown/close the socket for the given connection.
281 */
282static int con_close_socket(struct ceph_connection *con)
283{
284 int rc;
285
286 dout("con_close_socket on %p sock %p\n", con, con->sock);
287 if (!con->sock)
288 return 0;
289 set_bit(SOCK_CLOSED, &con->state);
290 rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR);
291 sock_release(con->sock);
292 con->sock = NULL;
293 clear_bit(SOCK_CLOSED, &con->state);
294 return rc;
295}
296
297/*
298 * Reset a connection. Discard all incoming and outgoing messages
299 * and clear *_seq state.
300 */
301static void ceph_msg_remove(struct ceph_msg *msg)
302{
303 list_del_init(&msg->list_head);
304 ceph_msg_put(msg);
305}
306static void ceph_msg_remove_list(struct list_head *head)
307{
308 while (!list_empty(head)) {
309 struct ceph_msg *msg = list_first_entry(head, struct ceph_msg,
310 list_head);
311 ceph_msg_remove(msg);
312 }
313}
314
315static void reset_connection(struct ceph_connection *con)
316{
317 /* reset connection, out_queue, msg_ and connect_seq */
318 /* discard existing out_queue and msg_seq */
319 ceph_msg_remove_list(&con->out_queue);
320 ceph_msg_remove_list(&con->out_sent);
321
322 if (con->in_msg) {
323 ceph_msg_put(con->in_msg);
324 con->in_msg = NULL;
325 }
326
327 con->connect_seq = 0;
328 con->out_seq = 0;
329 if (con->out_msg) {
330 ceph_msg_put(con->out_msg);
331 con->out_msg = NULL;
332 }
333 con->out_keepalive_pending = false;
334 con->in_seq = 0;
335 con->in_seq_acked = 0;
336}
337
338/*
339 * mark a peer down. drop any open connections.
340 */
/*
 * Close @con: set CLOSED, clear the STANDBY, LOSSYTX, KEEPALIVE_PENDING
 * and WRITE_PENDING bits, then, under con->mutex, reset the connection
 * state, zero peer_global_seq and cancel the delayed work.  Work is
 * queued once more at the end.
 */
341void ceph_con_close(struct ceph_connection *con)
342{
343 dout("con_close %p peer %s\n", con,
344 ceph_pr_addr(&con->peer_addr.in_addr));
345 set_bit(CLOSED, &con->state); /* in case there's queued work */
346 clear_bit(STANDBY, &con->state); /* avoid connect_seq bump */
347 clear_bit(LOSSYTX, &con->state); /* so we retry next connect */
348 clear_bit(KEEPALIVE_PENDING, &con->state);
349 clear_bit(WRITE_PENDING, &con->state);
/* queued and in-flight messages are dropped with the mutex held */
350 mutex_lock(&con->mutex);
351 reset_connection(con);
352 con->peer_global_seq = 0;
353 cancel_delayed_work(&con->work);
354 mutex_unlock(&con->mutex);
/* NOTE(review): presumably lets the worker observe CLOSED and close the socket; confirm in con_work */
355 queue_con(con);
356}
357EXPORT_SYMBOL(ceph_con_close);
358
359/*
360 * Reopen a closed connection, with a new peer address.
361 */
/*
 * (Re)open @con towards @addr: set OPENING, clear CLOSED, copy @addr
 * into con->peer_addr, reset the backoff delay and queue work.
 */
362void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr)
363{
364 dout("con_open %p %s\n", con, ceph_pr_addr(&addr->in_addr));
365 set_bit(OPENING, &con->state);
366 clear_bit(CLOSED, &con->state);
367 memcpy(&con->peer_addr, addr, sizeof(*addr));
368 con->delay = 0; /* reset backoff memory */
369 queue_con(con);
370}
371EXPORT_SYMBOL(ceph_con_open);
372
373/*
374 * return true if this connection ever successfully opened
375 */
/* True when con->connect_seq is non-zero. */
376bool ceph_con_opened(struct ceph_connection *con)
377{
378 return con->connect_seq > 0;
379}
380
381/*
382 * generic get/put
383 */
384struct ceph_connection *ceph_con_get(struct ceph_connection *con)
385{
386 dout("con_get %p nref = %d -> %d\n", con,
387 atomic_read(&con->nref), atomic_read(&con->nref) + 1);
388 if (atomic_inc_not_zero(&con->nref))
389 return con;
390 return NULL;
391}
392
393void ceph_con_put(struct ceph_connection *con)
394{
395 dout("con_put %p nref = %d -> %d\n", con,
396 atomic_read(&con->nref), atomic_read(&con->nref) - 1);
397 BUG_ON(atomic_read(&con->nref) == 0);
398 if (atomic_dec_and_test(&con->nref)) {
399 BUG_ON(con->sock);
400 kfree(con);
401 }
402}
403
404/*
405 * initialize a new connection.
406 */
407void ceph_con_init(struct ceph_messenger *msgr, struct ceph_connection *con)
408{
409 dout("con_init %p\n", con);
410 memset(con, 0, sizeof(*con));
411 atomic_set(&con->nref, 1);
412 con->msgr = msgr;
413 mutex_init(&con->mutex);
414 INIT_LIST_HEAD(&con->out_queue);
415 INIT_LIST_HEAD(&con->out_sent);
416 INIT_DELAYED_WORK(&con->work, con_work);
417}
418EXPORT_SYMBOL(ceph_con_init);
419
420
421/*
422 * We maintain a global counter to order connection attempts. Get
423 * a unique seq greater than @gt.
424 */
425static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt)
426{
427 u32 ret;
428
429 spin_lock(&msgr->global_seq_lock);
430 if (msgr->global_seq < gt)
431 msgr->global_seq = gt;
432 ret = ++msgr->global_seq;
433 spin_unlock(&msgr->global_seq_lock);
434 return ret;
435}
436
437
438/*
439 * Prepare footer for currently outgoing message, and finish things
440 * off. Assumes out_kvec* are already valid.. we just add on to the end.
441 */
442static void prepare_write_message_footer(struct ceph_connection *con, int v)
443{
444 struct ceph_msg *m = con->out_msg;
445
446 dout("prepare_write_message_footer %p\n", con);
447 con->out_kvec_is_msg = true;
448 con->out_kvec[v].iov_base = &m->footer;
449 con->out_kvec[v].iov_len = sizeof(m->footer);
450 con->out_kvec_bytes += sizeof(m->footer);
451 con->out_kvec_left++;
452 con->out_more = m->more_to_follow;
453 con->out_msg_done = true;
454}
455
456/*
457 * Prepare headers for the next outgoing message.
458 */
459static void prepare_write_message(struct ceph_connection *con)
460{
461 struct ceph_msg *m;
462 int v = 0;
463
464 con->out_kvec_bytes = 0;
465 con->out_kvec_is_msg = true;
466 con->out_msg_done = false;
467
468 /* Sneak an ack in there first? If we can get it into the same
469 * TCP packet that's a good thing. */
470 if (con->in_seq > con->in_seq_acked) {
471 con->in_seq_acked = con->in_seq;
472 con->out_kvec[v].iov_base = &tag_ack;
473 con->out_kvec[v++].iov_len = 1;
474 con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
475 con->out_kvec[v].iov_base = &con->out_temp_ack;
476 con->out_kvec[v++].iov_len = sizeof(con->out_temp_ack);
477 con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
478 }
479
480 m = list_first_entry(&con->out_queue,
481 struct ceph_msg, list_head);
482 con->out_msg = m;
483 if (test_bit(LOSSYTX, &con->state)) {
484 list_del_init(&m->list_head);
485 } else {
486 /* put message on sent list */
487 ceph_msg_get(m);
488 list_move_tail(&m->list_head, &con->out_sent);
489 }
490
491 /*
492 * only assign outgoing seq # if we haven't sent this message
493 * yet. if it is requeued, resend with it's original seq.
494 */
495 if (m->needs_out_seq) {
496 m->hdr.seq = cpu_to_le64(++con->out_seq);
497 m->needs_out_seq = false;
498 }
499
500 dout("prepare_write_message %p seq %lld type %d len %d+%d+%d %d pgs\n",
501 m, con->out_seq, le16_to_cpu(m->hdr.type),
502 le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len),
503 le32_to_cpu(m->hdr.data_len),
504 m->nr_pages);
505 BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len);
506
507 /* tag + hdr + front + middle */
508 con->out_kvec[v].iov_base = &tag_msg;
509 con->out_kvec[v++].iov_len = 1;
510 con->out_kvec[v].iov_base = &m->hdr;
511 con->out_kvec[v++].iov_len = sizeof(m->hdr);
512 con->out_kvec[v++] = m->front;
513 if (m->middle)
514 con->out_kvec[v++] = m->middle->vec;
515 con->out_kvec_left = v;
516 con->out_kvec_bytes += 1 + sizeof(m->hdr) + m->front.iov_len +
517 (m->middle ? m->middle->vec.iov_len : 0);
518 con->out_kvec_cur = con->out_kvec;
519
520 /* fill in crc (except data pages), footer */
521 con->out_msg->hdr.crc =
522 cpu_to_le32(crc32c(0, (void *)&m->hdr,
523 sizeof(m->hdr) - sizeof(m->hdr.crc)));
524 con->out_msg->footer.flags = CEPH_MSG_FOOTER_COMPLETE;
525 con->out_msg->footer.front_crc =
526 cpu_to_le32(crc32c(0, m->front.iov_base, m->front.iov_len));
527 if (m->middle)
528 con->out_msg->footer.middle_crc =
529 cpu_to_le32(crc32c(0, m->middle->vec.iov_base,
530 m->middle->vec.iov_len));
531 else
532 con->out_msg->footer.middle_crc = 0;
533 con->out_msg->footer.data_crc = 0;
534 dout("prepare_write_message front_crc %u data_crc %u\n",
535 le32_to_cpu(con->out_msg->footer.front_crc),
536 le32_to_cpu(con->out_msg->footer.middle_crc));
537
538 /* is there a data payload? */
539 if (le32_to_cpu(m->hdr.data_len) > 0) {
540 /* initialize page iterator */
541 con->out_msg_pos.page = 0;
542 if (m->pages)
543 con->out_msg_pos.page_pos =
544 le16_to_cpu(m->hdr.data_off) & ~PAGE_MASK;
545 else
546 con->out_msg_pos.page_pos = 0;
547 con->out_msg_pos.data_pos = 0;
548 con->out_msg_pos.did_page_crc = 0;
549 con->out_more = 1; /* data + footer will follow */
550 } else {
551 /* no, queue up footer too and be done */
552 prepare_write_message_footer(con, v);
553 }
554
555 set_bit(WRITE_PENDING, &con->state);
556}
557
558/*
559 * Prepare an ack.
560 */
561static void prepare_write_ack(struct ceph_connection *con)
562{
563 dout("prepare_write_ack %p %llu -> %llu\n", con,
564 con->in_seq_acked, con->in_seq);
565 con->in_seq_acked = con->in_seq;
566
567 con->out_kvec[0].iov_base = &tag_ack;
568 con->out_kvec[0].iov_len = 1;
569 con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
570 con->out_kvec[1].iov_base = &con->out_temp_ack;
571 con->out_kvec[1].iov_len = sizeof(con->out_temp_ack);
572 con->out_kvec_left = 2;
573 con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
574 con->out_kvec_cur = con->out_kvec;
575 con->out_more = 1; /* more will follow.. eventually.. */
576 set_bit(WRITE_PENDING, &con->state);
577}
578
579/*
580 * Prepare to write keepalive byte.
581 */
582static void prepare_write_keepalive(struct ceph_connection *con)
583{
584 dout("prepare_write_keepalive %p\n", con);
585 con->out_kvec[0].iov_base = &tag_keepalive;
586 con->out_kvec[0].iov_len = 1;
587 con->out_kvec_left = 1;
588 con->out_kvec_bytes = 1;
589 con->out_kvec_cur = con->out_kvec;
590 set_bit(WRITE_PENDING, &con->state);
591}
592
593/*
594 * Connection negotiation.
595 */
596
597static void prepare_connect_authorizer(struct ceph_connection *con)
598{
599 void *auth_buf;
600 int auth_len = 0;
601 int auth_protocol = 0;
602
603 mutex_unlock(&con->mutex);
604 if (con->ops->get_authorizer)
605 con->ops->get_authorizer(con, &auth_buf, &auth_len,
606 &auth_protocol, &con->auth_reply_buf,
607 &con->auth_reply_buf_len,
608 con->auth_retry);
609 mutex_lock(&con->mutex);
610
611 con->out_connect.authorizer_protocol = cpu_to_le32(auth_protocol);
612 con->out_connect.authorizer_len = cpu_to_le32(auth_len);
613
614 con->out_kvec[con->out_kvec_left].iov_base = auth_buf;
615 con->out_kvec[con->out_kvec_left].iov_len = auth_len;
616 con->out_kvec_left++;
617 con->out_kvec_bytes += auth_len;
618}
619
620/*
621 * We connected to a peer and are saying hello.
622 */
623static void prepare_write_banner(struct ceph_messenger *msgr,
624 struct ceph_connection *con)
625{
626 int len = strlen(CEPH_BANNER);
627
628 con->out_kvec[0].iov_base = CEPH_BANNER;
629 con->out_kvec[0].iov_len = len;
630 con->out_kvec[1].iov_base = &msgr->my_enc_addr;
631 con->out_kvec[1].iov_len = sizeof(msgr->my_enc_addr);
632 con->out_kvec_left = 2;
633 con->out_kvec_bytes = len + sizeof(msgr->my_enc_addr);
634 con->out_kvec_cur = con->out_kvec;
635 con->out_more = 0;
636 set_bit(WRITE_PENDING, &con->state);
637}
638
639static void prepare_write_connect(struct ceph_messenger *msgr,
640 struct ceph_connection *con,
641 int after_banner)
642{
643 unsigned global_seq = get_global_seq(con->msgr, 0);
644 int proto;
645
646 switch (con->peer_name.type) {
647 case CEPH_ENTITY_TYPE_MON:
648 proto = CEPH_MONC_PROTOCOL;
649 break;
650 case CEPH_ENTITY_TYPE_OSD:
651 proto = CEPH_OSDC_PROTOCOL;
652 break;
653 case CEPH_ENTITY_TYPE_MDS:
654 proto = CEPH_MDSC_PROTOCOL;
655 break;
656 default:
657 BUG();
658 }
659
660 dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
661 con->connect_seq, global_seq, proto);
662
663 con->out_connect.features = cpu_to_le64(msgr->supported_features);
664 con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
665 con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
666 con->out_connect.global_seq = cpu_to_le32(global_seq);
667 con->out_connect.protocol_version = cpu_to_le32(proto);
668 con->out_connect.flags = 0;
669
670 if (!after_banner) {
671 con->out_kvec_left = 0;
672 con->out_kvec_bytes = 0;
673 }
674 con->out_kvec[con->out_kvec_left].iov_base = &con->out_connect;
675 con->out_kvec[con->out_kvec_left].iov_len = sizeof(con->out_connect);
676 con->out_kvec_left++;
677 con->out_kvec_bytes += sizeof(con->out_connect);
678 con->out_kvec_cur = con->out_kvec;
679 con->out_more = 0;
680 set_bit(WRITE_PENDING, &con->state);
681
682 prepare_connect_authorizer(con);
683}
684
685
686/*
687 * write as much of pending kvecs to the socket as we can.
688 * 1 -> done
689 * 0 -> socket full, but more to do
690 * <0 -> error
691 */
/*
 * Push the pending kvecs (out_kvec_cur / out_kvec_left / out_kvec_bytes)
 * to the socket.  After a short send the kvec cursor is advanced past
 * everything that went out so the next call resumes at the right byte.
 */
692static int write_partial_kvec(struct ceph_connection *con)
693{
694 int ret;
695
696 dout("write_partial_kvec %p %d left\n", con, con->out_kvec_bytes);
697 while (con->out_kvec_bytes > 0) {
698 ret = ceph_tcp_sendmsg(con->sock, con->out_kvec_cur,
699 con->out_kvec_left, con->out_kvec_bytes,
700 con->out_more);
/* 0 or a negative value is handed straight back to the caller */
701 if (ret <= 0)
702 goto out;
703 con->out_kvec_bytes -= ret;
704 if (con->out_kvec_bytes == 0)
705 break; /* done */
/* consume 'ret' sent bytes: skip whole kvecs, then trim a partly sent one */
706 while (ret > 0) {
707 if (ret >= con->out_kvec_cur->iov_len) {
708 ret -= con->out_kvec_cur->iov_len;
709 con->out_kvec_cur++;
710 con->out_kvec_left--;
711 } else {
712 con->out_kvec_cur->iov_len -= ret;
713 con->out_kvec_cur->iov_base += ret;
714 ret = 0;
715 break;
716 }
717 }
718 }
/* everything went out: reset the kvec bookkeeping */
719 con->out_kvec_left = 0;
720 con->out_kvec_is_msg = false;
721 ret = 1;
722out:
723 dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con,
724 con->out_kvec_bytes, con->out_kvec_left, ret);
725 return ret; /* done! */
726}
727
728#ifdef CONFIG_BLOCK
729static void init_bio_iter(struct bio *bio, struct bio **iter, int *seg)
730{
731 if (!bio) {
732 *iter = NULL;
733 *seg = 0;
734 return;
735 }
736 *iter = bio;
737 *seg = bio->bi_idx;
738}
739
740static void iter_bio_next(struct bio **bio_iter, int *seg)
741{
742 if (*bio_iter == NULL)
743 return;
744
745 BUG_ON(*seg >= (*bio_iter)->bi_vcnt);
746
747 (*seg)++;
748 if (*seg == (*bio_iter)->bi_vcnt)
749 init_bio_iter((*bio_iter)->bi_next, bio_iter, seg);
750}
751#endif
752
753/*
754 * Write as much message data payload as we can. If we finish, queue
755 * up the footer.
756 * 1 -> done, footer is now queued in out_kvec[].
757 * 0 -> socket full, but more to do
758 * <0 -> error
759 */
760static int write_partial_msg_pages(struct ceph_connection *con)
761{
762 struct ceph_msg *msg = con->out_msg;
763 unsigned data_len = le32_to_cpu(msg->hdr.data_len);
764 size_t len;
765 int crc = con->msgr->nocrc;
766 int ret;
767 int total_max_write;
768 int in_trail = 0;
769 size_t trail_len = (msg->trail ? msg->trail->length : 0);
770
771 dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n",
772 con, con->out_msg, con->out_msg_pos.page, con->out_msg->nr_pages,
773 con->out_msg_pos.page_pos);
774
775#ifdef CONFIG_BLOCK
776 if (msg->bio && !msg->bio_iter)
777 init_bio_iter(msg->bio, &msg->bio_iter, &msg->bio_seg);
778#endif
779
780 while (data_len > con->out_msg_pos.data_pos) {
781 struct page *page = NULL;
782 void *kaddr = NULL;
783 int max_write = PAGE_SIZE;
784 int page_shift = 0;
785
786 total_max_write = data_len - trail_len -
787 con->out_msg_pos.data_pos;
788
789 /*
790 * if we are calculating the data crc (the default), we need
791 * to map the page. if our pages[] has been revoked, use the
792 * zero page.
793 */
794
795 /* have we reached the trail part of the data? */
796 if (con->out_msg_pos.data_pos >= data_len - trail_len) {
797 in_trail = 1;
798
799 total_max_write = data_len - con->out_msg_pos.data_pos;
800
801 page = list_first_entry(&msg->trail->head,
802 struct page, lru);
803 if (crc)
804 kaddr = kmap(page);
805 max_write = PAGE_SIZE;
806 } else if (msg->pages) {
807 page = msg->pages[con->out_msg_pos.page];
808 if (crc)
809 kaddr = kmap(page);
810 } else if (msg->pagelist) {
811 page = list_first_entry(&msg->pagelist->head,
812 struct page, lru);
813 if (crc)
814 kaddr = kmap(page);
815#ifdef CONFIG_BLOCK
816 } else if (msg->bio) {
817 struct bio_vec *bv;
818
819 bv = bio_iovec_idx(msg->bio_iter, msg->bio_seg);
820 page = bv->bv_page;
821 page_shift = bv->bv_offset;
822 if (crc)
823 kaddr = kmap(page) + page_shift;
824 max_write = bv->bv_len;
825#endif
826 } else {
827 page = con->msgr->zero_page;
828 if (crc)
829 kaddr = page_address(con->msgr->zero_page);
830 }
831 len = min_t(int, max_write - con->out_msg_pos.page_pos,
832 total_max_write);
833
834 if (crc && !con->out_msg_pos.did_page_crc) {
835 void *base = kaddr + con->out_msg_pos.page_pos;
836 u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc);
837
838 BUG_ON(kaddr == NULL);
839 con->out_msg->footer.data_crc =
840 cpu_to_le32(crc32c(tmpcrc, base, len));
841 con->out_msg_pos.did_page_crc = 1;
842 }
843 ret = kernel_sendpage(con->sock, page,
844 con->out_msg_pos.page_pos + page_shift,
845 len,
846 MSG_DONTWAIT | MSG_NOSIGNAL |
847 MSG_MORE);
848
849 if (crc &&
850 (msg->pages || msg->pagelist || msg->bio || in_trail))
851 kunmap(page);
852
853 if (ret <= 0)
854 goto out;
855
856 con->out_msg_pos.data_pos += ret;
857 con->out_msg_pos.page_pos += ret;
858 if (ret == len) {
859 con->out_msg_pos.page_pos = 0;
860 con->out_msg_pos.page++;
861 con->out_msg_pos.did_page_crc = 0;
862 if (in_trail)
863 list_move_tail(&page->lru,
864 &msg->trail->head);
865 else if (msg->pagelist)
866 list_move_tail(&page->lru,
867 &msg->pagelist->head);
868#ifdef CONFIG_BLOCK
869 else if (msg->bio)
870 iter_bio_next(&msg->bio_iter, &msg->bio_seg);
871#endif
872 }
873 }
874
875 dout("write_partial_msg_pages %p msg %p done\n", con, msg);
876
877 /* prepare and queue up footer, too */
878 if (!crc)
879 con->out_msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC;
880 con->out_kvec_bytes = 0;
881 con->out_kvec_left = 0;
882 con->out_kvec_cur = con->out_kvec;
883 prepare_write_message_footer(con, 0);
884 ret = 1;
885out:
886 return ret;
887}
888
889/*
890 * write some zeros
891 */
892static int write_partial_skip(struct ceph_connection *con)
893{
894 int ret;
895
896 while (con->out_skip > 0) {
897 struct kvec iov = {
898 .iov_base = page_address(con->msgr->zero_page),
899 .iov_len = min(con->out_skip, (int)PAGE_CACHE_SIZE)
900 };
901
902 ret = ceph_tcp_sendmsg(con->sock, &iov, 1, iov.iov_len, 1);
903 if (ret <= 0)
904 goto out;
905 con->out_skip -= ret;
906 }
907 ret = 1;
908out:
909 return ret;
910}
911
912/*
913 * Prepare to read connection handshake, or an ack.
914 */
/* Rewind the read position (in_base_pos) before reading the peer's banner. */
915static void prepare_read_banner(struct ceph_connection *con)
916{
917 dout("prepare_read_banner %p\n", con);
918 con->in_base_pos = 0;
919}
920
/* Rewind the read position (in_base_pos) before reading the connect reply. */
921static void prepare_read_connect(struct ceph_connection *con)
922{
923 dout("prepare_read_connect %p\n", con);
924 con->in_base_pos = 0;
925}
926
/* Rewind the read position (in_base_pos) before reading an ack. */
927static void prepare_read_ack(struct ceph_connection *con)
928{
929 dout("prepare_read_ack %p\n", con);
930 con->in_base_pos = 0;
931}
932
/* Rewind the read position and reset in_tag to CEPH_MSGR_TAG_READY. */
933static void prepare_read_tag(struct ceph_connection *con)
934{
935 dout("prepare_read_tag %p\n", con);
936 con->in_base_pos = 0;
937 con->in_tag = CEPH_MSGR_TAG_READY;
938}
939
940/*
941 * Prepare to read a message.
942 */
/*
 * Reset per-message read state: rewind in_base_pos and zero the running
 * front/middle/data crcs.  There must be no in_msg yet.  Always returns 0.
 */
943static int prepare_read_message(struct ceph_connection *con)
944{
945 dout("prepare_read_message %p\n", con);
946 BUG_ON(con->in_msg != NULL);
947 con->in_base_pos = 0;
948 con->in_front_crc = con->in_middle_crc = con->in_data_crc = 0;
949 return 0;
950}
951
952
953static int read_partial(struct ceph_connection *con,
954 int *to, int size, void *object)
955{
956 *to += size;
957 while (con->in_base_pos < *to) {
958 int left = *to - con->in_base_pos;
959 int have = size - left;
960 int ret = ceph_tcp_recvmsg(con->sock, object + have, left);
961 if (ret <= 0)
962 return ret;
963 con->in_base_pos += ret;
964 }
965 return 1;
966}
967
968
969/*
970 * Read all or part of the connect-side handshake on a new connection
971 */
972static int read_partial_banner(struct ceph_connection *con)
973{
974 int ret, to = 0;
975
976 dout("read_partial_banner %p at %d\n", con, con->in_base_pos);
977
978 /* peer's banner */
979 ret = read_partial(con, &to, strlen(CEPH_BANNER), con->in_banner);
980 if (ret <= 0)
981 goto out;
982 ret = read_partial(con, &to, sizeof(con->actual_peer_addr),
983 &con->actual_peer_addr);
984 if (ret <= 0)
985 goto out;
986 ret = read_partial(con, &to, sizeof(con->peer_addr_for_me),
987 &con->peer_addr_for_me);
988 if (ret <= 0)
989 goto out;
990out:
991 return ret;
992}
993
994static int read_partial_connect(struct ceph_connection *con)
995{
996 int ret, to = 0;
997
998 dout("read_partial_connect %p at %d\n", con, con->in_base_pos);
999
1000 ret = read_partial(con, &to, sizeof(con->in_reply), &con->in_reply);
1001 if (ret <= 0)
1002 goto out;
1003 ret = read_partial(con, &to, le32_to_cpu(con->in_reply.authorizer_len),
1004 con->auth_reply_buf);
1005 if (ret <= 0)
1006 goto out;
1007
1008 dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n",
1009 con, (int)con->in_reply.tag,
1010 le32_to_cpu(con->in_reply.connect_seq),
1011 le32_to_cpu(con->in_reply.global_seq));
1012out:
1013 return ret;
1014
1015}
1016
1017/*
1018 * Verify the hello banner looks okay.
1019 */
1020static int verify_hello(struct ceph_connection *con)
1021{
1022 if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) {
1023 pr_err("connect to %s got bad banner\n",
1024 ceph_pr_addr(&con->peer_addr.in_addr));
1025 con->error_msg = "protocol error, bad banner";
1026 return -1;
1027 }
1028 return 0;
1029}
1030
/*
 * True when @ss holds the all-zero address of its family (0.0.0.0 for
 * AF_INET, :: for AF_INET6).  Any other family is never blank.
 */
static bool addr_is_blank(struct sockaddr_storage *ss)
{
	if (ss->ss_family == AF_INET) {
		struct sockaddr_in *in4 = (struct sockaddr_in *)ss;

		return in4->sin_addr.s_addr == 0;
	}

	if (ss->ss_family == AF_INET6) {
		struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)ss;
		unsigned int i;

		/* all 16 address bytes must be zero */
		for (i = 0; i < sizeof(in6->sin6_addr.s6_addr); i++)
			if (in6->sin6_addr.s6_addr[i] != 0)
				return false;
		return true;
	}

	return false;
}
1045
/*
 * Return the port of @ss in host byte order, or 0 for a family other
 * than AF_INET / AF_INET6.
 */
static int addr_port(struct sockaddr_storage *ss)
{
	if (ss->ss_family == AF_INET)
		return ntohs(((struct sockaddr_in *)ss)->sin_port);
	if (ss->ss_family == AF_INET6)
		return ntohs(((struct sockaddr_in6 *)ss)->sin6_port);
	return 0;
}
1056
/*
 * Store port @p (host byte order) into @ss according to its address
 * family.  Families other than AF_INET / AF_INET6 are left untouched.
 */
static void addr_set_port(struct sockaddr_storage *ss, int p)
{
	switch (ss->ss_family) {
	case AF_INET:
		((struct sockaddr_in *)ss)->sin_port = htons(p);
		break;	/* do not fall through and write it as IPv6 as well */
	case AF_INET6:
		((struct sockaddr_in6 *)ss)->sin6_port = htons(p);
		break;
	}
}
1066
1067/*
1068 * Parse an ip[:port] list into an addr array. Use the default
1069 * monitor port if a port isn't specified.
1070 */
1071int ceph_parse_ips(const char *c, const char *end,
1072 struct ceph_entity_addr *addr,
1073 int max_count, int *count)
1074{
1075 int i;
1076 const char *p = c;
1077
1078 dout("parse_ips on '%.*s'\n", (int)(end-c), c);
1079 for (i = 0; i < max_count; i++) {
1080 const char *ipend;
1081 struct sockaddr_storage *ss = &addr[i].in_addr;
1082 struct sockaddr_in *in4 = (void *)ss;
1083 struct sockaddr_in6 *in6 = (void *)ss;
1084 int port;
1085 char delim = ',';
1086
1087 if (*p == '[') {
1088 delim = ']';
1089 p++;
1090 }
1091
1092 memset(ss, 0, sizeof(*ss));
1093 if (in4_pton(p, end - p, (u8 *)&in4->sin_addr.s_addr,
1094 delim, &ipend))
1095 ss->ss_family = AF_INET;
1096 else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr,
1097 delim, &ipend))
1098 ss->ss_family = AF_INET6;
1099 else
1100 goto bad;
1101 p = ipend;
1102
1103 if (delim == ']') {
1104 if (*p != ']') {
1105 dout("missing matching ']'\n");
1106 goto bad;
1107 }
1108 p++;
1109 }
1110
1111 /* port? */
1112 if (p < end && *p == ':') {
1113 port = 0;
1114 p++;
1115 while (p < end && *p >= '0' && *p <= '9') {
1116 port = (port * 10) + (*p - '0');
1117 p++;
1118 }
1119 if (port > 65535 || port == 0)
1120 goto bad;
1121 } else {
1122 port = CEPH_MON_PORT;
1123 }
1124
1125 addr_set_port(ss, port);
1126
1127 dout("parse_ips got %s\n", ceph_pr_addr(ss));
1128
1129 if (p == end)
1130 break;
1131 if (*p != ',')
1132 goto bad;
1133 p++;
1134 }
1135
1136 if (p != end)
1137 goto bad;
1138
1139 if (count)
1140 *count = i + 1;
1141 return 0;
1142
1143bad:
1144 pr_err("parse_ips bad ip '%.*s'\n", (int)(end - c), c);
1145 return -EINVAL;
1146}
1147EXPORT_SYMBOL(ceph_parse_ips);
1148
1149static int process_banner(struct ceph_connection *con)
1150{
1151 dout("process_banner on %p\n", con);
1152
1153 if (verify_hello(con) < 0)
1154 return -1;
1155
1156 ceph_decode_addr(&con->actual_peer_addr);
1157 ceph_decode_addr(&con->peer_addr_for_me);
1158
1159 /*
1160 * Make sure the other end is who we wanted. note that the other
1161 * end may not yet know their ip address, so if it's 0.0.0.0, give
1162 * them the benefit of the doubt.
1163 */
1164 if (memcmp(&con->peer_addr, &con->actual_peer_addr,
1165 sizeof(con->peer_addr)) != 0 &&
1166 !(addr_is_blank(&con->actual_peer_addr.in_addr) &&
1167 con->actual_peer_addr.nonce == con->peer_addr.nonce)) {
1168 pr_warning("wrong peer, want %s/%d, got %s/%d\n",
1169 ceph_pr_addr(&con->peer_addr.in_addr),
1170 (int)le32_to_cpu(con->peer_addr.nonce),
1171 ceph_pr_addr(&con->actual_peer_addr.in_addr),
1172 (int)le32_to_cpu(con->actual_peer_addr.nonce));
1173 con->error_msg = "wrong peer at address";
1174 return -1;
1175 }
1176
1177 /*
1178 * did we learn our address?
1179 */
1180 if (addr_is_blank(&con->msgr->inst.addr.in_addr)) {
1181 int port = addr_port(&con->msgr->inst.addr.in_addr);
1182
1183 memcpy(&con->msgr->inst.addr.in_addr,
1184 &con->peer_addr_for_me.in_addr,
1185 sizeof(con->peer_addr_for_me.in_addr));
1186 addr_set_port(&con->msgr->inst.addr.in_addr, port);
1187 encode_my_addr(con->msgr);
1188 dout("process_banner learned my addr is %s\n",
1189 ceph_pr_addr(&con->msgr->inst.addr.in_addr));
1190 }
1191
1192 set_bit(NEGOTIATING, &con->state);
1193 prepare_read_connect(con);
1194 return 0;
1195}
1196
1197static void fail_protocol(struct ceph_connection *con)
1198{
1199 reset_connection(con);
1200 set_bit(CLOSED, &con->state); /* in case there's queued work */
1201
1202 mutex_unlock(&con->mutex);
1203 if (con->ops->bad_proto)
1204 con->ops->bad_proto(con);
1205 mutex_lock(&con->mutex);
1206}
1207
1208static int process_connect(struct ceph_connection *con)
1209{
1210 u64 sup_feat = con->msgr->supported_features;
1211 u64 req_feat = con->msgr->required_features;
1212 u64 server_feat = le64_to_cpu(con->in_reply.features);
1213
1214 dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
1215
1216 switch (con->in_reply.tag) {
1217 case CEPH_MSGR_TAG_FEATURES:
1218 pr_err("%s%lld %s feature set mismatch,"
1219 " my %llx < server's %llx, missing %llx\n",
1220 ENTITY_NAME(con->peer_name),
1221 ceph_pr_addr(&con->peer_addr.in_addr),
1222 sup_feat, server_feat, server_feat & ~sup_feat);
1223 con->error_msg = "missing required protocol features";
1224 fail_protocol(con);
1225 return -1;
1226
1227 case CEPH_MSGR_TAG_BADPROTOVER:
1228 pr_err("%s%lld %s protocol version mismatch,"
1229 " my %d != server's %d\n",
1230 ENTITY_NAME(con->peer_name),
1231 ceph_pr_addr(&con->peer_addr.in_addr),
1232 le32_to_cpu(con->out_connect.protocol_version),
1233 le32_to_cpu(con->in_reply.protocol_version));
1234 con->error_msg = "protocol version mismatch";
1235 fail_protocol(con);
1236 return -1;
1237
1238 case CEPH_MSGR_TAG_BADAUTHORIZER:
1239 con->auth_retry++;
1240 dout("process_connect %p got BADAUTHORIZER attempt %d\n", con,
1241 con->auth_retry);
1242 if (con->auth_retry == 2) {
1243 con->error_msg = "connect authorization failure";
1244 reset_connection(con);
1245 set_bit(CLOSED, &con->state);
1246 return -1;
1247 }
1248 con->auth_retry = 1;
1249 prepare_write_connect(con->msgr, con, 0);
1250 prepare_read_connect(con);
1251 break;
1252
1253 case CEPH_MSGR_TAG_RESETSESSION:
1254 /*
1255 * If we connected with a large connect_seq but the peer
1256 * has no record of a session with us (no connection, or
1257 * connect_seq == 0), they will send RESETSESION to indicate
1258 * that they must have reset their session, and may have
1259 * dropped messages.
1260 */
1261 dout("process_connect got RESET peer seq %u\n",
1262 le32_to_cpu(con->in_connect.connect_seq));
1263 pr_err("%s%lld %s connection reset\n",
1264 ENTITY_NAME(con->peer_name),
1265 ceph_pr_addr(&con->peer_addr.in_addr));
1266 reset_connection(con);
1267 prepare_write_connect(con->msgr, con, 0);
1268 prepare_read_connect(con);
1269
1270 /* Tell ceph about it. */
1271 mutex_unlock(&con->mutex);
1272 pr_info("reset on %s%lld\n", ENTITY_NAME(con->peer_name));
1273 if (con->ops->peer_reset)
1274 con->ops->peer_reset(con);
1275 mutex_lock(&con->mutex);
1276 break;
1277
1278 case CEPH_MSGR_TAG_RETRY_SESSION:
1279 /*
1280 * If we sent a smaller connect_seq than the peer has, try
1281 * again with a larger value.
1282 */
1283 dout("process_connect got RETRY my seq = %u, peer_seq = %u\n",
1284 le32_to_cpu(con->out_connect.connect_seq),
1285 le32_to_cpu(con->in_connect.connect_seq));
1286 con->connect_seq = le32_to_cpu(con->in_connect.connect_seq);
1287 prepare_write_connect(con->msgr, con, 0);
1288 prepare_read_connect(con);
1289 break;
1290
1291 case CEPH_MSGR_TAG_RETRY_GLOBAL:
1292 /*
1293 * If we sent a smaller global_seq than the peer has, try
1294 * again with a larger value.
1295 */
1296 dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n",
1297 con->peer_global_seq,
1298 le32_to_cpu(con->in_connect.global_seq));
1299 get_global_seq(con->msgr,
1300 le32_to_cpu(con->in_connect.global_seq));
1301 prepare_write_connect(con->msgr, con, 0);
1302 prepare_read_connect(con);
1303 break;
1304
1305 case CEPH_MSGR_TAG_READY:
1306 if (req_feat & ~server_feat) {
1307 pr_err("%s%lld %s protocol feature mismatch,"
1308 " my required %llx > server's %llx, need %llx\n",
1309 ENTITY_NAME(con->peer_name),
1310 ceph_pr_addr(&con->peer_addr.in_addr),
1311 req_feat, server_feat, req_feat & ~server_feat);
1312 con->error_msg = "missing required protocol features";
1313 fail_protocol(con);
1314 return -1;
1315 }
1316 clear_bit(CONNECTING, &con->state);
1317 con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
1318 con->connect_seq++;
1319 con->peer_features = server_feat;
1320 dout("process_connect got READY gseq %d cseq %d (%d)\n",
1321 con->peer_global_seq,
1322 le32_to_cpu(con->in_reply.connect_seq),
1323 con->connect_seq);
1324 WARN_ON(con->connect_seq !=
1325 le32_to_cpu(con->in_reply.connect_seq));
1326
1327 if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY)
1328 set_bit(LOSSYTX, &con->state);
1329
1330 prepare_read_tag(con);
1331 break;
1332
1333 case CEPH_MSGR_TAG_WAIT:
1334 /*
1335 * If there is a connection race (we are opening
1336 * connections to each other), one of us may just have
1337 * to WAIT. This shouldn't happen if we are the
1338 * client.
1339 */
1340 pr_err("process_connect peer connecting WAIT\n");
1341
1342 default:
1343 pr_err("connect protocol error, will retry\n");
1344 con->error_msg = "protocol error, garbage tag during connect";
1345 return -1;
1346 }
1347 return 0;
1348}
1349
1350
1351/*
1352 * read (part of) an ack
1353 */
1354static int read_partial_ack(struct ceph_connection *con)
1355{
1356 int to = 0;
1357
1358 return read_partial(con, &to, sizeof(con->in_temp_ack),
1359 &con->in_temp_ack);
1360}
1361
1362
1363/*
1364 * We can finally discard anything that's been acked.
1365 */
1366static void process_ack(struct ceph_connection *con)
1367{
1368 struct ceph_msg *m;
1369 u64 ack = le64_to_cpu(con->in_temp_ack);
1370 u64 seq;
1371
1372 while (!list_empty(&con->out_sent)) {
1373 m = list_first_entry(&con->out_sent, struct ceph_msg,
1374 list_head);
1375 seq = le64_to_cpu(m->hdr.seq);
1376 if (seq > ack)
1377 break;
1378 dout("got ack for seq %llu type %d at %p\n", seq,
1379 le16_to_cpu(m->hdr.type), m);
1380 ceph_msg_remove(m);
1381 }
1382 prepare_read_tag(con);
1383}
1384
1385
1386
1387
1388static int read_partial_message_section(struct ceph_connection *con,
1389 struct kvec *section,
1390 unsigned int sec_len, u32 *crc)
1391{
1392 int ret, left;
1393
1394 BUG_ON(!section);
1395
1396 while (section->iov_len < sec_len) {
1397 BUG_ON(section->iov_base == NULL);
1398 left = sec_len - section->iov_len;
1399 ret = ceph_tcp_recvmsg(con->sock, (char *)section->iov_base +
1400 section->iov_len, left);
1401 if (ret <= 0)
1402 return ret;
1403 section->iov_len += ret;
1404 if (section->iov_len == sec_len)
1405 *crc = crc32c(0, section->iov_base,
1406 section->iov_len);
1407 }
1408
1409 return 1;
1410}
1411
1412static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
1413 struct ceph_msg_header *hdr,
1414 int *skip);
1415
1416
1417static int read_partial_message_pages(struct ceph_connection *con,
1418 struct page **pages,
1419 unsigned data_len, int datacrc)
1420{
1421 void *p;
1422 int ret;
1423 int left;
1424
1425 left = min((int)(data_len - con->in_msg_pos.data_pos),
1426 (int)(PAGE_SIZE - con->in_msg_pos.page_pos));
1427 /* (page) data */
1428 BUG_ON(pages == NULL);
1429 p = kmap(pages[con->in_msg_pos.page]);
1430 ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos,
1431 left);
1432 if (ret > 0 && datacrc)
1433 con->in_data_crc =
1434 crc32c(con->in_data_crc,
1435 p + con->in_msg_pos.page_pos, ret);
1436 kunmap(pages[con->in_msg_pos.page]);
1437 if (ret <= 0)
1438 return ret;
1439 con->in_msg_pos.data_pos += ret;
1440 con->in_msg_pos.page_pos += ret;
1441 if (con->in_msg_pos.page_pos == PAGE_SIZE) {
1442 con->in_msg_pos.page_pos = 0;
1443 con->in_msg_pos.page++;
1444 }
1445
1446 return ret;
1447}
1448
#ifdef CONFIG_BLOCK
/*
 * Receive some of the data payload into the bio segment selected by
 * (*@bio_iter, *@bio_seg).  con->in_msg_pos.page_pos is the offset
 * within that segment.  When the segment is full the iterator is
 * advanced.  When @datacrc is set the running data crc is updated.
 *
 * Returns the number of bytes received, the (<= 0) receive result, or
 * the error encoded in the segment pointer.
 */
static int read_partial_message_bio(struct ceph_connection *con,
				    struct bio **bio_iter, int *bio_seg,
				    unsigned data_len, int datacrc)
{
	struct bio_vec *bv = bio_iovec_idx(*bio_iter, *bio_seg);
	void *kaddr;
	int want;
	int ret;

	if (IS_ERR(bv))
		return PTR_ERR(bv);

	want = min((int)(data_len - con->in_msg_pos.data_pos),
		   (int)(bv->bv_len - con->in_msg_pos.page_pos));

	/* map the segment's page only for the duration of the copy */
	kaddr = kmap(bv->bv_page) + bv->bv_offset;
	ret = ceph_tcp_recvmsg(con->sock, kaddr + con->in_msg_pos.page_pos,
			       want);
	if (ret > 0 && datacrc)
		con->in_data_crc = crc32c(con->in_data_crc,
					  kaddr + con->in_msg_pos.page_pos,
					  ret);
	kunmap(bv->bv_page);

	if (ret <= 0)
		return ret;

	con->in_msg_pos.data_pos += ret;
	con->in_msg_pos.page_pos += ret;
	if (con->in_msg_pos.page_pos == bv->bv_len) {
		/* segment filled, move on to the next one */
		con->in_msg_pos.page_pos = 0;
		iter_bio_next(bio_iter, bio_seg);
	}

	return ret;
}
#endif
1485
1486/*
1487 * read (part of) a message.
1488 */
1489static int read_partial_message(struct ceph_connection *con)
1490{
1491 struct ceph_msg *m = con->in_msg;
1492 int ret;
1493 int to, left;
1494 unsigned front_len, middle_len, data_len, data_off;
1495 int datacrc = con->msgr->nocrc;
1496 int skip;
1497 u64 seq;
1498
1499 dout("read_partial_message con %p msg %p\n", con, m);
1500
1501 /* header */
1502 while (con->in_base_pos < sizeof(con->in_hdr)) {
1503 left = sizeof(con->in_hdr) - con->in_base_pos;
1504 ret = ceph_tcp_recvmsg(con->sock,
1505 (char *)&con->in_hdr + con->in_base_pos,
1506 left);
1507 if (ret <= 0)
1508 return ret;
1509 con->in_base_pos += ret;
1510 if (con->in_base_pos == sizeof(con->in_hdr)) {
1511 u32 crc = crc32c(0, (void *)&con->in_hdr,
1512 sizeof(con->in_hdr) - sizeof(con->in_hdr.crc));
1513 if (crc != le32_to_cpu(con->in_hdr.crc)) {
1514 pr_err("read_partial_message bad hdr "
1515 " crc %u != expected %u\n",
1516 crc, con->in_hdr.crc);
1517 return -EBADMSG;
1518 }
1519 }
1520 }
1521 front_len = le32_to_cpu(con->in_hdr.front_len);
1522 if (front_len > CEPH_MSG_MAX_FRONT_LEN)
1523 return -EIO;
1524 middle_len = le32_to_cpu(con->in_hdr.middle_len);
1525 if (middle_len > CEPH_MSG_MAX_DATA_LEN)
1526 return -EIO;
1527 data_len = le32_to_cpu(con->in_hdr.data_len);
1528 if (data_len > CEPH_MSG_MAX_DATA_LEN)
1529 return -EIO;
1530 data_off = le16_to_cpu(con->in_hdr.data_off);
1531
1532 /* verify seq# */
1533 seq = le64_to_cpu(con->in_hdr.seq);
1534 if ((s64)seq - (s64)con->in_seq < 1) {
1535 pr_info("skipping %s%lld %s seq %lld, expected %lld\n",
1536 ENTITY_NAME(con->peer_name),
1537 ceph_pr_addr(&con->peer_addr.in_addr),
1538 seq, con->in_seq + 1);
1539 con->in_base_pos = -front_len - middle_len - data_len -
1540 sizeof(m->footer);
1541 con->in_tag = CEPH_MSGR_TAG_READY;
1542 con->in_seq++;
1543 return 0;
1544 } else if ((s64)seq - (s64)con->in_seq > 1) {
1545 pr_err("read_partial_message bad seq %lld expected %lld\n",
1546 seq, con->in_seq + 1);
1547 con->error_msg = "bad message sequence # for incoming message";
1548 return -EBADMSG;
1549 }
1550
1551 /* allocate message? */
1552 if (!con->in_msg) {
1553 dout("got hdr type %d front %d data %d\n", con->in_hdr.type,
1554 con->in_hdr.front_len, con->in_hdr.data_len);
1555 skip = 0;
1556 con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip);
1557 if (skip) {
1558 /* skip this message */
1559 dout("alloc_msg said skip message\n");
1560 BUG_ON(con->in_msg);
1561 con->in_base_pos = -front_len - middle_len - data_len -
1562 sizeof(m->footer);
1563 con->in_tag = CEPH_MSGR_TAG_READY;
1564 con->in_seq++;
1565 return 0;
1566 }
1567 if (!con->in_msg) {
1568 con->error_msg =
1569 "error allocating memory for incoming message";
1570 return -ENOMEM;
1571 }
1572 m = con->in_msg;
1573 m->front.iov_len = 0; /* haven't read it yet */
1574 if (m->middle)
1575 m->middle->vec.iov_len = 0;
1576
1577 con->in_msg_pos.page = 0;
1578 if (m->pages)
1579 con->in_msg_pos.page_pos = data_off & ~PAGE_MASK;
1580 else
1581 con->in_msg_pos.page_pos = 0;
1582 con->in_msg_pos.data_pos = 0;
1583 }
1584
1585 /* front */
1586 ret = read_partial_message_section(con, &m->front, front_len,
1587 &con->in_front_crc);
1588 if (ret <= 0)
1589 return ret;
1590
1591 /* middle */
1592 if (m->middle) {
1593 ret = read_partial_message_section(con, &m->middle->vec,
1594 middle_len,
1595 &con->in_middle_crc);
1596 if (ret <= 0)
1597 return ret;
1598 }
1599#ifdef CONFIG_BLOCK
1600 if (m->bio && !m->bio_iter)
1601 init_bio_iter(m->bio, &m->bio_iter, &m->bio_seg);
1602#endif
1603
1604 /* (page) data */
1605 while (con->in_msg_pos.data_pos < data_len) {
1606 if (m->pages) {
1607 ret = read_partial_message_pages(con, m->pages,
1608 data_len, datacrc);
1609 if (ret <= 0)
1610 return ret;
1611#ifdef CONFIG_BLOCK
1612 } else if (m->bio) {
1613
1614 ret = read_partial_message_bio(con,
1615 &m->bio_iter, &m->bio_seg,
1616 data_len, datacrc);
1617 if (ret <= 0)
1618 return ret;
1619#endif
1620 } else {
1621 BUG_ON(1);
1622 }
1623 }
1624
1625 /* footer */
1626 to = sizeof(m->hdr) + sizeof(m->footer);
1627 while (con->in_base_pos < to) {
1628 left = to - con->in_base_pos;
1629 ret = ceph_tcp_recvmsg(con->sock, (char *)&m->footer +
1630 (con->in_base_pos - sizeof(m->hdr)),
1631 left);
1632 if (ret <= 0)
1633 return ret;
1634 con->in_base_pos += ret;
1635 }
1636 dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n",
1637 m, front_len, m->footer.front_crc, middle_len,
1638 m->footer.middle_crc, data_len, m->footer.data_crc);
1639
1640 /* crc ok? */
1641 if (con->in_front_crc != le32_to_cpu(m->footer.front_crc)) {
1642 pr_err("read_partial_message %p front crc %u != exp. %u\n",
1643 m, con->in_front_crc, m->footer.front_crc);
1644 return -EBADMSG;
1645 }
1646 if (con->in_middle_crc != le32_to_cpu(m->footer.middle_crc)) {
1647 pr_err("read_partial_message %p middle crc %u != exp %u\n",
1648 m, con->in_middle_crc, m->footer.middle_crc);
1649 return -EBADMSG;
1650 }
1651 if (datacrc &&
1652 (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 &&
1653 con->in_data_crc != le32_to_cpu(m->footer.data_crc)) {
1654 pr_err("read_partial_message %p data crc %u != exp. %u\n", m,
1655 con->in_data_crc, le32_to_cpu(m->footer.data_crc));
1656 return -EBADMSG;
1657 }
1658
1659 return 1; /* done! */
1660}
1661
1662/*
1663 * Process message. This happens in the worker thread. The callback should
1664 * be careful not to do anything that waits on other incoming messages or it
1665 * may deadlock.
1666 */
1667static void process_message(struct ceph_connection *con)
1668{
1669 struct ceph_msg *msg;
1670
1671 msg = con->in_msg;
1672 con->in_msg = NULL;
1673
1674 /* if first message, set peer_name */
1675 if (con->peer_name.type == 0)
1676 con->peer_name = msg->hdr.src;
1677
1678 con->in_seq++;
1679 mutex_unlock(&con->mutex);
1680
1681 dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n",
1682 msg, le64_to_cpu(msg->hdr.seq),
1683 ENTITY_NAME(msg->hdr.src),
1684 le16_to_cpu(msg->hdr.type),
1685 ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
1686 le32_to_cpu(msg->hdr.front_len),
1687 le32_to_cpu(msg->hdr.data_len),
1688 con->in_front_crc, con->in_middle_crc, con->in_data_crc);
1689 con->ops->dispatch(con, msg);
1690
1691 mutex_lock(&con->mutex);
1692 prepare_read_tag(con);
1693}
1694
1695
1696/*
1697 * Write something to the socket. Called in a worker thread when the
1698 * socket appears to be writeable and we have something ready to send.
1699 */
1700static int try_write(struct ceph_connection *con)
1701{
1702 struct ceph_messenger *msgr = con->msgr;
1703 int ret = 1;
1704
1705 dout("try_write start %p state %lu nref %d\n", con, con->state,
1706 atomic_read(&con->nref));
1707
1708more:
1709 dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes);
1710
1711 /* open the socket first? */
1712 if (con->sock == NULL) {
1713 /*
1714 * if we were STANDBY and are reconnecting _this_
1715 * connection, bump connect_seq now. Always bump
1716 * global_seq.
1717 */
1718 if (test_and_clear_bit(STANDBY, &con->state))
1719 con->connect_seq++;
1720
1721 prepare_write_banner(msgr, con);
1722 prepare_write_connect(msgr, con, 1);
1723 prepare_read_banner(con);
1724 set_bit(CONNECTING, &con->state);
1725 clear_bit(NEGOTIATING, &con->state);
1726
1727 BUG_ON(con->in_msg);
1728 con->in_tag = CEPH_MSGR_TAG_READY;
1729 dout("try_write initiating connect on %p new state %lu\n",
1730 con, con->state);
1731 con->sock = ceph_tcp_connect(con);
1732 if (IS_ERR(con->sock)) {
1733 con->sock = NULL;
1734 con->error_msg = "connect error";
1735 ret = -1;
1736 goto out;
1737 }
1738 }
1739
1740more_kvec:
1741 /* kvec data queued? */
1742 if (con->out_skip) {
1743 ret = write_partial_skip(con);
1744 if (ret <= 0)
1745 goto done;
1746 if (ret < 0) {
1747 dout("try_write write_partial_skip err %d\n", ret);
1748 goto done;
1749 }
1750 }
1751 if (con->out_kvec_left) {
1752 ret = write_partial_kvec(con);
1753 if (ret <= 0)
1754 goto done;
1755 }
1756
1757 /* msg pages? */
1758 if (con->out_msg) {
1759 if (con->out_msg_done) {
1760 ceph_msg_put(con->out_msg);
1761 con->out_msg = NULL; /* we're done with this one */
1762 goto do_next;
1763 }
1764
1765 ret = write_partial_msg_pages(con);
1766 if (ret == 1)
1767 goto more_kvec; /* we need to send the footer, too! */
1768 if (ret == 0)
1769 goto done;
1770 if (ret < 0) {
1771 dout("try_write write_partial_msg_pages err %d\n",
1772 ret);
1773 goto done;
1774 }
1775 }
1776
1777do_next:
1778 if (!test_bit(CONNECTING, &con->state)) {
1779 /* is anything else pending? */
1780 if (!list_empty(&con->out_queue)) {
1781 prepare_write_message(con);
1782 goto more;
1783 }
1784 if (con->in_seq > con->in_seq_acked) {
1785 prepare_write_ack(con);
1786 goto more;
1787 }
1788 if (test_and_clear_bit(KEEPALIVE_PENDING, &con->state)) {
1789 prepare_write_keepalive(con);
1790 goto more;
1791 }
1792 }
1793
1794 /* Nothing to do! */
1795 clear_bit(WRITE_PENDING, &con->state);
1796 dout("try_write nothing else to write.\n");
1797done:
1798 ret = 0;
1799out:
1800 dout("try_write done on %p\n", con);
1801 return ret;
1802}
1803
1804
1805
1806/*
1807 * Read what we can from the socket.
1808 */
1809static int try_read(struct ceph_connection *con)
1810{
1811 int ret = -1;
1812
1813 if (!con->sock)
1814 return 0;
1815
1816 if (test_bit(STANDBY, &con->state))
1817 return 0;
1818
1819 dout("try_read start on %p\n", con);
1820
1821more:
1822 dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag,
1823 con->in_base_pos);
1824 if (test_bit(CONNECTING, &con->state)) {
1825 if (!test_bit(NEGOTIATING, &con->state)) {
1826 dout("try_read connecting\n");
1827 ret = read_partial_banner(con);
1828 if (ret <= 0)
1829 goto done;
1830 if (process_banner(con) < 0) {
1831 ret = -1;
1832 goto out;
1833 }
1834 }
1835 ret = read_partial_connect(con);
1836 if (ret <= 0)
1837 goto done;
1838 if (process_connect(con) < 0) {
1839 ret = -1;
1840 goto out;
1841 }
1842 goto more;
1843 }
1844
1845 if (con->in_base_pos < 0) {
1846 /*
1847 * skipping + discarding content.
1848 *
1849 * FIXME: there must be a better way to do this!
1850 */
1851 static char buf[1024];
1852 int skip = min(1024, -con->in_base_pos);
1853 dout("skipping %d / %d bytes\n", skip, -con->in_base_pos);
1854 ret = ceph_tcp_recvmsg(con->sock, buf, skip);
1855 if (ret <= 0)
1856 goto done;
1857 con->in_base_pos += ret;
1858 if (con->in_base_pos)
1859 goto more;
1860 }
1861 if (con->in_tag == CEPH_MSGR_TAG_READY) {
1862 /*
1863 * what's next?
1864 */
1865 ret = ceph_tcp_recvmsg(con->sock, &con->in_tag, 1);
1866 if (ret <= 0)
1867 goto done;
1868 dout("try_read got tag %d\n", (int)con->in_tag);
1869 switch (con->in_tag) {
1870 case CEPH_MSGR_TAG_MSG:
1871 prepare_read_message(con);
1872 break;
1873 case CEPH_MSGR_TAG_ACK:
1874 prepare_read_ack(con);
1875 break;
1876 case CEPH_MSGR_TAG_CLOSE:
1877 set_bit(CLOSED, &con->state); /* fixme */
1878 goto done;
1879 default:
1880 goto bad_tag;
1881 }
1882 }
1883 if (con->in_tag == CEPH_MSGR_TAG_MSG) {
1884 ret = read_partial_message(con);
1885 if (ret <= 0) {
1886 switch (ret) {
1887 case -EBADMSG:
1888 con->error_msg = "bad crc";
1889 ret = -EIO;
1890 goto out;
1891 case -EIO:
1892 con->error_msg = "io error";
1893 goto out;
1894 default:
1895 goto done;
1896 }
1897 }
1898 if (con->in_tag == CEPH_MSGR_TAG_READY)
1899 goto more;
1900 process_message(con);
1901 goto more;
1902 }
1903 if (con->in_tag == CEPH_MSGR_TAG_ACK) {
1904 ret = read_partial_ack(con);
1905 if (ret <= 0)
1906 goto done;
1907 process_ack(con);
1908 goto more;
1909 }
1910
1911done:
1912 ret = 0;
1913out:
1914 dout("try_read done on %p\n", con);
1915 return ret;
1916
1917bad_tag:
1918 pr_err("try_read bad con->in_tag = %d\n", (int)con->in_tag);
1919 con->error_msg = "protocol error, garbage tag";
1920 ret = -1;
1921 goto out;
1922}
1923
1924
1925/*
1926 * Atomically queue work on a connection. Bump @con reference to
1927 * avoid races with connection teardown.
1928 *
1929 * There is some trickery going on with QUEUED and BUSY because we
1930 * only want a _single_ thread operating on each connection at any
1931 * point in time, but we want to use all available CPUs.
1932 *
1933 * The worker thread only proceeds if it can atomically set BUSY. It
1934 * clears QUEUED and does it's thing. When it thinks it's done, it
1935 * clears BUSY, then rechecks QUEUED.. if it's set again, it loops
1936 * (tries again to set BUSY).
1937 *
1938 * To queue work, we first set QUEUED, _then_ if BUSY isn't set, we
1939 * try to queue work. If that fails (work is already queued, or BUSY)
1940 * we give up (work also already being done or is queued) but leave QUEUED
1941 * set so that the worker thread will loop if necessary.
1942 */
1943static void queue_con(struct ceph_connection *con)
1944{
1945 if (test_bit(DEAD, &con->state)) {
1946 dout("queue_con %p ignoring: DEAD\n",
1947 con);
1948 return;
1949 }
1950
1951 if (!con->ops->get(con)) {
1952 dout("queue_con %p ref count 0\n", con);
1953 return;
1954 }
1955
1956 set_bit(QUEUED, &con->state);
1957 if (test_bit(BUSY, &con->state)) {
1958 dout("queue_con %p - already BUSY\n", con);
1959 con->ops->put(con);
1960 } else if (!queue_work(ceph_msgr_wq, &con->work.work)) {
1961 dout("queue_con %p - already queued\n", con);
1962 con->ops->put(con);
1963 } else {
1964 dout("queue_con %p\n", con);
1965 }
1966}
1967
1968/*
1969 * Do some work on a connection. Drop a connection ref when we're done.
1970 */
1971static void con_work(struct work_struct *work)
1972{
1973 struct ceph_connection *con = container_of(work, struct ceph_connection,
1974 work.work);
1975 int backoff = 0;
1976
1977more:
1978 if (test_and_set_bit(BUSY, &con->state) != 0) {
1979 dout("con_work %p BUSY already set\n", con);
1980 goto out;
1981 }
1982 dout("con_work %p start, clearing QUEUED\n", con);
1983 clear_bit(QUEUED, &con->state);
1984
1985 mutex_lock(&con->mutex);
1986
1987 if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */
1988 dout("con_work CLOSED\n");
1989 con_close_socket(con);
1990 goto done;
1991 }
1992 if (test_and_clear_bit(OPENING, &con->state)) {
1993 /* reopen w/ new peer */
1994 dout("con_work OPENING\n");
1995 con_close_socket(con);
1996 }
1997
1998 if (test_and_clear_bit(SOCK_CLOSED, &con->state) ||
1999 try_read(con) < 0 ||
2000 try_write(con) < 0) {
2001 mutex_unlock(&con->mutex);
2002 backoff = 1;
2003 ceph_fault(con); /* error/fault path */
2004 goto done_unlocked;
2005 }
2006
2007done:
2008 mutex_unlock(&con->mutex);
2009
2010done_unlocked:
2011 clear_bit(BUSY, &con->state);
2012 dout("con->state=%lu\n", con->state);
2013 if (test_bit(QUEUED, &con->state)) {
2014 if (!backoff || test_bit(OPENING, &con->state)) {
2015 dout("con_work %p QUEUED reset, looping\n", con);
2016 goto more;
2017 }
2018 dout("con_work %p QUEUED reset, but just faulted\n", con);
2019 clear_bit(QUEUED, &con->state);
2020 }
2021 dout("con_work %p done\n", con);
2022
2023out:
2024 con->ops->put(con);
2025}
2026
2027
2028/*
2029 * Generic error/fault handler. A retry mechanism is used with
2030 * exponential backoff
2031 */
2032static void ceph_fault(struct ceph_connection *con)
2033{
2034 pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
2035 ceph_pr_addr(&con->peer_addr.in_addr), con->error_msg);
2036 dout("fault %p state %lu to peer %s\n",
2037 con, con->state, ceph_pr_addr(&con->peer_addr.in_addr));
2038
2039 if (test_bit(LOSSYTX, &con->state)) {
2040 dout("fault on LOSSYTX channel\n");
2041 goto out;
2042 }
2043
2044 mutex_lock(&con->mutex);
2045 if (test_bit(CLOSED, &con->state))
2046 goto out_unlock;
2047
2048 con_close_socket(con);
2049
2050 if (con->in_msg) {
2051 ceph_msg_put(con->in_msg);
2052 con->in_msg = NULL;
2053 }
2054
2055 /* Requeue anything that hasn't been acked */
2056 list_splice_init(&con->out_sent, &con->out_queue);
2057
2058 /* If there are no messages in the queue, place the connection
2059 * in a STANDBY state (i.e., don't try to reconnect just yet). */
2060 if (list_empty(&con->out_queue) && !con->out_keepalive_pending) {
2061 dout("fault setting STANDBY\n");
2062 set_bit(STANDBY, &con->state);
2063 } else {
2064 /* retry after a delay. */
2065 if (con->delay == 0)
2066 con->delay = BASE_DELAY_INTERVAL;
2067 else if (con->delay < MAX_DELAY_INTERVAL)
2068 con->delay *= 2;
2069 dout("fault queueing %p delay %lu\n", con, con->delay);
2070 con->ops->get(con);
2071 if (queue_delayed_work(ceph_msgr_wq, &con->work,
2072 round_jiffies_relative(con->delay)) == 0)
2073 con->ops->put(con);
2074 }
2075
2076out_unlock:
2077 mutex_unlock(&con->mutex);
2078out:
2079 /*
2080 * in case we faulted due to authentication, invalidate our
2081 * current tickets so that we can get new ones.
2082 */
2083 if (con->auth_retry && con->ops->invalidate_authorizer) {
2084 dout("calling invalidate_authorizer()\n");
2085 con->ops->invalidate_authorizer(con);
2086 }
2087
2088 if (con->ops->fault)
2089 con->ops->fault(con);
2090}
2091
2092
2093
2094/*
2095 * create a new messenger instance
2096 */
2097struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr,
2098 u32 supported_features,
2099 u32 required_features)
2100{
2101 struct ceph_messenger *msgr;
2102
2103 msgr = kzalloc(sizeof(*msgr), GFP_KERNEL);
2104 if (msgr == NULL)
2105 return ERR_PTR(-ENOMEM);
2106
2107 msgr->supported_features = supported_features;
2108 msgr->required_features = required_features;
2109
2110 spin_lock_init(&msgr->global_seq_lock);
2111
2112 /* the zero page is needed if a request is "canceled" while the message
2113 * is being written over the socket */
2114 msgr->zero_page = __page_cache_alloc(GFP_KERNEL | __GFP_ZERO);
2115 if (!msgr->zero_page) {
2116 kfree(msgr);
2117 return ERR_PTR(-ENOMEM);
2118 }
2119 kmap(msgr->zero_page);
2120
2121 if (myaddr)
2122 msgr->inst.addr = *myaddr;
2123
2124 /* select a random nonce */
2125 msgr->inst.addr.type = 0;
2126 get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce));
2127 encode_my_addr(msgr);
2128
2129 dout("messenger_create %p\n", msgr);
2130 return msgr;
2131}
2132EXPORT_SYMBOL(ceph_messenger_create);
2133
2134void ceph_messenger_destroy(struct ceph_messenger *msgr)
2135{
2136 dout("destroy %p\n", msgr);
2137 kunmap(msgr->zero_page);
2138 __free_page(msgr->zero_page);
2139 kfree(msgr);
2140 dout("destroyed messenger %p\n", msgr);
2141}
2142EXPORT_SYMBOL(ceph_messenger_destroy);
2143
2144/*
2145 * Queue up an outgoing message on the given connection.
2146 */
2147void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
2148{
2149 if (test_bit(CLOSED, &con->state)) {
2150 dout("con_send %p closed, dropping %p\n", con, msg);
2151 ceph_msg_put(msg);
2152 return;
2153 }
2154
2155 /* set src+dst */
2156 msg->hdr.src = con->msgr->inst.name;
2157
2158 BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len));
2159
2160 msg->needs_out_seq = true;
2161
2162 /* queue */
2163 mutex_lock(&con->mutex);
2164 BUG_ON(!list_empty(&msg->list_head));
2165 list_add_tail(&msg->list_head, &con->out_queue);
2166 dout("----- %p to %s%lld %d=%s len %d+%d+%d -----\n", msg,
2167 ENTITY_NAME(con->peer_name), le16_to_cpu(msg->hdr.type),
2168 ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
2169 le32_to_cpu(msg->hdr.front_len),
2170 le32_to_cpu(msg->hdr.middle_len),
2171 le32_to_cpu(msg->hdr.data_len));
2172 mutex_unlock(&con->mutex);
2173
2174 /* if there wasn't anything waiting to send before, queue
2175 * new work */
2176 if (test_and_set_bit(WRITE_PENDING, &con->state) == 0)
2177 queue_con(con);
2178}
2179EXPORT_SYMBOL(ceph_con_send);
2180
2181/*
2182 * Revoke a message that was previously queued for send
2183 */
2184void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg)
2185{
2186 mutex_lock(&con->mutex);
2187 if (!list_empty(&msg->list_head)) {
2188 dout("con_revoke %p msg %p - was on queue\n", con, msg);
2189 list_del_init(&msg->list_head);
2190 ceph_msg_put(msg);
2191 msg->hdr.seq = 0;
2192 }
2193 if (con->out_msg == msg) {
2194 dout("con_revoke %p msg %p - was sending\n", con, msg);
2195 con->out_msg = NULL;
2196 if (con->out_kvec_is_msg) {
2197 con->out_skip = con->out_kvec_bytes;
2198 con->out_kvec_is_msg = false;
2199 }
2200 ceph_msg_put(msg);
2201 msg->hdr.seq = 0;
2202 }
2203 mutex_unlock(&con->mutex);
2204}
2205
2206/*
2207 * Revoke a message that we may be reading data into
2208 */
2209void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg)
2210{
2211 mutex_lock(&con->mutex);
2212 if (con->in_msg && con->in_msg == msg) {
2213 unsigned front_len = le32_to_cpu(con->in_hdr.front_len);
2214 unsigned middle_len = le32_to_cpu(con->in_hdr.middle_len);
2215 unsigned data_len = le32_to_cpu(con->in_hdr.data_len);
2216
2217 /* skip rest of message */
2218 dout("con_revoke_pages %p msg %p revoked\n", con, msg);
2219 con->in_base_pos = con->in_base_pos -
2220 sizeof(struct ceph_msg_header) -
2221 front_len -
2222 middle_len -
2223 data_len -
2224 sizeof(struct ceph_msg_footer);
2225 ceph_msg_put(con->in_msg);
2226 con->in_msg = NULL;
2227 con->in_tag = CEPH_MSGR_TAG_READY;
2228 con->in_seq++;
2229 } else {
2230 dout("con_revoke_pages %p msg %p pages %p no-op\n",
2231 con, con->in_msg, msg);
2232 }
2233 mutex_unlock(&con->mutex);
2234}
2235
2236/*
2237 * Queue a keepalive byte to ensure the tcp connection is alive.
2238 */
2239void ceph_con_keepalive(struct ceph_connection *con)
2240{
2241 if (test_and_set_bit(KEEPALIVE_PENDING, &con->state) == 0 &&
2242 test_and_set_bit(WRITE_PENDING, &con->state) == 0)
2243 queue_con(con);
2244}
2245EXPORT_SYMBOL(ceph_con_keepalive);
2246
2247
2248/*
2249 * construct a new message with given type, size
2250 * the new msg has a ref count of 1.
2251 */
2252struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags)
2253{
2254 struct ceph_msg *m;
2255
2256 m = kmalloc(sizeof(*m), flags);
2257 if (m == NULL)
2258 goto out;
2259 kref_init(&m->kref);
2260 INIT_LIST_HEAD(&m->list_head);
2261
2262 m->hdr.tid = 0;
2263 m->hdr.type = cpu_to_le16(type);
2264 m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT);
2265 m->hdr.version = 0;
2266 m->hdr.front_len = cpu_to_le32(front_len);
2267 m->hdr.middle_len = 0;
2268 m->hdr.data_len = 0;
2269 m->hdr.data_off = 0;
2270 m->hdr.reserved = 0;
2271 m->footer.front_crc = 0;
2272 m->footer.middle_crc = 0;
2273 m->footer.data_crc = 0;
2274 m->footer.flags = 0;
2275 m->front_max = front_len;
2276 m->front_is_vmalloc = false;
2277 m->more_to_follow = false;
2278 m->pool = NULL;
2279
2280 /* front */
2281 if (front_len) {
2282 if (front_len > PAGE_CACHE_SIZE) {
2283 m->front.iov_base = __vmalloc(front_len, flags,
2284 PAGE_KERNEL);
2285 m->front_is_vmalloc = true;
2286 } else {
2287 m->front.iov_base = kmalloc(front_len, flags);
2288 }
2289 if (m->front.iov_base == NULL) {
2290 pr_err("msg_new can't allocate %d bytes\n",
2291 front_len);
2292 goto out2;
2293 }
2294 } else {
2295 m->front.iov_base = NULL;
2296 }
2297 m->front.iov_len = front_len;
2298
2299 /* middle */
2300 m->middle = NULL;
2301
2302 /* data */
2303 m->nr_pages = 0;
2304 m->pages = NULL;
2305 m->pagelist = NULL;
2306 m->bio = NULL;
2307 m->bio_iter = NULL;
2308 m->bio_seg = 0;
2309 m->trail = NULL;
2310
2311 dout("ceph_msg_new %p front %d\n", m, front_len);
2312 return m;
2313
2314out2:
2315 ceph_msg_put(m);
2316out:
2317 pr_err("msg_new can't create type %d front %d\n", type, front_len);
2318 return NULL;
2319}
2320EXPORT_SYMBOL(ceph_msg_new);
2321
2322/*
2323 * Allocate "middle" portion of a message, if it is needed and wasn't
2324 * allocated by alloc_msg. This allows us to read a small fixed-size
2325 * per-type header in the front and then gracefully fail (i.e.,
2326 * propagate the error to the caller based on info in the front) when
2327 * the middle is too large.
2328 */
2329static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg)
2330{
2331 int type = le16_to_cpu(msg->hdr.type);
2332 int middle_len = le32_to_cpu(msg->hdr.middle_len);
2333
2334 dout("alloc_middle %p type %d %s middle_len %d\n", msg, type,
2335 ceph_msg_type_name(type), middle_len);
2336 BUG_ON(!middle_len);
2337 BUG_ON(msg->middle);
2338
2339 msg->middle = ceph_buffer_new(middle_len, GFP_NOFS);
2340 if (!msg->middle)
2341 return -ENOMEM;
2342 return 0;
2343}
2344
2345/*
2346 * Generic message allocator, for incoming messages.
2347 */
2348static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
2349 struct ceph_msg_header *hdr,
2350 int *skip)
2351{
2352 int type = le16_to_cpu(hdr->type);
2353 int front_len = le32_to_cpu(hdr->front_len);
2354 int middle_len = le32_to_cpu(hdr->middle_len);
2355 struct ceph_msg *msg = NULL;
2356 int ret;
2357
2358 if (con->ops->alloc_msg) {
2359 mutex_unlock(&con->mutex);
2360 msg = con->ops->alloc_msg(con, hdr, skip);
2361 mutex_lock(&con->mutex);
2362 if (!msg || *skip)
2363 return NULL;
2364 }
2365 if (!msg) {
2366 *skip = 0;
2367 msg = ceph_msg_new(type, front_len, GFP_NOFS);
2368 if (!msg) {
2369 pr_err("unable to allocate msg type %d len %d\n",
2370 type, front_len);
2371 return NULL;
2372 }
2373 }
2374 memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr));
2375
2376 if (middle_len && !msg->middle) {
2377 ret = ceph_alloc_middle(con, msg);
2378 if (ret < 0) {
2379 ceph_msg_put(msg);
2380 return NULL;
2381 }
2382 }
2383
2384 return msg;
2385}
2386
2387
2388/*
2389 * Free a generically kmalloc'd message.
2390 */
2391void ceph_msg_kfree(struct ceph_msg *m)
2392{
2393 dout("msg_kfree %p\n", m);
2394 if (m->front_is_vmalloc)
2395 vfree(m->front.iov_base);
2396 else
2397 kfree(m->front.iov_base);
2398 kfree(m);
2399}
2400
2401/*
2402 * Drop a msg ref. Destroy as needed.
2403 */
2404void ceph_msg_last_put(struct kref *kref)
2405{
2406 struct ceph_msg *m = container_of(kref, struct ceph_msg, kref);
2407
2408 dout("ceph_msg_put last one on %p\n", m);
2409 WARN_ON(!list_empty(&m->list_head));
2410
2411 /* drop middle, data, if any */
2412 if (m->middle) {
2413 ceph_buffer_put(m->middle);
2414 m->middle = NULL;
2415 }
2416 m->nr_pages = 0;
2417 m->pages = NULL;
2418
2419 if (m->pagelist) {
2420 ceph_pagelist_release(m->pagelist);
2421 kfree(m->pagelist);
2422 m->pagelist = NULL;
2423 }
2424
2425 m->trail = NULL;
2426
2427 if (m->pool)
2428 ceph_msgpool_put(m->pool, m);
2429 else
2430 ceph_msg_kfree(m);
2431}
2432EXPORT_SYMBOL(ceph_msg_last_put);
2433
2434void ceph_msg_dump(struct ceph_msg *msg)
2435{
2436 pr_debug("msg_dump %p (front_max %d nr_pages %d)\n", msg,
2437 msg->front_max, msg->nr_pages);
2438 print_hex_dump(KERN_DEBUG, "header: ",
2439 DUMP_PREFIX_OFFSET, 16, 1,
2440 &msg->hdr, sizeof(msg->hdr), true);
2441 print_hex_dump(KERN_DEBUG, " front: ",
2442 DUMP_PREFIX_OFFSET, 16, 1,
2443 msg->front.iov_base, msg->front.iov_len, true);
2444 if (msg->middle)
2445 print_hex_dump(KERN_DEBUG, "middle: ",
2446 DUMP_PREFIX_OFFSET, 16, 1,
2447 msg->middle->vec.iov_base,
2448 msg->middle->vec.iov_len, true);
2449 print_hex_dump(KERN_DEBUG, "footer: ",
2450 DUMP_PREFIX_OFFSET, 16, 1,
2451 &msg->footer, sizeof(msg->footer), true);
2452}
2453EXPORT_SYMBOL(ceph_msg_dump);
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
new file mode 100644
index 000000000000..8a079399174a
--- /dev/null
+++ b/net/ceph/mon_client.c
@@ -0,0 +1,1027 @@
1#include <linux/ceph/ceph_debug.h>
2
3#include <linux/module.h>
4#include <linux/types.h>
5#include <linux/slab.h>
6#include <linux/random.h>
7#include <linux/sched.h>
8
9#include <linux/ceph/mon_client.h>
10#include <linux/ceph/libceph.h>
11#include <linux/ceph/decode.h>
12
13#include <linux/ceph/auth.h>
14
15/*
16 * Interact with Ceph monitor cluster. Handle requests for new map
17 * versions, and periodically resend as needed. Also implement
18 * statfs() and umount().
19 *
20 * A small cluster of Ceph "monitors" are responsible for managing critical
21 * cluster configuration and state information. An odd number (e.g., 3, 5)
22 * of cmon daemons use a modified version of the Paxos part-time parliament
23 * algorithm to manage the MDS map (mds cluster membership), OSD map, and
24 * list of clients who have mounted the file system.
25 *
26 * We maintain an open, active session with a monitor at all times in order to
27 * receive timely MDSMap updates. We periodically send a keepalive byte on the
28 * TCP socket to ensure we detect a failure. If the connection does break, we
29 * randomly hunt for a new monitor. Once the connection is reestablished, we
30 * resend any outstanding requests.
31 */
32
33static const struct ceph_connection_operations mon_con_ops;
34
35static int __validate_auth(struct ceph_mon_client *monc);
36
37/*
38 * Decode a monmap blob (e.g., during mount).
39 */
40struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
41{
42 struct ceph_monmap *m = NULL;
43 int i, err = -EINVAL;
44 struct ceph_fsid fsid;
45 u32 epoch, num_mon;
46 u16 version;
47 u32 len;
48
49 ceph_decode_32_safe(&p, end, len, bad);
50 ceph_decode_need(&p, end, len, bad);
51
52 dout("monmap_decode %p %p len %d\n", p, end, (int)(end-p));
53
54 ceph_decode_16_safe(&p, end, version, bad);
55
56 ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad);
57 ceph_decode_copy(&p, &fsid, sizeof(fsid));
58 epoch = ceph_decode_32(&p);
59
60 num_mon = ceph_decode_32(&p);
61 ceph_decode_need(&p, end, num_mon*sizeof(m->mon_inst[0]), bad);
62
63 if (num_mon >= CEPH_MAX_MON)
64 goto bad;
65 m = kmalloc(sizeof(*m) + sizeof(m->mon_inst[0])*num_mon, GFP_NOFS);
66 if (m == NULL)
67 return ERR_PTR(-ENOMEM);
68 m->fsid = fsid;
69 m->epoch = epoch;
70 m->num_mon = num_mon;
71 ceph_decode_copy(&p, m->mon_inst, num_mon*sizeof(m->mon_inst[0]));
72 for (i = 0; i < num_mon; i++)
73 ceph_decode_addr(&m->mon_inst[i].addr);
74
75 dout("monmap_decode epoch %d, num_mon %d\n", m->epoch,
76 m->num_mon);
77 for (i = 0; i < m->num_mon; i++)
78 dout("monmap_decode mon%d is %s\n", i,
79 ceph_pr_addr(&m->mon_inst[i].addr.in_addr));
80 return m;
81
82bad:
83 dout("monmap_decode failed with %d\n", err);
84 kfree(m);
85 return ERR_PTR(err);
86}
87
88/*
89 * return true if *addr is included in the monmap.
90 */
91int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr)
92{
93 int i;
94
95 for (i = 0; i < m->num_mon; i++)
96 if (memcmp(addr, &m->mon_inst[i].addr, sizeof(*addr)) == 0)
97 return 1;
98 return 0;
99}
100
101/*
102 * Send an auth request.
103 */
104static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
105{
106 monc->pending_auth = 1;
107 monc->m_auth->front.iov_len = len;
108 monc->m_auth->hdr.front_len = cpu_to_le32(len);
109 ceph_con_revoke(monc->con, monc->m_auth);
110 ceph_msg_get(monc->m_auth); /* keep our ref */
111 ceph_con_send(monc->con, monc->m_auth);
112}
113
114/*
115 * Close monitor session, if any.
116 */
117static void __close_session(struct ceph_mon_client *monc)
118{
119 if (monc->con) {
120 dout("__close_session closing mon%d\n", monc->cur_mon);
121 ceph_con_revoke(monc->con, monc->m_auth);
122 ceph_con_close(monc->con);
123 monc->cur_mon = -1;
124 monc->pending_auth = 0;
125 ceph_auth_reset(monc->auth);
126 }
127}
128
129/*
130 * Open a session with a (new) monitor.
131 */
132static int __open_session(struct ceph_mon_client *monc)
133{
134 char r;
135 int ret;
136
137 if (monc->cur_mon < 0) {
138 get_random_bytes(&r, 1);
139 monc->cur_mon = r % monc->monmap->num_mon;
140 dout("open_session num=%d r=%d -> mon%d\n",
141 monc->monmap->num_mon, r, monc->cur_mon);
142 monc->sub_sent = 0;
143 monc->sub_renew_after = jiffies; /* i.e., expired */
144 monc->want_next_osdmap = !!monc->want_next_osdmap;
145
146 dout("open_session mon%d opening\n", monc->cur_mon);
147 monc->con->peer_name.type = CEPH_ENTITY_TYPE_MON;
148 monc->con->peer_name.num = cpu_to_le64(monc->cur_mon);
149 ceph_con_open(monc->con,
150 &monc->monmap->mon_inst[monc->cur_mon].addr);
151
152 /* initiatiate authentication handshake */
153 ret = ceph_auth_build_hello(monc->auth,
154 monc->m_auth->front.iov_base,
155 monc->m_auth->front_max);
156 __send_prepared_auth_request(monc, ret);
157 } else {
158 dout("open_session mon%d already open\n", monc->cur_mon);
159 }
160 return 0;
161}
162
163static bool __sub_expired(struct ceph_mon_client *monc)
164{
165 return time_after_eq(jiffies, monc->sub_renew_after);
166}
167
168/*
169 * Reschedule delayed work timer.
170 */
171static void __schedule_delayed(struct ceph_mon_client *monc)
172{
173 unsigned delay;
174
175 if (monc->cur_mon < 0 || __sub_expired(monc))
176 delay = 10 * HZ;
177 else
178 delay = 20 * HZ;
179 dout("__schedule_delayed after %u\n", delay);
180 schedule_delayed_work(&monc->delayed_work, delay);
181}
182
183/*
184 * Send subscribe request for mdsmap and/or osdmap.
185 */
186static void __send_subscribe(struct ceph_mon_client *monc)
187{
188 dout("__send_subscribe sub_sent=%u exp=%u want_osd=%d\n",
189 (unsigned)monc->sub_sent, __sub_expired(monc),
190 monc->want_next_osdmap);
191 if ((__sub_expired(monc) && !monc->sub_sent) ||
192 monc->want_next_osdmap == 1) {
193 struct ceph_msg *msg = monc->m_subscribe;
194 struct ceph_mon_subscribe_item *i;
195 void *p, *end;
196 int num;
197
198 p = msg->front.iov_base;
199 end = p + msg->front_max;
200
201 num = 1 + !!monc->want_next_osdmap + !!monc->want_mdsmap;
202 ceph_encode_32(&p, num);
203
204 if (monc->want_next_osdmap) {
205 dout("__send_subscribe to 'osdmap' %u\n",
206 (unsigned)monc->have_osdmap);
207 ceph_encode_string(&p, end, "osdmap", 6);
208 i = p;
209 i->have = cpu_to_le64(monc->have_osdmap);
210 i->onetime = 1;
211 p += sizeof(*i);
212 monc->want_next_osdmap = 2; /* requested */
213 }
214 if (monc->want_mdsmap) {
215 dout("__send_subscribe to 'mdsmap' %u+\n",
216 (unsigned)monc->have_mdsmap);
217 ceph_encode_string(&p, end, "mdsmap", 6);
218 i = p;
219 i->have = cpu_to_le64(monc->have_mdsmap);
220 i->onetime = 0;
221 p += sizeof(*i);
222 }
223 ceph_encode_string(&p, end, "monmap", 6);
224 i = p;
225 i->have = 0;
226 i->onetime = 0;
227 p += sizeof(*i);
228
229 msg->front.iov_len = p - msg->front.iov_base;
230 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
231 ceph_con_revoke(monc->con, msg);
232 ceph_con_send(monc->con, ceph_msg_get(msg));
233
234 monc->sub_sent = jiffies | 1; /* never 0 */
235 }
236}
237
238static void handle_subscribe_ack(struct ceph_mon_client *monc,
239 struct ceph_msg *msg)
240{
241 unsigned seconds;
242 struct ceph_mon_subscribe_ack *h = msg->front.iov_base;
243
244 if (msg->front.iov_len < sizeof(*h))
245 goto bad;
246 seconds = le32_to_cpu(h->duration);
247
248 mutex_lock(&monc->mutex);
249 if (monc->hunting) {
250 pr_info("mon%d %s session established\n",
251 monc->cur_mon,
252 ceph_pr_addr(&monc->con->peer_addr.in_addr));
253 monc->hunting = false;
254 }
255 dout("handle_subscribe_ack after %d seconds\n", seconds);
256 monc->sub_renew_after = monc->sub_sent + (seconds >> 1)*HZ - 1;
257 monc->sub_sent = 0;
258 mutex_unlock(&monc->mutex);
259 return;
260bad:
261 pr_err("got corrupt subscribe-ack msg\n");
262 ceph_msg_dump(msg);
263}
264
265/*
266 * Keep track of which maps we have
267 */
268int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 got)
269{
270 mutex_lock(&monc->mutex);
271 monc->have_mdsmap = got;
272 mutex_unlock(&monc->mutex);
273 return 0;
274}
275EXPORT_SYMBOL(ceph_monc_got_mdsmap);
276
277int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got)
278{
279 mutex_lock(&monc->mutex);
280 monc->have_osdmap = got;
281 monc->want_next_osdmap = 0;
282 mutex_unlock(&monc->mutex);
283 return 0;
284}
285
286/*
287 * Register interest in the next osdmap
288 */
289void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
290{
291 dout("request_next_osdmap have %u\n", monc->have_osdmap);
292 mutex_lock(&monc->mutex);
293 if (!monc->want_next_osdmap)
294 monc->want_next_osdmap = 1;
295 if (monc->want_next_osdmap < 2)
296 __send_subscribe(monc);
297 mutex_unlock(&monc->mutex);
298}
299
300/*
301 *
302 */
303int ceph_monc_open_session(struct ceph_mon_client *monc)
304{
305 if (!monc->con) {
306 monc->con = kmalloc(sizeof(*monc->con), GFP_KERNEL);
307 if (!monc->con)
308 return -ENOMEM;
309 ceph_con_init(monc->client->msgr, monc->con);
310 monc->con->private = monc;
311 monc->con->ops = &mon_con_ops;
312 }
313
314 mutex_lock(&monc->mutex);
315 __open_session(monc);
316 __schedule_delayed(monc);
317 mutex_unlock(&monc->mutex);
318 return 0;
319}
320EXPORT_SYMBOL(ceph_monc_open_session);
321
322/*
323 * The monitor responds with mount ack indicate mount success. The
324 * included client ticket allows the client to talk to MDSs and OSDs.
325 */
326static void ceph_monc_handle_map(struct ceph_mon_client *monc,
327 struct ceph_msg *msg)
328{
329 struct ceph_client *client = monc->client;
330 struct ceph_monmap *monmap = NULL, *old = monc->monmap;
331 void *p, *end;
332
333 mutex_lock(&monc->mutex);
334
335 dout("handle_monmap\n");
336 p = msg->front.iov_base;
337 end = p + msg->front.iov_len;
338
339 monmap = ceph_monmap_decode(p, end);
340 if (IS_ERR(monmap)) {
341 pr_err("problem decoding monmap, %d\n",
342 (int)PTR_ERR(monmap));
343 goto out;
344 }
345
346 if (ceph_check_fsid(monc->client, &monmap->fsid) < 0) {
347 kfree(monmap);
348 goto out;
349 }
350
351 client->monc.monmap = monmap;
352 kfree(old);
353
354out:
355 mutex_unlock(&monc->mutex);
356 wake_up_all(&client->auth_wq);
357}
358
359/*
360 * generic requests (e.g., statfs, poolop)
361 */
362static struct ceph_mon_generic_request *__lookup_generic_req(
363 struct ceph_mon_client *monc, u64 tid)
364{
365 struct ceph_mon_generic_request *req;
366 struct rb_node *n = monc->generic_request_tree.rb_node;
367
368 while (n) {
369 req = rb_entry(n, struct ceph_mon_generic_request, node);
370 if (tid < req->tid)
371 n = n->rb_left;
372 else if (tid > req->tid)
373 n = n->rb_right;
374 else
375 return req;
376 }
377 return NULL;
378}
379
380static void __insert_generic_request(struct ceph_mon_client *monc,
381 struct ceph_mon_generic_request *new)
382{
383 struct rb_node **p = &monc->generic_request_tree.rb_node;
384 struct rb_node *parent = NULL;
385 struct ceph_mon_generic_request *req = NULL;
386
387 while (*p) {
388 parent = *p;
389 req = rb_entry(parent, struct ceph_mon_generic_request, node);
390 if (new->tid < req->tid)
391 p = &(*p)->rb_left;
392 else if (new->tid > req->tid)
393 p = &(*p)->rb_right;
394 else
395 BUG();
396 }
397
398 rb_link_node(&new->node, parent, p);
399 rb_insert_color(&new->node, &monc->generic_request_tree);
400}
401
402static void release_generic_request(struct kref *kref)
403{
404 struct ceph_mon_generic_request *req =
405 container_of(kref, struct ceph_mon_generic_request, kref);
406
407 if (req->reply)
408 ceph_msg_put(req->reply);
409 if (req->request)
410 ceph_msg_put(req->request);
411
412 kfree(req);
413}
414
415static void put_generic_request(struct ceph_mon_generic_request *req)
416{
417 kref_put(&req->kref, release_generic_request);
418}
419
420static void get_generic_request(struct ceph_mon_generic_request *req)
421{
422 kref_get(&req->kref);
423}
424
425static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
426 struct ceph_msg_header *hdr,
427 int *skip)
428{
429 struct ceph_mon_client *monc = con->private;
430 struct ceph_mon_generic_request *req;
431 u64 tid = le64_to_cpu(hdr->tid);
432 struct ceph_msg *m;
433
434 mutex_lock(&monc->mutex);
435 req = __lookup_generic_req(monc, tid);
436 if (!req) {
437 dout("get_generic_reply %lld dne\n", tid);
438 *skip = 1;
439 m = NULL;
440 } else {
441 dout("get_generic_reply %lld got %p\n", tid, req->reply);
442 m = ceph_msg_get(req->reply);
443 /*
444 * we don't need to track the connection reading into
445 * this reply because we only have one open connection
446 * at a time, ever.
447 */
448 }
449 mutex_unlock(&monc->mutex);
450 return m;
451}
452
453static int do_generic_request(struct ceph_mon_client *monc,
454 struct ceph_mon_generic_request *req)
455{
456 int err;
457
458 /* register request */
459 mutex_lock(&monc->mutex);
460 req->tid = ++monc->last_tid;
461 req->request->hdr.tid = cpu_to_le64(req->tid);
462 __insert_generic_request(monc, req);
463 monc->num_generic_requests++;
464 ceph_con_send(monc->con, ceph_msg_get(req->request));
465 mutex_unlock(&monc->mutex);
466
467 err = wait_for_completion_interruptible(&req->completion);
468
469 mutex_lock(&monc->mutex);
470 rb_erase(&req->node, &monc->generic_request_tree);
471 monc->num_generic_requests--;
472 mutex_unlock(&monc->mutex);
473
474 if (!err)
475 err = req->result;
476 return err;
477}
478
479/*
480 * statfs
481 */
482static void handle_statfs_reply(struct ceph_mon_client *monc,
483 struct ceph_msg *msg)
484{
485 struct ceph_mon_generic_request *req;
486 struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
487 u64 tid = le64_to_cpu(msg->hdr.tid);
488
489 if (msg->front.iov_len != sizeof(*reply))
490 goto bad;
491 dout("handle_statfs_reply %p tid %llu\n", msg, tid);
492
493 mutex_lock(&monc->mutex);
494 req = __lookup_generic_req(monc, tid);
495 if (req) {
496 *(struct ceph_statfs *)req->buf = reply->st;
497 req->result = 0;
498 get_generic_request(req);
499 }
500 mutex_unlock(&monc->mutex);
501 if (req) {
502 complete_all(&req->completion);
503 put_generic_request(req);
504 }
505 return;
506
507bad:
508 pr_err("corrupt generic reply, tid %llu\n", tid);
509 ceph_msg_dump(msg);
510}
511
512/*
513 * Do a synchronous statfs().
514 */
515int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
516{
517 struct ceph_mon_generic_request *req;
518 struct ceph_mon_statfs *h;
519 int err;
520
521 req = kzalloc(sizeof(*req), GFP_NOFS);
522 if (!req)
523 return -ENOMEM;
524
525 kref_init(&req->kref);
526 req->buf = buf;
527 req->buf_len = sizeof(*buf);
528 init_completion(&req->completion);
529
530 err = -ENOMEM;
531 req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS);
532 if (!req->request)
533 goto out;
534 req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS);
535 if (!req->reply)
536 goto out;
537
538 /* fill out request */
539 h = req->request->front.iov_base;
540 h->monhdr.have_version = 0;
541 h->monhdr.session_mon = cpu_to_le16(-1);
542 h->monhdr.session_mon_tid = 0;
543 h->fsid = monc->monmap->fsid;
544
545 err = do_generic_request(monc, req);
546
547out:
548 kref_put(&req->kref, release_generic_request);
549 return err;
550}
551EXPORT_SYMBOL(ceph_monc_do_statfs);
552
553/*
554 * pool ops
555 */
556static int get_poolop_reply_buf(const char *src, size_t src_len,
557 char *dst, size_t dst_len)
558{
559 u32 buf_len;
560
561 if (src_len != sizeof(u32) + dst_len)
562 return -EINVAL;
563
564 buf_len = le32_to_cpu(*(u32 *)src);
565 if (buf_len != dst_len)
566 return -EINVAL;
567
568 memcpy(dst, src + sizeof(u32), dst_len);
569 return 0;
570}
571
572static void handle_poolop_reply(struct ceph_mon_client *monc,
573 struct ceph_msg *msg)
574{
575 struct ceph_mon_generic_request *req;
576 struct ceph_mon_poolop_reply *reply = msg->front.iov_base;
577 u64 tid = le64_to_cpu(msg->hdr.tid);
578
579 if (msg->front.iov_len < sizeof(*reply))
580 goto bad;
581 dout("handle_poolop_reply %p tid %llu\n", msg, tid);
582
583 mutex_lock(&monc->mutex);
584 req = __lookup_generic_req(monc, tid);
585 if (req) {
586 if (req->buf_len &&
587 get_poolop_reply_buf(msg->front.iov_base + sizeof(*reply),
588 msg->front.iov_len - sizeof(*reply),
589 req->buf, req->buf_len) < 0) {
590 mutex_unlock(&monc->mutex);
591 goto bad;
592 }
593 req->result = le32_to_cpu(reply->reply_code);
594 get_generic_request(req);
595 }
596 mutex_unlock(&monc->mutex);
597 if (req) {
598 complete(&req->completion);
599 put_generic_request(req);
600 }
601 return;
602
603bad:
604 pr_err("corrupt generic reply, tid %llu\n", tid);
605 ceph_msg_dump(msg);
606}
607
608/*
609 * Do a synchronous pool op.
610 */
611int ceph_monc_do_poolop(struct ceph_mon_client *monc, u32 op,
612 u32 pool, u64 snapid,
613 char *buf, int len)
614{
615 struct ceph_mon_generic_request *req;
616 struct ceph_mon_poolop *h;
617 int err;
618
619 req = kzalloc(sizeof(*req), GFP_NOFS);
620 if (!req)
621 return -ENOMEM;
622
623 kref_init(&req->kref);
624 req->buf = buf;
625 req->buf_len = len;
626 init_completion(&req->completion);
627
628 err = -ENOMEM;
629 req->request = ceph_msg_new(CEPH_MSG_POOLOP, sizeof(*h), GFP_NOFS);
630 if (!req->request)
631 goto out;
632 req->reply = ceph_msg_new(CEPH_MSG_POOLOP_REPLY, 1024, GFP_NOFS);
633 if (!req->reply)
634 goto out;
635
636 /* fill out request */
637 req->request->hdr.version = cpu_to_le16(2);
638 h = req->request->front.iov_base;
639 h->monhdr.have_version = 0;
640 h->monhdr.session_mon = cpu_to_le16(-1);
641 h->monhdr.session_mon_tid = 0;
642 h->fsid = monc->monmap->fsid;
643 h->pool = cpu_to_le32(pool);
644 h->op = cpu_to_le32(op);
645 h->auid = 0;
646 h->snapid = cpu_to_le64(snapid);
647 h->name_len = 0;
648
649 err = do_generic_request(monc, req);
650
651out:
652 kref_put(&req->kref, release_generic_request);
653 return err;
654}
655
656int ceph_monc_create_snapid(struct ceph_mon_client *monc,
657 u32 pool, u64 *snapid)
658{
659 return ceph_monc_do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP,
660 pool, 0, (char *)snapid, sizeof(*snapid));
661
662}
663EXPORT_SYMBOL(ceph_monc_create_snapid);
664
665int ceph_monc_delete_snapid(struct ceph_mon_client *monc,
666 u32 pool, u64 snapid)
667{
668 return ceph_monc_do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP,
669 pool, snapid, 0, 0);
670
671}
672
673/*
674 * Resend pending generic requests.
675 */
676static void __resend_generic_request(struct ceph_mon_client *monc)
677{
678 struct ceph_mon_generic_request *req;
679 struct rb_node *p;
680
681 for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) {
682 req = rb_entry(p, struct ceph_mon_generic_request, node);
683 ceph_con_revoke(monc->con, req->request);
684 ceph_con_send(monc->con, ceph_msg_get(req->request));
685 }
686}
687
688/*
689 * Delayed work. If we haven't mounted yet, retry. Otherwise,
690 * renew/retry subscription as needed (in case it is timing out, or we
691 * got an ENOMEM). And keep the monitor connection alive.
692 */
693static void delayed_work(struct work_struct *work)
694{
695 struct ceph_mon_client *monc =
696 container_of(work, struct ceph_mon_client, delayed_work.work);
697
698 dout("monc delayed_work\n");
699 mutex_lock(&monc->mutex);
700 if (monc->hunting) {
701 __close_session(monc);
702 __open_session(monc); /* continue hunting */
703 } else {
704 ceph_con_keepalive(monc->con);
705
706 __validate_auth(monc);
707
708 if (monc->auth->ops->is_authenticated(monc->auth))
709 __send_subscribe(monc);
710 }
711 __schedule_delayed(monc);
712 mutex_unlock(&monc->mutex);
713}
714
715/*
716 * On startup, we build a temporary monmap populated with the IPs
717 * provided by mount(2).
718 */
719static int build_initial_monmap(struct ceph_mon_client *monc)
720{
721 struct ceph_options *opt = monc->client->options;
722 struct ceph_entity_addr *mon_addr = opt->mon_addr;
723 int num_mon = opt->num_mon;
724 int i;
725
726 /* build initial monmap */
727 monc->monmap = kzalloc(sizeof(*monc->monmap) +
728 num_mon*sizeof(monc->monmap->mon_inst[0]),
729 GFP_KERNEL);
730 if (!monc->monmap)
731 return -ENOMEM;
732 for (i = 0; i < num_mon; i++) {
733 monc->monmap->mon_inst[i].addr = mon_addr[i];
734 monc->monmap->mon_inst[i].addr.nonce = 0;
735 monc->monmap->mon_inst[i].name.type =
736 CEPH_ENTITY_TYPE_MON;
737 monc->monmap->mon_inst[i].name.num = cpu_to_le64(i);
738 }
739 monc->monmap->num_mon = num_mon;
740 monc->have_fsid = false;
741 return 0;
742}
743
744int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
745{
746 int err = 0;
747
748 dout("init\n");
749 memset(monc, 0, sizeof(*monc));
750 monc->client = cl;
751 monc->monmap = NULL;
752 mutex_init(&monc->mutex);
753
754 err = build_initial_monmap(monc);
755 if (err)
756 goto out;
757
758 monc->con = NULL;
759
760 /* authentication */
761 monc->auth = ceph_auth_init(cl->options->name,
762 cl->options->secret);
763 if (IS_ERR(monc->auth))
764 return PTR_ERR(monc->auth);
765 monc->auth->want_keys =
766 CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON |
767 CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS;
768
769 /* msgs */
770 err = -ENOMEM;
771 monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK,
772 sizeof(struct ceph_mon_subscribe_ack),
773 GFP_NOFS);
774 if (!monc->m_subscribe_ack)
775 goto out_monmap;
776
777 monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS);
778 if (!monc->m_subscribe)
779 goto out_subscribe_ack;
780
781 monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, GFP_NOFS);
782 if (!monc->m_auth_reply)
783 goto out_subscribe;
784
785 monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_NOFS);
786 monc->pending_auth = 0;
787 if (!monc->m_auth)
788 goto out_auth_reply;
789
790 monc->cur_mon = -1;
791 monc->hunting = true;
792 monc->sub_renew_after = jiffies;
793 monc->sub_sent = 0;
794
795 INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
796 monc->generic_request_tree = RB_ROOT;
797 monc->num_generic_requests = 0;
798 monc->last_tid = 0;
799
800 monc->have_mdsmap = 0;
801 monc->have_osdmap = 0;
802 monc->want_next_osdmap = 1;
803 return 0;
804
805out_auth_reply:
806 ceph_msg_put(monc->m_auth_reply);
807out_subscribe:
808 ceph_msg_put(monc->m_subscribe);
809out_subscribe_ack:
810 ceph_msg_put(monc->m_subscribe_ack);
811out_monmap:
812 kfree(monc->monmap);
813out:
814 return err;
815}
816EXPORT_SYMBOL(ceph_monc_init);
817
818void ceph_monc_stop(struct ceph_mon_client *monc)
819{
820 dout("stop\n");
821 cancel_delayed_work_sync(&monc->delayed_work);
822
823 mutex_lock(&monc->mutex);
824 __close_session(monc);
825 if (monc->con) {
826 monc->con->private = NULL;
827 monc->con->ops->put(monc->con);
828 monc->con = NULL;
829 }
830 mutex_unlock(&monc->mutex);
831
832 ceph_auth_destroy(monc->auth);
833
834 ceph_msg_put(monc->m_auth);
835 ceph_msg_put(monc->m_auth_reply);
836 ceph_msg_put(monc->m_subscribe);
837 ceph_msg_put(monc->m_subscribe_ack);
838
839 kfree(monc->monmap);
840}
841EXPORT_SYMBOL(ceph_monc_stop);
842
843static void handle_auth_reply(struct ceph_mon_client *monc,
844 struct ceph_msg *msg)
845{
846 int ret;
847 int was_auth = 0;
848
849 mutex_lock(&monc->mutex);
850 if (monc->auth->ops)
851 was_auth = monc->auth->ops->is_authenticated(monc->auth);
852 monc->pending_auth = 0;
853 ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
854 msg->front.iov_len,
855 monc->m_auth->front.iov_base,
856 monc->m_auth->front_max);
857 if (ret < 0) {
858 monc->client->auth_err = ret;
859 wake_up_all(&monc->client->auth_wq);
860 } else if (ret > 0) {
861 __send_prepared_auth_request(monc, ret);
862 } else if (!was_auth && monc->auth->ops->is_authenticated(monc->auth)) {
863 dout("authenticated, starting session\n");
864
865 monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
866 monc->client->msgr->inst.name.num =
867 cpu_to_le64(monc->auth->global_id);
868
869 __send_subscribe(monc);
870 __resend_generic_request(monc);
871 }
872 mutex_unlock(&monc->mutex);
873}
874
875static int __validate_auth(struct ceph_mon_client *monc)
876{
877 int ret;
878
879 if (monc->pending_auth)
880 return 0;
881
882 ret = ceph_build_auth(monc->auth, monc->m_auth->front.iov_base,
883 monc->m_auth->front_max);
884 if (ret <= 0)
885 return ret; /* either an error, or no need to authenticate */
886 __send_prepared_auth_request(monc, ret);
887 return 0;
888}
889
890int ceph_monc_validate_auth(struct ceph_mon_client *monc)
891{
892 int ret;
893
894 mutex_lock(&monc->mutex);
895 ret = __validate_auth(monc);
896 mutex_unlock(&monc->mutex);
897 return ret;
898}
899EXPORT_SYMBOL(ceph_monc_validate_auth);
900
901/*
902 * handle incoming message
903 */
904static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
905{
906 struct ceph_mon_client *monc = con->private;
907 int type = le16_to_cpu(msg->hdr.type);
908
909 if (!monc)
910 return;
911
912 switch (type) {
913 case CEPH_MSG_AUTH_REPLY:
914 handle_auth_reply(monc, msg);
915 break;
916
917 case CEPH_MSG_MON_SUBSCRIBE_ACK:
918 handle_subscribe_ack(monc, msg);
919 break;
920
921 case CEPH_MSG_STATFS_REPLY:
922 handle_statfs_reply(monc, msg);
923 break;
924
925 case CEPH_MSG_POOLOP_REPLY:
926 handle_poolop_reply(monc, msg);
927 break;
928
929 case CEPH_MSG_MON_MAP:
930 ceph_monc_handle_map(monc, msg);
931 break;
932
933 case CEPH_MSG_OSD_MAP:
934 ceph_osdc_handle_map(&monc->client->osdc, msg);
935 break;
936
937 default:
938 /* can the chained handler handle it? */
939 if (monc->client->extra_mon_dispatch &&
940 monc->client->extra_mon_dispatch(monc->client, msg) == 0)
941 break;
942
943 pr_err("received unknown message type %d %s\n", type,
944 ceph_msg_type_name(type));
945 }
946 ceph_msg_put(msg);
947}
948
949/*
950 * Allocate memory for incoming message
951 */
952static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
953 struct ceph_msg_header *hdr,
954 int *skip)
955{
956 struct ceph_mon_client *monc = con->private;
957 int type = le16_to_cpu(hdr->type);
958 int front_len = le32_to_cpu(hdr->front_len);
959 struct ceph_msg *m = NULL;
960
961 *skip = 0;
962
963 switch (type) {
964 case CEPH_MSG_MON_SUBSCRIBE_ACK:
965 m = ceph_msg_get(monc->m_subscribe_ack);
966 break;
967 case CEPH_MSG_POOLOP_REPLY:
968 case CEPH_MSG_STATFS_REPLY:
969 return get_generic_reply(con, hdr, skip);
970 case CEPH_MSG_AUTH_REPLY:
971 m = ceph_msg_get(monc->m_auth_reply);
972 break;
973 case CEPH_MSG_MON_MAP:
974 case CEPH_MSG_MDS_MAP:
975 case CEPH_MSG_OSD_MAP:
976 m = ceph_msg_new(type, front_len, GFP_NOFS);
977 break;
978 }
979
980 if (!m) {
981 pr_info("alloc_msg unknown type %d\n", type);
982 *skip = 1;
983 }
984 return m;
985}
986
987/*
988 * If the monitor connection resets, pick a new monitor and resubmit
989 * any pending requests.
990 */
991static void mon_fault(struct ceph_connection *con)
992{
993 struct ceph_mon_client *monc = con->private;
994
995 if (!monc)
996 return;
997
998 dout("mon_fault\n");
999 mutex_lock(&monc->mutex);
1000 if (!con->private)
1001 goto out;
1002
1003 if (monc->con && !monc->hunting)
1004 pr_info("mon%d %s session lost, "
1005 "hunting for new mon\n", monc->cur_mon,
1006 ceph_pr_addr(&monc->con->peer_addr.in_addr));
1007
1008 __close_session(monc);
1009 if (!monc->hunting) {
1010 /* start hunting */
1011 monc->hunting = true;
1012 __open_session(monc);
1013 } else {
1014 /* already hunting, let's wait a bit */
1015 __schedule_delayed(monc);
1016 }
1017out:
1018 mutex_unlock(&monc->mutex);
1019}
1020
1021static const struct ceph_connection_operations mon_con_ops = {
1022 .get = ceph_con_get,
1023 .put = ceph_con_put,
1024 .dispatch = dispatch,
1025 .fault = mon_fault,
1026 .alloc_msg = mon_alloc_msg,
1027};
diff --git a/net/ceph/msgpool.c b/net/ceph/msgpool.c
new file mode 100644
index 000000000000..d5f2d97ac05c
--- /dev/null
+++ b/net/ceph/msgpool.c
@@ -0,0 +1,64 @@
1#include <linux/ceph/ceph_debug.h>
2
3#include <linux/err.h>
4#include <linux/sched.h>
5#include <linux/types.h>
6#include <linux/vmalloc.h>
7
8#include <linux/ceph/msgpool.h>
9
10static void *alloc_fn(gfp_t gfp_mask, void *arg)
11{
12 struct ceph_msgpool *pool = arg;
13 void *p;
14
15 p = ceph_msg_new(0, pool->front_len, gfp_mask);
16 if (!p)
17 pr_err("msgpool %s alloc failed\n", pool->name);
18 return p;
19}
20
/*
 * mempool free callback: drop a reference on the pooled message.
 * @arg (the pool) is not used.
 */
static void free_fn(void *element, void *arg)
{
	struct ceph_msg *msg = element;

	ceph_msg_put(msg);
}
25
26int ceph_msgpool_init(struct ceph_msgpool *pool,
27 int front_len, int size, bool blocking, const char *name)
28{
29 pool->front_len = front_len;
30 pool->pool = mempool_create(size, alloc_fn, free_fn, pool);
31 if (!pool->pool)
32 return -ENOMEM;
33 pool->name = name;
34 return 0;
35}
36
37void ceph_msgpool_destroy(struct ceph_msgpool *pool)
38{
39 mempool_destroy(pool->pool);
40}
41
42struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool,
43 int front_len)
44{
45 if (front_len > pool->front_len) {
46 pr_err("msgpool_get pool %s need front %d, pool size is %d\n",
47 pool->name, front_len, pool->front_len);
48 WARN_ON(1);
49
50 /* try to alloc a fresh message */
51 return ceph_msg_new(0, front_len, GFP_NOFS);
52 }
53
54 return mempool_alloc(pool->pool, GFP_NOFS);
55}
56
57void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg)
58{
59 /* reset msg front_len; user may have changed it */
60 msg->front.iov_len = pool->front_len;
61 msg->hdr.front_len = cpu_to_le32(pool->front_len);
62
63 kref_init(&msg->kref); /* retake single ref */
64}
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
new file mode 100644
index 000000000000..79391994b3ed
--- /dev/null
+++ b/net/ceph/osd_client.c
@@ -0,0 +1,1773 @@
1#include <linux/ceph/ceph_debug.h>
2
3#include <linux/module.h>
4#include <linux/err.h>
5#include <linux/highmem.h>
6#include <linux/mm.h>
7#include <linux/pagemap.h>
8#include <linux/slab.h>
9#include <linux/uaccess.h>
10#ifdef CONFIG_BLOCK
11#include <linux/bio.h>
12#endif
13
14#include <linux/ceph/libceph.h>
15#include <linux/ceph/osd_client.h>
16#include <linux/ceph/messenger.h>
17#include <linux/ceph/decode.h>
18#include <linux/ceph/auth.h>
19#include <linux/ceph/pagelist.h>
20
21#define OSD_OP_FRONT_LEN 4096
22#define OSD_OPREPLY_FRONT_LEN 512
23
24static const struct ceph_connection_operations osd_con_ops;
25static int __kick_requests(struct ceph_osd_client *osdc,
26 struct ceph_osd *kickosd);
27
28static void kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd);
29
30static int op_needs_trail(int op)
31{
32 switch (op) {
33 case CEPH_OSD_OP_GETXATTR:
34 case CEPH_OSD_OP_SETXATTR:
35 case CEPH_OSD_OP_CMPXATTR:
36 case CEPH_OSD_OP_CALL:
37 return 1;
38 default:
39 return 0;
40 }
41}
42
43static int op_has_extent(int op)
44{
45 return (op == CEPH_OSD_OP_READ ||
46 op == CEPH_OSD_OP_WRITE);
47}
48
49void ceph_calc_raw_layout(struct ceph_osd_client *osdc,
50 struct ceph_file_layout *layout,
51 u64 snapid,
52 u64 off, u64 *plen, u64 *bno,
53 struct ceph_osd_request *req,
54 struct ceph_osd_req_op *op)
55{
56 struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
57 u64 orig_len = *plen;
58 u64 objoff, objlen; /* extent in object */
59
60 reqhead->snapid = cpu_to_le64(snapid);
61
62 /* object extent? */
63 ceph_calc_file_object_mapping(layout, off, plen, bno,
64 &objoff, &objlen);
65 if (*plen < orig_len)
66 dout(" skipping last %llu, final file extent %llu~%llu\n",
67 orig_len - *plen, off, *plen);
68
69 if (op_has_extent(op->op)) {
70 op->extent.offset = objoff;
71 op->extent.length = objlen;
72 }
73 req->r_num_pages = calc_pages_for(off, *plen);
74 if (op->op == CEPH_OSD_OP_WRITE)
75 op->payload_len = *plen;
76
77 dout("calc_layout bno=%llx %llu~%llu (%d pages)\n",
78 *bno, objoff, objlen, req->r_num_pages);
79
80}
81EXPORT_SYMBOL(ceph_calc_raw_layout);
82
83/*
84 * Implement client access to distributed object storage cluster.
85 *
86 * All data objects are stored within a cluster/cloud of OSDs, or
87 * "object storage devices." (Note that Ceph OSDs have _nothing_ to
88 * do with the T10 OSD extensions to SCSI.) Ceph OSDs are simply
89 * remote daemons serving up and coordinating consistent and safe
90 * access to storage.
91 *
92 * Cluster membership and the mapping of data objects onto storage devices
93 * are described by the osd map.
94 *
95 * We keep track of pending OSD requests (read, write), resubmit
96 * requests to different OSDs when the cluster topology/data layout
97 * change, or retry the affected requests when the communications
98 * channel with an OSD is reset.
99 */
100
101/*
102 * calculate the mapping of a file extent onto an object, and fill out the
103 * request accordingly. shorten extent as necessary if it crosses an
104 * object boundary.
105 *
106 * fill osd op in request message.
107 */
108static void calc_layout(struct ceph_osd_client *osdc,
109 struct ceph_vino vino,
110 struct ceph_file_layout *layout,
111 u64 off, u64 *plen,
112 struct ceph_osd_request *req,
113 struct ceph_osd_req_op *op)
114{
115 u64 bno;
116
117 ceph_calc_raw_layout(osdc, layout, vino.snap, off,
118 plen, &bno, req, op);
119
120 sprintf(req->r_oid, "%llx.%08llx", vino.ino, bno);
121 req->r_oid_len = strlen(req->r_oid);
122}
123
124/*
125 * requests
126 */
127void ceph_osdc_release_request(struct kref *kref)
128{
129 struct ceph_osd_request *req = container_of(kref,
130 struct ceph_osd_request,
131 r_kref);
132
133 if (req->r_request)
134 ceph_msg_put(req->r_request);
135 if (req->r_reply)
136 ceph_msg_put(req->r_reply);
137 if (req->r_con_filling_msg) {
138 dout("release_request revoking pages %p from con %p\n",
139 req->r_pages, req->r_con_filling_msg);
140 ceph_con_revoke_message(req->r_con_filling_msg,
141 req->r_reply);
142 ceph_con_put(req->r_con_filling_msg);
143 }
144 if (req->r_own_pages)
145 ceph_release_page_vector(req->r_pages,
146 req->r_num_pages);
147#ifdef CONFIG_BLOCK
148 if (req->r_bio)
149 bio_put(req->r_bio);
150#endif
151 ceph_put_snap_context(req->r_snapc);
152 if (req->r_trail) {
153 ceph_pagelist_release(req->r_trail);
154 kfree(req->r_trail);
155 }
156 if (req->r_mempool)
157 mempool_free(req, req->r_osdc->req_mempool);
158 else
159 kfree(req);
160}
161EXPORT_SYMBOL(ceph_osdc_release_request);
162
163static int get_num_ops(struct ceph_osd_req_op *ops, int *needs_trail)
164{
165 int i = 0;
166
167 if (needs_trail)
168 *needs_trail = 0;
169 while (ops[i].op) {
170 if (needs_trail && op_needs_trail(ops[i].op))
171 *needs_trail = 1;
172 i++;
173 }
174
175 return i;
176}
177
178struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
179 int flags,
180 struct ceph_snap_context *snapc,
181 struct ceph_osd_req_op *ops,
182 bool use_mempool,
183 gfp_t gfp_flags,
184 struct page **pages,
185 struct bio *bio)
186{
187 struct ceph_osd_request *req;
188 struct ceph_msg *msg;
189 int needs_trail;
190 int num_op = get_num_ops(ops, &needs_trail);
191 size_t msg_size = sizeof(struct ceph_osd_request_head);
192
193 msg_size += num_op*sizeof(struct ceph_osd_op);
194
195 if (use_mempool) {
196 req = mempool_alloc(osdc->req_mempool, gfp_flags);
197 memset(req, 0, sizeof(*req));
198 } else {
199 req = kzalloc(sizeof(*req), gfp_flags);
200 }
201 if (req == NULL)
202 return NULL;
203
204 req->r_osdc = osdc;
205 req->r_mempool = use_mempool;
206
207 kref_init(&req->r_kref);
208 init_completion(&req->r_completion);
209 init_completion(&req->r_safe_completion);
210 INIT_LIST_HEAD(&req->r_unsafe_item);
211 req->r_flags = flags;
212
213 WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0);
214
215 /* create reply message */
216 if (use_mempool)
217 msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
218 else
219 msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY,
220 OSD_OPREPLY_FRONT_LEN, gfp_flags);
221 if (!msg) {
222 ceph_osdc_put_request(req);
223 return NULL;
224 }
225 req->r_reply = msg;
226
227 /* allocate space for the trailing data */
228 if (needs_trail) {
229 req->r_trail = kmalloc(sizeof(struct ceph_pagelist), gfp_flags);
230 if (!req->r_trail) {
231 ceph_osdc_put_request(req);
232 return NULL;
233 }
234 ceph_pagelist_init(req->r_trail);
235 }
236 /* create request message; allow space for oid */
237 msg_size += 40;
238 if (snapc)
239 msg_size += sizeof(u64) * snapc->num_snaps;
240 if (use_mempool)
241 msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
242 else
243 msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp_flags);
244 if (!msg) {
245 ceph_osdc_put_request(req);
246 return NULL;
247 }
248
249 msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP);
250 memset(msg->front.iov_base, 0, msg->front.iov_len);
251
252 req->r_request = msg;
253 req->r_pages = pages;
254#ifdef CONFIG_BLOCK
255 if (bio) {
256 req->r_bio = bio;
257 bio_get(req->r_bio);
258 }
259#endif
260
261 return req;
262}
263EXPORT_SYMBOL(ceph_osdc_alloc_request);
264
265static void osd_req_encode_op(struct ceph_osd_request *req,
266 struct ceph_osd_op *dst,
267 struct ceph_osd_req_op *src)
268{
269 dst->op = cpu_to_le16(src->op);
270
271 switch (dst->op) {
272 case CEPH_OSD_OP_READ:
273 case CEPH_OSD_OP_WRITE:
274 dst->extent.offset =
275 cpu_to_le64(src->extent.offset);
276 dst->extent.length =
277 cpu_to_le64(src->extent.length);
278 dst->extent.truncate_size =
279 cpu_to_le64(src->extent.truncate_size);
280 dst->extent.truncate_seq =
281 cpu_to_le32(src->extent.truncate_seq);
282 break;
283
284 case CEPH_OSD_OP_GETXATTR:
285 case CEPH_OSD_OP_SETXATTR:
286 case CEPH_OSD_OP_CMPXATTR:
287 BUG_ON(!req->r_trail);
288
289 dst->xattr.name_len = cpu_to_le32(src->xattr.name_len);
290 dst->xattr.value_len = cpu_to_le32(src->xattr.value_len);
291 dst->xattr.cmp_op = src->xattr.cmp_op;
292 dst->xattr.cmp_mode = src->xattr.cmp_mode;
293 ceph_pagelist_append(req->r_trail, src->xattr.name,
294 src->xattr.name_len);
295 ceph_pagelist_append(req->r_trail, src->xattr.val,
296 src->xattr.value_len);
297 break;
298 case CEPH_OSD_OP_CALL:
299 BUG_ON(!req->r_trail);
300
301 dst->cls.class_len = src->cls.class_len;
302 dst->cls.method_len = src->cls.method_len;
303 dst->cls.indata_len = cpu_to_le32(src->cls.indata_len);
304
305 ceph_pagelist_append(req->r_trail, src->cls.class_name,
306 src->cls.class_len);
307 ceph_pagelist_append(req->r_trail, src->cls.method_name,
308 src->cls.method_len);
309 ceph_pagelist_append(req->r_trail, src->cls.indata,
310 src->cls.indata_len);
311 break;
312 case CEPH_OSD_OP_ROLLBACK:
313 dst->snap.snapid = cpu_to_le64(src->snap.snapid);
314 break;
315 case CEPH_OSD_OP_STARTSYNC:
316 break;
317 default:
318 pr_err("unrecognized osd opcode %d\n", dst->op);
319 WARN_ON(1);
320 break;
321 }
322 dst->payload_len = cpu_to_le32(src->payload_len);
323}
324
325/*
326 * build new request AND message
327 *
328 */
329void ceph_osdc_build_request(struct ceph_osd_request *req,
330 u64 off, u64 *plen,
331 struct ceph_osd_req_op *src_ops,
332 struct ceph_snap_context *snapc,
333 struct timespec *mtime,
334 const char *oid,
335 int oid_len)
336{
337 struct ceph_msg *msg = req->r_request;
338 struct ceph_osd_request_head *head;
339 struct ceph_osd_req_op *src_op;
340 struct ceph_osd_op *op;
341 void *p;
342 int num_op = get_num_ops(src_ops, NULL);
343 size_t msg_size = sizeof(*head) + num_op*sizeof(*op);
344 int flags = req->r_flags;
345 u64 data_len = 0;
346 int i;
347
348 head = msg->front.iov_base;
349 op = (void *)(head + 1);
350 p = (void *)(op + num_op);
351
352 req->r_snapc = ceph_get_snap_context(snapc);
353
354 head->client_inc = cpu_to_le32(1); /* always, for now. */
355 head->flags = cpu_to_le32(flags);
356 if (flags & CEPH_OSD_FLAG_WRITE)
357 ceph_encode_timespec(&head->mtime, mtime);
358 head->num_ops = cpu_to_le16(num_op);
359
360
361 /* fill in oid */
362 head->object_len = cpu_to_le32(oid_len);
363 memcpy(p, oid, oid_len);
364 p += oid_len;
365
366 src_op = src_ops;
367 while (src_op->op) {
368 osd_req_encode_op(req, op, src_op);
369 src_op++;
370 op++;
371 }
372
373 if (req->r_trail)
374 data_len += req->r_trail->length;
375
376 if (snapc) {
377 head->snap_seq = cpu_to_le64(snapc->seq);
378 head->num_snaps = cpu_to_le32(snapc->num_snaps);
379 for (i = 0; i < snapc->num_snaps; i++) {
380 put_unaligned_le64(snapc->snaps[i], p);
381 p += sizeof(u64);
382 }
383 }
384
385 if (flags & CEPH_OSD_FLAG_WRITE) {
386 req->r_request->hdr.data_off = cpu_to_le16(off);
387 req->r_request->hdr.data_len = cpu_to_le32(*plen + data_len);
388 } else if (data_len) {
389 req->r_request->hdr.data_off = 0;
390 req->r_request->hdr.data_len = cpu_to_le32(data_len);
391 }
392
393 BUG_ON(p > msg->front.iov_base + msg->front.iov_len);
394 msg_size = p - msg->front.iov_base;
395 msg->front.iov_len = msg_size;
396 msg->hdr.front_len = cpu_to_le32(msg_size);
397 return;
398}
399EXPORT_SYMBOL(ceph_osdc_build_request);
400
401/*
402 * build new request AND message, calculate layout, and adjust file
403 * extent as needed.
404 *
405 * if the file was recently truncated, we include information about its
406 * old and new size so that the object can be updated appropriately. (we
407 * avoid synchronously deleting truncated objects because it's slow.)
408 *
409 * if @do_sync, include a 'startsync' command so that the osd will flush
410 * data quickly.
411 */
412struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
413 struct ceph_file_layout *layout,
414 struct ceph_vino vino,
415 u64 off, u64 *plen,
416 int opcode, int flags,
417 struct ceph_snap_context *snapc,
418 int do_sync,
419 u32 truncate_seq,
420 u64 truncate_size,
421 struct timespec *mtime,
422 bool use_mempool, int num_reply)
423{
424 struct ceph_osd_req_op ops[3];
425 struct ceph_osd_request *req;
426
427 ops[0].op = opcode;
428 ops[0].extent.truncate_seq = truncate_seq;
429 ops[0].extent.truncate_size = truncate_size;
430 ops[0].payload_len = 0;
431
432 if (do_sync) {
433 ops[1].op = CEPH_OSD_OP_STARTSYNC;
434 ops[1].payload_len = 0;
435 ops[2].op = 0;
436 } else
437 ops[1].op = 0;
438
439 req = ceph_osdc_alloc_request(osdc, flags,
440 snapc, ops,
441 use_mempool,
442 GFP_NOFS, NULL, NULL);
443 if (IS_ERR(req))
444 return req;
445
446 /* calculate max write size */
447 calc_layout(osdc, vino, layout, off, plen, req, ops);
448 req->r_file_layout = *layout; /* keep a copy */
449
450 ceph_osdc_build_request(req, off, plen, ops,
451 snapc,
452 mtime,
453 req->r_oid, req->r_oid_len);
454
455 return req;
456}
457EXPORT_SYMBOL(ceph_osdc_new_request);
458
459/*
460 * We keep osd requests in an rbtree, sorted by ->r_tid.
461 */
462static void __insert_request(struct ceph_osd_client *osdc,
463 struct ceph_osd_request *new)
464{
465 struct rb_node **p = &osdc->requests.rb_node;
466 struct rb_node *parent = NULL;
467 struct ceph_osd_request *req = NULL;
468
469 while (*p) {
470 parent = *p;
471 req = rb_entry(parent, struct ceph_osd_request, r_node);
472 if (new->r_tid < req->r_tid)
473 p = &(*p)->rb_left;
474 else if (new->r_tid > req->r_tid)
475 p = &(*p)->rb_right;
476 else
477 BUG();
478 }
479
480 rb_link_node(&new->r_node, parent, p);
481 rb_insert_color(&new->r_node, &osdc->requests);
482}
483
484static struct ceph_osd_request *__lookup_request(struct ceph_osd_client *osdc,
485 u64 tid)
486{
487 struct ceph_osd_request *req;
488 struct rb_node *n = osdc->requests.rb_node;
489
490 while (n) {
491 req = rb_entry(n, struct ceph_osd_request, r_node);
492 if (tid < req->r_tid)
493 n = n->rb_left;
494 else if (tid > req->r_tid)
495 n = n->rb_right;
496 else
497 return req;
498 }
499 return NULL;
500}
501
502static struct ceph_osd_request *
503__lookup_request_ge(struct ceph_osd_client *osdc,
504 u64 tid)
505{
506 struct ceph_osd_request *req;
507 struct rb_node *n = osdc->requests.rb_node;
508
509 while (n) {
510 req = rb_entry(n, struct ceph_osd_request, r_node);
511 if (tid < req->r_tid) {
512 if (!n->rb_left)
513 return req;
514 n = n->rb_left;
515 } else if (tid > req->r_tid) {
516 n = n->rb_right;
517 } else {
518 return req;
519 }
520 }
521 return NULL;
522}
523
524
525/*
526 * If the osd connection drops, we need to resubmit all requests.
527 */
528static void osd_reset(struct ceph_connection *con)
529{
530 struct ceph_osd *osd = con->private;
531 struct ceph_osd_client *osdc;
532
533 if (!osd)
534 return;
535 dout("osd_reset osd%d\n", osd->o_osd);
536 osdc = osd->o_osdc;
537 down_read(&osdc->map_sem);
538 kick_requests(osdc, osd);
539 up_read(&osdc->map_sem);
540}
541
542/*
543 * Track open sessions with osds.
544 */
545static struct ceph_osd *create_osd(struct ceph_osd_client *osdc)
546{
547 struct ceph_osd *osd;
548
549 osd = kzalloc(sizeof(*osd), GFP_NOFS);
550 if (!osd)
551 return NULL;
552
553 atomic_set(&osd->o_ref, 1);
554 osd->o_osdc = osdc;
555 INIT_LIST_HEAD(&osd->o_requests);
556 INIT_LIST_HEAD(&osd->o_osd_lru);
557 osd->o_incarnation = 1;
558
559 ceph_con_init(osdc->client->msgr, &osd->o_con);
560 osd->o_con.private = osd;
561 osd->o_con.ops = &osd_con_ops;
562 osd->o_con.peer_name.type = CEPH_ENTITY_TYPE_OSD;
563
564 INIT_LIST_HEAD(&osd->o_keepalive_item);
565 return osd;
566}
567
568static struct ceph_osd *get_osd(struct ceph_osd *osd)
569{
570 if (atomic_inc_not_zero(&osd->o_ref)) {
571 dout("get_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref)-1,
572 atomic_read(&osd->o_ref));
573 return osd;
574 } else {
575 dout("get_osd %p FAIL\n", osd);
576 return NULL;
577 }
578}
579
580static void put_osd(struct ceph_osd *osd)
581{
582 dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
583 atomic_read(&osd->o_ref) - 1);
584 if (atomic_dec_and_test(&osd->o_ref)) {
585 struct ceph_auth_client *ac = osd->o_osdc->client->monc.auth;
586
587 if (osd->o_authorizer)
588 ac->ops->destroy_authorizer(ac, osd->o_authorizer);
589 kfree(osd);
590 }
591}
592
593/*
594 * remove an osd from our map
595 */
596static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
597{
598 dout("__remove_osd %p\n", osd);
599 BUG_ON(!list_empty(&osd->o_requests));
600 rb_erase(&osd->o_node, &osdc->osds);
601 list_del_init(&osd->o_osd_lru);
602 ceph_con_close(&osd->o_con);
603 put_osd(osd);
604}
605
606static void __move_osd_to_lru(struct ceph_osd_client *osdc,
607 struct ceph_osd *osd)
608{
609 dout("__move_osd_to_lru %p\n", osd);
610 BUG_ON(!list_empty(&osd->o_osd_lru));
611 list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
612 osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl * HZ;
613}
614
615static void __remove_osd_from_lru(struct ceph_osd *osd)
616{
617 dout("__remove_osd_from_lru %p\n", osd);
618 if (!list_empty(&osd->o_osd_lru))
619 list_del_init(&osd->o_osd_lru);
620}
621
622static void remove_old_osds(struct ceph_osd_client *osdc, int remove_all)
623{
624 struct ceph_osd *osd, *nosd;
625
626 dout("__remove_old_osds %p\n", osdc);
627 mutex_lock(&osdc->request_mutex);
628 list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) {
629 if (!remove_all && time_before(jiffies, osd->lru_ttl))
630 break;
631 __remove_osd(osdc, osd);
632 }
633 mutex_unlock(&osdc->request_mutex);
634}
635
636/*
637 * reset osd connect
638 */
639static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
640{
641 struct ceph_osd_request *req;
642 int ret = 0;
643
644 dout("__reset_osd %p osd%d\n", osd, osd->o_osd);
645 if (list_empty(&osd->o_requests)) {
646 __remove_osd(osdc, osd);
647 } else if (memcmp(&osdc->osdmap->osd_addr[osd->o_osd],
648 &osd->o_con.peer_addr,
649 sizeof(osd->o_con.peer_addr)) == 0 &&
650 !ceph_con_opened(&osd->o_con)) {
651 dout(" osd addr hasn't changed and connection never opened,"
652 " letting msgr retry");
653 /* touch each r_stamp for handle_timeout()'s benfit */
654 list_for_each_entry(req, &osd->o_requests, r_osd_item)
655 req->r_stamp = jiffies;
656 ret = -EAGAIN;
657 } else {
658 ceph_con_close(&osd->o_con);
659 ceph_con_open(&osd->o_con, &osdc->osdmap->osd_addr[osd->o_osd]);
660 osd->o_incarnation++;
661 }
662 return ret;
663}
664
665static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new)
666{
667 struct rb_node **p = &osdc->osds.rb_node;
668 struct rb_node *parent = NULL;
669 struct ceph_osd *osd = NULL;
670
671 while (*p) {
672 parent = *p;
673 osd = rb_entry(parent, struct ceph_osd, o_node);
674 if (new->o_osd < osd->o_osd)
675 p = &(*p)->rb_left;
676 else if (new->o_osd > osd->o_osd)
677 p = &(*p)->rb_right;
678 else
679 BUG();
680 }
681
682 rb_link_node(&new->o_node, parent, p);
683 rb_insert_color(&new->o_node, &osdc->osds);
684}
685
686static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o)
687{
688 struct ceph_osd *osd;
689 struct rb_node *n = osdc->osds.rb_node;
690
691 while (n) {
692 osd = rb_entry(n, struct ceph_osd, o_node);
693 if (o < osd->o_osd)
694 n = n->rb_left;
695 else if (o > osd->o_osd)
696 n = n->rb_right;
697 else
698 return osd;
699 }
700 return NULL;
701}
702
703static void __schedule_osd_timeout(struct ceph_osd_client *osdc)
704{
705 schedule_delayed_work(&osdc->timeout_work,
706 osdc->client->options->osd_keepalive_timeout * HZ);
707}
708
709static void __cancel_osd_timeout(struct ceph_osd_client *osdc)
710{
711 cancel_delayed_work(&osdc->timeout_work);
712}
713
714/*
715 * Register request, assign tid. If this is the first request, set up
716 * the timeout event.
717 */
718static void register_request(struct ceph_osd_client *osdc,
719 struct ceph_osd_request *req)
720{
721 mutex_lock(&osdc->request_mutex);
722 req->r_tid = ++osdc->last_tid;
723 req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
724 INIT_LIST_HEAD(&req->r_req_lru_item);
725
726 dout("register_request %p tid %lld\n", req, req->r_tid);
727 __insert_request(osdc, req);
728 ceph_osdc_get_request(req);
729 osdc->num_requests++;
730
731 if (osdc->num_requests == 1) {
732 dout(" first request, scheduling timeout\n");
733 __schedule_osd_timeout(osdc);
734 }
735 mutex_unlock(&osdc->request_mutex);
736}
737
738/*
739 * called under osdc->request_mutex
740 */
741static void __unregister_request(struct ceph_osd_client *osdc,
742 struct ceph_osd_request *req)
743{
744 dout("__unregister_request %p tid %lld\n", req, req->r_tid);
745 rb_erase(&req->r_node, &osdc->requests);
746 osdc->num_requests--;
747
748 if (req->r_osd) {
749 /* make sure the original request isn't in flight. */
750 ceph_con_revoke(&req->r_osd->o_con, req->r_request);
751
752 list_del_init(&req->r_osd_item);
753 if (list_empty(&req->r_osd->o_requests))
754 __move_osd_to_lru(osdc, req->r_osd);
755 req->r_osd = NULL;
756 }
757
758 ceph_osdc_put_request(req);
759
760 list_del_init(&req->r_req_lru_item);
761 if (osdc->num_requests == 0) {
762 dout(" no requests, canceling timeout\n");
763 __cancel_osd_timeout(osdc);
764 }
765}
766
767/*
768 * Cancel a previously queued request message
769 */
770static void __cancel_request(struct ceph_osd_request *req)
771{
772 if (req->r_sent && req->r_osd) {
773 ceph_con_revoke(&req->r_osd->o_con, req->r_request);
774 req->r_sent = 0;
775 }
776 list_del_init(&req->r_req_lru_item);
777}
778
779/*
780 * Pick an osd (the first 'up' osd in the pg), allocate the osd struct
781 * (as needed), and set the request r_osd appropriately. If there is
782 * no up osd, set r_osd to NULL.
783 *
784 * Return 0 if unchanged, 1 if changed, or negative on error.
785 *
786 * Caller should hold map_sem for read and request_mutex.
787 */
788static int __map_osds(struct ceph_osd_client *osdc,
789 struct ceph_osd_request *req)
790{
791 struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
792 struct ceph_pg pgid;
793 int acting[CEPH_PG_MAX_SIZE];
794 int o = -1, num = 0;
795 int err;
796
797 dout("map_osds %p tid %lld\n", req, req->r_tid);
798 err = ceph_calc_object_layout(&reqhead->layout, req->r_oid,
799 &req->r_file_layout, osdc->osdmap);
800 if (err)
801 return err;
802 pgid = reqhead->layout.ol_pgid;
803 req->r_pgid = pgid;
804
805 err = ceph_calc_pg_acting(osdc->osdmap, pgid, acting);
806 if (err > 0) {
807 o = acting[0];
808 num = err;
809 }
810
811 if ((req->r_osd && req->r_osd->o_osd == o &&
812 req->r_sent >= req->r_osd->o_incarnation &&
813 req->r_num_pg_osds == num &&
814 memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) ||
815 (req->r_osd == NULL && o == -1))
816 return 0; /* no change */
817
818 dout("map_osds tid %llu pgid %d.%x osd%d (was osd%d)\n",
819 req->r_tid, le32_to_cpu(pgid.pool), le16_to_cpu(pgid.ps), o,
820 req->r_osd ? req->r_osd->o_osd : -1);
821
822 /* record full pg acting set */
823 memcpy(req->r_pg_osds, acting, sizeof(acting[0]) * num);
824 req->r_num_pg_osds = num;
825
826 if (req->r_osd) {
827 __cancel_request(req);
828 list_del_init(&req->r_osd_item);
829 req->r_osd = NULL;
830 }
831
832 req->r_osd = __lookup_osd(osdc, o);
833 if (!req->r_osd && o >= 0) {
834 err = -ENOMEM;
835 req->r_osd = create_osd(osdc);
836 if (!req->r_osd)
837 goto out;
838
839 dout("map_osds osd %p is osd%d\n", req->r_osd, o);
840 req->r_osd->o_osd = o;
841 req->r_osd->o_con.peer_name.num = cpu_to_le64(o);
842 __insert_osd(osdc, req->r_osd);
843
844 ceph_con_open(&req->r_osd->o_con, &osdc->osdmap->osd_addr[o]);
845 }
846
847 if (req->r_osd) {
848 __remove_osd_from_lru(req->r_osd);
849 list_add(&req->r_osd_item, &req->r_osd->o_requests);
850 }
851 err = 1; /* osd or pg changed */
852
853out:
854 return err;
855}
856
857/*
858 * caller should hold map_sem (for read) and request_mutex
859 */
860static int __send_request(struct ceph_osd_client *osdc,
861 struct ceph_osd_request *req)
862{
863 struct ceph_osd_request_head *reqhead;
864 int err;
865
866 err = __map_osds(osdc, req);
867 if (err < 0)
868 return err;
869 if (req->r_osd == NULL) {
870 dout("send_request %p no up osds in pg\n", req);
871 ceph_monc_request_next_osdmap(&osdc->client->monc);
872 return 0;
873 }
874
875 dout("send_request %p tid %llu to osd%d flags %d\n",
876 req, req->r_tid, req->r_osd->o_osd, req->r_flags);
877
878 reqhead = req->r_request->front.iov_base;
879 reqhead->osdmap_epoch = cpu_to_le32(osdc->osdmap->epoch);
880 reqhead->flags |= cpu_to_le32(req->r_flags); /* e.g., RETRY */
881 reqhead->reassert_version = req->r_reassert_version;
882
883 req->r_stamp = jiffies;
884 list_move_tail(&req->r_req_lru_item, &osdc->req_lru);
885
886 ceph_msg_get(req->r_request); /* send consumes a ref */
887 ceph_con_send(&req->r_osd->o_con, req->r_request);
888 req->r_sent = req->r_osd->o_incarnation;
889 return 0;
890}
891
892/*
893 * Timeout callback, called every N seconds when 1 or more osd
894 * requests has been active for more than N seconds. When this
895 * happens, we ping all OSDs with requests who have timed out to
896 * ensure any communications channel reset is detected. Reset the
897 * request timeouts another N seconds in the future as we go.
898 * Reschedule the timeout event another N seconds in future (unless
899 * there are no open requests).
900 */
901static void handle_timeout(struct work_struct *work)
902{
903 struct ceph_osd_client *osdc =
904 container_of(work, struct ceph_osd_client, timeout_work.work);
905 struct ceph_osd_request *req, *last_req = NULL;
906 struct ceph_osd *osd;
907 unsigned long timeout = osdc->client->options->osd_timeout * HZ;
908 unsigned long keepalive =
909 osdc->client->options->osd_keepalive_timeout * HZ;
910 unsigned long last_stamp = 0;
911 struct rb_node *p;
912 struct list_head slow_osds;
913
914 dout("timeout\n");
915 down_read(&osdc->map_sem);
916
917 ceph_monc_request_next_osdmap(&osdc->client->monc);
918
919 mutex_lock(&osdc->request_mutex);
920 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
921 req = rb_entry(p, struct ceph_osd_request, r_node);
922
923 if (req->r_resend) {
924 int err;
925
926 dout("osdc resending prev failed %lld\n", req->r_tid);
927 err = __send_request(osdc, req);
928 if (err)
929 dout("osdc failed again on %lld\n", req->r_tid);
930 else
931 req->r_resend = false;
932 continue;
933 }
934 }
935
936 /*
937 * reset osds that appear to be _really_ unresponsive. this
938 * is a failsafe measure.. we really shouldn't be getting to
939 * this point if the system is working properly. the monitors
940 * should mark the osd as failed and we should find out about
941 * it from an updated osd map.
942 */
943 while (timeout && !list_empty(&osdc->req_lru)) {
944 req = list_entry(osdc->req_lru.next, struct ceph_osd_request,
945 r_req_lru_item);
946
947 if (time_before(jiffies, req->r_stamp + timeout))
948 break;
949
950 BUG_ON(req == last_req && req->r_stamp == last_stamp);
951 last_req = req;
952 last_stamp = req->r_stamp;
953
954 osd = req->r_osd;
955 BUG_ON(!osd);
956 pr_warning(" tid %llu timed out on osd%d, will reset osd\n",
957 req->r_tid, osd->o_osd);
958 __kick_requests(osdc, osd);
959 }
960
961 /*
962 * ping osds that are a bit slow. this ensures that if there
963 * is a break in the TCP connection we will notice, and reopen
964 * a connection with that osd (from the fault callback).
965 */
966 INIT_LIST_HEAD(&slow_osds);
967 list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) {
968 if (time_before(jiffies, req->r_stamp + keepalive))
969 break;
970
971 osd = req->r_osd;
972 BUG_ON(!osd);
973 dout(" tid %llu is slow, will send keepalive on osd%d\n",
974 req->r_tid, osd->o_osd);
975 list_move_tail(&osd->o_keepalive_item, &slow_osds);
976 }
977 while (!list_empty(&slow_osds)) {
978 osd = list_entry(slow_osds.next, struct ceph_osd,
979 o_keepalive_item);
980 list_del_init(&osd->o_keepalive_item);
981 ceph_con_keepalive(&osd->o_con);
982 }
983
984 __schedule_osd_timeout(osdc);
985 mutex_unlock(&osdc->request_mutex);
986
987 up_read(&osdc->map_sem);
988}
989
990static void handle_osds_timeout(struct work_struct *work)
991{
992 struct ceph_osd_client *osdc =
993 container_of(work, struct ceph_osd_client,
994 osds_timeout_work.work);
995 unsigned long delay =
996 osdc->client->options->osd_idle_ttl * HZ >> 2;
997
998 dout("osds timeout\n");
999 down_read(&osdc->map_sem);
1000 remove_old_osds(osdc, 0);
1001 up_read(&osdc->map_sem);
1002
1003 schedule_delayed_work(&osdc->osds_timeout_work,
1004 round_jiffies_relative(delay));
1005}
1006
1007/*
1008 * handle osd op reply. either call the callback if it is specified,
1009 * or do the completion to wake up the waiting thread.
1010 */
1011static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
1012 struct ceph_connection *con)
1013{
1014 struct ceph_osd_reply_head *rhead = msg->front.iov_base;
1015 struct ceph_osd_request *req;
1016 u64 tid;
1017 int numops, object_len, flags;
1018 s32 result;
1019
1020 tid = le64_to_cpu(msg->hdr.tid);
1021 if (msg->front.iov_len < sizeof(*rhead))
1022 goto bad;
1023 numops = le32_to_cpu(rhead->num_ops);
1024 object_len = le32_to_cpu(rhead->object_len);
1025 result = le32_to_cpu(rhead->result);
1026 if (msg->front.iov_len != sizeof(*rhead) + object_len +
1027 numops * sizeof(struct ceph_osd_op))
1028 goto bad;
1029 dout("handle_reply %p tid %llu result %d\n", msg, tid, (int)result);
1030
1031 /* lookup */
1032 mutex_lock(&osdc->request_mutex);
1033 req = __lookup_request(osdc, tid);
1034 if (req == NULL) {
1035 dout("handle_reply tid %llu dne\n", tid);
1036 mutex_unlock(&osdc->request_mutex);
1037 return;
1038 }
1039 ceph_osdc_get_request(req);
1040 flags = le32_to_cpu(rhead->flags);
1041
1042 /*
1043 * if this connection filled our message, drop our reference now, to
1044 * avoid a (safe but slower) revoke later.
1045 */
1046 if (req->r_con_filling_msg == con && req->r_reply == msg) {
1047 dout(" dropping con_filling_msg ref %p\n", con);
1048 req->r_con_filling_msg = NULL;
1049 ceph_con_put(con);
1050 }
1051
1052 if (!req->r_got_reply) {
1053 unsigned bytes;
1054
1055 req->r_result = le32_to_cpu(rhead->result);
1056 bytes = le32_to_cpu(msg->hdr.data_len);
1057 dout("handle_reply result %d bytes %d\n", req->r_result,
1058 bytes);
1059 if (req->r_result == 0)
1060 req->r_result = bytes;
1061
1062 /* in case this is a write and we need to replay, */
1063 req->r_reassert_version = rhead->reassert_version;
1064
1065 req->r_got_reply = 1;
1066 } else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) {
1067 dout("handle_reply tid %llu dup ack\n", tid);
1068 mutex_unlock(&osdc->request_mutex);
1069 goto done;
1070 }
1071
1072 dout("handle_reply tid %llu flags %d\n", tid, flags);
1073
1074 /* either this is a read, or we got the safe response */
1075 if (result < 0 ||
1076 (flags & CEPH_OSD_FLAG_ONDISK) ||
1077 ((flags & CEPH_OSD_FLAG_WRITE) == 0))
1078 __unregister_request(osdc, req);
1079
1080 mutex_unlock(&osdc->request_mutex);
1081
1082 if (req->r_callback)
1083 req->r_callback(req, msg);
1084 else
1085 complete_all(&req->r_completion);
1086
1087 if (flags & CEPH_OSD_FLAG_ONDISK) {
1088 if (req->r_safe_callback)
1089 req->r_safe_callback(req, msg);
1090 complete_all(&req->r_safe_completion); /* fsync waiter */
1091 }
1092
1093done:
1094 ceph_osdc_put_request(req);
1095 return;
1096
1097bad:
1098 pr_err("corrupt osd_op_reply got %d %d expected %d\n",
1099 (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len),
1100 (int)sizeof(*rhead));
1101 ceph_msg_dump(msg);
1102}
1103
1104
1105static int __kick_requests(struct ceph_osd_client *osdc,
1106 struct ceph_osd *kickosd)
1107{
1108 struct ceph_osd_request *req;
1109 struct rb_node *p, *n;
1110 int needmap = 0;
1111 int err;
1112
1113 dout("kick_requests osd%d\n", kickosd ? kickosd->o_osd : -1);
1114 if (kickosd) {
1115 err = __reset_osd(osdc, kickosd);
1116 if (err == -EAGAIN)
1117 return 1;
1118 } else {
1119 for (p = rb_first(&osdc->osds); p; p = n) {
1120 struct ceph_osd *osd =
1121 rb_entry(p, struct ceph_osd, o_node);
1122
1123 n = rb_next(p);
1124 if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) ||
1125 memcmp(&osd->o_con.peer_addr,
1126 ceph_osd_addr(osdc->osdmap,
1127 osd->o_osd),
1128 sizeof(struct ceph_entity_addr)) != 0)
1129 __reset_osd(osdc, osd);
1130 }
1131 }
1132
1133 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
1134 req = rb_entry(p, struct ceph_osd_request, r_node);
1135
1136 if (req->r_resend) {
1137 dout(" r_resend set on tid %llu\n", req->r_tid);
1138 __cancel_request(req);
1139 goto kick;
1140 }
1141 if (req->r_osd && kickosd == req->r_osd) {
1142 __cancel_request(req);
1143 goto kick;
1144 }
1145
1146 err = __map_osds(osdc, req);
1147 if (err == 0)
1148 continue; /* no change */
1149 if (err < 0) {
1150 /*
1151 * FIXME: really, we should set the request
1152 * error and fail if this isn't a 'nofail'
1153 * request, but that's a fair bit more
1154 * complicated to do. So retry!
1155 */
1156 dout(" setting r_resend on %llu\n", req->r_tid);
1157 req->r_resend = true;
1158 continue;
1159 }
1160 if (req->r_osd == NULL) {
1161 dout("tid %llu maps to no valid osd\n", req->r_tid);
1162 needmap++; /* request a newer map */
1163 continue;
1164 }
1165
1166kick:
1167 dout("kicking %p tid %llu osd%d\n", req, req->r_tid,
1168 req->r_osd ? req->r_osd->o_osd : -1);
1169 req->r_flags |= CEPH_OSD_FLAG_RETRY;
1170 err = __send_request(osdc, req);
1171 if (err) {
1172 dout(" setting r_resend on %llu\n", req->r_tid);
1173 req->r_resend = true;
1174 }
1175 }
1176
1177 return needmap;
1178}
1179
1180/*
1181 * Resubmit osd requests whose osd or osd address has changed. Request
1182 * a new osd map if osds are down, or we are otherwise unable to determine
1183 * how to direct a request.
1184 *
1185 * Close connections to down osds.
1186 *
1187 * If @who is specified, resubmit requests for that specific osd.
1188 *
1189 * Caller should hold map_sem for read and request_mutex.
1190 */
1191static void kick_requests(struct ceph_osd_client *osdc,
1192 struct ceph_osd *kickosd)
1193{
1194 int needmap;
1195
1196 mutex_lock(&osdc->request_mutex);
1197 needmap = __kick_requests(osdc, kickosd);
1198 mutex_unlock(&osdc->request_mutex);
1199
1200 if (needmap) {
1201 dout("%d requests for down osds, need new map\n", needmap);
1202 ceph_monc_request_next_osdmap(&osdc->client->monc);
1203 }
1204
1205}
1206/*
1207 * Process updated osd map.
1208 *
1209 * The message contains any number of incremental and full maps, normally
1210 * indicating some sort of topology change in the cluster. Kick requests
1211 * off to different OSDs as needed.
1212 */
1213void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
1214{
1215 void *p, *end, *next;
1216 u32 nr_maps, maplen;
1217 u32 epoch;
1218 struct ceph_osdmap *newmap = NULL, *oldmap;
1219 int err;
1220 struct ceph_fsid fsid;
1221
1222 dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0);
1223 p = msg->front.iov_base;
1224 end = p + msg->front.iov_len;
1225
1226 /* verify fsid */
1227 ceph_decode_need(&p, end, sizeof(fsid), bad);
1228 ceph_decode_copy(&p, &fsid, sizeof(fsid));
1229 if (ceph_check_fsid(osdc->client, &fsid) < 0)
1230 return;
1231
1232 down_write(&osdc->map_sem);
1233
1234 /* incremental maps */
1235 ceph_decode_32_safe(&p, end, nr_maps, bad);
1236 dout(" %d inc maps\n", nr_maps);
1237 while (nr_maps > 0) {
1238 ceph_decode_need(&p, end, 2*sizeof(u32), bad);
1239 epoch = ceph_decode_32(&p);
1240 maplen = ceph_decode_32(&p);
1241 ceph_decode_need(&p, end, maplen, bad);
1242 next = p + maplen;
1243 if (osdc->osdmap && osdc->osdmap->epoch+1 == epoch) {
1244 dout("applying incremental map %u len %d\n",
1245 epoch, maplen);
1246 newmap = osdmap_apply_incremental(&p, next,
1247 osdc->osdmap,
1248 osdc->client->msgr);
1249 if (IS_ERR(newmap)) {
1250 err = PTR_ERR(newmap);
1251 goto bad;
1252 }
1253 BUG_ON(!newmap);
1254 if (newmap != osdc->osdmap) {
1255 ceph_osdmap_destroy(osdc->osdmap);
1256 osdc->osdmap = newmap;
1257 }
1258 } else {
1259 dout("ignoring incremental map %u len %d\n",
1260 epoch, maplen);
1261 }
1262 p = next;
1263 nr_maps--;
1264 }
1265 if (newmap)
1266 goto done;
1267
1268 /* full maps */
1269 ceph_decode_32_safe(&p, end, nr_maps, bad);
1270 dout(" %d full maps\n", nr_maps);
1271 while (nr_maps) {
1272 ceph_decode_need(&p, end, 2*sizeof(u32), bad);
1273 epoch = ceph_decode_32(&p);
1274 maplen = ceph_decode_32(&p);
1275 ceph_decode_need(&p, end, maplen, bad);
1276 if (nr_maps > 1) {
1277 dout("skipping non-latest full map %u len %d\n",
1278 epoch, maplen);
1279 } else if (osdc->osdmap && osdc->osdmap->epoch >= epoch) {
1280 dout("skipping full map %u len %d, "
1281 "older than our %u\n", epoch, maplen,
1282 osdc->osdmap->epoch);
1283 } else {
1284 dout("taking full map %u len %d\n", epoch, maplen);
1285 newmap = osdmap_decode(&p, p+maplen);
1286 if (IS_ERR(newmap)) {
1287 err = PTR_ERR(newmap);
1288 goto bad;
1289 }
1290 BUG_ON(!newmap);
1291 oldmap = osdc->osdmap;
1292 osdc->osdmap = newmap;
1293 if (oldmap)
1294 ceph_osdmap_destroy(oldmap);
1295 }
1296 p += maplen;
1297 nr_maps--;
1298 }
1299
1300done:
1301 downgrade_write(&osdc->map_sem);
1302 ceph_monc_got_osdmap(&osdc->client->monc, osdc->osdmap->epoch);
1303 if (newmap)
1304 kick_requests(osdc, NULL);
1305 up_read(&osdc->map_sem);
1306 wake_up_all(&osdc->client->auth_wq);
1307 return;
1308
1309bad:
1310 pr_err("osdc handle_map corrupt msg\n");
1311 ceph_msg_dump(msg);
1312 up_write(&osdc->map_sem);
1313 return;
1314}
1315
1316/*
1317 * Register request, send initial attempt.
1318 */
1319int ceph_osdc_start_request(struct ceph_osd_client *osdc,
1320 struct ceph_osd_request *req,
1321 bool nofail)
1322{
1323 int rc = 0;
1324
1325 req->r_request->pages = req->r_pages;
1326 req->r_request->nr_pages = req->r_num_pages;
1327#ifdef CONFIG_BLOCK
1328 req->r_request->bio = req->r_bio;
1329#endif
1330 req->r_request->trail = req->r_trail;
1331
1332 register_request(osdc, req);
1333
1334 down_read(&osdc->map_sem);
1335 mutex_lock(&osdc->request_mutex);
1336 /*
1337 * a racing kick_requests() may have sent the message for us
1338 * while we dropped request_mutex above, so only send now if
1339 * the request still han't been touched yet.
1340 */
1341 if (req->r_sent == 0) {
1342 rc = __send_request(osdc, req);
1343 if (rc) {
1344 if (nofail) {
1345 dout("osdc_start_request failed send, "
1346 " marking %lld\n", req->r_tid);
1347 req->r_resend = true;
1348 rc = 0;
1349 } else {
1350 __unregister_request(osdc, req);
1351 }
1352 }
1353 }
1354 mutex_unlock(&osdc->request_mutex);
1355 up_read(&osdc->map_sem);
1356 return rc;
1357}
1358EXPORT_SYMBOL(ceph_osdc_start_request);
1359
1360/*
1361 * wait for a request to complete
1362 */
1363int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
1364 struct ceph_osd_request *req)
1365{
1366 int rc;
1367
1368 rc = wait_for_completion_interruptible(&req->r_completion);
1369 if (rc < 0) {
1370 mutex_lock(&osdc->request_mutex);
1371 __cancel_request(req);
1372 __unregister_request(osdc, req);
1373 mutex_unlock(&osdc->request_mutex);
1374 dout("wait_request tid %llu canceled/timed out\n", req->r_tid);
1375 return rc;
1376 }
1377
1378 dout("wait_request tid %llu result %d\n", req->r_tid, req->r_result);
1379 return req->r_result;
1380}
1381EXPORT_SYMBOL(ceph_osdc_wait_request);
1382
1383/*
1384 * sync - wait for all in-flight requests to flush. avoid starvation.
1385 */
1386void ceph_osdc_sync(struct ceph_osd_client *osdc)
1387{
1388 struct ceph_osd_request *req;
1389 u64 last_tid, next_tid = 0;
1390
1391 mutex_lock(&osdc->request_mutex);
1392 last_tid = osdc->last_tid;
1393 while (1) {
1394 req = __lookup_request_ge(osdc, next_tid);
1395 if (!req)
1396 break;
1397 if (req->r_tid > last_tid)
1398 break;
1399
1400 next_tid = req->r_tid + 1;
1401 if ((req->r_flags & CEPH_OSD_FLAG_WRITE) == 0)
1402 continue;
1403
1404 ceph_osdc_get_request(req);
1405 mutex_unlock(&osdc->request_mutex);
1406 dout("sync waiting on tid %llu (last is %llu)\n",
1407 req->r_tid, last_tid);
1408 wait_for_completion(&req->r_safe_completion);
1409 mutex_lock(&osdc->request_mutex);
1410 ceph_osdc_put_request(req);
1411 }
1412 mutex_unlock(&osdc->request_mutex);
1413 dout("sync done (thru tid %llu)\n", last_tid);
1414}
1415EXPORT_SYMBOL(ceph_osdc_sync);
1416
1417/*
1418 * init, shutdown
1419 */
1420int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
1421{
1422 int err;
1423
1424 dout("init\n");
1425 osdc->client = client;
1426 osdc->osdmap = NULL;
1427 init_rwsem(&osdc->map_sem);
1428 init_completion(&osdc->map_waiters);
1429 osdc->last_requested_map = 0;
1430 mutex_init(&osdc->request_mutex);
1431 osdc->last_tid = 0;
1432 osdc->osds = RB_ROOT;
1433 INIT_LIST_HEAD(&osdc->osd_lru);
1434 osdc->requests = RB_ROOT;
1435 INIT_LIST_HEAD(&osdc->req_lru);
1436 osdc->num_requests = 0;
1437 INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
1438 INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
1439
1440 schedule_delayed_work(&osdc->osds_timeout_work,
1441 round_jiffies_relative(osdc->client->options->osd_idle_ttl * HZ));
1442
1443 err = -ENOMEM;
1444 osdc->req_mempool = mempool_create_kmalloc_pool(10,
1445 sizeof(struct ceph_osd_request));
1446 if (!osdc->req_mempool)
1447 goto out;
1448
1449 err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true,
1450 "osd_op");
1451 if (err < 0)
1452 goto out_mempool;
1453 err = ceph_msgpool_init(&osdc->msgpool_op_reply,
1454 OSD_OPREPLY_FRONT_LEN, 10, true,
1455 "osd_op_reply");
1456 if (err < 0)
1457 goto out_msgpool;
1458 return 0;
1459
1460out_msgpool:
1461 ceph_msgpool_destroy(&osdc->msgpool_op);
1462out_mempool:
1463 mempool_destroy(osdc->req_mempool);
1464out:
1465 return err;
1466}
1467EXPORT_SYMBOL(ceph_osdc_init);
1468
1469void ceph_osdc_stop(struct ceph_osd_client *osdc)
1470{
1471 cancel_delayed_work_sync(&osdc->timeout_work);
1472 cancel_delayed_work_sync(&osdc->osds_timeout_work);
1473 if (osdc->osdmap) {
1474 ceph_osdmap_destroy(osdc->osdmap);
1475 osdc->osdmap = NULL;
1476 }
1477 remove_old_osds(osdc, 1);
1478 mempool_destroy(osdc->req_mempool);
1479 ceph_msgpool_destroy(&osdc->msgpool_op);
1480 ceph_msgpool_destroy(&osdc->msgpool_op_reply);
1481}
1482EXPORT_SYMBOL(ceph_osdc_stop);
1483
1484/*
1485 * Read some contiguous pages. If we cross a stripe boundary, shorten
1486 * *plen. Return number of bytes read, or error.
1487 */
1488int ceph_osdc_readpages(struct ceph_osd_client *osdc,
1489 struct ceph_vino vino, struct ceph_file_layout *layout,
1490 u64 off, u64 *plen,
1491 u32 truncate_seq, u64 truncate_size,
1492 struct page **pages, int num_pages)
1493{
1494 struct ceph_osd_request *req;
1495 int rc = 0;
1496
1497 dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino,
1498 vino.snap, off, *plen);
1499 req = ceph_osdc_new_request(osdc, layout, vino, off, plen,
1500 CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
1501 NULL, 0, truncate_seq, truncate_size, NULL,
1502 false, 1);
1503 if (!req)
1504 return -ENOMEM;
1505
1506 /* it may be a short read due to an object boundary */
1507 req->r_pages = pages;
1508
1509 dout("readpages final extent is %llu~%llu (%d pages)\n",
1510 off, *plen, req->r_num_pages);
1511
1512 rc = ceph_osdc_start_request(osdc, req, false);
1513 if (!rc)
1514 rc = ceph_osdc_wait_request(osdc, req);
1515
1516 ceph_osdc_put_request(req);
1517 dout("readpages result %d\n", rc);
1518 return rc;
1519}
1520EXPORT_SYMBOL(ceph_osdc_readpages);
1521
1522/*
1523 * do a synchronous write on N pages
1524 */
1525int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
1526 struct ceph_file_layout *layout,
1527 struct ceph_snap_context *snapc,
1528 u64 off, u64 len,
1529 u32 truncate_seq, u64 truncate_size,
1530 struct timespec *mtime,
1531 struct page **pages, int num_pages,
1532 int flags, int do_sync, bool nofail)
1533{
1534 struct ceph_osd_request *req;
1535 int rc = 0;
1536
1537 BUG_ON(vino.snap != CEPH_NOSNAP);
1538 req = ceph_osdc_new_request(osdc, layout, vino, off, &len,
1539 CEPH_OSD_OP_WRITE,
1540 flags | CEPH_OSD_FLAG_ONDISK |
1541 CEPH_OSD_FLAG_WRITE,
1542 snapc, do_sync,
1543 truncate_seq, truncate_size, mtime,
1544 nofail, 1);
1545 if (!req)
1546 return -ENOMEM;
1547
1548 /* it may be a short write due to an object boundary */
1549 req->r_pages = pages;
1550 dout("writepages %llu~%llu (%d pages)\n", off, len,
1551 req->r_num_pages);
1552
1553 rc = ceph_osdc_start_request(osdc, req, nofail);
1554 if (!rc)
1555 rc = ceph_osdc_wait_request(osdc, req);
1556
1557 ceph_osdc_put_request(req);
1558 if (rc == 0)
1559 rc = len;
1560 dout("writepages result %d\n", rc);
1561 return rc;
1562}
1563EXPORT_SYMBOL(ceph_osdc_writepages);
1564
1565/*
1566 * handle incoming message
1567 */
1568static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
1569{
1570 struct ceph_osd *osd = con->private;
1571 struct ceph_osd_client *osdc;
1572 int type = le16_to_cpu(msg->hdr.type);
1573
1574 if (!osd)
1575 goto out;
1576 osdc = osd->o_osdc;
1577
1578 switch (type) {
1579 case CEPH_MSG_OSD_MAP:
1580 ceph_osdc_handle_map(osdc, msg);
1581 break;
1582 case CEPH_MSG_OSD_OPREPLY:
1583 handle_reply(osdc, msg, con);
1584 break;
1585
1586 default:
1587 pr_err("received unknown message type %d %s\n", type,
1588 ceph_msg_type_name(type));
1589 }
1590out:
1591 ceph_msg_put(msg);
1592}
1593
1594/*
1595 * lookup and return message for incoming reply. set up reply message
1596 * pages.
1597 */
1598static struct ceph_msg *get_reply(struct ceph_connection *con,
1599 struct ceph_msg_header *hdr,
1600 int *skip)
1601{
1602 struct ceph_osd *osd = con->private;
1603 struct ceph_osd_client *osdc = osd->o_osdc;
1604 struct ceph_msg *m;
1605 struct ceph_osd_request *req;
1606 int front = le32_to_cpu(hdr->front_len);
1607 int data_len = le32_to_cpu(hdr->data_len);
1608 u64 tid;
1609
1610 tid = le64_to_cpu(hdr->tid);
1611 mutex_lock(&osdc->request_mutex);
1612 req = __lookup_request(osdc, tid);
1613 if (!req) {
1614 *skip = 1;
1615 m = NULL;
1616 pr_info("get_reply unknown tid %llu from osd%d\n", tid,
1617 osd->o_osd);
1618 goto out;
1619 }
1620
1621 if (req->r_con_filling_msg) {
1622 dout("get_reply revoking msg %p from old con %p\n",
1623 req->r_reply, req->r_con_filling_msg);
1624 ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply);
1625 ceph_con_put(req->r_con_filling_msg);
1626 req->r_con_filling_msg = NULL;
1627 }
1628
1629 if (front > req->r_reply->front.iov_len) {
1630 pr_warning("get_reply front %d > preallocated %d\n",
1631 front, (int)req->r_reply->front.iov_len);
1632 m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, GFP_NOFS);
1633 if (!m)
1634 goto out;
1635 ceph_msg_put(req->r_reply);
1636 req->r_reply = m;
1637 }
1638 m = ceph_msg_get(req->r_reply);
1639
1640 if (data_len > 0) {
1641 unsigned data_off = le16_to_cpu(hdr->data_off);
1642 int want = calc_pages_for(data_off & ~PAGE_MASK, data_len);
1643
1644 if (unlikely(req->r_num_pages < want)) {
1645 pr_warning("tid %lld reply %d > expected %d pages\n",
1646 tid, want, m->nr_pages);
1647 *skip = 1;
1648 ceph_msg_put(m);
1649 m = NULL;
1650 goto out;
1651 }
1652 m->pages = req->r_pages;
1653 m->nr_pages = req->r_num_pages;
1654#ifdef CONFIG_BLOCK
1655 m->bio = req->r_bio;
1656#endif
1657 }
1658 *skip = 0;
1659 req->r_con_filling_msg = ceph_con_get(con);
1660 dout("get_reply tid %lld %p\n", tid, m);
1661
1662out:
1663 mutex_unlock(&osdc->request_mutex);
1664 return m;
1665
1666}
1667
1668static struct ceph_msg *alloc_msg(struct ceph_connection *con,
1669 struct ceph_msg_header *hdr,
1670 int *skip)
1671{
1672 struct ceph_osd *osd = con->private;
1673 int type = le16_to_cpu(hdr->type);
1674 int front = le32_to_cpu(hdr->front_len);
1675
1676 switch (type) {
1677 case CEPH_MSG_OSD_MAP:
1678 return ceph_msg_new(type, front, GFP_NOFS);
1679 case CEPH_MSG_OSD_OPREPLY:
1680 return get_reply(con, hdr, skip);
1681 default:
1682 pr_info("alloc_msg unexpected msg type %d from osd%d\n", type,
1683 osd->o_osd);
1684 *skip = 1;
1685 return NULL;
1686 }
1687}
1688
1689/*
1690 * Wrappers to refcount containing ceph_osd struct
1691 */
1692static struct ceph_connection *get_osd_con(struct ceph_connection *con)
1693{
1694 struct ceph_osd *osd = con->private;
1695 if (get_osd(osd))
1696 return con;
1697 return NULL;
1698}
1699
1700static void put_osd_con(struct ceph_connection *con)
1701{
1702 struct ceph_osd *osd = con->private;
1703 put_osd(osd);
1704}
1705
1706/*
1707 * authentication
1708 */
1709static int get_authorizer(struct ceph_connection *con,
1710 void **buf, int *len, int *proto,
1711 void **reply_buf, int *reply_len, int force_new)
1712{
1713 struct ceph_osd *o = con->private;
1714 struct ceph_osd_client *osdc = o->o_osdc;
1715 struct ceph_auth_client *ac = osdc->client->monc.auth;
1716 int ret = 0;
1717
1718 if (force_new && o->o_authorizer) {
1719 ac->ops->destroy_authorizer(ac, o->o_authorizer);
1720 o->o_authorizer = NULL;
1721 }
1722 if (o->o_authorizer == NULL) {
1723 ret = ac->ops->create_authorizer(
1724 ac, CEPH_ENTITY_TYPE_OSD,
1725 &o->o_authorizer,
1726 &o->o_authorizer_buf,
1727 &o->o_authorizer_buf_len,
1728 &o->o_authorizer_reply_buf,
1729 &o->o_authorizer_reply_buf_len);
1730 if (ret)
1731 return ret;
1732 }
1733
1734 *proto = ac->protocol;
1735 *buf = o->o_authorizer_buf;
1736 *len = o->o_authorizer_buf_len;
1737 *reply_buf = o->o_authorizer_reply_buf;
1738 *reply_len = o->o_authorizer_reply_buf_len;
1739 return 0;
1740}
1741
1742
1743static int verify_authorizer_reply(struct ceph_connection *con, int len)
1744{
1745 struct ceph_osd *o = con->private;
1746 struct ceph_osd_client *osdc = o->o_osdc;
1747 struct ceph_auth_client *ac = osdc->client->monc.auth;
1748
1749 return ac->ops->verify_authorizer_reply(ac, o->o_authorizer, len);
1750}
1751
1752static int invalidate_authorizer(struct ceph_connection *con)
1753{
1754 struct ceph_osd *o = con->private;
1755 struct ceph_osd_client *osdc = o->o_osdc;
1756 struct ceph_auth_client *ac = osdc->client->monc.auth;
1757
1758 if (ac->ops->invalidate_authorizer)
1759 ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD);
1760
1761 return ceph_monc_validate_auth(&osdc->client->monc);
1762}
1763
1764static const struct ceph_connection_operations osd_con_ops = {
1765 .get = get_osd_con,
1766 .put = put_osd_con,
1767 .dispatch = dispatch,
1768 .get_authorizer = get_authorizer,
1769 .verify_authorizer_reply = verify_authorizer_reply,
1770 .invalidate_authorizer = invalidate_authorizer,
1771 .alloc_msg = alloc_msg,
1772 .fault = osd_reset,
1773};
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
new file mode 100644
index 000000000000..d73f3f6efa36
--- /dev/null
+++ b/net/ceph/osdmap.c
@@ -0,0 +1,1128 @@
1
2#include <linux/ceph/ceph_debug.h>
3
4#include <linux/module.h>
5#include <linux/slab.h>
6#include <asm/div64.h>
7
8#include <linux/ceph/libceph.h>
9#include <linux/ceph/osdmap.h>
10#include <linux/ceph/decode.h>
11#include <linux/crush/hash.h>
12#include <linux/crush/mapper.h>
13
14char *ceph_osdmap_state_str(char *str, int len, int state)
15{
16 int flag = 0;
17
18 if (!len)
19 goto done;
20
21 *str = '\0';
22 if (state) {
23 if (state & CEPH_OSD_EXISTS) {
24 snprintf(str, len, "exists");
25 flag = 1;
26 }
27 if (state & CEPH_OSD_UP) {
28 snprintf(str, len, "%s%s%s", str, (flag ? ", " : ""),
29 "up");
30 flag = 1;
31 }
32 } else {
33 snprintf(str, len, "doesn't exist");
34 }
35done:
36 return str;
37}
38
39/* maps */
40
/* Number of bits needed to represent @t (0 for t == 0). */
static int calc_bits_of(unsigned t)
{
	int bits;

	for (bits = 0; t != 0; t >>= 1)
		bits++;
	return bits;
}
50
51/*
52 * the foo_mask is the smallest value 2^n-1 that is >= foo.
53 */
54static void calc_pg_masks(struct ceph_pg_pool_info *pi)
55{
56 pi->pg_num_mask = (1 << calc_bits_of(le32_to_cpu(pi->v.pg_num)-1)) - 1;
57 pi->pgp_num_mask =
58 (1 << calc_bits_of(le32_to_cpu(pi->v.pgp_num)-1)) - 1;
59 pi->lpg_num_mask =
60 (1 << calc_bits_of(le32_to_cpu(pi->v.lpg_num)-1)) - 1;
61 pi->lpgp_num_mask =
62 (1 << calc_bits_of(le32_to_cpu(pi->v.lpgp_num)-1)) - 1;
63}
64
65/*
66 * decode crush map
67 */
68static int crush_decode_uniform_bucket(void **p, void *end,
69 struct crush_bucket_uniform *b)
70{
71 dout("crush_decode_uniform_bucket %p to %p\n", *p, end);
72 ceph_decode_need(p, end, (1+b->h.size) * sizeof(u32), bad);
73 b->item_weight = ceph_decode_32(p);
74 return 0;
75bad:
76 return -EINVAL;
77}
78
79static int crush_decode_list_bucket(void **p, void *end,
80 struct crush_bucket_list *b)
81{
82 int j;
83 dout("crush_decode_list_bucket %p to %p\n", *p, end);
84 b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
85 if (b->item_weights == NULL)
86 return -ENOMEM;
87 b->sum_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
88 if (b->sum_weights == NULL)
89 return -ENOMEM;
90 ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
91 for (j = 0; j < b->h.size; j++) {
92 b->item_weights[j] = ceph_decode_32(p);
93 b->sum_weights[j] = ceph_decode_32(p);
94 }
95 return 0;
96bad:
97 return -EINVAL;
98}
99
100static int crush_decode_tree_bucket(void **p, void *end,
101 struct crush_bucket_tree *b)
102{
103 int j;
104 dout("crush_decode_tree_bucket %p to %p\n", *p, end);
105 ceph_decode_32_safe(p, end, b->num_nodes, bad);
106 b->node_weights = kcalloc(b->num_nodes, sizeof(u32), GFP_NOFS);
107 if (b->node_weights == NULL)
108 return -ENOMEM;
109 ceph_decode_need(p, end, b->num_nodes * sizeof(u32), bad);
110 for (j = 0; j < b->num_nodes; j++)
111 b->node_weights[j] = ceph_decode_32(p);
112 return 0;
113bad:
114 return -EINVAL;
115}
116
117static int crush_decode_straw_bucket(void **p, void *end,
118 struct crush_bucket_straw *b)
119{
120 int j;
121 dout("crush_decode_straw_bucket %p to %p\n", *p, end);
122 b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
123 if (b->item_weights == NULL)
124 return -ENOMEM;
125 b->straws = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
126 if (b->straws == NULL)
127 return -ENOMEM;
128 ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
129 for (j = 0; j < b->h.size; j++) {
130 b->item_weights[j] = ceph_decode_32(p);
131 b->straws[j] = ceph_decode_32(p);
132 }
133 return 0;
134bad:
135 return -EINVAL;
136}
137
138static struct crush_map *crush_decode(void *pbyval, void *end)
139{
140 struct crush_map *c;
141 int err = -EINVAL;
142 int i, j;
143 void **p = &pbyval;
144 void *start = pbyval;
145 u32 magic;
146
147 dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p));
148
149 c = kzalloc(sizeof(*c), GFP_NOFS);
150 if (c == NULL)
151 return ERR_PTR(-ENOMEM);
152
153 ceph_decode_need(p, end, 4*sizeof(u32), bad);
154 magic = ceph_decode_32(p);
155 if (magic != CRUSH_MAGIC) {
156 pr_err("crush_decode magic %x != current %x\n",
157 (unsigned)magic, (unsigned)CRUSH_MAGIC);
158 goto bad;
159 }
160 c->max_buckets = ceph_decode_32(p);
161 c->max_rules = ceph_decode_32(p);
162 c->max_devices = ceph_decode_32(p);
163
164 c->device_parents = kcalloc(c->max_devices, sizeof(u32), GFP_NOFS);
165 if (c->device_parents == NULL)
166 goto badmem;
167 c->bucket_parents = kcalloc(c->max_buckets, sizeof(u32), GFP_NOFS);
168 if (c->bucket_parents == NULL)
169 goto badmem;
170
171 c->buckets = kcalloc(c->max_buckets, sizeof(*c->buckets), GFP_NOFS);
172 if (c->buckets == NULL)
173 goto badmem;
174 c->rules = kcalloc(c->max_rules, sizeof(*c->rules), GFP_NOFS);
175 if (c->rules == NULL)
176 goto badmem;
177
178 /* buckets */
179 for (i = 0; i < c->max_buckets; i++) {
180 int size = 0;
181 u32 alg;
182 struct crush_bucket *b;
183
184 ceph_decode_32_safe(p, end, alg, bad);
185 if (alg == 0) {
186 c->buckets[i] = NULL;
187 continue;
188 }
189 dout("crush_decode bucket %d off %x %p to %p\n",
190 i, (int)(*p-start), *p, end);
191
192 switch (alg) {
193 case CRUSH_BUCKET_UNIFORM:
194 size = sizeof(struct crush_bucket_uniform);
195 break;
196 case CRUSH_BUCKET_LIST:
197 size = sizeof(struct crush_bucket_list);
198 break;
199 case CRUSH_BUCKET_TREE:
200 size = sizeof(struct crush_bucket_tree);
201 break;
202 case CRUSH_BUCKET_STRAW:
203 size = sizeof(struct crush_bucket_straw);
204 break;
205 default:
206 err = -EINVAL;
207 goto bad;
208 }
209 BUG_ON(size == 0);
210 b = c->buckets[i] = kzalloc(size, GFP_NOFS);
211 if (b == NULL)
212 goto badmem;
213
214 ceph_decode_need(p, end, 4*sizeof(u32), bad);
215 b->id = ceph_decode_32(p);
216 b->type = ceph_decode_16(p);
217 b->alg = ceph_decode_8(p);
218 b->hash = ceph_decode_8(p);
219 b->weight = ceph_decode_32(p);
220 b->size = ceph_decode_32(p);
221
222 dout("crush_decode bucket size %d off %x %p to %p\n",
223 b->size, (int)(*p-start), *p, end);
224
225 b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS);
226 if (b->items == NULL)
227 goto badmem;
228 b->perm = kcalloc(b->size, sizeof(u32), GFP_NOFS);
229 if (b->perm == NULL)
230 goto badmem;
231 b->perm_n = 0;
232
233 ceph_decode_need(p, end, b->size*sizeof(u32), bad);
234 for (j = 0; j < b->size; j++)
235 b->items[j] = ceph_decode_32(p);
236
237 switch (b->alg) {
238 case CRUSH_BUCKET_UNIFORM:
239 err = crush_decode_uniform_bucket(p, end,
240 (struct crush_bucket_uniform *)b);
241 if (err < 0)
242 goto bad;
243 break;
244 case CRUSH_BUCKET_LIST:
245 err = crush_decode_list_bucket(p, end,
246 (struct crush_bucket_list *)b);
247 if (err < 0)
248 goto bad;
249 break;
250 case CRUSH_BUCKET_TREE:
251 err = crush_decode_tree_bucket(p, end,
252 (struct crush_bucket_tree *)b);
253 if (err < 0)
254 goto bad;
255 break;
256 case CRUSH_BUCKET_STRAW:
257 err = crush_decode_straw_bucket(p, end,
258 (struct crush_bucket_straw *)b);
259 if (err < 0)
260 goto bad;
261 break;
262 }
263 }
264
265 /* rules */
266 dout("rule vec is %p\n", c->rules);
267 for (i = 0; i < c->max_rules; i++) {
268 u32 yes;
269 struct crush_rule *r;
270
271 ceph_decode_32_safe(p, end, yes, bad);
272 if (!yes) {
273 dout("crush_decode NO rule %d off %x %p to %p\n",
274 i, (int)(*p-start), *p, end);
275 c->rules[i] = NULL;
276 continue;
277 }
278
279 dout("crush_decode rule %d off %x %p to %p\n",
280 i, (int)(*p-start), *p, end);
281
282 /* len */
283 ceph_decode_32_safe(p, end, yes, bad);
284#if BITS_PER_LONG == 32
285 err = -EINVAL;
286 if (yes > ULONG_MAX / sizeof(struct crush_rule_step))
287 goto bad;
288#endif
289 r = c->rules[i] = kmalloc(sizeof(*r) +
290 yes*sizeof(struct crush_rule_step),
291 GFP_NOFS);
292 if (r == NULL)
293 goto badmem;
294 dout(" rule %d is at %p\n", i, r);
295 r->len = yes;
296 ceph_decode_copy_safe(p, end, &r->mask, 4, bad); /* 4 u8's */
297 ceph_decode_need(p, end, r->len*3*sizeof(u32), bad);
298 for (j = 0; j < r->len; j++) {
299 r->steps[j].op = ceph_decode_32(p);
300 r->steps[j].arg1 = ceph_decode_32(p);
301 r->steps[j].arg2 = ceph_decode_32(p);
302 }
303 }
304
305 /* ignore trailing name maps. */
306
307 dout("crush_decode success\n");
308 return c;
309
310badmem:
311 err = -ENOMEM;
312bad:
313 dout("crush_decode fail %d\n", err);
314 crush_destroy(c);
315 return ERR_PTR(err);
316}
317
318/*
319 * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
320 * to a set of osds)
321 */
322static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
323{
324 u64 a = *(u64 *)&l;
325 u64 b = *(u64 *)&r;
326
327 if (a < b)
328 return -1;
329 if (a > b)
330 return 1;
331 return 0;
332}
333
334static int __insert_pg_mapping(struct ceph_pg_mapping *new,
335 struct rb_root *root)
336{
337 struct rb_node **p = &root->rb_node;
338 struct rb_node *parent = NULL;
339 struct ceph_pg_mapping *pg = NULL;
340 int c;
341
342 while (*p) {
343 parent = *p;
344 pg = rb_entry(parent, struct ceph_pg_mapping, node);
345 c = pgid_cmp(new->pgid, pg->pgid);
346 if (c < 0)
347 p = &(*p)->rb_left;
348 else if (c > 0)
349 p = &(*p)->rb_right;
350 else
351 return -EEXIST;
352 }
353
354 rb_link_node(&new->node, parent, p);
355 rb_insert_color(&new->node, root);
356 return 0;
357}
358
359static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
360 struct ceph_pg pgid)
361{
362 struct rb_node *n = root->rb_node;
363 struct ceph_pg_mapping *pg;
364 int c;
365
366 while (n) {
367 pg = rb_entry(n, struct ceph_pg_mapping, node);
368 c = pgid_cmp(pgid, pg->pgid);
369 if (c < 0)
370 n = n->rb_left;
371 else if (c > 0)
372 n = n->rb_right;
373 else
374 return pg;
375 }
376 return NULL;
377}
378
379/*
380 * rbtree of pg pool info
381 */
382static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new)
383{
384 struct rb_node **p = &root->rb_node;
385 struct rb_node *parent = NULL;
386 struct ceph_pg_pool_info *pi = NULL;
387
388 while (*p) {
389 parent = *p;
390 pi = rb_entry(parent, struct ceph_pg_pool_info, node);
391 if (new->id < pi->id)
392 p = &(*p)->rb_left;
393 else if (new->id > pi->id)
394 p = &(*p)->rb_right;
395 else
396 return -EEXIST;
397 }
398
399 rb_link_node(&new->node, parent, p);
400 rb_insert_color(&new->node, root);
401 return 0;
402}
403
404static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id)
405{
406 struct ceph_pg_pool_info *pi;
407 struct rb_node *n = root->rb_node;
408
409 while (n) {
410 pi = rb_entry(n, struct ceph_pg_pool_info, node);
411 if (id < pi->id)
412 n = n->rb_left;
413 else if (id > pi->id)
414 n = n->rb_right;
415 else
416 return pi;
417 }
418 return NULL;
419}
420
421int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name)
422{
423 struct rb_node *rbp;
424
425 for (rbp = rb_first(&map->pg_pools); rbp; rbp = rb_next(rbp)) {
426 struct ceph_pg_pool_info *pi =
427 rb_entry(rbp, struct ceph_pg_pool_info, node);
428 if (pi->name && strcmp(pi->name, name) == 0)
429 return pi->id;
430 }
431 return -ENOENT;
432}
433EXPORT_SYMBOL(ceph_pg_poolid_by_name);
434
435static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)
436{
437 rb_erase(&pi->node, root);
438 kfree(pi->name);
439 kfree(pi);
440}
441
442static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
443{
444 unsigned n, m;
445
446 ceph_decode_copy(p, &pi->v, sizeof(pi->v));
447 calc_pg_masks(pi);
448
449 /* num_snaps * snap_info_t */
450 n = le32_to_cpu(pi->v.num_snaps);
451 while (n--) {
452 ceph_decode_need(p, end, sizeof(u64) + 1 + sizeof(u64) +
453 sizeof(struct ceph_timespec), bad);
454 *p += sizeof(u64) + /* key */
455 1 + sizeof(u64) + /* u8, snapid */
456 sizeof(struct ceph_timespec);
457 m = ceph_decode_32(p); /* snap name */
458 *p += m;
459 }
460
461 *p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2;
462 return 0;
463
464bad:
465 return -EINVAL;
466}
467
468static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
469{
470 struct ceph_pg_pool_info *pi;
471 u32 num, len, pool;
472
473 ceph_decode_32_safe(p, end, num, bad);
474 dout(" %d pool names\n", num);
475 while (num--) {
476 ceph_decode_32_safe(p, end, pool, bad);
477 ceph_decode_32_safe(p, end, len, bad);
478 dout(" pool %d len %d\n", pool, len);
479 pi = __lookup_pg_pool(&map->pg_pools, pool);
480 if (pi) {
481 kfree(pi->name);
482 pi->name = kmalloc(len + 1, GFP_NOFS);
483 if (pi->name) {
484 memcpy(pi->name, *p, len);
485 pi->name[len] = '\0';
486 dout(" name is %s\n", pi->name);
487 }
488 }
489 *p += len;
490 }
491 return 0;
492
493bad:
494 return -EINVAL;
495}
496
497/*
498 * osd map
499 */
500void ceph_osdmap_destroy(struct ceph_osdmap *map)
501{
502 dout("osdmap_destroy %p\n", map);
503 if (map->crush)
504 crush_destroy(map->crush);
505 while (!RB_EMPTY_ROOT(&map->pg_temp)) {
506 struct ceph_pg_mapping *pg =
507 rb_entry(rb_first(&map->pg_temp),
508 struct ceph_pg_mapping, node);
509 rb_erase(&pg->node, &map->pg_temp);
510 kfree(pg);
511 }
512 while (!RB_EMPTY_ROOT(&map->pg_pools)) {
513 struct ceph_pg_pool_info *pi =
514 rb_entry(rb_first(&map->pg_pools),
515 struct ceph_pg_pool_info, node);
516 __remove_pg_pool(&map->pg_pools, pi);
517 }
518 kfree(map->osd_state);
519 kfree(map->osd_weight);
520 kfree(map->osd_addr);
521 kfree(map);
522}
523
524/*
525 * adjust max osd value. reallocate arrays.
526 */
527static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
528{
529 u8 *state;
530 struct ceph_entity_addr *addr;
531 u32 *weight;
532
533 state = kcalloc(max, sizeof(*state), GFP_NOFS);
534 addr = kcalloc(max, sizeof(*addr), GFP_NOFS);
535 weight = kcalloc(max, sizeof(*weight), GFP_NOFS);
536 if (state == NULL || addr == NULL || weight == NULL) {
537 kfree(state);
538 kfree(addr);
539 kfree(weight);
540 return -ENOMEM;
541 }
542
543 /* copy old? */
544 if (map->osd_state) {
545 memcpy(state, map->osd_state, map->max_osd*sizeof(*state));
546 memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr));
547 memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight));
548 kfree(map->osd_state);
549 kfree(map->osd_addr);
550 kfree(map->osd_weight);
551 }
552
553 map->osd_state = state;
554 map->osd_weight = weight;
555 map->osd_addr = addr;
556 map->max_osd = max;
557 return 0;
558}
559
560/*
561 * decode a full map.
562 */
563struct ceph_osdmap *osdmap_decode(void **p, void *end)
564{
565 struct ceph_osdmap *map;
566 u16 version;
567 u32 len, max, i;
568 u8 ev;
569 int err = -EINVAL;
570 void *start = *p;
571 struct ceph_pg_pool_info *pi;
572
573 dout("osdmap_decode %p to %p len %d\n", *p, end, (int)(end - *p));
574
575 map = kzalloc(sizeof(*map), GFP_NOFS);
576 if (map == NULL)
577 return ERR_PTR(-ENOMEM);
578 map->pg_temp = RB_ROOT;
579
580 ceph_decode_16_safe(p, end, version, bad);
581 if (version > CEPH_OSDMAP_VERSION) {
582 pr_warning("got unknown v %d > %d of osdmap\n", version,
583 CEPH_OSDMAP_VERSION);
584 goto bad;
585 }
586
587 ceph_decode_need(p, end, 2*sizeof(u64)+6*sizeof(u32), bad);
588 ceph_decode_copy(p, &map->fsid, sizeof(map->fsid));
589 map->epoch = ceph_decode_32(p);
590 ceph_decode_copy(p, &map->created, sizeof(map->created));
591 ceph_decode_copy(p, &map->modified, sizeof(map->modified));
592
593 ceph_decode_32_safe(p, end, max, bad);
594 while (max--) {
595 ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad);
596 pi = kzalloc(sizeof(*pi), GFP_NOFS);
597 if (!pi)
598 goto bad;
599 pi->id = ceph_decode_32(p);
600 ev = ceph_decode_8(p); /* encoding version */
601 if (ev > CEPH_PG_POOL_VERSION) {
602 pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
603 ev, CEPH_PG_POOL_VERSION);
604 kfree(pi);
605 goto bad;
606 }
607 err = __decode_pool(p, end, pi);
608 if (err < 0)
609 goto bad;
610 __insert_pg_pool(&map->pg_pools, pi);
611 }
612
613 if (version >= 5 && __decode_pool_names(p, end, map) < 0)
614 goto bad;
615
616 ceph_decode_32_safe(p, end, map->pool_max, bad);
617
618 ceph_decode_32_safe(p, end, map->flags, bad);
619
620 max = ceph_decode_32(p);
621
622 /* (re)alloc osd arrays */
623 err = osdmap_set_max_osd(map, max);
624 if (err < 0)
625 goto bad;
626 dout("osdmap_decode max_osd = %d\n", map->max_osd);
627
628 /* osds */
629 err = -EINVAL;
630 ceph_decode_need(p, end, 3*sizeof(u32) +
631 map->max_osd*(1 + sizeof(*map->osd_weight) +
632 sizeof(*map->osd_addr)), bad);
633 *p += 4; /* skip length field (should match max) */
634 ceph_decode_copy(p, map->osd_state, map->max_osd);
635
636 *p += 4; /* skip length field (should match max) */
637 for (i = 0; i < map->max_osd; i++)
638 map->osd_weight[i] = ceph_decode_32(p);
639
640 *p += 4; /* skip length field (should match max) */
641 ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr));
642 for (i = 0; i < map->max_osd; i++)
643 ceph_decode_addr(&map->osd_addr[i]);
644
645 /* pg_temp */
646 ceph_decode_32_safe(p, end, len, bad);
647 for (i = 0; i < len; i++) {
648 int n, j;
649 struct ceph_pg pgid;
650 struct ceph_pg_mapping *pg;
651
652 ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad);
653 ceph_decode_copy(p, &pgid, sizeof(pgid));
654 n = ceph_decode_32(p);
655 ceph_decode_need(p, end, n * sizeof(u32), bad);
656 err = -ENOMEM;
657 pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS);
658 if (!pg)
659 goto bad;
660 pg->pgid = pgid;
661 pg->len = n;
662 for (j = 0; j < n; j++)
663 pg->osds[j] = ceph_decode_32(p);
664
665 err = __insert_pg_mapping(pg, &map->pg_temp);
666 if (err)
667 goto bad;
668 dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, len);
669 }
670
671 /* crush */
672 ceph_decode_32_safe(p, end, len, bad);
673 dout("osdmap_decode crush len %d from off 0x%x\n", len,
674 (int)(*p - start));
675 ceph_decode_need(p, end, len, bad);
676 map->crush = crush_decode(*p, end);
677 *p += len;
678 if (IS_ERR(map->crush)) {
679 err = PTR_ERR(map->crush);
680 map->crush = NULL;
681 goto bad;
682 }
683
684 /* ignore the rest of the map */
685 *p = end;
686
687 dout("osdmap_decode done %p %p\n", *p, end);
688 return map;
689
690bad:
691 dout("osdmap_decode fail\n");
692 ceph_osdmap_destroy(map);
693 return ERR_PTR(err);
694}
695
696/*
697 * decode and apply an incremental map update.
698 */
699struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
700 struct ceph_osdmap *map,
701 struct ceph_messenger *msgr)
702{
703 struct crush_map *newcrush = NULL;
704 struct ceph_fsid fsid;
705 u32 epoch = 0;
706 struct ceph_timespec modified;
707 u32 len, pool;
708 __s32 new_pool_max, new_flags, max;
709 void *start = *p;
710 int err = -EINVAL;
711 u16 version;
712 struct rb_node *rbp;
713
714 ceph_decode_16_safe(p, end, version, bad);
715 if (version > CEPH_OSDMAP_INC_VERSION) {
716 pr_warning("got unknown v %d > %d of inc osdmap\n", version,
717 CEPH_OSDMAP_INC_VERSION);
718 goto bad;
719 }
720
721 ceph_decode_need(p, end, sizeof(fsid)+sizeof(modified)+2*sizeof(u32),
722 bad);
723 ceph_decode_copy(p, &fsid, sizeof(fsid));
724 epoch = ceph_decode_32(p);
725 BUG_ON(epoch != map->epoch+1);
726 ceph_decode_copy(p, &modified, sizeof(modified));
727 new_pool_max = ceph_decode_32(p);
728 new_flags = ceph_decode_32(p);
729
730 /* full map? */
731 ceph_decode_32_safe(p, end, len, bad);
732 if (len > 0) {
733 dout("apply_incremental full map len %d, %p to %p\n",
734 len, *p, end);
735 return osdmap_decode(p, min(*p+len, end));
736 }
737
738 /* new crush? */
739 ceph_decode_32_safe(p, end, len, bad);
740 if (len > 0) {
741 dout("apply_incremental new crush map len %d, %p to %p\n",
742 len, *p, end);
743 newcrush = crush_decode(*p, min(*p+len, end));
744 if (IS_ERR(newcrush))
745 return ERR_CAST(newcrush);
746 *p += len;
747 }
748
749 /* new flags? */
750 if (new_flags >= 0)
751 map->flags = new_flags;
752 if (new_pool_max >= 0)
753 map->pool_max = new_pool_max;
754
755 ceph_decode_need(p, end, 5*sizeof(u32), bad);
756
757 /* new max? */
758 max = ceph_decode_32(p);
759 if (max >= 0) {
760 err = osdmap_set_max_osd(map, max);
761 if (err < 0)
762 goto bad;
763 }
764
765 map->epoch++;
766 map->modified = map->modified;
767 if (newcrush) {
768 if (map->crush)
769 crush_destroy(map->crush);
770 map->crush = newcrush;
771 newcrush = NULL;
772 }
773
774 /* new_pool */
775 ceph_decode_32_safe(p, end, len, bad);
776 while (len--) {
777 __u8 ev;
778 struct ceph_pg_pool_info *pi;
779
780 ceph_decode_32_safe(p, end, pool, bad);
781 ceph_decode_need(p, end, 1 + sizeof(pi->v), bad);
782 ev = ceph_decode_8(p); /* encoding version */
783 if (ev > CEPH_PG_POOL_VERSION) {
784 pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
785 ev, CEPH_PG_POOL_VERSION);
786 goto bad;
787 }
788 pi = __lookup_pg_pool(&map->pg_pools, pool);
789 if (!pi) {
790 pi = kzalloc(sizeof(*pi), GFP_NOFS);
791 if (!pi) {
792 err = -ENOMEM;
793 goto bad;
794 }
795 pi->id = pool;
796 __insert_pg_pool(&map->pg_pools, pi);
797 }
798 err = __decode_pool(p, end, pi);
799 if (err < 0)
800 goto bad;
801 }
802 if (version >= 5 && __decode_pool_names(p, end, map) < 0)
803 goto bad;
804
805 /* old_pool */
806 ceph_decode_32_safe(p, end, len, bad);
807 while (len--) {
808 struct ceph_pg_pool_info *pi;
809
810 ceph_decode_32_safe(p, end, pool, bad);
811 pi = __lookup_pg_pool(&map->pg_pools, pool);
812 if (pi)
813 __remove_pg_pool(&map->pg_pools, pi);
814 }
815
816 /* new_up */
817 err = -EINVAL;
818 ceph_decode_32_safe(p, end, len, bad);
819 while (len--) {
820 u32 osd;
821 struct ceph_entity_addr addr;
822 ceph_decode_32_safe(p, end, osd, bad);
823 ceph_decode_copy_safe(p, end, &addr, sizeof(addr), bad);
824 ceph_decode_addr(&addr);
825 pr_info("osd%d up\n", osd);
826 BUG_ON(osd >= map->max_osd);
827 map->osd_state[osd] |= CEPH_OSD_UP;
828 map->osd_addr[osd] = addr;
829 }
830
831 /* new_down */
832 ceph_decode_32_safe(p, end, len, bad);
833 while (len--) {
834 u32 osd;
835 ceph_decode_32_safe(p, end, osd, bad);
836 (*p)++; /* clean flag */
837 pr_info("osd%d down\n", osd);
838 if (osd < map->max_osd)
839 map->osd_state[osd] &= ~CEPH_OSD_UP;
840 }
841
842 /* new_weight */
843 ceph_decode_32_safe(p, end, len, bad);
844 while (len--) {
845 u32 osd, off;
846 ceph_decode_need(p, end, sizeof(u32)*2, bad);
847 osd = ceph_decode_32(p);
848 off = ceph_decode_32(p);
849 pr_info("osd%d weight 0x%x %s\n", osd, off,
850 off == CEPH_OSD_IN ? "(in)" :
851 (off == CEPH_OSD_OUT ? "(out)" : ""));
852 if (osd < map->max_osd)
853 map->osd_weight[osd] = off;
854 }
855
856 /* new_pg_temp */
857 rbp = rb_first(&map->pg_temp);
858 ceph_decode_32_safe(p, end, len, bad);
859 while (len--) {
860 struct ceph_pg_mapping *pg;
861 int j;
862 struct ceph_pg pgid;
863 u32 pglen;
864 ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
865 ceph_decode_copy(p, &pgid, sizeof(pgid));
866 pglen = ceph_decode_32(p);
867
868 /* remove any? */
869 while (rbp && pgid_cmp(rb_entry(rbp, struct ceph_pg_mapping,
870 node)->pgid, pgid) <= 0) {
871 struct ceph_pg_mapping *cur =
872 rb_entry(rbp, struct ceph_pg_mapping, node);
873
874 rbp = rb_next(rbp);
875 dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid);
876 rb_erase(&cur->node, &map->pg_temp);
877 kfree(cur);
878 }
879
880 if (pglen) {
881 /* insert */
882 ceph_decode_need(p, end, pglen*sizeof(u32), bad);
883 pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS);
884 if (!pg) {
885 err = -ENOMEM;
886 goto bad;
887 }
888 pg->pgid = pgid;
889 pg->len = pglen;
890 for (j = 0; j < pglen; j++)
891 pg->osds[j] = ceph_decode_32(p);
892 err = __insert_pg_mapping(pg, &map->pg_temp);
893 if (err) {
894 kfree(pg);
895 goto bad;
896 }
897 dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid,
898 pglen);
899 }
900 }
901 while (rbp) {
902 struct ceph_pg_mapping *cur =
903 rb_entry(rbp, struct ceph_pg_mapping, node);
904
905 rbp = rb_next(rbp);
906 dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid);
907 rb_erase(&cur->node, &map->pg_temp);
908 kfree(cur);
909 }
910
911 /* ignore the rest */
912 *p = end;
913 return map;
914
915bad:
916 pr_err("corrupt inc osdmap epoch %d off %d (%p of %p-%p)\n",
917 epoch, (int)(*p - start), *p, start, end);
918 print_hex_dump(KERN_DEBUG, "osdmap: ",
919 DUMP_PREFIX_OFFSET, 16, 1,
920 start, end - start, true);
921 if (newcrush)
922 crush_destroy(newcrush);
923 return ERR_PTR(err);
924}
925
926
927
928
929/*
930 * calculate file layout from given offset, length.
931 * fill in correct oid, logical length, and object extent
932 * offset, length.
933 *
934 * for now, we write only a single su, until we can
935 * pass a stride back to the caller.
936 */
937void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
938 u64 off, u64 *plen,
939 u64 *ono,
940 u64 *oxoff, u64 *oxlen)
941{
942 u32 osize = le32_to_cpu(layout->fl_object_size);
943 u32 su = le32_to_cpu(layout->fl_stripe_unit);
944 u32 sc = le32_to_cpu(layout->fl_stripe_count);
945 u32 bl, stripeno, stripepos, objsetno;
946 u32 su_per_object;
947 u64 t, su_offset;
948
949 dout("mapping %llu~%llu osize %u fl_su %u\n", off, *plen,
950 osize, su);
951 su_per_object = osize / su;
952 dout("osize %u / su %u = su_per_object %u\n", osize, su,
953 su_per_object);
954
955 BUG_ON((su & ~PAGE_MASK) != 0);
956 /* bl = *off / su; */
957 t = off;
958 do_div(t, su);
959 bl = t;
960 dout("off %llu / su %u = bl %u\n", off, su, bl);
961
962 stripeno = bl / sc;
963 stripepos = bl % sc;
964 objsetno = stripeno / su_per_object;
965
966 *ono = objsetno * sc + stripepos;
967 dout("objset %u * sc %u = ono %u\n", objsetno, sc, (unsigned)*ono);
968
969 /* *oxoff = *off % layout->fl_stripe_unit; # offset in su */
970 t = off;
971 su_offset = do_div(t, su);
972 *oxoff = su_offset + (stripeno % su_per_object) * su;
973
974 /*
975 * Calculate the length of the extent being written to the selected
976 * object. This is the minimum of the full length requested (plen) or
977 * the remainder of the current stripe being written to.
978 */
979 *oxlen = min_t(u64, *plen, su - su_offset);
980 *plen = *oxlen;
981
982 dout(" obj extent %llu~%llu\n", *oxoff, *oxlen);
983}
984EXPORT_SYMBOL(ceph_calc_file_object_mapping);
985
986/*
987 * calculate an object layout (i.e. pgid) from an oid,
988 * file_layout, and osdmap
989 */
990int ceph_calc_object_layout(struct ceph_object_layout *ol,
991 const char *oid,
992 struct ceph_file_layout *fl,
993 struct ceph_osdmap *osdmap)
994{
995 unsigned num, num_mask;
996 struct ceph_pg pgid;
997 s32 preferred = (s32)le32_to_cpu(fl->fl_pg_preferred);
998 int poolid = le32_to_cpu(fl->fl_pg_pool);
999 struct ceph_pg_pool_info *pool;
1000 unsigned ps;
1001
1002 BUG_ON(!osdmap);
1003
1004 pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
1005 if (!pool)
1006 return -EIO;
1007 ps = ceph_str_hash(pool->v.object_hash, oid, strlen(oid));
1008 if (preferred >= 0) {
1009 ps += preferred;
1010 num = le32_to_cpu(pool->v.lpg_num);
1011 num_mask = pool->lpg_num_mask;
1012 } else {
1013 num = le32_to_cpu(pool->v.pg_num);
1014 num_mask = pool->pg_num_mask;
1015 }
1016
1017 pgid.ps = cpu_to_le16(ps);
1018 pgid.preferred = cpu_to_le16(preferred);
1019 pgid.pool = fl->fl_pg_pool;
1020 if (preferred >= 0)
1021 dout("calc_object_layout '%s' pgid %d.%xp%d\n", oid, poolid, ps,
1022 (int)preferred);
1023 else
1024 dout("calc_object_layout '%s' pgid %d.%x\n", oid, poolid, ps);
1025
1026 ol->ol_pgid = pgid;
1027 ol->ol_stripe_unit = fl->fl_object_stripe_unit;
1028 return 0;
1029}
1030EXPORT_SYMBOL(ceph_calc_object_layout);
1031
1032/*
1033 * Calculate raw osd vector for the given pgid. Return pointer to osd
1034 * array, or NULL on failure.
1035 */
1036static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
1037 int *osds, int *num)
1038{
1039 struct ceph_pg_mapping *pg;
1040 struct ceph_pg_pool_info *pool;
1041 int ruleno;
1042 unsigned poolid, ps, pps;
1043 int preferred;
1044
1045 /* pg_temp? */
1046 pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
1047 if (pg) {
1048 *num = pg->len;
1049 return pg->osds;
1050 }
1051
1052 /* crush */
1053 poolid = le32_to_cpu(pgid.pool);
1054 ps = le16_to_cpu(pgid.ps);
1055 preferred = (s16)le16_to_cpu(pgid.preferred);
1056
1057 /* don't forcefeed bad device ids to crush */
1058 if (preferred >= osdmap->max_osd ||
1059 preferred >= osdmap->crush->max_devices)
1060 preferred = -1;
1061
1062 pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
1063 if (!pool)
1064 return NULL;
1065 ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset,
1066 pool->v.type, pool->v.size);
1067 if (ruleno < 0) {
1068 pr_err("no crush rule pool %d ruleset %d type %d size %d\n",
1069 poolid, pool->v.crush_ruleset, pool->v.type,
1070 pool->v.size);
1071 return NULL;
1072 }
1073
1074 if (preferred >= 0)
1075 pps = ceph_stable_mod(ps,
1076 le32_to_cpu(pool->v.lpgp_num),
1077 pool->lpgp_num_mask);
1078 else
1079 pps = ceph_stable_mod(ps,
1080 le32_to_cpu(pool->v.pgp_num),
1081 pool->pgp_num_mask);
1082 pps += poolid;
1083 *num = crush_do_rule(osdmap->crush, ruleno, pps, osds,
1084 min_t(int, pool->v.size, *num),
1085 preferred, osdmap->osd_weight);
1086 return osds;
1087}
1088
1089/*
1090 * Return acting set for given pgid.
1091 */
1092int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
1093 int *acting)
1094{
1095 int rawosds[CEPH_PG_MAX_SIZE], *osds;
1096 int i, o, num = CEPH_PG_MAX_SIZE;
1097
1098 osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
1099 if (!osds)
1100 return -1;
1101
1102 /* primary is first up osd */
1103 o = 0;
1104 for (i = 0; i < num; i++)
1105 if (ceph_osd_is_up(osdmap, osds[i]))
1106 acting[o++] = osds[i];
1107 return o;
1108}
1109
1110/*
1111 * Return primary osd for given pgid, or -1 if none.
1112 */
1113int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
1114{
1115 int rawosds[CEPH_PG_MAX_SIZE], *osds;
1116 int i, num = CEPH_PG_MAX_SIZE;
1117
1118 osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
1119 if (!osds)
1120 return -1;
1121
1122 /* primary is first up osd */
1123 for (i = 0; i < num; i++)
1124 if (ceph_osd_is_up(osdmap, osds[i]))
1125 return osds[i];
1126 return -1;
1127}
1128EXPORT_SYMBOL(ceph_calc_pg_primary);
diff --git a/net/ceph/pagelist.c b/net/ceph/pagelist.c
new file mode 100644
index 000000000000..13cb409a7bba
--- /dev/null
+++ b/net/ceph/pagelist.c
@@ -0,0 +1,154 @@
1
2#include <linux/module.h>
3#include <linux/gfp.h>
4#include <linux/pagemap.h>
5#include <linux/highmem.h>
6#include <linux/ceph/pagelist.h>
7
8static void ceph_pagelist_unmap_tail(struct ceph_pagelist *pl)
9{
10 if (pl->mapped_tail) {
11 struct page *page = list_entry(pl->head.prev, struct page, lru);
12 kunmap(page);
13 pl->mapped_tail = NULL;
14 }
15}
16
17int ceph_pagelist_release(struct ceph_pagelist *pl)
18{
19 ceph_pagelist_unmap_tail(pl);
20 while (!list_empty(&pl->head)) {
21 struct page *page = list_first_entry(&pl->head, struct page,
22 lru);
23 list_del(&page->lru);
24 __free_page(page);
25 }
26 ceph_pagelist_free_reserve(pl);
27 return 0;
28}
29EXPORT_SYMBOL(ceph_pagelist_release);
30
31static int ceph_pagelist_addpage(struct ceph_pagelist *pl)
32{
33 struct page *page;
34
35 if (!pl->num_pages_free) {
36 page = __page_cache_alloc(GFP_NOFS);
37 } else {
38 page = list_first_entry(&pl->free_list, struct page, lru);
39 list_del(&page->lru);
40 --pl->num_pages_free;
41 }
42 if (!page)
43 return -ENOMEM;
44 pl->room += PAGE_SIZE;
45 ceph_pagelist_unmap_tail(pl);
46 list_add_tail(&page->lru, &pl->head);
47 pl->mapped_tail = kmap(page);
48 return 0;
49}
50
51int ceph_pagelist_append(struct ceph_pagelist *pl, const void *buf, size_t len)
52{
53 while (pl->room < len) {
54 size_t bit = pl->room;
55 int ret;
56
57 memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK),
58 buf, bit);
59 pl->length += bit;
60 pl->room -= bit;
61 buf += bit;
62 len -= bit;
63 ret = ceph_pagelist_addpage(pl);
64 if (ret)
65 return ret;
66 }
67
68 memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), buf, len);
69 pl->length += len;
70 pl->room -= len;
71 return 0;
72}
73EXPORT_SYMBOL(ceph_pagelist_append);
74
75/**
76 * Allocate enough pages for a pagelist to append the given amount
77 * of data without without allocating.
78 * Returns: 0 on success, -ENOMEM on error.
79 */
80int ceph_pagelist_reserve(struct ceph_pagelist *pl, size_t space)
81{
82 if (space <= pl->room)
83 return 0;
84 space -= pl->room;
85 space = (space + PAGE_SIZE - 1) >> PAGE_SHIFT; /* conv to num pages */
86
87 while (space > pl->num_pages_free) {
88 struct page *page = __page_cache_alloc(GFP_NOFS);
89 if (!page)
90 return -ENOMEM;
91 list_add_tail(&page->lru, &pl->free_list);
92 ++pl->num_pages_free;
93 }
94 return 0;
95}
96EXPORT_SYMBOL(ceph_pagelist_reserve);
97
98/**
99 * Free any pages that have been preallocated.
100 */
101int ceph_pagelist_free_reserve(struct ceph_pagelist *pl)
102{
103 while (!list_empty(&pl->free_list)) {
104 struct page *page = list_first_entry(&pl->free_list,
105 struct page, lru);
106 list_del(&page->lru);
107 __free_page(page);
108 --pl->num_pages_free;
109 }
110 BUG_ON(pl->num_pages_free);
111 return 0;
112}
113EXPORT_SYMBOL(ceph_pagelist_free_reserve);
114
115/**
116 * Create a truncation point.
117 */
118void ceph_pagelist_set_cursor(struct ceph_pagelist *pl,
119 struct ceph_pagelist_cursor *c)
120{
121 c->pl = pl;
122 c->page_lru = pl->head.prev;
123 c->room = pl->room;
124}
125EXPORT_SYMBOL(ceph_pagelist_set_cursor);
126
127/**
128 * Truncate a pagelist to the given point. Move extra pages to reserve.
129 * This won't sleep.
130 * Returns: 0 on success,
131 * -EINVAL if the pagelist doesn't match the trunc point pagelist
132 */
133int ceph_pagelist_truncate(struct ceph_pagelist *pl,
134 struct ceph_pagelist_cursor *c)
135{
136 struct page *page;
137
138 if (pl != c->pl)
139 return -EINVAL;
140 ceph_pagelist_unmap_tail(pl);
141 while (pl->head.prev != c->page_lru) {
142 page = list_entry(pl->head.prev, struct page, lru);
143 list_del(&page->lru); /* remove from pagelist */
144 list_add_tail(&page->lru, &pl->free_list); /* add to reserve */
145 ++pl->num_pages_free;
146 }
147 pl->room = c->room;
148 if (!list_empty(&pl->head)) {
149 page = list_entry(pl->head.prev, struct page, lru);
150 pl->mapped_tail = kmap(page);
151 }
152 return 0;
153}
154EXPORT_SYMBOL(ceph_pagelist_truncate);
diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c
new file mode 100644
index 000000000000..54caf0687155
--- /dev/null
+++ b/net/ceph/pagevec.c
@@ -0,0 +1,223 @@
1#include <linux/ceph/ceph_debug.h>
2
3#include <linux/module.h>
4#include <linux/sched.h>
5#include <linux/slab.h>
6#include <linux/file.h>
7#include <linux/namei.h>
8#include <linux/writeback.h>
9
10#include <linux/ceph/libceph.h>
11
12/*
13 * build a vector of user pages
14 */
15struct page **ceph_get_direct_page_vector(const char __user *data,
16 int num_pages,
17 loff_t off, size_t len)
18{
19 struct page **pages;
20 int rc;
21
22 pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS);
23 if (!pages)
24 return ERR_PTR(-ENOMEM);
25
26 down_read(&current->mm->mmap_sem);
27 rc = get_user_pages(current, current->mm, (unsigned long)data,
28 num_pages, 0, 0, pages, NULL);
29 up_read(&current->mm->mmap_sem);
30 if (rc < 0)
31 goto fail;
32 return pages;
33
34fail:
35 kfree(pages);
36 return ERR_PTR(rc);
37}
38EXPORT_SYMBOL(ceph_get_direct_page_vector);
39
/*
 * Drop one reference on each of the first @num_pages pages in @pages,
 * then free the array itself.
 */
void ceph_put_page_vector(struct page **pages, int num_pages)
{
	int idx = 0;

	while (idx < num_pages) {
		put_page(pages[idx]);
		idx++;
	}
	kfree(pages);
}
EXPORT_SYMBOL(ceph_put_page_vector);
49
/*
 * Free each of the first @num_pages pages in @pages (order-0), then free
 * the array itself.
 */
void ceph_release_page_vector(struct page **pages, int num_pages)
{
	int idx = 0;

	while (idx < num_pages) {
		__free_pages(pages[idx], 0);
		idx++;
	}
	kfree(pages);
}
EXPORT_SYMBOL(ceph_release_page_vector);
59
60/*
61 * allocate a vector new pages
62 */
63struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags)
64{
65 struct page **pages;
66 int i;
67
68 pages = kmalloc(sizeof(*pages) * num_pages, flags);
69 if (!pages)
70 return ERR_PTR(-ENOMEM);
71 for (i = 0; i < num_pages; i++) {
72 pages[i] = __page_cache_alloc(flags);
73 if (pages[i] == NULL) {
74 ceph_release_page_vector(pages, i);
75 return ERR_PTR(-ENOMEM);
76 }
77 }
78 return pages;
79}
80EXPORT_SYMBOL(ceph_alloc_page_vector);
81
82/*
83 * copy user data into a page vector
84 */
85int ceph_copy_user_to_page_vector(struct page **pages,
86 const char __user *data,
87 loff_t off, size_t len)
88{
89 int i = 0;
90 int po = off & ~PAGE_CACHE_MASK;
91 int left = len;
92 int l, bad;
93
94 while (left > 0) {
95 l = min_t(int, PAGE_CACHE_SIZE-po, left);
96 bad = copy_from_user(page_address(pages[i]) + po, data, l);
97 if (bad == l)
98 return -EFAULT;
99 data += l - bad;
100 left -= l - bad;
101 po += l - bad;
102 if (po == PAGE_CACHE_SIZE) {
103 po = 0;
104 i++;
105 }
106 }
107 return len;
108}
109EXPORT_SYMBOL(ceph_copy_user_to_page_vector);
110
111int ceph_copy_to_page_vector(struct page **pages,
112 const char *data,
113 loff_t off, size_t len)
114{
115 int i = 0;
116 size_t po = off & ~PAGE_CACHE_MASK;
117 size_t left = len;
118 size_t l;
119
120 while (left > 0) {
121 l = min_t(size_t, PAGE_CACHE_SIZE-po, left);
122 memcpy(page_address(pages[i]) + po, data, l);
123 data += l;
124 left -= l;
125 po += l;
126 if (po == PAGE_CACHE_SIZE) {
127 po = 0;
128 i++;
129 }
130 }
131 return len;
132}
133EXPORT_SYMBOL(ceph_copy_to_page_vector);
134
135int ceph_copy_from_page_vector(struct page **pages,
136 char *data,
137 loff_t off, size_t len)
138{
139 int i = 0;
140 size_t po = off & ~PAGE_CACHE_MASK;
141 size_t left = len;
142 size_t l;
143
144 while (left > 0) {
145 l = min_t(size_t, PAGE_CACHE_SIZE-po, left);
146 memcpy(data, page_address(pages[i]) + po, l);
147 data += l;
148 left -= l;
149 po += l;
150 if (po == PAGE_CACHE_SIZE) {
151 po = 0;
152 i++;
153 }
154 }
155 return len;
156}
157EXPORT_SYMBOL(ceph_copy_from_page_vector);
158
159/*
160 * copy user data from a page vector into a user pointer
161 */
162int ceph_copy_page_vector_to_user(struct page **pages,
163 char __user *data,
164 loff_t off, size_t len)
165{
166 int i = 0;
167 int po = off & ~PAGE_CACHE_MASK;
168 int left = len;
169 int l, bad;
170
171 while (left > 0) {
172 l = min_t(int, left, PAGE_CACHE_SIZE-po);
173 bad = copy_to_user(data, page_address(pages[i]) + po, l);
174 if (bad == l)
175 return -EFAULT;
176 data += l - bad;
177 left -= l - bad;
178 if (po) {
179 po += l - bad;
180 if (po == PAGE_CACHE_SIZE)
181 po = 0;
182 }
183 i++;
184 }
185 return len;
186}
187EXPORT_SYMBOL(ceph_copy_page_vector_to_user);
188
189/*
190 * Zero an extent within a page vector. Offset is relative to the
191 * start of the first page.
192 */
193void ceph_zero_page_vector_range(int off, int len, struct page **pages)
194{
195 int i = off >> PAGE_CACHE_SHIFT;
196
197 off &= ~PAGE_CACHE_MASK;
198
199 dout("zero_page_vector_page %u~%u\n", off, len);
200
201 /* leading partial page? */
202 if (off) {
203 int end = min((int)PAGE_CACHE_SIZE, off + len);
204 dout("zeroing %d %p head from %d\n", i, pages[i],
205 (int)off);
206 zero_user_segment(pages[i], off, end);
207 len -= (end - off);
208 i++;
209 }
210 while (len >= PAGE_CACHE_SIZE) {
211 dout("zeroing %d %p len=%d\n", i, pages[i], len);
212 zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE);
213 len -= PAGE_CACHE_SIZE;
214 i++;
215 }
216 /* trailing partial page? */
217 if (len) {
218 dout("zeroing %d %p tail to %d\n", i, pages[i], (int)len);
219 zero_user_segment(pages[i], 0, len);
220 }
221}
222EXPORT_SYMBOL(ceph_zero_page_vector_range);
223
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 251997a95483..282806ba7a57 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -243,6 +243,7 @@ void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb)
243 unlock_sock_fast(sk, slow); 243 unlock_sock_fast(sk, slow);
244 244
245 /* skb is now orphaned, can be freed outside of locked section */ 245 /* skb is now orphaned, can be freed outside of locked section */
246 trace_kfree_skb(skb, skb_free_datagram_locked);
246 __kfree_skb(skb); 247 __kfree_skb(skb);
247} 248}
248EXPORT_SYMBOL(skb_free_datagram_locked); 249EXPORT_SYMBOL(skb_free_datagram_locked);
diff --git a/net/core/dev.c b/net/core/dev.c
index 3721fbb9a83c..7ec85e27beed 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -128,6 +128,8 @@
128#include <linux/jhash.h> 128#include <linux/jhash.h>
129#include <linux/random.h> 129#include <linux/random.h>
130#include <trace/events/napi.h> 130#include <trace/events/napi.h>
131#include <trace/events/net.h>
132#include <trace/events/skb.h>
131#include <linux/pci.h> 133#include <linux/pci.h>
132 134
133#include "net-sysfs.h" 135#include "net-sysfs.h"
@@ -1978,6 +1980,7 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
1978 } 1980 }
1979 1981
1980 rc = ops->ndo_start_xmit(skb, dev); 1982 rc = ops->ndo_start_xmit(skb, dev);
1983 trace_net_dev_xmit(skb, rc);
1981 if (rc == NETDEV_TX_OK) 1984 if (rc == NETDEV_TX_OK)
1982 txq_trans_update(txq); 1985 txq_trans_update(txq);
1983 return rc; 1986 return rc;
@@ -1998,6 +2001,7 @@ gso:
1998 skb_dst_drop(nskb); 2001 skb_dst_drop(nskb);
1999 2002
2000 rc = ops->ndo_start_xmit(nskb, dev); 2003 rc = ops->ndo_start_xmit(nskb, dev);
2004 trace_net_dev_xmit(nskb, rc);
2001 if (unlikely(rc != NETDEV_TX_OK)) { 2005 if (unlikely(rc != NETDEV_TX_OK)) {
2002 if (rc & ~NETDEV_TX_MASK) 2006 if (rc & ~NETDEV_TX_MASK)
2003 goto out_kfree_gso_skb; 2007 goto out_kfree_gso_skb;
@@ -2058,16 +2062,16 @@ static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2058 struct sk_buff *skb) 2062 struct sk_buff *skb)
2059{ 2063{
2060 int queue_index; 2064 int queue_index;
2061 struct sock *sk = skb->sk; 2065 const struct net_device_ops *ops = dev->netdev_ops;
2062 2066
2063 queue_index = sk_tx_queue_get(sk); 2067 if (ops->ndo_select_queue) {
2064 if (queue_index < 0) { 2068 queue_index = ops->ndo_select_queue(dev, skb);
2065 const struct net_device_ops *ops = dev->netdev_ops; 2069 queue_index = dev_cap_txqueue(dev, queue_index);
2070 } else {
2071 struct sock *sk = skb->sk;
2072 queue_index = sk_tx_queue_get(sk);
2073 if (queue_index < 0) {
2066 2074
2067 if (ops->ndo_select_queue) {
2068 queue_index = ops->ndo_select_queue(dev, skb);
2069 queue_index = dev_cap_txqueue(dev, queue_index);
2070 } else {
2071 queue_index = 0; 2075 queue_index = 0;
2072 if (dev->real_num_tx_queues > 1) 2076 if (dev->real_num_tx_queues > 1)
2073 queue_index = skb_tx_hash(dev, skb); 2077 queue_index = skb_tx_hash(dev, skb);
@@ -2186,6 +2190,7 @@ int dev_queue_xmit(struct sk_buff *skb)
2186#ifdef CONFIG_NET_CLS_ACT 2190#ifdef CONFIG_NET_CLS_ACT
2187 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS); 2191 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2188#endif 2192#endif
2193 trace_net_dev_queue(skb);
2189 if (q->enqueue) { 2194 if (q->enqueue) {
2190 rc = __dev_xmit_skb(skb, q, dev, txq); 2195 rc = __dev_xmit_skb(skb, q, dev, txq);
2191 goto out; 2196 goto out;
@@ -2512,6 +2517,7 @@ int netif_rx(struct sk_buff *skb)
2512 if (netdev_tstamp_prequeue) 2517 if (netdev_tstamp_prequeue)
2513 net_timestamp_check(skb); 2518 net_timestamp_check(skb);
2514 2519
2520 trace_netif_rx(skb);
2515#ifdef CONFIG_RPS 2521#ifdef CONFIG_RPS
2516 { 2522 {
2517 struct rps_dev_flow voidflow, *rflow = &voidflow; 2523 struct rps_dev_flow voidflow, *rflow = &voidflow;
@@ -2571,6 +2577,7 @@ static void net_tx_action(struct softirq_action *h)
2571 clist = clist->next; 2577 clist = clist->next;
2572 2578
2573 WARN_ON(atomic_read(&skb->users)); 2579 WARN_ON(atomic_read(&skb->users));
2580 trace_kfree_skb(skb, net_tx_action);
2574 __kfree_skb(skb); 2581 __kfree_skb(skb);
2575 } 2582 }
2576 } 2583 }
@@ -2828,6 +2835,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
2828 if (!netdev_tstamp_prequeue) 2835 if (!netdev_tstamp_prequeue)
2829 net_timestamp_check(skb); 2836 net_timestamp_check(skb);
2830 2837
2838 trace_netif_receive_skb(skb);
2831 if (vlan_tx_tag_present(skb) && vlan_hwaccel_do_receive(skb)) 2839 if (vlan_tx_tag_present(skb) && vlan_hwaccel_do_receive(skb))
2832 return NET_RX_SUCCESS; 2840 return NET_RX_SUCCESS;
2833 2841
@@ -4845,7 +4853,7 @@ static void rollback_registered_many(struct list_head *head)
4845 dev = list_first_entry(head, struct net_device, unreg_list); 4853 dev = list_first_entry(head, struct net_device, unreg_list);
4846 call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev); 4854 call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
4847 4855
4848 synchronize_net(); 4856 rcu_barrier();
4849 4857
4850 list_for_each_entry(dev, head, unreg_list) 4858 list_for_each_entry(dev, head, unreg_list)
4851 dev_put(dev); 4859 dev_put(dev);
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 7a85367b3c2f..8451ab481095 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -348,7 +348,7 @@ static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev,
348 if (info.cmd == ETHTOOL_GRXCLSRLALL) { 348 if (info.cmd == ETHTOOL_GRXCLSRLALL) {
349 if (info.rule_cnt > 0) { 349 if (info.rule_cnt > 0) {
350 if (info.rule_cnt <= KMALLOC_MAX_SIZE / sizeof(u32)) 350 if (info.rule_cnt <= KMALLOC_MAX_SIZE / sizeof(u32))
351 rule_buf = kmalloc(info.rule_cnt * sizeof(u32), 351 rule_buf = kzalloc(info.rule_cnt * sizeof(u32),
352 GFP_USER); 352 GFP_USER);
353 if (!rule_buf) 353 if (!rule_buf)
354 return -ENOMEM; 354 return -ENOMEM;
@@ -397,7 +397,7 @@ static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev,
397 (KMALLOC_MAX_SIZE - sizeof(*indir)) / sizeof(*indir->ring_index)) 397 (KMALLOC_MAX_SIZE - sizeof(*indir)) / sizeof(*indir->ring_index))
398 return -ENOMEM; 398 return -ENOMEM;
399 full_size = sizeof(*indir) + sizeof(*indir->ring_index) * table_size; 399 full_size = sizeof(*indir) + sizeof(*indir->ring_index) * table_size;
400 indir = kmalloc(full_size, GFP_USER); 400 indir = kzalloc(full_size, GFP_USER);
401 if (!indir) 401 if (!indir)
402 return -ENOMEM; 402 return -ENOMEM;
403 403
@@ -538,7 +538,7 @@ static int ethtool_get_rx_ntuple(struct net_device *dev, void __user *useraddr)
538 538
539 gstrings.len = ret; 539 gstrings.len = ret;
540 540
541 data = kmalloc(gstrings.len * ETH_GSTRING_LEN, GFP_USER); 541 data = kzalloc(gstrings.len * ETH_GSTRING_LEN, GFP_USER);
542 if (!data) 542 if (!data)
543 return -ENOMEM; 543 return -ENOMEM;
544 544
@@ -775,7 +775,7 @@ static int ethtool_get_regs(struct net_device *dev, char __user *useraddr)
775 if (regs.len > reglen) 775 if (regs.len > reglen)
776 regs.len = reglen; 776 regs.len = reglen;
777 777
778 regbuf = kmalloc(reglen, GFP_USER); 778 regbuf = kzalloc(reglen, GFP_USER);
779 if (!regbuf) 779 if (!regbuf)
780 return -ENOMEM; 780 return -ENOMEM;
781 781
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index 9fbe7f7429b0..6743146e4d6b 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -232,7 +232,7 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
232 est->last_packets = bstats->packets; 232 est->last_packets = bstats->packets;
233 est->avpps = rate_est->pps<<10; 233 est->avpps = rate_est->pps<<10;
234 234
235 spin_lock(&est_tree_lock); 235 spin_lock_bh(&est_tree_lock);
236 if (!elist[idx].timer.function) { 236 if (!elist[idx].timer.function) {
237 INIT_LIST_HEAD(&elist[idx].list); 237 INIT_LIST_HEAD(&elist[idx].list);
238 setup_timer(&elist[idx].timer, est_timer, idx); 238 setup_timer(&elist[idx].timer, est_timer, idx);
@@ -243,7 +243,7 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
243 243
244 list_add_rcu(&est->list, &elist[idx].list); 244 list_add_rcu(&est->list, &elist[idx].list);
245 gen_add_node(est); 245 gen_add_node(est);
246 spin_unlock(&est_tree_lock); 246 spin_unlock_bh(&est_tree_lock);
247 247
248 return 0; 248 return 0;
249} 249}
@@ -270,7 +270,7 @@ void gen_kill_estimator(struct gnet_stats_basic_packed *bstats,
270{ 270{
271 struct gen_estimator *e; 271 struct gen_estimator *e;
272 272
273 spin_lock(&est_tree_lock); 273 spin_lock_bh(&est_tree_lock);
274 while ((e = gen_find_node(bstats, rate_est))) { 274 while ((e = gen_find_node(bstats, rate_est))) {
275 rb_erase(&e->node, &est_root); 275 rb_erase(&e->node, &est_root);
276 276
@@ -281,7 +281,7 @@ void gen_kill_estimator(struct gnet_stats_basic_packed *bstats,
281 list_del_rcu(&e->list); 281 list_del_rcu(&e->list);
282 call_rcu(&e->e_rcu, __gen_kill_estimator); 282 call_rcu(&e->e_rcu, __gen_kill_estimator);
283 } 283 }
284 spin_unlock(&est_tree_lock); 284 spin_unlock_bh(&est_tree_lock);
285} 285}
286EXPORT_SYMBOL(gen_kill_estimator); 286EXPORT_SYMBOL(gen_kill_estimator);
287 287
@@ -320,9 +320,9 @@ bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats,
320 320
321 ASSERT_RTNL(); 321 ASSERT_RTNL();
322 322
323 spin_lock(&est_tree_lock); 323 spin_lock_bh(&est_tree_lock);
324 res = gen_find_node(bstats, rate_est) != NULL; 324 res = gen_find_node(bstats, rate_est) != NULL;
325 spin_unlock(&est_tree_lock); 325 spin_unlock_bh(&est_tree_lock);
326 326
327 return res; 327 return res;
328} 328}
diff --git a/net/core/iovec.c b/net/core/iovec.c
index 1cd98df412df..e6b133b77ccb 100644
--- a/net/core/iovec.c
+++ b/net/core/iovec.c
@@ -35,9 +35,10 @@
35 * in any case. 35 * in any case.
36 */ 36 */
37 37
38int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr *address, int mode) 38long verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr *address, int mode)
39{ 39{
40 int size, err, ct; 40 int size, ct;
41 long err;
41 42
42 if (m->msg_namelen) { 43 if (m->msg_namelen) {
43 if (mode == VERIFY_READ) { 44 if (mode == VERIFY_READ) {
diff --git a/net/core/net-traces.c b/net/core/net-traces.c
index afa6380ed88a..7f1bb2aba03b 100644
--- a/net/core/net-traces.c
+++ b/net/core/net-traces.c
@@ -26,6 +26,7 @@
26 26
27#define CREATE_TRACE_POINTS 27#define CREATE_TRACE_POINTS
28#include <trace/events/skb.h> 28#include <trace/events/skb.h>
29#include <trace/events/net.h>
29#include <trace/events/napi.h> 30#include <trace/events/napi.h>
30 31
31EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb); 32EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 3a2513f0d0c3..56ba3c4e4761 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -466,6 +466,7 @@ void consume_skb(struct sk_buff *skb)
466 smp_rmb(); 466 smp_rmb();
467 else if (likely(!atomic_dec_and_test(&skb->users))) 467 else if (likely(!atomic_dec_and_test(&skb->users)))
468 return; 468 return;
469 trace_consume_skb(skb);
469 __kfree_skb(skb); 470 __kfree_skb(skb);
470} 471}
471EXPORT_SYMBOL(consume_skb); 472EXPORT_SYMBOL(consume_skb);
@@ -2573,6 +2574,10 @@ struct sk_buff *skb_segment(struct sk_buff *skb, int features)
2573 __copy_skb_header(nskb, skb); 2574 __copy_skb_header(nskb, skb);
2574 nskb->mac_len = skb->mac_len; 2575 nskb->mac_len = skb->mac_len;
2575 2576
2577 /* nskb and skb might have different headroom */
2578 if (nskb->ip_summed == CHECKSUM_PARTIAL)
2579 nskb->csum_start += skb_headroom(nskb) - headroom;
2580
2576 skb_reset_mac_header(nskb); 2581 skb_reset_mac_header(nskb);
2577 skb_set_network_header(nskb, skb->mac_len); 2582 skb_set_network_header(nskb, skb->mac_len);
2578 nskb->transport_header = (nskb->network_header + 2583 nskb->transport_header = (nskb->network_header +
@@ -2703,7 +2708,7 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2703 return -E2BIG; 2708 return -E2BIG;
2704 2709
2705 headroom = skb_headroom(p); 2710 headroom = skb_headroom(p);
2706 nskb = netdev_alloc_skb(p->dev, headroom + skb_gro_offset(p)); 2711 nskb = alloc_skb(headroom + skb_gro_offset(p), GFP_ATOMIC);
2707 if (unlikely(!nskb)) 2712 if (unlikely(!nskb))
2708 return -ENOMEM; 2713 return -ENOMEM;
2709 2714
diff --git a/net/core/sock.c b/net/core/sock.c
index b05b9b6ddb87..7d99e13148e6 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1078,8 +1078,11 @@ static void sk_prot_free(struct proto *prot, struct sock *sk)
1078#ifdef CONFIG_CGROUPS 1078#ifdef CONFIG_CGROUPS
1079void sock_update_classid(struct sock *sk) 1079void sock_update_classid(struct sock *sk)
1080{ 1080{
1081 u32 classid = task_cls_classid(current); 1081 u32 classid;
1082 1082
1083 rcu_read_lock(); /* doing current task, which cannot vanish. */
1084 classid = task_cls_classid(current);
1085 rcu_read_unlock();
1083 if (classid && classid != sk->sk_classid) 1086 if (classid && classid != sk->sk_classid)
1084 sk->sk_classid = classid; 1087 sk->sk_classid = classid;
1085} 1088}
@@ -1351,9 +1354,9 @@ int sock_i_uid(struct sock *sk)
1351{ 1354{
1352 int uid; 1355 int uid;
1353 1356
1354 read_lock(&sk->sk_callback_lock); 1357 read_lock_bh(&sk->sk_callback_lock);
1355 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0; 1358 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
1356 read_unlock(&sk->sk_callback_lock); 1359 read_unlock_bh(&sk->sk_callback_lock);
1357 return uid; 1360 return uid;
1358} 1361}
1359EXPORT_SYMBOL(sock_i_uid); 1362EXPORT_SYMBOL(sock_i_uid);
@@ -1362,9 +1365,9 @@ unsigned long sock_i_ino(struct sock *sk)
1362{ 1365{
1363 unsigned long ino; 1366 unsigned long ino;
1364 1367
1365 read_lock(&sk->sk_callback_lock); 1368 read_lock_bh(&sk->sk_callback_lock);
1366 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0; 1369 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1367 read_unlock(&sk->sk_callback_lock); 1370 read_unlock_bh(&sk->sk_callback_lock);
1368 return ino; 1371 return ino;
1369} 1372}
1370EXPORT_SYMBOL(sock_i_ino); 1373EXPORT_SYMBOL(sock_i_ino);
diff --git a/net/core/stream.c b/net/core/stream.c
index d959e0f41528..f5df85dcd20b 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -141,10 +141,10 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
141 141
142 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 142 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
143 sk->sk_write_pending++; 143 sk->sk_write_pending++;
144 sk_wait_event(sk, &current_timeo, !sk->sk_err && 144 sk_wait_event(sk, &current_timeo, sk->sk_err ||
145 !(sk->sk_shutdown & SEND_SHUTDOWN) && 145 (sk->sk_shutdown & SEND_SHUTDOWN) ||
146 sk_stream_memory_free(sk) && 146 (sk_stream_memory_free(sk) &&
147 vm_wait); 147 !vm_wait));
148 sk->sk_write_pending--; 148 sk->sk_write_pending--;
149 149
150 if (vm_wait) { 150 if (vm_wait) {
diff --git a/net/dccp/probe.c b/net/dccp/probe.c
index 078e48d442fd..33d0e6297c21 100644
--- a/net/dccp/probe.c
+++ b/net/dccp/probe.c
@@ -149,6 +149,7 @@ static const struct file_operations dccpprobe_fops = {
149 .owner = THIS_MODULE, 149 .owner = THIS_MODULE,
150 .open = dccpprobe_open, 150 .open = dccpprobe_open,
151 .read = dccpprobe_read, 151 .read = dccpprobe_read,
152 .llseek = noop_llseek,
152}; 153};
153 154
154static __init int dccpprobe_init(void) 155static __init int dccpprobe_init(void)
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 7c3a7d191249..7cd7760144f7 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -46,7 +46,7 @@ config IP_ADVANCED_ROUTER
46 rp_filter on use: 46 rp_filter on use:
47 47
48 echo 1 > /proc/sys/net/ipv4/conf/<device>/rp_filter 48 echo 1 > /proc/sys/net/ipv4/conf/<device>/rp_filter
49 and 49 or
50 echo 1 > /proc/sys/net/ipv4/conf/all/rp_filter 50 echo 1 > /proc/sys/net/ipv4/conf/all/rp_filter
51 51
52 Note that some distributions enable it in startup scripts. 52 Note that some distributions enable it in startup scripts.
@@ -217,6 +217,7 @@ config NET_IPIP
217 217
218config NET_IPGRE 218config NET_IPGRE
219 tristate "IP: GRE tunnels over IP" 219 tristate "IP: GRE tunnels over IP"
220 depends on IPV6 || IPV6=n
220 help 221 help
221 Tunneling means encapsulating data of one protocol type within 222 Tunneling means encapsulating data of one protocol type within
222 another protocol and sending it over a channel that understands the 223 another protocol and sending it over a channel that understands the
@@ -412,7 +413,7 @@ config INET_XFRM_MODE_BEET
412 If unsure, say Y. 413 If unsure, say Y.
413 414
414config INET_LRO 415config INET_LRO
415 bool "Large Receive Offload (ipv4/tcp)" 416 tristate "Large Receive Offload (ipv4/tcp)"
416 default y 417 default y
417 ---help--- 418 ---help---
418 Support for Large Receive Offload (ipv4/tcp). 419 Support for Large Receive Offload (ipv4/tcp).
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index f0550941df7b..721a8a37b45c 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -62,8 +62,11 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
62 } 62 }
63 if (!inet->inet_saddr) 63 if (!inet->inet_saddr)
64 inet->inet_saddr = rt->rt_src; /* Update source address */ 64 inet->inet_saddr = rt->rt_src; /* Update source address */
65 if (!inet->inet_rcv_saddr) 65 if (!inet->inet_rcv_saddr) {
66 inet->inet_rcv_saddr = rt->rt_src; 66 inet->inet_rcv_saddr = rt->rt_src;
67 if (sk->sk_prot->rehash)
68 sk->sk_prot->rehash(sk);
69 }
67 inet->inet_daddr = rt->rt_dst; 70 inet->inet_daddr = rt->rt_dst;
68 inet->inet_dport = usin->sin_port; 71 inet->inet_dport = usin->sin_port;
69 sk->sk_state = TCP_ESTABLISHED; 72 sk->sk_state = TCP_ESTABLISHED;
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index a43968918350..7d02a9f999fa 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -246,6 +246,7 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
246 246
247 struct fib_result res; 247 struct fib_result res;
248 int no_addr, rpf, accept_local; 248 int no_addr, rpf, accept_local;
249 bool dev_match;
249 int ret; 250 int ret;
250 struct net *net; 251 struct net *net;
251 252
@@ -273,12 +274,22 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
273 } 274 }
274 *spec_dst = FIB_RES_PREFSRC(res); 275 *spec_dst = FIB_RES_PREFSRC(res);
275 fib_combine_itag(itag, &res); 276 fib_combine_itag(itag, &res);
277 dev_match = false;
278
276#ifdef CONFIG_IP_ROUTE_MULTIPATH 279#ifdef CONFIG_IP_ROUTE_MULTIPATH
277 if (FIB_RES_DEV(res) == dev || res.fi->fib_nhs > 1) 280 for (ret = 0; ret < res.fi->fib_nhs; ret++) {
281 struct fib_nh *nh = &res.fi->fib_nh[ret];
282
283 if (nh->nh_dev == dev) {
284 dev_match = true;
285 break;
286 }
287 }
278#else 288#else
279 if (FIB_RES_DEV(res) == dev) 289 if (FIB_RES_DEV(res) == dev)
290 dev_match = true;
280#endif 291#endif
281 { 292 if (dev_match) {
282 ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; 293 ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
283 fib_res_put(&res); 294 fib_res_put(&res);
284 return ret; 295 return ret;
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 79d057a939ba..4a8e370862bc 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -186,7 +186,9 @@ static inline struct tnode *node_parent_rcu(struct node *node)
186{ 186{
187 struct tnode *ret = node_parent(node); 187 struct tnode *ret = node_parent(node);
188 188
189 return rcu_dereference(ret); 189 return rcu_dereference_check(ret,
190 rcu_read_lock_held() ||
191 lockdep_rtnl_is_held());
190} 192}
191 193
192/* Same as rcu_assign_pointer 194/* Same as rcu_assign_pointer
@@ -1753,7 +1755,9 @@ static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c)
1753 1755
1754static struct leaf *trie_firstleaf(struct trie *t) 1756static struct leaf *trie_firstleaf(struct trie *t)
1755{ 1757{
1756 struct tnode *n = (struct tnode *) rcu_dereference(t->trie); 1758 struct tnode *n = (struct tnode *) rcu_dereference_check(t->trie,
1759 rcu_read_lock_held() ||
1760 lockdep_rtnl_is_held());
1757 1761
1758 if (!n) 1762 if (!n)
1759 return NULL; 1763 return NULL;
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index a1ad0e7180d2..2a4bb76f2132 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -856,6 +856,18 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
856 igmpv3_clear_delrec(in_dev); 856 igmpv3_clear_delrec(in_dev);
857 } else if (len < 12) { 857 } else if (len < 12) {
858 return; /* ignore bogus packet; freed by caller */ 858 return; /* ignore bogus packet; freed by caller */
859 } else if (IGMP_V1_SEEN(in_dev)) {
860 /* This is a v3 query with v1 queriers present */
861 max_delay = IGMP_Query_Response_Interval;
862 group = 0;
863 } else if (IGMP_V2_SEEN(in_dev)) {
864 /* this is a v3 query with v2 queriers present;
865 * Interpretation of the max_delay code is problematic here.
866 * A real v2 host would use ih_code directly, while v3 has a
867 * different encoding. We use the v3 encoding as more likely
868 * to be intended in a v3 query.
869 */
870 max_delay = IGMPV3_MRC(ih3->code)*(HZ/IGMP_TIMER_SCALE);
859 } else { /* v3 */ 871 } else { /* v3 */
860 if (!pskb_may_pull(skb, sizeof(struct igmpv3_query))) 872 if (!pskb_may_pull(skb, sizeof(struct igmpv3_query)))
861 return; 873 return;
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 945b20a5ad50..35c93e8b6a46 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -45,7 +45,7 @@
45#include <net/netns/generic.h> 45#include <net/netns/generic.h>
46#include <net/rtnetlink.h> 46#include <net/rtnetlink.h>
47 47
48#ifdef CONFIG_IPV6 48#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
49#include <net/ipv6.h> 49#include <net/ipv6.h>
50#include <net/ip6_fib.h> 50#include <net/ip6_fib.h>
51#include <net/ip6_route.h> 51#include <net/ip6_route.h>
@@ -699,7 +699,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
699 if ((dst = rt->rt_gateway) == 0) 699 if ((dst = rt->rt_gateway) == 0)
700 goto tx_error_icmp; 700 goto tx_error_icmp;
701 } 701 }
702#ifdef CONFIG_IPV6 702#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
703 else if (skb->protocol == htons(ETH_P_IPV6)) { 703 else if (skb->protocol == htons(ETH_P_IPV6)) {
704 struct in6_addr *addr6; 704 struct in6_addr *addr6;
705 int addr_type; 705 int addr_type;
@@ -774,7 +774,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
774 goto tx_error; 774 goto tx_error;
775 } 775 }
776 } 776 }
777#ifdef CONFIG_IPV6 777#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
778 else if (skb->protocol == htons(ETH_P_IPV6)) { 778 else if (skb->protocol == htons(ETH_P_IPV6)) {
779 struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb); 779 struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
780 780
@@ -850,7 +850,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
850 if ((iph->ttl = tiph->ttl) == 0) { 850 if ((iph->ttl = tiph->ttl) == 0) {
851 if (skb->protocol == htons(ETH_P_IP)) 851 if (skb->protocol == htons(ETH_P_IP))
852 iph->ttl = old_iph->ttl; 852 iph->ttl = old_iph->ttl;
853#ifdef CONFIG_IPV6 853#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
854 else if (skb->protocol == htons(ETH_P_IPV6)) 854 else if (skb->protocol == htons(ETH_P_IPV6))
855 iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit; 855 iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit;
856#endif 856#endif
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 04b69896df5f..7649d7750075 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -488,9 +488,8 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
488 * we can switch to copy when see the first bad fragment. 488 * we can switch to copy when see the first bad fragment.
489 */ 489 */
490 if (skb_has_frags(skb)) { 490 if (skb_has_frags(skb)) {
491 struct sk_buff *frag; 491 struct sk_buff *frag, *frag2;
492 int first_len = skb_pagelen(skb); 492 int first_len = skb_pagelen(skb);
493 int truesizes = 0;
494 493
495 if (first_len - hlen > mtu || 494 if (first_len - hlen > mtu ||
496 ((first_len - hlen) & 7) || 495 ((first_len - hlen) & 7) ||
@@ -503,18 +502,18 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
503 if (frag->len > mtu || 502 if (frag->len > mtu ||
504 ((frag->len & 7) && frag->next) || 503 ((frag->len & 7) && frag->next) ||
505 skb_headroom(frag) < hlen) 504 skb_headroom(frag) < hlen)
506 goto slow_path; 505 goto slow_path_clean;
507 506
508 /* Partially cloned skb? */ 507 /* Partially cloned skb? */
509 if (skb_shared(frag)) 508 if (skb_shared(frag))
510 goto slow_path; 509 goto slow_path_clean;
511 510
512 BUG_ON(frag->sk); 511 BUG_ON(frag->sk);
513 if (skb->sk) { 512 if (skb->sk) {
514 frag->sk = skb->sk; 513 frag->sk = skb->sk;
515 frag->destructor = sock_wfree; 514 frag->destructor = sock_wfree;
516 } 515 }
517 truesizes += frag->truesize; 516 skb->truesize -= frag->truesize;
518 } 517 }
519 518
520 /* Everything is OK. Generate! */ 519 /* Everything is OK. Generate! */
@@ -524,7 +523,6 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
524 frag = skb_shinfo(skb)->frag_list; 523 frag = skb_shinfo(skb)->frag_list;
525 skb_frag_list_init(skb); 524 skb_frag_list_init(skb);
526 skb->data_len = first_len - skb_headlen(skb); 525 skb->data_len = first_len - skb_headlen(skb);
527 skb->truesize -= truesizes;
528 skb->len = first_len; 526 skb->len = first_len;
529 iph->tot_len = htons(first_len); 527 iph->tot_len = htons(first_len);
530 iph->frag_off = htons(IP_MF); 528 iph->frag_off = htons(IP_MF);
@@ -576,6 +574,15 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
576 } 574 }
577 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); 575 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
578 return err; 576 return err;
577
578slow_path_clean:
579 skb_walk_frags(skb, frag2) {
580 if (frag2 == frag)
581 break;
582 frag2->sk = NULL;
583 frag2->destructor = NULL;
584 skb->truesize += frag2->truesize;
585 }
579 } 586 }
580 587
581slow_path: 588slow_path:
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 6c40a8c46e79..64b70ad162e3 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -1129,6 +1129,9 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
1129 case IP_HDRINCL: 1129 case IP_HDRINCL:
1130 val = inet->hdrincl; 1130 val = inet->hdrincl;
1131 break; 1131 break;
1132 case IP_NODEFRAG:
1133 val = inet->nodefrag;
1134 break;
1132 case IP_MTU_DISCOVER: 1135 case IP_MTU_DISCOVER:
1133 val = inet->pmtudisc; 1136 val = inet->pmtudisc;
1134 break; 1137 break;
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index b254dafaf429..43eec80c0e7c 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -112,6 +112,7 @@ static void send_reset(struct sk_buff *oldskb, int hook)
112 /* ip_route_me_harder expects skb->dst to be set */ 112 /* ip_route_me_harder expects skb->dst to be set */
113 skb_dst_set_noref(nskb, skb_dst(oldskb)); 113 skb_dst_set_noref(nskb, skb_dst(oldskb));
114 114
115 nskb->protocol = htons(ETH_P_IP);
115 if (ip_route_me_harder(nskb, addr_type)) 116 if (ip_route_me_harder(nskb, addr_type))
116 goto free_nskb; 117 goto free_nskb;
117 118
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
index 244f7cb08d68..37f8adb68c79 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
@@ -11,6 +11,7 @@
11#include <linux/proc_fs.h> 11#include <linux/proc_fs.h>
12#include <linux/seq_file.h> 12#include <linux/seq_file.h>
13#include <linux/percpu.h> 13#include <linux/percpu.h>
14#include <linux/security.h>
14#include <net/net_namespace.h> 15#include <net/net_namespace.h>
15 16
16#include <linux/netfilter.h> 17#include <linux/netfilter.h>
@@ -87,6 +88,29 @@ static void ct_seq_stop(struct seq_file *s, void *v)
87 rcu_read_unlock(); 88 rcu_read_unlock();
88} 89}
89 90
91#ifdef CONFIG_NF_CONNTRACK_SECMARK
92static int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
93{
94 int ret;
95 u32 len;
96 char *secctx;
97
98 ret = security_secid_to_secctx(ct->secmark, &secctx, &len);
99 if (ret)
100 return ret;
101
102 ret = seq_printf(s, "secctx=%s ", secctx);
103
104 security_release_secctx(secctx, len);
105 return ret;
106}
107#else
108static inline int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
109{
110 return 0;
111}
112#endif
113
90static int ct_seq_show(struct seq_file *s, void *v) 114static int ct_seq_show(struct seq_file *s, void *v)
91{ 115{
92 struct nf_conntrack_tuple_hash *hash = v; 116 struct nf_conntrack_tuple_hash *hash = v;
@@ -148,10 +172,8 @@ static int ct_seq_show(struct seq_file *s, void *v)
148 goto release; 172 goto release;
149#endif 173#endif
150 174
151#ifdef CONFIG_NF_CONNTRACK_SECMARK 175 if (ct_show_secctx(s, ct))
152 if (seq_printf(s, "secmark=%u ", ct->secmark))
153 goto release; 176 goto release;
154#endif
155 177
156 if (seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use))) 178 if (seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use)))
157 goto release; 179 goto release;
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index eab8de32f200..f3a9b42b16c6 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -66,9 +66,11 @@ static unsigned int ipv4_conntrack_defrag(unsigned int hooknum,
66 const struct net_device *out, 66 const struct net_device *out,
67 int (*okfn)(struct sk_buff *)) 67 int (*okfn)(struct sk_buff *))
68{ 68{
69 struct sock *sk = skb->sk;
69 struct inet_sock *inet = inet_sk(skb->sk); 70 struct inet_sock *inet = inet_sk(skb->sk);
70 71
71 if (inet && inet->nodefrag) 72 if (sk && (sk->sk_family == PF_INET) &&
73 inet->nodefrag)
72 return NF_ACCEPT; 74 return NF_ACCEPT;
73 75
74#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) 76#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
index 8c8632d9b93c..957c9241fb0c 100644
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ b/net/ipv4/netfilter/nf_nat_core.c
@@ -38,7 +38,7 @@ static DEFINE_SPINLOCK(nf_nat_lock);
38static struct nf_conntrack_l3proto *l3proto __read_mostly; 38static struct nf_conntrack_l3proto *l3proto __read_mostly;
39 39
40#define MAX_IP_NAT_PROTO 256 40#define MAX_IP_NAT_PROTO 256
41static const struct nf_nat_protocol *nf_nat_protos[MAX_IP_NAT_PROTO] 41static const struct nf_nat_protocol __rcu *nf_nat_protos[MAX_IP_NAT_PROTO]
42 __read_mostly; 42 __read_mostly;
43 43
44static inline const struct nf_nat_protocol * 44static inline const struct nf_nat_protocol *
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c
index 1679e2c0963d..ee5f419d0a56 100644
--- a/net/ipv4/netfilter/nf_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c
@@ -893,13 +893,15 @@ static void fast_csum(__sum16 *csum,
893 unsigned char s[4]; 893 unsigned char s[4];
894 894
895 if (offset & 1) { 895 if (offset & 1) {
896 s[0] = s[2] = 0; 896 s[0] = ~0;
897 s[1] = ~*optr; 897 s[1] = ~*optr;
898 s[2] = 0;
898 s[3] = *nptr; 899 s[3] = *nptr;
899 } else { 900 } else {
900 s[1] = s[3] = 0;
901 s[0] = ~*optr; 901 s[0] = ~*optr;
902 s[1] = ~0;
902 s[2] = *nptr; 903 s[2] = *nptr;
904 s[3] = 0;
903 } 905 }
904 906
905 *csum = csum_fold(csum_partial(s, 4, ~csum_unfold(*csum))); 907 *csum = csum_fold(csum_partial(s, 4, ~csum_unfold(*csum)));
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 3f56b6e6c6aa..ac6559cb54f9 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1231,7 +1231,7 @@ restart:
1231 } 1231 }
1232 1232
1233 if (net_ratelimit()) 1233 if (net_ratelimit())
1234 printk(KERN_WARNING "Neighbour table overflow.\n"); 1234 printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
1235 rt_drop(rt); 1235 rt_drop(rt);
1236 return -ENOBUFS; 1236 return -ENOBUFS;
1237 } 1237 }
@@ -2738,6 +2738,11 @@ slow_output:
2738} 2738}
2739EXPORT_SYMBOL_GPL(__ip_route_output_key); 2739EXPORT_SYMBOL_GPL(__ip_route_output_key);
2740 2740
2741static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2742{
2743 return NULL;
2744}
2745
2741static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu) 2746static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2742{ 2747{
2743} 2748}
@@ -2746,7 +2751,7 @@ static struct dst_ops ipv4_dst_blackhole_ops = {
2746 .family = AF_INET, 2751 .family = AF_INET,
2747 .protocol = cpu_to_be16(ETH_P_IP), 2752 .protocol = cpu_to_be16(ETH_P_IP),
2748 .destroy = ipv4_dst_destroy, 2753 .destroy = ipv4_dst_destroy,
2749 .check = ipv4_dst_check, 2754 .check = ipv4_blackhole_dst_check,
2750 .update_pmtu = ipv4_rt_blackhole_update_pmtu, 2755 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2751 .entries = ATOMIC_INIT(0), 2756 .entries = ATOMIC_INIT(0),
2752}; 2757};
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 3fb1428e526e..f115ea68a4ef 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -386,8 +386,6 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
386 */ 386 */
387 387
388 mask = 0; 388 mask = 0;
389 if (sk->sk_err)
390 mask = POLLERR;
391 389
392 /* 390 /*
393 * POLLHUP is certainly not done right. But poll() doesn't 391 * POLLHUP is certainly not done right. But poll() doesn't
@@ -457,6 +455,11 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
457 if (tp->urg_data & TCP_URG_VALID) 455 if (tp->urg_data & TCP_URG_VALID)
458 mask |= POLLPRI; 456 mask |= POLLPRI;
459 } 457 }
458 /* This barrier is coupled with smp_wmb() in tcp_reset() */
459 smp_rmb();
460 if (sk->sk_err)
461 mask |= POLLERR;
462
460 return mask; 463 return mask;
461} 464}
462EXPORT_SYMBOL(tcp_poll); 465EXPORT_SYMBOL(tcp_poll);
@@ -940,7 +943,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
940 sg = sk->sk_route_caps & NETIF_F_SG; 943 sg = sk->sk_route_caps & NETIF_F_SG;
941 944
942 while (--iovlen >= 0) { 945 while (--iovlen >= 0) {
943 int seglen = iov->iov_len; 946 size_t seglen = iov->iov_len;
944 unsigned char __user *from = iov->iov_base; 947 unsigned char __user *from = iov->iov_base;
945 948
946 iov++; 949 iov++;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index e663b78a2ef6..b55f60f6fcbe 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2545,7 +2545,8 @@ static void tcp_mark_head_lost(struct sock *sk, int packets)
2545 cnt += tcp_skb_pcount(skb); 2545 cnt += tcp_skb_pcount(skb);
2546 2546
2547 if (cnt > packets) { 2547 if (cnt > packets) {
2548 if (tcp_is_sack(tp) || (oldcnt >= packets)) 2548 if ((tcp_is_sack(tp) && !tcp_is_fack(tp)) ||
2549 (oldcnt >= packets))
2549 break; 2550 break;
2550 2551
2551 mss = skb_shinfo(skb)->gso_size; 2552 mss = skb_shinfo(skb)->gso_size;
@@ -4048,6 +4049,8 @@ static void tcp_reset(struct sock *sk)
4048 default: 4049 default:
4049 sk->sk_err = ECONNRESET; 4050 sk->sk_err = ECONNRESET;
4050 } 4051 }
4052 /* This barrier is coupled with smp_rmb() in tcp_poll() */
4053 smp_wmb();
4051 4054
4052 if (!sock_flag(sk, SOCK_DEAD)) 4055 if (!sock_flag(sk, SOCK_DEAD))
4053 sk->sk_error_report(sk); 4056 sk->sk_error_report(sk);
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
index f8efada580e8..6211e2114173 100644
--- a/net/ipv4/tcp_probe.c
+++ b/net/ipv4/tcp_probe.c
@@ -214,6 +214,7 @@ static const struct file_operations tcpprobe_fops = {
214 .owner = THIS_MODULE, 214 .owner = THIS_MODULE,
215 .open = tcpprobe_open, 215 .open = tcpprobe_open,
216 .read = tcpprobe_read, 216 .read = tcpprobe_read,
217 .llseek = noop_llseek,
217}; 218};
218 219
219static __init int tcpprobe_init(void) 220static __init int tcpprobe_init(void)
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index c35b469e851c..74c54b30600f 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -135,13 +135,16 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
135 135
136/* This function calculates a "timeout" which is equivalent to the timeout of a 136/* This function calculates a "timeout" which is equivalent to the timeout of a
137 * TCP connection after "boundary" unsuccessful, exponentially backed-off 137 * TCP connection after "boundary" unsuccessful, exponentially backed-off
138 * retransmissions with an initial RTO of TCP_RTO_MIN. 138 * retransmissions with an initial RTO of TCP_RTO_MIN or TCP_TIMEOUT_INIT if
139 * syn_set flag is set.
139 */ 140 */
140static bool retransmits_timed_out(struct sock *sk, 141static bool retransmits_timed_out(struct sock *sk,
141 unsigned int boundary) 142 unsigned int boundary,
143 bool syn_set)
142{ 144{
143 unsigned int timeout, linear_backoff_thresh; 145 unsigned int timeout, linear_backoff_thresh;
144 unsigned int start_ts; 146 unsigned int start_ts;
147 unsigned int rto_base = syn_set ? TCP_TIMEOUT_INIT : TCP_RTO_MIN;
145 148
146 if (!inet_csk(sk)->icsk_retransmits) 149 if (!inet_csk(sk)->icsk_retransmits)
147 return false; 150 return false;
@@ -151,12 +154,12 @@ static bool retransmits_timed_out(struct sock *sk,
151 else 154 else
152 start_ts = tcp_sk(sk)->retrans_stamp; 155 start_ts = tcp_sk(sk)->retrans_stamp;
153 156
154 linear_backoff_thresh = ilog2(TCP_RTO_MAX/TCP_RTO_MIN); 157 linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base);
155 158
156 if (boundary <= linear_backoff_thresh) 159 if (boundary <= linear_backoff_thresh)
157 timeout = ((2 << boundary) - 1) * TCP_RTO_MIN; 160 timeout = ((2 << boundary) - 1) * rto_base;
158 else 161 else
159 timeout = ((2 << linear_backoff_thresh) - 1) * TCP_RTO_MIN + 162 timeout = ((2 << linear_backoff_thresh) - 1) * rto_base +
160 (boundary - linear_backoff_thresh) * TCP_RTO_MAX; 163 (boundary - linear_backoff_thresh) * TCP_RTO_MAX;
161 164
162 return (tcp_time_stamp - start_ts) >= timeout; 165 return (tcp_time_stamp - start_ts) >= timeout;
@@ -167,14 +170,15 @@ static int tcp_write_timeout(struct sock *sk)
167{ 170{
168 struct inet_connection_sock *icsk = inet_csk(sk); 171 struct inet_connection_sock *icsk = inet_csk(sk);
169 int retry_until; 172 int retry_until;
170 bool do_reset; 173 bool do_reset, syn_set = 0;
171 174
172 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { 175 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
173 if (icsk->icsk_retransmits) 176 if (icsk->icsk_retransmits)
174 dst_negative_advice(sk); 177 dst_negative_advice(sk);
175 retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; 178 retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
179 syn_set = 1;
176 } else { 180 } else {
177 if (retransmits_timed_out(sk, sysctl_tcp_retries1)) { 181 if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0)) {
178 /* Black hole detection */ 182 /* Black hole detection */
179 tcp_mtu_probing(icsk, sk); 183 tcp_mtu_probing(icsk, sk);
180 184
@@ -187,14 +191,14 @@ static int tcp_write_timeout(struct sock *sk)
187 191
188 retry_until = tcp_orphan_retries(sk, alive); 192 retry_until = tcp_orphan_retries(sk, alive);
189 do_reset = alive || 193 do_reset = alive ||
190 !retransmits_timed_out(sk, retry_until); 194 !retransmits_timed_out(sk, retry_until, 0);
191 195
192 if (tcp_out_of_resources(sk, do_reset)) 196 if (tcp_out_of_resources(sk, do_reset))
193 return 1; 197 return 1;
194 } 198 }
195 } 199 }
196 200
197 if (retransmits_timed_out(sk, retry_until)) { 201 if (retransmits_timed_out(sk, retry_until, syn_set)) {
198 /* Has it gone just too far? */ 202 /* Has it gone just too far? */
199 tcp_write_err(sk); 203 tcp_write_err(sk);
200 return 1; 204 return 1;
@@ -436,7 +440,7 @@ out_reset_timer:
436 icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); 440 icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
437 } 441 }
438 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); 442 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX);
439 if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1)) 443 if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0))
440 __sk_dst_reset(sk); 444 __sk_dst_reset(sk);
441 445
442out:; 446out:;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 32e0bef60d0a..fb23c2e63b52 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1260,6 +1260,49 @@ void udp_lib_unhash(struct sock *sk)
1260} 1260}
1261EXPORT_SYMBOL(udp_lib_unhash); 1261EXPORT_SYMBOL(udp_lib_unhash);
1262 1262
1263/*
1264 * inet_rcv_saddr was changed, we must rehash secondary hash
1265 */
1266void udp_lib_rehash(struct sock *sk, u16 newhash)
1267{
1268 if (sk_hashed(sk)) {
1269 struct udp_table *udptable = sk->sk_prot->h.udp_table;
1270 struct udp_hslot *hslot, *hslot2, *nhslot2;
1271
1272 hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
1273 nhslot2 = udp_hashslot2(udptable, newhash);
1274 udp_sk(sk)->udp_portaddr_hash = newhash;
1275 if (hslot2 != nhslot2) {
1276 hslot = udp_hashslot(udptable, sock_net(sk),
1277 udp_sk(sk)->udp_port_hash);
1278 /* we must lock primary chain too */
1279 spin_lock_bh(&hslot->lock);
1280
1281 spin_lock(&hslot2->lock);
1282 hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_portaddr_node);
1283 hslot2->count--;
1284 spin_unlock(&hslot2->lock);
1285
1286 spin_lock(&nhslot2->lock);
1287 hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
1288 &nhslot2->head);
1289 nhslot2->count++;
1290 spin_unlock(&nhslot2->lock);
1291
1292 spin_unlock_bh(&hslot->lock);
1293 }
1294 }
1295}
1296EXPORT_SYMBOL(udp_lib_rehash);
1297
1298static void udp_v4_rehash(struct sock *sk)
1299{
1300 u16 new_hash = udp4_portaddr_hash(sock_net(sk),
1301 inet_sk(sk)->inet_rcv_saddr,
1302 inet_sk(sk)->inet_num);
1303 udp_lib_rehash(sk, new_hash);
1304}
1305
1263static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) 1306static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
1264{ 1307{
1265 int rc; 1308 int rc;
@@ -1843,6 +1886,7 @@ struct proto udp_prot = {
1843 .backlog_rcv = __udp_queue_rcv_skb, 1886 .backlog_rcv = __udp_queue_rcv_skb,
1844 .hash = udp_lib_hash, 1887 .hash = udp_lib_hash,
1845 .unhash = udp_lib_unhash, 1888 .unhash = udp_lib_unhash,
1889 .rehash = udp_v4_rehash,
1846 .get_port = udp_v4_get_port, 1890 .get_port = udp_v4_get_port,
1847 .memory_allocated = &udp_memory_allocated, 1891 .memory_allocated = &udp_memory_allocated,
1848 .sysctl_mem = sysctl_udp_mem, 1892 .sysctl_mem = sysctl_udp_mem,
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 869078d4eeb9..a580349f0b8a 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -61,7 +61,7 @@ static int xfrm4_get_saddr(struct net *net,
61 61
62static int xfrm4_get_tos(struct flowi *fl) 62static int xfrm4_get_tos(struct flowi *fl)
63{ 63{
64 return fl->fl4_tos; 64 return IPTOS_RT_MASK & fl->fl4_tos; /* Strip ECN bits */
65} 65}
66 66
67static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst, 67static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst,
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index 1ef1366a0a03..47947624eccc 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -21,21 +21,25 @@ static int xfrm4_init_flags(struct xfrm_state *x)
21} 21}
22 22
23static void 23static void
24__xfrm4_init_tempsel(struct xfrm_state *x, struct flowi *fl, 24__xfrm4_init_tempsel(struct xfrm_selector *sel, struct flowi *fl)
25 struct xfrm_tmpl *tmpl, 25{
26 xfrm_address_t *daddr, xfrm_address_t *saddr) 26 sel->daddr.a4 = fl->fl4_dst;
27 sel->saddr.a4 = fl->fl4_src;
28 sel->dport = xfrm_flowi_dport(fl);
29 sel->dport_mask = htons(0xffff);
30 sel->sport = xfrm_flowi_sport(fl);
31 sel->sport_mask = htons(0xffff);
32 sel->family = AF_INET;
33 sel->prefixlen_d = 32;
34 sel->prefixlen_s = 32;
35 sel->proto = fl->proto;
36 sel->ifindex = fl->oif;
37}
38
39static void
40xfrm4_init_temprop(struct xfrm_state *x, struct xfrm_tmpl *tmpl,
41 xfrm_address_t *daddr, xfrm_address_t *saddr)
27{ 42{
28 x->sel.daddr.a4 = fl->fl4_dst;
29 x->sel.saddr.a4 = fl->fl4_src;
30 x->sel.dport = xfrm_flowi_dport(fl);
31 x->sel.dport_mask = htons(0xffff);
32 x->sel.sport = xfrm_flowi_sport(fl);
33 x->sel.sport_mask = htons(0xffff);
34 x->sel.family = AF_INET;
35 x->sel.prefixlen_d = 32;
36 x->sel.prefixlen_s = 32;
37 x->sel.proto = fl->proto;
38 x->sel.ifindex = fl->oif;
39 x->id = tmpl->id; 43 x->id = tmpl->id;
40 if (x->id.daddr.a4 == 0) 44 if (x->id.daddr.a4 == 0)
41 x->id.daddr.a4 = daddr->a4; 45 x->id.daddr.a4 = daddr->a4;
@@ -70,6 +74,7 @@ static struct xfrm_state_afinfo xfrm4_state_afinfo = {
70 .owner = THIS_MODULE, 74 .owner = THIS_MODULE,
71 .init_flags = xfrm4_init_flags, 75 .init_flags = xfrm4_init_flags,
72 .init_tempsel = __xfrm4_init_tempsel, 76 .init_tempsel = __xfrm4_init_tempsel,
77 .init_temprop = xfrm4_init_temprop,
73 .output = xfrm4_output, 78 .output = xfrm4_output,
74 .extract_input = xfrm4_extract_input, 79 .extract_input = xfrm4_extract_input,
75 .extract_output = xfrm4_extract_output, 80 .extract_output = xfrm4_extract_output,
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index ab70a3fbcafa..324fac3b6c16 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -4637,10 +4637,12 @@ int __init addrconf_init(void)
4637 if (err < 0) { 4637 if (err < 0) {
4638 printk(KERN_CRIT "IPv6 Addrconf:" 4638 printk(KERN_CRIT "IPv6 Addrconf:"
4639 " cannot initialize default policy table: %d.\n", err); 4639 " cannot initialize default policy table: %d.\n", err);
4640 return err; 4640 goto out;
4641 } 4641 }
4642 4642
4643 register_pernet_subsys(&addrconf_ops); 4643 err = register_pernet_subsys(&addrconf_ops);
4644 if (err < 0)
4645 goto out_addrlabel;
4644 4646
4645 /* The addrconf netdev notifier requires that loopback_dev 4647 /* The addrconf netdev notifier requires that loopback_dev
4646 * has it's ipv6 private information allocated and setup 4648 * has it's ipv6 private information allocated and setup
@@ -4692,7 +4694,9 @@ errout:
4692 unregister_netdevice_notifier(&ipv6_dev_notf); 4694 unregister_netdevice_notifier(&ipv6_dev_notf);
4693errlo: 4695errlo:
4694 unregister_pernet_subsys(&addrconf_ops); 4696 unregister_pernet_subsys(&addrconf_ops);
4695 4697out_addrlabel:
4698 ipv6_addr_label_cleanup();
4699out:
4696 return err; 4700 return err;
4697} 4701}
4698 4702
@@ -4703,6 +4707,7 @@ void addrconf_cleanup(void)
4703 4707
4704 unregister_netdevice_notifier(&ipv6_dev_notf); 4708 unregister_netdevice_notifier(&ipv6_dev_notf);
4705 unregister_pernet_subsys(&addrconf_ops); 4709 unregister_pernet_subsys(&addrconf_ops);
4710 ipv6_addr_label_cleanup();
4706 4711
4707 rtnl_lock(); 4712 rtnl_lock();
4708 4713
diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c
index f0e774cea386..8175f802651b 100644
--- a/net/ipv6/addrlabel.c
+++ b/net/ipv6/addrlabel.c
@@ -393,6 +393,11 @@ int __init ipv6_addr_label_init(void)
393 return register_pernet_subsys(&ipv6_addr_label_ops); 393 return register_pernet_subsys(&ipv6_addr_label_ops);
394} 394}
395 395
396void ipv6_addr_label_cleanup(void)
397{
398 unregister_pernet_subsys(&ipv6_addr_label_ops);
399}
400
396static const struct nla_policy ifal_policy[IFAL_MAX+1] = { 401static const struct nla_policy ifal_policy[IFAL_MAX+1] = {
397 [IFAL_ADDRESS] = { .len = sizeof(struct in6_addr), }, 402 [IFAL_ADDRESS] = { .len = sizeof(struct in6_addr), },
398 [IFAL_LABEL] = { .len = sizeof(u32), }, 403 [IFAL_LABEL] = { .len = sizeof(u32), },
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 7d929a22cbc2..ef371aa01ac5 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -105,9 +105,12 @@ ipv4_connected:
105 if (ipv6_addr_any(&np->saddr)) 105 if (ipv6_addr_any(&np->saddr))
106 ipv6_addr_set_v4mapped(inet->inet_saddr, &np->saddr); 106 ipv6_addr_set_v4mapped(inet->inet_saddr, &np->saddr);
107 107
108 if (ipv6_addr_any(&np->rcv_saddr)) 108 if (ipv6_addr_any(&np->rcv_saddr)) {
109 ipv6_addr_set_v4mapped(inet->inet_rcv_saddr, 109 ipv6_addr_set_v4mapped(inet->inet_rcv_saddr,
110 &np->rcv_saddr); 110 &np->rcv_saddr);
111 if (sk->sk_prot->rehash)
112 sk->sk_prot->rehash(sk);
113 }
111 114
112 goto out; 115 goto out;
113 } 116 }
@@ -181,6 +184,8 @@ ipv4_connected:
181 if (ipv6_addr_any(&np->rcv_saddr)) { 184 if (ipv6_addr_any(&np->rcv_saddr)) {
182 ipv6_addr_copy(&np->rcv_saddr, &fl.fl6_src); 185 ipv6_addr_copy(&np->rcv_saddr, &fl.fl6_src);
183 inet->inet_rcv_saddr = LOOPBACK4_IPV6; 186 inet->inet_rcv_saddr = LOOPBACK4_IPV6;
187 if (sk->sk_prot->rehash)
188 sk->sk_prot->rehash(sk);
184 } 189 }
185 190
186 ip6_dst_store(sk, dst, 191 ip6_dst_store(sk, dst,
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index d40b330c0ee6..980912ed7a38 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -639,7 +639,7 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
639 639
640 if (skb_has_frags(skb)) { 640 if (skb_has_frags(skb)) {
641 int first_len = skb_pagelen(skb); 641 int first_len = skb_pagelen(skb);
642 int truesizes = 0; 642 struct sk_buff *frag2;
643 643
644 if (first_len - hlen > mtu || 644 if (first_len - hlen > mtu ||
645 ((first_len - hlen) & 7) || 645 ((first_len - hlen) & 7) ||
@@ -651,18 +651,18 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
651 if (frag->len > mtu || 651 if (frag->len > mtu ||
652 ((frag->len & 7) && frag->next) || 652 ((frag->len & 7) && frag->next) ||
653 skb_headroom(frag) < hlen) 653 skb_headroom(frag) < hlen)
654 goto slow_path; 654 goto slow_path_clean;
655 655
656 /* Partially cloned skb? */ 656 /* Partially cloned skb? */
657 if (skb_shared(frag)) 657 if (skb_shared(frag))
658 goto slow_path; 658 goto slow_path_clean;
659 659
660 BUG_ON(frag->sk); 660 BUG_ON(frag->sk);
661 if (skb->sk) { 661 if (skb->sk) {
662 frag->sk = skb->sk; 662 frag->sk = skb->sk;
663 frag->destructor = sock_wfree; 663 frag->destructor = sock_wfree;
664 truesizes += frag->truesize;
665 } 664 }
665 skb->truesize -= frag->truesize;
666 } 666 }
667 667
668 err = 0; 668 err = 0;
@@ -693,7 +693,6 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
693 693
694 first_len = skb_pagelen(skb); 694 first_len = skb_pagelen(skb);
695 skb->data_len = first_len - skb_headlen(skb); 695 skb->data_len = first_len - skb_headlen(skb);
696 skb->truesize -= truesizes;
697 skb->len = first_len; 696 skb->len = first_len;
698 ipv6_hdr(skb)->payload_len = htons(first_len - 697 ipv6_hdr(skb)->payload_len = htons(first_len -
699 sizeof(struct ipv6hdr)); 698 sizeof(struct ipv6hdr));
@@ -756,6 +755,15 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
756 IPSTATS_MIB_FRAGFAILS); 755 IPSTATS_MIB_FRAGFAILS);
757 dst_release(&rt->dst); 756 dst_release(&rt->dst);
758 return err; 757 return err;
758
759slow_path_clean:
760 skb_walk_frags(skb, frag2) {
761 if (frag2 == frag)
762 break;
763 frag2->sk = NULL;
764 frag2->destructor = NULL;
765 skb->truesize += frag2->truesize;
766 }
759 } 767 }
760 768
761slow_path: 769slow_path:
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 13ef5bc05cf5..578f3c1a16db 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -113,14 +113,6 @@ static void nf_skb_free(struct sk_buff *skb)
113 kfree_skb(NFCT_FRAG6_CB(skb)->orig); 113 kfree_skb(NFCT_FRAG6_CB(skb)->orig);
114} 114}
115 115
116/* Memory Tracking Functions. */
117static void frag_kfree_skb(struct sk_buff *skb)
118{
119 atomic_sub(skb->truesize, &nf_init_frags.mem);
120 nf_skb_free(skb);
121 kfree_skb(skb);
122}
123
124/* Destruction primitives. */ 116/* Destruction primitives. */
125 117
126static __inline__ void fq_put(struct nf_ct_frag6_queue *fq) 118static __inline__ void fq_put(struct nf_ct_frag6_queue *fq)
@@ -282,66 +274,22 @@ static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb,
282 } 274 }
283 275
284found: 276found:
285 /* We found where to put this one. Check for overlap with 277 /* RFC5722, Section 4:
286 * preceding fragment, and, if needed, align things so that 278 * When reassembling an IPv6 datagram, if
287 * any overlaps are eliminated. 279 * one or more its constituent fragments is determined to be an
288 */ 280 * overlapping fragment, the entire datagram (and any constituent
289 if (prev) { 281 * fragments, including those not yet received) MUST be silently
290 int i = (NFCT_FRAG6_CB(prev)->offset + prev->len) - offset; 282 * discarded.
291
292 if (i > 0) {
293 offset += i;
294 if (end <= offset) {
295 pr_debug("overlap\n");
296 goto err;
297 }
298 if (!pskb_pull(skb, i)) {
299 pr_debug("Can't pull\n");
300 goto err;
301 }
302 if (skb->ip_summed != CHECKSUM_UNNECESSARY)
303 skb->ip_summed = CHECKSUM_NONE;
304 }
305 }
306
307 /* Look for overlap with succeeding segments.
308 * If we can merge fragments, do it.
309 */ 283 */
310 while (next && NFCT_FRAG6_CB(next)->offset < end) {
311 /* overlap is 'i' bytes */
312 int i = end - NFCT_FRAG6_CB(next)->offset;
313
314 if (i < next->len) {
315 /* Eat head of the next overlapped fragment
316 * and leave the loop. The next ones cannot overlap.
317 */
318 pr_debug("Eat head of the overlapped parts.: %d", i);
319 if (!pskb_pull(next, i))
320 goto err;
321 284
322 /* next fragment */ 285 /* Check for overlap with preceding fragment. */
323 NFCT_FRAG6_CB(next)->offset += i; 286 if (prev &&
324 fq->q.meat -= i; 287 (NFCT_FRAG6_CB(prev)->offset + prev->len) - offset > 0)
325 if (next->ip_summed != CHECKSUM_UNNECESSARY) 288 goto discard_fq;
326 next->ip_summed = CHECKSUM_NONE;
327 break;
328 } else {
329 struct sk_buff *free_it = next;
330
331 /* Old fragmnet is completely overridden with
332 * new one drop it.
333 */
334 next = next->next;
335 289
336 if (prev) 290 /* Look for overlap with succeeding segment. */
337 prev->next = next; 291 if (next && NFCT_FRAG6_CB(next)->offset < end)
338 else 292 goto discard_fq;
339 fq->q.fragments = next;
340
341 fq->q.meat -= free_it->len;
342 frag_kfree_skb(free_it);
343 }
344 }
345 293
346 NFCT_FRAG6_CB(skb)->offset = offset; 294 NFCT_FRAG6_CB(skb)->offset = offset;
347 295
@@ -371,6 +319,8 @@ found:
371 write_unlock(&nf_frags.lock); 319 write_unlock(&nf_frags.lock);
372 return 0; 320 return 0;
373 321
322discard_fq:
323 fq_kill(fq);
374err: 324err:
375 return -1; 325 return -1;
376} 326}
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 545c4141b755..64cfef1b0a4c 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -149,13 +149,6 @@ int ip6_frag_match(struct inet_frag_queue *q, void *a)
149} 149}
150EXPORT_SYMBOL(ip6_frag_match); 150EXPORT_SYMBOL(ip6_frag_match);
151 151
152/* Memory Tracking Functions. */
153static void frag_kfree_skb(struct netns_frags *nf, struct sk_buff *skb)
154{
155 atomic_sub(skb->truesize, &nf->mem);
156 kfree_skb(skb);
157}
158
159void ip6_frag_init(struct inet_frag_queue *q, void *a) 152void ip6_frag_init(struct inet_frag_queue *q, void *a)
160{ 153{
161 struct frag_queue *fq = container_of(q, struct frag_queue, q); 154 struct frag_queue *fq = container_of(q, struct frag_queue, q);
@@ -346,58 +339,22 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
346 } 339 }
347 340
348found: 341found:
349 /* We found where to put this one. Check for overlap with 342 /* RFC5722, Section 4:
350 * preceding fragment, and, if needed, align things so that 343 * When reassembling an IPv6 datagram, if
351 * any overlaps are eliminated. 344 * one or more its constituent fragments is determined to be an
345 * overlapping fragment, the entire datagram (and any constituent
346 * fragments, including those not yet received) MUST be silently
347 * discarded.
352 */ 348 */
353 if (prev) {
354 int i = (FRAG6_CB(prev)->offset + prev->len) - offset;
355 349
356 if (i > 0) { 350 /* Check for overlap with preceding fragment. */
357 offset += i; 351 if (prev &&
358 if (end <= offset) 352 (FRAG6_CB(prev)->offset + prev->len) - offset > 0)
359 goto err; 353 goto discard_fq;
360 if (!pskb_pull(skb, i))
361 goto err;
362 if (skb->ip_summed != CHECKSUM_UNNECESSARY)
363 skb->ip_summed = CHECKSUM_NONE;
364 }
365 }
366 354
367 /* Look for overlap with succeeding segments. 355 /* Look for overlap with succeeding segment. */
368 * If we can merge fragments, do it. 356 if (next && FRAG6_CB(next)->offset < end)
369 */ 357 goto discard_fq;
370 while (next && FRAG6_CB(next)->offset < end) {
371 int i = end - FRAG6_CB(next)->offset; /* overlap is 'i' bytes */
372
373 if (i < next->len) {
374 /* Eat head of the next overlapped fragment
375 * and leave the loop. The next ones cannot overlap.
376 */
377 if (!pskb_pull(next, i))
378 goto err;
379 FRAG6_CB(next)->offset += i; /* next fragment */
380 fq->q.meat -= i;
381 if (next->ip_summed != CHECKSUM_UNNECESSARY)
382 next->ip_summed = CHECKSUM_NONE;
383 break;
384 } else {
385 struct sk_buff *free_it = next;
386
387 /* Old fragment is completely overridden with
388 * new one drop it.
389 */
390 next = next->next;
391
392 if (prev)
393 prev->next = next;
394 else
395 fq->q.fragments = next;
396
397 fq->q.meat -= free_it->len;
398 frag_kfree_skb(fq->q.net, free_it);
399 }
400 }
401 358
402 FRAG6_CB(skb)->offset = offset; 359 FRAG6_CB(skb)->offset = offset;
403 360
@@ -436,6 +393,8 @@ found:
436 write_unlock(&ip6_frags.lock); 393 write_unlock(&ip6_frags.lock);
437 return -1; 394 return -1;
438 395
396discard_fq:
397 fq_kill(fq);
439err: 398err:
440 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), 399 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
441 IPSTATS_MIB_REASMFAILS); 400 IPSTATS_MIB_REASMFAILS);
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index d126365ac046..a275c6e1e25c 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -670,7 +670,7 @@ static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *dad
670 670
671 if (net_ratelimit()) 671 if (net_ratelimit())
672 printk(KERN_WARNING 672 printk(KERN_WARNING
673 "Neighbour table overflow.\n"); 673 "ipv6: Neighbour table overflow.\n");
674 dst_free(&rt->dst); 674 dst_free(&rt->dst);
675 return NULL; 675 return NULL;
676 } 676 }
@@ -1556,14 +1556,13 @@ out:
1556 * i.e. Path MTU discovery 1556 * i.e. Path MTU discovery
1557 */ 1557 */
1558 1558
1559void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr, 1559static void rt6_do_pmtu_disc(struct in6_addr *daddr, struct in6_addr *saddr,
1560 struct net_device *dev, u32 pmtu) 1560 struct net *net, u32 pmtu, int ifindex)
1561{ 1561{
1562 struct rt6_info *rt, *nrt; 1562 struct rt6_info *rt, *nrt;
1563 struct net *net = dev_net(dev);
1564 int allfrag = 0; 1563 int allfrag = 0;
1565 1564
1566 rt = rt6_lookup(net, daddr, saddr, dev->ifindex, 0); 1565 rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1567 if (rt == NULL) 1566 if (rt == NULL)
1568 return; 1567 return;
1569 1568
@@ -1631,6 +1630,27 @@ out:
1631 dst_release(&rt->dst); 1630 dst_release(&rt->dst);
1632} 1631}
1633 1632
1633void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1634 struct net_device *dev, u32 pmtu)
1635{
1636 struct net *net = dev_net(dev);
1637
1638 /*
1639 * RFC 1981 states that a node "MUST reduce the size of the packets it
1640 * is sending along the path" that caused the Packet Too Big message.
1641 * Since it's not possible in the general case to determine which
1642 * interface was used to send the original packet, we update the MTU
1643 * on the interface that will be used to send future packets. We also
1644 * update the MTU on the interface that received the Packet Too Big in
1645 * case the original packet was forced out that interface with
1646 * SO_BINDTODEVICE or similar. This is the next best thing to the
1647 * correct behaviour, which would be to update the MTU on all
1648 * interfaces.
1649 */
1650 rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1651 rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1652}
1653
1634/* 1654/*
1635 * Misc support functions 1655 * Misc support functions
1636 */ 1656 */
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 1dd1affdead2..5acb3560ff15 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -111,6 +111,15 @@ int udp_v6_get_port(struct sock *sk, unsigned short snum)
111 return udp_lib_get_port(sk, snum, ipv6_rcv_saddr_equal, hash2_nulladdr); 111 return udp_lib_get_port(sk, snum, ipv6_rcv_saddr_equal, hash2_nulladdr);
112} 112}
113 113
114static void udp_v6_rehash(struct sock *sk)
115{
116 u16 new_hash = udp6_portaddr_hash(sock_net(sk),
117 &inet6_sk(sk)->rcv_saddr,
118 inet_sk(sk)->inet_num);
119
120 udp_lib_rehash(sk, new_hash);
121}
122
114static inline int compute_score(struct sock *sk, struct net *net, 123static inline int compute_score(struct sock *sk, struct net *net,
115 unsigned short hnum, 124 unsigned short hnum,
116 struct in6_addr *saddr, __be16 sport, 125 struct in6_addr *saddr, __be16 sport,
@@ -1447,6 +1456,7 @@ struct proto udpv6_prot = {
1447 .backlog_rcv = udpv6_queue_rcv_skb, 1456 .backlog_rcv = udpv6_queue_rcv_skb,
1448 .hash = udp_lib_hash, 1457 .hash = udp_lib_hash,
1449 .unhash = udp_lib_unhash, 1458 .unhash = udp_lib_unhash,
1459 .rehash = udp_v6_rehash,
1450 .get_port = udp_v6_get_port, 1460 .get_port = udp_v6_get_port,
1451 .memory_allocated = &udp_memory_allocated, 1461 .memory_allocated = &udp_memory_allocated,
1452 .sysctl_mem = sysctl_udp_mem, 1462 .sysctl_mem = sysctl_udp_mem,
diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c
index f417b77fa0e1..a67575d472a3 100644
--- a/net/ipv6/xfrm6_state.c
+++ b/net/ipv6/xfrm6_state.c
@@ -20,23 +20,27 @@
20#include <net/addrconf.h> 20#include <net/addrconf.h>
21 21
22static void 22static void
23__xfrm6_init_tempsel(struct xfrm_state *x, struct flowi *fl, 23__xfrm6_init_tempsel(struct xfrm_selector *sel, struct flowi *fl)
24 struct xfrm_tmpl *tmpl,
25 xfrm_address_t *daddr, xfrm_address_t *saddr)
26{ 24{
27 /* Initialize temporary selector matching only 25 /* Initialize temporary selector matching only
28 * to current session. */ 26 * to current session. */
29 ipv6_addr_copy((struct in6_addr *)&x->sel.daddr, &fl->fl6_dst); 27 ipv6_addr_copy((struct in6_addr *)&sel->daddr, &fl->fl6_dst);
30 ipv6_addr_copy((struct in6_addr *)&x->sel.saddr, &fl->fl6_src); 28 ipv6_addr_copy((struct in6_addr *)&sel->saddr, &fl->fl6_src);
31 x->sel.dport = xfrm_flowi_dport(fl); 29 sel->dport = xfrm_flowi_dport(fl);
32 x->sel.dport_mask = htons(0xffff); 30 sel->dport_mask = htons(0xffff);
33 x->sel.sport = xfrm_flowi_sport(fl); 31 sel->sport = xfrm_flowi_sport(fl);
34 x->sel.sport_mask = htons(0xffff); 32 sel->sport_mask = htons(0xffff);
35 x->sel.family = AF_INET6; 33 sel->family = AF_INET6;
36 x->sel.prefixlen_d = 128; 34 sel->prefixlen_d = 128;
37 x->sel.prefixlen_s = 128; 35 sel->prefixlen_s = 128;
38 x->sel.proto = fl->proto; 36 sel->proto = fl->proto;
39 x->sel.ifindex = fl->oif; 37 sel->ifindex = fl->oif;
38}
39
40static void
41xfrm6_init_temprop(struct xfrm_state *x, struct xfrm_tmpl *tmpl,
42 xfrm_address_t *daddr, xfrm_address_t *saddr)
43{
40 x->id = tmpl->id; 44 x->id = tmpl->id;
41 if (ipv6_addr_any((struct in6_addr*)&x->id.daddr)) 45 if (ipv6_addr_any((struct in6_addr*)&x->id.daddr))
42 memcpy(&x->id.daddr, daddr, sizeof(x->sel.daddr)); 46 memcpy(&x->id.daddr, daddr, sizeof(x->sel.daddr));
@@ -168,6 +172,7 @@ static struct xfrm_state_afinfo xfrm6_state_afinfo = {
168 .eth_proto = htons(ETH_P_IPV6), 172 .eth_proto = htons(ETH_P_IPV6),
169 .owner = THIS_MODULE, 173 .owner = THIS_MODULE,
170 .init_tempsel = __xfrm6_init_tempsel, 174 .init_tempsel = __xfrm6_init_tempsel,
175 .init_temprop = xfrm6_init_temprop,
171 .tmpl_sort = __xfrm6_tmpl_sort, 176 .tmpl_sort = __xfrm6_tmpl_sort,
172 .state_sort = __xfrm6_state_sort, 177 .state_sort = __xfrm6_state_sort,
173 .output = xfrm6_output, 178 .output = xfrm6_output,
diff --git a/net/ipx/Kconfig b/net/ipx/Kconfig
index e9ad0062fbb6..02549cb2c328 100644
--- a/net/ipx/Kconfig
+++ b/net/ipx/Kconfig
@@ -3,6 +3,7 @@
3# 3#
4config IPX 4config IPX
5 tristate "The IPX protocol" 5 tristate "The IPX protocol"
6 depends on BKL # should be fixable
6 select LLC 7 select LLC
7 ---help--- 8 ---help---
8 This is support for the Novell networking protocol, IPX, commonly 9 This is support for the Novell networking protocol, IPX, commonly
diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c
index 79986a674f6e..fd55b5135de5 100644
--- a/net/irda/af_irda.c
+++ b/net/irda/af_irda.c
@@ -824,8 +824,8 @@ static int irda_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
824 824
825 err = irda_open_tsap(self, addr->sir_lsap_sel, addr->sir_name); 825 err = irda_open_tsap(self, addr->sir_lsap_sel, addr->sir_name);
826 if (err < 0) { 826 if (err < 0) {
827 kfree(self->ias_obj->name); 827 irias_delete_object(self->ias_obj);
828 kfree(self->ias_obj); 828 self->ias_obj = NULL;
829 goto out; 829 goto out;
830 } 830 }
831 831
diff --git a/net/irda/irlan/irlan_common.c b/net/irda/irlan/irlan_common.c
index a788f9e9427d..6130f9d9dbe1 100644
--- a/net/irda/irlan/irlan_common.c
+++ b/net/irda/irlan/irlan_common.c
@@ -1102,7 +1102,7 @@ int irlan_extract_param(__u8 *buf, char *name, char *value, __u16 *len)
1102 memcpy(&val_len, buf+n, 2); /* To avoid alignment problems */ 1102 memcpy(&val_len, buf+n, 2); /* To avoid alignment problems */
1103 le16_to_cpus(&val_len); n+=2; 1103 le16_to_cpus(&val_len); n+=2;
1104 1104
1105 if (val_len > 1016) { 1105 if (val_len >= 1016) {
1106 IRDA_DEBUG(2, "%s(), parameter length to long\n", __func__ ); 1106 IRDA_DEBUG(2, "%s(), parameter length to long\n", __func__ );
1107 return -RSP_INVALID_COMMAND_FORMAT; 1107 return -RSP_INVALID_COMMAND_FORMAT;
1108 } 1108 }
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index 023ba820236f..582612998211 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -1024,7 +1024,8 @@ static int llc_ui_setsockopt(struct socket *sock, int level, int optname,
1024{ 1024{
1025 struct sock *sk = sock->sk; 1025 struct sock *sk = sock->sk;
1026 struct llc_sock *llc = llc_sk(sk); 1026 struct llc_sock *llc = llc_sk(sk);
1027 int rc = -EINVAL, opt; 1027 unsigned int opt;
1028 int rc = -EINVAL;
1028 1029
1029 lock_sock(sk); 1030 lock_sock(sk);
1030 if (unlikely(level != SOL_LLC || optlen != sizeof(int))) 1031 if (unlikely(level != SOL_LLC || optlen != sizeof(int)))
diff --git a/net/llc/llc_station.c b/net/llc/llc_station.c
index e4dae0244d76..cf4aea3ba30f 100644
--- a/net/llc/llc_station.c
+++ b/net/llc/llc_station.c
@@ -689,7 +689,7 @@ static void llc_station_rcv(struct sk_buff *skb)
689 689
690int __init llc_station_init(void) 690int __init llc_station_init(void)
691{ 691{
692 u16 rc = -ENOBUFS; 692 int rc = -ENOBUFS;
693 struct sk_buff *skb; 693 struct sk_buff *skb;
694 struct llc_station_state_ev *ev; 694 struct llc_station_state_ev *ev;
695 695
diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c
index c893f236acea..8f23401832b7 100644
--- a/net/mac80211/agg-tx.c
+++ b/net/mac80211/agg-tx.c
@@ -175,6 +175,8 @@ int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid,
175 175
176 set_bit(HT_AGG_STATE_STOPPING, &tid_tx->state); 176 set_bit(HT_AGG_STATE_STOPPING, &tid_tx->state);
177 177
178 del_timer_sync(&tid_tx->addba_resp_timer);
179
178 /* 180 /*
179 * After this packets are no longer handed right through 181 * After this packets are no longer handed right through
180 * to the driver but are put onto tid_tx->pending instead, 182 * to the driver but are put onto tid_tx->pending instead,
diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c
index a694c593ff6a..b8b0ae79a743 100644
--- a/net/mac80211/debugfs.c
+++ b/net/mac80211/debugfs.c
@@ -36,6 +36,7 @@ static ssize_t name## _read(struct file *file, char __user *userbuf, \
36static const struct file_operations name## _ops = { \ 36static const struct file_operations name## _ops = { \
37 .read = name## _read, \ 37 .read = name## _read, \
38 .open = mac80211_open_file_generic, \ 38 .open = mac80211_open_file_generic, \
39 .llseek = generic_file_llseek, \
39}; 40};
40 41
41#define DEBUGFS_ADD(name) \ 42#define DEBUGFS_ADD(name) \
@@ -101,7 +102,8 @@ static ssize_t tsf_write(struct file *file,
101static const struct file_operations tsf_ops = { 102static const struct file_operations tsf_ops = {
102 .read = tsf_read, 103 .read = tsf_read,
103 .write = tsf_write, 104 .write = tsf_write,
104 .open = mac80211_open_file_generic 105 .open = mac80211_open_file_generic,
106 .llseek = default_llseek,
105}; 107};
106 108
107static ssize_t reset_write(struct file *file, const char __user *user_buf, 109static ssize_t reset_write(struct file *file, const char __user *user_buf,
@@ -120,6 +122,7 @@ static ssize_t reset_write(struct file *file, const char __user *user_buf,
120static const struct file_operations reset_ops = { 122static const struct file_operations reset_ops = {
121 .write = reset_write, 123 .write = reset_write,
122 .open = mac80211_open_file_generic, 124 .open = mac80211_open_file_generic,
125 .llseek = noop_llseek,
123}; 126};
124 127
125static ssize_t noack_read(struct file *file, char __user *user_buf, 128static ssize_t noack_read(struct file *file, char __user *user_buf,
@@ -155,7 +158,8 @@ static ssize_t noack_write(struct file *file,
155static const struct file_operations noack_ops = { 158static const struct file_operations noack_ops = {
156 .read = noack_read, 159 .read = noack_read,
157 .write = noack_write, 160 .write = noack_write,
158 .open = mac80211_open_file_generic 161 .open = mac80211_open_file_generic,
162 .llseek = default_llseek,
159}; 163};
160 164
161static ssize_t uapsd_queues_read(struct file *file, char __user *user_buf, 165static ssize_t uapsd_queues_read(struct file *file, char __user *user_buf,
@@ -201,7 +205,8 @@ static ssize_t uapsd_queues_write(struct file *file,
201static const struct file_operations uapsd_queues_ops = { 205static const struct file_operations uapsd_queues_ops = {
202 .read = uapsd_queues_read, 206 .read = uapsd_queues_read,
203 .write = uapsd_queues_write, 207 .write = uapsd_queues_write,
204 .open = mac80211_open_file_generic 208 .open = mac80211_open_file_generic,
209 .llseek = default_llseek,
205}; 210};
206 211
207static ssize_t uapsd_max_sp_len_read(struct file *file, char __user *user_buf, 212static ssize_t uapsd_max_sp_len_read(struct file *file, char __user *user_buf,
@@ -247,7 +252,8 @@ static ssize_t uapsd_max_sp_len_write(struct file *file,
247static const struct file_operations uapsd_max_sp_len_ops = { 252static const struct file_operations uapsd_max_sp_len_ops = {
248 .read = uapsd_max_sp_len_read, 253 .read = uapsd_max_sp_len_read,
249 .write = uapsd_max_sp_len_write, 254 .write = uapsd_max_sp_len_write,
250 .open = mac80211_open_file_generic 255 .open = mac80211_open_file_generic,
256 .llseek = default_llseek,
251}; 257};
252 258
253static ssize_t channel_type_read(struct file *file, char __user *user_buf, 259static ssize_t channel_type_read(struct file *file, char __user *user_buf,
@@ -279,7 +285,8 @@ static ssize_t channel_type_read(struct file *file, char __user *user_buf,
279 285
280static const struct file_operations channel_type_ops = { 286static const struct file_operations channel_type_ops = {
281 .read = channel_type_read, 287 .read = channel_type_read,
282 .open = mac80211_open_file_generic 288 .open = mac80211_open_file_generic,
289 .llseek = default_llseek,
283}; 290};
284 291
285static ssize_t queues_read(struct file *file, char __user *user_buf, 292static ssize_t queues_read(struct file *file, char __user *user_buf,
@@ -302,7 +309,8 @@ static ssize_t queues_read(struct file *file, char __user *user_buf,
302 309
303static const struct file_operations queues_ops = { 310static const struct file_operations queues_ops = {
304 .read = queues_read, 311 .read = queues_read,
305 .open = mac80211_open_file_generic 312 .open = mac80211_open_file_generic,
313 .llseek = default_llseek,
306}; 314};
307 315
308/* statistics stuff */ 316/* statistics stuff */
@@ -346,6 +354,7 @@ static ssize_t stats_ ##name## _read(struct file *file, \
346static const struct file_operations stats_ ##name## _ops = { \ 354static const struct file_operations stats_ ##name## _ops = { \
347 .read = stats_ ##name## _read, \ 355 .read = stats_ ##name## _read, \
348 .open = mac80211_open_file_generic, \ 356 .open = mac80211_open_file_generic, \
357 .llseek = generic_file_llseek, \
349}; 358};
350 359
351#define DEBUGFS_STATS_ADD(name, field) \ 360#define DEBUGFS_STATS_ADD(name, field) \
diff --git a/net/mac80211/debugfs_key.c b/net/mac80211/debugfs_key.c
index fa5e76e658ef..7cd8dd9fc240 100644
--- a/net/mac80211/debugfs_key.c
+++ b/net/mac80211/debugfs_key.c
@@ -32,6 +32,7 @@ static ssize_t key_##name##_read(struct file *file, \
32static const struct file_operations key_ ##name## _ops = { \ 32static const struct file_operations key_ ##name## _ops = { \
33 .read = key_##name##_read, \ 33 .read = key_##name##_read, \
34 .open = mac80211_open_file_generic, \ 34 .open = mac80211_open_file_generic, \
35 .llseek = generic_file_llseek, \
35} 36}
36 37
37#define KEY_FILE(name, format) \ 38#define KEY_FILE(name, format) \
@@ -46,6 +47,7 @@ static const struct file_operations key_ ##name## _ops = { \
46static const struct file_operations key_ ##name## _ops = { \ 47static const struct file_operations key_ ##name## _ops = { \
47 .read = key_conf_##name##_read, \ 48 .read = key_conf_##name##_read, \
48 .open = mac80211_open_file_generic, \ 49 .open = mac80211_open_file_generic, \
50 .llseek = generic_file_llseek, \
49} 51}
50 52
51#define KEY_CONF_FILE(name, format) \ 53#define KEY_CONF_FILE(name, format) \
diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c
index 20b2998fa0ed..8ad33eef7dda 100644
--- a/net/mac80211/debugfs_netdev.c
+++ b/net/mac80211/debugfs_netdev.c
@@ -121,6 +121,7 @@ static const struct file_operations name##_ops = { \
121 .read = ieee80211_if_read_##name, \ 121 .read = ieee80211_if_read_##name, \
122 .write = (_write), \ 122 .write = (_write), \
123 .open = mac80211_open_file_generic, \ 123 .open = mac80211_open_file_generic, \
124 .llseek = generic_file_llseek, \
124} 125}
125 126
126#define __IEEE80211_IF_FILE_W(name) \ 127#define __IEEE80211_IF_FILE_W(name) \
diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c
index 76839d4dfaac..6a8fdc372c43 100644
--- a/net/mac80211/debugfs_sta.c
+++ b/net/mac80211/debugfs_sta.c
@@ -36,6 +36,7 @@ static ssize_t sta_ ##name## _read(struct file *file, \
36static const struct file_operations sta_ ##name## _ops = { \ 36static const struct file_operations sta_ ##name## _ops = { \
37 .read = sta_##name##_read, \ 37 .read = sta_##name##_read, \
38 .open = mac80211_open_file_generic, \ 38 .open = mac80211_open_file_generic, \
39 .llseek = generic_file_llseek, \
39} 40}
40 41
41#define STA_OPS_RW(name) \ 42#define STA_OPS_RW(name) \
@@ -43,6 +44,7 @@ static const struct file_operations sta_ ##name## _ops = { \
43 .read = sta_##name##_read, \ 44 .read = sta_##name##_read, \
44 .write = sta_##name##_write, \ 45 .write = sta_##name##_write, \
45 .open = mac80211_open_file_generic, \ 46 .open = mac80211_open_file_generic, \
47 .llseek = generic_file_llseek, \
46} 48}
47 49
48#define STA_FILE(name, field, format) \ 50#define STA_FILE(name, field, format) \
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 798a91b100cc..ded5c3843e06 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -732,6 +732,12 @@ void ieee80211_unregister_hw(struct ieee80211_hw *hw)
732 732
733 rtnl_unlock(); 733 rtnl_unlock();
734 734
735 /*
736 * Now all work items will be gone, but the
737 * timer might still be armed, so delete it
738 */
739 del_timer_sync(&local->work_timer);
740
735 cancel_work_sync(&local->reconfig_filter); 741 cancel_work_sync(&local->reconfig_filter);
736 742
737 ieee80211_clear_tx_pending(local); 743 ieee80211_clear_tx_pending(local);
diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c
index be04d46110fe..334cbd3d2aae 100644
--- a/net/mac80211/rate.c
+++ b/net/mac80211/rate.c
@@ -145,6 +145,7 @@ static ssize_t rcname_read(struct file *file, char __user *userbuf,
145static const struct file_operations rcname_ops = { 145static const struct file_operations rcname_ops = {
146 .read = rcname_read, 146 .read = rcname_read,
147 .open = mac80211_open_file_generic, 147 .open = mac80211_open_file_generic,
148 .llseek = default_llseek,
148}; 149};
149#endif 150#endif
150 151
diff --git a/net/mac80211/rc80211_minstrel_debugfs.c b/net/mac80211/rc80211_minstrel_debugfs.c
index 241e76f3fdf2..a290ad231d77 100644
--- a/net/mac80211/rc80211_minstrel_debugfs.c
+++ b/net/mac80211/rc80211_minstrel_debugfs.c
@@ -122,6 +122,7 @@ static const struct file_operations minstrel_stat_fops = {
122 .open = minstrel_stats_open, 122 .open = minstrel_stats_open,
123 .read = minstrel_stats_read, 123 .read = minstrel_stats_read,
124 .release = minstrel_stats_release, 124 .release = minstrel_stats_release,
125 .llseek = default_llseek,
125}; 126};
126 127
127void 128void
diff --git a/net/mac80211/rc80211_minstrel_ht_debugfs.c b/net/mac80211/rc80211_minstrel_ht_debugfs.c
index 4a5a4b3e7799..cefcb5d2dae6 100644
--- a/net/mac80211/rc80211_minstrel_ht_debugfs.c
+++ b/net/mac80211/rc80211_minstrel_ht_debugfs.c
@@ -90,7 +90,7 @@ minstrel_ht_stats_open(struct inode *inode, struct file *file)
90 MINSTREL_TRUNC(mi->avg_ampdu_len * 10) % 10); 90 MINSTREL_TRUNC(mi->avg_ampdu_len * 10) % 10);
91 ms->len = p - ms->buf; 91 ms->len = p - ms->buf;
92 92
93 return 0; 93 return nonseekable_open(inode, file);
94} 94}
95 95
96static const struct file_operations minstrel_ht_stat_fops = { 96static const struct file_operations minstrel_ht_stat_fops = {
@@ -98,6 +98,7 @@ static const struct file_operations minstrel_ht_stat_fops = {
98 .open = minstrel_ht_stats_open, 98 .open = minstrel_ht_stats_open,
99 .read = minstrel_stats_read, 99 .read = minstrel_stats_read,
100 .release = minstrel_stats_release, 100 .release = minstrel_stats_release,
101 .llseek = no_llseek,
101}; 102};
102 103
103void 104void
diff --git a/net/mac80211/rc80211_pid_debugfs.c b/net/mac80211/rc80211_pid_debugfs.c
index 47438b4a9af5..7905f79cc2e4 100644
--- a/net/mac80211/rc80211_pid_debugfs.c
+++ b/net/mac80211/rc80211_pid_debugfs.c
@@ -206,6 +206,7 @@ static const struct file_operations rc_pid_fop_events = {
206 .poll = rate_control_pid_events_poll, 206 .poll = rate_control_pid_events_poll,
207 .open = rate_control_pid_events_open, 207 .open = rate_control_pid_events_open,
208 .release = rate_control_pid_events_release, 208 .release = rate_control_pid_events_release,
209 .llseek = noop_llseek,
209}; 210};
210 211
211void rate_control_pid_add_sta_debugfs(void *priv, void *priv_sta, 212void rate_control_pid_add_sta_debugfs(void *priv, void *priv_sta,
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index fa0f37e4afe4..28624282c5f3 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -2199,9 +2199,6 @@ static void ieee80211_rx_cooked_monitor(struct ieee80211_rx_data *rx,
2199 struct net_device *prev_dev = NULL; 2199 struct net_device *prev_dev = NULL;
2200 struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); 2200 struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
2201 2201
2202 if (status->flag & RX_FLAG_INTERNAL_CMTR)
2203 goto out_free_skb;
2204
2205 if (skb_headroom(skb) < sizeof(*rthdr) && 2202 if (skb_headroom(skb) < sizeof(*rthdr) &&
2206 pskb_expand_head(skb, sizeof(*rthdr), 0, GFP_ATOMIC)) 2203 pskb_expand_head(skb, sizeof(*rthdr), 0, GFP_ATOMIC))
2207 goto out_free_skb; 2204 goto out_free_skb;
@@ -2260,7 +2257,6 @@ static void ieee80211_rx_cooked_monitor(struct ieee80211_rx_data *rx,
2260 } else 2257 } else
2261 goto out_free_skb; 2258 goto out_free_skb;
2262 2259
2263 status->flag |= RX_FLAG_INTERNAL_CMTR;
2264 return; 2260 return;
2265 2261
2266 out_free_skb: 2262 out_free_skb:
diff --git a/net/mac80211/status.c b/net/mac80211/status.c
index 10caec5ea8fa..34da67995d94 100644
--- a/net/mac80211/status.c
+++ b/net/mac80211/status.c
@@ -377,7 +377,7 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb)
377 skb2 = skb_clone(skb, GFP_ATOMIC); 377 skb2 = skb_clone(skb, GFP_ATOMIC);
378 if (skb2) { 378 if (skb2) {
379 skb2->dev = prev_dev; 379 skb2->dev = prev_dev;
380 netif_receive_skb(skb2); 380 netif_rx(skb2);
381 } 381 }
382 } 382 }
383 383
@@ -386,7 +386,7 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb)
386 } 386 }
387 if (prev_dev) { 387 if (prev_dev) {
388 skb->dev = prev_dev; 388 skb->dev = prev_dev;
389 netif_receive_skb(skb); 389 netif_rx(skb);
390 skb = NULL; 390 skb = NULL;
391 } 391 }
392 rcu_read_unlock(); 392 rcu_read_unlock();
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 78b505d33bfb..fdaec7daff1d 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -27,7 +27,7 @@
27 27
28static DEFINE_MUTEX(afinfo_mutex); 28static DEFINE_MUTEX(afinfo_mutex);
29 29
30const struct nf_afinfo *nf_afinfo[NFPROTO_NUMPROTO] __read_mostly; 30const struct nf_afinfo __rcu *nf_afinfo[NFPROTO_NUMPROTO] __read_mostly;
31EXPORT_SYMBOL(nf_afinfo); 31EXPORT_SYMBOL(nf_afinfo);
32 32
33int nf_register_afinfo(const struct nf_afinfo *afinfo) 33int nf_register_afinfo(const struct nf_afinfo *afinfo)
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 4f8ddba48011..4c2f89df5cce 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -924,6 +924,7 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
924 924
925 ip_vs_out_stats(cp, skb); 925 ip_vs_out_stats(cp, skb);
926 ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp); 926 ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp);
927 ip_vs_update_conntrack(skb, cp, 0);
927 ip_vs_conn_put(cp); 928 ip_vs_conn_put(cp);
928 929
929 skb->ipvs_property = 1; 930 skb->ipvs_property = 1;
diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c
index f228a17ec649..7e9af5b76d9e 100644
--- a/net/netfilter/ipvs/ip_vs_ftp.c
+++ b/net/netfilter/ipvs/ip_vs_ftp.c
@@ -45,6 +45,7 @@
45#include <linux/netfilter.h> 45#include <linux/netfilter.h>
46#include <net/netfilter/nf_conntrack.h> 46#include <net/netfilter/nf_conntrack.h>
47#include <net/netfilter/nf_conntrack_expect.h> 47#include <net/netfilter/nf_conntrack_expect.h>
48#include <net/netfilter/nf_nat.h>
48#include <net/netfilter/nf_nat_helper.h> 49#include <net/netfilter/nf_nat_helper.h>
49#include <linux/gfp.h> 50#include <linux/gfp.h>
50#include <net/protocol.h> 51#include <net/protocol.h>
@@ -359,7 +360,7 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
359 buf_len = strlen(buf); 360 buf_len = strlen(buf);
360 361
361 ct = nf_ct_get(skb, &ctinfo); 362 ct = nf_ct_get(skb, &ctinfo);
362 if (ct && !nf_ct_is_untracked(ct)) { 363 if (ct && !nf_ct_is_untracked(ct) && nfct_nat(ct)) {
363 /* If mangling fails this function will return 0 364 /* If mangling fails this function will return 0
364 * which will cause the packet to be dropped. 365 * which will cause the packet to be dropped.
365 * Mangling can only fail under memory pressure, 366 * Mangling can only fail under memory pressure,
@@ -409,7 +410,6 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
409 union nf_inet_addr to; 410 union nf_inet_addr to;
410 __be16 port; 411 __be16 port;
411 struct ip_vs_conn *n_cp; 412 struct ip_vs_conn *n_cp;
412 struct nf_conn *ct;
413 413
414#ifdef CONFIG_IP_VS_IPV6 414#ifdef CONFIG_IP_VS_IPV6
415 /* This application helper doesn't work with IPv6 yet, 415 /* This application helper doesn't work with IPv6 yet,
@@ -496,11 +496,6 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
496 ip_vs_control_add(n_cp, cp); 496 ip_vs_control_add(n_cp, cp);
497 } 497 }
498 498
499 ct = (struct nf_conn *)skb->nfct;
500 if (ct && ct != &nf_conntrack_untracked)
501 ip_vs_expect_related(skb, ct, n_cp,
502 IPPROTO_TCP, &n_cp->dport, 1);
503
504 /* 499 /*
505 * Move tunnel to listen state 500 * Move tunnel to listen state
506 */ 501 */
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 21e1a5e9b9d3..49df6bea6a2d 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -349,8 +349,8 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
349} 349}
350#endif 350#endif
351 351
352static void 352void
353ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp) 353ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin)
354{ 354{
355 struct nf_conn *ct = (struct nf_conn *)skb->nfct; 355 struct nf_conn *ct = (struct nf_conn *)skb->nfct;
356 struct nf_conntrack_tuple new_tuple; 356 struct nf_conntrack_tuple new_tuple;
@@ -365,11 +365,17 @@ ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp)
365 * real-server we will see RIP->DIP. 365 * real-server we will see RIP->DIP.
366 */ 366 */
367 new_tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple; 367 new_tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
368 new_tuple.src.u3 = cp->daddr; 368 if (outin)
369 new_tuple.src.u3 = cp->daddr;
370 else
371 new_tuple.dst.u3 = cp->vaddr;
369 /* 372 /*
370 * This will also take care of UDP and other protocols. 373 * This will also take care of UDP and other protocols.
371 */ 374 */
372 new_tuple.src.u.tcp.port = cp->dport; 375 if (outin)
376 new_tuple.src.u.tcp.port = cp->dport;
377 else
378 new_tuple.dst.u.tcp.port = cp->vport;
373 nf_conntrack_alter_reply(ct, &new_tuple); 379 nf_conntrack_alter_reply(ct, &new_tuple);
374} 380}
375 381
@@ -428,7 +434,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
428 434
429 IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT"); 435 IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
430 436
431 ip_vs_update_conntrack(skb, cp); 437 ip_vs_update_conntrack(skb, cp, 1);
432 438
433 /* FIXME: when application helper enlarges the packet and the length 439 /* FIXME: when application helper enlarges the packet and the length
434 is larger than the MTU of outgoing device, there will be still 440 is larger than the MTU of outgoing device, there will be still
@@ -506,7 +512,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
506 512
507 IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT"); 513 IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
508 514
509 ip_vs_update_conntrack(skb, cp); 515 ip_vs_update_conntrack(skb, cp, 1);
510 516
511 /* FIXME: when application helper enlarges the packet and the length 517 /* FIXME: when application helper enlarges the packet and the length
512 is larger than the MTU of outgoing device, there will be still 518 is larger than the MTU of outgoing device, there will be still
diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c
index cdcc7649476b..5702de35e2bb 100644
--- a/net/netfilter/nf_conntrack_ecache.c
+++ b/net/netfilter/nf_conntrack_ecache.c
@@ -26,10 +26,10 @@
26 26
27static DEFINE_MUTEX(nf_ct_ecache_mutex); 27static DEFINE_MUTEX(nf_ct_ecache_mutex);
28 28
29struct nf_ct_event_notifier *nf_conntrack_event_cb __read_mostly; 29struct nf_ct_event_notifier __rcu *nf_conntrack_event_cb __read_mostly;
30EXPORT_SYMBOL_GPL(nf_conntrack_event_cb); 30EXPORT_SYMBOL_GPL(nf_conntrack_event_cb);
31 31
32struct nf_exp_event_notifier *nf_expect_event_cb __read_mostly; 32struct nf_exp_event_notifier __rcu *nf_expect_event_cb __read_mostly;
33EXPORT_SYMBOL_GPL(nf_expect_event_cb); 33EXPORT_SYMBOL_GPL(nf_expect_event_cb);
34 34
35/* deliver cached events and clear cache entry - must be called with locally 35/* deliver cached events and clear cache entry - must be called with locally
diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c
index 7dcf7a404190..bd82450c193f 100644
--- a/net/netfilter/nf_conntrack_extend.c
+++ b/net/netfilter/nf_conntrack_extend.c
@@ -16,7 +16,7 @@
16#include <linux/skbuff.h> 16#include <linux/skbuff.h>
17#include <net/netfilter/nf_conntrack_extend.h> 17#include <net/netfilter/nf_conntrack_extend.h>
18 18
19static struct nf_ct_ext_type *nf_ct_ext_types[NF_CT_EXT_NUM]; 19static struct nf_ct_ext_type __rcu *nf_ct_ext_types[NF_CT_EXT_NUM];
20static DEFINE_MUTEX(nf_ct_ext_type_mutex); 20static DEFINE_MUTEX(nf_ct_ext_type_mutex);
21 21
22void __nf_ct_ext_destroy(struct nf_conn *ct) 22void __nf_ct_ext_destroy(struct nf_conn *ct)
@@ -48,15 +48,17 @@ nf_ct_ext_create(struct nf_ct_ext **ext, enum nf_ct_ext_id id, gfp_t gfp)
48{ 48{
49 unsigned int off, len; 49 unsigned int off, len;
50 struct nf_ct_ext_type *t; 50 struct nf_ct_ext_type *t;
51 size_t alloc_size;
51 52
52 rcu_read_lock(); 53 rcu_read_lock();
53 t = rcu_dereference(nf_ct_ext_types[id]); 54 t = rcu_dereference(nf_ct_ext_types[id]);
54 BUG_ON(t == NULL); 55 BUG_ON(t == NULL);
55 off = ALIGN(sizeof(struct nf_ct_ext), t->align); 56 off = ALIGN(sizeof(struct nf_ct_ext), t->align);
56 len = off + t->len; 57 len = off + t->len;
58 alloc_size = t->alloc_size;
57 rcu_read_unlock(); 59 rcu_read_unlock();
58 60
59 *ext = kzalloc(t->alloc_size, gfp); 61 *ext = kzalloc(alloc_size, gfp);
60 if (!*ext) 62 if (!*ext)
61 return NULL; 63 return NULL;
62 64
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 5bae1cd15eea..146476c6441a 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -22,6 +22,7 @@
22#include <linux/rculist_nulls.h> 22#include <linux/rculist_nulls.h>
23#include <linux/types.h> 23#include <linux/types.h>
24#include <linux/timer.h> 24#include <linux/timer.h>
25#include <linux/security.h>
25#include <linux/skbuff.h> 26#include <linux/skbuff.h>
26#include <linux/errno.h> 27#include <linux/errno.h>
27#include <linux/netlink.h> 28#include <linux/netlink.h>
@@ -245,16 +246,31 @@ nla_put_failure:
245 246
246#ifdef CONFIG_NF_CONNTRACK_SECMARK 247#ifdef CONFIG_NF_CONNTRACK_SECMARK
247static inline int 248static inline int
248ctnetlink_dump_secmark(struct sk_buff *skb, const struct nf_conn *ct) 249ctnetlink_dump_secctx(struct sk_buff *skb, const struct nf_conn *ct)
249{ 250{
250 NLA_PUT_BE32(skb, CTA_SECMARK, htonl(ct->secmark)); 251 struct nlattr *nest_secctx;
251 return 0; 252 int len, ret;
253 char *secctx;
254
255 ret = security_secid_to_secctx(ct->secmark, &secctx, &len);
256 if (ret)
257 return ret;
258
259 ret = -1;
260 nest_secctx = nla_nest_start(skb, CTA_SECCTX | NLA_F_NESTED);
261 if (!nest_secctx)
262 goto nla_put_failure;
263
264 NLA_PUT_STRING(skb, CTA_SECCTX_NAME, secctx);
265 nla_nest_end(skb, nest_secctx);
252 266
267 ret = 0;
253nla_put_failure: 268nla_put_failure:
254 return -1; 269 security_release_secctx(secctx, len);
270 return ret;
255} 271}
256#else 272#else
257#define ctnetlink_dump_secmark(a, b) (0) 273#define ctnetlink_dump_secctx(a, b) (0)
258#endif 274#endif
259 275
260#define master_tuple(ct) &(ct->master->tuplehash[IP_CT_DIR_ORIGINAL].tuple) 276#define master_tuple(ct) &(ct->master->tuplehash[IP_CT_DIR_ORIGINAL].tuple)
@@ -391,7 +407,7 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 pid, u32 seq,
391 ctnetlink_dump_protoinfo(skb, ct) < 0 || 407 ctnetlink_dump_protoinfo(skb, ct) < 0 ||
392 ctnetlink_dump_helpinfo(skb, ct) < 0 || 408 ctnetlink_dump_helpinfo(skb, ct) < 0 ||
393 ctnetlink_dump_mark(skb, ct) < 0 || 409 ctnetlink_dump_mark(skb, ct) < 0 ||
394 ctnetlink_dump_secmark(skb, ct) < 0 || 410 ctnetlink_dump_secctx(skb, ct) < 0 ||
395 ctnetlink_dump_id(skb, ct) < 0 || 411 ctnetlink_dump_id(skb, ct) < 0 ||
396 ctnetlink_dump_use(skb, ct) < 0 || 412 ctnetlink_dump_use(skb, ct) < 0 ||
397 ctnetlink_dump_master(skb, ct) < 0 || 413 ctnetlink_dump_master(skb, ct) < 0 ||
@@ -437,6 +453,17 @@ ctnetlink_counters_size(const struct nf_conn *ct)
437 ; 453 ;
438} 454}
439 455
456#ifdef CONFIG_NF_CONNTRACK_SECMARK
457static int ctnetlink_nlmsg_secctx_size(const struct nf_conn *ct)
458{
459 int len;
460
461 security_secid_to_secctx(ct->secmark, NULL, &len);
462
463 return sizeof(char) * len;
464}
465#endif
466
440static inline size_t 467static inline size_t
441ctnetlink_nlmsg_size(const struct nf_conn *ct) 468ctnetlink_nlmsg_size(const struct nf_conn *ct)
442{ 469{
@@ -453,7 +480,8 @@ ctnetlink_nlmsg_size(const struct nf_conn *ct)
453 + nla_total_size(0) /* CTA_HELP */ 480 + nla_total_size(0) /* CTA_HELP */
454 + nla_total_size(NF_CT_HELPER_NAME_LEN) /* CTA_HELP_NAME */ 481 + nla_total_size(NF_CT_HELPER_NAME_LEN) /* CTA_HELP_NAME */
455#ifdef CONFIG_NF_CONNTRACK_SECMARK 482#ifdef CONFIG_NF_CONNTRACK_SECMARK
456 + nla_total_size(sizeof(u_int32_t)) /* CTA_SECMARK */ 483 + nla_total_size(0) /* CTA_SECCTX */
484 + nla_total_size(ctnetlink_nlmsg_secctx_size(ct)) /* CTA_SECCTX_NAME */
457#endif 485#endif
458#ifdef CONFIG_NF_NAT_NEEDED 486#ifdef CONFIG_NF_NAT_NEEDED
459 + 2 * nla_total_size(0) /* CTA_NAT_SEQ_ADJ_ORIG|REPL */ 487 + 2 * nla_total_size(0) /* CTA_NAT_SEQ_ADJ_ORIG|REPL */
@@ -556,7 +584,7 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
556 584
557#ifdef CONFIG_NF_CONNTRACK_SECMARK 585#ifdef CONFIG_NF_CONNTRACK_SECMARK
558 if ((events & (1 << IPCT_SECMARK) || ct->secmark) 586 if ((events & (1 << IPCT_SECMARK) || ct->secmark)
559 && ctnetlink_dump_secmark(skb, ct) < 0) 587 && ctnetlink_dump_secctx(skb, ct) < 0)
560 goto nla_put_failure; 588 goto nla_put_failure;
561#endif 589#endif
562 590
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
index 5886ba1d52a0..ed6d92958023 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -28,8 +28,8 @@
28#include <net/netfilter/nf_conntrack_l4proto.h> 28#include <net/netfilter/nf_conntrack_l4proto.h>
29#include <net/netfilter/nf_conntrack_core.h> 29#include <net/netfilter/nf_conntrack_core.h>
30 30
31static struct nf_conntrack_l4proto **nf_ct_protos[PF_MAX] __read_mostly; 31static struct nf_conntrack_l4proto __rcu **nf_ct_protos[PF_MAX] __read_mostly;
32struct nf_conntrack_l3proto *nf_ct_l3protos[AF_MAX] __read_mostly; 32struct nf_conntrack_l3proto __rcu *nf_ct_l3protos[AF_MAX] __read_mostly;
33EXPORT_SYMBOL_GPL(nf_ct_l3protos); 33EXPORT_SYMBOL_GPL(nf_ct_l3protos);
34 34
35static DEFINE_MUTEX(nf_ct_proto_mutex); 35static DEFINE_MUTEX(nf_ct_proto_mutex);
diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c
index 53d892210a04..f64de9544866 100644
--- a/net/netfilter/nf_conntrack_sip.c
+++ b/net/netfilter/nf_conntrack_sip.c
@@ -1376,7 +1376,7 @@ static int sip_help_tcp(struct sk_buff *skb, unsigned int protoff,
1376 unsigned int msglen, origlen; 1376 unsigned int msglen, origlen;
1377 const char *dptr, *end; 1377 const char *dptr, *end;
1378 s16 diff, tdiff = 0; 1378 s16 diff, tdiff = 0;
1379 int ret; 1379 int ret = NF_ACCEPT;
1380 typeof(nf_nat_sip_seq_adjust_hook) nf_nat_sip_seq_adjust; 1380 typeof(nf_nat_sip_seq_adjust_hook) nf_nat_sip_seq_adjust;
1381 1381
1382 if (ctinfo != IP_CT_ESTABLISHED && 1382 if (ctinfo != IP_CT_ESTABLISHED &&
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index eb973fcd67ab..0fb65705b44b 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -15,6 +15,7 @@
15#include <linux/seq_file.h> 15#include <linux/seq_file.h>
16#include <linux/percpu.h> 16#include <linux/percpu.h>
17#include <linux/netdevice.h> 17#include <linux/netdevice.h>
18#include <linux/security.h>
18#include <net/net_namespace.h> 19#include <net/net_namespace.h>
19#ifdef CONFIG_SYSCTL 20#ifdef CONFIG_SYSCTL
20#include <linux/sysctl.h> 21#include <linux/sysctl.h>
@@ -108,6 +109,29 @@ static void ct_seq_stop(struct seq_file *s, void *v)
108 rcu_read_unlock(); 109 rcu_read_unlock();
109} 110}
110 111
112#ifdef CONFIG_NF_CONNTRACK_SECMARK
113static int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
114{
115 int ret;
116 u32 len;
117 char *secctx;
118
119 ret = security_secid_to_secctx(ct->secmark, &secctx, &len);
120 if (ret)
121 return ret;
122
123 ret = seq_printf(s, "secctx=%s ", secctx);
124
125 security_release_secctx(secctx, len);
126 return ret;
127}
128#else
129static inline int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
130{
131 return 0;
132}
133#endif
134
111/* return 0 on success, 1 in case of error */ 135/* return 0 on success, 1 in case of error */
112static int ct_seq_show(struct seq_file *s, void *v) 136static int ct_seq_show(struct seq_file *s, void *v)
113{ 137{
@@ -168,10 +192,8 @@ static int ct_seq_show(struct seq_file *s, void *v)
168 goto release; 192 goto release;
169#endif 193#endif
170 194
171#ifdef CONFIG_NF_CONNTRACK_SECMARK 195 if (ct_show_secctx(s, ct))
172 if (seq_printf(s, "secmark=%u ", ct->secmark))
173 goto release; 196 goto release;
174#endif
175 197
176#ifdef CONFIG_NF_CONNTRACK_ZONES 198#ifdef CONFIG_NF_CONNTRACK_ZONES
177 if (seq_printf(s, "zone=%u ", nf_ct_zone(ct))) 199 if (seq_printf(s, "zone=%u ", nf_ct_zone(ct)))
diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
index 7df37fd786bc..b07393eab88e 100644
--- a/net/netfilter/nf_log.c
+++ b/net/netfilter/nf_log.c
@@ -16,7 +16,7 @@
16#define NF_LOG_PREFIXLEN 128 16#define NF_LOG_PREFIXLEN 128
17#define NFLOGGER_NAME_LEN 64 17#define NFLOGGER_NAME_LEN 64
18 18
19static const struct nf_logger *nf_loggers[NFPROTO_NUMPROTO] __read_mostly; 19static const struct nf_logger __rcu *nf_loggers[NFPROTO_NUMPROTO] __read_mostly;
20static struct list_head nf_loggers_l[NFPROTO_NUMPROTO] __read_mostly; 20static struct list_head nf_loggers_l[NFPROTO_NUMPROTO] __read_mostly;
21static DEFINE_MUTEX(nf_log_mutex); 21static DEFINE_MUTEX(nf_log_mutex);
22 22
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index 78b3cf9c519c..74aebed5bd28 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -18,7 +18,7 @@
18 * long term mutex. The handler must provide an an outfn() to accept packets 18 * long term mutex. The handler must provide an an outfn() to accept packets
19 * for queueing and must reinject all packets it receives, no matter what. 19 * for queueing and must reinject all packets it receives, no matter what.
20 */ 20 */
21static const struct nf_queue_handler *queue_handler[NFPROTO_NUMPROTO] __read_mostly; 21static const struct nf_queue_handler __rcu *queue_handler[NFPROTO_NUMPROTO] __read_mostly;
22 22
23static DEFINE_MUTEX(queue_handler_mutex); 23static DEFINE_MUTEX(queue_handler_mutex);
24 24
diff --git a/net/netfilter/nf_tproxy_core.c b/net/netfilter/nf_tproxy_core.c
index 5490fc37c92d..daab8c4a903c 100644
--- a/net/netfilter/nf_tproxy_core.c
+++ b/net/netfilter/nf_tproxy_core.c
@@ -70,7 +70,11 @@ nf_tproxy_destructor(struct sk_buff *skb)
70int 70int
71nf_tproxy_assign_sock(struct sk_buff *skb, struct sock *sk) 71nf_tproxy_assign_sock(struct sk_buff *skb, struct sock *sk)
72{ 72{
73 if (inet_sk(sk)->transparent) { 73 bool transparent = (sk->sk_state == TCP_TIME_WAIT) ?
74 inet_twsk(sk)->tw_transparent :
75 inet_sk(sk)->transparent;
76
77 if (transparent) {
74 skb_orphan(skb); 78 skb_orphan(skb);
75 skb->sk = sk; 79 skb->sk = sk;
76 skb->destructor = nf_tproxy_destructor; 80 skb->destructor = nf_tproxy_destructor;
diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c
index 0cb6053f02fd..782e51986a6f 100644
--- a/net/netfilter/xt_CT.c
+++ b/net/netfilter/xt_CT.c
@@ -9,7 +9,6 @@
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/gfp.h> 10#include <linux/gfp.h>
11#include <linux/skbuff.h> 11#include <linux/skbuff.h>
12#include <linux/selinux.h>
13#include <linux/netfilter_ipv4/ip_tables.h> 12#include <linux/netfilter_ipv4/ip_tables.h>
14#include <linux/netfilter_ipv6/ip6_tables.h> 13#include <linux/netfilter_ipv6/ip6_tables.h>
15#include <linux/netfilter/x_tables.h> 14#include <linux/netfilter/x_tables.h>
diff --git a/net/netfilter/xt_SECMARK.c b/net/netfilter/xt_SECMARK.c
index 23b2d6c486b5..9faf5e050b79 100644
--- a/net/netfilter/xt_SECMARK.c
+++ b/net/netfilter/xt_SECMARK.c
@@ -14,8 +14,8 @@
14 */ 14 */
15#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 15#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/security.h>
17#include <linux/skbuff.h> 18#include <linux/skbuff.h>
18#include <linux/selinux.h>
19#include <linux/netfilter/x_tables.h> 19#include <linux/netfilter/x_tables.h>
20#include <linux/netfilter/xt_SECMARK.h> 20#include <linux/netfilter/xt_SECMARK.h>
21 21
@@ -39,9 +39,8 @@ secmark_tg(struct sk_buff *skb, const struct xt_action_param *par)
39 39
40 switch (mode) { 40 switch (mode) {
41 case SECMARK_MODE_SEL: 41 case SECMARK_MODE_SEL:
42 secmark = info->u.sel.selsid; 42 secmark = info->secid;
43 break; 43 break;
44
45 default: 44 default:
46 BUG(); 45 BUG();
47 } 46 }
@@ -50,33 +49,33 @@ secmark_tg(struct sk_buff *skb, const struct xt_action_param *par)
50 return XT_CONTINUE; 49 return XT_CONTINUE;
51} 50}
52 51
53static int checkentry_selinux(struct xt_secmark_target_info *info) 52static int checkentry_lsm(struct xt_secmark_target_info *info)
54{ 53{
55 int err; 54 int err;
56 struct xt_secmark_target_selinux_info *sel = &info->u.sel;
57 55
58 sel->selctx[SECMARK_SELCTX_MAX - 1] = '\0'; 56 info->secctx[SECMARK_SECCTX_MAX - 1] = '\0';
57 info->secid = 0;
59 58
60 err = selinux_string_to_sid(sel->selctx, &sel->selsid); 59 err = security_secctx_to_secid(info->secctx, strlen(info->secctx),
60 &info->secid);
61 if (err) { 61 if (err) {
62 if (err == -EINVAL) 62 if (err == -EINVAL)
63 pr_info("invalid SELinux context \'%s\'\n", 63 pr_info("invalid security context \'%s\'\n", info->secctx);
64 sel->selctx);
65 return err; 64 return err;
66 } 65 }
67 66
68 if (!sel->selsid) { 67 if (!info->secid) {
69 pr_info("unable to map SELinux context \'%s\'\n", sel->selctx); 68 pr_info("unable to map security context \'%s\'\n", info->secctx);
70 return -ENOENT; 69 return -ENOENT;
71 } 70 }
72 71
73 err = selinux_secmark_relabel_packet_permission(sel->selsid); 72 err = security_secmark_relabel_packet(info->secid);
74 if (err) { 73 if (err) {
75 pr_info("unable to obtain relabeling permission\n"); 74 pr_info("unable to obtain relabeling permission\n");
76 return err; 75 return err;
77 } 76 }
78 77
79 selinux_secmark_refcount_inc(); 78 security_secmark_refcount_inc();
80 return 0; 79 return 0;
81} 80}
82 81
@@ -100,16 +99,16 @@ static int secmark_tg_check(const struct xt_tgchk_param *par)
100 99
101 switch (info->mode) { 100 switch (info->mode) {
102 case SECMARK_MODE_SEL: 101 case SECMARK_MODE_SEL:
103 err = checkentry_selinux(info);
104 if (err <= 0)
105 return err;
106 break; 102 break;
107
108 default: 103 default:
109 pr_info("invalid mode: %hu\n", info->mode); 104 pr_info("invalid mode: %hu\n", info->mode);
110 return -EINVAL; 105 return -EINVAL;
111 } 106 }
112 107
108 err = checkentry_lsm(info);
109 if (err)
110 return err;
111
113 if (!mode) 112 if (!mode)
114 mode = info->mode; 113 mode = info->mode;
115 return 0; 114 return 0;
@@ -119,7 +118,7 @@ static void secmark_tg_destroy(const struct xt_tgdtor_param *par)
119{ 118{
120 switch (mode) { 119 switch (mode) {
121 case SECMARK_MODE_SEL: 120 case SECMARK_MODE_SEL:
122 selinux_secmark_refcount_dec(); 121 security_secmark_refcount_dec();
123 } 122 }
124} 123}
125 124
diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c
index 76aec6a44762..d2ff15a2412b 100644
--- a/net/netfilter/xt_recent.c
+++ b/net/netfilter/xt_recent.c
@@ -567,6 +567,7 @@ static const struct file_operations recent_mt_fops = {
567 .write = recent_mt_proc_write, 567 .write = recent_mt_proc_write,
568 .release = seq_release_private, 568 .release = seq_release_private,
569 .owner = THIS_MODULE, 569 .owner = THIS_MODULE,
570 .llseek = seq_lseek,
570}; 571};
571 572
572static int __net_init recent_proc_net_init(struct net *net) 573static int __net_init recent_proc_net_init(struct net *net)
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 980fe4ad0016..cd96ed3ccee4 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -2102,6 +2102,26 @@ static void __net_exit netlink_net_exit(struct net *net)
2102#endif 2102#endif
2103} 2103}
2104 2104
2105static void __init netlink_add_usersock_entry(void)
2106{
2107 unsigned long *listeners;
2108 int groups = 32;
2109
2110 listeners = kzalloc(NLGRPSZ(groups) + sizeof(struct listeners_rcu_head),
2111 GFP_KERNEL);
2112 if (!listeners)
2113 panic("netlink_add_usersock_entry: Cannot allocate listneres\n");
2114
2115 netlink_table_grab();
2116
2117 nl_table[NETLINK_USERSOCK].groups = groups;
2118 nl_table[NETLINK_USERSOCK].listeners = listeners;
2119 nl_table[NETLINK_USERSOCK].module = THIS_MODULE;
2120 nl_table[NETLINK_USERSOCK].registered = 1;
2121
2122 netlink_table_ungrab();
2123}
2124
2105static struct pernet_operations __net_initdata netlink_net_ops = { 2125static struct pernet_operations __net_initdata netlink_net_ops = {
2106 .init = netlink_net_init, 2126 .init = netlink_net_init,
2107 .exit = netlink_net_exit, 2127 .exit = netlink_net_exit,
@@ -2150,6 +2170,8 @@ static int __init netlink_proto_init(void)
2150 hash->rehash_time = jiffies; 2170 hash->rehash_time = jiffies;
2151 } 2171 }
2152 2172
2173 netlink_add_usersock_entry();
2174
2153 sock_register(&netlink_family_ops); 2175 sock_register(&netlink_family_ops);
2154 register_pernet_subsys(&netlink_net_ops); 2176 register_pernet_subsys(&netlink_net_ops);
2155 /* The netlink device handler may be needed early. */ 2177 /* The netlink device handler may be needed early. */
diff --git a/net/nonet.c b/net/nonet.c
index 92e76640c7cd..b1a73fda9c12 100644
--- a/net/nonet.c
+++ b/net/nonet.c
@@ -22,4 +22,5 @@ static int sock_no_open(struct inode *irrelevant, struct file *dontcare)
22const struct file_operations bad_sock_fops = { 22const struct file_operations bad_sock_fops = {
23 .owner = THIS_MODULE, 23 .owner = THIS_MODULE,
24 .open = sock_no_open, 24 .open = sock_no_open,
25 .llseek = noop_llseek,
25}; 26};
diff --git a/net/phonet/pep.c b/net/phonet/pep.c
index b2a3ae6cad78..15003021f4f0 100644
--- a/net/phonet/pep.c
+++ b/net/phonet/pep.c
@@ -225,12 +225,13 @@ static void pipe_grant_credits(struct sock *sk)
225static int pipe_rcv_status(struct sock *sk, struct sk_buff *skb) 225static int pipe_rcv_status(struct sock *sk, struct sk_buff *skb)
226{ 226{
227 struct pep_sock *pn = pep_sk(sk); 227 struct pep_sock *pn = pep_sk(sk);
228 struct pnpipehdr *hdr = pnp_hdr(skb); 228 struct pnpipehdr *hdr;
229 int wake = 0; 229 int wake = 0;
230 230
231 if (!pskb_may_pull(skb, sizeof(*hdr) + 4)) 231 if (!pskb_may_pull(skb, sizeof(*hdr) + 4))
232 return -EINVAL; 232 return -EINVAL;
233 233
234 hdr = pnp_hdr(skb);
234 if (hdr->data[0] != PN_PEP_TYPE_COMMON) { 235 if (hdr->data[0] != PN_PEP_TYPE_COMMON) {
235 LIMIT_NETDEBUG(KERN_DEBUG"Phonet unknown PEP type: %u\n", 236 LIMIT_NETDEBUG(KERN_DEBUG"Phonet unknown PEP type: %u\n",
236 (unsigned)hdr->data[0]); 237 (unsigned)hdr->data[0]);
diff --git a/net/rds/page.c b/net/rds/page.c
index 595a952d4b17..1dfbfea12e9b 100644
--- a/net/rds/page.c
+++ b/net/rds/page.c
@@ -57,30 +57,17 @@ int rds_page_copy_user(struct page *page, unsigned long offset,
57 unsigned long ret; 57 unsigned long ret;
58 void *addr; 58 void *addr;
59 59
60 if (to_user) 60 addr = kmap(page);
61 if (to_user) {
61 rds_stats_add(s_copy_to_user, bytes); 62 rds_stats_add(s_copy_to_user, bytes);
62 else 63 ret = copy_to_user(ptr, addr + offset, bytes);
64 } else {
63 rds_stats_add(s_copy_from_user, bytes); 65 rds_stats_add(s_copy_from_user, bytes);
64 66 ret = copy_from_user(addr + offset, ptr, bytes);
65 addr = kmap_atomic(page, KM_USER0);
66 if (to_user)
67 ret = __copy_to_user_inatomic(ptr, addr + offset, bytes);
68 else
69 ret = __copy_from_user_inatomic(addr + offset, ptr, bytes);
70 kunmap_atomic(addr, KM_USER0);
71
72 if (ret) {
73 addr = kmap(page);
74 if (to_user)
75 ret = copy_to_user(ptr, addr + offset, bytes);
76 else
77 ret = copy_from_user(addr + offset, ptr, bytes);
78 kunmap(page);
79 if (ret)
80 return -EFAULT;
81 } 67 }
68 kunmap(page);
82 69
83 return 0; 70 return ret ? -EFAULT : 0;
84} 71}
85EXPORT_SYMBOL_GPL(rds_page_copy_user); 72EXPORT_SYMBOL_GPL(rds_page_copy_user);
86 73
diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c
index c397524c039c..c519939e8da9 100644
--- a/net/rds/tcp_connect.c
+++ b/net/rds/tcp_connect.c
@@ -43,7 +43,7 @@ void rds_tcp_state_change(struct sock *sk)
43 struct rds_connection *conn; 43 struct rds_connection *conn;
44 struct rds_tcp_connection *tc; 44 struct rds_tcp_connection *tc;
45 45
46 read_lock(&sk->sk_callback_lock); 46 read_lock_bh(&sk->sk_callback_lock);
47 conn = sk->sk_user_data; 47 conn = sk->sk_user_data;
48 if (conn == NULL) { 48 if (conn == NULL) {
49 state_change = sk->sk_state_change; 49 state_change = sk->sk_state_change;
@@ -68,7 +68,7 @@ void rds_tcp_state_change(struct sock *sk)
68 break; 68 break;
69 } 69 }
70out: 70out:
71 read_unlock(&sk->sk_callback_lock); 71 read_unlock_bh(&sk->sk_callback_lock);
72 state_change(sk); 72 state_change(sk);
73} 73}
74 74
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index 975183fe6950..27844f231d10 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -114,7 +114,7 @@ void rds_tcp_listen_data_ready(struct sock *sk, int bytes)
114 114
115 rdsdebug("listen data ready sk %p\n", sk); 115 rdsdebug("listen data ready sk %p\n", sk);
116 116
117 read_lock(&sk->sk_callback_lock); 117 read_lock_bh(&sk->sk_callback_lock);
118 ready = sk->sk_user_data; 118 ready = sk->sk_user_data;
119 if (ready == NULL) { /* check for teardown race */ 119 if (ready == NULL) { /* check for teardown race */
120 ready = sk->sk_data_ready; 120 ready = sk->sk_data_ready;
@@ -131,7 +131,7 @@ void rds_tcp_listen_data_ready(struct sock *sk, int bytes)
131 queue_work(rds_wq, &rds_tcp_listen_work); 131 queue_work(rds_wq, &rds_tcp_listen_work);
132 132
133out: 133out:
134 read_unlock(&sk->sk_callback_lock); 134 read_unlock_bh(&sk->sk_callback_lock);
135 ready(sk, bytes); 135 ready(sk, bytes);
136} 136}
137 137
diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c
index 1aba6878fa5d..e43797404102 100644
--- a/net/rds/tcp_recv.c
+++ b/net/rds/tcp_recv.c
@@ -324,7 +324,7 @@ void rds_tcp_data_ready(struct sock *sk, int bytes)
324 324
325 rdsdebug("data ready sk %p bytes %d\n", sk, bytes); 325 rdsdebug("data ready sk %p bytes %d\n", sk, bytes);
326 326
327 read_lock(&sk->sk_callback_lock); 327 read_lock_bh(&sk->sk_callback_lock);
328 conn = sk->sk_user_data; 328 conn = sk->sk_user_data;
329 if (conn == NULL) { /* check for teardown race */ 329 if (conn == NULL) { /* check for teardown race */
330 ready = sk->sk_data_ready; 330 ready = sk->sk_data_ready;
@@ -338,7 +338,7 @@ void rds_tcp_data_ready(struct sock *sk, int bytes)
338 if (rds_tcp_read_sock(conn, GFP_ATOMIC, KM_SOFTIRQ0) == -ENOMEM) 338 if (rds_tcp_read_sock(conn, GFP_ATOMIC, KM_SOFTIRQ0) == -ENOMEM)
339 queue_delayed_work(rds_wq, &conn->c_recv_w, 0); 339 queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
340out: 340out:
341 read_unlock(&sk->sk_callback_lock); 341 read_unlock_bh(&sk->sk_callback_lock);
342 ready(sk, bytes); 342 ready(sk, bytes);
343} 343}
344 344
diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c
index a28b895ff0d1..2f012a07d94d 100644
--- a/net/rds/tcp_send.c
+++ b/net/rds/tcp_send.c
@@ -224,7 +224,7 @@ void rds_tcp_write_space(struct sock *sk)
224 struct rds_connection *conn; 224 struct rds_connection *conn;
225 struct rds_tcp_connection *tc; 225 struct rds_tcp_connection *tc;
226 226
227 read_lock(&sk->sk_callback_lock); 227 read_lock_bh(&sk->sk_callback_lock);
228 conn = sk->sk_user_data; 228 conn = sk->sk_user_data;
229 if (conn == NULL) { 229 if (conn == NULL) {
230 write_space = sk->sk_write_space; 230 write_space = sk->sk_write_space;
@@ -244,7 +244,7 @@ void rds_tcp_write_space(struct sock *sk)
244 queue_delayed_work(rds_wq, &conn->c_send_w, 0); 244 queue_delayed_work(rds_wq, &conn->c_send_w, 0);
245 245
246out: 246out:
247 read_unlock(&sk->sk_callback_lock); 247 read_unlock_bh(&sk->sk_callback_lock);
248 248
249 /* 249 /*
250 * write_space is only called when data leaves tcp's send queue if 250 * write_space is only called when data leaves tcp's send queue if
diff --git a/net/rfkill/core.c b/net/rfkill/core.c
index 51875a0c5d48..04f599089e6d 100644
--- a/net/rfkill/core.c
+++ b/net/rfkill/core.c
@@ -1241,6 +1241,7 @@ static const struct file_operations rfkill_fops = {
1241 .unlocked_ioctl = rfkill_fop_ioctl, 1241 .unlocked_ioctl = rfkill_fop_ioctl,
1242 .compat_ioctl = rfkill_fop_ioctl, 1242 .compat_ioctl = rfkill_fop_ioctl,
1243#endif 1243#endif
1244 .llseek = no_llseek,
1244}; 1245};
1245 1246
1246static struct miscdevice rfkill_miscdev = { 1247static struct miscdevice rfkill_miscdev = {
diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c
index 8e45e76a95f5..d952e7eac188 100644
--- a/net/rose/af_rose.c
+++ b/net/rose/af_rose.c
@@ -679,7 +679,7 @@ static int rose_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
679 if (addr_len == sizeof(struct sockaddr_rose) && addr->srose_ndigis > 1) 679 if (addr_len == sizeof(struct sockaddr_rose) && addr->srose_ndigis > 1)
680 return -EINVAL; 680 return -EINVAL;
681 681
682 if (addr->srose_ndigis > ROSE_MAX_DIGIS) 682 if ((unsigned int) addr->srose_ndigis > ROSE_MAX_DIGIS)
683 return -EINVAL; 683 return -EINVAL;
684 684
685 if ((dev = rose_dev_get(&addr->srose_addr)) == NULL) { 685 if ((dev = rose_dev_get(&addr->srose_addr)) == NULL) {
@@ -739,7 +739,7 @@ static int rose_connect(struct socket *sock, struct sockaddr *uaddr, int addr_le
739 if (addr_len == sizeof(struct sockaddr_rose) && addr->srose_ndigis > 1) 739 if (addr_len == sizeof(struct sockaddr_rose) && addr->srose_ndigis > 1)
740 return -EINVAL; 740 return -EINVAL;
741 741
742 if (addr->srose_ndigis > ROSE_MAX_DIGIS) 742 if ((unsigned int) addr->srose_ndigis > ROSE_MAX_DIGIS)
743 return -EINVAL; 743 return -EINVAL;
744 744
745 /* Source + Destination digis should not exceed ROSE_MAX_DIGIS */ 745 /* Source + Destination digis should not exceed ROSE_MAX_DIGIS */
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index 537a48732e9e..7ebf7439b478 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -350,22 +350,19 @@ tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
350{ 350{
351 unsigned char *b = skb_tail_pointer(skb); 351 unsigned char *b = skb_tail_pointer(skb);
352 struct tcf_police *police = a->priv; 352 struct tcf_police *police = a->priv;
353 struct tc_police opt; 353 struct tc_police opt = {
354 354 .index = police->tcf_index,
355 opt.index = police->tcf_index; 355 .action = police->tcf_action,
356 opt.action = police->tcf_action; 356 .mtu = police->tcfp_mtu,
357 opt.mtu = police->tcfp_mtu; 357 .burst = police->tcfp_burst,
358 opt.burst = police->tcfp_burst; 358 .refcnt = police->tcf_refcnt - ref,
359 opt.refcnt = police->tcf_refcnt - ref; 359 .bindcnt = police->tcf_bindcnt - bind,
360 opt.bindcnt = police->tcf_bindcnt - bind; 360 };
361
361 if (police->tcfp_R_tab) 362 if (police->tcfp_R_tab)
362 opt.rate = police->tcfp_R_tab->rate; 363 opt.rate = police->tcfp_R_tab->rate;
363 else
364 memset(&opt.rate, 0, sizeof(opt.rate));
365 if (police->tcfp_P_tab) 364 if (police->tcfp_P_tab)
366 opt.peakrate = police->tcfp_P_tab->rate; 365 opt.peakrate = police->tcfp_P_tab->rate;
367 else
368 memset(&opt.peakrate, 0, sizeof(opt.peakrate));
369 NLA_PUT(skb, TCA_POLICE_TBF, sizeof(opt), &opt); 366 NLA_PUT(skb, TCA_POLICE_TBF, sizeof(opt), &opt);
370 if (police->tcfp_result) 367 if (police->tcfp_result)
371 NLA_PUT_U32(skb, TCA_POLICE_RESULT, police->tcfp_result); 368 NLA_PUT_U32(skb, TCA_POLICE_RESULT, police->tcfp_result);
diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c
index 78ef2c5e130b..37dff78e9cb1 100644
--- a/net/sched/cls_cgroup.c
+++ b/net/sched/cls_cgroup.c
@@ -123,7 +123,7 @@ static int cls_cgroup_classify(struct sk_buff *skb, struct tcf_proto *tp,
123 * calls by looking at the number of nested bh disable calls because 123 * calls by looking at the number of nested bh disable calls because
124 * softirqs always disables bh. 124 * softirqs always disables bh.
125 */ 125 */
126 if (softirq_count() != SOFTIRQ_OFFSET) { 126 if (in_serving_softirq()) {
127 /* If there is an sk_classid we'll use that. */ 127 /* If there is an sk_classid we'll use that. */
128 if (!skb->sk) 128 if (!skb->sk)
129 return -1; 129 return -1;
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index 7416a5c73b2a..b0c2a82178af 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -137,7 +137,7 @@ next_knode:
137 int toff = off + key->off + (off2 & key->offmask); 137 int toff = off + key->off + (off2 & key->offmask);
138 __be32 *data, _data; 138 __be32 *data, _data;
139 139
140 if (skb_headroom(skb) + toff < 0) 140 if (skb_headroom(skb) + toff > INT_MAX)
141 goto out; 141 goto out;
142 142
143 data = skb_header_pointer(skb, toff, 4, &_data); 143 data = skb_header_pointer(skb, toff, 4, &_data);
diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c
index 340662789529..6318e1136b83 100644
--- a/net/sched/sch_atm.c
+++ b/net/sched/sch_atm.c
@@ -255,10 +255,6 @@ static int atm_tc_change(struct Qdisc *sch, u32 classid, u32 parent,
255 error = -EINVAL; 255 error = -EINVAL;
256 goto err_out; 256 goto err_out;
257 } 257 }
258 if (!list_empty(&flow->list)) {
259 error = -EEXIST;
260 goto err_out;
261 }
262 } else { 258 } else {
263 int i; 259 int i;
264 unsigned long cl; 260 unsigned long cl;
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index abd904be4287..47496098d35c 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -761,8 +761,8 @@ init_vf(struct hfsc_class *cl, unsigned int len)
761 if (f != cl->cl_f) { 761 if (f != cl->cl_f) {
762 cl->cl_f = f; 762 cl->cl_f = f;
763 cftree_update(cl); 763 cftree_update(cl);
764 update_cfmin(cl->cl_parent);
765 } 764 }
765 update_cfmin(cl->cl_parent);
766 } 766 }
767} 767}
768 768
diff --git a/net/sctp/auth.c b/net/sctp/auth.c
index 86366390038a..ddbbf7c81fa1 100644
--- a/net/sctp/auth.c
+++ b/net/sctp/auth.c
@@ -543,16 +543,20 @@ struct sctp_hmac *sctp_auth_asoc_get_hmac(const struct sctp_association *asoc)
543 id = ntohs(hmacs->hmac_ids[i]); 543 id = ntohs(hmacs->hmac_ids[i]);
544 544
545 /* Check the id is in the supported range */ 545 /* Check the id is in the supported range */
546 if (id > SCTP_AUTH_HMAC_ID_MAX) 546 if (id > SCTP_AUTH_HMAC_ID_MAX) {
547 id = 0;
547 continue; 548 continue;
549 }
548 550
549 /* See is we support the id. Supported IDs have name and 551 /* See is we support the id. Supported IDs have name and
550 * length fields set, so that we can allocated and use 552 * length fields set, so that we can allocated and use
551 * them. We can safely just check for name, for without the 553 * them. We can safely just check for name, for without the
552 * name, we can't allocate the TFM. 554 * name, we can't allocate the TFM.
553 */ 555 */
554 if (!sctp_hmac_list[id].hmac_name) 556 if (!sctp_hmac_list[id].hmac_name) {
557 id = 0;
555 continue; 558 continue;
559 }
556 560
557 break; 561 break;
558 } 562 }
diff --git a/net/sctp/output.c b/net/sctp/output.c
index a646681f5acd..bcc4590ccaf2 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -92,7 +92,6 @@ struct sctp_packet *sctp_packet_config(struct sctp_packet *packet,
92 SCTP_DEBUG_PRINTK("%s: packet:%p vtag:0x%x\n", __func__, 92 SCTP_DEBUG_PRINTK("%s: packet:%p vtag:0x%x\n", __func__,
93 packet, vtag); 93 packet, vtag);
94 94
95 sctp_packet_reset(packet);
96 packet->vtag = vtag; 95 packet->vtag = vtag;
97 96
98 if (ecn_capable && sctp_packet_empty(packet)) { 97 if (ecn_capable && sctp_packet_empty(packet)) {
diff --git a/net/sctp/probe.c b/net/sctp/probe.c
index db3a42b8b349..289b1ba62cac 100644
--- a/net/sctp/probe.c
+++ b/net/sctp/probe.c
@@ -117,6 +117,7 @@ static const struct file_operations sctpprobe_fops = {
117 .owner = THIS_MODULE, 117 .owner = THIS_MODULE,
118 .open = sctpprobe_open, 118 .open = sctpprobe_open,
119 .read = sctpprobe_read, 119 .read = sctpprobe_read,
120 .llseek = noop_llseek,
120}; 121};
121 122
122sctp_disposition_t jsctp_sf_eat_sack(const struct sctp_endpoint *ep, 123sctp_disposition_t jsctp_sf_eat_sack(const struct sctp_endpoint *ep,
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 24b2cd555637..d344dc481ccc 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -1232,6 +1232,18 @@ out:
1232 return 0; 1232 return 0;
1233} 1233}
1234 1234
1235static bool list_has_sctp_addr(const struct list_head *list,
1236 union sctp_addr *ipaddr)
1237{
1238 struct sctp_transport *addr;
1239
1240 list_for_each_entry(addr, list, transports) {
1241 if (sctp_cmp_addr_exact(ipaddr, &addr->ipaddr))
1242 return true;
1243 }
1244
1245 return false;
1246}
1235/* A restart is occurring, check to make sure no new addresses 1247/* A restart is occurring, check to make sure no new addresses
1236 * are being added as we may be under a takeover attack. 1248 * are being added as we may be under a takeover attack.
1237 */ 1249 */
@@ -1240,10 +1252,10 @@ static int sctp_sf_check_restart_addrs(const struct sctp_association *new_asoc,
1240 struct sctp_chunk *init, 1252 struct sctp_chunk *init,
1241 sctp_cmd_seq_t *commands) 1253 sctp_cmd_seq_t *commands)
1242{ 1254{
1243 struct sctp_transport *new_addr, *addr; 1255 struct sctp_transport *new_addr;
1244 int found; 1256 int ret = 1;
1245 1257
1246 /* Implementor's Guide - Sectin 5.2.2 1258 /* Implementor's Guide - Section 5.2.2
1247 * ... 1259 * ...
1248 * Before responding the endpoint MUST check to see if the 1260 * Before responding the endpoint MUST check to see if the
1249 * unexpected INIT adds new addresses to the association. If new 1261 * unexpected INIT adds new addresses to the association. If new
@@ -1254,31 +1266,19 @@ static int sctp_sf_check_restart_addrs(const struct sctp_association *new_asoc,
1254 /* Search through all current addresses and make sure 1266 /* Search through all current addresses and make sure
1255 * we aren't adding any new ones. 1267 * we aren't adding any new ones.
1256 */ 1268 */
1257 new_addr = NULL;
1258 found = 0;
1259
1260 list_for_each_entry(new_addr, &new_asoc->peer.transport_addr_list, 1269 list_for_each_entry(new_addr, &new_asoc->peer.transport_addr_list,
1261 transports) { 1270 transports) {
1262 found = 0; 1271 if (!list_has_sctp_addr(&asoc->peer.transport_addr_list,
1263 list_for_each_entry(addr, &asoc->peer.transport_addr_list, 1272 &new_addr->ipaddr)) {
1264 transports) { 1273 sctp_sf_send_restart_abort(&new_addr->ipaddr, init,
1265 if (sctp_cmp_addr_exact(&new_addr->ipaddr, 1274 commands);
1266 &addr->ipaddr)) { 1275 ret = 0;
1267 found = 1;
1268 break;
1269 }
1270 }
1271 if (!found)
1272 break; 1276 break;
1273 } 1277 }
1274
1275 /* If a new address was added, ABORT the sender. */
1276 if (!found && new_addr) {
1277 sctp_sf_send_restart_abort(&new_addr->ipaddr, init, commands);
1278 } 1278 }
1279 1279
1280 /* Return success if all addresses were found. */ 1280 /* Return success if all addresses were found. */
1281 return found; 1281 return ret;
1282} 1282}
1283 1283
1284/* Populate the verification/tie tags based on overlapping INIT 1284/* Populate the verification/tie tags based on overlapping INIT
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index ca44917872d2..fbb70770ad05 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -916,6 +916,11 @@ SCTP_STATIC int sctp_setsockopt_bindx(struct sock* sk,
916 /* Walk through the addrs buffer and count the number of addresses. */ 916 /* Walk through the addrs buffer and count the number of addresses. */
917 addr_buf = kaddrs; 917 addr_buf = kaddrs;
918 while (walk_size < addrs_size) { 918 while (walk_size < addrs_size) {
919 if (walk_size + sizeof(sa_family_t) > addrs_size) {
920 kfree(kaddrs);
921 return -EINVAL;
922 }
923
919 sa_addr = (struct sockaddr *)addr_buf; 924 sa_addr = (struct sockaddr *)addr_buf;
920 af = sctp_get_af_specific(sa_addr->sa_family); 925 af = sctp_get_af_specific(sa_addr->sa_family);
921 926
@@ -1002,9 +1007,13 @@ static int __sctp_connect(struct sock* sk,
1002 /* Walk through the addrs buffer and count the number of addresses. */ 1007 /* Walk through the addrs buffer and count the number of addresses. */
1003 addr_buf = kaddrs; 1008 addr_buf = kaddrs;
1004 while (walk_size < addrs_size) { 1009 while (walk_size < addrs_size) {
1010 if (walk_size + sizeof(sa_family_t) > addrs_size) {
1011 err = -EINVAL;
1012 goto out_free;
1013 }
1014
1005 sa_addr = (union sctp_addr *)addr_buf; 1015 sa_addr = (union sctp_addr *)addr_buf;
1006 af = sctp_get_af_specific(sa_addr->sa.sa_family); 1016 af = sctp_get_af_specific(sa_addr->sa.sa_family);
1007 port = ntohs(sa_addr->v4.sin_port);
1008 1017
1009 /* If the address family is not supported or if this address 1018 /* If the address family is not supported or if this address
1010 * causes the address buffer to overflow return EINVAL. 1019 * causes the address buffer to overflow return EINVAL.
@@ -1014,6 +1023,8 @@ static int __sctp_connect(struct sock* sk,
1014 goto out_free; 1023 goto out_free;
1015 } 1024 }
1016 1025
1026 port = ntohs(sa_addr->v4.sin_port);
1027
1017 /* Save current address so we can work with it */ 1028 /* Save current address so we can work with it */
1018 memcpy(&to, sa_addr, af->sockaddr_len); 1029 memcpy(&to, sa_addr, af->sockaddr_len);
1019 1030
diff --git a/net/socket.c b/net/socket.c
index 2270b941bcc7..9eac5c394134 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -502,6 +502,7 @@ static int sock_no_open(struct inode *irrelevant, struct file *dontcare)
502const struct file_operations bad_sock_fops = { 502const struct file_operations bad_sock_fops = {
503 .owner = THIS_MODULE, 503 .owner = THIS_MODULE,
504 .open = sock_no_open, 504 .open = sock_no_open,
505 .llseek = noop_llseek,
505}; 506};
506 507
507/** 508/**
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index 36cb66022a27..e9eaaf7d43c1 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -38,7 +38,7 @@ static const struct rpc_authops *auth_flavors[RPC_AUTH_MAXFLAVOR] = {
38static LIST_HEAD(cred_unused); 38static LIST_HEAD(cred_unused);
39static unsigned long number_cred_unused; 39static unsigned long number_cred_unused;
40 40
41#define MAX_HASHTABLE_BITS (10) 41#define MAX_HASHTABLE_BITS (14)
42static int param_set_hashtbl_sz(const char *val, const struct kernel_param *kp) 42static int param_set_hashtbl_sz(const char *val, const struct kernel_param *kp)
43{ 43{
44 unsigned long num; 44 unsigned long num;
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index dcfc66bab2bb..12c485982814 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -745,17 +745,18 @@ gss_pipe_release(struct inode *inode)
745 struct rpc_inode *rpci = RPC_I(inode); 745 struct rpc_inode *rpci = RPC_I(inode);
746 struct gss_upcall_msg *gss_msg; 746 struct gss_upcall_msg *gss_msg;
747 747
748restart:
748 spin_lock(&inode->i_lock); 749 spin_lock(&inode->i_lock);
749 while (!list_empty(&rpci->in_downcall)) { 750 list_for_each_entry(gss_msg, &rpci->in_downcall, list) {
750 751
751 gss_msg = list_entry(rpci->in_downcall.next, 752 if (!list_empty(&gss_msg->msg.list))
752 struct gss_upcall_msg, list); 753 continue;
753 gss_msg->msg.errno = -EPIPE; 754 gss_msg->msg.errno = -EPIPE;
754 atomic_inc(&gss_msg->count); 755 atomic_inc(&gss_msg->count);
755 __gss_unhash_msg(gss_msg); 756 __gss_unhash_msg(gss_msg);
756 spin_unlock(&inode->i_lock); 757 spin_unlock(&inode->i_lock);
757 gss_release_msg(gss_msg); 758 gss_release_msg(gss_msg);
758 spin_lock(&inode->i_lock); 759 goto restart;
759 } 760 }
760 spin_unlock(&inode->i_lock); 761 spin_unlock(&inode->i_lock);
761 762
diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c
index 032644610524..778e5dfc5144 100644
--- a/net/sunrpc/auth_gss/gss_krb5_mech.c
+++ b/net/sunrpc/auth_gss/gss_krb5_mech.c
@@ -237,6 +237,7 @@ get_key(const void *p, const void *end,
237 if (!supported_gss_krb5_enctype(alg)) { 237 if (!supported_gss_krb5_enctype(alg)) {
238 printk(KERN_WARNING "gss_kerberos_mech: unsupported " 238 printk(KERN_WARNING "gss_kerberos_mech: unsupported "
239 "encryption key algorithm %d\n", alg); 239 "encryption key algorithm %d\n", alg);
240 p = ERR_PTR(-EINVAL);
240 goto out_err; 241 goto out_err;
241 } 242 }
242 p = simple_get_netobj(p, end, &key); 243 p = simple_get_netobj(p, end, &key);
@@ -282,15 +283,19 @@ gss_import_v1_context(const void *p, const void *end, struct krb5_ctx *ctx)
282 ctx->enctype = ENCTYPE_DES_CBC_RAW; 283 ctx->enctype = ENCTYPE_DES_CBC_RAW;
283 284
284 ctx->gk5e = get_gss_krb5_enctype(ctx->enctype); 285 ctx->gk5e = get_gss_krb5_enctype(ctx->enctype);
285 if (ctx->gk5e == NULL) 286 if (ctx->gk5e == NULL) {
287 p = ERR_PTR(-EINVAL);
286 goto out_err; 288 goto out_err;
289 }
287 290
288 /* The downcall format was designed before we completely understood 291 /* The downcall format was designed before we completely understood
289 * the uses of the context fields; so it includes some stuff we 292 * the uses of the context fields; so it includes some stuff we
290 * just give some minimal sanity-checking, and some we ignore 293 * just give some minimal sanity-checking, and some we ignore
291 * completely (like the next twenty bytes): */ 294 * completely (like the next twenty bytes): */
292 if (unlikely(p + 20 > end || p + 20 < p)) 295 if (unlikely(p + 20 > end || p + 20 < p)) {
296 p = ERR_PTR(-EFAULT);
293 goto out_err; 297 goto out_err;
298 }
294 p += 20; 299 p += 20;
295 p = simple_get_bytes(p, end, &tmp, sizeof(tmp)); 300 p = simple_get_bytes(p, end, &tmp, sizeof(tmp));
296 if (IS_ERR(p)) 301 if (IS_ERR(p))
@@ -619,6 +624,7 @@ gss_import_v2_context(const void *p, const void *end, struct krb5_ctx *ctx,
619 if (ctx->seq_send64 != ctx->seq_send) { 624 if (ctx->seq_send64 != ctx->seq_send) {
620 dprintk("%s: seq_send64 %lx, seq_send %x overflow?\n", __func__, 625 dprintk("%s: seq_send64 %lx, seq_send %x overflow?\n", __func__,
621 (long unsigned)ctx->seq_send64, ctx->seq_send); 626 (long unsigned)ctx->seq_send64, ctx->seq_send);
627 p = ERR_PTR(-EINVAL);
622 goto out_err; 628 goto out_err;
623 } 629 }
624 p = simple_get_bytes(p, end, &ctx->enctype, sizeof(ctx->enctype)); 630 p = simple_get_bytes(p, end, &ctx->enctype, sizeof(ctx->enctype));
diff --git a/net/sunrpc/auth_gss/gss_spkm3_mech.c b/net/sunrpc/auth_gss/gss_spkm3_mech.c
index dc3f1f5ed865..adade3d313f2 100644
--- a/net/sunrpc/auth_gss/gss_spkm3_mech.c
+++ b/net/sunrpc/auth_gss/gss_spkm3_mech.c
@@ -100,6 +100,7 @@ gss_import_sec_context_spkm3(const void *p, size_t len,
100 if (version != 1) { 100 if (version != 1) {
101 dprintk("RPC: unknown spkm3 token format: " 101 dprintk("RPC: unknown spkm3 token format: "
102 "obsolete nfs-utils?\n"); 102 "obsolete nfs-utils?\n");
103 p = ERR_PTR(-EINVAL);
103 goto out_err_free_ctx; 104 goto out_err_free_ctx;
104 } 105 }
105 106
@@ -135,8 +136,10 @@ gss_import_sec_context_spkm3(const void *p, size_t len,
135 if (IS_ERR(p)) 136 if (IS_ERR(p))
136 goto out_err_free_intg_alg; 137 goto out_err_free_intg_alg;
137 138
138 if (p != end) 139 if (p != end) {
140 p = ERR_PTR(-EFAULT);
139 goto out_err_free_intg_key; 141 goto out_err_free_intg_key;
142 }
140 143
141 ctx_id->internal_ctx_id = ctx; 144 ctx_id->internal_ctx_id = ctx;
142 145
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index 2b06410e584e..7dce81a926c5 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -28,7 +28,6 @@
28#include <linux/workqueue.h> 28#include <linux/workqueue.h>
29#include <linux/mutex.h> 29#include <linux/mutex.h>
30#include <linux/pagemap.h> 30#include <linux/pagemap.h>
31#include <linux/smp_lock.h>
32#include <asm/ioctls.h> 31#include <asm/ioctls.h>
33#include <linux/sunrpc/types.h> 32#include <linux/sunrpc/types.h>
34#include <linux/sunrpc/cache.h> 33#include <linux/sunrpc/cache.h>
@@ -1348,15 +1347,10 @@ static unsigned int cache_poll_procfs(struct file *filp, poll_table *wait)
1348static long cache_ioctl_procfs(struct file *filp, 1347static long cache_ioctl_procfs(struct file *filp,
1349 unsigned int cmd, unsigned long arg) 1348 unsigned int cmd, unsigned long arg)
1350{ 1349{
1351 long ret;
1352 struct inode *inode = filp->f_path.dentry->d_inode; 1350 struct inode *inode = filp->f_path.dentry->d_inode;
1353 struct cache_detail *cd = PDE(inode)->data; 1351 struct cache_detail *cd = PDE(inode)->data;
1354 1352
1355 lock_kernel(); 1353 return cache_ioctl(inode, filp, cmd, arg, cd);
1356 ret = cache_ioctl(inode, filp, cmd, arg, cd);
1357 unlock_kernel();
1358
1359 return ret;
1360} 1354}
1361 1355
1362static int cache_open_procfs(struct inode *inode, struct file *filp) 1356static int cache_open_procfs(struct inode *inode, struct file *filp)
@@ -1441,6 +1435,7 @@ static const struct file_operations cache_flush_operations_procfs = {
1441 .read = read_flush_procfs, 1435 .read = read_flush_procfs,
1442 .write = write_flush_procfs, 1436 .write = write_flush_procfs,
1443 .release = release_flush_procfs, 1437 .release = release_flush_procfs,
1438 .llseek = no_llseek,
1444}; 1439};
1445 1440
1446static void remove_cache_proc_entries(struct cache_detail *cd) 1441static void remove_cache_proc_entries(struct cache_detail *cd)
@@ -1555,13 +1550,8 @@ static long cache_ioctl_pipefs(struct file *filp,
1555{ 1550{
1556 struct inode *inode = filp->f_dentry->d_inode; 1551 struct inode *inode = filp->f_dentry->d_inode;
1557 struct cache_detail *cd = RPC_I(inode)->private; 1552 struct cache_detail *cd = RPC_I(inode)->private;
1558 long ret;
1559 1553
1560 lock_kernel(); 1554 return cache_ioctl(inode, filp, cmd, arg, cd);
1561 ret = cache_ioctl(inode, filp, cmd, arg, cd);
1562 unlock_kernel();
1563
1564 return ret;
1565} 1555}
1566 1556
1567static int cache_open_pipefs(struct inode *inode, struct file *filp) 1557static int cache_open_pipefs(struct inode *inode, struct file *filp)
@@ -1646,6 +1636,7 @@ const struct file_operations cache_flush_operations_pipefs = {
1646 .read = read_flush_pipefs, 1636 .read = read_flush_pipefs,
1647 .write = write_flush_pipefs, 1637 .write = write_flush_pipefs,
1648 .release = release_flush_pipefs, 1638 .release = release_flush_pipefs,
1639 .llseek = no_llseek,
1649}; 1640};
1650 1641
1651int sunrpc_cache_register_pipefs(struct dentry *parent, 1642int sunrpc_cache_register_pipefs(struct dentry *parent,
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 2388d83b68ff..fa5549079d79 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -226,7 +226,7 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, stru
226 goto out_no_principal; 226 goto out_no_principal;
227 } 227 }
228 228
229 kref_init(&clnt->cl_kref); 229 atomic_set(&clnt->cl_count, 1);
230 230
231 err = rpc_setup_pipedir(clnt, program->pipe_dir_name); 231 err = rpc_setup_pipedir(clnt, program->pipe_dir_name);
232 if (err < 0) 232 if (err < 0)
@@ -390,14 +390,14 @@ rpc_clone_client(struct rpc_clnt *clnt)
390 if (new->cl_principal == NULL) 390 if (new->cl_principal == NULL)
391 goto out_no_principal; 391 goto out_no_principal;
392 } 392 }
393 kref_init(&new->cl_kref); 393 atomic_set(&new->cl_count, 1);
394 err = rpc_setup_pipedir(new, clnt->cl_program->pipe_dir_name); 394 err = rpc_setup_pipedir(new, clnt->cl_program->pipe_dir_name);
395 if (err != 0) 395 if (err != 0)
396 goto out_no_path; 396 goto out_no_path;
397 if (new->cl_auth) 397 if (new->cl_auth)
398 atomic_inc(&new->cl_auth->au_count); 398 atomic_inc(&new->cl_auth->au_count);
399 xprt_get(clnt->cl_xprt); 399 xprt_get(clnt->cl_xprt);
400 kref_get(&clnt->cl_kref); 400 atomic_inc(&clnt->cl_count);
401 rpc_register_client(new); 401 rpc_register_client(new);
402 rpciod_up(); 402 rpciod_up();
403 return new; 403 return new;
@@ -465,10 +465,8 @@ EXPORT_SYMBOL_GPL(rpc_shutdown_client);
465 * Free an RPC client 465 * Free an RPC client
466 */ 466 */
467static void 467static void
468rpc_free_client(struct kref *kref) 468rpc_free_client(struct rpc_clnt *clnt)
469{ 469{
470 struct rpc_clnt *clnt = container_of(kref, struct rpc_clnt, cl_kref);
471
472 dprintk("RPC: destroying %s client for %s\n", 470 dprintk("RPC: destroying %s client for %s\n",
473 clnt->cl_protname, clnt->cl_server); 471 clnt->cl_protname, clnt->cl_server);
474 if (!IS_ERR(clnt->cl_path.dentry)) { 472 if (!IS_ERR(clnt->cl_path.dentry)) {
@@ -495,12 +493,10 @@ out_free:
495 * Free an RPC client 493 * Free an RPC client
496 */ 494 */
497static void 495static void
498rpc_free_auth(struct kref *kref) 496rpc_free_auth(struct rpc_clnt *clnt)
499{ 497{
500 struct rpc_clnt *clnt = container_of(kref, struct rpc_clnt, cl_kref);
501
502 if (clnt->cl_auth == NULL) { 498 if (clnt->cl_auth == NULL) {
503 rpc_free_client(kref); 499 rpc_free_client(clnt);
504 return; 500 return;
505 } 501 }
506 502
@@ -509,10 +505,11 @@ rpc_free_auth(struct kref *kref)
509 * release remaining GSS contexts. This mechanism ensures 505 * release remaining GSS contexts. This mechanism ensures
510 * that it can do so safely. 506 * that it can do so safely.
511 */ 507 */
512 kref_init(kref); 508 atomic_inc(&clnt->cl_count);
513 rpcauth_release(clnt->cl_auth); 509 rpcauth_release(clnt->cl_auth);
514 clnt->cl_auth = NULL; 510 clnt->cl_auth = NULL;
515 kref_put(kref, rpc_free_client); 511 if (atomic_dec_and_test(&clnt->cl_count))
512 rpc_free_client(clnt);
516} 513}
517 514
518/* 515/*
@@ -525,7 +522,8 @@ rpc_release_client(struct rpc_clnt *clnt)
525 522
526 if (list_empty(&clnt->cl_tasks)) 523 if (list_empty(&clnt->cl_tasks))
527 wake_up(&destroy_wait); 524 wake_up(&destroy_wait);
528 kref_put(&clnt->cl_kref, rpc_free_auth); 525 if (atomic_dec_and_test(&clnt->cl_count))
526 rpc_free_auth(clnt);
529} 527}
530 528
531/** 529/**
@@ -588,7 +586,7 @@ void rpc_task_set_client(struct rpc_task *task, struct rpc_clnt *clnt)
588 if (clnt != NULL) { 586 if (clnt != NULL) {
589 rpc_task_release_client(task); 587 rpc_task_release_client(task);
590 task->tk_client = clnt; 588 task->tk_client = clnt;
591 kref_get(&clnt->cl_kref); 589 atomic_inc(&clnt->cl_count);
592 if (clnt->cl_softrtry) 590 if (clnt->cl_softrtry)
593 task->tk_flags |= RPC_TASK_SOFT; 591 task->tk_flags |= RPC_TASK_SOFT;
594 /* Add to the client's list of all tasks */ 592 /* Add to the client's list of all tasks */
@@ -931,7 +929,7 @@ call_reserveresult(struct rpc_task *task)
931 task->tk_status = 0; 929 task->tk_status = 0;
932 if (status >= 0) { 930 if (status >= 0) {
933 if (task->tk_rqstp) { 931 if (task->tk_rqstp) {
934 task->tk_action = call_allocate; 932 task->tk_action = call_refresh;
935 return; 933 return;
936 } 934 }
937 935
@@ -966,13 +964,54 @@ call_reserveresult(struct rpc_task *task)
966} 964}
967 965
968/* 966/*
969 * 2. Allocate the buffer. For details, see sched.c:rpc_malloc. 967 * 2. Bind and/or refresh the credentials
968 */
969static void
970call_refresh(struct rpc_task *task)
971{
972 dprint_status(task);
973
974 task->tk_action = call_refreshresult;
975 task->tk_status = 0;
976 task->tk_client->cl_stats->rpcauthrefresh++;
977 rpcauth_refreshcred(task);
978}
979
980/*
981 * 2a. Process the results of a credential refresh
982 */
983static void
984call_refreshresult(struct rpc_task *task)
985{
986 int status = task->tk_status;
987
988 dprint_status(task);
989
990 task->tk_status = 0;
991 task->tk_action = call_allocate;
992 if (status >= 0 && rpcauth_uptodatecred(task))
993 return;
994 switch (status) {
995 case -EACCES:
996 rpc_exit(task, -EACCES);
997 return;
998 case -ENOMEM:
999 rpc_exit(task, -ENOMEM);
1000 return;
1001 case -ETIMEDOUT:
1002 rpc_delay(task, 3*HZ);
1003 }
1004 task->tk_action = call_refresh;
1005}
1006
1007/*
1008 * 2b. Allocate the buffer. For details, see sched.c:rpc_malloc.
970 * (Note: buffer memory is freed in xprt_release). 1009 * (Note: buffer memory is freed in xprt_release).
971 */ 1010 */
972static void 1011static void
973call_allocate(struct rpc_task *task) 1012call_allocate(struct rpc_task *task)
974{ 1013{
975 unsigned int slack = task->tk_client->cl_auth->au_cslack; 1014 unsigned int slack = task->tk_rqstp->rq_cred->cr_auth->au_cslack;
976 struct rpc_rqst *req = task->tk_rqstp; 1015 struct rpc_rqst *req = task->tk_rqstp;
977 struct rpc_xprt *xprt = task->tk_xprt; 1016 struct rpc_xprt *xprt = task->tk_xprt;
978 struct rpc_procinfo *proc = task->tk_msg.rpc_proc; 1017 struct rpc_procinfo *proc = task->tk_msg.rpc_proc;
@@ -980,7 +1019,7 @@ call_allocate(struct rpc_task *task)
980 dprint_status(task); 1019 dprint_status(task);
981 1020
982 task->tk_status = 0; 1021 task->tk_status = 0;
983 task->tk_action = call_refresh; 1022 task->tk_action = call_bind;
984 1023
985 if (req->rq_buffer) 1024 if (req->rq_buffer)
986 return; 1025 return;
@@ -1017,47 +1056,6 @@ call_allocate(struct rpc_task *task)
1017 rpc_exit(task, -ERESTARTSYS); 1056 rpc_exit(task, -ERESTARTSYS);
1018} 1057}
1019 1058
1020/*
1021 * 2a. Bind and/or refresh the credentials
1022 */
1023static void
1024call_refresh(struct rpc_task *task)
1025{
1026 dprint_status(task);
1027
1028 task->tk_action = call_refreshresult;
1029 task->tk_status = 0;
1030 task->tk_client->cl_stats->rpcauthrefresh++;
1031 rpcauth_refreshcred(task);
1032}
1033
1034/*
1035 * 2b. Process the results of a credential refresh
1036 */
1037static void
1038call_refreshresult(struct rpc_task *task)
1039{
1040 int status = task->tk_status;
1041
1042 dprint_status(task);
1043
1044 task->tk_status = 0;
1045 task->tk_action = call_bind;
1046 if (status >= 0 && rpcauth_uptodatecred(task))
1047 return;
1048 switch (status) {
1049 case -EACCES:
1050 rpc_exit(task, -EACCES);
1051 return;
1052 case -ENOMEM:
1053 rpc_exit(task, -ENOMEM);
1054 return;
1055 case -ETIMEDOUT:
1056 rpc_delay(task, 3*HZ);
1057 }
1058 task->tk_action = call_refresh;
1059}
1060
1061static inline int 1059static inline int
1062rpc_task_need_encode(struct rpc_task *task) 1060rpc_task_need_encode(struct rpc_task *task)
1063{ 1061{
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index 95ccbcf45d3e..28bcd52e3ce9 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -27,7 +27,6 @@
27#include <linux/workqueue.h> 27#include <linux/workqueue.h>
28#include <linux/sunrpc/rpc_pipe_fs.h> 28#include <linux/sunrpc/rpc_pipe_fs.h>
29#include <linux/sunrpc/cache.h> 29#include <linux/sunrpc/cache.h>
30#include <linux/smp_lock.h>
31 30
32static struct vfsmount *rpc_mount __read_mostly; 31static struct vfsmount *rpc_mount __read_mostly;
33static int rpc_mount_count; 32static int rpc_mount_count;
@@ -48,7 +47,7 @@ static void rpc_purge_list(struct rpc_inode *rpci, struct list_head *head,
48 return; 47 return;
49 do { 48 do {
50 msg = list_entry(head->next, struct rpc_pipe_msg, list); 49 msg = list_entry(head->next, struct rpc_pipe_msg, list);
51 list_del(&msg->list); 50 list_del_init(&msg->list);
52 msg->errno = err; 51 msg->errno = err;
53 destroy_msg(msg); 52 destroy_msg(msg);
54 } while (!list_empty(head)); 53 } while (!list_empty(head));
@@ -208,7 +207,7 @@ rpc_pipe_release(struct inode *inode, struct file *filp)
208 if (msg != NULL) { 207 if (msg != NULL) {
209 spin_lock(&inode->i_lock); 208 spin_lock(&inode->i_lock);
210 msg->errno = -EAGAIN; 209 msg->errno = -EAGAIN;
211 list_del(&msg->list); 210 list_del_init(&msg->list);
212 spin_unlock(&inode->i_lock); 211 spin_unlock(&inode->i_lock);
213 rpci->ops->destroy_msg(msg); 212 rpci->ops->destroy_msg(msg);
214 } 213 }
@@ -268,7 +267,7 @@ rpc_pipe_read(struct file *filp, char __user *buf, size_t len, loff_t *offset)
268 if (res < 0 || msg->len == msg->copied) { 267 if (res < 0 || msg->len == msg->copied) {
269 filp->private_data = NULL; 268 filp->private_data = NULL;
270 spin_lock(&inode->i_lock); 269 spin_lock(&inode->i_lock);
271 list_del(&msg->list); 270 list_del_init(&msg->list);
272 spin_unlock(&inode->i_lock); 271 spin_unlock(&inode->i_lock);
273 rpci->ops->destroy_msg(msg); 272 rpci->ops->destroy_msg(msg);
274 } 273 }
@@ -309,40 +308,33 @@ rpc_pipe_poll(struct file *filp, struct poll_table_struct *wait)
309 return mask; 308 return mask;
310} 309}
311 310
312static int 311static long
313rpc_pipe_ioctl_unlocked(struct file *filp, unsigned int cmd, unsigned long arg) 312rpc_pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
314{ 313{
315 struct rpc_inode *rpci = RPC_I(filp->f_path.dentry->d_inode); 314 struct inode *inode = filp->f_path.dentry->d_inode;
315 struct rpc_inode *rpci = RPC_I(inode);
316 int len; 316 int len;
317 317
318 switch (cmd) { 318 switch (cmd) {
319 case FIONREAD: 319 case FIONREAD:
320 if (rpci->ops == NULL) 320 spin_lock(&inode->i_lock);
321 if (rpci->ops == NULL) {
322 spin_unlock(&inode->i_lock);
321 return -EPIPE; 323 return -EPIPE;
324 }
322 len = rpci->pipelen; 325 len = rpci->pipelen;
323 if (filp->private_data) { 326 if (filp->private_data) {
324 struct rpc_pipe_msg *msg; 327 struct rpc_pipe_msg *msg;
325 msg = (struct rpc_pipe_msg *)filp->private_data; 328 msg = (struct rpc_pipe_msg *)filp->private_data;
326 len += msg->len - msg->copied; 329 len += msg->len - msg->copied;
327 } 330 }
331 spin_unlock(&inode->i_lock);
328 return put_user(len, (int __user *)arg); 332 return put_user(len, (int __user *)arg);
329 default: 333 default:
330 return -EINVAL; 334 return -EINVAL;
331 } 335 }
332} 336}
333 337
334static long
335rpc_pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
336{
337 long ret;
338
339 lock_kernel();
340 ret = rpc_pipe_ioctl_unlocked(filp, cmd, arg);
341 unlock_kernel();
342
343 return ret;
344}
345
346static const struct file_operations rpc_pipe_fops = { 338static const struct file_operations rpc_pipe_fops = {
347 .owner = THIS_MODULE, 339 .owner = THIS_MODULE,
348 .llseek = no_llseek, 340 .llseek = no_llseek,
@@ -371,21 +363,23 @@ rpc_show_info(struct seq_file *m, void *v)
371static int 363static int
372rpc_info_open(struct inode *inode, struct file *file) 364rpc_info_open(struct inode *inode, struct file *file)
373{ 365{
374 struct rpc_clnt *clnt; 366 struct rpc_clnt *clnt = NULL;
375 int ret = single_open(file, rpc_show_info, NULL); 367 int ret = single_open(file, rpc_show_info, NULL);
376 368
377 if (!ret) { 369 if (!ret) {
378 struct seq_file *m = file->private_data; 370 struct seq_file *m = file->private_data;
379 mutex_lock(&inode->i_mutex); 371
380 clnt = RPC_I(inode)->private; 372 spin_lock(&file->f_path.dentry->d_lock);
381 if (clnt) { 373 if (!d_unhashed(file->f_path.dentry))
382 kref_get(&clnt->cl_kref); 374 clnt = RPC_I(inode)->private;
375 if (clnt != NULL && atomic_inc_not_zero(&clnt->cl_count)) {
376 spin_unlock(&file->f_path.dentry->d_lock);
383 m->private = clnt; 377 m->private = clnt;
384 } else { 378 } else {
379 spin_unlock(&file->f_path.dentry->d_lock);
385 single_release(inode, file); 380 single_release(inode, file);
386 ret = -EINVAL; 381 ret = -EINVAL;
387 } 382 }
388 mutex_unlock(&inode->i_mutex);
389 } 383 }
390 return ret; 384 return ret;
391} 385}
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index b6309db56226..fe9306bf10cc 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -800,7 +800,7 @@ static void xs_udp_data_ready(struct sock *sk, int len)
800 u32 _xid; 800 u32 _xid;
801 __be32 *xp; 801 __be32 *xp;
802 802
803 read_lock(&sk->sk_callback_lock); 803 read_lock_bh(&sk->sk_callback_lock);
804 dprintk("RPC: xs_udp_data_ready...\n"); 804 dprintk("RPC: xs_udp_data_ready...\n");
805 if (!(xprt = xprt_from_sock(sk))) 805 if (!(xprt = xprt_from_sock(sk)))
806 goto out; 806 goto out;
@@ -852,7 +852,7 @@ static void xs_udp_data_ready(struct sock *sk, int len)
852 dropit: 852 dropit:
853 skb_free_datagram(sk, skb); 853 skb_free_datagram(sk, skb);
854 out: 854 out:
855 read_unlock(&sk->sk_callback_lock); 855 read_unlock_bh(&sk->sk_callback_lock);
856} 856}
857 857
858static inline void xs_tcp_read_fraghdr(struct rpc_xprt *xprt, struct xdr_skb_reader *desc) 858static inline void xs_tcp_read_fraghdr(struct rpc_xprt *xprt, struct xdr_skb_reader *desc)
@@ -1229,7 +1229,7 @@ static void xs_tcp_data_ready(struct sock *sk, int bytes)
1229 1229
1230 dprintk("RPC: xs_tcp_data_ready...\n"); 1230 dprintk("RPC: xs_tcp_data_ready...\n");
1231 1231
1232 read_lock(&sk->sk_callback_lock); 1232 read_lock_bh(&sk->sk_callback_lock);
1233 if (!(xprt = xprt_from_sock(sk))) 1233 if (!(xprt = xprt_from_sock(sk)))
1234 goto out; 1234 goto out;
1235 if (xprt->shutdown) 1235 if (xprt->shutdown)
@@ -1248,7 +1248,7 @@ static void xs_tcp_data_ready(struct sock *sk, int bytes)
1248 read = tcp_read_sock(sk, &rd_desc, xs_tcp_data_recv); 1248 read = tcp_read_sock(sk, &rd_desc, xs_tcp_data_recv);
1249 } while (read > 0); 1249 } while (read > 0);
1250out: 1250out:
1251 read_unlock(&sk->sk_callback_lock); 1251 read_unlock_bh(&sk->sk_callback_lock);
1252} 1252}
1253 1253
1254/* 1254/*
@@ -1301,7 +1301,7 @@ static void xs_tcp_state_change(struct sock *sk)
1301{ 1301{
1302 struct rpc_xprt *xprt; 1302 struct rpc_xprt *xprt;
1303 1303
1304 read_lock(&sk->sk_callback_lock); 1304 read_lock_bh(&sk->sk_callback_lock);
1305 if (!(xprt = xprt_from_sock(sk))) 1305 if (!(xprt = xprt_from_sock(sk)))
1306 goto out; 1306 goto out;
1307 dprintk("RPC: xs_tcp_state_change client %p...\n", xprt); 1307 dprintk("RPC: xs_tcp_state_change client %p...\n", xprt);
@@ -1313,7 +1313,7 @@ static void xs_tcp_state_change(struct sock *sk)
1313 1313
1314 switch (sk->sk_state) { 1314 switch (sk->sk_state) {
1315 case TCP_ESTABLISHED: 1315 case TCP_ESTABLISHED:
1316 spin_lock_bh(&xprt->transport_lock); 1316 spin_lock(&xprt->transport_lock);
1317 if (!xprt_test_and_set_connected(xprt)) { 1317 if (!xprt_test_and_set_connected(xprt)) {
1318 struct sock_xprt *transport = container_of(xprt, 1318 struct sock_xprt *transport = container_of(xprt,
1319 struct sock_xprt, xprt); 1319 struct sock_xprt, xprt);
@@ -1327,7 +1327,7 @@ static void xs_tcp_state_change(struct sock *sk)
1327 1327
1328 xprt_wake_pending_tasks(xprt, -EAGAIN); 1328 xprt_wake_pending_tasks(xprt, -EAGAIN);
1329 } 1329 }
1330 spin_unlock_bh(&xprt->transport_lock); 1330 spin_unlock(&xprt->transport_lock);
1331 break; 1331 break;
1332 case TCP_FIN_WAIT1: 1332 case TCP_FIN_WAIT1:
1333 /* The client initiated a shutdown of the socket */ 1333 /* The client initiated a shutdown of the socket */
@@ -1365,7 +1365,7 @@ static void xs_tcp_state_change(struct sock *sk)
1365 xs_sock_mark_closed(xprt); 1365 xs_sock_mark_closed(xprt);
1366 } 1366 }
1367 out: 1367 out:
1368 read_unlock(&sk->sk_callback_lock); 1368 read_unlock_bh(&sk->sk_callback_lock);
1369} 1369}
1370 1370
1371/** 1371/**
@@ -1376,7 +1376,7 @@ static void xs_error_report(struct sock *sk)
1376{ 1376{
1377 struct rpc_xprt *xprt; 1377 struct rpc_xprt *xprt;
1378 1378
1379 read_lock(&sk->sk_callback_lock); 1379 read_lock_bh(&sk->sk_callback_lock);
1380 if (!(xprt = xprt_from_sock(sk))) 1380 if (!(xprt = xprt_from_sock(sk)))
1381 goto out; 1381 goto out;
1382 dprintk("RPC: %s client %p...\n" 1382 dprintk("RPC: %s client %p...\n"
@@ -1384,7 +1384,7 @@ static void xs_error_report(struct sock *sk)
1384 __func__, xprt, sk->sk_err); 1384 __func__, xprt, sk->sk_err);
1385 xprt_wake_pending_tasks(xprt, -EAGAIN); 1385 xprt_wake_pending_tasks(xprt, -EAGAIN);
1386out: 1386out:
1387 read_unlock(&sk->sk_callback_lock); 1387 read_unlock_bh(&sk->sk_callback_lock);
1388} 1388}
1389 1389
1390static void xs_write_space(struct sock *sk) 1390static void xs_write_space(struct sock *sk)
@@ -1416,13 +1416,13 @@ static void xs_write_space(struct sock *sk)
1416 */ 1416 */
1417static void xs_udp_write_space(struct sock *sk) 1417static void xs_udp_write_space(struct sock *sk)
1418{ 1418{
1419 read_lock(&sk->sk_callback_lock); 1419 read_lock_bh(&sk->sk_callback_lock);
1420 1420
1421 /* from net/core/sock.c:sock_def_write_space */ 1421 /* from net/core/sock.c:sock_def_write_space */
1422 if (sock_writeable(sk)) 1422 if (sock_writeable(sk))
1423 xs_write_space(sk); 1423 xs_write_space(sk);
1424 1424
1425 read_unlock(&sk->sk_callback_lock); 1425 read_unlock_bh(&sk->sk_callback_lock);
1426} 1426}
1427 1427
1428/** 1428/**
@@ -1437,13 +1437,13 @@ static void xs_udp_write_space(struct sock *sk)
1437 */ 1437 */
1438static void xs_tcp_write_space(struct sock *sk) 1438static void xs_tcp_write_space(struct sock *sk)
1439{ 1439{
1440 read_lock(&sk->sk_callback_lock); 1440 read_lock_bh(&sk->sk_callback_lock);
1441 1441
1442 /* from net/core/stream.c:sk_stream_write_space */ 1442 /* from net/core/stream.c:sk_stream_write_space */
1443 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) 1443 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
1444 xs_write_space(sk); 1444 xs_write_space(sk);
1445 1445
1446 read_unlock(&sk->sk_callback_lock); 1446 read_unlock_bh(&sk->sk_callback_lock);
1447} 1447}
1448 1448
1449static void xs_udp_do_set_buffer_size(struct rpc_xprt *xprt) 1449static void xs_udp_do_set_buffer_size(struct rpc_xprt *xprt)
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 4414a18c63b4..0b39b2451ea5 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -692,6 +692,7 @@ static int unix_autobind(struct socket *sock)
692 static u32 ordernum = 1; 692 static u32 ordernum = 1;
693 struct unix_address *addr; 693 struct unix_address *addr;
694 int err; 694 int err;
695 unsigned int retries = 0;
695 696
696 mutex_lock(&u->readlock); 697 mutex_lock(&u->readlock);
697 698
@@ -717,9 +718,17 @@ retry:
717 if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type, 718 if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type,
718 addr->hash)) { 719 addr->hash)) {
719 spin_unlock(&unix_table_lock); 720 spin_unlock(&unix_table_lock);
720 /* Sanity yield. It is unusual case, but yet... */ 721 /*
721 if (!(ordernum&0xFF)) 722 * __unix_find_socket_byname() may take long time if many names
722 yield(); 723 * are already in use.
724 */
725 cond_resched();
726 /* Give up if all names seems to be in use. */
727 if (retries++ == 0xFFFFF) {
728 err = -ENOSPC;
729 kfree(addr);
730 goto out;
731 }
723 goto retry; 732 goto retry;
724 } 733 }
725 addr->hash ^= sk->sk_type; 734 addr->hash ^= sk->sk_type;
diff --git a/net/wireless/core.c b/net/wireless/core.c
index 541e2fff5e9c..d6d046b9f6f2 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -475,12 +475,10 @@ int wiphy_register(struct wiphy *wiphy)
475 mutex_lock(&cfg80211_mutex); 475 mutex_lock(&cfg80211_mutex);
476 476
477 res = device_add(&rdev->wiphy.dev); 477 res = device_add(&rdev->wiphy.dev);
478 if (res) 478 if (res) {
479 goto out_unlock; 479 mutex_unlock(&cfg80211_mutex);
480 480 return res;
481 res = rfkill_register(rdev->rfkill); 481 }
482 if (res)
483 goto out_rm_dev;
484 482
485 /* set up regulatory info */ 483 /* set up regulatory info */
486 wiphy_update_regulatory(wiphy, NL80211_REGDOM_SET_BY_CORE); 484 wiphy_update_regulatory(wiphy, NL80211_REGDOM_SET_BY_CORE);
@@ -509,13 +507,18 @@ int wiphy_register(struct wiphy *wiphy)
509 cfg80211_debugfs_rdev_add(rdev); 507 cfg80211_debugfs_rdev_add(rdev);
510 mutex_unlock(&cfg80211_mutex); 508 mutex_unlock(&cfg80211_mutex);
511 509
510 /*
511 * due to a locking dependency this has to be outside of the
512 * cfg80211_mutex lock
513 */
514 res = rfkill_register(rdev->rfkill);
515 if (res)
516 goto out_rm_dev;
517
512 return 0; 518 return 0;
513 519
514out_rm_dev: 520out_rm_dev:
515 device_del(&rdev->wiphy.dev); 521 device_del(&rdev->wiphy.dev);
516
517out_unlock:
518 mutex_unlock(&cfg80211_mutex);
519 return res; 522 return res;
520} 523}
521EXPORT_SYMBOL(wiphy_register); 524EXPORT_SYMBOL(wiphy_register);
diff --git a/net/wireless/debugfs.c b/net/wireless/debugfs.c
index a4991a3efec0..39765bcfb472 100644
--- a/net/wireless/debugfs.c
+++ b/net/wireless/debugfs.c
@@ -34,6 +34,7 @@ static ssize_t name## _read(struct file *file, char __user *userbuf, \
34static const struct file_operations name## _ops = { \ 34static const struct file_operations name## _ops = { \
35 .read = name## _read, \ 35 .read = name## _read, \
36 .open = cfg80211_open_file_generic, \ 36 .open = cfg80211_open_file_generic, \
37 .llseek = generic_file_llseek, \
37}; 38};
38 39
39DEBUGFS_READONLY_FILE(rts_threshold, 20, "%d", 40DEBUGFS_READONLY_FILE(rts_threshold, 20, "%d",
@@ -102,6 +103,7 @@ static ssize_t ht40allow_map_read(struct file *file,
102static const struct file_operations ht40allow_map_ops = { 103static const struct file_operations ht40allow_map_ops = {
103 .read = ht40allow_map_read, 104 .read = ht40allow_map_read,
104 .open = cfg80211_open_file_generic, 105 .open = cfg80211_open_file_generic,
106 .llseek = default_llseek,
105}; 107};
106 108
107#define DEBUGFS_ADD(name) \ 109#define DEBUGFS_ADD(name) \
diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c
index bb5e0a5ecfa1..7e5c3a45f811 100644
--- a/net/wireless/wext-compat.c
+++ b/net/wireless/wext-compat.c
@@ -1420,6 +1420,9 @@ int cfg80211_wext_giwessid(struct net_device *dev,
1420{ 1420{
1421 struct wireless_dev *wdev = dev->ieee80211_ptr; 1421 struct wireless_dev *wdev = dev->ieee80211_ptr;
1422 1422
1423 data->flags = 0;
1424 data->length = 0;
1425
1423 switch (wdev->iftype) { 1426 switch (wdev->iftype) {
1424 case NL80211_IFTYPE_ADHOC: 1427 case NL80211_IFTYPE_ADHOC:
1425 return cfg80211_ibss_wext_giwessid(dev, info, data, ssid); 1428 return cfg80211_ibss_wext_giwessid(dev, info, data, ssid);
diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c
index 0ef17bc42bac..8f5116f5af19 100644
--- a/net/wireless/wext-core.c
+++ b/net/wireless/wext-core.c
@@ -782,6 +782,22 @@ static int ioctl_standard_iw_point(struct iw_point *iwp, unsigned int cmd,
782 } 782 }
783 } 783 }
784 784
785 if (IW_IS_GET(cmd) && !(descr->flags & IW_DESCR_FLAG_NOMAX)) {
786 /*
787 * If this is a GET, but not NOMAX, it means that the extra
788 * data is not bounded by userspace, but by max_tokens. Thus
789 * set the length to max_tokens. This matches the extra data
790 * allocation.
791 * The driver should fill it with the number of tokens it
792 * provided, and it may check iwp->length rather than having
793 * knowledge of max_tokens. If the driver doesn't change the
794 * iwp->length, this ioctl just copies back max_token tokens
795 * filled with zeroes. Hopefully the driver isn't claiming
796 * them to be valid data.
797 */
798 iwp->length = descr->max_tokens;
799 }
800
785 err = handler(dev, info, (union iwreq_data *) iwp, extra); 801 err = handler(dev, info, (union iwreq_data *) iwp, extra);
786 802
787 iwp->length += essid_compat; 803 iwp->length += essid_compat;
diff --git a/net/wireless/wext-priv.c b/net/wireless/wext-priv.c
index 3feb28e41c53..674d426a9d24 100644
--- a/net/wireless/wext-priv.c
+++ b/net/wireless/wext-priv.c
@@ -152,7 +152,7 @@ static int ioctl_private_iw_point(struct iw_point *iwp, unsigned int cmd,
152 } else if (!iwp->pointer) 152 } else if (!iwp->pointer)
153 return -EFAULT; 153 return -EFAULT;
154 154
155 extra = kmalloc(extra_size, GFP_KERNEL); 155 extra = kzalloc(extra_size, GFP_KERNEL);
156 if (!extra) 156 if (!extra)
157 return -ENOMEM; 157 return -ENOMEM;
158 158
diff --git a/net/x25/Kconfig b/net/x25/Kconfig
index e6759c9660bb..2196e55e4f61 100644
--- a/net/x25/Kconfig
+++ b/net/x25/Kconfig
@@ -5,6 +5,7 @@
5config X25 5config X25
6 tristate "CCITT X.25 Packet Layer (EXPERIMENTAL)" 6 tristate "CCITT X.25 Packet Layer (EXPERIMENTAL)"
7 depends on EXPERIMENTAL 7 depends on EXPERIMENTAL
8 depends on BKL # should be fixable
8 ---help--- 9 ---help---
9 X.25 is a set of standardized network protocols, similar in scope to 10 X.25 is a set of standardized network protocols, similar in scope to
10 frame relay; the one physical line from your box to the X.25 network 11 frame relay; the one physical line from your box to the X.25 network
diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c
index a3cca0a94346..64f2ae1fdc15 100644
--- a/net/xfrm/xfrm_output.c
+++ b/net/xfrm/xfrm_output.c
@@ -101,7 +101,7 @@ resume:
101 err = -EHOSTUNREACH; 101 err = -EHOSTUNREACH;
102 goto error_nolock; 102 goto error_nolock;
103 } 103 }
104 skb_dst_set_noref(skb, dst); 104 skb_dst_set(skb, dst_clone(dst));
105 x = dst->xfrm; 105 x = dst->xfrm;
106 } while (x && !(x->outer_mode->flags & XFRM_MODE_FLAG_TUNNEL)); 106 } while (x && !(x->outer_mode->flags & XFRM_MODE_FLAG_TUNNEL));
107 107
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 2b3ed7ad4933..cbab6e1a8c9c 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -1175,9 +1175,8 @@ xfrm_tmpl_resolve_one(struct xfrm_policy *policy, struct flowi *fl,
1175 tmpl->mode == XFRM_MODE_BEET) { 1175 tmpl->mode == XFRM_MODE_BEET) {
1176 remote = &tmpl->id.daddr; 1176 remote = &tmpl->id.daddr;
1177 local = &tmpl->saddr; 1177 local = &tmpl->saddr;
1178 family = tmpl->encap_family; 1178 if (xfrm_addr_any(local, tmpl->encap_family)) {
1179 if (xfrm_addr_any(local, family)) { 1179 error = xfrm_get_saddr(net, &tmp, remote, tmpl->encap_family);
1180 error = xfrm_get_saddr(net, &tmp, remote, family);
1181 if (error) 1180 if (error)
1182 goto fail; 1181 goto fail;
1183 local = &tmp; 1182 local = &tmp;
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 5208b12fbfb4..eb96ce52f178 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -656,15 +656,23 @@ void xfrm_sad_getinfo(struct net *net, struct xfrmk_sadinfo *si)
656EXPORT_SYMBOL(xfrm_sad_getinfo); 656EXPORT_SYMBOL(xfrm_sad_getinfo);
657 657
658static int 658static int
659xfrm_init_tempsel(struct xfrm_state *x, struct flowi *fl, 659xfrm_init_tempstate(struct xfrm_state *x, struct flowi *fl,
660 struct xfrm_tmpl *tmpl, 660 struct xfrm_tmpl *tmpl,
661 xfrm_address_t *daddr, xfrm_address_t *saddr, 661 xfrm_address_t *daddr, xfrm_address_t *saddr,
662 unsigned short family) 662 unsigned short family)
663{ 663{
664 struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family); 664 struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family);
665 if (!afinfo) 665 if (!afinfo)
666 return -1; 666 return -1;
667 afinfo->init_tempsel(x, fl, tmpl, daddr, saddr); 667 afinfo->init_tempsel(&x->sel, fl);
668
669 if (family != tmpl->encap_family) {
670 xfrm_state_put_afinfo(afinfo);
671 afinfo = xfrm_state_get_afinfo(tmpl->encap_family);
672 if (!afinfo)
673 return -1;
674 }
675 afinfo->init_temprop(x, tmpl, daddr, saddr);
668 xfrm_state_put_afinfo(afinfo); 676 xfrm_state_put_afinfo(afinfo);
669 return 0; 677 return 0;
670} 678}
@@ -790,37 +798,38 @@ xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr,
790 int error = 0; 798 int error = 0;
791 struct xfrm_state *best = NULL; 799 struct xfrm_state *best = NULL;
792 u32 mark = pol->mark.v & pol->mark.m; 800 u32 mark = pol->mark.v & pol->mark.m;
801 unsigned short encap_family = tmpl->encap_family;
793 802
794 to_put = NULL; 803 to_put = NULL;
795 804
796 spin_lock_bh(&xfrm_state_lock); 805 spin_lock_bh(&xfrm_state_lock);
797 h = xfrm_dst_hash(net, daddr, saddr, tmpl->reqid, family); 806 h = xfrm_dst_hash(net, daddr, saddr, tmpl->reqid, encap_family);
798 hlist_for_each_entry(x, entry, net->xfrm.state_bydst+h, bydst) { 807 hlist_for_each_entry(x, entry, net->xfrm.state_bydst+h, bydst) {
799 if (x->props.family == family && 808 if (x->props.family == encap_family &&
800 x->props.reqid == tmpl->reqid && 809 x->props.reqid == tmpl->reqid &&
801 (mark & x->mark.m) == x->mark.v && 810 (mark & x->mark.m) == x->mark.v &&
802 !(x->props.flags & XFRM_STATE_WILDRECV) && 811 !(x->props.flags & XFRM_STATE_WILDRECV) &&
803 xfrm_state_addr_check(x, daddr, saddr, family) && 812 xfrm_state_addr_check(x, daddr, saddr, encap_family) &&
804 tmpl->mode == x->props.mode && 813 tmpl->mode == x->props.mode &&
805 tmpl->id.proto == x->id.proto && 814 tmpl->id.proto == x->id.proto &&
806 (tmpl->id.spi == x->id.spi || !tmpl->id.spi)) 815 (tmpl->id.spi == x->id.spi || !tmpl->id.spi))
807 xfrm_state_look_at(pol, x, fl, family, daddr, saddr, 816 xfrm_state_look_at(pol, x, fl, encap_family, daddr, saddr,
808 &best, &acquire_in_progress, &error); 817 &best, &acquire_in_progress, &error);
809 } 818 }
810 if (best) 819 if (best)
811 goto found; 820 goto found;
812 821
813 h_wildcard = xfrm_dst_hash(net, daddr, &saddr_wildcard, tmpl->reqid, family); 822 h_wildcard = xfrm_dst_hash(net, daddr, &saddr_wildcard, tmpl->reqid, encap_family);
814 hlist_for_each_entry(x, entry, net->xfrm.state_bydst+h_wildcard, bydst) { 823 hlist_for_each_entry(x, entry, net->xfrm.state_bydst+h_wildcard, bydst) {
815 if (x->props.family == family && 824 if (x->props.family == encap_family &&
816 x->props.reqid == tmpl->reqid && 825 x->props.reqid == tmpl->reqid &&
817 (mark & x->mark.m) == x->mark.v && 826 (mark & x->mark.m) == x->mark.v &&
818 !(x->props.flags & XFRM_STATE_WILDRECV) && 827 !(x->props.flags & XFRM_STATE_WILDRECV) &&
819 xfrm_state_addr_check(x, daddr, saddr, family) && 828 xfrm_state_addr_check(x, daddr, saddr, encap_family) &&
820 tmpl->mode == x->props.mode && 829 tmpl->mode == x->props.mode &&
821 tmpl->id.proto == x->id.proto && 830 tmpl->id.proto == x->id.proto &&
822 (tmpl->id.spi == x->id.spi || !tmpl->id.spi)) 831 (tmpl->id.spi == x->id.spi || !tmpl->id.spi))
823 xfrm_state_look_at(pol, x, fl, family, daddr, saddr, 832 xfrm_state_look_at(pol, x, fl, encap_family, daddr, saddr,
824 &best, &acquire_in_progress, &error); 833 &best, &acquire_in_progress, &error);
825 } 834 }
826 835
@@ -829,7 +838,7 @@ found:
829 if (!x && !error && !acquire_in_progress) { 838 if (!x && !error && !acquire_in_progress) {
830 if (tmpl->id.spi && 839 if (tmpl->id.spi &&
831 (x0 = __xfrm_state_lookup(net, mark, daddr, tmpl->id.spi, 840 (x0 = __xfrm_state_lookup(net, mark, daddr, tmpl->id.spi,
832 tmpl->id.proto, family)) != NULL) { 841 tmpl->id.proto, encap_family)) != NULL) {
833 to_put = x0; 842 to_put = x0;
834 error = -EEXIST; 843 error = -EEXIST;
835 goto out; 844 goto out;
@@ -839,9 +848,9 @@ found:
839 error = -ENOMEM; 848 error = -ENOMEM;
840 goto out; 849 goto out;
841 } 850 }
842 /* Initialize temporary selector matching only 851 /* Initialize temporary state matching only
843 * to current session. */ 852 * to current session. */
844 xfrm_init_tempsel(x, fl, tmpl, daddr, saddr, family); 853 xfrm_init_tempstate(x, fl, tmpl, daddr, saddr, family);
845 memcpy(&x->mark, &pol->mark, sizeof(x->mark)); 854 memcpy(&x->mark, &pol->mark, sizeof(x->mark));
846 855
847 error = security_xfrm_state_alloc_acquire(x, pol->security, fl->secid); 856 error = security_xfrm_state_alloc_acquire(x, pol->security, fl->secid);
@@ -856,10 +865,10 @@ found:
856 x->km.state = XFRM_STATE_ACQ; 865 x->km.state = XFRM_STATE_ACQ;
857 list_add(&x->km.all, &net->xfrm.state_all); 866 list_add(&x->km.all, &net->xfrm.state_all);
858 hlist_add_head(&x->bydst, net->xfrm.state_bydst+h); 867 hlist_add_head(&x->bydst, net->xfrm.state_bydst+h);
859 h = xfrm_src_hash(net, daddr, saddr, family); 868 h = xfrm_src_hash(net, daddr, saddr, encap_family);
860 hlist_add_head(&x->bysrc, net->xfrm.state_bysrc+h); 869 hlist_add_head(&x->bysrc, net->xfrm.state_bysrc+h);
861 if (x->id.spi) { 870 if (x->id.spi) {
862 h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, family); 871 h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, encap_family);
863 hlist_add_head(&x->byspi, net->xfrm.state_byspi+h); 872 hlist_add_head(&x->byspi, net->xfrm.state_byspi+h);
864 } 873 }
865 x->lft.hard_add_expires_seconds = net->xfrm.sysctl_acq_expires; 874 x->lft.hard_add_expires_seconds = net->xfrm.sysctl_acq_expires;
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index b14ed4b1f27c..8bae6b22c846 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -1801,7 +1801,7 @@ static int xfrm_add_sa_expire(struct sk_buff *skb, struct nlmsghdr *nlh,
1801 struct xfrm_user_expire *ue = nlmsg_data(nlh); 1801 struct xfrm_user_expire *ue = nlmsg_data(nlh);
1802 struct xfrm_usersa_info *p = &ue->state; 1802 struct xfrm_usersa_info *p = &ue->state;
1803 struct xfrm_mark m; 1803 struct xfrm_mark m;
1804 u32 mark = xfrm_mark_get(attrs, &m);; 1804 u32 mark = xfrm_mark_get(attrs, &m);
1805 1805
1806 x = xfrm_state_lookup(net, mark, &p->id.daddr, p->id.spi, p->id.proto, p->family); 1806 x = xfrm_state_lookup(net, mark, &p->id.daddr, p->id.spi, p->id.proto, p->family);
1807 1807