author	Linus Torvalds <torvalds@linux-foundation.org>	2019-05-16 19:24:01 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2019-05-16 19:24:01 -0400
commit	1d9d7cbf28a1c2f84f2a0224466f8eb5f0a62ace (patch)
tree	35aa9ec8433f757073f21e1229e97d736b0c5593 /fs/ceph
parent	2c45e7fbc962be1b03f2c2af817a76f5ba810af2 (diff)
parent	00abf69dd24f4444d185982379c5cc3bb7b6d1fc (diff)
Merge tag 'ceph-for-5.2-rc1' of git://github.com/ceph/ceph-client
Pull ceph updates from Ilya Dryomov:
 "On the filesystem side we have:

   - a fix to enforce quotas set above the mount point (Luis Henriques)

   - support for exporting snapshots through NFS (Zheng Yan)

   - proper statx implementation (Jeff Layton). statx flags are mapped
     to MDS caps, with AT_STATX_{DONT,FORCE}_SYNC taken into account.

   - some follow-up dentry name handling fixes, in particular
     elimination of our hand-rolled helper and the switch to __getname()
     as suggested by Al (Jeff Layton)

   - a set of MDS client cleanups in preparation for async MDS requests
     in the future (Jeff Layton)

   - a fix to sync the filesystem before remounting (Jeff Layton)

  On the rbd side, work is on-going on object-map and fast-diff image
  features"

* tag 'ceph-for-5.2-rc1' of git://github.com/ceph/ceph-client: (29 commits)
  ceph: flush dirty inodes before proceeding with remount
  ceph: fix unaligned access in ceph_send_cap_releases
  libceph: make ceph_pr_addr take an struct ceph_entity_addr pointer
  libceph: fix unaligned accesses in ceph_entity_addr handling
  rbd: don't assert on writes to snapshots
  rbd: client_mutex is never nested
  ceph: print inode number in __caps_issued_mask debugging messages
  ceph: just call get_session in __ceph_lookup_mds_session
  ceph: simplify arguments and return semantics of try_get_cap_refs
  ceph: fix comment over ceph_drop_caps_for_unlink
  ceph: move wait for mds request into helper function
  ceph: have ceph_mdsc_do_request call ceph_mdsc_submit_request
  ceph: after an MDS request, do callback and completions
  ceph: use pathlen values returned by set_request_path_attr
  ceph: use __getname/__putname in ceph_mdsc_build_path
  ceph: use ceph_mdsc_build_path instead of clone_dentry_name
  ceph: fix potential use-after-free in ceph_mdsc_build_path
  ceph: dump granular cap info in "caps" debugfs file
  ceph: make iterate_session_caps a public symbol
  ceph: fix NULL pointer deref when debugging is enabled
  ...
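The statx item is visible in fs/ceph/inode.c below: requested fields are translated into the minimal set of MDS caps, and the AT_STATX_* hints decide whether the MDS is consulted at all. A minimal userspace sketch of how a caller exercises this (assumes glibc 2.28+ for the statx() wrapper; illustrative, not part of this series):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/stat.h>

    int main(int argc, char **argv)
    {
        const char *path = argc > 1 ? argv[1] : ".";
        struct statx stx;

        /* AT_STATX_DONT_SYNC: let cached caps answer if possible;
         * AT_STATX_FORCE_SYNC would force an MDS round trip instead. */
        if (statx(AT_FDCWD, path, AT_STATX_DONT_SYNC,
                  STATX_SIZE | STATX_MTIME, &stx) == -1) {
            perror("statx");
            return 1;
        }
        printf("size=%llu\n", (unsigned long long)stx.stx_size);
        return 0;
    }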
Diffstat (limited to 'fs/ceph')
-rw-r--r--  fs/ceph/caps.c         93
-rw-r--r--  fs/ceph/debugfs.c      40
-rw-r--r--  fs/ceph/export.c      356
-rw-r--r--  fs/ceph/file.c          2
-rw-r--r--  fs/ceph/inode.c        85
-rw-r--r--  fs/ceph/locks.c        13
-rw-r--r--  fs/ceph/mds_client.c  205
-rw-r--r--  fs/ceph/mds_client.h   33
-rw-r--r--  fs/ceph/mdsmap.c        2
-rw-r--r--  fs/ceph/quota.c       177
-rw-r--r--  fs/ceph/super.c         7
-rw-r--r--  fs/ceph/super.h         2
12 files changed, 751 insertions, 264 deletions
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 36a8dc699448..72f8e1311392 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -892,8 +892,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
 	int have = ci->i_snap_caps;
 
 	if ((have & mask) == mask) {
-		dout("__ceph_caps_issued_mask %p snap issued %s"
-		     " (mask %s)\n", &ci->vfs_inode,
+		dout("__ceph_caps_issued_mask ino 0x%lx snap issued %s"
+		     " (mask %s)\n", ci->vfs_inode.i_ino,
 		     ceph_cap_string(have),
 		     ceph_cap_string(mask));
 		return 1;
@@ -904,8 +904,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
 		if (!__cap_is_valid(cap))
 			continue;
 		if ((cap->issued & mask) == mask) {
-			dout("__ceph_caps_issued_mask %p cap %p issued %s"
-			     " (mask %s)\n", &ci->vfs_inode, cap,
+			dout("__ceph_caps_issued_mask ino 0x%lx cap %p issued %s"
+			     " (mask %s)\n", ci->vfs_inode.i_ino, cap,
 			     ceph_cap_string(cap->issued),
 			     ceph_cap_string(mask));
 			if (touch)
@@ -916,8 +916,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
 		/* does a combination of caps satisfy mask? */
 		have |= cap->issued;
 		if ((have & mask) == mask) {
-			dout("__ceph_caps_issued_mask %p combo issued %s"
-			     " (mask %s)\n", &ci->vfs_inode,
+			dout("__ceph_caps_issued_mask ino 0x%lx combo issued %s"
+			     " (mask %s)\n", ci->vfs_inode.i_ino,
 			     ceph_cap_string(cap->issued),
 			     ceph_cap_string(mask));
 			if (touch) {
@@ -2257,8 +2257,6 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 	if (datasync)
 		goto out;
 
-	inode_lock(inode);
-
 	dirty = try_flush_caps(inode, &flush_tid);
 	dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
 
@@ -2273,7 +2271,6 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 		ret = wait_event_interruptible(ci->i_cap_wq,
 					       caps_are_flushed(inode, flush_tid));
 	}
-	inode_unlock(inode);
 out:
 	dout("fsync %p%s result=%d\n", inode, datasync ? " datasync" : "", ret);
 	return ret;
@@ -2528,9 +2525,14 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got,
  * to (when applicable), and check against max_size here as well.
  * Note that caller is responsible for ensuring max_size increases are
  * requested from the MDS.
+ *
+ * Returns 0 if caps were not able to be acquired (yet), a 1 if they were,
+ * or a negative error code.
+ *
+ * FIXME: how does a 0 return differ from -EAGAIN?
  */
 static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
-			    loff_t endoff, bool nonblock, int *got, int *err)
+			    loff_t endoff, bool nonblock, int *got)
 {
 	struct inode *inode = &ci->vfs_inode;
 	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
@@ -2550,8 +2552,7 @@ again:
 	if ((file_wanted & need) != need) {
 		dout("try_get_cap_refs need %s file_wanted %s, EBADF\n",
 		     ceph_cap_string(need), ceph_cap_string(file_wanted));
-		*err = -EBADF;
-		ret = 1;
+		ret = -EBADF;
 		goto out_unlock;
 	}
 
@@ -2572,10 +2573,8 @@ again:
 	if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) {
 		dout("get_cap_refs %p endoff %llu > maxsize %llu\n",
 		     inode, endoff, ci->i_max_size);
-		if (endoff > ci->i_requested_max_size) {
-			*err = -EAGAIN;
-			ret = 1;
-		}
+		if (endoff > ci->i_requested_max_size)
+			ret = -EAGAIN;
 		goto out_unlock;
 	}
 	/*
@@ -2610,8 +2609,7 @@ again:
 		 * task isn't in TASK_RUNNING state
 		 */
 		if (nonblock) {
-			*err = -EAGAIN;
-			ret = 1;
+			ret = -EAGAIN;
 			goto out_unlock;
 		}
 
@@ -2640,8 +2638,7 @@ again:
 		if (session_readonly) {
 			dout("get_cap_refs %p needed %s but mds%d readonly\n",
 			     inode, ceph_cap_string(need), ci->i_auth_cap->mds);
-			*err = -EROFS;
-			ret = 1;
+			ret = -EROFS;
 			goto out_unlock;
 		}
 
@@ -2650,16 +2647,14 @@ again:
 			if (READ_ONCE(mdsc->fsc->mount_state) ==
 			    CEPH_MOUNT_SHUTDOWN) {
 				dout("get_cap_refs %p forced umount\n", inode);
-				*err = -EIO;
-				ret = 1;
+				ret = -EIO;
 				goto out_unlock;
 			}
 			mds_wanted = __ceph_caps_mds_wanted(ci, false);
 			if (need & ~(mds_wanted & need)) {
 				dout("get_cap_refs %p caps were dropped"
 				     " (session killed?)\n", inode);
-				*err = -ESTALE;
-				ret = 1;
+				ret = -ESTALE;
 				goto out_unlock;
 			}
 			if (!(file_wanted & ~mds_wanted))
@@ -2710,7 +2705,7 @@ static void check_max_size(struct inode *inode, loff_t endoff)
 int ceph_try_get_caps(struct ceph_inode_info *ci, int need, int want,
 		      bool nonblock, int *got)
 {
-	int ret, err = 0;
+	int ret;
 
 	BUG_ON(need & ~CEPH_CAP_FILE_RD);
 	BUG_ON(want & ~(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO|CEPH_CAP_FILE_SHARED));
@@ -2718,15 +2713,8 @@ int ceph_try_get_caps(struct ceph_inode_info *ci, int need, int want,
 	if (ret < 0)
 		return ret;
 
-	ret = try_get_cap_refs(ci, need, want, 0, nonblock, got, &err);
-	if (ret) {
-		if (err == -EAGAIN) {
-			ret = 0;
-		} else if (err < 0) {
-			ret = err;
-		}
-	}
-	return ret;
+	ret = try_get_cap_refs(ci, need, want, 0, nonblock, got);
+	return ret == -EAGAIN ? 0 : ret;
 }
 
 /*
@@ -2737,7 +2725,7 @@ int ceph_try_get_caps(struct ceph_inode_info *ci, int need, int want,
 int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
 		  loff_t endoff, int *got, struct page **pinned_page)
 {
-	int _got, ret, err = 0;
+	int _got, ret;
 
 	ret = ceph_pool_perm_check(ci, need);
 	if (ret < 0)
@@ -2747,21 +2735,19 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
 		if (endoff > 0)
 			check_max_size(&ci->vfs_inode, endoff);
 
-		err = 0;
 		_got = 0;
 		ret = try_get_cap_refs(ci, need, want, endoff,
-				       false, &_got, &err);
-		if (ret) {
-			if (err == -EAGAIN)
-				continue;
-			if (err < 0)
-				ret = err;
-		} else {
+				       false, &_got);
+		if (ret == -EAGAIN) {
+			continue;
+		} else if (!ret) {
+			int err;
+
 			DEFINE_WAIT_FUNC(wait, woken_wake_function);
 			add_wait_queue(&ci->i_cap_wq, &wait);
 
-			while (!try_get_cap_refs(ci, need, want, endoff,
-						 true, &_got, &err)) {
+			while (!(err = try_get_cap_refs(ci, need, want, endoff,
+							true, &_got))) {
 				if (signal_pending(current)) {
 					ret = -ERESTARTSYS;
 					break;
@@ -2770,19 +2756,14 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
 			}
 
 			remove_wait_queue(&ci->i_cap_wq, &wait);
-
 			if (err == -EAGAIN)
 				continue;
-			if (err < 0)
-				ret = err;
 		}
-		if (ret < 0) {
-			if (err == -ESTALE) {
-				/* session was killed, try renew caps */
-				ret = ceph_renew_caps(&ci->vfs_inode);
-				if (ret == 0)
-					continue;
-			}
+		if (ret == -ESTALE) {
+			/* session was killed, try renew caps */
+			ret = ceph_renew_caps(&ci->vfs_inode);
+			if (ret == 0)
+				continue;
 			return ret;
 		}
 
@@ -4099,7 +4080,7 @@ void ceph_put_fmode(struct ceph_inode_info *ci, int fmode)
 }
 
 /*
- * For a soon-to-be unlinked file, drop the AUTH_RDCACHE caps. If it
+ * For a soon-to-be unlinked file, drop the LINK caps. If it
  * looks like the link count will hit 0, drop any other caps (other
  * than PIN) we don't specifically want (due to the file still being
  * open).
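The try_get_cap_refs() rework above collapses the old two-output protocol (a "finished?" return plus an *err out-parameter) into one standard kernel-style value: a negative errno, 0 for "caps not available yet", or 1 for success; the new FIXME notes that the 0 vs. -EAGAIN distinction is still murky. A toy userspace sketch of the two calling conventions (illustrative only, not kernel code):

    #include <errno.h>
    #include <stdio.h>

    /* old shape: return says "done?", *err carries the real outcome */
    static int try_old(int *got, int *err) { *err = -EAGAIN; return 1; }

    /* new shape: one value: <0 = errno, 0 = not yet, 1 = got the refs */
    static int try_new(int *got) { return -EAGAIN; }

    int main(void)
    {
        int got = 0, err = 0;
        int ret = try_old(&got, &err);

        if (ret && err < 0)     /* caller had to merge two outputs... */
            ret = err;
        printf("old: %d\n", ret);

        ret = try_new(&got);    /* ...now it tests a single value */
        printf("new: %d\n", ret);
        return 0;
    }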
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 98365e74cb4a..b3fc5fe26a1a 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -37,7 +37,7 @@ static int mdsmap_show(struct seq_file *s, void *p)
 		struct ceph_entity_addr *addr = &mdsmap->m_info[i].addr;
 		int state = mdsmap->m_info[i].state;
 		seq_printf(s, "\tmds%d\t%s\t(%s)\n", i,
-			   ceph_pr_addr(&addr->in_addr),
+			   ceph_pr_addr(addr),
 			   ceph_mds_state_name(state));
 	}
 	return 0;
@@ -88,7 +88,7 @@ static int mdsc_show(struct seq_file *s, void *p)
 				   req->r_dentry,
 				   path ? path : "");
 			spin_unlock(&req->r_dentry->d_lock);
-			kfree(path);
+			ceph_mdsc_free_path(path, pathlen);
 		} else if (req->r_path1) {
 			seq_printf(s, " #%llx/%s", req->r_ino1.ino,
 				   req->r_path1);
@@ -108,7 +108,7 @@ static int mdsc_show(struct seq_file *s, void *p)
 				   req->r_old_dentry,
 				   path ? path : "");
 			spin_unlock(&req->r_old_dentry->d_lock);
-			kfree(path);
+			ceph_mdsc_free_path(path, pathlen);
 		} else if (req->r_path2 && req->r_op != CEPH_MDS_OP_SYMLINK) {
 			if (req->r_ino2.ino)
 				seq_printf(s, " #%llx/%s", req->r_ino2.ino,
@@ -124,18 +124,48 @@ static int mdsc_show(struct seq_file *s, void *p)
 	return 0;
 }
 
+static int caps_show_cb(struct inode *inode, struct ceph_cap *cap, void *p)
+{
+	struct seq_file *s = p;
+
+	seq_printf(s, "0x%-17lx%-17s%-17s\n", inode->i_ino,
+		   ceph_cap_string(cap->issued),
+		   ceph_cap_string(cap->implemented));
+	return 0;
+}
+
 static int caps_show(struct seq_file *s, void *p)
 {
 	struct ceph_fs_client *fsc = s->private;
-	int total, avail, used, reserved, min;
+	struct ceph_mds_client *mdsc = fsc->mdsc;
+	int total, avail, used, reserved, min, i;
 
 	ceph_reservation_status(fsc, &total, &avail, &used, &reserved, &min);
 	seq_printf(s, "total\t\t%d\n"
 		   "avail\t\t%d\n"
 		   "used\t\t%d\n"
 		   "reserved\t%d\n"
-		   "min\t%d\n",
+		   "min\t\t%d\n\n",
 		   total, avail, used, reserved, min);
+	seq_printf(s, "ino                issued           implemented\n");
+	seq_printf(s, "-----------------------------------------------\n");
+
+	mutex_lock(&mdsc->mutex);
+	for (i = 0; i < mdsc->max_sessions; i++) {
+		struct ceph_mds_session *session;
+
+		session = __ceph_lookup_mds_session(mdsc, i);
+		if (!session)
+			continue;
+		mutex_unlock(&mdsc->mutex);
+		mutex_lock(&session->s_mutex);
+		ceph_iterate_session_caps(session, caps_show_cb, s);
+		mutex_unlock(&session->s_mutex);
+		ceph_put_mds_session(session);
+		mutex_lock(&mdsc->mutex);
+	}
+	mutex_unlock(&mdsc->mutex);
+
 	return 0;
 }
 
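With the session walk above, the per-client "caps" debugfs file grows a per-cap table after the reservation counters, matching the "0x%-17lx%-17s%-17s" format of caps_show_cb(). Illustrative output only (made-up inode numbers and cap strings, not captured from a real mount):

    total           100
    avail           90
    used            10
    reserved        0
    min             1024

    ino                issued           implemented
    -----------------------------------------------
    0x1                pAsLsXsFs        pAsLsXsFs
    0x10000000001      pAsLsXsFscr      pAsLsXsFscr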
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 3c59ad180ef0..d3ef7ee429ec 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -22,18 +22,77 @@ struct ceph_nfs_confh {
 	u64 ino, parent_ino;
 } __attribute__ ((packed));
 
+/*
+ * fh for snapped inode
+ */
+struct ceph_nfs_snapfh {
+	u64 ino;
+	u64 snapid;
+	u64 parent_ino;
+	u32 hash;
+} __attribute__ ((packed));
+
+static int ceph_encode_snapfh(struct inode *inode, u32 *rawfh, int *max_len,
+			      struct inode *parent_inode)
+{
+	const static int snap_handle_length =
+		sizeof(struct ceph_nfs_snapfh) >> 2;
+	struct ceph_nfs_snapfh *sfh = (void *)rawfh;
+	u64 snapid = ceph_snap(inode);
+	int ret;
+	bool no_parent = true;
+
+	if (*max_len < snap_handle_length) {
+		*max_len = snap_handle_length;
+		ret = FILEID_INVALID;
+		goto out;
+	}
+
+	ret = -EINVAL;
+	if (snapid != CEPH_SNAPDIR) {
+		struct inode *dir;
+		struct dentry *dentry = d_find_alias(inode);
+		if (!dentry)
+			goto out;
+
+		rcu_read_lock();
+		dir = d_inode_rcu(dentry->d_parent);
+		if (ceph_snap(dir) != CEPH_SNAPDIR) {
+			sfh->parent_ino = ceph_ino(dir);
+			sfh->hash = ceph_dentry_hash(dir, dentry);
+			no_parent = false;
+		}
+		rcu_read_unlock();
+		dput(dentry);
+	}
+
+	if (no_parent) {
+		if (!S_ISDIR(inode->i_mode))
+			goto out;
+		sfh->parent_ino = sfh->ino;
+		sfh->hash = 0;
+	}
+	sfh->ino = ceph_ino(inode);
+	sfh->snapid = snapid;
+
+	*max_len = snap_handle_length;
+	ret = FILEID_BTRFS_WITH_PARENT;
+out:
+	dout("encode_snapfh %llx.%llx ret=%d\n", ceph_vinop(inode), ret);
+	return ret;
+}
+
 static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len,
 			  struct inode *parent_inode)
 {
+	const static int handle_length =
+		sizeof(struct ceph_nfs_fh) >> 2;
+	const static int connected_handle_length =
+		sizeof(struct ceph_nfs_confh) >> 2;
 	int type;
-	struct ceph_nfs_fh *fh = (void *)rawfh;
-	struct ceph_nfs_confh *cfh = (void *)rawfh;
-	int connected_handle_length = sizeof(*cfh)/4;
-	int handle_length = sizeof(*fh)/4;
 
-	/* don't re-export snaps */
 	if (ceph_snap(inode) != CEPH_NOSNAP)
-		return -EINVAL;
+		return ceph_encode_snapfh(inode, rawfh, max_len, parent_inode);
 
 	if (parent_inode && (*max_len < connected_handle_length)) {
 		*max_len = connected_handle_length;
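Handle lengths in the exportfs API are counted in 4-byte words, which is what the sizeof(...) >> 2 expressions above compute. A quick standalone check of the new snapshot handle's footprint (a mirror struct for illustration; the real layout is ceph_nfs_snapfh above):

    #include <stdint.h>

    struct snapfh_mirror {              /* ino, snapid, parent_ino, hash */
        uint64_t ino;
        uint64_t snapid;
        uint64_t parent_ino;
        uint32_t hash;
    } __attribute__((packed));

    _Static_assert(sizeof(struct snapfh_mirror) == 28, "3*8 + 4 bytes");
    _Static_assert(sizeof(struct snapfh_mirror) >> 2 == 7, "7 u32 words");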
@@ -44,6 +103,7 @@ static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len,
 	}
 
 	if (parent_inode) {
+		struct ceph_nfs_confh *cfh = (void *)rawfh;
 		dout("encode_fh %llx with parent %llx\n",
 		     ceph_ino(inode), ceph_ino(parent_inode));
 		cfh->ino = ceph_ino(inode);
@@ -51,6 +111,7 @@ static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len,
 		*max_len = connected_handle_length;
 		type = FILEID_INO32_GEN_PARENT;
 	} else {
+		struct ceph_nfs_fh *fh = (void *)rawfh;
 		dout("encode_fh %llx\n", ceph_ino(inode));
 		fh->ino = ceph_ino(inode);
 		*max_len = handle_length;
@@ -59,7 +120,7 @@ static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len,
 	return type;
 }
 
-static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino)
+static struct inode *__lookup_inode(struct super_block *sb, u64 ino)
 {
 	struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
 	struct inode *inode;
@@ -81,7 +142,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino)
 		mask = CEPH_STAT_CAP_INODE;
 		if (ceph_security_xattr_wanted(d_inode(sb->s_root)))
 			mask |= CEPH_CAP_XATTR_SHARED;
-		req->r_args.getattr.mask = cpu_to_le32(mask);
+		req->r_args.lookupino.mask = cpu_to_le32(mask);
 
 		req->r_ino1 = vino;
 		req->r_num_caps = 1;
@@ -91,16 +152,114 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino)
 		ihold(inode);
 		ceph_mdsc_put_request(req);
 		if (!inode)
-			return ERR_PTR(-ESTALE);
-		if (inode->i_nlink == 0) {
-			iput(inode);
-			return ERR_PTR(-ESTALE);
-		}
+			return err < 0 ? ERR_PTR(err) : ERR_PTR(-ESTALE);
 	}
+	return inode;
+}
+
+struct inode *ceph_lookup_inode(struct super_block *sb, u64 ino)
+{
+	struct inode *inode = __lookup_inode(sb, ino);
+	if (IS_ERR(inode))
+		return inode;
+	if (inode->i_nlink == 0) {
+		iput(inode);
+		return ERR_PTR(-ESTALE);
+	}
+	return inode;
+}
 
+static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino)
+{
+	struct inode *inode = __lookup_inode(sb, ino);
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
+	if (inode->i_nlink == 0) {
+		iput(inode);
+		return ERR_PTR(-ESTALE);
+	}
 	return d_obtain_alias(inode);
 }
 
+static struct dentry *__snapfh_to_dentry(struct super_block *sb,
+					 struct ceph_nfs_snapfh *sfh,
+					 bool want_parent)
+{
+	struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
+	struct ceph_mds_request *req;
+	struct inode *inode;
+	struct ceph_vino vino;
+	int mask;
+	int err;
+	bool unlinked = false;
+
+	if (want_parent) {
+		vino.ino = sfh->parent_ino;
+		if (sfh->snapid == CEPH_SNAPDIR)
+			vino.snap = CEPH_NOSNAP;
+		else if (sfh->ino == sfh->parent_ino)
+			vino.snap = CEPH_SNAPDIR;
+		else
+			vino.snap = sfh->snapid;
+	} else {
+		vino.ino = sfh->ino;
+		vino.snap = sfh->snapid;
+	}
+	inode = ceph_find_inode(sb, vino);
+	if (inode)
+		return d_obtain_alias(inode);
+
+	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPINO,
+				       USE_ANY_MDS);
+	if (IS_ERR(req))
+		return ERR_CAST(req);
+
+	mask = CEPH_STAT_CAP_INODE;
+	if (ceph_security_xattr_wanted(d_inode(sb->s_root)))
+		mask |= CEPH_CAP_XATTR_SHARED;
+	req->r_args.lookupino.mask = cpu_to_le32(mask);
+	if (vino.snap < CEPH_NOSNAP) {
+		req->r_args.lookupino.snapid = cpu_to_le64(vino.snap);
+		if (!want_parent && sfh->ino != sfh->parent_ino) {
+			req->r_args.lookupino.parent =
+					cpu_to_le64(sfh->parent_ino);
+			req->r_args.lookupino.hash =
+					cpu_to_le32(sfh->hash);
+		}
+	}
+
+	req->r_ino1 = vino;
+	req->r_num_caps = 1;
+	err = ceph_mdsc_do_request(mdsc, NULL, req);
+	inode = req->r_target_inode;
+	if (inode) {
+		if (vino.snap == CEPH_SNAPDIR) {
+			if (inode->i_nlink == 0)
+				unlinked = true;
+			inode = ceph_get_snapdir(inode);
+		} else if (ceph_snap(inode) == vino.snap) {
+			ihold(inode);
+		} else {
+			/* mds does not support lookup snapped inode */
+			err = -EOPNOTSUPP;
+			inode = NULL;
+		}
+	}
+	ceph_mdsc_put_request(req);
+
+	if (want_parent) {
+		dout("snapfh_to_parent %llx.%llx\n err=%d\n",
+		     vino.ino, vino.snap, err);
+	} else {
+		dout("snapfh_to_dentry %llx.%llx parent %llx hash %x err=%d",
+		     vino.ino, vino.snap, sfh->parent_ino, sfh->hash, err);
+	}
+	if (!inode)
+		return ERR_PTR(-ESTALE);
+	/* see comments in ceph_get_parent() */
+	return unlinked ? d_obtain_root(inode) : d_obtain_alias(inode);
+}
+
 /*
  * convert regular fh to dentry
  */
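For reference, the vino that __snapfh_to_dentry() ends up looking up, as derived from the branch at the top of the function:

    want_parent  snapid == CEPH_SNAPDIR  ino == parent_ino      vino looked up
    -----------  ----------------------  -----------------      --------------------------
    false        (any)                   (any)                  {ino, snapid}
    true         yes                     -                      {parent_ino, CEPH_NOSNAP}
    true         no                      yes                    {parent_ino, CEPH_SNAPDIR}
    true         no                      no                     {parent_ino, snapid}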
@@ -110,6 +269,11 @@ static struct dentry *ceph_fh_to_dentry(struct super_block *sb,
 {
 	struct ceph_nfs_fh *fh = (void *)fid->raw;
 
+	if (fh_type == FILEID_BTRFS_WITH_PARENT) {
+		struct ceph_nfs_snapfh *sfh = (void *)fid->raw;
+		return __snapfh_to_dentry(sb, sfh, false);
+	}
+
 	if (fh_type != FILEID_INO32_GEN &&
 	    fh_type != FILEID_INO32_GEN_PARENT)
 		return NULL;
@@ -163,13 +327,49 @@ static struct dentry *__get_parent(struct super_block *sb,
 
 static struct dentry *ceph_get_parent(struct dentry *child)
 {
-	/* don't re-export snaps */
-	if (ceph_snap(d_inode(child)) != CEPH_NOSNAP)
-		return ERR_PTR(-EINVAL);
-
-	dout("get_parent %p ino %llx.%llx\n",
-	     child, ceph_vinop(d_inode(child)));
-	return __get_parent(child->d_sb, child, 0);
+	struct inode *inode = d_inode(child);
+	struct dentry *dn;
+
+	if (ceph_snap(inode) != CEPH_NOSNAP) {
+		struct inode* dir;
+		bool unlinked = false;
+		/* do not support non-directory */
+		if (!d_is_dir(child)) {
+			dn = ERR_PTR(-EINVAL);
+			goto out;
+		}
+		dir = __lookup_inode(inode->i_sb, ceph_ino(inode));
+		if (IS_ERR(dir)) {
+			dn = ERR_CAST(dir);
+			goto out;
+		}
+		/* There can be multiple paths to access snapped inode.
+		 * For simplicity, treat snapdir of head inode as parent */
+		if (ceph_snap(inode) != CEPH_SNAPDIR) {
+			struct inode *snapdir = ceph_get_snapdir(dir);
+			if (dir->i_nlink == 0)
+				unlinked = true;
+			iput(dir);
+			if (IS_ERR(snapdir)) {
+				dn = ERR_CAST(snapdir);
+				goto out;
+			}
+			dir = snapdir;
+		}
+		/* If directory has already been deleted, futher get_parent
+		 * will fail. Do not mark snapdir dentry as disconnected,
+		 * this prevent exportfs from doing futher get_parent. */
+		if (unlinked)
+			dn = d_obtain_root(dir);
+		else
+			dn = d_obtain_alias(dir);
+	} else {
+		dn = __get_parent(child->d_sb, child, 0);
+	}
+out:
+	dout("get_parent %p ino %llx.%llx err=%ld\n",
+	     child, ceph_vinop(inode), (IS_ERR(dn) ? PTR_ERR(dn) : 0));
+	return dn;
 }
 
 /*
@@ -182,6 +382,11 @@ static struct dentry *ceph_fh_to_parent(struct super_block *sb,
 	struct ceph_nfs_confh *cfh = (void *)fid->raw;
 	struct dentry *dentry;
 
+	if (fh_type == FILEID_BTRFS_WITH_PARENT) {
+		struct ceph_nfs_snapfh *sfh = (void *)fid->raw;
+		return __snapfh_to_dentry(sb, sfh, true);
+	}
+
 	if (fh_type != FILEID_INO32_GEN_PARENT)
 		return NULL;
 	if (fh_len < sizeof(*cfh) / 4)
@@ -194,14 +399,115 @@ static struct dentry *ceph_fh_to_parent(struct super_block *sb,
 	return dentry;
 }
 
+static int __get_snap_name(struct dentry *parent, char *name,
+			   struct dentry *child)
+{
+	struct inode *inode = d_inode(child);
+	struct inode *dir = d_inode(parent);
+	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+	struct ceph_mds_request *req = NULL;
+	char *last_name = NULL;
+	unsigned next_offset = 2;
+	int err = -EINVAL;
+
+	if (ceph_ino(inode) != ceph_ino(dir))
+		goto out;
+	if (ceph_snap(inode) == CEPH_SNAPDIR) {
+		if (ceph_snap(dir) == CEPH_NOSNAP) {
+			strcpy(name, fsc->mount_options->snapdir_name);
+			err = 0;
+		}
+		goto out;
+	}
+	if (ceph_snap(dir) != CEPH_SNAPDIR)
+		goto out;
+
+	while (1) {
+		struct ceph_mds_reply_info_parsed *rinfo;
+		struct ceph_mds_reply_dir_entry *rde;
+		int i;
+
+		req = ceph_mdsc_create_request(fsc->mdsc, CEPH_MDS_OP_LSSNAP,
+					       USE_AUTH_MDS);
+		if (IS_ERR(req)) {
+			err = PTR_ERR(req);
+			req = NULL;
+			goto out;
+		}
+		err = ceph_alloc_readdir_reply_buffer(req, inode);
+		if (err)
+			goto out;
+
+		req->r_direct_mode = USE_AUTH_MDS;
+		req->r_readdir_offset = next_offset;
+		req->r_args.readdir.flags =
+				cpu_to_le16(CEPH_READDIR_REPLY_BITFLAGS);
+		if (last_name) {
+			req->r_path2 = last_name;
+			last_name = NULL;
+		}
+
+		req->r_inode = dir;
+		ihold(dir);
+		req->r_dentry = dget(parent);
+
+		inode_lock(dir);
+		err = ceph_mdsc_do_request(fsc->mdsc, NULL, req);
+		inode_unlock(dir);
+
+		if (err < 0)
+			goto out;
+
+		rinfo = &req->r_reply_info;
+		for (i = 0; i < rinfo->dir_nr; i++) {
+			rde = rinfo->dir_entries + i;
+			BUG_ON(!rde->inode.in);
+			if (ceph_snap(inode) ==
+			    le64_to_cpu(rde->inode.in->snapid)) {
+				memcpy(name, rde->name, rde->name_len);
+				name[rde->name_len] = '\0';
+				err = 0;
+				goto out;
+			}
+		}
+
+		if (rinfo->dir_end)
+			break;
+
+		BUG_ON(rinfo->dir_nr <= 0);
+		rde = rinfo->dir_entries + (rinfo->dir_nr - 1);
+		next_offset += rinfo->dir_nr;
+		last_name = kstrndup(rde->name, rde->name_len, GFP_KERNEL);
+		if (!last_name) {
+			err = -ENOMEM;
+			goto out;
+		}
+
+		ceph_mdsc_put_request(req);
+		req = NULL;
+	}
+	err = -ENOENT;
+out:
+	if (req)
+		ceph_mdsc_put_request(req);
+	kfree(last_name);
+	dout("get_snap_name %p ino %llx.%llx err=%d\n",
+	     child, ceph_vinop(inode), err);
+	return err;
+}
+
 static int ceph_get_name(struct dentry *parent, char *name,
 			 struct dentry *child)
 {
 	struct ceph_mds_client *mdsc;
 	struct ceph_mds_request *req;
+	struct inode *inode = d_inode(child);
 	int err;
 
-	mdsc = ceph_inode_to_client(d_inode(child))->mdsc;
+	if (ceph_snap(inode) != CEPH_NOSNAP)
+		return __get_snap_name(parent, name, child);
+
+	mdsc = ceph_inode_to_client(inode)->mdsc;
 	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPNAME,
 				       USE_ANY_MDS);
 	if (IS_ERR(req))
@@ -209,8 +515,8 @@ static int ceph_get_name(struct dentry *parent, char *name,
 
 	inode_lock(d_inode(parent));
 
-	req->r_inode = d_inode(child);
-	ihold(d_inode(child));
+	req->r_inode = inode;
+	ihold(inode);
 	req->r_ino2 = ceph_vino(d_inode(parent));
 	req->r_parent = d_inode(parent);
 	set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
@@ -224,10 +530,10 @@ static int ceph_get_name(struct dentry *parent, char *name,
 		memcpy(name, rinfo->dname, rinfo->dname_len);
 		name[rinfo->dname_len] = 0;
 		dout("get_name %p ino %llx.%llx name %s\n",
-		     child, ceph_vinop(d_inode(child)), name);
+		     child, ceph_vinop(inode), name);
 	} else {
 		dout("get_name %p ino %llx.%llx err %d\n",
-		     child, ceph_vinop(d_inode(child)), err);
+		     child, ceph_vinop(inode), err);
 	}
 
 	ceph_mdsc_put_request(req);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 84725b53ac21..305daf043eb0 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -929,7 +929,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
 
 	dout("sync_direct_%s on file %p %lld~%u snapc %p seq %lld\n",
 	     (write ? "write" : "read"), file, pos, (unsigned)count,
-	     snapc, snapc->seq);
+	     snapc, snapc ? snapc->seq : 0);
 
 	ret = filemap_write_and_wait_range(inode->i_mapping,
 					   pos, pos + count - 1);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 35dae6d5493a..f85355bf49c4 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -2266,43 +2266,72 @@ int ceph_permission(struct inode *inode, int mask)
 	return err;
 }
 
+/* Craft a mask of needed caps given a set of requested statx attrs. */
+static int statx_to_caps(u32 want)
+{
+	int mask = 0;
+
+	if (want & (STATX_MODE|STATX_UID|STATX_GID|STATX_CTIME))
+		mask |= CEPH_CAP_AUTH_SHARED;
+
+	if (want & (STATX_NLINK|STATX_CTIME))
+		mask |= CEPH_CAP_LINK_SHARED;
+
+	if (want & (STATX_ATIME|STATX_MTIME|STATX_CTIME|STATX_SIZE|
+		    STATX_BLOCKS))
+		mask |= CEPH_CAP_FILE_SHARED;
+
+	if (want & (STATX_CTIME))
+		mask |= CEPH_CAP_XATTR_SHARED;
+
+	return mask;
+}
+
 /*
- * Get all attributes.  Hopefully somedata we'll have a statlite()
- * and can limit the fields we require to be accurate.
+ * Get all the attributes. If we have sufficient caps for the requested attrs,
+ * then we can avoid talking to the MDS at all.
  */
 int ceph_getattr(const struct path *path, struct kstat *stat,
 		 u32 request_mask, unsigned int flags)
 {
 	struct inode *inode = d_inode(path->dentry);
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	int err;
+	int err = 0;
 
-	err = ceph_do_getattr(inode, CEPH_STAT_CAP_INODE_ALL, false);
-	if (!err) {
-		generic_fillattr(inode, stat);
-		stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino);
-		if (ceph_snap(inode) == CEPH_NOSNAP)
-			stat->dev = inode->i_sb->s_dev;
-		else
-			stat->dev = ci->i_snapid_map ? ci->i_snapid_map->dev : 0;
-
-		if (S_ISDIR(inode->i_mode)) {
-			if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb),
-						RBYTES))
-				stat->size = ci->i_rbytes;
-			else
-				stat->size = ci->i_files + ci->i_subdirs;
-			stat->blocks = 0;
-			stat->blksize = 65536;
-			/*
-			 * Some applications rely on the number of st_nlink
-			 * value on directories to be either 0 (if unlinked)
-			 * or 2 + number of subdirectories.
-			 */
-			if (stat->nlink == 1)
-				/* '.' + '..' + subdirs */
-				stat->nlink = 1 + 1 + ci->i_subdirs;
-		}
+	/* Skip the getattr altogether if we're asked not to sync */
+	if (!(flags & AT_STATX_DONT_SYNC)) {
+		err = ceph_do_getattr(inode, statx_to_caps(request_mask),
+				      flags & AT_STATX_FORCE_SYNC);
+		if (err)
+			return err;
+	}
+
+	generic_fillattr(inode, stat);
+	stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino);
+	if (ceph_snap(inode) == CEPH_NOSNAP)
+		stat->dev = inode->i_sb->s_dev;
+	else
+		stat->dev = ci->i_snapid_map ? ci->i_snapid_map->dev : 0;
+
+	if (S_ISDIR(inode->i_mode)) {
+		if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb),
+					RBYTES))
+			stat->size = ci->i_rbytes;
+		else
+			stat->size = ci->i_files + ci->i_subdirs;
+		stat->blocks = 0;
+		stat->blksize = 65536;
+		/*
+		 * Some applications rely on the number of st_nlink
+		 * value on directories to be either 0 (if unlinked)
+		 * or 2 + number of subdirectories.
+		 */
+		if (stat->nlink == 1)
+			/* '.' + '..' + subdirs */
+			stat->nlink = 1 + 1 + ci->i_subdirs;
 	}
+
+	/* Mask off any higher bits (e.g. btime) until we have support */
+	stat->result_mask = request_mask & STATX_BASIC_STATS;
 	return err;
 }
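A worked example of the new mapping: a statx() call asking for STATX_SIZE | STATX_MTIME | STATX_NLINK makes statx_to_caps() return

    STATX_SIZE | STATX_MTIME  ->  CEPH_CAP_FILE_SHARED
    STATX_NLINK               ->  CEPH_CAP_LINK_SHARED

so ceph_do_getattr() can be satisfied locally whenever those two shared caps are already issued, where the old code unconditionally demanded CEPH_STAT_CAP_INODE_ALL.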
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index 9dae2ec7e1fa..ac9b53b89365 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -237,15 +237,6 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
 	spin_lock(&ci->i_ceph_lock);
 	if (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) {
 		err = -EIO;
-	} else if (op == CEPH_MDS_OP_SETFILELOCK) {
-		/*
-		 * increasing i_filelock_ref closes race window between
-		 * handling request reply and adding file_lock struct to
-		 * inode. Otherwise, i_auth_cap may get trimmed in the
-		 * window. Caller function will decrease the counter.
-		 */
-		fl->fl_ops = &ceph_fl_lock_ops;
-		atomic_inc(&ci->i_filelock_ref);
 	}
 	spin_unlock(&ci->i_ceph_lock);
 	if (err < 0) {
@@ -299,10 +290,6 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
 	spin_lock(&ci->i_ceph_lock);
 	if (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) {
 		err = -EIO;
-	} else {
-		/* see comment in ceph_lock */
-		fl->fl_ops = &ceph_fl_lock_ops;
-		atomic_inc(&ci->i_filelock_ref);
 	}
 	spin_unlock(&ci->i_ceph_lock);
 	if (err < 0) {
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 9049c2a3e972..959b1bf7c327 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -550,15 +550,9 @@ void ceph_put_mds_session(struct ceph_mds_session *s)
 struct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *mdsc,
 						   int mds)
 {
-	struct ceph_mds_session *session;
-
 	if (mds >= mdsc->max_sessions || !mdsc->sessions[mds])
 		return NULL;
-	session = mdsc->sessions[mds];
-	dout("lookup_mds_session %p %d\n", session,
-	     refcount_read(&session->s_ref));
-	get_session(session);
-	return session;
+	return get_session(mdsc->sessions[mds]);
 }
 
 static bool __have_session(struct ceph_mds_client *mdsc, int mds)
@@ -1284,9 +1278,9 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc,
  *
  * Caller must hold session s_mutex.
  */
-static int iterate_session_caps(struct ceph_mds_session *session,
-				 int (*cb)(struct inode *, struct ceph_cap *,
-					   void *), void *arg)
+int ceph_iterate_session_caps(struct ceph_mds_session *session,
+			      int (*cb)(struct inode *, struct ceph_cap *,
+					void *), void *arg)
 {
 	struct list_head *p;
 	struct ceph_cap *cap;
@@ -1451,7 +1445,7 @@ static void remove_session_caps(struct ceph_mds_session *session)
 	LIST_HEAD(dispose);
 
 	dout("remove_session_caps on %p\n", session);
-	iterate_session_caps(session, remove_session_caps_cb, fsc);
+	ceph_iterate_session_caps(session, remove_session_caps_cb, fsc);
 
 	wake_up_all(&fsc->mdsc->cap_flushing_wq);
 
@@ -1534,8 +1528,8 @@ static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
 static void wake_up_session_caps(struct ceph_mds_session *session, int ev)
 {
 	dout("wake_up_session_caps %p mds%d\n", session, session->s_mds);
-	iterate_session_caps(session, wake_up_session_cb,
-			     (void *)(unsigned long)ev);
+	ceph_iterate_session_caps(session, wake_up_session_cb,
+				  (void *)(unsigned long)ev);
 }
 
 /*
@@ -1768,7 +1762,7 @@ int ceph_trim_caps(struct ceph_mds_client *mdsc,
 		session->s_mds, session->s_nr_caps, max_caps, trim_caps);
 	if (trim_caps > 0) {
 		session->s_trim_caps = trim_caps;
-		iterate_session_caps(session, trim_caps_cb, session);
+		ceph_iterate_session_caps(session, trim_caps_cb, session);
 		dout("trim_caps mds%d done: %d / %d, trimmed %d\n",
 		     session->s_mds, session->s_nr_caps, max_caps,
 		     trim_caps - session->s_trim_caps);
@@ -1861,7 +1855,8 @@ again:
 		num_cap_releases--;
 
 		head = msg->front.iov_base;
-		le32_add_cpu(&head->num, 1);
+		put_unaligned_le32(get_unaligned_le32(&head->num) + 1,
+				   &head->num);
 		item = msg->front.iov_base + msg->front.iov_len;
 		item->ino = cpu_to_le64(cap->cap_ino);
 		item->cap_id = cpu_to_le64(cap->cap_id);
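le32_add_cpu() performs an aligned 32-bit read-modify-write, but head points into a packed wire-format buffer, so head->num can land on any byte offset; on strict-alignment architectures that faults. A userspace sketch of the memcpy-based pattern the unaligned helpers rely on (assumes a little-endian host so no byte swap is shown; the kernel's versions live in asm/unaligned.h):

    #include <stdint.h>
    #include <string.h>

    /* the compiler lowers memcpy to byte-safe loads/stores where needed */
    static uint32_t get_le32(const void *p)
    {
        uint32_t v;

        memcpy(&v, p, sizeof(v));
        return v;
    }

    static void put_le32(uint32_t v, void *p)
    {
        memcpy(p, &v, sizeof(v));
    }

    int main(void)
    {
        unsigned char buf[8] = { 0 };

        /* equivalent of the fixed line: read, bump, write back */
        put_le32(get_le32(buf + 1) + 1, buf + 1);   /* deliberately misaligned */
        return get_le32(buf + 1) == 1 ? 0 : 1;
    }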
@@ -2089,43 +2084,29 @@ static inline u64 __get_oldest_tid(struct ceph_mds_client *mdsc)
  * Encode hidden .snap dirs as a double /, i.e.
  *   foo/.snap/bar -> foo//bar
  */
-char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
+char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *pbase,
 			   int stop_on_nosnap)
 {
 	struct dentry *temp;
 	char *path;
-	int len, pos;
+	int pos;
 	unsigned seq;
+	u64 base;
 
 	if (!dentry)
 		return ERR_PTR(-EINVAL);
 
-retry:
-	len = 0;
-	seq = read_seqbegin(&rename_lock);
-	rcu_read_lock();
-	for (temp = dentry; !IS_ROOT(temp);) {
-		struct inode *inode = d_inode(temp);
-		if (inode && ceph_snap(inode) == CEPH_SNAPDIR)
-			len++;  /* slash only */
-		else if (stop_on_nosnap && inode &&
-			 ceph_snap(inode) == CEPH_NOSNAP)
-			break;
-		else
-			len += 1 + temp->d_name.len;
-		temp = temp->d_parent;
-	}
-	rcu_read_unlock();
-	if (len)
-		len--;  /* no leading '/' */
-
-	path = kmalloc(len+1, GFP_NOFS);
+	path = __getname();
 	if (!path)
 		return ERR_PTR(-ENOMEM);
-	pos = len;
-	path[pos] = 0;	/* trailing null */
+retry:
+	pos = PATH_MAX - 1;
+	path[pos] = '\0';
+
+	seq = read_seqbegin(&rename_lock);
 	rcu_read_lock();
-	for (temp = dentry; !IS_ROOT(temp) && pos != 0; ) {
+	temp = dentry;
+	for (;;) {
 		struct inode *inode;
 
 		spin_lock(&temp->d_lock);
@@ -2143,83 +2124,54 @@ retry:
 			spin_unlock(&temp->d_lock);
 			break;
 		}
-		strncpy(path + pos, temp->d_name.name,
-			temp->d_name.len);
+		memcpy(path + pos, temp->d_name.name, temp->d_name.len);
 	}
 	spin_unlock(&temp->d_lock);
-	if (pos)
-		path[--pos] = '/';
 	temp = temp->d_parent;
+
+	/* Are we at the root? */
+	if (IS_ROOT(temp))
+		break;
+
+	/* Are we out of buffer? */
+	if (--pos < 0)
+		break;
+
+	path[pos] = '/';
 	}
+	base = ceph_ino(d_inode(temp));
 	rcu_read_unlock();
-	if (pos != 0 || read_seqretry(&rename_lock, seq)) {
+	if (pos < 0 || read_seqretry(&rename_lock, seq)) {
 		pr_err("build_path did not end path lookup where "
-		       "expected, namelen is %d, pos is %d\n", len, pos);
+		       "expected, pos is %d\n", pos);
 		/* presumably this is only possible if racing with a
 		   rename of one of the parent directories (we can not
 		   lock the dentries above us to prevent this, but
 		   retrying should be harmless) */
-		kfree(path);
 		goto retry;
 	}
 
-	*base = ceph_ino(d_inode(temp));
-	*plen = len;
+	*pbase = base;
+	*plen = PATH_MAX - 1 - pos;
 	dout("build_path on %p %d built %llx '%.*s'\n",
-	     dentry, d_count(dentry), *base, len, path);
-	return path;
-}
-
-/* Duplicate the dentry->d_name.name safely */
-static int clone_dentry_name(struct dentry *dentry, const char **ppath,
-			     int *ppathlen)
-{
-	u32 len;
-	char *name;
-
-retry:
-	len = READ_ONCE(dentry->d_name.len);
-	name = kmalloc(len + 1, GFP_NOFS);
-	if (!name)
-		return -ENOMEM;
-
-	spin_lock(&dentry->d_lock);
-	if (dentry->d_name.len != len) {
-		spin_unlock(&dentry->d_lock);
-		kfree(name);
-		goto retry;
-	}
-	memcpy(name, dentry->d_name.name, len);
-	spin_unlock(&dentry->d_lock);
-
-	name[len] = '\0';
-	*ppath = name;
-	*ppathlen = len;
-	return 0;
+	     dentry, d_count(dentry), base, *plen, path + pos);
+	return path + pos;
 }
 
 static int build_dentry_path(struct dentry *dentry, struct inode *dir,
 			     const char **ppath, int *ppathlen, u64 *pino,
 			     bool *pfreepath, bool parent_locked)
 {
-	int ret;
 	char *path;
 
 	rcu_read_lock();
 	if (!dir)
 		dir = d_inode_rcu(dentry->d_parent);
-	if (dir && ceph_snap(dir) == CEPH_NOSNAP) {
+	if (dir && parent_locked && ceph_snap(dir) == CEPH_NOSNAP) {
 		*pino = ceph_ino(dir);
 		rcu_read_unlock();
-		if (parent_locked) {
-			*ppath = dentry->d_name.name;
-			*ppathlen = dentry->d_name.len;
-		} else {
-			ret = clone_dentry_name(dentry, ppath, ppathlen);
-			if (ret)
-				return ret;
-			*pfreepath = true;
-		}
+		*ppath = dentry->d_name.name;
+		*ppathlen = dentry->d_name.len;
 		return 0;
 	}
 	rcu_read_unlock();
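The rewritten builder fills a PATH_MAX __getname() buffer from the end backwards and hands back an interior pointer, so whoever frees it must rewind to the allocation base; that is exactly the path - (PATH_MAX - 1 - len) arithmetic in the new ceph_mdsc_free_path() helper in fs/ceph/mds_client.h below. A userspace sketch of the same trick (BUFLEN standing in for PATH_MAX, malloc/free for __getname/__putname):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    #define BUFLEN 4096

    int main(void)
    {
        const char *comps[] = { "c", "b", "a" };    /* leaf first */
        char *buf = malloc(BUFLEN);
        int pos = BUFLEN - 1;

        buf[pos] = '\0';
        for (int i = 0; i < 3; i++) {               /* build right to left */
            size_t l = strlen(comps[i]);

            pos -= l;
            memcpy(buf + pos, comps[i], l);
            if (i + 1 < 3)
                buf[--pos] = '/';
        }

        char *path = buf + pos;                     /* interior pointer */
        int len = BUFLEN - 1 - pos;

        printf("%.*s\n", len, path);                /* prints "a/b/c" */
        free(path - (BUFLEN - 1 - len));            /* rewind to the base */
        return 0;
    }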
@@ -2331,9 +2283,9 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
 		(!!req->r_inode_drop + !!req->r_dentry_drop +
 		 !!req->r_old_inode_drop + !!req->r_old_dentry_drop);
 	if (req->r_dentry_drop)
-		len += req->r_dentry->d_name.len;
+		len += pathlen1;
 	if (req->r_old_dentry_drop)
-		len += req->r_old_dentry->d_name.len;
+		len += pathlen2;
 
 	msg = ceph_msg_new2(CEPH_MSG_CLIENT_REQUEST, len, 1, GFP_NOFS, false);
 	if (!msg) {
@@ -2410,10 +2362,10 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
 
 out_free2:
 	if (freepath2)
-		kfree((char *)path2);
+		ceph_mdsc_free_path((char *)path2, pathlen2);
 out_free1:
 	if (freepath1)
-		kfree((char *)path1);
+		ceph_mdsc_free_path((char *)path1, pathlen1);
 out:
 	return msg;
 }
@@ -2427,8 +2379,7 @@ static void complete_request(struct ceph_mds_client *mdsc,
 {
 	if (req->r_callback)
 		req->r_callback(mdsc, req);
-	else
-		complete_all(&req->r_completion);
+	complete_all(&req->r_completion);
 }
 
 /*
@@ -2670,28 +2621,11 @@ static void kick_requests(struct ceph_mds_client *mdsc, int mds)
 	}
 }
 
-void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
-			      struct ceph_mds_request *req)
+int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir,
+			     struct ceph_mds_request *req)
 {
-	dout("submit_request on %p\n", req);
-	mutex_lock(&mdsc->mutex);
-	__register_request(mdsc, req, NULL);
-	__do_request(mdsc, req);
-	mutex_unlock(&mdsc->mutex);
-}
-
-/*
- * Synchrously perform an mds request.  Take care of all of the
- * session setup, forwarding, retry details.
- */
-int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
-			 struct inode *dir,
-			 struct ceph_mds_request *req)
-{
 	int err;
 
-	dout("do_request on %p\n", req);
-
 	/* take CAP_PIN refs for r_inode, r_parent, r_old_dentry */
 	if (req->r_inode)
 		ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
@@ -2701,18 +2635,21 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
 		ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir),
 				  CEPH_CAP_PIN);
 
-	/* issue */
+	dout("submit_request on %p for inode %p\n", req, dir);
 	mutex_lock(&mdsc->mutex);
 	__register_request(mdsc, req, dir);
 	__do_request(mdsc, req);
+	err = req->r_err;
+	mutex_unlock(&mdsc->mutex);
+	return err;
+}
 
-	if (req->r_err) {
-		err = req->r_err;
-		goto out;
-	}
+static int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc,
+				  struct ceph_mds_request *req)
+{
+	int err;
 
 	/* wait */
-	mutex_unlock(&mdsc->mutex);
 	dout("do_request waiting\n");
 	if (!req->r_timeout && req->r_wait_for_completion) {
 		err = req->r_wait_for_completion(mdsc, req);
@@ -2753,8 +2690,26 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
 		err = req->r_err;
 	}
 
-out:
 	mutex_unlock(&mdsc->mutex);
+	return err;
+}
+
+/*
+ * Synchrously perform an mds request.  Take care of all of the
+ * session setup, forwarding, retry details.
+ */
+int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
+			 struct inode *dir,
+			 struct ceph_mds_request *req)
+{
+	int err;
+
+	dout("do_request on %p\n", req);
+
+	/* issue */
+	err = ceph_mdsc_submit_request(mdsc, dir, req);
+	if (!err)
+		err = ceph_mdsc_wait_request(mdsc, req);
 	dout("do_request %p done, result %d\n", req, err);
 	return err;
 }
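ceph_mdsc_do_request() is now just a composition of the two halves, which is the hook for the async-request work mentioned in the merge message: a caller that doesn't need the answer inline could stop after the submit. Sketch of the shape (hypothetical caller; ceph_mdsc_wait_request() is still static in this series, so today only do_request composes them):

    err = ceph_mdsc_submit_request(mdsc, dir, req);    /* fire */
    if (!err)
        err = ceph_mdsc_wait_request(mdsc, req);       /* ...later, join */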
@@ -3485,7 +3440,7 @@ out_freeflocks:
 			ceph_pagelist_encode_string(pagelist, path, pathlen);
 		ceph_pagelist_append(pagelist, &rec, sizeof(rec.v1));
 out_freepath:
-		kfree(path);
+		ceph_mdsc_free_path(path, pathlen);
 	}
 
 out_err:
@@ -3642,7 +3597,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
 		recon_state.msg_version = 2;
 	}
 	/* trsaverse this session's caps */
-	err = iterate_session_caps(session, encode_caps_cb, &recon_state);
+	err = ceph_iterate_session_caps(session, encode_caps_cb, &recon_state);
 
 	spin_lock(&session->s_cap_lock);
 	session->s_cap_reconnect = 0;
@@ -4125,6 +4080,8 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
 	mdsc->max_sessions = 0;
 	mdsc->stopping = 0;
 	atomic64_set(&mdsc->quotarealms_count, 0);
+	mdsc->quotarealms_inodes = RB_ROOT;
+	mutex_init(&mdsc->quotarealms_inodes_mutex);
 	mdsc->last_snap_seq = 0;
 	init_rwsem(&mdsc->snap_rwsem);
 	mdsc->snap_realms = RB_ROOT;
@@ -4216,6 +4173,8 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
 	 * their inode/dcache refs
 	 */
 	ceph_msgr_flush();
+
+	ceph_cleanup_quotarealms_inodes(mdsc);
 }
 
 /*
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 50385a481fdb..a83f28bc2387 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -326,6 +326,18 @@ struct ceph_snapid_map {
326}; 326};
327 327
328/* 328/*
329 * node for list of quotarealm inodes that are not visible from the filesystem
330 * mountpoint, but required to handle, e.g. quotas.
331 */
332struct ceph_quotarealm_inode {
333 struct rb_node node;
334 u64 ino;
335 unsigned long timeout; /* last time a lookup failed for this inode */
336 struct mutex mutex;
337 struct inode *inode;
338};
339
340/*
329 * mds client state 341 * mds client state
330 */ 342 */
331struct ceph_mds_client { 343struct ceph_mds_client {
@@ -344,6 +356,12 @@ struct ceph_mds_client {
344 int stopping; /* true if shutting down */ 356 int stopping; /* true if shutting down */
345 357
346 atomic64_t quotarealms_count; /* # realms with quota */ 358 atomic64_t quotarealms_count; /* # realms with quota */
359 /*
360 * We keep a list of inodes we don't see in the mountpoint but that we
361 * need to track quota realms.
362 */
363 struct rb_root quotarealms_inodes;
364 struct mutex quotarealms_inodes_mutex;
347 365
348 /* 366 /*
349 * snap_rwsem will cover cap linkage into snaprealms, and 367 * snap_rwsem will cover cap linkage into snaprealms, and
@@ -447,8 +465,9 @@ extern int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
447 struct inode *dir); 465 struct inode *dir);
448extern struct ceph_mds_request * 466extern struct ceph_mds_request *
449ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode); 467ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
450extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, 468extern int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
451 struct ceph_mds_request *req); 469 struct inode *dir,
470 struct ceph_mds_request *req);
452extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, 471extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
453 struct inode *dir, 472 struct inode *dir,
454 struct ceph_mds_request *req); 473 struct ceph_mds_request *req);
@@ -468,8 +487,18 @@ extern void ceph_flush_cap_releases(struct ceph_mds_client *mdsc,
468 struct ceph_mds_session *session); 487 struct ceph_mds_session *session);
469extern void ceph_queue_cap_reclaim_work(struct ceph_mds_client *mdsc); 488extern void ceph_queue_cap_reclaim_work(struct ceph_mds_client *mdsc);
470extern void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr); 489extern void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr);
490extern int ceph_iterate_session_caps(struct ceph_mds_session *session,
491 int (*cb)(struct inode *,
492 struct ceph_cap *, void *),
493 void *arg);
471extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc); 494extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
472 495
496static inline void ceph_mdsc_free_path(char *path, int len)
497{
498 if (path)
499 __putname(path - (PATH_MAX - 1 - len));
500}
501
473extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base, 502extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
474 int stop_on_nosnap); 503 int stop_on_nosnap);
475 504
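The new ceph_mdsc_free_path() inline encodes an invariant of ceph_mdsc_build_path(): the returned pointer is not the start of the underlying name buffer but sits at buf + PATH_MAX - 1 - len, with the path built at the buffer's tail, so the matching __putname() must rewind to the allocation start. A sketch of the expected pairing in a caller (hypothetical fragment, error handling abbreviated):

	char *path;
	int pathlen;
	u64 base;

	path = ceph_mdsc_build_path(dentry, &pathlen, &base, 0);
	if (IS_ERR(path))
		return PTR_ERR(path);
	/* ... hand path/pathlen/base to an MDS request ... */
	ceph_mdsc_free_path(path, pathlen);	/* rewinds to the buffer start */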
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 1a2c5d390f7f..701b4fb0fb5a 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -205,7 +205,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
205 205
206 dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n", 206 dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n",
207 i+1, n, global_id, mds, inc, 207 i+1, n, global_id, mds, inc,
208 ceph_pr_addr(&addr.in_addr), 208 ceph_pr_addr(&addr),
209 ceph_mds_state_name(state)); 209 ceph_mds_state_name(state));
210 210
211 if (mds < 0 || state <= 0) 211 if (mds < 0 || state <= 0)
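ceph_pr_addr() now takes a pointer to the whole struct ceph_entity_addr rather than to its embedded in_addr member, keeping the address handling inside the helper. The new calling convention, sketched (the decode step is elided):

	struct ceph_entity_addr addr;

	/* ... decode the entity address into 'addr' ... */
	dout("mds addr %s\n", ceph_pr_addr(&addr));	/* was: ceph_pr_addr(&addr.in_addr) */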
diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c
index 9455d3aef0c3..c4522212872c 100644
--- a/fs/ceph/quota.c
+++ b/fs/ceph/quota.c
@@ -22,7 +22,16 @@ void ceph_adjust_quota_realms_count(struct inode *inode, bool inc)
22static inline bool ceph_has_realms_with_quotas(struct inode *inode) 22static inline bool ceph_has_realms_with_quotas(struct inode *inode)
23{ 23{
24 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; 24 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
25 return atomic64_read(&mdsc->quotarealms_count) > 0; 25 struct super_block *sb = mdsc->fsc->sb;
26
27 if (atomic64_read(&mdsc->quotarealms_count) > 0)
28 return true;
29 /* if root is the real CephFS root, we don't have quota realms */
30 if (sb->s_root->d_inode &&
31 (sb->s_root->d_inode->i_ino == CEPH_INO_ROOT))
32 return false;
33 /* otherwise, we can't know for sure */
34 return true;
26} 35}
27 36
28void ceph_handle_quota(struct ceph_mds_client *mdsc, 37void ceph_handle_quota(struct ceph_mds_client *mdsc,
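The rewritten check stays a cheap early-out: a positive quotarealms_count still means the realms must be walked, and only a mount rooted at the real CephFS root lets quotas be ruled out up front; anything else (a subdir mount) must conservatively assume quota realms may exist above the mount point. Roughly how the exported quota predicates in this file use it, condensed from the existing callers (WARN_ON and friends omitted):

	bool ceph_quota_is_max_files_exceeded(struct inode *inode)
	{
		/* skip the snaprealm walk when no quota realm can apply */
		if (!ceph_has_realms_with_quotas(inode))
			return false;

		return check_quota_exceeded(inode, QUOTA_CHECK_MAX_FILES_OP, 1);
	}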
@@ -68,6 +77,108 @@ void ceph_handle_quota(struct ceph_mds_client *mdsc,
68 iput(inode); 77 iput(inode);
69} 78}
70 79
80static struct ceph_quotarealm_inode *
81find_quotarealm_inode(struct ceph_mds_client *mdsc, u64 ino)
82{
83 struct ceph_quotarealm_inode *qri = NULL;
84 struct rb_node **node, *parent = NULL;
85
86 mutex_lock(&mdsc->quotarealms_inodes_mutex);
87 node = &(mdsc->quotarealms_inodes.rb_node);
88 while (*node) {
89 parent = *node;
90 qri = container_of(*node, struct ceph_quotarealm_inode, node);
91
92 if (ino < qri->ino)
93 node = &((*node)->rb_left);
94 else if (ino > qri->ino)
95 node = &((*node)->rb_right);
96 else
97 break;
98 }
99 if (!qri || (qri->ino != ino)) {
100 /* Not found, create a new one and insert it */
101 qri = kmalloc(sizeof(*qri), GFP_KERNEL);
102 if (qri) {
103 qri->ino = ino;
104 qri->inode = NULL;
105 qri->timeout = 0;
106 mutex_init(&qri->mutex);
107 rb_link_node(&qri->node, parent, node);
108 rb_insert_color(&qri->node, &mdsc->quotarealms_inodes);
109 } else
110 pr_warn("Failed to alloc quotarealms_inode\n");
111 }
112 mutex_unlock(&mdsc->quotarealms_inodes_mutex);
113
114 return qri;
115}
116
117/*
118 * This function will try to look up a realm inode that isn't visible in the
119 * filesystem mountpoint. A list of such (not visible) inodes is
120 * maintained in the mdsc and freed only when the filesystem is unmounted.
121 *
122 * Note that these inodes are kept in this list even if the lookup fails,
123 * which allows us to avoid repeating useless lookup requests.
124 */
125static struct inode *lookup_quotarealm_inode(struct ceph_mds_client *mdsc,
126 struct super_block *sb,
127 struct ceph_snap_realm *realm)
128{
129 struct ceph_quotarealm_inode *qri;
130 struct inode *in;
131
132 qri = find_quotarealm_inode(mdsc, realm->ino);
133 if (!qri)
134 return NULL;
135
136 mutex_lock(&qri->mutex);
137 if (qri->inode) {
138 /* A request has already returned the inode */
139 mutex_unlock(&qri->mutex);
140 return qri->inode;
141 }
142 /* Check if this inode lookup has failed recently */
143 if (qri->timeout &&
144 time_before_eq(jiffies, qri->timeout)) {
145 mutex_unlock(&qri->mutex);
146 return NULL;
147 }
148 in = ceph_lookup_inode(sb, realm->ino);
149 if (IS_ERR(in)) {
150 pr_warn("Can't lookup inode %llx (err: %ld)\n",
151 realm->ino, PTR_ERR(in));
152 qri->timeout = jiffies + msecs_to_jiffies(60 * 1000); /* XXX */
153 } else {
154 qri->timeout = 0;
155 qri->inode = in;
156 }
157 mutex_unlock(&qri->mutex);
158
159 return in;
160}
161
162void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc)
163{
164 struct ceph_quotarealm_inode *qri;
165 struct rb_node *node;
166
167 /*
168 * It should now be safe to clean the quotarealms_inodes tree without
169 * holding mdsc->quotarealms_inodes_mutex, but take it anyway...
170 */
171 mutex_lock(&mdsc->quotarealms_inodes_mutex);
172 while (!RB_EMPTY_ROOT(&mdsc->quotarealms_inodes)) {
173 node = rb_first(&mdsc->quotarealms_inodes);
174 qri = rb_entry(node, struct ceph_quotarealm_inode, node);
175 rb_erase(node, &mdsc->quotarealms_inodes);
176 iput(qri->inode);
177 kfree(qri);
178 }
179 mutex_unlock(&mdsc->quotarealms_inodes_mutex);
180}
181
71/* 182/*
72 * This function walks through the snaprealm for an inode and returns the 183 * This function walks through the snaprealm for an inode and returns the
73 * ceph_snap_realm for the first snaprealm that has quotas set (either max_files 184 * ceph_snap_realm for the first snaprealm that has quotas set (either max_files
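The cleanup loop in the hunk above tears the tree down with repeated rb_first()/rb_erase(), each erase rebalancing a tree that is about to be discarded anyway. Under the same end-of-life assumption (no concurrent lookups), the kernel's postorder helper would let each node be freed as it is visited; an equivalent sketch, not what the patch does:

	struct ceph_quotarealm_inode *qri, *tmp;

	mutex_lock(&mdsc->quotarealms_inodes_mutex);
	rbtree_postorder_for_each_entry_safe(qri, tmp,
					     &mdsc->quotarealms_inodes, node) {
		iput(qri->inode);	/* iput(NULL) is a no-op */
		kfree(qri);
	}
	/* nodes were freed without rb_erase(); reset the root */
	mdsc->quotarealms_inodes = RB_ROOT;
	mutex_unlock(&mdsc->quotarealms_inodes_mutex);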
@@ -76,9 +187,15 @@ void ceph_handle_quota(struct ceph_mds_client *mdsc,
76 * 187 *
77 * Note that the caller is responsible for calling ceph_put_snap_realm() on the 188 * Note that the caller is responsible for calling ceph_put_snap_realm() on the
78 * returned realm. 189 * returned realm.
190 *
191 * Callers of this function need to hold mdsc->snap_rwsem. However, if there's
192 * a need to do an inode lookup, this rwsem will be temporarily dropped. Hence
193 * the 'retry' argument: if the rwsem needs to be dropped and 'retry' is 'false',
194 * this function will return -EAGAIN; otherwise, the snaprealms walk-through
195 * will be restarted.
79 */ 196 */
80static struct ceph_snap_realm *get_quota_realm(struct ceph_mds_client *mdsc, 197static struct ceph_snap_realm *get_quota_realm(struct ceph_mds_client *mdsc,
81 struct inode *inode) 198 struct inode *inode, bool retry)
82{ 199{
83 struct ceph_inode_info *ci = NULL; 200 struct ceph_inode_info *ci = NULL;
84 struct ceph_snap_realm *realm, *next; 201 struct ceph_snap_realm *realm, *next;
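Seen from a caller that must keep snap_rwsem held across two realm lookups, the 'retry=false' contract described above turns into an explicit restart loop; condensed, this is the shape ceph_quota_is_same_realm takes further down:

	down_read(&mdsc->snap_rwsem);
	realm = get_quota_realm(mdsc, inode, false);	/* no internal restart */
	if (PTR_ERR(realm) == -EAGAIN) {
		/* rwsem was dropped for an inode lookup; redo both lookups */
		up_read(&mdsc->snap_rwsem);
		goto restart;
	}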
@@ -88,6 +205,7 @@ static struct ceph_snap_realm *get_quota_realm(struct ceph_mds_client *mdsc,
88 if (ceph_snap(inode) != CEPH_NOSNAP) 205 if (ceph_snap(inode) != CEPH_NOSNAP)
89 return NULL; 206 return NULL;
90 207
208restart:
91 realm = ceph_inode(inode)->i_snap_realm; 209 realm = ceph_inode(inode)->i_snap_realm;
92 if (realm) 210 if (realm)
93 ceph_get_snap_realm(mdsc, realm); 211 ceph_get_snap_realm(mdsc, realm);
@@ -95,11 +213,25 @@ static struct ceph_snap_realm *get_quota_realm(struct ceph_mds_client *mdsc,
95 pr_err_ratelimited("get_quota_realm: ino (%llx.%llx) " 213 pr_err_ratelimited("get_quota_realm: ino (%llx.%llx) "
96 "null i_snap_realm\n", ceph_vinop(inode)); 214 "null i_snap_realm\n", ceph_vinop(inode));
97 while (realm) { 215 while (realm) {
216 bool has_inode;
217
98 spin_lock(&realm->inodes_with_caps_lock); 218 spin_lock(&realm->inodes_with_caps_lock);
99 in = realm->inode ? igrab(realm->inode) : NULL; 219 has_inode = realm->inode;
220 in = has_inode ? igrab(realm->inode) : NULL;
100 spin_unlock(&realm->inodes_with_caps_lock); 221 spin_unlock(&realm->inodes_with_caps_lock);
101 if (!in) 222 if (has_inode && !in)
102 break; 223 break;
224 if (!in) {
225 up_read(&mdsc->snap_rwsem);
226 in = lookup_quotarealm_inode(mdsc, inode->i_sb, realm);
227 down_read(&mdsc->snap_rwsem);
228 if (IS_ERR_OR_NULL(in))
229 break;
230 ceph_put_snap_realm(mdsc, realm);
231 if (!retry)
232 return ERR_PTR(-EAGAIN);
233 goto restart;
234 }
103 235
104 ci = ceph_inode(in); 236 ci = ceph_inode(in);
105 has_quota = __ceph_has_any_quota(ci); 237 has_quota = __ceph_has_any_quota(ci);
@@ -125,9 +257,22 @@ bool ceph_quota_is_same_realm(struct inode *old, struct inode *new)
125 struct ceph_snap_realm *old_realm, *new_realm; 257 struct ceph_snap_realm *old_realm, *new_realm;
126 bool is_same; 258 bool is_same;
127 259
260restart:
261 /*
261 * We need to look up 2 quota realms atomically, i.e. with snap_rwsem held.
263 * However, get_quota_realm may drop it temporarily. By setting the
264 * 'retry' parameter to 'false', we'll get -EAGAIN if the rwsem was
265 * dropped and we can then restart the whole operation.
266 */
128 down_read(&mdsc->snap_rwsem); 267 down_read(&mdsc->snap_rwsem);
129 old_realm = get_quota_realm(mdsc, old); 268 old_realm = get_quota_realm(mdsc, old, true);
130 new_realm = get_quota_realm(mdsc, new); 269 new_realm = get_quota_realm(mdsc, new, false);
270 if (PTR_ERR(new_realm) == -EAGAIN) {
271 up_read(&mdsc->snap_rwsem);
272 if (old_realm)
273 ceph_put_snap_realm(mdsc, old_realm);
274 goto restart;
275 }
131 is_same = (old_realm == new_realm); 276 is_same = (old_realm == new_realm);
132 up_read(&mdsc->snap_rwsem); 277 up_read(&mdsc->snap_rwsem);
133 278
@@ -166,6 +311,7 @@ static bool check_quota_exceeded(struct inode *inode, enum quota_check_op op,
166 return false; 311 return false;
167 312
168 down_read(&mdsc->snap_rwsem); 313 down_read(&mdsc->snap_rwsem);
314restart:
169 realm = ceph_inode(inode)->i_snap_realm; 315 realm = ceph_inode(inode)->i_snap_realm;
170 if (realm) 316 if (realm)
171 ceph_get_snap_realm(mdsc, realm); 317 ceph_get_snap_realm(mdsc, realm);
@@ -173,12 +319,23 @@ static bool check_quota_exceeded(struct inode *inode, enum quota_check_op op,
173 pr_err_ratelimited("check_quota_exceeded: ino (%llx.%llx) " 319 pr_err_ratelimited("check_quota_exceeded: ino (%llx.%llx) "
174 "null i_snap_realm\n", ceph_vinop(inode)); 320 "null i_snap_realm\n", ceph_vinop(inode));
175 while (realm) { 321 while (realm) {
322 bool has_inode;
323
176 spin_lock(&realm->inodes_with_caps_lock); 324 spin_lock(&realm->inodes_with_caps_lock);
177 in = realm->inode ? igrab(realm->inode) : NULL; 325 has_inode = realm->inode;
326 in = has_inode ? igrab(realm->inode) : NULL;
178 spin_unlock(&realm->inodes_with_caps_lock); 327 spin_unlock(&realm->inodes_with_caps_lock);
179 if (!in) 328 if (has_inode && !in)
180 break; 329 break;
181 330 if (!in) {
331 up_read(&mdsc->snap_rwsem);
332 in = lookup_quotarealm_inode(mdsc, inode->i_sb, realm);
333 down_read(&mdsc->snap_rwsem);
334 if (IS_ERR_OR_NULL(in))
335 break;
336 ceph_put_snap_realm(mdsc, realm);
337 goto restart;
338 }
182 ci = ceph_inode(in); 339 ci = ceph_inode(in);
183 spin_lock(&ci->i_ceph_lock); 340 spin_lock(&ci->i_ceph_lock);
184 if (op == QUOTA_CHECK_MAX_FILES_OP) { 341 if (op == QUOTA_CHECK_MAX_FILES_OP) {
@@ -314,7 +471,7 @@ bool ceph_quota_update_statfs(struct ceph_fs_client *fsc, struct kstatfs *buf)
314 bool is_updated = false; 471 bool is_updated = false;
315 472
316 down_read(&mdsc->snap_rwsem); 473 down_read(&mdsc->snap_rwsem);
317 realm = get_quota_realm(mdsc, d_inode(fsc->sb->s_root)); 474 realm = get_quota_realm(mdsc, d_inode(fsc->sb->s_root), true);
318 up_read(&mdsc->snap_rwsem); 475 up_read(&mdsc->snap_rwsem);
319 if (!realm) 476 if (!realm)
320 return false; 477 return false;
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 285edda4fc3b..c864b44c8341 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -845,6 +845,12 @@ static void ceph_umount_begin(struct super_block *sb)
845 return; 845 return;
846} 846}
847 847
848static int ceph_remount(struct super_block *sb, int *flags, char *data)
849{
850 sync_filesystem(sb);
851 return 0;
852}
853
848static const struct super_operations ceph_super_ops = { 854static const struct super_operations ceph_super_ops = {
849 .alloc_inode = ceph_alloc_inode, 855 .alloc_inode = ceph_alloc_inode,
850 .destroy_inode = ceph_destroy_inode, 856 .destroy_inode = ceph_destroy_inode,
@@ -853,6 +859,7 @@ static const struct super_operations ceph_super_ops = {
853 .drop_inode = ceph_drop_inode, 859 .drop_inode = ceph_drop_inode,
854 .sync_fs = ceph_sync_fs, 860 .sync_fs = ceph_sync_fs,
855 .put_super = ceph_put_super, 861 .put_super = ceph_put_super,
862 .remount_fs = ceph_remount,
856 .show_options = ceph_show_options, 863 .show_options = ceph_show_options,
857 .statfs = ceph_statfs, 864 .statfs = ceph_statfs,
858 .umount_begin = ceph_umount_begin, 865 .umount_begin = ceph_umount_begin,
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index c5b4a05905c0..6edab9a750f8 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -1083,6 +1083,7 @@ extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
1083 1083
1084/* export.c */ 1084/* export.c */
1085extern const struct export_operations ceph_export_ops; 1085extern const struct export_operations ceph_export_ops;
1086struct inode *ceph_lookup_inode(struct super_block *sb, u64 ino);
1086 1087
1087/* locks.c */ 1088/* locks.c */
1088extern __init void ceph_flock_init(void); 1089extern __init void ceph_flock_init(void);
@@ -1133,5 +1134,6 @@ extern bool ceph_quota_is_max_bytes_approaching(struct inode *inode,
1133 loff_t newlen); 1134 loff_t newlen);
1134extern bool ceph_quota_update_statfs(struct ceph_fs_client *fsc, 1135extern bool ceph_quota_update_statfs(struct ceph_fs_client *fsc,
1135 struct kstatfs *buf); 1136 struct kstatfs *buf);
1137extern void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc);
1136 1138
1137#endif /* _FS_CEPH_SUPER_H */ 1139#endif /* _FS_CEPH_SUPER_H */