about summary refs log tree commit diff stats
path: root/fs/ceph/inode.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ceph/inode.c')
-rw-r--r-- fs/ceph/inode.c 249
1 files changed, 155 insertions, 94 deletions
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 233c6f96910a..04c89c266cec 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -10,6 +10,7 @@
10#include <linux/writeback.h> 10#include <linux/writeback.h>
11#include <linux/vmalloc.h> 11#include <linux/vmalloc.h>
12#include <linux/posix_acl.h> 12#include <linux/posix_acl.h>
13#include <linux/random.h>
13 14
14#include "super.h" 15#include "super.h"
15#include "mds_client.h" 16#include "mds_client.h"
@@ -179,9 +180,8 @@ struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci, u32 f)
179 * specified, copy the frag delegation info to the caller if 180 * specified, copy the frag delegation info to the caller if
180 * it is present. 181 * it is present.
181 */ 182 */
182u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v, 183static u32 __ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
183 struct ceph_inode_frag *pfrag, 184 struct ceph_inode_frag *pfrag, int *found)
184 int *found)
185{ 185{
186 u32 t = ceph_frag_make(0, 0); 186 u32 t = ceph_frag_make(0, 0);
187 struct ceph_inode_frag *frag; 187 struct ceph_inode_frag *frag;
@@ -191,7 +191,6 @@ u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
191 if (found) 191 if (found)
192 *found = 0; 192 *found = 0;
193 193
194 mutex_lock(&ci->i_fragtree_mutex);
195 while (1) { 194 while (1) {
196 WARN_ON(!ceph_frag_contains_value(t, v)); 195 WARN_ON(!ceph_frag_contains_value(t, v));
197 frag = __ceph_find_frag(ci, t); 196 frag = __ceph_find_frag(ci, t);
@@ -220,10 +219,19 @@ u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
220 } 219 }
221 dout("choose_frag(%x) = %x\n", v, t); 220 dout("choose_frag(%x) = %x\n", v, t);
222 221
223 mutex_unlock(&ci->i_fragtree_mutex);
224 return t; 222 return t;
225} 223}
226 224
225u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
226 struct ceph_inode_frag *pfrag, int *found)
227{
228 u32 ret;
229 mutex_lock(&ci->i_fragtree_mutex);
230 ret = __ceph_choose_frag(ci, v, pfrag, found);
231 mutex_unlock(&ci->i_fragtree_mutex);
232 return ret;
233}
234
227/* 235/*
228 * Process dirfrag (delegation) info from the mds. Include leaf 236 * Process dirfrag (delegation) info from the mds. Include leaf
229 * fragment in tree ONLY if ndist > 0. Otherwise, only 237 * fragment in tree ONLY if ndist > 0. Otherwise, only
@@ -237,11 +245,17 @@ static int ceph_fill_dirfrag(struct inode *inode,
237 u32 id = le32_to_cpu(dirinfo->frag); 245 u32 id = le32_to_cpu(dirinfo->frag);
238 int mds = le32_to_cpu(dirinfo->auth); 246 int mds = le32_to_cpu(dirinfo->auth);
239 int ndist = le32_to_cpu(dirinfo->ndist); 247 int ndist = le32_to_cpu(dirinfo->ndist);
248 int diri_auth = -1;
240 int i; 249 int i;
241 int err = 0; 250 int err = 0;
242 251
252 spin_lock(&ci->i_ceph_lock);
253 if (ci->i_auth_cap)
254 diri_auth = ci->i_auth_cap->mds;
255 spin_unlock(&ci->i_ceph_lock);
256
243 mutex_lock(&ci->i_fragtree_mutex); 257 mutex_lock(&ci->i_fragtree_mutex);
244 if (ndist == 0) { 258 if (ndist == 0 && mds == diri_auth) {
245 /* no delegation info needed. */ 259 /* no delegation info needed. */
246 frag = __ceph_find_frag(ci, id); 260 frag = __ceph_find_frag(ci, id);
247 if (!frag) 261 if (!frag)
@@ -286,6 +300,75 @@ out:
286 return err; 300 return err;
287} 301}
288 302
303static int ceph_fill_fragtree(struct inode *inode,
304 struct ceph_frag_tree_head *fragtree,
305 struct ceph_mds_reply_dirfrag *dirinfo)
306{
307 struct ceph_inode_info *ci = ceph_inode(inode);
308 struct ceph_inode_frag *frag;
309 struct rb_node *rb_node;
310 int i;
311 u32 id, nsplits;
312 bool update = false;
313
314 mutex_lock(&ci->i_fragtree_mutex);
315 nsplits = le32_to_cpu(fragtree->nsplits);
316 if (nsplits) {
317 i = prandom_u32() % nsplits;
318 id = le32_to_cpu(fragtree->splits[i].frag);
319 if (!__ceph_find_frag(ci, id))
320 update = true;
321 } else if (!RB_EMPTY_ROOT(&ci->i_fragtree)) {
322 rb_node = rb_first(&ci->i_fragtree);
323 frag = rb_entry(rb_node, struct ceph_inode_frag, node);
324 if (frag->frag != ceph_frag_make(0, 0) || rb_next(rb_node))
325 update = true;
326 }
327 if (!update && dirinfo) {
328 id = le32_to_cpu(dirinfo->frag);
329 if (id != __ceph_choose_frag(ci, id, NULL, NULL))
330 update = true;
331 }
332 if (!update)
333 goto out_unlock;
334
335 dout("fill_fragtree %llx.%llx\n", ceph_vinop(inode));
336 rb_node = rb_first(&ci->i_fragtree);
337 for (i = 0; i < nsplits; i++) {
338 id = le32_to_cpu(fragtree->splits[i].frag);
339 frag = NULL;
340 while (rb_node) {
341 frag = rb_entry(rb_node, struct ceph_inode_frag, node);
342 if (ceph_frag_compare(frag->frag, id) >= 0) {
343 if (frag->frag != id)
344 frag = NULL;
345 else
346 rb_node = rb_next(rb_node);
347 break;
348 }
349 rb_node = rb_next(rb_node);
350 rb_erase(&frag->node, &ci->i_fragtree);
351 kfree(frag);
352 frag = NULL;
353 }
354 if (!frag) {
355 frag = __get_or_create_frag(ci, id);
356 if (IS_ERR(frag))
357 continue;
358 }
359 frag->split_by = le32_to_cpu(fragtree->splits[i].by);
360 dout(" frag %x split by %d\n", frag->frag, frag->split_by);
361 }
362 while (rb_node) {
363 frag = rb_entry(rb_node, struct ceph_inode_frag, node);
364 rb_node = rb_next(rb_node);
365 rb_erase(&frag->node, &ci->i_fragtree);
366 kfree(frag);
367 }
368out_unlock:
369 mutex_unlock(&ci->i_fragtree_mutex);
370 return 0;
371}
289 372
290/* 373/*
291 * initialize a newly allocated inode. 374 * initialize a newly allocated inode.
@@ -341,7 +424,6 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
341 INIT_LIST_HEAD(&ci->i_cap_snaps); 424 INIT_LIST_HEAD(&ci->i_cap_snaps);
342 ci->i_head_snapc = NULL; 425 ci->i_head_snapc = NULL;
343 ci->i_snap_caps = 0; 426 ci->i_snap_caps = 0;
344 ci->i_cap_exporting_issued = 0;
345 427
346 for (i = 0; i < CEPH_FILE_MODE_NUM; i++) 428 for (i = 0; i < CEPH_FILE_MODE_NUM; i++)
347 ci->i_nr_by_mode[i] = 0; 429 ci->i_nr_by_mode[i] = 0;
@@ -407,7 +489,7 @@ void ceph_destroy_inode(struct inode *inode)
407 489
408 /* 490 /*
409 * we may still have a snap_realm reference if there are stray 491 * we may still have a snap_realm reference if there are stray
410 * caps in i_cap_exporting_issued or i_snap_caps. 492 * caps in i_snap_caps.
411 */ 493 */
412 if (ci->i_snap_realm) { 494 if (ci->i_snap_realm) {
413 struct ceph_mds_client *mdsc = 495 struct ceph_mds_client *mdsc =
@@ -582,22 +664,26 @@ static int fill_inode(struct inode *inode,
582 unsigned long ttl_from, int cap_fmode, 664 unsigned long ttl_from, int cap_fmode,
583 struct ceph_cap_reservation *caps_reservation) 665 struct ceph_cap_reservation *caps_reservation)
584{ 666{
667 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
585 struct ceph_mds_reply_inode *info = iinfo->in; 668 struct ceph_mds_reply_inode *info = iinfo->in;
586 struct ceph_inode_info *ci = ceph_inode(inode); 669 struct ceph_inode_info *ci = ceph_inode(inode);
587 int i; 670 int issued = 0, implemented, new_issued;
588 int issued = 0, implemented;
589 struct timespec mtime, atime, ctime; 671 struct timespec mtime, atime, ctime;
590 u32 nsplits;
591 struct ceph_inode_frag *frag;
592 struct rb_node *rb_node;
593 struct ceph_buffer *xattr_blob = NULL; 672 struct ceph_buffer *xattr_blob = NULL;
673 struct ceph_cap *new_cap = NULL;
594 int err = 0; 674 int err = 0;
595 int queue_trunc = 0; 675 bool wake = false;
676 bool queue_trunc = false;
677 bool new_version = false;
596 678
597 dout("fill_inode %p ino %llx.%llx v %llu had %llu\n", 679 dout("fill_inode %p ino %llx.%llx v %llu had %llu\n",
598 inode, ceph_vinop(inode), le64_to_cpu(info->version), 680 inode, ceph_vinop(inode), le64_to_cpu(info->version),
599 ci->i_version); 681 ci->i_version);
600 682
683 /* prealloc new cap struct */
684 if (info->cap.caps && ceph_snap(inode) == CEPH_NOSNAP)
685 new_cap = ceph_get_cap(mdsc, caps_reservation);
686
601 /* 687 /*
602 * prealloc xattr data, if it looks like we'll need it. only 688 * prealloc xattr data, if it looks like we'll need it. only
603 * if len > 4 (meaning there are actually xattrs; the first 4 689 * if len > 4 (meaning there are actually xattrs; the first 4
@@ -623,19 +709,23 @@ static int fill_inode(struct inode *inode,
623 * 3 2 skip 709 * 3 2 skip
624 * 3 3 update 710 * 3 3 update
625 */ 711 */
626 if (le64_to_cpu(info->version) > 0 && 712 if (ci->i_version == 0 ||
627 (ci->i_version & ~1) >= le64_to_cpu(info->version)) 713 ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
628 goto no_change; 714 le64_to_cpu(info->version) > (ci->i_version & ~1)))
629 715 new_version = true;
716
630 issued = __ceph_caps_issued(ci, &implemented); 717 issued = __ceph_caps_issued(ci, &implemented);
631 issued |= implemented | __ceph_caps_dirty(ci); 718 issued |= implemented | __ceph_caps_dirty(ci);
719 new_issued = ~issued & le32_to_cpu(info->cap.caps);
632 720
633 /* update inode */ 721 /* update inode */
634 ci->i_version = le64_to_cpu(info->version); 722 ci->i_version = le64_to_cpu(info->version);
635 inode->i_version++; 723 inode->i_version++;
636 inode->i_rdev = le32_to_cpu(info->rdev); 724 inode->i_rdev = le32_to_cpu(info->rdev);
725 inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
637 726
638 if ((issued & CEPH_CAP_AUTH_EXCL) == 0) { 727 if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) &&
728 (issued & CEPH_CAP_AUTH_EXCL) == 0) {
639 inode->i_mode = le32_to_cpu(info->mode); 729 inode->i_mode = le32_to_cpu(info->mode);
640 inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(info->uid)); 730 inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(info->uid));
641 inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(info->gid)); 731 inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(info->gid));
@@ -644,23 +734,35 @@ static int fill_inode(struct inode *inode,
644 from_kgid(&init_user_ns, inode->i_gid)); 734 from_kgid(&init_user_ns, inode->i_gid));
645 } 735 }
646 736
647 if ((issued & CEPH_CAP_LINK_EXCL) == 0) 737 if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) &&
738 (issued & CEPH_CAP_LINK_EXCL) == 0)
648 set_nlink(inode, le32_to_cpu(info->nlink)); 739 set_nlink(inode, le32_to_cpu(info->nlink));
649 740
650 /* be careful with mtime, atime, size */ 741 if (new_version || (new_issued & CEPH_CAP_ANY_RD)) {
651 ceph_decode_timespec(&atime, &info->atime); 742 /* be careful with mtime, atime, size */
652 ceph_decode_timespec(&mtime, &info->mtime); 743 ceph_decode_timespec(&atime, &info->atime);
653 ceph_decode_timespec(&ctime, &info->ctime); 744 ceph_decode_timespec(&mtime, &info->mtime);
654 queue_trunc = ceph_fill_file_size(inode, issued, 745 ceph_decode_timespec(&ctime, &info->ctime);
655 le32_to_cpu(info->truncate_seq), 746 ceph_fill_file_time(inode, issued,
656 le64_to_cpu(info->truncate_size), 747 le32_to_cpu(info->time_warp_seq),
657 le64_to_cpu(info->size)); 748 &ctime, &mtime, &atime);
658 ceph_fill_file_time(inode, issued, 749 }
659 le32_to_cpu(info->time_warp_seq), 750
660 &ctime, &mtime, &atime); 751 if (new_version ||
661 752 (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
662 ci->i_layout = info->layout; 753 ci->i_layout = info->layout;
663 inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; 754 queue_trunc = ceph_fill_file_size(inode, issued,
755 le32_to_cpu(info->truncate_seq),
756 le64_to_cpu(info->truncate_size),
757 le64_to_cpu(info->size));
758 /* only update max_size on auth cap */
759 if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
760 ci->i_max_size != le64_to_cpu(info->max_size)) {
761 dout("max_size %lld -> %llu\n", ci->i_max_size,
762 le64_to_cpu(info->max_size));
763 ci->i_max_size = le64_to_cpu(info->max_size);
764 }
765 }
664 766
665 /* xattrs */ 767 /* xattrs */
666 /* note that if i_xattrs.len <= 4, i_xattrs.data will still be NULL. */ 768 /* note that if i_xattrs.len <= 4, i_xattrs.data will still be NULL. */
@@ -745,58 +847,6 @@ static int fill_inode(struct inode *inode,
745 dout(" marking %p complete (empty)\n", inode); 847 dout(" marking %p complete (empty)\n", inode);
746 __ceph_dir_set_complete(ci, atomic_read(&ci->i_release_count)); 848 __ceph_dir_set_complete(ci, atomic_read(&ci->i_release_count));
747 } 849 }
748no_change:
749 /* only update max_size on auth cap */
750 if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
751 ci->i_max_size != le64_to_cpu(info->max_size)) {
752 dout("max_size %lld -> %llu\n", ci->i_max_size,
753 le64_to_cpu(info->max_size));
754 ci->i_max_size = le64_to_cpu(info->max_size);
755 }
756
757 spin_unlock(&ci->i_ceph_lock);
758
759 /* queue truncate if we saw i_size decrease */
760 if (queue_trunc)
761 ceph_queue_vmtruncate(inode);
762
763 /* populate frag tree */
764 /* FIXME: move me up, if/when version reflects fragtree changes */
765 nsplits = le32_to_cpu(info->fragtree.nsplits);
766 mutex_lock(&ci->i_fragtree_mutex);
767 rb_node = rb_first(&ci->i_fragtree);
768 for (i = 0; i < nsplits; i++) {
769 u32 id = le32_to_cpu(info->fragtree.splits[i].frag);
770 frag = NULL;
771 while (rb_node) {
772 frag = rb_entry(rb_node, struct ceph_inode_frag, node);
773 if (ceph_frag_compare(frag->frag, id) >= 0) {
774 if (frag->frag != id)
775 frag = NULL;
776 else
777 rb_node = rb_next(rb_node);
778 break;
779 }
780 rb_node = rb_next(rb_node);
781 rb_erase(&frag->node, &ci->i_fragtree);
782 kfree(frag);
783 frag = NULL;
784 }
785 if (!frag) {
786 frag = __get_or_create_frag(ci, id);
787 if (IS_ERR(frag))
788 continue;
789 }
790 frag->split_by = le32_to_cpu(info->fragtree.splits[i].by);
791 dout(" frag %x split by %d\n", frag->frag, frag->split_by);
792 }
793 while (rb_node) {
794 frag = rb_entry(rb_node, struct ceph_inode_frag, node);
795 rb_node = rb_next(rb_node);
796 rb_erase(&frag->node, &ci->i_fragtree);
797 kfree(frag);
798 }
799 mutex_unlock(&ci->i_fragtree_mutex);
800 850
801 /* were we issued a capability? */ 851 /* were we issued a capability? */
802 if (info->cap.caps) { 852 if (info->cap.caps) {
@@ -809,30 +859,41 @@ no_change:
809 le32_to_cpu(info->cap.seq), 859 le32_to_cpu(info->cap.seq),
810 le32_to_cpu(info->cap.mseq), 860 le32_to_cpu(info->cap.mseq),
811 le64_to_cpu(info->cap.realm), 861 le64_to_cpu(info->cap.realm),
812 info->cap.flags, 862 info->cap.flags, &new_cap);
813 caps_reservation); 863 wake = true;
814 } else { 864 } else {
815 spin_lock(&ci->i_ceph_lock);
816 dout(" %p got snap_caps %s\n", inode, 865 dout(" %p got snap_caps %s\n", inode,
817 ceph_cap_string(le32_to_cpu(info->cap.caps))); 866 ceph_cap_string(le32_to_cpu(info->cap.caps)));
818 ci->i_snap_caps |= le32_to_cpu(info->cap.caps); 867 ci->i_snap_caps |= le32_to_cpu(info->cap.caps);
819 if (cap_fmode >= 0) 868 if (cap_fmode >= 0)
820 __ceph_get_fmode(ci, cap_fmode); 869 __ceph_get_fmode(ci, cap_fmode);
821 spin_unlock(&ci->i_ceph_lock);
822 } 870 }
823 } else if (cap_fmode >= 0) { 871 } else if (cap_fmode >= 0) {
824 pr_warning("mds issued no caps on %llx.%llx\n", 872 pr_warn("mds issued no caps on %llx.%llx\n",
825 ceph_vinop(inode)); 873 ceph_vinop(inode));
826 __ceph_get_fmode(ci, cap_fmode); 874 __ceph_get_fmode(ci, cap_fmode);
827 } 875 }
876 spin_unlock(&ci->i_ceph_lock);
877
878 if (wake)
879 wake_up_all(&ci->i_cap_wq);
880
881 /* queue truncate if we saw i_size decrease */
882 if (queue_trunc)
883 ceph_queue_vmtruncate(inode);
884
885 /* populate frag tree */
886 if (S_ISDIR(inode->i_mode))
887 ceph_fill_fragtree(inode, &info->fragtree, dirinfo);
828 888
829 /* update delegation info? */ 889 /* update delegation info? */
830 if (dirinfo) 890 if (dirinfo)
831 ceph_fill_dirfrag(inode, dirinfo); 891 ceph_fill_dirfrag(inode, dirinfo);
832 892
833 err = 0; 893 err = 0;
834
835out: 894out:
895 if (new_cap)
896 ceph_put_cap(mdsc, new_cap);
836 if (xattr_blob) 897 if (xattr_blob)
837 ceph_buffer_put(xattr_blob); 898 ceph_buffer_put(xattr_blob);
838 return err; 899 return err;
@@ -1485,7 +1546,7 @@ static void ceph_invalidate_work(struct work_struct *work)
1485 orig_gen = ci->i_rdcache_gen; 1546 orig_gen = ci->i_rdcache_gen;
1486 spin_unlock(&ci->i_ceph_lock); 1547 spin_unlock(&ci->i_ceph_lock);
1487 1548
1488 truncate_inode_pages(inode->i_mapping, 0); 1549 truncate_pagecache(inode, 0);
1489 1550
1490 spin_lock(&ci->i_ceph_lock); 1551 spin_lock(&ci->i_ceph_lock);
1491 if (orig_gen == ci->i_rdcache_gen && 1552 if (orig_gen == ci->i_rdcache_gen &&
@@ -1588,7 +1649,7 @@ retry:
1588 ci->i_truncate_pending, to); 1649 ci->i_truncate_pending, to);
1589 spin_unlock(&ci->i_ceph_lock); 1650 spin_unlock(&ci->i_ceph_lock);
1590 1651
1591 truncate_inode_pages(inode->i_mapping, to); 1652 truncate_pagecache(inode, to);
1592 1653
1593 spin_lock(&ci->i_ceph_lock); 1654 spin_lock(&ci->i_ceph_lock);
1594 if (to == ci->i_truncate_size) { 1655 if (to == ci->i_truncate_size) {