summaryrefslogtreecommitdiffstats
path: root/fs/ceph/inode.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ceph/inode.c')
-rw-r--r-- fs/ceph/inode.c 159
1 files changed, 124 insertions, 35 deletions
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index e669cfa9d793..f059b5997072 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -11,6 +11,7 @@
11#include <linux/xattr.h> 11#include <linux/xattr.h>
12#include <linux/posix_acl.h> 12#include <linux/posix_acl.h>
13#include <linux/random.h> 13#include <linux/random.h>
14#include <linux/sort.h>
14 15
15#include "super.h" 16#include "super.h"
16#include "mds_client.h" 17#include "mds_client.h"
@@ -254,6 +255,9 @@ static int ceph_fill_dirfrag(struct inode *inode,
254 diri_auth = ci->i_auth_cap->mds; 255 diri_auth = ci->i_auth_cap->mds;
255 spin_unlock(&ci->i_ceph_lock); 256 spin_unlock(&ci->i_ceph_lock);
256 257
258 if (mds == -1) /* CDIR_AUTH_PARENT */
259 mds = diri_auth;
260
257 mutex_lock(&ci->i_fragtree_mutex); 261 mutex_lock(&ci->i_fragtree_mutex);
258 if (ndist == 0 && mds == diri_auth) { 262 if (ndist == 0 && mds == diri_auth) {
259 /* no delegation info needed. */ 263 /* no delegation info needed. */
@@ -300,20 +304,38 @@ out:
300 return err; 304 return err;
301} 305}
302 306
307static int frag_tree_split_cmp(const void *l, const void *r)
308{
309 struct ceph_frag_tree_split *ls = (struct ceph_frag_tree_split*)l;
310 struct ceph_frag_tree_split *rs = (struct ceph_frag_tree_split*)r;
311 return ceph_frag_compare(ls->frag, rs->frag);
312}
313
314static bool is_frag_child(u32 f, struct ceph_inode_frag *frag)
315{
316 if (!frag)
317 return f == ceph_frag_make(0, 0);
318 if (ceph_frag_bits(f) != ceph_frag_bits(frag->frag) + frag->split_by)
319 return false;
320 return ceph_frag_contains_value(frag->frag, ceph_frag_value(f));
321}
322
303static int ceph_fill_fragtree(struct inode *inode, 323static int ceph_fill_fragtree(struct inode *inode,
304 struct ceph_frag_tree_head *fragtree, 324 struct ceph_frag_tree_head *fragtree,
305 struct ceph_mds_reply_dirfrag *dirinfo) 325 struct ceph_mds_reply_dirfrag *dirinfo)
306{ 326{
307 struct ceph_inode_info *ci = ceph_inode(inode); 327 struct ceph_inode_info *ci = ceph_inode(inode);
308 struct ceph_inode_frag *frag; 328 struct ceph_inode_frag *frag, *prev_frag = NULL;
309 struct rb_node *rb_node; 329 struct rb_node *rb_node;
310 int i; 330 unsigned i, split_by, nsplits;
311 u32 id, nsplits; 331 u32 id;
312 bool update = false; 332 bool update = false;
313 333
314 mutex_lock(&ci->i_fragtree_mutex); 334 mutex_lock(&ci->i_fragtree_mutex);
315 nsplits = le32_to_cpu(fragtree->nsplits); 335 nsplits = le32_to_cpu(fragtree->nsplits);
316 if (nsplits) { 336 if (nsplits != ci->i_fragtree_nsplits) {
337 update = true;
338 } else if (nsplits) {
317 i = prandom_u32() % nsplits; 339 i = prandom_u32() % nsplits;
318 id = le32_to_cpu(fragtree->splits[i].frag); 340 id = le32_to_cpu(fragtree->splits[i].frag);
319 if (!__ceph_find_frag(ci, id)) 341 if (!__ceph_find_frag(ci, id))
@@ -332,10 +354,22 @@ static int ceph_fill_fragtree(struct inode *inode,
332 if (!update) 354 if (!update)
333 goto out_unlock; 355 goto out_unlock;
334 356
357 if (nsplits > 1) {
358 sort(fragtree->splits, nsplits, sizeof(fragtree->splits[0]),
359 frag_tree_split_cmp, NULL);
360 }
361
335 dout("fill_fragtree %llx.%llx\n", ceph_vinop(inode)); 362 dout("fill_fragtree %llx.%llx\n", ceph_vinop(inode));
336 rb_node = rb_first(&ci->i_fragtree); 363 rb_node = rb_first(&ci->i_fragtree);
337 for (i = 0; i < nsplits; i++) { 364 for (i = 0; i < nsplits; i++) {
338 id = le32_to_cpu(fragtree->splits[i].frag); 365 id = le32_to_cpu(fragtree->splits[i].frag);
366 split_by = le32_to_cpu(fragtree->splits[i].by);
367 if (split_by == 0 || ceph_frag_bits(id) + split_by > 24) {
368 pr_err("fill_fragtree %llx.%llx invalid split %d/%u, "
369 "frag %x split by %d\n", ceph_vinop(inode),
370 i, nsplits, id, split_by);
371 continue;
372 }
339 frag = NULL; 373 frag = NULL;
340 while (rb_node) { 374 while (rb_node) {
341 frag = rb_entry(rb_node, struct ceph_inode_frag, node); 375 frag = rb_entry(rb_node, struct ceph_inode_frag, node);
@@ -347,8 +381,14 @@ static int ceph_fill_fragtree(struct inode *inode,
347 break; 381 break;
348 } 382 }
349 rb_node = rb_next(rb_node); 383 rb_node = rb_next(rb_node);
350 rb_erase(&frag->node, &ci->i_fragtree); 384 /* delete stale split/leaf node */
351 kfree(frag); 385 if (frag->split_by > 0 ||
386 !is_frag_child(frag->frag, prev_frag)) {
387 rb_erase(&frag->node, &ci->i_fragtree);
388 if (frag->split_by > 0)
389 ci->i_fragtree_nsplits--;
390 kfree(frag);
391 }
352 frag = NULL; 392 frag = NULL;
353 } 393 }
354 if (!frag) { 394 if (!frag) {
@@ -356,14 +396,23 @@ static int ceph_fill_fragtree(struct inode *inode,
356 if (IS_ERR(frag)) 396 if (IS_ERR(frag))
357 continue; 397 continue;
358 } 398 }
359 frag->split_by = le32_to_cpu(fragtree->splits[i].by); 399 if (frag->split_by == 0)
400 ci->i_fragtree_nsplits++;
401 frag->split_by = split_by;
360 dout(" frag %x split by %d\n", frag->frag, frag->split_by); 402 dout(" frag %x split by %d\n", frag->frag, frag->split_by);
403 prev_frag = frag;
361 } 404 }
362 while (rb_node) { 405 while (rb_node) {
363 frag = rb_entry(rb_node, struct ceph_inode_frag, node); 406 frag = rb_entry(rb_node, struct ceph_inode_frag, node);
364 rb_node = rb_next(rb_node); 407 rb_node = rb_next(rb_node);
365 rb_erase(&frag->node, &ci->i_fragtree); 408 /* delete stale split/leaf node */
366 kfree(frag); 409 if (frag->split_by > 0 ||
410 !is_frag_child(frag->frag, prev_frag)) {
411 rb_erase(&frag->node, &ci->i_fragtree);
412 if (frag->split_by > 0)
413 ci->i_fragtree_nsplits--;
414 kfree(frag);
415 }
367 } 416 }
368out_unlock: 417out_unlock:
369 mutex_unlock(&ci->i_fragtree_mutex); 418 mutex_unlock(&ci->i_fragtree_mutex);
@@ -513,6 +562,7 @@ void ceph_destroy_inode(struct inode *inode)
513 rb_erase(n, &ci->i_fragtree); 562 rb_erase(n, &ci->i_fragtree);
514 kfree(frag); 563 kfree(frag);
515 } 564 }
565 ci->i_fragtree_nsplits = 0;
516 566
517 __ceph_destroy_xattrs(ci); 567 __ceph_destroy_xattrs(ci);
518 if (ci->i_xattrs.blob) 568 if (ci->i_xattrs.blob)
@@ -533,6 +583,11 @@ int ceph_drop_inode(struct inode *inode)
533 return 1; 583 return 1;
534} 584}
535 585
586static inline blkcnt_t calc_inode_blocks(u64 size)
587{
588 return (size + (1<<9) - 1) >> 9;
589}
590
536/* 591/*
537 * Helpers to fill in size, ctime, mtime, and atime. We have to be 592 * Helpers to fill in size, ctime, mtime, and atime. We have to be
538 * careful because either the client or MDS may have more up to date 593 * careful because either the client or MDS may have more up to date
@@ -555,7 +610,7 @@ int ceph_fill_file_size(struct inode *inode, int issued,
555 size = 0; 610 size = 0;
556 } 611 }
557 i_size_write(inode, size); 612 i_size_write(inode, size);
558 inode->i_blocks = (size + (1<<9) - 1) >> 9; 613 inode->i_blocks = calc_inode_blocks(size);
559 ci->i_reported_size = size; 614 ci->i_reported_size = size;
560 if (truncate_seq != ci->i_truncate_seq) { 615 if (truncate_seq != ci->i_truncate_seq) {
561 dout("truncate_seq %u -> %u\n", 616 dout("truncate_seq %u -> %u\n",
@@ -814,9 +869,13 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
814 869
815 spin_unlock(&ci->i_ceph_lock); 870 spin_unlock(&ci->i_ceph_lock);
816 871
817 err = -EINVAL; 872 if (symlen != i_size_read(inode)) {
818 if (WARN_ON(symlen != i_size_read(inode))) 873 pr_err("fill_inode %llx.%llx BAD symlink "
819 goto out; 874 "size %lld\n", ceph_vinop(inode),
875 i_size_read(inode));
876 i_size_write(inode, symlen);
877 inode->i_blocks = calc_inode_blocks(symlen);
878 }
820 879
821 err = -ENOMEM; 880 err = -ENOMEM;
822 sym = kstrndup(iinfo->symlink, symlen, GFP_NOFS); 881 sym = kstrndup(iinfo->symlink, symlen, GFP_NOFS);
@@ -1309,12 +1368,13 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
1309 int i, err = 0; 1368 int i, err = 0;
1310 1369
1311 for (i = 0; i < rinfo->dir_nr; i++) { 1370 for (i = 0; i < rinfo->dir_nr; i++) {
1371 struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
1312 struct ceph_vino vino; 1372 struct ceph_vino vino;
1313 struct inode *in; 1373 struct inode *in;
1314 int rc; 1374 int rc;
1315 1375
1316 vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino); 1376 vino.ino = le64_to_cpu(rde->inode.in->ino);
1317 vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid); 1377 vino.snap = le64_to_cpu(rde->inode.in->snapid);
1318 1378
1319 in = ceph_get_inode(req->r_dentry->d_sb, vino); 1379 in = ceph_get_inode(req->r_dentry->d_sb, vino);
1320 if (IS_ERR(in)) { 1380 if (IS_ERR(in)) {
@@ -1322,14 +1382,14 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
1322 dout("new_inode badness got %d\n", err); 1382 dout("new_inode badness got %d\n", err);
1323 continue; 1383 continue;
1324 } 1384 }
1325 rc = fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session, 1385 rc = fill_inode(in, NULL, &rde->inode, NULL, session,
1326 req->r_request_started, -1, 1386 req->r_request_started, -1,
1327 &req->r_caps_reservation); 1387 &req->r_caps_reservation);
1328 if (rc < 0) { 1388 if (rc < 0) {
1329 pr_err("fill_inode badness on %p got %d\n", in, rc); 1389 pr_err("fill_inode badness on %p got %d\n", in, rc);
1330 err = rc; 1390 err = rc;
1331 continue;
1332 } 1391 }
1392 iput(in);
1333 } 1393 }
1334 1394
1335 return err; 1395 return err;
@@ -1387,6 +1447,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
1387 struct ceph_mds_session *session) 1447 struct ceph_mds_session *session)
1388{ 1448{
1389 struct dentry *parent = req->r_dentry; 1449 struct dentry *parent = req->r_dentry;
1450 struct ceph_inode_info *ci = ceph_inode(d_inode(parent));
1390 struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; 1451 struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
1391 struct qstr dname; 1452 struct qstr dname;
1392 struct dentry *dn; 1453 struct dentry *dn;
@@ -1394,22 +1455,27 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
1394 int err = 0, skipped = 0, ret, i; 1455 int err = 0, skipped = 0, ret, i;
1395 struct inode *snapdir = NULL; 1456 struct inode *snapdir = NULL;
1396 struct ceph_mds_request_head *rhead = req->r_request->front.iov_base; 1457 struct ceph_mds_request_head *rhead = req->r_request->front.iov_base;
1397 struct ceph_dentry_info *di;
1398 u32 frag = le32_to_cpu(rhead->args.readdir.frag); 1458 u32 frag = le32_to_cpu(rhead->args.readdir.frag);
1459 u32 last_hash = 0;
1460 u32 fpos_offset;
1399 struct ceph_readdir_cache_control cache_ctl = {}; 1461 struct ceph_readdir_cache_control cache_ctl = {};
1400 1462
1401 if (req->r_aborted) 1463 if (req->r_aborted)
1402 return readdir_prepopulate_inodes_only(req, session); 1464 return readdir_prepopulate_inodes_only(req, session);
1403 1465
1466 if (rinfo->hash_order && req->r_path2) {
1467 last_hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash,
1468 req->r_path2, strlen(req->r_path2));
1469 last_hash = ceph_frag_value(last_hash);
1470 }
1471
1404 if (rinfo->dir_dir && 1472 if (rinfo->dir_dir &&
1405 le32_to_cpu(rinfo->dir_dir->frag) != frag) { 1473 le32_to_cpu(rinfo->dir_dir->frag) != frag) {
1406 dout("readdir_prepopulate got new frag %x -> %x\n", 1474 dout("readdir_prepopulate got new frag %x -> %x\n",
1407 frag, le32_to_cpu(rinfo->dir_dir->frag)); 1475 frag, le32_to_cpu(rinfo->dir_dir->frag));
1408 frag = le32_to_cpu(rinfo->dir_dir->frag); 1476 frag = le32_to_cpu(rinfo->dir_dir->frag);
1409 if (ceph_frag_is_leftmost(frag)) 1477 if (!rinfo->hash_order)
1410 req->r_readdir_offset = 2; 1478 req->r_readdir_offset = 2;
1411 else
1412 req->r_readdir_offset = 0;
1413 } 1479 }
1414 1480
1415 if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) { 1481 if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) {
@@ -1427,24 +1493,37 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
1427 if (ceph_frag_is_leftmost(frag) && req->r_readdir_offset == 2) { 1493 if (ceph_frag_is_leftmost(frag) && req->r_readdir_offset == 2) {
1428 /* note dir version at start of readdir so we can tell 1494 /* note dir version at start of readdir so we can tell
1429 * if any dentries get dropped */ 1495 * if any dentries get dropped */
1430 struct ceph_inode_info *ci = ceph_inode(d_inode(parent));
1431 req->r_dir_release_cnt = atomic64_read(&ci->i_release_count); 1496 req->r_dir_release_cnt = atomic64_read(&ci->i_release_count);
1432 req->r_dir_ordered_cnt = atomic64_read(&ci->i_ordered_count); 1497 req->r_dir_ordered_cnt = atomic64_read(&ci->i_ordered_count);
1433 req->r_readdir_cache_idx = 0; 1498 req->r_readdir_cache_idx = 0;
1434 } 1499 }
1435 1500
1436 cache_ctl.index = req->r_readdir_cache_idx; 1501 cache_ctl.index = req->r_readdir_cache_idx;
1502 fpos_offset = req->r_readdir_offset;
1437 1503
1438 /* FIXME: release caps/leases if error occurs */ 1504 /* FIXME: release caps/leases if error occurs */
1439 for (i = 0; i < rinfo->dir_nr; i++) { 1505 for (i = 0; i < rinfo->dir_nr; i++) {
1506 struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
1440 struct ceph_vino vino; 1507 struct ceph_vino vino;
1441 1508
1442 dname.name = rinfo->dir_dname[i]; 1509 dname.name = rde->name;
1443 dname.len = rinfo->dir_dname_len[i]; 1510 dname.len = rde->name_len;
1444 dname.hash = full_name_hash(dname.name, dname.len); 1511 dname.hash = full_name_hash(dname.name, dname.len);
1445 1512
1446 vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino); 1513 vino.ino = le64_to_cpu(rde->inode.in->ino);
1447 vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid); 1514 vino.snap = le64_to_cpu(rde->inode.in->snapid);
1515
1516 if (rinfo->hash_order) {
1517 u32 hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash,
1518 rde->name, rde->name_len);
1519 hash = ceph_frag_value(hash);
1520 if (hash != last_hash)
1521 fpos_offset = 2;
1522 last_hash = hash;
1523 rde->offset = ceph_make_fpos(hash, fpos_offset++, true);
1524 } else {
1525 rde->offset = ceph_make_fpos(frag, fpos_offset++, false);
1526 }
1448 1527
1449retry_lookup: 1528retry_lookup:
1450 dn = d_lookup(parent, &dname); 1529 dn = d_lookup(parent, &dname);
@@ -1490,7 +1569,7 @@ retry_lookup:
1490 } 1569 }
1491 } 1570 }
1492 1571
1493 ret = fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session, 1572 ret = fill_inode(in, NULL, &rde->inode, NULL, session,
1494 req->r_request_started, -1, 1573 req->r_request_started, -1,
1495 &req->r_caps_reservation); 1574 &req->r_caps_reservation);
1496 if (ret < 0) { 1575 if (ret < 0) {
@@ -1523,11 +1602,9 @@ retry_lookup:
1523 dn = realdn; 1602 dn = realdn;
1524 } 1603 }
1525 1604
1526 di = dn->d_fsdata; 1605 ceph_dentry(dn)->offset = rde->offset;
1527 di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset);
1528 1606
1529 update_dentry_lease(dn, rinfo->dir_dlease[i], 1607 update_dentry_lease(dn, rde->lease, req->r_session,
1530 req->r_session,
1531 req->r_request_started); 1608 req->r_request_started);
1532 1609
1533 if (err == 0 && skipped == 0 && cache_ctl.index >= 0) { 1610 if (err == 0 && skipped == 0 && cache_ctl.index >= 0) {
@@ -1562,7 +1639,7 @@ int ceph_inode_set_size(struct inode *inode, loff_t size)
1562 spin_lock(&ci->i_ceph_lock); 1639 spin_lock(&ci->i_ceph_lock);
1563 dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size); 1640 dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size);
1564 i_size_write(inode, size); 1641 i_size_write(inode, size);
1565 inode->i_blocks = (size + (1 << 9) - 1) >> 9; 1642 inode->i_blocks = calc_inode_blocks(size);
1566 1643
1567 /* tell the MDS if we are approaching max_size */ 1644 /* tell the MDS if we are approaching max_size */
1568 if ((size << 1) >= ci->i_max_size && 1645 if ((size << 1) >= ci->i_max_size &&
@@ -1624,10 +1701,21 @@ static void ceph_invalidate_work(struct work_struct *work)
1624 struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info, 1701 struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
1625 i_pg_inv_work); 1702 i_pg_inv_work);
1626 struct inode *inode = &ci->vfs_inode; 1703 struct inode *inode = &ci->vfs_inode;
1704 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
1627 u32 orig_gen; 1705 u32 orig_gen;
1628 int check = 0; 1706 int check = 0;
1629 1707
1630 mutex_lock(&ci->i_truncate_mutex); 1708 mutex_lock(&ci->i_truncate_mutex);
1709
1710 if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
1711 pr_warn_ratelimited("invalidate_pages %p %lld forced umount\n",
1712 inode, ceph_ino(inode));
1713 mapping_set_error(inode->i_mapping, -EIO);
1714 truncate_pagecache(inode, 0);
1715 mutex_unlock(&ci->i_truncate_mutex);
1716 goto out;
1717 }
1718
1631 spin_lock(&ci->i_ceph_lock); 1719 spin_lock(&ci->i_ceph_lock);
1632 dout("invalidate_pages %p gen %d revoking %d\n", inode, 1720 dout("invalidate_pages %p gen %d revoking %d\n", inode,
1633 ci->i_rdcache_gen, ci->i_rdcache_revoking); 1721 ci->i_rdcache_gen, ci->i_rdcache_revoking);
@@ -1641,7 +1729,9 @@ static void ceph_invalidate_work(struct work_struct *work)
1641 orig_gen = ci->i_rdcache_gen; 1729 orig_gen = ci->i_rdcache_gen;
1642 spin_unlock(&ci->i_ceph_lock); 1730 spin_unlock(&ci->i_ceph_lock);
1643 1731
1644 truncate_pagecache(inode, 0); 1732 if (invalidate_inode_pages2(inode->i_mapping) < 0) {
1733 pr_err("invalidate_pages %p fails\n", inode);
1734 }
1645 1735
1646 spin_lock(&ci->i_ceph_lock); 1736 spin_lock(&ci->i_ceph_lock);
1647 if (orig_gen == ci->i_rdcache_gen && 1737 if (orig_gen == ci->i_rdcache_gen &&
@@ -1920,8 +2010,7 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
1920 if ((issued & CEPH_CAP_FILE_EXCL) && 2010 if ((issued & CEPH_CAP_FILE_EXCL) &&
1921 attr->ia_size > inode->i_size) { 2011 attr->ia_size > inode->i_size) {
1922 i_size_write(inode, attr->ia_size); 2012 i_size_write(inode, attr->ia_size);
1923 inode->i_blocks = 2013 inode->i_blocks = calc_inode_blocks(attr->ia_size);
1924 (attr->ia_size + (1 << 9) - 1) >> 9;
1925 inode->i_ctime = attr->ia_ctime; 2014 inode->i_ctime = attr->ia_ctime;
1926 ci->i_reported_size = attr->ia_size; 2015 ci->i_reported_size = attr->ia_size;
1927 dirtied |= CEPH_CAP_FILE_EXCL; 2016 dirtied |= CEPH_CAP_FILE_EXCL;