author    Ingo Molnar <mingo@elte.hu>  2010-07-21 15:43:03 -0400
committer Ingo Molnar <mingo@elte.hu>  2010-07-21 15:43:06 -0400
commit    9dcdbf7a33d9018ac5d45debcf261be648bdd56a (patch)
tree      bbcc1a018f11ff76cd7ce174ef3ffe2c02da07ee /fs
parent    cc5edb0eb9ce892b530e34a5d110382483587942 (diff)
parent    cd5b8f8755a89a57fc8c408d284b8b613f090345 (diff)
Merge branch 'linus' into perf/core
Merge reason: Pick up the latest perf fixes.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'fs')
-rw-r--r--  fs/afs/write.c | 1
-rw-r--r--  fs/btrfs/ctree.c | 129
-rw-r--r--  fs/btrfs/extent_io.c | 2
-rw-r--r--  fs/btrfs/ioctl.c | 20
-rw-r--r--  fs/ceph/auth_x.c | 5
-rw-r--r--  fs/ceph/caps.c | 21
-rw-r--r--  fs/ceph/crush/mapper.c | 41
-rw-r--r--  fs/ceph/debugfs.c | 2
-rw-r--r--  fs/ceph/inode.c | 19
-rw-r--r--  fs/ceph/mds_client.c | 41
-rw-r--r--  fs/ceph/mds_client.h | 1
-rw-r--r--  fs/ceph/messenger.c | 75
-rw-r--r--  fs/ceph/mon_client.c | 3
-rw-r--r--  fs/ceph/osd_client.c | 3
-rw-r--r--  fs/ceph/osdmap.c | 2
-rw-r--r--  fs/dcache.c | 2
-rw-r--r--  fs/fs-writeback.c | 335
-rw-r--r--  fs/gfs2/bmap.c | 1
-rw-r--r--  fs/gfs2/dir.c | 2
-rw-r--r--  fs/gfs2/glock.c | 12
-rw-r--r--  fs/gfs2/inode.c | 12
-rw-r--r--  fs/gfs2/quota.c | 10
-rw-r--r--  fs/gfs2/quota.h | 2
-rw-r--r--  fs/inode.c | 2
-rw-r--r--  fs/jbd2/journal.c | 15
-rw-r--r--  fs/jbd2/transaction.c | 9
-rw-r--r--  fs/jffs2/xattr.c | 2
-rw-r--r--  fs/mbcache.c | 5
-rw-r--r--  fs/nfs/dir.c | 2
-rw-r--r--  fs/nfs/internal.h | 3
-rw-r--r--  fs/ocfs2/aops.c | 94
-rw-r--r--  fs/ocfs2/dlm/dlmdomain.c | 3
-rw-r--r--  fs/ocfs2/dlm/dlmmaster.c | 22
-rw-r--r--  fs/ocfs2/dlm/dlmrecovery.c | 2
-rw-r--r--  fs/ocfs2/file.c | 309
-rw-r--r--  fs/ocfs2/file.h | 6
-rw-r--r--  fs/ocfs2/journal.c | 30
-rw-r--r--  fs/ocfs2/localalloc.c | 7
-rw-r--r--  fs/ocfs2/quota_global.c | 2
-rw-r--r--  fs/ocfs2/quota_local.c | 4
-rw-r--r--  fs/ocfs2/refcounttree.c | 12
-rw-r--r--  fs/ocfs2/suballoc.c | 2
-rw-r--r--  fs/ocfs2/xattr.c | 200
-rw-r--r--  fs/partitions/ibm.c | 14
-rw-r--r--  fs/quota/dquot.c | 2
-rw-r--r--  fs/splice.c | 9
-rw-r--r--  fs/ubifs/shrinker.c | 2
-rw-r--r--  fs/ubifs/ubifs.h | 2
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.c | 5
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.c | 2
-rw-r--r--  fs/xfs/linux-2.6/xfs_sync.c | 130
-rw-r--r--  fs/xfs/linux-2.6/xfs_sync.h | 2
-rw-r--r--  fs/xfs/linux-2.6/xfs_trace.h | 3
-rw-r--r--  fs/xfs/quota/xfs_qm.c | 7
-rw-r--r--  fs/xfs/xfs_mount.h | 2
55 files changed, 1019 insertions, 633 deletions
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 3dab9e9948d0..722743b152d8 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -680,7 +680,6 @@ int afs_writeback_all(struct afs_vnode *vnode)
680{ 680{
681 struct address_space *mapping = vnode->vfs_inode.i_mapping; 681 struct address_space *mapping = vnode->vfs_inode.i_mapping;
682 struct writeback_control wbc = { 682 struct writeback_control wbc = {
683 .bdi = mapping->backing_dev_info,
684 .sync_mode = WB_SYNC_ALL, 683 .sync_mode = WB_SYNC_ALL,
685 .nr_to_write = LONG_MAX, 684 .nr_to_write = LONG_MAX,
686 .range_cyclic = 1, 685 .range_cyclic = 1,
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 0d1d966b0fe4..c3df14ce2cc2 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2304,12 +2304,17 @@ noinline int btrfs_leaf_free_space(struct btrfs_root *root,
2304 return ret; 2304 return ret;
2305} 2305}
2306 2306
2307/*
2308 * min slot controls the lowest index we're willing to push to the
2309 * right. We'll push up to and including min_slot, but no lower
2310 */
2307static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, 2311static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
2308 struct btrfs_root *root, 2312 struct btrfs_root *root,
2309 struct btrfs_path *path, 2313 struct btrfs_path *path,
2310 int data_size, int empty, 2314 int data_size, int empty,
2311 struct extent_buffer *right, 2315 struct extent_buffer *right,
2312 int free_space, u32 left_nritems) 2316 int free_space, u32 left_nritems,
2317 u32 min_slot)
2313{ 2318{
2314 struct extent_buffer *left = path->nodes[0]; 2319 struct extent_buffer *left = path->nodes[0];
2315 struct extent_buffer *upper = path->nodes[1]; 2320 struct extent_buffer *upper = path->nodes[1];
@@ -2327,7 +2332,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
2327 if (empty) 2332 if (empty)
2328 nr = 0; 2333 nr = 0;
2329 else 2334 else
2330 nr = 1; 2335 nr = max_t(u32, 1, min_slot);
2331 2336
2332 if (path->slots[0] >= left_nritems) 2337 if (path->slots[0] >= left_nritems)
2333 push_space += data_size; 2338 push_space += data_size;
@@ -2469,10 +2474,14 @@ out_unlock:
2469 * 2474 *
2470 * returns 1 if the push failed because the other node didn't have enough 2475 * returns 1 if the push failed because the other node didn't have enough
2471 * room, 0 if everything worked out and < 0 if there were major errors. 2476 * room, 0 if everything worked out and < 0 if there were major errors.
2477 *
2478 * this will push starting from min_slot to the end of the leaf. It won't
2479 * push any slot lower than min_slot
2472 */ 2480 */
2473static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root 2481static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2474 *root, struct btrfs_path *path, int data_size, 2482 *root, struct btrfs_path *path,
2475 int empty) 2483 int min_data_size, int data_size,
2484 int empty, u32 min_slot)
2476{ 2485{
2477 struct extent_buffer *left = path->nodes[0]; 2486 struct extent_buffer *left = path->nodes[0];
2478 struct extent_buffer *right; 2487 struct extent_buffer *right;
@@ -2514,8 +2523,8 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2514 if (left_nritems == 0) 2523 if (left_nritems == 0)
2515 goto out_unlock; 2524 goto out_unlock;
2516 2525
2517 return __push_leaf_right(trans, root, path, data_size, empty, 2526 return __push_leaf_right(trans, root, path, min_data_size, empty,
2518 right, free_space, left_nritems); 2527 right, free_space, left_nritems, min_slot);
2519out_unlock: 2528out_unlock:
2520 btrfs_tree_unlock(right); 2529 btrfs_tree_unlock(right);
2521 free_extent_buffer(right); 2530 free_extent_buffer(right);
@@ -2525,12 +2534,17 @@ out_unlock:
2525/* 2534/*
2526 * push some data in the path leaf to the left, trying to free up at 2535 * push some data in the path leaf to the left, trying to free up at
2527 * least data_size bytes. returns zero if the push worked, nonzero otherwise 2536 * least data_size bytes. returns zero if the push worked, nonzero otherwise
2537 *
2538 * max_slot can put a limit on how far into the leaf we'll push items. The
2539 * item at 'max_slot' won't be touched. Use (u32)-1 to make us do all the
2540 * items
2528 */ 2541 */
2529static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, 2542static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
2530 struct btrfs_root *root, 2543 struct btrfs_root *root,
2531 struct btrfs_path *path, int data_size, 2544 struct btrfs_path *path, int data_size,
2532 int empty, struct extent_buffer *left, 2545 int empty, struct extent_buffer *left,
2533 int free_space, int right_nritems) 2546 int free_space, u32 right_nritems,
2547 u32 max_slot)
2534{ 2548{
2535 struct btrfs_disk_key disk_key; 2549 struct btrfs_disk_key disk_key;
2536 struct extent_buffer *right = path->nodes[0]; 2550 struct extent_buffer *right = path->nodes[0];
@@ -2549,9 +2563,9 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
2549 slot = path->slots[1]; 2563 slot = path->slots[1];
2550 2564
2551 if (empty) 2565 if (empty)
2552 nr = right_nritems; 2566 nr = min(right_nritems, max_slot);
2553 else 2567 else
2554 nr = right_nritems - 1; 2568 nr = min(right_nritems - 1, max_slot);
2555 2569
2556 for (i = 0; i < nr; i++) { 2570 for (i = 0; i < nr; i++) {
2557 item = btrfs_item_nr(right, i); 2571 item = btrfs_item_nr(right, i);
@@ -2712,10 +2726,14 @@ out:
2712/* 2726/*
2713 * push some data in the path leaf to the left, trying to free up at 2727 * push some data in the path leaf to the left, trying to free up at
2714 * least data_size bytes. returns zero if the push worked, nonzero otherwise 2728 * least data_size bytes. returns zero if the push worked, nonzero otherwise
2729 *
2730 * max_slot can put a limit on how far into the leaf we'll push items. The
2731 * item at 'max_slot' won't be touched. Use (u32)-1 to make us push all the
2732 * items
2715 */ 2733 */
2716static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root 2734static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
2717 *root, struct btrfs_path *path, int data_size, 2735 *root, struct btrfs_path *path, int min_data_size,
2718 int empty) 2736 int data_size, int empty, u32 max_slot)
2719{ 2737{
2720 struct extent_buffer *right = path->nodes[0]; 2738 struct extent_buffer *right = path->nodes[0];
2721 struct extent_buffer *left; 2739 struct extent_buffer *left;
@@ -2761,8 +2779,9 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
2761 goto out; 2779 goto out;
2762 } 2780 }
2763 2781
2764 return __push_leaf_left(trans, root, path, data_size, 2782 return __push_leaf_left(trans, root, path, min_data_size,
2765 empty, left, free_space, right_nritems); 2783 empty, left, free_space, right_nritems,
2784 max_slot);
2766out: 2785out:
2767 btrfs_tree_unlock(left); 2786 btrfs_tree_unlock(left);
2768 free_extent_buffer(left); 2787 free_extent_buffer(left);
@@ -2855,6 +2874,64 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans,
2855} 2874}
2856 2875
2857/* 2876/*
2877 * double splits happen when we need to insert a big item in the middle
2878 * of a leaf. A double split can leave us with 3 mostly empty leaves:
2879 * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
2880 * A B C
2881 *
2882 * We avoid this by trying to push the items on either side of our target
2883 * into the adjacent leaves. If all goes well we can avoid the double split
2884 * completely.
2885 */
2886static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
2887 struct btrfs_root *root,
2888 struct btrfs_path *path,
2889 int data_size)
2890{
2891 int ret;
2892 int progress = 0;
2893 int slot;
2894 u32 nritems;
2895
2896 slot = path->slots[0];
2897
2898 /*
2899 * try to push all the items after our slot into the
2900 * right leaf
2901 */
2902 ret = push_leaf_right(trans, root, path, 1, data_size, 0, slot);
2903 if (ret < 0)
2904 return ret;
2905
2906 if (ret == 0)
2907 progress++;
2908
2909 nritems = btrfs_header_nritems(path->nodes[0]);
2910 /*
2911 * our goal is to get our slot at the start or end of a leaf. If
2912 * we've done so we're done
2913 */
2914 if (path->slots[0] == 0 || path->slots[0] == nritems)
2915 return 0;
2916
2917 if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
2918 return 0;
2919
2920 /* try to push all the items before our slot into the next leaf */
2921 slot = path->slots[0];
2922 ret = push_leaf_left(trans, root, path, 1, data_size, 0, slot);
2923 if (ret < 0)
2924 return ret;
2925
2926 if (ret == 0)
2927 progress++;
2928
2929 if (progress)
2930 return 0;
2931 return 1;
2932}
2933
2934/*
2858 * split the path's leaf in two, making sure there is at least data_size 2935 * split the path's leaf in two, making sure there is at least data_size
2859 * available for the resulting leaf level of the path. 2936 * available for the resulting leaf level of the path.
2860 * 2937 *
@@ -2876,6 +2953,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
2876 int wret; 2953 int wret;
2877 int split; 2954 int split;
2878 int num_doubles = 0; 2955 int num_doubles = 0;
2956 int tried_avoid_double = 0;
2879 2957
2880 l = path->nodes[0]; 2958 l = path->nodes[0];
2881 slot = path->slots[0]; 2959 slot = path->slots[0];
@@ -2884,12 +2962,14 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
2884 return -EOVERFLOW; 2962 return -EOVERFLOW;
2885 2963
2886 /* first try to make some room by pushing left and right */ 2964 /* first try to make some room by pushing left and right */
2887 if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) { 2965 if (data_size) {
2888 wret = push_leaf_right(trans, root, path, data_size, 0); 2966 wret = push_leaf_right(trans, root, path, data_size,
2967 data_size, 0, 0);
2889 if (wret < 0) 2968 if (wret < 0)
2890 return wret; 2969 return wret;
2891 if (wret) { 2970 if (wret) {
2892 wret = push_leaf_left(trans, root, path, data_size, 0); 2971 wret = push_leaf_left(trans, root, path, data_size,
2972 data_size, 0, (u32)-1);
2893 if (wret < 0) 2973 if (wret < 0)
2894 return wret; 2974 return wret;
2895 } 2975 }
@@ -2923,6 +3003,8 @@ again:
2923 if (mid != nritems && 3003 if (mid != nritems &&
2924 leaf_space_used(l, mid, nritems - mid) + 3004 leaf_space_used(l, mid, nritems - mid) +
2925 data_size > BTRFS_LEAF_DATA_SIZE(root)) { 3005 data_size > BTRFS_LEAF_DATA_SIZE(root)) {
3006 if (data_size && !tried_avoid_double)
3007 goto push_for_double;
2926 split = 2; 3008 split = 2;
2927 } 3009 }
2928 } 3010 }
@@ -2939,6 +3021,8 @@ again:
2939 if (mid != nritems && 3021 if (mid != nritems &&
2940 leaf_space_used(l, mid, nritems - mid) + 3022 leaf_space_used(l, mid, nritems - mid) +
2941 data_size > BTRFS_LEAF_DATA_SIZE(root)) { 3023 data_size > BTRFS_LEAF_DATA_SIZE(root)) {
3024 if (data_size && !tried_avoid_double)
3025 goto push_for_double;
2942 split = 2 ; 3026 split = 2 ;
2943 } 3027 }
2944 } 3028 }
@@ -3019,6 +3103,13 @@ again:
3019 } 3103 }
3020 3104
3021 return ret; 3105 return ret;
3106
3107push_for_double:
3108 push_for_double_split(trans, root, path, data_size);
3109 tried_avoid_double = 1;
3110 if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
3111 return 0;
3112 goto again;
3022} 3113}
3023 3114
3024static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, 3115static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
@@ -3915,13 +4006,15 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3915 extent_buffer_get(leaf); 4006 extent_buffer_get(leaf);
3916 4007
3917 btrfs_set_path_blocking(path); 4008 btrfs_set_path_blocking(path);
3918 wret = push_leaf_left(trans, root, path, 1, 1); 4009 wret = push_leaf_left(trans, root, path, 1, 1,
4010 1, (u32)-1);
3919 if (wret < 0 && wret != -ENOSPC) 4011 if (wret < 0 && wret != -ENOSPC)
3920 ret = wret; 4012 ret = wret;
3921 4013
3922 if (path->nodes[0] == leaf && 4014 if (path->nodes[0] == leaf &&
3923 btrfs_header_nritems(leaf)) { 4015 btrfs_header_nritems(leaf)) {
3924 wret = push_leaf_right(trans, root, path, 1, 1); 4016 wret = push_leaf_right(trans, root, path, 1,
4017 1, 1, 0);
3925 if (wret < 0 && wret != -ENOSPC) 4018 if (wret < 0 && wret != -ENOSPC)
3926 ret = wret; 4019 ret = wret;
3927 } 4020 }
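
The min_slot/max_slot parameters added above let push_for_double_split() move only the items on either side of the insertion point into the neighbouring leaves, so an oversized item landing in the middle of a leaf no longer forces two splits. A minimal userspace model of that decision flow (helper names are invented for the sketch and are not btrfs code):

/*
 * Illustration only: a "leaf" here is just a count of items plus the slot
 * where the new, oversized item wants to go.
 */
#include <stdio.h>

/* push_leaf_right(..., min_slot): items at index >= min_slot may move right */
static int model_push_right(int *nritems, int min_slot)
{
	int moved = *nritems - min_slot;

	*nritems = min_slot;
	return moved;
}

/* push_leaf_left(..., max_slot): items at index < max_slot may move left */
static int model_push_left(int *nritems, int *slot, int max_slot)
{
	int moved = max_slot;

	*nritems -= moved;
	*slot -= moved;
	return moved;
}

int main(void)
{
	int nritems = 8;
	int slot = 4;		/* the big item would land mid-leaf: double split */

	model_push_right(&nritems, slot);	/* slots 4..7 go to the right leaf */
	model_push_left(&nritems, &slot, slot);	/* slots 0..3 go to the left leaf */

	/* the target now sits at the start of an (almost) empty leaf */
	printf("nritems=%d slot=%d -> a single split is enough\n", nritems, slot);
	return 0;
}
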
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index a4080c21ec55..d74e6af9b53a 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2594,7 +2594,6 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
2594 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 2594 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
2595 }; 2595 };
2596 struct writeback_control wbc_writepages = { 2596 struct writeback_control wbc_writepages = {
2597 .bdi = wbc->bdi,
2598 .sync_mode = wbc->sync_mode, 2597 .sync_mode = wbc->sync_mode,
2599 .older_than_this = NULL, 2598 .older_than_this = NULL,
2600 .nr_to_write = 64, 2599 .nr_to_write = 64,
@@ -2628,7 +2627,6 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
2628 .sync_io = mode == WB_SYNC_ALL, 2627 .sync_io = mode == WB_SYNC_ALL,
2629 }; 2628 };
2630 struct writeback_control wbc_writepages = { 2629 struct writeback_control wbc_writepages = {
2631 .bdi = inode->i_mapping->backing_dev_info,
2632 .sync_mode = mode, 2630 .sync_mode = mode,
2633 .older_than_this = NULL, 2631 .older_than_this = NULL,
2634 .nr_to_write = nr_pages * 2, 2632 .nr_to_write = nr_pages * 2,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 4dbaf89b1337..9254b3d58dbe 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1458,7 +1458,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1458 */ 1458 */
1459 1459
1460 /* the destination must be opened for writing */ 1460 /* the destination must be opened for writing */
1461 if (!(file->f_mode & FMODE_WRITE)) 1461 if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND))
1462 return -EINVAL; 1462 return -EINVAL;
1463 1463
1464 ret = mnt_want_write(file->f_path.mnt); 1464 ret = mnt_want_write(file->f_path.mnt);
@@ -1511,7 +1511,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1511 1511
1512 /* determine range to clone */ 1512 /* determine range to clone */
1513 ret = -EINVAL; 1513 ret = -EINVAL;
1514 if (off >= src->i_size || off + len > src->i_size) 1514 if (off + len > src->i_size || off + len < off)
1515 goto out_unlock; 1515 goto out_unlock;
1516 if (len == 0) 1516 if (len == 0)
1517 olen = len = src->i_size - off; 1517 olen = len = src->i_size - off;
@@ -1578,6 +1578,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1578 u64 disko = 0, diskl = 0; 1578 u64 disko = 0, diskl = 0;
1579 u64 datao = 0, datal = 0; 1579 u64 datao = 0, datal = 0;
1580 u8 comp; 1580 u8 comp;
1581 u64 endoff;
1581 1582
1582 size = btrfs_item_size_nr(leaf, slot); 1583 size = btrfs_item_size_nr(leaf, slot);
1583 read_extent_buffer(leaf, buf, 1584 read_extent_buffer(leaf, buf,
@@ -1712,9 +1713,18 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1712 btrfs_release_path(root, path); 1713 btrfs_release_path(root, path);
1713 1714
1714 inode->i_mtime = inode->i_ctime = CURRENT_TIME; 1715 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1715 if (new_key.offset + datal > inode->i_size) 1716
1716 btrfs_i_size_write(inode, 1717 /*
1717 new_key.offset + datal); 1718 * we round up to the block size at eof when
1719 * determining which extents to clone above,
1720 * but shouldn't round up the file size
1721 */
1722 endoff = new_key.offset + datal;
1723 if (endoff > off+olen)
1724 endoff = off+olen;
1725 if (endoff > inode->i_size)
1726 btrfs_i_size_write(inode, endoff);
1727
1718 BTRFS_I(inode)->flags = BTRFS_I(src)->flags; 1728 BTRFS_I(inode)->flags = BTRFS_I(src)->flags;
1719 ret = btrfs_update_inode(trans, root, inode); 1729 ret = btrfs_update_inode(trans, root, inode);
1720 BUG_ON(ret); 1730 BUG_ON(ret);
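
The endoff clamp above keeps the block-size round-up used for extent selection from inflating the destination i_size past the range the caller asked to clone. Worked through with made-up numbers (plain userspace arithmetic, not btrfs code):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t off = 0, olen = 6000;		/* requested clone range */
	uint64_t new_key_offset = 0;
	uint64_t datal = 8192;			/* extent length, rounded to 4K blocks */
	uint64_t i_size = 5000;			/* destination size before the clone */

	uint64_t endoff = new_key_offset + datal;	/* 8192: rounded up at eof */
	if (endoff > off + olen)
		endoff = off + olen;			/* clamp to 6000 */
	if (endoff > i_size)
		i_size = endoff;			/* i_size becomes 6000, not 8192 */

	printf("new i_size = %llu\n", (unsigned long long)i_size);
	return 0;
}
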
diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c
index 83d4d2785ffe..6d44053ecff1 100644
--- a/fs/ceph/auth_x.c
+++ b/fs/ceph/auth_x.c
@@ -493,7 +493,7 @@ static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
493 return -EAGAIN; 493 return -EAGAIN;
494 } 494 }
495 495
496 op = le32_to_cpu(head->op); 496 op = le16_to_cpu(head->op);
497 result = le32_to_cpu(head->result); 497 result = le32_to_cpu(head->result);
498 dout("handle_reply op %d result %d\n", op, result); 498 dout("handle_reply op %d result %d\n", op, result);
499 switch (op) { 499 switch (op) {
@@ -613,6 +613,9 @@ static void ceph_x_destroy(struct ceph_auth_client *ac)
613 remove_ticket_handler(ac, th); 613 remove_ticket_handler(ac, th);
614 } 614 }
615 615
616 if (xi->auth_authorizer.buf)
617 ceph_buffer_put(xi->auth_authorizer.buf);
618
616 kfree(ac->private); 619 kfree(ac->private);
617 ac->private = NULL; 620 ac->private = NULL;
618} 621}
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 619b61655ee5..74144d6389f0 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -244,8 +244,14 @@ static struct ceph_cap *get_cap(struct ceph_cap_reservation *ctx)
244 struct ceph_cap *cap = NULL; 244 struct ceph_cap *cap = NULL;
245 245
246 /* temporary, until we do something about cap import/export */ 246 /* temporary, until we do something about cap import/export */
247 if (!ctx) 247 if (!ctx) {
248 return kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); 248 cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
249 if (cap) {
250 caps_use_count++;
251 caps_total_count++;
252 }
253 return cap;
254 }
249 255
250 spin_lock(&caps_list_lock); 256 spin_lock(&caps_list_lock);
251 dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n", 257 dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n",
@@ -2886,18 +2892,19 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
2886 struct ceph_inode_info *ci = ceph_inode(inode); 2892 struct ceph_inode_info *ci = ceph_inode(inode);
2887 struct ceph_cap *cap; 2893 struct ceph_cap *cap;
2888 struct ceph_mds_request_release *rel = *p; 2894 struct ceph_mds_request_release *rel = *p;
2895 int used, dirty;
2889 int ret = 0; 2896 int ret = 0;
2890 int used = 0;
2891 2897
2892 spin_lock(&inode->i_lock); 2898 spin_lock(&inode->i_lock);
2893 used = __ceph_caps_used(ci); 2899 used = __ceph_caps_used(ci);
2900 dirty = __ceph_caps_dirty(ci);
2894 2901
2895 dout("encode_inode_release %p mds%d used %s drop %s unless %s\n", inode, 2902 dout("encode_inode_release %p mds%d used|dirty %s drop %s unless %s\n",
2896 mds, ceph_cap_string(used), ceph_cap_string(drop), 2903 inode, mds, ceph_cap_string(used|dirty), ceph_cap_string(drop),
2897 ceph_cap_string(unless)); 2904 ceph_cap_string(unless));
2898 2905
2899 /* only drop unused caps */ 2906 /* only drop unused, clean caps */
2900 drop &= ~used; 2907 drop &= ~(used | dirty);
2901 2908
2902 cap = __get_cap_for_mds(ci, mds); 2909 cap = __get_cap_for_mds(ci, mds);
2903 if (cap && __cap_is_valid(cap)) { 2910 if (cap && __cap_is_valid(cap)) {
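
With the change above, the release encoding masks out both in-use and dirty caps before dropping anything. A standalone sketch of that bitmask step, using stand-in bit values rather than the real CEPH_CAP_* constants:

#include <stdio.h>

#define CAP_RD   0x1	/* arbitrary values for illustration only */
#define CAP_WR   0x2
#define CAP_EXCL 0x4

int main(void)
{
	int used  = CAP_WR;			/* file currently open for write */
	int dirty = CAP_EXCL;			/* dirty state not yet flushed */
	int drop  = CAP_RD | CAP_WR | CAP_EXCL;	/* caps we were asked to release */

	drop &= ~(used | dirty);		/* only drop unused, clean caps */
	printf("caps actually dropped: 0x%x\n", drop);	/* 0x1: CAP_RD only */
	return 0;
}
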
diff --git a/fs/ceph/crush/mapper.c b/fs/ceph/crush/mapper.c
index 9ba54efb6543..a4eec133258e 100644
--- a/fs/ceph/crush/mapper.c
+++ b/fs/ceph/crush/mapper.c
@@ -238,7 +238,7 @@ static int bucket_straw_choose(struct crush_bucket_straw *bucket,
238 238
239static int crush_bucket_choose(struct crush_bucket *in, int x, int r) 239static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
240{ 240{
241 dprintk("choose %d x=%d r=%d\n", in->id, x, r); 241 dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r);
242 switch (in->alg) { 242 switch (in->alg) {
243 case CRUSH_BUCKET_UNIFORM: 243 case CRUSH_BUCKET_UNIFORM:
244 return bucket_uniform_choose((struct crush_bucket_uniform *)in, 244 return bucket_uniform_choose((struct crush_bucket_uniform *)in,
@@ -264,7 +264,7 @@ static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
264 */ 264 */
265static int is_out(struct crush_map *map, __u32 *weight, int item, int x) 265static int is_out(struct crush_map *map, __u32 *weight, int item, int x)
266{ 266{
267 if (weight[item] >= 0x1000) 267 if (weight[item] >= 0x10000)
268 return 0; 268 return 0;
269 if (weight[item] == 0) 269 if (weight[item] == 0)
270 return 1; 270 return 1;
@@ -305,7 +305,9 @@ static int crush_choose(struct crush_map *map,
305 int itemtype; 305 int itemtype;
306 int collide, reject; 306 int collide, reject;
307 const int orig_tries = 5; /* attempts before we fall back to search */ 307 const int orig_tries = 5; /* attempts before we fall back to search */
308 dprintk("choose bucket %d x %d outpos %d\n", bucket->id, x, outpos); 308
309 dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "",
310 bucket->id, x, outpos, numrep);
309 311
310 for (rep = outpos; rep < numrep; rep++) { 312 for (rep = outpos; rep < numrep; rep++) {
311 /* keep trying until we get a non-out, non-colliding item */ 313 /* keep trying until we get a non-out, non-colliding item */
@@ -366,6 +368,7 @@ static int crush_choose(struct crush_map *map,
366 BUG_ON(item >= 0 || 368 BUG_ON(item >= 0 ||
367 (-1-item) >= map->max_buckets); 369 (-1-item) >= map->max_buckets);
368 in = map->buckets[-1-item]; 370 in = map->buckets[-1-item];
371 retry_bucket = 1;
369 continue; 372 continue;
370 } 373 }
371 374
@@ -377,15 +380,25 @@ static int crush_choose(struct crush_map *map,
377 } 380 }
378 } 381 }
379 382
380 if (recurse_to_leaf && 383 reject = 0;
381 item < 0 && 384 if (recurse_to_leaf) {
382 crush_choose(map, map->buckets[-1-item], 385 if (item < 0) {
383 weight, 386 if (crush_choose(map,
384 x, outpos+1, 0, 387 map->buckets[-1-item],
385 out2, outpos, 388 weight,
386 firstn, 0, NULL) <= outpos) { 389 x, outpos+1, 0,
387 reject = 1; 390 out2, outpos,
388 } else { 391 firstn, 0,
392 NULL) <= outpos)
393 /* didn't get leaf */
394 reject = 1;
395 } else {
396 /* we already have a leaf! */
397 out2[outpos] = item;
398 }
399 }
400
401 if (!reject) {
389 /* out? */ 402 /* out? */
390 if (itemtype == 0) 403 if (itemtype == 0)
391 reject = is_out(map, weight, 404 reject = is_out(map, weight,
@@ -424,12 +437,12 @@ reject:
424 continue; 437 continue;
425 } 438 }
426 439
427 dprintk("choose got %d\n", item); 440 dprintk("CHOOSE got %d\n", item);
428 out[outpos] = item; 441 out[outpos] = item;
429 outpos++; 442 outpos++;
430 } 443 }
431 444
432 dprintk("choose returns %d\n", outpos); 445 dprintk("CHOOSE returns %d\n", outpos);
433 return outpos; 446 return outpos;
434} 447}
435 448
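
The is_out() fix above reflects that CRUSH item weights are 16.16 fixed point, so full weight is 0x10000 (1.0); the old 0x1000 threshold corresponds to a weight of only 0.0625. A small standalone check of that arithmetic (assuming the 16.16 encoding used by the crush map):

#include <stdio.h>
#include <stdint.h>

static uint32_t to_crush_weight(double w)
{
	return (uint32_t)(w * 0x10000);		/* 16.16 fixed point */
}

int main(void)
{
	printf("weight 1.0 -> 0x%x\n", to_crush_weight(1.0));	/* 0x10000 */
	printf("weight 0.5 -> 0x%x\n", to_crush_weight(0.5));	/* 0x8000  */
	printf("old threshold 0x1000 corresponds to %.4f\n",
	       0x1000 / (double)0x10000);			/* 0.0625  */
	return 0;
}
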
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 3be33fb066cc..f2f5332ddbba 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -261,7 +261,7 @@ static int osdc_show(struct seq_file *s, void *pp)
261 261
262static int caps_show(struct seq_file *s, void *p) 262static int caps_show(struct seq_file *s, void *p)
263{ 263{
264 struct ceph_client *client = p; 264 struct ceph_client *client = s->private;
265 int total, avail, used, reserved, min; 265 int total, avail, used, reserved, min;
266 266
267 ceph_reservation_status(client, &total, &avail, &used, &reserved, &min); 267 ceph_reservation_status(client, &total, &avail, &used, &reserved, &min);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index ab47f46ca282..8f9b9fe8ef9f 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -854,8 +854,8 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
854 d_drop(dn); 854 d_drop(dn);
855 realdn = d_materialise_unique(dn, in); 855 realdn = d_materialise_unique(dn, in);
856 if (IS_ERR(realdn)) { 856 if (IS_ERR(realdn)) {
857 pr_err("splice_dentry error %p inode %p ino %llx.%llx\n", 857 pr_err("splice_dentry error %ld %p inode %p ino %llx.%llx\n",
858 dn, in, ceph_vinop(in)); 858 PTR_ERR(realdn), dn, in, ceph_vinop(in));
859 if (prehash) 859 if (prehash)
860 *prehash = false; /* don't rehash on error */ 860 *prehash = false; /* don't rehash on error */
861 dn = realdn; /* note realdn contains the error */ 861 dn = realdn; /* note realdn contains the error */
@@ -1234,18 +1234,23 @@ retry_lookup:
1234 goto out; 1234 goto out;
1235 } 1235 }
1236 dn = splice_dentry(dn, in, NULL); 1236 dn = splice_dentry(dn, in, NULL);
1237 if (IS_ERR(dn))
1238 dn = NULL;
1237 } 1239 }
1238 1240
1239 if (fill_inode(in, &rinfo->dir_in[i], NULL, session, 1241 if (fill_inode(in, &rinfo->dir_in[i], NULL, session,
1240 req->r_request_started, -1, 1242 req->r_request_started, -1,
1241 &req->r_caps_reservation) < 0) { 1243 &req->r_caps_reservation) < 0) {
1242 pr_err("fill_inode badness on %p\n", in); 1244 pr_err("fill_inode badness on %p\n", in);
1243 dput(dn); 1245 goto next_item;
1244 continue;
1245 } 1246 }
1246 update_dentry_lease(dn, rinfo->dir_dlease[i], 1247 if (dn)
1247 req->r_session, req->r_request_started); 1248 update_dentry_lease(dn, rinfo->dir_dlease[i],
1248 dput(dn); 1249 req->r_session,
1250 req->r_request_started);
1251next_item:
1252 if (dn)
1253 dput(dn);
1249 } 1254 }
1250 req->r_did_prepopulate = true; 1255 req->r_did_prepopulate = true;
1251 1256
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 1766947fc07a..416c08d315db 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1514,6 +1514,9 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
1514 ceph_encode_filepath(&p, end, ino1, path1); 1514 ceph_encode_filepath(&p, end, ino1, path1);
1515 ceph_encode_filepath(&p, end, ino2, path2); 1515 ceph_encode_filepath(&p, end, ino2, path2);
1516 1516
1517 /* make note of release offset, in case we need to replay */
1518 req->r_request_release_offset = p - msg->front.iov_base;
1519
1517 /* cap releases */ 1520 /* cap releases */
1518 releases = 0; 1521 releases = 0;
1519 if (req->r_inode_drop) 1522 if (req->r_inode_drop)
@@ -1580,6 +1583,32 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
1580 dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req, 1583 dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req,
1581 req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts); 1584 req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts);
1582 1585
1586 if (req->r_got_unsafe) {
1587 /*
1588 * Replay. Do not regenerate message (and rebuild
1589 * paths, etc.); just use the original message.
1590 * Rebuilding paths will break for renames because
1591 * d_move mangles the src name.
1592 */
1593 msg = req->r_request;
1594 rhead = msg->front.iov_base;
1595
1596 flags = le32_to_cpu(rhead->flags);
1597 flags |= CEPH_MDS_FLAG_REPLAY;
1598 rhead->flags = cpu_to_le32(flags);
1599
1600 if (req->r_target_inode)
1601 rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
1602
1603 rhead->num_retry = req->r_attempts - 1;
1604
1605 /* remove cap/dentry releases from message */
1606 rhead->num_releases = 0;
1607 msg->hdr.front_len = cpu_to_le32(req->r_request_release_offset);
1608 msg->front.iov_len = req->r_request_release_offset;
1609 return 0;
1610 }
1611
1583 if (req->r_request) { 1612 if (req->r_request) {
1584 ceph_msg_put(req->r_request); 1613 ceph_msg_put(req->r_request);
1585 req->r_request = NULL; 1614 req->r_request = NULL;
@@ -1601,13 +1630,9 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
1601 rhead->flags = cpu_to_le32(flags); 1630 rhead->flags = cpu_to_le32(flags);
1602 rhead->num_fwd = req->r_num_fwd; 1631 rhead->num_fwd = req->r_num_fwd;
1603 rhead->num_retry = req->r_attempts - 1; 1632 rhead->num_retry = req->r_attempts - 1;
1633 rhead->ino = 0;
1604 1634
1605 dout(" r_locked_dir = %p\n", req->r_locked_dir); 1635 dout(" r_locked_dir = %p\n", req->r_locked_dir);
1606
1607 if (req->r_target_inode && req->r_got_unsafe)
1608 rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
1609 else
1610 rhead->ino = 0;
1611 return 0; 1636 return 0;
1612} 1637}
1613 1638
@@ -2783,6 +2808,12 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
2783 drop_leases(mdsc); 2808 drop_leases(mdsc);
2784 ceph_flush_dirty_caps(mdsc); 2809 ceph_flush_dirty_caps(mdsc);
2785 wait_requests(mdsc); 2810 wait_requests(mdsc);
2811
2812 /*
2813 * wait for reply handlers to drop their request refs and
2814 * their inode/dcache refs
2815 */
2816 ceph_msgr_flush();
2786} 2817}
2787 2818
2788/* 2819/*
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index b292fa42a66d..952410c60d09 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -188,6 +188,7 @@ struct ceph_mds_request {
188 int r_old_inode_drop, r_old_inode_unless; 188 int r_old_inode_drop, r_old_inode_unless;
189 189
190 struct ceph_msg *r_request; /* original request */ 190 struct ceph_msg *r_request; /* original request */
191 int r_request_release_offset;
191 struct ceph_msg *r_reply; 192 struct ceph_msg *r_reply;
192 struct ceph_mds_reply_info_parsed r_reply_info; 193 struct ceph_mds_reply_info_parsed r_reply_info;
193 int r_err; 194 int r_err;
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index 64b8b1f7863d..15167b2daa55 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -43,7 +43,8 @@ static void ceph_fault(struct ceph_connection *con);
43 * nicely render a sockaddr as a string. 43 * nicely render a sockaddr as a string.
44 */ 44 */
45#define MAX_ADDR_STR 20 45#define MAX_ADDR_STR 20
46static char addr_str[MAX_ADDR_STR][40]; 46#define MAX_ADDR_STR_LEN 60
47static char addr_str[MAX_ADDR_STR][MAX_ADDR_STR_LEN];
47static DEFINE_SPINLOCK(addr_str_lock); 48static DEFINE_SPINLOCK(addr_str_lock);
48static int last_addr_str; 49static int last_addr_str;
49 50
@@ -52,7 +53,6 @@ const char *pr_addr(const struct sockaddr_storage *ss)
52 int i; 53 int i;
53 char *s; 54 char *s;
54 struct sockaddr_in *in4 = (void *)ss; 55 struct sockaddr_in *in4 = (void *)ss;
55 unsigned char *quad = (void *)&in4->sin_addr.s_addr;
56 struct sockaddr_in6 *in6 = (void *)ss; 56 struct sockaddr_in6 *in6 = (void *)ss;
57 57
58 spin_lock(&addr_str_lock); 58 spin_lock(&addr_str_lock);
@@ -64,25 +64,13 @@ const char *pr_addr(const struct sockaddr_storage *ss)
64 64
65 switch (ss->ss_family) { 65 switch (ss->ss_family) {
66 case AF_INET: 66 case AF_INET:
67 sprintf(s, "%u.%u.%u.%u:%u", 67 snprintf(s, MAX_ADDR_STR_LEN, "%pI4:%u", &in4->sin_addr,
68 (unsigned int)quad[0], 68 (unsigned int)ntohs(in4->sin_port));
69 (unsigned int)quad[1],
70 (unsigned int)quad[2],
71 (unsigned int)quad[3],
72 (unsigned int)ntohs(in4->sin_port));
73 break; 69 break;
74 70
75 case AF_INET6: 71 case AF_INET6:
76 sprintf(s, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x:%u", 72 snprintf(s, MAX_ADDR_STR_LEN, "[%pI6c]:%u", &in6->sin6_addr,
77 in6->sin6_addr.s6_addr16[0], 73 (unsigned int)ntohs(in6->sin6_port));
78 in6->sin6_addr.s6_addr16[1],
79 in6->sin6_addr.s6_addr16[2],
80 in6->sin6_addr.s6_addr16[3],
81 in6->sin6_addr.s6_addr16[4],
82 in6->sin6_addr.s6_addr16[5],
83 in6->sin6_addr.s6_addr16[6],
84 in6->sin6_addr.s6_addr16[7],
85 (unsigned int)ntohs(in6->sin6_port));
86 break; 74 break;
87 75
88 default: 76 default:
@@ -215,12 +203,13 @@ static void set_sock_callbacks(struct socket *sock,
215 */ 203 */
216static struct socket *ceph_tcp_connect(struct ceph_connection *con) 204static struct socket *ceph_tcp_connect(struct ceph_connection *con)
217{ 205{
218 struct sockaddr *paddr = (struct sockaddr *)&con->peer_addr.in_addr; 206 struct sockaddr_storage *paddr = &con->peer_addr.in_addr;
219 struct socket *sock; 207 struct socket *sock;
220 int ret; 208 int ret;
221 209
222 BUG_ON(con->sock); 210 BUG_ON(con->sock);
223 ret = sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); 211 ret = sock_create_kern(con->peer_addr.in_addr.ss_family, SOCK_STREAM,
212 IPPROTO_TCP, &sock);
224 if (ret) 213 if (ret)
225 return ERR_PTR(ret); 214 return ERR_PTR(ret);
226 con->sock = sock; 215 con->sock = sock;
@@ -234,7 +223,8 @@ static struct socket *ceph_tcp_connect(struct ceph_connection *con)
234 223
235 dout("connect %s\n", pr_addr(&con->peer_addr.in_addr)); 224 dout("connect %s\n", pr_addr(&con->peer_addr.in_addr));
236 225
237 ret = sock->ops->connect(sock, paddr, sizeof(*paddr), O_NONBLOCK); 226 ret = sock->ops->connect(sock, (struct sockaddr *)paddr, sizeof(*paddr),
227 O_NONBLOCK);
238 if (ret == -EINPROGRESS) { 228 if (ret == -EINPROGRESS) {
239 dout("connect %s EINPROGRESS sk_state = %u\n", 229 dout("connect %s EINPROGRESS sk_state = %u\n",
240 pr_addr(&con->peer_addr.in_addr), 230 pr_addr(&con->peer_addr.in_addr),
@@ -657,7 +647,7 @@ static void prepare_write_connect(struct ceph_messenger *msgr,
657 dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con, 647 dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
658 con->connect_seq, global_seq, proto); 648 con->connect_seq, global_seq, proto);
659 649
660 con->out_connect.features = CEPH_FEATURE_SUPPORTED_CLIENT; 650 con->out_connect.features = cpu_to_le64(CEPH_FEATURE_SUPPORTED_CLIENT);
661 con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT); 651 con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
662 con->out_connect.connect_seq = cpu_to_le32(con->connect_seq); 652 con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
663 con->out_connect.global_seq = cpu_to_le32(global_seq); 653 con->out_connect.global_seq = cpu_to_le32(global_seq);
@@ -1009,19 +999,32 @@ int ceph_parse_ips(const char *c, const char *end,
1009 struct sockaddr_in *in4 = (void *)ss; 999 struct sockaddr_in *in4 = (void *)ss;
1010 struct sockaddr_in6 *in6 = (void *)ss; 1000 struct sockaddr_in6 *in6 = (void *)ss;
1011 int port; 1001 int port;
1002 char delim = ',';
1003
1004 if (*p == '[') {
1005 delim = ']';
1006 p++;
1007 }
1012 1008
1013 memset(ss, 0, sizeof(*ss)); 1009 memset(ss, 0, sizeof(*ss));
1014 if (in4_pton(p, end - p, (u8 *)&in4->sin_addr.s_addr, 1010 if (in4_pton(p, end - p, (u8 *)&in4->sin_addr.s_addr,
1015 ',', &ipend)) { 1011 delim, &ipend))
1016 ss->ss_family = AF_INET; 1012 ss->ss_family = AF_INET;
1017 } else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr, 1013 else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr,
1018 ',', &ipend)) { 1014 delim, &ipend))
1019 ss->ss_family = AF_INET6; 1015 ss->ss_family = AF_INET6;
1020 } else { 1016 else
1021 goto bad; 1017 goto bad;
1022 }
1023 p = ipend; 1018 p = ipend;
1024 1019
1020 if (delim == ']') {
1021 if (*p != ']') {
1022 dout("missing matching ']'\n");
1023 goto bad;
1024 }
1025 p++;
1026 }
1027
1025 /* port? */ 1028 /* port? */
1026 if (p < end && *p == ':') { 1029 if (p < end && *p == ':') {
1027 port = 0; 1030 port = 0;
@@ -1055,7 +1058,7 @@ int ceph_parse_ips(const char *c, const char *end,
1055 return 0; 1058 return 0;
1056 1059
1057bad: 1060bad:
1058 pr_err("parse_ips bad ip '%s'\n", c); 1061 pr_err("parse_ips bad ip '%.*s'\n", (int)(end - c), c);
1059 return -EINVAL; 1062 return -EINVAL;
1060} 1063}
1061 1064
@@ -1396,10 +1399,12 @@ static int read_partial_message(struct ceph_connection *con)
1396 if (!con->in_msg) { 1399 if (!con->in_msg) {
1397 dout("got hdr type %d front %d data %d\n", con->in_hdr.type, 1400 dout("got hdr type %d front %d data %d\n", con->in_hdr.type,
1398 con->in_hdr.front_len, con->in_hdr.data_len); 1401 con->in_hdr.front_len, con->in_hdr.data_len);
1402 skip = 0;
1399 con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip); 1403 con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip);
1400 if (skip) { 1404 if (skip) {
1401 /* skip this message */ 1405 /* skip this message */
1402 dout("alloc_msg said skip message\n"); 1406 dout("alloc_msg said skip message\n");
1407 BUG_ON(con->in_msg);
1403 con->in_base_pos = -front_len - middle_len - data_len - 1408 con->in_base_pos = -front_len - middle_len - data_len -
1404 sizeof(m->footer); 1409 sizeof(m->footer);
1405 con->in_tag = CEPH_MSGR_TAG_READY; 1410 con->in_tag = CEPH_MSGR_TAG_READY;
@@ -2013,20 +2018,20 @@ void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg)
2013{ 2018{
2014 mutex_lock(&con->mutex); 2019 mutex_lock(&con->mutex);
2015 if (!list_empty(&msg->list_head)) { 2020 if (!list_empty(&msg->list_head)) {
2016 dout("con_revoke %p msg %p\n", con, msg); 2021 dout("con_revoke %p msg %p - was on queue\n", con, msg);
2017 list_del_init(&msg->list_head); 2022 list_del_init(&msg->list_head);
2018 ceph_msg_put(msg); 2023 ceph_msg_put(msg);
2019 msg->hdr.seq = 0; 2024 msg->hdr.seq = 0;
2020 if (con->out_msg == msg) { 2025 }
2021 ceph_msg_put(con->out_msg); 2026 if (con->out_msg == msg) {
2022 con->out_msg = NULL; 2027 dout("con_revoke %p msg %p - was sending\n", con, msg);
2023 } 2028 con->out_msg = NULL;
2024 if (con->out_kvec_is_msg) { 2029 if (con->out_kvec_is_msg) {
2025 con->out_skip = con->out_kvec_bytes; 2030 con->out_skip = con->out_kvec_bytes;
2026 con->out_kvec_is_msg = false; 2031 con->out_kvec_is_msg = false;
2027 } 2032 }
2028 } else { 2033 ceph_msg_put(msg);
2029 dout("con_revoke %p msg %p - not queued (sent?)\n", con, msg); 2034 msg->hdr.seq = 0;
2030 } 2035 }
2031 mutex_unlock(&con->mutex); 2036 mutex_unlock(&con->mutex);
2032} 2037}
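
The ceph_parse_ips() hunk above accepts bracketed IPv6 literals of the form [addr]:port alongside plain IPv4 addr:port strings. A rough userspace equivalent using inet_pton() instead of the kernel's in4_pton()/in6_pton() (invented helper, assumes an explicit port on IPv4 and brackets around IPv6; not the kernel code):

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

static int parse_addr(const char *s, struct sockaddr_storage *ss, int *port)
{
	char buf[64];
	const char *p = s, *end;
	char delim = ':';

	memset(ss, 0, sizeof(*ss));
	if (*p == '[') {			/* bracketed IPv6 literal */
		delim = ']';
		p++;
	}
	end = strchr(p, delim);
	if (!end || (size_t)(end - p) >= sizeof(buf))
		return -1;
	memcpy(buf, p, end - p);
	buf[end - p] = '\0';

	if (delim == ']') {
		struct sockaddr_in6 *in6 = (void *)ss;
		if (inet_pton(AF_INET6, buf, &in6->sin6_addr) != 1)
			return -1;
		ss->ss_family = AF_INET6;
		end++;				/* step past the closing ']' */
	} else {
		struct sockaddr_in *in4 = (void *)ss;
		if (inet_pton(AF_INET, buf, &in4->sin_addr) != 1)
			return -1;
		ss->ss_family = AF_INET;
	}
	*port = (*end == ':') ? atoi(end + 1) : 0;
	return 0;
}

int main(void)
{
	struct sockaddr_storage ss;
	int port;

	if (parse_addr("[::1]:6789", &ss, &port) == 0)
		printf("v6 ok, port %d\n", port);
	if (parse_addr("10.0.0.1:6789", &ss, &port) == 0)
		printf("v4 ok, port %d\n", port);
	return 0;
}
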
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
index 07a539906e67..cc115eafae11 100644
--- a/fs/ceph/mon_client.c
+++ b/fs/ceph/mon_client.c
@@ -725,7 +725,8 @@ static void handle_auth_reply(struct ceph_mon_client *monc,
725 dout("authenticated, starting session\n"); 725 dout("authenticated, starting session\n");
726 726
727 monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT; 727 monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
728 monc->client->msgr->inst.name.num = monc->auth->global_id; 728 monc->client->msgr->inst.name.num =
729 cpu_to_le64(monc->auth->global_id);
729 730
730 __send_subscribe(monc); 731 __send_subscribe(monc);
731 __resend_generic_request(monc); 732 __resend_generic_request(monc);
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index d25b4add85b4..92b7251a53f1 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -1344,7 +1344,7 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
1344 int type = le16_to_cpu(msg->hdr.type); 1344 int type = le16_to_cpu(msg->hdr.type);
1345 1345
1346 if (!osd) 1346 if (!osd)
1347 return; 1347 goto out;
1348 osdc = osd->o_osdc; 1348 osdc = osd->o_osdc;
1349 1349
1350 switch (type) { 1350 switch (type) {
@@ -1359,6 +1359,7 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
1359 pr_err("received unknown message type %d %s\n", type, 1359 pr_err("received unknown message type %d %s\n", type,
1360 ceph_msg_type_name(type)); 1360 ceph_msg_type_name(type));
1361 } 1361 }
1362out:
1362 ceph_msg_put(msg); 1363 ceph_msg_put(msg);
1363} 1364}
1364 1365
diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
index ddc656fb5c05..277f8b339577 100644
--- a/fs/ceph/osdmap.c
+++ b/fs/ceph/osdmap.c
@@ -568,6 +568,7 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
568 if (ev > CEPH_PG_POOL_VERSION) { 568 if (ev > CEPH_PG_POOL_VERSION) {
569 pr_warning("got unknown v %d > %d of ceph_pg_pool\n", 569 pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
570 ev, CEPH_PG_POOL_VERSION); 570 ev, CEPH_PG_POOL_VERSION);
571 kfree(pi);
571 goto bad; 572 goto bad;
572 } 573 }
573 __decode_pool(p, pi); 574 __decode_pool(p, pi);
@@ -707,6 +708,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
707 newcrush = crush_decode(*p, min(*p+len, end)); 708 newcrush = crush_decode(*p, min(*p+len, end));
708 if (IS_ERR(newcrush)) 709 if (IS_ERR(newcrush))
709 return ERR_CAST(newcrush); 710 return ERR_CAST(newcrush);
711 *p += len;
710 } 712 }
711 713
712 /* new flags? */ 714 /* new flags? */
diff --git a/fs/dcache.c b/fs/dcache.c
index c8c78ba07827..86d4db15473e 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -896,7 +896,7 @@ EXPORT_SYMBOL(shrink_dcache_parent);
896 * 896 *
897 * In this case we return -1 to tell the caller that we baled. 897 * In this case we return -1 to tell the caller that we baled.
898 */ 898 */
899static int shrink_dcache_memory(int nr, gfp_t gfp_mask) 899static int shrink_dcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
900{ 900{
901 if (nr) { 901 if (nr) {
902 if (!(gfp_mask & __GFP_FS)) 902 if (!(gfp_mask & __GFP_FS))
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 0609607d3955..d5be1693ac93 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -38,43 +38,18 @@ int nr_pdflush_threads;
38/* 38/*
39 * Passed into wb_writeback(), essentially a subset of writeback_control 39 * Passed into wb_writeback(), essentially a subset of writeback_control
40 */ 40 */
41struct wb_writeback_args { 41struct wb_writeback_work {
42 long nr_pages; 42 long nr_pages;
43 struct super_block *sb; 43 struct super_block *sb;
44 enum writeback_sync_modes sync_mode; 44 enum writeback_sync_modes sync_mode;
45 unsigned int for_kupdate:1; 45 unsigned int for_kupdate:1;
46 unsigned int range_cyclic:1; 46 unsigned int range_cyclic:1;
47 unsigned int for_background:1; 47 unsigned int for_background:1;
48};
49 48
50/*
51 * Work items for the bdi_writeback threads
52 */
53struct bdi_work {
54 struct list_head list; /* pending work list */ 49 struct list_head list; /* pending work list */
55 struct rcu_head rcu_head; /* for RCU free/clear of work */ 50 struct completion *done; /* set if the caller waits */
56
57 unsigned long seen; /* threads that have seen this work */
58 atomic_t pending; /* number of threads still to do work */
59
60 struct wb_writeback_args args; /* writeback arguments */
61
62 unsigned long state; /* flag bits, see WS_* */
63}; 51};
64 52
65enum {
66 WS_INPROGRESS = 0,
67 WS_ONSTACK,
68};
69
70static inline void bdi_work_init(struct bdi_work *work,
71 struct wb_writeback_args *args)
72{
73 INIT_RCU_HEAD(&work->rcu_head);
74 work->args = *args;
75 __set_bit(WS_INPROGRESS, &work->state);
76}
77
78/** 53/**
79 * writeback_in_progress - determine whether there is writeback in progress 54 * writeback_in_progress - determine whether there is writeback in progress
80 * @bdi: the device's backing_dev_info structure. 55 * @bdi: the device's backing_dev_info structure.
@@ -87,49 +62,11 @@ int writeback_in_progress(struct backing_dev_info *bdi)
87 return !list_empty(&bdi->work_list); 62 return !list_empty(&bdi->work_list);
88} 63}
89 64
90static void bdi_work_free(struct rcu_head *head) 65static void bdi_queue_work(struct backing_dev_info *bdi,
91{ 66 struct wb_writeback_work *work)
92 struct bdi_work *work = container_of(head, struct bdi_work, rcu_head);
93
94 clear_bit(WS_INPROGRESS, &work->state);
95 smp_mb__after_clear_bit();
96 wake_up_bit(&work->state, WS_INPROGRESS);
97
98 if (!test_bit(WS_ONSTACK, &work->state))
99 kfree(work);
100}
101
102static void wb_clear_pending(struct bdi_writeback *wb, struct bdi_work *work)
103{ 67{
104 /*
105 * The caller has retrieved the work arguments from this work,
106 * drop our reference. If this is the last ref, delete and free it
107 */
108 if (atomic_dec_and_test(&work->pending)) {
109 struct backing_dev_info *bdi = wb->bdi;
110
111 spin_lock(&bdi->wb_lock);
112 list_del_rcu(&work->list);
113 spin_unlock(&bdi->wb_lock);
114
115 call_rcu(&work->rcu_head, bdi_work_free);
116 }
117}
118
119static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work)
120{
121 work->seen = bdi->wb_mask;
122 BUG_ON(!work->seen);
123 atomic_set(&work->pending, bdi->wb_cnt);
124 BUG_ON(!bdi->wb_cnt);
125
126 /*
127 * list_add_tail_rcu() contains the necessary barriers to
128 * make sure the above stores are seen before the item is
129 * noticed on the list
130 */
131 spin_lock(&bdi->wb_lock); 68 spin_lock(&bdi->wb_lock);
132 list_add_tail_rcu(&work->list, &bdi->work_list); 69 list_add_tail(&work->list, &bdi->work_list);
133 spin_unlock(&bdi->wb_lock); 70 spin_unlock(&bdi->wb_lock);
134 71
135 /* 72 /*
@@ -146,55 +83,29 @@ static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work)
146 } 83 }
147} 84}
148 85
149/* 86static void
150 * Used for on-stack allocated work items. The caller needs to wait until 87__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
151 * the wb threads have acked the work before it's safe to continue. 88 bool range_cyclic, bool for_background)
152 */
153static void bdi_wait_on_work_done(struct bdi_work *work)
154{ 89{
155 wait_on_bit(&work->state, WS_INPROGRESS, bdi_sched_wait, 90 struct wb_writeback_work *work;
156 TASK_UNINTERRUPTIBLE);
157}
158
159static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
160 struct wb_writeback_args *args)
161{
162 struct bdi_work *work;
163 91
164 /* 92 /*
165 * This is WB_SYNC_NONE writeback, so if allocation fails just 93 * This is WB_SYNC_NONE writeback, so if allocation fails just
166 * wakeup the thread for old dirty data writeback 94 * wakeup the thread for old dirty data writeback
167 */ 95 */
168 work = kmalloc(sizeof(*work), GFP_ATOMIC); 96 work = kzalloc(sizeof(*work), GFP_ATOMIC);
169 if (work) { 97 if (!work) {
170 bdi_work_init(work, args); 98 if (bdi->wb.task)
171 bdi_queue_work(bdi, work); 99 wake_up_process(bdi->wb.task);
172 } else { 100 return;
173 struct bdi_writeback *wb = &bdi->wb;
174
175 if (wb->task)
176 wake_up_process(wb->task);
177 } 101 }
178}
179 102
180/** 103 work->sync_mode = WB_SYNC_NONE;
181 * bdi_queue_work_onstack - start and wait for writeback 104 work->nr_pages = nr_pages;
182 * @sb: write inodes from this super_block 105 work->range_cyclic = range_cyclic;
183 * 106 work->for_background = for_background;
184 * Description:
185 * This function initiates writeback and waits for the operation to
186 * complete. Callers must hold the sb s_umount semaphore for
187 * reading, to avoid having the super disappear before we are done.
188 */
189static void bdi_queue_work_onstack(struct wb_writeback_args *args)
190{
191 struct bdi_work work;
192
193 bdi_work_init(&work, args);
194 __set_bit(WS_ONSTACK, &work.state);
195 107
196 bdi_queue_work(args->sb->s_bdi, &work); 108 bdi_queue_work(bdi, work);
197 bdi_wait_on_work_done(&work);
198} 109}
199 110
200/** 111/**
@@ -210,13 +121,7 @@ static void bdi_queue_work_onstack(struct wb_writeback_args *args)
210 */ 121 */
211void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages) 122void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
212{ 123{
213 struct wb_writeback_args args = { 124 __bdi_start_writeback(bdi, nr_pages, true, false);
214 .sync_mode = WB_SYNC_NONE,
215 .nr_pages = nr_pages,
216 .range_cyclic = 1,
217 };
218
219 bdi_alloc_queue_work(bdi, &args);
220} 125}
221 126
222/** 127/**
@@ -230,13 +135,7 @@ void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
230 */ 135 */
231void bdi_start_background_writeback(struct backing_dev_info *bdi) 136void bdi_start_background_writeback(struct backing_dev_info *bdi)
232{ 137{
233 struct wb_writeback_args args = { 138 __bdi_start_writeback(bdi, LONG_MAX, true, true);
234 .sync_mode = WB_SYNC_NONE,
235 .nr_pages = LONG_MAX,
236 .for_background = 1,
237 .range_cyclic = 1,
238 };
239 bdi_alloc_queue_work(bdi, &args);
240} 139}
241 140
242/* 141/*
@@ -554,29 +453,41 @@ static bool pin_sb_for_writeback(struct super_block *sb)
554 453
555/* 454/*
556 * Write a portion of b_io inodes which belong to @sb. 455 * Write a portion of b_io inodes which belong to @sb.
557 * If @wbc->sb != NULL, then find and write all such 456 *
457 * If @only_this_sb is true, then find and write all such
558 * inodes. Otherwise write only ones which go sequentially 458 * inodes. Otherwise write only ones which go sequentially
559 * in reverse order. 459 * in reverse order.
460 *
560 * Return 1, if the caller writeback routine should be 461 * Return 1, if the caller writeback routine should be
561 * interrupted. Otherwise return 0. 462 * interrupted. Otherwise return 0.
562 */ 463 */
563static int writeback_sb_inodes(struct super_block *sb, 464static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
564 struct bdi_writeback *wb, 465 struct writeback_control *wbc, bool only_this_sb)
565 struct writeback_control *wbc)
566{ 466{
567 while (!list_empty(&wb->b_io)) { 467 while (!list_empty(&wb->b_io)) {
568 long pages_skipped; 468 long pages_skipped;
569 struct inode *inode = list_entry(wb->b_io.prev, 469 struct inode *inode = list_entry(wb->b_io.prev,
570 struct inode, i_list); 470 struct inode, i_list);
571 if (wbc->sb && sb != inode->i_sb) { 471
572 /* super block given and doesn't 472 if (inode->i_sb != sb) {
573 match, skip this inode */ 473 if (only_this_sb) {
574 redirty_tail(inode); 474 /*
575 continue; 475 * We only want to write back data for this
576 } 476 * superblock, move all inodes not belonging
577 if (sb != inode->i_sb) 477 * to it back onto the dirty list.
578 /* finish with this superblock */ 478 */
479 redirty_tail(inode);
480 continue;
481 }
482
483 /*
484 * The inode belongs to a different superblock.
485 * Bounce back to the caller to unpin this and
486 * pin the next superblock.
487 */
579 return 0; 488 return 0;
489 }
490
580 if (inode->i_state & (I_NEW | I_WILL_FREE)) { 491 if (inode->i_state & (I_NEW | I_WILL_FREE)) {
581 requeue_io(inode); 492 requeue_io(inode);
582 continue; 493 continue;
@@ -614,8 +525,8 @@ static int writeback_sb_inodes(struct super_block *sb,
614 return 1; 525 return 1;
615} 526}
616 527
617static void writeback_inodes_wb(struct bdi_writeback *wb, 528void writeback_inodes_wb(struct bdi_writeback *wb,
618 struct writeback_control *wbc) 529 struct writeback_control *wbc)
619{ 530{
620 int ret = 0; 531 int ret = 0;
621 532
@@ -629,29 +540,12 @@ static void writeback_inodes_wb(struct bdi_writeback *wb,
629 struct inode, i_list); 540 struct inode, i_list);
630 struct super_block *sb = inode->i_sb; 541 struct super_block *sb = inode->i_sb;
631 542
632 if (wbc->sb) { 543 if (!pin_sb_for_writeback(sb)) {
633 /* 544 requeue_io(inode);
634 * We are requested to write out inodes for a specific 545 continue;
635 * superblock. This means we already have s_umount
636 * taken by the caller which also waits for us to
637 * complete the writeout.
638 */
639 if (sb != wbc->sb) {
640 redirty_tail(inode);
641 continue;
642 }
643
644 WARN_ON(!rwsem_is_locked(&sb->s_umount));
645
646 ret = writeback_sb_inodes(sb, wb, wbc);
647 } else {
648 if (!pin_sb_for_writeback(sb)) {
649 requeue_io(inode);
650 continue;
651 }
652 ret = writeback_sb_inodes(sb, wb, wbc);
653 drop_super(sb);
654 } 546 }
547 ret = writeback_sb_inodes(sb, wb, wbc, false);
548 drop_super(sb);
655 549
656 if (ret) 550 if (ret)
657 break; 551 break;
@@ -660,11 +554,17 @@ static void writeback_inodes_wb(struct bdi_writeback *wb,
660 /* Leave any unwritten inodes on b_io */ 554 /* Leave any unwritten inodes on b_io */
661} 555}
662 556
663void writeback_inodes_wbc(struct writeback_control *wbc) 557static void __writeback_inodes_sb(struct super_block *sb,
558 struct bdi_writeback *wb, struct writeback_control *wbc)
664{ 559{
665 struct backing_dev_info *bdi = wbc->bdi; 560 WARN_ON(!rwsem_is_locked(&sb->s_umount));
666 561
667 writeback_inodes_wb(&bdi->wb, wbc); 562 wbc->wb_start = jiffies; /* livelock avoidance */
563 spin_lock(&inode_lock);
564 if (!wbc->for_kupdate || list_empty(&wb->b_io))
565 queue_io(wb, wbc->older_than_this);
566 writeback_sb_inodes(sb, wb, wbc, true);
567 spin_unlock(&inode_lock);
668} 568}
669 569
670/* 570/*
@@ -702,16 +602,14 @@ static inline bool over_bground_thresh(void)
702 * all dirty pages if they are all attached to "old" mappings. 602 * all dirty pages if they are all attached to "old" mappings.
703 */ 603 */
704static long wb_writeback(struct bdi_writeback *wb, 604static long wb_writeback(struct bdi_writeback *wb,
705 struct wb_writeback_args *args) 605 struct wb_writeback_work *work)
706{ 606{
707 struct writeback_control wbc = { 607 struct writeback_control wbc = {
708 .bdi = wb->bdi, 608 .sync_mode = work->sync_mode,
709 .sb = args->sb,
710 .sync_mode = args->sync_mode,
711 .older_than_this = NULL, 609 .older_than_this = NULL,
712 .for_kupdate = args->for_kupdate, 610 .for_kupdate = work->for_kupdate,
713 .for_background = args->for_background, 611 .for_background = work->for_background,
714 .range_cyclic = args->range_cyclic, 612 .range_cyclic = work->range_cyclic,
715 }; 613 };
716 unsigned long oldest_jif; 614 unsigned long oldest_jif;
717 long wrote = 0; 615 long wrote = 0;
@@ -731,21 +629,24 @@ static long wb_writeback(struct bdi_writeback *wb,
731 /* 629 /*
732 * Stop writeback when nr_pages has been consumed 630 * Stop writeback when nr_pages has been consumed
733 */ 631 */
734 if (args->nr_pages <= 0) 632 if (work->nr_pages <= 0)
735 break; 633 break;
736 634
737 /* 635 /*
738 * For background writeout, stop when we are below the 636 * For background writeout, stop when we are below the
739 * background dirty threshold 637 * background dirty threshold
740 */ 638 */
741 if (args->for_background && !over_bground_thresh()) 639 if (work->for_background && !over_bground_thresh())
742 break; 640 break;
743 641
744 wbc.more_io = 0; 642 wbc.more_io = 0;
745 wbc.nr_to_write = MAX_WRITEBACK_PAGES; 643 wbc.nr_to_write = MAX_WRITEBACK_PAGES;
746 wbc.pages_skipped = 0; 644 wbc.pages_skipped = 0;
747 writeback_inodes_wb(wb, &wbc); 645 if (work->sb)
748 args->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; 646 __writeback_inodes_sb(work->sb, wb, &wbc);
647 else
648 writeback_inodes_wb(wb, &wbc);
649 work->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
749 wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write; 650 wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;
750 651
751 /* 652 /*
@@ -781,31 +682,21 @@ static long wb_writeback(struct bdi_writeback *wb,
781} 682}
782 683
783/* 684/*
784 * Return the next bdi_work struct that hasn't been processed by this 685 * Return the next wb_writeback_work struct that hasn't been processed yet.
785 * wb thread yet. ->seen is initially set for each thread that exists
786 * for this device, when a thread first notices a piece of work it
787 * clears its bit. Depending on writeback type, the thread will notify
788 * completion on either receiving the work (WB_SYNC_NONE) or after
789 * it is done (WB_SYNC_ALL).
790 */ 686 */
791static struct bdi_work *get_next_work_item(struct backing_dev_info *bdi, 687static struct wb_writeback_work *
792 struct bdi_writeback *wb) 688get_next_work_item(struct backing_dev_info *bdi, struct bdi_writeback *wb)
793{ 689{
794 struct bdi_work *work, *ret = NULL; 690 struct wb_writeback_work *work = NULL;
795
796 rcu_read_lock();
797 691
798 list_for_each_entry_rcu(work, &bdi->work_list, list) { 692 spin_lock(&bdi->wb_lock);
799 if (!test_bit(wb->nr, &work->seen)) 693 if (!list_empty(&bdi->work_list)) {
800 continue; 694 work = list_entry(bdi->work_list.next,
801 clear_bit(wb->nr, &work->seen); 695 struct wb_writeback_work, list);
802 696 list_del_init(&work->list);
803 ret = work;
804 break;
805 } 697 }
806 698 spin_unlock(&bdi->wb_lock);
807 rcu_read_unlock(); 699 return work;
808 return ret;
809} 700}
810 701
811static long wb_check_old_data_flush(struct bdi_writeback *wb) 702static long wb_check_old_data_flush(struct bdi_writeback *wb)
@@ -830,14 +721,14 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
830 (inodes_stat.nr_inodes - inodes_stat.nr_unused); 721 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
831 722
832 if (nr_pages) { 723 if (nr_pages) {
833 struct wb_writeback_args args = { 724 struct wb_writeback_work work = {
834 .nr_pages = nr_pages, 725 .nr_pages = nr_pages,
835 .sync_mode = WB_SYNC_NONE, 726 .sync_mode = WB_SYNC_NONE,
836 .for_kupdate = 1, 727 .for_kupdate = 1,
837 .range_cyclic = 1, 728 .range_cyclic = 1,
838 }; 729 };
839 730
840 return wb_writeback(wb, &args); 731 return wb_writeback(wb, &work);
841 } 732 }
842 733
843 return 0; 734 return 0;
@@ -849,33 +740,27 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
849long wb_do_writeback(struct bdi_writeback *wb, int force_wait) 740long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
850{ 741{
851 struct backing_dev_info *bdi = wb->bdi; 742 struct backing_dev_info *bdi = wb->bdi;
852 struct bdi_work *work; 743 struct wb_writeback_work *work;
853 long wrote = 0; 744 long wrote = 0;
854 745
855 while ((work = get_next_work_item(bdi, wb)) != NULL) { 746 while ((work = get_next_work_item(bdi, wb)) != NULL) {
856 struct wb_writeback_args args = work->args;
857
858 /* 747 /*
859 * Override sync mode, in case we must wait for completion 748 * Override sync mode, in case we must wait for completion
749 * because this thread is exiting now.
860 */ 750 */
861 if (force_wait) 751 if (force_wait)
862 work->args.sync_mode = args.sync_mode = WB_SYNC_ALL; 752 work->sync_mode = WB_SYNC_ALL;
863
864 /*
865 * If this isn't a data integrity operation, just notify
866 * that we have seen this work and we are now starting it.
867 */
868 if (!test_bit(WS_ONSTACK, &work->state))
869 wb_clear_pending(wb, work);
870 753
871 wrote += wb_writeback(wb, &args); 754 wrote += wb_writeback(wb, work);
872 755
873 /* 756 /*
874 * This is a data integrity writeback, so only do the 757 * Notify the caller of completion if this is a synchronous
875 * notification when we have completed the work. 758 * work item, otherwise just free it.
876 */ 759 */
877 if (test_bit(WS_ONSTACK, &work->state)) 760 if (work->done)
878 wb_clear_pending(wb, work); 761 complete(work->done);
762 else
763 kfree(work);
879 } 764 }
880 765
881 /* 766 /*
@@ -938,14 +823,9 @@ int bdi_writeback_task(struct bdi_writeback *wb)
938void wakeup_flusher_threads(long nr_pages) 823void wakeup_flusher_threads(long nr_pages)
939{ 824{
940 struct backing_dev_info *bdi; 825 struct backing_dev_info *bdi;
941 struct wb_writeback_args args = {
942 .sync_mode = WB_SYNC_NONE,
943 };
944 826
945 if (nr_pages) { 827 if (!nr_pages) {
946 args.nr_pages = nr_pages; 828 nr_pages = global_page_state(NR_FILE_DIRTY) +
947 } else {
948 args.nr_pages = global_page_state(NR_FILE_DIRTY) +
949 global_page_state(NR_UNSTABLE_NFS); 829 global_page_state(NR_UNSTABLE_NFS);
950 } 830 }
951 831
@@ -953,7 +833,7 @@ void wakeup_flusher_threads(long nr_pages)
953 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { 833 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
954 if (!bdi_has_dirty_io(bdi)) 834 if (!bdi_has_dirty_io(bdi))
955 continue; 835 continue;
956 bdi_alloc_queue_work(bdi, &args); 836 __bdi_start_writeback(bdi, nr_pages, false, false);
957 } 837 }
958 rcu_read_unlock(); 838 rcu_read_unlock();
959} 839}
@@ -1162,17 +1042,20 @@ void writeback_inodes_sb(struct super_block *sb)
1162{ 1042{
1163 unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); 1043 unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
1164 unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS); 1044 unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
1165 struct wb_writeback_args args = { 1045 DECLARE_COMPLETION_ONSTACK(done);
1046 struct wb_writeback_work work = {
1166 .sb = sb, 1047 .sb = sb,
1167 .sync_mode = WB_SYNC_NONE, 1048 .sync_mode = WB_SYNC_NONE,
1049 .done = &done,
1168 }; 1050 };
1169 1051
1170 WARN_ON(!rwsem_is_locked(&sb->s_umount)); 1052 WARN_ON(!rwsem_is_locked(&sb->s_umount));
1171 1053
1172 args.nr_pages = nr_dirty + nr_unstable + 1054 work.nr_pages = nr_dirty + nr_unstable +
1173 (inodes_stat.nr_inodes - inodes_stat.nr_unused); 1055 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
1174 1056
1175 bdi_queue_work_onstack(&args); 1057 bdi_queue_work(sb->s_bdi, &work);
1058 wait_for_completion(&done);
1176} 1059}
1177EXPORT_SYMBOL(writeback_inodes_sb); 1060EXPORT_SYMBOL(writeback_inodes_sb);
1178 1061
@@ -1204,16 +1087,20 @@ EXPORT_SYMBOL(writeback_inodes_sb_if_idle);
1204 */ 1087 */
1205void sync_inodes_sb(struct super_block *sb) 1088void sync_inodes_sb(struct super_block *sb)
1206{ 1089{
1207 struct wb_writeback_args args = { 1090 DECLARE_COMPLETION_ONSTACK(done);
1091 struct wb_writeback_work work = {
1208 .sb = sb, 1092 .sb = sb,
1209 .sync_mode = WB_SYNC_ALL, 1093 .sync_mode = WB_SYNC_ALL,
1210 .nr_pages = LONG_MAX, 1094 .nr_pages = LONG_MAX,
1211 .range_cyclic = 0, 1095 .range_cyclic = 0,
1096 .done = &done,
1212 }; 1097 };
1213 1098
1214 WARN_ON(!rwsem_is_locked(&sb->s_umount)); 1099 WARN_ON(!rwsem_is_locked(&sb->s_umount));
1215 1100
1216 bdi_queue_work_onstack(&args); 1101 bdi_queue_work(sb->s_bdi, &work);
1102 wait_for_completion(&done);
1103
1217 wait_sb_inodes(sb); 1104 wait_sb_inodes(sb);
1218} 1105}
1219EXPORT_SYMBOL(sync_inodes_sb); 1106EXPORT_SYMBOL(sync_inodes_sb);
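The fs-writeback.c changes above replace the old bdi_work/seen-bitmap handshake with plain wb_writeback_work items pulled off bdi->work_list under wb_lock, plus an optional completion for synchronous callers. A minimal sketch of that caller/worker pattern, with invented names standing in for the bdi-specific pieces:

#include <linux/completion.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct demo_work {
	struct list_head list;
	struct completion *done;	/* NULL for fire-and-forget items */
};

static LIST_HEAD(demo_list);
static DEFINE_SPINLOCK(demo_lock);

/* Synchronous caller, as in the new writeback_inodes_sb()/sync_inodes_sb():
 * the work item and the completion both live on the caller's stack. */
static void demo_queue_and_wait(void)
{
	DECLARE_COMPLETION_ONSTACK(done);
	struct demo_work work = { .done = &done };

	spin_lock(&demo_lock);
	list_add_tail(&work.list, &demo_list);
	spin_unlock(&demo_lock);
	/* ... wake the worker thread here ... */
	wait_for_completion(&done);
}

/* Worker, as in the new wb_do_writeback(): signal synchronous items,
 * free asynchronous (kmalloc'd) ones. */
static void demo_process_one(struct demo_work *work)
{
	/* ... perform the writeback the item describes ... */
	if (work->done)
		complete(work->done);
	else
		kfree(work);
}

Because the synchronous item is stack-allocated, the worker must not touch it after complete(), which is why the completion is the last thing wb_do_writeback() does with the item above.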
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 4a48c0f4b402..84da64b551b2 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -1041,6 +1041,7 @@ static int trunc_start(struct gfs2_inode *ip, u64 size)
1041 1041
1042 if (gfs2_is_stuffed(ip)) { 1042 if (gfs2_is_stuffed(ip)) {
1043 u64 dsize = size + sizeof(struct gfs2_inode); 1043 u64 dsize = size + sizeof(struct gfs2_inode);
1044 ip->i_disksize = size;
1044 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 1045 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
1045 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 1046 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1046 gfs2_dinode_out(ip, dibh->b_data); 1047 gfs2_dinode_out(ip, dibh->b_data);
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 8295c5b5d4a9..26ca3361a8bc 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -392,7 +392,7 @@ static int gfs2_dirent_find_space(const struct gfs2_dirent *dent,
392 unsigned totlen = be16_to_cpu(dent->de_rec_len); 392 unsigned totlen = be16_to_cpu(dent->de_rec_len);
393 393
394 if (gfs2_dirent_sentinel(dent)) 394 if (gfs2_dirent_sentinel(dent))
395 actual = GFS2_DIRENT_SIZE(0); 395 actual = 0;
396 if (totlen - actual >= required) 396 if (totlen - actual >= required)
397 return 1; 397 return 1;
398 return 0; 398 return 0;
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index ddcdbf493536..0898f3ec8212 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -706,8 +706,18 @@ static void glock_work_func(struct work_struct *work)
706{ 706{
707 unsigned long delay = 0; 707 unsigned long delay = 0;
708 struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_work.work); 708 struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_work.work);
709 struct gfs2_holder *gh;
709 int drop_ref = 0; 710 int drop_ref = 0;
710 711
712 if (unlikely(test_bit(GLF_FROZEN, &gl->gl_flags))) {
713 spin_lock(&gl->gl_spin);
714 gh = find_first_waiter(gl);
715 if (gh && (gh->gh_flags & LM_FLAG_NOEXP) &&
716 test_and_clear_bit(GLF_FROZEN, &gl->gl_flags))
717 set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
718 spin_unlock(&gl->gl_spin);
719 }
720
711 if (test_and_clear_bit(GLF_REPLY_PENDING, &gl->gl_flags)) { 721 if (test_and_clear_bit(GLF_REPLY_PENDING, &gl->gl_flags)) {
712 finish_xmote(gl, gl->gl_reply); 722 finish_xmote(gl, gl->gl_reply);
713 drop_ref = 1; 723 drop_ref = 1;
@@ -1348,7 +1358,7 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
1348} 1358}
1349 1359
1350 1360
1351static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask) 1361static int gfs2_shrink_glock_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
1352{ 1362{
1353 struct gfs2_glock *gl; 1363 struct gfs2_glock *gl;
1354 int may_demote; 1364 int may_demote;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index b5612cbb62a5..f03afd9c44bc 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -169,7 +169,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb,
169{ 169{
170 struct inode *inode; 170 struct inode *inode;
171 struct gfs2_inode *ip; 171 struct gfs2_inode *ip;
172 struct gfs2_glock *io_gl; 172 struct gfs2_glock *io_gl = NULL;
173 int error; 173 int error;
174 174
175 inode = gfs2_iget(sb, no_addr); 175 inode = gfs2_iget(sb, no_addr);
@@ -198,6 +198,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb,
198 ip->i_iopen_gh.gh_gl->gl_object = ip; 198 ip->i_iopen_gh.gh_gl->gl_object = ip;
199 199
200 gfs2_glock_put(io_gl); 200 gfs2_glock_put(io_gl);
201 io_gl = NULL;
201 202
202 if ((type == DT_UNKNOWN) && (no_formal_ino == 0)) 203 if ((type == DT_UNKNOWN) && (no_formal_ino == 0))
203 goto gfs2_nfsbypass; 204 goto gfs2_nfsbypass;
@@ -228,7 +229,8 @@ gfs2_nfsbypass:
228fail_glock: 229fail_glock:
229 gfs2_glock_dq(&ip->i_iopen_gh); 230 gfs2_glock_dq(&ip->i_iopen_gh);
230fail_iopen: 231fail_iopen:
231 gfs2_glock_put(io_gl); 232 if (io_gl)
233 gfs2_glock_put(io_gl);
232fail_put: 234fail_put:
233 if (inode->i_state & I_NEW) 235 if (inode->i_state & I_NEW)
234 ip->i_gl->gl_object = NULL; 236 ip->i_gl->gl_object = NULL;
@@ -256,7 +258,7 @@ void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr)
256{ 258{
257 struct gfs2_sbd *sdp; 259 struct gfs2_sbd *sdp;
258 struct gfs2_inode *ip; 260 struct gfs2_inode *ip;
259 struct gfs2_glock *io_gl; 261 struct gfs2_glock *io_gl = NULL;
260 int error; 262 int error;
261 struct gfs2_holder gh; 263 struct gfs2_holder gh;
262 struct inode *inode; 264 struct inode *inode;
@@ -293,6 +295,7 @@ void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr)
293 295
294 ip->i_iopen_gh.gh_gl->gl_object = ip; 296 ip->i_iopen_gh.gh_gl->gl_object = ip;
295 gfs2_glock_put(io_gl); 297 gfs2_glock_put(io_gl);
298 io_gl = NULL;
296 299
297 inode->i_mode = DT2IF(DT_UNKNOWN); 300 inode->i_mode = DT2IF(DT_UNKNOWN);
298 301
@@ -319,7 +322,8 @@ void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr)
319fail_glock: 322fail_glock:
320 gfs2_glock_dq(&ip->i_iopen_gh); 323 gfs2_glock_dq(&ip->i_iopen_gh);
321fail_iopen: 324fail_iopen:
322 gfs2_glock_put(io_gl); 325 if (io_gl)
326 gfs2_glock_put(io_gl);
323fail_put: 327fail_put:
324 ip->i_gl->gl_object = NULL; 328 ip->i_gl->gl_object = NULL;
325 gfs2_glock_put(ip->i_gl); 329 gfs2_glock_put(ip->i_gl);
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 49667d68769e..8f02d3db8f42 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -77,7 +77,7 @@ static LIST_HEAD(qd_lru_list);
77static atomic_t qd_lru_count = ATOMIC_INIT(0); 77static atomic_t qd_lru_count = ATOMIC_INIT(0);
78static DEFINE_SPINLOCK(qd_lru_lock); 78static DEFINE_SPINLOCK(qd_lru_lock);
79 79
80int gfs2_shrink_qd_memory(int nr, gfp_t gfp_mask) 80int gfs2_shrink_qd_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
81{ 81{
82 struct gfs2_quota_data *qd; 82 struct gfs2_quota_data *qd;
83 struct gfs2_sbd *sdp; 83 struct gfs2_sbd *sdp;
@@ -694,10 +694,8 @@ get_a_page:
694 if (!buffer_mapped(bh)) 694 if (!buffer_mapped(bh))
695 goto unlock_out; 695 goto unlock_out;
696 /* If it's a newly allocated disk block for quota, zero it */ 696 /* If it's a newly allocated disk block for quota, zero it */
697 if (buffer_new(bh)) { 697 if (buffer_new(bh))
698 memset(bh->b_data, 0, bh->b_size); 698 zero_user(page, pos - blocksize, bh->b_size);
699 set_buffer_uptodate(bh);
700 }
701 } 699 }
702 700
703 if (PageUptodate(page)) 701 if (PageUptodate(page))
@@ -723,7 +721,7 @@ get_a_page:
723 721
724 /* If quota straddles page boundary, we need to update the rest of the 722 /* If quota straddles page boundary, we need to update the rest of the
725 * quota at the beginning of the next page */ 723 * quota at the beginning of the next page */
726 if (offset != 0) { /* first page, offset is closer to PAGE_CACHE_SIZE */ 724 if ((offset + sizeof(struct gfs2_quota)) > PAGE_CACHE_SIZE) {
727 ptr = ptr + nbytes; 725 ptr = ptr + nbytes;
728 nbytes = sizeof(struct gfs2_quota) - nbytes; 726 nbytes = sizeof(struct gfs2_quota) - nbytes;
729 offset = 0; 727 offset = 0;
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index 195f60c8bd14..e7d236ca48bd 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -51,7 +51,7 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip)
51 return ret; 51 return ret;
52} 52}
53 53
54extern int gfs2_shrink_qd_memory(int nr, gfp_t gfp_mask); 54extern int gfs2_shrink_qd_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask);
55extern const struct quotactl_ops gfs2_quotactl_ops; 55extern const struct quotactl_ops gfs2_quotactl_ops;
56 56
57#endif /* __QUOTA_DOT_H__ */ 57#endif /* __QUOTA_DOT_H__ */
diff --git a/fs/inode.c b/fs/inode.c
index 2bee20ae3d65..722860b323a9 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -512,7 +512,7 @@ static void prune_icache(int nr_to_scan)
512 * This function is passed the number of inodes to scan, and it returns the 512 * This function is passed the number of inodes to scan, and it returns the
513 * total number of remaining possibly-reclaimable inodes. 513 * total number of remaining possibly-reclaimable inodes.
514 */ 514 */
515static int shrink_icache_memory(int nr, gfp_t gfp_mask) 515static int shrink_icache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
516{ 516{
517 if (nr) { 517 if (nr) {
518 /* 518 /*
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index bc2ff5932769..036880895bfc 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -297,7 +297,6 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
297 struct page *new_page; 297 struct page *new_page;
298 unsigned int new_offset; 298 unsigned int new_offset;
299 struct buffer_head *bh_in = jh2bh(jh_in); 299 struct buffer_head *bh_in = jh2bh(jh_in);
300 struct jbd2_buffer_trigger_type *triggers;
301 journal_t *journal = transaction->t_journal; 300 journal_t *journal = transaction->t_journal;
302 301
303 /* 302 /*
@@ -328,21 +327,21 @@ repeat:
328 done_copy_out = 1; 327 done_copy_out = 1;
329 new_page = virt_to_page(jh_in->b_frozen_data); 328 new_page = virt_to_page(jh_in->b_frozen_data);
330 new_offset = offset_in_page(jh_in->b_frozen_data); 329 new_offset = offset_in_page(jh_in->b_frozen_data);
331 triggers = jh_in->b_frozen_triggers;
332 } else { 330 } else {
333 new_page = jh2bh(jh_in)->b_page; 331 new_page = jh2bh(jh_in)->b_page;
334 new_offset = offset_in_page(jh2bh(jh_in)->b_data); 332 new_offset = offset_in_page(jh2bh(jh_in)->b_data);
335 triggers = jh_in->b_triggers;
336 } 333 }
337 334
338 mapped_data = kmap_atomic(new_page, KM_USER0); 335 mapped_data = kmap_atomic(new_page, KM_USER0);
339 /* 336 /*
340 * Fire any commit trigger. Do this before checking for escaping, 337 * Fire data frozen trigger if data already wasn't frozen. Do this
341 * as the trigger may modify the magic offset. If a copy-out 338 * before checking for escaping, as the trigger may modify the magic
342 * happens afterwards, it will have the correct data in the buffer. 339 * offset. If a copy-out happens afterwards, it will have the correct
340 * data in the buffer.
343 */ 341 */
344 jbd2_buffer_commit_trigger(jh_in, mapped_data + new_offset, 342 if (!done_copy_out)
345 triggers); 343 jbd2_buffer_frozen_trigger(jh_in, mapped_data + new_offset,
344 jh_in->b_triggers);
346 345
347 /* 346 /*
348 * Check for escaping 347 * Check for escaping
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index e214d68620ac..b8e0806681bb 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -725,6 +725,9 @@ done:
725 page = jh2bh(jh)->b_page; 725 page = jh2bh(jh)->b_page;
726 offset = ((unsigned long) jh2bh(jh)->b_data) & ~PAGE_MASK; 726 offset = ((unsigned long) jh2bh(jh)->b_data) & ~PAGE_MASK;
727 source = kmap_atomic(page, KM_USER0); 727 source = kmap_atomic(page, KM_USER0);
728 /* Fire data frozen trigger just before we copy the data */
729 jbd2_buffer_frozen_trigger(jh, source + offset,
730 jh->b_triggers);
728 memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size); 731 memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size);
729 kunmap_atomic(source, KM_USER0); 732 kunmap_atomic(source, KM_USER0);
730 733
@@ -963,15 +966,15 @@ void jbd2_journal_set_triggers(struct buffer_head *bh,
963 jh->b_triggers = type; 966 jh->b_triggers = type;
964} 967}
965 968
966void jbd2_buffer_commit_trigger(struct journal_head *jh, void *mapped_data, 969void jbd2_buffer_frozen_trigger(struct journal_head *jh, void *mapped_data,
967 struct jbd2_buffer_trigger_type *triggers) 970 struct jbd2_buffer_trigger_type *triggers)
968{ 971{
969 struct buffer_head *bh = jh2bh(jh); 972 struct buffer_head *bh = jh2bh(jh);
970 973
971 if (!triggers || !triggers->t_commit) 974 if (!triggers || !triggers->t_frozen)
972 return; 975 return;
973 976
974 triggers->t_commit(triggers, bh, mapped_data, bh->b_size); 977 triggers->t_frozen(triggers, bh, mapped_data, bh->b_size);
975} 978}
976 979
977void jbd2_buffer_abort_trigger(struct journal_head *jh, 980void jbd2_buffer_abort_trigger(struct journal_head *jh,
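With the t_commit to t_frozen rename above, a jbd2 trigger now runs when the frozen copy of a metadata buffer is made (the two hunks above show both call sites), so a client can patch metadata such as checksums into the image that will actually reach the journal. A rough sketch of a client, assuming only the jbd2 hooks shown in this patch; the block layout and checksum helper below are placeholders, not real kernel symbols:

#include <linux/types.h>
#include <linux/jbd2.h>

struct demo_block {
	__le32 d_check;		/* imaginary on-disk checksum field */
	/* ... payload ... */
};

static u32 demo_csum(const void *data, size_t size)
{
	return 0;		/* stand-in for a real checksum */
}

static void demo_frozen_trigger(struct jbd2_buffer_trigger_type *triggers,
				struct buffer_head *bh, void *mapped_data,
				size_t size)
{
	struct demo_block *blk = mapped_data;

	/* Runs on the frozen image, so the live buffer is untouched. */
	blk->d_check = cpu_to_le32(demo_csum(mapped_data, size));
}

static void demo_abort_trigger(struct jbd2_buffer_trigger_type *triggers,
			       struct buffer_head *bh)
{
	/* Cleanup hook used when the journalled update is abandoned. */
}

static struct jbd2_buffer_trigger_type demo_triggers = {
	.t_frozen = demo_frozen_trigger,
	.t_abort  = demo_abort_trigger,
};

/* Attached with jbd2_journal_set_triggers(bh, &demo_triggers) before the
 * buffer is dirtied in a transaction, as fs/ocfs2/journal.c does below. */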
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index a2d58c96f1b4..d258e261bdc7 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -626,7 +626,7 @@ void jffs2_xattr_free_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *i
626 626
627static int check_xattr_ref_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic) 627static int check_xattr_ref_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic)
628{ 628{
629 /* success of check_xattr_ref_inode() means taht inode (ic) dose not have 629 /* success of check_xattr_ref_inode() means that inode (ic) dose not have
630 * duplicate name/value pairs. If duplicate name/value pair would be found, 630 * duplicate name/value pairs. If duplicate name/value pair would be found,
631 * one will be removed. 631 * one will be removed.
632 */ 632 */
diff --git a/fs/mbcache.c b/fs/mbcache.c
index ec88ff3d04a9..e28f21b95344 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -115,7 +115,7 @@ mb_cache_indexes(struct mb_cache *cache)
115 * What the mbcache registers as to get shrunk dynamically. 115 * What the mbcache registers as to get shrunk dynamically.
116 */ 116 */
117 117
118static int mb_cache_shrink_fn(int nr_to_scan, gfp_t gfp_mask); 118static int mb_cache_shrink_fn(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask);
119 119
120static struct shrinker mb_cache_shrinker = { 120static struct shrinker mb_cache_shrinker = {
121 .shrink = mb_cache_shrink_fn, 121 .shrink = mb_cache_shrink_fn,
@@ -191,13 +191,14 @@ forget:
191 * This function is called by the kernel memory management when memory 191 * This function is called by the kernel memory management when memory
192 * gets low. 192 * gets low.
193 * 193 *
194 * @shrink: (ignored)
194 * @nr_to_scan: Number of objects to scan 195 * @nr_to_scan: Number of objects to scan
195 * @gfp_mask: (ignored) 196 * @gfp_mask: (ignored)
196 * 197 *
197 * Returns the number of objects which are present in the cache. 198 * Returns the number of objects which are present in the cache.
198 */ 199 */
199static int 200static int
200mb_cache_shrink_fn(int nr_to_scan, gfp_t gfp_mask) 201mb_cache_shrink_fn(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
201{ 202{
202 LIST_HEAD(free_list); 203 LIST_HEAD(free_list);
203 struct list_head *l, *ltmp; 204 struct list_head *l, *ltmp;
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 782b431ef91c..e60416d3f818 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1710,7 +1710,7 @@ static void nfs_access_free_list(struct list_head *head)
1710 } 1710 }
1711} 1711}
1712 1712
1713int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask) 1713int nfs_access_cache_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
1714{ 1714{
1715 LIST_HEAD(head); 1715 LIST_HEAD(head);
1716 struct nfs_inode *nfsi; 1716 struct nfs_inode *nfsi;
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index d8bd619e386c..e70f44b9b3f4 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -205,7 +205,8 @@ extern struct rpc_procinfo nfs4_procedures[];
205void nfs_close_context(struct nfs_open_context *ctx, int is_sync); 205void nfs_close_context(struct nfs_open_context *ctx, int is_sync);
206 206
207/* dir.c */ 207/* dir.c */
208extern int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask); 208extern int nfs_access_cache_shrinker(struct shrinker *shrink,
209 int nr_to_scan, gfp_t gfp_mask);
209 210
210/* inode.c */ 211/* inode.c */
211extern struct workqueue_struct *nfsiod_workqueue; 212extern struct workqueue_struct *nfsiod_workqueue;
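The gfs2, inode-cache, mbcache, and nfs hunks above all convert to the same new shrinker callback signature, which passes the struct shrinker itself so a cache can recover its private state with container_of() instead of relying on globals. A self-contained sketch under that assumption, with invented names and the object-dropping logic elided:

#include <linux/mm.h>
#include <linux/module.h>

struct demo_cache {
	struct shrinker shrinker;
	int nr_objects;			/* count of reclaimable objects */
};

static int demo_cache_shrink(struct shrinker *shrink, int nr_to_scan,
			     gfp_t gfp_mask)
{
	struct demo_cache *cache = container_of(shrink, struct demo_cache,
						shrinker);

	if (nr_to_scan) {
		/* ... drop up to nr_to_scan objects, honouring gfp_mask ... */
	}

	/* Return how many reclaimable objects remain in the cache. */
	return cache->nr_objects;
}

static struct demo_cache demo_cache = {
	.shrinker = {
		.shrink	= demo_cache_shrink,
		.seeks	= DEFAULT_SEEKS,
	},
};

static int __init demo_cache_init(void)
{
	register_shrinker(&demo_cache.shrinker);
	return 0;
}

static void __exit demo_cache_exit(void)
{
	unregister_shrinker(&demo_cache.shrinker);
}

Callers that keep a single global cache, as mb_cache_shrink_fn() and shrink_icache_memory() above do, can simply ignore the new argument.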
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 3623ca20cc18..356e976772bf 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -196,15 +196,14 @@ int ocfs2_get_block(struct inode *inode, sector_t iblock,
196 dump_stack(); 196 dump_stack();
197 goto bail; 197 goto bail;
198 } 198 }
199
200 past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
201 mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino,
202 (unsigned long long)past_eof);
203
204 if (create && (iblock >= past_eof))
205 set_buffer_new(bh_result);
206 } 199 }
207 200
201 past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
202 mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino,
203 (unsigned long long)past_eof);
204 if (create && (iblock >= past_eof))
205 set_buffer_new(bh_result);
206
208bail: 207bail:
209 if (err < 0) 208 if (err < 0)
210 err = -EIO; 209 err = -EIO;
@@ -459,36 +458,6 @@ int walk_page_buffers( handle_t *handle,
459 return ret; 458 return ret;
460} 459}
461 460
462handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
463 struct page *page,
464 unsigned from,
465 unsigned to)
466{
467 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
468 handle_t *handle;
469 int ret = 0;
470
471 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
472 if (IS_ERR(handle)) {
473 ret = -ENOMEM;
474 mlog_errno(ret);
475 goto out;
476 }
477
478 if (ocfs2_should_order_data(inode)) {
479 ret = ocfs2_jbd2_file_inode(handle, inode);
480 if (ret < 0)
481 mlog_errno(ret);
482 }
483out:
484 if (ret) {
485 if (!IS_ERR(handle))
486 ocfs2_commit_trans(osb, handle);
487 handle = ERR_PTR(ret);
488 }
489 return handle;
490}
491
492static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block) 461static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
493{ 462{
494 sector_t status; 463 sector_t status;
@@ -1131,23 +1100,37 @@ out:
1131 */ 1100 */
1132static int ocfs2_grab_pages_for_write(struct address_space *mapping, 1101static int ocfs2_grab_pages_for_write(struct address_space *mapping,
1133 struct ocfs2_write_ctxt *wc, 1102 struct ocfs2_write_ctxt *wc,
1134 u32 cpos, loff_t user_pos, int new, 1103 u32 cpos, loff_t user_pos,
1104 unsigned user_len, int new,
1135 struct page *mmap_page) 1105 struct page *mmap_page)
1136{ 1106{
1137 int ret = 0, i; 1107 int ret = 0, i;
1138 unsigned long start, target_index, index; 1108 unsigned long start, target_index, end_index, index;
1139 struct inode *inode = mapping->host; 1109 struct inode *inode = mapping->host;
1110 loff_t last_byte;
1140 1111
1141 target_index = user_pos >> PAGE_CACHE_SHIFT; 1112 target_index = user_pos >> PAGE_CACHE_SHIFT;
1142 1113
1143 /* 1114 /*
1144 * Figure out how many pages we'll be manipulating here. For 1115 * Figure out how many pages we'll be manipulating here. For
1145 * non allocating write, we just change the one 1116 * non allocating write, we just change the one
1146 * page. Otherwise, we'll need a whole clusters worth. 1117 * page. Otherwise, we'll need a whole clusters worth. If we're
1118 * writing past i_size, we only need enough pages to cover the
1119 * last page of the write.
1147 */ 1120 */
1148 if (new) { 1121 if (new) {
1149 wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb); 1122 wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb);
1150 start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos); 1123 start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos);
1124 /*
1125 * We need the index *past* the last page we could possibly
1126 * touch. This is the page past the end of the write or
1127 * i_size, whichever is greater.
1128 */
1129 last_byte = max(user_pos + user_len, i_size_read(inode));
1130 BUG_ON(last_byte < 1);
1131 end_index = ((last_byte - 1) >> PAGE_CACHE_SHIFT) + 1;
1132 if ((start + wc->w_num_pages) > end_index)
1133 wc->w_num_pages = end_index - start;
1151 } else { 1134 } else {
1152 wc->w_num_pages = 1; 1135 wc->w_num_pages = 1;
1153 start = target_index; 1136 start = target_index;
@@ -1620,21 +1603,20 @@ out:
1620 * write path can treat it as an non-allocating write, which has no 1603 * write path can treat it as an non-allocating write, which has no
1621 * special case code for sparse/nonsparse files. 1604 * special case code for sparse/nonsparse files.
1622 */ 1605 */
1623static int ocfs2_expand_nonsparse_inode(struct inode *inode, loff_t pos, 1606static int ocfs2_expand_nonsparse_inode(struct inode *inode,
1624 unsigned len, 1607 struct buffer_head *di_bh,
1608 loff_t pos, unsigned len,
1625 struct ocfs2_write_ctxt *wc) 1609 struct ocfs2_write_ctxt *wc)
1626{ 1610{
1627 int ret; 1611 int ret;
1628 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1629 loff_t newsize = pos + len; 1612 loff_t newsize = pos + len;
1630 1613
1631 if (ocfs2_sparse_alloc(osb)) 1614 BUG_ON(ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)));
1632 return 0;
1633 1615
1634 if (newsize <= i_size_read(inode)) 1616 if (newsize <= i_size_read(inode))
1635 return 0; 1617 return 0;
1636 1618
1637 ret = ocfs2_extend_no_holes(inode, newsize, pos); 1619 ret = ocfs2_extend_no_holes(inode, di_bh, newsize, pos);
1638 if (ret) 1620 if (ret)
1639 mlog_errno(ret); 1621 mlog_errno(ret);
1640 1622
@@ -1644,6 +1626,18 @@ static int ocfs2_expand_nonsparse_inode(struct inode *inode, loff_t pos,
1644 return ret; 1626 return ret;
1645} 1627}
1646 1628
1629static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh,
1630 loff_t pos)
1631{
1632 int ret = 0;
1633
1634 BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)));
1635 if (pos > i_size_read(inode))
1636 ret = ocfs2_zero_extend(inode, di_bh, pos);
1637
1638 return ret;
1639}
1640
1647int ocfs2_write_begin_nolock(struct address_space *mapping, 1641int ocfs2_write_begin_nolock(struct address_space *mapping,
1648 loff_t pos, unsigned len, unsigned flags, 1642 loff_t pos, unsigned len, unsigned flags,
1649 struct page **pagep, void **fsdata, 1643 struct page **pagep, void **fsdata,
@@ -1679,7 +1673,11 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
1679 } 1673 }
1680 } 1674 }
1681 1675
1682 ret = ocfs2_expand_nonsparse_inode(inode, pos, len, wc); 1676 if (ocfs2_sparse_alloc(osb))
1677 ret = ocfs2_zero_tail(inode, di_bh, pos);
1678 else
1679 ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos, len,
1680 wc);
1683 if (ret) { 1681 if (ret) {
1684 mlog_errno(ret); 1682 mlog_errno(ret);
1685 goto out; 1683 goto out;
@@ -1789,7 +1787,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
1789 * that we can zero and flush if we error after adding the 1787 * that we can zero and flush if we error after adding the
1790 * extent. 1788 * extent.
1791 */ 1789 */
1792 ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, 1790 ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, len,
1793 cluster_of_pages, mmap_page); 1791 cluster_of_pages, mmap_page);
1794 if (ret) { 1792 if (ret) {
1795 mlog_errno(ret); 1793 mlog_errno(ret);
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 6b5a492e1749..153abb5abef0 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -1671,7 +1671,7 @@ struct dlm_ctxt * dlm_register_domain(const char *domain,
1671 struct dlm_ctxt *dlm = NULL; 1671 struct dlm_ctxt *dlm = NULL;
1672 struct dlm_ctxt *new_ctxt = NULL; 1672 struct dlm_ctxt *new_ctxt = NULL;
1673 1673
1674 if (strlen(domain) > O2NM_MAX_NAME_LEN) { 1674 if (strlen(domain) >= O2NM_MAX_NAME_LEN) {
1675 ret = -ENAMETOOLONG; 1675 ret = -ENAMETOOLONG;
1676 mlog(ML_ERROR, "domain name length too long\n"); 1676 mlog(ML_ERROR, "domain name length too long\n");
1677 goto leave; 1677 goto leave;
@@ -1709,6 +1709,7 @@ retry:
1709 } 1709 }
1710 1710
1711 if (dlm_protocol_compare(&dlm->fs_locking_proto, fs_proto)) { 1711 if (dlm_protocol_compare(&dlm->fs_locking_proto, fs_proto)) {
1712 spin_unlock(&dlm_domain_lock);
1712 mlog(ML_ERROR, 1713 mlog(ML_ERROR,
1713 "Requested locking protocol version is not " 1714 "Requested locking protocol version is not "
1714 "compatible with already registered domain " 1715 "compatible with already registered domain "
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 4a7506a4e314..94b97fc6a88e 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -2808,14 +2808,8 @@ again:
2808 mlog(0, "trying again...\n"); 2808 mlog(0, "trying again...\n");
2809 goto again; 2809 goto again;
2810 } 2810 }
2811 /* now that we are sure the MIGRATING state is there, drop
2812 * the unneded state which blocked threads trying to DIRTY */
2813 spin_lock(&res->spinlock);
2814 BUG_ON(!(res->state & DLM_LOCK_RES_BLOCK_DIRTY));
2815 BUG_ON(!(res->state & DLM_LOCK_RES_MIGRATING));
2816 res->state &= ~DLM_LOCK_RES_BLOCK_DIRTY;
2817 spin_unlock(&res->spinlock);
2818 2811
2812 ret = 0;
2819 /* did the target go down or die? */ 2813 /* did the target go down or die? */
2820 spin_lock(&dlm->spinlock); 2814 spin_lock(&dlm->spinlock);
2821 if (!test_bit(target, dlm->domain_map)) { 2815 if (!test_bit(target, dlm->domain_map)) {
@@ -2826,9 +2820,21 @@ again:
2826 spin_unlock(&dlm->spinlock); 2820 spin_unlock(&dlm->spinlock);
2827 2821
2828 /* 2822 /*
2823 * if target is down, we need to clear DLM_LOCK_RES_BLOCK_DIRTY for
2824 * another try; otherwise, we are sure the MIGRATING state is there,
2825 * drop the unneded state which blocked threads trying to DIRTY
2826 */
2827 spin_lock(&res->spinlock);
2828 BUG_ON(!(res->state & DLM_LOCK_RES_BLOCK_DIRTY));
2829 res->state &= ~DLM_LOCK_RES_BLOCK_DIRTY;
2830 if (!ret)
2831 BUG_ON(!(res->state & DLM_LOCK_RES_MIGRATING));
2832 spin_unlock(&res->spinlock);
2833
2834 /*
2829 * at this point: 2835 * at this point:
2830 * 2836 *
2831 * o the DLM_LOCK_RES_MIGRATING flag is set 2837 * o the DLM_LOCK_RES_MIGRATING flag is set if target not down
2832 * o there are no pending asts on this lockres 2838 * o there are no pending asts on this lockres
2833 * o all processes trying to reserve an ast on this 2839 * o all processes trying to reserve an ast on this
2834 * lockres must wait for the MIGRATING flag to clear 2840 * lockres must wait for the MIGRATING flag to clear
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index f8b75ce4be70..9dfaac73b36d 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -463,7 +463,7 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
463 if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) { 463 if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) {
464 int bit; 464 int bit;
465 465
466 bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES+1, 0); 466 bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES, 0);
467 if (bit >= O2NM_MAX_NODES || bit < 0) 467 if (bit >= O2NM_MAX_NODES || bit < 0)
468 dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM); 468 dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM);
469 else 469 else
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 6a13ea64c447..2b10b36d1577 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -724,28 +724,55 @@ leave:
724 return status; 724 return status;
725} 725}
726 726
727/*
728 * While a write will already be ordering the data, a truncate will not.
729 * Thus, we need to explicitly order the zeroed pages.
730 */
731static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode)
732{
733 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
734 handle_t *handle = NULL;
735 int ret = 0;
736
737 if (!ocfs2_should_order_data(inode))
738 goto out;
739
740 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
741 if (IS_ERR(handle)) {
742 ret = -ENOMEM;
743 mlog_errno(ret);
744 goto out;
745 }
746
747 ret = ocfs2_jbd2_file_inode(handle, inode);
748 if (ret < 0)
749 mlog_errno(ret);
750
751out:
752 if (ret) {
753 if (!IS_ERR(handle))
754 ocfs2_commit_trans(osb, handle);
755 handle = ERR_PTR(ret);
756 }
757 return handle;
758}
759
727/* Some parts of this taken from generic_cont_expand, which turned out 760/* Some parts of this taken from generic_cont_expand, which turned out
728 * to be too fragile to do exactly what we need without us having to 761 * to be too fragile to do exactly what we need without us having to
729 * worry about recursive locking in ->write_begin() and ->write_end(). */ 762 * worry about recursive locking in ->write_begin() and ->write_end(). */
730static int ocfs2_write_zero_page(struct inode *inode, 763static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
731 u64 size) 764 u64 abs_to)
732{ 765{
733 struct address_space *mapping = inode->i_mapping; 766 struct address_space *mapping = inode->i_mapping;
734 struct page *page; 767 struct page *page;
735 unsigned long index; 768 unsigned long index = abs_from >> PAGE_CACHE_SHIFT;
736 unsigned int offset;
737 handle_t *handle = NULL; 769 handle_t *handle = NULL;
738 int ret; 770 int ret = 0;
771 unsigned zero_from, zero_to, block_start, block_end;
739 772
740 offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */ 773 BUG_ON(abs_from >= abs_to);
741 /* ugh. in prepare/commit_write, if from==to==start of block, we 774 BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT));
742 ** skip the prepare. make sure we never send an offset for the start 775 BUG_ON(abs_from & (inode->i_blkbits - 1));
743 ** of a block
744 */
745 if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
746 offset++;
747 }
748 index = size >> PAGE_CACHE_SHIFT;
749 776
750 page = grab_cache_page(mapping, index); 777 page = grab_cache_page(mapping, index);
751 if (!page) { 778 if (!page) {
@@ -754,31 +781,56 @@ static int ocfs2_write_zero_page(struct inode *inode,
754 goto out; 781 goto out;
755 } 782 }
756 783
757 ret = ocfs2_prepare_write_nolock(inode, page, offset, offset); 784 /* Get the offsets within the page that we want to zero */
758 if (ret < 0) { 785 zero_from = abs_from & (PAGE_CACHE_SIZE - 1);
759 mlog_errno(ret); 786 zero_to = abs_to & (PAGE_CACHE_SIZE - 1);
760 goto out_unlock; 787 if (!zero_to)
761 } 788 zero_to = PAGE_CACHE_SIZE;
762 789
763 if (ocfs2_should_order_data(inode)) { 790 mlog(0,
764 handle = ocfs2_start_walk_page_trans(inode, page, offset, 791 "abs_from = %llu, abs_to = %llu, index = %lu, zero_from = %u, zero_to = %u\n",
765 offset); 792 (unsigned long long)abs_from, (unsigned long long)abs_to,
766 if (IS_ERR(handle)) { 793 index, zero_from, zero_to);
767 ret = PTR_ERR(handle); 794
768 handle = NULL; 795 /* We know that zero_from is block aligned */
796 for (block_start = zero_from; block_start < zero_to;
797 block_start = block_end) {
798 block_end = block_start + (1 << inode->i_blkbits);
799
800 /*
801 * block_start is block-aligned. Bump it by one to
802 * force ocfs2_{prepare,commit}_write() to zero the
803 * whole block.
804 */
805 ret = ocfs2_prepare_write_nolock(inode, page,
806 block_start + 1,
807 block_start + 1);
808 if (ret < 0) {
809 mlog_errno(ret);
769 goto out_unlock; 810 goto out_unlock;
770 } 811 }
771 }
772 812
773 /* must not update i_size! */ 813 if (!handle) {
774 ret = block_commit_write(page, offset, offset); 814 handle = ocfs2_zero_start_ordered_transaction(inode);
775 if (ret < 0) 815 if (IS_ERR(handle)) {
776 mlog_errno(ret); 816 ret = PTR_ERR(handle);
777 else 817 handle = NULL;
778 ret = 0; 818 break;
819 }
820 }
821
822 /* must not update i_size! */
823 ret = block_commit_write(page, block_start + 1,
824 block_start + 1);
825 if (ret < 0)
826 mlog_errno(ret);
827 else
828 ret = 0;
829 }
779 830
780 if (handle) 831 if (handle)
781 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); 832 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
833
782out_unlock: 834out_unlock:
783 unlock_page(page); 835 unlock_page(page);
784 page_cache_release(page); 836 page_cache_release(page);
@@ -786,22 +838,114 @@ out:
786 return ret; 838 return ret;
787} 839}
788 840
789static int ocfs2_zero_extend(struct inode *inode, 841/*
790 u64 zero_to_size) 842 * Find the next range to zero. We do this in terms of bytes because
843 * that's what ocfs2_zero_extend() wants, and it is dealing with the
844 * pagecache. We may return multiple extents.
845 *
846 * zero_start and zero_end are ocfs2_zero_extend()s current idea of what
847 * needs to be zeroed. range_start and range_end return the next zeroing
848 * range. A subsequent call should pass the previous range_end as its
849 * zero_start. If range_end is 0, there's nothing to do.
850 *
851 * Unwritten extents are skipped over. Refcounted extents are CoWd.
852 */
853static int ocfs2_zero_extend_get_range(struct inode *inode,
854 struct buffer_head *di_bh,
855 u64 zero_start, u64 zero_end,
856 u64 *range_start, u64 *range_end)
791{ 857{
792 int ret = 0; 858 int rc = 0, needs_cow = 0;
793 u64 start_off; 859 u32 p_cpos, zero_clusters = 0;
794 struct super_block *sb = inode->i_sb; 860 u32 zero_cpos =
861 zero_start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
862 u32 last_cpos = ocfs2_clusters_for_bytes(inode->i_sb, zero_end);
863 unsigned int num_clusters = 0;
864 unsigned int ext_flags = 0;
795 865
796 start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode)); 866 while (zero_cpos < last_cpos) {
797 while (start_off < zero_to_size) { 867 rc = ocfs2_get_clusters(inode, zero_cpos, &p_cpos,
798 ret = ocfs2_write_zero_page(inode, start_off); 868 &num_clusters, &ext_flags);
799 if (ret < 0) { 869 if (rc) {
800 mlog_errno(ret); 870 mlog_errno(rc);
871 goto out;
872 }
873
874 if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
875 zero_clusters = num_clusters;
876 if (ext_flags & OCFS2_EXT_REFCOUNTED)
877 needs_cow = 1;
878 break;
879 }
880
881 zero_cpos += num_clusters;
882 }
883 if (!zero_clusters) {
884 *range_end = 0;
885 goto out;
886 }
887
888 while ((zero_cpos + zero_clusters) < last_cpos) {
889 rc = ocfs2_get_clusters(inode, zero_cpos + zero_clusters,
890 &p_cpos, &num_clusters,
891 &ext_flags);
892 if (rc) {
893 mlog_errno(rc);
801 goto out; 894 goto out;
802 } 895 }
803 896
804 start_off += sb->s_blocksize; 897 if (!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN))
898 break;
899 if (ext_flags & OCFS2_EXT_REFCOUNTED)
900 needs_cow = 1;
901 zero_clusters += num_clusters;
902 }
903 if ((zero_cpos + zero_clusters) > last_cpos)
904 zero_clusters = last_cpos - zero_cpos;
905
906 if (needs_cow) {
907 rc = ocfs2_refcount_cow(inode, di_bh, zero_cpos, zero_clusters,
908 UINT_MAX);
909 if (rc) {
910 mlog_errno(rc);
911 goto out;
912 }
913 }
914
915 *range_start = ocfs2_clusters_to_bytes(inode->i_sb, zero_cpos);
916 *range_end = ocfs2_clusters_to_bytes(inode->i_sb,
917 zero_cpos + zero_clusters);
918
919out:
920 return rc;
921}
922
923/*
924 * Zero one range returned from ocfs2_zero_extend_get_range(). The caller
925 * has made sure that the entire range needs zeroing.
926 */
927static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start,
928 u64 range_end)
929{
930 int rc = 0;
931 u64 next_pos;
932 u64 zero_pos = range_start;
933
934 mlog(0, "range_start = %llu, range_end = %llu\n",
935 (unsigned long long)range_start,
936 (unsigned long long)range_end);
937 BUG_ON(range_start >= range_end);
938
939 while (zero_pos < range_end) {
940 next_pos = (zero_pos & PAGE_CACHE_MASK) + PAGE_CACHE_SIZE;
941 if (next_pos > range_end)
942 next_pos = range_end;
943 rc = ocfs2_write_zero_page(inode, zero_pos, next_pos);
944 if (rc < 0) {
945 mlog_errno(rc);
946 break;
947 }
948 zero_pos = next_pos;
805 949
806 /* 950 /*
807 * Very large extends have the potential to lock up 951 * Very large extends have the potential to lock up
@@ -810,16 +954,63 @@ static int ocfs2_zero_extend(struct inode *inode,
810 cond_resched(); 954 cond_resched();
811 } 955 }
812 956
813out: 957 return rc;
958}
959
960int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
961 loff_t zero_to_size)
962{
963 int ret = 0;
964 u64 zero_start, range_start = 0, range_end = 0;
965 struct super_block *sb = inode->i_sb;
966
967 zero_start = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
968 mlog(0, "zero_start %llu for i_size %llu\n",
969 (unsigned long long)zero_start,
970 (unsigned long long)i_size_read(inode));
971 while (zero_start < zero_to_size) {
972 ret = ocfs2_zero_extend_get_range(inode, di_bh, zero_start,
973 zero_to_size,
974 &range_start,
975 &range_end);
976 if (ret) {
977 mlog_errno(ret);
978 break;
979 }
980 if (!range_end)
981 break;
982 /* Trim the ends */
983 if (range_start < zero_start)
984 range_start = zero_start;
985 if (range_end > zero_to_size)
986 range_end = zero_to_size;
987
988 ret = ocfs2_zero_extend_range(inode, range_start,
989 range_end);
990 if (ret) {
991 mlog_errno(ret);
992 break;
993 }
994 zero_start = range_end;
995 }
996
814 return ret; 997 return ret;
815} 998}
816 999
817int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, u64 zero_to) 1000int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh,
1001 u64 new_i_size, u64 zero_to)
818{ 1002{
819 int ret; 1003 int ret;
820 u32 clusters_to_add; 1004 u32 clusters_to_add;
821 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1005 struct ocfs2_inode_info *oi = OCFS2_I(inode);
822 1006
1007 /*
1008 * Only quota files call this without a bh, and they can't be
1009 * refcounted.
1010 */
1011 BUG_ON(!di_bh && (oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
1012 BUG_ON(!di_bh && !(oi->ip_flags & OCFS2_INODE_SYSTEM_FILE));
1013
823 clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size); 1014 clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size);
824 if (clusters_to_add < oi->ip_clusters) 1015 if (clusters_to_add < oi->ip_clusters)
825 clusters_to_add = 0; 1016 clusters_to_add = 0;
@@ -840,7 +1031,7 @@ int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, u64 zero_to)
840 * still need to zero the area between the old i_size and the 1031 * still need to zero the area between the old i_size and the
841 * new i_size. 1032 * new i_size.
842 */ 1033 */
843 ret = ocfs2_zero_extend(inode, zero_to); 1034 ret = ocfs2_zero_extend(inode, di_bh, zero_to);
844 if (ret < 0) 1035 if (ret < 0)
845 mlog_errno(ret); 1036 mlog_errno(ret);
846 1037
@@ -862,27 +1053,15 @@ static int ocfs2_extend_file(struct inode *inode,
862 goto out; 1053 goto out;
863 1054
864 if (i_size_read(inode) == new_i_size) 1055 if (i_size_read(inode) == new_i_size)
865 goto out; 1056 goto out;
866 BUG_ON(new_i_size < i_size_read(inode)); 1057 BUG_ON(new_i_size < i_size_read(inode));
867 1058
868 /* 1059 /*
869 * Fall through for converting inline data, even if the fs
870 * supports sparse files.
871 *
872 * The check for inline data here is legal - nobody can add
873 * the feature since we have i_mutex. We must check it again
874 * after acquiring ip_alloc_sem though, as paths like mmap
875 * might have raced us to converting the inode to extents.
876 */
877 if (!(oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
878 && ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
879 goto out_update_size;
880
881 /*
882 * The alloc sem blocks people in read/write from reading our 1060 * The alloc sem blocks people in read/write from reading our
883 * allocation until we're done changing it. We depend on 1061 * allocation until we're done changing it. We depend on
884 * i_mutex to block other extend/truncate calls while we're 1062 * i_mutex to block other extend/truncate calls while we're
885 * here. 1063 * here. We even have to hold it for sparse files because there
1064 * might be some tail zeroing.
886 */ 1065 */
887 down_write(&oi->ip_alloc_sem); 1066 down_write(&oi->ip_alloc_sem);
888 1067
@@ -899,14 +1078,16 @@ static int ocfs2_extend_file(struct inode *inode,
899 ret = ocfs2_convert_inline_data_to_extents(inode, di_bh); 1078 ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
900 if (ret) { 1079 if (ret) {
901 up_write(&oi->ip_alloc_sem); 1080 up_write(&oi->ip_alloc_sem);
902
903 mlog_errno(ret); 1081 mlog_errno(ret);
904 goto out; 1082 goto out;
905 } 1083 }
906 } 1084 }
907 1085
908 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) 1086 if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
909 ret = ocfs2_extend_no_holes(inode, new_i_size, new_i_size); 1087 ret = ocfs2_zero_extend(inode, di_bh, new_i_size);
1088 else
1089 ret = ocfs2_extend_no_holes(inode, di_bh, new_i_size,
1090 new_i_size);
910 1091
911 up_write(&oi->ip_alloc_sem); 1092 up_write(&oi->ip_alloc_sem);
912 1093
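The reworked extension path above only zeroes ranges that are actually allocated and written (ocfs2_zero_extend_get_range() skips unwritten extents and CoWs refcounted ones), and ocfs2_zero_extend_range() then walks each byte range one pagecache page at a time. The chunking is the usual round-up-to-the-next-page-boundary, clamp-to-the-end loop; a small stand-alone illustration in plain C, with 4 KiB standing in for PAGE_CACHE_SIZE:

#include <stdint.h>
#include <stdio.h>

#define DEMO_PAGE_SIZE	4096ULL
#define DEMO_PAGE_MASK	(~(DEMO_PAGE_SIZE - 1))

/* Visit [start, end) in page-sized steps, mirroring ocfs2_zero_extend_range(). */
static void demo_walk_range(uint64_t start, uint64_t end)
{
	uint64_t pos = start, next;

	while (pos < end) {
		next = (pos & DEMO_PAGE_MASK) + DEMO_PAGE_SIZE;
		if (next > end)
			next = end;
		printf("zero bytes [%llu, %llu)\n",
		       (unsigned long long)pos, (unsigned long long)next);
		pos = next;
	}
}

int main(void)
{
	demo_walk_range(5000, 13000);	/* [5000,8192) [8192,12288) [12288,13000) */
	return 0;
}

The first and last chunks can be partial pages, which is why ocfs2_write_zero_page() above now takes byte offsets (abs_from/abs_to) rather than a single page index.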
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index d66cf4f7c70e..97bf761c9e7c 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -54,8 +54,10 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb,
54int ocfs2_simple_size_update(struct inode *inode, 54int ocfs2_simple_size_update(struct inode *inode,
55 struct buffer_head *di_bh, 55 struct buffer_head *di_bh,
56 u64 new_i_size); 56 u64 new_i_size);
57int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, 57int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh,
58 u64 zero_to); 58 u64 new_i_size, u64 zero_to);
59int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
60 loff_t zero_to);
59int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); 61int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
60int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, 62int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
61 struct kstat *stat); 63 struct kstat *stat);
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 47878cf16418..625de9d7088c 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -472,7 +472,7 @@ static inline struct ocfs2_triggers *to_ocfs2_trigger(struct jbd2_buffer_trigger
472 return container_of(triggers, struct ocfs2_triggers, ot_triggers); 472 return container_of(triggers, struct ocfs2_triggers, ot_triggers);
473} 473}
474 474
475static void ocfs2_commit_trigger(struct jbd2_buffer_trigger_type *triggers, 475static void ocfs2_frozen_trigger(struct jbd2_buffer_trigger_type *triggers,
476 struct buffer_head *bh, 476 struct buffer_head *bh,
477 void *data, size_t size) 477 void *data, size_t size)
478{ 478{
@@ -491,7 +491,7 @@ static void ocfs2_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
491 * Quota blocks have their own trigger because the struct ocfs2_block_check 491 * Quota blocks have their own trigger because the struct ocfs2_block_check
492 * offset depends on the blocksize. 492 * offset depends on the blocksize.
493 */ 493 */
494static void ocfs2_dq_commit_trigger(struct jbd2_buffer_trigger_type *triggers, 494static void ocfs2_dq_frozen_trigger(struct jbd2_buffer_trigger_type *triggers,
495 struct buffer_head *bh, 495 struct buffer_head *bh,
496 void *data, size_t size) 496 void *data, size_t size)
497{ 497{
@@ -511,7 +511,7 @@ static void ocfs2_dq_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
511 * Directory blocks also have their own trigger because the 511 * Directory blocks also have their own trigger because the
512 * struct ocfs2_block_check offset depends on the blocksize. 512 * struct ocfs2_block_check offset depends on the blocksize.
513 */ 513 */
514static void ocfs2_db_commit_trigger(struct jbd2_buffer_trigger_type *triggers, 514static void ocfs2_db_frozen_trigger(struct jbd2_buffer_trigger_type *triggers,
515 struct buffer_head *bh, 515 struct buffer_head *bh,
516 void *data, size_t size) 516 void *data, size_t size)
517{ 517{
@@ -544,7 +544,7 @@ static void ocfs2_abort_trigger(struct jbd2_buffer_trigger_type *triggers,
544 544
545static struct ocfs2_triggers di_triggers = { 545static struct ocfs2_triggers di_triggers = {
546 .ot_triggers = { 546 .ot_triggers = {
547 .t_commit = ocfs2_commit_trigger, 547 .t_frozen = ocfs2_frozen_trigger,
548 .t_abort = ocfs2_abort_trigger, 548 .t_abort = ocfs2_abort_trigger,
549 }, 549 },
550 .ot_offset = offsetof(struct ocfs2_dinode, i_check), 550 .ot_offset = offsetof(struct ocfs2_dinode, i_check),
@@ -552,7 +552,7 @@ static struct ocfs2_triggers di_triggers = {
552 552
553static struct ocfs2_triggers eb_triggers = { 553static struct ocfs2_triggers eb_triggers = {
554 .ot_triggers = { 554 .ot_triggers = {
555 .t_commit = ocfs2_commit_trigger, 555 .t_frozen = ocfs2_frozen_trigger,
556 .t_abort = ocfs2_abort_trigger, 556 .t_abort = ocfs2_abort_trigger,
557 }, 557 },
558 .ot_offset = offsetof(struct ocfs2_extent_block, h_check), 558 .ot_offset = offsetof(struct ocfs2_extent_block, h_check),
@@ -560,7 +560,7 @@ static struct ocfs2_triggers eb_triggers = {
560 560
561static struct ocfs2_triggers rb_triggers = { 561static struct ocfs2_triggers rb_triggers = {
562 .ot_triggers = { 562 .ot_triggers = {
563 .t_commit = ocfs2_commit_trigger, 563 .t_frozen = ocfs2_frozen_trigger,
564 .t_abort = ocfs2_abort_trigger, 564 .t_abort = ocfs2_abort_trigger,
565 }, 565 },
566 .ot_offset = offsetof(struct ocfs2_refcount_block, rf_check), 566 .ot_offset = offsetof(struct ocfs2_refcount_block, rf_check),
@@ -568,7 +568,7 @@ static struct ocfs2_triggers rb_triggers = {
568 568
569static struct ocfs2_triggers gd_triggers = { 569static struct ocfs2_triggers gd_triggers = {
570 .ot_triggers = { 570 .ot_triggers = {
571 .t_commit = ocfs2_commit_trigger, 571 .t_frozen = ocfs2_frozen_trigger,
572 .t_abort = ocfs2_abort_trigger, 572 .t_abort = ocfs2_abort_trigger,
573 }, 573 },
574 .ot_offset = offsetof(struct ocfs2_group_desc, bg_check), 574 .ot_offset = offsetof(struct ocfs2_group_desc, bg_check),
@@ -576,14 +576,14 @@ static struct ocfs2_triggers gd_triggers = {
576 576
577static struct ocfs2_triggers db_triggers = { 577static struct ocfs2_triggers db_triggers = {
578 .ot_triggers = { 578 .ot_triggers = {
579 .t_commit = ocfs2_db_commit_trigger, 579 .t_frozen = ocfs2_db_frozen_trigger,
580 .t_abort = ocfs2_abort_trigger, 580 .t_abort = ocfs2_abort_trigger,
581 }, 581 },
582}; 582};
583 583
584static struct ocfs2_triggers xb_triggers = { 584static struct ocfs2_triggers xb_triggers = {
585 .ot_triggers = { 585 .ot_triggers = {
586 .t_commit = ocfs2_commit_trigger, 586 .t_frozen = ocfs2_frozen_trigger,
587 .t_abort = ocfs2_abort_trigger, 587 .t_abort = ocfs2_abort_trigger,
588 }, 588 },
589 .ot_offset = offsetof(struct ocfs2_xattr_block, xb_check), 589 .ot_offset = offsetof(struct ocfs2_xattr_block, xb_check),
@@ -591,14 +591,14 @@ static struct ocfs2_triggers xb_triggers = {
591 591
592static struct ocfs2_triggers dq_triggers = { 592static struct ocfs2_triggers dq_triggers = {
593 .ot_triggers = { 593 .ot_triggers = {
594 .t_commit = ocfs2_dq_commit_trigger, 594 .t_frozen = ocfs2_dq_frozen_trigger,
595 .t_abort = ocfs2_abort_trigger, 595 .t_abort = ocfs2_abort_trigger,
596 }, 596 },
597}; 597};
598 598
599static struct ocfs2_triggers dr_triggers = { 599static struct ocfs2_triggers dr_triggers = {
600 .ot_triggers = { 600 .ot_triggers = {
601 .t_commit = ocfs2_commit_trigger, 601 .t_frozen = ocfs2_frozen_trigger,
602 .t_abort = ocfs2_abort_trigger, 602 .t_abort = ocfs2_abort_trigger,
603 }, 603 },
604 .ot_offset = offsetof(struct ocfs2_dx_root_block, dr_check), 604 .ot_offset = offsetof(struct ocfs2_dx_root_block, dr_check),
@@ -606,7 +606,7 @@ static struct ocfs2_triggers dr_triggers = {
606 606
607static struct ocfs2_triggers dl_triggers = { 607static struct ocfs2_triggers dl_triggers = {
608 .ot_triggers = { 608 .ot_triggers = {
609 .t_commit = ocfs2_commit_trigger, 609 .t_frozen = ocfs2_frozen_trigger,
610 .t_abort = ocfs2_abort_trigger, 610 .t_abort = ocfs2_abort_trigger,
611 }, 611 },
612 .ot_offset = offsetof(struct ocfs2_dx_leaf, dl_check), 612 .ot_offset = offsetof(struct ocfs2_dx_leaf, dl_check),
@@ -1936,7 +1936,7 @@ void ocfs2_orphan_scan_work(struct work_struct *work)
1936 mutex_lock(&os->os_lock); 1936 mutex_lock(&os->os_lock);
1937 ocfs2_queue_orphan_scan(osb); 1937 ocfs2_queue_orphan_scan(osb);
1938 if (atomic_read(&os->os_state) == ORPHAN_SCAN_ACTIVE) 1938 if (atomic_read(&os->os_state) == ORPHAN_SCAN_ACTIVE)
1939 schedule_delayed_work(&os->os_orphan_scan_work, 1939 queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work,
1940 ocfs2_orphan_scan_timeout()); 1940 ocfs2_orphan_scan_timeout());
1941 mutex_unlock(&os->os_lock); 1941 mutex_unlock(&os->os_lock);
1942} 1942}
@@ -1976,8 +1976,8 @@ void ocfs2_orphan_scan_start(struct ocfs2_super *osb)
1976 atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE); 1976 atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE);
1977 else { 1977 else {
1978 atomic_set(&os->os_state, ORPHAN_SCAN_ACTIVE); 1978 atomic_set(&os->os_state, ORPHAN_SCAN_ACTIVE);
1979 schedule_delayed_work(&os->os_orphan_scan_work, 1979 queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work,
1980 ocfs2_orphan_scan_timeout()); 1980 ocfs2_orphan_scan_timeout());
1981 } 1981 }
1982} 1982}
1983 1983
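The orphan-scan hunks above move the delayed work from schedule_delayed_work(), i.e. the shared kernel-wide events queue, onto ocfs2's own workqueue, so the scan can be flushed and torn down together with the filesystem's other queued work. A hedged sketch of the same pattern with made-up names:

#include <linux/errno.h>
#include <linux/jiffies.h>
#include <linux/workqueue.h>

static struct workqueue_struct *demo_wq;
static struct delayed_work demo_scan_work;

static void demo_scan(struct work_struct *work)
{
	/* ... periodic scan ... */
	queue_delayed_work(demo_wq, &demo_scan_work, 60 * HZ);
}

static int demo_scan_start(void)
{
	demo_wq = create_singlethread_workqueue("demo_wq");
	if (!demo_wq)
		return -ENOMEM;
	INIT_DELAYED_WORK(&demo_scan_work, demo_scan);
	queue_delayed_work(demo_wq, &demo_scan_work, 60 * HZ);
	return 0;
}

static void demo_scan_stop(void)
{
	cancel_delayed_work_sync(&demo_scan_work);
	destroy_workqueue(demo_wq);	/* flushes anything still queued */
}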
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 3d7419682dc0..ec6adbf8f551 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -118,6 +118,7 @@ unsigned int ocfs2_la_default_mb(struct ocfs2_super *osb)
118{ 118{
119 unsigned int la_mb; 119 unsigned int la_mb;
120 unsigned int gd_mb; 120 unsigned int gd_mb;
121 unsigned int la_max_mb;
121 unsigned int megs_per_slot; 122 unsigned int megs_per_slot;
122 struct super_block *sb = osb->sb; 123 struct super_block *sb = osb->sb;
123 124
@@ -182,6 +183,12 @@ unsigned int ocfs2_la_default_mb(struct ocfs2_super *osb)
182 if (megs_per_slot < la_mb) 183 if (megs_per_slot < la_mb)
183 la_mb = megs_per_slot; 184 la_mb = megs_per_slot;
184 185
186 /* We can't store more bits than we can in a block. */
187 la_max_mb = ocfs2_clusters_to_megabytes(osb->sb,
188 ocfs2_local_alloc_size(sb) * 8);
189 if (la_mb > la_max_mb)
190 la_mb = la_max_mb;
191
185 return la_mb; 192 return la_mb;
186} 193}
187 194
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 2bb35fe00511..4607923eb24c 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -775,7 +775,7 @@ static int ocfs2_acquire_dquot(struct dquot *dquot)
775 * locking allocators ranks above a transaction start 775 * locking allocators ranks above a transaction start
776 */ 776 */
777 WARN_ON(journal_current_handle()); 777 WARN_ON(journal_current_handle());
778 status = ocfs2_extend_no_holes(gqinode, 778 status = ocfs2_extend_no_holes(gqinode, NULL,
779 gqinode->i_size + (need_alloc << sb->s_blocksize_bits), 779 gqinode->i_size + (need_alloc << sb->s_blocksize_bits),
780 gqinode->i_size); 780 gqinode->i_size);
781 if (status < 0) 781 if (status < 0)
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 8bd70d4d184d..dc78764ccc4c 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -971,7 +971,7 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
971 u64 p_blkno; 971 u64 p_blkno;
972 972
973 /* We are protected by dqio_sem so no locking needed */ 973 /* We are protected by dqio_sem so no locking needed */
974 status = ocfs2_extend_no_holes(lqinode, 974 status = ocfs2_extend_no_holes(lqinode, NULL,
975 lqinode->i_size + 2 * sb->s_blocksize, 975 lqinode->i_size + 2 * sb->s_blocksize,
976 lqinode->i_size); 976 lqinode->i_size);
977 if (status < 0) { 977 if (status < 0) {
@@ -1114,7 +1114,7 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
1114 return ocfs2_local_quota_add_chunk(sb, type, offset); 1114 return ocfs2_local_quota_add_chunk(sb, type, offset);
1115 1115
1116 /* We are protected by dqio_sem so no locking needed */ 1116 /* We are protected by dqio_sem so no locking needed */
1117 status = ocfs2_extend_no_holes(lqinode, 1117 status = ocfs2_extend_no_holes(lqinode, NULL,
1118 lqinode->i_size + sb->s_blocksize, 1118 lqinode->i_size + sb->s_blocksize,
1119 lqinode->i_size); 1119 lqinode->i_size);
1120 if (status < 0) { 1120 if (status < 0) {
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 4793f36f6518..3ac5aa733e9c 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -2931,6 +2931,12 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2931 2931
2932 offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits; 2932 offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
2933 end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits); 2933 end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits);
2934 /*
 2935 * We only duplicate pages until we reach the page that contains i_size - 1.
2936 * So trim 'end' to i_size.
2937 */
2938 if (end > i_size_read(context->inode))
2939 end = i_size_read(context->inode);
2934 2940
2935 while (offset < end) { 2941 while (offset < end) {
2936 page_index = offset >> PAGE_CACHE_SHIFT; 2942 page_index = offset >> PAGE_CACHE_SHIFT;
@@ -4166,6 +4172,12 @@ static int __ocfs2_reflink(struct dentry *old_dentry,
4166 struct inode *inode = old_dentry->d_inode; 4172 struct inode *inode = old_dentry->d_inode;
4167 struct buffer_head *new_bh = NULL; 4173 struct buffer_head *new_bh = NULL;
4168 4174
4175 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) {
4176 ret = -EINVAL;
4177 mlog_errno(ret);
4178 goto out;
4179 }
4180
4169 ret = filemap_fdatawrite(inode->i_mapping); 4181 ret = filemap_fdatawrite(inode->i_mapping);
4170 if (ret) { 4182 if (ret) {
4171 mlog_errno(ret); 4183 mlog_errno(ret);
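
The i_size trim added to ocfs2_duplicate_clusters_by_page() above avoids touching page cache pages past end of file. A simplified, self-contained model of the loop, with the page copy reduced to a printf and PAGE_CACHE_SHIFT hard-coded to 12:

    #include <stdio.h>

    #define PAGE_SHIFT 12

    /* Walk the pages covered by [offset, end), trimming end to i_size first,
     * the way the hunk above does before its page-copy loop. */
    static void walk_pages(unsigned long long offset, unsigned long long end,
                           unsigned long long i_size)
    {
            if (end > i_size)
                    end = i_size;

            while (offset < end) {
                    unsigned long long page_index = offset >> PAGE_SHIFT;

                    printf("copy page %llu\n", page_index);
                    offset = (page_index + 1) << PAGE_SHIFT;
            }
    }

    int main(void)
    {
            /* The cluster range runs to 64KB but i_size is 9000 bytes:
             * only pages 0, 1 and 2 need duplicating. */
            walk_pages(0, 65536, 9000);
            return 0;
    }
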
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index f4c2a9eb8c4d..a8e6a95a353f 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -741,7 +741,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
741 le16_to_cpu(bg->bg_free_bits_count)); 741 le16_to_cpu(bg->bg_free_bits_count));
742 le32_add_cpu(&cl->cl_recs[alloc_rec].c_total, 742 le32_add_cpu(&cl->cl_recs[alloc_rec].c_total,
743 le16_to_cpu(bg->bg_bits)); 743 le16_to_cpu(bg->bg_bits));
744 cl->cl_recs[alloc_rec].c_blkno = cpu_to_le64(bg->bg_blkno); 744 cl->cl_recs[alloc_rec].c_blkno = bg->bg_blkno;
745 if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count)) 745 if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count))
746 le16_add_cpu(&cl->cl_next_free_rec, 1); 746 le16_add_cpu(&cl->cl_next_free_rec, 1);
747 747
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index e97b34842cfe..d03469f61801 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -709,7 +709,7 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode,
709 struct ocfs2_xattr_value_buf *vb, 709 struct ocfs2_xattr_value_buf *vb,
710 struct ocfs2_xattr_set_ctxt *ctxt) 710 struct ocfs2_xattr_set_ctxt *ctxt)
711{ 711{
712 int status = 0; 712 int status = 0, credits;
713 handle_t *handle = ctxt->handle; 713 handle_t *handle = ctxt->handle;
714 enum ocfs2_alloc_restarted why; 714 enum ocfs2_alloc_restarted why;
715 u32 prev_clusters, logical_start = le32_to_cpu(vb->vb_xv->xr_clusters); 715 u32 prev_clusters, logical_start = le32_to_cpu(vb->vb_xv->xr_clusters);
@@ -719,38 +719,54 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode,
719 719
720 ocfs2_init_xattr_value_extent_tree(&et, INODE_CACHE(inode), vb); 720 ocfs2_init_xattr_value_extent_tree(&et, INODE_CACHE(inode), vb);
721 721
722 status = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh, 722 while (clusters_to_add) {
723 OCFS2_JOURNAL_ACCESS_WRITE); 723 status = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh,
724 if (status < 0) { 724 OCFS2_JOURNAL_ACCESS_WRITE);
725 mlog_errno(status); 725 if (status < 0) {
726 goto leave; 726 mlog_errno(status);
727 } 727 break;
728 }
728 729
729 prev_clusters = le32_to_cpu(vb->vb_xv->xr_clusters); 730 prev_clusters = le32_to_cpu(vb->vb_xv->xr_clusters);
730 status = ocfs2_add_clusters_in_btree(handle, 731 status = ocfs2_add_clusters_in_btree(handle,
731 &et, 732 &et,
732 &logical_start, 733 &logical_start,
733 clusters_to_add, 734 clusters_to_add,
734 0, 735 0,
735 ctxt->data_ac, 736 ctxt->data_ac,
736 ctxt->meta_ac, 737 ctxt->meta_ac,
737 &why); 738 &why);
738 if (status < 0) { 739 if ((status < 0) && (status != -EAGAIN)) {
739 mlog_errno(status); 740 if (status != -ENOSPC)
740 goto leave; 741 mlog_errno(status);
741 } 742 break;
743 }
742 744
743 ocfs2_journal_dirty(handle, vb->vb_bh); 745 ocfs2_journal_dirty(handle, vb->vb_bh);
744 746
745 clusters_to_add -= le32_to_cpu(vb->vb_xv->xr_clusters) - prev_clusters; 747 clusters_to_add -= le32_to_cpu(vb->vb_xv->xr_clusters) -
748 prev_clusters;
746 749
747 /* 750 if (why != RESTART_NONE && clusters_to_add) {
748 * We should have already allocated enough space before the transaction, 751 /*
749 * so no need to restart. 752 * We can only fail in case the alloc file doesn't give
750 */ 753 * up enough clusters.
751 BUG_ON(why != RESTART_NONE || clusters_to_add); 754 */
752 755 BUG_ON(why == RESTART_META);
753leave: 756
757 mlog(0, "restarting xattr value extension for %u"
758 " clusters,.\n", clusters_to_add);
759 credits = ocfs2_calc_extend_credits(inode->i_sb,
760 &vb->vb_xv->xr_list,
761 clusters_to_add);
762 status = ocfs2_extend_trans(handle, credits);
763 if (status < 0) {
764 status = -ENOMEM;
765 mlog_errno(status);
766 break;
767 }
768 }
769 }
754 770
755 return status; 771 return status;
756} 772}
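
The rewritten ocfs2_xattr_extend_allocation() above retries instead of asserting: when ocfs2_add_clusters_in_btree() signals a restart, the handle is grown by ocfs2_calc_extend_credits() worth of credits and the remaining clusters are requested again. A reduced model of that retry pattern, with the allocator and ocfs2_extend_trans() mocked out (the 4-clusters-per-pass limit is invented for the example):

    #include <stdio.h>

    enum restart { RESTART_NONE, RESTART_TRANS, RESTART_META };

    /* Mock allocator: hands out at most 4 clusters per transaction and asks
     * for a restart when more work remains. */
    static int add_clusters(unsigned int want, unsigned int *got,
                            enum restart *why)
    {
            *got = want > 4 ? 4 : want;
            *why = (*got < want) ? RESTART_TRANS : RESTART_NONE;
            return 0;
    }

    /* Mock journal extension; the real code calls ocfs2_extend_trans(). */
    static int extend_trans(unsigned int credits)
    {
            printf("extend transaction by %u credits\n", credits);
            return 0;
    }

    static int extend_allocation(unsigned int clusters_to_add)
    {
            int status = 0;

            while (clusters_to_add) {
                    unsigned int got;
                    enum restart why;

                    status = add_clusters(clusters_to_add, &got, &why);
                    if (status < 0)
                            break;

                    clusters_to_add -= got;

                    if (why != RESTART_NONE && clusters_to_add) {
                            /* Stand-in for ocfs2_calc_extend_credits(). */
                            status = extend_trans(clusters_to_add);
                            if (status < 0)
                                    break;
                    }
            }
            return status;
    }

    int main(void)
    {
            return extend_allocation(10);
    }
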
@@ -6788,16 +6804,15 @@ out:
6788 return ret; 6804 return ret;
6789} 6805}
6790 6806
6791static int ocfs2_reflink_xattr_buckets(handle_t *handle, 6807static int ocfs2_reflink_xattr_bucket(handle_t *handle,
6792 u64 blkno, u64 new_blkno, u32 clusters, 6808 u64 blkno, u64 new_blkno, u32 clusters,
6809 u32 *cpos, int num_buckets,
6793 struct ocfs2_alloc_context *meta_ac, 6810 struct ocfs2_alloc_context *meta_ac,
6794 struct ocfs2_alloc_context *data_ac, 6811 struct ocfs2_alloc_context *data_ac,
6795 struct ocfs2_reflink_xattr_tree_args *args) 6812 struct ocfs2_reflink_xattr_tree_args *args)
6796{ 6813{
6797 int i, j, ret = 0; 6814 int i, j, ret = 0;
6798 struct super_block *sb = args->reflink->old_inode->i_sb; 6815 struct super_block *sb = args->reflink->old_inode->i_sb;
6799 u32 bpc = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(sb));
6800 u32 num_buckets = clusters * bpc;
6801 int bpb = args->old_bucket->bu_blocks; 6816 int bpb = args->old_bucket->bu_blocks;
6802 struct ocfs2_xattr_value_buf vb = { 6817 struct ocfs2_xattr_value_buf vb = {
6803 .vb_access = ocfs2_journal_access, 6818 .vb_access = ocfs2_journal_access,
@@ -6816,14 +6831,6 @@ static int ocfs2_reflink_xattr_buckets(handle_t *handle,
6816 break; 6831 break;
6817 } 6832 }
6818 6833
6819 /*
6820 * The real bucket num in this series of blocks is stored
6821 * in the 1st bucket.
6822 */
6823 if (i == 0)
6824 num_buckets = le16_to_cpu(
6825 bucket_xh(args->old_bucket)->xh_num_buckets);
6826
6827 ret = ocfs2_xattr_bucket_journal_access(handle, 6834 ret = ocfs2_xattr_bucket_journal_access(handle,
6828 args->new_bucket, 6835 args->new_bucket,
6829 OCFS2_JOURNAL_ACCESS_CREATE); 6836 OCFS2_JOURNAL_ACCESS_CREATE);
@@ -6837,6 +6844,18 @@ static int ocfs2_reflink_xattr_buckets(handle_t *handle,
6837 bucket_block(args->old_bucket, j), 6844 bucket_block(args->old_bucket, j),
6838 sb->s_blocksize); 6845 sb->s_blocksize);
6839 6846
6847 /*
6848 * Record the start cpos so that we can use it to initialize
 6849 * our xattr tree; we also set xh_num_buckets for the new
6850 * bucket.
6851 */
6852 if (i == 0) {
6853 *cpos = le32_to_cpu(bucket_xh(args->new_bucket)->
6854 xh_entries[0].xe_name_hash);
6855 bucket_xh(args->new_bucket)->xh_num_buckets =
6856 cpu_to_le16(num_buckets);
6857 }
6858
6840 ocfs2_xattr_bucket_journal_dirty(handle, args->new_bucket); 6859 ocfs2_xattr_bucket_journal_dirty(handle, args->new_bucket);
6841 6860
6842 ret = ocfs2_reflink_xattr_header(handle, args->reflink, 6861 ret = ocfs2_reflink_xattr_header(handle, args->reflink,
@@ -6866,6 +6885,7 @@ static int ocfs2_reflink_xattr_buckets(handle_t *handle,
6866 } 6885 }
6867 6886
6868 ocfs2_xattr_bucket_journal_dirty(handle, args->new_bucket); 6887 ocfs2_xattr_bucket_journal_dirty(handle, args->new_bucket);
6888
6869 ocfs2_xattr_bucket_relse(args->old_bucket); 6889 ocfs2_xattr_bucket_relse(args->old_bucket);
6870 ocfs2_xattr_bucket_relse(args->new_bucket); 6890 ocfs2_xattr_bucket_relse(args->new_bucket);
6871 } 6891 }
@@ -6874,6 +6894,75 @@ static int ocfs2_reflink_xattr_buckets(handle_t *handle,
6874 ocfs2_xattr_bucket_relse(args->new_bucket); 6894 ocfs2_xattr_bucket_relse(args->new_bucket);
6875 return ret; 6895 return ret;
6876} 6896}
6897
6898static int ocfs2_reflink_xattr_buckets(handle_t *handle,
6899 struct inode *inode,
6900 struct ocfs2_reflink_xattr_tree_args *args,
6901 struct ocfs2_extent_tree *et,
6902 struct ocfs2_alloc_context *meta_ac,
6903 struct ocfs2_alloc_context *data_ac,
6904 u64 blkno, u32 cpos, u32 len)
6905{
6906 int ret, first_inserted = 0;
6907 u32 p_cluster, num_clusters, reflink_cpos = 0;
6908 u64 new_blkno;
6909 unsigned int num_buckets, reflink_buckets;
6910 unsigned int bpc =
6911 ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb));
6912
6913 ret = ocfs2_read_xattr_bucket(args->old_bucket, blkno);
6914 if (ret) {
6915 mlog_errno(ret);
6916 goto out;
6917 }
6918 num_buckets = le16_to_cpu(bucket_xh(args->old_bucket)->xh_num_buckets);
6919 ocfs2_xattr_bucket_relse(args->old_bucket);
6920
6921 while (len && num_buckets) {
6922 ret = ocfs2_claim_clusters(handle, data_ac,
6923 1, &p_cluster, &num_clusters);
6924 if (ret) {
6925 mlog_errno(ret);
6926 goto out;
6927 }
6928
6929 new_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
6930 reflink_buckets = min(num_buckets, bpc * num_clusters);
6931
6932 ret = ocfs2_reflink_xattr_bucket(handle, blkno,
6933 new_blkno, num_clusters,
6934 &reflink_cpos, reflink_buckets,
6935 meta_ac, data_ac, args);
6936 if (ret) {
6937 mlog_errno(ret);
6938 goto out;
6939 }
6940
6941 /*
6942 * For the 1st allocated cluster, we make it use the same cpos
6943 * so that the xattr tree looks the same as the original one
 6944 * in most cases.
6945 */
6946 if (!first_inserted) {
6947 reflink_cpos = cpos;
6948 first_inserted = 1;
6949 }
6950 ret = ocfs2_insert_extent(handle, et, reflink_cpos, new_blkno,
6951 num_clusters, 0, meta_ac);
6952 if (ret)
6953 mlog_errno(ret);
6954
6955 mlog(0, "insert new xattr extent rec start %llu len %u to %u\n",
6956 (unsigned long long)new_blkno, num_clusters, reflink_cpos);
6957
6958 len -= num_clusters;
6959 blkno += ocfs2_clusters_to_blocks(inode->i_sb, num_clusters);
6960 num_buckets -= reflink_buckets;
6961 }
6962out:
6963 return ret;
6964}
6965
6877/* 6966/*
6878 * Create the same xattr extent record in the new inode's xattr tree. 6967 * Create the same xattr extent record in the new inode's xattr tree.
6879 */ 6968 */
@@ -6885,8 +6974,6 @@ static int ocfs2_reflink_xattr_rec(struct inode *inode,
6885 void *para) 6974 void *para)
6886{ 6975{
6887 int ret, credits = 0; 6976 int ret, credits = 0;
6888 u32 p_cluster, num_clusters;
6889 u64 new_blkno;
6890 handle_t *handle; 6977 handle_t *handle;
6891 struct ocfs2_reflink_xattr_tree_args *args = 6978 struct ocfs2_reflink_xattr_tree_args *args =
6892 (struct ocfs2_reflink_xattr_tree_args *)para; 6979 (struct ocfs2_reflink_xattr_tree_args *)para;
@@ -6895,6 +6982,9 @@ static int ocfs2_reflink_xattr_rec(struct inode *inode,
6895 struct ocfs2_alloc_context *data_ac = NULL; 6982 struct ocfs2_alloc_context *data_ac = NULL;
6896 struct ocfs2_extent_tree et; 6983 struct ocfs2_extent_tree et;
6897 6984
6985 mlog(0, "reflink xattr buckets %llu len %u\n",
6986 (unsigned long long)blkno, len);
6987
6898 ocfs2_init_xattr_tree_extent_tree(&et, 6988 ocfs2_init_xattr_tree_extent_tree(&et,
6899 INODE_CACHE(args->reflink->new_inode), 6989 INODE_CACHE(args->reflink->new_inode),
6900 args->new_blk_bh); 6990 args->new_blk_bh);
@@ -6914,32 +7004,12 @@ static int ocfs2_reflink_xattr_rec(struct inode *inode,
6914 goto out; 7004 goto out;
6915 } 7005 }
6916 7006
6917 ret = ocfs2_claim_clusters(handle, data_ac, 7007 ret = ocfs2_reflink_xattr_buckets(handle, inode, args, &et,
6918 len, &p_cluster, &num_clusters); 7008 meta_ac, data_ac,
6919 if (ret) { 7009 blkno, cpos, len);
6920 mlog_errno(ret);
6921 goto out_commit;
6922 }
6923
6924 new_blkno = ocfs2_clusters_to_blocks(osb->sb, p_cluster);
6925
6926 mlog(0, "reflink xattr buckets %llu to %llu, len %u\n",
6927 (unsigned long long)blkno, (unsigned long long)new_blkno, len);
6928 ret = ocfs2_reflink_xattr_buckets(handle, blkno, new_blkno, len,
6929 meta_ac, data_ac, args);
6930 if (ret) {
6931 mlog_errno(ret);
6932 goto out_commit;
6933 }
6934
6935 mlog(0, "insert new xattr extent rec start %llu len %u to %u\n",
6936 (unsigned long long)new_blkno, len, cpos);
6937 ret = ocfs2_insert_extent(handle, &et, cpos, new_blkno,
6938 len, 0, meta_ac);
6939 if (ret) 7010 if (ret)
6940 mlog_errno(ret); 7011 mlog_errno(ret);
6941 7012
6942out_commit:
6943 ocfs2_commit_trans(osb, handle); 7013 ocfs2_commit_trans(osb, handle);
6944 7014
6945out: 7015out:
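
Taken together, the xattr.c changes replace one large cluster claim per extent record with ocfs2_reflink_xattr_buckets(), which claims clusters a little at a time, copies at most buckets-per-cluster * claimed-clusters buckets per pass, and inserts one extent record per pass. A rough standalone model of that chunking; allocation and bucket copying are mocked, and the real code also tracks cpos and block numbers:

    #include <stdio.h>

    /* Mock cluster allocator: always returns a single cluster. */
    static unsigned int claim_clusters(unsigned int wanted)
    {
            (void)wanted;
            return 1;
    }

    static void reflink_buckets(unsigned int len, unsigned int num_buckets,
                                unsigned int buckets_per_cluster)
    {
            while (len && num_buckets) {
                    unsigned int got = claim_clusters(1);
                    unsigned int chunk = buckets_per_cluster * got;

                    if (chunk > num_buckets)
                            chunk = num_buckets;

                    printf("copy %u buckets into %u new cluster(s)\n",
                           chunk, got);

                    len -= got;
                    num_buckets -= chunk;
            }
    }

    int main(void)
    {
            /* 3 clusters of xattr buckets, 4 buckets per cluster, 10 in use. */
            reflink_buckets(3, 10, 4);
            return 0;
    }
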
diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c
index 3e73de5967ff..fc8497643fd0 100644
--- a/fs/partitions/ibm.c
+++ b/fs/partitions/ibm.c
@@ -74,6 +74,7 @@ int ibm_partition(struct parsed_partitions *state)
74 } *label; 74 } *label;
75 unsigned char *data; 75 unsigned char *data;
76 Sector sect; 76 Sector sect;
77 sector_t labelsect;
77 78
78 res = 0; 79 res = 0;
79 blocksize = bdev_logical_block_size(bdev); 80 blocksize = bdev_logical_block_size(bdev);
@@ -98,10 +99,19 @@ int ibm_partition(struct parsed_partitions *state)
98 goto out_freeall; 99 goto out_freeall;
99 100
100 /* 101 /*
102 * Special case for FBA disks: label sector does not depend on
103 * blocksize.
104 */
105 if ((info->cu_type == 0x6310 && info->dev_type == 0x9336) ||
106 (info->cu_type == 0x3880 && info->dev_type == 0x3370))
107 labelsect = info->label_block;
108 else
109 labelsect = info->label_block * (blocksize >> 9);
110
111 /*
101 * Get volume label, extract name and type. 112 * Get volume label, extract name and type.
102 */ 113 */
103 data = read_part_sector(state, info->label_block*(blocksize/512), 114 data = read_part_sector(state, labelsect, &sect);
104 &sect);
105 if (data == NULL) 115 if (data == NULL)
106 goto out_readerr; 116 goto out_readerr;
107 117
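
The ibm.c hunk stops scaling the label block by the filesystem block size on FBA devices (cu_type/dev_type pairs 0x6310/0x9336 and 0x3880/0x3370), whose label sector is fixed. The sector arithmetic, sketched standalone:

    #include <stdio.h>

    typedef unsigned long long sector_t;

    /* Sketch of the label sector computation from the hunk above. */
    static sector_t label_sector(int fba_disk, sector_t label_block,
                                 unsigned int blocksize)
    {
            /* FBA: label sector does not depend on blocksize. */
            if (fba_disk)
                    return label_block;
            /* ECKD: scale from filesystem blocks to 512-byte sectors. */
            return label_block * (blocksize >> 9);
    }

    int main(void)
    {
            printf("ECKD, 4KB blocks: sector %llu\n", label_sector(0, 2, 4096));
            printf("FBA:              sector %llu\n", label_sector(1, 2, 4096));
            return 0;
    }
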
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 12c233da1b6b..437d2ca2de97 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -676,7 +676,7 @@ static void prune_dqcache(int count)
676 * This is called from kswapd when we think we need some 676 * This is called from kswapd when we think we need some
677 * more memory 677 * more memory
678 */ 678 */
679static int shrink_dqcache_memory(int nr, gfp_t gfp_mask) 679static int shrink_dqcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
680{ 680{
681 if (nr) { 681 if (nr) {
682 spin_lock(&dq_list_lock); 682 spin_lock(&dq_list_lock);
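
This hunk, like the ubifs and xfs ones below, adapts to the shrinker API change picked up in this merge: the callback now receives the registered struct shrinker * as its first argument. A simplified model of the new callback shape; the struct definition here is a stand-in, not the kernel's:

    typedef unsigned int gfp_t;

    /* Simplified model of the updated shrinker API: the callback takes a
     * pointer to the registered struct shrinker. */
    struct shrinker {
            int (*shrink)(struct shrinker *, int nr_to_scan, gfp_t gfp_mask);
            int seeks;
    };

    static int shrink_cache(struct shrinker *shrink, int nr_to_scan,
                            gfp_t gfp_mask)
    {
            (void)shrink;           /* a single global cache can ignore it */
            (void)gfp_mask;
            if (nr_to_scan) {
                    /* ... prune nr_to_scan entries ... */
            }
            return 0;               /* remaining freeable objects */
    }

    static struct shrinker cache_shrinker = {
            .shrink = shrink_cache,
            .seeks  = 2,
    };

    int main(void)
    {
            return cache_shrinker.shrink(&cache_shrinker, 0, 0);
    }

Callbacks backing a single global cache, such as shrink_dqcache_memory() here, simply ignore the new argument; per-instance shrinkers use it to find their context, as the xfs_sync.c change further down does.
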
diff --git a/fs/splice.c b/fs/splice.c
index 740e6b9faf7a..efdbfece9932 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1282,7 +1282,8 @@ static int direct_splice_actor(struct pipe_inode_info *pipe,
1282{ 1282{
1283 struct file *file = sd->u.file; 1283 struct file *file = sd->u.file;
1284 1284
1285 return do_splice_from(pipe, file, &sd->pos, sd->total_len, sd->flags); 1285 return do_splice_from(pipe, file, &file->f_pos, sd->total_len,
1286 sd->flags);
1286} 1287}
1287 1288
1288/** 1289/**
@@ -1371,8 +1372,7 @@ static long do_splice(struct file *in, loff_t __user *off_in,
1371 if (off_in) 1372 if (off_in)
1372 return -ESPIPE; 1373 return -ESPIPE;
1373 if (off_out) { 1374 if (off_out) {
1374 if (!out->f_op || !out->f_op->llseek || 1375 if (!(out->f_mode & FMODE_PWRITE))
1375 out->f_op->llseek == no_llseek)
1376 return -EINVAL; 1376 return -EINVAL;
1377 if (copy_from_user(&offset, off_out, sizeof(loff_t))) 1377 if (copy_from_user(&offset, off_out, sizeof(loff_t)))
1378 return -EFAULT; 1378 return -EFAULT;
@@ -1392,8 +1392,7 @@ static long do_splice(struct file *in, loff_t __user *off_in,
1392 if (off_out) 1392 if (off_out)
1393 return -ESPIPE; 1393 return -ESPIPE;
1394 if (off_in) { 1394 if (off_in) {
1395 if (!in->f_op || !in->f_op->llseek || 1395 if (!(in->f_mode & FMODE_PREAD))
1396 in->f_op->llseek == no_llseek)
1397 return -EINVAL; 1396 return -EINVAL;
1398 if (copy_from_user(&offset, off_in, sizeof(loff_t))) 1397 if (copy_from_user(&offset, off_in, sizeof(loff_t)))
1399 return -EFAULT; 1398 return -EFAULT;
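
The do_splice() hunks above replace the indirect llseek test with a direct capability check: splicing at an explicit offset needs FMODE_PWRITE on the output file or FMODE_PREAD on the input file. A standalone sketch of that check; the flag values are made up for the example, the kernel defines its own:

    #include <stdio.h>

    /* Hypothetical flag values for the sketch only. */
    #define FMODE_PREAD   0x1u
    #define FMODE_PWRITE  0x2u

    static int can_splice_at_offset(unsigned int f_mode, int writing)
    {
            unsigned int needed = writing ? FMODE_PWRITE : FMODE_PREAD;

            return (f_mode & needed) ? 0 : -22 /* -EINVAL */;
    }

    int main(void)
    {
            printf("read-at-offset on PREAD file:  %d\n",
                   can_splice_at_offset(FMODE_PREAD, 0));
            printf("write-at-offset on PREAD file: %d\n",
                   can_splice_at_offset(FMODE_PREAD, 1));
            return 0;
    }
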
diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c
index 02feb59cefca..0b201114a5ad 100644
--- a/fs/ubifs/shrinker.c
+++ b/fs/ubifs/shrinker.c
@@ -277,7 +277,7 @@ static int kick_a_thread(void)
277 return 0; 277 return 0;
278} 278}
279 279
280int ubifs_shrinker(int nr, gfp_t gfp_mask) 280int ubifs_shrinker(struct shrinker *shrink, int nr, gfp_t gfp_mask)
281{ 281{
282 int freed, contention = 0; 282 int freed, contention = 0;
283 long clean_zn_cnt = atomic_long_read(&ubifs_clean_zn_cnt); 283 long clean_zn_cnt = atomic_long_read(&ubifs_clean_zn_cnt);
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 2eef553d50c8..04310878f449 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1575,7 +1575,7 @@ int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot);
1575int ubifs_tnc_end_commit(struct ubifs_info *c); 1575int ubifs_tnc_end_commit(struct ubifs_info *c);
1576 1576
1577/* shrinker.c */ 1577/* shrinker.c */
1578int ubifs_shrinker(int nr_to_scan, gfp_t gfp_mask); 1578int ubifs_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask);
1579 1579
1580/* commit.c */ 1580/* commit.c */
1581int ubifs_bg_thread(void *info); 1581int ubifs_bg_thread(void *info);
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 649ade8ef598..2ee3f7a60163 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -45,7 +45,7 @@
45 45
46static kmem_zone_t *xfs_buf_zone; 46static kmem_zone_t *xfs_buf_zone;
47STATIC int xfsbufd(void *); 47STATIC int xfsbufd(void *);
48STATIC int xfsbufd_wakeup(int, gfp_t); 48STATIC int xfsbufd_wakeup(struct shrinker *, int, gfp_t);
49STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int); 49STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int);
50static struct shrinker xfs_buf_shake = { 50static struct shrinker xfs_buf_shake = {
51 .shrink = xfsbufd_wakeup, 51 .shrink = xfsbufd_wakeup,
@@ -340,7 +340,7 @@ _xfs_buf_lookup_pages(
340 __func__, gfp_mask); 340 __func__, gfp_mask);
341 341
342 XFS_STATS_INC(xb_page_retries); 342 XFS_STATS_INC(xb_page_retries);
343 xfsbufd_wakeup(0, gfp_mask); 343 xfsbufd_wakeup(NULL, 0, gfp_mask);
344 congestion_wait(BLK_RW_ASYNC, HZ/50); 344 congestion_wait(BLK_RW_ASYNC, HZ/50);
345 goto retry; 345 goto retry;
346 } 346 }
@@ -1762,6 +1762,7 @@ xfs_buf_runall_queues(
1762 1762
1763STATIC int 1763STATIC int
1764xfsbufd_wakeup( 1764xfsbufd_wakeup(
1765 struct shrinker *shrink,
1765 int priority, 1766 int priority,
1766 gfp_t mask) 1767 gfp_t mask)
1767{ 1768{
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index f2d1718c9165..80938c736c27 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1883,7 +1883,6 @@ init_xfs_fs(void)
1883 goto out_cleanup_procfs; 1883 goto out_cleanup_procfs;
1884 1884
1885 vfs_initquota(); 1885 vfs_initquota();
1886 xfs_inode_shrinker_init();
1887 1886
1888 error = register_filesystem(&xfs_fs_type); 1887 error = register_filesystem(&xfs_fs_type);
1889 if (error) 1888 if (error)
@@ -1911,7 +1910,6 @@ exit_xfs_fs(void)
1911{ 1910{
1912 vfs_exitquota(); 1911 vfs_exitquota();
1913 unregister_filesystem(&xfs_fs_type); 1912 unregister_filesystem(&xfs_fs_type);
1914 xfs_inode_shrinker_destroy();
1915 xfs_sysctl_unregister(); 1913 xfs_sysctl_unregister();
1916 xfs_cleanup_procfs(); 1914 xfs_cleanup_procfs();
1917 xfs_buf_terminate(); 1915 xfs_buf_terminate();
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index ef7f0218bccb..a51a07c3a70c 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -144,6 +144,41 @@ restart:
144 return last_error; 144 return last_error;
145} 145}
146 146
147/*
148 * Select the next per-ag structure to iterate during the walk. The reclaim
149 * walk is optimised only to walk AGs with reclaimable inodes in them.
150 */
151static struct xfs_perag *
152xfs_inode_ag_iter_next_pag(
153 struct xfs_mount *mp,
154 xfs_agnumber_t *first,
155 int tag)
156{
157 struct xfs_perag *pag = NULL;
158
159 if (tag == XFS_ICI_RECLAIM_TAG) {
160 int found;
161 int ref;
162
163 spin_lock(&mp->m_perag_lock);
164 found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
165 (void **)&pag, *first, 1, tag);
166 if (found <= 0) {
167 spin_unlock(&mp->m_perag_lock);
168 return NULL;
169 }
170 *first = pag->pag_agno + 1;
171 /* open coded pag reference increment */
172 ref = atomic_inc_return(&pag->pag_ref);
173 spin_unlock(&mp->m_perag_lock);
174 trace_xfs_perag_get_reclaim(mp, pag->pag_agno, ref, _RET_IP_);
175 } else {
176 pag = xfs_perag_get(mp, *first);
177 (*first)++;
178 }
179 return pag;
180}
181
147int 182int
148xfs_inode_ag_iterator( 183xfs_inode_ag_iterator(
149 struct xfs_mount *mp, 184 struct xfs_mount *mp,
@@ -154,16 +189,15 @@ xfs_inode_ag_iterator(
154 int exclusive, 189 int exclusive,
155 int *nr_to_scan) 190 int *nr_to_scan)
156{ 191{
192 struct xfs_perag *pag;
157 int error = 0; 193 int error = 0;
158 int last_error = 0; 194 int last_error = 0;
159 xfs_agnumber_t ag; 195 xfs_agnumber_t ag;
160 int nr; 196 int nr;
161 197
162 nr = nr_to_scan ? *nr_to_scan : INT_MAX; 198 nr = nr_to_scan ? *nr_to_scan : INT_MAX;
163 for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) { 199 ag = 0;
164 struct xfs_perag *pag; 200 while ((pag = xfs_inode_ag_iter_next_pag(mp, &ag, tag))) {
165
166 pag = xfs_perag_get(mp, ag);
167 error = xfs_inode_ag_walk(mp, pag, execute, flags, tag, 201 error = xfs_inode_ag_walk(mp, pag, execute, flags, tag,
168 exclusive, &nr); 202 exclusive, &nr);
169 xfs_perag_put(pag); 203 xfs_perag_put(pag);
@@ -640,6 +674,17 @@ __xfs_inode_set_reclaim_tag(
640 radix_tree_tag_set(&pag->pag_ici_root, 674 radix_tree_tag_set(&pag->pag_ici_root,
641 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), 675 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
642 XFS_ICI_RECLAIM_TAG); 676 XFS_ICI_RECLAIM_TAG);
677
678 if (!pag->pag_ici_reclaimable) {
679 /* propagate the reclaim tag up into the perag radix tree */
680 spin_lock(&ip->i_mount->m_perag_lock);
681 radix_tree_tag_set(&ip->i_mount->m_perag_tree,
682 XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
683 XFS_ICI_RECLAIM_TAG);
684 spin_unlock(&ip->i_mount->m_perag_lock);
685 trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno,
686 -1, _RET_IP_);
687 }
643 pag->pag_ici_reclaimable++; 688 pag->pag_ici_reclaimable++;
644} 689}
645 690
@@ -674,6 +719,16 @@ __xfs_inode_clear_reclaim_tag(
674 radix_tree_tag_clear(&pag->pag_ici_root, 719 radix_tree_tag_clear(&pag->pag_ici_root,
675 XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); 720 XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
676 pag->pag_ici_reclaimable--; 721 pag->pag_ici_reclaimable--;
722 if (!pag->pag_ici_reclaimable) {
723 /* clear the reclaim tag from the perag radix tree */
724 spin_lock(&ip->i_mount->m_perag_lock);
725 radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
726 XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
727 XFS_ICI_RECLAIM_TAG);
728 spin_unlock(&ip->i_mount->m_perag_lock);
729 trace_xfs_perag_clear_reclaim(ip->i_mount, pag->pag_agno,
730 -1, _RET_IP_);
731 }
677} 732}
678 733
679/* 734/*
@@ -828,83 +883,52 @@ xfs_reclaim_inodes(
828 883
829/* 884/*
830 * Shrinker infrastructure. 885 * Shrinker infrastructure.
831 *
832 * This is all far more complex than it needs to be. It adds a global list of
833 * mounts because the shrinkers can only call a global context. We need to make
834 * the shrinkers pass a context to avoid the need for global state.
835 */ 886 */
836static LIST_HEAD(xfs_mount_list);
837static struct rw_semaphore xfs_mount_list_lock;
838
839static int 887static int
840xfs_reclaim_inode_shrink( 888xfs_reclaim_inode_shrink(
889 struct shrinker *shrink,
841 int nr_to_scan, 890 int nr_to_scan,
842 gfp_t gfp_mask) 891 gfp_t gfp_mask)
843{ 892{
844 struct xfs_mount *mp; 893 struct xfs_mount *mp;
845 struct xfs_perag *pag; 894 struct xfs_perag *pag;
846 xfs_agnumber_t ag; 895 xfs_agnumber_t ag;
847 int reclaimable = 0; 896 int reclaimable;
848 897
898 mp = container_of(shrink, struct xfs_mount, m_inode_shrink);
849 if (nr_to_scan) { 899 if (nr_to_scan) {
850 if (!(gfp_mask & __GFP_FS)) 900 if (!(gfp_mask & __GFP_FS))
851 return -1; 901 return -1;
852 902
853 down_read(&xfs_mount_list_lock); 903 xfs_inode_ag_iterator(mp, xfs_reclaim_inode, 0,
854 list_for_each_entry(mp, &xfs_mount_list, m_mplist) {
855 xfs_inode_ag_iterator(mp, xfs_reclaim_inode, 0,
856 XFS_ICI_RECLAIM_TAG, 1, &nr_to_scan); 904 XFS_ICI_RECLAIM_TAG, 1, &nr_to_scan);
857 if (nr_to_scan <= 0) 905 /* if we don't exhaust the scan, don't bother coming back */
858 break; 906 if (nr_to_scan > 0)
859 } 907 return -1;
860 up_read(&xfs_mount_list_lock); 908 }
861 }
862 909
863 down_read(&xfs_mount_list_lock); 910 reclaimable = 0;
864 list_for_each_entry(mp, &xfs_mount_list, m_mplist) { 911 ag = 0;
865 for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) { 912 while ((pag = xfs_inode_ag_iter_next_pag(mp, &ag,
866 pag = xfs_perag_get(mp, ag); 913 XFS_ICI_RECLAIM_TAG))) {
867 reclaimable += pag->pag_ici_reclaimable; 914 reclaimable += pag->pag_ici_reclaimable;
868 xfs_perag_put(pag); 915 xfs_perag_put(pag);
869 }
870 } 916 }
871 up_read(&xfs_mount_list_lock);
872 return reclaimable; 917 return reclaimable;
873} 918}
874 919
875static struct shrinker xfs_inode_shrinker = {
876 .shrink = xfs_reclaim_inode_shrink,
877 .seeks = DEFAULT_SEEKS,
878};
879
880void __init
881xfs_inode_shrinker_init(void)
882{
883 init_rwsem(&xfs_mount_list_lock);
884 register_shrinker(&xfs_inode_shrinker);
885}
886
887void
888xfs_inode_shrinker_destroy(void)
889{
890 ASSERT(list_empty(&xfs_mount_list));
891 unregister_shrinker(&xfs_inode_shrinker);
892}
893
894void 920void
895xfs_inode_shrinker_register( 921xfs_inode_shrinker_register(
896 struct xfs_mount *mp) 922 struct xfs_mount *mp)
897{ 923{
898 down_write(&xfs_mount_list_lock); 924 mp->m_inode_shrink.shrink = xfs_reclaim_inode_shrink;
899 list_add_tail(&mp->m_mplist, &xfs_mount_list); 925 mp->m_inode_shrink.seeks = DEFAULT_SEEKS;
900 up_write(&xfs_mount_list_lock); 926 register_shrinker(&mp->m_inode_shrink);
901} 927}
902 928
903void 929void
904xfs_inode_shrinker_unregister( 930xfs_inode_shrinker_unregister(
905 struct xfs_mount *mp) 931 struct xfs_mount *mp)
906{ 932{
907 down_write(&xfs_mount_list_lock); 933 unregister_shrinker(&mp->m_inode_shrink);
908 list_del(&mp->m_mplist);
909 up_write(&xfs_mount_list_lock);
910} 934}
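
With the global xfs_mount_list gone, each mount now embeds its own shrinker (m_inode_shrink in struct xfs_mount) and the callback recovers its mount via container_of(). A reduced userspace model of that embedding, with struct mount standing in for xfs_mount:

    #include <stddef.h>
    #include <stdio.h>

    #define container_of(ptr, type, member) \
            ((type *)((char *)(ptr) - offsetof(type, member)))

    struct shrinker {
            int (*shrink)(struct shrinker *, int, unsigned int);
    };

    /* Reduced stand-in for struct xfs_mount with an embedded shrinker. */
    struct mount {
            const char *name;
            int reclaimable;
            struct shrinker inode_shrink;
    };

    static int reclaim_inode_shrink(struct shrinker *shrink, int nr,
                                    unsigned int gfp)
    {
            struct mount *mp = container_of(shrink, struct mount, inode_shrink);

            (void)nr;
            (void)gfp;
            printf("shrinking %s\n", mp->name);
            return mp->reclaimable;
    }

    int main(void)
    {
            struct mount mp = { .name = "xfs0", .reclaimable = 42 };

            /* register_shrinker(&mp.inode_shrink) equivalent: wire it up. */
            mp.inode_shrink.shrink = reclaim_inode_shrink;
            printf("%d reclaimable\n",
                   mp.inode_shrink.shrink(&mp.inode_shrink, 0, 0));
            return 0;
    }
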
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index cdcbaaca9880..e28139aaa4aa 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -55,8 +55,6 @@ int xfs_inode_ag_iterator(struct xfs_mount *mp,
55 int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), 55 int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags),
56 int flags, int tag, int write_lock, int *nr_to_scan); 56 int flags, int tag, int write_lock, int *nr_to_scan);
57 57
58void xfs_inode_shrinker_init(void);
59void xfs_inode_shrinker_destroy(void);
60void xfs_inode_shrinker_register(struct xfs_mount *mp); 58void xfs_inode_shrinker_register(struct xfs_mount *mp);
61void xfs_inode_shrinker_unregister(struct xfs_mount *mp); 59void xfs_inode_shrinker_unregister(struct xfs_mount *mp);
62 60
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index 73d5aa117384..302820690904 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -124,7 +124,10 @@ DEFINE_EVENT(xfs_perag_class, name, \
124 unsigned long caller_ip), \ 124 unsigned long caller_ip), \
125 TP_ARGS(mp, agno, refcount, caller_ip)) 125 TP_ARGS(mp, agno, refcount, caller_ip))
126DEFINE_PERAG_REF_EVENT(xfs_perag_get); 126DEFINE_PERAG_REF_EVENT(xfs_perag_get);
127DEFINE_PERAG_REF_EVENT(xfs_perag_get_reclaim);
127DEFINE_PERAG_REF_EVENT(xfs_perag_put); 128DEFINE_PERAG_REF_EVENT(xfs_perag_put);
129DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim);
130DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim);
128 131
129TRACE_EVENT(xfs_attr_list_node_descend, 132TRACE_EVENT(xfs_attr_list_node_descend,
130 TP_PROTO(struct xfs_attr_list_context *ctx, 133 TP_PROTO(struct xfs_attr_list_context *ctx,
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 8c117ff2e3ab..67c018392d62 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -69,7 +69,7 @@ STATIC void xfs_qm_list_destroy(xfs_dqlist_t *);
69 69
70STATIC int xfs_qm_init_quotainos(xfs_mount_t *); 70STATIC int xfs_qm_init_quotainos(xfs_mount_t *);
71STATIC int xfs_qm_init_quotainfo(xfs_mount_t *); 71STATIC int xfs_qm_init_quotainfo(xfs_mount_t *);
72STATIC int xfs_qm_shake(int, gfp_t); 72STATIC int xfs_qm_shake(struct shrinker *, int, gfp_t);
73 73
74static struct shrinker xfs_qm_shaker = { 74static struct shrinker xfs_qm_shaker = {
75 .shrink = xfs_qm_shake, 75 .shrink = xfs_qm_shake,
@@ -2117,7 +2117,10 @@ xfs_qm_shake_freelist(
2117 */ 2117 */
2118/* ARGSUSED */ 2118/* ARGSUSED */
2119STATIC int 2119STATIC int
2120xfs_qm_shake(int nr_to_scan, gfp_t gfp_mask) 2120xfs_qm_shake(
2121 struct shrinker *shrink,
2122 int nr_to_scan,
2123 gfp_t gfp_mask)
2121{ 2124{
2122 int ndqused, nfree, n; 2125 int ndqused, nfree, n;
2123 2126
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 1d2c7eed4eda..5761087ee8ea 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -259,7 +259,7 @@ typedef struct xfs_mount {
259 wait_queue_head_t m_wait_single_sync_task; 259 wait_queue_head_t m_wait_single_sync_task;
260 __int64_t m_update_flags; /* sb flags we need to update 260 __int64_t m_update_flags; /* sb flags we need to update
261 on the next remount,rw */ 261 on the next remount,rw */
262 struct list_head m_mplist; /* inode shrinker mount list */ 262 struct shrinker m_inode_shrink; /* inode reclaim shrinker */
263} xfs_mount_t; 263} xfs_mount_t;
264 264
265/* 265/*