aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/afs/cache.h12
-rw-r--r--fs/afs/internal.h2
-rw-r--r--fs/anon_inodes.c2
-rw-r--r--fs/btrfs/acl.c6
-rw-r--r--fs/btrfs/btrfs_inode.h8
-rw-r--r--fs/btrfs/ctree.h27
-rw-r--r--fs/btrfs/disk-io.c10
-rw-r--r--fs/btrfs/extent-tree.c391
-rw-r--r--fs/btrfs/extent_io.c92
-rw-r--r--fs/btrfs/extent_io.h13
-rw-r--r--fs/btrfs/file.c35
-rw-r--r--fs/btrfs/inode.c239
-rw-r--r--fs/btrfs/ioctl.c62
-rw-r--r--fs/btrfs/ordered-data.c93
-rw-r--r--fs/btrfs/ordered-data.h4
-rw-r--r--fs/btrfs/super.c2
-rw-r--r--fs/btrfs/transaction.c10
-rw-r--r--fs/btrfs/volumes.c4
-rw-r--r--fs/btrfs/xattr.c2
-rw-r--r--fs/coda/psdev.c1
-rw-r--r--fs/ext4/Kconfig14
-rw-r--r--fs/ext4/ext4.h54
-rw-r--r--fs/ext4/ext4_extents.h7
-rw-r--r--fs/ext4/ext4_jbd2.h6
-rw-r--r--fs/ext4/extents.c444
-rw-r--r--fs/ext4/fsync.c5
-rw-r--r--fs/ext4/inode.c578
-rw-r--r--fs/ext4/mballoc.c305
-rw-r--r--fs/ext4/mballoc.h35
-rw-r--r--fs/ext4/migrate.c2
-rw-r--r--fs/ext4/move_extent.c20
-rw-r--r--fs/ext4/namei.c3
-rw-r--r--fs/ext4/super.c130
-rw-r--r--fs/fat/fat.h2
-rw-r--r--fs/fat/inode.c18
-rw-r--r--fs/fat/misc.c8
-rw-r--r--fs/fat/namei_vfat.c15
-rw-r--r--fs/jbd2/checkpoint.c7
-rw-r--r--fs/jbd2/commit.c59
-rw-r--r--fs/jbd2/journal.c198
-rw-r--r--fs/nfsd/nfsctl.c2
-rw-r--r--fs/nilfs2/btnode.c1
-rw-r--r--fs/nilfs2/dir.c2
-rw-r--r--fs/nilfs2/file.c2
-rw-r--r--fs/nilfs2/inode.c1
-rw-r--r--fs/nilfs2/mdt.c2
-rw-r--r--fs/nilfs2/nilfs.h4
-rw-r--r--fs/nls/nls_base.c8
-rw-r--r--fs/ocfs2/cluster/heartbeat.c2
-rw-r--r--fs/ocfs2/cluster/netdebug.c4
-rw-r--r--fs/ocfs2/dlm/dlmdebug.c8
-rw-r--r--fs/ocfs2/super.c2
-rw-r--r--fs/omfs/dir.c2
-rw-r--r--fs/omfs/file.c2
-rw-r--r--fs/omfs/omfs.h4
-rw-r--r--fs/partitions/check.c12
-rw-r--r--fs/select.c1
57 files changed, 1897 insertions, 1087 deletions
diff --git a/fs/afs/cache.h b/fs/afs/cache.h
deleted file mode 100644
index 5c4f6b499e90..000000000000
--- a/fs/afs/cache.h
+++ /dev/null
@@ -1,12 +0,0 @@
1/* AFS local cache management interface
2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#include <linux/fscache.h>
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 106be66dafd2..6ece2a13bf71 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -18,10 +18,10 @@
18#include <linux/key.h> 18#include <linux/key.h>
19#include <linux/workqueue.h> 19#include <linux/workqueue.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/fscache.h>
21 22
22#include "afs.h" 23#include "afs.h"
23#include "afs_vl.h" 24#include "afs_vl.h"
24#include "cache.h"
25 25
26#define AFS_CELL_MAX_ADDRS 15 26#define AFS_CELL_MAX_ADDRS 15
27 27
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index d11c51fc2a3f..2ca7a7cafdbf 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -8,8 +8,10 @@
8 * 8 *
9 */ 9 */
10 10
11#include <linux/cred.h>
11#include <linux/file.h> 12#include <linux/file.h>
12#include <linux/poll.h> 13#include <linux/poll.h>
14#include <linux/sched.h>
13#include <linux/slab.h> 15#include <linux/slab.h>
14#include <linux/init.h> 16#include <linux/init.h>
15#include <linux/fs.h> 17#include <linux/fs.h>
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index f128427b995b..69b355ae7f49 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -27,7 +27,7 @@
27#include "btrfs_inode.h" 27#include "btrfs_inode.h"
28#include "xattr.h" 28#include "xattr.h"
29 29
30#ifdef CONFIG_FS_POSIX_ACL 30#ifdef CONFIG_BTRFS_POSIX_ACL
31 31
32static struct posix_acl *btrfs_get_acl(struct inode *inode, int type) 32static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
33{ 33{
@@ -313,7 +313,7 @@ struct xattr_handler btrfs_xattr_acl_access_handler = {
313 .set = btrfs_xattr_acl_access_set, 313 .set = btrfs_xattr_acl_access_set,
314}; 314};
315 315
316#else /* CONFIG_FS_POSIX_ACL */ 316#else /* CONFIG_BTRFS_POSIX_ACL */
317 317
318int btrfs_acl_chmod(struct inode *inode) 318int btrfs_acl_chmod(struct inode *inode)
319{ 319{
@@ -325,4 +325,4 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir)
325 return 0; 325 return 0;
326} 326}
327 327
328#endif /* CONFIG_FS_POSIX_ACL */ 328#endif /* CONFIG_BTRFS_POSIX_ACL */
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 82ee56bba299..a54d354cefcb 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -128,6 +128,14 @@ struct btrfs_inode {
128 u64 last_unlink_trans; 128 u64 last_unlink_trans;
129 129
130 /* 130 /*
131 * These two counters are for delalloc metadata reservations. We keep
132 * track of how many extents we've accounted for vs how many extents we
133 * have.
134 */
135 int delalloc_reserved_extents;
136 int delalloc_extents;
137
138 /*
131 * ordered_data_close is set by truncate when a file that used 139 * ordered_data_close is set by truncate when a file that used
132 * to have good data has been truncated to zero. When it is set 140 * to have good data has been truncated to zero. When it is set
133 * the btrfs file release call will add this inode to the 141 * the btrfs file release call will add this inode to the
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 80599b4e42bd..dd8ced9814c4 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -675,18 +675,19 @@ struct btrfs_space_info {
675 current allocations */ 675 current allocations */
676 u64 bytes_readonly; /* total bytes that are read only */ 676 u64 bytes_readonly; /* total bytes that are read only */
677 u64 bytes_super; /* total bytes reserved for the super blocks */ 677 u64 bytes_super; /* total bytes reserved for the super blocks */
678 678 u64 bytes_root; /* the number of bytes needed to commit a
679 /* delalloc accounting */ 679 transaction */
680 u64 bytes_delalloc; /* number of bytes reserved for allocation,
681 this space is not necessarily reserved yet
682 by the allocator */
683 u64 bytes_may_use; /* number of bytes that may be used for 680 u64 bytes_may_use; /* number of bytes that may be used for
684 delalloc */ 681 delalloc/allocations */
682 u64 bytes_delalloc; /* number of bytes currently reserved for
683 delayed allocation */
685 684
686 int full; /* indicates that we cannot allocate any more 685 int full; /* indicates that we cannot allocate any more
687 chunks for this space */ 686 chunks for this space */
688 int force_alloc; /* set if we need to force a chunk alloc for 687 int force_alloc; /* set if we need to force a chunk alloc for
689 this space */ 688 this space */
689 int force_delalloc; /* make people start doing filemap_flush until
690 we're under a threshold */
690 691
691 struct list_head list; 692 struct list_head list;
692 693
@@ -695,6 +696,9 @@ struct btrfs_space_info {
695 spinlock_t lock; 696 spinlock_t lock;
696 struct rw_semaphore groups_sem; 697 struct rw_semaphore groups_sem;
697 atomic_t caching_threads; 698 atomic_t caching_threads;
699
700 int allocating_chunk;
701 wait_queue_head_t wait;
698}; 702};
699 703
700/* 704/*
@@ -2022,7 +2026,12 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
2022void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde); 2026void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
2023void btrfs_clear_space_info_full(struct btrfs_fs_info *info); 2027void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
2024 2028
2025int btrfs_check_metadata_free_space(struct btrfs_root *root); 2029int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items);
2030int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items);
2031int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
2032 struct inode *inode, int num_items);
2033int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
2034 struct inode *inode, int num_items);
2026int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, 2035int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
2027 u64 bytes); 2036 u64 bytes);
2028void btrfs_free_reserved_data_space(struct btrfs_root *root, 2037void btrfs_free_reserved_data_space(struct btrfs_root *root,
@@ -2326,7 +2335,7 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync);
2326int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, 2335int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
2327 int skip_pinned); 2336 int skip_pinned);
2328int btrfs_check_file(struct btrfs_root *root, struct inode *inode); 2337int btrfs_check_file(struct btrfs_root *root, struct inode *inode);
2329extern struct file_operations btrfs_file_operations; 2338extern const struct file_operations btrfs_file_operations;
2330int btrfs_drop_extents(struct btrfs_trans_handle *trans, 2339int btrfs_drop_extents(struct btrfs_trans_handle *trans,
2331 struct btrfs_root *root, struct inode *inode, 2340 struct btrfs_root *root, struct inode *inode,
2332 u64 start, u64 end, u64 locked_end, 2341 u64 start, u64 end, u64 locked_end,
@@ -2357,7 +2366,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options);
2357int btrfs_sync_fs(struct super_block *sb, int wait); 2366int btrfs_sync_fs(struct super_block *sb, int wait);
2358 2367
2359/* acl.c */ 2368/* acl.c */
2360#ifdef CONFIG_FS_POSIX_ACL 2369#ifdef CONFIG_BTRFS_POSIX_ACL
2361int btrfs_check_acl(struct inode *inode, int mask); 2370int btrfs_check_acl(struct inode *inode, int mask);
2362#else 2371#else
2363#define btrfs_check_acl NULL 2372#define btrfs_check_acl NULL
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 644e796fd643..af0435f79fa6 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -822,14 +822,14 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
822 822
823int btrfs_write_tree_block(struct extent_buffer *buf) 823int btrfs_write_tree_block(struct extent_buffer *buf)
824{ 824{
825 return btrfs_fdatawrite_range(buf->first_page->mapping, buf->start, 825 return filemap_fdatawrite_range(buf->first_page->mapping, buf->start,
826 buf->start + buf->len - 1, WB_SYNC_ALL); 826 buf->start + buf->len - 1);
827} 827}
828 828
829int btrfs_wait_tree_block_writeback(struct extent_buffer *buf) 829int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
830{ 830{
831 return btrfs_wait_on_page_writeback_range(buf->first_page->mapping, 831 return filemap_fdatawait_range(buf->first_page->mapping,
832 buf->start, buf->start + buf->len - 1); 832 buf->start, buf->start + buf->len - 1);
833} 833}
834 834
835struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, 835struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
@@ -1630,7 +1630,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1630 fs_info->sb = sb; 1630 fs_info->sb = sb;
1631 fs_info->max_extent = (u64)-1; 1631 fs_info->max_extent = (u64)-1;
1632 fs_info->max_inline = 8192 * 1024; 1632 fs_info->max_inline = 8192 * 1024;
1633 fs_info->metadata_ratio = 8; 1633 fs_info->metadata_ratio = 0;
1634 1634
1635 fs_info->thread_pool_size = min_t(unsigned long, 1635 fs_info->thread_pool_size = min_t(unsigned long,
1636 num_online_cpus() + 2, 8); 1636 num_online_cpus() + 2, 8);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 993f93ff7ba6..359a754c782c 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -68,6 +68,8 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
68 struct extent_buffer **must_clean); 68 struct extent_buffer **must_clean);
69static int find_next_key(struct btrfs_path *path, int level, 69static int find_next_key(struct btrfs_path *path, int level,
70 struct btrfs_key *key); 70 struct btrfs_key *key);
71static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
72 int dump_block_groups);
71 73
72static noinline int 74static noinline int
73block_group_cache_done(struct btrfs_block_group_cache *cache) 75block_group_cache_done(struct btrfs_block_group_cache *cache)
@@ -2765,67 +2767,346 @@ void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
2765 alloc_target); 2767 alloc_target);
2766} 2768}
2767 2769
2770static u64 calculate_bytes_needed(struct btrfs_root *root, int num_items)
2771{
2772 u64 num_bytes;
2773 int level;
2774
2775 level = BTRFS_MAX_LEVEL - 2;
2776 /*
2777 * NOTE: these calculations are absolutely the worst possible case.
2778 * This assumes that _every_ item we insert will require a new leaf, and
2779 * that the tree has grown to its maximum level size.
2780 */
2781
2782 /*
2783 * for every item we insert we could insert both an extent item and a
2784 * extent ref item. Then for ever item we insert, we will need to cow
2785 * both the original leaf, plus the leaf to the left and right of it.
2786 *
2787 * Unless we are talking about the extent root, then we just want the
2788 * number of items * 2, since we just need the extent item plus its ref.
2789 */
2790 if (root == root->fs_info->extent_root)
2791 num_bytes = num_items * 2;
2792 else
2793 num_bytes = (num_items + (2 * num_items)) * 3;
2794
2795 /*
2796 * num_bytes is total number of leaves we could need times the leaf
2797 * size, and then for every leaf we could end up cow'ing 2 nodes per
2798 * level, down to the leaf level.
2799 */
2800 num_bytes = (num_bytes * root->leafsize) +
2801 (num_bytes * (level * 2)) * root->nodesize;
2802
2803 return num_bytes;
2804}
2805
2768/* 2806/*
2769 * for now this just makes sure we have at least 5% of our metadata space free 2807 * Unreserve metadata space for delalloc. If we have less reserved credits than
2770 * for use. 2808 * we have extents, this function does nothing.
2771 */ 2809 */
2772int btrfs_check_metadata_free_space(struct btrfs_root *root) 2810int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
2811 struct inode *inode, int num_items)
2773{ 2812{
2774 struct btrfs_fs_info *info = root->fs_info; 2813 struct btrfs_fs_info *info = root->fs_info;
2775 struct btrfs_space_info *meta_sinfo; 2814 struct btrfs_space_info *meta_sinfo;
2776 u64 alloc_target, thresh; 2815 u64 num_bytes;
2777 int committed = 0, ret; 2816 u64 alloc_target;
2817 bool bug = false;
2778 2818
2779 /* get the space info for where the metadata will live */ 2819 /* get the space info for where the metadata will live */
2780 alloc_target = btrfs_get_alloc_profile(root, 0); 2820 alloc_target = btrfs_get_alloc_profile(root, 0);
2781 meta_sinfo = __find_space_info(info, alloc_target); 2821 meta_sinfo = __find_space_info(info, alloc_target);
2782 if (!meta_sinfo)
2783 goto alloc;
2784 2822
2785again: 2823 num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
2824 num_items);
2825
2786 spin_lock(&meta_sinfo->lock); 2826 spin_lock(&meta_sinfo->lock);
2787 if (!meta_sinfo->full) 2827 if (BTRFS_I(inode)->delalloc_reserved_extents <=
2788 thresh = meta_sinfo->total_bytes * 80; 2828 BTRFS_I(inode)->delalloc_extents) {
2789 else 2829 spin_unlock(&meta_sinfo->lock);
2790 thresh = meta_sinfo->total_bytes * 95; 2830 return 0;
2831 }
2832
2833 BTRFS_I(inode)->delalloc_reserved_extents--;
2834 BUG_ON(BTRFS_I(inode)->delalloc_reserved_extents < 0);
2835
2836 if (meta_sinfo->bytes_delalloc < num_bytes) {
2837 bug = true;
2838 meta_sinfo->bytes_delalloc = 0;
2839 } else {
2840 meta_sinfo->bytes_delalloc -= num_bytes;
2841 }
2842 spin_unlock(&meta_sinfo->lock);
2791 2843
2844 BUG_ON(bug);
2845
2846 return 0;
2847}
2848
2849static void check_force_delalloc(struct btrfs_space_info *meta_sinfo)
2850{
2851 u64 thresh;
2852
2853 thresh = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
2854 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
2855 meta_sinfo->bytes_super + meta_sinfo->bytes_root +
2856 meta_sinfo->bytes_may_use;
2857
2858 thresh = meta_sinfo->total_bytes - thresh;
2859 thresh *= 80;
2792 do_div(thresh, 100); 2860 do_div(thresh, 100);
2861 if (thresh <= meta_sinfo->bytes_delalloc)
2862 meta_sinfo->force_delalloc = 1;
2863 else
2864 meta_sinfo->force_delalloc = 0;
2865}
2793 2866
2794 if (meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + 2867static int maybe_allocate_chunk(struct btrfs_root *root,
2795 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + 2868 struct btrfs_space_info *info)
2796 meta_sinfo->bytes_super > thresh) { 2869{
2797 struct btrfs_trans_handle *trans; 2870 struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
2798 if (!meta_sinfo->full) { 2871 struct btrfs_trans_handle *trans;
2799 meta_sinfo->force_alloc = 1; 2872 bool wait = false;
2873 int ret = 0;
2874 u64 min_metadata;
2875 u64 free_space;
2876
2877 free_space = btrfs_super_total_bytes(disk_super);
2878 /*
2879 * we allow the metadata to grow to a max of either 5gb or 5% of the
2880 * space in the volume.
2881 */
2882 min_metadata = min((u64)5 * 1024 * 1024 * 1024,
2883 div64_u64(free_space * 5, 100));
2884 if (info->total_bytes >= min_metadata) {
2885 spin_unlock(&info->lock);
2886 return 0;
2887 }
2888
2889 if (info->full) {
2890 spin_unlock(&info->lock);
2891 return 0;
2892 }
2893
2894 if (!info->allocating_chunk) {
2895 info->force_alloc = 1;
2896 info->allocating_chunk = 1;
2897 init_waitqueue_head(&info->wait);
2898 } else {
2899 wait = true;
2900 }
2901
2902 spin_unlock(&info->lock);
2903
2904 if (wait) {
2905 wait_event(info->wait,
2906 !info->allocating_chunk);
2907 return 1;
2908 }
2909
2910 trans = btrfs_start_transaction(root, 1);
2911 if (!trans) {
2912 ret = -ENOMEM;
2913 goto out;
2914 }
2915
2916 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
2917 4096 + 2 * 1024 * 1024,
2918 info->flags, 0);
2919 btrfs_end_transaction(trans, root);
2920 if (ret)
2921 goto out;
2922out:
2923 spin_lock(&info->lock);
2924 info->allocating_chunk = 0;
2925 spin_unlock(&info->lock);
2926 wake_up(&info->wait);
2927
2928 if (ret)
2929 return 0;
2930 return 1;
2931}
2932
2933/*
2934 * Reserve metadata space for delalloc.
2935 */
2936int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
2937 struct inode *inode, int num_items)
2938{
2939 struct btrfs_fs_info *info = root->fs_info;
2940 struct btrfs_space_info *meta_sinfo;
2941 u64 num_bytes;
2942 u64 used;
2943 u64 alloc_target;
2944 int flushed = 0;
2945 int force_delalloc;
2946
2947 /* get the space info for where the metadata will live */
2948 alloc_target = btrfs_get_alloc_profile(root, 0);
2949 meta_sinfo = __find_space_info(info, alloc_target);
2950
2951 num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
2952 num_items);
2953again:
2954 spin_lock(&meta_sinfo->lock);
2955
2956 force_delalloc = meta_sinfo->force_delalloc;
2957
2958 if (unlikely(!meta_sinfo->bytes_root))
2959 meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
2960
2961 if (!flushed)
2962 meta_sinfo->bytes_delalloc += num_bytes;
2963
2964 used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
2965 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
2966 meta_sinfo->bytes_super + meta_sinfo->bytes_root +
2967 meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
2968
2969 if (used > meta_sinfo->total_bytes) {
2970 flushed++;
2971
2972 if (flushed == 1) {
2973 if (maybe_allocate_chunk(root, meta_sinfo))
2974 goto again;
2975 flushed++;
2976 } else {
2800 spin_unlock(&meta_sinfo->lock); 2977 spin_unlock(&meta_sinfo->lock);
2801alloc: 2978 }
2802 trans = btrfs_start_transaction(root, 1);
2803 if (!trans)
2804 return -ENOMEM;
2805 2979
2806 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 2980 if (flushed == 2) {
2807 2 * 1024 * 1024, alloc_target, 0); 2981 filemap_flush(inode->i_mapping);
2808 btrfs_end_transaction(trans, root); 2982 goto again;
2809 if (!meta_sinfo) { 2983 } else if (flushed == 3) {
2810 meta_sinfo = __find_space_info(info, 2984 btrfs_start_delalloc_inodes(root);
2811 alloc_target); 2985 btrfs_wait_ordered_extents(root, 0);
2812 }
2813 goto again; 2986 goto again;
2814 } 2987 }
2988 spin_lock(&meta_sinfo->lock);
2989 meta_sinfo->bytes_delalloc -= num_bytes;
2815 spin_unlock(&meta_sinfo->lock); 2990 spin_unlock(&meta_sinfo->lock);
2991 printk(KERN_ERR "enospc, has %d, reserved %d\n",
2992 BTRFS_I(inode)->delalloc_extents,
2993 BTRFS_I(inode)->delalloc_reserved_extents);
2994 dump_space_info(meta_sinfo, 0, 0);
2995 return -ENOSPC;
2996 }
2816 2997
2817 if (!committed) { 2998 BTRFS_I(inode)->delalloc_reserved_extents++;
2818 committed = 1; 2999 check_force_delalloc(meta_sinfo);
2819 trans = btrfs_join_transaction(root, 1); 3000 spin_unlock(&meta_sinfo->lock);
2820 if (!trans) 3001
2821 return -ENOMEM; 3002 if (!flushed && force_delalloc)
2822 ret = btrfs_commit_transaction(trans, root); 3003 filemap_flush(inode->i_mapping);
2823 if (ret) 3004
2824 return ret; 3005 return 0;
3006}
3007
3008/*
3009 * unreserve num_items number of items worth of metadata space. This needs to
3010 * be paired with btrfs_reserve_metadata_space.
3011 *
3012 * NOTE: if you have the option, run this _AFTER_ you do a
3013 * btrfs_end_transaction, since btrfs_end_transaction will run delayed ref
3014 * oprations which will result in more used metadata, so we want to make sure we
3015 * can do that without issue.
3016 */
3017int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items)
3018{
3019 struct btrfs_fs_info *info = root->fs_info;
3020 struct btrfs_space_info *meta_sinfo;
3021 u64 num_bytes;
3022 u64 alloc_target;
3023 bool bug = false;
3024
3025 /* get the space info for where the metadata will live */
3026 alloc_target = btrfs_get_alloc_profile(root, 0);
3027 meta_sinfo = __find_space_info(info, alloc_target);
3028
3029 num_bytes = calculate_bytes_needed(root, num_items);
3030
3031 spin_lock(&meta_sinfo->lock);
3032 if (meta_sinfo->bytes_may_use < num_bytes) {
3033 bug = true;
3034 meta_sinfo->bytes_may_use = 0;
3035 } else {
3036 meta_sinfo->bytes_may_use -= num_bytes;
3037 }
3038 spin_unlock(&meta_sinfo->lock);
3039
3040 BUG_ON(bug);
3041
3042 return 0;
3043}
3044
3045/*
3046 * Reserve some metadata space for use. We'll calculate the worste case number
3047 * of bytes that would be needed to modify num_items number of items. If we
3048 * have space, fantastic, if not, you get -ENOSPC. Please call
3049 * btrfs_unreserve_metadata_space when you are done for the _SAME_ number of
3050 * items you reserved, since whatever metadata you needed should have already
3051 * been allocated.
3052 *
3053 * This will commit the transaction to make more space if we don't have enough
3054 * metadata space. THe only time we don't do this is if we're reserving space
3055 * inside of a transaction, then we will just return -ENOSPC and it is the
3056 * callers responsibility to handle it properly.
3057 */
3058int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items)
3059{
3060 struct btrfs_fs_info *info = root->fs_info;
3061 struct btrfs_space_info *meta_sinfo;
3062 u64 num_bytes;
3063 u64 used;
3064 u64 alloc_target;
3065 int retries = 0;
3066
3067 /* get the space info for where the metadata will live */
3068 alloc_target = btrfs_get_alloc_profile(root, 0);
3069 meta_sinfo = __find_space_info(info, alloc_target);
3070
3071 num_bytes = calculate_bytes_needed(root, num_items);
3072again:
3073 spin_lock(&meta_sinfo->lock);
3074
3075 if (unlikely(!meta_sinfo->bytes_root))
3076 meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
3077
3078 if (!retries)
3079 meta_sinfo->bytes_may_use += num_bytes;
3080
3081 used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
3082 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
3083 meta_sinfo->bytes_super + meta_sinfo->bytes_root +
3084 meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
3085
3086 if (used > meta_sinfo->total_bytes) {
3087 retries++;
3088 if (retries == 1) {
3089 if (maybe_allocate_chunk(root, meta_sinfo))
3090 goto again;
3091 retries++;
3092 } else {
3093 spin_unlock(&meta_sinfo->lock);
3094 }
3095
3096 if (retries == 2) {
3097 btrfs_start_delalloc_inodes(root);
3098 btrfs_wait_ordered_extents(root, 0);
2825 goto again; 3099 goto again;
2826 } 3100 }
3101 spin_lock(&meta_sinfo->lock);
3102 meta_sinfo->bytes_may_use -= num_bytes;
3103 spin_unlock(&meta_sinfo->lock);
3104
3105 dump_space_info(meta_sinfo, 0, 0);
2827 return -ENOSPC; 3106 return -ENOSPC;
2828 } 3107 }
3108
3109 check_force_delalloc(meta_sinfo);
2829 spin_unlock(&meta_sinfo->lock); 3110 spin_unlock(&meta_sinfo->lock);
2830 3111
2831 return 0; 3112 return 0;
@@ -2888,7 +3169,7 @@ alloc:
2888 spin_unlock(&data_sinfo->lock); 3169 spin_unlock(&data_sinfo->lock);
2889 3170
2890 /* commit the current transaction and try again */ 3171 /* commit the current transaction and try again */
2891 if (!committed) { 3172 if (!committed && !root->fs_info->open_ioctl_trans) {
2892 committed = 1; 3173 committed = 1;
2893 trans = btrfs_join_transaction(root, 1); 3174 trans = btrfs_join_transaction(root, 1);
2894 if (!trans) 3175 if (!trans)
@@ -2916,7 +3197,7 @@ alloc:
2916 BTRFS_I(inode)->reserved_bytes += bytes; 3197 BTRFS_I(inode)->reserved_bytes += bytes;
2917 spin_unlock(&data_sinfo->lock); 3198 spin_unlock(&data_sinfo->lock);
2918 3199
2919 return btrfs_check_metadata_free_space(root); 3200 return 0;
2920} 3201}
2921 3202
2922/* 3203/*
@@ -3015,17 +3296,15 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3015 BUG_ON(!space_info); 3296 BUG_ON(!space_info);
3016 3297
3017 spin_lock(&space_info->lock); 3298 spin_lock(&space_info->lock);
3018 if (space_info->force_alloc) { 3299 if (space_info->force_alloc)
3019 force = 1; 3300 force = 1;
3020 space_info->force_alloc = 0;
3021 }
3022 if (space_info->full) { 3301 if (space_info->full) {
3023 spin_unlock(&space_info->lock); 3302 spin_unlock(&space_info->lock);
3024 goto out; 3303 goto out;
3025 } 3304 }
3026 3305
3027 thresh = space_info->total_bytes - space_info->bytes_readonly; 3306 thresh = space_info->total_bytes - space_info->bytes_readonly;
3028 thresh = div_factor(thresh, 6); 3307 thresh = div_factor(thresh, 8);
3029 if (!force && 3308 if (!force &&
3030 (space_info->bytes_used + space_info->bytes_pinned + 3309 (space_info->bytes_used + space_info->bytes_pinned +
3031 space_info->bytes_reserved + alloc_bytes) < thresh) { 3310 space_info->bytes_reserved + alloc_bytes) < thresh) {
@@ -3039,7 +3318,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3039 * we keep a reasonable number of metadata chunks allocated in the 3318 * we keep a reasonable number of metadata chunks allocated in the
3040 * FS as well. 3319 * FS as well.
3041 */ 3320 */
3042 if (flags & BTRFS_BLOCK_GROUP_DATA) { 3321 if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
3043 fs_info->data_chunk_allocations++; 3322 fs_info->data_chunk_allocations++;
3044 if (!(fs_info->data_chunk_allocations % 3323 if (!(fs_info->data_chunk_allocations %
3045 fs_info->metadata_ratio)) 3324 fs_info->metadata_ratio))
@@ -3047,8 +3326,11 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3047 } 3326 }
3048 3327
3049 ret = btrfs_alloc_chunk(trans, extent_root, flags); 3328 ret = btrfs_alloc_chunk(trans, extent_root, flags);
3329 spin_lock(&space_info->lock);
3050 if (ret) 3330 if (ret)
3051 space_info->full = 1; 3331 space_info->full = 1;
3332 space_info->force_alloc = 0;
3333 spin_unlock(&space_info->lock);
3052out: 3334out:
3053 mutex_unlock(&extent_root->fs_info->chunk_mutex); 3335 mutex_unlock(&extent_root->fs_info->chunk_mutex);
3054 return ret; 3336 return ret;
@@ -4063,21 +4345,32 @@ loop:
4063 return ret; 4345 return ret;
4064} 4346}
4065 4347
4066static void dump_space_info(struct btrfs_space_info *info, u64 bytes) 4348static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
4349 int dump_block_groups)
4067{ 4350{
4068 struct btrfs_block_group_cache *cache; 4351 struct btrfs_block_group_cache *cache;
4069 4352
4353 spin_lock(&info->lock);
4070 printk(KERN_INFO "space_info has %llu free, is %sfull\n", 4354 printk(KERN_INFO "space_info has %llu free, is %sfull\n",
4071 (unsigned long long)(info->total_bytes - info->bytes_used - 4355 (unsigned long long)(info->total_bytes - info->bytes_used -
4072 info->bytes_pinned - info->bytes_reserved), 4356 info->bytes_pinned - info->bytes_reserved -
4357 info->bytes_super),
4073 (info->full) ? "" : "not "); 4358 (info->full) ? "" : "not ");
4074 printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu," 4359 printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu,"
4075 " may_use=%llu, used=%llu\n", 4360 " may_use=%llu, used=%llu, root=%llu, super=%llu, reserved=%llu"
4361 "\n",
4076 (unsigned long long)info->total_bytes, 4362 (unsigned long long)info->total_bytes,
4077 (unsigned long long)info->bytes_pinned, 4363 (unsigned long long)info->bytes_pinned,
4078 (unsigned long long)info->bytes_delalloc, 4364 (unsigned long long)info->bytes_delalloc,
4079 (unsigned long long)info->bytes_may_use, 4365 (unsigned long long)info->bytes_may_use,
4080 (unsigned long long)info->bytes_used); 4366 (unsigned long long)info->bytes_used,
4367 (unsigned long long)info->bytes_root,
4368 (unsigned long long)info->bytes_super,
4369 (unsigned long long)info->bytes_reserved);
4370 spin_unlock(&info->lock);
4371
4372 if (!dump_block_groups)
4373 return;
4081 4374
4082 down_read(&info->groups_sem); 4375 down_read(&info->groups_sem);
4083 list_for_each_entry(cache, &info->block_groups, list) { 4376 list_for_each_entry(cache, &info->block_groups, list) {
@@ -4145,7 +4438,7 @@ again:
4145 printk(KERN_ERR "btrfs allocation failed flags %llu, " 4438 printk(KERN_ERR "btrfs allocation failed flags %llu, "
4146 "wanted %llu\n", (unsigned long long)data, 4439 "wanted %llu\n", (unsigned long long)data,
4147 (unsigned long long)num_bytes); 4440 (unsigned long long)num_bytes);
4148 dump_space_info(sinfo, num_bytes); 4441 dump_space_info(sinfo, num_bytes, 1);
4149 } 4442 }
4150 4443
4151 return ret; 4444 return ret;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 0cb88f8146ea..de1793ba004a 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -280,6 +280,14 @@ static struct extent_buffer *buffer_search(struct extent_io_tree *tree,
280 return NULL; 280 return NULL;
281} 281}
282 282
283static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
284 struct extent_state *other)
285{
286 if (tree->ops && tree->ops->merge_extent_hook)
287 tree->ops->merge_extent_hook(tree->mapping->host, new,
288 other);
289}
290
283/* 291/*
284 * utility function to look for merge candidates inside a given range. 292 * utility function to look for merge candidates inside a given range.
285 * Any extents with matching state are merged together into a single 293 * Any extents with matching state are merged together into a single
@@ -303,6 +311,7 @@ static int merge_state(struct extent_io_tree *tree,
303 other = rb_entry(other_node, struct extent_state, rb_node); 311 other = rb_entry(other_node, struct extent_state, rb_node);
304 if (other->end == state->start - 1 && 312 if (other->end == state->start - 1 &&
305 other->state == state->state) { 313 other->state == state->state) {
314 merge_cb(tree, state, other);
306 state->start = other->start; 315 state->start = other->start;
307 other->tree = NULL; 316 other->tree = NULL;
308 rb_erase(&other->rb_node, &tree->state); 317 rb_erase(&other->rb_node, &tree->state);
@@ -314,33 +323,37 @@ static int merge_state(struct extent_io_tree *tree,
314 other = rb_entry(other_node, struct extent_state, rb_node); 323 other = rb_entry(other_node, struct extent_state, rb_node);
315 if (other->start == state->end + 1 && 324 if (other->start == state->end + 1 &&
316 other->state == state->state) { 325 other->state == state->state) {
326 merge_cb(tree, state, other);
317 other->start = state->start; 327 other->start = state->start;
318 state->tree = NULL; 328 state->tree = NULL;
319 rb_erase(&state->rb_node, &tree->state); 329 rb_erase(&state->rb_node, &tree->state);
320 free_extent_state(state); 330 free_extent_state(state);
331 state = NULL;
321 } 332 }
322 } 333 }
334
323 return 0; 335 return 0;
324} 336}
325 337
326static void set_state_cb(struct extent_io_tree *tree, 338static int set_state_cb(struct extent_io_tree *tree,
327 struct extent_state *state, 339 struct extent_state *state,
328 unsigned long bits) 340 unsigned long bits)
329{ 341{
330 if (tree->ops && tree->ops->set_bit_hook) { 342 if (tree->ops && tree->ops->set_bit_hook) {
331 tree->ops->set_bit_hook(tree->mapping->host, state->start, 343 return tree->ops->set_bit_hook(tree->mapping->host,
332 state->end, state->state, bits); 344 state->start, state->end,
345 state->state, bits);
333 } 346 }
347
348 return 0;
334} 349}
335 350
336static void clear_state_cb(struct extent_io_tree *tree, 351static void clear_state_cb(struct extent_io_tree *tree,
337 struct extent_state *state, 352 struct extent_state *state,
338 unsigned long bits) 353 unsigned long bits)
339{ 354{
340 if (tree->ops && tree->ops->clear_bit_hook) { 355 if (tree->ops && tree->ops->clear_bit_hook)
341 tree->ops->clear_bit_hook(tree->mapping->host, state->start, 356 tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
342 state->end, state->state, bits);
343 }
344} 357}
345 358
346/* 359/*
@@ -358,6 +371,7 @@ static int insert_state(struct extent_io_tree *tree,
358 int bits) 371 int bits)
359{ 372{
360 struct rb_node *node; 373 struct rb_node *node;
374 int ret;
361 375
362 if (end < start) { 376 if (end < start) {
363 printk(KERN_ERR "btrfs end < start %llu %llu\n", 377 printk(KERN_ERR "btrfs end < start %llu %llu\n",
@@ -365,11 +379,14 @@ static int insert_state(struct extent_io_tree *tree,
365 (unsigned long long)start); 379 (unsigned long long)start);
366 WARN_ON(1); 380 WARN_ON(1);
367 } 381 }
368 if (bits & EXTENT_DIRTY)
369 tree->dirty_bytes += end - start + 1;
370 state->start = start; 382 state->start = start;
371 state->end = end; 383 state->end = end;
372 set_state_cb(tree, state, bits); 384 ret = set_state_cb(tree, state, bits);
385 if (ret)
386 return ret;
387
388 if (bits & EXTENT_DIRTY)
389 tree->dirty_bytes += end - start + 1;
373 state->state |= bits; 390 state->state |= bits;
374 node = tree_insert(&tree->state, end, &state->rb_node); 391 node = tree_insert(&tree->state, end, &state->rb_node);
375 if (node) { 392 if (node) {
@@ -387,6 +404,15 @@ static int insert_state(struct extent_io_tree *tree,
387 return 0; 404 return 0;
388} 405}
389 406
407static int split_cb(struct extent_io_tree *tree, struct extent_state *orig,
408 u64 split)
409{
410 if (tree->ops && tree->ops->split_extent_hook)
411 return tree->ops->split_extent_hook(tree->mapping->host,
412 orig, split);
413 return 0;
414}
415
390/* 416/*
391 * split a given extent state struct in two, inserting the preallocated 417 * split a given extent state struct in two, inserting the preallocated
392 * struct 'prealloc' as the newly created second half. 'split' indicates an 418 * struct 'prealloc' as the newly created second half. 'split' indicates an
@@ -405,6 +431,9 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
405 struct extent_state *prealloc, u64 split) 431 struct extent_state *prealloc, u64 split)
406{ 432{
407 struct rb_node *node; 433 struct rb_node *node;
434
435 split_cb(tree, orig, split);
436
408 prealloc->start = orig->start; 437 prealloc->start = orig->start;
409 prealloc->end = split - 1; 438 prealloc->end = split - 1;
410 prealloc->state = orig->state; 439 prealloc->state = orig->state;
@@ -542,8 +571,8 @@ hit_next:
542 if (err) 571 if (err)
543 goto out; 572 goto out;
544 if (state->end <= end) { 573 if (state->end <= end) {
545 set |= clear_state_bit(tree, state, bits, 574 set |= clear_state_bit(tree, state, bits, wake,
546 wake, delete); 575 delete);
547 if (last_end == (u64)-1) 576 if (last_end == (u64)-1)
548 goto out; 577 goto out;
549 start = last_end + 1; 578 start = last_end + 1;
@@ -561,12 +590,11 @@ hit_next:
561 prealloc = alloc_extent_state(GFP_ATOMIC); 590 prealloc = alloc_extent_state(GFP_ATOMIC);
562 err = split_state(tree, state, prealloc, end + 1); 591 err = split_state(tree, state, prealloc, end + 1);
563 BUG_ON(err == -EEXIST); 592 BUG_ON(err == -EEXIST);
564
565 if (wake) 593 if (wake)
566 wake_up(&state->wq); 594 wake_up(&state->wq);
567 595
568 set |= clear_state_bit(tree, prealloc, bits, 596 set |= clear_state_bit(tree, prealloc, bits, wake, delete);
569 wake, delete); 597
570 prealloc = NULL; 598 prealloc = NULL;
571 goto out; 599 goto out;
572 } 600 }
@@ -667,16 +695,23 @@ out:
667 return 0; 695 return 0;
668} 696}
669 697
670static void set_state_bits(struct extent_io_tree *tree, 698static int set_state_bits(struct extent_io_tree *tree,
671 struct extent_state *state, 699 struct extent_state *state,
672 int bits) 700 int bits)
673{ 701{
702 int ret;
703
704 ret = set_state_cb(tree, state, bits);
705 if (ret)
706 return ret;
707
674 if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { 708 if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
675 u64 range = state->end - state->start + 1; 709 u64 range = state->end - state->start + 1;
676 tree->dirty_bytes += range; 710 tree->dirty_bytes += range;
677 } 711 }
678 set_state_cb(tree, state, bits);
679 state->state |= bits; 712 state->state |= bits;
713
714 return 0;
680} 715}
681 716
682static void cache_state(struct extent_state *state, 717static void cache_state(struct extent_state *state,
@@ -758,7 +793,10 @@ hit_next:
758 goto out; 793 goto out;
759 } 794 }
760 795
761 set_state_bits(tree, state, bits); 796 err = set_state_bits(tree, state, bits);
797 if (err)
798 goto out;
799
762 cache_state(state, cached_state); 800 cache_state(state, cached_state);
763 merge_state(tree, state); 801 merge_state(tree, state);
764 if (last_end == (u64)-1) 802 if (last_end == (u64)-1)
@@ -805,7 +843,9 @@ hit_next:
805 if (err) 843 if (err)
806 goto out; 844 goto out;
807 if (state->end <= end) { 845 if (state->end <= end) {
808 set_state_bits(tree, state, bits); 846 err = set_state_bits(tree, state, bits);
847 if (err)
848 goto out;
809 cache_state(state, cached_state); 849 cache_state(state, cached_state);
810 merge_state(tree, state); 850 merge_state(tree, state);
811 if (last_end == (u64)-1) 851 if (last_end == (u64)-1)
@@ -829,11 +869,13 @@ hit_next:
829 this_end = last_start - 1; 869 this_end = last_start - 1;
830 err = insert_state(tree, prealloc, start, this_end, 870 err = insert_state(tree, prealloc, start, this_end,
831 bits); 871 bits);
832 cache_state(prealloc, cached_state);
833 prealloc = NULL;
834 BUG_ON(err == -EEXIST); 872 BUG_ON(err == -EEXIST);
835 if (err) 873 if (err) {
874 prealloc = NULL;
836 goto out; 875 goto out;
876 }
877 cache_state(prealloc, cached_state);
878 prealloc = NULL;
837 start = this_end + 1; 879 start = this_end + 1;
838 goto search_again; 880 goto search_again;
839 } 881 }
@@ -852,7 +894,11 @@ hit_next:
852 err = split_state(tree, state, prealloc, end + 1); 894 err = split_state(tree, state, prealloc, end + 1);
853 BUG_ON(err == -EEXIST); 895 BUG_ON(err == -EEXIST);
854 896
855 set_state_bits(tree, prealloc, bits); 897 err = set_state_bits(tree, prealloc, bits);
898 if (err) {
899 prealloc = NULL;
900 goto out;
901 }
856 cache_state(prealloc, cached_state); 902 cache_state(prealloc, cached_state);
857 merge_state(tree, prealloc); 903 merge_state(tree, prealloc);
858 prealloc = NULL; 904 prealloc = NULL;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 14ed16fd862d..4794ec891fed 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -60,8 +60,13 @@ struct extent_io_ops {
60 struct extent_state *state, int uptodate); 60 struct extent_state *state, int uptodate);
61 int (*set_bit_hook)(struct inode *inode, u64 start, u64 end, 61 int (*set_bit_hook)(struct inode *inode, u64 start, u64 end,
62 unsigned long old, unsigned long bits); 62 unsigned long old, unsigned long bits);
63 int (*clear_bit_hook)(struct inode *inode, u64 start, u64 end, 63 int (*clear_bit_hook)(struct inode *inode, struct extent_state *state,
64 unsigned long old, unsigned long bits); 64 unsigned long bits);
65 int (*merge_extent_hook)(struct inode *inode,
66 struct extent_state *new,
67 struct extent_state *other);
68 int (*split_extent_hook)(struct inode *inode,
69 struct extent_state *orig, u64 split);
65 int (*write_cache_pages_lock_hook)(struct page *page); 70 int (*write_cache_pages_lock_hook)(struct page *page);
66}; 71};
67 72
@@ -79,10 +84,14 @@ struct extent_state {
79 u64 start; 84 u64 start;
80 u64 end; /* inclusive */ 85 u64 end; /* inclusive */
81 struct rb_node rb_node; 86 struct rb_node rb_node;
87
88 /* ADD NEW ELEMENTS AFTER THIS */
82 struct extent_io_tree *tree; 89 struct extent_io_tree *tree;
83 wait_queue_head_t wq; 90 wait_queue_head_t wq;
84 atomic_t refs; 91 atomic_t refs;
85 unsigned long state; 92 unsigned long state;
93 u64 split_start;
94 u64 split_end;
86 95
87 /* for use by the FS */ 96 /* for use by the FS */
88 u64 private; 97 u64 private;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index a3492a3ad96b..f19e1259a971 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -123,7 +123,10 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
123 root->sectorsize - 1) & ~((u64)root->sectorsize - 1); 123 root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
124 124
125 end_of_last_block = start_pos + num_bytes - 1; 125 end_of_last_block = start_pos + num_bytes - 1;
126 btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block); 126 err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
127 if (err)
128 return err;
129
127 for (i = 0; i < num_pages; i++) { 130 for (i = 0; i < num_pages; i++) {
128 struct page *p = pages[i]; 131 struct page *p = pages[i];
129 SetPageUptodate(p); 132 SetPageUptodate(p);
@@ -917,21 +920,35 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
917 start_pos = pos; 920 start_pos = pos;
918 921
919 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); 922 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
923
924 /* do the reserve before the mutex lock in case we have to do some
925 * flushing. We wouldn't deadlock, but this is more polite.
926 */
927 err = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
928 if (err)
929 goto out_nolock;
930
931 mutex_lock(&inode->i_mutex);
932
920 current->backing_dev_info = inode->i_mapping->backing_dev_info; 933 current->backing_dev_info = inode->i_mapping->backing_dev_info;
921 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); 934 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
922 if (err) 935 if (err)
923 goto out_nolock; 936 goto out;
937
924 if (count == 0) 938 if (count == 0)
925 goto out_nolock; 939 goto out;
926 940
927 err = file_remove_suid(file); 941 err = file_remove_suid(file);
928 if (err) 942 if (err)
929 goto out_nolock; 943 goto out;
944
930 file_update_time(file); 945 file_update_time(file);
931 946
932 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); 947 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
933 948
934 mutex_lock(&inode->i_mutex); 949 /* generic_write_checks can change our pos */
950 start_pos = pos;
951
935 BTRFS_I(inode)->sequence++; 952 BTRFS_I(inode)->sequence++;
936 first_index = pos >> PAGE_CACHE_SHIFT; 953 first_index = pos >> PAGE_CACHE_SHIFT;
937 last_index = (pos + count) >> PAGE_CACHE_SHIFT; 954 last_index = (pos + count) >> PAGE_CACHE_SHIFT;
@@ -1005,9 +1022,8 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
1005 } 1022 }
1006 1023
1007 if (will_write) { 1024 if (will_write) {
1008 btrfs_fdatawrite_range(inode->i_mapping, pos, 1025 filemap_fdatawrite_range(inode->i_mapping, pos,
1009 pos + write_bytes - 1, 1026 pos + write_bytes - 1);
1010 WB_SYNC_ALL);
1011 } else { 1027 } else {
1012 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1028 balance_dirty_pages_ratelimited_nr(inode->i_mapping,
1013 num_pages); 1029 num_pages);
@@ -1028,6 +1044,7 @@ out:
1028 mutex_unlock(&inode->i_mutex); 1044 mutex_unlock(&inode->i_mutex);
1029 if (ret) 1045 if (ret)
1030 err = ret; 1046 err = ret;
1047 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
1031 1048
1032out_nolock: 1049out_nolock:
1033 kfree(pages); 1050 kfree(pages);
@@ -1196,7 +1213,7 @@ static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
1196 return 0; 1213 return 0;
1197} 1214}
1198 1215
1199struct file_operations btrfs_file_operations = { 1216const struct file_operations btrfs_file_operations = {
1200 .llseek = generic_file_llseek, 1217 .llseek = generic_file_llseek,
1201 .read = do_sync_read, 1218 .read = do_sync_read,
1202 .aio_read = generic_file_aio_read, 1219 .aio_read = generic_file_aio_read,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index e9b76bcd1c12..112e5aa85892 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -62,7 +62,7 @@ static const struct inode_operations btrfs_special_inode_operations;
62static const struct inode_operations btrfs_file_inode_operations; 62static const struct inode_operations btrfs_file_inode_operations;
63static const struct address_space_operations btrfs_aops; 63static const struct address_space_operations btrfs_aops;
64static const struct address_space_operations btrfs_symlink_aops; 64static const struct address_space_operations btrfs_symlink_aops;
65static struct file_operations btrfs_dir_file_operations; 65static const struct file_operations btrfs_dir_file_operations;
66static struct extent_io_ops btrfs_extent_io_ops; 66static struct extent_io_ops btrfs_extent_io_ops;
67 67
68static struct kmem_cache *btrfs_inode_cachep; 68static struct kmem_cache *btrfs_inode_cachep;
@@ -1159,6 +1159,83 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1159 return ret; 1159 return ret;
1160} 1160}
1161 1161
1162static int btrfs_split_extent_hook(struct inode *inode,
1163 struct extent_state *orig, u64 split)
1164{
1165 struct btrfs_root *root = BTRFS_I(inode)->root;
1166 u64 size;
1167
1168 if (!(orig->state & EXTENT_DELALLOC))
1169 return 0;
1170
1171 size = orig->end - orig->start + 1;
1172 if (size > root->fs_info->max_extent) {
1173 u64 num_extents;
1174 u64 new_size;
1175
1176 new_size = orig->end - split + 1;
1177 num_extents = div64_u64(size + root->fs_info->max_extent - 1,
1178 root->fs_info->max_extent);
1179
1180 /*
1181 * if we break a large extent up then leave delalloc_extents be,
1182 * since we've already accounted for the large extent.
1183 */
1184 if (div64_u64(new_size + root->fs_info->max_extent - 1,
1185 root->fs_info->max_extent) < num_extents)
1186 return 0;
1187 }
1188
1189 BTRFS_I(inode)->delalloc_extents++;
1190
1191 return 0;
1192}
1193
1194/*
1195 * extent_io.c merge_extent_hook, used to track merged delayed allocation
1196 * extents so we can keep track of new extents that are just merged onto old
1197 * extents, such as when we are doing sequential writes, so we can properly
1198 * account for the metadata space we'll need.
1199 */
1200static int btrfs_merge_extent_hook(struct inode *inode,
1201 struct extent_state *new,
1202 struct extent_state *other)
1203{
1204 struct btrfs_root *root = BTRFS_I(inode)->root;
1205 u64 new_size, old_size;
1206 u64 num_extents;
1207
1208 /* not delalloc, ignore it */
1209 if (!(other->state & EXTENT_DELALLOC))
1210 return 0;
1211
1212 old_size = other->end - other->start + 1;
1213 if (new->start < other->start)
1214 new_size = other->end - new->start + 1;
1215 else
1216 new_size = new->end - other->start + 1;
1217
1218 /* we're not bigger than the max, unreserve the space and go */
1219 if (new_size <= root->fs_info->max_extent) {
1220 BTRFS_I(inode)->delalloc_extents--;
1221 return 0;
1222 }
1223
1224 /*
1225 * If we grew by another max_extent, just return, we want to keep that
1226 * reserved amount.
1227 */
1228 num_extents = div64_u64(old_size + root->fs_info->max_extent - 1,
1229 root->fs_info->max_extent);
1230 if (div64_u64(new_size + root->fs_info->max_extent - 1,
1231 root->fs_info->max_extent) > num_extents)
1232 return 0;
1233
1234 BTRFS_I(inode)->delalloc_extents--;
1235
1236 return 0;
1237}
1238
1162/* 1239/*
1163 * extent_io.c set_bit_hook, used to track delayed allocation 1240 * extent_io.c set_bit_hook, used to track delayed allocation
1164 * bytes in this file, and to maintain the list of inodes that 1241 * bytes in this file, and to maintain the list of inodes that
@@ -1167,6 +1244,7 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1167static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, 1244static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
1168 unsigned long old, unsigned long bits) 1245 unsigned long old, unsigned long bits)
1169{ 1246{
1247
1170 /* 1248 /*
1171 * set_bit and clear bit hooks normally require _irqsave/restore 1249 * set_bit and clear bit hooks normally require _irqsave/restore
1172 * but in this case, we are only testeing for the DELALLOC 1250 * but in this case, we are only testeing for the DELALLOC
@@ -1174,6 +1252,8 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
1174 */ 1252 */
1175 if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { 1253 if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
1176 struct btrfs_root *root = BTRFS_I(inode)->root; 1254 struct btrfs_root *root = BTRFS_I(inode)->root;
1255
1256 BTRFS_I(inode)->delalloc_extents++;
1177 btrfs_delalloc_reserve_space(root, inode, end - start + 1); 1257 btrfs_delalloc_reserve_space(root, inode, end - start + 1);
1178 spin_lock(&root->fs_info->delalloc_lock); 1258 spin_lock(&root->fs_info->delalloc_lock);
1179 BTRFS_I(inode)->delalloc_bytes += end - start + 1; 1259 BTRFS_I(inode)->delalloc_bytes += end - start + 1;
@@ -1190,22 +1270,27 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
1190/* 1270/*
1191 * extent_io.c clear_bit_hook, see set_bit_hook for why 1271 * extent_io.c clear_bit_hook, see set_bit_hook for why
1192 */ 1272 */
1193static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end, 1273static int btrfs_clear_bit_hook(struct inode *inode,
1194 unsigned long old, unsigned long bits) 1274 struct extent_state *state, unsigned long bits)
1195{ 1275{
1196 /* 1276 /*
1197 * set_bit and clear bit hooks normally require _irqsave/restore 1277 * set_bit and clear bit hooks normally require _irqsave/restore
1198 * but in this case, we are only testeing for the DELALLOC 1278 * but in this case, we are only testeing for the DELALLOC
1199 * bit, which is only set or cleared with irqs on 1279 * bit, which is only set or cleared with irqs on
1200 */ 1280 */
1201 if ((old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { 1281 if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
1202 struct btrfs_root *root = BTRFS_I(inode)->root; 1282 struct btrfs_root *root = BTRFS_I(inode)->root;
1203 1283
1284 BTRFS_I(inode)->delalloc_extents--;
1285 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
1286
1204 spin_lock(&root->fs_info->delalloc_lock); 1287 spin_lock(&root->fs_info->delalloc_lock);
1205 if (end - start + 1 > root->fs_info->delalloc_bytes) { 1288 if (state->end - state->start + 1 >
1289 root->fs_info->delalloc_bytes) {
1206 printk(KERN_INFO "btrfs warning: delalloc account " 1290 printk(KERN_INFO "btrfs warning: delalloc account "
1207 "%llu %llu\n", 1291 "%llu %llu\n",
1208 (unsigned long long)end - start + 1, 1292 (unsigned long long)
1293 state->end - state->start + 1,
1209 (unsigned long long) 1294 (unsigned long long)
1210 root->fs_info->delalloc_bytes); 1295 root->fs_info->delalloc_bytes);
1211 btrfs_delalloc_free_space(root, inode, (u64)-1); 1296 btrfs_delalloc_free_space(root, inode, (u64)-1);
@@ -1213,9 +1298,12 @@ static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
1213 BTRFS_I(inode)->delalloc_bytes = 0; 1298 BTRFS_I(inode)->delalloc_bytes = 0;
1214 } else { 1299 } else {
1215 btrfs_delalloc_free_space(root, inode, 1300 btrfs_delalloc_free_space(root, inode,
1216 end - start + 1); 1301 state->end -
1217 root->fs_info->delalloc_bytes -= end - start + 1; 1302 state->start + 1);
1218 BTRFS_I(inode)->delalloc_bytes -= end - start + 1; 1303 root->fs_info->delalloc_bytes -= state->end -
1304 state->start + 1;
1305 BTRFS_I(inode)->delalloc_bytes -= state->end -
1306 state->start + 1;
1219 } 1307 }
1220 if (BTRFS_I(inode)->delalloc_bytes == 0 && 1308 if (BTRFS_I(inode)->delalloc_bytes == 0 &&
1221 !list_empty(&BTRFS_I(inode)->delalloc_inodes)) { 1309 !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
@@ -2950,7 +3038,12 @@ again:
2950 goto again; 3038 goto again;
2951 } 3039 }
2952 3040
2953 btrfs_set_extent_delalloc(inode, page_start, page_end); 3041 ret = btrfs_set_extent_delalloc(inode, page_start, page_end);
3042 if (ret) {
3043 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
3044 goto out_unlock;
3045 }
3046
2954 ret = 0; 3047 ret = 0;
2955 if (offset != PAGE_CACHE_SIZE) { 3048 if (offset != PAGE_CACHE_SIZE) {
2956 kaddr = kmap(page); 3049 kaddr = kmap(page);
@@ -2981,15 +3074,11 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
2981 u64 last_byte; 3074 u64 last_byte;
2982 u64 cur_offset; 3075 u64 cur_offset;
2983 u64 hole_size; 3076 u64 hole_size;
2984 int err; 3077 int err = 0;
2985 3078
2986 if (size <= hole_start) 3079 if (size <= hole_start)
2987 return 0; 3080 return 0;
2988 3081
2989 err = btrfs_check_metadata_free_space(root);
2990 if (err)
2991 return err;
2992
2993 btrfs_truncate_page(inode->i_mapping, inode->i_size); 3082 btrfs_truncate_page(inode->i_mapping, inode->i_size);
2994 3083
2995 while (1) { 3084 while (1) {
@@ -3024,12 +3113,18 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3024 cur_offset, &hint_byte, 1); 3113 cur_offset, &hint_byte, 1);
3025 if (err) 3114 if (err)
3026 break; 3115 break;
3116
3117 err = btrfs_reserve_metadata_space(root, 1);
3118 if (err)
3119 break;
3120
3027 err = btrfs_insert_file_extent(trans, root, 3121 err = btrfs_insert_file_extent(trans, root,
3028 inode->i_ino, cur_offset, 0, 3122 inode->i_ino, cur_offset, 0,
3029 0, hole_size, 0, hole_size, 3123 0, hole_size, 0, hole_size,
3030 0, 0, 0); 3124 0, 0, 0);
3031 btrfs_drop_extent_cache(inode, hole_start, 3125 btrfs_drop_extent_cache(inode, hole_start,
3032 last_byte - 1, 0); 3126 last_byte - 1, 0);
3127 btrfs_unreserve_metadata_space(root, 1);
3033 } 3128 }
3034 free_extent_map(em); 3129 free_extent_map(em);
3035 cur_offset = last_byte; 3130 cur_offset = last_byte;
@@ -3990,11 +4085,18 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
3990 if (!new_valid_dev(rdev)) 4085 if (!new_valid_dev(rdev))
3991 return -EINVAL; 4086 return -EINVAL;
3992 4087
3993 err = btrfs_check_metadata_free_space(root); 4088 /*
4089 * 2 for inode item and ref
4090 * 2 for dir items
4091 * 1 for xattr if selinux is on
4092 */
4093 err = btrfs_reserve_metadata_space(root, 5);
3994 if (err) 4094 if (err)
3995 goto fail; 4095 return err;
3996 4096
3997 trans = btrfs_start_transaction(root, 1); 4097 trans = btrfs_start_transaction(root, 1);
4098 if (!trans)
4099 goto fail;
3998 btrfs_set_trans_block_group(trans, dir); 4100 btrfs_set_trans_block_group(trans, dir);
3999 4101
4000 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); 4102 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
@@ -4032,6 +4134,7 @@ out_unlock:
4032 nr = trans->blocks_used; 4134 nr = trans->blocks_used;
4033 btrfs_end_transaction_throttle(trans, root); 4135 btrfs_end_transaction_throttle(trans, root);
4034fail: 4136fail:
4137 btrfs_unreserve_metadata_space(root, 5);
4035 if (drop_inode) { 4138 if (drop_inode) {
4036 inode_dec_link_count(inode); 4139 inode_dec_link_count(inode);
4037 iput(inode); 4140 iput(inode);
@@ -4052,10 +4155,18 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4052 u64 objectid; 4155 u64 objectid;
4053 u64 index = 0; 4156 u64 index = 0;
4054 4157
4055 err = btrfs_check_metadata_free_space(root); 4158 /*
4159 * 2 for inode item and ref
4160 * 2 for dir items
4161 * 1 for xattr if selinux is on
4162 */
4163 err = btrfs_reserve_metadata_space(root, 5);
4056 if (err) 4164 if (err)
4057 goto fail; 4165 return err;
4166
4058 trans = btrfs_start_transaction(root, 1); 4167 trans = btrfs_start_transaction(root, 1);
4168 if (!trans)
4169 goto fail;
4059 btrfs_set_trans_block_group(trans, dir); 4170 btrfs_set_trans_block_group(trans, dir);
4060 4171
4061 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); 4172 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
@@ -4096,6 +4207,7 @@ out_unlock:
4096 nr = trans->blocks_used; 4207 nr = trans->blocks_used;
4097 btrfs_end_transaction_throttle(trans, root); 4208 btrfs_end_transaction_throttle(trans, root);
4098fail: 4209fail:
4210 btrfs_unreserve_metadata_space(root, 5);
4099 if (drop_inode) { 4211 if (drop_inode) {
4100 inode_dec_link_count(inode); 4212 inode_dec_link_count(inode);
4101 iput(inode); 4213 iput(inode);
@@ -4118,10 +4230,16 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4118 if (inode->i_nlink == 0) 4230 if (inode->i_nlink == 0)
4119 return -ENOENT; 4231 return -ENOENT;
4120 4232
4121 btrfs_inc_nlink(inode); 4233 /*
4122 err = btrfs_check_metadata_free_space(root); 4234 * 1 item for inode ref
4235 * 2 items for dir items
4236 */
4237 err = btrfs_reserve_metadata_space(root, 3);
4123 if (err) 4238 if (err)
4124 goto fail; 4239 return err;
4240
4241 btrfs_inc_nlink(inode);
4242
4125 err = btrfs_set_inode_index(dir, &index); 4243 err = btrfs_set_inode_index(dir, &index);
4126 if (err) 4244 if (err)
4127 goto fail; 4245 goto fail;
@@ -4145,6 +4263,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4145 nr = trans->blocks_used; 4263 nr = trans->blocks_used;
4146 btrfs_end_transaction_throttle(trans, root); 4264 btrfs_end_transaction_throttle(trans, root);
4147fail: 4265fail:
4266 btrfs_unreserve_metadata_space(root, 3);
4148 if (drop_inode) { 4267 if (drop_inode) {
4149 inode_dec_link_count(inode); 4268 inode_dec_link_count(inode);
4150 iput(inode); 4269 iput(inode);
@@ -4164,17 +4283,21 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
4164 u64 index = 0; 4283 u64 index = 0;
4165 unsigned long nr = 1; 4284 unsigned long nr = 1;
4166 4285
4167 err = btrfs_check_metadata_free_space(root); 4286 /*
4287 * 2 items for inode and ref
4288 * 2 items for dir items
4289 * 1 for xattr if selinux is on
4290 */
4291 err = btrfs_reserve_metadata_space(root, 5);
4168 if (err) 4292 if (err)
4169 goto out_unlock; 4293 return err;
4170 4294
4171 trans = btrfs_start_transaction(root, 1); 4295 trans = btrfs_start_transaction(root, 1);
4172 btrfs_set_trans_block_group(trans, dir); 4296 if (!trans) {
4173 4297 err = -ENOMEM;
4174 if (IS_ERR(trans)) {
4175 err = PTR_ERR(trans);
4176 goto out_unlock; 4298 goto out_unlock;
4177 } 4299 }
4300 btrfs_set_trans_block_group(trans, dir);
4178 4301
4179 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); 4302 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
4180 if (err) { 4303 if (err) {
@@ -4223,6 +4346,7 @@ out_fail:
4223 btrfs_end_transaction_throttle(trans, root); 4346 btrfs_end_transaction_throttle(trans, root);
4224 4347
4225out_unlock: 4348out_unlock:
4349 btrfs_unreserve_metadata_space(root, 5);
4226 if (drop_on_err) 4350 if (drop_on_err)
4227 iput(inode); 4351 iput(inode);
4228 btrfs_btree_balance_dirty(root, nr); 4352 btrfs_btree_balance_dirty(root, nr);
@@ -4747,6 +4871,13 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
4747 goto out; 4871 goto out;
4748 } 4872 }
4749 4873
4874 ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
4875 if (ret) {
4876 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
4877 ret = VM_FAULT_SIGBUS;
4878 goto out;
4879 }
4880
4750 ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ 4881 ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
4751again: 4882again:
4752 lock_page(page); 4883 lock_page(page);
@@ -4778,7 +4909,23 @@ again:
4778 goto again; 4909 goto again;
4779 } 4910 }
4780 4911
4781 btrfs_set_extent_delalloc(inode, page_start, page_end); 4912 /*
4913 * XXX - page_mkwrite gets called every time the page is dirtied, even
4914 * if it was already dirty, so for space accounting reasons we need to
4915 * clear any delalloc bits for the range we are fixing to save. There
4916 * is probably a better way to do this, but for now keep consistent with
4917 * prepare_pages in the normal write path.
4918 */
4919 clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
4920 EXTENT_DIRTY | EXTENT_DELALLOC, GFP_NOFS);
4921
4922 ret = btrfs_set_extent_delalloc(inode, page_start, page_end);
4923 if (ret) {
4924 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
4925 ret = VM_FAULT_SIGBUS;
4926 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
4927 goto out_unlock;
4928 }
4782 ret = 0; 4929 ret = 0;
4783 4930
4784 /* page is wholly or partially inside EOF */ 4931 /* page is wholly or partially inside EOF */
@@ -4801,6 +4948,7 @@ again:
4801 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 4948 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
4802 4949
4803out_unlock: 4950out_unlock:
4951 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
4804 if (!ret) 4952 if (!ret)
4805 return VM_FAULT_LOCKED; 4953 return VM_FAULT_LOCKED;
4806 unlock_page(page); 4954 unlock_page(page);
@@ -4917,6 +5065,8 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
4917 return NULL; 5065 return NULL;
4918 ei->last_trans = 0; 5066 ei->last_trans = 0;
4919 ei->logged_trans = 0; 5067 ei->logged_trans = 0;
5068 ei->delalloc_extents = 0;
5069 ei->delalloc_reserved_extents = 0;
4920 btrfs_ordered_inode_tree_init(&ei->ordered_tree); 5070 btrfs_ordered_inode_tree_init(&ei->ordered_tree);
4921 INIT_LIST_HEAD(&ei->i_orphan); 5071 INIT_LIST_HEAD(&ei->i_orphan);
4922 INIT_LIST_HEAD(&ei->ordered_operations); 5072 INIT_LIST_HEAD(&ei->ordered_operations);
@@ -5070,7 +5220,12 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
5070 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) 5220 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
5071 return -ENOTEMPTY; 5221 return -ENOTEMPTY;
5072 5222
5073 ret = btrfs_check_metadata_free_space(root); 5223 /*
5224 * 2 items for dir items
5225 * 1 item for orphan entry
5226 * 1 item for ref
5227 */
5228 ret = btrfs_reserve_metadata_space(root, 4);
5074 if (ret) 5229 if (ret)
5075 return ret; 5230 return ret;
5076 5231
@@ -5185,6 +5340,8 @@ out_fail:
5185 5340
5186 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) 5341 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
5187 up_read(&root->fs_info->subvol_sem); 5342 up_read(&root->fs_info->subvol_sem);
5343
5344 btrfs_unreserve_metadata_space(root, 4);
5188 return ret; 5345 return ret;
5189} 5346}
5190 5347
@@ -5256,11 +5413,18 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
5256 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) 5413 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
5257 return -ENAMETOOLONG; 5414 return -ENAMETOOLONG;
5258 5415
5259 err = btrfs_check_metadata_free_space(root); 5416 /*
5417 * 2 items for inode item and ref
5418 * 2 items for dir items
5419 * 1 item for xattr if selinux is on
5420 */
5421 err = btrfs_reserve_metadata_space(root, 5);
5260 if (err) 5422 if (err)
5261 goto out_fail; 5423 return err;
5262 5424
5263 trans = btrfs_start_transaction(root, 1); 5425 trans = btrfs_start_transaction(root, 1);
5426 if (!trans)
5427 goto out_fail;
5264 btrfs_set_trans_block_group(trans, dir); 5428 btrfs_set_trans_block_group(trans, dir);
5265 5429
5266 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); 5430 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
@@ -5341,6 +5505,7 @@ out_unlock:
5341 nr = trans->blocks_used; 5505 nr = trans->blocks_used;
5342 btrfs_end_transaction_throttle(trans, root); 5506 btrfs_end_transaction_throttle(trans, root);
5343out_fail: 5507out_fail:
5508 btrfs_unreserve_metadata_space(root, 5);
5344 if (drop_inode) { 5509 if (drop_inode) {
5345 inode_dec_link_count(inode); 5510 inode_dec_link_count(inode);
5346 iput(inode); 5511 iput(inode);
@@ -5362,6 +5527,11 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans,
5362 5527
5363 while (num_bytes > 0) { 5528 while (num_bytes > 0) {
5364 alloc_size = min(num_bytes, root->fs_info->max_extent); 5529 alloc_size = min(num_bytes, root->fs_info->max_extent);
5530
5531 ret = btrfs_reserve_metadata_space(root, 1);
5532 if (ret)
5533 goto out;
5534
5365 ret = btrfs_reserve_extent(trans, root, alloc_size, 5535 ret = btrfs_reserve_extent(trans, root, alloc_size,
5366 root->sectorsize, 0, alloc_hint, 5536 root->sectorsize, 0, alloc_hint,
5367 (u64)-1, &ins, 1); 5537 (u64)-1, &ins, 1);
@@ -5381,6 +5551,7 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans,
5381 num_bytes -= ins.offset; 5551 num_bytes -= ins.offset;
5382 cur_offset += ins.offset; 5552 cur_offset += ins.offset;
5383 alloc_hint = ins.objectid + ins.offset; 5553 alloc_hint = ins.objectid + ins.offset;
5554 btrfs_unreserve_metadata_space(root, 1);
5384 } 5555 }
5385out: 5556out:
5386 if (cur_offset > start) { 5557 if (cur_offset > start) {
@@ -5544,7 +5715,7 @@ static const struct inode_operations btrfs_dir_ro_inode_operations = {
5544 .permission = btrfs_permission, 5715 .permission = btrfs_permission,
5545}; 5716};
5546 5717
5547static struct file_operations btrfs_dir_file_operations = { 5718static const struct file_operations btrfs_dir_file_operations = {
5548 .llseek = generic_file_llseek, 5719 .llseek = generic_file_llseek,
5549 .read = generic_read_dir, 5720 .read = generic_read_dir,
5550 .readdir = btrfs_real_readdir, 5721 .readdir = btrfs_real_readdir,
@@ -5566,6 +5737,8 @@ static struct extent_io_ops btrfs_extent_io_ops = {
5566 .readpage_io_failed_hook = btrfs_io_failed_hook, 5737 .readpage_io_failed_hook = btrfs_io_failed_hook,
5567 .set_bit_hook = btrfs_set_bit_hook, 5738 .set_bit_hook = btrfs_set_bit_hook,
5568 .clear_bit_hook = btrfs_clear_bit_hook, 5739 .clear_bit_hook = btrfs_clear_bit_hook,
5740 .merge_extent_hook = btrfs_merge_extent_hook,
5741 .split_extent_hook = btrfs_split_extent_hook,
5569}; 5742};
5570 5743
5571/* 5744/*
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index a8577a7f26ab..9a780c8d0ac8 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -239,7 +239,13 @@ static noinline int create_subvol(struct btrfs_root *root,
239 u64 index = 0; 239 u64 index = 0;
240 unsigned long nr = 1; 240 unsigned long nr = 1;
241 241
242 ret = btrfs_check_metadata_free_space(root); 242 /*
243 * 1 - inode item
244 * 2 - refs
245 * 1 - root item
246 * 2 - dir items
247 */
248 ret = btrfs_reserve_metadata_space(root, 6);
243 if (ret) 249 if (ret)
244 return ret; 250 return ret;
245 251
@@ -340,6 +346,9 @@ fail:
340 err = btrfs_commit_transaction(trans, root); 346 err = btrfs_commit_transaction(trans, root);
341 if (err && !ret) 347 if (err && !ret)
342 ret = err; 348 ret = err;
349
350 btrfs_unreserve_metadata_space(root, 6);
351 btrfs_btree_balance_dirty(root, nr);
343 return ret; 352 return ret;
344} 353}
345 354
@@ -355,19 +364,27 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
355 if (!root->ref_cows) 364 if (!root->ref_cows)
356 return -EINVAL; 365 return -EINVAL;
357 366
358 ret = btrfs_check_metadata_free_space(root); 367 /*
368 * 1 - inode item
369 * 2 - refs
370 * 1 - root item
371 * 2 - dir items
372 */
373 ret = btrfs_reserve_metadata_space(root, 6);
359 if (ret) 374 if (ret)
360 goto fail_unlock; 375 goto fail_unlock;
361 376
362 pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); 377 pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
363 if (!pending_snapshot) { 378 if (!pending_snapshot) {
364 ret = -ENOMEM; 379 ret = -ENOMEM;
380 btrfs_unreserve_metadata_space(root, 6);
365 goto fail_unlock; 381 goto fail_unlock;
366 } 382 }
367 pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS); 383 pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS);
368 if (!pending_snapshot->name) { 384 if (!pending_snapshot->name) {
369 ret = -ENOMEM; 385 ret = -ENOMEM;
370 kfree(pending_snapshot); 386 kfree(pending_snapshot);
387 btrfs_unreserve_metadata_space(root, 6);
371 goto fail_unlock; 388 goto fail_unlock;
372 } 389 }
373 memcpy(pending_snapshot->name, name, namelen); 390 memcpy(pending_snapshot->name, name, namelen);
@@ -1215,15 +1232,15 @@ static long btrfs_ioctl_trans_start(struct file *file)
1215 struct inode *inode = fdentry(file)->d_inode; 1232 struct inode *inode = fdentry(file)->d_inode;
1216 struct btrfs_root *root = BTRFS_I(inode)->root; 1233 struct btrfs_root *root = BTRFS_I(inode)->root;
1217 struct btrfs_trans_handle *trans; 1234 struct btrfs_trans_handle *trans;
1218 int ret = 0; 1235 int ret;
1219 1236
1237 ret = -EPERM;
1220 if (!capable(CAP_SYS_ADMIN)) 1238 if (!capable(CAP_SYS_ADMIN))
1221 return -EPERM; 1239 goto out;
1222 1240
1223 if (file->private_data) { 1241 ret = -EINPROGRESS;
1224 ret = -EINPROGRESS; 1242 if (file->private_data)
1225 goto out; 1243 goto out;
1226 }
1227 1244
1228 ret = mnt_want_write(file->f_path.mnt); 1245 ret = mnt_want_write(file->f_path.mnt);
1229 if (ret) 1246 if (ret)
@@ -1233,12 +1250,19 @@ static long btrfs_ioctl_trans_start(struct file *file)
1233 root->fs_info->open_ioctl_trans++; 1250 root->fs_info->open_ioctl_trans++;
1234 mutex_unlock(&root->fs_info->trans_mutex); 1251 mutex_unlock(&root->fs_info->trans_mutex);
1235 1252
1253 ret = -ENOMEM;
1236 trans = btrfs_start_ioctl_transaction(root, 0); 1254 trans = btrfs_start_ioctl_transaction(root, 0);
1237 if (trans) 1255 if (!trans)
1238 file->private_data = trans; 1256 goto out_drop;
1239 else 1257
1240 ret = -ENOMEM; 1258 file->private_data = trans;
1241 /*printk(KERN_INFO "btrfs_ioctl_trans_start on %p\n", file);*/ 1259 return 0;
1260
1261out_drop:
1262 mutex_lock(&root->fs_info->trans_mutex);
1263 root->fs_info->open_ioctl_trans--;
1264 mutex_unlock(&root->fs_info->trans_mutex);
1265 mnt_drop_write(file->f_path.mnt);
1242out: 1266out:
1243 return ret; 1267 return ret;
1244} 1268}
@@ -1254,24 +1278,20 @@ long btrfs_ioctl_trans_end(struct file *file)
1254 struct inode *inode = fdentry(file)->d_inode; 1278 struct inode *inode = fdentry(file)->d_inode;
1255 struct btrfs_root *root = BTRFS_I(inode)->root; 1279 struct btrfs_root *root = BTRFS_I(inode)->root;
1256 struct btrfs_trans_handle *trans; 1280 struct btrfs_trans_handle *trans;
1257 int ret = 0;
1258 1281
1259 trans = file->private_data; 1282 trans = file->private_data;
1260 if (!trans) { 1283 if (!trans)
1261 ret = -EINVAL; 1284 return -EINVAL;
1262 goto out;
1263 }
1264 btrfs_end_transaction(trans, root);
1265 file->private_data = NULL; 1285 file->private_data = NULL;
1266 1286
1287 btrfs_end_transaction(trans, root);
1288
1267 mutex_lock(&root->fs_info->trans_mutex); 1289 mutex_lock(&root->fs_info->trans_mutex);
1268 root->fs_info->open_ioctl_trans--; 1290 root->fs_info->open_ioctl_trans--;
1269 mutex_unlock(&root->fs_info->trans_mutex); 1291 mutex_unlock(&root->fs_info->trans_mutex);
1270 1292
1271 mnt_drop_write(file->f_path.mnt); 1293 mnt_drop_write(file->f_path.mnt);
1272 1294 return 0;
1273out:
1274 return ret;
1275} 1295}
1276 1296
1277long btrfs_ioctl(struct file *file, unsigned int 1297long btrfs_ioctl(struct file *file, unsigned int
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index b5d6d24726b0..897fba835f89 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -458,7 +458,7 @@ void btrfs_start_ordered_extent(struct inode *inode,
458 * start IO on any dirty ones so the wait doesn't stall waiting 458 * start IO on any dirty ones so the wait doesn't stall waiting
459 * for pdflush to find them 459 * for pdflush to find them
460 */ 460 */
461 btrfs_fdatawrite_range(inode->i_mapping, start, end, WB_SYNC_ALL); 461 filemap_fdatawrite_range(inode->i_mapping, start, end);
462 if (wait) { 462 if (wait) {
463 wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, 463 wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
464 &entry->flags)); 464 &entry->flags));
@@ -488,17 +488,15 @@ again:
488 /* start IO across the range first to instantiate any delalloc 488 /* start IO across the range first to instantiate any delalloc
489 * extents 489 * extents
490 */ 490 */
491 btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL); 491 filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
492 492
493 /* The compression code will leave pages locked but return from 493 /* The compression code will leave pages locked but return from
494 * writepage without setting the page writeback. Starting again 494 * writepage without setting the page writeback. Starting again
495 * with WB_SYNC_ALL will end up waiting for the IO to actually start. 495 * with WB_SYNC_ALL will end up waiting for the IO to actually start.
496 */ 496 */
497 btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL); 497 filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
498 498
499 btrfs_wait_on_page_writeback_range(inode->i_mapping, 499 filemap_fdatawait_range(inode->i_mapping, start, orig_end);
500 start >> PAGE_CACHE_SHIFT,
501 orig_end >> PAGE_CACHE_SHIFT);
502 500
503 end = orig_end; 501 end = orig_end;
504 found = 0; 502 found = 0;
@@ -716,89 +714,6 @@ out:
716} 714}
717 715
718 716
719/**
720 * taken from mm/filemap.c because it isn't exported
721 *
722 * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
723 * @mapping: address space structure to write
724 * @start: offset in bytes where the range starts
725 * @end: offset in bytes where the range ends (inclusive)
726 * @sync_mode: enable synchronous operation
727 *
728 * Start writeback against all of a mapping's dirty pages that lie
729 * within the byte offsets <start, end> inclusive.
730 *
731 * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
732 * opposed to a regular memory cleansing writeback. The difference between
733 * these two operations is that if a dirty page/buffer is encountered, it must
734 * be waited upon, and not just skipped over.
735 */
736int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
737 loff_t end, int sync_mode)
738{
739 struct writeback_control wbc = {
740 .sync_mode = sync_mode,
741 .nr_to_write = mapping->nrpages * 2,
742 .range_start = start,
743 .range_end = end,
744 };
745 return btrfs_writepages(mapping, &wbc);
746}
747
748/**
749 * taken from mm/filemap.c because it isn't exported
750 *
751 * wait_on_page_writeback_range - wait for writeback to complete
752 * @mapping: target address_space
753 * @start: beginning page index
754 * @end: ending page index
755 *
756 * Wait for writeback to complete against pages indexed by start->end
757 * inclusive
758 */
759int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
760 pgoff_t start, pgoff_t end)
761{
762 struct pagevec pvec;
763 int nr_pages;
764 int ret = 0;
765 pgoff_t index;
766
767 if (end < start)
768 return 0;
769
770 pagevec_init(&pvec, 0);
771 index = start;
772 while ((index <= end) &&
773 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
774 PAGECACHE_TAG_WRITEBACK,
775 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) {
776 unsigned i;
777
778 for (i = 0; i < nr_pages; i++) {
779 struct page *page = pvec.pages[i];
780
781 /* until radix tree lookup accepts end_index */
782 if (page->index > end)
783 continue;
784
785 wait_on_page_writeback(page);
786 if (PageError(page))
787 ret = -EIO;
788 }
789 pagevec_release(&pvec);
790 cond_resched();
791 }
792
793 /* Check for outstanding write errors */
794 if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
795 ret = -ENOSPC;
796 if (test_and_clear_bit(AS_EIO, &mapping->flags))
797 ret = -EIO;
798
799 return ret;
800}
801
802/* 717/*
803 * add a given inode to the list of inodes that must be fully on 718 * add a given inode to the list of inodes that must be fully on
804 * disk before a transaction commit finishes. 719 * disk before a transaction commit finishes.
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 993a7ea45c70..f82e87488ca8 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -153,10 +153,6 @@ btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
153int btrfs_ordered_update_i_size(struct inode *inode, 153int btrfs_ordered_update_i_size(struct inode *inode,
154 struct btrfs_ordered_extent *ordered); 154 struct btrfs_ordered_extent *ordered);
155int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); 155int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
156int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
157 pgoff_t start, pgoff_t end);
158int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
159 loff_t end, int sync_mode);
160int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only); 156int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only);
161int btrfs_run_ordered_operations(struct btrfs_root *root, int wait); 157int btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
162int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, 158int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 67035385444c..9de9b2236419 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -344,7 +344,9 @@ static int btrfs_fill_super(struct super_block *sb,
344 sb->s_export_op = &btrfs_export_ops; 344 sb->s_export_op = &btrfs_export_ops;
345 sb->s_xattr = btrfs_xattr_handlers; 345 sb->s_xattr = btrfs_xattr_handlers;
346 sb->s_time_gran = 1; 346 sb->s_time_gran = 1;
347#ifdef CONFIG_BTRFS_POSIX_ACL
347 sb->s_flags |= MS_POSIXACL; 348 sb->s_flags |= MS_POSIXACL;
349#endif
348 350
349 tree_root = open_ctree(sb, fs_devices, (char *)data); 351 tree_root = open_ctree(sb, fs_devices, (char *)data);
350 352
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 88f866f85e7a..0b8f36d4400a 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -186,6 +186,9 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
186 h->alloc_exclude_start = 0; 186 h->alloc_exclude_start = 0;
187 h->delayed_ref_updates = 0; 187 h->delayed_ref_updates = 0;
188 188
189 if (!current->journal_info)
190 current->journal_info = h;
191
189 root->fs_info->running_transaction->use_count++; 192 root->fs_info->running_transaction->use_count++;
190 record_root_in_trans(h, root); 193 record_root_in_trans(h, root);
191 mutex_unlock(&root->fs_info->trans_mutex); 194 mutex_unlock(&root->fs_info->trans_mutex);
@@ -317,6 +320,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
317 wake_up(&cur_trans->writer_wait); 320 wake_up(&cur_trans->writer_wait);
318 put_transaction(cur_trans); 321 put_transaction(cur_trans);
319 mutex_unlock(&info->trans_mutex); 322 mutex_unlock(&info->trans_mutex);
323
324 if (current->journal_info == trans)
325 current->journal_info = NULL;
320 memset(trans, 0, sizeof(*trans)); 326 memset(trans, 0, sizeof(*trans));
321 kmem_cache_free(btrfs_trans_handle_cachep, trans); 327 kmem_cache_free(btrfs_trans_handle_cachep, trans);
322 328
@@ -743,6 +749,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
743 memcpy(&pending->root_key, &key, sizeof(key)); 749 memcpy(&pending->root_key, &key, sizeof(key));
744fail: 750fail:
745 kfree(new_root_item); 751 kfree(new_root_item);
752 btrfs_unreserve_metadata_space(root, 6);
746 return ret; 753 return ret;
747} 754}
748 755
@@ -1059,6 +1066,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1059 1066
1060 mutex_unlock(&root->fs_info->trans_mutex); 1067 mutex_unlock(&root->fs_info->trans_mutex);
1061 1068
1069 if (current->journal_info == trans)
1070 current->journal_info = NULL;
1071
1062 kmem_cache_free(btrfs_trans_handle_cachep, trans); 1072 kmem_cache_free(btrfs_trans_handle_cachep, trans);
1063 return ret; 1073 return ret;
1064} 1074}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 23e7d36ff325..7eda483d7b5a 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -446,8 +446,10 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
446 goto error; 446 goto error;
447 447
448 device->name = kstrdup(orig_dev->name, GFP_NOFS); 448 device->name = kstrdup(orig_dev->name, GFP_NOFS);
449 if (!device->name) 449 if (!device->name) {
450 kfree(device);
450 goto error; 451 goto error;
452 }
451 453
452 device->devid = orig_dev->devid; 454 device->devid = orig_dev->devid;
453 device->work.func = pending_bios_fn; 455 device->work.func = pending_bios_fn;
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index a9d3bf4d2689..b0fc93f95fd0 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -260,7 +260,7 @@ err:
260 * attributes are handled directly. 260 * attributes are handled directly.
261 */ 261 */
262struct xattr_handler *btrfs_xattr_handlers[] = { 262struct xattr_handler *btrfs_xattr_handlers[] = {
263#ifdef CONFIG_FS_POSIX_ACL 263#ifdef CONFIG_BTRFS_POSIX_ACL
264 &btrfs_xattr_acl_access_handler, 264 &btrfs_xattr_acl_access_handler,
265 &btrfs_xattr_acl_default_handler, 265 &btrfs_xattr_acl_default_handler,
266#endif 266#endif
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index 0376ac66c44a..be4392ca2098 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -22,6 +22,7 @@
22#include <linux/kernel.h> 22#include <linux/kernel.h>
23#include <linux/major.h> 23#include <linux/major.h>
24#include <linux/time.h> 24#include <linux/time.h>
25#include <linux/sched.h>
25#include <linux/slab.h> 26#include <linux/slab.h>
26#include <linux/ioport.h> 27#include <linux/ioport.h>
27#include <linux/fcntl.h> 28#include <linux/fcntl.h>
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index d5c0ea2e8f2d..9f2d45d75b1a 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -26,20 +26,6 @@ config EXT4_FS
26 26
27 If unsure, say N. 27 If unsure, say N.
28 28
29config EXT4DEV_COMPAT
30 bool "Enable ext4dev compatibility"
31 depends on EXT4_FS
32 help
33 Starting with 2.6.28, the name of the ext4 filesystem was
34 renamed from ext4dev to ext4. Unfortunately there are some
35 legacy userspace programs (such as klibc's fstype) have
36 "ext4dev" hardcoded.
37
38 To enable backwards compatibility so that systems that are
39 still expecting to mount ext4 filesystems using ext4dev,
40 choose Y here. This feature will go away by 2.6.31, so
41 please arrange to get your userspace programs fixed!
42
43config EXT4_FS_XATTR 29config EXT4_FS_XATTR
44 bool "Ext4 extended attributes" 30 bool "Ext4 extended attributes"
45 depends on EXT4_FS 31 depends on EXT4_FS
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index e227eea23f05..984ca0cb38c3 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -65,6 +65,12 @@ typedef __u32 ext4_lblk_t;
65/* data type for block group number */ 65/* data type for block group number */
66typedef unsigned int ext4_group_t; 66typedef unsigned int ext4_group_t;
67 67
68/*
69 * Flags used in mballoc's allocation_context flags field.
70 *
71 * Also used to show what's going on for debugging purposes when the
72 * flag field is exported via the traceport interface
73 */
68 74
69/* prefer goal again. length */ 75/* prefer goal again. length */
70#define EXT4_MB_HINT_MERGE 0x0001 76#define EXT4_MB_HINT_MERGE 0x0001
@@ -127,6 +133,16 @@ struct mpage_da_data {
127 int pages_written; 133 int pages_written;
128 int retval; 134 int retval;
129}; 135};
136#define DIO_AIO_UNWRITTEN 0x1
137typedef struct ext4_io_end {
138 struct list_head list; /* per-file finished AIO list */
139 struct inode *inode; /* file being written to */
140 unsigned int flag; /* unwritten or not */
141 int error; /* I/O error code */
142 ext4_lblk_t offset; /* offset in the file */
143 size_t size; /* size of the extent */
144 struct work_struct work; /* data work queue */
145} ext4_io_end_t;
130 146
131/* 147/*
132 * Special inodes numbers 148 * Special inodes numbers
@@ -347,7 +363,16 @@ struct ext4_new_group_data {
347 /* Call ext4_da_update_reserve_space() after successfully 363 /* Call ext4_da_update_reserve_space() after successfully
348 allocating the blocks */ 364 allocating the blocks */
349#define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE 0x0008 365#define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE 0x0008
350 366 /* caller is from the direct IO path, request to creation of an
367 unitialized extents if not allocated, split the uninitialized
368 extent if blocks has been preallocated already*/
369#define EXT4_GET_BLOCKS_DIO 0x0010
370#define EXT4_GET_BLOCKS_CONVERT 0x0020
371#define EXT4_GET_BLOCKS_DIO_CREATE_EXT (EXT4_GET_BLOCKS_DIO|\
372 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
373 /* Convert extent to initialized after direct IO complete */
374#define EXT4_GET_BLOCKS_DIO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\
375 EXT4_GET_BLOCKS_DIO_CREATE_EXT)
351 376
352/* 377/*
353 * ioctl commands 378 * ioctl commands
@@ -500,8 +525,8 @@ struct move_extent {
500static inline __le32 ext4_encode_extra_time(struct timespec *time) 525static inline __le32 ext4_encode_extra_time(struct timespec *time)
501{ 526{
502 return cpu_to_le32((sizeof(time->tv_sec) > 4 ? 527 return cpu_to_le32((sizeof(time->tv_sec) > 4 ?
503 time->tv_sec >> 32 : 0) | 528 (time->tv_sec >> 32) & EXT4_EPOCH_MASK : 0) |
504 ((time->tv_nsec << 2) & EXT4_NSEC_MASK)); 529 ((time->tv_nsec << EXT4_EPOCH_BITS) & EXT4_NSEC_MASK));
505} 530}
506 531
507static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra) 532static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra)
@@ -509,7 +534,7 @@ static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra)
509 if (sizeof(time->tv_sec) > 4) 534 if (sizeof(time->tv_sec) > 4)
510 time->tv_sec |= (__u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) 535 time->tv_sec |= (__u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK)
511 << 32; 536 << 32;
512 time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> 2; 537 time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS;
513} 538}
514 539
515#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \ 540#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \
@@ -672,6 +697,11 @@ struct ext4_inode_info {
672 __u16 i_extra_isize; 697 __u16 i_extra_isize;
673 698
674 spinlock_t i_block_reservation_lock; 699 spinlock_t i_block_reservation_lock;
700
701 /* completed async DIOs that might need unwritten extents handling */
702 struct list_head i_aio_dio_complete_list;
703 /* current io_end structure for async DIO write*/
704 ext4_io_end_t *cur_aio_dio;
675}; 705};
676 706
677/* 707/*
@@ -942,18 +972,11 @@ struct ext4_sb_info {
942 unsigned int s_mb_stats; 972 unsigned int s_mb_stats;
943 unsigned int s_mb_order2_reqs; 973 unsigned int s_mb_order2_reqs;
944 unsigned int s_mb_group_prealloc; 974 unsigned int s_mb_group_prealloc;
975 unsigned int s_max_writeback_mb_bump;
945 /* where last allocation was done - for stream allocation */ 976 /* where last allocation was done - for stream allocation */
946 unsigned long s_mb_last_group; 977 unsigned long s_mb_last_group;
947 unsigned long s_mb_last_start; 978 unsigned long s_mb_last_start;
948 979
949 /* history to debug policy */
950 struct ext4_mb_history *s_mb_history;
951 int s_mb_history_cur;
952 int s_mb_history_max;
953 int s_mb_history_num;
954 spinlock_t s_mb_history_lock;
955 int s_mb_history_filter;
956
957 /* stats for buddy allocator */ 980 /* stats for buddy allocator */
958 spinlock_t s_mb_pa_lock; 981 spinlock_t s_mb_pa_lock;
959 atomic_t s_bal_reqs; /* number of reqs with len > 1 */ 982 atomic_t s_bal_reqs; /* number of reqs with len > 1 */
@@ -980,6 +1003,9 @@ struct ext4_sb_info {
980 1003
981 unsigned int s_log_groups_per_flex; 1004 unsigned int s_log_groups_per_flex;
982 struct flex_groups *s_flex_groups; 1005 struct flex_groups *s_flex_groups;
1006
1007 /* workqueue for dio unwritten */
1008 struct workqueue_struct *dio_unwritten_wq;
983}; 1009};
984 1010
985static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) 1011static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1397,7 +1423,7 @@ extern int ext4_block_truncate_page(handle_t *handle,
1397 struct address_space *mapping, loff_t from); 1423 struct address_space *mapping, loff_t from);
1398extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); 1424extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
1399extern qsize_t ext4_get_reserved_space(struct inode *inode); 1425extern qsize_t ext4_get_reserved_space(struct inode *inode);
1400 1426extern int flush_aio_dio_completed_IO(struct inode *inode);
1401/* ioctl.c */ 1427/* ioctl.c */
1402extern long ext4_ioctl(struct file *, unsigned int, unsigned long); 1428extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
1403extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); 1429extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
@@ -1699,6 +1725,8 @@ extern void ext4_ext_init(struct super_block *);
1699extern void ext4_ext_release(struct super_block *); 1725extern void ext4_ext_release(struct super_block *);
1700extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, 1726extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
1701 loff_t len); 1727 loff_t len);
1728extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
1729 loff_t len);
1702extern int ext4_get_blocks(handle_t *handle, struct inode *inode, 1730extern int ext4_get_blocks(handle_t *handle, struct inode *inode,
1703 sector_t block, unsigned int max_blocks, 1731 sector_t block, unsigned int max_blocks,
1704 struct buffer_head *bh, int flags); 1732 struct buffer_head *bh, int flags);
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 61652f1d15e6..2ca686454e87 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -220,6 +220,11 @@ static inline int ext4_ext_get_actual_len(struct ext4_extent *ext)
220 (le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN)); 220 (le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN));
221} 221}
222 222
223static inline void ext4_ext_mark_initialized(struct ext4_extent *ext)
224{
225 ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext));
226}
227
223extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks); 228extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks);
224extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex); 229extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex);
225extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *); 230extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
@@ -235,7 +240,7 @@ extern int ext4_ext_try_to_merge(struct inode *inode,
235 struct ext4_ext_path *path, 240 struct ext4_ext_path *path,
236 struct ext4_extent *); 241 struct ext4_extent *);
237extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *); 242extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *);
238extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *); 243extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *, int);
239extern int ext4_ext_walk_space(struct inode *, ext4_lblk_t, ext4_lblk_t, 244extern int ext4_ext_walk_space(struct inode *, ext4_lblk_t, ext4_lblk_t,
240 ext_prepare_callback, void *); 245 ext_prepare_callback, void *);
241extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, 246extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 139fb8cb87e4..a2865980342f 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -161,11 +161,13 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
161handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks); 161handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks);
162int __ext4_journal_stop(const char *where, handle_t *handle); 162int __ext4_journal_stop(const char *where, handle_t *handle);
163 163
164#define EXT4_NOJOURNAL_HANDLE ((handle_t *) 0x1) 164#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096)
165 165
166/* Note: Do not use this for NULL handles. This is only to determine if
167 * a properly allocated handle is using a journal or not. */
166static inline int ext4_handle_valid(handle_t *handle) 168static inline int ext4_handle_valid(handle_t *handle)
167{ 169{
168 if (handle == EXT4_NOJOURNAL_HANDLE) 170 if ((unsigned long)handle < EXT4_NOJOURNAL_MAX_REF_COUNT)
169 return 0; 171 return 0;
170 return 1; 172 return 1;
171} 173}
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 7a3832577923..10539e364283 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -723,7 +723,7 @@ err:
723 * insert new index [@logical;@ptr] into the block at @curp; 723 * insert new index [@logical;@ptr] into the block at @curp;
724 * check where to insert: before @curp or after @curp 724 * check where to insert: before @curp or after @curp
725 */ 725 */
726static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, 726int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
727 struct ext4_ext_path *curp, 727 struct ext4_ext_path *curp,
728 int logical, ext4_fsblk_t ptr) 728 int logical, ext4_fsblk_t ptr)
729{ 729{
@@ -1586,7 +1586,7 @@ out:
1586 */ 1586 */
1587int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, 1587int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1588 struct ext4_ext_path *path, 1588 struct ext4_ext_path *path,
1589 struct ext4_extent *newext) 1589 struct ext4_extent *newext, int flag)
1590{ 1590{
1591 struct ext4_extent_header *eh; 1591 struct ext4_extent_header *eh;
1592 struct ext4_extent *ex, *fex; 1592 struct ext4_extent *ex, *fex;
@@ -1602,7 +1602,8 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1602 BUG_ON(path[depth].p_hdr == NULL); 1602 BUG_ON(path[depth].p_hdr == NULL);
1603 1603
1604 /* try to insert block into found extent and return */ 1604 /* try to insert block into found extent and return */
1605 if (ex && ext4_can_extents_be_merged(inode, ex, newext)) { 1605 if (ex && (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT)
1606 && ext4_can_extents_be_merged(inode, ex, newext)) {
1606 ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n", 1607 ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
1607 ext4_ext_is_uninitialized(newext), 1608 ext4_ext_is_uninitialized(newext),
1608 ext4_ext_get_actual_len(newext), 1609 ext4_ext_get_actual_len(newext),
@@ -1722,7 +1723,8 @@ has_space:
1722 1723
1723merge: 1724merge:
1724 /* try to merge extents to the right */ 1725 /* try to merge extents to the right */
1725 ext4_ext_try_to_merge(inode, path, nearex); 1726 if (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT)
1727 ext4_ext_try_to_merge(inode, path, nearex);
1726 1728
1727 /* try to merge extents to the left */ 1729 /* try to merge extents to the left */
1728 1730
@@ -2378,6 +2380,7 @@ void ext4_ext_init(struct super_block *sb)
2378 */ 2380 */
2379 2381
2380 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { 2382 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
2383#if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS)
2381 printk(KERN_INFO "EXT4-fs: file extents enabled"); 2384 printk(KERN_INFO "EXT4-fs: file extents enabled");
2382#ifdef AGGRESSIVE_TEST 2385#ifdef AGGRESSIVE_TEST
2383 printk(", aggressive tests"); 2386 printk(", aggressive tests");
@@ -2389,6 +2392,7 @@ void ext4_ext_init(struct super_block *sb)
2389 printk(", stats"); 2392 printk(", stats");
2390#endif 2393#endif
2391 printk("\n"); 2394 printk("\n");
2395#endif
2392#ifdef EXTENTS_STATS 2396#ifdef EXTENTS_STATS
2393 spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock); 2397 spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock);
2394 EXT4_SB(sb)->s_ext_min = 1 << 30; 2398 EXT4_SB(sb)->s_ext_min = 1 << 30;
@@ -2490,7 +2494,6 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2490} 2494}
2491 2495
2492#define EXT4_EXT_ZERO_LEN 7 2496#define EXT4_EXT_ZERO_LEN 7
2493
2494/* 2497/*
2495 * This function is called by ext4_ext_get_blocks() if someone tries to write 2498 * This function is called by ext4_ext_get_blocks() if someone tries to write
2496 * to an uninitialized extent. It may result in splitting the uninitialized 2499 * to an uninitialized extent. It may result in splitting the uninitialized
@@ -2583,7 +2586,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2583 ex3->ee_block = cpu_to_le32(iblock); 2586 ex3->ee_block = cpu_to_le32(iblock);
2584 ext4_ext_store_pblock(ex3, newblock); 2587 ext4_ext_store_pblock(ex3, newblock);
2585 ex3->ee_len = cpu_to_le16(allocated); 2588 ex3->ee_len = cpu_to_le16(allocated);
2586 err = ext4_ext_insert_extent(handle, inode, path, ex3); 2589 err = ext4_ext_insert_extent(handle, inode, path,
2590 ex3, 0);
2587 if (err == -ENOSPC) { 2591 if (err == -ENOSPC) {
2588 err = ext4_ext_zeroout(inode, &orig_ex); 2592 err = ext4_ext_zeroout(inode, &orig_ex);
2589 if (err) 2593 if (err)
@@ -2639,7 +2643,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2639 ext4_ext_store_pblock(ex3, newblock + max_blocks); 2643 ext4_ext_store_pblock(ex3, newblock + max_blocks);
2640 ex3->ee_len = cpu_to_le16(allocated - max_blocks); 2644 ex3->ee_len = cpu_to_le16(allocated - max_blocks);
2641 ext4_ext_mark_uninitialized(ex3); 2645 ext4_ext_mark_uninitialized(ex3);
2642 err = ext4_ext_insert_extent(handle, inode, path, ex3); 2646 err = ext4_ext_insert_extent(handle, inode, path, ex3, 0);
2643 if (err == -ENOSPC) { 2647 if (err == -ENOSPC) {
2644 err = ext4_ext_zeroout(inode, &orig_ex); 2648 err = ext4_ext_zeroout(inode, &orig_ex);
2645 if (err) 2649 if (err)
@@ -2757,7 +2761,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2757 err = ext4_ext_dirty(handle, inode, path + depth); 2761 err = ext4_ext_dirty(handle, inode, path + depth);
2758 goto out; 2762 goto out;
2759insert: 2763insert:
2760 err = ext4_ext_insert_extent(handle, inode, path, &newex); 2764 err = ext4_ext_insert_extent(handle, inode, path, &newex, 0);
2761 if (err == -ENOSPC) { 2765 if (err == -ENOSPC) {
2762 err = ext4_ext_zeroout(inode, &orig_ex); 2766 err = ext4_ext_zeroout(inode, &orig_ex);
2763 if (err) 2767 if (err)
@@ -2785,6 +2789,324 @@ fix_extent_len:
2785} 2789}
2786 2790
2787/* 2791/*
2792 * This function is called by ext4_ext_get_blocks() from
2793 * ext4_get_blocks_dio_write() when DIO to write
2794 * to an uninitialized extent.
2795 *
2796 * Writing to an uninitized extent may result in splitting the uninitialized
2797 * extent into multiple /intialized unintialized extents (up to three)
2798 * There are three possibilities:
2799 * a> There is no split required: Entire extent should be uninitialized
2800 * b> Splits in two extents: Write is happening at either end of the extent
2801 * c> Splits in three extents: Somone is writing in middle of the extent
2802 *
2803 * One of more index blocks maybe needed if the extent tree grow after
2804 * the unintialized extent split. To prevent ENOSPC occur at the IO
2805 * complete, we need to split the uninitialized extent before DIO submit
2806 * the IO. The uninitilized extent called at this time will be split
2807 * into three uninitialized extent(at most). After IO complete, the part
2808 * being filled will be convert to initialized by the end_io callback function
2809 * via ext4_convert_unwritten_extents().
2810 */
2811static int ext4_split_unwritten_extents(handle_t *handle,
2812 struct inode *inode,
2813 struct ext4_ext_path *path,
2814 ext4_lblk_t iblock,
2815 unsigned int max_blocks,
2816 int flags)
2817{
2818 struct ext4_extent *ex, newex, orig_ex;
2819 struct ext4_extent *ex1 = NULL;
2820 struct ext4_extent *ex2 = NULL;
2821 struct ext4_extent *ex3 = NULL;
2822 struct ext4_extent_header *eh;
2823 ext4_lblk_t ee_block;
2824 unsigned int allocated, ee_len, depth;
2825 ext4_fsblk_t newblock;
2826 int err = 0;
2827 int ret = 0;
2828
2829 ext_debug("ext4_split_unwritten_extents: inode %lu,"
2830 "iblock %llu, max_blocks %u\n", inode->i_ino,
2831 (unsigned long long)iblock, max_blocks);
2832 depth = ext_depth(inode);
2833 eh = path[depth].p_hdr;
2834 ex = path[depth].p_ext;
2835 ee_block = le32_to_cpu(ex->ee_block);
2836 ee_len = ext4_ext_get_actual_len(ex);
2837 allocated = ee_len - (iblock - ee_block);
2838 newblock = iblock - ee_block + ext_pblock(ex);
2839 ex2 = ex;
2840 orig_ex.ee_block = ex->ee_block;
2841 orig_ex.ee_len = cpu_to_le16(ee_len);
2842 ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
2843
2844 /*
2845 * if the entire unintialized extent length less than
2846 * the size of extent to write, there is no need to split
2847 * uninitialized extent
2848 */
2849 if (allocated <= max_blocks)
2850 return ret;
2851
2852 err = ext4_ext_get_access(handle, inode, path + depth);
2853 if (err)
2854 goto out;
2855 /* ex1: ee_block to iblock - 1 : uninitialized */
2856 if (iblock > ee_block) {
2857 ex1 = ex;
2858 ex1->ee_len = cpu_to_le16(iblock - ee_block);
2859 ext4_ext_mark_uninitialized(ex1);
2860 ex2 = &newex;
2861 }
2862 /*
2863 * for sanity, update the length of the ex2 extent before
2864 * we insert ex3, if ex1 is NULL. This is to avoid temporary
2865 * overlap of blocks.
2866 */
2867 if (!ex1 && allocated > max_blocks)
2868 ex2->ee_len = cpu_to_le16(max_blocks);
2869 /* ex3: to ee_block + ee_len : uninitialised */
2870 if (allocated > max_blocks) {
2871 unsigned int newdepth;
2872 ex3 = &newex;
2873 ex3->ee_block = cpu_to_le32(iblock + max_blocks);
2874 ext4_ext_store_pblock(ex3, newblock + max_blocks);
2875 ex3->ee_len = cpu_to_le16(allocated - max_blocks);
2876 ext4_ext_mark_uninitialized(ex3);
2877 err = ext4_ext_insert_extent(handle, inode, path, ex3, flags);
2878 if (err == -ENOSPC) {
2879 err = ext4_ext_zeroout(inode, &orig_ex);
2880 if (err)
2881 goto fix_extent_len;
2882 /* update the extent length and mark as initialized */
2883 ex->ee_block = orig_ex.ee_block;
2884 ex->ee_len = orig_ex.ee_len;
2885 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2886 ext4_ext_dirty(handle, inode, path + depth);
2887 /* zeroed the full extent */
2888 /* blocks available from iblock */
2889 return allocated;
2890
2891 } else if (err)
2892 goto fix_extent_len;
2893 /*
2894 * The depth, and hence eh & ex might change
2895 * as part of the insert above.
2896 */
2897 newdepth = ext_depth(inode);
2898 /*
2899 * update the extent length after successful insert of the
2900 * split extent
2901 */
2902 orig_ex.ee_len = cpu_to_le16(ee_len -
2903 ext4_ext_get_actual_len(ex3));
2904 depth = newdepth;
2905 ext4_ext_drop_refs(path);
2906 path = ext4_ext_find_extent(inode, iblock, path);
2907 if (IS_ERR(path)) {
2908 err = PTR_ERR(path);
2909 goto out;
2910 }
2911 eh = path[depth].p_hdr;
2912 ex = path[depth].p_ext;
2913 if (ex2 != &newex)
2914 ex2 = ex;
2915
2916 err = ext4_ext_get_access(handle, inode, path + depth);
2917 if (err)
2918 goto out;
2919
2920 allocated = max_blocks;
2921 }
2922 /*
2923 * If there was a change of depth as part of the
2924 * insertion of ex3 above, we need to update the length
2925 * of the ex1 extent again here
2926 */
2927 if (ex1 && ex1 != ex) {
2928 ex1 = ex;
2929 ex1->ee_len = cpu_to_le16(iblock - ee_block);
2930 ext4_ext_mark_uninitialized(ex1);
2931 ex2 = &newex;
2932 }
2933 /*
2934 * ex2: iblock to iblock + maxblocks-1 : to be direct IO written,
2935 * uninitialised still.
2936 */
2937 ex2->ee_block = cpu_to_le32(iblock);
2938 ext4_ext_store_pblock(ex2, newblock);
2939 ex2->ee_len = cpu_to_le16(allocated);
2940 ext4_ext_mark_uninitialized(ex2);
2941 if (ex2 != ex)
2942 goto insert;
2943 /* Mark modified extent as dirty */
2944 err = ext4_ext_dirty(handle, inode, path + depth);
2945 ext_debug("out here\n");
2946 goto out;
2947insert:
2948 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
2949 if (err == -ENOSPC) {
2950 err = ext4_ext_zeroout(inode, &orig_ex);
2951 if (err)
2952 goto fix_extent_len;
2953 /* update the extent length and mark as initialized */
2954 ex->ee_block = orig_ex.ee_block;
2955 ex->ee_len = orig_ex.ee_len;
2956 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2957 ext4_ext_dirty(handle, inode, path + depth);
2958 /* zero out the first half */
2959 return allocated;
2960 } else if (err)
2961 goto fix_extent_len;
2962out:
2963 ext4_ext_show_leaf(inode, path);
2964 return err ? err : allocated;
2965
2966fix_extent_len:
2967 ex->ee_block = orig_ex.ee_block;
2968 ex->ee_len = orig_ex.ee_len;
2969 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2970 ext4_ext_mark_uninitialized(ex);
2971 ext4_ext_dirty(handle, inode, path + depth);
2972 return err;
2973}
2974static int ext4_convert_unwritten_extents_dio(handle_t *handle,
2975 struct inode *inode,
2976 struct ext4_ext_path *path)
2977{
2978 struct ext4_extent *ex;
2979 struct ext4_extent_header *eh;
2980 int depth;
2981 int err = 0;
2982 int ret = 0;
2983
2984 depth = ext_depth(inode);
2985 eh = path[depth].p_hdr;
2986 ex = path[depth].p_ext;
2987
2988 err = ext4_ext_get_access(handle, inode, path + depth);
2989 if (err)
2990 goto out;
2991 /* first mark the extent as initialized */
2992 ext4_ext_mark_initialized(ex);
2993
2994 /*
2995 * We have to see if it can be merged with the extent
2996 * on the left.
2997 */
2998 if (ex > EXT_FIRST_EXTENT(eh)) {
2999 /*
3000 * To merge left, pass "ex - 1" to try_to_merge(),
3001 * since it merges towards right _only_.
3002 */
3003 ret = ext4_ext_try_to_merge(inode, path, ex - 1);
3004 if (ret) {
3005 err = ext4_ext_correct_indexes(handle, inode, path);
3006 if (err)
3007 goto out;
3008 depth = ext_depth(inode);
3009 ex--;
3010 }
3011 }
3012 /*
3013 * Try to Merge towards right.
3014 */
3015 ret = ext4_ext_try_to_merge(inode, path, ex);
3016 if (ret) {
3017 err = ext4_ext_correct_indexes(handle, inode, path);
3018 if (err)
3019 goto out;
3020 depth = ext_depth(inode);
3021 }
3022 /* Mark modified extent as dirty */
3023 err = ext4_ext_dirty(handle, inode, path + depth);
3024out:
3025 ext4_ext_show_leaf(inode, path);
3026 return err;
3027}
3028
3029static int
3030ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3031 ext4_lblk_t iblock, unsigned int max_blocks,
3032 struct ext4_ext_path *path, int flags,
3033 unsigned int allocated, struct buffer_head *bh_result,
3034 ext4_fsblk_t newblock)
3035{
3036 int ret = 0;
3037 int err = 0;
3038 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
3039
3040 ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical"
3041 "block %llu, max_blocks %u, flags %d, allocated %u",
3042 inode->i_ino, (unsigned long long)iblock, max_blocks,
3043 flags, allocated);
3044 ext4_ext_show_leaf(inode, path);
3045
3046 /* DIO get_block() before submit the IO, split the extent */
3047 if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) {
3048 ret = ext4_split_unwritten_extents(handle,
3049 inode, path, iblock,
3050 max_blocks, flags);
3051 /* flag the io_end struct that we need convert when IO done */
3052 if (io)
3053 io->flag = DIO_AIO_UNWRITTEN;
3054 goto out;
3055 }
3056 /* DIO end_io complete, convert the filled extent to written */
3057 if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) {
3058 ret = ext4_convert_unwritten_extents_dio(handle, inode,
3059 path);
3060 goto out2;
3061 }
3062 /* buffered IO case */
3063 /*
3064 * repeat fallocate creation request
3065 * we already have an unwritten extent
3066 */
3067 if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)
3068 goto map_out;
3069
3070 /* buffered READ or buffered write_begin() lookup */
3071 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
3072 /*
3073 * We have blocks reserved already. We
3074 * return allocated blocks so that delalloc
3075 * won't do block reservation for us. But
3076 * the buffer head will be unmapped so that
3077 * a read from the block returns 0s.
3078 */
3079 set_buffer_unwritten(bh_result);
3080 goto out1;
3081 }
3082
3083 /* buffered write, writepage time, convert*/
3084 ret = ext4_ext_convert_to_initialized(handle, inode,
3085 path, iblock,
3086 max_blocks);
3087out:
3088 if (ret <= 0) {
3089 err = ret;
3090 goto out2;
3091 } else
3092 allocated = ret;
3093 set_buffer_new(bh_result);
3094map_out:
3095 set_buffer_mapped(bh_result);
3096out1:
3097 if (allocated > max_blocks)
3098 allocated = max_blocks;
3099 ext4_ext_show_leaf(inode, path);
3100 bh_result->b_bdev = inode->i_sb->s_bdev;
3101 bh_result->b_blocknr = newblock;
3102out2:
3103 if (path) {
3104 ext4_ext_drop_refs(path);
3105 kfree(path);
3106 }
3107 return err ? err : allocated;
3108}
3109/*
2788 * Block allocation/map/preallocation routine for extents based files 3110 * Block allocation/map/preallocation routine for extents based files
2789 * 3111 *
2790 * 3112 *
@@ -2814,6 +3136,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2814 int err = 0, depth, ret, cache_type; 3136 int err = 0, depth, ret, cache_type;
2815 unsigned int allocated = 0; 3137 unsigned int allocated = 0;
2816 struct ext4_allocation_request ar; 3138 struct ext4_allocation_request ar;
3139 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
2817 3140
2818 __clear_bit(BH_New, &bh_result->b_state); 3141 __clear_bit(BH_New, &bh_result->b_state);
2819 ext_debug("blocks %u/%u requested for inode %lu\n", 3142 ext_debug("blocks %u/%u requested for inode %lu\n",
@@ -2889,33 +3212,10 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2889 EXT4_EXT_CACHE_EXTENT); 3212 EXT4_EXT_CACHE_EXTENT);
2890 goto out; 3213 goto out;
2891 } 3214 }
2892 if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) 3215 ret = ext4_ext_handle_uninitialized_extents(handle,
2893 goto out; 3216 inode, iblock, max_blocks, path,
2894 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { 3217 flags, allocated, bh_result, newblock);
2895 if (allocated > max_blocks) 3218 return ret;
2896 allocated = max_blocks;
2897 /*
2898 * We have blocks reserved already. We
2899 * return allocated blocks so that delalloc
2900 * won't do block reservation for us. But
2901 * the buffer head will be unmapped so that
2902 * a read from the block returns 0s.
2903 */
2904 set_buffer_unwritten(bh_result);
2905 bh_result->b_bdev = inode->i_sb->s_bdev;
2906 bh_result->b_blocknr = newblock;
2907 goto out2;
2908 }
2909
2910 ret = ext4_ext_convert_to_initialized(handle, inode,
2911 path, iblock,
2912 max_blocks);
2913 if (ret <= 0) {
2914 err = ret;
2915 goto out2;
2916 } else
2917 allocated = ret;
2918 goto outnew;
2919 } 3219 }
2920 } 3220 }
2921 3221
@@ -2986,9 +3286,21 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2986 /* try to insert new extent into found leaf and return */ 3286 /* try to insert new extent into found leaf and return */
2987 ext4_ext_store_pblock(&newex, newblock); 3287 ext4_ext_store_pblock(&newex, newblock);
2988 newex.ee_len = cpu_to_le16(ar.len); 3288 newex.ee_len = cpu_to_le16(ar.len);
2989 if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) /* Mark uninitialized */ 3289 /* Mark uninitialized */
3290 if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){
2990 ext4_ext_mark_uninitialized(&newex); 3291 ext4_ext_mark_uninitialized(&newex);
2991 err = ext4_ext_insert_extent(handle, inode, path, &newex); 3292 /*
3293 * io_end structure was created for every async
3294 * direct IO write to the middle of the file.
3295 * To avoid unecessary convertion for every aio dio rewrite
3296 * to the mid of file, here we flag the IO that is really
3297 * need the convertion.
3298 *
3299 */
3300 if (io && flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT)
3301 io->flag = DIO_AIO_UNWRITTEN;
3302 }
3303 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
2992 if (err) { 3304 if (err) {
2993 /* free data blocks we just allocated */ 3305 /* free data blocks we just allocated */
2994 /* not a good idea to call discard here directly, 3306 /* not a good idea to call discard here directly,
@@ -3002,7 +3314,6 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3002 /* previous routine could use block we allocated */ 3314 /* previous routine could use block we allocated */
3003 newblock = ext_pblock(&newex); 3315 newblock = ext_pblock(&newex);
3004 allocated = ext4_ext_get_actual_len(&newex); 3316 allocated = ext4_ext_get_actual_len(&newex);
3005outnew:
3006 set_buffer_new(bh_result); 3317 set_buffer_new(bh_result);
3007 3318
3008 /* Cache only when it is _not_ an uninitialized extent */ 3319 /* Cache only when it is _not_ an uninitialized extent */
@@ -3201,6 +3512,63 @@ retry:
3201} 3512}
3202 3513
3203/* 3514/*
3515 * This function convert a range of blocks to written extents
3516 * The caller of this function will pass the start offset and the size.
3517 * all unwritten extents within this range will be converted to
3518 * written extents.
3519 *
3520 * This function is called from the direct IO end io call back
3521 * function, to convert the fallocated extents after IO is completed.
3522 */
3523int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
3524 loff_t len)
3525{
3526 handle_t *handle;
3527 ext4_lblk_t block;
3528 unsigned int max_blocks;
3529 int ret = 0;
3530 int ret2 = 0;
3531 struct buffer_head map_bh;
3532 unsigned int credits, blkbits = inode->i_blkbits;
3533
3534 block = offset >> blkbits;
3535 /*
3536 * We can't just convert len to max_blocks because
3537 * If blocksize = 4096 offset = 3072 and len = 2048
3538 */
3539 max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
3540 - block;
3541 /*
3542 * credits to insert 1 extent into extent tree
3543 */
3544 credits = ext4_chunk_trans_blocks(inode, max_blocks);
3545 while (ret >= 0 && ret < max_blocks) {
3546 block = block + ret;
3547 max_blocks = max_blocks - ret;
3548 handle = ext4_journal_start(inode, credits);
3549 if (IS_ERR(handle)) {
3550 ret = PTR_ERR(handle);
3551 break;
3552 }
3553 map_bh.b_state = 0;
3554 ret = ext4_get_blocks(handle, inode, block,
3555 max_blocks, &map_bh,
3556 EXT4_GET_BLOCKS_DIO_CONVERT_EXT);
3557 if (ret <= 0) {
3558 WARN_ON(ret <= 0);
3559 printk(KERN_ERR "%s: ext4_ext_get_blocks "
3560 "returned error inode#%lu, block=%u, "
3561 "max_blocks=%u", __func__,
3562 inode->i_ino, block, max_blocks);
3563 }
3564 ext4_mark_inode_dirty(handle, inode);
3565 ret2 = ext4_journal_stop(handle);
3566 if (ret <= 0 || ret2 )
3567 break;
3568 }
3569 return ret > 0 ? ret2 : ret;
3570}
3571/*
3204 * Callback function called for each extent to gather FIEMAP information. 3572 * Callback function called for each extent to gather FIEMAP information.
3205 */ 3573 */
3206static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, 3574static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 07475740b512..2b1531266ee2 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -44,6 +44,8 @@
44 * 44 *
45 * What we do is just kick off a commit and wait on it. This will snapshot the 45 * What we do is just kick off a commit and wait on it. This will snapshot the
46 * inode to disk. 46 * inode to disk.
47 *
48 * i_mutex lock is held when entering and exiting this function
47 */ 49 */
48 50
49int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) 51int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
@@ -56,6 +58,9 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
56 58
57 trace_ext4_sync_file(file, dentry, datasync); 59 trace_ext4_sync_file(file, dentry, datasync);
58 60
61 ret = flush_aio_dio_completed_IO(inode);
62 if (ret < 0)
63 goto out;
59 /* 64 /*
60 * data=writeback: 65 * data=writeback:
61 * The caller's filemap_fdatawrite()/wait will sync the data. 66 * The caller's filemap_fdatawrite()/wait will sync the data.
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 064746fad581..5c5bc5dafff8 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -37,6 +37,7 @@
37#include <linux/namei.h> 37#include <linux/namei.h>
38#include <linux/uio.h> 38#include <linux/uio.h>
39#include <linux/bio.h> 39#include <linux/bio.h>
40#include <linux/workqueue.h>
40 41
41#include "ext4_jbd2.h" 42#include "ext4_jbd2.h"
42#include "xattr.h" 43#include "xattr.h"
@@ -1145,6 +1146,64 @@ static int check_block_validity(struct inode *inode, const char *msg,
1145} 1146}
1146 1147
1147/* 1148/*
1149 * Return the number of contiguous dirty pages in a given inode
1150 * starting at page frame idx.
1151 */
1152static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
1153 unsigned int max_pages)
1154{
1155 struct address_space *mapping = inode->i_mapping;
1156 pgoff_t index;
1157 struct pagevec pvec;
1158 pgoff_t num = 0;
1159 int i, nr_pages, done = 0;
1160
1161 if (max_pages == 0)
1162 return 0;
1163 pagevec_init(&pvec, 0);
1164 while (!done) {
1165 index = idx;
1166 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
1167 PAGECACHE_TAG_DIRTY,
1168 (pgoff_t)PAGEVEC_SIZE);
1169 if (nr_pages == 0)
1170 break;
1171 for (i = 0; i < nr_pages; i++) {
1172 struct page *page = pvec.pages[i];
1173 struct buffer_head *bh, *head;
1174
1175 lock_page(page);
1176 if (unlikely(page->mapping != mapping) ||
1177 !PageDirty(page) ||
1178 PageWriteback(page) ||
1179 page->index != idx) {
1180 done = 1;
1181 unlock_page(page);
1182 break;
1183 }
1184 if (page_has_buffers(page)) {
1185 bh = head = page_buffers(page);
1186 do {
1187 if (!buffer_delay(bh) &&
1188 !buffer_unwritten(bh))
1189 done = 1;
1190 bh = bh->b_this_page;
1191 } while (!done && (bh != head));
1192 }
1193 unlock_page(page);
1194 if (done)
1195 break;
1196 idx++;
1197 num++;
1198 if (num >= max_pages)
1199 break;
1200 }
1201 pagevec_release(&pvec);
1202 }
1203 return num;
1204}
1205
1206/*
1148 * The ext4_get_blocks() function tries to look up the requested blocks, 1207 * The ext4_get_blocks() function tries to look up the requested blocks,
1149 * and returns if the blocks are already mapped. 1208 * and returns if the blocks are already mapped.
1150 * 1209 *
@@ -1175,6 +1234,9 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1175 clear_buffer_mapped(bh); 1234 clear_buffer_mapped(bh);
1176 clear_buffer_unwritten(bh); 1235 clear_buffer_unwritten(bh);
1177 1236
1237 ext_debug("ext4_get_blocks(): inode %lu, flag %d, max_blocks %u,"
1238 "logical block %lu\n", inode->i_ino, flags, max_blocks,
1239 (unsigned long)block);
1178 /* 1240 /*
1179 * Try to see if we can get the block without requesting a new 1241 * Try to see if we can get the block without requesting a new
1180 * file system block. 1242 * file system block.
@@ -1796,11 +1858,11 @@ repeat:
1796 1858
1797 if (ext4_claim_free_blocks(sbi, total)) { 1859 if (ext4_claim_free_blocks(sbi, total)) {
1798 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1860 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1861 vfs_dq_release_reservation_block(inode, total);
1799 if (ext4_should_retry_alloc(inode->i_sb, &retries)) { 1862 if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
1800 yield(); 1863 yield();
1801 goto repeat; 1864 goto repeat;
1802 } 1865 }
1803 vfs_dq_release_reservation_block(inode, total);
1804 return -ENOSPC; 1866 return -ENOSPC;
1805 } 1867 }
1806 EXT4_I(inode)->i_reserved_data_blocks += nrblocks; 1868 EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
@@ -2092,18 +2154,18 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
2092static void ext4_print_free_blocks(struct inode *inode) 2154static void ext4_print_free_blocks(struct inode *inode)
2093{ 2155{
2094 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 2156 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2095 printk(KERN_EMERG "Total free blocks count %lld\n", 2157 printk(KERN_CRIT "Total free blocks count %lld\n",
2096 ext4_count_free_blocks(inode->i_sb)); 2158 ext4_count_free_blocks(inode->i_sb));
2097 printk(KERN_EMERG "Free/Dirty block details\n"); 2159 printk(KERN_CRIT "Free/Dirty block details\n");
2098 printk(KERN_EMERG "free_blocks=%lld\n", 2160 printk(KERN_CRIT "free_blocks=%lld\n",
2099 (long long)percpu_counter_sum(&sbi->s_freeblocks_counter)); 2161 (long long) percpu_counter_sum(&sbi->s_freeblocks_counter));
2100 printk(KERN_EMERG "dirty_blocks=%lld\n", 2162 printk(KERN_CRIT "dirty_blocks=%lld\n",
2101 (long long)percpu_counter_sum(&sbi->s_dirtyblocks_counter)); 2163 (long long) percpu_counter_sum(&sbi->s_dirtyblocks_counter));
2102 printk(KERN_EMERG "Block reservation details\n"); 2164 printk(KERN_CRIT "Block reservation details\n");
2103 printk(KERN_EMERG "i_reserved_data_blocks=%u\n", 2165 printk(KERN_CRIT "i_reserved_data_blocks=%u\n",
2104 EXT4_I(inode)->i_reserved_data_blocks); 2166 EXT4_I(inode)->i_reserved_data_blocks);
2105 printk(KERN_EMERG "i_reserved_meta_blocks=%u\n", 2167 printk(KERN_CRIT "i_reserved_meta_blocks=%u\n",
2106 EXT4_I(inode)->i_reserved_meta_blocks); 2168 EXT4_I(inode)->i_reserved_meta_blocks);
2107 return; 2169 return;
2108} 2170}
2109 2171
@@ -2189,14 +2251,14 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2189 * writepage and writepages will again try to write 2251 * writepage and writepages will again try to write
2190 * the same. 2252 * the same.
2191 */ 2253 */
2192 printk(KERN_EMERG "%s block allocation failed for inode %lu " 2254 ext4_msg(mpd->inode->i_sb, KERN_CRIT,
2193 "at logical offset %llu with max blocks " 2255 "delayed block allocation failed for inode %lu at "
2194 "%zd with error %d\n", 2256 "logical offset %llu with max blocks %zd with "
2195 __func__, mpd->inode->i_ino, 2257 "error %d\n", mpd->inode->i_ino,
2196 (unsigned long long)next, 2258 (unsigned long long) next,
2197 mpd->b_size >> mpd->inode->i_blkbits, err); 2259 mpd->b_size >> mpd->inode->i_blkbits, err);
2198 printk(KERN_EMERG "This should not happen.!! " 2260 printk(KERN_CRIT "This should not happen!! "
2199 "Data will be lost\n"); 2261 "Data will be lost\n");
2200 if (err == -ENOSPC) { 2262 if (err == -ENOSPC) {
2201 ext4_print_free_blocks(mpd->inode); 2263 ext4_print_free_blocks(mpd->inode);
2202 } 2264 }
@@ -2743,8 +2805,10 @@ static int ext4_da_writepages(struct address_space *mapping,
2743 int no_nrwrite_index_update; 2805 int no_nrwrite_index_update;
2744 int pages_written = 0; 2806 int pages_written = 0;
2745 long pages_skipped; 2807 long pages_skipped;
2808 unsigned int max_pages;
2746 int range_cyclic, cycled = 1, io_done = 0; 2809 int range_cyclic, cycled = 1, io_done = 0;
2747 int needed_blocks, ret = 0, nr_to_writebump = 0; 2810 int needed_blocks, ret = 0;
2811 long desired_nr_to_write, nr_to_writebump = 0;
2748 loff_t range_start = wbc->range_start; 2812 loff_t range_start = wbc->range_start;
2749 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); 2813 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
2750 2814
@@ -2771,16 +2835,6 @@ static int ext4_da_writepages(struct address_space *mapping,
2771 if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) 2835 if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
2772 return -EROFS; 2836 return -EROFS;
2773 2837
2774 /*
2775 * Make sure nr_to_write is >= sbi->s_mb_stream_request
2776 * This make sure small files blocks are allocated in
2777 * single attempt. This ensure that small files
2778 * get less fragmented.
2779 */
2780 if (wbc->nr_to_write < sbi->s_mb_stream_request) {
2781 nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write;
2782 wbc->nr_to_write = sbi->s_mb_stream_request;
2783 }
2784 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 2838 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
2785 range_whole = 1; 2839 range_whole = 1;
2786 2840
@@ -2795,6 +2849,36 @@ static int ext4_da_writepages(struct address_space *mapping,
2795 } else 2849 } else
2796 index = wbc->range_start >> PAGE_CACHE_SHIFT; 2850 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2797 2851
2852 /*
2853 * This works around two forms of stupidity. The first is in
2854 * the writeback code, which caps the maximum number of pages
2855 * written to be 1024 pages. This is wrong on multiple
2856 * levels; different architectues have a different page size,
2857 * which changes the maximum amount of data which gets
2858 * written. Secondly, 4 megabytes is way too small. XFS
2859 * forces this value to be 16 megabytes by multiplying
2860 * nr_to_write parameter by four, and then relies on its
2861 * allocator to allocate larger extents to make them
2862 * contiguous. Unfortunately this brings us to the second
2863 * stupidity, which is that ext4's mballoc code only allocates
2864 * at most 2048 blocks. So we force contiguous writes up to
2865 * the number of dirty blocks in the inode, or
2866 * sbi->max_writeback_mb_bump whichever is smaller.
2867 */
2868 max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);
2869 if (!range_cyclic && range_whole)
2870 desired_nr_to_write = wbc->nr_to_write * 8;
2871 else
2872 desired_nr_to_write = ext4_num_dirty_pages(inode, index,
2873 max_pages);
2874 if (desired_nr_to_write > max_pages)
2875 desired_nr_to_write = max_pages;
2876
2877 if (wbc->nr_to_write < desired_nr_to_write) {
2878 nr_to_writebump = desired_nr_to_write - wbc->nr_to_write;
2879 wbc->nr_to_write = desired_nr_to_write;
2880 }
2881
2798 mpd.wbc = wbc; 2882 mpd.wbc = wbc;
2799 mpd.inode = mapping->host; 2883 mpd.inode = mapping->host;
2800 2884
@@ -2822,10 +2906,9 @@ retry:
2822 handle = ext4_journal_start(inode, needed_blocks); 2906 handle = ext4_journal_start(inode, needed_blocks);
2823 if (IS_ERR(handle)) { 2907 if (IS_ERR(handle)) {
2824 ret = PTR_ERR(handle); 2908 ret = PTR_ERR(handle);
2825 printk(KERN_CRIT "%s: jbd2_start: " 2909 ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
2826 "%ld pages, ino %lu; err %d\n", __func__, 2910 "%ld pages, ino %lu; err %d\n", __func__,
2827 wbc->nr_to_write, inode->i_ino, ret); 2911 wbc->nr_to_write, inode->i_ino, ret);
2828 dump_stack();
2829 goto out_writepages; 2912 goto out_writepages;
2830 } 2913 }
2831 2914
@@ -2897,9 +2980,10 @@ retry:
2897 goto retry; 2980 goto retry;
2898 } 2981 }
2899 if (pages_skipped != wbc->pages_skipped) 2982 if (pages_skipped != wbc->pages_skipped)
2900 printk(KERN_EMERG "This should not happen leaving %s " 2983 ext4_msg(inode->i_sb, KERN_CRIT,
2901 "with nr_to_write = %ld ret = %d\n", 2984 "This should not happen leaving %s "
2902 __func__, wbc->nr_to_write, ret); 2985 "with nr_to_write = %ld ret = %d\n",
2986 __func__, wbc->nr_to_write, ret);
2903 2987
2904 /* Update index */ 2988 /* Update index */
2905 index += pages_written; 2989 index += pages_written;
@@ -2914,7 +2998,8 @@ retry:
2914out_writepages: 2998out_writepages:
2915 if (!no_nrwrite_index_update) 2999 if (!no_nrwrite_index_update)
2916 wbc->no_nrwrite_index_update = 0; 3000 wbc->no_nrwrite_index_update = 0;
2917 wbc->nr_to_write -= nr_to_writebump; 3001 if (wbc->nr_to_write > nr_to_writebump)
3002 wbc->nr_to_write -= nr_to_writebump;
2918 wbc->range_start = range_start; 3003 wbc->range_start = range_start;
2919 trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); 3004 trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
2920 return ret; 3005 return ret;
@@ -3272,6 +3357,8 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
3272} 3357}
3273 3358
3274/* 3359/*
3360 * O_DIRECT for ext3 (or indirect map) based files
3361 *
3275 * If the O_DIRECT write will extend the file then add this inode to the 3362 * If the O_DIRECT write will extend the file then add this inode to the
3276 * orphan list. So recovery will truncate it back to the original size 3363 * orphan list. So recovery will truncate it back to the original size
3277 * if the machine crashes during the write. 3364 * if the machine crashes during the write.
@@ -3280,7 +3367,7 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
3280 * crashes then stale disk data _may_ be exposed inside the file. But current 3367 * crashes then stale disk data _may_ be exposed inside the file. But current
3281 * VFS code falls back into buffered path in that case so we are safe. 3368 * VFS code falls back into buffered path in that case so we are safe.
3282 */ 3369 */
3283static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, 3370static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
3284 const struct iovec *iov, loff_t offset, 3371 const struct iovec *iov, loff_t offset,
3285 unsigned long nr_segs) 3372 unsigned long nr_segs)
3286{ 3373{
@@ -3291,6 +3378,7 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
3291 ssize_t ret; 3378 ssize_t ret;
3292 int orphan = 0; 3379 int orphan = 0;
3293 size_t count = iov_length(iov, nr_segs); 3380 size_t count = iov_length(iov, nr_segs);
3381 int retries = 0;
3294 3382
3295 if (rw == WRITE) { 3383 if (rw == WRITE) {
3296 loff_t final_size = offset + count; 3384 loff_t final_size = offset + count;
@@ -3313,9 +3401,12 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
3313 } 3401 }
3314 } 3402 }
3315 3403
3404retry:
3316 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 3405 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
3317 offset, nr_segs, 3406 offset, nr_segs,
3318 ext4_get_block, NULL); 3407 ext4_get_block, NULL);
3408 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
3409 goto retry;
3319 3410
3320 if (orphan) { 3411 if (orphan) {
3321 int err; 3412 int err;
@@ -3354,6 +3445,359 @@ out:
3354 return ret; 3445 return ret;
3355} 3446}
3356 3447
3448/* Maximum number of blocks we map for direct IO at once. */
3449
3450static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock,
3451 struct buffer_head *bh_result, int create)
3452{
3453 handle_t *handle = NULL;
3454 int ret = 0;
3455 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
3456 int dio_credits;
3457
3458 ext4_debug("ext4_get_block_dio_write: inode %lu, create flag %d\n",
3459 inode->i_ino, create);
3460 /*
3461 * DIO VFS code passes create = 0 flag for write to
3462 * the middle of file. It does this to avoid block
3463 * allocation for holes, to prevent expose stale data
3464 * out when there is parallel buffered read (which does
3465 * not hold the i_mutex lock) while direct IO write has
3466 * not completed. DIO request on holes finally falls back
3467 * to buffered IO for this reason.
3468 *
3469 * For ext4 extent based file, since we support fallocate,
3470 * new allocated extent as uninitialized, for holes, we
3471 * could fallocate blocks for holes, thus parallel
3472 * buffered IO read will zero out the page when read on
3473 * a hole while parallel DIO write to the hole has not completed.
3474 *
3475 * when we come here, we know it's a direct IO write to
3476 * to the middle of file (<i_size)
3477 * so it's safe to override the create flag from VFS.
3478 */
3479 create = EXT4_GET_BLOCKS_DIO_CREATE_EXT;
3480
3481 if (max_blocks > DIO_MAX_BLOCKS)
3482 max_blocks = DIO_MAX_BLOCKS;
3483 dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
3484 handle = ext4_journal_start(inode, dio_credits);
3485 if (IS_ERR(handle)) {
3486 ret = PTR_ERR(handle);
3487 goto out;
3488 }
3489 ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
3490 create);
3491 if (ret > 0) {
3492 bh_result->b_size = (ret << inode->i_blkbits);
3493 ret = 0;
3494 }
3495 ext4_journal_stop(handle);
3496out:
3497 return ret;
3498}
3499
3500static void ext4_free_io_end(ext4_io_end_t *io)
3501{
3502 BUG_ON(!io);
3503 iput(io->inode);
3504 kfree(io);
3505}
3506static void dump_aio_dio_list(struct inode * inode)
3507{
3508#ifdef EXT4_DEBUG
3509 struct list_head *cur, *before, *after;
3510 ext4_io_end_t *io, *io0, *io1;
3511
3512 if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){
3513 ext4_debug("inode %lu aio dio list is empty\n", inode->i_ino);
3514 return;
3515 }
3516
3517 ext4_debug("Dump inode %lu aio_dio_completed_IO list \n", inode->i_ino);
3518 list_for_each_entry(io, &EXT4_I(inode)->i_aio_dio_complete_list, list){
3519 cur = &io->list;
3520 before = cur->prev;
3521 io0 = container_of(before, ext4_io_end_t, list);
3522 after = cur->next;
3523 io1 = container_of(after, ext4_io_end_t, list);
3524
3525 ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
3526 io, inode->i_ino, io0, io1);
3527 }
3528#endif
3529}
3530
3531/*
3532 * check a range of space and convert unwritten extents to written.
3533 */
3534static int ext4_end_aio_dio_nolock(ext4_io_end_t *io)
3535{
3536 struct inode *inode = io->inode;
3537 loff_t offset = io->offset;
3538 size_t size = io->size;
3539 int ret = 0;
3540
3541 ext4_debug("end_aio_dio_onlock: io 0x%p from inode %lu,list->next 0x%p,"
3542 "list->prev 0x%p\n",
3543 io, inode->i_ino, io->list.next, io->list.prev);
3544
3545 if (list_empty(&io->list))
3546 return ret;
3547
3548 if (io->flag != DIO_AIO_UNWRITTEN)
3549 return ret;
3550
3551 if (offset + size <= i_size_read(inode))
3552 ret = ext4_convert_unwritten_extents(inode, offset, size);
3553
3554 if (ret < 0) {
3555 printk(KERN_EMERG "%s: failed to convert unwritten"
3556 "extents to written extents, error is %d"
3557 " io is still on inode %lu aio dio list\n",
3558 __func__, ret, inode->i_ino);
3559 return ret;
3560 }
3561
3562 /* clear the DIO AIO unwritten flag */
3563 io->flag = 0;
3564 return ret;
3565}
3566/*
3567 * work on completed aio dio IO, to convert unwritten extents to extents
3568 */
3569static void ext4_end_aio_dio_work(struct work_struct *work)
3570{
3571 ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
3572 struct inode *inode = io->inode;
3573 int ret = 0;
3574
3575 mutex_lock(&inode->i_mutex);
3576 ret = ext4_end_aio_dio_nolock(io);
3577 if (ret >= 0) {
3578 if (!list_empty(&io->list))
3579 list_del_init(&io->list);
3580 ext4_free_io_end(io);
3581 }
3582 mutex_unlock(&inode->i_mutex);
3583}
3584/*
3585 * This function is called from ext4_sync_file().
3586 *
3587 * When AIO DIO IO is completed, the work to convert unwritten
3588 * extents to written is queued on workqueue but may not get immediately
3589 * scheduled. When fsync is called, we need to ensure the
3590 * conversion is complete before fsync returns.
3591 * The inode keeps track of a list of completed AIO from DIO path
3592 * that might needs to do the conversion. This function walks through
3593 * the list and convert the related unwritten extents to written.
3594 */
3595int flush_aio_dio_completed_IO(struct inode *inode)
3596{
3597 ext4_io_end_t *io;
3598 int ret = 0;
3599 int ret2 = 0;
3600
3601 if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list))
3602 return ret;
3603
3604 dump_aio_dio_list(inode);
3605 while (!list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){
3606 io = list_entry(EXT4_I(inode)->i_aio_dio_complete_list.next,
3607 ext4_io_end_t, list);
3608 /*
3609 * Calling ext4_end_aio_dio_nolock() to convert completed
3610 * IO to written.
3611 *
3612 * When ext4_sync_file() is called, run_queue() may already
3613 * about to flush the work corresponding to this io structure.
3614 * It will be upset if it founds the io structure related
3615 * to the work-to-be schedule is freed.
3616 *
3617 * Thus we need to keep the io structure still valid here after
3618 * convertion finished. The io structure has a flag to
3619 * avoid double converting from both fsync and background work
3620 * queue work.
3621 */
3622 ret = ext4_end_aio_dio_nolock(io);
3623 if (ret < 0)
3624 ret2 = ret;
3625 else
3626 list_del_init(&io->list);
3627 }
3628 return (ret2 < 0) ? ret2 : 0;
3629}
3630
3631static ext4_io_end_t *ext4_init_io_end (struct inode *inode)
3632{
3633 ext4_io_end_t *io = NULL;
3634
3635 io = kmalloc(sizeof(*io), GFP_NOFS);
3636
3637 if (io) {
3638 igrab(inode);
3639 io->inode = inode;
3640 io->flag = 0;
3641 io->offset = 0;
3642 io->size = 0;
3643 io->error = 0;
3644 INIT_WORK(&io->work, ext4_end_aio_dio_work);
3645 INIT_LIST_HEAD(&io->list);
3646 }
3647
3648 return io;
3649}
3650
3651static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3652 ssize_t size, void *private)
3653{
3654 ext4_io_end_t *io_end = iocb->private;
3655 struct workqueue_struct *wq;
3656
3657 ext_debug("ext4_end_io_dio(): io_end 0x%p"
3658 "for inode %lu, iocb 0x%p, offset %llu, size %llu\n",
3659 iocb->private, io_end->inode->i_ino, iocb, offset,
3660 size);
3661 /* if not async direct IO or dio with 0 bytes write, just return */
3662 if (!io_end || !size)
3663 return;
3664
3665 /* if not aio dio with unwritten extents, just free io and return */
3666 if (io_end->flag != DIO_AIO_UNWRITTEN){
3667 ext4_free_io_end(io_end);
3668 iocb->private = NULL;
3669 return;
3670 }
3671
3672 io_end->offset = offset;
3673 io_end->size = size;
3674 wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
3675
3676 /* queue the work to convert unwritten extents to written */
3677 queue_work(wq, &io_end->work);
3678
3679 /* Add the io_end to per-inode completed aio dio list*/
3680 list_add_tail(&io_end->list,
3681 &EXT4_I(io_end->inode)->i_aio_dio_complete_list);
3682 iocb->private = NULL;
3683}
3684/*
3685 * For ext4 extent files, ext4 will do direct-io write to holes,
3686 * preallocated extents, and those write extend the file, no need to
3687 * fall back to buffered IO.
3688 *
3689 * For holes, we fallocate those blocks, mark them as unintialized
3690 * If those blocks were preallocated, we mark sure they are splited, but
3691 * still keep the range to write as unintialized.
3692 *
3693 * The unwrritten extents will be converted to written when DIO is completed.
3694 * For async direct IO, since the IO may still pending when return, we
3695 * set up an end_io call back function, which will do the convertion
3696 * when async direct IO completed.
3697 *
3698 * If the O_DIRECT write will extend the file then add this inode to the
3699 * orphan list. So recovery will truncate it back to the original size
3700 * if the machine crashes during the write.
3701 *
3702 */
3703static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3704 const struct iovec *iov, loff_t offset,
3705 unsigned long nr_segs)
3706{
3707 struct file *file = iocb->ki_filp;
3708 struct inode *inode = file->f_mapping->host;
3709 ssize_t ret;
3710 size_t count = iov_length(iov, nr_segs);
3711
3712 loff_t final_size = offset + count;
3713 if (rw == WRITE && final_size <= inode->i_size) {
3714 /*
3715 * We could direct write to holes and fallocate.
3716 *
3717 * Allocated blocks to fill the hole are marked as uninitialized
3718 * to prevent paralel buffered read to expose the stale data
3719 * before DIO complete the data IO.
3720 *
3721 * As to previously fallocated extents, ext4 get_block
3722 * will just simply mark the buffer mapped but still
3723 * keep the extents uninitialized.
3724 *
3725 * for non AIO case, we will convert those unwritten extents
3726 * to written after return back from blockdev_direct_IO.
3727 *
3728 * for async DIO, the conversion needs to be defered when
3729 * the IO is completed. The ext4 end_io callback function
3730 * will be called to take care of the conversion work.
3731 * Here for async case, we allocate an io_end structure to
3732 * hook to the iocb.
3733 */
3734 iocb->private = NULL;
3735 EXT4_I(inode)->cur_aio_dio = NULL;
3736 if (!is_sync_kiocb(iocb)) {
3737 iocb->private = ext4_init_io_end(inode);
3738 if (!iocb->private)
3739 return -ENOMEM;
3740 /*
3741 * we save the io structure for current async
3742 * direct IO, so that later ext4_get_blocks()
3743 * could flag the io structure whether there
3744 * is a unwritten extents needs to be converted
3745 * when IO is completed.
3746 */
3747 EXT4_I(inode)->cur_aio_dio = iocb->private;
3748 }
3749
3750 ret = blockdev_direct_IO(rw, iocb, inode,
3751 inode->i_sb->s_bdev, iov,
3752 offset, nr_segs,
3753 ext4_get_block_dio_write,
3754 ext4_end_io_dio);
3755 if (iocb->private)
3756 EXT4_I(inode)->cur_aio_dio = NULL;
3757 /*
3758 * The io_end structure takes a reference to the inode,
3759 * that structure needs to be destroyed and the
3760 * reference to the inode need to be dropped, when IO is
3761 * complete, even with 0 byte write, or failed.
3762 *
3763 * In the successful AIO DIO case, the io_end structure will be
3764 * desctroyed and the reference to the inode will be dropped
3765 * after the end_io call back function is called.
3766 *
3767 * In the case there is 0 byte write, or error case, since
3768 * VFS direct IO won't invoke the end_io call back function,
3769 * we need to free the end_io structure here.
3770 */
3771 if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
3772 ext4_free_io_end(iocb->private);
3773 iocb->private = NULL;
3774 } else if (ret > 0)
3775 /*
3776 * for non AIO case, since the IO is already
3777 * completed, we could do the convertion right here
3778 */
3779 ret = ext4_convert_unwritten_extents(inode,
3780 offset, ret);
3781 return ret;
3782 }
3783
3784 /* for write the the end of file case, we fall back to old way */
3785 return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
3786}
3787
3788static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
3789 const struct iovec *iov, loff_t offset,
3790 unsigned long nr_segs)
3791{
3792 struct file *file = iocb->ki_filp;
3793 struct inode *inode = file->f_mapping->host;
3794
3795 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
3796 return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
3797
3798 return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
3799}
3800
3357/* 3801/*
3358 * Pages can be marked dirty completely asynchronously from ext4's journalling 3802 * Pages can be marked dirty completely asynchronously from ext4's journalling
3359 * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do 3803 * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do
@@ -4551,8 +4995,7 @@ static int ext4_inode_blocks_set(handle_t *handle,
4551 */ 4995 */
4552static int ext4_do_update_inode(handle_t *handle, 4996static int ext4_do_update_inode(handle_t *handle,
4553 struct inode *inode, 4997 struct inode *inode,
4554 struct ext4_iloc *iloc, 4998 struct ext4_iloc *iloc)
4555 int do_sync)
4556{ 4999{
4557 struct ext4_inode *raw_inode = ext4_raw_inode(iloc); 5000 struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
4558 struct ext4_inode_info *ei = EXT4_I(inode); 5001 struct ext4_inode_info *ei = EXT4_I(inode);
@@ -4653,22 +5096,10 @@ static int ext4_do_update_inode(handle_t *handle,
4653 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); 5096 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
4654 } 5097 }
4655 5098
4656 /* 5099 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
4657 * If we're not using a journal and we were called from 5100 rc = ext4_handle_dirty_metadata(handle, inode, bh);
4658 * ext4_write_inode() to sync the inode (making do_sync true), 5101 if (!err)
4659 * we can just use sync_dirty_buffer() directly to do our dirty 5102 err = rc;
4660 * work. Testing s_journal here is a bit redundant but it's
4661 * worth it to avoid potential future trouble.
4662 */
4663 if (EXT4_SB(inode->i_sb)->s_journal == NULL && do_sync) {
4664 BUFFER_TRACE(bh, "call sync_dirty_buffer");
4665 sync_dirty_buffer(bh);
4666 } else {
4667 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
4668 rc = ext4_handle_dirty_metadata(handle, inode, bh);
4669 if (!err)
4670 err = rc;
4671 }
4672 ei->i_state &= ~EXT4_STATE_NEW; 5103 ei->i_state &= ~EXT4_STATE_NEW;
4673 5104
4674out_brelse: 5105out_brelse:
@@ -4736,8 +5167,16 @@ int ext4_write_inode(struct inode *inode, int wait)
4736 err = ext4_get_inode_loc(inode, &iloc); 5167 err = ext4_get_inode_loc(inode, &iloc);
4737 if (err) 5168 if (err)
4738 return err; 5169 return err;
4739 err = ext4_do_update_inode(EXT4_NOJOURNAL_HANDLE, 5170 if (wait)
4740 inode, &iloc, wait); 5171 sync_dirty_buffer(iloc.bh);
5172 if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
5173 ext4_error(inode->i_sb, __func__,
5174 "IO error syncing inode, "
5175 "inode=%lu, block=%llu",
5176 inode->i_ino,
5177 (unsigned long long)iloc.bh->b_blocknr);
5178 err = -EIO;
5179 }
4741 } 5180 }
4742 return err; 5181 return err;
4743} 5182}
@@ -5033,7 +5472,7 @@ int ext4_mark_iloc_dirty(handle_t *handle,
5033 get_bh(iloc->bh); 5472 get_bh(iloc->bh);
5034 5473
5035 /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */ 5474 /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */
5036 err = ext4_do_update_inode(handle, inode, iloc, 0); 5475 err = ext4_do_update_inode(handle, inode, iloc);
5037 put_bh(iloc->bh); 5476 put_bh(iloc->bh);
5038 return err; 5477 return err;
5039} 5478}
@@ -5177,27 +5616,14 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
5177 */ 5616 */
5178void ext4_dirty_inode(struct inode *inode) 5617void ext4_dirty_inode(struct inode *inode)
5179{ 5618{
5180 handle_t *current_handle = ext4_journal_current_handle();
5181 handle_t *handle; 5619 handle_t *handle;
5182 5620
5183 if (!ext4_handle_valid(current_handle)) {
5184 ext4_mark_inode_dirty(current_handle, inode);
5185 return;
5186 }
5187
5188 handle = ext4_journal_start(inode, 2); 5621 handle = ext4_journal_start(inode, 2);
5189 if (IS_ERR(handle)) 5622 if (IS_ERR(handle))
5190 goto out; 5623 goto out;
5191 if (current_handle && 5624
5192 current_handle->h_transaction != handle->h_transaction) { 5625 ext4_mark_inode_dirty(handle, inode);
5193 /* This task has a transaction open against a different fs */ 5626
5194 printk(KERN_EMERG "%s: transactions do not match!\n",
5195 __func__);
5196 } else {
5197 jbd_debug(5, "marking dirty. outer handle=%p\n",
5198 current_handle);
5199 ext4_mark_inode_dirty(handle, inode);
5200 }
5201 ext4_journal_stop(handle); 5627 ext4_journal_stop(handle);
5202out: 5628out:
5203 return; 5629 return;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index e9c61896d605..bba12824defa 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2096,207 +2096,6 @@ out:
2096 return err; 2096 return err;
2097} 2097}
2098 2098
2099#ifdef EXT4_MB_HISTORY
2100struct ext4_mb_proc_session {
2101 struct ext4_mb_history *history;
2102 struct super_block *sb;
2103 int start;
2104 int max;
2105};
2106
2107static void *ext4_mb_history_skip_empty(struct ext4_mb_proc_session *s,
2108 struct ext4_mb_history *hs,
2109 int first)
2110{
2111 if (hs == s->history + s->max)
2112 hs = s->history;
2113 if (!first && hs == s->history + s->start)
2114 return NULL;
2115 while (hs->orig.fe_len == 0) {
2116 hs++;
2117 if (hs == s->history + s->max)
2118 hs = s->history;
2119 if (hs == s->history + s->start)
2120 return NULL;
2121 }
2122 return hs;
2123}
2124
2125static void *ext4_mb_seq_history_start(struct seq_file *seq, loff_t *pos)
2126{
2127 struct ext4_mb_proc_session *s = seq->private;
2128 struct ext4_mb_history *hs;
2129 int l = *pos;
2130
2131 if (l == 0)
2132 return SEQ_START_TOKEN;
2133 hs = ext4_mb_history_skip_empty(s, s->history + s->start, 1);
2134 if (!hs)
2135 return NULL;
2136 while (--l && (hs = ext4_mb_history_skip_empty(s, ++hs, 0)) != NULL);
2137 return hs;
2138}
2139
2140static void *ext4_mb_seq_history_next(struct seq_file *seq, void *v,
2141 loff_t *pos)
2142{
2143 struct ext4_mb_proc_session *s = seq->private;
2144 struct ext4_mb_history *hs = v;
2145
2146 ++*pos;
2147 if (v == SEQ_START_TOKEN)
2148 return ext4_mb_history_skip_empty(s, s->history + s->start, 1);
2149 else
2150 return ext4_mb_history_skip_empty(s, ++hs, 0);
2151}
2152
2153static int ext4_mb_seq_history_show(struct seq_file *seq, void *v)
2154{
2155 char buf[25], buf2[25], buf3[25], *fmt;
2156 struct ext4_mb_history *hs = v;
2157
2158 if (v == SEQ_START_TOKEN) {
2159 seq_printf(seq, "%-5s %-8s %-23s %-23s %-23s %-5s "
2160 "%-5s %-2s %-6s %-5s %-5s %-6s\n",
2161 "pid", "inode", "original", "goal", "result", "found",
2162 "grps", "cr", "flags", "merge", "tail", "broken");
2163 return 0;
2164 }
2165
2166 if (hs->op == EXT4_MB_HISTORY_ALLOC) {
2167 fmt = "%-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u "
2168 "0x%04x %-5s %-5u %-6u\n";
2169 sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group,
2170 hs->result.fe_start, hs->result.fe_len,
2171 hs->result.fe_logical);
2172 sprintf(buf, "%u/%d/%u@%u", hs->orig.fe_group,
2173 hs->orig.fe_start, hs->orig.fe_len,
2174 hs->orig.fe_logical);
2175 sprintf(buf3, "%u/%d/%u@%u", hs->goal.fe_group,
2176 hs->goal.fe_start, hs->goal.fe_len,
2177 hs->goal.fe_logical);
2178 seq_printf(seq, fmt, hs->pid, hs->ino, buf, buf3, buf2,
2179 hs->found, hs->groups, hs->cr, hs->flags,
2180 hs->merged ? "M" : "", hs->tail,
2181 hs->buddy ? 1 << hs->buddy : 0);
2182 } else if (hs->op == EXT4_MB_HISTORY_PREALLOC) {
2183 fmt = "%-5u %-8u %-23s %-23s %-23s\n";
2184 sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group,
2185 hs->result.fe_start, hs->result.fe_len,
2186 hs->result.fe_logical);
2187 sprintf(buf, "%u/%d/%u@%u", hs->orig.fe_group,
2188 hs->orig.fe_start, hs->orig.fe_len,
2189 hs->orig.fe_logical);
2190 seq_printf(seq, fmt, hs->pid, hs->ino, buf, "", buf2);
2191 } else if (hs->op == EXT4_MB_HISTORY_DISCARD) {
2192 sprintf(buf2, "%u/%d/%u", hs->result.fe_group,
2193 hs->result.fe_start, hs->result.fe_len);
2194 seq_printf(seq, "%-5u %-8u %-23s discard\n",
2195 hs->pid, hs->ino, buf2);
2196 } else if (hs->op == EXT4_MB_HISTORY_FREE) {
2197 sprintf(buf2, "%u/%d/%u", hs->result.fe_group,
2198 hs->result.fe_start, hs->result.fe_len);
2199 seq_printf(seq, "%-5u %-8u %-23s free\n",
2200 hs->pid, hs->ino, buf2);
2201 }
2202 return 0;
2203}
2204
2205static void ext4_mb_seq_history_stop(struct seq_file *seq, void *v)
2206{
2207}
2208
2209static const struct seq_operations ext4_mb_seq_history_ops = {
2210 .start = ext4_mb_seq_history_start,
2211 .next = ext4_mb_seq_history_next,
2212 .stop = ext4_mb_seq_history_stop,
2213 .show = ext4_mb_seq_history_show,
2214};
2215
2216static int ext4_mb_seq_history_open(struct inode *inode, struct file *file)
2217{
2218 struct super_block *sb = PDE(inode)->data;
2219 struct ext4_sb_info *sbi = EXT4_SB(sb);
2220 struct ext4_mb_proc_session *s;
2221 int rc;
2222 int size;
2223
2224 if (unlikely(sbi->s_mb_history == NULL))
2225 return -ENOMEM;
2226 s = kmalloc(sizeof(*s), GFP_KERNEL);
2227 if (s == NULL)
2228 return -ENOMEM;
2229 s->sb = sb;
2230 size = sizeof(struct ext4_mb_history) * sbi->s_mb_history_max;
2231 s->history = kmalloc(size, GFP_KERNEL);
2232 if (s->history == NULL) {
2233 kfree(s);
2234 return -ENOMEM;
2235 }
2236
2237 spin_lock(&sbi->s_mb_history_lock);
2238 memcpy(s->history, sbi->s_mb_history, size);
2239 s->max = sbi->s_mb_history_max;
2240 s->start = sbi->s_mb_history_cur % s->max;
2241 spin_unlock(&sbi->s_mb_history_lock);
2242
2243 rc = seq_open(file, &ext4_mb_seq_history_ops);
2244 if (rc == 0) {
2245 struct seq_file *m = (struct seq_file *)file->private_data;
2246 m->private = s;
2247 } else {
2248 kfree(s->history);
2249 kfree(s);
2250 }
2251 return rc;
2252
2253}
2254
2255static int ext4_mb_seq_history_release(struct inode *inode, struct file *file)
2256{
2257 struct seq_file *seq = (struct seq_file *)file->private_data;
2258 struct ext4_mb_proc_session *s = seq->private;
2259 kfree(s->history);
2260 kfree(s);
2261 return seq_release(inode, file);
2262}
2263
2264static ssize_t ext4_mb_seq_history_write(struct file *file,
2265 const char __user *buffer,
2266 size_t count, loff_t *ppos)
2267{
2268 struct seq_file *seq = (struct seq_file *)file->private_data;
2269 struct ext4_mb_proc_session *s = seq->private;
2270 struct super_block *sb = s->sb;
2271 char str[32];
2272 int value;
2273
2274 if (count >= sizeof(str)) {
2275 printk(KERN_ERR "EXT4-fs: %s string too long, max %u bytes\n",
2276 "mb_history", (int)sizeof(str));
2277 return -EOVERFLOW;
2278 }
2279
2280 if (copy_from_user(str, buffer, count))
2281 return -EFAULT;
2282
2283 value = simple_strtol(str, NULL, 0);
2284 if (value < 0)
2285 return -ERANGE;
2286 EXT4_SB(sb)->s_mb_history_filter = value;
2287
2288 return count;
2289}
2290
2291static const struct file_operations ext4_mb_seq_history_fops = {
2292 .owner = THIS_MODULE,
2293 .open = ext4_mb_seq_history_open,
2294 .read = seq_read,
2295 .write = ext4_mb_seq_history_write,
2296 .llseek = seq_lseek,
2297 .release = ext4_mb_seq_history_release,
2298};
2299
2300static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) 2099static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
2301{ 2100{
2302 struct super_block *sb = seq->private; 2101 struct super_block *sb = seq->private;
@@ -2396,82 +2195,6 @@ static const struct file_operations ext4_mb_seq_groups_fops = {
2396 .release = seq_release, 2195 .release = seq_release,
2397}; 2196};
2398 2197
2399static void ext4_mb_history_release(struct super_block *sb)
2400{
2401 struct ext4_sb_info *sbi = EXT4_SB(sb);
2402
2403 if (sbi->s_proc != NULL) {
2404 remove_proc_entry("mb_groups", sbi->s_proc);
2405 if (sbi->s_mb_history_max)
2406 remove_proc_entry("mb_history", sbi->s_proc);
2407 }
2408 kfree(sbi->s_mb_history);
2409}
2410
2411static void ext4_mb_history_init(struct super_block *sb)
2412{
2413 struct ext4_sb_info *sbi = EXT4_SB(sb);
2414 int i;
2415
2416 if (sbi->s_proc != NULL) {
2417 if (sbi->s_mb_history_max)
2418 proc_create_data("mb_history", S_IRUGO, sbi->s_proc,
2419 &ext4_mb_seq_history_fops, sb);
2420 proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
2421 &ext4_mb_seq_groups_fops, sb);
2422 }
2423
2424 sbi->s_mb_history_cur = 0;
2425 spin_lock_init(&sbi->s_mb_history_lock);
2426 i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history);
2427 sbi->s_mb_history = i ? kzalloc(i, GFP_KERNEL) : NULL;
2428 /* if we can't allocate history, then we simple won't use it */
2429}
2430
2431static noinline_for_stack void
2432ext4_mb_store_history(struct ext4_allocation_context *ac)
2433{
2434 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
2435 struct ext4_mb_history h;
2436
2437 if (sbi->s_mb_history == NULL)
2438 return;
2439
2440 if (!(ac->ac_op & sbi->s_mb_history_filter))
2441 return;
2442
2443 h.op = ac->ac_op;
2444 h.pid = current->pid;
2445 h.ino = ac->ac_inode ? ac->ac_inode->i_ino : 0;
2446 h.orig = ac->ac_o_ex;
2447 h.result = ac->ac_b_ex;
2448 h.flags = ac->ac_flags;
2449 h.found = ac->ac_found;
2450 h.groups = ac->ac_groups_scanned;
2451 h.cr = ac->ac_criteria;
2452 h.tail = ac->ac_tail;
2453 h.buddy = ac->ac_buddy;
2454 h.merged = 0;
2455 if (ac->ac_op == EXT4_MB_HISTORY_ALLOC) {
2456 if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
2457 ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
2458 h.merged = 1;
2459 h.goal = ac->ac_g_ex;
2460 h.result = ac->ac_f_ex;
2461 }
2462
2463 spin_lock(&sbi->s_mb_history_lock);
2464 memcpy(sbi->s_mb_history + sbi->s_mb_history_cur, &h, sizeof(h));
2465 if (++sbi->s_mb_history_cur >= sbi->s_mb_history_max)
2466 sbi->s_mb_history_cur = 0;
2467 spin_unlock(&sbi->s_mb_history_lock);
2468}
2469
2470#else
2471#define ext4_mb_history_release(sb)
2472#define ext4_mb_history_init(sb)
2473#endif
2474
2475 2198
2476/* Create and initialize ext4_group_info data for the given group. */ 2199/* Create and initialize ext4_group_info data for the given group. */
2477int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, 2200int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
@@ -2690,7 +2413,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2690 sbi->s_mb_stats = MB_DEFAULT_STATS; 2413 sbi->s_mb_stats = MB_DEFAULT_STATS;
2691 sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; 2414 sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
2692 sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; 2415 sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
2693 sbi->s_mb_history_filter = EXT4_MB_HISTORY_DEFAULT;
2694 sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC; 2416 sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC;
2695 2417
2696 sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); 2418 sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
@@ -2708,12 +2430,12 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2708 spin_lock_init(&lg->lg_prealloc_lock); 2430 spin_lock_init(&lg->lg_prealloc_lock);
2709 } 2431 }
2710 2432
2711 ext4_mb_history_init(sb); 2433 if (sbi->s_proc)
2434 proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
2435 &ext4_mb_seq_groups_fops, sb);
2712 2436
2713 if (sbi->s_journal) 2437 if (sbi->s_journal)
2714 sbi->s_journal->j_commit_callback = release_blocks_on_commit; 2438 sbi->s_journal->j_commit_callback = release_blocks_on_commit;
2715
2716 printk(KERN_INFO "EXT4-fs: mballoc enabled\n");
2717 return 0; 2439 return 0;
2718} 2440}
2719 2441
@@ -2790,7 +2512,8 @@ int ext4_mb_release(struct super_block *sb)
2790 } 2512 }
2791 2513
2792 free_percpu(sbi->s_locality_groups); 2514 free_percpu(sbi->s_locality_groups);
2793 ext4_mb_history_release(sb); 2515 if (sbi->s_proc)
2516 remove_proc_entry("mb_groups", sbi->s_proc);
2794 2517
2795 return 0; 2518 return 0;
2796} 2519}
@@ -3276,7 +2999,10 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
3276 atomic_inc(&sbi->s_bal_breaks); 2999 atomic_inc(&sbi->s_bal_breaks);
3277 } 3000 }
3278 3001
3279 ext4_mb_store_history(ac); 3002 if (ac->ac_op == EXT4_MB_HISTORY_ALLOC)
3003 trace_ext4_mballoc_alloc(ac);
3004 else
3005 trace_ext4_mballoc_prealloc(ac);
3280} 3006}
3281 3007
3282/* 3008/*
@@ -3776,7 +3502,6 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3776 if (ac) { 3502 if (ac) {
3777 ac->ac_sb = sb; 3503 ac->ac_sb = sb;
3778 ac->ac_inode = pa->pa_inode; 3504 ac->ac_inode = pa->pa_inode;
3779 ac->ac_op = EXT4_MB_HISTORY_DISCARD;
3780 } 3505 }
3781 3506
3782 while (bit < end) { 3507 while (bit < end) {
@@ -3796,7 +3521,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3796 ac->ac_b_ex.fe_start = bit; 3521 ac->ac_b_ex.fe_start = bit;
3797 ac->ac_b_ex.fe_len = next - bit; 3522 ac->ac_b_ex.fe_len = next - bit;
3798 ac->ac_b_ex.fe_logical = 0; 3523 ac->ac_b_ex.fe_logical = 0;
3799 ext4_mb_store_history(ac); 3524 trace_ext4_mballoc_discard(ac);
3800 } 3525 }
3801 3526
3802 trace_ext4_mb_release_inode_pa(ac, pa, grp_blk_start + bit, 3527 trace_ext4_mb_release_inode_pa(ac, pa, grp_blk_start + bit,
@@ -3831,9 +3556,6 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b,
3831 ext4_group_t group; 3556 ext4_group_t group;
3832 ext4_grpblk_t bit; 3557 ext4_grpblk_t bit;
3833 3558
3834 if (ac)
3835 ac->ac_op = EXT4_MB_HISTORY_DISCARD;
3836
3837 trace_ext4_mb_release_group_pa(ac, pa); 3559 trace_ext4_mb_release_group_pa(ac, pa);
3838 BUG_ON(pa->pa_deleted == 0); 3560 BUG_ON(pa->pa_deleted == 0);
3839 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); 3561 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
@@ -3848,7 +3570,7 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b,
3848 ac->ac_b_ex.fe_start = bit; 3570 ac->ac_b_ex.fe_start = bit;
3849 ac->ac_b_ex.fe_len = pa->pa_len; 3571 ac->ac_b_ex.fe_len = pa->pa_len;
3850 ac->ac_b_ex.fe_logical = 0; 3572 ac->ac_b_ex.fe_logical = 0;
3851 ext4_mb_store_history(ac); 3573 trace_ext4_mballoc_discard(ac);
3852 } 3574 }
3853 3575
3854 return 0; 3576 return 0;
@@ -4189,7 +3911,6 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
4189 size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; 3911 size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
4190 isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1) 3912 isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
4191 >> bsbits; 3913 >> bsbits;
4192 size = max(size, isize);
4193 3914
4194 if ((size == isize) && 3915 if ((size == isize) &&
4195 !ext4_fs_is_busy(sbi) && 3916 !ext4_fs_is_busy(sbi) &&
@@ -4199,6 +3920,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
4199 } 3920 }
4200 3921
4201 /* don't use group allocation for large files */ 3922 /* don't use group allocation for large files */
3923 size = max(size, isize);
4202 if (size >= sbi->s_mb_stream_request) { 3924 if (size >= sbi->s_mb_stream_request) {
4203 ac->ac_flags |= EXT4_MB_STREAM_ALLOC; 3925 ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
4204 return; 3926 return;
@@ -4739,7 +4461,6 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
4739 4461
4740 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); 4462 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
4741 if (ac) { 4463 if (ac) {
4742 ac->ac_op = EXT4_MB_HISTORY_FREE;
4743 ac->ac_inode = inode; 4464 ac->ac_inode = inode;
4744 ac->ac_sb = sb; 4465 ac->ac_sb = sb;
4745 } 4466 }
@@ -4806,7 +4527,7 @@ do_more:
4806 ac->ac_b_ex.fe_group = block_group; 4527 ac->ac_b_ex.fe_group = block_group;
4807 ac->ac_b_ex.fe_start = bit; 4528 ac->ac_b_ex.fe_start = bit;
4808 ac->ac_b_ex.fe_len = count; 4529 ac->ac_b_ex.fe_len = count;
4809 ext4_mb_store_history(ac); 4530 trace_ext4_mballoc_free(ac);
4810 } 4531 }
4811 4532
4812 err = ext4_mb_load_buddy(sb, block_group, &e4b); 4533 err = ext4_mb_load_buddy(sb, block_group, &e4b);
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 188d3d709b24..0ca811061bc7 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -52,18 +52,8 @@ extern u8 mb_enable_debug;
52#define mb_debug(n, fmt, a...) 52#define mb_debug(n, fmt, a...)
53#endif 53#endif
54 54
55/*
56 * with EXT4_MB_HISTORY mballoc stores last N allocations in memory
57 * and you can monitor it in /proc/fs/ext4/<dev>/mb_history
58 */
59#define EXT4_MB_HISTORY
60#define EXT4_MB_HISTORY_ALLOC 1 /* allocation */ 55#define EXT4_MB_HISTORY_ALLOC 1 /* allocation */
61#define EXT4_MB_HISTORY_PREALLOC 2 /* preallocated blocks used */ 56#define EXT4_MB_HISTORY_PREALLOC 2 /* preallocated blocks used */
62#define EXT4_MB_HISTORY_DISCARD 4 /* preallocation discarded */
63#define EXT4_MB_HISTORY_FREE 8 /* free */
64
65#define EXT4_MB_HISTORY_DEFAULT (EXT4_MB_HISTORY_ALLOC | \
66 EXT4_MB_HISTORY_PREALLOC)
67 57
68/* 58/*
69 * How long mballoc can look for a best extent (in found extents) 59 * How long mballoc can look for a best extent (in found extents)
@@ -84,7 +74,7 @@ extern u8 mb_enable_debug;
84 * with 'ext4_mb_stats' allocator will collect stats that will be 74 * with 'ext4_mb_stats' allocator will collect stats that will be
85 * shown at umount. The collecting costs though! 75 * shown at umount. The collecting costs though!
86 */ 76 */
87#define MB_DEFAULT_STATS 1 77#define MB_DEFAULT_STATS 0
88 78
89/* 79/*
90 * files smaller than MB_DEFAULT_STREAM_THRESHOLD are served 80 * files smaller than MB_DEFAULT_STREAM_THRESHOLD are served
@@ -217,22 +207,6 @@ struct ext4_allocation_context {
217#define AC_STATUS_FOUND 2 207#define AC_STATUS_FOUND 2
218#define AC_STATUS_BREAK 3 208#define AC_STATUS_BREAK 3
219 209
220struct ext4_mb_history {
221 struct ext4_free_extent orig; /* orig allocation */
222 struct ext4_free_extent goal; /* goal allocation */
223 struct ext4_free_extent result; /* result allocation */
224 unsigned pid;
225 unsigned ino;
226 __u16 found; /* how many extents have been found */
227 __u16 groups; /* how many groups have been scanned */
228 __u16 tail; /* what tail broke some buddy */
229 __u16 buddy; /* buddy the tail ^^^ broke */
230 __u16 flags;
231 __u8 cr:3; /* which phase the result extent was found at */
232 __u8 op:4;
233 __u8 merged:1;
234};
235
236struct ext4_buddy { 210struct ext4_buddy {
237 struct page *bd_buddy_page; 211 struct page *bd_buddy_page;
238 void *bd_buddy; 212 void *bd_buddy;
@@ -247,13 +221,6 @@ struct ext4_buddy {
247#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap) 221#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap)
248#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy) 222#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy)
249 223
250#ifndef EXT4_MB_HISTORY
251static inline void ext4_mb_store_history(struct ext4_allocation_context *ac)
252{
253 return;
254}
255#endif
256
257#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) 224#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
258 225
259static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, 226static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index bf519f239ae6..a93d5b80f3e2 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -75,7 +75,7 @@ static int finish_range(handle_t *handle, struct inode *inode,
75 goto err_out; 75 goto err_out;
76 } 76 }
77 } 77 }
78 retval = ext4_ext_insert_extent(handle, inode, path, &newext); 78 retval = ext4_ext_insert_extent(handle, inode, path, &newext, 0);
79err_out: 79err_out:
80 if (path) { 80 if (path) {
81 ext4_ext_drop_refs(path); 81 ext4_ext_drop_refs(path);
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index c07a2915e40b..25b6b1457360 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -322,7 +322,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
322 goto out; 322 goto out;
323 323
324 if (ext4_ext_insert_extent(handle, orig_inode, 324 if (ext4_ext_insert_extent(handle, orig_inode,
325 orig_path, new_ext)) 325 orig_path, new_ext, 0))
326 goto out; 326 goto out;
327 } 327 }
328 328
@@ -333,7 +333,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
333 goto out; 333 goto out;
334 334
335 if (ext4_ext_insert_extent(handle, orig_inode, 335 if (ext4_ext_insert_extent(handle, orig_inode,
336 orig_path, end_ext)) 336 orig_path, end_ext, 0))
337 goto out; 337 goto out;
338 } 338 }
339out: 339out:
@@ -1001,14 +1001,6 @@ mext_check_arguments(struct inode *orig_inode,
1001 return -EINVAL; 1001 return -EINVAL;
1002 } 1002 }
1003 1003
1004 /* orig and donor should be different file */
1005 if (orig_inode->i_ino == donor_inode->i_ino) {
1006 ext4_debug("ext4 move extent: The argument files should not "
1007 "be same file [ino:orig %lu, donor %lu]\n",
1008 orig_inode->i_ino, donor_inode->i_ino);
1009 return -EINVAL;
1010 }
1011
1012 /* Ext4 move extent supports only extent based file */ 1004 /* Ext4 move extent supports only extent based file */
1013 if (!(EXT4_I(orig_inode)->i_flags & EXT4_EXTENTS_FL)) { 1005 if (!(EXT4_I(orig_inode)->i_flags & EXT4_EXTENTS_FL)) {
1014 ext4_debug("ext4 move extent: orig file is not extents " 1006 ext4_debug("ext4 move extent: orig file is not extents "
@@ -1232,6 +1224,14 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1232 int block_len_in_page; 1224 int block_len_in_page;
1233 int uninit; 1225 int uninit;
1234 1226
1227 /* orig and donor should be different file */
1228 if (orig_inode->i_ino == donor_inode->i_ino) {
1229 ext4_debug("ext4 move extent: The argument files should not "
1230 "be same file [ino:orig %lu, donor %lu]\n",
1231 orig_inode->i_ino, donor_inode->i_ino);
1232 return -EINVAL;
1233 }
1234
1235 /* protect orig and donor against a truncate */ 1235 /* protect orig and donor against a truncate */
1236 ret1 = mext_inode_double_lock(orig_inode, donor_inode); 1236 ret1 = mext_inode_double_lock(orig_inode, donor_inode);
1237 if (ret1 < 0) 1237 if (ret1 < 0)
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 42f81d285cd5..7c8fe80bacdd 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2076,7 +2076,8 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
2076 struct ext4_iloc iloc; 2076 struct ext4_iloc iloc;
2077 int err = 0; 2077 int err = 0;
2078 2078
2079 if (!ext4_handle_valid(handle)) 2079 /* ext4_handle_valid() assumes a valid handle_t pointer */
2080 if (handle && !ext4_handle_valid(handle))
2080 return 0; 2081 return 0;
2081 2082
2082 mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock); 2083 mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index df539ba27779..312211ee05af 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -50,13 +50,6 @@
50#define CREATE_TRACE_POINTS 50#define CREATE_TRACE_POINTS
51#include <trace/events/ext4.h> 51#include <trace/events/ext4.h>
52 52
53static int default_mb_history_length = 1000;
54
55module_param_named(default_mb_history_length, default_mb_history_length,
56 int, 0644);
57MODULE_PARM_DESC(default_mb_history_length,
58 "Default number of entries saved for mb_history");
59
60struct proc_dir_entry *ext4_proc_root; 53struct proc_dir_entry *ext4_proc_root;
61static struct kset *ext4_kset; 54static struct kset *ext4_kset;
62 55
@@ -189,6 +182,36 @@ void ext4_itable_unused_set(struct super_block *sb,
189 bg->bg_itable_unused_hi = cpu_to_le16(count >> 16); 182 bg->bg_itable_unused_hi = cpu_to_le16(count >> 16);
190} 183}
191 184
185
186/* Just increment the non-pointer handle value */
187static handle_t *ext4_get_nojournal(void)
188{
189 handle_t *handle = current->journal_info;
190 unsigned long ref_cnt = (unsigned long)handle;
191
192 BUG_ON(ref_cnt >= EXT4_NOJOURNAL_MAX_REF_COUNT);
193
194 ref_cnt++;
195 handle = (handle_t *)ref_cnt;
196
197 current->journal_info = handle;
198 return handle;
199}
200
201
202/* Decrement the non-pointer handle value */
203static void ext4_put_nojournal(handle_t *handle)
204{
205 unsigned long ref_cnt = (unsigned long)handle;
206
207 BUG_ON(ref_cnt == 0);
208
209 ref_cnt--;
210 handle = (handle_t *)ref_cnt;
211
212 current->journal_info = handle;
213}
214
192/* 215/*
193 * Wrappers for jbd2_journal_start/end. 216 * Wrappers for jbd2_journal_start/end.
194 * 217 *
@@ -215,11 +238,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
215 } 238 }
216 return jbd2_journal_start(journal, nblocks); 239 return jbd2_journal_start(journal, nblocks);
217 } 240 }
218 /* 241 return ext4_get_nojournal();
219 * We're not journaling, return the appropriate indication.
220 */
221 current->journal_info = EXT4_NOJOURNAL_HANDLE;
222 return current->journal_info;
223} 242}
224 243
225/* 244/*
@@ -235,11 +254,7 @@ int __ext4_journal_stop(const char *where, handle_t *handle)
235 int rc; 254 int rc;
236 255
237 if (!ext4_handle_valid(handle)) { 256 if (!ext4_handle_valid(handle)) {
238 /* 257 ext4_put_nojournal(handle);
239 * Do this here since we don't call jbd2_journal_stop() in
240 * no-journal mode.
241 */
242 current->journal_info = NULL;
243 return 0; 258 return 0;
244 } 259 }
245 sb = handle->h_transaction->t_journal->j_private; 260 sb = handle->h_transaction->t_journal->j_private;
@@ -580,6 +595,9 @@ static void ext4_put_super(struct super_block *sb)
580 struct ext4_super_block *es = sbi->s_es; 595 struct ext4_super_block *es = sbi->s_es;
581 int i, err; 596 int i, err;
582 597
598 flush_workqueue(sbi->dio_unwritten_wq);
599 destroy_workqueue(sbi->dio_unwritten_wq);
600
583 lock_super(sb); 601 lock_super(sb);
584 lock_kernel(); 602 lock_kernel();
585 if (sb->s_dirt) 603 if (sb->s_dirt)
@@ -684,6 +702,8 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
684 ei->i_allocated_meta_blocks = 0; 702 ei->i_allocated_meta_blocks = 0;
685 ei->i_delalloc_reserved_flag = 0; 703 ei->i_delalloc_reserved_flag = 0;
686 spin_lock_init(&(ei->i_block_reservation_lock)); 704 spin_lock_init(&(ei->i_block_reservation_lock));
705 INIT_LIST_HEAD(&ei->i_aio_dio_complete_list);
706 ei->cur_aio_dio = NULL;
687 707
688 return &ei->vfs_inode; 708 return &ei->vfs_inode;
689} 709}
@@ -1052,7 +1072,7 @@ enum {
1052 Opt_journal_update, Opt_journal_dev, 1072 Opt_journal_update, Opt_journal_dev,
1053 Opt_journal_checksum, Opt_journal_async_commit, 1073 Opt_journal_checksum, Opt_journal_async_commit,
1054 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, 1074 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
1055 Opt_data_err_abort, Opt_data_err_ignore, Opt_mb_history_length, 1075 Opt_data_err_abort, Opt_data_err_ignore,
1056 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, 1076 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
1057 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, 1077 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
1058 Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize, 1078 Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize,
@@ -1099,7 +1119,6 @@ static const match_table_t tokens = {
1099 {Opt_data_writeback, "data=writeback"}, 1119 {Opt_data_writeback, "data=writeback"},
1100 {Opt_data_err_abort, "data_err=abort"}, 1120 {Opt_data_err_abort, "data_err=abort"},
1101 {Opt_data_err_ignore, "data_err=ignore"}, 1121 {Opt_data_err_ignore, "data_err=ignore"},
1102 {Opt_mb_history_length, "mb_history_length=%u"},
1103 {Opt_offusrjquota, "usrjquota="}, 1122 {Opt_offusrjquota, "usrjquota="},
1104 {Opt_usrjquota, "usrjquota=%s"}, 1123 {Opt_usrjquota, "usrjquota=%s"},
1105 {Opt_offgrpjquota, "grpjquota="}, 1124 {Opt_offgrpjquota, "grpjquota="},
@@ -1340,13 +1359,6 @@ static int parse_options(char *options, struct super_block *sb,
1340 case Opt_data_err_ignore: 1359 case Opt_data_err_ignore:
1341 clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT); 1360 clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
1342 break; 1361 break;
1343 case Opt_mb_history_length:
1344 if (match_int(&args[0], &option))
1345 return 0;
1346 if (option < 0)
1347 return 0;
1348 sbi->s_mb_history_max = option;
1349 break;
1350#ifdef CONFIG_QUOTA 1362#ifdef CONFIG_QUOTA
1351 case Opt_usrjquota: 1363 case Opt_usrjquota:
1352 qtype = USRQUOTA; 1364 qtype = USRQUOTA;
@@ -1646,13 +1658,6 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
1646 EXT4_INODES_PER_GROUP(sb), 1658 EXT4_INODES_PER_GROUP(sb),
1647 sbi->s_mount_opt); 1659 sbi->s_mount_opt);
1648 1660
1649 if (EXT4_SB(sb)->s_journal) {
1650 ext4_msg(sb, KERN_INFO, "%s journal on %s",
1651 EXT4_SB(sb)->s_journal->j_inode ? "internal" :
1652 "external", EXT4_SB(sb)->s_journal->j_devname);
1653 } else {
1654 ext4_msg(sb, KERN_INFO, "no journal");
1655 }
1656 return res; 1661 return res;
1657} 1662}
1658 1663
@@ -2197,6 +2202,7 @@ EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
2197EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); 2202EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
2198EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); 2203EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
2199EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); 2204EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
2205EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump);
2200 2206
2201static struct attribute *ext4_attrs[] = { 2207static struct attribute *ext4_attrs[] = {
2202 ATTR_LIST(delayed_allocation_blocks), 2208 ATTR_LIST(delayed_allocation_blocks),
@@ -2210,6 +2216,7 @@ static struct attribute *ext4_attrs[] = {
2210 ATTR_LIST(mb_order2_req), 2216 ATTR_LIST(mb_order2_req),
2211 ATTR_LIST(mb_stream_req), 2217 ATTR_LIST(mb_stream_req),
2212 ATTR_LIST(mb_group_prealloc), 2218 ATTR_LIST(mb_group_prealloc),
2219 ATTR_LIST(max_writeback_mb_bump),
2213 NULL, 2220 NULL,
2214}; 2221};
2215 2222
@@ -2413,7 +2420,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2413 sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ; 2420 sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
2414 sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; 2421 sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
2415 sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; 2422 sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
2416 sbi->s_mb_history_max = default_mb_history_length;
2417 2423
2418 set_opt(sbi->s_mount_opt, BARRIER); 2424 set_opt(sbi->s_mount_opt, BARRIER);
2419 2425
@@ -2679,6 +2685,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2679 } 2685 }
2680 2686
2681 sbi->s_stripe = ext4_get_stripe_size(sbi); 2687 sbi->s_stripe = ext4_get_stripe_size(sbi);
2688 sbi->s_max_writeback_mb_bump = 128;
2682 2689
2683 /* 2690 /*
2684 * set up enough so that it can read an inode 2691 * set up enough so that it can read an inode
@@ -2798,6 +2805,12 @@ no_journal:
2798 clear_opt(sbi->s_mount_opt, NOBH); 2805 clear_opt(sbi->s_mount_opt, NOBH);
2799 } 2806 }
2800 } 2807 }
2808 EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten");
2809 if (!EXT4_SB(sb)->dio_unwritten_wq) {
2810 printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
2811 goto failed_mount_wq;
2812 }
2813
2801 /* 2814 /*
2802 * The jbd2_journal_load will have done any necessary log recovery, 2815 * The jbd2_journal_load will have done any necessary log recovery,
2803 * so we can safely mount the rest of the filesystem now. 2816 * so we can safely mount the rest of the filesystem now.
@@ -2849,12 +2862,12 @@ no_journal:
2849 "available"); 2862 "available");
2850 } 2863 }
2851 2864
2852 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { 2865 if (test_opt(sb, DELALLOC) &&
2866 (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) {
2853 ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - " 2867 ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - "
2854 "requested data journaling mode"); 2868 "requested data journaling mode");
2855 clear_opt(sbi->s_mount_opt, DELALLOC); 2869 clear_opt(sbi->s_mount_opt, DELALLOC);
2856 } else if (test_opt(sb, DELALLOC)) 2870 }
2857 ext4_msg(sb, KERN_INFO, "delayed allocation enabled");
2858 2871
2859 err = ext4_setup_system_zone(sb); 2872 err = ext4_setup_system_zone(sb);
2860 if (err) { 2873 if (err) {
@@ -2910,6 +2923,8 @@ cantfind_ext4:
2910 2923
2911failed_mount4: 2924failed_mount4:
2912 ext4_msg(sb, KERN_ERR, "mount failed"); 2925 ext4_msg(sb, KERN_ERR, "mount failed");
2926 destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq);
2927failed_mount_wq:
2913 ext4_release_system_zone(sb); 2928 ext4_release_system_zone(sb);
2914 if (sbi->s_journal) { 2929 if (sbi->s_journal) {
2915 jbd2_journal_destroy(sbi->s_journal); 2930 jbd2_journal_destroy(sbi->s_journal);
@@ -3164,9 +3179,7 @@ static int ext4_load_journal(struct super_block *sb,
3164 return -EINVAL; 3179 return -EINVAL;
3165 } 3180 }
3166 3181
3167 if (journal->j_flags & JBD2_BARRIER) 3182 if (!(journal->j_flags & JBD2_BARRIER))
3168 ext4_msg(sb, KERN_INFO, "barriers enabled");
3169 else
3170 ext4_msg(sb, KERN_INFO, "barriers disabled"); 3183 ext4_msg(sb, KERN_INFO, "barriers disabled");
3171 3184
3172 if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) { 3185 if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
@@ -3361,11 +3374,13 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
3361{ 3374{
3362 int ret = 0; 3375 int ret = 0;
3363 tid_t target; 3376 tid_t target;
3377 struct ext4_sb_info *sbi = EXT4_SB(sb);
3364 3378
3365 trace_ext4_sync_fs(sb, wait); 3379 trace_ext4_sync_fs(sb, wait);
3366 if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) { 3380 flush_workqueue(sbi->dio_unwritten_wq);
3381 if (jbd2_journal_start_commit(sbi->s_journal, &target)) {
3367 if (wait) 3382 if (wait)
3368 jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, target); 3383 jbd2_log_wait_commit(sbi->s_journal, target);
3369 } 3384 }
3370 return ret; 3385 return ret;
3371} 3386}
@@ -3951,27 +3966,6 @@ static struct file_system_type ext4_fs_type = {
3951 .fs_flags = FS_REQUIRES_DEV, 3966 .fs_flags = FS_REQUIRES_DEV,
3952}; 3967};
3953 3968
3954#ifdef CONFIG_EXT4DEV_COMPAT
3955static int ext4dev_get_sb(struct file_system_type *fs_type, int flags,
3956 const char *dev_name, void *data,struct vfsmount *mnt)
3957{
3958 printk(KERN_WARNING "EXT4-fs (%s): Update your userspace programs "
3959 "to mount using ext4\n", dev_name);
3960 printk(KERN_WARNING "EXT4-fs (%s): ext4dev backwards compatibility "
3961 "will go away by 2.6.31\n", dev_name);
3962 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt);
3963}
3964
3965static struct file_system_type ext4dev_fs_type = {
3966 .owner = THIS_MODULE,
3967 .name = "ext4dev",
3968 .get_sb = ext4dev_get_sb,
3969 .kill_sb = kill_block_super,
3970 .fs_flags = FS_REQUIRES_DEV,
3971};
3972MODULE_ALIAS("ext4dev");
3973#endif
3974
3975static int __init init_ext4_fs(void) 3969static int __init init_ext4_fs(void)
3976{ 3970{
3977 int err; 3971 int err;
@@ -3996,13 +3990,6 @@ static int __init init_ext4_fs(void)
3996 err = register_filesystem(&ext4_fs_type); 3990 err = register_filesystem(&ext4_fs_type);
3997 if (err) 3991 if (err)
3998 goto out; 3992 goto out;
3999#ifdef CONFIG_EXT4DEV_COMPAT
4000 err = register_filesystem(&ext4dev_fs_type);
4001 if (err) {
4002 unregister_filesystem(&ext4_fs_type);
4003 goto out;
4004 }
4005#endif
4006 return 0; 3993 return 0;
4007out: 3994out:
4008 destroy_inodecache(); 3995 destroy_inodecache();
@@ -4021,9 +4008,6 @@ out4:
4021static void __exit exit_ext4_fs(void) 4008static void __exit exit_ext4_fs(void)
4022{ 4009{
4023 unregister_filesystem(&ext4_fs_type); 4010 unregister_filesystem(&ext4_fs_type);
4024#ifdef CONFIG_EXT4DEV_COMPAT
4025 unregister_filesystem(&ext4dev_fs_type);
4026#endif
4027 destroy_inodecache(); 4011 destroy_inodecache();
4028 exit_ext4_xattr(); 4012 exit_ext4_xattr();
4029 exit_ext4_mballoc(); 4013 exit_ext4_mballoc();
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index adb0e72a176d..7db0979c6b72 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -323,7 +323,7 @@ extern int fat_flush_inodes(struct super_block *sb, struct inode *i1,
323/* fat/misc.c */ 323/* fat/misc.c */
324extern void fat_fs_error(struct super_block *s, const char *fmt, ...) 324extern void fat_fs_error(struct super_block *s, const char *fmt, ...)
325 __attribute__ ((format (printf, 2, 3))) __cold; 325 __attribute__ ((format (printf, 2, 3))) __cold;
326extern void fat_clusters_flush(struct super_block *sb); 326extern int fat_clusters_flush(struct super_block *sb);
327extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster); 327extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster);
328extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts, 328extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts,
329 __le16 __time, __le16 __date, u8 time_cs); 329 __le16 __time, __le16 __date, u8 time_cs);
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 04629d1302fc..76b7961ab663 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -451,12 +451,16 @@ static void fat_write_super(struct super_block *sb)
451 451
452static int fat_sync_fs(struct super_block *sb, int wait) 452static int fat_sync_fs(struct super_block *sb, int wait)
453{ 453{
454 lock_super(sb); 454 int err = 0;
455 fat_clusters_flush(sb);
456 sb->s_dirt = 0;
457 unlock_super(sb);
458 455
459 return 0; 456 if (sb->s_dirt) {
457 lock_super(sb);
458 sb->s_dirt = 0;
459 err = fat_clusters_flush(sb);
460 unlock_super(sb);
461 }
462
463 return err;
460} 464}
461 465
462static void fat_put_super(struct super_block *sb) 466static void fat_put_super(struct super_block *sb)
@@ -812,7 +816,7 @@ static int fat_show_options(struct seq_file *m, struct vfsmount *mnt)
812 seq_puts(m, ",shortname=mixed"); 816 seq_puts(m, ",shortname=mixed");
813 break; 817 break;
814 case VFAT_SFN_DISPLAY_LOWER | VFAT_SFN_CREATE_WIN95: 818 case VFAT_SFN_DISPLAY_LOWER | VFAT_SFN_CREATE_WIN95:
815 /* seq_puts(m, ",shortname=lower"); */ 819 seq_puts(m, ",shortname=lower");
816 break; 820 break;
817 default: 821 default:
818 seq_puts(m, ",shortname=unknown"); 822 seq_puts(m, ",shortname=unknown");
@@ -963,7 +967,7 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug,
963 opts->codepage = fat_default_codepage; 967 opts->codepage = fat_default_codepage;
964 opts->iocharset = fat_default_iocharset; 968 opts->iocharset = fat_default_iocharset;
965 if (is_vfat) { 969 if (is_vfat) {
966 opts->shortname = VFAT_SFN_DISPLAY_LOWER|VFAT_SFN_CREATE_WIN95; 970 opts->shortname = VFAT_SFN_DISPLAY_WINNT|VFAT_SFN_CREATE_WIN95;
967 opts->rodir = 0; 971 opts->rodir = 0;
968 } else { 972 } else {
969 opts->shortname = 0; 973 opts->shortname = 0;
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index 4e35be873e09..0f55f5cb732f 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -43,19 +43,19 @@ EXPORT_SYMBOL_GPL(fat_fs_error);
43 43
44/* Flushes the number of free clusters on FAT32 */ 44/* Flushes the number of free clusters on FAT32 */
45/* XXX: Need to write one per FSINFO block. Currently only writes 1 */ 45/* XXX: Need to write one per FSINFO block. Currently only writes 1 */
46void fat_clusters_flush(struct super_block *sb) 46int fat_clusters_flush(struct super_block *sb)
47{ 47{
48 struct msdos_sb_info *sbi = MSDOS_SB(sb); 48 struct msdos_sb_info *sbi = MSDOS_SB(sb);
49 struct buffer_head *bh; 49 struct buffer_head *bh;
50 struct fat_boot_fsinfo *fsinfo; 50 struct fat_boot_fsinfo *fsinfo;
51 51
52 if (sbi->fat_bits != 32) 52 if (sbi->fat_bits != 32)
53 return; 53 return 0;
54 54
55 bh = sb_bread(sb, sbi->fsinfo_sector); 55 bh = sb_bread(sb, sbi->fsinfo_sector);
56 if (bh == NULL) { 56 if (bh == NULL) {
57 printk(KERN_ERR "FAT: bread failed in fat_clusters_flush\n"); 57 printk(KERN_ERR "FAT: bread failed in fat_clusters_flush\n");
58 return; 58 return -EIO;
59 } 59 }
60 60
61 fsinfo = (struct fat_boot_fsinfo *)bh->b_data; 61 fsinfo = (struct fat_boot_fsinfo *)bh->b_data;
@@ -74,6 +74,8 @@ void fat_clusters_flush(struct super_block *sb)
74 mark_buffer_dirty(bh); 74 mark_buffer_dirty(bh);
75 } 75 }
76 brelse(bh); 76 brelse(bh);
77
78 return 0;
77} 79}
78 80
79/* 81/*
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index cb6e83557112..f565f24019b5 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -499,17 +499,10 @@ xlate_to_uni(const unsigned char *name, int len, unsigned char *outname,
499 int charlen; 499 int charlen;
500 500
501 if (utf8) { 501 if (utf8) {
502 int name_len = strlen(name); 502 *outlen = utf8s_to_utf16s(name, len, (wchar_t *)outname);
503 503 if (*outlen < 0)
504 *outlen = utf8s_to_utf16s(name, PATH_MAX, (wchar_t *) outname); 504 return *outlen;
505 505 else if (*outlen > 255)
506 /*
507 * We stripped '.'s before and set len appropriately,
508 * but utf8s_to_utf16s doesn't care about len
509 */
510 *outlen -= (name_len - len);
511
512 if (*outlen > 255)
513 return -ENAMETOOLONG; 506 return -ENAMETOOLONG;
514 507
515 op = &outname[*outlen * sizeof(wchar_t)]; 508 op = &outname[*outlen * sizeof(wchar_t)];
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 5d70b3e6d49b..ca0f5eb62b20 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -643,6 +643,7 @@ out:
643 643
644int __jbd2_journal_remove_checkpoint(struct journal_head *jh) 644int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
645{ 645{
646 struct transaction_chp_stats_s *stats;
646 transaction_t *transaction; 647 transaction_t *transaction;
647 journal_t *journal; 648 journal_t *journal;
648 int ret = 0; 649 int ret = 0;
@@ -679,6 +680,12 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
679 680
680 /* OK, that was the last buffer for the transaction: we can now 681 /* OK, that was the last buffer for the transaction: we can now
681 safely remove this transaction from the log */ 682 safely remove this transaction from the log */
683 stats = &transaction->t_chp_stats;
684 if (stats->cs_chp_time)
685 stats->cs_chp_time = jbd2_time_diff(stats->cs_chp_time,
686 jiffies);
687 trace_jbd2_checkpoint_stats(journal->j_fs_dev->bd_dev,
688 transaction->t_tid, stats);
682 689
683 __jbd2_journal_drop_transaction(journal, transaction); 690 __jbd2_journal_drop_transaction(journal, transaction);
684 kfree(transaction); 691 kfree(transaction);
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 26d991ddc1e6..d4cfd6d2779e 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -410,10 +410,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
410 if (commit_transaction->t_synchronous_commit) 410 if (commit_transaction->t_synchronous_commit)
411 write_op = WRITE_SYNC_PLUG; 411 write_op = WRITE_SYNC_PLUG;
412 trace_jbd2_commit_locking(journal, commit_transaction); 412 trace_jbd2_commit_locking(journal, commit_transaction);
413 stats.u.run.rs_wait = commit_transaction->t_max_wait; 413 stats.run.rs_wait = commit_transaction->t_max_wait;
414 stats.u.run.rs_locked = jiffies; 414 stats.run.rs_locked = jiffies;
415 stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start, 415 stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
416 stats.u.run.rs_locked); 416 stats.run.rs_locked);
417 417
418 spin_lock(&commit_transaction->t_handle_lock); 418 spin_lock(&commit_transaction->t_handle_lock);
419 while (commit_transaction->t_updates) { 419 while (commit_transaction->t_updates) {
@@ -486,9 +486,9 @@ void jbd2_journal_commit_transaction(journal_t *journal)
486 jbd2_journal_switch_revoke_table(journal); 486 jbd2_journal_switch_revoke_table(journal);
487 487
488 trace_jbd2_commit_flushing(journal, commit_transaction); 488 trace_jbd2_commit_flushing(journal, commit_transaction);
489 stats.u.run.rs_flushing = jiffies; 489 stats.run.rs_flushing = jiffies;
490 stats.u.run.rs_locked = jbd2_time_diff(stats.u.run.rs_locked, 490 stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
491 stats.u.run.rs_flushing); 491 stats.run.rs_flushing);
492 492
493 commit_transaction->t_state = T_FLUSH; 493 commit_transaction->t_state = T_FLUSH;
494 journal->j_committing_transaction = commit_transaction; 494 journal->j_committing_transaction = commit_transaction;
@@ -523,11 +523,11 @@ void jbd2_journal_commit_transaction(journal_t *journal)
523 spin_unlock(&journal->j_state_lock); 523 spin_unlock(&journal->j_state_lock);
524 524
525 trace_jbd2_commit_logging(journal, commit_transaction); 525 trace_jbd2_commit_logging(journal, commit_transaction);
526 stats.u.run.rs_logging = jiffies; 526 stats.run.rs_logging = jiffies;
527 stats.u.run.rs_flushing = jbd2_time_diff(stats.u.run.rs_flushing, 527 stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
528 stats.u.run.rs_logging); 528 stats.run.rs_logging);
529 stats.u.run.rs_blocks = commit_transaction->t_outstanding_credits; 529 stats.run.rs_blocks = commit_transaction->t_outstanding_credits;
530 stats.u.run.rs_blocks_logged = 0; 530 stats.run.rs_blocks_logged = 0;
531 531
532 J_ASSERT(commit_transaction->t_nr_buffers <= 532 J_ASSERT(commit_transaction->t_nr_buffers <=
533 commit_transaction->t_outstanding_credits); 533 commit_transaction->t_outstanding_credits);
@@ -695,7 +695,7 @@ start_journal_io:
695 submit_bh(write_op, bh); 695 submit_bh(write_op, bh);
696 } 696 }
697 cond_resched(); 697 cond_resched();
698 stats.u.run.rs_blocks_logged += bufs; 698 stats.run.rs_blocks_logged += bufs;
699 699
700 /* Force a new descriptor to be generated next 700 /* Force a new descriptor to be generated next
701 time round the loop. */ 701 time round the loop. */
@@ -988,33 +988,30 @@ restart_loop:
988 J_ASSERT(commit_transaction->t_state == T_COMMIT); 988 J_ASSERT(commit_transaction->t_state == T_COMMIT);
989 989
990 commit_transaction->t_start = jiffies; 990 commit_transaction->t_start = jiffies;
991 stats.u.run.rs_logging = jbd2_time_diff(stats.u.run.rs_logging, 991 stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
992 commit_transaction->t_start); 992 commit_transaction->t_start);
993 993
994 /* 994 /*
995 * File the transaction for history 995 * File the transaction statistics
996 */ 996 */
997 stats.ts_type = JBD2_STATS_RUN;
998 stats.ts_tid = commit_transaction->t_tid; 997 stats.ts_tid = commit_transaction->t_tid;
999 stats.u.run.rs_handle_count = commit_transaction->t_handle_count; 998 stats.run.rs_handle_count = commit_transaction->t_handle_count;
1000 spin_lock(&journal->j_history_lock); 999 trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
1001 memcpy(journal->j_history + journal->j_history_cur, &stats, 1000 commit_transaction->t_tid, &stats.run);
1002 sizeof(stats));
1003 if (++journal->j_history_cur == journal->j_history_max)
1004 journal->j_history_cur = 0;
1005 1001
1006 /* 1002 /*
1007 * Calculate overall stats 1003 * Calculate overall stats
1008 */ 1004 */
1005 spin_lock(&journal->j_history_lock);
1009 journal->j_stats.ts_tid++; 1006 journal->j_stats.ts_tid++;
1010 journal->j_stats.u.run.rs_wait += stats.u.run.rs_wait; 1007 journal->j_stats.run.rs_wait += stats.run.rs_wait;
1011 journal->j_stats.u.run.rs_running += stats.u.run.rs_running; 1008 journal->j_stats.run.rs_running += stats.run.rs_running;
1012 journal->j_stats.u.run.rs_locked += stats.u.run.rs_locked; 1009 journal->j_stats.run.rs_locked += stats.run.rs_locked;
1013 journal->j_stats.u.run.rs_flushing += stats.u.run.rs_flushing; 1010 journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
1014 journal->j_stats.u.run.rs_logging += stats.u.run.rs_logging; 1011 journal->j_stats.run.rs_logging += stats.run.rs_logging;
1015 journal->j_stats.u.run.rs_handle_count += stats.u.run.rs_handle_count; 1012 journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
1016 journal->j_stats.u.run.rs_blocks += stats.u.run.rs_blocks; 1013 journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
1017 journal->j_stats.u.run.rs_blocks_logged += stats.u.run.rs_blocks_logged; 1014 journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
1018 spin_unlock(&journal->j_history_lock); 1015 spin_unlock(&journal->j_history_lock);
1019 1016
1020 commit_transaction->t_state = T_FINISHED; 1017 commit_transaction->t_state = T_FINISHED;
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 53b86e16e5fe..b0ab5219becb 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -136,10 +136,6 @@ static int kjournald2(void *arg)
136 journal->j_task = current; 136 journal->j_task = current;
137 wake_up(&journal->j_wait_done_commit); 137 wake_up(&journal->j_wait_done_commit);
138 138
139 printk(KERN_INFO "kjournald2 starting: pid %d, dev %s, "
140 "commit interval %ld seconds\n", current->pid,
141 journal->j_devname, journal->j_commit_interval / HZ);
142
143 /* 139 /*
144 * And now, wait forever for commit wakeup events. 140 * And now, wait forever for commit wakeup events.
145 */ 141 */
@@ -223,7 +219,8 @@ static int jbd2_journal_start_thread(journal_t *journal)
223{ 219{
224 struct task_struct *t; 220 struct task_struct *t;
225 221
226 t = kthread_run(kjournald2, journal, "kjournald2"); 222 t = kthread_run(kjournald2, journal, "jbd2/%s",
223 journal->j_devname);
227 if (IS_ERR(t)) 224 if (IS_ERR(t))
228 return PTR_ERR(t); 225 return PTR_ERR(t);
229 226
@@ -679,153 +676,6 @@ struct jbd2_stats_proc_session {
679 int max; 676 int max;
680}; 677};
681 678
682static void *jbd2_history_skip_empty(struct jbd2_stats_proc_session *s,
683 struct transaction_stats_s *ts,
684 int first)
685{
686 if (ts == s->stats + s->max)
687 ts = s->stats;
688 if (!first && ts == s->stats + s->start)
689 return NULL;
690 while (ts->ts_type == 0) {
691 ts++;
692 if (ts == s->stats + s->max)
693 ts = s->stats;
694 if (ts == s->stats + s->start)
695 return NULL;
696 }
697 return ts;
698
699}
700
701static void *jbd2_seq_history_start(struct seq_file *seq, loff_t *pos)
702{
703 struct jbd2_stats_proc_session *s = seq->private;
704 struct transaction_stats_s *ts;
705 int l = *pos;
706
707 if (l == 0)
708 return SEQ_START_TOKEN;
709 ts = jbd2_history_skip_empty(s, s->stats + s->start, 1);
710 if (!ts)
711 return NULL;
712 l--;
713 while (l) {
714 ts = jbd2_history_skip_empty(s, ++ts, 0);
715 if (!ts)
716 break;
717 l--;
718 }
719 return ts;
720}
721
722static void *jbd2_seq_history_next(struct seq_file *seq, void *v, loff_t *pos)
723{
724 struct jbd2_stats_proc_session *s = seq->private;
725 struct transaction_stats_s *ts = v;
726
727 ++*pos;
728 if (v == SEQ_START_TOKEN)
729 return jbd2_history_skip_empty(s, s->stats + s->start, 1);
730 else
731 return jbd2_history_skip_empty(s, ++ts, 0);
732}
733
734static int jbd2_seq_history_show(struct seq_file *seq, void *v)
735{
736 struct transaction_stats_s *ts = v;
737 if (v == SEQ_START_TOKEN) {
738 seq_printf(seq, "%-4s %-5s %-5s %-5s %-5s %-5s %-5s %-6s %-5s "
739 "%-5s %-5s %-5s %-5s %-5s\n", "R/C", "tid",
740 "wait", "run", "lock", "flush", "log", "hndls",
741 "block", "inlog", "ctime", "write", "drop",
742 "close");
743 return 0;
744 }
745 if (ts->ts_type == JBD2_STATS_RUN)
746 seq_printf(seq, "%-4s %-5lu %-5u %-5u %-5u %-5u %-5u "
747 "%-6lu %-5lu %-5lu\n", "R", ts->ts_tid,
748 jiffies_to_msecs(ts->u.run.rs_wait),
749 jiffies_to_msecs(ts->u.run.rs_running),
750 jiffies_to_msecs(ts->u.run.rs_locked),
751 jiffies_to_msecs(ts->u.run.rs_flushing),
752 jiffies_to_msecs(ts->u.run.rs_logging),
753 ts->u.run.rs_handle_count,
754 ts->u.run.rs_blocks,
755 ts->u.run.rs_blocks_logged);
756 else if (ts->ts_type == JBD2_STATS_CHECKPOINT)
757 seq_printf(seq, "%-4s %-5lu %48s %-5u %-5lu %-5lu %-5lu\n",
758 "C", ts->ts_tid, " ",
759 jiffies_to_msecs(ts->u.chp.cs_chp_time),
760 ts->u.chp.cs_written, ts->u.chp.cs_dropped,
761 ts->u.chp.cs_forced_to_close);
762 else
763 J_ASSERT(0);
764 return 0;
765}
766
767static void jbd2_seq_history_stop(struct seq_file *seq, void *v)
768{
769}
770
771static const struct seq_operations jbd2_seq_history_ops = {
772 .start = jbd2_seq_history_start,
773 .next = jbd2_seq_history_next,
774 .stop = jbd2_seq_history_stop,
775 .show = jbd2_seq_history_show,
776};
777
778static int jbd2_seq_history_open(struct inode *inode, struct file *file)
779{
780 journal_t *journal = PDE(inode)->data;
781 struct jbd2_stats_proc_session *s;
782 int rc, size;
783
784 s = kmalloc(sizeof(*s), GFP_KERNEL);
785 if (s == NULL)
786 return -ENOMEM;
787 size = sizeof(struct transaction_stats_s) * journal->j_history_max;
788 s->stats = kmalloc(size, GFP_KERNEL);
789 if (s->stats == NULL) {
790 kfree(s);
791 return -ENOMEM;
792 }
793 spin_lock(&journal->j_history_lock);
794 memcpy(s->stats, journal->j_history, size);
795 s->max = journal->j_history_max;
796 s->start = journal->j_history_cur % s->max;
797 spin_unlock(&journal->j_history_lock);
798
799 rc = seq_open(file, &jbd2_seq_history_ops);
800 if (rc == 0) {
801 struct seq_file *m = file->private_data;
802 m->private = s;
803 } else {
804 kfree(s->stats);
805 kfree(s);
806 }
807 return rc;
808
809}
810
811static int jbd2_seq_history_release(struct inode *inode, struct file *file)
812{
813 struct seq_file *seq = file->private_data;
814 struct jbd2_stats_proc_session *s = seq->private;
815
816 kfree(s->stats);
817 kfree(s);
818 return seq_release(inode, file);
819}
820
821static struct file_operations jbd2_seq_history_fops = {
822 .owner = THIS_MODULE,
823 .open = jbd2_seq_history_open,
824 .read = seq_read,
825 .llseek = seq_lseek,
826 .release = jbd2_seq_history_release,
827};
828
829static void *jbd2_seq_info_start(struct seq_file *seq, loff_t *pos) 679static void *jbd2_seq_info_start(struct seq_file *seq, loff_t *pos)
830{ 680{
831 return *pos ? NULL : SEQ_START_TOKEN; 681 return *pos ? NULL : SEQ_START_TOKEN;
@@ -842,29 +692,29 @@ static int jbd2_seq_info_show(struct seq_file *seq, void *v)
842 692
843 if (v != SEQ_START_TOKEN) 693 if (v != SEQ_START_TOKEN)
844 return 0; 694 return 0;
845 seq_printf(seq, "%lu transaction, each upto %u blocks\n", 695 seq_printf(seq, "%lu transaction, each up to %u blocks\n",
846 s->stats->ts_tid, 696 s->stats->ts_tid,
847 s->journal->j_max_transaction_buffers); 697 s->journal->j_max_transaction_buffers);
848 if (s->stats->ts_tid == 0) 698 if (s->stats->ts_tid == 0)
849 return 0; 699 return 0;
850 seq_printf(seq, "average: \n %ums waiting for transaction\n", 700 seq_printf(seq, "average: \n %ums waiting for transaction\n",
851 jiffies_to_msecs(s->stats->u.run.rs_wait / s->stats->ts_tid)); 701 jiffies_to_msecs(s->stats->run.rs_wait / s->stats->ts_tid));
852 seq_printf(seq, " %ums running transaction\n", 702 seq_printf(seq, " %ums running transaction\n",
853 jiffies_to_msecs(s->stats->u.run.rs_running / s->stats->ts_tid)); 703 jiffies_to_msecs(s->stats->run.rs_running / s->stats->ts_tid));
854 seq_printf(seq, " %ums transaction was being locked\n", 704 seq_printf(seq, " %ums transaction was being locked\n",
855 jiffies_to_msecs(s->stats->u.run.rs_locked / s->stats->ts_tid)); 705 jiffies_to_msecs(s->stats->run.rs_locked / s->stats->ts_tid));
856 seq_printf(seq, " %ums flushing data (in ordered mode)\n", 706 seq_printf(seq, " %ums flushing data (in ordered mode)\n",
857 jiffies_to_msecs(s->stats->u.run.rs_flushing / s->stats->ts_tid)); 707 jiffies_to_msecs(s->stats->run.rs_flushing / s->stats->ts_tid));
858 seq_printf(seq, " %ums logging transaction\n", 708 seq_printf(seq, " %ums logging transaction\n",
859 jiffies_to_msecs(s->stats->u.run.rs_logging / s->stats->ts_tid)); 709 jiffies_to_msecs(s->stats->run.rs_logging / s->stats->ts_tid));
860 seq_printf(seq, " %lluus average transaction commit time\n", 710 seq_printf(seq, " %lluus average transaction commit time\n",
861 div_u64(s->journal->j_average_commit_time, 1000)); 711 div_u64(s->journal->j_average_commit_time, 1000));
862 seq_printf(seq, " %lu handles per transaction\n", 712 seq_printf(seq, " %lu handles per transaction\n",
863 s->stats->u.run.rs_handle_count / s->stats->ts_tid); 713 s->stats->run.rs_handle_count / s->stats->ts_tid);
864 seq_printf(seq, " %lu blocks per transaction\n", 714 seq_printf(seq, " %lu blocks per transaction\n",
865 s->stats->u.run.rs_blocks / s->stats->ts_tid); 715 s->stats->run.rs_blocks / s->stats->ts_tid);
866 seq_printf(seq, " %lu logged blocks per transaction\n", 716 seq_printf(seq, " %lu logged blocks per transaction\n",
867 s->stats->u.run.rs_blocks_logged / s->stats->ts_tid); 717 s->stats->run.rs_blocks_logged / s->stats->ts_tid);
868 return 0; 718 return 0;
869} 719}
870 720
@@ -920,7 +770,7 @@ static int jbd2_seq_info_release(struct inode *inode, struct file *file)
920 return seq_release(inode, file); 770 return seq_release(inode, file);
921} 771}
922 772
923static struct file_operations jbd2_seq_info_fops = { 773static const struct file_operations jbd2_seq_info_fops = {
924 .owner = THIS_MODULE, 774 .owner = THIS_MODULE,
925 .open = jbd2_seq_info_open, 775 .open = jbd2_seq_info_open,
926 .read = seq_read, 776 .read = seq_read,
@@ -934,8 +784,6 @@ static void jbd2_stats_proc_init(journal_t *journal)
934{ 784{
935 journal->j_proc_entry = proc_mkdir(journal->j_devname, proc_jbd2_stats); 785 journal->j_proc_entry = proc_mkdir(journal->j_devname, proc_jbd2_stats);
936 if (journal->j_proc_entry) { 786 if (journal->j_proc_entry) {
937 proc_create_data("history", S_IRUGO, journal->j_proc_entry,
938 &jbd2_seq_history_fops, journal);
939 proc_create_data("info", S_IRUGO, journal->j_proc_entry, 787 proc_create_data("info", S_IRUGO, journal->j_proc_entry,
940 &jbd2_seq_info_fops, journal); 788 &jbd2_seq_info_fops, journal);
941 } 789 }
@@ -944,27 +792,9 @@ static void jbd2_stats_proc_init(journal_t *journal)
944static void jbd2_stats_proc_exit(journal_t *journal) 792static void jbd2_stats_proc_exit(journal_t *journal)
945{ 793{
946 remove_proc_entry("info", journal->j_proc_entry); 794 remove_proc_entry("info", journal->j_proc_entry);
947 remove_proc_entry("history", journal->j_proc_entry);
948 remove_proc_entry(journal->j_devname, proc_jbd2_stats); 795 remove_proc_entry(journal->j_devname, proc_jbd2_stats);
949} 796}
950 797
951static void journal_init_stats(journal_t *journal)
952{
953 int size;
954
955 if (!proc_jbd2_stats)
956 return;
957
958 journal->j_history_max = 100;
959 size = sizeof(struct transaction_stats_s) * journal->j_history_max;
960 journal->j_history = kzalloc(size, GFP_KERNEL);
961 if (!journal->j_history) {
962 journal->j_history_max = 0;
963 return;
964 }
965 spin_lock_init(&journal->j_history_lock);
966}
967
968/* 798/*
969 * Management for journal control blocks: functions to create and 799 * Management for journal control blocks: functions to create and
970 * destroy journal_t structures, and to initialise and read existing 800 * destroy journal_t structures, and to initialise and read existing
@@ -1009,7 +839,7 @@ static journal_t * journal_init_common (void)
1009 goto fail; 839 goto fail;
1010 } 840 }
1011 841
1012 journal_init_stats(journal); 842 spin_lock_init(&journal->j_history_lock);
1013 843
1014 return journal; 844 return journal;
1015fail: 845fail:
@@ -1115,7 +945,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
1115 while ((p = strchr(p, '/'))) 945 while ((p = strchr(p, '/')))
1116 *p = '!'; 946 *p = '!';
1117 p = journal->j_devname + strlen(journal->j_devname); 947 p = journal->j_devname + strlen(journal->j_devname);
1118 sprintf(p, ":%lu", journal->j_inode->i_ino); 948 sprintf(p, "-%lu", journal->j_inode->i_ino);
1119 jbd_debug(1, 949 jbd_debug(1,
1120 "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n", 950 "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n",
1121 journal, inode->i_sb->s_id, inode->i_ino, 951 journal, inode->i_sb->s_id, inode->i_ino,
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 00388d2a3c99..5c01fc148ce8 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -176,7 +176,7 @@ static const struct file_operations exports_operations = {
176extern int nfsd_pool_stats_open(struct inode *inode, struct file *file); 176extern int nfsd_pool_stats_open(struct inode *inode, struct file *file);
177extern int nfsd_pool_stats_release(struct inode *inode, struct file *file); 177extern int nfsd_pool_stats_release(struct inode *inode, struct file *file);
178 178
179static struct file_operations pool_stats_operations = { 179static const struct file_operations pool_stats_operations = {
180 .open = nfsd_pool_stats_open, 180 .open = nfsd_pool_stats_open,
181 .read = seq_read, 181 .read = seq_read,
182 .llseek = seq_lseek, 182 .llseek = seq_lseek,
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 6a2711f4c321..5941958f1e47 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -36,6 +36,7 @@
36 36
37void nilfs_btnode_cache_init_once(struct address_space *btnc) 37void nilfs_btnode_cache_init_once(struct address_space *btnc)
38{ 38{
39 memset(btnc, 0, sizeof(*btnc));
39 INIT_RADIX_TREE(&btnc->page_tree, GFP_ATOMIC); 40 INIT_RADIX_TREE(&btnc->page_tree, GFP_ATOMIC);
40 spin_lock_init(&btnc->tree_lock); 41 spin_lock_init(&btnc->tree_lock);
41 INIT_LIST_HEAD(&btnc->private_list); 42 INIT_LIST_HEAD(&btnc->private_list);
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index 1a4fa04cf071..e097099bfc8f 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -697,7 +697,7 @@ not_empty:
697 return 0; 697 return 0;
698} 698}
699 699
700struct file_operations nilfs_dir_operations = { 700const struct file_operations nilfs_dir_operations = {
701 .llseek = generic_file_llseek, 701 .llseek = generic_file_llseek,
702 .read = generic_read_dir, 702 .read = generic_read_dir,
703 .readdir = nilfs_readdir, 703 .readdir = nilfs_readdir,
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 7d7b4983dee3..30292df443ce 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -134,7 +134,7 @@ static int nilfs_file_mmap(struct file *file, struct vm_area_struct *vma)
134 * We have mostly NULL's here: the current defaults are ok for 134 * We have mostly NULL's here: the current defaults are ok for
135 * the nilfs filesystem. 135 * the nilfs filesystem.
136 */ 136 */
137struct file_operations nilfs_file_operations = { 137const struct file_operations nilfs_file_operations = {
138 .llseek = generic_file_llseek, 138 .llseek = generic_file_llseek,
139 .read = do_sync_read, 139 .read = do_sync_read,
140 .write = do_sync_write, 140 .write = do_sync_write,
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 2d2c501deb54..5040220c3732 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -400,6 +400,7 @@ int nilfs_read_inode_common(struct inode *inode,
400 ii->i_dir_acl = S_ISREG(inode->i_mode) ? 400 ii->i_dir_acl = S_ISREG(inode->i_mode) ?
401 0 : le32_to_cpu(raw_inode->i_dir_acl); 401 0 : le32_to_cpu(raw_inode->i_dir_acl);
402#endif 402#endif
403 ii->i_dir_start_lookup = 0;
403 ii->i_cno = 0; 404 ii->i_cno = 0;
404 inode->i_generation = le32_to_cpu(raw_inode->i_generation); 405 inode->i_generation = le32_to_cpu(raw_inode->i_generation);
405 406
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index b18c4998f8d0..f6326112d647 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -433,7 +433,7 @@ static const struct address_space_operations def_mdt_aops = {
433}; 433};
434 434
435static const struct inode_operations def_mdt_iops; 435static const struct inode_operations def_mdt_iops;
436static struct file_operations def_mdt_fops; 436static const struct file_operations def_mdt_fops;
437 437
438/* 438/*
439 * NILFS2 uses pseudo inodes for meta data files such as DAT, cpfile, sufile, 439 * NILFS2 uses pseudo inodes for meta data files such as DAT, cpfile, sufile,
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index bad7368782d0..4da6f67e9a91 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -294,9 +294,9 @@ void nilfs_clear_gcdat_inode(struct the_nilfs *);
294/* 294/*
295 * Inodes and files operations 295 * Inodes and files operations
296 */ 296 */
297extern struct file_operations nilfs_dir_operations; 297extern const struct file_operations nilfs_dir_operations;
298extern const struct inode_operations nilfs_file_inode_operations; 298extern const struct inode_operations nilfs_file_inode_operations;
299extern struct file_operations nilfs_file_operations; 299extern const struct file_operations nilfs_file_operations;
300extern const struct address_space_operations nilfs_aops; 300extern const struct address_space_operations nilfs_aops;
301extern const struct inode_operations nilfs_dir_inode_operations; 301extern const struct inode_operations nilfs_dir_inode_operations;
302extern const struct inode_operations nilfs_special_inode_operations; 302extern const struct inode_operations nilfs_special_inode_operations;
diff --git a/fs/nls/nls_base.c b/fs/nls/nls_base.c
index 2224b4d07bf0..44a88a9fa2c8 100644
--- a/fs/nls/nls_base.c
+++ b/fs/nls/nls_base.c
@@ -124,10 +124,10 @@ int utf8s_to_utf16s(const u8 *s, int len, wchar_t *pwcs)
124 while (*s && len > 0) { 124 while (*s && len > 0) {
125 if (*s & 0x80) { 125 if (*s & 0x80) {
126 size = utf8_to_utf32(s, len, &u); 126 size = utf8_to_utf32(s, len, &u);
127 if (size < 0) { 127 if (size < 0)
128 /* Ignore character and move on */ 128 return -EINVAL;
129 size = 1; 129
130 } else if (u >= PLANE_SIZE) { 130 if (u >= PLANE_SIZE) {
131 u -= PLANE_SIZE; 131 u -= PLANE_SIZE;
132 *op++ = (wchar_t) (SURROGATE_PAIR | 132 *op++ = (wchar_t) (SURROGATE_PAIR |
133 ((u >> 10) & SURROGATE_BITS)); 133 ((u >> 10) & SURROGATE_BITS));
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 09cc25d04611..c452d116b892 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -966,7 +966,7 @@ static ssize_t o2hb_debug_read(struct file *file, char __user *buf,
966} 966}
967#endif /* CONFIG_DEBUG_FS */ 967#endif /* CONFIG_DEBUG_FS */
968 968
969static struct file_operations o2hb_debug_fops = { 969static const struct file_operations o2hb_debug_fops = {
970 .open = o2hb_debug_open, 970 .open = o2hb_debug_open,
971 .release = o2hb_debug_release, 971 .release = o2hb_debug_release,
972 .read = o2hb_debug_read, 972 .read = o2hb_debug_read,
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
index cfb2be708abe..da794bc07a6c 100644
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -207,7 +207,7 @@ static int nst_fop_release(struct inode *inode, struct file *file)
207 return seq_release_private(inode, file); 207 return seq_release_private(inode, file);
208} 208}
209 209
210static struct file_operations nst_seq_fops = { 210static const struct file_operations nst_seq_fops = {
211 .open = nst_fop_open, 211 .open = nst_fop_open,
212 .read = seq_read, 212 .read = seq_read,
213 .llseek = seq_lseek, 213 .llseek = seq_lseek,
@@ -388,7 +388,7 @@ static int sc_fop_release(struct inode *inode, struct file *file)
388 return seq_release_private(inode, file); 388 return seq_release_private(inode, file);
389} 389}
390 390
391static struct file_operations sc_seq_fops = { 391static const struct file_operations sc_seq_fops = {
392 .open = sc_fop_open, 392 .open = sc_fop_open,
393 .read = seq_read, 393 .read = seq_read,
394 .llseek = seq_lseek, 394 .llseek = seq_lseek,
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index ca46002ec10e..42b0bad7a612 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -478,7 +478,7 @@ bail:
478 return -ENOMEM; 478 return -ENOMEM;
479} 479}
480 480
481static struct file_operations debug_purgelist_fops = { 481static const struct file_operations debug_purgelist_fops = {
482 .open = debug_purgelist_open, 482 .open = debug_purgelist_open,
483 .release = debug_buffer_release, 483 .release = debug_buffer_release,
484 .read = debug_buffer_read, 484 .read = debug_buffer_read,
@@ -538,7 +538,7 @@ bail:
538 return -ENOMEM; 538 return -ENOMEM;
539} 539}
540 540
541static struct file_operations debug_mle_fops = { 541static const struct file_operations debug_mle_fops = {
542 .open = debug_mle_open, 542 .open = debug_mle_open,
543 .release = debug_buffer_release, 543 .release = debug_buffer_release,
544 .read = debug_buffer_read, 544 .read = debug_buffer_read,
@@ -741,7 +741,7 @@ static int debug_lockres_release(struct inode *inode, struct file *file)
741 return seq_release_private(inode, file); 741 return seq_release_private(inode, file);
742} 742}
743 743
744static struct file_operations debug_lockres_fops = { 744static const struct file_operations debug_lockres_fops = {
745 .open = debug_lockres_open, 745 .open = debug_lockres_open,
746 .release = debug_lockres_release, 746 .release = debug_lockres_release,
747 .read = seq_read, 747 .read = seq_read,
@@ -925,7 +925,7 @@ bail:
925 return -ENOMEM; 925 return -ENOMEM;
926} 926}
927 927
928static struct file_operations debug_state_fops = { 928static const struct file_operations debug_state_fops = {
929 .open = debug_state_open, 929 .open = debug_state_open,
930 .release = debug_buffer_release, 930 .release = debug_buffer_release,
931 .read = debug_buffer_read, 931 .read = debug_buffer_read,
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 4cc3c890a2cd..c0e48aeebb1c 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -373,7 +373,7 @@ static ssize_t ocfs2_debug_read(struct file *file, char __user *buf,
373} 373}
374#endif /* CONFIG_DEBUG_FS */ 374#endif /* CONFIG_DEBUG_FS */
375 375
376static struct file_operations ocfs2_osb_debug_fops = { 376static const struct file_operations ocfs2_osb_debug_fops = {
377 .open = ocfs2_osb_debug_open, 377 .open = ocfs2_osb_debug_open,
378 .release = ocfs2_debug_release, 378 .release = ocfs2_debug_release,
379 .read = ocfs2_debug_read, 379 .read = ocfs2_debug_read,
diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c
index 3680bae335b5..b42d62419034 100644
--- a/fs/omfs/dir.c
+++ b/fs/omfs/dir.c
@@ -498,7 +498,7 @@ const struct inode_operations omfs_dir_inops = {
498 .rmdir = omfs_rmdir, 498 .rmdir = omfs_rmdir,
499}; 499};
500 500
501struct file_operations omfs_dir_operations = { 501const struct file_operations omfs_dir_operations = {
502 .read = generic_read_dir, 502 .read = generic_read_dir,
503 .readdir = omfs_readdir, 503 .readdir = omfs_readdir,
504 .llseek = generic_file_llseek, 504 .llseek = generic_file_llseek,
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index 4845fbb18e6e..399487c09364 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -322,7 +322,7 @@ static sector_t omfs_bmap(struct address_space *mapping, sector_t block)
322 return generic_block_bmap(mapping, block, omfs_get_block); 322 return generic_block_bmap(mapping, block, omfs_get_block);
323} 323}
324 324
325struct file_operations omfs_file_operations = { 325const struct file_operations omfs_file_operations = {
326 .llseek = generic_file_llseek, 326 .llseek = generic_file_llseek,
327 .read = do_sync_read, 327 .read = do_sync_read,
328 .write = do_sync_write, 328 .write = do_sync_write,
diff --git a/fs/omfs/omfs.h b/fs/omfs/omfs.h
index df71039945ac..ebe2fdbe535e 100644
--- a/fs/omfs/omfs.h
+++ b/fs/omfs/omfs.h
@@ -44,14 +44,14 @@ extern int omfs_allocate_range(struct super_block *sb, int min_request,
44extern int omfs_clear_range(struct super_block *sb, u64 block, int count); 44extern int omfs_clear_range(struct super_block *sb, u64 block, int count);
45 45
46/* dir.c */ 46/* dir.c */
47extern struct file_operations omfs_dir_operations; 47extern const struct file_operations omfs_dir_operations;
48extern const struct inode_operations omfs_dir_inops; 48extern const struct inode_operations omfs_dir_inops;
49extern int omfs_make_empty(struct inode *inode, struct super_block *sb); 49extern int omfs_make_empty(struct inode *inode, struct super_block *sb);
50extern int omfs_is_bad(struct omfs_sb_info *sbi, struct omfs_header *header, 50extern int omfs_is_bad(struct omfs_sb_info *sbi, struct omfs_header *header,
51 u64 fsblock); 51 u64 fsblock);
52 52
53/* file.c */ 53/* file.c */
54extern struct file_operations omfs_file_operations; 54extern const struct file_operations omfs_file_operations;
55extern const struct inode_operations omfs_file_inops; 55extern const struct inode_operations omfs_file_inops;
56extern const struct address_space_operations omfs_aops; 56extern const struct address_space_operations omfs_aops;
57extern void omfs_make_empty_table(struct buffer_head *bh, int offset); 57extern void omfs_make_empty_table(struct buffer_head *bh, int offset);
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 7b685e10cbad..f38fee0311a7 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -248,19 +248,11 @@ ssize_t part_stat_show(struct device *dev,
248 part_stat_read(p, merges[WRITE]), 248 part_stat_read(p, merges[WRITE]),
249 (unsigned long long)part_stat_read(p, sectors[WRITE]), 249 (unsigned long long)part_stat_read(p, sectors[WRITE]),
250 jiffies_to_msecs(part_stat_read(p, ticks[WRITE])), 250 jiffies_to_msecs(part_stat_read(p, ticks[WRITE])),
251 part_in_flight(p), 251 p->in_flight,
252 jiffies_to_msecs(part_stat_read(p, io_ticks)), 252 jiffies_to_msecs(part_stat_read(p, io_ticks)),
253 jiffies_to_msecs(part_stat_read(p, time_in_queue))); 253 jiffies_to_msecs(part_stat_read(p, time_in_queue)));
254} 254}
255 255
256ssize_t part_inflight_show(struct device *dev,
257 struct device_attribute *attr, char *buf)
258{
259 struct hd_struct *p = dev_to_part(dev);
260
261 return sprintf(buf, "%8u %8u\n", p->in_flight[0], p->in_flight[1]);
262}
263
264#ifdef CONFIG_FAIL_MAKE_REQUEST 256#ifdef CONFIG_FAIL_MAKE_REQUEST
265ssize_t part_fail_show(struct device *dev, 257ssize_t part_fail_show(struct device *dev,
266 struct device_attribute *attr, char *buf) 258 struct device_attribute *attr, char *buf)
@@ -289,7 +281,6 @@ static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL);
289static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); 281static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
290static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL); 282static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL);
291static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); 283static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
292static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);
293#ifdef CONFIG_FAIL_MAKE_REQUEST 284#ifdef CONFIG_FAIL_MAKE_REQUEST
294static struct device_attribute dev_attr_fail = 285static struct device_attribute dev_attr_fail =
295 __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store); 286 __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store);
@@ -301,7 +292,6 @@ static struct attribute *part_attrs[] = {
301 &dev_attr_size.attr, 292 &dev_attr_size.attr,
302 &dev_attr_alignment_offset.attr, 293 &dev_attr_alignment_offset.attr,
303 &dev_attr_stat.attr, 294 &dev_attr_stat.attr,
304 &dev_attr_inflight.attr,
305#ifdef CONFIG_FAIL_MAKE_REQUEST 295#ifdef CONFIG_FAIL_MAKE_REQUEST
306 &dev_attr_fail.attr, 296 &dev_attr_fail.attr,
307#endif 297#endif
diff --git a/fs/select.c b/fs/select.c
index a201fc370223..fd38ce2e32e3 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -15,6 +15,7 @@
15 */ 15 */
16 16
17#include <linux/kernel.h> 17#include <linux/kernel.h>
18#include <linux/sched.h>
18#include <linux/syscalls.h> 19#include <linux/syscalls.h>
19#include <linux/module.h> 20#include <linux/module.h>
20#include <linux/slab.h> 21#include <linux/slab.h>