-rw-r--r--  Documentation/ABI/removed/o2cb (renamed from Documentation/ABI/obsolete/o2cb) |    9
-rw-r--r--  Documentation/ABI/testing/sysfs-kernel-mm-cleancache |   11
-rw-r--r--  Documentation/feature-removal-schedule.txt |   10
-rw-r--r--  Documentation/filesystems/ext4.txt |    4
-rw-r--r--  Documentation/filesystems/ocfs2.txt |    8
-rw-r--r--  Documentation/filesystems/xfs.txt |    6
-rw-r--r--  Documentation/vm/cleancache.txt |  278
-rw-r--r--  MAINTAINERS |   13
-rw-r--r--  arch/x86/include/asm/xen/hypercall.h |    7
-rw-r--r--  drivers/video/mb862xx/mb862xx-i2c.c |    1
-rw-r--r--  drivers/xen/Makefile |    1
-rw-r--r--  drivers/xen/tmem.c |  264
-rw-r--r--  fs/9p/vfs_inode.c |    4
-rw-r--r--  fs/Kconfig |   31
-rw-r--r--  fs/affs/namei.c |    5
-rw-r--r--  fs/afs/dir.c |    5
-rw-r--r--  fs/autofs4/root.c |    2
-rw-r--r--  fs/bfs/dir.c |    3
-rw-r--r--  fs/btrfs/extent_io.c |    9
-rw-r--r--  fs/btrfs/super.c |    2
-rw-r--r--  fs/buffer.c |   64
-rw-r--r--  fs/coda/dir.c |    5
-rw-r--r--  fs/configfs/dir.c |    2
-rw-r--r--  fs/ecryptfs/inode.c |    5
-rw-r--r--  fs/ext3/super.c |    2
-rw-r--r--  fs/ext4/Makefile |    3
-rw-r--r--  fs/ext4/balloc.c |  146
-rw-r--r--  fs/ext4/ext4.h |  127
-rw-r--r--  fs/ext4/ext4_jbd2.c |   14
-rw-r--r--  fs/ext4/ext4_jbd2.h |    5
-rw-r--r--  fs/ext4/extents.c | 1410
-rw-r--r--  fs/ext4/file.c |    1
-rw-r--r--  fs/ext4/fsync.c |   25
-rw-r--r--  fs/ext4/inode.c |  114
-rw-r--r--  fs/ext4/mballoc.c |  459
-rw-r--r--  fs/ext4/mballoc.h |    6
-rw-r--r--  fs/ext4/migrate.c |    2
-rw-r--r--  fs/ext4/mmp.c |  351
-rw-r--r--  fs/ext4/move_extent.c |    3
-rw-r--r--  fs/ext4/namei.c |   82
-rw-r--r--  fs/ext4/page-io.c |   39
-rw-r--r--  fs/ext4/super.c |  206
-rw-r--r--  fs/ext4/xattr.c |    4
-rw-r--r--  fs/fat/namei_msdos.c |    5
-rw-r--r--  fs/fat/namei_vfat.c |    5
-rw-r--r--  fs/fuse/dir.c |    6
-rw-r--r--  fs/hfs/dir.c |    6
-rw-r--r--  fs/hfsplus/dir.c |    8
-rw-r--r--  fs/hostfs/hostfs_kern.c |    5
-rw-r--r--  fs/hpfs/namei.c |    9
-rw-r--r--  fs/hugetlbfs/inode.c |    3
-rw-r--r--  fs/jbd2/commit.c |   22
-rw-r--r--  fs/jbd2/journal.c |   58
-rw-r--r--  fs/jbd2/transaction.c |   22
-rw-r--r--  fs/jffs2/dir.c |    5
-rw-r--r--  fs/jfs/namei.c |    5
-rw-r--r--  fs/logfs/dir.c |    5
-rw-r--r--  fs/minix/namei.c |    5
-rw-r--r--  fs/mpage.c |    7
-rw-r--r--  fs/namei.c |  380
-rw-r--r--  fs/namespace.c |    2
-rw-r--r--  fs/ncpfs/dir.c |    5
-rw-r--r--  fs/nilfs2/namei.c |    5
-rw-r--r--  fs/ocfs2/Makefile |    1
-rw-r--r--  fs/ocfs2/alloc.c |  166
-rw-r--r--  fs/ocfs2/alloc.h |    1
-rw-r--r--  fs/ocfs2/cluster/sys.c |    9
-rw-r--r--  fs/ocfs2/dlm/dlmcommon.h |   14
-rw-r--r--  fs/ocfs2/dlm/dlmdebug.c |    6
-rw-r--r--  fs/ocfs2/dlm/dlmdomain.c |   94
-rw-r--r--  fs/ocfs2/dlm/dlmmaster.c |  255
-rw-r--r--  fs/ocfs2/dlm/dlmrecovery.c |    1
-rw-r--r--  fs/ocfs2/dlmfs/dlmfs.c |    2
-rw-r--r--  fs/ocfs2/file.c |    1
-rw-r--r--  fs/ocfs2/ioctl.c |  492
-rw-r--r--  fs/ocfs2/move_extents.c | 1153
-rw-r--r--  fs/ocfs2/move_extents.h |   22
-rw-r--r--  fs/ocfs2/ocfs2_ioctl.h |   68
-rw-r--r--  fs/ocfs2/ocfs2_trace.h |   25
-rw-r--r--  fs/ocfs2/refcounttree.c |   58
-rw-r--r--  fs/ocfs2/refcounttree.h |   11
-rw-r--r--  fs/ocfs2/super.c |    4
-rw-r--r--  fs/omfs/dir.c |   11
-rw-r--r--  fs/proc/Makefile |    1
-rw-r--r--  fs/proc/base.c |   20
-rw-r--r--  fs/proc/inode.c |    7
-rw-r--r--  fs/proc/internal.h |   18
-rw-r--r--  fs/proc/namespaces.c |  198
-rw-r--r--  fs/proc/task_mmu.c |    2
-rw-r--r--  fs/reiserfs/namei.c |    5
-rw-r--r--  fs/reiserfs/xattr.c |    1
-rw-r--r--  fs/super.c |    3
-rw-r--r--  fs/sysv/namei.c |    5
-rw-r--r--  fs/ubifs/dir.c |    5
-rw-r--r--  fs/udf/namei.c |    5
-rw-r--r--  fs/ufs/namei.c |    5
-rw-r--r--  fs/xfs/linux-2.6/xfs_discard.c |   29
-rw-r--r--  fs/xfs/linux-2.6/xfs_discard.h |    2
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.c |   18
-rw-r--r--  fs/xfs/xfs_ag.h |    3
-rw-r--r--  fs/xfs/xfs_alloc.c |   35
-rw-r--r--  fs/xfs/xfs_alloc.h |    5
-rw-r--r--  fs/xfs/xfs_alloc_btree.c |    3
-rw-r--r--  fs/xfs/xfs_bmap.c |  549
-rw-r--r--  fs/xfs/xfs_bmap.h |    2
-rw-r--r--  fs/xfs/xfs_inode.c |   15
-rw-r--r--  fs/xfs/xfs_inode.h |    1
-rw-r--r--  fs/xfs/xfs_log_cil.c |   13
-rw-r--r--  fs/xfs/xfs_mount.h |    1
-rw-r--r--  fs/xfs/xfs_trans.c |    2
-rw-r--r--  include/linux/buffer_head.h |   16
-rw-r--r--  include/linux/cleancache.h |  122
-rw-r--r--  include/linux/fs.h |    5
-rw-r--r--  include/linux/hugetlb.h |    7
-rw-r--r--  include/linux/hugetlb_inline.h |    2
-rw-r--r--  include/linux/if_link.h |    1
-rw-r--r--  include/linux/jbd2.h |    8
-rw-r--r--  include/linux/mm.h |    6
-rw-r--r--  include/linux/mm_types.h |    4
-rw-r--r--  include/linux/proc_fs.h |   21
-rw-r--r--  include/linux/syscalls.h |    1
-rw-r--r--  include/net/net_namespace.h |    1
-rw-r--r--  include/xen/interface/xen.h |   22
-rw-r--r--  ipc/namespace.c |   37
-rw-r--r--  ipc/shm.c |    2
-rw-r--r--  kernel/nsproxy.c |   42
-rw-r--r--  kernel/utsname.c |   39
-rw-r--r--  mm/Kconfig |   23
-rw-r--r--  mm/Makefile |    1
-rw-r--r--  mm/cleancache.c |  244
-rw-r--r--  mm/filemap.c |   11
-rw-r--r--  mm/fremap.c |    2
-rw-r--r--  mm/hugetlb.c |    4
-rw-r--r--  mm/memory.c |    2
-rw-r--r--  mm/mlock.c |    8
-rw-r--r--  mm/mmap.c |    8
-rw-r--r--  mm/slub.c |    1
-rw-r--r--  mm/truncate.c |    6
-rw-r--r--  net/core/net_namespace.c |   65
-rw-r--r--  net/core/rtnetlink.c |    5
 140 files changed, 6377 insertions(+), 2002 deletions(-)
diff --git a/Documentation/ABI/obsolete/o2cb b/Documentation/ABI/removed/o2cb
index 9c49d8e6c0cc..7f5daa465093 100644
--- a/Documentation/ABI/obsolete/o2cb
+++ b/Documentation/ABI/removed/o2cb
@@ -1,11 +1,10 @@
 What:		/sys/o2cb symlink
-Date:		Dec 2005
-KernelVersion:	2.6.16
+Date:		May 2011
+KernelVersion:	2.6.40
 Contact:	ocfs2-devel@oss.oracle.com
-Description:	This is a symlink: /sys/o2cb to /sys/fs/o2cb. The symlink will
-		be removed when new versions of ocfs2-tools which know to look
+Description:	This is a symlink: /sys/o2cb to /sys/fs/o2cb. The symlink is
+		removed when new versions of ocfs2-tools which know to look
 		in /sys/fs/o2cb are sufficiently prevalent. Don't code new
 		software to look here, it should try /sys/fs/o2cb instead.
-		See Documentation/ABI/stable/o2cb for more information on usage.
 Users:		ocfs2-tools. It's sufficient to mail proposed changes to
 		ocfs2-devel@oss.oracle.com.
diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-cleancache b/Documentation/ABI/testing/sysfs-kernel-mm-cleancache
new file mode 100644
index 000000000000..662ae646ea12
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-kernel-mm-cleancache
@@ -0,0 +1,11 @@
+What:		/sys/kernel/mm/cleancache/
+Date:		April 2011
+Contact:	Dan Magenheimer <dan.magenheimer@oracle.com>
+Description:
+		/sys/kernel/mm/cleancache/ contains a number of files which
+		record a count of various cleancache operations
+		(sum across all filesystems):
+			succ_gets
+			failed_gets
+			puts
+			flushes
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 95788ad2506c..ff31b1cc50aa 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -262,16 +262,6 @@ Who: Michael Buesch <mb@bu3sch.de>
 
 ---------------------------
 
-What:	/sys/o2cb symlink
-When:	January 2010
-Why:	/sys/fs/o2cb is the proper location for this information - /sys/o2cb
-	exists as a symlink for backwards compatibility for old versions of
-	ocfs2-tools. 2 years should be sufficient time to phase in new versions
-	which know to look in /sys/fs/o2cb.
-Who:	ocfs2-devel@oss.oracle.com
-
----------------------------
-
 What:	Ability for non root users to shm_get hugetlb pages based on mlock
 	resource limits
 When:	2.6.31
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
index c79ec58fd7f6..3ae9bc94352a 100644
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@@ -226,10 +226,6 @@ acl Enables POSIX Access Control Lists support.
 noacl			This option disables POSIX Access Control List
 			support.
 
-reservation
-
-noreservation
-
 bsddf		(*)	Make 'df' act like BSD.
 minixdf			Make 'df' act like Minix.
 
diff --git a/Documentation/filesystems/ocfs2.txt b/Documentation/filesystems/ocfs2.txt
index 9ed920a8cd79..7618a287aa41 100644
--- a/Documentation/filesystems/ocfs2.txt
+++ b/Documentation/filesystems/ocfs2.txt
@@ -46,9 +46,15 @@ errors=panic Panic and halt the machine if an error occurs.
 intr		(*)	Allow signals to interrupt cluster operations.
 nointr			Do not allow signals to interrupt cluster
 			operations.
+noatime			Do not update access time.
+relatime(*)		Update atime if the previous atime is older than
+			mtime or ctime.
+strictatime		Always update atime, but the minimum update interval
+			is specified by atime_quantum.
 atime_quantum=60(*)	OCFS2 will not update atime unless this number
 			of seconds has passed since the last update.
-			Set to zero to always update atime.
+			Set to zero to always update atime. This option only
+			works in combination with strictatime.
 data=ordered	(*)	All data are forced directly out to the main file
 			system prior to its metadata being committed to the
 			journal.
diff --git a/Documentation/filesystems/xfs.txt b/Documentation/filesystems/xfs.txt
index 7bff3e4f35df..3fc0c31a6f5d 100644
--- a/Documentation/filesystems/xfs.txt
+++ b/Documentation/filesystems/xfs.txt
@@ -39,6 +39,12 @@ When mounting an XFS filesystem, the following options are accepted.
 	drive level write caching to be enabled, for devices that
 	support write barriers.
 
+  discard
+	Issue command to let the block device reclaim space freed by the
+	filesystem.  This is useful for SSD devices, thinly provisioned
+	LUNs and virtual machine images, but may have a performance
+	impact.  This option is incompatible with the nodelaylog option.
+
   dmapi
 	Enable the DMAPI (Data Management API) event callouts.
 	Use with the "mtpt" option.
diff --git a/Documentation/vm/cleancache.txt b/Documentation/vm/cleancache.txt
new file mode 100644
index 000000000000..36c367c73084
--- /dev/null
+++ b/Documentation/vm/cleancache.txt
@@ -0,0 +1,278 @@
+MOTIVATION
+
+Cleancache is a new optional feature provided by the VFS layer that
+potentially dramatically increases page cache effectiveness for
+many workloads in many environments at a negligible cost.
+
+Cleancache can be thought of as a page-granularity victim cache for clean
+pages that the kernel's pageframe replacement algorithm (PFRA) would like
+to keep around, but can't since there isn't enough memory.  So when the
+PFRA "evicts" a page, it first attempts to use cleancache code to
+put the data contained in that page into "transcendent memory", memory
+that is not directly accessible or addressable by the kernel and is
+of unknown and possibly time-varying size.
+
+Later, when a cleancache-enabled filesystem wishes to access a page
+in a file on disk, it first checks cleancache to see if it already
+contains it; if it does, the page of data is copied into the kernel
+and a disk access is avoided.
+
+Transcendent memory "drivers" for cleancache are currently implemented
+in Xen (using hypervisor memory) and zcache (using in-kernel compressed
+memory) and other implementations are in development.
+
+FAQs are included below.
+
+IMPLEMENTATION OVERVIEW
+
+A cleancache "backend" that provides transcendent memory registers itself
+to the kernel's cleancache "frontend" by calling cleancache_register_ops,
+passing a pointer to a cleancache_ops structure with funcs set appropriately.
+Note that cleancache_register_ops returns the previous settings so that
+chaining can be performed if desired.  The functions provided must conform to
+certain semantics as follows:
+
+Most important, cleancache is "ephemeral".  Pages which are copied into
+cleancache have an indefinite lifetime which is completely unknowable
+by the kernel and so may or may not still be in cleancache at any later time.
+Thus, as its name implies, cleancache is not suitable for dirty pages.
+Cleancache has complete discretion over what pages to preserve and what
+pages to discard and when.
+
+Mounting a cleancache-enabled filesystem should call "init_fs" to obtain a
+pool id which, if positive, must be saved in the filesystem's superblock;
+a negative return value indicates failure.  A "put_page" will copy a
+(presumably about-to-be-evicted) page into cleancache and associate it with
+the pool id, a file key, and a page index into the file.  (The combination
+of a pool id, a file key, and an index is sometimes called a "handle".)
+A "get_page" will copy the page, if found, from cleancache into kernel memory.
+A "flush_page" will ensure the page no longer is present in cleancache;
+a "flush_inode" will flush all pages associated with the specified file;
+and, when a filesystem is unmounted, a "flush_fs" will flush all pages in
+all files specified by the given pool id and also surrender the pool id.
+
+An "init_shared_fs", like init_fs, obtains a pool id but tells cleancache
+to treat the pool as shared using a 128-bit UUID as a key.  On systems
+that may run multiple kernels (such as hard partitioned or virtualized
+systems) that may share a clustered filesystem, and where cleancache
+may be shared among those kernels, calls to init_shared_fs that specify the
+same UUID will receive the same pool id, thus allowing the pages to
+be shared.  Note that any security requirements must be imposed outside
+of the kernel (e.g. by "tools" that control cleancache).  Or a
+cleancache implementation can simply disable shared_init by always
+returning a negative value.
+
+If a get_page is successful on a non-shared pool, the page is flushed (thus
+making cleancache an "exclusive" cache).  On a shared pool, the page
+is NOT flushed on a successful get_page so that it remains accessible to
+other sharers.  The kernel is responsible for ensuring coherency between
+cleancache (shared or not), the page cache, and the filesystem, using
+cleancache flush operations as required.
+
+Note that cleancache must enforce put-put-get coherency and get-get
+coherency.  For the former, if two puts are made to the same handle but
+with different data, say AAA by the first put and BBB by the second, a
+subsequent get can never return the stale data (AAA).  For get-get coherency,
+if a get for a given handle fails, subsequent gets for that handle will
+never succeed unless preceded by a successful put with that handle.
+
+Last, cleancache provides no SMP serialization guarantees; if two
+different Linux threads are simultaneously putting and flushing a page
+with the same handle, the results are indeterminate.  Callers must
+lock the page to ensure serial behavior.
+
+CLEANCACHE PERFORMANCE METRICS
+
+Cleancache monitoring is done by sysfs files in the
+/sys/kernel/mm/cleancache directory.  The effectiveness of cleancache
+can be measured (across all filesystems) with:
+
+succ_gets	- number of gets that were successful
+failed_gets	- number of gets that failed
+puts		- number of puts attempted (all "succeed")
+flushes		- number of flushes attempted
+
+A backend implementation may provide additional metrics.
+
+FAQ
+
+1) Where's the value? (Andrew Morton)
+
+Cleancache provides a significant performance benefit to many workloads
+in many environments with negligible overhead by improving the
+effectiveness of the pagecache.  Clean pagecache pages are
+saved in transcendent memory (RAM that is otherwise not directly
+addressable to the kernel); fetching those pages later avoids "refaults"
+and thus disk reads.
+
+Cleancache (and its sister code "frontswap") provide interfaces for
+this transcendent memory (aka "tmem"), which conceptually lies between
+fast kernel-directly-addressable RAM and slower DMA/asynchronous devices.
+Disallowing direct kernel or userland reads/writes to tmem
+is ideal when data is transformed to a different form and size (such
+as with compression) or secretly moved (as might be useful for write-
+balancing for some RAM-like devices).  Evicted page-cache pages (and
+swap pages) are a great use for this kind of slower-than-RAM-but-much-
+faster-than-disk transcendent memory, and the cleancache (and frontswap)
+"page-object-oriented" specification provides a nice way to read and
+write -- and indirectly "name" -- the pages.
+
+In the virtual case, the whole point of virtualization is to statistically
+multiplex physical resources across the varying demands of multiple
+virtual machines.  This is really hard to do with RAM and efforts to
+do it well with no kernel change have essentially failed (except in some
+well-publicized special-case workloads).  Cleancache -- and frontswap --
+with a fairly small impact on the kernel, provide a huge amount
+of flexibility for more dynamic, flexible RAM multiplexing.
+Specifically, the Xen Transcendent Memory backend allows otherwise
+"fallow" hypervisor-owned RAM to not only be "time-shared" between multiple
+virtual machines, but the pages can be compressed and deduplicated to
+optimize RAM utilization.  And when guest OS's are induced to surrender
+underutilized RAM (e.g. with "self-ballooning"), page cache pages
+are the first to go, and cleancache allows those pages to be
+saved and reclaimed if overall host system memory conditions allow.
+
+And the identical interface used for cleancache can be used in
+physical systems as well.  The zcache driver acts as a memory-hungry
+device that stores pages of data in a compressed state.  And
+the proposed "RAMster" driver shares RAM across multiple physical
+systems.
+
+2) Why does cleancache have its sticky fingers so deep inside the
+   filesystems and VFS? (Andrew Morton and Christoph Hellwig)
+
+The core hooks for cleancache in VFS are in most cases a single line
+and the minimum set are placed precisely where needed to maintain
+coherency (via cleancache_flush operations) between cleancache,
+the page cache, and disk.  All hooks compile into nothingness if
+cleancache is config'ed off and turn into a function-pointer-
+compare-to-NULL if config'ed on but no backend claims the ops
+functions, or to a compare-struct-element-to-negative if a
+backend claims the ops functions but a filesystem doesn't enable
+cleancache.
+
+Some filesystems are built entirely on top of VFS and the hooks
+in VFS are sufficient, so don't require an "init_fs" hook; the
+initial implementation of cleancache didn't provide this hook.
+But for some filesystems (such as btrfs), the VFS hooks are
+incomplete and one or more hooks in fs-specific code are required.
+And for some other filesystems, such as tmpfs, cleancache may
+be counterproductive.  So it seemed prudent to require a filesystem
+to "opt in" to use cleancache, which requires adding a hook in
+each filesystem.  Not all filesystems are supported by cleancache,
+but only because they haven't been tested.  The existing set should
+be sufficient to validate the concept, the opt-in approach means
+that untested filesystems are not affected, and the hooks in the
+existing filesystems should make it very easy to add more
+filesystems in the future.
+
+The total impact of the hooks to existing fs and mm files is only
+about 40 lines added (not counting comments and blank lines).
+
+3) Why not make cleancache asynchronous and batched so it can
+   more easily interface with real devices with DMA instead
+   of copying each individual page? (Minchan Kim)
+
+The one-page-at-a-time copy semantics simplifies the implementation
+on both the frontend and backend and also allows the backend to
+do fancy things on-the-fly like page compression and
+page deduplication.  And since the data is "gone" (copied into/out
+of the pageframe) before the cleancache get/put call returns,
+a great deal of race conditions and potential coherency issues
+are avoided.  While the interface seems odd for a "real device"
+or for real kernel-addressable RAM, it makes perfect sense for
+transcendent memory.
+
+4) Why is non-shared cleancache "exclusive"?  And where is the
+   page "flushed" after a "get"? (Minchan Kim)
+
+The main reason is to free up space in transcendent memory and
+to avoid unnecessary cleancache_flush calls.  If you want inclusive,
+the page can be "put" immediately following the "get".  If
+put-after-get for inclusive becomes common, the interface could
+be easily extended to add a "get_no_flush" call.
+
+The flush is done by the cleancache backend implementation.
+
+5) What's the performance impact?
+
+Performance analysis has been presented at OLS'09 and LCA'10.
+Briefly, performance gains can be significant on most workloads,
+especially when memory pressure is high (e.g. when RAM is
+overcommitted in a virtual workload); and because the hooks are
+invoked primarily in place of or in addition to a disk read/write,
+overhead is negligible even in worst case workloads.  Basically
+cleancache replaces I/O with memory-copy-CPU-overhead; on older
+single-core systems with slow memory-copy speeds, cleancache
+has little value, but in newer multicore machines, especially
+consolidated/virtualized machines, it has great value.
+
+6) How do I add cleancache support for filesystem X? (Boaz Harrash)
+
+Filesystems that are well-behaved and conform to certain
+restrictions can utilize cleancache simply by making a call to
+cleancache_init_fs at mount time.  Unusual, misbehaving, or
+poorly layered filesystems must either add additional hooks
+and/or undergo extensive additional testing... or should just
+not enable the optional cleancache.
+
+Some points for a filesystem to consider:
+
+- The FS should be block-device-based (e.g. a ram-based FS such
+  as tmpfs should not enable cleancache)
+- To ensure coherency/correctness, the FS must ensure that all
+  file removal or truncation operations either go through VFS or
+  add hooks to do the equivalent cleancache "flush" operations
+- To ensure coherency/correctness, either inode numbers must
+  be unique across the lifetime of the on-disk file OR the
+  FS must provide an "encode_fh" function.
+- The FS must call the VFS superblock alloc and deactivate routines
+  or add hooks to do the equivalent cleancache calls done there.
+- To maximize performance, all pages fetched from the FS should
+  go through the do_mpage_readpage routine or the FS should add
+  hooks to do the equivalent (cf. btrfs)
+- Currently, the FS blocksize must be the same as PAGESIZE.  This
+  is not an architectural restriction, but no backends currently
+  support anything different.
+- A clustered FS should invoke the "shared_init_fs" cleancache
+  hook to get best performance for some backends.
+
+7) Why not use the KVA of the inode as the key? (Christoph Hellwig)
+
+If cleancache would use the inode virtual address instead of
+inode/filehandle, the pool id could be eliminated.  But, this
+won't work because cleancache retains pagecache data pages
+persistently even when the inode has been pruned from the
+inode unused list, and only flushes the data page if the file
+gets removed/truncated.  So if cleancache used the inode kva,
+there would be potential coherency issues if/when the inode
+kva is reused for a different file.  Alternately, if cleancache
+flushed the pages when the inode kva was freed, much of the value
+of cleancache would be lost because the cache of pages in cleancache
+is potentially much larger than the kernel pagecache and is most
+useful if the pages survive inode cache removal.
+
+8) Why is a global variable required?
+
+The cleancache_enabled flag is checked in all of the frequently-used
+cleancache hooks.  The alternative is a function call to check a static
+variable.  Since cleancache is enabled dynamically at runtime, systems
+that don't enable cleancache would suffer thousands (possibly
+tens-of-thousands) of unnecessary function calls per second.  So the
+global variable allows cleancache to be enabled by default at compile
+time, but have insignificant performance impact when cleancache remains
+disabled at runtime.
+
+9) Does cleancache work with KVM?
+
+The memory model of KVM is sufficiently different that a cleancache
+backend may have less value for KVM.  This remains to be tested,
+especially in an overcommitted system.
+
+10) Does cleancache work in userspace?  It sounds useful for
+    memory hungry caches like web browsers. (Jamie Lokier)
+
+No plans yet, though we agree it sounds useful, at least for
+apps that bypass the page cache (e.g. O_DIRECT).
+
+Last updated: Dan Magenheimer, April 13, 2011
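
To make the registration flow described above concrete, here is a minimal sketch of a backend, using only the cleancache_ops entry points named in this document (and filled in for real by drivers/xen/tmem.c later in this diff). It is not part of the patch, and the noop_* names are invented for illustration. A backend that refuses every pool, by returning a negative pool id from init_fs, turns all of the per-page hooks into no-ops, which is the documented way for a backend to disable itself:

/* Hypothetical no-op cleancache backend -- a sketch, not part of this patch. */
#include <linux/cleancache.h>
#include <linux/module.h>

static int noop_init_fs(size_t pagesize)
{
	return -1;	/* refuse the pool: every later hook becomes a no-op */
}

static int noop_init_shared_fs(char *uuid, size_t pagesize)
{
	return -1;	/* likewise refuse shared pools */
}

static int noop_get_page(int pool, struct cleancache_filekey key,
			 pgoff_t index, struct page *page)
{
	return -1;	/* nothing is ever cached, so every get fails */
}

static void noop_put_page(int pool, struct cleancache_filekey key,
			  pgoff_t index, struct page *page) { }
static void noop_flush_page(int pool, struct cleancache_filekey key,
			    pgoff_t index) { }
static void noop_flush_inode(int pool, struct cleancache_filekey key) { }
static void noop_flush_fs(int pool) { }

static struct cleancache_ops noop_cleancache_ops = {
	.init_fs	= noop_init_fs,
	.init_shared_fs	= noop_init_shared_fs,
	.get_page	= noop_get_page,
	.put_page	= noop_put_page,
	.flush_page	= noop_flush_page,
	.flush_inode	= noop_flush_inode,
	.flush_fs	= noop_flush_fs,
};

static int __init noop_cleancache_init(void)
{
	/* the previous ops are returned so backends can chain if desired */
	struct cleancache_ops old = cleancache_register_ops(&noop_cleancache_ops);

	if (old.init_fs != NULL)
		printk(KERN_WARNING "cleancache: replacing existing backend\n");
	return 0;
}
module_init(noop_cleancache_init);

Registering such a backend is harmless precisely because of the compare-struct-element-to-negative behavior described in FAQ #2 above.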
diff --git a/MAINTAINERS b/MAINTAINERS
index 1ab17de642e5..d54d551004f7 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3572,9 +3572,16 @@ M: Andrew Morton <akpm@linux-foundation.org>
 M:	Jan Kara <jack@suse.cz>
 L:	linux-ext4@vger.kernel.org
 S:	Maintained
-F:	fs/jbd*/
-F:	include/linux/ext*jbd*.h
-F:	include/linux/jbd*.h
+F:	fs/jbd/
+F:	include/linux/ext3_jbd.h
+F:	include/linux/jbd.h
+
+JOURNALLING LAYER FOR BLOCK DEVICES (JBD2)
+M:	"Theodore Ts'o" <tytso@mit.edu>
+L:	linux-ext4@vger.kernel.org
+S:	Maintained
+F:	fs/jbd2/
+F:	include/linux/jbd2.h
 
 JSM Neo PCI based serial card
 M:	Breno Leitao <leitao@linux.vnet.ibm.com>
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index 8508bfe52296..d240ea950519 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -447,6 +447,13 @@ HYPERVISOR_hvm_op(int op, void *arg)
 	return _hypercall2(unsigned long, hvm_op, op, arg);
 }
 
+static inline int
+HYPERVISOR_tmem_op(
+	struct tmem_op *op)
+{
+	return _hypercall1(int, tmem_op, op);
+}
+
 static inline void
 MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set)
 {
diff --git a/drivers/video/mb862xx/mb862xx-i2c.c b/drivers/video/mb862xx/mb862xx-i2c.c
index cb77d3b4657d..b953099edd8e 100644
--- a/drivers/video/mb862xx/mb862xx-i2c.c
+++ b/drivers/video/mb862xx/mb862xx-i2c.c
@@ -12,6 +12,7 @@
 #include <linux/fb.h>
 #include <linux/i2c.h>
 #include <linux/io.h>
+#include <linux/delay.h>
 
 #include "mb862xxfb.h"
 #include "mb862xx_reg.h"
diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile
index 4781f806701d..bbc18258ecc5 100644
--- a/drivers/xen/Makefile
+++ b/drivers/xen/Makefile
@@ -1,5 +1,6 @@
 obj-y	+= grant-table.o features.o events.o manage.o balloon.o
 obj-y	+= xenbus/
+obj-y	+= tmem.o
 
 nostackp := $(call cc-option, -fno-stack-protector)
 CFLAGS_features.o := $(nostackp)
diff --git a/drivers/xen/tmem.c b/drivers/xen/tmem.c
new file mode 100644
index 000000000000..816a44959ef0
--- /dev/null
+++ b/drivers/xen/tmem.c
@@ -0,0 +1,264 @@
+/*
+ * Xen implementation for transcendent memory (tmem)
+ *
+ * Copyright (C) 2009-2010 Oracle Corp.  All rights reserved.
+ * Author: Dan Magenheimer
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/pagemap.h>
+#include <linux/cleancache.h>
+
+#include <xen/xen.h>
+#include <xen/interface/xen.h>
+#include <asm/xen/hypercall.h>
+#include <asm/xen/page.h>
+#include <asm/xen/hypervisor.h>
+
+#define TMEM_CONTROL               0
+#define TMEM_NEW_POOL              1
+#define TMEM_DESTROY_POOL          2
+#define TMEM_NEW_PAGE              3
+#define TMEM_PUT_PAGE              4
+#define TMEM_GET_PAGE              5
+#define TMEM_FLUSH_PAGE            6
+#define TMEM_FLUSH_OBJECT          7
+#define TMEM_READ                  8
+#define TMEM_WRITE                 9
+#define TMEM_XCHG                 10
+
+/* Bits for HYPERVISOR_tmem_op(TMEM_NEW_POOL) */
+#define TMEM_POOL_PERSIST          1
+#define TMEM_POOL_SHARED           2
+#define TMEM_POOL_PAGESIZE_SHIFT   4
+#define TMEM_VERSION_SHIFT        24
+
+
+struct tmem_pool_uuid {
+	u64 uuid_lo;
+	u64 uuid_hi;
+};
+
+struct tmem_oid {
+	u64 oid[3];
+};
+
+#define TMEM_POOL_PRIVATE_UUID	{ 0, 0 }
+
+/* flags for tmem_ops.new_pool */
+#define TMEM_POOL_PERSIST	1
+#define TMEM_POOL_SHARED	2
+
+/* xen tmem foundation ops/hypercalls */
+
+static inline int xen_tmem_op(u32 tmem_cmd, u32 tmem_pool, struct tmem_oid oid,
+	u32 index, unsigned long gmfn, u32 tmem_offset, u32 pfn_offset, u32 len)
+{
+	struct tmem_op op;
+	int rc = 0;
+
+	op.cmd = tmem_cmd;
+	op.pool_id = tmem_pool;
+	op.u.gen.oid[0] = oid.oid[0];
+	op.u.gen.oid[1] = oid.oid[1];
+	op.u.gen.oid[2] = oid.oid[2];
+	op.u.gen.index = index;
+	op.u.gen.tmem_offset = tmem_offset;
+	op.u.gen.pfn_offset = pfn_offset;
+	op.u.gen.len = len;
+	set_xen_guest_handle(op.u.gen.gmfn, (void *)gmfn);
+	rc = HYPERVISOR_tmem_op(&op);
+	return rc;
+}
+
+static int xen_tmem_new_pool(struct tmem_pool_uuid uuid,
+				u32 flags, unsigned long pagesize)
+{
+	struct tmem_op op;
+	int rc = 0, pageshift;
+
+	for (pageshift = 0; pagesize != 1; pageshift++)
+		pagesize >>= 1;
+	flags |= (pageshift - 12) << TMEM_POOL_PAGESIZE_SHIFT;
+	flags |= TMEM_SPEC_VERSION << TMEM_VERSION_SHIFT;
+	op.cmd = TMEM_NEW_POOL;
+	op.u.new.uuid[0] = uuid.uuid_lo;
+	op.u.new.uuid[1] = uuid.uuid_hi;
+	op.u.new.flags = flags;
+	rc = HYPERVISOR_tmem_op(&op);
+	return rc;
+}
+
+/* xen generic tmem ops */
+
+static int xen_tmem_put_page(u32 pool_id, struct tmem_oid oid,
+			     u32 index, unsigned long pfn)
+{
+	unsigned long gmfn = xen_pv_domain() ? pfn_to_mfn(pfn) : pfn;
+
+	return xen_tmem_op(TMEM_PUT_PAGE, pool_id, oid, index,
+		gmfn, 0, 0, 0);
+}
+
+static int xen_tmem_get_page(u32 pool_id, struct tmem_oid oid,
+			     u32 index, unsigned long pfn)
+{
+	unsigned long gmfn = xen_pv_domain() ? pfn_to_mfn(pfn) : pfn;
+
+	return xen_tmem_op(TMEM_GET_PAGE, pool_id, oid, index,
+		gmfn, 0, 0, 0);
+}
+
+static int xen_tmem_flush_page(u32 pool_id, struct tmem_oid oid, u32 index)
+{
+	return xen_tmem_op(TMEM_FLUSH_PAGE, pool_id, oid, index,
+		0, 0, 0, 0);
+}
+
+static int xen_tmem_flush_object(u32 pool_id, struct tmem_oid oid)
+{
+	return xen_tmem_op(TMEM_FLUSH_OBJECT, pool_id, oid, 0, 0, 0, 0, 0);
+}
+
+static int xen_tmem_destroy_pool(u32 pool_id)
+{
+	struct tmem_oid oid = { { 0 } };
+
+	return xen_tmem_op(TMEM_DESTROY_POOL, pool_id, oid, 0, 0, 0, 0, 0);
+}
+
+int tmem_enabled;
+
+static int __init enable_tmem(char *s)
+{
+	tmem_enabled = 1;
+	return 1;
+}
+
+__setup("tmem", enable_tmem);
+
+/* cleancache ops */
+
+static void tmem_cleancache_put_page(int pool, struct cleancache_filekey key,
+				     pgoff_t index, struct page *page)
+{
+	u32 ind = (u32) index;
+	struct tmem_oid oid = *(struct tmem_oid *)&key;
+	unsigned long pfn = page_to_pfn(page);
+
+	if (pool < 0)
+		return;
+	if (ind != index)
+		return;
+	mb(); /* ensure page is quiescent; tmem may address it with an alias */
+	(void)xen_tmem_put_page((u32)pool, oid, ind, pfn);
+}
+
+static int tmem_cleancache_get_page(int pool, struct cleancache_filekey key,
+				    pgoff_t index, struct page *page)
+{
+	u32 ind = (u32) index;
+	struct tmem_oid oid = *(struct tmem_oid *)&key;
+	unsigned long pfn = page_to_pfn(page);
+	int ret;
+
+	/* translate return values to linux semantics */
+	if (pool < 0)
+		return -1;
+	if (ind != index)
+		return -1;
+	ret = xen_tmem_get_page((u32)pool, oid, ind, pfn);
+	if (ret == 1)
+		return 0;
+	else
+		return -1;
+}
+
+static void tmem_cleancache_flush_page(int pool, struct cleancache_filekey key,
+				       pgoff_t index)
+{
+	u32 ind = (u32) index;
+	struct tmem_oid oid = *(struct tmem_oid *)&key;
+
+	if (pool < 0)
+		return;
+	if (ind != index)
+		return;
+	(void)xen_tmem_flush_page((u32)pool, oid, ind);
+}
+
+static void tmem_cleancache_flush_inode(int pool, struct cleancache_filekey key)
+{
+	struct tmem_oid oid = *(struct tmem_oid *)&key;
+
+	if (pool < 0)
+		return;
+	(void)xen_tmem_flush_object((u32)pool, oid);
+}
+
+static void tmem_cleancache_flush_fs(int pool)
+{
+	if (pool < 0)
+		return;
+	(void)xen_tmem_destroy_pool((u32)pool);
+}
+
+static int tmem_cleancache_init_fs(size_t pagesize)
+{
+	struct tmem_pool_uuid uuid_private = TMEM_POOL_PRIVATE_UUID;
+
+	return xen_tmem_new_pool(uuid_private, 0, pagesize);
+}
+
+static int tmem_cleancache_init_shared_fs(char *uuid, size_t pagesize)
+{
+	struct tmem_pool_uuid shared_uuid;
+
+	shared_uuid.uuid_lo = *(u64 *)uuid;
+	shared_uuid.uuid_hi = *(u64 *)(&uuid[8]);
+	return xen_tmem_new_pool(shared_uuid, TMEM_POOL_SHARED, pagesize);
+}
+
+static int use_cleancache = 1;
+
+static int __init no_cleancache(char *s)
+{
+	use_cleancache = 0;
+	return 1;
+}
+
+__setup("nocleancache", no_cleancache);
+
+static struct cleancache_ops tmem_cleancache_ops = {
+	.put_page = tmem_cleancache_put_page,
+	.get_page = tmem_cleancache_get_page,
+	.flush_page = tmem_cleancache_flush_page,
+	.flush_inode = tmem_cleancache_flush_inode,
+	.flush_fs = tmem_cleancache_flush_fs,
+	.init_shared_fs = tmem_cleancache_init_shared_fs,
+	.init_fs = tmem_cleancache_init_fs
+};
+
+static int __init xen_tmem_init(void)
+{
+	struct cleancache_ops old_ops;
+
+	if (!xen_domain())
+		return 0;
+#ifdef CONFIG_CLEANCACHE
+	BUG_ON(sizeof(struct cleancache_filekey) != sizeof(struct tmem_oid));
+	if (tmem_enabled && use_cleancache) {
+		char *s = "";
+		old_ops = cleancache_register_ops(&tmem_cleancache_ops);
+		if (old_ops.init_fs != NULL)
+			s = " (WARNING: cleancache_ops overridden)";
+		printk(KERN_INFO "cleancache enabled, RAM provided by "
+				 "Xen Transcendent Memory%s\n", s);
+	}
+#endif
+	return 0;
+}
+
+module_init(xen_tmem_init)
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 7f6c67703195..8d7f3e69ae29 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -814,6 +814,7 @@ int v9fs_vfs_unlink(struct inode *i, struct dentry *d)
 
 int v9fs_vfs_rmdir(struct inode *i, struct dentry *d)
 {
+	dentry_unhash(d);
 	return v9fs_remove(i, d, 1);
 }
 
@@ -839,6 +840,9 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct p9_fid *newdirfid;
 	struct p9_wstat wstat;
 
+	if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
+		dentry_unhash(new_dentry);
+
 	P9_DPRINTK(P9_DEBUG_VFS, "\n");
 	retval = 0;
 	old_inode = old_dentry->d_inode;
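
This 9p change is the first of many near-identical hunks below (affs, afs, autofs4, bfs, coda, configfs, ecryptfs, and others): the VFS used to unhash the victim dentry in vfs_rmdir() and vfs_rename_dir() itself, and this series pushes that call down into each filesystem. As a sketch of the recurring shape, with hypothetical somefs_* names standing in for any affected filesystem:

#include <linux/fs.h>
#include <linux/dcache.h>

/* hypothetical filesystem-specific helpers, for illustration only */
static int somefs_do_rmdir(struct inode *dir, struct dentry *dentry);
static int somefs_do_rename(struct inode *old_dir, struct dentry *old_dentry,
			    struct inode *new_dir, struct dentry *new_dentry);

static int somefs_rmdir(struct inode *dir, struct dentry *dentry)
{
	dentry_unhash(dentry);	/* previously done for us by vfs_rmdir() */
	return somefs_do_rmdir(dir, dentry);
}

static int somefs_rename(struct inode *old_dir, struct dentry *old_dentry,
			 struct inode *new_dir, struct dentry *new_dentry)
{
	/* only a directory that is being overwritten needs unhashing */
	if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
		dentry_unhash(new_dentry);
	return somefs_do_rename(old_dir, old_dentry, new_dir, new_dentry);
}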
diff --git a/fs/Kconfig b/fs/Kconfig
index 979992dcb386..19891aab9c6e 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -47,7 +47,7 @@ config FS_POSIX_ACL
 	def_bool n
 
 config EXPORTFS
-	bool
+	tristate
 
 config FILE_LOCKING
 	bool "Enable POSIX file locking API" if EXPERT
@@ -121,6 +121,20 @@ config TMPFS
 
 	  See <file:Documentation/filesystems/tmpfs.txt> for details.
 
+config TMPFS_POSIX_ACL
+	bool "Tmpfs POSIX Access Control Lists"
+	depends on TMPFS
+	select TMPFS_XATTR
+	select GENERIC_ACL
+	help
+	  POSIX Access Control Lists (ACLs) support permissions for users and
+	  groups beyond the owner/group/world scheme.
+
+	  To learn more about Access Control Lists, visit the POSIX ACLs for
+	  Linux website <http://acl.bestbits.at/>.
+
+	  If you don't know what Access Control Lists are, say N.
+
 config TMPFS_XATTR
 	bool "Tmpfs extended attributes"
 	depends on TMPFS
@@ -133,22 +147,9 @@ config TMPFS_XATTR
 	  Currently this enables support for the trusted.* and
 	  security.* namespaces.
 
-	  If unsure, say N.
-
 	  You need this for POSIX ACL support on tmpfs.
 
-config TMPFS_POSIX_ACL
-	bool "Tmpfs POSIX Access Control Lists"
-	depends on TMPFS_XATTR
-	select GENERIC_ACL
-	help
-	  POSIX Access Control Lists (ACLs) support permissions for users and
-	  groups beyond the owner/group/world scheme.
-
-	  To learn more about Access Control Lists, visit the POSIX ACLs for
-	  Linux website <http://acl.bestbits.at/>.
-
-	  If you don't know what Access Control Lists are, say N.
+	  If unsure, say N.
 
 config HUGETLBFS
 	bool "HugeTLB file system support"
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index e3e9efc1fdd8..03330e2e390c 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -320,6 +320,8 @@ affs_rmdir(struct inode *dir, struct dentry *dentry)
 		 dentry->d_inode->i_ino,
 		 (int)dentry->d_name.len, dentry->d_name.name);
 
+	dentry_unhash(dentry);
+
 	return affs_remove_header(dentry);
 }
 
@@ -417,6 +419,9 @@ affs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct buffer_head *bh = NULL;
 	int retval;
 
+	if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
+		dentry_unhash(new_dentry);
+
 	pr_debug("AFFS: rename(old=%u,\"%*s\" to new=%u,\"%*s\")\n",
 		 (u32)old_dir->i_ino, (int)old_dentry->d_name.len, old_dentry->d_name.name,
 		 (u32)new_dir->i_ino, (int)new_dentry->d_name.len, new_dentry->d_name.name);
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 20c106f24927..2c4e05160042 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -845,6 +845,8 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry)
 	_enter("{%x:%u},{%s}",
 	       dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name);
 
+	dentry_unhash(dentry);
+
 	ret = -ENAMETOOLONG;
 	if (dentry->d_name.len >= AFSNAMEMAX)
 		goto error;
@@ -1146,6 +1148,9 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct key *key;
 	int ret;
 
+	if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
+		dentry_unhash(new_dentry);
+
 	vnode = AFS_FS_I(old_dentry->d_inode);
 	orig_dvnode = AFS_FS_I(old_dir);
 	new_dvnode = AFS_FS_I(new_dir);
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index f55ae23b137e..87d95a8cddbc 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -583,6 +583,8 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
 	if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
 		return -EACCES;
 
+	dentry_unhash(dentry);
+
 	if (atomic_dec_and_test(&ino->count)) {
 		p_ino = autofs4_dentry_ino(dentry->d_parent);
 		if (p_ino && dentry->d_parent != dentry)
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index b14cebfd9047..c7d1d06b0483 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -224,6 +224,9 @@ static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct bfs_sb_info *info;
 	int error = -ENOENT;
 
+	if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
+		dentry_unhash(new_dentry);
+
 	old_bh = new_bh = NULL;
 	old_inode = old_dentry->d_inode;
 	if (S_ISDIR(old_inode->i_mode))
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 96fcfa522dab..4f9893243dae 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -11,6 +11,7 @@
 #include <linux/writeback.h>
 #include <linux/pagevec.h>
 #include <linux/prefetch.h>
+#include <linux/cleancache.h>
 #include "extent_io.h"
 #include "extent_map.h"
 #include "compat.h"
@@ -2016,6 +2017,13 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
 
 	set_page_extent_mapped(page);
 
+	if (!PageUptodate(page)) {
+		if (cleancache_get_page(page) == 0) {
+			BUG_ON(blocksize != PAGE_SIZE);
+			goto out;
+		}
+	}
+
 	end = page_end;
 	while (1) {
 		lock_extent(tree, start, end, GFP_NOFS);
@@ -2149,6 +2157,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
 		cur = cur + iosize;
 		page_offset += iosize;
 	}
+out:
 	if (!nr) {
 		if (!PageError(page))
 			SetPageUptodate(page);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 0ac712efcdf2..be4ffa12f3ef 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -39,6 +39,7 @@
 #include <linux/miscdevice.h>
 #include <linux/magic.h>
 #include <linux/slab.h>
+#include <linux/cleancache.h>
 #include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
@@ -624,6 +625,7 @@ static int btrfs_fill_super(struct super_block *sb,
 	sb->s_root = root_dentry;
 
 	save_mount_options(sb, data);
+	cleancache_init_fs(sb);
 	return 0;
 
 fail_close:
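
Taken together, the two btrfs hunks above show the whole per-filesystem contract: opt in once at mount time with cleancache_init_fs(), then consult cleancache_get_page() before issuing a disk read. A hedged sketch of the same wiring in a generic filesystem follows; the myfs_* names are hypothetical and error handling is elided:

#include <linux/cleancache.h>
#include <linux/buffer_head.h>
#include <linux/mpage.h>
#include <linux/pagemap.h>

/* hypothetical get_block_t for the fallback read path */
static int myfs_get_block(struct inode *inode, sector_t iblock,
			  struct buffer_head *bh_result, int create);

/* mount time: opt in; a negative pool id quietly disables the hooks */
static int myfs_fill_super(struct super_block *sb, void *data, int silent)
{
	/* ... normal superblock setup elided ... */
	cleancache_init_fs(sb);		/* pool id is stashed in the sb */
	return 0;
}

/* read path: consult transcendent memory before touching the disk */
static int myfs_readpage(struct file *file, struct page *page)
{
	if (!PageUptodate(page) && cleancache_get_page(page) == 0) {
		/* hit: the data was copied into the page, skip the read */
		SetPageUptodate(page);
		unlock_page(page);
		return 0;
	}
	/* miss: fall through to an ordinary buffered read */
	return mpage_readpage(page, myfs_get_block);
}

The flush-side calls (such as cleancache_flush_inode() in the fs/buffer.c hunk below) come via the VFS for well-behaved filesystems, per Documentation/vm/cleancache.txt above.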
diff --git a/fs/buffer.c b/fs/buffer.c
index a08bb8e61c6f..698c6b2cc462 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -41,6 +41,7 @@
 #include <linux/bitops.h>
 #include <linux/mpage.h>
 #include <linux/bit_spinlock.h>
+#include <linux/cleancache.h>
 
 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
 
@@ -269,6 +270,10 @@ void invalidate_bdev(struct block_device *bdev)
 	invalidate_bh_lrus();
 	lru_add_drain_all();	/* make sure all lru add caches are flushed */
 	invalidate_mapping_pages(mapping, 0, -1);
+	/* 99% of the time, we don't need to flush the cleancache on the bdev.
+	 * But, for the strange corners, lets be cautious
+	 */
+	cleancache_flush_inode(mapping);
 }
 EXPORT_SYMBOL(invalidate_bdev);
 
@@ -2331,24 +2336,26 @@ EXPORT_SYMBOL(block_commit_write);
 * page lock we can determine safely if the page is beyond EOF. If it is not
 * beyond EOF, then the page is guaranteed safe against truncation until we
 * unlock the page.
+ *
+ * Direct callers of this function should call vfs_check_frozen() so that page
+ * fault does not busyloop until the fs is thawed.
 */
-int
-block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
-		   get_block_t get_block)
+int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
+			 get_block_t get_block)
 {
 	struct page *page = vmf->page;
 	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
 	unsigned long end;
 	loff_t size;
-	int ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
+	int ret;
 
 	lock_page(page);
 	size = i_size_read(inode);
 	if ((page->mapping != inode->i_mapping) ||
 	    (page_offset(page) > size)) {
-		/* page got truncated out from underneath us */
-		unlock_page(page);
-		goto out;
+		/* We overload EFAULT to mean page got truncated */
+		ret = -EFAULT;
+		goto out_unlock;
 	}
 
 	/* page is wholly or partially inside EOF */
@@ -2361,18 +2368,41 @@ block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
 	if (!ret)
 		ret = block_commit_write(page, 0, end);
 
-	if (unlikely(ret)) {
-		unlock_page(page);
-		if (ret == -ENOMEM)
-			ret = VM_FAULT_OOM;
-		else /* -ENOSPC, -EIO, etc */
-			ret = VM_FAULT_SIGBUS;
-	} else
-		ret = VM_FAULT_LOCKED;
-
-out:
+	if (unlikely(ret < 0))
+		goto out_unlock;
+	/*
+	 * Freezing in progress? We check after the page is marked dirty and
+	 * with page lock held so if the test here fails, we are sure freezing
+	 * code will wait during syncing until the page fault is done - at that
+	 * point page will be dirty and unlocked so freezing code will write it
+	 * and writeprotect it again.
+	 */
+	set_page_dirty(page);
+	if (inode->i_sb->s_frozen != SB_UNFROZEN) {
+		ret = -EAGAIN;
+		goto out_unlock;
+	}
+	return 0;
+out_unlock:
+	unlock_page(page);
 	return ret;
 }
+EXPORT_SYMBOL(__block_page_mkwrite);
+
+int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
+		   get_block_t get_block)
+{
+	int ret;
+	struct super_block *sb = vma->vm_file->f_path.dentry->d_inode->i_sb;
+
+	/*
+	 * This check is racy but catches the common case. The check in
+	 * __block_page_mkwrite() is reliable.
+	 */
+	vfs_check_frozen(sb, SB_FREEZE_WRITE);
+	ret = __block_page_mkwrite(vma, vmf, get_block);
+	return block_page_mkwrite_return(ret);
+}
 EXPORT_SYMBOL(block_page_mkwrite);
 
 /*
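
The refactoring above splits the old block_page_mkwrite() into __block_page_mkwrite(), which reports errors as -EFAULT/-EAGAIN/etc. and makes the freeze handling visible to callers, plus a thin wrapper that converts back to VM_FAULT_* codes. Per the new comment, a filesystem providing its own ->page_mkwrite() on top of the helper would do the racy vfs_check_frozen() itself. A sketch, where myfs_get_block is a hypothetical get_block_t and block_page_mkwrite_return() is the helper this series adds to include/linux/buffer_head.h:

#include <linux/buffer_head.h>
#include <linux/fs.h>
#include <linux/mm.h>

/* hypothetical block-mapping callback, for illustration only */
static int myfs_get_block(struct inode *inode, sector_t iblock,
			  struct buffer_head *bh_result, int create);

static int myfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
	int ret;

	/* racy fast-path check; the recheck in __block_page_mkwrite()
	 * under the page lock is the reliable one */
	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
	ret = __block_page_mkwrite(vma, vmf, myfs_get_block);
	/* translate -EFAULT/-EAGAIN/-ENOMEM/... into VM_FAULT_* codes */
	return block_page_mkwrite_return(ret);
}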
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 2b8dae4d121e..a46126fd5735 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -336,6 +336,8 @@ static int coda_rmdir(struct inode *dir, struct dentry *de)
 	int len = de->d_name.len;
 	int error;
 
+	dentry_unhash(de);
+
 	error = venus_rmdir(dir->i_sb, coda_i2f(dir), name, len);
 	if (!error) {
 		/* VFS may delete the child */
@@ -359,6 +361,9 @@ static int coda_rename(struct inode *old_dir, struct dentry *old_dentry,
 	int new_length = new_dentry->d_name.len;
 	int error;
 
+	if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
+		dentry_unhash(new_dentry);
+
 	error = venus_rename(old_dir->i_sb, coda_i2f(old_dir),
 			     coda_i2f(new_dir), old_length, new_length,
 			     (const char *) old_name, (const char *)new_name);
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 9a37a9b6de3a..9d17d350abc5 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -1359,6 +1359,8 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
 	struct module *subsys_owner = NULL, *dead_item_owner = NULL;
 	int ret;
 
+	dentry_unhash(dentry);
+
 	if (dentry->d_parent == configfs_sb->s_root)
 		return -EPERM;
 
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 4d4cc6a90cd5..227b409b8406 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -521,6 +521,8 @@ static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry)
 	struct dentry *lower_dir_dentry;
 	int rc;
 
+	dentry_unhash(dentry);
+
 	lower_dentry = ecryptfs_dentry_to_lower(dentry);
 	dget(dentry);
 	lower_dir_dentry = lock_parent(lower_dentry);
@@ -571,6 +573,9 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct dentry *lower_new_dir_dentry;
 	struct dentry *trap = NULL;
 
+	if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
+		dentry_unhash(new_dentry);
+
 	lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry);
 	lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry);
 	dget(lower_old_dentry);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 3c6a9e0eadc1..aad153ef6b78 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -36,6 +36,7 @@
 #include <linux/quotaops.h>
 #include <linux/seq_file.h>
 #include <linux/log2.h>
+#include <linux/cleancache.h>
 
 #include <asm/uaccess.h>
 
@@ -1367,6 +1368,7 @@ static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es,
 	} else {
 		ext3_msg(sb, KERN_INFO, "using internal journal");
 	}
+	cleancache_init_fs(sb);
 	return res;
 }
 
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index c947e36eda6c..04109460ba9e 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -6,7 +6,8 @@ obj-$(CONFIG_EXT4_FS) += ext4.o
 
 ext4-y	:= balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
 		ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
-		ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o
+		ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \
+		mmp.o
 
 ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
 ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 1c67139ad4b4..264f6949511e 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -362,130 +362,6 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
362} 362}
363 363
364/** 364/**
365 * ext4_add_groupblocks() -- Add given blocks to an existing group
366 * @handle: handle to this transaction
367 * @sb: super block
368 * @block: start physcial block to add to the block group
369 * @count: number of blocks to free
370 *
371 * This marks the blocks as free in the bitmap. We ask the
372 * mballoc to reload the buddy after this by setting group
373 * EXT4_GROUP_INFO_NEED_INIT_BIT flag
374 */
375void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
376 ext4_fsblk_t block, unsigned long count)
377{
378 struct buffer_head *bitmap_bh = NULL;
379 struct buffer_head *gd_bh;
380 ext4_group_t block_group;
381 ext4_grpblk_t bit;
382 unsigned int i;
383 struct ext4_group_desc *desc;
384 struct ext4_sb_info *sbi = EXT4_SB(sb);
385 int err = 0, ret, blk_free_count;
386 ext4_grpblk_t blocks_freed;
387 struct ext4_group_info *grp;
388
389 ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
390
391 ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
392 grp = ext4_get_group_info(sb, block_group);
393 /*
394 * Check to see if we are freeing blocks across a group
395 * boundary.
396 */
397 if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
398 goto error_return;
399 }
400 bitmap_bh = ext4_read_block_bitmap(sb, block_group);
401 if (!bitmap_bh)
402 goto error_return;
403 desc = ext4_get_group_desc(sb, block_group, &gd_bh);
404 if (!desc)
405 goto error_return;
406
407 if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
408 in_range(ext4_inode_bitmap(sb, desc), block, count) ||
409 in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
410 in_range(block + count - 1, ext4_inode_table(sb, desc),
411 sbi->s_itb_per_group)) {
412 ext4_error(sb, "Adding blocks in system zones - "
413 "Block = %llu, count = %lu",
414 block, count);
415 goto error_return;
416 }
417
418 /*
419 * We are about to add blocks to the bitmap,
420 * so we need undo access.
421 */
422 BUFFER_TRACE(bitmap_bh, "getting undo access");
423 err = ext4_journal_get_undo_access(handle, bitmap_bh);
424 if (err)
425 goto error_return;
426
427 /*
428 * We are about to modify some metadata. Call the journal APIs
429 * to unshare ->b_data if a currently-committing transaction is
430 * using it
431 */
432 BUFFER_TRACE(gd_bh, "get_write_access");
433 err = ext4_journal_get_write_access(handle, gd_bh);
434 if (err)
435 goto error_return;
436 /*
437 * make sure we don't allow a parallel init on other groups in the
438 * same buddy cache
439 */
440 down_write(&grp->alloc_sem);
441 for (i = 0, blocks_freed = 0; i < count; i++) {
442 BUFFER_TRACE(bitmap_bh, "clear bit");
443 if (!ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group),
444 bit + i, bitmap_bh->b_data)) {
445 ext4_error(sb, "bit already cleared for block %llu",
446 (ext4_fsblk_t)(block + i));
447 BUFFER_TRACE(bitmap_bh, "bit already cleared");
448 } else {
449 blocks_freed++;
450 }
451 }
452 ext4_lock_group(sb, block_group);
453 blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc);
454 ext4_free_blks_set(sb, desc, blk_free_count);
455 desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
456 ext4_unlock_group(sb, block_group);
457 percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed);
458
459 if (sbi->s_log_groups_per_flex) {
460 ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
461 atomic_add(blocks_freed,
462 &sbi->s_flex_groups[flex_group].free_blocks);
463 }
464 /*
465 * request to reload the buddy with the
466 * new bitmap information
467 */
468 set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
469 grp->bb_free += blocks_freed;
470 up_write(&grp->alloc_sem);
471
472 /* We dirtied the bitmap block */
473 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
474 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
475
476 /* And the group descriptor block */
477 BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
478 ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
479 if (!err)
480 err = ret;
481
482error_return:
483 brelse(bitmap_bh);
484 ext4_std_error(sb, err);
485 return;
486}
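For orientation, a hedged toy version (editor's sketch, not part of the patch) of the bit-clearing loop above: only bits that were actually set count toward blocks_freed, so a double-free is reported rather than double-counted.

static unsigned int clear_range(unsigned long *bitmap, int start, int count)
{
	unsigned int freed = 0;
	int i;

	for (i = 0; i < count; i++)
		if (test_and_clear_bit(start + i, bitmap))
			freed++;	/* bit was set: a block really came free */
	return freed;
}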

 /**
  * ext4_has_free_blocks()
  * @sbi:	in-core super block structure.
  * @nblocks:	number of needed blocks
@@ -493,7 +369,8 @@ error_return:
  * Check if filesystem has nblocks free & available for allocation.
  * On success return 1, return 0 on failure.
  */
-static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
+static int ext4_has_free_blocks(struct ext4_sb_info *sbi,
+				s64 nblocks, unsigned int flags)
 {
 	s64 free_blocks, dirty_blocks, root_blocks;
 	struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
@@ -507,11 +384,6 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
 	    EXT4_FREEBLOCKS_WATERMARK) {
 		free_blocks  = percpu_counter_sum_positive(fbc);
 		dirty_blocks = percpu_counter_sum_positive(dbc);
-		if (dirty_blocks < 0) {
-			printk(KERN_CRIT "Dirty block accounting "
-					"went wrong %lld\n",
-					(long long)dirty_blocks);
-		}
 	}
 	/* Check whether we have space after
 	 * accounting for current dirty blocks & root reserved blocks.
@@ -522,7 +394,9 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
 	/* Hm, nope. Are (enough) root reserved blocks available? */
 	if (sbi->s_resuid == current_fsuid() ||
 	    ((sbi->s_resgid != 0) && in_group_p(sbi->s_resgid)) ||
-	    capable(CAP_SYS_RESOURCE)) {
+	    capable(CAP_SYS_RESOURCE) ||
+	    (flags & EXT4_MB_USE_ROOT_BLOCKS)) {
+
 		if (free_blocks >= (nblocks + dirty_blocks))
 			return 1;
 	}
@@ -531,9 +405,9 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
 }

 int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
-			   s64 nblocks)
+			   s64 nblocks, unsigned int flags)
 {
-	if (ext4_has_free_blocks(sbi, nblocks)) {
+	if (ext4_has_free_blocks(sbi, nblocks, flags)) {
 		percpu_counter_add(&sbi->s_dirtyblocks_counter, nblocks);
 		return 0;
 	} else
@@ -554,7 +428,7 @@ int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
  */
 int ext4_should_retry_alloc(struct super_block *sb, int *retries)
 {
-	if (!ext4_has_free_blocks(EXT4_SB(sb), 1) ||
+	if (!ext4_has_free_blocks(EXT4_SB(sb), 1, 0) ||
 	    (*retries)++ > 3 ||
 	    !EXT4_SB(sb)->s_journal)
 		return 0;
@@ -577,7 +451,8 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
  * error stores in errp pointer
  */
 ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
-		ext4_fsblk_t goal, unsigned long *count, int *errp)
+				  ext4_fsblk_t goal, unsigned int flags,
+				  unsigned long *count, int *errp)
 {
 	struct ext4_allocation_request ar;
 	ext4_fsblk_t ret;
@@ -587,6 +462,7 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
 	ar.inode = inode;
 	ar.goal = goal;
 	ar.len = count ? *count : 1;
+	ar.flags = flags;

 	ret = ext4_mb_new_blocks(handle, &ar, errp);
 	if (count)
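The new flags argument threads an allocation policy bit all the way down to mballoc. A hedged sketch of the new calling convention (error handling trimmed, caller hypothetical): a metadata allocation that is allowed to dip into the root reserve, as punch-hole tree splits will be, would look like:

	count = 1;
	block = ext4_new_meta_blocks(handle, inode, goal,
				     EXT4_MB_USE_ROOT_BLOCKS, &count, &err);
	if (!block)
		goto fail;	/* err holds the reason, e.g. -ENOSPC */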
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 4daaf2b753f4..a74b89c09f90 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -108,7 +108,8 @@ typedef unsigned int ext4_group_t;
 #define EXT4_MB_DELALLOC_RESERVED	0x0400
 /* We are doing stream allocation */
 #define EXT4_MB_STREAM_ALLOC		0x0800
+/* Use reserved root blocks if needed */
+#define EXT4_MB_USE_ROOT_BLOCKS	0x1000

 struct ext4_allocation_request {
 	/* target inode for block we're allocating */
@@ -209,6 +210,8 @@ struct ext4_io_submit {
  */
 #define	EXT4_BAD_INO		 1	/* Bad blocks inode */
 #define EXT4_ROOT_INO		 2	/* Root inode */
+#define EXT4_USR_QUOTA_INO	 3	/* User quota inode */
+#define EXT4_GRP_QUOTA_INO	 4	/* Group quota inode */
 #define EXT4_BOOT_LOADER_INO	 5	/* Boot loader inode */
 #define EXT4_UNDEL_DIR_INO	 6	/* Undelete directory inode */
 #define EXT4_RESIZE_INO		 7	/* Reserved group descriptors inode */
@@ -512,6 +515,10 @@ struct ext4_new_group_data {
 	/* Convert extent to initialized after IO complete */
 #define EXT4_GET_BLOCKS_IO_CONVERT_EXT	(EXT4_GET_BLOCKS_CONVERT|\
 					 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
+	/* Punch out blocks of an extent */
+#define EXT4_GET_BLOCKS_PUNCH_OUT_EXT	0x0020
+	/* Don't normalize allocation size (used for fallocate) */
+#define EXT4_GET_BLOCKS_NO_NORMALIZE	0x0040

 /*
  * Flags used by ext4_free_blocks
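A hedged composition example for the two new flags (this mirrors how the fallocate and punch-hole paths later in this series are expected to use them; the exact call sites are not shown in this section):

	/* fallocate-style mapping request: uninitialized, no size rounding */
	unsigned int flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT |
			     EXT4_GET_BLOCKS_NO_NORMALIZE;

	ret = ext4_map_blocks(handle, inode, &map, flags);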
@@ -1028,7 +1035,7 @@ struct ext4_super_block {
 	__le16	s_want_extra_isize; 	/* New inodes should reserve # bytes */
 	__le32	s_flags;		/* Miscellaneous flags */
 	__le16  s_raid_stride;		/* RAID stride */
-	__le16  s_mmp_interval;         /* # seconds to wait in MMP checking */
+	__le16  s_mmp_update_interval;  /* # seconds to wait in MMP checking */
 	__le64  s_mmp_block;            /* Block for multi-mount protection */
 	__le32  s_raid_stripe_width;    /* blocks on all data disks (N*stride)*/
 	__u8	s_log_groups_per_flex;  /* FLEX_BG group size */
@@ -1144,6 +1151,9 @@ struct ext4_sb_info {
 	unsigned long s_ext_blocks;
 	unsigned long s_ext_extents;
 #endif
+	/* ext4 extent cache stats */
+	unsigned long extent_cache_hits;
+	unsigned long extent_cache_misses;

 	/* for buddy allocator */
 	struct ext4_group_info ***s_group_info;
@@ -1201,6 +1211,9 @@ struct ext4_sb_info {
 	struct ext4_li_request *s_li_request;
 	/* Wait multiplier for lazy initialization thread */
 	unsigned int s_li_wait_mult;
+
+	/* Kernel thread for multiple mount protection */
+	struct task_struct *s_mmp_tsk;
 };

 static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1338,6 +1351,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
 #define EXT4_FEATURE_RO_COMPAT_GDT_CSUM		0x0010
 #define EXT4_FEATURE_RO_COMPAT_DIR_NLINK	0x0020
 #define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE	0x0040
+#define EXT4_FEATURE_RO_COMPAT_QUOTA		0x0100

 #define EXT4_FEATURE_INCOMPAT_COMPRESSION	0x0001
 #define EXT4_FEATURE_INCOMPAT_FILETYPE		0x0002
@@ -1351,13 +1365,29 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
 #define EXT4_FEATURE_INCOMPAT_EA_INODE		0x0400 /* EA in inode */
 #define EXT4_FEATURE_INCOMPAT_DIRDATA		0x1000 /* data in dirent */

+#define EXT2_FEATURE_COMPAT_SUPP	EXT4_FEATURE_COMPAT_EXT_ATTR
+#define EXT2_FEATURE_INCOMPAT_SUPP	(EXT4_FEATURE_INCOMPAT_FILETYPE| \
+					 EXT4_FEATURE_INCOMPAT_META_BG)
+#define EXT2_FEATURE_RO_COMPAT_SUPP	(EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+					 EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
+					 EXT4_FEATURE_RO_COMPAT_BTREE_DIR)
+
+#define EXT3_FEATURE_COMPAT_SUPP	EXT4_FEATURE_COMPAT_EXT_ATTR
+#define EXT3_FEATURE_INCOMPAT_SUPP	(EXT4_FEATURE_INCOMPAT_FILETYPE| \
+					 EXT4_FEATURE_INCOMPAT_RECOVER| \
+					 EXT4_FEATURE_INCOMPAT_META_BG)
+#define EXT3_FEATURE_RO_COMPAT_SUPP	(EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+					 EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
+					 EXT4_FEATURE_RO_COMPAT_BTREE_DIR)
+
 #define EXT4_FEATURE_COMPAT_SUPP	EXT2_FEATURE_COMPAT_EXT_ATTR
 #define EXT4_FEATURE_INCOMPAT_SUPP	(EXT4_FEATURE_INCOMPAT_FILETYPE| \
 					 EXT4_FEATURE_INCOMPAT_RECOVER| \
 					 EXT4_FEATURE_INCOMPAT_META_BG| \
 					 EXT4_FEATURE_INCOMPAT_EXTENTS| \
 					 EXT4_FEATURE_INCOMPAT_64BIT| \
-					 EXT4_FEATURE_INCOMPAT_FLEX_BG)
+					 EXT4_FEATURE_INCOMPAT_FLEX_BG| \
+					 EXT4_FEATURE_INCOMPAT_MMP)
 #define EXT4_FEATURE_RO_COMPAT_SUPP	(EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
 					 EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
 					 EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
@@ -1590,12 +1620,6 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
  */
 struct ext4_lazy_init {
 	unsigned long		li_state;
-
-	wait_queue_head_t	li_wait_daemon;
-	wait_queue_head_t	li_wait_task;
-	struct timer_list	li_timer;
-	struct task_struct	*li_task;
-
 	struct list_head	li_request_list;
 	struct mutex		li_list_mtx;
 };
@@ -1615,6 +1639,67 @@ struct ext4_features {
 };

 /*
1642 * This structure will be used for multiple mount protection. It will be
1643 * written into the block number saved in the s_mmp_block field in the
1644 * superblock. Programs that check MMP should assume that if
1645 * SEQ_FSCK (or any unknown code above SEQ_MAX) is present then it is NOT safe
1646 * to use the filesystem, regardless of how old the timestamp is.
1647 */
1648#define EXT4_MMP_MAGIC 0x004D4D50U /* ASCII for MMP */
1649#define EXT4_MMP_SEQ_CLEAN 0xFF4D4D50U /* mmp_seq value for clean unmount */
1650#define EXT4_MMP_SEQ_FSCK 0xE24D4D50U /* mmp_seq value when being fscked */
1651#define EXT4_MMP_SEQ_MAX 0xE24D4D4FU /* maximum valid mmp_seq value */
1652
1653struct mmp_struct {
1654 __le32 mmp_magic; /* Magic number for MMP */
1655 __le32 mmp_seq; /* Sequence no. updated periodically */
1656
1657 /*
1658 * mmp_time, mmp_nodename & mmp_bdevname are only used for information
1659 * purposes and do not affect the correctness of the algorithm
1660 */
1661 __le64 mmp_time; /* Time last updated */
1662 char mmp_nodename[64]; /* Node which last updated MMP block */
1663 char mmp_bdevname[32]; /* Bdev which last updated MMP block */
1664
1665 /*
1666 * mmp_check_interval is used to verify if the MMP block has been
1667 * updated on the block device. The value is updated based on the
1668 * maximum time to write the MMP block during an update cycle.
1669 */
1670 __le16 mmp_check_interval;
1671
1672 __le16 mmp_pad1;
1673 __le32 mmp_pad2[227];
1674};
1675
1676/* arguments passed to the mmp thread */
1677struct mmpd_data {
1678 struct buffer_head *bh; /* bh from initial read_mmp_block() */
1679 struct super_block *sb; /* super block of the fs */
1680};
1681
1682/*
1683 * Check interval multiplier
1684 * The MMP block is written every update interval and initially checked every
1685 * update interval x the multiplier (the value is then adapted based on the
1686 * write latency). The reason is that writes can be delayed under load and we
1687 * don't want readers to incorrectly assume that the filesystem is no longer
1688 * in use.
1689 */
1690#define EXT4_MMP_CHECK_MULT 2UL
1691
1692/*
1693 * Minimum interval for MMP checking in seconds.
1694 */
1695#define EXT4_MMP_MIN_CHECK_INTERVAL 5UL
1696
1697/*
1698 * Maximum interval for MMP checking in seconds.
1699 */
1700#define EXT4_MMP_MAX_CHECK_INTERVAL 300UL
1701
1702/*
  * Function prototypes
  */

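A hedged sketch (editor's, not from the patch) of how the MMP constants above are meant to be consumed; mmp_in_use() is a hypothetical helper, but the state codes and the clamping bounds are the ones defined above:

static int mmp_in_use(__u32 seq)
{
	if (seq == EXT4_MMP_SEQ_CLEAN)
		return 0;	/* last user unmounted cleanly */
	if (seq == EXT4_MMP_SEQ_FSCK || seq > EXT4_MMP_SEQ_MAX)
		return 1;	/* being fscked, or unknown code: never safe */
	return 2;	/* live sequence: re-read later and compare */
}

	/* polling interval derived from the update interval, then clamped */
	check = EXT4_MMP_CHECK_MULT * update_interval;
	if (check < EXT4_MMP_MIN_CHECK_INTERVAL)
		check = EXT4_MMP_MIN_CHECK_INTERVAL;
	if (check > EXT4_MMP_MAX_CHECK_INTERVAL)
		check = EXT4_MMP_MAX_CHECK_INTERVAL;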
@@ -1638,10 +1723,12 @@ extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group);
 extern unsigned long ext4_bg_num_gdb(struct super_block *sb,
 			ext4_group_t group);
 extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
-			ext4_fsblk_t goal, unsigned long *count, int *errp);
-extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
-extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
-				ext4_fsblk_t block, unsigned long count);
+					 ext4_fsblk_t goal,
+					 unsigned int flags,
+					 unsigned long *count,
+					 int *errp);
+extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
+				  s64 nblocks, unsigned int flags);
 extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *);
 extern void ext4_check_blocks_bitmap(struct super_block *);
 extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
@@ -1706,6 +1793,8 @@ extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
 			     unsigned long count, int flags);
 extern int ext4_mb_add_groupinfo(struct super_block *sb,
 				 ext4_group_t i, struct ext4_group_desc *desc);
+extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
+				ext4_fsblk_t block, unsigned long count);
 extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);

 /* inode.c */
@@ -1729,6 +1818,7 @@ extern int ext4_change_inode_journal_flag(struct inode *, int);
 extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
 extern int ext4_can_truncate(struct inode *inode);
 extern void ext4_truncate(struct inode *);
+extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length);
 extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
 extern void ext4_set_inode_flags(struct inode *);
 extern void ext4_get_inode_flags(struct ext4_inode_info *);
@@ -1738,6 +1828,8 @@ extern int ext4_writepage_trans_blocks(struct inode *);
 extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
 extern int ext4_block_truncate_page(handle_t *handle,
 		struct address_space *mapping, loff_t from);
+extern int ext4_block_zero_page_range(handle_t *handle,
+		struct address_space *mapping, loff_t from, loff_t length);
 extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 extern qsize_t *ext4_get_reserved_space(struct inode *inode);
 extern void ext4_da_update_reserve_space(struct inode *inode,
@@ -1788,6 +1880,10 @@ extern void __ext4_warning(struct super_block *, const char *, unsigned int,
 		__LINE__, ## message)
 extern void ext4_msg(struct super_block *, const char *, const char *, ...)
 	__attribute__ ((format (printf, 3, 4)));
+extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp,
+			const char *, unsigned int, const char *);
+#define dump_mmp_msg(sb, mmp, msg)	__dump_mmp_msg(sb, mmp, __func__, \
+					__LINE__, msg)
 extern void __ext4_grp_locked_error(const char *, unsigned int, \
 				    struct super_block *, ext4_group_t, \
 				    unsigned long, ext4_fsblk_t, \
@@ -2064,6 +2160,8 @@ extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
 extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 			       struct ext4_map_blocks *map, int flags);
 extern void ext4_ext_truncate(struct inode *);
+extern int ext4_ext_punch_hole(struct file *file, loff_t offset,
+				loff_t length);
 extern void ext4_ext_init(struct super_block *);
 extern void ext4_ext_release(struct super_block *);
 extern long ext4_fallocate(struct file *file, int mode, loff_t offset,
@@ -2092,6 +2190,9 @@ extern int ext4_bio_write_page(struct ext4_io_submit *io,
 			       int len,
 			       struct writeback_control *wbc);

+/* mmp.c */
+extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
+
 /* BH_Uninit flag: blocks are allocated but uninitialized on disk */
 enum ext4_state_bits {
 	BH_Uninit	/* blocks are allocated but uninitialized on disk */
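The new entry point is expected to be wired into mount; a hedged sketch of the call site (the actual hook lives in super.c, which is not shown in this section):

	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_MMP) &&
	    !(sb->s_flags & MS_RDONLY))
		if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)))
			goto failed_mount;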
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 6e272ef6ba96..f5240aa15601 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -6,20 +6,6 @@

 #include <trace/events/ext4.h>

-int __ext4_journal_get_undo_access(const char *where, unsigned int line,
-				   handle_t *handle, struct buffer_head *bh)
-{
-	int err = 0;
-
-	if (ext4_handle_valid(handle)) {
-		err = jbd2_journal_get_undo_access(handle, bh);
-		if (err)
-			ext4_journal_abort_handle(where, line, __func__, bh,
-						  handle, err);
-	}
-	return err;
-}
-
 int __ext4_journal_get_write_access(const char *where, unsigned int line,
 				    handle_t *handle, struct buffer_head *bh)
 {
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index d0f53538a57f..bb85757689b6 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -126,9 +126,6 @@ void ext4_journal_abort_handle(const char *caller, unsigned int line,
 			       const char *err_fn,
 			       struct buffer_head *bh, handle_t *handle, int err);

-int __ext4_journal_get_undo_access(const char *where, unsigned int line,
-				   handle_t *handle, struct buffer_head *bh);
-
 int __ext4_journal_get_write_access(const char *where, unsigned int line,
 				    handle_t *handle, struct buffer_head *bh);

@@ -146,8 +143,6 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
 int __ext4_handle_dirty_super(const char *where, unsigned int line,
 			      handle_t *handle, struct super_block *sb);

-#define ext4_journal_get_undo_access(handle, bh) \
-	__ext4_journal_get_undo_access(__func__, __LINE__, (handle), (bh))
 #define ext4_journal_get_write_access(handle, bh) \
 	__ext4_journal_get_write_access(__func__, __LINE__, (handle), (bh))
 #define ext4_forget(handle, is_metadata, inode, bh, block_nr) \
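With the undo-access helper and its wrapper gone (the old balloc.c ext4_add_groupblocks() above appears to have been its last user), the remaining bitmap-editing pattern is plain write access plus group-locked, idempotent bit operations. A hedged sketch:

	err = ext4_journal_get_write_access(handle, bitmap_bh);
	if (err)
		goto out;
	/* ... edit bits under ext4_lock_group() / atomic bit ops ... */
	err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);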
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 4890d6f3ad15..5199bac7fc62 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -46,6 +46,13 @@

 #include <trace/events/ext4.h>

+static int ext4_split_extent(handle_t *handle,
+				struct inode *inode,
+				struct ext4_ext_path *path,
+				struct ext4_map_blocks *map,
+				int split_flag,
+				int flags);
+
 static int ext4_ext_truncate_extend_restart(handle_t *handle,
 					    struct inode *inode,
 					    int needed)
@@ -192,12 +199,13 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
 static ext4_fsblk_t
 ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
 			struct ext4_ext_path *path,
-			struct ext4_extent *ex, int *err)
+			struct ext4_extent *ex, int *err, unsigned int flags)
 {
 	ext4_fsblk_t goal, newblock;

 	goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
-	newblock = ext4_new_meta_blocks(handle, inode, goal, NULL, err);
+	newblock = ext4_new_meta_blocks(handle, inode, goal, flags,
+					NULL, err);
 	return newblock;
 }

@@ -474,9 +482,43 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
 	}
 	ext_debug("\n");
 }
+
+static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path,
+			ext4_fsblk_t newblock, int level)
+{
+	int depth = ext_depth(inode);
+	struct ext4_extent *ex;
+
+	if (depth != level) {
+		struct ext4_extent_idx *idx;
+		idx = path[level].p_idx;
+		while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) {
+			ext_debug("%d: move %d:%llu in new index %llu\n", level,
+					le32_to_cpu(idx->ei_block),
+					ext4_idx_pblock(idx),
+					newblock);
+			idx++;
+		}
+
+		return;
+	}
+
+	ex = path[depth].p_ext;
+	while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) {
+		ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
+				le32_to_cpu(ex->ee_block),
+				ext4_ext_pblock(ex),
+				ext4_ext_is_uninitialized(ex),
+				ext4_ext_get_actual_len(ex),
+				newblock);
+		ex++;
+	}
+}
+
 #else
 #define ext4_ext_show_path(inode, path)
 #define ext4_ext_show_leaf(inode, path)
+#define ext4_ext_show_move(inode, path, newblock, level)
 #endif

 void ext4_ext_drop_refs(struct ext4_ext_path *path)
@@ -792,14 +834,14 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
  * - initializes subtree
  */
 static int ext4_ext_split(handle_t *handle, struct inode *inode,
-			  struct ext4_ext_path *path,
-			  struct ext4_extent *newext, int at)
+			  unsigned int flags,
+			  struct ext4_ext_path *path,
+			  struct ext4_extent *newext, int at)
 {
 	struct buffer_head *bh = NULL;
 	int depth = ext_depth(inode);
 	struct ext4_extent_header *neh;
 	struct ext4_extent_idx *fidx;
-	struct ext4_extent *ex;
 	int i = at, k, m, a;
 	ext4_fsblk_t newblock, oldblock;
 	__le32 border;
@@ -847,7 +889,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 	ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
 	for (a = 0; a < depth - at; a++) {
 		newblock = ext4_ext_new_meta_block(handle, inode, path,
-						   newext, &err);
+						   newext, &err, flags);
 		if (newblock == 0)
 			goto cleanup;
 		ablocks[a] = newblock;
@@ -876,7 +918,6 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 	neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
 	neh->eh_magic = EXT4_EXT_MAGIC;
 	neh->eh_depth = 0;
-	ex = EXT_FIRST_EXTENT(neh);

 	/* move remainder of path[depth] to the new leaf */
 	if (unlikely(path[depth].p_hdr->eh_entries !=
@@ -888,25 +929,12 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 		goto cleanup;
 	}
 	/* start copy from next extent */
-	/* TODO: we could do it by single memmove */
-	m = 0;
-	path[depth].p_ext++;
-	while (path[depth].p_ext <=
-			EXT_MAX_EXTENT(path[depth].p_hdr)) {
-		ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
-				le32_to_cpu(path[depth].p_ext->ee_block),
-				ext4_ext_pblock(path[depth].p_ext),
-				ext4_ext_is_uninitialized(path[depth].p_ext),
-				ext4_ext_get_actual_len(path[depth].p_ext),
-				newblock);
-		/*memmove(ex++, path[depth].p_ext++,
-			sizeof(struct ext4_extent));
-		neh->eh_entries++;*/
-		path[depth].p_ext++;
-		m++;
-	}
+	m = EXT_MAX_EXTENT(path[depth].p_hdr) - path[depth].p_ext++;
+	ext4_ext_show_move(inode, path, newblock, depth);
 	if (m) {
-		memmove(ex, path[depth].p_ext-m, sizeof(struct ext4_extent)*m);
+		struct ext4_extent *ex;
+		ex = EXT_FIRST_EXTENT(neh);
+		memmove(ex, path[depth].p_ext, sizeof(struct ext4_extent) * m);
 		le16_add_cpu(&neh->eh_entries, m);
 	}

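The counting idiom above is terse: the pointer difference counts the entries strictly after *p_ext, and the post-increment leaves the pointer at the first entry to copy. An editor's toy model of the same idiom outside the kernel:

#include <string.h>

static int move_tail(int *cur, int *last, int *dst)
{
	int m = last - cur++;	/* entries strictly after *cur */

	if (m)
		memmove(dst, cur, sizeof(*cur) * m);	/* copy them out */
	return m;
}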
@@ -968,12 +996,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,

 		ext_debug("int.index at %d (block %llu): %u -> %llu\n",
 				i, newblock, le32_to_cpu(border), oldblock);
-		/* copy indexes */
-		m = 0;
-		path[i].p_idx++;

-		ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
-				EXT_MAX_INDEX(path[i].p_hdr));
+		/* move remainder of path[i] to the new index block */
 		if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) !=
 					EXT_LAST_INDEX(path[i].p_hdr))) {
 			EXT4_ERROR_INODE(inode,
@@ -982,20 +1006,13 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 			err = -EIO;
 			goto cleanup;
 		}
-		while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) {
-			ext_debug("%d: move %d:%llu in new index %llu\n", i,
-					le32_to_cpu(path[i].p_idx->ei_block),
-					ext4_idx_pblock(path[i].p_idx),
-					newblock);
-			/*memmove(++fidx, path[i].p_idx++,
-				sizeof(struct ext4_extent_idx));
-			neh->eh_entries++;
-			BUG_ON(neh->eh_entries > neh->eh_max);*/
-			path[i].p_idx++;
-			m++;
-		}
+		/* start copy indexes */
+		m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++;
+		ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
+				EXT_MAX_INDEX(path[i].p_hdr));
+		ext4_ext_show_move(inode, path, newblock, i);
 		if (m) {
-			memmove(++fidx, path[i].p_idx - m,
+			memmove(++fidx, path[i].p_idx,
 				sizeof(struct ext4_extent_idx) * m);
 			le16_add_cpu(&neh->eh_entries, m);
 		}
@@ -1056,8 +1073,9 @@ cleanup:
  *   just created block
  */
 static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
-				 struct ext4_ext_path *path,
-				 struct ext4_extent *newext)
+				 unsigned int flags,
+				 struct ext4_ext_path *path,
+				 struct ext4_extent *newext)
 {
 	struct ext4_ext_path *curp = path;
 	struct ext4_extent_header *neh;
@@ -1065,7 +1083,8 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
 	ext4_fsblk_t newblock;
 	int err = 0;

-	newblock = ext4_ext_new_meta_block(handle, inode, path, newext, &err);
+	newblock = ext4_ext_new_meta_block(handle, inode, path,
+		newext, &err, flags);
 	if (newblock == 0)
 		return err;

@@ -1140,8 +1159,9 @@ out:
  * if no free index is found, then it requests in-depth growing.
  */
 static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
-				    struct ext4_ext_path *path,
-				    struct ext4_extent *newext)
+				    unsigned int flags,
+				    struct ext4_ext_path *path,
+				    struct ext4_extent *newext)
 {
 	struct ext4_ext_path *curp;
 	int depth, i, err = 0;
@@ -1161,7 +1181,7 @@ repeat:
 	if (EXT_HAS_FREE_INDEX(curp)) {
 		/* if we found index with free entry, then use that
 		 * entry: create all needed subtree and add new leaf */
-		err = ext4_ext_split(handle, inode, path, newext, i);
+		err = ext4_ext_split(handle, inode, flags, path, newext, i);
 		if (err)
 			goto out;

@@ -1174,7 +1194,8 @@ repeat:
 			err = PTR_ERR(path);
 	} else {
 		/* tree is full, time to grow in depth */
-		err = ext4_ext_grow_indepth(handle, inode, path, newext);
+		err = ext4_ext_grow_indepth(handle, inode, flags,
+					    path, newext);
 		if (err)
 			goto out;

@@ -1563,7 +1584,7 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
  * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
  * 1 if they got merged.
  */
-static int ext4_ext_try_to_merge(struct inode *inode,
+static int ext4_ext_try_to_merge_right(struct inode *inode,
 				 struct ext4_ext_path *path,
 				 struct ext4_extent *ex)
 {
@@ -1603,6 +1624,31 @@ static int ext4_ext_try_to_merge(struct inode *inode,
 }

 /*
+ * This function tries to merge the @ex extent to neighbours in the tree.
+ * return 1 if merge left else 0.
+ */
+static int ext4_ext_try_to_merge(struct inode *inode,
+				  struct ext4_ext_path *path,
+				  struct ext4_extent *ex) {
+	struct ext4_extent_header *eh;
+	unsigned int depth;
+	int merge_done = 0;
+	int ret = 0;
+
+	depth = ext_depth(inode);
+	BUG_ON(path[depth].p_hdr == NULL);
+	eh = path[depth].p_hdr;
+
+	if (ex > EXT_FIRST_EXTENT(eh))
+		merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1);
+
+	if (!merge_done)
+		ret = ext4_ext_try_to_merge_right(inode, path, ex);
+
+	return ret;
+}
+
+/*
  * check if a portion of the "newext" extent overlaps with an
  * existing extent.
  *
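A hedged usage sketch for the wrapper above, mirroring how ext4_split_extent_at() later in this patch calls it: after an in-place change to *ex, try to coalesce; merging with the left neighbour is attempted first, merging right is the fallback.

	ext4_ext_try_to_merge(inode, path, ex);
	err = ext4_ext_dirty(handle, inode, path + depth);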
@@ -1668,6 +1714,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
 	int depth, len, err;
 	ext4_lblk_t next;
 	unsigned uninitialized = 0;
+	int flags = 0;

 	if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
 		EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
@@ -1742,7 +1789,9 @@ repeat:
 	 * There is no free space in the found leaf.
 	 * We're gonna add a new leaf in the tree.
 	 */
-	err = ext4_ext_create_new_leaf(handle, inode, path, newext);
+	if (flag & EXT4_GET_BLOCKS_PUNCH_OUT_EXT)
+		flags = EXT4_MB_USE_ROOT_BLOCKS;
+	err = ext4_ext_create_new_leaf(handle, inode, flags, path, newext);
 	if (err)
 		goto cleanup;
 	depth = ext_depth(inode);
@@ -2003,13 +2052,25 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
 }

 /*
+ * ext4_ext_check_cache()
+ * Checks to see if the given block is in the cache.
+ * If it is, the cached extent is stored in the given
+ * cache extent pointer.  If the cached extent is a hole,
+ * this routine should be used instead of
+ * ext4_ext_in_cache if the calling function needs to
+ * know the size of the hole.
+ *
+ * @inode: The file's inode
+ * @block: The block to look for in the cache
+ * @ex:    Pointer where the cached extent will be stored
+ *         if it contains block
+ *
  * Return 0 if cache is invalid; 1 if the cache is valid
  */
-static int
-ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
-			struct ext4_extent *ex)
-{
+static int ext4_ext_check_cache(struct inode *inode, ext4_lblk_t block,
+	struct ext4_ext_cache *ex){
 	struct ext4_ext_cache *cex;
+	struct ext4_sb_info *sbi;
 	int ret = 0;

 	/*
@@ -2017,26 +2078,60 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
 	 */
 	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
 	cex = &EXT4_I(inode)->i_cached_extent;
+	sbi = EXT4_SB(inode->i_sb);

 	/* has cache valid data? */
 	if (cex->ec_len == 0)
 		goto errout;

 	if (in_range(block, cex->ec_block, cex->ec_len)) {
-		ex->ee_block = cpu_to_le32(cex->ec_block);
-		ext4_ext_store_pblock(ex, cex->ec_start);
-		ex->ee_len = cpu_to_le16(cex->ec_len);
+		memcpy(ex, cex, sizeof(struct ext4_ext_cache));
 		ext_debug("%u cached by %u:%u:%llu\n",
 				block,
 				cex->ec_block, cex->ec_len, cex->ec_start);
 		ret = 1;
 	}
 errout:
+	if (!ret)
+		sbi->extent_cache_misses++;
+	else
+		sbi->extent_cache_hits++;
 	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
 	return ret;
2038 2102
2039/* 2103/*
2104 * ext4_ext_in_cache()
2105 * Checks to see if the given block is in the cache.
2106 * If it is, the cached extent is stored in the given
2107 * extent pointer.
2108 *
2109 * @inode: The files inode
2110 * @block: The block to look for in the cache
2111 * @ex: Pointer where the cached extent will be stored
2112 * if it contains block
2113 *
2114 * Return 0 if cache is invalid; 1 if the cache is valid
2115 */
2116static int
2117ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
2118 struct ext4_extent *ex)
2119{
2120 struct ext4_ext_cache cex;
2121 int ret = 0;
2122
2123 if (ext4_ext_check_cache(inode, block, &cex)) {
2124 ex->ee_block = cpu_to_le32(cex.ec_block);
2125 ext4_ext_store_pblock(ex, cex.ec_start);
2126 ex->ee_len = cpu_to_le16(cex.ec_len);
2127 ret = 1;
2128 }
2129
2130 return ret;
2131}
2132
2133
2134/*
2040 * ext4_ext_rm_idx: 2135 * ext4_ext_rm_idx:
2041 * removes index from the index block. 2136 * removes index from the index block.
2042 * It's used in truncate case only, thus all requests are for 2137 * It's used in truncate case only, thus all requests are for
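A hedged sketch of the hole-aware call pattern the split enables (assuming, as elsewhere in ext4 of this vintage, that a cached hole carries ec_start == 0):

	struct ext4_ext_cache cex;

	if (ext4_ext_check_cache(inode, lblk, &cex) && cex.ec_start == 0) {
		/* cached hole: lblk .. cex.ec_block + cex.ec_len - 1
		 * is unmapped, and its size is now visible to the caller */
	}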
@@ -2163,8 +2258,16 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
 		ext4_free_blocks(handle, inode, NULL, start, num, flags);
 	} else if (from == le32_to_cpu(ex->ee_block)
 		   && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
-		printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n",
-			from, to, le32_to_cpu(ex->ee_block), ee_len);
+		/* head removal */
+		ext4_lblk_t num;
+		ext4_fsblk_t start;
+
+		num = to - from;
+		start = ext4_ext_pblock(ex);
+
+		ext_debug("free first %u blocks starting %llu\n", num, start);
+		ext4_free_blocks(handle, inode, 0, start, num, flags);
+
 	} else {
 		printk(KERN_INFO "strange request: removal(2) "
 			"%u-%u from %u:%u\n",
@@ -2173,9 +2276,22 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
 	return 0;
 }

+
+/*
+ * ext4_ext_rm_leaf() Removes the extents associated with the
+ * blocks appearing between "start" and "end", and splits the extents
+ * if "start" and "end" appear in the same extent
+ *
+ * @handle: The journal handle
+ * @inode:  The file's inode
+ * @path:   The path to the leaf
+ * @start:  The first block to remove
+ * @end:    The last block to remove
+ */
 static int
 ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
-		struct ext4_ext_path *path, ext4_lblk_t start)
+		struct ext4_ext_path *path, ext4_lblk_t start,
+		ext4_lblk_t end)
 {
 	int err = 0, correct_index = 0;
 	int depth = ext_depth(inode), credits;
@@ -2186,6 +2302,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 	unsigned short ex_ee_len;
 	unsigned uninitialized = 0;
 	struct ext4_extent *ex;
+	struct ext4_map_blocks map;

 	/* the header must be checked already in ext4_ext_remove_space() */
 	ext_debug("truncate since %u in leaf\n", start);
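The new @end parameter serves two callers; a hedged sketch of both patterns (caller names and variables illustrative):

	/* truncate: remove everything from start to the end of the tree */
	err = ext4_ext_rm_leaf(handle, inode, path, start, EXT_MAX_BLOCK);

	/* punch hole: remove only the bounded range first..last */
	err = ext4_ext_rm_leaf(handle, inode, path, first, last);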
@@ -2215,31 +2332,95 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 		path[depth].p_ext = ex;

 		a = ex_ee_block > start ? ex_ee_block : start;
-		b = ex_ee_block + ex_ee_len - 1 < EXT_MAX_BLOCK ?
-			ex_ee_block + ex_ee_len - 1 : EXT_MAX_BLOCK;
+		b = ex_ee_block+ex_ee_len - 1 < end ?
+			ex_ee_block+ex_ee_len - 1 : end;

 		ext_debug("  border %u:%u\n", a, b);

-		if (a != ex_ee_block && b != ex_ee_block + ex_ee_len - 1) {
-			block = 0;
-			num = 0;
-			BUG();
+		/* If this extent is beyond the end of the hole, skip it */
+		if (end <= ex_ee_block) {
+			ex--;
+			ex_ee_block = le32_to_cpu(ex->ee_block);
+			ex_ee_len = ext4_ext_get_actual_len(ex);
+			continue;
+		} else if (a != ex_ee_block &&
+			b != ex_ee_block + ex_ee_len - 1) {
+			/*
+			 * If this is a truncate, then this condition should
+			 * never happen because at least one of the end points
+			 * needs to be on the edge of the extent.
+			 */
+			if (end == EXT_MAX_BLOCK) {
+				ext_debug("  bad truncate %u:%u\n",
+						start, end);
+				block = 0;
+				num = 0;
+				err = -EIO;
+				goto out;
+			}
+			/*
+			 * else this is a hole punch, so the extent needs to
+			 * be split since neither edge of the hole is on the
+			 * extent edge
+			 */
+			else {
+				map.m_pblk = ext4_ext_pblock(ex);
+				map.m_lblk = ex_ee_block;
+				map.m_len = b - ex_ee_block;
+
+				err = ext4_split_extent(handle,
+					inode, path, &map, 0,
+					EXT4_GET_BLOCKS_PUNCH_OUT_EXT |
+					EXT4_GET_BLOCKS_PRE_IO);
+
+				if (err < 0)
+					goto out;
+
+				ex_ee_len = ext4_ext_get_actual_len(ex);
+
+				b = ex_ee_block+ex_ee_len - 1 < end ?
+					ex_ee_block+ex_ee_len - 1 : end;
+
+				/* Then remove tail of this extent */
+				block = ex_ee_block;
+				num = a - block;
+			}
 		} else if (a != ex_ee_block) {
 			/* remove tail of the extent */
 			block = ex_ee_block;
 			num = a - block;
 		} else if (b != ex_ee_block + ex_ee_len - 1) {
 			/* remove head of the extent */
-			block = a;
-			num = b - a;
-			/* there is no "make a hole" API yet */
-			BUG();
+			block = b;
+			num = ex_ee_block + ex_ee_len - b;
+
+			/*
+			 * If this is a truncate, this condition
+			 * should never happen
+			 */
+			if (end == EXT_MAX_BLOCK) {
+				ext_debug("  bad truncate %u:%u\n",
+					start, end);
+				err = -EIO;
+				goto out;
+			}
 		} else {
 			/* remove whole extent: excellent! */
 			block = ex_ee_block;
 			num = 0;
-			BUG_ON(a != ex_ee_block);
-			BUG_ON(b != ex_ee_block + ex_ee_len - 1);
+			if (a != ex_ee_block) {
+				ext_debug("  bad truncate %u:%u\n",
+					start, end);
+				err = -EIO;
+				goto out;
+			}
+
+			if (b != ex_ee_block + ex_ee_len - 1) {
+				ext_debug("  bad truncate %u:%u\n",
+					start, end);
+				err = -EIO;
+				goto out;
+			}
 		}

 		/*
@@ -2270,7 +2451,13 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 		if (num == 0) {
 			/* this extent is removed; mark slot entirely unused */
 			ext4_ext_store_pblock(ex, 0);
-			le16_add_cpu(&eh->eh_entries, -1);
+		} else if (block != ex_ee_block) {
+			/*
+			 * If this was a head removal, then we need to update
+			 * the physical block since it is now at a different
+			 * location
+			 */
+			ext4_ext_store_pblock(ex, ext4_ext_pblock(ex) + (b-a));
 		}

 		ex->ee_block = cpu_to_le32(block);
@@ -2286,6 +2473,27 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 		if (err)
 			goto out;

+		/*
+		 * If the extent was completely released,
+		 * we need to remove it from the leaf
+		 */
+		if (num == 0) {
+			if (end != EXT_MAX_BLOCK) {
+				/*
+				 * For hole punching, we need to scoot all the
+				 * extents up when an extent is removed so that
+				 * we don't have blank extents in the middle
+				 */
+				memmove(ex, ex+1, (EXT_LAST_EXTENT(eh) - ex) *
+					sizeof(struct ext4_extent));
+
+				/* Now get rid of the one at the end */
+				memset(EXT_LAST_EXTENT(eh), 0,
+					sizeof(struct ext4_extent));
+			}
+			le16_add_cpu(&eh->eh_entries, -1);
+		}
+
 		ext_debug("new extent: %u:%u:%llu\n", block, num,
 				ext4_ext_pblock(ex));
 		ex--;
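Following the head-removal arithmetic above literally (editor's worked example, values illustrative):

	/*
	 * Extent (lblk 100, len 8, pblk 500); hole ends at end = 102, so
	 * a = 100, b = 102.  Then block = b = 102, num = 100 + 8 - 102 = 6,
	 * and the survivor is re-based to pblk 500 + (b - a) = 502.
	 */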
@@ -2326,7 +2534,8 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path)
 	return 1;
 }

-static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
+static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
+				ext4_lblk_t end)
 {
 	struct super_block *sb = inode->i_sb;
 	int depth = ext_depth(inode);
@@ -2365,7 +2574,8 @@ again:
 	while (i >= 0 && err == 0) {
 		if (i == depth) {
 			/* this is leaf block */
-			err = ext4_ext_rm_leaf(handle, inode, path, start);
+			err = ext4_ext_rm_leaf(handle, inode, path,
+					start, end);
 			/* root level has p_bh == NULL, brelse() eats this */
 			brelse(path[i].p_bh);
 			path[i].p_bh = NULL;
@@ -2529,6 +2739,195 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
 	return ret;
 }

2742/*
2743 * used by extent splitting.
2744 */
2745#define EXT4_EXT_MAY_ZEROOUT 0x1 /* safe to zeroout if split fails \
2746 due to ENOSPC */
2747#define EXT4_EXT_MARK_UNINIT1 0x2 /* mark first half uninitialized */
2748#define EXT4_EXT_MARK_UNINIT2 0x4 /* mark second half uninitialized */
2749
2750/*
2751 * ext4_split_extent_at() splits an extent at given block.
2752 *
2753 * @handle: the journal handle
2754 * @inode: the file inode
2755 * @path: the path to the extent
2756 * @split: the logical block where the extent is splitted.
2757 * @split_flags: indicates if the extent could be zeroout if split fails, and
2758 * the states(init or uninit) of new extents.
2759 * @flags: flags used to insert new extent to extent tree.
2760 *
2761 *
2762 * Splits extent [a, b] into two extents [a, @split) and [@split, b], states
2763 * of which are deterimined by split_flag.
2764 *
2765 * There are two cases:
2766 * a> the extent are splitted into two extent.
2767 * b> split is not needed, and just mark the extent.
2768 *
2769 * return 0 on success.
2770 */
2771static int ext4_split_extent_at(handle_t *handle,
2772 struct inode *inode,
2773 struct ext4_ext_path *path,
2774 ext4_lblk_t split,
2775 int split_flag,
2776 int flags)
2777{
2778 ext4_fsblk_t newblock;
2779 ext4_lblk_t ee_block;
2780 struct ext4_extent *ex, newex, orig_ex;
2781 struct ext4_extent *ex2 = NULL;
2782 unsigned int ee_len, depth;
2783 int err = 0;
2784
2785 ext_debug("ext4_split_extent_at: inode %lu, logical"
2786 "block %llu\n", inode->i_ino, (unsigned long long)split);
2787
2788 ext4_ext_show_leaf(inode, path);
2789
2790 depth = ext_depth(inode);
2791 ex = path[depth].p_ext;
2792 ee_block = le32_to_cpu(ex->ee_block);
2793 ee_len = ext4_ext_get_actual_len(ex);
2794 newblock = split - ee_block + ext4_ext_pblock(ex);
2795
2796 BUG_ON(split < ee_block || split >= (ee_block + ee_len));
2797
2798 err = ext4_ext_get_access(handle, inode, path + depth);
2799 if (err)
2800 goto out;
2801
2802 if (split == ee_block) {
2803 /*
2804 * case b: block @split is the block that the extent begins with
2805 * then we just change the state of the extent, and splitting
2806 * is not needed.
2807 */
2808 if (split_flag & EXT4_EXT_MARK_UNINIT2)
2809 ext4_ext_mark_uninitialized(ex);
2810 else
2811 ext4_ext_mark_initialized(ex);
2812
2813 if (!(flags & EXT4_GET_BLOCKS_PRE_IO))
2814 ext4_ext_try_to_merge(inode, path, ex);
2815
2816 err = ext4_ext_dirty(handle, inode, path + depth);
2817 goto out;
2818 }
2819
2820 /* case a */
2821 memcpy(&orig_ex, ex, sizeof(orig_ex));
2822 ex->ee_len = cpu_to_le16(split - ee_block);
2823 if (split_flag & EXT4_EXT_MARK_UNINIT1)
2824 ext4_ext_mark_uninitialized(ex);
2825
2826 /*
2827 * path may lead to new leaf, not to original leaf any more
2828 * after ext4_ext_insert_extent() returns,
2829 */
2830 err = ext4_ext_dirty(handle, inode, path + depth);
2831 if (err)
2832 goto fix_extent_len;
2833
2834 ex2 = &newex;
2835 ex2->ee_block = cpu_to_le32(split);
2836 ex2->ee_len = cpu_to_le16(ee_len - (split - ee_block));
2837 ext4_ext_store_pblock(ex2, newblock);
2838 if (split_flag & EXT4_EXT_MARK_UNINIT2)
2839 ext4_ext_mark_uninitialized(ex2);
2840
2841 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
2842 if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
2843 err = ext4_ext_zeroout(inode, &orig_ex);
2844 if (err)
2845 goto fix_extent_len;
2846 /* update the extent length and mark as initialized */
2847 ex->ee_len = cpu_to_le16(ee_len);
2848 ext4_ext_try_to_merge(inode, path, ex);
2849 err = ext4_ext_dirty(handle, inode, path + depth);
2850 goto out;
2851 } else if (err)
2852 goto fix_extent_len;
2853
2854out:
2855 ext4_ext_show_leaf(inode, path);
2856 return err;
2857
2858fix_extent_len:
2859 ex->ee_len = orig_ex.ee_len;
2860 ext4_ext_dirty(handle, inode, path + depth);
2861 return err;
2862}
2863
2864/*
2865 * ext4_split_extent() splits an extent and marks the extent which is covered
2866 * by @map as split_flag indicates
2867 *
2868 * It may result in splitting the extent into multiple extents (up to three)
2869 * There are three possibilities:
2870 * a> There is no split required
2871 * b> Splits into two extents: Split is happening at either end of the extent
2872 * c> Splits into three extents: Someone is splitting in the middle of the extent
2873 *
2874 */
2875static int ext4_split_extent(handle_t *handle,
2876 struct inode *inode,
2877 struct ext4_ext_path *path,
2878 struct ext4_map_blocks *map,
2879 int split_flag,
2880 int flags)
2881{
2882 ext4_lblk_t ee_block;
2883 struct ext4_extent *ex;
2884 unsigned int ee_len, depth;
2885 int err = 0;
2886 int uninitialized;
2887 int split_flag1, flags1;
2888
2889 depth = ext_depth(inode);
2890 ex = path[depth].p_ext;
2891 ee_block = le32_to_cpu(ex->ee_block);
2892 ee_len = ext4_ext_get_actual_len(ex);
2893 uninitialized = ext4_ext_is_uninitialized(ex);
2894
2895 if (map->m_lblk + map->m_len < ee_block + ee_len) {
2896 split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ?
2897 EXT4_EXT_MAY_ZEROOUT : 0;
2898 flags1 = flags | EXT4_GET_BLOCKS_PRE_IO;
2899 if (uninitialized)
2900 split_flag1 |= EXT4_EXT_MARK_UNINIT1 |
2901 EXT4_EXT_MARK_UNINIT2;
2902 err = ext4_split_extent_at(handle, inode, path,
2903 map->m_lblk + map->m_len, split_flag1, flags1);
2904 if (err)
2905 goto out;
2906 }
2907
2908 ext4_ext_drop_refs(path);
2909 path = ext4_ext_find_extent(inode, map->m_lblk, path);
2910 if (IS_ERR(path))
2911 return PTR_ERR(path);
2912
2913 if (map->m_lblk >= ee_block) {
2914 split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ?
2915 EXT4_EXT_MAY_ZEROOUT : 0;
2916 if (uninitialized)
2917 split_flag1 |= EXT4_EXT_MARK_UNINIT1;
2918 if (split_flag & EXT4_EXT_MARK_UNINIT2)
2919 split_flag1 |= EXT4_EXT_MARK_UNINIT2;
2920 err = ext4_split_extent_at(handle, inode, path,
2921 map->m_lblk, split_flag1, flags);
2922 if (err)
2923 goto out;
2924 }
2925
2926 ext4_ext_show_leaf(inode, path);
2927out:
2928 return err ? err : map->m_len;
2929}
2930
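A hedged usage sketch for ext4_split_extent_at() above: carving an initialized tail out of an uninitialized extent at logical block split, preferring a zeroout over failure if the tree split hits ENOSPC. The variable names are illustrative.

	err = ext4_split_extent_at(handle, inode, path, split,
				   EXT4_EXT_MARK_UNINIT1 |
				   EXT4_EXT_MAY_ZEROOUT, 0);
	if (err < 0)
		goto out;	/* [a, split) stays uninit, [split, b] is live */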
 #define EXT4_EXT_ZERO_LEN 7
 /*
  * This function is called by ext4_ext_map_blocks() if someone tries to write
@@ -2545,17 +2944,13 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 					   struct ext4_map_blocks *map,
 					   struct ext4_ext_path *path)
 {
-	struct ext4_extent *ex, newex, orig_ex;
-	struct ext4_extent *ex1 = NULL;
-	struct ext4_extent *ex2 = NULL;
-	struct ext4_extent *ex3 = NULL;
-	struct ext4_extent_header *eh;
+	struct ext4_map_blocks split_map;
+	struct ext4_extent zero_ex;
+	struct ext4_extent *ex;
 	ext4_lblk_t ee_block, eof_block;
 	unsigned int allocated, ee_len, depth;
-	ext4_fsblk_t newblock;
 	int err = 0;
-	int ret = 0;
-	int may_zeroout;
+	int split_flag = 0;

 	ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical"
 		"block %llu, max_blocks %u\n", inode->i_ino,
@@ -2567,280 +2962,86 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2567 eof_block = map->m_lblk + map->m_len; 2962 eof_block = map->m_lblk + map->m_len;
2568 2963
2569 depth = ext_depth(inode); 2964 depth = ext_depth(inode);
2570 eh = path[depth].p_hdr;
2571 ex = path[depth].p_ext; 2965 ex = path[depth].p_ext;
2572 ee_block = le32_to_cpu(ex->ee_block); 2966 ee_block = le32_to_cpu(ex->ee_block);
2573 ee_len = ext4_ext_get_actual_len(ex); 2967 ee_len = ext4_ext_get_actual_len(ex);
2574 allocated = ee_len - (map->m_lblk - ee_block); 2968 allocated = ee_len - (map->m_lblk - ee_block);
2575 newblock = map->m_lblk - ee_block + ext4_ext_pblock(ex);
2576
2577 ex2 = ex;
2578 orig_ex.ee_block = ex->ee_block;
2579 orig_ex.ee_len = cpu_to_le16(ee_len);
2580 ext4_ext_store_pblock(&orig_ex, ext4_ext_pblock(ex));
2581 2969
2970 WARN_ON(map->m_lblk < ee_block);
2582 /* 2971 /*
2583 * It is safe to convert extent to initialized via explicit 2972 * It is safe to convert extent to initialized via explicit
2584 * zeroout only if extent is fully inside i_size or new_size. 2973 * zeroout only if extent is fully inside i_size or new_size.
2585 */ 2974 */
2586 may_zeroout = ee_block + ee_len <= eof_block; 2975 split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
2587 2976
2588 err = ext4_ext_get_access(handle, inode, path + depth);
2589 if (err)
2590 goto out;
2591 /* If extent has less than 2*EXT4_EXT_ZERO_LEN zeroout directly */ 2977 /* If extent has less than 2*EXT4_EXT_ZERO_LEN zeroout directly */
2592 if (ee_len <= 2*EXT4_EXT_ZERO_LEN && may_zeroout) { 2978 if (ee_len <= 2*EXT4_EXT_ZERO_LEN &&
2593 err = ext4_ext_zeroout(inode, &orig_ex); 2979 (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
2980 err = ext4_ext_zeroout(inode, ex);
2594 if (err) 2981 if (err)
2595 goto fix_extent_len;
2596 /* update the extent length and mark as initialized */
2597 ex->ee_block = orig_ex.ee_block;
2598 ex->ee_len = orig_ex.ee_len;
2599 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
2600 ext4_ext_dirty(handle, inode, path + depth);
2601 /* zeroed the full extent */
2602 return allocated;
2603 }
2604
2605 /* ex1: ee_block to map->m_lblk - 1 : uninitialized */
2606 if (map->m_lblk > ee_block) {
2607 ex1 = ex;
2608 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
2609 ext4_ext_mark_uninitialized(ex1);
2610 ex2 = &newex;
2611 }
2612 /*
2613 * for sanity, update the length of the ex2 extent before
2614 * we insert ex3, if ex1 is NULL. This is to avoid temporary
2615 * overlap of blocks.
2616 */
2617 if (!ex1 && allocated > map->m_len)
2618 ex2->ee_len = cpu_to_le16(map->m_len);
2619 /* ex3: to ee_block + ee_len : uninitialised */
2620 if (allocated > map->m_len) {
2621 unsigned int newdepth;
2622 /* If extent has less than EXT4_EXT_ZERO_LEN zeroout directly */
2623 if (allocated <= EXT4_EXT_ZERO_LEN && may_zeroout) {
2624 /*
2625 * map->m_lblk == ee_block is handled by the zeroout
2626 * at the beginning.
2627 * Mark first half uninitialized.
2628 * Mark second half initialized and zero out the
2629 * initialized extent
2630 */
2631 ex->ee_block = orig_ex.ee_block;
2632 ex->ee_len = cpu_to_le16(ee_len - allocated);
2633 ext4_ext_mark_uninitialized(ex);
2634 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
2635 ext4_ext_dirty(handle, inode, path + depth);
2636
2637 ex3 = &newex;
2638 ex3->ee_block = cpu_to_le32(map->m_lblk);
2639 ext4_ext_store_pblock(ex3, newblock);
2640 ex3->ee_len = cpu_to_le16(allocated);
2641 err = ext4_ext_insert_extent(handle, inode, path,
2642 ex3, 0);
2643 if (err == -ENOSPC) {
2644 err = ext4_ext_zeroout(inode, &orig_ex);
2645 if (err)
2646 goto fix_extent_len;
2647 ex->ee_block = orig_ex.ee_block;
2648 ex->ee_len = orig_ex.ee_len;
2649 ext4_ext_store_pblock(ex,
2650 ext4_ext_pblock(&orig_ex));
2651 ext4_ext_dirty(handle, inode, path + depth);
2652 /* blocks available from map->m_lblk */
2653 return allocated;
2654
2655 } else if (err)
2656 goto fix_extent_len;
2657
2658 /*
2659 * We need to zero out the second half because
2660 * a fallocate request can update file size and
2661 * converting the second half to initialized extent
2662 * implies that we can leak some junk data to user
2663 * space.
2664 */
2665 err = ext4_ext_zeroout(inode, ex3);
2666 if (err) {
2667 /*
2668 * We should actually mark the
2669 * second half as uninit and return error
2670 * Insert would have changed the extent
2671 */
2672 depth = ext_depth(inode);
2673 ext4_ext_drop_refs(path);
2674 path = ext4_ext_find_extent(inode, map->m_lblk,
2675 path);
2676 if (IS_ERR(path)) {
2677 err = PTR_ERR(path);
2678 return err;
2679 }
2680 /* get the second half extent details */
2681 ex = path[depth].p_ext;
2682 err = ext4_ext_get_access(handle, inode,
2683 path + depth);
2684 if (err)
2685 return err;
2686 ext4_ext_mark_uninitialized(ex);
2687 ext4_ext_dirty(handle, inode, path + depth);
2688 return err;
2689 }
2690
2691 /* zeroed the second half */
2692 return allocated;
2693 }
2694 ex3 = &newex;
2695 ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len);
2696 ext4_ext_store_pblock(ex3, newblock + map->m_len);
2697 ex3->ee_len = cpu_to_le16(allocated - map->m_len);
2698 ext4_ext_mark_uninitialized(ex3);
2699 err = ext4_ext_insert_extent(handle, inode, path, ex3, 0);
2700 if (err == -ENOSPC && may_zeroout) {
2701 err = ext4_ext_zeroout(inode, &orig_ex);
2702 if (err)
2703 goto fix_extent_len;
2704 /* update the extent length and mark as initialized */
2705 ex->ee_block = orig_ex.ee_block;
2706 ex->ee_len = orig_ex.ee_len;
2707 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
2708 ext4_ext_dirty(handle, inode, path + depth);
2709 /* zeroed the full extent */
2710 /* blocks available from map->m_lblk */
2711 return allocated;
2712
2713 } else if (err)
2714 goto fix_extent_len;
2715 /*
2716 * The depth, and hence eh & ex might change
2717 * as part of the insert above.
2718 */
2719 newdepth = ext_depth(inode);
2720 /*
2721 * update the extent length after successful insert of the
2722 * split extent
2723 */
2724 ee_len -= ext4_ext_get_actual_len(ex3);
2725 orig_ex.ee_len = cpu_to_le16(ee_len);
2726 may_zeroout = ee_block + ee_len <= eof_block;
2727
2728 depth = newdepth;
2729 ext4_ext_drop_refs(path);
2730 path = ext4_ext_find_extent(inode, map->m_lblk, path);
2731 if (IS_ERR(path)) {
2732 err = PTR_ERR(path);
2733 goto out; 2982 goto out;
2734 }
2735 eh = path[depth].p_hdr;
2736 ex = path[depth].p_ext;
2737 if (ex2 != &newex)
2738 ex2 = ex;
2739 2983
2740 err = ext4_ext_get_access(handle, inode, path + depth); 2984 err = ext4_ext_get_access(handle, inode, path + depth);
2741 if (err) 2985 if (err)
2742 goto out; 2986 goto out;
2743 2987 ext4_ext_mark_initialized(ex);
2744 allocated = map->m_len; 2988 ext4_ext_try_to_merge(inode, path, ex);
2745 2989 err = ext4_ext_dirty(handle, inode, path + depth);
2746 /* If extent has less than EXT4_EXT_ZERO_LEN and we are trying 2990 goto out;
2747 * to insert an extent in the middle, zeroout directly
2748 * otherwise give the extent a chance to merge to the left
2749 */
2750 if (le16_to_cpu(orig_ex.ee_len) <= EXT4_EXT_ZERO_LEN &&
2751 map->m_lblk != ee_block && may_zeroout) {
2752 err = ext4_ext_zeroout(inode, &orig_ex);
2753 if (err)
2754 goto fix_extent_len;
2755 /* update the extent length and mark as initialized */
2756 ex->ee_block = orig_ex.ee_block;
2757 ex->ee_len = orig_ex.ee_len;
2758 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
2759 ext4_ext_dirty(handle, inode, path + depth);
2760 /* zero out the first half */
2761 /* blocks available from map->m_lblk */
2762 return allocated;
2763 }
2764 }
2765 /*
2766 * If there was a change of depth as part of the
2767 * insertion of ex3 above, we need to update the length
2768 * of the ex1 extent again here
2769 */
2770 if (ex1 && ex1 != ex) {
2771 ex1 = ex;
2772 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
2773 ext4_ext_mark_uninitialized(ex1);
2774 ex2 = &newex;
2775 }
2776 /* ex2: map->m_lblk to map->m_lblk + maxblocks-1 : initialised */
2777 ex2->ee_block = cpu_to_le32(map->m_lblk);
2778 ext4_ext_store_pblock(ex2, newblock);
2779 ex2->ee_len = cpu_to_le16(allocated);
2780 if (ex2 != ex)
2781 goto insert;
2782 /*
2783 * New (initialized) extent starts from the first block
2784 * in the current extent. i.e., ex2 == ex
2785 * We have to see if it can be merged with the extent
2786 * on the left.
2787 */
2788 if (ex2 > EXT_FIRST_EXTENT(eh)) {
2789 /*
2790 * To merge left, pass "ex2 - 1" to try_to_merge(),
2791 * since it merges towards right _only_.
2792 */
2793 ret = ext4_ext_try_to_merge(inode, path, ex2 - 1);
2794 if (ret) {
2795 err = ext4_ext_correct_indexes(handle, inode, path);
2796 if (err)
2797 goto out;
2798 depth = ext_depth(inode);
2799 ex2--;
2800 }
2801 } 2991 }
2992
2802 /* 2993 /*
2803 * Try to Merge towards right. This might be required 2994 * four cases:
2804 * only when the whole extent is being written to. 2995 * 1. split the extent into three extents.
2805 * i.e. ex2 == ex and ex3 == NULL. 2996 * 2. split the extent into two extents, zeroout the first half.
2997 * 3. split the extent into two extents, zeroout the second half.
2998 * 4. split the extent into two extents without zeroout.
2806 */ 2999 */
2807 if (!ex3) { 3000 split_map.m_lblk = map->m_lblk;
2808 ret = ext4_ext_try_to_merge(inode, path, ex2); 3001 split_map.m_len = map->m_len;
2809 if (ret) { 3002
2810 err = ext4_ext_correct_indexes(handle, inode, path); 3003 if (allocated > map->m_len) {
3004 if (allocated <= EXT4_EXT_ZERO_LEN &&
3005 (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
3006 /* case 3 */
3007 zero_ex.ee_block =
3008 cpu_to_le32(map->m_lblk);
3009 zero_ex.ee_len = cpu_to_le16(allocated);
3010 ext4_ext_store_pblock(&zero_ex,
3011 ext4_ext_pblock(ex) + map->m_lblk - ee_block);
3012 err = ext4_ext_zeroout(inode, &zero_ex);
2811 if (err) 3013 if (err)
2812 goto out; 3014 goto out;
3015 split_map.m_lblk = map->m_lblk;
3016 split_map.m_len = allocated;
3017 } else if ((map->m_lblk - ee_block + map->m_len <
3018 EXT4_EXT_ZERO_LEN) &&
3019 (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
3020 /* case 2 */
3021 if (map->m_lblk != ee_block) {
3022 zero_ex.ee_block = ex->ee_block;
3023 zero_ex.ee_len = cpu_to_le16(map->m_lblk -
3024 ee_block);
3025 ext4_ext_store_pblock(&zero_ex,
3026 ext4_ext_pblock(ex));
3027 err = ext4_ext_zeroout(inode, &zero_ex);
3028 if (err)
3029 goto out;
3030 }
3031
3032 split_map.m_lblk = ee_block;
3033 split_map.m_len = map->m_lblk - ee_block + map->m_len;
3034 allocated = map->m_len;
2813 } 3035 }
2814 } 3036 }
2815 /* Mark modified extent as dirty */ 3037
2816 err = ext4_ext_dirty(handle, inode, path + depth); 3038 allocated = ext4_split_extent(handle, inode, path,
2817 goto out; 3039 &split_map, split_flag, 0);
2818insert: 3040 if (allocated < 0)
2819 err = ext4_ext_insert_extent(handle, inode, path, &newex, 0); 3041 err = allocated;
2820 if (err == -ENOSPC && may_zeroout) { 3042
2821 err = ext4_ext_zeroout(inode, &orig_ex);
2822 if (err)
2823 goto fix_extent_len;
2824 /* update the extent length and mark as initialized */
2825 ex->ee_block = orig_ex.ee_block;
2826 ex->ee_len = orig_ex.ee_len;
2827 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
2828 ext4_ext_dirty(handle, inode, path + depth);
2829 /* zero out the first half */
2830 return allocated;
2831 } else if (err)
2832 goto fix_extent_len;
2833out: 3043out:
2834 ext4_ext_show_leaf(inode, path);
2835 return err ? err : allocated; 3044 return err ? err : allocated;
2836
2837fix_extent_len:
2838 ex->ee_block = orig_ex.ee_block;
2839 ex->ee_len = orig_ex.ee_len;
2840 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
2841 ext4_ext_mark_uninitialized(ex);
2842 ext4_ext_dirty(handle, inode, path + depth);
2843 return err;
2844} 3045}
2845 3046
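The rewrite collapses the old ex1/ex2/ex3 bookkeeping into the four cases listed in the comment, plus the small-extent fast path. A hedged standalone model of the strategy selection follows; ZERO_LEN mirrors EXT4_EXT_ZERO_LEN, the function name and parameters are hypothetical, and the full-extent zeroout fast path is the separate check already shown in the kernel code above:

#include <stdio.h>

#define ZERO_LEN 7 /* mirrors EXT4_EXT_ZERO_LEN */

/* Hypothetical model of the strategy choice in
 * ext4_ext_convert_to_initialized(): m_off is the write's offset into
 * the extent, allocated = ee_len - m_off, m_len is the write length
 * (m_len < allocated when an uninitialized tail remains). */
static const char *convert_strategy(unsigned allocated, unsigned m_len,
                                    unsigned m_off, int may_zeroout)
{
        if (allocated > m_len && allocated <= ZERO_LEN && may_zeroout)
                return "case 3: zero out the tail, then split head|rest";
        if (allocated > m_len && m_off + m_len < ZERO_LEN && may_zeroout)
                return "case 2: zero out the head, then split head+middle|tail";
        if (m_off > 0 && allocated > m_len)
                return "case 1: split into three extents";
        return "case 4: split into two extents without zeroout";
}

int main(void)
{
        printf("%s\n", convert_strategy(5, 2, 30, 1));  /* short tail   */
        printf("%s\n", convert_strategy(40, 3, 2, 1));  /* short head   */
        printf("%s\n", convert_strategy(40, 3, 20, 1)); /* middle write */
        printf("%s\n", convert_strategy(40, 3, 0, 0));  /* head write   */
        return 0;
}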
2846/* 3047/*
@@ -2871,15 +3072,11 @@ static int ext4_split_unwritten_extents(handle_t *handle,
2871 struct ext4_ext_path *path, 3072 struct ext4_ext_path *path,
2872 int flags) 3073 int flags)
2873{ 3074{
2874 struct ext4_extent *ex, newex, orig_ex; 3075 ext4_lblk_t eof_block;
2875 struct ext4_extent *ex1 = NULL; 3076 ext4_lblk_t ee_block;
2876 struct ext4_extent *ex2 = NULL; 3077 struct ext4_extent *ex;
2877 struct ext4_extent *ex3 = NULL; 3078 unsigned int ee_len;
2878 ext4_lblk_t ee_block, eof_block; 3079 int split_flag = 0, depth;
2879 unsigned int allocated, ee_len, depth;
2880 ext4_fsblk_t newblock;
2881 int err = 0;
2882 int may_zeroout;
2883 3080
2884 ext_debug("ext4_split_unwritten_extents: inode %lu, logical" 3081 ext_debug("ext4_split_unwritten_extents: inode %lu, logical"
2885 "block %llu, max_blocks %u\n", inode->i_ino, 3082 "block %llu, max_blocks %u\n", inode->i_ino,
@@ -2889,156 +3086,22 @@ static int ext4_split_unwritten_extents(handle_t *handle,
2889 inode->i_sb->s_blocksize_bits; 3086 inode->i_sb->s_blocksize_bits;
2890 if (eof_block < map->m_lblk + map->m_len) 3087 if (eof_block < map->m_lblk + map->m_len)
2891 eof_block = map->m_lblk + map->m_len; 3088 eof_block = map->m_lblk + map->m_len;
2892
2893 depth = ext_depth(inode);
2894 ex = path[depth].p_ext;
2895 ee_block = le32_to_cpu(ex->ee_block);
2896 ee_len = ext4_ext_get_actual_len(ex);
2897 allocated = ee_len - (map->m_lblk - ee_block);
2898 newblock = map->m_lblk - ee_block + ext4_ext_pblock(ex);
2899
2900 ex2 = ex;
2901 orig_ex.ee_block = ex->ee_block;
2902 orig_ex.ee_len = cpu_to_le16(ee_len);
2903 ext4_ext_store_pblock(&orig_ex, ext4_ext_pblock(ex));
2904
2905 /* 3089 /*
2906 * It is safe to convert extent to initialized via explicit 3090 * It is safe to convert extent to initialized via explicit
2907 * zeroout only if extent is fully inside i_size or new_size. 3091 * zeroout only if extent is fully inside i_size or new_size.
2908 */ 3092 */
2909 may_zeroout = ee_block + ee_len <= eof_block; 3093 depth = ext_depth(inode);
2910 3094 ex = path[depth].p_ext;
2911 /* 3095 ee_block = le32_to_cpu(ex->ee_block);
2912 * If the uninitialized extent begins at the same logical 3096 ee_len = ext4_ext_get_actual_len(ex);
2913 * block where the write begins, and the write completely
2914 * covers the extent, then we don't need to split it.
2915 */
2916 if ((map->m_lblk == ee_block) && (allocated <= map->m_len))
2917 return allocated;
2918
2919 err = ext4_ext_get_access(handle, inode, path + depth);
2920 if (err)
2921 goto out;
2922 /* ex1: ee_block to map->m_lblk - 1 : uninitialized */
2923 if (map->m_lblk > ee_block) {
2924 ex1 = ex;
2925 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
2926 ext4_ext_mark_uninitialized(ex1);
2927 ex2 = &newex;
2928 }
2929 /*
2930 * for sanity, update the length of the ex2 extent before
2931 * we insert ex3, if ex1 is NULL. This is to avoid temporary
2932 * overlap of blocks.
2933 */
2934 if (!ex1 && allocated > map->m_len)
2935 ex2->ee_len = cpu_to_le16(map->m_len);
2936 /* ex3: to ee_block + ee_len : uninitialised */
2937 if (allocated > map->m_len) {
2938 unsigned int newdepth;
2939 ex3 = &newex;
2940 ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len);
2941 ext4_ext_store_pblock(ex3, newblock + map->m_len);
2942 ex3->ee_len = cpu_to_le16(allocated - map->m_len);
2943 ext4_ext_mark_uninitialized(ex3);
2944 err = ext4_ext_insert_extent(handle, inode, path, ex3, flags);
2945 if (err == -ENOSPC && may_zeroout) {
2946 err = ext4_ext_zeroout(inode, &orig_ex);
2947 if (err)
2948 goto fix_extent_len;
2949 /* update the extent length and mark as initialized */
2950 ex->ee_block = orig_ex.ee_block;
2951 ex->ee_len = orig_ex.ee_len;
2952 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
2953 ext4_ext_dirty(handle, inode, path + depth);
2954 /* zeroed the full extent */
2955 /* blocks available from map->m_lblk */
2956 return allocated;
2957
2958 } else if (err)
2959 goto fix_extent_len;
2960 /*
2961 * The depth, and hence eh & ex might change
2962 * as part of the insert above.
2963 */
2964 newdepth = ext_depth(inode);
2965 /*
2966 * update the extent length after successful insert of the
2967 * split extent
2968 */
2969 ee_len -= ext4_ext_get_actual_len(ex3);
2970 orig_ex.ee_len = cpu_to_le16(ee_len);
2971 may_zeroout = ee_block + ee_len <= eof_block;
2972
2973 depth = newdepth;
2974 ext4_ext_drop_refs(path);
2975 path = ext4_ext_find_extent(inode, map->m_lblk, path);
2976 if (IS_ERR(path)) {
2977 err = PTR_ERR(path);
2978 goto out;
2979 }
2980 ex = path[depth].p_ext;
2981 if (ex2 != &newex)
2982 ex2 = ex;
2983 3097
2984 err = ext4_ext_get_access(handle, inode, path + depth); 3098 split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
2985 if (err) 3099 split_flag |= EXT4_EXT_MARK_UNINIT2;
2986 goto out;
2987 3100
2988 allocated = map->m_len; 3101 flags |= EXT4_GET_BLOCKS_PRE_IO;
2989 } 3102 return ext4_split_extent(handle, inode, path, map, split_flag, flags);
2990 /*
2991 * If there was a change of depth as part of the
2992 * insertion of ex3 above, we need to update the length
2993 * of the ex1 extent again here
2994 */
2995 if (ex1 && ex1 != ex) {
2996 ex1 = ex;
2997 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
2998 ext4_ext_mark_uninitialized(ex1);
2999 ex2 = &newex;
3000 }
3001 /*
3002 * ex2: map->m_lblk to map->m_lblk + map->m_len-1 : to be written
3003 * using direct I/O, uninitialised still.
3004 */
3005 ex2->ee_block = cpu_to_le32(map->m_lblk);
3006 ext4_ext_store_pblock(ex2, newblock);
3007 ex2->ee_len = cpu_to_le16(allocated);
3008 ext4_ext_mark_uninitialized(ex2);
3009 if (ex2 != ex)
3010 goto insert;
3011 /* Mark modified extent as dirty */
3012 err = ext4_ext_dirty(handle, inode, path + depth);
3013 ext_debug("out here\n");
3014 goto out;
3015insert:
3016 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
3017 if (err == -ENOSPC && may_zeroout) {
3018 err = ext4_ext_zeroout(inode, &orig_ex);
3019 if (err)
3020 goto fix_extent_len;
3021 /* update the extent length and mark as initialized */
3022 ex->ee_block = orig_ex.ee_block;
3023 ex->ee_len = orig_ex.ee_len;
3024 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
3025 ext4_ext_dirty(handle, inode, path + depth);
3026 /* zero out the first half */
3027 return allocated;
3028 } else if (err)
3029 goto fix_extent_len;
3030out:
3031 ext4_ext_show_leaf(inode, path);
3032 return err ? err : allocated;
3033
3034fix_extent_len:
3035 ex->ee_block = orig_ex.ee_block;
3036 ex->ee_len = orig_ex.ee_len;
3037 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
3038 ext4_ext_mark_uninitialized(ex);
3039 ext4_ext_dirty(handle, inode, path + depth);
3040 return err;
3041} 3103}
3104
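After the rewrite, the whole DIO case is expressed as flag composition: both resulting halves stay uninitialized, zeroout is allowed only when the extent sits inside i_size, and PRE_IO defers the conversion. A small standalone model of the flag assembly (plain enum values standing in for the kernel's flags, which are assumptions here):

#include <stdio.h>

enum {
        MAY_ZEROOUT  = 1 << 0, /* stands in for EXT4_EXT_MAY_ZEROOUT  */
        MARK_UNINIT2 = 1 << 1, /* stands in for EXT4_EXT_MARK_UNINIT2 */
};

/* Hypothetical model of the split_flag assembly in
 * ext4_split_unwritten_extents(). */
static int dio_split_flags(unsigned ee_block, unsigned ee_len,
                           unsigned eof_block)
{
        int split_flag = 0;

        if (ee_block + ee_len <= eof_block)
                split_flag |= MAY_ZEROOUT; /* extent fully inside i_size */
        split_flag |= MARK_UNINIT2;        /* keep both halves uninit    */
        return split_flag;
}

int main(void)
{
        printf("%#x\n", dio_split_flags(100, 10, 200)); /* 0x3 */
        printf("%#x\n", dio_split_flags(100, 10, 105)); /* 0x2: extends past EOF */
        return 0;
}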
3042static int ext4_convert_unwritten_extents_endio(handle_t *handle, 3105static int ext4_convert_unwritten_extents_endio(handle_t *handle,
3043 struct inode *inode, 3106 struct inode *inode,
3044 struct ext4_ext_path *path) 3107 struct ext4_ext_path *path)
@@ -3047,46 +3110,27 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
3047 struct ext4_extent_header *eh; 3110 struct ext4_extent_header *eh;
3048 int depth; 3111 int depth;
3049 int err = 0; 3112 int err = 0;
3050 int ret = 0;
3051 3113
3052 depth = ext_depth(inode); 3114 depth = ext_depth(inode);
3053 eh = path[depth].p_hdr; 3115 eh = path[depth].p_hdr;
3054 ex = path[depth].p_ext; 3116 ex = path[depth].p_ext;
3055 3117
3118 ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical"
3119 "block %llu, max_blocks %u\n", inode->i_ino,
3120 (unsigned long long)le32_to_cpu(ex->ee_block),
3121 ext4_ext_get_actual_len(ex));
3122
3056 err = ext4_ext_get_access(handle, inode, path + depth); 3123 err = ext4_ext_get_access(handle, inode, path + depth);
3057 if (err) 3124 if (err)
3058 goto out; 3125 goto out;
3059 /* first mark the extent as initialized */ 3126 /* first mark the extent as initialized */
3060 ext4_ext_mark_initialized(ex); 3127 ext4_ext_mark_initialized(ex);
3061 3128
3062 /* 3129 /* note: ext4_ext_correct_indexes() isn't needed here because
3063 * We have to see if it can be merged with the extent 3130 * borders are not changed
3064 * on the left.
3065 */
3066 if (ex > EXT_FIRST_EXTENT(eh)) {
3067 /*
3068 * To merge left, pass "ex - 1" to try_to_merge(),
3069 * since it merges towards right _only_.
3070 */
3071 ret = ext4_ext_try_to_merge(inode, path, ex - 1);
3072 if (ret) {
3073 err = ext4_ext_correct_indexes(handle, inode, path);
3074 if (err)
3075 goto out;
3076 depth = ext_depth(inode);
3077 ex--;
3078 }
3079 }
3080 /*
3081 * Try to Merge towards right.
3082 */ 3131 */
3083 ret = ext4_ext_try_to_merge(inode, path, ex); 3132 ext4_ext_try_to_merge(inode, path, ex);
3084 if (ret) { 3133
3085 err = ext4_ext_correct_indexes(handle, inode, path);
3086 if (err)
3087 goto out;
3088 depth = ext_depth(inode);
3089 }
3090 /* Mark modified extent as dirty */ 3134 /* Mark modified extent as dirty */
3091 err = ext4_ext_dirty(handle, inode, path + depth); 3135 err = ext4_ext_dirty(handle, inode, path + depth);
3092out: 3136out:
@@ -3302,15 +3346,19 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3302 ext4_fsblk_t newblock = 0; 3346 ext4_fsblk_t newblock = 0;
3303 int err = 0, depth, ret; 3347 int err = 0, depth, ret;
3304 unsigned int allocated = 0; 3348 unsigned int allocated = 0;
3349 unsigned int punched_out = 0;
3350 unsigned int result = 0;
3305 struct ext4_allocation_request ar; 3351 struct ext4_allocation_request ar;
3306 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; 3352 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
3353 struct ext4_map_blocks punch_map;
3307 3354
3308 ext_debug("blocks %u/%u requested for inode %lu\n", 3355 ext_debug("blocks %u/%u requested for inode %lu\n",
3309 map->m_lblk, map->m_len, inode->i_ino); 3356 map->m_lblk, map->m_len, inode->i_ino);
3310 trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); 3357 trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
3311 3358
3312 /* check in cache */ 3359 /* check in cache */
3313 if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) { 3360 if (ext4_ext_in_cache(inode, map->m_lblk, &newex) &&
3361 ((flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) == 0)) {
3314 if (!newex.ee_start_lo && !newex.ee_start_hi) { 3362 if (!newex.ee_start_lo && !newex.ee_start_hi) {
3315 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { 3363 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
3316 /* 3364 /*
@@ -3375,16 +3423,84 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3375 ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk, 3423 ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
3376 ee_block, ee_len, newblock); 3424 ee_block, ee_len, newblock);
3377 3425
3378 /* Do not put uninitialized extent in the cache */ 3426 if ((flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) == 0) {
3379 if (!ext4_ext_is_uninitialized(ex)) { 3427 /*
3380 ext4_ext_put_in_cache(inode, ee_block, 3428 * Do not put uninitialized extent
3381 ee_len, ee_start); 3429 * in the cache
3382 goto out; 3430 */
3431 if (!ext4_ext_is_uninitialized(ex)) {
3432 ext4_ext_put_in_cache(inode, ee_block,
3433 ee_len, ee_start);
3434 goto out;
3435 }
3436 ret = ext4_ext_handle_uninitialized_extents(
3437 handle, inode, map, path, flags,
3438 allocated, newblock);
3439 return ret;
3383 } 3440 }
3384 ret = ext4_ext_handle_uninitialized_extents(handle, 3441
3385 inode, map, path, flags, allocated, 3442 /*
3386 newblock); 3443 * Punch out the map length, but only to the
3387 return ret; 3444 * end of the extent
3445 */
3446 punched_out = allocated < map->m_len ?
3447 allocated : map->m_len;
3448
3449 /*
3450 * Since extents need to be converted to
3451 * uninitialized, they must fit in an
3452 * uninitialized extent
3453 */
3454 if (punched_out > EXT_UNINIT_MAX_LEN)
3455 punched_out = EXT_UNINIT_MAX_LEN;
3456
3457 punch_map.m_lblk = map->m_lblk;
3458 punch_map.m_pblk = newblock;
3459 punch_map.m_len = punched_out;
3460 punch_map.m_flags = 0;
3461
3462 /* Check to see if the extent needs to be split */
3463 if (punch_map.m_len != ee_len ||
3464 punch_map.m_lblk != ee_block) {
3465
3466 ret = ext4_split_extent(handle, inode,
3467 path, &punch_map, 0,
3468 EXT4_GET_BLOCKS_PUNCH_OUT_EXT |
3469 EXT4_GET_BLOCKS_PRE_IO);
3470
3471 if (ret < 0) {
3472 err = ret;
3473 goto out2;
3474 }
3475 /*
3476 * find extent for the block at
3477 * the start of the hole
3478 */
3479 ext4_ext_drop_refs(path);
3480 kfree(path);
3481
3482 path = ext4_ext_find_extent(inode,
3483 map->m_lblk, NULL);
3484 if (IS_ERR(path)) {
3485 err = PTR_ERR(path);
3486 path = NULL;
3487 goto out2;
3488 }
3489
3490 depth = ext_depth(inode);
3491 ex = path[depth].p_ext;
3492 ee_len = ext4_ext_get_actual_len(ex);
3493 ee_block = le32_to_cpu(ex->ee_block);
3494 ee_start = ext4_ext_pblock(ex);
3495
3496 }
3497
3498 ext4_ext_mark_uninitialized(ex);
3499
3500 err = ext4_ext_remove_space(inode, map->m_lblk,
3501 map->m_lblk + punched_out);
3502
3503 goto out2;
3388 } 3504 }
3389 } 3505 }
3390 3506
@@ -3446,6 +3562,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3446 else 3562 else
3447 /* disable in-core preallocation for non-regular files */ 3563 /* disable in-core preallocation for non-regular files */
3448 ar.flags = 0; 3564 ar.flags = 0;
3565 if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE)
3566 ar.flags |= EXT4_MB_HINT_NOPREALLOC;
3449 newblock = ext4_mb_new_blocks(handle, &ar, &err); 3567 newblock = ext4_mb_new_blocks(handle, &ar, &err);
3450 if (!newblock) 3568 if (!newblock)
3451 goto out2; 3569 goto out2;
@@ -3529,7 +3647,11 @@ out2:
3529 } 3647 }
3530 trace_ext4_ext_map_blocks_exit(inode, map->m_lblk, 3648 trace_ext4_ext_map_blocks_exit(inode, map->m_lblk,
3531 newblock, map->m_len, err ? err : allocated); 3649 newblock, map->m_len, err ? err : allocated);
3532 return err ? err : allocated; 3650
3651 result = (flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) ?
3652 punched_out : allocated;
3653
3654 return err ? err : result;
3533} 3655}
3534 3656
3535void ext4_ext_truncate(struct inode *inode) 3657void ext4_ext_truncate(struct inode *inode)
@@ -3577,7 +3699,7 @@ void ext4_ext_truncate(struct inode *inode)
3577 3699
3578 last_block = (inode->i_size + sb->s_blocksize - 1) 3700 last_block = (inode->i_size + sb->s_blocksize - 1)
3579 >> EXT4_BLOCK_SIZE_BITS(sb); 3701 >> EXT4_BLOCK_SIZE_BITS(sb);
3580 err = ext4_ext_remove_space(inode, last_block); 3702 err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCK);
3581 3703
3582 /* In a multi-transaction truncate, we only make the final 3704 /* In a multi-transaction truncate, we only make the final
3583 * transaction synchronous. 3705 * transaction synchronous.
@@ -3585,8 +3707,9 @@ void ext4_ext_truncate(struct inode *inode)
3585 if (IS_SYNC(inode)) 3707 if (IS_SYNC(inode))
3586 ext4_handle_sync(handle); 3708 ext4_handle_sync(handle);
3587 3709
3588out_stop:
3589 up_write(&EXT4_I(inode)->i_data_sem); 3710 up_write(&EXT4_I(inode)->i_data_sem);
3711
3712out_stop:
3590 /* 3713 /*
3591 * If this was a simple ftruncate() and the file will remain alive, 3714 * If this was a simple ftruncate() and the file will remain alive,
3592 * then we need to clear up the orphan record which we created above. 3715 * then we need to clear up the orphan record which we created above.
@@ -3651,10 +3774,6 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
3651 struct ext4_map_blocks map; 3774 struct ext4_map_blocks map;
3652 unsigned int credits, blkbits = inode->i_blkbits; 3775 unsigned int credits, blkbits = inode->i_blkbits;
3653 3776
3654 /* We only support the FALLOC_FL_KEEP_SIZE mode */
3655 if (mode & ~FALLOC_FL_KEEP_SIZE)
3656 return -EOPNOTSUPP;
3657
3658 /* 3777 /*
3659 * currently supporting (pre)allocate mode for extent-based 3778 * currently supporting (pre)allocate mode for extent-based
3660 * files _only_ 3779 * files _only_
@@ -3662,6 +3781,13 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
3662 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 3781 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
3663 return -EOPNOTSUPP; 3782 return -EOPNOTSUPP;
3664 3783
3784 /* Return error if mode is not supported */
3785 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
3786 return -EOPNOTSUPP;
3787
3788 if (mode & FALLOC_FL_PUNCH_HOLE)
3789 return ext4_punch_hole(file, offset, len);
3790
3665 trace_ext4_fallocate_enter(inode, offset, len, mode); 3791 trace_ext4_fallocate_enter(inode, offset, len, mode);
3666 map.m_lblk = offset >> blkbits; 3792 map.m_lblk = offset >> blkbits;
3667 /* 3793 /*
@@ -3691,7 +3817,8 @@ retry:
3691 break; 3817 break;
3692 } 3818 }
3693 ret = ext4_map_blocks(handle, inode, &map, 3819 ret = ext4_map_blocks(handle, inode, &map,
3694 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT); 3820 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT |
3821 EXT4_GET_BLOCKS_NO_NORMALIZE);
3695 if (ret <= 0) { 3822 if (ret <= 0) {
3696#ifdef EXT4FS_DEBUG 3823#ifdef EXT4FS_DEBUG
3697 WARN_ON(ret <= 0); 3824 WARN_ON(ret <= 0);
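With the mode check relaxed above, hole punching is reachable from userspace through fallocate(2). A minimal usage sketch (error handling trimmed, the file name is arbitrary; on this patch FALLOC_FL_KEEP_SIZE is optional alongside PUNCH_HOLE, but passing it matches the semantics later kernels enforce):

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fd = open("testfile", O_RDWR);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        /* Punch a 1 MiB hole at offset 4 KiB; KEEP_SIZE leaves i_size alone. */
        if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                      4096, 1024 * 1024) < 0)
                perror("fallocate");
        close(fd);
        return 0;
}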
@@ -3822,6 +3949,7 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
3822 pgoff_t last_offset; 3949 pgoff_t last_offset;
3823 pgoff_t offset; 3950 pgoff_t offset;
3824 pgoff_t index; 3951 pgoff_t index;
3952 pgoff_t start_index = 0;
3825 struct page **pages = NULL; 3953 struct page **pages = NULL;
3826 struct buffer_head *bh = NULL; 3954 struct buffer_head *bh = NULL;
3827 struct buffer_head *head = NULL; 3955 struct buffer_head *head = NULL;
@@ -3848,39 +3976,57 @@ out:
3848 kfree(pages); 3976 kfree(pages);
3849 return EXT_CONTINUE; 3977 return EXT_CONTINUE;
3850 } 3978 }
3979 index = 0;
3851 3980
3981next_page:
3852 /* Try to find the 1st mapped buffer. */ 3982 /* Try to find the 1st mapped buffer. */
3853 end = ((__u64)pages[0]->index << PAGE_SHIFT) >> 3983 end = ((__u64)pages[index]->index << PAGE_SHIFT) >>
3854 blksize_bits; 3984 blksize_bits;
3855 if (!page_has_buffers(pages[0])) 3985 if (!page_has_buffers(pages[index]))
3856 goto out; 3986 goto out;
3857 head = page_buffers(pages[0]); 3987 head = page_buffers(pages[index]);
3858 if (!head) 3988 if (!head)
3859 goto out; 3989 goto out;
3860 3990
3991 index++;
3861 bh = head; 3992 bh = head;
3862 do { 3993 do {
3863 if (buffer_mapped(bh)) { 3994 if (end >= newex->ec_block +
3995 newex->ec_len)
3996 /* The buffer is out of
3997 * the request range.
3998 */
3999 goto out;
4000
4001 if (buffer_mapped(bh) &&
4002 end >= newex->ec_block) {
4003 start_index = index - 1;
3864 /* get the 1st mapped buffer. */ 4004 /* get the 1st mapped buffer. */
3865 if (end > newex->ec_block +
3866 newex->ec_len)
3867 /* The buffer is out of
3868 * the request range.
3869 */
3870 goto out;
3871 goto found_mapped_buffer; 4005 goto found_mapped_buffer;
3872 } 4006 }
4007
3873 bh = bh->b_this_page; 4008 bh = bh->b_this_page;
3874 end++; 4009 end++;
3875 } while (bh != head); 4010 } while (bh != head);
3876 4011
3877 /* No mapped buffer found. */ 4012 /* No mapped buffer in the range was found in this page;
3878 goto out; 4013 * we need to look up the next page.
4014 */
4015 if (index >= ret) {
4016 /* There is no page left, but we need to limit
4017 * newex->ec_len.
4018 */
4019 newex->ec_len = end - newex->ec_block;
4020 goto out;
4021 }
4022 goto next_page;
3879 } else { 4023 } else {
3880 /* Find contiguous delayed buffers. */ 4024 /* Find contiguous delayed buffers. */
3881 if (ret > 0 && pages[0]->index == last_offset) 4025 if (ret > 0 && pages[0]->index == last_offset)
3882 head = page_buffers(pages[0]); 4026 head = page_buffers(pages[0]);
3883 bh = head; 4027 bh = head;
4028 index = 1;
4029 start_index = 0;
3884 } 4030 }
3885 4031
3886found_mapped_buffer: 4032found_mapped_buffer:
@@ -3903,7 +4049,7 @@ found_mapped_buffer:
3903 end++; 4049 end++;
3904 } while (bh != head); 4050 } while (bh != head);
3905 4051
3906 for (index = 1; index < ret; index++) { 4052 for (; index < ret; index++) {
3907 if (!page_has_buffers(pages[index])) { 4053 if (!page_has_buffers(pages[index])) {
3908 bh = NULL; 4054 bh = NULL;
3909 break; 4055 break;
@@ -3913,8 +4059,10 @@ found_mapped_buffer:
3913 bh = NULL; 4059 bh = NULL;
3914 break; 4060 break;
3915 } 4061 }
4062
3916 if (pages[index]->index != 4063 if (pages[index]->index !=
3917 pages[0]->index + index) { 4064 pages[start_index]->index + index
4065 - start_index) {
3918 /* Blocks are not contiguous. */ 4066 /* Blocks are not contiguous. */
3919 bh = NULL; 4067 bh = NULL;
3920 break; 4068 break;
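The loop above now measures contiguity from the page that held the first mapped buffer (start_index) instead of pages[0]. A standalone model of the adjusted check, with an array standing in for the pages[]->index values (names are hypothetical):

#include <stdio.h>

/* Hypothetical model: page_index[i] holds pages[i]->index. The run of
 * mapped buffers that began in pages[start_index] stays contiguous only
 * while each later page's index advances in lockstep. */
static int pages_contiguous(const unsigned long *page_index, int nr_pages,
                            int start_index)
{
        for (int i = start_index + 1; i < nr_pages; i++)
                if (page_index[i] !=
                    page_index[start_index] + (i - start_index))
                        return 0;
        return 1;
}

int main(void)
{
        unsigned long run[] = { 8, 9, 10, 11 };
        unsigned long gap[] = { 8, 9, 10, 12 };

        printf("%d\n", pages_contiguous(run, 4, 1)); /* 1: contiguous     */
        printf("%d\n", pages_contiguous(gap, 4, 1)); /* 0: gap at the end */
        return 0;
}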
@@ -4006,6 +4154,177 @@ static int ext4_xattr_fiemap(struct inode *inode,
4006 return (error < 0 ? error : 0); 4154 return (error < 0 ? error : 0);
4007} 4155}
4008 4156
4157/*
4158 * ext4_ext_punch_hole
4159 *
4160 * Punches a hole of "length" bytes in a file starting
4161 * at byte "offset"
4162 *
4163 * @inode: The inode of the file to punch a hole in
4164 * @offset: The starting byte offset of the hole
4165 * @length: The length of the hole
4166 *
4167 * Returns the number of blocks removed or negative on error
4168 */
4169int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
4170{
4171 struct inode *inode = file->f_path.dentry->d_inode;
4172 struct super_block *sb = inode->i_sb;
4173 struct ext4_ext_cache cache_ex;
4174 ext4_lblk_t first_block, last_block, num_blocks, iblock, max_blocks;
4175 struct address_space *mapping = inode->i_mapping;
4176 struct ext4_map_blocks map;
4177 handle_t *handle;
4178 loff_t first_block_offset, last_block_offset, block_len;
4179 loff_t first_page, last_page, first_page_offset, last_page_offset;
4180 int ret, credits, blocks_released, err = 0;
4181
4182 first_block = (offset + sb->s_blocksize - 1) >>
4183 EXT4_BLOCK_SIZE_BITS(sb);
4184 last_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
4185
4186 first_block_offset = first_block << EXT4_BLOCK_SIZE_BITS(sb);
4187 last_block_offset = last_block << EXT4_BLOCK_SIZE_BITS(sb);
4188
4189 first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
4190 last_page = (offset + length) >> PAGE_CACHE_SHIFT;
4191
4192 first_page_offset = first_page << PAGE_CACHE_SHIFT;
4193 last_page_offset = last_page << PAGE_CACHE_SHIFT;
4194
4195 /*
4196 * Write out all dirty pages to avoid race conditions,
4197 * then release them.
4198 */
4199 if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
4200 err = filemap_write_and_wait_range(mapping,
4201 first_page_offset == 0 ? 0 : first_page_offset-1,
4202 last_page_offset);
4203
4204 if (err)
4205 return err;
4206 }
4207
4208 /* Now release the pages */
4209 if (last_page_offset > first_page_offset) {
4210 truncate_inode_pages_range(mapping, first_page_offset,
4211 last_page_offset-1);
4212 }
4213
4214 /* finish any pending end_io work */
4215 ext4_flush_completed_IO(inode);
4216
4217 credits = ext4_writepage_trans_blocks(inode);
4218 handle = ext4_journal_start(inode, credits);
4219 if (IS_ERR(handle))
4220 return PTR_ERR(handle);
4221
4222 err = ext4_orphan_add(handle, inode);
4223 if (err)
4224 goto out;
4225
4226 /*
4227 * Now we need to zero out the non-block-aligned data.
4228 * If the hole lies entirely within a single block, just
4229 * zero out the middle of that block
4230 */
4231 if (first_block > last_block)
4232 ext4_block_zero_page_range(handle, mapping, offset, length);
4233 else {
4234 /* zero out the head of the hole before the first block */
4235 block_len = first_block_offset - offset;
4236 if (block_len > 0)
4237 ext4_block_zero_page_range(handle, mapping,
4238 offset, block_len);
4239
4240 /* zero out the tail of the hole after the last block */
4241 block_len = offset + length - last_block_offset;
4242 if (block_len > 0) {
4243 ext4_block_zero_page_range(handle, mapping,
4244 last_block_offset, block_len);
4245 }
4246 }
4247
4248 /* If there are no blocks to remove, return now */
4249 if (first_block >= last_block)
4250 goto out;
4251
4252 down_write(&EXT4_I(inode)->i_data_sem);
4253 ext4_ext_invalidate_cache(inode);
4254 ext4_discard_preallocations(inode);
4255
4256 /*
4257 * Loop over all the blocks and identify blocks
4258 * that need to be punched out
4259 */
4260 iblock = first_block;
4261 blocks_released = 0;
4262 while (iblock < last_block) {
4263 max_blocks = last_block - iblock;
4264 num_blocks = 1;
4265 memset(&map, 0, sizeof(map));
4266 map.m_lblk = iblock;
4267 map.m_len = max_blocks;
4268 ret = ext4_ext_map_blocks(handle, inode, &map,
4269 EXT4_GET_BLOCKS_PUNCH_OUT_EXT);
4270
4271 if (ret > 0) {
4272 blocks_released += ret;
4273 num_blocks = ret;
4274 } else if (ret == 0) {
4275 /*
4276 * If map blocks could not find the block,
4277 * then it is in a hole. If the hole was
4278 * not already cached, then map blocks should
4279 * put it in the cache. So we can get the hole
4280 * out of the cache
4281 */
4282 memset(&cache_ex, 0, sizeof(cache_ex));
4283 if ((ext4_ext_check_cache(inode, iblock, &cache_ex)) &&
4284 !cache_ex.ec_start) {
4285
4286 /* The hole is cached */
4287 num_blocks = cache_ex.ec_block +
4288 cache_ex.ec_len - iblock;
4289
4290 } else {
4291 /* The block could not be identified */
4292 err = -EIO;
4293 break;
4294 }
4295 } else {
4296 /* Map blocks error */
4297 err = ret;
4298 break;
4299 }
4300
4301 if (num_blocks == 0) {
4302 /* This condition should never happen */
4303 ext_debug("Block lookup failed");
4304 err = -EIO;
4305 break;
4306 }
4307
4308 iblock += num_blocks;
4309 }
4310
4311 if (blocks_released > 0) {
4312 ext4_ext_invalidate_cache(inode);
4313 ext4_discard_preallocations(inode);
4314 }
4315
4316 if (IS_SYNC(inode))
4317 ext4_handle_sync(handle);
4318
4319 up_write(&EXT4_I(inode)->i_data_sem);
4320
4321out:
4322 ext4_orphan_del(handle, inode);
4323 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
4324 ext4_mark_inode_dirty(handle, inode);
4325 ext4_journal_stop(handle);
4326 return err;
4327}
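The boundary math at the top of the function rounds the hole's start up and its end down to block boundaries, so only whole blocks inside the hole are removed while the partial edges are merely zeroed. A standalone worked example assuming 4096-byte blocks:

#include <stdio.h>

int main(void)
{
        unsigned long long offset = 5000, length = 10000;
        unsigned blkbits = 12;                    /* 4096-byte blocks */
        unsigned long long bs = 1ULL << blkbits;

        /* first whole block inside the hole: round the start up */
        unsigned long long first_block = (offset + bs - 1) >> blkbits;
        /* first block past the hole: round the end down */
        unsigned long long last_block = (offset + length) >> blkbits;

        /* hole [5000, 15000): bytes 5000..8191 and 12288..14999 are only
         * zeroed in place; block 2 (bytes 8192..12287) is actually freed */
        printf("remove blocks [%llu, %llu)\n", first_block, last_block);
        return 0;
}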
4009int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 4328int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4010 __u64 start, __u64 len) 4329 __u64 start, __u64 len)
4011{ 4330{
@@ -4042,4 +4361,3 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4042 4361
4043 return error; 4362 return error;
4044} 4363}
4045
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 7b80d543b89e..2c0972322009 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -272,7 +272,6 @@ const struct file_operations ext4_file_operations = {
272}; 272};
273 273
274const struct inode_operations ext4_file_inode_operations = { 274const struct inode_operations ext4_file_inode_operations = {
275 .truncate = ext4_truncate,
276 .setattr = ext4_setattr, 275 .setattr = ext4_setattr,
277 .getattr = ext4_getattr, 276 .getattr = ext4_getattr,
278#ifdef CONFIG_EXT4_FS_XATTR 277#ifdef CONFIG_EXT4_FS_XATTR
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index e9473cbe80df..ce66d2fe826c 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -36,7 +36,7 @@
36 36
37static void dump_completed_IO(struct inode * inode) 37static void dump_completed_IO(struct inode * inode)
38{ 38{
39#ifdef EXT4_DEBUG 39#ifdef EXT4FS_DEBUG
40 struct list_head *cur, *before, *after; 40 struct list_head *cur, *before, *after;
41 ext4_io_end_t *io, *io0, *io1; 41 ext4_io_end_t *io, *io0, *io1;
42 unsigned long flags; 42 unsigned long flags;
@@ -172,6 +172,7 @@ int ext4_sync_file(struct file *file, int datasync)
172 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; 172 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
173 int ret; 173 int ret;
174 tid_t commit_tid; 174 tid_t commit_tid;
175 bool needs_barrier = false;
175 176
176 J_ASSERT(ext4_journal_current_handle() == NULL); 177 J_ASSERT(ext4_journal_current_handle() == NULL);
177 178
@@ -211,22 +212,12 @@ int ext4_sync_file(struct file *file, int datasync)
211 } 212 }
212 213
213 commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid; 214 commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
214 if (jbd2_log_start_commit(journal, commit_tid)) { 215 if (journal->j_flags & JBD2_BARRIER &&
215 /* 216 !jbd2_trans_will_send_data_barrier(journal, commit_tid))
216 * When the journal is on a different device than the 217 needs_barrier = true;
217 * fs data disk, we need to issue the barrier in 218 jbd2_log_start_commit(journal, commit_tid);
218 * writeback mode. (In ordered mode, the jbd2 layer 219 ret = jbd2_log_wait_commit(journal, commit_tid);
219 * will take care of issuing the barrier. In 220 if (needs_barrier)
220 * data=journal, all of the data blocks are written to
221 * the journal device.)
222 */
223 if (ext4_should_writeback_data(inode) &&
224 (journal->j_fs_dev != journal->j_dev) &&
225 (journal->j_flags & JBD2_BARRIER))
226 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL,
227 NULL);
228 ret = jbd2_log_wait_commit(journal, commit_tid);
229 } else if (journal->j_flags & JBD2_BARRIER)
230 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); 221 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
231 out: 222 out:
232 trace_ext4_sync_file_exit(inode, ret); 223 trace_ext4_sync_file_exit(inode, ret);
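The rewritten fsync path asks JBD2 up front whether the commit it is about to wait for will already send a cache flush to the data device; ext4 issues its own flush afterwards only when it will not (for example with an external journal device). A simplified standalone model of that decision, with booleans standing in for the JBD2 state (this is not the kernel API):

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical model: 'journal_barrier' mirrors JBD2_BARRIER and
 * 'commit_sends_barrier' mirrors jbd2_trans_will_send_data_barrier(). */
static bool fsync_needs_own_flush(bool journal_barrier,
                                  bool commit_sends_barrier)
{
        return journal_barrier && !commit_sends_barrier;
}

int main(void)
{
        /* e.g. external journal device: the commit's barrier goes to the
         * journal disk, so the fs data disk still needs a flush here */
        printf("%d\n", fsync_needs_own_flush(true, false)); /* 1 */
        printf("%d\n", fsync_needs_own_flush(true, true));  /* 0 */
        return 0;
}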
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index f2fa5e8a582c..50d0e9c64584 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -639,8 +639,8 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
639 while (target > 0) { 639 while (target > 0) {
640 count = target; 640 count = target;
641 /* allocating blocks for indirect blocks and direct blocks */ 641 /* allocating blocks for indirect blocks and direct blocks */
642 current_block = ext4_new_meta_blocks(handle, inode, 642 current_block = ext4_new_meta_blocks(handle, inode, goal,
643 goal, &count, err); 643 0, &count, err);
644 if (*err) 644 if (*err)
645 goto failed_out; 645 goto failed_out;
646 646
@@ -1930,7 +1930,7 @@ repeat:
1930 * We do still charge estimated metadata to the sb though; 1930 * We do still charge estimated metadata to the sb though;
1931 * we cannot afford to run out of free blocks. 1931 * we cannot afford to run out of free blocks.
1932 */ 1932 */
1933 if (ext4_claim_free_blocks(sbi, md_needed + 1)) { 1933 if (ext4_claim_free_blocks(sbi, md_needed + 1, 0)) {
1934 dquot_release_reservation_block(inode, 1); 1934 dquot_release_reservation_block(inode, 1);
1935 if (ext4_should_retry_alloc(inode->i_sb, &retries)) { 1935 if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
1936 yield(); 1936 yield();
@@ -2796,9 +2796,7 @@ static int write_cache_pages_da(struct address_space *mapping,
2796 continue; 2796 continue;
2797 } 2797 }
2798 2798
2799 if (PageWriteback(page)) 2799 wait_on_page_writeback(page);
2800 wait_on_page_writeback(page);
2801
2802 BUG_ON(PageWriteback(page)); 2800 BUG_ON(PageWriteback(page));
2803 2801
2804 if (mpd->next_page != page->index) 2802 if (mpd->next_page != page->index)
@@ -3513,7 +3511,7 @@ retry:
3513 loff_t end = offset + iov_length(iov, nr_segs); 3511 loff_t end = offset + iov_length(iov, nr_segs);
3514 3512
3515 if (end > isize) 3513 if (end > isize)
3516 vmtruncate(inode, isize); 3514 ext4_truncate_failed_write(inode);
3517 } 3515 }
3518 } 3516 }
3519 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 3517 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
@@ -3916,9 +3914,30 @@ void ext4_set_aops(struct inode *inode)
3916int ext4_block_truncate_page(handle_t *handle, 3914int ext4_block_truncate_page(handle_t *handle,
3917 struct address_space *mapping, loff_t from) 3915 struct address_space *mapping, loff_t from)
3918{ 3916{
3917 unsigned offset = from & (PAGE_CACHE_SIZE-1);
3918 unsigned length;
3919 unsigned blocksize;
3920 struct inode *inode = mapping->host;
3921
3922 blocksize = inode->i_sb->s_blocksize;
3923 length = blocksize - (offset & (blocksize - 1));
3924
3925 return ext4_block_zero_page_range(handle, mapping, from, length);
3926}
3927
3928/*
3929 * ext4_block_zero_page_range() zeros out a mapping of length 'length'
3930 * starting from file offset 'from'. The range to be zeroed must
3931 * be contained within one block. If the specified range exceeds
3932 * the end of the block it will be shortened to the end of the block
3933 * that corresponds to 'from'
3934 */
3935int ext4_block_zero_page_range(handle_t *handle,
3936 struct address_space *mapping, loff_t from, loff_t length)
3937{
3919 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; 3938 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
3920 unsigned offset = from & (PAGE_CACHE_SIZE-1); 3939 unsigned offset = from & (PAGE_CACHE_SIZE-1);
3921 unsigned blocksize, length, pos; 3940 unsigned blocksize, max, pos;
3922 ext4_lblk_t iblock; 3941 ext4_lblk_t iblock;
3923 struct inode *inode = mapping->host; 3942 struct inode *inode = mapping->host;
3924 struct buffer_head *bh; 3943 struct buffer_head *bh;
@@ -3931,7 +3950,15 @@ int ext4_block_truncate_page(handle_t *handle,
3931 return -EINVAL; 3950 return -EINVAL;
3932 3951
3933 blocksize = inode->i_sb->s_blocksize; 3952 blocksize = inode->i_sb->s_blocksize;
3934 length = blocksize - (offset & (blocksize - 1)); 3953 max = blocksize - (offset & (blocksize - 1));
3954
3955 /*
3956 * correct length if it does not fall between
3957 * 'from' and the end of the block
3958 */
3959 if (length > max || length < 0)
3960 length = max;
3961
3935 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); 3962 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
3936 3963
3937 if (!page_has_buffers(page)) 3964 if (!page_has_buffers(page))
@@ -4380,8 +4407,6 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4380 4407
4381int ext4_can_truncate(struct inode *inode) 4408int ext4_can_truncate(struct inode *inode)
4382{ 4409{
4383 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
4384 return 0;
4385 if (S_ISREG(inode->i_mode)) 4410 if (S_ISREG(inode->i_mode))
4386 return 1; 4411 return 1;
4387 if (S_ISDIR(inode->i_mode)) 4412 if (S_ISDIR(inode->i_mode))
@@ -4392,6 +4417,31 @@ int ext4_can_truncate(struct inode *inode)
4392} 4417}
4393 4418
4394/* 4419/*
4420 * ext4_punch_hole: punches a hole in a file by releasing the blocks
4421 * associated with the given offset and length
4422 *
4423 * @inode: File inode
4424 * @offset: The offset where the hole will begin
4425 * @len: The length of the hole
4426 *
4427 * Returns: 0 on success or negative on failure
4428 */
4429
4430int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
4431{
4432 struct inode *inode = file->f_path.dentry->d_inode;
4433 if (!S_ISREG(inode->i_mode))
4434 return -ENOTSUPP;
4435
4436 if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
4437 /* TODO: Add support for non extent hole punching */
4438 return -ENOTSUPP;
4439 }
4440
4441 return ext4_ext_punch_hole(file, offset, length);
4442}
4443
4444/*
4395 * ext4_truncate() 4445 * ext4_truncate()
4396 * 4446 *
4397 * We block out ext4_get_block() block instantiations across the entire 4447 * We block out ext4_get_block() block instantiations across the entire
@@ -4617,7 +4667,7 @@ static int __ext4_get_inode_loc(struct inode *inode,
4617 /* 4667 /*
4618 * Figure out the offset within the block group inode table 4668 * Figure out the offset within the block group inode table
4619 */ 4669 */
4620 inodes_per_block = (EXT4_BLOCK_SIZE(sb) / EXT4_INODE_SIZE(sb)); 4670 inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
4621 inode_offset = ((inode->i_ino - 1) % 4671 inode_offset = ((inode->i_ino - 1) %
4622 EXT4_INODES_PER_GROUP(sb)); 4672 EXT4_INODES_PER_GROUP(sb));
4623 block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block); 4673 block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
@@ -5311,8 +5361,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5311 5361
5312 if (S_ISREG(inode->i_mode) && 5362 if (S_ISREG(inode->i_mode) &&
5313 attr->ia_valid & ATTR_SIZE && 5363 attr->ia_valid & ATTR_SIZE &&
5314 (attr->ia_size < inode->i_size || 5364 (attr->ia_size < inode->i_size)) {
5315 (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))) {
5316 handle_t *handle; 5365 handle_t *handle;
5317 5366
5318 handle = ext4_journal_start(inode, 3); 5367 handle = ext4_journal_start(inode, 3);
@@ -5346,14 +5395,15 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5346 goto err_out; 5395 goto err_out;
5347 } 5396 }
5348 } 5397 }
5349 /* ext4_truncate will clear the flag */
5350 if ((ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))
5351 ext4_truncate(inode);
5352 } 5398 }
5353 5399
5354 if ((attr->ia_valid & ATTR_SIZE) && 5400 if (attr->ia_valid & ATTR_SIZE) {
5355 attr->ia_size != i_size_read(inode)) 5401 if (attr->ia_size != i_size_read(inode)) {
5356 rc = vmtruncate(inode, attr->ia_size); 5402 truncate_setsize(inode, attr->ia_size);
5403 ext4_truncate(inode);
5404 } else if (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
5405 ext4_truncate(inode);
5406 }
5357 5407
5358 if (!rc) { 5408 if (!rc) {
5359 setattr_copy(inode, attr); 5409 setattr_copy(inode, attr);
@@ -5811,15 +5861,19 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
5811 goto out_unlock; 5861 goto out_unlock;
5812 } 5862 }
5813 ret = 0; 5863 ret = 0;
5814 if (PageMappedToDisk(page)) 5864
5815 goto out_unlock; 5865 lock_page(page);
5866 wait_on_page_writeback(page);
5867 if (PageMappedToDisk(page)) {
5868 up_read(&inode->i_alloc_sem);
5869 return VM_FAULT_LOCKED;
5870 }
5816 5871
5817 if (page->index == size >> PAGE_CACHE_SHIFT) 5872 if (page->index == size >> PAGE_CACHE_SHIFT)
5818 len = size & ~PAGE_CACHE_MASK; 5873 len = size & ~PAGE_CACHE_MASK;
5819 else 5874 else
5820 len = PAGE_CACHE_SIZE; 5875 len = PAGE_CACHE_SIZE;
5821 5876
5822 lock_page(page);
5823 /* 5877 /*
5824 * return if we have all the buffers mapped. This avoid 5878 * return if we have all the buffers mapped. This avoid
5825 * the need to call write_begin/write_end which does a 5879 * the need to call write_begin/write_end which does a
@@ -5829,8 +5883,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
5829 if (page_has_buffers(page)) { 5883 if (page_has_buffers(page)) {
5830 if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, 5884 if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
5831 ext4_bh_unmapped)) { 5885 ext4_bh_unmapped)) {
5832 unlock_page(page); 5886 up_read(&inode->i_alloc_sem);
5833 goto out_unlock; 5887 return VM_FAULT_LOCKED;
5834 } 5888 }
5835 } 5889 }
5836 unlock_page(page); 5890 unlock_page(page);
@@ -5850,6 +5904,16 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
5850 if (ret < 0) 5904 if (ret < 0)
5851 goto out_unlock; 5905 goto out_unlock;
5852 ret = 0; 5906 ret = 0;
5907
5908 /*
5909 * write_begin/end might have created a dirty page and someone
5910 * could wander in and start the IO. Make sure that hasn't
5911 * happened.
5912 */
5913 lock_page(page);
5914 wait_on_page_writeback(page);
5915 up_read(&inode->i_alloc_sem);
5916 return VM_FAULT_LOCKED;
5853out_unlock: 5917out_unlock:
5854 if (ret) 5918 if (ret)
5855 ret = VM_FAULT_SIGBUS; 5919 ret = VM_FAULT_SIGBUS;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index d8a16eecf1d5..859f2ae8864e 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -787,6 +787,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
787 struct inode *inode; 787 struct inode *inode;
788 char *data; 788 char *data;
789 char *bitmap; 789 char *bitmap;
790 struct ext4_group_info *grinfo;
790 791
791 mb_debug(1, "init page %lu\n", page->index); 792 mb_debug(1, "init page %lu\n", page->index);
792 793
@@ -819,6 +820,18 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
819 if (first_group + i >= ngroups) 820 if (first_group + i >= ngroups)
820 break; 821 break;
821 822
823 grinfo = ext4_get_group_info(sb, first_group + i);
824 /*
825 * If page is uptodate then we came here after online resize
826 * which added some new uninitialized group info structs, so
827 * we must skip all initialized uptodate buddies on the page,
828 * which may be currently in use by an allocating task.
829 */
830 if (PageUptodate(page) && !EXT4_MB_GRP_NEED_INIT(grinfo)) {
831 bh[i] = NULL;
832 continue;
833 }
834
822 err = -EIO; 835 err = -EIO;
823 desc = ext4_get_group_desc(sb, first_group + i, NULL); 836 desc = ext4_get_group_desc(sb, first_group + i, NULL);
824 if (desc == NULL) 837 if (desc == NULL)
@@ -871,26 +884,28 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
871 } 884 }
872 885
873 /* wait for I/O completion */ 886 /* wait for I/O completion */
874 for (i = 0; i < groups_per_page && bh[i]; i++) 887 for (i = 0; i < groups_per_page; i++)
875 wait_on_buffer(bh[i]); 888 if (bh[i])
889 wait_on_buffer(bh[i]);
876 890
877 err = -EIO; 891 err = -EIO;
878 for (i = 0; i < groups_per_page && bh[i]; i++) 892 for (i = 0; i < groups_per_page; i++)
879 if (!buffer_uptodate(bh[i])) 893 if (bh[i] && !buffer_uptodate(bh[i]))
880 goto out; 894 goto out;
881 895
882 err = 0; 896 err = 0;
883 first_block = page->index * blocks_per_page; 897 first_block = page->index * blocks_per_page;
884 /* init the page */
885 memset(page_address(page), 0xff, PAGE_CACHE_SIZE);
886 for (i = 0; i < blocks_per_page; i++) { 898 for (i = 0; i < blocks_per_page; i++) {
887 int group; 899 int group;
888 struct ext4_group_info *grinfo;
889 900
890 group = (first_block + i) >> 1; 901 group = (first_block + i) >> 1;
891 if (group >= ngroups) 902 if (group >= ngroups)
892 break; 903 break;
893 904
905 if (!bh[group - first_group])
906 /* skip initialized uptodate buddy */
907 continue;
908
894 /* 909 /*
895 * data carry information regarding this 910 * data carry information regarding this
896 * particular group in the format specified 911 * particular group in the format specified
@@ -919,6 +934,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
919 * incore got set to the group block bitmap below 934 * incore got set to the group block bitmap below
920 */ 935 */
921 ext4_lock_group(sb, group); 936 ext4_lock_group(sb, group);
937 /* init the buddy */
938 memset(data, 0xff, blocksize);
922 ext4_mb_generate_buddy(sb, data, incore, group); 939 ext4_mb_generate_buddy(sb, data, incore, group);
923 ext4_unlock_group(sb, group); 940 ext4_unlock_group(sb, group);
924 incore = NULL; 941 incore = NULL;
@@ -948,7 +965,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
948 965
949out: 966out:
950 if (bh) { 967 if (bh) {
951 for (i = 0; i < groups_per_page && bh[i]; i++) 968 for (i = 0; i < groups_per_page; i++)
952 brelse(bh[i]); 969 brelse(bh[i]);
953 if (bh != &bhs) 970 if (bh != &bhs)
954 kfree(bh); 971 kfree(bh);
@@ -957,22 +974,21 @@ out:
957} 974}
958 975
959/* 976/*
960 * lock the group_info alloc_sem of all the groups 977 * Lock the buddy and bitmap pages. This makes sure other parallel init_group
961 * belonging to the same buddy cache page. This 978 * on the same buddy page doesn't happen while holding the buddy page lock.
962 * makes sure other parallel operation on the buddy 979 * Return locked buddy and bitmap pages on e4b struct. If buddy and bitmap
963 * cache doesn't happen while holding the buddy cache 980 * are on the same page e4b->bd_buddy_page is NULL and return value is 0.
964 * lock
965 */ 981 */
966static int ext4_mb_get_buddy_cache_lock(struct super_block *sb, 982static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
967 ext4_group_t group) 983 ext4_group_t group, struct ext4_buddy *e4b)
968{ 984{
969 int i; 985 struct inode *inode = EXT4_SB(sb)->s_buddy_cache;
970 int block, pnum; 986 int block, pnum, poff;
971 int blocks_per_page; 987 int blocks_per_page;
972 int groups_per_page; 988 struct page *page;
973 ext4_group_t ngroups = ext4_get_groups_count(sb); 989
974 ext4_group_t first_group; 990 e4b->bd_buddy_page = NULL;
975 struct ext4_group_info *grp; 991 e4b->bd_bitmap_page = NULL;
976 992
977 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; 993 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
978 /* 994 /*
@@ -982,57 +998,40 @@ static int ext4_mb_get_buddy_cache_lock(struct super_block *sb,
982 */ 998 */
983 block = group * 2; 999 block = group * 2;
984 pnum = block / blocks_per_page; 1000 pnum = block / blocks_per_page;
985 first_group = pnum * blocks_per_page / 2; 1001 poff = block % blocks_per_page;
986 1002 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
987 groups_per_page = blocks_per_page >> 1; 1003 if (!page)
988 if (groups_per_page == 0) 1004 return -EIO;
989 groups_per_page = 1; 1005 BUG_ON(page->mapping != inode->i_mapping);
990 /* read all groups the page covers into the cache */ 1006 e4b->bd_bitmap_page = page;
991 for (i = 0; i < groups_per_page; i++) { 1007 e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
992 1008
993 if ((first_group + i) >= ngroups) 1009 if (blocks_per_page >= 2) {
994 break; 1010 /* buddy and bitmap are on the same page */
995 grp = ext4_get_group_info(sb, first_group + i); 1011 return 0;
996 /* take all groups write allocation
997 * semaphore. This makes sure there is
998 * no block allocation going on in any
999 * of those groups
1000 */
1001 down_write_nested(&grp->alloc_sem, i);
1002 } 1012 }
1003 return i; 1013
1014 block++;
1015 pnum = block / blocks_per_page;
1016 poff = block % blocks_per_page;
1017 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1018 if (!page)
1019 return -EIO;
1020 BUG_ON(page->mapping != inode->i_mapping);
1021 e4b->bd_buddy_page = page;
1022 return 0;
1004} 1023}
1005 1024
1006static void ext4_mb_put_buddy_cache_lock(struct super_block *sb, 1025static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b)
1007 ext4_group_t group, int locked_group)
1008{ 1026{
1009 int i; 1027 if (e4b->bd_bitmap_page) {
1010 int block, pnum; 1028 unlock_page(e4b->bd_bitmap_page);
1011 int blocks_per_page; 1029 page_cache_release(e4b->bd_bitmap_page);
1012 ext4_group_t first_group; 1030 }
1013 struct ext4_group_info *grp; 1031 if (e4b->bd_buddy_page) {
1014 1032 unlock_page(e4b->bd_buddy_page);
1015 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; 1033 page_cache_release(e4b->bd_buddy_page);
1016 /*
1017 * the buddy cache inode stores the block bitmap
1018 * and buddy information in consecutive blocks.
1019 * So for each group we need two blocks.
1020 */
1021 block = group * 2;
1022 pnum = block / blocks_per_page;
1023 first_group = pnum * blocks_per_page / 2;
1024 /* release locks on all the groups */
1025 for (i = 0; i < locked_group; i++) {
1026
1027 grp = ext4_get_group_info(sb, first_group + i);
1028 /* take all groups write allocation
1029 * semaphore. This make sure there is
1030 * no block allocation going on in any
1031 * of that groups
1032 */
1033 up_write(&grp->alloc_sem);
1034 } 1034 }
1035
1036} 1035}
1037 1036
1038/* 1037/*
@@ -1044,93 +1043,60 @@ static noinline_for_stack
1044int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) 1043int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
1045{ 1044{
1046 1045
1047 int ret = 0;
1048 void *bitmap;
1049 int blocks_per_page;
1050 int block, pnum, poff;
1051 int num_grp_locked = 0;
1052 struct ext4_group_info *this_grp; 1046 struct ext4_group_info *this_grp;
1053 struct ext4_sb_info *sbi = EXT4_SB(sb); 1047 struct ext4_buddy e4b;
1054 struct inode *inode = sbi->s_buddy_cache; 1048 struct page *page;
1055 struct page *page = NULL, *bitmap_page = NULL; 1049 int ret = 0;
1056 1050
1057 mb_debug(1, "init group %u\n", group); 1051 mb_debug(1, "init group %u\n", group);
1058 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1059 this_grp = ext4_get_group_info(sb, group); 1052 this_grp = ext4_get_group_info(sb, group);
1060 /* 1053 /*
1061 * This ensures that we don't reinit the buddy cache 1054 * This ensures that we don't reinit the buddy cache
 1062 * page which maps to the group from which we are already 1055 * page which maps to the group from which we are already
1063 * allocating. If we are looking at the buddy cache we would 1056 * allocating. If we are looking at the buddy cache we would
1064 * have taken a reference using ext4_mb_load_buddy and that 1057 * have taken a reference using ext4_mb_load_buddy and that
 1065 * would have taken the alloc_sem lock. 1058 * would have pinned the buddy page in the page cache.
1066 */ 1059 */
1067 num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group); 1060 ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b);
1068 if (!EXT4_MB_GRP_NEED_INIT(this_grp)) { 1061 if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) {
1069 /* 1062 /*
1070 * somebody initialized the group 1063 * somebody initialized the group
1071 * return without doing anything 1064 * return without doing anything
1072 */ 1065 */
1073 ret = 0;
1074 goto err; 1066 goto err;
1075 } 1067 }
1076 /* 1068
1077 * the buddy cache inode stores the block bitmap 1069 page = e4b.bd_bitmap_page;
1078 * and buddy information in consecutive blocks. 1070 ret = ext4_mb_init_cache(page, NULL);
1079 * So for each group we need two blocks. 1071 if (ret)
1080 */ 1072 goto err;
1081 block = group * 2; 1073 if (!PageUptodate(page)) {
1082 pnum = block / blocks_per_page;
1083 poff = block % blocks_per_page;
1084 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1085 if (page) {
1086 BUG_ON(page->mapping != inode->i_mapping);
1087 ret = ext4_mb_init_cache(page, NULL);
1088 if (ret) {
1089 unlock_page(page);
1090 goto err;
1091 }
1092 unlock_page(page);
1093 }
1094 if (page == NULL || !PageUptodate(page)) {
1095 ret = -EIO; 1074 ret = -EIO;
1096 goto err; 1075 goto err;
1097 } 1076 }
1098 mark_page_accessed(page); 1077 mark_page_accessed(page);
1099 bitmap_page = page;
1100 bitmap = page_address(page) + (poff * sb->s_blocksize);
1101 1078
1102 /* init buddy cache */ 1079 if (e4b.bd_buddy_page == NULL) {
1103 block++;
1104 pnum = block / blocks_per_page;
1105 poff = block % blocks_per_page;
1106 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1107 if (page == bitmap_page) {
1108 /* 1080 /*
1109 * If both the bitmap and buddy are in 1081 * If both the bitmap and buddy are in
1110 * the same page we don't need to force 1082 * the same page we don't need to force
1111 * init the buddy 1083 * init the buddy
1112 */ 1084 */
1113 unlock_page(page); 1085 ret = 0;
1114 } else if (page) { 1086 goto err;
1115 BUG_ON(page->mapping != inode->i_mapping);
1116 ret = ext4_mb_init_cache(page, bitmap);
1117 if (ret) {
1118 unlock_page(page);
1119 goto err;
1120 }
1121 unlock_page(page);
1122 } 1087 }
1123 if (page == NULL || !PageUptodate(page)) { 1088 /* init buddy cache */
1089 page = e4b.bd_buddy_page;
1090 ret = ext4_mb_init_cache(page, e4b.bd_bitmap);
1091 if (ret)
1092 goto err;
1093 if (!PageUptodate(page)) {
1124 ret = -EIO; 1094 ret = -EIO;
1125 goto err; 1095 goto err;
1126 } 1096 }
1127 mark_page_accessed(page); 1097 mark_page_accessed(page);
1128err: 1098err:
1129 ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked); 1099 ext4_mb_put_buddy_page_lock(&e4b);
1130 if (bitmap_page)
1131 page_cache_release(bitmap_page);
1132 if (page)
1133 page_cache_release(page);
1134 return ret; 1100 return ret;
1135} 1101}
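With the alloc_sem gone, it is the page lock taken in ext4_mb_get_buddy_page_lock() that serializes concurrent initializers of the same group; an illustrative interleaving (editorial, not from the patch):

/*
 * T1: locks the bitmap page, sees EXT4_MB_GRP_NEED_INIT set, runs
 *     ext4_mb_init_cache() for the bitmap (and buddy), then unlocks.
 * T2: blocks in find_or_create_page() on that locked page; by the time it
 *     acquires the lock, NEED_INIT is clear and the "somebody initialized
 *     the group" early return above fires.
 */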
1136 1102
@@ -1164,24 +1130,8 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
1164 e4b->bd_group = group; 1130 e4b->bd_group = group;
1165 e4b->bd_buddy_page = NULL; 1131 e4b->bd_buddy_page = NULL;
1166 e4b->bd_bitmap_page = NULL; 1132 e4b->bd_bitmap_page = NULL;
1167 e4b->alloc_semp = &grp->alloc_sem;
1168
1169 /* Take the read lock on the group alloc
1170 * sem. This would make sure a parallel
1171 * ext4_mb_init_group happening on other
1172 * groups mapped by the page is blocked
1173 * till we are done with allocation
1174 */
1175repeat_load_buddy:
1176 down_read(e4b->alloc_semp);
1177 1133
1178 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { 1134 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
1179 /* we need to check for group need init flag
1180 * with alloc_semp held so that we can be sure
1181 * that new blocks didn't get added to the group
1182 * when we are loading the buddy cache
1183 */
1184 up_read(e4b->alloc_semp);
1185 /* 1135 /*
1186 * we need full data about the group 1136 * we need full data about the group
1187 * to make a good selection 1137 * to make a good selection
@@ -1189,7 +1139,6 @@ repeat_load_buddy:
1189 ret = ext4_mb_init_group(sb, group); 1139 ret = ext4_mb_init_group(sb, group);
1190 if (ret) 1140 if (ret)
1191 return ret; 1141 return ret;
1192 goto repeat_load_buddy;
1193 } 1142 }
1194 1143
1195 /* 1144 /*
@@ -1273,15 +1222,14 @@ repeat_load_buddy:
1273 return 0; 1222 return 0;
1274 1223
1275err: 1224err:
1225 if (page)
1226 page_cache_release(page);
1276 if (e4b->bd_bitmap_page) 1227 if (e4b->bd_bitmap_page)
1277 page_cache_release(e4b->bd_bitmap_page); 1228 page_cache_release(e4b->bd_bitmap_page);
1278 if (e4b->bd_buddy_page) 1229 if (e4b->bd_buddy_page)
1279 page_cache_release(e4b->bd_buddy_page); 1230 page_cache_release(e4b->bd_buddy_page);
1280 e4b->bd_buddy = NULL; 1231 e4b->bd_buddy = NULL;
1281 e4b->bd_bitmap = NULL; 1232 e4b->bd_bitmap = NULL;
1282
1283 /* Done with the buddy cache */
1284 up_read(e4b->alloc_semp);
1285 return ret; 1233 return ret;
1286} 1234}
1287 1235
@@ -1291,9 +1239,6 @@ static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
1291 page_cache_release(e4b->bd_bitmap_page); 1239 page_cache_release(e4b->bd_bitmap_page);
1292 if (e4b->bd_buddy_page) 1240 if (e4b->bd_buddy_page)
1293 page_cache_release(e4b->bd_buddy_page); 1241 page_cache_release(e4b->bd_buddy_page);
1294 /* Done with the buddy cache */
1295 if (e4b->alloc_semp)
1296 up_read(e4b->alloc_semp);
1297} 1242}
1298 1243
1299 1244
@@ -1606,9 +1551,6 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
1606 get_page(ac->ac_bitmap_page); 1551 get_page(ac->ac_bitmap_page);
1607 ac->ac_buddy_page = e4b->bd_buddy_page; 1552 ac->ac_buddy_page = e4b->bd_buddy_page;
1608 get_page(ac->ac_buddy_page); 1553 get_page(ac->ac_buddy_page);
1609 /* on allocation we use ac to track the held semaphore */
1610 ac->alloc_semp = e4b->alloc_semp;
1611 e4b->alloc_semp = NULL;
1612 /* store last allocated for subsequent stream allocation */ 1554 /* store last allocated for subsequent stream allocation */
1613 if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { 1555 if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
1614 spin_lock(&sbi->s_md_lock); 1556 spin_lock(&sbi->s_md_lock);
@@ -2659,7 +2601,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2659 struct super_block *sb = journal->j_private; 2601 struct super_block *sb = journal->j_private;
2660 struct ext4_buddy e4b; 2602 struct ext4_buddy e4b;
2661 struct ext4_group_info *db; 2603 struct ext4_group_info *db;
2662 int err, ret, count = 0, count2 = 0; 2604 int err, count = 0, count2 = 0;
2663 struct ext4_free_data *entry; 2605 struct ext4_free_data *entry;
2664 struct list_head *l, *ltmp; 2606 struct list_head *l, *ltmp;
2665 2607
@@ -2669,15 +2611,9 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2669 mb_debug(1, "gonna free %u blocks in group %u (0x%p):", 2611 mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
2670 entry->count, entry->group, entry); 2612 entry->count, entry->group, entry);
2671 2613
2672 if (test_opt(sb, DISCARD)) { 2614 if (test_opt(sb, DISCARD))
2673 ret = ext4_issue_discard(sb, entry->group, 2615 ext4_issue_discard(sb, entry->group,
2674 entry->start_blk, entry->count); 2616 entry->start_blk, entry->count);
2675 if (unlikely(ret == -EOPNOTSUPP)) {
2676 ext4_warning(sb, "discard not supported, "
2677 "disabling");
2678 clear_opt(sb, DISCARD);
2679 }
2680 }
2681 2617
2682 err = ext4_mb_load_buddy(sb, entry->group, &e4b); 2618 err = ext4_mb_load_buddy(sb, entry->group, &e4b);
2683 /* we expect to find existing buddy because it's pinned */ 2619 /* we expect to find existing buddy because it's pinned */
@@ -4226,15 +4162,12 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
4226 spin_unlock(&pa->pa_lock); 4162 spin_unlock(&pa->pa_lock);
4227 } 4163 }
4228 } 4164 }
4229 if (ac->alloc_semp)
4230 up_read(ac->alloc_semp);
4231 if (pa) { 4165 if (pa) {
4232 /* 4166 /*
4233 * We want to add the pa to the right bucket. 4167 * We want to add the pa to the right bucket.
4234 * Remove it from the list and while adding 4168 * Remove it from the list and while adding
4235 * make sure the list to which we are adding 4169 * make sure the list to which we are adding
4236 * doesn't grow big. We need to release 4170 * doesn't grow big.
4237 * alloc_semp before calling ext4_mb_add_n_trim()
4238 */ 4171 */
4239 if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) { 4172 if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) {
4240 spin_lock(pa->pa_obj_lock); 4173 spin_lock(pa->pa_obj_lock);
@@ -4303,7 +4236,9 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4303 * there is enough free blocks to do block allocation 4236 * there is enough free blocks to do block allocation
4304 * and verify allocation doesn't exceed the quota limits. 4237 * and verify allocation doesn't exceed the quota limits.
4305 */ 4238 */
4306 while (ar->len && ext4_claim_free_blocks(sbi, ar->len)) { 4239 while (ar->len &&
4240 ext4_claim_free_blocks(sbi, ar->len, ar->flags)) {
4241
 4307 /* let others free the space */ 4242 /* let others free the space */
4308 yield(); 4243 yield();
4309 ar->len = ar->len >> 1; 4244 ar->len = ar->len >> 1;
@@ -4313,9 +4248,15 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4313 return 0; 4248 return 0;
4314 } 4249 }
4315 reserv_blks = ar->len; 4250 reserv_blks = ar->len;
4316 while (ar->len && dquot_alloc_block(ar->inode, ar->len)) { 4251 if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) {
4317 ar->flags |= EXT4_MB_HINT_NOPREALLOC; 4252 dquot_alloc_block_nofail(ar->inode, ar->len);
4318 ar->len--; 4253 } else {
4254 while (ar->len &&
4255 dquot_alloc_block(ar->inode, ar->len)) {
4256
4257 ar->flags |= EXT4_MB_HINT_NOPREALLOC;
4258 ar->len--;
4259 }
4319 } 4260 }
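In the hunk above, allocations flagged EXT4_MB_USE_ROOT_BLOCKS take dquot_alloc_block_nofail(), which charges the blocks to quota without ever failing (even past the limit), while ordinary allocations keep the retry loop that decrements ar->len one block at a time until the quota charge succeeds.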
4320 inquota = ar->len; 4261 inquota = ar->len;
4321 if (ar->len == 0) { 4262 if (ar->len == 0) {
@@ -4704,6 +4645,127 @@ error_return:
4704} 4645}
4705 4646
4706/** 4647/**
4648 * ext4_add_groupblocks() -- Add given blocks to an existing group
4649 * @handle: handle to this transaction
4650 * @sb: super block
 4651 * @block: starting physical block to add to the block group
4652 * @count: number of blocks to free
4653 *
4654 * This marks the blocks as free in the bitmap and buddy.
4655 */
4656void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
4657 ext4_fsblk_t block, unsigned long count)
4658{
4659 struct buffer_head *bitmap_bh = NULL;
4660 struct buffer_head *gd_bh;
4661 ext4_group_t block_group;
4662 ext4_grpblk_t bit;
4663 unsigned int i;
4664 struct ext4_group_desc *desc;
4665 struct ext4_sb_info *sbi = EXT4_SB(sb);
4666 struct ext4_buddy e4b;
4667 int err = 0, ret, blk_free_count;
4668 ext4_grpblk_t blocks_freed;
4669 struct ext4_group_info *grp;
4670
4671 ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
4672
4673 ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
4674 grp = ext4_get_group_info(sb, block_group);
4675 /*
4676 * Check to see if we are freeing blocks across a group
4677 * boundary.
4678 */
4679 if (bit + count > EXT4_BLOCKS_PER_GROUP(sb))
4680 goto error_return;
4681
4682 bitmap_bh = ext4_read_block_bitmap(sb, block_group);
4683 if (!bitmap_bh)
4684 goto error_return;
4685 desc = ext4_get_group_desc(sb, block_group, &gd_bh);
4686 if (!desc)
4687 goto error_return;
4688
4689 if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
4690 in_range(ext4_inode_bitmap(sb, desc), block, count) ||
4691 in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
4692 in_range(block + count - 1, ext4_inode_table(sb, desc),
4693 sbi->s_itb_per_group)) {
4694 ext4_error(sb, "Adding blocks in system zones - "
4695 "Block = %llu, count = %lu",
4696 block, count);
4697 goto error_return;
4698 }
4699
4700 BUFFER_TRACE(bitmap_bh, "getting write access");
4701 err = ext4_journal_get_write_access(handle, bitmap_bh);
4702 if (err)
4703 goto error_return;
4704
4705 /*
4706 * We are about to modify some metadata. Call the journal APIs
4707 * to unshare ->b_data if a currently-committing transaction is
4708 * using it
4709 */
4710 BUFFER_TRACE(gd_bh, "get_write_access");
4711 err = ext4_journal_get_write_access(handle, gd_bh);
4712 if (err)
4713 goto error_return;
4714
4715 for (i = 0, blocks_freed = 0; i < count; i++) {
4716 BUFFER_TRACE(bitmap_bh, "clear bit");
4717 if (!mb_test_bit(bit + i, bitmap_bh->b_data)) {
4718 ext4_error(sb, "bit already cleared for block %llu",
4719 (ext4_fsblk_t)(block + i));
4720 BUFFER_TRACE(bitmap_bh, "bit already cleared");
4721 } else {
4722 blocks_freed++;
4723 }
4724 }
4725
4726 err = ext4_mb_load_buddy(sb, block_group, &e4b);
4727 if (err)
4728 goto error_return;
4729
4730 /*
 4731 * We need to update group_info->bb_free and the bitmap
 4732 * with the group lock held. generate_buddy looks at
 4733 * them with the group lock held.
4734 */
4735 ext4_lock_group(sb, block_group);
4736 mb_clear_bits(bitmap_bh->b_data, bit, count);
4737 mb_free_blocks(NULL, &e4b, bit, count);
4738 blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc);
4739 ext4_free_blks_set(sb, desc, blk_free_count);
4740 desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
4741 ext4_unlock_group(sb, block_group);
4742 percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed);
4743
4744 if (sbi->s_log_groups_per_flex) {
4745 ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
4746 atomic_add(blocks_freed,
4747 &sbi->s_flex_groups[flex_group].free_blocks);
4748 }
4749
4750 ext4_mb_unload_buddy(&e4b);
4751
4752 /* We dirtied the bitmap block */
4753 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
4754 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
4755
4756 /* And the group descriptor block */
4757 BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
4758 ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
4759 if (!err)
4760 err = ret;
4761
4762error_return:
4763 brelse(bitmap_bh);
4764 ext4_std_error(sb, err);
4765 return;
4766}
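As a usage illustration (editorial sketch; the caller, the credit count of 3, and the o_blocks_count/add variables are assumptions, not shown in this diff), online resize is the kind of caller this helper serves:

static void add_groupblocks_example(struct super_block *sb,
				    ext4_fsblk_t o_blocks_count,
				    unsigned long add)
{
	/* Publish blocks just grafted onto the last group to the allocator. */
	handle_t *handle = ext4_journal_start_sb(sb, 3);	/* assumed credits */

	if (IS_ERR(handle))
		return;
	ext4_add_groupblocks(handle, sb, o_blocks_count, add);
	ext4_journal_stop(handle);
}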
4767
4768/**
4707 * ext4_trim_extent -- function to TRIM one single free extent in the group 4769 * ext4_trim_extent -- function to TRIM one single free extent in the group
4708 * @sb: super block for the file system 4770 * @sb: super block for the file system
4709 * @start: starting block of the free extent in the alloc. group 4771 * @start: starting block of the free extent in the alloc. group
@@ -4715,11 +4777,10 @@ error_return:
4715 * one will allocate those blocks, mark it as used in buddy bitmap. This must 4777 * one will allocate those blocks, mark it as used in buddy bitmap. This must
 4716 * be called under the group lock. 4778 * be called under the group lock.
4717 */ 4779 */
4718static int ext4_trim_extent(struct super_block *sb, int start, int count, 4780static void ext4_trim_extent(struct super_block *sb, int start, int count,
4719 ext4_group_t group, struct ext4_buddy *e4b) 4781 ext4_group_t group, struct ext4_buddy *e4b)
4720{ 4782{
4721 struct ext4_free_extent ex; 4783 struct ext4_free_extent ex;
4722 int ret = 0;
4723 4784
4724 assert_spin_locked(ext4_group_lock_ptr(sb, group)); 4785 assert_spin_locked(ext4_group_lock_ptr(sb, group));
4725 4786
@@ -4733,12 +4794,9 @@ static int ext4_trim_extent(struct super_block *sb, int start, int count,
4733 */ 4794 */
4734 mb_mark_used(e4b, &ex); 4795 mb_mark_used(e4b, &ex);
4735 ext4_unlock_group(sb, group); 4796 ext4_unlock_group(sb, group);
4736 4797 ext4_issue_discard(sb, group, start, count);
4737 ret = ext4_issue_discard(sb, group, start, count);
4738
4739 ext4_lock_group(sb, group); 4798 ext4_lock_group(sb, group);
4740 mb_free_blocks(NULL, e4b, start, ex.fe_len); 4799 mb_free_blocks(NULL, e4b, start, ex.fe_len);
4741 return ret;
4742} 4800}
4743 4801
4744/** 4802/**
@@ -4760,21 +4818,26 @@ static int ext4_trim_extent(struct super_block *sb, int start, int count,
4760 * the group buddy bitmap. This is done until whole group is scanned. 4818 * the group buddy bitmap. This is done until whole group is scanned.
4761 */ 4819 */
4762static ext4_grpblk_t 4820static ext4_grpblk_t
4763ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b, 4821ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
4764 ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks) 4822 ext4_grpblk_t start, ext4_grpblk_t max,
4823 ext4_grpblk_t minblocks)
4765{ 4824{
4766 void *bitmap; 4825 void *bitmap;
4767 ext4_grpblk_t next, count = 0; 4826 ext4_grpblk_t next, count = 0;
4768 ext4_group_t group; 4827 struct ext4_buddy e4b;
4769 int ret = 0; 4828 int ret;
4770 4829
4771 BUG_ON(e4b == NULL); 4830 ret = ext4_mb_load_buddy(sb, group, &e4b);
4831 if (ret) {
4832 ext4_error(sb, "Error in loading buddy "
4833 "information for %u", group);
4834 return ret;
4835 }
4836 bitmap = e4b.bd_bitmap;
4772 4837
4773 bitmap = e4b->bd_bitmap;
4774 group = e4b->bd_group;
4775 start = (e4b->bd_info->bb_first_free > start) ?
4776 e4b->bd_info->bb_first_free : start;
4777 ext4_lock_group(sb, group); 4838 ext4_lock_group(sb, group);
4839 start = (e4b.bd_info->bb_first_free > start) ?
4840 e4b.bd_info->bb_first_free : start;
4778 4841
4779 while (start < max) { 4842 while (start < max) {
4780 start = mb_find_next_zero_bit(bitmap, max, start); 4843 start = mb_find_next_zero_bit(bitmap, max, start);
@@ -4783,10 +4846,8 @@ ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
4783 next = mb_find_next_bit(bitmap, max, start); 4846 next = mb_find_next_bit(bitmap, max, start);
4784 4847
4785 if ((next - start) >= minblocks) { 4848 if ((next - start) >= minblocks) {
4786 ret = ext4_trim_extent(sb, start, 4849 ext4_trim_extent(sb, start,
4787 next - start, group, e4b); 4850 next - start, group, &e4b);
4788 if (ret < 0)
4789 break;
4790 count += next - start; 4851 count += next - start;
4791 } 4852 }
4792 start = next + 1; 4853 start = next + 1;
@@ -4802,17 +4863,15 @@ ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
4802 ext4_lock_group(sb, group); 4863 ext4_lock_group(sb, group);
4803 } 4864 }
4804 4865
4805 if ((e4b->bd_info->bb_free - count) < minblocks) 4866 if ((e4b.bd_info->bb_free - count) < minblocks)
4806 break; 4867 break;
4807 } 4868 }
4808 ext4_unlock_group(sb, group); 4869 ext4_unlock_group(sb, group);
4870 ext4_mb_unload_buddy(&e4b);
4809 4871
4810 ext4_debug("trimmed %d blocks in the group %d\n", 4872 ext4_debug("trimmed %d blocks in the group %d\n",
4811 count, group); 4873 count, group);
4812 4874
4813 if (ret < 0)
4814 count = ret;
4815
4816 return count; 4875 return count;
4817} 4876}
4818 4877
@@ -4830,11 +4889,11 @@ ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
4830 */ 4889 */
4831int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) 4890int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
4832{ 4891{
4833 struct ext4_buddy e4b; 4892 struct ext4_group_info *grp;
4834 ext4_group_t first_group, last_group; 4893 ext4_group_t first_group, last_group;
4835 ext4_group_t group, ngroups = ext4_get_groups_count(sb); 4894 ext4_group_t group, ngroups = ext4_get_groups_count(sb);
4836 ext4_grpblk_t cnt = 0, first_block, last_block; 4895 ext4_grpblk_t cnt = 0, first_block, last_block;
4837 uint64_t start, len, minlen, trimmed; 4896 uint64_t start, len, minlen, trimmed = 0;
4838 ext4_fsblk_t first_data_blk = 4897 ext4_fsblk_t first_data_blk =
4839 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); 4898 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
4840 int ret = 0; 4899 int ret = 0;
@@ -4842,7 +4901,6 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
4842 start = range->start >> sb->s_blocksize_bits; 4901 start = range->start >> sb->s_blocksize_bits;
4843 len = range->len >> sb->s_blocksize_bits; 4902 len = range->len >> sb->s_blocksize_bits;
4844 minlen = range->minlen >> sb->s_blocksize_bits; 4903 minlen = range->minlen >> sb->s_blocksize_bits;
4845 trimmed = 0;
4846 4904
4847 if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb))) 4905 if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb)))
4848 return -EINVAL; 4906 return -EINVAL;
@@ -4863,11 +4921,12 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
4863 return -EINVAL; 4921 return -EINVAL;
4864 4922
4865 for (group = first_group; group <= last_group; group++) { 4923 for (group = first_group; group <= last_group; group++) {
4866 ret = ext4_mb_load_buddy(sb, group, &e4b); 4924 grp = ext4_get_group_info(sb, group);
4867 if (ret) { 4925 /* We only do this if the grp has never been initialized */
4868 ext4_error(sb, "Error in loading buddy " 4926 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
4869 "information for %u", group); 4927 ret = ext4_mb_init_group(sb, group);
4870 break; 4928 if (ret)
4929 break;
4871 } 4930 }
4872 4931
4873 /* 4932 /*
@@ -4880,16 +4939,14 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
4880 last_block = first_block + len; 4939 last_block = first_block + len;
4881 len -= last_block - first_block; 4940 len -= last_block - first_block;
4882 4941
4883 if (e4b.bd_info->bb_free >= minlen) { 4942 if (grp->bb_free >= minlen) {
4884 cnt = ext4_trim_all_free(sb, &e4b, first_block, 4943 cnt = ext4_trim_all_free(sb, group, first_block,
4885 last_block, minlen); 4944 last_block, minlen);
4886 if (cnt < 0) { 4945 if (cnt < 0) {
4887 ret = cnt; 4946 ret = cnt;
4888 ext4_mb_unload_buddy(&e4b);
4889 break; 4947 break;
4890 } 4948 }
4891 } 4949 }
4892 ext4_mb_unload_buddy(&e4b);
4893 trimmed += cnt; 4950 trimmed += cnt;
4894 first_block = 0; 4951 first_block = 0;
4895 } 4952 }
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 22bd4d7f289b..20b5e7bfebd1 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -193,11 +193,6 @@ struct ext4_allocation_context {
193 __u8 ac_op; /* operation, for history only */ 193 __u8 ac_op; /* operation, for history only */
194 struct page *ac_bitmap_page; 194 struct page *ac_bitmap_page;
195 struct page *ac_buddy_page; 195 struct page *ac_buddy_page;
196 /*
197 * pointer to the held semaphore upon successful
198 * block allocation
199 */
200 struct rw_semaphore *alloc_semp;
201 struct ext4_prealloc_space *ac_pa; 196 struct ext4_prealloc_space *ac_pa;
202 struct ext4_locality_group *ac_lg; 197 struct ext4_locality_group *ac_lg;
203}; 198};
@@ -215,7 +210,6 @@ struct ext4_buddy {
215 struct super_block *bd_sb; 210 struct super_block *bd_sb;
216 __u16 bd_blkbits; 211 __u16 bd_blkbits;
217 ext4_group_t bd_group; 212 ext4_group_t bd_group;
218 struct rw_semaphore *alloc_semp;
219}; 213};
220#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap) 214#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap)
221#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy) 215#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy)
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 92816b4e0f16..b57b98fb44d1 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -376,7 +376,7 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
376 * We have the extent map build with the tmp inode. 376 * We have the extent map build with the tmp inode.
377 * Now copy the i_data across 377 * Now copy the i_data across
378 */ 378 */
379 ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS); 379 ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
380 memcpy(ei->i_data, tmp_ei->i_data, sizeof(ei->i_data)); 380 memcpy(ei->i_data, tmp_ei->i_data, sizeof(ei->i_data));
381 381
382 /* 382 /*
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
new file mode 100644
index 000000000000..9bdef3f537c5
--- /dev/null
+++ b/fs/ext4/mmp.c
@@ -0,0 +1,351 @@
1#include <linux/fs.h>
2#include <linux/random.h>
3#include <linux/buffer_head.h>
4#include <linux/utsname.h>
5#include <linux/kthread.h>
6
7#include "ext4.h"
8
9/*
10 * Write the MMP block using WRITE_SYNC to try to get the block on-disk
11 * faster.
12 */
13static int write_mmp_block(struct buffer_head *bh)
14{
15 mark_buffer_dirty(bh);
16 lock_buffer(bh);
17 bh->b_end_io = end_buffer_write_sync;
18 get_bh(bh);
19 submit_bh(WRITE_SYNC, bh);
20 wait_on_buffer(bh);
21 if (unlikely(!buffer_uptodate(bh)))
22 return 1;
23
24 return 0;
25}
26
27/*
28 * Read the MMP block. It _must_ be read from disk and hence we clear the
29 * uptodate flag on the buffer.
30 */
31static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
32 ext4_fsblk_t mmp_block)
33{
34 struct mmp_struct *mmp;
35
36 if (*bh)
37 clear_buffer_uptodate(*bh);
38
39 /* This would be sb_bread(sb, mmp_block), except we need to be sure
40 * that the MD RAID device cache has been bypassed, and that the read
41 * is not blocked in the elevator. */
42 if (!*bh)
43 *bh = sb_getblk(sb, mmp_block);
44 if (*bh) {
45 get_bh(*bh);
46 lock_buffer(*bh);
47 (*bh)->b_end_io = end_buffer_read_sync;
48 submit_bh(READ_SYNC, *bh);
49 wait_on_buffer(*bh);
50 if (!buffer_uptodate(*bh)) {
51 brelse(*bh);
52 *bh = NULL;
53 }
54 }
55 if (!*bh) {
56 ext4_warning(sb, "Error while reading MMP block %llu",
57 mmp_block);
58 return -EIO;
59 }
60
61 mmp = (struct mmp_struct *)((*bh)->b_data);
62 if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC)
63 return -EINVAL;
64
65 return 0;
66}
67
68/*
69 * Dump as much information as possible to help the admin.
70 */
71void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp,
72 const char *function, unsigned int line, const char *msg)
73{
 74 __ext4_warning(sb, function, line, "%s", msg);
75 __ext4_warning(sb, function, line,
76 "MMP failure info: last update time: %llu, last update "
77 "node: %s, last update device: %s\n",
78 (long long unsigned int) le64_to_cpu(mmp->mmp_time),
79 mmp->mmp_nodename, mmp->mmp_bdevname);
80}
81
82/*
83 * kmmpd will update the MMP sequence every s_mmp_update_interval seconds
84 */
85static int kmmpd(void *data)
86{
87 struct super_block *sb = ((struct mmpd_data *) data)->sb;
88 struct buffer_head *bh = ((struct mmpd_data *) data)->bh;
89 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
90 struct mmp_struct *mmp;
91 ext4_fsblk_t mmp_block;
92 u32 seq = 0;
93 unsigned long failed_writes = 0;
94 int mmp_update_interval = le16_to_cpu(es->s_mmp_update_interval);
95 unsigned mmp_check_interval;
96 unsigned long last_update_time;
97 unsigned long diff;
98 int retval;
99
100 mmp_block = le64_to_cpu(es->s_mmp_block);
101 mmp = (struct mmp_struct *)(bh->b_data);
102 mmp->mmp_time = cpu_to_le64(get_seconds());
103 /*
104 * Start with the higher mmp_check_interval and reduce it if
105 * the MMP block is being updated on time.
106 */
107 mmp_check_interval = max(EXT4_MMP_CHECK_MULT * mmp_update_interval,
108 EXT4_MMP_MIN_CHECK_INTERVAL);
109 mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
110 bdevname(bh->b_bdev, mmp->mmp_bdevname);
111
112 memcpy(mmp->mmp_nodename, init_utsname()->sysname,
113 sizeof(mmp->mmp_nodename));
114
115 while (!kthread_should_stop()) {
116 if (++seq > EXT4_MMP_SEQ_MAX)
117 seq = 1;
118
119 mmp->mmp_seq = cpu_to_le32(seq);
120 mmp->mmp_time = cpu_to_le64(get_seconds());
121 last_update_time = jiffies;
122
123 retval = write_mmp_block(bh);
124 /*
125 * Don't spew too many error messages. Print one every
126 * (s_mmp_update_interval * 60) seconds.
127 */
 128 if (retval) {
 129 if ((failed_writes++ % 60) == 0)
 130 ext4_error(sb, "Error writing to MMP block");
 131 }
132
133 if (!(le32_to_cpu(es->s_feature_incompat) &
134 EXT4_FEATURE_INCOMPAT_MMP)) {
135 ext4_warning(sb, "kmmpd being stopped since MMP feature"
136 " has been disabled.");
137 EXT4_SB(sb)->s_mmp_tsk = NULL;
138 goto failed;
139 }
140
141 if (sb->s_flags & MS_RDONLY) {
142 ext4_warning(sb, "kmmpd being stopped since filesystem "
143 "has been remounted as readonly.");
144 EXT4_SB(sb)->s_mmp_tsk = NULL;
145 goto failed;
146 }
147
148 diff = jiffies - last_update_time;
149 if (diff < mmp_update_interval * HZ)
150 schedule_timeout_interruptible(mmp_update_interval *
151 HZ - diff);
152
153 /*
154 * We need to make sure that more than mmp_check_interval
155 * seconds have not passed since writing. If that has happened
156 * we need to check if the MMP block is as we left it.
157 */
158 diff = jiffies - last_update_time;
159 if (diff > mmp_check_interval * HZ) {
160 struct buffer_head *bh_check = NULL;
161 struct mmp_struct *mmp_check;
162
163 retval = read_mmp_block(sb, &bh_check, mmp_block);
164 if (retval) {
165 ext4_error(sb, "error reading MMP data: %d",
166 retval);
167
168 EXT4_SB(sb)->s_mmp_tsk = NULL;
169 goto failed;
170 }
171
172 mmp_check = (struct mmp_struct *)(bh_check->b_data);
173 if (mmp->mmp_seq != mmp_check->mmp_seq ||
174 memcmp(mmp->mmp_nodename, mmp_check->mmp_nodename,
175 sizeof(mmp->mmp_nodename))) {
176 dump_mmp_msg(sb, mmp_check,
177 "Error while updating MMP info. "
178 "The filesystem seems to have been"
179 " multiply mounted.");
180 ext4_error(sb, "abort");
181 goto failed;
182 }
183 put_bh(bh_check);
184 }
185
186 /*
187 * Adjust the mmp_check_interval depending on how much time
188 * it took for the MMP block to be written.
189 */
190 mmp_check_interval = max(min(EXT4_MMP_CHECK_MULT * diff / HZ,
191 EXT4_MMP_MAX_CHECK_INTERVAL),
192 EXT4_MMP_MIN_CHECK_INTERVAL);
193 mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
194 }
195
196 /*
197 * Unmount seems to be clean.
198 */
199 mmp->mmp_seq = cpu_to_le32(EXT4_MMP_SEQ_CLEAN);
200 mmp->mmp_time = cpu_to_le64(get_seconds());
201
202 retval = write_mmp_block(bh);
203
204failed:
205 kfree(data);
206 brelse(bh);
207 return retval;
208}
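A worked example of the interval adaptation at the bottom of the loop, assuming EXT4_MMP_CHECK_MULT is 2 and a 5..300 second clamp range (the definitions are not in this diff; the minimum of 5 is consistent with the "more than 20 secs" comment below): if one pass took diff = 12 * HZ jiffies, then mmp_check_interval = max(min(2 * 12, 300), 5) = 24 seconds, so a slow device automatically widens the window a competing mounter must wait out.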
209
210/*
211 * Get a random new sequence number but make sure it is not greater than
212 * EXT4_MMP_SEQ_MAX.
213 */
214static unsigned int mmp_new_seq(void)
215{
216 u32 new_seq;
217
218 do {
219 get_random_bytes(&new_seq, sizeof(u32));
220 } while (new_seq > EXT4_MMP_SEQ_MAX);
221
222 return new_seq;
223}
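A note on the loop above: redrawing until new_seq <= EXT4_MMP_SEQ_MAX keeps the result uniform over 0..EXT4_MMP_SEQ_MAX (a modulo reduction would bias low values), and, assuming the reserved sequence values such as EXT4_MMP_SEQ_CLEAN and EXT4_MMP_SEQ_FSCK sit above EXT4_MMP_SEQ_MAX as their use below implies, it also guarantees they can never be generated by accident.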
224
225/*
226 * Protect the filesystem from being mounted more than once.
227 */
228int ext4_multi_mount_protect(struct super_block *sb,
229 ext4_fsblk_t mmp_block)
230{
231 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
232 struct buffer_head *bh = NULL;
233 struct mmp_struct *mmp = NULL;
234 struct mmpd_data *mmpd_data;
235 u32 seq;
236 unsigned int mmp_check_interval = le16_to_cpu(es->s_mmp_update_interval);
237 unsigned int wait_time = 0;
238 int retval;
239
240 if (mmp_block < le32_to_cpu(es->s_first_data_block) ||
241 mmp_block >= ext4_blocks_count(es)) {
242 ext4_warning(sb, "Invalid MMP block in superblock");
243 goto failed;
244 }
245
246 retval = read_mmp_block(sb, &bh, mmp_block);
247 if (retval)
248 goto failed;
249
250 mmp = (struct mmp_struct *)(bh->b_data);
251
252 if (mmp_check_interval < EXT4_MMP_MIN_CHECK_INTERVAL)
253 mmp_check_interval = EXT4_MMP_MIN_CHECK_INTERVAL;
254
255 /*
256 * If check_interval in MMP block is larger, use that instead of
257 * update_interval from the superblock.
258 */
 259 if (le16_to_cpu(mmp->mmp_check_interval) > mmp_check_interval)
 260 mmp_check_interval = le16_to_cpu(mmp->mmp_check_interval);
261
262 seq = le32_to_cpu(mmp->mmp_seq);
263 if (seq == EXT4_MMP_SEQ_CLEAN)
264 goto skip;
265
266 if (seq == EXT4_MMP_SEQ_FSCK) {
267 dump_mmp_msg(sb, mmp, "fsck is running on the filesystem");
268 goto failed;
269 }
270
271 wait_time = min(mmp_check_interval * 2 + 1,
272 mmp_check_interval + 60);
273
274 /* Print MMP interval if more than 20 secs. */
275 if (wait_time > EXT4_MMP_MIN_CHECK_INTERVAL * 4)
276 ext4_warning(sb, "MMP interval %u higher than expected, please"
277 " wait.\n", wait_time * 2);
278
279 if (schedule_timeout_interruptible(HZ * wait_time) != 0) {
280 ext4_warning(sb, "MMP startup interrupted, failing mount\n");
281 goto failed;
282 }
283
284 retval = read_mmp_block(sb, &bh, mmp_block);
285 if (retval)
286 goto failed;
287 mmp = (struct mmp_struct *)(bh->b_data);
288 if (seq != le32_to_cpu(mmp->mmp_seq)) {
289 dump_mmp_msg(sb, mmp,
290 "Device is already active on another node.");
291 goto failed;
292 }
293
294skip:
295 /*
296 * write a new random sequence number.
297 */
298 mmp->mmp_seq = seq = cpu_to_le32(mmp_new_seq());
299
300 retval = write_mmp_block(bh);
301 if (retval)
302 goto failed;
303
304 /*
305 * wait for MMP interval and check mmp_seq.
306 */
307 if (schedule_timeout_interruptible(HZ * wait_time) != 0) {
308 ext4_warning(sb, "MMP startup interrupted, failing mount\n");
309 goto failed;
310 }
311
312 retval = read_mmp_block(sb, &bh, mmp_block);
313 if (retval)
314 goto failed;
315 mmp = (struct mmp_struct *)(bh->b_data);
316 if (seq != le32_to_cpu(mmp->mmp_seq)) {
317 dump_mmp_msg(sb, mmp,
318 "Device is already active on another node.");
319 goto failed;
320 }
321
322 mmpd_data = kmalloc(sizeof(struct mmpd_data), GFP_KERNEL);
323 if (!mmpd_data) {
324 ext4_warning(sb, "not enough memory for mmpd_data");
325 goto failed;
326 }
327 mmpd_data->sb = sb;
328 mmpd_data->bh = bh;
329
330 /*
331 * Start a kernel thread to update the MMP block periodically.
332 */
333 EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, mmpd_data, "kmmpd-%s",
334 bdevname(bh->b_bdev,
335 mmp->mmp_bdevname));
336 if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) {
337 EXT4_SB(sb)->s_mmp_tsk = NULL;
338 kfree(mmpd_data);
339 ext4_warning(sb, "Unable to create kmmpd thread for %s.",
340 sb->s_id);
341 goto failed;
342 }
343
344 return 0;
345
346failed:
347 brelse(bh);
348 return 1;
349}
350
351
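Compressing ext4_multi_mount_protect() above into an editorial summary (error paths, buffer handling, and the zero-wait clean-unmount case elided):

/*
 *	seq = read mmp_seq from the MMP block;
 *	if (seq != EXT4_MMP_SEQ_CLEAN) {
 *		wait wait_time seconds;
 *		if (mmp_seq changed)        another node is live -> fail
 *	}
 *	write a fresh random seq;           stake our claim
 *	wait wait_time seconds;
 *	if (mmp_seq != our random seq)      claim overwritten -> fail
 *	start kmmpd;                        keep bumping seq while mounted
 */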
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index b9f3e7862f13..2b8304bf3c50 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -876,8 +876,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
876 * It needs to call wait_on_page_writeback() to wait for the 876 * It needs to call wait_on_page_writeback() to wait for the
877 * writeback of the page. 877 * writeback of the page.
878 */ 878 */
879 if (PageWriteback(page)) 879 wait_on_page_writeback(page);
880 wait_on_page_writeback(page);
881 880
882 /* Release old bh and drop refs */ 881 /* Release old bh and drop refs */
883 try_to_release_page(page, 0); 882 try_to_release_page(page, 0);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 67fd0b025858..b754b7721f51 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1413,10 +1413,22 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1413 frame->at = entries; 1413 frame->at = entries;
1414 frame->bh = bh; 1414 frame->bh = bh;
1415 bh = bh2; 1415 bh = bh2;
1416
1417 ext4_handle_dirty_metadata(handle, dir, frame->bh);
1418 ext4_handle_dirty_metadata(handle, dir, bh);
1419
1416 de = do_split(handle,dir, &bh, frame, &hinfo, &retval); 1420 de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
1417 dx_release (frames); 1421 if (!de) {
1418 if (!(de)) 1422 /*
1423 * Even if the block split failed, we have to properly write
1424 * out all the changes we did so far. Otherwise we can end up
 1425 * with a corrupted filesystem.
1426 */
1427 ext4_mark_inode_dirty(handle, dir);
1428 dx_release(frames);
1419 return retval; 1429 return retval;
1430 }
1431 dx_release(frames);
1420 1432
1421 retval = add_dirent_to_buf(handle, dentry, inode, de, bh); 1433 retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
1422 brelse(bh); 1434 brelse(bh);
@@ -2240,6 +2252,7 @@ static int ext4_symlink(struct inode *dir,
2240 handle_t *handle; 2252 handle_t *handle;
2241 struct inode *inode; 2253 struct inode *inode;
2242 int l, err, retries = 0; 2254 int l, err, retries = 0;
2255 int credits;
2243 2256
2244 l = strlen(symname)+1; 2257 l = strlen(symname)+1;
2245 if (l > dir->i_sb->s_blocksize) 2258 if (l > dir->i_sb->s_blocksize)
@@ -2247,10 +2260,26 @@ static int ext4_symlink(struct inode *dir,
2247 2260
2248 dquot_initialize(dir); 2261 dquot_initialize(dir);
2249 2262
2263 if (l > EXT4_N_BLOCKS * 4) {
2264 /*
 2265 * For non-fast symlinks, we just allocate the inode and put it on
 2266 * the orphan list in the first transaction => we need bitmap,
2267 * group descriptor, sb, inode block, quota blocks.
2268 */
2269 credits = 4 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
2270 } else {
2271 /*
2272 * Fast symlink. We have to add entry to directory
2273 * (EXT4_DATA_TRANS_BLOCKS + EXT4_INDEX_EXTRA_TRANS_BLOCKS),
2274 * allocate new inode (bitmap, group descriptor, inode block,
2275 * quota blocks, sb is already counted in previous macros).
2276 */
2277 credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
2278 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
2279 EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
2280 }
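Worked out under the assumption that quota is disabled (so EXT4_MAXQUOTAS_INIT_BLOCKS(sb) is 0, a value not shown in this diff), the non-fast branch reserves exactly the 4 credits itemized in its comment: block bitmap, group descriptor, superblock, and the inode block.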
2250retry: 2281retry:
2251 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 2282 handle = ext4_journal_start(dir, credits);
2252 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 5 +
2253 EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
2254 if (IS_ERR(handle)) 2283 if (IS_ERR(handle))
2255 return PTR_ERR(handle); 2284 return PTR_ERR(handle);
2256 2285
@@ -2263,21 +2292,44 @@ retry:
2263 if (IS_ERR(inode)) 2292 if (IS_ERR(inode))
2264 goto out_stop; 2293 goto out_stop;
2265 2294
2266 if (l > sizeof(EXT4_I(inode)->i_data)) { 2295 if (l > EXT4_N_BLOCKS * 4) {
2267 inode->i_op = &ext4_symlink_inode_operations; 2296 inode->i_op = &ext4_symlink_inode_operations;
2268 ext4_set_aops(inode); 2297 ext4_set_aops(inode);
2269 /* 2298 /*
2270 * page_symlink() calls into ext4_prepare/commit_write. 2299 * We cannot call page_symlink() with transaction started
2271 * We have a transaction open. All is sweetness. It also sets 2300 * because it calls into ext4_write_begin() which can wait
2272 * i_size in generic_commit_write(). 2301 * for transaction commit if we are running out of space
 2302 * and thus we deadlock. So we have to stop the transaction now
 2303 * and restart it when the symlink contents are written.
 2304 *
 2305 * To keep the fs consistent in case of a crash, we have to put the inode
 2306 * on the orphan list in the meantime.
2273 */ 2307 */
2308 drop_nlink(inode);
2309 err = ext4_orphan_add(handle, inode);
2310 ext4_journal_stop(handle);
2311 if (err)
2312 goto err_drop_inode;
2274 err = __page_symlink(inode, symname, l, 1); 2313 err = __page_symlink(inode, symname, l, 1);
2314 if (err)
2315 goto err_drop_inode;
2316 /*
 2317 * Now the inode is being linked into the dir (EXT4_DATA_TRANS_BLOCKS
 2318 * + EXT4_INDEX_EXTRA_TRANS_BLOCKS); the inode is also modified
2319 */
2320 handle = ext4_journal_start(dir,
2321 EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
2322 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 1);
2323 if (IS_ERR(handle)) {
2324 err = PTR_ERR(handle);
2325 goto err_drop_inode;
2326 }
2327 inc_nlink(inode);
2328 err = ext4_orphan_del(handle, inode);
2275 if (err) { 2329 if (err) {
2330 ext4_journal_stop(handle);
2276 clear_nlink(inode); 2331 clear_nlink(inode);
2277 unlock_new_inode(inode); 2332 goto err_drop_inode;
2278 ext4_mark_inode_dirty(handle, inode);
2279 iput(inode);
2280 goto out_stop;
2281 } 2333 }
2282 } else { 2334 } else {
2283 /* clear the extent format for fast symlink */ 2335 /* clear the extent format for fast symlink */
@@ -2293,6 +2345,10 @@ out_stop:
2293 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) 2345 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
2294 goto retry; 2346 goto retry;
2295 return err; 2347 return err;
2348err_drop_inode:
2349 unlock_new_inode(inode);
2350 iput(inode);
2351 return err;
2296} 2352}
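The long-symlink path above, restated as an editorial summary (ext4_add_nondir() is the directory-linking step in the unchanged tail of the function):

/*
 *	handle = ext4_journal_start(dir, small alloc credits);
 *	inode = new inode; drop_nlink(inode);      i_nlink 1 -> 0
 *	ext4_orphan_add(handle, inode);            crash => inode reclaimed
 *	ext4_journal_stop(handle);                 no handle across writeback
 *	__page_symlink(inode, symname, l, 1);      may wait for a commit
 *	handle = ext4_journal_start(dir, dir-entry credits);
 *	inc_nlink(inode); ext4_orphan_del(handle, inode);
 *	ext4_add_nondir(handle, dentry, inode);    then out_stop as before
 */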
2297 2353
2298static int ext4_link(struct dentry *old_dentry, 2354static int ext4_link(struct dentry *old_dentry,
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index b6dbd056fcb1..7bb8f76d470a 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -203,46 +203,29 @@ static void ext4_end_bio(struct bio *bio, int error)
203 for (i = 0; i < io_end->num_io_pages; i++) { 203 for (i = 0; i < io_end->num_io_pages; i++) {
204 struct page *page = io_end->pages[i]->p_page; 204 struct page *page = io_end->pages[i]->p_page;
205 struct buffer_head *bh, *head; 205 struct buffer_head *bh, *head;
206 int partial_write = 0; 206 loff_t offset;
207 loff_t io_end_offset;
207 208
208 head = page_buffers(page); 209 if (error) {
209 if (error)
210 SetPageError(page); 210 SetPageError(page);
211 BUG_ON(!head); 211 set_bit(AS_EIO, &page->mapping->flags);
212 if (head->b_size != PAGE_CACHE_SIZE) { 212 head = page_buffers(page);
213 loff_t offset; 213 BUG_ON(!head);
214 loff_t io_end_offset = io_end->offset + io_end->size; 214
215 io_end_offset = io_end->offset + io_end->size;
215 216
216 offset = (sector_t) page->index << PAGE_CACHE_SHIFT; 217 offset = (sector_t) page->index << PAGE_CACHE_SHIFT;
217 bh = head; 218 bh = head;
218 do { 219 do {
219 if ((offset >= io_end->offset) && 220 if ((offset >= io_end->offset) &&
220 (offset+bh->b_size <= io_end_offset)) { 221 (offset+bh->b_size <= io_end_offset))
221 if (error) 222 buffer_io_error(bh);
222 buffer_io_error(bh); 223
223
224 }
225 if (buffer_delay(bh))
226 partial_write = 1;
227 else if (!buffer_mapped(bh))
228 clear_buffer_dirty(bh);
229 else if (buffer_dirty(bh))
230 partial_write = 1;
231 offset += bh->b_size; 224 offset += bh->b_size;
232 bh = bh->b_this_page; 225 bh = bh->b_this_page;
233 } while (bh != head); 226 } while (bh != head);
234 } 227 }
235 228
236 /*
237 * If this is a partial write which happened to make
238 * all buffers uptodate then we can optimize away a
239 * bogus readpage() for the next read(). Here we
240 * 'discover' whether the page went uptodate as a
241 * result of this (potentially partial) write.
242 */
243 if (!partial_write)
244 SetPageUptodate(page);
245
246 put_io_page(io_end->pages[i]); 229 put_io_page(io_end->pages[i]);
247 } 230 }
248 io_end->num_io_pages = 0; 231 io_end->num_io_pages = 0;
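With the per-buffer state juggling removed, the error path now records failure by setting AS_EIO on the mapping, which is what makes a later fsync() or filemap_fdatawait() on the file return -EIO; buffer-level reporting is reduced to buffer_io_error() on the buffers this io_end actually covers.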
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 8553dfb310af..cc5c157aa11d 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -38,6 +38,7 @@
38#include <linux/ctype.h> 38#include <linux/ctype.h>
39#include <linux/log2.h> 39#include <linux/log2.h>
40#include <linux/crc16.h> 40#include <linux/crc16.h>
41#include <linux/cleancache.h>
41#include <asm/uaccess.h> 42#include <asm/uaccess.h>
42 43
43#include <linux/kthread.h> 44#include <linux/kthread.h>
@@ -75,11 +76,27 @@ static void ext4_write_super(struct super_block *sb);
75static int ext4_freeze(struct super_block *sb); 76static int ext4_freeze(struct super_block *sb);
76static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags, 77static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
77 const char *dev_name, void *data); 78 const char *dev_name, void *data);
79static inline int ext2_feature_set_ok(struct super_block *sb);
80static inline int ext3_feature_set_ok(struct super_block *sb);
78static int ext4_feature_set_ok(struct super_block *sb, int readonly); 81static int ext4_feature_set_ok(struct super_block *sb, int readonly);
79static void ext4_destroy_lazyinit_thread(void); 82static void ext4_destroy_lazyinit_thread(void);
80static void ext4_unregister_li_request(struct super_block *sb); 83static void ext4_unregister_li_request(struct super_block *sb);
81static void ext4_clear_request_list(void); 84static void ext4_clear_request_list(void);
82 85
86#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
87static struct file_system_type ext2_fs_type = {
88 .owner = THIS_MODULE,
89 .name = "ext2",
90 .mount = ext4_mount,
91 .kill_sb = kill_block_super,
92 .fs_flags = FS_REQUIRES_DEV,
93};
94#define IS_EXT2_SB(sb) ((sb)->s_bdev->bd_holder == &ext2_fs_type)
95#else
96#define IS_EXT2_SB(sb) (0)
97#endif
98
99
83#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) 100#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
84static struct file_system_type ext3_fs_type = { 101static struct file_system_type ext3_fs_type = {
85 .owner = THIS_MODULE, 102 .owner = THIS_MODULE,
@@ -806,6 +823,8 @@ static void ext4_put_super(struct super_block *sb)
806 invalidate_bdev(sbi->journal_bdev); 823 invalidate_bdev(sbi->journal_bdev);
807 ext4_blkdev_remove(sbi); 824 ext4_blkdev_remove(sbi);
808 } 825 }
826 if (sbi->s_mmp_tsk)
827 kthread_stop(sbi->s_mmp_tsk);
809 sb->s_fs_info = NULL; 828 sb->s_fs_info = NULL;
810 /* 829 /*
811 * Now that we are completely done shutting down the 830 * Now that we are completely done shutting down the
@@ -1096,7 +1115,7 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
1096 1115
1097 if (!test_opt(sb, INIT_INODE_TABLE)) 1116 if (!test_opt(sb, INIT_INODE_TABLE))
1098 seq_puts(seq, ",noinit_inode_table"); 1117 seq_puts(seq, ",noinit_inode_table");
1099 else if (sbi->s_li_wait_mult) 1118 else if (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT)
1100 seq_printf(seq, ",init_inode_table=%u", 1119 seq_printf(seq, ",init_inode_table=%u",
1101 (unsigned) sbi->s_li_wait_mult); 1120 (unsigned) sbi->s_li_wait_mult);
1102 1121
@@ -1187,9 +1206,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
1187 const char *data, size_t len, loff_t off); 1206 const char *data, size_t len, loff_t off);
1188 1207
1189static const struct dquot_operations ext4_quota_operations = { 1208static const struct dquot_operations ext4_quota_operations = {
1190#ifdef CONFIG_QUOTA
1191 .get_reserved_space = ext4_get_reserved_space, 1209 .get_reserved_space = ext4_get_reserved_space,
1192#endif
1193 .write_dquot = ext4_write_dquot, 1210 .write_dquot = ext4_write_dquot,
1194 .acquire_dquot = ext4_acquire_dquot, 1211 .acquire_dquot = ext4_acquire_dquot,
1195 .release_dquot = ext4_release_dquot, 1212 .release_dquot = ext4_release_dquot,
@@ -1900,7 +1917,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
1900 ext4_msg(sb, KERN_WARNING, 1917 ext4_msg(sb, KERN_WARNING,
1901 "warning: mounting fs with errors, " 1918 "warning: mounting fs with errors, "
1902 "running e2fsck is recommended"); 1919 "running e2fsck is recommended");
1903 else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 && 1920 else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 &&
1904 le16_to_cpu(es->s_mnt_count) >= 1921 le16_to_cpu(es->s_mnt_count) >=
1905 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count)) 1922 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
1906 ext4_msg(sb, KERN_WARNING, 1923 ext4_msg(sb, KERN_WARNING,
@@ -1932,6 +1949,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
1932 EXT4_INODES_PER_GROUP(sb), 1949 EXT4_INODES_PER_GROUP(sb),
1933 sbi->s_mount_opt, sbi->s_mount_opt2); 1950 sbi->s_mount_opt, sbi->s_mount_opt2);
1934 1951
1952 cleancache_init_fs(sb);
1935 return res; 1953 return res;
1936} 1954}
1937 1955
@@ -2425,6 +2443,18 @@ static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a,
2425 EXT4_SB(sb)->s_sectors_written_start) >> 1))); 2443 EXT4_SB(sb)->s_sectors_written_start) >> 1)));
2426} 2444}
2427 2445
2446static ssize_t extent_cache_hits_show(struct ext4_attr *a,
2447 struct ext4_sb_info *sbi, char *buf)
2448{
2449 return snprintf(buf, PAGE_SIZE, "%lu\n", sbi->extent_cache_hits);
2450}
2451
2452static ssize_t extent_cache_misses_show(struct ext4_attr *a,
2453 struct ext4_sb_info *sbi, char *buf)
2454{
2455 return snprintf(buf, PAGE_SIZE, "%lu\n", sbi->extent_cache_misses);
2456}
2457
2428static ssize_t inode_readahead_blks_store(struct ext4_attr *a, 2458static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
2429 struct ext4_sb_info *sbi, 2459 struct ext4_sb_info *sbi,
2430 const char *buf, size_t count) 2460 const char *buf, size_t count)
@@ -2482,6 +2512,8 @@ static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
2482EXT4_RO_ATTR(delayed_allocation_blocks); 2512EXT4_RO_ATTR(delayed_allocation_blocks);
2483EXT4_RO_ATTR(session_write_kbytes); 2513EXT4_RO_ATTR(session_write_kbytes);
2484EXT4_RO_ATTR(lifetime_write_kbytes); 2514EXT4_RO_ATTR(lifetime_write_kbytes);
2515EXT4_RO_ATTR(extent_cache_hits);
2516EXT4_RO_ATTR(extent_cache_misses);
2485EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show, 2517EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show,
2486 inode_readahead_blks_store, s_inode_readahead_blks); 2518 inode_readahead_blks_store, s_inode_readahead_blks);
2487EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal); 2519EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal);
@@ -2497,6 +2529,8 @@ static struct attribute *ext4_attrs[] = {
2497 ATTR_LIST(delayed_allocation_blocks), 2529 ATTR_LIST(delayed_allocation_blocks),
2498 ATTR_LIST(session_write_kbytes), 2530 ATTR_LIST(session_write_kbytes),
2499 ATTR_LIST(lifetime_write_kbytes), 2531 ATTR_LIST(lifetime_write_kbytes),
2532 ATTR_LIST(extent_cache_hits),
2533 ATTR_LIST(extent_cache_misses),
2500 ATTR_LIST(inode_readahead_blks), 2534 ATTR_LIST(inode_readahead_blks),
2501 ATTR_LIST(inode_goal), 2535 ATTR_LIST(inode_goal),
2502 ATTR_LIST(mb_stats), 2536 ATTR_LIST(mb_stats),
@@ -2659,12 +2693,6 @@ static void print_daily_error_info(unsigned long arg)
2659 mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); /* Once a day */ 2693 mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); /* Once a day */
2660} 2694}
2661 2695
2662static void ext4_lazyinode_timeout(unsigned long data)
2663{
2664 struct task_struct *p = (struct task_struct *)data;
2665 wake_up_process(p);
2666}
2667
2668/* Find next suitable group and run ext4_init_inode_table */ 2696/* Find next suitable group and run ext4_init_inode_table */
2669static int ext4_run_li_request(struct ext4_li_request *elr) 2697static int ext4_run_li_request(struct ext4_li_request *elr)
2670{ 2698{
@@ -2696,11 +2724,8 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
2696 ret = ext4_init_inode_table(sb, group, 2724 ret = ext4_init_inode_table(sb, group,
2697 elr->lr_timeout ? 0 : 1); 2725 elr->lr_timeout ? 0 : 1);
2698 if (elr->lr_timeout == 0) { 2726 if (elr->lr_timeout == 0) {
2699 timeout = jiffies - timeout; 2727 timeout = (jiffies - timeout) *
2700 if (elr->lr_sbi->s_li_wait_mult) 2728 elr->lr_sbi->s_li_wait_mult;
2701 timeout *= elr->lr_sbi->s_li_wait_mult;
2702 else
2703 timeout *= 20;
2704 elr->lr_timeout = timeout; 2729 elr->lr_timeout = timeout;
2705 } 2730 }
2706 elr->lr_next_sched = jiffies + elr->lr_timeout; 2731 elr->lr_next_sched = jiffies + elr->lr_timeout;
@@ -2712,7 +2737,7 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
2712 2737
2713/* 2738/*
2714 * Remove lr_request from the list_request and free the 2739 * Remove lr_request from the list_request and free the
2715 * request tructure. Should be called with li_list_mtx held 2740 * request structure. Should be called with li_list_mtx held
2716 */ 2741 */
2717static void ext4_remove_li_request(struct ext4_li_request *elr) 2742static void ext4_remove_li_request(struct ext4_li_request *elr)
2718{ 2743{
@@ -2730,14 +2755,16 @@ static void ext4_remove_li_request(struct ext4_li_request *elr)
2730 2755
2731static void ext4_unregister_li_request(struct super_block *sb) 2756static void ext4_unregister_li_request(struct super_block *sb)
2732{ 2757{
2733 struct ext4_li_request *elr = EXT4_SB(sb)->s_li_request; 2758 mutex_lock(&ext4_li_mtx);
2734 2759 if (!ext4_li_info) {
2735 if (!ext4_li_info) 2760 mutex_unlock(&ext4_li_mtx);
2736 return; 2761 return;
2762 }
2737 2763
2738 mutex_lock(&ext4_li_info->li_list_mtx); 2764 mutex_lock(&ext4_li_info->li_list_mtx);
2739 ext4_remove_li_request(elr); 2765 ext4_remove_li_request(EXT4_SB(sb)->s_li_request);
2740 mutex_unlock(&ext4_li_info->li_list_mtx); 2766 mutex_unlock(&ext4_li_info->li_list_mtx);
2767 mutex_unlock(&ext4_li_mtx);
2741} 2768}
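Taking ext4_li_mtx around the whole unregister closes a race: the lazyinit thread frees ext4_li_info under that same mutex when its request list drains (see the exit path below), so checking the pointer and taking li_list_mtx without it could dereference freed memory.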
2742 2769
2743static struct task_struct *ext4_lazyinit_task; 2770static struct task_struct *ext4_lazyinit_task;
@@ -2756,17 +2783,10 @@ static int ext4_lazyinit_thread(void *arg)
2756 struct ext4_lazy_init *eli = (struct ext4_lazy_init *)arg; 2783 struct ext4_lazy_init *eli = (struct ext4_lazy_init *)arg;
2757 struct list_head *pos, *n; 2784 struct list_head *pos, *n;
2758 struct ext4_li_request *elr; 2785 struct ext4_li_request *elr;
2759 unsigned long next_wakeup; 2786 unsigned long next_wakeup, cur;
2760 DEFINE_WAIT(wait);
2761 2787
2762 BUG_ON(NULL == eli); 2788 BUG_ON(NULL == eli);
2763 2789
2764 eli->li_timer.data = (unsigned long)current;
2765 eli->li_timer.function = ext4_lazyinode_timeout;
2766
2767 eli->li_task = current;
2768 wake_up(&eli->li_wait_task);
2769
2770cont_thread: 2790cont_thread:
2771 while (true) { 2791 while (true) {
2772 next_wakeup = MAX_JIFFY_OFFSET; 2792 next_wakeup = MAX_JIFFY_OFFSET;
@@ -2797,19 +2817,15 @@ cont_thread:
2797 if (freezing(current)) 2817 if (freezing(current))
2798 refrigerator(); 2818 refrigerator();
2799 2819
2800 if ((time_after_eq(jiffies, next_wakeup)) || 2820 cur = jiffies;
2821 if ((time_after_eq(cur, next_wakeup)) ||
2801 (MAX_JIFFY_OFFSET == next_wakeup)) { 2822 (MAX_JIFFY_OFFSET == next_wakeup)) {
2802 cond_resched(); 2823 cond_resched();
2803 continue; 2824 continue;
2804 } 2825 }
2805 2826
2806 eli->li_timer.expires = next_wakeup; 2827 schedule_timeout_interruptible(next_wakeup - cur);
2807 add_timer(&eli->li_timer); 2828
2808 prepare_to_wait(&eli->li_wait_daemon, &wait,
2809 TASK_INTERRUPTIBLE);
2810 if (time_before(jiffies, next_wakeup))
2811 schedule();
2812 finish_wait(&eli->li_wait_daemon, &wait);
2813 if (kthread_should_stop()) { 2829 if (kthread_should_stop()) {
2814 ext4_clear_request_list(); 2830 ext4_clear_request_list();
2815 goto exit_thread; 2831 goto exit_thread;
@@ -2833,12 +2849,7 @@ exit_thread:
2833 goto cont_thread; 2849 goto cont_thread;
2834 } 2850 }
2835 mutex_unlock(&eli->li_list_mtx); 2851 mutex_unlock(&eli->li_list_mtx);
2836 del_timer_sync(&ext4_li_info->li_timer);
2837 eli->li_task = NULL;
2838 wake_up(&eli->li_wait_task);
2839
2840 kfree(ext4_li_info); 2852 kfree(ext4_li_info);
2841 ext4_lazyinit_task = NULL;
2842 ext4_li_info = NULL; 2853 ext4_li_info = NULL;
2843 mutex_unlock(&ext4_li_mtx); 2854 mutex_unlock(&ext4_li_mtx);
2844 2855
@@ -2866,7 +2877,6 @@ static int ext4_run_lazyinit_thread(void)
2866 if (IS_ERR(ext4_lazyinit_task)) { 2877 if (IS_ERR(ext4_lazyinit_task)) {
2867 int err = PTR_ERR(ext4_lazyinit_task); 2878 int err = PTR_ERR(ext4_lazyinit_task);
2868 ext4_clear_request_list(); 2879 ext4_clear_request_list();
2869 del_timer_sync(&ext4_li_info->li_timer);
2870 kfree(ext4_li_info); 2880 kfree(ext4_li_info);
2871 ext4_li_info = NULL; 2881 ext4_li_info = NULL;
2872 printk(KERN_CRIT "EXT4: error %d creating inode table " 2882 printk(KERN_CRIT "EXT4: error %d creating inode table "
@@ -2875,8 +2885,6 @@ static int ext4_run_lazyinit_thread(void)
2875 return err; 2885 return err;
2876 } 2886 }
2877 ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING; 2887 ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING;
2878
2879 wait_event(ext4_li_info->li_wait_task, ext4_li_info->li_task != NULL);
2880 return 0; 2888 return 0;
2881} 2889}
2882 2890
@@ -2911,13 +2919,9 @@ static int ext4_li_info_new(void)
2911 if (!eli) 2919 if (!eli)
2912 return -ENOMEM; 2920 return -ENOMEM;
2913 2921
2914 eli->li_task = NULL;
2915 INIT_LIST_HEAD(&eli->li_request_list); 2922 INIT_LIST_HEAD(&eli->li_request_list);
2916 mutex_init(&eli->li_list_mtx); 2923 mutex_init(&eli->li_list_mtx);
2917 2924
2918 init_waitqueue_head(&eli->li_wait_daemon);
2919 init_waitqueue_head(&eli->li_wait_task);
2920 init_timer(&eli->li_timer);
2921 eli->li_state |= EXT4_LAZYINIT_QUIT; 2925 eli->li_state |= EXT4_LAZYINIT_QUIT;
2922 2926
2923 ext4_li_info = eli; 2927 ext4_li_info = eli;
@@ -2960,20 +2964,19 @@ static int ext4_register_li_request(struct super_block *sb,
2960 ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; 2964 ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
2961 int ret = 0; 2965 int ret = 0;
2962 2966
2963 if (sbi->s_li_request != NULL) 2967 if (sbi->s_li_request != NULL) {
2968 /*
2969 * Reset timeout so it can be computed again, because
2970 * s_li_wait_mult might have changed.
2971 */
2972 sbi->s_li_request->lr_timeout = 0;
2964 return 0; 2973 return 0;
2974 }
2965 2975
2966 if (first_not_zeroed == ngroups || 2976 if (first_not_zeroed == ngroups ||
2967 (sb->s_flags & MS_RDONLY) || 2977 (sb->s_flags & MS_RDONLY) ||
2968 !test_opt(sb, INIT_INODE_TABLE)) { 2978 !test_opt(sb, INIT_INODE_TABLE))
2969 sbi->s_li_request = NULL;
2970 return 0; 2979 return 0;
2971 }
2972
2973 if (first_not_zeroed == ngroups) {
2974 sbi->s_li_request = NULL;
2975 return 0;
2976 }
2977 2980
2978 elr = ext4_li_request_new(sb, first_not_zeroed); 2981 elr = ext4_li_request_new(sb, first_not_zeroed);
2979 if (!elr) 2982 if (!elr)
@@ -3166,6 +3169,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3166 ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0)) 3169 ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0))
3167 set_opt(sb, DELALLOC); 3170 set_opt(sb, DELALLOC);
3168 3171
3172 /*
 3173 * Set the default s_li_wait_mult for lazyinit, in case no mount
 3174 * option is specified.
3175 */
3176 sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
3177
3169 if (!parse_options((char *) sbi->s_es->s_mount_opts, sb, 3178 if (!parse_options((char *) sbi->s_es->s_mount_opts, sb,
3170 &journal_devnum, &journal_ioprio, NULL, 0)) { 3179 &journal_devnum, &journal_ioprio, NULL, 0)) {
3171 ext4_msg(sb, KERN_WARNING, 3180 ext4_msg(sb, KERN_WARNING,
@@ -3187,6 +3196,28 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3187 "feature flags set on rev 0 fs, " 3196 "feature flags set on rev 0 fs, "
3188 "running e2fsck is recommended"); 3197 "running e2fsck is recommended");
3189 3198
3199 if (IS_EXT2_SB(sb)) {
3200 if (ext2_feature_set_ok(sb))
3201 ext4_msg(sb, KERN_INFO, "mounting ext2 file system "
3202 "using the ext4 subsystem");
3203 else {
3204 ext4_msg(sb, KERN_ERR, "couldn't mount as ext2 due "
3205 "to feature incompatibilities");
3206 goto failed_mount;
3207 }
3208 }
3209
3210 if (IS_EXT3_SB(sb)) {
3211 if (ext3_feature_set_ok(sb))
3212 ext4_msg(sb, KERN_INFO, "mounting ext3 file system "
3213 "using the ext4 subsystem");
3214 else {
3215 ext4_msg(sb, KERN_ERR, "couldn't mount as ext3 due "
3216 "to feature incompatibilities");
3217 goto failed_mount;
3218 }
3219 }
3220
3190 /* 3221 /*
3191 * Check feature flags regardless of the revision level, since we 3222 * Check feature flags regardless of the revision level, since we
3192 * previously didn't change the revision level when setting the flags, 3223 * previously didn't change the revision level when setting the flags,
@@ -3459,6 +3490,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3459 EXT4_HAS_INCOMPAT_FEATURE(sb, 3490 EXT4_HAS_INCOMPAT_FEATURE(sb,
3460 EXT4_FEATURE_INCOMPAT_RECOVER)); 3491 EXT4_FEATURE_INCOMPAT_RECOVER));
3461 3492
3493 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_MMP) &&
3494 !(sb->s_flags & MS_RDONLY))
3495 if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)))
3496 goto failed_mount3;
3497
3462 /* 3498 /*
3463 * The first inode we look at is the journal inode. Don't try 3499 * The first inode we look at is the journal inode. Don't try
3464 * root first: it may be modified in the journal! 3500 * root first: it may be modified in the journal!
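ext4_multi_mount_protect(), called in the hunk above, is there to stop two hosts from mounting the same shared block device read-write at once. As a rough model of the idea only: read a heartbeat sequence, wait one interval, read it again, and refuse the mount if somebody else bumped it in the meantime. The file layout and names below are made up; the real MMP block and its update protocol are more involved.

#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

static int read_seq(FILE *f, uint64_t *seq)
{
	rewind(f);
	return fread(seq, sizeof(*seq), 1, f) == 1 ? 0 : -1;
}

/* returns 0 if the device looks unused, -1 if busy or unreadable */
static int mmp_protect(const char *path, unsigned interval_sec)
{
	uint64_t before, after;
	FILE *f = fopen(path, "r+b");

	if (!f)
		return -1;
	if (read_seq(f, &before))
		goto fail;
	sleep(interval_sec);    /* give another "node" a chance to bump it */
	if (read_seq(f, &after))
		goto fail;
	fclose(f);
	if (after != before) {
		fprintf(stderr, "device appears to be in use elsewhere\n");
		return -1;      /* caller fails the mount */
	}
	return 0;
fail:
	fclose(f);
	return -1;
}

int main(void)
{
	return mmp_protect("heartbeat.bin", 1) ? 1 : 0;
}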
@@ -3474,7 +3510,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3474 goto failed_mount_wq; 3510 goto failed_mount_wq;
3475 } else { 3511 } else {
3476 clear_opt(sb, DATA_FLAGS); 3512 clear_opt(sb, DATA_FLAGS);
3477 set_opt(sb, WRITEBACK_DATA);
3478 sbi->s_journal = NULL; 3513 sbi->s_journal = NULL;
3479 needs_recovery = 0; 3514 needs_recovery = 0;
3480 goto no_journal; 3515 goto no_journal;
@@ -3707,6 +3742,8 @@ failed_mount3:
3707 percpu_counter_destroy(&sbi->s_freeinodes_counter); 3742 percpu_counter_destroy(&sbi->s_freeinodes_counter);
3708 percpu_counter_destroy(&sbi->s_dirs_counter); 3743 percpu_counter_destroy(&sbi->s_dirs_counter);
3709 percpu_counter_destroy(&sbi->s_dirtyblocks_counter); 3744 percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
3745 if (sbi->s_mmp_tsk)
3746 kthread_stop(sbi->s_mmp_tsk);
3710failed_mount2: 3747failed_mount2:
3711 for (i = 0; i < db_count; i++) 3748 for (i = 0; i < db_count; i++)
3712 brelse(sbi->s_group_desc[i]); 3749 brelse(sbi->s_group_desc[i]);
@@ -4242,7 +4279,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
4242 int enable_quota = 0; 4279 int enable_quota = 0;
4243 ext4_group_t g; 4280 ext4_group_t g;
4244 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; 4281 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
4245 int err; 4282 int err = 0;
4246#ifdef CONFIG_QUOTA 4283#ifdef CONFIG_QUOTA
4247 int i; 4284 int i;
4248#endif 4285#endif
@@ -4368,6 +4405,13 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
4368 goto restore_opts; 4405 goto restore_opts;
4369 if (!ext4_setup_super(sb, es, 0)) 4406 if (!ext4_setup_super(sb, es, 0))
4370 sb->s_flags &= ~MS_RDONLY; 4407 sb->s_flags &= ~MS_RDONLY;
4408 if (EXT4_HAS_INCOMPAT_FEATURE(sb,
4409 EXT4_FEATURE_INCOMPAT_MMP))
4410 if (ext4_multi_mount_protect(sb,
4411 le64_to_cpu(es->s_mmp_block))) {
4412 err = -EROFS;
4413 goto restore_opts;
4414 }
4371 enable_quota = 1; 4415 enable_quota = 1;
4372 } 4416 }
4373 } 4417 }
@@ -4432,6 +4476,7 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
4432 struct ext4_sb_info *sbi = EXT4_SB(sb); 4476 struct ext4_sb_info *sbi = EXT4_SB(sb);
4433 struct ext4_super_block *es = sbi->s_es; 4477 struct ext4_super_block *es = sbi->s_es;
4434 u64 fsid; 4478 u64 fsid;
4479 s64 bfree;
4435 4480
4436 if (test_opt(sb, MINIX_DF)) { 4481 if (test_opt(sb, MINIX_DF)) {
4437 sbi->s_overhead_last = 0; 4482 sbi->s_overhead_last = 0;
@@ -4475,8 +4520,10 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
4475 buf->f_type = EXT4_SUPER_MAGIC; 4520 buf->f_type = EXT4_SUPER_MAGIC;
4476 buf->f_bsize = sb->s_blocksize; 4521 buf->f_bsize = sb->s_blocksize;
4477 buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last; 4522 buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last;
4478 buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) - 4523 bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) -
4479 percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter); 4524 percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter);
 4525 /* prevent underflow in case little free space is available */
4526 buf->f_bfree = max_t(s64, bfree, 0);
4480 buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es); 4527 buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
4481 if (buf->f_bfree < ext4_r_blocks_count(es)) 4528 if (buf->f_bfree < ext4_r_blocks_count(es))
4482 buf->f_bavail = 0; 4529 buf->f_bavail = 0;
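The statfs hunk above exists because the two percpu counter sums are each only approximate, so free-minus-dirty can transiently come out negative and would wrap to a huge value in the unsigned f_bfree field. The clamp in miniature (values invented):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int64_t free_blocks  = 100;     /* percpu sum of free blocks  */
	int64_t dirty_blocks = 130;     /* percpu sum of dirty blocks */
	int64_t bfree = free_blocks - dirty_blocks;

	/* the equivalent of max_t(s64, bfree, 0) */
	uint64_t f_bfree = bfree > 0 ? (uint64_t)bfree : 0;

	printf("f_bfree = %" PRIu64 "\n", f_bfree);  /* 0, not a huge number */
	return 0;
}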
@@ -4652,6 +4699,9 @@ static int ext4_quota_off(struct super_block *sb, int type)
4652 if (test_opt(sb, DELALLOC)) 4699 if (test_opt(sb, DELALLOC))
4653 sync_filesystem(sb); 4700 sync_filesystem(sb);
4654 4701
4702 if (!inode)
4703 goto out;
4704
4655 /* Update modification times of quota files when userspace can 4705 /* Update modification times of quota files when userspace can
4656 * start looking at them */ 4706 * start looking at them */
4657 handle = ext4_journal_start(inode, 1); 4707 handle = ext4_journal_start(inode, 1);
@@ -4772,14 +4822,6 @@ static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
4772} 4822}
4773 4823
4774#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) 4824#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
4775static struct file_system_type ext2_fs_type = {
4776 .owner = THIS_MODULE,
4777 .name = "ext2",
4778 .mount = ext4_mount,
4779 .kill_sb = kill_block_super,
4780 .fs_flags = FS_REQUIRES_DEV,
4781};
4782
4783static inline void register_as_ext2(void) 4825static inline void register_as_ext2(void)
4784{ 4826{
4785 int err = register_filesystem(&ext2_fs_type); 4827 int err = register_filesystem(&ext2_fs_type);
@@ -4792,10 +4834,22 @@ static inline void unregister_as_ext2(void)
4792{ 4834{
4793 unregister_filesystem(&ext2_fs_type); 4835 unregister_filesystem(&ext2_fs_type);
4794} 4836}
4837
4838static inline int ext2_feature_set_ok(struct super_block *sb)
4839{
4840 if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT2_FEATURE_INCOMPAT_SUPP))
4841 return 0;
4842 if (sb->s_flags & MS_RDONLY)
4843 return 1;
4844 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT2_FEATURE_RO_COMPAT_SUPP))
4845 return 0;
4846 return 1;
4847}
4795MODULE_ALIAS("ext2"); 4848MODULE_ALIAS("ext2");
4796#else 4849#else
4797static inline void register_as_ext2(void) { } 4850static inline void register_as_ext2(void) { }
4798static inline void unregister_as_ext2(void) { } 4851static inline void unregister_as_ext2(void) { }
4852static inline int ext2_feature_set_ok(struct super_block *sb) { return 0; }
4799#endif 4853#endif
4800 4854
4801#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) 4855#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
@@ -4811,10 +4865,24 @@ static inline void unregister_as_ext3(void)
4811{ 4865{
4812 unregister_filesystem(&ext3_fs_type); 4866 unregister_filesystem(&ext3_fs_type);
4813} 4867}
4868
4869static inline int ext3_feature_set_ok(struct super_block *sb)
4870{
4871 if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP))
4872 return 0;
4873 if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL))
4874 return 0;
4875 if (sb->s_flags & MS_RDONLY)
4876 return 1;
4877 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP))
4878 return 0;
4879 return 1;
4880}
4814MODULE_ALIAS("ext3"); 4881MODULE_ALIAS("ext3");
4815#else 4882#else
4816static inline void register_as_ext3(void) { } 4883static inline void register_as_ext3(void) { }
4817static inline void unregister_as_ext3(void) { } 4884static inline void unregister_as_ext3(void) { }
4885static inline int ext3_feature_set_ok(struct super_block *sb) { return 0; }
4818#endif 4886#endif
4819 4887
4820static struct file_system_type ext4_fs_type = { 4888static struct file_system_type ext4_fs_type = {
@@ -4898,8 +4966,8 @@ static int __init ext4_init_fs(void)
4898 err = init_inodecache(); 4966 err = init_inodecache();
4899 if (err) 4967 if (err)
4900 goto out1; 4968 goto out1;
4901 register_as_ext2();
4902 register_as_ext3(); 4969 register_as_ext3();
4970 register_as_ext2();
4903 err = register_filesystem(&ext4_fs_type); 4971 err = register_filesystem(&ext4_fs_type);
4904 if (err) 4972 if (err)
4905 goto out; 4973 goto out;
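The ext2/ext3 compatibility gates added above (ext2_feature_set_ok()/ext3_feature_set_ok()) reduce to two bitmask rules: an unknown incompat bit always blocks the mount, while an unknown ro-compat bit only blocks a read-write mount. A minimal standalone sketch of that logic, with invented flag values:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SUPP_INCOMPAT	0x0007u  /* bits this "driver" understands */
#define SUPP_RO_COMPAT	0x0003u

static bool feature_set_ok(uint32_t incompat, uint32_t ro_compat, bool rdonly)
{
	if (incompat & ~SUPP_INCOMPAT)
		return false;   /* can't even read it safely */
	if (rdonly)
		return true;    /* unknown ro-compat bits are fine read-only */
	return !(ro_compat & ~SUPP_RO_COMPAT);
}

int main(void)
{
	printf("%d\n", feature_set_ok(0x1, 0x8, false));  /* 0: rw blocked */
	printf("%d\n", feature_set_ok(0x1, 0x8, true));   /* 1: ro allowed */
	return 0;
}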
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index b545ca1c459c..c757adc97250 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -820,8 +820,8 @@ inserted:
820 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 820 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
821 goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; 821 goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
822 822
823 block = ext4_new_meta_blocks(handle, inode, 823 block = ext4_new_meta_blocks(handle, inode, goal, 0,
824 goal, NULL, &error); 824 NULL, &error);
825 if (error) 825 if (error)
826 goto cleanup; 826 goto cleanup;
827 827
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index 3b222dafd15b..be15437c272e 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -326,6 +326,8 @@ static int msdos_rmdir(struct inode *dir, struct dentry *dentry)
326 struct fat_slot_info sinfo; 326 struct fat_slot_info sinfo;
327 int err; 327 int err;
328 328
329 dentry_unhash(dentry);
330
329 lock_super(sb); 331 lock_super(sb);
330 /* 332 /*
331 * Check whether the directory is not in use, then check 333 * Check whether the directory is not in use, then check
@@ -457,6 +459,9 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name,
457 old_inode = old_dentry->d_inode; 459 old_inode = old_dentry->d_inode;
458 new_inode = new_dentry->d_inode; 460 new_inode = new_dentry->d_inode;
459 461
462 if (new_inode && S_ISDIR(new_inode->i_mode))
463 dentry_unhash(new_dentry);
464
460 err = fat_scan(old_dir, old_name, &old_sinfo); 465 err = fat_scan(old_dir, old_name, &old_sinfo);
461 if (err) { 466 if (err) {
462 err = -EIO; 467 err = -EIO;
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 20b4ea53fdc4..c61a6789f36c 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -824,6 +824,8 @@ static int vfat_rmdir(struct inode *dir, struct dentry *dentry)
824 struct fat_slot_info sinfo; 824 struct fat_slot_info sinfo;
825 int err; 825 int err;
826 826
827 dentry_unhash(dentry);
828
827 lock_super(sb); 829 lock_super(sb);
828 830
829 err = fat_dir_empty(inode); 831 err = fat_dir_empty(inode);
@@ -931,6 +933,9 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
931 int err, is_dir, update_dotdot, corrupt = 0; 933 int err, is_dir, update_dotdot, corrupt = 0;
932 struct super_block *sb = old_dir->i_sb; 934 struct super_block *sb = old_dir->i_sb;
933 935
936 if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
937 dentry_unhash(new_dentry);
938
934 old_sinfo.bh = sinfo.bh = dotdot_bh = NULL; 939 old_sinfo.bh = sinfo.bh = dotdot_bh = NULL;
935 old_inode = old_dentry->d_inode; 940 old_inode = old_dentry->d_inode;
936 new_inode = new_dentry->d_inode; 941 new_inode = new_dentry->d_inode;
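The fat/vfat hunks above show the pattern that repeats through the rest of this merge (fuse, hfs, hfsplus, hostfs, hpfs, jffs2, jfs, logfs, minix): the VFS no longer unhashes the victim dentry for every rmdir/rename, so each filesystem that still depends on that behaviour now calls dentry_unhash() itself, first thing in its own method, and only for directory victims in rename. Modelled in plain userspace C (all names hypothetical):

#include <stdio.h>

struct dentry { const char *name; int is_dir; };

static void dentry_unhash_model(struct dentry *d)
{
	printf("unhash %s\n", d->name);  /* stands in for the dcache work */
}

/* a backend that still wants the victim unhashed up front */
static int fs_rmdir(struct dentry *victim)
{
	dentry_unhash_model(victim);     /* moved here from the VFS */
	/* ... check emptiness, delete the on-disk entry ... */
	return 0;
}

static int fs_rename(struct dentry *old, struct dentry *new)
{
	if (new && new->is_dir)          /* only directory victims */
		dentry_unhash_model(new);
	printf("rename %s over %s\n", old->name, new ? new->name : "-");
	return 0;
}

int main(void)
{
	struct dentry a = { "a", 1 }, b = { "b", 1 };
	fs_rmdir(&a);
	fs_rename(&a, &b);
	return 0;
}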
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index b32eb29a4e6f..0d0e3faddcfa 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -667,6 +667,8 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry)
667 if (IS_ERR(req)) 667 if (IS_ERR(req))
668 return PTR_ERR(req); 668 return PTR_ERR(req);
669 669
670 dentry_unhash(entry);
671
670 req->in.h.opcode = FUSE_RMDIR; 672 req->in.h.opcode = FUSE_RMDIR;
671 req->in.h.nodeid = get_node_id(dir); 673 req->in.h.nodeid = get_node_id(dir);
672 req->in.numargs = 1; 674 req->in.numargs = 1;
@@ -691,6 +693,10 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent,
691 struct fuse_rename_in inarg; 693 struct fuse_rename_in inarg;
692 struct fuse_conn *fc = get_fuse_conn(olddir); 694 struct fuse_conn *fc = get_fuse_conn(olddir);
693 struct fuse_req *req = fuse_get_req(fc); 695 struct fuse_req *req = fuse_get_req(fc);
696
697 if (newent->d_inode && S_ISDIR(newent->d_inode->i_mode))
698 dentry_unhash(newent);
699
694 if (IS_ERR(req)) 700 if (IS_ERR(req))
695 return PTR_ERR(req); 701 return PTR_ERR(req);
696 702
diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index b4d70b13be92..1cb70cdba2c1 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -253,6 +253,9 @@ static int hfs_remove(struct inode *dir, struct dentry *dentry)
253 struct inode *inode = dentry->d_inode; 253 struct inode *inode = dentry->d_inode;
254 int res; 254 int res;
255 255
256 if (S_ISDIR(inode->i_mode))
257 dentry_unhash(dentry);
258
256 if (S_ISDIR(inode->i_mode) && inode->i_size != 2) 259 if (S_ISDIR(inode->i_mode) && inode->i_size != 2)
257 return -ENOTEMPTY; 260 return -ENOTEMPTY;
258 res = hfs_cat_delete(inode->i_ino, dir, &dentry->d_name); 261 res = hfs_cat_delete(inode->i_ino, dir, &dentry->d_name);
@@ -283,6 +286,9 @@ static int hfs_rename(struct inode *old_dir, struct dentry *old_dentry,
283 286
284 /* Unlink destination if it already exists */ 287 /* Unlink destination if it already exists */
285 if (new_dentry->d_inode) { 288 if (new_dentry->d_inode) {
289 if (S_ISDIR(new_dentry->d_inode->i_mode))
290 dentry_unhash(new_dentry);
291
286 res = hfs_remove(new_dir, new_dentry); 292 res = hfs_remove(new_dir, new_dentry);
287 if (res) 293 if (res)
288 return res; 294 return res;
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 4df5059c25da..b28835091dd0 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -370,6 +370,8 @@ static int hfsplus_rmdir(struct inode *dir, struct dentry *dentry)
370 struct inode *inode = dentry->d_inode; 370 struct inode *inode = dentry->d_inode;
371 int res; 371 int res;
372 372
373 dentry_unhash(dentry);
374
373 if (inode->i_size != 2) 375 if (inode->i_size != 2)
374 return -ENOTEMPTY; 376 return -ENOTEMPTY;
375 377
@@ -467,10 +469,12 @@ static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry,
467 469
468 /* Unlink destination if it already exists */ 470 /* Unlink destination if it already exists */
469 if (new_dentry->d_inode) { 471 if (new_dentry->d_inode) {
470 if (S_ISDIR(new_dentry->d_inode->i_mode)) 472 if (S_ISDIR(new_dentry->d_inode->i_mode)) {
473 dentry_unhash(new_dentry);
471 res = hfsplus_rmdir(new_dir, new_dentry); 474 res = hfsplus_rmdir(new_dir, new_dentry);
472 else 475 } else {
473 res = hfsplus_unlink(new_dir, new_dentry); 476 res = hfsplus_unlink(new_dir, new_dentry);
477 }
474 if (res) 478 if (res)
475 return res; 479 return res;
476 } 480 }
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 2638c834ed28..e6816b9e6903 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -683,6 +683,8 @@ int hostfs_rmdir(struct inode *ino, struct dentry *dentry)
683 char *file; 683 char *file;
684 int err; 684 int err;
685 685
686 dentry_unhash(dentry);
687
686 if ((file = dentry_name(dentry)) == NULL) 688 if ((file = dentry_name(dentry)) == NULL)
687 return -ENOMEM; 689 return -ENOMEM;
688 err = do_rmdir(file); 690 err = do_rmdir(file);
@@ -736,6 +738,9 @@ int hostfs_rename(struct inode *from_ino, struct dentry *from,
736 char *from_name, *to_name; 738 char *from_name, *to_name;
737 int err; 739 int err;
738 740
741 if (to->d_inode && S_ISDIR(to->d_inode->i_mode))
742 dentry_unhash(to);
743
739 if ((from_name = dentry_name(from)) == NULL) 744 if ((from_name = dentry_name(from)) == NULL)
740 return -ENOMEM; 745 return -ENOMEM;
741 if ((to_name = dentry_name(to)) == NULL) { 746 if ((to_name = dentry_name(to)) == NULL) {
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index 1f05839c27a7..ff0ce21c0867 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -395,7 +395,6 @@ again:
395 395
396 dentry_unhash(dentry); 396 dentry_unhash(dentry);
397 if (!d_unhashed(dentry)) { 397 if (!d_unhashed(dentry)) {
398 dput(dentry);
399 hpfs_unlock(dir->i_sb); 398 hpfs_unlock(dir->i_sb);
400 return -ENOSPC; 399 return -ENOSPC;
401 } 400 }
@@ -403,7 +402,6 @@ again:
403 !S_ISREG(inode->i_mode) || 402 !S_ISREG(inode->i_mode) ||
404 get_write_access(inode)) { 403 get_write_access(inode)) {
405 d_rehash(dentry); 404 d_rehash(dentry);
406 dput(dentry);
407 } else { 405 } else {
408 struct iattr newattrs; 406 struct iattr newattrs;
409 /*printk("HPFS: truncating file before delete.\n");*/ 407 /*printk("HPFS: truncating file before delete.\n");*/
@@ -411,7 +409,6 @@ again:
411 newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME; 409 newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME;
412 err = notify_change(dentry, &newattrs); 410 err = notify_change(dentry, &newattrs);
413 put_write_access(inode); 411 put_write_access(inode);
414 dput(dentry);
415 if (!err) 412 if (!err)
416 goto again; 413 goto again;
417 } 414 }
@@ -442,6 +439,8 @@ static int hpfs_rmdir(struct inode *dir, struct dentry *dentry)
442 int err; 439 int err;
443 int r; 440 int r;
444 441
442 dentry_unhash(dentry);
443
445 hpfs_adjust_length(name, &len); 444 hpfs_adjust_length(name, &len);
446 hpfs_lock(dir->i_sb); 445 hpfs_lock(dir->i_sb);
447 err = -ENOENT; 446 err = -ENOENT;
@@ -535,6 +534,10 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
535 struct buffer_head *bh; 534 struct buffer_head *bh;
536 struct fnode *fnode; 535 struct fnode *fnode;
537 int err; 536 int err;
537
538 if (new_inode && S_ISDIR(new_inode->i_mode))
539 dentry_unhash(new_dentry);
540
538 if ((err = hpfs_chk_name(new_name, &new_len))) return err; 541 if ((err = hpfs_chk_name(new_name, &new_len))) return err;
539 err = 0; 542 err = 0;
540 hpfs_adjust_length(old_name, &old_len); 543 hpfs_adjust_length(old_name, &old_len);
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index e7a035781b7d..7aafeb8fa300 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -921,7 +921,8 @@ static int can_do_hugetlb_shm(void)
921 return capable(CAP_IPC_LOCK) || in_group_p(sysctl_hugetlb_shm_group); 921 return capable(CAP_IPC_LOCK) || in_group_p(sysctl_hugetlb_shm_group);
922} 922}
923 923
924struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag, 924struct file *hugetlb_file_setup(const char *name, size_t size,
925 vm_flags_t acctflag,
925 struct user_struct **user, int creat_flags) 926 struct user_struct **user, int creat_flags)
926{ 927{
927 int error = -ENOMEM; 928 int error = -ENOMEM;
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 29148a81c783..7f21cf3aaf92 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -219,7 +219,6 @@ static int journal_submit_data_buffers(journal_t *journal,
219 ret = err; 219 ret = err;
220 spin_lock(&journal->j_list_lock); 220 spin_lock(&journal->j_list_lock);
221 J_ASSERT(jinode->i_transaction == commit_transaction); 221 J_ASSERT(jinode->i_transaction == commit_transaction);
222 commit_transaction->t_flushed_data_blocks = 1;
223 clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags); 222 clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
224 smp_mb__after_clear_bit(); 223 smp_mb__after_clear_bit();
225 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); 224 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
@@ -672,12 +671,16 @@ start_journal_io:
672 err = 0; 671 err = 0;
673 } 672 }
674 673
674 write_lock(&journal->j_state_lock);
675 J_ASSERT(commit_transaction->t_state == T_COMMIT);
676 commit_transaction->t_state = T_COMMIT_DFLUSH;
677 write_unlock(&journal->j_state_lock);
675 /* 678 /*
676 * If the journal is not located on the file system device, 679 * If the journal is not located on the file system device,
677 * then we must flush the file system device before we issue 680 * then we must flush the file system device before we issue
678 * the commit record 681 * the commit record
679 */ 682 */
680 if (commit_transaction->t_flushed_data_blocks && 683 if (commit_transaction->t_need_data_flush &&
681 (journal->j_fs_dev != journal->j_dev) && 684 (journal->j_fs_dev != journal->j_dev) &&
682 (journal->j_flags & JBD2_BARRIER)) 685 (journal->j_flags & JBD2_BARRIER))
683 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); 686 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
@@ -754,8 +757,13 @@ wait_for_iobuf:
754 required. */ 757 required. */
755 JBUFFER_TRACE(jh, "file as BJ_Forget"); 758 JBUFFER_TRACE(jh, "file as BJ_Forget");
756 jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget); 759 jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
757 /* Wake up any transactions which were waiting for this 760 /*
758 IO to complete */ 761 * Wake up any transactions which were waiting for this IO to
762 * complete. The barrier must be here so that changes by
763 * jbd2_journal_file_buffer() take effect before wake_up_bit()
764 * does the waitqueue check.
765 */
766 smp_mb();
759 wake_up_bit(&bh->b_state, BH_Unshadow); 767 wake_up_bit(&bh->b_state, BH_Unshadow);
760 JBUFFER_TRACE(jh, "brelse shadowed buffer"); 768 JBUFFER_TRACE(jh, "brelse shadowed buffer");
761 __brelse(bh); 769 __brelse(bh);
@@ -794,6 +802,10 @@ wait_for_iobuf:
794 jbd2_journal_abort(journal, err); 802 jbd2_journal_abort(journal, err);
795 803
796 jbd_debug(3, "JBD: commit phase 5\n"); 804 jbd_debug(3, "JBD: commit phase 5\n");
805 write_lock(&journal->j_state_lock);
806 J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH);
807 commit_transaction->t_state = T_COMMIT_JFLUSH;
808 write_unlock(&journal->j_state_lock);
797 809
798 if (!JBD2_HAS_INCOMPAT_FEATURE(journal, 810 if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
799 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { 811 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
@@ -949,7 +961,7 @@ restart_loop:
949 961
950 jbd_debug(3, "JBD: commit phase 7\n"); 962 jbd_debug(3, "JBD: commit phase 7\n");
951 963
952 J_ASSERT(commit_transaction->t_state == T_COMMIT); 964 J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH);
953 965
954 commit_transaction->t_start = jiffies; 966 commit_transaction->t_start = jiffies;
955 stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging, 967 stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
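The T_COMMIT_DFLUSH/T_COMMIT_JFLUSH split above works because jbd2 transaction states are numerically ordered, so "has phase X happened yet" is a single comparison under the state lock. A toy version of that ordering (the enum values and helper are illustrative, not the jbd2 definitions):

#include <stdbool.h>
#include <stdio.h>

enum commit_state {
	T_COMMIT,          /* metadata being written             */
	T_COMMIT_DFLUSH,   /* data flush submitted               */
	T_COMMIT_JFLUSH,   /* journal flush / commit record next */
	T_FINISHED,
};

static bool will_send_data_flush(enum commit_state s)
{
	/* once we are at or past DFLUSH, the flush is already on its way */
	return s < T_COMMIT_DFLUSH;
}

int main(void)
{
	printf("%d\n", will_send_data_flush(T_COMMIT));         /* 1 */
	printf("%d\n", will_send_data_flush(T_COMMIT_JFLUSH));  /* 0 */
	return 0;
}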
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index e0ec3db1c395..9a7826990304 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -479,9 +479,12 @@ int __jbd2_log_space_left(journal_t *journal)
479int __jbd2_log_start_commit(journal_t *journal, tid_t target) 479int __jbd2_log_start_commit(journal_t *journal, tid_t target)
480{ 480{
481 /* 481 /*
482 * Are we already doing a recent enough commit? 482 * The only transaction we can possibly wait upon is the
483 * currently running transaction (if it exists). Otherwise,
484 * the target tid must be an old one.
483 */ 485 */
484 if (!tid_geq(journal->j_commit_request, target)) { 486 if (journal->j_running_transaction &&
487 journal->j_running_transaction->t_tid == target) {
485 /* 488 /*
486 * We want a new commit: OK, mark the request and wakeup the 489 * We want a new commit: OK, mark the request and wakeup the
487 * commit thread. We do _not_ do the commit ourselves. 490 * commit thread. We do _not_ do the commit ourselves.
@@ -493,7 +496,15 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t target)
493 journal->j_commit_sequence); 496 journal->j_commit_sequence);
494 wake_up(&journal->j_wait_commit); 497 wake_up(&journal->j_wait_commit);
495 return 1; 498 return 1;
496 } 499 } else if (!tid_geq(journal->j_commit_request, target))
500 /* This should never happen, but if it does, preserve
501 the evidence before kjournald goes into a loop and
502 increments j_commit_sequence beyond all recognition. */
503 WARN_ONCE(1, "jbd: bad log_start_commit: %u %u %u %u\n",
504 journal->j_commit_request,
505 journal->j_commit_sequence,
506 target, journal->j_running_transaction ?
507 journal->j_running_transaction->t_tid : 0);
497 return 0; 508 return 0;
498} 509}
499 510
@@ -577,6 +588,47 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid)
577} 588}
578 589
579/* 590/*
 591 * Return 1 if a given transaction has not yet sent a barrier request
 592 * connected with a transaction commit. If 0 is returned, the
 593 * transaction may or may not have sent the barrier. Used to avoid
 594 * sending the barrier twice in common cases.
595 */
596int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid)
597{
598 int ret = 0;
599 transaction_t *commit_trans;
600
601 if (!(journal->j_flags & JBD2_BARRIER))
602 return 0;
603 read_lock(&journal->j_state_lock);
604 /* Transaction already committed? */
605 if (tid_geq(journal->j_commit_sequence, tid))
606 goto out;
607 commit_trans = journal->j_committing_transaction;
608 if (!commit_trans || commit_trans->t_tid != tid) {
609 ret = 1;
610 goto out;
611 }
612 /*
613 * Transaction is being committed and we already proceeded to
614 * submitting a flush to fs partition?
615 */
616 if (journal->j_fs_dev != journal->j_dev) {
617 if (!commit_trans->t_need_data_flush ||
618 commit_trans->t_state >= T_COMMIT_DFLUSH)
619 goto out;
620 } else {
621 if (commit_trans->t_state >= T_COMMIT_JFLUSH)
622 goto out;
623 }
624 ret = 1;
625out:
626 read_unlock(&journal->j_state_lock);
627 return ret;
628}
629EXPORT_SYMBOL(jbd2_trans_will_send_data_barrier);
630
631/*
580 * Wait for a specified commit to complete. 632 * Wait for a specified commit to complete.
581 * The caller may not hold the journal lock. 633 * The caller may not hold the journal lock.
582 */ 634 */
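And this is the consumer side of the same split: jbd2_trans_will_send_data_barrier() lets a caller such as an fsync path skip issuing its own cache flush when the journal commit is going to send one anyway. A sketch of how such a caller might look; the predicate below is a stand-in, not the real jbd2 API surface.

#include <stdbool.h>
#include <stdio.h>

static bool journal_will_send_barrier(unsigned tid)
{
	return tid % 2 == 0;    /* fake answer for the demo */
}

static void fsync_path(unsigned commit_tid)
{
	/* wait for the commit here, then: */
	if (!journal_will_send_barrier(commit_tid))
		printf("tid %u: issuing our own FLUSH\n", commit_tid);
	else
		printf("tid %u: journal flushes for us, skipping\n",
		       commit_tid);
}

int main(void)
{
	fsync_path(7);
	fsync_path(8);
	return 0;
}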
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 05fa77a23711..3eec82d32fd4 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -82,7 +82,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
82 */ 82 */
83 83
84/* 84/*
85 * Update transiaction's maximum wait time, if debugging is enabled. 85 * Update transaction's maximum wait time, if debugging is enabled.
86 * 86 *
87 * In order for t_max_wait to be reliable, it must be protected by a 87 * In order for t_max_wait to be reliable, it must be protected by a
88 * lock. But doing so will mean that start_this_handle() can not be 88 * lock. But doing so will mean that start_this_handle() can not be
@@ -91,11 +91,10 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
91 * means that maximum wait time reported by the jbd2_run_stats 91 * means that maximum wait time reported by the jbd2_run_stats
92 * tracepoint will always be zero. 92 * tracepoint will always be zero.
93 */ 93 */
94static inline void update_t_max_wait(transaction_t *transaction) 94static inline void update_t_max_wait(transaction_t *transaction,
95 unsigned long ts)
95{ 96{
96#ifdef CONFIG_JBD2_DEBUG 97#ifdef CONFIG_JBD2_DEBUG
97 unsigned long ts = jiffies;
98
99 if (jbd2_journal_enable_debug && 98 if (jbd2_journal_enable_debug &&
100 time_after(transaction->t_start, ts)) { 99 time_after(transaction->t_start, ts)) {
101 ts = jbd2_time_diff(ts, transaction->t_start); 100 ts = jbd2_time_diff(ts, transaction->t_start);
@@ -121,6 +120,7 @@ static int start_this_handle(journal_t *journal, handle_t *handle,
121 tid_t tid; 120 tid_t tid;
122 int needed, need_to_start; 121 int needed, need_to_start;
123 int nblocks = handle->h_buffer_credits; 122 int nblocks = handle->h_buffer_credits;
123 unsigned long ts = jiffies;
124 124
125 if (nblocks > journal->j_max_transaction_buffers) { 125 if (nblocks > journal->j_max_transaction_buffers) {
126 printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n", 126 printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n",
@@ -271,7 +271,7 @@ repeat:
271 /* OK, account for the buffers that this operation expects to 271 /* OK, account for the buffers that this operation expects to
272 * use and add the handle to the running transaction. 272 * use and add the handle to the running transaction.
273 */ 273 */
274 update_t_max_wait(transaction); 274 update_t_max_wait(transaction, ts);
275 handle->h_transaction = transaction; 275 handle->h_transaction = transaction;
276 atomic_inc(&transaction->t_updates); 276 atomic_inc(&transaction->t_updates);
277 atomic_inc(&transaction->t_handle_count); 277 atomic_inc(&transaction->t_handle_count);
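The update_t_max_wait() change above is subtler than it looks: jiffies must be sampled once, before start_this_handle() can block and retry, otherwise each trip around the slow path shrinks the measured wait. The same rule in userspace terms:

#include <stdio.h>
#include <time.h>
#include <unistd.h>

static long now_ms(void)
{
	struct timespec t;
	clock_gettime(CLOCK_MONOTONIC, &t);
	return t.tv_sec * 1000L + t.tv_nsec / 1000000L;
}

int main(void)
{
	long ts = now_ms();             /* read once, up front */

	for (int attempt = 0; attempt < 3; attempt++)
		usleep(10000);          /* stand-in for blocked retries */

	/* correct: includes all retries; re-reading the clock here
	 * inside the helper would miss them */
	printf("waited ~%ld ms\n", now_ms() - ts);
	return 0;
}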
@@ -316,7 +316,8 @@ static handle_t *new_handle(int nblocks)
316 * This function is visible to journal users (like ext3fs), so is not 316 * This function is visible to journal users (like ext3fs), so is not
317 * called with the journal already locked. 317 * called with the journal already locked.
318 * 318 *
319 * Return a pointer to a newly allocated handle, or NULL on failure 319 * Return a pointer to a newly allocated handle, or an ERR_PTR() value
320 * on failure.
320 */ 321 */
321handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int gfp_mask) 322handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int gfp_mask)
322{ 323{
@@ -921,8 +922,8 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
921 */ 922 */
922 JBUFFER_TRACE(jh, "cancelling revoke"); 923 JBUFFER_TRACE(jh, "cancelling revoke");
923 jbd2_journal_cancel_revoke(handle, jh); 924 jbd2_journal_cancel_revoke(handle, jh);
924 jbd2_journal_put_journal_head(jh);
925out: 925out:
926 jbd2_journal_put_journal_head(jh);
926 return err; 927 return err;
927} 928}
928 929
@@ -2147,6 +2148,13 @@ int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode)
2147 jinode->i_next_transaction == transaction) 2148 jinode->i_next_transaction == transaction)
2148 goto done; 2149 goto done;
2149 2150
2151 /*
2152 * We only ever set this variable to 1 so the test is safe. Since
2153 * t_need_data_flush is likely to be set, we do the test to save some
 2154 * cacheline bouncing.
2155 */
2156 if (!transaction->t_need_data_flush)
2157 transaction->t_need_data_flush = 1;
2150 /* On some different transaction's list - should be 2158 /* On some different transaction's list - should be
2151 * the committing one */ 2159 * the committing one */
2152 if (jinode->i_transaction) { 2160 if (jinode->i_transaction) {
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 82faddd1f321..05f73328b28b 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -609,6 +609,8 @@ static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry)
609 int ret; 609 int ret;
610 uint32_t now = get_seconds(); 610 uint32_t now = get_seconds();
611 611
612 dentry_unhash(dentry);
613
612 for (fd = f->dents ; fd; fd = fd->next) { 614 for (fd = f->dents ; fd; fd = fd->next) {
613 if (fd->ino) 615 if (fd->ino)
614 return -ENOTEMPTY; 616 return -ENOTEMPTY;
@@ -784,6 +786,9 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
784 uint8_t type; 786 uint8_t type;
785 uint32_t now; 787 uint32_t now;
786 788
789 if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
790 dentry_unhash(new_dentry);
791
787 /* The VFS will check for us and prevent trying to rename a 792 /* The VFS will check for us and prevent trying to rename a
788 * file over a directory and vice versa, but if it's a directory, 793 * file over a directory and vice versa, but if it's a directory,
789 * the VFS can't check whether the victim is empty. The filesystem 794 * the VFS can't check whether the victim is empty. The filesystem
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index eaaf2b511e89..865df16a6cf3 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -360,6 +360,8 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry)
360 360
361 jfs_info("jfs_rmdir: dip:0x%p name:%s", dip, dentry->d_name.name); 361 jfs_info("jfs_rmdir: dip:0x%p name:%s", dip, dentry->d_name.name);
362 362
363 dentry_unhash(dentry);
364
363 /* Init inode for quota operations. */ 365 /* Init inode for quota operations. */
364 dquot_initialize(dip); 366 dquot_initialize(dip);
365 dquot_initialize(ip); 367 dquot_initialize(ip);
@@ -1095,6 +1097,9 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1095 jfs_info("jfs_rename: %s %s", old_dentry->d_name.name, 1097 jfs_info("jfs_rename: %s %s", old_dentry->d_name.name,
1096 new_dentry->d_name.name); 1098 new_dentry->d_name.name);
1097 1099
1100 if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
1101 dentry_unhash(new_dentry);
1102
1098 dquot_initialize(old_dir); 1103 dquot_initialize(old_dir);
1099 dquot_initialize(new_dir); 1104 dquot_initialize(new_dir);
1100 1105
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index 9ed89d1663f8..f34c9cde9e94 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -273,6 +273,8 @@ static int logfs_rmdir(struct inode *dir, struct dentry *dentry)
273{ 273{
274 struct inode *inode = dentry->d_inode; 274 struct inode *inode = dentry->d_inode;
275 275
276 dentry_unhash(dentry);
277
276 if (!logfs_empty_dir(inode)) 278 if (!logfs_empty_dir(inode))
277 return -ENOTEMPTY; 279 return -ENOTEMPTY;
278 280
@@ -622,6 +624,9 @@ static int logfs_rename_cross(struct inode *old_dir, struct dentry *old_dentry,
622 loff_t pos; 624 loff_t pos;
623 int err; 625 int err;
624 626
627 if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
628 dentry_unhash(new_dentry);
629
625 /* 1. locate source dd */ 630 /* 1. locate source dd */
626 err = logfs_get_dd(old_dir, old_dentry, &dd, &pos); 631 err = logfs_get_dd(old_dir, old_dentry, &dd, &pos);
627 if (err) 632 if (err)
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index 6e6777f1b4b2..f60aed8db9c4 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -168,6 +168,8 @@ static int minix_rmdir(struct inode * dir, struct dentry *dentry)
168 struct inode * inode = dentry->d_inode; 168 struct inode * inode = dentry->d_inode;
169 int err = -ENOTEMPTY; 169 int err = -ENOTEMPTY;
170 170
171 dentry_unhash(dentry);
172
171 if (minix_empty_dir(inode)) { 173 if (minix_empty_dir(inode)) {
172 err = minix_unlink(dir, dentry); 174 err = minix_unlink(dir, dentry);
173 if (!err) { 175 if (!err) {
@@ -190,6 +192,9 @@ static int minix_rename(struct inode * old_dir, struct dentry *old_dentry,
190 struct minix_dir_entry * old_de; 192 struct minix_dir_entry * old_de;
191 int err = -ENOENT; 193 int err = -ENOENT;
192 194
195 if (new_inode && S_ISDIR(new_inode->i_mode))
196 dentry_unhash(new_dentry);
197
193 old_de = minix_find_entry(old_dentry, &old_page); 198 old_de = minix_find_entry(old_dentry, &old_page);
194 if (!old_de) 199 if (!old_de)
195 goto out; 200 goto out;
diff --git a/fs/mpage.c b/fs/mpage.c
index 0afc809e46e0..fdfae9fa98cd 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -27,6 +27,7 @@
27#include <linux/writeback.h> 27#include <linux/writeback.h>
28#include <linux/backing-dev.h> 28#include <linux/backing-dev.h>
29#include <linux/pagevec.h> 29#include <linux/pagevec.h>
30#include <linux/cleancache.h>
30 31
31/* 32/*
32 * I/O completion handler for multipage BIOs. 33 * I/O completion handler for multipage BIOs.
@@ -271,6 +272,12 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
271 SetPageMappedToDisk(page); 272 SetPageMappedToDisk(page);
272 } 273 }
273 274
275 if (fully_mapped && blocks_per_page == 1 && !PageUptodate(page) &&
276 cleancache_get_page(page) == 0) {
277 SetPageUptodate(page);
278 goto confused;
279 }
280
274 /* 281 /*
275 * This page will go to BIO. Do we need to send this BIO off first? 282 * This page will go to BIO. Do we need to send this BIO off first?
276 */ 283 */
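The mpage hunk above wires do_mpage_readpage() into cleancache: on a read miss, first ask the transcendent store for the page and only build a bio when it says no. A toy model of that control flow, assuming a simple in-memory table in place of the cleancache backend:

#include <stdio.h>
#include <string.h>

#define PAGE 16

static char store[4][PAGE];     /* toy cleancache, indexed by page no */
static int  present[4];

static int cleancache_get(int pgno, char *buf)
{
	if (pgno < 0 || pgno >= 4 || !present[pgno])
		return -1;
	memcpy(buf, store[pgno], PAGE);
	return 0;
}

static void read_page(int pgno, char *buf)
{
	if (cleancache_get(pgno, buf) == 0) {
		printf("page %d: satisfied from cleancache\n", pgno);
		return;                  /* SetPageUptodate, no bio */
	}
	memset(buf, 'D', PAGE);          /* pretend disk read */
	printf("page %d: read from disk\n", pgno);
}

int main(void)
{
	char buf[PAGE];
	memcpy(store[1], "cached contents", PAGE);
	present[1] = 1;
	read_page(0, buf);
	read_page(1, buf);
	return 0;
}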
diff --git a/fs/namei.c b/fs/namei.c
index 6ff858c049c0..2358b326b221 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -391,79 +391,28 @@ void path_put(struct path *path)
391} 391}
392EXPORT_SYMBOL(path_put); 392EXPORT_SYMBOL(path_put);
393 393
394/** 394/*
395 * nameidata_drop_rcu - drop this nameidata out of rcu-walk
396 * @nd: nameidata pathwalk data to drop
397 * Returns: 0 on success, -ECHILD on failure
398 *
399 * Path walking has 2 modes, rcu-walk and ref-walk (see 395 * Path walking has 2 modes, rcu-walk and ref-walk (see
400 * Documentation/filesystems/path-lookup.txt). __drop_rcu* functions attempt 396 * Documentation/filesystems/path-lookup.txt). In situations when we can't
401 * to drop out of rcu-walk mode and take normal reference counts on dentries 397 * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab
402 * and vfsmounts to transition to rcu-walk mode. __drop_rcu* functions take 398 * normal reference counts on dentries and vfsmounts to transition to rcu-walk
403 * refcounts at the last known good point before rcu-walk got stuck, so 399 * mode. Refcounts are grabbed at the last known good point before rcu-walk
404 * ref-walk may continue from there. If this is not successful (eg. a seqcount 400 * got stuck, so ref-walk may continue from there. If this is not successful
405 * has changed), then failure is returned and path walk restarts from the 401 * (eg. a seqcount has changed), then failure is returned and it's up to caller
406 * beginning in ref-walk mode. 402 * to restart the path walk from the beginning in ref-walk mode.
407 *
408 * nameidata_drop_rcu attempts to drop the current nd->path and nd->root into
409 * ref-walk. Must be called from rcu-walk context.
410 */ 403 */
411static int nameidata_drop_rcu(struct nameidata *nd)
412{
413 struct fs_struct *fs = current->fs;
414 struct dentry *dentry = nd->path.dentry;
415 int want_root = 0;
416
417 BUG_ON(!(nd->flags & LOOKUP_RCU));
418 if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
419 want_root = 1;
420 spin_lock(&fs->lock);
421 if (nd->root.mnt != fs->root.mnt ||
422 nd->root.dentry != fs->root.dentry)
423 goto err_root;
424 }
425 spin_lock(&dentry->d_lock);
426 if (!__d_rcu_to_refcount(dentry, nd->seq))
427 goto err;
428 BUG_ON(nd->inode != dentry->d_inode);
429 spin_unlock(&dentry->d_lock);
430 if (want_root) {
431 path_get(&nd->root);
432 spin_unlock(&fs->lock);
433 }
434 mntget(nd->path.mnt);
435
436 rcu_read_unlock();
437 br_read_unlock(vfsmount_lock);
438 nd->flags &= ~LOOKUP_RCU;
439 return 0;
440err:
441 spin_unlock(&dentry->d_lock);
442err_root:
443 if (want_root)
444 spin_unlock(&fs->lock);
445 return -ECHILD;
446}
447
448/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */
449static inline int nameidata_drop_rcu_maybe(struct nameidata *nd)
450{
451 if (nd->flags & LOOKUP_RCU)
452 return nameidata_drop_rcu(nd);
453 return 0;
454}
455 404
456/** 405/**
457 * nameidata_dentry_drop_rcu - drop nameidata and dentry out of rcu-walk 406 * unlazy_walk - try to switch to ref-walk mode.
458 * @nd: nameidata pathwalk data to drop 407 * @nd: nameidata pathwalk data
459 * @dentry: dentry to drop 408 * @dentry: child of nd->path.dentry or NULL
460 * Returns: 0 on success, -ECHILD on failure 409 * Returns: 0 on success, -ECHILD on failure
461 * 410 *
462 * nameidata_dentry_drop_rcu attempts to drop the current nd->path and nd->root, 411 * unlazy_walk attempts to legitimize the current nd->path, nd->root and dentry
463 * and dentry into ref-walk. @dentry must be a path found by a do_lookup call on 412 * for ref-walk mode. @dentry must be a path found by a do_lookup call on
464 * @nd. Must be called from rcu-walk context. 413 * @nd or NULL. Must be called from rcu-walk context.
465 */ 414 */
466static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry) 415static int unlazy_walk(struct nameidata *nd, struct dentry *dentry)
467{ 416{
468 struct fs_struct *fs = current->fs; 417 struct fs_struct *fs = current->fs;
469 struct dentry *parent = nd->path.dentry; 418 struct dentry *parent = nd->path.dentry;
@@ -478,18 +427,25 @@ static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry
478 goto err_root; 427 goto err_root;
479 } 428 }
480 spin_lock(&parent->d_lock); 429 spin_lock(&parent->d_lock);
481 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); 430 if (!dentry) {
482 if (!__d_rcu_to_refcount(dentry, nd->seq)) 431 if (!__d_rcu_to_refcount(parent, nd->seq))
483 goto err; 432 goto err_parent;
484 /* 433 BUG_ON(nd->inode != parent->d_inode);
485 * If the sequence check on the child dentry passed, then the child has 434 } else {
486 * not been removed from its parent. This means the parent dentry must 435 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
487 * be valid and able to take a reference at this point. 436 if (!__d_rcu_to_refcount(dentry, nd->seq))
488 */ 437 goto err_child;
489 BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent); 438 /*
490 BUG_ON(!parent->d_count); 439 * If the sequence check on the child dentry passed, then
491 parent->d_count++; 440 * the child has not been removed from its parent. This
492 spin_unlock(&dentry->d_lock); 441 * means the parent dentry must be valid and able to take
442 * a reference at this point.
443 */
444 BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent);
445 BUG_ON(!parent->d_count);
446 parent->d_count++;
447 spin_unlock(&dentry->d_lock);
448 }
493 spin_unlock(&parent->d_lock); 449 spin_unlock(&parent->d_lock);
494 if (want_root) { 450 if (want_root) {
495 path_get(&nd->root); 451 path_get(&nd->root);
@@ -501,8 +457,10 @@ static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry
501 br_read_unlock(vfsmount_lock); 457 br_read_unlock(vfsmount_lock);
502 nd->flags &= ~LOOKUP_RCU; 458 nd->flags &= ~LOOKUP_RCU;
503 return 0; 459 return 0;
504err: 460
461err_child:
505 spin_unlock(&dentry->d_lock); 462 spin_unlock(&dentry->d_lock);
463err_parent:
506 spin_unlock(&parent->d_lock); 464 spin_unlock(&parent->d_lock);
507err_root: 465err_root:
508 if (want_root) 466 if (want_root)
@@ -510,59 +468,6 @@ err_root:
510 return -ECHILD; 468 return -ECHILD;
511} 469}
512 470
513/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */
514static inline int nameidata_dentry_drop_rcu_maybe(struct nameidata *nd, struct dentry *dentry)
515{
516 if (nd->flags & LOOKUP_RCU) {
517 if (unlikely(nameidata_dentry_drop_rcu(nd, dentry))) {
518 nd->flags &= ~LOOKUP_RCU;
519 if (!(nd->flags & LOOKUP_ROOT))
520 nd->root.mnt = NULL;
521 rcu_read_unlock();
522 br_read_unlock(vfsmount_lock);
523 return -ECHILD;
524 }
525 }
526 return 0;
527}
528
529/**
530 * nameidata_drop_rcu_last - drop nameidata ending path walk out of rcu-walk
531 * @nd: nameidata pathwalk data to drop
532 * Returns: 0 on success, -ECHILD on failure
533 *
534 * nameidata_drop_rcu_last attempts to drop the current nd->path into ref-walk.
535 * nd->path should be the final element of the lookup, so nd->root is discarded.
536 * Must be called from rcu-walk context.
537 */
538static int nameidata_drop_rcu_last(struct nameidata *nd)
539{
540 struct dentry *dentry = nd->path.dentry;
541
542 BUG_ON(!(nd->flags & LOOKUP_RCU));
543 nd->flags &= ~LOOKUP_RCU;
544 if (!(nd->flags & LOOKUP_ROOT))
545 nd->root.mnt = NULL;
546 spin_lock(&dentry->d_lock);
547 if (!__d_rcu_to_refcount(dentry, nd->seq))
548 goto err_unlock;
549 BUG_ON(nd->inode != dentry->d_inode);
550 spin_unlock(&dentry->d_lock);
551
552 mntget(nd->path.mnt);
553
554 rcu_read_unlock();
555 br_read_unlock(vfsmount_lock);
556
557 return 0;
558
559err_unlock:
560 spin_unlock(&dentry->d_lock);
561 rcu_read_unlock();
562 br_read_unlock(vfsmount_lock);
563 return -ECHILD;
564}
565
566/** 471/**
567 * release_open_intent - free up open intent resources 472 * release_open_intent - free up open intent resources
568 * @nd: pointer to nameidata 473 * @nd: pointer to nameidata
@@ -606,26 +511,39 @@ do_revalidate(struct dentry *dentry, struct nameidata *nd)
606 return dentry; 511 return dentry;
607} 512}
608 513
609/* 514/**
610 * handle_reval_path - force revalidation of a dentry 515 * complete_walk - successful completion of path walk
 611 * 516 * @nd: pointer to nameidata
612 * In some situations the path walking code will trust dentries without
613 * revalidating them. This causes problems for filesystems that depend on
614 * d_revalidate to handle file opens (e.g. NFSv4). When FS_REVAL_DOT is set
615 * (which indicates that it's possible for the dentry to go stale), force
616 * a d_revalidate call before proceeding.
617 * 517 *
618 * Returns 0 if the revalidation was successful. If the revalidation fails, 518 * If we had been in RCU mode, drop out of it and legitimize nd->path.
619 * either return the error returned by d_revalidate or -ESTALE if the 519 * Revalidate the final result, unless we'd already done that during
620 * revalidation it just returned 0. If d_revalidate returns 0, we attempt to 520 * the path walk or the filesystem doesn't ask for it. Return 0 on
621 * invalidate the dentry. It's up to the caller to handle putting references 521 * success, -error on failure. In case of failure caller does not
622 * to the path if necessary. 522 * need to drop nd->path.
623 */ 523 */
624static inline int handle_reval_path(struct nameidata *nd) 524static int complete_walk(struct nameidata *nd)
625{ 525{
626 struct dentry *dentry = nd->path.dentry; 526 struct dentry *dentry = nd->path.dentry;
627 int status; 527 int status;
628 528
529 if (nd->flags & LOOKUP_RCU) {
530 nd->flags &= ~LOOKUP_RCU;
531 if (!(nd->flags & LOOKUP_ROOT))
532 nd->root.mnt = NULL;
533 spin_lock(&dentry->d_lock);
534 if (unlikely(!__d_rcu_to_refcount(dentry, nd->seq))) {
535 spin_unlock(&dentry->d_lock);
536 rcu_read_unlock();
537 br_read_unlock(vfsmount_lock);
538 return -ECHILD;
539 }
540 BUG_ON(nd->inode != dentry->d_inode);
541 spin_unlock(&dentry->d_lock);
542 mntget(nd->path.mnt);
543 rcu_read_unlock();
544 br_read_unlock(vfsmount_lock);
545 }
546
629 if (likely(!(nd->flags & LOOKUP_JUMPED))) 547 if (likely(!(nd->flags & LOOKUP_JUMPED)))
630 return 0; 548 return 0;
631 549
@@ -643,6 +561,7 @@ static inline int handle_reval_path(struct nameidata *nd)
643 if (!status) 561 if (!status)
644 status = -ESTALE; 562 status = -ESTALE;
645 563
564 path_put(&nd->path);
646 return status; 565 return status;
647} 566}
648 567
@@ -1241,13 +1160,8 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
1241 if (likely(__follow_mount_rcu(nd, path, inode, false))) 1160 if (likely(__follow_mount_rcu(nd, path, inode, false)))
1242 return 0; 1161 return 0;
1243unlazy: 1162unlazy:
1244 if (dentry) { 1163 if (unlazy_walk(nd, dentry))
1245 if (nameidata_dentry_drop_rcu(nd, dentry)) 1164 return -ECHILD;
1246 return -ECHILD;
1247 } else {
1248 if (nameidata_drop_rcu(nd))
1249 return -ECHILD;
1250 }
1251 } else { 1165 } else {
1252 dentry = __d_lookup(parent, name); 1166 dentry = __d_lookup(parent, name);
1253 } 1167 }
@@ -1303,7 +1217,7 @@ static inline int may_lookup(struct nameidata *nd)
1303 int err = exec_permission(nd->inode, IPERM_FLAG_RCU); 1217 int err = exec_permission(nd->inode, IPERM_FLAG_RCU);
1304 if (err != -ECHILD) 1218 if (err != -ECHILD)
1305 return err; 1219 return err;
1306 if (nameidata_drop_rcu(nd)) 1220 if (unlazy_walk(nd, NULL))
1307 return -ECHILD; 1221 return -ECHILD;
1308 } 1222 }
1309 return exec_permission(nd->inode, 0); 1223 return exec_permission(nd->inode, 0);
@@ -1357,8 +1271,12 @@ static inline int walk_component(struct nameidata *nd, struct path *path,
1357 return -ENOENT; 1271 return -ENOENT;
1358 } 1272 }
1359 if (unlikely(inode->i_op->follow_link) && follow) { 1273 if (unlikely(inode->i_op->follow_link) && follow) {
1360 if (nameidata_dentry_drop_rcu_maybe(nd, path->dentry)) 1274 if (nd->flags & LOOKUP_RCU) {
1361 return -ECHILD; 1275 if (unlikely(unlazy_walk(nd, path->dentry))) {
1276 terminate_walk(nd);
1277 return -ECHILD;
1278 }
1279 }
1362 BUG_ON(inode != path->dentry->d_inode); 1280 BUG_ON(inode != path->dentry->d_inode);
1363 return 1; 1281 return 1;
1364 } 1282 }
@@ -1657,18 +1575,8 @@ static int path_lookupat(int dfd, const char *name,
1657 } 1575 }
1658 } 1576 }
1659 1577
1660 if (nd->flags & LOOKUP_RCU) { 1578 if (!err)
1661 /* went all way through without dropping RCU */ 1579 err = complete_walk(nd);
1662 BUG_ON(err);
1663 if (nameidata_drop_rcu_last(nd))
1664 err = -ECHILD;
1665 }
1666
1667 if (!err) {
1668 err = handle_reval_path(nd);
1669 if (err)
1670 path_put(&nd->path);
1671 }
1672 1580
1673 if (!err && nd->flags & LOOKUP_DIRECTORY) { 1581 if (!err && nd->flags & LOOKUP_DIRECTORY) {
1674 if (!nd->inode->i_op->lookup) { 1582 if (!nd->inode->i_op->lookup) {
@@ -2134,13 +2042,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
2134 return ERR_PTR(error); 2042 return ERR_PTR(error);
2135 /* fallthrough */ 2043 /* fallthrough */
2136 case LAST_ROOT: 2044 case LAST_ROOT:
2137 if (nd->flags & LOOKUP_RCU) { 2045 error = complete_walk(nd);
2138 if (nameidata_drop_rcu_last(nd))
2139 return ERR_PTR(-ECHILD);
2140 }
2141 error = handle_reval_path(nd);
2142 if (error) 2046 if (error)
2143 goto exit; 2047 return ERR_PTR(error);
2144 audit_inode(pathname, nd->path.dentry); 2048 audit_inode(pathname, nd->path.dentry);
2145 if (open_flag & O_CREAT) { 2049 if (open_flag & O_CREAT) {
2146 error = -EISDIR; 2050 error = -EISDIR;
@@ -2148,10 +2052,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
2148 } 2052 }
2149 goto ok; 2053 goto ok;
2150 case LAST_BIND: 2054 case LAST_BIND:
2151 /* can't be RCU mode here */ 2055 error = complete_walk(nd);
2152 error = handle_reval_path(nd);
2153 if (error) 2056 if (error)
2154 goto exit; 2057 return ERR_PTR(error);
2155 audit_inode(pathname, dir); 2058 audit_inode(pathname, dir);
2156 goto ok; 2059 goto ok;
2157 } 2060 }
@@ -2170,10 +2073,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
2170 if (error) /* symlink */ 2073 if (error) /* symlink */
2171 return NULL; 2074 return NULL;
2172 /* sayonara */ 2075 /* sayonara */
2173 if (nd->flags & LOOKUP_RCU) { 2076 error = complete_walk(nd);
2174 if (nameidata_drop_rcu_last(nd)) 2077 if (error)
2175 return ERR_PTR(-ECHILD); 2078 return ERR_PTR(-ECHILD);
2176 }
2177 2079
2178 error = -ENOTDIR; 2080 error = -ENOTDIR;
2179 if (nd->flags & LOOKUP_DIRECTORY) { 2081 if (nd->flags & LOOKUP_DIRECTORY) {
@@ -2185,11 +2087,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
2185 } 2087 }
2186 2088
2187 /* create side of things */ 2089 /* create side of things */
2188 2090 error = complete_walk(nd);
2189 if (nd->flags & LOOKUP_RCU) { 2091 if (error)
2190 if (nameidata_drop_rcu_last(nd)) 2092 return ERR_PTR(error);
2191 return ERR_PTR(-ECHILD);
2192 }
2193 2093
2194 audit_inode(pathname, dir); 2094 audit_inode(pathname, dir);
2195 error = -EISDIR; 2095 error = -EISDIR;
@@ -2629,10 +2529,10 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, int, mode)
2629} 2529}
2630 2530
2631/* 2531/*
2632 * We try to drop the dentry early: we should have 2532 * The dentry_unhash() helper will try to drop the dentry early: we
2633 * a usage count of 2 if we're the only user of this 2533 * should have a usage count of 1 if we're the only user of this
2634 * dentry, and if that is true (possibly after pruning 2534 * dentry, and if that is true (possibly after pruning the dcache),
2635 * the dcache), then we drop the dentry now. 2535 * then we drop the dentry now.
2636 * 2536 *
2637 * A low-level filesystem can, if it chooses, legally 2537
2638 * do a 2538 * do a
@@ -2645,10 +2545,9 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, int, mode)
2645 */ 2545 */
2646void dentry_unhash(struct dentry *dentry) 2546void dentry_unhash(struct dentry *dentry)
2647{ 2547{
2648 dget(dentry);
2649 shrink_dcache_parent(dentry); 2548 shrink_dcache_parent(dentry);
2650 spin_lock(&dentry->d_lock); 2549 spin_lock(&dentry->d_lock);
2651 if (dentry->d_count == 2) 2550 if (dentry->d_count == 1)
2652 __d_drop(dentry); 2551 __d_drop(dentry);
2653 spin_unlock(&dentry->d_lock); 2552 spin_unlock(&dentry->d_lock);
2654} 2553}
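Why the magic number in dentry_unhash() above moves from 2 to 1: the helper no longer takes its own dget() reference, so when the caller is the sole user the count it observes is 1 (caller only) rather than 2 (caller plus helper). A toy refcount model of before and after:

#include <stdio.h>

struct obj { int count; };

static void unhash_old(struct obj *o)
{
	o->count++;                      /* the helper's own dget() */
	if (o->count == 2)               /* caller + helper */
		printf("old: drop (count=%d)\n", o->count);
	o->count--;                      /* dput() happened in the caller */
}

static void unhash_new(struct obj *o)
{
	if (o->count == 1)               /* caller only */
		printf("new: drop (count=%d)\n", o->count);
}

int main(void)
{
	struct obj a = { .count = 1 };
	unhash_old(&a);
	unhash_new(&a);
	return 0;
}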
@@ -2664,25 +2563,26 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
2664 return -EPERM; 2563 return -EPERM;
2665 2564
2666 mutex_lock(&dentry->d_inode->i_mutex); 2565 mutex_lock(&dentry->d_inode->i_mutex);
2667 dentry_unhash(dentry); 2566
2567 error = -EBUSY;
2668 if (d_mountpoint(dentry)) 2568 if (d_mountpoint(dentry))
2669 error = -EBUSY; 2569 goto out;
2670 else { 2570
2671 error = security_inode_rmdir(dir, dentry); 2571 error = security_inode_rmdir(dir, dentry);
2672 if (!error) { 2572 if (error)
2673 error = dir->i_op->rmdir(dir, dentry); 2573 goto out;
2674 if (!error) { 2574
2675 dentry->d_inode->i_flags |= S_DEAD; 2575 error = dir->i_op->rmdir(dir, dentry);
2676 dont_mount(dentry); 2576 if (error)
2677 } 2577 goto out;
2678 } 2578
2679 } 2579 dentry->d_inode->i_flags |= S_DEAD;
2580 dont_mount(dentry);
2581
2582out:
2680 mutex_unlock(&dentry->d_inode->i_mutex); 2583 mutex_unlock(&dentry->d_inode->i_mutex);
2681 if (!error) { 2584 if (!error)
2682 d_delete(dentry); 2585 d_delete(dentry);
2683 }
2684 dput(dentry);
2685
2686 return error; 2586 return error;
2687} 2587}
2688 2588
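The vfs_rmdir() rewrite above is a pure restructuring: nested success-path ifs become early "goto out" bailouts, so the unlock runs on every path and the success path reads straight down. The same flattening in miniature (the checks themselves are placeholders):

#include <stdio.h>

static int busy, denied;

static int rmdir_model(void)
{
	int error;

	printf("lock\n");

	error = -16;             /* -EBUSY  */
	if (busy)
		goto out;

	error = -13;             /* -EACCES */
	if (denied)
		goto out;

	/* the actual removal would happen here */
	error = 0;
out:
	printf("unlock\n");      /* single exit: always runs */
	return error;
}

int main(void)
{
	busy = 1;
	printf("error=%d\n", rmdir_model());
	return 0;
}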
@@ -3053,12 +2953,7 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname
3053 * HOWEVER, it relies on the assumption that any object with ->lookup() 2953 * HOWEVER, it relies on the assumption that any object with ->lookup()
3054 * has no more than 1 dentry. If "hybrid" objects will ever appear, 2954 * has no more than 1 dentry. If "hybrid" objects will ever appear,
3055 * we'd better make sure that there's no link(2) for them. 2955 * we'd better make sure that there's no link(2) for them.
3056 * d) some filesystems don't support opened-but-unlinked directories, 2956 * d) conversion from fhandle to dentry may come in the wrong moment - when
3057 * either because of layout or because they are not ready to deal with
3058 * all cases correctly. The latter will be fixed (taking this sort of
3059 * stuff into VFS), but the former is not going away. Solution: the same
3060 * trick as in rmdir().
3061 * e) conversion from fhandle to dentry may come in the wrong moment - when
3062 * we are removing the target. Solution: we will have to grab ->i_mutex 2957 * we are removing the target. Solution: we will have to grab ->i_mutex
3063 * in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on 2958 * in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
3064 * ->i_mutex on parents, which works but leads to some truly excessive 2959 * ->i_mutex on parents, which works but leads to some truly excessive
@@ -3068,7 +2963,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
3068 struct inode *new_dir, struct dentry *new_dentry) 2963 struct inode *new_dir, struct dentry *new_dentry)
3069{ 2964{
3070 int error = 0; 2965 int error = 0;
3071 struct inode *target; 2966 struct inode *target = new_dentry->d_inode;
3072 2967
3073 /* 2968 /*
3074 * If we are going to change the parent - check write permissions, 2969 * If we are going to change the parent - check write permissions,
@@ -3084,26 +2979,24 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
3084 if (error) 2979 if (error)
3085 return error; 2980 return error;
3086 2981
3087 target = new_dentry->d_inode;
3088 if (target) 2982 if (target)
3089 mutex_lock(&target->i_mutex); 2983 mutex_lock(&target->i_mutex);
3090 if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry)) 2984
3091 error = -EBUSY; 2985 error = -EBUSY;
3092 else { 2986 if (d_mountpoint(old_dentry) || d_mountpoint(new_dentry))
3093 if (target) 2987 goto out;
3094 dentry_unhash(new_dentry); 2988
3095 error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); 2989 error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
3096 } 2990 if (error)
2991 goto out;
2992
3097 if (target) { 2993 if (target) {
3098 if (!error) { 2994 target->i_flags |= S_DEAD;
3099 target->i_flags |= S_DEAD; 2995 dont_mount(new_dentry);
3100 dont_mount(new_dentry);
3101 }
3102 mutex_unlock(&target->i_mutex);
3103 if (d_unhashed(new_dentry))
3104 d_rehash(new_dentry);
3105 dput(new_dentry);
3106 } 2996 }
2997out:
2998 if (target)
2999 mutex_unlock(&target->i_mutex);
3107 if (!error) 3000 if (!error)
3108 if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) 3001 if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
3109 d_move(old_dentry,new_dentry); 3002 d_move(old_dentry,new_dentry);
@@ -3113,7 +3006,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
3113static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry, 3006static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
3114 struct inode *new_dir, struct dentry *new_dentry) 3007 struct inode *new_dir, struct dentry *new_dentry)
3115{ 3008{
3116 struct inode *target; 3009 struct inode *target = new_dentry->d_inode;
3117 int error; 3010 int error;
3118 3011
3119 error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry); 3012 error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry);
@@ -3121,19 +3014,22 @@ static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
3121 return error; 3014 return error;
3122 3015
3123 dget(new_dentry); 3016 dget(new_dentry);
3124 target = new_dentry->d_inode;
3125 if (target) 3017 if (target)
3126 mutex_lock(&target->i_mutex); 3018 mutex_lock(&target->i_mutex);
3019
3020 error = -EBUSY;
3127 if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry)) 3021 if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry))
3128 error = -EBUSY; 3022 goto out;
3129 else 3023
3130 error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); 3024 error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
3131 if (!error) { 3025 if (error)
3132 if (target) 3026 goto out;
3133 dont_mount(new_dentry); 3027
3134 if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) 3028 if (target)
3135 d_move(old_dentry, new_dentry); 3029 dont_mount(new_dentry);
3136 } 3030 if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
3031 d_move(old_dentry, new_dentry);
3032out:
3137 if (target) 3033 if (target)
3138 mutex_unlock(&target->i_mutex); 3034 mutex_unlock(&target->i_mutex);
3139 dput(new_dentry); 3035 dput(new_dentry);
diff --git a/fs/namespace.c b/fs/namespace.c
index d99bcf59e4c2..fe59bd145d21 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1695,7 +1695,7 @@ static int graft_tree(struct vfsmount *mnt, struct path *path)
1695 1695
1696static int flags_to_propagation_type(int flags) 1696static int flags_to_propagation_type(int flags)
1697{ 1697{
1698 int type = flags & ~MS_REC; 1698 int type = flags & ~(MS_REC | MS_SILENT);
1699 1699
1700 /* Fail if any non-propagation flags are set */ 1700 /* Fail if any non-propagation flags are set */
1701 if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) 1701 if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
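
With MS_SILENT now masked off alongside MS_REC, a mount(2) propagation-change request that happens to carry MS_SILENT is no longer rejected for having a stray flag set. A small userspace illustration (assuming /mnt is a mount point on a kernel with this fix):

	#include <stdio.h>
	#include <sys/mount.h>

	int main(void)
	{
		/* Previously failed with EINVAL because MS_SILENT survived
		 * the mask; now only the propagation type bits are checked. */
		if (mount(NULL, "/mnt", NULL, MS_SHARED | MS_SILENT, NULL) != 0)
			perror("mount");
		return 0;
	}
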
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index f6946bb5cb55..e3e646b06404 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -1033,6 +1033,8 @@ static int ncp_rmdir(struct inode *dir, struct dentry *dentry)
1033 DPRINTK("ncp_rmdir: removing %s/%s\n", 1033 DPRINTK("ncp_rmdir: removing %s/%s\n",
1034 dentry->d_parent->d_name.name, dentry->d_name.name); 1034 dentry->d_parent->d_name.name, dentry->d_name.name);
1035 1035
1036 dentry_unhash(dentry);
1037
1036 error = -EBUSY; 1038 error = -EBUSY;
1037 if (!d_unhashed(dentry)) 1039 if (!d_unhashed(dentry))
1038 goto out; 1040 goto out;
@@ -1139,6 +1141,9 @@ static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry,
1139 old_dentry->d_parent->d_name.name, old_dentry->d_name.name, 1141 old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
1140 new_dentry->d_parent->d_name.name, new_dentry->d_name.name); 1142 new_dentry->d_parent->d_name.name, new_dentry->d_name.name);
1141 1143
1144 if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
1145 dentry_unhash(new_dentry);
1146
1142 ncp_age_dentry(server, old_dentry); 1147 ncp_age_dentry(server, old_dentry);
1143 ncp_age_dentry(server, new_dentry); 1148 ncp_age_dentry(server, new_dentry);
1144 1149
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 546849b3e88f..1102a5fbb744 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -334,6 +334,8 @@ static int nilfs_rmdir(struct inode *dir, struct dentry *dentry)
334 struct nilfs_transaction_info ti; 334 struct nilfs_transaction_info ti;
335 int err; 335 int err;
336 336
337 dentry_unhash(dentry);
338
337 err = nilfs_transaction_begin(dir->i_sb, &ti, 0); 339 err = nilfs_transaction_begin(dir->i_sb, &ti, 0);
338 if (err) 340 if (err)
339 return err; 341 return err;
@@ -369,6 +371,9 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
369 struct nilfs_transaction_info ti; 371 struct nilfs_transaction_info ti;
370 int err; 372 int err;
371 373
374 if (new_inode && S_ISDIR(new_inode->i_mode))
375 dentry_unhash(new_dentry);
376
372 err = nilfs_transaction_begin(old_dir->i_sb, &ti, 1); 377 err = nilfs_transaction_begin(old_dir->i_sb, &ti, 1);
373 if (unlikely(err)) 378 if (unlikely(err))
374 return err; 379 return err;
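
The ncpfs and nilfs2 hunks follow the same new convention on the rename side: only a directory victim needs the early unhash, and the filesystem performs it itself before starting its own transaction. Schematically, with foo_do_rename() standing in for the per-filesystem work (an assumed helper, not real API):

	static int foo_rename(struct inode *old_dir, struct dentry *old_dentry,
			      struct inode *new_dir, struct dentry *new_dentry)
	{
		/* Unhash only a directory target that is about to be
		 * overwritten; plain-file targets need no such treatment. */
		if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
			dentry_unhash(new_dentry);

		return foo_do_rename(old_dir, old_dentry, new_dir, new_dentry);
	}
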
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index d8a0313e99e6..f17e58b32989 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -30,6 +30,7 @@ ocfs2-objs := \
30 namei.o \ 30 namei.o \
31 refcounttree.o \ 31 refcounttree.o \
32 reservations.o \ 32 reservations.o \
33 move_extents.o \
33 resize.o \ 34 resize.o \
34 slot_map.o \ 35 slot_map.o \
35 suballoc.o \ 36 suballoc.o \
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 48aa9c7401c7..ed553c60de82 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -29,6 +29,7 @@
29#include <linux/highmem.h> 29#include <linux/highmem.h>
30#include <linux/swap.h> 30#include <linux/swap.h>
31#include <linux/quotaops.h> 31#include <linux/quotaops.h>
32#include <linux/blkdev.h>
32 33
33#include <cluster/masklog.h> 34#include <cluster/masklog.h>
34 35
@@ -7184,3 +7185,168 @@ out_commit:
7184out: 7185out:
7185 return ret; 7186 return ret;
7186} 7187}
7188
7189static int ocfs2_trim_extent(struct super_block *sb,
7190 struct ocfs2_group_desc *gd,
7191 u32 start, u32 count)
7192{
7193 u64 discard, bcount;
7194
7195 bcount = ocfs2_clusters_to_blocks(sb, count);
7196 discard = le64_to_cpu(gd->bg_blkno) +
7197 ocfs2_clusters_to_blocks(sb, start);
7198
7199 trace_ocfs2_trim_extent(sb, (unsigned long long)discard, bcount);
7200
7201 return sb_issue_discard(sb, discard, bcount, GFP_NOFS, 0);
7202}
7203
7204static int ocfs2_trim_group(struct super_block *sb,
7205 struct ocfs2_group_desc *gd,
7206 u32 start, u32 max, u32 minbits)
7207{
7208 int ret = 0, count = 0, next;
7209 void *bitmap = gd->bg_bitmap;
7210
7211 if (le16_to_cpu(gd->bg_free_bits_count) < minbits)
7212 return 0;
7213
7214 trace_ocfs2_trim_group((unsigned long long)le64_to_cpu(gd->bg_blkno),
7215 start, max, minbits);
7216
7217 while (start < max) {
7218 start = ocfs2_find_next_zero_bit(bitmap, max, start);
7219 if (start >= max)
7220 break;
7221 next = ocfs2_find_next_bit(bitmap, max, start);
7222
7223 if ((next - start) >= minbits) {
7224 ret = ocfs2_trim_extent(sb, gd,
7225 start, next - start);
7226 if (ret < 0) {
7227 mlog_errno(ret);
7228 break;
7229 }
7230 count += next - start;
7231 }
7232 start = next + 1;
7233
7234 if (fatal_signal_pending(current)) {
7235 count = -ERESTARTSYS;
7236 break;
7237 }
7238
7239 if ((le16_to_cpu(gd->bg_free_bits_count) - count) < minbits)
7240 break;
7241 }
7242
7243 if (ret < 0)
7244 count = ret;
7245
7246 return count;
7247}
7248
7249int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
7250{
7251 struct ocfs2_super *osb = OCFS2_SB(sb);
7252 u64 start, len, trimmed, first_group, last_group, group;
7253 int ret, cnt;
7254 u32 first_bit, last_bit, minlen;
7255 struct buffer_head *main_bm_bh = NULL;
7256 struct inode *main_bm_inode = NULL;
7257 struct buffer_head *gd_bh = NULL;
7258 struct ocfs2_dinode *main_bm;
7259 struct ocfs2_group_desc *gd = NULL;
7260
7261 start = range->start >> osb->s_clustersize_bits;
7262 len = range->len >> osb->s_clustersize_bits;
7263 minlen = range->minlen >> osb->s_clustersize_bits;
7264 trimmed = 0;
7265
7266 if (!len) {
7267 range->len = 0;
7268 return 0;
7269 }
7270
7271 if (minlen >= osb->bitmap_cpg)
7272 return -EINVAL;
7273
7274 main_bm_inode = ocfs2_get_system_file_inode(osb,
7275 GLOBAL_BITMAP_SYSTEM_INODE,
7276 OCFS2_INVALID_SLOT);
7277 if (!main_bm_inode) {
7278 ret = -EIO;
7279 mlog_errno(ret);
7280 goto out;
7281 }
7282
7283 mutex_lock(&main_bm_inode->i_mutex);
7284
7285 ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 0);
7286 if (ret < 0) {
7287 mlog_errno(ret);
7288 goto out_mutex;
7289 }
7290 main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data;
7291
7292 if (start >= le32_to_cpu(main_bm->i_clusters)) {
7293 ret = -EINVAL;
7294 goto out_unlock;
7295 }
7296
7297 if (start + len > le32_to_cpu(main_bm->i_clusters))
7298 len = le32_to_cpu(main_bm->i_clusters) - start;
7299
7300 trace_ocfs2_trim_fs(start, len, minlen);
7301
7302 /* Determine first and last group to examine based on start and len */
7303 first_group = ocfs2_which_cluster_group(main_bm_inode, start);
7304 if (first_group == osb->first_cluster_group_blkno)
7305 first_bit = start;
7306 else
7307 first_bit = start - ocfs2_blocks_to_clusters(sb, first_group);
7308 last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1);
7309 last_bit = osb->bitmap_cpg;
7310
7311 for (group = first_group; group <= last_group;) {
7312 if (first_bit + len >= osb->bitmap_cpg)
7313 last_bit = osb->bitmap_cpg;
7314 else
7315 last_bit = first_bit + len;
7316
7317 ret = ocfs2_read_group_descriptor(main_bm_inode,
7318 main_bm, group,
7319 &gd_bh);
7320 if (ret < 0) {
7321 mlog_errno(ret);
7322 break;
7323 }
7324
7325 gd = (struct ocfs2_group_desc *)gd_bh->b_data;
7326 cnt = ocfs2_trim_group(sb, gd, first_bit, last_bit, minlen);
7327 brelse(gd_bh);
7328 gd_bh = NULL;
7329 if (cnt < 0) {
7330 ret = cnt;
7331 mlog_errno(ret);
7332 break;
7333 }
7334
7335 trimmed += cnt;
7336 len -= osb->bitmap_cpg - first_bit;
7337 first_bit = 0;
7338 if (group == osb->first_cluster_group_blkno)
7339 group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
7340 else
7341 group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
7342 }
7343 range->len = trimmed * sb->s_blocksize;
7344out_unlock:
7345 ocfs2_inode_unlock(main_bm_inode, 0);
7346 brelse(main_bm_bh);
7347out_mutex:
7348 mutex_unlock(&main_bm_inode->i_mutex);
7349 iput(main_bm_inode);
7350out:
7351 return ret;
7352}
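
ocfs2_trim_group() above is a classic free-run scan: it alternates find_next_zero_bit() and find_next_bit() to delimit maximal runs of free clusters, and only discards runs of at least minbits clusters. A self-contained userspace mock of the same loop, kept deliberately small:

	#include <stdio.h>

	/* Find maximal zero-bit runs of at least 'minbits' bits; bit i of
	 * 'bitmap' models cluster i, with 1 meaning allocated. */
	static void scan_runs(unsigned long bitmap, unsigned int max,
			      unsigned int minbits)
	{
		unsigned int start = 0, next;

		while (start < max) {
			while (start < max && (bitmap >> start & 1))
				start++;		/* find_next_zero_bit() */
			if (start >= max)
				break;
			next = start;
			while (next < max && !(bitmap >> next & 1))
				next++;			/* find_next_bit() */
			if (next - start >= minbits)
				printf("trim clusters [%u, %u)\n", start, next);
			start = next + 1;
		}
	}

	int main(void)
	{
		scan_runs(0x73, 8, 2);	/* 01110011: prints [2, 4), skips bit 7 */
		return 0;
	}
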
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 3bd08a03251c..ca381c584127 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -239,6 +239,7 @@ int ocfs2_find_leaf(struct ocfs2_caching_info *ci,
239 struct buffer_head **leaf_bh); 239 struct buffer_head **leaf_bh);
240int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster); 240int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster);
241 241
242int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range);
242/* 243/*
243 * Helper function to look at the # of clusters in an extent record. 244 * Helper function to look at the # of clusters in an extent record.
244 */ 245 */
diff --git a/fs/ocfs2/cluster/sys.c b/fs/ocfs2/cluster/sys.c
index bc702dab5d1f..a4b07730b2e1 100644
--- a/fs/ocfs2/cluster/sys.c
+++ b/fs/ocfs2/cluster/sys.c
@@ -57,7 +57,6 @@ static struct kset *o2cb_kset;
57void o2cb_sys_shutdown(void) 57void o2cb_sys_shutdown(void)
58{ 58{
59 mlog_sys_shutdown(); 59 mlog_sys_shutdown();
60 sysfs_remove_link(NULL, "o2cb");
61 kset_unregister(o2cb_kset); 60 kset_unregister(o2cb_kset);
62} 61}
63 62
@@ -69,14 +68,6 @@ int o2cb_sys_init(void)
69 if (!o2cb_kset) 68 if (!o2cb_kset)
70 return -ENOMEM; 69 return -ENOMEM;
71 70
72 /*
73 * Create this symlink for backwards compatibility with old
74 * versions of ocfs2-tools which look for things in /sys/o2cb.
75 */
76 ret = sysfs_create_link(NULL, &o2cb_kset->kobj, "o2cb");
77 if (ret)
78 goto error;
79
80 ret = sysfs_create_group(&o2cb_kset->kobj, &o2cb_attr_group); 71 ret = sysfs_create_group(&o2cb_kset->kobj, &o2cb_attr_group);
81 if (ret) 72 if (ret)
82 goto error; 73 goto error;
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 4bdf7baee344..d602abb51b61 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -144,6 +144,7 @@ struct dlm_ctxt
144 wait_queue_head_t dlm_join_events; 144 wait_queue_head_t dlm_join_events;
145 unsigned long live_nodes_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; 145 unsigned long live_nodes_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
146 unsigned long domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; 146 unsigned long domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
147 unsigned long exit_domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
147 unsigned long recovery_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; 148 unsigned long recovery_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
148 struct dlm_recovery_ctxt reco; 149 struct dlm_recovery_ctxt reco;
149 spinlock_t master_lock; 150 spinlock_t master_lock;
@@ -401,6 +402,18 @@ static inline int dlm_lvb_is_empty(char *lvb)
401 return 1; 402 return 1;
402} 403}
403 404
405static inline char *dlm_list_in_text(enum dlm_lockres_list idx)
406{
407 if (idx == DLM_GRANTED_LIST)
408 return "granted";
409 else if (idx == DLM_CONVERTING_LIST)
410 return "converting";
411 else if (idx == DLM_BLOCKED_LIST)
412 return "blocked";
413 else
414 return "unknown";
415}
416
404static inline struct list_head * 417static inline struct list_head *
405dlm_list_idx_to_ptr(struct dlm_lock_resource *res, enum dlm_lockres_list idx) 418dlm_list_idx_to_ptr(struct dlm_lock_resource *res, enum dlm_lockres_list idx)
406{ 419{
@@ -448,6 +461,7 @@ enum {
448 DLM_FINALIZE_RECO_MSG = 518, 461 DLM_FINALIZE_RECO_MSG = 518,
449 DLM_QUERY_REGION = 519, 462 DLM_QUERY_REGION = 519,
450 DLM_QUERY_NODEINFO = 520, 463 DLM_QUERY_NODEINFO = 520,
464 DLM_BEGIN_EXIT_DOMAIN_MSG = 521,
451}; 465};
452 466
453struct dlm_reco_node_data 467struct dlm_reco_node_data
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 04a32be0aeb9..56f82cb912e3 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -756,6 +756,12 @@ static int debug_state_print(struct dlm_ctxt *dlm, char *buf, int len)
756 buf + out, len - out); 756 buf + out, len - out);
757 out += snprintf(buf + out, len - out, "\n"); 757 out += snprintf(buf + out, len - out, "\n");
758 758
759 /* Exit Domain Map: xx xx xx */
760 out += snprintf(buf + out, len - out, "Exit Domain Map: ");
761 out += stringify_nodemap(dlm->exit_domain_map, O2NM_MAX_NODES,
762 buf + out, len - out);
763 out += snprintf(buf + out, len - out, "\n");
764
759 /* Live Map: xx xx xx */ 765 /* Live Map: xx xx xx */
760 out += snprintf(buf + out, len - out, "Live Map: "); 766 out += snprintf(buf + out, len - out, "Live Map: ");
761 out += stringify_nodemap(dlm->live_nodes_map, O2NM_MAX_NODES, 767 out += stringify_nodemap(dlm->live_nodes_map, O2NM_MAX_NODES,
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 3b179d6cbde0..6ed6b95dcf93 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -132,10 +132,12 @@ static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events);
132 * New in version 1.1: 132 * New in version 1.1:
133 * - Message DLM_QUERY_REGION added to support global heartbeat 133 * - Message DLM_QUERY_REGION added to support global heartbeat
134 * - Message DLM_QUERY_NODEINFO added to allow online node removes 134 * - Message DLM_QUERY_NODEINFO added to allow online node removes
135 * New in version 1.2:
136 * - Message DLM_BEGIN_EXIT_DOMAIN_MSG added to mark start of exit domain
135 */ 137 */
136static const struct dlm_protocol_version dlm_protocol = { 138static const struct dlm_protocol_version dlm_protocol = {
137 .pv_major = 1, 139 .pv_major = 1,
138 .pv_minor = 1, 140 .pv_minor = 2,
139}; 141};
140 142
141#define DLM_DOMAIN_BACKOFF_MS 200 143#define DLM_DOMAIN_BACKOFF_MS 200
@@ -449,14 +451,18 @@ redo_bucket:
449 dropped = dlm_empty_lockres(dlm, res); 451 dropped = dlm_empty_lockres(dlm, res);
450 452
451 spin_lock(&res->spinlock); 453 spin_lock(&res->spinlock);
452 __dlm_lockres_calc_usage(dlm, res); 454 if (dropped)
453 iter = res->hash_node.next; 455 __dlm_lockres_calc_usage(dlm, res);
456 else
457 iter = res->hash_node.next;
454 spin_unlock(&res->spinlock); 458 spin_unlock(&res->spinlock);
455 459
456 dlm_lockres_put(res); 460 dlm_lockres_put(res);
457 461
458 if (dropped) 462 if (dropped) {
463 cond_resched_lock(&dlm->spinlock);
459 goto redo_bucket; 464 goto redo_bucket;
465 }
460 } 466 }
461 cond_resched_lock(&dlm->spinlock); 467 cond_resched_lock(&dlm->spinlock);
462 num += n; 468 num += n;
@@ -486,6 +492,28 @@ static int dlm_no_joining_node(struct dlm_ctxt *dlm)
486 return ret; 492 return ret;
487} 493}
488 494
495static int dlm_begin_exit_domain_handler(struct o2net_msg *msg, u32 len,
496 void *data, void **ret_data)
497{
498 struct dlm_ctxt *dlm = data;
499 unsigned int node;
500 struct dlm_exit_domain *exit_msg = (struct dlm_exit_domain *) msg->buf;
501
502 if (!dlm_grab(dlm))
503 return 0;
504
505 node = exit_msg->node_idx;
506 mlog(0, "%s: Node %u sent a begin exit domain message\n", dlm->name, node);
507
508 spin_lock(&dlm->spinlock);
509 set_bit(node, dlm->exit_domain_map);
510 spin_unlock(&dlm->spinlock);
511
512 dlm_put(dlm);
513
514 return 0;
515}
516
489static void dlm_mark_domain_leaving(struct dlm_ctxt *dlm) 517static void dlm_mark_domain_leaving(struct dlm_ctxt *dlm)
490{ 518{
491 /* Yikes, a double spinlock! I need domain_lock for the dlm 519 /* Yikes, a double spinlock! I need domain_lock for the dlm
@@ -542,6 +570,7 @@ static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
542 570
543 spin_lock(&dlm->spinlock); 571 spin_lock(&dlm->spinlock);
544 clear_bit(node, dlm->domain_map); 572 clear_bit(node, dlm->domain_map);
573 clear_bit(node, dlm->exit_domain_map);
545 __dlm_print_nodes(dlm); 574 __dlm_print_nodes(dlm);
546 575
547 /* notify anything attached to the heartbeat events */ 576 /* notify anything attached to the heartbeat events */
@@ -554,29 +583,56 @@ static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
554 return 0; 583 return 0;
555} 584}
556 585
557static int dlm_send_one_domain_exit(struct dlm_ctxt *dlm, 586static int dlm_send_one_domain_exit(struct dlm_ctxt *dlm, u32 msg_type,
558 unsigned int node) 587 unsigned int node)
559{ 588{
560 int status; 589 int status;
561 struct dlm_exit_domain leave_msg; 590 struct dlm_exit_domain leave_msg;
562 591
563 mlog(0, "Asking node %u if we can leave the domain %s me = %u\n", 592 mlog(0, "%s: Sending domain exit message %u to node %u\n", dlm->name,
564 node, dlm->name, dlm->node_num); 593 msg_type, node);
565 594
566 memset(&leave_msg, 0, sizeof(leave_msg)); 595 memset(&leave_msg, 0, sizeof(leave_msg));
567 leave_msg.node_idx = dlm->node_num; 596 leave_msg.node_idx = dlm->node_num;
568 597
569 status = o2net_send_message(DLM_EXIT_DOMAIN_MSG, dlm->key, 598 status = o2net_send_message(msg_type, dlm->key, &leave_msg,
570 &leave_msg, sizeof(leave_msg), node, 599 sizeof(leave_msg), node, NULL);
571 NULL);
572 if (status < 0) 600 if (status < 0)
573 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " 601 mlog(ML_ERROR, "Error %d sending domain exit message %u "
574 "node %u\n", status, DLM_EXIT_DOMAIN_MSG, dlm->key, node); 602 "to node %u on domain %s\n", status, msg_type, node,
575 mlog(0, "status return %d from o2net_send_message\n", status); 603 dlm->name);
576 604
577 return status; 605 return status;
578} 606}
579 607
608static void dlm_begin_exit_domain(struct dlm_ctxt *dlm)
609{
610 int node = -1;
611
612 /* Support for begin exit domain was added in 1.2 */
613 if (dlm->dlm_locking_proto.pv_major == 1 &&
614 dlm->dlm_locking_proto.pv_minor < 2)
615 return;
616
617 /*
618 * Unlike DLM_EXIT_DOMAIN_MSG, DLM_BEGIN_EXIT_DOMAIN_MSG is purely
619 * informational. Meaning if a node does not receive the message,
620 * so be it.
621 */
622 spin_lock(&dlm->spinlock);
623 while (1) {
624 node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES, node + 1);
625 if (node >= O2NM_MAX_NODES)
626 break;
627 if (node == dlm->node_num)
628 continue;
629
630 spin_unlock(&dlm->spinlock);
631 dlm_send_one_domain_exit(dlm, DLM_BEGIN_EXIT_DOMAIN_MSG, node);
632 spin_lock(&dlm->spinlock);
633 }
634 spin_unlock(&dlm->spinlock);
635}
580 636
581static void dlm_leave_domain(struct dlm_ctxt *dlm) 637static void dlm_leave_domain(struct dlm_ctxt *dlm)
582{ 638{
@@ -602,7 +658,8 @@ static void dlm_leave_domain(struct dlm_ctxt *dlm)
602 658
603 clear_node = 1; 659 clear_node = 1;
604 660
605 status = dlm_send_one_domain_exit(dlm, node); 661 status = dlm_send_one_domain_exit(dlm, DLM_EXIT_DOMAIN_MSG,
662 node);
606 if (status < 0 && 663 if (status < 0 &&
607 status != -ENOPROTOOPT && 664 status != -ENOPROTOOPT &&
608 status != -ENOTCONN) { 665 status != -ENOTCONN) {
@@ -677,6 +734,7 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm)
677 734
678 if (leave) { 735 if (leave) {
679 mlog(0, "shutting down domain %s\n", dlm->name); 736 mlog(0, "shutting down domain %s\n", dlm->name);
737 dlm_begin_exit_domain(dlm);
680 738
681 /* We changed dlm state, notify the thread */ 739 /* We changed dlm state, notify the thread */
682 dlm_kick_thread(dlm, NULL); 740 dlm_kick_thread(dlm, NULL);
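
With the begin-exit message in place, domain teardown becomes a two-phase affair: a best-effort advance notice so peers stop choosing this node as a migration target, followed by lock migration and the mandatory exit message. Roughly, and simplified from the order in dlm_unregister_domain() (example_leave_domain is illustrative only; error handling and the migration retry loop are omitted):

	static void example_leave_domain(struct dlm_ctxt *dlm)
	{
		dlm_begin_exit_domain(dlm);	/* best-effort advance notice */
		dlm_kick_thread(dlm, NULL);	/* we changed dlm state */
		dlm_migrate_all_locks(dlm);	/* hand off mastered resources */
		dlm_mark_domain_leaving(dlm);
		dlm_leave_domain(dlm);		/* mandatory DLM_EXIT_DOMAIN_MSG */
		dlm_complete_dlm_shutdown(dlm);
	}
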
@@ -909,6 +967,7 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
909 * leftover join state. */ 967 * leftover join state. */
910 BUG_ON(dlm->joining_node != assert->node_idx); 968 BUG_ON(dlm->joining_node != assert->node_idx);
911 set_bit(assert->node_idx, dlm->domain_map); 969 set_bit(assert->node_idx, dlm->domain_map);
970 clear_bit(assert->node_idx, dlm->exit_domain_map);
912 __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); 971 __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
913 972
914 printk(KERN_NOTICE "o2dlm: Node %u joins domain %s\n", 973 printk(KERN_NOTICE "o2dlm: Node %u joins domain %s\n",
@@ -1793,6 +1852,13 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm)
1793 if (status) 1852 if (status)
1794 goto bail; 1853 goto bail;
1795 1854
1855 status = o2net_register_handler(DLM_BEGIN_EXIT_DOMAIN_MSG, dlm->key,
1856 sizeof(struct dlm_exit_domain),
1857 dlm_begin_exit_domain_handler,
1858 dlm, NULL, &dlm->dlm_domain_handlers);
1859 if (status)
1860 goto bail;
1861
1796bail: 1862bail:
1797 if (status) 1863 if (status)
1798 dlm_unregister_domain_handlers(dlm); 1864 dlm_unregister_domain_handlers(dlm);
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 84d166328cf7..11eefb8c12e9 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -2339,65 +2339,55 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
2339 dlm_lockres_put(res); 2339 dlm_lockres_put(res);
2340} 2340}
2341 2341
2342/* Checks whether the lockres can be migrated. Returns 0 if yes, < 0 2342/*
2343 * if not. If 0, numlocks is set to the number of locks in the lockres. 2343 * A migrateable resource is one that:
2344 * 1. is locally mastered, and
2345 * 2. has zero local locks, and
2346 * 3. has one or more non-local locks, or one or more references.
2347 * Returns 1 if yes, 0 if not.
2344 */ 2348 */
2345static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm, 2349static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
2346 struct dlm_lock_resource *res, 2350 struct dlm_lock_resource *res)
2347 int *numlocks,
2348 int *hasrefs)
2349{ 2351{
2350 int ret; 2352 enum dlm_lockres_list idx;
2351 int i; 2353 int nonlocal = 0, node_ref;
2352 int count = 0;
2353 struct list_head *queue; 2354 struct list_head *queue;
2354 struct dlm_lock *lock; 2355 struct dlm_lock *lock;
2356 u64 cookie;
2355 2357
2356 assert_spin_locked(&res->spinlock); 2358 assert_spin_locked(&res->spinlock);
2357 2359
2358 *numlocks = 0; 2360 if (res->owner != dlm->node_num)
2359 *hasrefs = 0; 2361 return 0;
2360
2361 ret = -EINVAL;
2362 if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
2363 mlog(0, "cannot migrate lockres with unknown owner!\n");
2364 goto leave;
2365 }
2366
2367 if (res->owner != dlm->node_num) {
2368 mlog(0, "cannot migrate lockres this node doesn't own!\n");
2369 goto leave;
2370 }
2371 2362
2372 ret = 0; 2363 for (idx = DLM_GRANTED_LIST; idx <= DLM_BLOCKED_LIST; idx++) {
2373 queue = &res->granted; 2364 queue = dlm_list_idx_to_ptr(res, idx);
2374 for (i = 0; i < 3; i++) {
2375 list_for_each_entry(lock, queue, list) { 2365 list_for_each_entry(lock, queue, list) {
2376 ++count; 2366 if (lock->ml.node != dlm->node_num) {
2377 if (lock->ml.node == dlm->node_num) { 2367 nonlocal++;
2378 mlog(0, "found a lock owned by this node still " 2368 continue;
2379 "on the %s queue! will not migrate this "
2380 "lockres\n", (i == 0 ? "granted" :
2381 (i == 1 ? "converting" :
2382 "blocked")));
2383 ret = -ENOTEMPTY;
2384 goto leave;
2385 } 2369 }
2370 cookie = be64_to_cpu(lock->ml.cookie);
2371 mlog(0, "%s: Not migrateable res %.*s, lock %u:%llu on "
2372 "%s list\n", dlm->name, res->lockname.len,
2373 res->lockname.name,
2374 dlm_get_lock_cookie_node(cookie),
2375 dlm_get_lock_cookie_seq(cookie),
2376 dlm_list_in_text(idx));
2377 return 0;
2386 } 2378 }
2387 queue++;
2388 } 2379 }
2389 2380
2390 *numlocks = count; 2381 if (!nonlocal) {
2391 2382 node_ref = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
2392 count = find_next_bit(res->refmap, O2NM_MAX_NODES, 0); 2383 if (node_ref >= O2NM_MAX_NODES)
2393 if (count < O2NM_MAX_NODES) 2384 return 0;
2394 *hasrefs = 1; 2385 }
2395 2386
2396 mlog(0, "%s: res %.*s, Migrateable, locks %d, refs %d\n", dlm->name, 2387 mlog(0, "%s: res %.*s, Migrateable\n", dlm->name, res->lockname.len,
2397 res->lockname.len, res->lockname.name, *numlocks, *hasrefs); 2388 res->lockname.name);
2398 2389
2399leave: 2390 return 1;
2400 return ret;
2401} 2391}
2402 2392
2403/* 2393/*
@@ -2406,8 +2396,7 @@ leave:
2406 2396
2407 2397
2408static int dlm_migrate_lockres(struct dlm_ctxt *dlm, 2398static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
2409 struct dlm_lock_resource *res, 2399 struct dlm_lock_resource *res, u8 target)
2410 u8 target)
2411{ 2400{
2412 struct dlm_master_list_entry *mle = NULL; 2401 struct dlm_master_list_entry *mle = NULL;
2413 struct dlm_master_list_entry *oldmle = NULL; 2402 struct dlm_master_list_entry *oldmle = NULL;
@@ -2416,37 +2405,20 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
2416 const char *name; 2405 const char *name;
2417 unsigned int namelen; 2406 unsigned int namelen;
2418 int mle_added = 0; 2407 int mle_added = 0;
2419 int numlocks, hasrefs;
2420 int wake = 0; 2408 int wake = 0;
2421 2409
2422 if (!dlm_grab(dlm)) 2410 if (!dlm_grab(dlm))
2423 return -EINVAL; 2411 return -EINVAL;
2424 2412
2413 BUG_ON(target == O2NM_MAX_NODES);
2414
2425 name = res->lockname.name; 2415 name = res->lockname.name;
2426 namelen = res->lockname.len; 2416 namelen = res->lockname.len;
2427 2417
2428 mlog(0, "%s: Migrating %.*s to %u\n", dlm->name, namelen, name, target); 2418 mlog(0, "%s: Migrating %.*s to node %u\n", dlm->name, namelen, name,
2429 2419 target);
2430 /*
2431 * ensure this lockres is a proper candidate for migration
2432 */
2433 spin_lock(&res->spinlock);
2434 ret = dlm_is_lockres_migrateable(dlm, res, &numlocks, &hasrefs);
2435 if (ret < 0) {
2436 spin_unlock(&res->spinlock);
2437 goto leave;
2438 }
2439 spin_unlock(&res->spinlock);
2440
2441 /* no work to do */
2442 if (numlocks == 0 && !hasrefs)
2443 goto leave;
2444
2445 /*
2446 * preallocate up front
2447 * if this fails, abort
2448 */
2449 2420
2421 /* preallocate up front. if this fails, abort */
2450 ret = -ENOMEM; 2422 ret = -ENOMEM;
2451 mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_NOFS); 2423 mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_NOFS);
2452 if (!mres) { 2424 if (!mres) {
@@ -2462,35 +2434,10 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
2462 ret = 0; 2434 ret = 0;
2463 2435
2464 /* 2436 /*
2465 * find a node to migrate the lockres to
2466 */
2467
2468 spin_lock(&dlm->spinlock);
2469 /* pick a new node */
2470 if (!test_bit(target, dlm->domain_map) ||
2471 target >= O2NM_MAX_NODES) {
2472 target = dlm_pick_migration_target(dlm, res);
2473 }
2474 mlog(0, "%s: res %.*s, Node %u chosen for migration\n", dlm->name,
2475 namelen, name, target);
2476
2477 if (target >= O2NM_MAX_NODES ||
2478 !test_bit(target, dlm->domain_map)) {
2479 /* target chosen is not alive */
2480 ret = -EINVAL;
2481 }
2482
2483 if (ret) {
2484 spin_unlock(&dlm->spinlock);
2485 goto fail;
2486 }
2487
2488 mlog(0, "continuing with target = %u\n", target);
2489
2490 /*
2491 * clear any existing master requests and 2437 * clear any existing master requests and
2492 * add the migration mle to the list 2438 * add the migration mle to the list
2493 */ 2439 */
2440 spin_lock(&dlm->spinlock);
2494 spin_lock(&dlm->master_lock); 2441 spin_lock(&dlm->master_lock);
2495 ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name, 2442 ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name,
2496 namelen, target, dlm->node_num); 2443 namelen, target, dlm->node_num);
@@ -2531,6 +2478,7 @@ fail:
2531 dlm_put_mle(mle); 2478 dlm_put_mle(mle);
2532 } else if (mle) { 2479 } else if (mle) {
2533 kmem_cache_free(dlm_mle_cache, mle); 2480 kmem_cache_free(dlm_mle_cache, mle);
2481 mle = NULL;
2534 } 2482 }
2535 goto leave; 2483 goto leave;
2536 } 2484 }
@@ -2652,69 +2600,52 @@ leave:
2652 if (wake) 2600 if (wake)
2653 wake_up(&res->wq); 2601 wake_up(&res->wq);
2654 2602
2655 /* TODO: cleanup */
2656 if (mres) 2603 if (mres)
2657 free_page((unsigned long)mres); 2604 free_page((unsigned long)mres);
2658 2605
2659 dlm_put(dlm); 2606 dlm_put(dlm);
2660 2607
2661 mlog(0, "returning %d\n", ret); 2608 mlog(0, "%s: Migrating %.*s to %u, returns %d\n", dlm->name, namelen,
2609 name, target, ret);
2662 return ret; 2610 return ret;
2663} 2611}
2664 2612
2665#define DLM_MIGRATION_RETRY_MS 100 2613#define DLM_MIGRATION_RETRY_MS 100
2666 2614
2667/* Should be called only after beginning the domain leave process. 2615/*
2616 * Should be called only after beginning the domain leave process.
2668 * There should not be any remaining locks on nonlocal lock resources, 2617 * There should not be any remaining locks on nonlocal lock resources,
2669 * and there should be no local locks left on locally mastered resources. 2618 * and there should be no local locks left on locally mastered resources.
2670 * 2619 *
2671 * Called with the dlm spinlock held, may drop it to do migration, but 2620 * Called with the dlm spinlock held, may drop it to do migration, but
2672 * will re-acquire before exit. 2621 * will re-acquire before exit.
2673 * 2622 *
2674 * Returns: 1 if dlm->spinlock was dropped/retaken, 0 if never dropped */ 2623 * Returns: 1 if dlm->spinlock was dropped/retaken, 0 if never dropped
2624 */
2675int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) 2625int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
2676{ 2626{
2677 int ret; 2627 int ret;
2678 int lock_dropped = 0; 2628 int lock_dropped = 0;
2679 int numlocks, hasrefs; 2629 u8 target = O2NM_MAX_NODES;
2630
2631 assert_spin_locked(&dlm->spinlock);
2680 2632
2681 spin_lock(&res->spinlock); 2633 spin_lock(&res->spinlock);
2682 if (res->owner != dlm->node_num) { 2634 if (dlm_is_lockres_migrateable(dlm, res))
2683 if (!__dlm_lockres_unused(res)) { 2635 target = dlm_pick_migration_target(dlm, res);
2684 mlog(ML_ERROR, "%s:%.*s: this node is not master, " 2636 spin_unlock(&res->spinlock);
2685 "trying to free this but locks remain\n",
2686 dlm->name, res->lockname.len, res->lockname.name);
2687 }
2688 spin_unlock(&res->spinlock);
2689 goto leave;
2690 }
2691 2637
2692 /* No need to migrate a lockres having no locks */ 2638 if (target == O2NM_MAX_NODES)
2693 ret = dlm_is_lockres_migrateable(dlm, res, &numlocks, &hasrefs);
2694 if (ret >= 0 && numlocks == 0 && !hasrefs) {
2695 spin_unlock(&res->spinlock);
2696 goto leave; 2639 goto leave;
2697 }
2698 spin_unlock(&res->spinlock);
2699 2640
2700 /* Wheee! Migrate lockres here! Will sleep so drop spinlock. */ 2641 /* Wheee! Migrate lockres here! Will sleep so drop spinlock. */
2701 spin_unlock(&dlm->spinlock); 2642 spin_unlock(&dlm->spinlock);
2702 lock_dropped = 1; 2643 lock_dropped = 1;
2703 while (1) { 2644 ret = dlm_migrate_lockres(dlm, res, target);
2704 ret = dlm_migrate_lockres(dlm, res, O2NM_MAX_NODES); 2645 if (ret)
2705 if (ret >= 0) 2646 mlog(0, "%s: res %.*s, Migrate to node %u failed with %d\n",
2706 break; 2647 dlm->name, res->lockname.len, res->lockname.name,
2707 if (ret == -ENOTEMPTY) { 2648 target, ret);
2708 mlog(ML_ERROR, "lockres %.*s still has local locks!\n",
2709 res->lockname.len, res->lockname.name);
2710 BUG();
2711 }
2712
2713 mlog(0, "lockres %.*s: migrate failed, "
2714 "retrying\n", res->lockname.len,
2715 res->lockname.name);
2716 msleep(DLM_MIGRATION_RETRY_MS);
2717 }
2718 spin_lock(&dlm->spinlock); 2649 spin_lock(&dlm->spinlock);
2719leave: 2650leave:
2720 return lock_dropped; 2651 return lock_dropped;
@@ -2898,61 +2829,55 @@ static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
2898 } 2829 }
2899} 2830}
2900 2831
2901/* for now this is not too intelligent. we will 2832/*
2902 * need stats to make this do the right thing. 2833 * Pick a node to migrate the lock resource to. This function selects a
2903 * this just finds the first lock on one of the 2834 * potential target based first on the locks and then on refmap. It skips
2904 * queues and uses that node as the target. */ 2835 * nodes that are in the process of exiting the domain.
2836 */
2905static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm, 2837static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
2906 struct dlm_lock_resource *res) 2838 struct dlm_lock_resource *res)
2907{ 2839{
2908 int i; 2840 enum dlm_lockres_list idx;
2909 struct list_head *queue = &res->granted; 2841 struct list_head *queue = &res->granted;
2910 struct dlm_lock *lock; 2842 struct dlm_lock *lock;
2911 int nodenum; 2843 int noderef;
2844 u8 nodenum = O2NM_MAX_NODES;
2912 2845
2913 assert_spin_locked(&dlm->spinlock); 2846 assert_spin_locked(&dlm->spinlock);
2847 assert_spin_locked(&res->spinlock);
2914 2848
2915 spin_lock(&res->spinlock); 2849 /* Go through all the locks */
2916 for (i=0; i<3; i++) { 2850 for (idx = DLM_GRANTED_LIST; idx <= DLM_BLOCKED_LIST; idx++) {
2851 queue = dlm_list_idx_to_ptr(res, idx);
2917 list_for_each_entry(lock, queue, list) { 2852 list_for_each_entry(lock, queue, list) {
2918 /* up to the caller to make sure this node 2853 if (lock->ml.node == dlm->node_num)
2919 * is alive */ 2854 continue;
2920 if (lock->ml.node != dlm->node_num) { 2855 if (test_bit(lock->ml.node, dlm->exit_domain_map))
2921 spin_unlock(&res->spinlock); 2856 continue;
2922 return lock->ml.node; 2857 nodenum = lock->ml.node;
2923 } 2858 goto bail;
2924 } 2859 }
2925 queue++;
2926 }
2927
2928 nodenum = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
2929 if (nodenum < O2NM_MAX_NODES) {
2930 spin_unlock(&res->spinlock);
2931 return nodenum;
2932 } 2860 }
2933 spin_unlock(&res->spinlock);
2934 mlog(0, "have not found a suitable target yet! checking domain map\n");
2935 2861
2936 /* ok now we're getting desperate. pick anyone alive. */ 2862 /* Go through the refmap */
2937 nodenum = -1; 2863 noderef = -1;
2938 while (1) { 2864 while (1) {
2939 nodenum = find_next_bit(dlm->domain_map, 2865 noderef = find_next_bit(res->refmap, O2NM_MAX_NODES,
2940 O2NM_MAX_NODES, nodenum+1); 2866 noderef + 1);
2941 mlog(0, "found %d in domain map\n", nodenum); 2867 if (noderef >= O2NM_MAX_NODES)
2942 if (nodenum >= O2NM_MAX_NODES)
2943 break; 2868 break;
2944 if (nodenum != dlm->node_num) { 2869 if (noderef == dlm->node_num)
2945 mlog(0, "picking %d\n", nodenum); 2870 continue;
2946 return nodenum; 2871 if (test_bit(noderef, dlm->exit_domain_map))
2947 } 2872 continue;
2873 nodenum = noderef;
2874 goto bail;
2948 } 2875 }
2949 2876
2950 mlog(0, "giving up. no master to migrate to\n"); 2877bail:
2951 return DLM_LOCK_RES_OWNER_UNKNOWN; 2878 return nodenum;
2952} 2879}
2953 2880
2954
2955
2956/* this is called by the new master once all lockres 2881/* this is called by the new master once all lockres
2957 * data has been received */ 2882 * data has been received */
2958static int dlm_do_migrate_request(struct dlm_ctxt *dlm, 2883static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index f1beb6fc254d..7efab6d28a21 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -2393,6 +2393,7 @@ static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx)
2393 2393
2394 mlog(0, "node %u being removed from domain map!\n", idx); 2394 mlog(0, "node %u being removed from domain map!\n", idx);
2395 clear_bit(idx, dlm->domain_map); 2395 clear_bit(idx, dlm->domain_map);
2396 clear_bit(idx, dlm->exit_domain_map);
2396 /* wake up migration waiters if a node goes down. 2397 /* wake up migration waiters if a node goes down.
2397 * perhaps later we can genericize this for other waiters. */ 2398 * perhaps later we can genericize this for other waiters. */
2398 wake_up(&dlm->migration_wq); 2399 wake_up(&dlm->migration_wq);
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 8c5c0eddc365..b42076797049 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -88,7 +88,7 @@ struct workqueue_struct *user_dlm_worker;
88 * signifies a bast fired on the lock. 88 * signifies a bast fired on the lock.
89 */ 89 */
90#define DLMFS_CAPABILITIES "bast stackglue" 90#define DLMFS_CAPABILITIES "bast stackglue"
91extern int param_set_dlmfs_capabilities(const char *val, 91static int param_set_dlmfs_capabilities(const char *val,
92 struct kernel_param *kp) 92 struct kernel_param *kp)
93{ 93{
94 printk(KERN_ERR "%s: readonly parameter\n", kp->name); 94 printk(KERN_ERR "%s: readonly parameter\n", kp->name);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 89659d6dc206..b1e35a392ca5 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2670,6 +2670,7 @@ const struct file_operations ocfs2_fops_no_plocks = {
2670 .flock = ocfs2_flock, 2670 .flock = ocfs2_flock,
2671 .splice_read = ocfs2_file_splice_read, 2671 .splice_read = ocfs2_file_splice_read,
2672 .splice_write = ocfs2_file_splice_write, 2672 .splice_write = ocfs2_file_splice_write,
2673 .fallocate = ocfs2_fallocate,
2673}; 2674};
2674 2675
2675const struct file_operations ocfs2_dops_no_plocks = { 2676const struct file_operations ocfs2_dops_no_plocks = {
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 8f13c5989eae..bc91072b7219 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -22,6 +22,11 @@
22#include "ioctl.h" 22#include "ioctl.h"
23#include "resize.h" 23#include "resize.h"
24#include "refcounttree.h" 24#include "refcounttree.h"
25#include "sysfile.h"
26#include "dir.h"
27#include "buffer_head_io.h"
28#include "suballoc.h"
29#include "move_extents.h"
25 30
26#include <linux/ext2_fs.h> 31#include <linux/ext2_fs.h>
27 32
@@ -35,31 +40,27 @@
35 * be -EFAULT. The error will be returned from the ioctl(2) call. It's 40 * be -EFAULT. The error will be returned from the ioctl(2) call. It's
36 * just a best-effort to tell userspace that this request caused the error. 41 * just a best-effort to tell userspace that this request caused the error.
37 */ 42 */
38static inline void __o2info_set_request_error(struct ocfs2_info_request *kreq, 43static inline void o2info_set_request_error(struct ocfs2_info_request *kreq,
39 struct ocfs2_info_request __user *req) 44 struct ocfs2_info_request __user *req)
40{ 45{
41 kreq->ir_flags |= OCFS2_INFO_FL_ERROR; 46 kreq->ir_flags |= OCFS2_INFO_FL_ERROR;
42 (void)put_user(kreq->ir_flags, (__u32 __user *)&(req->ir_flags)); 47 (void)put_user(kreq->ir_flags, (__u32 __user *)&(req->ir_flags));
43} 48}
44 49
45#define o2info_set_request_error(a, b) \ 50static inline void o2info_set_request_filled(struct ocfs2_info_request *req)
46 __o2info_set_request_error((struct ocfs2_info_request *)&(a), b)
47
48static inline void __o2info_set_request_filled(struct ocfs2_info_request *req)
49{ 51{
50 req->ir_flags |= OCFS2_INFO_FL_FILLED; 52 req->ir_flags |= OCFS2_INFO_FL_FILLED;
51} 53}
52 54
53#define o2info_set_request_filled(a) \ 55static inline void o2info_clear_request_filled(struct ocfs2_info_request *req)
54 __o2info_set_request_filled((struct ocfs2_info_request *)&(a))
55
56static inline void __o2info_clear_request_filled(struct ocfs2_info_request *req)
57{ 56{
58 req->ir_flags &= ~OCFS2_INFO_FL_FILLED; 57 req->ir_flags &= ~OCFS2_INFO_FL_FILLED;
59} 58}
60 59
61#define o2info_clear_request_filled(a) \ 60static inline int o2info_coherent(struct ocfs2_info_request *req)
62 __o2info_clear_request_filled((struct ocfs2_info_request *)&(a)) 61{
62 return (!(req->ir_flags & OCFS2_INFO_FL_NON_COHERENT));
63}
63 64
64static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags) 65static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags)
65{ 66{
@@ -153,7 +154,7 @@ int ocfs2_info_handle_blocksize(struct inode *inode,
153 154
154 oib.ib_blocksize = inode->i_sb->s_blocksize; 155 oib.ib_blocksize = inode->i_sb->s_blocksize;
155 156
156 o2info_set_request_filled(oib); 157 o2info_set_request_filled(&oib.ib_req);
157 158
158 if (o2info_to_user(oib, req)) 159 if (o2info_to_user(oib, req))
159 goto bail; 160 goto bail;
@@ -161,7 +162,7 @@ int ocfs2_info_handle_blocksize(struct inode *inode,
161 status = 0; 162 status = 0;
162bail: 163bail:
163 if (status) 164 if (status)
164 o2info_set_request_error(oib, req); 165 o2info_set_request_error(&oib.ib_req, req);
165 166
166 return status; 167 return status;
167} 168}
@@ -178,7 +179,7 @@ int ocfs2_info_handle_clustersize(struct inode *inode,
178 179
179 oic.ic_clustersize = osb->s_clustersize; 180 oic.ic_clustersize = osb->s_clustersize;
180 181
181 o2info_set_request_filled(oic); 182 o2info_set_request_filled(&oic.ic_req);
182 183
183 if (o2info_to_user(oic, req)) 184 if (o2info_to_user(oic, req))
184 goto bail; 185 goto bail;
@@ -186,7 +187,7 @@ int ocfs2_info_handle_clustersize(struct inode *inode,
186 status = 0; 187 status = 0;
187bail: 188bail:
188 if (status) 189 if (status)
189 o2info_set_request_error(oic, req); 190 o2info_set_request_error(&oic.ic_req, req);
190 191
191 return status; 192 return status;
192} 193}
@@ -203,7 +204,7 @@ int ocfs2_info_handle_maxslots(struct inode *inode,
203 204
204 oim.im_max_slots = osb->max_slots; 205 oim.im_max_slots = osb->max_slots;
205 206
206 o2info_set_request_filled(oim); 207 o2info_set_request_filled(&oim.im_req);
207 208
208 if (o2info_to_user(oim, req)) 209 if (o2info_to_user(oim, req))
209 goto bail; 210 goto bail;
@@ -211,7 +212,7 @@ int ocfs2_info_handle_maxslots(struct inode *inode,
211 status = 0; 212 status = 0;
212bail: 213bail:
213 if (status) 214 if (status)
214 o2info_set_request_error(oim, req); 215 o2info_set_request_error(&oim.im_req, req);
215 216
216 return status; 217 return status;
217} 218}
@@ -228,7 +229,7 @@ int ocfs2_info_handle_label(struct inode *inode,
228 229
229 memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN); 230 memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN);
230 231
231 o2info_set_request_filled(oil); 232 o2info_set_request_filled(&oil.il_req);
232 233
233 if (o2info_to_user(oil, req)) 234 if (o2info_to_user(oil, req))
234 goto bail; 235 goto bail;
@@ -236,7 +237,7 @@ int ocfs2_info_handle_label(struct inode *inode,
236 status = 0; 237 status = 0;
237bail: 238bail:
238 if (status) 239 if (status)
239 o2info_set_request_error(oil, req); 240 o2info_set_request_error(&oil.il_req, req);
240 241
241 return status; 242 return status;
242} 243}
@@ -253,7 +254,7 @@ int ocfs2_info_handle_uuid(struct inode *inode,
253 254
254 memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1); 255 memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1);
255 256
256 o2info_set_request_filled(oiu); 257 o2info_set_request_filled(&oiu.iu_req);
257 258
258 if (o2info_to_user(oiu, req)) 259 if (o2info_to_user(oiu, req))
259 goto bail; 260 goto bail;
@@ -261,7 +262,7 @@ int ocfs2_info_handle_uuid(struct inode *inode,
261 status = 0; 262 status = 0;
262bail: 263bail:
263 if (status) 264 if (status)
264 o2info_set_request_error(oiu, req); 265 o2info_set_request_error(&oiu.iu_req, req);
265 266
266 return status; 267 return status;
267} 268}
@@ -280,7 +281,7 @@ int ocfs2_info_handle_fs_features(struct inode *inode,
280 oif.if_incompat_features = osb->s_feature_incompat; 281 oif.if_incompat_features = osb->s_feature_incompat;
281 oif.if_ro_compat_features = osb->s_feature_ro_compat; 282 oif.if_ro_compat_features = osb->s_feature_ro_compat;
282 283
283 o2info_set_request_filled(oif); 284 o2info_set_request_filled(&oif.if_req);
284 285
285 if (o2info_to_user(oif, req)) 286 if (o2info_to_user(oif, req))
286 goto bail; 287 goto bail;
@@ -288,7 +289,7 @@ int ocfs2_info_handle_fs_features(struct inode *inode,
288 status = 0; 289 status = 0;
289bail: 290bail:
290 if (status) 291 if (status)
291 o2info_set_request_error(oif, req); 292 o2info_set_request_error(&oif.if_req, req);
292 293
293 return status; 294 return status;
294} 295}
@@ -305,7 +306,7 @@ int ocfs2_info_handle_journal_size(struct inode *inode,
305 306
306 oij.ij_journal_size = osb->journal->j_inode->i_size; 307 oij.ij_journal_size = osb->journal->j_inode->i_size;
307 308
308 o2info_set_request_filled(oij); 309 o2info_set_request_filled(&oij.ij_req);
309 310
310 if (o2info_to_user(oij, req)) 311 if (o2info_to_user(oij, req))
311 goto bail; 312 goto bail;
@@ -313,7 +314,408 @@ int ocfs2_info_handle_journal_size(struct inode *inode,
313 status = 0; 314 status = 0;
314bail: 315bail:
315 if (status) 316 if (status)
316 o2info_set_request_error(oij, req); 317 o2info_set_request_error(&oij.ij_req, req);
318
319 return status;
320}
321
322int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb,
323 struct inode *inode_alloc, u64 blkno,
324 struct ocfs2_info_freeinode *fi, u32 slot)
325{
326 int status = 0, unlock = 0;
327
328 struct buffer_head *bh = NULL;
329 struct ocfs2_dinode *dinode_alloc = NULL;
330
331 if (inode_alloc)
332 mutex_lock(&inode_alloc->i_mutex);
333
334 if (o2info_coherent(&fi->ifi_req)) {
335 status = ocfs2_inode_lock(inode_alloc, &bh, 0);
336 if (status < 0) {
337 mlog_errno(status);
338 goto bail;
339 }
340 unlock = 1;
341 } else {
342 status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh);
343 if (status < 0) {
344 mlog_errno(status);
345 goto bail;
346 }
347 }
348
349 dinode_alloc = (struct ocfs2_dinode *)bh->b_data;
350
351 fi->ifi_stat[slot].lfi_total =
352 le32_to_cpu(dinode_alloc->id1.bitmap1.i_total);
353 fi->ifi_stat[slot].lfi_free =
354 le32_to_cpu(dinode_alloc->id1.bitmap1.i_total) -
355 le32_to_cpu(dinode_alloc->id1.bitmap1.i_used);
356
357bail:
358 if (unlock)
359 ocfs2_inode_unlock(inode_alloc, 0);
360
361 if (inode_alloc)
362 mutex_unlock(&inode_alloc->i_mutex);
363
364 brelse(bh);
365
366 return status;
367}
368
369int ocfs2_info_handle_freeinode(struct inode *inode,
370 struct ocfs2_info_request __user *req)
371{
372 u32 i;
373 u64 blkno = -1;
374 char namebuf[40];
375 int status = -EFAULT, type = INODE_ALLOC_SYSTEM_INODE;
376 struct ocfs2_info_freeinode *oifi = NULL;
377 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
378 struct inode *inode_alloc = NULL;
379
380 oifi = kzalloc(sizeof(struct ocfs2_info_freeinode), GFP_KERNEL);
381 if (!oifi) {
382 status = -ENOMEM;
383 mlog_errno(status);
384 goto bail;
385 }
386
387 if (o2info_from_user(*oifi, req))
388 goto bail;
389
390 oifi->ifi_slotnum = osb->max_slots;
391
392 for (i = 0; i < oifi->ifi_slotnum; i++) {
393 if (o2info_coherent(&oifi->ifi_req)) {
394 inode_alloc = ocfs2_get_system_file_inode(osb, type, i);
395 if (!inode_alloc) {
396 mlog(ML_ERROR, "unable to get alloc inode in "
397 "slot %u\n", i);
398 status = -EIO;
399 goto bail;
400 }
401 } else {
402 ocfs2_sprintf_system_inode_name(namebuf,
403 sizeof(namebuf),
404 type, i);
405 status = ocfs2_lookup_ino_from_name(osb->sys_root_inode,
406 namebuf,
407 strlen(namebuf),
408 &blkno);
409 if (status < 0) {
410 status = -ENOENT;
411 goto bail;
412 }
413 }
414
415 status = ocfs2_info_scan_inode_alloc(osb, inode_alloc, blkno, oifi, i);
416 if (status < 0)
417 goto bail;
418
419 iput(inode_alloc);
420 inode_alloc = NULL;
421 }
422
423 o2info_set_request_filled(&oifi->ifi_req);
424
425 if (o2info_to_user(*oifi, req))
426 goto bail;
427
428 status = 0;
429bail:
430 if (status)
431 o2info_set_request_error(&oifi->ifi_req, req);
432
433 kfree(oifi);
434
435 return status;
436}
437
438static void o2ffg_update_histogram(struct ocfs2_info_free_chunk_list *hist,
439 unsigned int chunksize)
440{
441 int index;
442
443 index = __ilog2_u32(chunksize);
444 if (index >= OCFS2_INFO_MAX_HIST)
445 index = OCFS2_INFO_MAX_HIST - 1;
446
447 hist->fc_chunks[index]++;
448 hist->fc_clusters[index] += chunksize;
449}
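
o2ffg_update_histogram() buckets free chunks by power-of-two size: __ilog2_u32() maps a chunk of n clusters to bucket floor(log2(n)), saturating at the last bucket, so bucket k counts chunks of 2^k .. 2^(k+1)-1 clusters and fc_clusters[k] accumulates their total size. A standalone illustration (MAX_HIST stands in for OCFS2_INFO_MAX_HIST, an assumed value here):

	#include <stdio.h>

	#define MAX_HIST 32			/* assumed bucket count */

	static int ilog2_u32(unsigned int v)	/* floor(log2(v)), v > 0 */
	{
		int i = -1;

		while (v) {
			v >>= 1;
			i++;
		}
		return i;
	}

	int main(void)
	{
		unsigned int sizes[] = { 1, 7, 12, 200 };
		unsigned int i;

		for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
			int index = ilog2_u32(sizes[i]);

			if (index >= MAX_HIST)
				index = MAX_HIST - 1;
			/* 1 -> 0, 7 -> 2, 12 -> 3, 200 -> 7 */
			printf("%u clusters -> bucket %d\n", sizes[i], index);
		}
		return 0;
	}
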
450
451static void o2ffg_update_stats(struct ocfs2_info_freefrag_stats *stats,
452 unsigned int chunksize)
453{
454 if (chunksize > stats->ffs_max)
455 stats->ffs_max = chunksize;
456
457 if (chunksize < stats->ffs_min)
458 stats->ffs_min = chunksize;
459
460 stats->ffs_avg += chunksize;
461 stats->ffs_free_chunks_real++;
462}
463
464void ocfs2_info_update_ffg(struct ocfs2_info_freefrag *ffg,
465 unsigned int chunksize)
466{
467 o2ffg_update_histogram(&(ffg->iff_ffs.ffs_fc_hist), chunksize);
468 o2ffg_update_stats(&(ffg->iff_ffs), chunksize);
469}
470
471int ocfs2_info_freefrag_scan_chain(struct ocfs2_super *osb,
472 struct inode *gb_inode,
473 struct ocfs2_dinode *gb_dinode,
474 struct ocfs2_chain_rec *rec,
475 struct ocfs2_info_freefrag *ffg,
476 u32 chunks_in_group)
477{
478 int status = 0, used;
479 u64 blkno;
480
481 struct buffer_head *bh = NULL;
482 struct ocfs2_group_desc *bg = NULL;
483
484 unsigned int max_bits, num_clusters;
485 unsigned int offset = 0, cluster, chunk;
486 unsigned int chunk_free, last_chunksize = 0;
487
488 if (!le32_to_cpu(rec->c_free))
489 goto bail;
490
491 do {
492 if (!bg)
493 blkno = le64_to_cpu(rec->c_blkno);
494 else
495 blkno = le64_to_cpu(bg->bg_next_group);
496
497 if (bh) {
498 brelse(bh);
499 bh = NULL;
500 }
501
502 if (o2info_coherent(&ffg->iff_req))
503 status = ocfs2_read_group_descriptor(gb_inode,
504 gb_dinode,
505 blkno, &bh);
506 else
507 status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh);
508
509 if (status < 0) {
510 mlog(ML_ERROR, "Can't read the group descriptor # "
511 "%llu from device.", (unsigned long long)blkno);
512 status = -EIO;
513 goto bail;
514 }
515
516 bg = (struct ocfs2_group_desc *)bh->b_data;
517
518 if (!le16_to_cpu(bg->bg_free_bits_count))
519 continue;
520
521 max_bits = le16_to_cpu(bg->bg_bits);
522 offset = 0;
523
524 for (chunk = 0; chunk < chunks_in_group; chunk++) {
525 /*
 526 * the last chunk may not be a whole one.
527 */
528 if ((offset + ffg->iff_chunksize) > max_bits)
529 num_clusters = max_bits - offset;
530 else
531 num_clusters = ffg->iff_chunksize;
532
533 chunk_free = 0;
534 for (cluster = 0; cluster < num_clusters; cluster++) {
535 used = ocfs2_test_bit(offset,
536 (unsigned long *)bg->bg_bitmap);
537 /*
 538 * - chunk_free counts the free clusters in chunk #N.
 539 * - last_chunksize records the size (in clusters) of
 540 * the last real free chunk being counted.
541 */
542 if (!used) {
543 last_chunksize++;
544 chunk_free++;
545 }
546
547 if (used && last_chunksize) {
548 ocfs2_info_update_ffg(ffg,
549 last_chunksize);
550 last_chunksize = 0;
551 }
552
553 offset++;
554 }
555
556 if (chunk_free == ffg->iff_chunksize)
557 ffg->iff_ffs.ffs_free_chunks++;
558 }
559
560 /*
 561 * Update the stats for the trailing free chunk, if any.
562 */
563 if (last_chunksize)
564 ocfs2_info_update_ffg(ffg, last_chunksize);
565
566 } while (le64_to_cpu(bg->bg_next_group));
567
568bail:
569 brelse(bh);
570
571 return status;
572}
573
574int ocfs2_info_freefrag_scan_bitmap(struct ocfs2_super *osb,
575 struct inode *gb_inode, u64 blkno,
576 struct ocfs2_info_freefrag *ffg)
577{
578 u32 chunks_in_group;
579 int status = 0, unlock = 0, i;
580
581 struct buffer_head *bh = NULL;
582 struct ocfs2_chain_list *cl = NULL;
583 struct ocfs2_chain_rec *rec = NULL;
584 struct ocfs2_dinode *gb_dinode = NULL;
585
586 if (gb_inode)
587 mutex_lock(&gb_inode->i_mutex);
588
589 if (o2info_coherent(&ffg->iff_req)) {
590 status = ocfs2_inode_lock(gb_inode, &bh, 0);
591 if (status < 0) {
592 mlog_errno(status);
593 goto bail;
594 }
595 unlock = 1;
596 } else {
597 status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh);
598 if (status < 0) {
599 mlog_errno(status);
600 goto bail;
601 }
602 }
603
604 gb_dinode = (struct ocfs2_dinode *)bh->b_data;
605 cl = &(gb_dinode->id2.i_chain);
606
607 /*
 608 * The chunk size (in clusters) from userspace must not
 609 * exceed the number of clusters in a group.
610 */
611 if (ffg->iff_chunksize > le16_to_cpu(cl->cl_cpg)) {
612 status = -EINVAL;
613 goto bail;
614 }
615
616 memset(&ffg->iff_ffs, 0, sizeof(struct ocfs2_info_freefrag_stats));
617
618 ffg->iff_ffs.ffs_min = ~0U;
619 ffg->iff_ffs.ffs_clusters =
620 le32_to_cpu(gb_dinode->id1.bitmap1.i_total);
621 ffg->iff_ffs.ffs_free_clusters = ffg->iff_ffs.ffs_clusters -
622 le32_to_cpu(gb_dinode->id1.bitmap1.i_used);
623
624 chunks_in_group = le16_to_cpu(cl->cl_cpg) / ffg->iff_chunksize + 1;
625
626 for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) {
627 rec = &(cl->cl_recs[i]);
628 status = ocfs2_info_freefrag_scan_chain(osb, gb_inode,
629 gb_dinode,
630 rec, ffg,
631 chunks_in_group);
632 if (status)
633 goto bail;
634 }
635
636 if (ffg->iff_ffs.ffs_free_chunks_real)
637 ffg->iff_ffs.ffs_avg = (ffg->iff_ffs.ffs_avg /
638 ffg->iff_ffs.ffs_free_chunks_real);
639bail:
640 if (unlock)
641 ocfs2_inode_unlock(gb_inode, 0);
642
643 if (gb_inode)
644 mutex_unlock(&gb_inode->i_mutex);
645
646 if (gb_inode)
647 iput(gb_inode);
648
649 brelse(bh);
650
651 return status;
652}
653
654int ocfs2_info_handle_freefrag(struct inode *inode,
655 struct ocfs2_info_request __user *req)
656{
657 u64 blkno = -1;
658 char namebuf[40];
659 int status = -EFAULT, type = GLOBAL_BITMAP_SYSTEM_INODE;
660
661 struct ocfs2_info_freefrag *oiff;
662 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
663 struct inode *gb_inode = NULL;
664
665 oiff = kzalloc(sizeof(struct ocfs2_info_freefrag), GFP_KERNEL);
666 if (!oiff) {
667 status = -ENOMEM;
668 mlog_errno(status);
669 goto bail;
670 }
671
672 if (o2info_from_user(*oiff, req))
673 goto bail;
674 /*
 675 * The chunk size from userspace must be a power of 2.
676 */
677 if ((oiff->iff_chunksize & (oiff->iff_chunksize - 1)) ||
678 (!oiff->iff_chunksize)) {
679 status = -EINVAL;
680 goto bail;
681 }
682
683 if (o2info_coherent(&oiff->iff_req)) {
684 gb_inode = ocfs2_get_system_file_inode(osb, type,
685 OCFS2_INVALID_SLOT);
686 if (!gb_inode) {
687 mlog(ML_ERROR, "unable to get global_bitmap inode\n");
688 status = -EIO;
689 goto bail;
690 }
691 } else {
692 ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type,
693 OCFS2_INVALID_SLOT);
694 status = ocfs2_lookup_ino_from_name(osb->sys_root_inode,
695 namebuf,
696 strlen(namebuf),
697 &blkno);
698 if (status < 0) {
699 status = -ENOENT;
700 goto bail;
701 }
702 }
703
704 status = ocfs2_info_freefrag_scan_bitmap(osb, gb_inode, blkno, oiff);
705 if (status < 0)
706 goto bail;
707
708 o2info_set_request_filled(&oiff->iff_req);
709
710 if (o2info_to_user(*oiff, req))
711 goto bail;
712
713 status = 0;
714bail:
715 if (status)
716 o2info_set_request_error(&oiff->iff_req, req);
717
718 kfree(oiff);
317 719
318 return status; 720 return status;
319} 721}
@@ -327,7 +729,7 @@ int ocfs2_info_handle_unknown(struct inode *inode,
327 if (o2info_from_user(oir, req)) 729 if (o2info_from_user(oir, req))
328 goto bail; 730 goto bail;
329 731
330 o2info_clear_request_filled(oir); 732 o2info_clear_request_filled(&oir);
331 733
332 if (o2info_to_user(oir, req)) 734 if (o2info_to_user(oir, req))
333 goto bail; 735 goto bail;
@@ -335,7 +737,7 @@ int ocfs2_info_handle_unknown(struct inode *inode,
335 status = 0; 737 status = 0;
336bail: 738bail:
337 if (status) 739 if (status)
338 o2info_set_request_error(oir, req); 740 o2info_set_request_error(&oir, req);
339 741
340 return status; 742 return status;
341} 743}
@@ -389,6 +791,14 @@ int ocfs2_info_handle_request(struct inode *inode,
389 if (oir.ir_size == sizeof(struct ocfs2_info_journal_size)) 791 if (oir.ir_size == sizeof(struct ocfs2_info_journal_size))
390 status = ocfs2_info_handle_journal_size(inode, req); 792 status = ocfs2_info_handle_journal_size(inode, req);
391 break; 793 break;
794 case OCFS2_INFO_FREEINODE:
795 if (oir.ir_size == sizeof(struct ocfs2_info_freeinode))
796 status = ocfs2_info_handle_freeinode(inode, req);
797 break;
798 case OCFS2_INFO_FREEFRAG:
799 if (oir.ir_size == sizeof(struct ocfs2_info_freefrag))
800 status = ocfs2_info_handle_freefrag(inode, req);
801 break;
392 default: 802 default:
393 status = ocfs2_info_handle_unknown(inode, req); 803 status = ocfs2_info_handle_unknown(inode, req);
394 break; 804 break;
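The two request types wired up above ride on the existing OCFS2_IOC_INFO
batching mechanism. Below is a minimal userspace sketch of a free-space
fragmentation query; the ocfs2_info/ocfs2_info_request field names are
assumptions based on the pre-existing ioctl conventions (those structs are
not shown in this hunk), so treat it as illustrative rather than canonical.

	#include <string.h>
	#include <sys/ioctl.h>
	#include <linux/types.h>
	/* plus the ocfs2_ioctl.h definitions introduced by this patch */

	static int query_freefrag(int fd, __u32 chunksize_clusters)
	{
		struct ocfs2_info_freefrag ff;
		struct ocfs2_info info;
		__u64 req;

		memset(&ff, 0, sizeof(ff));
		ff.iff_req.ir_magic = OCFS2_INFO_MAGIC;
		ff.iff_req.ir_code = OCFS2_INFO_FREEFRAG;
		ff.iff_req.ir_size = sizeof(ff);
		ff.iff_chunksize = chunksize_clusters;	/* must be a power of 2 */

		req = (__u64)(unsigned long)&ff;

		memset(&info, 0, sizeof(info));
		info.oi_requests = (__u64)(unsigned long)&req;
		info.oi_count = 1;

		if (ioctl(fd, OCFS2_IOC_INFO, &info) < 0)
			return -1;

		/* on success, ff.iff_ffs carries the free-chunk histogram */
		return 0;
	}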
@@ -542,6 +952,31 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
542 return -EFAULT; 952 return -EFAULT;
543 953
544 return ocfs2_info_handle(inode, &info, 0); 954 return ocfs2_info_handle(inode, &info, 0);
955 case FITRIM:
956 {
957 struct super_block *sb = inode->i_sb;
958 struct fstrim_range range;
959 int ret = 0;
960
961 if (!capable(CAP_SYS_ADMIN))
962 return -EPERM;
963
964 if (copy_from_user(&range, (struct fstrim_range *)arg,
965 sizeof(range)))
966 return -EFAULT;
967
968 ret = ocfs2_trim_fs(sb, &range);
969 if (ret < 0)
970 return ret;
971
972 if (copy_to_user((struct fstrim_range *)arg, &range,
973 sizeof(range)))
974 return -EFAULT;
975
976 return 0;
977 }
978 case OCFS2_IOC_MOVE_EXT:
979 return ocfs2_ioctl_move_extents(filp, (void __user *)arg);
545 default: 980 default:
546 return -ENOTTY; 981 return -ENOTTY;
547 } 982 }
@@ -569,6 +1004,7 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
569 case OCFS2_IOC_GROUP_EXTEND: 1004 case OCFS2_IOC_GROUP_EXTEND:
570 case OCFS2_IOC_GROUP_ADD: 1005 case OCFS2_IOC_GROUP_ADD:
571 case OCFS2_IOC_GROUP_ADD64: 1006 case OCFS2_IOC_GROUP_ADD64:
1007 case FITRIM:
572 break; 1008 break;
573 case OCFS2_IOC_REFLINK: 1009 case OCFS2_IOC_REFLINK:
574 if (copy_from_user(&args, (struct reflink_arguments *)arg, 1010 if (copy_from_user(&args, (struct reflink_arguments *)arg,
@@ -584,6 +1020,8 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
584 return -EFAULT; 1020 return -EFAULT;
585 1021
586 return ocfs2_info_handle(inode, &info, 1); 1022 return ocfs2_info_handle(inode, &info, 1);
1023 case OCFS2_IOC_MOVE_EXT:
1024 break;
587 default: 1025 default:
588 return -ENOIOCTLCMD; 1026 return -ENOIOCTLCMD;
589 } 1027 }
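For reference, a minimal userspace sketch of exercising the FITRIM path
added above; the mount point path is hypothetical, and on return the
kernel fills range.len with the number of bytes it trimmed.

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <linux/fs.h>	/* FITRIM, struct fstrim_range */

	int main(void)
	{
		struct fstrim_range range = {
			.start = 0,
			.len = (__u64)-1,	/* whole filesystem */
			.minlen = 0,		/* no minimum extent length */
		};
		int fd = open("/mnt/ocfs2", O_RDONLY);	/* hypothetical mount */

		if (fd < 0 || ioctl(fd, FITRIM, &range) < 0) {
			perror("FITRIM");
			return 1;
		}
		printf("trimmed %llu bytes\n", (unsigned long long)range.len);
		return 0;
	}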
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
new file mode 100644
index 000000000000..4c5488468c14
--- /dev/null
+++ b/fs/ocfs2/move_extents.c
@@ -0,0 +1,1153 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * move_extents.c
5 *
6 * Copyright (C) 2011 Oracle. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public
10 * License version 2 as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License for more details.
16 */
17#include <linux/fs.h>
18#include <linux/types.h>
19#include <linux/mount.h>
20#include <linux/swap.h>
21
22#include <cluster/masklog.h>
23
24#include "ocfs2.h"
25#include "ocfs2_ioctl.h"
26
27#include "alloc.h"
28#include "aops.h"
29#include "dlmglue.h"
30#include "extent_map.h"
31#include "inode.h"
32#include "journal.h"
33#include "suballoc.h"
34#include "uptodate.h"
35#include "super.h"
36#include "dir.h"
37#include "buffer_head_io.h"
38#include "sysfile.h"
39#include "suballoc.h"
40#include "refcounttree.h"
41#include "move_extents.h"
42
43struct ocfs2_move_extents_context {
44 struct inode *inode;
45 struct file *file;
46 int auto_defrag;
47 int partial;
48 int credits;
49 u32 new_phys_cpos;
50 u32 clusters_moved;
51 u64 refcount_loc;
52 struct ocfs2_move_extents *range;
53 struct ocfs2_extent_tree et;
54 struct ocfs2_alloc_context *meta_ac;
55 struct ocfs2_alloc_context *data_ac;
56 struct ocfs2_cached_dealloc_ctxt dealloc;
57};
58
59static int __ocfs2_move_extent(handle_t *handle,
60 struct ocfs2_move_extents_context *context,
61 u32 cpos, u32 len, u32 p_cpos, u32 new_p_cpos,
62 int ext_flags)
63{
64 int ret = 0, index;
65 struct inode *inode = context->inode;
66 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
67 struct ocfs2_extent_rec *rec, replace_rec;
68 struct ocfs2_path *path = NULL;
69 struct ocfs2_extent_list *el;
70 u64 ino = ocfs2_metadata_cache_owner(context->et.et_ci);
71 u64 old_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cpos);
72
73 ret = ocfs2_duplicate_clusters_by_page(handle, context->file, cpos,
74 p_cpos, new_p_cpos, len);
75 if (ret) {
76 mlog_errno(ret);
77 goto out;
78 }
79
80 memset(&replace_rec, 0, sizeof(replace_rec));
81 replace_rec.e_cpos = cpu_to_le32(cpos);
82 replace_rec.e_leaf_clusters = cpu_to_le16(len);
83 replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(inode->i_sb,
84 new_p_cpos));
85
86 path = ocfs2_new_path_from_et(&context->et);
87 if (!path) {
88 ret = -ENOMEM;
89 mlog_errno(ret);
90 goto out;
91 }
92
93 ret = ocfs2_find_path(INODE_CACHE(inode), path, cpos);
94 if (ret) {
95 mlog_errno(ret);
96 goto out;
97 }
98
99 el = path_leaf_el(path);
100
101 index = ocfs2_search_extent_list(el, cpos);
102 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
103 ocfs2_error(inode->i_sb,
104 "Inode %llu has an extent at cpos %u which can no "
105 "longer be found.\n",
106 (unsigned long long)ino, cpos);
107 ret = -EROFS;
108 goto out;
109 }
110
111 rec = &el->l_recs[index];
112
113 BUG_ON(ext_flags != rec->e_flags);
114 /*
115	 * After moving/defragging to the new location, the extent is not
116	 * going to be refcounted anymore.
117 */
118 replace_rec.e_flags = ext_flags & ~OCFS2_EXT_REFCOUNTED;
119
120 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
121 context->et.et_root_bh,
122 OCFS2_JOURNAL_ACCESS_WRITE);
123 if (ret) {
124 mlog_errno(ret);
125 goto out;
126 }
127
128 ret = ocfs2_split_extent(handle, &context->et, path, index,
129 &replace_rec, context->meta_ac,
130 &context->dealloc);
131 if (ret) {
132 mlog_errno(ret);
133 goto out;
134 }
135
136 ocfs2_journal_dirty(handle, context->et.et_root_bh);
137
138 context->new_phys_cpos = new_p_cpos;
139
140 /*
141	 * Do we need to append the old clusters to the truncate log?
142 */
143 if (old_blkno) {
144 if (ext_flags & OCFS2_EXT_REFCOUNTED)
145 ret = ocfs2_decrease_refcount(inode, handle,
146 ocfs2_blocks_to_clusters(osb->sb,
147 old_blkno),
148 len, context->meta_ac,
149 &context->dealloc, 1);
150 else
151 ret = ocfs2_truncate_log_append(osb, handle,
152 old_blkno, len);
153 }
154
155out:
156 return ret;
157}
158
159/*
160 * Lock the allocators and reserve an appropriate number of bits for
161 * metadata blocks and data clusters.
162 *
163 * In some cases we don't need to reserve clusters; just pass a NULL
164 * data_ac.
165 */
166static int ocfs2_lock_allocators_move_extents(struct inode *inode,
167 struct ocfs2_extent_tree *et,
168 u32 clusters_to_move,
169 u32 extents_to_split,
170 struct ocfs2_alloc_context **meta_ac,
171 struct ocfs2_alloc_context **data_ac,
172 int extra_blocks,
173 int *credits)
174{
175 int ret, num_free_extents;
176 unsigned int max_recs_needed = 2 * extents_to_split + clusters_to_move;
177 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
178
179 num_free_extents = ocfs2_num_free_extents(osb, et);
180 if (num_free_extents < 0) {
181 ret = num_free_extents;
182 mlog_errno(ret);
183 goto out;
184 }
185
186 if (!num_free_extents ||
187 (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed))
188 extra_blocks += ocfs2_extend_meta_needed(et->et_root_el);
189
190 ret = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, meta_ac);
191 if (ret) {
192 mlog_errno(ret);
193 goto out;
194 }
195
196 if (data_ac) {
197 ret = ocfs2_reserve_clusters(osb, clusters_to_move, data_ac);
198 if (ret) {
199 mlog_errno(ret);
200 goto out;
201 }
202 }
203
204 *credits += ocfs2_calc_extend_credits(osb->sb, et->et_root_el,
205 clusters_to_move + 2);
206
207 mlog(0, "reserve metadata_blocks: %d, data_clusters: %u, credits: %d\n",
208 extra_blocks, clusters_to_move, *credits);
209out:
210 if (ret) {
211 if (*meta_ac) {
212 ocfs2_free_alloc_context(*meta_ac);
213 *meta_ac = NULL;
214 }
215 }
216
217 return ret;
218}
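/*
 * Worked example with assumed numbers: moving 8 clusters while splitting
 * one extent gives max_recs_needed = 2 * 1 + 8 = 10, so on a sparse-alloc
 * fs with fewer than 10 free extent records the function reserves extra
 * metadata blocks up front via ocfs2_extend_meta_needed().
 */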
219
220/*
221 * Use one journal handle to guarantee data consistency in case a
222 * crash happens anywhere.
223 *
224 * XXX: defrag can end up finishing only part of the requested extent,
225 * when not enough contiguous clusters can be found in the allocator.
226 */
227static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context,
228 u32 cpos, u32 phys_cpos, u32 *len, int ext_flags)
229{
230 int ret, credits = 0, extra_blocks = 0, partial = context->partial;
231 handle_t *handle;
232 struct inode *inode = context->inode;
233 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
234 struct inode *tl_inode = osb->osb_tl_inode;
235 struct ocfs2_refcount_tree *ref_tree = NULL;
236 u32 new_phys_cpos, new_len;
237 u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
238
239 if ((ext_flags & OCFS2_EXT_REFCOUNTED) && *len) {
240
241 BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
242 OCFS2_HAS_REFCOUNT_FL));
243
244 BUG_ON(!context->refcount_loc);
245
246 ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1,
247 &ref_tree, NULL);
248 if (ret) {
249 mlog_errno(ret);
250 return ret;
251 }
252
253 ret = ocfs2_prepare_refcount_change_for_del(inode,
254 context->refcount_loc,
255 phys_blkno,
256 *len,
257 &credits,
258 &extra_blocks);
259 if (ret) {
260 mlog_errno(ret);
261 goto out;
262 }
263 }
264
265 ret = ocfs2_lock_allocators_move_extents(inode, &context->et, *len, 1,
266 &context->meta_ac,
267 &context->data_ac,
268 extra_blocks, &credits);
269 if (ret) {
270 mlog_errno(ret);
271 goto out;
272 }
273
274 /*
275	 * Should we be using the allocation reservation strategy here?
276 *
277 * if (context->data_ac)
278 * context->data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
279 */
280
281 mutex_lock(&tl_inode->i_mutex);
282
283 if (ocfs2_truncate_log_needs_flush(osb)) {
284 ret = __ocfs2_flush_truncate_log(osb);
285 if (ret < 0) {
286 mlog_errno(ret);
287 goto out_unlock_mutex;
288 }
289 }
290
291 handle = ocfs2_start_trans(osb, credits);
292 if (IS_ERR(handle)) {
293 ret = PTR_ERR(handle);
294 mlog_errno(ret);
295 goto out_unlock_mutex;
296 }
297
298 ret = __ocfs2_claim_clusters(handle, context->data_ac, 1, *len,
299 &new_phys_cpos, &new_len);
300 if (ret) {
301 mlog_errno(ret);
302 goto out_commit;
303 }
304
305 /*
306	 * Allowing partial extent moving has pros and cons: it makes the
307	 * whole defragmentation less likely to fail, but on the other hand
308	 * it may leave the fs even more fragmented afterwards. Let userspace
309	 * make the call here.
310 */
311 if (new_len != *len) {
312 mlog(0, "len_claimed: %u, len: %u\n", new_len, *len);
313 if (!partial) {
314 context->range->me_flags &= ~OCFS2_MOVE_EXT_FL_COMPLETE;
315 ret = -ENOSPC;
316 goto out_commit;
317 }
318 }
319
320 mlog(0, "cpos: %u, phys_cpos: %u, new_phys_cpos: %u\n", cpos,
321 phys_cpos, new_phys_cpos);
322
323 ret = __ocfs2_move_extent(handle, context, cpos, new_len, phys_cpos,
324 new_phys_cpos, ext_flags);
325 if (ret)
326 mlog_errno(ret);
327
328 if (partial && (new_len != *len))
329 *len = new_len;
330
331 /*
332 * Here we should write the new page out first if we are
333 * in write-back mode.
334 */
335 ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, *len);
336 if (ret)
337 mlog_errno(ret);
338
339out_commit:
340 ocfs2_commit_trans(osb, handle);
341
342out_unlock_mutex:
343 mutex_unlock(&tl_inode->i_mutex);
344
345 if (context->data_ac) {
346 ocfs2_free_alloc_context(context->data_ac);
347 context->data_ac = NULL;
348 }
349
350 if (context->meta_ac) {
351 ocfs2_free_alloc_context(context->meta_ac);
352 context->meta_ac = NULL;
353 }
354
355out:
356 if (ref_tree)
357 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
358
359 return ret;
360}
361
362/*
363 * Find the victim alloc group, i.e. the group where #blkno fits.
364 */
365static int ocfs2_find_victim_alloc_group(struct inode *inode,
366 u64 vict_blkno,
367 int type, int slot,
368 int *vict_bit,
369 struct buffer_head **ret_bh)
370{
371 int ret, i, blocks_per_unit = 1;
372 u64 blkno;
373 char namebuf[40];
374
375 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
376 struct buffer_head *ac_bh = NULL, *gd_bh = NULL;
377 struct ocfs2_chain_list *cl;
378 struct ocfs2_chain_rec *rec;
379 struct ocfs2_dinode *ac_dinode;
380 struct ocfs2_group_desc *bg;
381
382 ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, slot);
383 ret = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf,
384 strlen(namebuf), &blkno);
385 if (ret) {
386 ret = -ENOENT;
387 goto out;
388 }
389
390 ret = ocfs2_read_blocks_sync(osb, blkno, 1, &ac_bh);
391 if (ret) {
392 mlog_errno(ret);
393 goto out;
394 }
395
396 ac_dinode = (struct ocfs2_dinode *)ac_bh->b_data;
397 cl = &(ac_dinode->id2.i_chain);
398 rec = &(cl->cl_recs[0]);
399
400 if (type == GLOBAL_BITMAP_SYSTEM_INODE)
401 blocks_per_unit <<= (osb->s_clustersize_bits -
402 inode->i_sb->s_blocksize_bits);
403 /*
404	 * Bail out if 'vict_blkno' is out of the valid range.
405 */
406 if ((vict_blkno < le64_to_cpu(rec->c_blkno)) ||
407 (vict_blkno >= (le32_to_cpu(ac_dinode->id1.bitmap1.i_total) *
408 blocks_per_unit))) {
409 ret = -EINVAL;
410 goto out;
411 }
412
413 for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) {
414
415 rec = &(cl->cl_recs[i]);
416 if (!rec)
417 continue;
418
419 bg = NULL;
420
421 do {
422 if (!bg)
423 blkno = le64_to_cpu(rec->c_blkno);
424 else
425 blkno = le64_to_cpu(bg->bg_next_group);
426
427 if (gd_bh) {
428 brelse(gd_bh);
429 gd_bh = NULL;
430 }
431
432 ret = ocfs2_read_blocks_sync(osb, blkno, 1, &gd_bh);
433 if (ret) {
434 mlog_errno(ret);
435 goto out;
436 }
437
438 bg = (struct ocfs2_group_desc *)gd_bh->b_data;
439
440 if (vict_blkno < (le64_to_cpu(bg->bg_blkno) +
441 le16_to_cpu(bg->bg_bits))) {
442
443 *ret_bh = gd_bh;
444 *vict_bit = (vict_blkno - blkno) /
445 blocks_per_unit;
446				mlog(0, "found the victim group: #%llu, "
447 "total_bits: %u, vict_bit: %u\n",
448 blkno, le16_to_cpu(bg->bg_bits),
449 *vict_bit);
450 goto out;
451 }
452
453 } while (le64_to_cpu(bg->bg_next_group));
454 }
455
456 ret = -EINVAL;
457out:
458 brelse(ac_bh);
459
460 /*
461 * caller has to release the gd_bh properly.
462 */
463 return ret;
464}
465
466/*
467 * XXX: helper to validate and adjust moving goal.
468 */
469static int ocfs2_validate_and_adjust_move_goal(struct inode *inode,
470 struct ocfs2_move_extents *range)
471{
472 int ret, goal_bit = 0;
473
474 struct buffer_head *gd_bh = NULL;
475 struct ocfs2_group_desc *bg;
476 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
477 int c_to_b = 1 << (osb->s_clustersize_bits -
478 inode->i_sb->s_blocksize_bits);
479
480 /*
481	 * Validate that the goal sits within the global_bitmap, and return
482	 * the victim group descriptor.
483 */
484 ret = ocfs2_find_victim_alloc_group(inode, range->me_goal,
485 GLOBAL_BITMAP_SYSTEM_INODE,
486 OCFS2_INVALID_SLOT,
487 &goal_bit, &gd_bh);
488 if (ret)
489 goto out;
490
491 bg = (struct ocfs2_group_desc *)gd_bh->b_data;
492
493 /*
494	 * Make the goal cluster-aligned.
495 */
496 if (range->me_goal % c_to_b)
497 range->me_goal = range->me_goal / c_to_b * c_to_b;
498
499 /*
500	 * The moving goal is not allowed to start at a group descriptor
501	 * block (blk #0); compromise and use the following cluster instead.
502 */
503 if (range->me_goal == le64_to_cpu(bg->bg_blkno))
504 range->me_goal += c_to_b;
505
506 /*
507	 * The movement is not allowed to cross group boundaries.
508 */
509 if ((le16_to_cpu(bg->bg_bits) - goal_bit) * osb->s_clustersize <
510 range->me_len) {
511 ret = -EINVAL;
512 goto out;
513 }
514 /*
515	 * More exact validations/adjustments will be performed later,
516	 * during the moving operation for each extent range.
517 */
518	mlog(0, "extents are ready to be moved to block #%llu\n",
519 range->me_goal);
520
521out:
522 brelse(gd_bh);
523
524 return ret;
525}
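/*
 * Illustration of the goal adjustment above, with assumed sizes: a 4K
 * block size and a 32K cluster size give c_to_b = 32K / 4K = 8 blocks
 * per cluster. A me_goal of block 21 is rounded down to 21 / 8 * 8 = 16;
 * if block 16 happens to be the group descriptor block, the goal is
 * bumped to the next cluster at 16 + 8 = 24.
 */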
526
527static void ocfs2_probe_alloc_group(struct inode *inode, struct buffer_head *bh,
528 int *goal_bit, u32 move_len, u32 max_hop,
529 u32 *phys_cpos)
530{
531 int i, used, last_free_bits = 0, base_bit = *goal_bit;
532 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
533 u32 base_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
534 le64_to_cpu(gd->bg_blkno));
535
536 for (i = base_bit; i < le16_to_cpu(gd->bg_bits); i++) {
537
538 used = ocfs2_test_bit(i, (unsigned long *)gd->bg_bitmap);
539 if (used) {
540 /*
541			 * We even tried searching for a free chunk by
542			 * jumping up to a 'max_hop' distance, but still failed.
543 */
544 if ((i - base_bit) > max_hop) {
545 *phys_cpos = 0;
546 break;
547 }
548
549 if (last_free_bits)
550 last_free_bits = 0;
551
552 continue;
553 } else
554 last_free_bits++;
555
556 if (last_free_bits == move_len) {
557 *goal_bit = i;
558 *phys_cpos = base_cpos + i;
559 break;
560 }
561 }
562
563 mlog(0, "found phys_cpos: %u to fit the wanted moving.\n", *phys_cpos);
564}
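/*
 * Illustration of the probe above on an assumed bitmap (1 = used,
 * 0 = free), scanning from *goal_bit = 2 with move_len = 3:
 *
 *	bit:	0 1 2 3 4 5 6 7
 *	bitmap:	1 1 0 0 1 0 0 0
 *
 * The free run at bits 2-3 is cut short by the used bit 4, so
 * last_free_bits resets; the run at bits 5-7 reaches move_len at
 * i = 7, giving *goal_bit = 7 and *phys_cpos = base_cpos + 7. If a
 * used bit further than max_hop from the starting bit is hit first,
 * *phys_cpos is zeroed and the caller fails with -ENOSPC.
 */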
565
566static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
567 handle_t *handle,
568 struct buffer_head *di_bh,
569 u32 num_bits,
570 u16 chain)
571{
572 int ret;
573 u32 tmp_used;
574 struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
575 struct ocfs2_chain_list *cl =
576 (struct ocfs2_chain_list *) &di->id2.i_chain;
577
578 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
579 OCFS2_JOURNAL_ACCESS_WRITE);
580 if (ret < 0) {
581 mlog_errno(ret);
582 goto out;
583 }
584
585 tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
586 di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
587 le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
588 ocfs2_journal_dirty(handle, di_bh);
589
590out:
591 return ret;
592}
593
594static inline int ocfs2_block_group_set_bits(handle_t *handle,
595 struct inode *alloc_inode,
596 struct ocfs2_group_desc *bg,
597 struct buffer_head *group_bh,
598 unsigned int bit_off,
599 unsigned int num_bits)
600{
601 int status;
602 void *bitmap = bg->bg_bitmap;
603 int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
604
605 /* All callers get the descriptor via
606 * ocfs2_read_group_descriptor(). Any corruption is a code bug. */
607 BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
608 BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
609
610 mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
611 num_bits);
612
613 if (ocfs2_is_cluster_bitmap(alloc_inode))
614 journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
615
616 status = ocfs2_journal_access_gd(handle,
617 INODE_CACHE(alloc_inode),
618 group_bh,
619 journal_type);
620 if (status < 0) {
621 mlog_errno(status);
622 goto bail;
623 }
624
625 le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
626 if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
627 ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
628 " count %u but claims %u are freed. num_bits %d",
629 (unsigned long long)le64_to_cpu(bg->bg_blkno),
630 le16_to_cpu(bg->bg_bits),
631 le16_to_cpu(bg->bg_free_bits_count), num_bits);
632 return -EROFS;
633 }
634 while (num_bits--)
635 ocfs2_set_bit(bit_off++, bitmap);
636
637 ocfs2_journal_dirty(handle, group_bh);
638
639bail:
640 return status;
641}
642
643static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
644 u32 cpos, u32 phys_cpos, u32 *new_phys_cpos,
645 u32 len, int ext_flags)
646{
647 int ret, credits = 0, extra_blocks = 0, goal_bit = 0;
648 handle_t *handle;
649 struct inode *inode = context->inode;
650 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
651 struct inode *tl_inode = osb->osb_tl_inode;
652 struct inode *gb_inode = NULL;
653 struct buffer_head *gb_bh = NULL;
654 struct buffer_head *gd_bh = NULL;
655 struct ocfs2_group_desc *gd;
656 struct ocfs2_refcount_tree *ref_tree = NULL;
657 u32 move_max_hop = ocfs2_blocks_to_clusters(inode->i_sb,
658 context->range->me_threshold);
659 u64 phys_blkno, new_phys_blkno;
660
661 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
662
663 if ((ext_flags & OCFS2_EXT_REFCOUNTED) && len) {
664
665 BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
666 OCFS2_HAS_REFCOUNT_FL));
667
668 BUG_ON(!context->refcount_loc);
669
670 ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1,
671 &ref_tree, NULL);
672 if (ret) {
673 mlog_errno(ret);
674 return ret;
675 }
676
677 ret = ocfs2_prepare_refcount_change_for_del(inode,
678 context->refcount_loc,
679 phys_blkno,
680 len,
681 &credits,
682 &extra_blocks);
683 if (ret) {
684 mlog_errno(ret);
685 goto out;
686 }
687 }
688
689 ret = ocfs2_lock_allocators_move_extents(inode, &context->et, len, 1,
690 &context->meta_ac,
691 NULL, extra_blocks, &credits);
692 if (ret) {
693 mlog_errno(ret);
694 goto out;
695 }
696
697 /*
698 * need to count 2 extra credits for global_bitmap inode and
699 * group descriptor.
700 */
701 credits += OCFS2_INODE_UPDATE_CREDITS + 1;
702
703 /*
704	 * ocfs2_move_extent() doesn't reserve any clusters in the
705	 * lock_allocators() logic, but we still need to lock the global_bitmap.
706 */
707 gb_inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE,
708 OCFS2_INVALID_SLOT);
709 if (!gb_inode) {
710 mlog(ML_ERROR, "unable to get global_bitmap inode\n");
711 ret = -EIO;
712 goto out;
713 }
714
715 mutex_lock(&gb_inode->i_mutex);
716
717 ret = ocfs2_inode_lock(gb_inode, &gb_bh, 1);
718 if (ret) {
719 mlog_errno(ret);
720 goto out_unlock_gb_mutex;
721 }
722
723 mutex_lock(&tl_inode->i_mutex);
724
725 handle = ocfs2_start_trans(osb, credits);
726 if (IS_ERR(handle)) {
727 ret = PTR_ERR(handle);
728 mlog_errno(ret);
729 goto out_unlock_tl_inode;
730 }
731
732 new_phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *new_phys_cpos);
733 ret = ocfs2_find_victim_alloc_group(inode, new_phys_blkno,
734 GLOBAL_BITMAP_SYSTEM_INODE,
735 OCFS2_INVALID_SLOT,
736 &goal_bit, &gd_bh);
737 if (ret) {
738 mlog_errno(ret);
739 goto out_commit;
740 }
741
742 /*
743	 * Probe the victim cluster group to find a proper
744	 * region to fit the wanted movement; it will even perform
745	 * a best-effort attempt by compromising to a threshold
746	 * around the goal.
747 */
748 ocfs2_probe_alloc_group(inode, gd_bh, &goal_bit, len, move_max_hop,
749 new_phys_cpos);
750	if (!*new_phys_cpos) {
751 ret = -ENOSPC;
752 goto out_commit;
753 }
754
755 ret = __ocfs2_move_extent(handle, context, cpos, len, phys_cpos,
756 *new_phys_cpos, ext_flags);
757 if (ret) {
758 mlog_errno(ret);
759 goto out_commit;
760 }
761
762 gd = (struct ocfs2_group_desc *)gd_bh->b_data;
763 ret = ocfs2_alloc_dinode_update_counts(gb_inode, handle, gb_bh, len,
764 le16_to_cpu(gd->bg_chain));
765 if (ret) {
766 mlog_errno(ret);
767 goto out_commit;
768 }
769
770 ret = ocfs2_block_group_set_bits(handle, gb_inode, gd, gd_bh,
771 goal_bit, len);
772 if (ret)
773 mlog_errno(ret);
774
775 /*
776 * Here we should write the new page out first if we are
777 * in write-back mode.
778 */
779 ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, len);
780 if (ret)
781 mlog_errno(ret);
782
783out_commit:
784 ocfs2_commit_trans(osb, handle);
785 brelse(gd_bh);
786
787out_unlock_tl_inode:
788 mutex_unlock(&tl_inode->i_mutex);
789
790 ocfs2_inode_unlock(gb_inode, 1);
791out_unlock_gb_mutex:
792 mutex_unlock(&gb_inode->i_mutex);
793 brelse(gb_bh);
794 iput(gb_inode);
795
796out:
797 if (context->meta_ac) {
798 ocfs2_free_alloc_context(context->meta_ac);
799 context->meta_ac = NULL;
800 }
801
802 if (ref_tree)
803 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
804
805 return ret;
806}
807
808/*
809 * Helper to calculate the defrag length in one run according to the threshold.
810 */
811static void ocfs2_calc_extent_defrag_len(u32 *alloc_size, u32 *len_defraged,
812 u32 threshold, int *skip)
813{
814 if ((*alloc_size + *len_defraged) < threshold) {
815 /*
816		 * Proceed with defragmentation until we meet the threshold.
817 */
818 *len_defraged += *alloc_size;
819 } else if (*len_defraged == 0) {
820 /*
821 * XXX: skip a large extent.
822 */
823 *skip = 1;
824 } else {
825 /*
826		 * Split this extent so that it coalesces with the former
827		 * pieces to reach the threshold.
828		 *
829		 * We're done here with one cycle of defragmentation of
830		 * size 'threshold'; resetting 'len_defraged' forces a
831		 * new cycle.
832 */
833 *alloc_size = threshold - *len_defraged;
834 *len_defraged = 0;
835 }
836}
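/*
 * Worked example with an assumed threshold of 8 clusters: extents of
 * 3, 4 and 5 clusters arrive in turn. The first two accumulate
 * (len_defraged becomes 3, then 7); the third is trimmed to
 * 8 - 7 = 1 cluster and len_defraged resets, starting a new cycle.
 * A lone 10-cluster extent (>= threshold with len_defraged == 0) is
 * simply skipped as already large enough.
 */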
837
838static int __ocfs2_move_extents_range(struct buffer_head *di_bh,
839 struct ocfs2_move_extents_context *context)
840{
841 int ret = 0, flags, do_defrag, skip = 0;
842 u32 cpos, phys_cpos, move_start, len_to_move, alloc_size;
843 u32 len_defraged = 0, defrag_thresh = 0, new_phys_cpos = 0;
844
845 struct inode *inode = context->inode;
846 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
847 struct ocfs2_move_extents *range = context->range;
848 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
849
850 if ((inode->i_size == 0) || (range->me_len == 0))
851 return 0;
852
853 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
854 return 0;
855
856 context->refcount_loc = le64_to_cpu(di->i_refcount_loc);
857
858 ocfs2_init_dinode_extent_tree(&context->et, INODE_CACHE(inode), di_bh);
859 ocfs2_init_dealloc_ctxt(&context->dealloc);
860
861 /*
862 * TO-DO XXX:
863 *
864 * - xattr extents.
865 */
866
867 do_defrag = context->auto_defrag;
868
869 /*
870	 * Extent moving happens in units of clusters; for the sake
871	 * of simplicity, we may ignore the two clusters that contain
872	 * 'byte_start' and 'byte_start + len'.
873 */
874 move_start = ocfs2_clusters_for_bytes(osb->sb, range->me_start);
875 len_to_move = (range->me_start + range->me_len) >>
876 osb->s_clustersize_bits;
877 if (len_to_move >= move_start)
878 len_to_move -= move_start;
879 else
880 len_to_move = 0;
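	/*
	 * Example with assumed sizes: a 32K cluster size, me_start = 100000
	 * and me_len = 200000 give move_start = 4 (clusters_for_bytes rounds
	 * up) and len_to_move = (300000 >> 15) - move_start = 9 - 4 = 5.
	 */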
881
882 if (do_defrag) {
883 defrag_thresh = range->me_threshold >> osb->s_clustersize_bits;
884 if (defrag_thresh <= 1)
885 goto done;
886 } else
887 new_phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
888 range->me_goal);
889
890 mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u, "
891 "thresh: %u\n",
892 (unsigned long long)OCFS2_I(inode)->ip_blkno,
893 (unsigned long long)range->me_start,
894 (unsigned long long)range->me_len,
895 move_start, len_to_move, defrag_thresh);
896
897 cpos = move_start;
898 while (len_to_move) {
899 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &alloc_size,
900 &flags);
901 if (ret) {
902 mlog_errno(ret);
903 goto out;
904 }
905
906 if (alloc_size > len_to_move)
907 alloc_size = len_to_move;
908
909 /*
910 * XXX: how to deal with a hole:
911 *
912 * - skip the hole of course
913 * - force a new defragmentation
914 */
915 if (!phys_cpos) {
916 if (do_defrag)
917 len_defraged = 0;
918
919 goto next;
920 }
921
922 if (do_defrag) {
923 ocfs2_calc_extent_defrag_len(&alloc_size, &len_defraged,
924 defrag_thresh, &skip);
925 /*
926 * skip large extents
927 */
928 if (skip) {
929 skip = 0;
930 goto next;
931 }
932
933 mlog(0, "#Defrag: cpos: %u, phys_cpos: %u, "
934 "alloc_size: %u, len_defraged: %u\n",
935 cpos, phys_cpos, alloc_size, len_defraged);
936
937 ret = ocfs2_defrag_extent(context, cpos, phys_cpos,
938 &alloc_size, flags);
939 } else {
940 ret = ocfs2_move_extent(context, cpos, phys_cpos,
941 &new_phys_cpos, alloc_size,
942 flags);
943
944 new_phys_cpos += alloc_size;
945 }
946
947 if (ret < 0) {
948 mlog_errno(ret);
949 goto out;
950 }
951
952 context->clusters_moved += alloc_size;
953next:
954 cpos += alloc_size;
955 len_to_move -= alloc_size;
956 }
957
958done:
959 range->me_flags |= OCFS2_MOVE_EXT_FL_COMPLETE;
960
961out:
962 range->me_moved_len = ocfs2_clusters_to_bytes(osb->sb,
963 context->clusters_moved);
964 range->me_new_offset = ocfs2_clusters_to_bytes(osb->sb,
965 context->new_phys_cpos);
966
967 ocfs2_schedule_truncate_log_flush(osb, 1);
968 ocfs2_run_deallocs(osb, &context->dealloc);
969
970 return ret;
971}
972
973static int ocfs2_move_extents(struct ocfs2_move_extents_context *context)
974{
975 int status;
976 handle_t *handle;
977 struct inode *inode = context->inode;
978 struct ocfs2_dinode *di;
979 struct buffer_head *di_bh = NULL;
980 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
981
982 if (!inode)
983 return -ENOENT;
984
985 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
986 return -EROFS;
987
988 mutex_lock(&inode->i_mutex);
989
990 /*
991 * This prevents concurrent writes from other nodes
992 */
993 status = ocfs2_rw_lock(inode, 1);
994 if (status) {
995 mlog_errno(status);
996 goto out;
997 }
998
999 status = ocfs2_inode_lock(inode, &di_bh, 1);
1000 if (status) {
1001 mlog_errno(status);
1002 goto out_rw_unlock;
1003 }
1004
1005 /*
1006	 * Remember that ip_xattr_sem also needs to be held if necessary.
1007 */
1008 down_write(&OCFS2_I(inode)->ip_alloc_sem);
1009
1010 status = __ocfs2_move_extents_range(di_bh, context);
1011
1012 up_write(&OCFS2_I(inode)->ip_alloc_sem);
1013 if (status) {
1014 mlog_errno(status);
1015 goto out_inode_unlock;
1016 }
1017
1018 /*
1019 * We update ctime for these changes
1020 */
1021 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1022 if (IS_ERR(handle)) {
1023 status = PTR_ERR(handle);
1024 mlog_errno(status);
1025 goto out_inode_unlock;
1026 }
1027
1028 status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
1029 OCFS2_JOURNAL_ACCESS_WRITE);
1030 if (status) {
1031 mlog_errno(status);
1032 goto out_commit;
1033 }
1034
1035 di = (struct ocfs2_dinode *)di_bh->b_data;
1036 inode->i_ctime = CURRENT_TIME;
1037 di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
1038 di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
1039
1040 ocfs2_journal_dirty(handle, di_bh);
1041
1042out_commit:
1043 ocfs2_commit_trans(osb, handle);
1044
1045out_inode_unlock:
1046 brelse(di_bh);
1047 ocfs2_inode_unlock(inode, 1);
1048out_rw_unlock:
1049 ocfs2_rw_unlock(inode, 1);
1050out:
1051 mutex_unlock(&inode->i_mutex);
1052
1053 return status;
1054}
1055
1056int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp)
1057{
1058 int status;
1059
1060 struct inode *inode = filp->f_path.dentry->d_inode;
1061 struct ocfs2_move_extents range;
1062 struct ocfs2_move_extents_context *context = NULL;
1063
1064 status = mnt_want_write(filp->f_path.mnt);
1065 if (status)
1066 return status;
1067
1068 if ((!S_ISREG(inode->i_mode)) || !(filp->f_mode & FMODE_WRITE))
1069 goto out;
1070
1071 if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
1072 status = -EPERM;
1073 goto out;
1074 }
1075
1076 context = kzalloc(sizeof(struct ocfs2_move_extents_context), GFP_NOFS);
1077 if (!context) {
1078 status = -ENOMEM;
1079 mlog_errno(status);
1080 goto out;
1081 }
1082
1083 context->inode = inode;
1084 context->file = filp;
1085
1086 if (argp) {
1087 if (copy_from_user(&range, (struct ocfs2_move_extents *)argp,
1088 sizeof(range))) {
1089 status = -EFAULT;
1090 goto out;
1091 }
1092 } else {
1093 status = -EINVAL;
1094 goto out;
1095 }
1096
1097 if (range.me_start > i_size_read(inode))
1098 goto out;
1099
1100 if (range.me_start + range.me_len > i_size_read(inode))
1101 range.me_len = i_size_read(inode) - range.me_start;
1102
1103 context->range = &range;
1104
1105 if (range.me_flags & OCFS2_MOVE_EXT_FL_AUTO_DEFRAG) {
1106 context->auto_defrag = 1;
1107 /*
1108		 * OK, the default threshold for defragmentation
1109		 * is 1M, since our maximum clustersize is 1M as well.
1110		 * Any thoughts?
1111 */
1112 if (!range.me_threshold)
1113 range.me_threshold = 1024 * 1024;
1114
1115 if (range.me_threshold > i_size_read(inode))
1116 range.me_threshold = i_size_read(inode);
1117
1118 if (range.me_flags & OCFS2_MOVE_EXT_FL_PART_DEFRAG)
1119 context->partial = 1;
1120 } else {
1121 /*
1122		 * Make a first, best-effort attempt to validate and adjust
1123		 * the goal (a physical address in blocks). It can't guarantee
1124		 * that the later operation will always succeed, since the
1125		 * global_bitmap may change a bit over time.
1126 */
1127
1128 status = ocfs2_validate_and_adjust_move_goal(inode, &range);
1129 if (status)
1130 goto out;
1131 }
1132
1133 status = ocfs2_move_extents(context);
1134 if (status)
1135 mlog_errno(status);
1136out:
1137 /*
1138	 * Movement/defragmentation may end up only partially completed;
1139	 * that's why we need to return the finished length and new_offset
1140	 * to userspace, even if a failure happens somewhere.
1141 */
1142 if (argp) {
1143 if (copy_to_user((struct ocfs2_move_extents *)argp, &range,
1144 sizeof(range)))
1145 status = -EFAULT;
1146 }
1147
1148 kfree(context);
1149
1150 mnt_drop_write(filp->f_path.mnt);
1151
1152 return status;
1153}
diff --git a/fs/ocfs2/move_extents.h b/fs/ocfs2/move_extents.h
new file mode 100644
index 000000000000..4e143e811441
--- /dev/null
+++ b/fs/ocfs2/move_extents.h
@@ -0,0 +1,22 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * move_extents.h
5 *
6 * Copyright (C) 2011 Oracle. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public
10 * License version 2 as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License for more details.
16 */
17#ifndef OCFS2_MOVE_EXTENTS_H
18#define OCFS2_MOVE_EXTENTS_H
19
20int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp);
21
22#endif /* OCFS2_MOVE_EXTENTS_H */
diff --git a/fs/ocfs2/ocfs2_ioctl.h b/fs/ocfs2/ocfs2_ioctl.h
index b46f39bf7438..5b27ff1fa577 100644
--- a/fs/ocfs2/ocfs2_ioctl.h
+++ b/fs/ocfs2/ocfs2_ioctl.h
@@ -142,6 +142,38 @@ struct ocfs2_info_journal_size {
142 __u64 ij_journal_size; 142 __u64 ij_journal_size;
143}; 143};
144 144
145struct ocfs2_info_freeinode {
146 struct ocfs2_info_request ifi_req;
147 struct ocfs2_info_local_freeinode {
148 __u64 lfi_total;
149 __u64 lfi_free;
150 } ifi_stat[OCFS2_MAX_SLOTS];
151 __u32 ifi_slotnum; /* out */
152 __u32 ifi_pad;
153};
154
155#define OCFS2_INFO_MAX_HIST (32)
156
157struct ocfs2_info_freefrag {
158 struct ocfs2_info_request iff_req;
159 struct ocfs2_info_freefrag_stats { /* (out) */
160 struct ocfs2_info_free_chunk_list {
161 __u32 fc_chunks[OCFS2_INFO_MAX_HIST];
162 __u32 fc_clusters[OCFS2_INFO_MAX_HIST];
163 } ffs_fc_hist;
164 __u32 ffs_clusters;
165 __u32 ffs_free_clusters;
166 __u32 ffs_free_chunks;
167 __u32 ffs_free_chunks_real;
168 __u32 ffs_min; /* Minimum free chunksize in clusters */
169 __u32 ffs_max;
170 __u32 ffs_avg;
171 __u32 ffs_pad;
172 } iff_ffs;
173	__u32 iff_chunksize;	/* chunksize in clusters (in) */
174 __u32 iff_pad;
175};
176
145/* Codes for ocfs2_info_request */ 177/* Codes for ocfs2_info_request */
146enum ocfs2_info_type { 178enum ocfs2_info_type {
147 OCFS2_INFO_CLUSTERSIZE = 1, 179 OCFS2_INFO_CLUSTERSIZE = 1,
@@ -151,6 +183,8 @@ enum ocfs2_info_type {
151 OCFS2_INFO_UUID, 183 OCFS2_INFO_UUID,
152 OCFS2_INFO_FS_FEATURES, 184 OCFS2_INFO_FS_FEATURES,
153 OCFS2_INFO_JOURNAL_SIZE, 185 OCFS2_INFO_JOURNAL_SIZE,
186 OCFS2_INFO_FREEINODE,
187 OCFS2_INFO_FREEFRAG,
154 OCFS2_INFO_NUM_TYPES 188 OCFS2_INFO_NUM_TYPES
155}; 189};
156 190
@@ -171,4 +205,38 @@ enum ocfs2_info_type {
171 205
172#define OCFS2_IOC_INFO _IOR('o', 5, struct ocfs2_info) 206#define OCFS2_IOC_INFO _IOR('o', 5, struct ocfs2_info)
173 207
208struct ocfs2_move_extents {
209/* All values are in bytes */
210 /* in */
211 __u64 me_start; /* Virtual start in the file to move */
212 __u64 me_len; /* Length of the extents to be moved */
213	__u64 me_goal;		/* Physical offset of the goal,
214				   in block units */
215 __u64 me_threshold; /* Maximum distance from goal or threshold
216 for auto defragmentation */
217 __u64 me_flags; /* Flags for the operation:
218 * - auto defragmentation.
219 * - refcount,xattr cases.
220 */
221 /* out */
222	__u64 me_moved_len;	/* Moved/defragged length */
223 __u64 me_new_offset; /* Resulting physical location */
224	__u32 me_reserved[2];	/* Reserved for future use */
225};
226
227#define OCFS2_MOVE_EXT_FL_AUTO_DEFRAG	(0x00000001)	/* Kernel is free to
228							   claim new clusters
229							   as the goal for the
230							   extent move */
231#define OCFS2_MOVE_EXT_FL_PART_DEFRAG	(0x00000002)	/* Allow partial extent
232							   moving; makes movement
233							   less likely to fail,
234							   but may leave the fs
235							   even more fragmented */
236#define OCFS2_MOVE_EXT_FL_COMPLETE	(0x00000004)	/* Move or defragmentation
237							   completed in full.
238							 */
239
240#define OCFS2_IOC_MOVE_EXT _IOW('o', 6, struct ocfs2_move_extents)
241
174#endif /* OCFS2_IOCTL_H */ 242#endif /* OCFS2_IOCTL_H */
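A minimal userspace sketch of driving the new OCFS2_IOC_MOVE_EXT interface
in auto-defrag mode follows. The file path comes from argv, error handling
is deliberately thin, and the include of the definitions above is assumed;
it is an illustration of the interface, not a reference tool.

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <unistd.h>
	#include <linux/types.h>
	/* plus the struct ocfs2_move_extents and OCFS2_* definitions above */

	int main(int argc, char **argv)
	{
		struct ocfs2_move_extents me;
		int fd;

		if (argc < 2)
			return 1;

		fd = open(argv[1], O_RDWR);	/* a file on an ocfs2 mount */
		if (fd < 0) {
			perror("open");
			return 1;
		}

		memset(&me, 0, sizeof(me));
		me.me_start = 0;		/* virtual start, in bytes */
		me.me_len = (__u64)-1;		/* kernel clamps this to i_size */
		me.me_flags = OCFS2_MOVE_EXT_FL_AUTO_DEFRAG |
			      OCFS2_MOVE_EXT_FL_PART_DEFRAG;
		/* me_threshold left at 0: the kernel defaults it to 1M */

		if (ioctl(fd, OCFS2_IOC_MOVE_EXT, &me) < 0)
			perror("OCFS2_IOC_MOVE_EXT");

		printf("moved %llu bytes, complete=%d\n",
		       (unsigned long long)me.me_moved_len,
		       !!(me.me_flags & OCFS2_MOVE_EXT_FL_COMPLETE));
		close(fd);
		return 0;
	}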
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index a1dae5bb54ac..3b481f490633 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -688,6 +688,31 @@ TRACE_EVENT(ocfs2_cache_block_dealloc,
688 __entry->blkno, __entry->bit) 688 __entry->blkno, __entry->bit)
689); 689);
690 690
691TRACE_EVENT(ocfs2_trim_extent,
692 TP_PROTO(struct super_block *sb, unsigned long long blk,
693 unsigned long long count),
694 TP_ARGS(sb, blk, count),
695 TP_STRUCT__entry(
696 __field(int, dev_major)
697 __field(int, dev_minor)
698 __field(unsigned long long, blk)
699 __field(__u64, count)
700 ),
701 TP_fast_assign(
702 __entry->dev_major = MAJOR(sb->s_dev);
703 __entry->dev_minor = MINOR(sb->s_dev);
704 __entry->blk = blk;
705 __entry->count = count;
706 ),
707 TP_printk("%d %d %llu %llu",
708 __entry->dev_major, __entry->dev_minor,
709 __entry->blk, __entry->count)
710);
711
712DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_trim_group);
713
714DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_fs);
715
691/* End of trace events for fs/ocfs2/alloc.c. */ 716/* End of trace events for fs/ocfs2/alloc.c. */
692 717
693/* Trace events for fs/ocfs2/localalloc.c. */ 718/* Trace events for fs/ocfs2/localalloc.c. */
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 3c7606cff1ab..ebfd3825f12a 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -66,7 +66,7 @@ struct ocfs2_cow_context {
66 u32 *num_clusters, 66 u32 *num_clusters,
67 unsigned int *extent_flags); 67 unsigned int *extent_flags);
68 int (*cow_duplicate_clusters)(handle_t *handle, 68 int (*cow_duplicate_clusters)(handle_t *handle,
69 struct ocfs2_cow_context *context, 69 struct file *file,
70 u32 cpos, u32 old_cluster, 70 u32 cpos, u32 old_cluster,
71 u32 new_cluster, u32 new_len); 71 u32 new_cluster, u32 new_len);
72}; 72};
@@ -2921,20 +2921,21 @@ static int ocfs2_clear_cow_buffer(handle_t *handle, struct buffer_head *bh)
2921 return 0; 2921 return 0;
2922} 2922}
2923 2923
2924static int ocfs2_duplicate_clusters_by_page(handle_t *handle, 2924int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2925 struct ocfs2_cow_context *context, 2925 struct file *file,
2926 u32 cpos, u32 old_cluster, 2926 u32 cpos, u32 old_cluster,
2927 u32 new_cluster, u32 new_len) 2927 u32 new_cluster, u32 new_len)
2928{ 2928{
2929 int ret = 0, partial; 2929 int ret = 0, partial;
2930 struct ocfs2_caching_info *ci = context->data_et.et_ci; 2930 struct inode *inode = file->f_path.dentry->d_inode;
2931 struct ocfs2_caching_info *ci = INODE_CACHE(inode);
2931 struct super_block *sb = ocfs2_metadata_cache_get_super(ci); 2932 struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
2932 u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); 2933 u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
2933 struct page *page; 2934 struct page *page;
2934 pgoff_t page_index; 2935 pgoff_t page_index;
2935 unsigned int from, to, readahead_pages; 2936 unsigned int from, to, readahead_pages;
2936 loff_t offset, end, map_end; 2937 loff_t offset, end, map_end;
2937 struct address_space *mapping = context->inode->i_mapping; 2938 struct address_space *mapping = inode->i_mapping;
2938 2939
2939 trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster, 2940 trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster,
2940 new_cluster, new_len); 2941 new_cluster, new_len);
@@ -2948,8 +2949,8 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2948 * We only duplicate pages until we reach the page contains i_size - 1. 2949 * We only duplicate pages until we reach the page contains i_size - 1.
2949 * So trim 'end' to i_size. 2950 * So trim 'end' to i_size.
2950 */ 2951 */
2951 if (end > i_size_read(context->inode)) 2952 if (end > i_size_read(inode))
2952 end = i_size_read(context->inode); 2953 end = i_size_read(inode);
2953 2954
2954 while (offset < end) { 2955 while (offset < end) {
2955 page_index = offset >> PAGE_CACHE_SHIFT; 2956 page_index = offset >> PAGE_CACHE_SHIFT;
@@ -2972,10 +2973,9 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2972 if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize) 2973 if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize)
2973 BUG_ON(PageDirty(page)); 2974 BUG_ON(PageDirty(page));
2974 2975
2975 if (PageReadahead(page) && context->file) { 2976 if (PageReadahead(page)) {
2976 page_cache_async_readahead(mapping, 2977 page_cache_async_readahead(mapping,
2977 &context->file->f_ra, 2978 &file->f_ra, file,
2978 context->file,
2979 page, page_index, 2979 page, page_index,
2980 readahead_pages); 2980 readahead_pages);
2981 } 2981 }
@@ -2999,8 +2999,7 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2999 } 2999 }
3000 } 3000 }
3001 3001
3002 ocfs2_map_and_dirty_page(context->inode, 3002 ocfs2_map_and_dirty_page(inode, handle, from, to,
3003 handle, from, to,
3004 page, 0, &new_block); 3003 page, 0, &new_block);
3005 mark_page_accessed(page); 3004 mark_page_accessed(page);
3006unlock: 3005unlock:
@@ -3015,14 +3014,15 @@ unlock:
3015 return ret; 3014 return ret;
3016} 3015}
3017 3016
3018static int ocfs2_duplicate_clusters_by_jbd(handle_t *handle, 3017int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
3019 struct ocfs2_cow_context *context, 3018 struct file *file,
3020 u32 cpos, u32 old_cluster, 3019 u32 cpos, u32 old_cluster,
3021 u32 new_cluster, u32 new_len) 3020 u32 new_cluster, u32 new_len)
3022{ 3021{
3023 int ret = 0; 3022 int ret = 0;
3024 struct super_block *sb = context->inode->i_sb; 3023 struct inode *inode = file->f_path.dentry->d_inode;
3025 struct ocfs2_caching_info *ci = context->data_et.et_ci; 3024 struct super_block *sb = inode->i_sb;
3025 struct ocfs2_caching_info *ci = INODE_CACHE(inode);
3026 int i, blocks = ocfs2_clusters_to_blocks(sb, new_len); 3026 int i, blocks = ocfs2_clusters_to_blocks(sb, new_len);
3027 u64 old_block = ocfs2_clusters_to_blocks(sb, old_cluster); 3027 u64 old_block = ocfs2_clusters_to_blocks(sb, old_cluster);
3028 u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); 3028 u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
@@ -3145,8 +3145,8 @@ static int ocfs2_replace_clusters(handle_t *handle,
3145 3145
3146 /*If the old clusters is unwritten, no need to duplicate. */ 3146 /*If the old clusters is unwritten, no need to duplicate. */
3147 if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) { 3147 if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) {
3148 ret = context->cow_duplicate_clusters(handle, context, cpos, 3148 ret = context->cow_duplicate_clusters(handle, context->file,
3149 old, new, len); 3149 cpos, old, new, len);
3150 if (ret) { 3150 if (ret) {
3151 mlog_errno(ret); 3151 mlog_errno(ret);
3152 goto out; 3152 goto out;
@@ -3162,22 +3162,22 @@ out:
3162 return ret; 3162 return ret;
3163} 3163}
3164 3164
3165static int ocfs2_cow_sync_writeback(struct super_block *sb, 3165int ocfs2_cow_sync_writeback(struct super_block *sb,
3166 struct ocfs2_cow_context *context, 3166 struct inode *inode,
3167 u32 cpos, u32 num_clusters) 3167 u32 cpos, u32 num_clusters)
3168{ 3168{
3169 int ret = 0; 3169 int ret = 0;
3170 loff_t offset, end, map_end; 3170 loff_t offset, end, map_end;
3171 pgoff_t page_index; 3171 pgoff_t page_index;
3172 struct page *page; 3172 struct page *page;
3173 3173
3174 if (ocfs2_should_order_data(context->inode)) 3174 if (ocfs2_should_order_data(inode))
3175 return 0; 3175 return 0;
3176 3176
3177 offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits; 3177 offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
3178 end = offset + (num_clusters << OCFS2_SB(sb)->s_clustersize_bits); 3178 end = offset + (num_clusters << OCFS2_SB(sb)->s_clustersize_bits);
3179 3179
3180 ret = filemap_fdatawrite_range(context->inode->i_mapping, 3180 ret = filemap_fdatawrite_range(inode->i_mapping,
3181 offset, end - 1); 3181 offset, end - 1);
3182 if (ret < 0) { 3182 if (ret < 0) {
3183 mlog_errno(ret); 3183 mlog_errno(ret);
@@ -3190,7 +3190,7 @@ static int ocfs2_cow_sync_writeback(struct super_block *sb,
3190 if (map_end > end) 3190 if (map_end > end)
3191 map_end = end; 3191 map_end = end;
3192 3192
3193 page = find_or_create_page(context->inode->i_mapping, 3193 page = find_or_create_page(inode->i_mapping,
3194 page_index, GFP_NOFS); 3194 page_index, GFP_NOFS);
3195 BUG_ON(!page); 3195 BUG_ON(!page);
3196 3196
@@ -3349,7 +3349,7 @@ static int ocfs2_make_clusters_writable(struct super_block *sb,
3349 * in write-back mode. 3349 * in write-back mode.
3350 */ 3350 */
3351 if (context->get_clusters == ocfs2_di_get_clusters) { 3351 if (context->get_clusters == ocfs2_di_get_clusters) {
3352 ret = ocfs2_cow_sync_writeback(sb, context, cpos, 3352 ret = ocfs2_cow_sync_writeback(sb, context->inode, cpos,
3353 orig_num_clusters); 3353 orig_num_clusters);
3354 if (ret) 3354 if (ret)
3355 mlog_errno(ret); 3355 mlog_errno(ret);
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index c8ce46f7d8e3..7754608c83a4 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -84,6 +84,17 @@ int ocfs2_refcount_cow_xattr(struct inode *inode,
84 struct buffer_head *ref_root_bh, 84 struct buffer_head *ref_root_bh,
85 u32 cpos, u32 write_len, 85 u32 cpos, u32 write_len,
86 struct ocfs2_post_refcount *post); 86 struct ocfs2_post_refcount *post);
87int ocfs2_duplicate_clusters_by_page(handle_t *handle,
88 struct file *file,
89 u32 cpos, u32 old_cluster,
90 u32 new_cluster, u32 new_len);
91int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
92 struct file *file,
93 u32 cpos, u32 old_cluster,
94 u32 new_cluster, u32 new_len);
95int ocfs2_cow_sync_writeback(struct super_block *sb,
96 struct inode *inode,
97 u32 cpos, u32 num_clusters);
87int ocfs2_add_refcount_flag(struct inode *inode, 98int ocfs2_add_refcount_flag(struct inode *inode,
88 struct ocfs2_extent_tree *data_et, 99 struct ocfs2_extent_tree *data_et,
89 struct ocfs2_caching_info *ref_ci, 100 struct ocfs2_caching_info *ref_ci,
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 5a521c748859..cdbaf5e97308 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -41,6 +41,7 @@
41#include <linux/mount.h> 41#include <linux/mount.h>
42#include <linux/seq_file.h> 42#include <linux/seq_file.h>
43#include <linux/quotaops.h> 43#include <linux/quotaops.h>
44#include <linux/cleancache.h>
44 45
45#define CREATE_TRACE_POINTS 46#define CREATE_TRACE_POINTS
46#include "ocfs2_trace.h" 47#include "ocfs2_trace.h"
@@ -1566,7 +1567,7 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1566 if (osb->preferred_slot != OCFS2_INVALID_SLOT) 1567 if (osb->preferred_slot != OCFS2_INVALID_SLOT)
1567 seq_printf(s, ",preferred_slot=%d", osb->preferred_slot); 1568 seq_printf(s, ",preferred_slot=%d", osb->preferred_slot);
1568 1569
1569 if (osb->s_atime_quantum != OCFS2_DEFAULT_ATIME_QUANTUM) 1570 if (!(mnt->mnt_flags & MNT_NOATIME) && !(mnt->mnt_flags & MNT_RELATIME))
1570 seq_printf(s, ",atime_quantum=%u", osb->s_atime_quantum); 1571 seq_printf(s, ",atime_quantum=%u", osb->s_atime_quantum);
1571 1572
1572 if (osb->osb_commit_interval) 1573 if (osb->osb_commit_interval)
@@ -2352,6 +2353,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
2352 mlog_errno(status); 2353 mlog_errno(status);
2353 goto bail; 2354 goto bail;
2354 } 2355 }
2356 cleancache_init_shared_fs((char *)&uuid_net_key, sb);
2355 2357
2356bail: 2358bail:
2357 return status; 2359 return status;
diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c
index de4ff29f1e05..c368360c35a1 100644
--- a/fs/omfs/dir.c
+++ b/fs/omfs/dir.c
@@ -240,8 +240,12 @@ static int omfs_remove(struct inode *dir, struct dentry *dentry)
240 struct inode *inode = dentry->d_inode; 240 struct inode *inode = dentry->d_inode;
241 int ret; 241 int ret;
242 242
243 if (S_ISDIR(inode->i_mode) && !omfs_dir_is_empty(inode)) 243
244 return -ENOTEMPTY; 244 if (S_ISDIR(inode->i_mode)) {
245 dentry_unhash(dentry);
246 if (!omfs_dir_is_empty(inode))
247 return -ENOTEMPTY;
248 }
245 249
246 ret = omfs_delete_entry(dentry); 250 ret = omfs_delete_entry(dentry);
247 if (ret) 251 if (ret)
@@ -378,6 +382,9 @@ static int omfs_rename(struct inode *old_dir, struct dentry *old_dentry,
378 int err; 382 int err;
379 383
380 if (new_inode) { 384 if (new_inode) {
385 if (S_ISDIR(new_inode->i_mode))
386 dentry_unhash(new_dentry);
387
381 /* overwriting existing file/dir */ 388 /* overwriting existing file/dir */
382 err = omfs_remove(new_dir, new_dentry); 389 err = omfs_remove(new_dir, new_dentry);
383 if (err) 390 if (err)
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index df434c5f28fb..c1c729335924 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -20,6 +20,7 @@ proc-y += stat.o
20proc-y += uptime.o 20proc-y += uptime.o
21proc-y += version.o 21proc-y += version.o
22proc-y += softirqs.o 22proc-y += softirqs.o
23proc-y += namespaces.o
23proc-$(CONFIG_PROC_SYSCTL) += proc_sysctl.o 24proc-$(CONFIG_PROC_SYSCTL) += proc_sysctl.o
24proc-$(CONFIG_NET) += proc_net.o 25proc-$(CONFIG_NET) += proc_net.o
25proc-$(CONFIG_PROC_KCORE) += kcore.o 26proc-$(CONFIG_PROC_KCORE) += kcore.o
diff --git a/fs/proc/base.c b/fs/proc/base.c
index dfa532730e55..dc8bca72b002 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -600,7 +600,7 @@ static int proc_fd_access_allowed(struct inode *inode)
600 return allowed; 600 return allowed;
601} 601}
602 602
603static int proc_setattr(struct dentry *dentry, struct iattr *attr) 603int proc_setattr(struct dentry *dentry, struct iattr *attr)
604{ 604{
605 int error; 605 int error;
606 struct inode *inode = dentry->d_inode; 606 struct inode *inode = dentry->d_inode;
@@ -1736,8 +1736,7 @@ static int task_dumpable(struct task_struct *task)
1736 return 0; 1736 return 0;
1737} 1737}
1738 1738
1739 1739struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task)
1740static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task)
1741{ 1740{
1742 struct inode * inode; 1741 struct inode * inode;
1743 struct proc_inode *ei; 1742 struct proc_inode *ei;
@@ -1779,7 +1778,7 @@ out_unlock:
1779 return NULL; 1778 return NULL;
1780} 1779}
1781 1780
1782static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) 1781int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
1783{ 1782{
1784 struct inode *inode = dentry->d_inode; 1783 struct inode *inode = dentry->d_inode;
1785 struct task_struct *task; 1784 struct task_struct *task;
@@ -1820,7 +1819,7 @@ static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat
1820 * made this apply to all per process world readable and executable 1819 * made this apply to all per process world readable and executable
1821 * directories. 1820 * directories.
1822 */ 1821 */
1823static int pid_revalidate(struct dentry *dentry, struct nameidata *nd) 1822int pid_revalidate(struct dentry *dentry, struct nameidata *nd)
1824{ 1823{
1825 struct inode *inode; 1824 struct inode *inode;
1826 struct task_struct *task; 1825 struct task_struct *task;
@@ -1862,7 +1861,7 @@ static int pid_delete_dentry(const struct dentry * dentry)
1862 return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first; 1861 return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first;
1863} 1862}
1864 1863
1865static const struct dentry_operations pid_dentry_operations = 1864const struct dentry_operations pid_dentry_operations =
1866{ 1865{
1867 .d_revalidate = pid_revalidate, 1866 .d_revalidate = pid_revalidate,
1868 .d_delete = pid_delete_dentry, 1867 .d_delete = pid_delete_dentry,
@@ -1870,9 +1869,6 @@ static const struct dentry_operations pid_dentry_operations =
1870 1869
1871/* Lookups */ 1870/* Lookups */
1872 1871
1873typedef struct dentry *instantiate_t(struct inode *, struct dentry *,
1874 struct task_struct *, const void *);
1875
1876/* 1872/*
1877 * Fill a directory entry. 1873 * Fill a directory entry.
1878 * 1874 *
@@ -1885,8 +1881,8 @@ typedef struct dentry *instantiate_t(struct inode *, struct dentry *,
1885 * reported by readdir in sync with the inode numbers reported 1881 * reported by readdir in sync with the inode numbers reported
1886 * by stat. 1882 * by stat.
1887 */ 1883 */
1888static int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir, 1884int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
1889 char *name, int len, 1885 const char *name, int len,
1890 instantiate_t instantiate, struct task_struct *task, const void *ptr) 1886 instantiate_t instantiate, struct task_struct *task, const void *ptr)
1891{ 1887{
1892 struct dentry *child, *dir = filp->f_path.dentry; 1888 struct dentry *child, *dir = filp->f_path.dentry;
@@ -2820,6 +2816,7 @@ static const struct pid_entry tgid_base_stuff[] = {
2820 DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations), 2816 DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
2821 DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), 2817 DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
2822 DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations), 2818 DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
2819 DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
2823#ifdef CONFIG_NET 2820#ifdef CONFIG_NET
2824 DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations), 2821 DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
2825#endif 2822#endif
@@ -3168,6 +3165,7 @@ out_no_task:
3168static const struct pid_entry tid_base_stuff[] = { 3165static const struct pid_entry tid_base_stuff[] = {
3169 DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), 3166 DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
3170 DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations), 3167 DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
3168 DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
3171 REG("environ", S_IRUSR, proc_environ_operations), 3169 REG("environ", S_IRUSR, proc_environ_operations),
3172 INF("auxv", S_IRUSR, proc_pid_auxv), 3170 INF("auxv", S_IRUSR, proc_pid_auxv),
3173 ONE("status", S_IRUGO, proc_pid_status), 3171 ONE("status", S_IRUGO, proc_pid_status),
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index d15aa1b1cc8f..74b48cfa1bb2 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -28,6 +28,7 @@ static void proc_evict_inode(struct inode *inode)
28{ 28{
29 struct proc_dir_entry *de; 29 struct proc_dir_entry *de;
30 struct ctl_table_header *head; 30 struct ctl_table_header *head;
31 const struct proc_ns_operations *ns_ops;
31 32
32 truncate_inode_pages(&inode->i_data, 0); 33 truncate_inode_pages(&inode->i_data, 0);
33 end_writeback(inode); 34 end_writeback(inode);
@@ -44,6 +45,10 @@ static void proc_evict_inode(struct inode *inode)
44 rcu_assign_pointer(PROC_I(inode)->sysctl, NULL); 45 rcu_assign_pointer(PROC_I(inode)->sysctl, NULL);
45 sysctl_head_put(head); 46 sysctl_head_put(head);
46 } 47 }
48 /* Release any associated namespace */
49 ns_ops = PROC_I(inode)->ns_ops;
50 if (ns_ops && ns_ops->put)
51 ns_ops->put(PROC_I(inode)->ns);
47} 52}
48 53
49static struct kmem_cache * proc_inode_cachep; 54static struct kmem_cache * proc_inode_cachep;
@@ -62,6 +67,8 @@ static struct inode *proc_alloc_inode(struct super_block *sb)
62 ei->pde = NULL; 67 ei->pde = NULL;
63 ei->sysctl = NULL; 68 ei->sysctl = NULL;
64 ei->sysctl_entry = NULL; 69 ei->sysctl_entry = NULL;
70 ei->ns = NULL;
71 ei->ns_ops = NULL;
65 inode = &ei->vfs_inode; 72 inode = &ei->vfs_inode;
66 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 73 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
67 return inode; 74 return inode;
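
The release added to proc_evict_inode() above is the mirror image of proc_ns_instantiate() in fs/proc/namespaces.c, which takes the reference via ns_ops->get() and stores it in the proc inode. A sketch of the operations table this pairing assumes — the field names follow this era's include/linux/proc_fs.h but are reproduced from memory, so treat them as illustrative:

struct task_struct;
struct nsproxy;

struct proc_ns_operations {
	const char *name;	/* entry name under /proc/<pid>/ns */
	int type;		/* matching CLONE_NEW* flag */
	void *(*get)(struct task_struct *task);	/* counted reference, or NULL */
	void (*put)(void *ns);			/* drops the reference from ->get() */
	int (*install)(struct nsproxy *nsproxy, void *ns);
};
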
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 3763b436e69d..7838e5cfec14 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -127,3 +127,21 @@ struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
127 */ 127 */
128int proc_readdir(struct file *, void *, filldir_t); 128int proc_readdir(struct file *, void *, filldir_t);
129struct dentry *proc_lookup(struct inode *, struct dentry *, struct nameidata *); 129struct dentry *proc_lookup(struct inode *, struct dentry *, struct nameidata *);
130
131
132
133/* Lookups */
134typedef struct dentry *instantiate_t(struct inode *, struct dentry *,
135 struct task_struct *, const void *);
136int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
137 const char *name, int len,
138 instantiate_t instantiate, struct task_struct *task, const void *ptr);
139int pid_revalidate(struct dentry *dentry, struct nameidata *nd);
140struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task);
141extern const struct dentry_operations pid_dentry_operations;
142int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat);
143int proc_setattr(struct dentry *dentry, struct iattr *attr);
144
145extern const struct inode_operations proc_ns_dir_inode_operations;
146extern const struct file_operations proc_ns_dir_operations;
147
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
new file mode 100644
index 000000000000..781dec5bd682
--- /dev/null
+++ b/fs/proc/namespaces.c
@@ -0,0 +1,198 @@
1#include <linux/proc_fs.h>
2#include <linux/nsproxy.h>
3#include <linux/sched.h>
4#include <linux/ptrace.h>
5#include <linux/fs_struct.h>
6#include <linux/mount.h>
7#include <linux/path.h>
8#include <linux/namei.h>
9#include <linux/file.h>
10#include <linux/utsname.h>
11#include <net/net_namespace.h>
12#include <linux/mnt_namespace.h>
13#include <linux/ipc_namespace.h>
14#include <linux/pid_namespace.h>
15#include "internal.h"
16
17
18static const struct proc_ns_operations *ns_entries[] = {
19#ifdef CONFIG_NET_NS
20 &netns_operations,
21#endif
22#ifdef CONFIG_UTS_NS
23 &utsns_operations,
24#endif
25#ifdef CONFIG_IPC_NS
26 &ipcns_operations,
27#endif
28};
29
30static const struct file_operations ns_file_operations = {
31 .llseek = no_llseek,
32};
33
34static struct dentry *proc_ns_instantiate(struct inode *dir,
35 struct dentry *dentry, struct task_struct *task, const void *ptr)
36{
37 const struct proc_ns_operations *ns_ops = ptr;
38 struct inode *inode;
39 struct proc_inode *ei;
40 struct dentry *error = ERR_PTR(-ENOENT);
41
42 inode = proc_pid_make_inode(dir->i_sb, task);
43 if (!inode)
44 goto out;
45
46 ei = PROC_I(inode);
47 inode->i_mode = S_IFREG|S_IRUSR;
48 inode->i_fop = &ns_file_operations;
49 ei->ns_ops = ns_ops;
50 ei->ns = ns_ops->get(task);
51 if (!ei->ns)
52 goto out_iput;
53
54 dentry->d_op = &pid_dentry_operations;
55 d_add(dentry, inode);
 56 /* Close the race where the process dies before we return the dentry */
57 if (pid_revalidate(dentry, NULL))
58 error = NULL;
59out:
60 return error;
61out_iput:
62 iput(inode);
63 goto out;
64}
65
66static int proc_ns_fill_cache(struct file *filp, void *dirent,
67 filldir_t filldir, struct task_struct *task,
68 const struct proc_ns_operations *ops)
69{
70 return proc_fill_cache(filp, dirent, filldir,
71 ops->name, strlen(ops->name),
72 proc_ns_instantiate, task, ops);
73}
74
75static int proc_ns_dir_readdir(struct file *filp, void *dirent,
76 filldir_t filldir)
77{
78 int i;
79 struct dentry *dentry = filp->f_path.dentry;
80 struct inode *inode = dentry->d_inode;
81 struct task_struct *task = get_proc_task(inode);
82 const struct proc_ns_operations **entry, **last;
83 ino_t ino;
84 int ret;
85
86 ret = -ENOENT;
87 if (!task)
88 goto out_no_task;
89
90 ret = -EPERM;
91 if (!ptrace_may_access(task, PTRACE_MODE_READ))
92 goto out;
93
94 ret = 0;
95 i = filp->f_pos;
96 switch (i) {
97 case 0:
98 ino = inode->i_ino;
99 if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
100 goto out;
101 i++;
102 filp->f_pos++;
103 /* fall through */
104 case 1:
105 ino = parent_ino(dentry);
106 if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0)
107 goto out;
108 i++;
109 filp->f_pos++;
110 /* fall through */
111 default:
112 i -= 2;
113 if (i >= ARRAY_SIZE(ns_entries)) {
114 ret = 1;
115 goto out;
116 }
117 entry = ns_entries + i;
118 last = &ns_entries[ARRAY_SIZE(ns_entries) - 1];
119 while (entry <= last) {
120 if (proc_ns_fill_cache(filp, dirent, filldir,
121 task, *entry) < 0)
122 goto out;
123 filp->f_pos++;
124 entry++;
125 }
126 }
127
128 ret = 1;
129out:
130 put_task_struct(task);
131out_no_task:
132 return ret;
133}
134
135const struct file_operations proc_ns_dir_operations = {
136 .read = generic_read_dir,
137 .readdir = proc_ns_dir_readdir,
138};
139
140static struct dentry *proc_ns_dir_lookup(struct inode *dir,
141 struct dentry *dentry, struct nameidata *nd)
142{
143 struct dentry *error;
144 struct task_struct *task = get_proc_task(dir);
145 const struct proc_ns_operations **entry, **last;
146 unsigned int len = dentry->d_name.len;
147
148 error = ERR_PTR(-ENOENT);
149
150 if (!task)
151 goto out_no_task;
152
153 error = ERR_PTR(-EPERM);
154 if (!ptrace_may_access(task, PTRACE_MODE_READ))
155 goto out;
156
157 last = &ns_entries[ARRAY_SIZE(ns_entries) - 1];
158 for (entry = ns_entries; entry <= last; entry++) {
159 if (strlen((*entry)->name) != len)
160 continue;
161 if (!memcmp(dentry->d_name.name, (*entry)->name, len))
162 break;
163 }
164 error = ERR_PTR(-ENOENT);
165 if (entry > last)
166 goto out;
167
168 error = proc_ns_instantiate(dir, dentry, task, *entry);
169out:
170 put_task_struct(task);
171out_no_task:
172 return error;
173}
174
175const struct inode_operations proc_ns_dir_inode_operations = {
176 .lookup = proc_ns_dir_lookup,
177 .getattr = pid_getattr,
178 .setattr = proc_setattr,
179};
180
181struct file *proc_ns_fget(int fd)
182{
183 struct file *file;
184
185 file = fget(fd);
186 if (!file)
187 return ERR_PTR(-EBADF);
188
189 if (file->f_op != &ns_file_operations)
190 goto out_invalid;
191
192 return file;
193
194out_invalid:
195 fput(file);
196 return ERR_PTR(-EINVAL);
197}
198
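
proc_ns_fget() above is the validation hook for the companion setns() system call merged around the same time: setns() fetches the descriptor, checks that its f_op is ns_file_operations, and only then installs the referenced namespace. A hedged userspace sketch of the intended use — glibc grew a setns() wrapper only later, so a raw syscall is used, and the syscall number below is the x86-64 one:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef __NR_setns
#define __NR_setns 308	/* x86-64; adjust for other architectures */
#endif

int main(int argc, char **argv)
{
	char path[64];
	int fd;

	if (argc != 2)
		return 1;
	snprintf(path, sizeof(path), "/proc/%s/ns/uts", argv[1]);
	fd = open(path, O_RDONLY);	/* needs ptrace-read access to the task */
	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (syscall(__NR_setns, fd, 0) < 0)	/* 0 = accept any namespace type */
		perror("setns");
	close(fd);
	return 0;
}
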
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 2c9db29ea358..db15935fa757 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -211,7 +211,7 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
211{ 211{
212 struct mm_struct *mm = vma->vm_mm; 212 struct mm_struct *mm = vma->vm_mm;
213 struct file *file = vma->vm_file; 213 struct file *file = vma->vm_file;
214 int flags = vma->vm_flags; 214 vm_flags_t flags = vma->vm_flags;
215 unsigned long ino = 0; 215 unsigned long ino = 0;
216 unsigned long long pgoff = 0; 216 unsigned long long pgoff = 0;
217 unsigned long start, end; 217 unsigned long start, end;
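
The one-line type change above is not cosmetic: vm_flags_t is an unsigned long, so on 64-bit kernels an int copy would silently drop any VM flag bits at position 32 or above. A trivial standalone illustration of the truncation (assumes an LP64 system):

#include <stdio.h>

int main(void)
{
	unsigned long vm_flags = 1UL << 32;	/* hypothetical high flag bit */
	int as_int = vm_flags;			/* the old pattern: bit lost */
	unsigned long as_ulong = vm_flags;	/* the vm_flags_t pattern */

	printf("as int: %d, as unsigned long: %#lx\n", as_int, as_ulong);
	return 0;
}
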
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 118662690cdf..76c8164d5651 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -831,6 +831,8 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry)
831 INITIALIZE_PATH(path); 831 INITIALIZE_PATH(path);
832 struct reiserfs_dir_entry de; 832 struct reiserfs_dir_entry de;
833 833
834 dentry_unhash(dentry);
835
834 /* we will be doing 2 balancings and update 2 stat data, we change quotas 836 /* we will be doing 2 balancings and update 2 stat data, we change quotas
835 * of the owner of the directory and of the owner of the parent directory. 837 * of the owner of the directory and of the owner of the parent directory.
836 * The quota structure is possibly deleted only on last iput => outside 838 * The quota structure is possibly deleted only on last iput => outside
@@ -1225,6 +1227,9 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1225 unsigned long savelink = 1; 1227 unsigned long savelink = 1;
1226 struct timespec ctime; 1228 struct timespec ctime;
1227 1229
1230 if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
1231 dentry_unhash(new_dentry);
1232
1228 /* three balancings: (1) old name removal, (2) new name insertion 1233 /* three balancings: (1) old name removal, (2) new name insertion
1229 and (3) maybe "save" link insertion 1234 and (3) maybe "save" link insertion
1230 stat data updates: (1) old directory, 1235 stat data updates: (1) old directory,
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 47d2a4498b03..50f1abccd1cd 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -105,7 +105,6 @@ static int xattr_rmdir(struct inode *dir, struct dentry *dentry)
105 mutex_unlock(&dentry->d_inode->i_mutex); 105 mutex_unlock(&dentry->d_inode->i_mutex);
106 if (!error) 106 if (!error)
107 d_delete(dentry); 107 d_delete(dentry);
108 dput(dentry);
109 108
110 return error; 109 return error;
111} 110}
diff --git a/fs/super.c b/fs/super.c
index c04f7e0b7ed2..c75593953c52 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -31,6 +31,7 @@
31#include <linux/mutex.h> 31#include <linux/mutex.h>
32#include <linux/backing-dev.h> 32#include <linux/backing-dev.h>
33#include <linux/rculist_bl.h> 33#include <linux/rculist_bl.h>
34#include <linux/cleancache.h>
34#include "internal.h" 35#include "internal.h"
35 36
36 37
@@ -112,6 +113,7 @@ static struct super_block *alloc_super(struct file_system_type *type)
112 s->s_maxbytes = MAX_NON_LFS; 113 s->s_maxbytes = MAX_NON_LFS;
113 s->s_op = &default_op; 114 s->s_op = &default_op;
114 s->s_time_gran = 1000000000; 115 s->s_time_gran = 1000000000;
116 s->cleancache_poolid = -1;
115 } 117 }
116out: 118out:
117 return s; 119 return s;
@@ -177,6 +179,7 @@ void deactivate_locked_super(struct super_block *s)
177{ 179{
178 struct file_system_type *fs = s->s_type; 180 struct file_system_type *fs = s->s_type;
179 if (atomic_dec_and_test(&s->s_active)) { 181 if (atomic_dec_and_test(&s->s_active)) {
182 cleancache_flush_fs(s);
180 fs->kill_sb(s); 183 fs->kill_sb(s);
181 /* 184 /*
182 * We need to call rcu_barrier so all the delayed rcu free 185 * We need to call rcu_barrier so all the delayed rcu free
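
The two hooks above bracket a superblock's cleancache lifetime: alloc_super() starts the pool id out invalid, and cleancache_flush_fs() tears the pool down on final deactivation. How a backend plugs into this is defined by the ops table described in Documentation/vm/cleancache.txt from this same series; the sketch below reproduces its shape from memory, so the field names should be checked against that file:

/* my_* callbacks are hypothetical backend functions, not a real driver. */
static struct cleancache_ops my_backend_ops = {
	.init_fs	= my_init_fs,		/* hands out a pool id per sb */
	.init_shared_fs	= my_init_shared_fs,	/* ocfs2-style shared pools */
	.get_page	= my_get_page,		/* a miss is fine; caller reads disk */
	.put_page	= my_put_page,		/* ephemeral: backend may drop pages */
	.flush_page	= my_flush_page,
	.flush_inode	= my_flush_inode,
	.flush_fs	= my_flush_fs,		/* reached via cleancache_flush_fs() */
};

Registration goes through cleancache_register_ops(), which hands back the previously installed ops so backends can chain.
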
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index e474fbcf8bde..e2cc6756f3b1 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -196,6 +196,8 @@ static int sysv_rmdir(struct inode * dir, struct dentry * dentry)
196 struct inode *inode = dentry->d_inode; 196 struct inode *inode = dentry->d_inode;
197 int err = -ENOTEMPTY; 197 int err = -ENOTEMPTY;
198 198
199 dentry_unhash(dentry);
200
199 if (sysv_empty_dir(inode)) { 201 if (sysv_empty_dir(inode)) {
200 err = sysv_unlink(dir, dentry); 202 err = sysv_unlink(dir, dentry);
201 if (!err) { 203 if (!err) {
@@ -222,6 +224,9 @@ static int sysv_rename(struct inode * old_dir, struct dentry * old_dentry,
222 struct sysv_dir_entry * old_de; 224 struct sysv_dir_entry * old_de;
223 int err = -ENOENT; 225 int err = -ENOENT;
224 226
227 if (new_inode && S_ISDIR(new_inode->i_mode))
228 dentry_unhash(new_dentry);
229
225 old_de = sysv_find_entry(old_dentry, &old_page); 230 old_de = sysv_find_entry(old_dentry, &old_page);
226 if (!old_de) 231 if (!old_de)
227 goto out; 232 goto out;
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index ef5abd38f0bf..c2b80943560d 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -656,6 +656,8 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry)
656 struct ubifs_inode *dir_ui = ubifs_inode(dir); 656 struct ubifs_inode *dir_ui = ubifs_inode(dir);
657 struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 }; 657 struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 };
658 658
659 dentry_unhash(dentry);
660
659 /* 661 /*
660 * Budget request settings: deletion direntry, deletion inode and 662 * Budget request settings: deletion direntry, deletion inode and
661 * changing the parent inode. If budgeting fails, go ahead anyway 663 * changing the parent inode. If budgeting fails, go ahead anyway
@@ -976,6 +978,9 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
976 .dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) }; 978 .dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) };
977 struct timespec time; 979 struct timespec time;
978 980
981 if (new_inode && S_ISDIR(new_inode->i_mode))
982 dentry_unhash(new_dentry);
983
979 /* 984 /*
980 * Budget request settings: deletion direntry, new direntry, removing 985 * Budget request settings: deletion direntry, new direntry, removing
981 * the old inode, and changing old and new parent directory inodes. 986 * the old inode, and changing old and new parent directory inodes.
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index f1dce848ef96..4d76594c2a8f 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -783,6 +783,8 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry)
783 struct fileIdentDesc *fi, cfi; 783 struct fileIdentDesc *fi, cfi;
784 struct kernel_lb_addr tloc; 784 struct kernel_lb_addr tloc;
785 785
786 dentry_unhash(dentry);
787
786 retval = -ENOENT; 788 retval = -ENOENT;
787 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); 789 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
788 if (!fi) 790 if (!fi)
@@ -1081,6 +1083,9 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
1081 struct kernel_lb_addr tloc; 1083 struct kernel_lb_addr tloc;
1082 struct udf_inode_info *old_iinfo = UDF_I(old_inode); 1084 struct udf_inode_info *old_iinfo = UDF_I(old_inode);
1083 1085
1086 if (new_inode && S_ISDIR(new_inode->i_mode))
1087 dentry_unhash(new_dentry);
1088
1084 ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi); 1089 ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi);
1085 if (ofi) { 1090 if (ofi) {
1086 if (ofibh.sbh != ofibh.ebh) 1091 if (ofibh.sbh != ofibh.ebh)
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 29309e25417f..953ebdfc5bf7 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -258,6 +258,8 @@ static int ufs_rmdir (struct inode * dir, struct dentry *dentry)
258 struct inode * inode = dentry->d_inode; 258 struct inode * inode = dentry->d_inode;
259 int err= -ENOTEMPTY; 259 int err= -ENOTEMPTY;
260 260
261 dentry_unhash(dentry);
262
261 lock_ufs(dir->i_sb); 263 lock_ufs(dir->i_sb);
262 if (ufs_empty_dir (inode)) { 264 if (ufs_empty_dir (inode)) {
263 err = ufs_unlink(dir, dentry); 265 err = ufs_unlink(dir, dentry);
@@ -282,6 +284,9 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
282 struct ufs_dir_entry *old_de; 284 struct ufs_dir_entry *old_de;
283 int err = -ENOENT; 285 int err = -ENOENT;
284 286
287 if (new_inode && S_ISDIR(new_inode->i_mode))
288 dentry_unhash(new_dentry);
289
285 old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_page); 290 old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_page);
286 if (!old_de) 291 if (!old_de)
287 goto out; 292 goto out;
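
The same two-part idiom recurs across this series' namei changes (reiserfs, sysv, ubifs, udf, ufs above, and others earlier in the diff): rmdir unhashes its own dentry unconditionally, while rename unhashes the target only when it is about to replace a directory. Condensed into one place, with hypothetical foofs_* names standing in for the per-filesystem callbacks:

/* Sketch only: foofs_* are stand-ins and the bodies elide the real work. */
static int foofs_rmdir(struct inode *dir, struct dentry *dentry)
{
	dentry_unhash(dentry);		/* always, before tearing the dir down */
	/* ... per-filesystem emptiness check and removal ... */
	return 0;
}

static int foofs_rename(struct inode *old_dir, struct dentry *old_dentry,
			struct inode *new_dir, struct dentry *new_dentry)
{
	struct inode *new_inode = new_dentry->d_inode;

	if (new_inode && S_ISDIR(new_inode->i_mode))
		dentry_unhash(new_dentry);	/* only when replacing a directory */
	/* ... per-filesystem rename ... */
	return 0;
}
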
diff --git a/fs/xfs/linux-2.6/xfs_discard.c b/fs/xfs/linux-2.6/xfs_discard.c
index d61611c88012..244e797dae32 100644
--- a/fs/xfs/linux-2.6/xfs_discard.c
+++ b/fs/xfs/linux-2.6/xfs_discard.c
@@ -191,3 +191,32 @@ xfs_ioc_trim(
191 return -XFS_ERROR(EFAULT); 191 return -XFS_ERROR(EFAULT);
192 return 0; 192 return 0;
193} 193}
194
195int
196xfs_discard_extents(
197 struct xfs_mount *mp,
198 struct list_head *list)
199{
200 struct xfs_busy_extent *busyp;
201 int error = 0;
202
203 list_for_each_entry(busyp, list, list) {
204 trace_xfs_discard_extent(mp, busyp->agno, busyp->bno,
205 busyp->length);
206
207 error = -blkdev_issue_discard(mp->m_ddev_targp->bt_bdev,
208 XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno),
209 XFS_FSB_TO_BB(mp, busyp->length),
210 GFP_NOFS, 0);
211 if (error && error != EOPNOTSUPP) {
212 xfs_info(mp,
213 "discard failed for extent [0x%llu,%u], error %d",
214 (unsigned long long)busyp->bno,
215 busyp->length,
216 error);
217 return error;
218 }
219 }
220
221 return 0;
222}
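
xfs_discard_extents() above is the online-discard path, driven from log I/O completion; xfs_ioc_trim() at the top of this file is the batched counterpart behind the FITRIM ioctl. A minimal FITRIM caller, runnable against any filesystem that implements the ioctl:

#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>		/* FITRIM, struct fstrim_range */

int main(int argc, char **argv)
{
	struct fstrim_range range;
	int fd;

	if (argc != 2)
		return 1;
	fd = open(argv[1], O_RDONLY);	/* any path on the target filesystem */
	if (fd < 0) {
		perror("open");
		return 1;
	}
	memset(&range, 0, sizeof(range));
	range.len = ~0ULL;		/* trim the whole filesystem */
	if (ioctl(fd, FITRIM, &range) < 0)
		perror("FITRIM");
	else
		printf("trimmed %llu bytes\n", (unsigned long long)range.len);
	close(fd);
	return 0;
}
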
diff --git a/fs/xfs/linux-2.6/xfs_discard.h b/fs/xfs/linux-2.6/xfs_discard.h
index e82b6dd3e127..344879aea646 100644
--- a/fs/xfs/linux-2.6/xfs_discard.h
+++ b/fs/xfs/linux-2.6/xfs_discard.h
@@ -2,7 +2,9 @@
2#define XFS_DISCARD_H 1 2#define XFS_DISCARD_H 1
3 3
4struct fstrim_range; 4struct fstrim_range;
5struct list_head;
5 6
6extern int xfs_ioc_trim(struct xfs_mount *, struct fstrim_range __user *); 7extern int xfs_ioc_trim(struct xfs_mount *, struct fstrim_range __user *);
8extern int xfs_discard_extents(struct xfs_mount *, struct list_head *);
7 9
8#endif /* XFS_DISCARD_H */ 10#endif /* XFS_DISCARD_H */
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index b0aa59e51fd0..98b9c91fcdf1 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -110,8 +110,10 @@ mempool_t *xfs_ioend_pool;
110#define MNTOPT_GQUOTANOENF "gqnoenforce"/* group quota limit enforcement */ 110#define MNTOPT_GQUOTANOENF "gqnoenforce"/* group quota limit enforcement */
111#define MNTOPT_PQUOTANOENF "pqnoenforce"/* project quota limit enforcement */ 111#define MNTOPT_PQUOTANOENF "pqnoenforce"/* project quota limit enforcement */
112#define MNTOPT_QUOTANOENF "qnoenforce" /* same as uqnoenforce */ 112#define MNTOPT_QUOTANOENF "qnoenforce" /* same as uqnoenforce */
113#define MNTOPT_DELAYLOG "delaylog" /* Delayed loging enabled */ 113#define MNTOPT_DELAYLOG "delaylog" /* Delayed logging enabled */
114#define MNTOPT_NODELAYLOG "nodelaylog" /* Delayed loging disabled */ 114#define MNTOPT_NODELAYLOG "nodelaylog" /* Delayed logging disabled */
115#define MNTOPT_DISCARD "discard" /* Discard unused blocks */
116#define MNTOPT_NODISCARD "nodiscard" /* Do not discard unused blocks */
115 117
116/* 118/*
117 * Table driven mount option parser. 119 * Table driven mount option parser.
@@ -355,6 +357,10 @@ xfs_parseargs(
355 mp->m_flags |= XFS_MOUNT_DELAYLOG; 357 mp->m_flags |= XFS_MOUNT_DELAYLOG;
356 } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) { 358 } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) {
357 mp->m_flags &= ~XFS_MOUNT_DELAYLOG; 359 mp->m_flags &= ~XFS_MOUNT_DELAYLOG;
360 } else if (!strcmp(this_char, MNTOPT_DISCARD)) {
361 mp->m_flags |= XFS_MOUNT_DISCARD;
362 } else if (!strcmp(this_char, MNTOPT_NODISCARD)) {
363 mp->m_flags &= ~XFS_MOUNT_DISCARD;
358 } else if (!strcmp(this_char, "ihashsize")) { 364 } else if (!strcmp(this_char, "ihashsize")) {
359 xfs_warn(mp, 365 xfs_warn(mp,
360 "ihashsize no longer used, option is deprecated."); 366 "ihashsize no longer used, option is deprecated.");
@@ -388,6 +394,13 @@ xfs_parseargs(
388 return EINVAL; 394 return EINVAL;
389 } 395 }
390 396
397 if ((mp->m_flags & XFS_MOUNT_DISCARD) &&
398 !(mp->m_flags & XFS_MOUNT_DELAYLOG)) {
399 xfs_warn(mp,
400 "the discard option is incompatible with the nodelaylog option");
401 return EINVAL;
402 }
403
391#ifndef CONFIG_XFS_QUOTA 404#ifndef CONFIG_XFS_QUOTA
392 if (XFS_IS_QUOTA_RUNNING(mp)) { 405 if (XFS_IS_QUOTA_RUNNING(mp)) {
393 xfs_warn(mp, "quota support not available in this kernel."); 406 xfs_warn(mp, "quota support not available in this kernel.");
@@ -488,6 +501,7 @@ xfs_showargs(
488 { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM }, 501 { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM },
489 { XFS_MOUNT_GRPID, "," MNTOPT_GRPID }, 502 { XFS_MOUNT_GRPID, "," MNTOPT_GRPID },
490 { XFS_MOUNT_DELAYLOG, "," MNTOPT_DELAYLOG }, 503 { XFS_MOUNT_DELAYLOG, "," MNTOPT_DELAYLOG },
504 { XFS_MOUNT_DISCARD, "," MNTOPT_DISCARD },
491 { 0, NULL } 505 { 0, NULL }
492 }; 506 };
493 static struct proc_xfs_info xfs_info_unset[] = { 507 static struct proc_xfs_info xfs_info_unset[] = {
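
With the parsing above, online discard becomes an ordinary mount option, valid (per the new check in xfs_parseargs()) only together with delayed logging. Enabling it from C rather than mount(8) — device and mount point below are placeholders:

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	if (mount("/dev/sdb1", "/mnt/scratch", "xfs", 0, "discard") < 0) {
		perror("mount");
		return 1;
	}
	return 0;
}
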
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index da0a561ffba2..6530769a999b 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -187,6 +187,9 @@ struct xfs_busy_extent {
187 xfs_agnumber_t agno; 187 xfs_agnumber_t agno;
188 xfs_agblock_t bno; 188 xfs_agblock_t bno;
189 xfs_extlen_t length; 189 xfs_extlen_t length;
190 unsigned int flags;
191#define XFS_ALLOC_BUSY_DISCARDED 0x01 /* undergoing a discard op. */
192#define XFS_ALLOC_BUSY_SKIP_DISCARD 0x02 /* do not discard */
190}; 193};
191 194
192/* 195/*
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index acdced86413c..95862bbff56b 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -2469,7 +2469,7 @@ xfs_free_extent(
2469 2469
2470 error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0); 2470 error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0);
2471 if (!error) 2471 if (!error)
2472 xfs_alloc_busy_insert(tp, args.agno, args.agbno, len); 2472 xfs_alloc_busy_insert(tp, args.agno, args.agbno, len, 0);
2473error0: 2473error0:
2474 xfs_perag_put(args.pag); 2474 xfs_perag_put(args.pag);
2475 return error; 2475 return error;
@@ -2480,7 +2480,8 @@ xfs_alloc_busy_insert(
2480 struct xfs_trans *tp, 2480 struct xfs_trans *tp,
2481 xfs_agnumber_t agno, 2481 xfs_agnumber_t agno,
2482 xfs_agblock_t bno, 2482 xfs_agblock_t bno,
2483 xfs_extlen_t len) 2483 xfs_extlen_t len,
2484 unsigned int flags)
2484{ 2485{
2485 struct xfs_busy_extent *new; 2486 struct xfs_busy_extent *new;
2486 struct xfs_busy_extent *busyp; 2487 struct xfs_busy_extent *busyp;
@@ -2504,6 +2505,7 @@ xfs_alloc_busy_insert(
2504 new->bno = bno; 2505 new->bno = bno;
2505 new->length = len; 2506 new->length = len;
2506 INIT_LIST_HEAD(&new->list); 2507 INIT_LIST_HEAD(&new->list);
2508 new->flags = flags;
2507 2509
2508 /* trace before insert to be able to see failed inserts */ 2510 /* trace before insert to be able to see failed inserts */
2509 trace_xfs_alloc_busy(tp->t_mountp, agno, bno, len); 2511 trace_xfs_alloc_busy(tp->t_mountp, agno, bno, len);
@@ -2609,6 +2611,18 @@ xfs_alloc_busy_update_extent(
2609 xfs_agblock_t bend = bbno + busyp->length; 2611 xfs_agblock_t bend = bbno + busyp->length;
2610 2612
2611 /* 2613 /*
2614 * This extent is currently being discarded. Give the thread
2615 * performing the discard a chance to mark the extent unbusy
2616 * and retry.
2617 */
2618 if (busyp->flags & XFS_ALLOC_BUSY_DISCARDED) {
2619 spin_unlock(&pag->pagb_lock);
2620 delay(1);
2621 spin_lock(&pag->pagb_lock);
2622 return false;
2623 }
2624
2625 /*
2612 * If there is a busy extent overlapping a user allocation, we have 2626 * If there is a busy extent overlapping a user allocation, we have
2613 * no choice but to force the log and retry the search. 2627 * no choice but to force the log and retry the search.
2614 * 2628 *
@@ -2813,7 +2827,8 @@ restart:
2813 * If this is a metadata allocation, try to reuse the busy 2827 * If this is a metadata allocation, try to reuse the busy
2814 * extent instead of trimming the allocation. 2828 * extent instead of trimming the allocation.
2815 */ 2829 */
2816 if (!args->userdata) { 2830 if (!args->userdata &&
2831 !(busyp->flags & XFS_ALLOC_BUSY_DISCARDED)) {
2817 if (!xfs_alloc_busy_update_extent(args->mp, args->pag, 2832 if (!xfs_alloc_busy_update_extent(args->mp, args->pag,
2818 busyp, fbno, flen, 2833 busyp, fbno, flen,
2819 false)) 2834 false))
@@ -2979,10 +2994,16 @@ xfs_alloc_busy_clear_one(
2979 kmem_free(busyp); 2994 kmem_free(busyp);
2980} 2995}
2981 2996
2997/*
 2998 * Remove all extents on the passed-in list from the busy extents tree.
 2999 * If do_discard is set, skip extents that need to be discarded and mark
 3000 * them as undergoing a discard operation instead.
3001 */
2982void 3002void
2983xfs_alloc_busy_clear( 3003xfs_alloc_busy_clear(
2984 struct xfs_mount *mp, 3004 struct xfs_mount *mp,
2985 struct list_head *list) 3005 struct list_head *list,
3006 bool do_discard)
2986{ 3007{
2987 struct xfs_busy_extent *busyp, *n; 3008 struct xfs_busy_extent *busyp, *n;
2988 struct xfs_perag *pag = NULL; 3009 struct xfs_perag *pag = NULL;
@@ -2999,7 +3020,11 @@ xfs_alloc_busy_clear(
2999 agno = busyp->agno; 3020 agno = busyp->agno;
3000 } 3021 }
3001 3022
3002 xfs_alloc_busy_clear_one(mp, pag, busyp); 3023 if (do_discard && busyp->length &&
3024 !(busyp->flags & XFS_ALLOC_BUSY_SKIP_DISCARD))
3025 busyp->flags = XFS_ALLOC_BUSY_DISCARDED;
3026 else
3027 xfs_alloc_busy_clear_one(mp, pag, busyp);
3003 } 3028 }
3004 3029
3005 if (pag) { 3030 if (pag) {
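
Taken together, the changes above implement a handoff: xfs_alloc_busy_clear() called with do_discard set leaves qualifying extents in the tree flagged XFS_ALLOC_BUSY_DISCARDED, xfs_alloc_busy_update_extent() makes the allocator back off such extents, and the discard issuer removes them in a second pass. The caller-side sequence, paraphrased from this series' xfs_log_cil.c change rather than quoted from it:

/* Log I/O completion, paraphrased; ctx->busy_extents is the CIL's busy list. */
xfs_alloc_busy_clear(mp, &ctx->busy_extents,
		     (mp->m_flags & XFS_MOUNT_DISCARD) && !abort);
if (!list_empty(&ctx->busy_extents)) {
	/* whatever survived the first pass is now marked BUSY_DISCARDED */
	xfs_discard_extents(mp, &ctx->busy_extents);
	/* the second pass actually frees the busy-tree entries */
	xfs_alloc_busy_clear(mp, &ctx->busy_extents, false);
}
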
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 240ad288f2f9..2f52b924be79 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -137,10 +137,11 @@ xfs_alloc_longest_free_extent(struct xfs_mount *mp,
137#ifdef __KERNEL__ 137#ifdef __KERNEL__
138void 138void
139xfs_alloc_busy_insert(struct xfs_trans *tp, xfs_agnumber_t agno, 139xfs_alloc_busy_insert(struct xfs_trans *tp, xfs_agnumber_t agno,
140 xfs_agblock_t bno, xfs_extlen_t len); 140 xfs_agblock_t bno, xfs_extlen_t len, unsigned int flags);
141 141
142void 142void
143xfs_alloc_busy_clear(struct xfs_mount *mp, struct list_head *list); 143xfs_alloc_busy_clear(struct xfs_mount *mp, struct list_head *list,
144 bool do_discard);
144 145
145int 146int
146xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno, 147xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno,
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index 8b469d53599f..2b3518826a69 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -120,7 +120,8 @@ xfs_allocbt_free_block(
120 if (error) 120 if (error)
121 return error; 121 return error;
122 122
123 xfs_alloc_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1); 123 xfs_alloc_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1,
124 XFS_ALLOC_BUSY_SKIP_DISCARD);
124 xfs_trans_agbtree_delta(cur->bc_tp, -1); 125 xfs_trans_agbtree_delta(cur->bc_tp, -1);
125 return 0; 126 return 0;
126} 127}
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index fa00788de2f5..e546a33214c9 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -89,36 +89,19 @@ xfs_bmap_add_attrfork_local(
89 int *flags); /* inode logging flags */ 89 int *flags); /* inode logging flags */
90 90
91/* 91/*
92 * Called by xfs_bmapi to update file extent records and the btree
93 * after allocating space (or doing a delayed allocation).
94 */
95STATIC int /* error */
96xfs_bmap_add_extent(
97 xfs_inode_t *ip, /* incore inode pointer */
98 xfs_extnum_t idx, /* extent number to update/insert */
99 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
100 xfs_bmbt_irec_t *new, /* new data to add to file extents */
101 xfs_fsblock_t *first, /* pointer to firstblock variable */
102 xfs_bmap_free_t *flist, /* list of extents to be freed */
103 int *logflagsp, /* inode logging flags */
104 int whichfork, /* data or attr fork */
105 int rsvd); /* OK to allocate reserved blocks */
106
107/*
108 * Called by xfs_bmap_add_extent to handle cases converting a delayed 92 * Called by xfs_bmap_add_extent to handle cases converting a delayed
109 * allocation to a real allocation. 93 * allocation to a real allocation.
110 */ 94 */
111STATIC int /* error */ 95STATIC int /* error */
112xfs_bmap_add_extent_delay_real( 96xfs_bmap_add_extent_delay_real(
113 xfs_inode_t *ip, /* incore inode pointer */ 97 xfs_inode_t *ip, /* incore inode pointer */
114 xfs_extnum_t idx, /* extent number to update/insert */ 98 xfs_extnum_t *idx, /* extent number to update/insert */
115 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ 99 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
116 xfs_bmbt_irec_t *new, /* new data to add to file extents */ 100 xfs_bmbt_irec_t *new, /* new data to add to file extents */
117 xfs_filblks_t *dnew, /* new delayed-alloc indirect blocks */ 101 xfs_filblks_t *dnew, /* new delayed-alloc indirect blocks */
118 xfs_fsblock_t *first, /* pointer to firstblock variable */ 102 xfs_fsblock_t *first, /* pointer to firstblock variable */
119 xfs_bmap_free_t *flist, /* list of extents to be freed */ 103 xfs_bmap_free_t *flist, /* list of extents to be freed */
120 int *logflagsp, /* inode logging flags */ 104 int *logflagsp); /* inode logging flags */
121 int rsvd); /* OK to allocate reserved blocks */
122 105
123/* 106/*
124 * Called by xfs_bmap_add_extent to handle cases converting a hole 107 * Called by xfs_bmap_add_extent to handle cases converting a hole
@@ -127,10 +110,9 @@ xfs_bmap_add_extent_delay_real(
127STATIC int /* error */ 110STATIC int /* error */
128xfs_bmap_add_extent_hole_delay( 111xfs_bmap_add_extent_hole_delay(
129 xfs_inode_t *ip, /* incore inode pointer */ 112 xfs_inode_t *ip, /* incore inode pointer */
130 xfs_extnum_t idx, /* extent number to update/insert */ 113 xfs_extnum_t *idx, /* extent number to update/insert */
131 xfs_bmbt_irec_t *new, /* new data to add to file extents */ 114 xfs_bmbt_irec_t *new, /* new data to add to file extents */
132 int *logflagsp,/* inode logging flags */ 115 int *logflagsp); /* inode logging flags */
133 int rsvd); /* OK to allocate reserved blocks */
134 116
135/* 117/*
136 * Called by xfs_bmap_add_extent to handle cases converting a hole 118 * Called by xfs_bmap_add_extent to handle cases converting a hole
@@ -139,7 +121,7 @@ xfs_bmap_add_extent_hole_delay(
139STATIC int /* error */ 121STATIC int /* error */
140xfs_bmap_add_extent_hole_real( 122xfs_bmap_add_extent_hole_real(
141 xfs_inode_t *ip, /* incore inode pointer */ 123 xfs_inode_t *ip, /* incore inode pointer */
142 xfs_extnum_t idx, /* extent number to update/insert */ 124 xfs_extnum_t *idx, /* extent number to update/insert */
143 xfs_btree_cur_t *cur, /* if null, not a btree */ 125 xfs_btree_cur_t *cur, /* if null, not a btree */
144 xfs_bmbt_irec_t *new, /* new data to add to file extents */ 126 xfs_bmbt_irec_t *new, /* new data to add to file extents */
145 int *logflagsp, /* inode logging flags */ 127 int *logflagsp, /* inode logging flags */
@@ -152,7 +134,7 @@ xfs_bmap_add_extent_hole_real(
152STATIC int /* error */ 134STATIC int /* error */
153xfs_bmap_add_extent_unwritten_real( 135xfs_bmap_add_extent_unwritten_real(
154 xfs_inode_t *ip, /* incore inode pointer */ 136 xfs_inode_t *ip, /* incore inode pointer */
155 xfs_extnum_t idx, /* extent number to update/insert */ 137 xfs_extnum_t *idx, /* extent number to update/insert */
156 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ 138 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
157 xfs_bmbt_irec_t *new, /* new data to add to file extents */ 139 xfs_bmbt_irec_t *new, /* new data to add to file extents */
158 int *logflagsp); /* inode logging flags */ 140 int *logflagsp); /* inode logging flags */
@@ -180,22 +162,6 @@ xfs_bmap_btree_to_extents(
180 int whichfork); /* data or attr fork */ 162 int whichfork); /* data or attr fork */
181 163
182/* 164/*
183 * Called by xfs_bmapi to update file extent records and the btree
184 * after removing space (or undoing a delayed allocation).
185 */
186STATIC int /* error */
187xfs_bmap_del_extent(
188 xfs_inode_t *ip, /* incore inode pointer */
189 xfs_trans_t *tp, /* current trans pointer */
190 xfs_extnum_t idx, /* extent number to update/insert */
191 xfs_bmap_free_t *flist, /* list of extents to be freed */
192 xfs_btree_cur_t *cur, /* if null, not a btree */
193 xfs_bmbt_irec_t *new, /* new data to add to file extents */
194 int *logflagsp,/* inode logging flags */
195 int whichfork, /* data or attr fork */
196 int rsvd); /* OK to allocate reserved blocks */
197
198/*
199 * Remove the entry "free" from the free item list. Prev points to the 165 * Remove the entry "free" from the free item list. Prev points to the
200 * previous entry, unless "free" is the head of the list. 166 * previous entry, unless "free" is the head of the list.
201 */ 167 */
@@ -474,14 +440,13 @@ xfs_bmap_add_attrfork_local(
474STATIC int /* error */ 440STATIC int /* error */
475xfs_bmap_add_extent( 441xfs_bmap_add_extent(
476 xfs_inode_t *ip, /* incore inode pointer */ 442 xfs_inode_t *ip, /* incore inode pointer */
477 xfs_extnum_t idx, /* extent number to update/insert */ 443 xfs_extnum_t *idx, /* extent number to update/insert */
478 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ 444 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
479 xfs_bmbt_irec_t *new, /* new data to add to file extents */ 445 xfs_bmbt_irec_t *new, /* new data to add to file extents */
480 xfs_fsblock_t *first, /* pointer to firstblock variable */ 446 xfs_fsblock_t *first, /* pointer to firstblock variable */
481 xfs_bmap_free_t *flist, /* list of extents to be freed */ 447 xfs_bmap_free_t *flist, /* list of extents to be freed */
482 int *logflagsp, /* inode logging flags */ 448 int *logflagsp, /* inode logging flags */
483 int whichfork, /* data or attr fork */ 449 int whichfork) /* data or attr fork */
484 int rsvd) /* OK to use reserved data blocks */
485{ 450{
486 xfs_btree_cur_t *cur; /* btree cursor or null */ 451 xfs_btree_cur_t *cur; /* btree cursor or null */
487 xfs_filblks_t da_new; /* new count del alloc blocks used */ 452 xfs_filblks_t da_new; /* new count del alloc blocks used */
@@ -492,23 +457,27 @@ xfs_bmap_add_extent(
492 xfs_extnum_t nextents; /* number of extents in file now */ 457 xfs_extnum_t nextents; /* number of extents in file now */
493 458
494 XFS_STATS_INC(xs_add_exlist); 459 XFS_STATS_INC(xs_add_exlist);
460
495 cur = *curp; 461 cur = *curp;
496 ifp = XFS_IFORK_PTR(ip, whichfork); 462 ifp = XFS_IFORK_PTR(ip, whichfork);
497 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 463 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
498 ASSERT(idx <= nextents);
499 da_old = da_new = 0; 464 da_old = da_new = 0;
500 error = 0; 465 error = 0;
466
467 ASSERT(*idx >= 0);
468 ASSERT(*idx <= nextents);
469
501 /* 470 /*
502 * This is the first extent added to a new/empty file. 471 * This is the first extent added to a new/empty file.
503 * Special case this one, so other routines get to assume there are 472 * Special case this one, so other routines get to assume there are
504 * already extents in the list. 473 * already extents in the list.
505 */ 474 */
506 if (nextents == 0) { 475 if (nextents == 0) {
507 xfs_iext_insert(ip, 0, 1, new, 476 xfs_iext_insert(ip, *idx, 1, new,
508 whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0); 477 whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0);
509 478
510 ASSERT(cur == NULL); 479 ASSERT(cur == NULL);
511 ifp->if_lastex = 0; 480
512 if (!isnullstartblock(new->br_startblock)) { 481 if (!isnullstartblock(new->br_startblock)) {
513 XFS_IFORK_NEXT_SET(ip, whichfork, 1); 482 XFS_IFORK_NEXT_SET(ip, whichfork, 1);
514 logflags = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); 483 logflags = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
@@ -522,27 +491,25 @@ xfs_bmap_add_extent(
522 if (cur) 491 if (cur)
523 ASSERT((cur->bc_private.b.flags & 492 ASSERT((cur->bc_private.b.flags &
524 XFS_BTCUR_BPRV_WASDEL) == 0); 493 XFS_BTCUR_BPRV_WASDEL) == 0);
525 if ((error = xfs_bmap_add_extent_hole_delay(ip, idx, new, 494 error = xfs_bmap_add_extent_hole_delay(ip, idx, new,
526 &logflags, rsvd))) 495 &logflags);
527 goto done;
528 } 496 }
529 /* 497 /*
530 * Real allocation off the end of the file. 498 * Real allocation off the end of the file.
531 */ 499 */
532 else if (idx == nextents) { 500 else if (*idx == nextents) {
533 if (cur) 501 if (cur)
534 ASSERT((cur->bc_private.b.flags & 502 ASSERT((cur->bc_private.b.flags &
535 XFS_BTCUR_BPRV_WASDEL) == 0); 503 XFS_BTCUR_BPRV_WASDEL) == 0);
536 if ((error = xfs_bmap_add_extent_hole_real(ip, idx, cur, new, 504 error = xfs_bmap_add_extent_hole_real(ip, idx, cur, new,
537 &logflags, whichfork))) 505 &logflags, whichfork);
538 goto done;
539 } else { 506 } else {
540 xfs_bmbt_irec_t prev; /* old extent at offset idx */ 507 xfs_bmbt_irec_t prev; /* old extent at offset idx */
541 508
542 /* 509 /*
543 * Get the record referred to by idx. 510 * Get the record referred to by idx.
544 */ 511 */
545 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &prev); 512 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &prev);
546 /* 513 /*
547 * If it's a real allocation record, and the new allocation ends 514 * If it's a real allocation record, and the new allocation ends
548 * after the start of the referred to record, then we're filling 515 * after the start of the referred to record, then we're filling
@@ -557,22 +524,18 @@ xfs_bmap_add_extent(
557 if (cur) 524 if (cur)
558 ASSERT(cur->bc_private.b.flags & 525 ASSERT(cur->bc_private.b.flags &
559 XFS_BTCUR_BPRV_WASDEL); 526 XFS_BTCUR_BPRV_WASDEL);
560 if ((error = xfs_bmap_add_extent_delay_real(ip, 527 error = xfs_bmap_add_extent_delay_real(ip,
561 idx, &cur, new, &da_new, first, flist, 528 idx, &cur, new, &da_new,
562 &logflags, rsvd))) 529 first, flist, &logflags);
563 goto done;
564 } else if (new->br_state == XFS_EXT_NORM) {
565 ASSERT(new->br_state == XFS_EXT_NORM);
566 if ((error = xfs_bmap_add_extent_unwritten_real(
567 ip, idx, &cur, new, &logflags)))
568 goto done;
569 } else { 530 } else {
570 ASSERT(new->br_state == XFS_EXT_UNWRITTEN); 531 ASSERT(new->br_state == XFS_EXT_NORM ||
571 if ((error = xfs_bmap_add_extent_unwritten_real( 532 new->br_state == XFS_EXT_UNWRITTEN);
572 ip, idx, &cur, new, &logflags))) 533
534 error = xfs_bmap_add_extent_unwritten_real(ip,
535 idx, &cur, new, &logflags);
536 if (error)
573 goto done; 537 goto done;
574 } 538 }
575 ASSERT(*curp == cur || *curp == NULL);
576 } 539 }
577 /* 540 /*
578 * Otherwise we're filling in a hole with an allocation. 541 * Otherwise we're filling in a hole with an allocation.
@@ -581,13 +544,15 @@ xfs_bmap_add_extent(
581 if (cur) 544 if (cur)
582 ASSERT((cur->bc_private.b.flags & 545 ASSERT((cur->bc_private.b.flags &
583 XFS_BTCUR_BPRV_WASDEL) == 0); 546 XFS_BTCUR_BPRV_WASDEL) == 0);
584 if ((error = xfs_bmap_add_extent_hole_real(ip, idx, cur, 547 error = xfs_bmap_add_extent_hole_real(ip, idx, cur,
585 new, &logflags, whichfork))) 548 new, &logflags, whichfork);
586 goto done;
587 } 549 }
588 } 550 }
589 551
552 if (error)
553 goto done;
590 ASSERT(*curp == cur || *curp == NULL); 554 ASSERT(*curp == cur || *curp == NULL);
555
591 /* 556 /*
592 * Convert to a btree if necessary. 557 * Convert to a btree if necessary.
593 */ 558 */
@@ -615,7 +580,7 @@ xfs_bmap_add_extent(
615 ASSERT(nblks <= da_old); 580 ASSERT(nblks <= da_old);
616 if (nblks < da_old) 581 if (nblks < da_old)
617 xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS, 582 xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS,
618 (int64_t)(da_old - nblks), rsvd); 583 (int64_t)(da_old - nblks), 0);
619 } 584 }
620 /* 585 /*
621 * Clear out the allocated field, done with it now in any case. 586 * Clear out the allocated field, done with it now in any case.
@@ -640,14 +605,13 @@ done:
640STATIC int /* error */ 605STATIC int /* error */
641xfs_bmap_add_extent_delay_real( 606xfs_bmap_add_extent_delay_real(
642 xfs_inode_t *ip, /* incore inode pointer */ 607 xfs_inode_t *ip, /* incore inode pointer */
643 xfs_extnum_t idx, /* extent number to update/insert */ 608 xfs_extnum_t *idx, /* extent number to update/insert */
644 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ 609 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
645 xfs_bmbt_irec_t *new, /* new data to add to file extents */ 610 xfs_bmbt_irec_t *new, /* new data to add to file extents */
646 xfs_filblks_t *dnew, /* new delayed-alloc indirect blocks */ 611 xfs_filblks_t *dnew, /* new delayed-alloc indirect blocks */
647 xfs_fsblock_t *first, /* pointer to firstblock variable */ 612 xfs_fsblock_t *first, /* pointer to firstblock variable */
648 xfs_bmap_free_t *flist, /* list of extents to be freed */ 613 xfs_bmap_free_t *flist, /* list of extents to be freed */
649 int *logflagsp, /* inode logging flags */ 614 int *logflagsp) /* inode logging flags */
650 int rsvd) /* OK to use reserved data block allocation */
651{ 615{
652 xfs_btree_cur_t *cur; /* btree cursor */ 616 xfs_btree_cur_t *cur; /* btree cursor */
653 int diff; /* temp value */ 617 int diff; /* temp value */
@@ -673,7 +637,7 @@ xfs_bmap_add_extent_delay_real(
673 */ 637 */
674 cur = *curp; 638 cur = *curp;
675 ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); 639 ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
676 ep = xfs_iext_get_ext(ifp, idx); 640 ep = xfs_iext_get_ext(ifp, *idx);
677 xfs_bmbt_get_all(ep, &PREV); 641 xfs_bmbt_get_all(ep, &PREV);
678 new_endoff = new->br_startoff + new->br_blockcount; 642 new_endoff = new->br_startoff + new->br_blockcount;
679 ASSERT(PREV.br_startoff <= new->br_startoff); 643 ASSERT(PREV.br_startoff <= new->br_startoff);
@@ -692,9 +656,9 @@ xfs_bmap_add_extent_delay_real(
692 * Check and set flags if this segment has a left neighbor. 656 * Check and set flags if this segment has a left neighbor.
693 * Don't set contiguous if the combined extent would be too large. 657 * Don't set contiguous if the combined extent would be too large.
694 */ 658 */
695 if (idx > 0) { 659 if (*idx > 0) {
696 state |= BMAP_LEFT_VALID; 660 state |= BMAP_LEFT_VALID;
697 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &LEFT); 661 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &LEFT);
698 662
699 if (isnullstartblock(LEFT.br_startblock)) 663 if (isnullstartblock(LEFT.br_startblock))
700 state |= BMAP_LEFT_DELAY; 664 state |= BMAP_LEFT_DELAY;
@@ -712,9 +676,9 @@ xfs_bmap_add_extent_delay_real(
712 * Don't set contiguous if the combined extent would be too large. 676 * Don't set contiguous if the combined extent would be too large.
713 * Also check for all-three-contiguous being too large. 677 * Also check for all-three-contiguous being too large.
714 */ 678 */
715 if (idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) { 679 if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) {
716 state |= BMAP_RIGHT_VALID; 680 state |= BMAP_RIGHT_VALID;
717 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx + 1), &RIGHT); 681 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT);
718 682
719 if (isnullstartblock(RIGHT.br_startblock)) 683 if (isnullstartblock(RIGHT.br_startblock))
720 state |= BMAP_RIGHT_DELAY; 684 state |= BMAP_RIGHT_DELAY;
@@ -745,14 +709,14 @@ xfs_bmap_add_extent_delay_real(
745 * Filling in all of a previously delayed allocation extent. 709 * Filling in all of a previously delayed allocation extent.
746 * The left and right neighbors are both contiguous with new. 710 * The left and right neighbors are both contiguous with new.
747 */ 711 */
748 trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); 712 --*idx;
749 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), 713 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
714 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
750 LEFT.br_blockcount + PREV.br_blockcount + 715 LEFT.br_blockcount + PREV.br_blockcount +
751 RIGHT.br_blockcount); 716 RIGHT.br_blockcount);
752 trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); 717 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
753 718
754 xfs_iext_remove(ip, idx, 2, state); 719 xfs_iext_remove(ip, *idx + 1, 2, state);
755 ip->i_df.if_lastex = idx - 1;
756 ip->i_d.di_nextents--; 720 ip->i_d.di_nextents--;
757 if (cur == NULL) 721 if (cur == NULL)
758 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; 722 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -784,13 +748,14 @@ xfs_bmap_add_extent_delay_real(
784 * Filling in all of a previously delayed allocation extent. 748 * Filling in all of a previously delayed allocation extent.
785 * The left neighbor is contiguous, the right is not. 749 * The left neighbor is contiguous, the right is not.
786 */ 750 */
787 trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); 751 --*idx;
788 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), 752
753 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
754 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
789 LEFT.br_blockcount + PREV.br_blockcount); 755 LEFT.br_blockcount + PREV.br_blockcount);
790 trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); 756 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
791 757
792 ip->i_df.if_lastex = idx - 1; 758 xfs_iext_remove(ip, *idx + 1, 1, state);
793 xfs_iext_remove(ip, idx, 1, state);
794 if (cur == NULL) 759 if (cur == NULL)
795 rval = XFS_ILOG_DEXT; 760 rval = XFS_ILOG_DEXT;
796 else { 761 else {
@@ -814,14 +779,13 @@ xfs_bmap_add_extent_delay_real(
814 * Filling in all of a previously delayed allocation extent. 779 * Filling in all of a previously delayed allocation extent.
815 * The right neighbor is contiguous, the left is not. 780 * The right neighbor is contiguous, the left is not.
816 */ 781 */
817 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 782 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
818 xfs_bmbt_set_startblock(ep, new->br_startblock); 783 xfs_bmbt_set_startblock(ep, new->br_startblock);
819 xfs_bmbt_set_blockcount(ep, 784 xfs_bmbt_set_blockcount(ep,
820 PREV.br_blockcount + RIGHT.br_blockcount); 785 PREV.br_blockcount + RIGHT.br_blockcount);
821 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 786 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
822 787
823 ip->i_df.if_lastex = idx; 788 xfs_iext_remove(ip, *idx + 1, 1, state);
824 xfs_iext_remove(ip, idx + 1, 1, state);
825 if (cur == NULL) 789 if (cur == NULL)
826 rval = XFS_ILOG_DEXT; 790 rval = XFS_ILOG_DEXT;
827 else { 791 else {
@@ -837,6 +801,7 @@ xfs_bmap_add_extent_delay_real(
837 RIGHT.br_blockcount, PREV.br_state))) 801 RIGHT.br_blockcount, PREV.br_state)))
838 goto done; 802 goto done;
839 } 803 }
804
840 *dnew = 0; 805 *dnew = 0;
841 break; 806 break;
842 807
@@ -846,11 +811,10 @@ xfs_bmap_add_extent_delay_real(
846 * Neither the left nor right neighbors are contiguous with 811 * Neither the left nor right neighbors are contiguous with
847 * the new one. 812 * the new one.
848 */ 813 */
849 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 814 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
850 xfs_bmbt_set_startblock(ep, new->br_startblock); 815 xfs_bmbt_set_startblock(ep, new->br_startblock);
851 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 816 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
852 817
853 ip->i_df.if_lastex = idx;
854 ip->i_d.di_nextents++; 818 ip->i_d.di_nextents++;
855 if (cur == NULL) 819 if (cur == NULL)
856 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; 820 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -866,6 +830,7 @@ xfs_bmap_add_extent_delay_real(
866 goto done; 830 goto done;
867 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 831 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
868 } 832 }
833
869 *dnew = 0; 834 *dnew = 0;
870 break; 835 break;
871 836
@@ -874,17 +839,16 @@ xfs_bmap_add_extent_delay_real(
874 * Filling in the first part of a previous delayed allocation. 839 * Filling in the first part of a previous delayed allocation.
875 * The left neighbor is contiguous. 840 * The left neighbor is contiguous.
876 */ 841 */
877 trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); 842 trace_xfs_bmap_pre_update(ip, *idx - 1, state, _THIS_IP_);
878 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), 843 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx - 1),
879 LEFT.br_blockcount + new->br_blockcount); 844 LEFT.br_blockcount + new->br_blockcount);
880 xfs_bmbt_set_startoff(ep, 845 xfs_bmbt_set_startoff(ep,
881 PREV.br_startoff + new->br_blockcount); 846 PREV.br_startoff + new->br_blockcount);
882 trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); 847 trace_xfs_bmap_post_update(ip, *idx - 1, state, _THIS_IP_);
883 848
884 temp = PREV.br_blockcount - new->br_blockcount; 849 temp = PREV.br_blockcount - new->br_blockcount;
885 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 850 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
886 xfs_bmbt_set_blockcount(ep, temp); 851 xfs_bmbt_set_blockcount(ep, temp);
887 ip->i_df.if_lastex = idx - 1;
888 if (cur == NULL) 852 if (cur == NULL)
889 rval = XFS_ILOG_DEXT; 853 rval = XFS_ILOG_DEXT;
890 else { 854 else {
@@ -904,7 +868,9 @@ xfs_bmap_add_extent_delay_real(
904 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), 868 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
905 startblockval(PREV.br_startblock)); 869 startblockval(PREV.br_startblock));
906 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); 870 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
907 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 871 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
872
873 --*idx;
908 *dnew = temp; 874 *dnew = temp;
909 break; 875 break;
910 876
@@ -913,12 +879,11 @@ xfs_bmap_add_extent_delay_real(
913 * Filling in the first part of a previous delayed allocation. 879 * Filling in the first part of a previous delayed allocation.
914 * The left neighbor is not contiguous. 880 * The left neighbor is not contiguous.
915 */ 881 */
916 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 882 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
917 xfs_bmbt_set_startoff(ep, new_endoff); 883 xfs_bmbt_set_startoff(ep, new_endoff);
918 temp = PREV.br_blockcount - new->br_blockcount; 884 temp = PREV.br_blockcount - new->br_blockcount;
919 xfs_bmbt_set_blockcount(ep, temp); 885 xfs_bmbt_set_blockcount(ep, temp);
920 xfs_iext_insert(ip, idx, 1, new, state); 886 xfs_iext_insert(ip, *idx, 1, new, state);
921 ip->i_df.if_lastex = idx;
922 ip->i_d.di_nextents++; 887 ip->i_d.di_nextents++;
923 if (cur == NULL) 888 if (cur == NULL)
924 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; 889 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -946,9 +911,10 @@ xfs_bmap_add_extent_delay_real(
946 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), 911 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
947 startblockval(PREV.br_startblock) - 912 startblockval(PREV.br_startblock) -
948 (cur ? cur->bc_private.b.allocated : 0)); 913 (cur ? cur->bc_private.b.allocated : 0));
949 ep = xfs_iext_get_ext(ifp, idx + 1); 914 ep = xfs_iext_get_ext(ifp, *idx + 1);
950 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); 915 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
951 trace_xfs_bmap_post_update(ip, idx + 1, state, _THIS_IP_); 916 trace_xfs_bmap_post_update(ip, *idx + 1, state, _THIS_IP_);
917
952 *dnew = temp; 918 *dnew = temp;
953 break; 919 break;
954 920
@@ -958,15 +924,13 @@ xfs_bmap_add_extent_delay_real(
958 * The right neighbor is contiguous with the new allocation. 924 * The right neighbor is contiguous with the new allocation.
959 */ 925 */
960 temp = PREV.br_blockcount - new->br_blockcount; 926 temp = PREV.br_blockcount - new->br_blockcount;
961 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 927 trace_xfs_bmap_pre_update(ip, *idx + 1, state, _THIS_IP_);
962 trace_xfs_bmap_pre_update(ip, idx + 1, state, _THIS_IP_);
963 xfs_bmbt_set_blockcount(ep, temp); 928 xfs_bmbt_set_blockcount(ep, temp);
964 xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, idx + 1), 929 xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx + 1),
965 new->br_startoff, new->br_startblock, 930 new->br_startoff, new->br_startblock,
966 new->br_blockcount + RIGHT.br_blockcount, 931 new->br_blockcount + RIGHT.br_blockcount,
967 RIGHT.br_state); 932 RIGHT.br_state);
968 trace_xfs_bmap_post_update(ip, idx + 1, state, _THIS_IP_); 933 trace_xfs_bmap_post_update(ip, *idx + 1, state, _THIS_IP_);
969 ip->i_df.if_lastex = idx + 1;
970 if (cur == NULL) 934 if (cur == NULL)
971 rval = XFS_ILOG_DEXT; 935 rval = XFS_ILOG_DEXT;
972 else { 936 else {
@@ -983,10 +947,14 @@ xfs_bmap_add_extent_delay_real(
983 RIGHT.br_state))) 947 RIGHT.br_state)))
984 goto done; 948 goto done;
985 } 949 }
950
986 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), 951 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
987 startblockval(PREV.br_startblock)); 952 startblockval(PREV.br_startblock));
953 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
988 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); 954 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
989 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 955 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
956
957 ++*idx;
990 *dnew = temp; 958 *dnew = temp;
991 break; 959 break;
992 960
@@ -996,10 +964,9 @@ xfs_bmap_add_extent_delay_real(
996 * The right neighbor is not contiguous. 964 * The right neighbor is not contiguous.
997 */ 965 */
998 temp = PREV.br_blockcount - new->br_blockcount; 966 temp = PREV.br_blockcount - new->br_blockcount;
999 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 967 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
1000 xfs_bmbt_set_blockcount(ep, temp); 968 xfs_bmbt_set_blockcount(ep, temp);
1001 xfs_iext_insert(ip, idx + 1, 1, new, state); 969 xfs_iext_insert(ip, *idx + 1, 1, new, state);
1002 ip->i_df.if_lastex = idx + 1;
1003 ip->i_d.di_nextents++; 970 ip->i_d.di_nextents++;
1004 if (cur == NULL) 971 if (cur == NULL)
1005 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; 972 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1027,9 +994,11 @@ xfs_bmap_add_extent_delay_real(
1027 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), 994 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
1028 startblockval(PREV.br_startblock) - 995 startblockval(PREV.br_startblock) -
1029 (cur ? cur->bc_private.b.allocated : 0)); 996 (cur ? cur->bc_private.b.allocated : 0));
1030 ep = xfs_iext_get_ext(ifp, idx); 997 ep = xfs_iext_get_ext(ifp, *idx);
1031 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); 998 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
1032 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 999 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1000
1001 ++*idx;
1033 *dnew = temp; 1002 *dnew = temp;
1034 break; 1003 break;
1035 1004
@@ -1056,7 +1025,7 @@ xfs_bmap_add_extent_delay_real(
1056 */ 1025 */
1057 temp = new->br_startoff - PREV.br_startoff; 1026 temp = new->br_startoff - PREV.br_startoff;
1058 temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff; 1027 temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff;
1059 trace_xfs_bmap_pre_update(ip, idx, 0, _THIS_IP_); 1028 trace_xfs_bmap_pre_update(ip, *idx, 0, _THIS_IP_);
1060 xfs_bmbt_set_blockcount(ep, temp); /* truncate PREV */ 1029 xfs_bmbt_set_blockcount(ep, temp); /* truncate PREV */
1061 LEFT = *new; 1030 LEFT = *new;
1062 RIGHT.br_state = PREV.br_state; 1031 RIGHT.br_state = PREV.br_state;
@@ -1065,8 +1034,7 @@ xfs_bmap_add_extent_delay_real(
1065 RIGHT.br_startoff = new_endoff; 1034 RIGHT.br_startoff = new_endoff;
1066 RIGHT.br_blockcount = temp2; 1035 RIGHT.br_blockcount = temp2;
1067 /* insert LEFT (r[0]) and RIGHT (r[1]) at the same time */ 1036 /* insert LEFT (r[0]) and RIGHT (r[1]) at the same time */
1068 xfs_iext_insert(ip, idx + 1, 2, &LEFT, state); 1037 xfs_iext_insert(ip, *idx + 1, 2, &LEFT, state);
1069 ip->i_df.if_lastex = idx + 1;
1070 ip->i_d.di_nextents++; 1038 ip->i_d.di_nextents++;
1071 if (cur == NULL) 1039 if (cur == NULL)
1072 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; 1040 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1097,7 +1065,7 @@ xfs_bmap_add_extent_delay_real(
1097 (cur ? cur->bc_private.b.allocated : 0)); 1065 (cur ? cur->bc_private.b.allocated : 0));
1098 if (diff > 0 && 1066 if (diff > 0 &&
1099 xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS, 1067 xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS,
1100 -((int64_t)diff), rsvd)) { 1068 -((int64_t)diff), 0)) {
1101 /* 1069 /*
1102 * Ick gross gag me with a spoon. 1070 * Ick gross gag me with a spoon.
1103 */ 1071 */
@@ -1109,7 +1077,7 @@ xfs_bmap_add_extent_delay_real(
1109 if (!diff || 1077 if (!diff ||
1110 !xfs_icsb_modify_counters(ip->i_mount, 1078 !xfs_icsb_modify_counters(ip->i_mount,
1111 XFS_SBS_FDBLOCKS, 1079 XFS_SBS_FDBLOCKS,
1112 -((int64_t)diff), rsvd)) 1080 -((int64_t)diff), 0))
1113 break; 1081 break;
1114 } 1082 }
1115 if (temp2) { 1083 if (temp2) {
@@ -1118,18 +1086,20 @@ xfs_bmap_add_extent_delay_real(
1118 if (!diff || 1086 if (!diff ||
1119 !xfs_icsb_modify_counters(ip->i_mount, 1087 !xfs_icsb_modify_counters(ip->i_mount,
1120 XFS_SBS_FDBLOCKS, 1088 XFS_SBS_FDBLOCKS,
1121 -((int64_t)diff), rsvd)) 1089 -((int64_t)diff), 0))
1122 break; 1090 break;
1123 } 1091 }
1124 } 1092 }
1125 } 1093 }
1126 ep = xfs_iext_get_ext(ifp, idx); 1094 ep = xfs_iext_get_ext(ifp, *idx);
1127 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); 1095 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
1128 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 1096 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1129 trace_xfs_bmap_pre_update(ip, idx + 2, state, _THIS_IP_); 1097 trace_xfs_bmap_pre_update(ip, *idx + 2, state, _THIS_IP_);
1130 xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx + 2), 1098 xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx + 2),
1131 nullstartblock((int)temp2)); 1099 nullstartblock((int)temp2));
1132 trace_xfs_bmap_post_update(ip, idx + 2, state, _THIS_IP_); 1100 trace_xfs_bmap_post_update(ip, *idx + 2, state, _THIS_IP_);
1101
1102 ++*idx;
1133 *dnew = temp + temp2; 1103 *dnew = temp + temp2;
1134 break; 1104 break;
1135 1105
@@ -1161,7 +1131,7 @@ done:
1161STATIC int /* error */ 1131STATIC int /* error */
1162xfs_bmap_add_extent_unwritten_real( 1132xfs_bmap_add_extent_unwritten_real(
1163 xfs_inode_t *ip, /* incore inode pointer */ 1133 xfs_inode_t *ip, /* incore inode pointer */
1164 xfs_extnum_t idx, /* extent number to update/insert */ 1134 xfs_extnum_t *idx, /* extent number to update/insert */
1165 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ 1135 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
1166 xfs_bmbt_irec_t *new, /* new data to add to file extents */ 1136 xfs_bmbt_irec_t *new, /* new data to add to file extents */
1167 int *logflagsp) /* inode logging flags */ 1137 int *logflagsp) /* inode logging flags */
@@ -1188,7 +1158,7 @@ xfs_bmap_add_extent_unwritten_real(
1188 error = 0; 1158 error = 0;
1189 cur = *curp; 1159 cur = *curp;
1190 ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); 1160 ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
1191 ep = xfs_iext_get_ext(ifp, idx); 1161 ep = xfs_iext_get_ext(ifp, *idx);
1192 xfs_bmbt_get_all(ep, &PREV); 1162 xfs_bmbt_get_all(ep, &PREV);
1193 newext = new->br_state; 1163 newext = new->br_state;
1194 oldext = (newext == XFS_EXT_UNWRITTEN) ? 1164 oldext = (newext == XFS_EXT_UNWRITTEN) ?
@@ -1211,9 +1181,9 @@ xfs_bmap_add_extent_unwritten_real(
1211 * Check and set flags if this segment has a left neighbor. 1181 * Check and set flags if this segment has a left neighbor.
1212 * Don't set contiguous if the combined extent would be too large. 1182 * Don't set contiguous if the combined extent would be too large.
1213 */ 1183 */
1214 if (idx > 0) { 1184 if (*idx > 0) {
1215 state |= BMAP_LEFT_VALID; 1185 state |= BMAP_LEFT_VALID;
1216 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &LEFT); 1186 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &LEFT);
1217 1187
1218 if (isnullstartblock(LEFT.br_startblock)) 1188 if (isnullstartblock(LEFT.br_startblock))
1219 state |= BMAP_LEFT_DELAY; 1189 state |= BMAP_LEFT_DELAY;
@@ -1231,9 +1201,9 @@ xfs_bmap_add_extent_unwritten_real(
1231 * Don't set contiguous if the combined extent would be too large. 1201 * Don't set contiguous if the combined extent would be too large.
1232 * Also check for all-three-contiguous being too large. 1202 * Also check for all-three-contiguous being too large.
1233 */ 1203 */
1234 if (idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) { 1204 if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) {
1235 state |= BMAP_RIGHT_VALID; 1205 state |= BMAP_RIGHT_VALID;
1236 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx + 1), &RIGHT); 1206 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT);
1237 if (isnullstartblock(RIGHT.br_startblock)) 1207 if (isnullstartblock(RIGHT.br_startblock))
1238 state |= BMAP_RIGHT_DELAY; 1208 state |= BMAP_RIGHT_DELAY;
1239 } 1209 }
@@ -1262,14 +1232,15 @@ xfs_bmap_add_extent_unwritten_real(
1262 * Setting all of a previous oldext extent to newext. 1232 * Setting all of a previous oldext extent to newext.
1263 * The left and right neighbors are both contiguous with new. 1233 * The left and right neighbors are both contiguous with new.
1264 */ 1234 */
1265 trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); 1235 --*idx;
1266 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), 1236
1237 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
1238 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
1267 LEFT.br_blockcount + PREV.br_blockcount + 1239 LEFT.br_blockcount + PREV.br_blockcount +
1268 RIGHT.br_blockcount); 1240 RIGHT.br_blockcount);
1269 trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); 1241 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1270 1242
1271 xfs_iext_remove(ip, idx, 2, state); 1243 xfs_iext_remove(ip, *idx + 1, 2, state);
1272 ip->i_df.if_lastex = idx - 1;
1273 ip->i_d.di_nextents -= 2; 1244 ip->i_d.di_nextents -= 2;
1274 if (cur == NULL) 1245 if (cur == NULL)
1275 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; 1246 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1305,13 +1276,14 @@ xfs_bmap_add_extent_unwritten_real(
1305 * Setting all of a previous oldext extent to newext. 1276 * Setting all of a previous oldext extent to newext.
1306 * The left neighbor is contiguous, the right is not. 1277 * The left neighbor is contiguous, the right is not.
1307 */ 1278 */
1308 trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); 1279 --*idx;
1309 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), 1280
1281 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
1282 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
1310 LEFT.br_blockcount + PREV.br_blockcount); 1283 LEFT.br_blockcount + PREV.br_blockcount);
1311 trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); 1284 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1312 1285
1313 ip->i_df.if_lastex = idx - 1; 1286 xfs_iext_remove(ip, *idx + 1, 1, state);
1314 xfs_iext_remove(ip, idx, 1, state);
1315 ip->i_d.di_nextents--; 1287 ip->i_d.di_nextents--;
1316 if (cur == NULL) 1288 if (cur == NULL)
1317 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; 1289 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1341,13 +1313,12 @@ xfs_bmap_add_extent_unwritten_real(
1341 * Setting all of a previous oldext extent to newext. 1313 * Setting all of a previous oldext extent to newext.
1342 * The right neighbor is contiguous, the left is not. 1314 * The right neighbor is contiguous, the left is not.
1343 */ 1315 */
1344 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 1316 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
1345 xfs_bmbt_set_blockcount(ep, 1317 xfs_bmbt_set_blockcount(ep,
1346 PREV.br_blockcount + RIGHT.br_blockcount); 1318 PREV.br_blockcount + RIGHT.br_blockcount);
1347 xfs_bmbt_set_state(ep, newext); 1319 xfs_bmbt_set_state(ep, newext);
1348 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 1320 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1349 ip->i_df.if_lastex = idx; 1321 xfs_iext_remove(ip, *idx + 1, 1, state);
1350 xfs_iext_remove(ip, idx + 1, 1, state);
1351 ip->i_d.di_nextents--; 1322 ip->i_d.di_nextents--;
1352 if (cur == NULL) 1323 if (cur == NULL)
1353 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; 1324 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1378,11 +1349,10 @@ xfs_bmap_add_extent_unwritten_real(
1378 * Neither the left nor right neighbors are contiguous with 1349 * Neither the left nor right neighbors are contiguous with
1379 * the new one. 1350 * the new one.
1380 */ 1351 */
1381 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 1352 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
1382 xfs_bmbt_set_state(ep, newext); 1353 xfs_bmbt_set_state(ep, newext);
1383 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 1354 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1384 1355
1385 ip->i_df.if_lastex = idx;
1386 if (cur == NULL) 1356 if (cur == NULL)
1387 rval = XFS_ILOG_DEXT; 1357 rval = XFS_ILOG_DEXT;
1388 else { 1358 else {
@@ -1404,21 +1374,22 @@ xfs_bmap_add_extent_unwritten_real(
1404 * Setting the first part of a previous oldext extent to newext. 1374 * Setting the first part of a previous oldext extent to newext.
1405 * The left neighbor is contiguous. 1375 * The left neighbor is contiguous.
1406 */ 1376 */
1407 trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); 1377 trace_xfs_bmap_pre_update(ip, *idx - 1, state, _THIS_IP_);
1408 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), 1378 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx - 1),
1409 LEFT.br_blockcount + new->br_blockcount); 1379 LEFT.br_blockcount + new->br_blockcount);
1410 xfs_bmbt_set_startoff(ep, 1380 xfs_bmbt_set_startoff(ep,
1411 PREV.br_startoff + new->br_blockcount); 1381 PREV.br_startoff + new->br_blockcount);
1412 trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); 1382 trace_xfs_bmap_post_update(ip, *idx - 1, state, _THIS_IP_);
1413 1383
1414 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 1384 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
1415 xfs_bmbt_set_startblock(ep, 1385 xfs_bmbt_set_startblock(ep,
1416 new->br_startblock + new->br_blockcount); 1386 new->br_startblock + new->br_blockcount);
1417 xfs_bmbt_set_blockcount(ep, 1387 xfs_bmbt_set_blockcount(ep,
1418 PREV.br_blockcount - new->br_blockcount); 1388 PREV.br_blockcount - new->br_blockcount);
1419 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 1389 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1390
1391 --*idx;
1420 1392
1421 ip->i_df.if_lastex = idx - 1;
1422 if (cur == NULL) 1393 if (cur == NULL)
1423 rval = XFS_ILOG_DEXT; 1394 rval = XFS_ILOG_DEXT;
1424 else { 1395 else {
@@ -1449,17 +1420,16 @@ xfs_bmap_add_extent_unwritten_real(
1449 * Setting the first part of a previous oldext extent to newext. 1420 * Setting the first part of a previous oldext extent to newext.
1450 * The left neighbor is not contiguous. 1421 * The left neighbor is not contiguous.
1451 */ 1422 */
1452 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 1423 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
1453 ASSERT(ep && xfs_bmbt_get_state(ep) == oldext); 1424 ASSERT(ep && xfs_bmbt_get_state(ep) == oldext);
1454 xfs_bmbt_set_startoff(ep, new_endoff); 1425 xfs_bmbt_set_startoff(ep, new_endoff);
1455 xfs_bmbt_set_blockcount(ep, 1426 xfs_bmbt_set_blockcount(ep,
1456 PREV.br_blockcount - new->br_blockcount); 1427 PREV.br_blockcount - new->br_blockcount);
1457 xfs_bmbt_set_startblock(ep, 1428 xfs_bmbt_set_startblock(ep,
1458 new->br_startblock + new->br_blockcount); 1429 new->br_startblock + new->br_blockcount);
1459 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 1430 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1460 1431
1461 xfs_iext_insert(ip, idx, 1, new, state); 1432 xfs_iext_insert(ip, *idx, 1, new, state);
1462 ip->i_df.if_lastex = idx;
1463 ip->i_d.di_nextents++; 1433 ip->i_d.di_nextents++;
1464 if (cur == NULL) 1434 if (cur == NULL)
1465 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; 1435 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1488,17 +1458,19 @@ xfs_bmap_add_extent_unwritten_real(
1488 * Setting the last part of a previous oldext extent to newext. 1458 * Setting the last part of a previous oldext extent to newext.
1489 * The right neighbor is contiguous with the new allocation. 1459 * The right neighbor is contiguous with the new allocation.
1490 */ 1460 */
1491 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 1461 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
1492 trace_xfs_bmap_pre_update(ip, idx + 1, state, _THIS_IP_);
1493 xfs_bmbt_set_blockcount(ep, 1462 xfs_bmbt_set_blockcount(ep,
1494 PREV.br_blockcount - new->br_blockcount); 1463 PREV.br_blockcount - new->br_blockcount);
1495 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 1464 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1496 xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, idx + 1), 1465
1466 ++*idx;
1467
1468 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
1469 xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx),
1497 new->br_startoff, new->br_startblock, 1470 new->br_startoff, new->br_startblock,
1498 new->br_blockcount + RIGHT.br_blockcount, newext); 1471 new->br_blockcount + RIGHT.br_blockcount, newext);
1499 trace_xfs_bmap_post_update(ip, idx + 1, state, _THIS_IP_); 1472 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1500 1473
1501 ip->i_df.if_lastex = idx + 1;
1502 if (cur == NULL) 1474 if (cur == NULL)
1503 rval = XFS_ILOG_DEXT; 1475 rval = XFS_ILOG_DEXT;
1504 else { 1476 else {
@@ -1528,13 +1500,14 @@ xfs_bmap_add_extent_unwritten_real(
1528 * Setting the last part of a previous oldext extent to newext. 1500 * Setting the last part of a previous oldext extent to newext.
1529 * The right neighbor is not contiguous. 1501 * The right neighbor is not contiguous.
1530 */ 1502 */
1531 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 1503 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
1532 xfs_bmbt_set_blockcount(ep, 1504 xfs_bmbt_set_blockcount(ep,
1533 PREV.br_blockcount - new->br_blockcount); 1505 PREV.br_blockcount - new->br_blockcount);
1534 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 1506 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1507
1508 ++*idx;
1509 xfs_iext_insert(ip, *idx, 1, new, state);
1535 1510
1536 xfs_iext_insert(ip, idx + 1, 1, new, state);
1537 ip->i_df.if_lastex = idx + 1;
1538 ip->i_d.di_nextents++; 1511 ip->i_d.di_nextents++;
1539 if (cur == NULL) 1512 if (cur == NULL)
1540 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; 1513 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1568,10 +1541,10 @@ xfs_bmap_add_extent_unwritten_real(
1568 * newext. Contiguity is impossible here. 1541 * newext. Contiguity is impossible here.
1569 * One extent becomes three extents. 1542 * One extent becomes three extents.
1570 */ 1543 */
1571 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 1544 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
1572 xfs_bmbt_set_blockcount(ep, 1545 xfs_bmbt_set_blockcount(ep,
1573 new->br_startoff - PREV.br_startoff); 1546 new->br_startoff - PREV.br_startoff);
1574 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 1547 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1575 1548
1576 r[0] = *new; 1549 r[0] = *new;
1577 r[1].br_startoff = new_endoff; 1550 r[1].br_startoff = new_endoff;
@@ -1579,8 +1552,10 @@ xfs_bmap_add_extent_unwritten_real(
1579 PREV.br_startoff + PREV.br_blockcount - new_endoff; 1552 PREV.br_startoff + PREV.br_blockcount - new_endoff;
1580 r[1].br_startblock = new->br_startblock + new->br_blockcount; 1553 r[1].br_startblock = new->br_startblock + new->br_blockcount;
1581 r[1].br_state = oldext; 1554 r[1].br_state = oldext;
1582 xfs_iext_insert(ip, idx + 1, 2, &r[0], state); 1555
1583 ip->i_df.if_lastex = idx + 1; 1556 ++*idx;
1557 xfs_iext_insert(ip, *idx, 2, &r[0], state);
1558
1584 ip->i_d.di_nextents += 2; 1559 ip->i_d.di_nextents += 2;
1585 if (cur == NULL) 1560 if (cur == NULL)
1586 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; 1561 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1650,12 +1625,10 @@ done:
1650STATIC int /* error */ 1625STATIC int /* error */
1651xfs_bmap_add_extent_hole_delay( 1626xfs_bmap_add_extent_hole_delay(
1652 xfs_inode_t *ip, /* incore inode pointer */ 1627 xfs_inode_t *ip, /* incore inode pointer */
1653 xfs_extnum_t idx, /* extent number to update/insert */ 1628 xfs_extnum_t *idx, /* extent number to update/insert */
1654 xfs_bmbt_irec_t *new, /* new data to add to file extents */ 1629 xfs_bmbt_irec_t *new, /* new data to add to file extents */
1655 int *logflagsp, /* inode logging flags */ 1630 int *logflagsp) /* inode logging flags */
1656 int rsvd) /* OK to allocate reserved blocks */
1657{ 1631{
1658 xfs_bmbt_rec_host_t *ep; /* extent record for idx */
1659 xfs_ifork_t *ifp; /* inode fork pointer */ 1632 xfs_ifork_t *ifp; /* inode fork pointer */
1660 xfs_bmbt_irec_t left; /* left neighbor extent entry */ 1633 xfs_bmbt_irec_t left; /* left neighbor extent entry */
1661 xfs_filblks_t newlen=0; /* new indirect size */ 1634 xfs_filblks_t newlen=0; /* new indirect size */
@@ -1665,16 +1638,15 @@ xfs_bmap_add_extent_hole_delay(
1665 xfs_filblks_t temp=0; /* temp for indirect calculations */ 1638 xfs_filblks_t temp=0; /* temp for indirect calculations */
1666 1639
1667 ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); 1640 ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
1668 ep = xfs_iext_get_ext(ifp, idx);
1669 state = 0; 1641 state = 0;
1670 ASSERT(isnullstartblock(new->br_startblock)); 1642 ASSERT(isnullstartblock(new->br_startblock));
1671 1643
1672 /* 1644 /*
1673 * Check and set flags if this segment has a left neighbor 1645 * Check and set flags if this segment has a left neighbor
1674 */ 1646 */
1675 if (idx > 0) { 1647 if (*idx > 0) {
1676 state |= BMAP_LEFT_VALID; 1648 state |= BMAP_LEFT_VALID;
1677 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &left); 1649 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &left);
1678 1650
1679 if (isnullstartblock(left.br_startblock)) 1651 if (isnullstartblock(left.br_startblock))
1680 state |= BMAP_LEFT_DELAY; 1652 state |= BMAP_LEFT_DELAY;
@@ -1684,9 +1656,9 @@ xfs_bmap_add_extent_hole_delay(
1684 * Check and set flags if the current (right) segment exists. 1656 * Check and set flags if the current (right) segment exists.
1685 * If it doesn't exist, we're converting the hole at end-of-file. 1657 * If it doesn't exist, we're converting the hole at end-of-file.
1686 */ 1658 */
1687 if (idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) { 1659 if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) {
1688 state |= BMAP_RIGHT_VALID; 1660 state |= BMAP_RIGHT_VALID;
1689 xfs_bmbt_get_all(ep, &right); 1661 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right);
1690 1662
1691 if (isnullstartblock(right.br_startblock)) 1663 if (isnullstartblock(right.br_startblock))
1692 state |= BMAP_RIGHT_DELAY; 1664 state |= BMAP_RIGHT_DELAY;
@@ -1719,21 +1691,21 @@ xfs_bmap_add_extent_hole_delay(
1719 * on the left and on the right. 1691 * on the left and on the right.
1720 * Merge all three into a single extent record. 1692 * Merge all three into a single extent record.
1721 */ 1693 */
1694 --*idx;
1722 temp = left.br_blockcount + new->br_blockcount + 1695 temp = left.br_blockcount + new->br_blockcount +
1723 right.br_blockcount; 1696 right.br_blockcount;
1724 1697
1725 trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); 1698 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
1726 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), temp); 1699 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp);
1727 oldlen = startblockval(left.br_startblock) + 1700 oldlen = startblockval(left.br_startblock) +
1728 startblockval(new->br_startblock) + 1701 startblockval(new->br_startblock) +
1729 startblockval(right.br_startblock); 1702 startblockval(right.br_startblock);
1730 newlen = xfs_bmap_worst_indlen(ip, temp); 1703 newlen = xfs_bmap_worst_indlen(ip, temp);
1731 xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx - 1), 1704 xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx),
1732 nullstartblock((int)newlen)); 1705 nullstartblock((int)newlen));
1733 trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); 1706 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1734 1707
1735 xfs_iext_remove(ip, idx, 1, state); 1708 xfs_iext_remove(ip, *idx + 1, 1, state);
1736 ip->i_df.if_lastex = idx - 1;
1737 break; 1709 break;
1738 1710
1739 case BMAP_LEFT_CONTIG: 1711 case BMAP_LEFT_CONTIG:
@@ -1742,17 +1714,17 @@ xfs_bmap_add_extent_hole_delay(
1742 * on the left. 1714 * on the left.
1743 * Merge the new allocation with the left neighbor. 1715 * Merge the new allocation with the left neighbor.
1744 */ 1716 */
1717 --*idx;
1745 temp = left.br_blockcount + new->br_blockcount; 1718 temp = left.br_blockcount + new->br_blockcount;
1746 trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); 1719
1747 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), temp); 1720 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
1721 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp);
1748 oldlen = startblockval(left.br_startblock) + 1722 oldlen = startblockval(left.br_startblock) +
1749 startblockval(new->br_startblock); 1723 startblockval(new->br_startblock);
1750 newlen = xfs_bmap_worst_indlen(ip, temp); 1724 newlen = xfs_bmap_worst_indlen(ip, temp);
1751 xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx - 1), 1725 xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx),
1752 nullstartblock((int)newlen)); 1726 nullstartblock((int)newlen));
1753 trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); 1727 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1754
1755 ip->i_df.if_lastex = idx - 1;
1756 break; 1728 break;
1757 1729
1758 case BMAP_RIGHT_CONTIG: 1730 case BMAP_RIGHT_CONTIG:
@@ -1761,16 +1733,15 @@ xfs_bmap_add_extent_hole_delay(
1761 * on the right. 1733 * on the right.
1762 * Merge the new allocation with the right neighbor. 1734 * Merge the new allocation with the right neighbor.
1763 */ 1735 */
1764 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 1736 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
1765 temp = new->br_blockcount + right.br_blockcount; 1737 temp = new->br_blockcount + right.br_blockcount;
1766 oldlen = startblockval(new->br_startblock) + 1738 oldlen = startblockval(new->br_startblock) +
1767 startblockval(right.br_startblock); 1739 startblockval(right.br_startblock);
1768 newlen = xfs_bmap_worst_indlen(ip, temp); 1740 newlen = xfs_bmap_worst_indlen(ip, temp);
1769 xfs_bmbt_set_allf(ep, new->br_startoff, 1741 xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx),
1742 new->br_startoff,
1770 nullstartblock((int)newlen), temp, right.br_state); 1743 nullstartblock((int)newlen), temp, right.br_state);
1771 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 1744 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1772
1773 ip->i_df.if_lastex = idx;
1774 break; 1745 break;
1775 1746
1776 case 0: 1747 case 0:
@@ -1780,14 +1751,13 @@ xfs_bmap_add_extent_hole_delay(
1780 * Insert a new entry. 1751 * Insert a new entry.
1781 */ 1752 */
1782 oldlen = newlen = 0; 1753 oldlen = newlen = 0;
1783 xfs_iext_insert(ip, idx, 1, new, state); 1754 xfs_iext_insert(ip, *idx, 1, new, state);
1784 ip->i_df.if_lastex = idx;
1785 break; 1755 break;
1786 } 1756 }
1787 if (oldlen != newlen) { 1757 if (oldlen != newlen) {
1788 ASSERT(oldlen > newlen); 1758 ASSERT(oldlen > newlen);
1789 xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS, 1759 xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS,
1790 (int64_t)(oldlen - newlen), rsvd); 1760 (int64_t)(oldlen - newlen), 0);
1791 /* 1761 /*
1792 * Nothing to do for disk quota accounting here. 1762 * Nothing to do for disk quota accounting here.
1793 */ 1763 */
@@ -1803,13 +1773,12 @@ xfs_bmap_add_extent_hole_delay(
1803STATIC int /* error */ 1773STATIC int /* error */
1804xfs_bmap_add_extent_hole_real( 1774xfs_bmap_add_extent_hole_real(
1805 xfs_inode_t *ip, /* incore inode pointer */ 1775 xfs_inode_t *ip, /* incore inode pointer */
1806 xfs_extnum_t idx, /* extent number to update/insert */ 1776 xfs_extnum_t *idx, /* extent number to update/insert */
1807 xfs_btree_cur_t *cur, /* if null, not a btree */ 1777 xfs_btree_cur_t *cur, /* if null, not a btree */
1808 xfs_bmbt_irec_t *new, /* new data to add to file extents */ 1778 xfs_bmbt_irec_t *new, /* new data to add to file extents */
1809 int *logflagsp, /* inode logging flags */ 1779 int *logflagsp, /* inode logging flags */
1810 int whichfork) /* data or attr fork */ 1780 int whichfork) /* data or attr fork */
1811{ 1781{
1812 xfs_bmbt_rec_host_t *ep; /* pointer to extent entry ins. point */
1813 int error; /* error return value */ 1782 int error; /* error return value */
1814 int i; /* temp state */ 1783 int i; /* temp state */
1815 xfs_ifork_t *ifp; /* inode fork pointer */ 1784 xfs_ifork_t *ifp; /* inode fork pointer */
@@ -1819,8 +1788,7 @@ xfs_bmap_add_extent_hole_real(
1819 int state; /* state bits, accessed thru macros */ 1788 int state; /* state bits, accessed thru macros */
1820 1789
1821 ifp = XFS_IFORK_PTR(ip, whichfork); 1790 ifp = XFS_IFORK_PTR(ip, whichfork);
1822 ASSERT(idx <= ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)); 1791 ASSERT(*idx <= ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t));
1823 ep = xfs_iext_get_ext(ifp, idx);
1824 state = 0; 1792 state = 0;
1825 1793
1826 if (whichfork == XFS_ATTR_FORK) 1794 if (whichfork == XFS_ATTR_FORK)
@@ -1829,9 +1797,9 @@ xfs_bmap_add_extent_hole_real(
1829 /* 1797 /*
1830 * Check and set flags if this segment has a left neighbor. 1798 * Check and set flags if this segment has a left neighbor.
1831 */ 1799 */
1832 if (idx > 0) { 1800 if (*idx > 0) {
1833 state |= BMAP_LEFT_VALID; 1801 state |= BMAP_LEFT_VALID;
1834 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &left); 1802 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &left);
1835 if (isnullstartblock(left.br_startblock)) 1803 if (isnullstartblock(left.br_startblock))
1836 state |= BMAP_LEFT_DELAY; 1804 state |= BMAP_LEFT_DELAY;
1837 } 1805 }
@@ -1840,9 +1808,9 @@ xfs_bmap_add_extent_hole_real(
1840 * Check and set flags if this segment has a current value. 1808 * Check and set flags if this segment has a current value.
1841 * Not true if we're inserting into the "hole" at eof. 1809 * Not true if we're inserting into the "hole" at eof.
1842 */ 1810 */
1843 if (idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) { 1811 if (*idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) {
1844 state |= BMAP_RIGHT_VALID; 1812 state |= BMAP_RIGHT_VALID;
1845 xfs_bmbt_get_all(ep, &right); 1813 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right);
1846 if (isnullstartblock(right.br_startblock)) 1814 if (isnullstartblock(right.br_startblock))
1847 state |= BMAP_RIGHT_DELAY; 1815 state |= BMAP_RIGHT_DELAY;
1848 } 1816 }
@@ -1879,14 +1847,15 @@ xfs_bmap_add_extent_hole_real(
1879 * left and on the right. 1847 * left and on the right.
1880 * Merge all three into a single extent record. 1848 * Merge all three into a single extent record.
1881 */ 1849 */
1882 trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); 1850 --*idx;
1883 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), 1851 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
1852 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
1884 left.br_blockcount + new->br_blockcount + 1853 left.br_blockcount + new->br_blockcount +
1885 right.br_blockcount); 1854 right.br_blockcount);
1886 trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); 1855 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1856
1857 xfs_iext_remove(ip, *idx + 1, 1, state);
1887 1858
1888 xfs_iext_remove(ip, idx, 1, state);
1889 ifp->if_lastex = idx - 1;
1890 XFS_IFORK_NEXT_SET(ip, whichfork, 1859 XFS_IFORK_NEXT_SET(ip, whichfork,
1891 XFS_IFORK_NEXTENTS(ip, whichfork) - 1); 1860 XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
1892 if (cur == NULL) { 1861 if (cur == NULL) {
@@ -1921,12 +1890,12 @@ xfs_bmap_add_extent_hole_real(
1921 * on the left. 1890 * on the left.
1922 * Merge the new allocation with the left neighbor. 1891 * Merge the new allocation with the left neighbor.
1923 */ 1892 */
1924 trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); 1893 --*idx;
1925 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), 1894 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
1895 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
1926 left.br_blockcount + new->br_blockcount); 1896 left.br_blockcount + new->br_blockcount);
1927 trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); 1897 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1928 1898
1929 ifp->if_lastex = idx - 1;
1930 if (cur == NULL) { 1899 if (cur == NULL) {
1931 rval = xfs_ilog_fext(whichfork); 1900 rval = xfs_ilog_fext(whichfork);
1932 } else { 1901 } else {
@@ -1952,13 +1921,13 @@ xfs_bmap_add_extent_hole_real(
1952 * on the right. 1921 * on the right.
1953 * Merge the new allocation with the right neighbor. 1922 * Merge the new allocation with the right neighbor.
1954 */ 1923 */
1955 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 1924 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
1956 xfs_bmbt_set_allf(ep, new->br_startoff, new->br_startblock, 1925 xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx),
1926 new->br_startoff, new->br_startblock,
1957 new->br_blockcount + right.br_blockcount, 1927 new->br_blockcount + right.br_blockcount,
1958 right.br_state); 1928 right.br_state);
1959 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 1929 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1960 1930
1961 ifp->if_lastex = idx;
1962 if (cur == NULL) { 1931 if (cur == NULL) {
1963 rval = xfs_ilog_fext(whichfork); 1932 rval = xfs_ilog_fext(whichfork);
1964 } else { 1933 } else {
@@ -1984,8 +1953,7 @@ xfs_bmap_add_extent_hole_real(
1984 * real allocation. 1953 * real allocation.
1985 * Insert a new entry. 1954 * Insert a new entry.
1986 */ 1955 */
1987 xfs_iext_insert(ip, idx, 1, new, state); 1956 xfs_iext_insert(ip, *idx, 1, new, state);
1988 ifp->if_lastex = idx;
1989 XFS_IFORK_NEXT_SET(ip, whichfork, 1957 XFS_IFORK_NEXT_SET(ip, whichfork,
1990 XFS_IFORK_NEXTENTS(ip, whichfork) + 1); 1958 XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
1991 if (cur == NULL) { 1959 if (cur == NULL) {
@@ -2833,13 +2801,12 @@ STATIC int /* error */
2833xfs_bmap_del_extent( 2801xfs_bmap_del_extent(
2834 xfs_inode_t *ip, /* incore inode pointer */ 2802 xfs_inode_t *ip, /* incore inode pointer */
2835 xfs_trans_t *tp, /* current transaction pointer */ 2803 xfs_trans_t *tp, /* current transaction pointer */
2836 xfs_extnum_t idx, /* extent number to update/delete */ 2804 xfs_extnum_t *idx, /* extent number to update/delete */
2837 xfs_bmap_free_t *flist, /* list of extents to be freed */ 2805 xfs_bmap_free_t *flist, /* list of extents to be freed */
2838 xfs_btree_cur_t *cur, /* if null, not a btree */ 2806 xfs_btree_cur_t *cur, /* if null, not a btree */
2839 xfs_bmbt_irec_t *del, /* data to remove from extents */ 2807 xfs_bmbt_irec_t *del, /* data to remove from extents */
2840 int *logflagsp, /* inode logging flags */ 2808 int *logflagsp, /* inode logging flags */
2841 int whichfork, /* data or attr fork */ 2809 int whichfork) /* data or attr fork */
2842 int rsvd) /* OK to allocate reserved blocks */
2843{ 2810{
2844 xfs_filblks_t da_new; /* new delay-alloc indirect blocks */ 2811 xfs_filblks_t da_new; /* new delay-alloc indirect blocks */
2845 xfs_filblks_t da_old; /* old delay-alloc indirect blocks */ 2812 xfs_filblks_t da_old; /* old delay-alloc indirect blocks */
@@ -2870,10 +2837,10 @@ xfs_bmap_del_extent(
2870 2837
2871 mp = ip->i_mount; 2838 mp = ip->i_mount;
2872 ifp = XFS_IFORK_PTR(ip, whichfork); 2839 ifp = XFS_IFORK_PTR(ip, whichfork);
2873 ASSERT((idx >= 0) && (idx < ifp->if_bytes / 2840 ASSERT((*idx >= 0) && (*idx < ifp->if_bytes /
2874 (uint)sizeof(xfs_bmbt_rec_t))); 2841 (uint)sizeof(xfs_bmbt_rec_t)));
2875 ASSERT(del->br_blockcount > 0); 2842 ASSERT(del->br_blockcount > 0);
2876 ep = xfs_iext_get_ext(ifp, idx); 2843 ep = xfs_iext_get_ext(ifp, *idx);
2877 xfs_bmbt_get_all(ep, &got); 2844 xfs_bmbt_get_all(ep, &got);
2878 ASSERT(got.br_startoff <= del->br_startoff); 2845 ASSERT(got.br_startoff <= del->br_startoff);
2879 del_endoff = del->br_startoff + del->br_blockcount; 2846 del_endoff = del->br_startoff + del->br_blockcount;
@@ -2947,11 +2914,12 @@ xfs_bmap_del_extent(
2947 /* 2914 /*
2948 * Matches the whole extent. Delete the entry. 2915 * Matches the whole extent. Delete the entry.
2949 */ 2916 */
2950 xfs_iext_remove(ip, idx, 1, 2917 xfs_iext_remove(ip, *idx, 1,
2951 whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0); 2918 whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0);
2952 ifp->if_lastex = idx; 2919 --*idx;
2953 if (delay) 2920 if (delay)
2954 break; 2921 break;
2922
2955 XFS_IFORK_NEXT_SET(ip, whichfork, 2923 XFS_IFORK_NEXT_SET(ip, whichfork,
2956 XFS_IFORK_NEXTENTS(ip, whichfork) - 1); 2924 XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
2957 flags |= XFS_ILOG_CORE; 2925 flags |= XFS_ILOG_CORE;
@@ -2968,21 +2936,20 @@ xfs_bmap_del_extent(
2968 /* 2936 /*
2969 * Deleting the first part of the extent. 2937 * Deleting the first part of the extent.
2970 */ 2938 */
2971 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 2939 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
2972 xfs_bmbt_set_startoff(ep, del_endoff); 2940 xfs_bmbt_set_startoff(ep, del_endoff);
2973 temp = got.br_blockcount - del->br_blockcount; 2941 temp = got.br_blockcount - del->br_blockcount;
2974 xfs_bmbt_set_blockcount(ep, temp); 2942 xfs_bmbt_set_blockcount(ep, temp);
2975 ifp->if_lastex = idx;
2976 if (delay) { 2943 if (delay) {
2977 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), 2944 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
2978 da_old); 2945 da_old);
2979 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); 2946 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
2980 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 2947 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
2981 da_new = temp; 2948 da_new = temp;
2982 break; 2949 break;
2983 } 2950 }
2984 xfs_bmbt_set_startblock(ep, del_endblock); 2951 xfs_bmbt_set_startblock(ep, del_endblock);
2985 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 2952 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
2986 if (!cur) { 2953 if (!cur) {
2987 flags |= xfs_ilog_fext(whichfork); 2954 flags |= xfs_ilog_fext(whichfork);
2988 break; 2955 break;
@@ -2998,18 +2965,17 @@ xfs_bmap_del_extent(
2998 * Deleting the last part of the extent. 2965 * Deleting the last part of the extent.
2999 */ 2966 */
3000 temp = got.br_blockcount - del->br_blockcount; 2967 temp = got.br_blockcount - del->br_blockcount;
3001 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 2968 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
3002 xfs_bmbt_set_blockcount(ep, temp); 2969 xfs_bmbt_set_blockcount(ep, temp);
3003 ifp->if_lastex = idx;
3004 if (delay) { 2970 if (delay) {
3005 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), 2971 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
3006 da_old); 2972 da_old);
3007 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); 2973 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
3008 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 2974 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
3009 da_new = temp; 2975 da_new = temp;
3010 break; 2976 break;
3011 } 2977 }
3012 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 2978 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
3013 if (!cur) { 2979 if (!cur) {
3014 flags |= xfs_ilog_fext(whichfork); 2980 flags |= xfs_ilog_fext(whichfork);
3015 break; 2981 break;
@@ -3026,7 +2992,7 @@ xfs_bmap_del_extent(
3026 * Deleting the middle of the extent. 2992 * Deleting the middle of the extent.
3027 */ 2993 */
3028 temp = del->br_startoff - got.br_startoff; 2994 temp = del->br_startoff - got.br_startoff;
3029 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 2995 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
3030 xfs_bmbt_set_blockcount(ep, temp); 2996 xfs_bmbt_set_blockcount(ep, temp);
3031 new.br_startoff = del_endoff; 2997 new.br_startoff = del_endoff;
3032 temp2 = got_endoff - del_endoff; 2998 temp2 = got_endoff - del_endoff;
@@ -3113,9 +3079,9 @@ xfs_bmap_del_extent(
3113 } 3079 }
3114 } 3080 }
3115 } 3081 }
3116 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 3082 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
3117 xfs_iext_insert(ip, idx + 1, 1, &new, state); 3083 xfs_iext_insert(ip, *idx + 1, 1, &new, state);
3118 ifp->if_lastex = idx + 1; 3084 ++*idx;
3119 break; 3085 break;
3120 } 3086 }
3121 /* 3087 /*
@@ -3142,7 +3108,7 @@ xfs_bmap_del_extent(
3142 ASSERT(da_old >= da_new); 3108 ASSERT(da_old >= da_new);
3143 if (da_old > da_new) { 3109 if (da_old > da_new) {
3144 xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, 3110 xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
3145 (int64_t)(da_old - da_new), rsvd); 3111 (int64_t)(da_old - da_new), 0);
3146 } 3112 }
3147done: 3113done:
3148 *logflagsp = flags; 3114 *logflagsp = flags;
@@ -4562,29 +4528,24 @@ xfs_bmapi(
4562 if (rt) { 4528 if (rt) {
4563 error = xfs_mod_incore_sb(mp, 4529 error = xfs_mod_incore_sb(mp,
4564 XFS_SBS_FREXTENTS, 4530 XFS_SBS_FREXTENTS,
4565 -((int64_t)extsz), (flags & 4531 -((int64_t)extsz), 0);
4566 XFS_BMAPI_RSVBLOCKS));
4567 } else { 4532 } else {
4568 error = xfs_icsb_modify_counters(mp, 4533 error = xfs_icsb_modify_counters(mp,
4569 XFS_SBS_FDBLOCKS, 4534 XFS_SBS_FDBLOCKS,
4570 -((int64_t)alen), (flags & 4535 -((int64_t)alen), 0);
4571 XFS_BMAPI_RSVBLOCKS));
4572 } 4536 }
4573 if (!error) { 4537 if (!error) {
4574 error = xfs_icsb_modify_counters(mp, 4538 error = xfs_icsb_modify_counters(mp,
4575 XFS_SBS_FDBLOCKS, 4539 XFS_SBS_FDBLOCKS,
4576 -((int64_t)indlen), (flags & 4540 -((int64_t)indlen), 0);
4577 XFS_BMAPI_RSVBLOCKS));
4578 if (error && rt) 4541 if (error && rt)
4579 xfs_mod_incore_sb(mp, 4542 xfs_mod_incore_sb(mp,
4580 XFS_SBS_FREXTENTS, 4543 XFS_SBS_FREXTENTS,
4581 (int64_t)extsz, (flags & 4544 (int64_t)extsz, 0);
4582 XFS_BMAPI_RSVBLOCKS));
4583 else if (error) 4545 else if (error)
4584 xfs_icsb_modify_counters(mp, 4546 xfs_icsb_modify_counters(mp,
4585 XFS_SBS_FDBLOCKS, 4547 XFS_SBS_FDBLOCKS,
4586 (int64_t)alen, (flags & 4548 (int64_t)alen, 0);
4587 XFS_BMAPI_RSVBLOCKS));
4588 } 4549 }
4589 4550
4590 if (error) { 4551 if (error) {
@@ -4701,13 +4662,12 @@ xfs_bmapi(
4701 if (!wasdelay && (flags & XFS_BMAPI_PREALLOC)) 4662 if (!wasdelay && (flags & XFS_BMAPI_PREALLOC))
4702 got.br_state = XFS_EXT_UNWRITTEN; 4663 got.br_state = XFS_EXT_UNWRITTEN;
4703 } 4664 }
4704 error = xfs_bmap_add_extent(ip, lastx, &cur, &got, 4665 error = xfs_bmap_add_extent(ip, &lastx, &cur, &got,
4705 firstblock, flist, &tmp_logflags, 4666 firstblock, flist, &tmp_logflags,
4706 whichfork, (flags & XFS_BMAPI_RSVBLOCKS)); 4667 whichfork);
4707 logflags |= tmp_logflags; 4668 logflags |= tmp_logflags;
4708 if (error) 4669 if (error)
4709 goto error0; 4670 goto error0;
4710 lastx = ifp->if_lastex;
4711 ep = xfs_iext_get_ext(ifp, lastx); 4671 ep = xfs_iext_get_ext(ifp, lastx);
4712 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 4672 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
4713 xfs_bmbt_get_all(ep, &got); 4673 xfs_bmbt_get_all(ep, &got);
@@ -4803,13 +4763,12 @@ xfs_bmapi(
4803 mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN) 4763 mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN)
4804 ? XFS_EXT_NORM 4764 ? XFS_EXT_NORM
4805 : XFS_EXT_UNWRITTEN; 4765 : XFS_EXT_UNWRITTEN;
4806 error = xfs_bmap_add_extent(ip, lastx, &cur, mval, 4766 error = xfs_bmap_add_extent(ip, &lastx, &cur, mval,
4807 firstblock, flist, &tmp_logflags, 4767 firstblock, flist, &tmp_logflags,
4808 whichfork, (flags & XFS_BMAPI_RSVBLOCKS)); 4768 whichfork);
4809 logflags |= tmp_logflags; 4769 logflags |= tmp_logflags;
4810 if (error) 4770 if (error)
4811 goto error0; 4771 goto error0;
4812 lastx = ifp->if_lastex;
4813 ep = xfs_iext_get_ext(ifp, lastx); 4772 ep = xfs_iext_get_ext(ifp, lastx);
4814 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 4773 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
4815 xfs_bmbt_get_all(ep, &got); 4774 xfs_bmbt_get_all(ep, &got);
@@ -4868,14 +4827,14 @@ xfs_bmapi(
4868 /* 4827 /*
4869 * Else go on to the next record. 4828 * Else go on to the next record.
4870 */ 4829 */
4871 ep = xfs_iext_get_ext(ifp, ++lastx);
4872 prev = got; 4830 prev = got;
4873 if (lastx >= nextents) 4831 if (++lastx < nextents) {
4874 eof = 1; 4832 ep = xfs_iext_get_ext(ifp, lastx);
4875 else
4876 xfs_bmbt_get_all(ep, &got); 4833 xfs_bmbt_get_all(ep, &got);
4834 } else {
4835 eof = 1;
4836 }
4877 } 4837 }
4878 ifp->if_lastex = lastx;
4879 *nmap = n; 4838 *nmap = n;
4880 /* 4839 /*
4881 * Transform from btree to extents, give it cur. 4840 * Transform from btree to extents, give it cur.
@@ -4984,7 +4943,6 @@ xfs_bmapi_single(
4984 ASSERT(!isnullstartblock(got.br_startblock)); 4943 ASSERT(!isnullstartblock(got.br_startblock));
4985 ASSERT(bno < got.br_startoff + got.br_blockcount); 4944 ASSERT(bno < got.br_startoff + got.br_blockcount);
4986 *fsb = got.br_startblock + (bno - got.br_startoff); 4945 *fsb = got.br_startblock + (bno - got.br_startoff);
4987 ifp->if_lastex = lastx;
4988 return 0; 4946 return 0;
4989} 4947}
4990 4948
@@ -5026,7 +4984,6 @@ xfs_bunmapi(
5026 int tmp_logflags; /* partial logging flags */ 4984 int tmp_logflags; /* partial logging flags */
5027 int wasdel; /* was a delayed alloc extent */ 4985 int wasdel; /* was a delayed alloc extent */
5028 int whichfork; /* data or attribute fork */ 4986 int whichfork; /* data or attribute fork */
5029 int rsvd; /* OK to allocate reserved blocks */
5030 xfs_fsblock_t sum; 4987 xfs_fsblock_t sum;
5031 4988
5032 trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_); 4989 trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_);
@@ -5044,7 +5001,7 @@ xfs_bunmapi(
5044 mp = ip->i_mount; 5001 mp = ip->i_mount;
5045 if (XFS_FORCED_SHUTDOWN(mp)) 5002 if (XFS_FORCED_SHUTDOWN(mp))
5046 return XFS_ERROR(EIO); 5003 return XFS_ERROR(EIO);
5047 rsvd = (flags & XFS_BMAPI_RSVBLOCKS) != 0; 5004
5048 ASSERT(len > 0); 5005 ASSERT(len > 0);
5049 ASSERT(nexts >= 0); 5006 ASSERT(nexts >= 0);
5050 ASSERT(ifp->if_ext_max == 5007 ASSERT(ifp->if_ext_max ==
@@ -5160,9 +5117,9 @@ xfs_bunmapi(
5160 del.br_blockcount = mod; 5117 del.br_blockcount = mod;
5161 } 5118 }
5162 del.br_state = XFS_EXT_UNWRITTEN; 5119 del.br_state = XFS_EXT_UNWRITTEN;
5163 error = xfs_bmap_add_extent(ip, lastx, &cur, &del, 5120 error = xfs_bmap_add_extent(ip, &lastx, &cur, &del,
5164 firstblock, flist, &logflags, 5121 firstblock, flist, &logflags,
5165 XFS_DATA_FORK, 0); 5122 XFS_DATA_FORK);
5166 if (error) 5123 if (error)
5167 goto error0; 5124 goto error0;
5168 goto nodelete; 5125 goto nodelete;
@@ -5188,9 +5145,12 @@ xfs_bunmapi(
5188 */ 5145 */
5189 ASSERT(bno >= del.br_blockcount); 5146 ASSERT(bno >= del.br_blockcount);
5190 bno -= del.br_blockcount; 5147 bno -= del.br_blockcount;
5191 if (bno < got.br_startoff) { 5148 if (got.br_startoff > bno) {
5192 if (--lastx >= 0) 5149 if (--lastx >= 0) {
5193 xfs_bmbt_get_all(--ep, &got); 5150 ep = xfs_iext_get_ext(ifp,
5151 lastx);
5152 xfs_bmbt_get_all(ep, &got);
5153 }
5194 } 5154 }
5195 continue; 5155 continue;
5196 } else if (del.br_state == XFS_EXT_UNWRITTEN) { 5156 } else if (del.br_state == XFS_EXT_UNWRITTEN) {
@@ -5214,18 +5174,19 @@ xfs_bunmapi(
5214 prev.br_startoff = start; 5174 prev.br_startoff = start;
5215 } 5175 }
5216 prev.br_state = XFS_EXT_UNWRITTEN; 5176 prev.br_state = XFS_EXT_UNWRITTEN;
5217 error = xfs_bmap_add_extent(ip, lastx - 1, &cur, 5177 lastx--;
5178 error = xfs_bmap_add_extent(ip, &lastx, &cur,
5218 &prev, firstblock, flist, &logflags, 5179 &prev, firstblock, flist, &logflags,
5219 XFS_DATA_FORK, 0); 5180 XFS_DATA_FORK);
5220 if (error) 5181 if (error)
5221 goto error0; 5182 goto error0;
5222 goto nodelete; 5183 goto nodelete;
5223 } else { 5184 } else {
5224 ASSERT(del.br_state == XFS_EXT_NORM); 5185 ASSERT(del.br_state == XFS_EXT_NORM);
5225 del.br_state = XFS_EXT_UNWRITTEN; 5186 del.br_state = XFS_EXT_UNWRITTEN;
5226 error = xfs_bmap_add_extent(ip, lastx, &cur, 5187 error = xfs_bmap_add_extent(ip, &lastx, &cur,
5227 &del, firstblock, flist, &logflags, 5188 &del, firstblock, flist, &logflags,
5228 XFS_DATA_FORK, 0); 5189 XFS_DATA_FORK);
5229 if (error) 5190 if (error)
5230 goto error0; 5191 goto error0;
5231 goto nodelete; 5192 goto nodelete;
@@ -5240,13 +5201,13 @@ xfs_bunmapi(
5240 rtexts = XFS_FSB_TO_B(mp, del.br_blockcount); 5201 rtexts = XFS_FSB_TO_B(mp, del.br_blockcount);
5241 do_div(rtexts, mp->m_sb.sb_rextsize); 5202 do_div(rtexts, mp->m_sb.sb_rextsize);
5242 xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, 5203 xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS,
5243 (int64_t)rtexts, rsvd); 5204 (int64_t)rtexts, 0);
5244 (void)xfs_trans_reserve_quota_nblks(NULL, 5205 (void)xfs_trans_reserve_quota_nblks(NULL,
5245 ip, -((long)del.br_blockcount), 0, 5206 ip, -((long)del.br_blockcount), 0,
5246 XFS_QMOPT_RES_RTBLKS); 5207 XFS_QMOPT_RES_RTBLKS);
5247 } else { 5208 } else {
5248 xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, 5209 xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
5249 (int64_t)del.br_blockcount, rsvd); 5210 (int64_t)del.br_blockcount, 0);
5250 (void)xfs_trans_reserve_quota_nblks(NULL, 5211 (void)xfs_trans_reserve_quota_nblks(NULL,
5251 ip, -((long)del.br_blockcount), 0, 5212 ip, -((long)del.br_blockcount), 0,
5252 XFS_QMOPT_RES_REGBLKS); 5213 XFS_QMOPT_RES_REGBLKS);
@@ -5277,31 +5238,29 @@ xfs_bunmapi(
5277 error = XFS_ERROR(ENOSPC); 5238 error = XFS_ERROR(ENOSPC);
5278 goto error0; 5239 goto error0;
5279 } 5240 }
5280 error = xfs_bmap_del_extent(ip, tp, lastx, flist, cur, &del, 5241 error = xfs_bmap_del_extent(ip, tp, &lastx, flist, cur, &del,
5281 &tmp_logflags, whichfork, rsvd); 5242 &tmp_logflags, whichfork);
5282 logflags |= tmp_logflags; 5243 logflags |= tmp_logflags;
5283 if (error) 5244 if (error)
5284 goto error0; 5245 goto error0;
5285 bno = del.br_startoff - 1; 5246 bno = del.br_startoff - 1;
5286nodelete: 5247nodelete:
5287 lastx = ifp->if_lastex;
5288 /* 5248 /*
5289 * If not done go on to the next (previous) record. 5249 * If not done go on to the next (previous) record.
5290 * Reset ep in case the extents array was re-alloced.
5291 */ 5250 */
5292 ep = xfs_iext_get_ext(ifp, lastx);
5293 if (bno != (xfs_fileoff_t)-1 && bno >= start) { 5251 if (bno != (xfs_fileoff_t)-1 && bno >= start) {
5294 if (lastx >= XFS_IFORK_NEXTENTS(ip, whichfork) || 5252 if (lastx >= 0) {
5295 xfs_bmbt_get_startoff(ep) > bno) { 5253 ep = xfs_iext_get_ext(ifp, lastx);
5296 if (--lastx >= 0) 5254 if (xfs_bmbt_get_startoff(ep) > bno) {
5297 ep = xfs_iext_get_ext(ifp, lastx); 5255 if (--lastx >= 0)
5298 } 5256 ep = xfs_iext_get_ext(ifp,
5299 if (lastx >= 0) 5257 lastx);
5258 }
5300 xfs_bmbt_get_all(ep, &got); 5259 xfs_bmbt_get_all(ep, &got);
5260 }
5301 extno++; 5261 extno++;
5302 } 5262 }
5303 } 5263 }
5304 ifp->if_lastex = lastx;
5305 *done = bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0; 5264 *done = bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0;
5306 ASSERT(ifp->if_ext_max == 5265 ASSERT(ifp->if_ext_max ==
5307 XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); 5266 XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
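
The xfs_bmap.c hunks above all serve one conversion: the extent index moves out of the fork's cached if_lastex field and into an xfs_extnum_t *idx parameter that each helper updates in place, so callers read the cursor back directly instead of reloading ifp->if_lastex afterwards. A condensed sketch of the resulting caller pattern, using only names visible in the xfs_bmapi() hunks above (the enclosing loop and declarations are elided):

	xfs_extnum_t	lastx;	/* extent cursor, owned by the caller */

	error = xfs_bmap_add_extent(ip, &lastx, &cur, &got,
			firstblock, flist, &tmp_logflags, whichfork);
	if (error)
		goto error0;
	/* lastx already reflects the callee's final position. */
	ep = xfs_iext_get_ext(ifp, lastx);
	xfs_bmbt_get_all(ep, &got);
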
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 3651191daea1..c62234bde053 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -69,7 +69,6 @@ typedef struct xfs_bmap_free
69#define XFS_BMAPI_ENTIRE 0x004 /* return entire extent, not trimmed */ 69#define XFS_BMAPI_ENTIRE 0x004 /* return entire extent, not trimmed */
70#define XFS_BMAPI_METADATA 0x008 /* mapping metadata not user data */ 70#define XFS_BMAPI_METADATA 0x008 /* mapping metadata not user data */
71#define XFS_BMAPI_ATTRFORK 0x010 /* use attribute fork not data */ 71#define XFS_BMAPI_ATTRFORK 0x010 /* use attribute fork not data */
72#define XFS_BMAPI_RSVBLOCKS 0x020 /* OK to alloc. reserved data blocks */
73#define XFS_BMAPI_PREALLOC 0x040 /* preallocation op: unwritten space */ 72#define XFS_BMAPI_PREALLOC 0x040 /* preallocation op: unwritten space */
74#define XFS_BMAPI_IGSTATE 0x080 /* Ignore state - */ 73#define XFS_BMAPI_IGSTATE 0x080 /* Ignore state - */
75 /* combine contig. space */ 74 /* combine contig. space */
@@ -87,7 +86,6 @@ typedef struct xfs_bmap_free
87 { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ 86 { XFS_BMAPI_ENTIRE, "ENTIRE" }, \
88 { XFS_BMAPI_METADATA, "METADATA" }, \ 87 { XFS_BMAPI_METADATA, "METADATA" }, \
89 { XFS_BMAPI_ATTRFORK, "ATTRFORK" }, \ 88 { XFS_BMAPI_ATTRFORK, "ATTRFORK" }, \
90 { XFS_BMAPI_RSVBLOCKS, "RSVBLOCKS" }, \
91 { XFS_BMAPI_PREALLOC, "PREALLOC" }, \ 89 { XFS_BMAPI_PREALLOC, "PREALLOC" }, \
92 { XFS_BMAPI_IGSTATE, "IGSTATE" }, \ 90 { XFS_BMAPI_IGSTATE, "IGSTATE" }, \
93 { XFS_BMAPI_CONTIG, "CONTIG" }, \ 91 { XFS_BMAPI_CONTIG, "CONTIG" }, \
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index c8e3349c287c..a098a20ca63e 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -920,7 +920,6 @@ xfs_iread_extents(
920 /* 920 /*
921 * We know that the size is valid (it's checked in iformat_btree) 921 * We know that the size is valid (it's checked in iformat_btree)
922 */ 922 */
923 ifp->if_lastex = NULLEXTNUM;
924 ifp->if_bytes = ifp->if_real_bytes = 0; 923 ifp->if_bytes = ifp->if_real_bytes = 0;
925 ifp->if_flags |= XFS_IFEXTENTS; 924 ifp->if_flags |= XFS_IFEXTENTS;
926 xfs_iext_add(ifp, 0, nextents); 925 xfs_iext_add(ifp, 0, nextents);
@@ -2558,12 +2557,9 @@ xfs_iflush_fork(
2558 case XFS_DINODE_FMT_EXTENTS: 2557 case XFS_DINODE_FMT_EXTENTS:
2559 ASSERT((ifp->if_flags & XFS_IFEXTENTS) || 2558 ASSERT((ifp->if_flags & XFS_IFEXTENTS) ||
2560 !(iip->ili_format.ilf_fields & extflag[whichfork])); 2559 !(iip->ili_format.ilf_fields & extflag[whichfork]));
2561 ASSERT((xfs_iext_get_ext(ifp, 0) != NULL) ||
2562 (ifp->if_bytes == 0));
2563 ASSERT((xfs_iext_get_ext(ifp, 0) == NULL) ||
2564 (ifp->if_bytes > 0));
2565 if ((iip->ili_format.ilf_fields & extflag[whichfork]) && 2560 if ((iip->ili_format.ilf_fields & extflag[whichfork]) &&
2566 (ifp->if_bytes > 0)) { 2561 (ifp->if_bytes > 0)) {
2562 ASSERT(xfs_iext_get_ext(ifp, 0));
2567 ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0); 2563 ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0);
2568 (void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp, 2564 (void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp,
2569 whichfork); 2565 whichfork);
@@ -3112,6 +3108,8 @@ xfs_iext_get_ext(
3112 xfs_extnum_t idx) /* index of target extent */ 3108 xfs_extnum_t idx) /* index of target extent */
3113{ 3109{
3114 ASSERT(idx >= 0); 3110 ASSERT(idx >= 0);
3111 ASSERT(idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
3112
3115 if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) { 3113 if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) {
3116 return ifp->if_u1.if_ext_irec->er_extbuf; 3114 return ifp->if_u1.if_ext_irec->er_extbuf;
3117 } else if (ifp->if_flags & XFS_IFEXTIREC) { 3115 } else if (ifp->if_flags & XFS_IFEXTIREC) {
@@ -3191,7 +3189,6 @@ xfs_iext_add(
3191 } 3189 }
3192 ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; 3190 ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
3193 ifp->if_real_bytes = 0; 3191 ifp->if_real_bytes = 0;
3194 ifp->if_lastex = nextents + ext_diff;
3195 } 3192 }
3196 /* 3193 /*
3197 * Otherwise use a linear (direct) extent list. 3194 * Otherwise use a linear (direct) extent list.
@@ -3886,8 +3883,10 @@ xfs_iext_idx_to_irec(
3886 xfs_extnum_t page_idx = *idxp; /* extent index in target list */ 3883 xfs_extnum_t page_idx = *idxp; /* extent index in target list */
3887 3884
3888 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 3885 ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3889 ASSERT(page_idx >= 0 && page_idx <= 3886 ASSERT(page_idx >= 0);
3890 ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)); 3887 ASSERT(page_idx <= ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
3888 ASSERT(page_idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t) || realloc);
3889
3891 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; 3890 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3892 erp_idx = 0; 3891 erp_idx = 0;
3893 low = 0; 3892 low = 0;
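
The assertions added to xfs_iext_get_ext() and xfs_iext_idx_to_irec() encode the bound that an extent index must lie below the record count derived from if_bytes. As a sketch, the new check amounts to idx >= 0 && idx < xfs_iext_count(ifp), where xfs_iext_count() is a hypothetical helper name (this patch open-codes the division):

	/* Hypothetical helper, not part of this patch: the number of
	 * in-core extent records backing a fork's extent list. */
	static inline xfs_extnum_t
	xfs_iext_count(xfs_ifork_t *ifp)
	{
		return ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
	}
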
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index ff4e2a30227d..3ae6d58e5473 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -67,7 +67,6 @@ typedef struct xfs_ifork {
67 short if_broot_bytes; /* bytes allocated for root */ 67 short if_broot_bytes; /* bytes allocated for root */
68 unsigned char if_flags; /* per-fork flags */ 68 unsigned char if_flags; /* per-fork flags */
69 unsigned char if_ext_max; /* max # of extent records */ 69 unsigned char if_ext_max; /* max # of extent records */
70 xfs_extnum_t if_lastex; /* last if_extents used */
71 union { 70 union {
72 xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */ 71 xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */
73 xfs_ext_irec_t *if_ext_irec; /* irec map file exts */ 72 xfs_ext_irec_t *if_ext_irec; /* irec map file exts */
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 7d56e88a3f0e..c7755d5a5fbe 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -29,6 +29,7 @@
29#include "xfs_mount.h" 29#include "xfs_mount.h"
30#include "xfs_error.h" 30#include "xfs_error.h"
31#include "xfs_alloc.h" 31#include "xfs_alloc.h"
32#include "xfs_discard.h"
32 33
33/* 34/*
34 * Perform initial CIL structure initialisation. If the CIL is not 35 * Perform initial CIL structure initialisation. If the CIL is not
@@ -361,18 +362,28 @@ xlog_cil_committed(
361 int abort) 362 int abort)
362{ 363{
363 struct xfs_cil_ctx *ctx = args; 364 struct xfs_cil_ctx *ctx = args;
365 struct xfs_mount *mp = ctx->cil->xc_log->l_mp;
364 366
365 xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain, 367 xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain,
366 ctx->start_lsn, abort); 368 ctx->start_lsn, abort);
367 369
368 xfs_alloc_busy_sort(&ctx->busy_extents); 370 xfs_alloc_busy_sort(&ctx->busy_extents);
369 xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, &ctx->busy_extents); 371 xfs_alloc_busy_clear(mp, &ctx->busy_extents,
372 (mp->m_flags & XFS_MOUNT_DISCARD) && !abort);
370 373
371 spin_lock(&ctx->cil->xc_cil_lock); 374 spin_lock(&ctx->cil->xc_cil_lock);
372 list_del(&ctx->committing); 375 list_del(&ctx->committing);
373 spin_unlock(&ctx->cil->xc_cil_lock); 376 spin_unlock(&ctx->cil->xc_cil_lock);
374 377
375 xlog_cil_free_logvec(ctx->lv_chain); 378 xlog_cil_free_logvec(ctx->lv_chain);
379
380 if (!list_empty(&ctx->busy_extents)) {
381 ASSERT(mp->m_flags & XFS_MOUNT_DISCARD);
382
383 xfs_discard_extents(mp, &ctx->busy_extents);
384 xfs_alloc_busy_clear(mp, &ctx->busy_extents, false);
385 }
386
376 kmem_free(ctx); 387 kmem_free(ctx);
377} 388}
378 389
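
The reworked commit callback tears busy extents down in two passes: on a clean commit with XFS_MOUNT_DISCARD set, xfs_alloc_busy_clear() is asked to keep discard candidates on the list, and whatever remains is then trimmed and cleared for good. A condensed sketch of that control flow, assuming (per this diff) that the new third argument means "retain extents that still need a discard":

	bool do_discard = (mp->m_flags & XFS_MOUNT_DISCARD) && !abort;

	xfs_alloc_busy_clear(mp, &ctx->busy_extents, do_discard);
	if (!list_empty(&ctx->busy_extents)) {
		/* Entries left behind are the discard candidates. */
		xfs_discard_extents(mp, &ctx->busy_extents);
		xfs_alloc_busy_clear(mp, &ctx->busy_extents, false);
	}
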
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 19af0ab0d0c6..3d68bb267c5f 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -224,6 +224,7 @@ typedef struct xfs_mount {
224#define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem 224#define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem
225 operations, typically for 225 operations, typically for
226 disk errors in metadata */ 226 disk errors in metadata */
227#define XFS_MOUNT_DISCARD (1ULL << 5) /* discard unused blocks */
227#define XFS_MOUNT_RETERR (1ULL << 6) /* return alignment errors to 228#define XFS_MOUNT_RETERR (1ULL << 6) /* return alignment errors to
228 user */ 229 user */
229#define XFS_MOUNT_NOALIGN (1ULL << 7) /* turn off stripe alignment 230#define XFS_MOUNT_NOALIGN (1ULL << 7) /* turn off stripe alignment
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index d1f24858ccc4..7c7bc2b786bd 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -609,7 +609,7 @@ xfs_trans_free(
609 struct xfs_trans *tp) 609 struct xfs_trans *tp)
610{ 610{
611 xfs_alloc_busy_sort(&tp->t_busy); 611 xfs_alloc_busy_sort(&tp->t_busy);
612 xfs_alloc_busy_clear(tp->t_mountp, &tp->t_busy); 612 xfs_alloc_busy_clear(tp->t_mountp, &tp->t_busy, false);
613 613
614 atomic_dec(&tp->t_mountp->m_active_trans); 614 atomic_dec(&tp->t_mountp->m_active_trans);
615 xfs_trans_free_dqinfo(tp); 615 xfs_trans_free_dqinfo(tp);
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index f5df23561b96..503c8a6b3079 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -217,8 +217,24 @@ int cont_write_begin(struct file *, struct address_space *, loff_t,
217 get_block_t *, loff_t *); 217 get_block_t *, loff_t *);
218int generic_cont_expand_simple(struct inode *inode, loff_t size); 218int generic_cont_expand_simple(struct inode *inode, loff_t size);
219int block_commit_write(struct page *page, unsigned from, unsigned to); 219int block_commit_write(struct page *page, unsigned from, unsigned to);
220int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
221 get_block_t get_block);
220int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, 222int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
221 get_block_t get_block); 223 get_block_t get_block);
224/* Convert errno to return value from ->page_mkwrite() call */
225static inline int block_page_mkwrite_return(int err)
226{
227 if (err == 0)
228 return VM_FAULT_LOCKED;
229 if (err == -EFAULT)
230 return VM_FAULT_NOPAGE;
231 if (err == -ENOMEM)
232 return VM_FAULT_OOM;
233 if (err == -EAGAIN)
234 return VM_FAULT_RETRY;
235 /* -ENOSPC, -EDQUOT, -EIO ... */
236 return VM_FAULT_SIGBUS;
237}
222sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *); 238sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *);
223int block_truncate_page(struct address_space *, loff_t, get_block_t *); 239int block_truncate_page(struct address_space *, loff_t, get_block_t *);
224int nobh_write_begin(struct address_space *, loff_t, unsigned, unsigned, 240int nobh_write_begin(struct address_space *, loff_t, unsigned, unsigned,
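
__block_page_mkwrite() reports a plain errno and leaves the page locked on success, while block_page_mkwrite_return() maps that errno onto the VM_FAULT_* codes a fault handler must return. A minimal sketch of a filesystem ->page_mkwrite built from the pair; myfs_page_mkwrite and myfs_get_block are hypothetical names, and a real handler might take locks or start a transaction between the two calls:

	static int
	myfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
	{
		int err;

		/* Returns 0 with the page locked and dirty, else -errno. */
		err = __block_page_mkwrite(vma, vmf, myfs_get_block);
		return block_page_mkwrite_return(err);
	}
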
diff --git a/include/linux/cleancache.h b/include/linux/cleancache.h
new file mode 100644
index 000000000000..04ffb2e6c9d0
--- /dev/null
+++ b/include/linux/cleancache.h
@@ -0,0 +1,122 @@
1#ifndef _LINUX_CLEANCACHE_H
2#define _LINUX_CLEANCACHE_H
3
4#include <linux/fs.h>
5#include <linux/exportfs.h>
6#include <linux/mm.h>
7
8#define CLEANCACHE_KEY_MAX 6
9
10/*
11 * cleancache requires every file with a page in cleancache to have a
12 * unique key unless/until the file is removed/truncated. For some
13 * filesystems, the inode number is unique, but for "modern" filesystems
14 * an exportable filehandle is required (see exportfs.h)
15 */
16struct cleancache_filekey {
17 union {
18 ino_t ino;
19 __u32 fh[CLEANCACHE_KEY_MAX];
20 u32 key[CLEANCACHE_KEY_MAX];
21 } u;
22};
23
24struct cleancache_ops {
25 int (*init_fs)(size_t);
26 int (*init_shared_fs)(char *uuid, size_t);
27 int (*get_page)(int, struct cleancache_filekey,
28 pgoff_t, struct page *);
29 void (*put_page)(int, struct cleancache_filekey,
30 pgoff_t, struct page *);
31 void (*flush_page)(int, struct cleancache_filekey, pgoff_t);
32 void (*flush_inode)(int, struct cleancache_filekey);
33 void (*flush_fs)(int);
34};
35
36extern struct cleancache_ops
37 cleancache_register_ops(struct cleancache_ops *ops);
38extern void __cleancache_init_fs(struct super_block *);
39extern void __cleancache_init_shared_fs(char *, struct super_block *);
40extern int __cleancache_get_page(struct page *);
41extern void __cleancache_put_page(struct page *);
42extern void __cleancache_flush_page(struct address_space *, struct page *);
43extern void __cleancache_flush_inode(struct address_space *);
44extern void __cleancache_flush_fs(struct super_block *);
45extern int cleancache_enabled;
46
47#ifdef CONFIG_CLEANCACHE
48static inline bool cleancache_fs_enabled(struct page *page)
49{
50 return page->mapping->host->i_sb->cleancache_poolid >= 0;
51}
52static inline bool cleancache_fs_enabled_mapping(struct address_space *mapping)
53{
54 return mapping->host->i_sb->cleancache_poolid >= 0;
55}
56#else
57#define cleancache_enabled (0)
58#define cleancache_fs_enabled(_page) (0)
59#define cleancache_fs_enabled_mapping(_page) (0)
60#endif
61
62/*
63 * The shim layer provided by these inline functions allows the compiler
64 * to reduce all cleancache hooks to nothingness if CONFIG_CLEANCACHE
65 * is disabled, to a single global variable check if CONFIG_CLEANCACHE
66 * is enabled but no cleancache "backend" has dynamically enabled it,
67 * and, for the most frequent cleancache ops, to a single global variable
68 * check plus a superblock element comparison if CONFIG_CLEANCACHE is enabled
69 * and a cleancache backend has dynamically enabled cleancache, but the
70 * filesystem referenced by that cleancache op has not enabled cleancache.
71 * As a result, CONFIG_CLEANCACHE can be enabled by default with essentially
72 * no measurable performance impact.
73 */
74
75static inline void cleancache_init_fs(struct super_block *sb)
76{
77 if (cleancache_enabled)
78 __cleancache_init_fs(sb);
79}
80
81static inline void cleancache_init_shared_fs(char *uuid, struct super_block *sb)
82{
83 if (cleancache_enabled)
84 __cleancache_init_shared_fs(uuid, sb);
85}
86
87static inline int cleancache_get_page(struct page *page)
88{
89 int ret = -1;
90
91 if (cleancache_enabled && cleancache_fs_enabled(page))
92 ret = __cleancache_get_page(page);
93 return ret;
94}
95
96static inline void cleancache_put_page(struct page *page)
97{
98 if (cleancache_enabled && cleancache_fs_enabled(page))
99 __cleancache_put_page(page);
100}
101
102static inline void cleancache_flush_page(struct address_space *mapping,
103 struct page *page)
104{
105 /* careful... page->mapping is NULL sometimes when this is called */
106 if (cleancache_enabled && cleancache_fs_enabled_mapping(mapping))
107 __cleancache_flush_page(mapping, page);
108}
109
110static inline void cleancache_flush_inode(struct address_space *mapping)
111{
112 if (cleancache_enabled && cleancache_fs_enabled_mapping(mapping))
113 __cleancache_flush_inode(mapping);
114}
115
116static inline void cleancache_flush_fs(struct super_block *sb)
117{
118 if (cleancache_enabled)
119 __cleancache_flush_fs(sb);
120}
121
122#endif /* _LINUX_CLEANCACHE_H */
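For reference, a hedged sketch of the backend side of this interface: a do-nothing driver that fills in struct cleancache_ops and registers it. Everything prefixed noop_ is an illustrative assumption; the Xen tmem driver added elsewhere in this series is the real in-tree backend.

static int noop_init_fs(size_t pagesize)
{
	return 0;		/* hand every filesystem pool id 0 */
}

static int noop_init_shared_fs(char *uuid, size_t pagesize)
{
	return 0;
}

static int noop_get_page(int pool, struct cleancache_filekey key,
			 pgoff_t index, struct page *page)
{
	return -1;		/* never holds data, so every get misses */
}

static void noop_put_page(int pool, struct cleancache_filekey key,
			  pgoff_t index, struct page *page)
{
}

static void noop_flush_page(int pool, struct cleancache_filekey key,
			    pgoff_t index)
{
}

static void noop_flush_inode(int pool, struct cleancache_filekey key)
{
}

static void noop_flush_fs(int pool)
{
}

static struct cleancache_ops noop_ops = {
	.init_fs	= noop_init_fs,
	.init_shared_fs	= noop_init_shared_fs,
	.get_page	= noop_get_page,
	.put_page	= noop_put_page,
	.flush_page	= noop_flush_page,
	.flush_inode	= noop_flush_inode,
	.flush_fs	= noop_flush_fs,
};

static int __init noop_backend_init(void)
{
	/* returns the previously registered ops, allowing chaining */
	(void)cleancache_register_ops(&noop_ops);
	return 0;
}
module_init(noop_backend_init);

Once a backend registers, cleancache_enabled flips to 1 and the inline shims above start forwarding into the ops table.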
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 3f9d3251790d..241609346dfb 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1428,6 +1428,11 @@ struct super_block {
1428 */ 1428 */
1429 char __rcu *s_options; 1429 char __rcu *s_options;
1430 const struct dentry_operations *s_d_op; /* default d_op for dentries */ 1430 const struct dentry_operations *s_d_op; /* default d_op for dentries */
1431
1432 /*
1433 * Saved pool identifier for cleancache (-1 means none)
1434 */
1435 int cleancache_poolid;
1431}; 1436};
1432 1437
1433extern struct timespec current_fs_time(struct super_block *sb); 1438extern struct timespec current_fs_time(struct super_block *sb);
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 943c76b3d4bb..59225ef27d15 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -1,6 +1,7 @@
1#ifndef _LINUX_HUGETLB_H 1#ifndef _LINUX_HUGETLB_H
2#define _LINUX_HUGETLB_H 2#define _LINUX_HUGETLB_H
3 3
4#include <linux/mm_types.h>
4#include <linux/fs.h> 5#include <linux/fs.h>
5#include <linux/hugetlb_inline.h> 6#include <linux/hugetlb_inline.h>
6 7
@@ -41,7 +42,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
41 unsigned long address, unsigned int flags); 42 unsigned long address, unsigned int flags);
42int hugetlb_reserve_pages(struct inode *inode, long from, long to, 43int hugetlb_reserve_pages(struct inode *inode, long from, long to,
43 struct vm_area_struct *vma, 44 struct vm_area_struct *vma,
44 int acctflags); 45 vm_flags_t vm_flags);
45void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed); 46void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed);
46int dequeue_hwpoisoned_huge_page(struct page *page); 47int dequeue_hwpoisoned_huge_page(struct page *page);
47void copy_huge_page(struct page *dst, struct page *src); 48void copy_huge_page(struct page *dst, struct page *src);
@@ -168,7 +169,7 @@ static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb)
168 169
169extern const struct file_operations hugetlbfs_file_operations; 170extern const struct file_operations hugetlbfs_file_operations;
170extern const struct vm_operations_struct hugetlb_vm_ops; 171extern const struct vm_operations_struct hugetlb_vm_ops;
171struct file *hugetlb_file_setup(const char *name, size_t size, int acct, 172struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct,
172 struct user_struct **user, int creat_flags); 173 struct user_struct **user, int creat_flags);
173int hugetlb_get_quota(struct address_space *mapping, long delta); 174int hugetlb_get_quota(struct address_space *mapping, long delta);
174void hugetlb_put_quota(struct address_space *mapping, long delta); 175void hugetlb_put_quota(struct address_space *mapping, long delta);
@@ -192,7 +193,7 @@ static inline void set_file_hugepages(struct file *file)
192#define is_file_hugepages(file) 0 193#define is_file_hugepages(file) 0
193#define set_file_hugepages(file) BUG() 194#define set_file_hugepages(file) BUG()
194static inline struct file *hugetlb_file_setup(const char *name, size_t size, 195static inline struct file *hugetlb_file_setup(const char *name, size_t size,
195 int acctflag, struct user_struct **user, int creat_flags) 196 vm_flags_t acctflag, struct user_struct **user, int creat_flags)
196{ 197{
197 return ERR_PTR(-ENOSYS); 198 return ERR_PTR(-ENOSYS);
198} 199}
diff --git a/include/linux/hugetlb_inline.h b/include/linux/hugetlb_inline.h
index 6931489a5c14..2bb681fbeb35 100644
--- a/include/linux/hugetlb_inline.h
+++ b/include/linux/hugetlb_inline.h
@@ -7,7 +7,7 @@
7 7
8static inline int is_vm_hugetlb_page(struct vm_area_struct *vma) 8static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
9{ 9{
10 return vma->vm_flags & VM_HUGETLB; 10 return !!(vma->vm_flags & VM_HUGETLB);
11} 11}
12 12
13#else 13#else
diff --git a/include/linux/if_link.h b/include/linux/if_link.h
index f4a2e6b1b864..0ee969a5593d 100644
--- a/include/linux/if_link.h
+++ b/include/linux/if_link.h
@@ -136,6 +136,7 @@ enum {
136 IFLA_PORT_SELF, 136 IFLA_PORT_SELF,
137 IFLA_AF_SPEC, 137 IFLA_AF_SPEC,
138 IFLA_GROUP, /* Group the device belongs to */ 138 IFLA_GROUP, /* Group the device belongs to */
139 IFLA_NET_NS_FD,
139 __IFLA_MAX 140 __IFLA_MAX
140}; 141};
141 142
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index a32dcaec04e1..4ecb7b16b278 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -529,9 +529,10 @@ struct transaction_s
529 enum { 529 enum {
530 T_RUNNING, 530 T_RUNNING,
531 T_LOCKED, 531 T_LOCKED,
532 T_RUNDOWN,
533 T_FLUSH, 532 T_FLUSH,
534 T_COMMIT, 533 T_COMMIT,
534 T_COMMIT_DFLUSH,
535 T_COMMIT_JFLUSH,
535 T_FINISHED 536 T_FINISHED
536 } t_state; 537 } t_state;
537 538
@@ -658,7 +659,9 @@ struct transaction_s
658 * waiting for it to finish. 659 * waiting for it to finish.
659 */ 660 */
660 unsigned int t_synchronous_commit:1; 661 unsigned int t_synchronous_commit:1;
661 unsigned int t_flushed_data_blocks:1; 662
663 /* Disk flush needs to be sent to fs partition [no locking] */
664 int t_need_data_flush;
662 665
663 /* 666 /*
664 * For use by the filesystem to store fs-specific data 667 * For use by the filesystem to store fs-specific data
@@ -1228,6 +1231,7 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *tid);
1228int jbd2_journal_force_commit_nested(journal_t *journal); 1231int jbd2_journal_force_commit_nested(journal_t *journal);
1229int jbd2_log_wait_commit(journal_t *journal, tid_t tid); 1232int jbd2_log_wait_commit(journal_t *journal, tid_t tid);
1230int jbd2_log_do_checkpoint(journal_t *journal); 1233int jbd2_log_do_checkpoint(journal_t *journal);
1234int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid);
1231 1235
1232void __jbd2_log_wait_for_space(journal_t *journal); 1236void __jbd2_log_wait_for_space(journal_t *journal);
1233extern void __jbd2_journal_drop_transaction(journal_t *, transaction_t *); 1237extern void __jbd2_journal_drop_transaction(journal_t *, transaction_t *);
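The new jbd2_trans_will_send_data_barrier() lets a filesystem's fsync path skip its own explicit cache flush when the journal commit it is about to wait on will already flush the data device. A sketch of the intended call pattern for a journalled filesystem's fsync; commit_tid, inode, and the surrounding error handling are assumed context, not part of this header change:

	bool needs_barrier = false;

	/*
	 * If the committing transaction will not flush our data device,
	 * we must issue the flush ourselves after the commit completes.
	 */
	if (journal->j_flags & JBD2_BARRIER &&
	    !jbd2_trans_will_send_data_barrier(journal, commit_tid))
		needs_barrier = true;
	jbd2_log_start_commit(journal, commit_tid);
	ret = jbd2_log_wait_commit(journal, commit_tid);
	if (needs_barrier)
		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);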
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 8eb969ebf904..fb8e814f78dc 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -165,12 +165,12 @@ extern pgprot_t protection_map[16];
165 */ 165 */
166static inline int is_linear_pfn_mapping(struct vm_area_struct *vma) 166static inline int is_linear_pfn_mapping(struct vm_area_struct *vma)
167{ 167{
168 return (vma->vm_flags & VM_PFN_AT_MMAP); 168 return !!(vma->vm_flags & VM_PFN_AT_MMAP);
169} 169}
170 170
171static inline int is_pfn_mapping(struct vm_area_struct *vma) 171static inline int is_pfn_mapping(struct vm_area_struct *vma)
172{ 172{
173 return (vma->vm_flags & VM_PFNMAP); 173 return !!(vma->vm_flags & VM_PFNMAP);
174} 174}
175 175
176/* 176/*
@@ -1432,7 +1432,7 @@ extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1432 unsigned long flag, unsigned long pgoff); 1432 unsigned long flag, unsigned long pgoff);
1433extern unsigned long mmap_region(struct file *file, unsigned long addr, 1433extern unsigned long mmap_region(struct file *file, unsigned long addr,
1434 unsigned long len, unsigned long flags, 1434 unsigned long len, unsigned long flags,
1435 unsigned int vm_flags, unsigned long pgoff); 1435 vm_flags_t vm_flags, unsigned long pgoff);
1436 1436
1437static inline unsigned long do_mmap(struct file *file, unsigned long addr, 1437static inline unsigned long do_mmap(struct file *file, unsigned long addr,
1438 unsigned long len, unsigned long prot, 1438 unsigned long len, unsigned long prot,
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 071d459e866b..6fe96c19f85e 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -102,6 +102,8 @@ struct page {
102#endif 102#endif
103}; 103};
104 104
105typedef unsigned long __nocast vm_flags_t;
106
105/* 107/*
106 * A region containing a mapping of a non-memory backed file under NOMMU 108 * A region containing a mapping of a non-memory backed file under NOMMU
107 * conditions. These are held in a global tree and are pinned by the VMAs that 109 * conditions. These are held in a global tree and are pinned by the VMAs that
@@ -109,7 +111,7 @@ struct page {
109 */ 111 */
110struct vm_region { 112struct vm_region {
111 struct rb_node vm_rb; /* link in global region tree */ 113 struct rb_node vm_rb; /* link in global region tree */
112 unsigned long vm_flags; /* VMA vm_flags */ 114 vm_flags_t vm_flags; /* VMA vm_flags */
113 unsigned long vm_start; /* start address of region */ 115 unsigned long vm_start; /* start address of region */
114 unsigned long vm_end; /* region initialised to here */ 116 unsigned long vm_end; /* region initialised to here */
115 unsigned long vm_top; /* region allocated to here */ 117 unsigned long vm_top; /* region allocated to here */
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index 3686cd6c9aca..648c9c58add7 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -179,6 +179,8 @@ extern void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file);
179extern struct file *get_mm_exe_file(struct mm_struct *mm); 179extern struct file *get_mm_exe_file(struct mm_struct *mm);
180extern void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm); 180extern void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm);
181 181
182extern struct file *proc_ns_fget(int fd);
183
182#else 184#else
183 185
184#define proc_net_fops_create(net, name, mode, fops) ({ (void)(mode), NULL; }) 186#define proc_net_fops_create(net, name, mode, fops) ({ (void)(mode), NULL; })
@@ -241,6 +243,11 @@ static inline void dup_mm_exe_file(struct mm_struct *oldmm,
241 struct mm_struct *newmm) 243 struct mm_struct *newmm)
242{} 244{}
243 245
246static inline struct file *proc_ns_fget(int fd)
247{
248 return ERR_PTR(-EINVAL);
249}
250
244#endif /* CONFIG_PROC_FS */ 251#endif /* CONFIG_PROC_FS */
245 252
246#if !defined(CONFIG_PROC_KCORE) 253#if !defined(CONFIG_PROC_KCORE)
@@ -252,6 +259,18 @@ kclist_add(struct kcore_list *new, void *addr, size_t size, int type)
252extern void kclist_add(struct kcore_list *, void *, size_t, int type); 259extern void kclist_add(struct kcore_list *, void *, size_t, int type);
253#endif 260#endif
254 261
262struct nsproxy;
263struct proc_ns_operations {
264 const char *name;
265 int type;
266 void *(*get)(struct task_struct *task);
267 void (*put)(void *ns);
268 int (*install)(struct nsproxy *nsproxy, void *ns);
269};
270extern const struct proc_ns_operations netns_operations;
271extern const struct proc_ns_operations utsns_operations;
272extern const struct proc_ns_operations ipcns_operations;
273
255union proc_op { 274union proc_op {
256 int (*proc_get_link)(struct inode *, struct path *); 275 int (*proc_get_link)(struct inode *, struct path *);
257 int (*proc_read)(struct task_struct *task, char *page); 276 int (*proc_read)(struct task_struct *task, char *page);
@@ -270,6 +289,8 @@ struct proc_inode {
270 struct proc_dir_entry *pde; 289 struct proc_dir_entry *pde;
271 struct ctl_table_header *sysctl; 290 struct ctl_table_header *sysctl;
272 struct ctl_table *sysctl_entry; 291 struct ctl_table *sysctl_entry;
292 void *ns;
293 const struct proc_ns_operations *ns_ops;
273 struct inode vfs_inode; 294 struct inode vfs_inode;
274}; 295};
275 296
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index ab71447d0c5a..8c03b98df5f9 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -846,4 +846,5 @@ asmlinkage long sys_name_to_handle_at(int dfd, const char __user *name,
846asmlinkage long sys_open_by_handle_at(int mountdirfd, 846asmlinkage long sys_open_by_handle_at(int mountdirfd,
847 struct file_handle __user *handle, 847 struct file_handle __user *handle,
848 int flags); 848 int flags);
849asmlinkage long sys_setns(int fd, int nstype);
849#endif 850#endif
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index 3ae491932bc8..dcc8f5749d3f 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -119,6 +119,7 @@ static inline struct net *copy_net_ns(unsigned long flags, struct net *net_ns)
119extern struct list_head net_namespace_list; 119extern struct list_head net_namespace_list;
120 120
121extern struct net *get_net_ns_by_pid(pid_t pid); 121extern struct net *get_net_ns_by_pid(pid_t pid);
122extern struct net *get_net_ns_by_fd(int fd);
122 123
123#ifdef CONFIG_NET_NS 124#ifdef CONFIG_NET_NS
124extern void __put_net(struct net *net); 125extern void __put_net(struct net *net);
diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h
index b33257bc7e83..70213b4515eb 100644
--- a/include/xen/interface/xen.h
+++ b/include/xen/interface/xen.h
@@ -58,6 +58,7 @@
58#define __HYPERVISOR_event_channel_op 32 58#define __HYPERVISOR_event_channel_op 32
59#define __HYPERVISOR_physdev_op 33 59#define __HYPERVISOR_physdev_op 33
60#define __HYPERVISOR_hvm_op 34 60#define __HYPERVISOR_hvm_op 34
61#define __HYPERVISOR_tmem_op 38
61 62
62/* Architecture-specific hypercall definitions. */ 63/* Architecture-specific hypercall definitions. */
63#define __HYPERVISOR_arch_0 48 64#define __HYPERVISOR_arch_0 48
@@ -461,6 +462,27 @@ typedef uint8_t xen_domain_handle_t[16];
461#define __mk_unsigned_long(x) x ## UL 462#define __mk_unsigned_long(x) x ## UL
462#define mk_unsigned_long(x) __mk_unsigned_long(x) 463#define mk_unsigned_long(x) __mk_unsigned_long(x)
463 464
465#define TMEM_SPEC_VERSION 1
466
467struct tmem_op {
468 uint32_t cmd;
469 int32_t pool_id;
470 union {
471 struct { /* for cmd == TMEM_NEW_POOL */
472 uint64_t uuid[2];
473 uint32_t flags;
474 } new;
475 struct {
476 uint64_t oid[3];
477 uint32_t index;
478 uint32_t tmem_offset;
479 uint32_t pfn_offset;
480 uint32_t len;
481 GUEST_HANDLE(void) gmfn; /* guest machine page frame */
482 } gen;
483 } u;
484};
485
464#else /* __ASSEMBLY__ */ 486#else /* __ASSEMBLY__ */
465 487
466/* In assembly code we cannot use C numeric constant suffixes. */ 488/* In assembly code we cannot use C numeric constant suffixes. */
diff --git a/ipc/namespace.c b/ipc/namespace.c
index 8054c8e5faf1..ce0a647869b1 100644
--- a/ipc/namespace.c
+++ b/ipc/namespace.c
@@ -12,6 +12,7 @@
12#include <linux/fs.h> 12#include <linux/fs.h>
13#include <linux/mount.h> 13#include <linux/mount.h>
14#include <linux/user_namespace.h> 14#include <linux/user_namespace.h>
15#include <linux/proc_fs.h>
15 16
16#include "util.h" 17#include "util.h"
17 18
@@ -140,3 +141,39 @@ void put_ipc_ns(struct ipc_namespace *ns)
140 free_ipc_ns(ns); 141 free_ipc_ns(ns);
141 } 142 }
142} 143}
144
145static void *ipcns_get(struct task_struct *task)
146{
147 struct ipc_namespace *ns = NULL;
148 struct nsproxy *nsproxy;
149
150 rcu_read_lock();
151 nsproxy = task_nsproxy(task);
152 if (nsproxy)
153 ns = get_ipc_ns(nsproxy->ipc_ns);
154 rcu_read_unlock();
155
156 return ns;
157}
158
159static void ipcns_put(void *ns)
160{
161 put_ipc_ns(ns);
162}
163
164static int ipcns_install(struct nsproxy *nsproxy, void *ns)
165{
166 /* Ditch state from the old ipc namespace */
167 exit_sem(current);
168 put_ipc_ns(nsproxy->ipc_ns);
169 nsproxy->ipc_ns = get_ipc_ns(ns);
170 return 0;
171}
172
173const struct proc_ns_operations ipcns_operations = {
174 .name = "ipc",
175 .type = CLONE_NEWIPC,
176 .get = ipcns_get,
177 .put = ipcns_put,
178 .install = ipcns_install,
179};
diff --git a/ipc/shm.c b/ipc/shm.c
index 729acb7e3148..ab3385a21b27 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -347,7 +347,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
347 struct file * file; 347 struct file * file;
348 char name[13]; 348 char name[13];
349 int id; 349 int id;
350 int acctflag = 0; 350 vm_flags_t acctflag = 0;
351 351
352 if (size < SHMMIN || size > ns->shm_ctlmax) 352 if (size < SHMMIN || size > ns->shm_ctlmax)
353 return -EINVAL; 353 return -EINVAL;
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index a05d191ffdd9..5424e37673ed 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -22,6 +22,9 @@
22#include <linux/pid_namespace.h> 22#include <linux/pid_namespace.h>
23#include <net/net_namespace.h> 23#include <net/net_namespace.h>
24#include <linux/ipc_namespace.h> 24#include <linux/ipc_namespace.h>
25#include <linux/proc_fs.h>
26#include <linux/file.h>
27#include <linux/syscalls.h>
25 28
26static struct kmem_cache *nsproxy_cachep; 29static struct kmem_cache *nsproxy_cachep;
27 30
@@ -233,6 +236,45 @@ void exit_task_namespaces(struct task_struct *p)
233 switch_task_namespaces(p, NULL); 236 switch_task_namespaces(p, NULL);
234} 237}
235 238
239SYSCALL_DEFINE2(setns, int, fd, int, nstype)
240{
241 const struct proc_ns_operations *ops;
242 struct task_struct *tsk = current;
243 struct nsproxy *new_nsproxy;
244 struct proc_inode *ei;
245 struct file *file;
246 int err;
247
248 if (!capable(CAP_SYS_ADMIN))
249 return -EPERM;
250
251 file = proc_ns_fget(fd);
252 if (IS_ERR(file))
253 return PTR_ERR(file);
254
255 err = -EINVAL;
256 ei = PROC_I(file->f_dentry->d_inode);
257 ops = ei->ns_ops;
258 if (nstype && (ops->type != nstype))
259 goto out;
260
261 new_nsproxy = create_new_namespaces(0, tsk, tsk->fs);
262 if (IS_ERR(new_nsproxy)) {
263 err = PTR_ERR(new_nsproxy);
264 goto out;
265 }
266
267 err = ops->install(new_nsproxy, ei->ns);
268 if (err) {
269 free_nsproxy(new_nsproxy);
270 goto out;
271 }
272 switch_task_namespaces(tsk, new_nsproxy);
273out:
274 fput(file);
275 return err;
276}
277
236static int __init nsproxy_cache_init(void) 278static int __init nsproxy_cache_init(void)
237{ 279{
238 nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC); 280 nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC);
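At this point in the series the syscall has no glibc wrapper, so userspace invokes it via syscall(2). A hedged sketch of joining another task's network namespace through one of the new /proc/<pid>/ns files (the availability of __NR_setns in the installed headers is an assumption):

#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>		/* CLONE_NEWNET */
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

int main(int argc, char *argv[])
{
	int fd;

	if (argc < 3) {
		fprintf(stderr, "usage: %s /proc/PID/ns/net cmd...\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	/*
	 * A second argument of 0 accepts any namespace type;
	 * CLONE_NEWNET insists the fd really is a net namespace.
	 */
	if (syscall(__NR_setns, fd, CLONE_NEWNET) < 0) {
		perror("setns");
		return 1;
	}
	close(fd);
	execvp(argv[2], &argv[2]);	/* run a command in the joined ns */
	perror("execvp");
	return 1;
}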
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 44646179eaba..bff131b9510a 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -15,6 +15,7 @@
15#include <linux/err.h> 15#include <linux/err.h>
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/user_namespace.h> 17#include <linux/user_namespace.h>
18#include <linux/proc_fs.h>
18 19
19static struct uts_namespace *create_uts_ns(void) 20static struct uts_namespace *create_uts_ns(void)
20{ 21{
@@ -79,3 +80,41 @@ void free_uts_ns(struct kref *kref)
79 put_user_ns(ns->user_ns); 80 put_user_ns(ns->user_ns);
80 kfree(ns); 81 kfree(ns);
81} 82}
83
84static void *utsns_get(struct task_struct *task)
85{
86 struct uts_namespace *ns = NULL;
87 struct nsproxy *nsproxy;
88
89 rcu_read_lock();
90 nsproxy = task_nsproxy(task);
91 if (nsproxy) {
92 ns = nsproxy->uts_ns;
93 get_uts_ns(ns);
94 }
95 rcu_read_unlock();
96
97 return ns;
98}
99
100static void utsns_put(void *ns)
101{
102 put_uts_ns(ns);
103}
104
105static int utsns_install(struct nsproxy *nsproxy, void *ns)
106{
107 get_uts_ns(ns);
108 put_uts_ns(nsproxy->uts_ns);
109 nsproxy->uts_ns = ns;
110 return 0;
111}
112
113const struct proc_ns_operations utsns_operations = {
114 .name = "uts",
115 .type = CLONE_NEWUTS,
116 .get = utsns_get,
117 .put = utsns_put,
118 .install = utsns_install,
119};
120
diff --git a/mm/Kconfig b/mm/Kconfig
index e9c0c61f2ddd..8ca47a5ee9c8 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -347,3 +347,26 @@ config NEED_PER_CPU_KM
347 depends on !SMP 347 depends on !SMP
348 bool 348 bool
349 default y 349 default y
350
351config CLEANCACHE
352 bool "Enable cleancache driver to cache clean pages if tmem is present"
353 default n
354 help
355 Cleancache can be thought of as a page-granularity victim cache
356 for clean pages that the kernel's pageframe replacement algorithm
357 (PFRA) would like to keep around, but can't since there isn't enough
358 memory. So when the PFRA "evicts" a page, it first attempts to use
359 cleancache code to put the data contained in that page into
360 "transcendent memory", memory that is not directly accessible or
361 addressable by the kernel and is of unknown and possibly
362 time-varying size. And when a cleancache-enabled
363 filesystem wishes to access a page in a file on disk, it first
364 checks cleancache to see if it already contains it; if it does,
365 the page is copied into the kernel and a disk access is avoided.
366 When a transcendent memory driver is available (such as zcache or
367 Xen transcendent memory), a significant I/O reduction
368 may be achieved. When none is available, all cleancache calls
369 are reduced to a single pointer-compare-against-NULL resulting
370 in a negligible performance hit.
371
372 If unsure, say Y to enable cleancache
diff --git a/mm/Makefile b/mm/Makefile
index 42a8326c3e3d..836e4163c1bf 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -49,3 +49,4 @@ obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
49obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o 49obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
50obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o 50obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
51obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o 51obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
52obj-$(CONFIG_CLEANCACHE) += cleancache.o
diff --git a/mm/cleancache.c b/mm/cleancache.c
new file mode 100644
index 000000000000..bcaae4c2a770
--- /dev/null
+++ b/mm/cleancache.c
@@ -0,0 +1,244 @@
1/*
2 * Cleancache frontend
3 *
4 * This code provides the generic "frontend" layer to call a matching
5 * "backend" driver implementation of cleancache. See
6 * Documentation/vm/cleancache.txt for more information.
7 *
8 * Copyright (C) 2009-2010 Oracle Corp. All rights reserved.
9 * Author: Dan Magenheimer
10 *
11 * This work is licensed under the terms of the GNU GPL, version 2.
12 */
13
14#include <linux/module.h>
15#include <linux/fs.h>
16#include <linux/exportfs.h>
17#include <linux/mm.h>
18#include <linux/cleancache.h>
19
20/*
21 * This global enablement flag may be read thousands of times per second
22 * by cleancache_get/put/flush even on systems where cleancache_ops
23 * is not claimed (e.g. cleancache is config'ed on but remains
24 * disabled), so is preferred to the slower alternative: a function
25 * call that checks a non-global.
26 */
27int cleancache_enabled;
28EXPORT_SYMBOL(cleancache_enabled);
29
30/*
31 * cleancache_ops is set by cleancache_register_ops to contain the pointers
32 * to the cleancache "backend" implementation functions.
33 */
34static struct cleancache_ops cleancache_ops;
35
36/* useful stats available in /sys/kernel/mm/cleancache */
37static unsigned long cleancache_succ_gets;
38static unsigned long cleancache_failed_gets;
39static unsigned long cleancache_puts;
40static unsigned long cleancache_flushes;
41
42/*
43 * register operations for cleancache, returning the previous ops to
44 * allow detection of multiple backends and possible nesting
45 */
46struct cleancache_ops cleancache_register_ops(struct cleancache_ops *ops)
47{
48 struct cleancache_ops old = cleancache_ops;
49
50 cleancache_ops = *ops;
51 cleancache_enabled = 1;
52 return old;
53}
54EXPORT_SYMBOL(cleancache_register_ops);
55
56/* Called by a cleancache-enabled filesystem at time of mount */
57void __cleancache_init_fs(struct super_block *sb)
58{
59 sb->cleancache_poolid = (*cleancache_ops.init_fs)(PAGE_SIZE);
60}
61EXPORT_SYMBOL(__cleancache_init_fs);
62
63/* Called by a cleancache-enabled clustered filesystem at time of mount */
64void __cleancache_init_shared_fs(char *uuid, struct super_block *sb)
65{
66 sb->cleancache_poolid =
67 (*cleancache_ops.init_shared_fs)(uuid, PAGE_SIZE);
68}
69EXPORT_SYMBOL(__cleancache_init_shared_fs);
70
71/*
72 * If the filesystem uses exportable filehandles, use the filehandle as
73 * the key, else use the inode number.
74 */
75static int cleancache_get_key(struct inode *inode,
76 struct cleancache_filekey *key)
77{
78 int (*fhfn)(struct dentry *, __u32 *fh, int *, int);
79 int len = 0, maxlen = CLEANCACHE_KEY_MAX;
80 struct super_block *sb = inode->i_sb;
81
82 key->u.ino = inode->i_ino;
83 if (sb->s_export_op != NULL) {
84 fhfn = sb->s_export_op->encode_fh;
85 if (fhfn) {
86 struct dentry d;
87 d.d_inode = inode;
88 len = (*fhfn)(&d, &key->u.fh[0], &maxlen, 0);
89 if (len <= 0 || len == 255)
90 return -1;
91 if (maxlen > CLEANCACHE_KEY_MAX)
92 return -1;
93 }
94 }
95 return 0;
96}
97
98/*
99 * "Get" data from cleancache associated with the poolid/inode/index
100 * that were specified when the data was put to cleancache and, if
101 * successful, use it to fill the specified page with data and return 0.
102 * If the get fails, the pageframe is left unchanged and -1 is returned.
103 * Page must be locked by caller.
104 */
105int __cleancache_get_page(struct page *page)
106{
107 int ret = -1;
108 int pool_id;
109 struct cleancache_filekey key = { .u.key = { 0 } };
110
111 VM_BUG_ON(!PageLocked(page));
112 pool_id = page->mapping->host->i_sb->cleancache_poolid;
113 if (pool_id < 0)
114 goto out;
115
116 if (cleancache_get_key(page->mapping->host, &key) < 0)
117 goto out;
118
119 ret = (*cleancache_ops.get_page)(pool_id, key, page->index, page);
120 if (ret == 0)
121 cleancache_succ_gets++;
122 else
123 cleancache_failed_gets++;
124out:
125 return ret;
126}
127EXPORT_SYMBOL(__cleancache_get_page);
128
129/*
130 * "Put" data from a page to cleancache and associate it with the
131 * (previously-obtained per-filesystem) poolid and the page's
132 * inode and page index. Page must be locked. Note that a put_page
133 * always "succeeds", though a subsequent get_page may succeed or fail.
134 */
135void __cleancache_put_page(struct page *page)
136{
137 int pool_id;
138 struct cleancache_filekey key = { .u.key = { 0 } };
139
140 VM_BUG_ON(!PageLocked(page));
141 pool_id = page->mapping->host->i_sb->cleancache_poolid;
142 if (pool_id >= 0 &&
143 cleancache_get_key(page->mapping->host, &key) >= 0) {
144 (*cleancache_ops.put_page)(pool_id, key, page->index, page);
145 cleancache_puts++;
146 }
147}
148EXPORT_SYMBOL(__cleancache_put_page);
149
150/*
151 * Flush any data from cleancache associated with the poolid and the
152 * page's inode and page index so that a subsequent "get" will fail.
153 */
154void __cleancache_flush_page(struct address_space *mapping, struct page *page)
155{
156 /* careful... page->mapping is NULL sometimes when this is called */
157 int pool_id = mapping->host->i_sb->cleancache_poolid;
158 struct cleancache_filekey key = { .u.key = { 0 } };
159
160 if (pool_id >= 0) {
161 VM_BUG_ON(!PageLocked(page));
162 if (cleancache_get_key(mapping->host, &key) >= 0) {
163 (*cleancache_ops.flush_page)(pool_id, key, page->index);
164 cleancache_flushes++;
165 }
166 }
167}
168EXPORT_SYMBOL(__cleancache_flush_page);
169
170/*
171 * Flush all data from cleancache associated with the poolid and the
172 * mapping's inode so that all subsequent gets to this poolid/inode
173 * will fail.
174 */
175void __cleancache_flush_inode(struct address_space *mapping)
176{
177 int pool_id = mapping->host->i_sb->cleancache_poolid;
178 struct cleancache_filekey key = { .u.key = { 0 } };
179
180 if (pool_id >= 0 && cleancache_get_key(mapping->host, &key) >= 0)
181 (*cleancache_ops.flush_inode)(pool_id, key);
182}
183EXPORT_SYMBOL(__cleancache_flush_inode);
184
185/*
186 * Called by any cleancache-enabled filesystem at time of unmount;
187 * note that pool_id is surrendered and may be returned by a subsequent
188 * cleancache_init_fs or cleancache_init_shared_fs
189 */
190void __cleancache_flush_fs(struct super_block *sb)
191{
192 if (sb->cleancache_poolid >= 0) {
193 int old_poolid = sb->cleancache_poolid;
194 sb->cleancache_poolid = -1;
195 (*cleancache_ops.flush_fs)(old_poolid);
196 }
197}
198EXPORT_SYMBOL(__cleancache_flush_fs);
199
200#ifdef CONFIG_SYSFS
201
202/* see Documentation/ABI/xxx/sysfs-kernel-mm-cleancache */
203
204#define CLEANCACHE_SYSFS_RO(_name) \
205 static ssize_t cleancache_##_name##_show(struct kobject *kobj, \
206 struct kobj_attribute *attr, char *buf) \
207 { \
208 return sprintf(buf, "%lu\n", cleancache_##_name); \
209 } \
210 static struct kobj_attribute cleancache_##_name##_attr = { \
211 .attr = { .name = __stringify(_name), .mode = 0444 }, \
212 .show = cleancache_##_name##_show, \
213 }
214
215CLEANCACHE_SYSFS_RO(succ_gets);
216CLEANCACHE_SYSFS_RO(failed_gets);
217CLEANCACHE_SYSFS_RO(puts);
218CLEANCACHE_SYSFS_RO(flushes);
219
220static struct attribute *cleancache_attrs[] = {
221 &cleancache_succ_gets_attr.attr,
222 &cleancache_failed_gets_attr.attr,
223 &cleancache_puts_attr.attr,
224 &cleancache_flushes_attr.attr,
225 NULL,
226};
227
228static struct attribute_group cleancache_attr_group = {
229 .attrs = cleancache_attrs,
230 .name = "cleancache",
231};
232
233#endif /* CONFIG_SYSFS */
234
235static int __init init_cleancache(void)
236{
237#ifdef CONFIG_SYSFS
238 int err;
239
240 err = sysfs_create_group(mm_kobj, &cleancache_attr_group);
241#endif /* CONFIG_SYSFS */
242 return 0;
243}
244module_init(init_cleancache)
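A small userspace sketch for reading the counters the sysfs group above exposes (the /sys/kernel/mm/cleancache/ path follows from the attribute group name, but is an assumption about where sysfs is mounted):

#include <stdio.h>

int main(void)
{
	static const char *names[] = {
		"succ_gets", "failed_gets", "puts", "flushes",
	};
	char path[64], line[32];
	unsigned int i;

	for (i = 0; i < sizeof(names) / sizeof(names[0]); i++) {
		FILE *f;

		snprintf(path, sizeof(path),
			 "/sys/kernel/mm/cleancache/%s", names[i]);
		f = fopen(path, "r");
		if (!f)
			continue;	/* CONFIG_CLEANCACHE off, or no sysfs */
		if (fgets(line, sizeof(line), f))
			printf("%-12s %s", names[i], line);
		fclose(f);
	}
	return 0;
}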
diff --git a/mm/filemap.c b/mm/filemap.c
index 68e782b3d3de..7455ccd8bda8 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -34,6 +34,7 @@
34#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ 34#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
35#include <linux/memcontrol.h> 35#include <linux/memcontrol.h>
36#include <linux/mm_inline.h> /* for page_is_file_cache() */ 36#include <linux/mm_inline.h> /* for page_is_file_cache() */
37#include <linux/cleancache.h>
37#include "internal.h" 38#include "internal.h"
38 39
39/* 40/*
@@ -118,6 +119,16 @@ void __delete_from_page_cache(struct page *page)
118{ 119{
119 struct address_space *mapping = page->mapping; 120 struct address_space *mapping = page->mapping;
120 121
122 /*
123 * if we're uptodate, flush out into the cleancache, otherwise
124 * invalidate any existing cleancache entries. We can't leave
125 * stale data around in the cleancache once our page is gone
126 */
127 if (PageUptodate(page) && PageMappedToDisk(page))
128 cleancache_put_page(page);
129 else
130 cleancache_flush_page(mapping, page);
131
121 radix_tree_delete(&mapping->page_tree, page->index); 132 radix_tree_delete(&mapping->page_tree, page->index);
122 page->mapping = NULL; 133 page->mapping = NULL;
123 mapping->nrpages--; 134 mapping->nrpages--;
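The hunk above covers the eviction ("put") side; the read ("get") side is hooked into filesystem read paths elsewhere in this patch. Its shape, sketched with hypothetical do_myfs_readpage() and myfs_submit_read() names:

static int do_myfs_readpage(struct page *page)
{
	/* ask cleancache before paying for a disk read */
	if (cleancache_get_page(page) == 0) {
		/* hit: the backend copied the data into our page */
		SetPageUptodate(page);
		unlock_page(page);
		return 0;
	}
	/* miss: fall back to a normal block-device read */
	return myfs_submit_read(page);
}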
diff --git a/mm/fremap.c b/mm/fremap.c
index 7f4123056e06..b8e0e2d468af 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -224,7 +224,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
224 /* 224 /*
225 * drop PG_Mlocked flag for over-mapped range 225 * drop PG_Mlocked flag for over-mapped range
226 */ 226 */
227 unsigned int saved_flags = vma->vm_flags; 227 vm_flags_t saved_flags = vma->vm_flags;
228 munlock_vma_pages_range(vma, start, start + size); 228 munlock_vma_pages_range(vma, start, start + size);
229 vma->vm_flags = saved_flags; 229 vma->vm_flags = saved_flags;
230 } 230 }
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 5fd68b95c671..f33bb319b73f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2833,7 +2833,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
2833int hugetlb_reserve_pages(struct inode *inode, 2833int hugetlb_reserve_pages(struct inode *inode,
2834 long from, long to, 2834 long from, long to,
2835 struct vm_area_struct *vma, 2835 struct vm_area_struct *vma,
2836 int acctflag) 2836 vm_flags_t vm_flags)
2837{ 2837{
2838 long ret, chg; 2838 long ret, chg;
2839 struct hstate *h = hstate_inode(inode); 2839 struct hstate *h = hstate_inode(inode);
@@ -2843,7 +2843,7 @@ int hugetlb_reserve_pages(struct inode *inode,
2843 * attempt will be made for VM_NORESERVE to allocate a page 2843 * attempt will be made for VM_NORESERVE to allocate a page
2844 * and filesystem quota without using reserves 2844 * and filesystem quota without using reserves
2845 */ 2845 */
2846 if (acctflag & VM_NORESERVE) 2846 if (vm_flags & VM_NORESERVE)
2847 return 0; 2847 return 0;
2848 2848
2849 /* 2849 /*
diff --git a/mm/memory.c b/mm/memory.c
index b73f677f0bb1..fc24f7d788bd 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -730,7 +730,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
730 add_taint(TAINT_BAD_PAGE); 730 add_taint(TAINT_BAD_PAGE);
731} 731}
732 732
733static inline int is_cow_mapping(unsigned int flags) 733static inline int is_cow_mapping(vm_flags_t flags)
734{ 734{
735 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; 735 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
736} 736}
diff --git a/mm/mlock.c b/mm/mlock.c
index 516b2c2ddd5a..048260c4e02e 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -307,13 +307,13 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
307 * For vmas that pass the filters, merge/split as appropriate. 307 * For vmas that pass the filters, merge/split as appropriate.
308 */ 308 */
309static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, 309static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
310 unsigned long start, unsigned long end, unsigned int newflags) 310 unsigned long start, unsigned long end, vm_flags_t newflags)
311{ 311{
312 struct mm_struct *mm = vma->vm_mm; 312 struct mm_struct *mm = vma->vm_mm;
313 pgoff_t pgoff; 313 pgoff_t pgoff;
314 int nr_pages; 314 int nr_pages;
315 int ret = 0; 315 int ret = 0;
316 int lock = newflags & VM_LOCKED; 316 int lock = !!(newflags & VM_LOCKED);
317 317
318 if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) || 318 if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) ||
319 is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm)) 319 is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm))
@@ -385,7 +385,7 @@ static int do_mlock(unsigned long start, size_t len, int on)
385 prev = vma; 385 prev = vma;
386 386
387 for (nstart = start ; ; ) { 387 for (nstart = start ; ; ) {
388 unsigned int newflags; 388 vm_flags_t newflags;
389 389
390 /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ 390 /* Here we know that vma->vm_start <= nstart < vma->vm_end. */
391 391
@@ -524,7 +524,7 @@ static int do_mlockall(int flags)
524 goto out; 524 goto out;
525 525
526 for (vma = current->mm->mmap; vma ; vma = prev->vm_next) { 526 for (vma = current->mm->mmap; vma ; vma = prev->vm_next) {
527 unsigned int newflags; 527 vm_flags_t newflags;
528 528
529 newflags = vma->vm_flags | VM_LOCKED; 529 newflags = vma->vm_flags | VM_LOCKED;
530 if (!(flags & MCL_CURRENT)) 530 if (!(flags & MCL_CURRENT))
diff --git a/mm/mmap.c b/mm/mmap.c
index ac2631b7477f..bbdc9af5e117 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -960,7 +960,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
960{ 960{
961 struct mm_struct * mm = current->mm; 961 struct mm_struct * mm = current->mm;
962 struct inode *inode; 962 struct inode *inode;
963 unsigned int vm_flags; 963 vm_flags_t vm_flags;
964 int error; 964 int error;
965 unsigned long reqprot = prot; 965 unsigned long reqprot = prot;
966 966
@@ -1165,7 +1165,7 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
1165 */ 1165 */
1166int vma_wants_writenotify(struct vm_area_struct *vma) 1166int vma_wants_writenotify(struct vm_area_struct *vma)
1167{ 1167{
1168 unsigned int vm_flags = vma->vm_flags; 1168 vm_flags_t vm_flags = vma->vm_flags;
1169 1169
1170 /* If it was private or non-writable, the write bit is already clear */ 1170 /* If it was private or non-writable, the write bit is already clear */
1171 if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED))) 1171 if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED)))
@@ -1193,7 +1193,7 @@ int vma_wants_writenotify(struct vm_area_struct *vma)
1193 * We account for memory if it's a private writeable mapping, 1193 * We account for memory if it's a private writeable mapping,
1194 * not hugepages and VM_NORESERVE wasn't set. 1194 * not hugepages and VM_NORESERVE wasn't set.
1195 */ 1195 */
1196static inline int accountable_mapping(struct file *file, unsigned int vm_flags) 1196static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
1197{ 1197{
1198 /* 1198 /*
1199 * hugetlb has its own accounting separate from the core VM 1199 * hugetlb has its own accounting separate from the core VM
@@ -1207,7 +1207,7 @@ static inline int accountable_mapping(struct file *file, unsigned int vm_flags)
1207 1207
1208unsigned long mmap_region(struct file *file, unsigned long addr, 1208unsigned long mmap_region(struct file *file, unsigned long addr,
1209 unsigned long len, unsigned long flags, 1209 unsigned long len, unsigned long flags,
1210 unsigned int vm_flags, unsigned long pgoff) 1210 vm_flags_t vm_flags, unsigned long pgoff)
1211{ 1211{
1212 struct mm_struct *mm = current->mm; 1212 struct mm_struct *mm = current->mm;
1213 struct vm_area_struct *vma, *prev; 1213 struct vm_area_struct *vma, *prev;
diff --git a/mm/slub.c b/mm/slub.c
index 4aad32d2e60d..7be0223531b0 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1831,7 +1831,6 @@ load_freelist:
1831 page->inuse = page->objects; 1831 page->inuse = page->objects;
1832 page->freelist = NULL; 1832 page->freelist = NULL;
1833 1833
1834unlock_out:
1835 slab_unlock(page); 1834 slab_unlock(page);
1836 c->tid = next_tid(c->tid); 1835 c->tid = next_tid(c->tid);
1837 local_irq_restore(flags); 1836 local_irq_restore(flags);
diff --git a/mm/truncate.c b/mm/truncate.c
index a95667529135..3a29a6180212 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -19,6 +19,7 @@
19#include <linux/task_io_accounting_ops.h> 19#include <linux/task_io_accounting_ops.h>
20#include <linux/buffer_head.h> /* grr. try_to_release_page, 20#include <linux/buffer_head.h> /* grr. try_to_release_page,
21 do_invalidatepage */ 21 do_invalidatepage */
22#include <linux/cleancache.h>
22#include "internal.h" 23#include "internal.h"
23 24
24 25
@@ -51,6 +52,7 @@ void do_invalidatepage(struct page *page, unsigned long offset)
51static inline void truncate_partial_page(struct page *page, unsigned partial) 52static inline void truncate_partial_page(struct page *page, unsigned partial)
52{ 53{
53 zero_user_segment(page, partial, PAGE_CACHE_SIZE); 54 zero_user_segment(page, partial, PAGE_CACHE_SIZE);
55 cleancache_flush_page(page->mapping, page);
54 if (page_has_private(page)) 56 if (page_has_private(page))
55 do_invalidatepage(page, partial); 57 do_invalidatepage(page, partial);
56} 58}
@@ -214,6 +216,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
214 pgoff_t next; 216 pgoff_t next;
215 int i; 217 int i;
216 218
219 cleancache_flush_inode(mapping);
217 if (mapping->nrpages == 0) 220 if (mapping->nrpages == 0)
218 return; 221 return;
219 222
@@ -291,6 +294,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
291 pagevec_release(&pvec); 294 pagevec_release(&pvec);
292 mem_cgroup_uncharge_end(); 295 mem_cgroup_uncharge_end();
293 } 296 }
297 cleancache_flush_inode(mapping);
294} 298}
295EXPORT_SYMBOL(truncate_inode_pages_range); 299EXPORT_SYMBOL(truncate_inode_pages_range);
296 300
@@ -440,6 +444,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
440 int did_range_unmap = 0; 444 int did_range_unmap = 0;
441 int wrapped = 0; 445 int wrapped = 0;
442 446
447 cleancache_flush_inode(mapping);
443 pagevec_init(&pvec, 0); 448 pagevec_init(&pvec, 0);
444 next = start; 449 next = start;
445 while (next <= end && !wrapped && 450 while (next <= end && !wrapped &&
@@ -498,6 +503,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
498 mem_cgroup_uncharge_end(); 503 mem_cgroup_uncharge_end();
499 cond_resched(); 504 cond_resched();
500 } 505 }
506 cleancache_flush_inode(mapping);
501 return ret; 507 return ret;
502} 508}
503EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range); 509EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 2e2dce6583e1..6c6b86d0da15 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -8,6 +8,8 @@
8#include <linux/idr.h> 8#include <linux/idr.h>
9#include <linux/rculist.h> 9#include <linux/rculist.h>
10#include <linux/nsproxy.h> 10#include <linux/nsproxy.h>
11#include <linux/proc_fs.h>
12#include <linux/file.h>
11#include <net/net_namespace.h> 13#include <net/net_namespace.h>
12#include <net/netns/generic.h> 14#include <net/netns/generic.h>
13 15
@@ -302,6 +304,28 @@ void __put_net(struct net *net)
302} 304}
303EXPORT_SYMBOL_GPL(__put_net); 305EXPORT_SYMBOL_GPL(__put_net);
304 306
307struct net *get_net_ns_by_fd(int fd)
308{
309 struct proc_inode *ei;
310 struct file *file;
311 struct net *net;
312
313 net = ERR_PTR(-EINVAL);
314 file = proc_ns_fget(fd);
315 if (IS_ERR(file))
316 goto out;
317
318 ei = PROC_I(file->f_dentry->d_inode);
319 if (ei->ns_ops != &netns_operations)
320 goto out;
321
322 net = get_net(ei->ns);
323out:
324 if (!IS_ERR(file))
325 fput(file);
326 return net;
327}
328
305#else 329#else
306struct net *copy_net_ns(unsigned long flags, struct net *old_net) 330struct net *copy_net_ns(unsigned long flags, struct net *old_net)
307{ 331{
@@ -309,6 +333,11 @@ struct net *copy_net_ns(unsigned long flags, struct net *old_net)
309 return ERR_PTR(-EINVAL); 333 return ERR_PTR(-EINVAL);
310 return old_net; 334 return old_net;
311} 335}
336
337struct net *get_net_ns_by_fd(int fd)
338{
339 return ERR_PTR(-EINVAL);
340}
312#endif 341#endif
313 342
314struct net *get_net_ns_by_pid(pid_t pid) 343struct net *get_net_ns_by_pid(pid_t pid)
@@ -561,3 +590,39 @@ void unregister_pernet_device(struct pernet_operations *ops)
561 mutex_unlock(&net_mutex); 590 mutex_unlock(&net_mutex);
562} 591}
563EXPORT_SYMBOL_GPL(unregister_pernet_device); 592EXPORT_SYMBOL_GPL(unregister_pernet_device);
593
594#ifdef CONFIG_NET_NS
595static void *netns_get(struct task_struct *task)
596{
597 struct net *net = NULL;
598 struct nsproxy *nsproxy;
599
600 rcu_read_lock();
601 nsproxy = task_nsproxy(task);
602 if (nsproxy)
603 net = get_net(nsproxy->net_ns);
604 rcu_read_unlock();
605
606 return net;
607}
608
609static void netns_put(void *ns)
610{
611 put_net(ns);
612}
613
614static int netns_install(struct nsproxy *nsproxy, void *ns)
615{
616 put_net(nsproxy->net_ns);
617 nsproxy->net_ns = get_net(ns);
618 return 0;
619}
620
621const struct proc_ns_operations netns_operations = {
622 .name = "net",
623 .type = CLONE_NEWNET,
624 .get = netns_get,
625 .put = netns_put,
626 .install = netns_install,
627};
628#endif
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 2d56cb9b0b94..abd936d8a716 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1046,6 +1046,7 @@ const struct nla_policy ifla_policy[IFLA_MAX+1] = {
1046 [IFLA_LINKMODE] = { .type = NLA_U8 }, 1046 [IFLA_LINKMODE] = { .type = NLA_U8 },
1047 [IFLA_LINKINFO] = { .type = NLA_NESTED }, 1047 [IFLA_LINKINFO] = { .type = NLA_NESTED },
1048 [IFLA_NET_NS_PID] = { .type = NLA_U32 }, 1048 [IFLA_NET_NS_PID] = { .type = NLA_U32 },
1049 [IFLA_NET_NS_FD] = { .type = NLA_U32 },
1049 [IFLA_IFALIAS] = { .type = NLA_STRING, .len = IFALIASZ-1 }, 1050 [IFLA_IFALIAS] = { .type = NLA_STRING, .len = IFALIASZ-1 },
1050 [IFLA_VFINFO_LIST] = {. type = NLA_NESTED }, 1051 [IFLA_VFINFO_LIST] = {. type = NLA_NESTED },
1051 [IFLA_VF_PORTS] = { .type = NLA_NESTED }, 1052 [IFLA_VF_PORTS] = { .type = NLA_NESTED },
@@ -1094,6 +1095,8 @@ struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[])
1094 */ 1095 */
1095 if (tb[IFLA_NET_NS_PID]) 1096 if (tb[IFLA_NET_NS_PID])
1096 net = get_net_ns_by_pid(nla_get_u32(tb[IFLA_NET_NS_PID])); 1097 net = get_net_ns_by_pid(nla_get_u32(tb[IFLA_NET_NS_PID]));
1098 else if (tb[IFLA_NET_NS_FD])
1099 net = get_net_ns_by_fd(nla_get_u32(tb[IFLA_NET_NS_FD]));
1097 else 1100 else
1098 net = get_net(src_net); 1101 net = get_net(src_net);
1099 return net; 1102 return net;
@@ -1224,7 +1227,7 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,
1224 int send_addr_notify = 0; 1227 int send_addr_notify = 0;
1225 int err; 1228 int err;
1226 1229
1227 if (tb[IFLA_NET_NS_PID]) { 1230 if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD]) {
1228 struct net *net = rtnl_link_get_net(dev_net(dev), tb); 1231 struct net *net = rtnl_link_get_net(dev_net(dev), tb);
1229 if (IS_ERR(net)) { 1232 if (IS_ERR(net)) {
1230 err = PTR_ERR(net); 1233 err = PTR_ERR(net);