aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/Kconfig2
-rw-r--r--fs/Makefile1
-rw-r--r--fs/bio-integrity.c719
-rw-r--r--fs/bio.c88
-rw-r--r--fs/buffer.c21
-rw-r--r--fs/char_dev.c7
-rw-r--r--fs/cifs/cifsacl.c10
-rw-r--r--fs/cifs/cifsfs.c2
-rw-r--r--fs/cifs/inode.c20
-rw-r--r--fs/dlm/user.c9
-rw-r--r--fs/ecryptfs/file.c3
-rw-r--r--fs/exec.c2
-rw-r--r--fs/ext4/balloc.c209
-rw-r--r--fs/ext4/dir.c17
-rw-r--r--fs/ext4/ext4.h61
-rw-r--r--fs/ext4/ext4_extents.h1
-rw-r--r--fs/ext4/ext4_i.h10
-rw-r--r--fs/ext4/ext4_jbd2.h21
-rw-r--r--fs/ext4/ext4_sb.h5
-rw-r--r--fs/ext4/extents.c111
-rw-r--r--fs/ext4/file.c20
-rw-r--r--fs/ext4/fsync.c4
-rw-r--r--fs/ext4/group.h2
-rw-r--r--fs/ext4/ialloc.c113
-rw-r--r--fs/ext4/inode.c1591
-rw-r--r--fs/ext4/mballoc.c451
-rw-r--r--fs/ext4/namei.c45
-rw-r--r--fs/ext4/resize.c52
-rw-r--r--fs/ext4/super.c142
-rw-r--r--fs/ext4/xattr.c2
-rw-r--r--fs/ext4/xattr_trusted.c4
-rw-r--r--fs/ext4/xattr_user.c4
-rw-r--r--fs/fat/cache.c2
-rw-r--r--fs/fat/dir.c4
-rw-r--r--fs/fat/file.c6
-rw-r--r--fs/fat/inode.c26
-rw-r--r--fs/fcntl.c3
-rw-r--r--fs/gfs2/Kconfig18
-rw-r--r--fs/gfs2/Makefile1
-rw-r--r--fs/gfs2/gfs2.h5
-rw-r--r--fs/gfs2/glock.c1643
-rw-r--r--fs/gfs2/glock.h11
-rw-r--r--fs/gfs2/glops.c70
-rw-r--r--fs/gfs2/incore.h38
-rw-r--r--fs/gfs2/inode.c11
-rw-r--r--fs/gfs2/inode.h2
-rw-r--r--fs/gfs2/locking.c52
-rw-r--r--fs/gfs2/locking/dlm/lock.c368
-rw-r--r--fs/gfs2/locking/dlm/lock_dlm.h18
-rw-r--r--fs/gfs2/locking/dlm/mount.c14
-rw-r--r--fs/gfs2/locking/dlm/sysfs.c13
-rw-r--r--fs/gfs2/locking/dlm/thread.c331
-rw-r--r--fs/gfs2/locking/nolock/Makefile3
-rw-r--r--fs/gfs2/locking/nolock/main.c238
-rw-r--r--fs/gfs2/log.c2
-rw-r--r--fs/gfs2/log.h2
-rw-r--r--fs/gfs2/main.c2
-rw-r--r--fs/gfs2/meta_io.c14
-rw-r--r--fs/gfs2/meta_io.h1
-rw-r--r--fs/gfs2/ops_address.c40
-rw-r--r--fs/gfs2/ops_file.c42
-rw-r--r--fs/gfs2/ops_fstype.c8
-rw-r--r--fs/gfs2/ops_inode.c25
-rw-r--r--fs/gfs2/ops_super.c4
-rw-r--r--fs/gfs2/quota.c2
-rw-r--r--fs/gfs2/recovery.c5
-rw-r--r--fs/gfs2/rgrp.c108
-rw-r--r--fs/gfs2/super.c4
-rw-r--r--fs/gfs2/sys.c16
-rw-r--r--fs/jbd2/checkpoint.c1
-rw-r--r--fs/jbd2/commit.c294
-rw-r--r--fs/jbd2/journal.c53
-rw-r--r--fs/jbd2/transaction.c365
-rw-r--r--fs/jfs/jfs_debug.c62
-rw-r--r--fs/jfs/jfs_debug.h10
-rw-r--r--fs/jfs/jfs_dtree.h3
-rw-r--r--fs/jfs/jfs_imap.c2
-rw-r--r--fs/jfs/jfs_logmgr.c35
-rw-r--r--fs/jfs/jfs_metapage.c36
-rw-r--r--fs/jfs/jfs_txnmgr.c68
-rw-r--r--fs/jfs/jfs_xtree.c36
-rw-r--r--fs/jfs/namei.c2
-rw-r--r--fs/jfs/super.c7
-rw-r--r--fs/mpage.c14
-rw-r--r--fs/msdos/namei.c35
-rw-r--r--fs/namespace.c14
-rw-r--r--fs/ncpfs/file.c12
-rw-r--r--fs/nfs/file.c7
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c2
-rw-r--r--fs/ocfs2/dlmglue.c14
-rw-r--r--fs/ocfs2/stack_user.c3
-rw-r--r--fs/proc/base.c9
-rw-r--r--fs/proc/proc_misc.c16
-rw-r--r--fs/proc/task_mmu.c6
-rw-r--r--fs/proc/task_nommu.c2
-rw-r--r--fs/ramfs/file-mmu.c1
-rw-r--r--fs/ramfs/file-nommu.c1
-rw-r--r--fs/read_write.c38
-rw-r--r--fs/smbfs/file.c11
-rw-r--r--fs/splice.c17
-rw-r--r--fs/vfat/namei.c35
-rw-r--r--fs/xfs/xfs_log.c15
102 files changed, 4902 insertions, 3225 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index 0ce72dcd6b96..84ab76a206a0 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -930,7 +930,7 @@ config PROC_KCORE
930 930
931config PROC_VMCORE 931config PROC_VMCORE
932 bool "/proc/vmcore support (EXPERIMENTAL)" 932 bool "/proc/vmcore support (EXPERIMENTAL)"
933 depends on PROC_FS && EXPERIMENTAL && CRASH_DUMP 933 depends on PROC_FS && CRASH_DUMP
934 default y 934 default y
935 help 935 help
936 Exports the dump image of crashed kernel in ELF format. 936 Exports the dump image of crashed kernel in ELF format.
diff --git a/fs/Makefile b/fs/Makefile
index 1e7a11bd4da1..277b079dec9e 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -19,6 +19,7 @@ else
19obj-y += no-block.o 19obj-y += no-block.o
20endif 20endif
21 21
22obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o
22obj-$(CONFIG_INOTIFY) += inotify.o 23obj-$(CONFIG_INOTIFY) += inotify.o
23obj-$(CONFIG_INOTIFY_USER) += inotify_user.o 24obj-$(CONFIG_INOTIFY_USER) += inotify_user.o
24obj-$(CONFIG_EPOLL) += eventpoll.o 25obj-$(CONFIG_EPOLL) += eventpoll.o
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
new file mode 100644
index 000000000000..63e2ee63058d
--- /dev/null
+++ b/fs/bio-integrity.c
@@ -0,0 +1,719 @@
1/*
2 * bio-integrity.c - bio data integrity extensions
3 *
4 * Copyright (C) 2007, 2008 Oracle Corporation
5 * Written by: Martin K. Petersen <martin.petersen@oracle.com>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License version
9 * 2 as published by the Free Software Foundation.
10 *
11 * This program is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; see the file COPYING. If not, write to
18 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139,
19 * USA.
20 *
21 */
22
23#include <linux/blkdev.h>
24#include <linux/mempool.h>
25#include <linux/bio.h>
26#include <linux/workqueue.h>
27
28static struct kmem_cache *bio_integrity_slab __read_mostly;
29static struct workqueue_struct *kintegrityd_wq;
30
31/**
32 * bio_integrity_alloc_bioset - Allocate integrity payload and attach it to bio
33 * @bio: bio to attach integrity metadata to
34 * @gfp_mask: Memory allocation mask
35 * @nr_vecs: Number of integrity metadata scatter-gather elements
36 * @bs: bio_set to allocate from
37 *
38 * Description: This function prepares a bio for attaching integrity
39 * metadata. nr_vecs specifies the maximum number of pages containing
40 * integrity metadata that can be attached.
41 */
42struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio,
43 gfp_t gfp_mask,
44 unsigned int nr_vecs,
45 struct bio_set *bs)
46{
47 struct bio_integrity_payload *bip;
48 struct bio_vec *iv;
49 unsigned long idx;
50
51 BUG_ON(bio == NULL);
52
53 bip = mempool_alloc(bs->bio_integrity_pool, gfp_mask);
54 if (unlikely(bip == NULL)) {
55 printk(KERN_ERR "%s: could not alloc bip\n", __func__);
56 return NULL;
57 }
58
59 memset(bip, 0, sizeof(*bip));
60
61 iv = bvec_alloc_bs(gfp_mask, nr_vecs, &idx, bs);
62 if (unlikely(iv == NULL)) {
63 printk(KERN_ERR "%s: could not alloc bip_vec\n", __func__);
64 mempool_free(bip, bs->bio_integrity_pool);
65 return NULL;
66 }
67
68 bip->bip_pool = idx;
69 bip->bip_vec = iv;
70 bip->bip_bio = bio;
71 bio->bi_integrity = bip;
72
73 return bip;
74}
75EXPORT_SYMBOL(bio_integrity_alloc_bioset);
76
77/**
78 * bio_integrity_alloc - Allocate integrity payload and attach it to bio
79 * @bio: bio to attach integrity metadata to
80 * @gfp_mask: Memory allocation mask
81 * @nr_vecs: Number of integrity metadata scatter-gather elements
82 *
83 * Description: This function prepares a bio for attaching integrity
84 * metadata. nr_vecs specifies the maximum number of pages containing
85 * integrity metadata that can be attached.
86 */
87struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
88 gfp_t gfp_mask,
89 unsigned int nr_vecs)
90{
91 return bio_integrity_alloc_bioset(bio, gfp_mask, nr_vecs, fs_bio_set);
92}
93EXPORT_SYMBOL(bio_integrity_alloc);
94
95/**
96 * bio_integrity_free - Free bio integrity payload
97 * @bio: bio containing bip to be freed
98 * @bs: bio_set this bio was allocated from
99 *
100 * Description: Used to free the integrity portion of a bio. Usually
101 * called from bio_free().
102 */
103void bio_integrity_free(struct bio *bio, struct bio_set *bs)
104{
105 struct bio_integrity_payload *bip = bio->bi_integrity;
106
107 BUG_ON(bip == NULL);
108
109 /* A cloned bio doesn't own the integrity metadata */
110 if (!bio_flagged(bio, BIO_CLONED) && bip->bip_buf != NULL)
111 kfree(bip->bip_buf);
112
113 mempool_free(bip->bip_vec, bs->bvec_pools[bip->bip_pool]);
114 mempool_free(bip, bs->bio_integrity_pool);
115
116 bio->bi_integrity = NULL;
117}
118EXPORT_SYMBOL(bio_integrity_free);
119
120/**
121 * bio_integrity_add_page - Attach integrity metadata
122 * @bio: bio to update
123 * @page: page containing integrity metadata
124 * @len: number of bytes of integrity metadata in page
125 * @offset: start offset within page
126 *
127 * Description: Attach a page containing integrity metadata to bio.
128 */
129int bio_integrity_add_page(struct bio *bio, struct page *page,
130 unsigned int len, unsigned int offset)
131{
132 struct bio_integrity_payload *bip = bio->bi_integrity;
133 struct bio_vec *iv;
134
135 if (bip->bip_vcnt >= bvec_nr_vecs(bip->bip_pool)) {
136 printk(KERN_ERR "%s: bip_vec full\n", __func__);
137 return 0;
138 }
139
140 iv = bip_vec_idx(bip, bip->bip_vcnt);
141 BUG_ON(iv == NULL);
142 BUG_ON(iv->bv_page != NULL);
143
144 iv->bv_page = page;
145 iv->bv_len = len;
146 iv->bv_offset = offset;
147 bip->bip_vcnt++;
148
149 return len;
150}
151EXPORT_SYMBOL(bio_integrity_add_page);
152
153/**
154 * bio_integrity_enabled - Check whether integrity can be passed
155 * @bio: bio to check
156 *
157 * Description: Determines whether bio_integrity_prep() can be called
158 * on this bio or not. bio data direction and target device must be
159 * set prior to calling. The functions honors the write_generate and
160 * read_verify flags in sysfs.
161 */
162int bio_integrity_enabled(struct bio *bio)
163{
164 /* Already protected? */
165 if (bio_integrity(bio))
166 return 0;
167
168 return bdev_integrity_enabled(bio->bi_bdev, bio_data_dir(bio));
169}
170EXPORT_SYMBOL(bio_integrity_enabled);
171
172/**
173 * bio_integrity_hw_sectors - Convert 512b sectors to hardware ditto
174 * @bi: blk_integrity profile for device
175 * @sectors: Number of 512 sectors to convert
176 *
177 * Description: The block layer calculates everything in 512 byte
178 * sectors but integrity metadata is done in terms of the hardware
179 * sector size of the storage device. Convert the block layer sectors
180 * to physical sectors.
181 */
182static inline unsigned int bio_integrity_hw_sectors(struct blk_integrity *bi,
183 unsigned int sectors)
184{
185 /* At this point there are only 512b or 4096b DIF/EPP devices */
186 if (bi->sector_size == 4096)
187 return sectors >>= 3;
188
189 return sectors;
190}
191
192/**
193 * bio_integrity_tag_size - Retrieve integrity tag space
194 * @bio: bio to inspect
195 *
196 * Description: Returns the maximum number of tag bytes that can be
197 * attached to this bio. Filesystems can use this to determine how
198 * much metadata to attach to an I/O.
199 */
200unsigned int bio_integrity_tag_size(struct bio *bio)
201{
202 struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
203
204 BUG_ON(bio->bi_size == 0);
205
206 return bi->tag_size * (bio->bi_size / bi->sector_size);
207}
208EXPORT_SYMBOL(bio_integrity_tag_size);
209
210int bio_integrity_tag(struct bio *bio, void *tag_buf, unsigned int len, int set)
211{
212 struct bio_integrity_payload *bip = bio->bi_integrity;
213 struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
214 unsigned int nr_sectors;
215
216 BUG_ON(bip->bip_buf == NULL);
217
218 if (bi->tag_size == 0)
219 return -1;
220
221 nr_sectors = bio_integrity_hw_sectors(bi,
222 DIV_ROUND_UP(len, bi->tag_size));
223
224 if (nr_sectors * bi->tuple_size > bip->bip_size) {
225 printk(KERN_ERR "%s: tag too big for bio: %u > %u\n",
226 __func__, nr_sectors * bi->tuple_size, bip->bip_size);
227 return -1;
228 }
229
230 if (set)
231 bi->set_tag_fn(bip->bip_buf, tag_buf, nr_sectors);
232 else
233 bi->get_tag_fn(bip->bip_buf, tag_buf, nr_sectors);
234
235 return 0;
236}
237
238/**
239 * bio_integrity_set_tag - Attach a tag buffer to a bio
240 * @bio: bio to attach buffer to
241 * @tag_buf: Pointer to a buffer containing tag data
242 * @len: Length of the included buffer
243 *
244 * Description: Use this function to tag a bio by leveraging the extra
245 * space provided by devices formatted with integrity protection. The
246 * size of the integrity buffer must be <= to the size reported by
247 * bio_integrity_tag_size().
248 */
249int bio_integrity_set_tag(struct bio *bio, void *tag_buf, unsigned int len)
250{
251 BUG_ON(bio_data_dir(bio) != WRITE);
252
253 return bio_integrity_tag(bio, tag_buf, len, 1);
254}
255EXPORT_SYMBOL(bio_integrity_set_tag);
256
257/**
258 * bio_integrity_get_tag - Retrieve a tag buffer from a bio
259 * @bio: bio to retrieve buffer from
260 * @tag_buf: Pointer to a buffer for the tag data
261 * @len: Length of the target buffer
262 *
263 * Description: Use this function to retrieve the tag buffer from a
264 * completed I/O. The size of the integrity buffer must be <= to the
265 * size reported by bio_integrity_tag_size().
266 */
267int bio_integrity_get_tag(struct bio *bio, void *tag_buf, unsigned int len)
268{
269 BUG_ON(bio_data_dir(bio) != READ);
270
271 return bio_integrity_tag(bio, tag_buf, len, 0);
272}
273EXPORT_SYMBOL(bio_integrity_get_tag);
274
275/**
276 * bio_integrity_generate - Generate integrity metadata for a bio
277 * @bio: bio to generate integrity metadata for
278 *
279 * Description: Generates integrity metadata for a bio by calling the
280 * block device's generation callback function. The bio must have a
281 * bip attached with enough room to accommodate the generated
282 * integrity metadata.
283 */
284static void bio_integrity_generate(struct bio *bio)
285{
286 struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
287 struct blk_integrity_exchg bix;
288 struct bio_vec *bv;
289 sector_t sector = bio->bi_sector;
290 unsigned int i, sectors, total;
291 void *prot_buf = bio->bi_integrity->bip_buf;
292
293 total = 0;
294 bix.disk_name = bio->bi_bdev->bd_disk->disk_name;
295 bix.sector_size = bi->sector_size;
296
297 bio_for_each_segment(bv, bio, i) {
298 void *kaddr = kmap_atomic(bv->bv_page, KM_USER0);
299 bix.data_buf = kaddr + bv->bv_offset;
300 bix.data_size = bv->bv_len;
301 bix.prot_buf = prot_buf;
302 bix.sector = sector;
303
304 bi->generate_fn(&bix);
305
306 sectors = bv->bv_len / bi->sector_size;
307 sector += sectors;
308 prot_buf += sectors * bi->tuple_size;
309 total += sectors * bi->tuple_size;
310 BUG_ON(total > bio->bi_integrity->bip_size);
311
312 kunmap_atomic(kaddr, KM_USER0);
313 }
314}
315
316/**
317 * bio_integrity_prep - Prepare bio for integrity I/O
318 * @bio: bio to prepare
319 *
320 * Description: Allocates a buffer for integrity metadata, maps the
321 * pages and attaches them to a bio. The bio must have data
322 * direction, target device and start sector set priot to calling. In
323 * the WRITE case, integrity metadata will be generated using the
324 * block device's integrity function. In the READ case, the buffer
325 * will be prepared for DMA and a suitable end_io handler set up.
326 */
327int bio_integrity_prep(struct bio *bio)
328{
329 struct bio_integrity_payload *bip;
330 struct blk_integrity *bi;
331 struct request_queue *q;
332 void *buf;
333 unsigned long start, end;
334 unsigned int len, nr_pages;
335 unsigned int bytes, offset, i;
336 unsigned int sectors;
337
338 bi = bdev_get_integrity(bio->bi_bdev);
339 q = bdev_get_queue(bio->bi_bdev);
340 BUG_ON(bi == NULL);
341 BUG_ON(bio_integrity(bio));
342
343 sectors = bio_integrity_hw_sectors(bi, bio_sectors(bio));
344
345 /* Allocate kernel buffer for protection data */
346 len = sectors * blk_integrity_tuple_size(bi);
347 buf = kmalloc(len, GFP_NOIO | __GFP_NOFAIL | q->bounce_gfp);
348 if (unlikely(buf == NULL)) {
349 printk(KERN_ERR "could not allocate integrity buffer\n");
350 return -EIO;
351 }
352
353 end = (((unsigned long) buf) + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
354 start = ((unsigned long) buf) >> PAGE_SHIFT;
355 nr_pages = end - start;
356
357 /* Allocate bio integrity payload and integrity vectors */
358 bip = bio_integrity_alloc(bio, GFP_NOIO, nr_pages);
359 if (unlikely(bip == NULL)) {
360 printk(KERN_ERR "could not allocate data integrity bioset\n");
361 kfree(buf);
362 return -EIO;
363 }
364
365 bip->bip_buf = buf;
366 bip->bip_size = len;
367 bip->bip_sector = bio->bi_sector;
368
369 /* Map it */
370 offset = offset_in_page(buf);
371 for (i = 0 ; i < nr_pages ; i++) {
372 int ret;
373 bytes = PAGE_SIZE - offset;
374
375 if (len <= 0)
376 break;
377
378 if (bytes > len)
379 bytes = len;
380
381 ret = bio_integrity_add_page(bio, virt_to_page(buf),
382 bytes, offset);
383
384 if (ret == 0)
385 return 0;
386
387 if (ret < bytes)
388 break;
389
390 buf += bytes;
391 len -= bytes;
392 offset = 0;
393 }
394
395 /* Install custom I/O completion handler if read verify is enabled */
396 if (bio_data_dir(bio) == READ) {
397 bip->bip_end_io = bio->bi_end_io;
398 bio->bi_end_io = bio_integrity_endio;
399 }
400
401 /* Auto-generate integrity metadata if this is a write */
402 if (bio_data_dir(bio) == WRITE)
403 bio_integrity_generate(bio);
404
405 return 0;
406}
407EXPORT_SYMBOL(bio_integrity_prep);
408
409/**
410 * bio_integrity_verify - Verify integrity metadata for a bio
411 * @bio: bio to verify
412 *
413 * Description: This function is called to verify the integrity of a
414 * bio. The data in the bio io_vec is compared to the integrity
415 * metadata returned by the HBA.
416 */
417static int bio_integrity_verify(struct bio *bio)
418{
419 struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
420 struct blk_integrity_exchg bix;
421 struct bio_vec *bv;
422 sector_t sector = bio->bi_integrity->bip_sector;
423 unsigned int i, sectors, total, ret;
424 void *prot_buf = bio->bi_integrity->bip_buf;
425
426 ret = total = 0;
427 bix.disk_name = bio->bi_bdev->bd_disk->disk_name;
428 bix.sector_size = bi->sector_size;
429
430 bio_for_each_segment(bv, bio, i) {
431 void *kaddr = kmap_atomic(bv->bv_page, KM_USER0);
432 bix.data_buf = kaddr + bv->bv_offset;
433 bix.data_size = bv->bv_len;
434 bix.prot_buf = prot_buf;
435 bix.sector = sector;
436
437 ret = bi->verify_fn(&bix);
438
439 if (ret) {
440 kunmap_atomic(kaddr, KM_USER0);
441 break;
442 }
443
444 sectors = bv->bv_len / bi->sector_size;
445 sector += sectors;
446 prot_buf += sectors * bi->tuple_size;
447 total += sectors * bi->tuple_size;
448 BUG_ON(total > bio->bi_integrity->bip_size);
449
450 kunmap_atomic(kaddr, KM_USER0);
451 }
452
453 return ret;
454}
455
456/**
457 * bio_integrity_verify_fn - Integrity I/O completion worker
458 * @work: Work struct stored in bio to be verified
459 *
460 * Description: This workqueue function is called to complete a READ
461 * request. The function verifies the transferred integrity metadata
462 * and then calls the original bio end_io function.
463 */
464static void bio_integrity_verify_fn(struct work_struct *work)
465{
466 struct bio_integrity_payload *bip =
467 container_of(work, struct bio_integrity_payload, bip_work);
468 struct bio *bio = bip->bip_bio;
469 int error = bip->bip_error;
470
471 if (bio_integrity_verify(bio)) {
472 clear_bit(BIO_UPTODATE, &bio->bi_flags);
473 error = -EIO;
474 }
475
476 /* Restore original bio completion handler */
477 bio->bi_end_io = bip->bip_end_io;
478
479 if (bio->bi_end_io)
480 bio->bi_end_io(bio, error);
481}
482
483/**
484 * bio_integrity_endio - Integrity I/O completion function
485 * @bio: Protected bio
486 * @error: Pointer to errno
487 *
488 * Description: Completion for integrity I/O
489 *
490 * Normally I/O completion is done in interrupt context. However,
491 * verifying I/O integrity is a time-consuming task which must be run
492 * in process context. This function postpones completion
493 * accordingly.
494 */
495void bio_integrity_endio(struct bio *bio, int error)
496{
497 struct bio_integrity_payload *bip = bio->bi_integrity;
498
499 BUG_ON(bip->bip_bio != bio);
500
501 bip->bip_error = error;
502 INIT_WORK(&bip->bip_work, bio_integrity_verify_fn);
503 queue_work(kintegrityd_wq, &bip->bip_work);
504}
505EXPORT_SYMBOL(bio_integrity_endio);
506
507/**
508 * bio_integrity_mark_head - Advance bip_vec skip bytes
509 * @bip: Integrity vector to advance
510 * @skip: Number of bytes to advance it
511 */
512void bio_integrity_mark_head(struct bio_integrity_payload *bip,
513 unsigned int skip)
514{
515 struct bio_vec *iv;
516 unsigned int i;
517
518 bip_for_each_vec(iv, bip, i) {
519 if (skip == 0) {
520 bip->bip_idx = i;
521 return;
522 } else if (skip >= iv->bv_len) {
523 skip -= iv->bv_len;
524 } else { /* skip < iv->bv_len) */
525 iv->bv_offset += skip;
526 iv->bv_len -= skip;
527 bip->bip_idx = i;
528 return;
529 }
530 }
531}
532
533/**
534 * bio_integrity_mark_tail - Truncate bip_vec to be len bytes long
535 * @bip: Integrity vector to truncate
536 * @len: New length of integrity vector
537 */
538void bio_integrity_mark_tail(struct bio_integrity_payload *bip,
539 unsigned int len)
540{
541 struct bio_vec *iv;
542 unsigned int i;
543
544 bip_for_each_vec(iv, bip, i) {
545 if (len == 0) {
546 bip->bip_vcnt = i;
547 return;
548 } else if (len >= iv->bv_len) {
549 len -= iv->bv_len;
550 } else { /* len < iv->bv_len) */
551 iv->bv_len = len;
552 len = 0;
553 }
554 }
555}
556
557/**
558 * bio_integrity_advance - Advance integrity vector
559 * @bio: bio whose integrity vector to update
560 * @bytes_done: number of data bytes that have been completed
561 *
562 * Description: This function calculates how many integrity bytes the
563 * number of completed data bytes correspond to and advances the
564 * integrity vector accordingly.
565 */
566void bio_integrity_advance(struct bio *bio, unsigned int bytes_done)
567{
568 struct bio_integrity_payload *bip = bio->bi_integrity;
569 struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
570 unsigned int nr_sectors;
571
572 BUG_ON(bip == NULL);
573 BUG_ON(bi == NULL);
574
575 nr_sectors = bio_integrity_hw_sectors(bi, bytes_done >> 9);
576 bio_integrity_mark_head(bip, nr_sectors * bi->tuple_size);
577}
578EXPORT_SYMBOL(bio_integrity_advance);
579
580/**
581 * bio_integrity_trim - Trim integrity vector
582 * @bio: bio whose integrity vector to update
583 * @offset: offset to first data sector
584 * @sectors: number of data sectors
585 *
586 * Description: Used to trim the integrity vector in a cloned bio.
587 * The ivec will be advanced corresponding to 'offset' data sectors
588 * and the length will be truncated corresponding to 'len' data
589 * sectors.
590 */
591void bio_integrity_trim(struct bio *bio, unsigned int offset,
592 unsigned int sectors)
593{
594 struct bio_integrity_payload *bip = bio->bi_integrity;
595 struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
596 unsigned int nr_sectors;
597
598 BUG_ON(bip == NULL);
599 BUG_ON(bi == NULL);
600 BUG_ON(!bio_flagged(bio, BIO_CLONED));
601
602 nr_sectors = bio_integrity_hw_sectors(bi, sectors);
603 bip->bip_sector = bip->bip_sector + offset;
604 bio_integrity_mark_head(bip, offset * bi->tuple_size);
605 bio_integrity_mark_tail(bip, sectors * bi->tuple_size);
606}
607EXPORT_SYMBOL(bio_integrity_trim);
608
609/**
610 * bio_integrity_split - Split integrity metadata
611 * @bio: Protected bio
612 * @bp: Resulting bio_pair
613 * @sectors: Offset
614 *
615 * Description: Splits an integrity page into a bio_pair.
616 */
617void bio_integrity_split(struct bio *bio, struct bio_pair *bp, int sectors)
618{
619 struct blk_integrity *bi;
620 struct bio_integrity_payload *bip = bio->bi_integrity;
621 unsigned int nr_sectors;
622
623 if (bio_integrity(bio) == 0)
624 return;
625
626 bi = bdev_get_integrity(bio->bi_bdev);
627 BUG_ON(bi == NULL);
628 BUG_ON(bip->bip_vcnt != 1);
629
630 nr_sectors = bio_integrity_hw_sectors(bi, sectors);
631
632 bp->bio1.bi_integrity = &bp->bip1;
633 bp->bio2.bi_integrity = &bp->bip2;
634
635 bp->iv1 = bip->bip_vec[0];
636 bp->iv2 = bip->bip_vec[0];
637
638 bp->bip1.bip_vec = &bp->iv1;
639 bp->bip2.bip_vec = &bp->iv2;
640
641 bp->iv1.bv_len = sectors * bi->tuple_size;
642 bp->iv2.bv_offset += sectors * bi->tuple_size;
643 bp->iv2.bv_len -= sectors * bi->tuple_size;
644
645 bp->bip1.bip_sector = bio->bi_integrity->bip_sector;
646 bp->bip2.bip_sector = bio->bi_integrity->bip_sector + nr_sectors;
647
648 bp->bip1.bip_vcnt = bp->bip2.bip_vcnt = 1;
649 bp->bip1.bip_idx = bp->bip2.bip_idx = 0;
650}
651EXPORT_SYMBOL(bio_integrity_split);
652
653/**
654 * bio_integrity_clone - Callback for cloning bios with integrity metadata
655 * @bio: New bio
656 * @bio_src: Original bio
657 * @bs: bio_set to allocate bip from
658 *
659 * Description: Called to allocate a bip when cloning a bio
660 */
661int bio_integrity_clone(struct bio *bio, struct bio *bio_src,
662 struct bio_set *bs)
663{
664 struct bio_integrity_payload *bip_src = bio_src->bi_integrity;
665 struct bio_integrity_payload *bip;
666
667 BUG_ON(bip_src == NULL);
668
669 bip = bio_integrity_alloc_bioset(bio, GFP_NOIO, bip_src->bip_vcnt, bs);
670
671 if (bip == NULL)
672 return -EIO;
673
674 memcpy(bip->bip_vec, bip_src->bip_vec,
675 bip_src->bip_vcnt * sizeof(struct bio_vec));
676
677 bip->bip_sector = bip_src->bip_sector;
678 bip->bip_vcnt = bip_src->bip_vcnt;
679 bip->bip_idx = bip_src->bip_idx;
680
681 return 0;
682}
683EXPORT_SYMBOL(bio_integrity_clone);
684
685int bioset_integrity_create(struct bio_set *bs, int pool_size)
686{
687 bs->bio_integrity_pool = mempool_create_slab_pool(pool_size,
688 bio_integrity_slab);
689 if (!bs->bio_integrity_pool)
690 return -1;
691
692 return 0;
693}
694EXPORT_SYMBOL(bioset_integrity_create);
695
696void bioset_integrity_free(struct bio_set *bs)
697{
698 if (bs->bio_integrity_pool)
699 mempool_destroy(bs->bio_integrity_pool);
700}
701EXPORT_SYMBOL(bioset_integrity_free);
702
703void __init bio_integrity_init_slab(void)
704{
705 bio_integrity_slab = KMEM_CACHE(bio_integrity_payload,
706 SLAB_HWCACHE_ALIGN|SLAB_PANIC);
707}
708EXPORT_SYMBOL(bio_integrity_init_slab);
709
710static int __init integrity_init(void)
711{
712 kintegrityd_wq = create_workqueue("kintegrityd");
713
714 if (!kintegrityd_wq)
715 panic("Failed to create kintegrityd\n");
716
717 return 0;
718}
719subsys_initcall(integrity_init);
diff --git a/fs/bio.c b/fs/bio.c
index 78562574cb52..88322b066acb 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -28,25 +28,10 @@
28#include <linux/blktrace_api.h> 28#include <linux/blktrace_api.h>
29#include <scsi/sg.h> /* for struct sg_iovec */ 29#include <scsi/sg.h> /* for struct sg_iovec */
30 30
31#define BIO_POOL_SIZE 2
32
33static struct kmem_cache *bio_slab __read_mostly; 31static struct kmem_cache *bio_slab __read_mostly;
34 32
35#define BIOVEC_NR_POOLS 6
36
37/*
38 * a small number of entries is fine, not going to be performance critical.
39 * basically we just need to survive
40 */
41#define BIO_SPLIT_ENTRIES 2
42mempool_t *bio_split_pool __read_mostly; 33mempool_t *bio_split_pool __read_mostly;
43 34
44struct biovec_slab {
45 int nr_vecs;
46 char *name;
47 struct kmem_cache *slab;
48};
49
50/* 35/*
51 * if you change this list, also change bvec_alloc or things will 36 * if you change this list, also change bvec_alloc or things will
52 * break badly! cannot be bigger than what you can fit into an 37 * break badly! cannot be bigger than what you can fit into an
@@ -60,23 +45,17 @@ static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = {
60#undef BV 45#undef BV
61 46
62/* 47/*
63 * bio_set is used to allow other portions of the IO system to
64 * allocate their own private memory pools for bio and iovec structures.
65 * These memory pools in turn all allocate from the bio_slab
66 * and the bvec_slabs[].
67 */
68struct bio_set {
69 mempool_t *bio_pool;
70 mempool_t *bvec_pools[BIOVEC_NR_POOLS];
71};
72
73/*
74 * fs_bio_set is the bio_set containing bio and iovec memory pools used by 48 * fs_bio_set is the bio_set containing bio and iovec memory pools used by
75 * IO code that does not need private memory pools. 49 * IO code that does not need private memory pools.
76 */ 50 */
77static struct bio_set *fs_bio_set; 51struct bio_set *fs_bio_set;
52
53unsigned int bvec_nr_vecs(unsigned short idx)
54{
55 return bvec_slabs[idx].nr_vecs;
56}
78 57
79static inline struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct bio_set *bs) 58struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct bio_set *bs)
80{ 59{
81 struct bio_vec *bvl; 60 struct bio_vec *bvl;
82 61
@@ -117,6 +96,9 @@ void bio_free(struct bio *bio, struct bio_set *bio_set)
117 mempool_free(bio->bi_io_vec, bio_set->bvec_pools[pool_idx]); 96 mempool_free(bio->bi_io_vec, bio_set->bvec_pools[pool_idx]);
118 } 97 }
119 98
99 if (bio_integrity(bio))
100 bio_integrity_free(bio, bio_set);
101
120 mempool_free(bio, bio_set->bio_pool); 102 mempool_free(bio, bio_set->bio_pool);
121} 103}
122 104
@@ -275,9 +257,19 @@ struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask)
275{ 257{
276 struct bio *b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, fs_bio_set); 258 struct bio *b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, fs_bio_set);
277 259
278 if (b) { 260 if (!b)
279 b->bi_destructor = bio_fs_destructor; 261 return NULL;
280 __bio_clone(b, bio); 262
263 b->bi_destructor = bio_fs_destructor;
264 __bio_clone(b, bio);
265
266 if (bio_integrity(bio)) {
267 int ret;
268
269 ret = bio_integrity_clone(b, bio, fs_bio_set);
270
271 if (ret < 0)
272 return NULL;
281 } 273 }
282 274
283 return b; 275 return b;
@@ -333,10 +325,19 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
333 if (page == prev->bv_page && 325 if (page == prev->bv_page &&
334 offset == prev->bv_offset + prev->bv_len) { 326 offset == prev->bv_offset + prev->bv_len) {
335 prev->bv_len += len; 327 prev->bv_len += len;
336 if (q->merge_bvec_fn && 328
337 q->merge_bvec_fn(q, bio, prev) < len) { 329 if (q->merge_bvec_fn) {
338 prev->bv_len -= len; 330 struct bvec_merge_data bvm = {
339 return 0; 331 .bi_bdev = bio->bi_bdev,
332 .bi_sector = bio->bi_sector,
333 .bi_size = bio->bi_size,
334 .bi_rw = bio->bi_rw,
335 };
336
337 if (q->merge_bvec_fn(q, &bvm, prev) < len) {
338 prev->bv_len -= len;
339 return 0;
340 }
340 } 341 }
341 342
342 goto done; 343 goto done;
@@ -377,11 +378,18 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
377 * queue to get further control 378 * queue to get further control
378 */ 379 */
379 if (q->merge_bvec_fn) { 380 if (q->merge_bvec_fn) {
381 struct bvec_merge_data bvm = {
382 .bi_bdev = bio->bi_bdev,
383 .bi_sector = bio->bi_sector,
384 .bi_size = bio->bi_size,
385 .bi_rw = bio->bi_rw,
386 };
387
380 /* 388 /*
381 * merge_bvec_fn() returns number of bytes it can accept 389 * merge_bvec_fn() returns number of bytes it can accept
382 * at this offset 390 * at this offset
383 */ 391 */
384 if (q->merge_bvec_fn(q, bio, bvec) < len) { 392 if (q->merge_bvec_fn(q, &bvm, bvec) < len) {
385 bvec->bv_page = NULL; 393 bvec->bv_page = NULL;
386 bvec->bv_len = 0; 394 bvec->bv_len = 0;
387 bvec->bv_offset = 0; 395 bvec->bv_offset = 0;
@@ -1249,6 +1257,9 @@ struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors)
1249 bp->bio1.bi_private = bi; 1257 bp->bio1.bi_private = bi;
1250 bp->bio2.bi_private = pool; 1258 bp->bio2.bi_private = pool;
1251 1259
1260 if (bio_integrity(bi))
1261 bio_integrity_split(bi, bp, first_sectors);
1262
1252 return bp; 1263 return bp;
1253} 1264}
1254 1265
@@ -1290,6 +1301,7 @@ void bioset_free(struct bio_set *bs)
1290 if (bs->bio_pool) 1301 if (bs->bio_pool)
1291 mempool_destroy(bs->bio_pool); 1302 mempool_destroy(bs->bio_pool);
1292 1303
1304 bioset_integrity_free(bs);
1293 biovec_free_pools(bs); 1305 biovec_free_pools(bs);
1294 1306
1295 kfree(bs); 1307 kfree(bs);
@@ -1306,6 +1318,9 @@ struct bio_set *bioset_create(int bio_pool_size, int bvec_pool_size)
1306 if (!bs->bio_pool) 1318 if (!bs->bio_pool)
1307 goto bad; 1319 goto bad;
1308 1320
1321 if (bioset_integrity_create(bs, bio_pool_size))
1322 goto bad;
1323
1309 if (!biovec_create_pools(bs, bvec_pool_size)) 1324 if (!biovec_create_pools(bs, bvec_pool_size))
1310 return bs; 1325 return bs;
1311 1326
@@ -1332,6 +1347,7 @@ static int __init init_bio(void)
1332{ 1347{
1333 bio_slab = KMEM_CACHE(bio, SLAB_HWCACHE_ALIGN|SLAB_PANIC); 1348 bio_slab = KMEM_CACHE(bio, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
1334 1349
1350 bio_integrity_init_slab();
1335 biovec_init_slabs(); 1351 biovec_init_slabs();
1336 1352
1337 fs_bio_set = bioset_create(BIO_POOL_SIZE, 2); 1353 fs_bio_set = bioset_create(BIO_POOL_SIZE, 2);
diff --git a/fs/buffer.c b/fs/buffer.c
index 0f51c0f7c266..d48caee12e2a 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1464,7 +1464,7 @@ static void invalidate_bh_lru(void *arg)
1464 1464
1465void invalidate_bh_lrus(void) 1465void invalidate_bh_lrus(void)
1466{ 1466{
1467 on_each_cpu(invalidate_bh_lru, NULL, 1, 1); 1467 on_each_cpu(invalidate_bh_lru, NULL, 1);
1468} 1468}
1469EXPORT_SYMBOL_GPL(invalidate_bh_lrus); 1469EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
1470 1470
@@ -1691,11 +1691,13 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
1691 */ 1691 */
1692 clear_buffer_dirty(bh); 1692 clear_buffer_dirty(bh);
1693 set_buffer_uptodate(bh); 1693 set_buffer_uptodate(bh);
1694 } else if (!buffer_mapped(bh) && buffer_dirty(bh)) { 1694 } else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
1695 buffer_dirty(bh)) {
1695 WARN_ON(bh->b_size != blocksize); 1696 WARN_ON(bh->b_size != blocksize);
1696 err = get_block(inode, block, bh, 1); 1697 err = get_block(inode, block, bh, 1);
1697 if (err) 1698 if (err)
1698 goto recover; 1699 goto recover;
1700 clear_buffer_delay(bh);
1699 if (buffer_new(bh)) { 1701 if (buffer_new(bh)) {
1700 /* blockdev mappings never come here */ 1702 /* blockdev mappings never come here */
1701 clear_buffer_new(bh); 1703 clear_buffer_new(bh);
@@ -1774,7 +1776,8 @@ recover:
1774 bh = head; 1776 bh = head;
1775 /* Recovery: lock and submit the mapped buffers */ 1777 /* Recovery: lock and submit the mapped buffers */
1776 do { 1778 do {
1777 if (buffer_mapped(bh) && buffer_dirty(bh)) { 1779 if (buffer_mapped(bh) && buffer_dirty(bh) &&
1780 !buffer_delay(bh)) {
1778 lock_buffer(bh); 1781 lock_buffer(bh);
1779 mark_buffer_async_write(bh); 1782 mark_buffer_async_write(bh);
1780 } else { 1783 } else {
@@ -2061,6 +2064,7 @@ int generic_write_end(struct file *file, struct address_space *mapping,
2061 struct page *page, void *fsdata) 2064 struct page *page, void *fsdata)
2062{ 2065{
2063 struct inode *inode = mapping->host; 2066 struct inode *inode = mapping->host;
2067 int i_size_changed = 0;
2064 2068
2065 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); 2069 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
2066 2070
@@ -2073,12 +2077,21 @@ int generic_write_end(struct file *file, struct address_space *mapping,
2073 */ 2077 */
2074 if (pos+copied > inode->i_size) { 2078 if (pos+copied > inode->i_size) {
2075 i_size_write(inode, pos+copied); 2079 i_size_write(inode, pos+copied);
2076 mark_inode_dirty(inode); 2080 i_size_changed = 1;
2077 } 2081 }
2078 2082
2079 unlock_page(page); 2083 unlock_page(page);
2080 page_cache_release(page); 2084 page_cache_release(page);
2081 2085
2086 /*
2087 * Don't mark the inode dirty under page lock. First, it unnecessarily
2088 * makes the holding time of page lock longer. Second, it forces lock
2089 * ordering of page lock and transaction start for journaling
2090 * filesystems.
2091 */
2092 if (i_size_changed)
2093 mark_inode_dirty(inode);
2094
2082 return copied; 2095 return copied;
2083} 2096}
2084EXPORT_SYMBOL(generic_write_end); 2097EXPORT_SYMBOL(generic_write_end);
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 68e510b88457..3cb7cda3d780 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -373,6 +373,8 @@ static int chrdev_open(struct inode *inode, struct file *filp)
373 return -ENXIO; 373 return -ENXIO;
374 new = container_of(kobj, struct cdev, kobj); 374 new = container_of(kobj, struct cdev, kobj);
375 spin_lock(&cdev_lock); 375 spin_lock(&cdev_lock);
376 /* Check i_cdev again in case somebody beat us to it while
377 we dropped the lock. */
376 p = inode->i_cdev; 378 p = inode->i_cdev;
377 if (!p) { 379 if (!p) {
378 inode->i_cdev = p = new; 380 inode->i_cdev = p = new;
@@ -392,11 +394,8 @@ static int chrdev_open(struct inode *inode, struct file *filp)
392 cdev_put(p); 394 cdev_put(p);
393 return -ENXIO; 395 return -ENXIO;
394 } 396 }
395 if (filp->f_op->open) { 397 if (filp->f_op->open)
396 lock_kernel();
397 ret = filp->f_op->open(inode,filp); 398 ret = filp->f_op->open(inode,filp);
398 unlock_kernel();
399 }
400 if (ret) 399 if (ret)
401 cdev_put(p); 400 cdev_put(p);
402 return ret; 401 return ret;
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 34902cff5400..0e9fc2ba90ee 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -34,11 +34,11 @@
34static struct cifs_wksid wksidarr[NUM_WK_SIDS] = { 34static struct cifs_wksid wksidarr[NUM_WK_SIDS] = {
35 {{1, 0, {0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0} }, "null user"}, 35 {{1, 0, {0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0} }, "null user"},
36 {{1, 1, {0, 0, 0, 0, 0, 1}, {0, 0, 0, 0, 0} }, "nobody"}, 36 {{1, 1, {0, 0, 0, 0, 0, 1}, {0, 0, 0, 0, 0} }, "nobody"},
37 {{1, 1, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(11), 0, 0, 0, 0} }, "net-users"}, 37 {{1, 1, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(11), 0, 0, 0, 0} }, "net-users"},
38 {{1, 1, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(18), 0, 0, 0, 0} }, "sys"}, 38 {{1, 1, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(18), 0, 0, 0, 0} }, "sys"},
39 {{1, 2, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(32), cpu_to_le32(544), 0, 0, 0} }, "root"}, 39 {{1, 2, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(32), __constant_cpu_to_le32(544), 0, 0, 0} }, "root"},
40 {{1, 2, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(32), cpu_to_le32(545), 0, 0, 0} }, "users"}, 40 {{1, 2, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(32), __constant_cpu_to_le32(545), 0, 0, 0} }, "users"},
41 {{1, 2, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(32), cpu_to_le32(546), 0, 0, 0} }, "guest"} } 41 {{1, 2, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(32), __constant_cpu_to_le32(546), 0, 0, 0} }, "guest"} }
42; 42;
43 43
44 44
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 86b4d5f405ae..22857c639df5 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -612,7 +612,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin)
612 if (retval < 0) 612 if (retval < 0)
613 return (loff_t)retval; 613 return (loff_t)retval;
614 } 614 }
615 return remote_llseek(file, offset, origin); 615 return generic_file_llseek_unlocked(file, offset, origin);
616} 616}
617 617
618struct file_system_type cifs_fs_type = { 618struct file_system_type cifs_fs_type = {
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 722be543ceec..2e904bd111c8 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -219,15 +219,15 @@ int cifs_get_inode_info_unix(struct inode **pinode,
219 rc = CIFSSMBUnixQPathInfo(xid, pTcon, full_path, &find_data, 219 rc = CIFSSMBUnixQPathInfo(xid, pTcon, full_path, &find_data,
220 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & 220 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
221 CIFS_MOUNT_MAP_SPECIAL_CHR); 221 CIFS_MOUNT_MAP_SPECIAL_CHR);
222 if (rc) { 222 if (rc == -EREMOTE && !is_dfs_referral) {
223 if (rc == -EREMOTE && !is_dfs_referral) { 223 is_dfs_referral = true;
224 is_dfs_referral = true; 224 cFYI(DBG2, ("DFS ref"));
225 cFYI(DBG2, ("DFS ref")); 225 /* for DFS, server does not give us real inode data */
226 /* for DFS, server does not give us real inode data */ 226 fill_fake_finddataunix(&find_data, sb);
227 fill_fake_finddataunix(&find_data, sb); 227 rc = 0;
228 rc = 0; 228 } else if (rc)
229 } 229 goto cgiiu_exit;
230 } 230
231 num_of_bytes = le64_to_cpu(find_data.NumOfBytes); 231 num_of_bytes = le64_to_cpu(find_data.NumOfBytes);
232 end_of_file = le64_to_cpu(find_data.EndOfFile); 232 end_of_file = le64_to_cpu(find_data.EndOfFile);
233 233
@@ -236,7 +236,7 @@ int cifs_get_inode_info_unix(struct inode **pinode,
236 *pinode = new_inode(sb); 236 *pinode = new_inode(sb);
237 if (*pinode == NULL) { 237 if (*pinode == NULL) {
238 rc = -ENOMEM; 238 rc = -ENOMEM;
239 goto cgiiu_exit; 239 goto cgiiu_exit;
240 } 240 }
241 /* Is an i_ino of zero legal? */ 241 /* Is an i_ino of zero legal? */
242 /* note ino incremented to unique num in new_inode */ 242 /* note ino incremented to unique num in new_inode */
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index ebbcf38fd33b..f976f303c196 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -15,6 +15,7 @@
15#include <linux/poll.h> 15#include <linux/poll.h>
16#include <linux/signal.h> 16#include <linux/signal.h>
17#include <linux/spinlock.h> 17#include <linux/spinlock.h>
18#include <linux/smp_lock.h>
18#include <linux/dlm.h> 19#include <linux/dlm.h>
19#include <linux/dlm_device.h> 20#include <linux/dlm_device.h>
20 21
@@ -618,13 +619,17 @@ static int device_open(struct inode *inode, struct file *file)
618 struct dlm_user_proc *proc; 619 struct dlm_user_proc *proc;
619 struct dlm_ls *ls; 620 struct dlm_ls *ls;
620 621
622 lock_kernel();
621 ls = dlm_find_lockspace_device(iminor(inode)); 623 ls = dlm_find_lockspace_device(iminor(inode));
622 if (!ls) 624 if (!ls) {
625 unlock_kernel();
623 return -ENOENT; 626 return -ENOENT;
627 }
624 628
625 proc = kzalloc(sizeof(struct dlm_user_proc), GFP_KERNEL); 629 proc = kzalloc(sizeof(struct dlm_user_proc), GFP_KERNEL);
626 if (!proc) { 630 if (!proc) {
627 dlm_put_lockspace(ls); 631 dlm_put_lockspace(ls);
632 unlock_kernel();
628 return -ENOMEM; 633 return -ENOMEM;
629 } 634 }
630 635
@@ -636,6 +641,7 @@ static int device_open(struct inode *inode, struct file *file)
636 spin_lock_init(&proc->locks_spin); 641 spin_lock_init(&proc->locks_spin);
637 init_waitqueue_head(&proc->wait); 642 init_waitqueue_head(&proc->wait);
638 file->private_data = proc; 643 file->private_data = proc;
644 unlock_kernel();
639 645
640 return 0; 646 return 0;
641} 647}
@@ -870,6 +876,7 @@ static unsigned int device_poll(struct file *file, poll_table *wait)
870 876
871static int ctl_device_open(struct inode *inode, struct file *file) 877static int ctl_device_open(struct inode *inode, struct file *file)
872{ 878{
879 cycle_kernel_lock();
873 file->private_data = NULL; 880 file->private_data = NULL;
874 return 0; 881 return 0;
875} 882}
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 2258b8f654a6..24749bf0668f 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -30,6 +30,7 @@
30#include <linux/security.h> 30#include <linux/security.h>
31#include <linux/compat.h> 31#include <linux/compat.h>
32#include <linux/fs_stack.h> 32#include <linux/fs_stack.h>
33#include <linux/smp_lock.h>
33#include "ecryptfs_kernel.h" 34#include "ecryptfs_kernel.h"
34 35
35/** 36/**
@@ -277,9 +278,11 @@ static int ecryptfs_fasync(int fd, struct file *file, int flag)
277 int rc = 0; 278 int rc = 0;
278 struct file *lower_file = NULL; 279 struct file *lower_file = NULL;
279 280
281 lock_kernel();
280 lower_file = ecryptfs_file_to_lower(file); 282 lower_file = ecryptfs_file_to_lower(file);
281 if (lower_file->f_op && lower_file->f_op->fasync) 283 if (lower_file->f_op && lower_file->f_op->fasync)
282 rc = lower_file->f_op->fasync(fd, lower_file, flag); 284 rc = lower_file->f_op->fasync(fd, lower_file, flag);
285 unlock_kernel();
283 return rc; 286 return rc;
284} 287}
285 288
diff --git a/fs/exec.c b/fs/exec.c
index da94a6f05df3..fd9234379e8d 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -610,7 +610,7 @@ int setup_arg_pages(struct linux_binprm *bprm,
610 bprm->exec -= stack_shift; 610 bprm->exec -= stack_shift;
611 611
612 down_write(&mm->mmap_sem); 612 down_write(&mm->mmap_sem);
613 vm_flags = vma->vm_flags; 613 vm_flags = VM_STACK_FLAGS;
614 614
615 /* 615 /*
616 * Adjust stack execute permissions; explicitly enable for 616 * Adjust stack execute permissions; explicitly enable for
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 9cc80b9cc8d8..495ab21b9832 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -47,7 +47,7 @@ static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block,
47 ext4_group_t block_group) 47 ext4_group_t block_group)
48{ 48{
49 ext4_group_t actual_group; 49 ext4_group_t actual_group;
50 ext4_get_group_no_and_offset(sb, block, &actual_group, 0); 50 ext4_get_group_no_and_offset(sb, block, &actual_group, NULL);
51 if (actual_group == block_group) 51 if (actual_group == block_group)
52 return 1; 52 return 1;
53 return 0; 53 return 0;
@@ -121,12 +121,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
121 le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks); 121 le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks);
122 } 122 }
123 } else { /* For META_BG_BLOCK_GROUPS */ 123 } else { /* For META_BG_BLOCK_GROUPS */
124 int group_rel = (block_group - 124 bit_max += ext4_bg_num_gdb(sb, block_group);
125 le32_to_cpu(sbi->s_es->s_first_meta_bg)) %
126 EXT4_DESC_PER_BLOCK(sb);
127 if (group_rel == 0 || group_rel == 1 ||
128 (group_rel == EXT4_DESC_PER_BLOCK(sb) - 1))
129 bit_max += 1;
130 } 125 }
131 126
132 if (block_group == sbi->s_groups_count - 1) { 127 if (block_group == sbi->s_groups_count - 1) {
@@ -295,7 +290,7 @@ err_out:
295 return 0; 290 return 0;
296} 291}
297/** 292/**
298 * read_block_bitmap() 293 * ext4_read_block_bitmap()
299 * @sb: super block 294 * @sb: super block
300 * @block_group: given block group 295 * @block_group: given block group
301 * 296 *
@@ -305,7 +300,7 @@ err_out:
305 * Return buffer_head on success or NULL in case of failure. 300 * Return buffer_head on success or NULL in case of failure.
306 */ 301 */
307struct buffer_head * 302struct buffer_head *
308read_block_bitmap(struct super_block *sb, ext4_group_t block_group) 303ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
309{ 304{
310 struct ext4_group_desc * desc; 305 struct ext4_group_desc * desc;
311 struct buffer_head * bh = NULL; 306 struct buffer_head * bh = NULL;
@@ -409,8 +404,7 @@ restart:
409 prev = rsv; 404 prev = rsv;
410 } 405 }
411 printk("Window map complete.\n"); 406 printk("Window map complete.\n");
412 if (bad) 407 BUG_ON(bad);
413 BUG();
414} 408}
415#define rsv_window_dump(root, verbose) \ 409#define rsv_window_dump(root, verbose) \
416 __rsv_window_dump((root), (verbose), __func__) 410 __rsv_window_dump((root), (verbose), __func__)
@@ -694,7 +688,7 @@ do_more:
694 count -= overflow; 688 count -= overflow;
695 } 689 }
696 brelse(bitmap_bh); 690 brelse(bitmap_bh);
697 bitmap_bh = read_block_bitmap(sb, block_group); 691 bitmap_bh = ext4_read_block_bitmap(sb, block_group);
698 if (!bitmap_bh) 692 if (!bitmap_bh)
699 goto error_return; 693 goto error_return;
700 desc = ext4_get_group_desc (sb, block_group, &gd_bh); 694 desc = ext4_get_group_desc (sb, block_group, &gd_bh);
@@ -810,6 +804,13 @@ do_more:
810 spin_unlock(sb_bgl_lock(sbi, block_group)); 804 spin_unlock(sb_bgl_lock(sbi, block_group));
811 percpu_counter_add(&sbi->s_freeblocks_counter, count); 805 percpu_counter_add(&sbi->s_freeblocks_counter, count);
812 806
807 if (sbi->s_log_groups_per_flex) {
808 ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
809 spin_lock(sb_bgl_lock(sbi, flex_group));
810 sbi->s_flex_groups[flex_group].free_blocks += count;
811 spin_unlock(sb_bgl_lock(sbi, flex_group));
812 }
813
813 /* We dirtied the bitmap block */ 814 /* We dirtied the bitmap block */
814 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); 815 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
815 err = ext4_journal_dirty_metadata(handle, bitmap_bh); 816 err = ext4_journal_dirty_metadata(handle, bitmap_bh);
@@ -1598,23 +1599,35 @@ out:
1598 1599
1599/** 1600/**
1600 * ext4_has_free_blocks() 1601 * ext4_has_free_blocks()
1601 * @sbi: in-core super block structure. 1602 * @sbi: in-core super block structure.
1603 * @nblocks: number of needed blocks
1602 * 1604 *
1603 * Check if filesystem has at least 1 free block available for allocation. 1605 * Check if filesystem has free blocks available for allocation.
1606 * Return the number of blocks available for allocation for this request
1607 * On success, return nblocks
1604 */ 1608 */
1605static int ext4_has_free_blocks(struct ext4_sb_info *sbi) 1609ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
1610 ext4_fsblk_t nblocks)
1606{ 1611{
1607 ext4_fsblk_t free_blocks, root_blocks; 1612 ext4_fsblk_t free_blocks;
1613 ext4_fsblk_t root_blocks = 0;
1608 1614
1609 free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); 1615 free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
1610 root_blocks = ext4_r_blocks_count(sbi->s_es); 1616
1611 if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) && 1617 if (!capable(CAP_SYS_RESOURCE) &&
1612 sbi->s_resuid != current->fsuid && 1618 sbi->s_resuid != current->fsuid &&
1613 (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) { 1619 (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
1614 return 0; 1620 root_blocks = ext4_r_blocks_count(sbi->s_es);
1615 } 1621#ifdef CONFIG_SMP
1616 return 1; 1622 if (free_blocks - root_blocks < FBC_BATCH)
1617} 1623 free_blocks =
1624 percpu_counter_sum_and_set(&sbi->s_freeblocks_counter);
1625#endif
1626 if (free_blocks - root_blocks < nblocks)
1627 return free_blocks - root_blocks;
1628 return nblocks;
1629 }
1630
1618 1631
1619/** 1632/**
1620 * ext4_should_retry_alloc() 1633 * ext4_should_retry_alloc()
@@ -1630,7 +1643,7 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi)
1630 */ 1643 */
1631int ext4_should_retry_alloc(struct super_block *sb, int *retries) 1644int ext4_should_retry_alloc(struct super_block *sb, int *retries)
1632{ 1645{
1633 if (!ext4_has_free_blocks(EXT4_SB(sb)) || (*retries)++ > 3) 1646 if (!ext4_has_free_blocks(EXT4_SB(sb), 1) || (*retries)++ > 3)
1634 return 0; 1647 return 0;
1635 1648
1636 jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id); 1649 jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
@@ -1639,20 +1652,24 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
1639} 1652}
1640 1653
1641/** 1654/**
1642 * ext4_new_blocks_old() -- core block(s) allocation function 1655 * ext4_old_new_blocks() -- core block bitmap based block allocation function
1656 *
1643 * @handle: handle to this transaction 1657 * @handle: handle to this transaction
1644 * @inode: file inode 1658 * @inode: file inode
1645 * @goal: given target block(filesystem wide) 1659 * @goal: given target block(filesystem wide)
1646 * @count: target number of blocks to allocate 1660 * @count: target number of blocks to allocate
1647 * @errp: error code 1661 * @errp: error code
1648 * 1662 *
1649 * ext4_new_blocks uses a goal block to assist allocation. It tries to 1663 * ext4_old_new_blocks uses a goal block to assist allocation and look up
1650 * allocate block(s) from the block group contains the goal block first. If that 1664 * the block bitmap directly to do block allocation. It tries to
1651 * fails, it will try to allocate block(s) from other block groups without 1665 * allocate block(s) from the block group contains the goal block first. If
1652 * any specific goal block. 1666 * that fails, it will try to allocate block(s) from other block groups
1667 * without any specific goal block.
1668 *
1669 * This function is called when -o nomballoc mount option is enabled
1653 * 1670 *
1654 */ 1671 */
1655ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode, 1672ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
1656 ext4_fsblk_t goal, unsigned long *count, int *errp) 1673 ext4_fsblk_t goal, unsigned long *count, int *errp)
1657{ 1674{
1658 struct buffer_head *bitmap_bh = NULL; 1675 struct buffer_head *bitmap_bh = NULL;
@@ -1676,13 +1693,26 @@ ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
1676 ext4_group_t ngroups; 1693 ext4_group_t ngroups;
1677 unsigned long num = *count; 1694 unsigned long num = *count;
1678 1695
1679 *errp = -ENOSPC;
1680 sb = inode->i_sb; 1696 sb = inode->i_sb;
1681 if (!sb) { 1697 if (!sb) {
1698 *errp = -ENODEV;
1682 printk("ext4_new_block: nonexistent device"); 1699 printk("ext4_new_block: nonexistent device");
1683 return 0; 1700 return 0;
1684 } 1701 }
1685 1702
1703 sbi = EXT4_SB(sb);
1704 if (!EXT4_I(inode)->i_delalloc_reserved_flag) {
1705 /*
1706 * With delalloc we already reserved the blocks
1707 */
1708 *count = ext4_has_free_blocks(sbi, *count);
1709 }
1710 if (*count == 0) {
1711 *errp = -ENOSPC;
1712 return 0; /*return with ENOSPC error */
1713 }
1714 num = *count;
1715
1686 /* 1716 /*
1687 * Check quota for allocation of this block. 1717 * Check quota for allocation of this block.
1688 */ 1718 */
@@ -1706,11 +1736,6 @@ ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
1706 if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0)) 1736 if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0))
1707 my_rsv = &block_i->rsv_window_node; 1737 my_rsv = &block_i->rsv_window_node;
1708 1738
1709 if (!ext4_has_free_blocks(sbi)) {
1710 *errp = -ENOSPC;
1711 goto out;
1712 }
1713
1714 /* 1739 /*
1715 * First, test whether the goal block is free. 1740 * First, test whether the goal block is free.
1716 */ 1741 */
@@ -1734,7 +1759,7 @@ retry_alloc:
1734 my_rsv = NULL; 1759 my_rsv = NULL;
1735 1760
1736 if (free_blocks > 0) { 1761 if (free_blocks > 0) {
1737 bitmap_bh = read_block_bitmap(sb, group_no); 1762 bitmap_bh = ext4_read_block_bitmap(sb, group_no);
1738 if (!bitmap_bh) 1763 if (!bitmap_bh)
1739 goto io_error; 1764 goto io_error;
1740 grp_alloc_blk = ext4_try_to_allocate_with_rsv(sb, handle, 1765 grp_alloc_blk = ext4_try_to_allocate_with_rsv(sb, handle,
@@ -1770,7 +1795,7 @@ retry_alloc:
1770 continue; 1795 continue;
1771 1796
1772 brelse(bitmap_bh); 1797 brelse(bitmap_bh);
1773 bitmap_bh = read_block_bitmap(sb, group_no); 1798 bitmap_bh = ext4_read_block_bitmap(sb, group_no);
1774 if (!bitmap_bh) 1799 if (!bitmap_bh)
1775 goto io_error; 1800 goto io_error;
1776 /* 1801 /*
@@ -1882,7 +1907,15 @@ allocated:
1882 le16_add_cpu(&gdp->bg_free_blocks_count, -num); 1907 le16_add_cpu(&gdp->bg_free_blocks_count, -num);
1883 gdp->bg_checksum = ext4_group_desc_csum(sbi, group_no, gdp); 1908 gdp->bg_checksum = ext4_group_desc_csum(sbi, group_no, gdp);
1884 spin_unlock(sb_bgl_lock(sbi, group_no)); 1909 spin_unlock(sb_bgl_lock(sbi, group_no));
1885 percpu_counter_sub(&sbi->s_freeblocks_counter, num); 1910 if (!EXT4_I(inode)->i_delalloc_reserved_flag)
1911 percpu_counter_sub(&sbi->s_freeblocks_counter, num);
1912
1913 if (sbi->s_log_groups_per_flex) {
1914 ext4_group_t flex_group = ext4_flex_group(sbi, group_no);
1915 spin_lock(sb_bgl_lock(sbi, flex_group));
1916 sbi->s_flex_groups[flex_group].free_blocks -= num;
1917 spin_unlock(sb_bgl_lock(sbi, flex_group));
1918 }
1886 1919
1887 BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor"); 1920 BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
1888 err = ext4_journal_dirty_metadata(handle, gdp_bh); 1921 err = ext4_journal_dirty_metadata(handle, gdp_bh);
@@ -1915,46 +1948,104 @@ out:
1915 return 0; 1948 return 0;
1916} 1949}
1917 1950
1918ext4_fsblk_t ext4_new_block(handle_t *handle, struct inode *inode, 1951#define EXT4_META_BLOCK 0x1
1919 ext4_fsblk_t goal, int *errp) 1952
1953static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode,
1954 ext4_lblk_t iblock, ext4_fsblk_t goal,
1955 unsigned long *count, int *errp, int flags)
1920{ 1956{
1921 struct ext4_allocation_request ar; 1957 struct ext4_allocation_request ar;
1922 ext4_fsblk_t ret; 1958 ext4_fsblk_t ret;
1923 1959
1924 if (!test_opt(inode->i_sb, MBALLOC)) { 1960 if (!test_opt(inode->i_sb, MBALLOC)) {
1925 unsigned long count = 1; 1961 return ext4_old_new_blocks(handle, inode, goal, count, errp);
1926 ret = ext4_new_blocks_old(handle, inode, goal, &count, errp);
1927 return ret;
1928 } 1962 }
1929 1963
1930 memset(&ar, 0, sizeof(ar)); 1964 memset(&ar, 0, sizeof(ar));
1965 /* Fill with neighbour allocated blocks */
1966
1931 ar.inode = inode; 1967 ar.inode = inode;
1932 ar.goal = goal; 1968 ar.goal = goal;
1933 ar.len = 1; 1969 ar.len = *count;
1970 ar.logical = iblock;
1971
1972 if (S_ISREG(inode->i_mode) && !(flags & EXT4_META_BLOCK))
1973 /* enable in-core preallocation for data block allocation */
1974 ar.flags = EXT4_MB_HINT_DATA;
1975 else
1976 /* disable in-core preallocation for non-regular files */
1977 ar.flags = 0;
1978
1934 ret = ext4_mb_new_blocks(handle, &ar, errp); 1979 ret = ext4_mb_new_blocks(handle, &ar, errp);
1980 *count = ar.len;
1935 return ret; 1981 return ret;
1936} 1982}
1937 1983
1938ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode, 1984/*
1985 * ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks
1986 *
1987 * @handle: handle to this transaction
1988 * @inode: file inode
1989 * @goal: given target block(filesystem wide)
1990 * @count: total number of blocks need
1991 * @errp: error code
1992 *
1993 * Return 1st allocated block number on success, *count stores total account
1994 * error stores in errp pointer
1995 */
1996ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
1939 ext4_fsblk_t goal, unsigned long *count, int *errp) 1997 ext4_fsblk_t goal, unsigned long *count, int *errp)
1940{ 1998{
1941 struct ext4_allocation_request ar;
1942 ext4_fsblk_t ret; 1999 ext4_fsblk_t ret;
1943 2000 ret = do_blk_alloc(handle, inode, 0, goal,
1944 if (!test_opt(inode->i_sb, MBALLOC)) { 2001 count, errp, EXT4_META_BLOCK);
1945 ret = ext4_new_blocks_old(handle, inode, goal, count, errp); 2002 /*
1946 return ret; 2003 * Account for the allocated meta blocks
2004 */
2005 if (!(*errp)) {
2006 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
2007 EXT4_I(inode)->i_allocated_meta_blocks += *count;
2008 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1947 } 2009 }
1948
1949 memset(&ar, 0, sizeof(ar));
1950 ar.inode = inode;
1951 ar.goal = goal;
1952 ar.len = *count;
1953 ret = ext4_mb_new_blocks(handle, &ar, errp);
1954 *count = ar.len;
1955 return ret; 2010 return ret;
1956} 2011}
1957 2012
2013/*
2014 * ext4_new_meta_block() -- allocate block for meta data (indexing) blocks
2015 *
2016 * @handle: handle to this transaction
2017 * @inode: file inode
2018 * @goal: given target block(filesystem wide)
2019 * @errp: error code
2020 *
2021 * Return allocated block number on success
2022 */
2023ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode,
2024 ext4_fsblk_t goal, int *errp)
2025{
2026 unsigned long count = 1;
2027 return ext4_new_meta_blocks(handle, inode, goal, &count, errp);
2028}
2029
2030/*
2031 * ext4_new_blocks() -- allocate data blocks
2032 *
2033 * @handle: handle to this transaction
2034 * @inode: file inode
2035 * @goal: given target block(filesystem wide)
2036 * @count: total number of blocks need
2037 * @errp: error code
2038 *
2039 * Return 1st allocated block number on success, *count stores total account
2040 * error stores in errp pointer
2041 */
2042
2043ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
2044 ext4_lblk_t iblock, ext4_fsblk_t goal,
2045 unsigned long *count, int *errp)
2046{
2047 return do_blk_alloc(handle, inode, iblock, goal, count, errp, 0);
2048}
1958 2049
1959/** 2050/**
1960 * ext4_count_free_blocks() -- count filesystem free blocks 2051 * ext4_count_free_blocks() -- count filesystem free blocks
@@ -1986,7 +2077,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
1986 continue; 2077 continue;
1987 desc_count += le16_to_cpu(gdp->bg_free_blocks_count); 2078 desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
1988 brelse(bitmap_bh); 2079 brelse(bitmap_bh);
1989 bitmap_bh = read_block_bitmap(sb, i); 2080 bitmap_bh = ext4_read_block_bitmap(sb, i);
1990 if (bitmap_bh == NULL) 2081 if (bitmap_bh == NULL)
1991 continue; 2082 continue;
1992 2083
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 2bf0331ea194..d3d23d73c08b 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -129,7 +129,8 @@ static int ext4_readdir(struct file * filp,
129 struct buffer_head *bh = NULL; 129 struct buffer_head *bh = NULL;
130 130
131 map_bh.b_state = 0; 131 map_bh.b_state = 0;
132 err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh, 0, 0); 132 err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh,
133 0, 0, 0);
133 if (err > 0) { 134 if (err > 0) {
134 pgoff_t index = map_bh.b_blocknr >> 135 pgoff_t index = map_bh.b_blocknr >>
135 (PAGE_CACHE_SHIFT - inode->i_blkbits); 136 (PAGE_CACHE_SHIFT - inode->i_blkbits);
@@ -272,7 +273,7 @@ static void free_rb_tree_fname(struct rb_root *root)
272 273
273 while (n) { 274 while (n) {
274 /* Do the node's children first */ 275 /* Do the node's children first */
275 if ((n)->rb_left) { 276 if (n->rb_left) {
276 n = n->rb_left; 277 n = n->rb_left;
277 continue; 278 continue;
278 } 279 }
@@ -301,24 +302,18 @@ static void free_rb_tree_fname(struct rb_root *root)
301 parent->rb_right = NULL; 302 parent->rb_right = NULL;
302 n = parent; 303 n = parent;
303 } 304 }
304 root->rb_node = NULL;
305} 305}
306 306
307 307
308static struct dir_private_info *create_dir_info(loff_t pos) 308static struct dir_private_info *ext4_htree_create_dir_info(loff_t pos)
309{ 309{
310 struct dir_private_info *p; 310 struct dir_private_info *p;
311 311
312 p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL); 312 p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL);
313 if (!p) 313 if (!p)
314 return NULL; 314 return NULL;
315 p->root.rb_node = NULL;
316 p->curr_node = NULL;
317 p->extra_fname = NULL;
318 p->last_pos = 0;
319 p->curr_hash = pos2maj_hash(pos); 315 p->curr_hash = pos2maj_hash(pos);
320 p->curr_minor_hash = pos2min_hash(pos); 316 p->curr_minor_hash = pos2min_hash(pos);
321 p->next_hash = 0;
322 return p; 317 return p;
323} 318}
324 319
@@ -433,7 +428,7 @@ static int ext4_dx_readdir(struct file * filp,
433 int ret; 428 int ret;
434 429
435 if (!info) { 430 if (!info) {
436 info = create_dir_info(filp->f_pos); 431 info = ext4_htree_create_dir_info(filp->f_pos);
437 if (!info) 432 if (!info)
438 return -ENOMEM; 433 return -ENOMEM;
439 filp->private_data = info; 434 filp->private_data = info;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 8158083f7ac0..303e41cf7b14 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -22,7 +22,7 @@
22#include "ext4_i.h" 22#include "ext4_i.h"
23 23
24/* 24/*
25 * The second extended filesystem constants/structures 25 * The fourth extended filesystem constants/structures
26 */ 26 */
27 27
28/* 28/*
@@ -45,7 +45,7 @@
45#define ext4_debug(f, a...) \ 45#define ext4_debug(f, a...) \
46 do { \ 46 do { \
47 printk (KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:", \ 47 printk (KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:", \
48 __FILE__, __LINE__, __FUNCTION__); \ 48 __FILE__, __LINE__, __func__); \
49 printk (KERN_DEBUG f, ## a); \ 49 printk (KERN_DEBUG f, ## a); \
50 } while (0) 50 } while (0)
51#else 51#else
@@ -74,6 +74,9 @@
74#define EXT4_MB_HINT_GOAL_ONLY 256 74#define EXT4_MB_HINT_GOAL_ONLY 256
75/* goal is meaningful */ 75/* goal is meaningful */
76#define EXT4_MB_HINT_TRY_GOAL 512 76#define EXT4_MB_HINT_TRY_GOAL 512
77/* blocks already pre-reserved by delayed allocation */
78#define EXT4_MB_DELALLOC_RESERVED 1024
79
77 80
78struct ext4_allocation_request { 81struct ext4_allocation_request {
79 /* target inode for block we're allocating */ 82 /* target inode for block we're allocating */
@@ -170,6 +173,15 @@ struct ext4_group_desc
170 __u32 bg_reserved2[3]; 173 __u32 bg_reserved2[3];
171}; 174};
172 175
176/*
177 * Structure of a flex block group info
178 */
179
180struct flex_groups {
181 __u32 free_inodes;
182 __u32 free_blocks;
183};
184
173#define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */ 185#define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */
174#define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */ 186#define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */
175#define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */ 187#define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */
@@ -527,6 +539,7 @@ do { \
527#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ 539#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
528#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ 540#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */
529#define EXT4_MOUNT_MBALLOC 0x4000000 /* Buddy allocation support */ 541#define EXT4_MOUNT_MBALLOC 0x4000000 /* Buddy allocation support */
542#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
530/* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */ 543/* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */
531#ifndef _LINUX_EXT2_FS_H 544#ifndef _LINUX_EXT2_FS_H
532#define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt 545#define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt
@@ -647,7 +660,10 @@ struct ext4_super_block {
647 __le16 s_mmp_interval; /* # seconds to wait in MMP checking */ 660 __le16 s_mmp_interval; /* # seconds to wait in MMP checking */
648 __le64 s_mmp_block; /* Block for multi-mount protection */ 661 __le64 s_mmp_block; /* Block for multi-mount protection */
649 __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ 662 __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/
650 __u32 s_reserved[163]; /* Padding to the end of the block */ 663 __u8 s_log_groups_per_flex; /* FLEX_BG group size */
664 __u8 s_reserved_char_pad2;
665 __le16 s_reserved_pad;
666 __u32 s_reserved[162]; /* Padding to the end of the block */
651}; 667};
652 668
653#ifdef __KERNEL__ 669#ifdef __KERNEL__
@@ -958,12 +974,17 @@ extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb,
958extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group); 974extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group);
959extern unsigned long ext4_bg_num_gdb(struct super_block *sb, 975extern unsigned long ext4_bg_num_gdb(struct super_block *sb,
960 ext4_group_t group); 976 ext4_group_t group);
961extern ext4_fsblk_t ext4_new_block (handle_t *handle, struct inode *inode, 977extern ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode,
962 ext4_fsblk_t goal, int *errp); 978 ext4_fsblk_t goal, int *errp);
963extern ext4_fsblk_t ext4_new_blocks (handle_t *handle, struct inode *inode, 979extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
964 ext4_fsblk_t goal, unsigned long *count, int *errp); 980 ext4_fsblk_t goal, unsigned long *count, int *errp);
965extern ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode, 981extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
982 ext4_lblk_t iblock, ext4_fsblk_t goal,
983 unsigned long *count, int *errp);
984extern ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
966 ext4_fsblk_t goal, unsigned long *count, int *errp); 985 ext4_fsblk_t goal, unsigned long *count, int *errp);
986extern ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
987 ext4_fsblk_t nblocks);
967extern void ext4_free_blocks (handle_t *handle, struct inode *inode, 988extern void ext4_free_blocks (handle_t *handle, struct inode *inode,
968 ext4_fsblk_t block, unsigned long count, int metadata); 989 ext4_fsblk_t block, unsigned long count, int metadata);
969extern void ext4_free_blocks_sb (handle_t *handle, struct super_block *sb, 990extern void ext4_free_blocks_sb (handle_t *handle, struct super_block *sb,
@@ -1016,9 +1037,14 @@ extern int __init init_ext4_mballoc(void);
1016extern void exit_ext4_mballoc(void); 1037extern void exit_ext4_mballoc(void);
1017extern void ext4_mb_free_blocks(handle_t *, struct inode *, 1038extern void ext4_mb_free_blocks(handle_t *, struct inode *,
1018 unsigned long, unsigned long, int, unsigned long *); 1039 unsigned long, unsigned long, int, unsigned long *);
1040extern int ext4_mb_add_more_groupinfo(struct super_block *sb,
1041 ext4_group_t i, struct ext4_group_desc *desc);
1042extern void ext4_mb_update_group_info(struct ext4_group_info *grp,
1043 ext4_grpblk_t add);
1019 1044
1020 1045
1021/* inode.c */ 1046/* inode.c */
1047void ext4_da_release_space(struct inode *inode, int used, int to_free);
1022int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, 1048int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
1023 struct buffer_head *bh, ext4_fsblk_t blocknr); 1049 struct buffer_head *bh, ext4_fsblk_t blocknr);
1024struct buffer_head *ext4_getblk(handle_t *, struct inode *, 1050struct buffer_head *ext4_getblk(handle_t *, struct inode *,
@@ -1033,19 +1059,23 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
1033extern struct inode *ext4_iget(struct super_block *, unsigned long); 1059extern struct inode *ext4_iget(struct super_block *, unsigned long);
1034extern int ext4_write_inode (struct inode *, int); 1060extern int ext4_write_inode (struct inode *, int);
1035extern int ext4_setattr (struct dentry *, struct iattr *); 1061extern int ext4_setattr (struct dentry *, struct iattr *);
1062extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
1063 struct kstat *stat);
1036extern void ext4_delete_inode (struct inode *); 1064extern void ext4_delete_inode (struct inode *);
1037extern int ext4_sync_inode (handle_t *, struct inode *); 1065extern int ext4_sync_inode (handle_t *, struct inode *);
1038extern void ext4_discard_reservation (struct inode *); 1066extern void ext4_discard_reservation (struct inode *);
1039extern void ext4_dirty_inode(struct inode *); 1067extern void ext4_dirty_inode(struct inode *);
1040extern int ext4_change_inode_journal_flag(struct inode *, int); 1068extern int ext4_change_inode_journal_flag(struct inode *, int);
1041extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); 1069extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
1070extern int ext4_can_truncate(struct inode *inode);
1042extern void ext4_truncate (struct inode *); 1071extern void ext4_truncate (struct inode *);
1043extern void ext4_set_inode_flags(struct inode *); 1072extern void ext4_set_inode_flags(struct inode *);
1044extern void ext4_get_inode_flags(struct ext4_inode_info *); 1073extern void ext4_get_inode_flags(struct ext4_inode_info *);
1045extern void ext4_set_aops(struct inode *inode); 1074extern void ext4_set_aops(struct inode *inode);
1046extern int ext4_writepage_trans_blocks(struct inode *); 1075extern int ext4_writepage_trans_blocks(struct inode *);
1047extern int ext4_block_truncate_page(handle_t *handle, struct page *page, 1076extern int ext4_block_truncate_page(handle_t *handle,
1048 struct address_space *mapping, loff_t from); 1077 struct address_space *mapping, loff_t from);
1078extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page);
1049 1079
1050/* ioctl.c */ 1080/* ioctl.c */
1051extern long ext4_ioctl(struct file *, unsigned int, unsigned long); 1081extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
@@ -1159,10 +1189,21 @@ struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
1159} 1189}
1160 1190
1161 1191
1192static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi,
1193 ext4_group_t block_group)
1194{
1195 return block_group >> sbi->s_log_groups_per_flex;
1196}
1197
1198static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi)
1199{
1200 return 1 << sbi->s_log_groups_per_flex;
1201}
1202
/*
 * Hand a non-zero error code to __ext4_std_error(), tagging the report
 * with the name of the calling function.  Does nothing when the code is 0.
 *
 * The errno argument is evaluated exactly once, so it is safe to pass an
 * expression with side effects (e.g. a function call).
 */
#define ext4_std_error(sb, errno)				\
do {								\
	int __ext4_std_errno = (errno);				\
	if (__ext4_std_errno)					\
		__ext4_std_error((sb), __func__,		\
				 __ext4_std_errno);		\
} while (0)
1167 1208
1168/* 1209/*
@@ -1191,7 +1232,7 @@ extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
1191 ext4_lblk_t iblock, 1232 ext4_lblk_t iblock,
1192 unsigned long max_blocks, struct buffer_head *bh_result, 1233 unsigned long max_blocks, struct buffer_head *bh_result,
1193 int create, int extend_disksize); 1234 int create, int extend_disksize);
1194extern void ext4_ext_truncate(struct inode *, struct page *); 1235extern void ext4_ext_truncate(struct inode *);
1195extern void ext4_ext_init(struct super_block *); 1236extern void ext4_ext_init(struct super_block *);
1196extern void ext4_ext_release(struct super_block *); 1237extern void ext4_ext_release(struct super_block *);
1197extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, 1238extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
@@ -1199,7 +1240,7 @@ extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
1199extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, 1240extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode,
1200 sector_t block, unsigned long max_blocks, 1241 sector_t block, unsigned long max_blocks,
1201 struct buffer_head *bh, int create, 1242 struct buffer_head *bh, int create,
1202 int extend_disksize); 1243 int extend_disksize, int flag);
1203#endif /* __KERNEL__ */ 1244#endif /* __KERNEL__ */
1204 1245
1205#endif /* _EXT4_H */ 1246#endif /* _EXT4_H */
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 75333b595fab..6c166c0a54b7 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -212,6 +212,7 @@ static inline int ext4_ext_get_actual_len(struct ext4_extent *ext)
212 (le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN)); 212 (le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN));
213} 213}
214 214
215extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks);
215extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *); 216extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
216extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t); 217extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t);
217extern int ext4_extent_tree_init(handle_t *, struct inode *); 218extern int ext4_extent_tree_init(handle_t *, struct inode *);
diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h
index 26a4ae255d79..ef7409f0e7e4 100644
--- a/fs/ext4/ext4_i.h
+++ b/fs/ext4/ext4_i.h
@@ -79,7 +79,7 @@ struct ext4_ext_cache {
79}; 79};
80 80
81/* 81/*
82 * third extended file system inode data in memory 82 * fourth extended file system inode data in memory
83 */ 83 */
84struct ext4_inode_info { 84struct ext4_inode_info {
85 __le32 i_data[15]; /* unconverted */ 85 __le32 i_data[15]; /* unconverted */
@@ -150,6 +150,7 @@ struct ext4_inode_info {
150 */ 150 */
151 struct rw_semaphore i_data_sem; 151 struct rw_semaphore i_data_sem;
152 struct inode vfs_inode; 152 struct inode vfs_inode;
153 struct jbd2_inode jinode;
153 154
154 unsigned long i_ext_generation; 155 unsigned long i_ext_generation;
155 struct ext4_ext_cache i_cached_extent; 156 struct ext4_ext_cache i_cached_extent;
@@ -162,6 +163,13 @@ struct ext4_inode_info {
162 /* mballoc */ 163 /* mballoc */
163 struct list_head i_prealloc_list; 164 struct list_head i_prealloc_list;
164 spinlock_t i_prealloc_lock; 165 spinlock_t i_prealloc_lock;
166
167 /* allocation reservation info for delalloc */
168 unsigned long i_reserved_data_blocks;
169 unsigned long i_reserved_meta_blocks;
170 unsigned long i_allocated_meta_blocks;
171 unsigned short i_delalloc_reserved_flag;
172 spinlock_t i_block_reservation_lock;
165}; 173};
166 174
167#endif /* _EXT4_I */ 175#endif /* _EXT4_I */
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 9255a7d28b24..eb8bc3afe6e9 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -142,19 +142,17 @@ int __ext4_journal_dirty_metadata(const char *where,
142 handle_t *handle, struct buffer_head *bh); 142 handle_t *handle, struct buffer_head *bh);
143 143
144#define ext4_journal_get_undo_access(handle, bh) \ 144#define ext4_journal_get_undo_access(handle, bh) \
145 __ext4_journal_get_undo_access(__FUNCTION__, (handle), (bh)) 145 __ext4_journal_get_undo_access(__func__, (handle), (bh))
146#define ext4_journal_get_write_access(handle, bh) \ 146#define ext4_journal_get_write_access(handle, bh) \
147 __ext4_journal_get_write_access(__FUNCTION__, (handle), (bh)) 147 __ext4_journal_get_write_access(__func__, (handle), (bh))
148#define ext4_journal_revoke(handle, blocknr, bh) \ 148#define ext4_journal_revoke(handle, blocknr, bh) \
149 __ext4_journal_revoke(__FUNCTION__, (handle), (blocknr), (bh)) 149 __ext4_journal_revoke(__func__, (handle), (blocknr), (bh))
150#define ext4_journal_get_create_access(handle, bh) \ 150#define ext4_journal_get_create_access(handle, bh) \
151 __ext4_journal_get_create_access(__FUNCTION__, (handle), (bh)) 151 __ext4_journal_get_create_access(__func__, (handle), (bh))
152#define ext4_journal_dirty_metadata(handle, bh) \ 152#define ext4_journal_dirty_metadata(handle, bh) \
153 __ext4_journal_dirty_metadata(__FUNCTION__, (handle), (bh)) 153 __ext4_journal_dirty_metadata(__func__, (handle), (bh))
154#define ext4_journal_forget(handle, bh) \ 154#define ext4_journal_forget(handle, bh) \
155 __ext4_journal_forget(__FUNCTION__, (handle), (bh)) 155 __ext4_journal_forget(__func__, (handle), (bh))
156
157int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh);
158 156
159handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks); 157handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks);
160int __ext4_journal_stop(const char *where, handle_t *handle); 158int __ext4_journal_stop(const char *where, handle_t *handle);
@@ -165,7 +163,7 @@ static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks)
165} 163}
166 164
167#define ext4_journal_stop(handle) \ 165#define ext4_journal_stop(handle) \
168 __ext4_journal_stop(__FUNCTION__, (handle)) 166 __ext4_journal_stop(__func__, (handle))
169 167
170static inline handle_t *ext4_journal_current_handle(void) 168static inline handle_t *ext4_journal_current_handle(void)
171{ 169{
@@ -192,6 +190,11 @@ static inline int ext4_journal_force_commit(journal_t *journal)
192 return jbd2_journal_force_commit(journal); 190 return jbd2_journal_force_commit(journal);
193} 191}
194 192
193static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode)
194{
195 return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode);
196}
197
195/* super.c */ 198/* super.c */
196int ext4_force_commit(struct super_block *sb); 199int ext4_force_commit(struct super_block *sb);
197 200
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index 5802e69f2191..6300226d5531 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -25,7 +25,7 @@
25#include <linux/rbtree.h> 25#include <linux/rbtree.h>
26 26
27/* 27/*
28 * third extended-fs super-block data in memory 28 * fourth extended-fs super-block data in memory
29 */ 29 */
30struct ext4_sb_info { 30struct ext4_sb_info {
31 unsigned long s_desc_size; /* Size of a group descriptor in bytes */ 31 unsigned long s_desc_size; /* Size of a group descriptor in bytes */
@@ -143,6 +143,9 @@ struct ext4_sb_info {
143 143
144 /* locality groups */ 144 /* locality groups */
145 struct ext4_locality_group *s_locality_groups; 145 struct ext4_locality_group *s_locality_groups;
146
147 unsigned int s_log_groups_per_flex;
148 struct flex_groups *s_flex_groups;
146}; 149};
147 150
148#endif /* _EXT4_SB */ 151#endif /* _EXT4_SB */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 47929c4e3dae..42c4c0c892ed 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -92,17 +92,16 @@ static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb)
92 ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff); 92 ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
93} 93}
94 94
95static handle_t *ext4_ext_journal_restart(handle_t *handle, int needed) 95static int ext4_ext_journal_restart(handle_t *handle, int needed)
96{ 96{
97 int err; 97 int err;
98 98
99 if (handle->h_buffer_credits > needed) 99 if (handle->h_buffer_credits > needed)
100 return handle; 100 return 0;
101 if (!ext4_journal_extend(handle, needed)) 101 err = ext4_journal_extend(handle, needed);
102 return handle; 102 if (err)
103 err = ext4_journal_restart(handle, needed); 103 return err;
104 104 return ext4_journal_restart(handle, needed);
105 return handle;
106} 105}
107 106
108/* 107/*
@@ -180,15 +179,18 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
180 return bg_start + colour + block; 179 return bg_start + colour + block;
181} 180}
182 181
182/*
183 * Allocation for a meta data block
184 */
183static ext4_fsblk_t 185static ext4_fsblk_t
184ext4_ext_new_block(handle_t *handle, struct inode *inode, 186ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
185 struct ext4_ext_path *path, 187 struct ext4_ext_path *path,
186 struct ext4_extent *ex, int *err) 188 struct ext4_extent *ex, int *err)
187{ 189{
188 ext4_fsblk_t goal, newblock; 190 ext4_fsblk_t goal, newblock;
189 191
190 goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block)); 192 goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
191 newblock = ext4_new_block(handle, inode, goal, err); 193 newblock = ext4_new_meta_block(handle, inode, goal, err);
192 return newblock; 194 return newblock;
193} 195}
194 196
@@ -246,6 +248,36 @@ static int ext4_ext_space_root_idx(struct inode *inode)
246 return size; 248 return size;
247} 249}
248 250
/*
 * Estimate how many metadata (extent tree) blocks are needed to map
 * @blocks newly allocated data blocks.
 *
 * Worst case: every data block ends up in an extent of its own, so
 * @blocks new extents must be stored.
 */
int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks)
{
	/* capacities as reported by the ext4_ext_space_*() helpers */
	int root_idx_cap = ext4_ext_space_root_idx(inode);
	int leaf_cap = ext4_ext_space_block(inode);
	int idx_cap = ext4_ext_space_block_idx(inode);
	int level;	/* index blocks needed at the level being counted */
	int total;	/* running total of metadata blocks */

	/* leaf blocks needed to hold one extent per data block */
	total = (blocks + leaf_cap - 1) / leaf_cap;

	/*
	 * Worst case, separate index blocks are needed to link all the new
	 * leaf blocks.  The first index level is always counted; further
	 * levels are added until what remains fits in the root.
	 */
	level = (total + idx_cap - 1) / idx_cap;
	for (;;) {
		total += level;
		level = (level + idx_cap - 1) / idx_cap;
		if (level <= root_idx_cap)
			break;
	}

	return total;
}
280
249static int 281static int
250ext4_ext_max_entries(struct inode *inode, int depth) 282ext4_ext_max_entries(struct inode *inode, int depth)
251{ 283{
@@ -524,6 +556,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
524 alloc = 1; 556 alloc = 1;
525 } 557 }
526 path[0].p_hdr = eh; 558 path[0].p_hdr = eh;
559 path[0].p_bh = NULL;
527 560
528 i = depth; 561 i = depth;
529 /* walk through the tree */ 562 /* walk through the tree */
@@ -552,12 +585,14 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
552 } 585 }
553 586
554 path[ppos].p_depth = i; 587 path[ppos].p_depth = i;
555 path[ppos].p_hdr = eh;
556 path[ppos].p_ext = NULL; 588 path[ppos].p_ext = NULL;
557 path[ppos].p_idx = NULL; 589 path[ppos].p_idx = NULL;
558 590
559 /* find extent */ 591 /* find extent */
560 ext4_ext_binsearch(inode, path + ppos, block); 592 ext4_ext_binsearch(inode, path + ppos, block);
593 /* if not an empty leaf */
594 if (path[ppos].p_ext)
595 path[ppos].p_block = ext_pblock(path[ppos].p_ext);
561 596
562 ext4_ext_show_path(inode, path); 597 ext4_ext_show_path(inode, path);
563 598
@@ -688,7 +723,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
688 /* allocate all needed blocks */ 723 /* allocate all needed blocks */
689 ext_debug("allocate %d blocks for indexes/leaf\n", depth - at); 724 ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
690 for (a = 0; a < depth - at; a++) { 725 for (a = 0; a < depth - at; a++) {
691 newblock = ext4_ext_new_block(handle, inode, path, newext, &err); 726 newblock = ext4_ext_new_meta_block(handle, inode, path,
727 newext, &err);
692 if (newblock == 0) 728 if (newblock == 0)
693 goto cleanup; 729 goto cleanup;
694 ablocks[a] = newblock; 730 ablocks[a] = newblock;
@@ -884,7 +920,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
884 ext4_fsblk_t newblock; 920 ext4_fsblk_t newblock;
885 int err = 0; 921 int err = 0;
886 922
887 newblock = ext4_ext_new_block(handle, inode, path, newext, &err); 923 newblock = ext4_ext_new_meta_block(handle, inode, path, newext, &err);
888 if (newblock == 0) 924 if (newblock == 0)
889 return err; 925 return err;
890 926
@@ -981,6 +1017,8 @@ repeat:
981 /* if we found index with free entry, then use that 1017 /* if we found index with free entry, then use that
982 * entry: create all needed subtree and add new leaf */ 1018 * entry: create all needed subtree and add new leaf */
983 err = ext4_ext_split(handle, inode, path, newext, i); 1019 err = ext4_ext_split(handle, inode, path, newext, i);
1020 if (err)
1021 goto out;
984 1022
985 /* refill path */ 1023 /* refill path */
986 ext4_ext_drop_refs(path); 1024 ext4_ext_drop_refs(path);
@@ -1883,11 +1921,9 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
1883 credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); 1921 credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
1884#endif 1922#endif
1885 1923
1886 handle = ext4_ext_journal_restart(handle, credits); 1924 err = ext4_ext_journal_restart(handle, credits);
1887 if (IS_ERR(handle)) { 1925 if (err)
1888 err = PTR_ERR(handle);
1889 goto out; 1926 goto out;
1890 }
1891 1927
1892 err = ext4_ext_get_access(handle, inode, path + depth); 1928 err = ext4_ext_get_access(handle, inode, path + depth);
1893 if (err) 1929 if (err)
@@ -2529,6 +2565,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2529 int err = 0, depth, ret; 2565 int err = 0, depth, ret;
2530 unsigned long allocated = 0; 2566 unsigned long allocated = 0;
2531 struct ext4_allocation_request ar; 2567 struct ext4_allocation_request ar;
2568 loff_t disksize;
2532 2569
2533 __clear_bit(BH_New, &bh_result->b_state); 2570 __clear_bit(BH_New, &bh_result->b_state);
2534 ext_debug("blocks %u/%lu requested for inode %u\n", 2571 ext_debug("blocks %u/%lu requested for inode %u\n",
@@ -2616,8 +2653,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2616 */ 2653 */
2617 if (allocated > max_blocks) 2654 if (allocated > max_blocks)
2618 allocated = max_blocks; 2655 allocated = max_blocks;
2619 /* mark the buffer unwritten */ 2656 set_buffer_unwritten(bh_result);
2620 __set_bit(BH_Unwritten, &bh_result->b_state);
2621 goto out2; 2657 goto out2;
2622 } 2658 }
2623 2659
@@ -2716,14 +2752,19 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2716 goto out2; 2752 goto out2;
2717 } 2753 }
2718 2754
2719 if (extend_disksize && inode->i_size > EXT4_I(inode)->i_disksize)
2720 EXT4_I(inode)->i_disksize = inode->i_size;
2721
2722 /* previous routine could use block we allocated */ 2755 /* previous routine could use block we allocated */
2723 newblock = ext_pblock(&newex); 2756 newblock = ext_pblock(&newex);
2724 allocated = ext4_ext_get_actual_len(&newex); 2757 allocated = ext4_ext_get_actual_len(&newex);
2725outnew: 2758outnew:
2726 __set_bit(BH_New, &bh_result->b_state); 2759 if (extend_disksize) {
2760 disksize = ((loff_t) iblock + ar.len) << inode->i_blkbits;
2761 if (disksize > i_size_read(inode))
2762 disksize = i_size_read(inode);
2763 if (disksize > EXT4_I(inode)->i_disksize)
2764 EXT4_I(inode)->i_disksize = disksize;
2765 }
2766
2767 set_buffer_new(bh_result);
2727 2768
2728 /* Cache only when it is _not_ an uninitialized extent */ 2769 /* Cache only when it is _not_ an uninitialized extent */
2729 if (create != EXT4_CREATE_UNINITIALIZED_EXT) 2770 if (create != EXT4_CREATE_UNINITIALIZED_EXT)
@@ -2733,7 +2774,7 @@ out:
2733 if (allocated > max_blocks) 2774 if (allocated > max_blocks)
2734 allocated = max_blocks; 2775 allocated = max_blocks;
2735 ext4_ext_show_leaf(inode, path); 2776 ext4_ext_show_leaf(inode, path);
2736 __set_bit(BH_Mapped, &bh_result->b_state); 2777 set_buffer_mapped(bh_result);
2737 bh_result->b_bdev = inode->i_sb->s_bdev; 2778 bh_result->b_bdev = inode->i_sb->s_bdev;
2738 bh_result->b_blocknr = newblock; 2779 bh_result->b_blocknr = newblock;
2739out2: 2780out2:
@@ -2744,7 +2785,7 @@ out2:
2744 return err ? err : allocated; 2785 return err ? err : allocated;
2745} 2786}
2746 2787
2747void ext4_ext_truncate(struct inode * inode, struct page *page) 2788void ext4_ext_truncate(struct inode *inode)
2748{ 2789{
2749 struct address_space *mapping = inode->i_mapping; 2790 struct address_space *mapping = inode->i_mapping;
2750 struct super_block *sb = inode->i_sb; 2791 struct super_block *sb = inode->i_sb;
@@ -2757,18 +2798,14 @@ void ext4_ext_truncate(struct inode * inode, struct page *page)
2757 */ 2798 */
2758 err = ext4_writepage_trans_blocks(inode) + 3; 2799 err = ext4_writepage_trans_blocks(inode) + 3;
2759 handle = ext4_journal_start(inode, err); 2800 handle = ext4_journal_start(inode, err);
2760 if (IS_ERR(handle)) { 2801 if (IS_ERR(handle))
2761 if (page) {
2762 clear_highpage(page);
2763 flush_dcache_page(page);
2764 unlock_page(page);
2765 page_cache_release(page);
2766 }
2767 return; 2802 return;
2768 }
2769 2803
2770 if (page) 2804 if (inode->i_size & (sb->s_blocksize - 1))
2771 ext4_block_truncate_page(handle, page, mapping, inode->i_size); 2805 ext4_block_truncate_page(handle, mapping, inode->i_size);
2806
2807 if (ext4_orphan_add(handle, inode))
2808 goto out_stop;
2772 2809
2773 down_write(&EXT4_I(inode)->i_data_sem); 2810 down_write(&EXT4_I(inode)->i_data_sem);
2774 ext4_ext_invalidate_cache(inode); 2811 ext4_ext_invalidate_cache(inode);
@@ -2780,8 +2817,6 @@ void ext4_ext_truncate(struct inode * inode, struct page *page)
2780 * Probably we need not scan at all, 2817 * Probably we need not scan at all,
2781 * because page truncation is enough. 2818 * because page truncation is enough.
2782 */ 2819 */
2783 if (ext4_orphan_add(handle, inode))
2784 goto out_stop;
2785 2820
2786 /* we have to know where to truncate from in crash case */ 2821 /* we have to know where to truncate from in crash case */
2787 EXT4_I(inode)->i_disksize = inode->i_size; 2822 EXT4_I(inode)->i_disksize = inode->i_size;
@@ -2798,6 +2833,7 @@ void ext4_ext_truncate(struct inode * inode, struct page *page)
2798 handle->h_sync = 1; 2833 handle->h_sync = 1;
2799 2834
2800out_stop: 2835out_stop:
2836 up_write(&EXT4_I(inode)->i_data_sem);
2801 /* 2837 /*
2802 * If this was a simple ftruncate() and the file will remain alive, 2838 * If this was a simple ftruncate() and the file will remain alive,
2803 * then we need to clear up the orphan record which we created above. 2839 * then we need to clear up the orphan record which we created above.
@@ -2808,7 +2844,6 @@ out_stop:
2808 if (inode->i_nlink) 2844 if (inode->i_nlink)
2809 ext4_orphan_del(handle, inode); 2845 ext4_orphan_del(handle, inode);
2810 2846
2811 up_write(&EXT4_I(inode)->i_data_sem);
2812 inode->i_mtime = inode->i_ctime = ext4_current_time(inode); 2847 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
2813 ext4_mark_inode_dirty(handle, inode); 2848 ext4_mark_inode_dirty(handle, inode);
2814 ext4_journal_stop(handle); 2849 ext4_journal_stop(handle);
@@ -2911,7 +2946,7 @@ retry:
2911 } 2946 }
2912 ret = ext4_get_blocks_wrap(handle, inode, block, 2947 ret = ext4_get_blocks_wrap(handle, inode, block,
2913 max_blocks, &map_bh, 2948 max_blocks, &map_bh,
2914 EXT4_CREATE_UNINITIALIZED_EXT, 0); 2949 EXT4_CREATE_UNINITIALIZED_EXT, 0, 0);
2915 if (ret <= 0) { 2950 if (ret <= 0) {
2916#ifdef EXT4FS_DEBUG 2951#ifdef EXT4FS_DEBUG
2917 WARN_ON(ret <= 0); 2952 WARN_ON(ret <= 0);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 4159be6366ab..430eb7978db4 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -123,6 +123,23 @@ force_commit:
123 return ret; 123 return ret;
124} 124}
125 125
126static struct vm_operations_struct ext4_file_vm_ops = {
127 .fault = filemap_fault,
128 .page_mkwrite = ext4_page_mkwrite,
129};
130
131static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
132{
133 struct address_space *mapping = file->f_mapping;
134
135 if (!mapping->a_ops->readpage)
136 return -ENOEXEC;
137 file_accessed(file);
138 vma->vm_ops = &ext4_file_vm_ops;
139 vma->vm_flags |= VM_CAN_NONLINEAR;
140 return 0;
141}
142
126const struct file_operations ext4_file_operations = { 143const struct file_operations ext4_file_operations = {
127 .llseek = generic_file_llseek, 144 .llseek = generic_file_llseek,
128 .read = do_sync_read, 145 .read = do_sync_read,
@@ -133,7 +150,7 @@ const struct file_operations ext4_file_operations = {
133#ifdef CONFIG_COMPAT 150#ifdef CONFIG_COMPAT
134 .compat_ioctl = ext4_compat_ioctl, 151 .compat_ioctl = ext4_compat_ioctl,
135#endif 152#endif
136 .mmap = generic_file_mmap, 153 .mmap = ext4_file_mmap,
137 .open = generic_file_open, 154 .open = generic_file_open,
138 .release = ext4_release_file, 155 .release = ext4_release_file,
139 .fsync = ext4_sync_file, 156 .fsync = ext4_sync_file,
@@ -144,6 +161,7 @@ const struct file_operations ext4_file_operations = {
144const struct inode_operations ext4_file_inode_operations = { 161const struct inode_operations ext4_file_inode_operations = {
145 .truncate = ext4_truncate, 162 .truncate = ext4_truncate,
146 .setattr = ext4_setattr, 163 .setattr = ext4_setattr,
164 .getattr = ext4_getattr,
147#ifdef CONFIG_EXT4DEV_FS_XATTR 165#ifdef CONFIG_EXT4DEV_FS_XATTR
148 .setxattr = generic_setxattr, 166 .setxattr = generic_setxattr,
149 .getxattr = generic_getxattr, 167 .getxattr = generic_getxattr,
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 1c8ba48d4f8d..a45c3737ad31 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -27,6 +27,7 @@
27#include <linux/sched.h> 27#include <linux/sched.h>
28#include <linux/writeback.h> 28#include <linux/writeback.h>
29#include <linux/jbd2.h> 29#include <linux/jbd2.h>
30#include <linux/blkdev.h>
30#include "ext4.h" 31#include "ext4.h"
31#include "ext4_jbd2.h" 32#include "ext4_jbd2.h"
32 33
@@ -45,6 +46,7 @@
45int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync) 46int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync)
46{ 47{
47 struct inode *inode = dentry->d_inode; 48 struct inode *inode = dentry->d_inode;
49 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
48 int ret = 0; 50 int ret = 0;
49 51
50 J_ASSERT(ext4_journal_current_handle() == NULL); 52 J_ASSERT(ext4_journal_current_handle() == NULL);
@@ -85,6 +87,8 @@ int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync)
85 .nr_to_write = 0, /* sys_fsync did this */ 87 .nr_to_write = 0, /* sys_fsync did this */
86 }; 88 };
87 ret = sync_inode(inode, &wbc); 89 ret = sync_inode(inode, &wbc);
90 if (journal && (journal->j_flags & JBD2_BARRIER))
91 blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
88 } 92 }
89out: 93out:
90 return ret; 94 return ret;
diff --git a/fs/ext4/group.h b/fs/ext4/group.h
index 7eb0604e7eea..c2c0a8d06d0e 100644
--- a/fs/ext4/group.h
+++ b/fs/ext4/group.h
@@ -13,7 +13,7 @@ extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group,
13 struct ext4_group_desc *gdp); 13 struct ext4_group_desc *gdp);
14extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group, 14extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group,
15 struct ext4_group_desc *gdp); 15 struct ext4_group_desc *gdp);
16struct buffer_head *read_block_bitmap(struct super_block *sb, 16struct buffer_head *ext4_read_block_bitmap(struct super_block *sb,
17 ext4_group_t block_group); 17 ext4_group_t block_group);
18extern unsigned ext4_init_block_bitmap(struct super_block *sb, 18extern unsigned ext4_init_block_bitmap(struct super_block *sb,
19 struct buffer_head *bh, 19 struct buffer_head *bh,
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index c6efbab0c801..a92eb305344f 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -157,6 +157,7 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
157 struct ext4_super_block * es; 157 struct ext4_super_block * es;
158 struct ext4_sb_info *sbi; 158 struct ext4_sb_info *sbi;
159 int fatal = 0, err; 159 int fatal = 0, err;
160 ext4_group_t flex_group;
160 161
161 if (atomic_read(&inode->i_count) > 1) { 162 if (atomic_read(&inode->i_count) > 1) {
162 printk ("ext4_free_inode: inode has count=%d\n", 163 printk ("ext4_free_inode: inode has count=%d\n",
@@ -232,6 +233,12 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
232 if (is_directory) 233 if (is_directory)
233 percpu_counter_dec(&sbi->s_dirs_counter); 234 percpu_counter_dec(&sbi->s_dirs_counter);
234 235
236 if (sbi->s_log_groups_per_flex) {
237 flex_group = ext4_flex_group(sbi, block_group);
238 spin_lock(sb_bgl_lock(sbi, flex_group));
239 sbi->s_flex_groups[flex_group].free_inodes++;
240 spin_unlock(sb_bgl_lock(sbi, flex_group));
241 }
235 } 242 }
236 BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata"); 243 BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata");
237 err = ext4_journal_dirty_metadata(handle, bh2); 244 err = ext4_journal_dirty_metadata(handle, bh2);
@@ -286,6 +293,80 @@ static int find_group_dir(struct super_block *sb, struct inode *parent,
286 return ret; 293 return ret;
287} 294}
288 295
296#define free_block_ratio 10
297
298static int find_group_flex(struct super_block *sb, struct inode *parent,
299 ext4_group_t *best_group)
300{
301 struct ext4_sb_info *sbi = EXT4_SB(sb);
302 struct ext4_group_desc *desc;
303 struct buffer_head *bh;
304 struct flex_groups *flex_group = sbi->s_flex_groups;
305 ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
306 ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group);
307 ext4_group_t ngroups = sbi->s_groups_count;
308 int flex_size = ext4_flex_bg_size(sbi);
309 ext4_group_t best_flex = parent_fbg_group;
310 int blocks_per_flex = sbi->s_blocks_per_group * flex_size;
311 int flexbg_free_blocks;
312 int flex_freeb_ratio;
313 ext4_group_t n_fbg_groups;
314 ext4_group_t i;
315
316 n_fbg_groups = (sbi->s_groups_count + flex_size - 1) >>
317 sbi->s_log_groups_per_flex;
318
319find_close_to_parent:
320 flexbg_free_blocks = flex_group[best_flex].free_blocks;
321 flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
322 if (flex_group[best_flex].free_inodes &&
323 flex_freeb_ratio > free_block_ratio)
324 goto found_flexbg;
325
326 if (best_flex && best_flex == parent_fbg_group) {
327 best_flex--;
328 goto find_close_to_parent;
329 }
330
331 for (i = 0; i < n_fbg_groups; i++) {
332 if (i == parent_fbg_group || i == parent_fbg_group - 1)
333 continue;
334
335 flexbg_free_blocks = flex_group[i].free_blocks;
336 flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
337
338 if (flex_freeb_ratio > free_block_ratio &&
339 flex_group[i].free_inodes) {
340 best_flex = i;
341 goto found_flexbg;
342 }
343
344 if (best_flex < 0 ||
345 (flex_group[i].free_blocks >
346 flex_group[best_flex].free_blocks &&
347 flex_group[i].free_inodes))
348 best_flex = i;
349 }
350
351 if (!flex_group[best_flex].free_inodes ||
352 !flex_group[best_flex].free_blocks)
353 return -1;
354
355found_flexbg:
356 for (i = best_flex * flex_size; i < ngroups &&
357 i < (best_flex + 1) * flex_size; i++) {
358 desc = ext4_get_group_desc(sb, i, &bh);
359 if (le16_to_cpu(desc->bg_free_inodes_count)) {
360 *best_group = i;
361 goto out;
362 }
363 }
364
365 return -1;
366out:
367 return 0;
368}
369
289/* 370/*
290 * Orlov's allocator for directories. 371 * Orlov's allocator for directories.
291 * 372 *
@@ -501,6 +582,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
501 struct inode *ret; 582 struct inode *ret;
502 ext4_group_t i; 583 ext4_group_t i;
503 int free = 0; 584 int free = 0;
585 ext4_group_t flex_group;
504 586
505 /* Cannot create files in a deleted directory */ 587 /* Cannot create files in a deleted directory */
506 if (!dir || !dir->i_nlink) 588 if (!dir || !dir->i_nlink)
@@ -514,6 +596,12 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
514 596
515 sbi = EXT4_SB(sb); 597 sbi = EXT4_SB(sb);
516 es = sbi->s_es; 598 es = sbi->s_es;
599
600 if (sbi->s_log_groups_per_flex) {
601 ret2 = find_group_flex(sb, dir, &group);
602 goto got_group;
603 }
604
517 if (S_ISDIR(mode)) { 605 if (S_ISDIR(mode)) {
518 if (test_opt (sb, OLDALLOC)) 606 if (test_opt (sb, OLDALLOC))
519 ret2 = find_group_dir(sb, dir, &group); 607 ret2 = find_group_dir(sb, dir, &group);
@@ -522,6 +610,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
522 } else 610 } else
523 ret2 = find_group_other(sb, dir, &group); 611 ret2 = find_group_other(sb, dir, &group);
524 612
613got_group:
525 err = -ENOSPC; 614 err = -ENOSPC;
526 if (ret2 == -1) 615 if (ret2 == -1)
527 goto out; 616 goto out;
@@ -600,7 +689,7 @@ got:
600 /* We may have to initialize the block bitmap if it isn't already */ 689 /* We may have to initialize the block bitmap if it isn't already */
601 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) && 690 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) &&
602 gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 691 gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
603 struct buffer_head *block_bh = read_block_bitmap(sb, group); 692 struct buffer_head *block_bh = ext4_read_block_bitmap(sb, group);
604 693
605 BUFFER_TRACE(block_bh, "get block bitmap access"); 694 BUFFER_TRACE(block_bh, "get block bitmap access");
606 err = ext4_journal_get_write_access(handle, block_bh); 695 err = ext4_journal_get_write_access(handle, block_bh);
@@ -676,6 +765,13 @@ got:
676 percpu_counter_inc(&sbi->s_dirs_counter); 765 percpu_counter_inc(&sbi->s_dirs_counter);
677 sb->s_dirt = 1; 766 sb->s_dirt = 1;
678 767
768 if (sbi->s_log_groups_per_flex) {
769 flex_group = ext4_flex_group(sbi, group);
770 spin_lock(sb_bgl_lock(sbi, flex_group));
771 sbi->s_flex_groups[flex_group].free_inodes--;
772 spin_unlock(sb_bgl_lock(sbi, flex_group));
773 }
774
679 inode->i_uid = current->fsuid; 775 inode->i_uid = current->fsuid;
680 if (test_opt (sb, GRPID)) 776 if (test_opt (sb, GRPID))
681 inode->i_gid = dir->i_gid; 777 inode->i_gid = dir->i_gid;
@@ -740,14 +836,10 @@ got:
740 goto fail_free_drop; 836 goto fail_free_drop;
741 837
742 if (test_opt(sb, EXTENTS)) { 838 if (test_opt(sb, EXTENTS)) {
743 /* set extent flag only for diretory, file and normal symlink*/ 839 /* set extent flag only for directory, file and normal symlink*/
744 if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) { 840 if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) {
745 EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL; 841 EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL;
746 ext4_ext_tree_init(handle, inode); 842 ext4_ext_tree_init(handle, inode);
747 err = ext4_update_incompat_feature(handle, sb,
748 EXT4_FEATURE_INCOMPAT_EXTENTS);
749 if (err)
750 goto fail_free_drop;
751 } 843 }
752 } 844 }
753 845
@@ -817,6 +909,14 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
817 if (IS_ERR(inode)) 909 if (IS_ERR(inode))
818 goto iget_failed; 910 goto iget_failed;
819 911
912 /*
913 * If the orphans has i_nlinks > 0 then it should be able to be
914 * truncated, otherwise it won't be removed from the orphan list
915 * during processing and an infinite loop will result.
916 */
917 if (inode->i_nlink && !ext4_can_truncate(inode))
918 goto bad_orphan;
919
820 if (NEXT_ORPHAN(inode) > max_ino) 920 if (NEXT_ORPHAN(inode) > max_ino)
821 goto bad_orphan; 921 goto bad_orphan;
822 brelse(bitmap_bh); 922 brelse(bitmap_bh);
@@ -838,6 +938,7 @@ bad_orphan:
838 printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n", 938 printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n",
839 NEXT_ORPHAN(inode)); 939 NEXT_ORPHAN(inode));
840 printk(KERN_NOTICE "max_ino=%lu\n", max_ino); 940 printk(KERN_NOTICE "max_ino=%lu\n", max_ino);
941 printk(KERN_NOTICE "i_nlink=%u\n", inode->i_nlink);
841 /* Avoid freeing blocks if we got a bad deleted inode */ 942 /* Avoid freeing blocks if we got a bad deleted inode */
842 if (inode->i_nlink == 0) 943 if (inode->i_nlink == 0)
843 inode->i_blocks = 0; 944 inode->i_blocks = 0;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 8d9707746413..8ca2763df091 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -32,12 +32,23 @@
32#include <linux/string.h> 32#include <linux/string.h>
33#include <linux/buffer_head.h> 33#include <linux/buffer_head.h>
34#include <linux/writeback.h> 34#include <linux/writeback.h>
35#include <linux/pagevec.h>
35#include <linux/mpage.h> 36#include <linux/mpage.h>
36#include <linux/uio.h> 37#include <linux/uio.h>
37#include <linux/bio.h> 38#include <linux/bio.h>
38#include "ext4_jbd2.h" 39#include "ext4_jbd2.h"
39#include "xattr.h" 40#include "xattr.h"
40#include "acl.h" 41#include "acl.h"
42#include "ext4_extents.h"
43
44static inline int ext4_begin_ordered_truncate(struct inode *inode,
45 loff_t new_size)
46{
47 return jbd2_journal_begin_ordered_truncate(&EXT4_I(inode)->jinode,
48 new_size);
49}
50
51static void ext4_invalidatepage(struct page *page, unsigned long offset);
41 52
42/* 53/*
43 * Test whether an inode is a fast symlink. 54 * Test whether an inode is a fast symlink.
@@ -181,6 +192,8 @@ void ext4_delete_inode (struct inode * inode)
181{ 192{
182 handle_t *handle; 193 handle_t *handle;
183 194
195 if (ext4_should_order_data(inode))
196 ext4_begin_ordered_truncate(inode, 0);
184 truncate_inode_pages(&inode->i_data, 0); 197 truncate_inode_pages(&inode->i_data, 0);
185 198
186 if (is_bad_inode(inode)) 199 if (is_bad_inode(inode))
@@ -508,11 +521,12 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned long blks,
508 * direct blocks 521 * direct blocks
509 */ 522 */
510static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, 523static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
511 ext4_fsblk_t goal, int indirect_blks, int blks, 524 ext4_lblk_t iblock, ext4_fsblk_t goal,
512 ext4_fsblk_t new_blocks[4], int *err) 525 int indirect_blks, int blks,
526 ext4_fsblk_t new_blocks[4], int *err)
513{ 527{
514 int target, i; 528 int target, i;
515 unsigned long count = 0; 529 unsigned long count = 0, blk_allocated = 0;
516 int index = 0; 530 int index = 0;
517 ext4_fsblk_t current_block = 0; 531 ext4_fsblk_t current_block = 0;
518 int ret = 0; 532 int ret = 0;
@@ -525,12 +539,13 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
525 * the first direct block of this branch. That's the 539 * the first direct block of this branch. That's the
526 * minimum number of blocks need to allocate(required) 540 * minimum number of blocks need to allocate(required)
527 */ 541 */
528 target = blks + indirect_blks; 542 /* first we try to allocate the indirect blocks */
529 543 target = indirect_blks;
530 while (1) { 544 while (target > 0) {
531 count = target; 545 count = target;
532 /* allocating blocks for indirect blocks and direct blocks */ 546 /* allocating blocks for indirect blocks and direct blocks */
533 current_block = ext4_new_blocks(handle,inode,goal,&count,err); 547 current_block = ext4_new_meta_blocks(handle, inode,
548 goal, &count, err);
534 if (*err) 549 if (*err)
535 goto failed_out; 550 goto failed_out;
536 551
@@ -540,16 +555,48 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
540 new_blocks[index++] = current_block++; 555 new_blocks[index++] = current_block++;
541 count--; 556 count--;
542 } 557 }
543 558 if (count > 0) {
544 if (count > 0) 559 /*
560 * save the new block number
561 * for the first direct block
562 */
563 new_blocks[index] = current_block;
564 printk(KERN_INFO "%s returned more blocks than "
565 "requested\n", __func__);
566 WARN_ON(1);
545 break; 567 break;
568 }
546 } 569 }
547 570
548 /* save the new block number for the first direct block */ 571 target = blks - count ;
549 new_blocks[index] = current_block; 572 blk_allocated = count;
550 573 if (!target)
574 goto allocated;
575 /* Now allocate data blocks */
576 count = target;
577 /* allocating blocks for data blocks */
578 current_block = ext4_new_blocks(handle, inode, iblock,
579 goal, &count, err);
580 if (*err && (target == blks)) {
581 /*
582 * if the allocation failed and we didn't allocate
583 * any blocks before
584 */
585 goto failed_out;
586 }
587 if (!*err) {
588 if (target == blks) {
589 /*
590 * save the new block number
591 * for the first direct block
592 */
593 new_blocks[index] = current_block;
594 }
595 blk_allocated += count;
596 }
597allocated:
551 /* total number of blocks allocated for direct blocks */ 598 /* total number of blocks allocated for direct blocks */
552 ret = count; 599 ret = blk_allocated;
553 *err = 0; 600 *err = 0;
554 return ret; 601 return ret;
555failed_out: 602failed_out:
@@ -584,8 +631,9 @@ failed_out:
584 * as described above and return 0. 631 * as described above and return 0.
585 */ 632 */
586static int ext4_alloc_branch(handle_t *handle, struct inode *inode, 633static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
587 int indirect_blks, int *blks, ext4_fsblk_t goal, 634 ext4_lblk_t iblock, int indirect_blks,
588 ext4_lblk_t *offsets, Indirect *branch) 635 int *blks, ext4_fsblk_t goal,
636 ext4_lblk_t *offsets, Indirect *branch)
589{ 637{
590 int blocksize = inode->i_sb->s_blocksize; 638 int blocksize = inode->i_sb->s_blocksize;
591 int i, n = 0; 639 int i, n = 0;
@@ -595,7 +643,7 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
595 ext4_fsblk_t new_blocks[4]; 643 ext4_fsblk_t new_blocks[4];
596 ext4_fsblk_t current_block; 644 ext4_fsblk_t current_block;
597 645
598 num = ext4_alloc_blocks(handle, inode, goal, indirect_blks, 646 num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks,
599 *blks, new_blocks, &err); 647 *blks, new_blocks, &err);
600 if (err) 648 if (err)
601 return err; 649 return err;
@@ -799,6 +847,7 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
799 struct ext4_inode_info *ei = EXT4_I(inode); 847 struct ext4_inode_info *ei = EXT4_I(inode);
800 int count = 0; 848 int count = 0;
801 ext4_fsblk_t first_block = 0; 849 ext4_fsblk_t first_block = 0;
850 loff_t disksize;
802 851
803 852
804 J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)); 853 J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL));
@@ -855,8 +904,9 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
855 /* 904 /*
856 * Block out ext4_truncate while we alter the tree 905 * Block out ext4_truncate while we alter the tree
857 */ 906 */
858 err = ext4_alloc_branch(handle, inode, indirect_blks, &count, goal, 907 err = ext4_alloc_branch(handle, inode, iblock, indirect_blks,
859 offsets + (partial - chain), partial); 908 &count, goal,
909 offsets + (partial - chain), partial);
860 910
861 /* 911 /*
862 * The ext4_splice_branch call will free and forget any buffers 912 * The ext4_splice_branch call will free and forget any buffers
@@ -873,8 +923,13 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
873 * protect it if you're about to implement concurrent 923 * protect it if you're about to implement concurrent
874 * ext4_get_block() -bzzz 924 * ext4_get_block() -bzzz
875 */ 925 */
876 if (!err && extend_disksize && inode->i_size > ei->i_disksize) 926 if (!err && extend_disksize) {
877 ei->i_disksize = inode->i_size; 927 disksize = ((loff_t) iblock + count) << inode->i_blkbits;
928 if (disksize > i_size_read(inode))
929 disksize = i_size_read(inode);
930 if (disksize > ei->i_disksize)
931 ei->i_disksize = disksize;
932 }
878 if (err) 933 if (err)
879 goto cleanup; 934 goto cleanup;
880 935
@@ -934,7 +989,7 @@ out:
934 */ 989 */
935int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, 990int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
936 unsigned long max_blocks, struct buffer_head *bh, 991 unsigned long max_blocks, struct buffer_head *bh,
937 int create, int extend_disksize) 992 int create, int extend_disksize, int flag)
938{ 993{
939 int retval; 994 int retval;
940 995
@@ -975,6 +1030,15 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
975 * with create == 1 flag. 1030 * with create == 1 flag.
976 */ 1031 */
977 down_write((&EXT4_I(inode)->i_data_sem)); 1032 down_write((&EXT4_I(inode)->i_data_sem));
1033
1034 /*
1035 * if the caller is from delayed allocation writeout path
1036 * we have already reserved fs blocks for allocation
1037 * let the underlying get_block() function know to
1038 * avoid double accounting
1039 */
1040 if (flag)
1041 EXT4_I(inode)->i_delalloc_reserved_flag = 1;
978 /* 1042 /*
979 * We need to check for EXT4 here because migrate 1043 * We need to check for EXT4 here because migrate
980 * could have changed the inode type in between 1044 * could have changed the inode type in between
@@ -996,6 +1060,18 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
996 ~EXT4_EXT_MIGRATE; 1060 ~EXT4_EXT_MIGRATE;
997 } 1061 }
998 } 1062 }
1063
1064 if (flag) {
1065 EXT4_I(inode)->i_delalloc_reserved_flag = 0;
1066 /*
1067 * Update reserved blocks/metadata blocks
1068 * after successful block allocation
1069 * which were deferred till now
1070 */
1071 if ((retval > 0) && buffer_delay(bh))
1072 ext4_da_release_space(inode, retval, 0);
1073 }
1074
999 up_write((&EXT4_I(inode)->i_data_sem)); 1075 up_write((&EXT4_I(inode)->i_data_sem));
1000 return retval; 1076 return retval;
1001} 1077}
@@ -1021,7 +1097,7 @@ static int ext4_get_block(struct inode *inode, sector_t iblock,
1021 } 1097 }
1022 1098
1023 ret = ext4_get_blocks_wrap(handle, inode, iblock, 1099 ret = ext4_get_blocks_wrap(handle, inode, iblock,
1024 max_blocks, bh_result, create, 0); 1100 max_blocks, bh_result, create, 0, 0);
1025 if (ret > 0) { 1101 if (ret > 0) {
1026 bh_result->b_size = (ret << inode->i_blkbits); 1102 bh_result->b_size = (ret << inode->i_blkbits);
1027 ret = 0; 1103 ret = 0;
@@ -1047,7 +1123,7 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
1047 dummy.b_blocknr = -1000; 1123 dummy.b_blocknr = -1000;
1048 buffer_trace_init(&dummy.b_history); 1124 buffer_trace_init(&dummy.b_history);
1049 err = ext4_get_blocks_wrap(handle, inode, block, 1, 1125 err = ext4_get_blocks_wrap(handle, inode, block, 1,
1050 &dummy, create, 1); 1126 &dummy, create, 1, 0);
1051 /* 1127 /*
1052 * ext4_get_blocks_handle() returns number of blocks 1128 * ext4_get_blocks_handle() returns number of blocks
1053 * mapped. 0 in case of a HOLE. 1129 * mapped. 0 in case of a HOLE.
@@ -1203,19 +1279,20 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
1203 to = from + len; 1279 to = from + len;
1204 1280
1205retry: 1281retry:
1206 page = __grab_cache_page(mapping, index);
1207 if (!page)
1208 return -ENOMEM;
1209 *pagep = page;
1210
1211 handle = ext4_journal_start(inode, needed_blocks); 1282 handle = ext4_journal_start(inode, needed_blocks);
1212 if (IS_ERR(handle)) { 1283 if (IS_ERR(handle)) {
1213 unlock_page(page);
1214 page_cache_release(page);
1215 ret = PTR_ERR(handle); 1284 ret = PTR_ERR(handle);
1216 goto out; 1285 goto out;
1217 } 1286 }
1218 1287
1288 page = __grab_cache_page(mapping, index);
1289 if (!page) {
1290 ext4_journal_stop(handle);
1291 ret = -ENOMEM;
1292 goto out;
1293 }
1294 *pagep = page;
1295
1219 ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 1296 ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
1220 ext4_get_block); 1297 ext4_get_block);
1221 1298
@@ -1225,8 +1302,8 @@ retry:
1225 } 1302 }
1226 1303
1227 if (ret) { 1304 if (ret) {
1228 ext4_journal_stop(handle);
1229 unlock_page(page); 1305 unlock_page(page);
1306 ext4_journal_stop(handle);
1230 page_cache_release(page); 1307 page_cache_release(page);
1231 } 1308 }
1232 1309
@@ -1236,15 +1313,6 @@ out:
1236 return ret; 1313 return ret;
1237} 1314}
1238 1315
1239int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
1240{
1241 int err = jbd2_journal_dirty_data(handle, bh);
1242 if (err)
1243 ext4_journal_abort_handle(__func__, __func__,
1244 bh, handle, err);
1245 return err;
1246}
1247
1248/* For write_end() in data=journal mode */ 1316/* For write_end() in data=journal mode */
1249static int write_end_fn(handle_t *handle, struct buffer_head *bh) 1317static int write_end_fn(handle_t *handle, struct buffer_head *bh)
1250{ 1318{
@@ -1255,29 +1323,6 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh)
1255} 1323}
1256 1324
1257/* 1325/*
1258 * Generic write_end handler for ordered and writeback ext4 journal modes.
1259 * We can't use generic_write_end, because that unlocks the page and we need to
1260 * unlock the page after ext4_journal_stop, but ext4_journal_stop must run
1261 * after block_write_end.
1262 */
1263static int ext4_generic_write_end(struct file *file,
1264 struct address_space *mapping,
1265 loff_t pos, unsigned len, unsigned copied,
1266 struct page *page, void *fsdata)
1267{
1268 struct inode *inode = file->f_mapping->host;
1269
1270 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
1271
1272 if (pos+copied > inode->i_size) {
1273 i_size_write(inode, pos+copied);
1274 mark_inode_dirty(inode);
1275 }
1276
1277 return copied;
1278}
1279
1280/*
1281 * We need to pick up the new inode size which generic_commit_write gave us 1326 * We need to pick up the new inode size which generic_commit_write gave us
1282 * `file' can be NULL - eg, when called from page_symlink(). 1327 * `file' can be NULL - eg, when called from page_symlink().
1283 * 1328 *
@@ -1290,15 +1335,14 @@ static int ext4_ordered_write_end(struct file *file,
1290 struct page *page, void *fsdata) 1335 struct page *page, void *fsdata)
1291{ 1336{
1292 handle_t *handle = ext4_journal_current_handle(); 1337 handle_t *handle = ext4_journal_current_handle();
1293 struct inode *inode = file->f_mapping->host; 1338 struct inode *inode = mapping->host;
1294 unsigned from, to; 1339 unsigned from, to;
1295 int ret = 0, ret2; 1340 int ret = 0, ret2;
1296 1341
1297 from = pos & (PAGE_CACHE_SIZE - 1); 1342 from = pos & (PAGE_CACHE_SIZE - 1);
1298 to = from + len; 1343 to = from + len;
1299 1344
1300 ret = walk_page_buffers(handle, page_buffers(page), 1345 ret = ext4_jbd2_file_inode(handle, inode);
1301 from, to, NULL, ext4_journal_dirty_data);
1302 1346
1303 if (ret == 0) { 1347 if (ret == 0) {
1304 /* 1348 /*
@@ -1311,7 +1355,7 @@ static int ext4_ordered_write_end(struct file *file,
1311 new_i_size = pos + copied; 1355 new_i_size = pos + copied;
1312 if (new_i_size > EXT4_I(inode)->i_disksize) 1356 if (new_i_size > EXT4_I(inode)->i_disksize)
1313 EXT4_I(inode)->i_disksize = new_i_size; 1357 EXT4_I(inode)->i_disksize = new_i_size;
1314 ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, 1358 ret2 = generic_write_end(file, mapping, pos, len, copied,
1315 page, fsdata); 1359 page, fsdata);
1316 copied = ret2; 1360 copied = ret2;
1317 if (ret2 < 0) 1361 if (ret2 < 0)
@@ -1320,8 +1364,6 @@ static int ext4_ordered_write_end(struct file *file,
1320 ret2 = ext4_journal_stop(handle); 1364 ret2 = ext4_journal_stop(handle);
1321 if (!ret) 1365 if (!ret)
1322 ret = ret2; 1366 ret = ret2;
1323 unlock_page(page);
1324 page_cache_release(page);
1325 1367
1326 return ret ? ret : copied; 1368 return ret ? ret : copied;
1327} 1369}
@@ -1332,7 +1374,7 @@ static int ext4_writeback_write_end(struct file *file,
1332 struct page *page, void *fsdata) 1374 struct page *page, void *fsdata)
1333{ 1375{
1334 handle_t *handle = ext4_journal_current_handle(); 1376 handle_t *handle = ext4_journal_current_handle();
1335 struct inode *inode = file->f_mapping->host; 1377 struct inode *inode = mapping->host;
1336 int ret = 0, ret2; 1378 int ret = 0, ret2;
1337 loff_t new_i_size; 1379 loff_t new_i_size;
1338 1380
@@ -1340,7 +1382,7 @@ static int ext4_writeback_write_end(struct file *file,
1340 if (new_i_size > EXT4_I(inode)->i_disksize) 1382 if (new_i_size > EXT4_I(inode)->i_disksize)
1341 EXT4_I(inode)->i_disksize = new_i_size; 1383 EXT4_I(inode)->i_disksize = new_i_size;
1342 1384
1343 ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, 1385 ret2 = generic_write_end(file, mapping, pos, len, copied,
1344 page, fsdata); 1386 page, fsdata);
1345 copied = ret2; 1387 copied = ret2;
1346 if (ret2 < 0) 1388 if (ret2 < 0)
@@ -1349,8 +1391,6 @@ static int ext4_writeback_write_end(struct file *file,
1349 ret2 = ext4_journal_stop(handle); 1391 ret2 = ext4_journal_stop(handle);
1350 if (!ret) 1392 if (!ret)
1351 ret = ret2; 1393 ret = ret2;
1352 unlock_page(page);
1353 page_cache_release(page);
1354 1394
1355 return ret ? ret : copied; 1395 return ret ? ret : copied;
1356} 1396}
@@ -1389,14 +1429,965 @@ static int ext4_journalled_write_end(struct file *file,
1389 ret = ret2; 1429 ret = ret2;
1390 } 1430 }
1391 1431
1432 unlock_page(page);
1392 ret2 = ext4_journal_stop(handle); 1433 ret2 = ext4_journal_stop(handle);
1393 if (!ret) 1434 if (!ret)
1394 ret = ret2; 1435 ret = ret2;
1395 unlock_page(page);
1396 page_cache_release(page); 1436 page_cache_release(page);
1397 1437
1398 return ret ? ret : copied; 1438 return ret ? ret : copied;
1399} 1439}
1440/*
1441 * Calculate the number of metadata blocks need to reserve
1442 * to allocate @blocks for non extent file based file
1443 */
1444static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks)
1445{
1446 int icap = EXT4_ADDR_PER_BLOCK(inode->i_sb);
1447 int ind_blks, dind_blks, tind_blks;
1448
1449 /* number of new indirect blocks needed */
1450 ind_blks = (blocks + icap - 1) / icap;
1451
1452 dind_blks = (ind_blks + icap - 1) / icap;
1453
1454 tind_blks = 1;
1455
1456 return ind_blks + dind_blks + tind_blks;
1457}
1458
1459/*
1460 * Calculate the number of metadata blocks need to reserve
1461 * to allocate given number of blocks
1462 */
1463static int ext4_calc_metadata_amount(struct inode *inode, int blocks)
1464{
1465 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
1466 return ext4_ext_calc_metadata_amount(inode, blocks);
1467
1468 return ext4_indirect_calc_metadata_amount(inode, blocks);
1469}
1470
1471static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
1472{
1473 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1474 unsigned long md_needed, mdblocks, total = 0;
1475
1476 /*
1477 * recalculate the amount of metadata blocks to reserve
1478 * in order to allocate nrblocks
1479 * worse case is one extent per block
1480 */
1481 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1482 total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks;
1483 mdblocks = ext4_calc_metadata_amount(inode, total);
1484 BUG_ON(mdblocks < EXT4_I(inode)->i_reserved_meta_blocks);
1485
1486 md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks;
1487 total = md_needed + nrblocks;
1488
1489 if (ext4_has_free_blocks(sbi, total) < total) {
1490 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1491 return -ENOSPC;
1492 }
1493
1494 /* reduce fs free blocks counter */
1495 percpu_counter_sub(&sbi->s_freeblocks_counter, total);
1496
1497 EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
1498 EXT4_I(inode)->i_reserved_meta_blocks = mdblocks;
1499
1500 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1501 return 0; /* success */
1502}
1503
1504void ext4_da_release_space(struct inode *inode, int used, int to_free)
1505{
1506 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1507 int total, mdb, mdb_free, release;
1508
1509 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1510 /* recalculate the number of metablocks still need to be reserved */
1511 total = EXT4_I(inode)->i_reserved_data_blocks - used - to_free;
1512 mdb = ext4_calc_metadata_amount(inode, total);
1513
1514 /* figure out how many metablocks to release */
1515 BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
1516 mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
1517
1518 /* Account for allocated meta_blocks */
1519 mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks;
1520
1521 release = to_free + mdb_free;
1522
1523 /* update fs free blocks counter for truncate case */
1524 percpu_counter_add(&sbi->s_freeblocks_counter, release);
1525
1526 /* update per-inode reservations */
1527 BUG_ON(used + to_free > EXT4_I(inode)->i_reserved_data_blocks);
1528 EXT4_I(inode)->i_reserved_data_blocks -= (used + to_free);
1529
1530 BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
1531 EXT4_I(inode)->i_reserved_meta_blocks = mdb;
1532 EXT4_I(inode)->i_allocated_meta_blocks = 0;
1533 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1534}
1535
1536static void ext4_da_page_release_reservation(struct page *page,
1537 unsigned long offset)
1538{
1539 int to_release = 0;
1540 struct buffer_head *head, *bh;
1541 unsigned int curr_off = 0;
1542
1543 head = page_buffers(page);
1544 bh = head;
1545 do {
1546 unsigned int next_off = curr_off + bh->b_size;
1547
1548 if ((offset <= curr_off) && (buffer_delay(bh))) {
1549 to_release++;
1550 clear_buffer_delay(bh);
1551 }
1552 curr_off = next_off;
1553 } while ((bh = bh->b_this_page) != head);
1554 ext4_da_release_space(page->mapping->host, 0, to_release);
1555}
1556
1557/*
1558 * Delayed allocation stuff
1559 */
1560
1561struct mpage_da_data {
1562 struct inode *inode;
1563 struct buffer_head lbh; /* extent of blocks */
1564 unsigned long first_page, next_page; /* extent of pages */
1565 get_block_t *get_block;
1566 struct writeback_control *wbc;
1567};
1568
1569/*
1570 * mpage_da_submit_io - walks through extent of pages and try to write
1571 * them with __mpage_writepage()
1572 *
1573 * @mpd->inode: inode
1574 * @mpd->first_page: first page of the extent
1575 * @mpd->next_page: page after the last page of the extent
1576 * @mpd->get_block: the filesystem's block mapper function
1577 *
1578 * By the time mpage_da_submit_io() is called we expect all blocks
1579 * to be allocated. this may be wrong if allocation failed.
1580 *
1581 * As pages are already locked by write_cache_pages(), we can't use it
1582 */
1583static int mpage_da_submit_io(struct mpage_da_data *mpd)
1584{
1585 struct address_space *mapping = mpd->inode->i_mapping;
1586 struct mpage_data mpd_pp = {
1587 .bio = NULL,
1588 .last_block_in_bio = 0,
1589 .get_block = mpd->get_block,
1590 .use_writepage = 1,
1591 };
1592 int ret = 0, err, nr_pages, i;
1593 unsigned long index, end;
1594 struct pagevec pvec;
1595
1596 BUG_ON(mpd->next_page <= mpd->first_page);
1597
1598 pagevec_init(&pvec, 0);
1599 index = mpd->first_page;
1600 end = mpd->next_page - 1;
1601
1602 while (index <= end) {
1603 /* XXX: optimize tail */
1604 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
1605 if (nr_pages == 0)
1606 break;
1607 for (i = 0; i < nr_pages; i++) {
1608 struct page *page = pvec.pages[i];
1609
1610 index = page->index;
1611 if (index > end)
1612 break;
1613 index++;
1614
1615 err = __mpage_writepage(page, mpd->wbc, &mpd_pp);
1616
1617 /*
1618 * In error case, we have to continue because
1619 * remaining pages are still locked
1620 * XXX: unlock and re-dirty them?
1621 */
1622 if (ret == 0)
1623 ret = err;
1624 }
1625 pagevec_release(&pvec);
1626 }
1627 if (mpd_pp.bio)
1628 mpage_bio_submit(WRITE, mpd_pp.bio);
1629
1630 return ret;
1631}
1632
1633/*
1634 * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers
1635 *
1636 * @mpd->inode - inode to walk through
1637 * @exbh->b_blocknr - first block on a disk
1638 * @exbh->b_size - amount of space in bytes
1639 * @logical - first logical block to start assignment with
1640 *
1641 * the function goes through all passed space and put actual disk
1642 * block numbers into buffer heads, dropping BH_Delay
1643 */
1644static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
1645 struct buffer_head *exbh)
1646{
1647 struct inode *inode = mpd->inode;
1648 struct address_space *mapping = inode->i_mapping;
1649 int blocks = exbh->b_size >> inode->i_blkbits;
1650 sector_t pblock = exbh->b_blocknr, cur_logical;
1651 struct buffer_head *head, *bh;
1652 unsigned long index, end;
1653 struct pagevec pvec;
1654 int nr_pages, i;
1655
1656 index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
1657 end = (logical + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
1658 cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1659
1660 pagevec_init(&pvec, 0);
1661
1662 while (index <= end) {
1663 /* XXX: optimize tail */
1664 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
1665 if (nr_pages == 0)
1666 break;
1667 for (i = 0; i < nr_pages; i++) {
1668 struct page *page = pvec.pages[i];
1669
1670 index = page->index;
1671 if (index > end)
1672 break;
1673 index++;
1674
1675 BUG_ON(!PageLocked(page));
1676 BUG_ON(PageWriteback(page));
1677 BUG_ON(!page_has_buffers(page));
1678
1679 bh = page_buffers(page);
1680 head = bh;
1681
1682 /* skip blocks out of the range */
1683 do {
1684 if (cur_logical >= logical)
1685 break;
1686 cur_logical++;
1687 } while ((bh = bh->b_this_page) != head);
1688
1689 do {
1690 if (cur_logical >= logical + blocks)
1691 break;
1692 if (buffer_delay(bh)) {
1693 bh->b_blocknr = pblock;
1694 clear_buffer_delay(bh);
1695 } else if (buffer_mapped(bh))
1696 BUG_ON(bh->b_blocknr != pblock);
1697
1698 cur_logical++;
1699 pblock++;
1700 } while ((bh = bh->b_this_page) != head);
1701 }
1702 pagevec_release(&pvec);
1703 }
1704}
1705
1706
1707/*
1708 * __unmap_underlying_blocks - just a helper function to unmap
1709 * set of blocks described by @bh
1710 */
1711static inline void __unmap_underlying_blocks(struct inode *inode,
1712 struct buffer_head *bh)
1713{
1714 struct block_device *bdev = inode->i_sb->s_bdev;
1715 int blocks, i;
1716
1717 blocks = bh->b_size >> inode->i_blkbits;
1718 for (i = 0; i < blocks; i++)
1719 unmap_underlying_metadata(bdev, bh->b_blocknr + i);
1720}
1721
1722/*
1723 * mpage_da_map_blocks - go through given space
1724 *
1725 * @mpd->lbh - bh describing space
1726 * @mpd->get_block - the filesystem's block mapper function
1727 *
1728 * The function skips space we know is already mapped to disk blocks.
1729 *
1730 * The function ignores errors ->get_block() returns, thus real
1731 * error handling is postponed to __mpage_writepage()
1732 */
1733static void mpage_da_map_blocks(struct mpage_da_data *mpd)
1734{
1735 struct buffer_head *lbh = &mpd->lbh;
1736 int err = 0, remain = lbh->b_size;
1737 sector_t next = lbh->b_blocknr;
1738 struct buffer_head new;
1739
1740 /*
1741 * We consider only non-mapped and non-allocated blocks
1742 */
1743 if (buffer_mapped(lbh) && !buffer_delay(lbh))
1744 return;
1745
1746 while (remain) {
1747 new.b_state = lbh->b_state;
1748 new.b_blocknr = 0;
1749 new.b_size = remain;
1750 err = mpd->get_block(mpd->inode, next, &new, 1);
1751 if (err) {
1752 /*
1753 * Rather than implement own error handling
1754 * here, we just leave remaining blocks
1755 * unallocated and try again with ->writepage()
1756 */
1757 break;
1758 }
1759 BUG_ON(new.b_size == 0);
1760
1761 if (buffer_new(&new))
1762 __unmap_underlying_blocks(mpd->inode, &new);
1763
1764 /*
1765 * If blocks are delayed marked, we need to
1766 * put actual blocknr and drop delayed bit
1767 */
1768 if (buffer_delay(lbh))
1769 mpage_put_bnr_to_bhs(mpd, next, &new);
1770
1771 /* go for the remaining blocks */
1772 next += new.b_size >> mpd->inode->i_blkbits;
1773 remain -= new.b_size;
1774 }
1775}
1776
1777#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | (1 << BH_Delay))
1778
1779/*
1780 * mpage_add_bh_to_extent - try to add one more block to extent of blocks
1781 *
1782 * @mpd->lbh - extent of blocks
1783 * @logical - logical number of the block in the file
1784 * @bh - bh of the block (used to access block's state)
1785 *
1786 * the function is used to collect contig. blocks in same state
1787 */
1788static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
1789 sector_t logical, struct buffer_head *bh)
1790{
1791 struct buffer_head *lbh = &mpd->lbh;
1792 sector_t next;
1793
1794 next = lbh->b_blocknr + (lbh->b_size >> mpd->inode->i_blkbits);
1795
1796 /*
1797 * First block in the extent
1798 */
1799 if (lbh->b_size == 0) {
1800 lbh->b_blocknr = logical;
1801 lbh->b_size = bh->b_size;
1802 lbh->b_state = bh->b_state & BH_FLAGS;
1803 return;
1804 }
1805
1806 /*
1807 * Can we merge the block to our big extent?
1808 */
1809 if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) {
1810 lbh->b_size += bh->b_size;
1811 return;
1812 }
1813
1814 /*
1815 * We couldn't merge the block to our extent, so we
1816 * need to flush current extent and start new one
1817 */
1818 mpage_da_map_blocks(mpd);
1819
1820 /*
1821 * Now start a new extent
1822 */
1823 lbh->b_size = bh->b_size;
1824 lbh->b_state = bh->b_state & BH_FLAGS;
1825 lbh->b_blocknr = logical;
1826}
1827
1828/*
1829 * __mpage_da_writepage - finds extent of pages and blocks
1830 *
1831 * @page: page to consider
1832 * @wbc: not used, we just follow rules
1833 * @data: context
1834 *
1835 * The function finds extents of pages and scan them for all blocks.
1836 */
1837static int __mpage_da_writepage(struct page *page,
1838 struct writeback_control *wbc, void *data)
1839{
1840 struct mpage_da_data *mpd = data;
1841 struct inode *inode = mpd->inode;
1842 struct buffer_head *bh, *head, fake;
1843 sector_t logical;
1844
1845 /*
1846 * Can we merge this page to current extent?
1847 */
1848 if (mpd->next_page != page->index) {
1849 /*
1850 * Nope, we can't. So, we map non-allocated blocks
1851 * and start IO on them using __mpage_writepage()
1852 */
1853 if (mpd->next_page != mpd->first_page) {
1854 mpage_da_map_blocks(mpd);
1855 mpage_da_submit_io(mpd);
1856 }
1857
1858 /*
1859 * Start next extent of pages ...
1860 */
1861 mpd->first_page = page->index;
1862
1863 /*
1864 * ... and blocks
1865 */
1866 mpd->lbh.b_size = 0;
1867 mpd->lbh.b_state = 0;
1868 mpd->lbh.b_blocknr = 0;
1869 }
1870
1871 mpd->next_page = page->index + 1;
1872 logical = (sector_t) page->index <<
1873 (PAGE_CACHE_SHIFT - inode->i_blkbits);
1874
1875 if (!page_has_buffers(page)) {
1876 /*
1877 * There is no attached buffer heads yet (mmap?)
1878 * we treat the page asfull of dirty blocks
1879 */
1880 bh = &fake;
1881 bh->b_size = PAGE_CACHE_SIZE;
1882 bh->b_state = 0;
1883 set_buffer_dirty(bh);
1884 set_buffer_uptodate(bh);
1885 mpage_add_bh_to_extent(mpd, logical, bh);
1886 } else {
1887 /*
1888 * Page with regular buffer heads, just add all dirty ones
1889 */
1890 head = page_buffers(page);
1891 bh = head;
1892 do {
1893 BUG_ON(buffer_locked(bh));
1894 if (buffer_dirty(bh))
1895 mpage_add_bh_to_extent(mpd, logical, bh);
1896 logical++;
1897 } while ((bh = bh->b_this_page) != head);
1898 }
1899
1900 return 0;
1901}
1902
1903/*
1904 * mpage_da_writepages - walk the list of dirty pages of the given
1905 * address space, allocates non-allocated blocks, maps newly-allocated
1906 * blocks to existing bhs and issue IO them
1907 *
1908 * @mapping: address space structure to write
1909 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
1910 * @get_block: the filesystem's block mapper function.
1911 *
1912 * This is a library function, which implements the writepages()
1913 * address_space_operation.
1914 *
1915 * In order to avoid duplication of logic that deals with partial pages,
1916 * multiple bio per page, etc, we find non-allocated blocks, allocate
1917 * them with minimal calls to ->get_block() and re-use __mpage_writepage()
1918 *
1919 * It's important that we call __mpage_writepage() only once for each
1920 * involved page, otherwise we'd have to implement more complicated logic
1921 * to deal with pages w/o PG_lock or w/ PG_writeback and so on.
1922 *
1923 * See comments to mpage_writepages()
1924 */
1925static int mpage_da_writepages(struct address_space *mapping,
1926 struct writeback_control *wbc,
1927 get_block_t get_block)
1928{
1929 struct mpage_da_data mpd;
1930 int ret;
1931
1932 if (!get_block)
1933 return generic_writepages(mapping, wbc);
1934
1935 mpd.wbc = wbc;
1936 mpd.inode = mapping->host;
1937 mpd.lbh.b_size = 0;
1938 mpd.lbh.b_state = 0;
1939 mpd.lbh.b_blocknr = 0;
1940 mpd.first_page = 0;
1941 mpd.next_page = 0;
1942 mpd.get_block = get_block;
1943
1944 ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, &mpd);
1945
1946 /*
1947 * Handle last extent of pages
1948 */
1949 if (mpd.next_page != mpd.first_page) {
1950 mpage_da_map_blocks(&mpd);
1951 mpage_da_submit_io(&mpd);
1952 }
1953
1954 return ret;
1955}
1956
1957/*
1958 * this is a special callback for ->write_begin() only
1959 * it's intention is to return mapped block or reserve space
1960 */
1961static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
1962 struct buffer_head *bh_result, int create)
1963{
1964 int ret = 0;
1965
1966 BUG_ON(create == 0);
1967 BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
1968
1969 /*
1970 * first, we need to know whether the block is allocated already
1971 * preallocated blocks are unmapped but should treated
1972 * the same as allocated blocks.
1973 */
1974 ret = ext4_get_blocks_wrap(NULL, inode, iblock, 1, bh_result, 0, 0, 0);
1975 if ((ret == 0) && !buffer_delay(bh_result)) {
1976 /* the block isn't (pre)allocated yet, let's reserve space */
1977 /*
1978 * XXX: __block_prepare_write() unmaps passed block,
1979 * is it OK?
1980 */
1981 ret = ext4_da_reserve_space(inode, 1);
1982 if (ret)
1983 /* not enough space to reserve */
1984 return ret;
1985
1986 map_bh(bh_result, inode->i_sb, 0);
1987 set_buffer_new(bh_result);
1988 set_buffer_delay(bh_result);
1989 } else if (ret > 0) {
1990 bh_result->b_size = (ret << inode->i_blkbits);
1991 ret = 0;
1992 }
1993
1994 return ret;
1995}
1996#define EXT4_DELALLOC_RSVED 1
1997static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
1998 struct buffer_head *bh_result, int create)
1999{
2000 int ret;
2001 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
2002 loff_t disksize = EXT4_I(inode)->i_disksize;
2003 handle_t *handle = NULL;
2004
2005 handle = ext4_journal_current_handle();
2006 if (!handle) {
2007 ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
2008 bh_result, 0, 0, 0);
2009 BUG_ON(!ret);
2010 } else {
2011 ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
2012 bh_result, create, 0, EXT4_DELALLOC_RSVED);
2013 }
2014
2015 if (ret > 0) {
2016 bh_result->b_size = (ret << inode->i_blkbits);
2017
2018 /*
2019 * Update on-disk size along with block allocation
2020 * we don't use 'extend_disksize' as size may change
2021 * within already allocated block -bzzz
2022 */
2023 disksize = ((loff_t) iblock + ret) << inode->i_blkbits;
2024 if (disksize > i_size_read(inode))
2025 disksize = i_size_read(inode);
2026 if (disksize > EXT4_I(inode)->i_disksize) {
2027 /*
2028 * XXX: replace with spinlock if seen contended -bzzz
2029 */
2030 down_write(&EXT4_I(inode)->i_data_sem);
2031 if (disksize > EXT4_I(inode)->i_disksize)
2032 EXT4_I(inode)->i_disksize = disksize;
2033 up_write(&EXT4_I(inode)->i_data_sem);
2034
2035 if (EXT4_I(inode)->i_disksize == disksize) {
2036 ret = ext4_mark_inode_dirty(handle, inode);
2037 return ret;
2038 }
2039 }
2040 ret = 0;
2041 }
2042 return ret;
2043}
2044
2045static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
2046{
2047 /*
2048 * unmapped buffer is possible for holes.
2049 * delay buffer is possible with delayed allocation
2050 */
2051 return ((!buffer_mapped(bh) || buffer_delay(bh)) && buffer_dirty(bh));
2052}
2053
2054static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock,
2055 struct buffer_head *bh_result, int create)
2056{
2057 int ret = 0;
2058 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
2059
2060 /*
2061 * we don't want to do block allocation in writepage
2062 * so call get_block_wrap with create = 0
2063 */
2064 ret = ext4_get_blocks_wrap(NULL, inode, iblock, max_blocks,
2065 bh_result, 0, 0, 0);
2066 if (ret > 0) {
2067 bh_result->b_size = (ret << inode->i_blkbits);
2068 ret = 0;
2069 }
2070 return ret;
2071}
2072
2073/*
2074 * get called vi ext4_da_writepages after taking page lock (have journal handle)
2075 * get called via journal_submit_inode_data_buffers (no journal handle)
2076 * get called via shrink_page_list via pdflush (no journal handle)
2077 * or grab_page_cache when doing write_begin (have journal handle)
2078 */
2079static int ext4_da_writepage(struct page *page,
2080 struct writeback_control *wbc)
2081{
2082 int ret = 0;
2083 loff_t size;
2084 unsigned long len;
2085 struct buffer_head *page_bufs;
2086 struct inode *inode = page->mapping->host;
2087
2088 size = i_size_read(inode);
2089 if (page->index == size >> PAGE_CACHE_SHIFT)
2090 len = size & ~PAGE_CACHE_MASK;
2091 else
2092 len = PAGE_CACHE_SIZE;
2093
2094 if (page_has_buffers(page)) {
2095 page_bufs = page_buffers(page);
2096 if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
2097 ext4_bh_unmapped_or_delay)) {
2098 /*
2099 * We don't want to do block allocation
2100 * So redirty the page and return
2101 * We may reach here when we do a journal commit
2102 * via journal_submit_inode_data_buffers.
2103 * If we don't have mapping block we just ignore
2104 * them. We can also reach here via shrink_page_list
2105 */
2106 redirty_page_for_writepage(wbc, page);
2107 unlock_page(page);
2108 return 0;
2109 }
2110 } else {
2111 /*
2112 * The test for page_has_buffers() is subtle:
2113 * We know the page is dirty but it lost buffers. That means
2114 * that at some moment in time after write_begin()/write_end()
2115 * has been called all buffers have been clean and thus they
2116 * must have been written at least once. So they are all
2117 * mapped and we can happily proceed with mapping them
2118 * and writing the page.
2119 *
2120 * Try to initialize the buffer_heads and check whether
2121 * all are mapped and non delay. We don't want to
2122 * do block allocation here.
2123 */
2124 ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
2125 ext4_normal_get_block_write);
2126 if (!ret) {
2127 page_bufs = page_buffers(page);
2128 /* check whether all are mapped and non delay */
2129 if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
2130 ext4_bh_unmapped_or_delay)) {
2131 redirty_page_for_writepage(wbc, page);
2132 unlock_page(page);
2133 return 0;
2134 }
2135 } else {
2136 /*
2137 * We can't do block allocation here
2138 * so just redity the page and unlock
2139 * and return
2140 */
2141 redirty_page_for_writepage(wbc, page);
2142 unlock_page(page);
2143 return 0;
2144 }
2145 }
2146
2147 if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
2148 ret = nobh_writepage(page, ext4_normal_get_block_write, wbc);
2149 else
2150 ret = block_write_full_page(page,
2151 ext4_normal_get_block_write,
2152 wbc);
2153
2154 return ret;
2155}
2156
2157/*
2158 * For now just follow the DIO way to estimate the max credits
2159 * needed to write out EXT4_MAX_WRITEBACK_PAGES.
2160 * todo: need to calculate the max credits need for
2161 * extent based files, currently the DIO credits is based on
2162 * indirect-blocks mapping way.
2163 *
2164 * Probably should have a generic way to calculate credits
2165 * for DIO, writepages, and truncate
2166 */
2167#define EXT4_MAX_WRITEBACK_PAGES DIO_MAX_BLOCKS
2168#define EXT4_MAX_WRITEBACK_CREDITS DIO_CREDITS
2169
2170static int ext4_da_writepages(struct address_space *mapping,
2171 struct writeback_control *wbc)
2172{
2173 struct inode *inode = mapping->host;
2174 handle_t *handle = NULL;
2175 int needed_blocks;
2176 int ret = 0;
2177 long to_write;
2178 loff_t range_start = 0;
2179
2180 /*
2181 * No pages to write? This is mainly a kludge to avoid starting
2182 * a transaction for special inodes like journal inode on last iput()
2183 * because that could violate lock ordering on umount
2184 */
2185 if (!mapping->nrpages)
2186 return 0;
2187
2188 /*
2189 * Estimate the worse case needed credits to write out
2190 * EXT4_MAX_BUF_BLOCKS pages
2191 */
2192 needed_blocks = EXT4_MAX_WRITEBACK_CREDITS;
2193
2194 to_write = wbc->nr_to_write;
2195 if (!wbc->range_cyclic) {
2196 /*
2197 * If range_cyclic is not set force range_cont
2198 * and save the old writeback_index
2199 */
2200 wbc->range_cont = 1;
2201 range_start = wbc->range_start;
2202 }
2203
2204 while (!ret && to_write) {
2205 /* start a new transaction*/
2206 handle = ext4_journal_start(inode, needed_blocks);
2207 if (IS_ERR(handle)) {
2208 ret = PTR_ERR(handle);
2209 goto out_writepages;
2210 }
2211 if (ext4_should_order_data(inode)) {
2212 /*
2213 * With ordered mode we need to add
2214 * the inode to the journal handle
2215 * when we do block allocation.
2216 */
2217 ret = ext4_jbd2_file_inode(handle, inode);
2218 if (ret) {
2219 ext4_journal_stop(handle);
2220 goto out_writepages;
2221 }
2222
2223 }
2224 /*
2225 * set the max dirty pages could be write at a time
2226 * to fit into the reserved transaction credits
2227 */
2228 if (wbc->nr_to_write > EXT4_MAX_WRITEBACK_PAGES)
2229 wbc->nr_to_write = EXT4_MAX_WRITEBACK_PAGES;
2230
2231 to_write -= wbc->nr_to_write;
2232 ret = mpage_da_writepages(mapping, wbc,
2233 ext4_da_get_block_write);
2234 ext4_journal_stop(handle);
2235 if (wbc->nr_to_write) {
2236 /*
2237 * There is no more writeout needed
2238 * or we requested for a noblocking writeout
2239 * and we found the device congested
2240 */
2241 to_write += wbc->nr_to_write;
2242 break;
2243 }
2244 wbc->nr_to_write = to_write;
2245 }
2246
2247out_writepages:
2248 wbc->nr_to_write = to_write;
2249 if (range_start)
2250 wbc->range_start = range_start;
2251 return ret;
2252}
2253
2254static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
2255 loff_t pos, unsigned len, unsigned flags,
2256 struct page **pagep, void **fsdata)
2257{
2258 int ret, retries = 0;
2259 struct page *page;
2260 pgoff_t index;
2261 unsigned from, to;
2262 struct inode *inode = mapping->host;
2263 handle_t *handle;
2264
2265 index = pos >> PAGE_CACHE_SHIFT;
2266 from = pos & (PAGE_CACHE_SIZE - 1);
2267 to = from + len;
2268
2269retry:
2270 /*
2271 * With delayed allocation, we don't log the i_disksize update
2272 * if there is delayed block allocation. But we still need
2273 * to journalling the i_disksize update if writes to the end
2274 * of file which has an already mapped buffer.
2275 */
2276 handle = ext4_journal_start(inode, 1);
2277 if (IS_ERR(handle)) {
2278 ret = PTR_ERR(handle);
2279 goto out;
2280 }
2281
2282 page = __grab_cache_page(mapping, index);
2283 if (!page)
2284 return -ENOMEM;
2285 *pagep = page;
2286
2287 ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
2288 ext4_da_get_block_prep);
2289 if (ret < 0) {
2290 unlock_page(page);
2291 ext4_journal_stop(handle);
2292 page_cache_release(page);
2293 }
2294
2295 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
2296 goto retry;
2297out:
2298 return ret;
2299}
2300
2301/*
2302 * Check if we should update i_disksize
2303 * when write to the end of file but not require block allocation
2304 */
2305static int ext4_da_should_update_i_disksize(struct page *page,
2306 unsigned long offset)
2307{
2308 struct buffer_head *bh;
2309 struct inode *inode = page->mapping->host;
2310 unsigned int idx;
2311 int i;
2312
2313 bh = page_buffers(page);
2314 idx = offset >> inode->i_blkbits;
2315
2316 for (i=0; i < idx; i++)
2317 bh = bh->b_this_page;
2318
2319 if (!buffer_mapped(bh) || (buffer_delay(bh)))
2320 return 0;
2321 return 1;
2322}
2323
2324static int ext4_da_write_end(struct file *file,
2325 struct address_space *mapping,
2326 loff_t pos, unsigned len, unsigned copied,
2327 struct page *page, void *fsdata)
2328{
2329 struct inode *inode = mapping->host;
2330 int ret = 0, ret2;
2331 handle_t *handle = ext4_journal_current_handle();
2332 loff_t new_i_size;
2333 unsigned long start, end;
2334
2335 start = pos & (PAGE_CACHE_SIZE - 1);
2336 end = start + copied -1;
2337
2338 /*
2339 * generic_write_end() will run mark_inode_dirty() if i_size
2340 * changes. So let's piggyback the i_disksize mark_inode_dirty
2341 * into that.
2342 */
2343
2344 new_i_size = pos + copied;
2345 if (new_i_size > EXT4_I(inode)->i_disksize) {
2346 if (ext4_da_should_update_i_disksize(page, end)) {
2347 down_write(&EXT4_I(inode)->i_data_sem);
2348 if (new_i_size > EXT4_I(inode)->i_disksize) {
2349 /*
2350 * Updating i_disksize when extending file
2351 * without needing block allocation
2352 */
2353 if (ext4_should_order_data(inode))
2354 ret = ext4_jbd2_file_inode(handle,
2355 inode);
2356
2357 EXT4_I(inode)->i_disksize = new_i_size;
2358 }
2359 up_write(&EXT4_I(inode)->i_data_sem);
2360 }
2361 }
2362 ret2 = generic_write_end(file, mapping, pos, len, copied,
2363 page, fsdata);
2364 copied = ret2;
2365 if (ret2 < 0)
2366 ret = ret2;
2367 ret2 = ext4_journal_stop(handle);
2368 if (!ret)
2369 ret = ret2;
2370
2371 return ret ? ret : copied;
2372}
2373
/*
 * ->invalidatepage for delayed allocation: releases the reservation
 * held for a page that has buffers, then does the regular
 * ext4_invalidatepage() work. The page must be locked.
 */
static void ext4_da_invalidatepage(struct page *page, unsigned long offset)
{
	BUG_ON(!PageLocked(page));

	/* Drop reserved blocks; without buffers there is nothing to drop */
	if (page_has_buffers(page))
		ext4_da_page_release_reservation(page, offset);

	ext4_invalidatepage(page, offset);
}
2390
1400 2391
1401/* 2392/*
1402 * bmap() is special. It gets used by applications such as lilo and by 2393 * bmap() is special. It gets used by applications such as lilo and by
@@ -1418,6 +2409,16 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
1418 journal_t *journal; 2409 journal_t *journal;
1419 int err; 2410 int err;
1420 2411
2412 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
2413 test_opt(inode->i_sb, DELALLOC)) {
2414 /*
2415 * With delalloc we want to sync the file
2416 * so that we can make sure we allocate
2417 * blocks for file
2418 */
2419 filemap_write_and_wait(mapping);
2420 }
2421
1421 if (EXT4_I(inode)->i_state & EXT4_STATE_JDATA) { 2422 if (EXT4_I(inode)->i_state & EXT4_STATE_JDATA) {
1422 /* 2423 /*
1423 * This is a REALLY heavyweight approach, but the use of 2424 * This is a REALLY heavyweight approach, but the use of
@@ -1462,21 +2463,17 @@ static int bput_one(handle_t *handle, struct buffer_head *bh)
1462 return 0; 2463 return 0;
1463} 2464}
1464 2465
1465static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
1466{
1467 if (buffer_mapped(bh))
1468 return ext4_journal_dirty_data(handle, bh);
1469 return 0;
1470}
1471
1472/* 2466/*
1473 * Note that we always start a transaction even if we're not journalling 2467 * Note that we don't need to start a transaction unless we're journaling data
1474 * data. This is to preserve ordering: any hole instantiation within 2468 * because we should have holes filled from ext4_page_mkwrite(). We even don't
1475 * __block_write_full_page -> ext4_get_block() should be journalled 2469 * need to file the inode to the transaction's list in ordered mode because if
1476 * along with the data so we don't crash and then get metadata which 2470 * we are writing back data added by write(), the inode is already there and if
1477 * refers to old data. 2471 * we are writing back data modified via mmap(), noone guarantees in which
2472 * transaction the data will hit the disk. In case we are journaling data, we
2473 * cannot start transaction directly because transaction start ranks above page
2474 * lock so we have to do some magic.
1478 * 2475 *
1479 * In all journalling modes block_write_full_page() will start the I/O. 2476 * In all journaling modes block_write_full_page() will start the I/O.
1480 * 2477 *
1481 * Problem: 2478 * Problem:
1482 * 2479 *
@@ -1518,105 +2515,103 @@ static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
1518 * disastrous. Any write() or metadata operation will sync the fs for 2515 * disastrous. Any write() or metadata operation will sync the fs for
1519 * us. 2516 * us.
1520 * 2517 *
1521 * AKPM2: if all the page's buffers are mapped to disk and !data=journal,
1522 * we don't need to open a transaction here.
1523 */ 2518 */
1524static int ext4_ordered_writepage(struct page *page, 2519static int __ext4_normal_writepage(struct page *page,
1525 struct writeback_control *wbc) 2520 struct writeback_control *wbc)
1526{ 2521{
1527 struct inode *inode = page->mapping->host; 2522 struct inode *inode = page->mapping->host;
1528 struct buffer_head *page_bufs;
1529 handle_t *handle = NULL;
1530 int ret = 0;
1531 int err;
1532
1533 J_ASSERT(PageLocked(page));
1534
1535 /*
1536 * We give up here if we're reentered, because it might be for a
1537 * different filesystem.
1538 */
1539 if (ext4_journal_current_handle())
1540 goto out_fail;
1541 2523
1542 handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); 2524 if (test_opt(inode->i_sb, NOBH))
2525 return nobh_writepage(page,
2526 ext4_normal_get_block_write, wbc);
2527 else
2528 return block_write_full_page(page,
2529 ext4_normal_get_block_write,
2530 wbc);
2531}
1543 2532
1544 if (IS_ERR(handle)) { 2533static int ext4_normal_writepage(struct page *page,
1545 ret = PTR_ERR(handle); 2534 struct writeback_control *wbc)
1546 goto out_fail; 2535{
1547 } 2536 struct inode *inode = page->mapping->host;
2537 loff_t size = i_size_read(inode);
2538 loff_t len;
1548 2539
1549 if (!page_has_buffers(page)) { 2540 J_ASSERT(PageLocked(page));
1550 create_empty_buffers(page, inode->i_sb->s_blocksize, 2541 if (page->index == size >> PAGE_CACHE_SHIFT)
1551 (1 << BH_Dirty)|(1 << BH_Uptodate)); 2542 len = size & ~PAGE_CACHE_MASK;
2543 else
2544 len = PAGE_CACHE_SIZE;
2545
2546 if (page_has_buffers(page)) {
2547 /* if page has buffers it should all be mapped
2548 * and allocated. If there are not buffers attached
2549 * to the page we know the page is dirty but it lost
2550 * buffers. That means that at some moment in time
2551 * after write_begin() / write_end() has been called
2552 * all buffers have been clean and thus they must have been
2553 * written at least once. So they are all mapped and we can
2554 * happily proceed with mapping them and writing the page.
2555 */
2556 BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
2557 ext4_bh_unmapped_or_delay));
1552 } 2558 }
1553 page_bufs = page_buffers(page);
1554 walk_page_buffers(handle, page_bufs, 0,
1555 PAGE_CACHE_SIZE, NULL, bget_one);
1556
1557 ret = block_write_full_page(page, ext4_get_block, wbc);
1558 2559
1559 /* 2560 if (!ext4_journal_current_handle())
1560 * The page can become unlocked at any point now, and 2561 return __ext4_normal_writepage(page, wbc);
1561 * truncate can then come in and change things. So we
1562 * can't touch *page from now on. But *page_bufs is
1563 * safe due to elevated refcount.
1564 */
1565 2562
1566 /*
1567 * And attach them to the current transaction. But only if
1568 * block_write_full_page() succeeded. Otherwise they are unmapped,
1569 * and generally junk.
1570 */
1571 if (ret == 0) {
1572 err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
1573 NULL, jbd2_journal_dirty_data_fn);
1574 if (!ret)
1575 ret = err;
1576 }
1577 walk_page_buffers(handle, page_bufs, 0,
1578 PAGE_CACHE_SIZE, NULL, bput_one);
1579 err = ext4_journal_stop(handle);
1580 if (!ret)
1581 ret = err;
1582 return ret;
1583
1584out_fail:
1585 redirty_page_for_writepage(wbc, page); 2563 redirty_page_for_writepage(wbc, page);
1586 unlock_page(page); 2564 unlock_page(page);
1587 return ret; 2565 return 0;
1588} 2566}
1589 2567
1590static int ext4_writeback_writepage(struct page *page, 2568static int __ext4_journalled_writepage(struct page *page,
1591 struct writeback_control *wbc) 2569 struct writeback_control *wbc)
1592{ 2570{
1593 struct inode *inode = page->mapping->host; 2571 struct address_space *mapping = page->mapping;
2572 struct inode *inode = mapping->host;
2573 struct buffer_head *page_bufs;
1594 handle_t *handle = NULL; 2574 handle_t *handle = NULL;
1595 int ret = 0; 2575 int ret = 0;
1596 int err; 2576 int err;
1597 2577
1598 if (ext4_journal_current_handle()) 2578 ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
1599 goto out_fail; 2579 ext4_normal_get_block_write);
2580 if (ret != 0)
2581 goto out_unlock;
2582
2583 page_bufs = page_buffers(page);
2584 walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL,
2585 bget_one);
2586 /* As soon as we unlock the page, it can go away, but we have
2587 * references to buffers so we are safe */
2588 unlock_page(page);
1600 2589
1601 handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); 2590 handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
1602 if (IS_ERR(handle)) { 2591 if (IS_ERR(handle)) {
1603 ret = PTR_ERR(handle); 2592 ret = PTR_ERR(handle);
1604 goto out_fail; 2593 goto out;
1605 } 2594 }
1606 2595
1607 if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) 2596 ret = walk_page_buffers(handle, page_bufs, 0,
1608 ret = nobh_writepage(page, ext4_get_block, wbc); 2597 PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
1609 else
1610 ret = block_write_full_page(page, ext4_get_block, wbc);
1611 2598
2599 err = walk_page_buffers(handle, page_bufs, 0,
2600 PAGE_CACHE_SIZE, NULL, write_end_fn);
2601 if (ret == 0)
2602 ret = err;
1612 err = ext4_journal_stop(handle); 2603 err = ext4_journal_stop(handle);
1613 if (!ret) 2604 if (!ret)
1614 ret = err; 2605 ret = err;
1615 return ret;
1616 2606
1617out_fail: 2607 walk_page_buffers(handle, page_bufs, 0,
1618 redirty_page_for_writepage(wbc, page); 2608 PAGE_CACHE_SIZE, NULL, bput_one);
2609 EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
2610 goto out;
2611
2612out_unlock:
1619 unlock_page(page); 2613 unlock_page(page);
2614out:
1620 return ret; 2615 return ret;
1621} 2616}
1622 2617
@@ -1624,59 +2619,53 @@ static int ext4_journalled_writepage(struct page *page,
1624 struct writeback_control *wbc) 2619 struct writeback_control *wbc)
1625{ 2620{
1626 struct inode *inode = page->mapping->host; 2621 struct inode *inode = page->mapping->host;
1627 handle_t *handle = NULL; 2622 loff_t size = i_size_read(inode);
1628 int ret = 0; 2623 loff_t len;
1629 int err;
1630 2624
1631 if (ext4_journal_current_handle()) 2625 J_ASSERT(PageLocked(page));
1632 goto no_write; 2626 if (page->index == size >> PAGE_CACHE_SHIFT)
2627 len = size & ~PAGE_CACHE_MASK;
2628 else
2629 len = PAGE_CACHE_SIZE;
2630
2631 if (page_has_buffers(page)) {
2632 /* if page has buffers it should all be mapped
2633 * and allocated. If there are not buffers attached
2634 * to the page we know the page is dirty but it lost
2635 * buffers. That means that at some moment in time
2636 * after write_begin() / write_end() has been called
2637 * all buffers have been clean and thus they must have been
2638 * written at least once. So they are all mapped and we can
2639 * happily proceed with mapping them and writing the page.
2640 */
2641 BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
2642 ext4_bh_unmapped_or_delay));
2643 }
1633 2644
1634 handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); 2645 if (ext4_journal_current_handle())
1635 if (IS_ERR(handle)) {
1636 ret = PTR_ERR(handle);
1637 goto no_write; 2646 goto no_write;
1638 }
1639 2647
1640 if (!page_has_buffers(page) || PageChecked(page)) { 2648 if (PageChecked(page)) {
1641 /* 2649 /*
1642 * It's mmapped pagecache. Add buffers and journal it. There 2650 * It's mmapped pagecache. Add buffers and journal it. There
1643 * doesn't seem much point in redirtying the page here. 2651 * doesn't seem much point in redirtying the page here.
1644 */ 2652 */
1645 ClearPageChecked(page); 2653 ClearPageChecked(page);
1646 ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, 2654 return __ext4_journalled_writepage(page, wbc);
1647 ext4_get_block);
1648 if (ret != 0) {
1649 ext4_journal_stop(handle);
1650 goto out_unlock;
1651 }
1652 ret = walk_page_buffers(handle, page_buffers(page), 0,
1653 PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
1654
1655 err = walk_page_buffers(handle, page_buffers(page), 0,
1656 PAGE_CACHE_SIZE, NULL, write_end_fn);
1657 if (ret == 0)
1658 ret = err;
1659 EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
1660 unlock_page(page);
1661 } else { 2655 } else {
1662 /* 2656 /*
1663 * It may be a page full of checkpoint-mode buffers. We don't 2657 * It may be a page full of checkpoint-mode buffers. We don't
1664 * really know unless we go poke around in the buffer_heads. 2658 * really know unless we go poke around in the buffer_heads.
1665 * But block_write_full_page will do the right thing. 2659 * But block_write_full_page will do the right thing.
1666 */ 2660 */
1667 ret = block_write_full_page(page, ext4_get_block, wbc); 2661 return block_write_full_page(page,
2662 ext4_normal_get_block_write,
2663 wbc);
1668 } 2664 }
1669 err = ext4_journal_stop(handle);
1670 if (!ret)
1671 ret = err;
1672out:
1673 return ret;
1674
1675no_write: 2665no_write:
1676 redirty_page_for_writepage(wbc, page); 2666 redirty_page_for_writepage(wbc, page);
1677out_unlock:
1678 unlock_page(page); 2667 unlock_page(page);
1679 goto out; 2668 return 0;
1680} 2669}
1681 2670
1682static int ext4_readpage(struct file *file, struct page *page) 2671static int ext4_readpage(struct file *file, struct page *page)
@@ -1819,7 +2808,7 @@ static int ext4_journalled_set_page_dirty(struct page *page)
1819static const struct address_space_operations ext4_ordered_aops = { 2808static const struct address_space_operations ext4_ordered_aops = {
1820 .readpage = ext4_readpage, 2809 .readpage = ext4_readpage,
1821 .readpages = ext4_readpages, 2810 .readpages = ext4_readpages,
1822 .writepage = ext4_ordered_writepage, 2811 .writepage = ext4_normal_writepage,
1823 .sync_page = block_sync_page, 2812 .sync_page = block_sync_page,
1824 .write_begin = ext4_write_begin, 2813 .write_begin = ext4_write_begin,
1825 .write_end = ext4_ordered_write_end, 2814 .write_end = ext4_ordered_write_end,
@@ -1833,7 +2822,7 @@ static const struct address_space_operations ext4_ordered_aops = {
1833static const struct address_space_operations ext4_writeback_aops = { 2822static const struct address_space_operations ext4_writeback_aops = {
1834 .readpage = ext4_readpage, 2823 .readpage = ext4_readpage,
1835 .readpages = ext4_readpages, 2824 .readpages = ext4_readpages,
1836 .writepage = ext4_writeback_writepage, 2825 .writepage = ext4_normal_writepage,
1837 .sync_page = block_sync_page, 2826 .sync_page = block_sync_page,
1838 .write_begin = ext4_write_begin, 2827 .write_begin = ext4_write_begin,
1839 .write_end = ext4_writeback_write_end, 2828 .write_end = ext4_writeback_write_end,
@@ -1857,10 +2846,31 @@ static const struct address_space_operations ext4_journalled_aops = {
1857 .releasepage = ext4_releasepage, 2846 .releasepage = ext4_releasepage,
1858}; 2847};
1859 2848
2849static const struct address_space_operations ext4_da_aops = {
2850 .readpage = ext4_readpage,
2851 .readpages = ext4_readpages,
2852 .writepage = ext4_da_writepage,
2853 .writepages = ext4_da_writepages,
2854 .sync_page = block_sync_page,
2855 .write_begin = ext4_da_write_begin,
2856 .write_end = ext4_da_write_end,
2857 .bmap = ext4_bmap,
2858 .invalidatepage = ext4_da_invalidatepage,
2859 .releasepage = ext4_releasepage,
2860 .direct_IO = ext4_direct_IO,
2861 .migratepage = buffer_migrate_page,
2862};
2863
1860void ext4_set_aops(struct inode *inode) 2864void ext4_set_aops(struct inode *inode)
1861{ 2865{
1862 if (ext4_should_order_data(inode)) 2866 if (ext4_should_order_data(inode) &&
2867 test_opt(inode->i_sb, DELALLOC))
2868 inode->i_mapping->a_ops = &ext4_da_aops;
2869 else if (ext4_should_order_data(inode))
1863 inode->i_mapping->a_ops = &ext4_ordered_aops; 2870 inode->i_mapping->a_ops = &ext4_ordered_aops;
2871 else if (ext4_should_writeback_data(inode) &&
2872 test_opt(inode->i_sb, DELALLOC))
2873 inode->i_mapping->a_ops = &ext4_da_aops;
1864 else if (ext4_should_writeback_data(inode)) 2874 else if (ext4_should_writeback_data(inode))
1865 inode->i_mapping->a_ops = &ext4_writeback_aops; 2875 inode->i_mapping->a_ops = &ext4_writeback_aops;
1866 else 2876 else
@@ -1873,7 +2883,7 @@ void ext4_set_aops(struct inode *inode)
1873 * This required during truncate. We need to physically zero the tail end 2883 * This required during truncate. We need to physically zero the tail end
1874 * of that block so it doesn't yield old data if the file is later grown. 2884 * of that block so it doesn't yield old data if the file is later grown.
1875 */ 2885 */
1876int ext4_block_truncate_page(handle_t *handle, struct page *page, 2886int ext4_block_truncate_page(handle_t *handle,
1877 struct address_space *mapping, loff_t from) 2887 struct address_space *mapping, loff_t from)
1878{ 2888{
1879 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; 2889 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
@@ -1882,8 +2892,13 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page,
1882 ext4_lblk_t iblock; 2892 ext4_lblk_t iblock;
1883 struct inode *inode = mapping->host; 2893 struct inode *inode = mapping->host;
1884 struct buffer_head *bh; 2894 struct buffer_head *bh;
2895 struct page *page;
1885 int err = 0; 2896 int err = 0;
1886 2897
2898 page = grab_cache_page(mapping, from >> PAGE_CACHE_SHIFT);
2899 if (!page)
2900 return -EINVAL;
2901
1887 blocksize = inode->i_sb->s_blocksize; 2902 blocksize = inode->i_sb->s_blocksize;
1888 length = blocksize - (offset & (blocksize - 1)); 2903 length = blocksize - (offset & (blocksize - 1));
1889 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); 2904 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
@@ -1956,7 +2971,7 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page,
1956 err = ext4_journal_dirty_metadata(handle, bh); 2971 err = ext4_journal_dirty_metadata(handle, bh);
1957 } else { 2972 } else {
1958 if (ext4_should_order_data(inode)) 2973 if (ext4_should_order_data(inode))
1959 err = ext4_journal_dirty_data(handle, bh); 2974 err = ext4_jbd2_file_inode(handle, inode);
1960 mark_buffer_dirty(bh); 2975 mark_buffer_dirty(bh);
1961 } 2976 }
1962 2977
@@ -2179,7 +3194,21 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
2179 3194
2180 if (this_bh) { 3195 if (this_bh) {
2181 BUFFER_TRACE(this_bh, "call ext4_journal_dirty_metadata"); 3196 BUFFER_TRACE(this_bh, "call ext4_journal_dirty_metadata");
2182 ext4_journal_dirty_metadata(handle, this_bh); 3197
3198 /*
3199 * The buffer head should have an attached journal head at this
3200 * point. However, if the data is corrupted and an indirect
3201 * block pointed to itself, it would have been detached when
3202 * the block was cleared. Check for this instead of OOPSing.
3203 */
3204 if (bh2jh(this_bh))
3205 ext4_journal_dirty_metadata(handle, this_bh);
3206 else
3207 ext4_error(inode->i_sb, __func__,
3208 "circular indirect block detected, "
3209 "inode=%lu, block=%llu",
3210 inode->i_ino,
3211 (unsigned long long) this_bh->b_blocknr);
2183 } 3212 }
2184} 3213}
2185 3214
@@ -2305,6 +3334,19 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
2305 } 3334 }
2306} 3335}
2307 3336
3337int ext4_can_truncate(struct inode *inode)
3338{
3339 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
3340 return 0;
3341 if (S_ISREG(inode->i_mode))
3342 return 1;
3343 if (S_ISDIR(inode->i_mode))
3344 return 1;
3345 if (S_ISLNK(inode->i_mode))
3346 return !ext4_inode_is_fast_symlink(inode);
3347 return 0;
3348}
3349
2308/* 3350/*
2309 * ext4_truncate() 3351 * ext4_truncate()
2310 * 3352 *
@@ -2347,51 +3389,25 @@ void ext4_truncate(struct inode *inode)
2347 int n; 3389 int n;
2348 ext4_lblk_t last_block; 3390 ext4_lblk_t last_block;
2349 unsigned blocksize = inode->i_sb->s_blocksize; 3391 unsigned blocksize = inode->i_sb->s_blocksize;
2350 struct page *page;
2351 3392
2352 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 3393 if (!ext4_can_truncate(inode))
2353 S_ISLNK(inode->i_mode)))
2354 return;
2355 if (ext4_inode_is_fast_symlink(inode))
2356 return;
2357 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
2358 return; 3394 return;
2359 3395
2360 /*
2361 * We have to lock the EOF page here, because lock_page() nests
2362 * outside jbd2_journal_start().
2363 */
2364 if ((inode->i_size & (blocksize - 1)) == 0) {
2365 /* Block boundary? Nothing to do */
2366 page = NULL;
2367 } else {
2368 page = grab_cache_page(mapping,
2369 inode->i_size >> PAGE_CACHE_SHIFT);
2370 if (!page)
2371 return;
2372 }
2373
2374 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 3396 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
2375 ext4_ext_truncate(inode, page); 3397 ext4_ext_truncate(inode);
2376 return; 3398 return;
2377 } 3399 }
2378 3400
2379 handle = start_transaction(inode); 3401 handle = start_transaction(inode);
2380 if (IS_ERR(handle)) { 3402 if (IS_ERR(handle))
2381 if (page) {
2382 clear_highpage(page);
2383 flush_dcache_page(page);
2384 unlock_page(page);
2385 page_cache_release(page);
2386 }
2387 return; /* AKPM: return what? */ 3403 return; /* AKPM: return what? */
2388 }
2389 3404
2390 last_block = (inode->i_size + blocksize-1) 3405 last_block = (inode->i_size + blocksize-1)
2391 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); 3406 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
2392 3407
2393 if (page) 3408 if (inode->i_size & (blocksize - 1))
2394 ext4_block_truncate_page(handle, page, mapping, inode->i_size); 3409 if (ext4_block_truncate_page(handle, mapping, inode->i_size))
3410 goto out_stop;
2395 3411
2396 n = ext4_block_to_path(inode, last_block, offsets, NULL); 3412 n = ext4_block_to_path(inode, last_block, offsets, NULL);
2397 if (n == 0) 3413 if (n == 0)
@@ -2410,6 +3426,11 @@ void ext4_truncate(struct inode *inode)
2410 goto out_stop; 3426 goto out_stop;
2411 3427
2412 /* 3428 /*
3429 * From here we block out all ext4_get_block() callers who want to
3430 * modify the block allocation tree.
3431 */
3432 down_write(&ei->i_data_sem);
3433 /*
2413 * The orphan list entry will now protect us from any crash which 3434 * The orphan list entry will now protect us from any crash which
2414 * occurs before the truncate completes, so it is now safe to propagate 3435 * occurs before the truncate completes, so it is now safe to propagate
2415 * the new, shorter inode size (held for now in i_size) into the 3436 * the new, shorter inode size (held for now in i_size) into the
@@ -2418,12 +3439,6 @@ void ext4_truncate(struct inode *inode)
2418 */ 3439 */
2419 ei->i_disksize = inode->i_size; 3440 ei->i_disksize = inode->i_size;
2420 3441
2421 /*
2422 * From here we block out all ext4_get_block() callers who want to
2423 * modify the block allocation tree.
2424 */
2425 down_write(&ei->i_data_sem);
2426
2427 if (n == 1) { /* direct blocks */ 3442 if (n == 1) { /* direct blocks */
2428 ext4_free_data(handle, inode, NULL, i_data+offsets[0], 3443 ext4_free_data(handle, inode, NULL, i_data+offsets[0],
2429 i_data + EXT4_NDIR_BLOCKS); 3444 i_data + EXT4_NDIR_BLOCKS);
@@ -3107,7 +4122,14 @@ int ext4_write_inode(struct inode *inode, int wait)
3107 * be freed, so we have a strong guarantee that no future commit will 4122 * be freed, so we have a strong guarantee that no future commit will
3108 * leave these blocks visible to the user.) 4123 * leave these blocks visible to the user.)
3109 * 4124 *
3110 * Called with inode->sem down. 4125 * Another thing we have to assure is that if we are in ordered mode
4126 * and inode is still attached to the committing transaction, we must
4127 * start writeout of all the dirty pages which are being truncated.
4128 * This way we are sure that all the data written in the previous
4129 * transaction are already on disk (truncate waits for pages under
4130 * writeback).
4131 *
4132 * Called with inode->i_mutex down.
3111 */ 4133 */
3112int ext4_setattr(struct dentry *dentry, struct iattr *attr) 4134int ext4_setattr(struct dentry *dentry, struct iattr *attr)
3113{ 4135{
@@ -3173,6 +4195,22 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
3173 if (!error) 4195 if (!error)
3174 error = rc; 4196 error = rc;
3175 ext4_journal_stop(handle); 4197 ext4_journal_stop(handle);
4198
4199 if (ext4_should_order_data(inode)) {
4200 error = ext4_begin_ordered_truncate(inode,
4201 attr->ia_size);
4202 if (error) {
4203 /* Do as much error cleanup as possible */
4204 handle = ext4_journal_start(inode, 3);
4205 if (IS_ERR(handle)) {
4206 ext4_orphan_del(NULL, inode);
4207 goto err_out;
4208 }
4209 ext4_orphan_del(handle, inode);
4210 ext4_journal_stop(handle);
4211 goto err_out;
4212 }
4213 }
3176 } 4214 }
3177 4215
3178 rc = inode_setattr(inode, attr); 4216 rc = inode_setattr(inode, attr);
@@ -3193,6 +4231,32 @@ err_out:
3193 return error; 4231 return error;
3194} 4232}
3195 4233
4234int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
4235 struct kstat *stat)
4236{
4237 struct inode *inode;
4238 unsigned long delalloc_blocks;
4239
4240 inode = dentry->d_inode;
4241 generic_fillattr(inode, stat);
4242
4243 /*
4244 * We can't update i_blocks if the block allocation is delayed;
4245 * otherwise, in the case of a system crash before the real block
4246 * allocation is done, we will have i_blocks inconsistent with
4247 * on-disk file blocks.
4248 * We always keep i_blocks updated together with real
4249 * allocation. But so as not to confuse the user, stat
4250 * will return the blocks that include the delayed allocation
4251 * blocks for this file.
4252 */
4253 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
4254 delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks;
4255 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
4256
4257 stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9;
4258 return 0;
4259}
3196 4260
3197/* 4261/*
3198 * How many blocks doth make a writepage()? 4262 * How many blocks doth make a writepage()?
@@ -3506,3 +4570,64 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
3506 4570
3507 return err; 4571 return err;
3508} 4572}
4573
4574static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh)
4575{
4576 return !buffer_mapped(bh);
4577}
4578
4579int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
4580{
4581 loff_t size;
4582 unsigned long len;
4583 int ret = -EINVAL;
4584 struct file *file = vma->vm_file;
4585 struct inode *inode = file->f_path.dentry->d_inode;
4586 struct address_space *mapping = inode->i_mapping;
4587
4588 /*
4589 * Get i_alloc_sem to stop truncates messing with the inode. We cannot
4590 * get i_mutex because we are already holding mmap_sem.
4591 */
4592 down_read(&inode->i_alloc_sem);
4593 size = i_size_read(inode);
4594 if (page->mapping != mapping || size <= page_offset(page)
4595 || !PageUptodate(page)) {
4596 /* page got truncated from under us? */
4597 goto out_unlock;
4598 }
4599 ret = 0;
4600 if (PageMappedToDisk(page))
4601 goto out_unlock;
4602
4603 if (page->index == size >> PAGE_CACHE_SHIFT)
4604 len = size & ~PAGE_CACHE_MASK;
4605 else
4606 len = PAGE_CACHE_SIZE;
4607
4608 if (page_has_buffers(page)) {
4609 /* return if we have all the buffers mapped */
4610 if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
4611 ext4_bh_unmapped))
4612 goto out_unlock;
4613 }
4614 /*
4615 * OK, we need to fill the hole... Do write_begin and write_end
4616 * to do block allocation/reservation. We are not holding
4617 * inode->i_mutex here, which allows parallel write_begin and
4618 * write_end calls. lock_page prevents this from happening
4619 * on the same page, though.
4620 */
4621 ret = mapping->a_ops->write_begin(file, mapping, page_offset(page),
4622 len, AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
4623 if (ret < 0)
4624 goto out_unlock;
4625 ret = mapping->a_ops->write_end(file, mapping, page_offset(page),
4626 len, len, page, NULL);
4627 if (ret < 0)
4628 goto out_unlock;
4629 ret = 0;
4630out_unlock:
4631 up_read(&inode->i_alloc_sem);
4632 return ret;
4633}
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index c9900aade150..8d141a25bbee 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -381,22 +381,28 @@ static inline void mb_clear_bit_atomic(spinlock_t *lock, int bit, void *addr)
381 381
382static inline int mb_find_next_zero_bit(void *addr, int max, int start) 382static inline int mb_find_next_zero_bit(void *addr, int max, int start)
383{ 383{
384 int fix = 0; 384 int fix = 0, ret, tmpmax;
385 addr = mb_correct_addr_and_bit(&fix, addr); 385 addr = mb_correct_addr_and_bit(&fix, addr);
386 max += fix; 386 tmpmax = max + fix;
387 start += fix; 387 start += fix;
388 388
389 return ext4_find_next_zero_bit(addr, max, start) - fix; 389 ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix;
390 if (ret > max)
391 return max;
392 return ret;
390} 393}
391 394
392static inline int mb_find_next_bit(void *addr, int max, int start) 395static inline int mb_find_next_bit(void *addr, int max, int start)
393{ 396{
394 int fix = 0; 397 int fix = 0, ret, tmpmax;
395 addr = mb_correct_addr_and_bit(&fix, addr); 398 addr = mb_correct_addr_and_bit(&fix, addr);
396 max += fix; 399 tmpmax = max + fix;
397 start += fix; 400 start += fix;
398 401
399 return ext4_find_next_bit(addr, max, start) - fix; 402 ret = ext4_find_next_bit(addr, tmpmax, start) - fix;
403 if (ret > max)
404 return max;
405 return ret;
400} 406}
401 407
402static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max) 408static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
@@ -803,6 +809,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
803 if (!buffer_uptodate(bh[i])) 809 if (!buffer_uptodate(bh[i]))
804 goto out; 810 goto out;
805 811
812 err = 0;
806 first_block = page->index * blocks_per_page; 813 first_block = page->index * blocks_per_page;
807 for (i = 0; i < blocks_per_page; i++) { 814 for (i = 0; i < blocks_per_page; i++) {
808 int group; 815 int group;
@@ -883,6 +890,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
883 int pnum; 890 int pnum;
884 int poff; 891 int poff;
885 struct page *page; 892 struct page *page;
893 int ret;
886 894
887 mb_debug("load group %lu\n", group); 895 mb_debug("load group %lu\n", group);
888 896
@@ -914,15 +922,21 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
914 if (page) { 922 if (page) {
915 BUG_ON(page->mapping != inode->i_mapping); 923 BUG_ON(page->mapping != inode->i_mapping);
916 if (!PageUptodate(page)) { 924 if (!PageUptodate(page)) {
917 ext4_mb_init_cache(page, NULL); 925 ret = ext4_mb_init_cache(page, NULL);
926 if (ret) {
927 unlock_page(page);
928 goto err;
929 }
918 mb_cmp_bitmaps(e4b, page_address(page) + 930 mb_cmp_bitmaps(e4b, page_address(page) +
919 (poff * sb->s_blocksize)); 931 (poff * sb->s_blocksize));
920 } 932 }
921 unlock_page(page); 933 unlock_page(page);
922 } 934 }
923 } 935 }
924 if (page == NULL || !PageUptodate(page)) 936 if (page == NULL || !PageUptodate(page)) {
937 ret = -EIO;
925 goto err; 938 goto err;
939 }
926 e4b->bd_bitmap_page = page; 940 e4b->bd_bitmap_page = page;
927 e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); 941 e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
928 mark_page_accessed(page); 942 mark_page_accessed(page);
@@ -938,14 +952,20 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
938 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); 952 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
939 if (page) { 953 if (page) {
940 BUG_ON(page->mapping != inode->i_mapping); 954 BUG_ON(page->mapping != inode->i_mapping);
941 if (!PageUptodate(page)) 955 if (!PageUptodate(page)) {
942 ext4_mb_init_cache(page, e4b->bd_bitmap); 956 ret = ext4_mb_init_cache(page, e4b->bd_bitmap);
943 957 if (ret) {
958 unlock_page(page);
959 goto err;
960 }
961 }
944 unlock_page(page); 962 unlock_page(page);
945 } 963 }
946 } 964 }
947 if (page == NULL || !PageUptodate(page)) 965 if (page == NULL || !PageUptodate(page)) {
966 ret = -EIO;
948 goto err; 967 goto err;
968 }
949 e4b->bd_buddy_page = page; 969 e4b->bd_buddy_page = page;
950 e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); 970 e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
951 mark_page_accessed(page); 971 mark_page_accessed(page);
@@ -962,7 +982,7 @@ err:
962 page_cache_release(e4b->bd_buddy_page); 982 page_cache_release(e4b->bd_buddy_page);
963 e4b->bd_buddy = NULL; 983 e4b->bd_buddy = NULL;
964 e4b->bd_bitmap = NULL; 984 e4b->bd_bitmap = NULL;
965 return -EIO; 985 return ret;
966} 986}
967 987
968static void ext4_mb_release_desc(struct ext4_buddy *e4b) 988static void ext4_mb_release_desc(struct ext4_buddy *e4b)
@@ -1031,7 +1051,7 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len)
1031 } 1051 }
1032} 1052}
1033 1053
1034static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, 1054static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1035 int first, int count) 1055 int first, int count)
1036{ 1056{
1037 int block = 0; 1057 int block = 0;
@@ -1071,11 +1091,12 @@ static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1071 blocknr += block; 1091 blocknr += block;
1072 blocknr += 1092 blocknr +=
1073 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); 1093 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
1074 1094 ext4_unlock_group(sb, e4b->bd_group);
1075 ext4_error(sb, __func__, "double-free of inode" 1095 ext4_error(sb, __func__, "double-free of inode"
1076 " %lu's block %llu(bit %u in group %lu)\n", 1096 " %lu's block %llu(bit %u in group %lu)\n",
1077 inode ? inode->i_ino : 0, blocknr, block, 1097 inode ? inode->i_ino : 0, blocknr, block,
1078 e4b->bd_group); 1098 e4b->bd_group);
1099 ext4_lock_group(sb, e4b->bd_group);
1079 } 1100 }
1080 mb_clear_bit(block, EXT4_MB_BITMAP(e4b)); 1101 mb_clear_bit(block, EXT4_MB_BITMAP(e4b));
1081 e4b->bd_info->bb_counters[order]++; 1102 e4b->bd_info->bb_counters[order]++;
@@ -1113,8 +1134,6 @@ static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1113 } while (1); 1134 } while (1);
1114 } 1135 }
1115 mb_check_buddy(e4b); 1136 mb_check_buddy(e4b);
1116
1117 return 0;
1118} 1137}
1119 1138
1120static int mb_find_extent(struct ext4_buddy *e4b, int order, int block, 1139static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
@@ -1730,10 +1749,6 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
1730 ac->ac_g_ex.fe_start = sbi->s_mb_last_start; 1749 ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
1731 spin_unlock(&sbi->s_md_lock); 1750 spin_unlock(&sbi->s_md_lock);
1732 } 1751 }
1733
1734 /* searching for the right group start from the goal value specified */
1735 group = ac->ac_g_ex.fe_group;
1736
1737 /* Let's just scan groups to find more-less suitable blocks */ 1752 /* Let's just scan groups to find more-less suitable blocks */
1738 cr = ac->ac_2order ? 0 : 1; 1753 cr = ac->ac_2order ? 0 : 1;
1739 /* 1754 /*
@@ -1743,6 +1758,12 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
1743repeat: 1758repeat:
1744 for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) { 1759 for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) {
1745 ac->ac_criteria = cr; 1760 ac->ac_criteria = cr;
1761 /*
1762 * searching for the right group start
1763 * from the goal value specified
1764 */
1765 group = ac->ac_g_ex.fe_group;
1766
1746 for (i = 0; i < EXT4_SB(sb)->s_groups_count; group++, i++) { 1767 for (i = 0; i < EXT4_SB(sb)->s_groups_count; group++, i++) {
1747 struct ext4_group_info *grp; 1768 struct ext4_group_info *grp;
1748 struct ext4_group_desc *desc; 1769 struct ext4_group_desc *desc;
@@ -1963,6 +1984,8 @@ static int ext4_mb_seq_history_open(struct inode *inode, struct file *file)
1963 int rc; 1984 int rc;
1964 int size; 1985 int size;
1965 1986
1987 if (unlikely(sbi->s_mb_history == NULL))
1988 return -ENOMEM;
1966 s = kmalloc(sizeof(*s), GFP_KERNEL); 1989 s = kmalloc(sizeof(*s), GFP_KERNEL);
1967 if (s == NULL) 1990 if (s == NULL)
1968 return -ENOMEM; 1991 return -ENOMEM;
@@ -2165,9 +2188,7 @@ static void ext4_mb_history_init(struct super_block *sb)
2165 sbi->s_mb_history_cur = 0; 2188 sbi->s_mb_history_cur = 0;
2166 spin_lock_init(&sbi->s_mb_history_lock); 2189 spin_lock_init(&sbi->s_mb_history_lock);
2167 i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history); 2190 i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history);
2168 sbi->s_mb_history = kmalloc(i, GFP_KERNEL); 2191 sbi->s_mb_history = kzalloc(i, GFP_KERNEL);
2169 if (likely(sbi->s_mb_history != NULL))
2170 memset(sbi->s_mb_history, 0, i);
2171 /* if we can't allocate history, then we simple won't use it */ 2192 /* if we can't allocate history, then we simple won't use it */
2172} 2193}
2173 2194
@@ -2215,21 +2236,192 @@ ext4_mb_store_history(struct ext4_allocation_context *ac)
2215#define ext4_mb_history_init(sb) 2236#define ext4_mb_history_init(sb)
2216#endif 2237#endif
2217 2238
2239
2240/* Create and initialize ext4_group_info data for the given group. */
2241int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2242 struct ext4_group_desc *desc)
2243{
2244 int i, len;
2245 int metalen = 0;
2246 struct ext4_sb_info *sbi = EXT4_SB(sb);
2247 struct ext4_group_info **meta_group_info;
2248
2249 /*
2250 * First check if this group is the first of a reserved block.
2251 * If it's true, we have to allocate a new table of pointers
2252 * to ext4_group_info structures
2253 */
2254 if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
2255 metalen = sizeof(*meta_group_info) <<
2256 EXT4_DESC_PER_BLOCK_BITS(sb);
2257 meta_group_info = kmalloc(metalen, GFP_KERNEL);
2258 if (meta_group_info == NULL) {
2259 printk(KERN_ERR "EXT4-fs: can't allocate mem for a "
2260 "buddy group\n");
2261 goto exit_meta_group_info;
2262 }
2263 sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] =
2264 meta_group_info;
2265 }
2266
2267 /*
2268 * calculate needed size. if change bb_counters size,
2269 * don't forget about ext4_mb_generate_buddy()
2270 */
2271 len = offsetof(typeof(**meta_group_info),
2272 bb_counters[sb->s_blocksize_bits + 2]);
2273
2274 meta_group_info =
2275 sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)];
2276 i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
2277
2278 meta_group_info[i] = kzalloc(len, GFP_KERNEL);
2279 if (meta_group_info[i] == NULL) {
2280 printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n");
2281 goto exit_group_info;
2282 }
2283 set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
2284 &(meta_group_info[i]->bb_state));
2285
2286 /*
2287 * initialize bb_free to be able to skip
2288 * empty groups without initialization
2289 */
2290 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
2291 meta_group_info[i]->bb_free =
2292 ext4_free_blocks_after_init(sb, group, desc);
2293 } else {
2294 meta_group_info[i]->bb_free =
2295 le16_to_cpu(desc->bg_free_blocks_count);
2296 }
2297
2298 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
2299
2300#ifdef DOUBLE_CHECK
2301 {
2302 struct buffer_head *bh;
2303 meta_group_info[i]->bb_bitmap =
2304 kmalloc(sb->s_blocksize, GFP_KERNEL);
2305 BUG_ON(meta_group_info[i]->bb_bitmap == NULL);
2306 bh = ext4_read_block_bitmap(sb, group);
2307 BUG_ON(bh == NULL);
2308 memcpy(meta_group_info[i]->bb_bitmap, bh->b_data,
2309 sb->s_blocksize);
2310 put_bh(bh);
2311 }
2312#endif
2313
2314 return 0;
2315
2316exit_group_info:
2317 /* If a meta_group_info table has been allocated, release it now */
2318 if (group % EXT4_DESC_PER_BLOCK(sb) == 0)
2319 kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]);
2320exit_meta_group_info:
2321 return -ENOMEM;
2322} /* ext4_mb_add_groupinfo */
2323
2324/*
2325 * Add a group to the existing groups.
2326 * This function is used for online resize
2327 */
2328int ext4_mb_add_more_groupinfo(struct super_block *sb, ext4_group_t group,
2329 struct ext4_group_desc *desc)
2330{
2331 struct ext4_sb_info *sbi = EXT4_SB(sb);
2332 struct inode *inode = sbi->s_buddy_cache;
2333 int blocks_per_page;
2334 int block;
2335 int pnum;
2336 struct page *page;
2337 int err;
2338
2339 /* Add group based on group descriptor*/
2340 err = ext4_mb_add_groupinfo(sb, group, desc);
2341 if (err)
2342 return err;
2343
2344 /*
2345 * Cache pages containing dynamic mb_alloc data (buddy and bitmap
2346 * data) are marked not up to date so that they will be re-initialized
2347 * during the next call to ext4_mb_load_buddy.
2348 */
2349
2350 /* Set buddy page as not up to date */
2351 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
2352 block = group * 2;
2353 pnum = block / blocks_per_page;
2354 page = find_get_page(inode->i_mapping, pnum);
2355 if (page != NULL) {
2356 ClearPageUptodate(page);
2357 page_cache_release(page);
2358 }
2359
2360 /* Set bitmap page as not up to date */
2361 block++;
2362 pnum = block / blocks_per_page;
2363 page = find_get_page(inode->i_mapping, pnum);
2364 if (page != NULL) {
2365 ClearPageUptodate(page);
2366 page_cache_release(page);
2367 }
2368
2369 return 0;
2370}
2371
2372/*
2373 * Update an existing group.
2374 * This function is used for online resize
2375 */
2376void ext4_mb_update_group_info(struct ext4_group_info *grp, ext4_grpblk_t add)
2377{
2378 grp->bb_free += add;
2379}
2380
2218static int ext4_mb_init_backend(struct super_block *sb) 2381static int ext4_mb_init_backend(struct super_block *sb)
2219{ 2382{
2220 ext4_group_t i; 2383 ext4_group_t i;
2221 int j, len, metalen; 2384 int metalen;
2222 struct ext4_sb_info *sbi = EXT4_SB(sb); 2385 struct ext4_sb_info *sbi = EXT4_SB(sb);
2223 int num_meta_group_infos = 2386 struct ext4_super_block *es = sbi->s_es;
2224 (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) >> 2387 int num_meta_group_infos;
2225 EXT4_DESC_PER_BLOCK_BITS(sb); 2388 int num_meta_group_infos_max;
2389 int array_size;
2226 struct ext4_group_info **meta_group_info; 2390 struct ext4_group_info **meta_group_info;
2391 struct ext4_group_desc *desc;
2392
2393 /* This is the number of blocks used by GDT */
2394 num_meta_group_infos = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) -
2395 1) >> EXT4_DESC_PER_BLOCK_BITS(sb);
2396
2397 /*
2398 * This is the total number of blocks used by GDT including
2399 * the number of reserved blocks for GDT.
2400 * The s_group_info array is allocated with this value
2401 * to allow a clean online resize without complex
2402 * pointer manipulation.
2403 * The drawback is the unused memory when no resize
2404 * occurs but it's very low in terms of pages
2405 * (see comments below)
2406 * Need to handle this properly when META_BG resizing is allowed
2407 */
2408 num_meta_group_infos_max = num_meta_group_infos +
2409 le16_to_cpu(es->s_reserved_gdt_blocks);
2227 2410
2411 /*
2412 * array_size is the size of s_group_info array. We round it
2413 * to the next power of two because this approximation is done
2414 * internally by kmalloc so we can have some more memory
2415 * for free here (e.g. may be used for META_BG resize).
2416 */
2417 array_size = 1;
2418 while (array_size < sizeof(*sbi->s_group_info) *
2419 num_meta_group_infos_max)
2420 array_size = array_size << 1;
2228 /* An 8TB filesystem with 64-bit pointers requires a 4096 byte 2421 /* An 8TB filesystem with 64-bit pointers requires a 4096 byte
2229 * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. 2422 * kmalloc. A 128kb malloc should suffice for a 256TB filesystem.
2230 * So a two level scheme suffices for now. */ 2423 * So a two level scheme suffices for now. */
2231 sbi->s_group_info = kmalloc(sizeof(*sbi->s_group_info) * 2424 sbi->s_group_info = kmalloc(array_size, GFP_KERNEL);
2232 num_meta_group_infos, GFP_KERNEL);
2233 if (sbi->s_group_info == NULL) { 2425 if (sbi->s_group_info == NULL) {
2234 printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n"); 2426 printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n");
2235 return -ENOMEM; 2427 return -ENOMEM;
@@ -2256,63 +2448,15 @@ static int ext4_mb_init_backend(struct super_block *sb)
2256 sbi->s_group_info[i] = meta_group_info; 2448 sbi->s_group_info[i] = meta_group_info;
2257 } 2449 }
2258 2450
2259 /*
2260 * calculate needed size. if change bb_counters size,
2261 * don't forget about ext4_mb_generate_buddy()
2262 */
2263 len = sizeof(struct ext4_group_info);
2264 len += sizeof(unsigned short) * (sb->s_blocksize_bits + 2);
2265 for (i = 0; i < sbi->s_groups_count; i++) { 2451 for (i = 0; i < sbi->s_groups_count; i++) {
2266 struct ext4_group_desc *desc;
2267
2268 meta_group_info =
2269 sbi->s_group_info[i >> EXT4_DESC_PER_BLOCK_BITS(sb)];
2270 j = i & (EXT4_DESC_PER_BLOCK(sb) - 1);
2271
2272 meta_group_info[j] = kzalloc(len, GFP_KERNEL);
2273 if (meta_group_info[j] == NULL) {
2274 printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n");
2275 goto err_freebuddy;
2276 }
2277 desc = ext4_get_group_desc(sb, i, NULL); 2452 desc = ext4_get_group_desc(sb, i, NULL);
2278 if (desc == NULL) { 2453 if (desc == NULL) {
2279 printk(KERN_ERR 2454 printk(KERN_ERR
2280 "EXT4-fs: can't read descriptor %lu\n", i); 2455 "EXT4-fs: can't read descriptor %lu\n", i);
2281 i++;
2282 goto err_freebuddy; 2456 goto err_freebuddy;
2283 } 2457 }
2284 memset(meta_group_info[j], 0, len); 2458 if (ext4_mb_add_groupinfo(sb, i, desc) != 0)
2285 set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, 2459 goto err_freebuddy;
2286 &(meta_group_info[j]->bb_state));
2287
2288 /*
2289 * initialize bb_free to be able to skip
2290 * empty groups without initialization
2291 */
2292 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
2293 meta_group_info[j]->bb_free =
2294 ext4_free_blocks_after_init(sb, i, desc);
2295 } else {
2296 meta_group_info[j]->bb_free =
2297 le16_to_cpu(desc->bg_free_blocks_count);
2298 }
2299
2300 INIT_LIST_HEAD(&meta_group_info[j]->bb_prealloc_list);
2301
2302#ifdef DOUBLE_CHECK
2303 {
2304 struct buffer_head *bh;
2305 meta_group_info[j]->bb_bitmap =
2306 kmalloc(sb->s_blocksize, GFP_KERNEL);
2307 BUG_ON(meta_group_info[j]->bb_bitmap == NULL);
2308 bh = read_block_bitmap(sb, i);
2309 BUG_ON(bh == NULL);
2310 memcpy(meta_group_info[j]->bb_bitmap, bh->b_data,
2311 sb->s_blocksize);
2312 put_bh(bh);
2313 }
2314#endif
2315
2316 } 2460 }
2317 2461
2318 return 0; 2462 return 0;
@@ -2336,6 +2480,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2336 unsigned i; 2480 unsigned i;
2337 unsigned offset; 2481 unsigned offset;
2338 unsigned max; 2482 unsigned max;
2483 int ret;
2339 2484
2340 if (!test_opt(sb, MBALLOC)) 2485 if (!test_opt(sb, MBALLOC))
2341 return 0; 2486 return 0;
@@ -2370,12 +2515,12 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2370 } while (i <= sb->s_blocksize_bits + 1); 2515 } while (i <= sb->s_blocksize_bits + 1);
2371 2516
2372 /* init file for buddy data */ 2517 /* init file for buddy data */
2373 i = ext4_mb_init_backend(sb); 2518 ret = ext4_mb_init_backend(sb);
2374 if (i) { 2519 if (ret != 0) {
2375 clear_opt(sbi->s_mount_opt, MBALLOC); 2520 clear_opt(sbi->s_mount_opt, MBALLOC);
2376 kfree(sbi->s_mb_offsets); 2521 kfree(sbi->s_mb_offsets);
2377 kfree(sbi->s_mb_maxs); 2522 kfree(sbi->s_mb_maxs);
2378 return i; 2523 return ret;
2379 } 2524 }
2380 2525
2381 spin_lock_init(&sbi->s_md_lock); 2526 spin_lock_init(&sbi->s_md_lock);
@@ -2548,8 +2693,7 @@ ext4_mb_free_committed_blocks(struct super_block *sb)
2548 ext4_lock_group(sb, md->group); 2693 ext4_lock_group(sb, md->group);
2549 for (i = 0; i < md->num; i++) { 2694 for (i = 0; i < md->num; i++) {
2550 mb_debug(" %u", md->blocks[i]); 2695 mb_debug(" %u", md->blocks[i]);
2551 err = mb_free_blocks(NULL, &e4b, md->blocks[i], 1); 2696 mb_free_blocks(NULL, &e4b, md->blocks[i], 1);
2552 BUG_ON(err != 0);
2553 } 2697 }
2554 mb_debug("\n"); 2698 mb_debug("\n");
2555 ext4_unlock_group(sb, md->group); 2699 ext4_unlock_group(sb, md->group);
@@ -2575,25 +2719,24 @@ ext4_mb_free_committed_blocks(struct super_block *sb)
2575 2719
2576 2720
2577 2721
2578#define MB_PROC_VALUE_READ(name) \ 2722#define MB_PROC_FOPS(name) \
2579static int ext4_mb_read_##name(char *page, char **start, \ 2723static int ext4_mb_##name##_proc_show(struct seq_file *m, void *v) \
2580 off_t off, int count, int *eof, void *data) \
2581{ \ 2724{ \
2582 struct ext4_sb_info *sbi = data; \ 2725 struct ext4_sb_info *sbi = m->private; \
2583 int len; \ 2726 \
2584 *eof = 1; \ 2727 seq_printf(m, "%ld\n", sbi->s_mb_##name); \
2585 if (off != 0) \ 2728 return 0; \
2586 return 0; \ 2729} \
2587 len = sprintf(page, "%ld\n", sbi->s_mb_##name); \ 2730 \
2588 *start = page; \ 2731static int ext4_mb_##name##_proc_open(struct inode *inode, struct file *file)\
2589 return len; \ 2732{ \
2590} 2733 return single_open(file, ext4_mb_##name##_proc_show, PDE(inode)->data);\
2591 2734} \
2592#define MB_PROC_VALUE_WRITE(name) \ 2735 \
2593static int ext4_mb_write_##name(struct file *file, \ 2736static ssize_t ext4_mb_##name##_proc_write(struct file *file, \
2594 const char __user *buf, unsigned long cnt, void *data) \ 2737 const char __user *buf, size_t cnt, loff_t *ppos) \
2595{ \ 2738{ \
2596 struct ext4_sb_info *sbi = data; \ 2739 struct ext4_sb_info *sbi = PDE(file->f_path.dentry->d_inode)->data;\
2597 char str[32]; \ 2740 char str[32]; \
2598 long value; \ 2741 long value; \
2599 if (cnt >= sizeof(str)) \ 2742 if (cnt >= sizeof(str)) \
@@ -2605,31 +2748,32 @@ static int ext4_mb_write_##name(struct file *file, \
2605 return -ERANGE; \ 2748 return -ERANGE; \
2606 sbi->s_mb_##name = value; \ 2749 sbi->s_mb_##name = value; \
2607 return cnt; \ 2750 return cnt; \
2608} 2751} \
2752 \
2753static const struct file_operations ext4_mb_##name##_proc_fops = { \
2754 .owner = THIS_MODULE, \
2755 .open = ext4_mb_##name##_proc_open, \
2756 .read = seq_read, \
2757 .llseek = seq_lseek, \
2758 .release = single_release, \
2759 .write = ext4_mb_##name##_proc_write, \
2760};
2609 2761
2610MB_PROC_VALUE_READ(stats); 2762MB_PROC_FOPS(stats);
2611MB_PROC_VALUE_WRITE(stats); 2763MB_PROC_FOPS(max_to_scan);
2612MB_PROC_VALUE_READ(max_to_scan); 2764MB_PROC_FOPS(min_to_scan);
2613MB_PROC_VALUE_WRITE(max_to_scan); 2765MB_PROC_FOPS(order2_reqs);
2614MB_PROC_VALUE_READ(min_to_scan); 2766MB_PROC_FOPS(stream_request);
2615MB_PROC_VALUE_WRITE(min_to_scan); 2767MB_PROC_FOPS(group_prealloc);
2616MB_PROC_VALUE_READ(order2_reqs);
2617MB_PROC_VALUE_WRITE(order2_reqs);
2618MB_PROC_VALUE_READ(stream_request);
2619MB_PROC_VALUE_WRITE(stream_request);
2620MB_PROC_VALUE_READ(group_prealloc);
2621MB_PROC_VALUE_WRITE(group_prealloc);
2622 2768
2623#define MB_PROC_HANDLER(name, var) \ 2769#define MB_PROC_HANDLER(name, var) \
2624do { \ 2770do { \
2625 proc = create_proc_entry(name, mode, sbi->s_mb_proc); \ 2771 proc = proc_create_data(name, mode, sbi->s_mb_proc, \
2772 &ext4_mb_##var##_proc_fops, sbi); \
2626 if (proc == NULL) { \ 2773 if (proc == NULL) { \
2627 printk(KERN_ERR "EXT4-fs: can't to create %s\n", name); \ 2774 printk(KERN_ERR "EXT4-fs: can't to create %s\n", name); \
2628 goto err_out; \ 2775 goto err_out; \
2629 } \ 2776 } \
2630 proc->data = sbi; \
2631 proc->read_proc = ext4_mb_read_##var ; \
2632 proc->write_proc = ext4_mb_write_##var; \
2633} while (0) 2777} while (0)
2634 2778
2635static int ext4_mb_init_per_dev_proc(struct super_block *sb) 2779static int ext4_mb_init_per_dev_proc(struct super_block *sb)
@@ -2639,6 +2783,10 @@ static int ext4_mb_init_per_dev_proc(struct super_block *sb)
2639 struct proc_dir_entry *proc; 2783 struct proc_dir_entry *proc;
2640 char devname[64]; 2784 char devname[64];
2641 2785
2786 if (proc_root_ext4 == NULL) {
2787 sbi->s_mb_proc = NULL;
2788 return -EINVAL;
2789 }
2642 bdevname(sb->s_bdev, devname); 2790 bdevname(sb->s_bdev, devname);
2643 sbi->s_mb_proc = proc_mkdir(devname, proc_root_ext4); 2791 sbi->s_mb_proc = proc_mkdir(devname, proc_root_ext4);
2644 2792
@@ -2747,7 +2895,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2747 2895
2748 2896
2749 err = -EIO; 2897 err = -EIO;
2750 bitmap_bh = read_block_bitmap(sb, ac->ac_b_ex.fe_group); 2898 bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group);
2751 if (!bitmap_bh) 2899 if (!bitmap_bh)
2752 goto out_err; 2900 goto out_err;
2753 2901
@@ -2816,7 +2964,23 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2816 le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len); 2964 le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len);
2817 gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); 2965 gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
2818 spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); 2966 spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
2819 percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len); 2967
2968 /*
2969 * free blocks account has already be reduced/reserved
2970 * at write_begin() time for delayed allocation
2971 * do not double accounting
2972 */
2973 if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
2974 percpu_counter_sub(&sbi->s_freeblocks_counter,
2975 ac->ac_b_ex.fe_len);
2976
2977 if (sbi->s_log_groups_per_flex) {
2978 ext4_group_t flex_group = ext4_flex_group(sbi,
2979 ac->ac_b_ex.fe_group);
2980 spin_lock(sb_bgl_lock(sbi, flex_group));
2981 sbi->s_flex_groups[flex_group].free_blocks -= ac->ac_b_ex.fe_len;
2982 spin_unlock(sb_bgl_lock(sbi, flex_group));
2983 }
2820 2984
2821 err = ext4_journal_dirty_metadata(handle, bitmap_bh); 2985 err = ext4_journal_dirty_metadata(handle, bitmap_bh);
2822 if (err) 2986 if (err)
@@ -3473,8 +3637,6 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3473 if (bit >= end) 3637 if (bit >= end)
3474 break; 3638 break;
3475 next = mb_find_next_bit(bitmap_bh->b_data, end, bit); 3639 next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
3476 if (next > end)
3477 next = end;
3478 start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit + 3640 start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit +
3479 le32_to_cpu(sbi->s_es->s_first_data_block); 3641 le32_to_cpu(sbi->s_es->s_first_data_block);
3480 mb_debug(" free preallocated %u/%u in group %u\n", 3642 mb_debug(" free preallocated %u/%u in group %u\n",
@@ -3569,7 +3731,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
3569 if (list_empty(&grp->bb_prealloc_list)) 3731 if (list_empty(&grp->bb_prealloc_list))
3570 return 0; 3732 return 0;
3571 3733
3572 bitmap_bh = read_block_bitmap(sb, group); 3734 bitmap_bh = ext4_read_block_bitmap(sb, group);
3573 if (bitmap_bh == NULL) { 3735 if (bitmap_bh == NULL) {
3574 /* error handling here */ 3736 /* error handling here */
3575 ext4_mb_release_desc(&e4b); 3737 ext4_mb_release_desc(&e4b);
@@ -3743,7 +3905,7 @@ repeat:
3743 err = ext4_mb_load_buddy(sb, group, &e4b); 3905 err = ext4_mb_load_buddy(sb, group, &e4b);
3744 BUG_ON(err != 0); /* error handling here */ 3906 BUG_ON(err != 0); /* error handling here */
3745 3907
3746 bitmap_bh = read_block_bitmap(sb, group); 3908 bitmap_bh = ext4_read_block_bitmap(sb, group);
3747 if (bitmap_bh == NULL) { 3909 if (bitmap_bh == NULL) {
3748 /* error handling here */ 3910 /* error handling here */
3749 ext4_mb_release_desc(&e4b); 3911 ext4_mb_release_desc(&e4b);
@@ -4011,10 +4173,21 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4011 sbi = EXT4_SB(sb); 4173 sbi = EXT4_SB(sb);
4012 4174
4013 if (!test_opt(sb, MBALLOC)) { 4175 if (!test_opt(sb, MBALLOC)) {
4014 block = ext4_new_blocks_old(handle, ar->inode, ar->goal, 4176 block = ext4_old_new_blocks(handle, ar->inode, ar->goal,
4015 &(ar->len), errp); 4177 &(ar->len), errp);
4016 return block; 4178 return block;
4017 } 4179 }
4180 if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) {
4181 /*
4182 * With delalloc we already reserved the blocks
4183 */
4184 ar->len = ext4_has_free_blocks(sbi, ar->len);
4185 }
4186
4187 if (ar->len == 0) {
4188 *errp = -ENOSPC;
4189 return 0;
4190 }
4018 4191
4019 while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) { 4192 while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) {
4020 ar->flags |= EXT4_MB_HINT_NOPREALLOC; 4193 ar->flags |= EXT4_MB_HINT_NOPREALLOC;
@@ -4026,10 +4199,14 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4026 } 4199 }
4027 inquota = ar->len; 4200 inquota = ar->len;
4028 4201
4202 if (EXT4_I(ar->inode)->i_delalloc_reserved_flag)
4203 ar->flags |= EXT4_MB_DELALLOC_RESERVED;
4204
4029 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); 4205 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
4030 if (!ac) { 4206 if (!ac) {
4207 ar->len = 0;
4031 *errp = -ENOMEM; 4208 *errp = -ENOMEM;
4032 return 0; 4209 goto out1;
4033 } 4210 }
4034 4211
4035 ext4_mb_poll_new_transaction(sb, handle); 4212 ext4_mb_poll_new_transaction(sb, handle);
@@ -4037,12 +4214,11 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4037 *errp = ext4_mb_initialize_context(ac, ar); 4214 *errp = ext4_mb_initialize_context(ac, ar);
4038 if (*errp) { 4215 if (*errp) {
4039 ar->len = 0; 4216 ar->len = 0;
4040 goto out; 4217 goto out2;
4041 } 4218 }
4042 4219
4043 ac->ac_op = EXT4_MB_HISTORY_PREALLOC; 4220 ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
4044 if (!ext4_mb_use_preallocated(ac)) { 4221 if (!ext4_mb_use_preallocated(ac)) {
4045
4046 ac->ac_op = EXT4_MB_HISTORY_ALLOC; 4222 ac->ac_op = EXT4_MB_HISTORY_ALLOC;
4047 ext4_mb_normalize_request(ac, ar); 4223 ext4_mb_normalize_request(ac, ar);
4048repeat: 4224repeat:
@@ -4085,11 +4261,12 @@ repeat:
4085 4261
4086 ext4_mb_release_context(ac); 4262 ext4_mb_release_context(ac);
4087 4263
4088out: 4264out2:
4265 kmem_cache_free(ext4_ac_cachep, ac);
4266out1:
4089 if (ar->len < inquota) 4267 if (ar->len < inquota)
4090 DQUOT_FREE_BLOCK(ar->inode, inquota - ar->len); 4268 DQUOT_FREE_BLOCK(ar->inode, inquota - ar->len);
4091 4269
4092 kmem_cache_free(ext4_ac_cachep, ac);
4093 return block; 4270 return block;
4094} 4271}
4095static void ext4_mb_poll_new_transaction(struct super_block *sb, 4272static void ext4_mb_poll_new_transaction(struct super_block *sb,
@@ -4242,7 +4419,7 @@ do_more:
4242 overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb); 4419 overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb);
4243 count -= overflow; 4420 count -= overflow;
4244 } 4421 }
4245 bitmap_bh = read_block_bitmap(sb, block_group); 4422 bitmap_bh = ext4_read_block_bitmap(sb, block_group);
4246 if (!bitmap_bh) 4423 if (!bitmap_bh)
4247 goto error_return; 4424 goto error_return;
4248 gdp = ext4_get_group_desc(sb, block_group, &gd_bh); 4425 gdp = ext4_get_group_desc(sb, block_group, &gd_bh);
@@ -4309,10 +4486,9 @@ do_more:
4309 ext4_mb_free_metadata(handle, &e4b, block_group, bit, count); 4486 ext4_mb_free_metadata(handle, &e4b, block_group, bit, count);
4310 } else { 4487 } else {
4311 ext4_lock_group(sb, block_group); 4488 ext4_lock_group(sb, block_group);
4312 err = mb_free_blocks(inode, &e4b, bit, count); 4489 mb_free_blocks(inode, &e4b, bit, count);
4313 ext4_mb_return_to_preallocation(inode, &e4b, block, count); 4490 ext4_mb_return_to_preallocation(inode, &e4b, block, count);
4314 ext4_unlock_group(sb, block_group); 4491 ext4_unlock_group(sb, block_group);
4315 BUG_ON(err != 0);
4316 } 4492 }
4317 4493
4318 spin_lock(sb_bgl_lock(sbi, block_group)); 4494 spin_lock(sb_bgl_lock(sbi, block_group));
@@ -4321,6 +4497,13 @@ do_more:
4321 spin_unlock(sb_bgl_lock(sbi, block_group)); 4497 spin_unlock(sb_bgl_lock(sbi, block_group));
4322 percpu_counter_add(&sbi->s_freeblocks_counter, count); 4498 percpu_counter_add(&sbi->s_freeblocks_counter, count);
4323 4499
4500 if (sbi->s_log_groups_per_flex) {
4501 ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
4502 spin_lock(sb_bgl_lock(sbi, flex_group));
4503 sbi->s_flex_groups[flex_group].free_blocks += count;
4504 spin_unlock(sb_bgl_lock(sbi, flex_group));
4505 }
4506
4324 ext4_mb_release_desc(&e4b); 4507 ext4_mb_release_desc(&e4b);
4325 4508
4326 *freed += count; 4509 *freed += count;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index ab16beaa830d..387ad98350c3 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -183,6 +183,16 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
183 struct inode *inode); 183 struct inode *inode);
184 184
185/* 185/*
186 * p is at least 6 bytes before the end of page
187 */
188static inline struct ext4_dir_entry_2 *
189ext4_next_entry(struct ext4_dir_entry_2 *p)
190{
191 return (struct ext4_dir_entry_2 *)((char *)p +
192 ext4_rec_len_from_disk(p->rec_len));
193}
194
195/*
186 * Future: use high four bits of block for coalesce-on-delete flags 196 * Future: use high four bits of block for coalesce-on-delete flags
187 * Mask them off for now. 197 * Mask them off for now.
188 */ 198 */
@@ -231,13 +241,13 @@ static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize)
231{ 241{
232 unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) - 242 unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) -
233 EXT4_DIR_REC_LEN(2) - infosize; 243 EXT4_DIR_REC_LEN(2) - infosize;
234 return 0? 20: entry_space / sizeof(struct dx_entry); 244 return entry_space / sizeof(struct dx_entry);
235} 245}
236 246
237static inline unsigned dx_node_limit (struct inode *dir) 247static inline unsigned dx_node_limit (struct inode *dir)
238{ 248{
239 unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0); 249 unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0);
240 return 0? 22: entry_space / sizeof(struct dx_entry); 250 return entry_space / sizeof(struct dx_entry);
241} 251}
242 252
243/* 253/*
@@ -554,15 +564,6 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
554 564
555 565
556/* 566/*
557 * p is at least 6 bytes before the end of page
558 */
559static inline struct ext4_dir_entry_2 *ext4_next_entry(struct ext4_dir_entry_2 *p)
560{
561 return (struct ext4_dir_entry_2 *)((char *)p +
562 ext4_rec_len_from_disk(p->rec_len));
563}
564
565/*
566 * This function fills a red-black tree with information from a 567 * This function fills a red-black tree with information from a
567 * directory block. It returns the number directory entries loaded 568 * directory block. It returns the number directory entries loaded
568 * into the tree. If there is an error it is returned in err. 569 * into the tree. If there is an error it is returned in err.
@@ -993,19 +994,21 @@ static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry,
993 de = (struct ext4_dir_entry_2 *) bh->b_data; 994 de = (struct ext4_dir_entry_2 *) bh->b_data;
994 top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize - 995 top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize -
995 EXT4_DIR_REC_LEN(0)); 996 EXT4_DIR_REC_LEN(0));
996 for (; de < top; de = ext4_next_entry(de)) 997 for (; de < top; de = ext4_next_entry(de)) {
997 if (ext4_match (namelen, name, de)) { 998 int off = (block << EXT4_BLOCK_SIZE_BITS(sb))
998 if (!ext4_check_dir_entry("ext4_find_entry", 999 + ((char *) de - bh->b_data);
999 dir, de, bh, 1000
1000 (block<<EXT4_BLOCK_SIZE_BITS(sb)) 1001 if (!ext4_check_dir_entry(__func__, dir, de, bh, off)) {
1001 +((char *)de - bh->b_data))) { 1002 brelse(bh);
1002 brelse (bh);
1003 *err = ERR_BAD_DX_DIR; 1003 *err = ERR_BAD_DX_DIR;
1004 goto errout; 1004 goto errout;
1005 } 1005 }
1006 *res_dir = de; 1006
1007 dx_release (frames); 1007 if (ext4_match(namelen, name, de)) {
1008 return bh; 1008 *res_dir = de;
1009 dx_release(frames);
1010 return bh;
1011 }
1009 } 1012 }
1010 brelse (bh); 1013 brelse (bh);
1011 /* Check to see if we should continue to search */ 1014 /* Check to see if we should continue to search */
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 9ff7b1c04239..f000fbe2cd93 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -866,6 +866,15 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
866 gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp); 866 gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp);
867 867
868 /* 868 /*
869 * We can allocate memory for mb_alloc based on the new group
870 * descriptor
871 */
872 if (test_opt(sb, MBALLOC)) {
873 err = ext4_mb_add_more_groupinfo(sb, input->group, gdp);
874 if (err)
875 goto exit_journal;
876 }
877 /*
869 * Make the new blocks and inodes valid next. We do this before 878 * Make the new blocks and inodes valid next. We do this before
870 * increasing the group count so that once the group is enabled, 879 * increasing the group count so that once the group is enabled,
871 * all of its blocks and inodes are already valid. 880 * all of its blocks and inodes are already valid.
@@ -957,6 +966,8 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
957 handle_t *handle; 966 handle_t *handle;
958 int err; 967 int err;
959 unsigned long freed_blocks; 968 unsigned long freed_blocks;
969 ext4_group_t group;
970 struct ext4_group_info *grp;
960 971
961 /* We don't need to worry about locking wrt other resizers just 972 /* We don't need to worry about locking wrt other resizers just
962 * yet: we're going to revalidate es->s_blocks_count after 973 * yet: we're going to revalidate es->s_blocks_count after
@@ -988,7 +999,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
988 } 999 }
989 1000
990 /* Handle the remaining blocks in the last group only. */ 1001 /* Handle the remaining blocks in the last group only. */
991 ext4_get_group_no_and_offset(sb, o_blocks_count, NULL, &last); 1002 ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last);
992 1003
993 if (last == 0) { 1004 if (last == 0) {
994 ext4_warning(sb, __func__, 1005 ext4_warning(sb, __func__,
@@ -1060,6 +1071,45 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
1060 o_blocks_count + add); 1071 o_blocks_count + add);
1061 if ((err = ext4_journal_stop(handle))) 1072 if ((err = ext4_journal_stop(handle)))
1062 goto exit_put; 1073 goto exit_put;
1074
1075 /*
1076 * Mark mballoc pages as not up to date so that they will be updated
1077 * next time they are loaded by ext4_mb_load_buddy.
1078 */
1079 if (test_opt(sb, MBALLOC)) {
1080 struct ext4_sb_info *sbi = EXT4_SB(sb);
1081 struct inode *inode = sbi->s_buddy_cache;
1082 int blocks_per_page;
1083 int block;
1084 int pnum;
1085 struct page *page;
1086
1087 /* Set buddy page as not up to date */
1088 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1089 block = group * 2;
1090 pnum = block / blocks_per_page;
1091 page = find_get_page(inode->i_mapping, pnum);
1092 if (page != NULL) {
1093 ClearPageUptodate(page);
1094 page_cache_release(page);
1095 }
1096
1097 /* Set bitmap page as not up to date */
1098 block++;
1099 pnum = block / blocks_per_page;
1100 page = find_get_page(inode->i_mapping, pnum);
1101 if (page != NULL) {
1102 ClearPageUptodate(page);
1103 page_cache_release(page);
1104 }
1105
1106 /* Get the info on the last group */
1107 grp = ext4_get_group_info(sb, group);
1108
1109 /* Update free blocks in group info */
1110 ext4_mb_update_group_info(grp, add);
1111 }
1112
1063 if (test_opt(sb, DEBUG)) 1113 if (test_opt(sb, DEBUG))
1064 printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n", 1114 printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n",
1065 ext4_blocks_count(es)); 1115 ext4_blocks_count(es));
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 02bf24343979..1cb371dcd609 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -506,6 +506,7 @@ static void ext4_put_super (struct super_block * sb)
506 ext4_ext_release(sb); 506 ext4_ext_release(sb);
507 ext4_xattr_put_super(sb); 507 ext4_xattr_put_super(sb);
508 jbd2_journal_destroy(sbi->s_journal); 508 jbd2_journal_destroy(sbi->s_journal);
509 sbi->s_journal = NULL;
509 if (!(sb->s_flags & MS_RDONLY)) { 510 if (!(sb->s_flags & MS_RDONLY)) {
510 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 511 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
511 es->s_state = cpu_to_le16(sbi->s_mount_state); 512 es->s_state = cpu_to_le16(sbi->s_mount_state);
@@ -517,6 +518,7 @@ static void ext4_put_super (struct super_block * sb)
517 for (i = 0; i < sbi->s_gdb_count; i++) 518 for (i = 0; i < sbi->s_gdb_count; i++)
518 brelse(sbi->s_group_desc[i]); 519 brelse(sbi->s_group_desc[i]);
519 kfree(sbi->s_group_desc); 520 kfree(sbi->s_group_desc);
521 kfree(sbi->s_flex_groups);
520 percpu_counter_destroy(&sbi->s_freeblocks_counter); 522 percpu_counter_destroy(&sbi->s_freeblocks_counter);
521 percpu_counter_destroy(&sbi->s_freeinodes_counter); 523 percpu_counter_destroy(&sbi->s_freeinodes_counter);
522 percpu_counter_destroy(&sbi->s_dirs_counter); 524 percpu_counter_destroy(&sbi->s_dirs_counter);
@@ -571,6 +573,12 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
571 memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); 573 memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
572 INIT_LIST_HEAD(&ei->i_prealloc_list); 574 INIT_LIST_HEAD(&ei->i_prealloc_list);
573 spin_lock_init(&ei->i_prealloc_lock); 575 spin_lock_init(&ei->i_prealloc_lock);
576 jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode);
577 ei->i_reserved_data_blocks = 0;
578 ei->i_reserved_meta_blocks = 0;
579 ei->i_allocated_meta_blocks = 0;
580 ei->i_delalloc_reserved_flag = 0;
581 spin_lock_init(&(ei->i_block_reservation_lock));
574 return &ei->vfs_inode; 582 return &ei->vfs_inode;
575} 583}
576 584
@@ -635,6 +643,8 @@ static void ext4_clear_inode(struct inode *inode)
635 EXT4_I(inode)->i_block_alloc_info = NULL; 643 EXT4_I(inode)->i_block_alloc_info = NULL;
636 if (unlikely(rsv)) 644 if (unlikely(rsv))
637 kfree(rsv); 645 kfree(rsv);
646 jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal,
647 &EXT4_I(inode)->jinode);
638} 648}
639 649
640static inline void ext4_show_quota_options(struct seq_file *seq, struct super_block *sb) 650static inline void ext4_show_quota_options(struct seq_file *seq, struct super_block *sb)
@@ -671,7 +681,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
671 unsigned long def_mount_opts; 681 unsigned long def_mount_opts;
672 struct super_block *sb = vfs->mnt_sb; 682 struct super_block *sb = vfs->mnt_sb;
673 struct ext4_sb_info *sbi = EXT4_SB(sb); 683 struct ext4_sb_info *sbi = EXT4_SB(sb);
674 journal_t *journal = sbi->s_journal;
675 struct ext4_super_block *es = sbi->s_es; 684 struct ext4_super_block *es = sbi->s_es;
676 685
677 def_mount_opts = le32_to_cpu(es->s_default_mount_opts); 686 def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
@@ -747,6 +756,9 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
747 seq_puts(seq, ",nomballoc"); 756 seq_puts(seq, ",nomballoc");
748 if (test_opt(sb, I_VERSION)) 757 if (test_opt(sb, I_VERSION))
749 seq_puts(seq, ",i_version"); 758 seq_puts(seq, ",i_version");
759 if (!test_opt(sb, DELALLOC))
760 seq_puts(seq, ",nodelalloc");
761
750 762
751 if (sbi->s_stripe) 763 if (sbi->s_stripe)
752 seq_printf(seq, ",stripe=%lu", sbi->s_stripe); 764 seq_printf(seq, ",stripe=%lu", sbi->s_stripe);
@@ -894,7 +906,7 @@ enum {
894 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, 906 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
895 Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, 907 Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
896 Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version, 908 Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version,
897 Opt_mballoc, Opt_nomballoc, Opt_stripe, 909 Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc, Opt_nodelalloc,
898}; 910};
899 911
900static match_table_t tokens = { 912static match_table_t tokens = {
@@ -953,6 +965,8 @@ static match_table_t tokens = {
953 {Opt_nomballoc, "nomballoc"}, 965 {Opt_nomballoc, "nomballoc"},
954 {Opt_stripe, "stripe=%u"}, 966 {Opt_stripe, "stripe=%u"},
955 {Opt_resize, "resize"}, 967 {Opt_resize, "resize"},
968 {Opt_delalloc, "delalloc"},
969 {Opt_nodelalloc, "nodelalloc"},
956 {Opt_err, NULL}, 970 {Opt_err, NULL},
957}; 971};
958 972
@@ -990,6 +1004,7 @@ static int parse_options (char *options, struct super_block *sb,
990 int qtype, qfmt; 1004 int qtype, qfmt;
991 char *qname; 1005 char *qname;
992#endif 1006#endif
1007 ext4_fsblk_t last_block;
993 1008
994 if (!options) 1009 if (!options)
995 return 1; 1010 return 1;
@@ -1309,15 +1324,39 @@ set_qf_format:
1309 clear_opt(sbi->s_mount_opt, NOBH); 1324 clear_opt(sbi->s_mount_opt, NOBH);
1310 break; 1325 break;
1311 case Opt_extents: 1326 case Opt_extents:
1327 if (!EXT4_HAS_INCOMPAT_FEATURE(sb,
1328 EXT4_FEATURE_INCOMPAT_EXTENTS)) {
1329 ext4_warning(sb, __func__,
1330 "extents feature not enabled "
1331 "on this filesystem, use tune2fs\n");
1332 return 0;
1333 }
1312 set_opt (sbi->s_mount_opt, EXTENTS); 1334 set_opt (sbi->s_mount_opt, EXTENTS);
1313 break; 1335 break;
1314 case Opt_noextents: 1336 case Opt_noextents:
1337 /*
1338 * When e2fsprogs support resizing an already existing
1339 * ext3 file system to greater than 2**32 we need to
1340 * add support to block allocator to handle growing
1341 * already existing block mapped inode so that blocks
1342 * allocated for them fall within 2**32
1343 */
1344 last_block = ext4_blocks_count(sbi->s_es) - 1;
1345 if (last_block > 0xffffffffULL) {
1346 printk(KERN_ERR "EXT4-fs: Filesystem too "
1347 "large to mount with "
1348 "-o noextents options\n");
1349 return 0;
1350 }
1315 clear_opt (sbi->s_mount_opt, EXTENTS); 1351 clear_opt (sbi->s_mount_opt, EXTENTS);
1316 break; 1352 break;
1317 case Opt_i_version: 1353 case Opt_i_version:
1318 set_opt(sbi->s_mount_opt, I_VERSION); 1354 set_opt(sbi->s_mount_opt, I_VERSION);
1319 sb->s_flags |= MS_I_VERSION; 1355 sb->s_flags |= MS_I_VERSION;
1320 break; 1356 break;
1357 case Opt_nodelalloc:
1358 clear_opt(sbi->s_mount_opt, DELALLOC);
1359 break;
1321 case Opt_mballoc: 1360 case Opt_mballoc:
1322 set_opt(sbi->s_mount_opt, MBALLOC); 1361 set_opt(sbi->s_mount_opt, MBALLOC);
1323 break; 1362 break;
@@ -1331,6 +1370,9 @@ set_qf_format:
1331 return 0; 1370 return 0;
1332 sbi->s_stripe = option; 1371 sbi->s_stripe = option;
1333 break; 1372 break;
1373 case Opt_delalloc:
1374 set_opt(sbi->s_mount_opt, DELALLOC);
1375 break;
1334 default: 1376 default:
1335 printk (KERN_ERR 1377 printk (KERN_ERR
1336 "EXT4-fs: Unrecognized mount option \"%s\" " 1378 "EXT4-fs: Unrecognized mount option \"%s\" "
@@ -1443,6 +1485,54 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
1443 return res; 1485 return res;
1444} 1486}
1445 1487
1488static int ext4_fill_flex_info(struct super_block *sb)
1489{
1490 struct ext4_sb_info *sbi = EXT4_SB(sb);
1491 struct ext4_group_desc *gdp = NULL;
1492 struct buffer_head *bh;
1493 ext4_group_t flex_group_count;
1494 ext4_group_t flex_group;
1495 int groups_per_flex = 0;
1496 __u64 block_bitmap = 0;
1497 int i;
1498
1499 if (!sbi->s_es->s_log_groups_per_flex) {
1500 sbi->s_log_groups_per_flex = 0;
1501 return 1;
1502 }
1503
1504 sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
1505 groups_per_flex = 1 << sbi->s_log_groups_per_flex;
1506
1507 flex_group_count = (sbi->s_groups_count + groups_per_flex - 1) /
1508 groups_per_flex;
1509 sbi->s_flex_groups = kmalloc(flex_group_count *
1510 sizeof(struct flex_groups), GFP_KERNEL);
1511 if (sbi->s_flex_groups == NULL) {
1512 printk(KERN_ERR "EXT4-fs: not enough memory\n");
1513 goto failed;
1514 }
1515 memset(sbi->s_flex_groups, 0, flex_group_count *
1516 sizeof(struct flex_groups));
1517
1518 gdp = ext4_get_group_desc(sb, 1, &bh);
1519 block_bitmap = ext4_block_bitmap(sb, gdp) - 1;
1520
1521 for (i = 0; i < sbi->s_groups_count; i++) {
1522 gdp = ext4_get_group_desc(sb, i, &bh);
1523
1524 flex_group = ext4_flex_group(sbi, i);
1525 sbi->s_flex_groups[flex_group].free_inodes +=
1526 le16_to_cpu(gdp->bg_free_inodes_count);
1527 sbi->s_flex_groups[flex_group].free_blocks +=
1528 le16_to_cpu(gdp->bg_free_blocks_count);
1529 }
1530
1531 return 1;
1532failed:
1533 return 0;
1534}
1535
1446__le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group, 1536__le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group,
1447 struct ext4_group_desc *gdp) 1537 struct ext4_group_desc *gdp)
1448{ 1538{
@@ -1810,8 +1900,8 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
1810} 1900}
1811 1901
1812static int ext4_fill_super (struct super_block *sb, void *data, int silent) 1902static int ext4_fill_super (struct super_block *sb, void *data, int silent)
1813 __releases(kernel_sem) 1903 __releases(kernel_lock)
1814 __acquires(kernel_sem) 1904 __acquires(kernel_lock)
1815 1905
1816{ 1906{
1817 struct buffer_head * bh; 1907 struct buffer_head * bh;
@@ -1851,11 +1941,6 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
1851 goto out_fail; 1941 goto out_fail;
1852 } 1942 }
1853 1943
1854 if (!sb_set_blocksize(sb, blocksize)) {
1855 printk(KERN_ERR "EXT4-fs: bad blocksize %d.\n", blocksize);
1856 goto out_fail;
1857 }
1858
1859 /* 1944 /*
1860 * The ext4 superblock will not be buffer aligned for other than 1kB 1945 * The ext4 superblock will not be buffer aligned for other than 1kB
1861 * block sizes. We need to calculate the offset from buffer start. 1946 * block sizes. We need to calculate the offset from buffer start.
@@ -1919,15 +2004,28 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
1919 2004
1920 /* 2005 /*
1921 * turn on extents feature by default in ext4 filesystem 2006 * turn on extents feature by default in ext4 filesystem
1922 * User -o noextents to turn it off 2007 * only if feature flag already set by mkfs or tune2fs.
2008 * Use -o noextents to turn it off
1923 */ 2009 */
1924 set_opt(sbi->s_mount_opt, EXTENTS); 2010 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
2011 set_opt(sbi->s_mount_opt, EXTENTS);
2012 else
2013 ext4_warning(sb, __func__,
2014 "extents feature not enabled on this filesystem, "
2015 "use tune2fs.\n");
1925 /* 2016 /*
1926 * turn on mballoc feature by default in ext4 filesystem 2017 * turn on mballoc code by default in ext4 filesystem
1927 * User -o nomballoc to turn it off 2018 * Use -o nomballoc to turn it off
1928 */ 2019 */
1929 set_opt(sbi->s_mount_opt, MBALLOC); 2020 set_opt(sbi->s_mount_opt, MBALLOC);
1930 2021
2022 /*
2023 * enable delayed allocation by default
2024 * Use -o nodelalloc to turn it off
2025 */
2026 set_opt(sbi->s_mount_opt, DELALLOC);
2027
2028
1931 if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum, 2029 if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum,
1932 NULL, 0)) 2030 NULL, 0))
1933 goto failed_mount; 2031 goto failed_mount;
@@ -2138,6 +2236,14 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
2138 printk(KERN_ERR "EXT4-fs: group descriptors corrupted!\n"); 2236 printk(KERN_ERR "EXT4-fs: group descriptors corrupted!\n");
2139 goto failed_mount2; 2237 goto failed_mount2;
2140 } 2238 }
2239 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
2240 if (!ext4_fill_flex_info(sb)) {
2241 printk(KERN_ERR
2242 "EXT4-fs: unable to initialize "
2243 "flex_bg meta info!\n");
2244 goto failed_mount2;
2245 }
2246
2141 sbi->s_gdb_count = db_count; 2247 sbi->s_gdb_count = db_count;
2142 get_random_bytes(&sbi->s_next_generation, sizeof(u32)); 2248 get_random_bytes(&sbi->s_next_generation, sizeof(u32));
2143 spin_lock_init(&sbi->s_next_gen_lock); 2249 spin_lock_init(&sbi->s_next_gen_lock);
@@ -2358,6 +2464,13 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
2358 test_opt(sb,DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered": 2464 test_opt(sb,DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered":
2359 "writeback"); 2465 "writeback");
2360 2466
2467 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
2468 printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - "
2469 "requested data journaling mode\n");
2470 clear_opt(sbi->s_mount_opt, DELALLOC);
2471 } else if (test_opt(sb, DELALLOC))
2472 printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n");
2473
2361 ext4_ext_init(sb); 2474 ext4_ext_init(sb);
2362 ext4_mb_init(sb, needs_recovery); 2475 ext4_mb_init(sb, needs_recovery);
2363 2476
@@ -2372,6 +2485,7 @@ cantfind_ext4:
2372 2485
2373failed_mount4: 2486failed_mount4:
2374 jbd2_journal_destroy(sbi->s_journal); 2487 jbd2_journal_destroy(sbi->s_journal);
2488 sbi->s_journal = NULL;
2375failed_mount3: 2489failed_mount3:
2376 percpu_counter_destroy(&sbi->s_freeblocks_counter); 2490 percpu_counter_destroy(&sbi->s_freeblocks_counter);
2377 percpu_counter_destroy(&sbi->s_freeinodes_counter); 2491 percpu_counter_destroy(&sbi->s_freeinodes_counter);
@@ -3325,7 +3439,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
3325 err = ext4_journal_dirty_metadata(handle, bh); 3439 err = ext4_journal_dirty_metadata(handle, bh);
3326 else { 3440 else {
3327 /* Always do at least ordered writes for quotas */ 3441 /* Always do at least ordered writes for quotas */
3328 err = ext4_journal_dirty_data(handle, bh); 3442 err = ext4_jbd2_file_inode(handle, inode);
3329 mark_buffer_dirty(bh); 3443 mark_buffer_dirty(bh);
3330 } 3444 }
3331 brelse(bh); 3445 brelse(bh);
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index ff08633f398e..93c5fdcdad2e 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -810,7 +810,7 @@ inserted:
810 /* We need to allocate a new block */ 810 /* We need to allocate a new block */
811 ext4_fsblk_t goal = ext4_group_first_block_no(sb, 811 ext4_fsblk_t goal = ext4_group_first_block_no(sb,
812 EXT4_I(inode)->i_block_group); 812 EXT4_I(inode)->i_block_group);
813 ext4_fsblk_t block = ext4_new_block(handle, inode, 813 ext4_fsblk_t block = ext4_new_meta_block(handle, inode,
814 goal, &error); 814 goal, &error);
815 if (error) 815 if (error)
816 goto cleanup; 816 goto cleanup;
diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c
index fff33382cadc..ac1a52cf2a37 100644
--- a/fs/ext4/xattr_trusted.c
+++ b/fs/ext4/xattr_trusted.c
@@ -13,13 +13,11 @@
13#include "ext4.h" 13#include "ext4.h"
14#include "xattr.h" 14#include "xattr.h"
15 15
16#define XATTR_TRUSTED_PREFIX "trusted."
17
18static size_t 16static size_t
19ext4_xattr_trusted_list(struct inode *inode, char *list, size_t list_size, 17ext4_xattr_trusted_list(struct inode *inode, char *list, size_t list_size,
20 const char *name, size_t name_len) 18 const char *name, size_t name_len)
21{ 19{
22 const size_t prefix_len = sizeof(XATTR_TRUSTED_PREFIX)-1; 20 const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
23 const size_t total_len = prefix_len + name_len + 1; 21 const size_t total_len = prefix_len + name_len + 1;
24 22
25 if (!capable(CAP_SYS_ADMIN)) 23 if (!capable(CAP_SYS_ADMIN))
diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c
index 67be723fcc4e..d91aa61b42aa 100644
--- a/fs/ext4/xattr_user.c
+++ b/fs/ext4/xattr_user.c
@@ -12,13 +12,11 @@
12#include "ext4.h" 12#include "ext4.h"
13#include "xattr.h" 13#include "xattr.h"
14 14
15#define XATTR_USER_PREFIX "user."
16
17static size_t 15static size_t
18ext4_xattr_user_list(struct inode *inode, char *list, size_t list_size, 16ext4_xattr_user_list(struct inode *inode, char *list, size_t list_size,
19 const char *name, size_t name_len) 17 const char *name, size_t name_len)
20{ 18{
21 const size_t prefix_len = sizeof(XATTR_USER_PREFIX)-1; 19 const size_t prefix_len = XATTR_USER_PREFIX_LEN;
22 const size_t total_len = prefix_len + name_len + 1; 20 const size_t total_len = prefix_len + name_len + 1;
23 21
24 if (!test_opt(inode->i_sb, XATTR_USER)) 22 if (!test_opt(inode->i_sb, XATTR_USER))
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index fda25479af26..3a9ecac8d61f 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -61,7 +61,7 @@ void fat_cache_destroy(void)
61 61
62static inline struct fat_cache *fat_cache_alloc(struct inode *inode) 62static inline struct fat_cache *fat_cache_alloc(struct inode *inode)
63{ 63{
64 return kmem_cache_alloc(fat_cache_cachep, GFP_KERNEL); 64 return kmem_cache_alloc(fat_cache_cachep, GFP_NOFS);
65} 65}
66 66
67static inline void fat_cache_free(struct fat_cache *cache) 67static inline void fat_cache_free(struct fat_cache *cache)
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 486725ee99ae..34541d06e626 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -472,7 +472,7 @@ static int __fat_readdir(struct inode *inode, struct file *filp, void *dirent,
472 loff_t cpos; 472 loff_t cpos;
473 int ret = 0; 473 int ret = 0;
474 474
475 lock_kernel(); 475 lock_super(sb);
476 476
477 cpos = filp->f_pos; 477 cpos = filp->f_pos;
478 /* Fake . and .. for the root directory. */ 478 /* Fake . and .. for the root directory. */
@@ -654,7 +654,7 @@ FillFailed:
654 if (unicode) 654 if (unicode)
655 __putname(unicode); 655 __putname(unicode);
656out: 656out:
657 unlock_kernel(); 657 unlock_super(sb);
658 return ret; 658 return ret;
659} 659}
660 660
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 771326b8047e..c672df4036e9 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -11,7 +11,6 @@
11#include <linux/mount.h> 11#include <linux/mount.h>
12#include <linux/time.h> 12#include <linux/time.h>
13#include <linux/msdos_fs.h> 13#include <linux/msdos_fs.h>
14#include <linux/smp_lock.h>
15#include <linux/buffer_head.h> 14#include <linux/buffer_head.h>
16#include <linux/writeback.h> 15#include <linux/writeback.h>
17#include <linux/backing-dev.h> 16#include <linux/backing-dev.h>
@@ -242,9 +241,7 @@ void fat_truncate(struct inode *inode)
242 241
243 nr_clusters = (inode->i_size + (cluster_size - 1)) >> sbi->cluster_bits; 242 nr_clusters = (inode->i_size + (cluster_size - 1)) >> sbi->cluster_bits;
244 243
245 lock_kernel();
246 fat_free(inode, nr_clusters); 244 fat_free(inode, nr_clusters);
247 unlock_kernel();
248 fat_flush_inodes(inode->i_sb, inode, NULL); 245 fat_flush_inodes(inode->i_sb, inode, NULL);
249} 246}
250 247
@@ -310,8 +307,6 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
310 int error = 0; 307 int error = 0;
311 unsigned int ia_valid; 308 unsigned int ia_valid;
312 309
313 lock_kernel();
314
315 /* 310 /*
316 * Expand the file. Since inode_setattr() updates ->i_size 311 * Expand the file. Since inode_setattr() updates ->i_size
317 * before calling the ->truncate(), but FAT needs to fill the 312 * before calling the ->truncate(), but FAT needs to fill the
@@ -366,7 +361,6 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
366 361
367 error = inode_setattr(inode, attr); 362 error = inode_setattr(inode, attr);
368out: 363out:
369 unlock_kernel();
370 return error; 364 return error;
371} 365}
372EXPORT_SYMBOL_GPL(fat_setattr); 366EXPORT_SYMBOL_GPL(fat_setattr);
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 4e0a3dd9d677..46a4508ffd2e 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -440,14 +440,13 @@ static void fat_delete_inode(struct inode *inode)
440 440
441static void fat_clear_inode(struct inode *inode) 441static void fat_clear_inode(struct inode *inode)
442{ 442{
443 struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); 443 struct super_block *sb = inode->i_sb;
444 struct msdos_sb_info *sbi = MSDOS_SB(sb);
444 445
445 lock_kernel();
446 spin_lock(&sbi->inode_hash_lock); 446 spin_lock(&sbi->inode_hash_lock);
447 fat_cache_inval_inode(inode); 447 fat_cache_inval_inode(inode);
448 hlist_del_init(&MSDOS_I(inode)->i_fat_hash); 448 hlist_del_init(&MSDOS_I(inode)->i_fat_hash);
449 spin_unlock(&sbi->inode_hash_lock); 449 spin_unlock(&sbi->inode_hash_lock);
450 unlock_kernel();
451} 450}
452 451
453static void fat_write_super(struct super_block *sb) 452static void fat_write_super(struct super_block *sb)
@@ -485,7 +484,7 @@ static struct kmem_cache *fat_inode_cachep;
485static struct inode *fat_alloc_inode(struct super_block *sb) 484static struct inode *fat_alloc_inode(struct super_block *sb)
486{ 485{
487 struct msdos_inode_info *ei; 486 struct msdos_inode_info *ei;
488 ei = kmem_cache_alloc(fat_inode_cachep, GFP_KERNEL); 487 ei = kmem_cache_alloc(fat_inode_cachep, GFP_NOFS);
489 if (!ei) 488 if (!ei)
490 return NULL; 489 return NULL;
491 return &ei->vfs_inode; 490 return &ei->vfs_inode;
@@ -567,7 +566,7 @@ retry:
567 if (inode->i_ino == MSDOS_ROOT_INO || !i_pos) 566 if (inode->i_ino == MSDOS_ROOT_INO || !i_pos)
568 return 0; 567 return 0;
569 568
570 lock_kernel(); 569 lock_super(sb);
571 bh = sb_bread(sb, i_pos >> sbi->dir_per_block_bits); 570 bh = sb_bread(sb, i_pos >> sbi->dir_per_block_bits);
572 if (!bh) { 571 if (!bh) {
573 printk(KERN_ERR "FAT: unable to read inode block " 572 printk(KERN_ERR "FAT: unable to read inode block "
@@ -579,7 +578,7 @@ retry:
579 if (i_pos != MSDOS_I(inode)->i_pos) { 578 if (i_pos != MSDOS_I(inode)->i_pos) {
580 spin_unlock(&sbi->inode_hash_lock); 579 spin_unlock(&sbi->inode_hash_lock);
581 brelse(bh); 580 brelse(bh);
582 unlock_kernel(); 581 unlock_super(sb);
583 goto retry; 582 goto retry;
584 } 583 }
585 584
@@ -606,7 +605,7 @@ retry:
606 err = sync_dirty_buffer(bh); 605 err = sync_dirty_buffer(bh);
607 brelse(bh); 606 brelse(bh);
608out: 607out:
609 unlock_kernel(); 608 unlock_super(sb);
610 return err; 609 return err;
611} 610}
612 611
@@ -736,6 +735,7 @@ fat_encode_fh(struct dentry *de, __u32 *fh, int *lenp, int connectable)
736 735
737static struct dentry *fat_get_parent(struct dentry *child) 736static struct dentry *fat_get_parent(struct dentry *child)
738{ 737{
738 struct super_block *sb = child->d_sb;
739 struct buffer_head *bh; 739 struct buffer_head *bh;
740 struct msdos_dir_entry *de; 740 struct msdos_dir_entry *de;
741 loff_t i_pos; 741 loff_t i_pos;
@@ -743,14 +743,14 @@ static struct dentry *fat_get_parent(struct dentry *child)
743 struct inode *inode; 743 struct inode *inode;
744 int err; 744 int err;
745 745
746 lock_kernel(); 746 lock_super(sb);
747 747
748 err = fat_get_dotdot_entry(child->d_inode, &bh, &de, &i_pos); 748 err = fat_get_dotdot_entry(child->d_inode, &bh, &de, &i_pos);
749 if (err) { 749 if (err) {
750 parent = ERR_PTR(err); 750 parent = ERR_PTR(err);
751 goto out; 751 goto out;
752 } 752 }
753 inode = fat_build_inode(child->d_sb, de, i_pos); 753 inode = fat_build_inode(sb, de, i_pos);
754 brelse(bh); 754 brelse(bh);
755 if (IS_ERR(inode)) { 755 if (IS_ERR(inode)) {
756 parent = ERR_CAST(inode); 756 parent = ERR_CAST(inode);
@@ -762,7 +762,7 @@ static struct dentry *fat_get_parent(struct dentry *child)
762 parent = ERR_PTR(-ENOMEM); 762 parent = ERR_PTR(-ENOMEM);
763 } 763 }
764out: 764out:
765 unlock_kernel(); 765 unlock_super(sb);
766 766
767 return parent; 767 return parent;
768} 768}
@@ -1172,6 +1172,12 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
1172 long error; 1172 long error;
1173 char buf[50]; 1173 char buf[50];
1174 1174
1175 /*
1176 * GFP_KERNEL is ok here, because while we do hold the
1177 * supeblock lock, memory pressure can't call back into
1178 * the filesystem, since we're only just about to mount
1179 * it and have no inodes etc active!
1180 */
1175 sbi = kzalloc(sizeof(struct msdos_sb_info), GFP_KERNEL); 1181 sbi = kzalloc(sizeof(struct msdos_sb_info), GFP_KERNEL);
1176 if (!sbi) 1182 if (!sbi)
1177 return -ENOMEM; 1183 return -ENOMEM;
diff --git a/fs/fcntl.c b/fs/fcntl.c
index bfd776509a72..330a7d782591 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -12,7 +12,6 @@
12#include <linux/fdtable.h> 12#include <linux/fdtable.h>
13#include <linux/capability.h> 13#include <linux/capability.h>
14#include <linux/dnotify.h> 14#include <linux/dnotify.h>
15#include <linux/smp_lock.h>
16#include <linux/slab.h> 15#include <linux/slab.h>
17#include <linux/module.h> 16#include <linux/module.h>
18#include <linux/security.h> 17#include <linux/security.h>
@@ -227,7 +226,6 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
227 if (error) 226 if (error)
228 return error; 227 return error;
229 228
230 lock_kernel();
231 if ((arg ^ filp->f_flags) & FASYNC) { 229 if ((arg ^ filp->f_flags) & FASYNC) {
232 if (filp->f_op && filp->f_op->fasync) { 230 if (filp->f_op && filp->f_op->fasync) {
233 error = filp->f_op->fasync(fd, filp, (arg & FASYNC) != 0); 231 error = filp->f_op->fasync(fd, filp, (arg & FASYNC) != 0);
@@ -238,7 +236,6 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
238 236
239 filp->f_flags = (arg & SETFL_MASK) | (filp->f_flags & ~SETFL_MASK); 237 filp->f_flags = (arg & SETFL_MASK) | (filp->f_flags & ~SETFL_MASK);
240 out: 238 out:
241 unlock_kernel();
242 return error; 239 return error;
243} 240}
244 241
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index 7f7947e3dfbb..ab2f57e3fb87 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -14,23 +14,11 @@ config GFS2_FS
14 GFS is perfect consistency -- changes made to the filesystem on one 14 GFS is perfect consistency -- changes made to the filesystem on one
15 machine show up immediately on all other machines in the cluster. 15 machine show up immediately on all other machines in the cluster.
16 16
17 To use the GFS2 filesystem, you will need to enable one or more of 17 To use the GFS2 filesystem in a cluster, you will need to enable
18 the below locking modules. Documentation and utilities for GFS2 can 18 the locking module below. Documentation and utilities for GFS2 can
19 be found here: http://sources.redhat.com/cluster 19 be found here: http://sources.redhat.com/cluster
20 20
21config GFS2_FS_LOCKING_NOLOCK 21 The "nolock" lock module is now built in to GFS2 by default.
22 tristate "GFS2 \"nolock\" locking module"
23 depends on GFS2_FS
24 help
25 Single node locking module for GFS2.
26
27 Use this module if you want to use GFS2 on a single node without
28 its clustering features. You can still take advantage of the
29 large file support, and upgrade to running a full cluster later on
30 if required.
31
32 If you will only be using GFS2 in cluster mode, you do not need this
33 module.
34 22
35config GFS2_FS_LOCKING_DLM 23config GFS2_FS_LOCKING_DLM
36 tristate "GFS2 DLM locking module" 24 tristate "GFS2 DLM locking module"
diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
index e2350df02a07..ec65851ec80a 100644
--- a/fs/gfs2/Makefile
+++ b/fs/gfs2/Makefile
@@ -5,6 +5,5 @@ gfs2-y := acl.o bmap.o daemon.o dir.o eaops.o eattr.o glock.o \
5 ops_fstype.o ops_inode.o ops_super.o quota.o \ 5 ops_fstype.o ops_inode.o ops_super.o quota.o \
6 recovery.o rgrp.o super.o sys.o trans.o util.o 6 recovery.o rgrp.o super.o sys.o trans.o util.o
7 7
8obj-$(CONFIG_GFS2_FS_LOCKING_NOLOCK) += locking/nolock/
9obj-$(CONFIG_GFS2_FS_LOCKING_DLM) += locking/dlm/ 8obj-$(CONFIG_GFS2_FS_LOCKING_DLM) += locking/dlm/
10 9
diff --git a/fs/gfs2/gfs2.h b/fs/gfs2/gfs2.h
index 3bb11c0f8b56..ef606e3a5cf4 100644
--- a/fs/gfs2/gfs2.h
+++ b/fs/gfs2/gfs2.h
@@ -16,11 +16,6 @@ enum {
16}; 16};
17 17
18enum { 18enum {
19 NO_WAIT = 0,
20 WAIT = 1,
21};
22
23enum {
24 NO_FORCE = 0, 19 NO_FORCE = 0,
25 FORCE = 1, 20 FORCE = 1,
26}; 21};
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index d636b3e80f5d..13391e546616 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -45,21 +45,19 @@ struct gfs2_gl_hash_bucket {
45 struct hlist_head hb_list; 45 struct hlist_head hb_list;
46}; 46};
47 47
48struct glock_iter { 48struct gfs2_glock_iter {
49 int hash; /* hash bucket index */ 49 int hash; /* hash bucket index */
50 struct gfs2_sbd *sdp; /* incore superblock */ 50 struct gfs2_sbd *sdp; /* incore superblock */
51 struct gfs2_glock *gl; /* current glock struct */ 51 struct gfs2_glock *gl; /* current glock struct */
52 struct seq_file *seq; /* sequence file for debugfs */ 52 char string[512]; /* scratch space */
53 char string[512]; /* scratch space */
54}; 53};
55 54
56typedef void (*glock_examiner) (struct gfs2_glock * gl); 55typedef void (*glock_examiner) (struct gfs2_glock * gl);
57 56
58static int gfs2_dump_lockstate(struct gfs2_sbd *sdp); 57static int gfs2_dump_lockstate(struct gfs2_sbd *sdp);
59static int dump_glock(struct glock_iter *gi, struct gfs2_glock *gl); 58static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl);
60static void gfs2_glock_xmote_th(struct gfs2_glock *gl, struct gfs2_holder *gh); 59#define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { __dump_glock(NULL, gl); BUG(); } } while(0)
61static void gfs2_glock_drop_th(struct gfs2_glock *gl); 60static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target);
62static void run_queue(struct gfs2_glock *gl);
63 61
64static DECLARE_RWSEM(gfs2_umount_flush_sem); 62static DECLARE_RWSEM(gfs2_umount_flush_sem);
65static struct dentry *gfs2_root; 63static struct dentry *gfs2_root;
@@ -123,33 +121,6 @@ static inline rwlock_t *gl_lock_addr(unsigned int x)
123#endif 121#endif
124 122
125/** 123/**
126 * relaxed_state_ok - is a requested lock compatible with the current lock mode?
127 * @actual: the current state of the lock
128 * @requested: the lock state that was requested by the caller
129 * @flags: the modifier flags passed in by the caller
130 *
131 * Returns: 1 if the locks are compatible, 0 otherwise
132 */
133
134static inline int relaxed_state_ok(unsigned int actual, unsigned requested,
135 int flags)
136{
137 if (actual == requested)
138 return 1;
139
140 if (flags & GL_EXACT)
141 return 0;
142
143 if (actual == LM_ST_EXCLUSIVE && requested == LM_ST_SHARED)
144 return 1;
145
146 if (actual != LM_ST_UNLOCKED && (flags & LM_FLAG_ANY))
147 return 1;
148
149 return 0;
150}
151
152/**
153 * gl_hash() - Turn glock number into hash bucket number 124 * gl_hash() - Turn glock number into hash bucket number
154 * @lock: The glock number 125 * @lock: The glock number
155 * 126 *
@@ -182,7 +153,7 @@ static void glock_free(struct gfs2_glock *gl)
182 struct gfs2_sbd *sdp = gl->gl_sbd; 153 struct gfs2_sbd *sdp = gl->gl_sbd;
183 struct inode *aspace = gl->gl_aspace; 154 struct inode *aspace = gl->gl_aspace;
184 155
185 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) 156 if (sdp->sd_lockstruct.ls_ops->lm_put_lock)
186 sdp->sd_lockstruct.ls_ops->lm_put_lock(gl->gl_lock); 157 sdp->sd_lockstruct.ls_ops->lm_put_lock(gl->gl_lock);
187 158
188 if (aspace) 159 if (aspace)
@@ -211,17 +182,14 @@ static void gfs2_glock_hold(struct gfs2_glock *gl)
211int gfs2_glock_put(struct gfs2_glock *gl) 182int gfs2_glock_put(struct gfs2_glock *gl)
212{ 183{
213 int rv = 0; 184 int rv = 0;
214 struct gfs2_sbd *sdp = gl->gl_sbd;
215 185
216 write_lock(gl_lock_addr(gl->gl_hash)); 186 write_lock(gl_lock_addr(gl->gl_hash));
217 if (atomic_dec_and_test(&gl->gl_ref)) { 187 if (atomic_dec_and_test(&gl->gl_ref)) {
218 hlist_del(&gl->gl_list); 188 hlist_del(&gl->gl_list);
219 write_unlock(gl_lock_addr(gl->gl_hash)); 189 write_unlock(gl_lock_addr(gl->gl_hash));
220 gfs2_assert(sdp, gl->gl_state == LM_ST_UNLOCKED); 190 GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_UNLOCKED);
221 gfs2_assert(sdp, list_empty(&gl->gl_reclaim)); 191 GLOCK_BUG_ON(gl, !list_empty(&gl->gl_reclaim));
222 gfs2_assert(sdp, list_empty(&gl->gl_holders)); 192 GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
223 gfs2_assert(sdp, list_empty(&gl->gl_waiters1));
224 gfs2_assert(sdp, list_empty(&gl->gl_waiters3));
225 glock_free(gl); 193 glock_free(gl);
226 rv = 1; 194 rv = 1;
227 goto out; 195 goto out;
@@ -281,22 +249,401 @@ static struct gfs2_glock *gfs2_glock_find(const struct gfs2_sbd *sdp,
281 return gl; 249 return gl;
282} 250}
283 251
252/**
253 * may_grant - check if its ok to grant a new lock
254 * @gl: The glock
255 * @gh: The lock request which we wish to grant
256 *
257 * Returns: true if its ok to grant the lock
258 */
259
260static inline int may_grant(const struct gfs2_glock *gl, const struct gfs2_holder *gh)
261{
262 const struct gfs2_holder *gh_head = list_entry(gl->gl_holders.next, const struct gfs2_holder, gh_list);
263 if ((gh->gh_state == LM_ST_EXCLUSIVE ||
264 gh_head->gh_state == LM_ST_EXCLUSIVE) && gh != gh_head)
265 return 0;
266 if (gl->gl_state == gh->gh_state)
267 return 1;
268 if (gh->gh_flags & GL_EXACT)
269 return 0;
270 if (gl->gl_state == LM_ST_EXCLUSIVE) {
271 if (gh->gh_state == LM_ST_SHARED && gh_head->gh_state == LM_ST_SHARED)
272 return 1;
273 if (gh->gh_state == LM_ST_DEFERRED && gh_head->gh_state == LM_ST_DEFERRED)
274 return 1;
275 }
276 if (gl->gl_state != LM_ST_UNLOCKED && (gh->gh_flags & LM_FLAG_ANY))
277 return 1;
278 return 0;
279}
280
281static void gfs2_holder_wake(struct gfs2_holder *gh)
282{
283 clear_bit(HIF_WAIT, &gh->gh_iflags);
284 smp_mb__after_clear_bit();
285 wake_up_bit(&gh->gh_iflags, HIF_WAIT);
286}
287
288/**
289 * do_promote - promote as many requests as possible on the current queue
290 * @gl: The glock
291 *
292 * Returns: true if there is a blocked holder at the head of the list
293 */
294
295static int do_promote(struct gfs2_glock *gl)
296{
297 const struct gfs2_glock_operations *glops = gl->gl_ops;
298 struct gfs2_holder *gh, *tmp;
299 int ret;
300
301restart:
302 list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
303 if (test_bit(HIF_HOLDER, &gh->gh_iflags))
304 continue;
305 if (may_grant(gl, gh)) {
306 if (gh->gh_list.prev == &gl->gl_holders &&
307 glops->go_lock) {
308 spin_unlock(&gl->gl_spin);
309 /* FIXME: eliminate this eventually */
310 ret = glops->go_lock(gh);
311 spin_lock(&gl->gl_spin);
312 if (ret) {
313 gh->gh_error = ret;
314 list_del_init(&gh->gh_list);
315 gfs2_holder_wake(gh);
316 goto restart;
317 }
318 set_bit(HIF_HOLDER, &gh->gh_iflags);
319 gfs2_holder_wake(gh);
320 goto restart;
321 }
322 set_bit(HIF_HOLDER, &gh->gh_iflags);
323 gfs2_holder_wake(gh);
324 continue;
325 }
326 if (gh->gh_list.prev == &gl->gl_holders)
327 return 1;
328 break;
329 }
330 return 0;
331}
332
333/**
334 * do_error - Something unexpected has happened during a lock request
335 *
336 */
337
338static inline void do_error(struct gfs2_glock *gl, const int ret)
339{
340 struct gfs2_holder *gh, *tmp;
341
342 list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
343 if (test_bit(HIF_HOLDER, &gh->gh_iflags))
344 continue;
345 if (ret & LM_OUT_ERROR)
346 gh->gh_error = -EIO;
347 else if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))
348 gh->gh_error = GLR_TRYFAILED;
349 else
350 continue;
351 list_del_init(&gh->gh_list);
352 gfs2_holder_wake(gh);
353 }
354}
355
356/**
357 * find_first_waiter - find the first gh that's waiting for the glock
358 * @gl: the glock
359 */
360
361static inline struct gfs2_holder *find_first_waiter(const struct gfs2_glock *gl)
362{
363 struct gfs2_holder *gh;
364
365 list_for_each_entry(gh, &gl->gl_holders, gh_list) {
366 if (!test_bit(HIF_HOLDER, &gh->gh_iflags))
367 return gh;
368 }
369 return NULL;
370}
371
372/**
373 * state_change - record that the glock is now in a different state
374 * @gl: the glock
375 * @new_state the new state
376 *
377 */
378
379static void state_change(struct gfs2_glock *gl, unsigned int new_state)
380{
381 int held1, held2;
382
383 held1 = (gl->gl_state != LM_ST_UNLOCKED);
384 held2 = (new_state != LM_ST_UNLOCKED);
385
386 if (held1 != held2) {
387 if (held2)
388 gfs2_glock_hold(gl);
389 else
390 gfs2_glock_put(gl);
391 }
392
393 gl->gl_state = new_state;
394 gl->gl_tchange = jiffies;
395}
396
397static void gfs2_demote_wake(struct gfs2_glock *gl)
398{
399 gl->gl_demote_state = LM_ST_EXCLUSIVE;
400 clear_bit(GLF_DEMOTE, &gl->gl_flags);
401 smp_mb__after_clear_bit();
402 wake_up_bit(&gl->gl_flags, GLF_DEMOTE);
403}
404
405/**
406 * finish_xmote - The DLM has replied to one of our lock requests
407 * @gl: The glock
408 * @ret: The status from the DLM
409 *
410 */
411
412static void finish_xmote(struct gfs2_glock *gl, unsigned int ret)
413{
414 const struct gfs2_glock_operations *glops = gl->gl_ops;
415 struct gfs2_holder *gh;
416 unsigned state = ret & LM_OUT_ST_MASK;
417
418 spin_lock(&gl->gl_spin);
419 state_change(gl, state);
420 gh = find_first_waiter(gl);
421
422 /* Demote to UN request arrived during demote to SH or DF */
423 if (test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags) &&
424 state != LM_ST_UNLOCKED && gl->gl_demote_state == LM_ST_UNLOCKED)
425 gl->gl_target = LM_ST_UNLOCKED;
426
427 /* Check for state != intended state */
428 if (unlikely(state != gl->gl_target)) {
429 if (gh && !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)) {
430 /* move to back of queue and try next entry */
431 if (ret & LM_OUT_CANCELED) {
432 if ((gh->gh_flags & LM_FLAG_PRIORITY) == 0)
433 list_move_tail(&gh->gh_list, &gl->gl_holders);
434 gh = find_first_waiter(gl);
435 gl->gl_target = gh->gh_state;
436 goto retry;
437 }
438 /* Some error or failed "try lock" - report it */
439 if ((ret & LM_OUT_ERROR) ||
440 (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) {
441 gl->gl_target = gl->gl_state;
442 do_error(gl, ret);
443 goto out;
444 }
445 }
446 switch(state) {
447 /* Unlocked due to conversion deadlock, try again */
448 case LM_ST_UNLOCKED:
449retry:
450 do_xmote(gl, gh, gl->gl_target);
451 break;
452 /* Conversion fails, unlock and try again */
453 case LM_ST_SHARED:
454 case LM_ST_DEFERRED:
455 do_xmote(gl, gh, LM_ST_UNLOCKED);
456 break;
457 default: /* Everything else */
458 printk(KERN_ERR "GFS2: wanted %u got %u\n", gl->gl_target, state);
459 GLOCK_BUG_ON(gl, 1);
460 }
461 spin_unlock(&gl->gl_spin);
462 gfs2_glock_put(gl);
463 return;
464 }
465
466 /* Fast path - we got what we asked for */
467 if (test_and_clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags))
468 gfs2_demote_wake(gl);
469 if (state != LM_ST_UNLOCKED) {
470 if (glops->go_xmote_bh) {
471 int rv;
472 spin_unlock(&gl->gl_spin);
473 rv = glops->go_xmote_bh(gl, gh);
474 if (rv == -EAGAIN)
475 return;
476 spin_lock(&gl->gl_spin);
477 if (rv) {
478 do_error(gl, rv);
479 goto out;
480 }
481 }
482 do_promote(gl);
483 }
484out:
485 clear_bit(GLF_LOCK, &gl->gl_flags);
486 spin_unlock(&gl->gl_spin);
487 gfs2_glock_put(gl);
488}
489
490static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
491 unsigned int cur_state, unsigned int req_state,
492 unsigned int flags)
493{
494 int ret = LM_OUT_ERROR;
495
496 if (!sdp->sd_lockstruct.ls_ops->lm_lock)
497 return req_state == LM_ST_UNLOCKED ? 0 : req_state;
498
499 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
500 ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock, cur_state,
501 req_state, flags);
502 return ret;
503}
504
505/**
506 * do_xmote - Calls the DLM to change the state of a lock
507 * @gl: The lock state
508 * @gh: The holder (only for promotes)
509 * @target: The target lock state
510 *
511 */
512
513static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target)
514{
515 const struct gfs2_glock_operations *glops = gl->gl_ops;
516 struct gfs2_sbd *sdp = gl->gl_sbd;
517 unsigned int lck_flags = gh ? gh->gh_flags : 0;
518 int ret;
519
520 lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP |
521 LM_FLAG_PRIORITY);
522 BUG_ON(gl->gl_state == target);
523 BUG_ON(gl->gl_state == gl->gl_target);
524 if ((target == LM_ST_UNLOCKED || target == LM_ST_DEFERRED) &&
525 glops->go_inval) {
526 set_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags);
527 do_error(gl, 0); /* Fail queued try locks */
528 }
529 spin_unlock(&gl->gl_spin);
530 if (glops->go_xmote_th)
531 glops->go_xmote_th(gl);
532 if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags))
533 glops->go_inval(gl, target == LM_ST_DEFERRED ? 0 : DIO_METADATA);
534 clear_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags);
535
536 gfs2_glock_hold(gl);
537 if (target != LM_ST_UNLOCKED && (gl->gl_state == LM_ST_SHARED ||
538 gl->gl_state == LM_ST_DEFERRED) &&
539 !(lck_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
540 lck_flags |= LM_FLAG_TRY_1CB;
541 ret = gfs2_lm_lock(sdp, gl->gl_lock, gl->gl_state, target, lck_flags);
542
543 if (!(ret & LM_OUT_ASYNC)) {
544 finish_xmote(gl, ret);
545 gfs2_glock_hold(gl);
546 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
547 gfs2_glock_put(gl);
548 } else {
549 GLOCK_BUG_ON(gl, ret != LM_OUT_ASYNC);
550 }
551 spin_lock(&gl->gl_spin);
552}
553
554/**
555 * find_first_holder - find the first "holder" gh
556 * @gl: the glock
557 */
558
559static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl)
560{
561 struct gfs2_holder *gh;
562
563 if (!list_empty(&gl->gl_holders)) {
564 gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list);
565 if (test_bit(HIF_HOLDER, &gh->gh_iflags))
566 return gh;
567 }
568 return NULL;
569}
570
571/**
572 * run_queue - do all outstanding tasks related to a glock
573 * @gl: The glock in question
574 * @nonblock: True if we must not block in run_queue
575 *
576 */
577
578static void run_queue(struct gfs2_glock *gl, const int nonblock)
579{
580 struct gfs2_holder *gh = NULL;
581
582 if (test_and_set_bit(GLF_LOCK, &gl->gl_flags))
583 return;
584
585 GLOCK_BUG_ON(gl, test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags));
586
587 if (test_bit(GLF_DEMOTE, &gl->gl_flags) &&
588 gl->gl_demote_state != gl->gl_state) {
589 if (find_first_holder(gl))
590 goto out;
591 if (nonblock)
592 goto out_sched;
593 set_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags);
594 GLOCK_BUG_ON(gl, gl->gl_demote_state == LM_ST_EXCLUSIVE);
595 gl->gl_target = gl->gl_demote_state;
596 } else {
597 if (test_bit(GLF_DEMOTE, &gl->gl_flags))
598 gfs2_demote_wake(gl);
599 if (do_promote(gl) == 0)
600 goto out;
601 gh = find_first_waiter(gl);
602 gl->gl_target = gh->gh_state;
603 if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
604 do_error(gl, 0); /* Fail queued try locks */
605 }
606 do_xmote(gl, gh, gl->gl_target);
607 return;
608
609out_sched:
610 gfs2_glock_hold(gl);
611 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
612 gfs2_glock_put(gl);
613out:
614 clear_bit(GLF_LOCK, &gl->gl_flags);
615}
616
284static void glock_work_func(struct work_struct *work) 617static void glock_work_func(struct work_struct *work)
285{ 618{
619 unsigned long delay = 0;
286 struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_work.work); 620 struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_work.work);
287 621
622 if (test_and_clear_bit(GLF_REPLY_PENDING, &gl->gl_flags))
623 finish_xmote(gl, gl->gl_reply);
288 spin_lock(&gl->gl_spin); 624 spin_lock(&gl->gl_spin);
289 if (test_and_clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags)) 625 if (test_and_clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
290 set_bit(GLF_DEMOTE, &gl->gl_flags); 626 gl->gl_state != LM_ST_UNLOCKED &&
291 run_queue(gl); 627 gl->gl_demote_state != LM_ST_EXCLUSIVE) {
628 unsigned long holdtime, now = jiffies;
629 holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time;
630 if (time_before(now, holdtime))
631 delay = holdtime - now;
632 set_bit(delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE, &gl->gl_flags);
633 }
634 run_queue(gl, 0);
292 spin_unlock(&gl->gl_spin); 635 spin_unlock(&gl->gl_spin);
293 gfs2_glock_put(gl); 636 if (!delay ||
637 queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0)
638 gfs2_glock_put(gl);
294} 639}
295 640
296static int gfs2_lm_get_lock(struct gfs2_sbd *sdp, struct lm_lockname *name, 641static int gfs2_lm_get_lock(struct gfs2_sbd *sdp, struct lm_lockname *name,
297 void **lockp) 642 void **lockp)
298{ 643{
299 int error = -EIO; 644 int error = -EIO;
645 if (!sdp->sd_lockstruct.ls_ops->lm_get_lock)
646 return 0;
300 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) 647 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
301 error = sdp->sd_lockstruct.ls_ops->lm_get_lock( 648 error = sdp->sd_lockstruct.ls_ops->lm_get_lock(
302 sdp->sd_lockstruct.ls_lockspace, name, lockp); 649 sdp->sd_lockstruct.ls_lockspace, name, lockp);
@@ -342,12 +689,10 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
342 gl->gl_name = name; 689 gl->gl_name = name;
343 atomic_set(&gl->gl_ref, 1); 690 atomic_set(&gl->gl_ref, 1);
344 gl->gl_state = LM_ST_UNLOCKED; 691 gl->gl_state = LM_ST_UNLOCKED;
692 gl->gl_target = LM_ST_UNLOCKED;
345 gl->gl_demote_state = LM_ST_EXCLUSIVE; 693 gl->gl_demote_state = LM_ST_EXCLUSIVE;
346 gl->gl_hash = hash; 694 gl->gl_hash = hash;
347 gl->gl_owner_pid = NULL;
348 gl->gl_ip = 0;
349 gl->gl_ops = glops; 695 gl->gl_ops = glops;
350 gl->gl_req_gh = NULL;
351 gl->gl_stamp = jiffies; 696 gl->gl_stamp = jiffies;
352 gl->gl_tchange = jiffies; 697 gl->gl_tchange = jiffies;
353 gl->gl_object = NULL; 698 gl->gl_object = NULL;
@@ -447,13 +792,6 @@ void gfs2_holder_uninit(struct gfs2_holder *gh)
447 gh->gh_ip = 0; 792 gh->gh_ip = 0;
448} 793}
449 794
450static void gfs2_holder_wake(struct gfs2_holder *gh)
451{
452 clear_bit(HIF_WAIT, &gh->gh_iflags);
453 smp_mb__after_clear_bit();
454 wake_up_bit(&gh->gh_iflags, HIF_WAIT);
455}
456
457static int just_schedule(void *word) 795static int just_schedule(void *word)
458{ 796{
459 schedule(); 797 schedule();
@@ -466,14 +804,6 @@ static void wait_on_holder(struct gfs2_holder *gh)
466 wait_on_bit(&gh->gh_iflags, HIF_WAIT, just_schedule, TASK_UNINTERRUPTIBLE); 804 wait_on_bit(&gh->gh_iflags, HIF_WAIT, just_schedule, TASK_UNINTERRUPTIBLE);
467} 805}
468 806
469static void gfs2_demote_wake(struct gfs2_glock *gl)
470{
471 gl->gl_demote_state = LM_ST_EXCLUSIVE;
472 clear_bit(GLF_DEMOTE, &gl->gl_flags);
473 smp_mb__after_clear_bit();
474 wake_up_bit(&gl->gl_flags, GLF_DEMOTE);
475}
476
477static void wait_on_demote(struct gfs2_glock *gl) 807static void wait_on_demote(struct gfs2_glock *gl)
478{ 808{
479 might_sleep(); 809 might_sleep();
@@ -481,217 +811,6 @@ static void wait_on_demote(struct gfs2_glock *gl)
481} 811}
482 812
483/** 813/**
484 * rq_mutex - process a mutex request in the queue
485 * @gh: the glock holder
486 *
487 * Returns: 1 if the queue is blocked
488 */
489
490static int rq_mutex(struct gfs2_holder *gh)
491{
492 struct gfs2_glock *gl = gh->gh_gl;
493
494 list_del_init(&gh->gh_list);
495 /* gh->gh_error never examined. */
496 set_bit(GLF_LOCK, &gl->gl_flags);
497 clear_bit(HIF_WAIT, &gh->gh_iflags);
498 smp_mb();
499 wake_up_bit(&gh->gh_iflags, HIF_WAIT);
500
501 return 1;
502}
503
504/**
505 * rq_promote - process a promote request in the queue
506 * @gh: the glock holder
507 *
508 * Acquire a new inter-node lock, or change a lock state to more restrictive.
509 *
510 * Returns: 1 if the queue is blocked
511 */
512
513static int rq_promote(struct gfs2_holder *gh)
514{
515 struct gfs2_glock *gl = gh->gh_gl;
516
517 if (!relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) {
518 if (list_empty(&gl->gl_holders)) {
519 gl->gl_req_gh = gh;
520 set_bit(GLF_LOCK, &gl->gl_flags);
521 spin_unlock(&gl->gl_spin);
522 gfs2_glock_xmote_th(gh->gh_gl, gh);
523 spin_lock(&gl->gl_spin);
524 }
525 return 1;
526 }
527
528 if (list_empty(&gl->gl_holders)) {
529 set_bit(HIF_FIRST, &gh->gh_iflags);
530 set_bit(GLF_LOCK, &gl->gl_flags);
531 } else {
532 struct gfs2_holder *next_gh;
533 if (gh->gh_state == LM_ST_EXCLUSIVE)
534 return 1;
535 next_gh = list_entry(gl->gl_holders.next, struct gfs2_holder,
536 gh_list);
537 if (next_gh->gh_state == LM_ST_EXCLUSIVE)
538 return 1;
539 }
540
541 list_move_tail(&gh->gh_list, &gl->gl_holders);
542 gh->gh_error = 0;
543 set_bit(HIF_HOLDER, &gh->gh_iflags);
544
545 gfs2_holder_wake(gh);
546
547 return 0;
548}
549
550/**
551 * rq_demote - process a demote request in the queue
552 * @gh: the glock holder
553 *
554 * Returns: 1 if the queue is blocked
555 */
556
557static int rq_demote(struct gfs2_glock *gl)
558{
559 if (!list_empty(&gl->gl_holders))
560 return 1;
561
562 if (gl->gl_state == gl->gl_demote_state ||
563 gl->gl_state == LM_ST_UNLOCKED) {
564 gfs2_demote_wake(gl);
565 return 0;
566 }
567
568 set_bit(GLF_LOCK, &gl->gl_flags);
569 set_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags);
570
571 if (gl->gl_demote_state == LM_ST_UNLOCKED ||
572 gl->gl_state != LM_ST_EXCLUSIVE) {
573 spin_unlock(&gl->gl_spin);
574 gfs2_glock_drop_th(gl);
575 } else {
576 spin_unlock(&gl->gl_spin);
577 gfs2_glock_xmote_th(gl, NULL);
578 }
579
580 spin_lock(&gl->gl_spin);
581 clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags);
582
583 return 0;
584}
585
586/**
587 * run_queue - process holder structures on a glock
588 * @gl: the glock
589 *
590 */
591static void run_queue(struct gfs2_glock *gl)
592{
593 struct gfs2_holder *gh;
594 int blocked = 1;
595
596 for (;;) {
597 if (test_bit(GLF_LOCK, &gl->gl_flags))
598 break;
599
600 if (!list_empty(&gl->gl_waiters1)) {
601 gh = list_entry(gl->gl_waiters1.next,
602 struct gfs2_holder, gh_list);
603 blocked = rq_mutex(gh);
604 } else if (test_bit(GLF_DEMOTE, &gl->gl_flags)) {
605 blocked = rq_demote(gl);
606 if (test_bit(GLF_WAITERS2, &gl->gl_flags) &&
607 !blocked) {
608 set_bit(GLF_DEMOTE, &gl->gl_flags);
609 gl->gl_demote_state = LM_ST_UNLOCKED;
610 }
611 clear_bit(GLF_WAITERS2, &gl->gl_flags);
612 } else if (!list_empty(&gl->gl_waiters3)) {
613 gh = list_entry(gl->gl_waiters3.next,
614 struct gfs2_holder, gh_list);
615 blocked = rq_promote(gh);
616 } else
617 break;
618
619 if (blocked)
620 break;
621 }
622}
623
624/**
625 * gfs2_glmutex_lock - acquire a local lock on a glock
626 * @gl: the glock
627 *
628 * Gives caller exclusive access to manipulate a glock structure.
629 */
630
631static void gfs2_glmutex_lock(struct gfs2_glock *gl)
632{
633 spin_lock(&gl->gl_spin);
634 if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
635 struct gfs2_holder gh;
636
637 gfs2_holder_init(gl, 0, 0, &gh);
638 set_bit(HIF_WAIT, &gh.gh_iflags);
639 list_add_tail(&gh.gh_list, &gl->gl_waiters1);
640 spin_unlock(&gl->gl_spin);
641 wait_on_holder(&gh);
642 gfs2_holder_uninit(&gh);
643 } else {
644 gl->gl_owner_pid = get_pid(task_pid(current));
645 gl->gl_ip = (unsigned long)__builtin_return_address(0);
646 spin_unlock(&gl->gl_spin);
647 }
648}
649
650/**
651 * gfs2_glmutex_trylock - try to acquire a local lock on a glock
652 * @gl: the glock
653 *
654 * Returns: 1 if the glock is acquired
655 */
656
657static int gfs2_glmutex_trylock(struct gfs2_glock *gl)
658{
659 int acquired = 1;
660
661 spin_lock(&gl->gl_spin);
662 if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
663 acquired = 0;
664 } else {
665 gl->gl_owner_pid = get_pid(task_pid(current));
666 gl->gl_ip = (unsigned long)__builtin_return_address(0);
667 }
668 spin_unlock(&gl->gl_spin);
669
670 return acquired;
671}
672
673/**
674 * gfs2_glmutex_unlock - release a local lock on a glock
675 * @gl: the glock
676 *
677 */
678
679static void gfs2_glmutex_unlock(struct gfs2_glock *gl)
680{
681 struct pid *pid;
682
683 spin_lock(&gl->gl_spin);
684 clear_bit(GLF_LOCK, &gl->gl_flags);
685 pid = gl->gl_owner_pid;
686 gl->gl_owner_pid = NULL;
687 gl->gl_ip = 0;
688 run_queue(gl);
689 spin_unlock(&gl->gl_spin);
690
691 put_pid(pid);
692}
693
694/**
695 * handle_callback - process a demote request 814 * handle_callback - process a demote request
696 * @gl: the glock 815 * @gl: the glock
697 * @state: the state the caller wants us to change to 816 * @state: the state the caller wants us to change to
@@ -705,398 +824,45 @@ static void handle_callback(struct gfs2_glock *gl, unsigned int state,
705{ 824{
706 int bit = delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE; 825 int bit = delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE;
707 826
708 spin_lock(&gl->gl_spin);
709 set_bit(bit, &gl->gl_flags); 827 set_bit(bit, &gl->gl_flags);
710 if (gl->gl_demote_state == LM_ST_EXCLUSIVE) { 828 if (gl->gl_demote_state == LM_ST_EXCLUSIVE) {
711 gl->gl_demote_state = state; 829 gl->gl_demote_state = state;
712 gl->gl_demote_time = jiffies; 830 gl->gl_demote_time = jiffies;
713 if (remote && gl->gl_ops->go_type == LM_TYPE_IOPEN && 831 if (remote && gl->gl_ops->go_type == LM_TYPE_IOPEN &&
714 gl->gl_object) { 832 gl->gl_object)
715 gfs2_glock_schedule_for_reclaim(gl); 833 gfs2_glock_schedule_for_reclaim(gl);
716 spin_unlock(&gl->gl_spin);
717 return;
718 }
719 } else if (gl->gl_demote_state != LM_ST_UNLOCKED && 834 } else if (gl->gl_demote_state != LM_ST_UNLOCKED &&
720 gl->gl_demote_state != state) { 835 gl->gl_demote_state != state) {
721 if (test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)) 836 gl->gl_demote_state = LM_ST_UNLOCKED;
722 set_bit(GLF_WAITERS2, &gl->gl_flags);
723 else
724 gl->gl_demote_state = LM_ST_UNLOCKED;
725 }
726 spin_unlock(&gl->gl_spin);
727}
728
729/**
730 * state_change - record that the glock is now in a different state
731 * @gl: the glock
732 * @new_state the new state
733 *
734 */
735
736static void state_change(struct gfs2_glock *gl, unsigned int new_state)
737{
738 int held1, held2;
739
740 held1 = (gl->gl_state != LM_ST_UNLOCKED);
741 held2 = (new_state != LM_ST_UNLOCKED);
742
743 if (held1 != held2) {
744 if (held2)
745 gfs2_glock_hold(gl);
746 else
747 gfs2_glock_put(gl);
748 } 837 }
749
750 gl->gl_state = new_state;
751 gl->gl_tchange = jiffies;
752} 838}
753 839
754/** 840/**
755 * drop_bh - Called after a lock module unlock completes 841 * gfs2_glock_wait - wait on a glock acquisition
756 * @gl: the glock
757 * @ret: the return status
758 *
759 * Doesn't wake up the process waiting on the struct gfs2_holder (if any)
760 * Doesn't drop the reference on the glock the top half took out
761 *
762 */
763
764static void drop_bh(struct gfs2_glock *gl, unsigned int ret)
765{
766 struct gfs2_sbd *sdp = gl->gl_sbd;
767 struct gfs2_holder *gh = gl->gl_req_gh;
768
769 gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
770 gfs2_assert_warn(sdp, list_empty(&gl->gl_holders));
771 gfs2_assert_warn(sdp, !ret);
772
773 state_change(gl, LM_ST_UNLOCKED);
774
775 if (test_and_clear_bit(GLF_CONV_DEADLK, &gl->gl_flags)) {
776 spin_lock(&gl->gl_spin);
777 gh->gh_error = 0;
778 spin_unlock(&gl->gl_spin);
779 gfs2_glock_xmote_th(gl, gl->gl_req_gh);
780 gfs2_glock_put(gl);
781 return;
782 }
783
784 spin_lock(&gl->gl_spin);
785 gfs2_demote_wake(gl);
786 clear_bit(GLF_LOCK, &gl->gl_flags);
787 spin_unlock(&gl->gl_spin);
788 gfs2_glock_put(gl);
789}
790
791/**
792 * xmote_bh - Called after the lock module is done acquiring a lock
793 * @gl: The glock in question
794 * @ret: the int returned from the lock module
795 *
796 */
797
798static void xmote_bh(struct gfs2_glock *gl, unsigned int ret)
799{
800 struct gfs2_sbd *sdp = gl->gl_sbd;
801 const struct gfs2_glock_operations *glops = gl->gl_ops;
802 struct gfs2_holder *gh = gl->gl_req_gh;
803 int op_done = 1;
804
805 if (!gh && (ret & LM_OUT_ST_MASK) == LM_ST_UNLOCKED) {
806 drop_bh(gl, ret);
807 return;
808 }
809
810 gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
811 gfs2_assert_warn(sdp, list_empty(&gl->gl_holders));
812 gfs2_assert_warn(sdp, !(ret & LM_OUT_ASYNC));
813
814 state_change(gl, ret & LM_OUT_ST_MASK);
815
816 /* Deal with each possible exit condition */
817
818 if (!gh) {
819 gl->gl_stamp = jiffies;
820 if (ret & LM_OUT_CANCELED) {
821 op_done = 0;
822 } else {
823 spin_lock(&gl->gl_spin);
824 if (gl->gl_state != gl->gl_demote_state) {
825 spin_unlock(&gl->gl_spin);
826 gfs2_glock_drop_th(gl);
827 gfs2_glock_put(gl);
828 return;
829 }
830 gfs2_demote_wake(gl);
831 spin_unlock(&gl->gl_spin);
832 }
833 } else {
834 spin_lock(&gl->gl_spin);
835 if (ret & LM_OUT_CONV_DEADLK) {
836 gh->gh_error = 0;
837 set_bit(GLF_CONV_DEADLK, &gl->gl_flags);
838 spin_unlock(&gl->gl_spin);
839 gfs2_glock_drop_th(gl);
840 gfs2_glock_put(gl);
841 return;
842 }
843 list_del_init(&gh->gh_list);
844 gh->gh_error = -EIO;
845 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
846 goto out;
847 gh->gh_error = GLR_CANCELED;
848 if (ret & LM_OUT_CANCELED)
849 goto out;
850 if (relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) {
851 list_add_tail(&gh->gh_list, &gl->gl_holders);
852 gh->gh_error = 0;
853 set_bit(HIF_HOLDER, &gh->gh_iflags);
854 set_bit(HIF_FIRST, &gh->gh_iflags);
855 op_done = 0;
856 goto out;
857 }
858 gh->gh_error = GLR_TRYFAILED;
859 if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))
860 goto out;
861 gh->gh_error = -EINVAL;
862 if (gfs2_assert_withdraw(sdp, 0) == -1)
863 fs_err(sdp, "ret = 0x%.8X\n", ret);
864out:
865 spin_unlock(&gl->gl_spin);
866 }
867
868 if (glops->go_xmote_bh)
869 glops->go_xmote_bh(gl);
870
871 if (op_done) {
872 spin_lock(&gl->gl_spin);
873 gl->gl_req_gh = NULL;
874 clear_bit(GLF_LOCK, &gl->gl_flags);
875 spin_unlock(&gl->gl_spin);
876 }
877
878 gfs2_glock_put(gl);
879
880 if (gh)
881 gfs2_holder_wake(gh);
882}
883
884static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
885 unsigned int cur_state, unsigned int req_state,
886 unsigned int flags)
887{
888 int ret = 0;
889 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
890 ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock, cur_state,
891 req_state, flags);
892 return ret;
893}
894
895/**
896 * gfs2_glock_xmote_th - Call into the lock module to acquire or change a glock
897 * @gl: The glock in question
898 * @state: the requested state
899 * @flags: modifier flags to the lock call
900 *
901 */
902
903static void gfs2_glock_xmote_th(struct gfs2_glock *gl, struct gfs2_holder *gh)
904{
905 struct gfs2_sbd *sdp = gl->gl_sbd;
906 int flags = gh ? gh->gh_flags : 0;
907 unsigned state = gh ? gh->gh_state : gl->gl_demote_state;
908 const struct gfs2_glock_operations *glops = gl->gl_ops;
909 int lck_flags = flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB |
910 LM_FLAG_NOEXP | LM_FLAG_ANY |
911 LM_FLAG_PRIORITY);
912 unsigned int lck_ret;
913
914 if (glops->go_xmote_th)
915 glops->go_xmote_th(gl);
916 if (state == LM_ST_DEFERRED && glops->go_inval)
917 glops->go_inval(gl, DIO_METADATA);
918
919 gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
920 gfs2_assert_warn(sdp, list_empty(&gl->gl_holders));
921 gfs2_assert_warn(sdp, state != LM_ST_UNLOCKED);
922 gfs2_assert_warn(sdp, state != gl->gl_state);
923
924 gfs2_glock_hold(gl);
925
926 lck_ret = gfs2_lm_lock(sdp, gl->gl_lock, gl->gl_state, state, lck_flags);
927
928 if (gfs2_assert_withdraw(sdp, !(lck_ret & LM_OUT_ERROR)))
929 return;
930
931 if (lck_ret & LM_OUT_ASYNC)
932 gfs2_assert_warn(sdp, lck_ret == LM_OUT_ASYNC);
933 else
934 xmote_bh(gl, lck_ret);
935}
936
937static unsigned int gfs2_lm_unlock(struct gfs2_sbd *sdp, void *lock,
938 unsigned int cur_state)
939{
940 int ret = 0;
941 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
942 ret = sdp->sd_lockstruct.ls_ops->lm_unlock(lock, cur_state);
943 return ret;
944}
945
946/**
947 * gfs2_glock_drop_th - call into the lock module to unlock a lock
948 * @gl: the glock
949 *
950 */
951
952static void gfs2_glock_drop_th(struct gfs2_glock *gl)
953{
954 struct gfs2_sbd *sdp = gl->gl_sbd;
955 const struct gfs2_glock_operations *glops = gl->gl_ops;
956 unsigned int ret;
957
958 if (glops->go_xmote_th)
959 glops->go_xmote_th(gl);
960 if (glops->go_inval)
961 glops->go_inval(gl, DIO_METADATA);
962
963 gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
964 gfs2_assert_warn(sdp, list_empty(&gl->gl_holders));
965 gfs2_assert_warn(sdp, gl->gl_state != LM_ST_UNLOCKED);
966
967 gfs2_glock_hold(gl);
968
969 ret = gfs2_lm_unlock(sdp, gl->gl_lock, gl->gl_state);
970
971 if (gfs2_assert_withdraw(sdp, !(ret & LM_OUT_ERROR)))
972 return;
973
974 if (!ret)
975 drop_bh(gl, ret);
976 else
977 gfs2_assert_warn(sdp, ret == LM_OUT_ASYNC);
978}
979
980/**
981 * do_cancels - cancel requests for locks stuck waiting on an expire flag
982 * @gh: the LM_FLAG_PRIORITY holder waiting to acquire the lock
983 *
984 * Don't cancel GL_NOCANCEL requests.
985 */
986
987static void do_cancels(struct gfs2_holder *gh)
988{
989 struct gfs2_glock *gl = gh->gh_gl;
990 struct gfs2_sbd *sdp = gl->gl_sbd;
991
992 spin_lock(&gl->gl_spin);
993
994 while (gl->gl_req_gh != gh &&
995 !test_bit(HIF_HOLDER, &gh->gh_iflags) &&
996 !list_empty(&gh->gh_list)) {
997 if (!(gl->gl_req_gh && (gl->gl_req_gh->gh_flags & GL_NOCANCEL))) {
998 spin_unlock(&gl->gl_spin);
999 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
1000 sdp->sd_lockstruct.ls_ops->lm_cancel(gl->gl_lock);
1001 msleep(100);
1002 spin_lock(&gl->gl_spin);
1003 } else {
1004 spin_unlock(&gl->gl_spin);
1005 msleep(100);
1006 spin_lock(&gl->gl_spin);
1007 }
1008 }
1009
1010 spin_unlock(&gl->gl_spin);
1011}
1012
1013/**
1014 * glock_wait_internal - wait on a glock acquisition
1015 * @gh: the glock holder 842 * @gh: the glock holder
1016 * 843 *
1017 * Returns: 0 on success 844 * Returns: 0 on success
1018 */ 845 */
1019 846
1020static int glock_wait_internal(struct gfs2_holder *gh) 847int gfs2_glock_wait(struct gfs2_holder *gh)
1021{ 848{
1022 struct gfs2_glock *gl = gh->gh_gl;
1023 struct gfs2_sbd *sdp = gl->gl_sbd;
1024 const struct gfs2_glock_operations *glops = gl->gl_ops;
1025
1026 if (test_bit(HIF_ABORTED, &gh->gh_iflags))
1027 return -EIO;
1028
1029 if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
1030 spin_lock(&gl->gl_spin);
1031 if (gl->gl_req_gh != gh &&
1032 !test_bit(HIF_HOLDER, &gh->gh_iflags) &&
1033 !list_empty(&gh->gh_list)) {
1034 list_del_init(&gh->gh_list);
1035 gh->gh_error = GLR_TRYFAILED;
1036 run_queue(gl);
1037 spin_unlock(&gl->gl_spin);
1038 return gh->gh_error;
1039 }
1040 spin_unlock(&gl->gl_spin);
1041 }
1042
1043 if (gh->gh_flags & LM_FLAG_PRIORITY)
1044 do_cancels(gh);
1045
1046 wait_on_holder(gh); 849 wait_on_holder(gh);
1047 if (gh->gh_error)
1048 return gh->gh_error;
1049
1050 gfs2_assert_withdraw(sdp, test_bit(HIF_HOLDER, &gh->gh_iflags));
1051 gfs2_assert_withdraw(sdp, relaxed_state_ok(gl->gl_state, gh->gh_state,
1052 gh->gh_flags));
1053
1054 if (test_bit(HIF_FIRST, &gh->gh_iflags)) {
1055 gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
1056
1057 if (glops->go_lock) {
1058 gh->gh_error = glops->go_lock(gh);
1059 if (gh->gh_error) {
1060 spin_lock(&gl->gl_spin);
1061 list_del_init(&gh->gh_list);
1062 spin_unlock(&gl->gl_spin);
1063 }
1064 }
1065
1066 spin_lock(&gl->gl_spin);
1067 gl->gl_req_gh = NULL;
1068 clear_bit(GLF_LOCK, &gl->gl_flags);
1069 run_queue(gl);
1070 spin_unlock(&gl->gl_spin);
1071 }
1072
1073 return gh->gh_error; 850 return gh->gh_error;
1074} 851}
1075 852
1076static inline struct gfs2_holder * 853void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...)
1077find_holder_by_owner(struct list_head *head, struct pid *pid)
1078{
1079 struct gfs2_holder *gh;
1080
1081 list_for_each_entry(gh, head, gh_list) {
1082 if (gh->gh_owner_pid == pid)
1083 return gh;
1084 }
1085
1086 return NULL;
1087}
1088
1089static void print_dbg(struct glock_iter *gi, const char *fmt, ...)
1090{ 854{
1091 va_list args; 855 va_list args;
1092 856
1093 va_start(args, fmt); 857 va_start(args, fmt);
1094 if (gi) { 858 if (seq) {
859 struct gfs2_glock_iter *gi = seq->private;
1095 vsprintf(gi->string, fmt, args); 860 vsprintf(gi->string, fmt, args);
1096 seq_printf(gi->seq, gi->string); 861 seq_printf(seq, gi->string);
1097 } 862 } else {
1098 else 863 printk(KERN_ERR " ");
1099 vprintk(fmt, args); 864 vprintk(fmt, args);
865 }
1100 va_end(args); 866 va_end(args);
1101} 867}
1102 868
@@ -1104,50 +870,76 @@ static void print_dbg(struct glock_iter *gi, const char *fmt, ...)
1104 * add_to_queue - Add a holder to the wait queue (but look for recursion) 870 * add_to_queue - Add a holder to the wait queue (but look for recursion)
1105 * @gh: the holder structure to add 871 * @gh: the holder structure to add
1106 * 872 *
873 * Eventually we should move the recursive locking trap to a
874 * debugging option or something like that. This is the fast
875 * path and needs to have the minimum number of distractions.
876 *
1107 */ 877 */
1108 878
1109static void add_to_queue(struct gfs2_holder *gh) 879static inline void add_to_queue(struct gfs2_holder *gh)
1110{ 880{
1111 struct gfs2_glock *gl = gh->gh_gl; 881 struct gfs2_glock *gl = gh->gh_gl;
1112 struct gfs2_holder *existing; 882 struct gfs2_sbd *sdp = gl->gl_sbd;
883 struct list_head *insert_pt = NULL;
884 struct gfs2_holder *gh2;
885 int try_lock = 0;
1113 886
1114 BUG_ON(gh->gh_owner_pid == NULL); 887 BUG_ON(gh->gh_owner_pid == NULL);
1115 if (test_and_set_bit(HIF_WAIT, &gh->gh_iflags)) 888 if (test_and_set_bit(HIF_WAIT, &gh->gh_iflags))
1116 BUG(); 889 BUG();
1117 890
1118 if (!(gh->gh_flags & GL_FLOCK)) { 891 if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
1119 existing = find_holder_by_owner(&gl->gl_holders, 892 if (test_bit(GLF_LOCK, &gl->gl_flags))
1120 gh->gh_owner_pid); 893 try_lock = 1;
1121 if (existing) { 894 if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags))
1122 print_symbol(KERN_WARNING "original: %s\n", 895 goto fail;
1123 existing->gh_ip); 896 }
1124 printk(KERN_INFO "pid : %d\n", 897
1125 pid_nr(existing->gh_owner_pid)); 898 list_for_each_entry(gh2, &gl->gl_holders, gh_list) {
1126 printk(KERN_INFO "lock type : %d lock state : %d\n", 899 if (unlikely(gh2->gh_owner_pid == gh->gh_owner_pid &&
1127 existing->gh_gl->gl_name.ln_type, 900 (gh->gh_gl->gl_ops->go_type != LM_TYPE_FLOCK)))
1128 existing->gh_gl->gl_state); 901 goto trap_recursive;
1129 print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip); 902 if (try_lock &&
1130 printk(KERN_INFO "pid : %d\n", 903 !(gh2->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) &&
1131 pid_nr(gh->gh_owner_pid)); 904 !may_grant(gl, gh)) {
1132 printk(KERN_INFO "lock type : %d lock state : %d\n", 905fail:
1133 gl->gl_name.ln_type, gl->gl_state); 906 gh->gh_error = GLR_TRYFAILED;
1134 BUG(); 907 gfs2_holder_wake(gh);
1135 } 908 return;
1136
1137 existing = find_holder_by_owner(&gl->gl_waiters3,
1138 gh->gh_owner_pid);
1139 if (existing) {
1140 print_symbol(KERN_WARNING "original: %s\n",
1141 existing->gh_ip);
1142 print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip);
1143 BUG();
1144 } 909 }
910 if (test_bit(HIF_HOLDER, &gh2->gh_iflags))
911 continue;
912 if (unlikely((gh->gh_flags & LM_FLAG_PRIORITY) && !insert_pt))
913 insert_pt = &gh2->gh_list;
914 }
915 if (likely(insert_pt == NULL)) {
916 list_add_tail(&gh->gh_list, &gl->gl_holders);
917 if (unlikely(gh->gh_flags & LM_FLAG_PRIORITY))
918 goto do_cancel;
919 return;
920 }
921 list_add_tail(&gh->gh_list, insert_pt);
922do_cancel:
923 gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list);
924 if (!(gh->gh_flags & LM_FLAG_PRIORITY)) {
925 spin_unlock(&gl->gl_spin);
926 if (sdp->sd_lockstruct.ls_ops->lm_cancel)
927 sdp->sd_lockstruct.ls_ops->lm_cancel(gl->gl_lock);
928 spin_lock(&gl->gl_spin);
1145 } 929 }
930 return;
1146 931
1147 if (gh->gh_flags & LM_FLAG_PRIORITY) 932trap_recursive:
1148 list_add(&gh->gh_list, &gl->gl_waiters3); 933 print_symbol(KERN_ERR "original: %s\n", gh2->gh_ip);
1149 else 934 printk(KERN_ERR "pid: %d\n", pid_nr(gh2->gh_owner_pid));
1150 list_add_tail(&gh->gh_list, &gl->gl_waiters3); 935 printk(KERN_ERR "lock type: %d req lock state : %d\n",
936 gh2->gh_gl->gl_name.ln_type, gh2->gh_state);
937 print_symbol(KERN_ERR "new: %s\n", gh->gh_ip);
938 printk(KERN_ERR "pid: %d\n", pid_nr(gh->gh_owner_pid));
939 printk(KERN_ERR "lock type: %d req lock state : %d\n",
940 gh->gh_gl->gl_name.ln_type, gh->gh_state);
941 __dump_glock(NULL, gl);
942 BUG();
1151} 943}
1152 944
1153/** 945/**
@@ -1165,24 +957,16 @@ int gfs2_glock_nq(struct gfs2_holder *gh)
1165 struct gfs2_sbd *sdp = gl->gl_sbd; 957 struct gfs2_sbd *sdp = gl->gl_sbd;
1166 int error = 0; 958 int error = 0;
1167 959
1168restart: 960 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
1169 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) {
1170 set_bit(HIF_ABORTED, &gh->gh_iflags);
1171 return -EIO; 961 return -EIO;
1172 }
1173 962
1174 spin_lock(&gl->gl_spin); 963 spin_lock(&gl->gl_spin);
1175 add_to_queue(gh); 964 add_to_queue(gh);
1176 run_queue(gl); 965 run_queue(gl, 1);
1177 spin_unlock(&gl->gl_spin); 966 spin_unlock(&gl->gl_spin);
1178 967
1179 if (!(gh->gh_flags & GL_ASYNC)) { 968 if (!(gh->gh_flags & GL_ASYNC))
1180 error = glock_wait_internal(gh); 969 error = gfs2_glock_wait(gh);
1181 if (error == GLR_CANCELED) {
1182 msleep(100);
1183 goto restart;
1184 }
1185 }
1186 970
1187 return error; 971 return error;
1188} 972}
@@ -1196,48 +980,7 @@ restart:
1196 980
1197int gfs2_glock_poll(struct gfs2_holder *gh) 981int gfs2_glock_poll(struct gfs2_holder *gh)
1198{ 982{
1199 struct gfs2_glock *gl = gh->gh_gl; 983 return test_bit(HIF_WAIT, &gh->gh_iflags) ? 0 : 1;
1200 int ready = 0;
1201
1202 spin_lock(&gl->gl_spin);
1203
1204 if (test_bit(HIF_HOLDER, &gh->gh_iflags))
1205 ready = 1;
1206 else if (list_empty(&gh->gh_list)) {
1207 if (gh->gh_error == GLR_CANCELED) {
1208 spin_unlock(&gl->gl_spin);
1209 msleep(100);
1210 if (gfs2_glock_nq(gh))
1211 return 1;
1212 return 0;
1213 } else
1214 ready = 1;
1215 }
1216
1217 spin_unlock(&gl->gl_spin);
1218
1219 return ready;
1220}
1221
1222/**
1223 * gfs2_glock_wait - wait for a lock acquisition that ended in a GLR_ASYNC
1224 * @gh: the holder structure
1225 *
1226 * Returns: 0, GLR_TRYFAILED, or errno on failure
1227 */
1228
1229int gfs2_glock_wait(struct gfs2_holder *gh)
1230{
1231 int error;
1232
1233 error = glock_wait_internal(gh);
1234 if (error == GLR_CANCELED) {
1235 msleep(100);
1236 gh->gh_flags &= ~GL_ASYNC;
1237 error = gfs2_glock_nq(gh);
1238 }
1239
1240 return error;
1241} 984}
1242 985
1243/** 986/**
@@ -1251,26 +994,30 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
1251 struct gfs2_glock *gl = gh->gh_gl; 994 struct gfs2_glock *gl = gh->gh_gl;
1252 const struct gfs2_glock_operations *glops = gl->gl_ops; 995 const struct gfs2_glock_operations *glops = gl->gl_ops;
1253 unsigned delay = 0; 996 unsigned delay = 0;
997 int fast_path = 0;
1254 998
999 spin_lock(&gl->gl_spin);
1255 if (gh->gh_flags & GL_NOCACHE) 1000 if (gh->gh_flags & GL_NOCACHE)
1256 handle_callback(gl, LM_ST_UNLOCKED, 0, 0); 1001 handle_callback(gl, LM_ST_UNLOCKED, 0, 0);
1257 1002
1258 gfs2_glmutex_lock(gl);
1259
1260 spin_lock(&gl->gl_spin);
1261 list_del_init(&gh->gh_list); 1003 list_del_init(&gh->gh_list);
1262 1004 if (find_first_holder(gl) == NULL) {
1263 if (list_empty(&gl->gl_holders)) {
1264 if (glops->go_unlock) { 1005 if (glops->go_unlock) {
1006 GLOCK_BUG_ON(gl, test_and_set_bit(GLF_LOCK, &gl->gl_flags));
1265 spin_unlock(&gl->gl_spin); 1007 spin_unlock(&gl->gl_spin);
1266 glops->go_unlock(gh); 1008 glops->go_unlock(gh);
1267 spin_lock(&gl->gl_spin); 1009 spin_lock(&gl->gl_spin);
1010 clear_bit(GLF_LOCK, &gl->gl_flags);
1268 } 1011 }
1269 gl->gl_stamp = jiffies; 1012 gl->gl_stamp = jiffies;
1013 if (list_empty(&gl->gl_holders) &&
1014 !test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
1015 !test_bit(GLF_DEMOTE, &gl->gl_flags))
1016 fast_path = 1;
1270 } 1017 }
1271
1272 clear_bit(GLF_LOCK, &gl->gl_flags);
1273 spin_unlock(&gl->gl_spin); 1018 spin_unlock(&gl->gl_spin);
1019 if (likely(fast_path))
1020 return;
1274 1021
1275 gfs2_glock_hold(gl); 1022 gfs2_glock_hold(gl);
1276 if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && 1023 if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
@@ -1454,6 +1201,8 @@ void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs)
1454static int gfs2_lm_hold_lvb(struct gfs2_sbd *sdp, void *lock, char **lvbp) 1201static int gfs2_lm_hold_lvb(struct gfs2_sbd *sdp, void *lock, char **lvbp)
1455{ 1202{
1456 int error = -EIO; 1203 int error = -EIO;
1204 if (!sdp->sd_lockstruct.ls_ops->lm_hold_lvb)
1205 return 0;
1457 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) 1206 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
1458 error = sdp->sd_lockstruct.ls_ops->lm_hold_lvb(lock, lvbp); 1207 error = sdp->sd_lockstruct.ls_ops->lm_hold_lvb(lock, lvbp);
1459 return error; 1208 return error;
@@ -1469,20 +1218,14 @@ int gfs2_lvb_hold(struct gfs2_glock *gl)
1469{ 1218{
1470 int error; 1219 int error;
1471 1220
1472 gfs2_glmutex_lock(gl);
1473
1474 if (!atomic_read(&gl->gl_lvb_count)) { 1221 if (!atomic_read(&gl->gl_lvb_count)) {
1475 error = gfs2_lm_hold_lvb(gl->gl_sbd, gl->gl_lock, &gl->gl_lvb); 1222 error = gfs2_lm_hold_lvb(gl->gl_sbd, gl->gl_lock, &gl->gl_lvb);
1476 if (error) { 1223 if (error)
1477 gfs2_glmutex_unlock(gl);
1478 return error; 1224 return error;
1479 }
1480 gfs2_glock_hold(gl); 1225 gfs2_glock_hold(gl);
1481 } 1226 }
1482 atomic_inc(&gl->gl_lvb_count); 1227 atomic_inc(&gl->gl_lvb_count);
1483 1228
1484 gfs2_glmutex_unlock(gl);
1485
1486 return 0; 1229 return 0;
1487} 1230}
1488 1231
@@ -1497,17 +1240,13 @@ void gfs2_lvb_unhold(struct gfs2_glock *gl)
1497 struct gfs2_sbd *sdp = gl->gl_sbd; 1240 struct gfs2_sbd *sdp = gl->gl_sbd;
1498 1241
1499 gfs2_glock_hold(gl); 1242 gfs2_glock_hold(gl);
1500 gfs2_glmutex_lock(gl);
1501
1502 gfs2_assert(gl->gl_sbd, atomic_read(&gl->gl_lvb_count) > 0); 1243 gfs2_assert(gl->gl_sbd, atomic_read(&gl->gl_lvb_count) > 0);
1503 if (atomic_dec_and_test(&gl->gl_lvb_count)) { 1244 if (atomic_dec_and_test(&gl->gl_lvb_count)) {
1504 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) 1245 if (sdp->sd_lockstruct.ls_ops->lm_unhold_lvb)
1505 sdp->sd_lockstruct.ls_ops->lm_unhold_lvb(gl->gl_lock, gl->gl_lvb); 1246 sdp->sd_lockstruct.ls_ops->lm_unhold_lvb(gl->gl_lock, gl->gl_lvb);
1506 gl->gl_lvb = NULL; 1247 gl->gl_lvb = NULL;
1507 gfs2_glock_put(gl); 1248 gfs2_glock_put(gl);
1508 } 1249 }
1509
1510 gfs2_glmutex_unlock(gl);
1511 gfs2_glock_put(gl); 1250 gfs2_glock_put(gl);
1512} 1251}
1513 1252
@@ -1527,7 +1266,9 @@ static void blocking_cb(struct gfs2_sbd *sdp, struct lm_lockname *name,
1527 if (time_before(now, holdtime)) 1266 if (time_before(now, holdtime))
1528 delay = holdtime - now; 1267 delay = holdtime - now;
1529 1268
1269 spin_lock(&gl->gl_spin);
1530 handle_callback(gl, state, 1, delay); 1270 handle_callback(gl, state, 1, delay);
1271 spin_unlock(&gl->gl_spin);
1531 if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0) 1272 if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0)
1532 gfs2_glock_put(gl); 1273 gfs2_glock_put(gl);
1533} 1274}
@@ -1568,7 +1309,8 @@ void gfs2_glock_cb(void *cb_data, unsigned int type, void *data)
1568 gl = gfs2_glock_find(sdp, &async->lc_name); 1309 gl = gfs2_glock_find(sdp, &async->lc_name);
1569 if (gfs2_assert_warn(sdp, gl)) 1310 if (gfs2_assert_warn(sdp, gl))
1570 return; 1311 return;
1571 xmote_bh(gl, async->lc_ret); 1312 gl->gl_reply = async->lc_ret;
1313 set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
1572 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) 1314 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
1573 gfs2_glock_put(gl); 1315 gfs2_glock_put(gl);
1574 up_read(&gfs2_umount_flush_sem); 1316 up_read(&gfs2_umount_flush_sem);
@@ -1581,11 +1323,6 @@ void gfs2_glock_cb(void *cb_data, unsigned int type, void *data)
1581 wake_up_process(sdp->sd_recoverd_process); 1323 wake_up_process(sdp->sd_recoverd_process);
1582 return; 1324 return;
1583 1325
1584 case LM_CB_DROPLOCKS:
1585 gfs2_gl_hash_clear(sdp, NO_WAIT);
1586 gfs2_quota_scan(sdp);
1587 return;
1588
1589 default: 1326 default:
1590 gfs2_assert_warn(sdp, 0); 1327 gfs2_assert_warn(sdp, 0);
1591 return; 1328 return;
@@ -1646,6 +1383,7 @@ void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
1646void gfs2_reclaim_glock(struct gfs2_sbd *sdp) 1383void gfs2_reclaim_glock(struct gfs2_sbd *sdp)
1647{ 1384{
1648 struct gfs2_glock *gl; 1385 struct gfs2_glock *gl;
1386 int done_callback = 0;
1649 1387
1650 spin_lock(&sdp->sd_reclaim_lock); 1388 spin_lock(&sdp->sd_reclaim_lock);
1651 if (list_empty(&sdp->sd_reclaim_list)) { 1389 if (list_empty(&sdp->sd_reclaim_list)) {
@@ -1660,14 +1398,16 @@ void gfs2_reclaim_glock(struct gfs2_sbd *sdp)
1660 atomic_dec(&sdp->sd_reclaim_count); 1398 atomic_dec(&sdp->sd_reclaim_count);
1661 atomic_inc(&sdp->sd_reclaimed); 1399 atomic_inc(&sdp->sd_reclaimed);
1662 1400
1663 if (gfs2_glmutex_trylock(gl)) { 1401 spin_lock(&gl->gl_spin);
1664 if (list_empty(&gl->gl_holders) && 1402 if (find_first_holder(gl) == NULL &&
1665 gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl)) 1403 gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl)) {
1666 handle_callback(gl, LM_ST_UNLOCKED, 0, 0); 1404 handle_callback(gl, LM_ST_UNLOCKED, 0, 0);
1667 gfs2_glmutex_unlock(gl); 1405 done_callback = 1;
1668 } 1406 }
1669 1407 spin_unlock(&gl->gl_spin);
1670 gfs2_glock_put(gl); 1408 if (!done_callback ||
1409 queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
1410 gfs2_glock_put(gl);
1671} 1411}
1672 1412
1673/** 1413/**
@@ -1724,18 +1464,14 @@ static void scan_glock(struct gfs2_glock *gl)
1724{ 1464{
1725 if (gl->gl_ops == &gfs2_inode_glops && gl->gl_object) 1465 if (gl->gl_ops == &gfs2_inode_glops && gl->gl_object)
1726 return; 1466 return;
1467 if (test_bit(GLF_LOCK, &gl->gl_flags))
1468 return;
1727 1469
1728 if (gfs2_glmutex_trylock(gl)) { 1470 spin_lock(&gl->gl_spin);
1729 if (list_empty(&gl->gl_holders) && 1471 if (find_first_holder(gl) == NULL &&
1730 gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl)) 1472 gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl))
1731 goto out_schedule; 1473 gfs2_glock_schedule_for_reclaim(gl);
1732 gfs2_glmutex_unlock(gl); 1474 spin_unlock(&gl->gl_spin);
1733 }
1734 return;
1735
1736out_schedule:
1737 gfs2_glmutex_unlock(gl);
1738 gfs2_glock_schedule_for_reclaim(gl);
1739} 1475}
1740 1476
1741/** 1477/**
@@ -1760,12 +1496,13 @@ static void clear_glock(struct gfs2_glock *gl)
1760 spin_unlock(&sdp->sd_reclaim_lock); 1496 spin_unlock(&sdp->sd_reclaim_lock);
1761 } 1497 }
1762 1498
1763 if (gfs2_glmutex_trylock(gl)) { 1499 spin_lock(&gl->gl_spin);
1764 if (list_empty(&gl->gl_holders) && 1500 if (find_first_holder(gl) == NULL && gl->gl_state != LM_ST_UNLOCKED)
1765 gl->gl_state != LM_ST_UNLOCKED) 1501 handle_callback(gl, LM_ST_UNLOCKED, 0, 0);
1766 handle_callback(gl, LM_ST_UNLOCKED, 0, 0); 1502 spin_unlock(&gl->gl_spin);
1767 gfs2_glmutex_unlock(gl); 1503 gfs2_glock_hold(gl);
1768 } 1504 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
1505 gfs2_glock_put(gl);
1769} 1506}
1770 1507
1771/** 1508/**
@@ -1773,11 +1510,10 @@ static void clear_glock(struct gfs2_glock *gl)
1773 * @sdp: the filesystem 1510 * @sdp: the filesystem
1774 * @wait: wait until it's all gone 1511 * @wait: wait until it's all gone
1775 * 1512 *
1776 * Called when unmounting the filesystem, or when inter-node lock manager 1513 * Called when unmounting the filesystem.
1777 * requests DROPLOCKS because it is running out of capacity.
1778 */ 1514 */
1779 1515
1780void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait) 1516void gfs2_gl_hash_clear(struct gfs2_sbd *sdp)
1781{ 1517{
1782 unsigned long t; 1518 unsigned long t;
1783 unsigned int x; 1519 unsigned int x;
@@ -1792,7 +1528,7 @@ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait)
1792 cont = 1; 1528 cont = 1;
1793 } 1529 }
1794 1530
1795 if (!wait || !cont) 1531 if (!cont)
1796 break; 1532 break;
1797 1533
1798 if (time_after_eq(jiffies, 1534 if (time_after_eq(jiffies,
@@ -1810,180 +1546,164 @@ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait)
1810 } 1546 }
1811} 1547}
1812 1548
1813/* 1549static const char *state2str(unsigned state)
1814 * Diagnostic routines to help debug distributed deadlock
1815 */
1816
1817static void gfs2_print_symbol(struct glock_iter *gi, const char *fmt,
1818 unsigned long address)
1819{ 1550{
1820 char buffer[KSYM_SYMBOL_LEN]; 1551 switch(state) {
1821 1552 case LM_ST_UNLOCKED:
1822 sprint_symbol(buffer, address); 1553 return "UN";
1823 print_dbg(gi, fmt, buffer); 1554 case LM_ST_SHARED:
1555 return "SH";
1556 case LM_ST_DEFERRED:
1557 return "DF";
1558 case LM_ST_EXCLUSIVE:
1559 return "EX";
1560 }
1561 return "??";
1562}
1563
1564static const char *hflags2str(char *buf, unsigned flags, unsigned long iflags)
1565{
1566 char *p = buf;
1567 if (flags & LM_FLAG_TRY)
1568 *p++ = 't';
1569 if (flags & LM_FLAG_TRY_1CB)
1570 *p++ = 'T';
1571 if (flags & LM_FLAG_NOEXP)
1572 *p++ = 'e';
1573 if (flags & LM_FLAG_ANY)
1574 *p++ = 'a';
1575 if (flags & LM_FLAG_PRIORITY)
1576 *p++ = 'p';
1577 if (flags & GL_ASYNC)
1578 *p++ = 'a';
1579 if (flags & GL_EXACT)
1580 *p++ = 'E';
1581 if (flags & GL_ATIME)
1582 *p++ = 'a';
1583 if (flags & GL_NOCACHE)
1584 *p++ = 'c';
1585 if (test_bit(HIF_HOLDER, &iflags))
1586 *p++ = 'H';
1587 if (test_bit(HIF_WAIT, &iflags))
1588 *p++ = 'W';
1589 if (test_bit(HIF_FIRST, &iflags))
1590 *p++ = 'F';
1591 *p = 0;
1592 return buf;
1824} 1593}
1825 1594
1826/** 1595/**
1827 * dump_holder - print information about a glock holder 1596 * dump_holder - print information about a glock holder
1828 * @str: a string naming the type of holder 1597 * @seq: the seq_file struct
1829 * @gh: the glock holder 1598 * @gh: the glock holder
1830 * 1599 *
1831 * Returns: 0 on success, -ENOBUFS when we run out of space 1600 * Returns: 0 on success, -ENOBUFS when we run out of space
1832 */ 1601 */
1833 1602
1834static int dump_holder(struct glock_iter *gi, char *str, 1603static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh)
1835 struct gfs2_holder *gh)
1836{ 1604{
1837 unsigned int x; 1605 struct task_struct *gh_owner = NULL;
1838 struct task_struct *gh_owner; 1606 char buffer[KSYM_SYMBOL_LEN];
1607 char flags_buf[32];
1839 1608
1840 print_dbg(gi, " %s\n", str); 1609 sprint_symbol(buffer, gh->gh_ip);
1841 if (gh->gh_owner_pid) { 1610 if (gh->gh_owner_pid)
1842 print_dbg(gi, " owner = %ld ",
1843 (long)pid_nr(gh->gh_owner_pid));
1844 gh_owner = pid_task(gh->gh_owner_pid, PIDTYPE_PID); 1611 gh_owner = pid_task(gh->gh_owner_pid, PIDTYPE_PID);
1845 if (gh_owner) 1612 gfs2_print_dbg(seq, " H: s:%s f:%s e:%d p:%ld [%s] %s\n",
1846 print_dbg(gi, "(%s)\n", gh_owner->comm); 1613 state2str(gh->gh_state),
1847 else 1614 hflags2str(flags_buf, gh->gh_flags, gh->gh_iflags),
1848 print_dbg(gi, "(ended)\n"); 1615 gh->gh_error,
1849 } else 1616 gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1,
1850 print_dbg(gi, " owner = -1\n"); 1617 gh_owner ? gh_owner->comm : "(ended)", buffer);
1851 print_dbg(gi, " gh_state = %u\n", gh->gh_state);
1852 print_dbg(gi, " gh_flags =");
1853 for (x = 0; x < 32; x++)
1854 if (gh->gh_flags & (1 << x))
1855 print_dbg(gi, " %u", x);
1856 print_dbg(gi, " \n");
1857 print_dbg(gi, " error = %d\n", gh->gh_error);
1858 print_dbg(gi, " gh_iflags =");
1859 for (x = 0; x < 32; x++)
1860 if (test_bit(x, &gh->gh_iflags))
1861 print_dbg(gi, " %u", x);
1862 print_dbg(gi, " \n");
1863 gfs2_print_symbol(gi, " initialized at: %s\n", gh->gh_ip);
1864
1865 return 0; 1618 return 0;
1866} 1619}
1867 1620
1868/** 1621static const char *gflags2str(char *buf, const unsigned long *gflags)
1869 * dump_inode - print information about an inode 1622{
1870 * @ip: the inode 1623 char *p = buf;
1871 * 1624 if (test_bit(GLF_LOCK, gflags))
1872 * Returns: 0 on success, -ENOBUFS when we run out of space 1625 *p++ = 'l';
1873 */ 1626 if (test_bit(GLF_STICKY, gflags))
1874 1627 *p++ = 's';
1875static int dump_inode(struct glock_iter *gi, struct gfs2_inode *ip) 1628 if (test_bit(GLF_DEMOTE, gflags))
1876{ 1629 *p++ = 'D';
1877 unsigned int x; 1630 if (test_bit(GLF_PENDING_DEMOTE, gflags))
1878 1631 *p++ = 'd';
1879 print_dbg(gi, " Inode:\n"); 1632 if (test_bit(GLF_DEMOTE_IN_PROGRESS, gflags))
1880 print_dbg(gi, " num = %llu/%llu\n", 1633 *p++ = 'p';
1881 (unsigned long long)ip->i_no_formal_ino, 1634 if (test_bit(GLF_DIRTY, gflags))
1882 (unsigned long long)ip->i_no_addr); 1635 *p++ = 'y';
1883 print_dbg(gi, " type = %u\n", IF2DT(ip->i_inode.i_mode)); 1636 if (test_bit(GLF_LFLUSH, gflags))
1884 print_dbg(gi, " i_flags ="); 1637 *p++ = 'f';
1885 for (x = 0; x < 32; x++) 1638 if (test_bit(GLF_INVALIDATE_IN_PROGRESS, gflags))
1886 if (test_bit(x, &ip->i_flags)) 1639 *p++ = 'i';
1887 print_dbg(gi, " %u", x); 1640 if (test_bit(GLF_REPLY_PENDING, gflags))
1888 print_dbg(gi, " \n"); 1641 *p++ = 'r';
1889 return 0; 1642 *p = 0;
1643 return buf;
1890} 1644}
1891 1645
1892/** 1646/**
1893 * dump_glock - print information about a glock 1647 * __dump_glock - print information about a glock
1648 * @seq: The seq_file struct
1894 * @gl: the glock 1649 * @gl: the glock
1895 * @count: where we are in the buffer 1650 *
1651 * The file format is as follows:
1652 * One line per object, capital letters are used to indicate objects
1653 * G = glock, I = Inode, R = rgrp, H = holder. Glocks are not indented,
1654 * other objects are indented by a single space and follow the glock to
1655 * which they are related. Fields are indicated by lower case letters
1656 * followed by a colon and the field value, except for strings which are in
1657 * [] so that its possible to see if they are composed of spaces for
1658 * example. The field's are n = number (id of the object), f = flags,
1659 * t = type, s = state, r = refcount, e = error, p = pid.
1896 * 1660 *
1897 * Returns: 0 on success, -ENOBUFS when we run out of space 1661 * Returns: 0 on success, -ENOBUFS when we run out of space
1898 */ 1662 */
1899 1663
1900static int dump_glock(struct glock_iter *gi, struct gfs2_glock *gl) 1664static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl)
1901{ 1665{
1902 struct gfs2_holder *gh; 1666 const struct gfs2_glock_operations *glops = gl->gl_ops;
1903 unsigned int x; 1667 unsigned long long dtime;
1904 int error = -ENOBUFS; 1668 const struct gfs2_holder *gh;
1905 struct task_struct *gl_owner; 1669 char gflags_buf[32];
1670 int error = 0;
1906 1671
1907 spin_lock(&gl->gl_spin); 1672 dtime = jiffies - gl->gl_demote_time;
1673 dtime *= 1000000/HZ; /* demote time in uSec */
1674 if (!test_bit(GLF_DEMOTE, &gl->gl_flags))
1675 dtime = 0;
1676 gfs2_print_dbg(seq, "G: s:%s n:%u/%llu f:%s t:%s d:%s/%llu l:%d a:%d r:%d\n",
1677 state2str(gl->gl_state),
1678 gl->gl_name.ln_type,
1679 (unsigned long long)gl->gl_name.ln_number,
1680 gflags2str(gflags_buf, &gl->gl_flags),
1681 state2str(gl->gl_target),
1682 state2str(gl->gl_demote_state), dtime,
1683 atomic_read(&gl->gl_lvb_count),
1684 atomic_read(&gl->gl_ail_count),
1685 atomic_read(&gl->gl_ref));
1908 1686
1909 print_dbg(gi, "Glock 0x%p (%u, 0x%llx)\n", gl, gl->gl_name.ln_type,
1910 (unsigned long long)gl->gl_name.ln_number);
1911 print_dbg(gi, " gl_flags =");
1912 for (x = 0; x < 32; x++) {
1913 if (test_bit(x, &gl->gl_flags))
1914 print_dbg(gi, " %u", x);
1915 }
1916 if (!test_bit(GLF_LOCK, &gl->gl_flags))
1917 print_dbg(gi, " (unlocked)");
1918 print_dbg(gi, " \n");
1919 print_dbg(gi, " gl_ref = %d\n", atomic_read(&gl->gl_ref));
1920 print_dbg(gi, " gl_state = %u\n", gl->gl_state);
1921 if (gl->gl_owner_pid) {
1922 gl_owner = pid_task(gl->gl_owner_pid, PIDTYPE_PID);
1923 if (gl_owner)
1924 print_dbg(gi, " gl_owner = pid %d (%s)\n",
1925 pid_nr(gl->gl_owner_pid), gl_owner->comm);
1926 else
1927 print_dbg(gi, " gl_owner = %d (ended)\n",
1928 pid_nr(gl->gl_owner_pid));
1929 } else
1930 print_dbg(gi, " gl_owner = -1\n");
1931 print_dbg(gi, " gl_ip = %lu\n", gl->gl_ip);
1932 print_dbg(gi, " req_gh = %s\n", (gl->gl_req_gh) ? "yes" : "no");
1933 print_dbg(gi, " lvb_count = %d\n", atomic_read(&gl->gl_lvb_count));
1934 print_dbg(gi, " object = %s\n", (gl->gl_object) ? "yes" : "no");
1935 print_dbg(gi, " reclaim = %s\n",
1936 (list_empty(&gl->gl_reclaim)) ? "no" : "yes");
1937 if (gl->gl_aspace)
1938 print_dbg(gi, " aspace = 0x%p nrpages = %lu\n", gl->gl_aspace,
1939 gl->gl_aspace->i_mapping->nrpages);
1940 else
1941 print_dbg(gi, " aspace = no\n");
1942 print_dbg(gi, " ail = %d\n", atomic_read(&gl->gl_ail_count));
1943 if (gl->gl_req_gh) {
1944 error = dump_holder(gi, "Request", gl->gl_req_gh);
1945 if (error)
1946 goto out;
1947 }
1948 list_for_each_entry(gh, &gl->gl_holders, gh_list) { 1687 list_for_each_entry(gh, &gl->gl_holders, gh_list) {
1949 error = dump_holder(gi, "Holder", gh); 1688 error = dump_holder(seq, gh);
1950 if (error) 1689 if (error)
1951 goto out; 1690 goto out;
1952 } 1691 }
1953 list_for_each_entry(gh, &gl->gl_waiters1, gh_list) { 1692 if (gl->gl_state != LM_ST_UNLOCKED && glops->go_dump)
1954 error = dump_holder(gi, "Waiter1", gh); 1693 error = glops->go_dump(seq, gl);
1955 if (error)
1956 goto out;
1957 }
1958 list_for_each_entry(gh, &gl->gl_waiters3, gh_list) {
1959 error = dump_holder(gi, "Waiter3", gh);
1960 if (error)
1961 goto out;
1962 }
1963 if (test_bit(GLF_DEMOTE, &gl->gl_flags)) {
1964 print_dbg(gi, " Demotion req to state %u (%llu uS ago)\n",
1965 gl->gl_demote_state, (unsigned long long)
1966 (jiffies - gl->gl_demote_time)*(1000000/HZ));
1967 }
1968 if (gl->gl_ops == &gfs2_inode_glops && gl->gl_object) {
1969 if (!test_bit(GLF_LOCK, &gl->gl_flags) &&
1970 list_empty(&gl->gl_holders)) {
1971 error = dump_inode(gi, gl->gl_object);
1972 if (error)
1973 goto out;
1974 } else {
1975 error = -ENOBUFS;
1976 print_dbg(gi, " Inode: busy\n");
1977 }
1978 }
1979
1980 error = 0;
1981
1982out: 1694out:
1983 spin_unlock(&gl->gl_spin);
1984 return error; 1695 return error;
1985} 1696}
1986 1697
1698static int dump_glock(struct seq_file *seq, struct gfs2_glock *gl)
1699{
1700 int ret;
1701 spin_lock(&gl->gl_spin);
1702 ret = __dump_glock(seq, gl);
1703 spin_unlock(&gl->gl_spin);
1704 return ret;
1705}
1706
1987/** 1707/**
1988 * gfs2_dump_lockstate - print out the current lockstate 1708 * gfs2_dump_lockstate - print out the current lockstate
1989 * @sdp: the filesystem 1709 * @sdp: the filesystem
@@ -2086,7 +1806,7 @@ void gfs2_glock_exit(void)
2086module_param(scand_secs, uint, S_IRUGO|S_IWUSR); 1806module_param(scand_secs, uint, S_IRUGO|S_IWUSR);
2087MODULE_PARM_DESC(scand_secs, "The number of seconds between scand runs"); 1807MODULE_PARM_DESC(scand_secs, "The number of seconds between scand runs");
2088 1808
2089static int gfs2_glock_iter_next(struct glock_iter *gi) 1809static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi)
2090{ 1810{
2091 struct gfs2_glock *gl; 1811 struct gfs2_glock *gl;
2092 1812
@@ -2104,7 +1824,7 @@ restart:
2104 gfs2_glock_put(gl); 1824 gfs2_glock_put(gl);
2105 if (gl && gi->gl == NULL) 1825 if (gl && gi->gl == NULL)
2106 gi->hash++; 1826 gi->hash++;
2107 while(gi->gl == NULL) { 1827 while (gi->gl == NULL) {
2108 if (gi->hash >= GFS2_GL_HASH_SIZE) 1828 if (gi->hash >= GFS2_GL_HASH_SIZE)
2109 return 1; 1829 return 1;
2110 read_lock(gl_lock_addr(gi->hash)); 1830 read_lock(gl_lock_addr(gi->hash));
@@ -2122,58 +1842,34 @@ restart:
2122 return 0; 1842 return 0;
2123} 1843}
2124 1844
2125static void gfs2_glock_iter_free(struct glock_iter *gi) 1845static void gfs2_glock_iter_free(struct gfs2_glock_iter *gi)
2126{ 1846{
2127 if (gi->gl) 1847 if (gi->gl)
2128 gfs2_glock_put(gi->gl); 1848 gfs2_glock_put(gi->gl);
2129 kfree(gi);
2130}
2131
2132static struct glock_iter *gfs2_glock_iter_init(struct gfs2_sbd *sdp)
2133{
2134 struct glock_iter *gi;
2135
2136 gi = kmalloc(sizeof (*gi), GFP_KERNEL);
2137 if (!gi)
2138 return NULL;
2139
2140 gi->sdp = sdp;
2141 gi->hash = 0;
2142 gi->seq = NULL;
2143 gi->gl = NULL; 1849 gi->gl = NULL;
2144 memset(gi->string, 0, sizeof(gi->string));
2145
2146 if (gfs2_glock_iter_next(gi)) {
2147 gfs2_glock_iter_free(gi);
2148 return NULL;
2149 }
2150
2151 return gi;
2152} 1850}
2153 1851
2154static void *gfs2_glock_seq_start(struct seq_file *file, loff_t *pos) 1852static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos)
2155{ 1853{
2156 struct glock_iter *gi; 1854 struct gfs2_glock_iter *gi = seq->private;
2157 loff_t n = *pos; 1855 loff_t n = *pos;
2158 1856
2159 gi = gfs2_glock_iter_init(file->private); 1857 gi->hash = 0;
2160 if (!gi)
2161 return NULL;
2162 1858
2163 while(n--) { 1859 do {
2164 if (gfs2_glock_iter_next(gi)) { 1860 if (gfs2_glock_iter_next(gi)) {
2165 gfs2_glock_iter_free(gi); 1861 gfs2_glock_iter_free(gi);
2166 return NULL; 1862 return NULL;
2167 } 1863 }
2168 } 1864 } while (n--);
2169 1865
2170 return gi; 1866 return gi->gl;
2171} 1867}
2172 1868
2173static void *gfs2_glock_seq_next(struct seq_file *file, void *iter_ptr, 1869static void *gfs2_glock_seq_next(struct seq_file *seq, void *iter_ptr,
2174 loff_t *pos) 1870 loff_t *pos)
2175{ 1871{
2176 struct glock_iter *gi = iter_ptr; 1872 struct gfs2_glock_iter *gi = seq->private;
2177 1873
2178 (*pos)++; 1874 (*pos)++;
2179 1875
@@ -2182,24 +1878,18 @@ static void *gfs2_glock_seq_next(struct seq_file *file, void *iter_ptr,
2182 return NULL; 1878 return NULL;
2183 } 1879 }
2184 1880
2185 return gi; 1881 return gi->gl;
2186} 1882}
2187 1883
2188static void gfs2_glock_seq_stop(struct seq_file *file, void *iter_ptr) 1884static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr)
2189{ 1885{
2190 struct glock_iter *gi = iter_ptr; 1886 struct gfs2_glock_iter *gi = seq->private;
2191 if (gi) 1887 gfs2_glock_iter_free(gi);
2192 gfs2_glock_iter_free(gi);
2193} 1888}
2194 1889
2195static int gfs2_glock_seq_show(struct seq_file *file, void *iter_ptr) 1890static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr)
2196{ 1891{
2197 struct glock_iter *gi = iter_ptr; 1892 return dump_glock(seq, iter_ptr);
2198
2199 gi->seq = file;
2200 dump_glock(gi, gi->gl);
2201
2202 return 0;
2203} 1893}
2204 1894
2205static const struct seq_operations gfs2_glock_seq_ops = { 1895static const struct seq_operations gfs2_glock_seq_ops = {
@@ -2211,17 +1901,14 @@ static const struct seq_operations gfs2_glock_seq_ops = {
2211 1901
2212static int gfs2_debugfs_open(struct inode *inode, struct file *file) 1902static int gfs2_debugfs_open(struct inode *inode, struct file *file)
2213{ 1903{
2214 struct seq_file *seq; 1904 int ret = seq_open_private(file, &gfs2_glock_seq_ops,
2215 int ret; 1905 sizeof(struct gfs2_glock_iter));
2216 1906 if (ret == 0) {
2217 ret = seq_open(file, &gfs2_glock_seq_ops); 1907 struct seq_file *seq = file->private_data;
2218 if (ret) 1908 struct gfs2_glock_iter *gi = seq->private;
2219 return ret; 1909 gi->sdp = inode->i_private;
2220 1910 }
2221 seq = file->private_data; 1911 return ret;
2222 seq->private = inode->i_private;
2223
2224 return 0;
2225} 1912}
2226 1913
2227static const struct file_operations gfs2_debug_fops = { 1914static const struct file_operations gfs2_debug_fops = {
@@ -2229,7 +1916,7 @@ static const struct file_operations gfs2_debug_fops = {
2229 .open = gfs2_debugfs_open, 1916 .open = gfs2_debugfs_open,
2230 .read = seq_read, 1917 .read = seq_read,
2231 .llseek = seq_lseek, 1918 .llseek = seq_lseek,
2232 .release = seq_release 1919 .release = seq_release_private,
2233}; 1920};
2234 1921
2235int gfs2_create_debugfs_file(struct gfs2_sbd *sdp) 1922int gfs2_create_debugfs_file(struct gfs2_sbd *sdp)
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index cdad3e6f8150..971d92af70fc 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -26,11 +26,8 @@
26#define GL_SKIP 0x00000100 26#define GL_SKIP 0x00000100
27#define GL_ATIME 0x00000200 27#define GL_ATIME 0x00000200
28#define GL_NOCACHE 0x00000400 28#define GL_NOCACHE 0x00000400
29#define GL_FLOCK 0x00000800
30#define GL_NOCANCEL 0x00001000
31 29
32#define GLR_TRYFAILED 13 30#define GLR_TRYFAILED 13
33#define GLR_CANCELED 14
34 31
35static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl) 32static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl)
36{ 33{
@@ -41,6 +38,8 @@ static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *
41 spin_lock(&gl->gl_spin); 38 spin_lock(&gl->gl_spin);
42 pid = task_pid(current); 39 pid = task_pid(current);
43 list_for_each_entry(gh, &gl->gl_holders, gh_list) { 40 list_for_each_entry(gh, &gl->gl_holders, gh_list) {
41 if (!test_bit(HIF_HOLDER, &gh->gh_iflags))
42 break;
44 if (gh->gh_owner_pid == pid) 43 if (gh->gh_owner_pid == pid)
45 goto out; 44 goto out;
46 } 45 }
@@ -70,7 +69,7 @@ static inline int gfs2_glock_is_blocking(struct gfs2_glock *gl)
70{ 69{
71 int ret; 70 int ret;
72 spin_lock(&gl->gl_spin); 71 spin_lock(&gl->gl_spin);
73 ret = test_bit(GLF_DEMOTE, &gl->gl_flags) || !list_empty(&gl->gl_waiters3); 72 ret = test_bit(GLF_DEMOTE, &gl->gl_flags);
74 spin_unlock(&gl->gl_spin); 73 spin_unlock(&gl->gl_spin);
75 return ret; 74 return ret;
76} 75}
@@ -98,6 +97,7 @@ int gfs2_glock_nq_num(struct gfs2_sbd *sdp,
98int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs); 97int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs);
99void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs); 98void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs);
100void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs); 99void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs);
100void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...);
101 101
102/** 102/**
103 * gfs2_glock_nq_init - intialize a holder and enqueue it on a glock 103 * gfs2_glock_nq_init - intialize a holder and enqueue it on a glock
@@ -130,10 +130,9 @@ int gfs2_lvb_hold(struct gfs2_glock *gl);
130void gfs2_lvb_unhold(struct gfs2_glock *gl); 130void gfs2_lvb_unhold(struct gfs2_glock *gl);
131 131
132void gfs2_glock_cb(void *cb_data, unsigned int type, void *data); 132void gfs2_glock_cb(void *cb_data, unsigned int type, void *data);
133
134void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl); 133void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl);
135void gfs2_reclaim_glock(struct gfs2_sbd *sdp); 134void gfs2_reclaim_glock(struct gfs2_sbd *sdp);
136void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait); 135void gfs2_gl_hash_clear(struct gfs2_sbd *sdp);
137 136
138int __init gfs2_glock_init(void); 137int __init gfs2_glock_init(void);
139void gfs2_glock_exit(void); 138void gfs2_glock_exit(void);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 07d84d16cda4..c6c318c2a0f6 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -13,6 +13,7 @@
13#include <linux/buffer_head.h> 13#include <linux/buffer_head.h>
14#include <linux/gfs2_ondisk.h> 14#include <linux/gfs2_ondisk.h>
15#include <linux/lm_interface.h> 15#include <linux/lm_interface.h>
16#include <linux/bio.h>
16 17
17#include "gfs2.h" 18#include "gfs2.h"
18#include "incore.h" 19#include "incore.h"
@@ -172,26 +173,6 @@ static void inode_go_sync(struct gfs2_glock *gl)
172} 173}
173 174
174/** 175/**
175 * inode_go_xmote_bh - After promoting/demoting a glock
176 * @gl: the glock
177 *
178 */
179
180static void inode_go_xmote_bh(struct gfs2_glock *gl)
181{
182 struct gfs2_holder *gh = gl->gl_req_gh;
183 struct buffer_head *bh;
184 int error;
185
186 if (gl->gl_state != LM_ST_UNLOCKED &&
187 (!gh || !(gh->gh_flags & GL_SKIP))) {
188 error = gfs2_meta_read(gl, gl->gl_name.ln_number, 0, &bh);
189 if (!error)
190 brelse(bh);
191 }
192}
193
194/**
195 * inode_go_inval - prepare a inode glock to be released 176 * inode_go_inval - prepare a inode glock to be released
196 * @gl: the glock 177 * @gl: the glock
197 * @flags: 178 * @flags:
@@ -267,6 +248,26 @@ static int inode_go_lock(struct gfs2_holder *gh)
267} 248}
268 249
269/** 250/**
251 * inode_go_dump - print information about an inode
252 * @seq: The iterator
253 * @ip: the inode
254 *
255 * Returns: 0 on success, -ENOBUFS when we run out of space
256 */
257
258static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
259{
260 const struct gfs2_inode *ip = gl->gl_object;
261 if (ip == NULL)
262 return 0;
263 gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%08lx\n",
264 (unsigned long long)ip->i_no_formal_ino,
265 (unsigned long long)ip->i_no_addr,
266 IF2DT(ip->i_inode.i_mode), ip->i_flags);
267 return 0;
268}
269
270/**
270 * rgrp_go_demote_ok - Check to see if it's ok to unlock a RG's glock 271 * rgrp_go_demote_ok - Check to see if it's ok to unlock a RG's glock
271 * @gl: the glock 272 * @gl: the glock
272 * 273 *
@@ -306,6 +307,22 @@ static void rgrp_go_unlock(struct gfs2_holder *gh)
306} 307}
307 308
308/** 309/**
310 * rgrp_go_dump - print out an rgrp
311 * @seq: The iterator
312 * @gl: The glock in question
313 *
314 */
315
316static int rgrp_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
317{
318 const struct gfs2_rgrpd *rgd = gl->gl_object;
319 if (rgd == NULL)
320 return 0;
321 gfs2_print_dbg(seq, " R: n:%llu\n", (unsigned long long)rgd->rd_addr);
322 return 0;
323}
324
325/**
309 * trans_go_sync - promote/demote the transaction glock 326 * trans_go_sync - promote/demote the transaction glock
310 * @gl: the glock 327 * @gl: the glock
311 * @state: the requested state 328 * @state: the requested state
@@ -330,7 +347,7 @@ static void trans_go_sync(struct gfs2_glock *gl)
330 * 347 *
331 */ 348 */
332 349
333static void trans_go_xmote_bh(struct gfs2_glock *gl) 350static int trans_go_xmote_bh(struct gfs2_glock *gl, struct gfs2_holder *gh)
334{ 351{
335 struct gfs2_sbd *sdp = gl->gl_sbd; 352 struct gfs2_sbd *sdp = gl->gl_sbd;
336 struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode); 353 struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode);
@@ -338,8 +355,7 @@ static void trans_go_xmote_bh(struct gfs2_glock *gl)
338 struct gfs2_log_header_host head; 355 struct gfs2_log_header_host head;
339 int error; 356 int error;
340 357
341 if (gl->gl_state != LM_ST_UNLOCKED && 358 if (test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
342 test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
343 j_gl->gl_ops->go_inval(j_gl, DIO_METADATA); 359 j_gl->gl_ops->go_inval(j_gl, DIO_METADATA);
344 360
345 error = gfs2_find_jhead(sdp->sd_jdesc, &head); 361 error = gfs2_find_jhead(sdp->sd_jdesc, &head);
@@ -354,6 +370,7 @@ static void trans_go_xmote_bh(struct gfs2_glock *gl)
354 gfs2_log_pointers_init(sdp, head.lh_blkno); 370 gfs2_log_pointers_init(sdp, head.lh_blkno);
355 } 371 }
356 } 372 }
373 return 0;
357} 374}
358 375
359/** 376/**
@@ -375,12 +392,12 @@ const struct gfs2_glock_operations gfs2_meta_glops = {
375 392
376const struct gfs2_glock_operations gfs2_inode_glops = { 393const struct gfs2_glock_operations gfs2_inode_glops = {
377 .go_xmote_th = inode_go_sync, 394 .go_xmote_th = inode_go_sync,
378 .go_xmote_bh = inode_go_xmote_bh,
379 .go_inval = inode_go_inval, 395 .go_inval = inode_go_inval,
380 .go_demote_ok = inode_go_demote_ok, 396 .go_demote_ok = inode_go_demote_ok,
381 .go_lock = inode_go_lock, 397 .go_lock = inode_go_lock,
398 .go_dump = inode_go_dump,
382 .go_type = LM_TYPE_INODE, 399 .go_type = LM_TYPE_INODE,
383 .go_min_hold_time = HZ / 10, 400 .go_min_hold_time = HZ / 5,
384}; 401};
385 402
386const struct gfs2_glock_operations gfs2_rgrp_glops = { 403const struct gfs2_glock_operations gfs2_rgrp_glops = {
@@ -389,8 +406,9 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = {
389 .go_demote_ok = rgrp_go_demote_ok, 406 .go_demote_ok = rgrp_go_demote_ok,
390 .go_lock = rgrp_go_lock, 407 .go_lock = rgrp_go_lock,
391 .go_unlock = rgrp_go_unlock, 408 .go_unlock = rgrp_go_unlock,
409 .go_dump = rgrp_go_dump,
392 .go_type = LM_TYPE_RGRP, 410 .go_type = LM_TYPE_RGRP,
393 .go_min_hold_time = HZ / 10, 411 .go_min_hold_time = HZ / 5,
394}; 412};
395 413
396const struct gfs2_glock_operations gfs2_trans_glops = { 414const struct gfs2_glock_operations gfs2_trans_glops = {
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index eabe5eac41da..448697a5c462 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -77,7 +77,6 @@ struct gfs2_rgrp_host {
77struct gfs2_rgrpd { 77struct gfs2_rgrpd {
78 struct list_head rd_list; /* Link with superblock */ 78 struct list_head rd_list; /* Link with superblock */
79 struct list_head rd_list_mru; 79 struct list_head rd_list_mru;
80 struct list_head rd_recent; /* Recently used rgrps */
81 struct gfs2_glock *rd_gl; /* Glock for this rgrp */ 80 struct gfs2_glock *rd_gl; /* Glock for this rgrp */
82 u64 rd_addr; /* grp block disk address */ 81 u64 rd_addr; /* grp block disk address */
83 u64 rd_data0; /* first data location */ 82 u64 rd_data0; /* first data location */
@@ -128,20 +127,20 @@ struct gfs2_bufdata {
128 127
129struct gfs2_glock_operations { 128struct gfs2_glock_operations {
130 void (*go_xmote_th) (struct gfs2_glock *gl); 129 void (*go_xmote_th) (struct gfs2_glock *gl);
131 void (*go_xmote_bh) (struct gfs2_glock *gl); 130 int (*go_xmote_bh) (struct gfs2_glock *gl, struct gfs2_holder *gh);
132 void (*go_inval) (struct gfs2_glock *gl, int flags); 131 void (*go_inval) (struct gfs2_glock *gl, int flags);
133 int (*go_demote_ok) (struct gfs2_glock *gl); 132 int (*go_demote_ok) (struct gfs2_glock *gl);
134 int (*go_lock) (struct gfs2_holder *gh); 133 int (*go_lock) (struct gfs2_holder *gh);
135 void (*go_unlock) (struct gfs2_holder *gh); 134 void (*go_unlock) (struct gfs2_holder *gh);
135 int (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl);
136 const int go_type; 136 const int go_type;
137 const unsigned long go_min_hold_time; 137 const unsigned long go_min_hold_time;
138}; 138};
139 139
140enum { 140enum {
141 /* States */ 141 /* States */
142 HIF_HOLDER = 6, 142 HIF_HOLDER = 6, /* Set for gh that "holds" the glock */
143 HIF_FIRST = 7, 143 HIF_FIRST = 7,
144 HIF_ABORTED = 9,
145 HIF_WAIT = 10, 144 HIF_WAIT = 10,
146}; 145};
147 146
@@ -154,20 +153,20 @@ struct gfs2_holder {
154 unsigned gh_flags; 153 unsigned gh_flags;
155 154
156 int gh_error; 155 int gh_error;
157 unsigned long gh_iflags; 156 unsigned long gh_iflags; /* HIF_... */
158 unsigned long gh_ip; 157 unsigned long gh_ip;
159}; 158};
160 159
161enum { 160enum {
162 GLF_LOCK = 1, 161 GLF_LOCK = 1,
163 GLF_STICKY = 2, 162 GLF_STICKY = 2,
164 GLF_DEMOTE = 3, 163 GLF_DEMOTE = 3,
165 GLF_PENDING_DEMOTE = 4, 164 GLF_PENDING_DEMOTE = 4,
166 GLF_DIRTY = 5, 165 GLF_DEMOTE_IN_PROGRESS = 5,
167 GLF_DEMOTE_IN_PROGRESS = 6, 166 GLF_DIRTY = 6,
168 GLF_LFLUSH = 7, 167 GLF_LFLUSH = 7,
169 GLF_WAITERS2 = 8, 168 GLF_INVALIDATE_IN_PROGRESS = 8,
170 GLF_CONV_DEADLK = 9, 169 GLF_REPLY_PENDING = 9,
171}; 170};
172 171
173struct gfs2_glock { 172struct gfs2_glock {
@@ -179,19 +178,14 @@ struct gfs2_glock {
179 spinlock_t gl_spin; 178 spinlock_t gl_spin;
180 179
181 unsigned int gl_state; 180 unsigned int gl_state;
181 unsigned int gl_target;
182 unsigned int gl_reply;
182 unsigned int gl_hash; 183 unsigned int gl_hash;
183 unsigned int gl_demote_state; /* state requested by remote node */ 184 unsigned int gl_demote_state; /* state requested by remote node */
184 unsigned long gl_demote_time; /* time of first demote request */ 185 unsigned long gl_demote_time; /* time of first demote request */
185 struct pid *gl_owner_pid;
186 unsigned long gl_ip;
187 struct list_head gl_holders; 186 struct list_head gl_holders;
188 struct list_head gl_waiters1; /* HIF_MUTEX */
189 struct list_head gl_waiters3; /* HIF_PROMOTE */
190 187
191 const struct gfs2_glock_operations *gl_ops; 188 const struct gfs2_glock_operations *gl_ops;
192
193 struct gfs2_holder *gl_req_gh;
194
195 void *gl_lock; 189 void *gl_lock;
196 char *gl_lvb; 190 char *gl_lvb;
197 atomic_t gl_lvb_count; 191 atomic_t gl_lvb_count;
@@ -427,7 +421,6 @@ struct gfs2_tune {
427 unsigned int gt_quota_quantum; /* Secs between syncs to quota file */ 421 unsigned int gt_quota_quantum; /* Secs between syncs to quota file */
428 unsigned int gt_atime_quantum; /* Min secs between atime updates */ 422 unsigned int gt_atime_quantum; /* Min secs between atime updates */
429 unsigned int gt_new_files_jdata; 423 unsigned int gt_new_files_jdata;
430 unsigned int gt_new_files_directio;
431 unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */ 424 unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */
432 unsigned int gt_stall_secs; /* Detects trouble! */ 425 unsigned int gt_stall_secs; /* Detects trouble! */
433 unsigned int gt_complain_secs; 426 unsigned int gt_complain_secs;
@@ -534,7 +527,6 @@ struct gfs2_sbd {
534 struct mutex sd_rindex_mutex; 527 struct mutex sd_rindex_mutex;
535 struct list_head sd_rindex_list; 528 struct list_head sd_rindex_list;
536 struct list_head sd_rindex_mru_list; 529 struct list_head sd_rindex_mru_list;
537 struct list_head sd_rindex_recent_list;
538 struct gfs2_rgrpd *sd_rindex_forward; 530 struct gfs2_rgrpd *sd_rindex_forward;
539 unsigned int sd_rgrps; 531 unsigned int sd_rgrps;
540 532
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 09453d057e41..6da0ab355b8a 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -504,7 +504,7 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
504 } 504 }
505 505
506 if (!is_root) { 506 if (!is_root) {
507 error = permission(dir, MAY_EXEC, NULL); 507 error = gfs2_permission(dir, MAY_EXEC);
508 if (error) 508 if (error)
509 goto out; 509 goto out;
510 } 510 }
@@ -667,7 +667,7 @@ static int create_ok(struct gfs2_inode *dip, const struct qstr *name,
667{ 667{
668 int error; 668 int error;
669 669
670 error = permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, NULL); 670 error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC);
671 if (error) 671 if (error)
672 return error; 672 return error;
673 673
@@ -789,13 +789,8 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
789 if ((dip->i_di.di_flags & GFS2_DIF_INHERIT_JDATA) || 789 if ((dip->i_di.di_flags & GFS2_DIF_INHERIT_JDATA) ||
790 gfs2_tune_get(sdp, gt_new_files_jdata)) 790 gfs2_tune_get(sdp, gt_new_files_jdata))
791 di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA); 791 di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA);
792 if ((dip->i_di.di_flags & GFS2_DIF_INHERIT_DIRECTIO) ||
793 gfs2_tune_get(sdp, gt_new_files_directio))
794 di->di_flags |= cpu_to_be32(GFS2_DIF_DIRECTIO);
795 } else if (S_ISDIR(mode)) { 792 } else if (S_ISDIR(mode)) {
796 di->di_flags |= cpu_to_be32(dip->i_di.di_flags & 793 di->di_flags |= cpu_to_be32(dip->i_di.di_flags &
797 GFS2_DIF_INHERIT_DIRECTIO);
798 di->di_flags |= cpu_to_be32(dip->i_di.di_flags &
799 GFS2_DIF_INHERIT_JDATA); 794 GFS2_DIF_INHERIT_JDATA);
800 } 795 }
801 796
@@ -1134,7 +1129,7 @@ int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
1134 if (IS_APPEND(&dip->i_inode)) 1129 if (IS_APPEND(&dip->i_inode))
1135 return -EPERM; 1130 return -EPERM;
1136 1131
1137 error = permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, NULL); 1132 error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC);
1138 if (error) 1133 if (error)
1139 return error; 1134 return error;
1140 1135
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 580da454b38f..6074c2506f75 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -72,7 +72,6 @@ static inline void gfs2_inum_out(const struct gfs2_inode *ip,
72} 72}
73 73
74 74
75void gfs2_inode_attr_in(struct gfs2_inode *ip);
76void gfs2_set_iop(struct inode *inode); 75void gfs2_set_iop(struct inode *inode);
77struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, 76struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type,
78 u64 no_addr, u64 no_formal_ino, 77 u64 no_addr, u64 no_formal_ino,
@@ -91,6 +90,7 @@ int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
91 struct gfs2_inode *ip); 90 struct gfs2_inode *ip);
92int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, 91int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
93 const struct gfs2_inode *ip); 92 const struct gfs2_inode *ip);
93int gfs2_permission(struct inode *inode, int mask);
94int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to); 94int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to);
95int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len); 95int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len);
96int gfs2_glock_nq_atime(struct gfs2_holder *gh); 96int gfs2_glock_nq_atime(struct gfs2_holder *gh);
diff --git a/fs/gfs2/locking.c b/fs/gfs2/locking.c
index 663fee728783..523243a13a21 100644
--- a/fs/gfs2/locking.c
+++ b/fs/gfs2/locking.c
@@ -23,12 +23,54 @@ struct lmh_wrapper {
23 const struct lm_lockops *lw_ops; 23 const struct lm_lockops *lw_ops;
24}; 24};
25 25
26static int nolock_mount(char *table_name, char *host_data,
27 lm_callback_t cb, void *cb_data,
28 unsigned int min_lvb_size, int flags,
29 struct lm_lockstruct *lockstruct,
30 struct kobject *fskobj);
31
26/* List of registered low-level locking protocols. A file system selects one 32/* List of registered low-level locking protocols. A file system selects one
27 of them by name at mount time, e.g. lock_nolock, lock_dlm. */ 33 of them by name at mount time, e.g. lock_nolock, lock_dlm. */
28 34
35static const struct lm_lockops nolock_ops = {
36 .lm_proto_name = "lock_nolock",
37 .lm_mount = nolock_mount,
38};
39
40static struct lmh_wrapper nolock_proto = {
41 .lw_list = LIST_HEAD_INIT(nolock_proto.lw_list),
42 .lw_ops = &nolock_ops,
43};
44
29static LIST_HEAD(lmh_list); 45static LIST_HEAD(lmh_list);
30static DEFINE_MUTEX(lmh_lock); 46static DEFINE_MUTEX(lmh_lock);
31 47
48static int nolock_mount(char *table_name, char *host_data,
49 lm_callback_t cb, void *cb_data,
50 unsigned int min_lvb_size, int flags,
51 struct lm_lockstruct *lockstruct,
52 struct kobject *fskobj)
53{
54 char *c;
55 unsigned int jid;
56
57 c = strstr(host_data, "jid=");
58 if (!c)
59 jid = 0;
60 else {
61 c += 4;
62 sscanf(c, "%u", &jid);
63 }
64
65 lockstruct->ls_jid = jid;
66 lockstruct->ls_first = 1;
67 lockstruct->ls_lvb_size = min_lvb_size;
68 lockstruct->ls_ops = &nolock_ops;
69 lockstruct->ls_flags = LM_LSFLAG_LOCAL;
70
71 return 0;
72}
73
32/** 74/**
33 * gfs2_register_lockproto - Register a low-level locking protocol 75 * gfs2_register_lockproto - Register a low-level locking protocol
34 * @proto: the protocol definition 76 * @proto: the protocol definition
@@ -116,9 +158,13 @@ int gfs2_mount_lockproto(char *proto_name, char *table_name, char *host_data,
116 int try = 0; 158 int try = 0;
117 int error, found; 159 int error, found;
118 160
161
119retry: 162retry:
120 mutex_lock(&lmh_lock); 163 mutex_lock(&lmh_lock);
121 164
165 if (list_empty(&nolock_proto.lw_list))
166 list_add(&nolock_proto.lw_list, &lmh_list);
167
122 found = 0; 168 found = 0;
123 list_for_each_entry(lw, &lmh_list, lw_list) { 169 list_for_each_entry(lw, &lmh_list, lw_list) {
124 if (!strcmp(lw->lw_ops->lm_proto_name, proto_name)) { 170 if (!strcmp(lw->lw_ops->lm_proto_name, proto_name)) {
@@ -139,7 +185,8 @@ retry:
139 goto out; 185 goto out;
140 } 186 }
141 187
142 if (!try_module_get(lw->lw_ops->lm_owner)) { 188 if (lw->lw_ops->lm_owner &&
189 !try_module_get(lw->lw_ops->lm_owner)) {
143 try = 0; 190 try = 0;
144 mutex_unlock(&lmh_lock); 191 mutex_unlock(&lmh_lock);
145 msleep(1000); 192 msleep(1000);
@@ -158,7 +205,8 @@ out:
158void gfs2_unmount_lockproto(struct lm_lockstruct *lockstruct) 205void gfs2_unmount_lockproto(struct lm_lockstruct *lockstruct)
159{ 206{
160 mutex_lock(&lmh_lock); 207 mutex_lock(&lmh_lock);
161 lockstruct->ls_ops->lm_unmount(lockstruct->ls_lockspace); 208 if (lockstruct->ls_ops->lm_unmount)
209 lockstruct->ls_ops->lm_unmount(lockstruct->ls_lockspace);
162 if (lockstruct->ls_ops->lm_owner) 210 if (lockstruct->ls_ops->lm_owner)
163 module_put(lockstruct->ls_ops->lm_owner); 211 module_put(lockstruct->ls_ops->lm_owner);
164 mutex_unlock(&lmh_lock); 212 mutex_unlock(&lmh_lock);
diff --git a/fs/gfs2/locking/dlm/lock.c b/fs/gfs2/locking/dlm/lock.c
index cf7ea8abec87..2482c9047505 100644
--- a/fs/gfs2/locking/dlm/lock.c
+++ b/fs/gfs2/locking/dlm/lock.c
@@ -11,46 +11,60 @@
11 11
12static char junk_lvb[GDLM_LVB_SIZE]; 12static char junk_lvb[GDLM_LVB_SIZE];
13 13
14static void queue_complete(struct gdlm_lock *lp) 14
15/* convert dlm lock-mode to gfs lock-state */
16
17static s16 gdlm_make_lmstate(s16 dlmmode)
15{ 18{
16 struct gdlm_ls *ls = lp->ls; 19 switch (dlmmode) {
20 case DLM_LOCK_IV:
21 case DLM_LOCK_NL:
22 return LM_ST_UNLOCKED;
23 case DLM_LOCK_EX:
24 return LM_ST_EXCLUSIVE;
25 case DLM_LOCK_CW:
26 return LM_ST_DEFERRED;
27 case DLM_LOCK_PR:
28 return LM_ST_SHARED;
29 }
30 gdlm_assert(0, "unknown DLM mode %d", dlmmode);
31 return -1;
32}
17 33
18 clear_bit(LFL_ACTIVE, &lp->flags); 34/* A lock placed on this queue is re-submitted to DLM as soon as the lock_dlm
35 thread gets to it. */
36
37static void queue_submit(struct gdlm_lock *lp)
38{
39 struct gdlm_ls *ls = lp->ls;
19 40
20 spin_lock(&ls->async_lock); 41 spin_lock(&ls->async_lock);
21 list_add_tail(&lp->clist, &ls->complete); 42 list_add_tail(&lp->delay_list, &ls->submit);
22 spin_unlock(&ls->async_lock); 43 spin_unlock(&ls->async_lock);
23 wake_up(&ls->thread_wait); 44 wake_up(&ls->thread_wait);
24} 45}
25 46
26static inline void gdlm_ast(void *astarg) 47static void wake_up_ast(struct gdlm_lock *lp)
27{ 48{
28 queue_complete(astarg); 49 clear_bit(LFL_AST_WAIT, &lp->flags);
50 smp_mb__after_clear_bit();
51 wake_up_bit(&lp->flags, LFL_AST_WAIT);
29} 52}
30 53
31static inline void gdlm_bast(void *astarg, int mode) 54static void gdlm_delete_lp(struct gdlm_lock *lp)
32{ 55{
33 struct gdlm_lock *lp = astarg;
34 struct gdlm_ls *ls = lp->ls; 56 struct gdlm_ls *ls = lp->ls;
35 57
36 if (!mode) {
37 printk(KERN_INFO "lock_dlm: bast mode zero %x,%llx\n",
38 lp->lockname.ln_type,
39 (unsigned long long)lp->lockname.ln_number);
40 return;
41 }
42
43 spin_lock(&ls->async_lock); 58 spin_lock(&ls->async_lock);
44 if (!lp->bast_mode) { 59 if (!list_empty(&lp->delay_list))
45 list_add_tail(&lp->blist, &ls->blocking); 60 list_del_init(&lp->delay_list);
46 lp->bast_mode = mode; 61 ls->all_locks_count--;
47 } else if (lp->bast_mode < mode)
48 lp->bast_mode = mode;
49 spin_unlock(&ls->async_lock); 62 spin_unlock(&ls->async_lock);
50 wake_up(&ls->thread_wait); 63
64 kfree(lp);
51} 65}
52 66
53void gdlm_queue_delayed(struct gdlm_lock *lp) 67static void gdlm_queue_delayed(struct gdlm_lock *lp)
54{ 68{
55 struct gdlm_ls *ls = lp->ls; 69 struct gdlm_ls *ls = lp->ls;
56 70
@@ -59,6 +73,236 @@ void gdlm_queue_delayed(struct gdlm_lock *lp)
59 spin_unlock(&ls->async_lock); 73 spin_unlock(&ls->async_lock);
60} 74}
61 75
76static void process_complete(struct gdlm_lock *lp)
77{
78 struct gdlm_ls *ls = lp->ls;
79 struct lm_async_cb acb;
80
81 memset(&acb, 0, sizeof(acb));
82
83 if (lp->lksb.sb_status == -DLM_ECANCEL) {
84 log_info("complete dlm cancel %x,%llx flags %lx",
85 lp->lockname.ln_type,
86 (unsigned long long)lp->lockname.ln_number,
87 lp->flags);
88
89 lp->req = lp->cur;
90 acb.lc_ret |= LM_OUT_CANCELED;
91 if (lp->cur == DLM_LOCK_IV)
92 lp->lksb.sb_lkid = 0;
93 goto out;
94 }
95
96 if (test_and_clear_bit(LFL_DLM_UNLOCK, &lp->flags)) {
97 if (lp->lksb.sb_status != -DLM_EUNLOCK) {
98 log_info("unlock sb_status %d %x,%llx flags %lx",
99 lp->lksb.sb_status, lp->lockname.ln_type,
100 (unsigned long long)lp->lockname.ln_number,
101 lp->flags);
102 return;
103 }
104
105 lp->cur = DLM_LOCK_IV;
106 lp->req = DLM_LOCK_IV;
107 lp->lksb.sb_lkid = 0;
108
109 if (test_and_clear_bit(LFL_UNLOCK_DELETE, &lp->flags)) {
110 gdlm_delete_lp(lp);
111 return;
112 }
113 goto out;
114 }
115
116 if (lp->lksb.sb_flags & DLM_SBF_VALNOTVALID)
117 memset(lp->lksb.sb_lvbptr, 0, GDLM_LVB_SIZE);
118
119 if (lp->lksb.sb_flags & DLM_SBF_ALTMODE) {
120 if (lp->req == DLM_LOCK_PR)
121 lp->req = DLM_LOCK_CW;
122 else if (lp->req == DLM_LOCK_CW)
123 lp->req = DLM_LOCK_PR;
124 }
125
126 /*
127 * A canceled lock request. The lock was just taken off the delayed
128 * list and was never even submitted to dlm.
129 */
130
131 if (test_and_clear_bit(LFL_CANCEL, &lp->flags)) {
132 log_info("complete internal cancel %x,%llx",
133 lp->lockname.ln_type,
134 (unsigned long long)lp->lockname.ln_number);
135 lp->req = lp->cur;
136 acb.lc_ret |= LM_OUT_CANCELED;
137 goto out;
138 }
139
140 /*
141 * An error occured.
142 */
143
144 if (lp->lksb.sb_status) {
145 /* a "normal" error */
146 if ((lp->lksb.sb_status == -EAGAIN) &&
147 (lp->lkf & DLM_LKF_NOQUEUE)) {
148 lp->req = lp->cur;
149 if (lp->cur == DLM_LOCK_IV)
150 lp->lksb.sb_lkid = 0;
151 goto out;
152 }
153
154 /* this could only happen with cancels I think */
155 log_info("ast sb_status %d %x,%llx flags %lx",
156 lp->lksb.sb_status, lp->lockname.ln_type,
157 (unsigned long long)lp->lockname.ln_number,
158 lp->flags);
159 return;
160 }
161
162 /*
163 * This is an AST for an EX->EX conversion for sync_lvb from GFS.
164 */
165
166 if (test_and_clear_bit(LFL_SYNC_LVB, &lp->flags)) {
167 wake_up_ast(lp);
168 return;
169 }
170
171 /*
172 * A lock has been demoted to NL because it initially completed during
173 * BLOCK_LOCKS. Now it must be requested in the originally requested
174 * mode.
175 */
176
177 if (test_and_clear_bit(LFL_REREQUEST, &lp->flags)) {
178 gdlm_assert(lp->req == DLM_LOCK_NL, "%x,%llx",
179 lp->lockname.ln_type,
180 (unsigned long long)lp->lockname.ln_number);
181 gdlm_assert(lp->prev_req > DLM_LOCK_NL, "%x,%llx",
182 lp->lockname.ln_type,
183 (unsigned long long)lp->lockname.ln_number);
184
185 lp->cur = DLM_LOCK_NL;
186 lp->req = lp->prev_req;
187 lp->prev_req = DLM_LOCK_IV;
188 lp->lkf &= ~DLM_LKF_CONVDEADLK;
189
190 set_bit(LFL_NOCACHE, &lp->flags);
191
192 if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
193 !test_bit(LFL_NOBLOCK, &lp->flags))
194 gdlm_queue_delayed(lp);
195 else
196 queue_submit(lp);
197 return;
198 }
199
200 /*
201 * A request is granted during dlm recovery. It may be granted
202 * because the locks of a failed node were cleared. In that case,
203 * there may be inconsistent data beneath this lock and we must wait
204 * for recovery to complete to use it. When gfs recovery is done this
205 * granted lock will be converted to NL and then reacquired in this
206 * granted state.
207 */
208
209 if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
210 !test_bit(LFL_NOBLOCK, &lp->flags) &&
211 lp->req != DLM_LOCK_NL) {
212
213 lp->cur = lp->req;
214 lp->prev_req = lp->req;
215 lp->req = DLM_LOCK_NL;
216 lp->lkf |= DLM_LKF_CONVERT;
217 lp->lkf &= ~DLM_LKF_CONVDEADLK;
218
219 log_debug("rereq %x,%llx id %x %d,%d",
220 lp->lockname.ln_type,
221 (unsigned long long)lp->lockname.ln_number,
222 lp->lksb.sb_lkid, lp->cur, lp->req);
223
224 set_bit(LFL_REREQUEST, &lp->flags);
225 queue_submit(lp);
226 return;
227 }
228
229 /*
230 * DLM demoted the lock to NL before it was granted so GFS must be
231 * told it cannot cache data for this lock.
232 */
233
234 if (lp->lksb.sb_flags & DLM_SBF_DEMOTED)
235 set_bit(LFL_NOCACHE, &lp->flags);
236
237out:
238 /*
239 * This is an internal lock_dlm lock
240 */
241
242 if (test_bit(LFL_INLOCK, &lp->flags)) {
243 clear_bit(LFL_NOBLOCK, &lp->flags);
244 lp->cur = lp->req;
245 wake_up_ast(lp);
246 return;
247 }
248
249 /*
250 * Normal completion of a lock request. Tell GFS it now has the lock.
251 */
252
253 clear_bit(LFL_NOBLOCK, &lp->flags);
254 lp->cur = lp->req;
255
256 acb.lc_name = lp->lockname;
257 acb.lc_ret |= gdlm_make_lmstate(lp->cur);
258
259 ls->fscb(ls->sdp, LM_CB_ASYNC, &acb);
260}
261
262static void gdlm_ast(void *astarg)
263{
264 struct gdlm_lock *lp = astarg;
265 clear_bit(LFL_ACTIVE, &lp->flags);
266 process_complete(lp);
267}
268
269static void process_blocking(struct gdlm_lock *lp, int bast_mode)
270{
271 struct gdlm_ls *ls = lp->ls;
272 unsigned int cb = 0;
273
274 switch (gdlm_make_lmstate(bast_mode)) {
275 case LM_ST_EXCLUSIVE:
276 cb = LM_CB_NEED_E;
277 break;
278 case LM_ST_DEFERRED:
279 cb = LM_CB_NEED_D;
280 break;
281 case LM_ST_SHARED:
282 cb = LM_CB_NEED_S;
283 break;
284 default:
285 gdlm_assert(0, "unknown bast mode %u", bast_mode);
286 }
287
288 ls->fscb(ls->sdp, cb, &lp->lockname);
289}
290
291
292static void gdlm_bast(void *astarg, int mode)
293{
294 struct gdlm_lock *lp = astarg;
295
296 if (!mode) {
297 printk(KERN_INFO "lock_dlm: bast mode zero %x,%llx\n",
298 lp->lockname.ln_type,
299 (unsigned long long)lp->lockname.ln_number);
300 return;
301 }
302
303 process_blocking(lp, mode);
304}
305
62/* convert gfs lock-state to dlm lock-mode */ 306/* convert gfs lock-state to dlm lock-mode */
63 307
64static s16 make_mode(s16 lmstate) 308static s16 make_mode(s16 lmstate)
@@ -77,24 +321,6 @@ static s16 make_mode(s16 lmstate)
77 return -1; 321 return -1;
78} 322}
79 323
80/* convert dlm lock-mode to gfs lock-state */
81
82s16 gdlm_make_lmstate(s16 dlmmode)
83{
84 switch (dlmmode) {
85 case DLM_LOCK_IV:
86 case DLM_LOCK_NL:
87 return LM_ST_UNLOCKED;
88 case DLM_LOCK_EX:
89 return LM_ST_EXCLUSIVE;
90 case DLM_LOCK_CW:
91 return LM_ST_DEFERRED;
92 case DLM_LOCK_PR:
93 return LM_ST_SHARED;
94 }
95 gdlm_assert(0, "unknown DLM mode %d", dlmmode);
96 return -1;
97}
98 324
99/* verify agreement with GFS on the current lock state, NB: DLM_LOCK_NL and 325/* verify agreement with GFS on the current lock state, NB: DLM_LOCK_NL and
100 DLM_LOCK_IV are both considered LM_ST_UNLOCKED by GFS. */ 326 DLM_LOCK_IV are both considered LM_ST_UNLOCKED by GFS. */
@@ -134,14 +360,6 @@ static inline unsigned int make_flags(struct gdlm_lock *lp,
134 360
135 if (lp->lksb.sb_lkid != 0) { 361 if (lp->lksb.sb_lkid != 0) {
136 lkf |= DLM_LKF_CONVERT; 362 lkf |= DLM_LKF_CONVERT;
137
138 /* Conversion deadlock avoidance by DLM */
139
140 if (!(lp->ls->fsflags & LM_MFLAG_CONV_NODROP) &&
141 !test_bit(LFL_FORCE_PROMOTE, &lp->flags) &&
142 !(lkf & DLM_LKF_NOQUEUE) &&
143 cur > DLM_LOCK_NL && req > DLM_LOCK_NL && cur != req)
144 lkf |= DLM_LKF_CONVDEADLK;
145 } 363 }
146 364
147 if (lp->lvb) 365 if (lp->lvb)
@@ -173,14 +391,9 @@ static int gdlm_create_lp(struct gdlm_ls *ls, struct lm_lockname *name,
173 make_strname(name, &lp->strname); 391 make_strname(name, &lp->strname);
174 lp->ls = ls; 392 lp->ls = ls;
175 lp->cur = DLM_LOCK_IV; 393 lp->cur = DLM_LOCK_IV;
176 lp->lvb = NULL;
177 lp->hold_null = NULL;
178 INIT_LIST_HEAD(&lp->clist);
179 INIT_LIST_HEAD(&lp->blist);
180 INIT_LIST_HEAD(&lp->delay_list); 394 INIT_LIST_HEAD(&lp->delay_list);
181 395
182 spin_lock(&ls->async_lock); 396 spin_lock(&ls->async_lock);
183 list_add(&lp->all_list, &ls->all_locks);
184 ls->all_locks_count++; 397 ls->all_locks_count++;
185 spin_unlock(&ls->async_lock); 398 spin_unlock(&ls->async_lock);
186 399
@@ -188,26 +401,6 @@ static int gdlm_create_lp(struct gdlm_ls *ls, struct lm_lockname *name,
188 return 0; 401 return 0;
189} 402}
190 403
191void gdlm_delete_lp(struct gdlm_lock *lp)
192{
193 struct gdlm_ls *ls = lp->ls;
194
195 spin_lock(&ls->async_lock);
196 if (!list_empty(&lp->clist))
197 list_del_init(&lp->clist);
198 if (!list_empty(&lp->blist))
199 list_del_init(&lp->blist);
200 if (!list_empty(&lp->delay_list))
201 list_del_init(&lp->delay_list);
202 gdlm_assert(!list_empty(&lp->all_list), "%x,%llx", lp->lockname.ln_type,
203 (unsigned long long)lp->lockname.ln_number);
204 list_del_init(&lp->all_list);
205 ls->all_locks_count--;
206 spin_unlock(&ls->async_lock);
207
208 kfree(lp);
209}
210
211int gdlm_get_lock(void *lockspace, struct lm_lockname *name, 404int gdlm_get_lock(void *lockspace, struct lm_lockname *name,
212 void **lockp) 405 void **lockp)
213{ 406{
@@ -261,7 +454,7 @@ unsigned int gdlm_do_lock(struct gdlm_lock *lp)
261 454
262 if ((error == -EAGAIN) && (lp->lkf & DLM_LKF_NOQUEUE)) { 455 if ((error == -EAGAIN) && (lp->lkf & DLM_LKF_NOQUEUE)) {
263 lp->lksb.sb_status = -EAGAIN; 456 lp->lksb.sb_status = -EAGAIN;
264 queue_complete(lp); 457 gdlm_ast(lp);
265 error = 0; 458 error = 0;
266 } 459 }
267 460
@@ -308,6 +501,12 @@ unsigned int gdlm_lock(void *lock, unsigned int cur_state,
308{ 501{
309 struct gdlm_lock *lp = lock; 502 struct gdlm_lock *lp = lock;
310 503
504 if (req_state == LM_ST_UNLOCKED)
505 return gdlm_unlock(lock, cur_state);
506
507 if (req_state == LM_ST_UNLOCKED)
508 return gdlm_unlock(lock, cur_state);
509
311 clear_bit(LFL_DLM_CANCEL, &lp->flags); 510 clear_bit(LFL_DLM_CANCEL, &lp->flags);
312 if (flags & LM_FLAG_NOEXP) 511 if (flags & LM_FLAG_NOEXP)
313 set_bit(LFL_NOBLOCK, &lp->flags); 512 set_bit(LFL_NOBLOCK, &lp->flags);
@@ -351,7 +550,7 @@ void gdlm_cancel(void *lock)
351 if (delay_list) { 550 if (delay_list) {
352 set_bit(LFL_CANCEL, &lp->flags); 551 set_bit(LFL_CANCEL, &lp->flags);
353 set_bit(LFL_ACTIVE, &lp->flags); 552 set_bit(LFL_ACTIVE, &lp->flags);
354 queue_complete(lp); 553 gdlm_ast(lp);
355 return; 554 return;
356 } 555 }
357 556
@@ -507,22 +706,3 @@ void gdlm_submit_delayed(struct gdlm_ls *ls)
507 wake_up(&ls->thread_wait); 706 wake_up(&ls->thread_wait);
508} 707}
509 708
510int gdlm_release_all_locks(struct gdlm_ls *ls)
511{
512 struct gdlm_lock *lp, *safe;
513 int count = 0;
514
515 spin_lock(&ls->async_lock);
516 list_for_each_entry_safe(lp, safe, &ls->all_locks, all_list) {
517 list_del_init(&lp->all_list);
518
519 if (lp->lvb && lp->lvb != junk_lvb)
520 kfree(lp->lvb);
521 kfree(lp);
522 count++;
523 }
524 spin_unlock(&ls->async_lock);
525
526 return count;
527}
528
diff --git a/fs/gfs2/locking/dlm/lock_dlm.h b/fs/gfs2/locking/dlm/lock_dlm.h
index a243cf69c54e..3c98e7c6f93b 100644
--- a/fs/gfs2/locking/dlm/lock_dlm.h
+++ b/fs/gfs2/locking/dlm/lock_dlm.h
@@ -72,19 +72,12 @@ struct gdlm_ls {
72 int recover_jid_done; 72 int recover_jid_done;
73 int recover_jid_status; 73 int recover_jid_status;
74 spinlock_t async_lock; 74 spinlock_t async_lock;
75 struct list_head complete;
76 struct list_head blocking;
77 struct list_head delayed; 75 struct list_head delayed;
78 struct list_head submit; 76 struct list_head submit;
79 struct list_head all_locks;
80 u32 all_locks_count; 77 u32 all_locks_count;
81 wait_queue_head_t wait_control; 78 wait_queue_head_t wait_control;
82 struct task_struct *thread1; 79 struct task_struct *thread;
83 struct task_struct *thread2;
84 wait_queue_head_t thread_wait; 80 wait_queue_head_t thread_wait;
85 unsigned long drop_time;
86 int drop_locks_count;
87 int drop_locks_period;
88}; 81};
89 82
90enum { 83enum {
@@ -117,12 +110,7 @@ struct gdlm_lock {
117 u32 lkf; /* dlm flags DLM_LKF_ */ 110 u32 lkf; /* dlm flags DLM_LKF_ */
118 unsigned long flags; /* lock_dlm flags LFL_ */ 111 unsigned long flags; /* lock_dlm flags LFL_ */
119 112
120 int bast_mode; /* protected by async_lock */
121
122 struct list_head clist; /* complete */
123 struct list_head blist; /* blocking */
124 struct list_head delay_list; /* delayed */ 113 struct list_head delay_list; /* delayed */
125 struct list_head all_list; /* all locks for the fs */
126 struct gdlm_lock *hold_null; /* NL lock for hold_lvb */ 114 struct gdlm_lock *hold_null; /* NL lock for hold_lvb */
127}; 115};
128 116
@@ -159,11 +147,7 @@ void gdlm_release_threads(struct gdlm_ls *);
159 147
160/* lock.c */ 148/* lock.c */
161 149
162s16 gdlm_make_lmstate(s16);
163void gdlm_queue_delayed(struct gdlm_lock *);
164void gdlm_submit_delayed(struct gdlm_ls *); 150void gdlm_submit_delayed(struct gdlm_ls *);
165int gdlm_release_all_locks(struct gdlm_ls *);
166void gdlm_delete_lp(struct gdlm_lock *);
167unsigned int gdlm_do_lock(struct gdlm_lock *); 151unsigned int gdlm_do_lock(struct gdlm_lock *);
168 152
169int gdlm_get_lock(void *, struct lm_lockname *, void **); 153int gdlm_get_lock(void *, struct lm_lockname *, void **);
diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c
index 470bdf650b50..09d78c216f48 100644
--- a/fs/gfs2/locking/dlm/mount.c
+++ b/fs/gfs2/locking/dlm/mount.c
@@ -22,22 +22,14 @@ static struct gdlm_ls *init_gdlm(lm_callback_t cb, struct gfs2_sbd *sdp,
22 if (!ls) 22 if (!ls)
23 return NULL; 23 return NULL;
24 24
25 ls->drop_locks_count = GDLM_DROP_COUNT;
26 ls->drop_locks_period = GDLM_DROP_PERIOD;
27 ls->fscb = cb; 25 ls->fscb = cb;
28 ls->sdp = sdp; 26 ls->sdp = sdp;
29 ls->fsflags = flags; 27 ls->fsflags = flags;
30 spin_lock_init(&ls->async_lock); 28 spin_lock_init(&ls->async_lock);
31 INIT_LIST_HEAD(&ls->complete);
32 INIT_LIST_HEAD(&ls->blocking);
33 INIT_LIST_HEAD(&ls->delayed); 29 INIT_LIST_HEAD(&ls->delayed);
34 INIT_LIST_HEAD(&ls->submit); 30 INIT_LIST_HEAD(&ls->submit);
35 INIT_LIST_HEAD(&ls->all_locks);
36 init_waitqueue_head(&ls->thread_wait); 31 init_waitqueue_head(&ls->thread_wait);
37 init_waitqueue_head(&ls->wait_control); 32 init_waitqueue_head(&ls->wait_control);
38 ls->thread1 = NULL;
39 ls->thread2 = NULL;
40 ls->drop_time = jiffies;
41 ls->jid = -1; 33 ls->jid = -1;
42 34
43 strncpy(buf, table_name, 256); 35 strncpy(buf, table_name, 256);
@@ -180,7 +172,6 @@ out:
180static void gdlm_unmount(void *lockspace) 172static void gdlm_unmount(void *lockspace)
181{ 173{
182 struct gdlm_ls *ls = lockspace; 174 struct gdlm_ls *ls = lockspace;
183 int rv;
184 175
185 log_debug("unmount flags %lx", ls->flags); 176 log_debug("unmount flags %lx", ls->flags);
186 177
@@ -194,9 +185,7 @@ static void gdlm_unmount(void *lockspace)
194 gdlm_kobject_release(ls); 185 gdlm_kobject_release(ls);
195 dlm_release_lockspace(ls->dlm_lockspace, 2); 186 dlm_release_lockspace(ls->dlm_lockspace, 2);
196 gdlm_release_threads(ls); 187 gdlm_release_threads(ls);
197 rv = gdlm_release_all_locks(ls); 188 BUG_ON(ls->all_locks_count);
198 if (rv)
199 log_info("gdlm_unmount: %d stray locks freed", rv);
200out: 189out:
201 kfree(ls); 190 kfree(ls);
202} 191}
@@ -232,7 +221,6 @@ static void gdlm_withdraw(void *lockspace)
232 221
233 dlm_release_lockspace(ls->dlm_lockspace, 2); 222 dlm_release_lockspace(ls->dlm_lockspace, 2);
234 gdlm_release_threads(ls); 223 gdlm_release_threads(ls);
235 gdlm_release_all_locks(ls);
236 gdlm_kobject_release(ls); 224 gdlm_kobject_release(ls);
237} 225}
238 226
diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c
index a4ff271df9ee..4ec571c3d8a9 100644
--- a/fs/gfs2/locking/dlm/sysfs.c
+++ b/fs/gfs2/locking/dlm/sysfs.c
@@ -114,17 +114,6 @@ static ssize_t recover_status_show(struct gdlm_ls *ls, char *buf)
114 return sprintf(buf, "%d\n", ls->recover_jid_status); 114 return sprintf(buf, "%d\n", ls->recover_jid_status);
115} 115}
116 116
117static ssize_t drop_count_show(struct gdlm_ls *ls, char *buf)
118{
119 return sprintf(buf, "%d\n", ls->drop_locks_count);
120}
121
122static ssize_t drop_count_store(struct gdlm_ls *ls, const char *buf, size_t len)
123{
124 ls->drop_locks_count = simple_strtol(buf, NULL, 0);
125 return len;
126}
127
128struct gdlm_attr { 117struct gdlm_attr {
129 struct attribute attr; 118 struct attribute attr;
130 ssize_t (*show)(struct gdlm_ls *, char *); 119 ssize_t (*show)(struct gdlm_ls *, char *);
@@ -144,7 +133,6 @@ GDLM_ATTR(first_done, 0444, first_done_show, NULL);
144GDLM_ATTR(recover, 0644, recover_show, recover_store); 133GDLM_ATTR(recover, 0644, recover_show, recover_store);
145GDLM_ATTR(recover_done, 0444, recover_done_show, NULL); 134GDLM_ATTR(recover_done, 0444, recover_done_show, NULL);
146GDLM_ATTR(recover_status, 0444, recover_status_show, NULL); 135GDLM_ATTR(recover_status, 0444, recover_status_show, NULL);
147GDLM_ATTR(drop_count, 0644, drop_count_show, drop_count_store);
148 136
149static struct attribute *gdlm_attrs[] = { 137static struct attribute *gdlm_attrs[] = {
150 &gdlm_attr_proto_name.attr, 138 &gdlm_attr_proto_name.attr,
@@ -157,7 +145,6 @@ static struct attribute *gdlm_attrs[] = {
157 &gdlm_attr_recover.attr, 145 &gdlm_attr_recover.attr,
158 &gdlm_attr_recover_done.attr, 146 &gdlm_attr_recover_done.attr,
159 &gdlm_attr_recover_status.attr, 147 &gdlm_attr_recover_status.attr,
160 &gdlm_attr_drop_count.attr,
161 NULL, 148 NULL,
162}; 149};
163 150
diff --git a/fs/gfs2/locking/dlm/thread.c b/fs/gfs2/locking/dlm/thread.c
index e53db6fd28ab..38823efd698c 100644
--- a/fs/gfs2/locking/dlm/thread.c
+++ b/fs/gfs2/locking/dlm/thread.c
@@ -9,367 +9,60 @@
9 9
10#include "lock_dlm.h" 10#include "lock_dlm.h"
11 11
12/* A lock placed on this queue is re-submitted to DLM as soon as the lock_dlm 12static inline int no_work(struct gdlm_ls *ls)
13 thread gets to it. */
14
15static void queue_submit(struct gdlm_lock *lp)
16{
17 struct gdlm_ls *ls = lp->ls;
18
19 spin_lock(&ls->async_lock);
20 list_add_tail(&lp->delay_list, &ls->submit);
21 spin_unlock(&ls->async_lock);
22 wake_up(&ls->thread_wait);
23}
24
25static void process_blocking(struct gdlm_lock *lp, int bast_mode)
26{
27 struct gdlm_ls *ls = lp->ls;
28 unsigned int cb = 0;
29
30 switch (gdlm_make_lmstate(bast_mode)) {
31 case LM_ST_EXCLUSIVE:
32 cb = LM_CB_NEED_E;
33 break;
34 case LM_ST_DEFERRED:
35 cb = LM_CB_NEED_D;
36 break;
37 case LM_ST_SHARED:
38 cb = LM_CB_NEED_S;
39 break;
40 default:
41 gdlm_assert(0, "unknown bast mode %u", lp->bast_mode);
42 }
43
44 ls->fscb(ls->sdp, cb, &lp->lockname);
45}
46
47static void wake_up_ast(struct gdlm_lock *lp)
48{
49 clear_bit(LFL_AST_WAIT, &lp->flags);
50 smp_mb__after_clear_bit();
51 wake_up_bit(&lp->flags, LFL_AST_WAIT);
52}
53
54static void process_complete(struct gdlm_lock *lp)
55{
56 struct gdlm_ls *ls = lp->ls;
57 struct lm_async_cb acb;
58 s16 prev_mode = lp->cur;
59
60 memset(&acb, 0, sizeof(acb));
61
62 if (lp->lksb.sb_status == -DLM_ECANCEL) {
63 log_info("complete dlm cancel %x,%llx flags %lx",
64 lp->lockname.ln_type,
65 (unsigned long long)lp->lockname.ln_number,
66 lp->flags);
67
68 lp->req = lp->cur;
69 acb.lc_ret |= LM_OUT_CANCELED;
70 if (lp->cur == DLM_LOCK_IV)
71 lp->lksb.sb_lkid = 0;
72 goto out;
73 }
74
75 if (test_and_clear_bit(LFL_DLM_UNLOCK, &lp->flags)) {
76 if (lp->lksb.sb_status != -DLM_EUNLOCK) {
77 log_info("unlock sb_status %d %x,%llx flags %lx",
78 lp->lksb.sb_status, lp->lockname.ln_type,
79 (unsigned long long)lp->lockname.ln_number,
80 lp->flags);
81 return;
82 }
83
84 lp->cur = DLM_LOCK_IV;
85 lp->req = DLM_LOCK_IV;
86 lp->lksb.sb_lkid = 0;
87
88 if (test_and_clear_bit(LFL_UNLOCK_DELETE, &lp->flags)) {
89 gdlm_delete_lp(lp);
90 return;
91 }
92 goto out;
93 }
94
95 if (lp->lksb.sb_flags & DLM_SBF_VALNOTVALID)
96 memset(lp->lksb.sb_lvbptr, 0, GDLM_LVB_SIZE);
97
98 if (lp->lksb.sb_flags & DLM_SBF_ALTMODE) {
99 if (lp->req == DLM_LOCK_PR)
100 lp->req = DLM_LOCK_CW;
101 else if (lp->req == DLM_LOCK_CW)
102 lp->req = DLM_LOCK_PR;
103 }
104
105 /*
106 * A canceled lock request. The lock was just taken off the delayed
107 * list and was never even submitted to dlm.
108 */
109
110 if (test_and_clear_bit(LFL_CANCEL, &lp->flags)) {
111 log_info("complete internal cancel %x,%llx",
112 lp->lockname.ln_type,
113 (unsigned long long)lp->lockname.ln_number);
114 lp->req = lp->cur;
115 acb.lc_ret |= LM_OUT_CANCELED;
116 goto out;
117 }
118
119 /*
120 * An error occured.
121 */
122
123 if (lp->lksb.sb_status) {
124 /* a "normal" error */
125 if ((lp->lksb.sb_status == -EAGAIN) &&
126 (lp->lkf & DLM_LKF_NOQUEUE)) {
127 lp->req = lp->cur;
128 if (lp->cur == DLM_LOCK_IV)
129 lp->lksb.sb_lkid = 0;
130 goto out;
131 }
132
133 /* this could only happen with cancels I think */
134 log_info("ast sb_status %d %x,%llx flags %lx",
135 lp->lksb.sb_status, lp->lockname.ln_type,
136 (unsigned long long)lp->lockname.ln_number,
137 lp->flags);
138 if (lp->lksb.sb_status == -EDEADLOCK &&
139 lp->ls->fsflags & LM_MFLAG_CONV_NODROP) {
140 lp->req = lp->cur;
141 acb.lc_ret |= LM_OUT_CONV_DEADLK;
142 if (lp->cur == DLM_LOCK_IV)
143 lp->lksb.sb_lkid = 0;
144 goto out;
145 } else
146 return;
147 }
148
149 /*
150 * This is an AST for an EX->EX conversion for sync_lvb from GFS.
151 */
152
153 if (test_and_clear_bit(LFL_SYNC_LVB, &lp->flags)) {
154 wake_up_ast(lp);
155 return;
156 }
157
158 /*
159 * A lock has been demoted to NL because it initially completed during
160 * BLOCK_LOCKS. Now it must be requested in the originally requested
161 * mode.
162 */
163
164 if (test_and_clear_bit(LFL_REREQUEST, &lp->flags)) {
165 gdlm_assert(lp->req == DLM_LOCK_NL, "%x,%llx",
166 lp->lockname.ln_type,
167 (unsigned long long)lp->lockname.ln_number);
168 gdlm_assert(lp->prev_req > DLM_LOCK_NL, "%x,%llx",
169 lp->lockname.ln_type,
170 (unsigned long long)lp->lockname.ln_number);
171
172 lp->cur = DLM_LOCK_NL;
173 lp->req = lp->prev_req;
174 lp->prev_req = DLM_LOCK_IV;
175 lp->lkf &= ~DLM_LKF_CONVDEADLK;
176
177 set_bit(LFL_NOCACHE, &lp->flags);
178
179 if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
180 !test_bit(LFL_NOBLOCK, &lp->flags))
181 gdlm_queue_delayed(lp);
182 else
183 queue_submit(lp);
184 return;
185 }
186
187 /*
188 * A request is granted during dlm recovery. It may be granted
189 * because the locks of a failed node were cleared. In that case,
190 * there may be inconsistent data beneath this lock and we must wait
191 * for recovery to complete to use it. When gfs recovery is done this
192 * granted lock will be converted to NL and then reacquired in this
193 * granted state.
194 */
195
196 if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
197 !test_bit(LFL_NOBLOCK, &lp->flags) &&
198 lp->req != DLM_LOCK_NL) {
199
200 lp->cur = lp->req;
201 lp->prev_req = lp->req;
202 lp->req = DLM_LOCK_NL;
203 lp->lkf |= DLM_LKF_CONVERT;
204 lp->lkf &= ~DLM_LKF_CONVDEADLK;
205
206 log_debug("rereq %x,%llx id %x %d,%d",
207 lp->lockname.ln_type,
208 (unsigned long long)lp->lockname.ln_number,
209 lp->lksb.sb_lkid, lp->cur, lp->req);
210
211 set_bit(LFL_REREQUEST, &lp->flags);
212 queue_submit(lp);
213 return;
214 }
215
216 /*
217 * DLM demoted the lock to NL before it was granted so GFS must be
218 * told it cannot cache data for this lock.
219 */
220
221 if (lp->lksb.sb_flags & DLM_SBF_DEMOTED)
222 set_bit(LFL_NOCACHE, &lp->flags);
223
224out:
225 /*
226 * This is an internal lock_dlm lock
227 */
228
229 if (test_bit(LFL_INLOCK, &lp->flags)) {
230 clear_bit(LFL_NOBLOCK, &lp->flags);
231 lp->cur = lp->req;
232 wake_up_ast(lp);
233 return;
234 }
235
236 /*
237 * Normal completion of a lock request. Tell GFS it now has the lock.
238 */
239
240 clear_bit(LFL_NOBLOCK, &lp->flags);
241 lp->cur = lp->req;
242
243 acb.lc_name = lp->lockname;
244 acb.lc_ret |= gdlm_make_lmstate(lp->cur);
245
246 if (!test_and_clear_bit(LFL_NOCACHE, &lp->flags) &&
247 (lp->cur > DLM_LOCK_NL) && (prev_mode > DLM_LOCK_NL))
248 acb.lc_ret |= LM_OUT_CACHEABLE;
249
250 ls->fscb(ls->sdp, LM_CB_ASYNC, &acb);
251}
252
253static inline int no_work(struct gdlm_ls *ls, int blocking)
254{ 13{
255 int ret; 14 int ret;
256 15
257 spin_lock(&ls->async_lock); 16 spin_lock(&ls->async_lock);
258 ret = list_empty(&ls->complete) && list_empty(&ls->submit); 17 ret = list_empty(&ls->submit);
259 if (ret && blocking)
260 ret = list_empty(&ls->blocking);
261 spin_unlock(&ls->async_lock); 18 spin_unlock(&ls->async_lock);
262 19
263 return ret; 20 return ret;
264} 21}
265 22
266static inline int check_drop(struct gdlm_ls *ls) 23static int gdlm_thread(void *data)
267{
268 if (!ls->drop_locks_count)
269 return 0;
270
271 if (time_after(jiffies, ls->drop_time + ls->drop_locks_period * HZ)) {
272 ls->drop_time = jiffies;
273 if (ls->all_locks_count >= ls->drop_locks_count)
274 return 1;
275 }
276 return 0;
277}
278
279static int gdlm_thread(void *data, int blist)
280{ 24{
281 struct gdlm_ls *ls = (struct gdlm_ls *) data; 25 struct gdlm_ls *ls = (struct gdlm_ls *) data;
282 struct gdlm_lock *lp = NULL; 26 struct gdlm_lock *lp = NULL;
283 uint8_t complete, blocking, submit, drop;
284
285 /* Only thread1 is allowed to do blocking callbacks since gfs
286 may wait for a completion callback within a blocking cb. */
287 27
288 while (!kthread_should_stop()) { 28 while (!kthread_should_stop()) {
289 wait_event_interruptible(ls->thread_wait, 29 wait_event_interruptible(ls->thread_wait,
290 !no_work(ls, blist) || kthread_should_stop()); 30 !no_work(ls) || kthread_should_stop());
291
292 complete = blocking = submit = drop = 0;
293 31
294 spin_lock(&ls->async_lock); 32 spin_lock(&ls->async_lock);
295 33
296 if (blist && !list_empty(&ls->blocking)) { 34 if (!list_empty(&ls->submit)) {
297 lp = list_entry(ls->blocking.next, struct gdlm_lock,
298 blist);
299 list_del_init(&lp->blist);
300 blocking = lp->bast_mode;
301 lp->bast_mode = 0;
302 } else if (!list_empty(&ls->complete)) {
303 lp = list_entry(ls->complete.next, struct gdlm_lock,
304 clist);
305 list_del_init(&lp->clist);
306 complete = 1;
307 } else if (!list_empty(&ls->submit)) {
308 lp = list_entry(ls->submit.next, struct gdlm_lock, 35 lp = list_entry(ls->submit.next, struct gdlm_lock,
309 delay_list); 36 delay_list);
310 list_del_init(&lp->delay_list); 37 list_del_init(&lp->delay_list);
311 submit = 1; 38 spin_unlock(&ls->async_lock);
39 gdlm_do_lock(lp);
40 spin_lock(&ls->async_lock);
312 } 41 }
313
314 drop = check_drop(ls);
315 spin_unlock(&ls->async_lock); 42 spin_unlock(&ls->async_lock);
316
317 if (complete)
318 process_complete(lp);
319
320 else if (blocking)
321 process_blocking(lp, blocking);
322
323 else if (submit)
324 gdlm_do_lock(lp);
325
326 if (drop)
327 ls->fscb(ls->sdp, LM_CB_DROPLOCKS, NULL);
328
329 schedule();
330 } 43 }
331 44
332 return 0; 45 return 0;
333} 46}
334 47
335static int gdlm_thread1(void *data)
336{
337 return gdlm_thread(data, 1);
338}
339
340static int gdlm_thread2(void *data)
341{
342 return gdlm_thread(data, 0);
343}
344
345int gdlm_init_threads(struct gdlm_ls *ls) 48int gdlm_init_threads(struct gdlm_ls *ls)
346{ 49{
347 struct task_struct *p; 50 struct task_struct *p;
348 int error; 51 int error;
349 52
350 p = kthread_run(gdlm_thread1, ls, "lock_dlm1"); 53 p = kthread_run(gdlm_thread, ls, "lock_dlm");
351 error = IS_ERR(p);
352 if (error) {
353 log_error("can't start lock_dlm1 thread %d", error);
354 return error;
355 }
356 ls->thread1 = p;
357
358 p = kthread_run(gdlm_thread2, ls, "lock_dlm2");
359 error = IS_ERR(p); 54 error = IS_ERR(p);
360 if (error) { 55 if (error) {
361 log_error("can't start lock_dlm2 thread %d", error); 56 log_error("can't start lock_dlm thread %d", error);
362 kthread_stop(ls->thread1);
363 return error; 57 return error;
364 } 58 }
365 ls->thread2 = p; 59 ls->thread = p;
366 60
367 return 0; 61 return 0;
368} 62}
369 63
370void gdlm_release_threads(struct gdlm_ls *ls) 64void gdlm_release_threads(struct gdlm_ls *ls)
371{ 65{
372 kthread_stop(ls->thread1); 66 kthread_stop(ls->thread);
373 kthread_stop(ls->thread2);
374} 67}
375 68
diff --git a/fs/gfs2/locking/nolock/Makefile b/fs/gfs2/locking/nolock/Makefile
deleted file mode 100644
index 35e9730bc3a8..000000000000
--- a/fs/gfs2/locking/nolock/Makefile
+++ /dev/null
@@ -1,3 +0,0 @@
1obj-$(CONFIG_GFS2_FS_LOCKING_NOLOCK) += lock_nolock.o
2lock_nolock-y := main.o
3
diff --git a/fs/gfs2/locking/nolock/main.c b/fs/gfs2/locking/nolock/main.c
deleted file mode 100644
index 284a5ece8d94..000000000000
--- a/fs/gfs2/locking/nolock/main.c
+++ /dev/null
@@ -1,238 +0,0 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/module.h>
11#include <linux/slab.h>
12#include <linux/init.h>
13#include <linux/types.h>
14#include <linux/fs.h>
15#include <linux/lm_interface.h>
16
17struct nolock_lockspace {
18 unsigned int nl_lvb_size;
19};
20
21static const struct lm_lockops nolock_ops;
22
23static int nolock_mount(char *table_name, char *host_data,
24 lm_callback_t cb, void *cb_data,
25 unsigned int min_lvb_size, int flags,
26 struct lm_lockstruct *lockstruct,
27 struct kobject *fskobj)
28{
29 char *c;
30 unsigned int jid;
31 struct nolock_lockspace *nl;
32
33 c = strstr(host_data, "jid=");
34 if (!c)
35 jid = 0;
36 else {
37 c += 4;
38 sscanf(c, "%u", &jid);
39 }
40
41 nl = kzalloc(sizeof(struct nolock_lockspace), GFP_KERNEL);
42 if (!nl)
43 return -ENOMEM;
44
45 nl->nl_lvb_size = min_lvb_size;
46
47 lockstruct->ls_jid = jid;
48 lockstruct->ls_first = 1;
49 lockstruct->ls_lvb_size = min_lvb_size;
50 lockstruct->ls_lockspace = nl;
51 lockstruct->ls_ops = &nolock_ops;
52 lockstruct->ls_flags = LM_LSFLAG_LOCAL;
53
54 return 0;
55}
56
57static void nolock_others_may_mount(void *lockspace)
58{
59}
60
61static void nolock_unmount(void *lockspace)
62{
63 struct nolock_lockspace *nl = lockspace;
64 kfree(nl);
65}
66
67static void nolock_withdraw(void *lockspace)
68{
69}
70
71/**
72 * nolock_get_lock - get a lm_lock_t given a descripton of the lock
73 * @lockspace: the lockspace the lock lives in
74 * @name: the name of the lock
75 * @lockp: return the lm_lock_t here
76 *
77 * Returns: 0 on success, -EXXX on failure
78 */
79
80static int nolock_get_lock(void *lockspace, struct lm_lockname *name,
81 void **lockp)
82{
83 *lockp = lockspace;
84 return 0;
85}
86
87/**
88 * nolock_put_lock - get rid of a lock structure
89 * @lock: the lock to throw away
90 *
91 */
92
93static void nolock_put_lock(void *lock)
94{
95}
96
97/**
98 * nolock_lock - acquire a lock
99 * @lock: the lock to manipulate
100 * @cur_state: the current state
101 * @req_state: the requested state
102 * @flags: modifier flags
103 *
104 * Returns: A bitmap of LM_OUT_*
105 */
106
107static unsigned int nolock_lock(void *lock, unsigned int cur_state,
108 unsigned int req_state, unsigned int flags)
109{
110 return req_state | LM_OUT_CACHEABLE;
111}
112
113/**
114 * nolock_unlock - unlock a lock
115 * @lock: the lock to manipulate
116 * @cur_state: the current state
117 *
118 * Returns: 0
119 */
120
121static unsigned int nolock_unlock(void *lock, unsigned int cur_state)
122{
123 return 0;
124}
125
126static void nolock_cancel(void *lock)
127{
128}
129
130/**
131 * nolock_hold_lvb - hold on to a lock value block
132 * @lock: the lock the LVB is associated with
133 * @lvbp: return the lm_lvb_t here
134 *
135 * Returns: 0 on success, -EXXX on failure
136 */
137
138static int nolock_hold_lvb(void *lock, char **lvbp)
139{
140 struct nolock_lockspace *nl = lock;
141 int error = 0;
142
143 *lvbp = kzalloc(nl->nl_lvb_size, GFP_NOFS);
144 if (!*lvbp)
145 error = -ENOMEM;
146
147 return error;
148}
149
150/**
151 * nolock_unhold_lvb - release a LVB
152 * @lock: the lock the LVB is associated with
153 * @lvb: the lock value block
154 *
155 */
156
157static void nolock_unhold_lvb(void *lock, char *lvb)
158{
159 kfree(lvb);
160}
161
162static int nolock_plock_get(void *lockspace, struct lm_lockname *name,
163 struct file *file, struct file_lock *fl)
164{
165 posix_test_lock(file, fl);
166
167 return 0;
168}
169
170static int nolock_plock(void *lockspace, struct lm_lockname *name,
171 struct file *file, int cmd, struct file_lock *fl)
172{
173 int error;
174 error = posix_lock_file_wait(file, fl);
175 return error;
176}
177
178static int nolock_punlock(void *lockspace, struct lm_lockname *name,
179 struct file *file, struct file_lock *fl)
180{
181 int error;
182 error = posix_lock_file_wait(file, fl);
183 return error;
184}
185
186static void nolock_recovery_done(void *lockspace, unsigned int jid,
187 unsigned int message)
188{
189}
190
191static const struct lm_lockops nolock_ops = {
192 .lm_proto_name = "lock_nolock",
193 .lm_mount = nolock_mount,
194 .lm_others_may_mount = nolock_others_may_mount,
195 .lm_unmount = nolock_unmount,
196 .lm_withdraw = nolock_withdraw,
197 .lm_get_lock = nolock_get_lock,
198 .lm_put_lock = nolock_put_lock,
199 .lm_lock = nolock_lock,
200 .lm_unlock = nolock_unlock,
201 .lm_cancel = nolock_cancel,
202 .lm_hold_lvb = nolock_hold_lvb,
203 .lm_unhold_lvb = nolock_unhold_lvb,
204 .lm_plock_get = nolock_plock_get,
205 .lm_plock = nolock_plock,
206 .lm_punlock = nolock_punlock,
207 .lm_recovery_done = nolock_recovery_done,
208 .lm_owner = THIS_MODULE,
209};
210
211static int __init init_nolock(void)
212{
213 int error;
214
215 error = gfs2_register_lockproto(&nolock_ops);
216 if (error) {
217 printk(KERN_WARNING
218 "lock_nolock: can't register protocol: %d\n", error);
219 return error;
220 }
221
222 printk(KERN_INFO
223 "Lock_Nolock (built %s %s) installed\n", __DATE__, __TIME__);
224 return 0;
225}
226
227static void __exit exit_nolock(void)
228{
229 gfs2_unregister_lockproto(&nolock_ops);
230}
231
232module_init(init_nolock);
233module_exit(exit_nolock);
234
235MODULE_DESCRIPTION("GFS Nolock Locking Module");
236MODULE_AUTHOR("Red Hat, Inc.");
237MODULE_LICENSE("GPL");
238
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 548264b1836d..6c6af9f5e3ab 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -87,6 +87,8 @@ void gfs2_remove_from_ail(struct gfs2_bufdata *bd)
87 */ 87 */
88 88
89static void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai) 89static void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
90__releases(&sdp->sd_log_lock)
91__acquires(&sdp->sd_log_lock)
90{ 92{
91 struct gfs2_bufdata *bd, *s; 93 struct gfs2_bufdata *bd, *s;
92 struct buffer_head *bh; 94 struct buffer_head *bh;
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index 771152816508..7c64510ccfd2 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -21,6 +21,7 @@
21 */ 21 */
22 22
23static inline void gfs2_log_lock(struct gfs2_sbd *sdp) 23static inline void gfs2_log_lock(struct gfs2_sbd *sdp)
24__acquires(&sdp->sd_log_lock)
24{ 25{
25 spin_lock(&sdp->sd_log_lock); 26 spin_lock(&sdp->sd_log_lock);
26} 27}
@@ -32,6 +33,7 @@ static inline void gfs2_log_lock(struct gfs2_sbd *sdp)
32 */ 33 */
33 34
34static inline void gfs2_log_unlock(struct gfs2_sbd *sdp) 35static inline void gfs2_log_unlock(struct gfs2_sbd *sdp)
36__releases(&sdp->sd_log_lock)
35{ 37{
36 spin_unlock(&sdp->sd_log_lock); 38 spin_unlock(&sdp->sd_log_lock);
37} 39}
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 053e2ebbbd50..bcc668d0fadd 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -40,8 +40,6 @@ static void gfs2_init_glock_once(struct kmem_cache *cachep, void *foo)
40 INIT_HLIST_NODE(&gl->gl_list); 40 INIT_HLIST_NODE(&gl->gl_list);
41 spin_lock_init(&gl->gl_spin); 41 spin_lock_init(&gl->gl_spin);
42 INIT_LIST_HEAD(&gl->gl_holders); 42 INIT_LIST_HEAD(&gl->gl_holders);
43 INIT_LIST_HEAD(&gl->gl_waiters1);
44 INIT_LIST_HEAD(&gl->gl_waiters3);
45 gl->gl_lvb = NULL; 43 gl->gl_lvb = NULL;
46 atomic_set(&gl->gl_lvb_count, 0); 44 atomic_set(&gl->gl_lvb_count, 0);
47 INIT_LIST_HEAD(&gl->gl_reclaim); 45 INIT_LIST_HEAD(&gl->gl_reclaim);
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 78d75f892f82..09853620c951 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -129,7 +129,7 @@ void gfs2_meta_sync(struct gfs2_glock *gl)
129} 129}
130 130
131/** 131/**
132 * getbuf - Get a buffer with a given address space 132 * gfs2_getbuf - Get a buffer with a given address space
133 * @gl: the glock 133 * @gl: the glock
134 * @blkno: the block number (filesystem scope) 134 * @blkno: the block number (filesystem scope)
135 * @create: 1 if the buffer should be created 135 * @create: 1 if the buffer should be created
@@ -137,7 +137,7 @@ void gfs2_meta_sync(struct gfs2_glock *gl)
137 * Returns: the buffer 137 * Returns: the buffer
138 */ 138 */
139 139
140static struct buffer_head *getbuf(struct gfs2_glock *gl, u64 blkno, int create) 140struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create)
141{ 141{
142 struct address_space *mapping = gl->gl_aspace->i_mapping; 142 struct address_space *mapping = gl->gl_aspace->i_mapping;
143 struct gfs2_sbd *sdp = gl->gl_sbd; 143 struct gfs2_sbd *sdp = gl->gl_sbd;
@@ -205,7 +205,7 @@ static void meta_prep_new(struct buffer_head *bh)
205struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno) 205struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno)
206{ 206{
207 struct buffer_head *bh; 207 struct buffer_head *bh;
208 bh = getbuf(gl, blkno, CREATE); 208 bh = gfs2_getbuf(gl, blkno, CREATE);
209 meta_prep_new(bh); 209 meta_prep_new(bh);
210 return bh; 210 return bh;
211} 211}
@@ -223,7 +223,7 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno)
223int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, 223int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
224 struct buffer_head **bhp) 224 struct buffer_head **bhp)
225{ 225{
226 *bhp = getbuf(gl, blkno, CREATE); 226 *bhp = gfs2_getbuf(gl, blkno, CREATE);
227 if (!buffer_uptodate(*bhp)) { 227 if (!buffer_uptodate(*bhp)) {
228 ll_rw_block(READ_META, 1, bhp); 228 ll_rw_block(READ_META, 1, bhp);
229 if (flags & DIO_WAIT) { 229 if (flags & DIO_WAIT) {
@@ -346,7 +346,7 @@ void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen)
346 struct buffer_head *bh; 346 struct buffer_head *bh;
347 347
348 while (blen) { 348 while (blen) {
349 bh = getbuf(ip->i_gl, bstart, NO_CREATE); 349 bh = gfs2_getbuf(ip->i_gl, bstart, NO_CREATE);
350 if (bh) { 350 if (bh) {
351 lock_buffer(bh); 351 lock_buffer(bh);
352 gfs2_log_lock(sdp); 352 gfs2_log_lock(sdp);
@@ -421,7 +421,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
421 if (extlen > max_ra) 421 if (extlen > max_ra)
422 extlen = max_ra; 422 extlen = max_ra;
423 423
424 first_bh = getbuf(gl, dblock, CREATE); 424 first_bh = gfs2_getbuf(gl, dblock, CREATE);
425 425
426 if (buffer_uptodate(first_bh)) 426 if (buffer_uptodate(first_bh))
427 goto out; 427 goto out;
@@ -432,7 +432,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
432 extlen--; 432 extlen--;
433 433
434 while (extlen) { 434 while (extlen) {
435 bh = getbuf(gl, dblock, CREATE); 435 bh = gfs2_getbuf(gl, dblock, CREATE);
436 436
437 if (!buffer_uptodate(bh) && !buffer_locked(bh)) 437 if (!buffer_uptodate(bh) && !buffer_locked(bh))
438 ll_rw_block(READA, 1, &bh); 438 ll_rw_block(READA, 1, &bh);
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
index 73e3b1c76fe1..b1a5f3674d43 100644
--- a/fs/gfs2/meta_io.h
+++ b/fs/gfs2/meta_io.h
@@ -47,6 +47,7 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno);
47int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, 47int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno,
48 int flags, struct buffer_head **bhp); 48 int flags, struct buffer_head **bhp);
49int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh); 49int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh);
50struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create);
50 51
51void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh, 52void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh,
52 int meta); 53 int meta);
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index f55394e57cb2..e64a1b04117a 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -499,34 +499,34 @@ static int __gfs2_readpage(void *file, struct page *page)
499 * @file: The file to read 499 * @file: The file to read
500 * @page: The page of the file 500 * @page: The page of the file
501 * 501 *
502 * This deals with the locking required. We use a trylock in order to 502 * This deals with the locking required. We have to unlock and
503 * avoid the page lock / glock ordering problems returning AOP_TRUNCATED_PAGE 503 * relock the page in order to get the locking in the right
504 * in the event that we are unable to get the lock. 504 * order.
505 */ 505 */
506 506
507static int gfs2_readpage(struct file *file, struct page *page) 507static int gfs2_readpage(struct file *file, struct page *page)
508{ 508{
509 struct gfs2_inode *ip = GFS2_I(page->mapping->host); 509 struct address_space *mapping = page->mapping;
510 struct gfs2_holder *gh; 510 struct gfs2_inode *ip = GFS2_I(mapping->host);
511 struct gfs2_holder gh;
511 int error; 512 int error;
512 513
513 gh = gfs2_glock_is_locked_by_me(ip->i_gl); 514 unlock_page(page);
514 if (!gh) { 515 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh);
515 gh = kmalloc(sizeof(struct gfs2_holder), GFP_NOFS); 516 error = gfs2_glock_nq_atime(&gh);
516 if (!gh) 517 if (unlikely(error))
517 return -ENOBUFS; 518 goto out;
518 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, gh); 519 error = AOP_TRUNCATED_PAGE;
520 lock_page(page);
521 if (page->mapping == mapping && !PageUptodate(page))
522 error = __gfs2_readpage(file, page);
523 else
519 unlock_page(page); 524 unlock_page(page);
520 error = gfs2_glock_nq_atime(gh); 525 gfs2_glock_dq(&gh);
521 if (likely(error != 0))
522 goto out;
523 return AOP_TRUNCATED_PAGE;
524 }
525 error = __gfs2_readpage(file, page);
526 gfs2_glock_dq(gh);
527out: 526out:
528 gfs2_holder_uninit(gh); 527 gfs2_holder_uninit(&gh);
529 kfree(gh); 528 if (error && error != AOP_TRUNCATED_PAGE)
529 lock_page(page);
530 return error; 530 return error;
531} 531}
532 532
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index e1b7d525a066..e9a366d4411c 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -15,6 +15,7 @@
15#include <linux/uio.h> 15#include <linux/uio.h>
16#include <linux/blkdev.h> 16#include <linux/blkdev.h>
17#include <linux/mm.h> 17#include <linux/mm.h>
18#include <linux/mount.h>
18#include <linux/fs.h> 19#include <linux/fs.h>
19#include <linux/gfs2_ondisk.h> 20#include <linux/gfs2_ondisk.h>
20#include <linux/ext2_fs.h> 21#include <linux/ext2_fs.h>
@@ -62,11 +63,11 @@ static loff_t gfs2_llseek(struct file *file, loff_t offset, int origin)
62 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, 63 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
63 &i_gh); 64 &i_gh);
64 if (!error) { 65 if (!error) {
65 error = remote_llseek(file, offset, origin); 66 error = generic_file_llseek_unlocked(file, offset, origin);
66 gfs2_glock_dq_uninit(&i_gh); 67 gfs2_glock_dq_uninit(&i_gh);
67 } 68 }
68 } else 69 } else
69 error = remote_llseek(file, offset, origin); 70 error = generic_file_llseek_unlocked(file, offset, origin);
70 71
71 return error; 72 return error;
72} 73}
@@ -133,7 +134,6 @@ static const u32 fsflags_to_gfs2[32] = {
133 [7] = GFS2_DIF_NOATIME, 134 [7] = GFS2_DIF_NOATIME,
134 [12] = GFS2_DIF_EXHASH, 135 [12] = GFS2_DIF_EXHASH,
135 [14] = GFS2_DIF_INHERIT_JDATA, 136 [14] = GFS2_DIF_INHERIT_JDATA,
136 [20] = GFS2_DIF_INHERIT_DIRECTIO,
137}; 137};
138 138
139static const u32 gfs2_to_fsflags[32] = { 139static const u32 gfs2_to_fsflags[32] = {
@@ -142,7 +142,6 @@ static const u32 gfs2_to_fsflags[32] = {
142 [gfs2fl_AppendOnly] = FS_APPEND_FL, 142 [gfs2fl_AppendOnly] = FS_APPEND_FL,
143 [gfs2fl_NoAtime] = FS_NOATIME_FL, 143 [gfs2fl_NoAtime] = FS_NOATIME_FL,
144 [gfs2fl_ExHash] = FS_INDEX_FL, 144 [gfs2fl_ExHash] = FS_INDEX_FL,
145 [gfs2fl_InheritDirectio] = FS_DIRECTIO_FL,
146 [gfs2fl_InheritJdata] = FS_JOURNAL_DATA_FL, 145 [gfs2fl_InheritJdata] = FS_JOURNAL_DATA_FL,
147}; 146};
148 147
@@ -160,12 +159,8 @@ static int gfs2_get_flags(struct file *filp, u32 __user *ptr)
160 return error; 159 return error;
161 160
162 fsflags = fsflags_cvt(gfs2_to_fsflags, ip->i_di.di_flags); 161 fsflags = fsflags_cvt(gfs2_to_fsflags, ip->i_di.di_flags);
163 if (!S_ISDIR(inode->i_mode)) { 162 if (!S_ISDIR(inode->i_mode) && ip->i_di.di_flags & GFS2_DIF_JDATA)
164 if (ip->i_di.di_flags & GFS2_DIF_JDATA) 163 fsflags |= FS_JOURNAL_DATA_FL;
165 fsflags |= FS_JOURNAL_DATA_FL;
166 if (ip->i_di.di_flags & GFS2_DIF_DIRECTIO)
167 fsflags |= FS_DIRECTIO_FL;
168 }
169 if (put_user(fsflags, ptr)) 164 if (put_user(fsflags, ptr))
170 error = -EFAULT; 165 error = -EFAULT;
171 166
@@ -194,13 +189,11 @@ void gfs2_set_inode_flags(struct inode *inode)
194 189
195/* Flags that can be set by user space */ 190/* Flags that can be set by user space */
196#define GFS2_FLAGS_USER_SET (GFS2_DIF_JDATA| \ 191#define GFS2_FLAGS_USER_SET (GFS2_DIF_JDATA| \
197 GFS2_DIF_DIRECTIO| \
198 GFS2_DIF_IMMUTABLE| \ 192 GFS2_DIF_IMMUTABLE| \
199 GFS2_DIF_APPENDONLY| \ 193 GFS2_DIF_APPENDONLY| \
200 GFS2_DIF_NOATIME| \ 194 GFS2_DIF_NOATIME| \
201 GFS2_DIF_SYNC| \ 195 GFS2_DIF_SYNC| \
202 GFS2_DIF_SYSTEM| \ 196 GFS2_DIF_SYSTEM| \
203 GFS2_DIF_INHERIT_DIRECTIO| \
204 GFS2_DIF_INHERIT_JDATA) 197 GFS2_DIF_INHERIT_JDATA)
205 198
206/** 199/**
@@ -220,10 +213,14 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
220 int error; 213 int error;
221 u32 new_flags, flags; 214 u32 new_flags, flags;
222 215
223 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); 216 error = mnt_want_write(filp->f_path.mnt);
224 if (error) 217 if (error)
225 return error; 218 return error;
226 219
220 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
221 if (error)
222 goto out_drop_write;
223
227 flags = ip->i_di.di_flags; 224 flags = ip->i_di.di_flags;
228 new_flags = (flags & ~mask) | (reqflags & mask); 225 new_flags = (flags & ~mask) | (reqflags & mask);
229 if ((new_flags ^ flags) == 0) 226 if ((new_flags ^ flags) == 0)
@@ -242,7 +239,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
242 !capable(CAP_LINUX_IMMUTABLE)) 239 !capable(CAP_LINUX_IMMUTABLE))
243 goto out; 240 goto out;
244 if (!IS_IMMUTABLE(inode)) { 241 if (!IS_IMMUTABLE(inode)) {
245 error = permission(inode, MAY_WRITE, NULL); 242 error = gfs2_permission(inode, MAY_WRITE);
246 if (error) 243 if (error)
247 goto out; 244 goto out;
248 } 245 }
@@ -272,6 +269,8 @@ out_trans_end:
272 gfs2_trans_end(sdp); 269 gfs2_trans_end(sdp);
273out: 270out:
274 gfs2_glock_dq_uninit(&gh); 271 gfs2_glock_dq_uninit(&gh);
272out_drop_write:
273 mnt_drop_write(filp->f_path.mnt);
275 return error; 274 return error;
276} 275}
277 276
@@ -285,8 +284,6 @@ static int gfs2_set_flags(struct file *filp, u32 __user *ptr)
285 if (!S_ISDIR(inode->i_mode)) { 284 if (!S_ISDIR(inode->i_mode)) {
286 if (gfsflags & GFS2_DIF_INHERIT_JDATA) 285 if (gfsflags & GFS2_DIF_INHERIT_JDATA)
287 gfsflags ^= (GFS2_DIF_JDATA | GFS2_DIF_INHERIT_JDATA); 286 gfsflags ^= (GFS2_DIF_JDATA | GFS2_DIF_INHERIT_JDATA);
288 if (gfsflags & GFS2_DIF_INHERIT_DIRECTIO)
289 gfsflags ^= (GFS2_DIF_DIRECTIO | GFS2_DIF_INHERIT_DIRECTIO);
290 return do_gfs2_set_flags(filp, gfsflags, ~0); 287 return do_gfs2_set_flags(filp, gfsflags, ~0);
291 } 288 }
292 return do_gfs2_set_flags(filp, gfsflags, ~GFS2_DIF_JDATA); 289 return do_gfs2_set_flags(filp, gfsflags, ~GFS2_DIF_JDATA);
@@ -487,11 +484,6 @@ static int gfs2_open(struct inode *inode, struct file *file)
487 goto fail_gunlock; 484 goto fail_gunlock;
488 } 485 }
489 486
490 /* Listen to the Direct I/O flag */
491
492 if (ip->i_di.di_flags & GFS2_DIF_DIRECTIO)
493 file->f_flags |= O_DIRECT;
494
495 gfs2_glock_dq_uninit(&i_gh); 487 gfs2_glock_dq_uninit(&i_gh);
496 } 488 }
497 489
@@ -669,8 +661,7 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl)
669 int error = 0; 661 int error = 0;
670 662
671 state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED; 663 state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED;
672 flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY) | GL_EXACT | GL_NOCACHE 664 flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY) | GL_EXACT | GL_NOCACHE;
673 | GL_FLOCK;
674 665
675 mutex_lock(&fp->f_fl_mutex); 666 mutex_lock(&fp->f_fl_mutex);
676 667
@@ -683,9 +674,8 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl)
683 gfs2_glock_dq_wait(fl_gh); 674 gfs2_glock_dq_wait(fl_gh);
684 gfs2_holder_reinit(state, flags, fl_gh); 675 gfs2_holder_reinit(state, flags, fl_gh);
685 } else { 676 } else {
686 error = gfs2_glock_get(GFS2_SB(&ip->i_inode), 677 error = gfs2_glock_get(GFS2_SB(&ip->i_inode), ip->i_no_addr,
687 ip->i_no_addr, &gfs2_flock_glops, 678 &gfs2_flock_glops, CREATE, &gl);
688 CREATE, &gl);
689 if (error) 679 if (error)
690 goto out; 680 goto out;
691 gfs2_holder_init(gl, state, flags, fl_gh); 681 gfs2_holder_init(gl, state, flags, fl_gh);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index b2028c82e8d1..b4d1d6490633 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -64,7 +64,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
64 mutex_init(&sdp->sd_rindex_mutex); 64 mutex_init(&sdp->sd_rindex_mutex);
65 INIT_LIST_HEAD(&sdp->sd_rindex_list); 65 INIT_LIST_HEAD(&sdp->sd_rindex_list);
66 INIT_LIST_HEAD(&sdp->sd_rindex_mru_list); 66 INIT_LIST_HEAD(&sdp->sd_rindex_mru_list);
67 INIT_LIST_HEAD(&sdp->sd_rindex_recent_list);
68 67
69 INIT_LIST_HEAD(&sdp->sd_jindex_list); 68 INIT_LIST_HEAD(&sdp->sd_jindex_list);
70 spin_lock_init(&sdp->sd_jindex_spin); 69 spin_lock_init(&sdp->sd_jindex_spin);
@@ -364,6 +363,8 @@ static int map_journal_extents(struct gfs2_sbd *sdp)
364 363
365static void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp) 364static void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp)
366{ 365{
366 if (!sdp->sd_lockstruct.ls_ops->lm_others_may_mount)
367 return;
367 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) 368 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
368 sdp->sd_lockstruct.ls_ops->lm_others_may_mount( 369 sdp->sd_lockstruct.ls_ops->lm_others_may_mount(
369 sdp->sd_lockstruct.ls_lockspace); 370 sdp->sd_lockstruct.ls_lockspace);
@@ -741,8 +742,7 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
741 goto out; 742 goto out;
742 } 743 }
743 744
744 if (gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lockspace) || 745 if (gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_ops) ||
745 gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_ops) ||
746 gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lvb_size >= 746 gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lvb_size >=
747 GFS2_MIN_LVB_SIZE)) { 747 GFS2_MIN_LVB_SIZE)) {
748 gfs2_unmount_lockproto(&sdp->sd_lockstruct); 748 gfs2_unmount_lockproto(&sdp->sd_lockstruct);
@@ -873,7 +873,7 @@ fail_sb:
873fail_locking: 873fail_locking:
874 init_locking(sdp, &mount_gh, UNDO); 874 init_locking(sdp, &mount_gh, UNDO);
875fail_lm: 875fail_lm:
876 gfs2_gl_hash_clear(sdp, WAIT); 876 gfs2_gl_hash_clear(sdp);
877 gfs2_lm_unmount(sdp); 877 gfs2_lm_unmount(sdp);
878 while (invalidate_inodes(sb)) 878 while (invalidate_inodes(sb))
879 yield(); 879 yield();
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 2686ad4c0029..1e252dfc5294 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -163,7 +163,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
163 if (error) 163 if (error)
164 goto out; 164 goto out;
165 165
166 error = permission(dir, MAY_WRITE | MAY_EXEC, NULL); 166 error = gfs2_permission(dir, MAY_WRITE | MAY_EXEC);
167 if (error) 167 if (error)
168 goto out_gunlock; 168 goto out_gunlock;
169 169
@@ -669,7 +669,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
669 } 669 }
670 } 670 }
671 } else { 671 } else {
672 error = permission(ndir, MAY_WRITE | MAY_EXEC, NULL); 672 error = gfs2_permission(ndir, MAY_WRITE | MAY_EXEC);
673 if (error) 673 if (error)
674 goto out_gunlock; 674 goto out_gunlock;
675 675
@@ -704,7 +704,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
704 /* Check out the dir to be renamed */ 704 /* Check out the dir to be renamed */
705 705
706 if (dir_rename) { 706 if (dir_rename) {
707 error = permission(odentry->d_inode, MAY_WRITE, NULL); 707 error = gfs2_permission(odentry->d_inode, MAY_WRITE);
708 if (error) 708 if (error)
709 goto out_gunlock; 709 goto out_gunlock;
710 } 710 }
@@ -891,7 +891,7 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
891 * Returns: errno 891 * Returns: errno
892 */ 892 */
893 893
894static int gfs2_permission(struct inode *inode, int mask, struct nameidata *nd) 894int gfs2_permission(struct inode *inode, int mask)
895{ 895{
896 struct gfs2_inode *ip = GFS2_I(inode); 896 struct gfs2_inode *ip = GFS2_I(inode);
897 struct gfs2_holder i_gh; 897 struct gfs2_holder i_gh;
@@ -905,13 +905,22 @@ static int gfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
905 unlock = 1; 905 unlock = 1;
906 } 906 }
907 907
908 error = generic_permission(inode, mask, gfs2_check_acl); 908 if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
909 error = -EACCES;
910 else
911 error = generic_permission(inode, mask, gfs2_check_acl);
909 if (unlock) 912 if (unlock)
910 gfs2_glock_dq_uninit(&i_gh); 913 gfs2_glock_dq_uninit(&i_gh);
911 914
912 return error; 915 return error;
913} 916}
914 917
918static int gfs2_iop_permission(struct inode *inode, int mask,
919 struct nameidata *nd)
920{
921 return gfs2_permission(inode, mask);
922}
923
915static int setattr_size(struct inode *inode, struct iattr *attr) 924static int setattr_size(struct inode *inode, struct iattr *attr)
916{ 925{
917 struct gfs2_inode *ip = GFS2_I(inode); 926 struct gfs2_inode *ip = GFS2_I(inode);
@@ -1141,7 +1150,7 @@ static int gfs2_removexattr(struct dentry *dentry, const char *name)
1141} 1150}
1142 1151
1143const struct inode_operations gfs2_file_iops = { 1152const struct inode_operations gfs2_file_iops = {
1144 .permission = gfs2_permission, 1153 .permission = gfs2_iop_permission,
1145 .setattr = gfs2_setattr, 1154 .setattr = gfs2_setattr,
1146 .getattr = gfs2_getattr, 1155 .getattr = gfs2_getattr,
1147 .setxattr = gfs2_setxattr, 1156 .setxattr = gfs2_setxattr,
@@ -1160,7 +1169,7 @@ const struct inode_operations gfs2_dir_iops = {
1160 .rmdir = gfs2_rmdir, 1169 .rmdir = gfs2_rmdir,
1161 .mknod = gfs2_mknod, 1170 .mknod = gfs2_mknod,
1162 .rename = gfs2_rename, 1171 .rename = gfs2_rename,
1163 .permission = gfs2_permission, 1172 .permission = gfs2_iop_permission,
1164 .setattr = gfs2_setattr, 1173 .setattr = gfs2_setattr,
1165 .getattr = gfs2_getattr, 1174 .getattr = gfs2_getattr,
1166 .setxattr = gfs2_setxattr, 1175 .setxattr = gfs2_setxattr,
@@ -1172,7 +1181,7 @@ const struct inode_operations gfs2_dir_iops = {
1172const struct inode_operations gfs2_symlink_iops = { 1181const struct inode_operations gfs2_symlink_iops = {
1173 .readlink = gfs2_readlink, 1182 .readlink = gfs2_readlink,
1174 .follow_link = gfs2_follow_link, 1183 .follow_link = gfs2_follow_link,
1175 .permission = gfs2_permission, 1184 .permission = gfs2_iop_permission,
1176 .setattr = gfs2_setattr, 1185 .setattr = gfs2_setattr,
1177 .getattr = gfs2_getattr, 1186 .getattr = gfs2_getattr,
1178 .setxattr = gfs2_setxattr, 1187 .setxattr = gfs2_setxattr,
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
index 0b7cc920eb89..f66ea0f7a356 100644
--- a/fs/gfs2/ops_super.c
+++ b/fs/gfs2/ops_super.c
@@ -126,7 +126,7 @@ static void gfs2_put_super(struct super_block *sb)
126 gfs2_clear_rgrpd(sdp); 126 gfs2_clear_rgrpd(sdp);
127 gfs2_jindex_free(sdp); 127 gfs2_jindex_free(sdp);
128 /* Take apart glock structures and buffer lists */ 128 /* Take apart glock structures and buffer lists */
129 gfs2_gl_hash_clear(sdp, WAIT); 129 gfs2_gl_hash_clear(sdp);
130 /* Unmount the locking protocol */ 130 /* Unmount the locking protocol */
131 gfs2_lm_unmount(sdp); 131 gfs2_lm_unmount(sdp);
132 132
@@ -155,7 +155,7 @@ static void gfs2_write_super(struct super_block *sb)
155static int gfs2_sync_fs(struct super_block *sb, int wait) 155static int gfs2_sync_fs(struct super_block *sb, int wait)
156{ 156{
157 sb->s_dirt = 0; 157 sb->s_dirt = 0;
158 if (wait) 158 if (wait && sb->s_fs_info)
159 gfs2_log_flush(sb->s_fs_info, NULL); 159 gfs2_log_flush(sb->s_fs_info, NULL);
160 return 0; 160 return 0;
161} 161}
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 56aaf915c59a..3e073f5144fa 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -904,7 +904,7 @@ static int need_sync(struct gfs2_quota_data *qd)
904 do_sync = 0; 904 do_sync = 0;
905 else { 905 else {
906 value *= gfs2_jindex_size(sdp) * num; 906 value *= gfs2_jindex_size(sdp) * num;
907 do_div(value, den); 907 value = div_s64(value, den);
908 value += (s64)be64_to_cpu(qd->qd_qb.qb_value); 908 value += (s64)be64_to_cpu(qd->qd_qb.qb_value);
909 if (value < (s64)be64_to_cpu(qd->qd_qb.qb_limit)) 909 if (value < (s64)be64_to_cpu(qd->qd_qb.qb_limit))
910 do_sync = 0; 910 do_sync = 0;
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 2888e4b4b1c5..d5e91f4f6a0b 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -428,6 +428,9 @@ static int clean_journal(struct gfs2_jdesc *jd, struct gfs2_log_header_host *hea
428static void gfs2_lm_recovery_done(struct gfs2_sbd *sdp, unsigned int jid, 428static void gfs2_lm_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
429 unsigned int message) 429 unsigned int message)
430{ 430{
431 if (!sdp->sd_lockstruct.ls_ops->lm_recovery_done)
432 return;
433
431 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) 434 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
432 sdp->sd_lockstruct.ls_ops->lm_recovery_done( 435 sdp->sd_lockstruct.ls_ops->lm_recovery_done(
433 sdp->sd_lockstruct.ls_lockspace, jid, message); 436 sdp->sd_lockstruct.ls_lockspace, jid, message);
@@ -505,7 +508,7 @@ int gfs2_recover_journal(struct gfs2_jdesc *jd)
505 508
506 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, 509 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED,
507 LM_FLAG_NOEXP | LM_FLAG_PRIORITY | 510 LM_FLAG_NOEXP | LM_FLAG_PRIORITY |
508 GL_NOCANCEL | GL_NOCACHE, &t_gh); 511 GL_NOCACHE, &t_gh);
509 if (error) 512 if (error)
510 goto fail_gunlock_ji; 513 goto fail_gunlock_ji;
511 514
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 3401628d742b..2d90fb253505 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -371,11 +371,6 @@ static void clear_rgrpdi(struct gfs2_sbd *sdp)
371 371
372 spin_lock(&sdp->sd_rindex_spin); 372 spin_lock(&sdp->sd_rindex_spin);
373 sdp->sd_rindex_forward = NULL; 373 sdp->sd_rindex_forward = NULL;
374 head = &sdp->sd_rindex_recent_list;
375 while (!list_empty(head)) {
376 rgd = list_entry(head->next, struct gfs2_rgrpd, rd_recent);
377 list_del(&rgd->rd_recent);
378 }
379 spin_unlock(&sdp->sd_rindex_spin); 374 spin_unlock(&sdp->sd_rindex_spin);
380 375
381 head = &sdp->sd_rindex_list; 376 head = &sdp->sd_rindex_list;
@@ -945,107 +940,30 @@ static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked)
945} 940}
946 941
947/** 942/**
948 * recent_rgrp_first - get first RG from "recent" list
949 * @sdp: The GFS2 superblock
950 * @rglast: address of the rgrp used last
951 *
952 * Returns: The first rgrp in the recent list
953 */
954
955static struct gfs2_rgrpd *recent_rgrp_first(struct gfs2_sbd *sdp,
956 u64 rglast)
957{
958 struct gfs2_rgrpd *rgd;
959
960 spin_lock(&sdp->sd_rindex_spin);
961
962 if (rglast) {
963 list_for_each_entry(rgd, &sdp->sd_rindex_recent_list, rd_recent) {
964 if (rgrp_contains_block(rgd, rglast))
965 goto out;
966 }
967 }
968 rgd = NULL;
969 if (!list_empty(&sdp->sd_rindex_recent_list))
970 rgd = list_entry(sdp->sd_rindex_recent_list.next,
971 struct gfs2_rgrpd, rd_recent);
972out:
973 spin_unlock(&sdp->sd_rindex_spin);
974 return rgd;
975}
976
977/**
978 * recent_rgrp_next - get next RG from "recent" list 943 * recent_rgrp_next - get next RG from "recent" list
979 * @cur_rgd: current rgrp 944 * @cur_rgd: current rgrp
980 * @remove:
981 * 945 *
982 * Returns: The next rgrp in the recent list 946 * Returns: The next rgrp in the recent list
983 */ 947 */
984 948
985static struct gfs2_rgrpd *recent_rgrp_next(struct gfs2_rgrpd *cur_rgd, 949static struct gfs2_rgrpd *recent_rgrp_next(struct gfs2_rgrpd *cur_rgd)
986 int remove)
987{ 950{
988 struct gfs2_sbd *sdp = cur_rgd->rd_sbd; 951 struct gfs2_sbd *sdp = cur_rgd->rd_sbd;
989 struct list_head *head; 952 struct list_head *head;
990 struct gfs2_rgrpd *rgd; 953 struct gfs2_rgrpd *rgd;
991 954
992 spin_lock(&sdp->sd_rindex_spin); 955 spin_lock(&sdp->sd_rindex_spin);
993 956 head = &sdp->sd_rindex_mru_list;
994 head = &sdp->sd_rindex_recent_list; 957 if (unlikely(cur_rgd->rd_list_mru.next == head)) {
995 958 spin_unlock(&sdp->sd_rindex_spin);
996 list_for_each_entry(rgd, head, rd_recent) { 959 return NULL;
997 if (rgd == cur_rgd) {
998 if (cur_rgd->rd_recent.next != head)
999 rgd = list_entry(cur_rgd->rd_recent.next,
1000 struct gfs2_rgrpd, rd_recent);
1001 else
1002 rgd = NULL;
1003
1004 if (remove)
1005 list_del(&cur_rgd->rd_recent);
1006
1007 goto out;
1008 }
1009 } 960 }
1010 961 rgd = list_entry(cur_rgd->rd_list_mru.next, struct gfs2_rgrpd, rd_list_mru);
1011 rgd = NULL;
1012 if (!list_empty(head))
1013 rgd = list_entry(head->next, struct gfs2_rgrpd, rd_recent);
1014
1015out:
1016 spin_unlock(&sdp->sd_rindex_spin); 962 spin_unlock(&sdp->sd_rindex_spin);
1017 return rgd; 963 return rgd;
1018} 964}
1019 965
1020/** 966/**
1021 * recent_rgrp_add - add an RG to tail of "recent" list
1022 * @new_rgd: The rgrp to add
1023 *
1024 */
1025
1026static void recent_rgrp_add(struct gfs2_rgrpd *new_rgd)
1027{
1028 struct gfs2_sbd *sdp = new_rgd->rd_sbd;
1029 struct gfs2_rgrpd *rgd;
1030 unsigned int count = 0;
1031 unsigned int max = sdp->sd_rgrps / gfs2_jindex_size(sdp);
1032
1033 spin_lock(&sdp->sd_rindex_spin);
1034
1035 list_for_each_entry(rgd, &sdp->sd_rindex_recent_list, rd_recent) {
1036 if (rgd == new_rgd)
1037 goto out;
1038
1039 if (++count >= max)
1040 goto out;
1041 }
1042 list_add_tail(&new_rgd->rd_recent, &sdp->sd_rindex_recent_list);
1043
1044out:
1045 spin_unlock(&sdp->sd_rindex_spin);
1046}
1047
1048/**
1049 * forward_rgrp_get - get an rgrp to try next from full list 967 * forward_rgrp_get - get an rgrp to try next from full list
1050 * @sdp: The GFS2 superblock 968 * @sdp: The GFS2 superblock
1051 * 969 *
@@ -1112,9 +1030,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
1112 int loops = 0; 1030 int loops = 0;
1113 int error, rg_locked; 1031 int error, rg_locked;
1114 1032
1115 /* Try recently successful rgrps */ 1033 rgd = gfs2_blk2rgrpd(sdp, ip->i_goal);
1116
1117 rgd = recent_rgrp_first(sdp, ip->i_goal);
1118 1034
1119 while (rgd) { 1035 while (rgd) {
1120 rg_locked = 0; 1036 rg_locked = 0;
@@ -1136,11 +1052,9 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
1136 gfs2_glock_dq_uninit(&al->al_rgd_gh); 1052 gfs2_glock_dq_uninit(&al->al_rgd_gh);
1137 if (inode) 1053 if (inode)
1138 return inode; 1054 return inode;
1139 rgd = recent_rgrp_next(rgd, 1); 1055 /* fall through */
1140 break;
1141
1142 case GLR_TRYFAILED: 1056 case GLR_TRYFAILED:
1143 rgd = recent_rgrp_next(rgd, 0); 1057 rgd = recent_rgrp_next(rgd);
1144 break; 1058 break;
1145 1059
1146 default: 1060 default:
@@ -1199,7 +1113,9 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
1199 1113
1200out: 1114out:
1201 if (begin) { 1115 if (begin) {
1202 recent_rgrp_add(rgd); 1116 spin_lock(&sdp->sd_rindex_spin);
1117 list_move(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list);
1118 spin_unlock(&sdp->sd_rindex_spin);
1203 rgd = gfs2_rgrpd_get_next(rgd); 1119 rgd = gfs2_rgrpd_get_next(rgd);
1204 if (!rgd) 1120 if (!rgd)
1205 rgd = gfs2_rgrpd_get_first(sdp); 1121 rgd = gfs2_rgrpd_get_first(sdp);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 7aeacbc65f35..63a8a902d9db 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -65,7 +65,6 @@ void gfs2_tune_init(struct gfs2_tune *gt)
65 gt->gt_quota_quantum = 60; 65 gt->gt_quota_quantum = 60;
66 gt->gt_atime_quantum = 3600; 66 gt->gt_atime_quantum = 3600;
67 gt->gt_new_files_jdata = 0; 67 gt->gt_new_files_jdata = 0;
68 gt->gt_new_files_directio = 0;
69 gt->gt_max_readahead = 1 << 18; 68 gt->gt_max_readahead = 1 << 18;
70 gt->gt_stall_secs = 600; 69 gt->gt_stall_secs = 600;
71 gt->gt_complain_secs = 10; 70 gt->gt_complain_secs = 10;
@@ -941,8 +940,7 @@ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp,
941 } 940 }
942 941
943 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_DEFERRED, 942 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_DEFERRED,
944 LM_FLAG_PRIORITY | GL_NOCACHE, 943 GL_NOCACHE, t_gh);
945 t_gh);
946 944
947 list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { 945 list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
948 error = gfs2_jdesc_check(jd); 946 error = gfs2_jdesc_check(jd);
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 9ab9fc85ecd0..74846559fc3f 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -110,18 +110,6 @@ static ssize_t statfs_sync_store(struct gfs2_sbd *sdp, const char *buf,
110 return len; 110 return len;
111} 111}
112 112
113static ssize_t shrink_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
114{
115 if (!capable(CAP_SYS_ADMIN))
116 return -EACCES;
117
118 if (simple_strtol(buf, NULL, 0) != 1)
119 return -EINVAL;
120
121 gfs2_gl_hash_clear(sdp, NO_WAIT);
122 return len;
123}
124
125static ssize_t quota_sync_store(struct gfs2_sbd *sdp, const char *buf, 113static ssize_t quota_sync_store(struct gfs2_sbd *sdp, const char *buf,
126 size_t len) 114 size_t len)
127{ 115{
@@ -175,7 +163,6 @@ static struct gfs2_attr gfs2_attr_##name = __ATTR(name, mode, show, store)
175GFS2_ATTR(id, 0444, id_show, NULL); 163GFS2_ATTR(id, 0444, id_show, NULL);
176GFS2_ATTR(fsname, 0444, fsname_show, NULL); 164GFS2_ATTR(fsname, 0444, fsname_show, NULL);
177GFS2_ATTR(freeze, 0644, freeze_show, freeze_store); 165GFS2_ATTR(freeze, 0644, freeze_show, freeze_store);
178GFS2_ATTR(shrink, 0200, NULL, shrink_store);
179GFS2_ATTR(withdraw, 0644, withdraw_show, withdraw_store); 166GFS2_ATTR(withdraw, 0644, withdraw_show, withdraw_store);
180GFS2_ATTR(statfs_sync, 0200, NULL, statfs_sync_store); 167GFS2_ATTR(statfs_sync, 0200, NULL, statfs_sync_store);
181GFS2_ATTR(quota_sync, 0200, NULL, quota_sync_store); 168GFS2_ATTR(quota_sync, 0200, NULL, quota_sync_store);
@@ -186,7 +173,6 @@ static struct attribute *gfs2_attrs[] = {
186 &gfs2_attr_id.attr, 173 &gfs2_attr_id.attr,
187 &gfs2_attr_fsname.attr, 174 &gfs2_attr_fsname.attr,
188 &gfs2_attr_freeze.attr, 175 &gfs2_attr_freeze.attr,
189 &gfs2_attr_shrink.attr,
190 &gfs2_attr_withdraw.attr, 176 &gfs2_attr_withdraw.attr,
191 &gfs2_attr_statfs_sync.attr, 177 &gfs2_attr_statfs_sync.attr,
192 &gfs2_attr_quota_sync.attr, 178 &gfs2_attr_quota_sync.attr,
@@ -426,7 +412,6 @@ TUNE_ATTR(max_readahead, 0);
426TUNE_ATTR(complain_secs, 0); 412TUNE_ATTR(complain_secs, 0);
427TUNE_ATTR(statfs_slow, 0); 413TUNE_ATTR(statfs_slow, 0);
428TUNE_ATTR(new_files_jdata, 0); 414TUNE_ATTR(new_files_jdata, 0);
429TUNE_ATTR(new_files_directio, 0);
430TUNE_ATTR(quota_simul_sync, 1); 415TUNE_ATTR(quota_simul_sync, 1);
431TUNE_ATTR(quota_cache_secs, 1); 416TUNE_ATTR(quota_cache_secs, 1);
432TUNE_ATTR(stall_secs, 1); 417TUNE_ATTR(stall_secs, 1);
@@ -455,7 +440,6 @@ static struct attribute *tune_attrs[] = {
455 &tune_attr_quotad_secs.attr, 440 &tune_attr_quotad_secs.attr,
456 &tune_attr_quota_scale.attr, 441 &tune_attr_quota_scale.attr,
457 &tune_attr_new_files_jdata.attr, 442 &tune_attr_new_files_jdata.attr,
458 &tune_attr_new_files_directio.attr,
459 NULL, 443 NULL,
460}; 444};
461 445
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 6914598022ce..91389c8aee8a 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -688,7 +688,6 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact
688 688
689 J_ASSERT(transaction->t_state == T_FINISHED); 689 J_ASSERT(transaction->t_state == T_FINISHED);
690 J_ASSERT(transaction->t_buffers == NULL); 690 J_ASSERT(transaction->t_buffers == NULL);
691 J_ASSERT(transaction->t_sync_datalist == NULL);
692 J_ASSERT(transaction->t_forget == NULL); 691 J_ASSERT(transaction->t_forget == NULL);
693 J_ASSERT(transaction->t_iobuf_list == NULL); 692 J_ASSERT(transaction->t_iobuf_list == NULL);
694 J_ASSERT(transaction->t_shadow_list == NULL); 693 J_ASSERT(transaction->t_shadow_list == NULL);
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index a2ed72f7ceee..f8b3be873226 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -22,6 +22,8 @@
22#include <linux/pagemap.h> 22#include <linux/pagemap.h>
23#include <linux/jiffies.h> 23#include <linux/jiffies.h>
24#include <linux/crc32.h> 24#include <linux/crc32.h>
25#include <linux/writeback.h>
26#include <linux/backing-dev.h>
25 27
26/* 28/*
27 * Default IO end handler for temporary BJ_IO buffer_heads. 29 * Default IO end handler for temporary BJ_IO buffer_heads.
@@ -37,8 +39,8 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
37} 39}
38 40
39/* 41/*
40 * When an ext3-ordered file is truncated, it is possible that many pages are 42 * When an ext4 file is truncated, it is possible that some pages are not
41 * not sucessfully freed, because they are attached to a committing transaction. 43 * successfully freed, because they are attached to a committing transaction.
42 * After the transaction commits, these pages are left on the LRU, with no 44 * After the transaction commits, these pages are left on the LRU, with no
43 * ->mapping, and with attached buffers. These pages are trivially reclaimable 45 * ->mapping, and with attached buffers. These pages are trivially reclaimable
44 * by the VM, but their apparent absence upsets the VM accounting, and it makes 46 * by the VM, but their apparent absence upsets the VM accounting, and it makes
@@ -80,21 +82,6 @@ nope:
80} 82}
81 83
82/* 84/*
83 * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
84 * held. For ranking reasons we must trylock. If we lose, schedule away and
85 * return 0. j_list_lock is dropped in this case.
86 */
87static int inverted_lock(journal_t *journal, struct buffer_head *bh)
88{
89 if (!jbd_trylock_bh_state(bh)) {
90 spin_unlock(&journal->j_list_lock);
91 schedule();
92 return 0;
93 }
94 return 1;
95}
96
97/*
98 * Done it all: now submit the commit record. We should have 85 * Done it all: now submit the commit record. We should have
99 * cleaned up our previous buffers by now, so if we are in abort 86 * cleaned up our previous buffers by now, so if we are in abort
100 * mode we can now just skip the rest of the journal write 87 * mode we can now just skip the rest of the journal write
@@ -112,6 +99,7 @@ static int journal_submit_commit_record(journal_t *journal,
112 struct buffer_head *bh; 99 struct buffer_head *bh;
113 int ret; 100 int ret;
114 int barrier_done = 0; 101 int barrier_done = 0;
102 struct timespec now = current_kernel_time();
115 103
116 if (is_journal_aborted(journal)) 104 if (is_journal_aborted(journal))
117 return 0; 105 return 0;
@@ -126,6 +114,8 @@ static int journal_submit_commit_record(journal_t *journal,
126 tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); 114 tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
127 tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK); 115 tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
128 tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid); 116 tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
117 tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
118 tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
129 119
130 if (JBD2_HAS_COMPAT_FEATURE(journal, 120 if (JBD2_HAS_COMPAT_FEATURE(journal,
131 JBD2_FEATURE_COMPAT_CHECKSUM)) { 121 JBD2_FEATURE_COMPAT_CHECKSUM)) {
@@ -197,159 +187,104 @@ static int journal_wait_on_commit_record(struct buffer_head *bh)
197} 187}
198 188
199/* 189/*
200 * Wait for all submitted IO to complete. 190 * write the filemap data using writepage() address_space_operations.
191 * We don't do block allocation here even for delalloc. We don't
192 * use writepages() because with delayed allocation we may be doing
193 * block allocation in writepages().
201 */ 194 */
202static int journal_wait_on_locked_list(journal_t *journal, 195static int journal_submit_inode_data_buffers(struct address_space *mapping)
203 transaction_t *commit_transaction)
204{ 196{
205 int ret = 0; 197 int ret;
206 struct journal_head *jh; 198 struct writeback_control wbc = {
207 199 .sync_mode = WB_SYNC_ALL,
208 while (commit_transaction->t_locked_list) { 200 .nr_to_write = mapping->nrpages * 2,
209 struct buffer_head *bh; 201 .range_start = 0,
210 202 .range_end = i_size_read(mapping->host),
211 jh = commit_transaction->t_locked_list->b_tprev; 203 .for_writepages = 1,
212 bh = jh2bh(jh); 204 };
213 get_bh(bh); 205
214 if (buffer_locked(bh)) { 206 ret = generic_writepages(mapping, &wbc);
215 spin_unlock(&journal->j_list_lock);
216 wait_on_buffer(bh);
217 if (unlikely(!buffer_uptodate(bh)))
218 ret = -EIO;
219 spin_lock(&journal->j_list_lock);
220 }
221 if (!inverted_lock(journal, bh)) {
222 put_bh(bh);
223 spin_lock(&journal->j_list_lock);
224 continue;
225 }
226 if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
227 __jbd2_journal_unfile_buffer(jh);
228 jbd_unlock_bh_state(bh);
229 jbd2_journal_remove_journal_head(bh);
230 put_bh(bh);
231 } else {
232 jbd_unlock_bh_state(bh);
233 }
234 put_bh(bh);
235 cond_resched_lock(&journal->j_list_lock);
236 }
237 return ret; 207 return ret;
238 } 208}
239 209
240static void journal_do_submit_data(struct buffer_head **wbuf, int bufs) 210/*
211 * Submit all the data buffers of inode associated with the transaction to
212 * disk.
213 *
214 * We are in a committing transaction. Therefore no new inode can be added to
215 * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
216 * operate on from being released while we write out pages.
217 */
218static int journal_submit_data_buffers(journal_t *journal,
219 transaction_t *commit_transaction)
241{ 220{
242 int i; 221 struct jbd2_inode *jinode;
222 int err, ret = 0;
223 struct address_space *mapping;
243 224
244 for (i = 0; i < bufs; i++) { 225 spin_lock(&journal->j_list_lock);
245 wbuf[i]->b_end_io = end_buffer_write_sync; 226 list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
246 /* We use-up our safety reference in submit_bh() */ 227 mapping = jinode->i_vfs_inode->i_mapping;
247 submit_bh(WRITE, wbuf[i]); 228 jinode->i_flags |= JI_COMMIT_RUNNING;
229 spin_unlock(&journal->j_list_lock);
230 /*
231 * submit the inode data buffers. We use writepage
232 * instead of writepages, because writepages can do
233 * block allocation with delalloc. We need to write
234 * only allocated blocks here.
235 */
236 err = journal_submit_inode_data_buffers(mapping);
237 if (!ret)
238 ret = err;
239 spin_lock(&journal->j_list_lock);
240 J_ASSERT(jinode->i_transaction == commit_transaction);
241 jinode->i_flags &= ~JI_COMMIT_RUNNING;
242 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
248 } 243 }
244 spin_unlock(&journal->j_list_lock);
245 return ret;
249} 246}
250 247
251/* 248/*
252 * Submit all the data buffers to disk 249 * Wait for data submitted for writeout, refile inodes to proper
250 * transaction if needed.
251 *
253 */ 252 */
254static void journal_submit_data_buffers(journal_t *journal, 253static int journal_finish_inode_data_buffers(journal_t *journal,
255 transaction_t *commit_transaction) 254 transaction_t *commit_transaction)
256{ 255{
257 struct journal_head *jh; 256 struct jbd2_inode *jinode, *next_i;
258 struct buffer_head *bh; 257 int err, ret = 0;
259 int locked;
260 int bufs = 0;
261 struct buffer_head **wbuf = journal->j_wbuf;
262 258
263 /* 259 /* For locking, see the comment in journal_submit_data_buffers() */
264 * Whenever we unlock the journal and sleep, things can get added
265 * onto ->t_sync_datalist, so we have to keep looping back to
266 * write_out_data until we *know* that the list is empty.
267 *
268 * Cleanup any flushed data buffers from the data list. Even in
269 * abort mode, we want to flush this out as soon as possible.
270 */
271write_out_data:
272 cond_resched();
273 spin_lock(&journal->j_list_lock); 260 spin_lock(&journal->j_list_lock);
261 list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
262 jinode->i_flags |= JI_COMMIT_RUNNING;
263 spin_unlock(&journal->j_list_lock);
264 err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
265 if (!ret)
266 ret = err;
267 spin_lock(&journal->j_list_lock);
268 jinode->i_flags &= ~JI_COMMIT_RUNNING;
269 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
270 }
274 271
275 while (commit_transaction->t_sync_datalist) { 272 /* Now refile inode to proper lists */
276 jh = commit_transaction->t_sync_datalist; 273 list_for_each_entry_safe(jinode, next_i,
277 bh = jh2bh(jh); 274 &commit_transaction->t_inode_list, i_list) {
278 locked = 0; 275 list_del(&jinode->i_list);
279 276 if (jinode->i_next_transaction) {
280 /* Get reference just to make sure buffer does not disappear 277 jinode->i_transaction = jinode->i_next_transaction;
281 * when we are forced to drop various locks */ 278 jinode->i_next_transaction = NULL;
282 get_bh(bh); 279 list_add(&jinode->i_list,
283 /* If the buffer is dirty, we need to submit IO and hence 280 &jinode->i_transaction->t_inode_list);
284 * we need the buffer lock. We try to lock the buffer without
285 * blocking. If we fail, we need to drop j_list_lock and do
286 * blocking lock_buffer().
287 */
288 if (buffer_dirty(bh)) {
289 if (test_set_buffer_locked(bh)) {
290 BUFFER_TRACE(bh, "needs blocking lock");
291 spin_unlock(&journal->j_list_lock);
292 /* Write out all data to prevent deadlocks */
293 journal_do_submit_data(wbuf, bufs);
294 bufs = 0;
295 lock_buffer(bh);
296 spin_lock(&journal->j_list_lock);
297 }
298 locked = 1;
299 }
300 /* We have to get bh_state lock. Again out of order, sigh. */
301 if (!inverted_lock(journal, bh)) {
302 jbd_lock_bh_state(bh);
303 spin_lock(&journal->j_list_lock);
304 }
305 /* Someone already cleaned up the buffer? */
306 if (!buffer_jbd(bh)
307 || jh->b_transaction != commit_transaction
308 || jh->b_jlist != BJ_SyncData) {
309 jbd_unlock_bh_state(bh);
310 if (locked)
311 unlock_buffer(bh);
312 BUFFER_TRACE(bh, "already cleaned up");
313 put_bh(bh);
314 continue;
315 }
316 if (locked && test_clear_buffer_dirty(bh)) {
317 BUFFER_TRACE(bh, "needs writeout, adding to array");
318 wbuf[bufs++] = bh;
319 __jbd2_journal_file_buffer(jh, commit_transaction,
320 BJ_Locked);
321 jbd_unlock_bh_state(bh);
322 if (bufs == journal->j_wbufsize) {
323 spin_unlock(&journal->j_list_lock);
324 journal_do_submit_data(wbuf, bufs);
325 bufs = 0;
326 goto write_out_data;
327 }
328 } else if (!locked && buffer_locked(bh)) {
329 __jbd2_journal_file_buffer(jh, commit_transaction,
330 BJ_Locked);
331 jbd_unlock_bh_state(bh);
332 put_bh(bh);
333 } else { 281 } else {
334 BUFFER_TRACE(bh, "writeout complete: unfile"); 282 jinode->i_transaction = NULL;
335 __jbd2_journal_unfile_buffer(jh);
336 jbd_unlock_bh_state(bh);
337 if (locked)
338 unlock_buffer(bh);
339 jbd2_journal_remove_journal_head(bh);
340 /* Once for our safety reference, once for
341 * jbd2_journal_remove_journal_head() */
342 put_bh(bh);
343 put_bh(bh);
344 }
345
346 if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
347 spin_unlock(&journal->j_list_lock);
348 goto write_out_data;
349 } 283 }
350 } 284 }
351 spin_unlock(&journal->j_list_lock); 285 spin_unlock(&journal->j_list_lock);
352 journal_do_submit_data(wbuf, bufs); 286
287 return ret;
353} 288}
354 289
355static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh) 290static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
@@ -524,21 +459,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
524 * Now start flushing things to disk, in the order they appear 459 * Now start flushing things to disk, in the order they appear
525 * on the transaction lists. Data blocks go first. 460 * on the transaction lists. Data blocks go first.
526 */ 461 */
527 err = 0; 462 err = journal_submit_data_buffers(journal, commit_transaction);
528 journal_submit_data_buffers(journal, commit_transaction);
529
530 /*
531 * Wait for all previously submitted IO to complete if commit
532 * record is to be written synchronously.
533 */
534 spin_lock(&journal->j_list_lock);
535 if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
536 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
537 err = journal_wait_on_locked_list(journal,
538 commit_transaction);
539
540 spin_unlock(&journal->j_list_lock);
541
542 if (err) 463 if (err)
543 jbd2_journal_abort(journal, err); 464 jbd2_journal_abort(journal, err);
544 465
@@ -547,16 +468,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
547 jbd_debug(3, "JBD: commit phase 2\n"); 468 jbd_debug(3, "JBD: commit phase 2\n");
548 469
549 /* 470 /*
550 * If we found any dirty or locked buffers, then we should have
551 * looped back up to the write_out_data label. If there weren't
552 * any then journal_clean_data_list should have wiped the list
553 * clean by now, so check that it is in fact empty.
554 */
555 J_ASSERT (commit_transaction->t_sync_datalist == NULL);
556
557 jbd_debug (3, "JBD: commit phase 3\n");
558
559 /*
560 * Way to go: we have now written out all of the data for a 471 * Way to go: we have now written out all of the data for a
561 * transaction! Now comes the tricky part: we need to write out 472 * transaction! Now comes the tricky part: we need to write out
562 * metadata. Loop over the transaction's entire buffer list: 473 * metadata. Loop over the transaction's entire buffer list:
@@ -574,6 +485,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
574 J_ASSERT(commit_transaction->t_nr_buffers <= 485 J_ASSERT(commit_transaction->t_nr_buffers <=
575 commit_transaction->t_outstanding_credits); 486 commit_transaction->t_outstanding_credits);
576 487
488 err = 0;
577 descriptor = NULL; 489 descriptor = NULL;
578 bufs = 0; 490 bufs = 0;
579 while (commit_transaction->t_buffers) { 491 while (commit_transaction->t_buffers) {
@@ -748,15 +660,19 @@ start_journal_io:
748 &cbh, crc32_sum); 660 &cbh, crc32_sum);
749 if (err) 661 if (err)
750 __jbd2_journal_abort_hard(journal); 662 __jbd2_journal_abort_hard(journal);
751
752 spin_lock(&journal->j_list_lock);
753 err = journal_wait_on_locked_list(journal,
754 commit_transaction);
755 spin_unlock(&journal->j_list_lock);
756 if (err)
757 __jbd2_journal_abort_hard(journal);
758 } 663 }
759 664
665 /*
666 * This is the right place to wait for data buffers both for ASYNC
667 * and !ASYNC commit. If commit is ASYNC, we need to wait only after
668 * the commit block went to disk (which happens above). If commit is
669 * SYNC, we need to wait for data buffers before we start writing
670 * commit block, which happens below in such setting.
671 */
672 err = journal_finish_inode_data_buffers(journal, commit_transaction);
673 if (err)
674 jbd2_journal_abort(journal, err);
675
760 /* Lo and behold: we have just managed to send a transaction to 676 /* Lo and behold: we have just managed to send a transaction to
761 the log. Before we can commit it, wait for the IO so far to 677 the log. Before we can commit it, wait for the IO so far to
762 complete. Control buffers being written are on the 678 complete. Control buffers being written are on the
@@ -768,7 +684,7 @@ start_journal_io:
768 so we incur less scheduling load. 684 so we incur less scheduling load.
769 */ 685 */
770 686
771 jbd_debug(3, "JBD: commit phase 4\n"); 687 jbd_debug(3, "JBD: commit phase 3\n");
772 688
773 /* 689 /*
774 * akpm: these are BJ_IO, and j_list_lock is not needed. 690 * akpm: these are BJ_IO, and j_list_lock is not needed.
@@ -827,7 +743,7 @@ wait_for_iobuf:
827 743
828 J_ASSERT (commit_transaction->t_shadow_list == NULL); 744 J_ASSERT (commit_transaction->t_shadow_list == NULL);
829 745
830 jbd_debug(3, "JBD: commit phase 5\n"); 746 jbd_debug(3, "JBD: commit phase 4\n");
831 747
832 /* Here we wait for the revoke record and descriptor record buffers */ 748 /* Here we wait for the revoke record and descriptor record buffers */
833 wait_for_ctlbuf: 749 wait_for_ctlbuf:
@@ -854,7 +770,7 @@ wait_for_iobuf:
854 /* AKPM: bforget here */ 770 /* AKPM: bforget here */
855 } 771 }
856 772
857 jbd_debug(3, "JBD: commit phase 6\n"); 773 jbd_debug(3, "JBD: commit phase 5\n");
858 774
859 if (!JBD2_HAS_INCOMPAT_FEATURE(journal, 775 if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
860 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { 776 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
@@ -874,9 +790,9 @@ wait_for_iobuf:
874 transaction can be removed from any checkpoint list it was on 790 transaction can be removed from any checkpoint list it was on
875 before. */ 791 before. */
876 792
877 jbd_debug(3, "JBD: commit phase 7\n"); 793 jbd_debug(3, "JBD: commit phase 6\n");
878 794
879 J_ASSERT(commit_transaction->t_sync_datalist == NULL); 795 J_ASSERT(list_empty(&commit_transaction->t_inode_list));
880 J_ASSERT(commit_transaction->t_buffers == NULL); 796 J_ASSERT(commit_transaction->t_buffers == NULL);
881 J_ASSERT(commit_transaction->t_checkpoint_list == NULL); 797 J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
882 J_ASSERT(commit_transaction->t_iobuf_list == NULL); 798 J_ASSERT(commit_transaction->t_iobuf_list == NULL);
@@ -997,7 +913,7 @@ restart_loop:
997 913
998 /* Done with this transaction! */ 914 /* Done with this transaction! */
999 915
1000 jbd_debug(3, "JBD: commit phase 8\n"); 916 jbd_debug(3, "JBD: commit phase 7\n");
1001 917
1002 J_ASSERT(commit_transaction->t_state == T_COMMIT); 918 J_ASSERT(commit_transaction->t_state == T_COMMIT);
1003 919
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 2e24567c4a79..b26c6d9fe6ae 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -50,7 +50,6 @@ EXPORT_SYMBOL(jbd2_journal_unlock_updates);
50EXPORT_SYMBOL(jbd2_journal_get_write_access); 50EXPORT_SYMBOL(jbd2_journal_get_write_access);
51EXPORT_SYMBOL(jbd2_journal_get_create_access); 51EXPORT_SYMBOL(jbd2_journal_get_create_access);
52EXPORT_SYMBOL(jbd2_journal_get_undo_access); 52EXPORT_SYMBOL(jbd2_journal_get_undo_access);
53EXPORT_SYMBOL(jbd2_journal_dirty_data);
54EXPORT_SYMBOL(jbd2_journal_dirty_metadata); 53EXPORT_SYMBOL(jbd2_journal_dirty_metadata);
55EXPORT_SYMBOL(jbd2_journal_release_buffer); 54EXPORT_SYMBOL(jbd2_journal_release_buffer);
56EXPORT_SYMBOL(jbd2_journal_forget); 55EXPORT_SYMBOL(jbd2_journal_forget);
@@ -82,6 +81,10 @@ EXPORT_SYMBOL(jbd2_journal_blocks_per_page);
82EXPORT_SYMBOL(jbd2_journal_invalidatepage); 81EXPORT_SYMBOL(jbd2_journal_invalidatepage);
83EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers); 82EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers);
84EXPORT_SYMBOL(jbd2_journal_force_commit); 83EXPORT_SYMBOL(jbd2_journal_force_commit);
84EXPORT_SYMBOL(jbd2_journal_file_inode);
85EXPORT_SYMBOL(jbd2_journal_init_jbd_inode);
86EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
87EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
85 88
86static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); 89static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
87static void __journal_abort_soft (journal_t *journal, int errno); 90static void __journal_abort_soft (journal_t *journal, int errno);
@@ -2195,6 +2198,54 @@ void jbd2_journal_put_journal_head(struct journal_head *jh)
2195} 2198}
2196 2199
2197/* 2200/*
2201 * Initialize jbd inode head
2202 */
2203void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode)
2204{
2205 jinode->i_transaction = NULL;
2206 jinode->i_next_transaction = NULL;
2207 jinode->i_vfs_inode = inode;
2208 jinode->i_flags = 0;
2209 INIT_LIST_HEAD(&jinode->i_list);
2210}
2211
2212/*
2213 * Function to be called before we start removing inode from memory (i.e.,
2214 * clear_inode() is a fine place to be called from). It removes inode from
2215 * transaction's lists.
2216 */
2217void jbd2_journal_release_jbd_inode(journal_t *journal,
2218 struct jbd2_inode *jinode)
2219{
2220 int writeout = 0;
2221
2222 if (!journal)
2223 return;
2224restart:
2225 spin_lock(&journal->j_list_lock);
2226 /* Is commit writing out inode - we have to wait */
2227 if (jinode->i_flags & JI_COMMIT_RUNNING) {
2228 wait_queue_head_t *wq;
2229 DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
2230 wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING);
2231 prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
2232 spin_unlock(&journal->j_list_lock);
2233 schedule();
2234 finish_wait(wq, &wait.wait);
2235 goto restart;
2236 }
2237
2238 /* Do we need to wait for data writeback? */
2239 if (journal->j_committing_transaction == jinode->i_transaction)
2240 writeout = 1;
2241 if (jinode->i_transaction) {
2242 list_del(&jinode->i_list);
2243 jinode->i_transaction = NULL;
2244 }
2245 spin_unlock(&journal->j_list_lock);
2246}
2247
2248/*
2198 * debugfs tunables 2249 * debugfs tunables
2199 */ 2250 */
2200#ifdef CONFIG_JBD2_DEBUG 2251#ifdef CONFIG_JBD2_DEBUG
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index d6e006e67804..4f7cadbb19fa 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -41,7 +41,6 @@ static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
41 * new transaction and we can't block without protecting against other 41 * new transaction and we can't block without protecting against other
42 * processes trying to touch the journal while it is in transition. 42 * processes trying to touch the journal while it is in transition.
43 * 43 *
44 * Called under j_state_lock
45 */ 44 */
46 45
47static transaction_t * 46static transaction_t *
@@ -52,6 +51,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
52 transaction->t_tid = journal->j_transaction_sequence++; 51 transaction->t_tid = journal->j_transaction_sequence++;
53 transaction->t_expires = jiffies + journal->j_commit_interval; 52 transaction->t_expires = jiffies + journal->j_commit_interval;
54 spin_lock_init(&transaction->t_handle_lock); 53 spin_lock_init(&transaction->t_handle_lock);
54 INIT_LIST_HEAD(&transaction->t_inode_list);
55 55
56 /* Set up the commit timer for the new transaction. */ 56 /* Set up the commit timer for the new transaction. */
57 journal->j_commit_timer.expires = round_jiffies(transaction->t_expires); 57 journal->j_commit_timer.expires = round_jiffies(transaction->t_expires);
@@ -943,183 +943,6 @@ out:
943} 943}
944 944
945/** 945/**
946 * int jbd2_journal_dirty_data() - mark a buffer as containing dirty data which
947 * needs to be flushed before we can commit the
948 * current transaction.
949 * @handle: transaction
950 * @bh: bufferhead to mark
951 *
952 * The buffer is placed on the transaction's data list and is marked as
953 * belonging to the transaction.
954 *
955 * Returns error number or 0 on success.
956 *
957 * jbd2_journal_dirty_data() can be called via page_launder->ext3_writepage
958 * by kswapd.
959 */
960int jbd2_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
961{
962 journal_t *journal = handle->h_transaction->t_journal;
963 int need_brelse = 0;
964 struct journal_head *jh;
965
966 if (is_handle_aborted(handle))
967 return 0;
968
969 jh = jbd2_journal_add_journal_head(bh);
970 JBUFFER_TRACE(jh, "entry");
971
972 /*
973 * The buffer could *already* be dirty. Writeout can start
974 * at any time.
975 */
976 jbd_debug(4, "jh: %p, tid:%d\n", jh, handle->h_transaction->t_tid);
977
978 /*
979 * What if the buffer is already part of a running transaction?
980 *
981 * There are two cases:
982 * 1) It is part of the current running transaction. Refile it,
983 * just in case we have allocated it as metadata, deallocated
984 * it, then reallocated it as data.
985 * 2) It is part of the previous, still-committing transaction.
986 * If all we want to do is to guarantee that the buffer will be
987 * written to disk before this new transaction commits, then
988 * being sure that the *previous* transaction has this same
989 * property is sufficient for us! Just leave it on its old
990 * transaction.
991 *
992 * In case (2), the buffer must not already exist as metadata
993 * --- that would violate write ordering (a transaction is free
994 * to write its data at any point, even before the previous
995 * committing transaction has committed). The caller must
996 * never, ever allow this to happen: there's nothing we can do
997 * about it in this layer.
998 */
999 jbd_lock_bh_state(bh);
1000 spin_lock(&journal->j_list_lock);
1001
1002 /* Now that we have bh_state locked, are we really still mapped? */
1003 if (!buffer_mapped(bh)) {
1004 JBUFFER_TRACE(jh, "unmapped buffer, bailing out");
1005 goto no_journal;
1006 }
1007
1008 if (jh->b_transaction) {
1009 JBUFFER_TRACE(jh, "has transaction");
1010 if (jh->b_transaction != handle->h_transaction) {
1011 JBUFFER_TRACE(jh, "belongs to older transaction");
1012 J_ASSERT_JH(jh, jh->b_transaction ==
1013 journal->j_committing_transaction);
1014
1015 /* @@@ IS THIS TRUE ? */
1016 /*
1017 * Not any more. Scenario: someone does a write()
1018 * in data=journal mode. The buffer's transaction has
1019 * moved into commit. Then someone does another
1020 * write() to the file. We do the frozen data copyout
1021 * and set b_next_transaction to point to j_running_t.
1022 * And while we're in that state, someone does a
1023 * writepage() in an attempt to pageout the same area
1024 * of the file via a shared mapping. At present that
1025 * calls jbd2_journal_dirty_data(), and we get right here.
1026 * It may be too late to journal the data. Simply
1027 * falling through to the next test will suffice: the
1028 * data will be dirty and wil be checkpointed. The
1029 * ordering comments in the next comment block still
1030 * apply.
1031 */
1032 //J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
1033
1034 /*
1035 * If we're journalling data, and this buffer was
1036 * subject to a write(), it could be metadata, forget
1037 * or shadow against the committing transaction. Now,
1038 * someone has dirtied the same darn page via a mapping
1039 * and it is being writepage()'d.
1040 * We *could* just steal the page from commit, with some
1041 * fancy locking there. Instead, we just skip it -
1042 * don't tie the page's buffers to the new transaction
1043 * at all.
1044 * Implication: if we crash before the writepage() data
1045 * is written into the filesystem, recovery will replay
1046 * the write() data.
1047 */
1048 if (jh->b_jlist != BJ_None &&
1049 jh->b_jlist != BJ_SyncData &&
1050 jh->b_jlist != BJ_Locked) {
1051 JBUFFER_TRACE(jh, "Not stealing");
1052 goto no_journal;
1053 }
1054
1055 /*
1056 * This buffer may be undergoing writeout in commit. We
1057 * can't return from here and let the caller dirty it
1058 * again because that can cause the write-out loop in
1059 * commit to never terminate.
1060 */
1061 if (buffer_dirty(bh)) {
1062 get_bh(bh);
1063 spin_unlock(&journal->j_list_lock);
1064 jbd_unlock_bh_state(bh);
1065 need_brelse = 1;
1066 sync_dirty_buffer(bh);
1067 jbd_lock_bh_state(bh);
1068 spin_lock(&journal->j_list_lock);
1069 /* Since we dropped the lock... */
1070 if (!buffer_mapped(bh)) {
1071 JBUFFER_TRACE(jh, "buffer got unmapped");
1072 goto no_journal;
1073 }
1074 /* The buffer may become locked again at any
1075 time if it is redirtied */
1076 }
1077
1078 /* journal_clean_data_list() may have got there first */
1079 if (jh->b_transaction != NULL) {
1080 JBUFFER_TRACE(jh, "unfile from commit");
1081 __jbd2_journal_temp_unlink_buffer(jh);
1082 /* It still points to the committing
1083 * transaction; move it to this one so
1084 * that the refile assert checks are
1085 * happy. */
1086 jh->b_transaction = handle->h_transaction;
1087 }
1088 /* The buffer will be refiled below */
1089
1090 }
1091 /*
1092 * Special case --- the buffer might actually have been
1093 * allocated and then immediately deallocated in the previous,
1094 * committing transaction, so might still be left on that
1095 * transaction's metadata lists.
1096 */
1097 if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) {
1098 JBUFFER_TRACE(jh, "not on correct data list: unfile");
1099 J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow);
1100 __jbd2_journal_temp_unlink_buffer(jh);
1101 jh->b_transaction = handle->h_transaction;
1102 JBUFFER_TRACE(jh, "file as data");
1103 __jbd2_journal_file_buffer(jh, handle->h_transaction,
1104 BJ_SyncData);
1105 }
1106 } else {
1107 JBUFFER_TRACE(jh, "not on a transaction");
1108 __jbd2_journal_file_buffer(jh, handle->h_transaction, BJ_SyncData);
1109 }
1110no_journal:
1111 spin_unlock(&journal->j_list_lock);
1112 jbd_unlock_bh_state(bh);
1113 if (need_brelse) {
1114 BUFFER_TRACE(bh, "brelse");
1115 __brelse(bh);
1116 }
1117 JBUFFER_TRACE(jh, "exit");
1118 jbd2_journal_put_journal_head(jh);
1119 return 0;
1120}
1121
1122/**
1123 * int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata 946 * int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata
1124 * @handle: transaction to add buffer to. 947 * @handle: transaction to add buffer to.
1125 * @bh: buffer to mark 948 * @bh: buffer to mark
@@ -1541,10 +1364,10 @@ __blist_del_buffer(struct journal_head **list, struct journal_head *jh)
1541 * Remove a buffer from the appropriate transaction list. 1364 * Remove a buffer from the appropriate transaction list.
1542 * 1365 *
1543 * Note that this function can *change* the value of 1366 * Note that this function can *change* the value of
1544 * bh->b_transaction->t_sync_datalist, t_buffers, t_forget, 1367 * bh->b_transaction->t_buffers, t_forget, t_iobuf_list, t_shadow_list,
1545 * t_iobuf_list, t_shadow_list, t_log_list or t_reserved_list. If the caller 1368 * t_log_list or t_reserved_list. If the caller is holding onto a copy of one
1546 * is holding onto a copy of one of thee pointers, it could go bad. 1369 * of these pointers, it could go bad. Generally the caller needs to re-read
1547 * Generally the caller needs to re-read the pointer from the transaction_t. 1370 * the pointer from the transaction_t.
1548 * 1371 *
1549 * Called under j_list_lock. The journal may not be locked. 1372 * Called under j_list_lock. The journal may not be locked.
1550 */ 1373 */
@@ -1566,9 +1389,6 @@ void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
1566 switch (jh->b_jlist) { 1389 switch (jh->b_jlist) {
1567 case BJ_None: 1390 case BJ_None:
1568 return; 1391 return;
1569 case BJ_SyncData:
1570 list = &transaction->t_sync_datalist;
1571 break;
1572 case BJ_Metadata: 1392 case BJ_Metadata:
1573 transaction->t_nr_buffers--; 1393 transaction->t_nr_buffers--;
1574 J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0); 1394 J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0);
@@ -1589,9 +1409,6 @@ void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
1589 case BJ_Reserved: 1409 case BJ_Reserved:
1590 list = &transaction->t_reserved_list; 1410 list = &transaction->t_reserved_list;
1591 break; 1411 break;
1592 case BJ_Locked:
1593 list = &transaction->t_locked_list;
1594 break;
1595 } 1412 }
1596 1413
1597 __blist_del_buffer(list, jh); 1414 __blist_del_buffer(list, jh);
@@ -1634,15 +1451,7 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
1634 goto out; 1451 goto out;
1635 1452
1636 spin_lock(&journal->j_list_lock); 1453 spin_lock(&journal->j_list_lock);
1637 if (jh->b_transaction != NULL && jh->b_cp_transaction == NULL) { 1454 if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
1638 if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) {
1639 /* A written-back ordered data buffer */
1640 JBUFFER_TRACE(jh, "release data");
1641 __jbd2_journal_unfile_buffer(jh);
1642 jbd2_journal_remove_journal_head(bh);
1643 __brelse(bh);
1644 }
1645 } else if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
1646 /* written-back checkpointed metadata buffer */ 1455 /* written-back checkpointed metadata buffer */
1647 if (jh->b_jlist == BJ_None) { 1456 if (jh->b_jlist == BJ_None) {
1648 JBUFFER_TRACE(jh, "remove from checkpoint list"); 1457 JBUFFER_TRACE(jh, "remove from checkpoint list");
@@ -1656,12 +1465,43 @@ out:
1656 return; 1465 return;
1657} 1466}
1658 1467
1468/*
1469 * jbd2_journal_try_to_free_buffers() could race with
1470 * jbd2_journal_commit_transaction(). The latter might still hold the
1471 * reference count to the buffers when inspecting them on
1472 * t_syncdata_list or t_locked_list.
1473 *
1474 * jbd2_journal_try_to_free_buffers() will call this function to
1475 * wait for the current transaction to finish syncing data buffers, before
1476 * trying to free that buffer.
1477 *
1478 * Called without journal->j_state_lock held; the lock is taken internally.
1479 */
1480static void jbd2_journal_wait_for_transaction_sync_data(journal_t *journal)
1481{
1482 transaction_t *transaction;
1483 tid_t tid;
1484
1485 spin_lock(&journal->j_state_lock);
1486 transaction = journal->j_committing_transaction;
1487
1488 if (!transaction) {
1489 spin_unlock(&journal->j_state_lock);
1490 return;
1491 }
1492
1493 tid = transaction->t_tid;
1494 spin_unlock(&journal->j_state_lock);
1495 jbd2_log_wait_commit(journal, tid);
1496}
1659 1497
1660/** 1498/**
1661 * int jbd2_journal_try_to_free_buffers() - try to free page buffers. 1499 * int jbd2_journal_try_to_free_buffers() - try to free page buffers.
1662 * @journal: journal for operation 1500 * @journal: journal for operation
1663 * @page: to try and free 1501 * @page: to try and free
1664 * @unused_gfp_mask: unused 1502 * @gfp_mask: we use the mask to detect how hard we should try to release
1503 * buffers. If __GFP_WAIT and __GFP_FS are set, we wait for commit code to
1504 * release the buffers.
1665 * 1505 *
1666 * 1506 *
1667 * For all the buffers on this page, 1507 * For all the buffers on this page,
@@ -1690,9 +1530,11 @@ out:
1690 * journal_try_to_free_buffer() is changing its state. But that 1530 * journal_try_to_free_buffer() is changing its state. But that
1691 * cannot happen because we never reallocate freed data as metadata 1531 * cannot happen because we never reallocate freed data as metadata
1692 * while the data is part of a transaction. Yes? 1532 * while the data is part of a transaction. Yes?
1533 *
1534 * Return 0 on failure, 1 on success
1693 */ 1535 */
1694int jbd2_journal_try_to_free_buffers(journal_t *journal, 1536int jbd2_journal_try_to_free_buffers(journal_t *journal,
1695 struct page *page, gfp_t unused_gfp_mask) 1537 struct page *page, gfp_t gfp_mask)
1696{ 1538{
1697 struct buffer_head *head; 1539 struct buffer_head *head;
1698 struct buffer_head *bh; 1540 struct buffer_head *bh;
@@ -1708,7 +1550,8 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal,
1708 /* 1550 /*
1709 * We take our own ref against the journal_head here to avoid 1551 * We take our own ref against the journal_head here to avoid
1710 * having to add tons of locking around each instance of 1552 * having to add tons of locking around each instance of
1711 * jbd2_journal_remove_journal_head() and jbd2_journal_put_journal_head(). 1553 * jbd2_journal_remove_journal_head() and
1554 * jbd2_journal_put_journal_head().
1712 */ 1555 */
1713 jh = jbd2_journal_grab_journal_head(bh); 1556 jh = jbd2_journal_grab_journal_head(bh);
1714 if (!jh) 1557 if (!jh)
@@ -1721,7 +1564,28 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal,
1721 if (buffer_jbd(bh)) 1564 if (buffer_jbd(bh))
1722 goto busy; 1565 goto busy;
1723 } while ((bh = bh->b_this_page) != head); 1566 } while ((bh = bh->b_this_page) != head);
1567
1724 ret = try_to_free_buffers(page); 1568 ret = try_to_free_buffers(page);
1569
1570 /*
1571 * There are a number of places where jbd2_journal_try_to_free_buffers()
1572 * could race with jbd2_journal_commit_transaction(); the latter still
1573 * holds a reference to the buffers to free while processing them, so
1574 * try_to_free_buffers() fails to free those buffers. Some callers of
1575 * releasepage() require the page buffers to be dropped and otherwise
1576 * treat the failure to free as an error (such as generic_file_direct_IO()).
1577 *
1578 * So, if the caller of try_to_release_page() wants the synchronous
1579 * behaviour(i.e make sure buffers are dropped upon return),
1580 * let's wait for the current transaction to finish flush of
1581 * dirty data buffers, then try to free those buffers again,
1582 * with the journal locked.
1583 */
1584 if (ret == 0 && (gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS)) {
1585 jbd2_journal_wait_for_transaction_sync_data(journal);
1586 ret = try_to_free_buffers(page);
1587 }
1588
1725busy: 1589busy:
1726 return ret; 1590 return ret;
1727} 1591}
@@ -1823,6 +1687,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
1823 if (!buffer_jbd(bh)) 1687 if (!buffer_jbd(bh))
1824 goto zap_buffer_unlocked; 1688 goto zap_buffer_unlocked;
1825 1689
1690 /* OK, we have data buffer in journaled mode */
1826 spin_lock(&journal->j_state_lock); 1691 spin_lock(&journal->j_state_lock);
1827 jbd_lock_bh_state(bh); 1692 jbd_lock_bh_state(bh);
1828 spin_lock(&journal->j_list_lock); 1693 spin_lock(&journal->j_list_lock);
@@ -1886,15 +1751,6 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
1886 } 1751 }
1887 } else if (transaction == journal->j_committing_transaction) { 1752 } else if (transaction == journal->j_committing_transaction) {
1888 JBUFFER_TRACE(jh, "on committing transaction"); 1753 JBUFFER_TRACE(jh, "on committing transaction");
1889 if (jh->b_jlist == BJ_Locked) {
1890 /*
1891 * The buffer is on the committing transaction's locked
1892 * list. We have the buffer locked, so I/O has
1893 * completed. So we can nail the buffer now.
1894 */
1895 may_free = __dispose_buffer(jh, transaction);
1896 goto zap_buffer;
1897 }
1898 /* 1754 /*
1899 * If it is committing, we simply cannot touch it. We 1755 * If it is committing, we simply cannot touch it. We
1900 * can remove it's next_transaction pointer from the 1756 * can remove it's next_transaction pointer from the
@@ -2027,9 +1883,6 @@ void __jbd2_journal_file_buffer(struct journal_head *jh,
2027 J_ASSERT_JH(jh, !jh->b_committed_data); 1883 J_ASSERT_JH(jh, !jh->b_committed_data);
2028 J_ASSERT_JH(jh, !jh->b_frozen_data); 1884 J_ASSERT_JH(jh, !jh->b_frozen_data);
2029 return; 1885 return;
2030 case BJ_SyncData:
2031 list = &transaction->t_sync_datalist;
2032 break;
2033 case BJ_Metadata: 1886 case BJ_Metadata:
2034 transaction->t_nr_buffers++; 1887 transaction->t_nr_buffers++;
2035 list = &transaction->t_buffers; 1888 list = &transaction->t_buffers;
@@ -2049,9 +1902,6 @@ void __jbd2_journal_file_buffer(struct journal_head *jh,
2049 case BJ_Reserved: 1902 case BJ_Reserved:
2050 list = &transaction->t_reserved_list; 1903 list = &transaction->t_reserved_list;
2051 break; 1904 break;
2052 case BJ_Locked:
2053 list = &transaction->t_locked_list;
2054 break;
2055 } 1905 }
2056 1906
2057 __blist_add_buffer(list, jh); 1907 __blist_add_buffer(list, jh);
@@ -2141,3 +1991,88 @@ void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh)
2141 spin_unlock(&journal->j_list_lock); 1991 spin_unlock(&journal->j_list_lock);
2142 __brelse(bh); 1992 __brelse(bh);
2143} 1993}
1994
1995/*
1996 * File inode in the inode list of the handle's transaction
1997 */
1998int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode)
1999{
2000 transaction_t *transaction = handle->h_transaction;
2001 journal_t *journal = transaction->t_journal;
2002
2003 if (is_handle_aborted(handle))
2004 return -EIO;
2005
2006 jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino,
2007 transaction->t_tid);
2008
2009 /*
2010 * First check whether inode isn't already on the transaction's
2011 * lists without taking the lock. Note that this check is safe
2012 * without the lock as we cannot race with somebody removing inode
2013 * from the transaction. The reason is that we remove inode from the
2014 * transaction only in jbd2_journal_release_jbd_inode() and when we commit
2015 * the transaction. We are guarded from the first case by holding
2016 * a reference to the inode. We are safe against the second case
2017 * because if jinode->i_transaction == transaction, commit code
2018 * cannot touch the transaction because we hold reference to it,
2019 * and if jinode->i_next_transaction == transaction, commit code
2020 * will only file the inode where we want it.
2021 */
2022 if (jinode->i_transaction == transaction ||
2023 jinode->i_next_transaction == transaction)
2024 return 0;
2025
2026 spin_lock(&journal->j_list_lock);
2027
2028 if (jinode->i_transaction == transaction ||
2029 jinode->i_next_transaction == transaction)
2030 goto done;
2031
2032 /* On some different transaction's list - should be
2033 * the committing one */
2034 if (jinode->i_transaction) {
2035 J_ASSERT(jinode->i_next_transaction == NULL);
2036 J_ASSERT(jinode->i_transaction ==
2037 journal->j_committing_transaction);
2038 jinode->i_next_transaction = transaction;
2039 goto done;
2040 }
2041 /* Not on any transaction list... */
2042 J_ASSERT(!jinode->i_next_transaction);
2043 jinode->i_transaction = transaction;
2044 list_add(&jinode->i_list, &transaction->t_inode_list);
2045done:
2046 spin_unlock(&journal->j_list_lock);
2047
2048 return 0;
2049}
2050
2051/*
2052 * This function must be called when inode is journaled in ordered mode
2053 * before truncation happens. It starts writeout of truncated part in
2054 * case it is in the committing transaction so that we keep the ordered
2055 * mode consistency guarantees.
2056 */
2057int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode,
2058 loff_t new_size)
2059{
2060 journal_t *journal;
2061 transaction_t *commit_trans;
2062 int ret = 0;
2063
2064 if (!inode->i_transaction && !inode->i_next_transaction)
2065 goto out;
2066 journal = inode->i_transaction->t_journal;
2067 spin_lock(&journal->j_state_lock);
2068 commit_trans = journal->j_committing_transaction;
2069 spin_unlock(&journal->j_state_lock);
2070 if (inode->i_transaction == commit_trans) {
2071 ret = filemap_fdatawrite_range(inode->i_vfs_inode->i_mapping,
2072 new_size, LLONG_MAX);
2073 if (ret)
2074 jbd2_journal_abort(journal, ret);
2075 }
2076out:
2077 return ret;
2078}
diff --git a/fs/jfs/jfs_debug.c b/fs/jfs/jfs_debug.c
index bf6ab19b86ee..6a73de84bcef 100644
--- a/fs/jfs/jfs_debug.c
+++ b/fs/jfs/jfs_debug.c
@@ -21,6 +21,7 @@
21#include <linux/ctype.h> 21#include <linux/ctype.h>
22#include <linux/module.h> 22#include <linux/module.h>
23#include <linux/proc_fs.h> 23#include <linux/proc_fs.h>
24#include <linux/seq_file.h>
24#include <asm/uaccess.h> 25#include <asm/uaccess.h>
25#include "jfs_incore.h" 26#include "jfs_incore.h"
26#include "jfs_filsys.h" 27#include "jfs_filsys.h"
@@ -30,29 +31,19 @@
30 31
31static struct proc_dir_entry *base; 32static struct proc_dir_entry *base;
32#ifdef CONFIG_JFS_DEBUG 33#ifdef CONFIG_JFS_DEBUG
33static int loglevel_read(char *page, char **start, off_t off, 34static int jfs_loglevel_proc_show(struct seq_file *m, void *v)
34 int count, int *eof, void *data)
35{ 35{
36 int len; 36 seq_printf(m, "%d\n", jfsloglevel);
37 37 return 0;
38 len = sprintf(page, "%d\n", jfsloglevel); 38}
39
40 len -= off;
41 *start = page + off;
42
43 if (len > count)
44 len = count;
45 else
46 *eof = 1;
47
48 if (len < 0)
49 len = 0;
50 39
51 return len; 40static int jfs_loglevel_proc_open(struct inode *inode, struct file *file)
41{
42 return single_open(file, jfs_loglevel_proc_show, NULL);
52} 43}
53 44
54static int loglevel_write(struct file *file, const char __user *buffer, 45static ssize_t jfs_loglevel_proc_write(struct file *file,
55 unsigned long count, void *data) 46 const char __user *buffer, size_t count, loff_t *ppos)
56{ 47{
57 char c; 48 char c;
58 49
@@ -65,22 +56,30 @@ static int loglevel_write(struct file *file, const char __user *buffer,
65 jfsloglevel = c - '0'; 56 jfsloglevel = c - '0';
66 return count; 57 return count;
67} 58}
59
60static const struct file_operations jfs_loglevel_proc_fops = {
61 .owner = THIS_MODULE,
62 .open = jfs_loglevel_proc_open,
63 .read = seq_read,
64 .llseek = seq_lseek,
65 .release = single_release,
66 .write = jfs_loglevel_proc_write,
67};
68#endif 68#endif
69 69
70static struct { 70static struct {
71 const char *name; 71 const char *name;
72 read_proc_t *read_fn; 72 const struct file_operations *proc_fops;
73 write_proc_t *write_fn;
74} Entries[] = { 73} Entries[] = {
75#ifdef CONFIG_JFS_STATISTICS 74#ifdef CONFIG_JFS_STATISTICS
76 { "lmstats", jfs_lmstats_read, }, 75 { "lmstats", &jfs_lmstats_proc_fops, },
77 { "txstats", jfs_txstats_read, }, 76 { "txstats", &jfs_txstats_proc_fops, },
78 { "xtstat", jfs_xtstat_read, }, 77 { "xtstat", &jfs_xtstat_proc_fops, },
79 { "mpstat", jfs_mpstat_read, }, 78 { "mpstat", &jfs_mpstat_proc_fops, },
80#endif 79#endif
81#ifdef CONFIG_JFS_DEBUG 80#ifdef CONFIG_JFS_DEBUG
82 { "TxAnchor", jfs_txanchor_read, }, 81 { "TxAnchor", &jfs_txanchor_proc_fops, },
83 { "loglevel", loglevel_read, loglevel_write } 82 { "loglevel", &jfs_loglevel_proc_fops }
84#endif 83#endif
85}; 84};
86#define NPROCENT ARRAY_SIZE(Entries) 85#define NPROCENT ARRAY_SIZE(Entries)
@@ -93,13 +92,8 @@ void jfs_proc_init(void)
93 return; 92 return;
94 base->owner = THIS_MODULE; 93 base->owner = THIS_MODULE;
95 94
96 for (i = 0; i < NPROCENT; i++) { 95 for (i = 0; i < NPROCENT; i++)
97 struct proc_dir_entry *p; 96 proc_create(Entries[i].name, 0, base, Entries[i].proc_fops);
98 if ((p = create_proc_entry(Entries[i].name, 0, base))) {
99 p->read_proc = Entries[i].read_fn;
100 p->write_proc = Entries[i].write_fn;
101 }
102 }
103} 97}
104 98
105void jfs_proc_clean(void) 99void jfs_proc_clean(void)
diff --git a/fs/jfs/jfs_debug.h b/fs/jfs/jfs_debug.h
index 044c1e654cc0..eafd1300a00b 100644
--- a/fs/jfs/jfs_debug.h
+++ b/fs/jfs/jfs_debug.h
@@ -62,7 +62,7 @@ extern void jfs_proc_clean(void);
62 62
63extern int jfsloglevel; 63extern int jfsloglevel;
64 64
65extern int jfs_txanchor_read(char *, char **, off_t, int, int *, void *); 65extern const struct file_operations jfs_txanchor_proc_fops;
66 66
67/* information message: e.g., configuration, major event */ 67/* information message: e.g., configuration, major event */
68#define jfs_info(fmt, arg...) do { \ 68#define jfs_info(fmt, arg...) do { \
@@ -105,10 +105,10 @@ extern int jfs_txanchor_read(char *, char **, off_t, int, int *, void *);
105 * ---------- 105 * ----------
106 */ 106 */
107#ifdef CONFIG_JFS_STATISTICS 107#ifdef CONFIG_JFS_STATISTICS
108extern int jfs_lmstats_read(char *, char **, off_t, int, int *, void *); 108extern const struct file_operations jfs_lmstats_proc_fops;
109extern int jfs_txstats_read(char *, char **, off_t, int, int *, void *); 109extern const struct file_operations jfs_txstats_proc_fops;
110extern int jfs_mpstat_read(char *, char **, off_t, int, int *, void *); 110extern const struct file_operations jfs_mpstat_proc_fops;
111extern int jfs_xtstat_read(char *, char **, off_t, int, int *, void *); 111extern const struct file_operations jfs_xtstat_proc_fops;
112 112
113#define INCREMENT(x) ((x)++) 113#define INCREMENT(x) ((x)++)
114#define DECREMENT(x) ((x)--) 114#define DECREMENT(x) ((x)--)
diff --git a/fs/jfs/jfs_dtree.h b/fs/jfs/jfs_dtree.h
index cdac2d5bafeb..2545bb317235 100644
--- a/fs/jfs/jfs_dtree.h
+++ b/fs/jfs/jfs_dtree.h
@@ -243,9 +243,6 @@ typedef union {
243#define JFS_REMOVE 3 243#define JFS_REMOVE 3
244#define JFS_RENAME 4 244#define JFS_RENAME 4
245 245
246#define DIRENTSIZ(namlen) \
247 ( (sizeof(struct dirent) - 2*(JFS_NAME_MAX+1) + 2*((namlen)+1) + 3) &~ 3 )
248
249/* 246/*
250 * Maximum file offset for directories. 247 * Maximum file offset for directories.
251 */ 248 */
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index 734ec916beaf..d6363d8309d0 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -1520,7 +1520,7 @@ int diAlloc(struct inode *pip, bool dir, struct inode *ip)
1520 jfs_error(ip->i_sb, 1520 jfs_error(ip->i_sb,
1521 "diAlloc: can't find free bit " 1521 "diAlloc: can't find free bit "
1522 "in wmap"); 1522 "in wmap");
1523 return EIO; 1523 return -EIO;
1524 } 1524 }
1525 1525
1526 /* determine the inode number within the 1526 /* determine the inode number within the
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index 325a9679b95a..cd2ec2988b59 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -69,6 +69,7 @@
69#include <linux/freezer.h> 69#include <linux/freezer.h>
70#include <linux/delay.h> 70#include <linux/delay.h>
71#include <linux/mutex.h> 71#include <linux/mutex.h>
72#include <linux/seq_file.h>
72#include "jfs_incore.h" 73#include "jfs_incore.h"
73#include "jfs_filsys.h" 74#include "jfs_filsys.h"
74#include "jfs_metapage.h" 75#include "jfs_metapage.h"
@@ -2503,13 +2504,9 @@ exit:
2503} 2504}
2504 2505
2505#ifdef CONFIG_JFS_STATISTICS 2506#ifdef CONFIG_JFS_STATISTICS
2506int jfs_lmstats_read(char *buffer, char **start, off_t offset, int length, 2507static int jfs_lmstats_proc_show(struct seq_file *m, void *v)
2507 int *eof, void *data)
2508{ 2508{
2509 int len = 0; 2509 seq_printf(m,
2510 off_t begin;
2511
2512 len += sprintf(buffer,
2513 "JFS Logmgr stats\n" 2510 "JFS Logmgr stats\n"
2514 "================\n" 2511 "================\n"
2515 "commits = %d\n" 2512 "commits = %d\n"
@@ -2522,19 +2519,19 @@ int jfs_lmstats_read(char *buffer, char **start, off_t offset, int length,
2522 lmStat.pagedone, 2519 lmStat.pagedone,
2523 lmStat.full_page, 2520 lmStat.full_page,
2524 lmStat.partial_page); 2521 lmStat.partial_page);
2522 return 0;
2523}
2525 2524
2526 begin = offset; 2525static int jfs_lmstats_proc_open(struct inode *inode, struct file *file)
2527 *start = buffer + begin; 2526{
2528 len -= begin; 2527 return single_open(file, jfs_lmstats_proc_show, NULL);
2529
2530 if (len > length)
2531 len = length;
2532 else
2533 *eof = 1;
2534
2535 if (len < 0)
2536 len = 0;
2537
2538 return len;
2539} 2528}
2529
2530const struct file_operations jfs_lmstats_proc_fops = {
2531 .owner = THIS_MODULE,
2532 .open = jfs_lmstats_proc_open,
2533 .read = seq_read,
2534 .llseek = seq_lseek,
2535 .release = single_release,
2536};
2540#endif /* CONFIG_JFS_STATISTICS */ 2537#endif /* CONFIG_JFS_STATISTICS */
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index d1e64f2f2fcd..854ff0ec574f 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -19,10 +19,12 @@
19 19
20#include <linux/fs.h> 20#include <linux/fs.h>
21#include <linux/mm.h> 21#include <linux/mm.h>
22#include <linux/module.h>
22#include <linux/bio.h> 23#include <linux/bio.h>
23#include <linux/init.h> 24#include <linux/init.h>
24#include <linux/buffer_head.h> 25#include <linux/buffer_head.h>
25#include <linux/mempool.h> 26#include <linux/mempool.h>
27#include <linux/seq_file.h>
26#include "jfs_incore.h" 28#include "jfs_incore.h"
27#include "jfs_superblock.h" 29#include "jfs_superblock.h"
28#include "jfs_filsys.h" 30#include "jfs_filsys.h"
@@ -804,13 +806,9 @@ void __invalidate_metapages(struct inode *ip, s64 addr, int len)
804} 806}
805 807
806#ifdef CONFIG_JFS_STATISTICS 808#ifdef CONFIG_JFS_STATISTICS
807int jfs_mpstat_read(char *buffer, char **start, off_t offset, int length, 809static int jfs_mpstat_proc_show(struct seq_file *m, void *v)
808 int *eof, void *data)
809{ 810{
810 int len = 0; 811 seq_printf(m,
811 off_t begin;
812
813 len += sprintf(buffer,
814 "JFS Metapage statistics\n" 812 "JFS Metapage statistics\n"
815 "=======================\n" 813 "=======================\n"
816 "page allocations = %d\n" 814 "page allocations = %d\n"
@@ -819,19 +817,19 @@ int jfs_mpstat_read(char *buffer, char **start, off_t offset, int length,
819 mpStat.pagealloc, 817 mpStat.pagealloc,
820 mpStat.pagefree, 818 mpStat.pagefree,
821 mpStat.lockwait); 819 mpStat.lockwait);
820 return 0;
821}
822 822
823 begin = offset; 823static int jfs_mpstat_proc_open(struct inode *inode, struct file *file)
824 *start = buffer + begin; 824{
825 len -= begin; 825 return single_open(file, jfs_mpstat_proc_show, NULL);
826
827 if (len > length)
828 len = length;
829 else
830 *eof = 1;
831
832 if (len < 0)
833 len = 0;
834
835 return len;
836} 826}
827
828const struct file_operations jfs_mpstat_proc_fops = {
829 .owner = THIS_MODULE,
830 .open = jfs_mpstat_proc_open,
831 .read = seq_read,
832 .llseek = seq_lseek,
833 .release = single_release,
834};
837#endif 835#endif
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index e7c60ae6b5b2..f26e4d03ada5 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -49,6 +49,7 @@
49#include <linux/module.h> 49#include <linux/module.h>
50#include <linux/moduleparam.h> 50#include <linux/moduleparam.h>
51#include <linux/kthread.h> 51#include <linux/kthread.h>
52#include <linux/seq_file.h>
52#include "jfs_incore.h" 53#include "jfs_incore.h"
53#include "jfs_inode.h" 54#include "jfs_inode.h"
54#include "jfs_filsys.h" 55#include "jfs_filsys.h"
@@ -3009,11 +3010,8 @@ int jfs_sync(void *arg)
3009} 3010}
3010 3011
3011#if defined(CONFIG_PROC_FS) && defined(CONFIG_JFS_DEBUG) 3012#if defined(CONFIG_PROC_FS) && defined(CONFIG_JFS_DEBUG)
3012int jfs_txanchor_read(char *buffer, char **start, off_t offset, int length, 3013static int jfs_txanchor_proc_show(struct seq_file *m, void *v)
3013 int *eof, void *data)
3014{ 3014{
3015 int len = 0;
3016 off_t begin;
3017 char *freewait; 3015 char *freewait;
3018 char *freelockwait; 3016 char *freelockwait;
3019 char *lowlockwait; 3017 char *lowlockwait;
@@ -3025,7 +3023,7 @@ int jfs_txanchor_read(char *buffer, char **start, off_t offset, int length,
3025 lowlockwait = 3023 lowlockwait =
3026 waitqueue_active(&TxAnchor.lowlockwait) ? "active" : "empty"; 3024 waitqueue_active(&TxAnchor.lowlockwait) ? "active" : "empty";
3027 3025
3028 len += sprintf(buffer, 3026 seq_printf(m,
3029 "JFS TxAnchor\n" 3027 "JFS TxAnchor\n"
3030 "============\n" 3028 "============\n"
3031 "freetid = %d\n" 3029 "freetid = %d\n"
@@ -3044,31 +3042,27 @@ int jfs_txanchor_read(char *buffer, char **start, off_t offset, int length,
3044 TxAnchor.tlocksInUse, 3042 TxAnchor.tlocksInUse,
3045 jfs_tlocks_low, 3043 jfs_tlocks_low,
3046 list_empty(&TxAnchor.unlock_queue) ? "" : "not "); 3044 list_empty(&TxAnchor.unlock_queue) ? "" : "not ");
3045 return 0;
3046}
3047 3047
3048 begin = offset; 3048static int jfs_txanchor_proc_open(struct inode *inode, struct file *file)
3049 *start = buffer + begin; 3049{
3050 len -= begin; 3050 return single_open(file, jfs_txanchor_proc_show, NULL);
3051
3052 if (len > length)
3053 len = length;
3054 else
3055 *eof = 1;
3056
3057 if (len < 0)
3058 len = 0;
3059
3060 return len;
3061} 3051}
3052
3053const struct file_operations jfs_txanchor_proc_fops = {
3054 .owner = THIS_MODULE,
3055 .open = jfs_txanchor_proc_open,
3056 .read = seq_read,
3057 .llseek = seq_lseek,
3058 .release = single_release,
3059};
3062#endif 3060#endif
3063 3061
3064#if defined(CONFIG_PROC_FS) && defined(CONFIG_JFS_STATISTICS) 3062#if defined(CONFIG_PROC_FS) && defined(CONFIG_JFS_STATISTICS)
3065int jfs_txstats_read(char *buffer, char **start, off_t offset, int length, 3063static int jfs_txstats_proc_show(struct seq_file *m, void *v)
3066 int *eof, void *data)
3067{ 3064{
3068 int len = 0; 3065 seq_printf(m,
3069 off_t begin;
3070
3071 len += sprintf(buffer,
3072 "JFS TxStats\n" 3066 "JFS TxStats\n"
3073 "===========\n" 3067 "===========\n"
3074 "calls to txBegin = %d\n" 3068 "calls to txBegin = %d\n"
@@ -3089,19 +3083,19 @@ int jfs_txstats_read(char *buffer, char **start, off_t offset, int length,
3089 TxStat.txBeginAnon_lockslow, 3083 TxStat.txBeginAnon_lockslow,
3090 TxStat.txLockAlloc, 3084 TxStat.txLockAlloc,
3091 TxStat.txLockAlloc_freelock); 3085 TxStat.txLockAlloc_freelock);
3086 return 0;
3087}
3092 3088
3093 begin = offset; 3089static int jfs_txstats_proc_open(struct inode *inode, struct file *file)
3094 *start = buffer + begin; 3090{
3095 len -= begin; 3091 return single_open(file, jfs_txstats_proc_show, NULL);
3096
3097 if (len > length)
3098 len = length;
3099 else
3100 *eof = 1;
3101
3102 if (len < 0)
3103 len = 0;
3104
3105 return len;
3106} 3092}
3093
3094const struct file_operations jfs_txstats_proc_fops = {
3095 .owner = THIS_MODULE,
3096 .open = jfs_txstats_proc_open,
3097 .read = seq_read,
3098 .llseek = seq_lseek,
3099 .release = single_release,
3100};
3107#endif 3101#endif
diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c
index 5a61ebf2cbcc..ae3acafb447b 100644
--- a/fs/jfs/jfs_xtree.c
+++ b/fs/jfs/jfs_xtree.c
@@ -20,7 +20,9 @@
20 */ 20 */
21 21
22#include <linux/fs.h> 22#include <linux/fs.h>
23#include <linux/module.h>
23#include <linux/quotaops.h> 24#include <linux/quotaops.h>
25#include <linux/seq_file.h>
24#include "jfs_incore.h" 26#include "jfs_incore.h"
25#include "jfs_filsys.h" 27#include "jfs_filsys.h"
26#include "jfs_metapage.h" 28#include "jfs_metapage.h"
@@ -4134,13 +4136,9 @@ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size)
4134} 4136}
4135 4137
4136#ifdef CONFIG_JFS_STATISTICS 4138#ifdef CONFIG_JFS_STATISTICS
4137int jfs_xtstat_read(char *buffer, char **start, off_t offset, int length, 4139static int jfs_xtstat_proc_show(struct seq_file *m, void *v)
4138 int *eof, void *data)
4139{ 4140{
4140 int len = 0; 4141 seq_printf(m,
4141 off_t begin;
4142
4143 len += sprintf(buffer,
4144 "JFS Xtree statistics\n" 4142 "JFS Xtree statistics\n"
4145 "====================\n" 4143 "====================\n"
4146 "searches = %d\n" 4144 "searches = %d\n"
@@ -4149,19 +4147,19 @@ int jfs_xtstat_read(char *buffer, char **start, off_t offset, int length,
4149 xtStat.search, 4147 xtStat.search,
4150 xtStat.fastSearch, 4148 xtStat.fastSearch,
4151 xtStat.split); 4149 xtStat.split);
4150 return 0;
4151}
4152 4152
4153 begin = offset; 4153static int jfs_xtstat_proc_open(struct inode *inode, struct file *file)
4154 *start = buffer + begin; 4154{
4155 len -= begin; 4155 return single_open(file, jfs_xtstat_proc_show, NULL);
4156
4157 if (len > length)
4158 len = length;
4159 else
4160 *eof = 1;
4161
4162 if (len < 0)
4163 len = 0;
4164
4165 return len;
4166} 4156}
4157
4158const struct file_operations jfs_xtstat_proc_fops = {
4159 .owner = THIS_MODULE,
4160 .open = jfs_xtstat_proc_open,
4161 .read = seq_read,
4162 .llseek = seq_lseek,
4163 .release = single_release,
4164};
4167#endif 4165#endif
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 0ba6778edaa2..2aba82386810 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -1455,7 +1455,7 @@ static struct dentry *jfs_lookup(struct inode *dip, struct dentry *dentry, struc
1455 free_UCSname(&key); 1455 free_UCSname(&key);
1456 if (rc == -ENOENT) { 1456 if (rc == -ENOENT) {
1457 d_add(dentry, NULL); 1457 d_add(dentry, NULL);
1458 return ERR_PTR(0); 1458 return NULL;
1459 } else if (rc) { 1459 } else if (rc) {
1460 jfs_err("jfs_lookup: dtSearch returned %d", rc); 1460 jfs_err("jfs_lookup: dtSearch returned %d", rc);
1461 return ERR_PTR(rc); 1461 return ERR_PTR(rc);
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 50ea65451732..0288e6d7936a 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -499,7 +499,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
499 inode = jfs_iget(sb, ROOT_I); 499 inode = jfs_iget(sb, ROOT_I);
500 if (IS_ERR(inode)) { 500 if (IS_ERR(inode)) {
501 ret = PTR_ERR(inode); 501 ret = PTR_ERR(inode);
502 goto out_no_root; 502 goto out_no_rw;
503 } 503 }
504 sb->s_root = d_alloc_root(inode); 504 sb->s_root = d_alloc_root(inode);
505 if (!sb->s_root) 505 if (!sb->s_root)
@@ -521,9 +521,8 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
521 return 0; 521 return 0;
522 522
523out_no_root: 523out_no_root:
524 jfs_err("jfs_read_super: get root inode failed"); 524 jfs_err("jfs_read_super: get root dentry failed");
525 if (inode) 525 iput(inode);
526 iput(inode);
527 526
528out_no_rw: 527out_no_rw:
529 rc = jfs_umount(sb); 528 rc = jfs_umount(sb);
diff --git a/fs/mpage.c b/fs/mpage.c
index 235e4d3873a8..dbcc7af76a15 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -82,7 +82,7 @@ static void mpage_end_io_write(struct bio *bio, int err)
82 bio_put(bio); 82 bio_put(bio);
83} 83}
84 84
85static struct bio *mpage_bio_submit(int rw, struct bio *bio) 85struct bio *mpage_bio_submit(int rw, struct bio *bio)
86{ 86{
87 bio->bi_end_io = mpage_end_io_read; 87 bio->bi_end_io = mpage_end_io_read;
88 if (rw == WRITE) 88 if (rw == WRITE)
@@ -90,6 +90,7 @@ static struct bio *mpage_bio_submit(int rw, struct bio *bio)
90 submit_bio(rw, bio); 90 submit_bio(rw, bio);
91 return NULL; 91 return NULL;
92} 92}
93EXPORT_SYMBOL(mpage_bio_submit);
93 94
94static struct bio * 95static struct bio *
95mpage_alloc(struct block_device *bdev, 96mpage_alloc(struct block_device *bdev,
@@ -435,15 +436,9 @@ EXPORT_SYMBOL(mpage_readpage);
435 * written, so it can intelligently allocate a suitably-sized BIO. For now, 436 * written, so it can intelligently allocate a suitably-sized BIO. For now,
436 * just allocate full-size (16-page) BIOs. 437 * just allocate full-size (16-page) BIOs.
437 */ 438 */
438struct mpage_data {
439 struct bio *bio;
440 sector_t last_block_in_bio;
441 get_block_t *get_block;
442 unsigned use_writepage;
443};
444 439
445static int __mpage_writepage(struct page *page, struct writeback_control *wbc, 440int __mpage_writepage(struct page *page, struct writeback_control *wbc,
446 void *data) 441 void *data)
447{ 442{
448 struct mpage_data *mpd = data; 443 struct mpage_data *mpd = data;
449 struct bio *bio = mpd->bio; 444 struct bio *bio = mpd->bio;
@@ -651,6 +646,7 @@ out:
651 mpd->bio = bio; 646 mpd->bio = bio;
652 return ret; 647 return ret;
653} 648}
649EXPORT_SYMBOL(__mpage_writepage);
654 650
655/** 651/**
656 * mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them 652 * mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them
diff --git a/fs/msdos/namei.c b/fs/msdos/namei.c
index 05ff4f1d7026..1f7f2956412a 100644
--- a/fs/msdos/namei.c
+++ b/fs/msdos/namei.c
@@ -214,7 +214,7 @@ static struct dentry *msdos_lookup(struct inode *dir, struct dentry *dentry,
214 214
215 dentry->d_op = &msdos_dentry_operations; 215 dentry->d_op = &msdos_dentry_operations;
216 216
217 lock_kernel(); 217 lock_super(sb);
218 res = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo); 218 res = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo);
219 if (res == -ENOENT) 219 if (res == -ENOENT)
220 goto add; 220 goto add;
@@ -232,7 +232,7 @@ add:
232 if (dentry) 232 if (dentry)
233 dentry->d_op = &msdos_dentry_operations; 233 dentry->d_op = &msdos_dentry_operations;
234out: 234out:
235 unlock_kernel(); 235 unlock_super(sb);
236 if (!res) 236 if (!res)
237 return dentry; 237 return dentry;
238 return ERR_PTR(res); 238 return ERR_PTR(res);
@@ -286,7 +286,7 @@ static int msdos_create(struct inode *dir, struct dentry *dentry, int mode,
286 unsigned char msdos_name[MSDOS_NAME]; 286 unsigned char msdos_name[MSDOS_NAME];
287 int err, is_hid; 287 int err, is_hid;
288 288
289 lock_kernel(); 289 lock_super(sb);
290 290
291 err = msdos_format_name(dentry->d_name.name, dentry->d_name.len, 291 err = msdos_format_name(dentry->d_name.name, dentry->d_name.len,
292 msdos_name, &MSDOS_SB(sb)->options); 292 msdos_name, &MSDOS_SB(sb)->options);
@@ -315,7 +315,7 @@ static int msdos_create(struct inode *dir, struct dentry *dentry, int mode,
315 315
316 d_instantiate(dentry, inode); 316 d_instantiate(dentry, inode);
317out: 317out:
318 unlock_kernel(); 318 unlock_super(sb);
319 if (!err) 319 if (!err)
320 err = fat_flush_inodes(sb, dir, inode); 320 err = fat_flush_inodes(sb, dir, inode);
321 return err; 321 return err;
@@ -324,11 +324,12 @@ out:
324/***** Remove a directory */ 324/***** Remove a directory */
325static int msdos_rmdir(struct inode *dir, struct dentry *dentry) 325static int msdos_rmdir(struct inode *dir, struct dentry *dentry)
326{ 326{
327 struct super_block *sb = dir->i_sb;
327 struct inode *inode = dentry->d_inode; 328 struct inode *inode = dentry->d_inode;
328 struct fat_slot_info sinfo; 329 struct fat_slot_info sinfo;
329 int err; 330 int err;
330 331
331 lock_kernel(); 332 lock_super(sb);
332 /* 333 /*
333 * Check whether the directory is not in use, then check 334 * Check whether the directory is not in use, then check
334 * whether it is empty. 335 * whether it is empty.
@@ -349,9 +350,9 @@ static int msdos_rmdir(struct inode *dir, struct dentry *dentry)
349 inode->i_ctime = CURRENT_TIME_SEC; 350 inode->i_ctime = CURRENT_TIME_SEC;
350 fat_detach(inode); 351 fat_detach(inode);
351out: 352out:
352 unlock_kernel(); 353 unlock_super(sb);
353 if (!err) 354 if (!err)
354 err = fat_flush_inodes(inode->i_sb, dir, inode); 355 err = fat_flush_inodes(sb, dir, inode);
355 356
356 return err; 357 return err;
357} 358}
@@ -366,7 +367,7 @@ static int msdos_mkdir(struct inode *dir, struct dentry *dentry, int mode)
366 struct timespec ts; 367 struct timespec ts;
367 int err, is_hid, cluster; 368 int err, is_hid, cluster;
368 369
369 lock_kernel(); 370 lock_super(sb);
370 371
371 err = msdos_format_name(dentry->d_name.name, dentry->d_name.len, 372 err = msdos_format_name(dentry->d_name.name, dentry->d_name.len,
372 msdos_name, &MSDOS_SB(sb)->options); 373 msdos_name, &MSDOS_SB(sb)->options);
@@ -404,14 +405,14 @@ static int msdos_mkdir(struct inode *dir, struct dentry *dentry, int mode)
404 405
405 d_instantiate(dentry, inode); 406 d_instantiate(dentry, inode);
406 407
407 unlock_kernel(); 408 unlock_super(sb);
408 fat_flush_inodes(sb, dir, inode); 409 fat_flush_inodes(sb, dir, inode);
409 return 0; 410 return 0;
410 411
411out_free: 412out_free:
412 fat_free_clusters(dir, cluster); 413 fat_free_clusters(dir, cluster);
413out: 414out:
414 unlock_kernel(); 415 unlock_super(sb);
415 return err; 416 return err;
416} 417}
417 418
@@ -419,10 +420,11 @@ out:
419static int msdos_unlink(struct inode *dir, struct dentry *dentry) 420static int msdos_unlink(struct inode *dir, struct dentry *dentry)
420{ 421{
421 struct inode *inode = dentry->d_inode; 422 struct inode *inode = dentry->d_inode;
423 struct super_block *sb= inode->i_sb;
422 struct fat_slot_info sinfo; 424 struct fat_slot_info sinfo;
423 int err; 425 int err;
424 426
425 lock_kernel(); 427 lock_super(sb);
426 err = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo); 428 err = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo);
427 if (err) 429 if (err)
428 goto out; 430 goto out;
@@ -434,9 +436,9 @@ static int msdos_unlink(struct inode *dir, struct dentry *dentry)
434 inode->i_ctime = CURRENT_TIME_SEC; 436 inode->i_ctime = CURRENT_TIME_SEC;
435 fat_detach(inode); 437 fat_detach(inode);
436out: 438out:
437 unlock_kernel(); 439 unlock_super(sb);
438 if (!err) 440 if (!err)
439 err = fat_flush_inodes(inode->i_sb, dir, inode); 441 err = fat_flush_inodes(sb, dir, inode);
440 442
441 return err; 443 return err;
442} 444}
@@ -618,10 +620,11 @@ error_inode:
618static int msdos_rename(struct inode *old_dir, struct dentry *old_dentry, 620static int msdos_rename(struct inode *old_dir, struct dentry *old_dentry,
619 struct inode *new_dir, struct dentry *new_dentry) 621 struct inode *new_dir, struct dentry *new_dentry)
620{ 622{
623 struct super_block *sb = old_dir->i_sb;
621 unsigned char old_msdos_name[MSDOS_NAME], new_msdos_name[MSDOS_NAME]; 624 unsigned char old_msdos_name[MSDOS_NAME], new_msdos_name[MSDOS_NAME];
622 int err, is_hid; 625 int err, is_hid;
623 626
624 lock_kernel(); 627 lock_super(sb);
625 628
626 err = msdos_format_name(old_dentry->d_name.name, 629 err = msdos_format_name(old_dentry->d_name.name,
627 old_dentry->d_name.len, old_msdos_name, 630 old_dentry->d_name.len, old_msdos_name,
@@ -640,9 +643,9 @@ static int msdos_rename(struct inode *old_dir, struct dentry *old_dentry,
640 err = do_msdos_rename(old_dir, old_msdos_name, old_dentry, 643 err = do_msdos_rename(old_dir, old_msdos_name, old_dentry,
641 new_dir, new_msdos_name, new_dentry, is_hid); 644 new_dir, new_msdos_name, new_dentry, is_hid);
642out: 645out:
643 unlock_kernel(); 646 unlock_super(sb);
644 if (!err) 647 if (!err)
645 err = fat_flush_inodes(old_dir->i_sb, old_dir, new_dir); 648 err = fat_flush_inodes(sb, old_dir, new_dir);
646 return err; 649 return err;
647} 650}
648 651
diff --git a/fs/namespace.c b/fs/namespace.c
index 4fc302c2a0e0..4f6f7635b59c 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -750,7 +750,7 @@ struct proc_fs_info {
750 const char *str; 750 const char *str;
751}; 751};
752 752
753static void show_sb_opts(struct seq_file *m, struct super_block *sb) 753static int show_sb_opts(struct seq_file *m, struct super_block *sb)
754{ 754{
755 static const struct proc_fs_info fs_info[] = { 755 static const struct proc_fs_info fs_info[] = {
756 { MS_SYNCHRONOUS, ",sync" }, 756 { MS_SYNCHRONOUS, ",sync" },
@@ -764,6 +764,8 @@ static void show_sb_opts(struct seq_file *m, struct super_block *sb)
764 if (sb->s_flags & fs_infop->flag) 764 if (sb->s_flags & fs_infop->flag)
765 seq_puts(m, fs_infop->str); 765 seq_puts(m, fs_infop->str);
766 } 766 }
767
768 return security_sb_show_options(m, sb);
767} 769}
768 770
769static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt) 771static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt)
@@ -806,11 +808,14 @@ static int show_vfsmnt(struct seq_file *m, void *v)
806 seq_putc(m, ' '); 808 seq_putc(m, ' ');
807 show_type(m, mnt->mnt_sb); 809 show_type(m, mnt->mnt_sb);
808 seq_puts(m, __mnt_is_readonly(mnt) ? " ro" : " rw"); 810 seq_puts(m, __mnt_is_readonly(mnt) ? " ro" : " rw");
809 show_sb_opts(m, mnt->mnt_sb); 811 err = show_sb_opts(m, mnt->mnt_sb);
812 if (err)
813 goto out;
810 show_mnt_opts(m, mnt); 814 show_mnt_opts(m, mnt);
811 if (mnt->mnt_sb->s_op->show_options) 815 if (mnt->mnt_sb->s_op->show_options)
812 err = mnt->mnt_sb->s_op->show_options(m, mnt); 816 err = mnt->mnt_sb->s_op->show_options(m, mnt);
813 seq_puts(m, " 0 0\n"); 817 seq_puts(m, " 0 0\n");
818out:
814 return err; 819 return err;
815} 820}
816 821
@@ -865,10 +870,13 @@ static int show_mountinfo(struct seq_file *m, void *v)
865 seq_putc(m, ' '); 870 seq_putc(m, ' ');
866 mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none"); 871 mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none");
867 seq_puts(m, sb->s_flags & MS_RDONLY ? " ro" : " rw"); 872 seq_puts(m, sb->s_flags & MS_RDONLY ? " ro" : " rw");
868 show_sb_opts(m, sb); 873 err = show_sb_opts(m, sb);
874 if (err)
875 goto out;
869 if (sb->s_op->show_options) 876 if (sb->s_op->show_options)
870 err = sb->s_op->show_options(m, mnt); 877 err = sb->s_op->show_options(m, mnt);
871 seq_putc(m, '\n'); 878 seq_putc(m, '\n');
879out:
872 return err; 880 return err;
873} 881}
874 882
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index 2b145de45b39..6a7d901f1936 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -18,6 +18,7 @@
18#include <linux/slab.h> 18#include <linux/slab.h>
19#include <linux/vmalloc.h> 19#include <linux/vmalloc.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/smp_lock.h>
21 22
22#include <linux/ncp_fs.h> 23#include <linux/ncp_fs.h>
23#include "ncplib_kernel.h" 24#include "ncplib_kernel.h"
@@ -281,9 +282,18 @@ static int ncp_release(struct inode *inode, struct file *file) {
281 return 0; 282 return 0;
282} 283}
283 284
285static loff_t ncp_remote_llseek(struct file *file, loff_t offset, int origin)
286{
287 loff_t ret;
288 lock_kernel();
289 ret = generic_file_llseek_unlocked(file, offset, origin);
290 unlock_kernel();
291 return ret;
292}
293
284const struct file_operations ncp_file_operations = 294const struct file_operations ncp_file_operations =
285{ 295{
286 .llseek = remote_llseek, 296 .llseek = ncp_remote_llseek,
287 .read = ncp_file_read, 297 .read = ncp_file_read,
288 .write = ncp_file_write, 298 .write = ncp_file_write,
289 .ioctl = ncp_ioctl, 299 .ioctl = ncp_ioctl,
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 509dcb58959e..43164fe86069 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -180,6 +180,8 @@ force_reval:
180 180
181static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin) 181static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin)
182{ 182{
183 loff_t loff;
184
183 dprintk("NFS: llseek file(%s/%s, %lld, %d)\n", 185 dprintk("NFS: llseek file(%s/%s, %lld, %d)\n",
184 filp->f_path.dentry->d_parent->d_name.name, 186 filp->f_path.dentry->d_parent->d_name.name,
185 filp->f_path.dentry->d_name.name, 187 filp->f_path.dentry->d_name.name,
@@ -192,7 +194,10 @@ static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin)
192 if (retval < 0) 194 if (retval < 0)
193 return (loff_t)retval; 195 return (loff_t)retval;
194 } 196 }
195 return remote_llseek(filp, offset, origin); 197 lock_kernel(); /* BKL needed? */
198 loff = generic_file_llseek_unlocked(filp, offset, origin);
199 unlock_kernel();
200 return loff;
196} 201}
197 202
198/* 203/*
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index efc015c6128a..44f87caf3683 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -606,7 +606,9 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
606 606
607 res->last_used = 0; 607 res->last_used = 0;
608 608
609 spin_lock(&dlm->spinlock);
609 list_add_tail(&res->tracking, &dlm->tracking_list); 610 list_add_tail(&res->tracking, &dlm->tracking_list);
611 spin_unlock(&dlm->spinlock);
610 612
611 memset(res->lvb, 0, DLM_LVB_LEN); 613 memset(res->lvb, 0, DLM_LVB_LEN);
612 memset(res->refmap, 0, sizeof(res->refmap)); 614 memset(res->refmap, 0, sizeof(res->refmap));
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 394d25a131a5..80e20d9f2780 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -1554,8 +1554,8 @@ out:
1554 */ 1554 */
1555int ocfs2_file_lock(struct file *file, int ex, int trylock) 1555int ocfs2_file_lock(struct file *file, int ex, int trylock)
1556{ 1556{
1557 int ret, level = ex ? LKM_EXMODE : LKM_PRMODE; 1557 int ret, level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
1558 unsigned int lkm_flags = trylock ? LKM_NOQUEUE : 0; 1558 unsigned int lkm_flags = trylock ? DLM_LKF_NOQUEUE : 0;
1559 unsigned long flags; 1559 unsigned long flags;
1560 struct ocfs2_file_private *fp = file->private_data; 1560 struct ocfs2_file_private *fp = file->private_data;
1561 struct ocfs2_lock_res *lockres = &fp->fp_flock; 1561 struct ocfs2_lock_res *lockres = &fp->fp_flock;
@@ -1582,7 +1582,7 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock)
1582 * Get the lock at NLMODE to start - that way we 1582 * Get the lock at NLMODE to start - that way we
1583 * can cancel the upconvert request if need be. 1583 * can cancel the upconvert request if need be.
1584 */ 1584 */
1585 ret = ocfs2_lock_create(osb, lockres, LKM_NLMODE, 0); 1585 ret = ocfs2_lock_create(osb, lockres, DLM_LOCK_NL, 0);
1586 if (ret < 0) { 1586 if (ret < 0) {
1587 mlog_errno(ret); 1587 mlog_errno(ret);
1588 goto out; 1588 goto out;
@@ -1597,7 +1597,7 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock)
1597 } 1597 }
1598 1598
1599 lockres->l_action = OCFS2_AST_CONVERT; 1599 lockres->l_action = OCFS2_AST_CONVERT;
1600 lkm_flags |= LKM_CONVERT; 1600 lkm_flags |= DLM_LKF_CONVERT;
1601 lockres->l_requested = level; 1601 lockres->l_requested = level;
1602 lockres_or_flags(lockres, OCFS2_LOCK_BUSY); 1602 lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
1603 1603
@@ -1664,7 +1664,7 @@ void ocfs2_file_unlock(struct file *file)
1664 if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) 1664 if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED))
1665 return; 1665 return;
1666 1666
1667 if (lockres->l_level == LKM_NLMODE) 1667 if (lockres->l_level == DLM_LOCK_NL)
1668 return; 1668 return;
1669 1669
1670 mlog(0, "Unlock: \"%s\" flags: 0x%lx, level: %d, act: %d\n", 1670 mlog(0, "Unlock: \"%s\" flags: 0x%lx, level: %d, act: %d\n",
@@ -1678,11 +1678,11 @@ void ocfs2_file_unlock(struct file *file)
1678 lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED); 1678 lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
1679 lockres->l_blocking = DLM_LOCK_EX; 1679 lockres->l_blocking = DLM_LOCK_EX;
1680 1680
1681 gen = ocfs2_prepare_downconvert(lockres, LKM_NLMODE); 1681 gen = ocfs2_prepare_downconvert(lockres, DLM_LOCK_NL);
1682 lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0); 1682 lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
1683 spin_unlock_irqrestore(&lockres->l_lock, flags); 1683 spin_unlock_irqrestore(&lockres->l_lock, flags);
1684 1684
1685 ret = ocfs2_downconvert_lock(osb, lockres, LKM_NLMODE, 0, gen); 1685 ret = ocfs2_downconvert_lock(osb, lockres, DLM_LOCK_NL, 0, gen);
1686 if (ret) { 1686 if (ret) {
1687 mlog_errno(ret); 1687 mlog_errno(ret);
1688 return; 1688 return;
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index c021280dd462..bd7e0f3acfc7 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -21,6 +21,7 @@
21#include <linux/fs.h> 21#include <linux/fs.h>
22#include <linux/miscdevice.h> 22#include <linux/miscdevice.h>
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/smp_lock.h>
24#include <linux/reboot.h> 25#include <linux/reboot.h>
25#include <asm/uaccess.h> 26#include <asm/uaccess.h>
26 27
@@ -619,10 +620,12 @@ static int ocfs2_control_open(struct inode *inode, struct file *file)
619 return -ENOMEM; 620 return -ENOMEM;
620 p->op_this_node = -1; 621 p->op_this_node = -1;
621 622
623 lock_kernel();
622 mutex_lock(&ocfs2_control_lock); 624 mutex_lock(&ocfs2_control_lock);
623 file->private_data = p; 625 file->private_data = p;
624 list_add(&p->op_list, &ocfs2_control_private_list); 626 list_add(&p->op_list, &ocfs2_control_private_list);
625 mutex_unlock(&ocfs2_control_lock); 627 mutex_unlock(&ocfs2_control_lock);
628 unlock_kernel();
626 629
627 return 0; 630 return 0;
628} 631}
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 3b455371e7ff..58c3e6a8e15e 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -233,7 +233,7 @@ static int check_mem_permission(struct task_struct *task)
233 */ 233 */
234 if (task->parent == current && (task->ptrace & PT_PTRACED) && 234 if (task->parent == current && (task->ptrace & PT_PTRACED) &&
235 task_is_stopped_or_traced(task) && 235 task_is_stopped_or_traced(task) &&
236 ptrace_may_attach(task)) 236 ptrace_may_access(task, PTRACE_MODE_ATTACH))
237 return 0; 237 return 0;
238 238
239 /* 239 /*
@@ -251,7 +251,8 @@ struct mm_struct *mm_for_maps(struct task_struct *task)
251 task_lock(task); 251 task_lock(task);
252 if (task->mm != mm) 252 if (task->mm != mm)
253 goto out; 253 goto out;
254 if (task->mm != current->mm && __ptrace_may_attach(task) < 0) 254 if (task->mm != current->mm &&
255 __ptrace_may_access(task, PTRACE_MODE_READ) < 0)
255 goto out; 256 goto out;
256 task_unlock(task); 257 task_unlock(task);
257 return mm; 258 return mm;
@@ -518,7 +519,7 @@ static int proc_fd_access_allowed(struct inode *inode)
518 */ 519 */
519 task = get_proc_task(inode); 520 task = get_proc_task(inode);
520 if (task) { 521 if (task) {
521 allowed = ptrace_may_attach(task); 522 allowed = ptrace_may_access(task, PTRACE_MODE_READ);
522 put_task_struct(task); 523 put_task_struct(task);
523 } 524 }
524 return allowed; 525 return allowed;
@@ -904,7 +905,7 @@ static ssize_t environ_read(struct file *file, char __user *buf,
904 if (!task) 905 if (!task)
905 goto out_no_task; 906 goto out_no_task;
906 907
907 if (!ptrace_may_attach(task)) 908 if (!ptrace_may_access(task, PTRACE_MODE_READ))
908 goto out; 909 goto out;
909 910
910 ret = -ENOMEM; 911 ret = -ENOMEM;
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 7e277f2ad466..c652d469dc08 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -123,6 +123,11 @@ static int uptime_read_proc(char *page, char **start, off_t off,
123 return proc_calc_metrics(page, start, off, count, eof, len); 123 return proc_calc_metrics(page, start, off, count, eof, len);
124} 124}
125 125
126int __attribute__((weak)) arch_report_meminfo(char *page)
127{
128 return 0;
129}
130
126static int meminfo_read_proc(char *page, char **start, off_t off, 131static int meminfo_read_proc(char *page, char **start, off_t off,
127 int count, int *eof, void *data) 132 int count, int *eof, void *data)
128{ 133{
@@ -221,6 +226,8 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
221 226
222 len += hugetlb_report_meminfo(page + len); 227 len += hugetlb_report_meminfo(page + len);
223 228
229 len += arch_report_meminfo(page + len);
230
224 return proc_calc_metrics(page, start, off, count, eof, len); 231 return proc_calc_metrics(page, start, off, count, eof, len);
225#undef K 232#undef K
226} 233}
@@ -472,6 +479,13 @@ static const struct file_operations proc_vmalloc_operations = {
472}; 479};
473#endif 480#endif
474 481
482#ifndef arch_irq_stat_cpu
483#define arch_irq_stat_cpu(cpu) 0
484#endif
485#ifndef arch_irq_stat
486#define arch_irq_stat() 0
487#endif
488
475static int show_stat(struct seq_file *p, void *v) 489static int show_stat(struct seq_file *p, void *v)
476{ 490{
477 int i; 491 int i;
@@ -509,7 +523,9 @@ static int show_stat(struct seq_file *p, void *v)
509 sum += temp; 523 sum += temp;
510 per_irq_sum[j] += temp; 524 per_irq_sum[j] += temp;
511 } 525 }
526 sum += arch_irq_stat_cpu(i);
512 } 527 }
528 sum += arch_irq_stat();
513 529
514 seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n", 530 seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
515 (unsigned long long)cputime64_to_clock_t(user), 531 (unsigned long long)cputime64_to_clock_t(user),
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index c492449f3b45..164bd9f9ede3 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -210,7 +210,7 @@ static int show_map(struct seq_file *m, void *v)
210 dev_t dev = 0; 210 dev_t dev = 0;
211 int len; 211 int len;
212 212
213 if (maps_protect && !ptrace_may_attach(task)) 213 if (maps_protect && !ptrace_may_access(task, PTRACE_MODE_READ))
214 return -EACCES; 214 return -EACCES;
215 215
216 if (file) { 216 if (file) {
@@ -646,7 +646,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
646 goto out; 646 goto out;
647 647
648 ret = -EACCES; 648 ret = -EACCES;
649 if (!ptrace_may_attach(task)) 649 if (!ptrace_may_access(task, PTRACE_MODE_READ))
650 goto out_task; 650 goto out_task;
651 651
652 ret = -EINVAL; 652 ret = -EINVAL;
@@ -747,7 +747,7 @@ static int show_numa_map_checked(struct seq_file *m, void *v)
747 struct proc_maps_private *priv = m->private; 747 struct proc_maps_private *priv = m->private;
748 struct task_struct *task = priv->task; 748 struct task_struct *task = priv->task;
749 749
750 if (maps_protect && !ptrace_may_attach(task)) 750 if (maps_protect && !ptrace_may_access(task, PTRACE_MODE_READ))
751 return -EACCES; 751 return -EACCES;
752 752
753 return show_numa_map(m, v); 753 return show_numa_map(m, v);
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 4b4f9cc2f186..5d84e7121df8 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -113,7 +113,7 @@ static int show_map(struct seq_file *m, void *_vml)
113 struct proc_maps_private *priv = m->private; 113 struct proc_maps_private *priv = m->private;
114 struct task_struct *task = priv->task; 114 struct task_struct *task = priv->task;
115 115
116 if (maps_protect && !ptrace_may_attach(task)) 116 if (maps_protect && !ptrace_may_access(task, PTRACE_MODE_READ))
117 return -EACCES; 117 return -EACCES;
118 118
119 return nommu_vma_show(m, vml->vma); 119 return nommu_vma_show(m, vml->vma);
diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c
index 9590b9024300..78f613cb9c76 100644
--- a/fs/ramfs/file-mmu.c
+++ b/fs/ramfs/file-mmu.c
@@ -45,6 +45,7 @@ const struct file_operations ramfs_file_operations = {
45 .mmap = generic_file_mmap, 45 .mmap = generic_file_mmap,
46 .fsync = simple_sync_file, 46 .fsync = simple_sync_file,
47 .splice_read = generic_file_splice_read, 47 .splice_read = generic_file_splice_read,
48 .splice_write = generic_file_splice_write,
48 .llseek = generic_file_llseek, 49 .llseek = generic_file_llseek,
49}; 50};
50 51
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 0989bc2c2f69..52312ec93ff4 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -43,6 +43,7 @@ const struct file_operations ramfs_file_operations = {
43 .aio_write = generic_file_aio_write, 43 .aio_write = generic_file_aio_write,
44 .fsync = simple_sync_file, 44 .fsync = simple_sync_file,
45 .splice_read = generic_file_splice_read, 45 .splice_read = generic_file_splice_read,
46 .splice_write = generic_file_splice_write,
46 .llseek = generic_file_llseek, 47 .llseek = generic_file_llseek,
47}; 48};
48 49
diff --git a/fs/read_write.c b/fs/read_write.c
index f0d1240a5c69..9ba495d5a29b 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -31,12 +31,12 @@ const struct file_operations generic_ro_fops = {
31 31
32EXPORT_SYMBOL(generic_ro_fops); 32EXPORT_SYMBOL(generic_ro_fops);
33 33
34loff_t generic_file_llseek(struct file *file, loff_t offset, int origin) 34loff_t
35generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin)
35{ 36{
36 loff_t retval; 37 loff_t retval;
37 struct inode *inode = file->f_mapping->host; 38 struct inode *inode = file->f_mapping->host;
38 39
39 mutex_lock(&inode->i_mutex);
40 switch (origin) { 40 switch (origin) {
41 case SEEK_END: 41 case SEEK_END:
42 offset += inode->i_size; 42 offset += inode->i_size;
@@ -46,42 +46,26 @@ loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
46 } 46 }
47 retval = -EINVAL; 47 retval = -EINVAL;
48 if (offset>=0 && offset<=inode->i_sb->s_maxbytes) { 48 if (offset>=0 && offset<=inode->i_sb->s_maxbytes) {
49 /* Special lock needed here? */
49 if (offset != file->f_pos) { 50 if (offset != file->f_pos) {
50 file->f_pos = offset; 51 file->f_pos = offset;
51 file->f_version = 0; 52 file->f_version = 0;
52 } 53 }
53 retval = offset; 54 retval = offset;
54 } 55 }
55 mutex_unlock(&inode->i_mutex);
56 return retval; 56 return retval;
57} 57}
58EXPORT_SYMBOL(generic_file_llseek_unlocked);
58 59
59EXPORT_SYMBOL(generic_file_llseek); 60loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
60
61loff_t remote_llseek(struct file *file, loff_t offset, int origin)
62{ 61{
63 loff_t retval; 62 loff_t n;
64 63 mutex_lock(&file->f_dentry->d_inode->i_mutex);
65 lock_kernel(); 64 n = generic_file_llseek_unlocked(file, offset, origin);
66 switch (origin) { 65 mutex_unlock(&file->f_dentry->d_inode->i_mutex);
67 case SEEK_END: 66 return n;
68 offset += i_size_read(file->f_path.dentry->d_inode);
69 break;
70 case SEEK_CUR:
71 offset += file->f_pos;
72 }
73 retval = -EINVAL;
74 if (offset>=0 && offset<=file->f_path.dentry->d_inode->i_sb->s_maxbytes) {
75 if (offset != file->f_pos) {
76 file->f_pos = offset;
77 file->f_version = 0;
78 }
79 retval = offset;
80 }
81 unlock_kernel();
82 return retval;
83} 67}
84EXPORT_SYMBOL(remote_llseek); 68EXPORT_SYMBOL(generic_file_llseek);
85 69
86loff_t no_llseek(struct file *file, loff_t offset, int origin) 70loff_t no_llseek(struct file *file, loff_t offset, int origin)
87{ 71{
diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c
index efbe29af3d7a..2294783320cb 100644
--- a/fs/smbfs/file.c
+++ b/fs/smbfs/file.c
@@ -422,9 +422,18 @@ smb_file_permission(struct inode *inode, int mask, struct nameidata *nd)
422 return error; 422 return error;
423} 423}
424 424
425static loff_t smb_remote_llseek(struct file *file, loff_t offset, int origin)
426{
427 loff_t ret;
428 lock_kernel();
429 ret = generic_file_llseek_unlocked(file, offset, origin);
430 unlock_kernel();
431 return ret;
432}
433
425const struct file_operations smb_file_operations = 434const struct file_operations smb_file_operations =
426{ 435{
427 .llseek = remote_llseek, 436 .llseek = smb_remote_llseek,
428 .read = do_sync_read, 437 .read = do_sync_read,
429 .aio_read = smb_file_aio_read, 438 .aio_read = smb_file_aio_read,
430 .write = do_sync_write, 439 .write = do_sync_write,
diff --git a/fs/splice.c b/fs/splice.c
index aa5f6f60b305..399442179d89 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -379,13 +379,22 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
379 lock_page(page); 379 lock_page(page);
380 380
381 /* 381 /*
382 * page was truncated, stop here. if this isn't the 382 * Page was truncated, or invalidated by the
383 * first page, we'll just complete what we already 383 * filesystem. Redo the find/create, but this time the
384 * added 384 * page is kept locked, so there's no chance of another
385 * race with truncate/invalidate.
385 */ 386 */
386 if (!page->mapping) { 387 if (!page->mapping) {
387 unlock_page(page); 388 unlock_page(page);
388 break; 389 page = find_or_create_page(mapping, index,
390 mapping_gfp_mask(mapping));
391
392 if (!page) {
393 error = -ENOMEM;
394 break;
395 }
396 page_cache_release(pages[page_nr]);
397 pages[page_nr] = page;
389 } 398 }
390 /* 399 /*
391 * page was already under io and is now done, great 400 * page was already under io and is now done, great
diff --git a/fs/vfat/namei.c b/fs/vfat/namei.c
index a3522727ea5b..b546ba69be82 100644
--- a/fs/vfat/namei.c
+++ b/fs/vfat/namei.c
@@ -645,7 +645,7 @@ static int vfat_add_entry(struct inode *dir, struct qstr *qname, int is_dir,
645 if (len == 0) 645 if (len == 0)
646 return -ENOENT; 646 return -ENOENT;
647 647
648 slots = kmalloc(sizeof(*slots) * MSDOS_SLOTS, GFP_KERNEL); 648 slots = kmalloc(sizeof(*slots) * MSDOS_SLOTS, GFP_NOFS);
649 if (slots == NULL) 649 if (slots == NULL)
650 return -ENOMEM; 650 return -ENOMEM;
651 651
@@ -687,7 +687,7 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
687 struct dentry *alias; 687 struct dentry *alias;
688 int err, table; 688 int err, table;
689 689
690 lock_kernel(); 690 lock_super(sb);
691 table = (MSDOS_SB(sb)->options.name_check == 's') ? 2 : 0; 691 table = (MSDOS_SB(sb)->options.name_check == 's') ? 2 : 0;
692 dentry->d_op = &vfat_dentry_ops[table]; 692 dentry->d_op = &vfat_dentry_ops[table];
693 693
@@ -699,7 +699,7 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
699 inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos); 699 inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
700 brelse(sinfo.bh); 700 brelse(sinfo.bh);
701 if (IS_ERR(inode)) { 701 if (IS_ERR(inode)) {
702 unlock_kernel(); 702 unlock_super(sb);
703 return ERR_CAST(inode); 703 return ERR_CAST(inode);
704 } 704 }
705 alias = d_find_alias(inode); 705 alias = d_find_alias(inode);
@@ -708,13 +708,13 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
708 dput(alias); 708 dput(alias);
709 else { 709 else {
710 iput(inode); 710 iput(inode);
711 unlock_kernel(); 711 unlock_super(sb);
712 return alias; 712 return alias;
713 } 713 }
714 714
715 } 715 }
716error: 716error:
717 unlock_kernel(); 717 unlock_super(sb);
718 dentry->d_op = &vfat_dentry_ops[table]; 718 dentry->d_op = &vfat_dentry_ops[table];
719 dentry->d_time = dentry->d_parent->d_inode->i_version; 719 dentry->d_time = dentry->d_parent->d_inode->i_version;
720 dentry = d_splice_alias(inode, dentry); 720 dentry = d_splice_alias(inode, dentry);
@@ -734,7 +734,7 @@ static int vfat_create(struct inode *dir, struct dentry *dentry, int mode,
734 struct timespec ts; 734 struct timespec ts;
735 int err; 735 int err;
736 736
737 lock_kernel(); 737 lock_super(sb);
738 738
739 ts = CURRENT_TIME_SEC; 739 ts = CURRENT_TIME_SEC;
740 err = vfat_add_entry(dir, &dentry->d_name, 0, 0, &ts, &sinfo); 740 err = vfat_add_entry(dir, &dentry->d_name, 0, 0, &ts, &sinfo);
@@ -755,17 +755,18 @@ static int vfat_create(struct inode *dir, struct dentry *dentry, int mode,
755 dentry->d_time = dentry->d_parent->d_inode->i_version; 755 dentry->d_time = dentry->d_parent->d_inode->i_version;
756 d_instantiate(dentry, inode); 756 d_instantiate(dentry, inode);
757out: 757out:
758 unlock_kernel(); 758 unlock_super(sb);
759 return err; 759 return err;
760} 760}
761 761
762static int vfat_rmdir(struct inode *dir, struct dentry *dentry) 762static int vfat_rmdir(struct inode *dir, struct dentry *dentry)
763{ 763{
764 struct inode *inode = dentry->d_inode; 764 struct inode *inode = dentry->d_inode;
765 struct super_block *sb = dir->i_sb;
765 struct fat_slot_info sinfo; 766 struct fat_slot_info sinfo;
766 int err; 767 int err;
767 768
768 lock_kernel(); 769 lock_super(sb);
769 770
770 err = fat_dir_empty(inode); 771 err = fat_dir_empty(inode);
771 if (err) 772 if (err)
@@ -783,7 +784,7 @@ static int vfat_rmdir(struct inode *dir, struct dentry *dentry)
783 inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC; 784 inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC;
784 fat_detach(inode); 785 fat_detach(inode);
785out: 786out:
786 unlock_kernel(); 787 unlock_super(sb);
787 788
788 return err; 789 return err;
789} 790}
@@ -791,10 +792,11 @@ out:
791static int vfat_unlink(struct inode *dir, struct dentry *dentry) 792static int vfat_unlink(struct inode *dir, struct dentry *dentry)
792{ 793{
793 struct inode *inode = dentry->d_inode; 794 struct inode *inode = dentry->d_inode;
795 struct super_block *sb = dir->i_sb;
794 struct fat_slot_info sinfo; 796 struct fat_slot_info sinfo;
795 int err; 797 int err;
796 798
797 lock_kernel(); 799 lock_super(sb);
798 800
799 err = vfat_find(dir, &dentry->d_name, &sinfo); 801 err = vfat_find(dir, &dentry->d_name, &sinfo);
800 if (err) 802 if (err)
@@ -807,7 +809,7 @@ static int vfat_unlink(struct inode *dir, struct dentry *dentry)
807 inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC; 809 inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC;
808 fat_detach(inode); 810 fat_detach(inode);
809out: 811out:
810 unlock_kernel(); 812 unlock_super(sb);
811 813
812 return err; 814 return err;
813} 815}
@@ -820,7 +822,7 @@ static int vfat_mkdir(struct inode *dir, struct dentry *dentry, int mode)
820 struct timespec ts; 822 struct timespec ts;
821 int err, cluster; 823 int err, cluster;
822 824
823 lock_kernel(); 825 lock_super(sb);
824 826
825 ts = CURRENT_TIME_SEC; 827 ts = CURRENT_TIME_SEC;
826 cluster = fat_alloc_new_dir(dir, &ts); 828 cluster = fat_alloc_new_dir(dir, &ts);
@@ -849,13 +851,13 @@ static int vfat_mkdir(struct inode *dir, struct dentry *dentry, int mode)
849 dentry->d_time = dentry->d_parent->d_inode->i_version; 851 dentry->d_time = dentry->d_parent->d_inode->i_version;
850 d_instantiate(dentry, inode); 852 d_instantiate(dentry, inode);
851 853
852 unlock_kernel(); 854 unlock_super(sb);
853 return 0; 855 return 0;
854 856
855out_free: 857out_free:
856 fat_free_clusters(dir, cluster); 858 fat_free_clusters(dir, cluster);
857out: 859out:
858 unlock_kernel(); 860 unlock_super(sb);
859 return err; 861 return err;
860} 862}
861 863
@@ -869,11 +871,12 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
869 struct timespec ts; 871 struct timespec ts;
870 loff_t dotdot_i_pos, new_i_pos; 872 loff_t dotdot_i_pos, new_i_pos;
871 int err, is_dir, update_dotdot, corrupt = 0; 873 int err, is_dir, update_dotdot, corrupt = 0;
874 struct super_block *sb = old_dir->i_sb;
872 875
873 old_sinfo.bh = sinfo.bh = dotdot_bh = NULL; 876 old_sinfo.bh = sinfo.bh = dotdot_bh = NULL;
874 old_inode = old_dentry->d_inode; 877 old_inode = old_dentry->d_inode;
875 new_inode = new_dentry->d_inode; 878 new_inode = new_dentry->d_inode;
876 lock_kernel(); 879 lock_super(sb);
877 err = vfat_find(old_dir, &old_dentry->d_name, &old_sinfo); 880 err = vfat_find(old_dir, &old_dentry->d_name, &old_sinfo);
878 if (err) 881 if (err)
879 goto out; 882 goto out;
@@ -951,7 +954,7 @@ out:
951 brelse(sinfo.bh); 954 brelse(sinfo.bh);
952 brelse(dotdot_bh); 955 brelse(dotdot_bh);
953 brelse(old_sinfo.bh); 956 brelse(old_sinfo.bh);
954 unlock_kernel(); 957 unlock_super(sb);
955 958
956 return err; 959 return err;
957 960
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index afaee301b0ee..ad3d26ddfe31 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -2427,13 +2427,20 @@ restart:
2427 if (iclog->ic_size - iclog->ic_offset < 2*sizeof(xlog_op_header_t)) { 2427 if (iclog->ic_size - iclog->ic_offset < 2*sizeof(xlog_op_header_t)) {
2428 xlog_state_switch_iclogs(log, iclog, iclog->ic_size); 2428 xlog_state_switch_iclogs(log, iclog, iclog->ic_size);
2429 2429
2430 /* If I'm the only one writing to this iclog, sync it to disk */ 2430 /*
2431 if (atomic_read(&iclog->ic_refcnt) == 1) { 2431 * If I'm the only one writing to this iclog, sync it to disk.
2432 * We need to do an atomic compare and decrement here to avoid
2433 * racing with concurrent atomic_dec_and_lock() calls in
2434 * xlog_state_release_iclog() when there is more than one
2435 * reference to the iclog.
2436 */
2437 if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1)) {
2438 /* we are the only one */
2432 spin_unlock(&log->l_icloglock); 2439 spin_unlock(&log->l_icloglock);
2433 if ((error = xlog_state_release_iclog(log, iclog))) 2440 error = xlog_state_release_iclog(log, iclog);
2441 if (error)
2434 return error; 2442 return error;
2435 } else { 2443 } else {
2436 atomic_dec(&iclog->ic_refcnt);
2437 spin_unlock(&log->l_icloglock); 2444 spin_unlock(&log->l_icloglock);
2438 } 2445 }
2439 goto restart; 2446 goto restart;