author    Steve French <sfrench@us.ibm.com>    2005-10-31 11:36:11 -0500
committer Steve French <sfrench@us.ibm.com>    2005-10-31 11:36:11 -0500
commit    53b2ec5518aa2623e8c0cb36f1c304a797988a46 (patch)
tree      465d8631ade6c2fcbd7576ff9813d00116c6a1e8 /fs
parent    0753ca7bc2b876dd136e9db11a20f85cbe4e08b1 (diff)
parent    581c1b14394aee60aff46ea67d05483261ed6527 (diff)
Merge with /pub/scm/linux/kernel/git/torvalds/linux-2.6.git
Diffstat (limited to 'fs')
-rw-r--r--  fs/Kconfig | 2
-rw-r--r--  fs/Kconfig.binfmt | 2
-rw-r--r--  fs/afs/file.c | 8
-rw-r--r--  fs/attr.c | 3
-rw-r--r--  fs/binfmt_aout.c | 1
-rw-r--r--  fs/binfmt_elf.c | 5
-rw-r--r--  fs/binfmt_elf_fdpic.c | 7
-rw-r--r--  fs/binfmt_flat.c | 1
-rw-r--r--  fs/binfmt_som.c | 1
-rw-r--r--  fs/bio.c | 4
-rw-r--r--  fs/buffer.c | 25
-rw-r--r--  fs/coda/psdev.c | 4
-rw-r--r--  fs/compat.c | 1
-rw-r--r--  fs/compat_ioctl.c | 5
-rw-r--r--  fs/dcache.c | 2
-rw-r--r--  fs/direct-io.c | 4
-rw-r--r--  fs/dquot.c | 4
-rw-r--r--  fs/exec.c | 68
-rw-r--r--  fs/ext2/inode.c | 4
-rw-r--r--  fs/ext3/balloc.c | 7
-rw-r--r--  fs/ext3/bitmap.c | 2
-rw-r--r--  fs/ext3/bitmap.h | 8
-rw-r--r--  fs/ext3/ialloc.c | 3
-rw-r--r--  fs/ext3/inode.c | 13
-rw-r--r--  fs/ext3/namei.c | 2
-rw-r--r--  fs/ext3/namei.h | 8
-rw-r--r--  fs/ext3/resize.c | 10
-rw-r--r--  fs/ext3/super.c | 30
-rw-r--r--  fs/ext3/xattr.c | 8
-rw-r--r--  fs/fat/dir.c | 230
-rw-r--r--  fs/file_table.c | 14
-rw-r--r--  fs/filesystems.c | 1
-rw-r--r--  fs/fs-writeback.c | 28
-rw-r--r--  fs/fuse/dev.c | 6
-rw-r--r--  fs/fuse/dir.c | 5
-rw-r--r--  fs/fuse/fuse_i.h | 12
-rw-r--r--  fs/hfs/inode.c | 2
-rw-r--r--  fs/hfsplus/inode.c | 2
-rw-r--r--  fs/hugetlbfs/inode.c | 206
-rw-r--r--  fs/inode.c | 3
-rw-r--r--  fs/jbd/journal.c | 2
-rw-r--r--  fs/jbd/transaction.c | 2
-rw-r--r--  fs/jffs2/background.c | 1
-rw-r--r--  fs/jffs2/wbuf.c | 2
-rw-r--r--  fs/jfs/jfs_dmap.c | 20
-rw-r--r--  fs/jfs/jfs_imap.c | 10
-rw-r--r--  fs/jfs/jfs_metapage.c | 22
-rw-r--r--  fs/jfs/jfs_txnmgr.c | 2
-rw-r--r--  fs/jfs/jfs_xtree.c | 18
-rw-r--r--  fs/jfs/super.c | 1
-rw-r--r--  fs/lockd/host.c | 4
-rw-r--r--  fs/locks.c | 48
-rw-r--r--  fs/mbcache.c | 6
-rw-r--r--  fs/msdos/namei.c | 14
-rw-r--r--  fs/namei.c | 101
-rw-r--r--  fs/nfs/delegation.c | 2
-rw-r--r--  fs/nfs/delegation.h | 16
-rw-r--r--  fs/nfs/dir.c | 67
-rw-r--r--  fs/nfs/file.c | 31
-rw-r--r--  fs/nfs/inode.c | 202
-rw-r--r--  fs/nfs/nfs2xdr.c | 1
-rw-r--r--  fs/nfs/nfs3proc.c | 92
-rw-r--r--  fs/nfs/nfs3xdr.c | 1
-rw-r--r--  fs/nfs/nfs4_fs.h | 53
-rw-r--r--  fs/nfs/nfs4proc.c | 735
-rw-r--r--  fs/nfs/nfs4state.c | 181
-rw-r--r--  fs/nfs/nfs4xdr.c | 305
-rw-r--r--  fs/nfs/proc.c | 44
-rw-r--r--  fs/nfs/read.c | 1
-rw-r--r--  fs/nfs/write.c | 2
-rw-r--r--  fs/ntfs/ChangeLog | 85
-rw-r--r--  fs/ntfs/Makefile | 2
-rw-r--r--  fs/ntfs/aops.c | 832
-rw-r--r--  fs/ntfs/attrib.c | 983
-rw-r--r--  fs/ntfs/attrib.h | 10
-rw-r--r--  fs/ntfs/file.c | 2255
-rw-r--r--  fs/ntfs/inode.c | 514
-rw-r--r--  fs/ntfs/layout.h | 31
-rw-r--r--  fs/ntfs/lcnalloc.c | 56
-rw-r--r--  fs/ntfs/lcnalloc.h | 43
-rw-r--r--  fs/ntfs/malloc.h | 3
-rw-r--r--  fs/ntfs/mft.c | 26
-rw-r--r--  fs/ntfs/super.c | 2
-rw-r--r--  fs/open.c | 79
-rw-r--r--  fs/partitions/check.c | 29
-rw-r--r--  fs/proc/array.c | 2
-rw-r--r--  fs/proc/generic.c | 2
-rw-r--r--  fs/proc/inode.c | 17
-rw-r--r--  fs/proc/proc_misc.c | 8
-rw-r--r--  fs/proc/task_mmu.c | 51
-rw-r--r--  fs/reiserfs/fix_node.c | 2
-rw-r--r--  fs/reiserfs/inode.c | 2
-rw-r--r--  fs/reiserfs/super.c | 25
-rw-r--r--  fs/reiserfs/xattr.c | 2
-rw-r--r--  fs/reiserfs/xattr_acl.c | 3
-rw-r--r--  fs/super.c | 2
-rw-r--r--  fs/vfat/namei.c | 20
-rw-r--r--  fs/xattr.c | 14
-rw-r--r--  fs/xfs/linux-2.6/kmem.c | 22
-rw-r--r--  fs/xfs/linux-2.6/kmem.h | 18
-rw-r--r--  fs/xfs/linux-2.6/xfs_aops.c | 2
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.c | 15
102 files changed, 5722 insertions(+), 2172 deletions(-)
diff --git a/fs/Kconfig b/fs/Kconfig
index 48f5422cb19a..01a295232f75 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -810,7 +810,7 @@ config TMPFS
 
 config HUGETLBFS
 	bool "HugeTLB file system support"
-	depends X86 || IA64 || PPC64 || SPARC64 || SUPERH || X86_64 || BROKEN
+	depends X86 || IA64 || PPC64 || SPARC64 || SUPERH || BROKEN
 
 config HUGETLB_PAGE
 	def_bool HUGETLBFS
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index 434c19d076ac..175b2e8177c1 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -57,7 +57,7 @@ config BINFMT_SHARED_FLAT
 
 config BINFMT_AOUT
 	tristate "Kernel support for a.out and ECOFF binaries"
-	depends on (X86 && !X86_64) || ALPHA || ARM || M68K || SPARC32
+	depends on X86_32 || ALPHA || ARM || M68K || SPARC32
 	---help---
 	  A.out (Assembler.OUTput) is a set of formats for libraries and
 	  executables used in the earliest versions of UNIX. Linux used
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 23c125128024..4975c9c193dd 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -29,7 +29,7 @@ static int afs_file_release(struct inode *inode, struct file *file);
 
 static int afs_file_readpage(struct file *file, struct page *page);
 static int afs_file_invalidatepage(struct page *page, unsigned long offset);
-static int afs_file_releasepage(struct page *page, int gfp_flags);
+static int afs_file_releasepage(struct page *page, gfp_t gfp_flags);
 
 static ssize_t afs_file_write(struct file *file, const char __user *buf,
 			      size_t size, loff_t *off);
@@ -279,7 +279,7 @@ static int afs_file_invalidatepage(struct page *page, unsigned long offset)
 /*
  * release a page and cleanup its private data
  */
-static int afs_file_releasepage(struct page *page, int gfp_flags)
+static int afs_file_releasepage(struct page *page, gfp_t gfp_flags)
 {
 	struct cachefs_page *pageio;
 
@@ -291,8 +291,8 @@ static int afs_file_releasepage(struct page *page, int gfp_flags)
 	cachefs_uncache_page(vnode->cache, page);
 #endif
 
-	pageio = (struct cachefs_page *) page->private;
-	page->private = 0;
+	pageio = (struct cachefs_page *) page_private(page);
+	set_page_private(page, 0);
 	ClearPagePrivate(page);
 
 	if (pageio)
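The page->private conversions in this merge (here, and in fs/buffer.c below) replace direct field access with the page_private()/set_page_private() accessors, so the field's representation can later change without touching every filesystem. A minimal sketch of the accessor pattern, assuming the definitions that accompanied this change in <linux/mm.h>:

#define page_private(page)		((page)->private)
#define set_page_private(page, v)	((page)->private = (v))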
diff --git a/fs/attr.c b/fs/attr.c
index b1796fb9e524..67bcd9b14ea5 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -117,9 +117,6 @@ int notify_change(struct dentry * dentry, struct iattr * attr)
 	struct timespec now;
 	unsigned int ia_valid = attr->ia_valid;
 
-	if (!inode)
-		BUG();
-
 	mode = inode->i_mode;
 	now = current_fs_time(inode->i_sb);
 
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index dd9baabaf016..72011826f0cb 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -318,7 +318,6 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	current->mm->free_area_cache = current->mm->mmap_base;
 	current->mm->cached_hole_size = 0;
 
-	set_mm_counter(current->mm, rss, 0);
 	current->mm->mmap = NULL;
 	compute_creds(bprm);
 	current->flags &= ~PF_FORKNOEXEC;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index d4b15576e584..6fa6adc40972 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -773,7 +773,6 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 
 	/* Do this so that we can load the interpreter, if need be.  We will
 	   change some of these later */
-	set_mm_counter(current->mm, rss, 0);
 	current->mm->free_area_cache = current->mm->mmap_base;
 	current->mm->cached_hole_size = 0;
 	retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP),
@@ -1503,9 +1502,7 @@ static int elf_core_dump(long signr, struct pt_regs * regs, struct file * file)
 	fill_psinfo(psinfo, current->group_leader, current->mm);
 	fill_note(notes +1, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo);
 
-	fill_note(notes +2, "CORE", NT_TASKSTRUCT, sizeof(*current), current);
-
-	numnote = 3;
+	numnote = 2;
 
 	auxv = (elf_addr_t *) current->mm->saved_auxv;
 
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 134c9c0d1f54..dda87c4c82a3 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -294,14 +294,7 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, struct pt_regs *regs
 					    &interp_params,
 					    &current->mm->start_stack,
 					    &current->mm->start_brk);
-#endif
-
-	/* do this so that we can load the interpreter, if need be
-	 * - we will change some of these later
-	 */
-	set_mm_counter(current->mm, rss, 0);
 
-#ifdef CONFIG_MMU
 	retval = setup_arg_pages(bprm, current->mm->start_stack, executable_stack);
 	if (retval < 0) {
 		send_sig(SIGKILL, current, 0);
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 7974efa107bc..9d6625829b99 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -650,7 +650,6 @@ static int load_flat_file(struct linux_binprm * bprm,
 		current->mm->start_brk = datapos + data_len + bss_len;
 		current->mm->brk = (current->mm->start_brk + 3) & ~3;
 		current->mm->context.end_brk = memp + ksize((void *) memp) - stack_len;
-		set_mm_counter(current->mm, rss, 0);
 	}
 
 	if (flags & FLAT_FLAG_KTRACE)
diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c
index 227a2682d2bf..00a91dc25d16 100644
--- a/fs/binfmt_som.c
+++ b/fs/binfmt_som.c
@@ -259,7 +259,6 @@ load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	create_som_tables(bprm);
 
 	current->mm->start_stack = bprm->p;
-	set_mm_counter(current->mm, rss, 0);
 
 #if 0
 	printk("(start_brk) %08lx\n" , (unsigned long) current->mm->start_brk);
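The set_mm_counter(..., rss, 0) deletions across the binfmt loaders (aout, elf, elf_fdpic, flat, som) track a rework of mm accounting: the single rss counter was split (the fs/exec.c hunk below now bumps anon_rss), and zeroing moved into common mm setup, so each loader no longer resets it by hand. The counter macros follow roughly this shape (a sketch, not the exact 2.6.15 definitions):

/* Counters live as _##member fields on mm_struct and are zeroed once
 * when the mm is created, not in every binary loader. */
#define set_mm_counter(mm, member, value)	((mm)->_##member = (value))
#define get_mm_counter(mm, member)		((mm)->_##member)
#define inc_mm_counter(mm, member)		((mm)->_##member++)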
diff --git a/fs/bio.c b/fs/bio.c
index 7d81a93afd48..460554b07ff9 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -778,7 +778,7 @@ static int bio_map_kern_endio(struct bio *bio, unsigned int bytes_done, int err)
 
 
 static struct bio *__bio_map_kern(request_queue_t *q, void *data,
-				  unsigned int len, unsigned int gfp_mask)
+				  unsigned int len, gfp_t gfp_mask)
 {
 	unsigned long kaddr = (unsigned long)data;
 	unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
@@ -825,7 +825,7 @@ static struct bio *__bio_map_kern(request_queue_t *q, void *data,
  *	device. Returns an error pointer in case of error.
  */
 struct bio *bio_map_kern(request_queue_t *q, void *data, unsigned int len,
-			 unsigned int gfp_mask)
+			 gfp_t gfp_mask)
 {
 	struct bio *bio;
 
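Many prototypes in this merge (afs, bio, buffer, dcache, dquot, ext3) change gfp-mask parameters from plain int/unsigned int to gfp_t. The point is type checking: gfp_t is an annotated typedef that sparse can police while gcc sees an ordinary integer. Roughly (a sketch; the exact annotation in this era was __nocast, later __bitwise):

/* Under sparse (__CHECKER__) the annotation forbids silently mixing
 * gfp flags with plain integers; under gcc it compiles away. */
#ifdef __CHECKER__
#define __nocast __attribute__((nocast))
#else
#define __nocast
#endif

typedef unsigned int __nocast gfp_t;

/* New-style prototype, matching the bio.c hunks above: */
struct request_queue;
struct bio;
struct bio *bio_map_kern_sketch(struct request_queue *q, void *data,
				unsigned int len, gfp_t gfp_mask);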
diff --git a/fs/buffer.c b/fs/buffer.c
index 1216c0d3c8ce..35fa34977e81 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -96,7 +96,7 @@ static void
 __clear_page_buffers(struct page *page)
 {
 	ClearPagePrivate(page);
-	page->private = 0;
+	set_page_private(page, 0);
 	page_cache_release(page);
 }
 
@@ -502,7 +502,7 @@ static void free_more_memory(void)
 	yield();
 
 	for_each_pgdat(pgdat) {
-		zones = pgdat->node_zonelists[GFP_NOFS&GFP_ZONEMASK].zones;
+		zones = pgdat->node_zonelists[gfp_zone(GFP_NOFS)].zones;
 		if (*zones)
 			try_to_free_pages(zones, GFP_NOFS);
 	}
@@ -1478,8 +1478,10 @@ EXPORT_SYMBOL(__getblk);
 void __breadahead(struct block_device *bdev, sector_t block, int size)
 {
 	struct buffer_head *bh = __getblk(bdev, block, size);
-	ll_rw_block(READA, 1, &bh);
-	brelse(bh);
+	if (likely(bh)) {
+		ll_rw_block(READA, 1, &bh);
+		brelse(bh);
+	}
 }
 EXPORT_SYMBOL(__breadahead);
 
@@ -1497,7 +1499,7 @@ __bread(struct block_device *bdev, sector_t block, int size)
 {
 	struct buffer_head *bh = __getblk(bdev, block, size);
 
-	if (!buffer_uptodate(bh))
+	if (likely(bh) && !buffer_uptodate(bh))
 		bh = __bread_slow(bh);
 	return bh;
 }
@@ -1571,7 +1573,7 @@ static inline void discard_buffer(struct buffer_head * bh)
  *
  * NOTE: @gfp_mask may go away, and this function may become non-blocking.
  */
-int try_to_release_page(struct page *page, int gfp_mask)
+int try_to_release_page(struct page *page, gfp_t gfp_mask)
 {
 	struct address_space * const mapping = page->mapping;
 
@@ -1637,6 +1639,15 @@ out:
 }
 EXPORT_SYMBOL(block_invalidatepage);
 
+int do_invalidatepage(struct page *page, unsigned long offset)
+{
+	int (*invalidatepage)(struct page *, unsigned long);
+	invalidatepage = page->mapping->a_ops->invalidatepage;
+	if (invalidatepage == NULL)
+		invalidatepage = block_invalidatepage;
+	return (*invalidatepage)(page, offset);
+}
+
 /*
  * We attach and possibly dirty the buffers atomically wrt
  * __set_page_dirty_buffers() via private_lock.  try_to_free_buffers
@@ -2696,7 +2707,7 @@ int block_write_full_page(struct page *page, get_block_t *get_block,
 		 * they may have been added in ext3_writepage().  Make them
 		 * freeable here, so the page does not leak.
 		 */
-		block_invalidatepage(page, 0);
+		do_invalidatepage(page, 0);
 		unlock_page(page);
 		return 0; /* don't care */
 	}
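The new do_invalidatepage() above centralizes a dispatch that callers previously open-coded: use the mapping's own ->invalidatepage hook if the filesystem provides one, otherwise fall back to block_invalidatepage(). The same dispatch shape in a self-contained sketch (mock types, not the kernel's):

struct page_stub;			/* stand-in for struct page */

struct aops_stub {
	int (*invalidatepage)(struct page_stub *, unsigned long);
};

static int block_invalidatepage_stub(struct page_stub *page, unsigned long off)
{
	(void)page; (void)off;
	return 0;			/* generic buffer-based fallback */
}

/* Prefer the mapping's own hook; fall back to the generic routine. */
static int do_invalidatepage_stub(const struct aops_stub *a,
				  struct page_stub *page, unsigned long offset)
{
	int (*fn)(struct page_stub *, unsigned long) = a->invalidatepage;

	if (fn == NULL)
		fn = block_invalidatepage_stub;
	return fn(page, offset);
}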
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index 3d1cce3653b8..6a3df88accfe 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -370,8 +370,8 @@ static int init_coda_psdev(void)
 	}
 	devfs_mk_dir ("coda");
 	for (i = 0; i < MAX_CODADEVS; i++) {
-		class_device_create(coda_psdev_class, MKDEV(CODA_PSDEV_MAJOR,i),
-				NULL, "cfs%d", i);
+		class_device_create(coda_psdev_class, NULL,
+				MKDEV(CODA_PSDEV_MAJOR,i), NULL, "cfs%d", i);
 		err = devfs_mk_cdev(MKDEV(CODA_PSDEV_MAJOR, i),
 				S_IFCHR|S_IRUSR|S_IWUSR, "coda/%d", i);
 		if (err)
diff --git a/fs/compat.c b/fs/compat.c
index a719e158e002..8e71cdbecc7c 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1490,7 +1490,6 @@ int compat_do_execve(char * filename,
 		/* execve success */
 		security_bprm_free(bprm);
 		acct_update_integrals(current);
-		update_mem_hiwater(current);
 		kfree(bprm);
 		return retval;
 	}
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index e28a74203f3b..43dbcb0b21eb 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -3046,10 +3046,15 @@ HANDLE_IOCTL(RAW_GETBIND, raw_ioctl)
 /* Serial */
 HANDLE_IOCTL(TIOCGSERIAL, serial_struct_ioctl)
 HANDLE_IOCTL(TIOCSSERIAL, serial_struct_ioctl)
+#ifdef TIOCGLTC
+COMPATIBLE_IOCTL(TIOCGLTC)
+COMPATIBLE_IOCTL(TIOCSLTC)
+#endif
 /* Usbdevfs */
 HANDLE_IOCTL(USBDEVFS_CONTROL32, do_usbdevfs_control)
 HANDLE_IOCTL(USBDEVFS_BULK32, do_usbdevfs_bulk)
 HANDLE_IOCTL(USBDEVFS_DISCSIGNAL32, do_usbdevfs_discsignal)
+COMPATIBLE_IOCTL(USBDEVFS_IOCTL32)
 /* i2c */
 HANDLE_IOCTL(I2C_FUNCS, w_long)
 HANDLE_IOCTL(I2C_RDWR, do_i2c_rdwr_ioctl)
diff --git a/fs/dcache.c b/fs/dcache.c
index fb10386c59be..e90512ed35a4 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -689,7 +689,7 @@ void shrink_dcache_anon(struct hlist_head *head)
  *
  * In this case we return -1 to tell the caller that we baled.
  */
-static int shrink_dcache_memory(int nr, unsigned int gfp_mask)
+static int shrink_dcache_memory(int nr, gfp_t gfp_mask)
 {
 	if (nr) {
 		if (!(gfp_mask & __GFP_FS))
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 0d06097bc995..3931e7f1e6bf 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -162,6 +162,7 @@ static int dio_refill_pages(struct dio *dio)
 	up_read(&current->mm->mmap_sem);
 
 	if (ret < 0 && dio->blocks_available && (dio->rw == WRITE)) {
+		struct page *page = ZERO_PAGE(dio->curr_user_address);
 		/*
 		 * A memory fault, but the filesystem has some outstanding
 		 * mapped blocks.  We need to use those blocks up to avoid
@@ -169,7 +170,8 @@ static int dio_refill_pages(struct dio *dio)
 		 */
 		if (dio->page_errors == 0)
 			dio->page_errors = ret;
-		dio->pages[0] = ZERO_PAGE(dio->curr_user_address);
+		page_cache_get(page);
+		dio->pages[0] = page;
 		dio->head = 0;
 		dio->tail = 1;
 		ret = 0;
diff --git a/fs/dquot.c b/fs/dquot.c
index b9732335bcdc..ea7644227a65 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -500,7 +500,7 @@ static void prune_dqcache(int count)
  * more memory
  */
 
-static int shrink_dqcache_memory(int nr, unsigned int gfp_mask)
+static int shrink_dqcache_memory(int nr, gfp_t gfp_mask)
 {
 	if (nr) {
 		spin_lock(&dq_list_lock);
@@ -662,7 +662,7 @@ static void add_dquot_ref(struct super_block *sb, int type)
 restart:
 	file_list_lock();
 	list_for_each(p, &sb->s_files) {
-		struct file *filp = list_entry(p, struct file, f_list);
+		struct file *filp = list_entry(p, struct file, f_u.fu_list);
 		struct inode *inode = filp->f_dentry->d_inode;
 		if (filp->f_mode & FMODE_WRITE && dqinit_needed(inode, type)) {
 			struct dentry *dentry = dget(filp->f_dentry);
diff --git a/fs/exec.c b/fs/exec.c
index a04a575ad433..10d493fea7ce 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -126,8 +126,7 @@ asmlinkage long sys_uselib(const char __user * library)
 	struct nameidata nd;
 	int error;
 
-	nd.intent.open.flags = FMODE_READ;
-	error = __user_walk(library, LOOKUP_FOLLOW|LOOKUP_OPEN, &nd);
+	error = __user_path_lookup_open(library, LOOKUP_FOLLOW, &nd, FMODE_READ);
 	if (error)
 		goto out;
 
@@ -139,7 +138,7 @@ asmlinkage long sys_uselib(const char __user * library)
 	if (error)
 		goto exit;
 
-	file = dentry_open(nd.dentry, nd.mnt, O_RDONLY);
+	file = nameidata_to_filp(&nd, O_RDONLY);
 	error = PTR_ERR(file);
 	if (IS_ERR(file))
 		goto out;
@@ -167,6 +166,7 @@ asmlinkage long sys_uselib(const char __user * library)
 out:
 	return error;
 exit:
+	release_open_intent(&nd);
 	path_release(&nd);
 	goto out;
 }
@@ -309,40 +309,36 @@ void install_arg_page(struct vm_area_struct *vma,
 	pud_t * pud;
 	pmd_t * pmd;
 	pte_t * pte;
+	spinlock_t *ptl;
 
 	if (unlikely(anon_vma_prepare(vma)))
-		goto out_sig;
+		goto out;
 
 	flush_dcache_page(page);
 	pgd = pgd_offset(mm, address);
-
-	spin_lock(&mm->page_table_lock);
 	pud = pud_alloc(mm, pgd, address);
 	if (!pud)
 		goto out;
 	pmd = pmd_alloc(mm, pud, address);
 	if (!pmd)
 		goto out;
-	pte = pte_alloc_map(mm, pmd, address);
+	pte = pte_alloc_map_lock(mm, pmd, address, &ptl);
 	if (!pte)
 		goto out;
 	if (!pte_none(*pte)) {
-		pte_unmap(pte);
+		pte_unmap_unlock(pte, ptl);
 		goto out;
 	}
-	inc_mm_counter(mm, rss);
+	inc_mm_counter(mm, anon_rss);
 	lru_cache_add_active(page);
 	set_pte_at(mm, address, pte, pte_mkdirty(pte_mkwrite(mk_pte(
 					page, vma->vm_page_prot))));
 	page_add_anon_rmap(page, vma, address);
-	pte_unmap(pte);
-	spin_unlock(&mm->page_table_lock);
+	pte_unmap_unlock(pte, ptl);
 
 	/* no need for flush_tlb */
 	return;
 out:
-	spin_unlock(&mm->page_table_lock);
-out_sig:
 	__free_page(page);
 	force_sig(SIGKILL, current);
 }
@@ -490,8 +486,7 @@ struct file *open_exec(const char *name)
 	int err;
 	struct file *file;
 
-	nd.intent.open.flags = FMODE_READ;
-	err = path_lookup(name, LOOKUP_FOLLOW|LOOKUP_OPEN, &nd);
+	err = path_lookup_open(name, LOOKUP_FOLLOW, &nd, FMODE_READ);
 	file = ERR_PTR(err);
 
 	if (!err) {
@@ -504,7 +499,7 @@ struct file *open_exec(const char *name)
 			err = -EACCES;
 			file = ERR_PTR(err);
 			if (!err) {
-				file = dentry_open(nd.dentry, nd.mnt, O_RDONLY);
+				file = nameidata_to_filp(&nd, O_RDONLY);
 				if (!IS_ERR(file)) {
 					err = deny_write_access(file);
 					if (err) {
@@ -516,6 +511,7 @@ out:
 				return file;
 			}
 		}
+		release_open_intent(&nd);
 		path_release(&nd);
 	}
 	goto out;
@@ -634,10 +630,9 @@ static inline int de_thread(struct task_struct *tsk)
 	/*
 	 * Account for the thread group leader hanging around:
 	 */
-	count = 2;
-	if (thread_group_leader(current))
-		count = 1;
-	else {
+	count = 1;
+	if (!thread_group_leader(current)) {
+		count = 2;
 		/*
 		 * The SIGALRM timer survives the exec, but needs to point
 		 * at us as the new group leader now. We have a race with
@@ -646,8 +641,10 @@ static inline int de_thread(struct task_struct *tsk)
 		 * before we can safely let the old group leader die.
 		 */
 		sig->real_timer.data = (unsigned long)current;
+		spin_unlock_irq(lock);
 		if (del_timer_sync(&sig->real_timer))
 			add_timer(&sig->real_timer);
+		spin_lock_irq(lock);
 	}
 	while (atomic_read(&sig->count) > count) {
 		sig->group_exit_task = current;
@@ -659,7 +656,6 @@ static inline int de_thread(struct task_struct *tsk)
 	}
 	sig->group_exit_task = NULL;
 	sig->notify_count = 0;
-	sig->real_timer.data = (unsigned long)current;
 	spin_unlock_irq(lock);
 
 	/*
@@ -1207,7 +1203,6 @@ int do_execve(char * filename,
 		/* execve success */
 		security_bprm_free(bprm);
 		acct_update_integrals(current);
-		update_mem_hiwater(current);
 		kfree(bprm);
 		return retval;
 	}
@@ -1422,19 +1417,16 @@ static void zap_threads (struct mm_struct *mm)
 static void coredump_wait(struct mm_struct *mm)
 {
 	DECLARE_COMPLETION(startup_done);
+	int core_waiters;
 
-	mm->core_waiters++; /* let other threads block */
 	mm->core_startup_done = &startup_done;
 
-	/* give other threads a chance to run: */
-	yield();
-
 	zap_threads(mm);
-	if (--mm->core_waiters) {
+	core_waiters = mm->core_waiters;
 	up_write(&mm->mmap_sem);
+
+	if (core_waiters)
 		wait_for_completion(&startup_done);
-	} else
-		up_write(&mm->mmap_sem);
 	BUG_ON(mm->core_waiters);
 }
 
@@ -1468,11 +1460,21 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
 		current->fsuid = 0;	/* Dump root private */
 	}
 	mm->dumpable = 0;
-	init_completion(&mm->core_done);
+
+	retval = -EAGAIN;
 	spin_lock_irq(&current->sighand->siglock);
-	current->signal->flags = SIGNAL_GROUP_EXIT;
-	current->signal->group_exit_code = exit_code;
+	if (!(current->signal->flags & SIGNAL_GROUP_EXIT)) {
+		current->signal->flags = SIGNAL_GROUP_EXIT;
+		current->signal->group_exit_code = exit_code;
+		retval = 0;
+	}
 	spin_unlock_irq(&current->sighand->siglock);
+	if (retval) {
+		up_write(&mm->mmap_sem);
+		goto fail;
+	}
+
+	init_completion(&mm->core_done);
 	coredump_wait(mm);
 
 	/*
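The do_coredump() hunk above turns an unconditional flag assignment into a test-and-set under the siglock: only the first thread to take the lock claims the group exit; everyone else backs out with -EAGAIN. The same pattern in a self-contained userspace sketch (pthreads standing in for the siglock; names are illustrative, not the kernel's):

#include <errno.h>
#include <pthread.h>

static pthread_mutex_t siglock = PTHREAD_MUTEX_INITIALIZER;
static int group_exit_set;	/* stand-in for SIGNAL_GROUP_EXIT */
static int group_exit_code;

/* First caller claims the group exit; every later caller sees -EAGAIN. */
static int claim_group_exit(int exit_code)
{
	int retval = -EAGAIN;

	pthread_mutex_lock(&siglock);
	if (!group_exit_set) {
		group_exit_set = 1;
		group_exit_code = exit_code;
		retval = 0;
	}
	pthread_mutex_unlock(&siglock);
	return retval;
}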
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index fdba4d1d3c60..e7d3f0522d01 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -440,6 +440,10 @@ static int ext2_alloc_branch(struct inode *inode,
 		 * the pointer to new one, then send parent to disk.
 		 */
 		bh = sb_getblk(inode->i_sb, parent);
+		if (!bh) {
+			err = -EIO;
+			break;
+		}
 		lock_buffer(bh);
 		memset(bh->b_data, 0, blocksize);
 		branch[n].bh = bh;
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 0213db4911a2..7992d21e0e09 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -20,6 +20,8 @@
 #include <linux/quotaops.h>
 #include <linux/buffer_head.h>
 
+#include "bitmap.h"
+
 /*
  * balloc.c contains the blocks allocation and deallocation routines
  */
@@ -1010,7 +1012,7 @@ retry:
  * allocation within the reservation window.
  *
  * This will avoid keeping on searching the reservation list again and
- * again when someboday is looking for a free block (without
+ * again when somebody is looking for a free block (without
  * reservation), and there are lots of free blocks, but they are all
  * being reserved.
  *
@@ -1416,12 +1418,12 @@ unsigned long ext3_count_free_blocks(struct super_block *sb)
 	unsigned long bitmap_count, x;
 	struct buffer_head *bitmap_bh = NULL;
 
-	lock_super(sb);
 	es = EXT3_SB(sb)->s_es;
 	desc_count = 0;
 	bitmap_count = 0;
 	gdp = NULL;
 
+	smp_rmb();
 	for (i = 0; i < ngroups; i++) {
 		gdp = ext3_get_group_desc(sb, i, NULL);
 		if (!gdp)
@@ -1440,7 +1442,6 @@ unsigned long ext3_count_free_blocks(struct super_block *sb)
 	brelse(bitmap_bh);
 	printk("ext3_count_free_blocks: stored = %u, computed = %lu, %lu\n",
 	       le32_to_cpu(es->s_free_blocks_count), desc_count, bitmap_count);
-	unlock_super(sb);
 	return bitmap_count;
 #else
 	desc_count = 0;
diff --git a/fs/ext3/bitmap.c b/fs/ext3/bitmap.c
index 6c419b9ab0e8..5b4ba3e246e6 100644
--- a/fs/ext3/bitmap.c
+++ b/fs/ext3/bitmap.c
@@ -8,7 +8,7 @@
  */
 
 #include <linux/buffer_head.h>
-
+#include "bitmap.h"
 
 static int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0};
 
diff --git a/fs/ext3/bitmap.h b/fs/ext3/bitmap.h
new file mode 100644
index 000000000000..6ee503a6bb4e
--- /dev/null
+++ b/fs/ext3/bitmap.h
@@ -0,0 +1,8 @@
+/* linux/fs/ext3/bitmap.c
+ *
+ * Copyright (C) 2005 Simtec Electronics
+ *	Ben Dooks <ben@simtec.co.uk>
+ *
+*/
+
+extern unsigned long ext3_count_free (struct buffer_head *, unsigned int );
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index 6549945f9ac1..df3f517c54ac 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -26,6 +26,7 @@
 
 #include <asm/byteorder.h>
 
+#include "bitmap.h"
 #include "xattr.h"
 #include "acl.h"
 
@@ -704,7 +705,6 @@ unsigned long ext3_count_free_inodes (struct super_block * sb)
 	unsigned long bitmap_count, x;
 	struct buffer_head *bitmap_bh = NULL;
 
-	lock_super (sb);
 	es = EXT3_SB(sb)->s_es;
 	desc_count = 0;
 	bitmap_count = 0;
@@ -727,7 +727,6 @@ unsigned long ext3_count_free_inodes (struct super_block * sb)
 	brelse(bitmap_bh);
 	printk("ext3_count_free_inodes: stored = %u, computed = %lu, %lu\n",
 		le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count);
-	unlock_super(sb);
 	return desc_count;
 #else
 	desc_count = 0;
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index b5177c90d6f1..5d9b00e28837 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -491,7 +491,7 @@ static unsigned long ext3_find_goal(struct inode *inode, long block,
  * the same format as ext3_get_branch() would do. We are calling it after
  * we had read the existing part of chain and partial points to the last
  * triple of that (one with zero ->key). Upon the exit we have the same
- * picture as after the successful ext3_get_block(), excpet that in one
+ * picture as after the successful ext3_get_block(), except that in one
  * place chain is disconnected - *branch->p is still zero (we did not
  * set the last link), but branch->key contains the number that should
  * be placed into *branch->p to fill that gap.
@@ -523,7 +523,6 @@ static int ext3_alloc_branch(handle_t *handle, struct inode *inode,
 		if (!nr)
 			break;
 		branch[n].key = cpu_to_le32(nr);
-		keys = n+1;
 
 		/*
 		 * Get buffer_head for parent block, zero it out
@@ -531,6 +530,9 @@ static int ext3_alloc_branch(handle_t *handle, struct inode *inode,
 		 * parent to disk.
 		 */
 		bh = sb_getblk(inode->i_sb, parent);
+		if (!bh)
+			break;
+		keys = n+1;
 		branch[n].bh = bh;
 		lock_buffer(bh);
 		BUFFER_TRACE(bh, "call get_create_access");
@@ -864,6 +866,10 @@ struct buffer_head *ext3_getblk(handle_t *handle, struct inode * inode,
 	if (!*errp && buffer_mapped(&dummy)) {
 		struct buffer_head *bh;
 		bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
+		if (!bh) {
+			*errp = -EIO;
+			goto err;
+		}
 		if (buffer_new(&dummy)) {
 			J_ASSERT(create != 0);
 			J_ASSERT(handle != 0);
@@ -896,6 +902,7 @@ struct buffer_head *ext3_getblk(handle_t *handle, struct inode * inode,
 		}
 		return bh;
 	}
+err:
 	return NULL;
 }
 
@@ -1434,7 +1441,7 @@ static int ext3_invalidatepage(struct page *page, unsigned long offset)
 	return journal_invalidatepage(journal, page, offset);
 }
 
-static int ext3_releasepage(struct page *page, int wait)
+static int ext3_releasepage(struct page *page, gfp_t wait)
 {
 	journal_t *journal = EXT3_JOURNAL(page->mapping->host);
 
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 50378d8ff84b..b3c690a3b54a 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -36,6 +36,8 @@
 #include <linux/quotaops.h>
 #include <linux/buffer_head.h>
 #include <linux/smp_lock.h>
+
+#include "namei.h"
 #include "xattr.h"
 #include "acl.h"
 
diff --git a/fs/ext3/namei.h b/fs/ext3/namei.h
new file mode 100644
index 000000000000..f2ce2b0065c9
--- /dev/null
+++ b/fs/ext3/namei.h
@@ -0,0 +1,8 @@
+/*  linux/fs/ext3/namei.h
+ *
+ * Copyright (C) 2005 Simtec Electronics
+ *	Ben Dooks <ben@simtec.co.uk>
+ *
+*/
+
+extern struct dentry *ext3_get_parent(struct dentry *child);
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index 57f79106267d..1be78b4b4de9 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -118,6 +118,8 @@ static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,
 	int err;
 
 	bh = sb_getblk(sb, blk);
+	if (!bh)
+		return ERR_PTR(-EIO);
 	if ((err = ext3_journal_get_write_access(handle, bh))) {
 		brelse(bh);
 		bh = ERR_PTR(err);
@@ -202,6 +204,10 @@ static int setup_new_group_blocks(struct super_block *sb,
 		ext3_debug("update backup group %#04lx (+%d)\n", block, bit);
 
 		gdb = sb_getblk(sb, block);
+		if (!gdb) {
+			err = -EIO;
+			goto exit_bh;
+		}
 		if ((err = ext3_journal_get_write_access(handle, gdb))) {
 			brelse(gdb);
 			goto exit_bh;
@@ -643,6 +649,10 @@ static void update_backups(struct super_block *sb,
 			break;
 
 		bh = sb_getblk(sb, group * bpg + blk_off);
+		if (!bh) {
+			err = -EIO;
+			break;
+		}
 		ext3_debug("update metadata backup %#04lx\n",
 			   (unsigned long)bh->b_blocknr);
 		if ((err = ext3_journal_get_write_access(handle, bh)))
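This is the third file in the merge (after fs/ext2/inode.c and fs/ext3/inode.c) to gain the same hardening: sb_getblk() may return NULL under memory pressure, and each call site now fails with -EIO instead of dereferencing a null buffer head. Reduced to its skeleton, with stub types so the sketch stands alone (not any one hunk verbatim):

#define EIO 5

struct buffer_head { int dummy; };
struct super_block { int dummy; };

/* Stand-in for the real sb_getblk(): may return NULL when allocation fails. */
static struct buffer_head *sb_getblk_stub(struct super_block *sb, unsigned long blk)
{
	(void)sb; (void)blk;
	return 0;		/* simulate allocation failure */
}

static int touch_block(struct super_block *sb, unsigned long blk)
{
	struct buffer_head *bh = sb_getblk_stub(sb, blk);

	if (!bh)
		return -EIO;	/* fail soft instead of dereferencing NULL */
	/* ... lock, modify, and release the buffer here ... */
	return 0;
}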
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 9e24ceb019fe..f594989ccb7a 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -36,9 +36,12 @@
 #include <linux/namei.h>
 #include <linux/quotaops.h>
 #include <linux/seq_file.h>
+
 #include <asm/uaccess.h>
+
 #include "xattr.h"
 #include "acl.h"
+#include "namei.h"
 
 static int ext3_load_journal(struct super_block *, struct ext3_super_block *);
 static int ext3_create_journal(struct super_block *, struct ext3_super_block *,
@@ -510,19 +513,11 @@ static void ext3_clear_inode(struct inode *inode)
 	kfree(rsv);
 }
 
-static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs)
+static inline void ext3_show_quota_options(struct seq_file *seq, struct super_block *sb)
 {
-	struct super_block *sb = vfs->mnt_sb;
+#if defined(CONFIG_QUOTA)
 	struct ext3_sb_info *sbi = EXT3_SB(sb);
 
-	if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA)
-		seq_puts(seq, ",data=journal");
-	else if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA)
-		seq_puts(seq, ",data=ordered");
-	else if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA)
-		seq_puts(seq, ",data=writeback");
-
-#if defined(CONFIG_QUOTA)
 	if (sbi->s_jquota_fmt)
 		seq_printf(seq, ",jqfmt=%s",
 			(sbi->s_jquota_fmt == QFMT_VFS_OLD) ? "vfsold": "vfsv0");
@@ -539,6 +534,20 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs)
 	if (sbi->s_mount_opt & EXT3_MOUNT_GRPQUOTA)
 		seq_puts(seq, ",grpquota");
 #endif
+}
+
+static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs)
+{
+	struct super_block *sb = vfs->mnt_sb;
+
+	if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA)
+		seq_puts(seq, ",data=journal");
+	else if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA)
+		seq_puts(seq, ",data=ordered");
+	else if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA)
+		seq_puts(seq, ",data=writeback");
+
+	ext3_show_quota_options(seq, sb);
 
 	return 0;
 }
@@ -609,7 +618,6 @@ static struct super_operations ext3_sops = {
 #endif
 };
 
-struct dentry *ext3_get_parent(struct dentry *child);
 static struct export_operations ext3_export_ops = {
 	.get_parent = ext3_get_parent,
 };
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index 269c7b92db9a..430de9f63be3 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -210,7 +210,7 @@ ext3_xattr_find_entry(struct ext3_xattr_entry **pentry, int name_index,
 	return cmp ? -ENODATA : 0;
 }
 
-int
+static int
 ext3_xattr_block_get(struct inode *inode, int name_index, const char *name,
 		     void *buffer, size_t buffer_size)
 {
@@ -354,7 +354,7 @@ ext3_xattr_list_entries(struct inode *inode, struct ext3_xattr_entry *entry,
 	return buffer_size - rest;
 }
 
-int
+static int
 ext3_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size)
 {
 	struct buffer_head *bh = NULL;
@@ -626,7 +626,7 @@ struct ext3_xattr_block_find {
 	struct buffer_head *bh;
 };
 
-int
+static int
 ext3_xattr_block_find(struct inode *inode, struct ext3_xattr_info *i,
 		      struct ext3_xattr_block_find *bs)
 {
@@ -859,7 +859,7 @@ struct ext3_xattr_ibody_find {
 	struct ext3_iloc iloc;
 };
 
-int
+static int
 ext3_xattr_ibody_find(struct inode *inode, struct ext3_xattr_info *i,
 		      struct ext3_xattr_ibody_find *is)
 {
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 895049b2ac9c..ba824964b9bb 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -222,6 +222,80 @@ fat_shortname2uni(struct nls_table *nls, unsigned char *buf, int buf_size,
 	return len;
 }
 
+enum { PARSE_INVALID = 1, PARSE_NOT_LONGNAME, PARSE_EOF, };
+
+/**
+ * fat_parse_long - Parse extended directory entry.
+ *
+ * This function returns zero on success, negative value on error, or one of
+ * the following:
+ *
+ * %PARSE_INVALID - Directory entry is invalid.
+ * %PARSE_NOT_LONGNAME - Directory entry does not contain longname.
+ * %PARSE_EOF - Directory has no more entries.
+ */
+static int fat_parse_long(struct inode *dir, loff_t *pos,
+			  struct buffer_head **bh, struct msdos_dir_entry **de,
+			  wchar_t **unicode, unsigned char *nr_slots)
+{
+	struct msdos_dir_slot *ds;
+	unsigned char id, slot, slots, alias_checksum;
+
+	if (!*unicode) {
+		*unicode = (wchar_t *)__get_free_page(GFP_KERNEL);
+		if (!*unicode) {
+			brelse(*bh);
+			return -ENOMEM;
+		}
+	}
+parse_long:
+	slots = 0;
+	ds = (struct msdos_dir_slot *)*de;
+	id = ds->id;
+	if (!(id & 0x40))
+		return PARSE_INVALID;
+	slots = id & ~0x40;
+	if (slots > 20 || !slots)	/* ceil(256 * 2 / 26) */
+		return PARSE_INVALID;
+	*nr_slots = slots;
+	alias_checksum = ds->alias_checksum;
+
+	slot = slots;
+	while (1) {
+		int offset;
+
+		slot--;
+		offset = slot * 13;
+		fat16_towchar(*unicode + offset, ds->name0_4, 5);
+		fat16_towchar(*unicode + offset + 5, ds->name5_10, 6);
+		fat16_towchar(*unicode + offset + 11, ds->name11_12, 2);
+
+		if (ds->id & 0x40)
+			(*unicode)[offset + 13] = 0;
+		if (fat_get_entry(dir, pos, bh, de) < 0)
+			return PARSE_EOF;
+		if (slot == 0)
+			break;
+		ds = (struct msdos_dir_slot *)*de;
+		if (ds->attr != ATTR_EXT)
+			return PARSE_NOT_LONGNAME;
+		if ((ds->id & ~0x40) != slot)
+			goto parse_long;
+		if (ds->alias_checksum != alias_checksum)
+			goto parse_long;
+	}
+	if ((*de)->name[0] == DELETED_FLAG)
+		return PARSE_INVALID;
+	if ((*de)->attr == ATTR_EXT)
+		goto parse_long;
+	if (IS_FREE((*de)->name) || ((*de)->attr & ATTR_VOLUME))
+		return PARSE_INVALID;
+	if (fat_checksum((*de)->name) != alias_checksum)
+		*nr_slots = 0;
+
+	return 0;
+}
+
 /*
  * Return values: negative -> error, 0 -> not found, positive -> found,
  * value is the total amount of slots, including the shortname entry.
@@ -259,68 +333,16 @@ parse_record:
 		if (de->attr != ATTR_EXT && IS_FREE(de->name))
 			continue;
 		if (de->attr == ATTR_EXT) {
-			struct msdos_dir_slot *ds;
-			unsigned char id;
-			unsigned char slot;
-			unsigned char slots;
-			unsigned char sum;
-			unsigned char alias_checksum;
-
-			if (!unicode) {
-				unicode = (wchar_t *)
-					__get_free_page(GFP_KERNEL);
-				if (!unicode) {
-					brelse(bh);
-					return -ENOMEM;
-				}
-			}
-parse_long:
-			slots = 0;
-			ds = (struct msdos_dir_slot *) de;
-			id = ds->id;
-			if (!(id & 0x40))
-				continue;
-			slots = id & ~0x40;
-			if (slots > 20 || !slots)	/* ceil(256 * 2 / 26) */
-				continue;
-			nr_slots = slots;
-			alias_checksum = ds->alias_checksum;
-
-			slot = slots;
-			while (1) {
-				int offset;
-
-				slot--;
-				offset = slot * 13;
-				fat16_towchar(unicode + offset, ds->name0_4, 5);
-				fat16_towchar(unicode + offset + 5, ds->name5_10, 6);
-				fat16_towchar(unicode + offset + 11, ds->name11_12, 2);
-
-				if (ds->id & 0x40) {
-					unicode[offset + 13] = 0;
-				}
-				if (fat_get_entry(inode, &cpos, &bh, &de) < 0)
-					goto EODir;
-				if (slot == 0)
-					break;
-				ds = (struct msdos_dir_slot *) de;
-				if (ds->attr != ATTR_EXT)
-					goto parse_record;
-				if ((ds->id & ~0x40) != slot)
-					goto parse_long;
-				if (ds->alias_checksum != alias_checksum)
-					goto parse_long;
-			}
-			if (de->name[0] == DELETED_FLAG)
-				continue;
-			if (de->attr == ATTR_EXT)
-				goto parse_long;
-			if (IS_FREE(de->name) || (de->attr & ATTR_VOLUME))
+			int status = fat_parse_long(inode, &cpos, &bh, &de,
+						    &unicode, &nr_slots);
+			if (status < 0)
+				return status;
+			else if (status == PARSE_INVALID)
 				continue;
-			for (sum = 0, i = 0; i < 11; i++)
-				sum = (((sum&1)<<7)|((sum&0xfe)>>1)) + de->name[i];
-			if (sum != alias_checksum)
-				nr_slots = 0;
+			else if (status == PARSE_NOT_LONGNAME)
+				goto parse_record;
+			else if (status == PARSE_EOF)
+				goto EODir;
 		}
 
 		memcpy(work, de->name, sizeof(de->name));
@@ -408,8 +430,8 @@ struct fat_ioctl_filldir_callback {
 	int short_len;
 };
 
-static int fat_readdirx(struct inode *inode, struct file *filp, void *dirent,
-			filldir_t filldir, int short_only, int both)
+static int __fat_readdir(struct inode *inode, struct file *filp, void *dirent,
+			 filldir_t filldir, int short_only, int both)
 {
 	struct super_block *sb = inode->i_sb;
 	struct msdos_sb_info *sbi = MSDOS_SB(sb);
@@ -458,9 +480,10 @@ static int fat_readdirx(struct inode *inode, struct file *filp, void *dirent,
 
 	bh = NULL;
 GetNew:
-	long_slots = 0;
 	if (fat_get_entry(inode, &cpos, &bh, &de) == -1)
 		goto EODir;
+parse_record:
+	long_slots = 0;
 	/* Check for long filename entry */
 	if (isvfat) {
 		if (de->name[0] == DELETED_FLAG)
@@ -475,69 +498,18 @@ GetNew:
 	}
 
 	if (isvfat && de->attr == ATTR_EXT) {
-		struct msdos_dir_slot *ds;
-		unsigned char id;
-		unsigned char slot;
-		unsigned char slots;
-		unsigned char sum;
-		unsigned char alias_checksum;
-
-		if (!unicode) {
-			unicode = (wchar_t *)__get_free_page(GFP_KERNEL);
-			if (!unicode) {
-				filp->f_pos = cpos;
-				brelse(bh);
-				ret = -ENOMEM;
-				goto out;
-			}
-		}
-ParseLong:
-		slots = 0;
-		ds = (struct msdos_dir_slot *) de;
-		id = ds->id;
-		if (!(id & 0x40))
-			goto RecEnd;
-		slots = id & ~0x40;
-		if (slots > 20 || !slots)	/* ceil(256 * 2 / 26) */
+		int status = fat_parse_long(inode, &cpos, &bh, &de,
+					    &unicode, &long_slots);
+		if (status < 0) {
+			filp->f_pos = cpos;
+			ret = status;
+			goto out;
+		} else if (status == PARSE_INVALID)
 			goto RecEnd;
-		long_slots = slots;
-		alias_checksum = ds->alias_checksum;
-
-		slot = slots;
-		while (1) {
-			int offset;
-
-			slot--;
-			offset = slot * 13;
-			fat16_towchar(unicode + offset, ds->name0_4, 5);
-			fat16_towchar(unicode + offset + 5, ds->name5_10, 6);
-			fat16_towchar(unicode + offset + 11, ds->name11_12, 2);
-
-			if (ds->id & 0x40) {
-				unicode[offset + 13] = 0;
-			}
-			if (fat_get_entry(inode, &cpos, &bh, &de) == -1)
-				goto EODir;
-			if (slot == 0)
-				break;
-			ds = (struct msdos_dir_slot *) de;
-			if (ds->attr != ATTR_EXT)
-				goto RecEnd;	/* XXX */
-			if ((ds->id & ~0x40) != slot)
-				goto ParseLong;
-			if (ds->alias_checksum != alias_checksum)
-				goto ParseLong;
-		}
-		if (de->name[0] == DELETED_FLAG)
-			goto RecEnd;
-		if (de->attr == ATTR_EXT)
-			goto ParseLong;
-		if (IS_FREE(de->name) || (de->attr & ATTR_VOLUME))
-			goto RecEnd;
-		for (sum = 0, i = 0; i < 11; i++)
-			sum = (((sum&1)<<7)|((sum&0xfe)>>1)) + de->name[i];
-		if (sum != alias_checksum)
-			long_slots = 0;
+		else if (status == PARSE_NOT_LONGNAME)
+			goto parse_record;
+		else if (status == PARSE_EOF)
+			goto EODir;
 	}
 
 	if (sbi->options.dotsOK) {
@@ -671,7 +643,7 @@ out:
 static int fat_readdir(struct file *filp, void *dirent, filldir_t filldir)
 {
 	struct inode *inode = filp->f_dentry->d_inode;
-	return fat_readdirx(inode, filp, dirent, filldir, 0, 0);
+	return __fat_readdir(inode, filp, dirent, filldir, 0, 0);
 }
 
 static int fat_ioctl_filldir(void *__buf, const char *name, int name_len,
@@ -760,8 +732,8 @@ static int fat_dir_ioctl(struct inode * inode, struct file * filp,
 	down(&inode->i_sem);
 	ret = -ENOENT;
 	if (!IS_DEADDIR(inode)) {
-		ret = fat_readdirx(inode, filp, &buf, fat_ioctl_filldir,
-				   short_only, both);
+		ret = __fat_readdir(inode, filp, &buf, fat_ioctl_filldir,
+				    short_only, both);
 	}
 	up(&inode->i_sem);
 	if (ret >= 0)
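The slots > 20 bound in fat_parse_long() comes from VFAT's on-disk layout: each long-name slot stores 13 UTF-16 code units (5 + 6 + 2 in the name0_4/name5_10/name11_12 fields), and a long name is at most 255 characters, so ceil(255 / 13) = 20 slots suffice (the in-code comment writes the same bound as ceil(256 * 2 / 26)). A self-contained check of the arithmetic:

#include <assert.h>

enum {
	LFN_CHARS_PER_SLOT = 5 + 6 + 2,	/* name0_4 + name5_10 + name11_12 */
	LFN_MAX_CHARS      = 255,	/* longest VFAT long filename */
	LFN_MAX_SLOTS      = (LFN_MAX_CHARS + LFN_CHARS_PER_SLOT - 1)
			     / LFN_CHARS_PER_SLOT,	/* == 20 */
};

int main(void)
{
	assert(LFN_MAX_SLOTS == 20);	/* matches the slots > 20 bound */
	return 0;
}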
diff --git a/fs/file_table.c b/fs/file_table.c
index 86ec8ae985b4..4dc205546547 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -56,13 +56,13 @@ void filp_dtor(void * objp, struct kmem_cache_s *cachep, unsigned long dflags)
 
 static inline void file_free_rcu(struct rcu_head *head)
 {
-	struct file *f = container_of(head, struct file, f_rcuhead);
+	struct file *f = container_of(head, struct file, f_u.fu_rcuhead);
 	kmem_cache_free(filp_cachep, f);
 }
 
 static inline void file_free(struct file *f)
 {
-	call_rcu(&f->f_rcuhead, file_free_rcu);
+	call_rcu(&f->f_u.fu_rcuhead, file_free_rcu);
 }
 
 /* Find an unused file structure and return a pointer to it.
@@ -95,7 +95,7 @@ struct file *get_empty_filp(void)
 	f->f_gid = current->fsgid;
 	rwlock_init(&f->f_owner.lock);
 	/* f->f_version: 0 */
-	INIT_LIST_HEAD(&f->f_list);
+	INIT_LIST_HEAD(&f->f_u.fu_list);
 	return f;
 
 over:
@@ -225,15 +225,15 @@ void file_move(struct file *file, struct list_head *list)
 	if (!list)
 		return;
 	file_list_lock();
-	list_move(&file->f_list, list);
+	list_move(&file->f_u.fu_list, list);
 	file_list_unlock();
 }
 
 void file_kill(struct file *file)
 {
-	if (!list_empty(&file->f_list)) {
+	if (!list_empty(&file->f_u.fu_list)) {
 		file_list_lock();
-		list_del_init(&file->f_list);
+		list_del_init(&file->f_u.fu_list);
 		file_list_unlock();
 	}
 }
@@ -245,7 +245,7 @@ int fs_may_remount_ro(struct super_block *sb)
 	/* Check that no files are currently opened for writing. */
 	file_list_lock();
 	list_for_each(p, &sb->s_files) {
-		struct file *file = list_entry(p, struct file, f_list);
+		struct file *file = list_entry(p, struct file, f_u.fu_list);
 		struct inode *inode = file->f_dentry->d_inode;
 
 		/* File with pending delete? */
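The f_list to f_u.fu_list renames above (and in fs/dquot.c earlier) reflect a space optimization in struct file: the list linkage and the RCU callback head are never needed at the same time, since a file is unlinked from the superblock lists before it is handed to call_rcu() for freeing, so the two can share storage. A sketch of the union, assuming the 2.6.15-era layout (stub types so it stands alone):

struct list_head { struct list_head *next, *prev; };
struct rcu_head  { struct rcu_head *next; void (*func)(struct rcu_head *); };

struct file_sketch {
	/*
	 * fu_list links a live file on its superblock's s_files list;
	 * fu_rcuhead is only used after file_kill(), when the object is
	 * passed to call_rcu() for deferred freeing.  The two lifetimes
	 * never overlap, so a union saves sizeof(struct rcu_head).
	 */
	union {
		struct list_head fu_list;
		struct rcu_head  fu_rcuhead;
	} f_u;
	/* ... remaining struct file members ... */
};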
diff --git a/fs/filesystems.c b/fs/filesystems.c
index 44082bfdfec9..9f1072836c8e 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -12,6 +12,7 @@
 #include <linux/kmod.h>
 #include <linux/init.h>
 #include <linux/module.h>
+#include <linux/sched.h>	/* for 'current' */
 #include <asm/uaccess.h>
 
 /*
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index e94ab398b717..ffab4783ac64 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -230,7 +230,6 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc)
 			 * The inode is clean, unused
 			 */
 			list_move(&inode->i_list, &inode_unused);
-			inodes_stat.nr_unused++;
 		}
 	}
 	wake_up_inode(inode);
@@ -238,14 +237,20 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc)
 }
 
 /*
- * Write out an inode's dirty pages.  Called under inode_lock.
+ * Write out an inode's dirty pages.  Called under inode_lock.  Either the
+ * caller has ref on the inode (either via __iget or via syscall against an fd)
+ * or the inode has I_WILL_FREE set (via generic_forget_inode)
 */
 static int
-__writeback_single_inode(struct inode *inode,
-			struct writeback_control *wbc)
+__writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	wait_queue_head_t *wqh;
 
+	if (!atomic_read(&inode->i_count))
+		WARN_ON(!(inode->i_state & I_WILL_FREE));
+	else
+		WARN_ON(inode->i_state & I_WILL_FREE);
+
 	if ((wbc->sync_mode != WB_SYNC_ALL) && (inode->i_state & I_LOCK)) {
 		list_move(&inode->i_list, &inode->i_sb->s_dirty);
 		return 0;
@@ -259,11 +264,9 @@ __writeback_single_inode(struct inode *inode,
 
 		wqh = bit_waitqueue(&inode->i_state, __I_LOCK);
 		do {
-			__iget(inode);
 			spin_unlock(&inode_lock);
 			__wait_on_bit(wqh, &wq, inode_wait,
							TASK_UNINTERRUPTIBLE);
-			iput(inode);
 			spin_lock(&inode_lock);
 		} while (inode->i_state & I_LOCK);
 	}
@@ -541,14 +544,15 @@ void sync_inodes(int wait)
 }
 
 /**
  * write_inode_now - write an inode to disk
  * @inode: inode to write to disk
  * @sync: whether the write should be synchronous or not
+ *
+ * This function commits an inode to disk immediately if it is dirty. This is
+ * primarily needed by knfsd.
  *
- * This function commits an inode to disk immediately if it is
- * dirty. This is primarily needed by knfsd.
+ * The caller must either have a ref on the inode or must have set I_WILL_FREE.
 */
-
 int write_inode_now(struct inode *inode, int sync)
 {
 	int ret;
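
The two WARN_ON() checks added to __writeback_single_inode() encode the rule spelled out in the new comment: a caller must either hold a reference on the inode or have marked it I_WILL_FREE, and never both. A compact restatement of that invariant, with a placeholder flag value rather than the kernel's real I_WILL_FREE bit:

/* Legal writeback callers hold a reference XOR have set I_WILL_FREE. */
#define I_WILL_FREE_FLAG (1u << 4)    /* placeholder, not the kernel value */

static int writeback_caller_is_legal(int i_count, unsigned int i_state)
{
	int has_ref = i_count > 0;
	int will_free = (i_state & I_WILL_FREE_FLAG) != 0;

	return has_ref ^ will_free;    /* exactly one of the two must hold */
}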
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index d4c869c6d01b..a6f90a6c754a 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -151,9 +151,9 @@ void fuse_release_background(struct fuse_req *req)
 /*
  * This function is called when a request is finished.  Either a reply
  * has arrived or it was interrupted (and not yet sent) or some error
- * occured during communication with userspace, or the device file was
- * closed.  It decreases the referece count for the request.  In case
- * of a background request the referece to the stored objects are
+ * occurred during communication with userspace, or the device file was
+ * closed.  It decreases the reference count for the request.  In case
+ * of a background request the reference to the stored objects are
  * released.  The requester thread is woken up (if still waiting), and
  * finally the request is either freed or put on the unused_list
  *
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 29f1e9f6e85c..70dba721acab 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -741,13 +741,14 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
 	if (inode && S_ISDIR(inode->i_mode)) {
 		/* Don't allow creating an alias to a directory */
 		struct dentry *alias = d_find_alias(inode);
-		if (alias && !(alias->d_flags & DCACHE_DISCONNECTED)) {
+		if (alias) {
 			dput(alias);
 			iput(inode);
 			return ERR_PTR(-EIO);
 		}
 	}
-	return d_splice_alias(inode, entry);
+	d_add(entry, inode);
+	return NULL;
 }
 
 static int fuse_setxattr(struct dentry *entry, const char *name,
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 24d761518d86..5cb456f572c1 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -349,22 +349,22 @@ int fuse_fsync_common(struct file *file, struct dentry *de, int datasync,
 		      int isdir);
 
 /**
- * Initialise file operations on a regular file
+ * Initialize file operations on a regular file
 */
 void fuse_init_file_inode(struct inode *inode);
 
 /**
- * Initialise inode operations on regular files and special files
+ * Initialize inode operations on regular files and special files
 */
 void fuse_init_common(struct inode *inode);
 
 /**
- * Initialise inode and file operations on a directory
+ * Initialize inode and file operations on a directory
 */
 void fuse_init_dir(struct inode *inode);
 
 /**
- * Initialise inode operations on a symlink
+ * Initialize inode operations on a symlink
 */
 void fuse_init_symlink(struct inode *inode);
 
@@ -411,7 +411,7 @@ struct fuse_req *fuse_get_request(struct fuse_conn *fc);
 
 /**
  * Decrement reference count of a request.  If count goes to zero put
- * on unused list (preallocated) or free reqest (not preallocated).
+ * on unused list (preallocated) or free request (not preallocated).
 */
 void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req);
 
@@ -431,7 +431,7 @@ void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req);
 void request_send_background(struct fuse_conn *fc, struct fuse_req *req);
 
 /**
- * Release inodes and file assiciated with background request
+ * Release inodes and file associated with background request
 */
 void fuse_release_background(struct fuse_req *req);
 
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index f1570b9f9de3..3f680c5675bf 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -46,7 +46,7 @@ static sector_t hfs_bmap(struct address_space *mapping, sector_t block)
 	return generic_block_bmap(mapping, block, hfs_get_block);
 }
 
-static int hfs_releasepage(struct page *page, int mask)
+static int hfs_releasepage(struct page *page, gfp_t mask)
 {
 	struct inode *inode = page->mapping->host;
 	struct super_block *sb = inode->i_sb;
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index d5642705f633..f205773ddfbe 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -40,7 +40,7 @@ static sector_t hfsplus_bmap(struct address_space *mapping, sector_t block)
 	return generic_block_bmap(mapping, block, hfsplus_get_block);
 }
 
-static int hfsplus_releasepage(struct page *page, int mask)
+static int hfsplus_releasepage(struct page *page, gfp_t mask)
 {
 	struct inode *inode = page->mapping->host;
 	struct super_block *sb = inode->i_sb;
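
This hunk and the matching ones in fs/inode.c, fs/jbd, fs/jfs and fs/mbcache below are part of a tree-wide change of allocation-flag parameters from plain int/unsigned int to gfp_t. The point is type checking, not behavior: in the kernel the typedef carries a sparse annotation so that mixing a bare integer with a gfp mask draws a checker warning, while the generated code is unchanged. A simplified sketch of the idea (the real typedef lives in the gfp headers):

struct page;

/* In the kernel this typedef is annotated for sparse; plain C here. */
typedef unsigned int gfp_t;

/* A converted prototype changes only the declared mask type: */
int releasepage_like(struct page *page, gfp_t mask);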
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 3a9b6d179cbd..e026c807e6b3 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -45,10 +45,58 @@ static struct backing_dev_info hugetlbfs_backing_dev_info = {
 
 int sysctl_hugetlb_shm_group;
 
+static void huge_pagevec_release(struct pagevec *pvec)
+{
+	int i;
+
+	for (i = 0; i < pagevec_count(pvec); ++i)
+		put_page(pvec->pages[i]);
+
+	pagevec_reinit(pvec);
+}
+
+/*
+ * huge_pages_needed tries to determine the number of new huge pages that
+ * will be required to fully populate this VMA.  This will be equal to
+ * the size of the VMA in huge pages minus the number of huge pages
+ * (covered by this VMA) that are found in the page cache.
+ *
+ * Result is in bytes to be compatible with is_hugepage_mem_enough()
+ */
+unsigned long
+huge_pages_needed(struct address_space *mapping, struct vm_area_struct *vma)
+{
+	int i;
+	struct pagevec pvec;
+	unsigned long start = vma->vm_start;
+	unsigned long end = vma->vm_end;
+	unsigned long hugepages = (end - start) >> HPAGE_SHIFT;
+	pgoff_t next = vma->vm_pgoff;
+	pgoff_t endpg = next + ((end - start) >> PAGE_SHIFT);
+
+	pagevec_init(&pvec, 0);
+	while (next < endpg) {
+		if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE))
+			break;
+		for (i = 0; i < pagevec_count(&pvec); i++) {
+			struct page *page = pvec.pages[i];
+			if (page->index > next)
+				next = page->index;
+			if (page->index >= endpg)
+				break;
+			next++;
+			hugepages--;
+		}
+		huge_pagevec_release(&pvec);
+	}
+	return hugepages << HPAGE_SHIFT;
+}
+
 static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	struct inode *inode = file->f_dentry->d_inode;
 	struct address_space *mapping = inode->i_mapping;
+	unsigned long bytes;
 	loff_t len, vma_len;
 	int ret;
 
@@ -67,6 +115,10 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	if (vma->vm_end - vma->vm_start < HPAGE_SIZE)
 		return -EINVAL;
 
+	bytes = huge_pages_needed(mapping, vma);
+	if (!is_hugepage_mem_enough(bytes))
+		return -ENOMEM;
+
 	vma_len = (loff_t)(vma->vm_end - vma->vm_start);
 
 	down(&inode->i_sem);
@@ -79,10 +131,8 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	if (!(vma->vm_flags & VM_WRITE) && len > inode->i_size)
 		goto out;
 
-	ret = hugetlb_prefault(mapping, vma);
-	if (ret)
-		goto out;
-
+	ret = 0;
+	hugetlb_prefault_arch_hook(vma->vm_mm);
 	if (inode->i_size < len)
 		inode->i_size = len;
 out:
@@ -92,7 +142,7 @@ out:
 }
 
 /*
- * Called under down_write(mmap_sem), page_table_lock is not held
+ * Called under down_write(mmap_sem).
 */
 
 #ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
@@ -171,16 +221,6 @@ static int hugetlbfs_commit_write(struct file *file,
 	return -EINVAL;
 }
 
-static void huge_pagevec_release(struct pagevec *pvec)
-{
-	int i;
-
-	for (i = 0; i < pagevec_count(pvec); ++i)
-		put_page(pvec->pages[i]);
-
-	pagevec_reinit(pvec);
-}
-
 static void truncate_huge_page(struct page *page)
 {
 	clear_page_dirty(page);
@@ -224,52 +264,35 @@ static void truncate_hugepages(struct address_space *mapping, loff_t lstart)
 
 static void hugetlbfs_delete_inode(struct inode *inode)
 {
-	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(inode->i_sb);
-
-	hlist_del_init(&inode->i_hash);
-	list_del_init(&inode->i_list);
-	list_del_init(&inode->i_sb_list);
-	inode->i_state |= I_FREEING;
-	inodes_stat.nr_inodes--;
-	spin_unlock(&inode_lock);
-
 	if (inode->i_data.nrpages)
 		truncate_hugepages(&inode->i_data, 0);
-
-	security_inode_delete(inode);
-
-	if (sbinfo->free_inodes >= 0) {
-		spin_lock(&sbinfo->stat_lock);
-		sbinfo->free_inodes++;
-		spin_unlock(&sbinfo->stat_lock);
-	}
-
 	clear_inode(inode);
-	destroy_inode(inode);
 }
 
 static void hugetlbfs_forget_inode(struct inode *inode)
 {
-	struct super_block *super_block = inode->i_sb;
-	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(super_block);
+	struct super_block *sb = inode->i_sb;
 
-	if (hlist_unhashed(&inode->i_hash))
-		goto out_truncate;
-
-	if (!(inode->i_state & (I_DIRTY|I_LOCK))) {
-		list_del(&inode->i_list);
-		list_add(&inode->i_list, &inode_unused);
-	}
-	inodes_stat.nr_unused++;
-	if (!super_block || (super_block->s_flags & MS_ACTIVE)) {
-		spin_unlock(&inode_lock);
-		return;
+	if (!hlist_unhashed(&inode->i_hash)) {
+		if (!(inode->i_state & (I_DIRTY|I_LOCK)))
+			list_move(&inode->i_list, &inode_unused);
+		inodes_stat.nr_unused++;
+		if (!sb || (sb->s_flags & MS_ACTIVE)) {
+			spin_unlock(&inode_lock);
+			return;
+		}
+		inode->i_state |= I_WILL_FREE;
+		spin_unlock(&inode_lock);
+		/*
+		 * write_inode_now is a noop as we set BDI_CAP_NO_WRITEBACK
+		 * in our backing_dev_info.
+		 */
+		write_inode_now(inode, 1);
+		spin_lock(&inode_lock);
+		inode->i_state &= ~I_WILL_FREE;
+		inodes_stat.nr_unused--;
+		hlist_del_init(&inode->i_hash);
 	}
-
-	/* write_inode_now() ? */
-	inodes_stat.nr_unused--;
-	hlist_del_init(&inode->i_hash);
-out_truncate:
 	list_del_init(&inode->i_list);
 	list_del_init(&inode->i_sb_list);
 	inode->i_state |= I_FREEING;
@@ -277,13 +300,6 @@ out_truncate:
 	spin_unlock(&inode_lock);
 	if (inode->i_data.nrpages)
 		truncate_hugepages(&inode->i_data, 0);
-
-	if (sbinfo->free_inodes >= 0) {
-		spin_lock(&sbinfo->stat_lock);
-		sbinfo->free_inodes++;
-		spin_unlock(&sbinfo->stat_lock);
-	}
-
 	clear_inode(inode);
 	destroy_inode(inode);
 }
@@ -291,7 +307,7 @@ out_truncate:
 static void hugetlbfs_drop_inode(struct inode *inode)
 {
 	if (!inode->i_nlink)
-		hugetlbfs_delete_inode(inode);
+		generic_delete_inode(inode);
 	else
 		hugetlbfs_forget_inode(inode);
 }
@@ -308,7 +324,6 @@ hugetlb_vmtruncate_list(struct prio_tree_root *root, unsigned long h_pgoff)
 
 	vma_prio_tree_foreach(vma, &iter, root, h_pgoff, ULONG_MAX) {
 		unsigned long h_vm_pgoff;
-		unsigned long v_length;
 		unsigned long v_offset;
 
 		h_vm_pgoff = vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT);
@@ -319,11 +334,8 @@ hugetlb_vmtruncate_list(struct prio_tree_root *root, unsigned long h_pgoff)
 		if (h_vm_pgoff >= h_pgoff)
 			v_offset = 0;
 
-		v_length = vma->vm_end - vma->vm_start;
-
-		zap_hugepage_range(vma,
-				vma->vm_start + v_offset,
-				v_length - v_offset);
+		unmap_hugepage_range(vma,
+				vma->vm_start + v_offset, vma->vm_end);
 	}
 }
 
@@ -379,17 +391,6 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid,
 			gid_t gid, int mode, dev_t dev)
 {
 	struct inode *inode;
-	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb);
-
-	if (sbinfo->free_inodes >= 0) {
-		spin_lock(&sbinfo->stat_lock);
-		if (!sbinfo->free_inodes) {
-			spin_unlock(&sbinfo->stat_lock);
-			return NULL;
-		}
-		sbinfo->free_inodes--;
-		spin_unlock(&sbinfo->stat_lock);
-	}
 
 	inode = new_inode(sb);
 	if (inode) {
@@ -531,29 +532,51 @@ static void hugetlbfs_put_super(struct super_block *sb)
 	}
 }
 
+static inline int hugetlbfs_dec_free_inodes(struct hugetlbfs_sb_info *sbinfo)
+{
+	if (sbinfo->free_inodes >= 0) {
+		spin_lock(&sbinfo->stat_lock);
+		if (unlikely(!sbinfo->free_inodes)) {
+			spin_unlock(&sbinfo->stat_lock);
+			return 0;
+		}
+		sbinfo->free_inodes--;
+		spin_unlock(&sbinfo->stat_lock);
+	}
+
+	return 1;
+}
+
+static void hugetlbfs_inc_free_inodes(struct hugetlbfs_sb_info *sbinfo)
+{
+	if (sbinfo->free_inodes >= 0) {
+		spin_lock(&sbinfo->stat_lock);
+		sbinfo->free_inodes++;
+		spin_unlock(&sbinfo->stat_lock);
+	}
+}
+
+
 static kmem_cache_t *hugetlbfs_inode_cachep;
 
 static struct inode *hugetlbfs_alloc_inode(struct super_block *sb)
 {
+	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb);
 	struct hugetlbfs_inode_info *p;
 
+	if (unlikely(!hugetlbfs_dec_free_inodes(sbinfo)))
+		return NULL;
 	p = kmem_cache_alloc(hugetlbfs_inode_cachep, SLAB_KERNEL);
-	if (!p)
+	if (unlikely(!p)) {
+		hugetlbfs_inc_free_inodes(sbinfo);
 		return NULL;
+	}
 	return &p->vfs_inode;
 }
 
-static void init_once(void *foo, kmem_cache_t *cachep, unsigned long flags)
-{
-	struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo;
-
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR)
-		inode_init_once(&ei->vfs_inode);
-}
-
 static void hugetlbfs_destroy_inode(struct inode *inode)
 {
+	hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb));
 	mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy);
 	kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode));
 }
@@ -565,6 +588,16 @@ static struct address_space_operations hugetlbfs_aops = {
 	.set_page_dirty	= hugetlbfs_set_page_dirty,
 };
 
+
+static void init_once(void *foo, kmem_cache_t *cachep, unsigned long flags)
+{
+	struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo;
+
+	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
+	    SLAB_CTOR_CONSTRUCTOR)
+		inode_init_once(&ei->vfs_inode);
+}
+
 struct file_operations hugetlbfs_file_operations = {
 	.mmap			= hugetlbfs_file_mmap,
 	.fsync			= simple_sync_file,
@@ -592,6 +625,7 @@ static struct super_operations hugetlbfs_ops = {
 	.alloc_inode	= hugetlbfs_alloc_inode,
 	.destroy_inode	= hugetlbfs_destroy_inode,
 	.statfs		= hugetlbfs_statfs,
+	.delete_inode	= hugetlbfs_delete_inode,
 	.drop_inode	= hugetlbfs_drop_inode,
 	.put_super	= hugetlbfs_put_super,
 };
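
The hugetlbfs changes above move the free-inode bookkeeping out of the open-coded delete/forget paths and into a paired helper: the allocation side must first win a unit from the per-superblock budget, and the destroy side (or a failed allocation) gives it back. A self-contained model of that accounting, using a pthread mutex as a stand-in for sbinfo->stat_lock and treating a negative budget as unlimited, matching the free_inodes >= 0 guard (the unlocked sign test is safe because the sign of the budget is fixed when it is set up):

#include <pthread.h>

struct sb_budget {
	long free_inodes;            /* -1 means no limit */
	pthread_mutex_t lock;
};

static int budget_dec(struct sb_budget *sb)    /* cf. hugetlbfs_dec_free_inodes() */
{
	int ok = 1;

	if (sb->free_inodes >= 0) {
		pthread_mutex_lock(&sb->lock);
		if (sb->free_inodes == 0)
			ok = 0;              /* budget exhausted: fail the alloc */
		else
			sb->free_inodes--;
		pthread_mutex_unlock(&sb->lock);
	}
	return ok;
}

static void budget_inc(struct sb_budget *sb)   /* cf. hugetlbfs_inc_free_inodes() */
{
	if (sb->free_inodes >= 0) {
		pthread_mutex_lock(&sb->lock);
		sb->free_inodes++;
		pthread_mutex_unlock(&sb->lock);
	}
}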
diff --git a/fs/inode.c b/fs/inode.c
index f80a79ff156b..d8d04bd72b59 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -475,7 +475,7 @@ static void prune_icache(int nr_to_scan)
  * This function is passed the number of inodes to scan, and it returns the
  * total number of remaining possibly-reclaimable inodes.
 */
-static int shrink_icache_memory(int nr, unsigned int gfp_mask)
+static int shrink_icache_memory(int nr, gfp_t gfp_mask)
 {
 	if (nr) {
 		/*
@@ -1088,6 +1088,7 @@ static void generic_forget_inode(struct inode *inode)
 	if (inode->i_data.nrpages)
 		truncate_inode_pages(&inode->i_data, 0);
 	clear_inode(inode);
+	wake_up_inode(inode);
 	destroy_inode(inode);
 }
 
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 7ae2c4fe506b..e4b516ac4989 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -1606,7 +1606,7 @@ int journal_blocks_per_page(struct inode *inode)
 * Simple support for retrying memory allocations.  Introduced to help to
 * debug different VM deadlock avoidance strategies.
 */
-void * __jbd_kmalloc (const char *where, size_t size, int flags, int retry)
+void * __jbd_kmalloc (const char *where, size_t size, gfp_t flags, int retry)
 {
 	return kmalloc(size, flags | (retry ? __GFP_NOFAIL : 0));
 }
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 49bbc2be3d72..13cb05bf6048 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -1621,7 +1621,7 @@ out:
 * while the data is part of a transaction.  Yes?
 */
 int journal_try_to_free_buffers(journal_t *journal,
-				struct page *page, int unused_gfp_mask)
+				struct page *page, gfp_t unused_gfp_mask)
 {
 	struct buffer_head *head;
 	struct buffer_head *bh;
diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c
index 0f224384f176..8210ac16a368 100644
--- a/fs/jffs2/background.c
+++ b/fs/jffs2/background.c
@@ -15,6 +15,7 @@
 #include <linux/jffs2.h>
 #include <linux/mtd/mtd.h>
 #include <linux/completion.h>
+#include <linux/sched.h>
 #include "nodelist.h"
 
 
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 996d922e503e..316133c626b7 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -18,6 +18,8 @@
 #include <linux/mtd/mtd.h>
 #include <linux/crc32.h>
 #include <linux/mtd/nand.h>
+#include <linux/jiffies.h>
+
 #include "nodelist.h"
 
 /* For testing write failures */
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index eadf319bee22..68000a50ceb6 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -74,7 +74,7 @@
 static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
 			int nblocks);
 static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval);
-static void dbBackSplit(dmtree_t * tp, int leafno);
+static int dbBackSplit(dmtree_t * tp, int leafno);
 static int dbJoin(dmtree_t * tp, int leafno, int newval);
 static void dbAdjTree(dmtree_t * tp, int leafno, int newval);
 static int dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc,
@@ -305,7 +305,6 @@ int dbSync(struct inode *ipbmap)
 	filemap_fdatawrite(ipbmap->i_mapping);
 	filemap_fdatawait(ipbmap->i_mapping);
 
-	ipbmap->i_state |= I_DIRTY;
 	diWriteSpecial(ipbmap, 0);
 
 	return (0);
@@ -2467,7 +2466,9 @@ dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level)
		 * that it is at the front of a binary buddy system.
		 */
 		if (oldval == NOFREE) {
-			dbBackSplit((dmtree_t *) dcp, leafno);
+			rc = dbBackSplit((dmtree_t *) dcp, leafno);
+			if (rc)
+				return rc;
 			oldval = dcp->stree[ti];
 		}
 		dbSplit((dmtree_t *) dcp, leafno, dcp->budmin, newval);
@@ -2627,7 +2628,7 @@ static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval)
 *
 * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
 */
-static void dbBackSplit(dmtree_t * tp, int leafno)
+static int dbBackSplit(dmtree_t * tp, int leafno)
 {
 	int budsz, bud, w, bsz, size;
 	int cursz;
@@ -2662,7 +2663,10 @@ static void dbBackSplit(dmtree_t * tp, int leafno)
	 */
 	for (w = leafno, bsz = budsz;; bsz <<= 1,
 	     w = (w < bud) ? w : bud) {
-		assert(bsz < le32_to_cpu(tp->dmt_nleafs));
+		if (bsz >= le32_to_cpu(tp->dmt_nleafs)) {
+			jfs_err("JFS: block map error in dbBackSplit");
+			return -EIO;
+		}
 
 		/* determine the buddy.
		 */
@@ -2681,7 +2685,11 @@ static void dbBackSplit(dmtree_t * tp, int leafno)
 		}
 	}
 
-	assert(leaf[leafno] == size);
+	if (leaf[leafno] != size) {
+		jfs_err("JFS: wrong leaf value in dbBackSplit");
+		return -EIO;
+	}
+	return 0;
 }
 
 
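
The dbBackSplit() rework above follows a standard hardening pattern: a consistency check that used to be a fatal assert() becomes a logged error that propagates up as -EIO, so a corrupted block map fails the operation instead of crashing the machine. The shape of the change, reduced to a standalone sketch with hypothetical names:

#include <stdio.h>

#define EIO 5

static int back_split(int map_is_consistent)
{
	if (!map_is_consistent) {
		fprintf(stderr, "block map error in back_split\n"); /* cf. jfs_err() */
		return -EIO;
	}
	/* ... perform the split ... */
	return 0;
}

/* Callers that used to ignore a void return now check and propagate rc,
 * as dbAdjCtl() does in the hunk above. */
static int adjust_level(int map_is_consistent)
{
	int rc = back_split(map_is_consistent);
	if (rc)
		return rc;
	/* ... continue adjusting the upper level ... */
	return 0;
}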
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index 4021d46da7e3..28201b194f53 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -57,6 +57,12 @@
 #include "jfs_debug.h"
 
 /*
+ * __mark_inode_dirty expects inodes to be hashed.  Since we don't want
+ * special inodes in the fileset inode space, we hash them to a dummy head
+ */
+static HLIST_HEAD(aggregate_hash);
+
+/*
  * imap locks
 */
 /* iag free list lock */
@@ -491,6 +497,8 @@ struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary)
 	/* release the page */
 	release_metapage(mp);
 
+	hlist_add_head(&ip->i_hash, &aggregate_hash);
+
 	return (ip);
 }
 
@@ -514,8 +522,6 @@ void diWriteSpecial(struct inode *ip, int secondary)
 	ino_t inum = ip->i_ino;
 	struct metapage *mp;
 
-	ip->i_state &= ~I_DIRTY;
-
 	if (secondary)
 		address = addressPXD(&sbi->ait2) >> sbi->l2nbperpage;
 	else
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 13d7e3f1feb4..8a53981f9f27 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -86,7 +86,7 @@ struct meta_anchor {
 	atomic_t io_count;
 	struct metapage *mp[MPS_PER_PAGE];
 };
-#define mp_anchor(page) ((struct meta_anchor *)page->private)
+#define mp_anchor(page) ((struct meta_anchor *)page_private(page))
 
 static inline struct metapage *page_to_mp(struct page *page, uint offset)
 {
@@ -108,7 +108,7 @@ static inline int insert_metapage(struct page *page, struct metapage *mp)
 		if (!a)
 			return -ENOMEM;
 		memset(a, 0, sizeof(struct meta_anchor));
-		page->private = (unsigned long)a;
+		set_page_private(page, (unsigned long)a);
 		SetPagePrivate(page);
 		kmap(page);
 	}
@@ -136,7 +136,7 @@ static inline void remove_metapage(struct page *page, struct metapage *mp)
 		a->mp[index] = NULL;
 		if (--a->mp_count == 0) {
 			kfree(a);
-			page->private = 0;
+			set_page_private(page, 0);
 			ClearPagePrivate(page);
 			kunmap(page);
 		}
@@ -156,13 +156,13 @@ static inline void dec_io(struct page *page, void (*handler) (struct page *))
 #else
 static inline struct metapage *page_to_mp(struct page *page, uint offset)
 {
-	return PagePrivate(page) ? (struct metapage *)page->private : NULL;
+	return PagePrivate(page) ? (struct metapage *)page_private(page) : NULL;
 }
 
 static inline int insert_metapage(struct page *page, struct metapage *mp)
 {
 	if (mp) {
-		page->private = (unsigned long)mp;
+		set_page_private(page, (unsigned long)mp);
 		SetPagePrivate(page);
 		kmap(page);
 	}
@@ -171,7 +171,7 @@ static inline int insert_metapage(struct page *page, struct metapage *mp)
 
 static inline void remove_metapage(struct page *page, struct metapage *mp)
 {
-	page->private = 0;
+	set_page_private(page, 0);
 	ClearPagePrivate(page);
 	kunmap(page);
 }
@@ -198,7 +198,7 @@ static void init_once(void *foo, kmem_cache_t *cachep, unsigned long flags)
 	}
 }
 
-static inline struct metapage *alloc_metapage(unsigned int gfp_mask)
+static inline struct metapage *alloc_metapage(gfp_t gfp_mask)
 {
 	return mempool_alloc(metapage_mempool, gfp_mask);
 }
@@ -395,6 +395,12 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc)
 
 		if (mp->nohomeok && !test_bit(META_forcewrite, &mp->flag)) {
 			redirty = 1;
+			/*
+			 * Make sure this page isn't blocked indefinitely.
+			 * If the journal isn't undergoing I/O, push it
+			 */
+			if (mp->log && !(mp->log->cflag & logGC_PAGEOUT))
+				jfs_flush_journal(mp->log, 0);
 			continue;
 		}
 
@@ -534,7 +540,7 @@ add_failed:
 	return -EIO;
 }
 
-static int metapage_releasepage(struct page *page, int gfp_mask)
+static int metapage_releasepage(struct page *page, gfp_t gfp_mask)
 {
 	struct metapage *mp;
 	int busy = 0;
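
All the direct page->private accesses in this file are converted to the page_private()/set_page_private() accessors, so the field is only ever touched through one named interface. Reduced to its shape, with a hypothetical structure standing in for struct page:

struct page_like {
	unsigned long private_data;
};

#define page_private(p)        ((p)->private_data)
#define set_page_private(p, v) ((p)->private_data = (v))

/* Usage mirrors the hunks above:
 *   set_page_private(page, (unsigned long)mp);
 *   mp = (struct metapage *)page_private(page);
 */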
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index 9b71ed2674fe..b660c93c92de 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -2396,7 +2396,6 @@ static void txUpdateMap(struct tblock * tblk)
	 */
 	if (tblk->xflag & COMMIT_CREATE) {
 		diUpdatePMap(ipimap, tblk->ino, FALSE, tblk);
-		ipimap->i_state |= I_DIRTY;
 		/* update persistent block allocation map
 		 * for the allocation of inode extent;
 		 */
@@ -2407,7 +2406,6 @@ static void txUpdateMap(struct tblock * tblk)
 	} else if (tblk->xflag & COMMIT_DELETE) {
 		ip = tblk->u.ip;
 		diUpdatePMap(ipimap, ip->i_ino, TRUE, tblk);
-		ipimap->i_state |= I_DIRTY;
 		iput(ip);
 	}
 }
diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c
index a7fe2f2b969f..e72f4ebb6e9c 100644
--- a/fs/jfs/jfs_xtree.c
+++ b/fs/jfs/jfs_xtree.c
@@ -3516,16 +3516,10 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag)
 	/* process entries backward from last index */
 	index = le16_to_cpu(p->header.nextindex) - 1;
 
-	if (p->header.flag & BT_INTERNAL)
-		goto getChild;
-
-	/*
-	 *	leaf page
-	 */
 
-	/* Since this is the rightmost leaf, and we may have already freed
-	 * a page that was formerly to the right, let's make sure that the
-	 * next pointer is zero.
+	/* Since this is the rightmost page at this level, and we may have
+	 * already freed a page that was formerly to the right, let's make
+	 * sure that the next pointer is zero.
	 */
 	if (p->header.next) {
 		if (log)
@@ -3539,6 +3533,12 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag)
 		p->header.next = 0;
 	}
 
+	if (p->header.flag & BT_INTERNAL)
+		goto getChild;
+
+	/*
+	 *	leaf page
+	 */
 	freed = 0;
 
 	/* does region covered by leaf page precede Teof ? */
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 71bc34b96b2b..4226af3ea91b 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -442,6 +442,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
 		inode->i_nlink = 1;
 		inode->i_size = sb->s_bdev->bd_inode->i_size;
 		inode->i_mapping->a_ops = &jfs_metapage_aops;
+		insert_inode_hash(inode);
 		mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
 
 		sbi->direct_inode = inode;
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 82c77df81c5f..c4c8601096e0 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -173,11 +173,10 @@ nlm_bind_host(struct nlm_host *host)
 
 	/* If we've already created an RPC client, check whether
	 * RPC rebind is required
-	 * Note: why keep rebinding if we're on a tcp connection?
	 */
 	if ((clnt = host->h_rpcclnt) != NULL) {
 		xprt = clnt->cl_xprt;
-		if (!xprt->stream && time_after_eq(jiffies, host->h_nextrebind)) {
+		if (time_after_eq(jiffies, host->h_nextrebind)) {
 			clnt->cl_port = 0;
 			host->h_nextrebind = jiffies + NLM_HOST_REBIND;
 			dprintk("lockd: next rebind in %ld jiffies\n",
@@ -189,7 +188,6 @@ nlm_bind_host(struct nlm_host *host)
 			goto forgetit;
 
 		xprt_set_timeout(&xprt->timeout, 5, nlmsvc_timeout);
-		xprt->nocong = 1;	/* No congestion control for NLM */
 		xprt->resvport = 1;	/* NLM requires a reserved port */
 
 		/* Existing NLM servers accept AUTH_UNIX only */
diff --git a/fs/locks.c b/fs/locks.c
index f7daa5f48949..a1e8b2248014 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -316,21 +316,22 @@ static int flock_to_posix_lock(struct file *filp, struct file_lock *fl,
 	/* POSIX-1996 leaves the case l->l_len < 0 undefined;
	   POSIX-2001 defines it. */
 	start += l->l_start;
-	end = start + l->l_len - 1;
-	if (l->l_len < 0) {
+	if (start < 0)
+		return -EINVAL;
+	fl->fl_end = OFFSET_MAX;
+	if (l->l_len > 0) {
+		end = start + l->l_len - 1;
+		fl->fl_end = end;
+	} else if (l->l_len < 0) {
 		end = start - 1;
+		fl->fl_end = end;
 		start += l->l_len;
+		if (start < 0)
+			return -EINVAL;
 	}
-
-	if (start < 0)
-		return -EINVAL;
-	if (l->l_len > 0 && end < 0)
-		return -EOVERFLOW;
-
 	fl->fl_start = start;	/* we record the absolute position */
-	fl->fl_end = end;
-	if (l->l_len == 0)
-		fl->fl_end = OFFSET_MAX;
+	if (fl->fl_end < fl->fl_start)
+		return -EOVERFLOW;
 
 	fl->fl_owner = current->files;
 	fl->fl_pid = current->tgid;
@@ -362,14 +363,21 @@ static int flock64_to_posix_lock(struct file *filp, struct file_lock *fl,
 		return -EINVAL;
 	}
 
-	if (((start += l->l_start) < 0) || (l->l_len < 0))
+	start += l->l_start;
+	if (start < 0)
 		return -EINVAL;
-	fl->fl_end = start + l->l_len - 1;
-	if (l->l_len > 0 && fl->fl_end < 0)
-		return -EOVERFLOW;
+	fl->fl_end = OFFSET_MAX;
+	if (l->l_len > 0) {
+		fl->fl_end = start + l->l_len - 1;
+	} else if (l->l_len < 0) {
+		fl->fl_end = start - 1;
+		start += l->l_len;
+		if (start < 0)
+			return -EINVAL;
+	}
 	fl->fl_start = start;	/* we record the absolute position */
-	if (l->l_len == 0)
-		fl->fl_end = OFFSET_MAX;
+	if (fl->fl_end < fl->fl_start)
+		return -EOVERFLOW;
 
 	fl->fl_owner = current->files;
 	fl->fl_pid = current->tgid;
@@ -829,12 +837,16 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request)
 		/* Detect adjacent or overlapping regions (if same lock type)
		 */
 		if (request->fl_type == fl->fl_type) {
+			/* In all comparisons of start vs end, use
+			 * "start - 1" rather than "end + 1". If end
+			 * is OFFSET_MAX, end + 1 will become negative.
+			 */
 			if (fl->fl_end < request->fl_start - 1)
 				goto next_lock;
 			/* If the next lock in the list has entirely bigger
			 * addresses than the new one, insert the lock here.
			 */
-			if (fl->fl_start > request->fl_end + 1)
+			if (fl->fl_start - 1 > request->fl_end)
 				break;
 
 			/* If we come here, the new and old lock are of the
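
The flock_to_posix_lock()/flock64_to_posix_lock() rework above computes an absolute byte range [fl_start, fl_end] from the user-supplied l_start/l_len pair: a positive length locks l_len bytes starting at start, a zero length locks to end of file (OFFSET_MAX), and a negative length locks the |l_len| bytes ending just before start, per POSIX-2001. A userspace sketch of the same arithmetic, assuming a 64-bit offset type:

#include <limits.h>

#define OFFSET_MAX LLONG_MAX

static int posix_range(long long start, long long l_len,
		       long long *fl_start, long long *fl_end)
{
	/* start is assumed already absolute, as after the SEEK_SET/CUR/END
	 * adjustment in the kernel code. */
	if (start < 0)
		return -1;                     /* -EINVAL */
	*fl_end = OFFSET_MAX;                  /* l_len == 0: lock to EOF */
	if (l_len > 0) {
		*fl_end = start + l_len - 1;
	} else if (l_len < 0) {
		*fl_end = start - 1;           /* range ends just before start */
		start += l_len;
		if (start < 0)
			return -1;             /* -EINVAL */
	}
	*fl_start = start;
	if (*fl_end < *fl_start)
		return -2;                     /* -EOVERFLOW on signed wraparound */
	return 0;
}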
diff --git a/fs/mbcache.c b/fs/mbcache.c
index b002a088857d..298997f17475 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -116,7 +116,7 @@ mb_cache_indexes(struct mb_cache *cache)
 * What the mbcache registers as to get shrunk dynamically.
 */
 
-static int mb_cache_shrink_fn(int nr_to_scan, unsigned int gfp_mask);
+static int mb_cache_shrink_fn(int nr_to_scan, gfp_t gfp_mask);
 
 
 static inline int
@@ -140,7 +140,7 @@ __mb_cache_entry_unhash(struct mb_cache_entry *ce)
 
 
 static inline void
-__mb_cache_entry_forget(struct mb_cache_entry *ce, int gfp_mask)
+__mb_cache_entry_forget(struct mb_cache_entry *ce, gfp_t gfp_mask)
 {
 	struct mb_cache *cache = ce->e_cache;
 
@@ -193,7 +193,7 @@ forget:
 * Returns the number of objects which are present in the cache.
 */
 static int
-mb_cache_shrink_fn(int nr_to_scan, unsigned int gfp_mask)
+mb_cache_shrink_fn(int nr_to_scan, gfp_t gfp_mask)
 {
 	LIST_HEAD(free_list);
 	struct list_head *l, *ltmp;
diff --git a/fs/msdos/namei.c b/fs/msdos/namei.c
index 154f511c7245..626a367bcd81 100644
--- a/fs/msdos/namei.c
+++ b/fs/msdos/namei.c
@@ -454,10 +454,10 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name,
 {
 	struct buffer_head *dotdot_bh;
 	struct msdos_dir_entry *dotdot_de;
-	loff_t dotdot_i_pos;
 	struct inode *old_inode, *new_inode;
 	struct fat_slot_info old_sinfo, sinfo;
 	struct timespec ts;
+	loff_t dotdot_i_pos, new_i_pos;
 	int err, old_attrs, is_dir, update_dotdot, corrupt = 0;
 
 	old_sinfo.bh = sinfo.bh = dotdot_bh = NULL;
@@ -516,28 +516,24 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name,
 	if (new_inode) {
 		if (err)
 			goto out;
-		if (MSDOS_I(new_inode)->i_pos != sinfo.i_pos) {
-			/* WTF??? Cry and fail. */
-			printk(KERN_WARNING "msdos_rename: fs corrupted\n");
-			goto out;
-		}
-
 		if (is_dir) {
 			err = fat_dir_empty(new_inode);
 			if (err)
 				goto out;
 		}
+		new_i_pos = MSDOS_I(new_inode)->i_pos;
 		fat_detach(new_inode);
 	} else {
 		err = msdos_add_entry(new_dir, new_name, is_dir, is_hid, 0,
				      &ts, &sinfo);
 		if (err)
 			goto out;
+		new_i_pos = sinfo.i_pos;
 	}
 	new_dir->i_version++;
 
 	fat_detach(old_inode);
-	fat_attach(old_inode, sinfo.i_pos);
+	fat_attach(old_inode, new_i_pos);
 	if (is_hid)
 		MSDOS_I(old_inode)->i_attrs |= ATTR_HIDDEN;
 	else
@@ -604,7 +600,7 @@ error_inode:
 	fat_attach(old_inode, old_sinfo.i_pos);
 	MSDOS_I(old_inode)->i_attrs = old_attrs;
 	if (new_inode) {
-		fat_attach(new_inode, sinfo.i_pos);
+		fat_attach(new_inode, new_i_pos);
 		if (corrupt)
 			corrupt |= fat_sync_inode(new_inode);
 	} else {
diff --git a/fs/namei.c b/fs/namei.c
index aa62dbda93ac..c5769c4fcab1 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -28,6 +28,7 @@
 #include <linux/syscalls.h>
 #include <linux/mount.h>
 #include <linux/audit.h>
+#include <linux/file.h>
 #include <asm/namei.h>
 #include <asm/uaccess.h>
 
@@ -317,6 +318,18 @@ void path_release_on_umount(struct nameidata *nd)
 	mntput_no_expire(nd->mnt);
 }
 
+/**
+ * release_open_intent - free up open intent resources
+ * @nd: pointer to nameidata
+ */
+void release_open_intent(struct nameidata *nd)
+{
+	if (nd->intent.open.file->f_dentry == NULL)
+		put_filp(nd->intent.open.file);
+	else
+		fput(nd->intent.open.file);
+}
+
 /*
  * Internal lookup() using the new generic dcache.
  * SMP-safe
@@ -750,6 +763,7 @@ static fastcall int __link_path_walk(const char * name, struct nameidata *nd)
 		struct qstr this;
 		unsigned int c;
 
+		nd->flags |= LOOKUP_CONTINUE;
 		err = exec_permission_lite(inode, nd);
 		if (err == -EAGAIN) {
 			err = permission(inode, MAY_EXEC, nd);
@@ -802,7 +816,6 @@ static fastcall int __link_path_walk(const char * name, struct nameidata *nd)
 			if (err < 0)
 				break;
 		}
-		nd->flags |= LOOKUP_CONTINUE;
 		/* This does the actual lookups.. */
 		err = do_lookup(nd, &this, &next);
 		if (err)
@@ -1052,6 +1065,70 @@ out:
 	return retval;
 }
 
+static int __path_lookup_intent_open(const char *name, unsigned int lookup_flags,
+		struct nameidata *nd, int open_flags, int create_mode)
+{
+	struct file *filp = get_empty_filp();
+	int err;
+
+	if (filp == NULL)
+		return -ENFILE;
+	nd->intent.open.file = filp;
+	nd->intent.open.flags = open_flags;
+	nd->intent.open.create_mode = create_mode;
+	err = path_lookup(name, lookup_flags|LOOKUP_OPEN, nd);
+	if (IS_ERR(nd->intent.open.file)) {
+		if (err == 0) {
+			err = PTR_ERR(nd->intent.open.file);
+			path_release(nd);
+		}
+	} else if (err != 0)
+		release_open_intent(nd);
+	return err;
+}
+
+/**
+ * path_lookup_open - lookup a file path with open intent
+ * @name: pointer to file name
+ * @lookup_flags: lookup intent flags
+ * @nd: pointer to nameidata
+ * @open_flags: open intent flags
+ */
+int path_lookup_open(const char *name, unsigned int lookup_flags,
+		struct nameidata *nd, int open_flags)
+{
+	return __path_lookup_intent_open(name, lookup_flags, nd,
+			open_flags, 0);
+}
+
+/**
+ * path_lookup_create - lookup a file path with open + create intent
+ * @name: pointer to file name
+ * @lookup_flags: lookup intent flags
+ * @nd: pointer to nameidata
+ * @open_flags: open intent flags
+ * @create_mode: create intent flags
+ */
+int path_lookup_create(const char *name, unsigned int lookup_flags,
+		struct nameidata *nd, int open_flags, int create_mode)
+{
+	return __path_lookup_intent_open(name, lookup_flags|LOOKUP_CREATE, nd,
+			open_flags, create_mode);
+}
+
+int __user_path_lookup_open(const char __user *name, unsigned int lookup_flags,
+		struct nameidata *nd, int open_flags)
+{
+	char *tmp = getname(name);
+	int err = PTR_ERR(tmp);
+
+	if (!IS_ERR(tmp)) {
+		err = __path_lookup_intent_open(tmp, lookup_flags, nd, open_flags, 0);
+		putname(tmp);
+	}
+	return err;
+}
+
 /*
  * Restricted form of lookup. Doesn't follow links, single-component only,
  * needs parent already locked. Doesn't follow mounts.
@@ -1234,9 +1311,6 @@ static inline int may_create(struct inode *dir, struct dentry *child,
 }
 
 /*
- * Special case: O_CREAT|O_EXCL implies O_NOFOLLOW for security
- * reasons.
- *
 * O_DIRECTORY translates into forcing a directory lookup.
 */
 static inline int lookup_flags(unsigned int f)
@@ -1246,9 +1320,6 @@ static inline int lookup_flags(unsigned int f)
 	if (f & O_NOFOLLOW)
 		retval &= ~LOOKUP_FOLLOW;
 
-	if ((f & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL))
-		retval &= ~LOOKUP_FOLLOW;
-
 	if (f & O_DIRECTORY)
 		retval |= LOOKUP_DIRECTORY;
 
@@ -1416,27 +1487,27 @@ int may_open(struct nameidata *nd, int acc_mode, int flag)
 */
 int open_namei(const char * pathname, int flag, int mode, struct nameidata *nd)
 {
-	int acc_mode, error = 0;
+	int acc_mode, error;
 	struct path path;
 	struct dentry *dir;
 	int count = 0;
 
 	acc_mode = ACC_MODE(flag);
 
+	/* O_TRUNC implies we need access checks for write permissions */
+	if (flag & O_TRUNC)
+		acc_mode |= MAY_WRITE;
+
 	/* Allow the LSM permission hook to distinguish append
	   access from general write access. */
 	if (flag & O_APPEND)
 		acc_mode |= MAY_APPEND;
 
-	/* Fill in the open() intent data */
-	nd->intent.open.flags = flag;
-	nd->intent.open.create_mode = mode;
-
 	/*
	 * The simplest case - just a plain lookup.
	 */
 	if (!(flag & O_CREAT)) {
-		error = path_lookup(pathname, lookup_flags(flag)|LOOKUP_OPEN, nd);
+		error = path_lookup_open(pathname, lookup_flags(flag), nd, flag);
 		if (error)
 			return error;
 		goto ok;
@@ -1445,7 +1516,7 @@ int open_namei(const char * pathname, int flag, int mode, struct nameidata *nd)
 	/*
	 * Create - we need to know the parent.
	 */
-	error = path_lookup(pathname, LOOKUP_PARENT|LOOKUP_OPEN|LOOKUP_CREATE, nd);
+	error = path_lookup_create(pathname, LOOKUP_PARENT, nd, flag, mode);
 	if (error)
 		return error;
 
@@ -1520,6 +1591,8 @@ ok:
 exit_dput:
 	dput_path(&path, nd);
 exit:
+	if (!IS_ERR(nd->intent.open.file))
+		release_open_intent(nd);
 	path_release(nd);
 	return error;
 
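
The open-intent helpers added above pre-allocate the struct file an OPEN intent will use and pin down who releases it: release_open_intent() uses put_filp() if the lookup never attached the file to a dentry, and fput() if it did. A self-contained model of that release rule (types and names are stand-ins, not kernel API):

#include <stdlib.h>

struct file_model {
	void *dentry;     /* NULL until an open actually installs the file */
	int refs;
};

static void put_unused(struct file_model *f) { free(f); }  /* cf. put_filp() */

static void put_used(struct file_model *f)                 /* cf. fput() */
{
	if (--f->refs == 0)
		free(f);
}

static void release_intent(struct file_model *f)
{
	if (f->dentry == NULL)
		put_unused(f);    /* lookup failed before the file was used */
	else
		put_used(f);      /* open path installed it: drop a reference */
}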
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 4a36839f0bbd..44135af9894c 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -142,7 +142,7 @@ static void nfs_msync_inode(struct inode *inode)
 /*
 * Basic procedure for returning a delegation to the server
 */
-int nfs_inode_return_delegation(struct inode *inode)
+int __nfs_inode_return_delegation(struct inode *inode)
 {
 	struct nfs4_client *clp = NFS_SERVER(inode)->nfs4_state;
 	struct nfs_inode *nfsi = NFS_I(inode);
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 3f6c45a29d6a..8017846b561f 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -25,7 +25,7 @@ struct nfs_delegation {
 
 int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res);
 void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res);
-int nfs_inode_return_delegation(struct inode *inode);
+int __nfs_inode_return_delegation(struct inode *inode);
 int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid);
 
 struct inode *nfs_delegation_find_inode(struct nfs4_client *clp, const struct nfs_fh *fhandle);
@@ -47,11 +47,25 @@ static inline int nfs_have_delegation(struct inode *inode, int flags)
 		return 1;
 	return 0;
 }
+
+static inline int nfs_inode_return_delegation(struct inode *inode)
+{
+	int err = 0;
+
+	if (NFS_I(inode)->delegation != NULL)
+		err = __nfs_inode_return_delegation(inode);
+	return err;
+}
 #else
 static inline int nfs_have_delegation(struct inode *inode, int flags)
 {
 	return 0;
 }
+
+static inline int nfs_inode_return_delegation(struct inode *inode)
+{
+	return 0;
+}
 #endif
 
 #endif
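
The rename to __nfs_inode_return_delegation() pairs with the new inline wrapper so callers pay only a pointer test when there is no delegation, and take the out-of-line slow path only when there is one. That fast-path-in-the-header shape, in miniature with hypothetical names:

struct obj {
	void *delegation;
};

int obj_return_delegation_slow(struct obj *o);    /* out-of-line worker */

static inline int obj_return_delegation(struct obj *o)
{
	if (o->delegation != NULL)                 /* cheap inline test */
		return obj_return_delegation_slow(o);
	return 0;                                  /* nothing to return */
}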
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 2df639f143e8..8272ed3fc707 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -532,6 +532,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
532 my_entry.eof = 0; 532 my_entry.eof = 0;
533 my_entry.fh = &fh; 533 my_entry.fh = &fh;
534 my_entry.fattr = &fattr; 534 my_entry.fattr = &fattr;
535 nfs_fattr_init(&fattr);
535 desc->entry = &my_entry; 536 desc->entry = &my_entry;
536 537
537 while(!desc->entry->eof) { 538 while(!desc->entry->eof) {
@@ -565,8 +566,6 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
565 } 566 }
566 } 567 }
567 unlock_kernel(); 568 unlock_kernel();
568 if (desc->error < 0)
569 return desc->error;
570 if (res < 0) 569 if (res < 0)
571 return res; 570 return res;
572 return 0; 571 return 0;
@@ -803,6 +802,7 @@ static int nfs_dentry_delete(struct dentry *dentry)
803 */ 802 */
804static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode) 803static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode)
805{ 804{
805 nfs_inode_return_delegation(inode);
806 if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { 806 if (dentry->d_flags & DCACHE_NFSFS_RENAMED) {
807 lock_kernel(); 807 lock_kernel();
808 inode->i_nlink--; 808 inode->i_nlink--;
@@ -853,12 +853,6 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
853 dentry->d_op = NFS_PROTO(dir)->dentry_ops; 853 dentry->d_op = NFS_PROTO(dir)->dentry_ops;
854 854
855 lock_kernel(); 855 lock_kernel();
856 /* Revalidate parent directory attribute cache */
857 error = nfs_revalidate_inode(NFS_SERVER(dir), dir);
858 if (error < 0) {
859 res = ERR_PTR(error);
860 goto out_unlock;
861 }
862 856
863 /* If we're doing an exclusive create, optimize away the lookup */ 857 /* If we're doing an exclusive create, optimize away the lookup */
864 if (nfs_is_exclusive_create(dir, nd)) 858 if (nfs_is_exclusive_create(dir, nd))
@@ -916,7 +910,6 @@ static int is_atomic_open(struct inode *dir, struct nameidata *nd)
916static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) 910static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
917{ 911{
918 struct dentry *res = NULL; 912 struct dentry *res = NULL;
919 struct inode *inode = NULL;
920 int error; 913 int error;
921 914
922 /* Check that we are indeed trying to open this file */ 915 /* Check that we are indeed trying to open this file */
@@ -930,8 +923,10 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
930 dentry->d_op = NFS_PROTO(dir)->dentry_ops; 923 dentry->d_op = NFS_PROTO(dir)->dentry_ops;
931 924
932 /* Let vfs_create() deal with O_EXCL */ 925 /* Let vfs_create() deal with O_EXCL */
933 if (nd->intent.open.flags & O_EXCL) 926 if (nd->intent.open.flags & O_EXCL) {
934 goto no_entry; 927 d_add(dentry, NULL);
928 goto out;
929 }
935 930
936 /* Open the file on the server */ 931 /* Open the file on the server */
937 lock_kernel(); 932 lock_kernel();
@@ -945,32 +940,30 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
945 940
946 if (nd->intent.open.flags & O_CREAT) { 941 if (nd->intent.open.flags & O_CREAT) {
947 nfs_begin_data_update(dir); 942 nfs_begin_data_update(dir);
948 inode = nfs4_atomic_open(dir, dentry, nd); 943 res = nfs4_atomic_open(dir, dentry, nd);
949 nfs_end_data_update(dir); 944 nfs_end_data_update(dir);
950 } else 945 } else
951 inode = nfs4_atomic_open(dir, dentry, nd); 946 res = nfs4_atomic_open(dir, dentry, nd);
952 unlock_kernel(); 947 unlock_kernel();
953 if (IS_ERR(inode)) { 948 if (IS_ERR(res)) {
954 error = PTR_ERR(inode); 949 error = PTR_ERR(res);
955 switch (error) { 950 switch (error) {
956 /* Make a negative dentry */ 951 /* Make a negative dentry */
957 case -ENOENT: 952 case -ENOENT:
958 inode = NULL; 953 res = NULL;
959 break; 954 goto out;
960 /* This turned out not to be a regular file */ 955 /* This turned out not to be a regular file */
956 case -EISDIR:
957 case -ENOTDIR:
958 goto no_open;
961 case -ELOOP: 959 case -ELOOP:
962 if (!(nd->intent.open.flags & O_NOFOLLOW)) 960 if (!(nd->intent.open.flags & O_NOFOLLOW))
963 goto no_open; 961 goto no_open;
964 /* case -EISDIR: */
965 /* case -EINVAL: */ 962 /* case -EINVAL: */
966 default: 963 default:
967 res = ERR_PTR(error);
968 goto out; 964 goto out;
969 } 965 }
970 } 966 } else if (res != NULL)
971no_entry:
972 res = d_add_unique(dentry, inode);
973 if (res != NULL)
974 dentry = res; 967 dentry = res;
975 nfs_renew_times(dentry); 968 nfs_renew_times(dentry);
976 nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 969 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
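
nfs4_atomic_open() now hands back a dentry (or an ERR_PTR-encoded errno) rather than a bare inode, so the switch above can turn -ENOENT into a negative dentry and route -EISDIR/-ENOTDIR to the no_open fallback. A portable sketch of the single-pointer error-encoding convention this relies on, assuming the usual kernel MAX_ERRNO trick (this models err.h, it is not the kernel header):

#include <stdio.h>
#include <errno.h>
#include <stdint.h>

#define MAX_ERRNO 4095

static inline void *ERR_PTR_demo(long error) { return (void *)(intptr_t)error; }
static inline long PTR_ERR_demo(const void *ptr) { return (long)(intptr_t)ptr; }
static inline int IS_ERR_demo(const void *ptr)
{
        /* the top MAX_ERRNO addresses are reserved for encoded errnos */
        return (uintptr_t)ptr >= (uintptr_t)-MAX_ERRNO;
}

static void *open_demo(int fail)
{
        static int obj;

        if (fail)
                return ERR_PTR_demo(-ENOENT);  /* caller makes a negative dentry */
        return &obj;
}

int main(void)
{
        void *res = open_demo(1);

        if (IS_ERR_demo(res))
                printf("error path: %ld\n", PTR_ERR_demo(res));
        return 0;
}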
@@ -1014,7 +1007,7 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
1014 */ 1007 */
1015 lock_kernel(); 1008 lock_kernel();
1016 verifier = nfs_save_change_attribute(dir); 1009 verifier = nfs_save_change_attribute(dir);
1017 ret = nfs4_open_revalidate(dir, dentry, openflags); 1010 ret = nfs4_open_revalidate(dir, dentry, openflags, nd);
1018 if (!ret) 1011 if (!ret)
1019 nfs_set_verifier(dentry, verifier); 1012 nfs_set_verifier(dentry, verifier);
1020 unlock_kernel(); 1013 unlock_kernel();
@@ -1137,7 +1130,7 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode,
1137 1130
1138 lock_kernel(); 1131 lock_kernel();
1139 nfs_begin_data_update(dir); 1132 nfs_begin_data_update(dir);
1140 error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags); 1133 error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, nd);
1141 nfs_end_data_update(dir); 1134 nfs_end_data_update(dir);
1142 if (error != 0) 1135 if (error != 0)
1143 goto out_err; 1136 goto out_err;
@@ -1332,6 +1325,7 @@ static int nfs_safe_remove(struct dentry *dentry)
1332 1325
1333 nfs_begin_data_update(dir); 1326 nfs_begin_data_update(dir);
1334 if (inode != NULL) { 1327 if (inode != NULL) {
1328 nfs_inode_return_delegation(inode);
1335 nfs_begin_data_update(inode); 1329 nfs_begin_data_update(inode);
1336 error = NFS_PROTO(dir)->remove(dir, &dentry->d_name); 1330 error = NFS_PROTO(dir)->remove(dir, &dentry->d_name);
1337 /* The VFS may want to delete this inode */ 1331 /* The VFS may want to delete this inode */
@@ -1438,17 +1432,14 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
1438 old_dentry->d_parent->d_name.name, old_dentry->d_name.name, 1432 old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
1439 dentry->d_parent->d_name.name, dentry->d_name.name); 1433 dentry->d_parent->d_name.name, dentry->d_name.name);
1440 1434
1441 /*
1442 * Drop the dentry in advance to force a new lookup.
1443 * Since nfs_proc_link doesn't return a file handle,
1444 * we can't use the existing dentry.
1445 */
1446 lock_kernel(); 1435 lock_kernel();
1447 d_drop(dentry);
1448
1449 nfs_begin_data_update(dir); 1436 nfs_begin_data_update(dir);
1450 nfs_begin_data_update(inode); 1437 nfs_begin_data_update(inode);
1451 error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name); 1438 error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name);
1439 if (error == 0) {
1440 atomic_inc(&inode->i_count);
1441 d_instantiate(dentry, inode);
1442 }
1452 nfs_end_data_update(inode); 1443 nfs_end_data_update(inode);
1453 nfs_end_data_update(dir); 1444 nfs_end_data_update(dir);
1454 unlock_kernel(); 1445 unlock_kernel();
@@ -1512,9 +1503,11 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1512 */ 1503 */
1513 if (!new_inode) 1504 if (!new_inode)
1514 goto go_ahead; 1505 goto go_ahead;
1515 if (S_ISDIR(new_inode->i_mode)) 1506 if (S_ISDIR(new_inode->i_mode)) {
1516 goto out; 1507 error = -EISDIR;
1517 else if (atomic_read(&new_dentry->d_count) > 2) { 1508 if (!S_ISDIR(old_inode->i_mode))
1509 goto out;
1510 } else if (atomic_read(&new_dentry->d_count) > 2) {
1518 int err; 1511 int err;
1519 /* copy the target dentry's name */ 1512 /* copy the target dentry's name */
1520 dentry = d_alloc(new_dentry->d_parent, 1513 dentry = d_alloc(new_dentry->d_parent,
@@ -1539,7 +1532,8 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1539#endif 1532#endif
1540 goto out; 1533 goto out;
1541 } 1534 }
1542 } 1535 } else
1536 new_inode->i_nlink--;
1543 1537
1544go_ahead: 1538go_ahead:
1545 /* 1539 /*
@@ -1549,6 +1543,7 @@ go_ahead:
1549 nfs_wb_all(old_inode); 1543 nfs_wb_all(old_inode);
1550 shrink_dcache_parent(old_dentry); 1544 shrink_dcache_parent(old_dentry);
1551 } 1545 }
1546 nfs_inode_return_delegation(old_inode);
1552 1547
1553 if (new_inode) 1548 if (new_inode)
1554 d_delete(new_dentry); 1549 d_delete(new_dentry);
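
The nfs_link() hunk replaces the old d_drop()-and-relookup dance: on success it takes an extra inode reference and binds the inode to the existing dentry with d_instantiate(), so the dcache stays warm. A toy model of that reference-then-bind step (a plain counter standing in for i_count):

#include <stdio.h>

struct obj { int refcount; };
struct handle { struct obj *bound; };

static void instantiate_demo(struct handle *h, struct obj *o)
{
        o->refcount++;          /* atomic_inc(&inode->i_count) in the patch */
        h->bound = o;           /* d_instantiate(dentry, inode) */
}

int main(void)
{
        struct obj inode = { .refcount = 1 };
        struct handle dentry = { 0 };

        instantiate_demo(&dentry, &inode);
        printf("refcount=%d bound=%p\n", inode.refcount, (void *)dentry.bound);
        return 0;
}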
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 6bdcfa95de94..57d3e77d97ee 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -205,8 +205,8 @@ nfs_file_flush(struct file *file)
205 if (!status) { 205 if (!status) {
206 status = ctx->error; 206 status = ctx->error;
207 ctx->error = 0; 207 ctx->error = 0;
208 if (!status && !nfs_have_delegation(inode, FMODE_READ)) 208 if (!status)
209 __nfs_revalidate_inode(NFS_SERVER(inode), inode); 209 nfs_revalidate_inode(NFS_SERVER(inode), inode);
210 } 210 }
211 unlock_kernel(); 211 unlock_kernel();
212 return status; 212 return status;
@@ -376,22 +376,31 @@ out_swapfile:
376 376
377static int do_getlk(struct file *filp, int cmd, struct file_lock *fl) 377static int do_getlk(struct file *filp, int cmd, struct file_lock *fl)
378{ 378{
379 struct file_lock *cfl;
379 struct inode *inode = filp->f_mapping->host; 380 struct inode *inode = filp->f_mapping->host;
380 int status = 0; 381 int status = 0;
381 382
382 lock_kernel(); 383 lock_kernel();
383 /* Use local locking if mounted with "-onolock" */ 384 /* Try local locking first */
384 if (!(NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM)) 385 cfl = posix_test_lock(filp, fl);
385 status = NFS_PROTO(inode)->lock(filp, cmd, fl); 386 if (cfl != NULL) {
386 else { 387 locks_copy_lock(fl, cfl);
387 struct file_lock *cfl = posix_test_lock(filp, fl); 388 goto out;
388
389 fl->fl_type = F_UNLCK;
390 if (cfl != NULL)
391 memcpy(fl, cfl, sizeof(*fl));
392 } 389 }
390
391 if (nfs_have_delegation(inode, FMODE_READ))
392 goto out_noconflict;
393
394 if (NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM)
395 goto out_noconflict;
396
397 status = NFS_PROTO(inode)->lock(filp, cmd, fl);
398out:
393 unlock_kernel(); 399 unlock_kernel();
394 return status; 400 return status;
401out_noconflict:
402 fl->fl_type = F_UNLCK;
403 goto out;
395} 404}
396 405
397static int do_vfs_lock(struct file *file, struct file_lock *fl) 406static int do_vfs_lock(struct file *file, struct file_lock *fl)
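
The reworked do_getlk() tries the cheap answers first: a conflicting local POSIX lock is copied back immediately; holding a read delegation or mounting with -onolock means no other client can hold a conflicting lock, so F_UNLCK is returned without an RPC; only otherwise is the server asked. The same decision order as a small runnable sketch (the three predicates are stand-ins for posix_test_lock(), nfs_have_delegation() and the NFS_MOUNT_NONLM test):

#include <stdio.h>

enum lk { LK_CONFLICT, LK_UNLOCKED, LK_ASK_SERVER };

static enum lk getlk_demo(int local_conflict, int have_delegation, int nolock)
{
        if (local_conflict)
                return LK_CONFLICT;     /* copy the conflicting lock, done */
        if (have_delegation || nolock)
                return LK_UNLOCKED;     /* fl_type = F_UNLCK, no RPC needed */
        return LK_ASK_SERVER;           /* NFS_PROTO(inode)->lock(...) */
}

int main(void)
{
        printf("%d %d %d\n",
               getlk_demo(1, 0, 0), getlk_demo(0, 1, 0), getlk_demo(0, 0, 0));
        return 0;
}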
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index d4eadeea128e..fc0f12ba89cc 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -358,6 +358,35 @@ out_no_root:
358 return no_root_error; 358 return no_root_error;
359} 359}
360 360
361static void nfs_init_timeout_values(struct rpc_timeout *to, int proto, unsigned int timeo, unsigned int retrans)
362{
363 to->to_initval = timeo * HZ / 10;
364 to->to_retries = retrans;
365 if (!to->to_retries)
366 to->to_retries = 2;
367
368 switch (proto) {
369 case IPPROTO_TCP:
370 if (!to->to_initval)
371 to->to_initval = 60 * HZ;
372 if (to->to_initval > NFS_MAX_TCP_TIMEOUT)
373 to->to_initval = NFS_MAX_TCP_TIMEOUT;
374 to->to_increment = to->to_initval;
375 to->to_maxval = to->to_initval + (to->to_increment * to->to_retries);
376 to->to_exponential = 0;
377 break;
378 case IPPROTO_UDP:
379 default:
380 if (!to->to_initval)
381 to->to_initval = 11 * HZ / 10;
382 if (to->to_initval > NFS_MAX_UDP_TIMEOUT)
383 to->to_initval = NFS_MAX_UDP_TIMEOUT;
384 to->to_maxval = NFS_MAX_UDP_TIMEOUT;
385 to->to_exponential = 1;
386 break;
387 }
388}
389
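
nfs_init_timeout_values() centralizes the retransmit maths that used to be open-coded in both mount paths: timeo arrives in tenths of a second, TCP backs off linearly (to_exponential = 0, each retry adds another to_initval), UDP keeps exponential backoff. A user-space rerun of the TCP arithmetic, assuming HZ=100 and an invented cap standing in for NFS_MAX_TCP_TIMEOUT:

#include <stdio.h>

#define HZ 100
#define MAX_TCP_TIMEOUT_DEMO (600 * HZ)   /* stand-in for NFS_MAX_TCP_TIMEOUT */

int main(void)
{
        unsigned int timeo = 600, retrans = 2;         /* timeo=60.0s */
        unsigned long initval = timeo * HZ / 10;       /* jiffies */
        unsigned long retries = retrans ? retrans : 2;

        if (!initval)
                initval = 60 * HZ;                     /* TCP default */
        if (initval > MAX_TCP_TIMEOUT_DEMO)
                initval = MAX_TCP_TIMEOUT_DEMO;

        /* linear backoff: each retry waits one more initval */
        unsigned long increment = initval;
        unsigned long maxval = initval + increment * retries;

        printf("initval=%lu maxval=%lu retries=%lu\n", initval, maxval, retries);
        return 0;
}

For timeo=600, retrans=2 this yields initval=6000 jiffies (60s) and maxval=18000 jiffies (180s), matching to_maxval = to_initval + to_increment * to_retries above.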
361/* 390/*
362 * Create an RPC client handle. 391 * Create an RPC client handle.
363 */ 392 */
@@ -367,22 +396,12 @@ nfs_create_client(struct nfs_server *server, const struct nfs_mount_data *data)
367 struct rpc_timeout timeparms; 396 struct rpc_timeout timeparms;
368 struct rpc_xprt *xprt = NULL; 397 struct rpc_xprt *xprt = NULL;
369 struct rpc_clnt *clnt = NULL; 398 struct rpc_clnt *clnt = NULL;
370 int tcp = (data->flags & NFS_MOUNT_TCP); 399 int proto = (data->flags & NFS_MOUNT_TCP) ? IPPROTO_TCP : IPPROTO_UDP;
371 400
372 /* Initialize timeout values */ 401 nfs_init_timeout_values(&timeparms, proto, data->timeo, data->retrans);
373 timeparms.to_initval = data->timeo * HZ / 10;
374 timeparms.to_retries = data->retrans;
375 timeparms.to_maxval = tcp ? RPC_MAX_TCP_TIMEOUT : RPC_MAX_UDP_TIMEOUT;
376 timeparms.to_exponential = 1;
377
378 if (!timeparms.to_initval)
379 timeparms.to_initval = (tcp ? 600 : 11) * HZ / 10;
380 if (!timeparms.to_retries)
381 timeparms.to_retries = 5;
382 402
383 /* create transport and client */ 403 /* create transport and client */
384 xprt = xprt_create_proto(tcp ? IPPROTO_TCP : IPPROTO_UDP, 404 xprt = xprt_create_proto(proto, &server->addr, &timeparms);
385 &server->addr, &timeparms);
386 if (IS_ERR(xprt)) { 405 if (IS_ERR(xprt)) {
387 dprintk("%s: cannot create RPC transport. Error = %ld\n", 406 dprintk("%s: cannot create RPC transport. Error = %ld\n",
388 __FUNCTION__, PTR_ERR(xprt)); 407 __FUNCTION__, PTR_ERR(xprt));
@@ -576,7 +595,6 @@ static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt)
576 { NFS_MOUNT_SOFT, ",soft", ",hard" }, 595 { NFS_MOUNT_SOFT, ",soft", ",hard" },
577 { NFS_MOUNT_INTR, ",intr", "" }, 596 { NFS_MOUNT_INTR, ",intr", "" },
578 { NFS_MOUNT_POSIX, ",posix", "" }, 597 { NFS_MOUNT_POSIX, ",posix", "" },
579 { NFS_MOUNT_TCP, ",tcp", ",udp" },
580 { NFS_MOUNT_NOCTO, ",nocto", "" }, 598 { NFS_MOUNT_NOCTO, ",nocto", "" },
581 { NFS_MOUNT_NOAC, ",noac", "" }, 599 { NFS_MOUNT_NOAC, ",noac", "" },
582 { NFS_MOUNT_NONLM, ",nolock", ",lock" }, 600 { NFS_MOUNT_NONLM, ",nolock", ",lock" },
@@ -585,6 +603,8 @@ static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt)
585 }; 603 };
586 struct proc_nfs_info *nfs_infop; 604 struct proc_nfs_info *nfs_infop;
587 struct nfs_server *nfss = NFS_SB(mnt->mnt_sb); 605 struct nfs_server *nfss = NFS_SB(mnt->mnt_sb);
606 char buf[12];
607 char *proto;
588 608
589 seq_printf(m, ",v%d", nfss->rpc_ops->version); 609 seq_printf(m, ",v%d", nfss->rpc_ops->version);
590 seq_printf(m, ",rsize=%d", nfss->rsize); 610 seq_printf(m, ",rsize=%d", nfss->rsize);
@@ -603,6 +623,18 @@ static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt)
603 else 623 else
604 seq_puts(m, nfs_infop->nostr); 624 seq_puts(m, nfs_infop->nostr);
605 } 625 }
626 switch (nfss->client->cl_xprt->prot) {
627 case IPPROTO_TCP:
628 proto = "tcp";
629 break;
630 case IPPROTO_UDP:
631 proto = "udp";
632 break;
633 default:
634 snprintf(buf, sizeof(buf), "%u", nfss->client->cl_xprt->prot);
635 proto = buf;
636 }
637 seq_printf(m, ",proto=%s", proto);
606 seq_puts(m, ",addr="); 638 seq_puts(m, ",addr=");
607 seq_escape(m, nfss->hostname, " \t\n\\"); 639 seq_escape(m, nfss->hostname, " \t\n\\");
608 return 0; 640 return 0;
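
Rather than inferring ",tcp"/",udp" from the mount flag, the /proc/mounts output is now derived from the transport actually in use, with unknown protocol numbers printed numerically through a small stack buffer. The same mapping as a runnable sketch:

#include <stdio.h>
#include <netinet/in.h>   /* IPPROTO_TCP, IPPROTO_UDP */

static void show_proto_demo(unsigned int prot)
{
        char buf[12];
        const char *proto;

        switch (prot) {
        case IPPROTO_TCP:
                proto = "tcp";
                break;
        case IPPROTO_UDP:
                proto = "udp";
                break;
        default:
                /* unknown transport: fall back to the raw number */
                snprintf(buf, sizeof(buf), "%u", prot);
                proto = buf;
        }
        printf(",proto=%s\n", proto);
}

int main(void)
{
        show_proto_demo(IPPROTO_TCP);
        show_proto_demo(132);           /* e.g. SCTP: prints ,proto=132 */
        return 0;
}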
@@ -753,7 +785,8 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
753 else 785 else
754 init_special_inode(inode, inode->i_mode, fattr->rdev); 786 init_special_inode(inode, inode->i_mode, fattr->rdev);
755 787
756 nfsi->read_cache_jiffies = fattr->timestamp; 788 nfsi->read_cache_jiffies = fattr->time_start;
789 nfsi->last_updated = jiffies;
757 inode->i_atime = fattr->atime; 790 inode->i_atime = fattr->atime;
758 inode->i_mtime = fattr->mtime; 791 inode->i_mtime = fattr->mtime;
759 inode->i_ctime = fattr->ctime; 792 inode->i_ctime = fattr->ctime;
@@ -821,6 +854,11 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
821 filemap_fdatawait(inode->i_mapping); 854 filemap_fdatawait(inode->i_mapping);
822 nfs_wb_all(inode); 855 nfs_wb_all(inode);
823 } 856 }
857 /*
858 * Return any delegations if we're going to change ACLs
859 */
860 if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0)
861 nfs_inode_return_delegation(inode);
824 error = NFS_PROTO(inode)->setattr(dentry, &fattr, attr); 862 error = NFS_PROTO(inode)->setattr(dentry, &fattr, attr);
825 if (error == 0) 863 if (error == 0)
826 nfs_refresh_inode(inode, &fattr); 864 nfs_refresh_inode(inode, &fattr);
@@ -1019,15 +1057,11 @@ int nfs_open(struct inode *inode, struct file *filp)
1019 ctx->mode = filp->f_mode; 1057 ctx->mode = filp->f_mode;
1020 nfs_file_set_open_context(filp, ctx); 1058 nfs_file_set_open_context(filp, ctx);
1021 put_nfs_open_context(ctx); 1059 put_nfs_open_context(ctx);
1022 if ((filp->f_mode & FMODE_WRITE) != 0)
1023 nfs_begin_data_update(inode);
1024 return 0; 1060 return 0;
1025} 1061}
1026 1062
1027int nfs_release(struct inode *inode, struct file *filp) 1063int nfs_release(struct inode *inode, struct file *filp)
1028{ 1064{
1029 if ((filp->f_mode & FMODE_WRITE) != 0)
1030 nfs_end_data_update(inode);
1031 nfs_file_clear_open_context(filp); 1065 nfs_file_clear_open_context(filp);
1032 return 0; 1066 return 0;
1033} 1067}
@@ -1083,14 +1117,15 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
1083 goto out; 1117 goto out;
1084 } 1118 }
1085 1119
1120 spin_lock(&inode->i_lock);
1086 status = nfs_update_inode(inode, &fattr, verifier); 1121 status = nfs_update_inode(inode, &fattr, verifier);
1087 if (status) { 1122 if (status) {
1123 spin_unlock(&inode->i_lock);
1088 dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) refresh failed, error=%d\n", 1124 dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) refresh failed, error=%d\n",
1089 inode->i_sb->s_id, 1125 inode->i_sb->s_id,
1090 (long long)NFS_FILEID(inode), status); 1126 (long long)NFS_FILEID(inode), status);
1091 goto out; 1127 goto out;
1092 } 1128 }
1093 spin_lock(&inode->i_lock);
1094 cache_validity = nfsi->cache_validity; 1129 cache_validity = nfsi->cache_validity;
1095 nfsi->cache_validity &= ~NFS_INO_REVAL_PAGECACHE; 1130 nfsi->cache_validity &= ~NFS_INO_REVAL_PAGECACHE;
1096 1131
@@ -1098,7 +1133,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
1098 * We may need to keep the attributes marked as invalid if 1133 * We may need to keep the attributes marked as invalid if
1099 * we raced with nfs_end_attr_update(). 1134 * we raced with nfs_end_attr_update().
1100 */ 1135 */
1101 if (verifier == nfsi->cache_change_attribute) 1136 if (time_after_eq(verifier, nfsi->cache_change_attribute))
1102 nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ATIME); 1137 nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ATIME);
1103 spin_unlock(&inode->i_lock); 1138 spin_unlock(&inode->i_lock);
1104 1139
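
The verifier comparison also changes from equality to time_after_eq(), because cache_change_attribute is now a jiffies value and jiffies wrap: ordered comparisons must be done on the signed difference. A minimal model of the kernel macro showing why a plain >= misbehaves across the wrap:

#include <stdio.h>

/* wrap-safe "a is at or after b" for jiffies-style counters */
#define time_after_eq_demo(a, b) ((long)(a) - (long)(b) >= 0)

int main(void)
{
        unsigned long before_wrap = (unsigned long)-10;  /* just before wrap */
        unsigned long after_wrap = 5;                    /* just after wrap */

        /* naive >= says "before", the signed distance says "after" */
        printf("naive: %d, wrap-safe: %d\n",
               after_wrap >= before_wrap,
               time_after_eq_demo(after_wrap, before_wrap));
        return 0;
}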
@@ -1165,7 +1200,7 @@ void nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
1165 if (S_ISDIR(inode->i_mode)) { 1200 if (S_ISDIR(inode->i_mode)) {
1166 memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); 1201 memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
1167 /* This ensures we revalidate child dentries */ 1202 /* This ensures we revalidate child dentries */
1168 nfsi->cache_change_attribute++; 1203 nfsi->cache_change_attribute = jiffies;
1169 } 1204 }
1170 spin_unlock(&inode->i_lock); 1205 spin_unlock(&inode->i_lock);
1171 1206
@@ -1197,20 +1232,19 @@ void nfs_end_data_update(struct inode *inode)
1197 struct nfs_inode *nfsi = NFS_I(inode); 1232 struct nfs_inode *nfsi = NFS_I(inode);
1198 1233
1199 if (!nfs_have_delegation(inode, FMODE_READ)) { 1234 if (!nfs_have_delegation(inode, FMODE_READ)) {
1200 /* Mark the attribute cache for revalidation */ 1235 /* Directories and symlinks: invalidate page cache */
1201 spin_lock(&inode->i_lock); 1236 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) {
1202 nfsi->cache_validity |= NFS_INO_INVALID_ATTR; 1237 spin_lock(&inode->i_lock);
1203 /* Directories and symlinks: invalidate page cache too */
1204 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
1205 nfsi->cache_validity |= NFS_INO_INVALID_DATA; 1238 nfsi->cache_validity |= NFS_INO_INVALID_DATA;
1206 spin_unlock(&inode->i_lock); 1239 spin_unlock(&inode->i_lock);
1240 }
1207 } 1241 }
1208 nfsi->cache_change_attribute ++; 1242 nfsi->cache_change_attribute = jiffies;
1209 atomic_dec(&nfsi->data_updates); 1243 atomic_dec(&nfsi->data_updates);
1210} 1244}
1211 1245
1212/** 1246/**
1213 * nfs_refresh_inode - verify consistency of the inode attribute cache 1247 * nfs_check_inode_attributes - verify consistency of the inode attribute cache
1214 * @inode - pointer to inode 1248 * @inode - pointer to inode
1215 * @fattr - updated attributes 1249 * @fattr - updated attributes
1216 * 1250 *
@@ -1218,13 +1252,12 @@ void nfs_end_data_update(struct inode *inode)
1218 * so that fattr carries weak cache consistency data, then it may 1252 * so that fattr carries weak cache consistency data, then it may
1219 * also update the ctime/mtime/change_attribute. 1253 * also update the ctime/mtime/change_attribute.
1220 */ 1254 */
1221int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr) 1255static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fattr)
1222{ 1256{
1223 struct nfs_inode *nfsi = NFS_I(inode); 1257 struct nfs_inode *nfsi = NFS_I(inode);
1224 loff_t cur_size, new_isize; 1258 loff_t cur_size, new_isize;
1225 int data_unstable; 1259 int data_unstable;
1226 1260
1227 spin_lock(&inode->i_lock);
1228 1261
1229 /* Are we in the process of updating data on the server? */ 1262 /* Are we in the process of updating data on the server? */
1230 data_unstable = nfs_caches_unstable(inode); 1263 data_unstable = nfs_caches_unstable(inode);
@@ -1241,14 +1274,12 @@ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr)
1241 } 1274 }
1242 1275
1243 if ((fattr->valid & NFS_ATTR_FATTR) == 0) { 1276 if ((fattr->valid & NFS_ATTR_FATTR) == 0) {
1244 spin_unlock(&inode->i_lock);
1245 return 0; 1277 return 0;
1246 } 1278 }
1247 1279
1248 /* Has the inode gone and changed behind our back? */ 1280 /* Has the inode gone and changed behind our back? */
1249 if (nfsi->fileid != fattr->fileid 1281 if (nfsi->fileid != fattr->fileid
1250 || (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) { 1282 || (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) {
1251 spin_unlock(&inode->i_lock);
1252 return -EIO; 1283 return -EIO;
1253 } 1284 }
1254 1285
@@ -1288,11 +1319,67 @@ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr)
1288 if (!timespec_equal(&inode->i_atime, &fattr->atime)) 1319 if (!timespec_equal(&inode->i_atime, &fattr->atime))
1289 nfsi->cache_validity |= NFS_INO_INVALID_ATIME; 1320 nfsi->cache_validity |= NFS_INO_INVALID_ATIME;
1290 1321
1291 nfsi->read_cache_jiffies = fattr->timestamp; 1322 nfsi->read_cache_jiffies = fattr->time_start;
1292 spin_unlock(&inode->i_lock);
1293 return 0; 1323 return 0;
1294} 1324}
1295 1325
1326/**
1327 * nfs_refresh_inode - try to update the inode attribute cache
1328 * @inode - pointer to inode
1329 * @fattr - updated attributes
1330 *
1331 * Check that an RPC call that returned attributes has not overlapped with
1332 * other recent updates of the inode metadata, then decide whether it is
1333 * safe to do a full update of the inode attributes, or whether just to
1334 * call nfs_check_inode_attributes.
1335 */
1336int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr)
1337{
1338 struct nfs_inode *nfsi = NFS_I(inode);
1339 int status;
1340
1341 if ((fattr->valid & NFS_ATTR_FATTR) == 0)
1342 return 0;
1343 spin_lock(&inode->i_lock);
1344 nfsi->cache_validity &= ~NFS_INO_REVAL_PAGECACHE;
1345 if (nfs_verify_change_attribute(inode, fattr->time_start))
1346 nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ATIME);
1347 if (time_after(fattr->time_start, nfsi->last_updated))
1348 status = nfs_update_inode(inode, fattr, fattr->time_start);
1349 else
1350 status = nfs_check_inode_attributes(inode, fattr);
1351
1352 spin_unlock(&inode->i_lock);
1353 return status;
1354}
1355
1356/**
1357 * nfs_post_op_update_inode - try to update the inode attribute cache
1358 * @inode - pointer to inode
1359 * @fattr - updated attributes
1360 *
1361 * After an operation that has changed the inode metadata, mark the
1362 * attribute cache as being invalid, then try to update it.
1363 */
1364int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1365{
1366 struct nfs_inode *nfsi = NFS_I(inode);
1367 int status = 0;
1368
1369 spin_lock(&inode->i_lock);
1370 if (unlikely((fattr->valid & NFS_ATTR_FATTR) == 0)) {
1371 nfsi->cache_validity |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS;
1372 goto out;
1373 }
1374 status = nfs_update_inode(inode, fattr, fattr->time_start);
1375 if (time_after_eq(fattr->time_start, nfsi->cache_change_attribute))
1376 nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ATIME|NFS_INO_REVAL_PAGECACHE);
1377 nfsi->cache_change_attribute = jiffies;
1378out:
1379 spin_unlock(&inode->i_lock);
1380 return status;
1381}
1382
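
The split above makes the update policy explicit: attributes fetched after the last full update (time_after(fattr->time_start, nfsi->last_updated)) may be applied with nfs_update_inode(), while older ones are only cross-checked by nfs_check_inode_attributes(). The branch in isolation, as a runnable sketch:

#include <stdio.h>

/* time_after(a, b): wrap-safe "a is strictly after b" */
#define time_after_demo(a, b) ((long)(b) - (long)(a) < 0)

static const char *refresh_demo(unsigned long time_start, unsigned long last_updated)
{
        if (time_after_demo(time_start, last_updated))
                return "nfs_update_inode";           /* full attribute update */
        return "nfs_check_inode_attributes";         /* consistency check only */
}

int main(void)
{
        printf("%s\n", refresh_demo(100, 50));       /* fresh fattr */
        printf("%s\n", refresh_demo(50, 100));       /* stale fattr */
        return 0;
}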
1296/* 1383/*
1297 * Many nfs protocol calls return the new file attributes after 1384 * Many nfs protocol calls return the new file attributes after
1298 * an operation. Here we update the inode to reflect the state 1385 * an operation. Here we update the inode to reflect the state
@@ -1328,20 +1415,17 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr, unsign
1328 goto out_err; 1415 goto out_err;
1329 } 1416 }
1330 1417
1331 spin_lock(&inode->i_lock);
1332
1333 /* 1418 /*
1334 * Make sure the inode's type hasn't changed. 1419 * Make sure the inode's type hasn't changed.
1335 */ 1420 */
1336 if ((inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) { 1421 if ((inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT))
1337 spin_unlock(&inode->i_lock);
1338 goto out_changed; 1422 goto out_changed;
1339 }
1340 1423
1341 /* 1424 /*
1342 * Update the read time so we don't revalidate too often. 1425 * Update the read time so we don't revalidate too often.
1343 */ 1426 */
1344 nfsi->read_cache_jiffies = fattr->timestamp; 1427 nfsi->read_cache_jiffies = fattr->time_start;
1428 nfsi->last_updated = jiffies;
1345 1429
1346 /* Are we racing with known updates of the metadata on the server? */ 1430 /* Are we racing with known updates of the metadata on the server? */
1347 data_unstable = ! (nfs_verify_change_attribute(inode, verifier) || 1431 data_unstable = ! (nfs_verify_change_attribute(inode, verifier) ||
@@ -1354,7 +1438,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr, unsign
1354 /* Do we perhaps have any outstanding writes? */ 1438 /* Do we perhaps have any outstanding writes? */
1355 if (nfsi->npages == 0) { 1439 if (nfsi->npages == 0) {
1356 /* No, but did we race with nfs_end_data_update()? */ 1440 /* No, but did we race with nfs_end_data_update()? */
1357 if (verifier == nfsi->cache_change_attribute) { 1441 if (time_after_eq(verifier, nfsi->cache_change_attribute)) {
1358 inode->i_size = new_isize; 1442 inode->i_size = new_isize;
1359 invalid |= NFS_INO_INVALID_DATA; 1443 invalid |= NFS_INO_INVALID_DATA;
1360 } 1444 }
@@ -1430,7 +1514,6 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr, unsign
1430 if (!nfs_have_delegation(inode, FMODE_READ)) 1514 if (!nfs_have_delegation(inode, FMODE_READ))
1431 nfsi->cache_validity |= invalid; 1515 nfsi->cache_validity |= invalid;
1432 1516
1433 spin_unlock(&inode->i_lock);
1434 return 0; 1517 return 0;
1435 out_changed: 1518 out_changed:
1436 /* 1519 /*
@@ -1639,8 +1722,7 @@ static void nfs4_clear_inode(struct inode *inode)
1639 struct nfs_inode *nfsi = NFS_I(inode); 1722 struct nfs_inode *nfsi = NFS_I(inode);
1640 1723
1641 /* If we are holding a delegation, return it! */ 1724 /* If we are holding a delegation, return it! */
1642 if (nfsi->delegation != NULL) 1725 nfs_inode_return_delegation(inode);
1643 nfs_inode_return_delegation(inode);
1644 /* First call standard NFS clear_inode() code */ 1726 /* First call standard NFS clear_inode() code */
1645 nfs_clear_inode(inode); 1727 nfs_clear_inode(inode);
1646 /* Now clear out any remaining state */ 1728 /* Now clear out any remaining state */
@@ -1669,7 +1751,7 @@ static int nfs4_fill_super(struct super_block *sb, struct nfs4_mount_data *data,
1669 struct rpc_clnt *clnt = NULL; 1751 struct rpc_clnt *clnt = NULL;
1670 struct rpc_timeout timeparms; 1752 struct rpc_timeout timeparms;
1671 rpc_authflavor_t authflavour; 1753 rpc_authflavor_t authflavour;
1672 int proto, err = -EIO; 1754 int err = -EIO;
1673 1755
1674 sb->s_blocksize_bits = 0; 1756 sb->s_blocksize_bits = 0;
1675 sb->s_blocksize = 0; 1757 sb->s_blocksize = 0;
@@ -1687,30 +1769,8 @@ static int nfs4_fill_super(struct super_block *sb, struct nfs4_mount_data *data,
1687 server->acdirmax = data->acdirmax*HZ; 1769 server->acdirmax = data->acdirmax*HZ;
1688 1770
1689 server->rpc_ops = &nfs_v4_clientops; 1771 server->rpc_ops = &nfs_v4_clientops;
1690 /* Initialize timeout values */
1691
1692 timeparms.to_initval = data->timeo * HZ / 10;
1693 timeparms.to_retries = data->retrans;
1694 timeparms.to_exponential = 1;
1695 if (!timeparms.to_retries)
1696 timeparms.to_retries = 5;
1697 1772
1698 proto = data->proto; 1773 nfs_init_timeout_values(&timeparms, data->proto, data->timeo, data->retrans);
1699 /* Which IP protocol do we use? */
1700 switch (proto) {
1701 case IPPROTO_TCP:
1702 timeparms.to_maxval = RPC_MAX_TCP_TIMEOUT;
1703 if (!timeparms.to_initval)
1704 timeparms.to_initval = 600 * HZ / 10;
1705 break;
1706 case IPPROTO_UDP:
1707 timeparms.to_maxval = RPC_MAX_UDP_TIMEOUT;
1708 if (!timeparms.to_initval)
1709 timeparms.to_initval = 11 * HZ / 10;
1710 break;
1711 default:
1712 return -EINVAL;
1713 }
1714 1774
1715 clp = nfs4_get_client(&server->addr.sin_addr); 1775 clp = nfs4_get_client(&server->addr.sin_addr);
1716 if (!clp) { 1776 if (!clp) {
@@ -1735,7 +1795,7 @@ static int nfs4_fill_super(struct super_block *sb, struct nfs4_mount_data *data,
1735 1795
1736 down_write(&clp->cl_sem); 1796 down_write(&clp->cl_sem);
1737 if (IS_ERR(clp->cl_rpcclient)) { 1797 if (IS_ERR(clp->cl_rpcclient)) {
1738 xprt = xprt_create_proto(proto, &server->addr, &timeparms); 1798 xprt = xprt_create_proto(data->proto, &server->addr, &timeparms);
1739 if (IS_ERR(xprt)) { 1799 if (IS_ERR(xprt)) {
1740 up_write(&clp->cl_sem); 1800 up_write(&clp->cl_sem);
1741 err = PTR_ERR(xprt); 1801 err = PTR_ERR(xprt);
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index d91b69044a4d..59049e864ca7 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -143,7 +143,6 @@ xdr_decode_fattr(u32 *p, struct nfs_fattr *fattr)
143 fattr->mode = (fattr->mode & ~S_IFMT) | S_IFIFO; 143 fattr->mode = (fattr->mode & ~S_IFMT) | S_IFIFO;
144 fattr->rdev = 0; 144 fattr->rdev = 0;
145 } 145 }
146 fattr->timestamp = jiffies;
147 return p; 146 return p;
148} 147}
149 148
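
Dropping fattr->timestamp = jiffies from the XDR decoders pairs with the nfs_fattr_init() calls introduced throughout: freshness is now stamped as time_start before the RPC is sent, so the cache window is measured conservatively from the request, not the reply. nfs_fattr_init() itself is not shown in this diff; a plausible sketch of what such an initializer does (invented helper, user-space time standing in for jiffies):

#include <stdio.h>
#include <time.h>

struct fattr_demo {
        unsigned int valid;     /* which fields the reply filled in */
        long time_start;        /* when the request was issued */
};

static void fattr_init_demo(struct fattr_demo *f)
{
        f->valid = 0;                        /* replaces "fattr->valid = 0" */
        f->time_start = (long)time(NULL);    /* jiffies in the kernel */
}

int main(void)
{
        struct fattr_demo f;

        fattr_init_demo(&f);
        printf("valid=%u time_start=%ld\n", f.valid, f.time_start);
        return 0;
}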
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index edc95514046d..92c870d19ccd 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -78,7 +78,7 @@ nfs3_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
78 int status; 78 int status;
79 79
80 dprintk("%s: call fsinfo\n", __FUNCTION__); 80 dprintk("%s: call fsinfo\n", __FUNCTION__);
81 info->fattr->valid = 0; 81 nfs_fattr_init(info->fattr);
82 status = rpc_call(server->client_sys, NFS3PROC_FSINFO, fhandle, info, 0); 82 status = rpc_call(server->client_sys, NFS3PROC_FSINFO, fhandle, info, 0);
83 dprintk("%s: reply fsinfo: %d\n", __FUNCTION__, status); 83 dprintk("%s: reply fsinfo: %d\n", __FUNCTION__, status);
84 if (!(info->fattr->valid & NFS_ATTR_FATTR)) { 84 if (!(info->fattr->valid & NFS_ATTR_FATTR)) {
@@ -98,7 +98,7 @@ nfs3_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
98 int status; 98 int status;
99 99
100 dprintk("NFS call getattr\n"); 100 dprintk("NFS call getattr\n");
101 fattr->valid = 0; 101 nfs_fattr_init(fattr);
102 status = rpc_call(server->client, NFS3PROC_GETATTR, 102 status = rpc_call(server->client, NFS3PROC_GETATTR,
103 fhandle, fattr, 0); 103 fhandle, fattr, 0);
104 dprintk("NFS reply getattr: %d\n", status); 104 dprintk("NFS reply getattr: %d\n", status);
@@ -117,7 +117,7 @@ nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
117 int status; 117 int status;
118 118
119 dprintk("NFS call setattr\n"); 119 dprintk("NFS call setattr\n");
120 fattr->valid = 0; 120 nfs_fattr_init(fattr);
121 status = rpc_call(NFS_CLIENT(inode), NFS3PROC_SETATTR, &arg, fattr, 0); 121 status = rpc_call(NFS_CLIENT(inode), NFS3PROC_SETATTR, &arg, fattr, 0);
122 if (status == 0) 122 if (status == 0)
123 nfs_setattr_update_inode(inode, sattr); 123 nfs_setattr_update_inode(inode, sattr);
@@ -143,8 +143,8 @@ nfs3_proc_lookup(struct inode *dir, struct qstr *name,
143 int status; 143 int status;
144 144
145 dprintk("NFS call lookup %s\n", name->name); 145 dprintk("NFS call lookup %s\n", name->name);
146 dir_attr.valid = 0; 146 nfs_fattr_init(&dir_attr);
147 fattr->valid = 0; 147 nfs_fattr_init(fattr);
148 status = rpc_call(NFS_CLIENT(dir), NFS3PROC_LOOKUP, &arg, &res, 0); 148 status = rpc_call(NFS_CLIENT(dir), NFS3PROC_LOOKUP, &arg, &res, 0);
149 if (status >= 0 && !(fattr->valid & NFS_ATTR_FATTR)) 149 if (status >= 0 && !(fattr->valid & NFS_ATTR_FATTR))
150 status = rpc_call(NFS_CLIENT(dir), NFS3PROC_GETATTR, 150 status = rpc_call(NFS_CLIENT(dir), NFS3PROC_GETATTR,
@@ -174,7 +174,6 @@ static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry)
174 int status; 174 int status;
175 175
176 dprintk("NFS call access\n"); 176 dprintk("NFS call access\n");
177 fattr.valid = 0;
178 177
179 if (mode & MAY_READ) 178 if (mode & MAY_READ)
180 arg.access |= NFS3_ACCESS_READ; 179 arg.access |= NFS3_ACCESS_READ;
@@ -189,6 +188,7 @@ static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry)
189 if (mode & MAY_EXEC) 188 if (mode & MAY_EXEC)
190 arg.access |= NFS3_ACCESS_EXECUTE; 189 arg.access |= NFS3_ACCESS_EXECUTE;
191 } 190 }
191 nfs_fattr_init(&fattr);
192 status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); 192 status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
193 nfs_refresh_inode(inode, &fattr); 193 nfs_refresh_inode(inode, &fattr);
194 if (status == 0) { 194 if (status == 0) {
@@ -217,7 +217,7 @@ static int nfs3_proc_readlink(struct inode *inode, struct page *page,
217 int status; 217 int status;
218 218
219 dprintk("NFS call readlink\n"); 219 dprintk("NFS call readlink\n");
220 fattr.valid = 0; 220 nfs_fattr_init(&fattr);
221 status = rpc_call(NFS_CLIENT(inode), NFS3PROC_READLINK, 221 status = rpc_call(NFS_CLIENT(inode), NFS3PROC_READLINK,
222 &args, &fattr, 0); 222 &args, &fattr, 0);
223 nfs_refresh_inode(inode, &fattr); 223 nfs_refresh_inode(inode, &fattr);
@@ -240,7 +240,7 @@ static int nfs3_proc_read(struct nfs_read_data *rdata)
240 240
241 dprintk("NFS call read %d @ %Ld\n", rdata->args.count, 241 dprintk("NFS call read %d @ %Ld\n", rdata->args.count,
242 (long long) rdata->args.offset); 242 (long long) rdata->args.offset);
243 fattr->valid = 0; 243 nfs_fattr_init(fattr);
244 status = rpc_call_sync(NFS_CLIENT(inode), &msg, flags); 244 status = rpc_call_sync(NFS_CLIENT(inode), &msg, flags);
245 if (status >= 0) 245 if (status >= 0)
246 nfs_refresh_inode(inode, fattr); 246 nfs_refresh_inode(inode, fattr);
@@ -263,10 +263,10 @@ static int nfs3_proc_write(struct nfs_write_data *wdata)
263 263
264 dprintk("NFS call write %d @ %Ld\n", wdata->args.count, 264 dprintk("NFS call write %d @ %Ld\n", wdata->args.count,
265 (long long) wdata->args.offset); 265 (long long) wdata->args.offset);
266 fattr->valid = 0; 266 nfs_fattr_init(fattr);
267 status = rpc_call_sync(NFS_CLIENT(inode), &msg, rpcflags); 267 status = rpc_call_sync(NFS_CLIENT(inode), &msg, rpcflags);
268 if (status >= 0) 268 if (status >= 0)
269 nfs_refresh_inode(inode, fattr); 269 nfs_post_op_update_inode(inode, fattr);
270 dprintk("NFS reply write: %d\n", status); 270 dprintk("NFS reply write: %d\n", status);
271 return status < 0? status : wdata->res.count; 271 return status < 0? status : wdata->res.count;
272} 272}
@@ -285,10 +285,10 @@ static int nfs3_proc_commit(struct nfs_write_data *cdata)
285 285
286 dprintk("NFS call commit %d @ %Ld\n", cdata->args.count, 286 dprintk("NFS call commit %d @ %Ld\n", cdata->args.count,
287 (long long) cdata->args.offset); 287 (long long) cdata->args.offset);
288 fattr->valid = 0; 288 nfs_fattr_init(fattr);
289 status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); 289 status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
290 if (status >= 0) 290 if (status >= 0)
291 nfs_refresh_inode(inode, fattr); 291 nfs_post_op_update_inode(inode, fattr);
292 dprintk("NFS reply commit: %d\n", status); 292 dprintk("NFS reply commit: %d\n", status);
293 return status; 293 return status;
294} 294}
@@ -299,7 +299,7 @@ static int nfs3_proc_commit(struct nfs_write_data *cdata)
299 */ 299 */
300static int 300static int
301nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, 301nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
302 int flags) 302 int flags, struct nameidata *nd)
303{ 303{
304 struct nfs_fh fhandle; 304 struct nfs_fh fhandle;
305 struct nfs_fattr fattr; 305 struct nfs_fattr fattr;
@@ -329,10 +329,10 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
329 sattr->ia_mode &= ~current->fs->umask; 329 sattr->ia_mode &= ~current->fs->umask;
330 330
331again: 331again:
332 dir_attr.valid = 0; 332 nfs_fattr_init(&dir_attr);
333 fattr.valid = 0; 333 nfs_fattr_init(&fattr);
334 status = rpc_call(NFS_CLIENT(dir), NFS3PROC_CREATE, &arg, &res, 0); 334 status = rpc_call(NFS_CLIENT(dir), NFS3PROC_CREATE, &arg, &res, 0);
335 nfs_refresh_inode(dir, &dir_attr); 335 nfs_post_op_update_inode(dir, &dir_attr);
336 336
337 /* If the server doesn't support the exclusive creation semantics, 337 /* If the server doesn't support the exclusive creation semantics,
338 * try again with simple 'guarded' mode. */ 338 * try again with simple 'guarded' mode. */
@@ -401,9 +401,9 @@ nfs3_proc_remove(struct inode *dir, struct qstr *name)
401 int status; 401 int status;
402 402
403 dprintk("NFS call remove %s\n", name->name); 403 dprintk("NFS call remove %s\n", name->name);
404 dir_attr.valid = 0; 404 nfs_fattr_init(&dir_attr);
405 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 405 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
406 nfs_refresh_inode(dir, &dir_attr); 406 nfs_post_op_update_inode(dir, &dir_attr);
407 dprintk("NFS reply remove: %d\n", status); 407 dprintk("NFS reply remove: %d\n", status);
408 return status; 408 return status;
409} 409}
@@ -422,7 +422,7 @@ nfs3_proc_unlink_setup(struct rpc_message *msg, struct dentry *dir, struct qstr
422 ptr->arg.fh = NFS_FH(dir->d_inode); 422 ptr->arg.fh = NFS_FH(dir->d_inode);
423 ptr->arg.name = name->name; 423 ptr->arg.name = name->name;
424 ptr->arg.len = name->len; 424 ptr->arg.len = name->len;
425 ptr->res.valid = 0; 425 nfs_fattr_init(&ptr->res);
426 msg->rpc_proc = &nfs3_procedures[NFS3PROC_REMOVE]; 426 msg->rpc_proc = &nfs3_procedures[NFS3PROC_REMOVE];
427 msg->rpc_argp = &ptr->arg; 427 msg->rpc_argp = &ptr->arg;
428 msg->rpc_resp = &ptr->res; 428 msg->rpc_resp = &ptr->res;
@@ -439,7 +439,7 @@ nfs3_proc_unlink_done(struct dentry *dir, struct rpc_task *task)
439 return 1; 439 return 1;
440 if (msg->rpc_argp) { 440 if (msg->rpc_argp) {
441 dir_attr = (struct nfs_fattr*)msg->rpc_resp; 441 dir_attr = (struct nfs_fattr*)msg->rpc_resp;
442 nfs_refresh_inode(dir->d_inode, dir_attr); 442 nfs_post_op_update_inode(dir->d_inode, dir_attr);
443 kfree(msg->rpc_argp); 443 kfree(msg->rpc_argp);
444 } 444 }
445 return 0; 445 return 0;
@@ -465,11 +465,11 @@ nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name,
465 int status; 465 int status;
466 466
467 dprintk("NFS call rename %s -> %s\n", old_name->name, new_name->name); 467 dprintk("NFS call rename %s -> %s\n", old_name->name, new_name->name);
468 old_dir_attr.valid = 0; 468 nfs_fattr_init(&old_dir_attr);
469 new_dir_attr.valid = 0; 469 nfs_fattr_init(&new_dir_attr);
470 status = rpc_call(NFS_CLIENT(old_dir), NFS3PROC_RENAME, &arg, &res, 0); 470 status = rpc_call(NFS_CLIENT(old_dir), NFS3PROC_RENAME, &arg, &res, 0);
471 nfs_refresh_inode(old_dir, &old_dir_attr); 471 nfs_post_op_update_inode(old_dir, &old_dir_attr);
472 nfs_refresh_inode(new_dir, &new_dir_attr); 472 nfs_post_op_update_inode(new_dir, &new_dir_attr);
473 dprintk("NFS reply rename: %d\n", status); 473 dprintk("NFS reply rename: %d\n", status);
474 return status; 474 return status;
475} 475}
@@ -491,11 +491,11 @@ nfs3_proc_link(struct inode *inode, struct inode *dir, struct qstr *name)
491 int status; 491 int status;
492 492
493 dprintk("NFS call link %s\n", name->name); 493 dprintk("NFS call link %s\n", name->name);
494 dir_attr.valid = 0; 494 nfs_fattr_init(&dir_attr);
495 fattr.valid = 0; 495 nfs_fattr_init(&fattr);
496 status = rpc_call(NFS_CLIENT(inode), NFS3PROC_LINK, &arg, &res, 0); 496 status = rpc_call(NFS_CLIENT(inode), NFS3PROC_LINK, &arg, &res, 0);
497 nfs_refresh_inode(dir, &dir_attr); 497 nfs_post_op_update_inode(dir, &dir_attr);
498 nfs_refresh_inode(inode, &fattr); 498 nfs_post_op_update_inode(inode, &fattr);
499 dprintk("NFS reply link: %d\n", status); 499 dprintk("NFS reply link: %d\n", status);
500 return status; 500 return status;
501} 501}
@@ -524,10 +524,10 @@ nfs3_proc_symlink(struct inode *dir, struct qstr *name, struct qstr *path,
524 if (path->len > NFS3_MAXPATHLEN) 524 if (path->len > NFS3_MAXPATHLEN)
525 return -ENAMETOOLONG; 525 return -ENAMETOOLONG;
526 dprintk("NFS call symlink %s -> %s\n", name->name, path->name); 526 dprintk("NFS call symlink %s -> %s\n", name->name, path->name);
527 dir_attr.valid = 0; 527 nfs_fattr_init(&dir_attr);
528 fattr->valid = 0; 528 nfs_fattr_init(fattr);
529 status = rpc_call(NFS_CLIENT(dir), NFS3PROC_SYMLINK, &arg, &res, 0); 529 status = rpc_call(NFS_CLIENT(dir), NFS3PROC_SYMLINK, &arg, &res, 0);
530 nfs_refresh_inode(dir, &dir_attr); 530 nfs_post_op_update_inode(dir, &dir_attr);
531 dprintk("NFS reply symlink: %d\n", status); 531 dprintk("NFS reply symlink: %d\n", status);
532 return status; 532 return status;
533} 533}
@@ -552,13 +552,13 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
552 int status; 552 int status;
553 553
554 dprintk("NFS call mkdir %s\n", dentry->d_name.name); 554 dprintk("NFS call mkdir %s\n", dentry->d_name.name);
555 dir_attr.valid = 0;
556 fattr.valid = 0;
557 555
558 sattr->ia_mode &= ~current->fs->umask; 556 sattr->ia_mode &= ~current->fs->umask;
559 557
558 nfs_fattr_init(&dir_attr);
559 nfs_fattr_init(&fattr);
560 status = rpc_call(NFS_CLIENT(dir), NFS3PROC_MKDIR, &arg, &res, 0); 560 status = rpc_call(NFS_CLIENT(dir), NFS3PROC_MKDIR, &arg, &res, 0);
561 nfs_refresh_inode(dir, &dir_attr); 561 nfs_post_op_update_inode(dir, &dir_attr);
562 if (status != 0) 562 if (status != 0)
563 goto out; 563 goto out;
564 status = nfs_instantiate(dentry, &fhandle, &fattr); 564 status = nfs_instantiate(dentry, &fhandle, &fattr);
@@ -582,9 +582,9 @@ nfs3_proc_rmdir(struct inode *dir, struct qstr *name)
582 int status; 582 int status;
583 583
584 dprintk("NFS call rmdir %s\n", name->name); 584 dprintk("NFS call rmdir %s\n", name->name);
585 dir_attr.valid = 0; 585 nfs_fattr_init(&dir_attr);
586 status = rpc_call(NFS_CLIENT(dir), NFS3PROC_RMDIR, &arg, &dir_attr, 0); 586 status = rpc_call(NFS_CLIENT(dir), NFS3PROC_RMDIR, &arg, &dir_attr, 0);
587 nfs_refresh_inode(dir, &dir_attr); 587 nfs_post_op_update_inode(dir, &dir_attr);
588 dprintk("NFS reply rmdir: %d\n", status); 588 dprintk("NFS reply rmdir: %d\n", status);
589 return status; 589 return status;
590} 590}
@@ -634,7 +634,7 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
634 dprintk("NFS call readdir%s %d\n", 634 dprintk("NFS call readdir%s %d\n",
635 plus? "plus" : "", (unsigned int) cookie); 635 plus? "plus" : "", (unsigned int) cookie);
636 636
637 dir_attr.valid = 0; 637 nfs_fattr_init(&dir_attr);
638 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 638 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
639 nfs_refresh_inode(dir, &dir_attr); 639 nfs_refresh_inode(dir, &dir_attr);
640 dprintk("NFS reply readdir: %d\n", status); 640 dprintk("NFS reply readdir: %d\n", status);
@@ -676,10 +676,10 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
676 676
677 sattr->ia_mode &= ~current->fs->umask; 677 sattr->ia_mode &= ~current->fs->umask;
678 678
679 dir_attr.valid = 0; 679 nfs_fattr_init(&dir_attr);
680 fattr.valid = 0; 680 nfs_fattr_init(&fattr);
681 status = rpc_call(NFS_CLIENT(dir), NFS3PROC_MKNOD, &arg, &res, 0); 681 status = rpc_call(NFS_CLIENT(dir), NFS3PROC_MKNOD, &arg, &res, 0);
682 nfs_refresh_inode(dir, &dir_attr); 682 nfs_post_op_update_inode(dir, &dir_attr);
683 if (status != 0) 683 if (status != 0)
684 goto out; 684 goto out;
685 status = nfs_instantiate(dentry, &fh, &fattr); 685 status = nfs_instantiate(dentry, &fh, &fattr);
@@ -698,7 +698,7 @@ nfs3_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle,
698 int status; 698 int status;
699 699
700 dprintk("NFS call fsstat\n"); 700 dprintk("NFS call fsstat\n");
701 stat->fattr->valid = 0; 701 nfs_fattr_init(stat->fattr);
702 status = rpc_call(server->client, NFS3PROC_FSSTAT, fhandle, stat, 0); 702 status = rpc_call(server->client, NFS3PROC_FSSTAT, fhandle, stat, 0);
703 dprintk("NFS reply statfs: %d\n", status); 703 dprintk("NFS reply statfs: %d\n", status);
704 return status; 704 return status;
@@ -711,7 +711,7 @@ nfs3_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle,
711 int status; 711 int status;
712 712
713 dprintk("NFS call fsinfo\n"); 713 dprintk("NFS call fsinfo\n");
714 info->fattr->valid = 0; 714 nfs_fattr_init(info->fattr);
715 status = rpc_call(server->client_sys, NFS3PROC_FSINFO, fhandle, info, 0); 715 status = rpc_call(server->client_sys, NFS3PROC_FSINFO, fhandle, info, 0);
716 dprintk("NFS reply fsinfo: %d\n", status); 716 dprintk("NFS reply fsinfo: %d\n", status);
717 return status; 717 return status;
@@ -724,7 +724,7 @@ nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
724 int status; 724 int status;
725 725
726 dprintk("NFS call pathconf\n"); 726 dprintk("NFS call pathconf\n");
727 info->fattr->valid = 0; 727 nfs_fattr_init(info->fattr);
728 status = rpc_call(server->client, NFS3PROC_PATHCONF, fhandle, info, 0); 728 status = rpc_call(server->client, NFS3PROC_PATHCONF, fhandle, info, 0);
729 dprintk("NFS reply pathconf: %d\n", status); 729 dprintk("NFS reply pathconf: %d\n", status);
730 return status; 730 return status;
@@ -735,7 +735,7 @@ extern u32 *nfs3_decode_dirent(u32 *, struct nfs_entry *, int);
735static void 735static void
736nfs3_read_done(struct rpc_task *task) 736nfs3_read_done(struct rpc_task *task)
737{ 737{
738 struct nfs_write_data *data = (struct nfs_write_data *) task->tk_calldata; 738 struct nfs_read_data *data = (struct nfs_read_data *) task->tk_calldata;
739 739
740 if (nfs3_async_handle_jukebox(task)) 740 if (nfs3_async_handle_jukebox(task))
741 return; 741 return;
@@ -775,7 +775,7 @@ nfs3_write_done(struct rpc_task *task)
775 return; 775 return;
776 data = (struct nfs_write_data *)task->tk_calldata; 776 data = (struct nfs_write_data *)task->tk_calldata;
777 if (task->tk_status >= 0) 777 if (task->tk_status >= 0)
778 nfs_refresh_inode(data->inode, data->res.fattr); 778 nfs_post_op_update_inode(data->inode, data->res.fattr);
779 nfs_writeback_done(task); 779 nfs_writeback_done(task);
780} 780}
781 781
@@ -819,7 +819,7 @@ nfs3_commit_done(struct rpc_task *task)
819 return; 819 return;
820 data = (struct nfs_write_data *)task->tk_calldata; 820 data = (struct nfs_write_data *)task->tk_calldata;
821 if (task->tk_status >= 0) 821 if (task->tk_status >= 0)
822 nfs_refresh_inode(data->inode, data->res.fattr); 822 nfs_post_op_update_inode(data->inode, data->res.fattr);
823 nfs_commit_done(task); 823 nfs_commit_done(task);
824} 824}
825 825
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index db4a904810a4..0498bd36602c 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -174,7 +174,6 @@ xdr_decode_fattr(u32 *p, struct nfs_fattr *fattr)
174 174
175 /* Update the mode bits */ 175 /* Update the mode bits */
176 fattr->valid |= (NFS_ATTR_FATTR | NFS_ATTR_FATTR_V3); 176 fattr->valid |= (NFS_ATTR_FATTR | NFS_ATTR_FATTR_V3);
177 fattr->timestamp = jiffies;
178 return p; 177 return p;
179} 178}
180 179
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index ec1a22d7b876..78a53f5a9f18 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -93,25 +93,50 @@ struct nfs4_client {
93}; 93};
94 94
95/* 95/*
96 * struct rpc_sequence ensures that RPC calls are sent in the exact
97 * order that they appear on the list.
98 */
99struct rpc_sequence {
100 struct rpc_wait_queue wait; /* RPC call delay queue */
101 spinlock_t lock; /* Protects the list */
102 struct list_head list; /* Defines sequence of RPC calls */
103};
104
105#define NFS_SEQID_CONFIRMED 1
106struct nfs_seqid_counter {
107 struct rpc_sequence *sequence;
108 int flags;
109 u32 counter;
110};
111
112struct nfs_seqid {
113 struct nfs_seqid_counter *sequence;
114 struct list_head list;
115};
116
117static inline void nfs_confirm_seqid(struct nfs_seqid_counter *seqid, int status)
118{
119 if (seqid_mutating_err(-status))
120 seqid->flags |= NFS_SEQID_CONFIRMED;
121}
122
123/*
96 * NFS4 state_owners and lock_owners are simply labels for ordered 124 * NFS4 state_owners and lock_owners are simply labels for ordered
97 * sequences of RPC calls. Their sole purpose is to provide once-only 125 * sequences of RPC calls. Their sole purpose is to provide once-only
98 * semantics by allowing the server to identify replayed requests. 126 * semantics by allowing the server to identify replayed requests.
99 *
100 * The ->so_sema is held during all state_owner seqid-mutating operations:
101 * OPEN, OPEN_DOWNGRADE, and CLOSE. Its purpose is to properly serialize
102 * so_seqid.
103 */ 127 */
104struct nfs4_state_owner { 128struct nfs4_state_owner {
129 spinlock_t so_lock;
105 struct list_head so_list; /* per-clientid list of state_owners */ 130 struct list_head so_list; /* per-clientid list of state_owners */
106 struct nfs4_client *so_client; 131 struct nfs4_client *so_client;
107 u32 so_id; /* 32-bit identifier, unique */ 132 u32 so_id; /* 32-bit identifier, unique */
108 struct semaphore so_sema;
109 u32 so_seqid; /* protected by so_sema */
110 atomic_t so_count; 133 atomic_t so_count;
111 134
112 struct rpc_cred *so_cred; /* Associated cred */ 135 struct rpc_cred *so_cred; /* Associated cred */
113 struct list_head so_states; 136 struct list_head so_states;
114 struct list_head so_delegations; 137 struct list_head so_delegations;
138 struct nfs_seqid_counter so_seqid;
139 struct rpc_sequence so_sequence;
115}; 140};
116 141
117/* 142/*
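
The per-owner u32 so_seqid guarded by a semaphore becomes a struct nfs_seqid_counter fed by per-call struct nfs_seqid objects queued on an rpc_sequence, which is what allows so_sema and lock_sema to be removed later in this patch. A toy model of the counter discipline (the mutating-error predicate here is an invented placeholder, not the kernel's seqid_mutating_err()):

#include <stdio.h>
#include <errno.h>

#define SEQID_CONFIRMED_DEMO 1

struct seqid_counter_demo {
        int flags;
        unsigned int counter;
};

static int seqid_mutating_err_demo(int err)
{
        /* placeholder: errors like NFS4ERR_BAD_SEQID must NOT bump it */
        return err != EBADF;
}

static void increment_seqid_demo(int status, struct seqid_counter_demo *c)
{
        if (seqid_mutating_err_demo(-status))
                c->counter++;               /* server saw this seqid */
        if (status == 0)
                c->flags |= SEQID_CONFIRMED_DEMO;
}

int main(void)
{
        struct seqid_counter_demo c = { 0, 0 };

        increment_seqid_demo(0, &c);
        printf("counter=%u flags=%d\n", c.counter, c.flags);
        return 0;
}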
@@ -132,7 +157,7 @@ struct nfs4_lock_state {
132 fl_owner_t ls_owner; /* POSIX lock owner */ 157 fl_owner_t ls_owner; /* POSIX lock owner */
133#define NFS_LOCK_INITIALIZED 1 158#define NFS_LOCK_INITIALIZED 1
134 int ls_flags; 159 int ls_flags;
135 u32 ls_seqid; 160 struct nfs_seqid_counter ls_seqid;
136 u32 ls_id; 161 u32 ls_id;
137 nfs4_stateid ls_stateid; 162 nfs4_stateid ls_stateid;
138 atomic_t ls_count; 163 atomic_t ls_count;
@@ -153,7 +178,6 @@ struct nfs4_state {
 153 struct inode *inode; /* Pointer to the inode */ 178 struct inode *inode; /* Pointer to the inode */
154 179
155 unsigned long flags; /* Do we hold any locks? */ 180 unsigned long flags; /* Do we hold any locks? */
156 struct semaphore lock_sema; /* Serializes file locking operations */
157 spinlock_t state_lock; /* Protects the lock_states list */ 181 spinlock_t state_lock; /* Protects the lock_states list */
158 182
159 nfs4_stateid stateid; 183 nfs4_stateid stateid;
@@ -191,8 +215,8 @@ extern int nfs4_proc_setclientid_confirm(struct nfs4_client *);
191extern int nfs4_proc_async_renew(struct nfs4_client *); 215extern int nfs4_proc_async_renew(struct nfs4_client *);
192extern int nfs4_proc_renew(struct nfs4_client *); 216extern int nfs4_proc_renew(struct nfs4_client *);
193extern int nfs4_do_close(struct inode *inode, struct nfs4_state *state, mode_t mode); 217extern int nfs4_do_close(struct inode *inode, struct nfs4_state *state, mode_t mode);
194extern struct inode *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *); 218extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *);
195extern int nfs4_open_revalidate(struct inode *, struct dentry *, int); 219extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *);
196 220
197extern struct nfs4_state_recovery_ops nfs4_reboot_recovery_ops; 221extern struct nfs4_state_recovery_ops nfs4_reboot_recovery_ops;
198extern struct nfs4_state_recovery_ops nfs4_network_partition_recovery_ops; 222extern struct nfs4_state_recovery_ops nfs4_network_partition_recovery_ops;
@@ -224,12 +248,17 @@ extern struct nfs4_state * nfs4_get_open_state(struct inode *, struct nfs4_state
224extern void nfs4_put_open_state(struct nfs4_state *); 248extern void nfs4_put_open_state(struct nfs4_state *);
225extern void nfs4_close_state(struct nfs4_state *, mode_t); 249extern void nfs4_close_state(struct nfs4_state *, mode_t);
226extern struct nfs4_state *nfs4_find_state(struct inode *, struct rpc_cred *, mode_t mode); 250extern struct nfs4_state *nfs4_find_state(struct inode *, struct rpc_cred *, mode_t mode);
227extern void nfs4_increment_seqid(int status, struct nfs4_state_owner *sp);
228extern void nfs4_schedule_state_recovery(struct nfs4_client *); 251extern void nfs4_schedule_state_recovery(struct nfs4_client *);
252extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp);
229extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); 253extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl);
230extern void nfs4_increment_lock_seqid(int status, struct nfs4_lock_state *ls);
231extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t); 254extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t);
232 255
256extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter);
257extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task);
258extern void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid);
259extern void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid);
260extern void nfs_free_seqid(struct nfs_seqid *seqid);
261
233extern const nfs4_stateid zero_stateid; 262extern const nfs4_stateid zero_stateid;
234 263
235/* nfs4xdr.c */ 264/* nfs4xdr.c */
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 9701ca8c9428..933e13b383f8 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -47,6 +47,7 @@
47#include <linux/nfs_page.h> 47#include <linux/nfs_page.h>
48#include <linux/smp_lock.h> 48#include <linux/smp_lock.h>
49#include <linux/namei.h> 49#include <linux/namei.h>
50#include <linux/mount.h>
50 51
51#include "nfs4_fs.h" 52#include "nfs4_fs.h"
52#include "delegation.h" 53#include "delegation.h"
@@ -56,10 +57,11 @@
56#define NFS4_POLL_RETRY_MIN (1*HZ) 57#define NFS4_POLL_RETRY_MIN (1*HZ)
57#define NFS4_POLL_RETRY_MAX (15*HZ) 58#define NFS4_POLL_RETRY_MAX (15*HZ)
58 59
60static int _nfs4_proc_open_confirm(struct rpc_clnt *clnt, const struct nfs_fh *fh, struct nfs4_state_owner *sp, nfs4_stateid *stateid, struct nfs_seqid *seqid);
59static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); 61static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
60static int nfs4_async_handle_error(struct rpc_task *, struct nfs_server *); 62static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *);
61static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry); 63static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry);
62static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_exception *exception); 64static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception);
63extern u32 *nfs4_decode_dirent(u32 *p, struct nfs_entry *entry, int plus); 65extern u32 *nfs4_decode_dirent(u32 *p, struct nfs_entry *entry, int plus);
64extern struct rpc_procinfo nfs4_procedures[]; 66extern struct rpc_procinfo nfs4_procedures[];
65 67
@@ -185,8 +187,26 @@ static void update_changeattr(struct inode *inode, struct nfs4_change_info *cinf
185{ 187{
186 struct nfs_inode *nfsi = NFS_I(inode); 188 struct nfs_inode *nfsi = NFS_I(inode);
187 189
190 spin_lock(&inode->i_lock);
191 nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
188 if (cinfo->before == nfsi->change_attr && cinfo->atomic) 192 if (cinfo->before == nfsi->change_attr && cinfo->atomic)
189 nfsi->change_attr = cinfo->after; 193 nfsi->change_attr = cinfo->after;
194 spin_unlock(&inode->i_lock);
195}
196
197/* Helper for asynchronous RPC calls */
198static int nfs4_call_async(struct rpc_clnt *clnt, rpc_action tk_begin,
199 rpc_action tk_exit, void *calldata)
200{
201 struct rpc_task *task;
202
203 if (!(task = rpc_new_task(clnt, tk_exit, RPC_TASK_ASYNC)))
204 return -ENOMEM;
205
206 task->tk_calldata = calldata;
207 task->tk_action = tk_begin;
208 rpc_execute(task);
209 return 0;
190} 210}
191 211
192static void update_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, int open_flags) 212static void update_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, int open_flags)
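
nfs4_call_async() is a small fire-and-forget helper: allocate an async rpc_task, attach private calldata and the begin action, and let rpc_execute() drive it through to the tk_exit callback. The shape of that contract, modelled synchronously with plain function pointers:

#include <stdio.h>

struct task_demo {
        void (*begin)(struct task_demo *);  /* tk_begin: encode and send */
        void (*done)(struct task_demo *);   /* tk_exit: check status, free */
        void *calldata;                     /* tk_calldata */
};

static void run_task_demo(struct task_demo *t)
{
        t->begin(t);
        t->done(t);
}

static void begin_demo(struct task_demo *t) { printf("begin %s\n", (char *)t->calldata); }
static void done_demo(struct task_demo *t)  { printf("done %s\n", (char *)t->calldata); }

int main(void)
{
        struct task_demo t = { begin_demo, done_demo, "close" };

        run_task_demo(&t);
        return 0;
}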
@@ -195,6 +215,7 @@ static void update_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid,
195 215
196 open_flags &= (FMODE_READ|FMODE_WRITE); 216 open_flags &= (FMODE_READ|FMODE_WRITE);
197 /* Protect against nfs4_find_state() */ 217 /* Protect against nfs4_find_state() */
218 spin_lock(&state->owner->so_lock);
198 spin_lock(&inode->i_lock); 219 spin_lock(&inode->i_lock);
199 state->state |= open_flags; 220 state->state |= open_flags;
200 /* NB! List reordering - see the reclaim code for why. */ 221 /* NB! List reordering - see the reclaim code for why. */
@@ -204,12 +225,12 @@ static void update_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid,
204 state->nreaders++; 225 state->nreaders++;
205 memcpy(&state->stateid, stateid, sizeof(state->stateid)); 226 memcpy(&state->stateid, stateid, sizeof(state->stateid));
206 spin_unlock(&inode->i_lock); 227 spin_unlock(&inode->i_lock);
228 spin_unlock(&state->owner->so_lock);
207} 229}
208 230
209/* 231/*
210 * OPEN_RECLAIM: 232 * OPEN_RECLAIM:
211 * reclaim state on the server after a reboot. 233 * reclaim state on the server after a reboot.
212 * Assumes caller is holding the sp->so_sem
213 */ 234 */
214static int _nfs4_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *state) 235static int _nfs4_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *state)
215{ 236{
@@ -218,7 +239,6 @@ static int _nfs4_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *st
218 struct nfs_delegation *delegation = NFS_I(inode)->delegation; 239 struct nfs_delegation *delegation = NFS_I(inode)->delegation;
219 struct nfs_openargs o_arg = { 240 struct nfs_openargs o_arg = {
220 .fh = NFS_FH(inode), 241 .fh = NFS_FH(inode),
221 .seqid = sp->so_seqid,
222 .id = sp->so_id, 242 .id = sp->so_id,
223 .open_flags = state->state, 243 .open_flags = state->state,
224 .clientid = server->nfs4_state->cl_clientid, 244 .clientid = server->nfs4_state->cl_clientid,
@@ -245,8 +265,13 @@ static int _nfs4_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *st
245 } 265 }
246 o_arg.u.delegation_type = delegation->type; 266 o_arg.u.delegation_type = delegation->type;
247 } 267 }
268 o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid);
269 if (o_arg.seqid == NULL)
270 return -ENOMEM;
248 status = rpc_call_sync(server->client, &msg, RPC_TASK_NOINTR); 271 status = rpc_call_sync(server->client, &msg, RPC_TASK_NOINTR);
249 nfs4_increment_seqid(status, sp); 272 /* Confirm the sequence as being established */
273 nfs_confirm_seqid(&sp->so_seqid, status);
274 nfs_increment_open_seqid(status, o_arg.seqid);
250 if (status == 0) { 275 if (status == 0) {
251 memcpy(&state->stateid, &o_res.stateid, sizeof(state->stateid)); 276 memcpy(&state->stateid, &o_res.stateid, sizeof(state->stateid));
252 if (o_res.delegation_type != 0) { 277 if (o_res.delegation_type != 0) {
@@ -256,6 +281,7 @@ static int _nfs4_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *st
256 nfs_async_inode_return_delegation(inode, &o_res.stateid); 281 nfs_async_inode_return_delegation(inode, &o_res.stateid);
257 } 282 }
258 } 283 }
284 nfs_free_seqid(o_arg.seqid);
259 clear_bit(NFS_DELEGATED_STATE, &state->flags); 285 clear_bit(NFS_DELEGATED_STATE, &state->flags);
260 /* Ensure we update the inode attributes */ 286 /* Ensure we update the inode attributes */
261 NFS_CACHEINV(inode); 287 NFS_CACHEINV(inode);
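The pattern introduced here recurs through the rest of the diff: the bare so_seqid counter, formerly serialized by sp->so_sema, becomes a heap-allocated struct nfs_seqid that is allocated before the RPC, bumped by nfs_increment_open_seqid() according to the reply status, and freed on every exit path. A compressed sketch of that allocate/increment/free lifecycle (all identifiers below are stand-ins, and the real increment rule also covers the seqid-mutating NFS errors):

#include <errno.h>
#include <stdlib.h>

struct seqid_counter { unsigned int value; };
struct seqid { struct seqid_counter *counter; };

static struct seqid *seqid_alloc(struct seqid_counter *c)
{
	struct seqid *s = malloc(sizeof(*s));

	if (s != NULL)
		s->counter = c;
	return s;
}

/* Simplified rule: bump only when the server saw the request. */
static void seqid_increment(int status, struct seqid *s)
{
	if (status == 0)
		s->counter->value++;
}

static void seqid_free(struct seqid *s)
{
	free(s);
}

static int do_open_reclaim(struct seqid_counter *so_seqid)
{
	struct seqid *seqid = seqid_alloc(so_seqid);
	int status;

	if (seqid == NULL)
		return -ENOMEM;
	status = 0;			/* stand-in for rpc_call_sync() */
	seqid_increment(status, seqid);
	seqid_free(seqid);		/* freed on every exit path */
	return status;
}

int main(void)
{
	struct seqid_counter so_seqid = { 0 };

	return do_open_reclaim(&so_seqid);
}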
@@ -302,23 +328,35 @@ static int _nfs4_open_delegation_recall(struct dentry *dentry, struct nfs4_state
302 }; 328 };
303 int status = 0; 329 int status = 0;
304 330
305 down(&sp->so_sema);
306 if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) 331 if (!test_bit(NFS_DELEGATED_STATE, &state->flags))
307 goto out; 332 goto out;
308 if (state->state == 0) 333 if (state->state == 0)
309 goto out; 334 goto out;
310 arg.seqid = sp->so_seqid; 335 arg.seqid = nfs_alloc_seqid(&sp->so_seqid);
336 status = -ENOMEM;
337 if (arg.seqid == NULL)
338 goto out;
311 arg.open_flags = state->state; 339 arg.open_flags = state->state;
312 memcpy(arg.u.delegation.data, state->stateid.data, sizeof(arg.u.delegation.data)); 340 memcpy(arg.u.delegation.data, state->stateid.data, sizeof(arg.u.delegation.data));
313 status = rpc_call_sync(server->client, &msg, RPC_TASK_NOINTR); 341 status = rpc_call_sync(server->client, &msg, RPC_TASK_NOINTR);
314 nfs4_increment_seqid(status, sp); 342 nfs_increment_open_seqid(status, arg.seqid);
343 if (status != 0)
344 goto out_free;
345 if(res.rflags & NFS4_OPEN_RESULT_CONFIRM) {
346 status = _nfs4_proc_open_confirm(server->client, NFS_FH(inode),
347 sp, &res.stateid, arg.seqid);
348 if (status != 0)
349 goto out_free;
350 }
351 nfs_confirm_seqid(&sp->so_seqid, 0);
315 if (status >= 0) { 352 if (status >= 0) {
316 memcpy(state->stateid.data, res.stateid.data, 353 memcpy(state->stateid.data, res.stateid.data,
317 sizeof(state->stateid.data)); 354 sizeof(state->stateid.data));
318 clear_bit(NFS_DELEGATED_STATE, &state->flags); 355 clear_bit(NFS_DELEGATED_STATE, &state->flags);
319 } 356 }
357out_free:
358 nfs_free_seqid(arg.seqid);
320out: 359out:
321 up(&sp->so_sema);
322 dput(parent); 360 dput(parent);
323 return status; 361 return status;
324} 362}
@@ -345,11 +383,11 @@ int nfs4_open_delegation_recall(struct dentry *dentry, struct nfs4_state *state)
345 return err; 383 return err;
346} 384}
347 385
348static inline int _nfs4_proc_open_confirm(struct rpc_clnt *clnt, const struct nfs_fh *fh, struct nfs4_state_owner *sp, nfs4_stateid *stateid) 386static int _nfs4_proc_open_confirm(struct rpc_clnt *clnt, const struct nfs_fh *fh, struct nfs4_state_owner *sp, nfs4_stateid *stateid, struct nfs_seqid *seqid)
349{ 387{
350 struct nfs_open_confirmargs arg = { 388 struct nfs_open_confirmargs arg = {
351 .fh = fh, 389 .fh = fh,
352 .seqid = sp->so_seqid, 390 .seqid = seqid,
353 .stateid = *stateid, 391 .stateid = *stateid,
354 }; 392 };
355 struct nfs_open_confirmres res; 393 struct nfs_open_confirmres res;
@@ -362,7 +400,9 @@ static inline int _nfs4_proc_open_confirm(struct rpc_clnt *clnt, const struct nf
362 int status; 400 int status;
363 401
364 status = rpc_call_sync(clnt, &msg, RPC_TASK_NOINTR); 402 status = rpc_call_sync(clnt, &msg, RPC_TASK_NOINTR);
365 nfs4_increment_seqid(status, sp); 403 /* Confirm the sequence as being established */
404 nfs_confirm_seqid(&sp->so_seqid, status);
405 nfs_increment_open_seqid(status, seqid);
366 if (status >= 0) 406 if (status >= 0)
367 memcpy(stateid, &res.stateid, sizeof(*stateid)); 407 memcpy(stateid, &res.stateid, sizeof(*stateid));
368 return status; 408 return status;
@@ -380,21 +420,41 @@ static int _nfs4_proc_open(struct inode *dir, struct nfs4_state_owner *sp, stru
380 int status; 420 int status;
381 421
382 /* Update sequence id. The caller must serialize! */ 422 /* Update sequence id. The caller must serialize! */
383 o_arg->seqid = sp->so_seqid;
384 o_arg->id = sp->so_id; 423 o_arg->id = sp->so_id;
385 o_arg->clientid = sp->so_client->cl_clientid; 424 o_arg->clientid = sp->so_client->cl_clientid;
386 425
387 status = rpc_call_sync(server->client, &msg, RPC_TASK_NOINTR); 426 status = rpc_call_sync(server->client, &msg, RPC_TASK_NOINTR);
388 nfs4_increment_seqid(status, sp); 427 if (status == 0) {
428 /* OPEN on anything except a regular file is disallowed in NFSv4 */
429 switch (o_res->f_attr->mode & S_IFMT) {
430 case S_IFREG:
431 break;
432 case S_IFLNK:
433 status = -ELOOP;
434 break;
435 case S_IFDIR:
436 status = -EISDIR;
437 break;
438 default:
439 status = -ENOTDIR;
440 }
441 }
442
443 nfs_increment_open_seqid(status, o_arg->seqid);
389 if (status != 0) 444 if (status != 0)
390 goto out; 445 goto out;
391 update_changeattr(dir, &o_res->cinfo); 446 if (o_arg->open_flags & O_CREAT) {
447 update_changeattr(dir, &o_res->cinfo);
448 nfs_post_op_update_inode(dir, o_res->dir_attr);
449 } else
450 nfs_refresh_inode(dir, o_res->dir_attr);
392 if(o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) { 451 if(o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) {
393 status = _nfs4_proc_open_confirm(server->client, &o_res->fh, 452 status = _nfs4_proc_open_confirm(server->client, &o_res->fh,
394 sp, &o_res->stateid); 453 sp, &o_res->stateid, o_arg->seqid);
395 if (status != 0) 454 if (status != 0)
396 goto out; 455 goto out;
397 } 456 }
457 nfs_confirm_seqid(&sp->so_seqid, 0);
398 if (!(o_res->f_attr->valid & NFS_ATTR_FATTR)) 458 if (!(o_res->f_attr->valid & NFS_ATTR_FATTR))
399 status = server->rpc_ops->getattr(server, &o_res->fh, o_res->f_attr); 459 status = server->rpc_ops->getattr(server, &o_res->fh, o_res->f_attr);
400out: 460out:
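Since NFSv4 permits OPEN only on regular files, _nfs4_proc_open() now translates the returned attribute mode into the errno a local open(2) would have produced: ELOOP for a symlink, EISDIR for a directory, ENOTDIR for everything else. The mapping on its own, using the standard S_IF* macros:

#include <errno.h>
#include <stdio.h>
#include <sys/stat.h>

/* Map an NFSv4 OPEN result's file type to the local open(2) errno. */
static int open_mode_to_errno(mode_t mode)
{
	switch (mode & S_IFMT) {
	case S_IFREG:
		return 0;		/* only regular files may be OPENed */
	case S_IFLNK:
		return -ELOOP;
	case S_IFDIR:
		return -EISDIR;
	default:
		return -ENOTDIR;
	}
}

int main(void)
{
	printf("dir=%d symlink=%d fifo=%d\n",
	       open_mode_to_errno(S_IFDIR),
	       open_mode_to_errno(S_IFLNK),
	       open_mode_to_errno(S_IFIFO));
	return 0;
}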
@@ -441,9 +501,7 @@ static int _nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st
441 struct inode *inode = state->inode; 501 struct inode *inode = state->inode;
442 struct nfs_server *server = NFS_SERVER(dir); 502 struct nfs_server *server = NFS_SERVER(dir);
443 struct nfs_delegation *delegation = NFS_I(inode)->delegation; 503 struct nfs_delegation *delegation = NFS_I(inode)->delegation;
444 struct nfs_fattr f_attr = { 504 struct nfs_fattr f_attr, dir_attr;
445 .valid = 0,
446 };
447 struct nfs_openargs o_arg = { 505 struct nfs_openargs o_arg = {
448 .fh = NFS_FH(dir), 506 .fh = NFS_FH(dir),
449 .open_flags = state->state, 507 .open_flags = state->state,
@@ -453,6 +511,7 @@ static int _nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st
453 }; 511 };
454 struct nfs_openres o_res = { 512 struct nfs_openres o_res = {
455 .f_attr = &f_attr, 513 .f_attr = &f_attr,
514 .dir_attr = &dir_attr,
456 .server = server, 515 .server = server,
457 }; 516 };
458 int status = 0; 517 int status = 0;
@@ -465,6 +524,12 @@ static int _nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st
465 set_bit(NFS_DELEGATED_STATE, &state->flags); 524 set_bit(NFS_DELEGATED_STATE, &state->flags);
466 goto out; 525 goto out;
467 } 526 }
527 o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid);
528 status = -ENOMEM;
529 if (o_arg.seqid == NULL)
530 goto out;
531 nfs_fattr_init(&f_attr);
532 nfs_fattr_init(&dir_attr);
468 status = _nfs4_proc_open(dir, sp, &o_arg, &o_res); 533 status = _nfs4_proc_open(dir, sp, &o_arg, &o_res);
469 if (status != 0) 534 if (status != 0)
470 goto out_nodeleg; 535 goto out_nodeleg;
@@ -490,6 +555,7 @@ static int _nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st
490 nfs_inode_reclaim_delegation(inode, sp->so_cred, &o_res); 555 nfs_inode_reclaim_delegation(inode, sp->so_cred, &o_res);
491 } 556 }
492out_nodeleg: 557out_nodeleg:
558 nfs_free_seqid(o_arg.seqid);
493 clear_bit(NFS_DELEGATED_STATE, &state->flags); 559 clear_bit(NFS_DELEGATED_STATE, &state->flags);
494out: 560out:
495 dput(parent); 561 dput(parent);
@@ -564,7 +630,6 @@ static int _nfs4_open_delegated(struct inode *inode, int flags, struct rpc_cred
564 dprintk("%s: nfs4_get_state_owner failed!\n", __FUNCTION__); 630 dprintk("%s: nfs4_get_state_owner failed!\n", __FUNCTION__);
565 goto out_err; 631 goto out_err;
566 } 632 }
567 down(&sp->so_sema);
568 state = nfs4_get_open_state(inode, sp); 633 state = nfs4_get_open_state(inode, sp);
569 if (state == NULL) 634 if (state == NULL)
570 goto out_err; 635 goto out_err;
@@ -589,7 +654,6 @@ static int _nfs4_open_delegated(struct inode *inode, int flags, struct rpc_cred
589 set_bit(NFS_DELEGATED_STATE, &state->flags); 654 set_bit(NFS_DELEGATED_STATE, &state->flags);
590 update_open_stateid(state, &delegation->stateid, open_flags); 655 update_open_stateid(state, &delegation->stateid, open_flags);
591out_ok: 656out_ok:
592 up(&sp->so_sema);
593 nfs4_put_state_owner(sp); 657 nfs4_put_state_owner(sp);
594 up_read(&nfsi->rwsem); 658 up_read(&nfsi->rwsem);
595 up_read(&clp->cl_sem); 659 up_read(&clp->cl_sem);
@@ -600,11 +664,12 @@ out_err:
600 if (sp != NULL) { 664 if (sp != NULL) {
601 if (state != NULL) 665 if (state != NULL)
602 nfs4_put_open_state(state); 666 nfs4_put_open_state(state);
603 up(&sp->so_sema);
604 nfs4_put_state_owner(sp); 667 nfs4_put_state_owner(sp);
605 } 668 }
606 up_read(&nfsi->rwsem); 669 up_read(&nfsi->rwsem);
607 up_read(&clp->cl_sem); 670 up_read(&clp->cl_sem);
671 if (err != -EACCES)
672 nfs_inode_return_delegation(inode);
608 return err; 673 return err;
609} 674}
610 675
@@ -635,9 +700,7 @@ static int _nfs4_do_open(struct inode *dir, struct dentry *dentry, int flags, st
635 struct nfs4_client *clp = server->nfs4_state; 700 struct nfs4_client *clp = server->nfs4_state;
636 struct inode *inode = NULL; 701 struct inode *inode = NULL;
637 int status; 702 int status;
638 struct nfs_fattr f_attr = { 703 struct nfs_fattr f_attr, dir_attr;
639 .valid = 0,
640 };
641 struct nfs_openargs o_arg = { 704 struct nfs_openargs o_arg = {
642 .fh = NFS_FH(dir), 705 .fh = NFS_FH(dir),
643 .open_flags = flags, 706 .open_flags = flags,
@@ -648,6 +711,7 @@ static int _nfs4_do_open(struct inode *dir, struct dentry *dentry, int flags, st
648 }; 711 };
649 struct nfs_openres o_res = { 712 struct nfs_openres o_res = {
650 .f_attr = &f_attr, 713 .f_attr = &f_attr,
714 .dir_attr = &dir_attr,
651 .server = server, 715 .server = server,
652 }; 716 };
653 717
@@ -665,8 +729,12 @@ static int _nfs4_do_open(struct inode *dir, struct dentry *dentry, int flags, st
665 } else 729 } else
666 o_arg.u.attrs = sattr; 730 o_arg.u.attrs = sattr;
667 /* Serialization for the sequence id */ 731 /* Serialization for the sequence id */
668 down(&sp->so_sema);
669 732
733 o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid);
734 if (o_arg.seqid == NULL)
735 return -ENOMEM;
736 nfs_fattr_init(&f_attr);
737 nfs_fattr_init(&dir_attr);
670 status = _nfs4_proc_open(dir, sp, &o_arg, &o_res); 738 status = _nfs4_proc_open(dir, sp, &o_arg, &o_res);
671 if (status != 0) 739 if (status != 0)
672 goto out_err; 740 goto out_err;
@@ -681,7 +749,7 @@ static int _nfs4_do_open(struct inode *dir, struct dentry *dentry, int flags, st
681 update_open_stateid(state, &o_res.stateid, flags); 749 update_open_stateid(state, &o_res.stateid, flags);
682 if (o_res.delegation_type != 0) 750 if (o_res.delegation_type != 0)
683 nfs_inode_set_delegation(inode, cred, &o_res); 751 nfs_inode_set_delegation(inode, cred, &o_res);
684 up(&sp->so_sema); 752 nfs_free_seqid(o_arg.seqid);
685 nfs4_put_state_owner(sp); 753 nfs4_put_state_owner(sp);
686 up_read(&clp->cl_sem); 754 up_read(&clp->cl_sem);
687 *res = state; 755 *res = state;
@@ -690,7 +758,7 @@ out_err:
690 if (sp != NULL) { 758 if (sp != NULL) {
691 if (state != NULL) 759 if (state != NULL)
692 nfs4_put_open_state(state); 760 nfs4_put_open_state(state);
693 up(&sp->so_sema); 761 nfs_free_seqid(o_arg.seqid);
694 nfs4_put_state_owner(sp); 762 nfs4_put_state_owner(sp);
695 } 763 }
696 /* Note: clp->cl_sem must be released before nfs4_put_open_state()! */ 764 /* Note: clp->cl_sem must be released before nfs4_put_open_state()! */
@@ -718,7 +786,7 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir, struct dentry *dentry,
718 * It is actually a sign of a bug on the client or on the server. 786 * It is actually a sign of a bug on the client or on the server.
719 * 787 *
720 * If we receive a BAD_SEQID error in the particular case of 788 * If we receive a BAD_SEQID error in the particular case of
721 * doing an OPEN, we assume that nfs4_increment_seqid() will 789 * doing an OPEN, we assume that nfs_increment_open_seqid() will
722 * have unhashed the old state_owner for us, and that we can 790 * have unhashed the old state_owner for us, and that we can
723 * therefore safely retry using a new one. We should still warn 791 * therefore safely retry using a new one. We should still warn
724 * the user though... 792 * the user though...
@@ -728,6 +796,16 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir, struct dentry *dentry,
728 exception.retry = 1; 796 exception.retry = 1;
729 continue; 797 continue;
730 } 798 }
799 /*
800 * BAD_STATEID on OPEN means that the server cancelled our
801 * state before it received the OPEN_CONFIRM.
802 * Recover by retrying the request as per the discussion
803 * on Page 181 of RFC3530.
804 */
805 if (status == -NFS4ERR_BAD_STATEID) {
806 exception.retry = 1;
807 continue;
808 }
731 res = ERR_PTR(nfs4_handle_exception(NFS_SERVER(dir), 809 res = ERR_PTR(nfs4_handle_exception(NFS_SERVER(dir),
732 status, &exception)); 810 status, &exception));
733 } while (exception.retry); 811 } while (exception.retry);
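nfs4_do_open() drives _nfs4_do_open() through the standard NFSv4 exception loop: BAD_SEQID and the newly handled BAD_STATEID simply set exception.retry, while every other error goes to nfs4_handle_exception(), which either arranges recovery and asks for a retry or maps the NFS error to a final errno. The control flow reduced to a skeleton (the stub handler below is illustrative; the real one sleeps and schedules state recovery):

#include <stdio.h>

struct exception { int retry; };

/* Stub: classify an error as retryable or final. */
static int handle_exception(int err, struct exception *exc)
{
	if (err == -11) {		/* pretend -11 is a transient error */
		exc->retry = 1;
		return 0;
	}
	return err;			/* final: propagate to the caller */
}

static int do_open_once(int *attempts)
{
	return (*attempts)++ < 2 ? -11 : 0;	/* fail twice, then succeed */
}

int main(void)
{
	struct exception exc = { 0 };
	int attempts = 0, err;

	do {
		exc.retry = 0;
		err = handle_exception(do_open_once(&attempts), &exc);
	} while (exc.retry);
	printf("done after %d attempts, err=%d\n", attempts, err);
	return 0;
}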
@@ -755,7 +833,7 @@ static int _nfs4_do_setattr(struct nfs_server *server, struct nfs_fattr *fattr,
755 }; 833 };
756 int status; 834 int status;
757 835
758 fattr->valid = 0; 836 nfs_fattr_init(fattr);
759 837
760 if (state != NULL) { 838 if (state != NULL) {
761 msg.rpc_cred = state->owner->so_cred; 839 msg.rpc_cred = state->owner->so_cred;
@@ -787,19 +865,30 @@ struct nfs4_closedata {
787 struct nfs4_state *state; 865 struct nfs4_state *state;
788 struct nfs_closeargs arg; 866 struct nfs_closeargs arg;
789 struct nfs_closeres res; 867 struct nfs_closeres res;
868 struct nfs_fattr fattr;
790}; 869};
791 870
871static void nfs4_free_closedata(struct nfs4_closedata *calldata)
872{
873 struct nfs4_state *state = calldata->state;
874 struct nfs4_state_owner *sp = state->owner;
875
876 nfs4_put_open_state(calldata->state);
877 nfs_free_seqid(calldata->arg.seqid);
878 nfs4_put_state_owner(sp);
879 kfree(calldata);
880}
881
792static void nfs4_close_done(struct rpc_task *task) 882static void nfs4_close_done(struct rpc_task *task)
793{ 883{
794 struct nfs4_closedata *calldata = (struct nfs4_closedata *)task->tk_calldata; 884 struct nfs4_closedata *calldata = (struct nfs4_closedata *)task->tk_calldata;
795 struct nfs4_state *state = calldata->state; 885 struct nfs4_state *state = calldata->state;
796 struct nfs4_state_owner *sp = state->owner;
797 struct nfs_server *server = NFS_SERVER(calldata->inode); 886 struct nfs_server *server = NFS_SERVER(calldata->inode);
798 887
799 /* hmm. we are done with the inode, and in the process of freeing 888 /* hmm. we are done with the inode, and in the process of freeing
800 * the state_owner. we keep this around to process errors 889 * the state_owner. we keep this around to process errors
801 */ 890 */
802 nfs4_increment_seqid(task->tk_status, sp); 891 nfs_increment_open_seqid(task->tk_status, calldata->arg.seqid);
803 switch (task->tk_status) { 892 switch (task->tk_status) {
804 case 0: 893 case 0:
805 memcpy(&state->stateid, &calldata->res.stateid, 894 memcpy(&state->stateid, &calldata->res.stateid,
@@ -816,25 +905,49 @@ static void nfs4_close_done(struct rpc_task *task)
816 return; 905 return;
817 } 906 }
818 } 907 }
908 nfs_refresh_inode(calldata->inode, calldata->res.fattr);
819 state->state = calldata->arg.open_flags; 909 state->state = calldata->arg.open_flags;
820 nfs4_put_open_state(state); 910 nfs4_free_closedata(calldata);
821 up(&sp->so_sema);
822 nfs4_put_state_owner(sp);
823 up_read(&server->nfs4_state->cl_sem);
824 kfree(calldata);
825} 911}
826 912
827static inline int nfs4_close_call(struct rpc_clnt *clnt, struct nfs4_closedata *calldata) 913static void nfs4_close_begin(struct rpc_task *task)
828{ 914{
915 struct nfs4_closedata *calldata = (struct nfs4_closedata *)task->tk_calldata;
916 struct nfs4_state *state = calldata->state;
829 struct rpc_message msg = { 917 struct rpc_message msg = {
830 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE], 918 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE],
831 .rpc_argp = &calldata->arg, 919 .rpc_argp = &calldata->arg,
832 .rpc_resp = &calldata->res, 920 .rpc_resp = &calldata->res,
833 .rpc_cred = calldata->state->owner->so_cred, 921 .rpc_cred = state->owner->so_cred,
834 }; 922 };
835 if (calldata->arg.open_flags != 0) 923 int mode = 0;
924 int status;
925
926 status = nfs_wait_on_sequence(calldata->arg.seqid, task);
927 if (status != 0)
928 return;
929 /* Don't reorder reads */
930 smp_rmb();
931 /* Recalculate the new open mode in case someone reopened the file
932 * while we were waiting in line to be scheduled.
933 */
934 if (state->nreaders != 0)
935 mode |= FMODE_READ;
936 if (state->nwriters != 0)
937 mode |= FMODE_WRITE;
938 if (test_bit(NFS_DELEGATED_STATE, &state->flags))
939 state->state = mode;
940 if (mode == state->state) {
941 nfs4_free_closedata(calldata);
942 task->tk_exit = NULL;
943 rpc_exit(task, 0);
944 return;
945 }
946 nfs_fattr_init(calldata->res.fattr);
947 if (mode != 0)
836 msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE]; 948 msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE];
837 return rpc_call_async(clnt, &msg, 0, nfs4_close_done, calldata); 949 calldata->arg.open_flags = mode;
950 rpc_call_setup(task, &msg, 0);
838} 951}
839 952
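Because the CLOSE is now queued asynchronously, nfs4_close_begin() recomputes the open mode from the live reader and writer counts at the moment the task actually runs: an unchanged mode abandons the RPC, a nonzero remainder downgrades it to OPEN_DOWNGRADE, and only the last opener sends a real CLOSE. That decision in isolation (a hedged sketch; the FMODE_* values are illustrative):

#include <stdio.h>

enum close_op { OP_NONE, OP_CLOSE, OP_OPEN_DOWNGRADE };

#define FMODE_READ	1
#define FMODE_WRITE	2

/* Decide what a deferred close should send, given the current openers. */
static enum close_op pick_close_op(int nreaders, int nwriters, int cur_mode)
{
	int mode = 0;

	if (nreaders != 0)
		mode |= FMODE_READ;
	if (nwriters != 0)
		mode |= FMODE_WRITE;
	if (mode == cur_mode)
		return OP_NONE;		/* reopened meanwhile: nothing to do */
	return mode != 0 ? OP_OPEN_DOWNGRADE	/* still open for something */
			 : OP_CLOSE;		/* last reference: real CLOSE */
}

int main(void)
{
	printf("%d %d %d\n",
	       pick_close_op(1, 1, FMODE_READ | FMODE_WRITE),	/* 0: none */
	       pick_close_op(1, 0, FMODE_READ | FMODE_WRITE),	/* 2: downgrade */
	       pick_close_op(0, 0, FMODE_READ));		/* 1: close */
	return 0;
}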
840/* 953/*
@@ -850,40 +963,57 @@ static inline int nfs4_close_call(struct rpc_clnt *clnt, struct nfs4_closedata *
850 */ 963 */
851int nfs4_do_close(struct inode *inode, struct nfs4_state *state, mode_t mode) 964int nfs4_do_close(struct inode *inode, struct nfs4_state *state, mode_t mode)
852{ 965{
966 struct nfs_server *server = NFS_SERVER(inode);
853 struct nfs4_closedata *calldata; 967 struct nfs4_closedata *calldata;
854 int status; 968 int status = -ENOMEM;
855 969
856 /* Tell caller we're done */ 970 calldata = kmalloc(sizeof(*calldata), GFP_KERNEL);
857 if (test_bit(NFS_DELEGATED_STATE, &state->flags)) {
858 state->state = mode;
859 return 0;
860 }
861 calldata = (struct nfs4_closedata *)kmalloc(sizeof(*calldata), GFP_KERNEL);
862 if (calldata == NULL) 971 if (calldata == NULL)
863 return -ENOMEM; 972 goto out;
864 calldata->inode = inode; 973 calldata->inode = inode;
865 calldata->state = state; 974 calldata->state = state;
866 calldata->arg.fh = NFS_FH(inode); 975 calldata->arg.fh = NFS_FH(inode);
976 calldata->arg.stateid = &state->stateid;
867 /* Serialization for the sequence id */ 977 /* Serialization for the sequence id */
868 calldata->arg.seqid = state->owner->so_seqid; 978 calldata->arg.seqid = nfs_alloc_seqid(&state->owner->so_seqid);
869 calldata->arg.open_flags = mode; 979 if (calldata->arg.seqid == NULL)
870 memcpy(&calldata->arg.stateid, &state->stateid, 980 goto out_free_calldata;
871 sizeof(calldata->arg.stateid)); 981 calldata->arg.bitmask = server->attr_bitmask;
872 status = nfs4_close_call(NFS_SERVER(inode)->client, calldata); 982 calldata->res.fattr = &calldata->fattr;
873 /* 983 calldata->res.server = server;
874 * Return -EINPROGRESS on success in order to indicate to the 984
875 * caller that an asynchronous RPC call has been launched, and 985 status = nfs4_call_async(server->client, nfs4_close_begin,
876 * that it will release the semaphores on completion. 986 nfs4_close_done, calldata);
877 */ 987 if (status == 0)
878 return (status == 0) ? -EINPROGRESS : status; 988 goto out;
989
990 nfs_free_seqid(calldata->arg.seqid);
991out_free_calldata:
992 kfree(calldata);
993out:
994 return status;
879} 995}
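nfs4_do_close() also adopts the usual goto-unwind shape: status starts at -ENOMEM, each failed allocation jumps to a label that frees only what already exists, and a successful hand-off to the async machinery skips the frees because ownership passes to the callbacks. The idiom reduced to a compilable sketch:

#include <errno.h>
#include <stdlib.h>

struct closedata { int *seqid; };

static int do_close_sketch(void)
{
	struct closedata *cd;
	int status = -ENOMEM;

	cd = malloc(sizeof(*cd));
	if (cd == NULL)
		goto out;		/* nothing allocated yet */
	cd->seqid = malloc(sizeof(*cd->seqid));
	if (cd->seqid == NULL)
		goto out_free_calldata;	/* unwind only what exists */

	status = 0;	/* stand-in for nfs4_call_async(); on success the
			 * async callbacks own cd, so it must not be freed */
	if (status == 0)
		goto out;

	free(cd->seqid);
out_free_calldata:
	free(cd);
out:
	return status;
}

int main(void)
{
	return do_close_sketch();
}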
880 996
881struct inode * 997static void nfs4_intent_set_file(struct nameidata *nd, struct dentry *dentry, struct nfs4_state *state)
998{
999 struct file *filp;
1000
1001 filp = lookup_instantiate_filp(nd, dentry, NULL);
1002 if (!IS_ERR(filp)) {
1003 struct nfs_open_context *ctx;
1004 ctx = (struct nfs_open_context *)filp->private_data;
1005 ctx->state = state;
1006 } else
1007 nfs4_close_state(state, nd->intent.open.flags);
1008}
1009
1010struct dentry *
882nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd) 1011nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
883{ 1012{
884 struct iattr attr; 1013 struct iattr attr;
885 struct rpc_cred *cred; 1014 struct rpc_cred *cred;
886 struct nfs4_state *state; 1015 struct nfs4_state *state;
1016 struct dentry *res;
887 1017
888 if (nd->flags & LOOKUP_CREATE) { 1018 if (nd->flags & LOOKUP_CREATE) {
889 attr.ia_mode = nd->intent.open.create_mode; 1019 attr.ia_mode = nd->intent.open.create_mode;
@@ -897,16 +1027,23 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
897 1027
898 cred = rpcauth_lookupcred(NFS_SERVER(dir)->client->cl_auth, 0); 1028 cred = rpcauth_lookupcred(NFS_SERVER(dir)->client->cl_auth, 0);
899 if (IS_ERR(cred)) 1029 if (IS_ERR(cred))
900 return (struct inode *)cred; 1030 return (struct dentry *)cred;
901 state = nfs4_do_open(dir, dentry, nd->intent.open.flags, &attr, cred); 1031 state = nfs4_do_open(dir, dentry, nd->intent.open.flags, &attr, cred);
902 put_rpccred(cred); 1032 put_rpccred(cred);
903 if (IS_ERR(state)) 1033 if (IS_ERR(state)) {
904 return (struct inode *)state; 1034 if (PTR_ERR(state) == -ENOENT)
905 return state->inode; 1035 d_add(dentry, NULL);
1036 return (struct dentry *)state;
1037 }
1038 res = d_add_unique(dentry, state->inode);
1039 if (res != NULL)
1040 dentry = res;
1041 nfs4_intent_set_file(nd, dentry, state);
1042 return res;
906} 1043}
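nfs4_atomic_open() now returns a dentry instead of an inode: d_add_unique() either accepts the caller's dentry, returning NULL, or hands back a pre-existing alias that must be used instead, and the open intent is then completed against whichever dentry won. The NULL-means-keep-mine convention is easy to misread, so here it is alone (d_add_unique_stub is a made-up stand-in for the dcache call):

#include <stddef.h>
#include <stdio.h>

struct dentry { const char *name; };

/* Stub for d_add_unique(): NULL means "your dentry was used",
 * anything else is the pre-existing alias to use instead. */
static struct dentry *d_add_unique_stub(struct dentry *mine,
					struct dentry *existing)
{
	(void)mine;
	return existing;	/* pretend an alias already existed */
}

int main(void)
{
	struct dentry mine = { "mine" }, alias = { "alias" };
	struct dentry *dentry = &mine;
	struct dentry *res = d_add_unique_stub(&mine, &alias);

	if (res != NULL)
		dentry = res;	/* prefer the alias the dcache handed back */
	printf("open proceeds on '%s'\n", dentry->name);
	return 0;
}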
907 1044
908int 1045int
909nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags) 1046nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags, struct nameidata *nd)
910{ 1047{
911 struct rpc_cred *cred; 1048 struct rpc_cred *cred;
912 struct nfs4_state *state; 1049 struct nfs4_state *state;
@@ -919,18 +1056,30 @@ nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags)
919 if (IS_ERR(state)) 1056 if (IS_ERR(state))
920 state = nfs4_do_open(dir, dentry, openflags, NULL, cred); 1057 state = nfs4_do_open(dir, dentry, openflags, NULL, cred);
921 put_rpccred(cred); 1058 put_rpccred(cred);
922 if (state == ERR_PTR(-ENOENT) && dentry->d_inode == 0) 1059 if (IS_ERR(state)) {
923 return 1; 1060 switch (PTR_ERR(state)) {
924 if (IS_ERR(state)) 1061 case -EPERM:
925 return 0; 1062 case -EACCES:
1063 case -EDQUOT:
1064 case -ENOSPC:
1065 case -EROFS:
1066 lookup_instantiate_filp(nd, (struct dentry *)state, NULL);
1067 return 1;
1068 case -ENOENT:
1069 if (dentry->d_inode == NULL)
1070 return 1;
1071 }
1072 goto out_drop;
1073 }
926 inode = state->inode; 1074 inode = state->inode;
1075 iput(inode);
927 if (inode == dentry->d_inode) { 1076 if (inode == dentry->d_inode) {
928 iput(inode); 1077 nfs4_intent_set_file(nd, dentry, state);
929 return 1; 1078 return 1;
930 } 1079 }
931 d_drop(dentry);
932 nfs4_close_state(state, openflags); 1080 nfs4_close_state(state, openflags);
933 iput(inode); 1081out_drop:
1082 d_drop(dentry);
934 return 0; 1083 return 0;
935} 1084}
936 1085
@@ -974,13 +1123,12 @@ static int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fh
974static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle, 1123static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,
975 struct nfs_fsinfo *info) 1124 struct nfs_fsinfo *info)
976{ 1125{
977 struct nfs_fattr * fattr = info->fattr;
978 struct nfs4_lookup_root_arg args = { 1126 struct nfs4_lookup_root_arg args = {
979 .bitmask = nfs4_fattr_bitmap, 1127 .bitmask = nfs4_fattr_bitmap,
980 }; 1128 };
981 struct nfs4_lookup_res res = { 1129 struct nfs4_lookup_res res = {
982 .server = server, 1130 .server = server,
983 .fattr = fattr, 1131 .fattr = info->fattr,
984 .fh = fhandle, 1132 .fh = fhandle,
985 }; 1133 };
986 struct rpc_message msg = { 1134 struct rpc_message msg = {
@@ -988,7 +1136,7 @@ static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,
988 .rpc_argp = &args, 1136 .rpc_argp = &args,
989 .rpc_resp = &res, 1137 .rpc_resp = &res,
990 }; 1138 };
991 fattr->valid = 0; 1139 nfs_fattr_init(info->fattr);
992 return rpc_call_sync(server->client, &msg, 0); 1140 return rpc_call_sync(server->client, &msg, 0);
993} 1141}
994 1142
@@ -1051,7 +1199,7 @@ static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
1051 q.len = p - q.name; 1199 q.len = p - q.name;
1052 1200
1053 do { 1201 do {
1054 fattr->valid = 0; 1202 nfs_fattr_init(fattr);
1055 status = nfs4_handle_exception(server, 1203 status = nfs4_handle_exception(server,
1056 rpc_call_sync(server->client, &msg, 0), 1204 rpc_call_sync(server->client, &msg, 0),
1057 &exception); 1205 &exception);
@@ -1088,7 +1236,7 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
1088 .rpc_resp = &res, 1236 .rpc_resp = &res,
1089 }; 1237 };
1090 1238
1091 fattr->valid = 0; 1239 nfs_fattr_init(fattr);
1092 return rpc_call_sync(server->client, &msg, 0); 1240 return rpc_call_sync(server->client, &msg, 0);
1093} 1241}
1094 1242
@@ -1130,7 +1278,7 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
1130 struct nfs4_state *state; 1278 struct nfs4_state *state;
1131 int status; 1279 int status;
1132 1280
1133 fattr->valid = 0; 1281 nfs_fattr_init(fattr);
1134 1282
1135 cred = rpcauth_lookupcred(NFS_SERVER(inode)->client->cl_auth, 0); 1283 cred = rpcauth_lookupcred(NFS_SERVER(inode)->client->cl_auth, 0);
1136 if (IS_ERR(cred)) 1284 if (IS_ERR(cred))
@@ -1176,7 +1324,7 @@ static int _nfs4_proc_lookup(struct inode *dir, struct qstr *name,
1176 .rpc_resp = &res, 1324 .rpc_resp = &res,
1177 }; 1325 };
1178 1326
1179 fattr->valid = 0; 1327 nfs_fattr_init(fattr);
1180 1328
1181 dprintk("NFS call lookup %s\n", name->name); 1329 dprintk("NFS call lookup %s\n", name->name);
1182 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 1330 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
@@ -1325,7 +1473,7 @@ static int _nfs4_proc_read(struct nfs_read_data *rdata)
1325 dprintk("NFS call read %d @ %Ld\n", rdata->args.count, 1473 dprintk("NFS call read %d @ %Ld\n", rdata->args.count,
1326 (long long) rdata->args.offset); 1474 (long long) rdata->args.offset);
1327 1475
1328 fattr->valid = 0; 1476 nfs_fattr_init(fattr);
1329 status = rpc_call_sync(server->client, &msg, flags); 1477 status = rpc_call_sync(server->client, &msg, flags);
1330 if (!status) 1478 if (!status)
1331 renew_lease(server, timestamp); 1479 renew_lease(server, timestamp);
@@ -1362,7 +1510,7 @@ static int _nfs4_proc_write(struct nfs_write_data *wdata)
1362 dprintk("NFS call write %d @ %Ld\n", wdata->args.count, 1510 dprintk("NFS call write %d @ %Ld\n", wdata->args.count,
1363 (long long) wdata->args.offset); 1511 (long long) wdata->args.offset);
1364 1512
1365 fattr->valid = 0; 1513 nfs_fattr_init(fattr);
1366 status = rpc_call_sync(server->client, &msg, rpcflags); 1514 status = rpc_call_sync(server->client, &msg, rpcflags);
1367 dprintk("NFS reply write: %d\n", status); 1515 dprintk("NFS reply write: %d\n", status);
1368 return status; 1516 return status;
@@ -1396,7 +1544,7 @@ static int _nfs4_proc_commit(struct nfs_write_data *cdata)
1396 dprintk("NFS call commit %d @ %Ld\n", cdata->args.count, 1544 dprintk("NFS call commit %d @ %Ld\n", cdata->args.count,
1397 (long long) cdata->args.offset); 1545 (long long) cdata->args.offset);
1398 1546
1399 fattr->valid = 0; 1547 nfs_fattr_init(fattr);
1400 status = rpc_call_sync(server->client, &msg, 0); 1548 status = rpc_call_sync(server->client, &msg, 0);
1401 dprintk("NFS reply commit: %d\n", status); 1549 dprintk("NFS reply commit: %d\n", status);
1402 return status; 1550 return status;
@@ -1431,7 +1579,7 @@ static int nfs4_proc_commit(struct nfs_write_data *cdata)
1431 1579
1432static int 1580static int
1433nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, 1581nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
1434 int flags) 1582 int flags, struct nameidata *nd)
1435{ 1583{
1436 struct nfs4_state *state; 1584 struct nfs4_state *state;
1437 struct rpc_cred *cred; 1585 struct rpc_cred *cred;
@@ -1453,24 +1601,30 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
1453 struct nfs_fattr fattr; 1601 struct nfs_fattr fattr;
1454 status = nfs4_do_setattr(NFS_SERVER(dir), &fattr, 1602 status = nfs4_do_setattr(NFS_SERVER(dir), &fattr,
1455 NFS_FH(state->inode), sattr, state); 1603 NFS_FH(state->inode), sattr, state);
1456 if (status == 0) { 1604 if (status == 0)
1457 nfs_setattr_update_inode(state->inode, sattr); 1605 nfs_setattr_update_inode(state->inode, sattr);
1458 goto out; 1606 }
1459 } 1607 if (status == 0 && nd != NULL && (nd->flags & LOOKUP_OPEN))
1460 } else if (flags != 0) 1608 nfs4_intent_set_file(nd, dentry, state);
1461 goto out; 1609 else
1462 nfs4_close_state(state, flags); 1610 nfs4_close_state(state, flags);
1463out: 1611out:
1464 return status; 1612 return status;
1465} 1613}
1466 1614
1467static int _nfs4_proc_remove(struct inode *dir, struct qstr *name) 1615static int _nfs4_proc_remove(struct inode *dir, struct qstr *name)
1468{ 1616{
1617 struct nfs_server *server = NFS_SERVER(dir);
1469 struct nfs4_remove_arg args = { 1618 struct nfs4_remove_arg args = {
1470 .fh = NFS_FH(dir), 1619 .fh = NFS_FH(dir),
1471 .name = name, 1620 .name = name,
1621 .bitmask = server->attr_bitmask,
1622 };
1623 struct nfs_fattr dir_attr;
1624 struct nfs4_remove_res res = {
1625 .server = server,
1626 .dir_attr = &dir_attr,
1472 }; 1627 };
1473 struct nfs4_change_info res;
1474 struct rpc_message msg = { 1628 struct rpc_message msg = {
1475 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE], 1629 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE],
1476 .rpc_argp = &args, 1630 .rpc_argp = &args,
@@ -1478,9 +1632,12 @@ static int _nfs4_proc_remove(struct inode *dir, struct qstr *name)
1478 }; 1632 };
1479 int status; 1633 int status;
1480 1634
1481 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 1635 nfs_fattr_init(res.dir_attr);
1482 if (status == 0) 1636 status = rpc_call_sync(server->client, &msg, 0);
1483 update_changeattr(dir, &res); 1637 if (status == 0) {
1638 update_changeattr(dir, &res.cinfo);
1639 nfs_post_op_update_inode(dir, res.dir_attr);
1640 }
1484 return status; 1641 return status;
1485} 1642}
1486 1643
@@ -1498,12 +1655,14 @@ static int nfs4_proc_remove(struct inode *dir, struct qstr *name)
1498 1655
1499struct unlink_desc { 1656struct unlink_desc {
1500 struct nfs4_remove_arg args; 1657 struct nfs4_remove_arg args;
1501 struct nfs4_change_info res; 1658 struct nfs4_remove_res res;
1659 struct nfs_fattr dir_attr;
1502}; 1660};
1503 1661
1504static int nfs4_proc_unlink_setup(struct rpc_message *msg, struct dentry *dir, 1662static int nfs4_proc_unlink_setup(struct rpc_message *msg, struct dentry *dir,
1505 struct qstr *name) 1663 struct qstr *name)
1506{ 1664{
1665 struct nfs_server *server = NFS_SERVER(dir->d_inode);
1507 struct unlink_desc *up; 1666 struct unlink_desc *up;
1508 1667
1509 up = (struct unlink_desc *) kmalloc(sizeof(*up), GFP_KERNEL); 1668 up = (struct unlink_desc *) kmalloc(sizeof(*up), GFP_KERNEL);
@@ -1512,6 +1671,9 @@ static int nfs4_proc_unlink_setup(struct rpc_message *msg, struct dentry *dir,
1512 1671
1513 up->args.fh = NFS_FH(dir->d_inode); 1672 up->args.fh = NFS_FH(dir->d_inode);
1514 up->args.name = name; 1673 up->args.name = name;
1674 up->args.bitmask = server->attr_bitmask;
1675 up->res.server = server;
1676 up->res.dir_attr = &up->dir_attr;
1515 1677
1516 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE]; 1678 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE];
1517 msg->rpc_argp = &up->args; 1679 msg->rpc_argp = &up->args;
@@ -1526,7 +1688,8 @@ static int nfs4_proc_unlink_done(struct dentry *dir, struct rpc_task *task)
1526 1688
1527 if (msg->rpc_resp != NULL) { 1689 if (msg->rpc_resp != NULL) {
1528 up = container_of(msg->rpc_resp, struct unlink_desc, res); 1690 up = container_of(msg->rpc_resp, struct unlink_desc, res);
1529 update_changeattr(dir->d_inode, &up->res); 1691 update_changeattr(dir->d_inode, &up->res.cinfo);
1692 nfs_post_op_update_inode(dir->d_inode, up->res.dir_attr);
1530 kfree(up); 1693 kfree(up);
1531 msg->rpc_resp = NULL; 1694 msg->rpc_resp = NULL;
1532 msg->rpc_argp = NULL; 1695 msg->rpc_argp = NULL;
@@ -1537,13 +1700,20 @@ static int nfs4_proc_unlink_done(struct dentry *dir, struct rpc_task *task)
1537static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name, 1700static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name,
1538 struct inode *new_dir, struct qstr *new_name) 1701 struct inode *new_dir, struct qstr *new_name)
1539{ 1702{
1703 struct nfs_server *server = NFS_SERVER(old_dir);
1540 struct nfs4_rename_arg arg = { 1704 struct nfs4_rename_arg arg = {
1541 .old_dir = NFS_FH(old_dir), 1705 .old_dir = NFS_FH(old_dir),
1542 .new_dir = NFS_FH(new_dir), 1706 .new_dir = NFS_FH(new_dir),
1543 .old_name = old_name, 1707 .old_name = old_name,
1544 .new_name = new_name, 1708 .new_name = new_name,
1709 .bitmask = server->attr_bitmask,
1710 };
1711 struct nfs_fattr old_fattr, new_fattr;
1712 struct nfs4_rename_res res = {
1713 .server = server,
1714 .old_fattr = &old_fattr,
1715 .new_fattr = &new_fattr,
1545 }; 1716 };
1546 struct nfs4_rename_res res = { };
1547 struct rpc_message msg = { 1717 struct rpc_message msg = {
1548 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME], 1718 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME],
1549 .rpc_argp = &arg, 1719 .rpc_argp = &arg,
@@ -1551,11 +1721,15 @@ static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name,
1551 }; 1721 };
1552 int status; 1722 int status;
1553 1723
1554 status = rpc_call_sync(NFS_CLIENT(old_dir), &msg, 0); 1724 nfs_fattr_init(res.old_fattr);
1725 nfs_fattr_init(res.new_fattr);
1726 status = rpc_call_sync(server->client, &msg, 0);
1555 1727
1556 if (!status) { 1728 if (!status) {
1557 update_changeattr(old_dir, &res.old_cinfo); 1729 update_changeattr(old_dir, &res.old_cinfo);
1730 nfs_post_op_update_inode(old_dir, res.old_fattr);
1558 update_changeattr(new_dir, &res.new_cinfo); 1731 update_changeattr(new_dir, &res.new_cinfo);
1732 nfs_post_op_update_inode(new_dir, res.new_fattr);
1559 } 1733 }
1560 return status; 1734 return status;
1561} 1735}
@@ -1576,22 +1750,34 @@ static int nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name,
1576 1750
1577static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *name) 1751static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *name)
1578{ 1752{
1753 struct nfs_server *server = NFS_SERVER(inode);
1579 struct nfs4_link_arg arg = { 1754 struct nfs4_link_arg arg = {
1580 .fh = NFS_FH(inode), 1755 .fh = NFS_FH(inode),
1581 .dir_fh = NFS_FH(dir), 1756 .dir_fh = NFS_FH(dir),
1582 .name = name, 1757 .name = name,
1758 .bitmask = server->attr_bitmask,
1759 };
1760 struct nfs_fattr fattr, dir_attr;
1761 struct nfs4_link_res res = {
1762 .server = server,
1763 .fattr = &fattr,
1764 .dir_attr = &dir_attr,
1583 }; 1765 };
1584 struct nfs4_change_info cinfo = { };
1585 struct rpc_message msg = { 1766 struct rpc_message msg = {
1586 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LINK], 1767 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LINK],
1587 .rpc_argp = &arg, 1768 .rpc_argp = &arg,
1588 .rpc_resp = &cinfo, 1769 .rpc_resp = &res,
1589 }; 1770 };
1590 int status; 1771 int status;
1591 1772
1592 status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); 1773 nfs_fattr_init(res.fattr);
1593 if (!status) 1774 nfs_fattr_init(res.dir_attr);
1594 update_changeattr(dir, &cinfo); 1775 status = rpc_call_sync(server->client, &msg, 0);
1776 if (!status) {
1777 update_changeattr(dir, &res.cinfo);
1778 nfs_post_op_update_inode(dir, res.dir_attr);
1779 nfs_refresh_inode(inode, res.fattr);
1780 }
1595 1781
1596 return status; 1782 return status;
1597} 1783}
@@ -1613,6 +1799,7 @@ static int _nfs4_proc_symlink(struct inode *dir, struct qstr *name,
1613 struct nfs_fattr *fattr) 1799 struct nfs_fattr *fattr)
1614{ 1800{
1615 struct nfs_server *server = NFS_SERVER(dir); 1801 struct nfs_server *server = NFS_SERVER(dir);
1802 struct nfs_fattr dir_fattr;
1616 struct nfs4_create_arg arg = { 1803 struct nfs4_create_arg arg = {
1617 .dir_fh = NFS_FH(dir), 1804 .dir_fh = NFS_FH(dir),
1618 .server = server, 1805 .server = server,
@@ -1625,6 +1812,7 @@ static int _nfs4_proc_symlink(struct inode *dir, struct qstr *name,
1625 .server = server, 1812 .server = server,
1626 .fh = fhandle, 1813 .fh = fhandle,
1627 .fattr = fattr, 1814 .fattr = fattr,
1815 .dir_fattr = &dir_fattr,
1628 }; 1816 };
1629 struct rpc_message msg = { 1817 struct rpc_message msg = {
1630 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SYMLINK], 1818 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SYMLINK],
@@ -1636,11 +1824,13 @@ static int _nfs4_proc_symlink(struct inode *dir, struct qstr *name,
1636 if (path->len > NFS4_MAXPATHLEN) 1824 if (path->len > NFS4_MAXPATHLEN)
1637 return -ENAMETOOLONG; 1825 return -ENAMETOOLONG;
1638 arg.u.symlink = path; 1826 arg.u.symlink = path;
1639 fattr->valid = 0; 1827 nfs_fattr_init(fattr);
1828 nfs_fattr_init(&dir_fattr);
1640 1829
1641 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 1830 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
1642 if (!status) 1831 if (!status)
1643 update_changeattr(dir, &res.dir_cinfo); 1832 update_changeattr(dir, &res.dir_cinfo);
1833 nfs_post_op_update_inode(dir, res.dir_fattr);
1644 return status; 1834 return status;
1645} 1835}
1646 1836
@@ -1664,7 +1854,7 @@ static int _nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
1664{ 1854{
1665 struct nfs_server *server = NFS_SERVER(dir); 1855 struct nfs_server *server = NFS_SERVER(dir);
1666 struct nfs_fh fhandle; 1856 struct nfs_fh fhandle;
1667 struct nfs_fattr fattr; 1857 struct nfs_fattr fattr, dir_fattr;
1668 struct nfs4_create_arg arg = { 1858 struct nfs4_create_arg arg = {
1669 .dir_fh = NFS_FH(dir), 1859 .dir_fh = NFS_FH(dir),
1670 .server = server, 1860 .server = server,
@@ -1677,6 +1867,7 @@ static int _nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
1677 .server = server, 1867 .server = server,
1678 .fh = &fhandle, 1868 .fh = &fhandle,
1679 .fattr = &fattr, 1869 .fattr = &fattr,
1870 .dir_fattr = &dir_fattr,
1680 }; 1871 };
1681 struct rpc_message msg = { 1872 struct rpc_message msg = {
1682 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE], 1873 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE],
@@ -1685,11 +1876,13 @@ static int _nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
1685 }; 1876 };
1686 int status; 1877 int status;
1687 1878
1688 fattr.valid = 0; 1879 nfs_fattr_init(&fattr);
1880 nfs_fattr_init(&dir_fattr);
1689 1881
1690 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 1882 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
1691 if (!status) { 1883 if (!status) {
1692 update_changeattr(dir, &res.dir_cinfo); 1884 update_changeattr(dir, &res.dir_cinfo);
1885 nfs_post_op_update_inode(dir, res.dir_fattr);
1693 status = nfs_instantiate(dentry, &fhandle, &fattr); 1886 status = nfs_instantiate(dentry, &fhandle, &fattr);
1694 } 1887 }
1695 return status; 1888 return status;
@@ -1762,7 +1955,7 @@ static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry,
1762{ 1955{
1763 struct nfs_server *server = NFS_SERVER(dir); 1956 struct nfs_server *server = NFS_SERVER(dir);
1764 struct nfs_fh fh; 1957 struct nfs_fh fh;
1765 struct nfs_fattr fattr; 1958 struct nfs_fattr fattr, dir_fattr;
1766 struct nfs4_create_arg arg = { 1959 struct nfs4_create_arg arg = {
1767 .dir_fh = NFS_FH(dir), 1960 .dir_fh = NFS_FH(dir),
1768 .server = server, 1961 .server = server,
@@ -1774,6 +1967,7 @@ static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry,
1774 .server = server, 1967 .server = server,
1775 .fh = &fh, 1968 .fh = &fh,
1776 .fattr = &fattr, 1969 .fattr = &fattr,
1970 .dir_fattr = &dir_fattr,
1777 }; 1971 };
1778 struct rpc_message msg = { 1972 struct rpc_message msg = {
1779 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE], 1973 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE],
@@ -1783,7 +1977,8 @@ static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry,
1783 int status; 1977 int status;
1784 int mode = sattr->ia_mode; 1978 int mode = sattr->ia_mode;
1785 1979
1786 fattr.valid = 0; 1980 nfs_fattr_init(&fattr);
1981 nfs_fattr_init(&dir_fattr);
1787 1982
1788 BUG_ON(!(sattr->ia_valid & ATTR_MODE)); 1983 BUG_ON(!(sattr->ia_valid & ATTR_MODE));
1789 BUG_ON(!S_ISFIFO(mode) && !S_ISBLK(mode) && !S_ISCHR(mode) && !S_ISSOCK(mode)); 1984 BUG_ON(!S_ISFIFO(mode) && !S_ISBLK(mode) && !S_ISCHR(mode) && !S_ISSOCK(mode));
@@ -1805,6 +2000,7 @@ static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry,
1805 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 2000 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
1806 if (status == 0) { 2001 if (status == 0) {
1807 update_changeattr(dir, &res.dir_cinfo); 2002 update_changeattr(dir, &res.dir_cinfo);
2003 nfs_post_op_update_inode(dir, res.dir_fattr);
1808 status = nfs_instantiate(dentry, &fh, &fattr); 2004 status = nfs_instantiate(dentry, &fh, &fattr);
1809 } 2005 }
1810 return status; 2006 return status;
@@ -1836,7 +2032,7 @@ static int _nfs4_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle,
1836 .rpc_resp = fsstat, 2032 .rpc_resp = fsstat,
1837 }; 2033 };
1838 2034
1839 fsstat->fattr->valid = 0; 2035 nfs_fattr_init(fsstat->fattr);
1840 return rpc_call_sync(server->client, &msg, 0); 2036 return rpc_call_sync(server->client, &msg, 0);
1841} 2037}
1842 2038
@@ -1883,7 +2079,7 @@ static int nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, str
1883 2079
1884static int nfs4_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *fsinfo) 2080static int nfs4_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *fsinfo)
1885{ 2081{
1886 fsinfo->fattr->valid = 0; 2082 nfs_fattr_init(fsinfo->fattr);
1887 return nfs4_do_fsinfo(server, fhandle, fsinfo); 2083 return nfs4_do_fsinfo(server, fhandle, fsinfo);
1888} 2084}
1889 2085
@@ -1906,7 +2102,7 @@ static int _nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle
1906 return 0; 2102 return 0;
1907 } 2103 }
1908 2104
1909 pathconf->fattr->valid = 0; 2105 nfs_fattr_init(pathconf->fattr);
1910 return rpc_call_sync(server->client, &msg, 0); 2106 return rpc_call_sync(server->client, &msg, 0);
1911} 2107}
1912 2108
@@ -1973,8 +2169,10 @@ nfs4_write_done(struct rpc_task *task)
1973 rpc_restart_call(task); 2169 rpc_restart_call(task);
1974 return; 2170 return;
1975 } 2171 }
1976 if (task->tk_status >= 0) 2172 if (task->tk_status >= 0) {
1977 renew_lease(NFS_SERVER(inode), data->timestamp); 2173 renew_lease(NFS_SERVER(inode), data->timestamp);
2174 nfs_post_op_update_inode(inode, data->res.fattr);
2175 }
1978 /* Call back common NFS writeback processing */ 2176 /* Call back common NFS writeback processing */
1979 nfs_writeback_done(task); 2177 nfs_writeback_done(task);
1980} 2178}
@@ -1990,6 +2188,7 @@ nfs4_proc_write_setup(struct nfs_write_data *data, int how)
1990 .rpc_cred = data->cred, 2188 .rpc_cred = data->cred,
1991 }; 2189 };
1992 struct inode *inode = data->inode; 2190 struct inode *inode = data->inode;
2191 struct nfs_server *server = NFS_SERVER(inode);
1993 int stable; 2192 int stable;
1994 int flags; 2193 int flags;
1995 2194
@@ -2001,6 +2200,8 @@ nfs4_proc_write_setup(struct nfs_write_data *data, int how)
2001 } else 2200 } else
2002 stable = NFS_UNSTABLE; 2201 stable = NFS_UNSTABLE;
2003 data->args.stable = stable; 2202 data->args.stable = stable;
2203 data->args.bitmask = server->attr_bitmask;
2204 data->res.server = server;
2004 2205
2005 data->timestamp = jiffies; 2206 data->timestamp = jiffies;
2006 2207
@@ -2022,6 +2223,8 @@ nfs4_commit_done(struct rpc_task *task)
2022 rpc_restart_call(task); 2223 rpc_restart_call(task);
2023 return; 2224 return;
2024 } 2225 }
2226 if (task->tk_status >= 0)
2227 nfs_post_op_update_inode(inode, data->res.fattr);
2025 /* Call back common NFS writeback processing */ 2228 /* Call back common NFS writeback processing */
2026 nfs_commit_done(task); 2229 nfs_commit_done(task);
2027} 2230}
@@ -2037,8 +2240,12 @@ nfs4_proc_commit_setup(struct nfs_write_data *data, int how)
2037 .rpc_cred = data->cred, 2240 .rpc_cred = data->cred,
2038 }; 2241 };
2039 struct inode *inode = data->inode; 2242 struct inode *inode = data->inode;
2243 struct nfs_server *server = NFS_SERVER(inode);
2040 int flags; 2244 int flags;
2041 2245
2246 data->args.bitmask = server->attr_bitmask;
2247 data->res.server = server;
2248
2042 /* Set the initial flags for the task. */ 2249 /* Set the initial flags for the task. */
2043 flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC; 2250 flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC;
2044 2251
@@ -2106,65 +2313,6 @@ nfs4_proc_renew(struct nfs4_client *clp)
2106 return 0; 2313 return 0;
2107} 2314}
2108 2315
2109/*
2110 * We will need to arrange for the VFS layer to provide an atomic open.
2111 * Until then, this open method is prone to inefficiency and race conditions
2112 * due to the lookup, potential create, and open VFS calls from sys_open()
2113 * placed on the wire.
2114 */
2115static int
2116nfs4_proc_file_open(struct inode *inode, struct file *filp)
2117{
2118 struct dentry *dentry = filp->f_dentry;
2119 struct nfs_open_context *ctx;
2120 struct nfs4_state *state = NULL;
2121 struct rpc_cred *cred;
2122 int status = -ENOMEM;
2123
2124 dprintk("nfs4_proc_file_open: starting on (%.*s/%.*s)\n",
2125 (int)dentry->d_parent->d_name.len,
2126 dentry->d_parent->d_name.name,
2127 (int)dentry->d_name.len, dentry->d_name.name);
2128
2129
2130 /* Find our open stateid */
2131 cred = rpcauth_lookupcred(NFS_SERVER(inode)->client->cl_auth, 0);
2132 if (IS_ERR(cred))
2133 return PTR_ERR(cred);
2134 ctx = alloc_nfs_open_context(dentry, cred);
2135 put_rpccred(cred);
2136 if (unlikely(ctx == NULL))
2137 return -ENOMEM;
2138 status = -EIO; /* ERACE actually */
2139 state = nfs4_find_state(inode, cred, filp->f_mode);
2140 if (unlikely(state == NULL))
2141 goto no_state;
2142 ctx->state = state;
2143 nfs4_close_state(state, filp->f_mode);
2144 ctx->mode = filp->f_mode;
2145 nfs_file_set_open_context(filp, ctx);
2146 put_nfs_open_context(ctx);
2147 if (filp->f_mode & FMODE_WRITE)
2148 nfs_begin_data_update(inode);
2149 return 0;
2150no_state:
2151 printk(KERN_WARNING "NFS: v4 raced in function %s\n", __FUNCTION__);
2152 put_nfs_open_context(ctx);
2153 return status;
2154}
2155
2156/*
2157 * Release our state
2158 */
2159static int
2160nfs4_proc_file_release(struct inode *inode, struct file *filp)
2161{
2162 if (filp->f_mode & FMODE_WRITE)
2163 nfs_end_data_update(inode);
2164 nfs_file_clear_open_context(filp);
2165 return 0;
2166}
2167
2168static inline int nfs4_server_supports_acls(struct nfs_server *server) 2316static inline int nfs4_server_supports_acls(struct nfs_server *server)
2169{ 2317{
2170 return (server->caps & NFS_CAP_ACLS) 2318 return (server->caps & NFS_CAP_ACLS)
@@ -2285,7 +2433,7 @@ static inline ssize_t nfs4_get_acl_uncached(struct inode *inode, void *buf, size
2285 return -ENOMEM; 2433 return -ENOMEM;
2286 args.acl_pages[0] = localpage; 2434 args.acl_pages[0] = localpage;
2287 args.acl_pgbase = 0; 2435 args.acl_pgbase = 0;
2288 args.acl_len = PAGE_SIZE; 2436 resp_len = args.acl_len = PAGE_SIZE;
2289 } else { 2437 } else {
2290 resp_buf = buf; 2438 resp_buf = buf;
2291 buf_to_pages(buf, buflen, args.acl_pages, &args.acl_pgbase); 2439 buf_to_pages(buf, buflen, args.acl_pages, &args.acl_pgbase);
@@ -2345,6 +2493,7 @@ static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen
2345 2493
2346 if (!nfs4_server_supports_acls(server)) 2494 if (!nfs4_server_supports_acls(server))
2347 return -EOPNOTSUPP; 2495 return -EOPNOTSUPP;
2496 nfs_inode_return_delegation(inode);
2348 buf_to_pages(buf, buflen, arg.acl_pages, &arg.acl_pgbase); 2497 buf_to_pages(buf, buflen, arg.acl_pages, &arg.acl_pgbase);
2349 ret = rpc_call_sync(NFS_SERVER(inode)->client, &msg, 0); 2498 ret = rpc_call_sync(NFS_SERVER(inode)->client, &msg, 0);
2350 if (ret == 0) 2499 if (ret == 0)
@@ -2353,7 +2502,7 @@ static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen
2353} 2502}
2354 2503
2355static int 2504static int
2356nfs4_async_handle_error(struct rpc_task *task, struct nfs_server *server) 2505nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server)
2357{ 2506{
2358 struct nfs4_client *clp = server->nfs4_state; 2507 struct nfs4_client *clp = server->nfs4_state;
2359 2508
@@ -2431,7 +2580,7 @@ static int nfs4_delay(struct rpc_clnt *clnt, long *timeout)
2431/* This is the error handling routine for processes that are allowed 2580/* This is the error handling routine for processes that are allowed
2432 * to sleep. 2581 * to sleep.
2433 */ 2582 */
2434int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_exception *exception) 2583int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception)
2435{ 2584{
2436 struct nfs4_client *clp = server->nfs4_state; 2585 struct nfs4_client *clp = server->nfs4_state;
2437 int ret = errorcode; 2586 int ret = errorcode;
@@ -2632,7 +2781,6 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock
2632 2781
2633 down_read(&clp->cl_sem); 2782 down_read(&clp->cl_sem);
2634 nlo.clientid = clp->cl_clientid; 2783 nlo.clientid = clp->cl_clientid;
2635 down(&state->lock_sema);
2636 status = nfs4_set_lock_state(state, request); 2784 status = nfs4_set_lock_state(state, request);
2637 if (status != 0) 2785 if (status != 0)
2638 goto out; 2786 goto out;
@@ -2659,7 +2807,6 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock
2659 status = 0; 2807 status = 0;
2660 } 2808 }
2661out: 2809out:
2662 up(&state->lock_sema);
2663 up_read(&clp->cl_sem); 2810 up_read(&clp->cl_sem);
2664 return status; 2811 return status;
2665} 2812}
@@ -2696,79 +2843,149 @@ static int do_vfs_lock(struct file *file, struct file_lock *fl)
2696 return res; 2843 return res;
2697} 2844}
2698 2845
2699static int _nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *request) 2846struct nfs4_unlockdata {
2847 struct nfs_lockargs arg;
2848 struct nfs_locku_opargs luargs;
2849 struct nfs_lockres res;
2850 struct nfs4_lock_state *lsp;
2851 struct nfs_open_context *ctx;
2852 atomic_t refcount;
2853 struct completion completion;
2854};
2855
2856static void nfs4_locku_release_calldata(struct nfs4_unlockdata *calldata)
2700{ 2857{
2701 struct inode *inode = state->inode; 2858 if (atomic_dec_and_test(&calldata->refcount)) {
2702 struct nfs_server *server = NFS_SERVER(inode); 2859 nfs_free_seqid(calldata->luargs.seqid);
2703 struct nfs4_client *clp = server->nfs4_state; 2860 nfs4_put_lock_state(calldata->lsp);
2704 struct nfs_lockargs arg = { 2861 put_nfs_open_context(calldata->ctx);
2705 .fh = NFS_FH(inode), 2862 kfree(calldata);
2706 .type = nfs4_lck_type(cmd, request), 2863 }
2707 .offset = request->fl_start, 2864}
2708 .length = nfs4_lck_length(request), 2865
2709 }; 2866static void nfs4_locku_complete(struct nfs4_unlockdata *calldata)
2710 struct nfs_lockres res = { 2867{
2711 .server = server, 2868 complete(&calldata->completion);
2712 }; 2869 nfs4_locku_release_calldata(calldata);
2870}
2871
2872static void nfs4_locku_done(struct rpc_task *task)
2873{
2874 struct nfs4_unlockdata *calldata = (struct nfs4_unlockdata *)task->tk_calldata;
2875
2876 nfs_increment_lock_seqid(task->tk_status, calldata->luargs.seqid);
2877 switch (task->tk_status) {
2878 case 0:
2879 memcpy(calldata->lsp->ls_stateid.data,
2880 calldata->res.u.stateid.data,
2881 sizeof(calldata->lsp->ls_stateid.data));
2882 break;
2883 case -NFS4ERR_STALE_STATEID:
2884 case -NFS4ERR_EXPIRED:
2885 nfs4_schedule_state_recovery(calldata->res.server->nfs4_state);
2886 break;
2887 default:
2888 if (nfs4_async_handle_error(task, calldata->res.server) == -EAGAIN) {
2889 rpc_restart_call(task);
2890 return;
2891 }
2892 }
2893 nfs4_locku_complete(calldata);
2894}
2895
2896static void nfs4_locku_begin(struct rpc_task *task)
2897{
2898 struct nfs4_unlockdata *calldata = (struct nfs4_unlockdata *)task->tk_calldata;
2713 struct rpc_message msg = { 2899 struct rpc_message msg = {
2714 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOCKU], 2900 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOCKU],
2715 .rpc_argp = &arg, 2901 .rpc_argp = &calldata->arg,
2716 .rpc_resp = &res, 2902 .rpc_resp = &calldata->res,
2717 .rpc_cred = state->owner->so_cred, 2903 .rpc_cred = calldata->lsp->ls_state->owner->so_cred,
2718 }; 2904 };
2905 int status;
2906
2907 status = nfs_wait_on_sequence(calldata->luargs.seqid, task);
2908 if (status != 0)
2909 return;
2910 if ((calldata->lsp->ls_flags & NFS_LOCK_INITIALIZED) == 0) {
2911 nfs4_locku_complete(calldata);
2912 task->tk_exit = NULL;
2913 rpc_exit(task, 0);
2914 return;
2915 }
2916 rpc_call_setup(task, &msg, 0);
2917}
2918
2919static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *request)
2920{
2921 struct nfs4_unlockdata *calldata;
2922 struct inode *inode = state->inode;
2923 struct nfs_server *server = NFS_SERVER(inode);
2719 struct nfs4_lock_state *lsp; 2924 struct nfs4_lock_state *lsp;
2720 struct nfs_locku_opargs luargs;
2721 int status; 2925 int status;
2722 2926
2723 down_read(&clp->cl_sem);
2724 down(&state->lock_sema);
2725 status = nfs4_set_lock_state(state, request); 2927 status = nfs4_set_lock_state(state, request);
2726 if (status != 0) 2928 if (status != 0)
2727 goto out; 2929 return status;
2728 lsp = request->fl_u.nfs4_fl.owner; 2930 lsp = request->fl_u.nfs4_fl.owner;
2729 /* We might have lost the locks! */ 2931 /* We might have lost the locks! */
2730 if ((lsp->ls_flags & NFS_LOCK_INITIALIZED) == 0) 2932 if ((lsp->ls_flags & NFS_LOCK_INITIALIZED) == 0)
2731 goto out; 2933 return 0;
2732 luargs.seqid = lsp->ls_seqid; 2934 calldata = kmalloc(sizeof(*calldata), GFP_KERNEL);
2733 memcpy(&luargs.stateid, &lsp->ls_stateid, sizeof(luargs.stateid)); 2935 if (calldata == NULL)
2734 arg.u.locku = &luargs; 2936 return -ENOMEM;
2735 status = rpc_call_sync(server->client, &msg, RPC_TASK_NOINTR); 2937 calldata->luargs.seqid = nfs_alloc_seqid(&lsp->ls_seqid);
2736 nfs4_increment_lock_seqid(status, lsp); 2938 if (calldata->luargs.seqid == NULL) {
2737 2939 kfree(calldata);
2738 if (status == 0) 2940 return -ENOMEM;
2739 memcpy(&lsp->ls_stateid, &res.u.stateid, 2941 }
2740 sizeof(lsp->ls_stateid)); 2942 calldata->luargs.stateid = &lsp->ls_stateid;
2741out: 2943 calldata->arg.fh = NFS_FH(inode);
2742 up(&state->lock_sema); 2944 calldata->arg.type = nfs4_lck_type(cmd, request);
2945 calldata->arg.offset = request->fl_start;
2946 calldata->arg.length = nfs4_lck_length(request);
2947 calldata->arg.u.locku = &calldata->luargs;
2948 calldata->res.server = server;
2949 calldata->lsp = lsp;
2950 atomic_inc(&lsp->ls_count);
2951
2952 /* Ensure we don't close file until we're done freeing locks! */
2953 calldata->ctx = get_nfs_open_context((struct nfs_open_context*)request->fl_file->private_data);
2954
2955 atomic_set(&calldata->refcount, 2);
2956 init_completion(&calldata->completion);
2957
2958 status = nfs4_call_async(NFS_SERVER(inode)->client, nfs4_locku_begin,
2959 nfs4_locku_done, calldata);
2743 if (status == 0) 2960 if (status == 0)
2744 do_vfs_lock(request->fl_file, request); 2961 wait_for_completion_interruptible(&calldata->completion);
2745 up_read(&clp->cl_sem); 2962 do_vfs_lock(request->fl_file, request);
2963 nfs4_locku_release_calldata(calldata);
2746 return status; 2964 return status;
2747} 2965}
2748 2966
2749static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *request)
2750{
2751 struct nfs4_exception exception = { };
2752 int err;
2753
2754 do {
2755 err = nfs4_handle_exception(NFS_SERVER(state->inode),
2756 _nfs4_proc_unlck(state, cmd, request),
2757 &exception);
2758 } while (exception.retry);
2759 return err;
2760}
2761
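The unlock path above replaces a synchronous RPC with an async call whose calldata is shared by two parties: the waiting caller and the RPC completion callback. atomic_set(&calldata->refcount, 2) hands one reference to each, nfs4_locku_done() signals the completion, and nfs4_locku_release_calldata() presumably drops a reference and frees on the last put (its body is not in this hunk). A minimal userspace sketch of the same lifetime pattern, with pthreads standing in for the kernel's completion and atomic refcount; all names here are illustrative, not taken from the patch:

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdlib.h>

    struct calldata {
        atomic_int refcount;        /* starts at 2: caller + RPC callback */
        pthread_mutex_t lock;
        pthread_cond_t done_cv;
        int done;                   /* stands in for struct completion */
    };

    /* Callback side: signal completion, then drop its reference. */
    static void calldata_complete(struct calldata *cd)
    {
        pthread_mutex_lock(&cd->lock);
        cd->done = 1;
        pthread_cond_broadcast(&cd->done_cv);
        pthread_mutex_unlock(&cd->lock);
    }

    /* Either side: whoever drops the count to zero frees the structure. */
    static void calldata_release(struct calldata *cd)
    {
        if (atomic_fetch_sub(&cd->refcount, 1) == 1) {
            pthread_mutex_destroy(&cd->lock);
            pthread_cond_destroy(&cd->done_cv);
            free(cd);
        }
    }

    /* Caller side: wait for the callback before releasing. */
    static void calldata_wait(struct calldata *cd)
    {
        pthread_mutex_lock(&cd->lock);
        while (!cd->done)
            pthread_cond_wait(&cd->done_cv, &cd->lock);
        pthread_mutex_unlock(&cd->lock);
    }

Either thread may finish last; because the free happens only on the final put, the callback can still run safely after wait_for_completion_interruptible() returns.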
2762static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *request, int reclaim) 2967static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *request, int reclaim)
2763{ 2968{
2764 struct inode *inode = state->inode; 2969 struct inode *inode = state->inode;
2765 struct nfs_server *server = NFS_SERVER(inode); 2970 struct nfs_server *server = NFS_SERVER(inode);
2766 struct nfs4_lock_state *lsp = request->fl_u.nfs4_fl.owner; 2971 struct nfs4_lock_state *lsp = request->fl_u.nfs4_fl.owner;
2972 struct nfs_lock_opargs largs = {
2973 .lock_stateid = &lsp->ls_stateid,
2974 .open_stateid = &state->stateid,
2975 .lock_owner = {
2976 .clientid = server->nfs4_state->cl_clientid,
2977 .id = lsp->ls_id,
2978 },
2979 .reclaim = reclaim,
2980 };
2767 struct nfs_lockargs arg = { 2981 struct nfs_lockargs arg = {
2768 .fh = NFS_FH(inode), 2982 .fh = NFS_FH(inode),
2769 .type = nfs4_lck_type(cmd, request), 2983 .type = nfs4_lck_type(cmd, request),
2770 .offset = request->fl_start, 2984 .offset = request->fl_start,
2771 .length = nfs4_lck_length(request), 2985 .length = nfs4_lck_length(request),
2986 .u = {
2987 .lock = &largs,
2988 },
2772 }; 2989 };
2773 struct nfs_lockres res = { 2990 struct nfs_lockres res = {
2774 .server = server, 2991 .server = server,
@@ -2779,53 +2996,39 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *r
2779 .rpc_resp = &res, 2996 .rpc_resp = &res,
2780 .rpc_cred = state->owner->so_cred, 2997 .rpc_cred = state->owner->so_cred,
2781 }; 2998 };
2782 struct nfs_lock_opargs largs = { 2999 int status = -ENOMEM;
2783 .reclaim = reclaim,
2784 .new_lock_owner = 0,
2785 };
2786 int status;
2787 3000
2788 if (!(lsp->ls_flags & NFS_LOCK_INITIALIZED)) { 3001 largs.lock_seqid = nfs_alloc_seqid(&lsp->ls_seqid);
3002 if (largs.lock_seqid == NULL)
3003 return -ENOMEM;
3004 if (!(lsp->ls_seqid.flags & NFS_SEQID_CONFIRMED)) {
2789 struct nfs4_state_owner *owner = state->owner; 3005 struct nfs4_state_owner *owner = state->owner;
2790 struct nfs_open_to_lock otl = { 3006
2791 .lock_owner = { 3007 largs.open_seqid = nfs_alloc_seqid(&owner->so_seqid);
2792 .clientid = server->nfs4_state->cl_clientid, 3008 if (largs.open_seqid == NULL)
2793 }, 3009 goto out;
2794 };
2795
2796 otl.lock_seqid = lsp->ls_seqid;
2797 otl.lock_owner.id = lsp->ls_id;
2798 memcpy(&otl.open_stateid, &state->stateid, sizeof(otl.open_stateid));
2799 largs.u.open_lock = &otl;
2800 largs.new_lock_owner = 1; 3010 largs.new_lock_owner = 1;
2801 arg.u.lock = &largs;
2802 down(&owner->so_sema);
2803 otl.open_seqid = owner->so_seqid;
2804 status = rpc_call_sync(server->client, &msg, RPC_TASK_NOINTR); 3011 status = rpc_call_sync(server->client, &msg, RPC_TASK_NOINTR);
2805 /* increment open_owner seqid on success, and 3012 /* increment open seqid on success, and seqid mutating errors */
2806 * seqid mutating errors */ 3013 if (largs.new_lock_owner != 0) {
2807 nfs4_increment_seqid(status, owner); 3014 nfs_increment_open_seqid(status, largs.open_seqid);
2808 up(&owner->so_sema); 3015 if (status == 0)
2809 if (status == 0) { 3016 nfs_confirm_seqid(&lsp->ls_seqid, 0);
2810 lsp->ls_flags |= NFS_LOCK_INITIALIZED;
2811 lsp->ls_seqid++;
2812 } 3017 }
2813 } else { 3018 nfs_free_seqid(largs.open_seqid);
2814 struct nfs_exist_lock el = { 3019 } else
2815 .seqid = lsp->ls_seqid,
2816 };
2817 memcpy(&el.stateid, &lsp->ls_stateid, sizeof(el.stateid));
2818 largs.u.exist_lock = &el;
2819 arg.u.lock = &largs;
2820 status = rpc_call_sync(server->client, &msg, RPC_TASK_NOINTR); 3020 status = rpc_call_sync(server->client, &msg, RPC_TASK_NOINTR);
2821 /* increment seqid on success, and * seqid mutating errors*/ 3021 /* increment lock seqid on success, and seqid mutating errors */
2822 nfs4_increment_lock_seqid(status, lsp); 3022 nfs_increment_lock_seqid(status, largs.lock_seqid);
2823 }
2824 /* save the returned stateid. */ 3023 /* save the returned stateid. */
2825 if (status == 0) 3024 if (status == 0) {
2826 memcpy(&lsp->ls_stateid, &res.u.stateid, sizeof(nfs4_stateid)); 3025 memcpy(lsp->ls_stateid.data, res.u.stateid.data,
2827 else if (status == -NFS4ERR_DENIED) 3026 sizeof(lsp->ls_stateid.data));
3027 lsp->ls_flags |= NFS_LOCK_INITIALIZED;
3028 } else if (status == -NFS4ERR_DENIED)
2828 status = -EAGAIN; 3029 status = -EAGAIN;
3030out:
3031 nfs_free_seqid(largs.lock_seqid);
2829 return status; 3032 return status;
2830} 3033}
2831 3034
@@ -2865,11 +3068,9 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock
2865 int status; 3068 int status;
2866 3069
2867 down_read(&clp->cl_sem); 3070 down_read(&clp->cl_sem);
2868 down(&state->lock_sema);
2869 status = nfs4_set_lock_state(state, request); 3071 status = nfs4_set_lock_state(state, request);
2870 if (status == 0) 3072 if (status == 0)
2871 status = _nfs4_do_setlk(state, cmd, request, 0); 3073 status = _nfs4_do_setlk(state, cmd, request, 0);
2872 up(&state->lock_sema);
2873 if (status == 0) { 3074 if (status == 0) {
2874 /* Note: we always want to sleep here! */ 3075 /* Note: we always want to sleep here! */
2875 request->fl_flags |= FL_SLEEP; 3076 request->fl_flags |= FL_SLEEP;
@@ -3024,8 +3225,8 @@ struct nfs_rpc_ops nfs_v4_clientops = {
3024 .read_setup = nfs4_proc_read_setup, 3225 .read_setup = nfs4_proc_read_setup,
3025 .write_setup = nfs4_proc_write_setup, 3226 .write_setup = nfs4_proc_write_setup,
3026 .commit_setup = nfs4_proc_commit_setup, 3227 .commit_setup = nfs4_proc_commit_setup,
3027 .file_open = nfs4_proc_file_open, 3228 .file_open = nfs_open,
3028 .file_release = nfs4_proc_file_release, 3229 .file_release = nfs_release,
3029 .lock = nfs4_proc_lock, 3230 .lock = nfs4_proc_lock,
3030 .clear_acl_cache = nfs4_zap_acl_attr, 3231 .clear_acl_cache = nfs4_zap_acl_attr,
3031}; 3232};
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index afe587d82f1e..2d5a6a2b9dec 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -264,13 +264,16 @@ nfs4_alloc_state_owner(void)
264{ 264{
265 struct nfs4_state_owner *sp; 265 struct nfs4_state_owner *sp;
266 266
267 sp = kmalloc(sizeof(*sp),GFP_KERNEL); 267 sp = kzalloc(sizeof(*sp),GFP_KERNEL);
268 if (!sp) 268 if (!sp)
269 return NULL; 269 return NULL;
270 init_MUTEX(&sp->so_sema); 270 spin_lock_init(&sp->so_lock);
271 sp->so_seqid = 0; /* arbitrary */
272 INIT_LIST_HEAD(&sp->so_states); 271 INIT_LIST_HEAD(&sp->so_states);
273 INIT_LIST_HEAD(&sp->so_delegations); 272 INIT_LIST_HEAD(&sp->so_delegations);
273 rpc_init_wait_queue(&sp->so_sequence.wait, "Seqid_waitqueue");
274 sp->so_seqid.sequence = &sp->so_sequence;
275 spin_lock_init(&sp->so_sequence.lock);
276 INIT_LIST_HEAD(&sp->so_sequence.list);
274 atomic_set(&sp->so_count, 1); 277 atomic_set(&sp->so_count, 1);
275 return sp; 278 return sp;
276} 279}
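The initializations above imply how the new sequencing types fit together. The sketch below is reconstructed purely from the field accesses in this patch (so_seqid.sequence, so_sequence.wait/lock/list, ls_seqid.counter and .flags), so treat layout and exact types as assumptions rather than quotes from the headers:

    /* One per state owner: serializes seqid-mutating RPCs. */
    struct rpc_sequence {
        struct rpc_wait_queue wait;     /* tasks waiting their turn */
        spinlock_t lock;                /* protects the list below */
        struct list_head list;          /* FIFO of in-flight nfs_seqid */
    };

    /* Embedded as nfs4_state_owner.so_seqid and nfs4_lock_state.ls_seqid. */
    struct nfs_seqid_counter {
        struct rpc_sequence *sequence;
        int flags;                      /* e.g. NFS_SEQID_CONFIRMED */
        u32 counter;                    /* the value sent on the wire */
    };

    /* One per seqid-mutating operation, allocated by nfs_alloc_seqid(). */
    struct nfs_seqid {
        struct nfs_seqid_counter *sequence;
        struct list_head list;          /* links into rpc_sequence.list */
    };

Note that nfs4_alloc_lock_state() below points ls_seqid.sequence at the owner's so_sequence, so OPEN and LOCK seqid-mutating calls for one owner wait on a single FIFO.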
@@ -359,7 +362,6 @@ nfs4_alloc_open_state(void)
359 memset(state->stateid.data, 0, sizeof(state->stateid.data)); 362 memset(state->stateid.data, 0, sizeof(state->stateid.data));
360 atomic_set(&state->count, 1); 363 atomic_set(&state->count, 1);
361 INIT_LIST_HEAD(&state->lock_states); 364 INIT_LIST_HEAD(&state->lock_states);
362 init_MUTEX(&state->lock_sema);
363 spin_lock_init(&state->state_lock); 365 spin_lock_init(&state->state_lock);
364 return state; 366 return state;
365} 367}
@@ -437,21 +439,23 @@ nfs4_get_open_state(struct inode *inode, struct nfs4_state_owner *owner)
437 if (state) 439 if (state)
438 goto out; 440 goto out;
439 new = nfs4_alloc_open_state(); 441 new = nfs4_alloc_open_state();
442 spin_lock(&owner->so_lock);
440 spin_lock(&inode->i_lock); 443 spin_lock(&inode->i_lock);
441 state = __nfs4_find_state_byowner(inode, owner); 444 state = __nfs4_find_state_byowner(inode, owner);
442 if (state == NULL && new != NULL) { 445 if (state == NULL && new != NULL) {
443 state = new; 446 state = new;
444 /* Caller *must* be holding owner->so_sem */
445 /* Note: The reclaim code dictates that we add stateless
446 * and read-only stateids to the end of the list */
447 list_add_tail(&state->open_states, &owner->so_states);
448 state->owner = owner; 447 state->owner = owner;
449 atomic_inc(&owner->so_count); 448 atomic_inc(&owner->so_count);
450 list_add(&state->inode_states, &nfsi->open_states); 449 list_add(&state->inode_states, &nfsi->open_states);
451 state->inode = igrab(inode); 450 state->inode = igrab(inode);
452 spin_unlock(&inode->i_lock); 451 spin_unlock(&inode->i_lock);
452 /* Note: The reclaim code dictates that we add stateless
453 * and read-only stateids to the end of the list */
454 list_add_tail(&state->open_states, &owner->so_states);
455 spin_unlock(&owner->so_lock);
453 } else { 456 } else {
454 spin_unlock(&inode->i_lock); 457 spin_unlock(&inode->i_lock);
458 spin_unlock(&owner->so_lock);
455 if (new) 459 if (new)
456 nfs4_free_open_state(new); 460 nfs4_free_open_state(new);
457 } 461 }
@@ -461,19 +465,21 @@ out:
461 465
462/* 466/*
463 * Beware! Caller must be holding exactly one 467 * Beware! Caller must be holding exactly one
464 * reference to clp->cl_sem and owner->so_sema! 468 * reference to clp->cl_sem!
465 */ 469 */
466void nfs4_put_open_state(struct nfs4_state *state) 470void nfs4_put_open_state(struct nfs4_state *state)
467{ 471{
468 struct inode *inode = state->inode; 472 struct inode *inode = state->inode;
469 struct nfs4_state_owner *owner = state->owner; 473 struct nfs4_state_owner *owner = state->owner;
470 474
471 if (!atomic_dec_and_lock(&state->count, &inode->i_lock)) 475 if (!atomic_dec_and_lock(&state->count, &owner->so_lock))
472 return; 476 return;
477 spin_lock(&inode->i_lock);
473 if (!list_empty(&state->inode_states)) 478 if (!list_empty(&state->inode_states))
474 list_del(&state->inode_states); 479 list_del(&state->inode_states);
475 spin_unlock(&inode->i_lock);
476 list_del(&state->open_states); 480 list_del(&state->open_states);
481 spin_unlock(&inode->i_lock);
482 spin_unlock(&owner->so_lock);
477 iput(inode); 483 iput(inode);
478 BUG_ON (state->state != 0); 484 BUG_ON (state->state != 0);
479 nfs4_free_open_state(state); 485 nfs4_free_open_state(state);
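nfs4_put_open_state() now hinges on atomic_dec_and_lock(): the reference count drops lock-free unless this put is the final one, in which case owner->so_lock is acquired atomically with the decrement to zero. Roughly equivalent logic, spelled out as a userspace sketch (illustrative names; the kernel primitive is more careful about its fast path):

    #include <pthread.h>
    #include <stdatomic.h>

    /* Returns 1 with the mutex held iff the counter dropped to zero. */
    static int dec_and_lock(atomic_int *cnt, pthread_mutex_t *mu)
    {
        int old = atomic_load(cnt);

        /* Fast path: not the last reference, so no locking needed. */
        while (old > 1) {
            if (atomic_compare_exchange_weak(cnt, &old, old - 1))
                return 0;
        }
        /* Slow path: take the lock, then perform the final decrement. */
        pthread_mutex_lock(mu);
        if (atomic_fetch_sub(cnt, 1) == 1)
            return 1;   /* caller unlocks after cleanup */
        pthread_mutex_unlock(mu);
        return 0;
    }

With so_lock held across the final decrement, the list_del of open_states cannot race nfs4_get_open_state(), which takes so_lock before i_lock in the same order shown above.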
@@ -481,20 +487,17 @@ void nfs4_put_open_state(struct nfs4_state *state)
481} 487}
482 488
483/* 489/*
484 * Beware! Caller must be holding no references to clp->cl_sem! 490 * Close the current file.
485 * of owner->so_sema!
486 */ 491 */
487void nfs4_close_state(struct nfs4_state *state, mode_t mode) 492void nfs4_close_state(struct nfs4_state *state, mode_t mode)
488{ 493{
489 struct inode *inode = state->inode; 494 struct inode *inode = state->inode;
490 struct nfs4_state_owner *owner = state->owner; 495 struct nfs4_state_owner *owner = state->owner;
491 struct nfs4_client *clp = owner->so_client;
492 int newstate; 496 int newstate;
493 497
494 atomic_inc(&owner->so_count); 498 atomic_inc(&owner->so_count);
495 down_read(&clp->cl_sem);
496 down(&owner->so_sema);
497 /* Protect against nfs4_find_state() */ 499 /* Protect against nfs4_find_state() */
500 spin_lock(&owner->so_lock);
498 spin_lock(&inode->i_lock); 501 spin_lock(&inode->i_lock);
499 if (mode & FMODE_READ) 502 if (mode & FMODE_READ)
500 state->nreaders--; 503 state->nreaders--;
@@ -507,6 +510,7 @@ void nfs4_close_state(struct nfs4_state *state, mode_t mode)
507 list_move_tail(&state->open_states, &owner->so_states); 510 list_move_tail(&state->open_states, &owner->so_states);
508 } 511 }
509 spin_unlock(&inode->i_lock); 512 spin_unlock(&inode->i_lock);
513 spin_unlock(&owner->so_lock);
510 newstate = 0; 514 newstate = 0;
511 if (state->state != 0) { 515 if (state->state != 0) {
512 if (state->nreaders) 516 if (state->nreaders)
@@ -515,14 +519,16 @@ void nfs4_close_state(struct nfs4_state *state, mode_t mode)
515 newstate |= FMODE_WRITE; 519 newstate |= FMODE_WRITE;
516 if (state->state == newstate) 520 if (state->state == newstate)
517 goto out; 521 goto out;
518 if (nfs4_do_close(inode, state, newstate) == -EINPROGRESS) 522 if (test_bit(NFS_DELEGATED_STATE, &state->flags)) {
523 state->state = newstate;
524 goto out;
525 }
526 if (nfs4_do_close(inode, state, newstate) == 0)
519 return; 527 return;
520 } 528 }
521out: 529out:
522 nfs4_put_open_state(state); 530 nfs4_put_open_state(state);
523 up(&owner->so_sema);
524 nfs4_put_state_owner(owner); 531 nfs4_put_state_owner(owner);
525 up_read(&clp->cl_sem);
526} 532}
527 533
528/* 534/*
@@ -546,19 +552,16 @@ __nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner)
546 * Return a compatible lock_state. If no initialized lock_state structure 552 * Return a compatible lock_state. If no initialized lock_state structure
547 * exists, return an uninitialized one. 553 * exists, return an uninitialized one.
548 * 554 *
549 * The caller must be holding state->lock_sema
550 */ 555 */
551static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner) 556static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner)
552{ 557{
553 struct nfs4_lock_state *lsp; 558 struct nfs4_lock_state *lsp;
554 struct nfs4_client *clp = state->owner->so_client; 559 struct nfs4_client *clp = state->owner->so_client;
555 560
556 lsp = kmalloc(sizeof(*lsp), GFP_KERNEL); 561 lsp = kzalloc(sizeof(*lsp), GFP_KERNEL);
557 if (lsp == NULL) 562 if (lsp == NULL)
558 return NULL; 563 return NULL;
559 lsp->ls_flags = 0; 564 lsp->ls_seqid.sequence = &state->owner->so_sequence;
560 lsp->ls_seqid = 0; /* arbitrary */
561 memset(lsp->ls_stateid.data, 0, sizeof(lsp->ls_stateid.data));
562 atomic_set(&lsp->ls_count, 1); 565 atomic_set(&lsp->ls_count, 1);
563 lsp->ls_owner = fl_owner; 566 lsp->ls_owner = fl_owner;
564 spin_lock(&clp->cl_lock); 567 spin_lock(&clp->cl_lock);
@@ -572,7 +575,7 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
572 * Return a compatible lock_state. If no initialized lock_state structure 575 * Return a compatible lock_state. If no initialized lock_state structure
573 * exists, return an uninitialized one. 576 * exists, return an uninitialized one.
574 * 577 *
575 * The caller must be holding state->lock_sema and clp->cl_sem 578 * The caller must be holding clp->cl_sem
576 */ 579 */
577static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner) 580static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner)
578{ 581{
@@ -605,7 +608,7 @@ static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_
605 * Release reference to lock_state, and free it if we see that 608 * Release reference to lock_state, and free it if we see that
606 * it is no longer in use 609 * it is no longer in use
607 */ 610 */
608static void nfs4_put_lock_state(struct nfs4_lock_state *lsp) 611void nfs4_put_lock_state(struct nfs4_lock_state *lsp)
609{ 612{
610 struct nfs4_state *state; 613 struct nfs4_state *state;
611 614
@@ -673,29 +676,94 @@ void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t f
673 nfs4_put_lock_state(lsp); 676 nfs4_put_lock_state(lsp);
674} 677}
675 678
676/* 679struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter)
677* Called with state->lock_sema and clp->cl_sem held.
678*/
679void nfs4_increment_lock_seqid(int status, struct nfs4_lock_state *lsp)
680{ 680{
681 if (status == NFS_OK || seqid_mutating_err(-status)) 681 struct nfs_seqid *new;
682 lsp->ls_seqid++; 682
683 new = kmalloc(sizeof(*new), GFP_KERNEL);
684 if (new != NULL) {
685 new->sequence = counter;
686 INIT_LIST_HEAD(&new->list);
687 }
688 return new;
689}
690
691void nfs_free_seqid(struct nfs_seqid *seqid)
692{
693 struct rpc_sequence *sequence = seqid->sequence->sequence;
694
695 if (!list_empty(&seqid->list)) {
696 spin_lock(&sequence->lock);
697 list_del(&seqid->list);
698 spin_unlock(&sequence->lock);
699 }
700 rpc_wake_up_next(&sequence->wait);
701 kfree(seqid);
683} 702}
684 703
685/* 704/*
686* Called with sp->so_sema and clp->cl_sem held. 705 * Increment the seqid if the OPEN/OPEN_DOWNGRADE/CLOSE succeeded, or
687* 706 * failed with a seqid incrementing error -
688* Increment the seqid if the OPEN/OPEN_DOWNGRADE/CLOSE succeeded, or 707 * see comments nfs_fs.h:seqid_mutating_error()
689* failed with a seqid incrementing error - 708 */
690* see comments nfs_fs.h:seqid_mutating_error() 709static inline void nfs_increment_seqid(int status, struct nfs_seqid *seqid)
691*/ 710{
692void nfs4_increment_seqid(int status, struct nfs4_state_owner *sp) 711 switch (status) {
693{ 712 case 0:
694 if (status == NFS_OK || seqid_mutating_err(-status)) 713 break;
695 sp->so_seqid++; 714 case -NFS4ERR_BAD_SEQID:
696 /* If the server returns BAD_SEQID, unhash state_owner here */ 715 case -NFS4ERR_STALE_CLIENTID:
697 if (status == -NFS4ERR_BAD_SEQID) 716 case -NFS4ERR_STALE_STATEID:
717 case -NFS4ERR_BAD_STATEID:
718 case -NFS4ERR_BADXDR:
719 case -NFS4ERR_RESOURCE:
720 case -NFS4ERR_NOFILEHANDLE:
721 /* Non-seqid mutating errors */
722 return;
723 };
724 /*
725 * Note: no locking needed as we are guaranteed to be first
726 * on the sequence list
727 */
728 seqid->sequence->counter++;
729}
730
731void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid)
732{
733 if (status == -NFS4ERR_BAD_SEQID) {
734 struct nfs4_state_owner *sp = container_of(seqid->sequence,
735 struct nfs4_state_owner, so_seqid);
698 nfs4_drop_state_owner(sp); 736 nfs4_drop_state_owner(sp);
737 }
738 return nfs_increment_seqid(status, seqid);
739}
740
741/*
742 * Increment the seqid if the LOCK/LOCKU succeeded, or
743 * failed with a seqid incrementing error -
744 * see comments in nfs_fs.h:seqid_mutating_error()
745 */
746void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid)
747{
748 return nfs_increment_seqid(status, seqid);
749}
750
751int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task)
752{
753 struct rpc_sequence *sequence = seqid->sequence->sequence;
754 int status = 0;
755
756 if (sequence->list.next == &seqid->list)
757 goto out;
758 spin_lock(&sequence->lock);
759 if (!list_empty(&sequence->list)) {
760 rpc_sleep_on(&sequence->wait, task, NULL, NULL);
761 status = -EAGAIN;
762 } else
763 list_add(&seqid->list, &sequence->list);
764 spin_unlock(&sequence->lock);
765out:
766 return status;
699} 767}
700 768
701static int reclaimer(void *); 769static int reclaimer(void *);
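nfs_wait_on_sequence() above is the admission gate: an RPC may proceed only if its nfs_seqid is already at the head of the sequence list, or the list is empty and it can enqueue itself; everyone else sleeps until nfs_free_seqid() wakes the next waiter. The same FIFO gating in a compact userspace model (names invented for illustration; pthread blocking replaces the rpc_sleep_on()/-EAGAIN redrive the kernel uses):

    #include <pthread.h>
    #include <stddef.h>

    struct ticket { struct ticket *next; };

    struct gate {
        pthread_mutex_t lock;
        pthread_cond_t wake;
        struct ticket *head;    /* FIFO; init: head = NULL, tail = &head */
        struct ticket **tail;
    };

    /* Returns once t reaches the head of the queue and owns the gate. */
    static void gate_enter(struct gate *g, struct ticket *t)
    {
        pthread_mutex_lock(&g->lock);
        t->next = NULL;
        *g->tail = t;               /* enqueue in arrival order */
        g->tail = &t->next;
        while (g->head != t)        /* only the head may proceed */
            pthread_cond_wait(&g->wake, &g->lock);
        pthread_mutex_unlock(&g->lock);
    }

    /* Called by the head when its operation completes; admits the next. */
    static void gate_exit(struct gate *g, struct ticket *t)
    {
        pthread_mutex_lock(&g->lock);
        g->head = t->next;
        if (g->head == NULL)
            g->tail = &g->head;
        pthread_cond_broadcast(&g->wake);
        pthread_mutex_unlock(&g->lock);
    }

This single-admission rule is also what makes the "no locking needed as we are guaranteed to be first on the sequence list" comment in nfs_increment_seqid() safe.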
@@ -791,8 +859,6 @@ static int nfs4_reclaim_open_state(struct nfs4_state_recovery_ops *ops, struct n
791 if (state->state == 0) 859 if (state->state == 0)
792 continue; 860 continue;
793 status = ops->recover_open(sp, state); 861 status = ops->recover_open(sp, state);
794 list_for_each_entry(lock, &state->lock_states, ls_locks)
795 lock->ls_flags &= ~NFS_LOCK_INITIALIZED;
796 if (status >= 0) { 862 if (status >= 0) {
797 status = nfs4_reclaim_locks(ops, state); 863 status = nfs4_reclaim_locks(ops, state);
798 if (status < 0) 864 if (status < 0)
@@ -831,6 +897,28 @@ out_err:
831 return status; 897 return status;
832} 898}
833 899
900static void nfs4_state_mark_reclaim(struct nfs4_client *clp)
901{
902 struct nfs4_state_owner *sp;
903 struct nfs4_state *state;
904 struct nfs4_lock_state *lock;
905
906 /* Reset all sequence ids to zero */
907 list_for_each_entry(sp, &clp->cl_state_owners, so_list) {
908 sp->so_seqid.counter = 0;
909 sp->so_seqid.flags = 0;
910 spin_lock(&sp->so_lock);
911 list_for_each_entry(state, &sp->so_states, open_states) {
912 list_for_each_entry(lock, &state->lock_states, ls_locks) {
913 lock->ls_seqid.counter = 0;
914 lock->ls_seqid.flags = 0;
915 lock->ls_flags &= ~NFS_LOCK_INITIALIZED;
916 }
917 }
918 spin_unlock(&sp->so_lock);
919 }
920}
921
834static int reclaimer(void *ptr) 922static int reclaimer(void *ptr)
835{ 923{
836 struct reclaimer_args *args = (struct reclaimer_args *)ptr; 924 struct reclaimer_args *args = (struct reclaimer_args *)ptr;
@@ -864,6 +952,7 @@ restart_loop:
864 default: 952 default:
865 ops = &nfs4_network_partition_recovery_ops; 953 ops = &nfs4_network_partition_recovery_ops;
866 }; 954 };
955 nfs4_state_mark_reclaim(clp);
867 status = __nfs4_init_client(clp); 956 status = __nfs4_init_client(clp);
868 if (status) 957 if (status)
869 goto out_error; 958 goto out_error;
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 6c564ef9489e..fbbace8a30c4 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -95,6 +95,8 @@ static int nfs_stat_to_errno(int);
95#define decode_getattr_maxsz (op_decode_hdr_maxsz + nfs4_fattr_maxsz) 95#define decode_getattr_maxsz (op_decode_hdr_maxsz + nfs4_fattr_maxsz)
96#define encode_savefh_maxsz (op_encode_hdr_maxsz) 96#define encode_savefh_maxsz (op_encode_hdr_maxsz)
97#define decode_savefh_maxsz (op_decode_hdr_maxsz) 97#define decode_savefh_maxsz (op_decode_hdr_maxsz)
98#define encode_restorefh_maxsz (op_encode_hdr_maxsz)
99#define decode_restorefh_maxsz (op_decode_hdr_maxsz)
98#define encode_fsinfo_maxsz (op_encode_hdr_maxsz + 2) 100#define encode_fsinfo_maxsz (op_encode_hdr_maxsz + 2)
99#define decode_fsinfo_maxsz (op_decode_hdr_maxsz + 11) 101#define decode_fsinfo_maxsz (op_decode_hdr_maxsz + 11)
100#define encode_renew_maxsz (op_encode_hdr_maxsz + 3) 102#define encode_renew_maxsz (op_encode_hdr_maxsz + 3)
@@ -157,16 +159,20 @@ static int nfs_stat_to_errno(int);
157 op_decode_hdr_maxsz + 2) 159 op_decode_hdr_maxsz + 2)
158#define NFS4_enc_write_sz (compound_encode_hdr_maxsz + \ 160#define NFS4_enc_write_sz (compound_encode_hdr_maxsz + \
159 encode_putfh_maxsz + \ 161 encode_putfh_maxsz + \
160 op_encode_hdr_maxsz + 8) 162 op_encode_hdr_maxsz + 8 + \
163 encode_getattr_maxsz)
161#define NFS4_dec_write_sz (compound_decode_hdr_maxsz + \ 164#define NFS4_dec_write_sz (compound_decode_hdr_maxsz + \
162 decode_putfh_maxsz + \ 165 decode_putfh_maxsz + \
163 op_decode_hdr_maxsz + 4) 166 op_decode_hdr_maxsz + 4 + \
167 decode_getattr_maxsz)
164#define NFS4_enc_commit_sz (compound_encode_hdr_maxsz + \ 168#define NFS4_enc_commit_sz (compound_encode_hdr_maxsz + \
165 encode_putfh_maxsz + \ 169 encode_putfh_maxsz + \
166 op_encode_hdr_maxsz + 3) 170 op_encode_hdr_maxsz + 3 + \
171 encode_getattr_maxsz)
167#define NFS4_dec_commit_sz (compound_decode_hdr_maxsz + \ 172#define NFS4_dec_commit_sz (compound_decode_hdr_maxsz + \
168 decode_putfh_maxsz + \ 173 decode_putfh_maxsz + \
169 op_decode_hdr_maxsz + 2) 174 op_decode_hdr_maxsz + 2 + \
175 decode_getattr_maxsz)
170#define NFS4_enc_open_sz (compound_encode_hdr_maxsz + \ 176#define NFS4_enc_open_sz (compound_encode_hdr_maxsz + \
171 encode_putfh_maxsz + \ 177 encode_putfh_maxsz + \
172 op_encode_hdr_maxsz + \ 178 op_encode_hdr_maxsz + \
@@ -196,17 +202,21 @@ static int nfs_stat_to_errno(int);
196#define NFS4_enc_open_downgrade_sz \ 202#define NFS4_enc_open_downgrade_sz \
197 (compound_encode_hdr_maxsz + \ 203 (compound_encode_hdr_maxsz + \
198 encode_putfh_maxsz + \ 204 encode_putfh_maxsz + \
199 op_encode_hdr_maxsz + 7) 205 op_encode_hdr_maxsz + 7 + \
206 encode_getattr_maxsz)
200#define NFS4_dec_open_downgrade_sz \ 207#define NFS4_dec_open_downgrade_sz \
201 (compound_decode_hdr_maxsz + \ 208 (compound_decode_hdr_maxsz + \
202 decode_putfh_maxsz + \ 209 decode_putfh_maxsz + \
203 op_decode_hdr_maxsz + 4) 210 op_decode_hdr_maxsz + 4 + \
211 decode_getattr_maxsz)
204#define NFS4_enc_close_sz (compound_encode_hdr_maxsz + \ 212#define NFS4_enc_close_sz (compound_encode_hdr_maxsz + \
205 encode_putfh_maxsz + \ 213 encode_putfh_maxsz + \
206 op_encode_hdr_maxsz + 5) 214 op_encode_hdr_maxsz + 5 + \
215 encode_getattr_maxsz)
207#define NFS4_dec_close_sz (compound_decode_hdr_maxsz + \ 216#define NFS4_dec_close_sz (compound_decode_hdr_maxsz + \
208 decode_putfh_maxsz + \ 217 decode_putfh_maxsz + \
209 op_decode_hdr_maxsz + 4) 218 op_decode_hdr_maxsz + 4 + \
219 decode_getattr_maxsz)
210#define NFS4_enc_setattr_sz (compound_encode_hdr_maxsz + \ 220#define NFS4_enc_setattr_sz (compound_encode_hdr_maxsz + \
211 encode_putfh_maxsz + \ 221 encode_putfh_maxsz + \
212 op_encode_hdr_maxsz + 4 + \ 222 op_encode_hdr_maxsz + 4 + \
@@ -300,30 +310,44 @@ static int nfs_stat_to_errno(int);
300 decode_getfh_maxsz) 310 decode_getfh_maxsz)
301#define NFS4_enc_remove_sz (compound_encode_hdr_maxsz + \ 311#define NFS4_enc_remove_sz (compound_encode_hdr_maxsz + \
302 encode_putfh_maxsz + \ 312 encode_putfh_maxsz + \
303 encode_remove_maxsz) 313 encode_remove_maxsz + \
314 encode_getattr_maxsz)
304#define NFS4_dec_remove_sz (compound_decode_hdr_maxsz + \ 315#define NFS4_dec_remove_sz (compound_decode_hdr_maxsz + \
305 decode_putfh_maxsz + \ 316 decode_putfh_maxsz + \
306 op_decode_hdr_maxsz + 5) 317 op_decode_hdr_maxsz + 5 + \
318 decode_getattr_maxsz)
307#define NFS4_enc_rename_sz (compound_encode_hdr_maxsz + \ 319#define NFS4_enc_rename_sz (compound_encode_hdr_maxsz + \
308 encode_putfh_maxsz + \ 320 encode_putfh_maxsz + \
309 encode_savefh_maxsz + \ 321 encode_savefh_maxsz + \
310 encode_putfh_maxsz + \ 322 encode_putfh_maxsz + \
311 encode_rename_maxsz) 323 encode_rename_maxsz + \
324 encode_getattr_maxsz + \
325 encode_restorefh_maxsz + \
326 encode_getattr_maxsz)
312#define NFS4_dec_rename_sz (compound_decode_hdr_maxsz + \ 327#define NFS4_dec_rename_sz (compound_decode_hdr_maxsz + \
313 decode_putfh_maxsz + \ 328 decode_putfh_maxsz + \
314 decode_savefh_maxsz + \ 329 decode_savefh_maxsz + \
315 decode_putfh_maxsz + \ 330 decode_putfh_maxsz + \
316 decode_rename_maxsz) 331 decode_rename_maxsz + \
332 decode_getattr_maxsz + \
333 decode_restorefh_maxsz + \
334 decode_getattr_maxsz)
317#define NFS4_enc_link_sz (compound_encode_hdr_maxsz + \ 335#define NFS4_enc_link_sz (compound_encode_hdr_maxsz + \
318 encode_putfh_maxsz + \ 336 encode_putfh_maxsz + \
319 encode_savefh_maxsz + \ 337 encode_savefh_maxsz + \
320 encode_putfh_maxsz + \ 338 encode_putfh_maxsz + \
321 encode_link_maxsz) 339 encode_link_maxsz + \
340 decode_getattr_maxsz + \
341 encode_restorefh_maxsz + \
342 decode_getattr_maxsz)
322#define NFS4_dec_link_sz (compound_decode_hdr_maxsz + \ 343#define NFS4_dec_link_sz (compound_decode_hdr_maxsz + \
323 decode_putfh_maxsz + \ 344 decode_putfh_maxsz + \
324 decode_savefh_maxsz + \ 345 decode_savefh_maxsz + \
325 decode_putfh_maxsz + \ 346 decode_putfh_maxsz + \
326 decode_link_maxsz) 347 decode_link_maxsz + \
348 decode_getattr_maxsz + \
349 decode_restorefh_maxsz + \
350 decode_getattr_maxsz)
327#define NFS4_enc_symlink_sz (compound_encode_hdr_maxsz + \ 351#define NFS4_enc_symlink_sz (compound_encode_hdr_maxsz + \
328 encode_putfh_maxsz + \ 352 encode_putfh_maxsz + \
329 encode_symlink_maxsz + \ 353 encode_symlink_maxsz + \
@@ -336,14 +360,20 @@ static int nfs_stat_to_errno(int);
336 decode_getfh_maxsz) 360 decode_getfh_maxsz)
337#define NFS4_enc_create_sz (compound_encode_hdr_maxsz + \ 361#define NFS4_enc_create_sz (compound_encode_hdr_maxsz + \
338 encode_putfh_maxsz + \ 362 encode_putfh_maxsz + \
363 encode_savefh_maxsz + \
339 encode_create_maxsz + \ 364 encode_create_maxsz + \
365 encode_getfh_maxsz + \
340 encode_getattr_maxsz + \ 366 encode_getattr_maxsz + \
341 encode_getfh_maxsz) 367 encode_restorefh_maxsz + \
368 encode_getattr_maxsz)
342#define NFS4_dec_create_sz (compound_decode_hdr_maxsz + \ 369#define NFS4_dec_create_sz (compound_decode_hdr_maxsz + \
343 decode_putfh_maxsz + \ 370 decode_putfh_maxsz + \
371 decode_savefh_maxsz + \
344 decode_create_maxsz + \ 372 decode_create_maxsz + \
373 decode_getfh_maxsz + \
345 decode_getattr_maxsz + \ 374 decode_getattr_maxsz + \
346 decode_getfh_maxsz) 375 decode_restorefh_maxsz + \
376 decode_getattr_maxsz)
347#define NFS4_enc_pathconf_sz (compound_encode_hdr_maxsz + \ 377#define NFS4_enc_pathconf_sz (compound_encode_hdr_maxsz + \
348 encode_putfh_maxsz + \ 378 encode_putfh_maxsz + \
349 encode_getattr_maxsz) 379 encode_getattr_maxsz)
@@ -602,10 +632,10 @@ static int encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg)
602{ 632{
603 uint32_t *p; 633 uint32_t *p;
604 634
605 RESERVE_SPACE(8+sizeof(arg->stateid.data)); 635 RESERVE_SPACE(8+sizeof(arg->stateid->data));
606 WRITE32(OP_CLOSE); 636 WRITE32(OP_CLOSE);
607 WRITE32(arg->seqid); 637 WRITE32(arg->seqid->sequence->counter);
608 WRITEMEM(arg->stateid.data, sizeof(arg->stateid.data)); 638 WRITEMEM(arg->stateid->data, sizeof(arg->stateid->data));
609 639
610 return 0; 640 return 0;
611} 641}
@@ -729,22 +759,18 @@ static int encode_lock(struct xdr_stream *xdr, const struct nfs_lockargs *arg)
729 WRITE64(arg->length); 759 WRITE64(arg->length);
730 WRITE32(opargs->new_lock_owner); 760 WRITE32(opargs->new_lock_owner);
731 if (opargs->new_lock_owner){ 761 if (opargs->new_lock_owner){
732 struct nfs_open_to_lock *ol = opargs->u.open_lock;
733
734 RESERVE_SPACE(40); 762 RESERVE_SPACE(40);
735 WRITE32(ol->open_seqid); 763 WRITE32(opargs->open_seqid->sequence->counter);
736 WRITEMEM(&ol->open_stateid, sizeof(ol->open_stateid)); 764 WRITEMEM(opargs->open_stateid->data, sizeof(opargs->open_stateid->data));
737 WRITE32(ol->lock_seqid); 765 WRITE32(opargs->lock_seqid->sequence->counter);
738 WRITE64(ol->lock_owner.clientid); 766 WRITE64(opargs->lock_owner.clientid);
739 WRITE32(4); 767 WRITE32(4);
740 WRITE32(ol->lock_owner.id); 768 WRITE32(opargs->lock_owner.id);
741 } 769 }
742 else { 770 else {
743 struct nfs_exist_lock *el = opargs->u.exist_lock;
744
745 RESERVE_SPACE(20); 771 RESERVE_SPACE(20);
746 WRITEMEM(&el->stateid, sizeof(el->stateid)); 772 WRITEMEM(opargs->lock_stateid->data, sizeof(opargs->lock_stateid->data));
747 WRITE32(el->seqid); 773 WRITE32(opargs->lock_seqid->sequence->counter);
748 } 774 }
749 775
750 return 0; 776 return 0;
@@ -775,8 +801,8 @@ static int encode_locku(struct xdr_stream *xdr, const struct nfs_lockargs *arg)
775 RESERVE_SPACE(44); 801 RESERVE_SPACE(44);
776 WRITE32(OP_LOCKU); 802 WRITE32(OP_LOCKU);
777 WRITE32(arg->type); 803 WRITE32(arg->type);
778 WRITE32(opargs->seqid); 804 WRITE32(opargs->seqid->sequence->counter);
779 WRITEMEM(&opargs->stateid, sizeof(opargs->stateid)); 805 WRITEMEM(opargs->stateid->data, sizeof(opargs->stateid->data));
780 WRITE64(arg->offset); 806 WRITE64(arg->offset);
781 WRITE64(arg->length); 807 WRITE64(arg->length);
782 808
@@ -826,7 +852,7 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena
826 */ 852 */
827 RESERVE_SPACE(8); 853 RESERVE_SPACE(8);
828 WRITE32(OP_OPEN); 854 WRITE32(OP_OPEN);
829 WRITE32(arg->seqid); 855 WRITE32(arg->seqid->sequence->counter);
830 encode_share_access(xdr, arg->open_flags); 856 encode_share_access(xdr, arg->open_flags);
831 RESERVE_SPACE(16); 857 RESERVE_SPACE(16);
832 WRITE64(arg->clientid); 858 WRITE64(arg->clientid);
@@ -941,7 +967,7 @@ static int encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_con
941 RESERVE_SPACE(8+sizeof(arg->stateid.data)); 967 RESERVE_SPACE(8+sizeof(arg->stateid.data));
942 WRITE32(OP_OPEN_CONFIRM); 968 WRITE32(OP_OPEN_CONFIRM);
943 WRITEMEM(arg->stateid.data, sizeof(arg->stateid.data)); 969 WRITEMEM(arg->stateid.data, sizeof(arg->stateid.data));
944 WRITE32(arg->seqid); 970 WRITE32(arg->seqid->sequence->counter);
945 971
946 return 0; 972 return 0;
947} 973}
@@ -950,10 +976,10 @@ static int encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_closea
950{ 976{
951 uint32_t *p; 977 uint32_t *p;
952 978
953 RESERVE_SPACE(8+sizeof(arg->stateid.data)); 979 RESERVE_SPACE(8+sizeof(arg->stateid->data));
954 WRITE32(OP_OPEN_DOWNGRADE); 980 WRITE32(OP_OPEN_DOWNGRADE);
955 WRITEMEM(arg->stateid.data, sizeof(arg->stateid.data)); 981 WRITEMEM(arg->stateid->data, sizeof(arg->stateid->data));
956 WRITE32(arg->seqid); 982 WRITE32(arg->seqid->sequence->counter);
957 encode_share_access(xdr, arg->open_flags); 983 encode_share_access(xdr, arg->open_flags);
958 return 0; 984 return 0;
958 return 0; 984 return 0;
959} 985}
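The RESERVE_SPACE()/WRITE32()/WRITEMEM() idiom used by every encoder in this file is standard XDR: integers go out big-endian and opaque data pads to a 4-byte boundary. A freestanding sketch of those primitives (simplified; the real macros also declare the cursor variable p and cope with multi-page send buffers):

    #include <arpa/inet.h>  /* htonl */
    #include <stdint.h>
    #include <string.h>

    struct xdr_cursor {
        uint8_t *p;         /* write position */
        uint8_t *end;       /* end of the send buffer */
    };

    /* RESERVE_SPACE: claim nbytes; the kernel macro BUG()s on overflow. */
    static uint8_t *reserve_space(struct xdr_cursor *xdr, size_t nbytes)
    {
        uint8_t *p = xdr->p;

        if (p + nbytes > xdr->end)
            return NULL;
        xdr->p += nbytes;
        return p;
    }

    /* WRITE32: XDR integers are big-endian, 4 bytes each. */
    static void write32(uint8_t **p, uint32_t val)
    {
        uint32_t be = htonl(val);

        memcpy(*p, &be, 4);
        *p += 4;
    }

    /* WRITEMEM: opaque data, padded up to a 4-byte boundary. */
    static void writemem(uint8_t **p, const void *src, size_t len)
    {
        memcpy(*p, src, len);
        *p += (len + 3) & ~(size_t)3;
    }

encode_locku(), for example, reserves exactly 44 bytes: a 4-byte op number, 4-byte lock type, 4-byte seqid counter, 16-byte stateid, and two 8-byte offsets.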
@@ -1117,6 +1143,17 @@ static int encode_renew(struct xdr_stream *xdr, const struct nfs4_client *client
1117} 1143}
1118 1144
1119static int 1145static int
1146encode_restorefh(struct xdr_stream *xdr)
1147{
1148 uint32_t *p;
1149
1150 RESERVE_SPACE(4);
1151 WRITE32(OP_RESTOREFH);
1152
1153 return 0;
1154}
1155
1156static int
1120encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg) 1157encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg)
1121{ 1158{
1122 uint32_t *p; 1159 uint32_t *p;
@@ -1296,14 +1333,18 @@ static int nfs4_xdr_enc_remove(struct rpc_rqst *req, uint32_t *p, const struct n
1296{ 1333{
1297 struct xdr_stream xdr; 1334 struct xdr_stream xdr;
1298 struct compound_hdr hdr = { 1335 struct compound_hdr hdr = {
1299 .nops = 2, 1336 .nops = 3,
1300 }; 1337 };
1301 int status; 1338 int status;
1302 1339
1303 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1340 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1304 encode_compound_hdr(&xdr, &hdr); 1341 encode_compound_hdr(&xdr, &hdr);
1305 if ((status = encode_putfh(&xdr, args->fh)) == 0) 1342 if ((status = encode_putfh(&xdr, args->fh)) != 0)
1306 status = encode_remove(&xdr, args->name); 1343 goto out;
1344 if ((status = encode_remove(&xdr, args->name)) != 0)
1345 goto out;
1346 status = encode_getfattr(&xdr, args->bitmask);
1347out:
1307 return status; 1348 return status;
1308} 1349}
1309 1350
@@ -1314,7 +1355,7 @@ static int nfs4_xdr_enc_rename(struct rpc_rqst *req, uint32_t *p, const struct n
1314{ 1355{
1315 struct xdr_stream xdr; 1356 struct xdr_stream xdr;
1316 struct compound_hdr hdr = { 1357 struct compound_hdr hdr = {
1317 .nops = 4, 1358 .nops = 7,
1318 }; 1359 };
1319 int status; 1360 int status;
1320 1361
@@ -1326,7 +1367,13 @@ static int nfs4_xdr_enc_rename(struct rpc_rqst *req, uint32_t *p, const struct n
1326 goto out; 1367 goto out;
1327 if ((status = encode_putfh(&xdr, args->new_dir)) != 0) 1368 if ((status = encode_putfh(&xdr, args->new_dir)) != 0)
1328 goto out; 1369 goto out;
1329 status = encode_rename(&xdr, args->old_name, args->new_name); 1370 if ((status = encode_rename(&xdr, args->old_name, args->new_name)) != 0)
1371 goto out;
1372 if ((status = encode_getfattr(&xdr, args->bitmask)) != 0)
1373 goto out;
1374 if ((status = encode_restorefh(&xdr)) != 0)
1375 goto out;
1376 status = encode_getfattr(&xdr, args->bitmask);
1330out: 1377out:
1331 return status; 1378 return status;
1332} 1379}
@@ -1338,7 +1385,7 @@ static int nfs4_xdr_enc_link(struct rpc_rqst *req, uint32_t *p, const struct nfs
1338{ 1385{
1339 struct xdr_stream xdr; 1386 struct xdr_stream xdr;
1340 struct compound_hdr hdr = { 1387 struct compound_hdr hdr = {
1341 .nops = 4, 1388 .nops = 7,
1342 }; 1389 };
1343 int status; 1390 int status;
1344 1391
@@ -1350,7 +1397,13 @@ static int nfs4_xdr_enc_link(struct rpc_rqst *req, uint32_t *p, const struct nfs
1350 goto out; 1397 goto out;
1351 if ((status = encode_putfh(&xdr, args->dir_fh)) != 0) 1398 if ((status = encode_putfh(&xdr, args->dir_fh)) != 0)
1352 goto out; 1399 goto out;
1353 status = encode_link(&xdr, args->name); 1400 if ((status = encode_link(&xdr, args->name)) != 0)
1401 goto out;
1402 if ((status = encode_getfattr(&xdr, args->bitmask)) != 0)
1403 goto out;
1404 if ((status = encode_restorefh(&xdr)) != 0)
1405 goto out;
1406 status = encode_getfattr(&xdr, args->bitmask);
1354out: 1407out:
1355 return status; 1408 return status;
1356} 1409}
@@ -1362,7 +1415,7 @@ static int nfs4_xdr_enc_create(struct rpc_rqst *req, uint32_t *p, const struct n
1362{ 1415{
1363 struct xdr_stream xdr; 1416 struct xdr_stream xdr;
1364 struct compound_hdr hdr = { 1417 struct compound_hdr hdr = {
1365 .nops = 4, 1418 .nops = 7,
1366 }; 1419 };
1367 int status; 1420 int status;
1368 1421
@@ -1370,10 +1423,16 @@ static int nfs4_xdr_enc_create(struct rpc_rqst *req, uint32_t *p, const struct n
1370 encode_compound_hdr(&xdr, &hdr); 1423 encode_compound_hdr(&xdr, &hdr);
1371 if ((status = encode_putfh(&xdr, args->dir_fh)) != 0) 1424 if ((status = encode_putfh(&xdr, args->dir_fh)) != 0)
1372 goto out; 1425 goto out;
1426 if ((status = encode_savefh(&xdr)) != 0)
1427 goto out;
1373 if ((status = encode_create(&xdr, args)) != 0) 1428 if ((status = encode_create(&xdr, args)) != 0)
1374 goto out; 1429 goto out;
1375 if ((status = encode_getfh(&xdr)) != 0) 1430 if ((status = encode_getfh(&xdr)) != 0)
1376 goto out; 1431 goto out;
1432 if ((status = encode_getfattr(&xdr, args->bitmask)) != 0)
1433 goto out;
1434 if ((status = encode_restorefh(&xdr)) != 0)
1435 goto out;
1377 status = encode_getfattr(&xdr, args->bitmask); 1436 status = encode_getfattr(&xdr, args->bitmask);
1378out: 1437out:
1379 return status; 1438 return status;
@@ -1412,7 +1471,7 @@ static int nfs4_xdr_enc_close(struct rpc_rqst *req, uint32_t *p, struct nfs_clos
1412{ 1471{
1413 struct xdr_stream xdr; 1472 struct xdr_stream xdr;
1414 struct compound_hdr hdr = { 1473 struct compound_hdr hdr = {
1415 .nops = 2, 1474 .nops = 3,
1416 }; 1475 };
1417 int status; 1476 int status;
1418 1477
@@ -1422,6 +1481,9 @@ static int nfs4_xdr_enc_close(struct rpc_rqst *req, uint32_t *p, struct nfs_clos
1422 if(status) 1481 if(status)
1423 goto out; 1482 goto out;
1424 status = encode_close(&xdr, args); 1483 status = encode_close(&xdr, args);
1484 if (status != 0)
1485 goto out;
1486 status = encode_getfattr(&xdr, args->bitmask);
1425out: 1487out:
1426 return status; 1488 return status;
1427} 1489}
@@ -1433,15 +1495,21 @@ static int nfs4_xdr_enc_open(struct rpc_rqst *req, uint32_t *p, struct nfs_opena
1433{ 1495{
1434 struct xdr_stream xdr; 1496 struct xdr_stream xdr;
1435 struct compound_hdr hdr = { 1497 struct compound_hdr hdr = {
1436 .nops = 4, 1498 .nops = 7,
1437 }; 1499 };
1438 int status; 1500 int status;
1439 1501
1502 status = nfs_wait_on_sequence(args->seqid, req->rq_task);
1503 if (status != 0)
1504 goto out;
1440 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1505 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1441 encode_compound_hdr(&xdr, &hdr); 1506 encode_compound_hdr(&xdr, &hdr);
1442 status = encode_putfh(&xdr, args->fh); 1507 status = encode_putfh(&xdr, args->fh);
1443 if (status) 1508 if (status)
1444 goto out; 1509 goto out;
1510 status = encode_savefh(&xdr);
1511 if (status)
1512 goto out;
1445 status = encode_open(&xdr, args); 1513 status = encode_open(&xdr, args);
1446 if (status) 1514 if (status)
1447 goto out; 1515 goto out;
@@ -1449,6 +1517,12 @@ static int nfs4_xdr_enc_open(struct rpc_rqst *req, uint32_t *p, struct nfs_opena
1449 if (status) 1517 if (status)
1450 goto out; 1518 goto out;
1451 status = encode_getfattr(&xdr, args->bitmask); 1519 status = encode_getfattr(&xdr, args->bitmask);
1520 if (status)
1521 goto out;
1522 status = encode_restorefh(&xdr);
1523 if (status)
1524 goto out;
1525 status = encode_getfattr(&xdr, args->bitmask);
1452out: 1526out:
1453 return status; 1527 return status;
1454} 1528}
@@ -1464,6 +1538,9 @@ static int nfs4_xdr_enc_open_confirm(struct rpc_rqst *req, uint32_t *p, struct n
1464 }; 1538 };
1465 int status; 1539 int status;
1466 1540
1541 status = nfs_wait_on_sequence(args->seqid, req->rq_task);
1542 if (status != 0)
1543 goto out;
1467 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1544 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1468 encode_compound_hdr(&xdr, &hdr); 1545 encode_compound_hdr(&xdr, &hdr);
1469 status = encode_putfh(&xdr, args->fh); 1546 status = encode_putfh(&xdr, args->fh);
@@ -1485,6 +1562,9 @@ static int nfs4_xdr_enc_open_noattr(struct rpc_rqst *req, uint32_t *p, struct nf
1485 }; 1562 };
1486 int status; 1563 int status;
1487 1564
1565 status = nfs_wait_on_sequence(args->seqid, req->rq_task);
1566 if (status != 0)
1567 goto out;
1488 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1568 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1489 encode_compound_hdr(&xdr, &hdr); 1569 encode_compound_hdr(&xdr, &hdr);
1490 status = encode_putfh(&xdr, args->fh); 1570 status = encode_putfh(&xdr, args->fh);
@@ -1502,7 +1582,7 @@ static int nfs4_xdr_enc_open_downgrade(struct rpc_rqst *req, uint32_t *p, struct
1502{ 1582{
1503 struct xdr_stream xdr; 1583 struct xdr_stream xdr;
1504 struct compound_hdr hdr = { 1584 struct compound_hdr hdr = {
1505 .nops = 2, 1585 .nops = 3,
1506 }; 1586 };
1507 int status; 1587 int status;
1508 1588
@@ -1512,6 +1592,9 @@ static int nfs4_xdr_enc_open_downgrade(struct rpc_rqst *req, uint32_t *p, struct
1512 if (status) 1592 if (status)
1513 goto out; 1593 goto out;
1514 status = encode_open_downgrade(&xdr, args); 1594 status = encode_open_downgrade(&xdr, args);
1595 if (status != 0)
1596 goto out;
1597 status = encode_getfattr(&xdr, args->bitmask);
1515out: 1598out:
1516 return status; 1599 return status;
1517} 1600}
@@ -1525,8 +1608,15 @@ static int nfs4_xdr_enc_lock(struct rpc_rqst *req, uint32_t *p, struct nfs_locka
1525 struct compound_hdr hdr = { 1608 struct compound_hdr hdr = {
1526 .nops = 2, 1609 .nops = 2,
1527 }; 1610 };
1611 struct nfs_lock_opargs *opargs = args->u.lock;
1528 int status; 1612 int status;
1529 1613
1614 status = nfs_wait_on_sequence(opargs->lock_seqid, req->rq_task);
1615 if (status != 0)
1616 goto out;
1617 /* Do we need to do an open_to_lock_owner? */
1618 if (opargs->lock_seqid->sequence->flags & NFS_SEQID_CONFIRMED)
1619 opargs->new_lock_owner = 0;
1530 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1620 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1531 encode_compound_hdr(&xdr, &hdr); 1621 encode_compound_hdr(&xdr, &hdr);
1532 status = encode_putfh(&xdr, args->fh); 1622 status = encode_putfh(&xdr, args->fh);
@@ -1713,7 +1803,7 @@ static int nfs4_xdr_enc_write(struct rpc_rqst *req, uint32_t *p, struct nfs_writ
1713{ 1803{
1714 struct xdr_stream xdr; 1804 struct xdr_stream xdr;
1715 struct compound_hdr hdr = { 1805 struct compound_hdr hdr = {
1716 .nops = 2, 1806 .nops = 3,
1717 }; 1807 };
1718 int status; 1808 int status;
1719 1809
@@ -1723,6 +1813,9 @@ static int nfs4_xdr_enc_write(struct rpc_rqst *req, uint32_t *p, struct nfs_writ
1723 if (status) 1813 if (status)
1724 goto out; 1814 goto out;
1725 status = encode_write(&xdr, args); 1815 status = encode_write(&xdr, args);
1816 if (status)
1817 goto out;
1818 status = encode_getfattr(&xdr, args->bitmask);
1726out: 1819out:
1727 return status; 1820 return status;
1728} 1821}
@@ -1734,7 +1827,7 @@ static int nfs4_xdr_enc_commit(struct rpc_rqst *req, uint32_t *p, struct nfs_wri
1734{ 1827{
1735 struct xdr_stream xdr; 1828 struct xdr_stream xdr;
1736 struct compound_hdr hdr = { 1829 struct compound_hdr hdr = {
1737 .nops = 2, 1830 .nops = 3,
1738 }; 1831 };
1739 int status; 1832 int status;
1740 1833
@@ -1744,6 +1837,9 @@ static int nfs4_xdr_enc_commit(struct rpc_rqst *req, uint32_t *p, struct nfs_wri
1744 if (status) 1837 if (status)
1745 goto out; 1838 goto out;
1746 status = encode_commit(&xdr, args); 1839 status = encode_commit(&xdr, args);
1840 if (status)
1841 goto out;
1842 status = encode_getfattr(&xdr, args->bitmask);
1747out: 1843out:
1748 return status; 1844 return status;
1749} 1845}
@@ -2670,8 +2766,7 @@ static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_re
2670 goto xdr_error; 2766 goto xdr_error;
2671 status = verify_attr_len(xdr, savep, attrlen); 2767 status = verify_attr_len(xdr, savep, attrlen);
2672xdr_error: 2768xdr_error:
2673 if (status != 0) 2769 dprintk("%s: xdr returned %d!\n", __FUNCTION__, -status);
2674 printk(KERN_NOTICE "%s: xdr error %d!\n", __FUNCTION__, -status);
2675 return status; 2770 return status;
2676} 2771}
2677 2772
@@ -2704,8 +2799,7 @@ static int decode_statfs(struct xdr_stream *xdr, struct nfs_fsstat *fsstat)
2704 2799
2705 status = verify_attr_len(xdr, savep, attrlen); 2800 status = verify_attr_len(xdr, savep, attrlen);
2706xdr_error: 2801xdr_error:
2707 if (status != 0) 2802 dprintk("%s: xdr returned %d!\n", __FUNCTION__, -status);
2708 printk(KERN_NOTICE "%s: xdr error %d!\n", __FUNCTION__, -status);
2709 return status; 2803 return status;
2710} 2804}
2711 2805
@@ -2730,8 +2824,7 @@ static int decode_pathconf(struct xdr_stream *xdr, struct nfs_pathconf *pathconf
2730 2824
2731 status = verify_attr_len(xdr, savep, attrlen); 2825 status = verify_attr_len(xdr, savep, attrlen);
2732xdr_error: 2826xdr_error:
2733 if (status != 0) 2827 dprintk("%s: xdr returned %d!\n", __FUNCTION__, -status);
2734 printk(KERN_NOTICE "%s: xdr error %d!\n", __FUNCTION__, -status);
2735 return status; 2828 return status;
2736} 2829}
2737 2830
@@ -2787,13 +2880,10 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons
2787 goto xdr_error; 2880 goto xdr_error;
2788 if ((status = decode_attr_time_modify(xdr, bitmap, &fattr->mtime)) != 0) 2881 if ((status = decode_attr_time_modify(xdr, bitmap, &fattr->mtime)) != 0)
2789 goto xdr_error; 2882 goto xdr_error;
2790 if ((status = verify_attr_len(xdr, savep, attrlen)) == 0) { 2883 if ((status = verify_attr_len(xdr, savep, attrlen)) == 0)
2791 fattr->valid = NFS_ATTR_FATTR | NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4; 2884 fattr->valid = NFS_ATTR_FATTR | NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4;
2792 fattr->timestamp = jiffies;
2793 }
2794xdr_error: 2885xdr_error:
2795 if (status != 0) 2886 dprintk("%s: xdr returned %d\n", __FUNCTION__, -status);
2796 printk(KERN_NOTICE "%s: xdr error %d!\n", __FUNCTION__, -status);
2797 return status; 2887 return status;
2798} 2888}
2799 2889
@@ -2826,8 +2916,7 @@ static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
2826 2916
2827 status = verify_attr_len(xdr, savep, attrlen); 2917 status = verify_attr_len(xdr, savep, attrlen);
2828xdr_error: 2918xdr_error:
2829 if (status != 0) 2919 dprintk("%s: xdr returned %d!\n", __FUNCTION__, -status);
2830 printk(KERN_NOTICE "%s: xdr error %d!\n", __FUNCTION__, -status);
2831 return status; 2920 return status;
2832} 2921}
2833 2922
@@ -2890,8 +2979,8 @@ static int decode_lock(struct xdr_stream *xdr, struct nfs_lockres *res)
2890 2979
2891 status = decode_op_hdr(xdr, OP_LOCK); 2980 status = decode_op_hdr(xdr, OP_LOCK);
2892 if (status == 0) { 2981 if (status == 0) {
2893 READ_BUF(sizeof(nfs4_stateid)); 2982 READ_BUF(sizeof(res->u.stateid.data));
2894 COPYMEM(&res->u.stateid, sizeof(res->u.stateid)); 2983 COPYMEM(res->u.stateid.data, sizeof(res->u.stateid.data));
2895 } else if (status == -NFS4ERR_DENIED) 2984 } else if (status == -NFS4ERR_DENIED)
2896 return decode_lock_denied(xdr, &res->u.denied); 2985 return decode_lock_denied(xdr, &res->u.denied);
2897 return status; 2986 return status;
@@ -2913,8 +3002,8 @@ static int decode_locku(struct xdr_stream *xdr, struct nfs_lockres *res)
2913 3002
2914 status = decode_op_hdr(xdr, OP_LOCKU); 3003 status = decode_op_hdr(xdr, OP_LOCKU);
2915 if (status == 0) { 3004 if (status == 0) {
2916 READ_BUF(sizeof(nfs4_stateid)); 3005 READ_BUF(sizeof(res->u.stateid.data));
2917 COPYMEM(&res->u.stateid, sizeof(res->u.stateid)); 3006 COPYMEM(res->u.stateid.data, sizeof(res->u.stateid.data));
2918 } 3007 }
2919 return status; 3008 return status;
2920} 3009}
@@ -2994,7 +3083,7 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
2994 p += bmlen; 3083 p += bmlen;
2995 return decode_delegation(xdr, res); 3084 return decode_delegation(xdr, res);
2996xdr_error: 3085xdr_error:
2997 printk(KERN_NOTICE "%s: xdr error!\n", __FUNCTION__); 3086 dprintk("%s: Bitmap too large! Length = %u\n", __FUNCTION__, bmlen);
2998 return -EIO; 3087 return -EIO;
2999} 3088}
3000 3089
@@ -3208,6 +3297,12 @@ static int decode_renew(struct xdr_stream *xdr)
3208 return decode_op_hdr(xdr, OP_RENEW); 3297 return decode_op_hdr(xdr, OP_RENEW);
3209} 3298}
3210 3299
3300static int
3301decode_restorefh(struct xdr_stream *xdr)
3302{
3303 return decode_op_hdr(xdr, OP_RESTOREFH);
3304}
3305
3211static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req, 3306static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req,
3212 size_t *acl_len) 3307 size_t *acl_len)
3213{ 3308{
@@ -3243,7 +3338,8 @@ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req,
3243 if (attrlen <= *acl_len) 3338 if (attrlen <= *acl_len)
3244 xdr_read_pages(xdr, attrlen); 3339 xdr_read_pages(xdr, attrlen);
3245 *acl_len = attrlen; 3340 *acl_len = attrlen;
3246 } 3341 } else
3342 status = -EOPNOTSUPP;
3247 3343
3248out: 3344out:
3249 return status; 3345 return status;
@@ -3352,6 +3448,9 @@ static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, uint32_t *p, stru
3352 if (status) 3448 if (status)
3353 goto out; 3449 goto out;
3354 status = decode_open_downgrade(&xdr, res); 3450 status = decode_open_downgrade(&xdr, res);
3451 if (status != 0)
3452 goto out;
3453 decode_getfattr(&xdr, res->fattr, res->server);
3355out: 3454out:
3356 return status; 3455 return status;
3357} 3456}
@@ -3424,7 +3523,7 @@ out:
3424/* 3523/*
3425 * Decode REMOVE response 3524 * Decode REMOVE response
3426 */ 3525 */
3427static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_change_info *cinfo) 3526static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_remove_res *res)
3428{ 3527{
3429 struct xdr_stream xdr; 3528 struct xdr_stream xdr;
3430 struct compound_hdr hdr; 3529 struct compound_hdr hdr;
@@ -3433,8 +3532,11 @@ static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_
3433 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 3532 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
3434 if ((status = decode_compound_hdr(&xdr, &hdr)) != 0) 3533 if ((status = decode_compound_hdr(&xdr, &hdr)) != 0)
3435 goto out; 3534 goto out;
3436 if ((status = decode_putfh(&xdr)) == 0) 3535 if ((status = decode_putfh(&xdr)) != 0)
3437 status = decode_remove(&xdr, cinfo); 3536 goto out;
3537 if ((status = decode_remove(&xdr, &res->cinfo)) != 0)
3538 goto out;
3539 decode_getfattr(&xdr, res->dir_attr, res->server);
3438out: 3540out:
3439 return status; 3541 return status;
3440} 3542}
@@ -3457,7 +3559,14 @@ static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_
3457 goto out; 3559 goto out;
3458 if ((status = decode_putfh(&xdr)) != 0) 3560 if ((status = decode_putfh(&xdr)) != 0)
3459 goto out; 3561 goto out;
3460 status = decode_rename(&xdr, &res->old_cinfo, &res->new_cinfo); 3562 if ((status = decode_rename(&xdr, &res->old_cinfo, &res->new_cinfo)) != 0)
3563 goto out;
3564 /* Current FH is target directory */
3565 if (decode_getfattr(&xdr, res->new_fattr, res->server) != 0)
3566 goto out;
3567 if ((status = decode_restorefh(&xdr)) != 0)
3568 goto out;
3569 decode_getfattr(&xdr, res->old_fattr, res->server);
3461out: 3570out:
3462 return status; 3571 return status;
3463} 3572}
@@ -3465,7 +3574,7 @@ out:
3465/* 3574/*
3466 * Decode LINK response 3575 * Decode LINK response
3467 */ 3576 */
3468static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_change_info *cinfo) 3577static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_link_res *res)
3469{ 3578{
3470 struct xdr_stream xdr; 3579 struct xdr_stream xdr;
3471 struct compound_hdr hdr; 3580 struct compound_hdr hdr;
@@ -3480,7 +3589,17 @@ static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_ch
3480 goto out; 3589 goto out;
3481 if ((status = decode_putfh(&xdr)) != 0) 3590 if ((status = decode_putfh(&xdr)) != 0)
3482 goto out; 3591 goto out;
3483 status = decode_link(&xdr, cinfo); 3592 if ((status = decode_link(&xdr, &res->cinfo)) != 0)
3593 goto out;
3594 /*
3595 * Note order: OP_LINK leaves the directory as the current
3596 * filehandle.
3597 */
3598 if (decode_getfattr(&xdr, res->dir_attr, res->server) != 0)
3599 goto out;
3600 if ((status = decode_restorefh(&xdr)) != 0)
3601 goto out;
3602 decode_getfattr(&xdr, res->fattr, res->server);
3484out: 3603out:
3485 return status; 3604 return status;
3486} 3605}
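The SAVEFH/RESTOREFH choreography in these compounds follows the NFSv4 current-filehandle model (RFC 3530): most ops act on the "current" filehandle, SAVEFH copies it aside, and RESTOREFH brings it back. Modeling server-side evaluation of the LINK compound above (a sketch; nfs_fh left opaque):

    struct nfs_fh;                      /* opaque filehandle */

    struct compound_state {
        struct nfs_fh *current_fh;      /* acted on by most ops */
        struct nfs_fh *saved_fh;        /* SAVEFH writes, RESTOREFH reads */
    };

    static void op_savefh(struct compound_state *cs)
    {
        cs->saved_fh = cs->current_fh;
    }

    static void op_restorefh(struct compound_state *cs)
    {
        cs->current_fh = cs->saved_fh;
    }

    /*
     * The LINK compound, step by step:
     *   PUTFH(file)  -> current = file
     *   SAVEFH       -> saved   = file
     *   PUTFH(dir)   -> current = dir
     *   LINK(name)   -> links saved into current; current stays = dir
     *   GETATTR      -> directory attributes (res->dir_attr above)
     *   RESTOREFH    -> current = file again
     *   GETATTR      -> file attributes (res->fattr above)
     */

That is why the first GETATTR here yields the directory's post-op attributes and the one after RESTOREFH yields the file's, exactly the decode order nfs4_xdr_dec_link() expects.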
@@ -3499,13 +3618,17 @@ static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_
3499 goto out; 3618 goto out;
3500 if ((status = decode_putfh(&xdr)) != 0) 3619 if ((status = decode_putfh(&xdr)) != 0)
3501 goto out; 3620 goto out;
3621 if ((status = decode_savefh(&xdr)) != 0)
3622 goto out;
3502 if ((status = decode_create(&xdr,&res->dir_cinfo)) != 0) 3623 if ((status = decode_create(&xdr,&res->dir_cinfo)) != 0)
3503 goto out; 3624 goto out;
3504 if ((status = decode_getfh(&xdr, res->fh)) != 0) 3625 if ((status = decode_getfh(&xdr, res->fh)) != 0)
3505 goto out; 3626 goto out;
3506 status = decode_getfattr(&xdr, res->fattr, res->server); 3627 if (decode_getfattr(&xdr, res->fattr, res->server) != 0)
3507 if (status == NFS4ERR_DELAY) 3628 goto out;
3508 status = 0; 3629 if ((status = decode_restorefh(&xdr)) != 0)
3630 goto out;
3631 decode_getfattr(&xdr, res->dir_fattr, res->server);
3509out: 3632out:
3510 return status; 3633 return status;
3511} 3634}
@@ -3623,6 +3746,15 @@ static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_cl
3623 if (status) 3746 if (status)
3624 goto out; 3747 goto out;
3625 status = decode_close(&xdr, res); 3748 status = decode_close(&xdr, res);
3749 if (status != 0)
3750 goto out;
3751 /*
3752 * Note: Server may do delete on close for this file
3753 * in which case the getattr call will fail with
3754 * an ESTALE error. Shouldn't be a problem,
3755 * though, since fattr->valid will remain unset.
3756 */
3757 decode_getfattr(&xdr, res->fattr, res->server);
3626out: 3758out:
3627 return status; 3759 return status;
3628} 3760}
@@ -3643,15 +3775,20 @@ static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_ope
3643 status = decode_putfh(&xdr); 3775 status = decode_putfh(&xdr);
3644 if (status) 3776 if (status)
3645 goto out; 3777 goto out;
3778 status = decode_savefh(&xdr);
3779 if (status)
3780 goto out;
3646 status = decode_open(&xdr, res); 3781 status = decode_open(&xdr, res);
3647 if (status) 3782 if (status)
3648 goto out; 3783 goto out;
3649 status = decode_getfh(&xdr, &res->fh); 3784 status = decode_getfh(&xdr, &res->fh);
3650 if (status) 3785 if (status)
3651 goto out; 3786 goto out;
3652 status = decode_getfattr(&xdr, res->f_attr, res->server); 3787 if (decode_getfattr(&xdr, res->f_attr, res->server) != 0)
3653 if (status == NFS4ERR_DELAY) 3788 goto out;
3654 status = 0; 3789 if ((status = decode_restorefh(&xdr)) != 0)
3790 goto out;
3791 decode_getfattr(&xdr, res->dir_attr, res->server);
3655out: 3792out:
3656 return status; 3793 return status;
3657} 3794}
@@ -3869,6 +4006,9 @@ static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_wr
3869 if (status) 4006 if (status)
3870 goto out; 4007 goto out;
3871 status = decode_write(&xdr, res); 4008 status = decode_write(&xdr, res);
4009 if (status)
4010 goto out;
4011 decode_getfattr(&xdr, res->fattr, res->server);
3872 if (!status) 4012 if (!status)
3873 status = res->count; 4013 status = res->count;
3874out: 4014out:
@@ -3892,6 +4032,9 @@ static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_w
3892 if (status) 4032 if (status)
3893 goto out; 4033 goto out;
3894 status = decode_commit(&xdr, res); 4034 status = decode_commit(&xdr, res);
4035 if (status)
4036 goto out;
4037 decode_getfattr(&xdr, res->fattr, res->server);
3895out: 4038out:
3896 return status; 4039 return status;
3897} 4040}
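One pattern shared by every decoder touched above: the trailing decode_getfattr() return value is deliberately ignored. If the post-op GETATTR failed, say with ESTALE after a delete-on-close as the comment in nfs4_xdr_dec_close() notes, fattr->valid simply stays unset and the primary op's status survives. Schematically (a sketch with invented names, not a function from the patch):

    struct xdr_stream;  /* opaque here */

    typedef int (*xdr_dec_fn)(struct xdr_stream *xdr, void *res);

    static int decode_op_then_postop_attr(struct xdr_stream *xdr,
                                          xdr_dec_fn op, void *op_res,
                                          xdr_dec_fn attr, void *fattr)
    {
        int status = op(xdr, op_res);

        if (status != 0)
            return status;      /* the operation's error wins */
        /*
         * Best effort: on failure fattr->valid stays unset and the
         * client revalidates the inode later, so this status is
         * dropped on purpose.
         */
        (void)attr(xdr, fattr);
        return 0;
    }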
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index be23c3fb9260..a48a003242c0 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -61,7 +61,7 @@ nfs_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
61 int status; 61 int status;
62 62
63 dprintk("%s: call getattr\n", __FUNCTION__); 63 dprintk("%s: call getattr\n", __FUNCTION__);
64 fattr->valid = 0; 64 nfs_fattr_init(fattr);
65 status = rpc_call(server->client_sys, NFSPROC_GETATTR, fhandle, fattr, 0); 65 status = rpc_call(server->client_sys, NFSPROC_GETATTR, fhandle, fattr, 0);
66 dprintk("%s: reply getattr: %d\n", __FUNCTION__, status); 66 dprintk("%s: reply getattr: %d\n", __FUNCTION__, status);
67 if (status) 67 if (status)
@@ -93,7 +93,7 @@ nfs_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
93 int status; 93 int status;
94 94
95 dprintk("NFS call getattr\n"); 95 dprintk("NFS call getattr\n");
96 fattr->valid = 0; 96 nfs_fattr_init(fattr);
97 status = rpc_call(server->client, NFSPROC_GETATTR, 97 status = rpc_call(server->client, NFSPROC_GETATTR,
98 fhandle, fattr, 0); 98 fhandle, fattr, 0);
99 dprintk("NFS reply getattr: %d\n", status); 99 dprintk("NFS reply getattr: %d\n", status);
@@ -112,7 +112,7 @@ nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
112 int status; 112 int status;
113 113
114 dprintk("NFS call setattr\n"); 114 dprintk("NFS call setattr\n");
115 fattr->valid = 0; 115 nfs_fattr_init(fattr);
116 status = rpc_call(NFS_CLIENT(inode), NFSPROC_SETATTR, &arg, fattr, 0); 116 status = rpc_call(NFS_CLIENT(inode), NFSPROC_SETATTR, &arg, fattr, 0);
117 if (status == 0) 117 if (status == 0)
118 nfs_setattr_update_inode(inode, sattr); 118 nfs_setattr_update_inode(inode, sattr);
@@ -136,7 +136,7 @@ nfs_proc_lookup(struct inode *dir, struct qstr *name,
136 int status; 136 int status;
137 137
138 dprintk("NFS call lookup %s\n", name->name); 138 dprintk("NFS call lookup %s\n", name->name);
139 fattr->valid = 0; 139 nfs_fattr_init(fattr);
140 status = rpc_call(NFS_CLIENT(dir), NFSPROC_LOOKUP, &arg, &res, 0); 140 status = rpc_call(NFS_CLIENT(dir), NFSPROC_LOOKUP, &arg, &res, 0);
141 dprintk("NFS reply lookup: %d\n", status); 141 dprintk("NFS reply lookup: %d\n", status);
142 return status; 142 return status;
@@ -174,7 +174,7 @@ static int nfs_proc_read(struct nfs_read_data *rdata)
174 174
175 dprintk("NFS call read %d @ %Ld\n", rdata->args.count, 175 dprintk("NFS call read %d @ %Ld\n", rdata->args.count,
176 (long long) rdata->args.offset); 176 (long long) rdata->args.offset);
177 fattr->valid = 0; 177 nfs_fattr_init(fattr);
178 status = rpc_call_sync(NFS_CLIENT(inode), &msg, flags); 178 status = rpc_call_sync(NFS_CLIENT(inode), &msg, flags);
179 if (status >= 0) { 179 if (status >= 0) {
180 nfs_refresh_inode(inode, fattr); 180 nfs_refresh_inode(inode, fattr);
@@ -203,10 +203,10 @@ static int nfs_proc_write(struct nfs_write_data *wdata)
203 203
204 dprintk("NFS call write %d @ %Ld\n", wdata->args.count, 204 dprintk("NFS call write %d @ %Ld\n", wdata->args.count,
205 (long long) wdata->args.offset); 205 (long long) wdata->args.offset);
206 fattr->valid = 0; 206 nfs_fattr_init(fattr);
207 status = rpc_call_sync(NFS_CLIENT(inode), &msg, flags); 207 status = rpc_call_sync(NFS_CLIENT(inode), &msg, flags);
208 if (status >= 0) { 208 if (status >= 0) {
209 nfs_refresh_inode(inode, fattr); 209 nfs_post_op_update_inode(inode, fattr);
210 wdata->res.count = wdata->args.count; 210 wdata->res.count = wdata->args.count;
211 wdata->verf.committed = NFS_FILE_SYNC; 211 wdata->verf.committed = NFS_FILE_SYNC;
212 } 212 }
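The switch from nfs_refresh_inode() to nfs_post_op_update_inode() here tracks a distinction assumed throughout this series (a sketch of the intent, not the definitions, which are outside this diff):

	nfs_refresh_inode(inode, fattr);        /* opportunistic: merge the
	                                         * attributes only if they are
	                                         * not older than the cache */
	nfs_post_op_update_inode(inode, fattr); /* attributes follow an
	                                         * operation we performed;
	                                         * apply them and flag the
	                                         * inode for revalidation */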
@@ -216,7 +216,7 @@ static int nfs_proc_write(struct nfs_write_data *wdata)
216 216
217static int 217static int
218nfs_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, 218nfs_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
219 int flags) 219 int flags, struct nameidata *nd)
220{ 220{
221 struct nfs_fh fhandle; 221 struct nfs_fh fhandle;
222 struct nfs_fattr fattr; 222 struct nfs_fattr fattr;
@@ -232,7 +232,7 @@ nfs_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
232 }; 232 };
233 int status; 233 int status;
234 234
235 fattr.valid = 0; 235 nfs_fattr_init(&fattr);
236 dprintk("NFS call create %s\n", dentry->d_name.name); 236 dprintk("NFS call create %s\n", dentry->d_name.name);
237 status = rpc_call(NFS_CLIENT(dir), NFSPROC_CREATE, &arg, &res, 0); 237 status = rpc_call(NFS_CLIENT(dir), NFSPROC_CREATE, &arg, &res, 0);
238 if (status == 0) 238 if (status == 0)
@@ -273,12 +273,13 @@ nfs_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
273 sattr->ia_size = new_encode_dev(rdev);/* get out your barf bag */ 273 sattr->ia_size = new_encode_dev(rdev);/* get out your barf bag */
274 } 274 }
275 275
276 fattr.valid = 0; 276 nfs_fattr_init(&fattr);
277 status = rpc_call(NFS_CLIENT(dir), NFSPROC_CREATE, &arg, &res, 0); 277 status = rpc_call(NFS_CLIENT(dir), NFSPROC_CREATE, &arg, &res, 0);
278 nfs_mark_for_revalidate(dir);
278 279
279 if (status == -EINVAL && S_ISFIFO(mode)) { 280 if (status == -EINVAL && S_ISFIFO(mode)) {
280 sattr->ia_mode = mode; 281 sattr->ia_mode = mode;
281 fattr.valid = 0; 282 nfs_fattr_init(&fattr);
282 status = rpc_call(NFS_CLIENT(dir), NFSPROC_CREATE, &arg, &res, 0); 283 status = rpc_call(NFS_CLIENT(dir), NFSPROC_CREATE, &arg, &res, 0);
283 } 284 }
284 if (status == 0) 285 if (status == 0)
@@ -305,6 +306,7 @@ nfs_proc_remove(struct inode *dir, struct qstr *name)
305 306
306 dprintk("NFS call remove %s\n", name->name); 307 dprintk("NFS call remove %s\n", name->name);
307 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 308 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
309 nfs_mark_for_revalidate(dir);
308 310
309 dprintk("NFS reply remove: %d\n", status); 311 dprintk("NFS reply remove: %d\n", status);
310 return status; 312 return status;
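nfs_mark_for_revalidate(), which the hunks in this file add after every directory-modifying RPC, is defined outside this excerpt. A plausible minimal definition, hedged (the exact NFS_INO_INVALID_* flag set is an assumption):

	static inline void nfs_mark_for_revalidate(struct inode *inode)
	{
		spin_lock(&inode->i_lock);
		NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATTR |
						NFS_INO_INVALID_DATA;
		spin_unlock(&inode->i_lock);
	}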
@@ -331,8 +333,10 @@ nfs_proc_unlink_done(struct dentry *dir, struct rpc_task *task)
331{ 333{
332 struct rpc_message *msg = &task->tk_msg; 334 struct rpc_message *msg = &task->tk_msg;
333 335
334 if (msg->rpc_argp) 336 if (msg->rpc_argp) {
337 nfs_mark_for_revalidate(dir->d_inode);
335 kfree(msg->rpc_argp); 338 kfree(msg->rpc_argp);
339 }
336 return 0; 340 return 0;
337} 341}
338 342
@@ -352,6 +356,8 @@ nfs_proc_rename(struct inode *old_dir, struct qstr *old_name,
352 356
353 dprintk("NFS call rename %s -> %s\n", old_name->name, new_name->name); 357 dprintk("NFS call rename %s -> %s\n", old_name->name, new_name->name);
354 status = rpc_call(NFS_CLIENT(old_dir), NFSPROC_RENAME, &arg, NULL, 0); 358 status = rpc_call(NFS_CLIENT(old_dir), NFSPROC_RENAME, &arg, NULL, 0);
359 nfs_mark_for_revalidate(old_dir);
360 nfs_mark_for_revalidate(new_dir);
355 dprintk("NFS reply rename: %d\n", status); 361 dprintk("NFS reply rename: %d\n", status);
356 return status; 362 return status;
357} 363}
@@ -369,6 +375,7 @@ nfs_proc_link(struct inode *inode, struct inode *dir, struct qstr *name)
369 375
370 dprintk("NFS call link %s\n", name->name); 376 dprintk("NFS call link %s\n", name->name);
371 status = rpc_call(NFS_CLIENT(inode), NFSPROC_LINK, &arg, NULL, 0); 377 status = rpc_call(NFS_CLIENT(inode), NFSPROC_LINK, &arg, NULL, 0);
378 nfs_mark_for_revalidate(dir);
372 dprintk("NFS reply link: %d\n", status); 379 dprintk("NFS reply link: %d\n", status);
373 return status; 380 return status;
374} 381}
@@ -391,9 +398,10 @@ nfs_proc_symlink(struct inode *dir, struct qstr *name, struct qstr *path,
391 if (path->len > NFS2_MAXPATHLEN) 398 if (path->len > NFS2_MAXPATHLEN)
392 return -ENAMETOOLONG; 399 return -ENAMETOOLONG;
393 dprintk("NFS call symlink %s -> %s\n", name->name, path->name); 400 dprintk("NFS call symlink %s -> %s\n", name->name, path->name);
394 fattr->valid = 0; 401 nfs_fattr_init(fattr);
395 fhandle->size = 0; 402 fhandle->size = 0;
396 status = rpc_call(NFS_CLIENT(dir), NFSPROC_SYMLINK, &arg, NULL, 0); 403 status = rpc_call(NFS_CLIENT(dir), NFSPROC_SYMLINK, &arg, NULL, 0);
404 nfs_mark_for_revalidate(dir);
397 dprintk("NFS reply symlink: %d\n", status); 405 dprintk("NFS reply symlink: %d\n", status);
398 return status; 406 return status;
399} 407}
@@ -416,8 +424,9 @@ nfs_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
416 int status; 424 int status;
417 425
418 dprintk("NFS call mkdir %s\n", dentry->d_name.name); 426 dprintk("NFS call mkdir %s\n", dentry->d_name.name);
419 fattr.valid = 0; 427 nfs_fattr_init(&fattr);
420 status = rpc_call(NFS_CLIENT(dir), NFSPROC_MKDIR, &arg, &res, 0); 428 status = rpc_call(NFS_CLIENT(dir), NFSPROC_MKDIR, &arg, &res, 0);
429 nfs_mark_for_revalidate(dir);
421 if (status == 0) 430 if (status == 0)
422 status = nfs_instantiate(dentry, &fhandle, &fattr); 431 status = nfs_instantiate(dentry, &fhandle, &fattr);
423 dprintk("NFS reply mkdir: %d\n", status); 432 dprintk("NFS reply mkdir: %d\n", status);
@@ -436,6 +445,7 @@ nfs_proc_rmdir(struct inode *dir, struct qstr *name)
436 445
437 dprintk("NFS call rmdir %s\n", name->name); 446 dprintk("NFS call rmdir %s\n", name->name);
438 status = rpc_call(NFS_CLIENT(dir), NFSPROC_RMDIR, &arg, NULL, 0); 447 status = rpc_call(NFS_CLIENT(dir), NFSPROC_RMDIR, &arg, NULL, 0);
448 nfs_mark_for_revalidate(dir);
439 dprintk("NFS reply rmdir: %d\n", status); 449 dprintk("NFS reply rmdir: %d\n", status);
440 return status; 450 return status;
441} 451}
@@ -484,7 +494,7 @@ nfs_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle,
484 int status; 494 int status;
485 495
486 dprintk("NFS call statfs\n"); 496 dprintk("NFS call statfs\n");
487 stat->fattr->valid = 0; 497 nfs_fattr_init(stat->fattr);
488 status = rpc_call(server->client, NFSPROC_STATFS, fhandle, &fsinfo, 0); 498 status = rpc_call(server->client, NFSPROC_STATFS, fhandle, &fsinfo, 0);
489 dprintk("NFS reply statfs: %d\n", status); 499 dprintk("NFS reply statfs: %d\n", status);
490 if (status) 500 if (status)
@@ -507,7 +517,7 @@ nfs_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle,
507 int status; 517 int status;
508 518
509 dprintk("NFS call fsinfo\n"); 519 dprintk("NFS call fsinfo\n");
510 info->fattr->valid = 0; 520 nfs_fattr_init(info->fattr);
511 status = rpc_call(server->client, NFSPROC_STATFS, fhandle, &fsinfo, 0); 521 status = rpc_call(server->client, NFSPROC_STATFS, fhandle, &fsinfo, 0);
512 dprintk("NFS reply fsinfo: %d\n", status); 522 dprintk("NFS reply fsinfo: %d\n", status);
513 if (status) 523 if (status)
@@ -579,7 +589,7 @@ nfs_write_done(struct rpc_task *task)
579 struct nfs_write_data *data = (struct nfs_write_data *) task->tk_calldata; 589 struct nfs_write_data *data = (struct nfs_write_data *) task->tk_calldata;
580 590
581 if (task->tk_status >= 0) 591 if (task->tk_status >= 0)
582 nfs_refresh_inode(data->inode, data->res.fattr); 592 nfs_post_op_update_inode(data->inode, data->res.fattr);
583 nfs_writeback_done(task); 593 nfs_writeback_done(task);
584} 594}
585 595
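Every call site above replaces the open-coded "fattr->valid = 0;" with nfs_fattr_init(). The helper lives in fs/nfs/inode.c, which is not part of this excerpt; a hedged sketch consistent with its use here (the time_start stamp is an assumption about the real helper):

	void nfs_fattr_init(struct nfs_fattr *fattr)
	{
		fattr->valid = 0;            /* nothing decoded yet */
		fattr->time_start = jiffies; /* assumed: stamp the request so
					      * attribute updates can be
					      * ordered against each other */
	}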
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 9758ebd49905..43b03b19731b 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -215,6 +215,7 @@ static void nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
215 data->res.fattr = &data->fattr; 215 data->res.fattr = &data->fattr;
216 data->res.count = count; 216 data->res.count = count;
217 data->res.eof = 0; 217 data->res.eof = 0;
218 nfs_fattr_init(&data->fattr);
218 219
219 NFS_PROTO(inode)->read_setup(data); 220 NFS_PROTO(inode)->read_setup(data);
220 221
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 5130eda231d7..819a65f5071f 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -870,6 +870,7 @@ static void nfs_write_rpcsetup(struct nfs_page *req,
870 data->res.fattr = &data->fattr; 870 data->res.fattr = &data->fattr;
871 data->res.count = count; 871 data->res.count = count;
872 data->res.verf = &data->verf; 872 data->res.verf = &data->verf;
873 nfs_fattr_init(&data->fattr);
873 874
874 NFS_PROTO(inode)->write_setup(data, how); 875 NFS_PROTO(inode)->write_setup(data, how);
875 876
@@ -1237,6 +1238,7 @@ static void nfs_commit_rpcsetup(struct list_head *head,
1237 data->res.count = 0; 1238 data->res.count = 0;
1238 data->res.fattr = &data->fattr; 1239 data->res.fattr = &data->fattr;
1239 data->res.verf = &data->verf; 1240 data->res.verf = &data->verf;
1241 nfs_fattr_init(&data->fattr);
1240 1242
1241 NFS_PROTO(inode)->commit_setup(data, how); 1243 NFS_PROTO(inode)->commit_setup(data, how);
1242 1244
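The rpcsetup hunks in read.c and write.c all initialize data->fattr before the RPC is issued. A hedged illustration of the failure mode this closes:

	nfs_fattr_init(&data->fattr);          /* valid = 0 before the call */
	status = rpc_call_sync(clnt, &msg, 0); /* server may omit post-op
	                                        * attributes entirely */
	/* Because valid was cleared up front, leftovers from a previous RPC
	 * in the reused fattr can no longer be mistaken for freshly decoded
	 * attributes by the inode-update paths. */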
diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog
index de58579a1d0e..50a7749cfca1 100644
--- a/fs/ntfs/ChangeLog
+++ b/fs/ntfs/ChangeLog
@@ -1,18 +1,15 @@
1ToDo/Notes: 1ToDo/Notes:
2 - Find and fix bugs. 2 - Find and fix bugs.
3 - In between ntfs_prepare/commit_write, need exclusion between 3 - The only places in the kernel where a file is resized are
4 simultaneous file extensions. This is given to us by holding i_sem 4 ntfs_file_write*() and ntfs_truncate() for both of which i_sem is
5 on the inode. The only places in the kernel when a file is resized 5 held. Just have to be careful in read-/writepage and other helpers
6 are prepare/commit write and truncate for both of which i_sem is 6 not running under i_sem that we play nice... Also need to be careful
7 held. Just have to be careful in readpage/writepage and all other 7 with initialized_size extension in ntfs_file_write*() and writepage.
8 helpers not running under i_sem that we play nice... 8 UPDATE: The only things that need to be checked are the compressed
9 Also need to be careful with initialized_size extention in 9 write and the other attribute resize/write cases like index
10 ntfs_prepare_write. Basically, just be _very_ careful in this code... 10 attributes, etc. For now none of these are implemented so are safe.
11 UPDATE: The only things that need to be checked are read/writepage 11 - Implement filling in of holes in aops.c::ntfs_writepage() and its
12 which do not hold i_sem. Note writepage cannot change i_size but it 12 helpers.
13 needs to cope with a concurrent i_size change, just like readpage.
14 Also both need to cope with concurrent changes to the other sizes,
15 i.e. initialized/allocated/compressed size, as well.
16 - Implement mft.c::sync_mft_mirror_umount(). We currently will just 13 - Implement mft.c::sync_mft_mirror_umount(). We currently will just
17 leave the volume dirty on umount if the final iput(vol->mft_ino) 14 leave the volume dirty on umount if the final iput(vol->mft_ino)
18 causes a write of any mirrored mft records due to the mft mirror 15 causes a write of any mirrored mft records due to the mft mirror
@@ -22,6 +19,68 @@ ToDo/Notes:
22 - Enable the code for setting the NT4 compatibility flag when we start 19 - Enable the code for setting the NT4 compatibility flag when we start
23 making NTFS 1.2 specific modifications. 20 making NTFS 1.2 specific modifications.
24 21
222.1.25 - (Almost) fully implement write(2) and truncate(2).
23
24 - Change ntfs_map_runlist_nolock(), ntfs_attr_find_vcn_nolock() and
25 {__,}ntfs_cluster_free() to also take an optional attribute search
26 context as argument. This allows calling these functions with the
27 mft record mapped. Update all callers.
28 - Fix potential deadlock in ntfs_mft_data_extend_allocation_nolock()
29 error handling by passing in the active search context when calling
30 ntfs_cluster_free().
31 - Change ntfs_cluster_alloc() to take an extra boolean parameter
 32	  specifying whether the clusters are being allocated to extend an
33 attribute or to fill a hole.
34 - Change ntfs_attr_make_non_resident() to call ntfs_cluster_alloc()
35 with @is_extension set to TRUE and remove the runlist terminator
36 fixup code as this is now done by ntfs_cluster_alloc().
 37	- Change ntfs_attr_make_non_resident() to take the attribute value size
38 as an extra parameter. This is needed since we need to know the size
39 before we can map the mft record and our callers always know it. The
40 reason we cannot simply read the size from the vfs inode i_size is
41 that this is not necessarily uptodate. This happens when
42 ntfs_attr_make_non_resident() is called in the ->truncate call path.
43 - Fix ntfs_attr_make_non_resident() to update the vfs inode i_blocks
44 which is zero for a resident attribute but should no longer be zero
45 once the attribute is non-resident as it then has real clusters
46 allocated.
47 - Add fs/ntfs/attrib.[hc]::ntfs_attr_extend_allocation(), a function to
48 extend the allocation of an attributes. Optionally, the data size,
49 but not the initialized size can be extended, too.
50 - Implement fs/ntfs/inode.[hc]::ntfs_truncate(). It only supports
51 uncompressed and unencrypted files and it never creates sparse files
52 at least for the moment (making a file sparse requires us to modify
53 its directory entries and we do not support directory operations at
54 the moment). Also, support for highly fragmented files, i.e. ones
 55	  whose data attribute is split across multiple extents, is severely
56 limited. When such a case is encountered, EOPNOTSUPP is returned.
57 - Enable ATTR_SIZE attribute changes in ntfs_setattr(). This completes
58 the initial implementation of file truncation. Now both open(2)ing
59 a file with the O_TRUNC flag and the {,f}truncate(2) system calls
60 will resize a file appropriately. The limitations are that only
61 uncompressed and unencrypted files are supported. Also, there is
62 only very limited support for highly fragmented files (the ones whose
63 $DATA attribute is split into multiple attribute extents).
64 - In attrib.c::ntfs_attr_set() call balance_dirty_pages_ratelimited()
65 and cond_resched() in the main loop as we could be dirtying a lot of
66 pages and this ensures we play nice with the VM and the system as a
 67	  whole. (A sketch of this throttling pattern follows this list.)
68 - Implement file operations ->write, ->aio_write, ->writev for regular
 69	  files. This replaces the old use of generic_file_write() et al. and
70 the address space operations ->prepare_write and ->commit_write.
71 This means that both sparse and non-sparse (unencrypted and
72 uncompressed) files can now be extended using the normal write(2)
73 code path. There are two limitations at present and these are that
74 we never create sparse files and that we only have limited support
75 for highly fragmented files, i.e. ones whose data attribute is split
76 across multiple extents. When such a case is encountered,
77 EOPNOTSUPP is returned.
78 - $EA attributes can be both resident and non-resident.
79 - Use %z for size_t to fix compilation warnings. (Andrew Morton)
80 - Fix compilation warnings with gcc-4.0.2 on SUSE 10.0.
81 - Document extended attribute ($EA) NEED_EA flag. (Based on libntfs
82 patch by Yura Pakhuchiy.)
83
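A hedged sketch of the throttling pattern named in the ntfs_attr_set() item above (mapping and nr_pages are illustrative variables, not the real ones):

	do {
		/* ... dirty one page of the attribute value ... */
		balance_dirty_pages_ratelimited(mapping); /* stay under the
							   * dirty thresholds */
		cond_resched(); /* yield so zeroing a huge attribute does
				 * not monopolize the CPU */
	} while (--nr_pages > 0);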
252.1.24 - Lots of bug fixes and support more clean journal states. 842.1.24 - Lots of bug fixes and support more clean journal states.
26 85
27 - Support journals ($LogFile) which have been modified by chkdsk. This 86 - Support journals ($LogFile) which have been modified by chkdsk. This
diff --git a/fs/ntfs/Makefile b/fs/ntfs/Makefile
index 894b2b876d35..d0d45d1c853a 100644
--- a/fs/ntfs/Makefile
+++ b/fs/ntfs/Makefile
@@ -6,7 +6,7 @@ ntfs-objs := aops.o attrib.o collate.o compress.o debug.o dir.o file.o \
6 index.o inode.o mft.o mst.o namei.o runlist.o super.o sysctl.o \ 6 index.o inode.o mft.o mst.o namei.o runlist.o super.o sysctl.o \
7 unistr.o upcase.o 7 unistr.o upcase.o
8 8
9EXTRA_CFLAGS = -DNTFS_VERSION=\"2.1.24\" 9EXTRA_CFLAGS = -DNTFS_VERSION=\"2.1.25\"
10 10
11ifeq ($(CONFIG_NTFS_DEBUG),y) 11ifeq ($(CONFIG_NTFS_DEBUG),y)
12EXTRA_CFLAGS += -DDEBUG 12EXTRA_CFLAGS += -DDEBUG
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index 5e80c07c6a4d..1c0a4315876a 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -1391,8 +1391,7 @@ retry_writepage:
1391 if (NInoEncrypted(ni)) { 1391 if (NInoEncrypted(ni)) {
1392 unlock_page(page); 1392 unlock_page(page);
1393 BUG_ON(ni->type != AT_DATA); 1393 BUG_ON(ni->type != AT_DATA);
1394 ntfs_debug("Denying write access to encrypted " 1394 ntfs_debug("Denying write access to encrypted file.");
1395 "file.");
1396 return -EACCES; 1395 return -EACCES;
1397 } 1396 }
1398 /* Compressed data streams are handled in compress.c. */ 1397 /* Compressed data streams are handled in compress.c. */
@@ -1508,8 +1507,8 @@ retry_writepage:
1508 /* Zero out of bounds area in the page cache page. */ 1507 /* Zero out of bounds area in the page cache page. */
1509 memset(kaddr + attr_len, 0, PAGE_CACHE_SIZE - attr_len); 1508 memset(kaddr + attr_len, 0, PAGE_CACHE_SIZE - attr_len);
1510 kunmap_atomic(kaddr, KM_USER0); 1509 kunmap_atomic(kaddr, KM_USER0);
1511 flush_dcache_mft_record_page(ctx->ntfs_ino);
1512 flush_dcache_page(page); 1510 flush_dcache_page(page);
1511 flush_dcache_mft_record_page(ctx->ntfs_ino);
1513 /* We are done with the page. */ 1512 /* We are done with the page. */
1514 end_page_writeback(page); 1513 end_page_writeback(page);
1515 /* Finally, mark the mft record dirty, so it gets written back. */ 1514 /* Finally, mark the mft record dirty, so it gets written back. */
@@ -1542,830 +1541,6 @@ err_out:
1542 return err; 1541 return err;
1543} 1542}
1544 1543
1545/**
1546 * ntfs_prepare_nonresident_write -
1547 *
1548 */
1549static int ntfs_prepare_nonresident_write(struct page *page,
1550 unsigned from, unsigned to)
1551{
1552 VCN vcn;
1553 LCN lcn;
1554 s64 initialized_size;
1555 loff_t i_size;
1556 sector_t block, ablock, iblock;
1557 struct inode *vi;
1558 ntfs_inode *ni;
1559 ntfs_volume *vol;
1560 runlist_element *rl;
1561 struct buffer_head *bh, *head, *wait[2], **wait_bh = wait;
1562 unsigned long flags;
1563 unsigned int vcn_ofs, block_start, block_end, blocksize;
1564 int err;
1565 BOOL is_retry;
1566 unsigned char blocksize_bits;
1567
1568 vi = page->mapping->host;
1569 ni = NTFS_I(vi);
1570 vol = ni->vol;
1571
1572 ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
1573 "0x%lx, from = %u, to = %u.", ni->mft_no, ni->type,
1574 page->index, from, to);
1575
1576 BUG_ON(!NInoNonResident(ni));
1577
1578 blocksize_bits = vi->i_blkbits;
1579 blocksize = 1 << blocksize_bits;
1580
1581 /*
1582 * create_empty_buffers() will create uptodate/dirty buffers if the
1583 * page is uptodate/dirty.
1584 */
1585 if (!page_has_buffers(page))
1586 create_empty_buffers(page, blocksize, 0);
1587 bh = head = page_buffers(page);
1588 if (unlikely(!bh))
1589 return -ENOMEM;
1590
1591 /* The first block in the page. */
1592 block = (s64)page->index << (PAGE_CACHE_SHIFT - blocksize_bits);
1593
1594 read_lock_irqsave(&ni->size_lock, flags);
1595 /*
1596 * The first out of bounds block for the allocated size. No need to
1597 * round up as allocated_size is in multiples of cluster size and the
1598 * minimum cluster size is 512 bytes, which is equal to the smallest
1599 * blocksize.
1600 */
1601 ablock = ni->allocated_size >> blocksize_bits;
1602 i_size = i_size_read(vi);
1603 initialized_size = ni->initialized_size;
1604 read_unlock_irqrestore(&ni->size_lock, flags);
1605
1606 /* The last (fully or partially) initialized block. */
1607 iblock = initialized_size >> blocksize_bits;
1608
1609 /* Loop through all the buffers in the page. */
1610 block_start = 0;
1611 rl = NULL;
1612 err = 0;
1613 do {
1614 block_end = block_start + blocksize;
1615 /*
1616 * If buffer @bh is outside the write, just mark it uptodate
1617 * if the page is uptodate and continue with the next buffer.
1618 */
1619 if (block_end <= from || block_start >= to) {
1620 if (PageUptodate(page)) {
1621 if (!buffer_uptodate(bh))
1622 set_buffer_uptodate(bh);
1623 }
1624 continue;
1625 }
1626 /*
1627 * @bh is at least partially being written to.
1628 * Make sure it is not marked as new.
1629 */
1630 //if (buffer_new(bh))
1631 // clear_buffer_new(bh);
1632
1633 if (block >= ablock) {
1634 // TODO: block is above allocated_size, need to
1635 // allocate it. Best done in one go to accommodate not
1636 // only block but all above blocks up to and including:
1637 // ((page->index << PAGE_CACHE_SHIFT) + to + blocksize
1638 // - 1) >> blobksize_bits. Obviously will need to round
1639 // up to next cluster boundary, too. This should be
1640 // done with a helper function, so it can be reused.
1641 ntfs_error(vol->sb, "Writing beyond allocated size "
1642 "is not supported yet. Sorry.");
1643 err = -EOPNOTSUPP;
1644 goto err_out;
1645 // Need to update ablock.
1646 // Need to set_buffer_new() on all block bhs that are
1647 // newly allocated.
1648 }
1649 /*
1650 * Now we have enough allocated size to fulfill the whole
1651 * request, i.e. block < ablock is true.
1652 */
1653 if (unlikely((block >= iblock) &&
1654 (initialized_size < i_size))) {
1655 /*
1656 * If this page is fully outside initialized size, zero
1657 * out all pages between the current initialized size
1658 * and the current page. Just use ntfs_readpage() to do
1659 * the zeroing transparently.
1660 */
1661 if (block > iblock) {
1662 // TODO:
1663 // For each page do:
1664 // - read_cache_page()
1665 // Again for each page do:
1666 // - wait_on_page_locked()
1667 // - Check (PageUptodate(page) &&
1668 // !PageError(page))
1669 // Update initialized size in the attribute and
1670 // in the inode.
1671 // Again, for each page do:
1672 // __set_page_dirty_buffers();
1673 // page_cache_release()
1674 // We don't need to wait on the writes.
1675 // Update iblock.
1676 }
1677 /*
1678 * The current page straddles initialized size. Zero
1679 * all non-uptodate buffers and set them uptodate (and
1680 * dirty?). Note, there aren't any non-uptodate buffers
1681 * if the page is uptodate.
1682 * FIXME: For an uptodate page, the buffers may need to
1683 * be written out because they were not initialized on
1684 * disk before.
1685 */
1686 if (!PageUptodate(page)) {
1687 // TODO:
1688 // Zero any non-uptodate buffers up to i_size.
1689 // Set them uptodate and dirty.
1690 }
1691 // TODO:
1692 // Update initialized size in the attribute and in the
1693 // inode (up to i_size).
1694 // Update iblock.
1695 // FIXME: This is inefficient. Try to batch the two
1696 // size changes to happen in one go.
1697 ntfs_error(vol->sb, "Writing beyond initialized size "
1698 "is not supported yet. Sorry.");
1699 err = -EOPNOTSUPP;
1700 goto err_out;
1701 // Do NOT set_buffer_new() BUT DO clear buffer range
1702 // outside write request range.
1703 // set_buffer_uptodate() on complete buffers as well as
1704 // set_buffer_dirty().
1705 }
1706
1707 /* Need to map unmapped buffers. */
1708 if (!buffer_mapped(bh)) {
1709 /* Unmapped buffer. Need to map it. */
1710 bh->b_bdev = vol->sb->s_bdev;
1711
1712 /* Convert block into corresponding vcn and offset. */
1713 vcn = (VCN)block << blocksize_bits >>
1714 vol->cluster_size_bits;
1715 vcn_ofs = ((VCN)block << blocksize_bits) &
1716 vol->cluster_size_mask;
1717
1718 is_retry = FALSE;
1719 if (!rl) {
1720lock_retry_remap:
1721 down_read(&ni->runlist.lock);
1722 rl = ni->runlist.rl;
1723 }
1724 if (likely(rl != NULL)) {
1725 /* Seek to element containing target vcn. */
1726 while (rl->length && rl[1].vcn <= vcn)
1727 rl++;
1728 lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
1729 } else
1730 lcn = LCN_RL_NOT_MAPPED;
1731 if (unlikely(lcn < 0)) {
1732 /*
1733 * We extended the attribute allocation above.
1734 * If we hit an ENOENT here it means that the
1735 * allocation was insufficient which is a bug.
1736 */
1737 BUG_ON(lcn == LCN_ENOENT);
1738
1739 /* It is a hole, need to instantiate it. */
1740 if (lcn == LCN_HOLE) {
1741 // TODO: Instantiate the hole.
1742 // clear_buffer_new(bh);
1743 // unmap_underlying_metadata(bh->b_bdev,
1744 // bh->b_blocknr);
1745 // For non-uptodate buffers, need to
1746 // zero out the region outside the
1747 // request in this bh or all bhs,
1748 // depending on what we implemented
1749 // above.
1750 // Need to flush_dcache_page().
1751 // Or could use set_buffer_new()
1752 // instead?
1753 ntfs_error(vol->sb, "Writing into "
1754 "sparse regions is "
1755 "not supported yet. "
1756 "Sorry.");
1757 err = -EOPNOTSUPP;
1758 if (!rl)
1759 up_read(&ni->runlist.lock);
1760 goto err_out;
1761 } else if (!is_retry &&
1762 lcn == LCN_RL_NOT_MAPPED) {
1763 is_retry = TRUE;
1764 /*
1765 * Attempt to map runlist, dropping
1766 * lock for the duration.
1767 */
1768 up_read(&ni->runlist.lock);
1769 err = ntfs_map_runlist(ni, vcn);
1770 if (likely(!err))
1771 goto lock_retry_remap;
1772 rl = NULL;
1773 } else if (!rl)
1774 up_read(&ni->runlist.lock);
1775 /*
1776 * Failed to map the buffer, even after
1777 * retrying.
1778 */
1779 if (!err)
1780 err = -EIO;
1781 bh->b_blocknr = -1;
1782 ntfs_error(vol->sb, "Failed to write to inode "
1783 "0x%lx, attribute type 0x%x, "
1784 "vcn 0x%llx, offset 0x%x "
1785 "because its location on disk "
1786 "could not be determined%s "
1787 "(error code %i).",
1788 ni->mft_no, ni->type,
1789 (unsigned long long)vcn,
1790 vcn_ofs, is_retry ? " even "
1791 "after retrying" : "", err);
1792 goto err_out;
1793 }
1794 /* We now have a successful remap, i.e. lcn >= 0. */
1795
1796 /* Setup buffer head to correct block. */
1797 bh->b_blocknr = ((lcn << vol->cluster_size_bits)
1798 + vcn_ofs) >> blocksize_bits;
1799 set_buffer_mapped(bh);
1800
1801 // FIXME: Something analogous to this is needed for
1802 // each newly allocated block, i.e. BH_New.
1803 // FIXME: Might need to take this out of the
1804 // if (!buffer_mapped(bh)) {}, depending on how we
1805 // implement things during the allocated_size and
1806 // initialized_size extension code above.
1807 if (buffer_new(bh)) {
1808 clear_buffer_new(bh);
1809 unmap_underlying_metadata(bh->b_bdev,
1810 bh->b_blocknr);
1811 if (PageUptodate(page)) {
1812 set_buffer_uptodate(bh);
1813 continue;
1814 }
1815 /*
1816 * Page is _not_ uptodate, zero surrounding
1817 * region. NOTE: This is how we decide if to
1818 * zero or not!
1819 */
1820 if (block_end > to || block_start < from) {
1821 void *kaddr;
1822
1823 kaddr = kmap_atomic(page, KM_USER0);
1824 if (block_end > to)
1825 memset(kaddr + to, 0,
1826 block_end - to);
1827 if (block_start < from)
1828 memset(kaddr + block_start, 0,
1829 from -
1830 block_start);
1831 flush_dcache_page(page);
1832 kunmap_atomic(kaddr, KM_USER0);
1833 }
1834 continue;
1835 }
1836 }
1837 /* @bh is mapped, set it uptodate if the page is uptodate. */
1838 if (PageUptodate(page)) {
1839 if (!buffer_uptodate(bh))
1840 set_buffer_uptodate(bh);
1841 continue;
1842 }
1843 /*
1844 * The page is not uptodate. The buffer is mapped. If it is not
1845 * uptodate, and it is only partially being written to, we need
1846 * to read the buffer in before the write, i.e. right now.
1847 */
1848 if (!buffer_uptodate(bh) &&
1849 (block_start < from || block_end > to)) {
1850 ll_rw_block(READ, 1, &bh);
1851 *wait_bh++ = bh;
1852 }
1853 } while (block++, block_start = block_end,
1854 (bh = bh->b_this_page) != head);
1855
1856 /* Release the lock if we took it. */
1857 if (rl) {
1858 up_read(&ni->runlist.lock);
1859 rl = NULL;
1860 }
1861
1862 /* If we issued read requests, let them complete. */
1863 while (wait_bh > wait) {
1864 wait_on_buffer(*--wait_bh);
1865 if (!buffer_uptodate(*wait_bh))
1866 return -EIO;
1867 }
1868
1869 ntfs_debug("Done.");
1870 return 0;
1871err_out:
1872 /*
1873 * Zero out any newly allocated blocks to avoid exposing stale data.
1874 * If BH_New is set, we know that the block was newly allocated in the
1875 * above loop.
1876 * FIXME: What about initialized_size increments? Have we done all the
1877 * required zeroing above? If not this error handling is broken, and
1878 * in particular the if (block_end <= from) check is completely bogus.
1879 */
1880 bh = head;
1881 block_start = 0;
1882 is_retry = FALSE;
1883 do {
1884 block_end = block_start + blocksize;
1885 if (block_end <= from)
1886 continue;
1887 if (block_start >= to)
1888 break;
1889 if (buffer_new(bh)) {
1890 void *kaddr;
1891
1892 clear_buffer_new(bh);
1893 kaddr = kmap_atomic(page, KM_USER0);
1894 memset(kaddr + block_start, 0, bh->b_size);
1895 kunmap_atomic(kaddr, KM_USER0);
1896 set_buffer_uptodate(bh);
1897 mark_buffer_dirty(bh);
1898 is_retry = TRUE;
1899 }
1900 } while (block_start = block_end, (bh = bh->b_this_page) != head);
1901 if (is_retry)
1902 flush_dcache_page(page);
1903 if (rl)
1904 up_read(&ni->runlist.lock);
1905 return err;
1906}
1907
1908/**
1909 * ntfs_prepare_write - prepare a page for receiving data
1910 *
1911 * This is called from generic_file_write() with i_sem held on the inode
1912 * (@page->mapping->host). The @page is locked but not kmap()ped. The source
1913 * data has not yet been copied into the @page.
1914 *
1915 * Need to extend the attribute/fill in holes if necessary, create blocks and
1916 * make partially overwritten blocks uptodate,
1917 *
1918 * i_size is not to be modified yet.
1919 *
1920 * Return 0 on success or -errno on error.
1921 *
1922 * Should be using block_prepare_write() [support for sparse files] or
1923 * cont_prepare_write() [no support for sparse files]. Cannot do that due to
1924 * ntfs specifics but can look at them for implementation guidance.
1925 *
1926 * Note: In the range, @from is inclusive and @to is exclusive, i.e. @from is
1927 * the first byte in the page that will be written to and @to is the first byte
1928 * after the last byte that will be written to.
1929 */
1930static int ntfs_prepare_write(struct file *file, struct page *page,
1931 unsigned from, unsigned to)
1932{
1933 s64 new_size;
1934 loff_t i_size;
1935 struct inode *vi = page->mapping->host;
1936 ntfs_inode *base_ni = NULL, *ni = NTFS_I(vi);
1937 ntfs_volume *vol = ni->vol;
1938 ntfs_attr_search_ctx *ctx = NULL;
1939 MFT_RECORD *m = NULL;
1940 ATTR_RECORD *a;
1941 u8 *kaddr;
1942 u32 attr_len;
1943 int err;
1944
1945 ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
1946 "0x%lx, from = %u, to = %u.", vi->i_ino, ni->type,
1947 page->index, from, to);
1948 BUG_ON(!PageLocked(page));
1949 BUG_ON(from > PAGE_CACHE_SIZE);
1950 BUG_ON(to > PAGE_CACHE_SIZE);
1951 BUG_ON(from > to);
1952 BUG_ON(NInoMstProtected(ni));
1953 /*
1954 * If a previous ntfs_truncate() failed, repeat it and abort if it
1955 * fails again.
1956 */
1957 if (unlikely(NInoTruncateFailed(ni))) {
1958 down_write(&vi->i_alloc_sem);
1959 err = ntfs_truncate(vi);
1960 up_write(&vi->i_alloc_sem);
1961 if (err || NInoTruncateFailed(ni)) {
1962 if (!err)
1963 err = -EIO;
1964 goto err_out;
1965 }
1966 }
1967 /* If the attribute is not resident, deal with it elsewhere. */
1968 if (NInoNonResident(ni)) {
1969 /*
1970 * Only unnamed $DATA attributes can be compressed, encrypted,
1971 * and/or sparse.
1972 */
1973 if (ni->type == AT_DATA && !ni->name_len) {
1974 /* If file is encrypted, deny access, just like NT4. */
1975 if (NInoEncrypted(ni)) {
1976 ntfs_debug("Denying write access to encrypted "
1977 "file.");
1978 return -EACCES;
1979 }
1980 /* Compressed data streams are handled in compress.c. */
1981 if (NInoCompressed(ni)) {
1982 // TODO: Implement and replace this check with
1983 // return ntfs_write_compressed_block(page);
1984 ntfs_error(vi->i_sb, "Writing to compressed "
1985 "files is not supported yet. "
1986 "Sorry.");
1987 return -EOPNOTSUPP;
1988 }
1989 // TODO: Implement and remove this check.
1990 if (NInoSparse(ni)) {
1991 ntfs_error(vi->i_sb, "Writing to sparse files "
1992 "is not supported yet. Sorry.");
1993 return -EOPNOTSUPP;
1994 }
1995 }
1996 /* Normal data stream. */
1997 return ntfs_prepare_nonresident_write(page, from, to);
1998 }
1999 /*
2000 * Attribute is resident, implying it is not compressed, encrypted, or
2001 * sparse.
2002 */
2003 BUG_ON(page_has_buffers(page));
2004 new_size = ((s64)page->index << PAGE_CACHE_SHIFT) + to;
2005 /* If we do not need to resize the attribute allocation we are done. */
2006 if (new_size <= i_size_read(vi))
2007 goto done;
2008 /* Map, pin, and lock the (base) mft record. */
2009 if (!NInoAttr(ni))
2010 base_ni = ni;
2011 else
2012 base_ni = ni->ext.base_ntfs_ino;
2013 m = map_mft_record(base_ni);
2014 if (IS_ERR(m)) {
2015 err = PTR_ERR(m);
2016 m = NULL;
2017 ctx = NULL;
2018 goto err_out;
2019 }
2020 ctx = ntfs_attr_get_search_ctx(base_ni, m);
2021 if (unlikely(!ctx)) {
2022 err = -ENOMEM;
2023 goto err_out;
2024 }
2025 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
2026 CASE_SENSITIVE, 0, NULL, 0, ctx);
2027 if (unlikely(err)) {
2028 if (err == -ENOENT)
2029 err = -EIO;
2030 goto err_out;
2031 }
2032 m = ctx->mrec;
2033 a = ctx->attr;
2034 /* The total length of the attribute value. */
2035 attr_len = le32_to_cpu(a->data.resident.value_length);
2036 /* Fix an eventual previous failure of ntfs_commit_write(). */
2037 i_size = i_size_read(vi);
2038 if (unlikely(attr_len > i_size)) {
2039 attr_len = i_size;
2040 a->data.resident.value_length = cpu_to_le32(attr_len);
2041 }
2042 /* If we do not need to resize the attribute allocation we are done. */
2043 if (new_size <= attr_len)
2044 goto done_unm;
2045 /* Check if new size is allowed in $AttrDef. */
2046 err = ntfs_attr_size_bounds_check(vol, ni->type, new_size);
2047 if (unlikely(err)) {
2048 if (err == -ERANGE) {
2049 ntfs_error(vol->sb, "Write would cause the inode "
2050 "0x%lx to exceed the maximum size for "
2051 "its attribute type (0x%x). Aborting "
2052 "write.", vi->i_ino,
2053 le32_to_cpu(ni->type));
2054 } else {
2055 ntfs_error(vol->sb, "Inode 0x%lx has unknown "
2056 "attribute type 0x%x. Aborting "
2057 "write.", vi->i_ino,
2058 le32_to_cpu(ni->type));
2059 err = -EIO;
2060 }
2061 goto err_out2;
2062 }
2063 /*
2064 * Extend the attribute record to be able to store the new attribute
2065 * size.
2066 */
2067 if (new_size >= vol->mft_record_size || ntfs_attr_record_resize(m, a,
2068 le16_to_cpu(a->data.resident.value_offset) +
2069 new_size)) {
2070 /* Not enough space in the mft record. */
2071 ntfs_error(vol->sb, "Not enough space in the mft record for "
2072 "the resized attribute value. This is not "
2073 "supported yet. Aborting write.");
2074 err = -EOPNOTSUPP;
2075 goto err_out2;
2076 }
2077 /*
2078 * We have enough space in the mft record to fit the write. This
2079 * implies the attribute is smaller than the mft record and hence the
2080 * attribute must be in a single page and hence page->index must be 0.
2081 */
2082 BUG_ON(page->index);
2083 /*
2084 * If the beginning of the write is past the old size, enlarge the
2085 * attribute value up to the beginning of the write and fill it with
2086 * zeroes.
2087 */
2088 if (from > attr_len) {
2089 memset((u8*)a + le16_to_cpu(a->data.resident.value_offset) +
2090 attr_len, 0, from - attr_len);
2091 a->data.resident.value_length = cpu_to_le32(from);
2092 /* Zero the corresponding area in the page as well. */
2093 if (PageUptodate(page)) {
2094 kaddr = kmap_atomic(page, KM_USER0);
2095 memset(kaddr + attr_len, 0, from - attr_len);
2096 kunmap_atomic(kaddr, KM_USER0);
2097 flush_dcache_page(page);
2098 }
2099 }
2100 flush_dcache_mft_record_page(ctx->ntfs_ino);
2101 mark_mft_record_dirty(ctx->ntfs_ino);
2102done_unm:
2103 ntfs_attr_put_search_ctx(ctx);
2104 unmap_mft_record(base_ni);
2105 /*
2106 * Because resident attributes are handled by memcpy() to/from the
2107 * corresponding MFT record, and because this form of i/o is byte
2108 * aligned rather than block aligned, there is no need to bring the
2109 * page uptodate here as in the non-resident case where we need to
2110 * bring the buffers straddled by the write uptodate before
2111 * generic_file_write() does the copying from userspace.
2112 *
2113 * We thus defer the uptodate bringing of the page region outside the
2114 * region written to to ntfs_commit_write(), which makes the code
2115 * simpler and saves one atomic kmap which is good.
2116 */
2117done:
2118 ntfs_debug("Done.");
2119 return 0;
2120err_out:
2121 if (err == -ENOMEM)
2122 ntfs_warning(vi->i_sb, "Error allocating memory required to "
2123 "prepare the write.");
2124 else {
2125 ntfs_error(vi->i_sb, "Resident attribute prepare write failed "
2126 "with error %i.", err);
2127 NVolSetErrors(vol);
2128 make_bad_inode(vi);
2129 }
2130err_out2:
2131 if (ctx)
2132 ntfs_attr_put_search_ctx(ctx);
2133 if (m)
2134 unmap_mft_record(base_ni);
2135 return err;
2136}
2137
2138/**
2139 * ntfs_commit_nonresident_write -
2140 *
2141 */
2142static int ntfs_commit_nonresident_write(struct page *page,
2143 unsigned from, unsigned to)
2144{
2145 s64 pos = ((s64)page->index << PAGE_CACHE_SHIFT) + to;
2146 struct inode *vi = page->mapping->host;
2147 struct buffer_head *bh, *head;
2148 unsigned int block_start, block_end, blocksize;
2149 BOOL partial;
2150
2151 ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
2152 "0x%lx, from = %u, to = %u.", vi->i_ino,
2153 NTFS_I(vi)->type, page->index, from, to);
2154 blocksize = 1 << vi->i_blkbits;
2155
2156 // FIXME: We need a whole slew of special cases in here for compressed
2157 // files for example...
2158 // For now, we know ntfs_prepare_write() would have failed so we can't
2159 // get here in any of the cases which we have to special case, so we
2160 // are just a ripped off, unrolled generic_commit_write().
2161
2162 bh = head = page_buffers(page);
2163 block_start = 0;
2164 partial = FALSE;
2165 do {
2166 block_end = block_start + blocksize;
2167 if (block_end <= from || block_start >= to) {
2168 if (!buffer_uptodate(bh))
2169 partial = TRUE;
2170 } else {
2171 set_buffer_uptodate(bh);
2172 mark_buffer_dirty(bh);
2173 }
2174 } while (block_start = block_end, (bh = bh->b_this_page) != head);
2175 /*
2176 * If this is a partial write which happened to make all buffers
2177 * uptodate then we can optimize away a bogus ->readpage() for the next
2178 * read(). Here we 'discover' whether the page went uptodate as a
2179 * result of this (potentially partial) write.
2180 */
2181 if (!partial)
2182 SetPageUptodate(page);
2183 /*
2184 * Not convinced about this at all. See disparity comment above. For
2185 * now we know ntfs_prepare_write() would have failed in the write
2186 * exceeds i_size case, so this will never trigger which is fine.
2187 */
2188 if (pos > i_size_read(vi)) {
2189 ntfs_error(vi->i_sb, "Writing beyond the existing file size is "
2190 "not supported yet. Sorry.");
2191 return -EOPNOTSUPP;
2192 // vi->i_size = pos;
2193 // mark_inode_dirty(vi);
2194 }
2195 ntfs_debug("Done.");
2196 return 0;
2197}
2198
2199/**
2200 * ntfs_commit_write - commit the received data
2201 *
2202 * This is called from generic_file_write() with i_sem held on the inode
2203 * (@page->mapping->host). The @page is locked but not kmap()ped. The source
2204 * data has already been copied into the @page. ntfs_prepare_write() has been
2205 * called before the data copied and it returned success so we can take the
2206 * results of various BUG checks and some error handling for granted.
2207 *
2208 * Need to mark modified blocks dirty so they get written out later when
2209 * ntfs_writepage() is invoked by the VM.
2210 *
2211 * Return 0 on success or -errno on error.
2212 *
2213 * Should be using generic_commit_write(). This marks buffers uptodate and
2214 * dirty, sets the page uptodate if all buffers in the page are uptodate, and
2215 * updates i_size if the end of io is beyond i_size. In that case, it also
2216 * marks the inode dirty.
2217 *
2218 * Cannot use generic_commit_write() due to ntfs specialities but can look at
2219 * it for implementation guidance.
2220 *
2221 * If things have gone as outlined in ntfs_prepare_write(), then we do not
2222 * need to do any page content modifications here at all, except in the write
2223 * to resident attribute case, where we need to do the uptodate bringing here
2224 * which we combine with the copying into the mft record which means we save
2225 * one atomic kmap.
2226 */
2227static int ntfs_commit_write(struct file *file, struct page *page,
2228 unsigned from, unsigned to)
2229{
2230 struct inode *vi = page->mapping->host;
2231 ntfs_inode *base_ni, *ni = NTFS_I(vi);
2232 char *kaddr, *kattr;
2233 ntfs_attr_search_ctx *ctx;
2234 MFT_RECORD *m;
2235 ATTR_RECORD *a;
2236 u32 attr_len;
2237 int err;
2238
2239 ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
2240 "0x%lx, from = %u, to = %u.", vi->i_ino, ni->type,
2241 page->index, from, to);
2242 /* If the attribute is not resident, deal with it elsewhere. */
2243 if (NInoNonResident(ni)) {
2244 /* Only unnamed $DATA attributes can be compressed/encrypted. */
2245 if (ni->type == AT_DATA && !ni->name_len) {
2246 /* Encrypted files need separate handling. */
2247 if (NInoEncrypted(ni)) {
2248 // We never get here at present!
2249 BUG();
2250 }
2251 /* Compressed data streams are handled in compress.c. */
2252 if (NInoCompressed(ni)) {
2253 // TODO: Implement this!
2254 // return ntfs_write_compressed_block(page);
2255 // We never get here at present!
2256 BUG();
2257 }
2258 }
2259 /* Normal data stream. */
2260 return ntfs_commit_nonresident_write(page, from, to);
2261 }
2262 /*
2263 * Attribute is resident, implying it is not compressed, encrypted, or
2264 * sparse.
2265 */
2266 if (!NInoAttr(ni))
2267 base_ni = ni;
2268 else
2269 base_ni = ni->ext.base_ntfs_ino;
2270 /* Map, pin, and lock the mft record. */
2271 m = map_mft_record(base_ni);
2272 if (IS_ERR(m)) {
2273 err = PTR_ERR(m);
2274 m = NULL;
2275 ctx = NULL;
2276 goto err_out;
2277 }
2278 ctx = ntfs_attr_get_search_ctx(base_ni, m);
2279 if (unlikely(!ctx)) {
2280 err = -ENOMEM;
2281 goto err_out;
2282 }
2283 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
2284 CASE_SENSITIVE, 0, NULL, 0, ctx);
2285 if (unlikely(err)) {
2286 if (err == -ENOENT)
2287 err = -EIO;
2288 goto err_out;
2289 }
2290 a = ctx->attr;
2291 /* The total length of the attribute value. */
2292 attr_len = le32_to_cpu(a->data.resident.value_length);
2293 BUG_ON(from > attr_len);
2294 kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset);
2295 kaddr = kmap_atomic(page, KM_USER0);
2296 /* Copy the received data from the page to the mft record. */
2297 memcpy(kattr + from, kaddr + from, to - from);
2298 /* Update the attribute length if necessary. */
2299 if (to > attr_len) {
2300 attr_len = to;
2301 a->data.resident.value_length = cpu_to_le32(attr_len);
2302 }
2303 /*
2304 * If the page is not uptodate, bring the out of bounds area(s)
2305 * uptodate by copying data from the mft record to the page.
2306 */
2307 if (!PageUptodate(page)) {
2308 if (from > 0)
2309 memcpy(kaddr, kattr, from);
2310 if (to < attr_len)
2311 memcpy(kaddr + to, kattr + to, attr_len - to);
2312 /* Zero the region outside the end of the attribute value. */
2313 if (attr_len < PAGE_CACHE_SIZE)
2314 memset(kaddr + attr_len, 0, PAGE_CACHE_SIZE - attr_len);
2315 /*
2316 * The probability of not having done any of the above is
2317 * extremely small, so we just flush unconditionally.
2318 */
2319 flush_dcache_page(page);
2320 SetPageUptodate(page);
2321 }
2322 kunmap_atomic(kaddr, KM_USER0);
2323 /* Update i_size if necessary. */
2324 if (i_size_read(vi) < attr_len) {
2325 unsigned long flags;
2326
2327 write_lock_irqsave(&ni->size_lock, flags);
2328 ni->allocated_size = ni->initialized_size = attr_len;
2329 i_size_write(vi, attr_len);
2330 write_unlock_irqrestore(&ni->size_lock, flags);
2331 }
2332 /* Mark the mft record dirty, so it gets written back. */
2333 flush_dcache_mft_record_page(ctx->ntfs_ino);
2334 mark_mft_record_dirty(ctx->ntfs_ino);
2335 ntfs_attr_put_search_ctx(ctx);
2336 unmap_mft_record(base_ni);
2337 ntfs_debug("Done.");
2338 return 0;
2339err_out:
2340 if (err == -ENOMEM) {
2341 ntfs_warning(vi->i_sb, "Error allocating memory required to "
2342 "commit the write.");
2343 if (PageUptodate(page)) {
2344 ntfs_warning(vi->i_sb, "Page is uptodate, setting "
2345 "dirty so the write will be retried "
2346 "later on by the VM.");
2347 /*
2348 * Put the page on mapping->dirty_pages, but leave its
2349 * buffers' dirty state as-is.
2350 */
2351 __set_page_dirty_nobuffers(page);
2352 err = 0;
2353 } else
2354 ntfs_error(vi->i_sb, "Page is not uptodate. Written "
2355 "data has been lost.");
2356 } else {
2357 ntfs_error(vi->i_sb, "Resident attribute commit write failed "
2358 "with error %i.", err);
2359 NVolSetErrors(ni->vol);
2360 make_bad_inode(vi);
2361 }
2362 if (ctx)
2363 ntfs_attr_put_search_ctx(ctx);
2364 if (m)
2365 unmap_mft_record(base_ni);
2366 return err;
2367}
2368
2369#endif /* NTFS_RW */ 1544#endif /* NTFS_RW */
2370 1545
2371/** 1546/**
@@ -2377,9 +1552,6 @@ struct address_space_operations ntfs_aops = {
2377 disk request queue. */ 1552 disk request queue. */
2378#ifdef NTFS_RW 1553#ifdef NTFS_RW
2379 .writepage = ntfs_writepage, /* Write dirty page to disk. */ 1554 .writepage = ntfs_writepage, /* Write dirty page to disk. */
2380 .prepare_write = ntfs_prepare_write, /* Prepare page and buffers
2381 ready to receive data. */
2382 .commit_write = ntfs_commit_write, /* Commit received data. */
2383#endif /* NTFS_RW */ 1555#endif /* NTFS_RW */
2384}; 1556};
2385 1557
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
index 3f9a4ff42ee5..eda056bac256 100644
--- a/fs/ntfs/attrib.c
+++ b/fs/ntfs/attrib.c
@@ -21,7 +21,9 @@
21 */ 21 */
22 22
23#include <linux/buffer_head.h> 23#include <linux/buffer_head.h>
24#include <linux/sched.h>
24#include <linux/swap.h> 25#include <linux/swap.h>
26#include <linux/writeback.h>
25 27
26#include "attrib.h" 28#include "attrib.h"
27#include "debug.h" 29#include "debug.h"
@@ -36,9 +38,27 @@
36 * ntfs_map_runlist_nolock - map (a part of) a runlist of an ntfs inode 38 * ntfs_map_runlist_nolock - map (a part of) a runlist of an ntfs inode
37 * @ni: ntfs inode for which to map (part of) a runlist 39 * @ni: ntfs inode for which to map (part of) a runlist
38 * @vcn: map runlist part containing this vcn 40 * @vcn: map runlist part containing this vcn
41 * @ctx: active attribute search context if present or NULL if not
39 * 42 *
40 * Map the part of a runlist containing the @vcn of the ntfs inode @ni. 43 * Map the part of a runlist containing the @vcn of the ntfs inode @ni.
41 * 44 *
45 * If @ctx is specified, it is an active search context of @ni and its base mft
46 * record. This is needed when ntfs_map_runlist_nolock() encounters unmapped
47 * runlist fragments and allows their mapping. If you do not have the mft
48 * record mapped, you can specify @ctx as NULL and ntfs_map_runlist_nolock()
49 * will perform the necessary mapping and unmapping.
50 *
51 * Note, ntfs_map_runlist_nolock() saves the state of @ctx on entry and
52 * restores it before returning. Thus, @ctx will be left pointing to the same
53 * attribute on return as on entry. However, the actual pointers in @ctx may
54 * point to different memory locations on return, so you must remember to reset
55 * any cached pointers from the @ctx, i.e. after the call to
56 * ntfs_map_runlist_nolock(), you will probably want to do:
57 * m = ctx->mrec;
58 * a = ctx->attr;
59 * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that
60 * you cache ctx->mrec in a variable @m of type MFT_RECORD *.
61 *
42 * Return 0 on success and -errno on error. There is one special error code 62 * Return 0 on success and -errno on error. There is one special error code
43 * which is not an error as such. This is -ENOENT. It means that @vcn is out 63 * which is not an error as such. This is -ENOENT. It means that @vcn is out
44 * of bounds of the runlist. 64 * of bounds of the runlist.
@@ -46,19 +66,32 @@
46 * Note the runlist can be NULL after this function returns if @vcn is zero and 66 * Note the runlist can be NULL after this function returns if @vcn is zero and
47 * the attribute has zero allocated size, i.e. there simply is no runlist. 67 * the attribute has zero allocated size, i.e. there simply is no runlist.
48 * 68 *
49 * Locking: - The runlist must be locked for writing. 69 * WARNING: If @ctx is supplied, regardless of whether success or failure is
50 * - This function modifies the runlist. 70 * returned, you need to check IS_ERR(@ctx->mrec) and if TRUE the @ctx
71 * is no longer valid, i.e. you need to either call
72 * ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it.
73 * In that case PTR_ERR(@ctx->mrec) will give you the error code for
74 * why the mapping of the old inode failed.
75 *
76 * Locking: - The runlist described by @ni must be locked for writing on entry
77 * and is locked on return. Note the runlist will be modified.
78 * - If @ctx is NULL, the base mft record of @ni must not be mapped on
79 * entry and it will be left unmapped on return.
80 * - If @ctx is not NULL, the base mft record must be mapped on entry
81 * and it will be left mapped on return.
51 */ 82 */
52int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn) 83int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn, ntfs_attr_search_ctx *ctx)
53{ 84{
54 VCN end_vcn; 85 VCN end_vcn;
86 unsigned long flags;
55 ntfs_inode *base_ni; 87 ntfs_inode *base_ni;
56 MFT_RECORD *m; 88 MFT_RECORD *m;
57 ATTR_RECORD *a; 89 ATTR_RECORD *a;
58 ntfs_attr_search_ctx *ctx;
59 runlist_element *rl; 90 runlist_element *rl;
60 unsigned long flags; 91 struct page *put_this_page = NULL;
61 int err = 0; 92 int err = 0;
93 BOOL ctx_is_temporary, ctx_needs_reset;
94 ntfs_attr_search_ctx old_ctx = { NULL, };
62 95
63 ntfs_debug("Mapping runlist part containing vcn 0x%llx.", 96 ntfs_debug("Mapping runlist part containing vcn 0x%llx.",
64 (unsigned long long)vcn); 97 (unsigned long long)vcn);
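The contract above (refresh cached pointers after the call, and check IS_ERR(@ctx->mrec) regardless of the return value) implies a calling pattern like this hedged sketch, where m and a are the caller's cached pointers:

	err = ntfs_map_runlist_nolock(ni, vcn, ctx);
	if (IS_ERR(ctx->mrec)) {
		/* Remapping the old record failed; ctx is unusable until
		 * reinitialized (or put). */
		err = PTR_ERR(ctx->mrec);
		ntfs_attr_reinit_search_ctx(ctx);
	} else {
		m = ctx->mrec;	/* the pointers may have moved */
		a = ctx->attr;
	}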
@@ -66,20 +99,77 @@ int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn)
66 base_ni = ni; 99 base_ni = ni;
67 else 100 else
68 base_ni = ni->ext.base_ntfs_ino; 101 base_ni = ni->ext.base_ntfs_ino;
69 m = map_mft_record(base_ni); 102 if (!ctx) {
70 if (IS_ERR(m)) 103 ctx_is_temporary = ctx_needs_reset = TRUE;
71 return PTR_ERR(m); 104 m = map_mft_record(base_ni);
72 ctx = ntfs_attr_get_search_ctx(base_ni, m); 105 if (IS_ERR(m))
73 if (unlikely(!ctx)) { 106 return PTR_ERR(m);
74 err = -ENOMEM; 107 ctx = ntfs_attr_get_search_ctx(base_ni, m);
75 goto err_out; 108 if (unlikely(!ctx)) {
109 err = -ENOMEM;
110 goto err_out;
111 }
112 } else {
113 VCN allocated_size_vcn;
114
115 BUG_ON(IS_ERR(ctx->mrec));
116 a = ctx->attr;
117 BUG_ON(!a->non_resident);
118 ctx_is_temporary = FALSE;
119 end_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn);
120 read_lock_irqsave(&ni->size_lock, flags);
121 allocated_size_vcn = ni->allocated_size >>
122 ni->vol->cluster_size_bits;
123 read_unlock_irqrestore(&ni->size_lock, flags);
124 if (!a->data.non_resident.lowest_vcn && end_vcn <= 0)
125 end_vcn = allocated_size_vcn - 1;
126 /*
127 * If we already have the attribute extent containing @vcn in
128 * @ctx, no need to look it up again. We slightly cheat in
129 * that if vcn exceeds the allocated size, we will refuse to
130 * map the runlist below, so there is definitely no need to get
131 * the right attribute extent.
132 */
133 if (vcn >= allocated_size_vcn || (a->type == ni->type &&
134 a->name_length == ni->name_len &&
135 !memcmp((u8*)a + le16_to_cpu(a->name_offset),
136 ni->name, ni->name_len) &&
137 sle64_to_cpu(a->data.non_resident.lowest_vcn)
138 <= vcn && end_vcn >= vcn))
139 ctx_needs_reset = FALSE;
140 else {
141 /* Save the old search context. */
142 old_ctx = *ctx;
143 /*
144 * If the currently mapped (extent) inode is not the
145 * base inode we will unmap it when we reinitialize the
146 * search context which means we need to get a
147 * reference to the page containing the mapped mft
148 * record so we do not accidentally drop changes to the
149 * mft record when it has not been marked dirty yet.
150 */
151 if (old_ctx.base_ntfs_ino && old_ctx.ntfs_ino !=
152 old_ctx.base_ntfs_ino) {
153 put_this_page = old_ctx.ntfs_ino->page;
154 page_cache_get(put_this_page);
155 }
156 /*
157 * Reinitialize the search context so we can lookup the
158 * needed attribute extent.
159 */
160 ntfs_attr_reinit_search_ctx(ctx);
161 ctx_needs_reset = TRUE;
162 }
76 } 163 }
77 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, 164 if (ctx_needs_reset) {
78 CASE_SENSITIVE, vcn, NULL, 0, ctx); 165 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
79 if (unlikely(err)) { 166 CASE_SENSITIVE, vcn, NULL, 0, ctx);
80 if (err == -ENOENT) 167 if (unlikely(err)) {
81 err = -EIO; 168 if (err == -ENOENT)
82 goto err_out; 169 err = -EIO;
170 goto err_out;
171 }
172 BUG_ON(!ctx->attr->non_resident);
83 } 173 }
84 a = ctx->attr; 174 a = ctx->attr;
85 /* 175 /*
@@ -89,11 +179,9 @@ int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn)
89 * ntfs_mapping_pairs_decompress() fails. 179 * ntfs_mapping_pairs_decompress() fails.
90 */ 180 */
91 end_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn) + 1; 181 end_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn) + 1;
92 if (unlikely(!a->data.non_resident.lowest_vcn && end_vcn <= 1)) { 182 if (!a->data.non_resident.lowest_vcn && end_vcn == 1)
93 read_lock_irqsave(&ni->size_lock, flags); 183 end_vcn = sle64_to_cpu(a->data.non_resident.allocated_size) >>
94 end_vcn = ni->allocated_size >> ni->vol->cluster_size_bits; 184 ni->vol->cluster_size_bits;
95 read_unlock_irqrestore(&ni->size_lock, flags);
96 }
97 if (unlikely(vcn >= end_vcn)) { 185 if (unlikely(vcn >= end_vcn)) {
98 err = -ENOENT; 186 err = -ENOENT;
99 goto err_out; 187 goto err_out;
@@ -104,9 +192,93 @@ int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn)
104 else 192 else
105 ni->runlist.rl = rl; 193 ni->runlist.rl = rl;
106err_out: 194err_out:
107 if (likely(ctx)) 195 if (ctx_is_temporary) {
108 ntfs_attr_put_search_ctx(ctx); 196 if (likely(ctx))
109 unmap_mft_record(base_ni); 197 ntfs_attr_put_search_ctx(ctx);
198 unmap_mft_record(base_ni);
199 } else if (ctx_needs_reset) {
200 /*
201 * If there is no attribute list, restoring the search context
 202	 * is accomplished simply by copying the saved context back over
203 * the caller supplied context. If there is an attribute list,
204 * things are more complicated as we need to deal with mapping
205 * of mft records and resulting potential changes in pointers.
206 */
207 if (NInoAttrList(base_ni)) {
208 /*
209 * If the currently mapped (extent) inode is not the
210 * one we had before, we need to unmap it and map the
211 * old one.
212 */
213 if (ctx->ntfs_ino != old_ctx.ntfs_ino) {
214 /*
215 * If the currently mapped inode is not the
216 * base inode, unmap it.
217 */
218 if (ctx->base_ntfs_ino && ctx->ntfs_ino !=
219 ctx->base_ntfs_ino) {
220 unmap_extent_mft_record(ctx->ntfs_ino);
221 ctx->mrec = ctx->base_mrec;
222 BUG_ON(!ctx->mrec);
223 }
224 /*
225 * If the old mapped inode is not the base
226 * inode, map it.
227 */
228 if (old_ctx.base_ntfs_ino &&
229 old_ctx.ntfs_ino !=
230 old_ctx.base_ntfs_ino) {
231retry_map:
232 ctx->mrec = map_mft_record(
233 old_ctx.ntfs_ino);
234 /*
235 * Something bad has happened. If out
 236 * of memory, retry till it succeeds.
237 * Any other errors are fatal and we
238 * return the error code in ctx->mrec.
239 * Let the caller deal with it... We
240 * just need to fudge things so the
241 * caller can reinit and/or put the
242 * search context safely.
243 */
244 if (IS_ERR(ctx->mrec)) {
245 if (PTR_ERR(ctx->mrec) ==
246 -ENOMEM) {
247 schedule();
248 goto retry_map;
249 } else
250 old_ctx.ntfs_ino =
251 old_ctx.
252 base_ntfs_ino;
253 }
254 }
255 }
256 /* Update the changed pointers in the saved context. */
257 if (ctx->mrec != old_ctx.mrec) {
258 if (!IS_ERR(ctx->mrec))
259 old_ctx.attr = (ATTR_RECORD*)(
260 (u8*)ctx->mrec +
261 ((u8*)old_ctx.attr -
262 (u8*)old_ctx.mrec));
263 old_ctx.mrec = ctx->mrec;
264 }
265 }
266 /* Restore the search context to the saved one. */
267 *ctx = old_ctx;
268 /*
269 * We drop the reference on the page we took earlier. In the
270 * case that IS_ERR(ctx->mrec) is true this means we might lose
271 * some changes to the mft record that had been made between
 272 * the last time it was marked dirty/written out and now. At
 273 * this stage that is not a problem as the mapping error is fatal
274 * enough that the mft record cannot be written out anyway and
275 * the caller is very likely to shutdown the whole inode
276 * immediately and mark the volume dirty for chkdsk to pick up
277 * the pieces anyway.
278 */
279 if (put_this_page)
280 page_cache_release(put_this_page);
281 }
110 return err; 282 return err;
111} 283}
112 284
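To make the @ctx contract above concrete, here is a minimal, hypothetical caller sketch (not from this patch; the surrounding locking is assumed): the runlist is write-locked and @ctx is an active search context of @ni's base mft record.

	static int example_remap_with_ctx(ntfs_inode *ni, VCN vcn,
			ntfs_attr_search_ctx *ctx)
	{
		int err = ntfs_map_runlist_nolock(ni, vcn, ctx);

		/*
		 * Success or failure, the restored context is only usable
		 * if remapping the saved mft record succeeded.
		 */
		if (IS_ERR(ctx->mrec))
			return PTR_ERR(ctx->mrec);
		return err;
	}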
@@ -122,8 +294,8 @@ err_out:
122 * of bounds of the runlist. 294 * of bounds of the runlist.
123 * 295 *
124 * Locking: - The runlist must be unlocked on entry and is unlocked on return. 296 * Locking: - The runlist must be unlocked on entry and is unlocked on return.
125 * - This function takes the runlist lock for writing and modifies the 297 * - This function takes the runlist lock for writing and may modify
126 * runlist. 298 * the runlist.
127 */ 299 */
128int ntfs_map_runlist(ntfs_inode *ni, VCN vcn) 300int ntfs_map_runlist(ntfs_inode *ni, VCN vcn)
129{ 301{
@@ -133,7 +305,7 @@ int ntfs_map_runlist(ntfs_inode *ni, VCN vcn)
133 /* Make sure someone else didn't do the work while we were sleeping. */ 305 /* Make sure someone else didn't do the work while we were sleeping. */
134 if (likely(ntfs_rl_vcn_to_lcn(ni->runlist.rl, vcn) <= 306 if (likely(ntfs_rl_vcn_to_lcn(ni->runlist.rl, vcn) <=
135 LCN_RL_NOT_MAPPED)) 307 LCN_RL_NOT_MAPPED))
136 err = ntfs_map_runlist_nolock(ni, vcn); 308 err = ntfs_map_runlist_nolock(ni, vcn, NULL);
137 up_write(&ni->runlist.lock); 309 up_write(&ni->runlist.lock);
138 return err; 310 return err;
139} 311}
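Callers that only hold the runlist lock for reading use the upgrade-and-recheck idiom that this patch removes from ntfs_attr_find_vcn_nolock() further down; a condensed sketch of that idiom, with @ni, @vcn and @err assumed as above:

	up_read(&ni->runlist.lock);
	down_write(&ni->runlist.lock);
	/* Someone else may have mapped the fragment while we slept. */
	if (ntfs_rl_vcn_to_lcn(ni->runlist.rl, vcn) <= LCN_RL_NOT_MAPPED)
		err = ntfs_map_runlist_nolock(ni, vcn, NULL);
	up_write(&ni->runlist.lock);
	down_read(&ni->runlist.lock);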
@@ -212,7 +384,7 @@ retry_remap:
212 goto retry_remap; 384 goto retry_remap;
213 } 385 }
214 } 386 }
215 err = ntfs_map_runlist_nolock(ni, vcn); 387 err = ntfs_map_runlist_nolock(ni, vcn, NULL);
216 if (!write_locked) { 388 if (!write_locked) {
217 up_write(&ni->runlist.lock); 389 up_write(&ni->runlist.lock);
218 down_read(&ni->runlist.lock); 390 down_read(&ni->runlist.lock);
@@ -236,9 +408,9 @@ retry_remap:
236 408
237/** 409/**
238 * ntfs_attr_find_vcn_nolock - find a vcn in the runlist of an ntfs inode 410 * ntfs_attr_find_vcn_nolock - find a vcn in the runlist of an ntfs inode
239 * @ni: ntfs inode describing the runlist to search 411 * @ni: ntfs inode describing the runlist to search
240 * @vcn: vcn to find 412 * @vcn: vcn to find
241 * @write_locked: true if the runlist is locked for writing 413 * @ctx: active attribute search context if present or NULL if not
242 * 414 *
243 * Find the virtual cluster number @vcn in the runlist described by the ntfs 415 * Find the virtual cluster number @vcn in the runlist described by the ntfs
244 * inode @ni and return the address of the runlist element containing the @vcn. 416 * inode @ni and return the address of the runlist element containing the @vcn.
@@ -246,9 +418,22 @@ retry_remap:
246 * If the @vcn is not mapped yet, the attempt is made to map the attribute 418 * If the @vcn is not mapped yet, the attempt is made to map the attribute
247 * extent containing the @vcn and the vcn to lcn conversion is retried. 419 * extent containing the @vcn and the vcn to lcn conversion is retried.
248 * 420 *
249 * If @write_locked is true the caller has locked the runlist for writing and 421 * If @ctx is specified, it is an active search context of @ni and its base mft
250 * if false for reading. 422 * record. This is needed when ntfs_attr_find_vcn_nolock() encounters unmapped
251 * 423 * runlist fragments and allows their mapping. If you do not have the mft
424 * record mapped, you can specify @ctx as NULL and ntfs_attr_find_vcn_nolock()
425 * will perform the necessary mapping and unmapping.
426 *
427 * Note, ntfs_attr_find_vcn_nolock() saves the state of @ctx on entry and
428 * restores it before returning. Thus, @ctx will be left pointing to the same
429 * attribute on return as on entry. However, the actual pointers in @ctx may
430 * point to different memory locations on return, so you must remember to reset
431 * any cached pointers from the @ctx, i.e. after the call to
432 * ntfs_attr_find_vcn_nolock(), you will probably want to do:
433 * m = ctx->mrec;
434 * a = ctx->attr;
 435 * assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and
 436 * ctx->mrec in a variable @m of type MFT_RECORD *.
252 * Note you need to distinguish between the lcn of the returned runlist element 437 * Note you need to distinguish between the lcn of the returned runlist element
 253 * being >= 0 and LCN_HOLE. In the latter case you have to return zeroes on 438
254 * read and allocate clusters on write. 439 * read and allocate clusters on write.
@@ -263,22 +448,31 @@ retry_remap:
263 * -ENOMEM - Not enough memory to map runlist. 448 * -ENOMEM - Not enough memory to map runlist.
264 * -EIO - Critical error (runlist/file is corrupt, i/o error, etc). 449 * -EIO - Critical error (runlist/file is corrupt, i/o error, etc).
265 * 450 *
266 * Locking: - The runlist must be locked on entry and is left locked on return. 451 * WARNING: If @ctx is supplied, regardless of whether success or failure is
267 * - If @write_locked is FALSE, i.e. the runlist is locked for reading, 452 * returned, you need to check IS_ERR(@ctx->mrec) and if TRUE the @ctx
268 * the lock may be dropped inside the function so you cannot rely on 453 * is no longer valid, i.e. you need to either call
269 * the runlist still being the same when this function returns. 454 * ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it.
455 * In that case PTR_ERR(@ctx->mrec) will give you the error code for
456 * why the mapping of the old inode failed.
457 *
458 * Locking: - The runlist described by @ni must be locked for writing on entry
459 * and is locked on return. Note the runlist may be modified when
460 * needed runlist fragments need to be mapped.
461 * - If @ctx is NULL, the base mft record of @ni must not be mapped on
462 * entry and it will be left unmapped on return.
463 * - If @ctx is not NULL, the base mft record must be mapped on entry
464 * and it will be left mapped on return.
270 */ 465 */
271runlist_element *ntfs_attr_find_vcn_nolock(ntfs_inode *ni, const VCN vcn, 466runlist_element *ntfs_attr_find_vcn_nolock(ntfs_inode *ni, const VCN vcn,
272 const BOOL write_locked) 467 ntfs_attr_search_ctx *ctx)
273{ 468{
274 unsigned long flags; 469 unsigned long flags;
275 runlist_element *rl; 470 runlist_element *rl;
276 int err = 0; 471 int err = 0;
277 BOOL is_retry = FALSE; 472 BOOL is_retry = FALSE;
278 473
279 ntfs_debug("Entering for i_ino 0x%lx, vcn 0x%llx, %s_locked.", 474 ntfs_debug("Entering for i_ino 0x%lx, vcn 0x%llx, with%s ctx.",
280 ni->mft_no, (unsigned long long)vcn, 475 ni->mft_no, (unsigned long long)vcn, ctx ? "" : "out");
281 write_locked ? "write" : "read");
282 BUG_ON(!ni); 476 BUG_ON(!ni);
283 BUG_ON(!NInoNonResident(ni)); 477 BUG_ON(!NInoNonResident(ni));
284 BUG_ON(vcn < 0); 478 BUG_ON(vcn < 0);
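Putting the re-caching note and the IS_ERR(@ctx->mrec) warning above into practice, a hypothetical caller fragment (variable names assumed):

	rl = ntfs_attr_find_vcn_nolock(ni, vcn, ctx);
	/* Success or failure, verify that the context restore worked. */
	if (IS_ERR(ctx->mrec)) {
		err = PTR_ERR(ctx->mrec);
		ntfs_attr_reinit_search_ctx(ctx);
		return err;
	}
	if (IS_ERR(rl))
		return PTR_ERR(rl);
	/* The context may point to different memory now; re-cache. */
	m = ctx->mrec;
	a = ctx->attr;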
@@ -312,33 +506,22 @@ retry_remap:
312 } 506 }
313 if (!err && !is_retry) { 507 if (!err && !is_retry) {
314 /* 508 /*
315 * The @vcn is in an unmapped region, map the runlist and 509 * If the search context is invalid we cannot map the unmapped
316 * retry. 510 * region.
317 */ 511 */
318 if (!write_locked) { 512 if (IS_ERR(ctx->mrec))
319 up_read(&ni->runlist.lock); 513 err = PTR_ERR(ctx->mrec);
320 down_write(&ni->runlist.lock); 514 else {
321 if (unlikely(ntfs_rl_vcn_to_lcn(ni->runlist.rl, vcn) != 515 /*
322 LCN_RL_NOT_MAPPED)) { 516 * The @vcn is in an unmapped region, map the runlist
323 up_write(&ni->runlist.lock); 517 * and retry.
324 down_read(&ni->runlist.lock); 518 */
519 err = ntfs_map_runlist_nolock(ni, vcn, ctx);
520 if (likely(!err)) {
521 is_retry = TRUE;
325 goto retry_remap; 522 goto retry_remap;
326 } 523 }
327 } 524 }
328 err = ntfs_map_runlist_nolock(ni, vcn);
329 if (!write_locked) {
330 up_write(&ni->runlist.lock);
331 down_read(&ni->runlist.lock);
332 }
333 if (likely(!err)) {
334 is_retry = TRUE;
335 goto retry_remap;
336 }
337 /*
338 * -EINVAL coming from a failed mapping attempt is equivalent
339 * to i/o error for us as it should not happen in our code
340 * paths.
341 */
342 if (err == -EINVAL) 525 if (err == -EINVAL)
343 err = -EIO; 526 err = -EIO;
344 } else if (!err) 527 } else if (!err)
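As the comment block above requires, a consumer of the returned runlist element has to distinguish real clusters from sparse ones; a sketch of that distinction:

	if (rl->lcn >= 0) {
		/* Real cluster: on-disk lcn is rl->lcn + (vcn - rl->vcn). */
	} else if (rl->lcn == LCN_HOLE) {
		/* Sparse hole: return zeroes on read and allocate
		   clusters on write. */
	}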
@@ -1011,6 +1194,7 @@ int ntfs_attr_lookup(const ATTR_TYPE type, const ntfschar *name,
1011 ntfs_inode *base_ni; 1194 ntfs_inode *base_ni;
1012 1195
1013 ntfs_debug("Entering."); 1196 ntfs_debug("Entering.");
1197 BUG_ON(IS_ERR(ctx->mrec));
1014 if (ctx->base_ntfs_ino) 1198 if (ctx->base_ntfs_ino)
1015 base_ni = ctx->base_ntfs_ino; 1199 base_ni = ctx->base_ntfs_ino;
1016 else 1200 else
@@ -1227,7 +1411,7 @@ int ntfs_attr_can_be_non_resident(const ntfs_volume *vol, const ATTR_TYPE type)
1227 */ 1411 */
1228int ntfs_attr_can_be_resident(const ntfs_volume *vol, const ATTR_TYPE type) 1412int ntfs_attr_can_be_resident(const ntfs_volume *vol, const ATTR_TYPE type)
1229{ 1413{
1230 if (type == AT_INDEX_ALLOCATION || type == AT_EA) 1414 if (type == AT_INDEX_ALLOCATION)
1231 return -EPERM; 1415 return -EPERM;
1232 return 0; 1416 return 0;
1233} 1417}
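With AT_EA dropped from the check, only index allocations are still forced to be non-resident. For example:

	err = ntfs_attr_can_be_resident(vol, AT_INDEX_ALLOCATION); /* -EPERM */
	err = ntfs_attr_can_be_resident(vol, AT_EA); /* now returns 0 */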
@@ -1319,10 +1503,17 @@ int ntfs_resident_attr_value_resize(MFT_RECORD *m, ATTR_RECORD *a,
1319/** 1503/**
1320 * ntfs_attr_make_non_resident - convert a resident to a non-resident attribute 1504 * ntfs_attr_make_non_resident - convert a resident to a non-resident attribute
1321 * @ni: ntfs inode describing the attribute to convert 1505 * @ni: ntfs inode describing the attribute to convert
1506 * @data_size: size of the resident data to copy to the non-resident attribute
1322 * 1507 *
1323 * Convert the resident ntfs attribute described by the ntfs inode @ni to a 1508 * Convert the resident ntfs attribute described by the ntfs inode @ni to a
1324 * non-resident one. 1509 * non-resident one.
1325 * 1510 *
1511 * @data_size must be equal to the attribute value size. This is needed since
1512 * we need to know the size before we can map the mft record and our callers
1513 * always know it. The reason we cannot simply read the size from the vfs
1514 * inode i_size is that this is not necessarily uptodate. This happens when
1515 * ntfs_attr_make_non_resident() is called in the ->truncate call path(s).
1516 *
1326 * Return 0 on success and -errno on error. The following error return codes 1517 * Return 0 on success and -errno on error. The following error return codes
1327 * are defined: 1518 * are defined:
1328 * -EPERM - The attribute is not allowed to be non-resident. 1519 * -EPERM - The attribute is not allowed to be non-resident.
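In practice a caller passes the value length it reads from the attribute record it already has mapped, as the resident extension path added later in this patch does; a condensed sketch, with @a the mapped resident attribute record:

	attr_len = le32_to_cpu(a->data.resident.value_length);
	err = ntfs_attr_make_non_resident(ni, attr_len);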
@@ -1343,7 +1534,7 @@ int ntfs_resident_attr_value_resize(MFT_RECORD *m, ATTR_RECORD *a,
1343 * 1534 *
1344 * Locking: - The caller must hold i_sem on the inode. 1535 * Locking: - The caller must hold i_sem on the inode.
1345 */ 1536 */
1346int ntfs_attr_make_non_resident(ntfs_inode *ni) 1537int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size)
1347{ 1538{
1348 s64 new_size; 1539 s64 new_size;
1349 struct inode *vi = VFS_I(ni); 1540 struct inode *vi = VFS_I(ni);
@@ -1381,11 +1572,9 @@ int ntfs_attr_make_non_resident(ntfs_inode *ni)
1381 * The size needs to be aligned to a cluster boundary for allocation 1572 * The size needs to be aligned to a cluster boundary for allocation
1382 * purposes. 1573 * purposes.
1383 */ 1574 */
1384 new_size = (i_size_read(vi) + vol->cluster_size - 1) & 1575 new_size = (data_size + vol->cluster_size - 1) &
1385 ~(vol->cluster_size - 1); 1576 ~(vol->cluster_size - 1);
1386 if (new_size > 0) { 1577 if (new_size > 0) {
1387 runlist_element *rl2;
1388
1389 /* 1578 /*
1390 * Will need the page later and since the page lock nests 1579 * Will need the page later and since the page lock nests
1391 * outside all ntfs locks, we need to get the page now. 1580 * outside all ntfs locks, we need to get the page now.
@@ -1396,7 +1585,7 @@ int ntfs_attr_make_non_resident(ntfs_inode *ni)
1396 return -ENOMEM; 1585 return -ENOMEM;
1397 /* Start by allocating clusters to hold the attribute value. */ 1586 /* Start by allocating clusters to hold the attribute value. */
1398 rl = ntfs_cluster_alloc(vol, 0, new_size >> 1587 rl = ntfs_cluster_alloc(vol, 0, new_size >>
1399 vol->cluster_size_bits, -1, DATA_ZONE); 1588 vol->cluster_size_bits, -1, DATA_ZONE, TRUE);
1400 if (IS_ERR(rl)) { 1589 if (IS_ERR(rl)) {
1401 err = PTR_ERR(rl); 1590 err = PTR_ERR(rl);
1402 ntfs_debug("Failed to allocate cluster%s, error code " 1591 ntfs_debug("Failed to allocate cluster%s, error code "
@@ -1405,12 +1594,6 @@ int ntfs_attr_make_non_resident(ntfs_inode *ni)
1405 err); 1594 err);
1406 goto page_err_out; 1595 goto page_err_out;
1407 } 1596 }
1408 /* Change the runlist terminator to LCN_ENOENT. */
1409 rl2 = rl;
1410 while (rl2->length)
1411 rl2++;
1412 BUG_ON(rl2->lcn != LCN_RL_NOT_MAPPED);
1413 rl2->lcn = LCN_ENOENT;
1414 } else { 1597 } else {
1415 rl = NULL; 1598 rl = NULL;
1416 page = NULL; 1599 page = NULL;
@@ -1473,7 +1656,7 @@ int ntfs_attr_make_non_resident(ntfs_inode *ni)
1473 * attribute value. 1656 * attribute value.
1474 */ 1657 */
1475 attr_size = le32_to_cpu(a->data.resident.value_length); 1658 attr_size = le32_to_cpu(a->data.resident.value_length);
1476 BUG_ON(attr_size != i_size_read(vi)); 1659 BUG_ON(attr_size != data_size);
1477 if (page && !PageUptodate(page)) { 1660 if (page && !PageUptodate(page)) {
1478 kaddr = kmap_atomic(page, KM_USER0); 1661 kaddr = kmap_atomic(page, KM_USER0);
1479 memcpy(kaddr, (u8*)a + 1662 memcpy(kaddr, (u8*)a +
@@ -1538,7 +1721,9 @@ int ntfs_attr_make_non_resident(ntfs_inode *ni)
1538 ffs(ni->itype.compressed.block_size) - 1; 1721 ffs(ni->itype.compressed.block_size) - 1;
1539 ni->itype.compressed.block_clusters = 1U << 1722 ni->itype.compressed.block_clusters = 1U <<
1540 a->data.non_resident.compression_unit; 1723 a->data.non_resident.compression_unit;
1541 } 1724 vi->i_blocks = ni->itype.compressed.size >> 9;
1725 } else
1726 vi->i_blocks = ni->allocated_size >> 9;
1542 write_unlock_irqrestore(&ni->size_lock, flags); 1727 write_unlock_irqrestore(&ni->size_lock, flags);
1543 /* 1728 /*
1544 * This needs to be last since the address space operations ->readpage 1729 * This needs to be last since the address space operations ->readpage
@@ -1652,6 +1837,640 @@ page_err_out:
1652} 1837}
1653 1838
1654/** 1839/**
1840 * ntfs_attr_extend_allocation - extend the allocated space of an attribute
1841 * @ni: ntfs inode of the attribute whose allocation to extend
1842 * @new_alloc_size: new size in bytes to which to extend the allocation to
1843 * @new_data_size: new size in bytes to which to extend the data to
1844 * @data_start: beginning of region which is required to be non-sparse
1845 *
1846 * Extend the allocated space of an attribute described by the ntfs inode @ni
1847 * to @new_alloc_size bytes. If @data_start is -1, the whole extension may be
1848 * implemented as a hole in the file (as long as both the volume and the ntfs
1849 * inode @ni have sparse support enabled). If @data_start is >= 0, then the
1850 * region between the old allocated size and @data_start - 1 may be made sparse
 1851 * but the region between @data_start and @new_alloc_size must be backed by
1852 * actual clusters.
1853 *
1854 * If @new_data_size is -1, it is ignored. If it is >= 0, then the data size
1855 * of the attribute is extended to @new_data_size. Note that the i_size of the
1856 * vfs inode is not updated. Only the data size in the base attribute record
1857 * is updated. The caller has to update i_size separately if this is required.
1858 * WARNING: It is a BUG() for @new_data_size to be smaller than the old data
1859 * size as well as for @new_data_size to be greater than @new_alloc_size.
1860 *
1861 * For resident attributes this involves resizing the attribute record and if
1862 * necessary moving it and/or other attributes into extent mft records and/or
1863 * converting the attribute to a non-resident attribute which in turn involves
1864 * extending the allocation of a non-resident attribute as described below.
1865 *
1866 * For non-resident attributes this involves allocating clusters in the data
1867 * zone on the volume (except for regions that are being made sparse) and
1868 * extending the run list to describe the allocated clusters as well as
1869 * updating the mapping pairs array of the attribute. This in turn involves
1870 * resizing the attribute record and if necessary moving it and/or other
1871 * attributes into extent mft records and/or splitting the attribute record
1872 * into multiple extent attribute records.
1873 *
1874 * Also, the attribute list attribute is updated if present and in some of the
1875 * above cases (the ones where extent mft records/attributes come into play),
1876 * an attribute list attribute is created if not already present.
1877 *
1878 * Return the new allocated size on success and -errno on error. In the case
1879 * that an error is encountered but a partial extension at least up to
 1880 * @data_start (if present) is possible, the allocation is partially extended
 1881 * and the new, smaller allocated size is returned, so the caller must check
 1882 * the returned size to determine whether the extension was only partial. If
 1883 * @data_start is -1 then partial allocations are not performed.
1884 *
1885 * WARNING: Do not call ntfs_attr_extend_allocation() for $MFT/$DATA.
1886 *
1887 * Locking: This function takes the runlist lock of @ni for writing as well as
1888 * locking the mft record of the base ntfs inode. These locks are maintained
1889 * throughout execution of the function. These locks are required so that the
1890 * attribute can be resized safely and so that it can for example be converted
1891 * from resident to non-resident safely.
1892 *
1893 * TODO: At present attribute list attribute handling is not implemented.
1894 *
1895 * TODO: At present it is not safe to call this function for anything other
1896 * than the $DATA attribute(s) of an uncompressed and unencrypted file.
1897 */
1898s64 ntfs_attr_extend_allocation(ntfs_inode *ni, s64 new_alloc_size,
1899 const s64 new_data_size, const s64 data_start)
1900{
1901 VCN vcn;
1902 s64 ll, allocated_size, start = data_start;
1903 struct inode *vi = VFS_I(ni);
1904 ntfs_volume *vol = ni->vol;
1905 ntfs_inode *base_ni;
1906 MFT_RECORD *m;
1907 ATTR_RECORD *a;
1908 ntfs_attr_search_ctx *ctx;
1909 runlist_element *rl, *rl2;
1910 unsigned long flags;
1911 int err, mp_size;
1912 u32 attr_len = 0; /* Silence stupid gcc warning. */
1913 BOOL mp_rebuilt;
1914
1915#ifdef NTFS_DEBUG
1916 read_lock_irqsave(&ni->size_lock, flags);
1917 allocated_size = ni->allocated_size;
1918 read_unlock_irqrestore(&ni->size_lock, flags);
1919 ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, "
1920 "old_allocated_size 0x%llx, "
1921 "new_allocated_size 0x%llx, new_data_size 0x%llx, "
1922 "data_start 0x%llx.", vi->i_ino,
1923 (unsigned)le32_to_cpu(ni->type),
1924 (unsigned long long)allocated_size,
1925 (unsigned long long)new_alloc_size,
1926 (unsigned long long)new_data_size,
1927 (unsigned long long)start);
1928#endif
1929retry_extend:
1930 /*
1931 * For non-resident attributes, @start and @new_size need to be aligned
1932 * to cluster boundaries for allocation purposes.
1933 */
1934 if (NInoNonResident(ni)) {
1935 if (start > 0)
1936 start &= ~(s64)vol->cluster_size_mask;
1937 new_alloc_size = (new_alloc_size + vol->cluster_size - 1) &
1938 ~(s64)vol->cluster_size_mask;
1939 }
1940 BUG_ON(new_data_size >= 0 && new_data_size > new_alloc_size);
1941 /* Check if new size is allowed in $AttrDef. */
1942 err = ntfs_attr_size_bounds_check(vol, ni->type, new_alloc_size);
1943 if (unlikely(err)) {
1944 /* Only emit errors when the write will fail completely. */
1945 read_lock_irqsave(&ni->size_lock, flags);
1946 allocated_size = ni->allocated_size;
1947 read_unlock_irqrestore(&ni->size_lock, flags);
1948 if (start < 0 || start >= allocated_size) {
1949 if (err == -ERANGE) {
1950 ntfs_error(vol->sb, "Cannot extend allocation "
1951 "of inode 0x%lx, attribute "
1952 "type 0x%x, because the new "
1953 "allocation would exceed the "
1954 "maximum allowed size for "
1955 "this attribute type.",
1956 vi->i_ino, (unsigned)
1957 le32_to_cpu(ni->type));
1958 } else {
1959 ntfs_error(vol->sb, "Cannot extend allocation "
1960 "of inode 0x%lx, attribute "
1961 "type 0x%x, because this "
1962 "attribute type is not "
1963 "defined on the NTFS volume. "
1964 "Possible corruption! You "
1965 "should run chkdsk!",
1966 vi->i_ino, (unsigned)
1967 le32_to_cpu(ni->type));
1968 }
1969 }
1970 /* Translate error code to be POSIX conformant for write(2). */
1971 if (err == -ERANGE)
1972 err = -EFBIG;
1973 else
1974 err = -EIO;
1975 return err;
1976 }
1977 if (!NInoAttr(ni))
1978 base_ni = ni;
1979 else
1980 base_ni = ni->ext.base_ntfs_ino;
1981 /*
1982 * We will be modifying both the runlist (if non-resident) and the mft
1983 * record so lock them both down.
1984 */
1985 down_write(&ni->runlist.lock);
1986 m = map_mft_record(base_ni);
1987 if (IS_ERR(m)) {
1988 err = PTR_ERR(m);
1989 m = NULL;
1990 ctx = NULL;
1991 goto err_out;
1992 }
1993 ctx = ntfs_attr_get_search_ctx(base_ni, m);
1994 if (unlikely(!ctx)) {
1995 err = -ENOMEM;
1996 goto err_out;
1997 }
1998 read_lock_irqsave(&ni->size_lock, flags);
1999 allocated_size = ni->allocated_size;
2000 read_unlock_irqrestore(&ni->size_lock, flags);
2001 /*
2002 * If non-resident, seek to the last extent. If resident, there is
2003 * only one extent, so seek to that.
2004 */
2005 vcn = NInoNonResident(ni) ? allocated_size >> vol->cluster_size_bits :
2006 0;
2007 /*
2008 * Abort if someone did the work whilst we waited for the locks. If we
2009 * just converted the attribute from resident to non-resident it is
2010 * likely that exactly this has happened already. We cannot quite
2011 * abort if we need to update the data size.
2012 */
2013 if (unlikely(new_alloc_size <= allocated_size)) {
2014 ntfs_debug("Allocated size already exceeds requested size.");
2015 new_alloc_size = allocated_size;
2016 if (new_data_size < 0)
2017 goto done;
2018 /*
2019 * We want the first attribute extent so that we can update the
2020 * data size.
2021 */
2022 vcn = 0;
2023 }
2024 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
2025 CASE_SENSITIVE, vcn, NULL, 0, ctx);
2026 if (unlikely(err)) {
2027 if (err == -ENOENT)
2028 err = -EIO;
2029 goto err_out;
2030 }
2031 m = ctx->mrec;
2032 a = ctx->attr;
2033 /* Use goto to reduce indentation. */
2034 if (a->non_resident)
2035 goto do_non_resident_extend;
2036 BUG_ON(NInoNonResident(ni));
2037 /* The total length of the attribute value. */
2038 attr_len = le32_to_cpu(a->data.resident.value_length);
2039 /*
2040 * Extend the attribute record to be able to store the new attribute
2041 * size. ntfs_attr_record_resize() will not do anything if the size is
2042 * not changing.
2043 */
2044 if (new_alloc_size < vol->mft_record_size &&
2045 !ntfs_attr_record_resize(m, a,
2046 le16_to_cpu(a->data.resident.value_offset) +
2047 new_alloc_size)) {
2048 /* The resize succeeded! */
2049 write_lock_irqsave(&ni->size_lock, flags);
2050 ni->allocated_size = le32_to_cpu(a->length) -
2051 le16_to_cpu(a->data.resident.value_offset);
2052 write_unlock_irqrestore(&ni->size_lock, flags);
2053 if (new_data_size >= 0) {
2054 BUG_ON(new_data_size < attr_len);
2055 a->data.resident.value_length =
2056 cpu_to_le32((u32)new_data_size);
2057 }
2058 goto flush_done;
2059 }
2060 /*
2061 * We have to drop all the locks so we can call
2062 * ntfs_attr_make_non_resident(). This could be optimised by try-
2063 * locking the first page cache page and only if that fails dropping
2064 * the locks, locking the page, and redoing all the locking and
2065 * lookups. While this would be a huge optimisation, it is not worth
2066 * it as this is definitely a slow code path.
2067 */
2068 ntfs_attr_put_search_ctx(ctx);
2069 unmap_mft_record(base_ni);
2070 up_write(&ni->runlist.lock);
2071 /*
2072 * Not enough space in the mft record, try to make the attribute
2073 * non-resident and if successful restart the extension process.
2074 */
2075 err = ntfs_attr_make_non_resident(ni, attr_len);
2076 if (likely(!err))
2077 goto retry_extend;
2078 /*
2079 * Could not make non-resident. If this is due to this not being
2080 * permitted for this attribute type or there not being enough space,
2081 * try to make other attributes non-resident. Otherwise fail.
2082 */
2083 if (unlikely(err != -EPERM && err != -ENOSPC)) {
2084 /* Only emit errors when the write will fail completely. */
2085 read_lock_irqsave(&ni->size_lock, flags);
2086 allocated_size = ni->allocated_size;
2087 read_unlock_irqrestore(&ni->size_lock, flags);
2088 if (start < 0 || start >= allocated_size)
2089 ntfs_error(vol->sb, "Cannot extend allocation of "
2090 "inode 0x%lx, attribute type 0x%x, "
2091 "because the conversion from resident "
2092 "to non-resident attribute failed "
2093 "with error code %i.", vi->i_ino,
2094 (unsigned)le32_to_cpu(ni->type), err);
2095 if (err != -ENOMEM)
2096 err = -EIO;
2097 goto conv_err_out;
2098 }
2099 /* TODO: Not implemented from here, abort. */
2100 read_lock_irqsave(&ni->size_lock, flags);
2101 allocated_size = ni->allocated_size;
2102 read_unlock_irqrestore(&ni->size_lock, flags);
2103 if (start < 0 || start >= allocated_size) {
2104 if (err == -ENOSPC)
2105 ntfs_error(vol->sb, "Not enough space in the mft "
2106 "record/on disk for the non-resident "
2107 "attribute value. This case is not "
2108 "implemented yet.");
2109 else /* if (err == -EPERM) */
2110 ntfs_error(vol->sb, "This attribute type may not be "
2111 "non-resident. This case is not "
2112 "implemented yet.");
2113 }
2114 err = -EOPNOTSUPP;
2115 goto conv_err_out;
2116#if 0
2117 // TODO: Attempt to make other attributes non-resident.
2118 if (!err)
2119 goto do_resident_extend;
2120 /*
2121 * Both the attribute list attribute and the standard information
2122 * attribute must remain in the base inode. Thus, if this is one of
2123 * these attributes, we have to try to move other attributes out into
2124 * extent mft records instead.
2125 */
2126 if (ni->type == AT_ATTRIBUTE_LIST ||
2127 ni->type == AT_STANDARD_INFORMATION) {
2128 // TODO: Attempt to move other attributes into extent mft
2129 // records.
2130 err = -EOPNOTSUPP;
2131 if (!err)
2132 goto do_resident_extend;
2133 goto err_out;
2134 }
2135 // TODO: Attempt to move this attribute to an extent mft record, but
2136 // only if it is not already the only attribute in an mft record in
2137 // which case there would be nothing to gain.
2138 err = -EOPNOTSUPP;
2139 if (!err)
2140 goto do_resident_extend;
2141 /* There is nothing we can do to make enough space. )-: */
2142 goto err_out;
2143#endif
2144do_non_resident_extend:
2145 BUG_ON(!NInoNonResident(ni));
2146 if (new_alloc_size == allocated_size) {
2147 BUG_ON(vcn);
2148 goto alloc_done;
2149 }
2150 /*
2151 * If the data starts after the end of the old allocation, this is a
2152 * $DATA attribute and sparse attributes are enabled on the volume and
2153 * for this inode, then create a sparse region between the old
2154 * allocated size and the start of the data. Otherwise simply proceed
2155 * with filling the whole space between the old allocated size and the
2156 * new allocated size with clusters.
2157 */
2158 if ((start >= 0 && start <= allocated_size) || ni->type != AT_DATA ||
2159 !NVolSparseEnabled(vol) || NInoSparseDisabled(ni))
2160 goto skip_sparse;
2161 // TODO: This is not implemented yet. We just fill in with real
2162 // clusters for now...
2163 ntfs_debug("Inserting holes is not-implemented yet. Falling back to "
2164 "allocating real clusters instead.");
2165skip_sparse:
2166 rl = ni->runlist.rl;
2167 if (likely(rl)) {
2168 /* Seek to the end of the runlist. */
2169 while (rl->length)
2170 rl++;
2171 }
2172 /* If this attribute extent is not mapped, map it now. */
2173 if (unlikely(!rl || rl->lcn == LCN_RL_NOT_MAPPED ||
2174 (rl->lcn == LCN_ENOENT && rl > ni->runlist.rl &&
2175 (rl-1)->lcn == LCN_RL_NOT_MAPPED))) {
2176 if (!rl && !allocated_size)
2177 goto first_alloc;
2178 rl = ntfs_mapping_pairs_decompress(vol, a, ni->runlist.rl);
2179 if (IS_ERR(rl)) {
2180 err = PTR_ERR(rl);
2181 if (start < 0 || start >= allocated_size)
2182 ntfs_error(vol->sb, "Cannot extend allocation "
2183 "of inode 0x%lx, attribute "
2184 "type 0x%x, because the "
2185 "mapping of a runlist "
2186 "fragment failed with error "
2187 "code %i.", vi->i_ino,
2188 (unsigned)le32_to_cpu(ni->type),
2189 err);
2190 if (err != -ENOMEM)
2191 err = -EIO;
2192 goto err_out;
2193 }
2194 ni->runlist.rl = rl;
2195 /* Seek to the end of the runlist. */
2196 while (rl->length)
2197 rl++;
2198 }
2199 /*
2200 * We now know the runlist of the last extent is mapped and @rl is at
2201 * the end of the runlist. We want to begin allocating clusters
2202 * starting at the last allocated cluster to reduce fragmentation. If
2203 * there are no valid LCNs in the attribute we let the cluster
2204 * allocator choose the starting cluster.
2205 */
 2206 /* If the last LCN is a hole or similar, seek back to the last real LCN. */
2207 while (rl->lcn < 0 && rl > ni->runlist.rl)
2208 rl--;
2209first_alloc:
2210 // FIXME: Need to implement partial allocations so at least part of the
2211 // write can be performed when start >= 0. (Needed for POSIX write(2)
2212 // conformance.)
2213 rl2 = ntfs_cluster_alloc(vol, allocated_size >> vol->cluster_size_bits,
2214 (new_alloc_size - allocated_size) >>
2215 vol->cluster_size_bits, (rl && (rl->lcn >= 0)) ?
2216 rl->lcn + rl->length : -1, DATA_ZONE, TRUE);
2217 if (IS_ERR(rl2)) {
2218 err = PTR_ERR(rl2);
2219 if (start < 0 || start >= allocated_size)
2220 ntfs_error(vol->sb, "Cannot extend allocation of "
2221 "inode 0x%lx, attribute type 0x%x, "
2222 "because the allocation of clusters "
2223 "failed with error code %i.", vi->i_ino,
2224 (unsigned)le32_to_cpu(ni->type), err);
2225 if (err != -ENOMEM && err != -ENOSPC)
2226 err = -EIO;
2227 goto err_out;
2228 }
2229 rl = ntfs_runlists_merge(ni->runlist.rl, rl2);
2230 if (IS_ERR(rl)) {
2231 err = PTR_ERR(rl);
2232 if (start < 0 || start >= allocated_size)
2233 ntfs_error(vol->sb, "Cannot extend allocation of "
2234 "inode 0x%lx, attribute type 0x%x, "
2235 "because the runlist merge failed "
2236 "with error code %i.", vi->i_ino,
2237 (unsigned)le32_to_cpu(ni->type), err);
2238 if (err != -ENOMEM)
2239 err = -EIO;
2240 if (ntfs_cluster_free_from_rl(vol, rl2)) {
2241 ntfs_error(vol->sb, "Failed to release allocated "
2242 "cluster(s) in error code path. Run "
2243 "chkdsk to recover the lost "
2244 "cluster(s).");
2245 NVolSetErrors(vol);
2246 }
2247 ntfs_free(rl2);
2248 goto err_out;
2249 }
2250 ni->runlist.rl = rl;
2251 ntfs_debug("Allocated 0x%llx clusters.", (long long)(new_alloc_size -
2252 allocated_size) >> vol->cluster_size_bits);
2253 /* Find the runlist element with which the attribute extent starts. */
2254 ll = sle64_to_cpu(a->data.non_resident.lowest_vcn);
2255 rl2 = ntfs_rl_find_vcn_nolock(rl, ll);
2256 BUG_ON(!rl2);
2257 BUG_ON(!rl2->length);
2258 BUG_ON(rl2->lcn < LCN_HOLE);
2259 mp_rebuilt = FALSE;
2260 /* Get the size for the new mapping pairs array for this extent. */
2261 mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll, -1);
2262 if (unlikely(mp_size <= 0)) {
2263 err = mp_size;
2264 if (start < 0 || start >= allocated_size)
2265 ntfs_error(vol->sb, "Cannot extend allocation of "
2266 "inode 0x%lx, attribute type 0x%x, "
2267 "because determining the size for the "
2268 "mapping pairs failed with error code "
2269 "%i.", vi->i_ino,
2270 (unsigned)le32_to_cpu(ni->type), err);
2271 err = -EIO;
2272 goto undo_alloc;
2273 }
2274 /* Extend the attribute record to fit the bigger mapping pairs array. */
2275 attr_len = le32_to_cpu(a->length);
2276 err = ntfs_attr_record_resize(m, a, mp_size +
2277 le16_to_cpu(a->data.non_resident.mapping_pairs_offset));
2278 if (unlikely(err)) {
2279 BUG_ON(err != -ENOSPC);
2280 // TODO: Deal with this by moving this extent to a new mft
2281 // record or by starting a new extent in a new mft record,
2282 // possibly by extending this extent partially and filling it
2283 // and creating a new extent for the remainder, or by making
2284 // other attributes non-resident and/or by moving other
2285 // attributes out of this mft record.
2286 if (start < 0 || start >= allocated_size)
2287 ntfs_error(vol->sb, "Not enough space in the mft "
2288 "record for the extended attribute "
2289 "record. This case is not "
2290 "implemented yet.");
2291 err = -EOPNOTSUPP;
2292 goto undo_alloc;
2293 }
2294 mp_rebuilt = TRUE;
2295 /* Generate the mapping pairs array directly into the attr record. */
2296 err = ntfs_mapping_pairs_build(vol, (u8*)a +
2297 le16_to_cpu(a->data.non_resident.mapping_pairs_offset),
2298 mp_size, rl2, ll, -1, NULL);
2299 if (unlikely(err)) {
2300 if (start < 0 || start >= allocated_size)
2301 ntfs_error(vol->sb, "Cannot extend allocation of "
2302 "inode 0x%lx, attribute type 0x%x, "
2303 "because building the mapping pairs "
2304 "failed with error code %i.", vi->i_ino,
2305 (unsigned)le32_to_cpu(ni->type), err);
2306 err = -EIO;
2307 goto undo_alloc;
2308 }
2309 /* Update the highest_vcn. */
2310 a->data.non_resident.highest_vcn = cpu_to_sle64((new_alloc_size >>
2311 vol->cluster_size_bits) - 1);
2312 /*
2313 * We now have extended the allocated size of the attribute. Reflect
2314 * this in the ntfs_inode structure and the attribute record.
2315 */
2316 if (a->data.non_resident.lowest_vcn) {
2317 /*
2318 * We are not in the first attribute extent, switch to it, but
2319 * first ensure the changes will make it to disk later.
2320 */
2321 flush_dcache_mft_record_page(ctx->ntfs_ino);
2322 mark_mft_record_dirty(ctx->ntfs_ino);
2323 ntfs_attr_reinit_search_ctx(ctx);
2324 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
2325 CASE_SENSITIVE, 0, NULL, 0, ctx);
2326 if (unlikely(err))
2327 goto restore_undo_alloc;
2328 /* @m is not used any more so no need to set it. */
2329 a = ctx->attr;
2330 }
2331 write_lock_irqsave(&ni->size_lock, flags);
2332 ni->allocated_size = new_alloc_size;
2333 a->data.non_resident.allocated_size = cpu_to_sle64(new_alloc_size);
2334 /*
2335 * FIXME: This would fail if @ni is a directory, $MFT, or an index,
 2336 * since those can have sparse/compressed set. For example a directory
 2337 * can be set compressed even though it is not compressed itself and in
 2338 * that case the bit means that files are to be created compressed in the
2339 * directory... At present this is ok as this code is only called for
2340 * regular files, and only for their $DATA attribute(s).
2341 * FIXME: The calculation is wrong if we created a hole above. For now
2342 * it does not matter as we never create holes.
2343 */
2344 if (NInoSparse(ni) || NInoCompressed(ni)) {
2345 ni->itype.compressed.size += new_alloc_size - allocated_size;
2346 a->data.non_resident.compressed_size =
2347 cpu_to_sle64(ni->itype.compressed.size);
2348 vi->i_blocks = ni->itype.compressed.size >> 9;
2349 } else
2350 vi->i_blocks = new_alloc_size >> 9;
2351 write_unlock_irqrestore(&ni->size_lock, flags);
2352alloc_done:
2353 if (new_data_size >= 0) {
2354 BUG_ON(new_data_size <
2355 sle64_to_cpu(a->data.non_resident.data_size));
2356 a->data.non_resident.data_size = cpu_to_sle64(new_data_size);
2357 }
2358flush_done:
2359 /* Ensure the changes make it to disk. */
2360 flush_dcache_mft_record_page(ctx->ntfs_ino);
2361 mark_mft_record_dirty(ctx->ntfs_ino);
2362done:
2363 ntfs_attr_put_search_ctx(ctx);
2364 unmap_mft_record(base_ni);
2365 up_write(&ni->runlist.lock);
2366 ntfs_debug("Done, new_allocated_size 0x%llx.",
2367 (unsigned long long)new_alloc_size);
2368 return new_alloc_size;
2369restore_undo_alloc:
2370 if (start < 0 || start >= allocated_size)
2371 ntfs_error(vol->sb, "Cannot complete extension of allocation "
2372 "of inode 0x%lx, attribute type 0x%x, because "
2373 "lookup of first attribute extent failed with "
2374 "error code %i.", vi->i_ino,
2375 (unsigned)le32_to_cpu(ni->type), err);
2376 if (err == -ENOENT)
2377 err = -EIO;
2378 ntfs_attr_reinit_search_ctx(ctx);
2379 if (ntfs_attr_lookup(ni->type, ni->name, ni->name_len, CASE_SENSITIVE,
2380 allocated_size >> vol->cluster_size_bits, NULL, 0,
2381 ctx)) {
2382 ntfs_error(vol->sb, "Failed to find last attribute extent of "
2383 "attribute in error code path. Run chkdsk to "
2384 "recover.");
2385 write_lock_irqsave(&ni->size_lock, flags);
2386 ni->allocated_size = new_alloc_size;
2387 /*
2388 * FIXME: This would fail if @ni is a directory... See above.
2389 * FIXME: The calculation is wrong if we created a hole above.
2390 * For now it does not matter as we never create holes.
2391 */
2392 if (NInoSparse(ni) || NInoCompressed(ni)) {
2393 ni->itype.compressed.size += new_alloc_size -
2394 allocated_size;
2395 vi->i_blocks = ni->itype.compressed.size >> 9;
2396 } else
2397 vi->i_blocks = new_alloc_size >> 9;
2398 write_unlock_irqrestore(&ni->size_lock, flags);
2399 ntfs_attr_put_search_ctx(ctx);
2400 unmap_mft_record(base_ni);
2401 up_write(&ni->runlist.lock);
2402 /*
2403 * The only thing that is now wrong is the allocated size of the
2404 * base attribute extent which chkdsk should be able to fix.
2405 */
2406 NVolSetErrors(vol);
2407 return err;
2408 }
2409 ctx->attr->data.non_resident.highest_vcn = cpu_to_sle64(
2410 (allocated_size >> vol->cluster_size_bits) - 1);
2411undo_alloc:
2412 ll = allocated_size >> vol->cluster_size_bits;
2413 if (ntfs_cluster_free(ni, ll, -1, ctx) < 0) {
2414 ntfs_error(vol->sb, "Failed to release allocated cluster(s) "
2415 "in error code path. Run chkdsk to recover "
2416 "the lost cluster(s).");
2417 NVolSetErrors(vol);
2418 }
2419 m = ctx->mrec;
2420 a = ctx->attr;
2421 /*
2422 * If the runlist truncation fails and/or the search context is no
2423 * longer valid, we cannot resize the attribute record or build the
2424 * mapping pairs array thus we mark the inode bad so that no access to
2425 * the freed clusters can happen.
2426 */
2427 if (ntfs_rl_truncate_nolock(vol, &ni->runlist, ll) || IS_ERR(m)) {
2428 ntfs_error(vol->sb, "Failed to %s in error code path. Run "
2429 "chkdsk to recover.", IS_ERR(m) ?
2430 "restore attribute search context" :
2431 "truncate attribute runlist");
2432 make_bad_inode(vi);
2433 make_bad_inode(VFS_I(base_ni));
2434 NVolSetErrors(vol);
2435 } else if (mp_rebuilt) {
2436 if (ntfs_attr_record_resize(m, a, attr_len)) {
2437 ntfs_error(vol->sb, "Failed to restore attribute "
2438 "record in error code path. Run "
2439 "chkdsk to recover.");
2440 make_bad_inode(vi);
2441 make_bad_inode(VFS_I(base_ni));
2442 NVolSetErrors(vol);
2443 } else /* if (success) */ {
2444 if (ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu(
2445 a->data.non_resident.
2446 mapping_pairs_offset), attr_len -
2447 le16_to_cpu(a->data.non_resident.
2448 mapping_pairs_offset), rl2, ll, -1,
2449 NULL)) {
2450 ntfs_error(vol->sb, "Failed to restore "
2451 "mapping pairs array in error "
2452 "code path. Run chkdsk to "
2453 "recover.");
2454 make_bad_inode(vi);
2455 make_bad_inode(VFS_I(base_ni));
2456 NVolSetErrors(vol);
2457 }
2458 flush_dcache_mft_record_page(ctx->ntfs_ino);
2459 mark_mft_record_dirty(ctx->ntfs_ino);
2460 }
2461 }
2462err_out:
2463 if (ctx)
2464 ntfs_attr_put_search_ctx(ctx);
2465 if (m)
2466 unmap_mft_record(base_ni);
2467 up_write(&ni->runlist.lock);
2468conv_err_out:
2469 ntfs_debug("Failed. Returning error code %i.", err);
2470 return err;
2471}
2472
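A hypothetical caller sketch (variable names assumed, i_sem held, @vi == VFS_I(@ni)) showing the intended use, including the partial extension check the comment above asks for:

	s64 new_size;

	/* Allocate up to @end bytes, accepting a partial extension that
	   covers at least @pos (needed for write(2) semantics). */
	new_size = ntfs_attr_extend_allocation(ni, end, end, pos);
	if (unlikely(new_size < 0))
		return new_size;	/* -errno */
	if (new_size < end)
		end = new_size;		/* The extension was partial. */
	/* The function does not update i_size; do so here if required. */
	if (end > i_size_read(vi))
		i_size_write(vi, end);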
2473/**
1655 * ntfs_attr_set - fill (a part of) an attribute with a byte 2474 * ntfs_attr_set - fill (a part of) an attribute with a byte
1656 * @ni: ntfs inode describing the attribute to fill 2475 * @ni: ntfs inode describing the attribute to fill
1657 * @ofs: offset inside the attribute at which to start to fill 2476 * @ofs: offset inside the attribute at which to start to fill
@@ -1773,6 +2592,8 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)
1773 /* Finally unlock and release the page. */ 2592 /* Finally unlock and release the page. */
1774 unlock_page(page); 2593 unlock_page(page);
1775 page_cache_release(page); 2594 page_cache_release(page);
2595 balance_dirty_pages_ratelimited(mapping);
2596 cond_resched();
1776 } 2597 }
1777 /* If there is a last partial page, need to do it the slow way. */ 2598 /* If there is a last partial page, need to do it the slow way. */
1778 if (end_ofs) { 2599 if (end_ofs) {
diff --git a/fs/ntfs/attrib.h b/fs/ntfs/attrib.h
index 0618ed6fd7b3..9074886b44ba 100644
--- a/fs/ntfs/attrib.h
+++ b/fs/ntfs/attrib.h
@@ -60,14 +60,15 @@ typedef struct {
60 ATTR_RECORD *base_attr; 60 ATTR_RECORD *base_attr;
61} ntfs_attr_search_ctx; 61} ntfs_attr_search_ctx;
62 62
63extern int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn); 63extern int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn,
64 ntfs_attr_search_ctx *ctx);
64extern int ntfs_map_runlist(ntfs_inode *ni, VCN vcn); 65extern int ntfs_map_runlist(ntfs_inode *ni, VCN vcn);
65 66
66extern LCN ntfs_attr_vcn_to_lcn_nolock(ntfs_inode *ni, const VCN vcn, 67extern LCN ntfs_attr_vcn_to_lcn_nolock(ntfs_inode *ni, const VCN vcn,
67 const BOOL write_locked); 68 const BOOL write_locked);
68 69
69extern runlist_element *ntfs_attr_find_vcn_nolock(ntfs_inode *ni, 70extern runlist_element *ntfs_attr_find_vcn_nolock(ntfs_inode *ni,
70 const VCN vcn, const BOOL write_locked); 71 const VCN vcn, ntfs_attr_search_ctx *ctx);
71 72
72int ntfs_attr_lookup(const ATTR_TYPE type, const ntfschar *name, 73int ntfs_attr_lookup(const ATTR_TYPE type, const ntfschar *name,
73 const u32 name_len, const IGNORE_CASE_BOOL ic, 74 const u32 name_len, const IGNORE_CASE_BOOL ic,
@@ -102,7 +103,10 @@ extern int ntfs_attr_record_resize(MFT_RECORD *m, ATTR_RECORD *a, u32 new_size);
102extern int ntfs_resident_attr_value_resize(MFT_RECORD *m, ATTR_RECORD *a, 103extern int ntfs_resident_attr_value_resize(MFT_RECORD *m, ATTR_RECORD *a,
103 const u32 new_size); 104 const u32 new_size);
104 105
105extern int ntfs_attr_make_non_resident(ntfs_inode *ni); 106extern int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size);
107
108extern s64 ntfs_attr_extend_allocation(ntfs_inode *ni, s64 new_alloc_size,
109 const s64 new_data_size, const s64 data_start);
106 110
107extern int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, 111extern int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt,
108 const u8 val); 112 const u8 val);
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index be9fd1dd423d..cf3e6ced2d01 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -19,11 +19,24 @@
19 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 19 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */ 20 */
21 21
22#include <linux/pagemap.h>
23#include <linux/buffer_head.h> 22#include <linux/buffer_head.h>
23#include <linux/pagemap.h>
24#include <linux/pagevec.h>
25#include <linux/sched.h>
26#include <linux/swap.h>
27#include <linux/uio.h>
28#include <linux/writeback.h>
24 29
30#include <asm/page.h>
31#include <asm/uaccess.h>
32
33#include "attrib.h"
34#include "bitmap.h"
25#include "inode.h" 35#include "inode.h"
26#include "debug.h" 36#include "debug.h"
37#include "lcnalloc.h"
38#include "malloc.h"
39#include "mft.h"
27#include "ntfs.h" 40#include "ntfs.h"
28 41
29/** 42/**
@@ -56,6 +69,2184 @@ static int ntfs_file_open(struct inode *vi, struct file *filp)
56#ifdef NTFS_RW 69#ifdef NTFS_RW
57 70
58/** 71/**
72 * ntfs_attr_extend_initialized - extend the initialized size of an attribute
73 * @ni: ntfs inode of the attribute to extend
74 * @new_init_size: requested new initialized size in bytes
75 * @cached_page: store any allocated but unused page here
76 * @lru_pvec: lru-buffering pagevec of the caller
77 *
78 * Extend the initialized size of an attribute described by the ntfs inode @ni
79 * to @new_init_size bytes. This involves zeroing any non-sparse space between
80 * the old initialized size and @new_init_size both in the page cache and on
81 * disk (if relevant complete pages are already uptodate in the page cache then
82 * these are simply marked dirty).
83 *
84 * As a side-effect, the file size (vfs inode->i_size) may be incremented as,
85 * in the resident attribute case, it is tied to the initialized size and, in
86 * the non-resident attribute case, it may not fall below the initialized size.
87 *
88 * Note that if the attribute is resident, we do not need to touch the page
89 * cache at all. This is because if the page cache page is not uptodate we
90 * bring it uptodate later, when doing the write to the mft record since we
91 * then already have the page mapped. And if the page is uptodate, the
92 * non-initialized region will already have been zeroed when the page was
93 * brought uptodate and the region may in fact already have been overwritten
94 * with new data via mmap() based writes, so we cannot just zero it. And since
95 * POSIX specifies that the behaviour of resizing a file whilst it is mmap()ped
96 * is unspecified, we choose not to do zeroing and thus we do not need to touch
97 * the page at all. For a more detailed explanation see ntfs_truncate() in
98 * fs/ntfs/inode.c.
99 *
100 * @cached_page and @lru_pvec are just optimizations for dealing with multiple
101 * pages.
102 *
103 * Return 0 on success and -errno on error. In the case that an error is
104 * encountered it is possible that the initialized size will already have been
105 * incremented some way towards @new_init_size but it is guaranteed that if
106 * this is the case, the necessary zeroing will also have happened and that all
107 * metadata is self-consistent.
108 *
 109 * Locking: i_sem on the vfs inode corresponding to the ntfs inode @ni must be
110 * held by the caller.
111 */
112static int ntfs_attr_extend_initialized(ntfs_inode *ni, const s64 new_init_size,
113 struct page **cached_page, struct pagevec *lru_pvec)
114{
115 s64 old_init_size;
116 loff_t old_i_size;
117 pgoff_t index, end_index;
118 unsigned long flags;
119 struct inode *vi = VFS_I(ni);
120 ntfs_inode *base_ni;
121 MFT_RECORD *m = NULL;
122 ATTR_RECORD *a;
123 ntfs_attr_search_ctx *ctx = NULL;
124 struct address_space *mapping;
125 struct page *page = NULL;
126 u8 *kattr;
127 int err;
128 u32 attr_len;
129
130 read_lock_irqsave(&ni->size_lock, flags);
131 old_init_size = ni->initialized_size;
132 old_i_size = i_size_read(vi);
133 BUG_ON(new_init_size > ni->allocated_size);
134 read_unlock_irqrestore(&ni->size_lock, flags);
135 ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, "
136 "old_initialized_size 0x%llx, "
137 "new_initialized_size 0x%llx, i_size 0x%llx.",
138 vi->i_ino, (unsigned)le32_to_cpu(ni->type),
139 (unsigned long long)old_init_size,
140 (unsigned long long)new_init_size, old_i_size);
141 if (!NInoAttr(ni))
142 base_ni = ni;
143 else
144 base_ni = ni->ext.base_ntfs_ino;
 145 /* Use goto to reduce indentation; we need the label below anyway. */
146 if (NInoNonResident(ni))
147 goto do_non_resident_extend;
148 BUG_ON(old_init_size != old_i_size);
149 m = map_mft_record(base_ni);
150 if (IS_ERR(m)) {
151 err = PTR_ERR(m);
152 m = NULL;
153 goto err_out;
154 }
155 ctx = ntfs_attr_get_search_ctx(base_ni, m);
156 if (unlikely(!ctx)) {
157 err = -ENOMEM;
158 goto err_out;
159 }
160 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
161 CASE_SENSITIVE, 0, NULL, 0, ctx);
162 if (unlikely(err)) {
163 if (err == -ENOENT)
164 err = -EIO;
165 goto err_out;
166 }
167 m = ctx->mrec;
168 a = ctx->attr;
169 BUG_ON(a->non_resident);
170 /* The total length of the attribute value. */
171 attr_len = le32_to_cpu(a->data.resident.value_length);
172 BUG_ON(old_i_size != (loff_t)attr_len);
173 /*
174 * Do the zeroing in the mft record and update the attribute size in
175 * the mft record.
176 */
177 kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset);
178 memset(kattr + attr_len, 0, new_init_size - attr_len);
179 a->data.resident.value_length = cpu_to_le32((u32)new_init_size);
180 /* Finally, update the sizes in the vfs and ntfs inodes. */
181 write_lock_irqsave(&ni->size_lock, flags);
182 i_size_write(vi, new_init_size);
183 ni->initialized_size = new_init_size;
184 write_unlock_irqrestore(&ni->size_lock, flags);
185 goto done;
186do_non_resident_extend:
187 /*
188 * If the new initialized size @new_init_size exceeds the current file
189 * size (vfs inode->i_size), we need to extend the file size to the
190 * new initialized size.
191 */
192 if (new_init_size > old_i_size) {
193 m = map_mft_record(base_ni);
194 if (IS_ERR(m)) {
195 err = PTR_ERR(m);
196 m = NULL;
197 goto err_out;
198 }
199 ctx = ntfs_attr_get_search_ctx(base_ni, m);
200 if (unlikely(!ctx)) {
201 err = -ENOMEM;
202 goto err_out;
203 }
204 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
205 CASE_SENSITIVE, 0, NULL, 0, ctx);
206 if (unlikely(err)) {
207 if (err == -ENOENT)
208 err = -EIO;
209 goto err_out;
210 }
211 m = ctx->mrec;
212 a = ctx->attr;
213 BUG_ON(!a->non_resident);
214 BUG_ON(old_i_size != (loff_t)
215 sle64_to_cpu(a->data.non_resident.data_size));
216 a->data.non_resident.data_size = cpu_to_sle64(new_init_size);
217 flush_dcache_mft_record_page(ctx->ntfs_ino);
218 mark_mft_record_dirty(ctx->ntfs_ino);
219 /* Update the file size in the vfs inode. */
220 i_size_write(vi, new_init_size);
221 ntfs_attr_put_search_ctx(ctx);
222 ctx = NULL;
223 unmap_mft_record(base_ni);
224 m = NULL;
225 }
226 mapping = vi->i_mapping;
227 index = old_init_size >> PAGE_CACHE_SHIFT;
228 end_index = (new_init_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
229 do {
230 /*
231 * Read the page. If the page is not present, this will zero
232 * the uninitialized regions for us.
233 */
234 page = read_cache_page(mapping, index,
235 (filler_t*)mapping->a_ops->readpage, NULL);
236 if (IS_ERR(page)) {
237 err = PTR_ERR(page);
238 goto init_err_out;
239 }
240 wait_on_page_locked(page);
241 if (unlikely(!PageUptodate(page) || PageError(page))) {
242 page_cache_release(page);
243 err = -EIO;
244 goto init_err_out;
245 }
246 /*
247 * Update the initialized size in the ntfs inode. This is
248 * enough to make ntfs_writepage() work.
249 */
250 write_lock_irqsave(&ni->size_lock, flags);
251 ni->initialized_size = (index + 1) << PAGE_CACHE_SHIFT;
252 if (ni->initialized_size > new_init_size)
253 ni->initialized_size = new_init_size;
254 write_unlock_irqrestore(&ni->size_lock, flags);
255 /* Set the page dirty so it gets written out. */
256 set_page_dirty(page);
257 page_cache_release(page);
258 /*
259 * Play nice with the vm and the rest of the system. This is
260 * very much needed as we can potentially be modifying the
261 * initialised size from a very small value to a really huge
262 * value, e.g.
263 * f = open(somefile, O_TRUNC);
264 * truncate(f, 10GiB);
265 * seek(f, 10GiB);
266 * write(f, 1);
267 * And this would mean we would be marking dirty hundreds of
268 * thousands of pages or as in the above example more than
269 * two and a half million pages!
270 *
271 * TODO: For sparse pages could optimize this workload by using
272 * the FsMisc / MiscFs page bit as a "PageIsSparse" bit. This
273 * would be set in readpage for sparse pages and here we would
274 * not need to mark dirty any pages which have this bit set.
275 * The only caveat is that we have to clear the bit everywhere
276 * where we allocate any clusters that lie in the page or that
277 * contain the page.
278 *
279 * TODO: An even greater optimization would be for us to only
280 * call readpage() on pages which are not in sparse regions as
281 * determined from the runlist. This would greatly reduce the
282 * number of pages we read and make dirty in the case of sparse
283 * files.
284 */
285 balance_dirty_pages_ratelimited(mapping);
286 cond_resched();
287 } while (++index < end_index);
288 read_lock_irqsave(&ni->size_lock, flags);
289 BUG_ON(ni->initialized_size != new_init_size);
290 read_unlock_irqrestore(&ni->size_lock, flags);
291 /* Now bring in sync the initialized_size in the mft record. */
292 m = map_mft_record(base_ni);
293 if (IS_ERR(m)) {
294 err = PTR_ERR(m);
295 m = NULL;
296 goto init_err_out;
297 }
298 ctx = ntfs_attr_get_search_ctx(base_ni, m);
299 if (unlikely(!ctx)) {
300 err = -ENOMEM;
301 goto init_err_out;
302 }
303 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
304 CASE_SENSITIVE, 0, NULL, 0, ctx);
305 if (unlikely(err)) {
306 if (err == -ENOENT)
307 err = -EIO;
308 goto init_err_out;
309 }
310 m = ctx->mrec;
311 a = ctx->attr;
312 BUG_ON(!a->non_resident);
313 a->data.non_resident.initialized_size = cpu_to_sle64(new_init_size);
314done:
315 flush_dcache_mft_record_page(ctx->ntfs_ino);
316 mark_mft_record_dirty(ctx->ntfs_ino);
317 if (ctx)
318 ntfs_attr_put_search_ctx(ctx);
319 if (m)
320 unmap_mft_record(base_ni);
321 ntfs_debug("Done, initialized_size 0x%llx, i_size 0x%llx.",
322 (unsigned long long)new_init_size, i_size_read(vi));
323 return 0;
324init_err_out:
325 write_lock_irqsave(&ni->size_lock, flags);
326 ni->initialized_size = old_init_size;
327 write_unlock_irqrestore(&ni->size_lock, flags);
328err_out:
329 if (ctx)
330 ntfs_attr_put_search_ctx(ctx);
331 if (m)
332 unmap_mft_record(base_ni);
333 ntfs_debug("Failed. Returning error code %i.", err);
334 return err;
335}
336
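A hypothetical buffered write fragment (variable names assumed) showing when this helper is needed: a write starting beyond the current initialized size must first zero the gap:

	read_lock_irqsave(&ni->size_lock, flags);
	init_size = ni->initialized_size;
	read_unlock_irqrestore(&ni->size_lock, flags);
	if (pos > init_size) {
		/* Zero from the old initialized size up to @pos. */
		err = ntfs_attr_extend_initialized(ni, pos, &cached_page,
				&lru_pvec);
		if (unlikely(err))
			return err;
	}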
337/**
 338 * ntfs_fault_in_pages_readable - fault in userspace pages
339 *
340 * Fault a number of userspace pages into pagetables.
341 *
342 * Unlike include/linux/pagemap.h::fault_in_pages_readable(), this one copes
343 * with more than two userspace pages as well as handling the single page case
344 * elegantly.
345 *
 346 * If you find this difficult to understand, then think of the while loop as being
347 * the following code, except that we do without the integer variable ret:
348 *
349 * do {
350 * ret = __get_user(c, uaddr);
351 * uaddr += PAGE_SIZE;
352 * } while (!ret && uaddr < end);
353 *
354 * Note, the final __get_user() may well run out-of-bounds of the user buffer,
355 * but _not_ out-of-bounds of the page the user buffer belongs to, and since
356 * this is only a read and not a write, and since it is still in the same page,
357 * it should not matter and this makes the code much simpler.
358 */
359static inline void ntfs_fault_in_pages_readable(const char __user *uaddr,
360 int bytes)
361{
362 const char __user *end;
363 volatile char c;
364
365 /* Set @end to the first byte outside the last page we care about. */
366 end = (const char __user*)PAGE_ALIGN((ptrdiff_t __user)uaddr + bytes);
367
368 while (!__get_user(c, uaddr) && (uaddr += PAGE_SIZE, uaddr < end))
369 ;
370}
371
372/**
 373 * ntfs_fault_in_pages_readable_iovec - fault in userspace pages from an iovec
374 *
375 * Same as ntfs_fault_in_pages_readable() but operates on an array of iovecs.
376 */
377static inline void ntfs_fault_in_pages_readable_iovec(const struct iovec *iov,
378 size_t iov_ofs, int bytes)
379{
380 do {
381 const char __user *buf;
382 unsigned len;
383
384 buf = iov->iov_base + iov_ofs;
385 len = iov->iov_len - iov_ofs;
386 if (len > bytes)
387 len = bytes;
388 ntfs_fault_in_pages_readable(buf, len);
389 bytes -= len;
390 iov++;
391 iov_ofs = 0;
392 } while (bytes);
393}
394
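Assumed usage in a buffered write loop: pre-fault the source segments covering the current batch before locking the destination page cache pages, since the copy performed under page locks must not take a page fault:

	/* @iov and @iov_ofs track the current position in the source. */
	ntfs_fault_in_pages_readable_iovec(iov, iov_ofs, bytes);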
395/**
396 * __ntfs_grab_cache_pages - obtain a number of locked pages
397 * @mapping: address space mapping from which to obtain page cache pages
398 * @index: starting index in @mapping at which to begin obtaining pages
399 * @nr_pages: number of page cache pages to obtain
400 * @pages: array of pages in which to return the obtained page cache pages
401 * @cached_page: allocated but as yet unused page
402 * @lru_pvec: lru-buffering pagevec of caller
403 *
 404 * Obtain @nr_pages locked page cache pages from the mapping @mapping and
405 * starting at index @index.
406 *
407 * If a page is newly created, increment its refcount and add it to the
408 * caller's lru-buffering pagevec @lru_pvec.
409 *
410 * This is the same as mm/filemap.c::__grab_cache_page(), except that @nr_pages
411 * are obtained at once instead of just one page and that 0 is returned on
412 * success and -errno on error.
413 *
414 * Note, the page locks are obtained in ascending page index order.
415 */
416static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
417 pgoff_t index, const unsigned nr_pages, struct page **pages,
418 struct page **cached_page, struct pagevec *lru_pvec)
419{
420 int err, nr;
421
422 BUG_ON(!nr_pages);
423 err = nr = 0;
424 do {
425 pages[nr] = find_lock_page(mapping, index);
426 if (!pages[nr]) {
427 if (!*cached_page) {
428 *cached_page = page_cache_alloc(mapping);
429 if (unlikely(!*cached_page)) {
430 err = -ENOMEM;
431 goto err_out;
432 }
433 }
434 err = add_to_page_cache(*cached_page, mapping, index,
435 GFP_KERNEL);
436 if (unlikely(err)) {
437 if (err == -EEXIST)
438 continue;
439 goto err_out;
440 }
441 pages[nr] = *cached_page;
442 page_cache_get(*cached_page);
443 if (unlikely(!pagevec_add(lru_pvec, *cached_page)))
444 __pagevec_lru_add(lru_pvec);
445 *cached_page = NULL;
446 }
447 index++;
448 nr++;
449 } while (nr < nr_pages);
450out:
451 return err;
452err_out:
453 while (nr > 0) {
454 unlock_page(pages[--nr]);
455 page_cache_release(pages[nr]);
456 }
457 goto out;
458}
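/*
 * Illustrative caller sketch (hypothetical, not part of the driver),
 * mirroring how ntfs_file_buffered_write() below uses this helper: grab and
 * lock the pages, copy data in, then unlock and drop the references.  On
 * failure no page locks or references are held, so only @cached_page and
 * the pagevec need cleaning up.
 *
 *	struct page *pages[4], *cached_page = NULL;
 *	struct pagevec lru_pvec;
 *	int err;
 *	unsigned i;
 *
 *	pagevec_init(&lru_pvec, 0);
 *	err = __ntfs_grab_cache_pages(mapping, index, 4, pages,
 *			&cached_page, &lru_pvec);
 *	if (!err) {
 *		... copy data into the locked pages ...
 *		for (i = 0; i < 4; i++) {
 *			unlock_page(pages[i]);
 *			page_cache_release(pages[i]);
 *		}
 *	}
 *	if (cached_page)
 *		page_cache_release(cached_page);
 *	pagevec_lru_add(&lru_pvec);
 */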
459
460static inline int ntfs_submit_bh_for_read(struct buffer_head *bh)
461{
462 lock_buffer(bh);
463 get_bh(bh);
464 bh->b_end_io = end_buffer_read_sync;
465 return submit_bh(READ, bh);
466}
467
468/**
469 * ntfs_prepare_pages_for_non_resident_write - prepare pages for receiving data
470 * @pages: array of destination pages
471 * @nr_pages: number of pages in @pages
472 * @pos: byte position in file at which the write begins
473 * @bytes: number of bytes to be written
474 *
475 * This is called for non-resident attributes from ntfs_file_buffered_write()
476 * with i_sem held on the inode (@pages[0]->mapping->host). There are
477 * @nr_pages pages in @pages which are locked but not kmap()ped. The source
478 * data has not yet been copied into the @pages.
479 *
480 * Need to fill any holes with actual clusters, allocate buffers if necessary,
481 * ensure all the buffers are mapped, and bring uptodate any buffers that are
482 * only partially being written to.
483 *
484 * If @nr_pages is greater than one, we are guaranteed that the cluster size is
485 * greater than PAGE_CACHE_SIZE, that all pages in @pages are entirely inside
486 * the same cluster and that they are the entirety of that cluster, and that
487 * the cluster is sparse, i.e. we need to allocate a cluster to fill the hole.
488 *
489 * i_size is not to be modified yet.
490 *
491 * Return 0 on success or -errno on error.
492 */
493static int ntfs_prepare_pages_for_non_resident_write(struct page **pages,
494 unsigned nr_pages, s64 pos, size_t bytes)
495{
496 VCN vcn, highest_vcn = 0, cpos, cend, bh_cpos, bh_cend;
497 LCN lcn;
498 s64 bh_pos, vcn_len, end, initialized_size;
499 sector_t lcn_block;
500 struct page *page;
501 struct inode *vi;
502 ntfs_inode *ni, *base_ni = NULL;
503 ntfs_volume *vol;
504 runlist_element *rl, *rl2;
505 struct buffer_head *bh, *head, *wait[2], **wait_bh = wait;
506 ntfs_attr_search_ctx *ctx = NULL;
507 MFT_RECORD *m = NULL;
508 ATTR_RECORD *a = NULL;
509 unsigned long flags;
510 u32 attr_rec_len = 0;
511 unsigned blocksize, u;
512 int err, mp_size;
513 BOOL rl_write_locked, was_hole, is_retry;
514 unsigned char blocksize_bits;
515 struct {
516 u8 runlist_merged:1;
517 u8 mft_attr_mapped:1;
518 u8 mp_rebuilt:1;
519 u8 attr_switched:1;
520 } status = { 0, 0, 0, 0 };
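	/*
	 * Each flag in @status records a step that has to be undone in the
	 * error code path at the end of this function: @runlist_merged means
	 * the newly allocated cluster has to be punched back out of the
	 * runlist and freed, @mft_attr_mapped means the search context and
	 * mft record are still held and have to be released, @mp_rebuilt
	 * means the attribute record has to be restored to its old size and
	 * its mapping pairs array rebuilt, and @attr_switched means the
	 * currently mapped attribute extent is not the modified one and has
	 * to be looked up again first.
	 */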
521
522 BUG_ON(!nr_pages);
523 BUG_ON(!pages);
524 BUG_ON(!*pages);
525 vi = pages[0]->mapping->host;
526 ni = NTFS_I(vi);
527 vol = ni->vol;
528 ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, start page "
529 "index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%zx.",
530 vi->i_ino, ni->type, pages[0]->index, nr_pages,
531 (long long)pos, bytes);
532 blocksize_bits = vi->i_blkbits;
533 blocksize = 1 << blocksize_bits;
534 u = 0;
535 do {
536 struct page *page = pages[u];
537 /*
538 * create_empty_buffers() will create uptodate/dirty buffers if
539 * the page is uptodate/dirty.
540 */
541 if (!page_has_buffers(page)) {
542 create_empty_buffers(page, blocksize, 0);
543 if (unlikely(!page_has_buffers(page)))
544 return -ENOMEM;
545 }
546 } while (++u < nr_pages);
547 rl_write_locked = FALSE;
548 rl = NULL;
549 err = 0;
550 vcn = lcn = -1;
551 vcn_len = 0;
552 lcn_block = -1;
553 was_hole = FALSE;
554 cpos = pos >> vol->cluster_size_bits;
555 end = pos + bytes;
556 cend = (end + vol->cluster_size - 1) >> vol->cluster_size_bits;
557 /*
558 * Loop over each page and for each page over each buffer. Use goto to
559 * reduce indentation.
560 */
561 u = 0;
562do_next_page:
563 page = pages[u];
564 bh_pos = (s64)page->index << PAGE_CACHE_SHIFT;
565 bh = head = page_buffers(page);
566 do {
567 VCN cdelta;
568 s64 bh_end;
569 unsigned bh_cofs;
570
571 /* Clear buffer_new on all buffers to reinitialise state. */
572 if (buffer_new(bh))
573 clear_buffer_new(bh);
574 bh_end = bh_pos + blocksize;
575 bh_cpos = bh_pos >> vol->cluster_size_bits;
576 bh_cofs = bh_pos & vol->cluster_size_mask;
577 if (buffer_mapped(bh)) {
578 /*
579 * The buffer is already mapped. If it is uptodate,
580 * ignore it.
581 */
582 if (buffer_uptodate(bh))
583 continue;
584 /*
585 * The buffer is not uptodate. If the page is uptodate
586 * set the buffer uptodate and otherwise ignore it.
587 */
588 if (PageUptodate(page)) {
589 set_buffer_uptodate(bh);
590 continue;
591 }
592 /*
593 * Neither the page nor the buffer are uptodate. If
594 * the buffer is only partially being written to, we
595 * need to read it in before the write, i.e. now.
596 */
597 if ((bh_pos < pos && bh_end > pos) ||
598 (bh_pos < end && bh_end > end)) {
599 /*
600 * If the buffer is fully or partially within
601 * the initialized size, do an actual read.
602 * Otherwise, simply zero the buffer.
603 */
604 read_lock_irqsave(&ni->size_lock, flags);
605 initialized_size = ni->initialized_size;
606 read_unlock_irqrestore(&ni->size_lock, flags);
607 if (bh_pos < initialized_size) {
608 ntfs_submit_bh_for_read(bh);
609 *wait_bh++ = bh;
610 } else {
611 u8 *kaddr = kmap_atomic(page, KM_USER0);
612 memset(kaddr + bh_offset(bh), 0,
613 blocksize);
614 kunmap_atomic(kaddr, KM_USER0);
615 flush_dcache_page(page);
616 set_buffer_uptodate(bh);
617 }
618 }
619 continue;
620 }
621 /* Unmapped buffer. Need to map it. */
622 bh->b_bdev = vol->sb->s_bdev;
623 /*
624 * If the current buffer is in the same clusters as the map
625 * cache, there is no need to check the runlist again. The
626 * map cache is made up of @vcn, which is the first cached file
627 * cluster, @vcn_len which is the number of cached file
628 * clusters, @lcn is the device cluster corresponding to @vcn,
629 * and @lcn_block is the block number corresponding to @lcn.
630 */
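		/*
		 * Worked example with illustrative numbers: for 4096-byte
		 * clusters and 512-byte blocks (cluster_size_bits 12,
		 * blocksize_bits 9), @lcn 0x100 gives @lcn_block
		 * 0x100 << 3 = 0x800, and a buffer two clusters into the
		 * cached run (cdelta 2) at offset 0x600 within its cluster
		 * maps to block 0x800 + (2 << 3) + (0x600 >> 9) = 0x813.
		 */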
631 cdelta = bh_cpos - vcn;
632 if (likely(!cdelta || (cdelta > 0 && cdelta < vcn_len))) {
633map_buffer_cached:
634 BUG_ON(lcn < 0);
635 bh->b_blocknr = lcn_block +
636 (cdelta << (vol->cluster_size_bits -
637 blocksize_bits)) +
638 (bh_cofs >> blocksize_bits);
639 set_buffer_mapped(bh);
640 /*
641 * If the page is uptodate so is the buffer. If the
642 * buffer is fully outside the write, we ignore it if
643 * it was already allocated and we mark it dirty so it
644 * gets written out if we allocated it. On the other
645 * hand, if we allocated the buffer but we are not
646 * marking it dirty we set buffer_new so we can do
647 * error recovery.
648 */
649 if (PageUptodate(page)) {
650 if (!buffer_uptodate(bh))
651 set_buffer_uptodate(bh);
652 if (unlikely(was_hole)) {
653 /* We allocated the buffer. */
654 unmap_underlying_metadata(bh->b_bdev,
655 bh->b_blocknr);
656 if (bh_end <= pos || bh_pos >= end)
657 mark_buffer_dirty(bh);
658 else
659 set_buffer_new(bh);
660 }
661 continue;
662 }
663 /* Page is _not_ uptodate. */
664 if (likely(!was_hole)) {
665 /*
666 * Buffer was already allocated. If it is not
667 * uptodate and is only partially being written
668 * to, we need to read it in before the write,
669 * i.e. now.
670 */
671 if (!buffer_uptodate(bh) && ((bh_pos < pos &&
672 bh_end > pos) ||
673					(bh_pos < end &&
674 bh_end > end))) {
675 /*
676 * If the buffer is fully or partially
677 * within the initialized size, do an
678 * actual read. Otherwise, simply zero
679 * the buffer.
680 */
681 read_lock_irqsave(&ni->size_lock,
682 flags);
683 initialized_size = ni->initialized_size;
684 read_unlock_irqrestore(&ni->size_lock,
685 flags);
686 if (bh_pos < initialized_size) {
687 ntfs_submit_bh_for_read(bh);
688 *wait_bh++ = bh;
689 } else {
690 u8 *kaddr = kmap_atomic(page,
691 KM_USER0);
692 memset(kaddr + bh_offset(bh),
693 0, blocksize);
694 kunmap_atomic(kaddr, KM_USER0);
695 flush_dcache_page(page);
696 set_buffer_uptodate(bh);
697 }
698 }
699 continue;
700 }
701 /* We allocated the buffer. */
702 unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
703 /*
704 * If the buffer is fully outside the write, zero it,
705 * set it uptodate, and mark it dirty so it gets
706 * written out. If it is partially being written to,
707 * zero region surrounding the write but leave it to
708 * commit write to do anything else. Finally, if the
709 * buffer is fully being overwritten, do nothing.
710 */
711 if (bh_end <= pos || bh_pos >= end) {
712 if (!buffer_uptodate(bh)) {
713 u8 *kaddr = kmap_atomic(page, KM_USER0);
714 memset(kaddr + bh_offset(bh), 0,
715 blocksize);
716 kunmap_atomic(kaddr, KM_USER0);
717 flush_dcache_page(page);
718 set_buffer_uptodate(bh);
719 }
720 mark_buffer_dirty(bh);
721 continue;
722 }
723 set_buffer_new(bh);
724 if (!buffer_uptodate(bh) &&
725 (bh_pos < pos || bh_end > end)) {
726 u8 *kaddr;
727 unsigned pofs;
728
729 kaddr = kmap_atomic(page, KM_USER0);
730 if (bh_pos < pos) {
731 pofs = bh_pos & ~PAGE_CACHE_MASK;
732 memset(kaddr + pofs, 0, pos - bh_pos);
733 }
734 if (bh_end > end) {
735 pofs = end & ~PAGE_CACHE_MASK;
736 memset(kaddr + pofs, 0, bh_end - end);
737 }
738 kunmap_atomic(kaddr, KM_USER0);
739 flush_dcache_page(page);
740 }
741 continue;
742 }
743 /*
744 * Slow path: this is the first buffer in the cluster. If it
745 * is outside allocated size and is not uptodate, zero it and
746 * set it uptodate.
747 */
748 read_lock_irqsave(&ni->size_lock, flags);
749 initialized_size = ni->allocated_size;
750 read_unlock_irqrestore(&ni->size_lock, flags);
751 if (bh_pos > initialized_size) {
752 if (PageUptodate(page)) {
753 if (!buffer_uptodate(bh))
754 set_buffer_uptodate(bh);
755 } else if (!buffer_uptodate(bh)) {
756 u8 *kaddr = kmap_atomic(page, KM_USER0);
757 memset(kaddr + bh_offset(bh), 0, blocksize);
758 kunmap_atomic(kaddr, KM_USER0);
759 flush_dcache_page(page);
760 set_buffer_uptodate(bh);
761 }
762 continue;
763 }
764 is_retry = FALSE;
765 if (!rl) {
766 down_read(&ni->runlist.lock);
767retry_remap:
768 rl = ni->runlist.rl;
769 }
770 if (likely(rl != NULL)) {
771 /* Seek to element containing target cluster. */
772 while (rl->length && rl[1].vcn <= bh_cpos)
773 rl++;
774 lcn = ntfs_rl_vcn_to_lcn(rl, bh_cpos);
775 if (likely(lcn >= 0)) {
776 /*
777 * Successful remap, setup the map cache and
778 * use that to deal with the buffer.
779 */
780 was_hole = FALSE;
781 vcn = bh_cpos;
782 vcn_len = rl[1].vcn - vcn;
783 lcn_block = lcn << (vol->cluster_size_bits -
784 blocksize_bits);
785 cdelta = 0;
786 /*
787 * If the number of remaining clusters in the
788 * @pages is smaller or equal to the number of
789 * cached clusters, unlock the runlist as the
790 * map cache will be used from now on.
791 */
792 if (likely(vcn + vcn_len >= cend)) {
793 if (rl_write_locked) {
794 up_write(&ni->runlist.lock);
795 rl_write_locked = FALSE;
796 } else
797 up_read(&ni->runlist.lock);
798 rl = NULL;
799 }
800 goto map_buffer_cached;
801 }
802 } else
803 lcn = LCN_RL_NOT_MAPPED;
804 /*
805 * If it is not a hole and not out of bounds, the runlist is
806 * probably unmapped so try to map it now.
807 */
808 if (unlikely(lcn != LCN_HOLE && lcn != LCN_ENOENT)) {
809 if (likely(!is_retry && lcn == LCN_RL_NOT_MAPPED)) {
810 /* Attempt to map runlist. */
811 if (!rl_write_locked) {
812 /*
813 * We need the runlist locked for
814 * writing, so if it is locked for
815 * reading relock it now and retry in
816 * case it changed whilst we dropped
817 * the lock.
818 */
819 up_read(&ni->runlist.lock);
820 down_write(&ni->runlist.lock);
821 rl_write_locked = TRUE;
822 goto retry_remap;
823 }
824 err = ntfs_map_runlist_nolock(ni, bh_cpos,
825 NULL);
826 if (likely(!err)) {
827 is_retry = TRUE;
828 goto retry_remap;
829 }
830 /*
831 * If @vcn is out of bounds, pretend @lcn is
832 * LCN_ENOENT. As long as the buffer is out
833 * of bounds this will work fine.
834 */
835 if (err == -ENOENT) {
836 lcn = LCN_ENOENT;
837 err = 0;
838 goto rl_not_mapped_enoent;
839 }
840 } else
841 err = -EIO;
842 /* Failed to map the buffer, even after retrying. */
843 bh->b_blocknr = -1;
844 ntfs_error(vol->sb, "Failed to write to inode 0x%lx, "
845 "attribute type 0x%x, vcn 0x%llx, "
846 "vcn offset 0x%x, because its "
847 "location on disk could not be "
848 "determined%s (error code %i).",
849 ni->mft_no, ni->type,
850 (unsigned long long)bh_cpos,
851 (unsigned)bh_pos &
852 vol->cluster_size_mask,
853 is_retry ? " even after retrying" : "",
854 err);
855 break;
856 }
857rl_not_mapped_enoent:
858 /*
859 * The buffer is in a hole or out of bounds. We need to fill
860 * the hole, unless the buffer is in a cluster which is not
861 * touched by the write, in which case we just leave the buffer
862 * unmapped. This can only happen when the cluster size is
863 * less than the page cache size.
864 */
865 if (unlikely(vol->cluster_size < PAGE_CACHE_SIZE)) {
866 bh_cend = (bh_end + vol->cluster_size - 1) >>
867 vol->cluster_size_bits;
868 if ((bh_cend <= cpos || bh_cpos >= cend)) {
869 bh->b_blocknr = -1;
870 /*
871 * If the buffer is uptodate we skip it. If it
872 * is not but the page is uptodate, we can set
873 * the buffer uptodate. If the page is not
874 * uptodate, we can clear the buffer and set it
875 * uptodate. Whether this is worthwhile is
876 * debatable and this could be removed.
877 */
878 if (PageUptodate(page)) {
879 if (!buffer_uptodate(bh))
880 set_buffer_uptodate(bh);
881 } else if (!buffer_uptodate(bh)) {
882 u8 *kaddr = kmap_atomic(page, KM_USER0);
883 memset(kaddr + bh_offset(bh), 0,
884 blocksize);
885 kunmap_atomic(kaddr, KM_USER0);
886 flush_dcache_page(page);
887 set_buffer_uptodate(bh);
888 }
889 continue;
890 }
891 }
892 /*
893		 * The only remaining possibility is that the buffer is in a
894		 * hole, so anything else at this point would be a bug.
895 */
896 BUG_ON(lcn != LCN_HOLE);
897 /*
898 * We need the runlist locked for writing, so if it is locked
899 * for reading relock it now and retry in case it changed
900 * whilst we dropped the lock.
901 */
902 BUG_ON(!rl);
903 if (!rl_write_locked) {
904 up_read(&ni->runlist.lock);
905 down_write(&ni->runlist.lock);
906 rl_write_locked = TRUE;
907 goto retry_remap;
908 }
909 /* Find the previous last allocated cluster. */
910 BUG_ON(rl->lcn != LCN_HOLE);
911 lcn = -1;
912 rl2 = rl;
913 while (--rl2 >= ni->runlist.rl) {
914 if (rl2->lcn >= 0) {
915 lcn = rl2->lcn + rl2->length;
916 break;
917 }
918 }
919 rl2 = ntfs_cluster_alloc(vol, bh_cpos, 1, lcn, DATA_ZONE,
920 FALSE);
921 if (IS_ERR(rl2)) {
922 err = PTR_ERR(rl2);
923 ntfs_debug("Failed to allocate cluster, error code %i.",
924 err);
925 break;
926 }
927 lcn = rl2->lcn;
928 rl = ntfs_runlists_merge(ni->runlist.rl, rl2);
929 if (IS_ERR(rl)) {
930 err = PTR_ERR(rl);
931 if (err != -ENOMEM)
932 err = -EIO;
933 if (ntfs_cluster_free_from_rl(vol, rl2)) {
934 ntfs_error(vol->sb, "Failed to release "
935 "allocated cluster in error "
936 "code path. Run chkdsk to "
937 "recover the lost cluster.");
938 NVolSetErrors(vol);
939 }
940 ntfs_free(rl2);
941 break;
942 }
943 ni->runlist.rl = rl;
944 status.runlist_merged = 1;
945 ntfs_debug("Allocated cluster, lcn 0x%llx.", lcn);
946 /* Map and lock the mft record and get the attribute record. */
947 if (!NInoAttr(ni))
948 base_ni = ni;
949 else
950 base_ni = ni->ext.base_ntfs_ino;
951 m = map_mft_record(base_ni);
952 if (IS_ERR(m)) {
953 err = PTR_ERR(m);
954 break;
955 }
956 ctx = ntfs_attr_get_search_ctx(base_ni, m);
957 if (unlikely(!ctx)) {
958 err = -ENOMEM;
959 unmap_mft_record(base_ni);
960 break;
961 }
962 status.mft_attr_mapped = 1;
963 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
964 CASE_SENSITIVE, bh_cpos, NULL, 0, ctx);
965 if (unlikely(err)) {
966 if (err == -ENOENT)
967 err = -EIO;
968 break;
969 }
970 m = ctx->mrec;
971 a = ctx->attr;
972 /*
973 * Find the runlist element with which the attribute extent
974 * starts. Note, we cannot use the _attr_ version because we
975 * have mapped the mft record. That is ok because we know the
976 * runlist fragment must be mapped already to have ever gotten
977 * here, so we can just use the _rl_ version.
978 */
979 vcn = sle64_to_cpu(a->data.non_resident.lowest_vcn);
980 rl2 = ntfs_rl_find_vcn_nolock(rl, vcn);
981 BUG_ON(!rl2);
982 BUG_ON(!rl2->length);
983 BUG_ON(rl2->lcn < LCN_HOLE);
984 highest_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn);
985 /*
986 * If @highest_vcn is zero, calculate the real highest_vcn
987 * (which can really be zero).
988 */
989 if (!highest_vcn)
990 highest_vcn = (sle64_to_cpu(
991 a->data.non_resident.allocated_size) >>
992 vol->cluster_size_bits) - 1;
993 /*
994 * Determine the size of the mapping pairs array for the new
995 * extent, i.e. the old extent with the hole filled.
996 */
997 mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, vcn,
998 highest_vcn);
999 if (unlikely(mp_size <= 0)) {
1000 if (!(err = mp_size))
1001 err = -EIO;
1002 ntfs_debug("Failed to get size for mapping pairs "
1003 "array, error code %i.", err);
1004 break;
1005 }
1006 /*
1007 * Resize the attribute record to fit the new mapping pairs
1008 * array.
1009 */
1010 attr_rec_len = le32_to_cpu(a->length);
1011 err = ntfs_attr_record_resize(m, a, mp_size + le16_to_cpu(
1012 a->data.non_resident.mapping_pairs_offset));
1013 if (unlikely(err)) {
1014 BUG_ON(err != -ENOSPC);
1015 // TODO: Deal with this by using the current attribute
1016 // and fill it with as much of the mapping pairs
1017 // array as possible. Then loop over each attribute
1018 // extent rewriting the mapping pairs arrays as we go
1019			// along and if, when we reach the end, we still do not
1020			// have enough space, try to resize the last attribute
1021 // extent and if even that fails, add a new attribute
1022 // extent.
1023 // We could also try to resize at each step in the hope
1024 // that we will not need to rewrite every single extent.
1025 // Note, we may need to decompress some extents to fill
1026 // the runlist as we are walking the extents...
1027 ntfs_error(vol->sb, "Not enough space in the mft "
1028 "record for the extended attribute "
1029 "record. This case is not "
1030 "implemented yet.");
1031 err = -EOPNOTSUPP;
1032			break;
1033 }
1034 status.mp_rebuilt = 1;
1035 /*
1036 * Generate the mapping pairs array directly into the attribute
1037 * record.
1038 */
1039 err = ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu(
1040 a->data.non_resident.mapping_pairs_offset),
1041 mp_size, rl2, vcn, highest_vcn, NULL);
1042 if (unlikely(err)) {
1043 ntfs_error(vol->sb, "Cannot fill hole in inode 0x%lx, "
1044 "attribute type 0x%x, because building "
1045 "the mapping pairs failed with error "
1046 "code %i.", vi->i_ino,
1047 (unsigned)le32_to_cpu(ni->type), err);
1048 err = -EIO;
1049 break;
1050 }
1051 /* Update the highest_vcn but only if it was not set. */
1052 if (unlikely(!a->data.non_resident.highest_vcn))
1053 a->data.non_resident.highest_vcn =
1054 cpu_to_sle64(highest_vcn);
1055 /*
1056 * If the attribute is sparse/compressed, update the compressed
1057 * size in the ntfs_inode structure and the attribute record.
1058 */
1059 if (likely(NInoSparse(ni) || NInoCompressed(ni))) {
1060 /*
1061 * If we are not in the first attribute extent, switch
1062 * to it, but first ensure the changes will make it to
1063 * disk later.
1064 */
1065 if (a->data.non_resident.lowest_vcn) {
1066 flush_dcache_mft_record_page(ctx->ntfs_ino);
1067 mark_mft_record_dirty(ctx->ntfs_ino);
1068 ntfs_attr_reinit_search_ctx(ctx);
1069 err = ntfs_attr_lookup(ni->type, ni->name,
1070 ni->name_len, CASE_SENSITIVE,
1071 0, NULL, 0, ctx);
1072 if (unlikely(err)) {
1073 status.attr_switched = 1;
1074 break;
1075 }
1076 /* @m is not used any more so do not set it. */
1077 a = ctx->attr;
1078 }
1079 write_lock_irqsave(&ni->size_lock, flags);
1080 ni->itype.compressed.size += vol->cluster_size;
1081 a->data.non_resident.compressed_size =
1082 cpu_to_sle64(ni->itype.compressed.size);
1083 write_unlock_irqrestore(&ni->size_lock, flags);
1084 }
1085 /* Ensure the changes make it to disk. */
1086 flush_dcache_mft_record_page(ctx->ntfs_ino);
1087 mark_mft_record_dirty(ctx->ntfs_ino);
1088 ntfs_attr_put_search_ctx(ctx);
1089 unmap_mft_record(base_ni);
1090 /* Successfully filled the hole. */
1091 status.runlist_merged = 0;
1092 status.mft_attr_mapped = 0;
1093 status.mp_rebuilt = 0;
1094 /* Setup the map cache and use that to deal with the buffer. */
1095 was_hole = TRUE;
1096 vcn = bh_cpos;
1097 vcn_len = 1;
1098 lcn_block = lcn << (vol->cluster_size_bits - blocksize_bits);
1099 cdelta = 0;
1100 /*
1101 * If the number of remaining clusters in the @pages is smaller
1102 * or equal to the number of cached clusters, unlock the
1103 * runlist as the map cache will be used from now on.
1104 */
1105 if (likely(vcn + vcn_len >= cend)) {
1106 up_write(&ni->runlist.lock);
1107 rl_write_locked = FALSE;
1108 rl = NULL;
1109 }
1110 goto map_buffer_cached;
1111 } while (bh_pos += blocksize, (bh = bh->b_this_page) != head);
1112 /* If there are no errors, do the next page. */
1113 if (likely(!err && ++u < nr_pages))
1114 goto do_next_page;
1115 /* If there are no errors, release the runlist lock if we took it. */
1116 if (likely(!err)) {
1117 if (unlikely(rl_write_locked)) {
1118 up_write(&ni->runlist.lock);
1119 rl_write_locked = FALSE;
1120 } else if (unlikely(rl))
1121 up_read(&ni->runlist.lock);
1122 rl = NULL;
1123 }
1124 /* If we issued read requests, let them complete. */
1125 read_lock_irqsave(&ni->size_lock, flags);
1126 initialized_size = ni->initialized_size;
1127 read_unlock_irqrestore(&ni->size_lock, flags);
1128 while (wait_bh > wait) {
1129 bh = *--wait_bh;
1130 wait_on_buffer(bh);
1131 if (likely(buffer_uptodate(bh))) {
1132 page = bh->b_page;
1133 bh_pos = ((s64)page->index << PAGE_CACHE_SHIFT) +
1134 bh_offset(bh);
1135 /*
1136 * If the buffer overflows the initialized size, need
1137 * to zero the overflowing region.
1138 */
1139 if (unlikely(bh_pos + blocksize > initialized_size)) {
1140 u8 *kaddr;
1141 int ofs = 0;
1142
1143 if (likely(bh_pos < initialized_size))
1144 ofs = initialized_size - bh_pos;
1145 kaddr = kmap_atomic(page, KM_USER0);
1146 memset(kaddr + bh_offset(bh) + ofs, 0,
1147 blocksize - ofs);
1148 kunmap_atomic(kaddr, KM_USER0);
1149 flush_dcache_page(page);
1150 }
1151 } else /* if (unlikely(!buffer_uptodate(bh))) */
1152 err = -EIO;
1153 }
1154 if (likely(!err)) {
1155 /* Clear buffer_new on all buffers. */
1156 u = 0;
1157 do {
1158 bh = head = page_buffers(pages[u]);
1159 do {
1160 if (buffer_new(bh))
1161 clear_buffer_new(bh);
1162 } while ((bh = bh->b_this_page) != head);
1163 } while (++u < nr_pages);
1164 ntfs_debug("Done.");
1165 return err;
1166 }
1167 if (status.attr_switched) {
1168 /* Get back to the attribute extent we modified. */
1169 ntfs_attr_reinit_search_ctx(ctx);
1170 if (ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
1171 CASE_SENSITIVE, bh_cpos, NULL, 0, ctx)) {
1172 ntfs_error(vol->sb, "Failed to find required "
1173 "attribute extent of attribute in "
1174 "error code path. Run chkdsk to "
1175 "recover.");
1176 write_lock_irqsave(&ni->size_lock, flags);
1177 ni->itype.compressed.size += vol->cluster_size;
1178 write_unlock_irqrestore(&ni->size_lock, flags);
1179 flush_dcache_mft_record_page(ctx->ntfs_ino);
1180 mark_mft_record_dirty(ctx->ntfs_ino);
1181 /*
1182 * The only thing that is now wrong is the compressed
1183 * size of the base attribute extent which chkdsk
1184 * should be able to fix.
1185 */
1186 NVolSetErrors(vol);
1187 } else {
1188 m = ctx->mrec;
1189 a = ctx->attr;
1190 status.attr_switched = 0;
1191 }
1192 }
1193 /*
1194 * If the runlist has been modified, need to restore it by punching a
1195 * hole into it and we then need to deallocate the on-disk cluster as
1196 * well. Note, we only modify the runlist if we are able to generate a
1197 * new mapping pairs array, i.e. only when the mapped attribute extent
1198 * is not switched.
1199 */
1200 if (status.runlist_merged && !status.attr_switched) {
1201 BUG_ON(!rl_write_locked);
1202 /* Make the file cluster we allocated sparse in the runlist. */
1203 if (ntfs_rl_punch_nolock(vol, &ni->runlist, bh_cpos, 1)) {
1204 ntfs_error(vol->sb, "Failed to punch hole into "
1205 "attribute runlist in error code "
1206 "path. Run chkdsk to recover the "
1207 "lost cluster.");
1208 make_bad_inode(vi);
1209 make_bad_inode(VFS_I(base_ni));
1210 NVolSetErrors(vol);
1211 } else /* if (success) */ {
1212 status.runlist_merged = 0;
1213 /*
1214 * Deallocate the on-disk cluster we allocated but only
1215 * if we succeeded in punching its vcn out of the
1216 * runlist.
1217 */
1218 down_write(&vol->lcnbmp_lock);
1219 if (ntfs_bitmap_clear_bit(vol->lcnbmp_ino, lcn)) {
1220 ntfs_error(vol->sb, "Failed to release "
1221 "allocated cluster in error "
1222 "code path. Run chkdsk to "
1223 "recover the lost cluster.");
1224 NVolSetErrors(vol);
1225 }
1226 up_write(&vol->lcnbmp_lock);
1227 }
1228 }
1229 /*
1230 * Resize the attribute record to its old size and rebuild the mapping
1231 * pairs array. Note, we only can do this if the runlist has been
1232 * restored to its old state which also implies that the mapped
1233 * attribute extent is not switched.
1234 */
1235 if (status.mp_rebuilt && !status.runlist_merged) {
1236 if (ntfs_attr_record_resize(m, a, attr_rec_len)) {
1237 ntfs_error(vol->sb, "Failed to restore attribute "
1238 "record in error code path. Run "
1239 "chkdsk to recover.");
1240 make_bad_inode(vi);
1241 make_bad_inode(VFS_I(base_ni));
1242 NVolSetErrors(vol);
1243 } else /* if (success) */ {
1244 if (ntfs_mapping_pairs_build(vol, (u8*)a +
1245 le16_to_cpu(a->data.non_resident.
1246 mapping_pairs_offset), attr_rec_len -
1247 le16_to_cpu(a->data.non_resident.
1248 mapping_pairs_offset), ni->runlist.rl,
1249 vcn, highest_vcn, NULL)) {
1250 ntfs_error(vol->sb, "Failed to restore "
1251 "mapping pairs array in error "
1252 "code path. Run chkdsk to "
1253 "recover.");
1254 make_bad_inode(vi);
1255 make_bad_inode(VFS_I(base_ni));
1256 NVolSetErrors(vol);
1257 }
1258 flush_dcache_mft_record_page(ctx->ntfs_ino);
1259 mark_mft_record_dirty(ctx->ntfs_ino);
1260 }
1261 }
1262 /* Release the mft record and the attribute. */
1263 if (status.mft_attr_mapped) {
1264 ntfs_attr_put_search_ctx(ctx);
1265 unmap_mft_record(base_ni);
1266 }
1267 /* Release the runlist lock. */
1268 if (rl_write_locked)
1269 up_write(&ni->runlist.lock);
1270 else if (rl)
1271 up_read(&ni->runlist.lock);
1272 /*
1273 * Zero out any newly allocated blocks to avoid exposing stale data.
1274 * If BH_New is set, we know that the block was newly allocated above
1275 * and that it has not been fully zeroed and marked dirty yet.
1276 */
1277 nr_pages = u;
1278 u = 0;
1279 end = bh_cpos << vol->cluster_size_bits;
1280 do {
1281 page = pages[u];
1282 bh = head = page_buffers(page);
1283 do {
1284 if (u == nr_pages &&
1285 ((s64)page->index << PAGE_CACHE_SHIFT) +
1286 bh_offset(bh) >= end)
1287 break;
1288 if (!buffer_new(bh))
1289 continue;
1290 clear_buffer_new(bh);
1291 if (!buffer_uptodate(bh)) {
1292 if (PageUptodate(page))
1293 set_buffer_uptodate(bh);
1294 else {
1295 u8 *kaddr = kmap_atomic(page, KM_USER0);
1296 memset(kaddr + bh_offset(bh), 0,
1297 blocksize);
1298 kunmap_atomic(kaddr, KM_USER0);
1299 flush_dcache_page(page);
1300 set_buffer_uptodate(bh);
1301 }
1302 }
1303 mark_buffer_dirty(bh);
1304 } while ((bh = bh->b_this_page) != head);
1305 } while (++u <= nr_pages);
1306 ntfs_error(vol->sb, "Failed. Returning error code %i.", err);
1307 return err;
1308}
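/*
 * Roadmap for the write path implemented in the remainder of this file: for
 * each batch of pages, ntfs_file_buffered_write() grabs and locks the pages
 * (__ntfs_grab_cache_pages()), prepares them if the attribute is
 * non-resident (ntfs_prepare_pages_for_non_resident_write() above), copies
 * the user data in (ntfs_copy_from_user() or ntfs_copy_from_user_iovec()),
 * flushes the dcache (ntfs_flush_dcache_pages()) and finally commits the
 * write (ntfs_commit_pages_after_write()).
 */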
1309
1310/*
1311 * Copy as much as we can into the pages and return the number of bytes which
1312 * were successfully copied.  If a fault is encountered then clear the pages
1313 * out to (ofs + bytes) and return the number of bytes which were copied.
1314 */
1315static inline size_t ntfs_copy_from_user(struct page **pages,
1316 unsigned nr_pages, unsigned ofs, const char __user *buf,
1317 size_t bytes)
1318{
1319 struct page **last_page = pages + nr_pages;
1320 char *kaddr;
1321 size_t total = 0;
1322 unsigned len;
1323 int left;
1324
1325 do {
1326 len = PAGE_CACHE_SIZE - ofs;
1327 if (len > bytes)
1328 len = bytes;
1329 kaddr = kmap_atomic(*pages, KM_USER0);
1330 left = __copy_from_user_inatomic(kaddr + ofs, buf, len);
1331 kunmap_atomic(kaddr, KM_USER0);
1332 if (unlikely(left)) {
1333 /* Do it the slow way. */
1334 kaddr = kmap(*pages);
1335 left = __copy_from_user(kaddr + ofs, buf, len);
1336 kunmap(*pages);
1337 if (unlikely(left))
1338 goto err_out;
1339 }
1340 total += len;
1341 bytes -= len;
1342 if (!bytes)
1343 break;
1344 buf += len;
1345 ofs = 0;
1346 } while (++pages < last_page);
1347out:
1348 return total;
1349err_out:
1350 total += len - left;
1351 /* Zero the rest of the target like __copy_from_user(). */
1352 while (++pages < last_page) {
1353 bytes -= len;
1354 if (!bytes)
1355 break;
1356 len = PAGE_CACHE_SIZE;
1357 if (len > bytes)
1358 len = bytes;
1359 kaddr = kmap_atomic(*pages, KM_USER0);
1360 memset(kaddr, 0, len);
1361 kunmap_atomic(kaddr, KM_USER0);
1362 }
1363 goto out;
1364}
1365
1366static size_t __ntfs_copy_from_user_iovec(char *vaddr,
1367 const struct iovec *iov, size_t iov_ofs, size_t bytes)
1368{
1369 size_t total = 0;
1370
1371 while (1) {
1372 const char __user *buf = iov->iov_base + iov_ofs;
1373 unsigned len;
1374 size_t left;
1375
1376 len = iov->iov_len - iov_ofs;
1377 if (len > bytes)
1378 len = bytes;
1379 left = __copy_from_user_inatomic(vaddr, buf, len);
1380 total += len;
1381 bytes -= len;
1382 vaddr += len;
1383 if (unlikely(left)) {
1384 /*
1385 * Zero the rest of the target like __copy_from_user().
1386 */
1387 memset(vaddr, 0, bytes);
1388 total -= left;
1389 break;
1390 }
1391 if (!bytes)
1392 break;
1393 iov++;
1394 iov_ofs = 0;
1395 }
1396 return total;
1397}
1398
1399static inline void ntfs_set_next_iovec(const struct iovec **iovp,
1400 size_t *iov_ofsp, size_t bytes)
1401{
1402 const struct iovec *iov = *iovp;
1403 size_t iov_ofs = *iov_ofsp;
1404
1405 while (bytes) {
1406 unsigned len;
1407
1408 len = iov->iov_len - iov_ofs;
1409 if (len > bytes)
1410 len = bytes;
1411 bytes -= len;
1412 iov_ofs += len;
1413 if (iov->iov_len == iov_ofs) {
1414 iov++;
1415 iov_ofs = 0;
1416 }
1417 }
1418 *iovp = iov;
1419 *iov_ofsp = iov_ofs;
1420}
1421
1422/*
1423 * This has the same side-effects and return value as ntfs_copy_from_user().
1424 * The difference is that on a fault we need to memset the remainder of the
1425 * pages (out to offset + bytes), to emulate ntfs_copy_from_user()'s
1426 * single-segment behaviour.
1427 *
1428 * We call the same helper (__ntfs_copy_from_user_iovec()) both when atomic and
1429 * when not atomic. This is ok because __ntfs_copy_from_user_iovec() calls
1430 * __copy_from_user_inatomic() and it is ok to call this when non-atomic. In
1431 * fact, the only difference between __copy_from_user_inatomic() and
1432 * __copy_from_user() is that the latter calls might_sleep(). And on many
1433 * architectures __copy_from_user_inatomic() is just defined to
1434 * __copy_from_user() so it makes no difference at all on those architectures.
1435 */
1436static inline size_t ntfs_copy_from_user_iovec(struct page **pages,
1437 unsigned nr_pages, unsigned ofs, const struct iovec **iov,
1438 size_t *iov_ofs, size_t bytes)
1439{
1440 struct page **last_page = pages + nr_pages;
1441 char *kaddr;
1442 size_t copied, len, total = 0;
1443
1444 do {
1445 len = PAGE_CACHE_SIZE - ofs;
1446 if (len > bytes)
1447 len = bytes;
1448 kaddr = kmap_atomic(*pages, KM_USER0);
1449 copied = __ntfs_copy_from_user_iovec(kaddr + ofs,
1450 *iov, *iov_ofs, len);
1451 kunmap_atomic(kaddr, KM_USER0);
1452 if (unlikely(copied != len)) {
1453 /* Do it the slow way. */
1454 kaddr = kmap(*pages);
1455 copied = __ntfs_copy_from_user_iovec(kaddr + ofs,
1456 *iov, *iov_ofs, len);
1457 kunmap(*pages);
1458 if (unlikely(copied != len))
1459 goto err_out;
1460 }
1461 total += len;
1462 bytes -= len;
1463 if (!bytes)
1464 break;
1465 ntfs_set_next_iovec(iov, iov_ofs, len);
1466 ofs = 0;
1467 } while (++pages < last_page);
1468out:
1469 return total;
1470err_out:
1471 total += copied;
1472 /* Zero the rest of the target like __copy_from_user(). */
1473 while (++pages < last_page) {
1474 bytes -= len;
1475 if (!bytes)
1476 break;
1477 len = PAGE_CACHE_SIZE;
1478 if (len > bytes)
1479 len = bytes;
1480 kaddr = kmap_atomic(*pages, KM_USER0);
1481 memset(kaddr, 0, len);
1482 kunmap_atomic(kaddr, KM_USER0);
1483 }
1484 goto out;
1485}
1486
1487static inline void ntfs_flush_dcache_pages(struct page **pages,
1488 unsigned nr_pages)
1489{
1490 BUG_ON(!nr_pages);
1491 do {
1492		/*
1493		 * Warning: Do not do the decrement inside the call because
1494		 * flush_dcache_page() is a NULL macro on i386 and hence the
1495		 * decrement would never happen; index nr_pages - 1 to stay in bounds.
1496		 */
1497		flush_dcache_page(pages[nr_pages - 1]);
1498 } while (--nr_pages > 0);
1499}
1500
1501/**
1502 * ntfs_commit_pages_after_non_resident_write - commit the received data
1503 * @pages: array of destination pages
1504 * @nr_pages: number of pages in @pages
1505 * @pos: byte position in file at which the write begins
1506 * @bytes: number of bytes to be written
1507 *
1508 * See description of ntfs_commit_pages_after_write(), below.
1509 */
1510static inline int ntfs_commit_pages_after_non_resident_write(
1511 struct page **pages, const unsigned nr_pages,
1512 s64 pos, size_t bytes)
1513{
1514 s64 end, initialized_size;
1515 struct inode *vi;
1516 ntfs_inode *ni, *base_ni;
1517 struct buffer_head *bh, *head;
1518 ntfs_attr_search_ctx *ctx;
1519 MFT_RECORD *m;
1520 ATTR_RECORD *a;
1521 unsigned long flags;
1522 unsigned blocksize, u;
1523 int err;
1524
1525 vi = pages[0]->mapping->host;
1526 ni = NTFS_I(vi);
1527 blocksize = 1 << vi->i_blkbits;
1528 end = pos + bytes;
1529 u = 0;
1530 do {
1531 s64 bh_pos;
1532 struct page *page;
1533 BOOL partial;
1534
1535 page = pages[u];
1536 bh_pos = (s64)page->index << PAGE_CACHE_SHIFT;
1537 bh = head = page_buffers(page);
1538 partial = FALSE;
1539 do {
1540 s64 bh_end;
1541
1542 bh_end = bh_pos + blocksize;
1543 if (bh_end <= pos || bh_pos >= end) {
1544 if (!buffer_uptodate(bh))
1545 partial = TRUE;
1546 } else {
1547 set_buffer_uptodate(bh);
1548 mark_buffer_dirty(bh);
1549 }
1550 } while (bh_pos += blocksize, (bh = bh->b_this_page) != head);
1551 /*
1552 * If all buffers are now uptodate but the page is not, set the
1553 * page uptodate.
1554 */
1555 if (!partial && !PageUptodate(page))
1556 SetPageUptodate(page);
1557 } while (++u < nr_pages);
1558 /*
1559 * Finally, if we do not need to update initialized_size or i_size we
1560 * are finished.
1561 */
1562 read_lock_irqsave(&ni->size_lock, flags);
1563 initialized_size = ni->initialized_size;
1564 read_unlock_irqrestore(&ni->size_lock, flags);
1565 if (end <= initialized_size) {
1566 ntfs_debug("Done.");
1567 return 0;
1568 }
1569 /*
1570 * Update initialized_size/i_size as appropriate, both in the inode and
1571 * the mft record.
1572 */
1573 if (!NInoAttr(ni))
1574 base_ni = ni;
1575 else
1576 base_ni = ni->ext.base_ntfs_ino;
1577 /* Map, pin, and lock the mft record. */
1578 m = map_mft_record(base_ni);
1579 if (IS_ERR(m)) {
1580 err = PTR_ERR(m);
1581 m = NULL;
1582 ctx = NULL;
1583 goto err_out;
1584 }
1585 BUG_ON(!NInoNonResident(ni));
1586 ctx = ntfs_attr_get_search_ctx(base_ni, m);
1587 if (unlikely(!ctx)) {
1588 err = -ENOMEM;
1589 goto err_out;
1590 }
1591 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
1592 CASE_SENSITIVE, 0, NULL, 0, ctx);
1593 if (unlikely(err)) {
1594 if (err == -ENOENT)
1595 err = -EIO;
1596 goto err_out;
1597 }
1598 a = ctx->attr;
1599 BUG_ON(!a->non_resident);
1600 write_lock_irqsave(&ni->size_lock, flags);
1601 BUG_ON(end > ni->allocated_size);
1602 ni->initialized_size = end;
1603 a->data.non_resident.initialized_size = cpu_to_sle64(end);
1604 if (end > i_size_read(vi)) {
1605 i_size_write(vi, end);
1606 a->data.non_resident.data_size =
1607 a->data.non_resident.initialized_size;
1608 }
1609 write_unlock_irqrestore(&ni->size_lock, flags);
1610 /* Mark the mft record dirty, so it gets written back. */
1611 flush_dcache_mft_record_page(ctx->ntfs_ino);
1612 mark_mft_record_dirty(ctx->ntfs_ino);
1613 ntfs_attr_put_search_ctx(ctx);
1614 unmap_mft_record(base_ni);
1615 ntfs_debug("Done.");
1616 return 0;
1617err_out:
1618 if (ctx)
1619 ntfs_attr_put_search_ctx(ctx);
1620 if (m)
1621 unmap_mft_record(base_ni);
1622 ntfs_error(vi->i_sb, "Failed to update initialized_size/i_size (error "
1623 "code %i).", err);
1624 if (err != -ENOMEM) {
1625 NVolSetErrors(ni->vol);
1626 make_bad_inode(VFS_I(base_ni));
1627 make_bad_inode(vi);
1628 }
1629 return err;
1630}
1631
1632/**
1633 * ntfs_commit_pages_after_write - commit the received data
1634 * @pages: array of destination pages
1635 * @nr_pages: number of pages in @pages
1636 * @pos: byte position in file at which the write begins
1637 * @bytes: number of bytes to be written
1638 *
1639 * This is called from ntfs_file_buffered_write() with i_sem held on the inode
1640 * (@pages[0]->mapping->host). There are @nr_pages pages in @pages which are
1641 * locked but not kmap()ped. The source data has already been copied into the
1642 * @pages.  ntfs_prepare_pages_for_non_resident_write() has been called before
1643 * the data was copied (for non-resident attributes only) and it returned
1644 * success.
1645 *
1646 * Need to set uptodate and mark dirty all buffers within the boundary of the
1647 * write. If all buffers in a page are uptodate we set the page uptodate, too.
1648 *
1649 * Setting the buffers dirty ensures that they get written out later when
1650 * ntfs_writepage() is invoked by the VM.
1651 *
1652 * Finally, we need to update i_size and initialized_size as appropriate both
1653 * in the inode and the mft record.
1654 *
1655 * This is modelled after fs/buffer.c::generic_commit_write(), which marks
1656 * buffers uptodate and dirty, sets the page uptodate if all buffers in the
1657 * page are uptodate, and updates i_size if the end of io is beyond i_size. In
1658 * that case, it also marks the inode dirty.
1659 *
1660 * If things have gone as outlined in
1661 * ntfs_prepare_pages_for_non_resident_write(), we do not need to do any page
1662 * content modifications here for non-resident attributes. For resident
1663 * attributes we need to do the uptodate bringing here which we combine with
1664 * the copying into the mft record which means we save one atomic kmap.
1665 *
1666 * Return 0 on success or -errno on error.
1667 */
1668static int ntfs_commit_pages_after_write(struct page **pages,
1669 const unsigned nr_pages, s64 pos, size_t bytes)
1670{
1671 s64 end, initialized_size;
1672 loff_t i_size;
1673 struct inode *vi;
1674 ntfs_inode *ni, *base_ni;
1675 struct page *page;
1676 ntfs_attr_search_ctx *ctx;
1677 MFT_RECORD *m;
1678 ATTR_RECORD *a;
1679 char *kattr, *kaddr;
1680 unsigned long flags;
1681 u32 attr_len;
1682 int err;
1683
1684 BUG_ON(!nr_pages);
1685 BUG_ON(!pages);
1686 page = pages[0];
1687 BUG_ON(!page);
1688 vi = page->mapping->host;
1689 ni = NTFS_I(vi);
1690 ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, start page "
1691 "index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%zx.",
1692 vi->i_ino, ni->type, page->index, nr_pages,
1693 (long long)pos, bytes);
1694 if (NInoNonResident(ni))
1695 return ntfs_commit_pages_after_non_resident_write(pages,
1696 nr_pages, pos, bytes);
1697 BUG_ON(nr_pages > 1);
1698 /*
1699 * Attribute is resident, implying it is not compressed, encrypted, or
1700 * sparse.
1701 */
1702 if (!NInoAttr(ni))
1703 base_ni = ni;
1704 else
1705 base_ni = ni->ext.base_ntfs_ino;
1706 BUG_ON(NInoNonResident(ni));
1707 /* Map, pin, and lock the mft record. */
1708 m = map_mft_record(base_ni);
1709 if (IS_ERR(m)) {
1710 err = PTR_ERR(m);
1711 m = NULL;
1712 ctx = NULL;
1713 goto err_out;
1714 }
1715 ctx = ntfs_attr_get_search_ctx(base_ni, m);
1716 if (unlikely(!ctx)) {
1717 err = -ENOMEM;
1718 goto err_out;
1719 }
1720 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
1721 CASE_SENSITIVE, 0, NULL, 0, ctx);
1722 if (unlikely(err)) {
1723 if (err == -ENOENT)
1724 err = -EIO;
1725 goto err_out;
1726 }
1727 a = ctx->attr;
1728 BUG_ON(a->non_resident);
1729 /* The total length of the attribute value. */
1730 attr_len = le32_to_cpu(a->data.resident.value_length);
1731 i_size = i_size_read(vi);
1732 BUG_ON(attr_len != i_size);
1733 BUG_ON(pos > attr_len);
1734 end = pos + bytes;
1735 BUG_ON(end > le32_to_cpu(a->length) -
1736 le16_to_cpu(a->data.resident.value_offset));
1737 kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset);
1738 kaddr = kmap_atomic(page, KM_USER0);
1739 /* Copy the received data from the page to the mft record. */
1740 memcpy(kattr + pos, kaddr + pos, bytes);
1741 /* Update the attribute length if necessary. */
1742 if (end > attr_len) {
1743 attr_len = end;
1744 a->data.resident.value_length = cpu_to_le32(attr_len);
1745 }
1746 /*
1747 * If the page is not uptodate, bring the out of bounds area(s)
1748 * uptodate by copying data from the mft record to the page.
1749 */
1750 if (!PageUptodate(page)) {
1751 if (pos > 0)
1752 memcpy(kaddr, kattr, pos);
1753 if (end < attr_len)
1754 memcpy(kaddr + end, kattr + end, attr_len - end);
1755 /* Zero the region outside the end of the attribute value. */
1756 memset(kaddr + attr_len, 0, PAGE_CACHE_SIZE - attr_len);
1757 flush_dcache_page(page);
1758 SetPageUptodate(page);
1759 }
1760 kunmap_atomic(kaddr, KM_USER0);
1761 /* Update initialized_size/i_size if necessary. */
1762 read_lock_irqsave(&ni->size_lock, flags);
1763 initialized_size = ni->initialized_size;
1764 BUG_ON(end > ni->allocated_size);
1765 read_unlock_irqrestore(&ni->size_lock, flags);
1766 BUG_ON(initialized_size != i_size);
1767 if (end > initialized_size) {
1768 unsigned long flags;
1769
1770 write_lock_irqsave(&ni->size_lock, flags);
1771 ni->initialized_size = end;
1772 i_size_write(vi, end);
1773 write_unlock_irqrestore(&ni->size_lock, flags);
1774 }
1775 /* Mark the mft record dirty, so it gets written back. */
1776 flush_dcache_mft_record_page(ctx->ntfs_ino);
1777 mark_mft_record_dirty(ctx->ntfs_ino);
1778 ntfs_attr_put_search_ctx(ctx);
1779 unmap_mft_record(base_ni);
1780 ntfs_debug("Done.");
1781 return 0;
1782err_out:
1783 if (err == -ENOMEM) {
1784 ntfs_warning(vi->i_sb, "Error allocating memory required to "
1785 "commit the write.");
1786 if (PageUptodate(page)) {
1787 ntfs_warning(vi->i_sb, "Page is uptodate, setting "
1788 "dirty so the write will be retried "
1789 "later on by the VM.");
1790 /*
1791 * Put the page on mapping->dirty_pages, but leave its
1792 * buffers' dirty state as-is.
1793 */
1794 __set_page_dirty_nobuffers(page);
1795 err = 0;
1796 } else
1797 ntfs_error(vi->i_sb, "Page is not uptodate. Written "
1798 "data has been lost.");
1799 } else {
1800 ntfs_error(vi->i_sb, "Resident attribute commit write failed "
1801 "with error %i.", err);
1802 NVolSetErrors(ni->vol);
1803 make_bad_inode(VFS_I(base_ni));
1804 make_bad_inode(vi);
1805 }
1806 if (ctx)
1807 ntfs_attr_put_search_ctx(ctx);
1808 if (m)
1809 unmap_mft_record(base_ni);
1810 return err;
1811}
1812
1813/**
1814 * ntfs_file_buffered_write -
1815 *
1816 * Locking: The vfs is holding ->i_sem on the inode.
1817 */
1818static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
1819 const struct iovec *iov, unsigned long nr_segs,
1820 loff_t pos, loff_t *ppos, size_t count)
1821{
1822 struct file *file = iocb->ki_filp;
1823 struct address_space *mapping = file->f_mapping;
1824 struct inode *vi = mapping->host;
1825 ntfs_inode *ni = NTFS_I(vi);
1826 ntfs_volume *vol = ni->vol;
1827 struct page *pages[NTFS_MAX_PAGES_PER_CLUSTER];
1828 struct page *cached_page = NULL;
1829 char __user *buf = NULL;
1830 s64 end, ll;
1831 VCN last_vcn;
1832 LCN lcn;
1833 unsigned long flags;
1834 size_t bytes, iov_ofs = 0; /* Offset in the current iovec. */
1835 ssize_t status, written;
1836 unsigned nr_pages;
1837 int err;
1838 struct pagevec lru_pvec;
1839
1840 ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, "
1841 "pos 0x%llx, count 0x%lx.",
1842 vi->i_ino, (unsigned)le32_to_cpu(ni->type),
1843 (unsigned long long)pos, (unsigned long)count);
1844 if (unlikely(!count))
1845 return 0;
1846 BUG_ON(NInoMstProtected(ni));
1847 /*
1848 * If the attribute is not an index root and it is encrypted or
1849 * compressed, we cannot write to it yet. Note we need to check for
1850 * AT_INDEX_ALLOCATION since this is the type of both directory and
1851 * index inodes.
1852 */
1853 if (ni->type != AT_INDEX_ALLOCATION) {
1854 /* If file is encrypted, deny access, just like NT4. */
1855 if (NInoEncrypted(ni)) {
1856 /*
1857 * Reminder for later: Encrypted files are _always_
1858 * non-resident so that the content can always be
1859 * encrypted.
1860 */
1861 ntfs_debug("Denying write access to encrypted file.");
1862 return -EACCES;
1863 }
1864 if (NInoCompressed(ni)) {
1865 /* Only unnamed $DATA attribute can be compressed. */
1866 BUG_ON(ni->type != AT_DATA);
1867 BUG_ON(ni->name_len);
1868 /*
1869 * Reminder for later: If resident, the data is not
1870 * actually compressed. Only on the switch to non-
1871 * resident does compression kick in. This is in
1872 * contrast to encrypted files (see above).
1873 */
1874 ntfs_error(vi->i_sb, "Writing to compressed files is "
1875 "not implemented yet. Sorry.");
1876 return -EOPNOTSUPP;
1877 }
1878 }
1879 /*
1880 * If a previous ntfs_truncate() failed, repeat it and abort if it
1881 * fails again.
1882 */
1883 if (unlikely(NInoTruncateFailed(ni))) {
1884 down_write(&vi->i_alloc_sem);
1885 err = ntfs_truncate(vi);
1886 up_write(&vi->i_alloc_sem);
1887 if (err || NInoTruncateFailed(ni)) {
1888 if (!err)
1889 err = -EIO;
1890 ntfs_error(vol->sb, "Cannot perform write to inode "
1891 "0x%lx, attribute type 0x%x, because "
1892 "ntfs_truncate() failed (error code "
1893 "%i).", vi->i_ino,
1894 (unsigned)le32_to_cpu(ni->type), err);
1895 return err;
1896 }
1897 }
1898 /* The first byte after the write. */
1899 end = pos + count;
1900 /*
1901 * If the write goes beyond the allocated size, extend the allocation
1902 * to cover the whole of the write, rounded up to the nearest cluster.
1903 */
1904 read_lock_irqsave(&ni->size_lock, flags);
1905 ll = ni->allocated_size;
1906 read_unlock_irqrestore(&ni->size_lock, flags);
1907 if (end > ll) {
1908 /* Extend the allocation without changing the data size. */
1909 ll = ntfs_attr_extend_allocation(ni, end, -1, pos);
1910 if (likely(ll >= 0)) {
1911 BUG_ON(pos >= ll);
1912 /* If the extension was partial truncate the write. */
1913 if (end > ll) {
1914 ntfs_debug("Truncating write to inode 0x%lx, "
1915 "attribute type 0x%x, because "
1916 "the allocation was only "
1917 "partially extended.",
1918 vi->i_ino, (unsigned)
1919 le32_to_cpu(ni->type));
1920 end = ll;
1921 count = ll - pos;
1922 }
1923 } else {
1924 err = ll;
1925 read_lock_irqsave(&ni->size_lock, flags);
1926 ll = ni->allocated_size;
1927 read_unlock_irqrestore(&ni->size_lock, flags);
1928 /* Perform a partial write if possible or fail. */
1929 if (pos < ll) {
1930 ntfs_debug("Truncating write to inode 0x%lx, "
1931 "attribute type 0x%x, because "
1932 "extending the allocation "
1933 "failed (error code %i).",
1934 vi->i_ino, (unsigned)
1935 le32_to_cpu(ni->type), err);
1936 end = ll;
1937 count = ll - pos;
1938 } else {
1939 ntfs_error(vol->sb, "Cannot perform write to "
1940 "inode 0x%lx, attribute type "
1941 "0x%x, because extending the "
1942 "allocation failed (error "
1943 "code %i).", vi->i_ino,
1944 (unsigned)
1945 le32_to_cpu(ni->type), err);
1946 return err;
1947 }
1948 }
1949 }
1950 pagevec_init(&lru_pvec, 0);
1951 written = 0;
1952 /*
1953 * If the write starts beyond the initialized size, extend it up to the
1954 * beginning of the write and initialize all non-sparse space between
1955 * the old initialized size and the new one. This automatically also
1956 * increments the vfs inode->i_size to keep it above or equal to the
1957 * initialized_size.
1958 */
1959 read_lock_irqsave(&ni->size_lock, flags);
1960 ll = ni->initialized_size;
1961 read_unlock_irqrestore(&ni->size_lock, flags);
1962 if (pos > ll) {
1963 err = ntfs_attr_extend_initialized(ni, pos, &cached_page,
1964 &lru_pvec);
1965 if (err < 0) {
1966 ntfs_error(vol->sb, "Cannot perform write to inode "
1967 "0x%lx, attribute type 0x%x, because "
1968 "extending the initialized size "
1969 "failed (error code %i).", vi->i_ino,
1970 (unsigned)le32_to_cpu(ni->type), err);
1971 status = err;
1972 goto err_out;
1973 }
1974 }
1975 /*
1976 * Determine the number of pages per cluster for non-resident
1977 * attributes.
1978 */
1979 nr_pages = 1;
1980 if (vol->cluster_size > PAGE_CACHE_SIZE && NInoNonResident(ni))
1981 nr_pages = vol->cluster_size >> PAGE_CACHE_SHIFT;
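	/*
	 * For example (illustrative numbers, assuming 4kiB pages): a volume
	 * with 64kiB clusters gives nr_pages = 0x10000 >> 12 = 16, i.e. a
	 * hole has to be prepared a whole cluster, 16 pages, at a time.
	 */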
1982 /* Finally, perform the actual write. */
1983 last_vcn = -1;
1984 if (likely(nr_segs == 1))
1985 buf = iov->iov_base;
1986 do {
1987 VCN vcn;
1988 pgoff_t idx, start_idx;
1989 unsigned ofs, do_pages, u;
1990 size_t copied;
1991
1992 start_idx = idx = pos >> PAGE_CACHE_SHIFT;
1993 ofs = pos & ~PAGE_CACHE_MASK;
1994 bytes = PAGE_CACHE_SIZE - ofs;
1995 do_pages = 1;
1996 if (nr_pages > 1) {
1997 vcn = pos >> vol->cluster_size_bits;
1998 if (vcn != last_vcn) {
1999 last_vcn = vcn;
2000 /*
2001 * Get the lcn of the vcn the write is in. If
2002 * it is a hole, need to lock down all pages in
2003 * the cluster.
2004 */
2005 down_read(&ni->runlist.lock);
2006 lcn = ntfs_attr_vcn_to_lcn_nolock(ni, pos >>
2007 vol->cluster_size_bits, FALSE);
2008 up_read(&ni->runlist.lock);
2009 if (unlikely(lcn < LCN_HOLE)) {
2010 status = -EIO;
2011 if (lcn == LCN_ENOMEM)
2012 status = -ENOMEM;
2013 else
2014 ntfs_error(vol->sb, "Cannot "
2015 "perform write to "
2016 "inode 0x%lx, "
2017 "attribute type 0x%x, "
2018 "because the attribute "
2019 "is corrupt.",
2020 vi->i_ino, (unsigned)
2021 le32_to_cpu(ni->type));
2022 break;
2023 }
2024 if (lcn == LCN_HOLE) {
2025 start_idx = (pos & ~(s64)
2026 vol->cluster_size_mask)
2027 >> PAGE_CACHE_SHIFT;
2028 bytes = vol->cluster_size - (pos &
2029 vol->cluster_size_mask);
2030 do_pages = nr_pages;
2031 }
2032 }
2033 }
2034 if (bytes > count)
2035 bytes = count;
2036 /*
2037 * Bring in the user page(s) that we will copy from _first_.
2038 * Otherwise there is a nasty deadlock on copying from the same
2039 * page(s) as we are writing to, without it/them being marked
2040 * up-to-date. Note, at present there is nothing to stop the
2041 * pages being swapped out between us bringing them into memory
2042 * and doing the actual copying.
2043 */
2044 if (likely(nr_segs == 1))
2045 ntfs_fault_in_pages_readable(buf, bytes);
2046 else
2047 ntfs_fault_in_pages_readable_iovec(iov, iov_ofs, bytes);
2048 /* Get and lock @do_pages starting at index @start_idx. */
2049 status = __ntfs_grab_cache_pages(mapping, start_idx, do_pages,
2050 pages, &cached_page, &lru_pvec);
2051 if (unlikely(status))
2052 break;
2053 /*
2054 * For non-resident attributes, we need to fill any holes with
2055		 * actual clusters and ensure all buffers are mapped.  We also
2056 * need to bring uptodate any buffers that are only partially
2057 * being written to.
2058 */
2059 if (NInoNonResident(ni)) {
2060 status = ntfs_prepare_pages_for_non_resident_write(
2061 pages, do_pages, pos, bytes);
2062 if (unlikely(status)) {
2063 loff_t i_size;
2064
2065 do {
2066 unlock_page(pages[--do_pages]);
2067 page_cache_release(pages[do_pages]);
2068 } while (do_pages);
2069 /*
2070 * The write preparation may have instantiated
2071 * allocated space outside i_size. Trim this
2072 * off again. We can ignore any errors in this
2073			 * case as we will just be wasting a bit of
2074 * allocated space, which is not a disaster.
2075 */
2076 i_size = i_size_read(vi);
2077 if (pos + bytes > i_size)
2078 vmtruncate(vi, i_size);
2079 break;
2080 }
2081 }
2082 u = (pos >> PAGE_CACHE_SHIFT) - pages[0]->index;
2083 if (likely(nr_segs == 1)) {
2084 copied = ntfs_copy_from_user(pages + u, do_pages - u,
2085 ofs, buf, bytes);
2086 buf += copied;
2087 } else
2088 copied = ntfs_copy_from_user_iovec(pages + u,
2089 do_pages - u, ofs, &iov, &iov_ofs,
2090 bytes);
2091 ntfs_flush_dcache_pages(pages + u, do_pages - u);
2092 status = ntfs_commit_pages_after_write(pages, do_pages, pos,
2093 bytes);
2094 if (likely(!status)) {
2095 written += copied;
2096 count -= copied;
2097 pos += copied;
2098 if (unlikely(copied != bytes))
2099 status = -EFAULT;
2100 }
2101 do {
2102 unlock_page(pages[--do_pages]);
2103 mark_page_accessed(pages[do_pages]);
2104 page_cache_release(pages[do_pages]);
2105 } while (do_pages);
2106 if (unlikely(status))
2107 break;
2108 balance_dirty_pages_ratelimited(mapping);
2109 cond_resched();
2110 } while (count);
2111err_out:
2112 *ppos = pos;
2113 if (cached_page)
2114 page_cache_release(cached_page);
2115 /* For now, when the user asks for O_SYNC, we actually give O_DSYNC. */
2116 if (likely(!status)) {
2117 if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(vi))) {
2118 if (!mapping->a_ops->writepage || !is_sync_kiocb(iocb))
2119 status = generic_osync_inode(vi, mapping,
2120 OSYNC_METADATA|OSYNC_DATA);
2121 }
2122 }
2123 pagevec_lru_add(&lru_pvec);
2124 ntfs_debug("Done. Returning %s (written 0x%lx, status %li).",
2125 written ? "written" : "status", (unsigned long)written,
2126 (long)status);
2127 return written ? written : status;
2128}
2129
2130/**
2131 * ntfs_file_aio_write_nolock -
2132 */
2133static ssize_t ntfs_file_aio_write_nolock(struct kiocb *iocb,
2134 const struct iovec *iov, unsigned long nr_segs, loff_t *ppos)
2135{
2136 struct file *file = iocb->ki_filp;
2137 struct address_space *mapping = file->f_mapping;
2138 struct inode *inode = mapping->host;
2139 loff_t pos;
2140 unsigned long seg;
2141 size_t count; /* after file limit checks */
2142 ssize_t written, err;
2143
2144 count = 0;
2145 for (seg = 0; seg < nr_segs; seg++) {
2146 const struct iovec *iv = &iov[seg];
2147 /*
2148 * If any segment has a negative length, or the cumulative
2149 * length ever wraps negative then return -EINVAL.
2150 */
2151 count += iv->iov_len;
2152 if (unlikely((ssize_t)(count|iv->iov_len) < 0))
2153 return -EINVAL;
2154 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
2155 continue;
2156 if (!seg)
2157 return -EFAULT;
2158 nr_segs = seg;
2159 count -= iv->iov_len; /* This segment is no good */
2160 break;
2161 }
2162 pos = *ppos;
2163 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
2164 /* We can write back this queue in page reclaim. */
2165 current->backing_dev_info = mapping->backing_dev_info;
2166 written = 0;
2167 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
2168 if (err)
2169 goto out;
2170 if (!count)
2171 goto out;
2172 err = remove_suid(file->f_dentry);
2173 if (err)
2174 goto out;
2175 inode_update_time(inode, 1);
2176 written = ntfs_file_buffered_write(iocb, iov, nr_segs, pos, ppos,
2177 count);
2178out:
2179 current->backing_dev_info = NULL;
2180 return written ? written : err;
2181}
2182
2183/**
2184 * ntfs_file_aio_write -
2185 */
2186static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const char __user *buf,
2187 size_t count, loff_t pos)
2188{
2189 struct file *file = iocb->ki_filp;
2190 struct address_space *mapping = file->f_mapping;
2191 struct inode *inode = mapping->host;
2192 ssize_t ret;
2193 struct iovec local_iov = { .iov_base = (void __user *)buf,
2194 .iov_len = count };
2195
2196 BUG_ON(iocb->ki_pos != pos);
2197
2198 down(&inode->i_sem);
2199 ret = ntfs_file_aio_write_nolock(iocb, &local_iov, 1, &iocb->ki_pos);
2200 up(&inode->i_sem);
2201 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2202 int err = sync_page_range(inode, mapping, pos, ret);
2203 if (err < 0)
2204 ret = err;
2205 }
2206 return ret;
2207}
2208
2209/**
2210 * ntfs_file_writev -
2211 *
2212 * Basically the same as generic_file_writev() except that it ends up calling
2213 * ntfs_file_aio_write_nolock() instead of __generic_file_aio_write_nolock().
2214 */
2215static ssize_t ntfs_file_writev(struct file *file, const struct iovec *iov,
2216 unsigned long nr_segs, loff_t *ppos)
2217{
2218 struct address_space *mapping = file->f_mapping;
2219 struct inode *inode = mapping->host;
2220 struct kiocb kiocb;
2221 ssize_t ret;
2222
2223 down(&inode->i_sem);
2224 init_sync_kiocb(&kiocb, file);
2225 ret = ntfs_file_aio_write_nolock(&kiocb, iov, nr_segs, ppos);
2226 if (ret == -EIOCBQUEUED)
2227 ret = wait_on_sync_kiocb(&kiocb);
2228 up(&inode->i_sem);
2229 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2230 int err = sync_page_range(inode, mapping, *ppos - ret, ret);
2231 if (err < 0)
2232 ret = err;
2233 }
2234 return ret;
2235}
2236
2237/**
2238 * ntfs_file_write - simple wrapper for ntfs_file_writev()
2239 */
2240static ssize_t ntfs_file_write(struct file *file, const char __user *buf,
2241 size_t count, loff_t *ppos)
2242{
2243 struct iovec local_iov = { .iov_base = (void __user *)buf,
2244 .iov_len = count };
2245
2246 return ntfs_file_writev(file, &local_iov, 1, ppos);
2247}
2248
2249/**
59 * ntfs_file_fsync - sync a file to disk 2250 * ntfs_file_fsync - sync a file to disk
60 * @filp: file to be synced 2251 * @filp: file to be synced
61 * @dentry: dentry describing the file to sync 2252 * @dentry: dentry describing the file to sync
@@ -113,39 +2304,39 @@ static int ntfs_file_fsync(struct file *filp, struct dentry *dentry,
113#endif /* NTFS_RW */ 2304#endif /* NTFS_RW */
114 2305
115struct file_operations ntfs_file_ops = { 2306struct file_operations ntfs_file_ops = {
116 .llseek = generic_file_llseek, /* Seek inside file. */ 2307 .llseek = generic_file_llseek, /* Seek inside file. */
117 .read = generic_file_read, /* Read from file. */ 2308 .read = generic_file_read, /* Read from file. */
118 .aio_read = generic_file_aio_read, /* Async read from file. */ 2309 .aio_read = generic_file_aio_read, /* Async read from file. */
119 .readv = generic_file_readv, /* Read from file. */ 2310 .readv = generic_file_readv, /* Read from file. */
120#ifdef NTFS_RW 2311#ifdef NTFS_RW
121 .write = generic_file_write, /* Write to file. */ 2312 .write = ntfs_file_write, /* Write to file. */
122 .aio_write = generic_file_aio_write, /* Async write to file. */ 2313 .aio_write = ntfs_file_aio_write, /* Async write to file. */
123 .writev = generic_file_writev, /* Write to file. */ 2314 .writev = ntfs_file_writev, /* Write to file. */
124 /*.release = ,*/ /* Last file is closed. See 2315 /*.release = ,*/ /* Last file is closed. See
125 fs/ext2/file.c:: 2316 fs/ext2/file.c::
126 ext2_release_file() for 2317 ext2_release_file() for
127 how to use this to discard 2318 how to use this to discard
128 preallocated space for 2319 preallocated space for
129 write opened files. */ 2320 write opened files. */
130 .fsync = ntfs_file_fsync, /* Sync a file to disk. */ 2321 .fsync = ntfs_file_fsync, /* Sync a file to disk. */
131 /*.aio_fsync = ,*/ /* Sync all outstanding async 2322 /*.aio_fsync = ,*/ /* Sync all outstanding async
132 i/o operations on a 2323 i/o operations on a
133 kiocb. */ 2324 kiocb. */
134#endif /* NTFS_RW */ 2325#endif /* NTFS_RW */
135 /*.ioctl = ,*/ /* Perform function on the 2326 /*.ioctl = ,*/ /* Perform function on the
136 mounted filesystem. */ 2327 mounted filesystem. */
137 .mmap = generic_file_mmap, /* Mmap file. */ 2328 .mmap = generic_file_mmap, /* Mmap file. */
138 .open = ntfs_file_open, /* Open file. */ 2329 .open = ntfs_file_open, /* Open file. */
139 .sendfile = generic_file_sendfile, /* Zero-copy data send with 2330 .sendfile = generic_file_sendfile, /* Zero-copy data send with
140 the data source being on 2331 the data source being on
141 the ntfs partition. We 2332 the ntfs partition. We do
142 do not need to care about 2333 not need to care about the
143 the data destination. */ 2334 data destination. */
144 /*.sendpage = ,*/ /* Zero-copy data send with 2335 /*.sendpage = ,*/ /* Zero-copy data send with
145 the data destination being 2336 the data destination being
146 on the ntfs partition. We 2337 on the ntfs partition. We
147 do not need to care about 2338 do not need to care about
148 the data source. */ 2339 the data source. */
149}; 2340};
150 2341
151struct inode_operations ntfs_file_inode_ops = { 2342struct inode_operations ntfs_file_inode_ops = {
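The segment-validation loop in ntfs_file_aio_write_nolock() above packs two checks into one test: OR-ing the running total with the current iov_len and casting to ssize_t goes negative if either a single segment length or the cumulative count wraps past SSIZE_MAX. A minimal user-space sketch of the same check (check_iov is an invented helper, not kernel API):

#include <stdio.h>
#include <sys/types.h>
#include <sys/uio.h>

static ssize_t check_iov(const struct iovec *iov, unsigned long nr_segs)
{
	size_t count = 0;
	unsigned long seg;

	for (seg = 0; seg < nr_segs; seg++) {
		count += iov[seg].iov_len;
		/* Negative if this segment or the running total wraps. */
		if ((ssize_t)(count | iov[seg].iov_len) < 0)
			return -1;
	}
	return (ssize_t)count;
}

int main(void)
{
	struct iovec iov[2] = {
		{ .iov_base = NULL, .iov_len = 16 },
		{ .iov_base = NULL, .iov_len = (size_t)-1 }, /* wraps */
	};

	printf("%zd\n", check_iov(iov, 2)); /* prints -1 */
	return 0;
}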
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 7ec045131808..b24f4c4b2c5c 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -30,6 +30,7 @@
30#include "debug.h" 30#include "debug.h"
31#include "inode.h" 31#include "inode.h"
32#include "attrib.h" 32#include "attrib.h"
33#include "lcnalloc.h"
33#include "malloc.h" 34#include "malloc.h"
34#include "mft.h" 35#include "mft.h"
35#include "time.h" 36#include "time.h"
@@ -2291,11 +2292,16 @@ int ntfs_show_options(struct seq_file *sf, struct vfsmount *mnt)
2291 2292
2292#ifdef NTFS_RW 2293#ifdef NTFS_RW
2293 2294
2295static const char *es = " Leaving inconsistent metadata. Unmount and run "
2296 "chkdsk.";
2297
2294/** 2298/**
2295 * ntfs_truncate - called when the i_size of an ntfs inode is changed 2299 * ntfs_truncate - called when the i_size of an ntfs inode is changed
2296 * @vi: inode for which the i_size was changed 2300 * @vi: inode for which the i_size was changed
2297 * 2301 *
2298 * We do not support i_size changes yet. 2302 * We only support i_size changes for normal files at present, i.e. not
2303 * compressed and not encrypted. This is enforced in ntfs_setattr(), see
2304 * below.
2299 * 2305 *
2300 * The kernel guarantees that @vi is a regular file (S_ISREG() is true) and 2306 * The kernel guarantees that @vi is a regular file (S_ISREG() is true) and
2301 * that the change is allowed. 2307 * that the change is allowed.
@@ -2306,80 +2312,499 @@ int ntfs_show_options(struct seq_file *sf, struct vfsmount *mnt)
2306 * Returns 0 on success or -errno on error. 2312 * Returns 0 on success or -errno on error.
2307 * 2313 *
2308 * Called with ->i_sem held. In all but one case ->i_alloc_sem is held for 2314 * Called with ->i_sem held. In all but one case ->i_alloc_sem is held for
2309 * writing. The only case where ->i_alloc_sem is not held is 2315 * writing. The only case in the kernel where ->i_alloc_sem is not held is
2310 * mm/filemap.c::generic_file_buffered_write() where vmtruncate() is called 2316 * mm/filemap.c::generic_file_buffered_write() where vmtruncate() is called
2311 * with the current i_size as the offset which means that it is a noop as far 2317 * with the current i_size as the offset. The analogous place in NTFS is in
2312 * as ntfs_truncate() is concerned. 2318 * fs/ntfs/file.c::ntfs_file_buffered_write() where we call vmtruncate() again
2319 * without holding ->i_alloc_sem.
2313 */ 2320 */
2314int ntfs_truncate(struct inode *vi) 2321int ntfs_truncate(struct inode *vi)
2315{ 2322{
2316 ntfs_inode *ni = NTFS_I(vi); 2323 s64 new_size, old_size, nr_freed, new_alloc_size, old_alloc_size;
2324 VCN highest_vcn;
2325 unsigned long flags;
2326 ntfs_inode *base_ni, *ni = NTFS_I(vi);
2317 ntfs_volume *vol = ni->vol; 2327 ntfs_volume *vol = ni->vol;
2318 ntfs_attr_search_ctx *ctx; 2328 ntfs_attr_search_ctx *ctx;
2319 MFT_RECORD *m; 2329 MFT_RECORD *m;
2320 ATTR_RECORD *a; 2330 ATTR_RECORD *a;
2321 const char *te = " Leaving file length out of sync with i_size."; 2331 const char *te = " Leaving file length out of sync with i_size.";
2322 int err; 2332 int err, mp_size, size_change, alloc_change;
2333 u32 attr_len;
2323 2334
2324 ntfs_debug("Entering for inode 0x%lx.", vi->i_ino); 2335 ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);
2325 BUG_ON(NInoAttr(ni)); 2336 BUG_ON(NInoAttr(ni));
2337 BUG_ON(S_ISDIR(vi->i_mode));
2338 BUG_ON(NInoMstProtected(ni));
2326 BUG_ON(ni->nr_extents < 0); 2339 BUG_ON(ni->nr_extents < 0);
2327 m = map_mft_record(ni); 2340retry_truncate:
2341 /*
2342 * Lock the runlist for writing and map the mft record to ensure it is
2343 * safe to mess with the attribute runlist and sizes.
2344 */
2345 down_write(&ni->runlist.lock);
2346 if (!NInoAttr(ni))
2347 base_ni = ni;
2348 else
2349 base_ni = ni->ext.base_ntfs_ino;
2350 m = map_mft_record(base_ni);
2328 if (IS_ERR(m)) { 2351 if (IS_ERR(m)) {
2329 err = PTR_ERR(m); 2352 err = PTR_ERR(m);
2330 ntfs_error(vi->i_sb, "Failed to map mft record for inode 0x%lx " 2353 ntfs_error(vi->i_sb, "Failed to map mft record for inode 0x%lx "
2331 "(error code %d).%s", vi->i_ino, err, te); 2354 "(error code %d).%s", vi->i_ino, err, te);
2332 ctx = NULL; 2355 ctx = NULL;
2333 m = NULL; 2356 m = NULL;
2334 goto err_out; 2357 goto old_bad_out;
2335 } 2358 }
2336 ctx = ntfs_attr_get_search_ctx(ni, m); 2359 ctx = ntfs_attr_get_search_ctx(base_ni, m);
2337 if (unlikely(!ctx)) { 2360 if (unlikely(!ctx)) {
2338 ntfs_error(vi->i_sb, "Failed to allocate a search context for " 2361 ntfs_error(vi->i_sb, "Failed to allocate a search context for "
2339 "inode 0x%lx (not enough memory).%s", 2362 "inode 0x%lx (not enough memory).%s",
2340 vi->i_ino, te); 2363 vi->i_ino, te);
2341 err = -ENOMEM; 2364 err = -ENOMEM;
2342 goto err_out; 2365 goto old_bad_out;
2343 } 2366 }
2344 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, 2367 err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
2345 CASE_SENSITIVE, 0, NULL, 0, ctx); 2368 CASE_SENSITIVE, 0, NULL, 0, ctx);
2346 if (unlikely(err)) { 2369 if (unlikely(err)) {
2347 if (err == -ENOENT) 2370 if (err == -ENOENT) {
2348 ntfs_error(vi->i_sb, "Open attribute is missing from " 2371 ntfs_error(vi->i_sb, "Open attribute is missing from "
2349 "mft record. Inode 0x%lx is corrupt. " 2372 "mft record. Inode 0x%lx is corrupt. "
2350 "Run chkdsk.", vi->i_ino); 2373 "Run chkdsk.%s", vi->i_ino, te);
2351 else 2374 err = -EIO;
2375 } else
2352 ntfs_error(vi->i_sb, "Failed to lookup attribute in " 2376 ntfs_error(vi->i_sb, "Failed to lookup attribute in "
2353 "inode 0x%lx (error code %d).", 2377 "inode 0x%lx (error code %d).%s",
2354 vi->i_ino, err); 2378 vi->i_ino, err, te);
2355 goto err_out; 2379 goto old_bad_out;
2356 } 2380 }
2381 m = ctx->mrec;
2357 a = ctx->attr; 2382 a = ctx->attr;
2358 /* If the size has not changed there is nothing to do. */ 2383 /*
2359 if (ntfs_attr_size(a) == i_size_read(vi)) 2384 * The i_size of the vfs inode is the new size for the attribute value.
2360 goto done; 2385 */
2361 // TODO: Implement the truncate... 2386 new_size = i_size_read(vi);
2362 ntfs_error(vi->i_sb, "Inode size has changed but this is not " 2387 /* The current size of the attribute value is the old size. */
2363 "implemented yet. Resetting inode size to old value. " 2388 old_size = ntfs_attr_size(a);
2364 " This is most likely a bug in the ntfs driver!"); 2389 /* Calculate the new allocated size. */
2365 i_size_write(vi, ntfs_attr_size(a)); 2390 if (NInoNonResident(ni))
2366done: 2391 new_alloc_size = (new_size + vol->cluster_size - 1) &
2392 ~(s64)vol->cluster_size_mask;
2393 else
2394 new_alloc_size = (new_size + 7) & ~7;
2395 /* The current allocated size is the old allocated size. */
2396 read_lock_irqsave(&ni->size_lock, flags);
2397 old_alloc_size = ni->allocated_size;
2398 read_unlock_irqrestore(&ni->size_lock, flags);
2399 /*
2400 * The change in the file size. This will be 0 if no change, >0 if the
2401 * size is growing, and <0 if the size is shrinking.
2402 */
2403 size_change = -1;
2404 if (new_size - old_size >= 0) {
2405 size_change = 1;
2406 if (new_size == old_size)
2407 size_change = 0;
2408 }
2409 /* As above for the allocated size. */
2410 alloc_change = -1;
2411 if (new_alloc_size - old_alloc_size >= 0) {
2412 alloc_change = 1;
2413 if (new_alloc_size == old_alloc_size)
2414 alloc_change = 0;
2415 }
2416 /*
2417 * If neither the size nor the allocation are being changed there is
2418 * nothing to do.
2419 */
2420 if (!size_change && !alloc_change)
2421 goto unm_done;
2422 /* If the size is changing, check if new size is allowed in $AttrDef. */
2423 if (size_change) {
2424 err = ntfs_attr_size_bounds_check(vol, ni->type, new_size);
2425 if (unlikely(err)) {
2426 if (err == -ERANGE) {
2427 ntfs_error(vol->sb, "Truncate would cause the "
2428 "inode 0x%lx to %simum size "
2429 "for its attribute type "
2430 "(0x%x). Aborting truncate.",
2431 vi->i_ino,
2432 new_size > old_size ? "exceed "
2433 "the max" : "go under the min",
2434 le32_to_cpu(ni->type));
2435 err = -EFBIG;
2436 } else {
2437 ntfs_error(vol->sb, "Inode 0x%lx has unknown "
2438 "attribute type 0x%x. "
2439 "Aborting truncate.",
2440 vi->i_ino,
2441 le32_to_cpu(ni->type));
2442 err = -EIO;
2443 }
2444 /* Reset the vfs inode size to the old size. */
2445 i_size_write(vi, old_size);
2446 goto err_out;
2447 }
2448 }
2449 if (NInoCompressed(ni) || NInoEncrypted(ni)) {
2450 ntfs_warning(vi->i_sb, "Changes in inode size are not "
2451 "supported yet for %s files, ignoring.",
2452 NInoCompressed(ni) ? "compressed" :
2453 "encrypted");
2454 err = -EOPNOTSUPP;
2455 goto bad_out;
2456 }
2457 if (a->non_resident)
2458 goto do_non_resident_truncate;
2459 BUG_ON(NInoNonResident(ni));
2460 /* Resize the attribute record to best fit the new attribute size. */
2461 if (new_size < vol->mft_record_size &&
2462 !ntfs_resident_attr_value_resize(m, a, new_size)) {
2463 unsigned long flags;
2464
2465 /* The resize succeeded! */
2466 flush_dcache_mft_record_page(ctx->ntfs_ino);
2467 mark_mft_record_dirty(ctx->ntfs_ino);
2468 write_lock_irqsave(&ni->size_lock, flags);
2469 /* Update the sizes in the ntfs inode and all is done. */
2470 ni->allocated_size = le32_to_cpu(a->length) -
2471 le16_to_cpu(a->data.resident.value_offset);
2472 /*
2473 * Note ntfs_resident_attr_value_resize() has already done any
2474 * necessary data clearing in the attribute record. When the
2475 * file is being shrunk vmtruncate() will already have cleared
2476 * the top part of the last partial page, i.e. since this is
2477 * the resident case this is the page with index 0. However,
2478 * when the file is being expanded, the page cache page data
2479 * between the old data_size, i.e. old_size, and the new_size
2480 * has not been zeroed. Fortunately, we do not need to zero it
2481 * either since on one hand it will either already be zero due
2482 * to both readpage and writepage clearing partial page data
2483 * beyond i_size in which case there is nothing to do or in the
2484 * case of the file being mmap()ped at the same time, POSIX
2485 * specifies that the behaviour is unspecified thus we do not
2486 * have to do anything. This means that in our implementation
2487 * in the rare case that the file is mmap()ped and a write
2488 * occurred into the mmap()ped region just beyond the file size
2489 * and writepage has not yet been called to write out the page
2490 * (which would clear the area beyond the file size) and we now
2491 * extend the file size to incorporate this dirty region
2492 * outside the file size, a write of the page would result in
2493 * this data being written to disk instead of being cleared.
2494 * Given both POSIX and the Linux mmap(2) man page specify that
2495 * this corner case is undefined, we choose to leave it like
2496 * that as this is much simpler for us as we cannot lock the
2497 * relevant page now since we are holding too many ntfs locks
2498 * which would result in a lock reversal deadlock.
2499 */
2500 ni->initialized_size = new_size;
2501 write_unlock_irqrestore(&ni->size_lock, flags);
2502 goto unm_done;
2503 }
2504 /* If the above resize failed, this must be an attribute extension. */
2505 BUG_ON(size_change < 0);
2506 /*
2507 * We have to drop all the locks so we can call
2508 * ntfs_attr_make_non_resident(). This could be optimised by try-
2509 * locking the first page cache page and only if that fails dropping
2510 * the locks, locking the page, and redoing all the locking and
2511 * lookups. While this would be a huge optimisation, it is not worth
2512 * it as this is definitely a slow code path as it only ever can happen
2513 * once for any given file.
2514 */
2367 ntfs_attr_put_search_ctx(ctx); 2515 ntfs_attr_put_search_ctx(ctx);
2368 unmap_mft_record(ni); 2516 unmap_mft_record(base_ni);
2369 NInoClearTruncateFailed(ni); 2517 up_write(&ni->runlist.lock);
2370 ntfs_debug("Done."); 2518 /*
2371 return 0; 2519 * Not enough space in the mft record, try to make the attribute
2372err_out: 2520 * non-resident and if successful restart the truncation process.
2373 if (err != -ENOMEM) { 2521 */
2522 err = ntfs_attr_make_non_resident(ni, old_size);
2523 if (likely(!err))
2524 goto retry_truncate;
2525 /*
2526 * Could not make non-resident. If this is due to this not being
2527 * permitted for this attribute type or there not being enough space,
2528 * try to make other attributes non-resident. Otherwise fail.
2529 */
2530 if (unlikely(err != -EPERM && err != -ENOSPC)) {
2531 ntfs_error(vol->sb, "Cannot truncate inode 0x%lx, attribute "
2532 "type 0x%x, because the conversion from "
2533 "resident to non-resident attribute failed "
2534 "with error code %i.", vi->i_ino,
2535 (unsigned)le32_to_cpu(ni->type), err);
2536 if (err != -ENOMEM)
2537 err = -EIO;
2538 goto conv_err_out;
2539 }
2540 /* TODO: Not implemented from here, abort. */
2541 if (err == -ENOSPC)
2542 ntfs_error(vol->sb, "Not enough space in the mft record/on "
2543 "disk for the non-resident attribute value. "
2544 "This case is not implemented yet.");
2545 else /* if (err == -EPERM) */
2546 ntfs_error(vol->sb, "This attribute type may not be "
2547 "non-resident. This case is not implemented "
2548 "yet.");
2549 err = -EOPNOTSUPP;
2550 goto conv_err_out;
2551#if 0
2552 // TODO: Attempt to make other attributes non-resident.
2553 if (!err)
2554 goto do_resident_extend;
2555 /*
2556 * Both the attribute list attribute and the standard information
2557 * attribute must remain in the base inode. Thus, if this is one of
2558 * these attributes, we have to try to move other attributes out into
2559 * extent mft records instead.
2560 */
2561 if (ni->type == AT_ATTRIBUTE_LIST ||
2562 ni->type == AT_STANDARD_INFORMATION) {
2563 // TODO: Attempt to move other attributes into extent mft
2564 // records.
2565 err = -EOPNOTSUPP;
2566 if (!err)
2567 goto do_resident_extend;
2568 goto err_out;
2569 }
2570 // TODO: Attempt to move this attribute to an extent mft record, but
2571 // only if it is not already the only attribute in an mft record in
2572 // which case there would be nothing to gain.
2573 err = -EOPNOTSUPP;
2574 if (!err)
2575 goto do_resident_extend;
2576 /* There is nothing we can do to make enough space. )-: */
2577 goto err_out;
2578#endif
2579do_non_resident_truncate:
2580 BUG_ON(!NInoNonResident(ni));
2581 if (alloc_change < 0) {
2582 highest_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn);
2583 if (highest_vcn > 0 &&
2584 old_alloc_size >> vol->cluster_size_bits >
2585 highest_vcn + 1) {
2586 /*
2587 * This attribute has multiple extents. Not yet
2588 * supported.
2589 */
2590 ntfs_error(vol->sb, "Cannot truncate inode 0x%lx, "
2591 "attribute type 0x%x, because the "
2592 "attribute is highly fragmented (it "
2593 "consists of multiple extents) and "
2594 "this case is not implemented yet.",
2595 vi->i_ino,
2596 (unsigned)le32_to_cpu(ni->type));
2597 err = -EOPNOTSUPP;
2598 goto bad_out;
2599 }
2600 }
2601 /*
2602 * If the size is shrinking, need to reduce the initialized_size and
2603 * the data_size before reducing the allocation.
2604 */
2605 if (size_change < 0) {
2606 /*
2607 * Make the valid size smaller (i_size is already up-to-date).
2608 */
2609 write_lock_irqsave(&ni->size_lock, flags);
2610 if (new_size < ni->initialized_size) {
2611 ni->initialized_size = new_size;
2612 a->data.non_resident.initialized_size =
2613 cpu_to_sle64(new_size);
2614 }
2615 a->data.non_resident.data_size = cpu_to_sle64(new_size);
2616 write_unlock_irqrestore(&ni->size_lock, flags);
2617 flush_dcache_mft_record_page(ctx->ntfs_ino);
2618 mark_mft_record_dirty(ctx->ntfs_ino);
2619 /* If the allocated size is not changing, we are done. */
2620 if (!alloc_change)
2621 goto unm_done;
2622 /*
2623 * If the size is shrinking it makes no sense for the
2624 * allocation to be growing.
2625 */
2626 BUG_ON(alloc_change > 0);
2627 } else /* if (size_change >= 0) */ {
2628 /*
2629 * The file size is growing or staying the same but the
2630 * allocation can be shrinking, growing or staying the same.
2631 */
2632 if (alloc_change > 0) {
2633 /*
2634 * We need to extend the allocation and possibly update
2635 * the data size. If we are updating the data size,
2636 * since we are not touching the initialized_size we do
2637 * not need to worry about the actual data on disk.
2638 * And as far as the page cache is concerned, there
2639 * will be no pages beyond the old data size and any
2640 * partial region in the last page between the old and
2641 * new data size (or the end of the page if the new
2642 * data size is outside the page) does not need to be
2643 * modified as explained above for the resident
2644 * attribute truncate case. To do this, we simply drop
2645 * the locks we hold and leave all the work to our
2646 * friendly helper ntfs_attr_extend_allocation().
2647 */
2648 ntfs_attr_put_search_ctx(ctx);
2649 unmap_mft_record(base_ni);
2650 up_write(&ni->runlist.lock);
2651 err = ntfs_attr_extend_allocation(ni, new_size,
2652 size_change > 0 ? new_size : -1, -1);
2653 /*
2654 * ntfs_attr_extend_allocation() will have done error
2655 * output already.
2656 */
2657 goto done;
2658 }
2659 if (!alloc_change)
2660 goto alloc_done;
2661 }
2662 /* alloc_change < 0 */
2663 /* Free the clusters. */
2664 nr_freed = ntfs_cluster_free(ni, new_alloc_size >>
2665 vol->cluster_size_bits, -1, ctx);
2666 m = ctx->mrec;
2667 a = ctx->attr;
2668 if (unlikely(nr_freed < 0)) {
2669 ntfs_error(vol->sb, "Failed to release cluster(s) (error code "
2670 "%lli). Unmount and run chkdsk to recover "
2671 "the lost cluster(s).", (long long)nr_freed);
2374 NVolSetErrors(vol); 2672 NVolSetErrors(vol);
2673 nr_freed = 0;
2674 }
2675 /* Truncate the runlist. */
2676 err = ntfs_rl_truncate_nolock(vol, &ni->runlist,
2677 new_alloc_size >> vol->cluster_size_bits);
2678 /*
2679 * If the runlist truncation failed and/or the search context is no
2680 * longer valid, we cannot resize the attribute record or build the
2681 * mapping pairs array thus we mark the inode bad so that no access to
2682 * the freed clusters can happen.
2683 */
2684 if (unlikely(err || IS_ERR(m))) {
2685 ntfs_error(vol->sb, "Failed to %s (error code %li).%s",
2686 IS_ERR(m) ?
2687 "restore attribute search context" :
2688 "truncate attribute runlist",
2689 IS_ERR(m) ? PTR_ERR(m) : err, es);
2690 err = -EIO;
2691 goto bad_out;
2692 }
2693 /* Get the size for the shrunk mapping pairs array for the runlist. */
2694 mp_size = ntfs_get_size_for_mapping_pairs(vol, ni->runlist.rl, 0, -1);
2695 if (unlikely(mp_size <= 0)) {
2696 ntfs_error(vol->sb, "Cannot shrink allocation of inode 0x%lx, "
2697 "attribute type 0x%x, because determining the "
2698 "size for the mapping pairs failed with error "
2699 "code %i.%s", vi->i_ino,
2700 (unsigned)le32_to_cpu(ni->type), mp_size, es);
2701 err = -EIO;
2702 goto bad_out;
2703 }
2704 /*
2705 * Shrink the attribute record for the new mapping pairs array. Note,
2706 * this cannot fail since we are making the attribute smaller thus by
2707 * definition there is enough space to do so.
2708 */
2709 attr_len = le32_to_cpu(a->length);
2710 err = ntfs_attr_record_resize(m, a, mp_size +
2711 le16_to_cpu(a->data.non_resident.mapping_pairs_offset));
2712 BUG_ON(err);
2713 /*
2714 * Generate the mapping pairs array directly into the attribute record.
2715 */
2716 err = ntfs_mapping_pairs_build(vol, (u8*)a +
2717 le16_to_cpu(a->data.non_resident.mapping_pairs_offset),
2718 mp_size, ni->runlist.rl, 0, -1, NULL);
2719 if (unlikely(err)) {
2720 ntfs_error(vol->sb, "Cannot shrink allocation of inode 0x%lx, "
2721 "attribute type 0x%x, because building the "
2722 "mapping pairs failed with error code %i.%s",
2723 vi->i_ino, (unsigned)le32_to_cpu(ni->type),
2724 err, es);
2725 err = -EIO;
2726 goto bad_out;
2727 }
2728 /* Update the allocated/compressed size as well as the highest vcn. */
2729 a->data.non_resident.highest_vcn = cpu_to_sle64((new_alloc_size >>
2730 vol->cluster_size_bits) - 1);
2731 write_lock_irqsave(&ni->size_lock, flags);
2732 ni->allocated_size = new_alloc_size;
2733 a->data.non_resident.allocated_size = cpu_to_sle64(new_alloc_size);
2734 if (NInoSparse(ni) || NInoCompressed(ni)) {
2735 if (nr_freed) {
2736 ni->itype.compressed.size -= nr_freed <<
2737 vol->cluster_size_bits;
2738 BUG_ON(ni->itype.compressed.size < 0);
2739 a->data.non_resident.compressed_size = cpu_to_sle64(
2740 ni->itype.compressed.size);
2741 vi->i_blocks = ni->itype.compressed.size >> 9;
2742 }
2743 } else
2744 vi->i_blocks = new_alloc_size >> 9;
2745 write_unlock_irqrestore(&ni->size_lock, flags);
2746 /*
2747 * We have shrunk the allocation. If this is a shrinking truncate we
2748 * have already dealt with the initialized_size and the data_size above
2749 * and we are done. If the truncate is only changing the allocation
2750 * and not the data_size, we are also done. If this is an extending
2751 * truncate, need to extend the data_size now which is ensured by the
2752 * fact that @size_change is positive.
2753 */
2754alloc_done:
2755 /*
2756 * If the size is growing, need to update it now. If it is shrinking,
2757 * we have already updated it above (before the allocation change).
2758 */
2759 if (size_change > 0)
2760 a->data.non_resident.data_size = cpu_to_sle64(new_size);
2761 /* Ensure the modified mft record is written out. */
2762 flush_dcache_mft_record_page(ctx->ntfs_ino);
2763 mark_mft_record_dirty(ctx->ntfs_ino);
2764unm_done:
2765 ntfs_attr_put_search_ctx(ctx);
2766 unmap_mft_record(base_ni);
2767 up_write(&ni->runlist.lock);
2768done:
2769 /* Update the mtime and ctime on the base inode. */
2770 inode_update_time(VFS_I(base_ni), 1);
2771 if (likely(!err)) {
2772 NInoClearTruncateFailed(ni);
2773 ntfs_debug("Done.");
2774 }
2775 return err;
2776old_bad_out:
2777 old_size = -1;
2778bad_out:
2779 if (err != -ENOMEM && err != -EOPNOTSUPP) {
2375 make_bad_inode(vi); 2780 make_bad_inode(vi);
2781 make_bad_inode(VFS_I(base_ni));
2782 NVolSetErrors(vol);
2376 } 2783 }
2784 if (err != -EOPNOTSUPP)
2785 NInoSetTruncateFailed(ni);
2786 else if (old_size >= 0)
2787 i_size_write(vi, old_size);
2788err_out:
2377 if (ctx) 2789 if (ctx)
2378 ntfs_attr_put_search_ctx(ctx); 2790 ntfs_attr_put_search_ctx(ctx);
2379 if (m) 2791 if (m)
2380 unmap_mft_record(ni); 2792 unmap_mft_record(base_ni);
2381 NInoSetTruncateFailed(ni); 2793 up_write(&ni->runlist.lock);
2794out:
2795 ntfs_debug("Failed. Returning error code %i.", err);
2382 return err; 2796 return err;
2797conv_err_out:
2798 if (err != -ENOMEM && err != -EOPNOTSUPP) {
2799 make_bad_inode(vi);
2800 make_bad_inode(VFS_I(base_ni));
2801 NVolSetErrors(vol);
2802 }
2803 if (err != -EOPNOTSUPP)
2804 NInoSetTruncateFailed(ni);
2805 else
2806 i_size_write(vi, old_size);
2807 goto out;
2383} 2808}
2384 2809
2385/** 2810/**
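The new_alloc_size computation in ntfs_truncate() above is the usual power-of-two round-up; the pairing of + cluster_size - 1 with & ~cluster_size_mask implies that cluster_size_mask equals cluster_size - 1. A stand-alone sketch with hypothetical sizes:

#include <stdio.h>
#include <stdint.h>

/* Round v up to the next multiple of the power-of-two 'align'. */
static int64_t round_up_pow2(int64_t v, int64_t align)
{
	return (v + align - 1) & ~(align - 1);
}

int main(void)
{
	/* 4096-byte clusters, as in the non-resident branch above. */
	printf("%lld\n", (long long)round_up_pow2(5000, 4096)); /* 8192 */
	/* 8-byte alignment, as in the resident branch. */
	printf("%lld\n", (long long)round_up_pow2(13, 8));      /* 16 */
	return 0;
}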
@@ -2420,8 +2845,7 @@ int ntfs_setattr(struct dentry *dentry, struct iattr *attr)
2420 2845
2421 err = inode_change_ok(vi, attr); 2846 err = inode_change_ok(vi, attr);
2422 if (err) 2847 if (err)
2423 return err; 2848 goto out;
2424
2425 /* We do not support NTFS ACLs yet. */ 2849 /* We do not support NTFS ACLs yet. */
2426 if (ia_valid & (ATTR_UID | ATTR_GID | ATTR_MODE)) { 2850 if (ia_valid & (ATTR_UID | ATTR_GID | ATTR_MODE)) {
2427 ntfs_warning(vi->i_sb, "Changes in user/group/mode are not " 2851 ntfs_warning(vi->i_sb, "Changes in user/group/mode are not "
@@ -2429,14 +2853,22 @@ int ntfs_setattr(struct dentry *dentry, struct iattr *attr)
2429 err = -EOPNOTSUPP; 2853 err = -EOPNOTSUPP;
2430 goto out; 2854 goto out;
2431 } 2855 }
2432
2433 if (ia_valid & ATTR_SIZE) { 2856 if (ia_valid & ATTR_SIZE) {
2434 if (attr->ia_size != i_size_read(vi)) { 2857 if (attr->ia_size != i_size_read(vi)) {
2435 ntfs_warning(vi->i_sb, "Changes in inode size are not " 2858 ntfs_inode *ni = NTFS_I(vi);
2436 "supported yet, ignoring."); 2859 /*
2437 err = -EOPNOTSUPP; 2860 * FIXME: For now we do not support resizing of
2438 // TODO: Implement... 2861 * compressed or encrypted files yet.
2439 // err = vmtruncate(vi, attr->ia_size); 2862 */
2863 if (NInoCompressed(ni) || NInoEncrypted(ni)) {
2864 ntfs_warning(vi->i_sb, "Changes in inode size "
2865 "are not supported yet for "
2866 "%s files, ignoring.",
2867 NInoCompressed(ni) ?
2868 "compressed" : "encrypted");
2869 err = -EOPNOTSUPP;
2870 } else
2871 err = vmtruncate(vi, attr->ia_size);
2440 if (err || ia_valid == ATTR_SIZE) 2872 if (err || ia_valid == ATTR_SIZE)
2441 goto out; 2873 goto out;
2442 } else { 2874 } else {
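With the ntfs_setattr() change above, size changes on files that are neither compressed nor encrypted are handed to vmtruncate(), so both shrinking and extending truncates now reach ntfs_truncate(). From user space the same path is exercised via ftruncate(2); a small sketch (the path name is arbitrary):

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/stat.h>

int main(void)
{
	struct stat st;
	int fd = open("/tmp/trunc_demo", O_RDWR | O_CREAT | O_TRUNC, 0600);

	if (fd < 0)
		return 1;
	/* Extending truncate: the new bytes read back as zeroes. */
	if (ftruncate(fd, 8192) || fstat(fd, &st))
		return 1;
	printf("after extend: %lld\n", (long long)st.st_size);
	/* Shrinking truncate. */
	if (ftruncate(fd, 100) || fstat(fd, &st))
		return 1;
	printf("after shrink: %lld\n", (long long)st.st_size);
	close(fd);
	unlink("/tmp/trunc_demo");
	return 0;
}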
diff --git a/fs/ntfs/layout.h b/fs/ntfs/layout.h
index 5c248d404f05..f5678d5d7919 100644
--- a/fs/ntfs/layout.h
+++ b/fs/ntfs/layout.h
@@ -1021,10 +1021,17 @@ enum {
1021 FILE_NAME_POSIX = 0x00, 1021 FILE_NAME_POSIX = 0x00,
1022 /* This is the largest namespace. It is case sensitive and allows all 1022 /* This is the largest namespace. It is case sensitive and allows all
1023 Unicode characters except for: '\0' and '/'. Beware that in 1023 Unicode characters except for: '\0' and '/'. Beware that in
1024 WinNT/2k files which eg have the same name except for their case 1024 WinNT/2k/2003 by default files which eg have the same name except
1025 will not be distinguished by the standard utilities and thus a "del 1025 for their case will not be distinguished by the standard utilities
1026 filename" will delete both "filename" and "fileName" without 1026 and thus a "del filename" will delete both "filename" and "fileName"
1027 warning. */ 1027 without warning. However if for example Services For Unix (SFU) are
1028 installed and the case sensitive option was enabled at installation
1029 time, then you can create/access/delete such files.
1030 Note that even SFU places restrictions on the filenames beyond the
1031 '\0' and '/' and in particular the following set of characters is
1032 not allowed: '"', '/', '<', '>', '\'. All other characters,
 1033 including the ones not allowed in WIN32 namespace are allowed.
1034 Tested with SFU 3.5 (this is now free) running on Windows XP. */
1028 FILE_NAME_WIN32 = 0x01, 1035 FILE_NAME_WIN32 = 0x01,
1029 /* The standard WinNT/2k NTFS long filenames. Case insensitive. All 1036 /* The standard WinNT/2k NTFS long filenames. Case insensitive. All
1030 Unicode chars except: '\0', '"', '*', '/', ':', '<', '>', '?', '\', 1037 Unicode chars except: '\0', '"', '*', '/', ':', '<', '>', '?', '\',
@@ -2367,7 +2374,9 @@ typedef struct {
2367 * Extended attribute flags (8-bit). 2374 * Extended attribute flags (8-bit).
2368 */ 2375 */
2369enum { 2376enum {
2370 NEED_EA = 0x80 2377 NEED_EA = 0x80 /* If set the file to which the EA belongs
2378 cannot be interpreted without understanding
 2379 the associated extended attributes. */
2371} __attribute__ ((__packed__)); 2380} __attribute__ ((__packed__));
2372 2381
2373typedef u8 EA_FLAGS; 2382typedef u8 EA_FLAGS;
@@ -2375,20 +2384,20 @@ typedef u8 EA_FLAGS;
2375/* 2384/*
2376 * Attribute: Extended attribute (EA) (0xe0). 2385 * Attribute: Extended attribute (EA) (0xe0).
2377 * 2386 *
2378 * NOTE: Always non-resident. (Is this true?) 2387 * NOTE: Can be resident or non-resident.
2379 * 2388 *
2380 * Like the attribute list and the index buffer list, the EA attribute value is 2389 * Like the attribute list and the index buffer list, the EA attribute value is
2381 * a sequence of EA_ATTR variable length records. 2390 * a sequence of EA_ATTR variable length records.
2382 *
2383 * FIXME: It appears weird that the EA name is not unicode. Is it true?
2384 */ 2391 */
2385typedef struct { 2392typedef struct {
2386 le32 next_entry_offset; /* Offset to the next EA_ATTR. */ 2393 le32 next_entry_offset; /* Offset to the next EA_ATTR. */
2387 EA_FLAGS flags; /* Flags describing the EA. */ 2394 EA_FLAGS flags; /* Flags describing the EA. */
2388 u8 ea_name_length; /* Length of the name of the EA in bytes. */ 2395 u8 ea_name_length; /* Length of the name of the EA in bytes
2396 excluding the '\0' byte terminator. */
2389 le16 ea_value_length; /* Byte size of the EA's value. */ 2397 le16 ea_value_length; /* Byte size of the EA's value. */
2390 u8 ea_name[0]; /* Name of the EA. */ 2398 u8 ea_name[0]; /* Name of the EA. Note this is ASCII, not
2391 u8 ea_value[0]; /* The value of the EA. Immediately follows 2399 Unicode and it is zero terminated. */
2400 u8 ea_value[0]; /* The value of the EA. Immediately follows
2392 the name. */ 2401 the name. */
2393} __attribute__ ((__packed__)) EA_ATTR; 2402} __attribute__ ((__packed__)) EA_ATTR;
2394 2403
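The EA_ATTR records documented above chain through next_entry_offset, with the ASCII, zero-terminated name immediately followed by the value. A user-space parsing sketch under those layout assumptions (struct ea_attr here is a stand-in for the on-disk structure; the real fields are little-endian, so this works as-is only on a little-endian host):

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

struct ea_attr {
	uint32_t next_entry_offset;	/* Offset to the next record; 0 ends the chain. */
	uint8_t  flags;
	uint8_t  ea_name_length;	/* Name length, excluding the '\0'. */
	uint16_t ea_value_length;	/* Byte size of the value. */
	char     data[];		/* Name ('\0' terminated), then value. */
} __attribute__((__packed__));

static void walk_eas(const uint8_t *buf, size_t len)
{
	size_t pos = 0;

	while (pos + sizeof(struct ea_attr) <= len) {
		const struct ea_attr *ea = (const void *)(buf + pos);

		printf("EA \"%.*s\", %u value bytes\n", ea->ea_name_length,
				ea->data, (unsigned)ea->ea_value_length);
		if (!ea->next_entry_offset)
			break;
		pos += ea->next_entry_offset;
	}
}

int main(void)
{
	/* One record: name "user.x" (6 chars + '\0'), 2-byte value. */
	uint8_t buf[] = { 0, 0, 0, 0, 0, 6, 2, 0,
			'u', 's', 'e', 'r', '.', 'x', 0, 0xab, 0xcd };

	walk_eas(buf, sizeof(buf));
	return 0;
}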
diff --git a/fs/ntfs/lcnalloc.c b/fs/ntfs/lcnalloc.c
index 5af3bf0b7eee..29cabf93d2d2 100644
--- a/fs/ntfs/lcnalloc.c
+++ b/fs/ntfs/lcnalloc.c
@@ -76,6 +76,7 @@ int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol,
76 * @count: number of clusters to allocate 76 * @count: number of clusters to allocate
77 * @start_lcn: starting lcn at which to allocate the clusters (or -1 if none) 77 * @start_lcn: starting lcn at which to allocate the clusters (or -1 if none)
78 * @zone: zone from which to allocate the clusters 78 * @zone: zone from which to allocate the clusters
79 * @is_extension: if TRUE, this is an attribute extension
79 * 80 *
80 * Allocate @count clusters preferably starting at cluster @start_lcn or at the 81 * Allocate @count clusters preferably starting at cluster @start_lcn or at the
81 * current allocator position if @start_lcn is -1, on the mounted ntfs volume 82 * current allocator position if @start_lcn is -1, on the mounted ntfs volume
@@ -86,6 +87,13 @@ int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol,
86 * @start_vcn specifies the vcn of the first allocated cluster. This makes 87 * @start_vcn specifies the vcn of the first allocated cluster. This makes
87 * merging the resulting runlist with the old runlist easier. 88 * merging the resulting runlist with the old runlist easier.
88 * 89 *
90 * If @is_extension is TRUE, the caller is allocating clusters to extend an
91 * attribute and if it is FALSE, the caller is allocating clusters to fill a
92 * hole in an attribute. Practically the difference is that if @is_extension
93 * is TRUE the returned runlist will be terminated with LCN_ENOENT and if
94 * @is_extension is FALSE the runlist will be terminated with
95 * LCN_RL_NOT_MAPPED.
96 *
89 * You need to check the return value with IS_ERR(). If this is false, the 97 * You need to check the return value with IS_ERR(). If this is false, the
90 * function was successful and the return value is a runlist describing the 98 * function was successful and the return value is a runlist describing the
91 * allocated cluster(s). If IS_ERR() is true, the function failed and 99 * allocated cluster(s). If IS_ERR() is true, the function failed and
@@ -137,7 +145,8 @@ int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol,
137 */ 145 */
138runlist_element *ntfs_cluster_alloc(ntfs_volume *vol, const VCN start_vcn, 146runlist_element *ntfs_cluster_alloc(ntfs_volume *vol, const VCN start_vcn,
139 const s64 count, const LCN start_lcn, 147 const s64 count, const LCN start_lcn,
140 const NTFS_CLUSTER_ALLOCATION_ZONES zone) 148 const NTFS_CLUSTER_ALLOCATION_ZONES zone,
149 const BOOL is_extension)
141{ 150{
142 LCN zone_start, zone_end, bmp_pos, bmp_initial_pos, last_read_pos, lcn; 151 LCN zone_start, zone_end, bmp_pos, bmp_initial_pos, last_read_pos, lcn;
143 LCN prev_lcn = 0, prev_run_len = 0, mft_zone_size; 152 LCN prev_lcn = 0, prev_run_len = 0, mft_zone_size;
@@ -310,7 +319,7 @@ runlist_element *ntfs_cluster_alloc(ntfs_volume *vol, const VCN start_vcn,
310 continue; 319 continue;
311 } 320 }
312 bit = 1 << (lcn & 7); 321 bit = 1 << (lcn & 7);
313 ntfs_debug("bit %i.", bit); 322 ntfs_debug("bit 0x%x.", bit);
314 /* If the bit is already set, go onto the next one. */ 323 /* If the bit is already set, go onto the next one. */
315 if (*byte & bit) { 324 if (*byte & bit) {
316 lcn++; 325 lcn++;
@@ -729,7 +738,7 @@ out:
729 /* Add runlist terminator element. */ 738 /* Add runlist terminator element. */
730 if (likely(rl)) { 739 if (likely(rl)) {
731 rl[rlpos].vcn = rl[rlpos - 1].vcn + rl[rlpos - 1].length; 740 rl[rlpos].vcn = rl[rlpos - 1].vcn + rl[rlpos - 1].length;
732 rl[rlpos].lcn = LCN_RL_NOT_MAPPED; 741 rl[rlpos].lcn = is_extension ? LCN_ENOENT : LCN_RL_NOT_MAPPED;
733 rl[rlpos].length = 0; 742 rl[rlpos].length = 0;
734 } 743 }
735 if (likely(page && !IS_ERR(page))) { 744 if (likely(page && !IS_ERR(page))) {
@@ -782,6 +791,7 @@ out:
782 * @ni: ntfs inode whose runlist describes the clusters to free 791 * @ni: ntfs inode whose runlist describes the clusters to free
783 * @start_vcn: vcn in the runlist of @ni at which to start freeing clusters 792 * @start_vcn: vcn in the runlist of @ni at which to start freeing clusters
784 * @count: number of clusters to free or -1 for all clusters 793 * @count: number of clusters to free or -1 for all clusters
794 * @ctx: active attribute search context if present or NULL if not
785 * @is_rollback: true if this is a rollback operation 795 * @is_rollback: true if this is a rollback operation
786 * 796 *
787 * Free @count clusters starting at the cluster @start_vcn in the runlist 797 * Free @count clusters starting at the cluster @start_vcn in the runlist
@@ -791,15 +801,39 @@ out:
791 * deallocated. Thus, to completely free all clusters in a runlist, use 801 * deallocated. Thus, to completely free all clusters in a runlist, use
792 * @start_vcn = 0 and @count = -1. 802 * @start_vcn = 0 and @count = -1.
793 * 803 *
804 * If @ctx is specified, it is an active search context of @ni and its base mft
805 * record. This is needed when __ntfs_cluster_free() encounters unmapped
806 * runlist fragments and allows their mapping. If you do not have the mft
807 * record mapped, you can specify @ctx as NULL and __ntfs_cluster_free() will
808 * perform the necessary mapping and unmapping.
809 *
810 * Note, __ntfs_cluster_free() saves the state of @ctx on entry and restores it
811 * before returning. Thus, @ctx will be left pointing to the same attribute on
812 * return as on entry. However, the actual pointers in @ctx may point to
813 * different memory locations on return, so you must remember to reset any
814 * cached pointers from the @ctx, i.e. after the call to __ntfs_cluster_free(),
815 * you will probably want to do:
816 * m = ctx->mrec;
817 * a = ctx->attr;
818 * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that
819 * you cache ctx->mrec in a variable @m of type MFT_RECORD *.
820 *
794 * @is_rollback should always be FALSE, it is for internal use to rollback 821 * @is_rollback should always be FALSE, it is for internal use to rollback
795 * errors. You probably want to use ntfs_cluster_free() instead. 822 * errors. You probably want to use ntfs_cluster_free() instead.
796 * 823 *
797 * Note, ntfs_cluster_free() does not modify the runlist at all, so the caller 824 * Note, __ntfs_cluster_free() does not modify the runlist, so you have to
798 * has to deal with it later. 825 * remove from the runlist or mark sparse the freed runs later.
799 * 826 *
800 * Return the number of deallocated clusters (not counting sparse ones) on 827 * Return the number of deallocated clusters (not counting sparse ones) on
801 * success and -errno on error. 828 * success and -errno on error.
802 * 829 *
830 * WARNING: If @ctx is supplied, regardless of whether success or failure is
831 * returned, you need to check IS_ERR(@ctx->mrec) and if TRUE the @ctx
832 * is no longer valid, i.e. you need to either call
833 * ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it.
834 * In that case PTR_ERR(@ctx->mrec) will give you the error code for
835 * why the mapping of the old inode failed.
836 *
803 * Locking: - The runlist described by @ni must be locked for writing on entry 837 * Locking: - The runlist described by @ni must be locked for writing on entry
804 * and is locked on return. Note the runlist may be modified when 838 * and is locked on return. Note the runlist may be modified when
805 * needed runlist fragments need to be mapped. 839 * needed runlist fragments need to be mapped.
@@ -807,9 +841,13 @@ out:
807 * on return. 841 * on return.
808 * - This function takes the volume lcn bitmap lock for writing and 842 * - This function takes the volume lcn bitmap lock for writing and
809 * modifies the bitmap contents. 843 * modifies the bitmap contents.
844 * - If @ctx is NULL, the base mft record of @ni must not be mapped on
845 * entry and it will be left unmapped on return.
846 * - If @ctx is not NULL, the base mft record must be mapped on entry
847 * and it will be left mapped on return.
810 */ 848 */
811s64 __ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn, s64 count, 849s64 __ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn, s64 count,
812 const BOOL is_rollback) 850 ntfs_attr_search_ctx *ctx, const BOOL is_rollback)
813{ 851{
814 s64 delta, to_free, total_freed, real_freed; 852 s64 delta, to_free, total_freed, real_freed;
815 ntfs_volume *vol; 853 ntfs_volume *vol;
@@ -839,7 +877,7 @@ s64 __ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn, s64 count,
839 877
840 total_freed = real_freed = 0; 878 total_freed = real_freed = 0;
841 879
842 rl = ntfs_attr_find_vcn_nolock(ni, start_vcn, TRUE); 880 rl = ntfs_attr_find_vcn_nolock(ni, start_vcn, ctx);
843 if (IS_ERR(rl)) { 881 if (IS_ERR(rl)) {
844 if (!is_rollback) 882 if (!is_rollback)
845 ntfs_error(vol->sb, "Failed to find first runlist " 883 ntfs_error(vol->sb, "Failed to find first runlist "
@@ -893,7 +931,7 @@ s64 __ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn, s64 count,
893 931
894 /* Attempt to map runlist. */ 932 /* Attempt to map runlist. */
895 vcn = rl->vcn; 933 vcn = rl->vcn;
896 rl = ntfs_attr_find_vcn_nolock(ni, vcn, TRUE); 934 rl = ntfs_attr_find_vcn_nolock(ni, vcn, ctx);
897 if (IS_ERR(rl)) { 935 if (IS_ERR(rl)) {
898 err = PTR_ERR(rl); 936 err = PTR_ERR(rl);
899 if (!is_rollback) 937 if (!is_rollback)
@@ -961,7 +999,7 @@ err_out:
961 * If rollback fails, set the volume errors flag, emit an error 999 * If rollback fails, set the volume errors flag, emit an error
962 * message, and return the error code. 1000 * message, and return the error code.
963 */ 1001 */
964 delta = __ntfs_cluster_free(ni, start_vcn, total_freed, TRUE); 1002 delta = __ntfs_cluster_free(ni, start_vcn, total_freed, ctx, TRUE);
965 if (delta < 0) { 1003 if (delta < 0) {
966 ntfs_error(vol->sb, "Failed to rollback (error %i). Leaving " 1004 ntfs_error(vol->sb, "Failed to rollback (error %i). Leaving "
967 "inconsistent metadata! Unmount and run " 1005 "inconsistent metadata! Unmount and run "
diff --git a/fs/ntfs/lcnalloc.h b/fs/ntfs/lcnalloc.h
index a6a8827882e7..72cbca7003b2 100644
--- a/fs/ntfs/lcnalloc.h
+++ b/fs/ntfs/lcnalloc.h
@@ -27,6 +27,7 @@
27 27
28#include <linux/fs.h> 28#include <linux/fs.h>
29 29
30#include "attrib.h"
30#include "types.h" 31#include "types.h"
31#include "inode.h" 32#include "inode.h"
32#include "runlist.h" 33#include "runlist.h"
@@ -41,16 +42,18 @@ typedef enum {
41 42
42extern runlist_element *ntfs_cluster_alloc(ntfs_volume *vol, 43extern runlist_element *ntfs_cluster_alloc(ntfs_volume *vol,
43 const VCN start_vcn, const s64 count, const LCN start_lcn, 44 const VCN start_vcn, const s64 count, const LCN start_lcn,
44 const NTFS_CLUSTER_ALLOCATION_ZONES zone); 45 const NTFS_CLUSTER_ALLOCATION_ZONES zone,
46 const BOOL is_extension);
45 47
46extern s64 __ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn, 48extern s64 __ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn,
47 s64 count, const BOOL is_rollback); 49 s64 count, ntfs_attr_search_ctx *ctx, const BOOL is_rollback);
48 50
49/** 51/**
50 * ntfs_cluster_free - free clusters on an ntfs volume 52 * ntfs_cluster_free - free clusters on an ntfs volume
51 * @ni: ntfs inode whose runlist describes the clusters to free 53 * @ni: ntfs inode whose runlist describes the clusters to free
52 * @start_vcn: vcn in the runlist of @ni at which to start freeing clusters 54 * @start_vcn: vcn in the runlist of @ni at which to start freeing clusters
53 * @count: number of clusters to free or -1 for all clusters 55 * @count: number of clusters to free or -1 for all clusters
56 * @ctx: active attribute search context if present or NULL if not
54 * 57 *
55 * Free @count clusters starting at the cluster @start_vcn in the runlist 58 * Free @count clusters starting at the cluster @start_vcn in the runlist
56 * described by the ntfs inode @ni. 59 * described by the ntfs inode @ni.
@@ -59,12 +62,36 @@ extern s64 __ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn,
59 * deallocated. Thus, to completely free all clusters in a runlist, use 62 * deallocated. Thus, to completely free all clusters in a runlist, use
60 * @start_vcn = 0 and @count = -1. 63 * @start_vcn = 0 and @count = -1.
61 * 64 *
62 * Note, ntfs_cluster_free() does not modify the runlist at all, so the caller 65 * If @ctx is specified, it is an active search context of @ni and its base mft
63 * has to deal with it later. 66 * record. This is needed when ntfs_cluster_free() encounters unmapped runlist
67 * fragments and allows their mapping. If you do not have the mft record
68 * mapped, you can specify @ctx as NULL and ntfs_cluster_free() will perform
69 * the necessary mapping and unmapping.
70 *
71 * Note, ntfs_cluster_free() saves the state of @ctx on entry and restores it
72 * before returning. Thus, @ctx will be left pointing to the same attribute on
73 * return as on entry. However, the actual pointers in @ctx may point to
74 * different memory locations on return, so you must remember to reset any
75 * cached pointers from the @ctx, i.e. after the call to ntfs_cluster_free(),
76 * you will probably want to do:
77 * m = ctx->mrec;
78 * a = ctx->attr;
79 * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that
80 * you cache ctx->mrec in a variable @m of type MFT_RECORD *.
81 *
82 * Note, ntfs_cluster_free() does not modify the runlist, so you have to remove
83 * from the runlist or mark sparse the freed runs later.
64 * 84 *
65 * Return the number of deallocated clusters (not counting sparse ones) on 85 * Return the number of deallocated clusters (not counting sparse ones) on
66 * success and -errno on error. 86 * success and -errno on error.
67 * 87 *
88 * WARNING: If @ctx is supplied, regardless of whether success or failure is
89 * returned, you need to check IS_ERR(@ctx->mrec) and if TRUE the @ctx
90 * is no longer valid, i.e. you need to either call
91 * ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it.
92 * In that case PTR_ERR(@ctx->mrec) will give you the error code for
93 * why the mapping of the old inode failed.
94 *
68 * Locking: - The runlist described by @ni must be locked for writing on entry 95 * Locking: - The runlist described by @ni must be locked for writing on entry
69 * and is locked on return. Note the runlist may be modified when 96 * and is locked on return. Note the runlist may be modified when
70 * needed runlist fragments need to be mapped. 97 * needed runlist fragments need to be mapped.
@@ -72,11 +99,15 @@ extern s64 __ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn,
72 * on return. 99 * on return.
73 * - This function takes the volume lcn bitmap lock for writing and 100 * - This function takes the volume lcn bitmap lock for writing and
74 * modifies the bitmap contents. 101 * modifies the bitmap contents.
102 * - If @ctx is NULL, the base mft record of @ni must not be mapped on
103 * entry and it will be left unmapped on return.
104 * - If @ctx is not NULL, the base mft record must be mapped on entry
105 * and it will be left mapped on return.
75 */ 106 */
76static inline s64 ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn, 107static inline s64 ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn,
77 s64 count) 108 s64 count, ntfs_attr_search_ctx *ctx)
78{ 109{
79 return __ntfs_cluster_free(ni, start_vcn, count, FALSE); 110 return __ntfs_cluster_free(ni, start_vcn, count, ctx, FALSE);
80} 111}
81 112
82extern int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol, 113extern int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol,
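The warning above, that a supplied @ctx may have its mrec remapped across the call so cached m and a pointers must be re-read, is the general rule for any operation that can move its backing store. A loose user-space analogue with realloc(3):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	char *buf = malloc(16);
	char *cached;

	if (!buf)
		return 1;
	strcpy(buf, "hi");
	cached = buf;			/* like caching a = ctx->attr */
	buf = realloc(buf, 1 << 20);	/* may move the block... */
	if (!buf)
		return 1;
	cached = buf;			/* ...so refresh before any reuse */
	printf("%s\n", cached);
	free(buf);
	return 0;
}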
diff --git a/fs/ntfs/malloc.h b/fs/ntfs/malloc.h
index 590887b943f5..e38e402e4103 100644
--- a/fs/ntfs/malloc.h
+++ b/fs/ntfs/malloc.h
@@ -39,8 +39,7 @@
39 * If there was insufficient memory to complete the request, return NULL. 39 * If there was insufficient memory to complete the request, return NULL.
40 * Depending on @gfp_mask the allocation may be guaranteed to succeed. 40 * Depending on @gfp_mask the allocation may be guaranteed to succeed.
41 */ 41 */
42static inline void *__ntfs_malloc(unsigned long size, 42static inline void *__ntfs_malloc(unsigned long size, gfp_t gfp_mask)
43 gfp_t gfp_mask)
44{ 43{
45 if (likely(size <= PAGE_SIZE)) { 44 if (likely(size <= PAGE_SIZE)) {
46 BUG_ON(!size); 45 BUG_ON(!size);
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index b011369b5956..0c65cbb8c5cf 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -49,7 +49,8 @@ static inline MFT_RECORD *map_mft_record_page(ntfs_inode *ni)
49 ntfs_volume *vol = ni->vol; 49 ntfs_volume *vol = ni->vol;
50 struct inode *mft_vi = vol->mft_ino; 50 struct inode *mft_vi = vol->mft_ino;
51 struct page *page; 51 struct page *page;
52 unsigned long index, ofs, end_index; 52 unsigned long index, end_index;
53 unsigned ofs;
53 54
54 BUG_ON(ni->page); 55 BUG_ON(ni->page);
55 /* 56 /*
@@ -1308,7 +1309,7 @@ static int ntfs_mft_bitmap_extend_allocation_nolock(ntfs_volume *vol)
1308 ll = mftbmp_ni->allocated_size; 1309 ll = mftbmp_ni->allocated_size;
1309 read_unlock_irqrestore(&mftbmp_ni->size_lock, flags); 1310 read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
1310 rl = ntfs_attr_find_vcn_nolock(mftbmp_ni, 1311 rl = ntfs_attr_find_vcn_nolock(mftbmp_ni,
1311 (ll - 1) >> vol->cluster_size_bits, TRUE); 1312 (ll - 1) >> vol->cluster_size_bits, NULL);
1312 if (unlikely(IS_ERR(rl) || !rl->length || rl->lcn < 0)) { 1313 if (unlikely(IS_ERR(rl) || !rl->length || rl->lcn < 0)) {
1313 up_write(&mftbmp_ni->runlist.lock); 1314 up_write(&mftbmp_ni->runlist.lock);
1314 ntfs_error(vol->sb, "Failed to determine last allocated " 1315 ntfs_error(vol->sb, "Failed to determine last allocated "
@@ -1354,7 +1355,8 @@ static int ntfs_mft_bitmap_extend_allocation_nolock(ntfs_volume *vol)
1354 up_write(&vol->lcnbmp_lock); 1355 up_write(&vol->lcnbmp_lock);
1355 ntfs_unmap_page(page); 1356 ntfs_unmap_page(page);
1356 /* Allocate a cluster from the DATA_ZONE. */ 1357 /* Allocate a cluster from the DATA_ZONE. */
1357 rl2 = ntfs_cluster_alloc(vol, rl[1].vcn, 1, lcn, DATA_ZONE); 1358 rl2 = ntfs_cluster_alloc(vol, rl[1].vcn, 1, lcn, DATA_ZONE,
1359 TRUE);
1358 if (IS_ERR(rl2)) { 1360 if (IS_ERR(rl2)) {
1359 up_write(&mftbmp_ni->runlist.lock); 1361 up_write(&mftbmp_ni->runlist.lock);
1360 ntfs_error(vol->sb, "Failed to allocate a cluster for " 1362 ntfs_error(vol->sb, "Failed to allocate a cluster for "
@@ -1738,7 +1740,7 @@ static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol)
1738 ll = mft_ni->allocated_size; 1740 ll = mft_ni->allocated_size;
1739 read_unlock_irqrestore(&mft_ni->size_lock, flags); 1741 read_unlock_irqrestore(&mft_ni->size_lock, flags);
1740 rl = ntfs_attr_find_vcn_nolock(mft_ni, 1742 rl = ntfs_attr_find_vcn_nolock(mft_ni,
1741 (ll - 1) >> vol->cluster_size_bits, TRUE); 1743 (ll - 1) >> vol->cluster_size_bits, NULL);
1742 if (unlikely(IS_ERR(rl) || !rl->length || rl->lcn < 0)) { 1744 if (unlikely(IS_ERR(rl) || !rl->length || rl->lcn < 0)) {
1743 up_write(&mft_ni->runlist.lock); 1745 up_write(&mft_ni->runlist.lock);
1744 ntfs_error(vol->sb, "Failed to determine last allocated " 1746 ntfs_error(vol->sb, "Failed to determine last allocated "
@@ -1779,7 +1781,8 @@ static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol)
1779 nr > min_nr ? "default" : "minimal", (long long)nr); 1781 nr > min_nr ? "default" : "minimal", (long long)nr);
1780 old_last_vcn = rl[1].vcn; 1782 old_last_vcn = rl[1].vcn;
1781 do { 1783 do {
1782 rl2 = ntfs_cluster_alloc(vol, old_last_vcn, nr, lcn, MFT_ZONE); 1784 rl2 = ntfs_cluster_alloc(vol, old_last_vcn, nr, lcn, MFT_ZONE,
1785 TRUE);
1783 if (likely(!IS_ERR(rl2))) 1786 if (likely(!IS_ERR(rl2)))
1784 break; 1787 break;
1785 if (PTR_ERR(rl2) != -ENOSPC || nr == min_nr) { 1788 if (PTR_ERR(rl2) != -ENOSPC || nr == min_nr) {
@@ -1951,20 +1954,21 @@ restore_undo_alloc:
1951 NVolSetErrors(vol); 1954 NVolSetErrors(vol);
1952 return ret; 1955 return ret;
1953 } 1956 }
1954 a = ctx->attr; 1957 ctx->attr->data.non_resident.highest_vcn =
1955 a->data.non_resident.highest_vcn = cpu_to_sle64(old_last_vcn - 1); 1958 cpu_to_sle64(old_last_vcn - 1);
1956undo_alloc: 1959undo_alloc:
1957 if (ntfs_cluster_free(mft_ni, old_last_vcn, -1) < 0) { 1960 if (ntfs_cluster_free(mft_ni, old_last_vcn, -1, ctx) < 0) {
1958 ntfs_error(vol->sb, "Failed to free clusters from mft data " 1961 ntfs_error(vol->sb, "Failed to free clusters from mft data "
1959 "attribute.%s", es); 1962 "attribute.%s", es);
1960 NVolSetErrors(vol); 1963 NVolSetErrors(vol);
1961 } 1964 }
1965 a = ctx->attr;
1962 if (ntfs_rl_truncate_nolock(vol, &mft_ni->runlist, old_last_vcn)) { 1966 if (ntfs_rl_truncate_nolock(vol, &mft_ni->runlist, old_last_vcn)) {
1963 ntfs_error(vol->sb, "Failed to truncate mft data attribute " 1967 ntfs_error(vol->sb, "Failed to truncate mft data attribute "
1964 "runlist.%s", es); 1968 "runlist.%s", es);
1965 NVolSetErrors(vol); 1969 NVolSetErrors(vol);
1966 } 1970 }
1967 if (mp_rebuilt) { 1971 if (mp_rebuilt && !IS_ERR(ctx->mrec)) {
1968 if (ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu( 1972 if (ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu(
1969 a->data.non_resident.mapping_pairs_offset), 1973 a->data.non_resident.mapping_pairs_offset),
1970 old_alen - le16_to_cpu( 1974 old_alen - le16_to_cpu(
@@ -1981,6 +1985,10 @@ undo_alloc:
1981 } 1985 }
1982 flush_dcache_mft_record_page(ctx->ntfs_ino); 1986 flush_dcache_mft_record_page(ctx->ntfs_ino);
1983 mark_mft_record_dirty(ctx->ntfs_ino); 1987 mark_mft_record_dirty(ctx->ntfs_ino);
1988 } else if (IS_ERR(ctx->mrec)) {
1989 ntfs_error(vol->sb, "Failed to restore attribute search "
1990 "context.%s", es);
1991 NVolSetErrors(vol);
1984 } 1992 }
1985 if (ctx) 1993 if (ctx)
1986 ntfs_attr_put_search_ctx(ctx); 1994 ntfs_attr_put_search_ctx(ctx);
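The mft.c error paths above (restore_undo_alloc, undo_alloc) follow the staged goto-unwind idiom: each failure jumps to the label that releases exactly what has been acquired so far. A minimal stand-alone sketch of the idiom (names invented):

#include <stdio.h>
#include <stdlib.h>

static int do_work(void)
{
	char *a, *b;
	int err = -1;

	a = malloc(32);
	if (!a)
		goto out;
	b = malloc(32);
	if (!b)
		goto undo_a;	/* free only what is already held */

	/* ... the actual work would go here ... */
	err = 0;

	free(b);
undo_a:
	free(a);
out:
	return err;
}

int main(void)
{
	return do_work() ? EXIT_FAILURE : EXIT_SUCCESS;
}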
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 453d0d51ea4b..6c16db9e1a8a 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -1447,7 +1447,7 @@ not_enabled:
1447 if (unlikely(i_size_read(tmp_ino) < sizeof(USN_HEADER))) { 1447 if (unlikely(i_size_read(tmp_ino) < sizeof(USN_HEADER))) {
1448 ntfs_error(vol->sb, "Found corrupt $UsnJrnl/$DATA/$Max " 1448 ntfs_error(vol->sb, "Found corrupt $UsnJrnl/$DATA/$Max "
1449 "attribute (size is 0x%llx but should be at " 1449 "attribute (size is 0x%llx but should be at "
1450 "least 0x%x bytes).", i_size_read(tmp_ino), 1450 "least 0x%zx bytes).", i_size_read(tmp_ino),
1451 sizeof(USN_HEADER)); 1451 sizeof(USN_HEADER));
1452 return FALSE; 1452 return FALSE;
1453 } 1453 }
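The one-line super.c fix above replaces %x with %zx because sizeof() yields a size_t, whose width differs between 32- and 64-bit builds; the z length modifier keeps the format string portable. For instance:

#include <stdio.h>

int main(void)
{
	/* 0x8 on LP64, 0x4 on ILP32; %zx matches size_t on both. */
	printf("sizeof(long) = 0x%zx\n", sizeof(long));
	return 0;
}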
diff --git a/fs/open.c b/fs/open.c
index f0d90cf0495c..8d06ec911fd9 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -739,7 +739,8 @@ asmlinkage long sys_fchown(unsigned int fd, uid_t user, gid_t group)
739} 739}
740 740
741static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt, 741static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
742 int flags, struct file *f) 742 int flags, struct file *f,
743 int (*open)(struct inode *, struct file *))
743{ 744{
744 struct inode *inode; 745 struct inode *inode;
745 int error; 746 int error;
@@ -761,11 +762,14 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
761 f->f_op = fops_get(inode->i_fop); 762 f->f_op = fops_get(inode->i_fop);
762 file_move(f, &inode->i_sb->s_files); 763 file_move(f, &inode->i_sb->s_files);
763 764
764 if (f->f_op && f->f_op->open) { 765 if (!open && f->f_op)
765 error = f->f_op->open(inode,f); 766 open = f->f_op->open;
767 if (open) {
768 error = open(inode, f);
766 if (error) 769 if (error)
767 goto cleanup_all; 770 goto cleanup_all;
768 } 771 }
772
769 f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); 773 f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
770 774
771 file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping); 775 file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping);
@@ -814,28 +818,75 @@ struct file *filp_open(const char * filename, int flags, int mode)
 {
 	int namei_flags, error;
 	struct nameidata nd;
-	struct file *f;
 
 	namei_flags = flags;
 	if ((namei_flags+1) & O_ACCMODE)
 		namei_flags++;
-	if (namei_flags & O_TRUNC)
-		namei_flags |= 2;
-
-	error = -ENFILE;
-	f = get_empty_filp();
-	if (f == NULL)
-		return ERR_PTR(error);
 
 	error = open_namei(filename, namei_flags, mode, &nd);
 	if (!error)
-		return __dentry_open(nd.dentry, nd.mnt, flags, f);
+		return nameidata_to_filp(&nd, flags);
 
-	put_filp(f);
 	return ERR_PTR(error);
 }
 EXPORT_SYMBOL(filp_open);
 
+/**
+ * lookup_instantiate_filp - instantiates the open intent filp
+ * @nd: pointer to nameidata
+ * @dentry: pointer to dentry
+ * @open: open callback
+ *
+ * Helper for filesystems that want to use lookup open intents and pass back
+ * a fully instantiated struct file to the caller.
+ * This function is meant to be called from within a filesystem's
+ * lookup method.
+ * Note that in case of error, nd->intent.open.file is destroyed, but the
+ * path information remains valid.
+ * If the open callback is set to NULL, then the standard f_op->open()
+ * filesystem callback is substituted.
+ */
+struct file *lookup_instantiate_filp(struct nameidata *nd, struct dentry *dentry,
+		int (*open)(struct inode *, struct file *))
+{
+	if (IS_ERR(nd->intent.open.file))
+		goto out;
+	if (IS_ERR(dentry))
+		goto out_err;
+	nd->intent.open.file = __dentry_open(dget(dentry), mntget(nd->mnt),
+			nd->intent.open.flags - 1,
+			nd->intent.open.file,
+			open);
+out:
+	return nd->intent.open.file;
+out_err:
+	release_open_intent(nd);
+	nd->intent.open.file = (struct file *)dentry;
+	goto out;
+}
+EXPORT_SYMBOL_GPL(lookup_instantiate_filp);
+
+/**
+ * nameidata_to_filp - convert a nameidata to an open filp.
+ * @nd: pointer to nameidata
+ * @flags: open flags
+ *
+ * Note that this function destroys the original nameidata
+ */
+struct file *nameidata_to_filp(struct nameidata *nd, int flags)
+{
+	struct file *filp;
+
+	/* Pick up the filp from the open intent */
+	filp = nd->intent.open.file;
+	/* Has the filesystem initialised the file for us? */
+	if (filp->f_dentry == NULL)
+		filp = __dentry_open(nd->dentry, nd->mnt, flags, filp, NULL);
+	else
+		path_release(nd);
+	return filp;
+}
+
 struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags)
 {
 	int error;
@@ -846,7 +897,7 @@ struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags)
 	if (f == NULL)
 		return ERR_PTR(error);
 
-	return __dentry_open(dentry, mnt, flags, f);
+	return __dentry_open(dentry, mnt, flags, f, NULL);
 }
 EXPORT_SYMBOL(dentry_open);
 
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 77e178f13162..9c06c5434ec4 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -192,6 +192,7 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
 struct part_attribute {
 	struct attribute attr;
 	ssize_t (*show)(struct hd_struct *,char *);
+	ssize_t (*store)(struct hd_struct *,const char *, size_t);
 };
 
 static ssize_t
@@ -201,14 +202,33 @@ part_attr_show(struct kobject * kobj, struct attribute * attr, char * page)
 	struct part_attribute * part_attr = container_of(attr,struct part_attribute,attr);
 	ssize_t ret = 0;
 	if (part_attr->show)
-		ret = part_attr->show(p,page);
+		ret = part_attr->show(p, page);
+	return ret;
+}
+static ssize_t
+part_attr_store(struct kobject * kobj, struct attribute * attr,
+		const char *page, size_t count)
+{
+	struct hd_struct * p = container_of(kobj,struct hd_struct,kobj);
+	struct part_attribute * part_attr = container_of(attr,struct part_attribute,attr);
+	ssize_t ret = 0;
+
+	if (part_attr->store)
+		ret = part_attr->store(p, page, count);
 	return ret;
 }
 
 static struct sysfs_ops part_sysfs_ops = {
 	.show = part_attr_show,
+	.store = part_attr_store,
 };
 
+static ssize_t part_uevent_store(struct hd_struct * p,
+		const char *page, size_t count)
+{
+	kobject_hotplug(&p->kobj, KOBJ_ADD);
+	return count;
+}
 static ssize_t part_dev_read(struct hd_struct * p, char *page)
 {
 	struct gendisk *disk = container_of(p->kobj.parent,struct gendisk,kobj);
@@ -229,6 +249,10 @@ static ssize_t part_stat_read(struct hd_struct * p, char *page)
 		p->reads, (unsigned long long)p->read_sectors,
 		p->writes, (unsigned long long)p->write_sectors);
 }
+static struct part_attribute part_attr_uevent = {
+	.attr = {.name = "uevent", .mode = S_IWUSR },
+	.store = part_uevent_store
+};
 static struct part_attribute part_attr_dev = {
 	.attr = {.name = "dev", .mode = S_IRUGO },
 	.show = part_dev_read
@@ -247,6 +271,7 @@ static struct part_attribute part_attr_stat = {
 };
 
 static struct attribute * default_attrs[] = {
+	&part_attr_uevent.attr,
 	&part_attr_dev.attr,
 	&part_attr_start.attr,
 	&part_attr_size.attr,
@@ -430,7 +455,7 @@ void del_gendisk(struct gendisk *disk)
 	disk->flags &= ~GENHD_FL_UP;
 	unlink_gendisk(disk);
 	disk_stat_set_all(disk, 0);
-	disk->stamp = disk->stamp_idle = 0;
+	disk->stamp = 0;
 
 	devfs_remove_disk(disk);
 
diff --git a/fs/proc/array.c b/fs/proc/array.c
index d84eecacbeaf..3e1239e4b303 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -438,7 +438,7 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole)
 		jiffies_to_clock_t(it_real_value),
 		start_time,
 		vsize,
-		mm ? get_mm_counter(mm, rss) : 0, /* you might want to shift this left 3 */
+		mm ? get_mm_rss(mm) : 0,
 		rsslim,
 		mm ? mm->start_code : 0,
 		mm ? mm->end_code : 0,
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 8a8c34461d48..b638fb500743 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -533,7 +533,7 @@ static void proc_kill_inodes(struct proc_dir_entry *de)
 	 */
 	file_list_lock();
 	list_for_each(p, &sb->s_files) {
-		struct file * filp = list_entry(p, struct file, f_list);
+		struct file * filp = list_entry(p, struct file, f_u.fu_list);
 		struct dentry * dentry = filp->f_dentry;
 		struct inode * inode;
 		struct file_operations *fops;
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index effa6c0c467a..e6a818a93f3d 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -156,10 +156,13 @@ struct inode *proc_get_inode(struct super_block *sb, unsigned int ino,
 
 	WARN_ON(de && de->deleted);
 
+	if (de != NULL && !try_module_get(de->owner))
+		goto out_mod;
+
 	inode = iget(sb, ino);
 	if (!inode)
-		goto out_fail;
+		goto out_ino;
 
 	PROC_I(inode)->pde = de;
 	if (de) {
 		if (de->mode) {
@@ -171,20 +174,20 @@
 		inode->i_size = de->size;
 		if (de->nlink)
 			inode->i_nlink = de->nlink;
-		if (!try_module_get(de->owner))
-			goto out_fail;
 		if (de->proc_iops)
 			inode->i_op = de->proc_iops;
 		if (de->proc_fops)
 			inode->i_fop = de->proc_fops;
 	}
 
-out:
 	return inode;
 
-out_fail:
+out_ino:
+	if (de != NULL)
+		module_put(de->owner);
+out_mod:
 	de_put(de);
-	goto out;
+	return NULL;
 }
 
 int proc_fill_super(struct super_block *s, void *data, int silent)
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index a3453555a94e..5b6b0b6038a7 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -629,12 +629,4 @@ void __init proc_misc_init(void)
 	if (entry)
 		entry->proc_fops = &proc_sysrq_trigger_operations;
 #endif
-#ifdef CONFIG_PPC32
-	{
-		extern struct file_operations ppc_htab_operations;
-		entry = create_proc_entry("ppc_htab", S_IRUGO|S_IWUSR, NULL);
-		if (entry)
-			entry->proc_fops = &ppc_htab_operations;
-	}
-#endif
 }
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index c7ef3e48e35b..d2fa42006d8f 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -14,22 +14,41 @@
 char *task_mem(struct mm_struct *mm, char *buffer)
 {
 	unsigned long data, text, lib;
+	unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
+
+	/*
+	 * Note: to minimize their overhead, mm maintains hiwater_vm and
+	 * hiwater_rss only when about to *lower* total_vm or rss. Any
+	 * collector of these hiwater stats must therefore get total_vm
+	 * and rss too, which will usually be the higher. Barriers? not
+	 * worth the effort, such snapshots can always be inconsistent.
+	 */
+	hiwater_vm = total_vm = mm->total_vm;
+	if (hiwater_vm < mm->hiwater_vm)
+		hiwater_vm = mm->hiwater_vm;
+	hiwater_rss = total_rss = get_mm_rss(mm);
+	if (hiwater_rss < mm->hiwater_rss)
+		hiwater_rss = mm->hiwater_rss;
 
 	data = mm->total_vm - mm->shared_vm - mm->stack_vm;
 	text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
 	lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
 	buffer += sprintf(buffer,
+		"VmPeak:\t%8lu kB\n"
 		"VmSize:\t%8lu kB\n"
 		"VmLck:\t%8lu kB\n"
+		"VmHWM:\t%8lu kB\n"
 		"VmRSS:\t%8lu kB\n"
 		"VmData:\t%8lu kB\n"
 		"VmStk:\t%8lu kB\n"
 		"VmExe:\t%8lu kB\n"
 		"VmLib:\t%8lu kB\n"
 		"VmPTE:\t%8lu kB\n",
-		(mm->total_vm - mm->reserved_vm) << (PAGE_SHIFT-10),
+		hiwater_vm << (PAGE_SHIFT-10),
+		(total_vm - mm->reserved_vm) << (PAGE_SHIFT-10),
 		mm->locked_vm << (PAGE_SHIFT-10),
-		get_mm_counter(mm, rss) << (PAGE_SHIFT-10),
+		hiwater_rss << (PAGE_SHIFT-10),
+		total_rss << (PAGE_SHIFT-10),
 		data << (PAGE_SHIFT-10),
 		mm->stack_vm << (PAGE_SHIFT-10), text, lib,
 		(PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10);
@@ -44,13 +63,11 @@ unsigned long task_vsize(struct mm_struct *mm)
 int task_statm(struct mm_struct *mm, int *shared, int *text,
 	       int *data, int *resident)
 {
-	int rss = get_mm_counter(mm, rss);
-
-	*shared = rss - get_mm_counter(mm, anon_rss);
+	*shared = get_mm_counter(mm, file_rss);
 	*text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
 								>> PAGE_SHIFT;
 	*data = mm->total_vm - mm->shared_vm;
-	*resident = rss;
+	*resident = *shared + get_mm_counter(mm, anon_rss);
 	return mm->total_vm;
 }
 
@@ -186,13 +203,14 @@ static void smaps_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 			    struct mem_size_stats *mss)
 {
 	pte_t *pte, ptent;
+	spinlock_t *ptl;
 	unsigned long pfn;
 	struct page *page;
 
-	pte = pte_offset_map(pmd, addr);
+	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	do {
 		ptent = *pte;
-		if (pte_none(ptent) || !pte_present(ptent))
+		if (!pte_present(ptent))
 			continue;
 
 		mss->resident += PAGE_SIZE;
@@ -213,8 +231,8 @@ static void smaps_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 			mss->private_clean += PAGE_SIZE;
 		}
 	} while (pte++, addr += PAGE_SIZE, addr != end);
-	pte_unmap(pte - 1);
-	cond_resched_lock(&vma->vm_mm->page_table_lock);
+	pte_unmap_unlock(pte - 1, ptl);
+	cond_resched();
 }
 
 static inline void smaps_pmd_range(struct vm_area_struct *vma, pud_t *pud,
@@ -268,17 +286,11 @@ static inline void smaps_pgd_range(struct vm_area_struct *vma,
 static int show_smap(struct seq_file *m, void *v)
 {
 	struct vm_area_struct *vma = v;
-	struct mm_struct *mm = vma->vm_mm;
 	struct mem_size_stats mss;
 
 	memset(&mss, 0, sizeof mss);
-
-	if (mm) {
-		spin_lock(&mm->page_table_lock);
+	if (vma->vm_mm)
 		smaps_pgd_range(vma, vma->vm_start, vma->vm_end, &mss);
-		spin_unlock(&mm->page_table_lock);
-	}
-
 	return show_map_internal(m, v, &mss);
 }
 
@@ -407,7 +419,6 @@ static struct numa_maps *get_numa_maps(const struct vm_area_struct *vma)
 	for_each_node(i)
 		md->node[i] =0;
 
-	spin_lock(&mm->page_table_lock);
 	for (vaddr = vma->vm_start; vaddr < vma->vm_end; vaddr += PAGE_SIZE) {
 		page = follow_page(mm, vaddr, 0);
 		if (page) {
@@ -422,8 +433,8 @@ static struct numa_maps *get_numa_maps(const struct vm_area_struct *vma)
 				md->anon++;
 			md->node[page_to_nid(page)]++;
 		}
+		cond_resched();
 	}
-	spin_unlock(&mm->page_table_lock);
 	return md;
 }
 
@@ -469,7 +480,7 @@ static int show_numa_map(struct seq_file *m, void *v)
 		seq_printf(m, " interleave={");
 		first = 1;
 		for_each_node(n) {
-			if (test_bit(n, pol->v.nodes)) {
+			if (node_isset(n, pol->v.nodes)) {
 				if (!first)
 					seq_putc(m,',');
 				else
diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c
index 2706e2adffab..45829889dcdc 100644
--- a/fs/reiserfs/fix_node.c
+++ b/fs/reiserfs/fix_node.c
@@ -2022,7 +2022,7 @@ static int get_neighbors(struct tree_balance *p_s_tb, int n_h)
 }
 
 #ifdef CONFIG_REISERFS_CHECK
-void *reiserfs_kmalloc(size_t size, int flags, struct super_block *s)
+void *reiserfs_kmalloc(size_t size, gfp_t flags, struct super_block *s)
 {
 	void *vp;
 	static size_t malloced;
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index d76ee6c4f9b8..5f82352b97e1 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -2842,7 +2842,7 @@ static int reiserfs_set_page_dirty(struct page *page)
  * even in -o notail mode, we can't be sure an old mount without -o notail
  * didn't create files with tails.
  */
-static int reiserfs_releasepage(struct page *page, int unused_gfp_flags)
+static int reiserfs_releasepage(struct page *page, gfp_t unused_gfp_flags)
 {
 	struct inode *inode = page->mapping->host;
 	struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb);
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 44b02fc02ebe..42afb5bef111 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -1024,12 +1024,8 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin
 			strcpy(REISERFS_SB(s)->s_qf_names[qtype], arg);
 			*mount_options |= 1 << REISERFS_QUOTA;
 		} else {
-			if (REISERFS_SB(s)->s_qf_names[qtype]) {
-				kfree(REISERFS_SB(s)->
-				      s_qf_names[qtype]);
-				REISERFS_SB(s)->s_qf_names[qtype] =
-				    NULL;
-			}
+			kfree(REISERFS_SB(s)->s_qf_names[qtype]);
+			REISERFS_SB(s)->s_qf_names[qtype] = NULL;
 		}
 	}
 	if (c == 'f') {
@@ -1158,11 +1154,10 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
 	if (!reiserfs_parse_options
 	    (s, arg, &mount_options, &blocks, NULL, &commit_max_age)) {
 #ifdef CONFIG_QUOTA
-		for (i = 0; i < MAXQUOTAS; i++)
-			if (REISERFS_SB(s)->s_qf_names[i]) {
-				kfree(REISERFS_SB(s)->s_qf_names[i]);
-				REISERFS_SB(s)->s_qf_names[i] = NULL;
-			}
+		for (i = 0; i < MAXQUOTAS; i++) {
+			kfree(REISERFS_SB(s)->s_qf_names[i]);
+			REISERFS_SB(s)->s_qf_names[i] = NULL;
+		}
 #endif
 		return -EINVAL;
 	}
@@ -1940,13 +1935,11 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
 	brelse(SB_BUFFER_WITH_SB(s));
 #ifdef CONFIG_QUOTA
 	for (j = 0; j < MAXQUOTAS; j++) {
-		if (sbi->s_qf_names[j])
-			kfree(sbi->s_qf_names[j]);
+		kfree(sbi->s_qf_names[j]);
+		sbi->s_qf_names[j] = NULL;
 	}
 #endif
-	if (sbi != NULL) {
-		kfree(sbi);
-	}
+	kfree(sbi);
 
 	s->s_fs_info = NULL;
 	return errval;
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 87ac9dc8b381..72e120798677 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -453,7 +453,7 @@ static struct page *reiserfs_get_page(struct inode *dir, unsigned long n)
 	struct page *page;
 	/* We can deadlock if we try to free dentries,
 	   and an unlink/rmdir has just occured - GFP_NOFS avoids this */
-	mapping->flags = (mapping->flags & ~__GFP_BITS_MASK) | GFP_NOFS;
+	mapping_set_gfp_mask(mapping, GFP_NOFS);
 	page = read_cache_page(mapping, n,
 			       (filler_t *) mapping->a_ops->readpage, NULL);
 	if (!IS_ERR(page)) {
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index 6703efa3c430..a47ac9aac8b2 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -296,8 +296,7 @@ reiserfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 		}
 	}
 
-	if (value)
-		kfree(value);
+	kfree(value);
 
 	if (!error) {
 		/* Release the old one */
diff --git a/fs/super.c b/fs/super.c
index 6e57ee252e14..f60155ec7780 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -513,7 +513,7 @@ static void mark_files_ro(struct super_block *sb)
 	struct file *f;
 
 	file_list_lock();
-	list_for_each_entry(f, &sb->s_files, f_list) {
+	list_for_each_entry(f, &sb->s_files, f_u.fu_list) {
 		if (S_ISREG(f->f_dentry->d_inode->i_mode) && file_count(f))
 			f->f_mode &= ~FMODE_WRITE;
 	}
diff --git a/fs/vfat/namei.c b/fs/vfat/namei.c
index 1c6f6b57ef1c..ef46939c0c1a 100644
--- a/fs/vfat/namei.c
+++ b/fs/vfat/namei.c
@@ -621,8 +621,7 @@ static int vfat_build_slots(struct inode *dir, const unsigned char *name,
 	}
 
 	/* build the entry of long file name */
-	for (cksum = i = 0; i < 11; i++)
-		cksum = (((cksum&1)<<7)|((cksum&0xfe)>>1)) + msdos_name[i];
+	cksum = fat_checksum(msdos_name);
 
 	*nr_slots = usize / 13;
 	for (ps = slots, i = *nr_slots; i > 0; i--, ps++) {
@@ -888,10 +887,10 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
 {
 	struct buffer_head *dotdot_bh;
 	struct msdos_dir_entry *dotdot_de;
-	loff_t dotdot_i_pos;
 	struct inode *old_inode, *new_inode;
 	struct fat_slot_info old_sinfo, sinfo;
 	struct timespec ts;
+	loff_t dotdot_i_pos, new_i_pos;
 	int err, is_dir, update_dotdot, corrupt = 0;
 
 	old_sinfo.bh = sinfo.bh = dotdot_bh = NULL;
@@ -914,31 +913,24 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
 
 	ts = CURRENT_TIME_SEC;
 	if (new_inode) {
-		err = vfat_find(new_dir, &new_dentry->d_name, &sinfo);
-		if (err)
-			goto out;
-		if (MSDOS_I(new_inode)->i_pos != sinfo.i_pos) {
-			/* WTF??? Cry and fail. */
-			printk(KERN_WARNING "vfat_rename: fs corrupted\n");
-			goto out;
-		}
-
 		if (is_dir) {
 			err = fat_dir_empty(new_inode);
 			if (err)
 				goto out;
 		}
+		new_i_pos = MSDOS_I(new_inode)->i_pos;
 		fat_detach(new_inode);
 	} else {
 		err = vfat_add_entry(new_dir, &new_dentry->d_name, is_dir, 0,
 				     &ts, &sinfo);
 		if (err)
 			goto out;
+		new_i_pos = sinfo.i_pos;
 	}
 	new_dir->i_version++;
 
 	fat_detach(old_inode);
-	fat_attach(old_inode, sinfo.i_pos);
+	fat_attach(old_inode, new_i_pos);
 	if (IS_DIRSYNC(new_dir)) {
 		err = fat_sync_inode(old_inode);
 		if (err)
@@ -1002,7 +994,7 @@ error_inode:
 	fat_detach(old_inode);
 	fat_attach(old_inode, old_sinfo.i_pos);
 	if (new_inode) {
-		fat_attach(new_inode, sinfo.i_pos);
+		fat_attach(new_inode, new_i_pos);
 		if (corrupt)
 			corrupt |= fat_sync_inode(new_inode);
 	} else {
diff --git a/fs/xattr.c b/fs/xattr.c
index 3f9c64bea151..f6e00c0e114f 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -143,7 +143,7 @@ getxattr(struct dentry *d, char __user *name, void __user *value, size_t size)
 	if (size) {
 		if (size > XATTR_SIZE_MAX)
 			size = XATTR_SIZE_MAX;
-		kvalue = kmalloc(size, GFP_KERNEL);
+		kvalue = kzalloc(size, GFP_KERNEL);
 		if (!kvalue)
 			return -ENOMEM;
 	}
@@ -154,11 +154,15 @@ getxattr(struct dentry *d, char __user *name, void __user *value, size_t size)
 	error = -EOPNOTSUPP;
 	if (d->d_inode->i_op && d->d_inode->i_op->getxattr)
 		error = d->d_inode->i_op->getxattr(d, kname, kvalue, size);
-	else if (!strncmp(kname, XATTR_SECURITY_PREFIX,
-			sizeof XATTR_SECURITY_PREFIX - 1)) {
+
+	if (!strncmp(kname, XATTR_SECURITY_PREFIX,
+			sizeof XATTR_SECURITY_PREFIX - 1)) {
 		const char *suffix = kname + sizeof XATTR_SECURITY_PREFIX - 1;
-		error = security_inode_getsecurity(d->d_inode, suffix, kvalue,
-				size);
+		int rv = security_inode_getsecurity(d->d_inode, suffix, kvalue,
+				size, error);
+		/* Security module active: overwrite error value */
+		if (rv != -EOPNOTSUPP)
+			error = rv;
 	}
 	if (error > 0) {
 		if (size && copy_to_user(value, kvalue, error))
diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c
index d2653b589b1c..3c92162dc728 100644
--- a/fs/xfs/linux-2.6/kmem.c
+++ b/fs/xfs/linux-2.6/kmem.c
@@ -45,11 +45,11 @@
 
 
 void *
-kmem_alloc(size_t size, gfp_t flags)
+kmem_alloc(size_t size, unsigned int __nocast flags)
 {
 	int retries = 0;
-	unsigned int lflags = kmem_flags_convert(flags);
+	gfp_t lflags = kmem_flags_convert(flags);
 	void *ptr;
 
 	do {
 		if (size < MAX_SLAB_SIZE || retries > MAX_VMALLOCS)
@@ -67,7 +67,7 @@ kmem_alloc(size_t size, gfp_t flags)
 }
 
 void *
-kmem_zalloc(size_t size, gfp_t flags)
+kmem_zalloc(size_t size, unsigned int __nocast flags)
 {
 	void *ptr;
 
@@ -90,7 +90,7 @@ kmem_free(void *ptr, size_t size)
 
 void *
 kmem_realloc(void *ptr, size_t newsize, size_t oldsize,
-	     gfp_t flags)
+	     unsigned int __nocast flags)
 {
 	void *new;
 
@@ -105,11 +105,11 @@ kmem_realloc(void *ptr, size_t newsize, size_t oldsize,
 }
 
 void *
-kmem_zone_alloc(kmem_zone_t *zone, gfp_t flags)
+kmem_zone_alloc(kmem_zone_t *zone, unsigned int __nocast flags)
 {
 	int retries = 0;
-	unsigned int lflags = kmem_flags_convert(flags);
+	gfp_t lflags = kmem_flags_convert(flags);
 	void *ptr;
 
 	do {
 		ptr = kmem_cache_alloc(zone, lflags);
@@ -124,7 +124,7 @@ kmem_zone_alloc(kmem_zone_t *zone, gfp_t flags)
 }
 
 void *
-kmem_zone_zalloc(kmem_zone_t *zone, gfp_t flags)
+kmem_zone_zalloc(kmem_zone_t *zone, unsigned int __nocast flags)
 {
 	void *ptr;
 
diff --git a/fs/xfs/linux-2.6/kmem.h b/fs/xfs/linux-2.6/kmem.h
index ee7010f085bc..f4bb78c268c0 100644
--- a/fs/xfs/linux-2.6/kmem.h
+++ b/fs/xfs/linux-2.6/kmem.h
@@ -81,9 +81,9 @@ typedef unsigned long xfs_pflags_t;
 	*(NSTATEP) = *(OSTATEP); \
 } while (0)
 
-static __inline unsigned int kmem_flags_convert(gfp_t flags)
+static __inline gfp_t kmem_flags_convert(unsigned int __nocast flags)
 {
-	unsigned int lflags = __GFP_NOWARN; /* we'll report problems, if need be */
+	gfp_t lflags = __GFP_NOWARN; /* we'll report problems, if need be */
 
 #ifdef DEBUG
 	if (unlikely(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL))) {
@@ -125,16 +125,16 @@ kmem_zone_destroy(kmem_zone_t *zone)
 		BUG();
 }
 
-extern void *kmem_zone_zalloc(kmem_zone_t *, gfp_t);
-extern void *kmem_zone_alloc(kmem_zone_t *, gfp_t);
+extern void *kmem_zone_zalloc(kmem_zone_t *, unsigned int __nocast);
+extern void *kmem_zone_alloc(kmem_zone_t *, unsigned int __nocast);
 
-extern void *kmem_alloc(size_t, gfp_t);
-extern void *kmem_realloc(void *, size_t, size_t, gfp_t);
-extern void *kmem_zalloc(size_t, gfp_t);
+extern void *kmem_alloc(size_t, unsigned int __nocast);
+extern void *kmem_realloc(void *, size_t, size_t, unsigned int __nocast);
+extern void *kmem_zalloc(size_t, unsigned int __nocast);
 extern void kmem_free(void *, size_t);
 
 typedef struct shrinker *kmem_shaker_t;
-typedef int (*kmem_shake_func_t)(int, unsigned int);
+typedef int (*kmem_shake_func_t)(int, gfp_t);
 
 static __inline kmem_shaker_t
 kmem_shake_register(kmem_shake_func_t sfunc)
@@ -149,7 +149,7 @@ kmem_shake_deregister(kmem_shaker_t shrinker)
 }
 
 static __inline int
-kmem_shake_allow(unsigned int gfp_mask)
+kmem_shake_allow(gfp_t gfp_mask)
 {
 	return (gfp_mask & __GFP_WAIT);
 }
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index c6c077978fe3..7aa398724706 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1296,7 +1296,7 @@ linvfs_invalidate_page(
 STATIC int
 linvfs_release_page(
 	struct page *page,
-	int gfp_mask)
+	gfp_t gfp_mask)
 {
 	struct inode *inode = page->mapping->host;
 	int dirty, delalloc, unmapped, unwritten;
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index e82cf72ac599..4cd46abe8434 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -64,7 +64,7 @@
 
 STATIC kmem_cache_t *pagebuf_zone;
 STATIC kmem_shaker_t pagebuf_shake;
-STATIC int xfsbufd_wakeup(int, unsigned int);
+STATIC int xfsbufd_wakeup(int, gfp_t);
 STATIC void pagebuf_delwri_queue(xfs_buf_t *, int);
 
 STATIC struct workqueue_struct *xfslogd_workqueue;
@@ -181,8 +181,9 @@ set_page_region(
 	size_t offset,
 	size_t length)
 {
-	page->private |= page_region_mask(offset, length);
-	if (page->private == ~0UL)
+	set_page_private(page,
+		page_private(page) | page_region_mask(offset, length));
+	if (page_private(page) == ~0UL)
 		SetPageUptodate(page);
 }
 
@@ -194,7 +195,7 @@ test_page_region(
 {
 	unsigned long mask = page_region_mask(offset, length);
 
-	return (mask && (page->private & mask) == mask);
+	return (mask && (page_private(page) & mask) == mask);
 }
 
 /*
@@ -383,7 +384,7 @@ _pagebuf_lookup_pages(
 	size_t blocksize = bp->pb_target->pbr_bsize;
 	size_t size = bp->pb_count_desired;
 	size_t nbytes, offset;
-	int gfp_mask = pb_to_gfp(flags);
+	gfp_t gfp_mask = pb_to_gfp(flags);
 	unsigned short page_count, i;
 	pgoff_t first;
 	loff_t end;
@@ -1749,8 +1750,8 @@ STATIC int xfsbufd_force_sleep;
 
 STATIC int
 xfsbufd_wakeup(
 	int priority,
-	unsigned int mask)
+	gfp_t mask)
 {
 	if (xfsbufd_force_sleep)
 		return 0;