path: root/fs
author     David Woodhouse <dwmw2@infradead.org>    2007-10-13 09:58:23 -0400
committer  David Woodhouse <dwmw2@infradead.org>    2007-10-13 09:58:23 -0400
commit     ebf8889bd1fe3615991ff4494635d237280652a2 (patch)
tree       10fb735717122bbb86474339eac07f26e7ccdf40 /fs
parent     b160292cc216a50fd0cd386b0bda2cd48352c73b (diff)
parent     752097cec53eea111d087c545179b421e2bde98a (diff)
Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6
Diffstat (limited to 'fs')
-rw-r--r--  fs/afs/netdevices.c | 5
-rw-r--r--  fs/binfmt_elf.c | 14
-rw-r--r--  fs/bio.c | 50
-rw-r--r--  fs/block_dev.c | 2
-rw-r--r--  fs/buffer.c | 6
-rw-r--r--  fs/compat_ioctl.c | 692
-rw-r--r--  fs/debugfs/file.c | 36
-rw-r--r--  fs/direct-io.c | 13
-rw-r--r--  fs/dlm/dlm_internal.h | 1
-rw-r--r--  fs/dlm/lock.c | 142
-rw-r--r--  fs/dlm/lock.h | 3
-rw-r--r--  fs/dlm/lockspace.c | 3
-rw-r--r--  fs/dlm/lowcomms.c | 23
-rw-r--r--  fs/dlm/member.c | 41
-rw-r--r--  fs/dlm/midcomms.c | 17
-rw-r--r--  fs/dlm/rcom.c | 36
-rw-r--r--  fs/dlm/rcom.h | 5
-rw-r--r--  fs/dlm/recoverd.c | 11
-rw-r--r--  fs/dlm/requestqueue.c | 58
-rw-r--r--  fs/dlm/requestqueue.h | 4
-rw-r--r--  fs/ecryptfs/netlink.c | 16
-rw-r--r--  fs/fs-writeback.c | 1
-rw-r--r--  fs/gfs2/bmap.c | 35
-rw-r--r--  fs/gfs2/daemon.c | 24
-rw-r--r--  fs/gfs2/daemon.h | 1
-rw-r--r--  fs/gfs2/dir.c | 3
-rw-r--r--  fs/gfs2/eaops.c | 8
-rw-r--r--  fs/gfs2/eaops.h | 4
-rw-r--r--  fs/gfs2/glock.c | 293
-rw-r--r--  fs/gfs2/glock.h | 5
-rw-r--r--  fs/gfs2/glops.c | 24
-rw-r--r--  fs/gfs2/incore.h | 31
-rw-r--r--  fs/gfs2/inode.c | 78
-rw-r--r--  fs/gfs2/inode.h | 3
-rw-r--r--  fs/gfs2/locking/dlm/lock_dlm.h | 1
-rw-r--r--  fs/gfs2/locking/dlm/plock.c | 11
-rw-r--r--  fs/gfs2/locking/dlm/sysfs.c | 2
-rw-r--r--  fs/gfs2/locking/dlm/thread.c | 20
-rw-r--r--  fs/gfs2/locking/nolock/main.c | 1
-rw-r--r--  fs/gfs2/log.c | 230
-rw-r--r--  fs/gfs2/log.h | 2
-rw-r--r--  fs/gfs2/lops.c | 470
-rw-r--r--  fs/gfs2/main.c | 3
-rw-r--r--  fs/gfs2/meta_io.c | 136
-rw-r--r--  fs/gfs2/meta_io.h | 6
-rw-r--r--  fs/gfs2/mount.c | 5
-rw-r--r--  fs/gfs2/ops_address.c | 146
-rw-r--r--  fs/gfs2/ops_export.c | 2
-rw-r--r--  fs/gfs2/ops_file.c | 13
-rw-r--r--  fs/gfs2/ops_fstype.c | 40
-rw-r--r--  fs/gfs2/ops_inode.c | 38
-rw-r--r--  fs/gfs2/ops_super.c | 14
-rw-r--r--  fs/gfs2/quota.c | 13
-rw-r--r--  fs/gfs2/recovery.c | 2
-rw-r--r--  fs/gfs2/rgrp.c | 39
-rw-r--r--  fs/gfs2/super.c | 6
-rw-r--r--  fs/gfs2/sys.c | 4
-rw-r--r--  fs/gfs2/trans.c | 22
-rw-r--r--  fs/gfs2/trans.h | 2
-rw-r--r--  fs/jfs/jfs_logmgr.c | 12
-rw-r--r--  fs/jfs/jfs_metapage.c | 15
-rw-r--r--  fs/mpage.c | 12
-rw-r--r--  fs/ntfs/ChangeLog | 12
-rw-r--r--  fs/ntfs/Makefile | 2
-rw-r--r--  fs/ntfs/aops.c | 22
-rw-r--r--  fs/ntfs/attrib.c | 8
-rw-r--r--  fs/ntfs/file.c | 36
-rw-r--r--  fs/ntfs/inode.c | 3
-rw-r--r--  fs/ntfs/logfile.c | 143
-rw-r--r--  fs/ntfs/runlist.c | 4
-rw-r--r--  fs/ocfs2/alloc.c | 482
-rw-r--r--  fs/ocfs2/alloc.h | 7
-rw-r--r--  fs/ocfs2/aops.c | 309
-rw-r--r--  fs/ocfs2/aops.h | 6
-rw-r--r--  fs/ocfs2/cluster/heartbeat.c | 7
-rw-r--r--  fs/ocfs2/cluster/masklog.c | 3
-rw-r--r--  fs/ocfs2/dir.c | 1423
-rw-r--r--  fs/ocfs2/dir.h | 48
-rw-r--r--  fs/ocfs2/dlmglue.c | 2
-rw-r--r--  fs/ocfs2/dlmglue.h | 4
-rw-r--r--  fs/ocfs2/export.c | 8
-rw-r--r--  fs/ocfs2/extent_map.c | 6
-rw-r--r--  fs/ocfs2/file.c | 298
-rw-r--r--  fs/ocfs2/file.h | 2
-rw-r--r--  fs/ocfs2/inode.c | 7
-rw-r--r--  fs/ocfs2/inode.h | 1
-rw-r--r--  fs/ocfs2/journal.c | 120
-rw-r--r--  fs/ocfs2/journal.h | 3
-rw-r--r--  fs/ocfs2/namei.c | 552
-rw-r--r--  fs/ocfs2/namei.h | 19
-rw-r--r--  fs/ocfs2/ocfs2.h | 7
-rw-r--r--  fs/ocfs2/ocfs2_fs.h | 64
-rw-r--r--  fs/ocfs2/super.c | 62
-rw-r--r--  fs/ocfs2/sysfile.c | 10
-rw-r--r--  fs/partitions/check.c | 12
-rw-r--r--  fs/proc/Makefile | 1
-rw-r--r--  fs/proc/internal.h | 5
-rw-r--r--  fs/proc/proc_net.c | 200
-rw-r--r--  fs/proc/root.c | 8
-rw-r--r--  fs/seq_file.c | 33
-rw-r--r--  fs/sysfs/bin.c | 36
-rw-r--r--  fs/sysfs/dir.c | 754
-rw-r--r--  fs/sysfs/file.c | 248
-rw-r--r--  fs/sysfs/group.c | 2
-rw-r--r--  fs/sysfs/inode.c | 103
-rw-r--r--  fs/sysfs/mount.c | 26
-rw-r--r--  fs/sysfs/symlink.c | 34
-rw-r--r--  fs/sysfs/sysfs.h | 184
-rw-r--r--  fs/xfs/linux-2.6/xfs_aops.c | 7
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.c | 7
110 files changed, 4599 insertions(+), 3696 deletions(-)
diff --git a/fs/afs/netdevices.c b/fs/afs/netdevices.c
index fc27d4b52e5f..49f189423063 100644
--- a/fs/afs/netdevices.c
+++ b/fs/afs/netdevices.c
@@ -8,6 +8,7 @@
 #include <linux/inetdevice.h>
 #include <linux/netdevice.h>
 #include <linux/if_arp.h>
+#include <net/net_namespace.h>
 #include "internal.h"

 /*
@@ -23,7 +24,7 @@ int afs_get_MAC_address(u8 *mac, size_t maclen)
         BUG();

         rtnl_lock();
-        dev = __dev_getfirstbyhwtype(ARPHRD_ETHER);
+        dev = __dev_getfirstbyhwtype(&init_net, ARPHRD_ETHER);
         if (dev) {
                 memcpy(mac, dev->dev_addr, maclen);
                 ret = 0;
@@ -47,7 +48,7 @@ int afs_get_ipv4_interfaces(struct afs_interface *bufs, size_t maxbufs,
         ASSERT(maxbufs > 0);

         rtnl_lock();
-        for_each_netdev(dev) {
+        for_each_netdev(&init_net, dev) {
                 if (dev->type == ARPHRD_LOOPBACK && !wantloopback)
                         continue;
                 idev = __in_dev_get_rtnl(dev);
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 4482a0673b15..b1013f34085d 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1514,9 +1514,6 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file)
         int thread_status_size = 0;
         elf_addr_t *auxv;
         unsigned long mm_flags;
-#ifdef ELF_CORE_WRITE_EXTRA_NOTES
-        int extra_notes_size;
-#endif

         /*
          * We no longer stop all VM operations.
@@ -1645,10 +1642,7 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file)

         sz += thread_status_size;

-#ifdef ELF_CORE_WRITE_EXTRA_NOTES
-        extra_notes_size = ELF_CORE_EXTRA_NOTES_SIZE;
-        sz += extra_notes_size;
-#endif
+        sz += elf_coredump_extra_notes_size();

         fill_elf_note_phdr(&phdr, sz, offset);
         offset += sz;
@@ -1698,10 +1692,8 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file)
                 if (!writenote(notes + i, file, &foffset))
                         goto end_coredump;

-#ifdef ELF_CORE_WRITE_EXTRA_NOTES
-        ELF_CORE_WRITE_EXTRA_NOTES;
-        foffset += extra_notes_size;
-#endif
+        if (elf_coredump_extra_notes_write(file, &foffset))
+                goto end_coredump;

         /* write out the thread status notes section */
         list_for_each(t, &thread_list) {
diff --git a/fs/bio.c b/fs/bio.c
index 29a44c1b64c6..5f604f269dfa 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -798,13 +798,9 @@ void bio_unmap_user(struct bio *bio)
         bio_put(bio);
 }

-static int bio_map_kern_endio(struct bio *bio, unsigned int bytes_done, int err)
+static void bio_map_kern_endio(struct bio *bio, int err)
 {
-        if (bio->bi_size)
-                return 1;
-
         bio_put(bio);
-        return 0;
 }


@@ -1002,34 +998,26 @@ void bio_check_pages_dirty(struct bio *bio)
 /**
  * bio_endio - end I/O on a bio
  * @bio:        bio
- * @bytes_done: number of bytes completed
  * @error:      error, if any
  *
  * Description:
- *   bio_endio() will end I/O on @bytes_done number of bytes. This may be
- *   just a partial part of the bio, or it may be the whole bio. bio_endio()
- *   is the preferred way to end I/O on a bio, it takes care of decrementing
- *   bi_size and clearing BIO_UPTODATE on error. @error is 0 on success, and
- *   and one of the established -Exxxx (-EIO, for instance) error values in
- *   case something went wrong. Noone should call bi_end_io() directly on
- *   a bio unless they own it and thus know that it has an end_io function.
+ *   bio_endio() will end I/O on the whole bio. bio_endio() is the
+ *   preferred way to end I/O on a bio, it takes care of clearing
+ *   BIO_UPTODATE on error. @error is 0 on success, and and one of the
+ *   established -Exxxx (-EIO, for instance) error values in case
+ *   something went wrong. Noone should call bi_end_io() directly on a
+ *   bio unless they own it and thus know that it has an end_io
+ *   function.
  **/
-void bio_endio(struct bio *bio, unsigned int bytes_done, int error)
+void bio_endio(struct bio *bio, int error)
 {
         if (error)
                 clear_bit(BIO_UPTODATE, &bio->bi_flags);
-
-        if (unlikely(bytes_done > bio->bi_size)) {
-                printk("%s: want %u bytes done, only %u left\n", __FUNCTION__,
-                        bytes_done, bio->bi_size);
-                bytes_done = bio->bi_size;
-        }
-
-        bio->bi_size -= bytes_done;
-        bio->bi_sector += (bytes_done >> 9);
+        else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
+                error = -EIO;

         if (bio->bi_end_io)
-                bio->bi_end_io(bio, bytes_done, error);
+                bio->bi_end_io(bio, error);
 }

 void bio_pair_release(struct bio_pair *bp)
@@ -1037,37 +1025,29 @@ void bio_pair_release(struct bio_pair *bp)
         if (atomic_dec_and_test(&bp->cnt)) {
                 struct bio *master = bp->bio1.bi_private;

-                bio_endio(master, master->bi_size, bp->error);
+                bio_endio(master, bp->error);
                 mempool_free(bp, bp->bio2.bi_private);
         }
 }

-static int bio_pair_end_1(struct bio * bi, unsigned int done, int err)
+static void bio_pair_end_1(struct bio *bi, int err)
 {
         struct bio_pair *bp = container_of(bi, struct bio_pair, bio1);

         if (err)
                 bp->error = err;

-        if (bi->bi_size)
-                return 1;
-
         bio_pair_release(bp);
-        return 0;
 }

-static int bio_pair_end_2(struct bio * bi, unsigned int done, int err)
+static void bio_pair_end_2(struct bio *bi, int err)
 {
         struct bio_pair *bp = container_of(bi, struct bio_pair, bio2);

         if (err)
                 bp->error = err;

-        if (bi->bi_size)
-                return 1;
-
         bio_pair_release(bp);
-        return 0;
 }

 /*
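[Editorial sketch, not part of the merge: the fs/bio.c hunks above drop the bytes_done argument from bi_end_io callbacks and make them return void. As a minimal illustration of what a caller-side completion handler looks like against the new prototype, with hypothetical names (example_end_io, the completion variable) that do not appear in this patch:]

/* Hedged sketch only: a hypothetical end_io handler written against the
 * post-change bi_end_io signature shown above. */
static void example_end_io(struct bio *bio, int error)
{
        struct completion *waiting = bio->bi_private;

        /* The old "if (bio->bi_size) return 1;" partial-completion check is
         * gone: bio_endio() now runs exactly once, for the whole bio. */
        if (error)
                clear_bit(BIO_UPTODATE, &bio->bi_flags);

        complete(waiting);
        bio_put(bio);
}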
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 2980eabe5779..6339a30879b7 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -172,7 +172,7 @@ blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 }

 #if 0
-static int blk_end_aio(struct bio *bio, unsigned int bytes_done, int error)
+static void blk_end_aio(struct bio *bio, int error)
 {
         struct kiocb *iocb = bio->bi_private;
         atomic_t *bio_count = &iocb->ki_bio_count;
diff --git a/fs/buffer.c b/fs/buffer.c
index 0e5ec371ce72..75b51dfa5e03 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2634,13 +2634,10 @@ sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
         return tmp.b_blocknr;
 }

-static int end_bio_bh_io_sync(struct bio *bio, unsigned int bytes_done, int err)
+static void end_bio_bh_io_sync(struct bio *bio, int err)
 {
         struct buffer_head *bh = bio->bi_private;

-        if (bio->bi_size)
-                return 1;
-
         if (err == -EOPNOTSUPP) {
                 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
                 set_bit(BH_Eopnotsupp, &bh->b_state);
@@ -2648,7 +2645,6 @@ static int end_bio_bh_io_sync(struct bio *bio, unsigned int bytes_done, int err)

         bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
         bio_put(bio);
-        return 0;
 }

 int submit_bh(int rw, struct buffer_head * bh)
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 37310b0e8107..9c3fd07f35e0 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -21,7 +21,6 @@
 #include <linux/if.h>
 #include <linux/if_bridge.h>
 #include <linux/slab.h>
-#include <linux/hdreg.h>
 #include <linux/raid/md.h>
 #include <linux/kd.h>
 #include <linux/dirent.h>
@@ -33,12 +32,10 @@
 #include <linux/vt.h>
 #include <linux/fs.h>
 #include <linux/file.h>
-#include <linux/fd.h>
 #include <linux/ppp_defs.h>
 #include <linux/if_ppp.h>
 #include <linux/if_pppox.h>
 #include <linux/mtio.h>
-#include <linux/cdrom.h>
 #include <linux/auto_fs.h>
 #include <linux/auto_fs4.h>
 #include <linux/tty.h>
@@ -48,7 +45,6 @@
 #include <linux/netdevice.h>
 #include <linux/raw.h>
 #include <linux/smb_fs.h>
-#include <linux/blkpg.h>
 #include <linux/blkdev.h>
 #include <linux/elevator.h>
 #include <linux/rtc.h>
@@ -62,7 +58,6 @@
 #include <linux/i2c-dev.h>
 #include <linux/wireless.h>
 #include <linux/atalk.h>
-#include <linux/blktrace_api.h>
 #include <linux/loop.h>

 #include <net/bluetooth/bluetooth.h>
@@ -324,22 +319,21 @@ struct ifconf32 {

 static int dev_ifname32(unsigned int fd, unsigned int cmd, unsigned long arg)
 {
-        struct net_device *dev;
-        struct ifreq32 ifr32;
+        struct ifreq __user *uifr;
         int err;

-        if (copy_from_user(&ifr32, compat_ptr(arg), sizeof(ifr32)))
+        uifr = compat_alloc_user_space(sizeof(struct ifreq));
+        if (copy_in_user(uifr, compat_ptr(arg), sizeof(struct ifreq32)));
                 return -EFAULT;

-        dev = dev_get_by_index(ifr32.ifr_ifindex);
-        if (!dev)
-                return -ENODEV;
+        err = sys_ioctl(fd, SIOCGIFNAME, (unsigned long)uifr);
+        if (err)
+                return err;
+
+        if (copy_in_user(compat_ptr(arg), uifr, sizeof(struct ifreq32)))
+                return -EFAULT;

-        strlcpy(ifr32.ifr_name, dev->name, sizeof(ifr32.ifr_name));
-        dev_put(dev);
-
-        err = copy_to_user(compat_ptr(arg), &ifr32, sizeof(ifr32));
-        return (err ? -EFAULT : 0);
+        return 0;
 }

 static int dev_ifconf(unsigned int fd, unsigned int cmd, unsigned long arg)
@@ -668,53 +662,6 @@ out:
668#endif 662#endif
669 663
670#ifdef CONFIG_BLOCK 664#ifdef CONFIG_BLOCK
671struct hd_geometry32 {
672 unsigned char heads;
673 unsigned char sectors;
674 unsigned short cylinders;
675 u32 start;
676};
677
678static int hdio_getgeo(unsigned int fd, unsigned int cmd, unsigned long arg)
679{
680 mm_segment_t old_fs = get_fs();
681 struct hd_geometry geo;
682 struct hd_geometry32 __user *ugeo;
683 int err;
684
685 set_fs (KERNEL_DS);
686 err = sys_ioctl(fd, HDIO_GETGEO, (unsigned long)&geo);
687 set_fs (old_fs);
688 ugeo = compat_ptr(arg);
689 if (!err) {
690 err = copy_to_user (ugeo, &geo, 4);
691 err |= __put_user (geo.start, &ugeo->start);
692 if (err)
693 err = -EFAULT;
694 }
695 return err;
696}
697
698static int hdio_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg)
699{
700 mm_segment_t old_fs = get_fs();
701 unsigned long kval;
702 unsigned int __user *uvp;
703 int error;
704
705 set_fs(KERNEL_DS);
706 error = sys_ioctl(fd, cmd, (long)&kval);
707 set_fs(old_fs);
708
709 if(error == 0) {
710 uvp = compat_ptr(arg);
711 if(put_user(kval, uvp))
712 error = -EFAULT;
713 }
714 return error;
715}
716
717
718typedef struct sg_io_hdr32 { 665typedef struct sg_io_hdr32 {
719 compat_int_t interface_id; /* [i] 'S' for SCSI generic (required) */ 666 compat_int_t interface_id; /* [i] 'S' for SCSI generic (required) */
720 compat_int_t dxfer_direction; /* [i] data transfer direction */ 667 compat_int_t dxfer_direction; /* [i] data transfer direction */
@@ -1089,108 +1036,6 @@ static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg)
1089 return err ? -EFAULT: 0; 1036 return err ? -EFAULT: 0;
1090} 1037}
1091 1038
1092struct cdrom_read_audio32 {
1093 union cdrom_addr addr;
1094 u8 addr_format;
1095 compat_int_t nframes;
1096 compat_caddr_t buf;
1097};
1098
1099struct cdrom_generic_command32 {
1100 unsigned char cmd[CDROM_PACKET_SIZE];
1101 compat_caddr_t buffer;
1102 compat_uint_t buflen;
1103 compat_int_t stat;
1104 compat_caddr_t sense;
1105 unsigned char data_direction;
1106 compat_int_t quiet;
1107 compat_int_t timeout;
1108 compat_caddr_t reserved[1];
1109};
1110
1111static int cdrom_do_read_audio(unsigned int fd, unsigned int cmd, unsigned long arg)
1112{
1113 struct cdrom_read_audio __user *cdread_audio;
1114 struct cdrom_read_audio32 __user *cdread_audio32;
1115 __u32 data;
1116 void __user *datap;
1117
1118 cdread_audio = compat_alloc_user_space(sizeof(*cdread_audio));
1119 cdread_audio32 = compat_ptr(arg);
1120
1121 if (copy_in_user(&cdread_audio->addr,
1122 &cdread_audio32->addr,
1123 (sizeof(*cdread_audio32) -
1124 sizeof(compat_caddr_t))))
1125 return -EFAULT;
1126
1127 if (get_user(data, &cdread_audio32->buf))
1128 return -EFAULT;
1129 datap = compat_ptr(data);
1130 if (put_user(datap, &cdread_audio->buf))
1131 return -EFAULT;
1132
1133 return sys_ioctl(fd, cmd, (unsigned long) cdread_audio);
1134}
1135
1136static int cdrom_do_generic_command(unsigned int fd, unsigned int cmd, unsigned long arg)
1137{
1138 struct cdrom_generic_command __user *cgc;
1139 struct cdrom_generic_command32 __user *cgc32;
1140 u32 data;
1141 unsigned char dir;
1142 int itmp;
1143
1144 cgc = compat_alloc_user_space(sizeof(*cgc));
1145 cgc32 = compat_ptr(arg);
1146
1147 if (copy_in_user(&cgc->cmd, &cgc32->cmd, sizeof(cgc->cmd)) ||
1148 get_user(data, &cgc32->buffer) ||
1149 put_user(compat_ptr(data), &cgc->buffer) ||
1150 copy_in_user(&cgc->buflen, &cgc32->buflen,
1151 (sizeof(unsigned int) + sizeof(int))) ||
1152 get_user(data, &cgc32->sense) ||
1153 put_user(compat_ptr(data), &cgc->sense) ||
1154 get_user(dir, &cgc32->data_direction) ||
1155 put_user(dir, &cgc->data_direction) ||
1156 get_user(itmp, &cgc32->quiet) ||
1157 put_user(itmp, &cgc->quiet) ||
1158 get_user(itmp, &cgc32->timeout) ||
1159 put_user(itmp, &cgc->timeout) ||
1160 get_user(data, &cgc32->reserved[0]) ||
1161 put_user(compat_ptr(data), &cgc->reserved[0]))
1162 return -EFAULT;
1163
1164 return sys_ioctl(fd, cmd, (unsigned long) cgc);
1165}
1166
1167static int cdrom_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg)
1168{
1169 int err;
1170
1171 switch(cmd) {
1172 case CDROMREADAUDIO:
1173 err = cdrom_do_read_audio(fd, cmd, arg);
1174 break;
1175
1176 case CDROM_SEND_PACKET:
1177 err = cdrom_do_generic_command(fd, cmd, arg);
1178 break;
1179
1180 default:
1181 do {
1182 static int count;
1183 if (++count <= 20)
1184 printk("cdrom_ioctl: Unknown cmd fd(%d) "
1185 "cmd(%08x) arg(%08x)\n",
1186 (int)fd, (unsigned int)cmd, (unsigned int)arg);
1187 } while(0);
1188 err = -EINVAL;
1189 break;
1190 };
1191
1192 return err;
1193}
1194#endif /* CONFIG_BLOCK */ 1039#endif /* CONFIG_BLOCK */
1195 1040
1196#ifdef CONFIG_VT 1041#ifdef CONFIG_VT
@@ -1536,71 +1381,11 @@ ret_einval(unsigned int fd, unsigned int cmd, unsigned long arg)
1536 return -EINVAL; 1381 return -EINVAL;
1537} 1382}
1538 1383
1539#ifdef CONFIG_BLOCK
1540static int broken_blkgetsize(unsigned int fd, unsigned int cmd, unsigned long arg)
1541{
1542 /* The mkswap binary hard codes it to Intel value :-((( */
1543 return w_long(fd, BLKGETSIZE, arg);
1544}
1545
1546struct blkpg_ioctl_arg32 {
1547 compat_int_t op;
1548 compat_int_t flags;
1549 compat_int_t datalen;
1550 compat_caddr_t data;
1551};
1552
1553static int blkpg_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg)
1554{
1555 struct blkpg_ioctl_arg32 __user *ua32 = compat_ptr(arg);
1556 struct blkpg_ioctl_arg __user *a = compat_alloc_user_space(sizeof(*a));
1557 compat_caddr_t udata;
1558 compat_int_t n;
1559 int err;
1560
1561 err = get_user(n, &ua32->op);
1562 err |= put_user(n, &a->op);
1563 err |= get_user(n, &ua32->flags);
1564 err |= put_user(n, &a->flags);
1565 err |= get_user(n, &ua32->datalen);
1566 err |= put_user(n, &a->datalen);
1567 err |= get_user(udata, &ua32->data);
1568 err |= put_user(compat_ptr(udata), &a->data);
1569 if (err)
1570 return err;
1571
1572 return sys_ioctl(fd, cmd, (unsigned long)a);
1573}
1574#endif
1575
1576static int ioc_settimeout(unsigned int fd, unsigned int cmd, unsigned long arg) 1384static int ioc_settimeout(unsigned int fd, unsigned int cmd, unsigned long arg)
1577{ 1385{
1578 return rw_long(fd, AUTOFS_IOC_SETTIMEOUT, arg); 1386 return rw_long(fd, AUTOFS_IOC_SETTIMEOUT, arg);
1579} 1387}
1580 1388
1581#ifdef CONFIG_BLOCK
1582/* Fix sizeof(sizeof()) breakage */
1583#define BLKBSZGET_32 _IOR(0x12,112,int)
1584#define BLKBSZSET_32 _IOW(0x12,113,int)
1585#define BLKGETSIZE64_32 _IOR(0x12,114,int)
1586
1587static int do_blkbszget(unsigned int fd, unsigned int cmd, unsigned long arg)
1588{
1589 return sys_ioctl(fd, BLKBSZGET, (unsigned long)compat_ptr(arg));
1590}
1591
1592static int do_blkbszset(unsigned int fd, unsigned int cmd, unsigned long arg)
1593{
1594 return sys_ioctl(fd, BLKBSZSET, (unsigned long)compat_ptr(arg));
1595}
1596
1597static int do_blkgetsize64(unsigned int fd, unsigned int cmd,
1598 unsigned long arg)
1599{
1600 return sys_ioctl(fd, BLKGETSIZE64, (unsigned long)compat_ptr(arg));
1601}
1602#endif
1603
1604/* Bluetooth ioctls */ 1389/* Bluetooth ioctls */
1605#define HCIUARTSETPROTO _IOW('U', 200, int) 1390#define HCIUARTSETPROTO _IOW('U', 200, int)
1606#define HCIUARTGETPROTO _IOR('U', 201, int) 1391#define HCIUARTGETPROTO _IOR('U', 201, int)
@@ -1620,333 +1405,6 @@ static int do_blkgetsize64(unsigned int fd, unsigned int cmd,
1620#define HIDPGETCONNLIST _IOR('H', 210, int) 1405#define HIDPGETCONNLIST _IOR('H', 210, int)
1621#define HIDPGETCONNINFO _IOR('H', 211, int) 1406#define HIDPGETCONNINFO _IOR('H', 211, int)
1622 1407
1623#ifdef CONFIG_BLOCK
1624struct floppy_struct32 {
1625 compat_uint_t size;
1626 compat_uint_t sect;
1627 compat_uint_t head;
1628 compat_uint_t track;
1629 compat_uint_t stretch;
1630 unsigned char gap;
1631 unsigned char rate;
1632 unsigned char spec1;
1633 unsigned char fmt_gap;
1634 const compat_caddr_t name;
1635};
1636
1637struct floppy_drive_params32 {
1638 char cmos;
1639 compat_ulong_t max_dtr;
1640 compat_ulong_t hlt;
1641 compat_ulong_t hut;
1642 compat_ulong_t srt;
1643 compat_ulong_t spinup;
1644 compat_ulong_t spindown;
1645 unsigned char spindown_offset;
1646 unsigned char select_delay;
1647 unsigned char rps;
1648 unsigned char tracks;
1649 compat_ulong_t timeout;
1650 unsigned char interleave_sect;
1651 struct floppy_max_errors max_errors;
1652 char flags;
1653 char read_track;
1654 short autodetect[8];
1655 compat_int_t checkfreq;
1656 compat_int_t native_format;
1657};
1658
1659struct floppy_drive_struct32 {
1660 signed char flags;
1661 compat_ulong_t spinup_date;
1662 compat_ulong_t select_date;
1663 compat_ulong_t first_read_date;
1664 short probed_format;
1665 short track;
1666 short maxblock;
1667 short maxtrack;
1668 compat_int_t generation;
1669 compat_int_t keep_data;
1670 compat_int_t fd_ref;
1671 compat_int_t fd_device;
1672 compat_int_t last_checked;
1673 compat_caddr_t dmabuf;
1674 compat_int_t bufblocks;
1675};
1676
1677struct floppy_fdc_state32 {
1678 compat_int_t spec1;
1679 compat_int_t spec2;
1680 compat_int_t dtr;
1681 unsigned char version;
1682 unsigned char dor;
1683 compat_ulong_t address;
1684 unsigned int rawcmd:2;
1685 unsigned int reset:1;
1686 unsigned int need_configure:1;
1687 unsigned int perp_mode:2;
1688 unsigned int has_fifo:1;
1689 unsigned int driver_version;
1690 unsigned char track[4];
1691};
1692
1693struct floppy_write_errors32 {
1694 unsigned int write_errors;
1695 compat_ulong_t first_error_sector;
1696 compat_int_t first_error_generation;
1697 compat_ulong_t last_error_sector;
1698 compat_int_t last_error_generation;
1699 compat_uint_t badness;
1700};
1701
1702#define FDSETPRM32 _IOW(2, 0x42, struct floppy_struct32)
1703#define FDDEFPRM32 _IOW(2, 0x43, struct floppy_struct32)
1704#define FDGETPRM32 _IOR(2, 0x04, struct floppy_struct32)
1705#define FDSETDRVPRM32 _IOW(2, 0x90, struct floppy_drive_params32)
1706#define FDGETDRVPRM32 _IOR(2, 0x11, struct floppy_drive_params32)
1707#define FDGETDRVSTAT32 _IOR(2, 0x12, struct floppy_drive_struct32)
1708#define FDPOLLDRVSTAT32 _IOR(2, 0x13, struct floppy_drive_struct32)
1709#define FDGETFDCSTAT32 _IOR(2, 0x15, struct floppy_fdc_state32)
1710#define FDWERRORGET32 _IOR(2, 0x17, struct floppy_write_errors32)
1711
1712static struct {
1713 unsigned int cmd32;
1714 unsigned int cmd;
1715} fd_ioctl_trans_table[] = {
1716 { FDSETPRM32, FDSETPRM },
1717 { FDDEFPRM32, FDDEFPRM },
1718 { FDGETPRM32, FDGETPRM },
1719 { FDSETDRVPRM32, FDSETDRVPRM },
1720 { FDGETDRVPRM32, FDGETDRVPRM },
1721 { FDGETDRVSTAT32, FDGETDRVSTAT },
1722 { FDPOLLDRVSTAT32, FDPOLLDRVSTAT },
1723 { FDGETFDCSTAT32, FDGETFDCSTAT },
1724 { FDWERRORGET32, FDWERRORGET }
1725};
1726
1727#define NR_FD_IOCTL_TRANS ARRAY_SIZE(fd_ioctl_trans_table)
1728
1729static int fd_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg)
1730{
1731 mm_segment_t old_fs = get_fs();
1732 void *karg = NULL;
1733 unsigned int kcmd = 0;
1734 int i, err;
1735
1736 for (i = 0; i < NR_FD_IOCTL_TRANS; i++)
1737 if (cmd == fd_ioctl_trans_table[i].cmd32) {
1738 kcmd = fd_ioctl_trans_table[i].cmd;
1739 break;
1740 }
1741 if (!kcmd)
1742 return -EINVAL;
1743
1744 switch (cmd) {
1745 case FDSETPRM32:
1746 case FDDEFPRM32:
1747 case FDGETPRM32:
1748 {
1749 compat_uptr_t name;
1750 struct floppy_struct32 __user *uf;
1751 struct floppy_struct *f;
1752
1753 uf = compat_ptr(arg);
1754 f = karg = kmalloc(sizeof(struct floppy_struct), GFP_KERNEL);
1755 if (!karg)
1756 return -ENOMEM;
1757 if (cmd == FDGETPRM32)
1758 break;
1759 err = __get_user(f->size, &uf->size);
1760 err |= __get_user(f->sect, &uf->sect);
1761 err |= __get_user(f->head, &uf->head);
1762 err |= __get_user(f->track, &uf->track);
1763 err |= __get_user(f->stretch, &uf->stretch);
1764 err |= __get_user(f->gap, &uf->gap);
1765 err |= __get_user(f->rate, &uf->rate);
1766 err |= __get_user(f->spec1, &uf->spec1);
1767 err |= __get_user(f->fmt_gap, &uf->fmt_gap);
1768 err |= __get_user(name, &uf->name);
1769 f->name = compat_ptr(name);
1770 if (err) {
1771 err = -EFAULT;
1772 goto out;
1773 }
1774 break;
1775 }
1776 case FDSETDRVPRM32:
1777 case FDGETDRVPRM32:
1778 {
1779 struct floppy_drive_params32 __user *uf;
1780 struct floppy_drive_params *f;
1781
1782 uf = compat_ptr(arg);
1783 f = karg = kmalloc(sizeof(struct floppy_drive_params), GFP_KERNEL);
1784 if (!karg)
1785 return -ENOMEM;
1786 if (cmd == FDGETDRVPRM32)
1787 break;
1788 err = __get_user(f->cmos, &uf->cmos);
1789 err |= __get_user(f->max_dtr, &uf->max_dtr);
1790 err |= __get_user(f->hlt, &uf->hlt);
1791 err |= __get_user(f->hut, &uf->hut);
1792 err |= __get_user(f->srt, &uf->srt);
1793 err |= __get_user(f->spinup, &uf->spinup);
1794 err |= __get_user(f->spindown, &uf->spindown);
1795 err |= __get_user(f->spindown_offset, &uf->spindown_offset);
1796 err |= __get_user(f->select_delay, &uf->select_delay);
1797 err |= __get_user(f->rps, &uf->rps);
1798 err |= __get_user(f->tracks, &uf->tracks);
1799 err |= __get_user(f->timeout, &uf->timeout);
1800 err |= __get_user(f->interleave_sect, &uf->interleave_sect);
1801 err |= __copy_from_user(&f->max_errors, &uf->max_errors, sizeof(f->max_errors));
1802 err |= __get_user(f->flags, &uf->flags);
1803 err |= __get_user(f->read_track, &uf->read_track);
1804 err |= __copy_from_user(f->autodetect, uf->autodetect, sizeof(f->autodetect));
1805 err |= __get_user(f->checkfreq, &uf->checkfreq);
1806 err |= __get_user(f->native_format, &uf->native_format);
1807 if (err) {
1808 err = -EFAULT;
1809 goto out;
1810 }
1811 break;
1812 }
1813 case FDGETDRVSTAT32:
1814 case FDPOLLDRVSTAT32:
1815 karg = kmalloc(sizeof(struct floppy_drive_struct), GFP_KERNEL);
1816 if (!karg)
1817 return -ENOMEM;
1818 break;
1819 case FDGETFDCSTAT32:
1820 karg = kmalloc(sizeof(struct floppy_fdc_state), GFP_KERNEL);
1821 if (!karg)
1822 return -ENOMEM;
1823 break;
1824 case FDWERRORGET32:
1825 karg = kmalloc(sizeof(struct floppy_write_errors), GFP_KERNEL);
1826 if (!karg)
1827 return -ENOMEM;
1828 break;
1829 default:
1830 return -EINVAL;
1831 }
1832 set_fs (KERNEL_DS);
1833 err = sys_ioctl (fd, kcmd, (unsigned long)karg);
1834 set_fs (old_fs);
1835 if (err)
1836 goto out;
1837 switch (cmd) {
1838 case FDGETPRM32:
1839 {
1840 struct floppy_struct *f = karg;
1841 struct floppy_struct32 __user *uf = compat_ptr(arg);
1842
1843 err = __put_user(f->size, &uf->size);
1844 err |= __put_user(f->sect, &uf->sect);
1845 err |= __put_user(f->head, &uf->head);
1846 err |= __put_user(f->track, &uf->track);
1847 err |= __put_user(f->stretch, &uf->stretch);
1848 err |= __put_user(f->gap, &uf->gap);
1849 err |= __put_user(f->rate, &uf->rate);
1850 err |= __put_user(f->spec1, &uf->spec1);
1851 err |= __put_user(f->fmt_gap, &uf->fmt_gap);
1852 err |= __put_user((u64)f->name, (compat_caddr_t __user *)&uf->name);
1853 break;
1854 }
1855 case FDGETDRVPRM32:
1856 {
1857 struct floppy_drive_params32 __user *uf;
1858 struct floppy_drive_params *f = karg;
1859
1860 uf = compat_ptr(arg);
1861 err = __put_user(f->cmos, &uf->cmos);
1862 err |= __put_user(f->max_dtr, &uf->max_dtr);
1863 err |= __put_user(f->hlt, &uf->hlt);
1864 err |= __put_user(f->hut, &uf->hut);
1865 err |= __put_user(f->srt, &uf->srt);
1866 err |= __put_user(f->spinup, &uf->spinup);
1867 err |= __put_user(f->spindown, &uf->spindown);
1868 err |= __put_user(f->spindown_offset, &uf->spindown_offset);
1869 err |= __put_user(f->select_delay, &uf->select_delay);
1870 err |= __put_user(f->rps, &uf->rps);
1871 err |= __put_user(f->tracks, &uf->tracks);
1872 err |= __put_user(f->timeout, &uf->timeout);
1873 err |= __put_user(f->interleave_sect, &uf->interleave_sect);
1874 err |= __copy_to_user(&uf->max_errors, &f->max_errors, sizeof(f->max_errors));
1875 err |= __put_user(f->flags, &uf->flags);
1876 err |= __put_user(f->read_track, &uf->read_track);
1877 err |= __copy_to_user(uf->autodetect, f->autodetect, sizeof(f->autodetect));
1878 err |= __put_user(f->checkfreq, &uf->checkfreq);
1879 err |= __put_user(f->native_format, &uf->native_format);
1880 break;
1881 }
1882 case FDGETDRVSTAT32:
1883 case FDPOLLDRVSTAT32:
1884 {
1885 struct floppy_drive_struct32 __user *uf;
1886 struct floppy_drive_struct *f = karg;
1887
1888 uf = compat_ptr(arg);
1889 err = __put_user(f->flags, &uf->flags);
1890 err |= __put_user(f->spinup_date, &uf->spinup_date);
1891 err |= __put_user(f->select_date, &uf->select_date);
1892 err |= __put_user(f->first_read_date, &uf->first_read_date);
1893 err |= __put_user(f->probed_format, &uf->probed_format);
1894 err |= __put_user(f->track, &uf->track);
1895 err |= __put_user(f->maxblock, &uf->maxblock);
1896 err |= __put_user(f->maxtrack, &uf->maxtrack);
1897 err |= __put_user(f->generation, &uf->generation);
1898 err |= __put_user(f->keep_data, &uf->keep_data);
1899 err |= __put_user(f->fd_ref, &uf->fd_ref);
1900 err |= __put_user(f->fd_device, &uf->fd_device);
1901 err |= __put_user(f->last_checked, &uf->last_checked);
1902 err |= __put_user((u64)f->dmabuf, &uf->dmabuf);
1903 err |= __put_user((u64)f->bufblocks, &uf->bufblocks);
1904 break;
1905 }
1906 case FDGETFDCSTAT32:
1907 {
1908 struct floppy_fdc_state32 __user *uf;
1909 struct floppy_fdc_state *f = karg;
1910
1911 uf = compat_ptr(arg);
1912 err = __put_user(f->spec1, &uf->spec1);
1913 err |= __put_user(f->spec2, &uf->spec2);
1914 err |= __put_user(f->dtr, &uf->dtr);
1915 err |= __put_user(f->version, &uf->version);
1916 err |= __put_user(f->dor, &uf->dor);
1917 err |= __put_user(f->address, &uf->address);
1918 err |= __copy_to_user((char __user *)&uf->address + sizeof(uf->address),
1919 (char *)&f->address + sizeof(f->address), sizeof(int));
1920 err |= __put_user(f->driver_version, &uf->driver_version);
1921 err |= __copy_to_user(uf->track, f->track, sizeof(f->track));
1922 break;
1923 }
1924 case FDWERRORGET32:
1925 {
1926 struct floppy_write_errors32 __user *uf;
1927 struct floppy_write_errors *f = karg;
1928
1929 uf = compat_ptr(arg);
1930 err = __put_user(f->write_errors, &uf->write_errors);
1931 err |= __put_user(f->first_error_sector, &uf->first_error_sector);
1932 err |= __put_user(f->first_error_generation, &uf->first_error_generation);
1933 err |= __put_user(f->last_error_sector, &uf->last_error_sector);
1934 err |= __put_user(f->last_error_generation, &uf->last_error_generation);
1935 err |= __put_user(f->badness, &uf->badness);
1936 break;
1937 }
1938 default:
1939 break;
1940 }
1941 if (err)
1942 err = -EFAULT;
1943
1944out:
1945 kfree(karg);
1946 return err;
1947}
1948#endif
1949
1950struct mtd_oob_buf32 { 1408struct mtd_oob_buf32 {
1951 u_int32_t start; 1409 u_int32_t start;
1952 u_int32_t length; 1410 u_int32_t length;
@@ -2506,60 +1964,6 @@ COMPATIBLE_IOCTL(FIONREAD) /* This is also TIOCINQ */
2506/* 0x00 */ 1964/* 0x00 */
2507COMPATIBLE_IOCTL(FIBMAP) 1965COMPATIBLE_IOCTL(FIBMAP)
2508COMPATIBLE_IOCTL(FIGETBSZ) 1966COMPATIBLE_IOCTL(FIGETBSZ)
2509/* 0x03 -- HD/IDE ioctl's used by hdparm and friends.
2510 * Some need translations, these do not.
2511 */
2512COMPATIBLE_IOCTL(HDIO_GET_IDENTITY)
2513COMPATIBLE_IOCTL(HDIO_DRIVE_TASK)
2514COMPATIBLE_IOCTL(HDIO_DRIVE_CMD)
2515ULONG_IOCTL(HDIO_SET_MULTCOUNT)
2516ULONG_IOCTL(HDIO_SET_UNMASKINTR)
2517ULONG_IOCTL(HDIO_SET_KEEPSETTINGS)
2518ULONG_IOCTL(HDIO_SET_32BIT)
2519ULONG_IOCTL(HDIO_SET_NOWERR)
2520ULONG_IOCTL(HDIO_SET_DMA)
2521ULONG_IOCTL(HDIO_SET_PIO_MODE)
2522ULONG_IOCTL(HDIO_SET_NICE)
2523ULONG_IOCTL(HDIO_SET_WCACHE)
2524ULONG_IOCTL(HDIO_SET_ACOUSTIC)
2525ULONG_IOCTL(HDIO_SET_BUSSTATE)
2526ULONG_IOCTL(HDIO_SET_ADDRESS)
2527COMPATIBLE_IOCTL(HDIO_SCAN_HWIF)
2528/* 0x330 is reserved -- it used to be HDIO_GETGEO_BIG */
2529COMPATIBLE_IOCTL(0x330)
2530/* 0x02 -- Floppy ioctls */
2531COMPATIBLE_IOCTL(FDMSGON)
2532COMPATIBLE_IOCTL(FDMSGOFF)
2533COMPATIBLE_IOCTL(FDSETEMSGTRESH)
2534COMPATIBLE_IOCTL(FDFLUSH)
2535COMPATIBLE_IOCTL(FDWERRORCLR)
2536COMPATIBLE_IOCTL(FDSETMAXERRS)
2537COMPATIBLE_IOCTL(FDGETMAXERRS)
2538COMPATIBLE_IOCTL(FDGETDRVTYP)
2539COMPATIBLE_IOCTL(FDEJECT)
2540COMPATIBLE_IOCTL(FDCLRPRM)
2541COMPATIBLE_IOCTL(FDFMTBEG)
2542COMPATIBLE_IOCTL(FDFMTEND)
2543COMPATIBLE_IOCTL(FDRESET)
2544COMPATIBLE_IOCTL(FDTWADDLE)
2545COMPATIBLE_IOCTL(FDFMTTRK)
2546COMPATIBLE_IOCTL(FDRAWCMD)
2547/* 0x12 */
2548#ifdef CONFIG_BLOCK
2549COMPATIBLE_IOCTL(BLKRASET)
2550COMPATIBLE_IOCTL(BLKROSET)
2551COMPATIBLE_IOCTL(BLKROGET)
2552COMPATIBLE_IOCTL(BLKRRPART)
2553COMPATIBLE_IOCTL(BLKFLSBUF)
2554COMPATIBLE_IOCTL(BLKSECTSET)
2555COMPATIBLE_IOCTL(BLKSSZGET)
2556COMPATIBLE_IOCTL(BLKTRACESTART)
2557COMPATIBLE_IOCTL(BLKTRACESTOP)
2558COMPATIBLE_IOCTL(BLKTRACESETUP)
2559COMPATIBLE_IOCTL(BLKTRACETEARDOWN)
2560ULONG_IOCTL(BLKRASET)
2561ULONG_IOCTL(BLKFRASET)
2562#endif
2563/* RAID */ 1967/* RAID */
2564COMPATIBLE_IOCTL(RAID_VERSION) 1968COMPATIBLE_IOCTL(RAID_VERSION)
2565COMPATIBLE_IOCTL(GET_ARRAY_INFO) 1969COMPATIBLE_IOCTL(GET_ARRAY_INFO)
@@ -2807,50 +2211,6 @@ COMPATIBLE_IOCTL(PPGETMODE)
2807COMPATIBLE_IOCTL(PPGETPHASE) 2211COMPATIBLE_IOCTL(PPGETPHASE)
2808COMPATIBLE_IOCTL(PPGETFLAGS) 2212COMPATIBLE_IOCTL(PPGETFLAGS)
2809COMPATIBLE_IOCTL(PPSETFLAGS) 2213COMPATIBLE_IOCTL(PPSETFLAGS)
2810/* CDROM stuff */
2811COMPATIBLE_IOCTL(CDROMPAUSE)
2812COMPATIBLE_IOCTL(CDROMRESUME)
2813COMPATIBLE_IOCTL(CDROMPLAYMSF)
2814COMPATIBLE_IOCTL(CDROMPLAYTRKIND)
2815COMPATIBLE_IOCTL(CDROMREADTOCHDR)
2816COMPATIBLE_IOCTL(CDROMREADTOCENTRY)
2817COMPATIBLE_IOCTL(CDROMSTOP)
2818COMPATIBLE_IOCTL(CDROMSTART)
2819COMPATIBLE_IOCTL(CDROMEJECT)
2820COMPATIBLE_IOCTL(CDROMVOLCTRL)
2821COMPATIBLE_IOCTL(CDROMSUBCHNL)
2822ULONG_IOCTL(CDROMEJECT_SW)
2823COMPATIBLE_IOCTL(CDROMMULTISESSION)
2824COMPATIBLE_IOCTL(CDROM_GET_MCN)
2825COMPATIBLE_IOCTL(CDROMRESET)
2826COMPATIBLE_IOCTL(CDROMVOLREAD)
2827COMPATIBLE_IOCTL(CDROMSEEK)
2828COMPATIBLE_IOCTL(CDROMPLAYBLK)
2829COMPATIBLE_IOCTL(CDROMCLOSETRAY)
2830ULONG_IOCTL(CDROM_SET_OPTIONS)
2831ULONG_IOCTL(CDROM_CLEAR_OPTIONS)
2832ULONG_IOCTL(CDROM_SELECT_SPEED)
2833ULONG_IOCTL(CDROM_SELECT_DISC)
2834ULONG_IOCTL(CDROM_MEDIA_CHANGED)
2835ULONG_IOCTL(CDROM_DRIVE_STATUS)
2836COMPATIBLE_IOCTL(CDROM_DISC_STATUS)
2837COMPATIBLE_IOCTL(CDROM_CHANGER_NSLOTS)
2838ULONG_IOCTL(CDROM_LOCKDOOR)
2839ULONG_IOCTL(CDROM_DEBUG)
2840COMPATIBLE_IOCTL(CDROM_GET_CAPABILITY)
2841/* Ignore cdrom.h about these next 5 ioctls, they absolutely do
2842 * not take a struct cdrom_read, instead they take a struct cdrom_msf
2843 * which is compatible.
2844 */
2845COMPATIBLE_IOCTL(CDROMREADMODE2)
2846COMPATIBLE_IOCTL(CDROMREADMODE1)
2847COMPATIBLE_IOCTL(CDROMREADRAW)
2848COMPATIBLE_IOCTL(CDROMREADCOOKED)
2849COMPATIBLE_IOCTL(CDROMREADALL)
2850/* DVD ioctls */
2851COMPATIBLE_IOCTL(DVD_READ_STRUCT)
2852COMPATIBLE_IOCTL(DVD_WRITE_STRUCT)
2853COMPATIBLE_IOCTL(DVD_AUTH)
2854/* pktcdvd */ 2214/* pktcdvd */
2855COMPATIBLE_IOCTL(PACKET_CTRL_CMD) 2215COMPATIBLE_IOCTL(PACKET_CTRL_CMD)
2856/* Big A */ 2216/* Big A */
@@ -3336,33 +2696,6 @@ HANDLE_IOCTL(SIOCGSTAMP, do_siocgstamp)
3336HANDLE_IOCTL(SIOCGSTAMPNS, do_siocgstampns) 2696HANDLE_IOCTL(SIOCGSTAMPNS, do_siocgstampns)
3337#endif 2697#endif
3338#ifdef CONFIG_BLOCK 2698#ifdef CONFIG_BLOCK
3339HANDLE_IOCTL(HDIO_GETGEO, hdio_getgeo)
3340HANDLE_IOCTL(BLKRAGET, w_long)
3341HANDLE_IOCTL(BLKGETSIZE, w_long)
3342HANDLE_IOCTL(0x1260, broken_blkgetsize)
3343HANDLE_IOCTL(BLKFRAGET, w_long)
3344HANDLE_IOCTL(BLKSECTGET, w_long)
3345HANDLE_IOCTL(BLKPG, blkpg_ioctl_trans)
3346HANDLE_IOCTL(HDIO_GET_UNMASKINTR, hdio_ioctl_trans)
3347HANDLE_IOCTL(HDIO_GET_MULTCOUNT, hdio_ioctl_trans)
3348HANDLE_IOCTL(HDIO_GET_KEEPSETTINGS, hdio_ioctl_trans)
3349HANDLE_IOCTL(HDIO_GET_32BIT, hdio_ioctl_trans)
3350HANDLE_IOCTL(HDIO_GET_NOWERR, hdio_ioctl_trans)
3351HANDLE_IOCTL(HDIO_GET_DMA, hdio_ioctl_trans)
3352HANDLE_IOCTL(HDIO_GET_NICE, hdio_ioctl_trans)
3353HANDLE_IOCTL(HDIO_GET_WCACHE, hdio_ioctl_trans)
3354HANDLE_IOCTL(HDIO_GET_ACOUSTIC, hdio_ioctl_trans)
3355HANDLE_IOCTL(HDIO_GET_ADDRESS, hdio_ioctl_trans)
3356HANDLE_IOCTL(HDIO_GET_BUSSTATE, hdio_ioctl_trans)
3357HANDLE_IOCTL(FDSETPRM32, fd_ioctl_trans)
3358HANDLE_IOCTL(FDDEFPRM32, fd_ioctl_trans)
3359HANDLE_IOCTL(FDGETPRM32, fd_ioctl_trans)
3360HANDLE_IOCTL(FDSETDRVPRM32, fd_ioctl_trans)
3361HANDLE_IOCTL(FDGETDRVPRM32, fd_ioctl_trans)
3362HANDLE_IOCTL(FDGETDRVSTAT32, fd_ioctl_trans)
3363HANDLE_IOCTL(FDPOLLDRVSTAT32, fd_ioctl_trans)
3364HANDLE_IOCTL(FDGETFDCSTAT32, fd_ioctl_trans)
3365HANDLE_IOCTL(FDWERRORGET32, fd_ioctl_trans)
3366HANDLE_IOCTL(SG_IO,sg_ioctl_trans) 2699HANDLE_IOCTL(SG_IO,sg_ioctl_trans)
3367HANDLE_IOCTL(SG_GET_REQUEST_TABLE, sg_grt_trans) 2700HANDLE_IOCTL(SG_GET_REQUEST_TABLE, sg_grt_trans)
3368#endif 2701#endif
@@ -3373,8 +2706,6 @@ HANDLE_IOCTL(PPPIOCSACTIVE32, ppp_sock_fprog_ioctl_trans)
3373#ifdef CONFIG_BLOCK 2706#ifdef CONFIG_BLOCK
3374HANDLE_IOCTL(MTIOCGET32, mt_ioctl_trans) 2707HANDLE_IOCTL(MTIOCGET32, mt_ioctl_trans)
3375HANDLE_IOCTL(MTIOCPOS32, mt_ioctl_trans) 2708HANDLE_IOCTL(MTIOCPOS32, mt_ioctl_trans)
3376HANDLE_IOCTL(CDROMREADAUDIO, cdrom_ioctl_trans)
3377HANDLE_IOCTL(CDROM_SEND_PACKET, cdrom_ioctl_trans)
3378#endif 2709#endif
3379#define AUTOFS_IOC_SETTIMEOUT32 _IOWR(0x93,0x64,unsigned int) 2710#define AUTOFS_IOC_SETTIMEOUT32 _IOWR(0x93,0x64,unsigned int)
3380HANDLE_IOCTL(AUTOFS_IOC_SETTIMEOUT32, ioc_settimeout) 2711HANDLE_IOCTL(AUTOFS_IOC_SETTIMEOUT32, ioc_settimeout)
@@ -3415,9 +2746,6 @@ HANDLE_IOCTL(SONET_GETFRAMING, do_atm_ioctl)
3415HANDLE_IOCTL(SONET_GETFRSENSE, do_atm_ioctl) 2746HANDLE_IOCTL(SONET_GETFRSENSE, do_atm_ioctl)
3416/* block stuff */ 2747/* block stuff */
3417#ifdef CONFIG_BLOCK 2748#ifdef CONFIG_BLOCK
3418HANDLE_IOCTL(BLKBSZGET_32, do_blkbszget)
3419HANDLE_IOCTL(BLKBSZSET_32, do_blkbszset)
3420HANDLE_IOCTL(BLKGETSIZE64_32, do_blkgetsize64)
3421/* Raw devices */ 2749/* Raw devices */
3422HANDLE_IOCTL(RAW_SETBIND, raw_ioctl) 2750HANDLE_IOCTL(RAW_SETBIND, raw_ioctl)
3423HANDLE_IOCTL(RAW_GETBIND, raw_ioctl) 2751HANDLE_IOCTL(RAW_GETBIND, raw_ioctl)
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 2e124e0075c5..a9b99c0dc2e7 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -221,6 +221,42 @@ struct dentry *debugfs_create_u64(const char *name, mode_t mode,
 }
 EXPORT_SYMBOL_GPL(debugfs_create_u64);

+DEFINE_SIMPLE_ATTRIBUTE(fops_x8, debugfs_u8_get, debugfs_u8_set, "0x%02llx\n");
+
+DEFINE_SIMPLE_ATTRIBUTE(fops_x16, debugfs_u16_get, debugfs_u16_set, "0x%04llx\n");
+
+DEFINE_SIMPLE_ATTRIBUTE(fops_x32, debugfs_u32_get, debugfs_u32_set, "0x%08llx\n");
+
+/**
+ * debugfs_create_x8 - create a debugfs file that is used to read and write an unsigned 8-bit value
+ * debugfs_create_x16 - create a debugfs file that is used to read and write an unsigned 16-bit value
+ * debugfs_create_x32 - create a debugfs file that is used to read and write an unsigned 32-bit value
+ *
+ * These functions are exactly the same as the above functions, (but use a hex
+ * output for the decimal challenged) for details look at the above unsigned
+ * decimal functions.
+ */
+struct dentry *debugfs_create_x8(const char *name, mode_t mode,
+                                 struct dentry *parent, u8 *value)
+{
+        return debugfs_create_file(name, mode, parent, value, &fops_x8);
+}
+EXPORT_SYMBOL_GPL(debugfs_create_x8);
+
+struct dentry *debugfs_create_x16(const char *name, mode_t mode,
+                                 struct dentry *parent, u16 *value)
+{
+        return debugfs_create_file(name, mode, parent, value, &fops_x16);
+}
+EXPORT_SYMBOL_GPL(debugfs_create_x16);
+
+struct dentry *debugfs_create_x32(const char *name, mode_t mode,
+                                 struct dentry *parent, u32 *value)
+{
+        return debugfs_create_file(name, mode, parent, value, &fops_x32);
+}
+EXPORT_SYMBOL_GPL(debugfs_create_x32);
+
 static ssize_t read_file_bool(struct file *file, char __user *user_buf,
                               size_t count, loff_t *ppos)
 {
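[Editorial sketch, not part of the merge: the hunk above adds debugfs_create_x8/x16/x32, which behave like the decimal helpers but print hex. A short usage illustration with made-up module/variable names; only the debugfs_create_x32 and debugfs_create_dir calls are taken from the kernel API.]

/* Illustrative only: expose a hypothetical u32 as a hex debugfs file. */
static u32 example_status;
static struct dentry *example_dir;

static int __init example_debugfs_init(void)
{
        example_dir = debugfs_create_dir("example", NULL);
        if (!example_dir)
                return -ENOMEM;

        /* Reads back in "0x%08llx" form, e.g. 0x000000ff. */
        debugfs_create_x32("status", 0644, example_dir, &example_status);
        return 0;
}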
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 901dc55e9f54..b5928a7b6a5a 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -264,15 +264,12 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio);
 /*
  * Asynchronous IO callback.
  */
-static int dio_bio_end_aio(struct bio *bio, unsigned int bytes_done, int error)
+static void dio_bio_end_aio(struct bio *bio, int error)
 {
         struct dio *dio = bio->bi_private;
         unsigned long remaining;
         unsigned long flags;

-        if (bio->bi_size)
-                return 1;
-
         /* cleanup the bio */
         dio_bio_complete(dio, bio);

@@ -287,8 +284,6 @@ static int dio_bio_end_aio(struct bio *bio, unsigned int bytes_done, int error)
                 aio_complete(dio->iocb, ret, 0);
                 kfree(dio);
         }
-
-        return 0;
 }

 /*
@@ -298,21 +293,17 @@ static int dio_bio_end_aio(struct bio *bio, unsigned int bytes_done, int error)
  * During I/O bi_private points at the dio. After I/O, bi_private is used to
  * implement a singly-linked list of completed BIOs, at dio->bio_list.
  */
-static int dio_bio_end_io(struct bio *bio, unsigned int bytes_done, int error)
+static void dio_bio_end_io(struct bio *bio, int error)
 {
         struct dio *dio = bio->bi_private;
         unsigned long flags;

-        if (bio->bi_size)
-                return 1;
-
         spin_lock_irqsave(&dio->bio_lock, flags);
         bio->bi_private = dio->bio_list;
         dio->bio_list = bio;
         if (--dio->refcount == 1 && dio->waiter)
                 wake_up_process(dio->waiter);
         spin_unlock_irqrestore(&dio->bio_lock, flags);
-        return 0;
 }

 static int
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 74901e981e10..d2fc2384c3be 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -491,6 +491,7 @@ struct dlm_ls {
         uint64_t                ls_recover_seq;
         struct dlm_recover      *ls_recover_args;
         struct rw_semaphore     ls_in_recovery;   /* block local requests */
+        struct rw_semaphore     ls_recv_active;   /* block dlm_recv */
         struct list_head        ls_requestqueue;  /* queue remote requests */
         struct mutex            ls_requestqueue_mutex;
         char                    *ls_recover_buf;
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 2082daf083d8..3915b8e14146 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -3638,55 +3638,8 @@ static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms)
         dlm_put_lkb(lkb);
 }

-int dlm_receive_message(struct dlm_header *hd, int nodeid, int recovery)
+static void _receive_message(struct dlm_ls *ls, struct dlm_message *ms)
 {
-        struct dlm_message *ms = (struct dlm_message *) hd;
-        struct dlm_ls *ls;
-        int error = 0;
-
-        if (!recovery)
-                dlm_message_in(ms);
-
-        ls = dlm_find_lockspace_global(hd->h_lockspace);
-        if (!ls) {
-                log_print("drop message %d from %d for unknown lockspace %d",
-                          ms->m_type, nodeid, hd->h_lockspace);
-                return -EINVAL;
-        }
-
-        /* recovery may have just ended leaving a bunch of backed-up requests
-           in the requestqueue; wait while dlm_recoverd clears them */
-
-        if (!recovery)
-                dlm_wait_requestqueue(ls);
-
-        /* recovery may have just started while there were a bunch of
-           in-flight requests -- save them in requestqueue to be processed
-           after recovery.  we can't let dlm_recvd block on the recovery
-           lock.  if dlm_recoverd is calling this function to clear the
-           requestqueue, it needs to be interrupted (-EINTR) if another
-           recovery operation is starting. */
-
-        while (1) {
-                if (dlm_locking_stopped(ls)) {
-                        if (recovery) {
-                                error = -EINTR;
-                                goto out;
-                        }
-                        error = dlm_add_requestqueue(ls, nodeid, hd);
-                        if (error == -EAGAIN)
-                                continue;
-                        else {
-                                error = -EINTR;
-                                goto out;
-                        }
-                }
-
-                if (dlm_lock_recovery_try(ls))
-                        break;
-                schedule();
-        }
-
         switch (ms->m_type) {

         /* messages sent to a master node */
@@ -3761,17 +3714,90 @@ int dlm_receive_message(struct dlm_header *hd, int nodeid, int recovery)
                 log_error(ls, "unknown message type %d", ms->m_type);
         }

-        dlm_unlock_recovery(ls);
- out:
-        dlm_put_lockspace(ls);
         dlm_astd_wake();
-        return error;
 }

+/* If the lockspace is in recovery mode (locking stopped), then normal
+   messages are saved on the requestqueue for processing after recovery is
+   done.  When not in recovery mode, we wait for dlm_recoverd to drain saved
+   messages off the requestqueue before we process new ones. This occurs right
+   after recovery completes when we transition from saving all messages on
+   requestqueue, to processing all the saved messages, to processing new
+   messages as they arrive. */

-/*
- * Recovery related
- */
+static void dlm_receive_message(struct dlm_ls *ls, struct dlm_message *ms,
+                                int nodeid)
+{
+        if (dlm_locking_stopped(ls)) {
+                dlm_add_requestqueue(ls, nodeid, (struct dlm_header *) ms);
+        } else {
+                dlm_wait_requestqueue(ls);
+                _receive_message(ls, ms);
+        }
+}
+
+/* This is called by dlm_recoverd to process messages that were saved on
+   the requestqueue. */
+
+void dlm_receive_message_saved(struct dlm_ls *ls, struct dlm_message *ms)
+{
+        _receive_message(ls, ms);
+}
+
+/* This is called by the midcomms layer when something is received for
+   the lockspace.  It could be either a MSG (normal message sent as part of
+   standard locking activity) or an RCOM (recovery message sent as part of
+   lockspace recovery). */
+
+void dlm_receive_buffer(struct dlm_header *hd, int nodeid)
+{
+        struct dlm_message *ms = (struct dlm_message *) hd;
+        struct dlm_rcom *rc = (struct dlm_rcom *) hd;
+        struct dlm_ls *ls;
+        int type = 0;
+
+        switch (hd->h_cmd) {
+        case DLM_MSG:
+                dlm_message_in(ms);
+                type = ms->m_type;
+                break;
+        case DLM_RCOM:
+                dlm_rcom_in(rc);
+                type = rc->rc_type;
+                break;
+        default:
+                log_print("invalid h_cmd %d from %u", hd->h_cmd, nodeid);
+                return;
+        }
+
+        if (hd->h_nodeid != nodeid) {
+                log_print("invalid h_nodeid %d from %d lockspace %x",
+                          hd->h_nodeid, nodeid, hd->h_lockspace);
+                return;
+        }
+
+        ls = dlm_find_lockspace_global(hd->h_lockspace);
+        if (!ls) {
+                log_print("invalid h_lockspace %x from %d cmd %d type %d",
+                          hd->h_lockspace, nodeid, hd->h_cmd, type);
+
+                if (hd->h_cmd == DLM_RCOM && type == DLM_RCOM_STATUS)
+                        dlm_send_ls_not_ready(nodeid, rc);
+                return;
+        }
+
+        /* this rwsem allows dlm_ls_stop() to wait for all dlm_recv threads to
+           be inactive (in this ls) before transitioning to recovery mode */
+
+        down_read(&ls->ls_recv_active);
+        if (hd->h_cmd == DLM_MSG)
+                dlm_receive_message(ls, ms, nodeid);
+        else
+                dlm_receive_rcom(ls, rc, nodeid);
+        up_read(&ls->ls_recv_active);
+
+        dlm_put_lockspace(ls);
+}

 static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb)
 {
@@ -4429,7 +4455,8 @@ int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,

         if (lvb_in && ua->lksb.sb_lvbptr)
                 memcpy(ua->lksb.sb_lvbptr, lvb_in, DLM_USER_LVB_LEN);
-        ua->castparam = ua_tmp->castparam;
+        if (ua_tmp->castparam)
+                ua->castparam = ua_tmp->castparam;
         ua->user_lksb = ua_tmp->user_lksb;

         error = set_unlock_args(flags, ua, &args);
@@ -4474,7 +4501,8 @@ int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
                 goto out;

         ua = (struct dlm_user_args *)lkb->lkb_astparam;
-        ua->castparam = ua_tmp->castparam;
+        if (ua_tmp->castparam)
+                ua->castparam = ua_tmp->castparam;
         ua->user_lksb = ua_tmp->user_lksb;

         error = set_unlock_args(flags, ua, &args);
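[Editorial sketch, not part of the merge: the new ls_recv_active rwsem introduced in the dlm hunks pairs a read-side hold in the receive path with a write-side hold in dlm_ls_stop(). A stripped-down illustration of that pattern follows; the function names other than the rwsem calls are hypothetical, not the literal dlm code.]

/* Sketch of the ls_recv_active locking pattern shown in this series. */
static void sketch_receive(struct dlm_ls *ls, struct dlm_header *hd, int nodeid)
{
        down_read(&ls->ls_recv_active);   /* many receivers may run at once */
        /* ... dispatch DLM_MSG / DLM_RCOM as dlm_receive_buffer() does ... */
        up_read(&ls->ls_recv_active);
}

static void sketch_stop(struct dlm_ls *ls)
{
        down_write(&ls->ls_recv_active);  /* wait for every receiver to leave */
        /* ... clear RUNNING so new messages land on the requestqueue ... */
        up_write(&ls->ls_recv_active);
}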
diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h
index 1720313c22df..ada04680a1e5 100644
--- a/fs/dlm/lock.h
+++ b/fs/dlm/lock.h
@@ -16,7 +16,8 @@
 void dlm_print_rsb(struct dlm_rsb *r);
 void dlm_dump_rsb(struct dlm_rsb *r);
 void dlm_print_lkb(struct dlm_lkb *lkb);
-int dlm_receive_message(struct dlm_header *hd, int nodeid, int recovery);
+void dlm_receive_message_saved(struct dlm_ls *ls, struct dlm_message *ms);
+void dlm_receive_buffer(struct dlm_header *hd, int nodeid);
 int dlm_modes_compat(int mode1, int mode2);
 int dlm_find_rsb(struct dlm_ls *ls, char *name, int namelen,
         unsigned int flags, struct dlm_rsb **r_ret);
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 1dc72105ab12..6353a8384520 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -167,7 +167,6 @@ static struct kobj_type dlm_ktype = {
 };

 static struct kset dlm_kset = {
-        .kobj   = {.name = "dlm",},
         .ktype  = &dlm_ktype,
 };

@@ -228,6 +227,7 @@ int dlm_lockspace_init(void)
         INIT_LIST_HEAD(&lslist);
         spin_lock_init(&lslist_lock);

+        kobject_set_name(&dlm_kset.kobj, "dlm");
         kobj_set_kset_s(&dlm_kset, kernel_subsys);
         error = kset_register(&dlm_kset);
         if (error)
@@ -519,6 +519,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
         ls->ls_recover_seq = 0;
         ls->ls_recover_args = NULL;
         init_rwsem(&ls->ls_in_recovery);
+        init_rwsem(&ls->ls_recv_active);
         INIT_LIST_HEAD(&ls->ls_requestqueue);
         mutex_init(&ls->ls_requestqueue_mutex);
         mutex_init(&ls->ls_clear_proc_locks);
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 9e9d2e82f40f..58bf3f5cdbe2 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -334,18 +334,8 @@ static void close_connection(struct connection *con, bool and_other)
334 con->rx_page = NULL; 334 con->rx_page = NULL;
335 } 335 }
336 336
337 /* If we are an 'othercon' then NULL the pointer to us 337 con->retries = 0;
338 from the parent and tidy ourself up */ 338 mutex_unlock(&con->sock_mutex);
339 if (test_bit(CF_IS_OTHERCON, &con->flags)) {
340 struct connection *parent = __nodeid2con(con->nodeid, 0);
341 parent->othercon = NULL;
342 kmem_cache_free(con_cache, con);
343 }
344 else {
345 /* Parent connections get reused */
346 con->retries = 0;
347 mutex_unlock(&con->sock_mutex);
348 }
349} 339}
350 340
351/* We only send shutdown messages to nodes that are not part of the cluster */ 341/* We only send shutdown messages to nodes that are not part of the cluster */
@@ -731,6 +721,8 @@ static int tcp_accept_from_sock(struct connection *con)
731 INIT_WORK(&othercon->swork, process_send_sockets); 721 INIT_WORK(&othercon->swork, process_send_sockets);
732 INIT_WORK(&othercon->rwork, process_recv_sockets); 722 INIT_WORK(&othercon->rwork, process_recv_sockets);
733 set_bit(CF_IS_OTHERCON, &othercon->flags); 723 set_bit(CF_IS_OTHERCON, &othercon->flags);
724 }
725 if (!othercon->sock) {
734 newcon->othercon = othercon; 726 newcon->othercon = othercon;
735 othercon->sock = newsock; 727 othercon->sock = newsock;
736 newsock->sk->sk_user_data = othercon; 728 newsock->sk->sk_user_data = othercon;
@@ -1272,14 +1264,15 @@ static void send_to_sock(struct connection *con)
1272 if (len) { 1264 if (len) {
1273 ret = sendpage(con->sock, e->page, offset, len, 1265 ret = sendpage(con->sock, e->page, offset, len,
1274 msg_flags); 1266 msg_flags);
1275 if (ret == -EAGAIN || ret == 0) 1267 if (ret == -EAGAIN || ret == 0) {
1268 cond_resched();
1276 goto out; 1269 goto out;
1270 }
1277 if (ret <= 0) 1271 if (ret <= 0)
1278 goto send_error; 1272 goto send_error;
1279 } else { 1273 }
1280 /* Don't starve people filling buffers */ 1274 /* Don't starve people filling buffers */
1281 cond_resched(); 1275 cond_resched();
1282 }
1283 1276
1284 spin_lock(&con->writequeue_lock); 1277 spin_lock(&con->writequeue_lock);
1285 e->offset += ret; 1278 e->offset += ret;
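
Editorial sketch (illustration only, not part of the patch): the hunk above adds a cond_resched() before the early return when sendpage() comes back short or with -EAGAIN, so a congested peer does not let the sending thread hog the CPU. A minimal userspace analogue of that shape, with a nonblocking socket standing in for the kernel send path (send_some() and its framing are made up here):

#include <errno.h>
#include <sched.h>
#include <stddef.h>
#include <sys/types.h>
#include <sys/socket.h>

/* Returns bytes sent, 0 if the socket is congested, negative on hard error. */
ssize_t send_some(int fd, const char *buf, size_t len)
{
	ssize_t ret = send(fd, buf, len, MSG_DONTWAIT);

	if (ret < 0 && (errno == EAGAIN || errno == EWOULDBLOCK)) {
		sched_yield();	/* plays the role of cond_resched() */
		return 0;	/* caller retries later, as send_to_sock() does */
	}
	return ret;
}
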
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
index d09977528f69..e9cdcab306e2 100644
--- a/fs/dlm/member.c
+++ b/fs/dlm/member.c
@@ -18,10 +18,6 @@
18#include "rcom.h" 18#include "rcom.h"
19#include "config.h" 19#include "config.h"
20 20
21/*
22 * Following called by dlm_recoverd thread
23 */
24
25static void add_ordered_member(struct dlm_ls *ls, struct dlm_member *new) 21static void add_ordered_member(struct dlm_ls *ls, struct dlm_member *new)
26{ 22{
27 struct dlm_member *memb = NULL; 23 struct dlm_member *memb = NULL;
@@ -250,18 +246,30 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
250 return error; 246 return error;
251} 247}
252 248
253/* 249/* Userspace guarantees that dlm_ls_stop() has completed on all nodes before
254 * Following called from lockspace.c 250 dlm_ls_start() is called on any of them to start the new recovery. */
255 */
256 251
257int dlm_ls_stop(struct dlm_ls *ls) 252int dlm_ls_stop(struct dlm_ls *ls)
258{ 253{
259 int new; 254 int new;
260 255
261 /* 256 /*
262 * A stop cancels any recovery that's in progress (see RECOVERY_STOP, 257 * Prevent dlm_recv from being in the middle of something when we do
263 * dlm_recovery_stopped()) and prevents any new locks from being 258 * the stop. This includes ensuring dlm_recv isn't processing a
264 * processed (see RUNNING, dlm_locking_stopped()). 259 * recovery message (rcom), while dlm_recoverd is aborting and
260 * resetting things from an in-progress recovery. i.e. we want
261 * dlm_recoverd to abort its recovery without worrying about dlm_recv
262 * processing an rcom at the same time. Stopping dlm_recv also makes
263 * it easy for dlm_receive_message() to check locking stopped and add a
264 * message to the requestqueue without races.
265 */
266
267 down_write(&ls->ls_recv_active);
268
269 /*
270 * Abort any recovery that's in progress (see RECOVERY_STOP,
271 * dlm_recovery_stopped()) and tell any other threads running in the
272 * dlm to quit any processing (see RUNNING, dlm_locking_stopped()).
265 */ 273 */
266 274
267 spin_lock(&ls->ls_recover_lock); 275 spin_lock(&ls->ls_recover_lock);
@@ -271,8 +279,14 @@ int dlm_ls_stop(struct dlm_ls *ls)
271 spin_unlock(&ls->ls_recover_lock); 279 spin_unlock(&ls->ls_recover_lock);
272 280
273 /* 281 /*
282 * Let dlm_recv run again, now any normal messages will be saved on the
283 * requestqueue for later.
284 */
285
286 up_write(&ls->ls_recv_active);
287
288 /*
274 * This in_recovery lock does two things: 289 * This in_recovery lock does two things:
275 *
276 * 1) Keeps this function from returning until all threads are out 290 * 1) Keeps this function from returning until all threads are out
277 * of locking routines and locking is truely stopped. 291 * of locking routines and locking is truely stopped.
278 * 2) Keeps any new requests from being processed until it's unlocked 292 * 2) Keeps any new requests from being processed until it's unlocked
@@ -284,9 +298,8 @@ int dlm_ls_stop(struct dlm_ls *ls)
284 298
285 /* 299 /*
286 * The recoverd suspend/resume makes sure that dlm_recoverd (if 300 * The recoverd suspend/resume makes sure that dlm_recoverd (if
287 * running) has noticed the clearing of RUNNING above and quit 301 * running) has noticed RECOVERY_STOP above and quit processing the
288 * processing the previous recovery. This will be true for all nodes 302 * previous recovery.
289 * before any nodes start the new recovery.
290 */ 303 */
291 304
292 dlm_recoverd_suspend(ls); 305 dlm_recoverd_suspend(ls);
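
Editorial sketch (illustration only, not part of the patch): the comments added above describe the new ls_recv_active gating: dlm_ls_stop() takes the rwsem for write so no receive thread is mid-message when RUNNING/RECOVERY_STOP flip, and receivers that arrive afterwards see locking stopped and save their messages. A small, self-contained userspace analogue of that pattern with a pthread rwlock; all names here are stand-ins, not the kernel's:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_rwlock_t recv_active = PTHREAD_RWLOCK_INITIALIZER;
static pthread_mutex_t recover_lock = PTHREAD_MUTEX_INITIALIZER;
static bool running = true;

static void process_now(const char *msg)    { printf("process %s\n", msg); }
static void save_for_later(const char *msg) { printf("queue %s\n", msg); }

/* Receive path: hold the lock shared for the duration of one message. */
void receive_one(const char *msg)
{
	bool stopped;

	pthread_rwlock_rdlock(&recv_active);	/* like down_read(&ls->ls_recv_active) */
	pthread_mutex_lock(&recover_lock);
	stopped = !running;			/* like dlm_locking_stopped(ls) */
	pthread_mutex_unlock(&recover_lock);
	if (stopped)
		save_for_later(msg);		/* the requestqueue in the real code */
	else
		process_now(msg);
	pthread_rwlock_unlock(&recv_active);
}

/* Stop path: exclude receivers while the state flips, then let them queue. */
void stop_lockspace(void)
{
	pthread_rwlock_wrlock(&recv_active);	/* like down_write() in dlm_ls_stop() */
	pthread_mutex_lock(&recover_lock);
	running = false;			/* clear RUNNING, set RECOVERY_STOP */
	pthread_mutex_unlock(&recover_lock);
	pthread_rwlock_unlock(&recv_active);	/* new messages now hit the queue */
}

The kernel variant uses the ls_recv_active rw_semaphore plus ls_recover_lock in exactly this ordering, which is why no receiver can observe the flags half-changed.
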
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c
index a5126e0c68a6..f8c69dda16a0 100644
--- a/fs/dlm/midcomms.c
+++ b/fs/dlm/midcomms.c
@@ -2,7 +2,7 @@
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. 5** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
6** 6**
7** This copyrighted material is made available to anyone wishing to use, 7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions 8** modify, copy, or redistribute it subject to the terms and conditions
@@ -27,7 +27,6 @@
27#include "dlm_internal.h" 27#include "dlm_internal.h"
28#include "lowcomms.h" 28#include "lowcomms.h"
29#include "config.h" 29#include "config.h"
30#include "rcom.h"
31#include "lock.h" 30#include "lock.h"
32#include "midcomms.h" 31#include "midcomms.h"
33 32
@@ -117,19 +116,7 @@ int dlm_process_incoming_buffer(int nodeid, const void *base,
117 offset &= (limit - 1); 116 offset &= (limit - 1);
118 len -= msglen; 117 len -= msglen;
119 118
120 switch (msg->h_cmd) { 119 dlm_receive_buffer(msg, nodeid);
121 case DLM_MSG:
122 dlm_receive_message(msg, nodeid, 0);
123 break;
124
125 case DLM_RCOM:
126 dlm_receive_rcom(msg, nodeid);
127 break;
128
129 default:
130 log_print("unknown msg type %x from %u: %u %u %u %u",
131 msg->h_cmd, nodeid, msglen, len, offset, ret);
132 }
133 } 120 }
134 121
135 if (msg != (struct dlm_header *) __tmp) 122 if (msg != (struct dlm_header *) __tmp)
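
Editorial sketch (illustration only, not part of the patch): with this hunk, dlm_process_incoming_buffer() only frames messages; every complete message goes to the single entry point dlm_receive_buffer(), which now owns the DLM_MSG/DLM_RCOM split. A self-contained sketch of that "parse lengths here, dispatch elsewhere" shape, using made-up framing rather than struct dlm_header:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct msg_header {		/* illustrative framing, not struct dlm_header */
	uint16_t length;	/* total message length, header included */
	uint8_t  cmd;		/* DLM_MSG vs DLM_RCOM in the real code */
};

/* Single sink for complete messages; the real dlm_receive_buffer() is where
 * the per-command dispatch now lives. */
static void receive_buffer(const struct msg_header *h, int nodeid)
{
	printf("cmd=%u len=%u from node %d\n", h->cmd, h->length, nodeid);
}

/* Walk a byte stream, peel off complete messages, return bytes consumed. */
size_t process_incoming(int nodeid, const unsigned char *buf, size_t len)
{
	size_t used = 0;

	while (len - used >= sizeof(struct msg_header)) {
		struct msg_header h;

		memcpy(&h, buf + used, sizeof(h));	/* no unaligned loads */
		if (h.length < sizeof(h) || h.length > len - used)
			break;			/* truncated or bogus: stop here */
		receive_buffer(&h, nodeid);
		used += h.length;
	}
	return used;
}
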
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
index 188b91c027e4..ae2fd97fa4ad 100644
--- a/fs/dlm/rcom.c
+++ b/fs/dlm/rcom.c
@@ -2,7 +2,7 @@
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2005 Red Hat, Inc. All rights reserved. 5** Copyright (C) 2005-2007 Red Hat, Inc. All rights reserved.
6** 6**
7** This copyrighted material is made available to anyone wishing to use, 7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions 8** modify, copy, or redistribute it subject to the terms and conditions
@@ -386,7 +386,10 @@ static void receive_rcom_lock_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
386 dlm_recover_process_copy(ls, rc_in); 386 dlm_recover_process_copy(ls, rc_in);
387} 387}
388 388
389static int send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in) 389/* If the lockspace doesn't exist then still send a status message
390 back; it's possible that it just doesn't have its global_id yet. */
391
392int dlm_send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in)
390{ 393{
391 struct dlm_rcom *rc; 394 struct dlm_rcom *rc;
392 struct rcom_config *rf; 395 struct rcom_config *rf;
@@ -446,28 +449,11 @@ static int is_old_reply(struct dlm_ls *ls, struct dlm_rcom *rc)
446 return rv; 449 return rv;
447} 450}
448 451
449/* Called by dlm_recvd; corresponds to dlm_receive_message() but special 452/* Called by dlm_recv; corresponds to dlm_receive_message() but special
450 recovery-only comms are sent through here. */ 453 recovery-only comms are sent through here. */
451 454
452void dlm_receive_rcom(struct dlm_header *hd, int nodeid) 455void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid)
453{ 456{
454 struct dlm_rcom *rc = (struct dlm_rcom *) hd;
455 struct dlm_ls *ls;
456
457 dlm_rcom_in(rc);
458
459 /* If the lockspace doesn't exist then still send a status message
460 back; it's possible that it just doesn't have its global_id yet. */
461
462 ls = dlm_find_lockspace_global(hd->h_lockspace);
463 if (!ls) {
464 log_print("lockspace %x from %d type %x not found",
465 hd->h_lockspace, nodeid, rc->rc_type);
466 if (rc->rc_type == DLM_RCOM_STATUS)
467 send_ls_not_ready(nodeid, rc);
468 return;
469 }
470
471 if (dlm_recovery_stopped(ls) && (rc->rc_type != DLM_RCOM_STATUS)) { 457 if (dlm_recovery_stopped(ls) && (rc->rc_type != DLM_RCOM_STATUS)) {
472 log_debug(ls, "ignoring recovery message %x from %d", 458 log_debug(ls, "ignoring recovery message %x from %d",
473 rc->rc_type, nodeid); 459 rc->rc_type, nodeid);
@@ -477,12 +463,6 @@ void dlm_receive_rcom(struct dlm_header *hd, int nodeid)
477 if (is_old_reply(ls, rc)) 463 if (is_old_reply(ls, rc))
478 goto out; 464 goto out;
479 465
480 if (nodeid != rc->rc_header.h_nodeid) {
481 log_error(ls, "bad rcom nodeid %d from %d",
482 rc->rc_header.h_nodeid, nodeid);
483 goto out;
484 }
485
486 switch (rc->rc_type) { 466 switch (rc->rc_type) {
487 case DLM_RCOM_STATUS: 467 case DLM_RCOM_STATUS:
488 receive_rcom_status(ls, rc); 468 receive_rcom_status(ls, rc);
@@ -520,6 +500,6 @@ void dlm_receive_rcom(struct dlm_header *hd, int nodeid)
520 DLM_ASSERT(0, printk("rc_type=%x\n", rc->rc_type);); 500 DLM_ASSERT(0, printk("rc_type=%x\n", rc->rc_type););
521 } 501 }
522 out: 502 out:
523 dlm_put_lockspace(ls); 503 return;
524} 504}
525 505
diff --git a/fs/dlm/rcom.h b/fs/dlm/rcom.h
index d7984321ff41..b09abd29ba38 100644
--- a/fs/dlm/rcom.h
+++ b/fs/dlm/rcom.h
@@ -2,7 +2,7 @@
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2005 Red Hat, Inc. All rights reserved. 5** Copyright (C) 2005-2007 Red Hat, Inc. All rights reserved.
6** 6**
7** This copyrighted material is made available to anyone wishing to use, 7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions 8** modify, copy, or redistribute it subject to the terms and conditions
@@ -18,7 +18,8 @@ int dlm_rcom_status(struct dlm_ls *ls, int nodeid);
18int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name,int last_len); 18int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name,int last_len);
19int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid); 19int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid);
20int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb); 20int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
21void dlm_receive_rcom(struct dlm_header *hd, int nodeid); 21void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid);
22int dlm_send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in);
22 23
23#endif 24#endif
24 25
diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c
index 66575997861c..4b89e20eebe7 100644
--- a/fs/dlm/recoverd.c
+++ b/fs/dlm/recoverd.c
@@ -24,19 +24,28 @@
24 24
25 25
26/* If the start for which we're re-enabling locking (seq) has been superseded 26/* If the start for which we're re-enabling locking (seq) has been superseded
27 by a newer stop (ls_recover_seq), we need to leave locking disabled. */ 27 by a newer stop (ls_recover_seq), we need to leave locking disabled.
28
29 We suspend dlm_recv threads here to avoid the race where dlm_recv a) sees
30 locking stopped and b) adds a message to the requestqueue, but dlm_recoverd
31 enables locking and clears the requestqueue between a and b. */
28 32
29static int enable_locking(struct dlm_ls *ls, uint64_t seq) 33static int enable_locking(struct dlm_ls *ls, uint64_t seq)
30{ 34{
31 int error = -EINTR; 35 int error = -EINTR;
32 36
37 down_write(&ls->ls_recv_active);
38
33 spin_lock(&ls->ls_recover_lock); 39 spin_lock(&ls->ls_recover_lock);
34 if (ls->ls_recover_seq == seq) { 40 if (ls->ls_recover_seq == seq) {
35 set_bit(LSFL_RUNNING, &ls->ls_flags); 41 set_bit(LSFL_RUNNING, &ls->ls_flags);
42 /* unblocks processes waiting to enter the dlm */
36 up_write(&ls->ls_in_recovery); 43 up_write(&ls->ls_in_recovery);
37 error = 0; 44 error = 0;
38 } 45 }
39 spin_unlock(&ls->ls_recover_lock); 46 spin_unlock(&ls->ls_recover_lock);
47
48 up_write(&ls->ls_recv_active);
40 return error; 49 return error;
41} 50}
42 51
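
Editorial sketch (illustration only, not part of the patch): enable_locking() above re-enables the lockspace only if the start sequence it was given still matches ls_recover_seq, i.e. no newer stop has superseded this recovery, and it does that while holding ls_recv_active for write just like dlm_ls_stop(). The sketch below shows only the sequence check, with stand-in names:

#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>

static pthread_mutex_t recover_lock = PTHREAD_MUTEX_INITIALIZER;
static uint64_t recover_seq;	/* bumped by every stop */
static bool running;

/* Returns true if locking was re-enabled for this recovery, false if a
 * newer stop arrived and locking must stay disabled. */
bool enable_locking(uint64_t seq)
{
	bool enabled = false;

	pthread_mutex_lock(&recover_lock);
	if (recover_seq == seq) {	/* no newer stop since this start */
		running = true;
		enabled = true;
	}
	pthread_mutex_unlock(&recover_lock);
	return enabled;
}
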
diff --git a/fs/dlm/requestqueue.c b/fs/dlm/requestqueue.c
index 65008d79c96d..0de04f17ccea 100644
--- a/fs/dlm/requestqueue.c
+++ b/fs/dlm/requestqueue.c
@@ -1,7 +1,7 @@
1/****************************************************************************** 1/******************************************************************************
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved. 4** Copyright (C) 2005-2007 Red Hat, Inc. All rights reserved.
5** 5**
6** This copyrighted material is made available to anyone wishing to use, 6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions 7** modify, copy, or redistribute it subject to the terms and conditions
@@ -20,7 +20,7 @@
20struct rq_entry { 20struct rq_entry {
21 struct list_head list; 21 struct list_head list;
22 int nodeid; 22 int nodeid;
23 char request[1]; 23 char request[0];
24}; 24};
25 25
26/* 26/*
@@ -30,42 +30,39 @@ struct rq_entry {
30 * lockspace is enabled on some while still suspended on others. 30 * lockspace is enabled on some while still suspended on others.
31 */ 31 */
32 32
33int dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd) 33void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd)
34{ 34{
35 struct rq_entry *e; 35 struct rq_entry *e;
36 int length = hd->h_length; 36 int length = hd->h_length;
37 int rv = 0;
38 37
39 e = kmalloc(sizeof(struct rq_entry) + length, GFP_KERNEL); 38 e = kmalloc(sizeof(struct rq_entry) + length, GFP_KERNEL);
40 if (!e) { 39 if (!e) {
41 log_print("dlm_add_requestqueue: out of memory\n"); 40 log_print("dlm_add_requestqueue: out of memory len %d", length);
42 return 0; 41 return;
43 } 42 }
44 43
45 e->nodeid = nodeid; 44 e->nodeid = nodeid;
46 memcpy(e->request, hd, length); 45 memcpy(e->request, hd, length);
47 46
48 /* We need to check dlm_locking_stopped() after taking the mutex to
49 avoid a race where dlm_recoverd enables locking and runs
50 process_requestqueue between our earlier dlm_locking_stopped check
51 and this addition to the requestqueue. */
52
53 mutex_lock(&ls->ls_requestqueue_mutex); 47 mutex_lock(&ls->ls_requestqueue_mutex);
54 if (dlm_locking_stopped(ls)) 48 list_add_tail(&e->list, &ls->ls_requestqueue);
55 list_add_tail(&e->list, &ls->ls_requestqueue);
56 else {
57 log_debug(ls, "dlm_add_requestqueue skip from %d", nodeid);
58 kfree(e);
59 rv = -EAGAIN;
60 }
61 mutex_unlock(&ls->ls_requestqueue_mutex); 49 mutex_unlock(&ls->ls_requestqueue_mutex);
62 return rv;
63} 50}
64 51
52/*
53 * Called by dlm_recoverd to process normal messages saved while recovery was
54 * happening. Normal locking has been enabled before this is called. dlm_recv
55 * upon receiving a message, will wait for all saved messages to be drained
56 * here before processing the message it got. If a new dlm_ls_stop() arrives
57 * while we're processing these saved messages, it may block trying to suspend
58 * dlm_recv if dlm_recv is waiting for us in dlm_wait_requestqueue. In that
59 * case, we don't abort since locking_stopped is still 0. If dlm_recv is not
60 * waiting for us, then this processing may be aborted due to locking_stopped.
61 */
62
65int dlm_process_requestqueue(struct dlm_ls *ls) 63int dlm_process_requestqueue(struct dlm_ls *ls)
66{ 64{
67 struct rq_entry *e; 65 struct rq_entry *e;
68 struct dlm_header *hd;
69 int error = 0; 66 int error = 0;
70 67
71 mutex_lock(&ls->ls_requestqueue_mutex); 68 mutex_lock(&ls->ls_requestqueue_mutex);
@@ -79,14 +76,7 @@ int dlm_process_requestqueue(struct dlm_ls *ls)
79 e = list_entry(ls->ls_requestqueue.next, struct rq_entry, list); 76 e = list_entry(ls->ls_requestqueue.next, struct rq_entry, list);
80 mutex_unlock(&ls->ls_requestqueue_mutex); 77 mutex_unlock(&ls->ls_requestqueue_mutex);
81 78
82 hd = (struct dlm_header *) e->request; 79 dlm_receive_message_saved(ls, (struct dlm_message *)e->request);
83 error = dlm_receive_message(hd, e->nodeid, 1);
84
85 if (error == -EINTR) {
86 /* entry is left on requestqueue */
87 log_debug(ls, "process_requestqueue abort eintr");
88 break;
89 }
90 80
91 mutex_lock(&ls->ls_requestqueue_mutex); 81 mutex_lock(&ls->ls_requestqueue_mutex);
92 list_del(&e->list); 82 list_del(&e->list);
@@ -106,10 +96,12 @@ int dlm_process_requestqueue(struct dlm_ls *ls)
106 96
107/* 97/*
108 * After recovery is done, locking is resumed and dlm_recoverd takes all the 98 * After recovery is done, locking is resumed and dlm_recoverd takes all the
109 * saved requests and processes them as they would have been by dlm_recvd. At 99 * saved requests and processes them as they would have been by dlm_recv. At
110 * the same time, dlm_recvd will start receiving new requests from remote 100 * the same time, dlm_recv will start receiving new requests from remote nodes.
111 * nodes. We want to delay dlm_recvd processing new requests until 101 * We want to delay dlm_recv processing new requests until dlm_recoverd has
112 * dlm_recoverd has finished processing the old saved requests. 102 * finished processing the old saved requests. We don't check for locking
103 * stopped here because dlm_ls_stop won't stop locking until it's suspended us
104 * (dlm_recv).
113 */ 105 */
114 106
115void dlm_wait_requestqueue(struct dlm_ls *ls) 107void dlm_wait_requestqueue(struct dlm_ls *ls)
@@ -118,8 +110,6 @@ void dlm_wait_requestqueue(struct dlm_ls *ls)
118 mutex_lock(&ls->ls_requestqueue_mutex); 110 mutex_lock(&ls->ls_requestqueue_mutex);
119 if (list_empty(&ls->ls_requestqueue)) 111 if (list_empty(&ls->ls_requestqueue))
120 break; 112 break;
121 if (dlm_locking_stopped(ls))
122 break;
123 mutex_unlock(&ls->ls_requestqueue_mutex); 113 mutex_unlock(&ls->ls_requestqueue_mutex);
124 schedule(); 114 schedule();
125 } 115 }
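
Editorial sketch (illustration only, not part of the patch): rq_entry now ends in a zero-length array (request[0] rather than request[1]), so the copied message starts exactly at the end of the struct and nothing is over-allocated; modern C spells this as a flexible array member. A self-contained sketch of the same save-and-drain queue, with illustrative names:

#include <stdlib.h>
#include <string.h>

struct rq_entry {
	struct rq_entry *next;
	int nodeid;
	size_t length;
	char request[];		/* message payload copied in-line */
};

static struct rq_entry *head, **tail = &head;

/* Save a copy of a message to be replayed after recovery finishes. */
int add_requestqueue(int nodeid, const void *msg, size_t length)
{
	struct rq_entry *e = malloc(sizeof(*e) + length);

	if (!e)
		return -1;	/* the kernel code just logs and drops it */
	e->next = NULL;
	e->nodeid = nodeid;
	e->length = length;
	memcpy(e->request, msg, length);
	*tail = e;
	tail = &e->next;
	return 0;
}

/* Replay saved messages in arrival order once locking is enabled again. */
void process_requestqueue(void (*deliver)(int nodeid, const void *msg, size_t len))
{
	while (head) {
		struct rq_entry *e = head;

		head = e->next;
		if (!head)
			tail = &head;
		deliver(e->nodeid, e->request, e->length);
		free(e);
	}
}

In the patch the deliver step is dlm_receive_message_saved(), and draining happens in dlm_recoverd before dlm_recv is allowed to process anything new.
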
diff --git a/fs/dlm/requestqueue.h b/fs/dlm/requestqueue.h
index 6a53ea03335d..aba34fc05ee4 100644
--- a/fs/dlm/requestqueue.h
+++ b/fs/dlm/requestqueue.h
@@ -1,7 +1,7 @@
1/****************************************************************************** 1/******************************************************************************
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved. 4** Copyright (C) 2005-2007 Red Hat, Inc. All rights reserved.
5** 5**
6** This copyrighted material is made available to anyone wishing to use, 6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions 7** modify, copy, or redistribute it subject to the terms and conditions
@@ -13,7 +13,7 @@
13#ifndef __REQUESTQUEUE_DOT_H__ 13#ifndef __REQUESTQUEUE_DOT_H__
14#define __REQUESTQUEUE_DOT_H__ 14#define __REQUESTQUEUE_DOT_H__
15 15
16int dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd); 16void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd);
17int dlm_process_requestqueue(struct dlm_ls *ls); 17int dlm_process_requestqueue(struct dlm_ls *ls);
18void dlm_wait_requestqueue(struct dlm_ls *ls); 18void dlm_wait_requestqueue(struct dlm_ls *ls);
19void dlm_purge_requestqueue(struct dlm_ls *ls); 19void dlm_purge_requestqueue(struct dlm_ls *ls);
diff --git a/fs/ecryptfs/netlink.c b/fs/ecryptfs/netlink.c
index fe9186312d7c..9aa345121e09 100644
--- a/fs/ecryptfs/netlink.c
+++ b/fs/ecryptfs/netlink.c
@@ -165,22 +165,10 @@ static int ecryptfs_process_nl_quit(struct sk_buff *skb)
165 * it to its desired netlink context element and wake up the process 165 * it to its desired netlink context element and wake up the process
166 * that is waiting for a response. 166 * that is waiting for a response.
167 */ 167 */
168static void ecryptfs_receive_nl_message(struct sock *sk, int len) 168static void ecryptfs_receive_nl_message(struct sk_buff *skb)
169{ 169{
170 struct sk_buff *skb;
171 struct nlmsghdr *nlh; 170 struct nlmsghdr *nlh;
172 int rc = 0; /* skb_recv_datagram requires this */
173 171
174receive:
175 skb = skb_recv_datagram(sk, 0, 0, &rc);
176 if (rc == -EINTR)
177 goto receive;
178 else if (rc < 0) {
179 ecryptfs_printk(KERN_ERR, "Error occurred while "
180 "receiving eCryptfs netlink message; "
181 "rc = [%d]\n", rc);
182 return;
183 }
184 nlh = nlmsg_hdr(skb); 172 nlh = nlmsg_hdr(skb);
185 if (!NLMSG_OK(nlh, skb->len)) { 173 if (!NLMSG_OK(nlh, skb->len)) {
186 ecryptfs_printk(KERN_ERR, "Received corrupt netlink " 174 ecryptfs_printk(KERN_ERR, "Received corrupt netlink "
@@ -227,7 +215,7 @@ int ecryptfs_init_netlink(void)
227{ 215{
228 int rc; 216 int rc;
229 217
230 ecryptfs_nl_sock = netlink_kernel_create(NETLINK_ECRYPTFS, 0, 218 ecryptfs_nl_sock = netlink_kernel_create(&init_net, NETLINK_ECRYPTFS, 0,
231 ecryptfs_receive_nl_message, 219 ecryptfs_receive_nl_message,
232 NULL, THIS_MODULE); 220 NULL, THIS_MODULE);
233 if (!ecryptfs_nl_sock) { 221 if (!ecryptfs_nl_sock) {
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index a4b142a6a2c7..8d23b0b38717 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -14,6 +14,7 @@
14 */ 14 */
15 15
16#include <linux/kernel.h> 16#include <linux/kernel.h>
17#include <linux/module.h>
17#include <linux/spinlock.h> 18#include <linux/spinlock.h>
18#include <linux/sched.h> 19#include <linux/sched.h>
19#include <linux/fs.h> 20#include <linux/fs.h>
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index cd805a66880d..93fa427bb5f5 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -93,9 +93,10 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
93 map_bh(bh, inode->i_sb, block); 93 map_bh(bh, inode->i_sb, block);
94 94
95 set_buffer_uptodate(bh); 95 set_buffer_uptodate(bh);
96 if (!gfs2_is_jdata(ip))
97 mark_buffer_dirty(bh);
96 if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip)) 98 if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip))
97 gfs2_trans_add_bh(ip->i_gl, bh, 0); 99 gfs2_trans_add_bh(ip->i_gl, bh, 0);
98 mark_buffer_dirty(bh);
99 100
100 if (release) { 101 if (release) {
101 unlock_page(page); 102 unlock_page(page);
@@ -1085,6 +1086,33 @@ static int do_shrink(struct gfs2_inode *ip, u64 size)
1085 return error; 1086 return error;
1086} 1087}
1087 1088
1089static int do_touch(struct gfs2_inode *ip, u64 size)
1090{
1091 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1092 struct buffer_head *dibh;
1093 int error;
1094
1095 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
1096 if (error)
1097 return error;
1098
1099 down_write(&ip->i_rw_mutex);
1100
1101 error = gfs2_meta_inode_buffer(ip, &dibh);
1102 if (error)
1103 goto do_touch_out;
1104
1105 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
1106 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1107 gfs2_dinode_out(ip, dibh->b_data);
1108 brelse(dibh);
1109
1110do_touch_out:
1111 up_write(&ip->i_rw_mutex);
1112 gfs2_trans_end(sdp);
1113 return error;
1114}
1115
1088/** 1116/**
1089 * gfs2_truncatei - make a file a given size 1117 * gfs2_truncatei - make a file a given size
1090 * @ip: the inode 1118 * @ip: the inode
@@ -1105,8 +1133,11 @@ int gfs2_truncatei(struct gfs2_inode *ip, u64 size)
1105 1133
1106 if (size > ip->i_di.di_size) 1134 if (size > ip->i_di.di_size)
1107 error = do_grow(ip, size); 1135 error = do_grow(ip, size);
1108 else 1136 else if (size < ip->i_di.di_size)
1109 error = do_shrink(ip, size); 1137 error = do_shrink(ip, size);
1138 else
1139 /* update time stamps */
1140 error = do_touch(ip, size);
1110 1141
1111 return error; 1142 return error;
1112} 1143}
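
Editorial sketch (illustration only, not part of the patch): gfs2_truncatei() now dispatches three ways instead of two, so truncating a file to its current size still updates mtime/ctime via the new do_touch(). The dispatch shape, with trivial stand-in handlers:

#include <stdint.h>
#include <stdio.h>

/* Stand-ins for the real do_grow/do_shrink/do_touch helpers. */
static int do_grow(uint64_t size)   { printf("grow to %llu\n",   (unsigned long long)size); return 0; }
static int do_shrink(uint64_t size) { printf("shrink to %llu\n", (unsigned long long)size); return 0; }
static int do_touch(uint64_t size)  { printf("touch at %llu\n",  (unsigned long long)size); return 0; }

int truncatei(uint64_t cur_size, uint64_t new_size)
{
	if (new_size > cur_size)
		return do_grow(new_size);
	else if (new_size < cur_size)
		return do_shrink(new_size);
	/* same size: the newly added case, only the time stamps change */
	return do_touch(new_size);
}
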
diff --git a/fs/gfs2/daemon.c b/fs/gfs2/daemon.c
index 3548d9f31e0d..3731ab0771d5 100644
--- a/fs/gfs2/daemon.c
+++ b/fs/gfs2/daemon.c
@@ -35,30 +35,6 @@
35 The kthread functions used to start these daemons block and flush signals. */ 35 The kthread functions used to start these daemons block and flush signals. */
36 36
37/** 37/**
38 * gfs2_scand - Look for cached glocks and inodes to toss from memory
39 * @sdp: Pointer to GFS2 superblock
40 *
41 * One of these daemons runs, finding candidates to add to sd_reclaim_list.
42 * See gfs2_glockd()
43 */
44
45int gfs2_scand(void *data)
46{
47 struct gfs2_sbd *sdp = data;
48 unsigned long t;
49
50 while (!kthread_should_stop()) {
51 gfs2_scand_internal(sdp);
52 t = gfs2_tune_get(sdp, gt_scand_secs) * HZ;
53 if (freezing(current))
54 refrigerator();
55 schedule_timeout_interruptible(t);
56 }
57
58 return 0;
59}
60
61/**
62 * gfs2_glockd - Reclaim unused glock structures 38 * gfs2_glockd - Reclaim unused glock structures
63 * @sdp: Pointer to GFS2 superblock 39 * @sdp: Pointer to GFS2 superblock
64 * 40 *
diff --git a/fs/gfs2/daemon.h b/fs/gfs2/daemon.h
index 801007120fb2..0de9b3557955 100644
--- a/fs/gfs2/daemon.h
+++ b/fs/gfs2/daemon.h
@@ -10,7 +10,6 @@
10#ifndef __DAEMON_DOT_H__ 10#ifndef __DAEMON_DOT_H__
11#define __DAEMON_DOT_H__ 11#define __DAEMON_DOT_H__
12 12
13int gfs2_scand(void *data);
14int gfs2_glockd(void *data); 13int gfs2_glockd(void *data);
15int gfs2_recoverd(void *data); 14int gfs2_recoverd(void *data);
16int gfs2_logd(void *data); 15int gfs2_logd(void *data);
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 2beb2f401aa2..9949bb746a52 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -1043,6 +1043,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
1043 1043
1044 error = gfs2_meta_inode_buffer(dip, &dibh); 1044 error = gfs2_meta_inode_buffer(dip, &dibh);
1045 if (!gfs2_assert_withdraw(GFS2_SB(&dip->i_inode), !error)) { 1045 if (!gfs2_assert_withdraw(GFS2_SB(&dip->i_inode), !error)) {
1046 gfs2_trans_add_bh(dip->i_gl, dibh, 1);
1046 dip->i_di.di_blocks++; 1047 dip->i_di.di_blocks++;
1047 gfs2_set_inode_blocks(&dip->i_inode); 1048 gfs2_set_inode_blocks(&dip->i_inode);
1048 gfs2_dinode_out(dip, dibh->b_data); 1049 gfs2_dinode_out(dip, dibh->b_data);
@@ -1501,7 +1502,7 @@ struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name)
1501 inode = gfs2_inode_lookup(dir->i_sb, 1502 inode = gfs2_inode_lookup(dir->i_sb,
1502 be16_to_cpu(dent->de_type), 1503 be16_to_cpu(dent->de_type),
1503 be64_to_cpu(dent->de_inum.no_addr), 1504 be64_to_cpu(dent->de_inum.no_addr),
1504 be64_to_cpu(dent->de_inum.no_formal_ino)); 1505 be64_to_cpu(dent->de_inum.no_formal_ino), 0);
1505 brelse(bh); 1506 brelse(bh);
1506 return inode; 1507 return inode;
1507 } 1508 }
diff --git a/fs/gfs2/eaops.c b/fs/gfs2/eaops.c
index 1ab3e9d73886..aa8dbf303f6d 100644
--- a/fs/gfs2/eaops.c
+++ b/fs/gfs2/eaops.c
@@ -200,28 +200,28 @@ static int security_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
200 return gfs2_ea_remove_i(ip, er); 200 return gfs2_ea_remove_i(ip, er);
201} 201}
202 202
203static struct gfs2_eattr_operations gfs2_user_eaops = { 203static const struct gfs2_eattr_operations gfs2_user_eaops = {
204 .eo_get = user_eo_get, 204 .eo_get = user_eo_get,
205 .eo_set = user_eo_set, 205 .eo_set = user_eo_set,
206 .eo_remove = user_eo_remove, 206 .eo_remove = user_eo_remove,
207 .eo_name = "user", 207 .eo_name = "user",
208}; 208};
209 209
210struct gfs2_eattr_operations gfs2_system_eaops = { 210const struct gfs2_eattr_operations gfs2_system_eaops = {
211 .eo_get = system_eo_get, 211 .eo_get = system_eo_get,
212 .eo_set = system_eo_set, 212 .eo_set = system_eo_set,
213 .eo_remove = system_eo_remove, 213 .eo_remove = system_eo_remove,
214 .eo_name = "system", 214 .eo_name = "system",
215}; 215};
216 216
217static struct gfs2_eattr_operations gfs2_security_eaops = { 217static const struct gfs2_eattr_operations gfs2_security_eaops = {
218 .eo_get = security_eo_get, 218 .eo_get = security_eo_get,
219 .eo_set = security_eo_set, 219 .eo_set = security_eo_set,
220 .eo_remove = security_eo_remove, 220 .eo_remove = security_eo_remove,
221 .eo_name = "security", 221 .eo_name = "security",
222}; 222};
223 223
224struct gfs2_eattr_operations *gfs2_ea_ops[] = { 224const struct gfs2_eattr_operations *gfs2_ea_ops[] = {
225 NULL, 225 NULL,
226 &gfs2_user_eaops, 226 &gfs2_user_eaops,
227 &gfs2_system_eaops, 227 &gfs2_system_eaops,
diff --git a/fs/gfs2/eaops.h b/fs/gfs2/eaops.h
index 508b4f7a2449..da2f7fbbb40d 100644
--- a/fs/gfs2/eaops.h
+++ b/fs/gfs2/eaops.h
@@ -22,9 +22,9 @@ struct gfs2_eattr_operations {
22 22
23unsigned int gfs2_ea_name2type(const char *name, const char **truncated_name); 23unsigned int gfs2_ea_name2type(const char *name, const char **truncated_name);
24 24
25extern struct gfs2_eattr_operations gfs2_system_eaops; 25extern const struct gfs2_eattr_operations gfs2_system_eaops;
26 26
27extern struct gfs2_eattr_operations *gfs2_ea_ops[]; 27extern const struct gfs2_eattr_operations *gfs2_ea_ops[];
28 28
29#endif /* __EAOPS_DOT_H__ */ 29#endif /* __EAOPS_DOT_H__ */
30 30
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 3f0974e1afef..a37efe4aae6f 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -25,8 +25,10 @@
25#include <asm/uaccess.h> 25#include <asm/uaccess.h>
26#include <linux/seq_file.h> 26#include <linux/seq_file.h>
27#include <linux/debugfs.h> 27#include <linux/debugfs.h>
28#include <linux/module.h> 28#include <linux/kthread.h>
29#include <linux/kallsyms.h> 29#include <linux/freezer.h>
30#include <linux/workqueue.h>
31#include <linux/jiffies.h>
30 32
31#include "gfs2.h" 33#include "gfs2.h"
32#include "incore.h" 34#include "incore.h"
@@ -48,7 +50,6 @@ struct glock_iter {
48 int hash; /* hash bucket index */ 50 int hash; /* hash bucket index */
49 struct gfs2_sbd *sdp; /* incore superblock */ 51 struct gfs2_sbd *sdp; /* incore superblock */
50 struct gfs2_glock *gl; /* current glock struct */ 52 struct gfs2_glock *gl; /* current glock struct */
51 struct hlist_head *hb_list; /* current hash bucket ptr */
52 struct seq_file *seq; /* sequence file for debugfs */ 53 struct seq_file *seq; /* sequence file for debugfs */
53 char string[512]; /* scratch space */ 54 char string[512]; /* scratch space */
54}; 55};
@@ -59,8 +60,13 @@ static int gfs2_dump_lockstate(struct gfs2_sbd *sdp);
59static int dump_glock(struct glock_iter *gi, struct gfs2_glock *gl); 60static int dump_glock(struct glock_iter *gi, struct gfs2_glock *gl);
60static void gfs2_glock_xmote_th(struct gfs2_glock *gl, struct gfs2_holder *gh); 61static void gfs2_glock_xmote_th(struct gfs2_glock *gl, struct gfs2_holder *gh);
61static void gfs2_glock_drop_th(struct gfs2_glock *gl); 62static void gfs2_glock_drop_th(struct gfs2_glock *gl);
63static void run_queue(struct gfs2_glock *gl);
64
62static DECLARE_RWSEM(gfs2_umount_flush_sem); 65static DECLARE_RWSEM(gfs2_umount_flush_sem);
63static struct dentry *gfs2_root; 66static struct dentry *gfs2_root;
67static struct task_struct *scand_process;
68static unsigned int scand_secs = 5;
69static struct workqueue_struct *glock_workqueue;
64 70
65#define GFS2_GL_HASH_SHIFT 15 71#define GFS2_GL_HASH_SHIFT 15
66#define GFS2_GL_HASH_SIZE (1 << GFS2_GL_HASH_SHIFT) 72#define GFS2_GL_HASH_SIZE (1 << GFS2_GL_HASH_SHIFT)
@@ -276,6 +282,18 @@ static struct gfs2_glock *gfs2_glock_find(const struct gfs2_sbd *sdp,
276 return gl; 282 return gl;
277} 283}
278 284
285static void glock_work_func(struct work_struct *work)
286{
287 struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_work.work);
288
289 spin_lock(&gl->gl_spin);
290 if (test_and_clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags))
291 set_bit(GLF_DEMOTE, &gl->gl_flags);
292 run_queue(gl);
293 spin_unlock(&gl->gl_spin);
294 gfs2_glock_put(gl);
295}
296
279/** 297/**
280 * gfs2_glock_get() - Get a glock, or create one if one doesn't exist 298 * gfs2_glock_get() - Get a glock, or create one if one doesn't exist
281 * @sdp: The GFS2 superblock 299 * @sdp: The GFS2 superblock
@@ -315,6 +333,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
315 gl->gl_name = name; 333 gl->gl_name = name;
316 atomic_set(&gl->gl_ref, 1); 334 atomic_set(&gl->gl_ref, 1);
317 gl->gl_state = LM_ST_UNLOCKED; 335 gl->gl_state = LM_ST_UNLOCKED;
336 gl->gl_demote_state = LM_ST_EXCLUSIVE;
318 gl->gl_hash = hash; 337 gl->gl_hash = hash;
319 gl->gl_owner_pid = 0; 338 gl->gl_owner_pid = 0;
320 gl->gl_ip = 0; 339 gl->gl_ip = 0;
@@ -323,10 +342,12 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
323 gl->gl_req_bh = NULL; 342 gl->gl_req_bh = NULL;
324 gl->gl_vn = 0; 343 gl->gl_vn = 0;
325 gl->gl_stamp = jiffies; 344 gl->gl_stamp = jiffies;
345 gl->gl_tchange = jiffies;
326 gl->gl_object = NULL; 346 gl->gl_object = NULL;
327 gl->gl_sbd = sdp; 347 gl->gl_sbd = sdp;
328 gl->gl_aspace = NULL; 348 gl->gl_aspace = NULL;
329 lops_init_le(&gl->gl_le, &gfs2_glock_lops); 349 lops_init_le(&gl->gl_le, &gfs2_glock_lops);
350 INIT_DELAYED_WORK(&gl->gl_work, glock_work_func);
330 351
331 /* If this glock protects actual on-disk data or metadata blocks, 352 /* If this glock protects actual on-disk data or metadata blocks,
332 create a VFS inode to manage the pages/buffers holding them. */ 353 create a VFS inode to manage the pages/buffers holding them. */
@@ -440,6 +461,8 @@ static void wait_on_holder(struct gfs2_holder *gh)
440 461
441static void gfs2_demote_wake(struct gfs2_glock *gl) 462static void gfs2_demote_wake(struct gfs2_glock *gl)
442{ 463{
464 BUG_ON(!spin_is_locked(&gl->gl_spin));
465 gl->gl_demote_state = LM_ST_EXCLUSIVE;
443 clear_bit(GLF_DEMOTE, &gl->gl_flags); 466 clear_bit(GLF_DEMOTE, &gl->gl_flags);
444 smp_mb__after_clear_bit(); 467 smp_mb__after_clear_bit();
445 wake_up_bit(&gl->gl_flags, GLF_DEMOTE); 468 wake_up_bit(&gl->gl_flags, GLF_DEMOTE);
@@ -545,12 +568,14 @@ static int rq_demote(struct gfs2_glock *gl)
545 return 0; 568 return 0;
546 } 569 }
547 set_bit(GLF_LOCK, &gl->gl_flags); 570 set_bit(GLF_LOCK, &gl->gl_flags);
548 spin_unlock(&gl->gl_spin);
549 if (gl->gl_demote_state == LM_ST_UNLOCKED || 571 if (gl->gl_demote_state == LM_ST_UNLOCKED ||
550 gl->gl_state != LM_ST_EXCLUSIVE) 572 gl->gl_state != LM_ST_EXCLUSIVE) {
573 spin_unlock(&gl->gl_spin);
551 gfs2_glock_drop_th(gl); 574 gfs2_glock_drop_th(gl);
552 else 575 } else {
576 spin_unlock(&gl->gl_spin);
553 gfs2_glock_xmote_th(gl, NULL); 577 gfs2_glock_xmote_th(gl, NULL);
578 }
554 spin_lock(&gl->gl_spin); 579 spin_lock(&gl->gl_spin);
555 580
556 return 0; 581 return 0;
@@ -679,24 +704,25 @@ static void gfs2_glmutex_unlock(struct gfs2_glock *gl)
679 * practise: LM_ST_SHARED and LM_ST_UNLOCKED 704 * practise: LM_ST_SHARED and LM_ST_UNLOCKED
680 */ 705 */
681 706
682static void handle_callback(struct gfs2_glock *gl, unsigned int state, int remote) 707static void handle_callback(struct gfs2_glock *gl, unsigned int state,
708 int remote, unsigned long delay)
683{ 709{
710 int bit = delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE;
711
684 spin_lock(&gl->gl_spin); 712 spin_lock(&gl->gl_spin);
685 if (test_and_set_bit(GLF_DEMOTE, &gl->gl_flags) == 0) { 713 set_bit(bit, &gl->gl_flags);
714 if (gl->gl_demote_state == LM_ST_EXCLUSIVE) {
686 gl->gl_demote_state = state; 715 gl->gl_demote_state = state;
687 gl->gl_demote_time = jiffies; 716 gl->gl_demote_time = jiffies;
688 if (remote && gl->gl_ops->go_type == LM_TYPE_IOPEN && 717 if (remote && gl->gl_ops->go_type == LM_TYPE_IOPEN &&
689 gl->gl_object) { 718 gl->gl_object) {
690 struct inode *inode = igrab(gl->gl_object); 719 gfs2_glock_schedule_for_reclaim(gl);
691 spin_unlock(&gl->gl_spin); 720 spin_unlock(&gl->gl_spin);
692 if (inode) {
693 d_prune_aliases(inode);
694 iput(inode);
695 }
696 return; 721 return;
697 } 722 }
698 } else if (gl->gl_demote_state != LM_ST_UNLOCKED) { 723 } else if (gl->gl_demote_state != LM_ST_UNLOCKED &&
699 gl->gl_demote_state = state; 724 gl->gl_demote_state != state) {
725 gl->gl_demote_state = LM_ST_UNLOCKED;
700 } 726 }
701 spin_unlock(&gl->gl_spin); 727 spin_unlock(&gl->gl_spin);
702} 728}
@@ -723,6 +749,7 @@ static void state_change(struct gfs2_glock *gl, unsigned int new_state)
723 } 749 }
724 750
725 gl->gl_state = new_state; 751 gl->gl_state = new_state;
752 gl->gl_tchange = jiffies;
726} 753}
727 754
728/** 755/**
@@ -760,10 +787,20 @@ static void xmote_bh(struct gfs2_glock *gl, unsigned int ret)
760 787
761 if (!gh) { 788 if (!gh) {
762 gl->gl_stamp = jiffies; 789 gl->gl_stamp = jiffies;
763 if (ret & LM_OUT_CANCELED) 790 if (ret & LM_OUT_CANCELED) {
764 op_done = 0; 791 op_done = 0;
765 else 792 } else {
793 spin_lock(&gl->gl_spin);
794 if (gl->gl_state != gl->gl_demote_state) {
795 gl->gl_req_bh = NULL;
796 spin_unlock(&gl->gl_spin);
797 gfs2_glock_drop_th(gl);
798 gfs2_glock_put(gl);
799 return;
800 }
766 gfs2_demote_wake(gl); 801 gfs2_demote_wake(gl);
802 spin_unlock(&gl->gl_spin);
803 }
767 } else { 804 } else {
768 spin_lock(&gl->gl_spin); 805 spin_lock(&gl->gl_spin);
769 list_del_init(&gh->gh_list); 806 list_del_init(&gh->gh_list);
@@ -799,7 +836,6 @@ out:
799 gl->gl_req_gh = NULL; 836 gl->gl_req_gh = NULL;
800 gl->gl_req_bh = NULL; 837 gl->gl_req_bh = NULL;
801 clear_bit(GLF_LOCK, &gl->gl_flags); 838 clear_bit(GLF_LOCK, &gl->gl_flags);
802 run_queue(gl);
803 spin_unlock(&gl->gl_spin); 839 spin_unlock(&gl->gl_spin);
804 } 840 }
805 841
@@ -817,7 +853,7 @@ out:
817 * 853 *
818 */ 854 */
819 855
820void gfs2_glock_xmote_th(struct gfs2_glock *gl, struct gfs2_holder *gh) 856static void gfs2_glock_xmote_th(struct gfs2_glock *gl, struct gfs2_holder *gh)
821{ 857{
822 struct gfs2_sbd *sdp = gl->gl_sbd; 858 struct gfs2_sbd *sdp = gl->gl_sbd;
823 int flags = gh ? gh->gh_flags : 0; 859 int flags = gh ? gh->gh_flags : 0;
@@ -871,7 +907,6 @@ static void drop_bh(struct gfs2_glock *gl, unsigned int ret)
871 gfs2_assert_warn(sdp, !ret); 907 gfs2_assert_warn(sdp, !ret);
872 908
873 state_change(gl, LM_ST_UNLOCKED); 909 state_change(gl, LM_ST_UNLOCKED);
874 gfs2_demote_wake(gl);
875 910
876 if (glops->go_inval) 911 if (glops->go_inval)
877 glops->go_inval(gl, DIO_METADATA); 912 glops->go_inval(gl, DIO_METADATA);
@@ -884,10 +919,10 @@ static void drop_bh(struct gfs2_glock *gl, unsigned int ret)
884 } 919 }
885 920
886 spin_lock(&gl->gl_spin); 921 spin_lock(&gl->gl_spin);
922 gfs2_demote_wake(gl);
887 gl->gl_req_gh = NULL; 923 gl->gl_req_gh = NULL;
888 gl->gl_req_bh = NULL; 924 gl->gl_req_bh = NULL;
889 clear_bit(GLF_LOCK, &gl->gl_flags); 925 clear_bit(GLF_LOCK, &gl->gl_flags);
890 run_queue(gl);
891 spin_unlock(&gl->gl_spin); 926 spin_unlock(&gl->gl_spin);
892 927
893 gfs2_glock_put(gl); 928 gfs2_glock_put(gl);
@@ -1067,24 +1102,31 @@ static void add_to_queue(struct gfs2_holder *gh)
1067 if (test_and_set_bit(HIF_WAIT, &gh->gh_iflags)) 1102 if (test_and_set_bit(HIF_WAIT, &gh->gh_iflags))
1068 BUG(); 1103 BUG();
1069 1104
1070 existing = find_holder_by_owner(&gl->gl_holders, gh->gh_owner_pid); 1105 if (!(gh->gh_flags & GL_FLOCK)) {
1071 if (existing) { 1106 existing = find_holder_by_owner(&gl->gl_holders,
1072 print_symbol(KERN_WARNING "original: %s\n", existing->gh_ip); 1107 gh->gh_owner_pid);
1073 printk(KERN_INFO "pid : %d\n", existing->gh_owner_pid); 1108 if (existing) {
1074 printk(KERN_INFO "lock type : %d lock state : %d\n", 1109 print_symbol(KERN_WARNING "original: %s\n",
1075 existing->gh_gl->gl_name.ln_type, existing->gh_gl->gl_state); 1110 existing->gh_ip);
1076 print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip); 1111 printk(KERN_INFO "pid : %d\n", existing->gh_owner_pid);
1077 printk(KERN_INFO "pid : %d\n", gh->gh_owner_pid); 1112 printk(KERN_INFO "lock type : %d lock state : %d\n",
1078 printk(KERN_INFO "lock type : %d lock state : %d\n", 1113 existing->gh_gl->gl_name.ln_type,
1079 gl->gl_name.ln_type, gl->gl_state); 1114 existing->gh_gl->gl_state);
1080 BUG(); 1115 print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip);
1081 } 1116 printk(KERN_INFO "pid : %d\n", gh->gh_owner_pid);
1082 1117 printk(KERN_INFO "lock type : %d lock state : %d\n",
1083 existing = find_holder_by_owner(&gl->gl_waiters3, gh->gh_owner_pid); 1118 gl->gl_name.ln_type, gl->gl_state);
1084 if (existing) { 1119 BUG();
1085 print_symbol(KERN_WARNING "original: %s\n", existing->gh_ip); 1120 }
1086 print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip); 1121
1087 BUG(); 1122 existing = find_holder_by_owner(&gl->gl_waiters3,
1123 gh->gh_owner_pid);
1124 if (existing) {
1125 print_symbol(KERN_WARNING "original: %s\n",
1126 existing->gh_ip);
1127 print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip);
1128 BUG();
1129 }
1088 } 1130 }
1089 1131
1090 if (gh->gh_flags & LM_FLAG_PRIORITY) 1132 if (gh->gh_flags & LM_FLAG_PRIORITY)
@@ -1195,9 +1237,10 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
1195{ 1237{
1196 struct gfs2_glock *gl = gh->gh_gl; 1238 struct gfs2_glock *gl = gh->gh_gl;
1197 const struct gfs2_glock_operations *glops = gl->gl_ops; 1239 const struct gfs2_glock_operations *glops = gl->gl_ops;
1240 unsigned delay = 0;
1198 1241
1199 if (gh->gh_flags & GL_NOCACHE) 1242 if (gh->gh_flags & GL_NOCACHE)
1200 handle_callback(gl, LM_ST_UNLOCKED, 0); 1243 handle_callback(gl, LM_ST_UNLOCKED, 0, 0);
1201 1244
1202 gfs2_glmutex_lock(gl); 1245 gfs2_glmutex_lock(gl);
1203 1246
@@ -1215,8 +1258,14 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
1215 } 1258 }
1216 1259
1217 clear_bit(GLF_LOCK, &gl->gl_flags); 1260 clear_bit(GLF_LOCK, &gl->gl_flags);
1218 run_queue(gl);
1219 spin_unlock(&gl->gl_spin); 1261 spin_unlock(&gl->gl_spin);
1262
1263 gfs2_glock_hold(gl);
1264 if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
1265 !test_bit(GLF_DEMOTE, &gl->gl_flags))
1266 delay = gl->gl_ops->go_min_hold_time;
1267 if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0)
1268 gfs2_glock_put(gl);
1220} 1269}
1221 1270
1222void gfs2_glock_dq_wait(struct gfs2_holder *gh) 1271void gfs2_glock_dq_wait(struct gfs2_holder *gh)
@@ -1443,18 +1492,21 @@ static void blocking_cb(struct gfs2_sbd *sdp, struct lm_lockname *name,
1443 unsigned int state) 1492 unsigned int state)
1444{ 1493{
1445 struct gfs2_glock *gl; 1494 struct gfs2_glock *gl;
1495 unsigned long delay = 0;
1496 unsigned long holdtime;
1497 unsigned long now = jiffies;
1446 1498
1447 gl = gfs2_glock_find(sdp, name); 1499 gl = gfs2_glock_find(sdp, name);
1448 if (!gl) 1500 if (!gl)
1449 return; 1501 return;
1450 1502
1451 handle_callback(gl, state, 1); 1503 holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time;
1452 1504 if (time_before(now, holdtime))
1453 spin_lock(&gl->gl_spin); 1505 delay = holdtime - now;
1454 run_queue(gl);
1455 spin_unlock(&gl->gl_spin);
1456 1506
1457 gfs2_glock_put(gl); 1507 handle_callback(gl, state, 1, delay);
1508 if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0)
1509 gfs2_glock_put(gl);
1458} 1510}
1459 1511
1460/** 1512/**
@@ -1495,7 +1547,8 @@ void gfs2_glock_cb(void *cb_data, unsigned int type, void *data)
1495 return; 1547 return;
1496 if (!gfs2_assert_warn(sdp, gl->gl_req_bh)) 1548 if (!gfs2_assert_warn(sdp, gl->gl_req_bh))
1497 gl->gl_req_bh(gl, async->lc_ret); 1549 gl->gl_req_bh(gl, async->lc_ret);
1498 gfs2_glock_put(gl); 1550 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
1551 gfs2_glock_put(gl);
1499 up_read(&gfs2_umount_flush_sem); 1552 up_read(&gfs2_umount_flush_sem);
1500 return; 1553 return;
1501 } 1554 }
@@ -1588,7 +1641,7 @@ void gfs2_reclaim_glock(struct gfs2_sbd *sdp)
1588 if (gfs2_glmutex_trylock(gl)) { 1641 if (gfs2_glmutex_trylock(gl)) {
1589 if (list_empty(&gl->gl_holders) && 1642 if (list_empty(&gl->gl_holders) &&
1590 gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl)) 1643 gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl))
1591 handle_callback(gl, LM_ST_UNLOCKED, 0); 1644 handle_callback(gl, LM_ST_UNLOCKED, 0, 0);
1592 gfs2_glmutex_unlock(gl); 1645 gfs2_glmutex_unlock(gl);
1593 } 1646 }
1594 1647
@@ -1617,7 +1670,7 @@ static int examine_bucket(glock_examiner examiner, struct gfs2_sbd *sdp,
1617 goto out; 1670 goto out;
1618 gl = list_entry(head->first, struct gfs2_glock, gl_list); 1671 gl = list_entry(head->first, struct gfs2_glock, gl_list);
1619 while(1) { 1672 while(1) {
1620 if (gl->gl_sbd == sdp) { 1673 if (!sdp || gl->gl_sbd == sdp) {
1621 gfs2_glock_hold(gl); 1674 gfs2_glock_hold(gl);
1622 read_unlock(gl_lock_addr(hash)); 1675 read_unlock(gl_lock_addr(hash));
1623 if (prev) 1676 if (prev)
@@ -1635,6 +1688,7 @@ out:
1635 read_unlock(gl_lock_addr(hash)); 1688 read_unlock(gl_lock_addr(hash));
1636 if (prev) 1689 if (prev)
1637 gfs2_glock_put(prev); 1690 gfs2_glock_put(prev);
1691 cond_resched();
1638 return has_entries; 1692 return has_entries;
1639} 1693}
1640 1694
@@ -1663,20 +1717,6 @@ out_schedule:
1663} 1717}
1664 1718
1665/** 1719/**
1666 * gfs2_scand_internal - Look for glocks and inodes to toss from memory
1667 * @sdp: the filesystem
1668 *
1669 */
1670
1671void gfs2_scand_internal(struct gfs2_sbd *sdp)
1672{
1673 unsigned int x;
1674
1675 for (x = 0; x < GFS2_GL_HASH_SIZE; x++)
1676 examine_bucket(scan_glock, sdp, x);
1677}
1678
1679/**
1680 * clear_glock - look at a glock and see if we can free it from glock cache 1720 * clear_glock - look at a glock and see if we can free it from glock cache
1681 * @gl: the glock to look at 1721 * @gl: the glock to look at
1682 * 1722 *
@@ -1701,7 +1741,7 @@ static void clear_glock(struct gfs2_glock *gl)
1701 if (gfs2_glmutex_trylock(gl)) { 1741 if (gfs2_glmutex_trylock(gl)) {
1702 if (list_empty(&gl->gl_holders) && 1742 if (list_empty(&gl->gl_holders) &&
1703 gl->gl_state != LM_ST_UNLOCKED) 1743 gl->gl_state != LM_ST_UNLOCKED)
1704 handle_callback(gl, LM_ST_UNLOCKED, 0); 1744 handle_callback(gl, LM_ST_UNLOCKED, 0, 0);
1705 gfs2_glmutex_unlock(gl); 1745 gfs2_glmutex_unlock(gl);
1706 } 1746 }
1707} 1747}
@@ -1843,7 +1883,7 @@ static int dump_glock(struct glock_iter *gi, struct gfs2_glock *gl)
1843 1883
1844 spin_lock(&gl->gl_spin); 1884 spin_lock(&gl->gl_spin);
1845 1885
1846 print_dbg(gi, "Glock 0x%p (%u, %llu)\n", gl, gl->gl_name.ln_type, 1886 print_dbg(gi, "Glock 0x%p (%u, 0x%llx)\n", gl, gl->gl_name.ln_type,
1847 (unsigned long long)gl->gl_name.ln_number); 1887 (unsigned long long)gl->gl_name.ln_number);
1848 print_dbg(gi, " gl_flags ="); 1888 print_dbg(gi, " gl_flags =");
1849 for (x = 0; x < 32; x++) { 1889 for (x = 0; x < 32; x++) {
@@ -1963,6 +2003,35 @@ static int gfs2_dump_lockstate(struct gfs2_sbd *sdp)
1963 return error; 2003 return error;
1964} 2004}
1965 2005
2006/**
2007 * gfs2_scand - Look for cached glocks and inodes to toss from memory
2008 * @sdp: Pointer to GFS2 superblock
2009 *
2010 * One of these daemons runs, finding candidates to add to sd_reclaim_list.
2011 * See gfs2_glockd()
2012 */
2013
2014static int gfs2_scand(void *data)
2015{
2016 unsigned x;
2017 unsigned delay;
2018
2019 while (!kthread_should_stop()) {
2020 for (x = 0; x < GFS2_GL_HASH_SIZE; x++)
2021 examine_bucket(scan_glock, NULL, x);
2022 if (freezing(current))
2023 refrigerator();
2024 delay = scand_secs;
2025 if (delay < 1)
2026 delay = 1;
2027 schedule_timeout_interruptible(delay * HZ);
2028 }
2029
2030 return 0;
2031}
2032
2033
2034
1966int __init gfs2_glock_init(void) 2035int __init gfs2_glock_init(void)
1967{ 2036{
1968 unsigned i; 2037 unsigned i;
@@ -1974,52 +2043,69 @@ int __init gfs2_glock_init(void)
1974 rwlock_init(&gl_hash_locks[i]); 2043 rwlock_init(&gl_hash_locks[i]);
1975 } 2044 }
1976#endif 2045#endif
2046
2047 scand_process = kthread_run(gfs2_scand, NULL, "gfs2_scand");
2048 if (IS_ERR(scand_process))
2049 return PTR_ERR(scand_process);
2050
2051 glock_workqueue = create_workqueue("glock_workqueue");
2052 if (IS_ERR(glock_workqueue)) {
2053 kthread_stop(scand_process);
2054 return PTR_ERR(glock_workqueue);
2055 }
2056
1977 return 0; 2057 return 0;
1978} 2058}
1979 2059
2060void gfs2_glock_exit(void)
2061{
2062 destroy_workqueue(glock_workqueue);
2063 kthread_stop(scand_process);
2064}
2065
2066module_param(scand_secs, uint, S_IRUGO|S_IWUSR);
2067MODULE_PARM_DESC(scand_secs, "The number of seconds between scand runs");
2068
1980static int gfs2_glock_iter_next(struct glock_iter *gi) 2069static int gfs2_glock_iter_next(struct glock_iter *gi)
1981{ 2070{
2071 struct gfs2_glock *gl;
2072
2073restart:
1982 read_lock(gl_lock_addr(gi->hash)); 2074 read_lock(gl_lock_addr(gi->hash));
1983 while (1) { 2075 gl = gi->gl;
1984 if (!gi->hb_list) { /* If we don't have a hash bucket yet */ 2076 if (gl) {
1985 gi->hb_list = &gl_hash_table[gi->hash].hb_list; 2077 gi->gl = hlist_entry(gl->gl_list.next,
1986 if (hlist_empty(gi->hb_list)) { 2078 struct gfs2_glock, gl_list);
1987 read_unlock(gl_lock_addr(gi->hash));
1988 gi->hash++;
1989 read_lock(gl_lock_addr(gi->hash));
1990 gi->hb_list = NULL;
1991 if (gi->hash >= GFS2_GL_HASH_SIZE) {
1992 read_unlock(gl_lock_addr(gi->hash));
1993 return 1;
1994 }
1995 else
1996 continue;
1997 }
1998 if (!hlist_empty(gi->hb_list)) {
1999 gi->gl = list_entry(gi->hb_list->first,
2000 struct gfs2_glock,
2001 gl_list);
2002 }
2003 } else {
2004 if (gi->gl->gl_list.next == NULL) {
2005 read_unlock(gl_lock_addr(gi->hash));
2006 gi->hash++;
2007 read_lock(gl_lock_addr(gi->hash));
2008 gi->hb_list = NULL;
2009 continue;
2010 }
2011 gi->gl = list_entry(gi->gl->gl_list.next,
2012 struct gfs2_glock, gl_list);
2013 }
2014 if (gi->gl) 2079 if (gi->gl)
2015 break; 2080 gfs2_glock_hold(gi->gl);
2016 } 2081 }
2017 read_unlock(gl_lock_addr(gi->hash)); 2082 read_unlock(gl_lock_addr(gi->hash));
2083 if (gl)
2084 gfs2_glock_put(gl);
2085 if (gl && gi->gl == NULL)
2086 gi->hash++;
2087 while(gi->gl == NULL) {
2088 if (gi->hash >= GFS2_GL_HASH_SIZE)
2089 return 1;
2090 read_lock(gl_lock_addr(gi->hash));
2091 gi->gl = hlist_entry(gl_hash_table[gi->hash].hb_list.first,
2092 struct gfs2_glock, gl_list);
2093 if (gi->gl)
2094 gfs2_glock_hold(gi->gl);
2095 read_unlock(gl_lock_addr(gi->hash));
2096 gi->hash++;
2097 }
2098
2099 if (gi->sdp != gi->gl->gl_sbd)
2100 goto restart;
2101
2018 return 0; 2102 return 0;
2019} 2103}
2020 2104
2021static void gfs2_glock_iter_free(struct glock_iter *gi) 2105static void gfs2_glock_iter_free(struct glock_iter *gi)
2022{ 2106{
2107 if (gi->gl)
2108 gfs2_glock_put(gi->gl);
2023 kfree(gi); 2109 kfree(gi);
2024} 2110}
2025 2111
@@ -2033,9 +2119,8 @@ static struct glock_iter *gfs2_glock_iter_init(struct gfs2_sbd *sdp)
2033 2119
2034 gi->sdp = sdp; 2120 gi->sdp = sdp;
2035 gi->hash = 0; 2121 gi->hash = 0;
2036 gi->gl = NULL;
2037 gi->hb_list = NULL;
2038 gi->seq = NULL; 2122 gi->seq = NULL;
2123 gi->gl = NULL;
2039 memset(gi->string, 0, sizeof(gi->string)); 2124 memset(gi->string, 0, sizeof(gi->string));
2040 2125
2041 if (gfs2_glock_iter_next(gi)) { 2126 if (gfs2_glock_iter_next(gi)) {
@@ -2055,7 +2140,7 @@ static void *gfs2_glock_seq_start(struct seq_file *file, loff_t *pos)
2055 if (!gi) 2140 if (!gi)
2056 return NULL; 2141 return NULL;
2057 2142
2058 while (n--) { 2143 while(n--) {
2059 if (gfs2_glock_iter_next(gi)) { 2144 if (gfs2_glock_iter_next(gi)) {
2060 gfs2_glock_iter_free(gi); 2145 gfs2_glock_iter_free(gi);
2061 return NULL; 2146 return NULL;
@@ -2082,7 +2167,9 @@ static void *gfs2_glock_seq_next(struct seq_file *file, void *iter_ptr,
2082 2167
2083static void gfs2_glock_seq_stop(struct seq_file *file, void *iter_ptr) 2168static void gfs2_glock_seq_stop(struct seq_file *file, void *iter_ptr)
2084{ 2169{
2085 /* nothing for now */ 2170 struct glock_iter *gi = iter_ptr;
2171 if (gi)
2172 gfs2_glock_iter_free(gi);
2086} 2173}
2087 2174
2088static int gfs2_glock_seq_show(struct seq_file *file, void *iter_ptr) 2175static int gfs2_glock_seq_show(struct seq_file *file, void *iter_ptr)
@@ -2095,7 +2182,7 @@ static int gfs2_glock_seq_show(struct seq_file *file, void *iter_ptr)
2095 return 0; 2182 return 0;
2096} 2183}
2097 2184
2098static struct seq_operations gfs2_glock_seq_ops = { 2185static const struct seq_operations gfs2_glock_seq_ops = {
2099 .start = gfs2_glock_seq_start, 2186 .start = gfs2_glock_seq_start,
2100 .next = gfs2_glock_seq_next, 2187 .next = gfs2_glock_seq_next,
2101 .stop = gfs2_glock_seq_stop, 2188 .stop = gfs2_glock_seq_stop,
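
Editorial sketch (illustration only, not part of the patch): with glock work moved onto a delayed workqueue, blocking_cb() and gfs2_glock_dq() above defer a demote until the glock has been held for go_min_hold_time ticks, using gl_tchange as the last state-change time. A small self-contained sketch of the wraparound-safe "how long until the hold time expires" computation, standing in for jiffies and time_before():

#include <stdio.h>

/* True if a is before b on a wrapping tick counter (like time_before()). */
static int tick_before(unsigned long a, unsigned long b)
{
	return (long)(a - b) < 0;
}

/* Delay, in ticks, before a queued demote should actually run. */
unsigned long demote_delay(unsigned long now, unsigned long tchange,
			   unsigned long min_hold)
{
	unsigned long holdtime = tchange + min_hold;

	return tick_before(now, holdtime) ? holdtime - now : 0;
}

int main(void)
{
	/* held for 3 ticks with a 25-tick minimum: demote is deferred 22 ticks */
	printf("delay = %lu\n", demote_delay(103, 100, 25));
	return 0;
}

The kernel then passes that delay to queue_delayed_work() on the glock workqueue, dropping the extra glock reference if the work was already queued.
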
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 7721ca3fff9e..b16f604eea9f 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -26,6 +26,7 @@
26#define GL_SKIP 0x00000100 26#define GL_SKIP 0x00000100
27#define GL_ATIME 0x00000200 27#define GL_ATIME 0x00000200
28#define GL_NOCACHE 0x00000400 28#define GL_NOCACHE 0x00000400
29#define GL_FLOCK 0x00000800
29#define GL_NOCANCEL 0x00001000 30#define GL_NOCANCEL 0x00001000
30 31
31#define GLR_TRYFAILED 13 32#define GLR_TRYFAILED 13
@@ -132,11 +133,11 @@ void gfs2_glock_cb(void *cb_data, unsigned int type, void *data);
132 133
133void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl); 134void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl);
134void gfs2_reclaim_glock(struct gfs2_sbd *sdp); 135void gfs2_reclaim_glock(struct gfs2_sbd *sdp);
135
136void gfs2_scand_internal(struct gfs2_sbd *sdp);
137void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait); 136void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait);
138 137
139int __init gfs2_glock_init(void); 138int __init gfs2_glock_init(void);
139void gfs2_glock_exit(void);
140
140int gfs2_create_debugfs_file(struct gfs2_sbd *sdp); 141int gfs2_create_debugfs_file(struct gfs2_sbd *sdp);
141void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp); 142void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp);
142int gfs2_register_debugfs(void); 143int gfs2_register_debugfs(void);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 777ca46010e8..4670dcb2a877 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -41,7 +41,6 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
 	struct list_head *head = &gl->gl_ail_list;
 	struct gfs2_bufdata *bd;
 	struct buffer_head *bh;
-	u64 blkno;
 	int error;
 
 	blocks = atomic_read(&gl->gl_ail_count);
@@ -57,19 +56,12 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
 		bd = list_entry(head->next, struct gfs2_bufdata,
 				bd_ail_gl_list);
 		bh = bd->bd_bh;
-		blkno = bh->b_blocknr;
+		gfs2_remove_from_ail(NULL, bd);
+		bd->bd_bh = NULL;
+		bh->b_private = NULL;
+		bd->bd_blkno = bh->b_blocknr;
 		gfs2_assert_withdraw(sdp, !buffer_busy(bh));
-
-		bd->bd_ail = NULL;
-		list_del(&bd->bd_ail_st_list);
-		list_del(&bd->bd_ail_gl_list);
-		atomic_dec(&gl->gl_ail_count);
-		brelse(bh);
-		gfs2_log_unlock(sdp);
-
-		gfs2_trans_add_revoke(sdp, blkno);
-
-		gfs2_log_lock(sdp);
+		gfs2_trans_add_revoke(sdp, bd);
 	}
 	gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count));
 	gfs2_log_unlock(sdp);
@@ -156,9 +148,11 @@ static void inode_go_sync(struct gfs2_glock *gl)
 		ip = NULL;
 
 	if (test_bit(GLF_DIRTY, &gl->gl_flags)) {
-		if (ip)
+		if (ip && !gfs2_is_jdata(ip))
 			filemap_fdatawrite(ip->i_inode.i_mapping);
 		gfs2_log_flush(gl->gl_sbd, gl);
+		if (ip && gfs2_is_jdata(ip))
+			filemap_fdatawrite(ip->i_inode.i_mapping);
 		gfs2_meta_sync(gl);
 		if (ip) {
 			struct address_space *mapping = ip->i_inode.i_mapping;
@@ -452,6 +446,7 @@ const struct gfs2_glock_operations gfs2_inode_glops = {
 	.go_lock = inode_go_lock,
 	.go_unlock = inode_go_unlock,
 	.go_type = LM_TYPE_INODE,
+	.go_min_hold_time = HZ / 10,
 };
 
 const struct gfs2_glock_operations gfs2_rgrp_glops = {
@@ -462,6 +457,7 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = {
 	.go_lock = rgrp_go_lock,
 	.go_unlock = rgrp_go_unlock,
 	.go_type = LM_TYPE_RGRP,
+	.go_min_hold_time = HZ / 10,
 };
 
 const struct gfs2_glock_operations gfs2_trans_glops = {
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 170ba93829c0..eaddfb5a8e6f 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -11,6 +11,7 @@
 #define __INCORE_DOT_H__
 
 #include <linux/fs.h>
+#include <linux/workqueue.h>
 
 #define DIO_WAIT		0x00000010
 #define DIO_METADATA		0x00000020
@@ -113,7 +114,13 @@ struct gfs2_bufdata {
 	struct buffer_head *bd_bh;
 	struct gfs2_glock *bd_gl;
 
-	struct list_head bd_list_tr;
+	union {
+		struct list_head list_tr;
+		u64 blkno;
+	} u;
+#define bd_list_tr u.list_tr
+#define bd_blkno u.blkno
+
 	struct gfs2_log_element bd_le;
 
 	struct gfs2_ail *bd_ail;
@@ -130,6 +137,7 @@ struct gfs2_glock_operations {
 	int (*go_lock) (struct gfs2_holder *gh);
 	void (*go_unlock) (struct gfs2_holder *gh);
 	const int go_type;
+	const unsigned long go_min_hold_time;
 };
 
 enum {
@@ -161,6 +169,7 @@ enum {
 	GLF_LOCK		= 1,
 	GLF_STICKY		= 2,
 	GLF_DEMOTE		= 3,
+	GLF_PENDING_DEMOTE	= 4,
 	GLF_DIRTY		= 5,
 };
 
@@ -193,6 +202,7 @@ struct gfs2_glock {
 
 	u64 gl_vn;
 	unsigned long gl_stamp;
+	unsigned long gl_tchange;
 	void *gl_object;
 
 	struct list_head gl_reclaim;
@@ -203,6 +213,7 @@ struct gfs2_glock {
 	struct gfs2_log_element gl_le;
 	struct list_head gl_ail_list;
 	atomic_t gl_ail_count;
+	struct delayed_work gl_work;
 };
 
 struct gfs2_alloc {
@@ -293,11 +304,6 @@ struct gfs2_file {
 	struct gfs2_holder f_fl_gh;
 };
 
-struct gfs2_revoke {
-	struct gfs2_log_element rv_le;
-	u64 rv_blkno;
-};
-
 struct gfs2_revoke_replay {
 	struct list_head rr_list;
 	u64 rr_blkno;
@@ -335,12 +341,6 @@ struct gfs2_quota_data {
 	unsigned long qd_last_touched;
 };
 
-struct gfs2_log_buf {
-	struct list_head lb_list;
-	struct buffer_head *lb_bh;
-	struct buffer_head *lb_real;
-};
-
 struct gfs2_trans {
 	unsigned long tr_ip;
 
@@ -429,7 +429,6 @@ struct gfs2_tune {
 	unsigned int gt_log_flush_secs;
 	unsigned int gt_jindex_refresh_secs; /* Check for new journal index */
 
-	unsigned int gt_scand_secs;
 	unsigned int gt_recoverd_secs;
 	unsigned int gt_logd_secs;
 	unsigned int gt_quotad_secs;
@@ -574,7 +573,6 @@ struct gfs2_sbd {
 
 	/* Daemon stuff */
 
-	struct task_struct *sd_scand_process;
 	struct task_struct *sd_recoverd_process;
 	struct task_struct *sd_logd_process;
 	struct task_struct *sd_quotad_process;
@@ -609,13 +607,13 @@ struct gfs2_sbd {
 	unsigned int sd_log_num_revoke;
 	unsigned int sd_log_num_rg;
 	unsigned int sd_log_num_databuf;
-	unsigned int sd_log_num_jdata;
 
 	struct list_head sd_log_le_gl;
 	struct list_head sd_log_le_buf;
 	struct list_head sd_log_le_revoke;
 	struct list_head sd_log_le_rg;
 	struct list_head sd_log_le_databuf;
+	struct list_head sd_log_le_ordered;
 
 	unsigned int sd_log_blks_free;
 	struct mutex sd_log_reserve_mutex;
@@ -627,7 +625,8 @@ struct gfs2_sbd {
 
 	unsigned long sd_log_flush_time;
 	struct rw_semaphore sd_log_flush_lock;
-	struct list_head sd_log_flush_list;
+	atomic_t sd_log_in_flight;
+	wait_queue_head_t sd_log_flush_wait;
 
 	unsigned int sd_log_flush_head;
 	u64 sd_log_flush_wrapped;
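
The gfs2_bufdata change above replaces bd_list_tr with a union: list_tr is only needed while the buffer sits on a transaction's list, and blkno is only needed later, once the buffer has been converted into a revoke entry, so the two members never overlap in time and can share storage. A minimal userspace sketch of the same idea (the struct and field names below are illustrative, not the kernel's):

#include <stdint.h>
#include <stdio.h>

/* Two mutually exclusive lifecycle stages share one slot of storage. */
struct bufdata {
	int in_transaction;	/* discriminator: which union member is live */
	union {
		struct { void *prev, *next; } list_tr;	/* while queued in a transaction */
		uint64_t blkno;				/* once converted to a revoke */
	} u;
};

int main(void)
{
	struct bufdata bd = { .in_transaction = 1 };

	bd.u.list_tr.prev = bd.u.list_tr.next = &bd;	/* stage 1: on the list */

	bd.in_transaction = 0;				/* stage 2: becomes a revoke */
	bd.u.blkno = 12345;				/* reuses the same storage */
	printf("revoke for block %llu\n", (unsigned long long)bd.u.blkno);
	return 0;
}
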
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 34f7bcdea1e9..5f6dc32946cd 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -77,6 +77,49 @@ static struct inode *gfs2_iget(struct super_block *sb, u64 no_addr)
 	return iget5_locked(sb, hash, iget_test, iget_set, &no_addr);
 }
 
+struct gfs2_skip_data {
+	u64 no_addr;
+	int skipped;
+};
+
+static int iget_skip_test(struct inode *inode, void *opaque)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_skip_data *data = opaque;
+
+	if (ip->i_no_addr == data->no_addr && inode->i_private != NULL){
+		if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)){
+			data->skipped = 1;
+			return 0;
+		}
+		return 1;
+	}
+	return 0;
+}
+
+static int iget_skip_set(struct inode *inode, void *opaque)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_skip_data *data = opaque;
+
+	if (data->skipped)
+		return 1;
+	inode->i_ino = (unsigned long)(data->no_addr);
+	ip->i_no_addr = data->no_addr;
+	return 0;
+}
+
+static struct inode *gfs2_iget_skip(struct super_block *sb,
+				    u64 no_addr)
+{
+	struct gfs2_skip_data data;
+	unsigned long hash = (unsigned long)no_addr;
+
+	data.no_addr = no_addr;
+	data.skipped = 0;
+	return iget5_locked(sb, hash, iget_skip_test, iget_skip_set, &data);
+}
+
 /**
  * GFS2 lookup code fills in vfs inode contents based on info obtained
  * from directory entry inside gfs2_inode_lookup(). This has caused issues
@@ -112,6 +155,7 @@ void gfs2_set_iop(struct inode *inode)
  * @sb: The super block
  * @no_addr: The inode number
  * @type: The type of the inode
+ * @skip_freeing: set this not return an inode if it is currently being freed.
  *
  * Returns: A VFS inode, or an error
  */
@@ -119,13 +163,19 @@ void gfs2_set_iop(struct inode *inode)
 struct inode *gfs2_inode_lookup(struct super_block *sb,
 				unsigned int type,
 				u64 no_addr,
-				u64 no_formal_ino)
+				u64 no_formal_ino, int skip_freeing)
 {
-	struct inode *inode = gfs2_iget(sb, no_addr);
-	struct gfs2_inode *ip = GFS2_I(inode);
+	struct inode *inode;
+	struct gfs2_inode *ip;
 	struct gfs2_glock *io_gl;
 	int error;
 
+	if (skip_freeing)
+		inode = gfs2_iget_skip(sb, no_addr);
+	else
+		inode = gfs2_iget(sb, no_addr);
+	ip = GFS2_I(inode);
+
 	if (!inode)
 		return ERR_PTR(-ENOBUFS);
 
@@ -244,6 +294,11 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
 	return 0;
 }
 
+static void gfs2_inode_bh(struct gfs2_inode *ip, struct buffer_head *bh)
+{
+	ip->i_cache[0] = bh;
+}
+
 /**
  * gfs2_inode_refresh - Refresh the incore copy of the dinode
  * @ip: The GFS2 inode
@@ -688,7 +743,7 @@ out:
 static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
 			const struct gfs2_inum_host *inum, unsigned int mode,
 			unsigned int uid, unsigned int gid,
-			const u64 *generation, dev_t dev)
+			const u64 *generation, dev_t dev, struct buffer_head **bhp)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
 	struct gfs2_dinode *di;
@@ -743,13 +798,15 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
 	di->di_mtime_nsec = cpu_to_be32(tv.tv_nsec);
 	di->di_ctime_nsec = cpu_to_be32(tv.tv_nsec);
 	memset(&di->di_reserved, 0, sizeof(di->di_reserved));
+
+	set_buffer_uptodate(dibh);
 
-	brelse(dibh);
+	*bhp = dibh;
 }
 
 static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
 			unsigned int mode, const struct gfs2_inum_host *inum,
-			const u64 *generation, dev_t dev)
+			const u64 *generation, dev_t dev, struct buffer_head **bhp)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
 	unsigned int uid, gid;
@@ -770,7 +827,7 @@ static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
 	if (error)
 		goto out_quota;
 
-	init_dinode(dip, gl, inum, mode, uid, gid, generation, dev);
+	init_dinode(dip, gl, inum, mode, uid, gid, generation, dev, bhp);
 	gfs2_quota_change(dip, +1, uid, gid);
 	gfs2_trans_end(sdp);
 
@@ -909,6 +966,7 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
 	struct gfs2_inum_host inum = { .no_addr = 0, .no_formal_ino = 0 };
 	int error;
 	u64 generation;
+	struct buffer_head *bh=NULL;
 
 	if (!name->len || name->len > GFS2_FNAMESIZE)
 		return ERR_PTR(-ENAMETOOLONG);
@@ -935,16 +993,18 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
 	if (error)
 		goto fail_gunlock;
 
-	error = make_dinode(dip, ghs[1].gh_gl, mode, &inum, &generation, dev);
+	error = make_dinode(dip, ghs[1].gh_gl, mode, &inum, &generation, dev, &bh);
 	if (error)
 		goto fail_gunlock2;
 
 	inode = gfs2_inode_lookup(dir->i_sb, IF2DT(mode),
 				  inum.no_addr,
-				  inum.no_formal_ino);
+				  inum.no_formal_ino, 0);
 	if (IS_ERR(inode))
 		goto fail_gunlock2;
 
+	gfs2_inode_bh(GFS2_I(inode), bh);
+
 	error = gfs2_inode_refresh(GFS2_I(inode));
 	if (error)
 		goto fail_gunlock2;
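
gfs2_iget_skip() above leans on iget5_locked()'s two callbacks: the test callback notices that a matching inode is mid-teardown (I_FREEING and friends) and records data->skipped instead of reusing it, and the set callback then declines to initialise a new inode for that number. A rough userspace analogue of that test/set contract (the struct, flag and helper names are made up for illustration; only the callback shape mirrors the patch):

#include <stdio.h>

#define FREEING 0x1

struct obj { unsigned long no_addr; int state; };
struct skip_data { unsigned long no_addr; int skipped; };

/* "test": does this existing object match, and is it safe to reuse? */
static int test_skip(struct obj *o, void *opaque)
{
	struct skip_data *d = opaque;

	if (o->no_addr != d->no_addr)
		return 0;
	if (o->state & FREEING) {	/* match, but being freed: remember and refuse */
		d->skipped = 1;
		return 0;
	}
	return 1;
}

/* "set": initialise a fresh object, unless the lookup was flagged to skip. */
static int set_skip(struct obj *o, void *opaque)
{
	struct skip_data *d = opaque;

	if (d->skipped)
		return 1;		/* abort creation */
	o->no_addr = d->no_addr;
	return 0;
}

int main(void)
{
	struct obj stale = { .no_addr = 42, .state = FREEING };
	struct obj fresh = { 0 };
	struct skip_data d = { .no_addr = 42, .skipped = 0 };

	if (!test_skip(&stale, &d) && !set_skip(&fresh, &d))
		printf("created new object for %lu\n", fresh.no_addr);
	else
		printf("lookup skipped (object being freed)\n");
	return 0;
}
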
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 4517ac82c01c..351ac87ab384 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -49,7 +49,8 @@ static inline void gfs2_inum_out(const struct gfs2_inode *ip,
 void gfs2_inode_attr_in(struct gfs2_inode *ip);
 void gfs2_set_iop(struct inode *inode);
 struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type,
-				u64 no_addr, u64 no_formal_ino);
+				u64 no_addr, u64 no_formal_ino,
+				int skip_freeing);
 struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr);
 
 int gfs2_inode_refresh(struct gfs2_inode *ip);
diff --git a/fs/gfs2/locking/dlm/lock_dlm.h b/fs/gfs2/locking/dlm/lock_dlm.h
index 24d70f73b651..9e8265d28377 100644
--- a/fs/gfs2/locking/dlm/lock_dlm.h
+++ b/fs/gfs2/locking/dlm/lock_dlm.h
@@ -13,7 +13,6 @@
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
-#include <linux/module.h>
 #include <linux/types.h>
 #include <linux/string.h>
 #include <linux/list.h>
diff --git a/fs/gfs2/locking/dlm/plock.c b/fs/gfs2/locking/dlm/plock.c
index fba1f1d87e4f..1f7b038530b4 100644
--- a/fs/gfs2/locking/dlm/plock.c
+++ b/fs/gfs2/locking/dlm/plock.c
@@ -346,15 +346,16 @@ static ssize_t dev_write(struct file *file, const char __user *u, size_t count,
 
 static unsigned int dev_poll(struct file *file, poll_table *wait)
 {
+	unsigned int mask = 0;
+
 	poll_wait(file, &send_wq, wait);
 
 	spin_lock(&ops_lock);
-	if (!list_empty(&send_list)) {
-		spin_unlock(&ops_lock);
-		return POLLIN | POLLRDNORM;
-	}
+	if (!list_empty(&send_list))
+		mask = POLLIN | POLLRDNORM;
 	spin_unlock(&ops_lock);
-	return 0;
+
+	return mask;
 }
 
 static const struct file_operations dev_fops = {
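
The dev_poll() rewrite above is a common cleanup: build the answer in a local mask while the lock is held and fall through to a single unlock and a single return, rather than returning from inside the critical section. The same shape in a standalone pthread example (the mutex, queue_len and the locally defined poll constants are stand-ins, not the kernel's ops_lock or send_list):

#include <pthread.h>
#include <stdio.h>

#define POLLIN     0x001
#define POLLRDNORM 0x040

static pthread_mutex_t ops_lock = PTHREAD_MUTEX_INITIALIZER;
static int queue_len;			/* pretend this is the send_list length */

static unsigned int dev_poll_mask(void)
{
	unsigned int mask = 0;

	pthread_mutex_lock(&ops_lock);
	if (queue_len > 0)		/* decide while holding the lock ... */
		mask = POLLIN | POLLRDNORM;
	pthread_mutex_unlock(&ops_lock);

	return mask;			/* ... but leave through one exit path */
}

int main(void)
{
	queue_len = 1;
	printf("poll mask: %#x\n", dev_poll_mask());
	return 0;
}
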
diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c
index d9fe3ca40e18..ae9e6a25fe2b 100644
--- a/fs/gfs2/locking/dlm/sysfs.c
+++ b/fs/gfs2/locking/dlm/sysfs.c
@@ -190,7 +190,6 @@ static struct kobj_type gdlm_ktype = {
 };
 
 static struct kset gdlm_kset = {
-	.kobj  = {.name = "lock_dlm",},
 	.ktype = &gdlm_ktype,
 };
 
@@ -224,6 +223,7 @@ int gdlm_sysfs_init(void)
 {
 	int error;
 
+	kobject_set_name(&gdlm_kset.kobj, "lock_dlm");
 	kobj_set_kset_s(&gdlm_kset, kernel_subsys);
 	error = kset_register(&gdlm_kset);
 	if (error)
diff --git a/fs/gfs2/locking/dlm/thread.c b/fs/gfs2/locking/dlm/thread.c
index 1aca51e45092..bd938f06481d 100644
--- a/fs/gfs2/locking/dlm/thread.c
+++ b/fs/gfs2/locking/dlm/thread.c
@@ -268,20 +268,16 @@ static inline int check_drop(struct gdlm_ls *ls)
 	return 0;
 }
 
-static int gdlm_thread(void *data)
+static int gdlm_thread(void *data, int blist)
 {
 	struct gdlm_ls *ls = (struct gdlm_ls *) data;
 	struct gdlm_lock *lp = NULL;
-	int blist = 0;
 	uint8_t complete, blocking, submit, drop;
 	DECLARE_WAITQUEUE(wait, current);
 
 	/* Only thread1 is allowed to do blocking callbacks since gfs
 	   may wait for a completion callback within a blocking cb. */
 
-	if (current == ls->thread1)
-		blist = 1;
-
 	while (!kthread_should_stop()) {
 		set_current_state(TASK_INTERRUPTIBLE);
 		add_wait_queue(&ls->thread_wait, &wait);
@@ -333,12 +329,22 @@ static int gdlm_thread(void *data)
 	return 0;
 }
 
+static int gdlm_thread1(void *data)
+{
+	return gdlm_thread(data, 1);
+}
+
+static int gdlm_thread2(void *data)
+{
+	return gdlm_thread(data, 0);
+}
+
 int gdlm_init_threads(struct gdlm_ls *ls)
 {
 	struct task_struct *p;
 	int error;
 
-	p = kthread_run(gdlm_thread, ls, "lock_dlm1");
+	p = kthread_run(gdlm_thread1, ls, "lock_dlm1");
 	error = IS_ERR(p);
 	if (error) {
 		log_error("can't start lock_dlm1 thread %d", error);
@@ -346,7 +352,7 @@ int gdlm_init_threads(struct gdlm_ls *ls)
 	}
 	ls->thread1 = p;
 
-	p = kthread_run(gdlm_thread, ls, "lock_dlm2");
+	p = kthread_run(gdlm_thread2, ls, "lock_dlm2");
 	error = IS_ERR(p);
 	if (error) {
 		log_error("can't start lock_dlm2 thread %d", error);
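
The lock_dlm change above drops the runtime "current == ls->thread1" check and instead passes the blocking-callback role in from two thin wrapper entry points, so each thread's behaviour is fixed at creation time. A userspace analogue with pthreads (worker(), the role flag and the names are illustrative, not the kernel code):

#include <pthread.h>
#include <stdio.h>

static void *worker(void *data, int handles_blocking)
{
	(void)data;	/* would be the shared lockspace state */
	printf("worker started, handles_blocking=%d\n", handles_blocking);
	return NULL;
}

/* Two entry points fix the role at creation time; no runtime identity check. */
static void *worker1(void *data) { return worker(data, 1); }
static void *worker2(void *data) { return worker(data, 0); }

int main(void)
{
	pthread_t t1, t2;

	pthread_create(&t1, NULL, worker1, NULL);
	pthread_create(&t2, NULL, worker2, NULL);
	pthread_join(t1, NULL);
	pthread_join(t2, NULL);
	return 0;
}
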
diff --git a/fs/gfs2/locking/nolock/main.c b/fs/gfs2/locking/nolock/main.c
index 0d149c8c493a..d3b8ce6fbbe3 100644
--- a/fs/gfs2/locking/nolock/main.c
+++ b/fs/gfs2/locking/nolock/main.c
@@ -9,7 +9,6 @@
 
 #include <linux/module.h>
 #include <linux/slab.h>
-#include <linux/module.h>
 #include <linux/init.h>
 #include <linux/types.h>
 #include <linux/fs.h>
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index f49a12e24086..7df702473252 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -60,6 +60,26 @@ unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
 }
 
 /**
+ * gfs2_remove_from_ail - Remove an entry from the ail lists, updating counters
+ * @mapping: The associated mapping (maybe NULL)
+ * @bd: The gfs2_bufdata to remove
+ *
+ * The log lock _must_ be held when calling this function
+ *
+ */
+
+void gfs2_remove_from_ail(struct address_space *mapping, struct gfs2_bufdata *bd)
+{
+	bd->bd_ail = NULL;
+	list_del_init(&bd->bd_ail_st_list);
+	list_del_init(&bd->bd_ail_gl_list);
+	atomic_dec(&bd->bd_gl->gl_ail_count);
+	if (mapping)
+		gfs2_meta_cache_flush(GFS2_I(mapping->host));
+	brelse(bd->bd_bh);
+}
+
+/**
  * gfs2_ail1_start_one - Start I/O on a part of the AIL
  * @sdp: the filesystem
  * @tr: the part of the AIL
@@ -83,17 +103,9 @@ static void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
83 103
84 gfs2_assert(sdp, bd->bd_ail == ai); 104 gfs2_assert(sdp, bd->bd_ail == ai);
85 105
86 if (!bh){
87 list_move(&bd->bd_ail_st_list, &ai->ai_ail2_list);
88 continue;
89 }
90
91 if (!buffer_busy(bh)) { 106 if (!buffer_busy(bh)) {
92 if (!buffer_uptodate(bh)) { 107 if (!buffer_uptodate(bh))
93 gfs2_log_unlock(sdp);
94 gfs2_io_error_bh(sdp, bh); 108 gfs2_io_error_bh(sdp, bh);
95 gfs2_log_lock(sdp);
96 }
97 list_move(&bd->bd_ail_st_list, &ai->ai_ail2_list); 109 list_move(&bd->bd_ail_st_list, &ai->ai_ail2_list);
98 continue; 110 continue;
99 } 111 }
@@ -103,9 +115,16 @@ static void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
103 115
104 list_move(&bd->bd_ail_st_list, &ai->ai_ail1_list); 116 list_move(&bd->bd_ail_st_list, &ai->ai_ail1_list);
105 117
118 get_bh(bh);
106 gfs2_log_unlock(sdp); 119 gfs2_log_unlock(sdp);
107 wait_on_buffer(bh); 120 lock_buffer(bh);
108 ll_rw_block(WRITE, 1, &bh); 121 if (test_clear_buffer_dirty(bh)) {
122 bh->b_end_io = end_buffer_write_sync;
123 submit_bh(WRITE, bh);
124 } else {
125 unlock_buffer(bh);
126 brelse(bh);
127 }
109 gfs2_log_lock(sdp); 128 gfs2_log_lock(sdp);
110 129
111 retry = 1; 130 retry = 1;
@@ -130,11 +149,6 @@ static int gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai, int fl
130 bd_ail_st_list) { 149 bd_ail_st_list) {
131 bh = bd->bd_bh; 150 bh = bd->bd_bh;
132 151
133 if (!bh){
134 list_move(&bd->bd_ail_st_list, &ai->ai_ail2_list);
135 continue;
136 }
137
138 gfs2_assert(sdp, bd->bd_ail == ai); 152 gfs2_assert(sdp, bd->bd_ail == ai);
139 153
140 if (buffer_busy(bh)) { 154 if (buffer_busy(bh)) {
@@ -155,13 +169,14 @@ static int gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai, int fl
155 169
156static void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags) 170static void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags)
157{ 171{
158 struct list_head *head = &sdp->sd_ail1_list; 172 struct list_head *head;
159 u64 sync_gen; 173 u64 sync_gen;
160 struct list_head *first; 174 struct list_head *first;
161 struct gfs2_ail *first_ai, *ai, *tmp; 175 struct gfs2_ail *first_ai, *ai, *tmp;
162 int done = 0; 176 int done = 0;
163 177
164 gfs2_log_lock(sdp); 178 gfs2_log_lock(sdp);
179 head = &sdp->sd_ail1_list;
165 if (list_empty(head)) { 180 if (list_empty(head)) {
166 gfs2_log_unlock(sdp); 181 gfs2_log_unlock(sdp);
167 return; 182 return;
@@ -233,11 +248,7 @@ static void gfs2_ail2_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
233 bd = list_entry(head->prev, struct gfs2_bufdata, 248 bd = list_entry(head->prev, struct gfs2_bufdata,
234 bd_ail_st_list); 249 bd_ail_st_list);
235 gfs2_assert(sdp, bd->bd_ail == ai); 250 gfs2_assert(sdp, bd->bd_ail == ai);
236 bd->bd_ail = NULL; 251 gfs2_remove_from_ail(bd->bd_bh->b_page->mapping, bd);
237 list_del(&bd->bd_ail_st_list);
238 list_del(&bd->bd_ail_gl_list);
239 atomic_dec(&bd->bd_gl->gl_ail_count);
240 brelse(bd->bd_bh);
241 } 252 }
242} 253}
243 254
@@ -439,10 +450,10 @@ static unsigned int current_tail(struct gfs2_sbd *sdp)
439 return tail; 450 return tail;
440} 451}
441 452
442static inline void log_incr_head(struct gfs2_sbd *sdp) 453void gfs2_log_incr_head(struct gfs2_sbd *sdp)
443{ 454{
444 if (sdp->sd_log_flush_head == sdp->sd_log_tail) 455 if (sdp->sd_log_flush_head == sdp->sd_log_tail)
445 gfs2_assert_withdraw(sdp, sdp->sd_log_flush_head == sdp->sd_log_head); 456 BUG_ON(sdp->sd_log_flush_head != sdp->sd_log_head);
446 457
447 if (++sdp->sd_log_flush_head == sdp->sd_jdesc->jd_blocks) { 458 if (++sdp->sd_log_flush_head == sdp->sd_jdesc->jd_blocks) {
448 sdp->sd_log_flush_head = 0; 459 sdp->sd_log_flush_head = 0;
@@ -451,6 +462,23 @@ static inline void log_incr_head(struct gfs2_sbd *sdp)
451} 462}
452 463
453/** 464/**
465 * gfs2_log_write_endio - End of I/O for a log buffer
466 * @bh: The buffer head
467 * @uptodate: I/O Status
468 *
469 */
470
471static void gfs2_log_write_endio(struct buffer_head *bh, int uptodate)
472{
473 struct gfs2_sbd *sdp = bh->b_private;
474 bh->b_private = NULL;
475
476 end_buffer_write_sync(bh, uptodate);
477 if (atomic_dec_and_test(&sdp->sd_log_in_flight))
478 wake_up(&sdp->sd_log_flush_wait);
479}
480
481/**
454 * gfs2_log_get_buf - Get and initialize a buffer to use for log control data 482 * gfs2_log_get_buf - Get and initialize a buffer to use for log control data
455 * @sdp: The GFS2 superblock 483 * @sdp: The GFS2 superblock
456 * 484 *
@@ -460,25 +488,43 @@ static inline void log_incr_head(struct gfs2_sbd *sdp)
460struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp) 488struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp)
461{ 489{
462 u64 blkno = log_bmap(sdp, sdp->sd_log_flush_head); 490 u64 blkno = log_bmap(sdp, sdp->sd_log_flush_head);
463 struct gfs2_log_buf *lb;
464 struct buffer_head *bh; 491 struct buffer_head *bh;
465 492
466 lb = kzalloc(sizeof(struct gfs2_log_buf), GFP_NOFS | __GFP_NOFAIL); 493 bh = sb_getblk(sdp->sd_vfs, blkno);
467 list_add(&lb->lb_list, &sdp->sd_log_flush_list);
468
469 bh = lb->lb_bh = sb_getblk(sdp->sd_vfs, blkno);
470 lock_buffer(bh); 494 lock_buffer(bh);
471 memset(bh->b_data, 0, bh->b_size); 495 memset(bh->b_data, 0, bh->b_size);
472 set_buffer_uptodate(bh); 496 set_buffer_uptodate(bh);
473 clear_buffer_dirty(bh); 497 clear_buffer_dirty(bh);
474 unlock_buffer(bh); 498 gfs2_log_incr_head(sdp);
475 499 atomic_inc(&sdp->sd_log_in_flight);
476 log_incr_head(sdp); 500 bh->b_private = sdp;
501 bh->b_end_io = gfs2_log_write_endio;
477 502
478 return bh; 503 return bh;
479} 504}
480 505
481/** 506/**
507 * gfs2_fake_write_endio -
508 * @bh: The buffer head
509 * @uptodate: The I/O Status
510 *
511 */
512
513static void gfs2_fake_write_endio(struct buffer_head *bh, int uptodate)
514{
515 struct buffer_head *real_bh = bh->b_private;
516 struct gfs2_bufdata *bd = real_bh->b_private;
517 struct gfs2_sbd *sdp = bd->bd_gl->gl_sbd;
518
519 end_buffer_write_sync(bh, uptodate);
520 free_buffer_head(bh);
521 unlock_buffer(real_bh);
522 brelse(real_bh);
523 if (atomic_dec_and_test(&sdp->sd_log_in_flight))
524 wake_up(&sdp->sd_log_flush_wait);
525}
526
527/**
482 * gfs2_log_fake_buf - Build a fake buffer head to write metadata buffer to log 528 * gfs2_log_fake_buf - Build a fake buffer head to write metadata buffer to log
483 * @sdp: the filesystem 529 * @sdp: the filesystem
484 * @data: the data the buffer_head should point to 530 * @data: the data the buffer_head should point to
@@ -490,22 +536,20 @@ struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp,
490 struct buffer_head *real) 536 struct buffer_head *real)
491{ 537{
492 u64 blkno = log_bmap(sdp, sdp->sd_log_flush_head); 538 u64 blkno = log_bmap(sdp, sdp->sd_log_flush_head);
493 struct gfs2_log_buf *lb;
494 struct buffer_head *bh; 539 struct buffer_head *bh;
495 540
496 lb = kzalloc(sizeof(struct gfs2_log_buf), GFP_NOFS | __GFP_NOFAIL); 541 bh = alloc_buffer_head(GFP_NOFS | __GFP_NOFAIL);
497 list_add(&lb->lb_list, &sdp->sd_log_flush_list);
498 lb->lb_real = real;
499
500 bh = lb->lb_bh = alloc_buffer_head(GFP_NOFS | __GFP_NOFAIL);
501 atomic_set(&bh->b_count, 1); 542 atomic_set(&bh->b_count, 1);
502 bh->b_state = (1 << BH_Mapped) | (1 << BH_Uptodate); 543 bh->b_state = (1 << BH_Mapped) | (1 << BH_Uptodate) | (1 << BH_Lock);
503 set_bh_page(bh, real->b_page, bh_offset(real)); 544 set_bh_page(bh, real->b_page, bh_offset(real));
504 bh->b_blocknr = blkno; 545 bh->b_blocknr = blkno;
505 bh->b_size = sdp->sd_sb.sb_bsize; 546 bh->b_size = sdp->sd_sb.sb_bsize;
506 bh->b_bdev = sdp->sd_vfs->s_bdev; 547 bh->b_bdev = sdp->sd_vfs->s_bdev;
548 bh->b_private = real;
549 bh->b_end_io = gfs2_fake_write_endio;
507 550
508 log_incr_head(sdp); 551 gfs2_log_incr_head(sdp);
552 atomic_inc(&sdp->sd_log_in_flight);
509 553
510 return bh; 554 return bh;
511} 555}
@@ -572,45 +616,75 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
572 gfs2_assert_withdraw(sdp, !pull); 616 gfs2_assert_withdraw(sdp, !pull);
573 617
574 sdp->sd_log_idle = (tail == sdp->sd_log_flush_head); 618 sdp->sd_log_idle = (tail == sdp->sd_log_flush_head);
575 log_incr_head(sdp); 619 gfs2_log_incr_head(sdp);
576} 620}
577 621
578static void log_flush_commit(struct gfs2_sbd *sdp) 622static void log_flush_commit(struct gfs2_sbd *sdp)
579{ 623{
580 struct list_head *head = &sdp->sd_log_flush_list; 624 DEFINE_WAIT(wait);
581 struct gfs2_log_buf *lb; 625
582 struct buffer_head *bh; 626 if (atomic_read(&sdp->sd_log_in_flight)) {
583 int flushcount = 0; 627 do {
628 prepare_to_wait(&sdp->sd_log_flush_wait, &wait,
629 TASK_UNINTERRUPTIBLE);
630 if (atomic_read(&sdp->sd_log_in_flight))
631 io_schedule();
632 } while(atomic_read(&sdp->sd_log_in_flight));
633 finish_wait(&sdp->sd_log_flush_wait, &wait);
634 }
584 635
585 while (!list_empty(head)) { 636 log_write_header(sdp, 0, 0);
586 lb = list_entry(head->next, struct gfs2_log_buf, lb_list); 637}
587 list_del(&lb->lb_list);
588 bh = lb->lb_bh;
589 638
590 wait_on_buffer(bh); 639static void gfs2_ordered_write(struct gfs2_sbd *sdp)
591 if (!buffer_uptodate(bh)) 640{
592 gfs2_io_error_bh(sdp, bh); 641 struct gfs2_bufdata *bd;
593 if (lb->lb_real) { 642 struct buffer_head *bh;
594 while (atomic_read(&bh->b_count) != 1) /* Grrrr... */ 643 LIST_HEAD(written);
595 schedule(); 644
596 free_buffer_head(bh); 645 gfs2_log_lock(sdp);
597 } else 646 while (!list_empty(&sdp->sd_log_le_ordered)) {
647 bd = list_entry(sdp->sd_log_le_ordered.next, struct gfs2_bufdata, bd_le.le_list);
648 list_move(&bd->bd_le.le_list, &written);
649 bh = bd->bd_bh;
650 if (!buffer_dirty(bh))
651 continue;
652 get_bh(bh);
653 gfs2_log_unlock(sdp);
654 lock_buffer(bh);
655 if (test_clear_buffer_dirty(bh)) {
656 bh->b_end_io = end_buffer_write_sync;
657 submit_bh(WRITE, bh);
658 } else {
659 unlock_buffer(bh);
598 brelse(bh); 660 brelse(bh);
599 kfree(lb); 661 }
600 flushcount++; 662 gfs2_log_lock(sdp);
601 } 663 }
664 list_splice(&written, &sdp->sd_log_le_ordered);
665 gfs2_log_unlock(sdp);
666}
602 667
603 /* If nothing was journaled, the header is unplanned and unwanted. */ 668static void gfs2_ordered_wait(struct gfs2_sbd *sdp)
604 if (flushcount) { 669{
605 log_write_header(sdp, 0, 0); 670 struct gfs2_bufdata *bd;
606 } else { 671 struct buffer_head *bh;
607 unsigned int tail;
608 tail = current_tail(sdp);
609 672
610 gfs2_ail1_empty(sdp, 0); 673 gfs2_log_lock(sdp);
611 if (sdp->sd_log_tail != tail) 674 while (!list_empty(&sdp->sd_log_le_ordered)) {
612 log_pull_tail(sdp, tail); 675 bd = list_entry(sdp->sd_log_le_ordered.prev, struct gfs2_bufdata, bd_le.le_list);
676 bh = bd->bd_bh;
677 if (buffer_locked(bh)) {
678 get_bh(bh);
679 gfs2_log_unlock(sdp);
680 wait_on_buffer(bh);
681 brelse(bh);
682 gfs2_log_lock(sdp);
683 continue;
684 }
685 list_del_init(&bd->bd_le.le_list);
613 } 686 }
687 gfs2_log_unlock(sdp);
614} 688}
615 689
616/** 690/**
@@ -640,10 +714,16 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
640 INIT_LIST_HEAD(&ai->ai_ail1_list); 714 INIT_LIST_HEAD(&ai->ai_ail1_list);
641 INIT_LIST_HEAD(&ai->ai_ail2_list); 715 INIT_LIST_HEAD(&ai->ai_ail2_list);
642 716
643 gfs2_assert_withdraw(sdp, 717 if (sdp->sd_log_num_buf != sdp->sd_log_commited_buf) {
644 sdp->sd_log_num_buf + sdp->sd_log_num_jdata == 718 printk(KERN_INFO "GFS2: log buf %u %u\n", sdp->sd_log_num_buf,
645 sdp->sd_log_commited_buf + 719 sdp->sd_log_commited_buf);
646 sdp->sd_log_commited_databuf); 720 gfs2_assert_withdraw(sdp, 0);
721 }
722 if (sdp->sd_log_num_databuf != sdp->sd_log_commited_databuf) {
723 printk(KERN_INFO "GFS2: log databuf %u %u\n",
724 sdp->sd_log_num_databuf, sdp->sd_log_commited_databuf);
725 gfs2_assert_withdraw(sdp, 0);
726 }
647 gfs2_assert_withdraw(sdp, 727 gfs2_assert_withdraw(sdp,
648 sdp->sd_log_num_revoke == sdp->sd_log_commited_revoke); 728 sdp->sd_log_num_revoke == sdp->sd_log_commited_revoke);
649 729
@@ -651,8 +731,11 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
651 sdp->sd_log_flush_wrapped = 0; 731 sdp->sd_log_flush_wrapped = 0;
652 ai->ai_first = sdp->sd_log_flush_head; 732 ai->ai_first = sdp->sd_log_flush_head;
653 733
734 gfs2_ordered_write(sdp);
654 lops_before_commit(sdp); 735 lops_before_commit(sdp);
655 if (!list_empty(&sdp->sd_log_flush_list)) 736 gfs2_ordered_wait(sdp);
737
738 if (sdp->sd_log_head != sdp->sd_log_flush_head)
656 log_flush_commit(sdp); 739 log_flush_commit(sdp);
657 else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle){ 740 else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle){
658 gfs2_log_lock(sdp); 741 gfs2_log_lock(sdp);
@@ -744,7 +827,6 @@ void gfs2_log_shutdown(struct gfs2_sbd *sdp)
744 gfs2_assert_withdraw(sdp, !sdp->sd_log_blks_reserved); 827 gfs2_assert_withdraw(sdp, !sdp->sd_log_blks_reserved);
745 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_gl); 828 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_gl);
746 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_buf); 829 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_buf);
747 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_jdata);
748 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke); 830 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);
749 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_rg); 831 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_rg);
750 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_databuf); 832 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_databuf);
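
The log.c changes above retire the sd_log_flush_list of gfs2_log_buf wrappers in favour of simple in-flight accounting: each log buffer submitted bumps sd_log_in_flight, every completion handler drops it, and log_flush_commit() sleeps on sd_log_flush_wait until the counter reaches zero before writing the log header. A userspace sketch of that accounting pattern using C11 atomics and a condition variable (all names below are illustrative; unlike the kernel's waitqueue, this sketch takes the mutex in the completion path to avoid a lost wakeup):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int in_flight;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  done = PTHREAD_COND_INITIALIZER;

static void submit_io(void)		/* called once per buffer submitted */
{
	atomic_fetch_add(&in_flight, 1);
}

static void io_completion(void)		/* called from each completion handler */
{
	pthread_mutex_lock(&lock);
	if (atomic_fetch_sub(&in_flight, 1) == 1)	/* last one out wakes the flusher */
		pthread_cond_broadcast(&done);
	pthread_mutex_unlock(&lock);
}

static void wait_for_log_io(void)	/* roughly what the flush commit waits for */
{
	pthread_mutex_lock(&lock);
	while (atomic_load(&in_flight) != 0)
		pthread_cond_wait(&done, &lock);
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	submit_io();
	io_completion();		/* normally runs from another thread */
	wait_for_log_io();
	printf("all log I/O complete, header can be written\n");
	return 0;
}
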
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index 8e7aa0f29109..dae282400627 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -52,12 +52,14 @@ int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags);
 
 int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks);
 void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks);
+void gfs2_log_incr_head(struct gfs2_sbd *sdp);
 
 struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp);
 struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp,
 				      struct buffer_head *real);
 void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl);
 void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans);
+void gfs2_remove_from_ail(struct address_space *mapping, struct gfs2_bufdata *bd);
 
 void gfs2_log_shutdown(struct gfs2_sbd *sdp);
 void gfs2_meta_syncfs(struct gfs2_sbd *sdp);
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 3b395c41b2f3..6c27cea761c6 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -27,7 +27,104 @@
27#include "trans.h" 27#include "trans.h"
28#include "util.h" 28#include "util.h"
29 29
30static void glock_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le) 30/**
31 * gfs2_pin - Pin a buffer in memory
32 * @sdp: The superblock
33 * @bh: The buffer to be pinned
34 *
35 * The log lock must be held when calling this function
36 */
37static void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh)
38{
39 struct gfs2_bufdata *bd;
40
41 gfs2_assert_withdraw(sdp, test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags));
42
43 clear_buffer_dirty(bh);
44 if (test_set_buffer_pinned(bh))
45 gfs2_assert_withdraw(sdp, 0);
46 if (!buffer_uptodate(bh))
47 gfs2_io_error_bh(sdp, bh);
48 bd = bh->b_private;
49 /* If this buffer is in the AIL and it has already been written
50 * to in-place disk block, remove it from the AIL.
51 */
52 if (bd->bd_ail)
53 list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list);
54 get_bh(bh);
55}
56
57/**
58 * gfs2_unpin - Unpin a buffer
59 * @sdp: the filesystem the buffer belongs to
60 * @bh: The buffer to unpin
61 * @ai:
62 *
63 */
64
65static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
66 struct gfs2_ail *ai)
67{
68 struct gfs2_bufdata *bd = bh->b_private;
69
70 gfs2_assert_withdraw(sdp, buffer_uptodate(bh));
71
72 if (!buffer_pinned(bh))
73 gfs2_assert_withdraw(sdp, 0);
74
75 lock_buffer(bh);
76 mark_buffer_dirty(bh);
77 clear_buffer_pinned(bh);
78
79 gfs2_log_lock(sdp);
80 if (bd->bd_ail) {
81 list_del(&bd->bd_ail_st_list);
82 brelse(bh);
83 } else {
84 struct gfs2_glock *gl = bd->bd_gl;
85 list_add(&bd->bd_ail_gl_list, &gl->gl_ail_list);
86 atomic_inc(&gl->gl_ail_count);
87 }
88 bd->bd_ail = ai;
89 list_add(&bd->bd_ail_st_list, &ai->ai_ail1_list);
90 gfs2_log_unlock(sdp);
91 unlock_buffer(bh);
92}
93
94
95static inline struct gfs2_log_descriptor *bh_log_desc(struct buffer_head *bh)
96{
97 return (struct gfs2_log_descriptor *)bh->b_data;
98}
99
100static inline __be64 *bh_log_ptr(struct buffer_head *bh)
101{
102 struct gfs2_log_descriptor *ld = bh_log_desc(bh);
103 return (__force __be64 *)(ld + 1);
104}
105
106static inline __be64 *bh_ptr_end(struct buffer_head *bh)
107{
108 return (__force __be64 *)(bh->b_data + bh->b_size);
109}
110
111
112static struct buffer_head *gfs2_get_log_desc(struct gfs2_sbd *sdp, u32 ld_type)
113{
114 struct buffer_head *bh = gfs2_log_get_buf(sdp);
115 struct gfs2_log_descriptor *ld = bh_log_desc(bh);
116 ld->ld_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
117 ld->ld_header.mh_type = cpu_to_be32(GFS2_METATYPE_LD);
118 ld->ld_header.mh_format = cpu_to_be32(GFS2_FORMAT_LD);
119 ld->ld_type = cpu_to_be32(ld_type);
120 ld->ld_length = 0;
121 ld->ld_data1 = 0;
122 ld->ld_data2 = 0;
123 memset(ld->ld_reserved, 0, sizeof(ld->ld_reserved));
124 return bh;
125}
126
127static void __glock_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
31{ 128{
32 struct gfs2_glock *gl; 129 struct gfs2_glock *gl;
33 struct gfs2_trans *tr = current->journal_info; 130 struct gfs2_trans *tr = current->journal_info;
@@ -38,15 +135,19 @@ static void glock_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
38 if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(gl))) 135 if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(gl)))
39 return; 136 return;
40 137
41 gfs2_log_lock(sdp); 138 if (!list_empty(&le->le_list))
42 if (!list_empty(&le->le_list)){
43 gfs2_log_unlock(sdp);
44 return; 139 return;
45 } 140
46 gfs2_glock_hold(gl); 141 gfs2_glock_hold(gl);
47 set_bit(GLF_DIRTY, &gl->gl_flags); 142 set_bit(GLF_DIRTY, &gl->gl_flags);
48 sdp->sd_log_num_gl++; 143 sdp->sd_log_num_gl++;
49 list_add(&le->le_list, &sdp->sd_log_le_gl); 144 list_add(&le->le_list, &sdp->sd_log_le_gl);
145}
146
147static void glock_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
148{
149 gfs2_log_lock(sdp);
150 __glock_lo_add(sdp, le);
50 gfs2_log_unlock(sdp); 151 gfs2_log_unlock(sdp);
51} 152}
52 153
@@ -71,30 +172,25 @@ static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
71 struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le); 172 struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le);
72 struct gfs2_trans *tr; 173 struct gfs2_trans *tr;
73 174
175 lock_buffer(bd->bd_bh);
74 gfs2_log_lock(sdp); 176 gfs2_log_lock(sdp);
75 if (!list_empty(&bd->bd_list_tr)) { 177 if (!list_empty(&bd->bd_list_tr))
76 gfs2_log_unlock(sdp); 178 goto out;
77 return;
78 }
79 tr = current->journal_info; 179 tr = current->journal_info;
80 tr->tr_touched = 1; 180 tr->tr_touched = 1;
81 tr->tr_num_buf++; 181 tr->tr_num_buf++;
82 list_add(&bd->bd_list_tr, &tr->tr_list_buf); 182 list_add(&bd->bd_list_tr, &tr->tr_list_buf);
83 gfs2_log_unlock(sdp);
84
85 if (!list_empty(&le->le_list)) 183 if (!list_empty(&le->le_list))
86 return; 184 goto out;
87 185 __glock_lo_add(sdp, &bd->bd_gl->gl_le);
88 gfs2_trans_add_gl(bd->bd_gl);
89
90 gfs2_meta_check(sdp, bd->bd_bh); 186 gfs2_meta_check(sdp, bd->bd_bh);
91 gfs2_pin(sdp, bd->bd_bh); 187 gfs2_pin(sdp, bd->bd_bh);
92 gfs2_log_lock(sdp);
93 sdp->sd_log_num_buf++; 188 sdp->sd_log_num_buf++;
94 list_add(&le->le_list, &sdp->sd_log_le_buf); 189 list_add(&le->le_list, &sdp->sd_log_le_buf);
95 gfs2_log_unlock(sdp);
96
97 tr->tr_num_buf_new++; 190 tr->tr_num_buf_new++;
191out:
192 gfs2_log_unlock(sdp);
193 unlock_buffer(bd->bd_bh);
98} 194}
99 195
100static void buf_lo_incore_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) 196static void buf_lo_incore_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
@@ -117,8 +213,7 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp)
117 struct buffer_head *bh; 213 struct buffer_head *bh;
118 struct gfs2_log_descriptor *ld; 214 struct gfs2_log_descriptor *ld;
119 struct gfs2_bufdata *bd1 = NULL, *bd2; 215 struct gfs2_bufdata *bd1 = NULL, *bd2;
120 unsigned int total = sdp->sd_log_num_buf; 216 unsigned int total;
121 unsigned int offset = BUF_OFFSET;
122 unsigned int limit; 217 unsigned int limit;
123 unsigned int num; 218 unsigned int num;
124 unsigned n; 219 unsigned n;
@@ -127,22 +222,20 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp)
127 limit = buf_limit(sdp); 222 limit = buf_limit(sdp);
128 /* for 4k blocks, limit = 503 */ 223 /* for 4k blocks, limit = 503 */
129 224
225 gfs2_log_lock(sdp);
226 total = sdp->sd_log_num_buf;
130 bd1 = bd2 = list_prepare_entry(bd1, &sdp->sd_log_le_buf, bd_le.le_list); 227 bd1 = bd2 = list_prepare_entry(bd1, &sdp->sd_log_le_buf, bd_le.le_list);
131 while(total) { 228 while(total) {
132 num = total; 229 num = total;
133 if (total > limit) 230 if (total > limit)
134 num = limit; 231 num = limit;
135 bh = gfs2_log_get_buf(sdp); 232 gfs2_log_unlock(sdp);
136 ld = (struct gfs2_log_descriptor *)bh->b_data; 233 bh = gfs2_get_log_desc(sdp, GFS2_LOG_DESC_METADATA);
137 ptr = (__be64 *)(bh->b_data + offset); 234 gfs2_log_lock(sdp);
138 ld->ld_header.mh_magic = cpu_to_be32(GFS2_MAGIC); 235 ld = bh_log_desc(bh);
139 ld->ld_header.mh_type = cpu_to_be32(GFS2_METATYPE_LD); 236 ptr = bh_log_ptr(bh);
140 ld->ld_header.mh_format = cpu_to_be32(GFS2_FORMAT_LD);
141 ld->ld_type = cpu_to_be32(GFS2_LOG_DESC_METADATA);
142 ld->ld_length = cpu_to_be32(num + 1); 237 ld->ld_length = cpu_to_be32(num + 1);
143 ld->ld_data1 = cpu_to_be32(num); 238 ld->ld_data1 = cpu_to_be32(num);
144 ld->ld_data2 = cpu_to_be32(0);
145 memset(ld->ld_reserved, 0, sizeof(ld->ld_reserved));
146 239
147 n = 0; 240 n = 0;
148 list_for_each_entry_continue(bd1, &sdp->sd_log_le_buf, 241 list_for_each_entry_continue(bd1, &sdp->sd_log_le_buf,
@@ -152,21 +245,27 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp)
152 break; 245 break;
153 } 246 }
154 247
155 set_buffer_dirty(bh); 248 gfs2_log_unlock(sdp);
156 ll_rw_block(WRITE, 1, &bh); 249 submit_bh(WRITE, bh);
250 gfs2_log_lock(sdp);
157 251
158 n = 0; 252 n = 0;
159 list_for_each_entry_continue(bd2, &sdp->sd_log_le_buf, 253 list_for_each_entry_continue(bd2, &sdp->sd_log_le_buf,
160 bd_le.le_list) { 254 bd_le.le_list) {
255 get_bh(bd2->bd_bh);
256 gfs2_log_unlock(sdp);
257 lock_buffer(bd2->bd_bh);
161 bh = gfs2_log_fake_buf(sdp, bd2->bd_bh); 258 bh = gfs2_log_fake_buf(sdp, bd2->bd_bh);
162 set_buffer_dirty(bh); 259 submit_bh(WRITE, bh);
163 ll_rw_block(WRITE, 1, &bh); 260 gfs2_log_lock(sdp);
164 if (++n >= num) 261 if (++n >= num)
165 break; 262 break;
166 } 263 }
167 264
265 BUG_ON(total < num);
168 total -= num; 266 total -= num;
169 } 267 }
268 gfs2_log_unlock(sdp);
170} 269}
171 270
172static void buf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai) 271static void buf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
@@ -270,11 +369,8 @@ static void revoke_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
270 tr = current->journal_info; 369 tr = current->journal_info;
271 tr->tr_touched = 1; 370 tr->tr_touched = 1;
272 tr->tr_num_revoke++; 371 tr->tr_num_revoke++;
273
274 gfs2_log_lock(sdp);
275 sdp->sd_log_num_revoke++; 372 sdp->sd_log_num_revoke++;
276 list_add(&le->le_list, &sdp->sd_log_le_revoke); 373 list_add(&le->le_list, &sdp->sd_log_le_revoke);
277 gfs2_log_unlock(sdp);
278} 374}
279 375
280static void revoke_lo_before_commit(struct gfs2_sbd *sdp) 376static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
@@ -284,32 +380,25 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
284 struct buffer_head *bh; 380 struct buffer_head *bh;
285 unsigned int offset; 381 unsigned int offset;
286 struct list_head *head = &sdp->sd_log_le_revoke; 382 struct list_head *head = &sdp->sd_log_le_revoke;
287 struct gfs2_revoke *rv; 383 struct gfs2_bufdata *bd;
288 384
289 if (!sdp->sd_log_num_revoke) 385 if (!sdp->sd_log_num_revoke)
290 return; 386 return;
291 387
292 bh = gfs2_log_get_buf(sdp); 388 bh = gfs2_get_log_desc(sdp, GFS2_LOG_DESC_REVOKE);
293 ld = (struct gfs2_log_descriptor *)bh->b_data; 389 ld = bh_log_desc(bh);
294 ld->ld_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
295 ld->ld_header.mh_type = cpu_to_be32(GFS2_METATYPE_LD);
296 ld->ld_header.mh_format = cpu_to_be32(GFS2_FORMAT_LD);
297 ld->ld_type = cpu_to_be32(GFS2_LOG_DESC_REVOKE);
298 ld->ld_length = cpu_to_be32(gfs2_struct2blk(sdp, sdp->sd_log_num_revoke, 390 ld->ld_length = cpu_to_be32(gfs2_struct2blk(sdp, sdp->sd_log_num_revoke,
299 sizeof(u64))); 391 sizeof(u64)));
300 ld->ld_data1 = cpu_to_be32(sdp->sd_log_num_revoke); 392 ld->ld_data1 = cpu_to_be32(sdp->sd_log_num_revoke);
301 ld->ld_data2 = cpu_to_be32(0);
302 memset(ld->ld_reserved, 0, sizeof(ld->ld_reserved));
303 offset = sizeof(struct gfs2_log_descriptor); 393 offset = sizeof(struct gfs2_log_descriptor);
304 394
305 while (!list_empty(head)) { 395 while (!list_empty(head)) {
306 rv = list_entry(head->next, struct gfs2_revoke, rv_le.le_list); 396 bd = list_entry(head->next, struct gfs2_bufdata, bd_le.le_list);
307 list_del_init(&rv->rv_le.le_list); 397 list_del_init(&bd->bd_le.le_list);
308 sdp->sd_log_num_revoke--; 398 sdp->sd_log_num_revoke--;
309 399
310 if (offset + sizeof(u64) > sdp->sd_sb.sb_bsize) { 400 if (offset + sizeof(u64) > sdp->sd_sb.sb_bsize) {
311 set_buffer_dirty(bh); 401 submit_bh(WRITE, bh);
312 ll_rw_block(WRITE, 1, &bh);
313 402
314 bh = gfs2_log_get_buf(sdp); 403 bh = gfs2_log_get_buf(sdp);
315 mh = (struct gfs2_meta_header *)bh->b_data; 404 mh = (struct gfs2_meta_header *)bh->b_data;
@@ -319,15 +408,14 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
319 offset = sizeof(struct gfs2_meta_header); 408 offset = sizeof(struct gfs2_meta_header);
320 } 409 }
321 410
322 *(__be64 *)(bh->b_data + offset) = cpu_to_be64(rv->rv_blkno); 411 *(__be64 *)(bh->b_data + offset) = cpu_to_be64(bd->bd_blkno);
323 kfree(rv); 412 kmem_cache_free(gfs2_bufdata_cachep, bd);
324 413
325 offset += sizeof(u64); 414 offset += sizeof(u64);
326 } 415 }
327 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke); 416 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);
328 417
329 set_buffer_dirty(bh); 418 submit_bh(WRITE, bh);
330 ll_rw_block(WRITE, 1, &bh);
331} 419}
332 420
333static void revoke_lo_before_scan(struct gfs2_jdesc *jd, 421static void revoke_lo_before_scan(struct gfs2_jdesc *jd,
@@ -466,222 +554,136 @@ static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
466 struct address_space *mapping = bd->bd_bh->b_page->mapping; 554 struct address_space *mapping = bd->bd_bh->b_page->mapping;
467 struct gfs2_inode *ip = GFS2_I(mapping->host); 555 struct gfs2_inode *ip = GFS2_I(mapping->host);
468 556
557 lock_buffer(bd->bd_bh);
469 gfs2_log_lock(sdp); 558 gfs2_log_lock(sdp);
470 if (!list_empty(&bd->bd_list_tr)) { 559 if (!list_empty(&bd->bd_list_tr))
471 gfs2_log_unlock(sdp); 560 goto out;
472 return;
473 }
474 tr->tr_touched = 1; 561 tr->tr_touched = 1;
475 if (gfs2_is_jdata(ip)) { 562 if (gfs2_is_jdata(ip)) {
476 tr->tr_num_buf++; 563 tr->tr_num_buf++;
477 list_add(&bd->bd_list_tr, &tr->tr_list_buf); 564 list_add(&bd->bd_list_tr, &tr->tr_list_buf);
478 } 565 }
479 gfs2_log_unlock(sdp);
480 if (!list_empty(&le->le_list)) 566 if (!list_empty(&le->le_list))
481 return; 567 goto out;
482 568
483 gfs2_trans_add_gl(bd->bd_gl); 569 __glock_lo_add(sdp, &bd->bd_gl->gl_le);
484 if (gfs2_is_jdata(ip)) { 570 if (gfs2_is_jdata(ip)) {
485 sdp->sd_log_num_jdata++;
486 gfs2_pin(sdp, bd->bd_bh); 571 gfs2_pin(sdp, bd->bd_bh);
487 tr->tr_num_databuf_new++; 572 tr->tr_num_databuf_new++;
573 sdp->sd_log_num_databuf++;
574 list_add(&le->le_list, &sdp->sd_log_le_databuf);
575 } else {
576 list_add(&le->le_list, &sdp->sd_log_le_ordered);
488 } 577 }
489 gfs2_log_lock(sdp); 578out:
490 sdp->sd_log_num_databuf++;
491 list_add(&le->le_list, &sdp->sd_log_le_databuf);
492 gfs2_log_unlock(sdp); 579 gfs2_log_unlock(sdp);
580 unlock_buffer(bd->bd_bh);
493} 581}
494 582
495static int gfs2_check_magic(struct buffer_head *bh) 583static void gfs2_check_magic(struct buffer_head *bh)
496{ 584{
497 struct page *page = bh->b_page;
498 void *kaddr; 585 void *kaddr;
499 __be32 *ptr; 586 __be32 *ptr;
500 int rv = 0;
501 587
502 kaddr = kmap_atomic(page, KM_USER0); 588 clear_buffer_escaped(bh);
589 kaddr = kmap_atomic(bh->b_page, KM_USER0);
503 ptr = kaddr + bh_offset(bh); 590 ptr = kaddr + bh_offset(bh);
504 if (*ptr == cpu_to_be32(GFS2_MAGIC)) 591 if (*ptr == cpu_to_be32(GFS2_MAGIC))
505 rv = 1; 592 set_buffer_escaped(bh);
506 kunmap_atomic(kaddr, KM_USER0); 593 kunmap_atomic(kaddr, KM_USER0);
507
508 return rv;
509} 594}
510 595
511/** 596static void gfs2_write_blocks(struct gfs2_sbd *sdp, struct buffer_head *bh,
512 * databuf_lo_before_commit - Scan the data buffers, writing as we go 597 struct list_head *list, struct list_head *done,
513 * 598 unsigned int n)
514 * Here we scan through the lists of buffers and make the assumption
515 * that any buffer thats been pinned is being journaled, and that
516 * any unpinned buffer is an ordered write data buffer and therefore
517 * will be written back rather than journaled.
518 */
519static void databuf_lo_before_commit(struct gfs2_sbd *sdp)
520{ 599{
521 LIST_HEAD(started); 600 struct buffer_head *bh1;
522 struct gfs2_bufdata *bd1 = NULL, *bd2, *bdt;
523 struct buffer_head *bh = NULL,*bh1 = NULL;
524 struct gfs2_log_descriptor *ld; 601 struct gfs2_log_descriptor *ld;
525 unsigned int limit; 602 struct gfs2_bufdata *bd;
526 unsigned int total_dbuf; 603 __be64 *ptr;
527 unsigned int total_jdata = sdp->sd_log_num_jdata; 604
528 unsigned int num, n; 605 if (!bh)
529 __be64 *ptr = NULL; 606 return;
530 607
531 limit = databuf_limit(sdp); 608 ld = bh_log_desc(bh);
609 ld->ld_length = cpu_to_be32(n + 1);
610 ld->ld_data1 = cpu_to_be32(n);
532 611
533 /* 612 ptr = bh_log_ptr(bh);
534 * Start writing ordered buffers, write journaled buffers 613
535 * into the log along with a header 614 get_bh(bh);
536 */ 615 submit_bh(WRITE, bh);
537 gfs2_log_lock(sdp); 616 gfs2_log_lock(sdp);
538 total_dbuf = sdp->sd_log_num_databuf; 617 while(!list_empty(list)) {
539 bd2 = bd1 = list_prepare_entry(bd1, &sdp->sd_log_le_databuf, 618 bd = list_entry(list->next, struct gfs2_bufdata, bd_le.le_list);
540 bd_le.le_list); 619 list_move_tail(&bd->bd_le.le_list, done);
541 while(total_dbuf) { 620 get_bh(bd->bd_bh);
542 num = total_jdata; 621 while (be64_to_cpu(*ptr) != bd->bd_bh->b_blocknr) {
543 if (num > limit) 622 gfs2_log_incr_head(sdp);
544 num = limit; 623 ptr += 2;
545 n = 0;
546 list_for_each_entry_safe_continue(bd1, bdt,
547 &sdp->sd_log_le_databuf,
548 bd_le.le_list) {
549 /* store off the buffer head in a local ptr since
550 * gfs2_bufdata might change when we drop the log lock
551 */
552 bh1 = bd1->bd_bh;
553
554 /* An ordered write buffer */
555 if (bh1 && !buffer_pinned(bh1)) {
556 list_move(&bd1->bd_le.le_list, &started);
557 if (bd1 == bd2) {
558 bd2 = NULL;
559 bd2 = list_prepare_entry(bd2,
560 &sdp->sd_log_le_databuf,
561 bd_le.le_list);
562 }
563 total_dbuf--;
564 if (bh1) {
565 if (buffer_dirty(bh1)) {
566 get_bh(bh1);
567
568 gfs2_log_unlock(sdp);
569
570 ll_rw_block(SWRITE, 1, &bh1);
571 brelse(bh1);
572
573 gfs2_log_lock(sdp);
574 }
575 continue;
576 }
577 continue;
578 } else if (bh1) { /* A journaled buffer */
579 int magic;
580 gfs2_log_unlock(sdp);
581 if (!bh) {
582 bh = gfs2_log_get_buf(sdp);
583 ld = (struct gfs2_log_descriptor *)
584 bh->b_data;
585 ptr = (__be64 *)(bh->b_data +
586 DATABUF_OFFSET);
587 ld->ld_header.mh_magic =
588 cpu_to_be32(GFS2_MAGIC);
589 ld->ld_header.mh_type =
590 cpu_to_be32(GFS2_METATYPE_LD);
591 ld->ld_header.mh_format =
592 cpu_to_be32(GFS2_FORMAT_LD);
593 ld->ld_type =
594 cpu_to_be32(GFS2_LOG_DESC_JDATA);
595 ld->ld_length = cpu_to_be32(num + 1);
596 ld->ld_data1 = cpu_to_be32(num);
597 ld->ld_data2 = cpu_to_be32(0);
598 memset(ld->ld_reserved, 0, sizeof(ld->ld_reserved));
599 }
600 magic = gfs2_check_magic(bh1);
601 *ptr++ = cpu_to_be64(bh1->b_blocknr);
602 *ptr++ = cpu_to_be64((__u64)magic);
603 clear_buffer_escaped(bh1);
604 if (unlikely(magic != 0))
605 set_buffer_escaped(bh1);
606 gfs2_log_lock(sdp);
607 if (++n >= num)
608 break;
609 } else if (!bh1) {
610 total_dbuf--;
611 sdp->sd_log_num_databuf--;
612 list_del_init(&bd1->bd_le.le_list);
613 if (bd1 == bd2) {
614 bd2 = NULL;
615 bd2 = list_prepare_entry(bd2,
616 &sdp->sd_log_le_databuf,
617 bd_le.le_list);
618 }
619 kmem_cache_free(gfs2_bufdata_cachep, bd1);
620 }
621 } 624 }
622 gfs2_log_unlock(sdp); 625 gfs2_log_unlock(sdp);
623 if (bh) { 626 lock_buffer(bd->bd_bh);
624 set_buffer_mapped(bh); 627 if (buffer_escaped(bd->bd_bh)) {
625 set_buffer_dirty(bh); 628 void *kaddr;
626 ll_rw_block(WRITE, 1, &bh); 629 bh1 = gfs2_log_get_buf(sdp);
627 bh = NULL; 630 kaddr = kmap_atomic(bd->bd_bh->b_page, KM_USER0);
631 memcpy(bh1->b_data, kaddr + bh_offset(bd->bd_bh),
632 bh1->b_size);
633 kunmap_atomic(kaddr, KM_USER0);
634 *(__be32 *)bh1->b_data = 0;
635 clear_buffer_escaped(bd->bd_bh);
636 unlock_buffer(bd->bd_bh);
637 brelse(bd->bd_bh);
638 } else {
639 bh1 = gfs2_log_fake_buf(sdp, bd->bd_bh);
628 } 640 }
629 n = 0; 641 submit_bh(WRITE, bh1);
630 gfs2_log_lock(sdp); 642 gfs2_log_lock(sdp);
631 list_for_each_entry_continue(bd2, &sdp->sd_log_le_databuf, 643 ptr += 2;
632 bd_le.le_list) {
633 if (!bd2->bd_bh)
634 continue;
635 /* copy buffer if it needs escaping */
636 gfs2_log_unlock(sdp);
637 if (unlikely(buffer_escaped(bd2->bd_bh))) {
638 void *kaddr;
639 struct page *page = bd2->bd_bh->b_page;
640 bh = gfs2_log_get_buf(sdp);
641 kaddr = kmap_atomic(page, KM_USER0);
642 memcpy(bh->b_data,
643 kaddr + bh_offset(bd2->bd_bh),
644 sdp->sd_sb.sb_bsize);
645 kunmap_atomic(kaddr, KM_USER0);
646 *(__be32 *)bh->b_data = 0;
647 } else {
648 bh = gfs2_log_fake_buf(sdp, bd2->bd_bh);
649 }
650 set_buffer_dirty(bh);
651 ll_rw_block(WRITE, 1, &bh);
652 gfs2_log_lock(sdp);
653 if (++n >= num)
654 break;
655 }
656 bh = NULL;
657 BUG_ON(total_dbuf < num);
658 total_dbuf -= num;
659 total_jdata -= num;
660 } 644 }
661 gfs2_log_unlock(sdp); 645 gfs2_log_unlock(sdp);
646 brelse(bh);
647}
662 648
663 /* Wait on all ordered buffers */ 649/**
664 while (!list_empty(&started)) { 650 * databuf_lo_before_commit - Scan the data buffers, writing as we go
665 gfs2_log_lock(sdp); 651 *
666 bd1 = list_entry(started.next, struct gfs2_bufdata, 652 */
667 bd_le.le_list);
668 list_del_init(&bd1->bd_le.le_list);
669 sdp->sd_log_num_databuf--;
670 bh = bd1->bd_bh;
671 if (bh) {
672 bh->b_private = NULL;
673 get_bh(bh);
674 gfs2_log_unlock(sdp);
675 wait_on_buffer(bh);
676 brelse(bh);
677 } else
678 gfs2_log_unlock(sdp);
679 653
680 kmem_cache_free(gfs2_bufdata_cachep, bd1); 654static void databuf_lo_before_commit(struct gfs2_sbd *sdp)
681 } 655{
656 struct gfs2_bufdata *bd = NULL;
657 struct buffer_head *bh = NULL;
658 unsigned int n = 0;
659 __be64 *ptr = NULL, *end = NULL;
660 LIST_HEAD(processed);
661 LIST_HEAD(in_progress);
682 662
683 /* We've removed all the ordered write bufs here, so only jdata left */ 663 gfs2_log_lock(sdp);
684 gfs2_assert_warn(sdp, sdp->sd_log_num_databuf == sdp->sd_log_num_jdata); 664 while (!list_empty(&sdp->sd_log_le_databuf)) {
665 if (ptr == end) {
666 gfs2_log_unlock(sdp);
667 gfs2_write_blocks(sdp, bh, &in_progress, &processed, n);
668 n = 0;
669 bh = gfs2_get_log_desc(sdp, GFS2_LOG_DESC_JDATA);
670 ptr = bh_log_ptr(bh);
671 end = bh_ptr_end(bh) - 1;
672 gfs2_log_lock(sdp);
673 continue;
674 }
675 bd = list_entry(sdp->sd_log_le_databuf.next, struct gfs2_bufdata, bd_le.le_list);
676 list_move_tail(&bd->bd_le.le_list, &in_progress);
677 gfs2_check_magic(bd->bd_bh);
678 *ptr++ = cpu_to_be64(bd->bd_bh->b_blocknr);
679 *ptr++ = cpu_to_be64(buffer_escaped(bh) ? 1 : 0);
680 n++;
681 }
682 gfs2_log_unlock(sdp);
683 gfs2_write_blocks(sdp, bh, &in_progress, &processed, n);
684 gfs2_log_lock(sdp);
685 list_splice(&processed, &sdp->sd_log_le_databuf);
686 gfs2_log_unlock(sdp);
685} 687}
686 688
687static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, 689static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
@@ -765,11 +767,9 @@ static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
765 bd = list_entry(head->next, struct gfs2_bufdata, bd_le.le_list); 767 bd = list_entry(head->next, struct gfs2_bufdata, bd_le.le_list);
766 list_del_init(&bd->bd_le.le_list); 768 list_del_init(&bd->bd_le.le_list);
767 sdp->sd_log_num_databuf--; 769 sdp->sd_log_num_databuf--;
768 sdp->sd_log_num_jdata--;
769 gfs2_unpin(sdp, bd->bd_bh, ai); 770 gfs2_unpin(sdp, bd->bd_bh, ai);
770 } 771 }
771 gfs2_assert_warn(sdp, !sdp->sd_log_num_databuf); 772 gfs2_assert_warn(sdp, !sdp->sd_log_num_databuf);
772 gfs2_assert_warn(sdp, !sdp->sd_log_num_jdata);
773} 773}
774 774
775 775
@@ -817,10 +817,10 @@ const struct gfs2_log_operations gfs2_databuf_lops = {
817 817
818const struct gfs2_log_operations *gfs2_log_ops[] = { 818const struct gfs2_log_operations *gfs2_log_ops[] = {
819 &gfs2_glock_lops, 819 &gfs2_glock_lops,
820 &gfs2_databuf_lops,
820 &gfs2_buf_lops, 821 &gfs2_buf_lops,
821 &gfs2_revoke_lops,
822 &gfs2_rg_lops, 822 &gfs2_rg_lops,
823 &gfs2_databuf_lops, 823 &gfs2_revoke_lops,
824 NULL, 824 NULL,
825}; 825};
826 826
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index d5d4e68b8807..79c91fd8381b 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -107,6 +107,8 @@ static int __init init_gfs2_fs(void)
107fail_unregister: 107fail_unregister:
108 unregister_filesystem(&gfs2_fs_type); 108 unregister_filesystem(&gfs2_fs_type);
109fail: 109fail:
110 gfs2_glock_exit();
111
110 if (gfs2_bufdata_cachep) 112 if (gfs2_bufdata_cachep)
111 kmem_cache_destroy(gfs2_bufdata_cachep); 113 kmem_cache_destroy(gfs2_bufdata_cachep);
112 114
@@ -127,6 +129,7 @@ fail:
127 129
128static void __exit exit_gfs2_fs(void) 130static void __exit exit_gfs2_fs(void)
129{ 131{
132 gfs2_glock_exit();
130 gfs2_unregister_debugfs(); 133 gfs2_unregister_debugfs();
131 unregister_filesystem(&gfs2_fs_type); 134 unregister_filesystem(&gfs2_fs_type);
132 unregister_filesystem(&gfs2meta_fs_type); 135 unregister_filesystem(&gfs2meta_fs_type);
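
The two main.c hunks call gfs2_glock_exit() both on the init_gfs2_fs() failure path and at the top of exit_gfs2_fs(), keeping teardown the mirror image of setup. A minimal sketch of that convention (reverse-order unwinding with fall-through error labels), assuming placeholder setup/teardown helpers rather than the real GFS2 ones:

#include <linux/init.h>
#include <linux/module.h>

/* Placeholder stand-ins for the real init/teardown steps. */
static int setup_glocks(void)     { return 0; }
static void teardown_glocks(void) { }
static int setup_caches(void)     { return 0; }
static void teardown_caches(void) { }

static int __init example_init(void)
{
        int error;

        error = setup_glocks();
        if (error)
                return error;

        error = setup_caches();
        if (error)
                goto fail_glocks;       /* undo only what already succeeded */

        return 0;

fail_glocks:
        teardown_glocks();
        return error;
}

static void __exit example_exit(void)
{
        /* strict reverse order of example_init() */
        teardown_caches();
        teardown_glocks();
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");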
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 8da343b34ae7..4da423985e4f 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -297,74 +297,35 @@ void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh,
297 unlock_page(bh->b_page); 297 unlock_page(bh->b_page);
298} 298}
299 299
300/** 300void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int meta)
301 * gfs2_pin - Pin a buffer in memory
302 * @sdp: the filesystem the buffer belongs to
303 * @bh: The buffer to be pinned
304 *
305 */
306
307void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh)
308{ 301{
302 struct gfs2_sbd *sdp = GFS2_SB(bh->b_page->mapping->host);
309 struct gfs2_bufdata *bd = bh->b_private; 303 struct gfs2_bufdata *bd = bh->b_private;
310 304 if (test_clear_buffer_pinned(bh)) {
311 gfs2_assert_withdraw(sdp, test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)); 305 list_del_init(&bd->bd_le.le_list);
312 306 if (meta) {
313 if (test_set_buffer_pinned(bh)) 307 gfs2_assert_warn(sdp, sdp->sd_log_num_buf);
314 gfs2_assert_withdraw(sdp, 0); 308 sdp->sd_log_num_buf--;
315 309 tr->tr_num_buf_rm++;
316 wait_on_buffer(bh); 310 } else {
317 311 gfs2_assert_warn(sdp, sdp->sd_log_num_databuf);
318 /* If this buffer is in the AIL and it has already been written 312 sdp->sd_log_num_databuf--;
319 to in-place disk block, remove it from the AIL. */ 313 tr->tr_num_databuf_rm++;
320 314 }
321 gfs2_log_lock(sdp); 315 tr->tr_touched = 1;
322 if (bd->bd_ail && !buffer_in_io(bh))
323 list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list);
324 gfs2_log_unlock(sdp);
325
326 clear_buffer_dirty(bh);
327 wait_on_buffer(bh);
328
329 if (!buffer_uptodate(bh))
330 gfs2_io_error_bh(sdp, bh);
331
332 get_bh(bh);
333}
334
335/**
336 * gfs2_unpin - Unpin a buffer
337 * @sdp: the filesystem the buffer belongs to
338 * @bh: The buffer to unpin
339 * @ai:
340 *
341 */
342
343void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
344 struct gfs2_ail *ai)
345{
346 struct gfs2_bufdata *bd = bh->b_private;
347
348 gfs2_assert_withdraw(sdp, buffer_uptodate(bh));
349
350 if (!buffer_pinned(bh))
351 gfs2_assert_withdraw(sdp, 0);
352
353 mark_buffer_dirty(bh);
354 clear_buffer_pinned(bh);
355
356 gfs2_log_lock(sdp);
357 if (bd->bd_ail) {
358 list_del(&bd->bd_ail_st_list);
359 brelse(bh); 316 brelse(bh);
360 } else {
361 struct gfs2_glock *gl = bd->bd_gl;
362 list_add(&bd->bd_ail_gl_list, &gl->gl_ail_list);
363 atomic_inc(&gl->gl_ail_count);
364 } 317 }
365 bd->bd_ail = ai; 318 if (bd) {
366 list_add(&bd->bd_ail_st_list, &ai->ai_ail1_list); 319 if (bd->bd_ail) {
367 gfs2_log_unlock(sdp); 320 gfs2_remove_from_ail(NULL, bd);
321 bh->b_private = NULL;
322 bd->bd_bh = NULL;
323 bd->bd_blkno = bh->b_blocknr;
324 gfs2_trans_add_revoke(sdp, bd);
325 }
326 }
327 clear_buffer_dirty(bh);
328 clear_buffer_uptodate(bh);
368} 329}
369 330
370/** 331/**
@@ -383,44 +344,11 @@ void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen)
383 while (blen) { 344 while (blen) {
384 bh = getbuf(ip->i_gl, bstart, NO_CREATE); 345 bh = getbuf(ip->i_gl, bstart, NO_CREATE);
385 if (bh) { 346 if (bh) {
386 struct gfs2_bufdata *bd = bh->b_private;
387
388 if (test_clear_buffer_pinned(bh)) {
389 struct gfs2_trans *tr = current->journal_info;
390 struct gfs2_inode *bh_ip =
391 GFS2_I(bh->b_page->mapping->host);
392
393 gfs2_log_lock(sdp);
394 list_del_init(&bd->bd_le.le_list);
395 gfs2_assert_warn(sdp, sdp->sd_log_num_buf);
396 sdp->sd_log_num_buf--;
397 gfs2_log_unlock(sdp);
398 if (bh_ip->i_inode.i_private != NULL)
399 tr->tr_num_databuf_rm++;
400 else
401 tr->tr_num_buf_rm++;
402 brelse(bh);
403 }
404 if (bd) {
405 gfs2_log_lock(sdp);
406 if (bd->bd_ail) {
407 u64 blkno = bh->b_blocknr;
408 bd->bd_ail = NULL;
409 list_del(&bd->bd_ail_st_list);
410 list_del(&bd->bd_ail_gl_list);
411 atomic_dec(&bd->bd_gl->gl_ail_count);
412 brelse(bh);
413 gfs2_log_unlock(sdp);
414 gfs2_trans_add_revoke(sdp, blkno);
415 } else
416 gfs2_log_unlock(sdp);
417 }
418
419 lock_buffer(bh); 347 lock_buffer(bh);
420 clear_buffer_dirty(bh); 348 gfs2_log_lock(sdp);
421 clear_buffer_uptodate(bh); 349 gfs2_remove_from_journal(bh, current->journal_info, 1);
350 gfs2_log_unlock(sdp);
422 unlock_buffer(bh); 351 unlock_buffer(bh);
423
424 brelse(bh); 352 brelse(bh);
425 } 353 }
426 354
@@ -446,10 +374,10 @@ void gfs2_meta_cache_flush(struct gfs2_inode *ip)
446 374
447 for (x = 0; x < GFS2_MAX_META_HEIGHT; x++) { 375 for (x = 0; x < GFS2_MAX_META_HEIGHT; x++) {
448 bh_slot = &ip->i_cache[x]; 376 bh_slot = &ip->i_cache[x];
449 if (!*bh_slot) 377 if (*bh_slot) {
450 break; 378 brelse(*bh_slot);
451 brelse(*bh_slot); 379 *bh_slot = NULL;
452 *bh_slot = NULL; 380 }
453 } 381 }
454 382
455 spin_unlock(&ip->i_spin); 383 spin_unlock(&ip->i_spin);
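
The gfs2_meta_cache_flush() hunk stops breaking out of the loop at the first empty i_cache slot and instead tests every slot, so cached buffers sitting beyond a hole in the array are released as well. The difference is easy to see in isolation; a tiny user-space sketch with a stand-in release() in place of brelse():

#include <stdio.h>

#define NSLOTS 4

static void release(int idx)
{
        printf("released slot %d\n", idx);
}

int main(void)
{
        void *cache[NSLOTS] = { (void *)1, NULL, (void *)1, (void *)1 };
        int x;

        /* Breaking at the first NULL (the old style) would skip slots 2
         * and 3 behind the hole at slot 1; testing each slot does not. */
        for (x = 0; x < NSLOTS; x++) {
                if (cache[x]) {
                        release(x);
                        cache[x] = NULL;
                }
        }
        return 0;
}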
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
index 527bf19d9690..b7048222ebb4 100644
--- a/fs/gfs2/meta_io.h
+++ b/fs/gfs2/meta_io.h
@@ -50,9 +50,9 @@ int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh);
50 50
51void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh, 51void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh,
52 int meta); 52 int meta);
53void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh); 53
54void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh, 54void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr,
55 struct gfs2_ail *ai); 55 int meta);
56 56
57void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen); 57void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen);
58 58
diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c
index 4864659555d4..b941f9f9f958 100644
--- a/fs/gfs2/mount.c
+++ b/fs/gfs2/mount.c
@@ -42,6 +42,7 @@ enum {
42 Opt_nosuiddir, 42 Opt_nosuiddir,
43 Opt_data_writeback, 43 Opt_data_writeback,
44 Opt_data_ordered, 44 Opt_data_ordered,
45 Opt_err,
45}; 46};
46 47
47static match_table_t tokens = { 48static match_table_t tokens = {
@@ -64,7 +65,8 @@ static match_table_t tokens = {
64 {Opt_suiddir, "suiddir"}, 65 {Opt_suiddir, "suiddir"},
65 {Opt_nosuiddir, "nosuiddir"}, 66 {Opt_nosuiddir, "nosuiddir"},
66 {Opt_data_writeback, "data=writeback"}, 67 {Opt_data_writeback, "data=writeback"},
67 {Opt_data_ordered, "data=ordered"} 68 {Opt_data_ordered, "data=ordered"},
69 {Opt_err, NULL}
68}; 70};
69 71
70/** 72/**
@@ -237,6 +239,7 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount)
237 case Opt_data_ordered: 239 case Opt_data_ordered:
238 args->ar_data = GFS2_DATA_ORDERED; 240 args->ar_data = GFS2_DATA_ORDERED;
239 break; 241 break;
242 case Opt_err:
240 default: 243 default:
241 fs_info(sdp, "unknown option: %s\n", o); 244 fs_info(sdp, "unknown option: %s\n", o);
242 error = -EINVAL; 245 error = -EINVAL;
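
The mount.c hunks add an Opt_err token, terminate the table with {Opt_err, NULL} and give the option switch an explicit Opt_err case. match_token() stops at the first entry whose pattern is NULL, so a table without that sentinel lets an unknown option walk past the end of the array. A sketch of the convention; the option names and parse_one() are invented, only the parser API is real:

#include <linux/parser.h>
#include <linux/errno.h>

enum { Opt_quiet, Opt_limit, Opt_err };

static match_table_t example_tokens = {
        {Opt_quiet, "quiet"},
        {Opt_limit, "limit=%d"},
        {Opt_err, NULL}                 /* sentinel: ends the table walk */
};

static int parse_one(char *p)
{
        substring_t args[MAX_OPT_ARGS];
        int limit;

        switch (match_token(p, example_tokens, args)) {
        case Opt_quiet:
                return 0;
        case Opt_limit:
                if (match_int(&args[0], &limit))
                        return -EINVAL;
                return 0;
        case Opt_err:
        default:
                return -EINVAL;         /* unknown option */
        }
}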
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index 42a5f58f6fca..873a511ef2be 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -90,7 +90,7 @@ static int gfs2_get_block_noalloc(struct inode *inode, sector_t lblock,
90 error = gfs2_block_map(inode, lblock, 0, bh_result); 90 error = gfs2_block_map(inode, lblock, 0, bh_result);
91 if (error) 91 if (error)
92 return error; 92 return error;
93 if (bh_result->b_blocknr == 0) 93 if (!buffer_mapped(bh_result))
94 return -EIO; 94 return -EIO;
95 return 0; 95 return 0;
96} 96}
@@ -414,7 +414,8 @@ static int gfs2_prepare_write(struct file *file, struct page *page,
414 if (ind_blocks || data_blocks) 414 if (ind_blocks || data_blocks)
415 rblocks += RES_STATFS + RES_QUOTA; 415 rblocks += RES_STATFS + RES_QUOTA;
416 416
417 error = gfs2_trans_begin(sdp, rblocks, 0); 417 error = gfs2_trans_begin(sdp, rblocks,
418 PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
418 if (error) 419 if (error)
419 goto out_trans_fail; 420 goto out_trans_fail;
420 421
@@ -616,58 +617,50 @@ static sector_t gfs2_bmap(struct address_space *mapping, sector_t lblock)
616 return dblock; 617 return dblock;
617} 618}
618 619
619static void discard_buffer(struct gfs2_sbd *sdp, struct buffer_head *bh) 620static void gfs2_discard(struct gfs2_sbd *sdp, struct buffer_head *bh)
620{ 621{
621 struct gfs2_bufdata *bd; 622 struct gfs2_bufdata *bd;
622 623
624 lock_buffer(bh);
623 gfs2_log_lock(sdp); 625 gfs2_log_lock(sdp);
626 clear_buffer_dirty(bh);
624 bd = bh->b_private; 627 bd = bh->b_private;
625 if (bd) { 628 if (bd) {
626 bd->bd_bh = NULL; 629 if (!list_empty(&bd->bd_le.le_list) && !buffer_pinned(bh))
627 bh->b_private = NULL; 630 list_del_init(&bd->bd_le.le_list);
628 if (!bd->bd_ail && list_empty(&bd->bd_le.le_list)) 631 else
629 kmem_cache_free(gfs2_bufdata_cachep, bd); 632 gfs2_remove_from_journal(bh, current->journal_info, 0);
630 } 633 }
631 gfs2_log_unlock(sdp);
632
633 lock_buffer(bh);
634 clear_buffer_dirty(bh);
635 bh->b_bdev = NULL; 634 bh->b_bdev = NULL;
636 clear_buffer_mapped(bh); 635 clear_buffer_mapped(bh);
637 clear_buffer_req(bh); 636 clear_buffer_req(bh);
638 clear_buffer_new(bh); 637 clear_buffer_new(bh);
639 clear_buffer_delay(bh); 638 gfs2_log_unlock(sdp);
640 unlock_buffer(bh); 639 unlock_buffer(bh);
641} 640}
642 641
643static void gfs2_invalidatepage(struct page *page, unsigned long offset) 642static void gfs2_invalidatepage(struct page *page, unsigned long offset)
644{ 643{
645 struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host); 644 struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
646 struct buffer_head *head, *bh, *next; 645 struct buffer_head *bh, *head;
647 unsigned int curr_off = 0; 646 unsigned long pos = 0;
648 647
649 BUG_ON(!PageLocked(page)); 648 BUG_ON(!PageLocked(page));
650 if (offset == 0) 649 if (offset == 0)
651 ClearPageChecked(page); 650 ClearPageChecked(page);
652 if (!page_has_buffers(page)) 651 if (!page_has_buffers(page))
653 return; 652 goto out;
654 653
655 bh = head = page_buffers(page); 654 bh = head = page_buffers(page);
656 do { 655 do {
657 unsigned int next_off = curr_off + bh->b_size; 656 if (offset <= pos)
658 next = bh->b_this_page; 657 gfs2_discard(sdp, bh);
659 658 pos += bh->b_size;
660 if (offset <= curr_off) 659 bh = bh->b_this_page;
661 discard_buffer(sdp, bh);
662
663 curr_off = next_off;
664 bh = next;
665 } while (bh != head); 660 } while (bh != head);
666 661out:
667 if (!offset) 662 if (offset == 0)
668 try_to_release_page(page, 0); 663 try_to_release_page(page, 0);
669
670 return;
671} 664}
672 665
673/** 666/**
@@ -736,59 +729,6 @@ out:
736} 729}
737 730
738/** 731/**
739 * stuck_releasepage - We're stuck in gfs2_releasepage(). Print stuff out.
740 * @bh: the buffer we're stuck on
741 *
742 */
743
744static void stuck_releasepage(struct buffer_head *bh)
745{
746 struct inode *inode = bh->b_page->mapping->host;
747 struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
748 struct gfs2_bufdata *bd = bh->b_private;
749 struct gfs2_glock *gl;
750static unsigned limit = 0;
751
752 if (limit > 3)
753 return;
754 limit++;
755
756 fs_warn(sdp, "stuck in gfs2_releasepage() %p\n", inode);
757 fs_warn(sdp, "blkno = %llu, bh->b_count = %d\n",
758 (unsigned long long)bh->b_blocknr, atomic_read(&bh->b_count));
759 fs_warn(sdp, "pinned = %u\n", buffer_pinned(bh));
760 fs_warn(sdp, "bh->b_private = %s\n", (bd) ? "!NULL" : "NULL");
761
762 if (!bd)
763 return;
764
765 gl = bd->bd_gl;
766
767 fs_warn(sdp, "gl = (%u, %llu)\n",
768 gl->gl_name.ln_type, (unsigned long long)gl->gl_name.ln_number);
769
770 fs_warn(sdp, "bd_list_tr = %s, bd_le.le_list = %s\n",
771 (list_empty(&bd->bd_list_tr)) ? "no" : "yes",
772 (list_empty(&bd->bd_le.le_list)) ? "no" : "yes");
773
774 if (gl->gl_ops == &gfs2_inode_glops) {
775 struct gfs2_inode *ip = gl->gl_object;
776 unsigned int x;
777
778 if (!ip)
779 return;
780
781 fs_warn(sdp, "ip = %llu %llu\n",
782 (unsigned long long)ip->i_no_formal_ino,
783 (unsigned long long)ip->i_no_addr);
784
785 for (x = 0; x < GFS2_MAX_META_HEIGHT; x++)
786 fs_warn(sdp, "ip->i_cache[%u] = %s\n",
787 x, (ip->i_cache[x]) ? "!NULL" : "NULL");
788 }
789}
790
791/**
792 * gfs2_releasepage - free the metadata associated with a page 732 * gfs2_releasepage - free the metadata associated with a page
793 * @page: the page that's being released 733 * @page: the page that's being released
794 * @gfp_mask: passed from Linux VFS, ignored by us 734 * @gfp_mask: passed from Linux VFS, ignored by us
@@ -805,41 +745,39 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask)
805 struct gfs2_sbd *sdp = aspace->i_sb->s_fs_info; 745 struct gfs2_sbd *sdp = aspace->i_sb->s_fs_info;
806 struct buffer_head *bh, *head; 746 struct buffer_head *bh, *head;
807 struct gfs2_bufdata *bd; 747 struct gfs2_bufdata *bd;
808 unsigned long t = jiffies + gfs2_tune_get(sdp, gt_stall_secs) * HZ;
809 748
810 if (!page_has_buffers(page)) 749 if (!page_has_buffers(page))
811 goto out; 750 return 0;
812 751
752 gfs2_log_lock(sdp);
813 head = bh = page_buffers(page); 753 head = bh = page_buffers(page);
814 do { 754 do {
815 while (atomic_read(&bh->b_count)) { 755 if (atomic_read(&bh->b_count))
816 if (!atomic_read(&aspace->i_writecount)) 756 goto cannot_release;
817 return 0; 757 bd = bh->b_private;
818 758 if (bd && bd->bd_ail)
819 if (!(gfp_mask & __GFP_WAIT)) 759 goto cannot_release;
820 return 0;
821
822 if (time_after_eq(jiffies, t)) {
823 stuck_releasepage(bh);
824 /* should we withdraw here? */
825 return 0;
826 }
827
828 yield();
829 }
830
831 gfs2_assert_warn(sdp, !buffer_pinned(bh)); 760 gfs2_assert_warn(sdp, !buffer_pinned(bh));
832 gfs2_assert_warn(sdp, !buffer_dirty(bh)); 761 gfs2_assert_warn(sdp, !buffer_dirty(bh));
762 bh = bh->b_this_page;
763 } while(bh != head);
764 gfs2_log_unlock(sdp);
833 765
766 head = bh = page_buffers(page);
767 do {
834 gfs2_log_lock(sdp); 768 gfs2_log_lock(sdp);
835 bd = bh->b_private; 769 bd = bh->b_private;
836 if (bd) { 770 if (bd) {
837 gfs2_assert_warn(sdp, bd->bd_bh == bh); 771 gfs2_assert_warn(sdp, bd->bd_bh == bh);
838 gfs2_assert_warn(sdp, list_empty(&bd->bd_list_tr)); 772 gfs2_assert_warn(sdp, list_empty(&bd->bd_list_tr));
839 gfs2_assert_warn(sdp, !bd->bd_ail); 773 if (!list_empty(&bd->bd_le.le_list)) {
840 bd->bd_bh = NULL; 774 if (!buffer_pinned(bh))
841 if (!list_empty(&bd->bd_le.le_list)) 775 list_del_init(&bd->bd_le.le_list);
842 bd = NULL; 776 else
777 bd = NULL;
778 }
779 if (bd)
780 bd->bd_bh = NULL;
843 bh->b_private = NULL; 781 bh->b_private = NULL;
844 } 782 }
845 gfs2_log_unlock(sdp); 783 gfs2_log_unlock(sdp);
@@ -849,8 +787,10 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask)
849 bh = bh->b_this_page; 787 bh = bh->b_this_page;
850 } while (bh != head); 788 } while (bh != head);
851 789
852out:
853 return try_to_free_buffers(page); 790 return try_to_free_buffers(page);
791cannot_release:
792 gfs2_log_unlock(sdp);
793 return 0;
854} 794}
855 795
856const struct address_space_operations gfs2_file_aops = { 796const struct address_space_operations gfs2_file_aops = {
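
The rewritten gfs2_invalidatepage() above walks the page's buffer ring directly, accumulating the byte offset as it goes, instead of precomputing next pointers and offsets. Buffer heads on a page form a circular list linked through b_this_page, so the canonical walk looks like the sketch below; the callback-style helper is hypothetical, not a GFS2 function:

#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/buffer_head.h>

/* Visit every buffer_head attached to a locked page; 'pos' is the byte
 * offset of the current buffer within the page. */
static void for_each_page_buffer(struct page *page,
                                 void (*fn)(struct buffer_head *bh,
                                            unsigned long pos))
{
        struct buffer_head *bh, *head;
        unsigned long pos = 0;

        BUG_ON(!PageLocked(page));
        if (!page_has_buffers(page))
                return;

        bh = head = page_buffers(page);
        do {
                fn(bh, pos);
                pos += bh->b_size;
                bh = bh->b_this_page;
        } while (bh != head);
}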
diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/ops_export.c
index b8312edee0e4..e2d1347796a9 100644
--- a/fs/gfs2/ops_export.c
+++ b/fs/gfs2/ops_export.c
@@ -237,7 +237,7 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb, void *inum_obj)
237 237
238 inode = gfs2_inode_lookup(sb, DT_UNKNOWN, 238 inode = gfs2_inode_lookup(sb, DT_UNKNOWN,
239 inum->no_addr, 239 inum->no_addr,
240 0); 240 0, 0);
241 if (!inode) 241 if (!inode)
242 goto fail; 242 goto fail;
243 if (IS_ERR(inode)) { 243 if (IS_ERR(inode)) {
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index 94d76ace0b95..46a9e10ff17b 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -571,7 +571,8 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl)
571 int error = 0; 571 int error = 0;
572 572
573 state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED; 573 state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED;
574 flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY) | GL_EXACT | GL_NOCACHE; 574 flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY) | GL_EXACT | GL_NOCACHE
575 | GL_FLOCK;
575 576
576 mutex_lock(&fp->f_fl_mutex); 577 mutex_lock(&fp->f_fl_mutex);
577 578
@@ -579,21 +580,19 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl)
579 if (gl) { 580 if (gl) {
580 if (fl_gh->gh_state == state) 581 if (fl_gh->gh_state == state)
581 goto out; 582 goto out;
582 gfs2_glock_hold(gl);
583 flock_lock_file_wait(file, 583 flock_lock_file_wait(file,
584 &(struct file_lock){.fl_type = F_UNLCK}); 584 &(struct file_lock){.fl_type = F_UNLCK});
585 gfs2_glock_dq_uninit(fl_gh); 585 gfs2_glock_dq_wait(fl_gh);
586 gfs2_holder_reinit(state, flags, fl_gh);
586 } else { 587 } else {
587 error = gfs2_glock_get(GFS2_SB(&ip->i_inode), 588 error = gfs2_glock_get(GFS2_SB(&ip->i_inode),
588 ip->i_no_addr, &gfs2_flock_glops, 589 ip->i_no_addr, &gfs2_flock_glops,
589 CREATE, &gl); 590 CREATE, &gl);
590 if (error) 591 if (error)
591 goto out; 592 goto out;
593 gfs2_holder_init(gl, state, flags, fl_gh);
594 gfs2_glock_put(gl);
592 } 595 }
593
594 gfs2_holder_init(gl, state, flags, fl_gh);
595 gfs2_glock_put(gl);
596
597 error = gfs2_glock_nq(fl_gh); 596 error = gfs2_glock_nq(fl_gh);
598 if (error) { 597 if (error) {
599 gfs2_holder_uninit(fl_gh); 598 gfs2_holder_uninit(fl_gh);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index cf5aa5050548..17de58e83d92 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -28,18 +28,18 @@
28#include "lm.h" 28#include "lm.h"
29#include "mount.h" 29#include "mount.h"
30#include "ops_fstype.h" 30#include "ops_fstype.h"
31#include "ops_dentry.h"
31#include "ops_super.h" 32#include "ops_super.h"
32#include "recovery.h" 33#include "recovery.h"
33#include "rgrp.h" 34#include "rgrp.h"
34#include "super.h" 35#include "super.h"
35#include "sys.h" 36#include "sys.h"
36#include "util.h" 37#include "util.h"
38#include "log.h"
37 39
38#define DO 0 40#define DO 0
39#define UNDO 1 41#define UNDO 1
40 42
41extern struct dentry_operations gfs2_dops;
42
43static struct gfs2_sbd *init_sbd(struct super_block *sb) 43static struct gfs2_sbd *init_sbd(struct super_block *sb)
44{ 44{
45 struct gfs2_sbd *sdp; 45 struct gfs2_sbd *sdp;
@@ -82,13 +82,15 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
82 INIT_LIST_HEAD(&sdp->sd_log_le_revoke); 82 INIT_LIST_HEAD(&sdp->sd_log_le_revoke);
83 INIT_LIST_HEAD(&sdp->sd_log_le_rg); 83 INIT_LIST_HEAD(&sdp->sd_log_le_rg);
84 INIT_LIST_HEAD(&sdp->sd_log_le_databuf); 84 INIT_LIST_HEAD(&sdp->sd_log_le_databuf);
85 INIT_LIST_HEAD(&sdp->sd_log_le_ordered);
85 86
86 mutex_init(&sdp->sd_log_reserve_mutex); 87 mutex_init(&sdp->sd_log_reserve_mutex);
87 INIT_LIST_HEAD(&sdp->sd_ail1_list); 88 INIT_LIST_HEAD(&sdp->sd_ail1_list);
88 INIT_LIST_HEAD(&sdp->sd_ail2_list); 89 INIT_LIST_HEAD(&sdp->sd_ail2_list);
89 90
90 init_rwsem(&sdp->sd_log_flush_lock); 91 init_rwsem(&sdp->sd_log_flush_lock);
91 INIT_LIST_HEAD(&sdp->sd_log_flush_list); 92 atomic_set(&sdp->sd_log_in_flight, 0);
93 init_waitqueue_head(&sdp->sd_log_flush_wait);
92 94
93 INIT_LIST_HEAD(&sdp->sd_revoke_list); 95 INIT_LIST_HEAD(&sdp->sd_revoke_list);
94 96
@@ -145,7 +147,8 @@ static int init_names(struct gfs2_sbd *sdp, int silent)
145 snprintf(sdp->sd_proto_name, GFS2_FSNAME_LEN, "%s", proto); 147 snprintf(sdp->sd_proto_name, GFS2_FSNAME_LEN, "%s", proto);
146 snprintf(sdp->sd_table_name, GFS2_FSNAME_LEN, "%s", table); 148 snprintf(sdp->sd_table_name, GFS2_FSNAME_LEN, "%s", table);
147 149
148 while ((table = strchr(sdp->sd_table_name, '/'))) 150 table = sdp->sd_table_name;
151 while ((table = strchr(table, '/')))
149 *table = '_'; 152 *table = '_';
150 153
151out: 154out:
@@ -161,14 +164,6 @@ static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh,
161 if (undo) 164 if (undo)
162 goto fail_trans; 165 goto fail_trans;
163 166
164 p = kthread_run(gfs2_scand, sdp, "gfs2_scand");
165 error = IS_ERR(p);
166 if (error) {
167 fs_err(sdp, "can't start scand thread: %d\n", error);
168 return error;
169 }
170 sdp->sd_scand_process = p;
171
172 for (sdp->sd_glockd_num = 0; 167 for (sdp->sd_glockd_num = 0;
173 sdp->sd_glockd_num < sdp->sd_args.ar_num_glockd; 168 sdp->sd_glockd_num < sdp->sd_args.ar_num_glockd;
174 sdp->sd_glockd_num++) { 169 sdp->sd_glockd_num++) {
@@ -229,14 +224,13 @@ fail:
229 while (sdp->sd_glockd_num--) 224 while (sdp->sd_glockd_num--)
230 kthread_stop(sdp->sd_glockd_process[sdp->sd_glockd_num]); 225 kthread_stop(sdp->sd_glockd_process[sdp->sd_glockd_num]);
231 226
232 kthread_stop(sdp->sd_scand_process);
233 return error; 227 return error;
234} 228}
235 229
236static inline struct inode *gfs2_lookup_root(struct super_block *sb, 230static inline struct inode *gfs2_lookup_root(struct super_block *sb,
237 u64 no_addr) 231 u64 no_addr)
238{ 232{
239 return gfs2_inode_lookup(sb, DT_DIR, no_addr, 0); 233 return gfs2_inode_lookup(sb, DT_DIR, no_addr, 0, 0);
240} 234}
241 235
242static int init_sb(struct gfs2_sbd *sdp, int silent, int undo) 236static int init_sb(struct gfs2_sbd *sdp, int silent, int undo)
@@ -301,8 +295,9 @@ static int init_sb(struct gfs2_sbd *sdp, int silent, int undo)
301 fs_err(sdp, "can't get root dentry\n"); 295 fs_err(sdp, "can't get root dentry\n");
302 error = -ENOMEM; 296 error = -ENOMEM;
303 iput(inode); 297 iput(inode);
304 } 298 } else
305 sb->s_root->d_op = &gfs2_dops; 299 sb->s_root->d_op = &gfs2_dops;
300
306out: 301out:
307 gfs2_glock_dq_uninit(&sb_gh); 302 gfs2_glock_dq_uninit(&sb_gh);
308 return error; 303 return error;
@@ -368,7 +363,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
368 363
369 ip = GFS2_I(sdp->sd_jdesc->jd_inode); 364 ip = GFS2_I(sdp->sd_jdesc->jd_inode);
370 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 365 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED,
371 LM_FLAG_NOEXP | GL_EXACT, 366 LM_FLAG_NOEXP | GL_EXACT | GL_NOCACHE,
372 &sdp->sd_jinode_gh); 367 &sdp->sd_jinode_gh);
373 if (error) { 368 if (error) {
374 fs_err(sdp, "can't acquire journal inode glock: %d\n", 369 fs_err(sdp, "can't acquire journal inode glock: %d\n",
@@ -818,7 +813,6 @@ static struct super_block* get_gfs2_sb(const char *dev_name)
818 struct nameidata nd; 813 struct nameidata nd;
819 struct file_system_type *fstype; 814 struct file_system_type *fstype;
820 struct super_block *sb = NULL, *s; 815 struct super_block *sb = NULL, *s;
821 struct list_head *l;
822 int error; 816 int error;
823 817
824 error = path_lookup(dev_name, LOOKUP_FOLLOW, &nd); 818 error = path_lookup(dev_name, LOOKUP_FOLLOW, &nd);
@@ -830,8 +824,7 @@ static struct super_block* get_gfs2_sb(const char *dev_name)
830 error = vfs_getattr(nd.mnt, nd.dentry, &stat); 824 error = vfs_getattr(nd.mnt, nd.dentry, &stat);
831 825
832 fstype = get_fs_type("gfs2"); 826 fstype = get_fs_type("gfs2");
833 list_for_each(l, &fstype->fs_supers) { 827 list_for_each_entry(s, &fstype->fs_supers, s_instances) {
834 s = list_entry(l, struct super_block, s_instances);
835 if ((S_ISBLK(stat.mode) && s->s_dev == stat.rdev) || 828 if ((S_ISBLK(stat.mode) && s->s_dev == stat.rdev) ||
836 (S_ISDIR(stat.mode) && s == nd.dentry->d_inode->i_sb)) { 829 (S_ISDIR(stat.mode) && s == nd.dentry->d_inode->i_sb)) {
837 sb = s; 830 sb = s;
@@ -861,7 +854,7 @@ static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags,
861 error = -ENOENT; 854 error = -ENOENT;
862 goto error; 855 goto error;
863 } 856 }
864 sdp = (struct gfs2_sbd*) sb->s_fs_info; 857 sdp = sb->s_fs_info;
865 if (sdp->sd_vfs_meta) { 858 if (sdp->sd_vfs_meta) {
866 printk(KERN_WARNING "GFS2: gfs2meta mount already exists\n"); 859 printk(KERN_WARNING "GFS2: gfs2meta mount already exists\n");
867 error = -EBUSY; 860 error = -EBUSY;
@@ -896,7 +889,10 @@ error:
896 889
897static void gfs2_kill_sb(struct super_block *sb) 890static void gfs2_kill_sb(struct super_block *sb)
898{ 891{
899 gfs2_delete_debugfs_file(sb->s_fs_info); 892 if (sb->s_fs_info) {
893 gfs2_delete_debugfs_file(sb->s_fs_info);
894 gfs2_meta_syncfs(sb->s_fs_info);
895 }
900 kill_block_super(sb); 896 kill_block_super(sb);
901} 897}
902 898
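
The get_gfs2_sb() hunk converts an open-coded list_for_each() plus list_entry() walk of fstype->fs_supers into list_for_each_entry(), dropping the throwaway cursor variable. A minimal sketch of the idiom with a made-up structure:

#include <linux/list.h>

struct item {
        int value;
        struct list_head link;
};

static struct item *find_item(struct list_head *items, int value)
{
        struct item *it;

        /* Equivalent to:
         *      struct list_head *l;
         *      list_for_each(l, items) {
         *              it = list_entry(l, struct item, link);
         *              ...
         *      }
         */
        list_for_each_entry(it, items, link) {
                if (it->value == value)
                        return it;
        }
        return NULL;
}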
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 911c115b5c6c..291f0c7eaa3b 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -69,7 +69,7 @@ static int gfs2_create(struct inode *dir, struct dentry *dentry,
69 mark_inode_dirty(inode); 69 mark_inode_dirty(inode);
70 break; 70 break;
71 } else if (PTR_ERR(inode) != -EEXIST || 71 } else if (PTR_ERR(inode) != -EEXIST ||
72 (nd->intent.open.flags & O_EXCL)) { 72 (nd && (nd->intent.open.flags & O_EXCL))) {
73 gfs2_holder_uninit(ghs); 73 gfs2_holder_uninit(ghs);
74 return PTR_ERR(inode); 74 return PTR_ERR(inode);
75 } 75 }
@@ -278,17 +278,25 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry)
278 gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + 2); 278 gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + 2);
279 279
280 280
281 error = gfs2_glock_nq_m(3, ghs); 281 error = gfs2_glock_nq(ghs); /* parent */
282 if (error) 282 if (error)
283 goto out; 283 goto out_parent;
284
285 error = gfs2_glock_nq(ghs + 1); /* child */
286 if (error)
287 goto out_child;
288
289 error = gfs2_glock_nq(ghs + 2); /* rgrp */
290 if (error)
291 goto out_rgrp;
284 292
285 error = gfs2_unlink_ok(dip, &dentry->d_name, ip); 293 error = gfs2_unlink_ok(dip, &dentry->d_name, ip);
286 if (error) 294 if (error)
287 goto out_gunlock; 295 goto out_rgrp;
288 296
289 error = gfs2_trans_begin(sdp, 2*RES_DINODE + RES_LEAF + RES_RG_BIT, 0); 297 error = gfs2_trans_begin(sdp, 2*RES_DINODE + RES_LEAF + RES_RG_BIT, 0);
290 if (error) 298 if (error)
291 goto out_gunlock; 299 goto out_rgrp;
292 300
293 error = gfs2_dir_del(dip, &dentry->d_name); 301 error = gfs2_dir_del(dip, &dentry->d_name);
294 if (error) 302 if (error)
@@ -298,12 +306,15 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry)
298 306
299out_end_trans: 307out_end_trans:
300 gfs2_trans_end(sdp); 308 gfs2_trans_end(sdp);
301out_gunlock: 309 gfs2_glock_dq(ghs + 2);
302 gfs2_glock_dq_m(3, ghs); 310out_rgrp:
303out:
304 gfs2_holder_uninit(ghs);
305 gfs2_holder_uninit(ghs + 1);
306 gfs2_holder_uninit(ghs + 2); 311 gfs2_holder_uninit(ghs + 2);
312 gfs2_glock_dq(ghs + 1);
313out_child:
314 gfs2_holder_uninit(ghs + 1);
315 gfs2_glock_dq(ghs);
316out_parent:
317 gfs2_holder_uninit(ghs);
307 gfs2_glock_dq_uninit(&ri_gh); 318 gfs2_glock_dq_uninit(&ri_gh);
308 return error; 319 return error;
309} 320}
@@ -894,12 +905,17 @@ static int gfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
894static int setattr_size(struct inode *inode, struct iattr *attr) 905static int setattr_size(struct inode *inode, struct iattr *attr)
895{ 906{
896 struct gfs2_inode *ip = GFS2_I(inode); 907 struct gfs2_inode *ip = GFS2_I(inode);
908 struct gfs2_sbd *sdp = GFS2_SB(inode);
897 int error; 909 int error;
898 910
899 if (attr->ia_size != ip->i_di.di_size) { 911 if (attr->ia_size != ip->i_di.di_size) {
900 error = vmtruncate(inode, attr->ia_size); 912 error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
901 if (error) 913 if (error)
902 return error; 914 return error;
915 error = vmtruncate(inode, attr->ia_size);
916 gfs2_trans_end(sdp);
917 if (error)
918 return error;
903 } 919 }
904 920
905 error = gfs2_truncatei(ip, attr->ia_size); 921 error = gfs2_truncatei(ip, attr->ia_size);
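
The gfs2_unlink() hunk replaces the single gfs2_glock_nq_m(3, ghs) call with three separate acquisitions and a goto ladder that, on failure, releases only the locks already held, in reverse order. A sketch of the same shape using plain mutexes, since mutex_lock_interruptible() can fail much like gfs2_glock_nq(); the function and its arguments are hypothetical:

#include <linux/mutex.h>

static int grab_three(struct mutex *parent, struct mutex *child,
                      struct mutex *rgrp)
{
        int error;

        error = mutex_lock_interruptible(parent);
        if (error)
                goto out;

        error = mutex_lock_interruptible(child);
        if (error)
                goto out_parent;

        error = mutex_lock_interruptible(rgrp);
        if (error)
                goto out_child;

        return 0;               /* caller releases all three later */

out_child:
        mutex_unlock(child);
out_parent:
        mutex_unlock(parent);
out:
        return error;
}

The benefit over an all-or-nothing helper is the same as in the hunk: each failure point unwinds exactly the locks taken so far, and the labels read in the reverse of acquisition order.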
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
index 603d940f1159..950f31460e8b 100644
--- a/fs/gfs2/ops_super.c
+++ b/fs/gfs2/ops_super.c
@@ -92,7 +92,6 @@ static void gfs2_put_super(struct super_block *sb)
92 kthread_stop(sdp->sd_recoverd_process); 92 kthread_stop(sdp->sd_recoverd_process);
93 while (sdp->sd_glockd_num--) 93 while (sdp->sd_glockd_num--)
94 kthread_stop(sdp->sd_glockd_process[sdp->sd_glockd_num]); 94 kthread_stop(sdp->sd_glockd_process[sdp->sd_glockd_num]);
95 kthread_stop(sdp->sd_scand_process);
96 95
97 if (!(sb->s_flags & MS_RDONLY)) { 96 if (!(sb->s_flags & MS_RDONLY)) {
98 error = gfs2_make_fs_ro(sdp); 97 error = gfs2_make_fs_ro(sdp);
@@ -456,12 +455,15 @@ static void gfs2_delete_inode(struct inode *inode)
456 } 455 }
457 456
458 error = gfs2_dinode_dealloc(ip); 457 error = gfs2_dinode_dealloc(ip);
459 /* 458 if (error)
460 * Must do this before unlock to avoid trying to write back 459 goto out_unlock;
461 * potentially dirty data now that inode no longer exists 460
462 * on disk. 461 error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
463 */ 462 if (error)
463 goto out_unlock;
464 /* Needs to be done before glock release & also in a transaction */
464 truncate_inode_pages(&inode->i_data, 0); 465 truncate_inode_pages(&inode->i_data, 0);
466 gfs2_trans_end(sdp);
465 467
466out_unlock: 468out_unlock:
467 gfs2_glock_dq(&ip->i_iopen_gh); 469 gfs2_glock_dq(&ip->i_iopen_gh);
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 6e546ee8f3d4..addb51e0f135 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -70,6 +70,7 @@ struct gfs2_quota_host {
70 u64 qu_limit; 70 u64 qu_limit;
71 u64 qu_warn; 71 u64 qu_warn;
72 s64 qu_value; 72 s64 qu_value;
73 u32 qu_ll_next;
73}; 74};
74 75
75struct gfs2_quota_change_host { 76struct gfs2_quota_change_host {
@@ -580,6 +581,7 @@ static void gfs2_quota_in(struct gfs2_quota_host *qu, const void *buf)
580 qu->qu_limit = be64_to_cpu(str->qu_limit); 581 qu->qu_limit = be64_to_cpu(str->qu_limit);
581 qu->qu_warn = be64_to_cpu(str->qu_warn); 582 qu->qu_warn = be64_to_cpu(str->qu_warn);
582 qu->qu_value = be64_to_cpu(str->qu_value); 583 qu->qu_value = be64_to_cpu(str->qu_value);
584 qu->qu_ll_next = be32_to_cpu(str->qu_ll_next);
583} 585}
584 586
585static void gfs2_quota_out(const struct gfs2_quota_host *qu, void *buf) 587static void gfs2_quota_out(const struct gfs2_quota_host *qu, void *buf)
@@ -589,6 +591,7 @@ static void gfs2_quota_out(const struct gfs2_quota_host *qu, void *buf)
589 str->qu_limit = cpu_to_be64(qu->qu_limit); 591 str->qu_limit = cpu_to_be64(qu->qu_limit);
590 str->qu_warn = cpu_to_be64(qu->qu_warn); 592 str->qu_warn = cpu_to_be64(qu->qu_warn);
591 str->qu_value = cpu_to_be64(qu->qu_value); 593 str->qu_value = cpu_to_be64(qu->qu_value);
594 str->qu_ll_next = cpu_to_be32(qu->qu_ll_next);
592 memset(&str->qu_reserved, 0, sizeof(str->qu_reserved)); 595 memset(&str->qu_reserved, 0, sizeof(str->qu_reserved));
593} 596}
594 597
@@ -614,6 +617,16 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
614 s64 value; 617 s64 value;
615 int err = -EIO; 618 int err = -EIO;
616 619
620 if (gfs2_is_stuffed(ip)) {
621 struct gfs2_alloc *al = NULL;
622 al = gfs2_alloc_get(ip);
623 /* just request 1 blk */
624 al->al_requested = 1;
625 gfs2_inplace_reserve(ip);
626 gfs2_unstuff_dinode(ip, NULL);
627 gfs2_inplace_release(ip);
628 gfs2_alloc_put(ip);
629 }
617 page = grab_cache_page(mapping, index); 630 page = grab_cache_page(mapping, index);
618 if (!page) 631 if (!page)
619 return -ENOMEM; 632 return -ENOMEM;
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 5ada38c99a2c..beb6c7ac0086 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -469,7 +469,7 @@ int gfs2_recover_journal(struct gfs2_jdesc *jd)
469 }; 469 };
470 470
471 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 471 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED,
472 LM_FLAG_NOEXP, &ji_gh); 472 LM_FLAG_NOEXP | GL_NOCACHE, &ji_gh);
473 if (error) 473 if (error)
474 goto fail_gunlock_j; 474 goto fail_gunlock_j;
475 } else { 475 } else {
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index ce48c4594ec8..708c287e1d0e 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -31,6 +31,7 @@
31#include "inode.h" 31#include "inode.h"
32 32
33#define BFITNOENT ((u32)~0) 33#define BFITNOENT ((u32)~0)
34#define NO_BLOCK ((u64)~0)
34 35
35/* 36/*
36 * These routines are used by the resource group routines (rgrp.c) 37 * These routines are used by the resource group routines (rgrp.c)
@@ -116,8 +117,7 @@ static unsigned char gfs2_testbit(struct gfs2_rgrpd *rgd, unsigned char *buffer,
116 * @buffer: the buffer that holds the bitmaps 117 * @buffer: the buffer that holds the bitmaps
117 * @buflen: the length (in bytes) of the buffer 118 * @buflen: the length (in bytes) of the buffer
118 * @goal: start search at this block's bit-pair (within @buffer) 119 * @goal: start search at this block's bit-pair (within @buffer)
119 * @old_state: GFS2_BLKST_XXX the state of the block we're looking for; 120 * @old_state: GFS2_BLKST_XXX the state of the block we're looking for.
120 * bit 0 = alloc(1)/free(0), bit 1 = meta(1)/data(0)
121 * 121 *
122 * Scope of @goal and returned block number is only within this bitmap buffer, 122 * Scope of @goal and returned block number is only within this bitmap buffer,
123 * not entire rgrp or filesystem. @buffer will be offset from the actual 123 * not entire rgrp or filesystem. @buffer will be offset from the actual
@@ -137,9 +137,13 @@ static u32 gfs2_bitfit(struct gfs2_rgrpd *rgd, unsigned char *buffer,
137 byte = buffer + (goal / GFS2_NBBY); 137 byte = buffer + (goal / GFS2_NBBY);
138 bit = (goal % GFS2_NBBY) * GFS2_BIT_SIZE; 138 bit = (goal % GFS2_NBBY) * GFS2_BIT_SIZE;
139 end = buffer + buflen; 139 end = buffer + buflen;
140 alloc = (old_state & 1) ? 0 : 0x55; 140 alloc = (old_state == GFS2_BLKST_FREE) ? 0x55 : 0;
141 141
142 while (byte < end) { 142 while (byte < end) {
143 /* If we're looking for a free block we can eliminate all
144 bitmap settings with 0x55, which represents four data
145 blocks in a row. If we're looking for a data block, we can
146 eliminate 0x00 which corresponds to four free blocks. */
143 if ((*byte & 0x55) == alloc) { 147 if ((*byte & 0x55) == alloc) {
144 blk += (8 - bit) >> 1; 148 blk += (8 - bit) >> 1;
145 149
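
The gfs2_bitfit() hunk switches the skip mask to a comparison against GFS2_BLKST_FREE and documents why 0x55 works: each block takes two bits, four blocks per byte, and 0x55 selects the low bit of all four fields. A small user-space illustration of that screening test; the state values mirror the GFS2_BLKST_* constants, while pack4() and the packing order are just for the demo:

#include <stdio.h>

/* Two bits per block, four blocks per bitmap byte. */
enum { BLKST_FREE = 0, BLKST_USED = 1, BLKST_UNLINKED = 2, BLKST_DINODE = 3 };

static unsigned char pack4(int a, int b, int c, int d)
{
        return a | (b << 2) | (c << 4) | (d << 6);
}

int main(void)
{
        /* 0x55 = 01010101: the low ("allocated") bit of all four fields. */
        unsigned char all_used = pack4(BLKST_USED, BLKST_USED,
                                       BLKST_USED, BLKST_USED);
        unsigned char mixed = pack4(BLKST_USED, BLKST_FREE,
                                    BLKST_DINODE, BLKST_USED);
        unsigned char all_free = pack4(BLKST_FREE, BLKST_FREE,
                                       BLKST_FREE, BLKST_FREE);

        /* Looking for a free block: skip bytes where every low bit is set. */
        printf("all_used skippable: %d\n", (all_used & 0x55) == 0x55); /* 1 */
        printf("mixed skippable:    %d\n", (mixed & 0x55) == 0x55);    /* 0 */

        /* Looking for allocated data: skip bytes that are entirely free. */
        printf("all_free skippable: %d\n", (all_free & 0x55) == 0);    /* 1 */
        return 0;
}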
@@ -859,23 +863,28 @@ static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al)
859static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked) 863static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked)
860{ 864{
861 struct inode *inode; 865 struct inode *inode;
862 u32 goal = 0; 866 u32 goal = 0, block;
863 u64 no_addr; 867 u64 no_addr;
868 struct gfs2_sbd *sdp = rgd->rd_sbd;
864 869
865 for(;;) { 870 for(;;) {
866 if (goal >= rgd->rd_data) 871 if (goal >= rgd->rd_data)
867 break; 872 break;
868 goal = rgblk_search(rgd, goal, GFS2_BLKST_UNLINKED, 873 down_write(&sdp->sd_log_flush_lock);
869 GFS2_BLKST_UNLINKED); 874 block = rgblk_search(rgd, goal, GFS2_BLKST_UNLINKED,
870 if (goal == BFITNOENT) 875 GFS2_BLKST_UNLINKED);
876 up_write(&sdp->sd_log_flush_lock);
877 if (block == BFITNOENT)
871 break; 878 break;
872 no_addr = goal + rgd->rd_data0; 879 /* rgblk_search can return a block < goal, so we need to
880 keep it marching forward. */
881 no_addr = block + rgd->rd_data0;
873 goal++; 882 goal++;
874 if (no_addr < *last_unlinked) 883 if (*last_unlinked != NO_BLOCK && no_addr <= *last_unlinked)
875 continue; 884 continue;
876 *last_unlinked = no_addr; 885 *last_unlinked = no_addr;
877 inode = gfs2_inode_lookup(rgd->rd_sbd->sd_vfs, DT_UNKNOWN, 886 inode = gfs2_inode_lookup(rgd->rd_sbd->sd_vfs, DT_UNKNOWN,
878 no_addr, -1); 887 no_addr, -1, 1);
879 if (!IS_ERR(inode)) 888 if (!IS_ERR(inode))
880 return inode; 889 return inode;
881 } 890 }
@@ -1152,7 +1161,7 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line)
1152 struct gfs2_alloc *al = &ip->i_alloc; 1161 struct gfs2_alloc *al = &ip->i_alloc;
1153 struct inode *inode; 1162 struct inode *inode;
1154 int error = 0; 1163 int error = 0;
1155 u64 last_unlinked = 0; 1164 u64 last_unlinked = NO_BLOCK;
1156 1165
1157 if (gfs2_assert_warn(sdp, al->al_requested)) 1166 if (gfs2_assert_warn(sdp, al->al_requested))
1158 return -EINVAL; 1167 return -EINVAL;
@@ -1289,7 +1298,9 @@ static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal,
1289 allocatable block anywhere else, we want to be able wrap around and 1298 allocatable block anywhere else, we want to be able wrap around and
1290 search in the first part of our first-searched bit block. */ 1299 search in the first part of our first-searched bit block. */
1291 for (x = 0; x <= length; x++) { 1300 for (x = 0; x <= length; x++) {
1292 if (bi->bi_clone) 1301 /* The GFS2_BLKST_UNLINKED state doesn't apply to the clone
1302 bitmaps, so we must search the originals for that. */
1303 if (old_state != GFS2_BLKST_UNLINKED && bi->bi_clone)
1293 blk = gfs2_bitfit(rgd, bi->bi_clone + bi->bi_offset, 1304 blk = gfs2_bitfit(rgd, bi->bi_clone + bi->bi_offset,
1294 bi->bi_len, goal, old_state); 1305 bi->bi_len, goal, old_state);
1295 else 1306 else
@@ -1305,9 +1316,7 @@ static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal,
1305 goal = 0; 1316 goal = 0;
1306 } 1317 }
1307 1318
1308 if (old_state != new_state) { 1319 if (blk != BFITNOENT && old_state != new_state) {
1309 gfs2_assert_withdraw(rgd->rd_sbd, blk != BFITNOENT);
1310
1311 gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1); 1320 gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1);
1312 gfs2_setbit(rgd, bi->bi_bh->b_data + bi->bi_offset, 1321 gfs2_setbit(rgd, bi->bi_bh->b_data + bi->bi_offset,
1313 bi->bi_len, blk, new_state); 1322 bi->bi_len, blk, new_state);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index f916b9740c75..dd3e737f528e 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -58,7 +58,6 @@ void gfs2_tune_init(struct gfs2_tune *gt)
58 gt->gt_incore_log_blocks = 1024; 58 gt->gt_incore_log_blocks = 1024;
59 gt->gt_log_flush_secs = 60; 59 gt->gt_log_flush_secs = 60;
60 gt->gt_jindex_refresh_secs = 60; 60 gt->gt_jindex_refresh_secs = 60;
61 gt->gt_scand_secs = 15;
62 gt->gt_recoverd_secs = 60; 61 gt->gt_recoverd_secs = 60;
63 gt->gt_logd_secs = 1; 62 gt->gt_logd_secs = 1;
64 gt->gt_quotad_secs = 5; 63 gt->gt_quotad_secs = 5;
@@ -160,18 +159,15 @@ int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent)
160} 159}
161 160
162 161
163static int end_bio_io_page(struct bio *bio, unsigned int bytes_done, int error) 162static void end_bio_io_page(struct bio *bio, int error)
164{ 163{
165 struct page *page = bio->bi_private; 164 struct page *page = bio->bi_private;
166 if (bio->bi_size)
167 return 1;
168 165
169 if (!error) 166 if (!error)
170 SetPageUptodate(page); 167 SetPageUptodate(page);
171 else 168 else
172 printk(KERN_WARNING "gfs2: error %d reading superblock\n", error); 169 printk(KERN_WARNING "gfs2: error %d reading superblock\n", error);
173 unlock_page(page); 170 unlock_page(page);
174 return 0;
175} 171}
176 172
177static void gfs2_sb_in(struct gfs2_sb_host *sb, const void *buf) 173static void gfs2_sb_in(struct gfs2_sb_host *sb, const void *buf)
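
The super.c hunk converts end_bio_io_page() to the new bi_end_io prototype: completion handlers now take only (struct bio *, int error), return void, and are called once for the whole bio, so the old "if (bio->bi_size) return 1;" partial-completion check disappears (the jfs, mpage and other hunks below are the same mechanical change). A sketch of a handler written against the new convention; whether the submitter or the handler drops the bio reference is a per-caller choice, here the submitter is assumed to call bio_put() after waiting:

#include <linux/bio.h>
#include <linux/pagemap.h>

/* New-style completion: no bytes_done argument, no return value,
 * invoked exactly once when the whole bio has finished. */
static void example_end_io(struct bio *bio, int error)
{
        struct page *page = bio->bi_private;

        if (!error && test_bit(BIO_UPTODATE, &bio->bi_flags))
                SetPageUptodate(page);
        else
                SetPageError(page);
        unlock_page(page);
}

A submitter would set bio->bi_private to the page and bio->bi_end_io to this function before submit_bio(), then wait on the page lock and drop its own bio reference.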
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index c26c21b53c19..06e0b7768d97 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -222,7 +222,6 @@ static struct kobj_type gfs2_ktype = {
222}; 222};
223 223
224static struct kset gfs2_kset = { 224static struct kset gfs2_kset = {
225 .kobj = {.name = "gfs2"},
226 .ktype = &gfs2_ktype, 225 .ktype = &gfs2_ktype,
227}; 226};
228 227
@@ -442,7 +441,6 @@ TUNE_ATTR(quota_simul_sync, 1);
442TUNE_ATTR(quota_cache_secs, 1); 441TUNE_ATTR(quota_cache_secs, 1);
443TUNE_ATTR(stall_secs, 1); 442TUNE_ATTR(stall_secs, 1);
444TUNE_ATTR(statfs_quantum, 1); 443TUNE_ATTR(statfs_quantum, 1);
445TUNE_ATTR_DAEMON(scand_secs, scand_process);
446TUNE_ATTR_DAEMON(recoverd_secs, recoverd_process); 444TUNE_ATTR_DAEMON(recoverd_secs, recoverd_process);
447TUNE_ATTR_DAEMON(logd_secs, logd_process); 445TUNE_ATTR_DAEMON(logd_secs, logd_process);
448TUNE_ATTR_DAEMON(quotad_secs, quotad_process); 446TUNE_ATTR_DAEMON(quotad_secs, quotad_process);
@@ -464,7 +462,6 @@ static struct attribute *tune_attrs[] = {
464 &tune_attr_quota_cache_secs.attr, 462 &tune_attr_quota_cache_secs.attr,
465 &tune_attr_stall_secs.attr, 463 &tune_attr_stall_secs.attr,
466 &tune_attr_statfs_quantum.attr, 464 &tune_attr_statfs_quantum.attr,
467 &tune_attr_scand_secs.attr,
468 &tune_attr_recoverd_secs.attr, 465 &tune_attr_recoverd_secs.attr,
469 &tune_attr_logd_secs.attr, 466 &tune_attr_logd_secs.attr,
470 &tune_attr_quotad_secs.attr, 467 &tune_attr_quotad_secs.attr,
@@ -553,6 +550,7 @@ int gfs2_sys_init(void)
553{ 550{
554 gfs2_sys_margs = NULL; 551 gfs2_sys_margs = NULL;
555 spin_lock_init(&gfs2_sys_margs_lock); 552 spin_lock_init(&gfs2_sys_margs_lock);
553 kobject_set_name(&gfs2_kset.kobj, "gfs2");
556 kobj_set_kset_s(&gfs2_kset, fs_subsys); 554 kobj_set_kset_s(&gfs2_kset, fs_subsys);
557 return kset_register(&gfs2_kset); 555 return kset_register(&gfs2_kset);
558} 556}
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index f8dabf8446bb..717983e2c2ae 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -142,25 +142,25 @@ void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta)
142 lops_add(sdp, &bd->bd_le); 142 lops_add(sdp, &bd->bd_le);
143} 143}
144 144
145void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, u64 blkno) 145void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
146{ 146{
147 struct gfs2_revoke *rv = kmalloc(sizeof(struct gfs2_revoke), 147 BUG_ON(!list_empty(&bd->bd_le.le_list));
148 GFP_NOFS | __GFP_NOFAIL); 148 BUG_ON(!list_empty(&bd->bd_ail_st_list));
149 lops_init_le(&rv->rv_le, &gfs2_revoke_lops); 149 BUG_ON(!list_empty(&bd->bd_ail_gl_list));
150 rv->rv_blkno = blkno; 150 lops_init_le(&bd->bd_le, &gfs2_revoke_lops);
151 lops_add(sdp, &rv->rv_le); 151 lops_add(sdp, &bd->bd_le);
152} 152}
153 153
154void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno) 154void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno)
155{ 155{
156 struct gfs2_revoke *rv; 156 struct gfs2_bufdata *bd;
157 int found = 0; 157 int found = 0;
158 158
159 gfs2_log_lock(sdp); 159 gfs2_log_lock(sdp);
160 160
161 list_for_each_entry(rv, &sdp->sd_log_le_revoke, rv_le.le_list) { 161 list_for_each_entry(bd, &sdp->sd_log_le_revoke, bd_le.le_list) {
162 if (rv->rv_blkno == blkno) { 162 if (bd->bd_blkno == blkno) {
163 list_del(&rv->rv_le.le_list); 163 list_del_init(&bd->bd_le.le_list);
164 gfs2_assert_withdraw(sdp, sdp->sd_log_num_revoke); 164 gfs2_assert_withdraw(sdp, sdp->sd_log_num_revoke);
165 sdp->sd_log_num_revoke--; 165 sdp->sd_log_num_revoke--;
166 found = 1; 166 found = 1;
@@ -172,7 +172,7 @@ void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno)
172 172
173 if (found) { 173 if (found) {
174 struct gfs2_trans *tr = current->journal_info; 174 struct gfs2_trans *tr = current->journal_info;
175 kfree(rv); 175 kmem_cache_free(gfs2_bufdata_cachep, bd);
176 tr->tr_num_revoke_rm++; 176 tr->tr_num_revoke_rm++;
177 } 177 }
178} 178}
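
The trans.c hunk reuses struct gfs2_bufdata for revokes and switches gfs2_trans_add_unrevoke() to list_del_init(); the BUG_ON(!list_empty(...)) checks added to gfs2_trans_add_revoke() only hold if earlier removals re-initialise the node rather than poisoning it. A small sketch of that distinction with a hypothetical structure:

#include <linux/kernel.h>
#include <linux/list.h>

struct node {
        int id;
        struct list_head link;
};

static void recycle(struct list_head *queue, struct node *n)
{
        /* list_del(&n->link) would leave the pointers set to
         * LIST_POISON1/2, so a later list_empty(&n->link) test is
         * meaningless.  list_del_init() leaves an empty, self-pointing
         * node that can be tested and re-added. */
        list_del_init(&n->link);

        BUG_ON(!list_empty(&n->link));  /* now guaranteed to hold */
        list_add_tail(&n->link, queue);
}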
diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h
index 23d4cbe1de5b..043d5f4b9c4c 100644
--- a/fs/gfs2/trans.h
+++ b/fs/gfs2/trans.h
@@ -32,7 +32,7 @@ void gfs2_trans_end(struct gfs2_sbd *sdp);
32 32
33void gfs2_trans_add_gl(struct gfs2_glock *gl); 33void gfs2_trans_add_gl(struct gfs2_glock *gl);
34void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta); 34void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta);
35void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, u64 blkno); 35void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd);
36void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno); 36void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno);
37void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd); 37void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd);
38 38
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index de3e4a506dbc..ccfd02944053 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -2162,7 +2162,7 @@ static void lbmStartIO(struct lbuf * bp)
2162 /* check if journaling to disk has been disabled */ 2162 /* check if journaling to disk has been disabled */
2163 if (log->no_integrity) { 2163 if (log->no_integrity) {
2164 bio->bi_size = 0; 2164 bio->bi_size = 0;
2165 lbmIODone(bio, 0, 0); 2165 lbmIODone(bio, 0);
2166 } else { 2166 } else {
2167 submit_bio(WRITE_SYNC, bio); 2167 submit_bio(WRITE_SYNC, bio);
2168 INCREMENT(lmStat.submitted); 2168 INCREMENT(lmStat.submitted);
@@ -2200,16 +2200,13 @@ static int lbmIOWait(struct lbuf * bp, int flag)
2200 * 2200 *
2201 * executed at INTIODONE level 2201 * executed at INTIODONE level
2202 */ 2202 */
2203static int lbmIODone(struct bio *bio, unsigned int bytes_done, int error) 2203static void lbmIODone(struct bio *bio, int error)
2204{ 2204{
2205 struct lbuf *bp = bio->bi_private; 2205 struct lbuf *bp = bio->bi_private;
2206 struct lbuf *nextbp, *tail; 2206 struct lbuf *nextbp, *tail;
2207 struct jfs_log *log; 2207 struct jfs_log *log;
2208 unsigned long flags; 2208 unsigned long flags;
2209 2209
2210 if (bio->bi_size)
2211 return 1;
2212
2213 /* 2210 /*
2214 * get back jfs buffer bound to the i/o buffer 2211 * get back jfs buffer bound to the i/o buffer
2215 */ 2212 */
@@ -2237,8 +2234,6 @@ static int lbmIODone(struct bio *bio, unsigned int bytes_done, int error)
2237 2234
2238 /* wakeup I/O initiator */ 2235 /* wakeup I/O initiator */
2239 LCACHE_WAKEUP(&bp->l_ioevent); 2236 LCACHE_WAKEUP(&bp->l_ioevent);
2240
2241 return 0;
2242 } 2237 }
2243 2238
2244 /* 2239 /*
@@ -2263,7 +2258,6 @@ static int lbmIODone(struct bio *bio, unsigned int bytes_done, int error)
2263 if (bp->l_flag & lbmDIRECT) { 2258 if (bp->l_flag & lbmDIRECT) {
2264 LCACHE_WAKEUP(&bp->l_ioevent); 2259 LCACHE_WAKEUP(&bp->l_ioevent);
2265 LCACHE_UNLOCK(flags); 2260 LCACHE_UNLOCK(flags);
2266 return 0;
2267 } 2261 }
2268 2262
2269 tail = log->wqueue; 2263 tail = log->wqueue;
@@ -2342,8 +2336,6 @@ static int lbmIODone(struct bio *bio, unsigned int bytes_done, int error)
2342 2336
2343 LCACHE_UNLOCK(flags); /* unlock+enable */ 2337 LCACHE_UNLOCK(flags); /* unlock+enable */
2344 } 2338 }
2345
2346 return 0;
2347} 2339}
2348 2340
2349int jfsIOWait(void *arg) 2341int jfsIOWait(void *arg)
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 62e96be02acf..941369c1ac8d 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -280,14 +280,10 @@ static void last_read_complete(struct page *page)
280 unlock_page(page); 280 unlock_page(page);
281} 281}
282 282
283static int metapage_read_end_io(struct bio *bio, unsigned int bytes_done, 283static void metapage_read_end_io(struct bio *bio, int err)
284 int err)
285{ 284{
286 struct page *page = bio->bi_private; 285 struct page *page = bio->bi_private;
287 286
288 if (bio->bi_size)
289 return 1;
290
291 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { 287 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
292 printk(KERN_ERR "metapage_read_end_io: I/O error\n"); 288 printk(KERN_ERR "metapage_read_end_io: I/O error\n");
293 SetPageError(page); 289 SetPageError(page);
@@ -295,8 +291,6 @@ static int metapage_read_end_io(struct bio *bio, unsigned int bytes_done,
295 291
296 dec_io(page, last_read_complete); 292 dec_io(page, last_read_complete);
297 bio_put(bio); 293 bio_put(bio);
298
299 return 0;
300} 294}
301 295
302static void remove_from_logsync(struct metapage *mp) 296static void remove_from_logsync(struct metapage *mp)
@@ -341,23 +335,18 @@ static void last_write_complete(struct page *page)
341 end_page_writeback(page); 335 end_page_writeback(page);
342} 336}
343 337
344static int metapage_write_end_io(struct bio *bio, unsigned int bytes_done, 338static void metapage_write_end_io(struct bio *bio, int err)
345 int err)
346{ 339{
347 struct page *page = bio->bi_private; 340 struct page *page = bio->bi_private;
348 341
349 BUG_ON(!PagePrivate(page)); 342 BUG_ON(!PagePrivate(page));
350 343
351 if (bio->bi_size)
352 return 1;
353
354 if (! test_bit(BIO_UPTODATE, &bio->bi_flags)) { 344 if (! test_bit(BIO_UPTODATE, &bio->bi_flags)) {
355 printk(KERN_ERR "metapage_write_end_io: I/O error\n"); 345 printk(KERN_ERR "metapage_write_end_io: I/O error\n");
356 SetPageError(page); 346 SetPageError(page);
357 } 347 }
358 dec_io(page, last_write_complete); 348 dec_io(page, last_write_complete);
359 bio_put(bio); 349 bio_put(bio);
360 return 0;
361} 350}
362 351
363static int metapage_writepage(struct page *page, struct writeback_control *wbc) 352static int metapage_writepage(struct page *page, struct writeback_control *wbc)
diff --git a/fs/mpage.c b/fs/mpage.c
index c1698f2291aa..b1c3e5890508 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -39,14 +39,11 @@
39 * status of that page is hard. See end_buffer_async_read() for the details. 39 * status of that page is hard. See end_buffer_async_read() for the details.
40 * There is no point in duplicating all that complexity. 40 * There is no point in duplicating all that complexity.
41 */ 41 */
42static int mpage_end_io_read(struct bio *bio, unsigned int bytes_done, int err) 42static void mpage_end_io_read(struct bio *bio, int err)
43{ 43{
44 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 44 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
45 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 45 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
46 46
47 if (bio->bi_size)
48 return 1;
49
50 do { 47 do {
51 struct page *page = bvec->bv_page; 48 struct page *page = bvec->bv_page;
52 49
@@ -62,17 +59,13 @@ static int mpage_end_io_read(struct bio *bio, unsigned int bytes_done, int err)
62 unlock_page(page); 59 unlock_page(page);
63 } while (bvec >= bio->bi_io_vec); 60 } while (bvec >= bio->bi_io_vec);
64 bio_put(bio); 61 bio_put(bio);
65 return 0;
66} 62}
67 63
68static int mpage_end_io_write(struct bio *bio, unsigned int bytes_done, int err) 64static void mpage_end_io_write(struct bio *bio, int err)
69{ 65{
70 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 66 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
71 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 67 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
72 68
73 if (bio->bi_size)
74 return 1;
75
76 do { 69 do {
77 struct page *page = bvec->bv_page; 70 struct page *page = bvec->bv_page;
78 71
@@ -87,7 +80,6 @@ static int mpage_end_io_write(struct bio *bio, unsigned int bytes_done, int err)
87 end_page_writeback(page); 80 end_page_writeback(page);
88 } while (bvec >= bio->bi_io_vec); 81 } while (bvec >= bio->bi_io_vec);
89 bio_put(bio); 82 bio_put(bio);
90 return 0;
91} 83}
92 84
93static struct bio *mpage_bio_submit(int rw, struct bio *bio) 85static struct bio *mpage_bio_submit(int rw, struct bio *bio)
diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog
index af4ef808fa94..345798ebd366 100644
--- a/fs/ntfs/ChangeLog
+++ b/fs/ntfs/ChangeLog
@@ -17,6 +17,18 @@ ToDo/Notes:
17 happen is unclear however so it is worth waiting until someone hits 17 happen is unclear however so it is worth waiting until someone hits
18 the problem. 18 the problem.
19 19
202.1.29 - Fix a deadlock at mount time.
21
22 - During mount the VFS holds s_umount lock on the superblock. So when
23 we try to empty the journal $LogFile contents by calling
24 ntfs_attr_set() when the machine does not have much memory and the
25 journal is large ntfs_attr_set() results in the VM trying to balance
 26 dirty pages which in turn tries to take the s_umount lock and thus we
27 get a deadlock. The solution is to not use ntfs_attr_set() and
28 instead do the zeroing by hand at the block level rather than page
29 cache level.
30 - Fix sparse warnings.
31
202.1.28 - Fix a deadlock. 322.1.28 - Fix a deadlock.
21 33
22 - Fix deadlock in fs/ntfs/inode.c::ntfs_put_inode(). Thanks to Sergey 34 - Fix deadlock in fs/ntfs/inode.c::ntfs_put_inode(). Thanks to Sergey
diff --git a/fs/ntfs/Makefile b/fs/ntfs/Makefile
index 825508385565..58b6be992544 100644
--- a/fs/ntfs/Makefile
+++ b/fs/ntfs/Makefile
@@ -6,7 +6,7 @@ ntfs-objs := aops.o attrib.o collate.o compress.o debug.o dir.o file.o \
6 index.o inode.o mft.o mst.o namei.o runlist.o super.o sysctl.o \ 6 index.o inode.o mft.o mst.o namei.o runlist.o super.o sysctl.o \
7 unistr.o upcase.o 7 unistr.o upcase.o
8 8
9EXTRA_CFLAGS = -DNTFS_VERSION=\"2.1.28\" 9EXTRA_CFLAGS = -DNTFS_VERSION=\"2.1.29\"
10 10
11ifeq ($(CONFIG_NTFS_DEBUG),y) 11ifeq ($(CONFIG_NTFS_DEBUG),y)
12EXTRA_CFLAGS += -DDEBUG 12EXTRA_CFLAGS += -DDEBUG
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index 6e5c2534f4bc..cfdc7900d271 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -2,7 +2,7 @@
2 * aops.c - NTFS kernel address space operations and page cache handling. 2 * aops.c - NTFS kernel address space operations and page cache handling.
3 * Part of the Linux-NTFS project. 3 * Part of the Linux-NTFS project.
4 * 4 *
5 * Copyright (c) 2001-2006 Anton Altaparmakov 5 * Copyright (c) 2001-2007 Anton Altaparmakov
6 * Copyright (c) 2002 Richard Russon 6 * Copyright (c) 2002 Richard Russon
7 * 7 *
8 * This program/include file is free software; you can redistribute it and/or 8 * This program/include file is free software; you can redistribute it and/or
@@ -396,7 +396,7 @@ static int ntfs_readpage(struct file *file, struct page *page)
396 loff_t i_size; 396 loff_t i_size;
397 struct inode *vi; 397 struct inode *vi;
398 ntfs_inode *ni, *base_ni; 398 ntfs_inode *ni, *base_ni;
399 u8 *kaddr; 399 u8 *addr;
400 ntfs_attr_search_ctx *ctx; 400 ntfs_attr_search_ctx *ctx;
401 MFT_RECORD *mrec; 401 MFT_RECORD *mrec;
402 unsigned long flags; 402 unsigned long flags;
@@ -491,15 +491,15 @@ retry_readpage:
491 /* Race with shrinking truncate. */ 491 /* Race with shrinking truncate. */
492 attr_len = i_size; 492 attr_len = i_size;
493 } 493 }
494 kaddr = kmap_atomic(page, KM_USER0); 494 addr = kmap_atomic(page, KM_USER0);
495 /* Copy the data to the page. */ 495 /* Copy the data to the page. */
496 memcpy(kaddr, (u8*)ctx->attr + 496 memcpy(addr, (u8*)ctx->attr +
497 le16_to_cpu(ctx->attr->data.resident.value_offset), 497 le16_to_cpu(ctx->attr->data.resident.value_offset),
498 attr_len); 498 attr_len);
499 /* Zero the remainder of the page. */ 499 /* Zero the remainder of the page. */
500 memset(kaddr + attr_len, 0, PAGE_CACHE_SIZE - attr_len); 500 memset(addr + attr_len, 0, PAGE_CACHE_SIZE - attr_len);
501 flush_dcache_page(page); 501 flush_dcache_page(page);
502 kunmap_atomic(kaddr, KM_USER0); 502 kunmap_atomic(addr, KM_USER0);
503put_unm_err_out: 503put_unm_err_out:
504 ntfs_attr_put_search_ctx(ctx); 504 ntfs_attr_put_search_ctx(ctx);
505unm_err_out: 505unm_err_out:
@@ -1344,7 +1344,7 @@ static int ntfs_writepage(struct page *page, struct writeback_control *wbc)
1344 loff_t i_size; 1344 loff_t i_size;
1345 struct inode *vi = page->mapping->host; 1345 struct inode *vi = page->mapping->host;
1346 ntfs_inode *base_ni = NULL, *ni = NTFS_I(vi); 1346 ntfs_inode *base_ni = NULL, *ni = NTFS_I(vi);
1347 char *kaddr; 1347 char *addr;
1348 ntfs_attr_search_ctx *ctx = NULL; 1348 ntfs_attr_search_ctx *ctx = NULL;
1349 MFT_RECORD *m = NULL; 1349 MFT_RECORD *m = NULL;
1350 u32 attr_len; 1350 u32 attr_len;
@@ -1484,14 +1484,14 @@ retry_writepage:
1484 /* Shrinking cannot fail. */ 1484 /* Shrinking cannot fail. */
1485 BUG_ON(err); 1485 BUG_ON(err);
1486 } 1486 }
1487 kaddr = kmap_atomic(page, KM_USER0); 1487 addr = kmap_atomic(page, KM_USER0);
1488 /* Copy the data from the page to the mft record. */ 1488 /* Copy the data from the page to the mft record. */
1489 memcpy((u8*)ctx->attr + 1489 memcpy((u8*)ctx->attr +
1490 le16_to_cpu(ctx->attr->data.resident.value_offset), 1490 le16_to_cpu(ctx->attr->data.resident.value_offset),
1491 kaddr, attr_len); 1491 addr, attr_len);
1492 /* Zero out of bounds area in the page cache page. */ 1492 /* Zero out of bounds area in the page cache page. */
1493 memset(kaddr + attr_len, 0, PAGE_CACHE_SIZE - attr_len); 1493 memset(addr + attr_len, 0, PAGE_CACHE_SIZE - attr_len);
1494 kunmap_atomic(kaddr, KM_USER0); 1494 kunmap_atomic(addr, KM_USER0);
1495 flush_dcache_page(page); 1495 flush_dcache_page(page);
1496 flush_dcache_mft_record_page(ctx->ntfs_ino); 1496 flush_dcache_mft_record_page(ctx->ntfs_ino);
1497 /* We are done with the page. */ 1497 /* We are done with the page. */
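
The aops.c hunks only rename kaddr to addr, but the surrounding code shows the usual resident-data copy: map the page atomically, memcpy the attribute value, zero the tail, flush, unmap. A sketch of that shape with the two-argument kmap_atomic()/kunmap_atomic() of this era; the helper and its data source are hypothetical:

#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>

/* Copy 'len' bytes of already-available data into a page cache page and
 * zero the remainder.  Nothing between map and unmap may sleep. */
static void fill_page_from_buffer(struct page *page, const void *src,
                                  unsigned int len)
{
        u8 *addr;

        BUG_ON(len > PAGE_CACHE_SIZE);
        addr = kmap_atomic(page, KM_USER0);
        memcpy(addr, src, len);
        memset(addr + len, 0, PAGE_CACHE_SIZE - len);
        flush_dcache_page(page);
        kunmap_atomic(addr, KM_USER0);
}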
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
index 1c08fefe487a..92dabdcf2b80 100644
--- a/fs/ntfs/attrib.c
+++ b/fs/ntfs/attrib.c
@@ -1,7 +1,7 @@
1/** 1/**
2 * attrib.c - NTFS attribute operations. Part of the Linux-NTFS project. 2 * attrib.c - NTFS attribute operations. Part of the Linux-NTFS project.
3 * 3 *
4 * Copyright (c) 2001-2006 Anton Altaparmakov 4 * Copyright (c) 2001-2007 Anton Altaparmakov
5 * Copyright (c) 2002 Richard Russon 5 * Copyright (c) 2002 Richard Russon
6 * 6 *
7 * This program/include file is free software; you can redistribute it and/or 7 * This program/include file is free software; you can redistribute it and/or
@@ -2500,7 +2500,7 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)
2500 struct page *page; 2500 struct page *page;
2501 u8 *kaddr; 2501 u8 *kaddr;
2502 pgoff_t idx, end; 2502 pgoff_t idx, end;
2503 unsigned int start_ofs, end_ofs, size; 2503 unsigned start_ofs, end_ofs, size;
2504 2504
2505 ntfs_debug("Entering for ofs 0x%llx, cnt 0x%llx, val 0x%hx.", 2505 ntfs_debug("Entering for ofs 0x%llx, cnt 0x%llx, val 0x%hx.",
2506 (long long)ofs, (long long)cnt, val); 2506 (long long)ofs, (long long)cnt, val);
@@ -2548,6 +2548,8 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)
2548 kunmap_atomic(kaddr, KM_USER0); 2548 kunmap_atomic(kaddr, KM_USER0);
2549 set_page_dirty(page); 2549 set_page_dirty(page);
2550 page_cache_release(page); 2550 page_cache_release(page);
2551 balance_dirty_pages_ratelimited(mapping);
2552 cond_resched();
2551 if (idx == end) 2553 if (idx == end)
2552 goto done; 2554 goto done;
2553 idx++; 2555 idx++;
@@ -2604,6 +2606,8 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)
2604 kunmap_atomic(kaddr, KM_USER0); 2606 kunmap_atomic(kaddr, KM_USER0);
2605 set_page_dirty(page); 2607 set_page_dirty(page);
2606 page_cache_release(page); 2608 page_cache_release(page);
2609 balance_dirty_pages_ratelimited(mapping);
2610 cond_resched();
2607 } 2611 }
2608done: 2612done:
2609 ntfs_debug("Done."); 2613 ntfs_debug("Done.");
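[Aside, not part of the patch: the two attrib.c hunks add throttling to the long page-fill loop in ntfs_attr_set(). A minimal sketch of the per-page tail of such a loop, assuming the usual 2.6.23-era page cache and writeback APIs; finish_one_page() is a hypothetical name.]

#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/sched.h>

/* Sketch only: finish one dirtied page of a long fill loop. */
static void finish_one_page(struct address_space *mapping, struct page *page)
{
	set_page_dirty(page);
	page_cache_release(page);
	/* New in the hunk above: throttle if too much memory is dirty... */
	balance_dirty_pages_ratelimited(mapping);
	/* ...and give other runnable tasks a chance between pages. */
	cond_resched();
}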
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index ffcc504a1667..c814204d4ea0 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * file.c - NTFS kernel file operations. Part of the Linux-NTFS project. 2 * file.c - NTFS kernel file operations. Part of the Linux-NTFS project.
3 * 3 *
4 * Copyright (c) 2001-2006 Anton Altaparmakov 4 * Copyright (c) 2001-2007 Anton Altaparmakov
5 * 5 *
6 * This program/include file is free software; you can redistribute it and/or 6 * This program/include file is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as published 7 * modify it under the terms of the GNU General Public License as published
@@ -26,7 +26,6 @@
26#include <linux/swap.h> 26#include <linux/swap.h>
27#include <linux/uio.h> 27#include <linux/uio.h>
28#include <linux/writeback.h> 28#include <linux/writeback.h>
29#include <linux/sched.h>
30 29
31#include <asm/page.h> 30#include <asm/page.h>
32#include <asm/uaccess.h> 31#include <asm/uaccess.h>
@@ -362,7 +361,7 @@ static inline void ntfs_fault_in_pages_readable(const char __user *uaddr,
362 volatile char c; 361 volatile char c;
363 362
364 /* Set @end to the first byte outside the last page we care about. */ 363 /* Set @end to the first byte outside the last page we care about. */
365 end = (const char __user*)PAGE_ALIGN((ptrdiff_t __user)uaddr + bytes); 364 end = (const char __user*)PAGE_ALIGN((unsigned long)uaddr + bytes);
366 365
367 while (!__get_user(c, uaddr) && (uaddr += PAGE_SIZE, uaddr < end)) 366 while (!__get_user(c, uaddr) && (uaddr += PAGE_SIZE, uaddr < end))
368 ; 367 ;
@@ -532,7 +531,8 @@ static int ntfs_prepare_pages_for_non_resident_write(struct page **pages,
532 blocksize_bits = vol->sb->s_blocksize_bits; 531 blocksize_bits = vol->sb->s_blocksize_bits;
533 u = 0; 532 u = 0;
534 do { 533 do {
535 struct page *page = pages[u]; 534 page = pages[u];
535 BUG_ON(!page);
536 /* 536 /*
537 * create_empty_buffers() will create uptodate/dirty buffers if 537 * create_empty_buffers() will create uptodate/dirty buffers if
538 * the page is uptodate/dirty. 538 * the page is uptodate/dirty.
@@ -1291,7 +1291,7 @@ static inline size_t ntfs_copy_from_user(struct page **pages,
1291 size_t bytes) 1291 size_t bytes)
1292{ 1292{
1293 struct page **last_page = pages + nr_pages; 1293 struct page **last_page = pages + nr_pages;
1294 char *kaddr; 1294 char *addr;
1295 size_t total = 0; 1295 size_t total = 0;
1296 unsigned len; 1296 unsigned len;
1297 int left; 1297 int left;
@@ -1300,13 +1300,13 @@ static inline size_t ntfs_copy_from_user(struct page **pages,
1300 len = PAGE_CACHE_SIZE - ofs; 1300 len = PAGE_CACHE_SIZE - ofs;
1301 if (len > bytes) 1301 if (len > bytes)
1302 len = bytes; 1302 len = bytes;
1303 kaddr = kmap_atomic(*pages, KM_USER0); 1303 addr = kmap_atomic(*pages, KM_USER0);
1304 left = __copy_from_user_inatomic(kaddr + ofs, buf, len); 1304 left = __copy_from_user_inatomic(addr + ofs, buf, len);
1305 kunmap_atomic(kaddr, KM_USER0); 1305 kunmap_atomic(addr, KM_USER0);
1306 if (unlikely(left)) { 1306 if (unlikely(left)) {
1307 /* Do it the slow way. */ 1307 /* Do it the slow way. */
1308 kaddr = kmap(*pages); 1308 addr = kmap(*pages);
1309 left = __copy_from_user(kaddr + ofs, buf, len); 1309 left = __copy_from_user(addr + ofs, buf, len);
1310 kunmap(*pages); 1310 kunmap(*pages);
1311 if (unlikely(left)) 1311 if (unlikely(left))
1312 goto err_out; 1312 goto err_out;
@@ -1408,26 +1408,26 @@ static inline size_t ntfs_copy_from_user_iovec(struct page **pages,
1408 size_t *iov_ofs, size_t bytes) 1408 size_t *iov_ofs, size_t bytes)
1409{ 1409{
1410 struct page **last_page = pages + nr_pages; 1410 struct page **last_page = pages + nr_pages;
1411 char *kaddr; 1411 char *addr;
1412 size_t copied, len, total = 0; 1412 size_t copied, len, total = 0;
1413 1413
1414 do { 1414 do {
1415 len = PAGE_CACHE_SIZE - ofs; 1415 len = PAGE_CACHE_SIZE - ofs;
1416 if (len > bytes) 1416 if (len > bytes)
1417 len = bytes; 1417 len = bytes;
1418 kaddr = kmap_atomic(*pages, KM_USER0); 1418 addr = kmap_atomic(*pages, KM_USER0);
1419 copied = __ntfs_copy_from_user_iovec_inatomic(kaddr + ofs, 1419 copied = __ntfs_copy_from_user_iovec_inatomic(addr + ofs,
1420 *iov, *iov_ofs, len); 1420 *iov, *iov_ofs, len);
1421 kunmap_atomic(kaddr, KM_USER0); 1421 kunmap_atomic(addr, KM_USER0);
1422 if (unlikely(copied != len)) { 1422 if (unlikely(copied != len)) {
1423 /* Do it the slow way. */ 1423 /* Do it the slow way. */
1424 kaddr = kmap(*pages); 1424 addr = kmap(*pages);
1425 copied = __ntfs_copy_from_user_iovec_inatomic(kaddr + ofs, 1425 copied = __ntfs_copy_from_user_iovec_inatomic(addr + ofs,
1426 *iov, *iov_ofs, len); 1426 *iov, *iov_ofs, len);
1427 /* 1427 /*
1428 * Zero the rest of the target like __copy_from_user(). 1428 * Zero the rest of the target like __copy_from_user().
1429 */ 1429 */
1430 memset(kaddr + ofs + copied, 0, len - copied); 1430 memset(addr + ofs + copied, 0, len - copied);
1431 kunmap(*pages); 1431 kunmap(*pages);
1432 if (unlikely(copied != len)) 1432 if (unlikely(copied != len))
1433 goto err_out; 1433 goto err_out;
@@ -1735,8 +1735,6 @@ static int ntfs_commit_pages_after_write(struct page **pages,
1735 read_unlock_irqrestore(&ni->size_lock, flags); 1735 read_unlock_irqrestore(&ni->size_lock, flags);
1736 BUG_ON(initialized_size != i_size); 1736 BUG_ON(initialized_size != i_size);
1737 if (end > initialized_size) { 1737 if (end > initialized_size) {
1738 unsigned long flags;
1739
1740 write_lock_irqsave(&ni->size_lock, flags); 1738 write_lock_irqsave(&ni->size_lock, flags);
1741 ni->initialized_size = end; 1739 ni->initialized_size = end;
1742 i_size_write(vi, end); 1740 i_size_write(vi, end);
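[Aside, not part of the patch: ntfs_copy_from_user() and its iovec variant above keep the same two-step strategy after the kaddr to addr rename: attempt the copy under an atomic kmap, where page faults are not allowed, and fall back to a sleeping kmap if the user buffer was not resident. A sketch of that strategy; copy_user_chunk() is a hypothetical helper name.]

#include <linux/highmem.h>
#include <linux/uaccess.h>

/* Sketch only: copy len user bytes into page at offset ofs, returning
 * the number of bytes actually copied. */
static size_t copy_user_chunk(struct page *page, unsigned int ofs,
			      const char __user *buf, size_t len)
{
	char *addr;
	size_t left;

	addr = kmap_atomic(page, KM_USER0);
	left = __copy_from_user_inatomic(addr + ofs, buf, len);
	kunmap_atomic(addr, KM_USER0);
	if (unlikely(left)) {
		/* Faulted: retry with a kmap that is allowed to sleep. */
		addr = kmap(page);
		left = __copy_from_user(addr + ofs, buf, len);
		kunmap(page);
	}
	return len - left;
}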
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index b532a730cec2..e9da092e2772 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -34,7 +34,6 @@
34#include "dir.h" 34#include "dir.h"
35#include "debug.h" 35#include "debug.h"
36#include "inode.h" 36#include "inode.h"
37#include "attrib.h"
38#include "lcnalloc.h" 37#include "lcnalloc.h"
39#include "malloc.h" 38#include "malloc.h"
40#include "mft.h" 39#include "mft.h"
@@ -2500,8 +2499,6 @@ retry_truncate:
2500 /* Resize the attribute record to best fit the new attribute size. */ 2499 /* Resize the attribute record to best fit the new attribute size. */
2501 if (new_size < vol->mft_record_size && 2500 if (new_size < vol->mft_record_size &&
2502 !ntfs_resident_attr_value_resize(m, a, new_size)) { 2501 !ntfs_resident_attr_value_resize(m, a, new_size)) {
2503 unsigned long flags;
2504
2505 /* The resize succeeded! */ 2502 /* The resize succeeded! */
2506 flush_dcache_mft_record_page(ctx->ntfs_ino); 2503 flush_dcache_mft_record_page(ctx->ntfs_ino);
2507 mark_mft_record_dirty(ctx->ntfs_ino); 2504 mark_mft_record_dirty(ctx->ntfs_ino);
diff --git a/fs/ntfs/logfile.c b/fs/ntfs/logfile.c
index acfed325f4ec..d7932e95b1fd 100644
--- a/fs/ntfs/logfile.c
+++ b/fs/ntfs/logfile.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * logfile.c - NTFS kernel journal handling. Part of the Linux-NTFS project. 2 * logfile.c - NTFS kernel journal handling. Part of the Linux-NTFS project.
3 * 3 *
4 * Copyright (c) 2002-2005 Anton Altaparmakov 4 * Copyright (c) 2002-2007 Anton Altaparmakov
5 * 5 *
6 * This program/include file is free software; you can redistribute it and/or 6 * This program/include file is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as published 7 * modify it under the terms of the GNU General Public License as published
@@ -724,24 +724,139 @@ bool ntfs_is_logfile_clean(struct inode *log_vi, const RESTART_PAGE_HEADER *rp)
724 */ 724 */
725bool ntfs_empty_logfile(struct inode *log_vi) 725bool ntfs_empty_logfile(struct inode *log_vi)
726{ 726{
727 ntfs_volume *vol = NTFS_SB(log_vi->i_sb); 727 VCN vcn, end_vcn;
728 ntfs_inode *log_ni = NTFS_I(log_vi);
729 ntfs_volume *vol = log_ni->vol;
730 struct super_block *sb = vol->sb;
731 runlist_element *rl;
732 unsigned long flags;
733 unsigned block_size, block_size_bits;
734 int err;
735 bool should_wait = true;
728 736
729 ntfs_debug("Entering."); 737 ntfs_debug("Entering.");
730 if (!NVolLogFileEmpty(vol)) { 738 if (NVolLogFileEmpty(vol)) {
731 int err; 739 ntfs_debug("Done.");
732 740 return true;
733 err = ntfs_attr_set(NTFS_I(log_vi), 0, i_size_read(log_vi),
734 0xff);
735 if (unlikely(err)) {
736 ntfs_error(vol->sb, "Failed to fill $LogFile with "
737 "0xff bytes (error code %i).", err);
738 return false;
739 }
740 /* Set the flag so we do not have to do it again on remount. */
741 NVolSetLogFileEmpty(vol);
742 } 741 }
742 /*
743 * We cannot use ntfs_attr_set() because we may be still in the middle
744 * of a mount operation. Thus we do the emptying by hand by first
745 * zapping the page cache pages for the $LogFile/$DATA attribute and
746 * then emptying each of the buffers in each of the clusters specified
747 * by the runlist by hand.
748 */
749 block_size = sb->s_blocksize;
750 block_size_bits = sb->s_blocksize_bits;
751 vcn = 0;
752 read_lock_irqsave(&log_ni->size_lock, flags);
753 end_vcn = (log_ni->initialized_size + vol->cluster_size_mask) >>
754 vol->cluster_size_bits;
755 read_unlock_irqrestore(&log_ni->size_lock, flags);
756 truncate_inode_pages(log_vi->i_mapping, 0);
757 down_write(&log_ni->runlist.lock);
758 rl = log_ni->runlist.rl;
759 if (unlikely(!rl || vcn < rl->vcn || !rl->length)) {
760map_vcn:
761 err = ntfs_map_runlist_nolock(log_ni, vcn, NULL);
762 if (err) {
763 ntfs_error(sb, "Failed to map runlist fragment (error "
764 "%d).", -err);
765 goto err;
766 }
767 rl = log_ni->runlist.rl;
768 BUG_ON(!rl || vcn < rl->vcn || !rl->length);
769 }
770 /* Seek to the runlist element containing @vcn. */
771 while (rl->length && vcn >= rl[1].vcn)
772 rl++;
773 do {
774 LCN lcn;
775 sector_t block, end_block;
776 s64 len;
777
778 /*
779 * If this run is not mapped map it now and start again as the
780 * runlist will have been updated.
781 */
782 lcn = rl->lcn;
783 if (unlikely(lcn == LCN_RL_NOT_MAPPED)) {
784 vcn = rl->vcn;
785 goto map_vcn;
786 }
787 /* If this run is not valid abort with an error. */
788 if (unlikely(!rl->length || lcn < LCN_HOLE))
789 goto rl_err;
790 /* Skip holes. */
791 if (lcn == LCN_HOLE)
792 continue;
793 block = lcn << vol->cluster_size_bits >> block_size_bits;
794 len = rl->length;
795 if (rl[1].vcn > end_vcn)
796 len = end_vcn - rl->vcn;
797 end_block = (lcn + len) << vol->cluster_size_bits >>
798 block_size_bits;
799 /* Iterate over the blocks in the run and empty them. */
800 do {
801 struct buffer_head *bh;
802
803 /* Obtain the buffer, possibly not uptodate. */
804 bh = sb_getblk(sb, block);
805 BUG_ON(!bh);
806 /* Setup buffer i/o submission. */
807 lock_buffer(bh);
808 bh->b_end_io = end_buffer_write_sync;
809 get_bh(bh);
810 /* Set the entire contents of the buffer to 0xff. */
811 memset(bh->b_data, -1, block_size);
812 if (!buffer_uptodate(bh))
813 set_buffer_uptodate(bh);
814 if (buffer_dirty(bh))
815 clear_buffer_dirty(bh);
816 /*
817 * Submit the buffer and wait for i/o to complete but
818 * only for the first buffer so we do not miss really
819 * serious i/o errors. Once the first buffer has
820 * completed ignore errors afterwards as we can assume
821 * that if one buffer worked all of them will work.
822 */
823 submit_bh(WRITE, bh);
824 if (should_wait) {
825 should_wait = false;
826 wait_on_buffer(bh);
827 if (unlikely(!buffer_uptodate(bh)))
828 goto io_err;
829 }
830 brelse(bh);
831 } while (++block < end_block);
832 } while ((++rl)->vcn < end_vcn);
833 up_write(&log_ni->runlist.lock);
834 /*
835 * Zap the pages again just in case any got instantiated whilst we were
836 * emptying the blocks by hand. FIXME: We may not have completed
837 * writing to all the buffer heads yet so this may happen too early.
838 * We really should use a kernel thread to do the emptying
839 * asynchronously and then we can also set the volume dirty and output
840 * an error message if emptying should fail.
841 */
842 truncate_inode_pages(log_vi->i_mapping, 0);
843 /* Set the flag so we do not have to do it again on remount. */
844 NVolSetLogFileEmpty(vol);
743 ntfs_debug("Done."); 845 ntfs_debug("Done.");
744 return true; 846 return true;
847io_err:
848 ntfs_error(sb, "Failed to write buffer. Unmount and run chkdsk.");
849 goto dirty_err;
850rl_err:
851 ntfs_error(sb, "Runlist is corrupt. Unmount and run chkdsk.");
852dirty_err:
853 NVolSetErrors(vol);
854 err = -EIO;
855err:
856 up_write(&log_ni->runlist.lock);
857 ntfs_error(sb, "Failed to fill $LogFile with 0xff bytes (error %d).",
858 -err);
859 return false;
745} 860}
746 861
747#endif /* NTFS_RW */ 862#endif /* NTFS_RW */
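[Aside, not part of the patch: the rewritten ntfs_empty_logfile() above fills $LogFile by hand with buffer heads because, per its comment, ntfs_attr_set() cannot be used in the middle of a mount. A sketch of the per-block step under the buffer-head API of that era; fill_block_with_ff() is a hypothetical name, and should_wait mirrors the function's decision to wait for, and error-check, only the first submitted buffer.]

#include <linux/types.h>
#include <linux/buffer_head.h>

/* Sketch only: fill one block with 0xff and submit it for write. */
static int fill_block_with_ff(struct super_block *sb, sector_t block,
			      unsigned int block_size, bool *should_wait)
{
	struct buffer_head *bh = sb_getblk(sb, block);

	if (!bh)
		return -ENOMEM;
	lock_buffer(bh);
	bh->b_end_io = end_buffer_write_sync;
	get_bh(bh);
	memset(bh->b_data, 0xff, block_size);
	set_buffer_uptodate(bh);
	clear_buffer_dirty(bh);
	submit_bh(WRITE, bh);
	if (*should_wait) {
		/* Only the first buffer is waited on; later errors are
		 * assumed to be as unlikely as the first one's. */
		*should_wait = false;
		wait_on_buffer(bh);
		if (!buffer_uptodate(bh)) {
			brelse(bh);
			return -EIO;
		}
	}
	brelse(bh);
	return 0;
}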
diff --git a/fs/ntfs/runlist.c b/fs/ntfs/runlist.c
index 9afd72c7ad0d..56a9a6d25a2a 100644
--- a/fs/ntfs/runlist.c
+++ b/fs/ntfs/runlist.c
@@ -1,7 +1,7 @@
1/** 1/**
2 * runlist.c - NTFS runlist handling code. Part of the Linux-NTFS project. 2 * runlist.c - NTFS runlist handling code. Part of the Linux-NTFS project.
3 * 3 *
4 * Copyright (c) 2001-2005 Anton Altaparmakov 4 * Copyright (c) 2001-2007 Anton Altaparmakov
5 * Copyright (c) 2002-2005 Richard Russon 5 * Copyright (c) 2002-2005 Richard Russon
6 * 6 *
7 * This program/include file is free software; you can redistribute it and/or 7 * This program/include file is free software; you can redistribute it and/or
@@ -1714,7 +1714,7 @@ extend_hole:
1714 sizeof(*rl)); 1714 sizeof(*rl));
1715 /* Adjust the beginning of the tail if necessary. */ 1715 /* Adjust the beginning of the tail if necessary. */
1716 if (end > rl->vcn) { 1716 if (end > rl->vcn) {
1717 s64 delta = end - rl->vcn; 1717 delta = end - rl->vcn;
1718 rl->vcn = end; 1718 rl->vcn = end;
1719 rl->length -= delta; 1719 rl->length -= delta;
1720 /* Only adjust the lcn if it is real. */ 1720 /* Only adjust the lcn if it is real. */
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 778a850b4634..4ba7f0bdc248 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -354,7 +354,6 @@ struct ocfs2_insert_type {
354 enum ocfs2_append_type ins_appending; 354 enum ocfs2_append_type ins_appending;
355 enum ocfs2_contig_type ins_contig; 355 enum ocfs2_contig_type ins_contig;
356 int ins_contig_index; 356 int ins_contig_index;
357 int ins_free_records;
358 int ins_tree_depth; 357 int ins_tree_depth;
359}; 358};
360 359
@@ -362,7 +361,6 @@ struct ocfs2_merge_ctxt {
362 enum ocfs2_contig_type c_contig_type; 361 enum ocfs2_contig_type c_contig_type;
363 int c_has_empty_extent; 362 int c_has_empty_extent;
364 int c_split_covers_rec; 363 int c_split_covers_rec;
365 int c_used_tail_recs;
366}; 364};
367 365
368/* 366/*
@@ -2808,36 +2806,28 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
2808 struct ocfs2_merge_ctxt *ctxt) 2806 struct ocfs2_merge_ctxt *ctxt)
2809 2807
2810{ 2808{
2811 int ret = 0, delete_tail_recs = 0; 2809 int ret = 0;
2812 struct ocfs2_extent_list *el = path_leaf_el(left_path); 2810 struct ocfs2_extent_list *el = path_leaf_el(left_path);
2813 struct ocfs2_extent_rec *rec = &el->l_recs[split_index]; 2811 struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
2814 2812
2815 BUG_ON(ctxt->c_contig_type == CONTIG_NONE); 2813 BUG_ON(ctxt->c_contig_type == CONTIG_NONE);
2816 2814
2817 if (ctxt->c_split_covers_rec) { 2815 if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) {
2818 delete_tail_recs++; 2816 /*
2819 2817 * The merge code will need to create an empty
2820 if (ctxt->c_contig_type == CONTIG_LEFTRIGHT || 2818 * extent to take the place of the newly
2821 ctxt->c_has_empty_extent) 2819 * emptied slot. Remove any pre-existing empty
2822 delete_tail_recs++; 2820 * extents - having more than one in a leaf is
2823 2821 * illegal.
2824 if (ctxt->c_has_empty_extent) { 2822 */
2825 /* 2823 ret = ocfs2_rotate_tree_left(inode, handle, left_path,
2826 * The merge code will need to create an empty 2824 dealloc);
2827 * extent to take the place of the newly 2825 if (ret) {
2828 * emptied slot. Remove any pre-existing empty 2826 mlog_errno(ret);
2829 * extents - having more than one in a leaf is 2827 goto out;
2830 * illegal.
2831 */
2832 ret = ocfs2_rotate_tree_left(inode, handle, left_path,
2833 dealloc);
2834 if (ret) {
2835 mlog_errno(ret);
2836 goto out;
2837 }
2838 split_index--;
2839 rec = &el->l_recs[split_index];
2840 } 2828 }
2829 split_index--;
2830 rec = &el->l_recs[split_index];
2841 } 2831 }
2842 2832
2843 if (ctxt->c_contig_type == CONTIG_LEFTRIGHT) { 2833 if (ctxt->c_contig_type == CONTIG_LEFTRIGHT) {
@@ -3593,6 +3583,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
3593 struct buffer_head *di_bh, 3583 struct buffer_head *di_bh,
3594 struct buffer_head **last_eb_bh, 3584 struct buffer_head **last_eb_bh,
3595 struct ocfs2_extent_rec *insert_rec, 3585 struct ocfs2_extent_rec *insert_rec,
3586 int *free_records,
3596 struct ocfs2_insert_type *insert) 3587 struct ocfs2_insert_type *insert)
3597{ 3588{
3598 int ret; 3589 int ret;
@@ -3633,7 +3624,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
3633 * XXX: This test is simplistic, we can search for empty 3624 * XXX: This test is simplistic, we can search for empty
3634 * extent records too. 3625 * extent records too.
3635 */ 3626 */
3636 insert->ins_free_records = le16_to_cpu(el->l_count) - 3627 *free_records = le16_to_cpu(el->l_count) -
3637 le16_to_cpu(el->l_next_free_rec); 3628 le16_to_cpu(el->l_next_free_rec);
3638 3629
3639 if (!insert->ins_tree_depth) { 3630 if (!insert->ins_tree_depth) {
@@ -3730,10 +3721,13 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
3730 struct ocfs2_alloc_context *meta_ac) 3721 struct ocfs2_alloc_context *meta_ac)
3731{ 3722{
3732 int status; 3723 int status;
3724 int uninitialized_var(free_records);
3733 struct buffer_head *last_eb_bh = NULL; 3725 struct buffer_head *last_eb_bh = NULL;
3734 struct ocfs2_insert_type insert = {0, }; 3726 struct ocfs2_insert_type insert = {0, };
3735 struct ocfs2_extent_rec rec; 3727 struct ocfs2_extent_rec rec;
3736 3728
3729 BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL);
3730
3737 mlog(0, "add %u clusters at position %u to inode %llu\n", 3731 mlog(0, "add %u clusters at position %u to inode %llu\n",
3738 new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno); 3732 new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno);
3739 3733
@@ -3752,7 +3746,7 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
3752 rec.e_flags = flags; 3746 rec.e_flags = flags;
3753 3747
3754 status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec, 3748 status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec,
3755 &insert); 3749 &free_records, &insert);
3756 if (status < 0) { 3750 if (status < 0) {
3757 mlog_errno(status); 3751 mlog_errno(status);
3758 goto bail; 3752 goto bail;
@@ -3762,9 +3756,9 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
3762 "Insert.contig_index: %d, Insert.free_records: %d, " 3756 "Insert.contig_index: %d, Insert.free_records: %d, "
3763 "Insert.tree_depth: %d\n", 3757 "Insert.tree_depth: %d\n",
3764 insert.ins_appending, insert.ins_contig, insert.ins_contig_index, 3758 insert.ins_appending, insert.ins_contig, insert.ins_contig_index,
3765 insert.ins_free_records, insert.ins_tree_depth); 3759 free_records, insert.ins_tree_depth);
3766 3760
3767 if (insert.ins_contig == CONTIG_NONE && insert.ins_free_records == 0) { 3761 if (insert.ins_contig == CONTIG_NONE && free_records == 0) {
3768 status = ocfs2_grow_tree(inode, handle, fe_bh, 3762 status = ocfs2_grow_tree(inode, handle, fe_bh,
3769 &insert.ins_tree_depth, &last_eb_bh, 3763 &insert.ins_tree_depth, &last_eb_bh,
3770 meta_ac); 3764 meta_ac);
@@ -3847,26 +3841,17 @@ leftright:
3847 3841
3848 if (le16_to_cpu(rightmost_el->l_next_free_rec) == 3842 if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
3849 le16_to_cpu(rightmost_el->l_count)) { 3843 le16_to_cpu(rightmost_el->l_count)) {
3850 int old_depth = depth;
3851
3852 ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, last_eb_bh, 3844 ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, last_eb_bh,
3853 meta_ac); 3845 meta_ac);
3854 if (ret) { 3846 if (ret) {
3855 mlog_errno(ret); 3847 mlog_errno(ret);
3856 goto out; 3848 goto out;
3857 } 3849 }
3858
3859 if (old_depth != depth) {
3860 eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
3861 rightmost_el = &eb->h_list;
3862 }
3863 } 3850 }
3864 3851
3865 memset(&insert, 0, sizeof(struct ocfs2_insert_type)); 3852 memset(&insert, 0, sizeof(struct ocfs2_insert_type));
3866 insert.ins_appending = APPEND_NONE; 3853 insert.ins_appending = APPEND_NONE;
3867 insert.ins_contig = CONTIG_NONE; 3854 insert.ins_contig = CONTIG_NONE;
3868 insert.ins_free_records = le16_to_cpu(rightmost_el->l_count)
3869 - le16_to_cpu(rightmost_el->l_next_free_rec);
3870 insert.ins_tree_depth = depth; 3855 insert.ins_tree_depth = depth;
3871 3856
3872 insert_range = le32_to_cpu(split_rec.e_cpos) + 3857 insert_range = le32_to_cpu(split_rec.e_cpos) +
@@ -4015,11 +4000,6 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
4015 } else 4000 } else
4016 rightmost_el = path_root_el(path); 4001 rightmost_el = path_root_el(path);
4017 4002
4018 ctxt.c_used_tail_recs = le16_to_cpu(rightmost_el->l_next_free_rec);
4019 if (ctxt.c_used_tail_recs > 0 &&
4020 ocfs2_is_empty_extent(&rightmost_el->l_recs[0]))
4021 ctxt.c_used_tail_recs--;
4022
4023 if (rec->e_cpos == split_rec->e_cpos && 4003 if (rec->e_cpos == split_rec->e_cpos &&
4024 rec->e_leaf_clusters == split_rec->e_leaf_clusters) 4004 rec->e_leaf_clusters == split_rec->e_leaf_clusters)
4025 ctxt.c_split_covers_rec = 1; 4005 ctxt.c_split_covers_rec = 1;
@@ -4028,10 +4008,9 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
4028 4008
4029 ctxt.c_has_empty_extent = ocfs2_is_empty_extent(&el->l_recs[0]); 4009 ctxt.c_has_empty_extent = ocfs2_is_empty_extent(&el->l_recs[0]);
4030 4010
4031 mlog(0, "index: %d, contig: %u, used_tail_recs: %u, " 4011 mlog(0, "index: %d, contig: %u, has_empty: %u, split_covers: %u\n",
4032 "has_empty: %u, split_covers: %u\n", split_index, 4012 split_index, ctxt.c_contig_type, ctxt.c_has_empty_extent,
4033 ctxt.c_contig_type, ctxt.c_used_tail_recs, 4013 ctxt.c_split_covers_rec);
4034 ctxt.c_has_empty_extent, ctxt.c_split_covers_rec);
4035 4014
4036 if (ctxt.c_contig_type == CONTIG_NONE) { 4015 if (ctxt.c_contig_type == CONTIG_NONE) {
4037 if (ctxt.c_split_covers_rec) 4016 if (ctxt.c_split_covers_rec)
@@ -4180,27 +4159,18 @@ static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh,
4180 4159
4181 if (le16_to_cpu(rightmost_el->l_next_free_rec) == 4160 if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
4182 le16_to_cpu(rightmost_el->l_count)) { 4161 le16_to_cpu(rightmost_el->l_count)) {
4183 int old_depth = depth;
4184
4185 ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, &last_eb_bh, 4162 ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, &last_eb_bh,
4186 meta_ac); 4163 meta_ac);
4187 if (ret) { 4164 if (ret) {
4188 mlog_errno(ret); 4165 mlog_errno(ret);
4189 goto out; 4166 goto out;
4190 } 4167 }
4191
4192 if (old_depth != depth) {
4193 eb = (struct ocfs2_extent_block *)last_eb_bh->b_data;
4194 rightmost_el = &eb->h_list;
4195 }
4196 } 4168 }
4197 4169
4198 memset(&insert, 0, sizeof(struct ocfs2_insert_type)); 4170 memset(&insert, 0, sizeof(struct ocfs2_insert_type));
4199 insert.ins_appending = APPEND_NONE; 4171 insert.ins_appending = APPEND_NONE;
4200 insert.ins_contig = CONTIG_NONE; 4172 insert.ins_contig = CONTIG_NONE;
4201 insert.ins_split = SPLIT_RIGHT; 4173 insert.ins_split = SPLIT_RIGHT;
4202 insert.ins_free_records = le16_to_cpu(rightmost_el->l_count)
4203 - le16_to_cpu(rightmost_el->l_next_free_rec);
4204 insert.ins_tree_depth = depth; 4174 insert.ins_tree_depth = depth;
4205 4175
4206 ret = ocfs2_do_insert_extent(inode, handle, di_bh, &split_rec, &insert); 4176 ret = ocfs2_do_insert_extent(inode, handle, di_bh, &split_rec, &insert);
@@ -5665,12 +5635,50 @@ static int ocfs2_ordered_zero_func(handle_t *handle, struct buffer_head *bh)
5665 return ocfs2_journal_dirty_data(handle, bh); 5635 return ocfs2_journal_dirty_data(handle, bh);
5666} 5636}
5667 5637
5638static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
5639 unsigned int from, unsigned int to,
5640 struct page *page, int zero, u64 *phys)
5641{
5642 int ret, partial = 0;
5643
5644 ret = ocfs2_map_page_blocks(page, phys, inode, from, to, 0);
5645 if (ret)
5646 mlog_errno(ret);
5647
5648 if (zero)
5649 zero_user_page(page, from, to - from, KM_USER0);
5650
5651 /*
5652 * Need to set the buffers we zero'd into uptodate
5653 * here if they aren't - ocfs2_map_page_blocks()
5654 * might've skipped some
5655 */
5656 if (ocfs2_should_order_data(inode)) {
5657 ret = walk_page_buffers(handle,
5658 page_buffers(page),
5659 from, to, &partial,
5660 ocfs2_ordered_zero_func);
5661 if (ret < 0)
5662 mlog_errno(ret);
5663 } else {
5664 ret = walk_page_buffers(handle, page_buffers(page),
5665 from, to, &partial,
5666 ocfs2_writeback_zero_func);
5667 if (ret < 0)
5668 mlog_errno(ret);
5669 }
5670
5671 if (!partial)
5672 SetPageUptodate(page);
5673
5674 flush_dcache_page(page);
5675}
5676
5668static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start, 5677static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start,
5669 loff_t end, struct page **pages, 5678 loff_t end, struct page **pages,
5670 int numpages, u64 phys, handle_t *handle) 5679 int numpages, u64 phys, handle_t *handle)
5671{ 5680{
5672 int i, ret, partial = 0; 5681 int i;
5673 void *kaddr;
5674 struct page *page; 5682 struct page *page;
5675 unsigned int from, to = PAGE_CACHE_SIZE; 5683 unsigned int from, to = PAGE_CACHE_SIZE;
5676 struct super_block *sb = inode->i_sb; 5684 struct super_block *sb = inode->i_sb;
@@ -5691,87 +5699,31 @@ static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start,
5691 BUG_ON(from > PAGE_CACHE_SIZE); 5699 BUG_ON(from > PAGE_CACHE_SIZE);
5692 BUG_ON(to > PAGE_CACHE_SIZE); 5700 BUG_ON(to > PAGE_CACHE_SIZE);
5693 5701
5694 ret = ocfs2_map_page_blocks(page, &phys, inode, from, to, 0); 5702 ocfs2_map_and_dirty_page(inode, handle, from, to, page, 1,
5695 if (ret) 5703 &phys);
5696 mlog_errno(ret);
5697
5698 kaddr = kmap_atomic(page, KM_USER0);
5699 memset(kaddr + from, 0, to - from);
5700 kunmap_atomic(kaddr, KM_USER0);
5701
5702 /*
5703 * Need to set the buffers we zero'd into uptodate
5704 * here if they aren't - ocfs2_map_page_blocks()
5705 * might've skipped some
5706 */
5707 if (ocfs2_should_order_data(inode)) {
5708 ret = walk_page_buffers(handle,
5709 page_buffers(page),
5710 from, to, &partial,
5711 ocfs2_ordered_zero_func);
5712 if (ret < 0)
5713 mlog_errno(ret);
5714 } else {
5715 ret = walk_page_buffers(handle, page_buffers(page),
5716 from, to, &partial,
5717 ocfs2_writeback_zero_func);
5718 if (ret < 0)
5719 mlog_errno(ret);
5720 }
5721
5722 if (!partial)
5723 SetPageUptodate(page);
5724
5725 flush_dcache_page(page);
5726 5704
5727 start = (page->index + 1) << PAGE_CACHE_SHIFT; 5705 start = (page->index + 1) << PAGE_CACHE_SHIFT;
5728 } 5706 }
5729out: 5707out:
5730 if (pages) { 5708 if (pages)
5731 for (i = 0; i < numpages; i++) { 5709 ocfs2_unlock_and_free_pages(pages, numpages);
5732 page = pages[i];
5733 unlock_page(page);
5734 mark_page_accessed(page);
5735 page_cache_release(page);
5736 }
5737 }
5738} 5710}
5739 5711
5740static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end, 5712static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
5741 struct page **pages, int *num, u64 *phys) 5713 struct page **pages, int *num)
5742{ 5714{
5743 int i, numpages = 0, ret = 0; 5715 int numpages, ret = 0;
5744 unsigned int ext_flags;
5745 struct super_block *sb = inode->i_sb; 5716 struct super_block *sb = inode->i_sb;
5746 struct address_space *mapping = inode->i_mapping; 5717 struct address_space *mapping = inode->i_mapping;
5747 unsigned long index; 5718 unsigned long index;
5748 loff_t last_page_bytes; 5719 loff_t last_page_bytes;
5749 5720
5750 BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
5751 BUG_ON(start > end); 5721 BUG_ON(start > end);
5752 5722
5753 if (start == end)
5754 goto out;
5755
5756 BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits != 5723 BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits !=
5757 (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits); 5724 (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits);
5758 5725
5759 ret = ocfs2_extent_map_get_blocks(inode, start >> sb->s_blocksize_bits, 5726 numpages = 0;
5760 phys, NULL, &ext_flags);
5761 if (ret) {
5762 mlog_errno(ret);
5763 goto out;
5764 }
5765
5766 /* Tail is a hole. */
5767 if (*phys == 0)
5768 goto out;
5769
5770 /* Tail is marked as unwritten, we can count on write to zero
5771 * in that case. */
5772 if (ext_flags & OCFS2_EXT_UNWRITTEN)
5773 goto out;
5774
5775 last_page_bytes = PAGE_ALIGN(end); 5727 last_page_bytes = PAGE_ALIGN(end);
5776 index = start >> PAGE_CACHE_SHIFT; 5728 index = start >> PAGE_CACHE_SHIFT;
5777 do { 5729 do {
@@ -5788,14 +5740,8 @@ static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
5788 5740
5789out: 5741out:
5790 if (ret != 0) { 5742 if (ret != 0) {
5791 if (pages) { 5743 if (pages)
5792 for (i = 0; i < numpages; i++) { 5744 ocfs2_unlock_and_free_pages(pages, numpages);
5793 if (pages[i]) {
5794 unlock_page(pages[i]);
5795 page_cache_release(pages[i]);
5796 }
5797 }
5798 }
5799 numpages = 0; 5745 numpages = 0;
5800 } 5746 }
5801 5747
@@ -5816,18 +5762,20 @@ out:
5816int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle, 5762int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
5817 u64 range_start, u64 range_end) 5763 u64 range_start, u64 range_end)
5818{ 5764{
5819 int ret, numpages; 5765 int ret = 0, numpages;
5820 struct page **pages = NULL; 5766 struct page **pages = NULL;
5821 u64 phys; 5767 u64 phys;
5768 unsigned int ext_flags;
5769 struct super_block *sb = inode->i_sb;
5822 5770
5823 /* 5771 /*
5824 * File systems which don't support sparse files zero on every 5772 * File systems which don't support sparse files zero on every
5825 * extend. 5773 * extend.
5826 */ 5774 */
5827 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) 5775 if (!ocfs2_sparse_alloc(OCFS2_SB(sb)))
5828 return 0; 5776 return 0;
5829 5777
5830 pages = kcalloc(ocfs2_pages_per_cluster(inode->i_sb), 5778 pages = kcalloc(ocfs2_pages_per_cluster(sb),
5831 sizeof(struct page *), GFP_NOFS); 5779 sizeof(struct page *), GFP_NOFS);
5832 if (pages == NULL) { 5780 if (pages == NULL) {
5833 ret = -ENOMEM; 5781 ret = -ENOMEM;
@@ -5835,16 +5783,31 @@ int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
5835 goto out; 5783 goto out;
5836 } 5784 }
5837 5785
5838 ret = ocfs2_grab_eof_pages(inode, range_start, range_end, pages, 5786 if (range_start == range_end)
5839 &numpages, &phys); 5787 goto out;
5788
5789 ret = ocfs2_extent_map_get_blocks(inode,
5790 range_start >> sb->s_blocksize_bits,
5791 &phys, NULL, &ext_flags);
5840 if (ret) { 5792 if (ret) {
5841 mlog_errno(ret); 5793 mlog_errno(ret);
5842 goto out; 5794 goto out;
5843 } 5795 }
5844 5796
5845 if (numpages == 0) 5797 /*
5798 * Tail is a hole, or is marked unwritten. In either case, we
 5799	 * can count on read and write to return/push zeros.

5800 */
5801 if (phys == 0 || ext_flags & OCFS2_EXT_UNWRITTEN)
5846 goto out; 5802 goto out;
5847 5803
5804 ret = ocfs2_grab_eof_pages(inode, range_start, range_end, pages,
5805 &numpages);
5806 if (ret) {
5807 mlog_errno(ret);
5808 goto out;
5809 }
5810
5848 ocfs2_zero_cluster_pages(inode, range_start, range_end, pages, 5811 ocfs2_zero_cluster_pages(inode, range_start, range_end, pages,
5849 numpages, phys, handle); 5812 numpages, phys, handle);
5850 5813
@@ -5865,6 +5828,178 @@ out:
5865 return ret; 5828 return ret;
5866} 5829}
5867 5830
5831static void ocfs2_zero_dinode_id2(struct inode *inode, struct ocfs2_dinode *di)
5832{
5833 unsigned int blocksize = 1 << inode->i_sb->s_blocksize_bits;
5834
5835 memset(&di->id2, 0, blocksize - offsetof(struct ocfs2_dinode, id2));
5836}
5837
5838void ocfs2_dinode_new_extent_list(struct inode *inode,
5839 struct ocfs2_dinode *di)
5840{
5841 ocfs2_zero_dinode_id2(inode, di);
5842 di->id2.i_list.l_tree_depth = 0;
5843 di->id2.i_list.l_next_free_rec = 0;
5844 di->id2.i_list.l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(inode->i_sb));
5845}
5846
5847void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di)
5848{
5849 struct ocfs2_inode_info *oi = OCFS2_I(inode);
5850 struct ocfs2_inline_data *idata = &di->id2.i_data;
5851
5852 spin_lock(&oi->ip_lock);
5853 oi->ip_dyn_features |= OCFS2_INLINE_DATA_FL;
5854 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
5855 spin_unlock(&oi->ip_lock);
5856
5857 /*
5858 * We clear the entire i_data structure here so that all
5859 * fields can be properly initialized.
5860 */
5861 ocfs2_zero_dinode_id2(inode, di);
5862
5863 idata->id_count = cpu_to_le16(ocfs2_max_inline_data(inode->i_sb));
5864}
5865
5866int ocfs2_convert_inline_data_to_extents(struct inode *inode,
5867 struct buffer_head *di_bh)
5868{
5869 int ret, i, has_data, num_pages = 0;
5870 handle_t *handle;
5871 u64 uninitialized_var(block);
5872 struct ocfs2_inode_info *oi = OCFS2_I(inode);
5873 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
5874 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
5875 struct ocfs2_alloc_context *data_ac = NULL;
5876 struct page **pages = NULL;
5877 loff_t end = osb->s_clustersize;
5878
5879 has_data = i_size_read(inode) ? 1 : 0;
5880
5881 if (has_data) {
5882 pages = kcalloc(ocfs2_pages_per_cluster(osb->sb),
5883 sizeof(struct page *), GFP_NOFS);
5884 if (pages == NULL) {
5885 ret = -ENOMEM;
5886 mlog_errno(ret);
5887 goto out;
5888 }
5889
5890 ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
5891 if (ret) {
5892 mlog_errno(ret);
5893 goto out;
5894 }
5895 }
5896
5897 handle = ocfs2_start_trans(osb, OCFS2_INLINE_TO_EXTENTS_CREDITS);
5898 if (IS_ERR(handle)) {
5899 ret = PTR_ERR(handle);
5900 mlog_errno(ret);
5901 goto out_unlock;
5902 }
5903
5904 ret = ocfs2_journal_access(handle, inode, di_bh,
5905 OCFS2_JOURNAL_ACCESS_WRITE);
5906 if (ret) {
5907 mlog_errno(ret);
5908 goto out_commit;
5909 }
5910
5911 if (has_data) {
5912 u32 bit_off, num;
5913 unsigned int page_end;
5914 u64 phys;
5915
5916 ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off,
5917 &num);
5918 if (ret) {
5919 mlog_errno(ret);
5920 goto out_commit;
5921 }
5922
5923 /*
5924 * Save two copies, one for insert, and one that can
5925 * be changed by ocfs2_map_and_dirty_page() below.
5926 */
5927 block = phys = ocfs2_clusters_to_blocks(inode->i_sb, bit_off);
5928
5929 /*
5930 * Non sparse file systems zero on extend, so no need
5931 * to do that now.
5932 */
5933 if (!ocfs2_sparse_alloc(osb) &&
5934 PAGE_CACHE_SIZE < osb->s_clustersize)
5935 end = PAGE_CACHE_SIZE;
5936
5937 ret = ocfs2_grab_eof_pages(inode, 0, end, pages, &num_pages);
5938 if (ret) {
5939 mlog_errno(ret);
5940 goto out_commit;
5941 }
5942
5943 /*
5944 * This should populate the 1st page for us and mark
5945 * it up to date.
5946 */
5947 ret = ocfs2_read_inline_data(inode, pages[0], di_bh);
5948 if (ret) {
5949 mlog_errno(ret);
5950 goto out_commit;
5951 }
5952
5953 page_end = PAGE_CACHE_SIZE;
5954 if (PAGE_CACHE_SIZE > osb->s_clustersize)
5955 page_end = osb->s_clustersize;
5956
5957 for (i = 0; i < num_pages; i++)
5958 ocfs2_map_and_dirty_page(inode, handle, 0, page_end,
5959 pages[i], i > 0, &phys);
5960 }
5961
5962 spin_lock(&oi->ip_lock);
5963 oi->ip_dyn_features &= ~OCFS2_INLINE_DATA_FL;
5964 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
5965 spin_unlock(&oi->ip_lock);
5966
5967 ocfs2_dinode_new_extent_list(inode, di);
5968
5969 ocfs2_journal_dirty(handle, di_bh);
5970
5971 if (has_data) {
5972 /*
5973 * An error at this point should be extremely rare. If
5974 * this proves to be false, we could always re-build
5975 * the in-inode data from our pages.
5976 */
5977 ret = ocfs2_insert_extent(osb, handle, inode, di_bh,
5978 0, block, 1, 0, NULL);
5979 if (ret) {
5980 mlog_errno(ret);
5981 goto out_commit;
5982 }
5983
5984 inode->i_blocks = ocfs2_inode_sector_count(inode);
5985 }
5986
5987out_commit:
5988 ocfs2_commit_trans(osb, handle);
5989
5990out_unlock:
5991 if (data_ac)
5992 ocfs2_free_alloc_context(data_ac);
5993
5994out:
5995 if (pages) {
5996 ocfs2_unlock_and_free_pages(pages, num_pages);
5997 kfree(pages);
5998 }
5999
6000 return ret;
6001}
6002
5868/* 6003/*
5869 * It is expected, that by the time you call this function, 6004 * It is expected, that by the time you call this function,
5870 * inode->i_size and fe->i_size have been adjusted. 6005 * inode->i_size and fe->i_size have been adjusted.
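[Aside, not part of the patch: ocfs2_zero_dinode_id2() above wipes everything from id2 to the end of the inode block because that region is union space shared by the inline-data area and the extent list, so each converter clears it before laying down its own format. A simplified sketch of that layout; demo_dinode is a made-up struct, not the real ocfs2_dinode, which has many more fields and union members.]

#include <linux/types.h>
#include <linux/stddef.h>

/* Simplified sketch only. */
struct demo_dinode {
	__le64 i_size;				/* ...fixed fields... */
	union {
		struct {			/* inline file data */
			__le16 id_count;
			__u8   id_data[0];
		} i_data;
		struct {			/* extent list header */
			__le16 l_tree_depth;
			__le16 l_count;
			__le16 l_next_free_rec;
		} i_list;
	} id2;					/* runs to the end of the block */
};

/* Everything past id2 belongs to whichever format is active, hence the
 * memset(&di->id2, 0, blocksize - offsetof(..., id2)) in the hunk above. */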
@@ -6090,6 +6225,81 @@ bail:
6090 return status; 6225 return status;
6091} 6226}
6092 6227
6228/*
6229 * 'start' is inclusive, 'end' is not.
6230 */
6231int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
6232 unsigned int start, unsigned int end, int trunc)
6233{
6234 int ret;
6235 unsigned int numbytes;
6236 handle_t *handle;
6237 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
6238 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
6239 struct ocfs2_inline_data *idata = &di->id2.i_data;
6240
6241 if (end > i_size_read(inode))
6242 end = i_size_read(inode);
6243
6244 BUG_ON(start >= end);
6245
6246 if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) ||
6247 !(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) ||
6248 !ocfs2_supports_inline_data(osb)) {
6249 ocfs2_error(inode->i_sb,
6250 "Inline data flags for inode %llu don't agree! "
6251 "Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n",
6252 (unsigned long long)OCFS2_I(inode)->ip_blkno,
6253 le16_to_cpu(di->i_dyn_features),
6254 OCFS2_I(inode)->ip_dyn_features,
6255 osb->s_feature_incompat);
6256 ret = -EROFS;
6257 goto out;
6258 }
6259
6260 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
6261 if (IS_ERR(handle)) {
6262 ret = PTR_ERR(handle);
6263 mlog_errno(ret);
6264 goto out;
6265 }
6266
6267 ret = ocfs2_journal_access(handle, inode, di_bh,
6268 OCFS2_JOURNAL_ACCESS_WRITE);
6269 if (ret) {
6270 mlog_errno(ret);
6271 goto out_commit;
6272 }
6273
6274 numbytes = end - start;
6275 memset(idata->id_data + start, 0, numbytes);
6276
6277 /*
6278 * No need to worry about the data page here - it's been
6279 * truncated already and inline data doesn't need it for
 6280	 * pushing zeros to disk, so we'll let readpage pick it up

6281 * later.
6282 */
6283 if (trunc) {
6284 i_size_write(inode, start);
6285 di->i_size = cpu_to_le64(start);
6286 }
6287
6288 inode->i_blocks = ocfs2_inode_sector_count(inode);
6289 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
6290
6291 di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
6292 di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
6293
6294 ocfs2_journal_dirty(handle, di_bh);
6295
6296out_commit:
6297 ocfs2_commit_trans(osb, handle);
6298
6299out:
6300 return ret;
6301}
6302
6093static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc) 6303static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
6094{ 6304{
6095 /* 6305 /*
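[Aside, not part of the patch: ocfs2_truncate_inline() above zeroes a byte range of the in-inode data under a single journaled transaction. A hypothetical caller sketch for the shrinking-truncate case (trunc = 1), assuming new_size is strictly below the current i_size as the BUG_ON() requires; demo_shrink_inline() is not a function from this commit.]

/* Hypothetical sketch only; the commit's real callers live elsewhere. */
static int demo_shrink_inline(struct inode *inode, struct buffer_head *di_bh,
			      unsigned int new_size)
{
	/* Zero [new_size, i_size) in id_data and pull i_size down. */
	return ocfs2_truncate_inline(inode, di_bh, new_size,
				     i_size_read(inode), 1);
}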
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 990df48ae8d3..42ff94bd8011 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -62,6 +62,11 @@ static inline int ocfs2_extend_meta_needed(struct ocfs2_dinode *fe)
62 return le16_to_cpu(fe->id2.i_list.l_tree_depth) + 2; 62 return le16_to_cpu(fe->id2.i_list.l_tree_depth) + 2;
63} 63}
64 64
65void ocfs2_dinode_new_extent_list(struct inode *inode, struct ocfs2_dinode *di);
66void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di);
67int ocfs2_convert_inline_data_to_extents(struct inode *inode,
68 struct buffer_head *di_bh);
69
65int ocfs2_truncate_log_init(struct ocfs2_super *osb); 70int ocfs2_truncate_log_init(struct ocfs2_super *osb);
66void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb); 71void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb);
67void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb, 72void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
@@ -115,6 +120,8 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
115 struct inode *inode, 120 struct inode *inode,
116 struct buffer_head *fe_bh, 121 struct buffer_head *fe_bh,
117 struct ocfs2_truncate_context *tc); 122 struct ocfs2_truncate_context *tc);
123int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
124 unsigned int start, unsigned int end, int trunc);
118 125
119int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el, 126int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el,
120 u32 cpos, struct buffer_head **leaf_bh); 127 u32 cpos, struct buffer_head **leaf_bh);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index f37f25c931f5..34d10452c56d 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -206,9 +206,70 @@ bail:
206 return err; 206 return err;
207} 207}
208 208
209int ocfs2_read_inline_data(struct inode *inode, struct page *page,
210 struct buffer_head *di_bh)
211{
212 void *kaddr;
213 unsigned int size;
214 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
215
216 if (!(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL)) {
217 ocfs2_error(inode->i_sb, "Inode %llu lost inline data flag",
218 (unsigned long long)OCFS2_I(inode)->ip_blkno);
219 return -EROFS;
220 }
221
222 size = i_size_read(inode);
223
224 if (size > PAGE_CACHE_SIZE ||
225 size > ocfs2_max_inline_data(inode->i_sb)) {
226 ocfs2_error(inode->i_sb,
 227			"Inode %llu with inline data has a bad size: %u",
228 (unsigned long long)OCFS2_I(inode)->ip_blkno, size);
229 return -EROFS;
230 }
231
232 kaddr = kmap_atomic(page, KM_USER0);
233 if (size)
234 memcpy(kaddr, di->id2.i_data.id_data, size);
235 /* Clear the remaining part of the page */
236 memset(kaddr + size, 0, PAGE_CACHE_SIZE - size);
237 flush_dcache_page(page);
238 kunmap_atomic(kaddr, KM_USER0);
239
240 SetPageUptodate(page);
241
242 return 0;
243}
244
245static int ocfs2_readpage_inline(struct inode *inode, struct page *page)
246{
247 int ret;
248 struct buffer_head *di_bh = NULL;
249 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
250
251 BUG_ON(!PageLocked(page));
 252	BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL));
253
254 ret = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &di_bh,
255 OCFS2_BH_CACHED, inode);
256 if (ret) {
257 mlog_errno(ret);
258 goto out;
259 }
260
261 ret = ocfs2_read_inline_data(inode, page, di_bh);
262out:
263 unlock_page(page);
264
265 brelse(di_bh);
266 return ret;
267}
268
209static int ocfs2_readpage(struct file *file, struct page *page) 269static int ocfs2_readpage(struct file *file, struct page *page)
210{ 270{
211 struct inode *inode = page->mapping->host; 271 struct inode *inode = page->mapping->host;
272 struct ocfs2_inode_info *oi = OCFS2_I(inode);
212 loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT; 273 loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT;
213 int ret, unlock = 1; 274 int ret, unlock = 1;
214 275
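[Aside, not part of the patch: ocfs2_read_inline_data() above refuses to copy anything unless i_size fits both the page and the inline-data capacity of the inode block. A sketch of that bound with a hypothetical helper name.]

/* Sketch only: the two limits checked before trusting an inline i_size. */
static int demo_inline_size_ok(struct inode *inode, unsigned int size)
{
	return size <= PAGE_CACHE_SIZE &&
	       size <= ocfs2_max_inline_data(inode->i_sb);
}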
@@ -222,7 +283,7 @@ static int ocfs2_readpage(struct file *file, struct page *page)
222 goto out; 283 goto out;
223 } 284 }
224 285
225 if (down_read_trylock(&OCFS2_I(inode)->ip_alloc_sem) == 0) { 286 if (down_read_trylock(&oi->ip_alloc_sem) == 0) {
226 ret = AOP_TRUNCATED_PAGE; 287 ret = AOP_TRUNCATED_PAGE;
227 goto out_meta_unlock; 288 goto out_meta_unlock;
228 } 289 }
@@ -252,7 +313,10 @@ static int ocfs2_readpage(struct file *file, struct page *page)
252 goto out_alloc; 313 goto out_alloc;
253 } 314 }
254 315
255 ret = block_read_full_page(page, ocfs2_get_block); 316 if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
317 ret = ocfs2_readpage_inline(inode, page);
318 else
319 ret = block_read_full_page(page, ocfs2_get_block);
256 unlock = 0; 320 unlock = 0;
257 321
258 ocfs2_data_unlock(inode, 0); 322 ocfs2_data_unlock(inode, 0);
@@ -301,12 +365,8 @@ int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
301{ 365{
302 int ret; 366 int ret;
303 367
304 down_read(&OCFS2_I(inode)->ip_alloc_sem);
305
306 ret = block_prepare_write(page, from, to, ocfs2_get_block); 368 ret = block_prepare_write(page, from, to, ocfs2_get_block);
307 369
308 up_read(&OCFS2_I(inode)->ip_alloc_sem);
309
310 return ret; 370 return ret;
311} 371}
312 372
@@ -401,7 +461,9 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
401 down_read(&OCFS2_I(inode)->ip_alloc_sem); 461 down_read(&OCFS2_I(inode)->ip_alloc_sem);
402 } 462 }
403 463
404 err = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL, NULL); 464 if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL))
465 err = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL,
466 NULL);
405 467
406 if (!INODE_JOURNAL(inode)) { 468 if (!INODE_JOURNAL(inode)) {
407 up_read(&OCFS2_I(inode)->ip_alloc_sem); 469 up_read(&OCFS2_I(inode)->ip_alloc_sem);
@@ -415,7 +477,6 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
415 goto bail; 477 goto bail;
416 } 478 }
417 479
418
419bail: 480bail:
420 status = err ? 0 : p_blkno; 481 status = err ? 0 : p_blkno;
421 482
@@ -570,6 +631,13 @@ static ssize_t ocfs2_direct_IO(int rw,
570 631
571 mlog_entry_void(); 632 mlog_entry_void();
572 633
634 /*
635 * Fallback to buffered I/O if we see an inode without
636 * extents.
637 */
638 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
639 return 0;
640
573 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) { 641 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
574 /* 642 /*
575 * We get PR data locks even for O_DIRECT. This 643 * We get PR data locks even for O_DIRECT. This
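[Aside, not part of the patch: the direct_IO hunk above returns 0 for inline-data inodes, reporting that no bytes were transferred so that, per the new comment, the caller falls back to buffered I/O. A minimal sketch of an aop declining a request the same way; the name is hypothetical and the 2.6.23-era ->direct_IO signature is assumed.]

#include <linux/fs.h>
#include <linux/aio.h>
#include <linux/uio.h>

/* Sketch only: transfer nothing and let the caller use the page cache. */
static ssize_t demo_direct_IO(int rw, struct kiocb *iocb,
			      const struct iovec *iov, loff_t offset,
			      unsigned long nr_segs)
{
	return 0;
}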
@@ -834,18 +902,22 @@ struct ocfs2_write_ctxt {
834 struct ocfs2_cached_dealloc_ctxt w_dealloc; 902 struct ocfs2_cached_dealloc_ctxt w_dealloc;
835}; 903};
836 904
837static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc) 905void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages)
838{ 906{
839 int i; 907 int i;
840 908
841 for(i = 0; i < wc->w_num_pages; i++) { 909 for(i = 0; i < num_pages; i++) {
842 if (wc->w_pages[i] == NULL) 910 if (pages[i]) {
843 continue; 911 unlock_page(pages[i]);
844 912 mark_page_accessed(pages[i]);
845 unlock_page(wc->w_pages[i]); 913 page_cache_release(pages[i]);
846 mark_page_accessed(wc->w_pages[i]); 914 }
847 page_cache_release(wc->w_pages[i]);
848 } 915 }
916}
917
918static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
919{
920 ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages);
849 921
850 brelse(wc->w_di_bh); 922 brelse(wc->w_di_bh);
851 kfree(wc); 923 kfree(wc);
@@ -1360,6 +1432,160 @@ out:
1360 return ret; 1432 return ret;
1361} 1433}
1362 1434
1435static int ocfs2_write_begin_inline(struct address_space *mapping,
1436 struct inode *inode,
1437 struct ocfs2_write_ctxt *wc)
1438{
1439 int ret;
1440 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1441 struct page *page;
1442 handle_t *handle;
1443 struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
1444
1445 page = find_or_create_page(mapping, 0, GFP_NOFS);
1446 if (!page) {
1447 ret = -ENOMEM;
1448 mlog_errno(ret);
1449 goto out;
1450 }
1451 /*
1452 * If we don't set w_num_pages then this page won't get unlocked
1453 * and freed on cleanup of the write context.
1454 */
1455 wc->w_pages[0] = wc->w_target_page = page;
1456 wc->w_num_pages = 1;
1457
1458 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1459 if (IS_ERR(handle)) {
1460 ret = PTR_ERR(handle);
1461 mlog_errno(ret);
1462 goto out;
1463 }
1464
1465 ret = ocfs2_journal_access(handle, inode, wc->w_di_bh,
1466 OCFS2_JOURNAL_ACCESS_WRITE);
1467 if (ret) {
1468 ocfs2_commit_trans(osb, handle);
1469
1470 mlog_errno(ret);
1471 goto out;
1472 }
1473
1474 if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL))
1475 ocfs2_set_inode_data_inline(inode, di);
1476
1477 if (!PageUptodate(page)) {
1478 ret = ocfs2_read_inline_data(inode, page, wc->w_di_bh);
1479 if (ret) {
1480 ocfs2_commit_trans(osb, handle);
1481
1482 goto out;
1483 }
1484 }
1485
1486 wc->w_handle = handle;
1487out:
1488 return ret;
1489}
1490
1491int ocfs2_size_fits_inline_data(struct buffer_head *di_bh, u64 new_size)
1492{
1493 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
1494
1495 if (new_size < le16_to_cpu(di->id2.i_data.id_count))
1496 return 1;
1497 return 0;
1498}
1499
1500static int ocfs2_try_to_write_inline_data(struct address_space *mapping,
1501 struct inode *inode, loff_t pos,
1502 unsigned len, struct page *mmap_page,
1503 struct ocfs2_write_ctxt *wc)
1504{
1505 int ret, written = 0;
1506 loff_t end = pos + len;
1507 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1508
1509 mlog(0, "Inode %llu, write of %u bytes at off %llu. features: 0x%x\n",
1510 (unsigned long long)oi->ip_blkno, len, (unsigned long long)pos,
1511 oi->ip_dyn_features);
1512
1513 /*
1514 * Handle inodes which already have inline data 1st.
1515 */
1516 if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1517 if (mmap_page == NULL &&
1518 ocfs2_size_fits_inline_data(wc->w_di_bh, end))
1519 goto do_inline_write;
1520
1521 /*
1522 * The write won't fit - we have to give this inode an
1523 * inline extent list now.
1524 */
1525 ret = ocfs2_convert_inline_data_to_extents(inode, wc->w_di_bh);
1526 if (ret)
1527 mlog_errno(ret);
1528 goto out;
1529 }
1530
1531 /*
1532 * Check whether the inode can accept inline data.
1533 */
1534 if (oi->ip_clusters != 0 || i_size_read(inode) != 0)
1535 return 0;
1536
1537 /*
1538 * Check whether the write can fit.
1539 */
1540 if (mmap_page || end > ocfs2_max_inline_data(inode->i_sb))
1541 return 0;
1542
1543do_inline_write:
1544 ret = ocfs2_write_begin_inline(mapping, inode, wc);
1545 if (ret) {
1546 mlog_errno(ret);
1547 goto out;
1548 }
1549
1550 /*
1551 * This signals to the caller that the data can be written
1552 * inline.
1553 */
1554 written = 1;
1555out:
1556 return written ? written : ret;
1557}
1558
1559/*
1560 * This function only does anything for file systems which can't
1561 * handle sparse files.
1562 *
1563 * What we want to do here is fill in any hole between the current end
1564 * of allocation and the end of our write. That way the rest of the
 1565 * write path can treat it as a non-allocating write, which has no
1566 * special case code for sparse/nonsparse files.
1567 */
1568static int ocfs2_expand_nonsparse_inode(struct inode *inode, loff_t pos,
1569 unsigned len,
1570 struct ocfs2_write_ctxt *wc)
1571{
1572 int ret;
1573 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1574 loff_t newsize = pos + len;
1575
1576 if (ocfs2_sparse_alloc(osb))
1577 return 0;
1578
1579 if (newsize <= i_size_read(inode))
1580 return 0;
1581
1582 ret = ocfs2_extend_no_holes(inode, newsize, newsize - len);
1583 if (ret)
1584 mlog_errno(ret);
1585
1586 return ret;
1587}
1588
1363int ocfs2_write_begin_nolock(struct address_space *mapping, 1589int ocfs2_write_begin_nolock(struct address_space *mapping,
1364 loff_t pos, unsigned len, unsigned flags, 1590 loff_t pos, unsigned len, unsigned flags,
1365 struct page **pagep, void **fsdata, 1591 struct page **pagep, void **fsdata,
@@ -1381,6 +1607,25 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
1381 return ret; 1607 return ret;
1382 } 1608 }
1383 1609
1610 if (ocfs2_supports_inline_data(osb)) {
1611 ret = ocfs2_try_to_write_inline_data(mapping, inode, pos, len,
1612 mmap_page, wc);
1613 if (ret == 1) {
1614 ret = 0;
1615 goto success;
1616 }
1617 if (ret < 0) {
1618 mlog_errno(ret);
1619 goto out;
1620 }
1621 }
1622
1623 ret = ocfs2_expand_nonsparse_inode(inode, pos, len, wc);
1624 if (ret) {
1625 mlog_errno(ret);
1626 goto out;
1627 }
1628
1384 ret = ocfs2_populate_write_desc(inode, wc, &clusters_to_alloc, 1629 ret = ocfs2_populate_write_desc(inode, wc, &clusters_to_alloc,
1385 &extents_to_split); 1630 &extents_to_split);
1386 if (ret) { 1631 if (ret) {
@@ -1462,6 +1707,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
1462 if (meta_ac) 1707 if (meta_ac)
1463 ocfs2_free_alloc_context(meta_ac); 1708 ocfs2_free_alloc_context(meta_ac);
1464 1709
1710success:
1465 *pagep = wc->w_target_page; 1711 *pagep = wc->w_target_page;
1466 *fsdata = wc; 1712 *fsdata = wc;
1467 return 0; 1713 return 0;
@@ -1529,6 +1775,31 @@ out_fail:
1529 return ret; 1775 return ret;
1530} 1776}
1531 1777
1778static void ocfs2_write_end_inline(struct inode *inode, loff_t pos,
1779 unsigned len, unsigned *copied,
1780 struct ocfs2_dinode *di,
1781 struct ocfs2_write_ctxt *wc)
1782{
1783 void *kaddr;
1784
1785 if (unlikely(*copied < len)) {
1786 if (!PageUptodate(wc->w_target_page)) {
1787 *copied = 0;
1788 return;
1789 }
1790 }
1791
1792 kaddr = kmap_atomic(wc->w_target_page, KM_USER0);
1793 memcpy(di->id2.i_data.id_data + pos, kaddr + pos, *copied);
1794 kunmap_atomic(kaddr, KM_USER0);
1795
1796 mlog(0, "Data written to inode at offset %llu. "
1797 "id_count = %u, copied = %u, i_dyn_features = 0x%x\n",
1798 (unsigned long long)pos, *copied,
1799 le16_to_cpu(di->id2.i_data.id_count),
1800 le16_to_cpu(di->i_dyn_features));
1801}
1802
1532int ocfs2_write_end_nolock(struct address_space *mapping, 1803int ocfs2_write_end_nolock(struct address_space *mapping,
1533 loff_t pos, unsigned len, unsigned copied, 1804 loff_t pos, unsigned len, unsigned copied,
1534 struct page *page, void *fsdata) 1805 struct page *page, void *fsdata)
@@ -1542,6 +1813,11 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
1542 handle_t *handle = wc->w_handle; 1813 handle_t *handle = wc->w_handle;
1543 struct page *tmppage; 1814 struct page *tmppage;
1544 1815
1816 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1817 ocfs2_write_end_inline(inode, pos, len, &copied, di, wc);
1818 goto out_write_size;
1819 }
1820
1545 if (unlikely(copied < len)) { 1821 if (unlikely(copied < len)) {
1546 if (!PageUptodate(wc->w_target_page)) 1822 if (!PageUptodate(wc->w_target_page))
1547 copied = 0; 1823 copied = 0;
@@ -1579,6 +1855,7 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
1579 block_commit_write(tmppage, from, to); 1855 block_commit_write(tmppage, from, to);
1580 } 1856 }
1581 1857
1858out_write_size:
1582 pos += copied; 1859 pos += copied;
1583 if (pos > inode->i_size) { 1860 if (pos > inode->i_size) {
1584 i_size_write(inode, pos); 1861 i_size_write(inode, pos);
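[Aside, not part of the patch: ocfs2_write_end_inline() above can index id_data with the file position directly because an inline file lives entirely in page 0, so the in-page offset and the file offset coincide. A sketch of that copy with the assumed bound spelled out; demo_copy_inline() is a hypothetical name and the BUG_ON() states an invariant the write-begin checks are relied on to provide.]

#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/highmem.h>

/* Sketch only: pos serves as both the file offset and the page offset. */
static void demo_copy_inline(struct ocfs2_dinode *di, struct page *page,
			     loff_t pos, unsigned int copied)
{
	char *kaddr = kmap_atomic(page, KM_USER0);

	/* Assumed invariant: the write was already checked to fit id_count. */
	BUG_ON(pos + copied > le16_to_cpu(di->id2.i_data.id_count));
	memcpy(di->id2.i_data.id_data + pos, kaddr + pos, copied);
	kunmap_atomic(kaddr, KM_USER0);
}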
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index 389579bd64e3..113560877dbb 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -34,6 +34,8 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
34 struct inode *inode, unsigned int from, 34 struct inode *inode, unsigned int from,
35 unsigned int to, int new); 35 unsigned int to, int new);
36 36
37void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages);
38
37int walk_page_buffers( handle_t *handle, 39int walk_page_buffers( handle_t *handle,
38 struct buffer_head *head, 40 struct buffer_head *head,
39 unsigned from, 41 unsigned from,
@@ -59,6 +61,10 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
59 struct page **pagep, void **fsdata, 61 struct page **pagep, void **fsdata,
60 struct buffer_head *di_bh, struct page *mmap_page); 62 struct buffer_head *di_bh, struct page *mmap_page);
61 63
64int ocfs2_read_inline_data(struct inode *inode, struct page *page,
65 struct buffer_head *di_bh);
66int ocfs2_size_fits_inline_data(struct buffer_head *di_bh, u64 new_size);
67
62/* all ocfs2_dio_end_io()'s fault */ 68/* all ocfs2_dio_end_io()'s fault */
63#define ocfs2_iocb_is_rw_locked(iocb) \ 69#define ocfs2_iocb_is_rw_locked(iocb) \
64 test_bit(0, (unsigned long *)&iocb->private) 70 test_bit(0, (unsigned long *)&iocb->private)
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 2bd7f788cf34..f14b541fab95 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -216,8 +216,7 @@ static void o2hb_wait_on_io(struct o2hb_region *reg,
216 wait_for_completion(&wc->wc_io_complete); 216 wait_for_completion(&wc->wc_io_complete);
217} 217}
218 218
219static int o2hb_bio_end_io(struct bio *bio, 219static void o2hb_bio_end_io(struct bio *bio,
220 unsigned int bytes_done,
221 int error) 220 int error)
222{ 221{
223 struct o2hb_bio_wait_ctxt *wc = bio->bi_private; 222 struct o2hb_bio_wait_ctxt *wc = bio->bi_private;
@@ -227,12 +226,8 @@ static int o2hb_bio_end_io(struct bio *bio,
227 wc->wc_error = error; 226 wc->wc_error = error;
228 } 227 }
229 228
230 if (bio->bi_size)
231 return 1;
232
233 o2hb_bio_wait_dec(wc, 1); 229 o2hb_bio_wait_dec(wc, 1);
234 bio_put(bio); 230 bio_put(bio);
235 return 0;
236} 231}
237 232
238/* Setup a Bio to cover I/O against num_slots slots starting at 233/* Setup a Bio to cover I/O against num_slots slots starting at
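
The heartbeat.c hunk above follows the block layer's move from partial-completion callbacks, which had to check bio->bi_size and return 1 to request another call, to a single void callback invoked once when the whole bio is finished. A hypothetical end_io written against the new convention (illustrative only, not taken from this patch) reduces to:

static void example_bio_end_io(struct bio *bio, int error)
{
	struct completion *done = bio->bi_private;

	if (error)
		printk(KERN_ERR "example: bio failed with %d\n", error);

	complete(done);		/* wake whoever submitted the bio */
	bio_put(bio);		/* drop the reference taken at submit time */
}

Because the callback now runs exactly once per bio, counters like the one behind o2hb_bio_wait_dec() are decremented once per bio as well, which is what lets the early-return branch disappear.
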
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index e9e042b93dbf..a4882c8df945 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -143,7 +143,7 @@ static struct kobj_type mlog_ktype = {
143}; 143};
144 144
145static struct kset mlog_kset = { 145static struct kset mlog_kset = {
146 .kobj = {.name = "logmask", .ktype = &mlog_ktype}, 146 .kobj = {.ktype = &mlog_ktype},
147}; 147};
148 148
149int mlog_sys_init(struct kset *o2cb_subsys) 149int mlog_sys_init(struct kset *o2cb_subsys)
@@ -156,6 +156,7 @@ int mlog_sys_init(struct kset *o2cb_subsys)
156 } 156 }
157 mlog_attr_ptrs[i] = NULL; 157 mlog_attr_ptrs[i] = NULL;
158 158
159 kobject_set_name(&mlog_kset.kobj, "logmask");
159 kobj_set_kset_s(&mlog_kset, *o2cb_subsys); 160 kobj_set_kset_s(&mlog_kset, *o2cb_subsys);
160 return kset_register(&mlog_kset); 161 return kset_register(&mlog_kset);
161} 162}
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 0d5fdde959c8..7453b70c1a19 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -55,10 +55,16 @@
55#include "journal.h" 55#include "journal.h"
56#include "namei.h" 56#include "namei.h"
57#include "suballoc.h" 57#include "suballoc.h"
58#include "super.h"
58#include "uptodate.h" 59#include "uptodate.h"
59 60
60#include "buffer_head_io.h" 61#include "buffer_head_io.h"
61 62
63#define NAMEI_RA_CHUNKS 2
64#define NAMEI_RA_BLOCKS 4
65#define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
66#define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b))
67
62static unsigned char ocfs2_filetype_table[] = { 68static unsigned char ocfs2_filetype_table[] = {
63 DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK 69 DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
64}; 70};
@@ -66,12 +72,614 @@ static unsigned char ocfs2_filetype_table[] = {
66static int ocfs2_extend_dir(struct ocfs2_super *osb, 72static int ocfs2_extend_dir(struct ocfs2_super *osb,
67 struct inode *dir, 73 struct inode *dir,
68 struct buffer_head *parent_fe_bh, 74 struct buffer_head *parent_fe_bh,
75 unsigned int blocks_wanted,
69 struct buffer_head **new_de_bh); 76 struct buffer_head **new_de_bh);
77static int ocfs2_do_extend_dir(struct super_block *sb,
78 handle_t *handle,
79 struct inode *dir,
80 struct buffer_head *parent_fe_bh,
81 struct ocfs2_alloc_context *data_ac,
82 struct ocfs2_alloc_context *meta_ac,
83 struct buffer_head **new_bh);
84
70/* 85/*
71 * ocfs2_readdir() 86 * bh passed here can be an inode block or a dir data block, depending
87 * on the inode inline data flag.
88 */
89static int ocfs2_check_dir_entry(struct inode * dir,
90 struct ocfs2_dir_entry * de,
91 struct buffer_head * bh,
92 unsigned long offset)
93{
94 const char *error_msg = NULL;
95 const int rlen = le16_to_cpu(de->rec_len);
96
97 if (rlen < OCFS2_DIR_REC_LEN(1))
98 error_msg = "rec_len is smaller than minimal";
99 else if (rlen % 4 != 0)
100 error_msg = "rec_len % 4 != 0";
101 else if (rlen < OCFS2_DIR_REC_LEN(de->name_len))
102 error_msg = "rec_len is too small for name_len";
103 else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)
104 error_msg = "directory entry across blocks";
105
106 if (error_msg != NULL)
107 mlog(ML_ERROR, "bad entry in directory #%llu: %s - "
108 "offset=%lu, inode=%llu, rec_len=%d, name_len=%d\n",
109 (unsigned long long)OCFS2_I(dir)->ip_blkno, error_msg,
110 offset, (unsigned long long)le64_to_cpu(de->inode), rlen,
111 de->name_len);
112 return error_msg == NULL ? 1 : 0;
113}
114
115static inline int ocfs2_match(int len,
116 const char * const name,
117 struct ocfs2_dir_entry *de)
118{
119 if (len != de->name_len)
120 return 0;
121 if (!de->inode)
122 return 0;
123 return !memcmp(name, de->name, len);
124}
125
126/*
127 * Returns 0 if not found, -1 on failure, and 1 on success
128 */
129static int inline ocfs2_search_dirblock(struct buffer_head *bh,
130 struct inode *dir,
131 const char *name, int namelen,
132 unsigned long offset,
133 char *first_de,
134 unsigned int bytes,
135 struct ocfs2_dir_entry **res_dir)
136{
137 struct ocfs2_dir_entry *de;
138 char *dlimit, *de_buf;
139 int de_len;
140 int ret = 0;
141
142 mlog_entry_void();
143
144 de_buf = first_de;
145 dlimit = de_buf + bytes;
146
147 while (de_buf < dlimit) {
148 /* this code is executed quadratically often */
149 /* do minimal checking `by hand' */
150
151 de = (struct ocfs2_dir_entry *) de_buf;
152
153 if (de_buf + namelen <= dlimit &&
154 ocfs2_match(namelen, name, de)) {
155 /* found a match - just to be sure, do a full check */
156 if (!ocfs2_check_dir_entry(dir, de, bh, offset)) {
157 ret = -1;
158 goto bail;
159 }
160 *res_dir = de;
161 ret = 1;
162 goto bail;
163 }
164
165 /* prevent looping on a bad block */
166 de_len = le16_to_cpu(de->rec_len);
167 if (de_len <= 0) {
168 ret = -1;
169 goto bail;
170 }
171
172 de_buf += de_len;
173 offset += de_len;
174 }
175
176bail:
177 mlog_exit(ret);
178 return ret;
179}
180
181static struct buffer_head *ocfs2_find_entry_id(const char *name,
182 int namelen,
183 struct inode *dir,
184 struct ocfs2_dir_entry **res_dir)
185{
186 int ret, found;
187 struct buffer_head *di_bh = NULL;
188 struct ocfs2_dinode *di;
189 struct ocfs2_inline_data *data;
190
191 ret = ocfs2_read_block(OCFS2_SB(dir->i_sb), OCFS2_I(dir)->ip_blkno,
192 &di_bh, OCFS2_BH_CACHED, dir);
193 if (ret) {
194 mlog_errno(ret);
195 goto out;
196 }
197
198 di = (struct ocfs2_dinode *)di_bh->b_data;
199 data = &di->id2.i_data;
200
201 found = ocfs2_search_dirblock(di_bh, dir, name, namelen, 0,
202 data->id_data, i_size_read(dir), res_dir);
203 if (found == 1)
204 return di_bh;
205
206 brelse(di_bh);
207out:
208 return NULL;
209}
210
211struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen,
212 struct inode *dir,
213 struct ocfs2_dir_entry **res_dir)
214{
215 struct super_block *sb;
216 struct buffer_head *bh_use[NAMEI_RA_SIZE];
217 struct buffer_head *bh, *ret = NULL;
218 unsigned long start, block, b;
219 int ra_max = 0; /* Number of bh's in the readahead
220 buffer, bh_use[] */
221 int ra_ptr = 0; /* Current index into readahead
222 buffer */
223 int num = 0;
224 int nblocks, i, err;
225
226 mlog_entry_void();
227
228 sb = dir->i_sb;
229
230 nblocks = i_size_read(dir) >> sb->s_blocksize_bits;
231 start = OCFS2_I(dir)->ip_dir_start_lookup;
232 if (start >= nblocks)
233 start = 0;
234 block = start;
235
236restart:
237 do {
238 /*
239 * We deal with the read-ahead logic here.
240 */
241 if (ra_ptr >= ra_max) {
242 /* Refill the readahead buffer */
243 ra_ptr = 0;
244 b = block;
245 for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) {
246 /*
247 * Terminate if we reach the end of the
248 * directory and must wrap, or if our
249 * search has finished at this block.
250 */
251 if (b >= nblocks || (num && block == start)) {
252 bh_use[ra_max] = NULL;
253 break;
254 }
255 num++;
256
257 bh = ocfs2_bread(dir, b++, &err, 1);
258 bh_use[ra_max] = bh;
259 }
260 }
261 if ((bh = bh_use[ra_ptr++]) == NULL)
262 goto next;
263 wait_on_buffer(bh);
264 if (!buffer_uptodate(bh)) {
265 /* read error, skip block & hope for the best */
266 ocfs2_error(dir->i_sb, "reading directory %llu, "
267 "offset %lu\n",
268 (unsigned long long)OCFS2_I(dir)->ip_blkno,
269 block);
270 brelse(bh);
271 goto next;
272 }
273 i = ocfs2_search_dirblock(bh, dir, name, namelen,
274 block << sb->s_blocksize_bits,
275 bh->b_data, sb->s_blocksize,
276 res_dir);
277 if (i == 1) {
278 OCFS2_I(dir)->ip_dir_start_lookup = block;
279 ret = bh;
280 goto cleanup_and_exit;
281 } else {
282 brelse(bh);
283 if (i < 0)
284 goto cleanup_and_exit;
285 }
286 next:
287 if (++block >= nblocks)
288 block = 0;
289 } while (block != start);
290
291 /*
292 * If the directory has grown while we were searching, then
293 * search the last part of the directory before giving up.
294 */
295 block = nblocks;
296 nblocks = i_size_read(dir) >> sb->s_blocksize_bits;
297 if (block < nblocks) {
298 start = 0;
299 goto restart;
300 }
301
302cleanup_and_exit:
303 /* Clean up the read-ahead blocks */
304 for (; ra_ptr < ra_max; ra_ptr++)
305 brelse(bh_use[ra_ptr]);
306
307 mlog_exit_ptr(ret);
308 return ret;
309}
310
311/*
312 * Try to find an entry of the provided name within 'dir'.
72 * 313 *
314 * If nothing was found, NULL is returned. Otherwise, a buffer_head
315 * and pointer to the dir entry are passed back.
316 *
317 * Caller can NOT assume anything about the contents of the
318 * buffer_head - it is passed back only so that it can be passed into
319 * any one of the manipulation functions (add entry, delete entry,
320 * etc). As an example, bh in the extent directory case is a data
321 * block, in the inline-data case it actually points to an inode.
73 */ 322 */
74int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir) 323struct buffer_head *ocfs2_find_entry(const char *name, int namelen,
324 struct inode *dir,
325 struct ocfs2_dir_entry **res_dir)
326{
327 *res_dir = NULL;
328
329 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
330 return ocfs2_find_entry_id(name, namelen, dir, res_dir);
331
332 return ocfs2_find_entry_el(name, namelen, dir, res_dir);
333}
334
335/*
336 * Update inode number and type of a previously found directory entry.
337 */
338int ocfs2_update_entry(struct inode *dir, handle_t *handle,
339 struct buffer_head *de_bh, struct ocfs2_dir_entry *de,
340 struct inode *new_entry_inode)
341{
342 int ret;
343
344 /*
345 * The same code works fine for both inline-data and extent
346 * based directories, so no need to split this up.
347 */
348
349 ret = ocfs2_journal_access(handle, dir, de_bh,
350 OCFS2_JOURNAL_ACCESS_WRITE);
351 if (ret) {
352 mlog_errno(ret);
353 goto out;
354 }
355
356 de->inode = cpu_to_le64(OCFS2_I(new_entry_inode)->ip_blkno);
357 ocfs2_set_de_type(de, new_entry_inode->i_mode);
358
359 ocfs2_journal_dirty(handle, de_bh);
360
361out:
362 return ret;
363}
364
365static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
366 struct ocfs2_dir_entry *de_del,
367 struct buffer_head *bh, char *first_de,
368 unsigned int bytes)
369{
370 struct ocfs2_dir_entry *de, *pde;
371 int i, status = -ENOENT;
372
373 mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p)\n", handle, dir, de_del, bh);
374
375 i = 0;
376 pde = NULL;
377 de = (struct ocfs2_dir_entry *) first_de;
378 while (i < bytes) {
379 if (!ocfs2_check_dir_entry(dir, de, bh, i)) {
380 status = -EIO;
381 mlog_errno(status);
382 goto bail;
383 }
384 if (de == de_del) {
385 status = ocfs2_journal_access(handle, dir, bh,
386 OCFS2_JOURNAL_ACCESS_WRITE);
387 if (status < 0) {
388 status = -EIO;
389 mlog_errno(status);
390 goto bail;
391 }
392 if (pde)
393 pde->rec_len =
394 cpu_to_le16(le16_to_cpu(pde->rec_len) +
395 le16_to_cpu(de->rec_len));
396 else
397 de->inode = 0;
398 dir->i_version++;
399 status = ocfs2_journal_dirty(handle, bh);
400 goto bail;
401 }
402 i += le16_to_cpu(de->rec_len);
403 pde = de;
404 de = (struct ocfs2_dir_entry *)((char *)de + le16_to_cpu(de->rec_len));
405 }
406bail:
407 mlog_exit(status);
408 return status;
409}
410
411static inline int ocfs2_delete_entry_id(handle_t *handle,
412 struct inode *dir,
413 struct ocfs2_dir_entry *de_del,
414 struct buffer_head *bh)
415{
416 int ret;
417 struct buffer_head *di_bh = NULL;
418 struct ocfs2_dinode *di;
419 struct ocfs2_inline_data *data;
420
421 ret = ocfs2_read_block(OCFS2_SB(dir->i_sb), OCFS2_I(dir)->ip_blkno,
422 &di_bh, OCFS2_BH_CACHED, dir);
423 if (ret) {
424 mlog_errno(ret);
425 goto out;
426 }
427
428 di = (struct ocfs2_dinode *)di_bh->b_data;
429 data = &di->id2.i_data;
430
431 ret = __ocfs2_delete_entry(handle, dir, de_del, bh, data->id_data,
432 i_size_read(dir));
433
434 brelse(di_bh);
435out:
436 return ret;
437}
438
439static inline int ocfs2_delete_entry_el(handle_t *handle,
440 struct inode *dir,
441 struct ocfs2_dir_entry *de_del,
442 struct buffer_head *bh)
443{
444 return __ocfs2_delete_entry(handle, dir, de_del, bh, bh->b_data,
445 bh->b_size);
446}
447
448/*
449 * ocfs2_delete_entry deletes a directory entry by merging it with the
450 * previous entry
451 */
452int ocfs2_delete_entry(handle_t *handle,
453 struct inode *dir,
454 struct ocfs2_dir_entry *de_del,
455 struct buffer_head *bh)
456{
457 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
458 return ocfs2_delete_entry_id(handle, dir, de_del, bh);
459
460 return ocfs2_delete_entry_el(handle, dir, de_del, bh);
461}
462
463/*
464 * Check whether 'de' has enough room to hold an entry of
465 * 'new_rec_len' bytes.
466 */
467static inline int ocfs2_dirent_would_fit(struct ocfs2_dir_entry *de,
468 unsigned int new_rec_len)
469{
470 unsigned int de_really_used;
471
472 /* Check whether this is an empty record with enough space */
473 if (le64_to_cpu(de->inode) == 0 &&
474 le16_to_cpu(de->rec_len) >= new_rec_len)
475 return 1;
476
477 /*
478 * Record might have free space at the end which we can
479 * use.
480 */
481 de_really_used = OCFS2_DIR_REC_LEN(de->name_len);
482 if (le16_to_cpu(de->rec_len) >= (de_really_used + new_rec_len))
483 return 1;
484
485 return 0;
486}
487
488/* we don't always have a dentry for what we want to add, so people
489 * like orphan dir can call this instead.
490 *
491 * If you pass me insert_bh, I'll skip the search of the other dir
492 * blocks and put the record in there.
493 */
494int __ocfs2_add_entry(handle_t *handle,
495 struct inode *dir,
496 const char *name, int namelen,
497 struct inode *inode, u64 blkno,
498 struct buffer_head *parent_fe_bh,
499 struct buffer_head *insert_bh)
500{
501 unsigned long offset;
502 unsigned short rec_len;
503 struct ocfs2_dir_entry *de, *de1;
504 struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_fe_bh->b_data;
505 struct super_block *sb = dir->i_sb;
506 int retval, status;
507 unsigned int size = sb->s_blocksize;
508 char *data_start = insert_bh->b_data;
509
510 mlog_entry_void();
511
512 if (!namelen)
513 return -EINVAL;
514
515 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
516 data_start = di->id2.i_data.id_data;
517 size = i_size_read(dir);
518
519 BUG_ON(insert_bh != parent_fe_bh);
520 }
521
522 rec_len = OCFS2_DIR_REC_LEN(namelen);
523 offset = 0;
524 de = (struct ocfs2_dir_entry *) data_start;
525 while (1) {
526 BUG_ON((char *)de >= (size + data_start));
527
528 /* These checks should've already been passed by the
529 * prepare function, but I guess we can leave them
530 * here anyway. */
531 if (!ocfs2_check_dir_entry(dir, de, insert_bh, offset)) {
532 retval = -ENOENT;
533 goto bail;
534 }
535 if (ocfs2_match(namelen, name, de)) {
536 retval = -EEXIST;
537 goto bail;
538 }
539
540 if (ocfs2_dirent_would_fit(de, rec_len)) {
541 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
542 retval = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh);
543 if (retval < 0) {
544 mlog_errno(retval);
545 goto bail;
546 }
547
548 status = ocfs2_journal_access(handle, dir, insert_bh,
549 OCFS2_JOURNAL_ACCESS_WRITE);
550 /* By now the buffer is marked for journaling */
551 offset += le16_to_cpu(de->rec_len);
552 if (le64_to_cpu(de->inode)) {
553 de1 = (struct ocfs2_dir_entry *)((char *) de +
554 OCFS2_DIR_REC_LEN(de->name_len));
555 de1->rec_len =
556 cpu_to_le16(le16_to_cpu(de->rec_len) -
557 OCFS2_DIR_REC_LEN(de->name_len));
558 de->rec_len = cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len));
559 de = de1;
560 }
561 de->file_type = OCFS2_FT_UNKNOWN;
562 if (blkno) {
563 de->inode = cpu_to_le64(blkno);
564 ocfs2_set_de_type(de, inode->i_mode);
565 } else
566 de->inode = 0;
567 de->name_len = namelen;
568 memcpy(de->name, name, namelen);
569
570 dir->i_version++;
571 status = ocfs2_journal_dirty(handle, insert_bh);
572 retval = 0;
573 goto bail;
574 }
575 offset += le16_to_cpu(de->rec_len);
576 de = (struct ocfs2_dir_entry *) ((char *) de + le16_to_cpu(de->rec_len));
577 }
578
579 /* when you think about it, the assert above should prevent us
580 * from ever getting here. */
581 retval = -ENOSPC;
582bail:
583
584 mlog_exit(retval);
585 return retval;
586}
587
588static int ocfs2_dir_foreach_blk_id(struct inode *inode,
589 unsigned long *f_version,
590 loff_t *f_pos, void *priv,
591 filldir_t filldir, int *filldir_err)
592{
593 int ret, i, filldir_ret;
594 unsigned long offset = *f_pos;
595 struct buffer_head *di_bh = NULL;
596 struct ocfs2_dinode *di;
597 struct ocfs2_inline_data *data;
598 struct ocfs2_dir_entry *de;
599
600 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), OCFS2_I(inode)->ip_blkno,
601 &di_bh, OCFS2_BH_CACHED, inode);
602 if (ret) {
603 mlog(ML_ERROR, "Unable to read inode block for dir %llu\n",
604 (unsigned long long)OCFS2_I(inode)->ip_blkno);
605 goto out;
606 }
607
608 di = (struct ocfs2_dinode *)di_bh->b_data;
609 data = &di->id2.i_data;
610
611 while (*f_pos < i_size_read(inode)) {
612revalidate:
613 /* If the dir block has changed since the last call to
614 * readdir(2), then we might be pointing to an invalid
615 * dirent right now. Scan from the start of the block
616 * to make sure. */
617 if (*f_version != inode->i_version) {
618 for (i = 0; i < i_size_read(inode) && i < offset; ) {
619 de = (struct ocfs2_dir_entry *)
620 (data->id_data + i);
621 /* It's too expensive to do a full
622 * dirent test each time round this
623 * loop, but we do have to test at
624 * least that it is non-zero. A
625 * failure will be detected in the
626 * dirent test below. */
627 if (le16_to_cpu(de->rec_len) <
628 OCFS2_DIR_REC_LEN(1))
629 break;
630 i += le16_to_cpu(de->rec_len);
631 }
632 *f_pos = offset = i;
633 *f_version = inode->i_version;
634 }
635
636 de = (struct ocfs2_dir_entry *) (data->id_data + *f_pos);
637 if (!ocfs2_check_dir_entry(inode, de, di_bh, *f_pos)) {
638 /* On error, skip the f_pos to the end. */
639 *f_pos = i_size_read(inode);
640 goto out;
641 }
642 offset += le16_to_cpu(de->rec_len);
643 if (le64_to_cpu(de->inode)) {
644 /* We might block in the next section
645 * if the data destination is
646 * currently swapped out. So, use a
647 * version stamp to detect whether or
648 * not the directory has been modified
649 * during the copy operation.
650 */
651 unsigned long version = *f_version;
652 unsigned char d_type = DT_UNKNOWN;
653
654 if (de->file_type < OCFS2_FT_MAX)
655 d_type = ocfs2_filetype_table[de->file_type];
656
657 filldir_ret = filldir(priv, de->name,
658 de->name_len,
659 *f_pos,
660 le64_to_cpu(de->inode),
661 d_type);
662 if (filldir_ret) {
663 if (filldir_err)
664 *filldir_err = filldir_ret;
665 break;
666 }
667 if (version != *f_version)
668 goto revalidate;
669 }
670 *f_pos += le16_to_cpu(de->rec_len);
671 }
672
673out:
674 brelse(di_bh);
675
676 return 0;
677}
678
679static int ocfs2_dir_foreach_blk_el(struct inode *inode,
680 unsigned long *f_version,
681 loff_t *f_pos, void *priv,
682 filldir_t filldir, int *filldir_err)
75{ 683{
76 int error = 0; 684 int error = 0;
77 unsigned long offset, blk, last_ra_blk = 0; 685 unsigned long offset, blk, last_ra_blk = 0;
@@ -79,45 +687,23 @@ int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
79 struct buffer_head * bh, * tmp; 687 struct buffer_head * bh, * tmp;
80 struct ocfs2_dir_entry * de; 688 struct ocfs2_dir_entry * de;
81 int err; 689 int err;
82 struct inode *inode = filp->f_path.dentry->d_inode;
83 struct super_block * sb = inode->i_sb; 690 struct super_block * sb = inode->i_sb;
84 unsigned int ra_sectors = 16; 691 unsigned int ra_sectors = 16;
85 int lock_level = 0;
86
87 mlog_entry("dirino=%llu\n",
88 (unsigned long long)OCFS2_I(inode)->ip_blkno);
89 692
90 stored = 0; 693 stored = 0;
91 bh = NULL; 694 bh = NULL;
92 695
93 error = ocfs2_meta_lock_atime(inode, filp->f_vfsmnt, &lock_level); 696 offset = (*f_pos) & (sb->s_blocksize - 1);
94 if (lock_level && error >= 0) {
95 /* We release EX lock which used to update atime
96 * and get PR lock again to reduce contention
97 * on commonly accessed directories. */
98 ocfs2_meta_unlock(inode, 1);
99 lock_level = 0;
100 error = ocfs2_meta_lock(inode, NULL, 0);
101 }
102 if (error < 0) {
103 if (error != -ENOENT)
104 mlog_errno(error);
105 /* we haven't got any yet, so propagate the error. */
106 stored = error;
107 goto bail_nolock;
108 }
109 697
110 offset = filp->f_pos & (sb->s_blocksize - 1); 698 while (!error && !stored && *f_pos < i_size_read(inode)) {
111 699 blk = (*f_pos) >> sb->s_blocksize_bits;
112 while (!error && !stored && filp->f_pos < i_size_read(inode)) {
113 blk = (filp->f_pos) >> sb->s_blocksize_bits;
114 bh = ocfs2_bread(inode, blk, &err, 0); 700 bh = ocfs2_bread(inode, blk, &err, 0);
115 if (!bh) { 701 if (!bh) {
116 mlog(ML_ERROR, 702 mlog(ML_ERROR,
117 "directory #%llu contains a hole at offset %lld\n", 703 "directory #%llu contains a hole at offset %lld\n",
118 (unsigned long long)OCFS2_I(inode)->ip_blkno, 704 (unsigned long long)OCFS2_I(inode)->ip_blkno,
119 filp->f_pos); 705 *f_pos);
120 filp->f_pos += sb->s_blocksize - offset; 706 *f_pos += sb->s_blocksize - offset;
121 continue; 707 continue;
122 } 708 }
123 709
@@ -143,7 +729,7 @@ revalidate:
143 * readdir(2), then we might be pointing to an invalid 729 * readdir(2), then we might be pointing to an invalid
144 * dirent right now. Scan from the start of the block 730 * dirent right now. Scan from the start of the block
145 * to make sure. */ 731 * to make sure. */
146 if (filp->f_version != inode->i_version) { 732 if (*f_version != inode->i_version) {
147 for (i = 0; i < sb->s_blocksize && i < offset; ) { 733 for (i = 0; i < sb->s_blocksize && i < offset; ) {
148 de = (struct ocfs2_dir_entry *) (bh->b_data + i); 734 de = (struct ocfs2_dir_entry *) (bh->b_data + i);
149 /* It's too expensive to do a full 735 /* It's too expensive to do a full
@@ -158,21 +744,20 @@ revalidate:
158 i += le16_to_cpu(de->rec_len); 744 i += le16_to_cpu(de->rec_len);
159 } 745 }
160 offset = i; 746 offset = i;
161 filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1)) 747 *f_pos = ((*f_pos) & ~(sb->s_blocksize - 1))
162 | offset; 748 | offset;
163 filp->f_version = inode->i_version; 749 *f_version = inode->i_version;
164 } 750 }
165 751
166 while (!error && filp->f_pos < i_size_read(inode) 752 while (!error && *f_pos < i_size_read(inode)
167 && offset < sb->s_blocksize) { 753 && offset < sb->s_blocksize) {
168 de = (struct ocfs2_dir_entry *) (bh->b_data + offset); 754 de = (struct ocfs2_dir_entry *) (bh->b_data + offset);
169 if (!ocfs2_check_dir_entry(inode, de, bh, offset)) { 755 if (!ocfs2_check_dir_entry(inode, de, bh, offset)) {
170 /* On error, skip the f_pos to the 756 /* On error, skip the f_pos to the
171 next block. */ 757 next block. */
172 filp->f_pos = (filp->f_pos | 758 *f_pos = ((*f_pos) | (sb->s_blocksize - 1)) + 1;
173 (sb->s_blocksize - 1)) + 1;
174 brelse(bh); 759 brelse(bh);
175 goto bail; 760 goto out;
176 } 761 }
177 offset += le16_to_cpu(de->rec_len); 762 offset += le16_to_cpu(de->rec_len);
178 if (le64_to_cpu(de->inode)) { 763 if (le64_to_cpu(de->inode)) {
@@ -183,36 +768,109 @@ revalidate:
183 * not the directory has been modified 768 * not the directory has been modified
184 * during the copy operation. 769 * during the copy operation.
185 */ 770 */
186 unsigned long version = filp->f_version; 771 unsigned long version = *f_version;
187 unsigned char d_type = DT_UNKNOWN; 772 unsigned char d_type = DT_UNKNOWN;
188 773
189 if (de->file_type < OCFS2_FT_MAX) 774 if (de->file_type < OCFS2_FT_MAX)
190 d_type = ocfs2_filetype_table[de->file_type]; 775 d_type = ocfs2_filetype_table[de->file_type];
191 error = filldir(dirent, de->name, 776 error = filldir(priv, de->name,
192 de->name_len, 777 de->name_len,
193 filp->f_pos, 778 *f_pos,
194 ino_from_blkno(sb, le64_to_cpu(de->inode)), 779 le64_to_cpu(de->inode),
195 d_type); 780 d_type);
196 if (error) 781 if (error) {
782 if (filldir_err)
783 *filldir_err = error;
197 break; 784 break;
198 if (version != filp->f_version) 785 }
786 if (version != *f_version)
199 goto revalidate; 787 goto revalidate;
200 stored ++; 788 stored ++;
201 } 789 }
202 filp->f_pos += le16_to_cpu(de->rec_len); 790 *f_pos += le16_to_cpu(de->rec_len);
203 } 791 }
204 offset = 0; 792 offset = 0;
205 brelse(bh); 793 brelse(bh);
206 } 794 }
207 795
208 stored = 0; 796 stored = 0;
209bail: 797out:
798 return stored;
799}
800
801static int ocfs2_dir_foreach_blk(struct inode *inode, unsigned long *f_version,
802 loff_t *f_pos, void *priv, filldir_t filldir,
803 int *filldir_err)
804{
805 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
806 return ocfs2_dir_foreach_blk_id(inode, f_version, f_pos, priv,
807 filldir, filldir_err);
808
809 return ocfs2_dir_foreach_blk_el(inode, f_version, f_pos, priv, filldir,
810 filldir_err);
811}
812
813/*
814 * This is intended to be called from inside other kernel functions,
815 * so we fake some arguments.
816 */
817int ocfs2_dir_foreach(struct inode *inode, loff_t *f_pos, void *priv,
818 filldir_t filldir)
819{
820 int ret = 0, filldir_err = 0;
821 unsigned long version = inode->i_version;
822
823 while (*f_pos < i_size_read(inode)) {
824 ret = ocfs2_dir_foreach_blk(inode, &version, f_pos, priv,
825 filldir, &filldir_err);
826 if (ret || filldir_err)
827 break;
828 }
829
830 if (ret > 0)
831 ret = -EIO;
832
833 return 0;
834}
835
836/*
837 * ocfs2_readdir()
838 *
839 */
840int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
841{
842 int error = 0;
843 struct inode *inode = filp->f_path.dentry->d_inode;
844 int lock_level = 0;
845
846 mlog_entry("dirino=%llu\n",
847 (unsigned long long)OCFS2_I(inode)->ip_blkno);
848
849 error = ocfs2_meta_lock_atime(inode, filp->f_vfsmnt, &lock_level);
850 if (lock_level && error >= 0) {
851 /* We release EX lock which used to update atime
852 * and get PR lock again to reduce contention
853 * on commonly accessed directories. */
854 ocfs2_meta_unlock(inode, 1);
855 lock_level = 0;
856 error = ocfs2_meta_lock(inode, NULL, 0);
857 }
858 if (error < 0) {
859 if (error != -ENOENT)
860 mlog_errno(error);
861 /* we haven't got any yet, so propagate the error. */
862 goto bail_nolock;
863 }
864
865 error = ocfs2_dir_foreach_blk(inode, &filp->f_version, &filp->f_pos,
866 dirent, filldir, NULL);
867
210 ocfs2_meta_unlock(inode, lock_level); 868 ocfs2_meta_unlock(inode, lock_level);
211 869
212bail_nolock: 870bail_nolock:
213 mlog_exit(stored); 871 mlog_exit(error);
214 872
215 return stored; 873 return error;
216} 874}
217 875
218/* 876/*
@@ -252,6 +910,23 @@ leave:
252 return status; 910 return status;
253} 911}
254 912
913/*
914 * Convenience function for callers which just want the block number
915 * mapped to a name and don't require the full dirent info, etc.
916 */
917int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name,
918 int namelen, u64 *blkno)
919{
920 int ret;
921 struct buffer_head *bh = NULL;
922 struct ocfs2_dir_entry *dirent = NULL;
923
924 ret = ocfs2_find_files_on_disk(name, namelen, blkno, dir, &bh, &dirent);
925 brelse(bh);
926
927 return ret;
928}
929
255/* Check for a name within a directory. 930/* Check for a name within a directory.
256 * 931 *
257 * Return 0 if the name does not exist 932 * Return 0 if the name does not exist
@@ -284,77 +959,414 @@ bail:
284 return ret; 959 return ret;
285} 960}
286 961
962struct ocfs2_empty_dir_priv {
963 unsigned seen_dot;
964 unsigned seen_dot_dot;
965 unsigned seen_other;
966};
967static int ocfs2_empty_dir_filldir(void *priv, const char *name, int name_len,
968 loff_t pos, u64 ino, unsigned type)
969{
970 struct ocfs2_empty_dir_priv *p = priv;
971
972 /*
973 * Check the positions of "." and ".." records to be sure
974 * they're in the correct place.
975 */
976 if (name_len == 1 && !strncmp(".", name, 1) && pos == 0) {
977 p->seen_dot = 1;
978 return 0;
979 }
980
981 if (name_len == 2 && !strncmp("..", name, 2) &&
982 pos == OCFS2_DIR_REC_LEN(1)) {
983 p->seen_dot_dot = 1;
984 return 0;
985 }
986
987 p->seen_other = 1;
988 return 1;
989}
287/* 990/*
288 * routine to check that the specified directory is empty (for rmdir) 991 * routine to check that the specified directory is empty (for rmdir)
992 *
993 * Returns 1 if dir is empty, zero otherwise.
289 */ 994 */
290int ocfs2_empty_dir(struct inode *inode) 995int ocfs2_empty_dir(struct inode *inode)
291{ 996{
292 unsigned long offset; 997 int ret;
293 struct buffer_head * bh; 998 loff_t start = 0;
294 struct ocfs2_dir_entry * de, * de1; 999 struct ocfs2_empty_dir_priv priv;
295 struct super_block * sb; 1000
296 int err; 1001 memset(&priv, 0, sizeof(priv));
1002
1003 ret = ocfs2_dir_foreach(inode, &start, &priv, ocfs2_empty_dir_filldir);
1004 if (ret)
1005 mlog_errno(ret);
297 1006
298 sb = inode->i_sb; 1007 if (!priv.seen_dot || !priv.seen_dot_dot) {
299 if ((i_size_read(inode) < 1008 mlog(ML_ERROR, "bad directory (dir #%llu) - no `.' or `..'\n",
300 (OCFS2_DIR_REC_LEN(1) + OCFS2_DIR_REC_LEN(2))) ||
301 !(bh = ocfs2_bread(inode, 0, &err, 0))) {
302 mlog(ML_ERROR, "bad directory (dir #%llu) - no data block\n",
303 (unsigned long long)OCFS2_I(inode)->ip_blkno); 1009 (unsigned long long)OCFS2_I(inode)->ip_blkno);
1010 /*
1011 * XXX: Is it really safe to allow an unlink to continue?
1012 */
304 return 1; 1013 return 1;
305 } 1014 }
306 1015
307 de = (struct ocfs2_dir_entry *) bh->b_data; 1016 return !priv.seen_other;
308 de1 = (struct ocfs2_dir_entry *) 1017}
309 ((char *)de + le16_to_cpu(de->rec_len)); 1018
310 if ((le64_to_cpu(de->inode) != OCFS2_I(inode)->ip_blkno) || 1019static void ocfs2_fill_initial_dirents(struct inode *inode,
311 !le64_to_cpu(de1->inode) || 1020 struct inode *parent,
312 strcmp(".", de->name) || 1021 char *start, unsigned int size)
313 strcmp("..", de1->name)) { 1022{
314 mlog(ML_ERROR, "bad directory (dir #%llu) - no `.' or `..'\n", 1023 struct ocfs2_dir_entry *de = (struct ocfs2_dir_entry *)start;
315 (unsigned long long)OCFS2_I(inode)->ip_blkno); 1024
316 brelse(bh); 1025 de->inode = cpu_to_le64(OCFS2_I(inode)->ip_blkno);
317 return 1; 1026 de->name_len = 1;
1027 de->rec_len =
1028 cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len));
1029 strcpy(de->name, ".");
1030 ocfs2_set_de_type(de, S_IFDIR);
1031
1032 de = (struct ocfs2_dir_entry *) ((char *)de + le16_to_cpu(de->rec_len));
1033 de->inode = cpu_to_le64(OCFS2_I(parent)->ip_blkno);
1034 de->rec_len = cpu_to_le16(size - OCFS2_DIR_REC_LEN(1));
1035 de->name_len = 2;
1036 strcpy(de->name, "..");
1037 ocfs2_set_de_type(de, S_IFDIR);
1038}
1039
1040/*
1041 * This works together with code in ocfs2_mknod_locked() which sets
1042 * the inline-data flag and initializes the inline-data section.
1043 */
1044static int ocfs2_fill_new_dir_id(struct ocfs2_super *osb,
1045 handle_t *handle,
1046 struct inode *parent,
1047 struct inode *inode,
1048 struct buffer_head *di_bh)
1049{
1050 int ret;
1051 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
1052 struct ocfs2_inline_data *data = &di->id2.i_data;
1053 unsigned int size = le16_to_cpu(data->id_count);
1054
1055 ret = ocfs2_journal_access(handle, inode, di_bh,
1056 OCFS2_JOURNAL_ACCESS_WRITE);
1057 if (ret) {
1058 mlog_errno(ret);
1059 goto out;
318 } 1060 }
319 offset = le16_to_cpu(de->rec_len) + le16_to_cpu(de1->rec_len); 1061
320 de = (struct ocfs2_dir_entry *)((char *)de1 + le16_to_cpu(de1->rec_len)); 1062 ocfs2_fill_initial_dirents(inode, parent, data->id_data, size);
321 while (offset < i_size_read(inode) ) { 1063
322 if (!bh || (void *)de >= (void *)(bh->b_data + sb->s_blocksize)) { 1064 ocfs2_journal_dirty(handle, di_bh);
323 brelse(bh); 1065 if (ret) {
324 bh = ocfs2_bread(inode, 1066 mlog_errno(ret);
325 offset >> sb->s_blocksize_bits, &err, 0); 1067 goto out;
326 if (!bh) { 1068 }
327 mlog(ML_ERROR, "dir %llu has a hole at %lu\n", 1069
328 (unsigned long long)OCFS2_I(inode)->ip_blkno, offset); 1070 i_size_write(inode, size);
329 offset += sb->s_blocksize; 1071 inode->i_nlink = 2;
330 continue; 1072 inode->i_blocks = ocfs2_inode_sector_count(inode);
331 } 1073
332 de = (struct ocfs2_dir_entry *) bh->b_data; 1074 ret = ocfs2_mark_inode_dirty(handle, inode, di_bh);
333 } 1075 if (ret < 0)
334 if (!ocfs2_check_dir_entry(inode, de, bh, offset)) { 1076 mlog_errno(ret);
335 brelse(bh); 1077
336 return 1; 1078out:
1079 return ret;
1080}
1081
1082static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
1083 handle_t *handle,
1084 struct inode *parent,
1085 struct inode *inode,
1086 struct buffer_head *fe_bh,
1087 struct ocfs2_alloc_context *data_ac)
1088{
1089 int status;
1090 struct buffer_head *new_bh = NULL;
1091
1092 mlog_entry_void();
1093
1094 status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh,
1095 data_ac, NULL, &new_bh);
1096 if (status < 0) {
1097 mlog_errno(status);
1098 goto bail;
1099 }
1100
1101 ocfs2_set_new_buffer_uptodate(inode, new_bh);
1102
1103 status = ocfs2_journal_access(handle, inode, new_bh,
1104 OCFS2_JOURNAL_ACCESS_CREATE);
1105 if (status < 0) {
1106 mlog_errno(status);
1107 goto bail;
1108 }
1109 memset(new_bh->b_data, 0, osb->sb->s_blocksize);
1110
1111 ocfs2_fill_initial_dirents(inode, parent, new_bh->b_data,
1112 osb->sb->s_blocksize);
1113
1114 status = ocfs2_journal_dirty(handle, new_bh);
1115 if (status < 0) {
1116 mlog_errno(status);
1117 goto bail;
1118 }
1119
1120 i_size_write(inode, inode->i_sb->s_blocksize);
1121 inode->i_nlink = 2;
1122 inode->i_blocks = ocfs2_inode_sector_count(inode);
1123 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
1124 if (status < 0) {
1125 mlog_errno(status);
1126 goto bail;
1127 }
1128
1129 status = 0;
1130bail:
1131 if (new_bh)
1132 brelse(new_bh);
1133
1134 mlog_exit(status);
1135 return status;
1136}
1137
1138int ocfs2_fill_new_dir(struct ocfs2_super *osb,
1139 handle_t *handle,
1140 struct inode *parent,
1141 struct inode *inode,
1142 struct buffer_head *fe_bh,
1143 struct ocfs2_alloc_context *data_ac)
1144{
1145 BUG_ON(!ocfs2_supports_inline_data(osb) && data_ac == NULL);
1146
1147 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
1148 return ocfs2_fill_new_dir_id(osb, handle, parent, inode, fe_bh);
1149
1150 return ocfs2_fill_new_dir_el(osb, handle, parent, inode, fe_bh,
1151 data_ac);
1152}
1153
1154static void ocfs2_expand_last_dirent(char *start, unsigned int old_size,
1155 unsigned int new_size)
1156{
1157 struct ocfs2_dir_entry *de;
1158 struct ocfs2_dir_entry *prev_de;
1159 char *de_buf, *limit;
1160 unsigned int bytes = new_size - old_size;
1161
1162 limit = start + old_size;
1163 de_buf = start;
1164 de = (struct ocfs2_dir_entry *)de_buf;
1165 do {
1166 prev_de = de;
1167 de_buf += le16_to_cpu(de->rec_len);
1168 de = (struct ocfs2_dir_entry *)de_buf;
1169 } while (de_buf < limit);
1170
1171 le16_add_cpu(&prev_de->rec_len, bytes);
1172}
1173
1174/*
1175 * We allocate enough clusters to fulfill "blocks_wanted", but set
1176 * i_size to exactly one block. Ocfs2_extend_dir() will handle the
1177 * rest automatically for us.
1178 *
1179 * *first_block_bh is a pointer to the 1st data block allocated to the
1180 * directory.
1181 */
1182static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1183 unsigned int blocks_wanted,
1184 struct buffer_head **first_block_bh)
1185{
1186 int ret, credits = OCFS2_INLINE_TO_EXTENTS_CREDITS;
1187 u32 alloc, bit_off, len;
1188 struct super_block *sb = dir->i_sb;
1189 u64 blkno, bytes = blocks_wanted << sb->s_blocksize_bits;
1190 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
1191 struct ocfs2_inode_info *oi = OCFS2_I(dir);
1192 struct ocfs2_alloc_context *data_ac;
1193 struct buffer_head *dirdata_bh = NULL;
1194 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
1195 handle_t *handle;
1196
1197 alloc = ocfs2_clusters_for_bytes(sb, bytes);
1198
1199 /*
1200 * We should never need more than 2 clusters for this -
1201 * maximum dirent size is far less than one block. In fact,
1202 * the only time we'd need more than one cluster is if
1203 * blocksize == clustersize and the dirent won't fit in the
1204 * extra space that the expansion to a single block gives. As
1205 * of today, that only happens on 4k/4k file systems.
1206 */
1207 BUG_ON(alloc > 2);
1208
1209 ret = ocfs2_reserve_clusters(osb, alloc, &data_ac);
1210 if (ret) {
1211 mlog_errno(ret);
1212 goto out;
1213 }
1214
1215 down_write(&oi->ip_alloc_sem);
1216
1217 /*
1218	 * Prepare for worst case allocation scenario of two separate
1219 * extents.
1220 */
1221 if (alloc == 2)
1222 credits += OCFS2_SUBALLOC_ALLOC;
1223
1224 handle = ocfs2_start_trans(osb, credits);
1225 if (IS_ERR(handle)) {
1226 ret = PTR_ERR(handle);
1227 mlog_errno(ret);
1228 goto out_sem;
1229 }
1230
1231 /*
1232 * Try to claim as many clusters as the bitmap can give though
1233 * if we only get one now, that's enough to continue. The rest
1234 * will be claimed after the conversion to extents.
1235 */
1236 ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, &len);
1237 if (ret) {
1238 mlog_errno(ret);
1239 goto out_commit;
1240 }
1241
1242 /*
1243 * Operations are carefully ordered so that we set up the new
1244 * data block first. The conversion from inline data to
1245 * extents follows.
1246 */
1247 blkno = ocfs2_clusters_to_blocks(dir->i_sb, bit_off);
1248 dirdata_bh = sb_getblk(sb, blkno);
1249 if (!dirdata_bh) {
1250 ret = -EIO;
1251 mlog_errno(ret);
1252 goto out_commit;
1253 }
1254
1255 ocfs2_set_new_buffer_uptodate(dir, dirdata_bh);
1256
1257 ret = ocfs2_journal_access(handle, dir, dirdata_bh,
1258 OCFS2_JOURNAL_ACCESS_CREATE);
1259 if (ret) {
1260 mlog_errno(ret);
1261 goto out_commit;
1262 }
1263
1264 memcpy(dirdata_bh->b_data, di->id2.i_data.id_data, i_size_read(dir));
1265 memset(dirdata_bh->b_data + i_size_read(dir), 0,
1266 sb->s_blocksize - i_size_read(dir));
1267 ocfs2_expand_last_dirent(dirdata_bh->b_data, i_size_read(dir),
1268 sb->s_blocksize);
1269
1270 ret = ocfs2_journal_dirty(handle, dirdata_bh);
1271 if (ret) {
1272 mlog_errno(ret);
1273 goto out_commit;
1274 }
1275
1276 /*
1277 * Set extent, i_size, etc on the directory. After this, the
1278 * inode should contain the same exact dirents as before and
1279 * be fully accessible from system calls.
1280 *
1281 * We let the later dirent insert modify c/mtime - to the user
1282 * the data hasn't changed.
1283 */
1284 ret = ocfs2_journal_access(handle, dir, di_bh,
1285 OCFS2_JOURNAL_ACCESS_CREATE);
1286 if (ret) {
1287 mlog_errno(ret);
1288 goto out_commit;
1289 }
1290
1291 spin_lock(&oi->ip_lock);
1292 oi->ip_dyn_features &= ~OCFS2_INLINE_DATA_FL;
1293 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
1294 spin_unlock(&oi->ip_lock);
1295
1296 ocfs2_dinode_new_extent_list(dir, di);
1297
1298 i_size_write(dir, sb->s_blocksize);
1299 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
1300
1301 di->i_size = cpu_to_le64(sb->s_blocksize);
1302 di->i_ctime = di->i_mtime = cpu_to_le64(dir->i_ctime.tv_sec);
1303 di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(dir->i_ctime.tv_nsec);
1304 dir->i_blocks = ocfs2_inode_sector_count(dir);
1305
1306 /*
1307 * This should never fail as our extent list is empty and all
1308 * related blocks have been journaled already.
1309 */
1310 ret = ocfs2_insert_extent(osb, handle, dir, di_bh, 0, blkno, len, 0,
1311 NULL);
1312 if (ret) {
1313 mlog_errno(ret);
1314 goto out;
1315 }
1316
1317 ret = ocfs2_journal_dirty(handle, di_bh);
1318 if (ret) {
1319 mlog_errno(ret);
1320 goto out_commit;
1321 }
1322
1323 /*
1324 * We asked for two clusters, but only got one in the 1st
1325 * pass. Claim the 2nd cluster as a separate extent.
1326 */
1327 if (alloc > len) {
1328 ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off,
1329 &len);
1330 if (ret) {
1331 mlog_errno(ret);
1332 goto out_commit;
337 } 1333 }
338 if (le64_to_cpu(de->inode)) { 1334 blkno = ocfs2_clusters_to_blocks(dir->i_sb, bit_off);
339 brelse(bh); 1335
340 return 0; 1336 ret = ocfs2_insert_extent(osb, handle, dir, di_bh, 1, blkno,
1337 len, 0, NULL);
1338 if (ret) {
1339 mlog_errno(ret);
1340 goto out;
341 } 1341 }
342 offset += le16_to_cpu(de->rec_len);
343 de = (struct ocfs2_dir_entry *)
344 ((char *)de + le16_to_cpu(de->rec_len));
345 } 1342 }
346 brelse(bh); 1343
347 return 1; 1344 *first_block_bh = dirdata_bh;
1345 dirdata_bh = NULL;
1346
1347out_commit:
1348 ocfs2_commit_trans(osb, handle);
1349
1350out_sem:
1351 up_write(&oi->ip_alloc_sem);
1352
1353out:
1354 if (data_ac)
1355 ocfs2_free_alloc_context(data_ac);
1356
1357 brelse(dirdata_bh);
1358
1359 return ret;
348} 1360}
349 1361
350/* returns a bh of the 1st new block in the allocation. */ 1362/* returns a bh of the 1st new block in the allocation. */
351int ocfs2_do_extend_dir(struct super_block *sb, 1363static int ocfs2_do_extend_dir(struct super_block *sb,
352 handle_t *handle, 1364 handle_t *handle,
353 struct inode *dir, 1365 struct inode *dir,
354 struct buffer_head *parent_fe_bh, 1366 struct buffer_head *parent_fe_bh,
355 struct ocfs2_alloc_context *data_ac, 1367 struct ocfs2_alloc_context *data_ac,
356 struct ocfs2_alloc_context *meta_ac, 1368 struct ocfs2_alloc_context *meta_ac,
357 struct buffer_head **new_bh) 1369 struct buffer_head **new_bh)
358{ 1370{
359 int status; 1371 int status;
360 int extend; 1372 int extend;
@@ -396,10 +1408,18 @@ bail:
396 return status; 1408 return status;
397} 1409}
398 1410
399/* assumes you already have a cluster lock on the directory. */ 1411/*
1412 * Assumes you already have a cluster lock on the directory.
1413 *
1414 * 'blocks_wanted' is only used if we have an inline directory which
1415 * is to be turned into an extent based one. The size of the dirent to
1416 * insert might be larger than the space gained by growing to just one
1417 * block, so we may have to grow the inode by two blocks in that case.
1418 */
400static int ocfs2_extend_dir(struct ocfs2_super *osb, 1419static int ocfs2_extend_dir(struct ocfs2_super *osb,
401 struct inode *dir, 1420 struct inode *dir,
402 struct buffer_head *parent_fe_bh, 1421 struct buffer_head *parent_fe_bh,
1422 unsigned int blocks_wanted,
403 struct buffer_head **new_de_bh) 1423 struct buffer_head **new_de_bh)
404{ 1424{
405 int status = 0; 1425 int status = 0;
@@ -415,6 +1435,38 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
415 1435
416 mlog_entry_void(); 1436 mlog_entry_void();
417 1437
1438 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1439 status = ocfs2_expand_inline_dir(dir, parent_fe_bh,
1440 blocks_wanted, &new_bh);
1441 if (status) {
1442 mlog_errno(status);
1443 goto bail;
1444 }
1445
1446 if (blocks_wanted == 1) {
1447 /*
1448 * If the new dirent will fit inside the space
1449 * created by pushing out to one block, then
1450 * we can complete the operation
1451 * here. Otherwise we have to expand i_size
1452 * and format the 2nd block below.
1453 */
1454 BUG_ON(new_bh == NULL);
1455 goto bail_bh;
1456 }
1457
1458 /*
1459 * Get rid of 'new_bh' - we want to format the 2nd
1460 * data block and return that instead.
1461 */
1462 brelse(new_bh);
1463 new_bh = NULL;
1464
1465 dir_i_size = i_size_read(dir);
1466 credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS;
1467 goto do_extend;
1468 }
1469
418 dir_i_size = i_size_read(dir); 1470 dir_i_size = i_size_read(dir);
419 mlog(0, "extending dir %llu (i_size = %lld)\n", 1471 mlog(0, "extending dir %llu (i_size = %lld)\n",
420 (unsigned long long)OCFS2_I(dir)->ip_blkno, dir_i_size); 1472 (unsigned long long)OCFS2_I(dir)->ip_blkno, dir_i_size);
@@ -452,6 +1504,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
452 credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS; 1504 credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS;
453 } 1505 }
454 1506
1507do_extend:
455 down_write(&OCFS2_I(dir)->ip_alloc_sem); 1508 down_write(&OCFS2_I(dir)->ip_alloc_sem);
456 drop_alloc_sem = 1; 1509 drop_alloc_sem = 1;
457 1510
@@ -497,6 +1550,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
497 goto bail; 1550 goto bail;
498 } 1551 }
499 1552
1553bail_bh:
500 *new_de_bh = new_bh; 1554 *new_de_bh = new_bh;
501 get_bh(*new_de_bh); 1555 get_bh(*new_de_bh);
502bail: 1556bail:
@@ -517,41 +1571,71 @@ bail:
517 return status; 1571 return status;
518} 1572}
519 1573
520/* 1574static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh,
521 * Search the dir for a good spot, extending it if necessary. The 1575 const char *name, int namelen,
522 * block containing an appropriate record is returned in ret_de_bh. 1576 struct buffer_head **ret_de_bh,
523 */ 1577 unsigned int *blocks_wanted)
524int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
525 struct inode *dir,
526 struct buffer_head *parent_fe_bh,
527 const char *name,
528 int namelen,
529 struct buffer_head **ret_de_bh)
530{ 1578{
531 unsigned long offset; 1579 int ret;
532 struct buffer_head * bh = NULL; 1580 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
533 unsigned short rec_len; 1581 struct ocfs2_dir_entry *de, *last_de = NULL;
534 struct ocfs2_dinode *fe; 1582 char *de_buf, *limit;
535 struct ocfs2_dir_entry *de; 1583 unsigned long offset = 0;
536 struct super_block *sb; 1584 unsigned int rec_len, new_rec_len;
537 int status; 1585
1586 de_buf = di->id2.i_data.id_data;
1587 limit = de_buf + i_size_read(dir);
1588 rec_len = OCFS2_DIR_REC_LEN(namelen);
538 1589
539 mlog_entry_void(); 1590 while (de_buf < limit) {
1591 de = (struct ocfs2_dir_entry *)de_buf;
540 1592
541 mlog(0, "getting ready to insert namelen %d into dir %llu\n", 1593 if (!ocfs2_check_dir_entry(dir, de, di_bh, offset)) {
542 namelen, (unsigned long long)OCFS2_I(dir)->ip_blkno); 1594 ret = -ENOENT;
1595 goto out;
1596 }
1597 if (ocfs2_match(namelen, name, de)) {
1598 ret = -EEXIST;
1599 goto out;
1600 }
1601 if (ocfs2_dirent_would_fit(de, rec_len)) {
1602 /* Ok, we found a spot. Return this bh and let
1603 * the caller actually fill it in. */
1604 *ret_de_bh = di_bh;
1605 get_bh(*ret_de_bh);
1606 ret = 0;
1607 goto out;
1608 }
543 1609
544 BUG_ON(!S_ISDIR(dir->i_mode)); 1610 last_de = de;
545 fe = (struct ocfs2_dinode *) parent_fe_bh->b_data; 1611 de_buf += le16_to_cpu(de->rec_len);
546 BUG_ON(le64_to_cpu(fe->i_size) != i_size_read(dir)); 1612 offset += le16_to_cpu(de->rec_len);
1613 }
547 1614
548 sb = dir->i_sb; 1615 /*
1616 * We're going to require expansion of the directory - figure
1617 * out how many blocks we'll need so that a place for the
1618 * dirent can be found.
1619 */
1620 *blocks_wanted = 1;
1621 new_rec_len = le16_to_cpu(last_de->rec_len) + (dir->i_sb->s_blocksize - i_size_read(dir));
1622 if (new_rec_len < (rec_len + OCFS2_DIR_REC_LEN(last_de->name_len)))
1623 *blocks_wanted = 2;
1624
1625 ret = -ENOSPC;
1626out:
1627 return ret;
1628}
549 1629
550 if (!namelen) { 1630static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
551 status = -EINVAL; 1631 int namelen, struct buffer_head **ret_de_bh)
552 mlog_errno(status); 1632{
553 goto bail; 1633 unsigned long offset;
554 } 1634 struct buffer_head *bh = NULL;
1635 unsigned short rec_len;
1636 struct ocfs2_dir_entry *de;
1637 struct super_block *sb = dir->i_sb;
1638 int status;
555 1639
556 bh = ocfs2_bread(dir, 0, &status, 0); 1640 bh = ocfs2_bread(dir, 0, &status, 0);
557 if (!bh) { 1641 if (!bh) {
@@ -568,17 +1652,11 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
568 bh = NULL; 1652 bh = NULL;
569 1653
570 if (i_size_read(dir) <= offset) { 1654 if (i_size_read(dir) <= offset) {
571 status = ocfs2_extend_dir(osb, 1655 /*
572 dir, 1656 * Caller will have to expand this
573 parent_fe_bh, 1657 * directory.
574 &bh); 1658 */
575 if (status < 0) { 1659 status = -ENOSPC;
576 mlog_errno(status);
577 goto bail;
578 }
579 BUG_ON(!bh);
580 *ret_de_bh = bh;
581 get_bh(*ret_de_bh);
582 goto bail; 1660 goto bail;
583 } 1661 }
584 bh = ocfs2_bread(dir, 1662 bh = ocfs2_bread(dir,
@@ -600,10 +1678,7 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
600 status = -EEXIST; 1678 status = -EEXIST;
601 goto bail; 1679 goto bail;
602 } 1680 }
603 if (((le64_to_cpu(de->inode) == 0) && 1681 if (ocfs2_dirent_would_fit(de, rec_len)) {
604 (le16_to_cpu(de->rec_len) >= rec_len)) ||
605 (le16_to_cpu(de->rec_len) >=
606 (OCFS2_DIR_REC_LEN(de->name_len) + rec_len))) {
607 /* Ok, we found a spot. Return this bh and let 1682 /* Ok, we found a spot. Return this bh and let
608 * the caller actually fill it in. */ 1683 * the caller actually fill it in. */
609 *ret_de_bh = bh; 1684 *ret_de_bh = bh;
@@ -623,3 +1698,61 @@ bail:
623 mlog_exit(status); 1698 mlog_exit(status);
624 return status; 1699 return status;
625} 1700}
1701
1702int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
1703 struct inode *dir,
1704 struct buffer_head *parent_fe_bh,
1705 const char *name,
1706 int namelen,
1707 struct buffer_head **ret_de_bh)
1708{
1709 int ret;
1710 unsigned int blocks_wanted = 1;
1711 struct buffer_head *bh = NULL;
1712
1713 mlog(0, "getting ready to insert namelen %d into dir %llu\n",
1714 namelen, (unsigned long long)OCFS2_I(dir)->ip_blkno);
1715
1716 *ret_de_bh = NULL;
1717
1718 if (!namelen) {
1719 ret = -EINVAL;
1720 mlog_errno(ret);
1721 goto out;
1722 }
1723
1724 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1725 ret = ocfs2_find_dir_space_id(dir, parent_fe_bh, name,
1726 namelen, &bh, &blocks_wanted);
1727 } else
1728 ret = ocfs2_find_dir_space_el(dir, name, namelen, &bh);
1729
1730 if (ret && ret != -ENOSPC) {
1731 mlog_errno(ret);
1732 goto out;
1733 }
1734
1735 if (ret == -ENOSPC) {
1736 /*
1737 * We have to expand the directory to add this name.
1738 */
1739 BUG_ON(bh);
1740
1741 ret = ocfs2_extend_dir(osb, dir, parent_fe_bh, blocks_wanted,
1742 &bh);
1743 if (ret) {
1744 if (ret != -ENOSPC)
1745 mlog_errno(ret);
1746 goto out;
1747 }
1748
1749 BUG_ON(!bh);
1750 }
1751
1752 *ret_de_bh = bh;
1753 bh = NULL;
1754out:
1755 if (bh)
1756 brelse(bh);
1757 return ret;
1758}
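
Nearly every insert path in the fs/ocfs2/dir.c changes above funnels through the same test, ocfs2_dirent_would_fit(): a slot can host a new name either because it is unused (inode == 0) and long enough, or because its rec_len carries slack beyond what its own name needs. A small user-space model of that arithmetic, where the 12-byte record header and 4-byte rounding are assumptions about the on-disk format rather than values quoted from it:

#include <stdio.h>

/* Assumed to mirror OCFS2_DIR_REC_LEN(): fixed header plus name, rounded up to 4. */
#define TOY_DIR_REC_LEN(name_len)	(((name_len) + 12 + 3) & ~3U)

struct toy_dirent {
	unsigned long long inode;	/* 0 means the slot is free */
	unsigned int       rec_len;	/* bytes this record spans */
	unsigned int       name_len;
};

static int toy_dirent_would_fit(const struct toy_dirent *de, unsigned int new_rec_len)
{
	if (de->inode == 0 && de->rec_len >= new_rec_len)
		return 1;			/* reuse an empty slot */

	/* Otherwise the record must have slack past its own name. */
	return de->rec_len >= TOY_DIR_REC_LEN(de->name_len) + new_rec_len;
}

int main(void)
{
	/* A 5-character name only needs TOY_DIR_REC_LEN(5) == 20 of its 64 bytes. */
	struct toy_dirent de = { .inode = 42, .rec_len = 64, .name_len = 5 };

	printf("room for a 7-char name?  %d\n",
	       toy_dirent_would_fit(&de, TOY_DIR_REC_LEN(7)));	/* prints 1 */
	printf("room for a 60-char name? %d\n",
	       toy_dirent_would_fit(&de, TOY_DIR_REC_LEN(60)));	/* prints 0 */
	return 0;
}

When no record passes the test, ocfs2_find_dir_space_id() reports through blocks_wanted how far the inline directory must grow, which is the value ocfs2_extend_dir() now receives.
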
diff --git a/fs/ocfs2/dir.h b/fs/ocfs2/dir.h
index 3f67e146864a..ce48b9080d87 100644
--- a/fs/ocfs2/dir.h
+++ b/fs/ocfs2/dir.h
@@ -26,17 +26,49 @@
26#ifndef OCFS2_DIR_H 26#ifndef OCFS2_DIR_H
27#define OCFS2_DIR_H 27#define OCFS2_DIR_H
28 28
29struct buffer_head *ocfs2_find_entry(const char *name,
30 int namelen,
31 struct inode *dir,
32 struct ocfs2_dir_entry **res_dir);
33int ocfs2_delete_entry(handle_t *handle,
34 struct inode *dir,
35 struct ocfs2_dir_entry *de_del,
36 struct buffer_head *bh);
37int __ocfs2_add_entry(handle_t *handle,
38 struct inode *dir,
39 const char *name, int namelen,
40 struct inode *inode, u64 blkno,
41 struct buffer_head *parent_fe_bh,
42 struct buffer_head *insert_bh);
43static inline int ocfs2_add_entry(handle_t *handle,
44 struct dentry *dentry,
45 struct inode *inode, u64 blkno,
46 struct buffer_head *parent_fe_bh,
47 struct buffer_head *insert_bh)
48{
49 return __ocfs2_add_entry(handle, dentry->d_parent->d_inode,
50 dentry->d_name.name, dentry->d_name.len,
51 inode, blkno, parent_fe_bh, insert_bh);
52}
53int ocfs2_update_entry(struct inode *dir, handle_t *handle,
54 struct buffer_head *de_bh, struct ocfs2_dir_entry *de,
55 struct inode *new_entry_inode);
56
29int ocfs2_check_dir_for_entry(struct inode *dir, 57int ocfs2_check_dir_for_entry(struct inode *dir,
30 const char *name, 58 const char *name,
31 int namelen); 59 int namelen);
32int ocfs2_empty_dir(struct inode *inode); /* FIXME: to namei.c */ 60int ocfs2_empty_dir(struct inode *inode);
33int ocfs2_find_files_on_disk(const char *name, 61int ocfs2_find_files_on_disk(const char *name,
34 int namelen, 62 int namelen,
35 u64 *blkno, 63 u64 *blkno,
36 struct inode *inode, 64 struct inode *inode,
37 struct buffer_head **dirent_bh, 65 struct buffer_head **dirent_bh,
38 struct ocfs2_dir_entry **dirent); 66 struct ocfs2_dir_entry **dirent);
67int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name,
68 int namelen, u64 *blkno);
39int ocfs2_readdir(struct file *filp, void *dirent, filldir_t filldir); 69int ocfs2_readdir(struct file *filp, void *dirent, filldir_t filldir);
70int ocfs2_dir_foreach(struct inode *inode, loff_t *f_pos, void *priv,
71 filldir_t filldir);
40int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb, 72int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
41 struct inode *dir, 73 struct inode *dir,
42 struct buffer_head *parent_fe_bh, 74 struct buffer_head *parent_fe_bh,
@@ -44,11 +76,11 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
44 int namelen, 76 int namelen,
45 struct buffer_head **ret_de_bh); 77 struct buffer_head **ret_de_bh);
46struct ocfs2_alloc_context; 78struct ocfs2_alloc_context;
47int ocfs2_do_extend_dir(struct super_block *sb, 79int ocfs2_fill_new_dir(struct ocfs2_super *osb,
48 handle_t *handle, 80 handle_t *handle,
49 struct inode *dir, 81 struct inode *parent,
50 struct buffer_head *parent_fe_bh, 82 struct inode *inode,
51 struct ocfs2_alloc_context *data_ac, 83 struct buffer_head *fe_bh,
52 struct ocfs2_alloc_context *meta_ac, 84 struct ocfs2_alloc_context *data_ac);
53 struct buffer_head **new_bh); 85
54#endif /* OCFS2_DIR_H */ 86#endif /* OCFS2_DIR_H */
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index f71250ed166f..41c76ff2fcfb 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -1482,6 +1482,7 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode)
1482 lvb->lvb_imtime_packed = 1482 lvb->lvb_imtime_packed =
1483 cpu_to_be64(ocfs2_pack_timespec(&inode->i_mtime)); 1483 cpu_to_be64(ocfs2_pack_timespec(&inode->i_mtime));
1484 lvb->lvb_iattr = cpu_to_be32(oi->ip_attr); 1484 lvb->lvb_iattr = cpu_to_be32(oi->ip_attr);
1485 lvb->lvb_idynfeatures = cpu_to_be16(oi->ip_dyn_features);
1485 lvb->lvb_igeneration = cpu_to_be32(inode->i_generation); 1486 lvb->lvb_igeneration = cpu_to_be32(inode->i_generation);
1486 1487
1487out: 1488out:
@@ -1515,6 +1516,7 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
1515 i_size_write(inode, be64_to_cpu(lvb->lvb_isize)); 1516 i_size_write(inode, be64_to_cpu(lvb->lvb_isize));
1516 1517
1517 oi->ip_attr = be32_to_cpu(lvb->lvb_iattr); 1518 oi->ip_attr = be32_to_cpu(lvb->lvb_iattr);
1519 oi->ip_dyn_features = be16_to_cpu(lvb->lvb_idynfeatures);
1518 ocfs2_set_inode_flags(inode); 1520 ocfs2_set_inode_flags(inode);
1519 1521
1520 /* fast-symlinks are a special case */ 1522 /* fast-symlinks are a special case */
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index 492bad32a8c0..87a785e41205 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -29,12 +29,12 @@
29 29
30#include "dcache.h" 30#include "dcache.h"
31 31
32#define OCFS2_LVB_VERSION 4 32#define OCFS2_LVB_VERSION 5
33 33
34struct ocfs2_meta_lvb { 34struct ocfs2_meta_lvb {
35 __u8 lvb_version; 35 __u8 lvb_version;
36 __u8 lvb_reserved0; 36 __u8 lvb_reserved0;
37 __be16 lvb_reserved1; 37 __be16 lvb_idynfeatures;
38 __be32 lvb_iclusters; 38 __be32 lvb_iclusters;
39 __be32 lvb_iuid; 39 __be32 lvb_iuid;
40 __be32 lvb_igid; 40 __be32 lvb_igid;
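
The dlmglue.c/dlmglue.h pair above repurposes a reserved __be16 in the meta lock value block to carry ip_dyn_features between nodes, so a peer learns about the inline-data flag without re-reading the inode from disk; bumping OCFS2_LVB_VERSION from 4 to 5 keeps nodes using the old layout from trusting a field they would misread. A toy round-trip of that stash-then-refresh pattern, with invented struct and field names:

#include <arpa/inet.h>	/* htons()/ntohs() for the big-endian wire format */
#include <stdint.h>
#include <stdio.h>

struct toy_lvb {
	uint8_t  version;
	uint8_t  reserved0;
	uint16_t dyn_features;	/* big-endian on the wire; was reserved in v4 */
};

int main(void)
{
	uint16_t local = 0x0001;	/* pretend this is the inline-data feature bit */
	struct toy_lvb lvb = { .version = 5 };

	lvb.dyn_features = htons(local);		/* the "stuff meta lvb" side      */
	uint16_t remote  = ntohs(lvb.dyn_features);	/* the "refresh from lvb" side    */

	printf("features seen by the remote node: 0x%04x\n", remote);
	return 0;
}
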
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index bc48177bd183..c3bbc198f9ce 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -88,8 +88,6 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
88 struct dentry *parent; 88 struct dentry *parent;
89 struct inode *inode; 89 struct inode *inode;
90 struct inode *dir = child->d_inode; 90 struct inode *dir = child->d_inode;
91 struct buffer_head *dirent_bh = NULL;
92 struct ocfs2_dir_entry *dirent;
93 91
94 mlog_entry("(0x%p, '%.*s')\n", child, 92 mlog_entry("(0x%p, '%.*s')\n", child,
95 child->d_name.len, child->d_name.name); 93 child->d_name.len, child->d_name.name);
@@ -105,8 +103,7 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
105 goto bail; 103 goto bail;
106 } 104 }
107 105
108 status = ocfs2_find_files_on_disk("..", 2, &blkno, dir, &dirent_bh, 106 status = ocfs2_lookup_ino_from_name(dir, "..", 2, &blkno);
109 &dirent);
110 if (status < 0) { 107 if (status < 0) {
111 parent = ERR_PTR(-ENOENT); 108 parent = ERR_PTR(-ENOENT);
112 goto bail_unlock; 109 goto bail_unlock;
@@ -131,9 +128,6 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
131bail_unlock: 128bail_unlock:
132 ocfs2_meta_unlock(dir, 0); 129 ocfs2_meta_unlock(dir, 0);
133 130
134 if (dirent_bh)
135 brelse(dirent_bh);
136
137bail: 131bail:
138 mlog_exit_ptr(parent); 132 mlog_exit_ptr(parent);
139 133
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 03c1d365c78b..c58668a326fe 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -387,6 +387,12 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
387 struct ocfs2_extent_rec *rec; 387 struct ocfs2_extent_rec *rec;
388 u32 coff; 388 u32 coff;
389 389
390 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
391 ret = -ERANGE;
392 mlog_errno(ret);
393 goto out;
394 }
395
390 ret = ocfs2_extent_map_lookup(inode, v_cluster, p_cluster, 396 ret = ocfs2_extent_map_lookup(inode, v_cluster, p_cluster,
391 num_clusters, extent_flags); 397 num_clusters, extent_flags);
392 if (ret == 0) 398 if (ret == 0)
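
The -ERANGE check added to ocfs2_get_clusters() above reflects that inline-data inodes have no extent map at all: their contents live in the inode block, so there is no physical cluster to report. Callers that can encounter such inodes are expected to branch on ip_dyn_features first, roughly as in this illustrative wrapper (not part of the patch):

static int ocfs2_map_clusters_checked(struct inode *inode, u32 v_cluster,
				      u32 *p_cluster, u32 *num_clusters,
				      unsigned int *extent_flags)
{
	/* Data is stored in the inode block itself; take the
	 * inline-data path instead of mapping clusters. */
	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
		return -ERANGE;

	return ocfs2_get_clusters(inode, v_cluster, p_cluster,
				  num_clusters, extent_flags);
}
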
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index f3bc3658e7a5..a62b14eb4065 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -397,6 +397,15 @@ static int ocfs2_truncate_file(struct inode *inode,
397 unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1); 397 unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
398 truncate_inode_pages(inode->i_mapping, new_i_size); 398 truncate_inode_pages(inode->i_mapping, new_i_size);
399 399
400 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
401 status = ocfs2_truncate_inline(inode, di_bh, new_i_size,
402 i_size_read(inode), 0);
403 if (status)
404 mlog_errno(status);
405
406 goto bail_unlock_data;
407 }
408
400 /* alright, we're going to need to do a full blown alloc size 409 /* alright, we're going to need to do a full blown alloc size
401 * change. Orphan the inode so that recovery can complete the 410 * change. Orphan the inode so that recovery can complete the
402 * truncate if necessary. This does the task of marking 411 * truncate if necessary. This does the task of marking
@@ -779,25 +788,6 @@ leave:
779 return status; 788 return status;
780} 789}
781 790
782static int ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
783 u32 clusters_to_add, int mark_unwritten)
784{
785 int ret;
786
787 /*
788 * The alloc sem blocks peope in read/write from reading our
789 * allocation until we're done changing it. We depend on
790 * i_mutex to block other extend/truncate calls while we're
791 * here.
792 */
793 down_write(&OCFS2_I(inode)->ip_alloc_sem);
794 ret = __ocfs2_extend_allocation(inode, logical_start, clusters_to_add,
795 mark_unwritten);
796 up_write(&OCFS2_I(inode)->ip_alloc_sem);
797
798 return ret;
799}
800
801/* Some parts of this taken from generic_cont_expand, which turned out 791/* Some parts of this taken from generic_cont_expand, which turned out
802 * to be too fragile to do exactly what we need without us having to 792 * to be too fragile to do exactly what we need without us having to
803 * worry about recursive locking in ->prepare_write() and 793 * worry about recursive locking in ->prepare_write() and
@@ -889,25 +879,48 @@ out:
889 return ret; 879 return ret;
890} 880}
891 881
892/* 882int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, u64 zero_to)
893 * A tail_to_skip value > 0 indicates that we're being called from 883{
894 * ocfs2_file_aio_write(). This has the following implications: 884 int ret;
895 * 885 u32 clusters_to_add;
896 * - we don't want to update i_size 886 struct ocfs2_inode_info *oi = OCFS2_I(inode);
897 * - di_bh will be NULL, which is fine because it's only used in the 887
898 * case where we want to update i_size. 888 clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size);
899 * - ocfs2_zero_extend() will then only be filling the hole created 889 if (clusters_to_add < oi->ip_clusters)
900 * between i_size and the start of the write. 890 clusters_to_add = 0;
901 */ 891 else
892 clusters_to_add -= oi->ip_clusters;
893
894 if (clusters_to_add) {
895 ret = __ocfs2_extend_allocation(inode, oi->ip_clusters,
896 clusters_to_add, 0);
897 if (ret) {
898 mlog_errno(ret);
899 goto out;
900 }
901 }
902
903 /*
904 * Call this even if we don't add any clusters to the tree. We
905 * still need to zero the area between the old i_size and the
906 * new i_size.
907 */
908 ret = ocfs2_zero_extend(inode, zero_to);
909 if (ret < 0)
910 mlog_errno(ret);
911
912out:
913 return ret;
914}
915
902static int ocfs2_extend_file(struct inode *inode, 916static int ocfs2_extend_file(struct inode *inode,
903 struct buffer_head *di_bh, 917 struct buffer_head *di_bh,
904 u64 new_i_size, 918 u64 new_i_size)
905 size_t tail_to_skip)
906{ 919{
907 int ret = 0; 920 int ret = 0, data_locked = 0;
908 u32 clusters_to_add = 0; 921 struct ocfs2_inode_info *oi = OCFS2_I(inode);
909 922
910 BUG_ON(!tail_to_skip && !di_bh); 923 BUG_ON(!di_bh);
911 924
912 /* setattr sometimes calls us like this. */ 925 /* setattr sometimes calls us like this. */
913 if (new_i_size == 0) 926 if (new_i_size == 0)
@@ -917,13 +930,18 @@ static int ocfs2_extend_file(struct inode *inode,
917 goto out; 930 goto out;
918 BUG_ON(new_i_size < i_size_read(inode)); 931 BUG_ON(new_i_size < i_size_read(inode));
919 932
920 if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) { 933 /*
921 BUG_ON(tail_to_skip != 0); 934 * Fall through for converting inline data, even if the fs
935 * supports sparse files.
936 *
937 * The check for inline data here is legal - nobody can add
938 * the feature since we have i_mutex. We must check it again
939 * after acquiring ip_alloc_sem though, as paths like mmap
940 * might have raced us to converting the inode to extents.
941 */
942 if (!(oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
943 && ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
922 goto out_update_size; 944 goto out_update_size;
923 }
924
925 clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) -
926 OCFS2_I(inode)->ip_clusters;
927 945
928 /* 946 /*
929 * protect the pages that ocfs2_zero_extend is going to be 947 * protect the pages that ocfs2_zero_extend is going to be
@@ -937,39 +955,52 @@ static int ocfs2_extend_file(struct inode *inode,
937 mlog_errno(ret); 955 mlog_errno(ret);
938 goto out; 956 goto out;
939 } 957 }
958 data_locked = 1;
959
960 /*
961 * The alloc sem blocks people in read/write from reading our
962 * allocation until we're done changing it. We depend on
963 * i_mutex to block other extend/truncate calls while we're
964 * here.
965 */
966 down_write(&oi->ip_alloc_sem);
967
968 if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
969 /*
970 * We can optimize small extends by keeping the inode's
971 * inline data.
972 */
973 if (ocfs2_size_fits_inline_data(di_bh, new_i_size)) {
974 up_write(&oi->ip_alloc_sem);
975 goto out_update_size;
976 }
977
978 ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
979 if (ret) {
980 up_write(&oi->ip_alloc_sem);
940 981
941 if (clusters_to_add) {
942 ret = ocfs2_extend_allocation(inode,
943 OCFS2_I(inode)->ip_clusters,
944 clusters_to_add, 0);
945 if (ret < 0) {
946 mlog_errno(ret); 982 mlog_errno(ret);
947 goto out_unlock; 983 goto out_unlock;
948 } 984 }
949 } 985 }
950 986
951 /* 987 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
952 * Call this even if we don't add any clusters to the tree. We 988 ret = ocfs2_extend_no_holes(inode, new_i_size, new_i_size);
953 * still need to zero the area between the old i_size and the 989
954 * new i_size. 990 up_write(&oi->ip_alloc_sem);
955 */ 991
956 ret = ocfs2_zero_extend(inode, (u64)new_i_size - tail_to_skip);
957 if (ret < 0) { 992 if (ret < 0) {
958 mlog_errno(ret); 993 mlog_errno(ret);
959 goto out_unlock; 994 goto out_unlock;
960 } 995 }
961 996
962out_update_size: 997out_update_size:
963 if (!tail_to_skip) { 998 ret = ocfs2_simple_size_update(inode, di_bh, new_i_size);
964 /* We're being called from ocfs2_setattr() which wants 999 if (ret < 0)
965 * us to update i_size */ 1000 mlog_errno(ret);
966 ret = ocfs2_simple_size_update(inode, di_bh, new_i_size);
967 if (ret < 0)
968 mlog_errno(ret);
969 }
970 1001
971out_unlock: 1002out_unlock:
972 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) 1003 if (data_locked)
973 ocfs2_data_unlock(inode, 1); 1004 ocfs2_data_unlock(inode, 1);
974 1005
975out: 1006out:
@@ -1035,7 +1066,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1035 if (i_size_read(inode) > attr->ia_size) 1066 if (i_size_read(inode) > attr->ia_size)
1036 status = ocfs2_truncate_file(inode, bh, attr->ia_size); 1067 status = ocfs2_truncate_file(inode, bh, attr->ia_size);
1037 else 1068 else
1038 status = ocfs2_extend_file(inode, bh, attr->ia_size, 0); 1069 status = ocfs2_extend_file(inode, bh, attr->ia_size);
1039 if (status < 0) { 1070 if (status < 0) {
1040 if (status != -ENOSPC) 1071 if (status != -ENOSPC)
1041 mlog_errno(status); 1072 mlog_errno(status);
@@ -1243,6 +1274,31 @@ static int ocfs2_allocate_unwritten_extents(struct inode *inode,
1243{ 1274{
1244 int ret; 1275 int ret;
1245 u32 cpos, phys_cpos, clusters, alloc_size; 1276 u32 cpos, phys_cpos, clusters, alloc_size;
1277 u64 end = start + len;
1278 struct buffer_head *di_bh = NULL;
1279
1280 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1281 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
1282 OCFS2_I(inode)->ip_blkno, &di_bh,
1283 OCFS2_BH_CACHED, inode);
1284 if (ret) {
1285 mlog_errno(ret);
1286 goto out;
1287 }
1288
1289 /*
1290 * Nothing to do if the requested reservation range
1291 * fits within the inode.
1292 */
1293 if (ocfs2_size_fits_inline_data(di_bh, end))
1294 goto out;
1295
1296 ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
1297 if (ret) {
1298 mlog_errno(ret);
1299 goto out;
1300 }
1301 }
1246 1302
1247 /* 1303 /*
1248 * We consider both start and len to be inclusive. 1304 * We consider both start and len to be inclusive.
@@ -1288,6 +1344,8 @@ next:
1288 1344
1289 ret = 0; 1345 ret = 0;
1290out: 1346out:
1347
1348 brelse(di_bh);
1291 return ret; 1349 return ret;
1292} 1350}
1293 1351
@@ -1469,6 +1527,14 @@ static int ocfs2_remove_inode_range(struct inode *inode,
1469 if (byte_len == 0) 1527 if (byte_len == 0)
1470 return 0; 1528 return 0;
1471 1529
1530 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1531 ret = ocfs2_truncate_inline(inode, di_bh, byte_start,
1532 byte_start + byte_len, 1);
1533 if (ret)
1534 mlog_errno(ret);
1535 return ret;
1536 }
1537
1472 trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start); 1538 trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start);
1473 trunc_len = (byte_start + byte_len) >> osb->s_clustersize_bits; 1539 trunc_len = (byte_start + byte_len) >> osb->s_clustersize_bits;
1474 if (trunc_len >= trunc_start) 1540 if (trunc_len >= trunc_start)
@@ -1713,15 +1779,13 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
1713 int appending, 1779 int appending,
1714 int *direct_io) 1780 int *direct_io)
1715{ 1781{
1716 int ret = 0, meta_level = appending; 1782 int ret = 0, meta_level = 0;
1717 struct inode *inode = dentry->d_inode; 1783 struct inode *inode = dentry->d_inode;
1718 u32 clusters; 1784 loff_t saved_pos, end;
1719 loff_t newsize, saved_pos;
1720 1785
1721 /* 1786 /*
1722 * We sample i_size under a read level meta lock to see if our write 1787 * We start with a read level meta lock and only jump to an ex
1723 * is extending the file, if it is we back off and get a write level 1788 * if we need to make modifications here.
1724 * meta lock.
1725 */ 1789 */
1726 for(;;) { 1790 for(;;) {
1727 ret = ocfs2_meta_lock(inode, NULL, meta_level); 1791 ret = ocfs2_meta_lock(inode, NULL, meta_level);
@@ -1763,87 +1827,47 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
1763 saved_pos = *ppos; 1827 saved_pos = *ppos;
1764 } 1828 }
1765 1829
1766 if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) { 1830 end = saved_pos + count;
1767 loff_t end = saved_pos + count;
1768 1831
1769 /* 1832 /*
1770 * Skip the O_DIRECT checks if we don't need 1833 * Skip the O_DIRECT checks if we don't need
1771 * them. 1834 * them.
1772 */ 1835 */
1773 if (!direct_io || !(*direct_io)) 1836 if (!direct_io || !(*direct_io))
1774 break;
1775
1776 /*
1777 * Allowing concurrent direct writes means
1778 * i_size changes wouldn't be synchronized, so
1779 * one node could wind up truncating another
1780 * nodes writes.
1781 */
1782 if (end > i_size_read(inode)) {
1783 *direct_io = 0;
1784 break;
1785 }
1786
1787 /*
1788 * We don't fill holes during direct io, so
1789 * check for them here. If any are found, the
1790 * caller will have to retake some cluster
1791 * locks and initiate the io as buffered.
1792 */
1793 ret = ocfs2_check_range_for_holes(inode, saved_pos,
1794 count);
1795 if (ret == 1) {
1796 *direct_io = 0;
1797 ret = 0;
1798 } else if (ret < 0)
1799 mlog_errno(ret);
1800 break; 1837 break;
1801 }
1802 1838
1803 /* 1839 /*
1804 * The rest of this loop is concerned with legacy file 1840 * There's no sane way to do direct writes to an inode
1805 * systems which don't support sparse files. 1841 * with inline data.
1806 */ 1842 */
1807 1843 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1808 newsize = count + saved_pos; 1844 *direct_io = 0;
1809
1810 mlog(0, "pos=%lld newsize=%lld cursize=%lld\n",
1811 (long long) saved_pos, (long long) newsize,
1812 (long long) i_size_read(inode));
1813
1814 /* No need for a higher level metadata lock if we're
1815 * never going past i_size. */
1816 if (newsize <= i_size_read(inode))
1817 break; 1845 break;
1818
1819 if (meta_level == 0) {
1820 ocfs2_meta_unlock(inode, meta_level);
1821 meta_level = 1;
1822 continue;
1823 } 1846 }
1824 1847
1825 spin_lock(&OCFS2_I(inode)->ip_lock); 1848 /*
1826 clusters = ocfs2_clusters_for_bytes(inode->i_sb, newsize) - 1849 * Allowing concurrent direct writes means
1827 OCFS2_I(inode)->ip_clusters; 1850 * i_size changes wouldn't be synchronized, so
1828 spin_unlock(&OCFS2_I(inode)->ip_lock); 1851 * one node could wind up truncating another
1829 1852 * node's writes.
1830 mlog(0, "Writing at EOF, may need more allocation: " 1853 */
1831 "i_size = %lld, newsize = %lld, need %u clusters\n", 1854 if (end > i_size_read(inode)) {
1832 (long long) i_size_read(inode), (long long) newsize, 1855 *direct_io = 0;
1833 clusters);
1834
1835 /* We only want to continue the rest of this loop if
1836 * our extend will actually require more
1837 * allocation. */
1838 if (!clusters)
1839 break; 1856 break;
1840
1841 ret = ocfs2_extend_file(inode, NULL, newsize, count);
1842 if (ret < 0) {
1843 if (ret != -ENOSPC)
1844 mlog_errno(ret);
1845 goto out_unlock;
1846 } 1857 }
1858
1859 /*
1860 * We don't fill holes during direct io, so
1861 * check for them here. If any are found, the
1862 * caller will have to retake some cluster
1863 * locks and initiate the io as buffered.
1864 */
1865 ret = ocfs2_check_range_for_holes(inode, saved_pos, count);
1866 if (ret == 1) {
1867 *direct_io = 0;
1868 ret = 0;
1869 } else if (ret < 0)
1870 mlog_errno(ret);
1847 break; 1871 break;
1848 } 1872 }
1849 1873
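
A worked example of the sizing arithmetic in the new ocfs2_extend_no_holes() above, assuming a 4KB cluster size (the numbers are illustrative only):

/*
 *   new_i_size                          = 10000 bytes
 *   ocfs2_clusters_for_bytes(sb, 10000) = 3       (rounds up to clusters)
 *   oi->ip_clusters                     = 1
 *   clusters_to_add                     = 3 - 1   = 2
 *
 * If the inode already covers the range (ip_clusters >= 3),
 * clusters_to_add ends up 0 and only ocfs2_zero_extend() runs, since the
 * bytes between the old i_size and the zero_to target still need zeroing.
 */
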
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index 36fe27f268ee..066f14add3a8 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -47,6 +47,8 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
47 struct ocfs2_alloc_context *data_ac, 47 struct ocfs2_alloc_context *data_ac,
48 struct ocfs2_alloc_context *meta_ac, 48 struct ocfs2_alloc_context *meta_ac,
49 enum ocfs2_alloc_restarted *reason_ret); 49 enum ocfs2_alloc_restarted *reason_ret);
50int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size,
51 u64 zero_to);
50int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di, 52int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
51 u32 clusters_to_add, u32 extents_to_split, 53 u32 clusters_to_add, u32 extents_to_split,
52 struct ocfs2_alloc_context **data_ac, 54 struct ocfs2_alloc_context **data_ac,
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index c53a6763bbbe..1d5e0cb0fda1 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -241,6 +241,7 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
241 241
242 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); 242 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
243 OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr); 243 OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr);
244 OCFS2_I(inode)->ip_dyn_features = le16_to_cpu(fe->i_dyn_features);
244 245
245 inode->i_version = 1; 246 inode->i_version = 1;
246 inode->i_generation = le32_to_cpu(fe->i_generation); 247 inode->i_generation = le32_to_cpu(fe->i_generation);
@@ -513,6 +514,10 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
513 514
514 fe = (struct ocfs2_dinode *) fe_bh->b_data; 515 fe = (struct ocfs2_dinode *) fe_bh->b_data;
515 516
517 /*
518 * This check will also skip truncate of inodes with inline
519 * data and fast symlinks.
520 */
516 if (fe->i_clusters) { 521 if (fe->i_clusters) {
517 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 522 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
518 if (IS_ERR(handle)) { 523 if (IS_ERR(handle)) {
@@ -1220,6 +1225,7 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
1220 fe->i_clusters = cpu_to_le32(OCFS2_I(inode)->ip_clusters); 1225 fe->i_clusters = cpu_to_le32(OCFS2_I(inode)->ip_clusters);
1221 ocfs2_get_inode_flags(OCFS2_I(inode)); 1226 ocfs2_get_inode_flags(OCFS2_I(inode));
1222 fe->i_attr = cpu_to_le32(OCFS2_I(inode)->ip_attr); 1227 fe->i_attr = cpu_to_le32(OCFS2_I(inode)->ip_attr);
1228 fe->i_dyn_features = cpu_to_le16(OCFS2_I(inode)->ip_dyn_features);
1223 spin_unlock(&OCFS2_I(inode)->ip_lock); 1229 spin_unlock(&OCFS2_I(inode)->ip_lock);
1224 1230
1225 fe->i_size = cpu_to_le64(i_size_read(inode)); 1231 fe->i_size = cpu_to_le64(i_size_read(inode));
@@ -1257,6 +1263,7 @@ void ocfs2_refresh_inode(struct inode *inode,
1257 1263
1258 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); 1264 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
1259 OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr); 1265 OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr);
1266 OCFS2_I(inode)->ip_dyn_features = le16_to_cpu(fe->i_dyn_features);
1260 ocfs2_set_inode_flags(inode); 1267 ocfs2_set_inode_flags(inode);
1261 i_size_write(inode, le64_to_cpu(fe->i_size)); 1268 i_size_write(inode, le64_to_cpu(fe->i_size));
1262 inode->i_nlink = le16_to_cpu(fe->i_links_count); 1269 inode->i_nlink = le16_to_cpu(fe->i_links_count);
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index a41d0817121b..70e881c55536 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -51,6 +51,7 @@ struct ocfs2_inode_info
51 51
52 u32 ip_flags; /* see below */ 52 u32 ip_flags; /* see below */
53 u32 ip_attr; /* inode attributes */ 53 u32 ip_attr; /* inode attributes */
54 u16 ip_dyn_features;
54 55
55 /* protected by recovery_lock. */ 56 /* protected by recovery_lock. */
56 struct inode *ip_next_orphan; 57 struct inode *ip_next_orphan;
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index dbfb20bb27ea..f9d01e25298d 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -35,13 +35,13 @@
35#include "ocfs2.h" 35#include "ocfs2.h"
36 36
37#include "alloc.h" 37#include "alloc.h"
38#include "dir.h"
38#include "dlmglue.h" 39#include "dlmglue.h"
39#include "extent_map.h" 40#include "extent_map.h"
40#include "heartbeat.h" 41#include "heartbeat.h"
41#include "inode.h" 42#include "inode.h"
42#include "journal.h" 43#include "journal.h"
43#include "localalloc.h" 44#include "localalloc.h"
44#include "namei.h"
45#include "slot_map.h" 45#include "slot_map.h"
46#include "super.h" 46#include "super.h"
47#include "vote.h" 47#include "vote.h"
@@ -1213,17 +1213,49 @@ bail:
1213 return status; 1213 return status;
1214} 1214}
1215 1215
1216struct ocfs2_orphan_filldir_priv {
1217 struct inode *head;
1218 struct ocfs2_super *osb;
1219};
1220
1221static int ocfs2_orphan_filldir(void *priv, const char *name, int name_len,
1222 loff_t pos, u64 ino, unsigned type)
1223{
1224 struct ocfs2_orphan_filldir_priv *p = priv;
1225 struct inode *iter;
1226
1227 if (name_len == 1 && !strncmp(".", name, 1))
1228 return 0;
1229 if (name_len == 2 && !strncmp("..", name, 2))
1230 return 0;
1231
1232 /* Skip bad inodes so that recovery can continue */
1233 iter = ocfs2_iget(p->osb, ino,
1234 OCFS2_FI_FLAG_ORPHAN_RECOVERY);
1235 if (IS_ERR(iter))
1236 return 0;
1237
1238 mlog(0, "queue orphan %llu\n",
1239 (unsigned long long)OCFS2_I(iter)->ip_blkno);
1240 /* No locking is required for the next_orphan queue as there
1241 * is only ever a single process doing orphan recovery. */
1242 OCFS2_I(iter)->ip_next_orphan = p->head;
1243 p->head = iter;
1244
1245 return 0;
1246}
1247
1216static int ocfs2_queue_orphans(struct ocfs2_super *osb, 1248static int ocfs2_queue_orphans(struct ocfs2_super *osb,
1217 int slot, 1249 int slot,
1218 struct inode **head) 1250 struct inode **head)
1219{ 1251{
1220 int status; 1252 int status;
1221 struct inode *orphan_dir_inode = NULL; 1253 struct inode *orphan_dir_inode = NULL;
1222 struct inode *iter; 1254 struct ocfs2_orphan_filldir_priv priv;
1223 unsigned long offset, blk, local; 1255 loff_t pos = 0;
1224 struct buffer_head *bh = NULL; 1256
1225 struct ocfs2_dir_entry *de; 1257 priv.osb = osb;
1226 struct super_block *sb = osb->sb; 1258 priv.head = *head;
1227 1259
1228 orphan_dir_inode = ocfs2_get_system_file_inode(osb, 1260 orphan_dir_inode = ocfs2_get_system_file_inode(osb,
1229 ORPHAN_DIR_SYSTEM_INODE, 1261 ORPHAN_DIR_SYSTEM_INODE,
@@ -1241,77 +1273,15 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
1241 goto out; 1273 goto out;
1242 } 1274 }
1243 1275
1244 offset = 0; 1276 status = ocfs2_dir_foreach(orphan_dir_inode, &pos, &priv,
1245 iter = NULL; 1277 ocfs2_orphan_filldir);
1246 while(offset < i_size_read(orphan_dir_inode)) { 1278 if (status) {
1247 blk = offset >> sb->s_blocksize_bits; 1279 mlog_errno(status);
1248 1280 goto out;
1249 bh = ocfs2_bread(orphan_dir_inode, blk, &status, 0);
1250 if (!bh)
1251 status = -EINVAL;
1252 if (status < 0) {
1253 if (bh)
1254 brelse(bh);
1255 mlog_errno(status);
1256 goto out_unlock;
1257 }
1258
1259 local = 0;
1260 while(offset < i_size_read(orphan_dir_inode)
1261 && local < sb->s_blocksize) {
1262 de = (struct ocfs2_dir_entry *) (bh->b_data + local);
1263
1264 if (!ocfs2_check_dir_entry(orphan_dir_inode,
1265 de, bh, local)) {
1266 status = -EINVAL;
1267 mlog_errno(status);
1268 brelse(bh);
1269 goto out_unlock;
1270 }
1271
1272 local += le16_to_cpu(de->rec_len);
1273 offset += le16_to_cpu(de->rec_len);
1274
1275 /* I guess we silently fail on no inode? */
1276 if (!le64_to_cpu(de->inode))
1277 continue;
1278 if (de->file_type > OCFS2_FT_MAX) {
1279 mlog(ML_ERROR,
1280 "block %llu contains invalid de: "
1281 "inode = %llu, rec_len = %u, "
1282 "name_len = %u, file_type = %u, "
1283 "name='%.*s'\n",
1284 (unsigned long long)bh->b_blocknr,
1285 (unsigned long long)le64_to_cpu(de->inode),
1286 le16_to_cpu(de->rec_len),
1287 de->name_len,
1288 de->file_type,
1289 de->name_len,
1290 de->name);
1291 continue;
1292 }
1293 if (de->name_len == 1 && !strncmp(".", de->name, 1))
1294 continue;
1295 if (de->name_len == 2 && !strncmp("..", de->name, 2))
1296 continue;
1297
1298 iter = ocfs2_iget(osb, le64_to_cpu(de->inode),
1299 OCFS2_FI_FLAG_ORPHAN_RECOVERY);
1300 if (IS_ERR(iter))
1301 continue;
1302
1303 mlog(0, "queue orphan %llu\n",
1304 (unsigned long long)OCFS2_I(iter)->ip_blkno);
1305 /* No locking is required for the next_orphan
1306 * queue as there is only ever a single
1307 * process doing orphan recovery. */
1308 OCFS2_I(iter)->ip_next_orphan = *head;
1309 *head = iter;
1310 }
1311 brelse(bh);
1312 } 1281 }
1313 1282
1314out_unlock: 1283 *head = priv.head;
1284
1315 ocfs2_meta_unlock(orphan_dir_inode, 0); 1285 ocfs2_meta_unlock(orphan_dir_inode, 0);
1316out: 1286out:
1317 mutex_unlock(&orphan_dir_inode->i_mutex); 1287 mutex_unlock(&orphan_dir_inode->i_mutex);
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index ce60aab013aa..4b32e0961568 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -282,6 +282,9 @@ int ocfs2_journal_dirty_data(handle_t *handle,
282 * prev. group desc. if we relink. */ 282 * prev. group desc. if we relink. */
283#define OCFS2_SUBALLOC_ALLOC (3) 283#define OCFS2_SUBALLOC_ALLOC (3)
284 284
285#define OCFS2_INLINE_TO_EXTENTS_CREDITS (OCFS2_SUBALLOC_ALLOC \
286 + OCFS2_INODE_UPDATE_CREDITS)
287
285/* dinode + group descriptor update. We don't relink on free yet. */ 288/* dinode + group descriptor update. We don't relink on free yet. */
286#define OCFS2_SUBALLOC_FREE (2) 289#define OCFS2_SUBALLOC_FREE (2)
287 290
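
OCFS2_INLINE_TO_EXTENTS_CREDITS above simply sums the existing credit constants for a suballocator allocation and a dinode update, so an inline-to-extents conversion can run inside a single handle. A hypothetical sketch of sizing such a transaction (the variable names and out label are illustrative):

handle = ocfs2_start_trans(osb, OCFS2_INLINE_TO_EXTENTS_CREDITS);
if (IS_ERR(handle)) {
	status = PTR_ERR(handle);
	mlog_errno(status);
	goto out;
}
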
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 701e6d04ed5d..729259016c18 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -64,29 +64,6 @@
64 64
65#include "buffer_head_io.h" 65#include "buffer_head_io.h"
66 66
67#define NAMEI_RA_CHUNKS 2
68#define NAMEI_RA_BLOCKS 4
69#define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
70#define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b))
71
72static int inline ocfs2_search_dirblock(struct buffer_head *bh,
73 struct inode *dir,
74 const char *name, int namelen,
75 unsigned long offset,
76 struct ocfs2_dir_entry **res_dir);
77
78static int ocfs2_delete_entry(handle_t *handle,
79 struct inode *dir,
80 struct ocfs2_dir_entry *de_del,
81 struct buffer_head *bh);
82
83static int __ocfs2_add_entry(handle_t *handle,
84 struct inode *dir,
85 const char *name, int namelen,
86 struct inode *inode, u64 blkno,
87 struct buffer_head *parent_fe_bh,
88 struct buffer_head *insert_bh);
89
90static int ocfs2_mknod_locked(struct ocfs2_super *osb, 67static int ocfs2_mknod_locked(struct ocfs2_super *osb,
91 struct inode *dir, 68 struct inode *dir,
92 struct dentry *dentry, int mode, 69 struct dentry *dentry, int mode,
@@ -97,13 +74,6 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
97 struct inode **ret_inode, 74 struct inode **ret_inode,
98 struct ocfs2_alloc_context *inode_ac); 75 struct ocfs2_alloc_context *inode_ac);
99 76
100static int ocfs2_fill_new_dir(struct ocfs2_super *osb,
101 handle_t *handle,
102 struct inode *parent,
103 struct inode *inode,
104 struct buffer_head *fe_bh,
105 struct ocfs2_alloc_context *data_ac);
106
107static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, 77static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
108 struct inode **ret_orphan_dir, 78 struct inode **ret_orphan_dir,
109 struct inode *inode, 79 struct inode *inode,
@@ -123,17 +93,6 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
123 struct inode *inode, 93 struct inode *inode,
124 const char *symname); 94 const char *symname);
125 95
126static inline int ocfs2_add_entry(handle_t *handle,
127 struct dentry *dentry,
128 struct inode *inode, u64 blkno,
129 struct buffer_head *parent_fe_bh,
130 struct buffer_head *insert_bh)
131{
132 return __ocfs2_add_entry(handle, dentry->d_parent->d_inode,
133 dentry->d_name.name, dentry->d_name.len,
134 inode, blkno, parent_fe_bh, insert_bh);
135}
136
137/* An orphan dir name is an 8 byte value, printed as a hex string */ 96/* An orphan dir name is an 8 byte value, printed as a hex string */
138#define OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64))) 97#define OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64)))
139 98
@@ -142,10 +101,8 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
142{ 101{
143 int status; 102 int status;
144 u64 blkno; 103 u64 blkno;
145 struct buffer_head *dirent_bh = NULL;
146 struct inode *inode = NULL; 104 struct inode *inode = NULL;
147 struct dentry *ret; 105 struct dentry *ret;
148 struct ocfs2_dir_entry *dirent;
149 struct ocfs2_inode_info *oi; 106 struct ocfs2_inode_info *oi;
150 107
151 mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry, 108 mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry,
@@ -167,9 +124,8 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
167 goto bail; 124 goto bail;
168 } 125 }
169 126
170 status = ocfs2_find_files_on_disk(dentry->d_name.name, 127 status = ocfs2_lookup_ino_from_name(dir, dentry->d_name.name,
171 dentry->d_name.len, &blkno, 128 dentry->d_name.len, &blkno);
172 dir, &dirent_bh, &dirent);
173 if (status < 0) 129 if (status < 0)
174 goto bail_add; 130 goto bail_add;
175 131
@@ -224,83 +180,12 @@ bail_unlock:
224 ocfs2_meta_unlock(dir, 0); 180 ocfs2_meta_unlock(dir, 0);
225 181
226bail: 182bail:
227 if (dirent_bh)
228 brelse(dirent_bh);
229 183
230 mlog_exit_ptr(ret); 184 mlog_exit_ptr(ret);
231 185
232 return ret; 186 return ret;
233} 187}
234 188
235static int ocfs2_fill_new_dir(struct ocfs2_super *osb,
236 handle_t *handle,
237 struct inode *parent,
238 struct inode *inode,
239 struct buffer_head *fe_bh,
240 struct ocfs2_alloc_context *data_ac)
241{
242 int status;
243 struct buffer_head *new_bh = NULL;
244 struct ocfs2_dir_entry *de = NULL;
245
246 mlog_entry_void();
247
248 status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh,
249 data_ac, NULL, &new_bh);
250 if (status < 0) {
251 mlog_errno(status);
252 goto bail;
253 }
254
255 ocfs2_set_new_buffer_uptodate(inode, new_bh);
256
257 status = ocfs2_journal_access(handle, inode, new_bh,
258 OCFS2_JOURNAL_ACCESS_CREATE);
259 if (status < 0) {
260 mlog_errno(status);
261 goto bail;
262 }
263 memset(new_bh->b_data, 0, osb->sb->s_blocksize);
264
265 de = (struct ocfs2_dir_entry *) new_bh->b_data;
266 de->inode = cpu_to_le64(OCFS2_I(inode)->ip_blkno);
267 de->name_len = 1;
268 de->rec_len =
269 cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len));
270 strcpy(de->name, ".");
271 ocfs2_set_de_type(de, S_IFDIR);
272 de = (struct ocfs2_dir_entry *) ((char *)de + le16_to_cpu(de->rec_len));
273 de->inode = cpu_to_le64(OCFS2_I(parent)->ip_blkno);
274 de->rec_len = cpu_to_le16(inode->i_sb->s_blocksize -
275 OCFS2_DIR_REC_LEN(1));
276 de->name_len = 2;
277 strcpy(de->name, "..");
278 ocfs2_set_de_type(de, S_IFDIR);
279
280 status = ocfs2_journal_dirty(handle, new_bh);
281 if (status < 0) {
282 mlog_errno(status);
283 goto bail;
284 }
285
286 i_size_write(inode, inode->i_sb->s_blocksize);
287 inode->i_nlink = 2;
288 inode->i_blocks = ocfs2_inode_sector_count(inode);
289 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
290 if (status < 0) {
291 mlog_errno(status);
292 goto bail;
293 }
294
295 status = 0;
296bail:
297 if (new_bh)
298 brelse(new_bh);
299
300 mlog_exit(status);
301 return status;
302}
303
304static int ocfs2_mknod(struct inode *dir, 189static int ocfs2_mknod(struct inode *dir,
305 struct dentry *dentry, 190 struct dentry *dentry,
306 int mode, 191 int mode,
@@ -365,9 +250,8 @@ static int ocfs2_mknod(struct inode *dir,
365 goto leave; 250 goto leave;
366 } 251 }
367 252
368 /* are we making a directory? If so, reserve a cluster for his 253 /* Reserve a cluster if creating an extent based directory. */
369 * 1st extent. */ 254 if (S_ISDIR(mode) && !ocfs2_supports_inline_data(osb)) {
370 if (S_ISDIR(mode)) {
371 status = ocfs2_reserve_clusters(osb, 1, &data_ac); 255 status = ocfs2_reserve_clusters(osb, 1, &data_ac);
372 if (status < 0) { 256 if (status < 0) {
373 if (status != -ENOSPC) 257 if (status != -ENOSPC)
@@ -564,10 +448,21 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
564 cpu_to_le32(CURRENT_TIME.tv_nsec); 448 cpu_to_le32(CURRENT_TIME.tv_nsec);
565 fe->i_dtime = 0; 449 fe->i_dtime = 0;
566 450
567 fel = &fe->id2.i_list; 451 /*
568 fel->l_tree_depth = 0; 452 * If supported, directories start with inline data.
569 fel->l_next_free_rec = 0; 453 */
570 fel->l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(osb->sb)); 454 if (S_ISDIR(mode) && ocfs2_supports_inline_data(osb)) {
455 u16 feat = le16_to_cpu(fe->i_dyn_features);
456
457 fe->i_dyn_features = cpu_to_le16(feat | OCFS2_INLINE_DATA_FL);
458
459 fe->id2.i_data.id_count = cpu_to_le16(ocfs2_max_inline_data(osb->sb));
460 } else {
461 fel = &fe->id2.i_list;
462 fel->l_tree_depth = 0;
463 fel->l_next_free_rec = 0;
464 fel->l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(osb->sb));
465 }
571 466
572 status = ocfs2_journal_dirty(handle, *new_fe_bh); 467 status = ocfs2_journal_dirty(handle, *new_fe_bh);
573 if (status < 0) { 468 if (status < 0) {
@@ -1048,11 +943,6 @@ static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2)
1048 ocfs2_meta_unlock(inode2, 1); 943 ocfs2_meta_unlock(inode2, 1);
1049} 944}
1050 945
1051#define PARENT_INO(buffer) \
1052 ((struct ocfs2_dir_entry *) \
1053 ((char *)buffer + \
1054 le16_to_cpu(((struct ocfs2_dir_entry *)buffer)->rec_len)))->inode
1055
1056static int ocfs2_rename(struct inode *old_dir, 946static int ocfs2_rename(struct inode *old_dir,
1057 struct dentry *old_dentry, 947 struct dentry *old_dentry,
1058 struct inode *new_dir, 948 struct inode *new_dir,
@@ -1070,12 +960,12 @@ static int ocfs2_rename(struct inode *old_dir,
1070 struct buffer_head *old_inode_bh = NULL; 960 struct buffer_head *old_inode_bh = NULL;
1071 struct buffer_head *insert_entry_bh = NULL; 961 struct buffer_head *insert_entry_bh = NULL;
1072 struct ocfs2_super *osb = NULL; 962 struct ocfs2_super *osb = NULL;
1073 u64 newfe_blkno; 963 u64 newfe_blkno, old_de_ino;
1074 handle_t *handle = NULL; 964 handle_t *handle = NULL;
1075 struct buffer_head *old_dir_bh = NULL; 965 struct buffer_head *old_dir_bh = NULL;
1076 struct buffer_head *new_dir_bh = NULL; 966 struct buffer_head *new_dir_bh = NULL;
1077 struct ocfs2_dir_entry *old_de = NULL, *new_de = NULL; // dirent for old_dentry 967 struct ocfs2_dir_entry *old_inode_dot_dot_de = NULL, *old_de = NULL,
1078 // and new_dentry 968 *new_de = NULL;
1079 struct buffer_head *new_de_bh = NULL, *old_de_bh = NULL; // bhs for above 969 struct buffer_head *new_de_bh = NULL, *old_de_bh = NULL; // bhs for above
1080 struct buffer_head *old_inode_de_bh = NULL; // if old_dentry is a dir, 970 struct buffer_head *old_inode_de_bh = NULL; // if old_dentry is a dir,
1081 // this is the 1st dirent bh 971 // this is the 1st dirent bh
@@ -1159,27 +1049,35 @@ static int ocfs2_rename(struct inode *old_dir,
1159 } 1049 }
1160 1050
1161 if (S_ISDIR(old_inode->i_mode)) { 1051 if (S_ISDIR(old_inode->i_mode)) {
1162 status = -EIO; 1052 u64 old_inode_parent;
1163 old_inode_de_bh = ocfs2_bread(old_inode, 0, &status, 0); 1053
1164 if (!old_inode_de_bh) 1054 status = ocfs2_find_files_on_disk("..", 2, &old_inode_parent,
1055 old_inode, &old_inode_de_bh,
1056 &old_inode_dot_dot_de);
1057 if (status) {
1058 status = -EIO;
1165 goto bail; 1059 goto bail;
1060 }
1166 1061
1167 status = -EIO; 1062 if (old_inode_parent != OCFS2_I(old_dir)->ip_blkno) {
1168 if (le64_to_cpu(PARENT_INO(old_inode_de_bh->b_data)) != 1063 status = -EIO;
1169 OCFS2_I(old_dir)->ip_blkno)
1170 goto bail; 1064 goto bail;
1171 status = -EMLINK; 1065 }
1172 if (!new_inode && new_dir!=old_dir && 1066
1173 new_dir->i_nlink >= OCFS2_LINK_MAX) 1067 if (!new_inode && new_dir != old_dir &&
1068 new_dir->i_nlink >= OCFS2_LINK_MAX) {
1069 status = -EMLINK;
1174 goto bail; 1070 goto bail;
1071 }
1175 } 1072 }
1176 1073
1177 status = -ENOENT; 1074 status = ocfs2_lookup_ino_from_name(old_dir, old_dentry->d_name.name,
1178 old_de_bh = ocfs2_find_entry(old_dentry->d_name.name, 1075 old_dentry->d_name.len,
1179 old_dentry->d_name.len, 1076 &old_de_ino);
1180 old_dir, &old_de); 1077 if (status) {
1181 if (!old_de_bh) 1078 status = -ENOENT;
1182 goto bail; 1079 goto bail;
1080 }
1183 1081
1184 /* 1082 /*
1185 * Check for inode number is _not_ due to possible IO errors. 1083 * Check for inode number is _not_ due to possible IO errors.
@@ -1187,8 +1085,10 @@ static int ocfs2_rename(struct inode *old_dir,
1187 * and merrily kill the link to whatever was created under the 1085 * and merrily kill the link to whatever was created under the
1188 * same name. Goodbye sticky bit ;-< 1086 * same name. Goodbye sticky bit ;-<
1189 */ 1087 */
1190 if (le64_to_cpu(old_de->inode) != OCFS2_I(old_inode)->ip_blkno) 1088 if (old_de_ino != OCFS2_I(old_inode)->ip_blkno) {
1089 status = -ENOENT;
1191 goto bail; 1090 goto bail;
1091 }
1192 1092
1193 /* check if the target already exists (in which case we need 1093 /* check if the target already exists (in which case we need
1194 * to delete it */ 1094 * to delete it */
@@ -1321,20 +1221,13 @@ static int ocfs2_rename(struct inode *old_dir,
1321 } 1221 }
1322 1222
1323 /* change the dirent to point to the correct inode */ 1223 /* change the dirent to point to the correct inode */
1324 status = ocfs2_journal_access(handle, new_dir, new_de_bh, 1224 status = ocfs2_update_entry(new_dir, handle, new_de_bh,
1325 OCFS2_JOURNAL_ACCESS_WRITE); 1225 new_de, old_inode);
1326 if (status < 0) { 1226 if (status < 0) {
1327 mlog_errno(status); 1227 mlog_errno(status);
1328 goto bail; 1228 goto bail;
1329 } 1229 }
1330 new_de->inode = cpu_to_le64(OCFS2_I(old_inode)->ip_blkno);
1331 new_de->file_type = old_de->file_type;
1332 new_dir->i_version++; 1230 new_dir->i_version++;
1333 status = ocfs2_journal_dirty(handle, new_de_bh);
1334 if (status < 0) {
1335 mlog_errno(status);
1336 goto bail;
1337 }
1338 1231
1339 if (S_ISDIR(new_inode->i_mode)) 1232 if (S_ISDIR(new_inode->i_mode))
1340 newfe->i_links_count = 0; 1233 newfe->i_links_count = 0;
@@ -1370,7 +1263,21 @@ static int ocfs2_rename(struct inode *old_dir,
1370 } else 1263 } else
1371 mlog_errno(status); 1264 mlog_errno(status);
1372 1265
1373 /* now that the name has been added to new_dir, remove the old name */ 1266 /*
1267 * Now that the name has been added to new_dir, remove the old name.
1268 *
1269 * We don't keep any directory entry context around until now
1270 * because the insert might have changed the type of directory
1271 * we're dealing with.
1272 */
1273 old_de_bh = ocfs2_find_entry(old_dentry->d_name.name,
1274 old_dentry->d_name.len,
1275 old_dir, &old_de);
1276 if (!old_de_bh) {
1277 status = -EIO;
1278 goto bail;
1279 }
1280
1374 status = ocfs2_delete_entry(handle, old_dir, old_de, old_de_bh); 1281 status = ocfs2_delete_entry(handle, old_dir, old_de, old_de_bh);
1375 if (status < 0) { 1282 if (status < 0) {
1376 mlog_errno(status); 1283 mlog_errno(status);
@@ -1383,12 +1290,8 @@ static int ocfs2_rename(struct inode *old_dir,
1383 } 1290 }
1384 old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME; 1291 old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME;
1385 if (old_inode_de_bh) { 1292 if (old_inode_de_bh) {
1386 status = ocfs2_journal_access(handle, old_inode, 1293 status = ocfs2_update_entry(old_inode, handle, old_inode_de_bh,
1387 old_inode_de_bh, 1294 old_inode_dot_dot_de, new_dir);
1388 OCFS2_JOURNAL_ACCESS_WRITE);
1389 PARENT_INO(old_inode_de_bh->b_data) =
1390 cpu_to_le64(OCFS2_I(new_dir)->ip_blkno);
1391 status = ocfs2_journal_dirty(handle, old_inode_de_bh);
1392 old_dir->i_nlink--; 1295 old_dir->i_nlink--;
1393 if (new_inode) { 1296 if (new_inode) {
1394 new_inode->i_nlink--; 1297 new_inode->i_nlink--;
@@ -1767,329 +1670,6 @@ bail:
1767 return status; 1670 return status;
1768} 1671}
1769 1672
1770int ocfs2_check_dir_entry(struct inode * dir,
1771 struct ocfs2_dir_entry * de,
1772 struct buffer_head * bh,
1773 unsigned long offset)
1774{
1775 const char *error_msg = NULL;
1776 const int rlen = le16_to_cpu(de->rec_len);
1777
1778 if (rlen < OCFS2_DIR_REC_LEN(1))
1779 error_msg = "rec_len is smaller than minimal";
1780 else if (rlen % 4 != 0)
1781 error_msg = "rec_len % 4 != 0";
1782 else if (rlen < OCFS2_DIR_REC_LEN(de->name_len))
1783 error_msg = "rec_len is too small for name_len";
1784 else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)
1785 error_msg = "directory entry across blocks";
1786
1787 if (error_msg != NULL)
1788 mlog(ML_ERROR, "bad entry in directory #%llu: %s - "
1789 "offset=%lu, inode=%llu, rec_len=%d, name_len=%d\n",
1790 (unsigned long long)OCFS2_I(dir)->ip_blkno, error_msg,
1791 offset, (unsigned long long)le64_to_cpu(de->inode), rlen,
1792 de->name_len);
1793 return error_msg == NULL ? 1 : 0;
1794}
1795
1796/* we don't always have a dentry for what we want to add, so people
1797 * like orphan dir can call this instead.
1798 *
1799 * If you pass me insert_bh, I'll skip the search of the other dir
1800 * blocks and put the record in there.
1801 */
1802static int __ocfs2_add_entry(handle_t *handle,
1803 struct inode *dir,
1804 const char *name, int namelen,
1805 struct inode *inode, u64 blkno,
1806 struct buffer_head *parent_fe_bh,
1807 struct buffer_head *insert_bh)
1808{
1809 unsigned long offset;
1810 unsigned short rec_len;
1811 struct ocfs2_dir_entry *de, *de1;
1812 struct super_block *sb;
1813 int retval, status;
1814
1815 mlog_entry_void();
1816
1817 sb = dir->i_sb;
1818
1819 if (!namelen)
1820 return -EINVAL;
1821
1822 rec_len = OCFS2_DIR_REC_LEN(namelen);
1823 offset = 0;
1824 de = (struct ocfs2_dir_entry *) insert_bh->b_data;
1825 while (1) {
1826 BUG_ON((char *)de >= sb->s_blocksize + insert_bh->b_data);
1827 /* These checks should've already been passed by the
1828 * prepare function, but I guess we can leave them
1829 * here anyway. */
1830 if (!ocfs2_check_dir_entry(dir, de, insert_bh, offset)) {
1831 retval = -ENOENT;
1832 goto bail;
1833 }
1834 if (ocfs2_match(namelen, name, de)) {
1835 retval = -EEXIST;
1836 goto bail;
1837 }
1838 if (((le64_to_cpu(de->inode) == 0) &&
1839 (le16_to_cpu(de->rec_len) >= rec_len)) ||
1840 (le16_to_cpu(de->rec_len) >=
1841 (OCFS2_DIR_REC_LEN(de->name_len) + rec_len))) {
1842 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
1843 retval = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh);
1844 if (retval < 0) {
1845 mlog_errno(retval);
1846 goto bail;
1847 }
1848
1849 status = ocfs2_journal_access(handle, dir, insert_bh,
1850 OCFS2_JOURNAL_ACCESS_WRITE);
1851 /* By now the buffer is marked for journaling */
1852 offset += le16_to_cpu(de->rec_len);
1853 if (le64_to_cpu(de->inode)) {
1854 de1 = (struct ocfs2_dir_entry *)((char *) de +
1855 OCFS2_DIR_REC_LEN(de->name_len));
1856 de1->rec_len =
1857 cpu_to_le16(le16_to_cpu(de->rec_len) -
1858 OCFS2_DIR_REC_LEN(de->name_len));
1859 de->rec_len = cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len));
1860 de = de1;
1861 }
1862 de->file_type = OCFS2_FT_UNKNOWN;
1863 if (blkno) {
1864 de->inode = cpu_to_le64(blkno);
1865 ocfs2_set_de_type(de, inode->i_mode);
1866 } else
1867 de->inode = 0;
1868 de->name_len = namelen;
1869 memcpy(de->name, name, namelen);
1870
1871 dir->i_version++;
1872 status = ocfs2_journal_dirty(handle, insert_bh);
1873 retval = 0;
1874 goto bail;
1875 }
1876 offset += le16_to_cpu(de->rec_len);
1877 de = (struct ocfs2_dir_entry *) ((char *) de + le16_to_cpu(de->rec_len));
1878 }
1879
1880 /* when you think about it, the assert above should prevent us
1881 * from ever getting here. */
1882 retval = -ENOSPC;
1883bail:
1884
1885 mlog_exit(retval);
1886 return retval;
1887}
1888
1889
1890/*
1891 * ocfs2_delete_entry deletes a directory entry by merging it with the
1892 * previous entry
1893 */
1894static int ocfs2_delete_entry(handle_t *handle,
1895 struct inode *dir,
1896 struct ocfs2_dir_entry *de_del,
1897 struct buffer_head *bh)
1898{
1899 struct ocfs2_dir_entry *de, *pde;
1900 int i, status = -ENOENT;
1901
1902 mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p)\n", handle, dir, de_del, bh);
1903
1904 i = 0;
1905 pde = NULL;
1906 de = (struct ocfs2_dir_entry *) bh->b_data;
1907 while (i < bh->b_size) {
1908 if (!ocfs2_check_dir_entry(dir, de, bh, i)) {
1909 status = -EIO;
1910 mlog_errno(status);
1911 goto bail;
1912 }
1913 if (de == de_del) {
1914 status = ocfs2_journal_access(handle, dir, bh,
1915 OCFS2_JOURNAL_ACCESS_WRITE);
1916 if (status < 0) {
1917 status = -EIO;
1918 mlog_errno(status);
1919 goto bail;
1920 }
1921 if (pde)
1922 pde->rec_len =
1923 cpu_to_le16(le16_to_cpu(pde->rec_len) +
1924 le16_to_cpu(de->rec_len));
1925 else
1926 de->inode = 0;
1927 dir->i_version++;
1928 status = ocfs2_journal_dirty(handle, bh);
1929 goto bail;
1930 }
1931 i += le16_to_cpu(de->rec_len);
1932 pde = de;
1933 de = (struct ocfs2_dir_entry *)((char *)de + le16_to_cpu(de->rec_len));
1934 }
1935bail:
1936 mlog_exit(status);
1937 return status;
1938}
1939
1940/*
1941 * Returns 0 if not found, -1 on failure, and 1 on success
1942 */
1943static int inline ocfs2_search_dirblock(struct buffer_head *bh,
1944 struct inode *dir,
1945 const char *name, int namelen,
1946 unsigned long offset,
1947 struct ocfs2_dir_entry **res_dir)
1948{
1949 struct ocfs2_dir_entry *de;
1950 char *dlimit, *de_buf;
1951 int de_len;
1952 int ret = 0;
1953
1954 mlog_entry_void();
1955
1956 de_buf = bh->b_data;
1957 dlimit = de_buf + dir->i_sb->s_blocksize;
1958
1959 while (de_buf < dlimit) {
1960 /* this code is executed quadratically often */
1961 /* do minimal checking `by hand' */
1962
1963 de = (struct ocfs2_dir_entry *) de_buf;
1964
1965 if (de_buf + namelen <= dlimit &&
1966 ocfs2_match(namelen, name, de)) {
1967 /* found a match - just to be sure, do a full check */
1968 if (!ocfs2_check_dir_entry(dir, de, bh, offset)) {
1969 ret = -1;
1970 goto bail;
1971 }
1972 *res_dir = de;
1973 ret = 1;
1974 goto bail;
1975 }
1976
1977 /* prevent looping on a bad block */
1978 de_len = le16_to_cpu(de->rec_len);
1979 if (de_len <= 0) {
1980 ret = -1;
1981 goto bail;
1982 }
1983
1984 de_buf += de_len;
1985 offset += de_len;
1986 }
1987
1988bail:
1989 mlog_exit(ret);
1990 return ret;
1991}
1992
1993struct buffer_head *ocfs2_find_entry(const char *name, int namelen,
1994 struct inode *dir,
1995 struct ocfs2_dir_entry **res_dir)
1996{
1997 struct super_block *sb;
1998 struct buffer_head *bh_use[NAMEI_RA_SIZE];
1999 struct buffer_head *bh, *ret = NULL;
2000 unsigned long start, block, b;
2001 int ra_max = 0; /* Number of bh's in the readahead
2002 buffer, bh_use[] */
2003 int ra_ptr = 0; /* Current index into readahead
2004 buffer */
2005 int num = 0;
2006 int nblocks, i, err;
2007
2008 mlog_entry_void();
2009
2010 *res_dir = NULL;
2011 sb = dir->i_sb;
2012
2013 nblocks = i_size_read(dir) >> sb->s_blocksize_bits;
2014 start = OCFS2_I(dir)->ip_dir_start_lookup;
2015 if (start >= nblocks)
2016 start = 0;
2017 block = start;
2018
2019restart:
2020 do {
2021 /*
2022 * We deal with the read-ahead logic here.
2023 */
2024 if (ra_ptr >= ra_max) {
2025 /* Refill the readahead buffer */
2026 ra_ptr = 0;
2027 b = block;
2028 for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) {
2029 /*
2030 * Terminate if we reach the end of the
2031 * directory and must wrap, or if our
2032 * search has finished at this block.
2033 */
2034 if (b >= nblocks || (num && block == start)) {
2035 bh_use[ra_max] = NULL;
2036 break;
2037 }
2038 num++;
2039
2040 bh = ocfs2_bread(dir, b++, &err, 1);
2041 bh_use[ra_max] = bh;
2042 }
2043 }
2044 if ((bh = bh_use[ra_ptr++]) == NULL)
2045 goto next;
2046 wait_on_buffer(bh);
2047 if (!buffer_uptodate(bh)) {
2048 /* read error, skip block & hope for the best */
2049 ocfs2_error(dir->i_sb, "reading directory %llu, "
2050 "offset %lu\n",
2051 (unsigned long long)OCFS2_I(dir)->ip_blkno,
2052 block);
2053 brelse(bh);
2054 goto next;
2055 }
2056 i = ocfs2_search_dirblock(bh, dir, name, namelen,
2057 block << sb->s_blocksize_bits,
2058 res_dir);
2059 if (i == 1) {
2060 OCFS2_I(dir)->ip_dir_start_lookup = block;
2061 ret = bh;
2062 goto cleanup_and_exit;
2063 } else {
2064 brelse(bh);
2065 if (i < 0)
2066 goto cleanup_and_exit;
2067 }
2068 next:
2069 if (++block >= nblocks)
2070 block = 0;
2071 } while (block != start);
2072
2073 /*
2074 * If the directory has grown while we were searching, then
2075 * search the last part of the directory before giving up.
2076 */
2077 block = nblocks;
2078 nblocks = i_size_read(dir) >> sb->s_blocksize_bits;
2079 if (block < nblocks) {
2080 start = 0;
2081 goto restart;
2082 }
2083
2084cleanup_and_exit:
2085 /* Clean up the read-ahead blocks */
2086 for (; ra_ptr < ra_max; ra_ptr++)
2087 brelse(bh_use[ra_ptr]);
2088
2089 mlog_exit_ptr(ret);
2090 return ret;
2091}
2092
2093static int ocfs2_blkno_stringify(u64 blkno, char *name) 1673static int ocfs2_blkno_stringify(u64 blkno, char *name)
2094{ 1674{
2095 int status, namelen; 1675 int status, namelen;
diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h
index 0975c7b7212b..688aef64c879 100644
--- a/fs/ocfs2/namei.h
+++ b/fs/ocfs2/namei.h
@@ -30,29 +30,10 @@ extern const struct inode_operations ocfs2_dir_iops;
30 30
31struct dentry *ocfs2_get_parent(struct dentry *child); 31struct dentry *ocfs2_get_parent(struct dentry *child);
32 32
33int ocfs2_check_dir_entry (struct inode *dir,
34 struct ocfs2_dir_entry *de,
35 struct buffer_head *bh,
36 unsigned long offset);
37struct buffer_head *ocfs2_find_entry(const char *name,
38 int namelen,
39 struct inode *dir,
40 struct ocfs2_dir_entry **res_dir);
41int ocfs2_orphan_del(struct ocfs2_super *osb, 33int ocfs2_orphan_del(struct ocfs2_super *osb,
42 handle_t *handle, 34 handle_t *handle,
43 struct inode *orphan_dir_inode, 35 struct inode *orphan_dir_inode,
44 struct inode *inode, 36 struct inode *inode,
45 struct buffer_head *orphan_dir_bh); 37 struct buffer_head *orphan_dir_bh);
46 38
47static inline int ocfs2_match(int len,
48 const char * const name,
49 struct ocfs2_dir_entry *de)
50{
51 if (len != de->name_len)
52 return 0;
53 if (!de->inode)
54 return 0;
55 return !memcmp(name, de->name, len);
56}
57
58#endif /* OCFS2_NAMEI_H */ 39#endif /* OCFS2_NAMEI_H */
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 58307853fb4a..60a23e1906b0 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -319,6 +319,13 @@ static inline int ocfs2_writes_unwritten_extents(struct ocfs2_super *osb)
319 return 0; 319 return 0;
320} 320}
321 321
322static inline int ocfs2_supports_inline_data(struct ocfs2_super *osb)
323{
324 if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_INLINE_DATA)
325 return 1;
326 return 0;
327}
328
322/* set / clear functions because cluster events can make these happen 329/* set / clear functions because cluster events can make these happen
323 * in parallel so we want the transitions to be atomic. this also 330 * in parallel so we want the transitions to be atomic. this also
324 * means that any future flags osb_flags must be protected by spinlock 331 * means that any future flags osb_flags must be protected by spinlock
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 82f8a75b207e..6ef876759a73 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -87,7 +87,8 @@
87 87
88#define OCFS2_FEATURE_COMPAT_SUPP OCFS2_FEATURE_COMPAT_BACKUP_SB 88#define OCFS2_FEATURE_COMPAT_SUPP OCFS2_FEATURE_COMPAT_BACKUP_SB
89#define OCFS2_FEATURE_INCOMPAT_SUPP (OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT \ 89#define OCFS2_FEATURE_INCOMPAT_SUPP (OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT \
90 | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC) 90 | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC \
91 | OCFS2_FEATURE_INCOMPAT_INLINE_DATA)
91#define OCFS2_FEATURE_RO_COMPAT_SUPP OCFS2_FEATURE_RO_COMPAT_UNWRITTEN 92#define OCFS2_FEATURE_RO_COMPAT_SUPP OCFS2_FEATURE_RO_COMPAT_UNWRITTEN
92 93
93/* 94/*
@@ -111,6 +112,20 @@
111#define OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC 0x0010 112#define OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC 0x0010
112 113
113/* 114/*
115 * Tunefs sets this incompat flag before starting an operation which
116 * would require cleanup on abort. This is done to protect users from
117 * inadvertently mounting the fs after an aborted run without
118 * fsck-ing.
119 *
120 * s_tunefs_flags on the super block describes precisely which
121 * operations were in progress.
122 */
123#define OCFS2_FEATURE_INCOMPAT_TUNEFS_INPROG 0x0020
124
125/* Support for data packed into inode blocks */
126#define OCFS2_FEATURE_INCOMPAT_INLINE_DATA 0x0040
127
128/*
114 * backup superblock flag is used to indicate that this volume 129 * backup superblock flag is used to indicate that this volume
115 * has backup superblocks. 130 * has backup superblocks.
116 */ 131 */
@@ -130,6 +145,11 @@
130#define OCFS2_MAX_BACKUP_SUPERBLOCKS 6 145#define OCFS2_MAX_BACKUP_SUPERBLOCKS 6
131 146
132/* 147/*
148 * Flags on ocfs2_super_block.s_tunefs_flags
149 */
150#define OCFS2_TUNEFS_INPROG_REMOVE_SLOT 0x0001 /* Removing slots */
151
152/*
133 * Flags on ocfs2_dinode.i_flags 153 * Flags on ocfs2_dinode.i_flags
134 */ 154 */
135#define OCFS2_VALID_FL (0x00000001) /* Inode is valid */ 155#define OCFS2_VALID_FL (0x00000001) /* Inode is valid */
@@ -146,6 +166,17 @@
146#define OCFS2_CHAIN_FL (0x00000400) /* Chain allocator */ 166#define OCFS2_CHAIN_FL (0x00000400) /* Chain allocator */
147#define OCFS2_DEALLOC_FL (0x00000800) /* Truncate log */ 167#define OCFS2_DEALLOC_FL (0x00000800) /* Truncate log */
148 168
169/*
170 * Flags on ocfs2_dinode.i_dyn_features
171 *
172 * These can change much more often than i_flags. When adding flags,
173 * keep in mind that i_dyn_features is only 16 bits wide.
174 */
175#define OCFS2_INLINE_DATA_FL (0x0001) /* Data stored in inode block */
176#define OCFS2_HAS_XATTR_FL (0x0002)
177#define OCFS2_INLINE_XATTR_FL (0x0004)
178#define OCFS2_INDEXED_DIR_FL (0x0008)
179
149/* Inode attributes, keep in sync with EXT2 */ 180/* Inode attributes, keep in sync with EXT2 */
150#define OCFS2_SECRM_FL (0x00000001) /* Secure deletion */ 181#define OCFS2_SECRM_FL (0x00000001) /* Secure deletion */
151#define OCFS2_UNRM_FL (0x00000002) /* Undelete */ 182#define OCFS2_UNRM_FL (0x00000002) /* Undelete */
@@ -447,8 +478,8 @@ struct ocfs2_super_block {
447 __le32 s_clustersize_bits; /* Clustersize for this fs */ 478 __le32 s_clustersize_bits; /* Clustersize for this fs */
448/*40*/ __le16 s_max_slots; /* Max number of simultaneous mounts 479/*40*/ __le16 s_max_slots; /* Max number of simultaneous mounts
449 before tunefs required */ 480 before tunefs required */
450 __le16 s_reserved1; 481 __le16 s_tunefs_flag;
451 __le32 s_reserved2; 482 __le32 s_reserved1;
452 __le64 s_first_cluster_group; /* Block offset of 1st cluster 483 __le64 s_first_cluster_group; /* Block offset of 1st cluster
453 * group header */ 484 * group header */
454/*50*/ __u8 s_label[OCFS2_MAX_VOL_LABEL_LEN]; /* Label for mounting, etc. */ 485/*50*/ __u8 s_label[OCFS2_MAX_VOL_LABEL_LEN]; /* Label for mounting, etc. */
@@ -471,6 +502,19 @@ struct ocfs2_local_alloc
471}; 502};
472 503
473/* 504/*
505 * Data-in-inode header. This is only used if i_dyn_features has
506 * OCFS2_INLINE_DATA_FL set.
507 */
508struct ocfs2_inline_data
509{
510/*00*/ __le16 id_count; /* Number of bytes that can be used
511 * for data, starting at id_data */
512 __le16 id_reserved0;
513 __le32 id_reserved1;
514 __u8 id_data[0]; /* Start of user data */
515};
516
517/*
474 * On disk inode for OCFS2 518 * On disk inode for OCFS2
475 */ 519 */
476struct ocfs2_dinode { 520struct ocfs2_dinode {
@@ -502,7 +546,7 @@ struct ocfs2_dinode {
502 __le32 i_attr; 546 __le32 i_attr;
503 __le16 i_orphaned_slot; /* Only valid when OCFS2_ORPHANED_FL 547 __le16 i_orphaned_slot; /* Only valid when OCFS2_ORPHANED_FL
504 was set in i_flags */ 548 was set in i_flags */
505 __le16 i_reserved1; 549 __le16 i_dyn_features;
506/*70*/ __le64 i_reserved2[8]; 550/*70*/ __le64 i_reserved2[8];
507/*B8*/ union { 551/*B8*/ union {
508 __le64 i_pad1; /* Generic way to refer to this 552 __le64 i_pad1; /* Generic way to refer to this
@@ -528,6 +572,7 @@ struct ocfs2_dinode {
528 struct ocfs2_chain_list i_chain; 572 struct ocfs2_chain_list i_chain;
529 struct ocfs2_extent_list i_list; 573 struct ocfs2_extent_list i_list;
530 struct ocfs2_truncate_log i_dealloc; 574 struct ocfs2_truncate_log i_dealloc;
575 struct ocfs2_inline_data i_data;
531 __u8 i_symlink[0]; 576 __u8 i_symlink[0];
532 } id2; 577 } id2;
533/* Actual on-disk size is one block */ 578/* Actual on-disk size is one block */
@@ -577,6 +622,12 @@ static inline int ocfs2_fast_symlink_chars(struct super_block *sb)
577 offsetof(struct ocfs2_dinode, id2.i_symlink); 622 offsetof(struct ocfs2_dinode, id2.i_symlink);
578} 623}
579 624
625static inline int ocfs2_max_inline_data(struct super_block *sb)
626{
627 return sb->s_blocksize -
628 offsetof(struct ocfs2_dinode, id2.i_data.id_data);
629}
630
580static inline int ocfs2_extent_recs_per_inode(struct super_block *sb) 631static inline int ocfs2_extent_recs_per_inode(struct super_block *sb)
581{ 632{
582 int size; 633 int size;
@@ -656,6 +707,11 @@ static inline int ocfs2_fast_symlink_chars(int blocksize)
656 return blocksize - offsetof(struct ocfs2_dinode, id2.i_symlink); 707 return blocksize - offsetof(struct ocfs2_dinode, id2.i_symlink);
657} 708}
658 709
710static inline int ocfs2_max_inline_data(int blocksize)
711{
712 return blocksize - offsetof(struct ocfs2_dinode, id2.i_data.id_data);
713}
714
659static inline int ocfs2_extent_recs_per_inode(int blocksize) 715static inline int ocfs2_extent_recs_per_inode(int blocksize)
660{ 716{
661 int size; 717 int size;
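
A standalone sketch (not part of the patch) of how the new inline-data capacity is derived: everything left in the inode block after the fixed header and the small id_count/reserved fields is available for file contents, which is exactly what ocfs2_max_inline_data() computes with offsetof(). The struct below and the 0xB8 header size are illustrative stand-ins, not the real ocfs2_dinode layout.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* simplified stand-in for the on-disk inode; only the shape matters */
struct toy_dinode {
	uint8_t  i_fixed[0xB8];	/* pretend fixed-size inode fields */
	uint16_t id_count;	/* usable bytes starting at id_data */
	uint16_t id_reserved0;
	uint32_t id_reserved1;
	uint8_t  id_data[];	/* inline file contents start here */
};

static int toy_max_inline_data(int blocksize)
{
	/* whatever is left of the inode block after the header */
	return blocksize - (int)offsetof(struct toy_dinode, id_data);
}

int main(void)
{
	printf("4096-byte inode blocks leave %d inline bytes\n",
	       toy_max_inline_data(4096));
	return 0;
}

Files no larger than this fit entirely inside the inode block, so no extent tree or data cluster is needed for them.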
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index c034b5129c1e..0e2a1b45bf92 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -39,6 +39,7 @@
39#include <linux/parser.h> 39#include <linux/parser.h>
40#include <linux/crc32.h> 40#include <linux/crc32.h>
41#include <linux/debugfs.h> 41#include <linux/debugfs.h>
42#include <linux/mount.h>
42 43
43#include <cluster/nodemanager.h> 44#include <cluster/nodemanager.h>
44 45
@@ -91,6 +92,7 @@ struct mount_options
91static int ocfs2_parse_options(struct super_block *sb, char *options, 92static int ocfs2_parse_options(struct super_block *sb, char *options,
92 struct mount_options *mopt, 93 struct mount_options *mopt,
93 int is_remount); 94 int is_remount);
95static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt);
94static void ocfs2_put_super(struct super_block *sb); 96static void ocfs2_put_super(struct super_block *sb);
95static int ocfs2_mount_volume(struct super_block *sb); 97static int ocfs2_mount_volume(struct super_block *sb);
96static int ocfs2_remount(struct super_block *sb, int *flags, char *data); 98static int ocfs2_remount(struct super_block *sb, int *flags, char *data);
@@ -105,7 +107,7 @@ static int ocfs2_sync_fs(struct super_block *sb, int wait);
105 107
106static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb); 108static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb);
107static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb); 109static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb);
108static int ocfs2_release_system_inodes(struct ocfs2_super *osb); 110static void ocfs2_release_system_inodes(struct ocfs2_super *osb);
109static int ocfs2_fill_local_node_info(struct ocfs2_super *osb); 111static int ocfs2_fill_local_node_info(struct ocfs2_super *osb);
110static int ocfs2_check_volume(struct ocfs2_super *osb); 112static int ocfs2_check_volume(struct ocfs2_super *osb);
111static int ocfs2_verify_volume(struct ocfs2_dinode *di, 113static int ocfs2_verify_volume(struct ocfs2_dinode *di,
@@ -133,6 +135,7 @@ static const struct super_operations ocfs2_sops = {
133 .write_super = ocfs2_write_super, 135 .write_super = ocfs2_write_super,
134 .put_super = ocfs2_put_super, 136 .put_super = ocfs2_put_super,
135 .remount_fs = ocfs2_remount, 137 .remount_fs = ocfs2_remount,
138 .show_options = ocfs2_show_options,
136}; 139};
137 140
138enum { 141enum {
@@ -177,7 +180,7 @@ static void ocfs2_write_super(struct super_block *sb)
177 180
178static int ocfs2_sync_fs(struct super_block *sb, int wait) 181static int ocfs2_sync_fs(struct super_block *sb, int wait)
179{ 182{
180 int status = 0; 183 int status;
181 tid_t target; 184 tid_t target;
182 struct ocfs2_super *osb = OCFS2_SB(sb); 185 struct ocfs2_super *osb = OCFS2_SB(sb);
183 186
@@ -275,9 +278,9 @@ bail:
275 return status; 278 return status;
276} 279}
277 280
278static int ocfs2_release_system_inodes(struct ocfs2_super *osb) 281static void ocfs2_release_system_inodes(struct ocfs2_super *osb)
279{ 282{
280 int status = 0, i; 283 int i;
281 struct inode *inode; 284 struct inode *inode;
282 285
283 mlog_entry_void(); 286 mlog_entry_void();
@@ -302,8 +305,7 @@ static int ocfs2_release_system_inodes(struct ocfs2_super *osb)
302 osb->root_inode = NULL; 305 osb->root_inode = NULL;
303 } 306 }
304 307
305 mlog_exit(status); 308 mlog_exit(0);
306 return status;
307} 309}
308 310
309/* We're allocating fs objects, use GFP_NOFS */ 311/* We're allocating fs objects, use GFP_NOFS */
@@ -453,7 +455,7 @@ static int ocfs2_sb_probe(struct super_block *sb,
453 struct buffer_head **bh, 455 struct buffer_head **bh,
454 int *sector_size) 456 int *sector_size)
455{ 457{
456 int status = 0, tmpstat; 458 int status, tmpstat;
457 struct ocfs1_vol_disk_hdr *hdr; 459 struct ocfs1_vol_disk_hdr *hdr;
458 struct ocfs2_dinode *di; 460 struct ocfs2_dinode *di;
459 int blksize; 461 int blksize;
@@ -830,6 +832,41 @@ bail:
830 return status; 832 return status;
831} 833}
832 834
835static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
836{
837 struct ocfs2_super *osb = OCFS2_SB(mnt->mnt_sb);
838 unsigned long opts = osb->s_mount_opt;
839
840 if (opts & OCFS2_MOUNT_HB_LOCAL)
841 seq_printf(s, ",_netdev,heartbeat=local");
842 else
843 seq_printf(s, ",heartbeat=none");
844
845 if (opts & OCFS2_MOUNT_NOINTR)
846 seq_printf(s, ",nointr");
847
848 if (opts & OCFS2_MOUNT_DATA_WRITEBACK)
849 seq_printf(s, ",data=writeback");
850 else
851 seq_printf(s, ",data=ordered");
852
853 if (opts & OCFS2_MOUNT_BARRIER)
854 seq_printf(s, ",barrier=1");
855
856 if (opts & OCFS2_MOUNT_ERRORS_PANIC)
857 seq_printf(s, ",errors=panic");
858 else
859 seq_printf(s, ",errors=remount-ro");
860
861 if (osb->preferred_slot != OCFS2_INVALID_SLOT)
862 seq_printf(s, ",preferred_slot=%d", osb->preferred_slot);
863
864 if (osb->s_atime_quantum != OCFS2_DEFAULT_ATIME_QUANTUM)
865 seq_printf(s, ",atime_quantum=%u", osb->s_atime_quantum);
866
867 return 0;
868}
869
833static int __init ocfs2_init(void) 870static int __init ocfs2_init(void)
834{ 871{
835 int status; 872 int status;
@@ -1209,12 +1246,13 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
1209 tmp = ocfs2_request_umount_vote(osb); 1246 tmp = ocfs2_request_umount_vote(osb);
1210 if (tmp < 0) 1247 if (tmp < 0)
1211 mlog_errno(tmp); 1248 mlog_errno(tmp);
1249 }
1212 1250
1213 if (osb->slot_num != OCFS2_INVALID_SLOT) 1251 if (osb->slot_num != OCFS2_INVALID_SLOT)
1214 ocfs2_put_slot(osb); 1252 ocfs2_put_slot(osb);
1215 1253
1254 if (osb->dlm)
1216 ocfs2_super_unlock(osb, 1); 1255 ocfs2_super_unlock(osb, 1);
1217 }
1218 1256
1219 ocfs2_release_system_inodes(osb); 1257 ocfs2_release_system_inodes(osb);
1220 1258
@@ -1275,7 +1313,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
1275 struct buffer_head *bh, 1313 struct buffer_head *bh,
1276 int sector_size) 1314 int sector_size)
1277{ 1315{
1278 int status = 0; 1316 int status;
1279 int i, cbits, bbits; 1317 int i, cbits, bbits;
1280 struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; 1318 struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
1281 struct inode *inode = NULL; 1319 struct inode *inode = NULL;
@@ -1596,7 +1634,7 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di,
1596 1634
1597static int ocfs2_check_volume(struct ocfs2_super *osb) 1635static int ocfs2_check_volume(struct ocfs2_super *osb)
1598{ 1636{
1599 int status = 0; 1637 int status;
1600 int dirty; 1638 int dirty;
1601 int local; 1639 int local;
1602 struct ocfs2_dinode *local_alloc = NULL; /* only used if we 1640 struct ocfs2_dinode *local_alloc = NULL; /* only used if we
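
A standalone sketch of the pattern ocfs2_show_options() above follows: walk the mount-option bitmask and append ",name[=value]" tokens so the filesystem's options appear in /proc/mounts. The flag values and names here are illustrative, not the real OCFS2 definitions.

#include <stdio.h>
#include <string.h>

#define OPT_NOINTR	0x01	/* illustrative flag bits */
#define OPT_WRITEBACK	0x02
#define OPT_BARRIER	0x04

static void show_options(char *buf, size_t len, unsigned long opts)
{
	buf[0] = '\0';
	if (opts & OPT_NOINTR)
		strncat(buf, ",nointr", len - strlen(buf) - 1);
	/* options with a default still get printed, like data=ordered above */
	strncat(buf, (opts & OPT_WRITEBACK) ? ",data=writeback"
					    : ",data=ordered",
		len - strlen(buf) - 1);
	if (opts & OPT_BARRIER)
		strncat(buf, ",barrier=1", len - strlen(buf) - 1);
}

int main(void)
{
	char buf[128];

	show_options(buf, sizeof(buf), OPT_NOINTR | OPT_BARRIER);
	printf("%s\n", buf);	/* ,nointr,data=ordered,barrier=1 */
	return 0;
}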
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c
index 5df6e35d09b1..fd2e846e3e6f 100644
--- a/fs/ocfs2/sysfile.c
+++ b/fs/ocfs2/sysfile.c
@@ -100,17 +100,14 @@ static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb,
100 char namebuf[40]; 100 char namebuf[40];
101 struct inode *inode = NULL; 101 struct inode *inode = NULL;
102 u64 blkno; 102 u64 blkno;
103 struct buffer_head *dirent_bh = NULL;
104 struct ocfs2_dir_entry *de = NULL;
105 int status = 0; 103 int status = 0;
106 104
107 ocfs2_sprintf_system_inode_name(namebuf, 105 ocfs2_sprintf_system_inode_name(namebuf,
108 sizeof(namebuf), 106 sizeof(namebuf),
109 type, slot); 107 type, slot);
110 108
111 status = ocfs2_find_files_on_disk(namebuf, strlen(namebuf), 109 status = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf,
112 &blkno, osb->sys_root_inode, 110 strlen(namebuf), &blkno);
113 &dirent_bh, &de);
114 if (status < 0) { 111 if (status < 0) {
115 goto bail; 112 goto bail;
116 } 113 }
@@ -122,8 +119,7 @@ static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb,
122 goto bail; 119 goto bail;
123 } 120 }
124bail: 121bail:
125 if (dirent_bh) 122
126 brelse(dirent_bh);
127 return inode; 123 return inode;
128} 124}
129 125
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 783c57ec07d3..722e12e5acc7 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -381,10 +381,12 @@ void add_partition(struct gendisk *disk, int part, sector_t start, sector_t len,
381 p->partno = part; 381 p->partno = part;
382 p->policy = disk->policy; 382 p->policy = disk->policy;
383 383
384 if (isdigit(disk->kobj.name[strlen(disk->kobj.name)-1])) 384 if (isdigit(disk->kobj.k_name[strlen(disk->kobj.k_name)-1]))
385 snprintf(p->kobj.name,KOBJ_NAME_LEN,"%sp%d",disk->kobj.name,part); 385 kobject_set_name(&p->kobj, "%sp%d",
386 kobject_name(&disk->kobj), part);
386 else 387 else
387 snprintf(p->kobj.name,KOBJ_NAME_LEN,"%s%d",disk->kobj.name,part); 388 kobject_set_name(&p->kobj, "%s%d",
389 kobject_name(&disk->kobj),part);
388 p->kobj.parent = &disk->kobj; 390 p->kobj.parent = &disk->kobj;
389 p->kobj.ktype = &ktype_part; 391 p->kobj.ktype = &ktype_part;
390 kobject_init(&p->kobj); 392 kobject_init(&p->kobj);
@@ -477,9 +479,9 @@ void register_disk(struct gendisk *disk)
477 struct hd_struct *p; 479 struct hd_struct *p;
478 int err; 480 int err;
479 481
480 strlcpy(disk->kobj.name,disk->disk_name,KOBJ_NAME_LEN); 482 kobject_set_name(&disk->kobj, "%s", disk->disk_name);
481 /* ewww... some of these buggers have / in name... */ 483 /* ewww... some of these buggers have / in name... */
482 s = strchr(disk->kobj.name, '/'); 484 s = strchr(disk->kobj.k_name, '/');
483 if (s) 485 if (s)
484 *s = '!'; 486 *s = '!';
485 if ((err = kobject_add(&disk->kobj))) 487 if ((err = kobject_add(&disk->kobj)))
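
A standalone sketch of the partition-naming rule preserved by the kobject_set_name() conversion above: when the parent disk name already ends in a digit, a 'p' separator is inserted before the partition number so the result is unambiguous. The disk names used here are only examples.

#include <ctype.h>
#include <stdio.h>
#include <string.h>

static void format_part_name(char *buf, size_t len,
			     const char *disk, int part)
{
	/* "sda" + 1 -> "sda1", but "mmcblk0" + 1 -> "mmcblk0p1" */
	if (isdigit((unsigned char)disk[strlen(disk) - 1]))
		snprintf(buf, len, "%sp%d", disk, part);
	else
		snprintf(buf, len, "%s%d", disk, part);
}

int main(void)
{
	char name[32];

	format_part_name(name, sizeof(name), "sda", 1);
	printf("%s\n", name);		/* sda1 */
	format_part_name(name, sizeof(name), "mmcblk0", 1);
	printf("%s\n", name);		/* mmcblk0p1 */
	return 0;
}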
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index bce38e3f06cb..ebaba0213546 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -11,6 +11,7 @@ proc-y += inode.o root.o base.o generic.o array.o \
11 proc_tty.o proc_misc.o 11 proc_tty.o proc_misc.o
12 12
13proc-$(CONFIG_PROC_SYSCTL) += proc_sysctl.o 13proc-$(CONFIG_PROC_SYSCTL) += proc_sysctl.o
14proc-$(CONFIG_NET) += proc_net.o
14proc-$(CONFIG_PROC_KCORE) += kcore.o 15proc-$(CONFIG_PROC_KCORE) += kcore.o
15proc-$(CONFIG_PROC_VMCORE) += vmcore.o 16proc-$(CONFIG_PROC_VMCORE) += vmcore.o
16proc-$(CONFIG_PROC_DEVICETREE) += proc_devtree.o 17proc-$(CONFIG_PROC_DEVICETREE) += proc_devtree.o
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index b215c3524fa6..1820eb2ef762 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -16,6 +16,11 @@ extern int proc_sys_init(void);
16#else 16#else
17static inline void proc_sys_init(void) { } 17static inline void proc_sys_init(void) { }
18#endif 18#endif
19#ifdef CONFIG_NET
20extern int proc_net_init(void);
21#else
22static inline int proc_net_init(void) { return 0; }
23#endif
19 24
20struct vmalloc_info { 25struct vmalloc_info {
21 unsigned long used; 26 unsigned long used;
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
new file mode 100644
index 000000000000..2e91fb756e9a
--- /dev/null
+++ b/fs/proc/proc_net.c
@@ -0,0 +1,200 @@
1/*
2 * linux/fs/proc/net.c
3 *
4 * Copyright (C) 2007
5 *
6 * Author: Eric Biederman <ebiederm@xmission.com>
7 *
8 * proc net directory handling functions
9 */
10
11#include <asm/uaccess.h>
12
13#include <linux/errno.h>
14#include <linux/time.h>
15#include <linux/proc_fs.h>
16#include <linux/stat.h>
17#include <linux/init.h>
18#include <linux/sched.h>
19#include <linux/module.h>
20#include <linux/bitops.h>
21#include <linux/smp_lock.h>
22#include <linux/mount.h>
23#include <linux/nsproxy.h>
24#include <net/net_namespace.h>
25
26#include "internal.h"
27
28
29struct proc_dir_entry *proc_net_create(struct net *net,
30 const char *name, mode_t mode, get_info_t *get_info)
31{
32 return create_proc_info_entry(name,mode, net->proc_net, get_info);
33}
34EXPORT_SYMBOL_GPL(proc_net_create);
35
36struct proc_dir_entry *proc_net_fops_create(struct net *net,
37 const char *name, mode_t mode, const struct file_operations *fops)
38{
39 struct proc_dir_entry *res;
40
41 res = create_proc_entry(name, mode, net->proc_net);
42 if (res)
43 res->proc_fops = fops;
44 return res;
45}
46EXPORT_SYMBOL_GPL(proc_net_fops_create);
47
48void proc_net_remove(struct net *net, const char *name)
49{
50 remove_proc_entry(name, net->proc_net);
51}
52EXPORT_SYMBOL_GPL(proc_net_remove);
53
54struct net *get_proc_net(const struct inode *inode)
55{
56 return maybe_get_net(PDE_NET(PDE(inode)));
57}
58EXPORT_SYMBOL_GPL(get_proc_net);
59
60static struct proc_dir_entry *proc_net_shadow;
61
62static struct dentry *proc_net_shadow_dentry(struct dentry *parent,
63 struct proc_dir_entry *de)
64{
65 struct dentry *shadow = NULL;
66 struct inode *inode;
67 if (!de)
68 goto out;
69 de_get(de);
70 inode = proc_get_inode(parent->d_inode->i_sb, de->low_ino, de);
71 if (!inode)
72 goto out_de_put;
73 shadow = d_alloc_name(parent, de->name);
74 if (!shadow)
75 goto out_iput;
76 shadow->d_op = parent->d_op; /* proc_dentry_operations */
77 d_instantiate(shadow, inode);
78out:
79 return shadow;
80out_iput:
81 iput(inode);
82out_de_put:
83 de_put(de);
84 goto out;
85}
86
87static void *proc_net_follow_link(struct dentry *parent, struct nameidata *nd)
88{
89 struct net *net = current->nsproxy->net_ns;
90 struct dentry *shadow;
91 shadow = proc_net_shadow_dentry(parent, net->proc_net);
92 if (!shadow)
93 return ERR_PTR(-ENOENT);
94
95 dput(nd->dentry);
96 /* My dentry count is 1 and that should be enough as the
97 * shadow dentry is thrown away immediately.
98 */
99 nd->dentry = shadow;
100 return NULL;
101}
102
103static struct dentry *proc_net_lookup(struct inode *dir, struct dentry *dentry,
104 struct nameidata *nd)
105{
106 struct net *net = current->nsproxy->net_ns;
107 struct dentry *shadow;
108
109 shadow = proc_net_shadow_dentry(nd->dentry, net->proc_net);
110 if (!shadow)
111 return ERR_PTR(-ENOENT);
112
113 dput(nd->dentry);
114 nd->dentry = shadow;
115
116 return shadow->d_inode->i_op->lookup(shadow->d_inode, dentry, nd);
117}
118
119static int proc_net_setattr(struct dentry *dentry, struct iattr *iattr)
120{
121 struct net *net = current->nsproxy->net_ns;
122 struct dentry *shadow;
123 int ret;
124
125 shadow = proc_net_shadow_dentry(dentry->d_parent, net->proc_net);
126 if (!shadow)
127 return -ENOENT;
128 ret = shadow->d_inode->i_op->setattr(shadow, iattr);
129 dput(shadow);
130 return ret;
131}
132
133static const struct file_operations proc_net_dir_operations = {
134 .read = generic_read_dir,
135};
136
137static struct inode_operations proc_net_dir_inode_operations = {
138 .follow_link = proc_net_follow_link,
139 .lookup = proc_net_lookup,
140 .setattr = proc_net_setattr,
141};
142
143static __net_init int proc_net_ns_init(struct net *net)
144{
145 struct proc_dir_entry *root, *netd, *net_statd;
146 int err;
147
148 err = -ENOMEM;
149 root = kzalloc(sizeof(*root), GFP_KERNEL);
150 if (!root)
151 goto out;
152
153 err = -EEXIST;
154 netd = proc_mkdir("net", root);
155 if (!netd)
156 goto free_root;
157
158 err = -EEXIST;
159 net_statd = proc_mkdir("stat", netd);
160 if (!net_statd)
161 goto free_net;
162
163 root->data = net;
164 netd->data = net;
165 net_statd->data = net;
166
167 net->proc_net_root = root;
168 net->proc_net = netd;
169 net->proc_net_stat = net_statd;
170 err = 0;
171
172out:
173 return err;
174free_net:
175 remove_proc_entry("net", root);
176free_root:
177 kfree(root);
178 goto out;
179}
180
181static __net_exit void proc_net_ns_exit(struct net *net)
182{
183 remove_proc_entry("stat", net->proc_net);
184 remove_proc_entry("net", net->proc_net_root);
185 kfree(net->proc_net_root);
186}
187
188struct pernet_operations __net_initdata proc_net_ns_ops = {
189 .init = proc_net_ns_init,
190 .exit = proc_net_ns_exit,
191};
192
193int __init proc_net_init(void)
194{
195 proc_net_shadow = proc_mkdir("net", NULL);
196 proc_net_shadow->proc_iops = &proc_net_dir_inode_operations;
197 proc_net_shadow->proc_fops = &proc_net_dir_operations;
198
199 return register_pernet_subsys(&proc_net_ns_ops);
200}
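
A standalone model (not the kernel API) of the pernet_operations pattern the new proc_net.c registers with register_pernet_subsys(): each subsystem's ->init() runs for every network namespace and ->exit() runs when a namespace is torn down, so per-namespace state such as the /proc/net tree is created and destroyed symmetrically. All names below are toy stand-ins.

#include <stdio.h>

struct toy_net {
	const char *name;
	int proc_net_ready;	/* stands in for net->proc_net */
};

struct toy_pernet_operations {
	int  (*init)(struct toy_net *net);
	void (*exit)(struct toy_net *net);
};

static int toy_proc_net_init(struct toy_net *net)
{
	net->proc_net_ready = 1;
	printf("created /proc/net for %s\n", net->name);
	return 0;
}

static void toy_proc_net_exit(struct toy_net *net)
{
	net->proc_net_ready = 0;
	printf("removed /proc/net for %s\n", net->name);
}

static const struct toy_pernet_operations toy_proc_net_ops = {
	.init = toy_proc_net_init,
	.exit = toy_proc_net_exit,
};

int main(void)
{
	struct toy_net init_net = { .name = "init_net" };
	struct toy_net other    = { .name = "netns-1" };

	/* registration: run init for every existing namespace */
	toy_proc_net_ops.init(&init_net);
	toy_proc_net_ops.init(&other);

	/* namespace teardown: run exit */
	toy_proc_net_ops.exit(&other);
	toy_proc_net_ops.exit(&init_net);
	return 0;
}

The shadow-dentry lookup in the real code then makes the single /proc/net mount point resolve to the proc_net tree of the opening task's namespace.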
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 41f17037f738..cf3046638b09 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -21,7 +21,7 @@
21 21
22#include "internal.h" 22#include "internal.h"
23 23
24struct proc_dir_entry *proc_net, *proc_net_stat, *proc_bus, *proc_root_fs, *proc_root_driver; 24struct proc_dir_entry *proc_bus, *proc_root_fs, *proc_root_driver;
25 25
26static int proc_get_sb(struct file_system_type *fs_type, 26static int proc_get_sb(struct file_system_type *fs_type,
27 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 27 int flags, const char *dev_name, void *data, struct vfsmount *mnt)
@@ -61,8 +61,8 @@ void __init proc_root_init(void)
61 return; 61 return;
62 } 62 }
63 proc_misc_init(); 63 proc_misc_init();
64 proc_net = proc_mkdir("net", NULL); 64
65 proc_net_stat = proc_mkdir("net/stat", NULL); 65 proc_net_init();
66 66
67#ifdef CONFIG_SYSVIPC 67#ifdef CONFIG_SYSVIPC
68 proc_mkdir("sysvipc", NULL); 68 proc_mkdir("sysvipc", NULL);
@@ -159,7 +159,5 @@ EXPORT_SYMBOL(create_proc_entry);
159EXPORT_SYMBOL(remove_proc_entry); 159EXPORT_SYMBOL(remove_proc_entry);
160EXPORT_SYMBOL(proc_root); 160EXPORT_SYMBOL(proc_root);
161EXPORT_SYMBOL(proc_root_fs); 161EXPORT_SYMBOL(proc_root_fs);
162EXPORT_SYMBOL(proc_net);
163EXPORT_SYMBOL(proc_net_stat);
164EXPORT_SYMBOL(proc_bus); 162EXPORT_SYMBOL(proc_bus);
165EXPORT_SYMBOL(proc_root_driver); 163EXPORT_SYMBOL(proc_root_driver);
diff --git a/fs/seq_file.c b/fs/seq_file.c
index bbb19be260ce..ca71c115bdaa 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -429,6 +429,39 @@ int seq_release_private(struct inode *inode, struct file *file)
429} 429}
430EXPORT_SYMBOL(seq_release_private); 430EXPORT_SYMBOL(seq_release_private);
431 431
432void *__seq_open_private(struct file *f, const struct seq_operations *ops,
433 int psize)
434{
435 int rc;
436 void *private;
437 struct seq_file *seq;
438
439 private = kzalloc(psize, GFP_KERNEL);
440 if (private == NULL)
441 goto out;
442
443 rc = seq_open(f, ops);
444 if (rc < 0)
445 goto out_free;
446
447 seq = f->private_data;
448 seq->private = private;
449 return private;
450
451out_free:
452 kfree(private);
453out:
454 return NULL;
455}
456EXPORT_SYMBOL(__seq_open_private);
457
458int seq_open_private(struct file *filp, const struct seq_operations *ops,
459 int psize)
460{
461 return __seq_open_private(filp, ops, psize) ? 0 : -ENOMEM;
462}
463EXPORT_SYMBOL(seq_open_private);
464
432int seq_putc(struct seq_file *m, char c) 465int seq_putc(struct seq_file *m, char c)
433{ 466{
434 if (m->count < m->size) { 467 if (m->count < m->size) {
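
A hedged usage sketch of the seq_open_private() helper added above; the caller, foo_open(), struct foo_iter and foo_seq_ops are made-up names for illustration, not part of the patch. The helper replaces the common kzalloc + seq_open + free-on-error boilerplate with one call, and seq_release_private() frees the per-open state on close.

#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/slab.h>

struct foo_iter {
	loff_t pos;		/* per-open iteration state */
};

/* ->start/next/stop/show callbacks elided for brevity */
static const struct seq_operations foo_seq_ops;

static int foo_open(struct inode *inode, struct file *file)
{
	/* allocates a zeroed struct foo_iter, opens the seq_file and
	 * stores the allocation in seq_file->private; on failure the
	 * allocation is freed and -ENOMEM is returned. */
	return seq_open_private(file, &foo_seq_ops, sizeof(struct foo_iter));
}

static const struct file_operations foo_fops = {
	.open    = foo_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_private,	/* also frees seq_file->private */
};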
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index 5afe2a26f5d8..006fc64227dd 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -1,9 +1,15 @@
1/* 1/*
2 * bin.c - binary file operations for sysfs. 2 * fs/sysfs/bin.c - sysfs binary file implementation
3 * 3 *
4 * Copyright (c) 2003 Patrick Mochel 4 * Copyright (c) 2003 Patrick Mochel
5 * Copyright (c) 2003 Matthew Wilcox 5 * Copyright (c) 2003 Matthew Wilcox
6 * Copyright (c) 2004 Silicon Graphics, Inc. 6 * Copyright (c) 2004 Silicon Graphics, Inc.
7 * Copyright (c) 2007 SUSE Linux Products GmbH
8 * Copyright (c) 2007 Tejun Heo <teheo@suse.de>
9 *
10 * This file is released under the GPLv2.
11 *
12 * Please see Documentation/filesystems/sysfs.txt for more information.
7 */ 13 */
8 14
9#undef DEBUG 15#undef DEBUG
@@ -14,9 +20,9 @@
14#include <linux/kobject.h> 20#include <linux/kobject.h>
15#include <linux/module.h> 21#include <linux/module.h>
16#include <linux/slab.h> 22#include <linux/slab.h>
23#include <linux/mutex.h>
17 24
18#include <asm/uaccess.h> 25#include <asm/uaccess.h>
19#include <asm/semaphore.h>
20 26
21#include "sysfs.h" 27#include "sysfs.h"
22 28
@@ -30,8 +36,8 @@ static int
30fill_read(struct dentry *dentry, char *buffer, loff_t off, size_t count) 36fill_read(struct dentry *dentry, char *buffer, loff_t off, size_t count)
31{ 37{
32 struct sysfs_dirent *attr_sd = dentry->d_fsdata; 38 struct sysfs_dirent *attr_sd = dentry->d_fsdata;
33 struct bin_attribute *attr = attr_sd->s_elem.bin_attr.bin_attr; 39 struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr;
34 struct kobject *kobj = attr_sd->s_parent->s_elem.dir.kobj; 40 struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
35 int rc; 41 int rc;
36 42
37 /* need attr_sd for attr, its parent for kobj */ 43 /* need attr_sd for attr, its parent for kobj */
@@ -87,8 +93,8 @@ static int
87flush_write(struct dentry *dentry, char *buffer, loff_t offset, size_t count) 93flush_write(struct dentry *dentry, char *buffer, loff_t offset, size_t count)
88{ 94{
89 struct sysfs_dirent *attr_sd = dentry->d_fsdata; 95 struct sysfs_dirent *attr_sd = dentry->d_fsdata;
90 struct bin_attribute *attr = attr_sd->s_elem.bin_attr.bin_attr; 96 struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr;
91 struct kobject *kobj = attr_sd->s_parent->s_elem.dir.kobj; 97 struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
92 int rc; 98 int rc;
93 99
94 /* need attr_sd for attr, its parent for kobj */ 100 /* need attr_sd for attr, its parent for kobj */
@@ -140,8 +146,8 @@ static int mmap(struct file *file, struct vm_area_struct *vma)
140{ 146{
141 struct bin_buffer *bb = file->private_data; 147 struct bin_buffer *bb = file->private_data;
142 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; 148 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
143 struct bin_attribute *attr = attr_sd->s_elem.bin_attr.bin_attr; 149 struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr;
144 struct kobject *kobj = attr_sd->s_parent->s_elem.dir.kobj; 150 struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
145 int rc; 151 int rc;
146 152
147 mutex_lock(&bb->mutex); 153 mutex_lock(&bb->mutex);
@@ -167,12 +173,12 @@ static int mmap(struct file *file, struct vm_area_struct *vma)
167static int open(struct inode * inode, struct file * file) 173static int open(struct inode * inode, struct file * file)
168{ 174{
169 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; 175 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
170 struct bin_attribute *attr = attr_sd->s_elem.bin_attr.bin_attr; 176 struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr;
171 struct bin_buffer *bb = NULL; 177 struct bin_buffer *bb = NULL;
172 int error; 178 int error;
173 179
174 /* need attr_sd for attr */ 180 /* binary file operations requires both @sd and its parent */
175 if (!sysfs_get_active(attr_sd)) 181 if (!sysfs_get_active_two(attr_sd))
176 return -ENODEV; 182 return -ENODEV;
177 183
178 error = -EACCES; 184 error = -EACCES;
@@ -193,13 +199,12 @@ static int open(struct inode * inode, struct file * file)
193 mutex_init(&bb->mutex); 199 mutex_init(&bb->mutex);
194 file->private_data = bb; 200 file->private_data = bb;
195 201
196 /* open succeeded, put active reference and pin attr_sd */ 202 /* open succeeded, put active references */
197 sysfs_put_active(attr_sd); 203 sysfs_put_active_two(attr_sd);
198 sysfs_get(attr_sd);
199 return 0; 204 return 0;
200 205
201 err_out: 206 err_out:
202 sysfs_put_active(attr_sd); 207 sysfs_put_active_two(attr_sd);
203 kfree(bb); 208 kfree(bb);
204 return error; 209 return error;
205} 210}
@@ -211,7 +216,6 @@ static int release(struct inode * inode, struct file * file)
211 216
212 if (bb->mmapped) 217 if (bb->mmapped)
213 sysfs_put_active_two(attr_sd); 218 sysfs_put_active_two(attr_sd);
214 sysfs_put(attr_sd);
215 kfree(bb->buffer); 219 kfree(bb->buffer);
216 kfree(bb); 220 kfree(bb);
217 return 0; 221 return 0;
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 83e76b3813c9..9161db4d6b5c 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -1,5 +1,13 @@
1/* 1/*
2 * dir.c - Operations for sysfs directories. 2 * fs/sysfs/dir.c - sysfs core and dir operation implementation
3 *
4 * Copyright (c) 2001-3 Patrick Mochel
5 * Copyright (c) 2007 SUSE Linux Products GmbH
6 * Copyright (c) 2007 Tejun Heo <teheo@suse.de>
7 *
8 * This file is released under the GPLv2.
9 *
10 * Please see Documentation/filesystems/sysfs.txt for more information.
3 */ 11 */
4 12
5#undef DEBUG 13#undef DEBUG
@@ -11,10 +19,11 @@
11#include <linux/namei.h> 19#include <linux/namei.h>
12#include <linux/idr.h> 20#include <linux/idr.h>
13#include <linux/completion.h> 21#include <linux/completion.h>
14#include <asm/semaphore.h> 22#include <linux/mutex.h>
15#include "sysfs.h" 23#include "sysfs.h"
16 24
17DEFINE_MUTEX(sysfs_mutex); 25DEFINE_MUTEX(sysfs_mutex);
26DEFINE_MUTEX(sysfs_rename_mutex);
18spinlock_t sysfs_assoc_lock = SPIN_LOCK_UNLOCKED; 27spinlock_t sysfs_assoc_lock = SPIN_LOCK_UNLOCKED;
19 28
20static spinlock_t sysfs_ino_lock = SPIN_LOCK_UNLOCKED; 29static spinlock_t sysfs_ino_lock = SPIN_LOCK_UNLOCKED;
@@ -25,18 +34,28 @@ static DEFINE_IDA(sysfs_ino_ida);
25 * @sd: sysfs_dirent of interest 34 * @sd: sysfs_dirent of interest
26 * 35 *
27 * Link @sd into its sibling list which starts from 36 * Link @sd into its sibling list which starts from
28 * sd->s_parent->s_children. 37 * sd->s_parent->s_dir.children.
29 * 38 *
30 * Locking: 39 * Locking:
31 * mutex_lock(sysfs_mutex) 40 * mutex_lock(sysfs_mutex)
32 */ 41 */
33void sysfs_link_sibling(struct sysfs_dirent *sd) 42static void sysfs_link_sibling(struct sysfs_dirent *sd)
34{ 43{
35 struct sysfs_dirent *parent_sd = sd->s_parent; 44 struct sysfs_dirent *parent_sd = sd->s_parent;
45 struct sysfs_dirent **pos;
36 46
37 BUG_ON(sd->s_sibling); 47 BUG_ON(sd->s_sibling);
38 sd->s_sibling = parent_sd->s_children; 48
39 parent_sd->s_children = sd; 49 /* Store directory entries in order by ino. This allows
50 * readdir to properly restart without having to add a
51 * cursor into the s_dir.children list.
52 */
53 for (pos = &parent_sd->s_dir.children; *pos; pos = &(*pos)->s_sibling) {
54 if (sd->s_ino < (*pos)->s_ino)
55 break;
56 }
57 sd->s_sibling = *pos;
58 *pos = sd;
40} 59}
41 60
42/** 61/**
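
A standalone sketch (plain C, not kernel code) of the ordering trick described in the comment above: keeping siblings sorted by inode number lets a directory read restart from f_pos ("last inode handed out") without linking a cursor object into the child list. Names and inode numbers are illustrative.

#include <stdio.h>

struct toy_dirent {
	unsigned long ino;
	const char *name;
	struct toy_dirent *sibling;
};

/* insert sorted by ino, like the new sysfs_link_sibling() */
static void toy_link_sibling(struct toy_dirent **children, struct toy_dirent *sd)
{
	struct toy_dirent **pos;

	for (pos = children; *pos; pos = &(*pos)->sibling)
		if (sd->ino < (*pos)->ino)
			break;
	sd->sibling = *pos;
	*pos = sd;
}

/* skip entries below f_pos, then emit the rest; returns last ino emitted */
static unsigned long toy_readdir(struct toy_dirent *children, unsigned long f_pos)
{
	struct toy_dirent *pos = children;

	while (pos && f_pos > pos->ino)	/* already reported on earlier calls */
		pos = pos->sibling;
	for (; pos; pos = pos->sibling) {
		f_pos = pos->ino;
		printf("%lu %s\n", pos->ino, pos->name);
	}
	return f_pos;
}

int main(void)
{
	struct toy_dirent a = { 7, "power" }, b = { 3, "uevent" }, c = { 9, "subsystem" };
	struct toy_dirent *children = NULL;

	toy_link_sibling(&children, &a);
	toy_link_sibling(&children, &b);
	toy_link_sibling(&children, &c);

	toy_readdir(children, 2);	/* fresh read: 3, 7, 9 in ino order */
	toy_readdir(children, 7);	/* resumed read: 7, 9 (7 did not fit before) */
	return 0;
}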
@@ -44,16 +63,17 @@ void sysfs_link_sibling(struct sysfs_dirent *sd)
44 * @sd: sysfs_dirent of interest 63 * @sd: sysfs_dirent of interest
45 * 64 *
46 * Unlink @sd from its sibling list which starts from 65 * Unlink @sd from its sibling list which starts from
47 * sd->s_parent->s_children. 66 * sd->s_parent->s_dir.children.
48 * 67 *
49 * Locking: 68 * Locking:
50 * mutex_lock(sysfs_mutex) 69 * mutex_lock(sysfs_mutex)
51 */ 70 */
52void sysfs_unlink_sibling(struct sysfs_dirent *sd) 71static void sysfs_unlink_sibling(struct sysfs_dirent *sd)
53{ 72{
54 struct sysfs_dirent **pos; 73 struct sysfs_dirent **pos;
55 74
56 for (pos = &sd->s_parent->s_children; *pos; pos = &(*pos)->s_sibling) { 75 for (pos = &sd->s_parent->s_dir.children; *pos;
76 pos = &(*pos)->s_sibling) {
57 if (*pos == sd) { 77 if (*pos == sd) {
58 *pos = sd->s_sibling; 78 *pos = sd->s_sibling;
59 sd->s_sibling = NULL; 79 sd->s_sibling = NULL;
@@ -67,96 +87,39 @@ void sysfs_unlink_sibling(struct sysfs_dirent *sd)
67 * @sd: sysfs_dirent of interest 87 * @sd: sysfs_dirent of interest
68 * 88 *
69 * Get dentry for @sd. Dentry is looked up if currently not 89 * Get dentry for @sd. Dentry is looked up if currently not
70 * present. This function climbs sysfs_dirent tree till it 90 * present. This function descends from the root looking up
71 * reaches a sysfs_dirent with valid dentry attached and descends 91 * dentry for each step.
72 * down from there looking up dentry for each step.
73 * 92 *
74 * LOCKING: 93 * LOCKING:
75 * Kernel thread context (may sleep) 94 * mutex_lock(sysfs_rename_mutex)
76 * 95 *
77 * RETURNS: 96 * RETURNS:
78 * Pointer to found dentry on success, ERR_PTR() value on error. 97 * Pointer to found dentry on success, ERR_PTR() value on error.
79 */ 98 */
80struct dentry *sysfs_get_dentry(struct sysfs_dirent *sd) 99struct dentry *sysfs_get_dentry(struct sysfs_dirent *sd)
81{ 100{
82 struct sysfs_dirent *cur; 101 struct dentry *dentry = dget(sysfs_sb->s_root);
83 struct dentry *parent_dentry, *dentry;
84 int i, depth;
85
86 /* Find the first parent which has valid s_dentry and get the
87 * dentry.
88 */
89 mutex_lock(&sysfs_mutex);
90 restart0:
91 spin_lock(&sysfs_assoc_lock);
92 restart1:
93 spin_lock(&dcache_lock);
94 102
95 dentry = NULL; 103 while (dentry->d_fsdata != sd) {
96 depth = 0; 104 struct sysfs_dirent *cur;
97 cur = sd; 105 struct dentry *parent;
98 while (!cur->s_dentry || !cur->s_dentry->d_inode) {
99 if (cur->s_flags & SYSFS_FLAG_REMOVED) {
100 dentry = ERR_PTR(-ENOENT);
101 depth = 0;
102 break;
103 }
104 cur = cur->s_parent;
105 depth++;
106 }
107 if (!IS_ERR(dentry))
108 dentry = dget_locked(cur->s_dentry);
109 106
110 spin_unlock(&dcache_lock); 107 /* find the first ancestor which hasn't been looked up */
111 spin_unlock(&sysfs_assoc_lock); 108 cur = sd;
112 109 while (cur->s_parent != dentry->d_fsdata)
113 /* from the found dentry, look up depth times */
114 while (depth--) {
115 /* find and get depth'th ancestor */
116 for (cur = sd, i = 0; cur && i < depth; i++)
117 cur = cur->s_parent; 110 cur = cur->s_parent;
118 111
119 /* This can happen if tree structure was modified due
120 * to move/rename. Restart.
121 */
122 if (i != depth) {
123 dput(dentry);
124 goto restart0;
125 }
126
127 sysfs_get(cur);
128
129 mutex_unlock(&sysfs_mutex);
130
131 /* look it up */ 112 /* look it up */
132 parent_dentry = dentry; 113 parent = dentry;
133 dentry = lookup_one_len_kern(cur->s_name, parent_dentry, 114 mutex_lock(&parent->d_inode->i_mutex);
115 dentry = lookup_one_len_kern(cur->s_name, parent,
134 strlen(cur->s_name)); 116 strlen(cur->s_name));
135 dput(parent_dentry); 117 mutex_unlock(&parent->d_inode->i_mutex);
136 118 dput(parent);
137 if (IS_ERR(dentry)) {
138 sysfs_put(cur);
139 return dentry;
140 }
141 119
142 mutex_lock(&sysfs_mutex); 120 if (IS_ERR(dentry))
143 spin_lock(&sysfs_assoc_lock); 121 break;
144
145 /* This, again, can happen if tree structure has
146 * changed and we looked up the wrong thing. Restart.
147 */
148 if (cur->s_dentry != dentry) {
149 dput(dentry);
150 sysfs_put(cur);
151 goto restart1;
152 }
153
154 spin_unlock(&sysfs_assoc_lock);
155
156 sysfs_put(cur);
157 } 122 }
158
159 mutex_unlock(&sysfs_mutex);
160 return dentry; 123 return dentry;
161} 124}
162 125
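
A standalone sketch of the top-down walk the rewritten sysfs_get_dentry() above performs: starting from the root, repeatedly find the ancestor of the target whose parent is already resolved, "look it up", and step one level down until the target itself is reached. The types and names are simplified stand-ins for sysfs_dirent/dentry.

#include <stdio.h>

struct toy_sd {
	const char *name;
	struct toy_sd *parent;
};

static void toy_get_dentry(struct toy_sd *root, struct toy_sd *target)
{
	struct toy_sd *resolved = root;	/* plays the role of dget(s_root) */

	while (resolved != target) {
		struct toy_sd *cur = target;

		/* first ancestor not yet resolved (its parent is) */
		while (cur->parent != resolved)
			cur = cur->parent;

		printf("lookup \"%s\" under \"%s\"\n", cur->name, resolved->name);
		resolved = cur;		/* step one level down */
	}
}

int main(void)
{
	struct toy_sd root    = { "/", NULL };
	struct toy_sd devices = { "devices", &root };
	struct toy_sd pci     = { "pci0000:00", &devices };
	struct toy_sd dev     = { "0000:00:1f.2", &pci };

	/* resolves devices, then pci0000:00, then 0000:00:1f.2 */
	toy_get_dentry(&root, &dev);
	return 0;
}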
@@ -319,7 +282,7 @@ void release_sysfs_dirent(struct sysfs_dirent * sd)
319 parent_sd = sd->s_parent; 282 parent_sd = sd->s_parent;
320 283
321 if (sysfs_type(sd) == SYSFS_KOBJ_LINK) 284 if (sysfs_type(sd) == SYSFS_KOBJ_LINK)
322 sysfs_put(sd->s_elem.symlink.target_sd); 285 sysfs_put(sd->s_symlink.target_sd);
323 if (sysfs_type(sd) & SYSFS_COPY_NAME) 286 if (sysfs_type(sd) & SYSFS_COPY_NAME)
324 kfree(sd->s_name); 287 kfree(sd->s_name);
325 kfree(sd->s_iattr); 288 kfree(sd->s_iattr);
@@ -335,22 +298,7 @@ static void sysfs_d_iput(struct dentry * dentry, struct inode * inode)
335{ 298{
336 struct sysfs_dirent * sd = dentry->d_fsdata; 299 struct sysfs_dirent * sd = dentry->d_fsdata;
337 300
338 if (sd) { 301 sysfs_put(sd);
339 /* sd->s_dentry is protected with sysfs_assoc_lock.
340 * This allows sysfs_drop_dentry() to dereference it.
341 */
342 spin_lock(&sysfs_assoc_lock);
343
344 /* The dentry might have been deleted or another
345 * lookup could have happened updating sd->s_dentry to
346 * point the new dentry. Ignore if it isn't pointing
347 * to this dentry.
348 */
349 if (sd->s_dentry == dentry)
350 sd->s_dentry = NULL;
351 spin_unlock(&sysfs_assoc_lock);
352 sysfs_put(sd);
353 }
354 iput(inode); 302 iput(inode);
355} 303}
356 304
@@ -378,7 +326,6 @@ struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type)
378 326
379 atomic_set(&sd->s_count, 1); 327 atomic_set(&sd->s_count, 1);
380 atomic_set(&sd->s_active, 0); 328 atomic_set(&sd->s_active, 0);
381 atomic_set(&sd->s_event, 1);
382 329
383 sd->s_name = name; 330 sd->s_name = name;
384 sd->s_mode = mode; 331 sd->s_mode = mode;
@@ -393,30 +340,6 @@ struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type)
393 return NULL; 340 return NULL;
394} 341}
395 342
396/**
397 * sysfs_attach_dentry - associate sysfs_dirent with dentry
398 * @sd: target sysfs_dirent
399 * @dentry: dentry to associate
400 *
401 * Associate @sd with @dentry. This is protected by
402 * sysfs_assoc_lock to avoid race with sysfs_d_iput().
403 *
404 * LOCKING:
405 * mutex_lock(sysfs_mutex)
406 */
407static void sysfs_attach_dentry(struct sysfs_dirent *sd, struct dentry *dentry)
408{
409 dentry->d_op = &sysfs_dentry_ops;
410 dentry->d_fsdata = sysfs_get(sd);
411
412 /* protect sd->s_dentry against sysfs_d_iput */
413 spin_lock(&sysfs_assoc_lock);
414 sd->s_dentry = dentry;
415 spin_unlock(&sysfs_assoc_lock);
416
417 d_rehash(dentry);
418}
419
420static int sysfs_ilookup_test(struct inode *inode, void *arg) 343static int sysfs_ilookup_test(struct inode *inode, void *arg)
421{ 344{
422 struct sysfs_dirent *sd = arg; 345 struct sysfs_dirent *sd = arg;
@@ -480,10 +403,8 @@ void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt,
480 * @sd: sysfs_dirent to be added 403 * @sd: sysfs_dirent to be added
481 * 404 *
482 * Get @acxt->parent_sd and set sd->s_parent to it and increment 405 * Get @acxt->parent_sd and set sd->s_parent to it and increment
483 * nlink of parent inode if @sd is a directory. @sd is NOT 406 * nlink of parent inode if @sd is a directory and link into the
484 * linked into the children list of the parent. The caller 407 * children list of the parent.
485 * should invoke sysfs_link_sibling() after this function
486 * completes if @sd needs to be on the children list.
487 * 408 *
488 * This function should be called between calls to 409 * This function should be called between calls to
489 * sysfs_addrm_start() and sysfs_addrm_finish() and should be 410 * sysfs_addrm_start() and sysfs_addrm_finish() and should be
@@ -491,15 +412,30 @@ void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt,
491 * 412 *
492 * LOCKING: 413 * LOCKING:
493 * Determined by sysfs_addrm_start(). 414 * Determined by sysfs_addrm_start().
415 *
416 * RETURNS:
417 * 0 on success, -EEXIST if entry with the given name already
418 * exists.
494 */ 419 */
495void sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) 420int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
496{ 421{
422 if (sysfs_find_dirent(acxt->parent_sd, sd->s_name)) {
423 printk(KERN_WARNING "sysfs: duplicate filename '%s' "
424 "can not be created\n", sd->s_name);
425 WARN_ON(1);
426 return -EEXIST;
427 }
428
497 sd->s_parent = sysfs_get(acxt->parent_sd); 429 sd->s_parent = sysfs_get(acxt->parent_sd);
498 430
499 if (sysfs_type(sd) == SYSFS_DIR && acxt->parent_inode) 431 if (sysfs_type(sd) == SYSFS_DIR && acxt->parent_inode)
500 inc_nlink(acxt->parent_inode); 432 inc_nlink(acxt->parent_inode);
501 433
502 acxt->cnt++; 434 acxt->cnt++;
435
436 sysfs_link_sibling(sd);
437
438 return 0;
503} 439}
504 440
505/** 441/**
@@ -508,9 +444,7 @@ void sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
508 * @sd: sysfs_dirent to be added 444 * @sd: sysfs_dirent to be added
509 * 445 *
510 * Mark @sd removed and drop nlink of parent inode if @sd is a 446 * Mark @sd removed and drop nlink of parent inode if @sd is a
511 * directory. @sd is NOT unlinked from the children list of the 447 * directory. @sd is unlinked from the children list.
512 * parent. The caller is repsonsible for removing @sd from the
513 * children list before calling this function.
514 * 448 *
515 * This function should be called between calls to 449 * This function should be called between calls to
516 * sysfs_addrm_start() and sysfs_addrm_finish() and should be 450 * sysfs_addrm_start() and sysfs_addrm_finish() and should be
@@ -521,7 +455,9 @@ void sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
521 */ 455 */
522void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) 456void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
523{ 457{
524 BUG_ON(sd->s_sibling || (sd->s_flags & SYSFS_FLAG_REMOVED)); 458 BUG_ON(sd->s_flags & SYSFS_FLAG_REMOVED);
459
460 sysfs_unlink_sibling(sd);
525 461
526 sd->s_flags |= SYSFS_FLAG_REMOVED; 462 sd->s_flags |= SYSFS_FLAG_REMOVED;
527 sd->s_sibling = acxt->removed; 463 sd->s_sibling = acxt->removed;
@@ -540,53 +476,49 @@ void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
540 * Drop dentry for @sd. @sd must have been unlinked from its 476 * Drop dentry for @sd. @sd must have been unlinked from its
541 * parent on entry to this function such that it can't be looked 477 * parent on entry to this function such that it can't be looked
542 * up anymore. 478 * up anymore.
543 *
544 * @sd->s_dentry which is protected with sysfs_assoc_lock points
545 * to the currently associated dentry but we're not holding a
546 * reference to it and racing with dput(). Grab dcache_lock and
547 * verify dentry before dropping it. If @sd->s_dentry is NULL or
548 * dput() beats us, no need to bother.
549 */ 479 */
550static void sysfs_drop_dentry(struct sysfs_dirent *sd) 480static void sysfs_drop_dentry(struct sysfs_dirent *sd)
551{ 481{
552 struct dentry *dentry = NULL;
553 struct inode *inode; 482 struct inode *inode;
483 struct dentry *dentry;
554 484
555 /* We're not holding a reference to ->s_dentry dentry but the 485 inode = ilookup(sysfs_sb, sd->s_ino);
556 * field will stay valid as long as sysfs_assoc_lock is held. 486 if (!inode)
487 return;
488
489 /* Drop any existing dentries associated with sd.
490 *
491 * For the dentry to be properly freed we need to grab a
492 * reference to the dentry under the dcache lock, unhash it,
493 * and then put it. The playing with the dentry count allows
494 * dput to immediately free the dentry if it is not in use.
557 */ 495 */
558 spin_lock(&sysfs_assoc_lock); 496repeat:
559 spin_lock(&dcache_lock); 497 spin_lock(&dcache_lock);
560 498 list_for_each_entry(dentry, &inode->i_dentry, d_alias) {
561 /* drop dentry if it's there and dput() didn't kill it yet */ 499 if (d_unhashed(dentry))
562 if (sd->s_dentry && sd->s_dentry->d_inode) { 500 continue;
563 dentry = dget_locked(sd->s_dentry); 501 dget_locked(dentry);
564 spin_lock(&dentry->d_lock); 502 spin_lock(&dentry->d_lock);
565 __d_drop(dentry); 503 __d_drop(dentry);
566 spin_unlock(&dentry->d_lock); 504 spin_unlock(&dentry->d_lock);
505 spin_unlock(&dcache_lock);
506 dput(dentry);
507 goto repeat;
567 } 508 }
568
569 spin_unlock(&dcache_lock); 509 spin_unlock(&dcache_lock);
570 spin_unlock(&sysfs_assoc_lock);
571
572 /* dentries for shadowed inodes are pinned, unpin */
573 if (dentry && sysfs_is_shadowed_inode(dentry->d_inode))
574 dput(dentry);
575 dput(dentry);
576 510
577 /* adjust nlink and update timestamp */ 511 /* adjust nlink and update timestamp */
578 inode = ilookup(sysfs_sb, sd->s_ino); 512 mutex_lock(&inode->i_mutex);
579 if (inode) {
580 mutex_lock(&inode->i_mutex);
581 513
582 inode->i_ctime = CURRENT_TIME; 514 inode->i_ctime = CURRENT_TIME;
515 drop_nlink(inode);
516 if (sysfs_type(sd) == SYSFS_DIR)
583 drop_nlink(inode); 517 drop_nlink(inode);
584 if (sysfs_type(sd) == SYSFS_DIR)
585 drop_nlink(inode);
586 518
587 mutex_unlock(&inode->i_mutex); 519 mutex_unlock(&inode->i_mutex);
588 iput(inode); 520
589 } 521 iput(inode);
590} 522}
591 523
592/** 524/**
@@ -599,11 +531,8 @@ static void sysfs_drop_dentry(struct sysfs_dirent *sd)
599 * 531 *
600 * LOCKING: 532 * LOCKING:
601 * All mutexes acquired by sysfs_addrm_start() are released. 533 * All mutexes acquired by sysfs_addrm_start() are released.
602 *
603 * RETURNS:
604 * Number of added/removed sysfs_dirents since sysfs_addrm_start().
605 */ 534 */
606int sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt) 535void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt)
607{ 536{
608 /* release resources acquired by sysfs_addrm_start() */ 537 /* release resources acquired by sysfs_addrm_start() */
609 mutex_unlock(&sysfs_mutex); 538 mutex_unlock(&sysfs_mutex);
@@ -629,8 +558,6 @@ int sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt)
629 sysfs_deactivate(sd); 558 sysfs_deactivate(sd);
630 sysfs_put(sd); 559 sysfs_put(sd);
631 } 560 }
632
633 return acxt->cnt;
634} 561}
635 562
636/** 563/**
@@ -651,8 +578,8 @@ struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
651{ 578{
652 struct sysfs_dirent *sd; 579 struct sysfs_dirent *sd;
653 580
654 for (sd = parent_sd->s_children; sd; sd = sd->s_sibling) 581 for (sd = parent_sd->s_dir.children; sd; sd = sd->s_sibling)
655 if (sysfs_type(sd) && !strcmp(sd->s_name, name)) 582 if (!strcmp(sd->s_name, name))
656 return sd; 583 return sd;
657 return NULL; 584 return NULL;
658} 585}
@@ -690,28 +617,25 @@ static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd,
690 umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO; 617 umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO;
691 struct sysfs_addrm_cxt acxt; 618 struct sysfs_addrm_cxt acxt;
692 struct sysfs_dirent *sd; 619 struct sysfs_dirent *sd;
620 int rc;
693 621
694 /* allocate */ 622 /* allocate */
695 sd = sysfs_new_dirent(name, mode, SYSFS_DIR); 623 sd = sysfs_new_dirent(name, mode, SYSFS_DIR);
696 if (!sd) 624 if (!sd)
697 return -ENOMEM; 625 return -ENOMEM;
698 sd->s_elem.dir.kobj = kobj; 626 sd->s_dir.kobj = kobj;
699 627
700 /* link in */ 628 /* link in */
701 sysfs_addrm_start(&acxt, parent_sd); 629 sysfs_addrm_start(&acxt, parent_sd);
630 rc = sysfs_add_one(&acxt, sd);
631 sysfs_addrm_finish(&acxt);
702 632
703 if (!sysfs_find_dirent(parent_sd, name)) { 633 if (rc == 0)
704 sysfs_add_one(&acxt, sd); 634 *p_sd = sd;
705 sysfs_link_sibling(sd); 635 else
706 }
707
708 if (!sysfs_addrm_finish(&acxt)) {
709 sysfs_put(sd); 636 sysfs_put(sd);
710 return -EEXIST;
711 }
712 637
713 *p_sd = sd; 638 return rc;
714 return 0;
715} 639}
716 640
717int sysfs_create_subdir(struct kobject *kobj, const char *name, 641int sysfs_create_subdir(struct kobject *kobj, const char *name,
@@ -723,24 +647,18 @@ int sysfs_create_subdir(struct kobject *kobj, const char *name,
723/** 647/**
724 * sysfs_create_dir - create a directory for an object. 648 * sysfs_create_dir - create a directory for an object.
725 * @kobj: object we're creating directory for. 649 * @kobj: object we're creating directory for.
726 * @shadow_parent: parent object.
727 */ 650 */
728int sysfs_create_dir(struct kobject *kobj, 651int sysfs_create_dir(struct kobject * kobj)
729 struct sysfs_dirent *shadow_parent_sd)
730{ 652{
731 struct sysfs_dirent *parent_sd, *sd; 653 struct sysfs_dirent *parent_sd, *sd;
732 int error = 0; 654 int error = 0;
733 655
734 BUG_ON(!kobj); 656 BUG_ON(!kobj);
735 657
736 if (shadow_parent_sd) 658 if (kobj->parent)
737 parent_sd = shadow_parent_sd;
738 else if (kobj->parent)
739 parent_sd = kobj->parent->sd; 659 parent_sd = kobj->parent->sd;
740 else if (sysfs_mount && sysfs_mount->mnt_sb)
741 parent_sd = sysfs_mount->mnt_sb->s_root->d_fsdata;
742 else 660 else
743 return -EFAULT; 661 parent_sd = &sysfs_root;
744 662
745 error = create_dir(kobj, parent_sd, kobject_name(kobj), &sd); 663 error = create_dir(kobj, parent_sd, kobject_name(kobj), &sd);
746 if (!error) 664 if (!error)
@@ -748,39 +666,20 @@ int sysfs_create_dir(struct kobject *kobj,
748 return error; 666 return error;
749} 667}
750 668
751static int sysfs_count_nlink(struct sysfs_dirent *sd)
752{
753 struct sysfs_dirent *child;
754 int nr = 0;
755
756 for (child = sd->s_children; child; child = child->s_sibling)
757 if (sysfs_type(child) == SYSFS_DIR)
758 nr++;
759 return nr + 2;
760}
761
762static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry, 669static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry,
763 struct nameidata *nd) 670 struct nameidata *nd)
764{ 671{
765 struct dentry *ret = NULL; 672 struct dentry *ret = NULL;
766 struct sysfs_dirent * parent_sd = dentry->d_parent->d_fsdata; 673 struct sysfs_dirent *parent_sd = dentry->d_parent->d_fsdata;
767 struct sysfs_dirent * sd; 674 struct sysfs_dirent *sd;
768 struct bin_attribute *bin_attr;
769 struct inode *inode; 675 struct inode *inode;
770 int found = 0;
771 676
772 mutex_lock(&sysfs_mutex); 677 mutex_lock(&sysfs_mutex);
773 678
774 for (sd = parent_sd->s_children; sd; sd = sd->s_sibling) { 679 sd = sysfs_find_dirent(parent_sd, dentry->d_name.name);
775 if (sysfs_type(sd) &&
776 !strcmp(sd->s_name, dentry->d_name.name)) {
777 found = 1;
778 break;
779 }
780 }
781 680
782 /* no such entry */ 681 /* no such entry */
783 if (!found) 682 if (!sd)
784 goto out_unlock; 683 goto out_unlock;
785 684
786 /* attach dentry and inode */ 685 /* attach dentry and inode */
@@ -790,33 +689,11 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry,
790 goto out_unlock; 689 goto out_unlock;
791 } 690 }
792 691
793 if (inode->i_state & I_NEW) { 692 /* instantiate and hash dentry */
794 /* initialize inode according to type */ 693 dentry->d_op = &sysfs_dentry_ops;
795 switch (sysfs_type(sd)) { 694 dentry->d_fsdata = sysfs_get(sd);
796 case SYSFS_DIR: 695 d_instantiate(dentry, inode);
797 inode->i_op = &sysfs_dir_inode_operations; 696 d_rehash(dentry);
798 inode->i_fop = &sysfs_dir_operations;
799 inode->i_nlink = sysfs_count_nlink(sd);
800 break;
801 case SYSFS_KOBJ_ATTR:
802 inode->i_size = PAGE_SIZE;
803 inode->i_fop = &sysfs_file_operations;
804 break;
805 case SYSFS_KOBJ_BIN_ATTR:
806 bin_attr = sd->s_elem.bin_attr.bin_attr;
807 inode->i_size = bin_attr->size;
808 inode->i_fop = &bin_fops;
809 break;
810 case SYSFS_KOBJ_LINK:
811 inode->i_op = &sysfs_symlink_inode_operations;
812 break;
813 default:
814 BUG();
815 }
816 }
817
818 sysfs_instantiate(dentry, inode);
819 sysfs_attach_dentry(sd, dentry);
820 697
821 out_unlock: 698 out_unlock:
822 mutex_unlock(&sysfs_mutex); 699 mutex_unlock(&sysfs_mutex);
@@ -833,7 +710,6 @@ static void remove_dir(struct sysfs_dirent *sd)
833 struct sysfs_addrm_cxt acxt; 710 struct sysfs_addrm_cxt acxt;
834 711
835 sysfs_addrm_start(&acxt, sd->s_parent); 712 sysfs_addrm_start(&acxt, sd->s_parent);
836 sysfs_unlink_sibling(sd);
837 sysfs_remove_one(&acxt, sd); 713 sysfs_remove_one(&acxt, sd);
838 sysfs_addrm_finish(&acxt); 714 sysfs_addrm_finish(&acxt);
839} 715}
@@ -854,15 +730,13 @@ static void __sysfs_remove_dir(struct sysfs_dirent *dir_sd)
854 730
855 pr_debug("sysfs %s: removing dir\n", dir_sd->s_name); 731 pr_debug("sysfs %s: removing dir\n", dir_sd->s_name);
856 sysfs_addrm_start(&acxt, dir_sd); 732 sysfs_addrm_start(&acxt, dir_sd);
857 pos = &dir_sd->s_children; 733 pos = &dir_sd->s_dir.children;
858 while (*pos) { 734 while (*pos) {
859 struct sysfs_dirent *sd = *pos; 735 struct sysfs_dirent *sd = *pos;
860 736
861 if (sysfs_type(sd) && sysfs_type(sd) != SYSFS_DIR) { 737 if (sysfs_type(sd) != SYSFS_DIR)
862 *pos = sd->s_sibling;
863 sd->s_sibling = NULL;
864 sysfs_remove_one(&acxt, sd); 738 sysfs_remove_one(&acxt, sd);
865 } else 739 else
866 pos = &(*pos)->s_sibling; 740 pos = &(*pos)->s_sibling;
867 } 741 }
868 sysfs_addrm_finish(&acxt); 742 sysfs_addrm_finish(&acxt);
@@ -890,90 +764,68 @@ void sysfs_remove_dir(struct kobject * kobj)
890 __sysfs_remove_dir(sd); 764 __sysfs_remove_dir(sd);
891} 765}
892 766
893int sysfs_rename_dir(struct kobject *kobj, struct sysfs_dirent *new_parent_sd, 767int sysfs_rename_dir(struct kobject * kobj, const char *new_name)
894 const char *new_name)
895{ 768{
896 struct sysfs_dirent *sd = kobj->sd; 769 struct sysfs_dirent *sd = kobj->sd;
897 struct dentry *new_parent = NULL; 770 struct dentry *parent = NULL;
898 struct dentry *old_dentry = NULL, *new_dentry = NULL; 771 struct dentry *old_dentry = NULL, *new_dentry = NULL;
899 const char *dup_name = NULL; 772 const char *dup_name = NULL;
900 int error; 773 int error;
901 774
902 /* get dentries */ 775 mutex_lock(&sysfs_rename_mutex);
776
777 error = 0;
778 if (strcmp(sd->s_name, new_name) == 0)
779 goto out; /* nothing to rename */
780
781 /* get the original dentry */
903 old_dentry = sysfs_get_dentry(sd); 782 old_dentry = sysfs_get_dentry(sd);
904 if (IS_ERR(old_dentry)) { 783 if (IS_ERR(old_dentry)) {
905 error = PTR_ERR(old_dentry); 784 error = PTR_ERR(old_dentry);
906 goto out_dput; 785 goto out;
907 }
908
909 new_parent = sysfs_get_dentry(new_parent_sd);
910 if (IS_ERR(new_parent)) {
911 error = PTR_ERR(new_parent);
912 goto out_dput;
913 } 786 }
914 787
915 /* lock new_parent and get dentry for new name */ 788 parent = old_dentry->d_parent;
916 mutex_lock(&new_parent->d_inode->i_mutex);
917 789
918 new_dentry = lookup_one_len(new_name, new_parent, strlen(new_name)); 790 /* lock parent and get dentry for new name */
919 if (IS_ERR(new_dentry)) { 791 mutex_lock(&parent->d_inode->i_mutex);
920 error = PTR_ERR(new_dentry); 792 mutex_lock(&sysfs_mutex);
921 goto out_unlock;
922 }
923 793
924 /* By allowing two different directories with the same 794 error = -EEXIST;
925 * d_parent we allow this routine to move between different 795 if (sysfs_find_dirent(sd->s_parent, new_name))
926 * shadows of the same directory
927 */
928 error = -EINVAL;
929 if (old_dentry->d_parent->d_inode != new_parent->d_inode ||
930 new_dentry->d_parent->d_inode != new_parent->d_inode ||
931 old_dentry == new_dentry)
932 goto out_unlock; 796 goto out_unlock;
933 797
934 error = -EEXIST; 798 error = -ENOMEM;
935 if (new_dentry->d_inode) 799 new_dentry = d_alloc_name(parent, new_name);
800 if (!new_dentry)
936 goto out_unlock; 801 goto out_unlock;
937 802
938 /* rename kobject and sysfs_dirent */ 803 /* rename kobject and sysfs_dirent */
939 error = -ENOMEM; 804 error = -ENOMEM;
940 new_name = dup_name = kstrdup(new_name, GFP_KERNEL); 805 new_name = dup_name = kstrdup(new_name, GFP_KERNEL);
941 if (!new_name) 806 if (!new_name)
942 goto out_drop; 807 goto out_unlock;
943 808
944 error = kobject_set_name(kobj, "%s", new_name); 809 error = kobject_set_name(kobj, "%s", new_name);
945 if (error) 810 if (error)
946 goto out_drop; 811 goto out_unlock;
947
948 mutex_lock(&sysfs_mutex);
949 812
950 dup_name = sd->s_name; 813 dup_name = sd->s_name;
951 sd->s_name = new_name; 814 sd->s_name = new_name;
952 815
953 /* move under the new parent */ 816 /* rename */
954 d_add(new_dentry, NULL); 817 d_add(new_dentry, NULL);
955 d_move(sd->s_dentry, new_dentry); 818 d_move(old_dentry, new_dentry);
956
957 sysfs_unlink_sibling(sd);
958 sysfs_get(new_parent_sd);
959 sysfs_put(sd->s_parent);
960 sd->s_parent = new_parent_sd;
961 sysfs_link_sibling(sd);
962
963 mutex_unlock(&sysfs_mutex);
964 819
965 error = 0; 820 error = 0;
966 goto out_unlock;
967
968 out_drop:
969 d_drop(new_dentry);
970 out_unlock: 821 out_unlock:
971 mutex_unlock(&new_parent->d_inode->i_mutex); 822 mutex_unlock(&sysfs_mutex);
972 out_dput: 823 mutex_unlock(&parent->d_inode->i_mutex);
973 kfree(dup_name); 824 kfree(dup_name);
974 dput(new_parent);
975 dput(old_dentry); 825 dput(old_dentry);
976 dput(new_dentry); 826 dput(new_dentry);
827 out:
828 mutex_unlock(&sysfs_rename_mutex);
977 return error; 829 return error;
978} 830}
979 831
@@ -985,96 +837,69 @@ int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent_kobj)
985 struct dentry *old_dentry = NULL, *new_dentry = NULL; 837 struct dentry *old_dentry = NULL, *new_dentry = NULL;
986 int error; 838 int error;
987 839
840 mutex_lock(&sysfs_rename_mutex);
988 BUG_ON(!sd->s_parent); 841 BUG_ON(!sd->s_parent);
989 new_parent_sd = new_parent_kobj->sd ? new_parent_kobj->sd : &sysfs_root; 842 new_parent_sd = new_parent_kobj->sd ? new_parent_kobj->sd : &sysfs_root;
990 843
844 error = 0;
845 if (sd->s_parent == new_parent_sd)
846 goto out; /* nothing to move */
847
991 /* get dentries */ 848 /* get dentries */
992 old_dentry = sysfs_get_dentry(sd); 849 old_dentry = sysfs_get_dentry(sd);
993 if (IS_ERR(old_dentry)) { 850 if (IS_ERR(old_dentry)) {
994 error = PTR_ERR(old_dentry); 851 error = PTR_ERR(old_dentry);
995 goto out_dput; 852 goto out;
996 } 853 }
997 old_parent = sd->s_parent->s_dentry; 854 old_parent = old_dentry->d_parent;
998 855
999 new_parent = sysfs_get_dentry(new_parent_sd); 856 new_parent = sysfs_get_dentry(new_parent_sd);
1000 if (IS_ERR(new_parent)) { 857 if (IS_ERR(new_parent)) {
1001 error = PTR_ERR(new_parent); 858 error = PTR_ERR(new_parent);
1002 goto out_dput; 859 goto out;
1003 } 860 }
1004 861
1005 if (old_parent->d_inode == new_parent->d_inode) {
1006 error = 0;
1007 goto out_dput; /* nothing to move */
1008 }
1009again: 862again:
1010 mutex_lock(&old_parent->d_inode->i_mutex); 863 mutex_lock(&old_parent->d_inode->i_mutex);
1011 if (!mutex_trylock(&new_parent->d_inode->i_mutex)) { 864 if (!mutex_trylock(&new_parent->d_inode->i_mutex)) {
1012 mutex_unlock(&old_parent->d_inode->i_mutex); 865 mutex_unlock(&old_parent->d_inode->i_mutex);
1013 goto again; 866 goto again;
1014 } 867 }
868 mutex_lock(&sysfs_mutex);
1015 869
1016 new_dentry = lookup_one_len(kobj->name, new_parent, strlen(kobj->name)); 870 error = -EEXIST;
1017 if (IS_ERR(new_dentry)) { 871 if (sysfs_find_dirent(new_parent_sd, sd->s_name))
1018 error = PTR_ERR(new_dentry);
1019 goto out_unlock; 872 goto out_unlock;
1020 } else 873
1021 error = 0; 874 error = -ENOMEM;
875 new_dentry = d_alloc_name(new_parent, sd->s_name);
876 if (!new_dentry)
877 goto out_unlock;
878
879 error = 0;
1022 d_add(new_dentry, NULL); 880 d_add(new_dentry, NULL);
1023 d_move(sd->s_dentry, new_dentry); 881 d_move(old_dentry, new_dentry);
1024 dput(new_dentry); 882 dput(new_dentry);
1025 883
1026 /* Remove from old parent's list and insert into new parent's list. */ 884 /* Remove from old parent's list and insert into new parent's list. */
1027 mutex_lock(&sysfs_mutex);
1028
1029 sysfs_unlink_sibling(sd); 885 sysfs_unlink_sibling(sd);
1030 sysfs_get(new_parent_sd); 886 sysfs_get(new_parent_sd);
1031 sysfs_put(sd->s_parent); 887 sysfs_put(sd->s_parent);
1032 sd->s_parent = new_parent_sd; 888 sd->s_parent = new_parent_sd;
1033 sysfs_link_sibling(sd); 889 sysfs_link_sibling(sd);
1034 890
1035 mutex_unlock(&sysfs_mutex);
1036
1037 out_unlock: 891 out_unlock:
892 mutex_unlock(&sysfs_mutex);
1038 mutex_unlock(&new_parent->d_inode->i_mutex); 893 mutex_unlock(&new_parent->d_inode->i_mutex);
1039 mutex_unlock(&old_parent->d_inode->i_mutex); 894 mutex_unlock(&old_parent->d_inode->i_mutex);
1040 out_dput: 895 out:
1041 dput(new_parent); 896 dput(new_parent);
1042 dput(old_dentry); 897 dput(old_dentry);
1043 dput(new_dentry); 898 dput(new_dentry);
899 mutex_unlock(&sysfs_rename_mutex);
1044 return error; 900 return error;
1045} 901}
1046 902
1047static int sysfs_dir_open(struct inode *inode, struct file *file)
1048{
1049 struct dentry * dentry = file->f_path.dentry;
1050 struct sysfs_dirent * parent_sd = dentry->d_fsdata;
1051 struct sysfs_dirent * sd;
1052
1053 sd = sysfs_new_dirent("_DIR_", 0, 0);
1054 if (sd) {
1055 mutex_lock(&sysfs_mutex);
1056 sd->s_parent = sysfs_get(parent_sd);
1057 sysfs_link_sibling(sd);
1058 mutex_unlock(&sysfs_mutex);
1059 }
1060
1061 file->private_data = sd;
1062 return sd ? 0 : -ENOMEM;
1063}
1064
1065static int sysfs_dir_close(struct inode *inode, struct file *file)
1066{
1067 struct sysfs_dirent * cursor = file->private_data;
1068
1069 mutex_lock(&sysfs_mutex);
1070 sysfs_unlink_sibling(cursor);
1071 mutex_unlock(&sysfs_mutex);
1072
1073 release_sysfs_dirent(cursor);
1074
1075 return 0;
1076}
1077
1078/* Relationship between s_mode and the DT_xxx types */ 903/* Relationship between s_mode and the DT_xxx types */
1079static inline unsigned char dt_type(struct sysfs_dirent *sd) 904static inline unsigned char dt_type(struct sysfs_dirent *sd)
1080{ 905{
@@ -1085,232 +910,51 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
1085{ 910{
1086 struct dentry *dentry = filp->f_path.dentry; 911 struct dentry *dentry = filp->f_path.dentry;
1087 struct sysfs_dirent * parent_sd = dentry->d_fsdata; 912 struct sysfs_dirent * parent_sd = dentry->d_fsdata;
1088 struct sysfs_dirent *cursor = filp->private_data; 913 struct sysfs_dirent *pos;
1089 struct sysfs_dirent **pos;
1090 ino_t ino; 914 ino_t ino;
1091 int i = filp->f_pos;
1092 915
1093 switch (i) { 916 if (filp->f_pos == 0) {
1094 case 0: 917 ino = parent_sd->s_ino;
1095 ino = parent_sd->s_ino; 918 if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) == 0)
1096 if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
1097 break;
1098 filp->f_pos++; 919 filp->f_pos++;
1099 i++; 920 }
1100 /* fallthrough */ 921 if (filp->f_pos == 1) {
1101 case 1: 922 if (parent_sd->s_parent)
1102 if (parent_sd->s_parent) 923 ino = parent_sd->s_parent->s_ino;
1103 ino = parent_sd->s_parent->s_ino; 924 else
1104 else 925 ino = parent_sd->s_ino;
1105 ino = parent_sd->s_ino; 926 if (filldir(dirent, "..", 2, filp->f_pos, ino, DT_DIR) == 0)
1106 if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0)
1107 break;
1108 filp->f_pos++; 927 filp->f_pos++;
1109 i++; 928 }
1110 /* fallthrough */ 929 if ((filp->f_pos > 1) && (filp->f_pos < INT_MAX)) {
1111 default: 930 mutex_lock(&sysfs_mutex);
1112 mutex_lock(&sysfs_mutex);
1113
1114 pos = &parent_sd->s_children;
1115 while (*pos != cursor)
1116 pos = &(*pos)->s_sibling;
1117
1118 /* unlink cursor */
1119 *pos = cursor->s_sibling;
1120
1121 if (filp->f_pos == 2)
1122 pos = &parent_sd->s_children;
1123
1124 for ( ; *pos; pos = &(*pos)->s_sibling) {
1125 struct sysfs_dirent *next = *pos;
1126 const char * name;
1127 int len;
1128
1129 if (!sysfs_type(next))
1130 continue;
1131
1132 name = next->s_name;
1133 len = strlen(name);
1134 ino = next->s_ino;
1135
1136 if (filldir(dirent, name, len, filp->f_pos, ino,
1137 dt_type(next)) < 0)
1138 break;
1139 931
1140 filp->f_pos++; 932 /* Skip the dentries we have already reported */
1141 } 933 pos = parent_sd->s_dir.children;
934 while (pos && (filp->f_pos > pos->s_ino))
935 pos = pos->s_sibling;
1142 936
1143 /* put cursor back in */ 937 for ( ; pos; pos = pos->s_sibling) {
1144 cursor->s_sibling = *pos; 938 const char * name;
1145 *pos = cursor; 939 int len;
1146 940
1147 mutex_unlock(&sysfs_mutex); 941 name = pos->s_name;
1148 } 942 len = strlen(name);
1149 return 0; 943 filp->f_pos = ino = pos->s_ino;
1150}
1151
1152static loff_t sysfs_dir_lseek(struct file * file, loff_t offset, int origin)
1153{
1154 struct dentry * dentry = file->f_path.dentry;
1155 944
1156 switch (origin) { 945 if (filldir(dirent, name, len, filp->f_pos, ino,
1157 case 1: 946 dt_type(pos)) < 0)
1158 offset += file->f_pos;
1159 case 0:
1160 if (offset >= 0)
1161 break; 947 break;
1162 default:
1163 return -EINVAL;
1164 }
1165 if (offset != file->f_pos) {
1166 mutex_lock(&sysfs_mutex);
1167
1168 file->f_pos = offset;
1169 if (file->f_pos >= 2) {
1170 struct sysfs_dirent *sd = dentry->d_fsdata;
1171 struct sysfs_dirent *cursor = file->private_data;
1172 struct sysfs_dirent **pos;
1173 loff_t n = file->f_pos - 2;
1174
1175 sysfs_unlink_sibling(cursor);
1176
1177 pos = &sd->s_children;
1178 while (n && *pos) {
1179 struct sysfs_dirent *next = *pos;
1180 if (sysfs_type(next))
1181 n--;
1182 pos = &(*pos)->s_sibling;
1183 }
1184
1185 cursor->s_sibling = *pos;
1186 *pos = cursor;
1187 } 948 }
1188 949 if (!pos)
950 filp->f_pos = INT_MAX;
1189 mutex_unlock(&sysfs_mutex); 951 mutex_unlock(&sysfs_mutex);
1190 } 952 }
1191
1192 return offset;
1193}
1194
1195
1196/**
1197 * sysfs_make_shadowed_dir - Setup so a directory can be shadowed
1198 * @kobj: object we're creating shadow of.
1199 */
1200
1201int sysfs_make_shadowed_dir(struct kobject *kobj,
1202 void * (*follow_link)(struct dentry *, struct nameidata *))
1203{
1204 struct dentry *dentry;
1205 struct inode *inode;
1206 struct inode_operations *i_op;
1207
1208 /* get dentry for @kobj->sd, dentry of a shadowed dir is pinned */
1209 dentry = sysfs_get_dentry(kobj->sd);
1210 if (IS_ERR(dentry))
1211 return PTR_ERR(dentry);
1212
1213 inode = dentry->d_inode;
1214 if (inode->i_op != &sysfs_dir_inode_operations) {
1215 dput(dentry);
1216 return -EINVAL;
1217 }
1218
1219 i_op = kmalloc(sizeof(*i_op), GFP_KERNEL);
1220 if (!i_op)
1221 return -ENOMEM;
1222
1223 memcpy(i_op, &sysfs_dir_inode_operations, sizeof(*i_op));
1224 i_op->follow_link = follow_link;
1225
1226 /* Locking of inode->i_op?
1227 * Since setting i_op is a single word write and they
1228 * are atomic we should be ok here.
1229 */
1230 inode->i_op = i_op;
1231 return 0; 953 return 0;
1232} 954}
1233 955
1234/**
1235 * sysfs_create_shadow_dir - create a shadow directory for an object.
1236 * @kobj: object we're creating directory for.
1237 *
1238 * sysfs_make_shadowed_dir must already have been called on this
1239 * directory.
1240 */
1241
1242struct sysfs_dirent *sysfs_create_shadow_dir(struct kobject *kobj)
1243{
1244 struct sysfs_dirent *parent_sd = kobj->sd->s_parent;
1245 struct dentry *dir, *parent, *shadow;
1246 struct inode *inode;
1247 struct sysfs_dirent *sd;
1248 struct sysfs_addrm_cxt acxt;
1249
1250 dir = sysfs_get_dentry(kobj->sd);
1251 if (IS_ERR(dir)) {
1252 sd = (void *)dir;
1253 goto out;
1254 }
1255 parent = dir->d_parent;
1256
1257 inode = dir->d_inode;
1258 sd = ERR_PTR(-EINVAL);
1259 if (!sysfs_is_shadowed_inode(inode))
1260 goto out_dput;
1261
1262 shadow = d_alloc(parent, &dir->d_name);
1263 if (!shadow)
1264 goto nomem;
1265
1266 sd = sysfs_new_dirent("_SHADOW_", inode->i_mode, SYSFS_DIR);
1267 if (!sd)
1268 goto nomem;
1269 sd->s_elem.dir.kobj = kobj;
1270
1271 sysfs_addrm_start(&acxt, parent_sd);
1272
1273 /* add but don't link into children list */
1274 sysfs_add_one(&acxt, sd);
1275
1276 /* attach and instantiate dentry */
1277 sysfs_attach_dentry(sd, shadow);
1278 d_instantiate(shadow, igrab(inode));
1279 inc_nlink(inode); /* tj: synchronization? */
1280
1281 sysfs_addrm_finish(&acxt);
1282
1283 dget(shadow); /* Extra count - pin the dentry in core */
1284
1285 goto out_dput;
1286
1287 nomem:
1288 dput(shadow);
1289 sd = ERR_PTR(-ENOMEM);
1290 out_dput:
1291 dput(dir);
1292 out:
1293 return sd;
1294}
1295
1296/**
1297 * sysfs_remove_shadow_dir - remove an object's directory.
1298 * @shadow_sd: sysfs_dirent of shadow directory
1299 *
1300 * The only thing special about this is that we remove any files in
1301 * the directory before we remove the directory, and we've inlined
1302 * what used to be sysfs_rmdir() below, instead of calling separately.
1303 */
1304
1305void sysfs_remove_shadow_dir(struct sysfs_dirent *shadow_sd)
1306{
1307 __sysfs_remove_dir(shadow_sd);
1308}
1309 956
1310const struct file_operations sysfs_dir_operations = { 957const struct file_operations sysfs_dir_operations = {
1311 .open = sysfs_dir_open,
1312 .release = sysfs_dir_close,
1313 .llseek = sysfs_dir_lseek,
1314 .read = generic_read_dir, 958 .read = generic_read_dir,
1315 .readdir = sysfs_readdir, 959 .readdir = sysfs_readdir,
1316}; 960};
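
The rewritten sysfs_readdir() above drops the per-open cursor dirent: the file position is now simply the s_ino of the next child to report, so a restarted readdir just skips children whose inode number is below f_pos (which presumes the sibling list is kept ordered by s_ino; that code is not part of this hunk). A standalone sketch of the same cursorless, key-as-position walk; the struct layout and names below are illustrative, not kernel code:

#include <stdio.h>

struct node {
	unsigned long key;		/* stable, monotonic key (s_ino in sysfs) */
	const char *name;
	struct node *next;		/* list assumed sorted by key */
};

/*
 * Emit entries starting at *pos.  *pos always names the next entry to
 * emit, so a walk interrupted at any point can be restarted later with
 * the saved value, even if other entries come and go in between.
 */
static void emit_from(const struct node *head, unsigned long *pos)
{
	const struct node *n = head;

	while (n && *pos > n->key)	/* skip what was already emitted */
		n = n->next;

	for (; n; n = n->next) {
		*pos = n->key;
		printf("%lu %s\n", n->key, n->name);
	}
	*pos = (unsigned long)-1;	/* nothing left, like f_pos = INT_MAX */
}

int main(void)
{
	struct node c = { 5, "gamma", NULL };
	struct node b = { 4, "beta",  &c };
	struct node a = { 3, "alpha", &b };
	unsigned long pos = 4;		/* resume with "beta" */

	emit_from(&a, &pos);
	return 0;
}
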
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 3e1cc062a740..d3be1e7fb48b 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -1,15 +1,22 @@
1/* 1/*
2 * file.c - operations for regular (text) files. 2 * fs/sysfs/file.c - sysfs regular (text) file implementation
3 *
4 * Copyright (c) 2001-3 Patrick Mochel
5 * Copyright (c) 2007 SUSE Linux Products GmbH
6 * Copyright (c) 2007 Tejun Heo <teheo@suse.de>
7 *
8 * This file is released under the GPLv2.
9 *
10 * Please see Documentation/filesystems/sysfs.txt for more information.
3 */ 11 */
4 12
5#include <linux/module.h> 13#include <linux/module.h>
6#include <linux/fsnotify.h>
7#include <linux/kobject.h> 14#include <linux/kobject.h>
8#include <linux/namei.h> 15#include <linux/namei.h>
9#include <linux/poll.h> 16#include <linux/poll.h>
10#include <linux/list.h> 17#include <linux/list.h>
18#include <linux/mutex.h>
11#include <asm/uaccess.h> 19#include <asm/uaccess.h>
12#include <asm/semaphore.h>
13 20
14#include "sysfs.h" 21#include "sysfs.h"
15 22
@@ -50,14 +57,33 @@ static struct sysfs_ops subsys_sysfs_ops = {
50 .store = subsys_attr_store, 57 .store = subsys_attr_store,
51}; 58};
52 59
60/*
61 * There's one sysfs_buffer for each open file and one
62 * sysfs_open_dirent for each sysfs_dirent with one or more open
63 * files.
64 *
65 * filp->private_data points to sysfs_buffer and
66 * sysfs_dirent->s_attr.open points to sysfs_open_dirent. s_attr.open
67 * is protected by sysfs_open_dirent_lock.
68 */
69static spinlock_t sysfs_open_dirent_lock = SPIN_LOCK_UNLOCKED;
70
71struct sysfs_open_dirent {
72 atomic_t refcnt;
73 atomic_t event;
74 wait_queue_head_t poll;
75 struct list_head buffers; /* goes through sysfs_buffer.list */
76};
77
53struct sysfs_buffer { 78struct sysfs_buffer {
54 size_t count; 79 size_t count;
55 loff_t pos; 80 loff_t pos;
56 char * page; 81 char * page;
57 struct sysfs_ops * ops; 82 struct sysfs_ops * ops;
58 struct semaphore sem; 83 struct mutex mutex;
59 int needs_read_fill; 84 int needs_read_fill;
60 int event; 85 int event;
86 struct list_head list;
61}; 87};
62 88
63/** 89/**
@@ -74,7 +100,7 @@ struct sysfs_buffer {
74static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer) 100static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer)
75{ 101{
76 struct sysfs_dirent *attr_sd = dentry->d_fsdata; 102 struct sysfs_dirent *attr_sd = dentry->d_fsdata;
77 struct kobject *kobj = attr_sd->s_parent->s_elem.dir.kobj; 103 struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
78 struct sysfs_ops * ops = buffer->ops; 104 struct sysfs_ops * ops = buffer->ops;
79 int ret = 0; 105 int ret = 0;
80 ssize_t count; 106 ssize_t count;
@@ -88,8 +114,8 @@ static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer
88 if (!sysfs_get_active_two(attr_sd)) 114 if (!sysfs_get_active_two(attr_sd))
89 return -ENODEV; 115 return -ENODEV;
90 116
91 buffer->event = atomic_read(&attr_sd->s_event); 117 buffer->event = atomic_read(&attr_sd->s_attr.open->event);
92 count = ops->show(kobj, attr_sd->s_elem.attr.attr, buffer->page); 118 count = ops->show(kobj, attr_sd->s_attr.attr, buffer->page);
93 119
94 sysfs_put_active_two(attr_sd); 120 sysfs_put_active_two(attr_sd);
95 121
@@ -128,7 +154,7 @@ sysfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos)
128 struct sysfs_buffer * buffer = file->private_data; 154 struct sysfs_buffer * buffer = file->private_data;
129 ssize_t retval = 0; 155 ssize_t retval = 0;
130 156
131 down(&buffer->sem); 157 mutex_lock(&buffer->mutex);
132 if (buffer->needs_read_fill) { 158 if (buffer->needs_read_fill) {
133 retval = fill_read_buffer(file->f_path.dentry,buffer); 159 retval = fill_read_buffer(file->f_path.dentry,buffer);
134 if (retval) 160 if (retval)
@@ -139,7 +165,7 @@ sysfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos)
139 retval = simple_read_from_buffer(buf, count, ppos, buffer->page, 165 retval = simple_read_from_buffer(buf, count, ppos, buffer->page,
140 buffer->count); 166 buffer->count);
141out: 167out:
142 up(&buffer->sem); 168 mutex_unlock(&buffer->mutex);
143 return retval; 169 return retval;
144} 170}
145 171
@@ -189,7 +215,7 @@ static int
189flush_write_buffer(struct dentry * dentry, struct sysfs_buffer * buffer, size_t count) 215flush_write_buffer(struct dentry * dentry, struct sysfs_buffer * buffer, size_t count)
190{ 216{
191 struct sysfs_dirent *attr_sd = dentry->d_fsdata; 217 struct sysfs_dirent *attr_sd = dentry->d_fsdata;
192 struct kobject *kobj = attr_sd->s_parent->s_elem.dir.kobj; 218 struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
193 struct sysfs_ops * ops = buffer->ops; 219 struct sysfs_ops * ops = buffer->ops;
194 int rc; 220 int rc;
195 221
@@ -197,7 +223,7 @@ flush_write_buffer(struct dentry * dentry, struct sysfs_buffer * buffer, size_t
197 if (!sysfs_get_active_two(attr_sd)) 223 if (!sysfs_get_active_two(attr_sd))
198 return -ENODEV; 224 return -ENODEV;
199 225
200 rc = ops->store(kobj, attr_sd->s_elem.attr.attr, buffer->page, count); 226 rc = ops->store(kobj, attr_sd->s_attr.attr, buffer->page, count);
201 227
202 sysfs_put_active_two(attr_sd); 228 sysfs_put_active_two(attr_sd);
203 229
@@ -228,20 +254,102 @@ sysfs_write_file(struct file *file, const char __user *buf, size_t count, loff_t
228 struct sysfs_buffer * buffer = file->private_data; 254 struct sysfs_buffer * buffer = file->private_data;
229 ssize_t len; 255 ssize_t len;
230 256
231 down(&buffer->sem); 257 mutex_lock(&buffer->mutex);
232 len = fill_write_buffer(buffer, buf, count); 258 len = fill_write_buffer(buffer, buf, count);
233 if (len > 0) 259 if (len > 0)
234 len = flush_write_buffer(file->f_path.dentry, buffer, len); 260 len = flush_write_buffer(file->f_path.dentry, buffer, len);
235 if (len > 0) 261 if (len > 0)
236 *ppos += len; 262 *ppos += len;
237 up(&buffer->sem); 263 mutex_unlock(&buffer->mutex);
238 return len; 264 return len;
239} 265}
240 266
267/**
268 * sysfs_get_open_dirent - get or create sysfs_open_dirent
269 * @sd: target sysfs_dirent
270 * @buffer: sysfs_buffer for this instance of open
271 *
272 * If @sd->s_attr.open exists, increment its reference count;
273 * otherwise, create one. @buffer is chained to the buffers
274 * list.
275 *
276 * LOCKING:
277 * Kernel thread context (may sleep).
278 *
279 * RETURNS:
280 * 0 on success, -errno on failure.
281 */
282static int sysfs_get_open_dirent(struct sysfs_dirent *sd,
283 struct sysfs_buffer *buffer)
284{
285 struct sysfs_open_dirent *od, *new_od = NULL;
286
287 retry:
288 spin_lock(&sysfs_open_dirent_lock);
289
290 if (!sd->s_attr.open && new_od) {
291 sd->s_attr.open = new_od;
292 new_od = NULL;
293 }
294
295 od = sd->s_attr.open;
296 if (od) {
297 atomic_inc(&od->refcnt);
298 list_add_tail(&buffer->list, &od->buffers);
299 }
300
301 spin_unlock(&sysfs_open_dirent_lock);
302
303 if (od) {
304 kfree(new_od);
305 return 0;
306 }
307
308 /* not there, initialize a new one and retry */
309 new_od = kmalloc(sizeof(*new_od), GFP_KERNEL);
310 if (!new_od)
311 return -ENOMEM;
312
313 atomic_set(&new_od->refcnt, 0);
314 atomic_set(&new_od->event, 1);
315 init_waitqueue_head(&new_od->poll);
316 INIT_LIST_HEAD(&new_od->buffers);
317 goto retry;
318}
319
320/**
321 * sysfs_put_open_dirent - put sysfs_open_dirent
322 * @sd: target sysfs_dirent
323 * @buffer: associated sysfs_buffer
324 *
325 * Put @sd->s_attr.open and unlink @buffer from the buffers list.
326 * If reference count reaches zero, disassociate and free it.
327 *
328 * LOCKING:
329 * None.
330 */
331static void sysfs_put_open_dirent(struct sysfs_dirent *sd,
332 struct sysfs_buffer *buffer)
333{
334 struct sysfs_open_dirent *od = sd->s_attr.open;
335
336 spin_lock(&sysfs_open_dirent_lock);
337
338 list_del(&buffer->list);
339 if (atomic_dec_and_test(&od->refcnt))
340 sd->s_attr.open = NULL;
341 else
342 od = NULL;
343
344 spin_unlock(&sysfs_open_dirent_lock);
345
346 kfree(od);
347}
348
241static int sysfs_open_file(struct inode *inode, struct file *file) 349static int sysfs_open_file(struct inode *inode, struct file *file)
242{ 350{
243 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; 351 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
244 struct kobject *kobj = attr_sd->s_parent->s_elem.dir.kobj; 352 struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
245 struct sysfs_buffer * buffer; 353 struct sysfs_buffer * buffer;
246 struct sysfs_ops * ops = NULL; 354 struct sysfs_ops * ops = NULL;
247 int error; 355 int error;
@@ -294,33 +402,38 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
294 if (!buffer) 402 if (!buffer)
295 goto err_out; 403 goto err_out;
296 404
297 init_MUTEX(&buffer->sem); 405 mutex_init(&buffer->mutex);
298 buffer->needs_read_fill = 1; 406 buffer->needs_read_fill = 1;
299 buffer->ops = ops; 407 buffer->ops = ops;
300 file->private_data = buffer; 408 file->private_data = buffer;
301 409
302 /* open succeeded, put active references and pin attr_sd */ 410 /* make sure we have open dirent struct */
411 error = sysfs_get_open_dirent(attr_sd, buffer);
412 if (error)
413 goto err_free;
414
415 /* open succeeded, put active references */
303 sysfs_put_active_two(attr_sd); 416 sysfs_put_active_two(attr_sd);
304 sysfs_get(attr_sd);
305 return 0; 417 return 0;
306 418
419 err_free:
420 kfree(buffer);
307 err_out: 421 err_out:
308 sysfs_put_active_two(attr_sd); 422 sysfs_put_active_two(attr_sd);
309 return error; 423 return error;
310} 424}
311 425
312static int sysfs_release(struct inode * inode, struct file * filp) 426static int sysfs_release(struct inode *inode, struct file *filp)
313{ 427{
314 struct sysfs_dirent *attr_sd = filp->f_path.dentry->d_fsdata; 428 struct sysfs_dirent *sd = filp->f_path.dentry->d_fsdata;
315 struct sysfs_buffer *buffer = filp->private_data; 429 struct sysfs_buffer *buffer = filp->private_data;
316 430
317 sysfs_put(attr_sd); 431 sysfs_put_open_dirent(sd, buffer);
432
433 if (buffer->page)
434 free_page((unsigned long)buffer->page);
435 kfree(buffer);
318 436
319 if (buffer) {
320 if (buffer->page)
321 free_page((unsigned long)buffer->page);
322 kfree(buffer);
323 }
324 return 0; 437 return 0;
325} 438}
326 439
@@ -335,24 +448,24 @@ static int sysfs_release(struct inode * inode, struct file * filp)
335 * again will not get new data, or reset the state of 'poll'. 448 * again will not get new data, or reset the state of 'poll'.
336 * Reminder: this only works for attributes which actively support 449 * Reminder: this only works for attributes which actively support
337 * it, and it is not possible to test an attribute from userspace 450 * it, and it is not possible to test an attribute from userspace
338 * to see if it supports poll (Nether 'poll' or 'select' return 451 * to see if it supports poll (Neither 'poll' nor 'select' return
339 * an appropriate error code). When in doubt, set a suitable timeout value. 452 * an appropriate error code). When in doubt, set a suitable timeout value.
340 */ 453 */
341static unsigned int sysfs_poll(struct file *filp, poll_table *wait) 454static unsigned int sysfs_poll(struct file *filp, poll_table *wait)
342{ 455{
343 struct sysfs_buffer * buffer = filp->private_data; 456 struct sysfs_buffer * buffer = filp->private_data;
344 struct sysfs_dirent *attr_sd = filp->f_path.dentry->d_fsdata; 457 struct sysfs_dirent *attr_sd = filp->f_path.dentry->d_fsdata;
345 struct kobject *kobj = attr_sd->s_parent->s_elem.dir.kobj; 458 struct sysfs_open_dirent *od = attr_sd->s_attr.open;
346 459
347 /* need parent for the kobj, grab both */ 460 /* need parent for the kobj, grab both */
348 if (!sysfs_get_active_two(attr_sd)) 461 if (!sysfs_get_active_two(attr_sd))
349 goto trigger; 462 goto trigger;
350 463
351 poll_wait(filp, &kobj->poll, wait); 464 poll_wait(filp, &od->poll, wait);
352 465
353 sysfs_put_active_two(attr_sd); 466 sysfs_put_active_two(attr_sd);
354 467
355 if (buffer->event != atomic_read(&attr_sd->s_event)) 468 if (buffer->event != atomic_read(&od->event))
356 goto trigger; 469 goto trigger;
357 470
358 return 0; 471 return 0;
@@ -373,8 +486,17 @@ void sysfs_notify(struct kobject *k, char *dir, char *attr)
373 if (sd && attr) 486 if (sd && attr)
374 sd = sysfs_find_dirent(sd, attr); 487 sd = sysfs_find_dirent(sd, attr);
375 if (sd) { 488 if (sd) {
376 atomic_inc(&sd->s_event); 489 struct sysfs_open_dirent *od;
377 wake_up_interruptible(&k->poll); 490
491 spin_lock(&sysfs_open_dirent_lock);
492
493 od = sd->s_attr.open;
494 if (od) {
495 atomic_inc(&od->event);
496 wake_up_interruptible(&od->poll);
497 }
498
499 spin_unlock(&sysfs_open_dirent_lock);
378 } 500 }
379 501
380 mutex_unlock(&sysfs_mutex); 502 mutex_unlock(&sysfs_mutex);
@@ -397,25 +519,21 @@ int sysfs_add_file(struct sysfs_dirent *dir_sd, const struct attribute *attr,
397 umode_t mode = (attr->mode & S_IALLUGO) | S_IFREG; 519 umode_t mode = (attr->mode & S_IALLUGO) | S_IFREG;
398 struct sysfs_addrm_cxt acxt; 520 struct sysfs_addrm_cxt acxt;
399 struct sysfs_dirent *sd; 521 struct sysfs_dirent *sd;
522 int rc;
400 523
401 sd = sysfs_new_dirent(attr->name, mode, type); 524 sd = sysfs_new_dirent(attr->name, mode, type);
402 if (!sd) 525 if (!sd)
403 return -ENOMEM; 526 return -ENOMEM;
404 sd->s_elem.attr.attr = (void *)attr; 527 sd->s_attr.attr = (void *)attr;
405 528
406 sysfs_addrm_start(&acxt, dir_sd); 529 sysfs_addrm_start(&acxt, dir_sd);
530 rc = sysfs_add_one(&acxt, sd);
531 sysfs_addrm_finish(&acxt);
407 532
408 if (!sysfs_find_dirent(dir_sd, attr->name)) { 533 if (rc)
409 sysfs_add_one(&acxt, sd);
410 sysfs_link_sibling(sd);
411 }
412
413 if (!sysfs_addrm_finish(&acxt)) {
414 sysfs_put(sd); 534 sysfs_put(sd);
415 return -EEXIST;
416 }
417 535
418 return 0; 536 return rc;
419} 537}
420 538
421 539
@@ -457,42 +575,6 @@ int sysfs_add_file_to_group(struct kobject *kobj,
457} 575}
458EXPORT_SYMBOL_GPL(sysfs_add_file_to_group); 576EXPORT_SYMBOL_GPL(sysfs_add_file_to_group);
459 577
460
461/**
462 * sysfs_update_file - update the modified timestamp on an object attribute.
463 * @kobj: object we're acting for.
464 * @attr: attribute descriptor.
465 */
466int sysfs_update_file(struct kobject * kobj, const struct attribute * attr)
467{
468 struct sysfs_dirent *victim_sd = NULL;
469 struct dentry *victim = NULL;
470 int rc;
471
472 rc = -ENOENT;
473 victim_sd = sysfs_get_dirent(kobj->sd, attr->name);
474 if (!victim_sd)
475 goto out;
476
477 victim = sysfs_get_dentry(victim_sd);
478 if (IS_ERR(victim)) {
479 rc = PTR_ERR(victim);
480 victim = NULL;
481 goto out;
482 }
483
484 mutex_lock(&victim->d_inode->i_mutex);
485 victim->d_inode->i_mtime = CURRENT_TIME;
486 fsnotify_modify(victim);
487 mutex_unlock(&victim->d_inode->i_mutex);
488 rc = 0;
489 out:
490 dput(victim);
491 sysfs_put(victim_sd);
492 return rc;
493}
494
495
496/** 578/**
497 * sysfs_chmod_file - update the modified mode value on an object attribute. 579 * sysfs_chmod_file - update the modified mode value on an object attribute.
498 * @kobj: object we're acting for. 580 * @kobj: object we're acting for.
@@ -513,7 +595,9 @@ int sysfs_chmod_file(struct kobject *kobj, struct attribute *attr, mode_t mode)
513 if (!victim_sd) 595 if (!victim_sd)
514 goto out; 596 goto out;
515 597
598 mutex_lock(&sysfs_rename_mutex);
516 victim = sysfs_get_dentry(victim_sd); 599 victim = sysfs_get_dentry(victim_sd);
600 mutex_unlock(&sysfs_rename_mutex);
517 if (IS_ERR(victim)) { 601 if (IS_ERR(victim)) {
518 rc = PTR_ERR(victim); 602 rc = PTR_ERR(victim);
519 victim = NULL; 603 victim = NULL;
@@ -521,10 +605,19 @@ int sysfs_chmod_file(struct kobject *kobj, struct attribute *attr, mode_t mode)
521 } 605 }
522 606
523 inode = victim->d_inode; 607 inode = victim->d_inode;
608
524 mutex_lock(&inode->i_mutex); 609 mutex_lock(&inode->i_mutex);
610
525 newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); 611 newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
526 newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; 612 newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
527 rc = notify_change(victim, &newattrs); 613 rc = notify_change(victim, &newattrs);
614
615 if (rc == 0) {
616 mutex_lock(&sysfs_mutex);
617 victim_sd->s_mode = newattrs.ia_mode;
618 mutex_unlock(&sysfs_mutex);
619 }
620
528 mutex_unlock(&inode->i_mutex); 621 mutex_unlock(&inode->i_mutex);
529 out: 622 out:
530 dput(victim); 623 dput(victim);
@@ -632,4 +725,3 @@ EXPORT_SYMBOL_GPL(sysfs_schedule_callback);
632 725
633EXPORT_SYMBOL_GPL(sysfs_create_file); 726EXPORT_SYMBOL_GPL(sysfs_create_file);
634EXPORT_SYMBOL_GPL(sysfs_remove_file); 727EXPORT_SYMBOL_GPL(sysfs_remove_file);
635EXPORT_SYMBOL_GPL(sysfs_update_file);
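
sysfs_poll() now waits on the per-attribute sysfs_open_dirent rather than on the kobject, but the user-space contract spelled out in the comment above is unchanged: after poll() fires you must reopen the attribute or seek back to 0 and read again. A minimal consumer sketch under that contract; the attribute path is hypothetical, and POLLPRI|POLLERR is the conventional event mask for sysfs notifications:

#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[64];
	struct pollfd pfd;
	ssize_t n;

	/* Hypothetical attribute; any file whose driver calls sysfs_notify() works. */
	pfd.fd = open("/sys/class/example/attr", O_RDONLY);
	if (pfd.fd < 0)
		return 1;
	pfd.events = POLLPRI | POLLERR;

	n = read(pfd.fd, buf, sizeof(buf));	/* arm: consume the current value */
	if (poll(&pfd, 1, -1) > 0) {		/* sleeps until sysfs_notify() */
		lseek(pfd.fd, 0, SEEK_SET);	/* per the comment: seek (or reopen), then re-read */
		n = read(pfd.fd, buf, sizeof(buf));
		if (n > 0)
			fwrite(buf, 1, n, stdout);
	}
	close(pfd.fd);
	return 0;
}
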
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index f318b73c790c..d1972374655a 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -13,8 +13,6 @@
13#include <linux/dcache.h> 13#include <linux/dcache.h>
14#include <linux/namei.h> 14#include <linux/namei.h>
15#include <linux/err.h> 15#include <linux/err.h>
16#include <linux/fs.h>
17#include <asm/semaphore.h>
18#include "sysfs.h" 16#include "sysfs.h"
19 17
20 18
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 10d1b52899f1..9236635111f4 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -1,7 +1,11 @@
1/* 1/*
2 * inode.c - basic inode and dentry operations. 2 * fs/sysfs/inode.c - basic sysfs inode and dentry operations
3 * 3 *
4 * sysfs is Copyright (c) 2001-3 Patrick Mochel 4 * Copyright (c) 2001-3 Patrick Mochel
5 * Copyright (c) 2007 SUSE Linux Products GmbH
6 * Copyright (c) 2007 Tejun Heo <teheo@suse.de>
7 *
8 * This file is released under the GPLv2.
5 * 9 *
6 * Please see Documentation/filesystems/sysfs.txt for more information. 10 * Please see Documentation/filesystems/sysfs.txt for more information.
7 */ 11 */
@@ -14,7 +18,6 @@
14#include <linux/capability.h> 18#include <linux/capability.h>
15#include <linux/errno.h> 19#include <linux/errno.h>
16#include <linux/sched.h> 20#include <linux/sched.h>
17#include <asm/semaphore.h>
18#include "sysfs.h" 21#include "sysfs.h"
19 22
20extern struct super_block * sysfs_sb; 23extern struct super_block * sysfs_sb;
@@ -34,16 +37,6 @@ static const struct inode_operations sysfs_inode_operations ={
34 .setattr = sysfs_setattr, 37 .setattr = sysfs_setattr,
35}; 38};
36 39
37void sysfs_delete_inode(struct inode *inode)
38{
39 /* Free the shadowed directory inode operations */
40 if (sysfs_is_shadowed_inode(inode)) {
41 kfree(inode->i_op);
42 inode->i_op = NULL;
43 }
44 return generic_delete_inode(inode);
45}
46
47int sysfs_setattr(struct dentry * dentry, struct iattr * iattr) 40int sysfs_setattr(struct dentry * dentry, struct iattr * iattr)
48{ 41{
49 struct inode * inode = dentry->d_inode; 42 struct inode * inode = dentry->d_inode;
@@ -133,8 +126,22 @@ static inline void set_inode_attr(struct inode * inode, struct iattr * iattr)
133 */ 126 */
134static struct lock_class_key sysfs_inode_imutex_key; 127static struct lock_class_key sysfs_inode_imutex_key;
135 128
129static int sysfs_count_nlink(struct sysfs_dirent *sd)
130{
131 struct sysfs_dirent *child;
132 int nr = 0;
133
134 for (child = sd->s_dir.children; child; child = child->s_sibling)
135 if (sysfs_type(child) == SYSFS_DIR)
136 nr++;
137
138 return nr + 2;
139}
140
136static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode) 141static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)
137{ 142{
143 struct bin_attribute *bin_attr;
144
138 inode->i_blocks = 0; 145 inode->i_blocks = 0;
139 inode->i_mapping->a_ops = &sysfs_aops; 146 inode->i_mapping->a_ops = &sysfs_aops;
140 inode->i_mapping->backing_dev_info = &sysfs_backing_dev_info; 147 inode->i_mapping->backing_dev_info = &sysfs_backing_dev_info;
@@ -150,6 +157,32 @@ static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)
150 set_inode_attr(inode, sd->s_iattr); 157 set_inode_attr(inode, sd->s_iattr);
151 } else 158 } else
152 set_default_inode_attr(inode, sd->s_mode); 159 set_default_inode_attr(inode, sd->s_mode);
160
161
162 /* initialize inode according to type */
163 switch (sysfs_type(sd)) {
164 case SYSFS_DIR:
165 inode->i_op = &sysfs_dir_inode_operations;
166 inode->i_fop = &sysfs_dir_operations;
167 inode->i_nlink = sysfs_count_nlink(sd);
168 break;
169 case SYSFS_KOBJ_ATTR:
170 inode->i_size = PAGE_SIZE;
171 inode->i_fop = &sysfs_file_operations;
172 break;
173 case SYSFS_KOBJ_BIN_ATTR:
174 bin_attr = sd->s_bin_attr.bin_attr;
175 inode->i_size = bin_attr->size;
176 inode->i_fop = &bin_fops;
177 break;
178 case SYSFS_KOBJ_LINK:
179 inode->i_op = &sysfs_symlink_inode_operations;
180 break;
181 default:
182 BUG();
183 }
184
185 unlock_new_inode(inode);
153} 186}
154 187
155/** 188/**
@@ -177,50 +210,24 @@ struct inode * sysfs_get_inode(struct sysfs_dirent *sd)
177 return inode; 210 return inode;
178} 211}
179 212
180/**
181 * sysfs_instantiate - instantiate dentry
182 * @dentry: dentry to be instantiated
183 * @inode: inode associated with @sd
184 *
185 * Unlock @inode if locked and instantiate @dentry with @inode.
186 *
187 * LOCKING:
188 * None.
189 */
190void sysfs_instantiate(struct dentry *dentry, struct inode *inode)
191{
192 BUG_ON(!dentry || dentry->d_inode);
193
194 if (inode->i_state & I_NEW)
195 unlock_new_inode(inode);
196
197 d_instantiate(dentry, inode);
198}
199
200int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name) 213int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name)
201{ 214{
202 struct sysfs_addrm_cxt acxt; 215 struct sysfs_addrm_cxt acxt;
203 struct sysfs_dirent **pos, *sd; 216 struct sysfs_dirent *sd;
204 217
205 if (!dir_sd) 218 if (!dir_sd)
206 return -ENOENT; 219 return -ENOENT;
207 220
208 sysfs_addrm_start(&acxt, dir_sd); 221 sysfs_addrm_start(&acxt, dir_sd);
209 222
210 for (pos = &dir_sd->s_children; *pos; pos = &(*pos)->s_sibling) { 223 sd = sysfs_find_dirent(dir_sd, name);
211 sd = *pos; 224 if (sd)
212 225 sysfs_remove_one(&acxt, sd);
213 if (!sysfs_type(sd)) 226
214 continue; 227 sysfs_addrm_finish(&acxt);
215 if (!strcmp(sd->s_name, name)) {
216 *pos = sd->s_sibling;
217 sd->s_sibling = NULL;
218 sysfs_remove_one(&acxt, sd);
219 break;
220 }
221 }
222 228
223 if (sysfs_addrm_finish(&acxt)) 229 if (sd)
224 return 0; 230 return 0;
225 return -ENOENT; 231 else
232 return -ENOENT;
226} 233}
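
sysfs_init_inode() now derives a directory's link count on demand with sysfs_count_nlink() instead of adjusting it incrementally: one link for the entry in the parent, one for ".", plus one per child directory for its "..". A tiny standalone illustration of that rule; the types and names are made up for the example:

#include <stdio.h>

struct child {
	int is_dir;
	struct child *sibling;
};

/* nlink of a directory: two for "." and its own entry, plus one per subdir. */
static int count_nlink(const struct child *children)
{
	int nr = 2;

	for (; children; children = children->sibling)
		if (children->is_dir)
			nr++;
	return nr;
}

int main(void)
{
	struct child file = { 0, NULL };
	struct child dir  = { 1, &file };	/* one subdirectory, one file */

	printf("nlink = %d\n", count_nlink(&dir));	/* prints 3 */
	return 0;
}
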
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index fbc7b65fe262..c76c540be3c8 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -1,5 +1,13 @@
1/* 1/*
2 * mount.c - operations for initializing and mounting sysfs. 2 * fs/sysfs/mount.c - operations for initializing and mounting sysfs
3 *
4 * Copyright (c) 2001-3 Patrick Mochel
5 * Copyright (c) 2007 SUSE Linux Products GmbH
6 * Copyright (c) 2007 Tejun Heo <teheo@suse.de>
7 *
8 * This file is released under the GPLv2.
9 *
10 * Please see Documentation/filesystems/sysfs.txt for more information.
3 */ 11 */
4 12
5#define DEBUG 13#define DEBUG
@@ -8,25 +16,25 @@
8#include <linux/mount.h> 16#include <linux/mount.h>
9#include <linux/pagemap.h> 17#include <linux/pagemap.h>
10#include <linux/init.h> 18#include <linux/init.h>
11#include <asm/semaphore.h>
12 19
13#include "sysfs.h" 20#include "sysfs.h"
14 21
15/* Random magic number */ 22/* Random magic number */
16#define SYSFS_MAGIC 0x62656572 23#define SYSFS_MAGIC 0x62656572
17 24
18struct vfsmount *sysfs_mount; 25static struct vfsmount *sysfs_mount;
19struct super_block * sysfs_sb = NULL; 26struct super_block * sysfs_sb = NULL;
20struct kmem_cache *sysfs_dir_cachep; 27struct kmem_cache *sysfs_dir_cachep;
21 28
22static const struct super_operations sysfs_ops = { 29static const struct super_operations sysfs_ops = {
23 .statfs = simple_statfs, 30 .statfs = simple_statfs,
24 .drop_inode = sysfs_delete_inode, 31 .drop_inode = generic_delete_inode,
25}; 32};
26 33
27struct sysfs_dirent sysfs_root = { 34struct sysfs_dirent sysfs_root = {
35 .s_name = "",
28 .s_count = ATOMIC_INIT(1), 36 .s_count = ATOMIC_INIT(1),
29 .s_flags = SYSFS_ROOT, 37 .s_flags = SYSFS_DIR,
30 .s_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO, 38 .s_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO,
31 .s_ino = 1, 39 .s_ino = 1,
32}; 40};
@@ -50,11 +58,6 @@ static int sysfs_fill_super(struct super_block *sb, void *data, int silent)
50 return -ENOMEM; 58 return -ENOMEM;
51 } 59 }
52 60
53 inode->i_op = &sysfs_dir_inode_operations;
54 inode->i_fop = &sysfs_dir_operations;
55 inc_nlink(inode); /* directory, account for "." */
56 unlock_new_inode(inode);
57
58 /* instantiate and link root dentry */ 61 /* instantiate and link root dentry */
59 root = d_alloc_root(inode); 62 root = d_alloc_root(inode);
60 if (!root) { 63 if (!root) {
@@ -62,7 +65,6 @@ static int sysfs_fill_super(struct super_block *sb, void *data, int silent)
62 iput(inode); 65 iput(inode);
63 return -ENOMEM; 66 return -ENOMEM;
64 } 67 }
65 sysfs_root.s_dentry = root;
66 root->d_fsdata = &sysfs_root; 68 root->d_fsdata = &sysfs_root;
67 sb->s_root = root; 69 sb->s_root = root;
68 return 0; 70 return 0;
@@ -77,7 +79,7 @@ static int sysfs_get_sb(struct file_system_type *fs_type,
77static struct file_system_type sysfs_fs_type = { 79static struct file_system_type sysfs_fs_type = {
78 .name = "sysfs", 80 .name = "sysfs",
79 .get_sb = sysfs_get_sb, 81 .get_sb = sysfs_get_sb,
80 .kill_sb = kill_litter_super, 82 .kill_sb = kill_anon_super,
81}; 83};
82 84
83int __init sysfs_init(void) 85int __init sysfs_init(void)
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index 4ce687f0b5d0..3eac20c63c41 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -1,5 +1,13 @@
1/* 1/*
2 * symlink.c - operations for sysfs symlinks. 2 * fs/sysfs/symlink.c - sysfs symlink implementation
3 *
4 * Copyright (c) 2001-3 Patrick Mochel
5 * Copyright (c) 2007 SUSE Linux Products GmbH
6 * Copyright (c) 2007 Tejun Heo <teheo@suse.de>
7 *
8 * This file is released under the GPLv2.
9 *
10 * Please see Documentation/filesystems/sysfs.txt for more information.
3 */ 11 */
4 12
5#include <linux/fs.h> 13#include <linux/fs.h>
@@ -7,7 +15,7 @@
7#include <linux/module.h> 15#include <linux/module.h>
8#include <linux/kobject.h> 16#include <linux/kobject.h>
9#include <linux/namei.h> 17#include <linux/namei.h>
10#include <asm/semaphore.h> 18#include <linux/mutex.h>
11 19
12#include "sysfs.h" 20#include "sysfs.h"
13 21
@@ -60,10 +68,9 @@ int sysfs_create_link(struct kobject * kobj, struct kobject * target, const char
60 68
61 BUG_ON(!name); 69 BUG_ON(!name);
62 70
63 if (!kobj) { 71 if (!kobj)
64 if (sysfs_mount && sysfs_mount->mnt_sb) 72 parent_sd = &sysfs_root;
65 parent_sd = sysfs_mount->mnt_sb->s_root->d_fsdata; 73 else
66 } else
67 parent_sd = kobj->sd; 74 parent_sd = kobj->sd;
68 75
69 error = -EFAULT; 76 error = -EFAULT;
@@ -87,20 +94,15 @@ int sysfs_create_link(struct kobject * kobj, struct kobject * target, const char
87 if (!sd) 94 if (!sd)
88 goto out_put; 95 goto out_put;
89 96
90 sd->s_elem.symlink.target_sd = target_sd; 97 sd->s_symlink.target_sd = target_sd;
91 target_sd = NULL; /* reference is now owned by the symlink */ 98 target_sd = NULL; /* reference is now owned by the symlink */
92 99
93 sysfs_addrm_start(&acxt, parent_sd); 100 sysfs_addrm_start(&acxt, parent_sd);
101 error = sysfs_add_one(&acxt, sd);
102 sysfs_addrm_finish(&acxt);
94 103
95 if (!sysfs_find_dirent(parent_sd, name)) { 104 if (error)
96 sysfs_add_one(&acxt, sd);
97 sysfs_link_sibling(sd);
98 }
99
100 if (!sysfs_addrm_finish(&acxt)) {
101 error = -EEXIST;
102 goto out_put; 105 goto out_put;
103 }
104 106
105 return 0; 107 return 0;
106 108
@@ -148,7 +150,7 @@ static int sysfs_getlink(struct dentry *dentry, char * path)
148{ 150{
149 struct sysfs_dirent *sd = dentry->d_fsdata; 151 struct sysfs_dirent *sd = dentry->d_fsdata;
150 struct sysfs_dirent *parent_sd = sd->s_parent; 152 struct sysfs_dirent *parent_sd = sd->s_parent;
151 struct sysfs_dirent *target_sd = sd->s_elem.symlink.target_sd; 153 struct sysfs_dirent *target_sd = sd->s_symlink.target_sd;
152 int error; 154 int error;
153 155
154 mutex_lock(&sysfs_mutex); 156 mutex_lock(&sysfs_mutex);
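
sysfs_create_link() now takes the same shape as sysfs_add_file() earlier in this series: build the dirent, push it through sysfs_addrm_start()/sysfs_add_one()/sysfs_addrm_finish(), and let sysfs_add_one()'s return value (for example -EEXIST on a duplicate name) drive the error path instead of probing with sysfs_find_dirent() first. A generic standalone sketch of that approach, where the insert itself rejects duplicates and reports the verdict through its return value; the names are invented, not sysfs API:

#include <errno.h>
#include <stdio.h>
#include <string.h>

struct entry {
	const char *name;
	struct entry *next;
};

/* Insert e into *head unless the name is already present; -EEXIST if it is. */
static int insert_unique(struct entry **head, struct entry *e)
{
	struct entry *it;

	for (it = *head; it; it = it->next)
		if (!strcmp(it->name, e->name))
			return -EEXIST;

	e->next = *head;
	*head = e;
	return 0;
}

int main(void)
{
	struct entry *head = NULL;
	struct entry a = { "power", NULL }, b = { "power", NULL };
	int rc1 = insert_unique(&head, &a);
	int rc2 = insert_unique(&head, &b);

	printf("%d %d\n", rc1, rc2);	/* 0 -17 (-EEXIST) */
	return 0;
}
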
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 6b8c8d76d308..f0326f281d1c 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -1,20 +1,39 @@
1/*
2 * fs/sysfs/sysfs.h - sysfs internal header file
3 *
4 * Copyright (c) 2001-3 Patrick Mochel
5 * Copyright (c) 2007 SUSE Linux Products GmbH
6 * Copyright (c) 2007 Tejun Heo <teheo@suse.de>
7 *
8 * This file is released under the GPLv2.
9 */
10
11struct sysfs_open_dirent;
12
13/* type-specific structures for sysfs_dirent->s_* union members */
1struct sysfs_elem_dir { 14struct sysfs_elem_dir {
2 struct kobject * kobj; 15 struct kobject *kobj;
16 /* children list starts here and goes through sd->s_sibling */
17 struct sysfs_dirent *children;
3}; 18};
4 19
5struct sysfs_elem_symlink { 20struct sysfs_elem_symlink {
6 struct sysfs_dirent * target_sd; 21 struct sysfs_dirent *target_sd;
7}; 22};
8 23
9struct sysfs_elem_attr { 24struct sysfs_elem_attr {
10 struct attribute * attr; 25 struct attribute *attr;
26 struct sysfs_open_dirent *open;
11}; 27};
12 28
13struct sysfs_elem_bin_attr { 29struct sysfs_elem_bin_attr {
14 struct bin_attribute * bin_attr; 30 struct bin_attribute *bin_attr;
15}; 31};
16 32
17/* 33/*
34 * sysfs_dirent - the building block of sysfs hierarchy. Each and
35 * every sysfs node is represented by a single sysfs_dirent.
36 *
18 * As long as s_count reference is held, the sysfs_dirent itself is 37 * As long as s_count reference is held, the sysfs_dirent itself is
19 * accessible. Dereferencing s_elem or any other outer entity 38 * accessible. Dereferencing s_elem or any other outer entity
20 * requires s_active reference. 39 * requires s_active reference.
@@ -22,28 +41,43 @@ struct sysfs_elem_bin_attr {
22struct sysfs_dirent { 41struct sysfs_dirent {
23 atomic_t s_count; 42 atomic_t s_count;
24 atomic_t s_active; 43 atomic_t s_active;
25 struct sysfs_dirent * s_parent; 44 struct sysfs_dirent *s_parent;
26 struct sysfs_dirent * s_sibling; 45 struct sysfs_dirent *s_sibling;
27 struct sysfs_dirent * s_children; 46 const char *s_name;
28 const char * s_name;
29 47
30 union { 48 union {
31 struct sysfs_elem_dir dir; 49 struct sysfs_elem_dir s_dir;
32 struct sysfs_elem_symlink symlink; 50 struct sysfs_elem_symlink s_symlink;
33 struct sysfs_elem_attr attr; 51 struct sysfs_elem_attr s_attr;
34 struct sysfs_elem_bin_attr bin_attr; 52 struct sysfs_elem_bin_attr s_bin_attr;
35 } s_elem; 53 };
36 54
37 unsigned int s_flags; 55 unsigned int s_flags;
38 umode_t s_mode;
39 ino_t s_ino; 56 ino_t s_ino;
40 struct dentry * s_dentry; 57 umode_t s_mode;
41 struct iattr * s_iattr; 58 struct iattr *s_iattr;
42 atomic_t s_event;
43}; 59};
44 60
45#define SD_DEACTIVATED_BIAS INT_MIN 61#define SD_DEACTIVATED_BIAS INT_MIN
62
63#define SYSFS_TYPE_MASK 0x00ff
64#define SYSFS_DIR 0x0001
65#define SYSFS_KOBJ_ATTR 0x0002
66#define SYSFS_KOBJ_BIN_ATTR 0x0004
67#define SYSFS_KOBJ_LINK 0x0008
68#define SYSFS_COPY_NAME (SYSFS_DIR | SYSFS_KOBJ_LINK)
69
70#define SYSFS_FLAG_MASK ~SYSFS_TYPE_MASK
71#define SYSFS_FLAG_REMOVED 0x0200
72
73static inline unsigned int sysfs_type(struct sysfs_dirent *sd)
74{
75 return sd->s_flags & SYSFS_TYPE_MASK;
76}
46 77
78/*
79 * Context structure to be used while adding/removing nodes.
80 */
47struct sysfs_addrm_cxt { 81struct sysfs_addrm_cxt {
48 struct sysfs_dirent *parent_sd; 82 struct sysfs_dirent *parent_sd;
49 struct inode *parent_inode; 83 struct inode *parent_inode;
@@ -51,63 +85,47 @@ struct sysfs_addrm_cxt {
51 int cnt; 85 int cnt;
52}; 86};
53 87
54extern struct vfsmount * sysfs_mount; 88/*
89 * mount.c
90 */
55extern struct sysfs_dirent sysfs_root; 91extern struct sysfs_dirent sysfs_root;
92extern struct super_block *sysfs_sb;
56extern struct kmem_cache *sysfs_dir_cachep; 93extern struct kmem_cache *sysfs_dir_cachep;
57 94
58extern struct dentry *sysfs_get_dentry(struct sysfs_dirent *sd); 95/*
59extern void sysfs_link_sibling(struct sysfs_dirent *sd); 96 * dir.c
60extern void sysfs_unlink_sibling(struct sysfs_dirent *sd); 97 */
61extern struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd);
62extern void sysfs_put_active(struct sysfs_dirent *sd);
63extern struct sysfs_dirent *sysfs_get_active_two(struct sysfs_dirent *sd);
64extern void sysfs_put_active_two(struct sysfs_dirent *sd);
65extern void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt,
66 struct sysfs_dirent *parent_sd);
67extern void sysfs_add_one(struct sysfs_addrm_cxt *acxt,
68 struct sysfs_dirent *sd);
69extern void sysfs_remove_one(struct sysfs_addrm_cxt *acxt,
70 struct sysfs_dirent *sd);
71extern int sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt);
72
73extern void sysfs_delete_inode(struct inode *inode);
74extern struct inode * sysfs_get_inode(struct sysfs_dirent *sd);
75extern void sysfs_instantiate(struct dentry *dentry, struct inode *inode);
76
77extern void release_sysfs_dirent(struct sysfs_dirent * sd);
78extern struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
79 const unsigned char *name);
80extern struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd,
81 const unsigned char *name);
82extern struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode,
83 int type);
84
85extern int sysfs_add_file(struct sysfs_dirent *dir_sd,
86 const struct attribute *attr, int type);
87extern int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name);
88extern struct sysfs_dirent *sysfs_find(struct sysfs_dirent *dir, const char * name);
89
90extern int sysfs_create_subdir(struct kobject *kobj, const char *name,
91 struct sysfs_dirent **p_sd);
92extern void sysfs_remove_subdir(struct sysfs_dirent *sd);
93
94extern int sysfs_setattr(struct dentry *dentry, struct iattr *iattr);
95
96extern spinlock_t sysfs_assoc_lock;
97extern struct mutex sysfs_mutex; 98extern struct mutex sysfs_mutex;
98extern struct super_block * sysfs_sb; 99extern struct mutex sysfs_rename_mutex;
100extern spinlock_t sysfs_assoc_lock;
101
99extern const struct file_operations sysfs_dir_operations; 102extern const struct file_operations sysfs_dir_operations;
100extern const struct file_operations sysfs_file_operations;
101extern const struct file_operations bin_fops;
102extern const struct inode_operations sysfs_dir_inode_operations; 103extern const struct inode_operations sysfs_dir_inode_operations;
103extern const struct inode_operations sysfs_symlink_inode_operations;
104
105static inline unsigned int sysfs_type(struct sysfs_dirent *sd)
106{
107 return sd->s_flags & SYSFS_TYPE_MASK;
108}
109 104
110static inline struct sysfs_dirent * sysfs_get(struct sysfs_dirent * sd) 105struct dentry *sysfs_get_dentry(struct sysfs_dirent *sd);
106struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd);
107void sysfs_put_active(struct sysfs_dirent *sd);
108struct sysfs_dirent *sysfs_get_active_two(struct sysfs_dirent *sd);
109void sysfs_put_active_two(struct sysfs_dirent *sd);
110void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt,
111 struct sysfs_dirent *parent_sd);
112int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd);
113void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd);
114void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt);
115
116struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
117 const unsigned char *name);
118struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd,
119 const unsigned char *name);
120struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type);
121
122void release_sysfs_dirent(struct sysfs_dirent *sd);
123
124int sysfs_create_subdir(struct kobject *kobj, const char *name,
125 struct sysfs_dirent **p_sd);
126void sysfs_remove_subdir(struct sysfs_dirent *sd);
127
128static inline struct sysfs_dirent *sysfs_get(struct sysfs_dirent *sd)
111{ 129{
112 if (sd) { 130 if (sd) {
113 WARN_ON(!atomic_read(&sd->s_count)); 131 WARN_ON(!atomic_read(&sd->s_count));
@@ -116,13 +134,33 @@ static inline struct sysfs_dirent * sysfs_get(struct sysfs_dirent * sd)
116 return sd; 134 return sd;
117} 135}
118 136
119static inline void sysfs_put(struct sysfs_dirent * sd) 137static inline void sysfs_put(struct sysfs_dirent *sd)
120{ 138{
121 if (sd && atomic_dec_and_test(&sd->s_count)) 139 if (sd && atomic_dec_and_test(&sd->s_count))
122 release_sysfs_dirent(sd); 140 release_sysfs_dirent(sd);
123} 141}
124 142
125static inline int sysfs_is_shadowed_inode(struct inode *inode) 143/*
126{ 144 * inode.c
127 return S_ISDIR(inode->i_mode) && inode->i_op->follow_link; 145 */
128} 146struct inode *sysfs_get_inode(struct sysfs_dirent *sd);
147int sysfs_setattr(struct dentry *dentry, struct iattr *iattr);
148int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name);
149
150/*
151 * file.c
152 */
153extern const struct file_operations sysfs_file_operations;
154
155int sysfs_add_file(struct sysfs_dirent *dir_sd,
156 const struct attribute *attr, int type);
157
158/*
159 * bin.c
160 */
161extern const struct file_operations bin_fops;
162
163/*
164 * symlink.c
165 */
166extern const struct inode_operations sysfs_symlink_inode_operations;
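
The reworked header folds the node type into the low byte of s_flags (SYSFS_TYPE_MASK) and keeps state bits such as SYSFS_FLAG_REMOVED above it, with sysfs_type() masking the type back out. A compact standalone sketch of that packing; the constants mirror the ones above, while the surrounding names are invented for the example:

#include <stdio.h>

#define TYPE_MASK	0x00ff
#define TYPE_DIR	0x0001
#define TYPE_ATTR	0x0002
#define FLAG_REMOVED	0x0200		/* lives above the type byte */

static unsigned int node_type(unsigned int flags)
{
	return flags & TYPE_MASK;
}

int main(void)
{
	unsigned int flags = TYPE_DIR;

	flags |= FLAG_REMOVED;		/* mark removed; the type bits survive */
	printf("type=%#x removed=%d\n",
	       node_type(flags), !!(flags & FLAG_REMOVED));
	return 0;
}
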
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 5f152f60d74d..6f4c29e9c3d9 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -323,17 +323,13 @@ xfs_iomap_valid(
323/* 323/*
324 * BIO completion handler for buffered IO. 324 * BIO completion handler for buffered IO.
325 */ 325 */
326STATIC int 326STATIC void
327xfs_end_bio( 327xfs_end_bio(
328 struct bio *bio, 328 struct bio *bio,
329 unsigned int bytes_done,
330 int error) 329 int error)
331{ 330{
332 xfs_ioend_t *ioend = bio->bi_private; 331 xfs_ioend_t *ioend = bio->bi_private;
333 332
334 if (bio->bi_size)
335 return 1;
336
337 ASSERT(atomic_read(&bio->bi_cnt) >= 1); 333 ASSERT(atomic_read(&bio->bi_cnt) >= 1);
338 ioend->io_error = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : error; 334 ioend->io_error = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : error;
339 335
@@ -343,7 +339,6 @@ xfs_end_bio(
343 bio_put(bio); 339 bio_put(bio);
344 340
345 xfs_finish_ioend(ioend, 0); 341 xfs_finish_ioend(ioend, 0);
346 return 0;
347} 342}
348 343
349STATIC void 344STATIC void
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index b0f0e58866de..39f44ee572e8 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -1103,19 +1103,15 @@ _xfs_buf_ioend(
1103 } 1103 }
1104} 1104}
1105 1105
1106STATIC int 1106STATIC void
1107xfs_buf_bio_end_io( 1107xfs_buf_bio_end_io(
1108 struct bio *bio, 1108 struct bio *bio,
1109 unsigned int bytes_done,
1110 int error) 1109 int error)
1111{ 1110{
1112 xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private; 1111 xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private;
1113 unsigned int blocksize = bp->b_target->bt_bsize; 1112 unsigned int blocksize = bp->b_target->bt_bsize;
1114 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 1113 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1115 1114
1116 if (bio->bi_size)
1117 return 1;
1118
1119 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) 1115 if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
1120 bp->b_error = EIO; 1116 bp->b_error = EIO;
1121 1117
@@ -1143,7 +1139,6 @@ xfs_buf_bio_end_io(
1143 1139
1144 _xfs_buf_ioend(bp, 1); 1140 _xfs_buf_ioend(bp, 1);
1145 bio_put(bio); 1141 bio_put(bio);
1146 return 0;
1147} 1142}
1148 1143
1149STATIC void 1144STATIC void
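
The two XFS hunks adapt to the block layer's reworked bio completion: ->bi_end_io() is now called once for the whole bio and returns void, so the old "if (bio->bi_size) return 1;" partial-completion check and the int return go away. A hedged sketch of a handler written against that convention; this is illustrative only, not XFS code:

#include <linux/bio.h>
#include <linux/completion.h>
#include <linux/kernel.h>

/* Completion handler in the new style: void return, invoked once per bio. */
static void example_end_io(struct bio *bio, int error)
{
	struct completion *done = bio->bi_private;

	if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
		printk(KERN_ERR "example: I/O error %d\n", error);

	bio_put(bio);
	complete(done);		/* wake the submitter waiting on this bio */
}
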