aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/Makefile6
-rw-r--r--fs/binfmt_elf.c14
-rw-r--r--fs/bio-integrity.c5
-rw-r--r--fs/bio.c11
-rw-r--r--fs/btrfs/Kconfig13
-rw-r--r--fs/btrfs/async-thread.c61
-rw-r--r--fs/btrfs/btrfs_inode.h8
-rw-r--r--fs/btrfs/compression.c1
-rw-r--r--fs/btrfs/ctree.c325
-rw-r--r--fs/btrfs/ctree.h77
-rw-r--r--fs/btrfs/disk-io.c170
-rw-r--r--fs/btrfs/disk-io.h12
-rw-r--r--fs/btrfs/extent-tree.c775
-rw-r--r--fs/btrfs/extent_io.c134
-rw-r--r--fs/btrfs/extent_io.h18
-rw-r--r--fs/btrfs/extent_map.c1
-rw-r--r--fs/btrfs/file.c29
-rw-r--r--fs/btrfs/inode-map.c1
-rw-r--r--fs/btrfs/inode.c142
-rw-r--r--fs/btrfs/ioctl.c7
-rw-r--r--fs/btrfs/locking.c207
-rw-r--r--fs/btrfs/locking.h8
-rw-r--r--fs/btrfs/ordered-data.c4
-rw-r--r--fs/btrfs/ref-cache.c1
-rw-r--r--fs/btrfs/ref-cache.h1
-rw-r--r--fs/btrfs/super.c11
-rw-r--r--fs/btrfs/transaction.c6
-rw-r--r--fs/btrfs/tree-defrag.c1
-rw-r--r--fs/btrfs/tree-log.c356
-rw-r--r--fs/btrfs/volumes.c55
-rw-r--r--fs/btrfs/xattr.c48
-rw-r--r--fs/btrfs/xattr.h2
-rw-r--r--fs/buffer.c5
-rw-r--r--fs/cifs/CHANGES15
-rw-r--r--fs/cifs/cifsfs.h2
-rw-r--r--fs/cifs/cifsglob.h6
-rw-r--r--fs/cifs/cifsproto.h4
-rw-r--r--fs/cifs/cifssmb.c7
-rw-r--r--fs/cifs/connect.c51
-rw-r--r--fs/cifs/dir.c307
-rw-r--r--fs/cifs/inode.c104
-rw-r--r--fs/cifs/readdir.c58
-rw-r--r--fs/cifs/sess.c91
-rw-r--r--fs/compat.c2
-rw-r--r--fs/compat_ioctl.c7
-rw-r--r--fs/configfs/dir.c59
-rw-r--r--fs/dcache.c2
-rw-r--r--fs/devpts/inode.c5
-rw-r--r--fs/ecryptfs/crypto.c6
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h3
-rw-r--r--fs/ecryptfs/keystore.c3
-rw-r--r--fs/ecryptfs/main.c5
-rw-r--r--fs/exec.c28
-rw-r--r--fs/ext2/super.c9
-rw-r--r--fs/ext3/super.c11
-rw-r--r--fs/ext4/balloc.c4
-rw-r--r--fs/ext4/ext4.h2
-rw-r--r--fs/ext4/ialloc.c15
-rw-r--r--fs/ext4/inode.c38
-rw-r--r--fs/ext4/mballoc.c32
-rw-r--r--fs/ext4/migrate.c8
-rw-r--r--fs/ext4/super.c12
-rw-r--r--fs/fat/inode.c4
-rw-r--r--fs/fs-writeback.c9
-rw-r--r--fs/hugetlbfs/inode.c8
-rw-r--r--fs/inode.c7
-rw-r--r--fs/internal.h2
-rw-r--r--fs/jbd/journal.c17
-rw-r--r--fs/jbd2/journal.c17
-rw-r--r--fs/jbd2/transaction.c42
-rw-r--r--fs/jffs2/background.c18
-rw-r--r--fs/jffs2/readinode.c42
-rw-r--r--fs/lockd/clntlock.c51
-rw-r--r--fs/lockd/svclock.c6
-rw-r--r--fs/namespace.c6
-rw-r--r--fs/nfs/client.c73
-rw-r--r--fs/nfs/dir.c8
-rw-r--r--fs/nfs/nfs3acl.c27
-rw-r--r--fs/nfs/nfs3xdr.c34
-rw-r--r--fs/nfs/nfs4namespace.c15
-rw-r--r--fs/notify/inotify/inotify.c2
-rw-r--r--fs/ocfs2/alloc.c30
-rw-r--r--fs/ocfs2/aops.c7
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c12
-rw-r--r--fs/ocfs2/dlm/dlmthread.c3
-rw-r--r--fs/ocfs2/dlm/dlmunlock.c4
-rw-r--r--fs/ocfs2/dlmglue.c11
-rw-r--r--fs/ocfs2/journal.h6
-rw-r--r--fs/ocfs2/namei.c3
-rw-r--r--fs/ocfs2/ocfs2.h3
-rw-r--r--fs/ocfs2/ocfs2_fs.h6
-rw-r--r--fs/ocfs2/super.c8
-rw-r--r--fs/ocfs2/xattr.c57
-rw-r--r--fs/pipe.c8
-rw-r--r--fs/proc/inode.c4
-rw-r--r--fs/proc/page.c4
-rw-r--r--fs/ramfs/file-nommu.c4
-rw-r--r--fs/seq_file.c151
-rw-r--r--fs/squashfs/block.c18
-rw-r--r--fs/squashfs/cache.c4
-rw-r--r--fs/squashfs/inode.c6
-rw-r--r--fs/squashfs/squashfs.h2
-rw-r--r--fs/squashfs/super.c2
-rw-r--r--fs/super.c26
-rw-r--r--fs/timerfd.c12
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.c91
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.h2
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c10
-rw-r--r--fs/xfs/xfs_iget.c15
-rw-r--r--fs/xfs/xfs_log_recover.c17
110 files changed, 3145 insertions, 1180 deletions
diff --git a/fs/Makefile b/fs/Makefile
index 38bc735c67ad..dc20db348679 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -69,10 +69,12 @@ obj-$(CONFIG_DLM) += dlm/
69# Do not add any filesystems before this line 69# Do not add any filesystems before this line
70obj-$(CONFIG_REISERFS_FS) += reiserfs/ 70obj-$(CONFIG_REISERFS_FS) += reiserfs/
71obj-$(CONFIG_EXT3_FS) += ext3/ # Before ext2 so root fs can be ext3 71obj-$(CONFIG_EXT3_FS) += ext3/ # Before ext2 so root fs can be ext3
72obj-$(CONFIG_EXT4_FS) += ext4/ # Before ext2 so root fs can be ext4 72obj-$(CONFIG_EXT2_FS) += ext2/
73# We place ext4 after ext2 so plain ext2 root fs's are mounted using ext2
74# unless explicitly requested by rootfstype
75obj-$(CONFIG_EXT4_FS) += ext4/
73obj-$(CONFIG_JBD) += jbd/ 76obj-$(CONFIG_JBD) += jbd/
74obj-$(CONFIG_JBD2) += jbd2/ 77obj-$(CONFIG_JBD2) += jbd2/
75obj-$(CONFIG_EXT2_FS) += ext2/
76obj-$(CONFIG_CRAMFS) += cramfs/ 78obj-$(CONFIG_CRAMFS) += cramfs/
77obj-$(CONFIG_SQUASHFS) += squashfs/ 79obj-$(CONFIG_SQUASHFS) += squashfs/
78obj-y += ramfs/ 80obj-y += ramfs/
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index e3ff2b9e602f..33b7235f853b 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1208,9 +1208,11 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma,
1208 * check for an ELF header. If we find one, dump the first page to 1208 * check for an ELF header. If we find one, dump the first page to
1209 * aid in determining what was mapped here. 1209 * aid in determining what was mapped here.
1210 */ 1210 */
1211 if (FILTER(ELF_HEADERS) && vma->vm_file != NULL && vma->vm_pgoff == 0) { 1211 if (FILTER(ELF_HEADERS) &&
1212 vma->vm_pgoff == 0 && (vma->vm_flags & VM_READ)) {
1212 u32 __user *header = (u32 __user *) vma->vm_start; 1213 u32 __user *header = (u32 __user *) vma->vm_start;
1213 u32 word; 1214 u32 word;
1215 mm_segment_t fs = get_fs();
1214 /* 1216 /*
1215 * Doing it this way gets the constant folded by GCC. 1217 * Doing it this way gets the constant folded by GCC.
1216 */ 1218 */
@@ -1223,7 +1225,15 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma,
1223 magic.elfmag[EI_MAG1] = ELFMAG1; 1225 magic.elfmag[EI_MAG1] = ELFMAG1;
1224 magic.elfmag[EI_MAG2] = ELFMAG2; 1226 magic.elfmag[EI_MAG2] = ELFMAG2;
1225 magic.elfmag[EI_MAG3] = ELFMAG3; 1227 magic.elfmag[EI_MAG3] = ELFMAG3;
1226 if (get_user(word, header) == 0 && word == magic.cmp) 1228 /*
1229 * Switch to the user "segment" for get_user(),
1230 * then put back what elf_core_dump() had in place.
1231 */
1232 set_fs(USER_DS);
1233 if (unlikely(get_user(word, header)))
1234 word = 0;
1235 set_fs(fs);
1236 if (word == magic.cmp)
1227 return PAGE_SIZE; 1237 return PAGE_SIZE;
1228 } 1238 }
1229 1239
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 549b0144da11..fe2b1aa2464e 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -685,19 +685,20 @@ EXPORT_SYMBOL(bio_integrity_split);
685 * bio_integrity_clone - Callback for cloning bios with integrity metadata 685 * bio_integrity_clone - Callback for cloning bios with integrity metadata
686 * @bio: New bio 686 * @bio: New bio
687 * @bio_src: Original bio 687 * @bio_src: Original bio
688 * @gfp_mask: Memory allocation mask
688 * @bs: bio_set to allocate bip from 689 * @bs: bio_set to allocate bip from
689 * 690 *
690 * Description: Called to allocate a bip when cloning a bio 691 * Description: Called to allocate a bip when cloning a bio
691 */ 692 */
692int bio_integrity_clone(struct bio *bio, struct bio *bio_src, 693int bio_integrity_clone(struct bio *bio, struct bio *bio_src,
693 struct bio_set *bs) 694 gfp_t gfp_mask, struct bio_set *bs)
694{ 695{
695 struct bio_integrity_payload *bip_src = bio_src->bi_integrity; 696 struct bio_integrity_payload *bip_src = bio_src->bi_integrity;
696 struct bio_integrity_payload *bip; 697 struct bio_integrity_payload *bip;
697 698
698 BUG_ON(bip_src == NULL); 699 BUG_ON(bip_src == NULL);
699 700
700 bip = bio_integrity_alloc_bioset(bio, GFP_NOIO, bip_src->bip_vcnt, bs); 701 bip = bio_integrity_alloc_bioset(bio, gfp_mask, bip_src->bip_vcnt, bs);
701 702
702 if (bip == NULL) 703 if (bip == NULL)
703 return -EIO; 704 return -EIO;
diff --git a/fs/bio.c b/fs/bio.c
index 062299acbccd..d4f06327c810 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -302,9 +302,10 @@ void bio_init(struct bio *bio)
302struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) 302struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
303{ 303{
304 struct bio *bio = NULL; 304 struct bio *bio = NULL;
305 void *uninitialized_var(p);
305 306
306 if (bs) { 307 if (bs) {
307 void *p = mempool_alloc(bs->bio_pool, gfp_mask); 308 p = mempool_alloc(bs->bio_pool, gfp_mask);
308 309
309 if (p) 310 if (p)
310 bio = p + bs->front_pad; 311 bio = p + bs->front_pad;
@@ -329,7 +330,7 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
329 } 330 }
330 if (unlikely(!bvl)) { 331 if (unlikely(!bvl)) {
331 if (bs) 332 if (bs)
332 mempool_free(bio, bs->bio_pool); 333 mempool_free(p, bs->bio_pool);
333 else 334 else
334 kfree(bio); 335 kfree(bio);
335 bio = NULL; 336 bio = NULL;
@@ -462,10 +463,12 @@ struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask)
462 if (bio_integrity(bio)) { 463 if (bio_integrity(bio)) {
463 int ret; 464 int ret;
464 465
465 ret = bio_integrity_clone(b, bio, fs_bio_set); 466 ret = bio_integrity_clone(b, bio, gfp_mask, fs_bio_set);
466 467
467 if (ret < 0) 468 if (ret < 0) {
469 bio_put(b);
468 return NULL; 470 return NULL;
471 }
469 } 472 }
470 473
471 return b; 474 return b;
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index f8fcf999ea1b..7bb3c020e570 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -16,3 +16,16 @@ config BTRFS_FS
16 module will be called btrfs. 16 module will be called btrfs.
17 17
18 If unsure, say N. 18 If unsure, say N.
19
20config BTRFS_FS_POSIX_ACL
21 bool "Btrfs POSIX Access Control Lists"
22 depends on BTRFS_FS
23 select FS_POSIX_ACL
24 help
25 POSIX Access Control Lists (ACLs) support permissions for users and
26 groups beyond the owner/group/world scheme.
27
28 To learn more about Access Control Lists, visit the POSIX ACLs for
29 Linux website <http://acl.bestbits.at/>.
30
31 If you don't know what Access Control Lists are, say N
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 8e2fec05dbe0..c84ca1f5259a 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -16,11 +16,11 @@
16 * Boston, MA 021110-1307, USA. 16 * Boston, MA 021110-1307, USA.
17 */ 17 */
18 18
19#include <linux/version.h>
20#include <linux/kthread.h> 19#include <linux/kthread.h>
21#include <linux/list.h> 20#include <linux/list.h>
22#include <linux/spinlock.h> 21#include <linux/spinlock.h>
23# include <linux/freezer.h> 22#include <linux/freezer.h>
23#include <linux/ftrace.h>
24#include "async-thread.h" 24#include "async-thread.h"
25 25
26#define WORK_QUEUED_BIT 0 26#define WORK_QUEUED_BIT 0
@@ -143,6 +143,7 @@ static int worker_loop(void *arg)
143 struct btrfs_work *work; 143 struct btrfs_work *work;
144 do { 144 do {
145 spin_lock_irq(&worker->lock); 145 spin_lock_irq(&worker->lock);
146again_locked:
146 while (!list_empty(&worker->pending)) { 147 while (!list_empty(&worker->pending)) {
147 cur = worker->pending.next; 148 cur = worker->pending.next;
148 work = list_entry(cur, struct btrfs_work, list); 149 work = list_entry(cur, struct btrfs_work, list);
@@ -165,14 +166,50 @@ static int worker_loop(void *arg)
165 check_idle_worker(worker); 166 check_idle_worker(worker);
166 167
167 } 168 }
168 worker->working = 0;
169 if (freezing(current)) { 169 if (freezing(current)) {
170 worker->working = 0;
171 spin_unlock_irq(&worker->lock);
170 refrigerator(); 172 refrigerator();
171 } else { 173 } else {
172 set_current_state(TASK_INTERRUPTIBLE);
173 spin_unlock_irq(&worker->lock); 174 spin_unlock_irq(&worker->lock);
174 if (!kthread_should_stop()) 175 if (!kthread_should_stop()) {
176 cpu_relax();
177 /*
178 * we've dropped the lock, did someone else
179 * jump_in?
180 */
181 smp_mb();
182 if (!list_empty(&worker->pending))
183 continue;
184
185 /*
186 * this short schedule allows more work to
187 * come in without the queue functions
188 * needing to go through wake_up_process()
189 *
190 * worker->working is still 1, so nobody
191 * is going to try and wake us up
192 */
193 schedule_timeout(1);
194 smp_mb();
195 if (!list_empty(&worker->pending))
196 continue;
197
198 /* still no more work?, sleep for real */
199 spin_lock_irq(&worker->lock);
200 set_current_state(TASK_INTERRUPTIBLE);
201 if (!list_empty(&worker->pending))
202 goto again_locked;
203
204 /*
205 * this makes sure we get a wakeup when someone
206 * adds something new to the queue
207 */
208 worker->working = 0;
209 spin_unlock_irq(&worker->lock);
210
175 schedule(); 211 schedule();
212 }
176 __set_current_state(TASK_RUNNING); 213 __set_current_state(TASK_RUNNING);
177 } 214 }
178 } while (!kthread_should_stop()); 215 } while (!kthread_should_stop());
@@ -350,13 +387,14 @@ int btrfs_requeue_work(struct btrfs_work *work)
350{ 387{
351 struct btrfs_worker_thread *worker = work->worker; 388 struct btrfs_worker_thread *worker = work->worker;
352 unsigned long flags; 389 unsigned long flags;
390 int wake = 0;
353 391
354 if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags)) 392 if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
355 goto out; 393 goto out;
356 394
357 spin_lock_irqsave(&worker->lock, flags); 395 spin_lock_irqsave(&worker->lock, flags);
358 atomic_inc(&worker->num_pending);
359 list_add_tail(&work->list, &worker->pending); 396 list_add_tail(&work->list, &worker->pending);
397 atomic_inc(&worker->num_pending);
360 398
361 /* by definition we're busy, take ourselves off the idle 399 /* by definition we're busy, take ourselves off the idle
362 * list 400 * list
@@ -368,10 +406,16 @@ int btrfs_requeue_work(struct btrfs_work *work)
368 &worker->workers->worker_list); 406 &worker->workers->worker_list);
369 spin_unlock_irqrestore(&worker->workers->lock, flags); 407 spin_unlock_irqrestore(&worker->workers->lock, flags);
370 } 408 }
409 if (!worker->working) {
410 wake = 1;
411 worker->working = 1;
412 }
371 413
372 spin_unlock_irqrestore(&worker->lock, flags); 414 spin_unlock_irqrestore(&worker->lock, flags);
373 415 if (wake)
416 wake_up_process(worker->task);
374out: 417out:
418
375 return 0; 419 return 0;
376} 420}
377 421
@@ -398,9 +442,10 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
398 } 442 }
399 443
400 spin_lock_irqsave(&worker->lock, flags); 444 spin_lock_irqsave(&worker->lock, flags);
445
446 list_add_tail(&work->list, &worker->pending);
401 atomic_inc(&worker->num_pending); 447 atomic_inc(&worker->num_pending);
402 check_busy_worker(worker); 448 check_busy_worker(worker);
403 list_add_tail(&work->list, &worker->pending);
404 449
405 /* 450 /*
406 * avoid calling into wake_up_process if this thread has already 451 * avoid calling into wake_up_process if this thread has already
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index a8c9693b75ac..72677ce2b74f 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -66,6 +66,9 @@ struct btrfs_inode {
66 */ 66 */
67 struct list_head delalloc_inodes; 67 struct list_head delalloc_inodes;
68 68
69 /* the space_info for where this inode's data allocations are done */
70 struct btrfs_space_info *space_info;
71
69 /* full 64 bit generation number, struct vfs_inode doesn't have a big 72 /* full 64 bit generation number, struct vfs_inode doesn't have a big
70 * enough field for this. 73 * enough field for this.
71 */ 74 */
@@ -94,6 +97,11 @@ struct btrfs_inode {
94 */ 97 */
95 u64 delalloc_bytes; 98 u64 delalloc_bytes;
96 99
100 /* total number of bytes that may be used for this inode for
101 * delalloc
102 */
103 u64 reserved_bytes;
104
97 /* 105 /*
98 * the size of the file stored in the metadata on disk. data=ordered 106 * the size of the file stored in the metadata on disk. data=ordered
99 * means the in-memory i_size might be larger than the size on disk 107 * means the in-memory i_size might be larger than the size on disk
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index ee848d8585d9..ab07627084f1 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -32,7 +32,6 @@
32#include <linux/swap.h> 32#include <linux/swap.h>
33#include <linux/writeback.h> 33#include <linux/writeback.h>
34#include <linux/bit_spinlock.h> 34#include <linux/bit_spinlock.h>
35#include <linux/version.h>
36#include <linux/pagevec.h> 35#include <linux/pagevec.h>
37#include "compat.h" 36#include "compat.h"
38#include "ctree.h" 37#include "ctree.h"
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 9e46c0776816..37f31b5529aa 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -38,22 +38,64 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
38static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, 38static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
39 struct btrfs_path *path, int level, int slot); 39 struct btrfs_path *path, int level, int slot);
40 40
41inline void btrfs_init_path(struct btrfs_path *p)
42{
43 memset(p, 0, sizeof(*p));
44}
45
46struct btrfs_path *btrfs_alloc_path(void) 41struct btrfs_path *btrfs_alloc_path(void)
47{ 42{
48 struct btrfs_path *path; 43 struct btrfs_path *path;
49 path = kmem_cache_alloc(btrfs_path_cachep, GFP_NOFS); 44 path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
50 if (path) { 45 if (path)
51 btrfs_init_path(path);
52 path->reada = 1; 46 path->reada = 1;
53 }
54 return path; 47 return path;
55} 48}
56 49
50/*
51 * set all locked nodes in the path to blocking locks. This should
52 * be done before scheduling
53 */
54noinline void btrfs_set_path_blocking(struct btrfs_path *p)
55{
56 int i;
57 for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
58 if (p->nodes[i] && p->locks[i])
59 btrfs_set_lock_blocking(p->nodes[i]);
60 }
61}
62
63/*
64 * reset all the locked nodes in the patch to spinning locks.
65 *
66 * held is used to keep lockdep happy, when lockdep is enabled
67 * we set held to a blocking lock before we go around and
68 * retake all the spinlocks in the path. You can safely use NULL
69 * for held
70 */
71noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
72 struct extent_buffer *held)
73{
74 int i;
75
76#ifdef CONFIG_DEBUG_LOCK_ALLOC
77 /* lockdep really cares that we take all of these spinlocks
78 * in the right order. If any of the locks in the path are not
79 * currently blocking, it is going to complain. So, make really
80 * really sure by forcing the path to blocking before we clear
81 * the path blocking.
82 */
83 if (held)
84 btrfs_set_lock_blocking(held);
85 btrfs_set_path_blocking(p);
86#endif
87
88 for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
89 if (p->nodes[i] && p->locks[i])
90 btrfs_clear_lock_blocking(p->nodes[i]);
91 }
92
93#ifdef CONFIG_DEBUG_LOCK_ALLOC
94 if (held)
95 btrfs_clear_lock_blocking(held);
96#endif
97}
98
57/* this also releases the path */ 99/* this also releases the path */
58void btrfs_free_path(struct btrfs_path *p) 100void btrfs_free_path(struct btrfs_path *p)
59{ 101{
@@ -235,7 +277,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
235 if (*cow_ret == buf) 277 if (*cow_ret == buf)
236 unlock_orig = 1; 278 unlock_orig = 1;
237 279
238 WARN_ON(!btrfs_tree_locked(buf)); 280 btrfs_assert_tree_locked(buf);
239 281
240 if (parent) 282 if (parent)
241 parent_start = parent->start; 283 parent_start = parent->start;
@@ -261,7 +303,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
261 trans->transid, level, &ins); 303 trans->transid, level, &ins);
262 BUG_ON(ret); 304 BUG_ON(ret);
263 cow = btrfs_init_new_buffer(trans, root, prealloc_dest, 305 cow = btrfs_init_new_buffer(trans, root, prealloc_dest,
264 buf->len); 306 buf->len, level);
265 } else { 307 } else {
266 cow = btrfs_alloc_free_block(trans, root, buf->len, 308 cow = btrfs_alloc_free_block(trans, root, buf->len,
267 parent_start, 309 parent_start,
@@ -272,6 +314,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
272 if (IS_ERR(cow)) 314 if (IS_ERR(cow))
273 return PTR_ERR(cow); 315 return PTR_ERR(cow);
274 316
317 /* cow is set to blocking by btrfs_init_new_buffer */
318
275 copy_extent_buffer(cow, buf, 0, 0, cow->len); 319 copy_extent_buffer(cow, buf, 0, 0, cow->len);
276 btrfs_set_header_bytenr(cow, cow->start); 320 btrfs_set_header_bytenr(cow, cow->start);
277 btrfs_set_header_generation(cow, trans->transid); 321 btrfs_set_header_generation(cow, trans->transid);
@@ -388,17 +432,20 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
388 WARN_ON(1); 432 WARN_ON(1);
389 } 433 }
390 434
391 spin_lock(&root->fs_info->hash_lock);
392 if (btrfs_header_generation(buf) == trans->transid && 435 if (btrfs_header_generation(buf) == trans->transid &&
393 btrfs_header_owner(buf) == root->root_key.objectid && 436 btrfs_header_owner(buf) == root->root_key.objectid &&
394 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { 437 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
395 *cow_ret = buf; 438 *cow_ret = buf;
396 spin_unlock(&root->fs_info->hash_lock);
397 WARN_ON(prealloc_dest); 439 WARN_ON(prealloc_dest);
398 return 0; 440 return 0;
399 } 441 }
400 spin_unlock(&root->fs_info->hash_lock); 442
401 search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1); 443 search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);
444
445 if (parent)
446 btrfs_set_lock_blocking(parent);
447 btrfs_set_lock_blocking(buf);
448
402 ret = __btrfs_cow_block(trans, root, buf, parent, 449 ret = __btrfs_cow_block(trans, root, buf, parent,
403 parent_slot, cow_ret, search_start, 0, 450 parent_slot, cow_ret, search_start, 0,
404 prealloc_dest); 451 prealloc_dest);
@@ -504,6 +551,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
504 if (parent_nritems == 1) 551 if (parent_nritems == 1)
505 return 0; 552 return 0;
506 553
554 btrfs_set_lock_blocking(parent);
555
507 for (i = start_slot; i < end_slot; i++) { 556 for (i = start_slot; i < end_slot; i++) {
508 int close = 1; 557 int close = 1;
509 558
@@ -564,6 +613,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
564 search_start = last_block; 613 search_start = last_block;
565 614
566 btrfs_tree_lock(cur); 615 btrfs_tree_lock(cur);
616 btrfs_set_lock_blocking(cur);
567 err = __btrfs_cow_block(trans, root, cur, parent, i, 617 err = __btrfs_cow_block(trans, root, cur, parent, i,
568 &cur, search_start, 618 &cur, search_start,
569 min(16 * blocksize, 619 min(16 * blocksize,
@@ -862,6 +912,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
862 return 0; 912 return 0;
863 913
864 mid = path->nodes[level]; 914 mid = path->nodes[level];
915
865 WARN_ON(!path->locks[level]); 916 WARN_ON(!path->locks[level]);
866 WARN_ON(btrfs_header_generation(mid) != trans->transid); 917 WARN_ON(btrfs_header_generation(mid) != trans->transid);
867 918
@@ -883,8 +934,9 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
883 934
884 /* promote the child to a root */ 935 /* promote the child to a root */
885 child = read_node_slot(root, mid, 0); 936 child = read_node_slot(root, mid, 0);
886 btrfs_tree_lock(child);
887 BUG_ON(!child); 937 BUG_ON(!child);
938 btrfs_tree_lock(child);
939 btrfs_set_lock_blocking(child);
888 ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0); 940 ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0);
889 BUG_ON(ret); 941 BUG_ON(ret);
890 942
@@ -900,6 +952,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
900 952
901 add_root_to_dirty_list(root); 953 add_root_to_dirty_list(root);
902 btrfs_tree_unlock(child); 954 btrfs_tree_unlock(child);
955
903 path->locks[level] = 0; 956 path->locks[level] = 0;
904 path->nodes[level] = NULL; 957 path->nodes[level] = NULL;
905 clean_tree_block(trans, root, mid); 958 clean_tree_block(trans, root, mid);
@@ -924,6 +977,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
924 left = read_node_slot(root, parent, pslot - 1); 977 left = read_node_slot(root, parent, pslot - 1);
925 if (left) { 978 if (left) {
926 btrfs_tree_lock(left); 979 btrfs_tree_lock(left);
980 btrfs_set_lock_blocking(left);
927 wret = btrfs_cow_block(trans, root, left, 981 wret = btrfs_cow_block(trans, root, left,
928 parent, pslot - 1, &left, 0); 982 parent, pslot - 1, &left, 0);
929 if (wret) { 983 if (wret) {
@@ -934,6 +988,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
934 right = read_node_slot(root, parent, pslot + 1); 988 right = read_node_slot(root, parent, pslot + 1);
935 if (right) { 989 if (right) {
936 btrfs_tree_lock(right); 990 btrfs_tree_lock(right);
991 btrfs_set_lock_blocking(right);
937 wret = btrfs_cow_block(trans, root, right, 992 wret = btrfs_cow_block(trans, root, right,
938 parent, pslot + 1, &right, 0); 993 parent, pslot + 1, &right, 0);
939 if (wret) { 994 if (wret) {
@@ -1109,6 +1164,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1109 u32 left_nr; 1164 u32 left_nr;
1110 1165
1111 btrfs_tree_lock(left); 1166 btrfs_tree_lock(left);
1167 btrfs_set_lock_blocking(left);
1168
1112 left_nr = btrfs_header_nritems(left); 1169 left_nr = btrfs_header_nritems(left);
1113 if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) { 1170 if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
1114 wret = 1; 1171 wret = 1;
@@ -1155,7 +1212,10 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1155 */ 1212 */
1156 if (right) { 1213 if (right) {
1157 u32 right_nr; 1214 u32 right_nr;
1215
1158 btrfs_tree_lock(right); 1216 btrfs_tree_lock(right);
1217 btrfs_set_lock_blocking(right);
1218
1159 right_nr = btrfs_header_nritems(right); 1219 right_nr = btrfs_header_nritems(right);
1160 if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) { 1220 if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
1161 wret = 1; 1221 wret = 1;
@@ -1210,8 +1270,7 @@ static noinline void reada_for_search(struct btrfs_root *root,
1210 struct btrfs_disk_key disk_key; 1270 struct btrfs_disk_key disk_key;
1211 u32 nritems; 1271 u32 nritems;
1212 u64 search; 1272 u64 search;
1213 u64 lowest_read; 1273 u64 target;
1214 u64 highest_read;
1215 u64 nread = 0; 1274 u64 nread = 0;
1216 int direction = path->reada; 1275 int direction = path->reada;
1217 struct extent_buffer *eb; 1276 struct extent_buffer *eb;
@@ -1235,8 +1294,7 @@ static noinline void reada_for_search(struct btrfs_root *root,
1235 return; 1294 return;
1236 } 1295 }
1237 1296
1238 highest_read = search; 1297 target = search;
1239 lowest_read = search;
1240 1298
1241 nritems = btrfs_header_nritems(node); 1299 nritems = btrfs_header_nritems(node);
1242 nr = slot; 1300 nr = slot;
@@ -1256,27 +1314,80 @@ static noinline void reada_for_search(struct btrfs_root *root,
1256 break; 1314 break;
1257 } 1315 }
1258 search = btrfs_node_blockptr(node, nr); 1316 search = btrfs_node_blockptr(node, nr);
1259 if ((search >= lowest_read && search <= highest_read) || 1317 if ((search <= target && target - search <= 65536) ||
1260 (search < lowest_read && lowest_read - search <= 16384) || 1318 (search > target && search - target <= 65536)) {
1261 (search > highest_read && search - highest_read <= 16384)) {
1262 readahead_tree_block(root, search, blocksize, 1319 readahead_tree_block(root, search, blocksize,
1263 btrfs_node_ptr_generation(node, nr)); 1320 btrfs_node_ptr_generation(node, nr));
1264 nread += blocksize; 1321 nread += blocksize;
1265 } 1322 }
1266 nscan++; 1323 nscan++;
1267 if (path->reada < 2 && (nread > (64 * 1024) || nscan > 32)) 1324 if ((nread > 65536 || nscan > 32))
1268 break; 1325 break;
1326 }
1327}
1269 1328
1270 if (nread > (256 * 1024) || nscan > 128) 1329/*
1271 break; 1330 * returns -EAGAIN if it had to drop the path, or zero if everything was in
1331 * cache
1332 */
1333static noinline int reada_for_balance(struct btrfs_root *root,
1334 struct btrfs_path *path, int level)
1335{
1336 int slot;
1337 int nritems;
1338 struct extent_buffer *parent;
1339 struct extent_buffer *eb;
1340 u64 gen;
1341 u64 block1 = 0;
1342 u64 block2 = 0;
1343 int ret = 0;
1344 int blocksize;
1345
1346 parent = path->nodes[level - 1];
1347 if (!parent)
1348 return 0;
1272 1349
1273 if (search < lowest_read) 1350 nritems = btrfs_header_nritems(parent);
1274 lowest_read = search; 1351 slot = path->slots[level];
1275 if (search > highest_read) 1352 blocksize = btrfs_level_size(root, level);
1276 highest_read = search; 1353
1354 if (slot > 0) {
1355 block1 = btrfs_node_blockptr(parent, slot - 1);
1356 gen = btrfs_node_ptr_generation(parent, slot - 1);
1357 eb = btrfs_find_tree_block(root, block1, blocksize);
1358 if (eb && btrfs_buffer_uptodate(eb, gen))
1359 block1 = 0;
1360 free_extent_buffer(eb);
1361 }
1362 if (slot < nritems) {
1363 block2 = btrfs_node_blockptr(parent, slot + 1);
1364 gen = btrfs_node_ptr_generation(parent, slot + 1);
1365 eb = btrfs_find_tree_block(root, block2, blocksize);
1366 if (eb && btrfs_buffer_uptodate(eb, gen))
1367 block2 = 0;
1368 free_extent_buffer(eb);
1277 } 1369 }
1370 if (block1 || block2) {
1371 ret = -EAGAIN;
1372 btrfs_release_path(root, path);
1373 if (block1)
1374 readahead_tree_block(root, block1, blocksize, 0);
1375 if (block2)
1376 readahead_tree_block(root, block2, blocksize, 0);
1377
1378 if (block1) {
1379 eb = read_tree_block(root, block1, blocksize, 0);
1380 free_extent_buffer(eb);
1381 }
1382 if (block1) {
1383 eb = read_tree_block(root, block2, blocksize, 0);
1384 free_extent_buffer(eb);
1385 }
1386 }
1387 return ret;
1278} 1388}
1279 1389
1390
1280/* 1391/*
1281 * when we walk down the tree, it is usually safe to unlock the higher layers 1392 * when we walk down the tree, it is usually safe to unlock the higher layers
1282 * in the tree. The exceptions are when our path goes through slot 0, because 1393 * in the tree. The exceptions are when our path goes through slot 0, because
@@ -1328,6 +1439,32 @@ static noinline void unlock_up(struct btrfs_path *path, int level,
1328} 1439}
1329 1440
1330/* 1441/*
1442 * This releases any locks held in the path starting at level and
1443 * going all the way up to the root.
1444 *
1445 * btrfs_search_slot will keep the lock held on higher nodes in a few
1446 * corner cases, such as COW of the block at slot zero in the node. This
1447 * ignores those rules, and it should only be called when there are no
1448 * more updates to be done higher up in the tree.
1449 */
1450noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
1451{
1452 int i;
1453
1454 if (path->keep_locks || path->lowest_level)
1455 return;
1456
1457 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
1458 if (!path->nodes[i])
1459 continue;
1460 if (!path->locks[i])
1461 continue;
1462 btrfs_tree_unlock(path->nodes[i]);
1463 path->locks[i] = 0;
1464 }
1465}
1466
1467/*
1331 * look for key in the tree. path is filled in with nodes along the way 1468 * look for key in the tree. path is filled in with nodes along the way
1332 * if key is found, we return zero and you can find the item in the leaf 1469 * if key is found, we return zero and you can find the item in the leaf
1333 * level of the path (level 0) 1470 * level of the path (level 0)
@@ -1387,32 +1524,30 @@ again:
1387 int wret; 1524 int wret;
1388 1525
1389 /* is a cow on this block not required */ 1526 /* is a cow on this block not required */
1390 spin_lock(&root->fs_info->hash_lock);
1391 if (btrfs_header_generation(b) == trans->transid && 1527 if (btrfs_header_generation(b) == trans->transid &&
1392 btrfs_header_owner(b) == root->root_key.objectid && 1528 btrfs_header_owner(b) == root->root_key.objectid &&
1393 !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) { 1529 !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) {
1394 spin_unlock(&root->fs_info->hash_lock);
1395 goto cow_done; 1530 goto cow_done;
1396 } 1531 }
1397 spin_unlock(&root->fs_info->hash_lock);
1398 1532
1399 /* ok, we have to cow, is our old prealloc the right 1533 /* ok, we have to cow, is our old prealloc the right
1400 * size? 1534 * size?
1401 */ 1535 */
1402 if (prealloc_block.objectid && 1536 if (prealloc_block.objectid &&
1403 prealloc_block.offset != b->len) { 1537 prealloc_block.offset != b->len) {
1538 btrfs_release_path(root, p);
1404 btrfs_free_reserved_extent(root, 1539 btrfs_free_reserved_extent(root,
1405 prealloc_block.objectid, 1540 prealloc_block.objectid,
1406 prealloc_block.offset); 1541 prealloc_block.offset);
1407 prealloc_block.objectid = 0; 1542 prealloc_block.objectid = 0;
1543 goto again;
1408 } 1544 }
1409 1545
1410 /* 1546 /*
1411 * for higher level blocks, try not to allocate blocks 1547 * for higher level blocks, try not to allocate blocks
1412 * with the block and the parent locks held. 1548 * with the block and the parent locks held.
1413 */ 1549 */
1414 if (level > 1 && !prealloc_block.objectid && 1550 if (level > 0 && !prealloc_block.objectid) {
1415 btrfs_path_lock_waiting(p, level)) {
1416 u32 size = b->len; 1551 u32 size = b->len;
1417 u64 hint = b->start; 1552 u64 hint = b->start;
1418 1553
@@ -1425,6 +1560,8 @@ again:
1425 goto again; 1560 goto again;
1426 } 1561 }
1427 1562
1563 btrfs_set_path_blocking(p);
1564
1428 wret = btrfs_cow_block(trans, root, b, 1565 wret = btrfs_cow_block(trans, root, b,
1429 p->nodes[level + 1], 1566 p->nodes[level + 1],
1430 p->slots[level + 1], 1567 p->slots[level + 1],
@@ -1446,6 +1583,22 @@ cow_done:
1446 if (!p->skip_locking) 1583 if (!p->skip_locking)
1447 p->locks[level] = 1; 1584 p->locks[level] = 1;
1448 1585
1586 btrfs_clear_path_blocking(p, NULL);
1587
1588 /*
1589 * we have a lock on b and as long as we aren't changing
1590 * the tree, there is no way to for the items in b to change.
1591 * It is safe to drop the lock on our parent before we
1592 * go through the expensive btree search on b.
1593 *
1594 * If cow is true, then we might be changing slot zero,
1595 * which may require changing the parent. So, we can't
1596 * drop the lock until after we know which slot we're
1597 * operating on.
1598 */
1599 if (!cow)
1600 btrfs_unlock_up_safe(p, level + 1);
1601
1449 ret = check_block(root, p, level); 1602 ret = check_block(root, p, level);
1450 if (ret) { 1603 if (ret) {
1451 ret = -1; 1604 ret = -1;
@@ -1453,6 +1606,7 @@ cow_done:
1453 } 1606 }
1454 1607
1455 ret = bin_search(b, key, level, &slot); 1608 ret = bin_search(b, key, level, &slot);
1609
1456 if (level != 0) { 1610 if (level != 0) {
1457 if (ret && slot > 0) 1611 if (ret && slot > 0)
1458 slot -= 1; 1612 slot -= 1;
@@ -1460,7 +1614,16 @@ cow_done:
1460 if ((p->search_for_split || ins_len > 0) && 1614 if ((p->search_for_split || ins_len > 0) &&
1461 btrfs_header_nritems(b) >= 1615 btrfs_header_nritems(b) >=
1462 BTRFS_NODEPTRS_PER_BLOCK(root) - 3) { 1616 BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
1463 int sret = split_node(trans, root, p, level); 1617 int sret;
1618
1619 sret = reada_for_balance(root, p, level);
1620 if (sret)
1621 goto again;
1622
1623 btrfs_set_path_blocking(p);
1624 sret = split_node(trans, root, p, level);
1625 btrfs_clear_path_blocking(p, NULL);
1626
1464 BUG_ON(sret > 0); 1627 BUG_ON(sret > 0);
1465 if (sret) { 1628 if (sret) {
1466 ret = sret; 1629 ret = sret;
@@ -1468,9 +1631,19 @@ cow_done:
1468 } 1631 }
1469 b = p->nodes[level]; 1632 b = p->nodes[level];
1470 slot = p->slots[level]; 1633 slot = p->slots[level];
1471 } else if (ins_len < 0) { 1634 } else if (ins_len < 0 &&
1472 int sret = balance_level(trans, root, p, 1635 btrfs_header_nritems(b) <
1473 level); 1636 BTRFS_NODEPTRS_PER_BLOCK(root) / 4) {
1637 int sret;
1638
1639 sret = reada_for_balance(root, p, level);
1640 if (sret)
1641 goto again;
1642
1643 btrfs_set_path_blocking(p);
1644 sret = balance_level(trans, root, p, level);
1645 btrfs_clear_path_blocking(p, NULL);
1646
1474 if (sret) { 1647 if (sret) {
1475 ret = sret; 1648 ret = sret;
1476 goto done; 1649 goto done;
@@ -1504,7 +1677,7 @@ cow_done:
1504 * of the btree by dropping locks before 1677 * of the btree by dropping locks before
1505 * we read. 1678 * we read.
1506 */ 1679 */
1507 if (level > 1) { 1680 if (level > 0) {
1508 btrfs_release_path(NULL, p); 1681 btrfs_release_path(NULL, p);
1509 if (tmp) 1682 if (tmp)
1510 free_extent_buffer(tmp); 1683 free_extent_buffer(tmp);
@@ -1519,6 +1692,7 @@ cow_done:
1519 free_extent_buffer(tmp); 1692 free_extent_buffer(tmp);
1520 goto again; 1693 goto again;
1521 } else { 1694 } else {
1695 btrfs_set_path_blocking(p);
1522 if (tmp) 1696 if (tmp)
1523 free_extent_buffer(tmp); 1697 free_extent_buffer(tmp);
1524 if (should_reada) 1698 if (should_reada)
@@ -1528,14 +1702,29 @@ cow_done:
1528 b = read_node_slot(root, b, slot); 1702 b = read_node_slot(root, b, slot);
1529 } 1703 }
1530 } 1704 }
1531 if (!p->skip_locking) 1705 if (!p->skip_locking) {
1532 btrfs_tree_lock(b); 1706 int lret;
1707
1708 btrfs_clear_path_blocking(p, NULL);
1709 lret = btrfs_try_spin_lock(b);
1710
1711 if (!lret) {
1712 btrfs_set_path_blocking(p);
1713 btrfs_tree_lock(b);
1714 btrfs_clear_path_blocking(p, b);
1715 }
1716 }
1533 } else { 1717 } else {
1534 p->slots[level] = slot; 1718 p->slots[level] = slot;
1535 if (ins_len > 0 && 1719 if (ins_len > 0 &&
1536 btrfs_leaf_free_space(root, b) < ins_len) { 1720 btrfs_leaf_free_space(root, b) < ins_len) {
1537 int sret = split_leaf(trans, root, key, 1721 int sret;
1722
1723 btrfs_set_path_blocking(p);
1724 sret = split_leaf(trans, root, key,
1538 p, ins_len, ret == 0); 1725 p, ins_len, ret == 0);
1726 btrfs_clear_path_blocking(p, NULL);
1727
1539 BUG_ON(sret > 0); 1728 BUG_ON(sret > 0);
1540 if (sret) { 1729 if (sret) {
1541 ret = sret; 1730 ret = sret;
@@ -1549,12 +1738,16 @@ cow_done:
1549 } 1738 }
1550 ret = 1; 1739 ret = 1;
1551done: 1740done:
1741 /*
1742 * we don't really know what they plan on doing with the path
1743 * from here on, so for now just mark it as blocking
1744 */
1745 btrfs_set_path_blocking(p);
1552 if (prealloc_block.objectid) { 1746 if (prealloc_block.objectid) {
1553 btrfs_free_reserved_extent(root, 1747 btrfs_free_reserved_extent(root,
1554 prealloc_block.objectid, 1748 prealloc_block.objectid,
1555 prealloc_block.offset); 1749 prealloc_block.offset);
1556 } 1750 }
1557
1558 return ret; 1751 return ret;
1559} 1752}
1560 1753
@@ -1578,6 +1771,8 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,
1578 ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0); 1771 ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0);
1579 BUG_ON(ret); 1772 BUG_ON(ret);
1580 1773
1774 btrfs_set_lock_blocking(eb);
1775
1581 parent = eb; 1776 parent = eb;
1582 while (1) { 1777 while (1) {
1583 level = btrfs_header_level(parent); 1778 level = btrfs_header_level(parent);
@@ -1602,6 +1797,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,
1602 eb = read_tree_block(root, bytenr, blocksize, 1797 eb = read_tree_block(root, bytenr, blocksize,
1603 generation); 1798 generation);
1604 btrfs_tree_lock(eb); 1799 btrfs_tree_lock(eb);
1800 btrfs_set_lock_blocking(eb);
1605 } 1801 }
1606 1802
1607 /* 1803 /*
@@ -1626,6 +1822,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,
1626 eb = read_tree_block(root, bytenr, blocksize, 1822 eb = read_tree_block(root, bytenr, blocksize,
1627 generation); 1823 generation);
1628 btrfs_tree_lock(eb); 1824 btrfs_tree_lock(eb);
1825 btrfs_set_lock_blocking(eb);
1629 } 1826 }
1630 1827
1631 ret = btrfs_cow_block(trans, root, eb, parent, slot, 1828 ret = btrfs_cow_block(trans, root, eb, parent, slot,
@@ -2168,10 +2365,12 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2168 if (slot >= btrfs_header_nritems(upper) - 1) 2365 if (slot >= btrfs_header_nritems(upper) - 1)
2169 return 1; 2366 return 1;
2170 2367
2171 WARN_ON(!btrfs_tree_locked(path->nodes[1])); 2368 btrfs_assert_tree_locked(path->nodes[1]);
2172 2369
2173 right = read_node_slot(root, upper, slot + 1); 2370 right = read_node_slot(root, upper, slot + 1);
2174 btrfs_tree_lock(right); 2371 btrfs_tree_lock(right);
2372 btrfs_set_lock_blocking(right);
2373
2175 free_space = btrfs_leaf_free_space(root, right); 2374 free_space = btrfs_leaf_free_space(root, right);
2176 if (free_space < data_size) 2375 if (free_space < data_size)
2177 goto out_unlock; 2376 goto out_unlock;
@@ -2363,10 +2562,12 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
2363 if (right_nritems == 0) 2562 if (right_nritems == 0)
2364 return 1; 2563 return 1;
2365 2564
2366 WARN_ON(!btrfs_tree_locked(path->nodes[1])); 2565 btrfs_assert_tree_locked(path->nodes[1]);
2367 2566
2368 left = read_node_slot(root, path->nodes[1], slot - 1); 2567 left = read_node_slot(root, path->nodes[1], slot - 1);
2369 btrfs_tree_lock(left); 2568 btrfs_tree_lock(left);
2569 btrfs_set_lock_blocking(left);
2570
2370 free_space = btrfs_leaf_free_space(root, left); 2571 free_space = btrfs_leaf_free_space(root, left);
2371 if (free_space < data_size) { 2572 if (free_space < data_size) {
2372 ret = 1; 2573 ret = 1;
@@ -2825,6 +3026,12 @@ int btrfs_split_item(struct btrfs_trans_handle *trans,
2825 path->keep_locks = 0; 3026 path->keep_locks = 0;
2826 BUG_ON(ret); 3027 BUG_ON(ret);
2827 3028
3029 /*
3030 * make sure any changes to the path from split_leaf leave it
3031 * in a blocking state
3032 */
3033 btrfs_set_path_blocking(path);
3034
2828 leaf = path->nodes[0]; 3035 leaf = path->nodes[0];
2829 BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item)); 3036 BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
2830 3037
@@ -3354,6 +3561,7 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
3354 BUG(); 3561 BUG();
3355 } 3562 }
3356out: 3563out:
3564 btrfs_unlock_up_safe(path, 1);
3357 return ret; 3565 return ret;
3358} 3566}
3359 3567
@@ -3441,15 +3649,22 @@ noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
3441{ 3649{
3442 int ret; 3650 int ret;
3443 u64 root_gen = btrfs_header_generation(path->nodes[1]); 3651 u64 root_gen = btrfs_header_generation(path->nodes[1]);
3652 u64 parent_start = path->nodes[1]->start;
3653 u64 parent_owner = btrfs_header_owner(path->nodes[1]);
3444 3654
3445 ret = del_ptr(trans, root, path, 1, path->slots[1]); 3655 ret = del_ptr(trans, root, path, 1, path->slots[1]);
3446 if (ret) 3656 if (ret)
3447 return ret; 3657 return ret;
3448 3658
3659 /*
3660 * btrfs_free_extent is expensive, we want to make sure we
3661 * aren't holding any locks when we call it
3662 */
3663 btrfs_unlock_up_safe(path, 0);
3664
3449 ret = btrfs_free_extent(trans, root, bytenr, 3665 ret = btrfs_free_extent(trans, root, bytenr,
3450 btrfs_level_size(root, 0), 3666 btrfs_level_size(root, 0),
3451 path->nodes[1]->start, 3667 parent_start, parent_owner,
3452 btrfs_header_owner(path->nodes[1]),
3453 root_gen, 0, 1); 3668 root_gen, 0, 1);
3454 return ret; 3669 return ret;
3455} 3670}
@@ -3721,6 +3936,7 @@ find_next_key:
3721 */ 3936 */
3722 if (slot >= nritems) { 3937 if (slot >= nritems) {
3723 path->slots[level] = slot; 3938 path->slots[level] = slot;
3939 btrfs_set_path_blocking(path);
3724 sret = btrfs_find_next_key(root, path, min_key, level, 3940 sret = btrfs_find_next_key(root, path, min_key, level,
3725 cache_only, min_trans); 3941 cache_only, min_trans);
3726 if (sret == 0) { 3942 if (sret == 0) {
@@ -3738,16 +3954,20 @@ find_next_key:
3738 unlock_up(path, level, 1); 3954 unlock_up(path, level, 1);
3739 goto out; 3955 goto out;
3740 } 3956 }
3957 btrfs_set_path_blocking(path);
3741 cur = read_node_slot(root, cur, slot); 3958 cur = read_node_slot(root, cur, slot);
3742 3959
3743 btrfs_tree_lock(cur); 3960 btrfs_tree_lock(cur);
3961
3744 path->locks[level - 1] = 1; 3962 path->locks[level - 1] = 1;
3745 path->nodes[level - 1] = cur; 3963 path->nodes[level - 1] = cur;
3746 unlock_up(path, level, 1); 3964 unlock_up(path, level, 1);
3965 btrfs_clear_path_blocking(path, NULL);
3747 } 3966 }
3748out: 3967out:
3749 if (ret == 0) 3968 if (ret == 0)
3750 memcpy(min_key, &found_key, sizeof(found_key)); 3969 memcpy(min_key, &found_key, sizeof(found_key));
3970 btrfs_set_path_blocking(path);
3751 return ret; 3971 return ret;
3752} 3972}
3753 3973
@@ -3843,6 +4063,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
3843 if (ret < 0) 4063 if (ret < 0)
3844 return ret; 4064 return ret;
3845 4065
4066 btrfs_set_path_blocking(path);
3846 nritems = btrfs_header_nritems(path->nodes[0]); 4067 nritems = btrfs_header_nritems(path->nodes[0]);
3847 /* 4068 /*
3848 * by releasing the path above we dropped all our locks. A balance 4069 * by releasing the path above we dropped all our locks. A balance
@@ -3873,14 +4094,16 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
3873 free_extent_buffer(next); 4094 free_extent_buffer(next);
3874 } 4095 }
3875 4096
4097 /* the path was set to blocking above */
3876 if (level == 1 && (path->locks[1] || path->skip_locking) && 4098 if (level == 1 && (path->locks[1] || path->skip_locking) &&
3877 path->reada) 4099 path->reada)
3878 reada_for_search(root, path, level, slot, 0); 4100 reada_for_search(root, path, level, slot, 0);
3879 4101
3880 next = read_node_slot(root, c, slot); 4102 next = read_node_slot(root, c, slot);
3881 if (!path->skip_locking) { 4103 if (!path->skip_locking) {
3882 WARN_ON(!btrfs_tree_locked(c)); 4104 btrfs_assert_tree_locked(c);
3883 btrfs_tree_lock(next); 4105 btrfs_tree_lock(next);
4106 btrfs_set_lock_blocking(next);
3884 } 4107 }
3885 break; 4108 break;
3886 } 4109 }
@@ -3897,12 +4120,15 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
3897 path->locks[level] = 1; 4120 path->locks[level] = 1;
3898 if (!level) 4121 if (!level)
3899 break; 4122 break;
4123
4124 btrfs_set_path_blocking(path);
3900 if (level == 1 && path->locks[1] && path->reada) 4125 if (level == 1 && path->locks[1] && path->reada)
3901 reada_for_search(root, path, level, slot, 0); 4126 reada_for_search(root, path, level, slot, 0);
3902 next = read_node_slot(root, next, 0); 4127 next = read_node_slot(root, next, 0);
3903 if (!path->skip_locking) { 4128 if (!path->skip_locking) {
3904 WARN_ON(!btrfs_tree_locked(path->nodes[level])); 4129 btrfs_assert_tree_locked(path->nodes[level]);
3905 btrfs_tree_lock(next); 4130 btrfs_tree_lock(next);
4131 btrfs_set_lock_blocking(next);
3906 } 4132 }
3907 } 4133 }
3908done: 4134done:
@@ -3927,6 +4153,7 @@ int btrfs_previous_item(struct btrfs_root *root,
3927 4153
3928 while (1) { 4154 while (1) {
3929 if (path->slots[0] == 0) { 4155 if (path->slots[0] == 0) {
4156 btrfs_set_path_blocking(path);
3930 ret = btrfs_prev_leaf(root, path); 4157 ret = btrfs_prev_leaf(root, path);
3931 if (ret != 0) 4158 if (ret != 0)
3932 return ret; 4159 return ret;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index eee060f88113..82491ba8fa40 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -43,11 +43,7 @@ struct btrfs_ordered_sum;
43 43
44#define BTRFS_ACL_NOT_CACHED ((void *)-1) 44#define BTRFS_ACL_NOT_CACHED ((void *)-1)
45 45
46#ifdef CONFIG_LOCKDEP 46#define BTRFS_MAX_LEVEL 8
47# define BTRFS_MAX_LEVEL 7
48#else
49# define BTRFS_MAX_LEVEL 8
50#endif
51 47
52/* holds pointers to all of the tree roots */ 48/* holds pointers to all of the tree roots */
53#define BTRFS_ROOT_TREE_OBJECTID 1ULL 49#define BTRFS_ROOT_TREE_OBJECTID 1ULL
@@ -454,17 +450,11 @@ struct btrfs_timespec {
454 __le32 nsec; 450 __le32 nsec;
455} __attribute__ ((__packed__)); 451} __attribute__ ((__packed__));
456 452
457typedef enum { 453enum btrfs_compression_type {
458 BTRFS_COMPRESS_NONE = 0, 454 BTRFS_COMPRESS_NONE = 0,
459 BTRFS_COMPRESS_ZLIB = 1, 455 BTRFS_COMPRESS_ZLIB = 1,
460 BTRFS_COMPRESS_LAST = 2, 456 BTRFS_COMPRESS_LAST = 2,
461} btrfs_compression_type; 457};
462
463/* we don't understand any encryption methods right now */
464typedef enum {
465 BTRFS_ENCRYPTION_NONE = 0,
466 BTRFS_ENCRYPTION_LAST = 1,
467} btrfs_encryption_type;
468 458
469struct btrfs_inode_item { 459struct btrfs_inode_item {
470 /* nfs style generation number */ 460 /* nfs style generation number */
@@ -606,13 +596,27 @@ struct btrfs_block_group_item {
606 596
607struct btrfs_space_info { 597struct btrfs_space_info {
608 u64 flags; 598 u64 flags;
609 u64 total_bytes; 599
610 u64 bytes_used; 600 u64 total_bytes; /* total bytes in the space */
611 u64 bytes_pinned; 601 u64 bytes_used; /* total bytes used on disk */
612 u64 bytes_reserved; 602 u64 bytes_pinned; /* total bytes pinned, will be freed when the
613 u64 bytes_readonly; 603 transaction finishes */
614 int full; 604 u64 bytes_reserved; /* total bytes the allocator has reserved for
615 int force_alloc; 605 current allocations */
606 u64 bytes_readonly; /* total bytes that are read only */
607
608 /* delalloc accounting */
609 u64 bytes_delalloc; /* number of bytes reserved for allocation,
610 this space is not necessarily reserved yet
611 by the allocator */
612 u64 bytes_may_use; /* number of bytes that may be used for
613 delalloc */
614
615 int full; /* indicates that we cannot allocate any more
616 chunks for this space */
617 int force_alloc; /* set if we need to force a chunk alloc for
618 this space */
619
616 struct list_head list; 620 struct list_head list;
617 621
618 /* for block groups in our same type */ 622 /* for block groups in our same type */
@@ -701,9 +705,7 @@ struct btrfs_fs_info {
701 struct btrfs_transaction *running_transaction; 705 struct btrfs_transaction *running_transaction;
702 wait_queue_head_t transaction_throttle; 706 wait_queue_head_t transaction_throttle;
703 wait_queue_head_t transaction_wait; 707 wait_queue_head_t transaction_wait;
704
705 wait_queue_head_t async_submit_wait; 708 wait_queue_head_t async_submit_wait;
706 wait_queue_head_t tree_log_wait;
707 709
708 struct btrfs_super_block super_copy; 710 struct btrfs_super_block super_copy;
709 struct btrfs_super_block super_for_commit; 711 struct btrfs_super_block super_for_commit;
@@ -711,7 +713,6 @@ struct btrfs_fs_info {
711 struct super_block *sb; 713 struct super_block *sb;
712 struct inode *btree_inode; 714 struct inode *btree_inode;
713 struct backing_dev_info bdi; 715 struct backing_dev_info bdi;
714 spinlock_t hash_lock;
715 struct mutex trans_mutex; 716 struct mutex trans_mutex;
716 struct mutex tree_log_mutex; 717 struct mutex tree_log_mutex;
717 struct mutex transaction_kthread_mutex; 718 struct mutex transaction_kthread_mutex;
@@ -730,10 +731,6 @@ struct btrfs_fs_info {
730 atomic_t async_submit_draining; 731 atomic_t async_submit_draining;
731 atomic_t nr_async_bios; 732 atomic_t nr_async_bios;
732 atomic_t async_delalloc_pages; 733 atomic_t async_delalloc_pages;
733 atomic_t tree_log_writers;
734 atomic_t tree_log_commit;
735 unsigned long tree_log_batch;
736 u64 tree_log_transid;
737 734
738 /* 735 /*
739 * this is used by the balancing code to wait for all the pending 736 * this is used by the balancing code to wait for all the pending
@@ -833,7 +830,14 @@ struct btrfs_root {
833 struct kobject root_kobj; 830 struct kobject root_kobj;
834 struct completion kobj_unregister; 831 struct completion kobj_unregister;
835 struct mutex objectid_mutex; 832 struct mutex objectid_mutex;
833
836 struct mutex log_mutex; 834 struct mutex log_mutex;
835 wait_queue_head_t log_writer_wait;
836 wait_queue_head_t log_commit_wait[2];
837 atomic_t log_writers;
838 atomic_t log_commit[2];
839 unsigned long log_transid;
840 unsigned long log_batch;
837 841
838 u64 objectid; 842 u64 objectid;
839 u64 last_trans; 843 u64 last_trans;
@@ -1721,7 +1725,8 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
1721 u64 empty_size); 1725 u64 empty_size);
1722struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, 1726struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
1723 struct btrfs_root *root, 1727 struct btrfs_root *root,
1724 u64 bytenr, u32 blocksize); 1728 u64 bytenr, u32 blocksize,
1729 int level);
1725int btrfs_alloc_extent(struct btrfs_trans_handle *trans, 1730int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
1726 struct btrfs_root *root, 1731 struct btrfs_root *root,
1727 u64 num_bytes, u64 parent, u64 min_bytes, 1732 u64 num_bytes, u64 parent, u64 min_bytes,
@@ -1791,6 +1796,16 @@ int btrfs_add_dead_reloc_root(struct btrfs_root *root);
1791int btrfs_cleanup_reloc_trees(struct btrfs_root *root); 1796int btrfs_cleanup_reloc_trees(struct btrfs_root *root);
1792int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len); 1797int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
1793u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); 1798u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
1799void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
1800int btrfs_check_metadata_free_space(struct btrfs_root *root);
1801int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
1802 u64 bytes);
1803void btrfs_free_reserved_data_space(struct btrfs_root *root,
1804 struct inode *inode, u64 bytes);
1805void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
1806 u64 bytes);
1807void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
1808 u64 bytes);
1794/* ctree.c */ 1809/* ctree.c */
1795int btrfs_previous_item(struct btrfs_root *root, 1810int btrfs_previous_item(struct btrfs_root *root,
1796 struct btrfs_path *path, u64 min_objectid, 1811 struct btrfs_path *path, u64 min_objectid,
@@ -1840,7 +1855,9 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
1840void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p); 1855void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p);
1841struct btrfs_path *btrfs_alloc_path(void); 1856struct btrfs_path *btrfs_alloc_path(void);
1842void btrfs_free_path(struct btrfs_path *p); 1857void btrfs_free_path(struct btrfs_path *p);
1843void btrfs_init_path(struct btrfs_path *p); 1858void btrfs_set_path_blocking(struct btrfs_path *p);
1859void btrfs_unlock_up_safe(struct btrfs_path *p, int level);
1860
1844int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, 1861int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1845 struct btrfs_path *path, int slot, int nr); 1862 struct btrfs_path *path, int slot, int nr);
1846int btrfs_del_leaf(struct btrfs_trans_handle *trans, 1863int btrfs_del_leaf(struct btrfs_trans_handle *trans,
@@ -2034,8 +2051,6 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
2034unsigned long btrfs_force_ra(struct address_space *mapping, 2051unsigned long btrfs_force_ra(struct address_space *mapping,
2035 struct file_ra_state *ra, struct file *file, 2052 struct file_ra_state *ra, struct file *file,
2036 pgoff_t offset, pgoff_t last_index); 2053 pgoff_t offset, pgoff_t last_index);
2037int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
2038 int for_del);
2039int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page); 2054int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page);
2040int btrfs_readpage(struct file *file, struct page *page); 2055int btrfs_readpage(struct file *file, struct page *page);
2041void btrfs_delete_inode(struct inode *inode); 2056void btrfs_delete_inode(struct inode *inode);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 81a313874ae5..3e18175248e0 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -16,7 +16,6 @@
16 * Boston, MA 021110-1307, USA. 16 * Boston, MA 021110-1307, USA.
17 */ 17 */
18 18
19#include <linux/version.h>
20#include <linux/fs.h> 19#include <linux/fs.h>
21#include <linux/blkdev.h> 20#include <linux/blkdev.h>
22#include <linux/scatterlist.h> 21#include <linux/scatterlist.h>
@@ -76,6 +75,40 @@ struct async_submit_bio {
76 struct btrfs_work work; 75 struct btrfs_work work;
77}; 76};
78 77
78/* These are used to set the lockdep class on the extent buffer locks.
79 * The class is set by the readpage_end_io_hook after the buffer has
80 * passed csum validation but before the pages are unlocked.
81 *
82 * The lockdep class is also set by btrfs_init_new_buffer on freshly
83 * allocated blocks.
84 *
85 * The class is based on the level in the tree block, which allows lockdep
86 * to know that lower nodes nest inside the locks of higher nodes.
87 *
88 * We also add a check to make sure the highest level of the tree is
89 * the same as our lockdep setup here. If BTRFS_MAX_LEVEL changes, this
90 * code needs update as well.
91 */
92#ifdef CONFIG_DEBUG_LOCK_ALLOC
93# if BTRFS_MAX_LEVEL != 8
94# error
95# endif
96static struct lock_class_key btrfs_eb_class[BTRFS_MAX_LEVEL + 1];
97static const char *btrfs_eb_name[BTRFS_MAX_LEVEL + 1] = {
98 /* leaf */
99 "btrfs-extent-00",
100 "btrfs-extent-01",
101 "btrfs-extent-02",
102 "btrfs-extent-03",
103 "btrfs-extent-04",
104 "btrfs-extent-05",
105 "btrfs-extent-06",
106 "btrfs-extent-07",
107 /* highest possible level */
108 "btrfs-extent-08",
109};
110#endif
111
79/* 112/*
80 * extents on the btree inode are pretty simple, there's one extent 113 * extents on the btree inode are pretty simple, there's one extent
81 * that covers the entire device 114 * that covers the entire device
@@ -348,6 +381,15 @@ static int check_tree_block_fsid(struct btrfs_root *root,
348 return ret; 381 return ret;
349} 382}
350 383
384#ifdef CONFIG_DEBUG_LOCK_ALLOC
385void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level)
386{
387 lockdep_set_class_and_name(&eb->lock,
388 &btrfs_eb_class[level],
389 btrfs_eb_name[level]);
390}
391#endif
392
351static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, 393static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
352 struct extent_state *state) 394 struct extent_state *state)
353{ 395{
@@ -393,6 +435,8 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
393 } 435 }
394 found_level = btrfs_header_level(eb); 436 found_level = btrfs_header_level(eb);
395 437
438 btrfs_set_buffer_lockdep_class(eb, found_level);
439
396 ret = csum_tree_block(root, eb, 1); 440 ret = csum_tree_block(root, eb, 1);
397 if (ret) 441 if (ret)
398 ret = -EIO; 442 ret = -EIO;
@@ -800,7 +844,7 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
800 ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); 844 ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
801 845
802 if (ret == 0) 846 if (ret == 0)
803 buf->flags |= EXTENT_UPTODATE; 847 set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags);
804 else 848 else
805 WARN_ON(1); 849 WARN_ON(1);
806 return buf; 850 return buf;
@@ -813,7 +857,11 @@ int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
813 struct inode *btree_inode = root->fs_info->btree_inode; 857 struct inode *btree_inode = root->fs_info->btree_inode;
814 if (btrfs_header_generation(buf) == 858 if (btrfs_header_generation(buf) ==
815 root->fs_info->running_transaction->transid) { 859 root->fs_info->running_transaction->transid) {
816 WARN_ON(!btrfs_tree_locked(buf)); 860 btrfs_assert_tree_locked(buf);
861
862 /* ugh, clear_extent_buffer_dirty can be expensive */
863 btrfs_set_lock_blocking(buf);
864
817 clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, 865 clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
818 buf); 866 buf);
819 } 867 }
@@ -850,6 +898,14 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
850 spin_lock_init(&root->list_lock); 898 spin_lock_init(&root->list_lock);
851 mutex_init(&root->objectid_mutex); 899 mutex_init(&root->objectid_mutex);
852 mutex_init(&root->log_mutex); 900 mutex_init(&root->log_mutex);
901 init_waitqueue_head(&root->log_writer_wait);
902 init_waitqueue_head(&root->log_commit_wait[0]);
903 init_waitqueue_head(&root->log_commit_wait[1]);
904 atomic_set(&root->log_commit[0], 0);
905 atomic_set(&root->log_commit[1], 0);
906 atomic_set(&root->log_writers, 0);
907 root->log_batch = 0;
908 root->log_transid = 0;
853 extent_io_tree_init(&root->dirty_log_pages, 909 extent_io_tree_init(&root->dirty_log_pages,
854 fs_info->btree_inode->i_mapping, GFP_NOFS); 910 fs_info->btree_inode->i_mapping, GFP_NOFS);
855 911
@@ -934,15 +990,16 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
934 return 0; 990 return 0;
935} 991}
936 992
937int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, 993static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
938 struct btrfs_fs_info *fs_info) 994 struct btrfs_fs_info *fs_info)
939{ 995{
940 struct btrfs_root *root; 996 struct btrfs_root *root;
941 struct btrfs_root *tree_root = fs_info->tree_root; 997 struct btrfs_root *tree_root = fs_info->tree_root;
998 struct extent_buffer *leaf;
942 999
943 root = kzalloc(sizeof(*root), GFP_NOFS); 1000 root = kzalloc(sizeof(*root), GFP_NOFS);
944 if (!root) 1001 if (!root)
945 return -ENOMEM; 1002 return ERR_PTR(-ENOMEM);
946 1003
947 __setup_root(tree_root->nodesize, tree_root->leafsize, 1004 __setup_root(tree_root->nodesize, tree_root->leafsize,
948 tree_root->sectorsize, tree_root->stripesize, 1005 tree_root->sectorsize, tree_root->stripesize,
@@ -951,12 +1008,23 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
951 root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID; 1008 root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
952 root->root_key.type = BTRFS_ROOT_ITEM_KEY; 1009 root->root_key.type = BTRFS_ROOT_ITEM_KEY;
953 root->root_key.offset = BTRFS_TREE_LOG_OBJECTID; 1010 root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;
1011 /*
1012 * log trees do not get reference counted because they go away
1013 * before a real commit is actually done. They do store pointers
1014 * to file data extents, and those reference counts still get
1015 * updated (along with back refs to the log tree).
1016 */
954 root->ref_cows = 0; 1017 root->ref_cows = 0;
955 1018
956 root->node = btrfs_alloc_free_block(trans, root, root->leafsize, 1019 leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
957 0, BTRFS_TREE_LOG_OBJECTID, 1020 0, BTRFS_TREE_LOG_OBJECTID,
958 trans->transid, 0, 0, 0); 1021 trans->transid, 0, 0, 0);
1022 if (IS_ERR(leaf)) {
1023 kfree(root);
1024 return ERR_CAST(leaf);
1025 }
959 1026
1027 root->node = leaf;
960 btrfs_set_header_nritems(root->node, 0); 1028 btrfs_set_header_nritems(root->node, 0);
961 btrfs_set_header_level(root->node, 0); 1029 btrfs_set_header_level(root->node, 0);
962 btrfs_set_header_bytenr(root->node, root->node->start); 1030 btrfs_set_header_bytenr(root->node, root->node->start);
@@ -968,7 +1036,48 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
968 BTRFS_FSID_SIZE); 1036 BTRFS_FSID_SIZE);
969 btrfs_mark_buffer_dirty(root->node); 1037 btrfs_mark_buffer_dirty(root->node);
970 btrfs_tree_unlock(root->node); 1038 btrfs_tree_unlock(root->node);
971 fs_info->log_root_tree = root; 1039 return root;
1040}
1041
1042int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
1043 struct btrfs_fs_info *fs_info)
1044{
1045 struct btrfs_root *log_root;
1046
1047 log_root = alloc_log_tree(trans, fs_info);
1048 if (IS_ERR(log_root))
1049 return PTR_ERR(log_root);
1050 WARN_ON(fs_info->log_root_tree);
1051 fs_info->log_root_tree = log_root;
1052 return 0;
1053}
1054
1055int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
1056 struct btrfs_root *root)
1057{
1058 struct btrfs_root *log_root;
1059 struct btrfs_inode_item *inode_item;
1060
1061 log_root = alloc_log_tree(trans, root->fs_info);
1062 if (IS_ERR(log_root))
1063 return PTR_ERR(log_root);
1064
1065 log_root->last_trans = trans->transid;
1066 log_root->root_key.offset = root->root_key.objectid;
1067
1068 inode_item = &log_root->root_item.inode;
1069 inode_item->generation = cpu_to_le64(1);
1070 inode_item->size = cpu_to_le64(3);
1071 inode_item->nlink = cpu_to_le32(1);
1072 inode_item->nbytes = cpu_to_le64(root->leafsize);
1073 inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
1074
1075 btrfs_set_root_bytenr(&log_root->root_item, log_root->node->start);
1076 btrfs_set_root_generation(&log_root->root_item, trans->transid);
1077
1078 WARN_ON(root->log_root);
1079 root->log_root = log_root;
1080 root->log_transid = 0;
972 return 0; 1081 return 0;
973} 1082}
974 1083
@@ -1136,7 +1245,6 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
1136{ 1245{
1137 struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data; 1246 struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data;
1138 int ret = 0; 1247 int ret = 0;
1139 struct list_head *cur;
1140 struct btrfs_device *device; 1248 struct btrfs_device *device;
1141 struct backing_dev_info *bdi; 1249 struct backing_dev_info *bdi;
1142#if 0 1250#if 0
@@ -1144,8 +1252,7 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
1144 btrfs_congested_async(info, 0)) 1252 btrfs_congested_async(info, 0))
1145 return 1; 1253 return 1;
1146#endif 1254#endif
1147 list_for_each(cur, &info->fs_devices->devices) { 1255 list_for_each_entry(device, &info->fs_devices->devices, dev_list) {
1148 device = list_entry(cur, struct btrfs_device, dev_list);
1149 if (!device->bdev) 1256 if (!device->bdev)
1150 continue; 1257 continue;
1151 bdi = blk_get_backing_dev_info(device->bdev); 1258 bdi = blk_get_backing_dev_info(device->bdev);
@@ -1163,13 +1270,11 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
1163 */ 1270 */
1164static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page) 1271static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
1165{ 1272{
1166 struct list_head *cur;
1167 struct btrfs_device *device; 1273 struct btrfs_device *device;
1168 struct btrfs_fs_info *info; 1274 struct btrfs_fs_info *info;
1169 1275
1170 info = (struct btrfs_fs_info *)bdi->unplug_io_data; 1276 info = (struct btrfs_fs_info *)bdi->unplug_io_data;
1171 list_for_each(cur, &info->fs_devices->devices) { 1277 list_for_each_entry(device, &info->fs_devices->devices, dev_list) {
1172 device = list_entry(cur, struct btrfs_device, dev_list);
1173 if (!device->bdev) 1278 if (!device->bdev)
1174 continue; 1279 continue;
1175 1280
@@ -1447,7 +1552,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1447 INIT_LIST_HEAD(&fs_info->dead_roots); 1552 INIT_LIST_HEAD(&fs_info->dead_roots);
1448 INIT_LIST_HEAD(&fs_info->hashers); 1553 INIT_LIST_HEAD(&fs_info->hashers);
1449 INIT_LIST_HEAD(&fs_info->delalloc_inodes); 1554 INIT_LIST_HEAD(&fs_info->delalloc_inodes);
1450 spin_lock_init(&fs_info->hash_lock);
1451 spin_lock_init(&fs_info->delalloc_lock); 1555 spin_lock_init(&fs_info->delalloc_lock);
1452 spin_lock_init(&fs_info->new_trans_lock); 1556 spin_lock_init(&fs_info->new_trans_lock);
1453 spin_lock_init(&fs_info->ref_cache_lock); 1557 spin_lock_init(&fs_info->ref_cache_lock);
@@ -1535,10 +1639,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1535 init_waitqueue_head(&fs_info->transaction_throttle); 1639 init_waitqueue_head(&fs_info->transaction_throttle);
1536 init_waitqueue_head(&fs_info->transaction_wait); 1640 init_waitqueue_head(&fs_info->transaction_wait);
1537 init_waitqueue_head(&fs_info->async_submit_wait); 1641 init_waitqueue_head(&fs_info->async_submit_wait);
1538 init_waitqueue_head(&fs_info->tree_log_wait);
1539 atomic_set(&fs_info->tree_log_commit, 0);
1540 atomic_set(&fs_info->tree_log_writers, 0);
1541 fs_info->tree_log_transid = 0;
1542 1642
1543 __setup_root(4096, 4096, 4096, 4096, tree_root, 1643 __setup_root(4096, 4096, 4096, 4096, tree_root,
1544 fs_info, BTRFS_ROOT_TREE_OBJECTID); 1644 fs_info, BTRFS_ROOT_TREE_OBJECTID);
@@ -1627,6 +1727,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1627 * low idle thresh 1727 * low idle thresh
1628 */ 1728 */
1629 fs_info->endio_workers.idle_thresh = 4; 1729 fs_info->endio_workers.idle_thresh = 4;
1730 fs_info->endio_meta_workers.idle_thresh = 4;
1731
1630 fs_info->endio_write_workers.idle_thresh = 64; 1732 fs_info->endio_write_workers.idle_thresh = 64;
1631 fs_info->endio_meta_write_workers.idle_thresh = 64; 1733 fs_info->endio_meta_write_workers.idle_thresh = 64;
1632 1734
@@ -1720,7 +1822,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1720 ret = find_and_setup_root(tree_root, fs_info, 1822 ret = find_and_setup_root(tree_root, fs_info,
1721 BTRFS_DEV_TREE_OBJECTID, dev_root); 1823 BTRFS_DEV_TREE_OBJECTID, dev_root);
1722 dev_root->track_dirty = 1; 1824 dev_root->track_dirty = 1;
1723
1724 if (ret) 1825 if (ret)
1725 goto fail_extent_root; 1826 goto fail_extent_root;
1726 1827
@@ -1740,13 +1841,13 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1740 fs_info->system_alloc_profile = fs_info->metadata_alloc_profile; 1841 fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
1741 fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, 1842 fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
1742 "btrfs-cleaner"); 1843 "btrfs-cleaner");
1743 if (!fs_info->cleaner_kthread) 1844 if (IS_ERR(fs_info->cleaner_kthread))
1744 goto fail_csum_root; 1845 goto fail_csum_root;
1745 1846
1746 fs_info->transaction_kthread = kthread_run(transaction_kthread, 1847 fs_info->transaction_kthread = kthread_run(transaction_kthread,
1747 tree_root, 1848 tree_root,
1748 "btrfs-transaction"); 1849 "btrfs-transaction");
1749 if (!fs_info->transaction_kthread) 1850 if (IS_ERR(fs_info->transaction_kthread))
1750 goto fail_cleaner; 1851 goto fail_cleaner;
1751 1852
1752 if (btrfs_super_log_root(disk_super) != 0) { 1853 if (btrfs_super_log_root(disk_super) != 0) {
@@ -1828,13 +1929,14 @@ fail_sb_buffer:
1828fail_iput: 1929fail_iput:
1829 invalidate_inode_pages2(fs_info->btree_inode->i_mapping); 1930 invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
1830 iput(fs_info->btree_inode); 1931 iput(fs_info->btree_inode);
1831fail: 1932
1832 btrfs_close_devices(fs_info->fs_devices); 1933 btrfs_close_devices(fs_info->fs_devices);
1833 btrfs_mapping_tree_free(&fs_info->mapping_tree); 1934 btrfs_mapping_tree_free(&fs_info->mapping_tree);
1935 bdi_destroy(&fs_info->bdi);
1834 1936
1937fail:
1835 kfree(extent_root); 1938 kfree(extent_root);
1836 kfree(tree_root); 1939 kfree(tree_root);
1837 bdi_destroy(&fs_info->bdi);
1838 kfree(fs_info); 1940 kfree(fs_info);
1839 kfree(chunk_root); 1941 kfree(chunk_root);
1840 kfree(dev_root); 1942 kfree(dev_root);
@@ -1995,7 +2097,6 @@ static int write_dev_supers(struct btrfs_device *device,
1995 2097
1996int write_all_supers(struct btrfs_root *root, int max_mirrors) 2098int write_all_supers(struct btrfs_root *root, int max_mirrors)
1997{ 2099{
1998 struct list_head *cur;
1999 struct list_head *head = &root->fs_info->fs_devices->devices; 2100 struct list_head *head = &root->fs_info->fs_devices->devices;
2000 struct btrfs_device *dev; 2101 struct btrfs_device *dev;
2001 struct btrfs_super_block *sb; 2102 struct btrfs_super_block *sb;
@@ -2011,8 +2112,7 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
2011 2112
2012 sb = &root->fs_info->super_for_commit; 2113 sb = &root->fs_info->super_for_commit;
2013 dev_item = &sb->dev_item; 2114 dev_item = &sb->dev_item;
2014 list_for_each(cur, head) { 2115 list_for_each_entry(dev, head, dev_list) {
2015 dev = list_entry(cur, struct btrfs_device, dev_list);
2016 if (!dev->bdev) { 2116 if (!dev->bdev) {
2017 total_errors++; 2117 total_errors++;
2018 continue; 2118 continue;
@@ -2045,8 +2145,7 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
2045 } 2145 }
2046 2146
2047 total_errors = 0; 2147 total_errors = 0;
2048 list_for_each(cur, head) { 2148 list_for_each_entry(dev, head, dev_list) {
2049 dev = list_entry(cur, struct btrfs_device, dev_list);
2050 if (!dev->bdev) 2149 if (!dev->bdev)
2051 continue; 2150 continue;
2052 if (!dev->in_fs_metadata || !dev->writeable) 2151 if (!dev->in_fs_metadata || !dev->writeable)
@@ -2260,7 +2359,9 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
2260 u64 transid = btrfs_header_generation(buf); 2359 u64 transid = btrfs_header_generation(buf);
2261 struct inode *btree_inode = root->fs_info->btree_inode; 2360 struct inode *btree_inode = root->fs_info->btree_inode;
2262 2361
2263 WARN_ON(!btrfs_tree_locked(buf)); 2362 btrfs_set_lock_blocking(buf);
2363
2364 btrfs_assert_tree_locked(buf);
2264 if (transid != root->fs_info->generation) { 2365 if (transid != root->fs_info->generation) {
2265 printk(KERN_CRIT "btrfs transid mismatch buffer %llu, " 2366 printk(KERN_CRIT "btrfs transid mismatch buffer %llu, "
2266 "found %llu running %llu\n", 2367 "found %llu running %llu\n",
@@ -2302,14 +2403,13 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
2302 int ret; 2403 int ret;
2303 ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); 2404 ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
2304 if (ret == 0) 2405 if (ret == 0)
2305 buf->flags |= EXTENT_UPTODATE; 2406 set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags);
2306 return ret; 2407 return ret;
2307} 2408}
2308 2409
2309int btree_lock_page_hook(struct page *page) 2410int btree_lock_page_hook(struct page *page)
2310{ 2411{
2311 struct inode *inode = page->mapping->host; 2412 struct inode *inode = page->mapping->host;
2312 struct btrfs_root *root = BTRFS_I(inode)->root;
2313 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 2413 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2314 struct extent_buffer *eb; 2414 struct extent_buffer *eb;
2315 unsigned long len; 2415 unsigned long len;
@@ -2324,9 +2424,7 @@ int btree_lock_page_hook(struct page *page)
2324 goto out; 2424 goto out;
2325 2425
2326 btrfs_tree_lock(eb); 2426 btrfs_tree_lock(eb);
2327 spin_lock(&root->fs_info->hash_lock);
2328 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); 2427 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
2329 spin_unlock(&root->fs_info->hash_lock);
2330 btrfs_tree_unlock(eb); 2428 btrfs_tree_unlock(eb);
2331 free_extent_buffer(eb); 2429 free_extent_buffer(eb);
2332out: 2430out:
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index c0ff404c31b7..95029db227be 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -98,5 +98,17 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
98 struct btrfs_fs_info *fs_info); 98 struct btrfs_fs_info *fs_info);
99int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, 99int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
100 struct btrfs_fs_info *fs_info); 100 struct btrfs_fs_info *fs_info);
101int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
102 struct btrfs_root *root);
101int btree_lock_page_hook(struct page *page); 103int btree_lock_page_hook(struct page *page);
104
105
106#ifdef CONFIG_DEBUG_LOCK_ALLOC
107void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level);
108#else
109static inline void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb,
110 int level)
111{
112}
113#endif
102#endif 114#endif
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 293da650873f..9abf81f71c46 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -19,7 +19,7 @@
19#include <linux/pagemap.h> 19#include <linux/pagemap.h>
20#include <linux/writeback.h> 20#include <linux/writeback.h>
21#include <linux/blkdev.h> 21#include <linux/blkdev.h>
22#include <linux/version.h> 22#include <linux/sort.h>
23#include "compat.h" 23#include "compat.h"
24#include "hash.h" 24#include "hash.h"
25#include "crc32c.h" 25#include "crc32c.h"
@@ -30,7 +30,6 @@
30#include "volumes.h" 30#include "volumes.h"
31#include "locking.h" 31#include "locking.h"
32#include "ref-cache.h" 32#include "ref-cache.h"
33#include "compat.h"
34 33
35#define PENDING_EXTENT_INSERT 0 34#define PENDING_EXTENT_INSERT 0
36#define PENDING_EXTENT_DELETE 1 35#define PENDING_EXTENT_DELETE 1
@@ -61,6 +60,10 @@ static int update_block_group(struct btrfs_trans_handle *trans,
61 u64 bytenr, u64 num_bytes, int alloc, 60 u64 bytenr, u64 num_bytes, int alloc,
62 int mark_free); 61 int mark_free);
63 62
63static int do_chunk_alloc(struct btrfs_trans_handle *trans,
64 struct btrfs_root *extent_root, u64 alloc_bytes,
65 u64 flags, int force);
66
64static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits) 67static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
65{ 68{
66 return (cache->flags & bits) == bits; 69 return (cache->flags & bits) == bits;
@@ -326,10 +329,8 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
326 u64 flags) 329 u64 flags)
327{ 330{
328 struct list_head *head = &info->space_info; 331 struct list_head *head = &info->space_info;
329 struct list_head *cur;
330 struct btrfs_space_info *found; 332 struct btrfs_space_info *found;
331 list_for_each(cur, head) { 333 list_for_each_entry(found, head, list) {
332 found = list_entry(cur, struct btrfs_space_info, list);
333 if (found->flags == flags) 334 if (found->flags == flags)
334 return found; 335 return found;
335 } 336 }
@@ -1326,8 +1327,25 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1326int btrfs_extent_post_op(struct btrfs_trans_handle *trans, 1327int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
1327 struct btrfs_root *root) 1328 struct btrfs_root *root)
1328{ 1329{
1329 finish_current_insert(trans, root->fs_info->extent_root, 1); 1330 u64 start;
1330 del_pending_extents(trans, root->fs_info->extent_root, 1); 1331 u64 end;
1332 int ret;
1333
1334 while(1) {
1335 finish_current_insert(trans, root->fs_info->extent_root, 1);
1336 del_pending_extents(trans, root->fs_info->extent_root, 1);
1337
1338 /* is there more work to do? */
1339 ret = find_first_extent_bit(&root->fs_info->pending_del,
1340 0, &start, &end, EXTENT_WRITEBACK);
1341 if (!ret)
1342 continue;
1343 ret = find_first_extent_bit(&root->fs_info->extent_ins,
1344 0, &start, &end, EXTENT_WRITEBACK);
1345 if (!ret)
1346 continue;
1347 break;
1348 }
1331 return 0; 1349 return 0;
1332} 1350}
1333 1351
@@ -1525,15 +1543,55 @@ out:
1525 return ret; 1543 return ret;
1526} 1544}
1527 1545
1528int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 1546/* when a block goes through cow, we update the reference counts of
1529 struct extent_buffer *orig_buf, struct extent_buffer *buf, 1547 * everything that block points to. The internal pointers of the block
1530 u32 *nr_extents) 1548 * can be in just about any order, and it is likely to have clusters of
1549 * things that are close together and clusters of things that are not.
1550 *
1551 * To help reduce the seeks that come with updating all of these reference
1552 * counts, sort them by byte number before actual updates are done.
1553 *
1554 * struct refsort is used to match byte number to slot in the btree block.
1555 * we sort based on the byte number and then use the slot to actually
1556 * find the item.
1557 *
1558 * struct refsort is smaller than strcut btrfs_item and smaller than
1559 * struct btrfs_key_ptr. Since we're currently limited to the page size
1560 * for a btree block, there's no way for a kmalloc of refsorts for a
1561 * single node to be bigger than a page.
1562 */
1563struct refsort {
1564 u64 bytenr;
1565 u32 slot;
1566};
1567
1568/*
1569 * for passing into sort()
1570 */
1571static int refsort_cmp(const void *a_void, const void *b_void)
1572{
1573 const struct refsort *a = a_void;
1574 const struct refsort *b = b_void;
1575
1576 if (a->bytenr < b->bytenr)
1577 return -1;
1578 if (a->bytenr > b->bytenr)
1579 return 1;
1580 return 0;
1581}
1582
1583
1584noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans,
1585 struct btrfs_root *root,
1586 struct extent_buffer *orig_buf,
1587 struct extent_buffer *buf, u32 *nr_extents)
1531{ 1588{
1532 u64 bytenr; 1589 u64 bytenr;
1533 u64 ref_root; 1590 u64 ref_root;
1534 u64 orig_root; 1591 u64 orig_root;
1535 u64 ref_generation; 1592 u64 ref_generation;
1536 u64 orig_generation; 1593 u64 orig_generation;
1594 struct refsort *sorted;
1537 u32 nritems; 1595 u32 nritems;
1538 u32 nr_file_extents = 0; 1596 u32 nr_file_extents = 0;
1539 struct btrfs_key key; 1597 struct btrfs_key key;
@@ -1542,6 +1600,8 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1542 int level; 1600 int level;
1543 int ret = 0; 1601 int ret = 0;
1544 int faili = 0; 1602 int faili = 0;
1603 int refi = 0;
1604 int slot;
1545 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, 1605 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
1546 u64, u64, u64, u64, u64, u64, u64, u64); 1606 u64, u64, u64, u64, u64, u64, u64, u64);
1547 1607
@@ -1553,6 +1613,9 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1553 nritems = btrfs_header_nritems(buf); 1613 nritems = btrfs_header_nritems(buf);
1554 level = btrfs_header_level(buf); 1614 level = btrfs_header_level(buf);
1555 1615
1616 sorted = kmalloc(sizeof(struct refsort) * nritems, GFP_NOFS);
1617 BUG_ON(!sorted);
1618
1556 if (root->ref_cows) { 1619 if (root->ref_cows) {
1557 process_func = __btrfs_inc_extent_ref; 1620 process_func = __btrfs_inc_extent_ref;
1558 } else { 1621 } else {
@@ -1565,6 +1628,11 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1565 process_func = __btrfs_update_extent_ref; 1628 process_func = __btrfs_update_extent_ref;
1566 } 1629 }
1567 1630
1631 /*
1632 * we make two passes through the items. In the first pass we
1633 * only record the byte number and slot. Then we sort based on
1634 * byte number and do the actual work based on the sorted results
1635 */
1568 for (i = 0; i < nritems; i++) { 1636 for (i = 0; i < nritems; i++) {
1569 cond_resched(); 1637 cond_resched();
1570 if (level == 0) { 1638 if (level == 0) {
@@ -1581,6 +1649,32 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1581 continue; 1649 continue;
1582 1650
1583 nr_file_extents++; 1651 nr_file_extents++;
1652 sorted[refi].bytenr = bytenr;
1653 sorted[refi].slot = i;
1654 refi++;
1655 } else {
1656 bytenr = btrfs_node_blockptr(buf, i);
1657 sorted[refi].bytenr = bytenr;
1658 sorted[refi].slot = i;
1659 refi++;
1660 }
1661 }
1662 /*
1663 * if refi == 0, we didn't actually put anything into the sorted
1664 * array and we're done
1665 */
1666 if (refi == 0)
1667 goto out;
1668
1669 sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
1670
1671 for (i = 0; i < refi; i++) {
1672 cond_resched();
1673 slot = sorted[i].slot;
1674 bytenr = sorted[i].bytenr;
1675
1676 if (level == 0) {
1677 btrfs_item_key_to_cpu(buf, &key, slot);
1584 1678
1585 ret = process_func(trans, root, bytenr, 1679 ret = process_func(trans, root, bytenr,
1586 orig_buf->start, buf->start, 1680 orig_buf->start, buf->start,
@@ -1589,25 +1683,25 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1589 key.objectid); 1683 key.objectid);
1590 1684
1591 if (ret) { 1685 if (ret) {
1592 faili = i; 1686 faili = slot;
1593 WARN_ON(1); 1687 WARN_ON(1);
1594 goto fail; 1688 goto fail;
1595 } 1689 }
1596 } else { 1690 } else {
1597 bytenr = btrfs_node_blockptr(buf, i);
1598 ret = process_func(trans, root, bytenr, 1691 ret = process_func(trans, root, bytenr,
1599 orig_buf->start, buf->start, 1692 orig_buf->start, buf->start,
1600 orig_root, ref_root, 1693 orig_root, ref_root,
1601 orig_generation, ref_generation, 1694 orig_generation, ref_generation,
1602 level - 1); 1695 level - 1);
1603 if (ret) { 1696 if (ret) {
1604 faili = i; 1697 faili = slot;
1605 WARN_ON(1); 1698 WARN_ON(1);
1606 goto fail; 1699 goto fail;
1607 } 1700 }
1608 } 1701 }
1609 } 1702 }
1610out: 1703out:
1704 kfree(sorted);
1611 if (nr_extents) { 1705 if (nr_extents) {
1612 if (level == 0) 1706 if (level == 0)
1613 *nr_extents = nr_file_extents; 1707 *nr_extents = nr_file_extents;
@@ -1616,6 +1710,7 @@ out:
1616 } 1710 }
1617 return 0; 1711 return 0;
1618fail: 1712fail:
1713 kfree(sorted);
1619 WARN_ON(1); 1714 WARN_ON(1);
1620 return ret; 1715 return ret;
1621} 1716}
@@ -1818,6 +1913,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
1818 found->bytes_pinned = 0; 1913 found->bytes_pinned = 0;
1819 found->bytes_reserved = 0; 1914 found->bytes_reserved = 0;
1820 found->bytes_readonly = 0; 1915 found->bytes_readonly = 0;
1916 found->bytes_delalloc = 0;
1821 found->full = 0; 1917 found->full = 0;
1822 found->force_alloc = 0; 1918 found->force_alloc = 0;
1823 *space_info = found; 1919 *space_info = found;
@@ -1881,6 +1977,233 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
1881 return flags; 1977 return flags;
1882} 1978}
1883 1979
1980static u64 btrfs_get_alloc_profile(struct btrfs_root *root, u64 data)
1981{
1982 struct btrfs_fs_info *info = root->fs_info;
1983 u64 alloc_profile;
1984
1985 if (data) {
1986 alloc_profile = info->avail_data_alloc_bits &
1987 info->data_alloc_profile;
1988 data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
1989 } else if (root == root->fs_info->chunk_root) {
1990 alloc_profile = info->avail_system_alloc_bits &
1991 info->system_alloc_profile;
1992 data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
1993 } else {
1994 alloc_profile = info->avail_metadata_alloc_bits &
1995 info->metadata_alloc_profile;
1996 data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
1997 }
1998
1999 return btrfs_reduce_alloc_profile(root, data);
2000}
2001
2002void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
2003{
2004 u64 alloc_target;
2005
2006 alloc_target = btrfs_get_alloc_profile(root, 1);
2007 BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
2008 alloc_target);
2009}
2010
2011/*
2012 * for now this just makes sure we have at least 5% of our metadata space free
2013 * for use.
2014 */
2015int btrfs_check_metadata_free_space(struct btrfs_root *root)
2016{
2017 struct btrfs_fs_info *info = root->fs_info;
2018 struct btrfs_space_info *meta_sinfo;
2019 u64 alloc_target, thresh;
2020 int committed = 0, ret;
2021
2022 /* get the space info for where the metadata will live */
2023 alloc_target = btrfs_get_alloc_profile(root, 0);
2024 meta_sinfo = __find_space_info(info, alloc_target);
2025
2026again:
2027 spin_lock(&meta_sinfo->lock);
2028 if (!meta_sinfo->full)
2029 thresh = meta_sinfo->total_bytes * 80;
2030 else
2031 thresh = meta_sinfo->total_bytes * 95;
2032
2033 do_div(thresh, 100);
2034
2035 if (meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
2036 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly > thresh) {
2037 struct btrfs_trans_handle *trans;
2038 if (!meta_sinfo->full) {
2039 meta_sinfo->force_alloc = 1;
2040 spin_unlock(&meta_sinfo->lock);
2041
2042 trans = btrfs_start_transaction(root, 1);
2043 if (!trans)
2044 return -ENOMEM;
2045
2046 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
2047 2 * 1024 * 1024, alloc_target, 0);
2048 btrfs_end_transaction(trans, root);
2049 goto again;
2050 }
2051 spin_unlock(&meta_sinfo->lock);
2052
2053 if (!committed) {
2054 committed = 1;
2055 trans = btrfs_join_transaction(root, 1);
2056 if (!trans)
2057 return -ENOMEM;
2058 ret = btrfs_commit_transaction(trans, root);
2059 if (ret)
2060 return ret;
2061 goto again;
2062 }
2063 return -ENOSPC;
2064 }
2065 spin_unlock(&meta_sinfo->lock);
2066
2067 return 0;
2068}
2069
2070/*
2071 * This will check the space that the inode allocates from to make sure we have
2072 * enough space for bytes.
2073 */
2074int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
2075 u64 bytes)
2076{
2077 struct btrfs_space_info *data_sinfo;
2078 int ret = 0, committed = 0;
2079
2080 /* make sure bytes are sectorsize aligned */
2081 bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
2082
2083 data_sinfo = BTRFS_I(inode)->space_info;
2084again:
2085 /* make sure we have enough space to handle the data first */
2086 spin_lock(&data_sinfo->lock);
2087 if (data_sinfo->total_bytes - data_sinfo->bytes_used -
2088 data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved -
2089 data_sinfo->bytes_pinned - data_sinfo->bytes_readonly -
2090 data_sinfo->bytes_may_use < bytes) {
2091 struct btrfs_trans_handle *trans;
2092
2093 /*
2094 * if we don't have enough free bytes in this space then we need
2095 * to alloc a new chunk.
2096 */
2097 if (!data_sinfo->full) {
2098 u64 alloc_target;
2099
2100 data_sinfo->force_alloc = 1;
2101 spin_unlock(&data_sinfo->lock);
2102
2103 alloc_target = btrfs_get_alloc_profile(root, 1);
2104 trans = btrfs_start_transaction(root, 1);
2105 if (!trans)
2106 return -ENOMEM;
2107
2108 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
2109 bytes + 2 * 1024 * 1024,
2110 alloc_target, 0);
2111 btrfs_end_transaction(trans, root);
2112 if (ret)
2113 return ret;
2114 goto again;
2115 }
2116 spin_unlock(&data_sinfo->lock);
2117
2118 /* commit the current transaction and try again */
2119 if (!committed) {
2120 committed = 1;
2121 trans = btrfs_join_transaction(root, 1);
2122 if (!trans)
2123 return -ENOMEM;
2124 ret = btrfs_commit_transaction(trans, root);
2125 if (ret)
2126 return ret;
2127 goto again;
2128 }
2129
2130 printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes"
2131 ", %llu bytes_used, %llu bytes_reserved, "
2132 "%llu bytes_pinned, %llu bytes_readonly, %llu may use"
2133 "%llu total\n", bytes, data_sinfo->bytes_delalloc,
2134 data_sinfo->bytes_used, data_sinfo->bytes_reserved,
2135 data_sinfo->bytes_pinned, data_sinfo->bytes_readonly,
2136 data_sinfo->bytes_may_use, data_sinfo->total_bytes);
2137 return -ENOSPC;
2138 }
2139 data_sinfo->bytes_may_use += bytes;
2140 BTRFS_I(inode)->reserved_bytes += bytes;
2141 spin_unlock(&data_sinfo->lock);
2142
2143 return btrfs_check_metadata_free_space(root);
2144}
2145
2146/*
2147 * if there was an error for whatever reason after calling
2148 * btrfs_check_data_free_space, call this so we can cleanup the counters.
2149 */
2150void btrfs_free_reserved_data_space(struct btrfs_root *root,
2151 struct inode *inode, u64 bytes)
2152{
2153 struct btrfs_space_info *data_sinfo;
2154
2155 /* make sure bytes are sectorsize aligned */
2156 bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
2157
2158 data_sinfo = BTRFS_I(inode)->space_info;
2159 spin_lock(&data_sinfo->lock);
2160 data_sinfo->bytes_may_use -= bytes;
2161 BTRFS_I(inode)->reserved_bytes -= bytes;
2162 spin_unlock(&data_sinfo->lock);
2163}
2164
2165/* called when we are adding a delalloc extent to the inode's io_tree */
2166void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
2167 u64 bytes)
2168{
2169 struct btrfs_space_info *data_sinfo;
2170
2171 /* get the space info for where this inode will be storing its data */
2172 data_sinfo = BTRFS_I(inode)->space_info;
2173
2174 /* make sure we have enough space to handle the data first */
2175 spin_lock(&data_sinfo->lock);
2176 data_sinfo->bytes_delalloc += bytes;
2177
2178 /*
2179 * we are adding a delalloc extent without calling
2180 * btrfs_check_data_free_space first. This happens on a weird
2181 * writepage condition, but shouldn't hurt our accounting
2182 */
2183 if (unlikely(bytes > BTRFS_I(inode)->reserved_bytes)) {
2184 data_sinfo->bytes_may_use -= BTRFS_I(inode)->reserved_bytes;
2185 BTRFS_I(inode)->reserved_bytes = 0;
2186 } else {
2187 data_sinfo->bytes_may_use -= bytes;
2188 BTRFS_I(inode)->reserved_bytes -= bytes;
2189 }
2190
2191 spin_unlock(&data_sinfo->lock);
2192}
2193
2194/* called when we are clearing an delalloc extent from the inode's io_tree */
2195void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
2196 u64 bytes)
2197{
2198 struct btrfs_space_info *info;
2199
2200 info = BTRFS_I(inode)->space_info;
2201
2202 spin_lock(&info->lock);
2203 info->bytes_delalloc -= bytes;
2204 spin_unlock(&info->lock);
2205}
2206
1884static int do_chunk_alloc(struct btrfs_trans_handle *trans, 2207static int do_chunk_alloc(struct btrfs_trans_handle *trans,
1885 struct btrfs_root *extent_root, u64 alloc_bytes, 2208 struct btrfs_root *extent_root, u64 alloc_bytes,
1886 u64 flags, int force) 2209 u64 flags, int force)
@@ -2137,13 +2460,12 @@ static int finish_current_insert(struct btrfs_trans_handle *trans,
2137 u64 end; 2460 u64 end;
2138 u64 priv; 2461 u64 priv;
2139 u64 search = 0; 2462 u64 search = 0;
2140 u64 skipped = 0;
2141 struct btrfs_fs_info *info = extent_root->fs_info; 2463 struct btrfs_fs_info *info = extent_root->fs_info;
2142 struct btrfs_path *path; 2464 struct btrfs_path *path;
2143 struct pending_extent_op *extent_op, *tmp; 2465 struct pending_extent_op *extent_op, *tmp;
2144 struct list_head insert_list, update_list; 2466 struct list_head insert_list, update_list;
2145 int ret; 2467 int ret;
2146 int num_inserts = 0, max_inserts; 2468 int num_inserts = 0, max_inserts, restart = 0;
2147 2469
2148 path = btrfs_alloc_path(); 2470 path = btrfs_alloc_path();
2149 INIT_LIST_HEAD(&insert_list); 2471 INIT_LIST_HEAD(&insert_list);
@@ -2159,18 +2481,19 @@ again:
2159 ret = find_first_extent_bit(&info->extent_ins, search, &start, 2481 ret = find_first_extent_bit(&info->extent_ins, search, &start,
2160 &end, EXTENT_WRITEBACK); 2482 &end, EXTENT_WRITEBACK);
2161 if (ret) { 2483 if (ret) {
2162 if (skipped && all && !num_inserts) { 2484 if (restart && !num_inserts &&
2163 skipped = 0; 2485 list_empty(&update_list)) {
2486 restart = 0;
2164 search = 0; 2487 search = 0;
2165 continue; 2488 continue;
2166 } 2489 }
2167 mutex_unlock(&info->extent_ins_mutex);
2168 break; 2490 break;
2169 } 2491 }
2170 2492
2171 ret = try_lock_extent(&info->extent_ins, start, end, GFP_NOFS); 2493 ret = try_lock_extent(&info->extent_ins, start, end, GFP_NOFS);
2172 if (!ret) { 2494 if (!ret) {
2173 skipped = 1; 2495 if (all)
2496 restart = 1;
2174 search = end + 1; 2497 search = end + 1;
2175 if (need_resched()) { 2498 if (need_resched()) {
2176 mutex_unlock(&info->extent_ins_mutex); 2499 mutex_unlock(&info->extent_ins_mutex);
@@ -2189,7 +2512,7 @@ again:
2189 list_add_tail(&extent_op->list, &insert_list); 2512 list_add_tail(&extent_op->list, &insert_list);
2190 search = end + 1; 2513 search = end + 1;
2191 if (num_inserts == max_inserts) { 2514 if (num_inserts == max_inserts) {
2192 mutex_unlock(&info->extent_ins_mutex); 2515 restart = 1;
2193 break; 2516 break;
2194 } 2517 }
2195 } else if (extent_op->type == PENDING_BACKREF_UPDATE) { 2518 } else if (extent_op->type == PENDING_BACKREF_UPDATE) {
@@ -2205,7 +2528,6 @@ again:
2205 * somebody marked this thing for deletion then just unlock it and be 2528 * somebody marked this thing for deletion then just unlock it and be
2206 * done, the free_extents will handle it 2529 * done, the free_extents will handle it
2207 */ 2530 */
2208 mutex_lock(&info->extent_ins_mutex);
2209 list_for_each_entry_safe(extent_op, tmp, &update_list, list) { 2531 list_for_each_entry_safe(extent_op, tmp, &update_list, list) {
2210 clear_extent_bits(&info->extent_ins, extent_op->bytenr, 2532 clear_extent_bits(&info->extent_ins, extent_op->bytenr,
2211 extent_op->bytenr + extent_op->num_bytes - 1, 2533 extent_op->bytenr + extent_op->num_bytes - 1,
@@ -2227,6 +2549,10 @@ again:
2227 if (!list_empty(&update_list)) { 2549 if (!list_empty(&update_list)) {
2228 ret = update_backrefs(trans, extent_root, path, &update_list); 2550 ret = update_backrefs(trans, extent_root, path, &update_list);
2229 BUG_ON(ret); 2551 BUG_ON(ret);
2552
2553 /* we may have COW'ed new blocks, so lets start over */
2554 if (all)
2555 restart = 1;
2230 } 2556 }
2231 2557
2232 /* 2558 /*
@@ -2234,9 +2560,9 @@ again:
2234 * need to make sure everything is cleaned then reset everything and 2560 * need to make sure everything is cleaned then reset everything and
2235 * go back to the beginning 2561 * go back to the beginning
2236 */ 2562 */
2237 if (!num_inserts && all && skipped) { 2563 if (!num_inserts && restart) {
2238 search = 0; 2564 search = 0;
2239 skipped = 0; 2565 restart = 0;
2240 INIT_LIST_HEAD(&update_list); 2566 INIT_LIST_HEAD(&update_list);
2241 INIT_LIST_HEAD(&insert_list); 2567 INIT_LIST_HEAD(&insert_list);
2242 goto again; 2568 goto again;
@@ -2293,27 +2619,19 @@ again:
2293 BUG_ON(ret); 2619 BUG_ON(ret);
2294 2620
2295 /* 2621 /*
2296 * if we broke out of the loop in order to insert stuff because we hit 2622 * if restart is set for whatever reason we need to go back and start
2297 * the maximum number of inserts at a time we can handle, then loop 2623 * searching through the pending list again.
2298 * back and pick up where we left off 2624 *
2299 */ 2625 * We just inserted some extents, which could have resulted in new
2300 if (num_inserts == max_inserts) { 2626 * blocks being allocated, which would result in new blocks needing
2301 INIT_LIST_HEAD(&insert_list); 2627 * updates, so if all is set we _must_ restart to get the updated
2302 INIT_LIST_HEAD(&update_list); 2628 * blocks.
2303 num_inserts = 0;
2304 goto again;
2305 }
2306
2307 /*
2308 * again, if we need to make absolutely sure there are no more pending
2309 * extent operations left and we know that we skipped some, go back to
2310 * the beginning and do it all again
2311 */ 2629 */
2312 if (all && skipped) { 2630 if (restart || all) {
2313 INIT_LIST_HEAD(&insert_list); 2631 INIT_LIST_HEAD(&insert_list);
2314 INIT_LIST_HEAD(&update_list); 2632 INIT_LIST_HEAD(&update_list);
2315 search = 0; 2633 search = 0;
2316 skipped = 0; 2634 restart = 0;
2317 num_inserts = 0; 2635 num_inserts = 0;
2318 goto again; 2636 goto again;
2319 } 2637 }
@@ -2547,6 +2865,7 @@ again:
2547 if (ret) { 2865 if (ret) {
2548 if (all && skipped && !nr) { 2866 if (all && skipped && !nr) {
2549 search = 0; 2867 search = 0;
2868 skipped = 0;
2550 continue; 2869 continue;
2551 } 2870 }
2552 mutex_unlock(&info->extent_ins_mutex); 2871 mutex_unlock(&info->extent_ins_mutex);
@@ -2633,6 +2952,8 @@ again:
2633 goto again; 2952 goto again;
2634 } 2953 }
2635 2954
2955 if (!err)
2956 finish_current_insert(trans, extent_root, 0);
2636 return err; 2957 return err;
2637} 2958}
2638 2959
@@ -2700,13 +3021,9 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
2700 /* if metadata always pin */ 3021 /* if metadata always pin */
2701 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) { 3022 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
2702 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { 3023 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
2703 struct btrfs_block_group_cache *cache; 3024 mutex_lock(&root->fs_info->pinned_mutex);
2704 3025 btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
2705 /* btrfs_free_reserved_extent */ 3026 mutex_unlock(&root->fs_info->pinned_mutex);
2706 cache = btrfs_lookup_block_group(root->fs_info, bytenr);
2707 BUG_ON(!cache);
2708 btrfs_add_free_space(cache, bytenr, num_bytes);
2709 put_block_group(cache);
2710 update_reserved_extents(root, bytenr, num_bytes, 0); 3027 update_reserved_extents(root, bytenr, num_bytes, 0);
2711 return 0; 3028 return 0;
2712 } 3029 }
@@ -2787,7 +3104,8 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
2787 3104
2788 if (data & BTRFS_BLOCK_GROUP_METADATA) { 3105 if (data & BTRFS_BLOCK_GROUP_METADATA) {
2789 last_ptr = &root->fs_info->last_alloc; 3106 last_ptr = &root->fs_info->last_alloc;
2790 empty_cluster = 64 * 1024; 3107 if (!btrfs_test_opt(root, SSD))
3108 empty_cluster = 64 * 1024;
2791 } 3109 }
2792 3110
2793 if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) 3111 if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD))
@@ -3014,16 +3332,18 @@ loop_check:
3014static void dump_space_info(struct btrfs_space_info *info, u64 bytes) 3332static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
3015{ 3333{
3016 struct btrfs_block_group_cache *cache; 3334 struct btrfs_block_group_cache *cache;
3017 struct list_head *l;
3018 3335
3019 printk(KERN_INFO "space_info has %llu free, is %sfull\n", 3336 printk(KERN_INFO "space_info has %llu free, is %sfull\n",
3020 (unsigned long long)(info->total_bytes - info->bytes_used - 3337 (unsigned long long)(info->total_bytes - info->bytes_used -
3021 info->bytes_pinned - info->bytes_reserved), 3338 info->bytes_pinned - info->bytes_reserved),
3022 (info->full) ? "" : "not "); 3339 (info->full) ? "" : "not ");
3340 printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu,"
3341 " may_use=%llu, used=%llu\n", info->total_bytes,
3342 info->bytes_pinned, info->bytes_delalloc, info->bytes_may_use,
3343 info->bytes_used);
3023 3344
3024 down_read(&info->groups_sem); 3345 down_read(&info->groups_sem);
3025 list_for_each(l, &info->block_groups) { 3346 list_for_each_entry(cache, &info->block_groups, list) {
3026 cache = list_entry(l, struct btrfs_block_group_cache, list);
3027 spin_lock(&cache->lock); 3347 spin_lock(&cache->lock);
3028 printk(KERN_INFO "block group %llu has %llu bytes, %llu used " 3348 printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
3029 "%llu pinned %llu reserved\n", 3349 "%llu pinned %llu reserved\n",
@@ -3047,24 +3367,10 @@ static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
3047{ 3367{
3048 int ret; 3368 int ret;
3049 u64 search_start = 0; 3369 u64 search_start = 0;
3050 u64 alloc_profile;
3051 struct btrfs_fs_info *info = root->fs_info; 3370 struct btrfs_fs_info *info = root->fs_info;
3052 3371
3053 if (data) { 3372 data = btrfs_get_alloc_profile(root, data);
3054 alloc_profile = info->avail_data_alloc_bits &
3055 info->data_alloc_profile;
3056 data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
3057 } else if (root == root->fs_info->chunk_root) {
3058 alloc_profile = info->avail_system_alloc_bits &
3059 info->system_alloc_profile;
3060 data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
3061 } else {
3062 alloc_profile = info->avail_metadata_alloc_bits &
3063 info->metadata_alloc_profile;
3064 data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
3065 }
3066again: 3373again:
3067 data = btrfs_reduce_alloc_profile(root, data);
3068 /* 3374 /*
3069 * the only place that sets empty_size is btrfs_realloc_node, which 3375 * the only place that sets empty_size is btrfs_realloc_node, which
3070 * is not called recursively on allocations 3376 * is not called recursively on allocations
@@ -3332,7 +3638,8 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
3332 3638
3333struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, 3639struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
3334 struct btrfs_root *root, 3640 struct btrfs_root *root,
3335 u64 bytenr, u32 blocksize) 3641 u64 bytenr, u32 blocksize,
3642 int level)
3336{ 3643{
3337 struct extent_buffer *buf; 3644 struct extent_buffer *buf;
3338 3645
@@ -3340,9 +3647,13 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
3340 if (!buf) 3647 if (!buf)
3341 return ERR_PTR(-ENOMEM); 3648 return ERR_PTR(-ENOMEM);
3342 btrfs_set_header_generation(buf, trans->transid); 3649 btrfs_set_header_generation(buf, trans->transid);
3650 btrfs_set_buffer_lockdep_class(buf, level);
3343 btrfs_tree_lock(buf); 3651 btrfs_tree_lock(buf);
3344 clean_tree_block(trans, root, buf); 3652 clean_tree_block(trans, root, buf);
3653
3654 btrfs_set_lock_blocking(buf);
3345 btrfs_set_buffer_uptodate(buf); 3655 btrfs_set_buffer_uptodate(buf);
3656
3346 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { 3657 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
3347 set_extent_dirty(&root->dirty_log_pages, buf->start, 3658 set_extent_dirty(&root->dirty_log_pages, buf->start,
3348 buf->start + buf->len - 1, GFP_NOFS); 3659 buf->start + buf->len - 1, GFP_NOFS);
@@ -3351,6 +3662,7 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
3351 buf->start + buf->len - 1, GFP_NOFS); 3662 buf->start + buf->len - 1, GFP_NOFS);
3352 } 3663 }
3353 trans->blocks_used++; 3664 trans->blocks_used++;
3665 /* this returns a buffer locked for blocking */
3354 return buf; 3666 return buf;
3355} 3667}
3356 3668
@@ -3379,7 +3691,8 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
3379 return ERR_PTR(ret); 3691 return ERR_PTR(ret);
3380 } 3692 }
3381 3693
3382 buf = btrfs_init_new_buffer(trans, root, ins.objectid, blocksize); 3694 buf = btrfs_init_new_buffer(trans, root, ins.objectid,
3695 blocksize, level);
3383 return buf; 3696 return buf;
3384} 3697}
3385 3698
@@ -3388,36 +3701,73 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
3388{ 3701{
3389 u64 leaf_owner; 3702 u64 leaf_owner;
3390 u64 leaf_generation; 3703 u64 leaf_generation;
3704 struct refsort *sorted;
3391 struct btrfs_key key; 3705 struct btrfs_key key;
3392 struct btrfs_file_extent_item *fi; 3706 struct btrfs_file_extent_item *fi;
3393 int i; 3707 int i;
3394 int nritems; 3708 int nritems;
3395 int ret; 3709 int ret;
3710 int refi = 0;
3711 int slot;
3396 3712
3397 BUG_ON(!btrfs_is_leaf(leaf)); 3713 BUG_ON(!btrfs_is_leaf(leaf));
3398 nritems = btrfs_header_nritems(leaf); 3714 nritems = btrfs_header_nritems(leaf);
3399 leaf_owner = btrfs_header_owner(leaf); 3715 leaf_owner = btrfs_header_owner(leaf);
3400 leaf_generation = btrfs_header_generation(leaf); 3716 leaf_generation = btrfs_header_generation(leaf);
3401 3717
3718 sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
3719 /* we do this loop twice. The first time we build a list
3720 * of the extents we have a reference on, then we sort the list
3721 * by bytenr. The second time around we actually do the
3722 * extent freeing.
3723 */
3402 for (i = 0; i < nritems; i++) { 3724 for (i = 0; i < nritems; i++) {
3403 u64 disk_bytenr; 3725 u64 disk_bytenr;
3404 cond_resched(); 3726 cond_resched();
3405 3727
3406 btrfs_item_key_to_cpu(leaf, &key, i); 3728 btrfs_item_key_to_cpu(leaf, &key, i);
3729
3730 /* only extents have references, skip everything else */
3407 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) 3731 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
3408 continue; 3732 continue;
3733
3409 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); 3734 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
3735
3736 /* inline extents live in the btree, they don't have refs */
3410 if (btrfs_file_extent_type(leaf, fi) == 3737 if (btrfs_file_extent_type(leaf, fi) ==
3411 BTRFS_FILE_EXTENT_INLINE) 3738 BTRFS_FILE_EXTENT_INLINE)
3412 continue; 3739 continue;
3413 /* 3740
3414 * FIXME make sure to insert a trans record that
3415 * repeats the snapshot del on crash
3416 */
3417 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 3741 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
3742
3743 /* holes don't have refs */
3418 if (disk_bytenr == 0) 3744 if (disk_bytenr == 0)
3419 continue; 3745 continue;
3420 3746
3747 sorted[refi].bytenr = disk_bytenr;
3748 sorted[refi].slot = i;
3749 refi++;
3750 }
3751
3752 if (refi == 0)
3753 goto out;
3754
3755 sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
3756
3757 for (i = 0; i < refi; i++) {
3758 u64 disk_bytenr;
3759
3760 disk_bytenr = sorted[i].bytenr;
3761 slot = sorted[i].slot;
3762
3763 cond_resched();
3764
3765 btrfs_item_key_to_cpu(leaf, &key, slot);
3766 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
3767 continue;
3768
3769 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
3770
3421 ret = __btrfs_free_extent(trans, root, disk_bytenr, 3771 ret = __btrfs_free_extent(trans, root, disk_bytenr,
3422 btrfs_file_extent_disk_num_bytes(leaf, fi), 3772 btrfs_file_extent_disk_num_bytes(leaf, fi),
3423 leaf->start, leaf_owner, leaf_generation, 3773 leaf->start, leaf_owner, leaf_generation,
@@ -3428,6 +3778,8 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
3428 wake_up(&root->fs_info->transaction_throttle); 3778 wake_up(&root->fs_info->transaction_throttle);
3429 cond_resched(); 3779 cond_resched();
3430 } 3780 }
3781out:
3782 kfree(sorted);
3431 return 0; 3783 return 0;
3432} 3784}
3433 3785
@@ -3437,9 +3789,25 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
3437{ 3789{
3438 int i; 3790 int i;
3439 int ret; 3791 int ret;
3440 struct btrfs_extent_info *info = ref->extents; 3792 struct btrfs_extent_info *info;
3793 struct refsort *sorted;
3794
3795 if (ref->nritems == 0)
3796 return 0;
3441 3797
3798 sorted = kmalloc(sizeof(*sorted) * ref->nritems, GFP_NOFS);
3442 for (i = 0; i < ref->nritems; i++) { 3799 for (i = 0; i < ref->nritems; i++) {
3800 sorted[i].bytenr = ref->extents[i].bytenr;
3801 sorted[i].slot = i;
3802 }
3803 sort(sorted, ref->nritems, sizeof(struct refsort), refsort_cmp, NULL);
3804
3805 /*
3806 * the items in the ref were sorted when the ref was inserted
3807 * into the ref cache, so this is already in order
3808 */
3809 for (i = 0; i < ref->nritems; i++) {
3810 info = ref->extents + sorted[i].slot;
3443 ret = __btrfs_free_extent(trans, root, info->bytenr, 3811 ret = __btrfs_free_extent(trans, root, info->bytenr,
3444 info->num_bytes, ref->bytenr, 3812 info->num_bytes, ref->bytenr,
3445 ref->owner, ref->generation, 3813 ref->owner, ref->generation,
@@ -3453,6 +3821,7 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
3453 info++; 3821 info++;
3454 } 3822 }
3455 3823
3824 kfree(sorted);
3456 return 0; 3825 return 0;
3457} 3826}
3458 3827
@@ -3497,6 +3866,152 @@ static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start,
3497} 3866}
3498 3867
3499/* 3868/*
3869 * this is used while deleting old snapshots, and it drops the refs
3870 * on a whole subtree starting from a level 1 node.
3871 *
3872 * The idea is to sort all the leaf pointers, and then drop the
3873 * ref on all the leaves in order. Most of the time the leaves
3874 * will have ref cache entries, so no leaf IOs will be required to
3875 * find the extents they have references on.
3876 *
3877 * For each leaf, any references it has are also dropped in order
3878 *
3879 * This ends up dropping the references in something close to optimal
3880 * order for reading and modifying the extent allocation tree.
3881 */
3882static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans,
3883 struct btrfs_root *root,
3884 struct btrfs_path *path)
3885{
3886 u64 bytenr;
3887 u64 root_owner;
3888 u64 root_gen;
3889 struct extent_buffer *eb = path->nodes[1];
3890 struct extent_buffer *leaf;
3891 struct btrfs_leaf_ref *ref;
3892 struct refsort *sorted = NULL;
3893 int nritems = btrfs_header_nritems(eb);
3894 int ret;
3895 int i;
3896 int refi = 0;
3897 int slot = path->slots[1];
3898 u32 blocksize = btrfs_level_size(root, 0);
3899 u32 refs;
3900
3901 if (nritems == 0)
3902 goto out;
3903
3904 root_owner = btrfs_header_owner(eb);
3905 root_gen = btrfs_header_generation(eb);
3906 sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
3907
3908 /*
3909 * step one, sort all the leaf pointers so we don't scribble
3910 * randomly into the extent allocation tree
3911 */
3912 for (i = slot; i < nritems; i++) {
3913 sorted[refi].bytenr = btrfs_node_blockptr(eb, i);
3914 sorted[refi].slot = i;
3915 refi++;
3916 }
3917
3918 /*
3919 * nritems won't be zero, but if we're picking up drop_snapshot
3920 * after a crash, slot might be > 0, so double check things
3921 * just in case.
3922 */
3923 if (refi == 0)
3924 goto out;
3925
3926 sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
3927
3928 /*
3929 * the first loop frees everything the leaves point to
3930 */
3931 for (i = 0; i < refi; i++) {
3932 u64 ptr_gen;
3933
3934 bytenr = sorted[i].bytenr;
3935
3936 /*
3937 * check the reference count on this leaf. If it is > 1
3938 * we just decrement it below and don't update any
3939 * of the refs the leaf points to.
3940 */
3941 ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
3942 BUG_ON(ret);
3943 if (refs != 1)
3944 continue;
3945
3946 ptr_gen = btrfs_node_ptr_generation(eb, sorted[i].slot);
3947
3948 /*
3949 * the leaf only had one reference, which means the
3950 * only thing pointing to this leaf is the snapshot
3951 * we're deleting. It isn't possible for the reference
3952 * count to increase again later
3953 *
3954 * The reference cache is checked for the leaf,
3955 * and if found we'll be able to drop any refs held by
3956 * the leaf without needing to read it in.
3957 */
3958 ref = btrfs_lookup_leaf_ref(root, bytenr);
3959 if (ref && ref->generation != ptr_gen) {
3960 btrfs_free_leaf_ref(root, ref);
3961 ref = NULL;
3962 }
3963 if (ref) {
3964 ret = cache_drop_leaf_ref(trans, root, ref);
3965 BUG_ON(ret);
3966 btrfs_remove_leaf_ref(root, ref);
3967 btrfs_free_leaf_ref(root, ref);
3968 } else {
3969 /*
3970 * the leaf wasn't in the reference cache, so
3971 * we have to read it.
3972 */
3973 leaf = read_tree_block(root, bytenr, blocksize,
3974 ptr_gen);
3975 ret = btrfs_drop_leaf_ref(trans, root, leaf);
3976 BUG_ON(ret);
3977 free_extent_buffer(leaf);
3978 }
3979 atomic_inc(&root->fs_info->throttle_gen);
3980 wake_up(&root->fs_info->transaction_throttle);
3981 cond_resched();
3982 }
3983
3984 /*
3985 * run through the loop again to free the refs on the leaves.
3986 * This is faster than doing it in the loop above because
3987 * the leaves are likely to be clustered together. We end up
3988 * working in nice chunks on the extent allocation tree.
3989 */
3990 for (i = 0; i < refi; i++) {
3991 bytenr = sorted[i].bytenr;
3992 ret = __btrfs_free_extent(trans, root, bytenr,
3993 blocksize, eb->start,
3994 root_owner, root_gen, 0, 1);
3995 BUG_ON(ret);
3996
3997 atomic_inc(&root->fs_info->throttle_gen);
3998 wake_up(&root->fs_info->transaction_throttle);
3999 cond_resched();
4000 }
4001out:
4002 kfree(sorted);
4003
4004 /*
4005 * update the path to show we've processed the entire level 1
4006 * node. This will get saved into the root's drop_snapshot_progress
4007 * field so these drops are not repeated again if this transaction
4008 * commits.
4009 */
4010 path->slots[1] = nritems;
4011 return 0;
4012}
4013
4014/*
3500 * helper function for drop_snapshot, this walks down the tree dropping ref 4015 * helper function for drop_snapshot, this walks down the tree dropping ref
3501 * counts as it goes. 4016 * counts as it goes.
3502 */ 4017 */
@@ -3511,7 +4026,6 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
3511 struct extent_buffer *next; 4026 struct extent_buffer *next;
3512 struct extent_buffer *cur; 4027 struct extent_buffer *cur;
3513 struct extent_buffer *parent; 4028 struct extent_buffer *parent;
3514 struct btrfs_leaf_ref *ref;
3515 u32 blocksize; 4029 u32 blocksize;
3516 int ret; 4030 int ret;
3517 u32 refs; 4031 u32 refs;
@@ -3538,17 +4052,46 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
3538 if (path->slots[*level] >= 4052 if (path->slots[*level] >=
3539 btrfs_header_nritems(cur)) 4053 btrfs_header_nritems(cur))
3540 break; 4054 break;
4055
4056 /* the new code goes down to level 1 and does all the
4057 * leaves pointed to that node in bulk. So, this check
4058 * for level 0 will always be false.
4059 *
4060 * But, the disk format allows the drop_snapshot_progress
4061 * field in the root to leave things in a state where
4062 * a leaf will need cleaning up here. If someone crashes
4063 * with the old code and then boots with the new code,
4064 * we might find a leaf here.
4065 */
3541 if (*level == 0) { 4066 if (*level == 0) {
3542 ret = btrfs_drop_leaf_ref(trans, root, cur); 4067 ret = btrfs_drop_leaf_ref(trans, root, cur);
3543 BUG_ON(ret); 4068 BUG_ON(ret);
3544 break; 4069 break;
3545 } 4070 }
4071
4072 /*
4073 * once we get to level one, process the whole node
4074 * at once, including everything below it.
4075 */
4076 if (*level == 1) {
4077 ret = drop_level_one_refs(trans, root, path);
4078 BUG_ON(ret);
4079 break;
4080 }
4081
3546 bytenr = btrfs_node_blockptr(cur, path->slots[*level]); 4082 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
3547 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); 4083 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
3548 blocksize = btrfs_level_size(root, *level - 1); 4084 blocksize = btrfs_level_size(root, *level - 1);
3549 4085
3550 ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs); 4086 ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
3551 BUG_ON(ret); 4087 BUG_ON(ret);
4088
4089 /*
4090 * if there is more than one reference, we don't need
4091 * to read that node to drop any references it has. We
4092 * just drop the ref we hold on that node and move on to the
4093 * next slot in this level.
4094 */
3552 if (refs != 1) { 4095 if (refs != 1) {
3553 parent = path->nodes[*level]; 4096 parent = path->nodes[*level];
3554 root_owner = btrfs_header_owner(parent); 4097 root_owner = btrfs_header_owner(parent);
@@ -3567,46 +4110,12 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
3567 4110
3568 continue; 4111 continue;
3569 } 4112 }
4113
3570 /* 4114 /*
3571 * at this point, we have a single ref, and since the 4115 * we need to keep freeing things in the next level down.
3572 * only place referencing this extent is a dead root 4116 * read the block and loop around to process it
3573 * the reference count should never go higher.
3574 * So, we don't need to check it again
3575 */ 4117 */
3576 if (*level == 1) { 4118 next = read_tree_block(root, bytenr, blocksize, ptr_gen);
3577 ref = btrfs_lookup_leaf_ref(root, bytenr);
3578 if (ref && ref->generation != ptr_gen) {
3579 btrfs_free_leaf_ref(root, ref);
3580 ref = NULL;
3581 }
3582 if (ref) {
3583 ret = cache_drop_leaf_ref(trans, root, ref);
3584 BUG_ON(ret);
3585 btrfs_remove_leaf_ref(root, ref);
3586 btrfs_free_leaf_ref(root, ref);
3587 *level = 0;
3588 break;
3589 }
3590 }
3591 next = btrfs_find_tree_block(root, bytenr, blocksize);
3592 if (!next || !btrfs_buffer_uptodate(next, ptr_gen)) {
3593 free_extent_buffer(next);
3594
3595 next = read_tree_block(root, bytenr, blocksize,
3596 ptr_gen);
3597 cond_resched();
3598#if 0
3599 /*
3600 * this is a debugging check and can go away
3601 * the ref should never go all the way down to 1
3602 * at this point
3603 */
3604 ret = lookup_extent_ref(NULL, root, bytenr, blocksize,
3605 &refs);
3606 BUG_ON(ret);
3607 WARN_ON(refs != 1);
3608#endif
3609 }
3610 WARN_ON(*level <= 0); 4119 WARN_ON(*level <= 0);
3611 if (path->nodes[*level-1]) 4120 if (path->nodes[*level-1])
3612 free_extent_buffer(path->nodes[*level-1]); 4121 free_extent_buffer(path->nodes[*level-1]);
@@ -3631,11 +4140,16 @@ out:
3631 root_owner = btrfs_header_owner(parent); 4140 root_owner = btrfs_header_owner(parent);
3632 root_gen = btrfs_header_generation(parent); 4141 root_gen = btrfs_header_generation(parent);
3633 4142
4143 /*
4144 * cleanup and free the reference on the last node
4145 * we processed
4146 */
3634 ret = __btrfs_free_extent(trans, root, bytenr, blocksize, 4147 ret = __btrfs_free_extent(trans, root, bytenr, blocksize,
3635 parent->start, root_owner, root_gen, 4148 parent->start, root_owner, root_gen,
3636 *level, 1); 4149 *level, 1);
3637 free_extent_buffer(path->nodes[*level]); 4150 free_extent_buffer(path->nodes[*level]);
3638 path->nodes[*level] = NULL; 4151 path->nodes[*level] = NULL;
4152
3639 *level += 1; 4153 *level += 1;
3640 BUG_ON(ret); 4154 BUG_ON(ret);
3641 4155
@@ -3687,6 +4201,7 @@ static noinline int walk_down_subtree(struct btrfs_trans_handle *trans,
3687 4201
3688 next = read_tree_block(root, bytenr, blocksize, ptr_gen); 4202 next = read_tree_block(root, bytenr, blocksize, ptr_gen);
3689 btrfs_tree_lock(next); 4203 btrfs_tree_lock(next);
4204 btrfs_set_lock_blocking(next);
3690 4205
3691 ret = btrfs_lookup_extent_ref(trans, root, bytenr, blocksize, 4206 ret = btrfs_lookup_extent_ref(trans, root, bytenr, blocksize,
3692 &refs); 4207 &refs);
@@ -3754,6 +4269,13 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
3754 if (slot < btrfs_header_nritems(path->nodes[i]) - 1) { 4269 if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
3755 struct extent_buffer *node; 4270 struct extent_buffer *node;
3756 struct btrfs_disk_key disk_key; 4271 struct btrfs_disk_key disk_key;
4272
4273 /*
4274 * there is more work to do in this level.
4275 * Update the drop_progress marker to reflect
4276 * the work we've done so far, and then bump
4277 * the slot number
4278 */
3757 node = path->nodes[i]; 4279 node = path->nodes[i];
3758 path->slots[i]++; 4280 path->slots[i]++;
3759 *level = i; 4281 *level = i;
@@ -3765,6 +4287,11 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
3765 return 0; 4287 return 0;
3766 } else { 4288 } else {
3767 struct extent_buffer *parent; 4289 struct extent_buffer *parent;
4290
4291 /*
4292 * this whole node is done, free our reference
4293 * on it and go up one level
4294 */
3768 if (path->nodes[*level] == root->node) 4295 if (path->nodes[*level] == root->node)
3769 parent = path->nodes[*level]; 4296 parent = path->nodes[*level];
3770 else 4297 else
@@ -3891,13 +4418,13 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
3891 path = btrfs_alloc_path(); 4418 path = btrfs_alloc_path();
3892 BUG_ON(!path); 4419 BUG_ON(!path);
3893 4420
3894 BUG_ON(!btrfs_tree_locked(parent)); 4421 btrfs_assert_tree_locked(parent);
3895 parent_level = btrfs_header_level(parent); 4422 parent_level = btrfs_header_level(parent);
3896 extent_buffer_get(parent); 4423 extent_buffer_get(parent);
3897 path->nodes[parent_level] = parent; 4424 path->nodes[parent_level] = parent;
3898 path->slots[parent_level] = btrfs_header_nritems(parent); 4425 path->slots[parent_level] = btrfs_header_nritems(parent);
3899 4426
3900 BUG_ON(!btrfs_tree_locked(node)); 4427 btrfs_assert_tree_locked(node);
3901 level = btrfs_header_level(node); 4428 level = btrfs_header_level(node);
3902 extent_buffer_get(node); 4429 extent_buffer_get(node);
3903 path->nodes[level] = node; 4430 path->nodes[level] = node;
@@ -4444,7 +4971,7 @@ static noinline int replace_one_extent(struct btrfs_trans_handle *trans,
4444 u64 lock_end = 0; 4971 u64 lock_end = 0;
4445 u64 num_bytes; 4972 u64 num_bytes;
4446 u64 ext_offset; 4973 u64 ext_offset;
4447 u64 first_pos; 4974 u64 search_end = (u64)-1;
4448 u32 nritems; 4975 u32 nritems;
4449 int nr_scaned = 0; 4976 int nr_scaned = 0;
4450 int extent_locked = 0; 4977 int extent_locked = 0;
@@ -4452,7 +4979,6 @@ static noinline int replace_one_extent(struct btrfs_trans_handle *trans,
4452 int ret; 4979 int ret;
4453 4980
4454 memcpy(&key, leaf_key, sizeof(key)); 4981 memcpy(&key, leaf_key, sizeof(key));
4455 first_pos = INT_LIMIT(loff_t) - extent_key->offset;
4456 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) { 4982 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
4457 if (key.objectid < ref_path->owner_objectid || 4983 if (key.objectid < ref_path->owner_objectid ||
4458 (key.objectid == ref_path->owner_objectid && 4984 (key.objectid == ref_path->owner_objectid &&
@@ -4501,7 +5027,7 @@ next:
4501 if ((key.objectid > ref_path->owner_objectid) || 5027 if ((key.objectid > ref_path->owner_objectid) ||
4502 (key.objectid == ref_path->owner_objectid && 5028 (key.objectid == ref_path->owner_objectid &&
4503 key.type > BTRFS_EXTENT_DATA_KEY) || 5029 key.type > BTRFS_EXTENT_DATA_KEY) ||
4504 (key.offset >= first_pos + extent_key->offset)) 5030 key.offset >= search_end)
4505 break; 5031 break;
4506 } 5032 }
4507 5033
@@ -4534,8 +5060,10 @@ next:
4534 num_bytes = btrfs_file_extent_num_bytes(leaf, fi); 5060 num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
4535 ext_offset = btrfs_file_extent_offset(leaf, fi); 5061 ext_offset = btrfs_file_extent_offset(leaf, fi);
4536 5062
4537 if (first_pos > key.offset - ext_offset) 5063 if (search_end == (u64)-1) {
4538 first_pos = key.offset - ext_offset; 5064 search_end = key.offset - ext_offset +
5065 btrfs_file_extent_ram_bytes(leaf, fi);
5066 }
4539 5067
4540 if (!extent_locked) { 5068 if (!extent_locked) {
4541 lock_start = key.offset; 5069 lock_start = key.offset;
@@ -4724,7 +5252,7 @@ next:
4724 } 5252 }
4725skip: 5253skip:
4726 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS && 5254 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS &&
4727 key.offset >= first_pos + extent_key->offset) 5255 key.offset >= search_end)
4728 break; 5256 break;
4729 5257
4730 cond_resched(); 5258 cond_resched();
@@ -4778,6 +5306,7 @@ int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
4778 ref->bytenr = buf->start; 5306 ref->bytenr = buf->start;
4779 ref->owner = btrfs_header_owner(buf); 5307 ref->owner = btrfs_header_owner(buf);
4780 ref->generation = btrfs_header_generation(buf); 5308 ref->generation = btrfs_header_generation(buf);
5309
4781 ret = btrfs_add_leaf_ref(root, ref, 0); 5310 ret = btrfs_add_leaf_ref(root, ref, 0);
4782 WARN_ON(ret); 5311 WARN_ON(ret);
4783 btrfs_free_leaf_ref(root, ref); 5312 btrfs_free_leaf_ref(root, ref);
@@ -5351,7 +5880,9 @@ static noinline int relocate_one_extent(struct btrfs_root *extent_root,
5351 prev_block = block_start; 5880 prev_block = block_start;
5352 } 5881 }
5353 5882
5883 mutex_lock(&extent_root->fs_info->trans_mutex);
5354 btrfs_record_root_in_trans(found_root); 5884 btrfs_record_root_in_trans(found_root);
5885 mutex_unlock(&extent_root->fs_info->trans_mutex);
5355 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { 5886 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
5356 /* 5887 /*
5357 * try to update data extent references while 5888 * try to update data extent references while
@@ -5957,9 +6488,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
5957 path = btrfs_alloc_path(); 6488 path = btrfs_alloc_path();
5958 BUG_ON(!path); 6489 BUG_ON(!path);
5959 6490
5960 btrfs_remove_free_space_cache(block_group); 6491 spin_lock(&root->fs_info->block_group_cache_lock);
5961 rb_erase(&block_group->cache_node, 6492 rb_erase(&block_group->cache_node,
5962 &root->fs_info->block_group_cache_tree); 6493 &root->fs_info->block_group_cache_tree);
6494 spin_unlock(&root->fs_info->block_group_cache_lock);
6495 btrfs_remove_free_space_cache(block_group);
5963 down_write(&block_group->space_info->groups_sem); 6496 down_write(&block_group->space_info->groups_sem);
5964 list_del(&block_group->list); 6497 list_del(&block_group->list);
5965 up_write(&block_group->space_info->groups_sem); 6498 up_write(&block_group->space_info->groups_sem);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index e086d407f1fa..ebe6b29e6069 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -9,7 +9,6 @@
9#include <linux/spinlock.h> 9#include <linux/spinlock.h>
10#include <linux/blkdev.h> 10#include <linux/blkdev.h>
11#include <linux/swap.h> 11#include <linux/swap.h>
12#include <linux/version.h>
13#include <linux/writeback.h> 12#include <linux/writeback.h>
14#include <linux/pagevec.h> 13#include <linux/pagevec.h>
15#include "extent_io.h" 14#include "extent_io.h"
@@ -31,7 +30,7 @@ static LIST_HEAD(buffers);
31static LIST_HEAD(states); 30static LIST_HEAD(states);
32 31
33#define LEAK_DEBUG 0 32#define LEAK_DEBUG 0
34#ifdef LEAK_DEBUG 33#if LEAK_DEBUG
35static DEFINE_SPINLOCK(leak_lock); 34static DEFINE_SPINLOCK(leak_lock);
36#endif 35#endif
37 36
@@ -120,7 +119,7 @@ void extent_io_tree_init(struct extent_io_tree *tree,
120static struct extent_state *alloc_extent_state(gfp_t mask) 119static struct extent_state *alloc_extent_state(gfp_t mask)
121{ 120{
122 struct extent_state *state; 121 struct extent_state *state;
123#ifdef LEAK_DEBUG 122#if LEAK_DEBUG
124 unsigned long flags; 123 unsigned long flags;
125#endif 124#endif
126 125
@@ -130,7 +129,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask)
130 state->state = 0; 129 state->state = 0;
131 state->private = 0; 130 state->private = 0;
132 state->tree = NULL; 131 state->tree = NULL;
133#ifdef LEAK_DEBUG 132#if LEAK_DEBUG
134 spin_lock_irqsave(&leak_lock, flags); 133 spin_lock_irqsave(&leak_lock, flags);
135 list_add(&state->leak_list, &states); 134 list_add(&state->leak_list, &states);
136 spin_unlock_irqrestore(&leak_lock, flags); 135 spin_unlock_irqrestore(&leak_lock, flags);
@@ -145,11 +144,11 @@ static void free_extent_state(struct extent_state *state)
145 if (!state) 144 if (!state)
146 return; 145 return;
147 if (atomic_dec_and_test(&state->refs)) { 146 if (atomic_dec_and_test(&state->refs)) {
148#ifdef LEAK_DEBUG 147#if LEAK_DEBUG
149 unsigned long flags; 148 unsigned long flags;
150#endif 149#endif
151 WARN_ON(state->tree); 150 WARN_ON(state->tree);
152#ifdef LEAK_DEBUG 151#if LEAK_DEBUG
153 spin_lock_irqsave(&leak_lock, flags); 152 spin_lock_irqsave(&leak_lock, flags);
154 list_del(&state->leak_list); 153 list_del(&state->leak_list);
155 spin_unlock_irqrestore(&leak_lock, flags); 154 spin_unlock_irqrestore(&leak_lock, flags);
@@ -416,8 +415,6 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
416 415
417 node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node); 416 node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
418 if (node) { 417 if (node) {
419 struct extent_state *found;
420 found = rb_entry(node, struct extent_state, rb_node);
421 free_extent_state(prealloc); 418 free_extent_state(prealloc);
422 return -EEXIST; 419 return -EEXIST;
423 } 420 }
@@ -2378,11 +2375,6 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
2378 int scanned = 0; 2375 int scanned = 0;
2379 int range_whole = 0; 2376 int range_whole = 0;
2380 2377
2381 if (wbc->nonblocking && bdi_write_congested(bdi)) {
2382 wbc->encountered_congestion = 1;
2383 return 0;
2384 }
2385
2386 pagevec_init(&pvec, 0); 2378 pagevec_init(&pvec, 0);
2387 if (wbc->range_cyclic) { 2379 if (wbc->range_cyclic) {
2388 index = mapping->writeback_index; /* Start from prev offset */ 2380 index = mapping->writeback_index; /* Start from prev offset */
@@ -2855,6 +2847,98 @@ out:
2855 return sector; 2847 return sector;
2856} 2848}
2857 2849
2850int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2851 __u64 start, __u64 len, get_extent_t *get_extent)
2852{
2853 int ret;
2854 u64 off = start;
2855 u64 max = start + len;
2856 u32 flags = 0;
2857 u64 disko = 0;
2858 struct extent_map *em = NULL;
2859 int end = 0;
2860 u64 em_start = 0, em_len = 0;
2861 unsigned long emflags;
2862 ret = 0;
2863
2864 if (len == 0)
2865 return -EINVAL;
2866
2867 lock_extent(&BTRFS_I(inode)->io_tree, start, start + len,
2868 GFP_NOFS);
2869 em = get_extent(inode, NULL, 0, off, max - off, 0);
2870 if (!em)
2871 goto out;
2872 if (IS_ERR(em)) {
2873 ret = PTR_ERR(em);
2874 goto out;
2875 }
2876 while (!end) {
2877 off = em->start + em->len;
2878 if (off >= max)
2879 end = 1;
2880
2881 em_start = em->start;
2882 em_len = em->len;
2883
2884 disko = 0;
2885 flags = 0;
2886
2887 switch (em->block_start) {
2888 case EXTENT_MAP_LAST_BYTE:
2889 end = 1;
2890 flags |= FIEMAP_EXTENT_LAST;
2891 break;
2892 case EXTENT_MAP_HOLE:
2893 flags |= FIEMAP_EXTENT_UNWRITTEN;
2894 break;
2895 case EXTENT_MAP_INLINE:
2896 flags |= (FIEMAP_EXTENT_DATA_INLINE |
2897 FIEMAP_EXTENT_NOT_ALIGNED);
2898 break;
2899 case EXTENT_MAP_DELALLOC:
2900 flags |= (FIEMAP_EXTENT_DELALLOC |
2901 FIEMAP_EXTENT_UNKNOWN);
2902 break;
2903 default:
2904 disko = em->block_start;
2905 break;
2906 }
2907 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
2908 flags |= FIEMAP_EXTENT_ENCODED;
2909
2910 emflags = em->flags;
2911 free_extent_map(em);
2912 em = NULL;
2913
2914 if (!end) {
2915 em = get_extent(inode, NULL, 0, off, max - off, 0);
2916 if (!em)
2917 goto out;
2918 if (IS_ERR(em)) {
2919 ret = PTR_ERR(em);
2920 goto out;
2921 }
2922 emflags = em->flags;
2923 }
2924 if (test_bit(EXTENT_FLAG_VACANCY, &emflags)) {
2925 flags |= FIEMAP_EXTENT_LAST;
2926 end = 1;
2927 }
2928
2929 ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
2930 em_len, flags);
2931 if (ret)
2932 goto out_free;
2933 }
2934out_free:
2935 free_extent_map(em);
2936out:
2937 unlock_extent(&BTRFS_I(inode)->io_tree, start, start + len,
2938 GFP_NOFS);
2939 return ret;
2940}
2941
2858static inline struct page *extent_buffer_page(struct extent_buffer *eb, 2942static inline struct page *extent_buffer_page(struct extent_buffer *eb,
2859 unsigned long i) 2943 unsigned long i)
2860{ 2944{
@@ -2892,15 +2976,17 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
2892 gfp_t mask) 2976 gfp_t mask)
2893{ 2977{
2894 struct extent_buffer *eb = NULL; 2978 struct extent_buffer *eb = NULL;
2895#ifdef LEAK_DEBUG 2979#if LEAK_DEBUG
2896 unsigned long flags; 2980 unsigned long flags;
2897#endif 2981#endif
2898 2982
2899 eb = kmem_cache_zalloc(extent_buffer_cache, mask); 2983 eb = kmem_cache_zalloc(extent_buffer_cache, mask);
2900 eb->start = start; 2984 eb->start = start;
2901 eb->len = len; 2985 eb->len = len;
2902 mutex_init(&eb->mutex); 2986 spin_lock_init(&eb->lock);
2903#ifdef LEAK_DEBUG 2987 init_waitqueue_head(&eb->lock_wq);
2988
2989#if LEAK_DEBUG
2904 spin_lock_irqsave(&leak_lock, flags); 2990 spin_lock_irqsave(&leak_lock, flags);
2905 list_add(&eb->leak_list, &buffers); 2991 list_add(&eb->leak_list, &buffers);
2906 spin_unlock_irqrestore(&leak_lock, flags); 2992 spin_unlock_irqrestore(&leak_lock, flags);
@@ -2912,7 +2998,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
2912 2998
2913static void __free_extent_buffer(struct extent_buffer *eb) 2999static void __free_extent_buffer(struct extent_buffer *eb)
2914{ 3000{
2915#ifdef LEAK_DEBUG 3001#if LEAK_DEBUG
2916 unsigned long flags; 3002 unsigned long flags;
2917 spin_lock_irqsave(&leak_lock, flags); 3003 spin_lock_irqsave(&leak_lock, flags);
2918 list_del(&eb->leak_list); 3004 list_del(&eb->leak_list);
@@ -2980,8 +3066,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
2980 unlock_page(p); 3066 unlock_page(p);
2981 } 3067 }
2982 if (uptodate) 3068 if (uptodate)
2983 eb->flags |= EXTENT_UPTODATE; 3069 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
2984 eb->flags |= EXTENT_BUFFER_FILLED;
2985 3070
2986 spin_lock(&tree->buffer_lock); 3071 spin_lock(&tree->buffer_lock);
2987 exists = buffer_tree_insert(tree, start, &eb->rb_node); 3072 exists = buffer_tree_insert(tree, start, &eb->rb_node);
@@ -3135,7 +3220,7 @@ int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
3135 unsigned long num_pages; 3220 unsigned long num_pages;
3136 3221
3137 num_pages = num_extent_pages(eb->start, eb->len); 3222 num_pages = num_extent_pages(eb->start, eb->len);
3138 eb->flags &= ~EXTENT_UPTODATE; 3223 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3139 3224
3140 clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, 3225 clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3141 GFP_NOFS); 3226 GFP_NOFS);
@@ -3206,7 +3291,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,
3206 struct page *page; 3291 struct page *page;
3207 int pg_uptodate = 1; 3292 int pg_uptodate = 1;
3208 3293
3209 if (eb->flags & EXTENT_UPTODATE) 3294 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
3210 return 1; 3295 return 1;
3211 3296
3212 ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1, 3297 ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
@@ -3242,7 +3327,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
3242 struct bio *bio = NULL; 3327 struct bio *bio = NULL;
3243 unsigned long bio_flags = 0; 3328 unsigned long bio_flags = 0;
3244 3329
3245 if (eb->flags & EXTENT_UPTODATE) 3330 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
3246 return 0; 3331 return 0;
3247 3332
3248 if (test_range_bit(tree, eb->start, eb->start + eb->len - 1, 3333 if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
@@ -3273,7 +3358,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
3273 } 3358 }
3274 if (all_uptodate) { 3359 if (all_uptodate) {
3275 if (start_i == 0) 3360 if (start_i == 0)
3276 eb->flags |= EXTENT_UPTODATE; 3361 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3277 goto unlock_exit; 3362 goto unlock_exit;
3278 } 3363 }
3279 3364
@@ -3309,7 +3394,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
3309 } 3394 }
3310 3395
3311 if (!ret) 3396 if (!ret)
3312 eb->flags |= EXTENT_UPTODATE; 3397 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3313 return ret; 3398 return ret;
3314 3399
3315unlock_exit: 3400unlock_exit:
@@ -3406,7 +3491,6 @@ int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
3406 unmap_extent_buffer(eb, eb->map_token, km); 3491 unmap_extent_buffer(eb, eb->map_token, km);
3407 eb->map_token = NULL; 3492 eb->map_token = NULL;
3408 save = 1; 3493 save = 1;
3409 WARN_ON(!mutex_is_locked(&eb->mutex));
3410 } 3494 }
3411 err = map_private_extent_buffer(eb, start, min_len, token, map, 3495 err = map_private_extent_buffer(eb, start, min_len, token, map,
3412 map_start, map_len, km); 3496 map_start, map_len, km);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index c5b483a79137..1f9df88afbf6 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -22,6 +22,10 @@
22/* flags for bio submission */ 22/* flags for bio submission */
23#define EXTENT_BIO_COMPRESSED 1 23#define EXTENT_BIO_COMPRESSED 1
24 24
25/* these are bit numbers for test/set bit */
26#define EXTENT_BUFFER_UPTODATE 0
27#define EXTENT_BUFFER_BLOCKING 1
28
25/* 29/*
26 * page->private values. Every page that is controlled by the extent 30 * page->private values. Every page that is controlled by the extent
27 * map has page->private set to one. 31 * map has page->private set to one.
@@ -95,11 +99,19 @@ struct extent_buffer {
95 unsigned long map_start; 99 unsigned long map_start;
96 unsigned long map_len; 100 unsigned long map_len;
97 struct page *first_page; 101 struct page *first_page;
102 unsigned long bflags;
98 atomic_t refs; 103 atomic_t refs;
99 int flags;
100 struct list_head leak_list; 104 struct list_head leak_list;
101 struct rb_node rb_node; 105 struct rb_node rb_node;
102 struct mutex mutex; 106
107 /* the spinlock is used to protect most operations */
108 spinlock_t lock;
109
110 /*
111 * when we keep the lock held while blocking, waiters go onto
112 * the wq
113 */
114 wait_queue_head_t lock_wq;
103}; 115};
104 116
105struct extent_map_tree; 117struct extent_map_tree;
@@ -193,6 +205,8 @@ int extent_commit_write(struct extent_io_tree *tree,
193 unsigned from, unsigned to); 205 unsigned from, unsigned to);
194sector_t extent_bmap(struct address_space *mapping, sector_t iblock, 206sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
195 get_extent_t *get_extent); 207 get_extent_t *get_extent);
208int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
209 __u64 start, __u64 len, get_extent_t *get_extent);
196int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end); 210int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end);
197int set_state_private(struct extent_io_tree *tree, u64 start, u64 private); 211int set_state_private(struct extent_io_tree *tree, u64 start, u64 private);
198int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private); 212int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private);
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 4a83e33ada32..50da69da20ce 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -3,7 +3,6 @@
3#include <linux/slab.h> 3#include <linux/slab.h>
4#include <linux/module.h> 4#include <linux/module.h>
5#include <linux/spinlock.h> 5#include <linux/spinlock.h>
6#include <linux/version.h>
7#include <linux/hardirq.h> 6#include <linux/hardirq.h>
8#include "extent_map.h" 7#include "extent_map.h"
9 8
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 90268334145e..dc78954861b3 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -29,7 +29,6 @@
29#include <linux/writeback.h> 29#include <linux/writeback.h>
30#include <linux/statfs.h> 30#include <linux/statfs.h>
31#include <linux/compat.h> 31#include <linux/compat.h>
32#include <linux/version.h>
33#include "ctree.h" 32#include "ctree.h"
34#include "disk-io.h" 33#include "disk-io.h"
35#include "transaction.h" 34#include "transaction.h"
@@ -1092,19 +1091,24 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
1092 WARN_ON(num_pages > nrptrs); 1091 WARN_ON(num_pages > nrptrs);
1093 memset(pages, 0, sizeof(struct page *) * nrptrs); 1092 memset(pages, 0, sizeof(struct page *) * nrptrs);
1094 1093
1095 ret = btrfs_check_free_space(root, write_bytes, 0); 1094 ret = btrfs_check_data_free_space(root, inode, write_bytes);
1096 if (ret) 1095 if (ret)
1097 goto out; 1096 goto out;
1098 1097
1099 ret = prepare_pages(root, file, pages, num_pages, 1098 ret = prepare_pages(root, file, pages, num_pages,
1100 pos, first_index, last_index, 1099 pos, first_index, last_index,
1101 write_bytes); 1100 write_bytes);
1102 if (ret) 1101 if (ret) {
1102 btrfs_free_reserved_data_space(root, inode,
1103 write_bytes);
1103 goto out; 1104 goto out;
1105 }
1104 1106
1105 ret = btrfs_copy_from_user(pos, num_pages, 1107 ret = btrfs_copy_from_user(pos, num_pages,
1106 write_bytes, pages, buf); 1108 write_bytes, pages, buf);
1107 if (ret) { 1109 if (ret) {
1110 btrfs_free_reserved_data_space(root, inode,
1111 write_bytes);
1108 btrfs_drop_pages(pages, num_pages); 1112 btrfs_drop_pages(pages, num_pages);
1109 goto out; 1113 goto out;
1110 } 1114 }
@@ -1112,8 +1116,11 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
1112 ret = dirty_and_release_pages(NULL, root, file, pages, 1116 ret = dirty_and_release_pages(NULL, root, file, pages,
1113 num_pages, pos, write_bytes); 1117 num_pages, pos, write_bytes);
1114 btrfs_drop_pages(pages, num_pages); 1118 btrfs_drop_pages(pages, num_pages);
1115 if (ret) 1119 if (ret) {
1120 btrfs_free_reserved_data_space(root, inode,
1121 write_bytes);
1116 goto out; 1122 goto out;
1123 }
1117 1124
1118 if (will_write) { 1125 if (will_write) {
1119 btrfs_fdatawrite_range(inode->i_mapping, pos, 1126 btrfs_fdatawrite_range(inode->i_mapping, pos,
@@ -1137,6 +1144,8 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
1137 } 1144 }
1138out: 1145out:
1139 mutex_unlock(&inode->i_mutex); 1146 mutex_unlock(&inode->i_mutex);
1147 if (ret)
1148 err = ret;
1140 1149
1141out_nolock: 1150out_nolock:
1142 kfree(pages); 1151 kfree(pages);
@@ -1215,15 +1224,15 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
1215 } 1224 }
1216 mutex_unlock(&root->fs_info->trans_mutex); 1225 mutex_unlock(&root->fs_info->trans_mutex);
1217 1226
1218 root->fs_info->tree_log_batch++; 1227 root->log_batch++;
1219 filemap_fdatawrite(inode->i_mapping); 1228 filemap_fdatawrite(inode->i_mapping);
1220 btrfs_wait_ordered_range(inode, 0, (u64)-1); 1229 btrfs_wait_ordered_range(inode, 0, (u64)-1);
1221 root->fs_info->tree_log_batch++; 1230 root->log_batch++;
1222 1231
1223 /* 1232 /*
1224 * ok we haven't committed the transaction yet, lets do a commit 1233 * ok we haven't committed the transaction yet, lets do a commit
1225 */ 1234 */
1226 if (file->private_data) 1235 if (file && file->private_data)
1227 btrfs_ioctl_trans_end(file); 1236 btrfs_ioctl_trans_end(file);
1228 1237
1229 trans = btrfs_start_transaction(root, 1); 1238 trans = btrfs_start_transaction(root, 1);
@@ -1232,7 +1241,7 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
1232 goto out; 1241 goto out;
1233 } 1242 }
1234 1243
1235 ret = btrfs_log_dentry_safe(trans, root, file->f_dentry); 1244 ret = btrfs_log_dentry_safe(trans, root, dentry);
1236 if (ret < 0) 1245 if (ret < 0)
1237 goto out; 1246 goto out;
1238 1247
@@ -1246,7 +1255,7 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
1246 * file again, but that will end up using the synchronization 1255 * file again, but that will end up using the synchronization
1247 * inside btrfs_sync_log to keep things safe. 1256 * inside btrfs_sync_log to keep things safe.
1248 */ 1257 */
1249 mutex_unlock(&file->f_dentry->d_inode->i_mutex); 1258 mutex_unlock(&dentry->d_inode->i_mutex);
1250 1259
1251 if (ret > 0) { 1260 if (ret > 0) {
1252 ret = btrfs_commit_transaction(trans, root); 1261 ret = btrfs_commit_transaction(trans, root);
@@ -1254,7 +1263,7 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
1254 btrfs_sync_log(trans, root); 1263 btrfs_sync_log(trans, root);
1255 ret = btrfs_end_transaction(trans, root); 1264 ret = btrfs_end_transaction(trans, root);
1256 } 1265 }
1257 mutex_lock(&file->f_dentry->d_inode->i_mutex); 1266 mutex_lock(&dentry->d_inode->i_mutex);
1258out: 1267out:
1259 return ret > 0 ? EIO : ret; 1268 return ret > 0 ? EIO : ret;
1260} 1269}
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 2aa79873eb46..cc7334d833c9 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -84,7 +84,6 @@ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
84 search_key.type = 0; 84 search_key.type = 0;
85 search_key.offset = 0; 85 search_key.offset = 0;
86 86
87 btrfs_init_path(path);
88 start_found = 0; 87 start_found = 0;
89 ret = btrfs_search_slot(trans, root, &search_key, path, 0, 0); 88 ret = btrfs_search_slot(trans, root, &search_key, path, 0, 0);
90 if (ret < 0) 89 if (ret < 0)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8adfe059ab41..7d4f948bc22a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -34,7 +34,6 @@
34#include <linux/statfs.h> 34#include <linux/statfs.h>
35#include <linux/compat.h> 35#include <linux/compat.h>
36#include <linux/bit_spinlock.h> 36#include <linux/bit_spinlock.h>
37#include <linux/version.h>
38#include <linux/xattr.h> 37#include <linux/xattr.h>
39#include <linux/posix_acl.h> 38#include <linux/posix_acl.h>
40#include <linux/falloc.h> 39#include <linux/falloc.h>
@@ -51,6 +50,7 @@
51#include "tree-log.h" 50#include "tree-log.h"
52#include "ref-cache.h" 51#include "ref-cache.h"
53#include "compression.h" 52#include "compression.h"
53#include "locking.h"
54 54
55struct btrfs_iget_args { 55struct btrfs_iget_args {
56 u64 ino; 56 u64 ino;
@@ -91,32 +91,14 @@ static noinline int cow_file_range(struct inode *inode,
91 u64 start, u64 end, int *page_started, 91 u64 start, u64 end, int *page_started,
92 unsigned long *nr_written, int unlock); 92 unsigned long *nr_written, int unlock);
93 93
94/* 94static int btrfs_init_inode_security(struct inode *inode, struct inode *dir)
95 * a very lame attempt at stopping writes when the FS is 85% full. There
96 * are countless ways this is incorrect, but it is better than nothing.
97 */
98int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
99 int for_del)
100{ 95{
101 u64 total; 96 int err;
102 u64 used;
103 u64 thresh;
104 int ret = 0;
105
106 spin_lock(&root->fs_info->delalloc_lock);
107 total = btrfs_super_total_bytes(&root->fs_info->super_copy);
108 used = btrfs_super_bytes_used(&root->fs_info->super_copy);
109 if (for_del)
110 thresh = total * 90;
111 else
112 thresh = total * 85;
113
114 do_div(thresh, 100);
115 97
116 if (used + root->fs_info->delalloc_bytes + num_required > thresh) 98 err = btrfs_init_acl(inode, dir);
117 ret = -ENOSPC; 99 if (!err)
118 spin_unlock(&root->fs_info->delalloc_lock); 100 err = btrfs_xattr_security_init(inode, dir);
119 return ret; 101 return err;
120} 102}
121 103
122/* 104/*
@@ -350,6 +332,19 @@ again:
350 nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1; 332 nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
351 nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE); 333 nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE);
352 334
335 /*
336 * we don't want to send crud past the end of i_size through
337 * compression, that's just a waste of CPU time. So, if the
338 * end of the file is before the start of our current
339 * requested range of bytes, we bail out to the uncompressed
340 * cleanup code that can deal with all of this.
341 *
342 * It isn't really the fastest way to fix things, but this is a
343 * very uncommon corner.
344 */
345 if (actual_end <= start)
346 goto cleanup_and_bail_uncompressed;
347
353 total_compressed = actual_end - start; 348 total_compressed = actual_end - start;
354 349
355 /* we want to make sure that amount of ram required to uncompress 350 /* we want to make sure that amount of ram required to uncompress
@@ -494,6 +489,7 @@ again:
494 goto again; 489 goto again;
495 } 490 }
496 } else { 491 } else {
492cleanup_and_bail_uncompressed:
497 /* 493 /*
498 * No compression, but we still need to write the pages in 494 * No compression, but we still need to write the pages in
499 * the file we've been given so far. redirty the locked 495 * the file we've been given so far. redirty the locked
@@ -1166,6 +1162,7 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
1166 */ 1162 */
1167 if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { 1163 if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
1168 struct btrfs_root *root = BTRFS_I(inode)->root; 1164 struct btrfs_root *root = BTRFS_I(inode)->root;
1165 btrfs_delalloc_reserve_space(root, inode, end - start + 1);
1169 spin_lock(&root->fs_info->delalloc_lock); 1166 spin_lock(&root->fs_info->delalloc_lock);
1170 BTRFS_I(inode)->delalloc_bytes += end - start + 1; 1167 BTRFS_I(inode)->delalloc_bytes += end - start + 1;
1171 root->fs_info->delalloc_bytes += end - start + 1; 1168 root->fs_info->delalloc_bytes += end - start + 1;
@@ -1199,9 +1196,12 @@ static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
1199 (unsigned long long)end - start + 1, 1196 (unsigned long long)end - start + 1,
1200 (unsigned long long) 1197 (unsigned long long)
1201 root->fs_info->delalloc_bytes); 1198 root->fs_info->delalloc_bytes);
1199 btrfs_delalloc_free_space(root, inode, (u64)-1);
1202 root->fs_info->delalloc_bytes = 0; 1200 root->fs_info->delalloc_bytes = 0;
1203 BTRFS_I(inode)->delalloc_bytes = 0; 1201 BTRFS_I(inode)->delalloc_bytes = 0;
1204 } else { 1202 } else {
1203 btrfs_delalloc_free_space(root, inode,
1204 end - start + 1);
1205 root->fs_info->delalloc_bytes -= end - start + 1; 1205 root->fs_info->delalloc_bytes -= end - start + 1;
1206 BTRFS_I(inode)->delalloc_bytes -= end - start + 1; 1206 BTRFS_I(inode)->delalloc_bytes -= end - start + 1;
1207 } 1207 }
@@ -1324,12 +1324,11 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
1324 struct inode *inode, u64 file_offset, 1324 struct inode *inode, u64 file_offset,
1325 struct list_head *list) 1325 struct list_head *list)
1326{ 1326{
1327 struct list_head *cur;
1328 struct btrfs_ordered_sum *sum; 1327 struct btrfs_ordered_sum *sum;
1329 1328
1330 btrfs_set_trans_block_group(trans, inode); 1329 btrfs_set_trans_block_group(trans, inode);
1331 list_for_each(cur, list) { 1330
1332 sum = list_entry(cur, struct btrfs_ordered_sum, list); 1331 list_for_each_entry(sum, list, list) {
1333 btrfs_csum_file_blocks(trans, 1332 btrfs_csum_file_blocks(trans,
1334 BTRFS_I(inode)->root->fs_info->csum_root, sum); 1333 BTRFS_I(inode)->root->fs_info->csum_root, sum);
1335 } 1334 }
@@ -2013,6 +2012,7 @@ void btrfs_read_locked_inode(struct inode *inode)
2013 BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item); 2012 BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
2014 2013
2015 alloc_group_block = btrfs_inode_block_group(leaf, inode_item); 2014 alloc_group_block = btrfs_inode_block_group(leaf, inode_item);
2015
2016 BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0, 2016 BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0,
2017 alloc_group_block, 0); 2017 alloc_group_block, 0);
2018 btrfs_free_path(path); 2018 btrfs_free_path(path);
@@ -2039,6 +2039,7 @@ void btrfs_read_locked_inode(struct inode *inode)
2039 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 2039 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
2040 break; 2040 break;
2041 default: 2041 default:
2042 inode->i_op = &btrfs_special_inode_operations;
2042 init_special_inode(inode, inode->i_mode, rdev); 2043 init_special_inode(inode, inode->i_mode, rdev);
2043 break; 2044 break;
2044 } 2045 }
@@ -2108,6 +2109,7 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
2108 goto failed; 2109 goto failed;
2109 } 2110 }
2110 2111
2112 btrfs_unlock_up_safe(path, 1);
2111 leaf = path->nodes[0]; 2113 leaf = path->nodes[0];
2112 inode_item = btrfs_item_ptr(leaf, path->slots[0], 2114 inode_item = btrfs_item_ptr(leaf, path->slots[0],
2113 struct btrfs_inode_item); 2115 struct btrfs_inode_item);
@@ -2219,10 +2221,6 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
2219 2221
2220 root = BTRFS_I(dir)->root; 2222 root = BTRFS_I(dir)->root;
2221 2223
2222 ret = btrfs_check_free_space(root, 1, 1);
2223 if (ret)
2224 goto fail;
2225
2226 trans = btrfs_start_transaction(root, 1); 2224 trans = btrfs_start_transaction(root, 1);
2227 2225
2228 btrfs_set_trans_block_group(trans, dir); 2226 btrfs_set_trans_block_group(trans, dir);
@@ -2235,7 +2233,6 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
2235 nr = trans->blocks_used; 2233 nr = trans->blocks_used;
2236 2234
2237 btrfs_end_transaction_throttle(trans, root); 2235 btrfs_end_transaction_throttle(trans, root);
2238fail:
2239 btrfs_btree_balance_dirty(root, nr); 2236 btrfs_btree_balance_dirty(root, nr);
2240 return ret; 2237 return ret;
2241} 2238}
@@ -2258,10 +2255,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
2258 return -ENOTEMPTY; 2255 return -ENOTEMPTY;
2259 } 2256 }
2260 2257
2261 ret = btrfs_check_free_space(root, 1, 1);
2262 if (ret)
2263 goto fail;
2264
2265 trans = btrfs_start_transaction(root, 1); 2258 trans = btrfs_start_transaction(root, 1);
2266 btrfs_set_trans_block_group(trans, dir); 2259 btrfs_set_trans_block_group(trans, dir);
2267 2260
@@ -2278,7 +2271,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
2278fail_trans: 2271fail_trans:
2279 nr = trans->blocks_used; 2272 nr = trans->blocks_used;
2280 ret = btrfs_end_transaction_throttle(trans, root); 2273 ret = btrfs_end_transaction_throttle(trans, root);
2281fail:
2282 btrfs_btree_balance_dirty(root, nr); 2274 btrfs_btree_balance_dirty(root, nr);
2283 2275
2284 if (ret && !err) 2276 if (ret && !err)
@@ -2429,6 +2421,8 @@ next_node:
2429 ref->generation = leaf_gen; 2421 ref->generation = leaf_gen;
2430 ref->nritems = 0; 2422 ref->nritems = 0;
2431 2423
2424 btrfs_sort_leaf_ref(ref);
2425
2432 ret = btrfs_add_leaf_ref(root, ref, 0); 2426 ret = btrfs_add_leaf_ref(root, ref, 0);
2433 WARN_ON(ret); 2427 WARN_ON(ret);
2434 btrfs_free_leaf_ref(root, ref); 2428 btrfs_free_leaf_ref(root, ref);
@@ -2476,7 +2470,7 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
2476 struct btrfs_path *path; 2470 struct btrfs_path *path;
2477 struct btrfs_key key; 2471 struct btrfs_key key;
2478 struct btrfs_key found_key; 2472 struct btrfs_key found_key;
2479 u32 found_type; 2473 u32 found_type = (u8)-1;
2480 struct extent_buffer *leaf; 2474 struct extent_buffer *leaf;
2481 struct btrfs_file_extent_item *fi; 2475 struct btrfs_file_extent_item *fi;
2482 u64 extent_start = 0; 2476 u64 extent_start = 0;
@@ -2503,8 +2497,6 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
2503 key.offset = (u64)-1; 2497 key.offset = (u64)-1;
2504 key.type = (u8)-1; 2498 key.type = (u8)-1;
2505 2499
2506 btrfs_init_path(path);
2507
2508search_again: 2500search_again:
2509 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 2501 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2510 if (ret < 0) 2502 if (ret < 0)
@@ -2663,6 +2655,8 @@ next:
2663 if (pending_del_nr) 2655 if (pending_del_nr)
2664 goto del_pending; 2656 goto del_pending;
2665 btrfs_release_path(root, path); 2657 btrfs_release_path(root, path);
2658 if (found_type == BTRFS_INODE_ITEM_KEY)
2659 break;
2666 goto search_again; 2660 goto search_again;
2667 } 2661 }
2668 2662
@@ -2679,6 +2673,8 @@ del_pending:
2679 BUG_ON(ret); 2673 BUG_ON(ret);
2680 pending_del_nr = 0; 2674 pending_del_nr = 0;
2681 btrfs_release_path(root, path); 2675 btrfs_release_path(root, path);
2676 if (found_type == BTRFS_INODE_ITEM_KEY)
2677 break;
2682 goto search_again; 2678 goto search_again;
2683 } 2679 }
2684 } 2680 }
@@ -2788,7 +2784,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
2788 if (size <= hole_start) 2784 if (size <= hole_start)
2789 return 0; 2785 return 0;
2790 2786
2791 err = btrfs_check_free_space(root, 1, 0); 2787 err = btrfs_check_metadata_free_space(root);
2792 if (err) 2788 if (err)
2793 return err; 2789 return err;
2794 2790
@@ -2984,6 +2980,7 @@ static noinline void init_btrfs_i(struct inode *inode)
2984 bi->last_trans = 0; 2980 bi->last_trans = 0;
2985 bi->logged_trans = 0; 2981 bi->logged_trans = 0;
2986 bi->delalloc_bytes = 0; 2982 bi->delalloc_bytes = 0;
2983 bi->reserved_bytes = 0;
2987 bi->disk_i_size = 0; 2984 bi->disk_i_size = 0;
2988 bi->flags = 0; 2985 bi->flags = 0;
2989 bi->index_cnt = (u64)-1; 2986 bi->index_cnt = (u64)-1;
@@ -3005,6 +3002,7 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p)
3005 inode->i_ino = args->ino; 3002 inode->i_ino = args->ino;
3006 init_btrfs_i(inode); 3003 init_btrfs_i(inode);
3007 BTRFS_I(inode)->root = args->root; 3004 BTRFS_I(inode)->root = args->root;
3005 btrfs_set_inode_space_info(args->root, inode);
3008 return 0; 3006 return 0;
3009} 3007}
3010 3008
@@ -3265,7 +3263,7 @@ skip:
3265 3263
3266 /* Reached end of directory/root. Bump pos past the last item. */ 3264 /* Reached end of directory/root. Bump pos past the last item. */
3267 if (key_type == BTRFS_DIR_INDEX_KEY) 3265 if (key_type == BTRFS_DIR_INDEX_KEY)
3268 filp->f_pos = INT_LIMIT(typeof(filp->f_pos)); 3266 filp->f_pos = INT_LIMIT(off_t);
3269 else 3267 else
3270 filp->f_pos++; 3268 filp->f_pos++;
3271nopos: 3269nopos:
@@ -3425,6 +3423,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
3425 BTRFS_I(inode)->index_cnt = 2; 3423 BTRFS_I(inode)->index_cnt = 2;
3426 BTRFS_I(inode)->root = root; 3424 BTRFS_I(inode)->root = root;
3427 BTRFS_I(inode)->generation = trans->transid; 3425 BTRFS_I(inode)->generation = trans->transid;
3426 btrfs_set_inode_space_info(root, inode);
3428 3427
3429 if (mode & S_IFDIR) 3428 if (mode & S_IFDIR)
3430 owner = 0; 3429 owner = 0;
@@ -3458,7 +3457,14 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
3458 root->highest_inode = objectid; 3457 root->highest_inode = objectid;
3459 3458
3460 inode->i_uid = current_fsuid(); 3459 inode->i_uid = current_fsuid();
3461 inode->i_gid = current_fsgid(); 3460
3461 if (dir && (dir->i_mode & S_ISGID)) {
3462 inode->i_gid = dir->i_gid;
3463 if (S_ISDIR(mode))
3464 mode |= S_ISGID;
3465 } else
3466 inode->i_gid = current_fsgid();
3467
3462 inode->i_mode = mode; 3468 inode->i_mode = mode;
3463 inode->i_ino = objectid; 3469 inode->i_ino = objectid;
3464 inode_set_bytes(inode, 0); 3470 inode_set_bytes(inode, 0);
@@ -3565,7 +3571,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
3565 if (!new_valid_dev(rdev)) 3571 if (!new_valid_dev(rdev))
3566 return -EINVAL; 3572 return -EINVAL;
3567 3573
3568 err = btrfs_check_free_space(root, 1, 0); 3574 err = btrfs_check_metadata_free_space(root);
3569 if (err) 3575 if (err)
3570 goto fail; 3576 goto fail;
3571 3577
@@ -3586,7 +3592,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
3586 if (IS_ERR(inode)) 3592 if (IS_ERR(inode))
3587 goto out_unlock; 3593 goto out_unlock;
3588 3594
3589 err = btrfs_init_acl(inode, dir); 3595 err = btrfs_init_inode_security(inode, dir);
3590 if (err) { 3596 if (err) {
3591 drop_inode = 1; 3597 drop_inode = 1;
3592 goto out_unlock; 3598 goto out_unlock;
@@ -3628,7 +3634,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
3628 u64 objectid; 3634 u64 objectid;
3629 u64 index = 0; 3635 u64 index = 0;
3630 3636
3631 err = btrfs_check_free_space(root, 1, 0); 3637 err = btrfs_check_metadata_free_space(root);
3632 if (err) 3638 if (err)
3633 goto fail; 3639 goto fail;
3634 trans = btrfs_start_transaction(root, 1); 3640 trans = btrfs_start_transaction(root, 1);
@@ -3649,7 +3655,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
3649 if (IS_ERR(inode)) 3655 if (IS_ERR(inode))
3650 goto out_unlock; 3656 goto out_unlock;
3651 3657
3652 err = btrfs_init_acl(inode, dir); 3658 err = btrfs_init_inode_security(inode, dir);
3653 if (err) { 3659 if (err) {
3654 drop_inode = 1; 3660 drop_inode = 1;
3655 goto out_unlock; 3661 goto out_unlock;
@@ -3696,7 +3702,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
3696 return -ENOENT; 3702 return -ENOENT;
3697 3703
3698 btrfs_inc_nlink(inode); 3704 btrfs_inc_nlink(inode);
3699 err = btrfs_check_free_space(root, 1, 0); 3705 err = btrfs_check_metadata_free_space(root);
3700 if (err) 3706 if (err)
3701 goto fail; 3707 goto fail;
3702 err = btrfs_set_inode_index(dir, &index); 3708 err = btrfs_set_inode_index(dir, &index);
@@ -3742,7 +3748,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
3742 u64 index = 0; 3748 u64 index = 0;
3743 unsigned long nr = 1; 3749 unsigned long nr = 1;
3744 3750
3745 err = btrfs_check_free_space(root, 1, 0); 3751 err = btrfs_check_metadata_free_space(root);
3746 if (err) 3752 if (err)
3747 goto out_unlock; 3753 goto out_unlock;
3748 3754
@@ -3772,7 +3778,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
3772 3778
3773 drop_on_err = 1; 3779 drop_on_err = 1;
3774 3780
3775 err = btrfs_init_acl(inode, dir); 3781 err = btrfs_init_inode_security(inode, dir);
3776 if (err) 3782 if (err)
3777 goto out_fail; 3783 goto out_fail;
3778 3784
@@ -4158,9 +4164,10 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
4158 return -EINVAL; 4164 return -EINVAL;
4159} 4165}
4160 4166
4161static sector_t btrfs_bmap(struct address_space *mapping, sector_t iblock) 4167static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4168 __u64 start, __u64 len)
4162{ 4169{
4163 return extent_bmap(mapping, iblock, btrfs_get_extent); 4170 return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent);
4164} 4171}
4165 4172
4166int btrfs_readpage(struct file *file, struct page *page) 4173int btrfs_readpage(struct file *file, struct page *page)
@@ -4223,7 +4230,7 @@ static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
4223{ 4230{
4224 if (PageWriteback(page) || PageDirty(page)) 4231 if (PageWriteback(page) || PageDirty(page))
4225 return 0; 4232 return 0;
4226 return __btrfs_releasepage(page, gfp_flags); 4233 return __btrfs_releasepage(page, gfp_flags & GFP_NOFS);
4227} 4234}
4228 4235
4229static void btrfs_invalidatepage(struct page *page, unsigned long offset) 4236static void btrfs_invalidatepage(struct page *page, unsigned long offset)
@@ -4298,7 +4305,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
4298 u64 page_start; 4305 u64 page_start;
4299 u64 page_end; 4306 u64 page_end;
4300 4307
4301 ret = btrfs_check_free_space(root, PAGE_CACHE_SIZE, 0); 4308 ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
4302 if (ret) 4309 if (ret)
4303 goto out; 4310 goto out;
4304 4311
@@ -4311,6 +4318,7 @@ again:
4311 4318
4312 if ((page->mapping != inode->i_mapping) || 4319 if ((page->mapping != inode->i_mapping) ||
4313 (page_start >= size)) { 4320 (page_start >= size)) {
4321 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
4314 /* page got truncated out from underneath us */ 4322 /* page got truncated out from underneath us */
4315 goto out_unlock; 4323 goto out_unlock;
4316 } 4324 }
@@ -4593,7 +4601,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
4593 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) 4601 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
4594 return -EXDEV; 4602 return -EXDEV;
4595 4603
4596 ret = btrfs_check_free_space(root, 1, 0); 4604 ret = btrfs_check_metadata_free_space(root);
4597 if (ret) 4605 if (ret)
4598 goto out_unlock; 4606 goto out_unlock;
4599 4607
@@ -4711,7 +4719,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
4711 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) 4719 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
4712 return -ENAMETOOLONG; 4720 return -ENAMETOOLONG;
4713 4721
4714 err = btrfs_check_free_space(root, 1, 0); 4722 err = btrfs_check_metadata_free_space(root);
4715 if (err) 4723 if (err)
4716 goto out_fail; 4724 goto out_fail;
4717 4725
@@ -4733,7 +4741,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
4733 if (IS_ERR(inode)) 4741 if (IS_ERR(inode))
4734 goto out_unlock; 4742 goto out_unlock;
4735 4743
4736 err = btrfs_init_acl(inode, dir); 4744 err = btrfs_init_inode_security(inode, dir);
4737 if (err) { 4745 if (err) {
4738 drop_inode = 1; 4746 drop_inode = 1;
4739 goto out_unlock; 4747 goto out_unlock;
@@ -4987,13 +4995,24 @@ static struct extent_io_ops btrfs_extent_io_ops = {
4987 .clear_bit_hook = btrfs_clear_bit_hook, 4995 .clear_bit_hook = btrfs_clear_bit_hook,
4988}; 4996};
4989 4997
4998/*
4999 * btrfs doesn't support the bmap operation because swapfiles
5000 * use bmap to make a mapping of extents in the file. They assume
5001 * these extents won't change over the life of the file and they
5002 * use the bmap result to do IO directly to the drive.
5003 *
5004 * the btrfs bmap call would return logical addresses that aren't
5005 * suitable for IO and they also will change frequently as COW
5006 * operations happen. So, swapfile + btrfs == corruption.
5007 *
5008 * For now we're avoiding this by dropping bmap.
5009 */
4990static struct address_space_operations btrfs_aops = { 5010static struct address_space_operations btrfs_aops = {
4991 .readpage = btrfs_readpage, 5011 .readpage = btrfs_readpage,
4992 .writepage = btrfs_writepage, 5012 .writepage = btrfs_writepage,
4993 .writepages = btrfs_writepages, 5013 .writepages = btrfs_writepages,
4994 .readpages = btrfs_readpages, 5014 .readpages = btrfs_readpages,
4995 .sync_page = block_sync_page, 5015 .sync_page = block_sync_page,
4996 .bmap = btrfs_bmap,
4997 .direct_IO = btrfs_direct_IO, 5016 .direct_IO = btrfs_direct_IO,
4998 .invalidatepage = btrfs_invalidatepage, 5017 .invalidatepage = btrfs_invalidatepage,
4999 .releasepage = btrfs_releasepage, 5018 .releasepage = btrfs_releasepage,
@@ -5017,6 +5036,7 @@ static struct inode_operations btrfs_file_inode_operations = {
5017 .removexattr = btrfs_removexattr, 5036 .removexattr = btrfs_removexattr,
5018 .permission = btrfs_permission, 5037 .permission = btrfs_permission,
5019 .fallocate = btrfs_fallocate, 5038 .fallocate = btrfs_fallocate,
5039 .fiemap = btrfs_fiemap,
5020}; 5040};
5021static struct inode_operations btrfs_special_inode_operations = { 5041static struct inode_operations btrfs_special_inode_operations = {
5022 .getattr = btrfs_getattr, 5042 .getattr = btrfs_getattr,
@@ -5032,4 +5052,8 @@ static struct inode_operations btrfs_symlink_inode_operations = {
5032 .follow_link = page_follow_link_light, 5052 .follow_link = page_follow_link_light,
5033 .put_link = page_put_link, 5053 .put_link = page_put_link,
5034 .permission = btrfs_permission, 5054 .permission = btrfs_permission,
5055 .setxattr = btrfs_setxattr,
5056 .getxattr = btrfs_getxattr,
5057 .listxattr = btrfs_listxattr,
5058 .removexattr = btrfs_removexattr,
5035}; 5059};
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index c2aa33e3feb5..bca729fc80c8 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -38,7 +38,6 @@
38#include <linux/compat.h> 38#include <linux/compat.h>
39#include <linux/bit_spinlock.h> 39#include <linux/bit_spinlock.h>
40#include <linux/security.h> 40#include <linux/security.h>
41#include <linux/version.h>
42#include <linux/xattr.h> 41#include <linux/xattr.h>
43#include <linux/vmalloc.h> 42#include <linux/vmalloc.h>
44#include "compat.h" 43#include "compat.h"
@@ -71,7 +70,7 @@ static noinline int create_subvol(struct btrfs_root *root,
71 u64 index = 0; 70 u64 index = 0;
72 unsigned long nr = 1; 71 unsigned long nr = 1;
73 72
74 ret = btrfs_check_free_space(root, 1, 0); 73 ret = btrfs_check_metadata_free_space(root);
75 if (ret) 74 if (ret)
76 goto fail_commit; 75 goto fail_commit;
77 76
@@ -204,7 +203,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
204 if (!root->ref_cows) 203 if (!root->ref_cows)
205 return -EINVAL; 204 return -EINVAL;
206 205
207 ret = btrfs_check_free_space(root, 1, 0); 206 ret = btrfs_check_metadata_free_space(root);
208 if (ret) 207 if (ret)
209 goto fail_unlock; 208 goto fail_unlock;
210 209
@@ -375,7 +374,7 @@ static int btrfs_defrag_file(struct file *file)
375 unsigned long i; 374 unsigned long i;
376 int ret; 375 int ret;
377 376
378 ret = btrfs_check_free_space(root, inode->i_size, 0); 377 ret = btrfs_check_data_free_space(root, inode, inode->i_size);
379 if (ret) 378 if (ret)
380 return -ENOSPC; 379 return -ENOSPC;
381 380
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 39bae7761db6..47b0a88c12a2 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -25,64 +25,203 @@
25#include "extent_io.h" 25#include "extent_io.h"
26#include "locking.h" 26#include "locking.h"
27 27
28static inline void spin_nested(struct extent_buffer *eb)
29{
30 spin_lock(&eb->lock);
31}
32
28/* 33/*
29 * locks the per buffer mutex in an extent buffer. This uses adaptive locks 34 * Setting a lock to blocking will drop the spinlock and set the
30 * and the spin is not tuned very extensively. The spinning does make a big 35 * flag that forces other procs who want the lock to wait. After
31 * difference in almost every workload, but spinning for the right amount of 36 * this you can safely schedule with the lock held.
32 * time needs some help.
33 *
34 * In general, we want to spin as long as the lock holder is doing btree
35 * searches, and we should give up if they are in more expensive code.
36 */ 37 */
38void btrfs_set_lock_blocking(struct extent_buffer *eb)
39{
40 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) {
41 set_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags);
42 spin_unlock(&eb->lock);
43 }
44 /* exit with the spin lock released and the bit set */
45}
37 46
38int btrfs_tree_lock(struct extent_buffer *eb) 47/*
48 * clearing the blocking flag will take the spinlock again.
49 * After this you can't safely schedule
50 */
51void btrfs_clear_lock_blocking(struct extent_buffer *eb)
39{ 52{
40 int i; 53 if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) {
54 spin_nested(eb);
55 clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags);
56 smp_mb__after_clear_bit();
57 }
58 /* exit with the spin lock held */
59}
41 60
42 if (mutex_trylock(&eb->mutex)) 61/*
43 return 0; 62 * unfortunately, many of the places that currently set a lock to blocking
63 * don't end up blocking for every long, and often they don't block
64 * at all. For a dbench 50 run, if we don't spin one the blocking bit
65 * at all, the context switch rate can jump up to 400,000/sec or more.
66 *
67 * So, we're still stuck with this crummy spin on the blocking bit,
68 * at least until the most common causes of the short blocks
69 * can be dealt with.
70 */
71static int btrfs_spin_on_block(struct extent_buffer *eb)
72{
73 int i;
44 for (i = 0; i < 512; i++) { 74 for (i = 0; i < 512; i++) {
45 cpu_relax(); 75 cpu_relax();
46 if (mutex_trylock(&eb->mutex)) 76 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
47 return 0; 77 return 1;
78 if (need_resched())
79 break;
48 } 80 }
49 cpu_relax();
50 mutex_lock_nested(&eb->mutex, BTRFS_MAX_LEVEL - btrfs_header_level(eb));
51 return 0; 81 return 0;
52} 82}
53 83
54int btrfs_try_tree_lock(struct extent_buffer *eb) 84/*
85 * This is somewhat different from trylock. It will take the
86 * spinlock but if it finds the lock is set to blocking, it will
87 * return without the lock held.
88 *
89 * returns 1 if it was able to take the lock and zero otherwise
90 *
91 * After this call, scheduling is not safe without first calling
92 * btrfs_set_lock_blocking()
93 */
94int btrfs_try_spin_lock(struct extent_buffer *eb)
55{ 95{
56 return mutex_trylock(&eb->mutex); 96 int i;
97
98 spin_nested(eb);
99 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
100 return 1;
101 spin_unlock(&eb->lock);
102
103 /* spin for a bit on the BLOCKING flag */
104 for (i = 0; i < 2; i++) {
105 if (!btrfs_spin_on_block(eb))
106 break;
107
108 spin_nested(eb);
109 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
110 return 1;
111 spin_unlock(&eb->lock);
112 }
113 return 0;
57} 114}
58 115
59int btrfs_tree_unlock(struct extent_buffer *eb) 116/*
117 * the autoremove wake function will return 0 if it tried to wake up
118 * a process that was already awake, which means that process won't
119 * count as an exclusive wakeup. The waitq code will continue waking
120 * procs until it finds one that was actually sleeping.
121 *
122 * For btrfs, this isn't quite what we want. We want a single proc
123 * to be notified that the lock is ready for taking. If that proc
124 * already happen to be awake, great, it will loop around and try for
125 * the lock.
126 *
127 * So, btrfs_wake_function always returns 1, even when the proc that we
128 * tried to wake up was already awake.
129 */
130static int btrfs_wake_function(wait_queue_t *wait, unsigned mode,
131 int sync, void *key)
60{ 132{
61 mutex_unlock(&eb->mutex); 133 autoremove_wake_function(wait, mode, sync, key);
62 return 0; 134 return 1;
63} 135}
64 136
65int btrfs_tree_locked(struct extent_buffer *eb) 137/*
138 * returns with the extent buffer spinlocked.
139 *
140 * This will spin and/or wait as required to take the lock, and then
141 * return with the spinlock held.
142 *
143 * After this call, scheduling is not safe without first calling
144 * btrfs_set_lock_blocking()
145 */
146int btrfs_tree_lock(struct extent_buffer *eb)
66{ 147{
67 return mutex_is_locked(&eb->mutex); 148 DEFINE_WAIT(wait);
149 wait.func = btrfs_wake_function;
150
151 while(1) {
152 spin_nested(eb);
153
154 /* nobody is blocking, exit with the spinlock held */
155 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
156 return 0;
157
158 /*
159 * we have the spinlock, but the real owner is blocking.
160 * wait for them
161 */
162 spin_unlock(&eb->lock);
163
164 /*
165 * spin for a bit, and if the blocking flag goes away,
166 * loop around
167 */
168 if (btrfs_spin_on_block(eb))
169 continue;
170
171 prepare_to_wait_exclusive(&eb->lock_wq, &wait,
172 TASK_UNINTERRUPTIBLE);
173
174 if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
175 schedule();
176
177 finish_wait(&eb->lock_wq, &wait);
178 }
179 return 0;
68} 180}
69 181
70/* 182/*
71 * btrfs_search_slot uses this to decide if it should drop its locks 183 * Very quick trylock, this does not spin or schedule. It returns
72 * before doing something expensive like allocating free blocks for cow. 184 * 1 with the spinlock held if it was able to take the lock, or it
185 * returns zero if it was unable to take the lock.
186 *
187 * After this call, scheduling is not safe without first calling
188 * btrfs_set_lock_blocking()
73 */ 189 */
74int btrfs_path_lock_waiting(struct btrfs_path *path, int level) 190int btrfs_try_tree_lock(struct extent_buffer *eb)
75{ 191{
76 int i; 192 if (spin_trylock(&eb->lock)) {
77 struct extent_buffer *eb; 193 if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) {
78 for (i = level; i <= level + 1 && i < BTRFS_MAX_LEVEL; i++) { 194 /*
79 eb = path->nodes[i]; 195 * we've got the spinlock, but the real owner is
80 if (!eb) 196 * blocking. Drop the spinlock and return failure
81 break; 197 */
82 smp_mb(); 198 spin_unlock(&eb->lock);
83 if (!list_empty(&eb->mutex.wait_list)) 199 return 0;
84 return 1; 200 }
201 return 1;
85 } 202 }
203 /* someone else has the spinlock giveup */
86 return 0; 204 return 0;
87} 205}
88 206
207int btrfs_tree_unlock(struct extent_buffer *eb)
208{
209 /*
210 * if we were a blocking owner, we don't have the spinlock held
211 * just clear the bit and look for waiters
212 */
213 if (test_and_clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
214 smp_mb__after_clear_bit();
215 else
216 spin_unlock(&eb->lock);
217
218 if (waitqueue_active(&eb->lock_wq))
219 wake_up(&eb->lock_wq);
220 return 0;
221}
222
223void btrfs_assert_tree_locked(struct extent_buffer *eb)
224{
225 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
226 assert_spin_locked(&eb->lock);
227}
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index bc1faef12519..6c4ce457168c 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -21,7 +21,11 @@
21 21
22int btrfs_tree_lock(struct extent_buffer *eb); 22int btrfs_tree_lock(struct extent_buffer *eb);
23int btrfs_tree_unlock(struct extent_buffer *eb); 23int btrfs_tree_unlock(struct extent_buffer *eb);
24int btrfs_tree_locked(struct extent_buffer *eb); 24
25int btrfs_try_tree_lock(struct extent_buffer *eb); 25int btrfs_try_tree_lock(struct extent_buffer *eb);
26int btrfs_path_lock_waiting(struct btrfs_path *path, int level); 26int btrfs_try_spin_lock(struct extent_buffer *eb);
27
28void btrfs_set_lock_blocking(struct extent_buffer *eb);
29void btrfs_clear_lock_blocking(struct extent_buffer *eb);
30void btrfs_assert_tree_locked(struct extent_buffer *eb);
27#endif 31#endif
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index a20940170274..77c2411a5f0f 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -613,7 +613,6 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
613 struct btrfs_sector_sum *sector_sums; 613 struct btrfs_sector_sum *sector_sums;
614 struct btrfs_ordered_extent *ordered; 614 struct btrfs_ordered_extent *ordered;
615 struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; 615 struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
616 struct list_head *cur;
617 unsigned long num_sectors; 616 unsigned long num_sectors;
618 unsigned long i; 617 unsigned long i;
619 u32 sectorsize = BTRFS_I(inode)->root->sectorsize; 618 u32 sectorsize = BTRFS_I(inode)->root->sectorsize;
@@ -624,8 +623,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
624 return 1; 623 return 1;
625 624
626 mutex_lock(&tree->mutex); 625 mutex_lock(&tree->mutex);
627 list_for_each_prev(cur, &ordered->list) { 626 list_for_each_entry_reverse(ordered_sum, &ordered->list, list) {
628 ordered_sum = list_entry(cur, struct btrfs_ordered_sum, list);
629 if (disk_bytenr >= ordered_sum->bytenr) { 627 if (disk_bytenr >= ordered_sum->bytenr) {
630 num_sectors = ordered_sum->len / sectorsize; 628 num_sectors = ordered_sum->len / sectorsize;
631 sector_sums = ordered_sum->sums; 629 sector_sums = ordered_sum->sums;
diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c
index 6f0acc4c9eab..d0cc62bccb94 100644
--- a/fs/btrfs/ref-cache.c
+++ b/fs/btrfs/ref-cache.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/sort.h>
20#include "ctree.h" 21#include "ctree.h"
21#include "ref-cache.h" 22#include "ref-cache.h"
22#include "transaction.h" 23#include "transaction.h"
diff --git a/fs/btrfs/ref-cache.h b/fs/btrfs/ref-cache.h
index 16f3183d7c59..bc283ad2db73 100644
--- a/fs/btrfs/ref-cache.h
+++ b/fs/btrfs/ref-cache.h
@@ -73,5 +73,4 @@ int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref,
73int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen, 73int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen,
74 int shared); 74 int shared);
75int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref); 75int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
76
77#endif 76#endif
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index db9fb3bc1e33..19a4daf03ccb 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -37,7 +37,6 @@
37#include <linux/ctype.h> 37#include <linux/ctype.h>
38#include <linux/namei.h> 38#include <linux/namei.h>
39#include <linux/miscdevice.h> 39#include <linux/miscdevice.h>
40#include <linux/version.h>
41#include <linux/magic.h> 40#include <linux/magic.h>
42#include "compat.h" 41#include "compat.h"
43#include "ctree.h" 42#include "ctree.h"
@@ -380,7 +379,6 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
380 btrfs_start_delalloc_inodes(root); 379 btrfs_start_delalloc_inodes(root);
381 btrfs_wait_ordered_extents(root, 0); 380 btrfs_wait_ordered_extents(root, 0);
382 381
383 btrfs_clean_old_snapshots(root);
384 trans = btrfs_start_transaction(root, 1); 382 trans = btrfs_start_transaction(root, 1);
385 ret = btrfs_commit_transaction(trans, root); 383 ret = btrfs_commit_transaction(trans, root);
386 sb->s_dirt = 0; 384 sb->s_dirt = 0;
@@ -512,6 +510,10 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
512 struct btrfs_root *root = btrfs_sb(sb); 510 struct btrfs_root *root = btrfs_sb(sb);
513 int ret; 511 int ret;
514 512
513 ret = btrfs_parse_options(root, data);
514 if (ret)
515 return -EINVAL;
516
515 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) 517 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
516 return 0; 518 return 0;
517 519
@@ -583,17 +585,18 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
583 struct btrfs_ioctl_vol_args *vol; 585 struct btrfs_ioctl_vol_args *vol;
584 struct btrfs_fs_devices *fs_devices; 586 struct btrfs_fs_devices *fs_devices;
585 int ret = -ENOTTY; 587 int ret = -ENOTTY;
586 int len;
587 588
588 if (!capable(CAP_SYS_ADMIN)) 589 if (!capable(CAP_SYS_ADMIN))
589 return -EPERM; 590 return -EPERM;
590 591
591 vol = kmalloc(sizeof(*vol), GFP_KERNEL); 592 vol = kmalloc(sizeof(*vol), GFP_KERNEL);
593 if (!vol)
594 return -ENOMEM;
595
592 if (copy_from_user(vol, (void __user *)arg, sizeof(*vol))) { 596 if (copy_from_user(vol, (void __user *)arg, sizeof(*vol))) {
593 ret = -EFAULT; 597 ret = -EFAULT;
594 goto out; 598 goto out;
595 } 599 }
596 len = strnlen(vol->name, BTRFS_PATH_NAME_MAX);
597 600
598 switch (cmd) { 601 switch (cmd) {
599 case BTRFS_IOC_SCAN_DEV: 602 case BTRFS_IOC_SCAN_DEV:
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 8a08f9443340..4112d53d4f4d 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -688,7 +688,9 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
688 num_bytes -= btrfs_root_used(&dirty->root->root_item); 688 num_bytes -= btrfs_root_used(&dirty->root->root_item);
689 bytes_used = btrfs_root_used(&root->root_item); 689 bytes_used = btrfs_root_used(&root->root_item);
690 if (num_bytes) { 690 if (num_bytes) {
691 mutex_lock(&root->fs_info->trans_mutex);
691 btrfs_record_root_in_trans(root); 692 btrfs_record_root_in_trans(root);
693 mutex_unlock(&root->fs_info->trans_mutex);
692 btrfs_set_root_used(&root->root_item, 694 btrfs_set_root_used(&root->root_item,
693 bytes_used - num_bytes); 695 bytes_used - num_bytes);
694 } 696 }
@@ -852,11 +854,9 @@ static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
852{ 854{
853 struct btrfs_pending_snapshot *pending; 855 struct btrfs_pending_snapshot *pending;
854 struct list_head *head = &trans->transaction->pending_snapshots; 856 struct list_head *head = &trans->transaction->pending_snapshots;
855 struct list_head *cur;
856 int ret; 857 int ret;
857 858
858 list_for_each(cur, head) { 859 list_for_each_entry(pending, head, list) {
859 pending = list_entry(cur, struct btrfs_pending_snapshot, list);
860 ret = create_pending_snapshot(trans, fs_info, pending); 860 ret = create_pending_snapshot(trans, fs_info, pending);
861 BUG_ON(ret); 861 BUG_ON(ret);
862 } 862 }
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 3e8358c36165..98d25fa4570e 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -74,6 +74,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
74 u32 nritems; 74 u32 nritems;
75 75
76 root_node = btrfs_lock_root_node(root); 76 root_node = btrfs_lock_root_node(root);
77 btrfs_set_lock_blocking(root_node);
77 nritems = btrfs_header_nritems(root_node); 78 nritems = btrfs_header_nritems(root_node);
78 root->defrag_max.objectid = 0; 79 root->defrag_max.objectid = 0;
79 /* from above we know this is not a leaf */ 80 /* from above we know this is not a leaf */
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index d81cda2e077c..9c462fbd60fa 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -78,104 +78,6 @@ static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
78 */ 78 */
79 79
80/* 80/*
81 * btrfs_add_log_tree adds a new per-subvolume log tree into the
82 * tree of log tree roots. This must be called with a tree log transaction
83 * running (see start_log_trans).
84 */
85static int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
86 struct btrfs_root *root)
87{
88 struct btrfs_key key;
89 struct btrfs_root_item root_item;
90 struct btrfs_inode_item *inode_item;
91 struct extent_buffer *leaf;
92 struct btrfs_root *new_root = root;
93 int ret;
94 u64 objectid = root->root_key.objectid;
95
96 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
97 BTRFS_TREE_LOG_OBJECTID,
98 trans->transid, 0, 0, 0);
99 if (IS_ERR(leaf)) {
100 ret = PTR_ERR(leaf);
101 return ret;
102 }
103
104 btrfs_set_header_nritems(leaf, 0);
105 btrfs_set_header_level(leaf, 0);
106 btrfs_set_header_bytenr(leaf, leaf->start);
107 btrfs_set_header_generation(leaf, trans->transid);
108 btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID);
109
110 write_extent_buffer(leaf, root->fs_info->fsid,
111 (unsigned long)btrfs_header_fsid(leaf),
112 BTRFS_FSID_SIZE);
113 btrfs_mark_buffer_dirty(leaf);
114
115 inode_item = &root_item.inode;
116 memset(inode_item, 0, sizeof(*inode_item));
117 inode_item->generation = cpu_to_le64(1);
118 inode_item->size = cpu_to_le64(3);
119 inode_item->nlink = cpu_to_le32(1);
120 inode_item->nbytes = cpu_to_le64(root->leafsize);
121 inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
122
123 btrfs_set_root_bytenr(&root_item, leaf->start);
124 btrfs_set_root_generation(&root_item, trans->transid);
125 btrfs_set_root_level(&root_item, 0);
126 btrfs_set_root_refs(&root_item, 0);
127 btrfs_set_root_used(&root_item, 0);
128
129 memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress));
130 root_item.drop_level = 0;
131
132 btrfs_tree_unlock(leaf);
133 free_extent_buffer(leaf);
134 leaf = NULL;
135
136 btrfs_set_root_dirid(&root_item, 0);
137
138 key.objectid = BTRFS_TREE_LOG_OBJECTID;
139 key.offset = objectid;
140 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
141 ret = btrfs_insert_root(trans, root->fs_info->log_root_tree, &key,
142 &root_item);
143 if (ret)
144 goto fail;
145
146 new_root = btrfs_read_fs_root_no_radix(root->fs_info->log_root_tree,
147 &key);
148 BUG_ON(!new_root);
149
150 WARN_ON(root->log_root);
151 root->log_root = new_root;
152
153 /*
154 * log trees do not get reference counted because they go away
155 * before a real commit is actually done. They do store pointers
156 * to file data extents, and those reference counts still get
157 * updated (along with back refs to the log tree).
158 */
159 new_root->ref_cows = 0;
160 new_root->last_trans = trans->transid;
161
162 /*
163 * we need to make sure the root block for this new tree
164 * is marked as dirty in the dirty_log_pages tree. This
165 * is how it gets flushed down to disk at tree log commit time.
166 *
167 * the tree logging mutex keeps others from coming in and changing
168 * the new_root->node, so we can safely access it here
169 */
170 set_extent_dirty(&new_root->dirty_log_pages, new_root->node->start,
171 new_root->node->start + new_root->node->len - 1,
172 GFP_NOFS);
173
174fail:
175 return ret;
176}
177
178/*
179 * start a sub transaction and setup the log tree 81 * start a sub transaction and setup the log tree
180 * this increments the log tree writer count to make the people 82 * this increments the log tree writer count to make the people
181 * syncing the tree wait for us to finish 83 * syncing the tree wait for us to finish
@@ -184,6 +86,14 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
184 struct btrfs_root *root) 86 struct btrfs_root *root)
185{ 87{
186 int ret; 88 int ret;
89
90 mutex_lock(&root->log_mutex);
91 if (root->log_root) {
92 root->log_batch++;
93 atomic_inc(&root->log_writers);
94 mutex_unlock(&root->log_mutex);
95 return 0;
96 }
187 mutex_lock(&root->fs_info->tree_log_mutex); 97 mutex_lock(&root->fs_info->tree_log_mutex);
188 if (!root->fs_info->log_root_tree) { 98 if (!root->fs_info->log_root_tree) {
189 ret = btrfs_init_log_root_tree(trans, root->fs_info); 99 ret = btrfs_init_log_root_tree(trans, root->fs_info);
@@ -193,9 +103,10 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
193 ret = btrfs_add_log_tree(trans, root); 103 ret = btrfs_add_log_tree(trans, root);
194 BUG_ON(ret); 104 BUG_ON(ret);
195 } 105 }
196 atomic_inc(&root->fs_info->tree_log_writers);
197 root->fs_info->tree_log_batch++;
198 mutex_unlock(&root->fs_info->tree_log_mutex); 106 mutex_unlock(&root->fs_info->tree_log_mutex);
107 root->log_batch++;
108 atomic_inc(&root->log_writers);
109 mutex_unlock(&root->log_mutex);
199 return 0; 110 return 0;
200} 111}
201 112
@@ -212,13 +123,12 @@ static int join_running_log_trans(struct btrfs_root *root)
212 if (!root->log_root) 123 if (!root->log_root)
213 return -ENOENT; 124 return -ENOENT;
214 125
215 mutex_lock(&root->fs_info->tree_log_mutex); 126 mutex_lock(&root->log_mutex);
216 if (root->log_root) { 127 if (root->log_root) {
217 ret = 0; 128 ret = 0;
218 atomic_inc(&root->fs_info->tree_log_writers); 129 atomic_inc(&root->log_writers);
219 root->fs_info->tree_log_batch++;
220 } 130 }
221 mutex_unlock(&root->fs_info->tree_log_mutex); 131 mutex_unlock(&root->log_mutex);
222 return ret; 132 return ret;
223} 133}
224 134
@@ -228,10 +138,11 @@ static int join_running_log_trans(struct btrfs_root *root)
228 */ 138 */
229static int end_log_trans(struct btrfs_root *root) 139static int end_log_trans(struct btrfs_root *root)
230{ 140{
231 atomic_dec(&root->fs_info->tree_log_writers); 141 if (atomic_dec_and_test(&root->log_writers)) {
232 smp_mb(); 142 smp_mb();
233 if (waitqueue_active(&root->fs_info->tree_log_wait)) 143 if (waitqueue_active(&root->log_writer_wait))
234 wake_up(&root->fs_info->tree_log_wait); 144 wake_up(&root->log_writer_wait);
145 }
235 return 0; 146 return 0;
236} 147}
237 148
@@ -1704,6 +1615,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1704 1615
1705 btrfs_tree_lock(next); 1616 btrfs_tree_lock(next);
1706 clean_tree_block(trans, root, next); 1617 clean_tree_block(trans, root, next);
1618 btrfs_set_lock_blocking(next);
1707 btrfs_wait_tree_block_writeback(next); 1619 btrfs_wait_tree_block_writeback(next);
1708 btrfs_tree_unlock(next); 1620 btrfs_tree_unlock(next);
1709 1621
@@ -1750,6 +1662,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1750 next = path->nodes[*level]; 1662 next = path->nodes[*level];
1751 btrfs_tree_lock(next); 1663 btrfs_tree_lock(next);
1752 clean_tree_block(trans, root, next); 1664 clean_tree_block(trans, root, next);
1665 btrfs_set_lock_blocking(next);
1753 btrfs_wait_tree_block_writeback(next); 1666 btrfs_wait_tree_block_writeback(next);
1754 btrfs_tree_unlock(next); 1667 btrfs_tree_unlock(next);
1755 1668
@@ -1807,6 +1720,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
1807 1720
1808 btrfs_tree_lock(next); 1721 btrfs_tree_lock(next);
1809 clean_tree_block(trans, root, next); 1722 clean_tree_block(trans, root, next);
1723 btrfs_set_lock_blocking(next);
1810 btrfs_wait_tree_block_writeback(next); 1724 btrfs_wait_tree_block_writeback(next);
1811 btrfs_tree_unlock(next); 1725 btrfs_tree_unlock(next);
1812 1726
@@ -1879,6 +1793,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
1879 1793
1880 btrfs_tree_lock(next); 1794 btrfs_tree_lock(next);
1881 clean_tree_block(trans, log, next); 1795 clean_tree_block(trans, log, next);
1796 btrfs_set_lock_blocking(next);
1882 btrfs_wait_tree_block_writeback(next); 1797 btrfs_wait_tree_block_writeback(next);
1883 btrfs_tree_unlock(next); 1798 btrfs_tree_unlock(next);
1884 1799
@@ -1902,26 +1817,65 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
1902 } 1817 }
1903 } 1818 }
1904 btrfs_free_path(path); 1819 btrfs_free_path(path);
1905 if (wc->free)
1906 free_extent_buffer(log->node);
1907 return ret; 1820 return ret;
1908} 1821}
1909 1822
1910static int wait_log_commit(struct btrfs_root *log) 1823/*
1824 * helper function to update the item for a given subvolumes log root
1825 * in the tree of log roots
1826 */
1827static int update_log_root(struct btrfs_trans_handle *trans,
1828 struct btrfs_root *log)
1829{
1830 int ret;
1831
1832 if (log->log_transid == 1) {
1833 /* insert root item on the first sync */
1834 ret = btrfs_insert_root(trans, log->fs_info->log_root_tree,
1835 &log->root_key, &log->root_item);
1836 } else {
1837 ret = btrfs_update_root(trans, log->fs_info->log_root_tree,
1838 &log->root_key, &log->root_item);
1839 }
1840 return ret;
1841}
1842
1843static int wait_log_commit(struct btrfs_root *root, unsigned long transid)
1911{ 1844{
1912 DEFINE_WAIT(wait); 1845 DEFINE_WAIT(wait);
1913 u64 transid = log->fs_info->tree_log_transid; 1846 int index = transid % 2;
1914 1847
1848 /*
1849 * we only allow two pending log transactions at a time,
1850 * so we know that if ours is more than 2 older than the
1851 * current transaction, we're done
1852 */
1915 do { 1853 do {
1916 prepare_to_wait(&log->fs_info->tree_log_wait, &wait, 1854 prepare_to_wait(&root->log_commit_wait[index],
1917 TASK_UNINTERRUPTIBLE); 1855 &wait, TASK_UNINTERRUPTIBLE);
1918 mutex_unlock(&log->fs_info->tree_log_mutex); 1856 mutex_unlock(&root->log_mutex);
1919 if (atomic_read(&log->fs_info->tree_log_commit)) 1857 if (root->log_transid < transid + 2 &&
1858 atomic_read(&root->log_commit[index]))
1920 schedule(); 1859 schedule();
1921 finish_wait(&log->fs_info->tree_log_wait, &wait); 1860 finish_wait(&root->log_commit_wait[index], &wait);
1922 mutex_lock(&log->fs_info->tree_log_mutex); 1861 mutex_lock(&root->log_mutex);
1923 } while (transid == log->fs_info->tree_log_transid && 1862 } while (root->log_transid < transid + 2 &&
1924 atomic_read(&log->fs_info->tree_log_commit)); 1863 atomic_read(&root->log_commit[index]));
1864 return 0;
1865}
1866
1867static int wait_for_writer(struct btrfs_root *root)
1868{
1869 DEFINE_WAIT(wait);
1870 while (atomic_read(&root->log_writers)) {
1871 prepare_to_wait(&root->log_writer_wait,
1872 &wait, TASK_UNINTERRUPTIBLE);
1873 mutex_unlock(&root->log_mutex);
1874 if (atomic_read(&root->log_writers))
1875 schedule();
1876 mutex_lock(&root->log_mutex);
1877 finish_wait(&root->log_writer_wait, &wait);
1878 }
1925 return 0; 1879 return 0;
1926} 1880}
1927 1881
@@ -1933,57 +1887,114 @@ static int wait_log_commit(struct btrfs_root *log)
1933int btrfs_sync_log(struct btrfs_trans_handle *trans, 1887int btrfs_sync_log(struct btrfs_trans_handle *trans,
1934 struct btrfs_root *root) 1888 struct btrfs_root *root)
1935{ 1889{
1890 int index1;
1891 int index2;
1936 int ret; 1892 int ret;
1937 unsigned long batch;
1938 struct btrfs_root *log = root->log_root; 1893 struct btrfs_root *log = root->log_root;
1894 struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
1939 1895
1940 mutex_lock(&log->fs_info->tree_log_mutex); 1896 mutex_lock(&root->log_mutex);
1941 if (atomic_read(&log->fs_info->tree_log_commit)) { 1897 index1 = root->log_transid % 2;
1942 wait_log_commit(log); 1898 if (atomic_read(&root->log_commit[index1])) {
1943 goto out; 1899 wait_log_commit(root, root->log_transid);
1900 mutex_unlock(&root->log_mutex);
1901 return 0;
1944 } 1902 }
1945 atomic_set(&log->fs_info->tree_log_commit, 1); 1903 atomic_set(&root->log_commit[index1], 1);
1904
1905 /* wait for previous tree log sync to complete */
1906 if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
1907 wait_log_commit(root, root->log_transid - 1);
1946 1908
1947 while (1) { 1909 while (1) {
1948 batch = log->fs_info->tree_log_batch; 1910 unsigned long batch = root->log_batch;
1949 mutex_unlock(&log->fs_info->tree_log_mutex); 1911 mutex_unlock(&root->log_mutex);
1950 schedule_timeout_uninterruptible(1); 1912 schedule_timeout_uninterruptible(1);
1951 mutex_lock(&log->fs_info->tree_log_mutex); 1913 mutex_lock(&root->log_mutex);
1952 1914 wait_for_writer(root);
1953 while (atomic_read(&log->fs_info->tree_log_writers)) { 1915 if (batch == root->log_batch)
1954 DEFINE_WAIT(wait);
1955 prepare_to_wait(&log->fs_info->tree_log_wait, &wait,
1956 TASK_UNINTERRUPTIBLE);
1957 mutex_unlock(&log->fs_info->tree_log_mutex);
1958 if (atomic_read(&log->fs_info->tree_log_writers))
1959 schedule();
1960 mutex_lock(&log->fs_info->tree_log_mutex);
1961 finish_wait(&log->fs_info->tree_log_wait, &wait);
1962 }
1963 if (batch == log->fs_info->tree_log_batch)
1964 break; 1916 break;
1965 } 1917 }
1966 1918
1967 ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages); 1919 ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages);
1968 BUG_ON(ret); 1920 BUG_ON(ret);
1969 ret = btrfs_write_and_wait_marked_extents(root->fs_info->log_root_tree, 1921
1970 &root->fs_info->log_root_tree->dirty_log_pages); 1922 btrfs_set_root_bytenr(&log->root_item, log->node->start);
1923 btrfs_set_root_generation(&log->root_item, trans->transid);
1924 btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node));
1925
1926 root->log_batch = 0;
1927 root->log_transid++;
1928 log->log_transid = root->log_transid;
1929 smp_mb();
1930 /*
1931 * log tree has been flushed to disk, new modifications of
1932 * the log will be written to new positions. so it's safe to
1933 * allow log writers to go in.
1934 */
1935 mutex_unlock(&root->log_mutex);
1936
1937 mutex_lock(&log_root_tree->log_mutex);
1938 log_root_tree->log_batch++;
1939 atomic_inc(&log_root_tree->log_writers);
1940 mutex_unlock(&log_root_tree->log_mutex);
1941
1942 ret = update_log_root(trans, log);
1943 BUG_ON(ret);
1944
1945 mutex_lock(&log_root_tree->log_mutex);
1946 if (atomic_dec_and_test(&log_root_tree->log_writers)) {
1947 smp_mb();
1948 if (waitqueue_active(&log_root_tree->log_writer_wait))
1949 wake_up(&log_root_tree->log_writer_wait);
1950 }
1951
1952 index2 = log_root_tree->log_transid % 2;
1953 if (atomic_read(&log_root_tree->log_commit[index2])) {
1954 wait_log_commit(log_root_tree, log_root_tree->log_transid);
1955 mutex_unlock(&log_root_tree->log_mutex);
1956 goto out;
1957 }
1958 atomic_set(&log_root_tree->log_commit[index2], 1);
1959
1960 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2]))
1961 wait_log_commit(log_root_tree, log_root_tree->log_transid - 1);
1962
1963 wait_for_writer(log_root_tree);
1964
1965 ret = btrfs_write_and_wait_marked_extents(log_root_tree,
1966 &log_root_tree->dirty_log_pages);
1971 BUG_ON(ret); 1967 BUG_ON(ret);
1972 1968
1973 btrfs_set_super_log_root(&root->fs_info->super_for_commit, 1969 btrfs_set_super_log_root(&root->fs_info->super_for_commit,
1974 log->fs_info->log_root_tree->node->start); 1970 log_root_tree->node->start);
1975 btrfs_set_super_log_root_level(&root->fs_info->super_for_commit, 1971 btrfs_set_super_log_root_level(&root->fs_info->super_for_commit,
1976 btrfs_header_level(log->fs_info->log_root_tree->node)); 1972 btrfs_header_level(log_root_tree->node));
1973
1974 log_root_tree->log_batch = 0;
1975 log_root_tree->log_transid++;
1976 smp_mb();
1977
1978 mutex_unlock(&log_root_tree->log_mutex);
1979
1980 /*
1981 * nobody else is going to jump in and write the the ctree
1982 * super here because the log_commit atomic below is protecting
1983 * us. We must be called with a transaction handle pinning
1984 * the running transaction open, so a full commit can't hop
1985 * in and cause problems either.
1986 */
1987 write_ctree_super(trans, root->fs_info->tree_root, 2);
1977 1988
1978 write_ctree_super(trans, log->fs_info->tree_root, 2); 1989 atomic_set(&log_root_tree->log_commit[index2], 0);
1979 log->fs_info->tree_log_transid++;
1980 log->fs_info->tree_log_batch = 0;
1981 atomic_set(&log->fs_info->tree_log_commit, 0);
1982 smp_mb(); 1990 smp_mb();
1983 if (waitqueue_active(&log->fs_info->tree_log_wait)) 1991 if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
1984 wake_up(&log->fs_info->tree_log_wait); 1992 wake_up(&log_root_tree->log_commit_wait[index2]);
1985out: 1993out:
1986 mutex_unlock(&log->fs_info->tree_log_mutex); 1994 atomic_set(&root->log_commit[index1], 0);
1995 smp_mb();
1996 if (waitqueue_active(&root->log_commit_wait[index1]))
1997 wake_up(&root->log_commit_wait[index1]);
1987 return 0; 1998 return 0;
1988} 1999}
1989 2000
@@ -2019,38 +2030,18 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2019 start, end, GFP_NOFS); 2030 start, end, GFP_NOFS);
2020 } 2031 }
2021 2032
2022 log = root->log_root; 2033 if (log->log_transid > 0) {
2023 ret = btrfs_del_root(trans, root->fs_info->log_root_tree, 2034 ret = btrfs_del_root(trans, root->fs_info->log_root_tree,
2024 &log->root_key); 2035 &log->root_key);
2025 BUG_ON(ret); 2036 BUG_ON(ret);
2037 }
2026 root->log_root = NULL; 2038 root->log_root = NULL;
2027 kfree(root->log_root); 2039 free_extent_buffer(log->node);
2040 kfree(log);
2028 return 0; 2041 return 0;
2029} 2042}
2030 2043
2031/* 2044/*
2032 * helper function to update the item for a given subvolumes log root
2033 * in the tree of log roots
2034 */
2035static int update_log_root(struct btrfs_trans_handle *trans,
2036 struct btrfs_root *log)
2037{
2038 u64 bytenr = btrfs_root_bytenr(&log->root_item);
2039 int ret;
2040
2041 if (log->node->start == bytenr)
2042 return 0;
2043
2044 btrfs_set_root_bytenr(&log->root_item, log->node->start);
2045 btrfs_set_root_generation(&log->root_item, trans->transid);
2046 btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node));
2047 ret = btrfs_update_root(trans, log->fs_info->log_root_tree,
2048 &log->root_key, &log->root_item);
2049 BUG_ON(ret);
2050 return ret;
2051}
2052
2053/*
2054 * If both a file and directory are logged, and unlinks or renames are 2045 * If both a file and directory are logged, and unlinks or renames are
2055 * mixed in, we have a few interesting corners: 2046 * mixed in, we have a few interesting corners:
2056 * 2047 *
@@ -2711,11 +2702,6 @@ next_slot:
2711 2702
2712 btrfs_free_path(path); 2703 btrfs_free_path(path);
2713 btrfs_free_path(dst_path); 2704 btrfs_free_path(dst_path);
2714
2715 mutex_lock(&root->fs_info->tree_log_mutex);
2716 ret = update_log_root(trans, log);
2717 BUG_ON(ret);
2718 mutex_unlock(&root->fs_info->tree_log_mutex);
2719out: 2705out:
2720 return 0; 2706 return 0;
2721} 2707}
@@ -2846,7 +2832,9 @@ again:
2846 BUG_ON(!wc.replay_dest); 2832 BUG_ON(!wc.replay_dest);
2847 2833
2848 wc.replay_dest->log_root = log; 2834 wc.replay_dest->log_root = log;
2835 mutex_lock(&fs_info->trans_mutex);
2849 btrfs_record_root_in_trans(wc.replay_dest); 2836 btrfs_record_root_in_trans(wc.replay_dest);
2837 mutex_unlock(&fs_info->trans_mutex);
2850 ret = walk_log_tree(trans, log, &wc); 2838 ret = walk_log_tree(trans, log, &wc);
2851 BUG_ON(ret); 2839 BUG_ON(ret);
2852 2840
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 3451e1cca2b5..1316139bf9e8 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -20,7 +20,6 @@
20#include <linux/buffer_head.h> 20#include <linux/buffer_head.h>
21#include <linux/blkdev.h> 21#include <linux/blkdev.h>
22#include <linux/random.h> 22#include <linux/random.h>
23#include <linux/version.h>
24#include <asm/div64.h> 23#include <asm/div64.h>
25#include "compat.h" 24#include "compat.h"
26#include "ctree.h" 25#include "ctree.h"
@@ -104,10 +103,8 @@ static noinline struct btrfs_device *__find_device(struct list_head *head,
104 u64 devid, u8 *uuid) 103 u64 devid, u8 *uuid)
105{ 104{
106 struct btrfs_device *dev; 105 struct btrfs_device *dev;
107 struct list_head *cur;
108 106
109 list_for_each(cur, head) { 107 list_for_each_entry(dev, head, dev_list) {
110 dev = list_entry(cur, struct btrfs_device, dev_list);
111 if (dev->devid == devid && 108 if (dev->devid == devid &&
112 (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) { 109 (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
113 return dev; 110 return dev;
@@ -118,11 +115,9 @@ static noinline struct btrfs_device *__find_device(struct list_head *head,
118 115
119static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid) 116static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
120{ 117{
121 struct list_head *cur;
122 struct btrfs_fs_devices *fs_devices; 118 struct btrfs_fs_devices *fs_devices;
123 119
124 list_for_each(cur, &fs_uuids) { 120 list_for_each_entry(fs_devices, &fs_uuids, list) {
125 fs_devices = list_entry(cur, struct btrfs_fs_devices, list);
126 if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0) 121 if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
127 return fs_devices; 122 return fs_devices;
128 } 123 }
@@ -159,6 +154,7 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
159loop: 154loop:
160 spin_lock(&device->io_lock); 155 spin_lock(&device->io_lock);
161 156
157loop_lock:
162 /* take all the bios off the list at once and process them 158 /* take all the bios off the list at once and process them
163 * later on (without the lock held). But, remember the 159 * later on (without the lock held). But, remember the
164 * tail and other pointers so the bios can be properly reinserted 160 * tail and other pointers so the bios can be properly reinserted
@@ -208,7 +204,7 @@ loop:
208 * is now congested. Back off and let other work structs 204 * is now congested. Back off and let other work structs
209 * run instead 205 * run instead
210 */ 206 */
211 if (pending && bdi_write_congested(bdi) && 207 if (pending && bdi_write_congested(bdi) && num_run > 16 &&
212 fs_info->fs_devices->open_devices > 1) { 208 fs_info->fs_devices->open_devices > 1) {
213 struct bio *old_head; 209 struct bio *old_head;
214 210
@@ -220,7 +216,8 @@ loop:
220 tail->bi_next = old_head; 216 tail->bi_next = old_head;
221 else 217 else
222 device->pending_bio_tail = tail; 218 device->pending_bio_tail = tail;
223 device->running_pending = 0; 219
220 device->running_pending = 1;
224 221
225 spin_unlock(&device->io_lock); 222 spin_unlock(&device->io_lock);
226 btrfs_requeue_work(&device->work); 223 btrfs_requeue_work(&device->work);
@@ -229,6 +226,11 @@ loop:
229 } 226 }
230 if (again) 227 if (again)
231 goto loop; 228 goto loop;
229
230 spin_lock(&device->io_lock);
231 if (device->pending_bios)
232 goto loop_lock;
233 spin_unlock(&device->io_lock);
232done: 234done:
233 return 0; 235 return 0;
234} 236}
@@ -345,14 +347,11 @@ error:
345 347
346int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) 348int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
347{ 349{
348 struct list_head *tmp; 350 struct btrfs_device *device, *next;
349 struct list_head *cur;
350 struct btrfs_device *device;
351 351
352 mutex_lock(&uuid_mutex); 352 mutex_lock(&uuid_mutex);
353again: 353again:
354 list_for_each_safe(cur, tmp, &fs_devices->devices) { 354 list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
355 device = list_entry(cur, struct btrfs_device, dev_list);
356 if (device->in_fs_metadata) 355 if (device->in_fs_metadata)
357 continue; 356 continue;
358 357
@@ -383,14 +382,12 @@ again:
383 382
384static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) 383static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
385{ 384{
386 struct list_head *cur;
387 struct btrfs_device *device; 385 struct btrfs_device *device;
388 386
389 if (--fs_devices->opened > 0) 387 if (--fs_devices->opened > 0)
390 return 0; 388 return 0;
391 389
392 list_for_each(cur, &fs_devices->devices) { 390 list_for_each_entry(device, &fs_devices->devices, dev_list) {
393 device = list_entry(cur, struct btrfs_device, dev_list);
394 if (device->bdev) { 391 if (device->bdev) {
395 close_bdev_exclusive(device->bdev, device->mode); 392 close_bdev_exclusive(device->bdev, device->mode);
396 fs_devices->open_devices--; 393 fs_devices->open_devices--;
@@ -439,7 +436,6 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
439{ 436{
440 struct block_device *bdev; 437 struct block_device *bdev;
441 struct list_head *head = &fs_devices->devices; 438 struct list_head *head = &fs_devices->devices;
442 struct list_head *cur;
443 struct btrfs_device *device; 439 struct btrfs_device *device;
444 struct block_device *latest_bdev = NULL; 440 struct block_device *latest_bdev = NULL;
445 struct buffer_head *bh; 441 struct buffer_head *bh;
@@ -450,8 +446,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
450 int seeding = 1; 446 int seeding = 1;
451 int ret = 0; 447 int ret = 0;
452 448
453 list_for_each(cur, head) { 449 list_for_each_entry(device, head, dev_list) {
454 device = list_entry(cur, struct btrfs_device, dev_list);
455 if (device->bdev) 450 if (device->bdev)
456 continue; 451 continue;
457 if (!device->name) 452 if (!device->name)
@@ -578,7 +573,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
578 *(unsigned long long *)disk_super->fsid, 573 *(unsigned long long *)disk_super->fsid,
579 *(unsigned long long *)(disk_super->fsid + 8)); 574 *(unsigned long long *)(disk_super->fsid + 8));
580 } 575 }
581 printk(KERN_INFO "devid %llu transid %llu %s\n", 576 printk(KERN_CONT "devid %llu transid %llu %s\n",
582 (unsigned long long)devid, (unsigned long long)transid, path); 577 (unsigned long long)devid, (unsigned long long)transid, path);
583 ret = device_list_add(path, disk_super, devid, fs_devices_ret); 578 ret = device_list_add(path, disk_super, devid, fs_devices_ret);
584 579
@@ -1017,14 +1012,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1017 } 1012 }
1018 1013
1019 if (strcmp(device_path, "missing") == 0) { 1014 if (strcmp(device_path, "missing") == 0) {
1020 struct list_head *cur;
1021 struct list_head *devices; 1015 struct list_head *devices;
1022 struct btrfs_device *tmp; 1016 struct btrfs_device *tmp;
1023 1017
1024 device = NULL; 1018 device = NULL;
1025 devices = &root->fs_info->fs_devices->devices; 1019 devices = &root->fs_info->fs_devices->devices;
1026 list_for_each(cur, devices) { 1020 list_for_each_entry(tmp, devices, dev_list) {
1027 tmp = list_entry(cur, struct btrfs_device, dev_list);
1028 if (tmp->in_fs_metadata && !tmp->bdev) { 1021 if (tmp->in_fs_metadata && !tmp->bdev) {
1029 device = tmp; 1022 device = tmp;
1030 break; 1023 break;
@@ -1280,7 +1273,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1280 struct btrfs_trans_handle *trans; 1273 struct btrfs_trans_handle *trans;
1281 struct btrfs_device *device; 1274 struct btrfs_device *device;
1282 struct block_device *bdev; 1275 struct block_device *bdev;
1283 struct list_head *cur;
1284 struct list_head *devices; 1276 struct list_head *devices;
1285 struct super_block *sb = root->fs_info->sb; 1277 struct super_block *sb = root->fs_info->sb;
1286 u64 total_bytes; 1278 u64 total_bytes;
@@ -1304,8 +1296,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1304 mutex_lock(&root->fs_info->volume_mutex); 1296 mutex_lock(&root->fs_info->volume_mutex);
1305 1297
1306 devices = &root->fs_info->fs_devices->devices; 1298 devices = &root->fs_info->fs_devices->devices;
1307 list_for_each(cur, devices) { 1299 list_for_each_entry(device, devices, dev_list) {
1308 device = list_entry(cur, struct btrfs_device, dev_list);
1309 if (device->bdev == bdev) { 1300 if (device->bdev == bdev) {
1310 ret = -EEXIST; 1301 ret = -EEXIST;
1311 goto error; 1302 goto error;
@@ -1704,7 +1695,6 @@ static u64 div_factor(u64 num, int factor)
1704int btrfs_balance(struct btrfs_root *dev_root) 1695int btrfs_balance(struct btrfs_root *dev_root)
1705{ 1696{
1706 int ret; 1697 int ret;
1707 struct list_head *cur;
1708 struct list_head *devices = &dev_root->fs_info->fs_devices->devices; 1698 struct list_head *devices = &dev_root->fs_info->fs_devices->devices;
1709 struct btrfs_device *device; 1699 struct btrfs_device *device;
1710 u64 old_size; 1700 u64 old_size;
@@ -1723,8 +1713,7 @@ int btrfs_balance(struct btrfs_root *dev_root)
1723 dev_root = dev_root->fs_info->dev_root; 1713 dev_root = dev_root->fs_info->dev_root;
1724 1714
1725 /* step one make some room on all the devices */ 1715 /* step one make some room on all the devices */
1726 list_for_each(cur, devices) { 1716 list_for_each_entry(device, devices, dev_list) {
1727 device = list_entry(cur, struct btrfs_device, dev_list);
1728 old_size = device->total_bytes; 1717 old_size = device->total_bytes;
1729 size_to_free = div_factor(old_size, 1); 1718 size_to_free = div_factor(old_size, 1);
1730 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024); 1719 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
@@ -2905,10 +2894,6 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
2905 free_extent_map(em); 2894 free_extent_map(em);
2906 } 2895 }
2907 2896
2908 map = kzalloc(sizeof(*map), GFP_NOFS);
2909 if (!map)
2910 return -ENOMEM;
2911
2912 em = alloc_extent_map(GFP_NOFS); 2897 em = alloc_extent_map(GFP_NOFS);
2913 if (!em) 2898 if (!em)
2914 return -ENOMEM; 2899 return -ENOMEM;
@@ -3117,6 +3102,8 @@ int btrfs_read_sys_array(struct btrfs_root *root)
3117 if (!sb) 3102 if (!sb)
3118 return -ENOMEM; 3103 return -ENOMEM;
3119 btrfs_set_buffer_uptodate(sb); 3104 btrfs_set_buffer_uptodate(sb);
3105 btrfs_set_buffer_lockdep_class(sb, 0);
3106
3120 write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); 3107 write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
3121 array_size = btrfs_super_sys_array_size(super_copy); 3108 array_size = btrfs_super_sys_array_size(super_copy);
3122 3109
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 7f332e270894..a9d3bf4d2689 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -21,6 +21,7 @@
21#include <linux/slab.h> 21#include <linux/slab.h>
22#include <linux/rwsem.h> 22#include <linux/rwsem.h>
23#include <linux/xattr.h> 23#include <linux/xattr.h>
24#include <linux/security.h>
24#include "ctree.h" 25#include "ctree.h"
25#include "btrfs_inode.h" 26#include "btrfs_inode.h"
26#include "transaction.h" 27#include "transaction.h"
@@ -45,9 +46,12 @@ ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
45 /* lookup the xattr by name */ 46 /* lookup the xattr by name */
46 di = btrfs_lookup_xattr(NULL, root, path, inode->i_ino, name, 47 di = btrfs_lookup_xattr(NULL, root, path, inode->i_ino, name,
47 strlen(name), 0); 48 strlen(name), 0);
48 if (!di || IS_ERR(di)) { 49 if (!di) {
49 ret = -ENODATA; 50 ret = -ENODATA;
50 goto out; 51 goto out;
52 } else if (IS_ERR(di)) {
53 ret = PTR_ERR(di);
54 goto out;
51 } 55 }
52 56
53 leaf = path->nodes[0]; 57 leaf = path->nodes[0];
@@ -62,6 +66,14 @@ ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
62 ret = -ERANGE; 66 ret = -ERANGE;
63 goto out; 67 goto out;
64 } 68 }
69
70 /*
71 * The way things are packed into the leaf is like this
72 * |struct btrfs_dir_item|name|data|
73 * where name is the xattr name, so security.foo, and data is the
74 * content of the xattr. data_ptr points to the location in memory
75 * where the data starts in the in memory leaf
76 */
65 data_ptr = (unsigned long)((char *)(di + 1) + 77 data_ptr = (unsigned long)((char *)(di + 1) +
66 btrfs_dir_name_len(leaf, di)); 78 btrfs_dir_name_len(leaf, di));
67 read_extent_buffer(leaf, buffer, data_ptr, 79 read_extent_buffer(leaf, buffer, data_ptr,
@@ -86,7 +98,7 @@ int __btrfs_setxattr(struct inode *inode, const char *name,
86 if (!path) 98 if (!path)
87 return -ENOMEM; 99 return -ENOMEM;
88 100
89 trans = btrfs_start_transaction(root, 1); 101 trans = btrfs_join_transaction(root, 1);
90 btrfs_set_trans_block_group(trans, inode); 102 btrfs_set_trans_block_group(trans, inode);
91 103
92 /* first lets see if we already have this xattr */ 104 /* first lets see if we already have this xattr */
@@ -176,7 +188,6 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
176 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 188 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
177 if (ret < 0) 189 if (ret < 0)
178 goto err; 190 goto err;
179 ret = 0;
180 advance = 0; 191 advance = 0;
181 while (1) { 192 while (1) {
182 leaf = path->nodes[0]; 193 leaf = path->nodes[0];
@@ -320,3 +331,34 @@ int btrfs_removexattr(struct dentry *dentry, const char *name)
320 return -EOPNOTSUPP; 331 return -EOPNOTSUPP;
321 return __btrfs_setxattr(dentry->d_inode, name, NULL, 0, XATTR_REPLACE); 332 return __btrfs_setxattr(dentry->d_inode, name, NULL, 0, XATTR_REPLACE);
322} 333}
334
335int btrfs_xattr_security_init(struct inode *inode, struct inode *dir)
336{
337 int err;
338 size_t len;
339 void *value;
340 char *suffix;
341 char *name;
342
343 err = security_inode_init_security(inode, dir, &suffix, &value, &len);
344 if (err) {
345 if (err == -EOPNOTSUPP)
346 return 0;
347 return err;
348 }
349
350 name = kmalloc(XATTR_SECURITY_PREFIX_LEN + strlen(suffix) + 1,
351 GFP_NOFS);
352 if (!name) {
353 err = -ENOMEM;
354 } else {
355 strcpy(name, XATTR_SECURITY_PREFIX);
356 strcpy(name + XATTR_SECURITY_PREFIX_LEN, suffix);
357 err = __btrfs_setxattr(inode, name, value, len, 0);
358 kfree(name);
359 }
360
361 kfree(suffix);
362 kfree(value);
363 return err;
364}
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
index 5b1d08f8e68d..c71e9c3cf3f7 100644
--- a/fs/btrfs/xattr.h
+++ b/fs/btrfs/xattr.h
@@ -36,4 +36,6 @@ extern int btrfs_setxattr(struct dentry *dentry, const char *name,
36 const void *value, size_t size, int flags); 36 const void *value, size_t size, int flags);
37extern int btrfs_removexattr(struct dentry *dentry, const char *name); 37extern int btrfs_removexattr(struct dentry *dentry, const char *name);
38 38
39extern int btrfs_xattr_security_init(struct inode *inode, struct inode *dir);
40
39#endif /* __XATTR__ */ 41#endif /* __XATTR__ */
diff --git a/fs/buffer.c b/fs/buffer.c
index b58208f1640a..9f697419ed8e 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -777,6 +777,7 @@ static int __set_page_dirty(struct page *page,
777 __inc_zone_page_state(page, NR_FILE_DIRTY); 777 __inc_zone_page_state(page, NR_FILE_DIRTY);
778 __inc_bdi_stat(mapping->backing_dev_info, 778 __inc_bdi_stat(mapping->backing_dev_info,
779 BDI_RECLAIMABLE); 779 BDI_RECLAIMABLE);
780 task_dirty_inc(current);
780 task_io_account_write(PAGE_CACHE_SIZE); 781 task_io_account_write(PAGE_CACHE_SIZE);
781 } 782 }
782 radix_tree_tag_set(&mapping->page_tree, 783 radix_tree_tag_set(&mapping->page_tree,
@@ -2688,7 +2689,7 @@ int nobh_write_end(struct file *file, struct address_space *mapping,
2688 struct buffer_head *bh; 2689 struct buffer_head *bh;
2689 BUG_ON(fsdata != NULL && page_has_buffers(page)); 2690 BUG_ON(fsdata != NULL && page_has_buffers(page));
2690 2691
2691 if (unlikely(copied < len) && !page_has_buffers(page)) 2692 if (unlikely(copied < len) && head)
2692 attach_nobh_buffers(page, head); 2693 attach_nobh_buffers(page, head);
2693 if (page_has_buffers(page)) 2694 if (page_has_buffers(page))
2694 return generic_write_end(file, mapping, pos, len, 2695 return generic_write_end(file, mapping, pos, len,
@@ -3108,7 +3109,7 @@ int sync_dirty_buffer(struct buffer_head *bh)
3108 if (test_clear_buffer_dirty(bh)) { 3109 if (test_clear_buffer_dirty(bh)) {
3109 get_bh(bh); 3110 get_bh(bh);
3110 bh->b_end_io = end_buffer_write_sync; 3111 bh->b_end_io = end_buffer_write_sync;
3111 ret = submit_bh(WRITE_SYNC, bh); 3112 ret = submit_bh(WRITE, bh);
3112 wait_on_buffer(bh); 3113 wait_on_buffer(bh);
3113 if (buffer_eopnotsupp(bh)) { 3114 if (buffer_eopnotsupp(bh)) {
3114 clear_buffer_eopnotsupp(bh); 3115 clear_buffer_eopnotsupp(bh);
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 73ac7ebd1dfc..851388fafc73 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -1,3 +1,13 @@
1Version 1.57
2------------
3Improve support for multiple security contexts to the same server. We
4used to use the same "vcnumber" for all connections which could cause
5the server to treat subsequent connections, especially those that
6are authenticated as guest, as reconnections, invalidating the earlier
7user's smb session. This fix allows cifs to mount multiple times to the
8same server with different userids without risking invalidating earlier
9established security contexts.
10
1Version 1.56 11Version 1.56
2------------ 12------------
3Add "forcemandatorylock" mount option to allow user to use mandatory 13Add "forcemandatorylock" mount option to allow user to use mandatory
@@ -7,7 +17,10 @@ specified and user does not have access to query information about the
7top of the share. Fix problem in 2.6.28 resolving DFS paths to 17top of the share. Fix problem in 2.6.28 resolving DFS paths to
8Samba servers (worked to Windows). Fix rmdir so that pending search 18Samba servers (worked to Windows). Fix rmdir so that pending search
9(readdir) requests do not get invalid results which include the now 19(readdir) requests do not get invalid results which include the now
10removed directory. 20removed directory. Fix oops in cifs_dfs_ref.c when prefixpath is not reachable
21when using DFS. Add better file create support to servers which support
22the CIFS POSIX protocol extensions (this adds support for new flags
23on create, and improves semantics for write of locked ranges).
11 24
12Version 1.55 25Version 1.55
13------------ 26------------
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 7ac481841f87..2b1d28a9ee28 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -100,5 +100,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
100extern const struct export_operations cifs_export_ops; 100extern const struct export_operations cifs_export_ops;
101#endif /* EXPERIMENTAL */ 101#endif /* EXPERIMENTAL */
102 102
103#define CIFS_VERSION "1.56" 103#define CIFS_VERSION "1.57"
104#endif /* _CIFSFS_H */ 104#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 94c1ca0ec953..e004f6db5fc8 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -164,9 +164,12 @@ struct TCP_Server_Info {
164 /* multiplexed reads or writes */ 164 /* multiplexed reads or writes */
165 unsigned int maxBuf; /* maxBuf specifies the maximum */ 165 unsigned int maxBuf; /* maxBuf specifies the maximum */
166 /* message size the server can send or receive for non-raw SMBs */ 166 /* message size the server can send or receive for non-raw SMBs */
167 unsigned int maxRw; /* maxRw specifies the maximum */ 167 unsigned int max_rw; /* maxRw specifies the maximum */
168 /* message size the server can send or receive for */ 168 /* message size the server can send or receive for */
169 /* SMB_COM_WRITE_RAW or SMB_COM_READ_RAW. */ 169 /* SMB_COM_WRITE_RAW or SMB_COM_READ_RAW. */
170 unsigned int max_vcs; /* maximum number of smb sessions, at least
171 those that can be specified uniquely with
172 vcnumbers */
170 char sessid[4]; /* unique token id for this session */ 173 char sessid[4]; /* unique token id for this session */
171 /* (returned on Negotiate */ 174 /* (returned on Negotiate */
172 int capabilities; /* allow selective disabling of caps by smb sess */ 175 int capabilities; /* allow selective disabling of caps by smb sess */
@@ -210,6 +213,7 @@ struct cifsSesInfo {
210 unsigned overrideSecFlg; /* if non-zero override global sec flags */ 213 unsigned overrideSecFlg; /* if non-zero override global sec flags */
211 __u16 ipc_tid; /* special tid for connection to IPC share */ 214 __u16 ipc_tid; /* special tid for connection to IPC share */
212 __u16 flags; 215 __u16 flags;
216 __u16 vcnum;
213 char *serverOS; /* name of operating system underlying server */ 217 char *serverOS; /* name of operating system underlying server */
214 char *serverNOS; /* name of network operating system of server */ 218 char *serverNOS; /* name of network operating system of server */
215 char *serverDomain; /* security realm of server */ 219 char *serverDomain; /* security realm of server */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 382ba6298809..083dfc57c7a3 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -42,6 +42,7 @@ extern void _FreeXid(unsigned int);
42#define GetXid() (int)_GetXid(); cFYI(1,("CIFS VFS: in %s as Xid: %d with uid: %d",__func__, xid,current_fsuid())); 42#define GetXid() (int)_GetXid(); cFYI(1,("CIFS VFS: in %s as Xid: %d with uid: %d",__func__, xid,current_fsuid()));
43#define FreeXid(curr_xid) {_FreeXid(curr_xid); cFYI(1,("CIFS VFS: leaving %s (xid = %d) rc = %d",__func__,curr_xid,(int)rc));} 43#define FreeXid(curr_xid) {_FreeXid(curr_xid); cFYI(1,("CIFS VFS: leaving %s (xid = %d) rc = %d",__func__,curr_xid,(int)rc));}
44extern char *build_path_from_dentry(struct dentry *); 44extern char *build_path_from_dentry(struct dentry *);
45extern char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb);
45extern char *build_wildcard_path_from_dentry(struct dentry *direntry); 46extern char *build_wildcard_path_from_dentry(struct dentry *direntry);
46/* extern void renew_parental_timestamps(struct dentry *direntry);*/ 47/* extern void renew_parental_timestamps(struct dentry *direntry);*/
47extern int SendReceive(const unsigned int /* xid */ , struct cifsSesInfo *, 48extern int SendReceive(const unsigned int /* xid */ , struct cifsSesInfo *,
@@ -91,6 +92,9 @@ extern u64 cifs_UnixTimeToNT(struct timespec);
91extern __le64 cnvrtDosCifsTm(__u16 date, __u16 time); 92extern __le64 cnvrtDosCifsTm(__u16 date, __u16 time);
92extern struct timespec cnvrtDosUnixTm(__u16 date, __u16 time); 93extern struct timespec cnvrtDosUnixTm(__u16 date, __u16 time);
93 94
95extern void posix_fill_in_inode(struct inode *tmp_inode,
96 FILE_UNIX_BASIC_INFO *pData, int isNewInode);
97extern struct inode *cifs_new_inode(struct super_block *sb, __u64 *inum);
94extern int cifs_get_inode_info(struct inode **pinode, 98extern int cifs_get_inode_info(struct inode **pinode,
95 const unsigned char *search_path, 99 const unsigned char *search_path,
96 FILE_ALL_INFO *pfile_info, 100 FILE_ALL_INFO *pfile_info,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 552642a507c4..939e2f76b959 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -528,14 +528,15 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
528 server->maxReq = le16_to_cpu(rsp->MaxMpxCount); 528 server->maxReq = le16_to_cpu(rsp->MaxMpxCount);
529 server->maxBuf = min((__u32)le16_to_cpu(rsp->MaxBufSize), 529 server->maxBuf = min((__u32)le16_to_cpu(rsp->MaxBufSize),
530 (__u32)CIFSMaxBufSize + MAX_CIFS_HDR_SIZE); 530 (__u32)CIFSMaxBufSize + MAX_CIFS_HDR_SIZE);
531 server->max_vcs = le16_to_cpu(rsp->MaxNumberVcs);
531 GETU32(server->sessid) = le32_to_cpu(rsp->SessionKey); 532 GETU32(server->sessid) = le32_to_cpu(rsp->SessionKey);
532 /* even though we do not use raw we might as well set this 533 /* even though we do not use raw we might as well set this
533 accurately, in case we ever find a need for it */ 534 accurately, in case we ever find a need for it */
534 if ((le16_to_cpu(rsp->RawMode) & RAW_ENABLE) == RAW_ENABLE) { 535 if ((le16_to_cpu(rsp->RawMode) & RAW_ENABLE) == RAW_ENABLE) {
535 server->maxRw = 0xFF00; 536 server->max_rw = 0xFF00;
536 server->capabilities = CAP_MPX_MODE | CAP_RAW_MODE; 537 server->capabilities = CAP_MPX_MODE | CAP_RAW_MODE;
537 } else { 538 } else {
538 server->maxRw = 0;/* we do not need to use raw anyway */ 539 server->max_rw = 0;/* do not need to use raw anyway */
539 server->capabilities = CAP_MPX_MODE; 540 server->capabilities = CAP_MPX_MODE;
540 } 541 }
541 tmp = (__s16)le16_to_cpu(rsp->ServerTimeZone); 542 tmp = (__s16)le16_to_cpu(rsp->ServerTimeZone);
@@ -638,7 +639,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
638 /* probably no need to store and check maxvcs */ 639 /* probably no need to store and check maxvcs */
639 server->maxBuf = min(le32_to_cpu(pSMBr->MaxBufferSize), 640 server->maxBuf = min(le32_to_cpu(pSMBr->MaxBufferSize),
640 (__u32) CIFSMaxBufSize + MAX_CIFS_HDR_SIZE); 641 (__u32) CIFSMaxBufSize + MAX_CIFS_HDR_SIZE);
641 server->maxRw = le32_to_cpu(pSMBr->MaxRawSize); 642 server->max_rw = le32_to_cpu(pSMBr->MaxRawSize);
642 cFYI(DBG2, ("Max buf = %d", ses->server->maxBuf)); 643 cFYI(DBG2, ("Max buf = %d", ses->server->maxBuf));
643 GETU32(ses->server->sessid) = le32_to_cpu(pSMBr->SessionKey); 644 GETU32(ses->server->sessid) = le32_to_cpu(pSMBr->SessionKey);
644 server->capabilities = le32_to_cpu(pSMBr->Capabilities); 645 server->capabilities = le32_to_cpu(pSMBr->Capabilities);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 2209be943051..da0f4ffa0613 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -23,7 +23,6 @@
23#include <linux/string.h> 23#include <linux/string.h>
24#include <linux/list.h> 24#include <linux/list.h>
25#include <linux/wait.h> 25#include <linux/wait.h>
26#include <linux/ipv6.h>
27#include <linux/pagemap.h> 26#include <linux/pagemap.h>
28#include <linux/ctype.h> 27#include <linux/ctype.h>
29#include <linux/utsname.h> 28#include <linux/utsname.h>
@@ -35,6 +34,7 @@
35#include <linux/freezer.h> 34#include <linux/freezer.h>
36#include <asm/uaccess.h> 35#include <asm/uaccess.h>
37#include <asm/processor.h> 36#include <asm/processor.h>
37#include <net/ipv6.h>
38#include "cifspdu.h" 38#include "cifspdu.h"
39#include "cifsglob.h" 39#include "cifsglob.h"
40#include "cifsproto.h" 40#include "cifsproto.h"
@@ -1379,8 +1379,8 @@ cifs_find_tcp_session(struct sockaddr_storage *addr)
1379 server->addr.sockAddr.sin_addr.s_addr)) 1379 server->addr.sockAddr.sin_addr.s_addr))
1380 continue; 1380 continue;
1381 else if (addr->ss_family == AF_INET6 && 1381 else if (addr->ss_family == AF_INET6 &&
1382 memcmp(&server->addr.sockAddr6.sin6_addr, 1382 !ipv6_addr_equal(&server->addr.sockAddr6.sin6_addr,
1383 &addr6->sin6_addr, sizeof(addr6->sin6_addr))) 1383 &addr6->sin6_addr))
1384 continue; 1384 continue;
1385 1385
1386 ++server->srv_count; 1386 ++server->srv_count;
@@ -2180,6 +2180,33 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
2180 "mount option supported")); 2180 "mount option supported"));
2181} 2181}
2182 2182
2183static int
2184is_path_accessible(int xid, struct cifsTconInfo *tcon,
2185 struct cifs_sb_info *cifs_sb, const char *full_path)
2186{
2187 int rc;
2188 __u64 inode_num;
2189 FILE_ALL_INFO *pfile_info;
2190
2191 rc = CIFSGetSrvInodeNumber(xid, tcon, full_path, &inode_num,
2192 cifs_sb->local_nls,
2193 cifs_sb->mnt_cifs_flags &
2194 CIFS_MOUNT_MAP_SPECIAL_CHR);
2195 if (rc != -EOPNOTSUPP)
2196 return rc;
2197
2198 pfile_info = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
2199 if (pfile_info == NULL)
2200 return -ENOMEM;
2201
2202 rc = CIFSSMBQPathInfo(xid, tcon, full_path, pfile_info,
2203 0 /* not legacy */, cifs_sb->local_nls,
2204 cifs_sb->mnt_cifs_flags &
2205 CIFS_MOUNT_MAP_SPECIAL_CHR);
2206 kfree(pfile_info);
2207 return rc;
2208}
2209
2183int 2210int
2184cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, 2211cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
2185 char *mount_data, const char *devname) 2212 char *mount_data, const char *devname)
@@ -2190,6 +2217,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
2190 struct cifsSesInfo *pSesInfo = NULL; 2217 struct cifsSesInfo *pSesInfo = NULL;
2191 struct cifsTconInfo *tcon = NULL; 2218 struct cifsTconInfo *tcon = NULL;
2192 struct TCP_Server_Info *srvTcp = NULL; 2219 struct TCP_Server_Info *srvTcp = NULL;
2220 char *full_path;
2193 2221
2194 xid = GetXid(); 2222 xid = GetXid();
2195 2223
@@ -2426,6 +2454,23 @@ mount_fail_check:
2426 cifs_sb->rsize = min(cifs_sb->rsize, 2454 cifs_sb->rsize = min(cifs_sb->rsize,
2427 (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE)); 2455 (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE));
2428 2456
2457 if (!rc && cifs_sb->prepathlen) {
2458 /* build_path_to_root works only when we have a valid tcon */
2459 full_path = cifs_build_path_to_root(cifs_sb);
2460 if (full_path == NULL) {
2461 rc = -ENOMEM;
2462 goto mount_fail_check;
2463 }
2464 rc = is_path_accessible(xid, tcon, cifs_sb, full_path);
2465 if (rc) {
2466 cERROR(1, ("Path %s in not accessible: %d",
2467 full_path, rc));
2468 kfree(full_path);
2469 goto mount_fail_check;
2470 }
2471 kfree(full_path);
2472 }
2473
2429 /* volume_info->password is freed above when existing session found 2474 /* volume_info->password is freed above when existing session found
2430 (in which case it is not needed anymore) but when new sesion is created 2475 (in which case it is not needed anymore) but when new sesion is created
2431 the password ptr is put in the new session structure (in which case the 2476 the password ptr is put in the new session structure (in which case the
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 964aad03c5ad..89fb72832652 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * vfs operations that deal with dentries 4 * vfs operations that deal with dentries
5 * 5 *
6 * Copyright (C) International Business Machines Corp., 2002,2008 6 * Copyright (C) International Business Machines Corp., 2002,2009
7 * Author(s): Steve French (sfrench@us.ibm.com) 7 * Author(s): Steve French (sfrench@us.ibm.com)
8 * 8 *
9 * This library is free software; you can redistribute it and/or modify 9 * This library is free software; you can redistribute it and/or modify
@@ -129,6 +129,78 @@ cifs_bp_rename_retry:
129 return full_path; 129 return full_path;
130} 130}
131 131
132static int cifs_posix_open(char *full_path, struct inode **pinode,
133 struct super_block *sb, int mode, int oflags,
134 int *poplock, __u16 *pnetfid, int xid)
135{
136 int rc;
137 __u32 oplock;
138 FILE_UNIX_BASIC_INFO *presp_data;
139 __u32 posix_flags = 0;
140 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
141
142 cFYI(1, ("posix open %s", full_path));
143
144 presp_data = kzalloc(sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL);
145 if (presp_data == NULL)
146 return -ENOMEM;
147
148/* So far cifs posix extensions can only map the following flags.
149 There are other valid fmode oflags such as FMODE_LSEEK, FMODE_PREAD, but
150 so far we do not seem to need them, and we can treat them as local only */
151 if ((oflags & (FMODE_READ | FMODE_WRITE)) ==
152 (FMODE_READ | FMODE_WRITE))
153 posix_flags = SMB_O_RDWR;
154 else if (oflags & FMODE_READ)
155 posix_flags = SMB_O_RDONLY;
156 else if (oflags & FMODE_WRITE)
157 posix_flags = SMB_O_WRONLY;
158 if (oflags & O_CREAT)
159 posix_flags |= SMB_O_CREAT;
160 if (oflags & O_EXCL)
161 posix_flags |= SMB_O_EXCL;
162 if (oflags & O_TRUNC)
163 posix_flags |= SMB_O_TRUNC;
164 if (oflags & O_APPEND)
165 posix_flags |= SMB_O_APPEND;
166 if (oflags & O_SYNC)
167 posix_flags |= SMB_O_SYNC;
168 if (oflags & O_DIRECTORY)
169 posix_flags |= SMB_O_DIRECTORY;
170 if (oflags & O_NOFOLLOW)
171 posix_flags |= SMB_O_NOFOLLOW;
172 if (oflags & O_DIRECT)
173 posix_flags |= SMB_O_DIRECT;
174
175
176 rc = CIFSPOSIXCreate(xid, cifs_sb->tcon, posix_flags, mode,
177 pnetfid, presp_data, &oplock, full_path,
178 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
179 CIFS_MOUNT_MAP_SPECIAL_CHR);
180 if (rc)
181 goto posix_open_ret;
182
183 if (presp_data->Type == cpu_to_le32(-1))
184 goto posix_open_ret; /* open ok, caller does qpathinfo */
185
186 /* get new inode and set it up */
187 if (!pinode)
188 goto posix_open_ret; /* caller does not need info */
189
190 *pinode = cifs_new_inode(sb, &presp_data->UniqueId);
191
192 /* We do not need to close the file if new_inode fails since
193 the caller will retry qpathinfo as long as inode is null */
194 if (*pinode == NULL)
195 goto posix_open_ret;
196
197 posix_fill_in_inode(*pinode, presp_data, 1);
198
199posix_open_ret:
200 kfree(presp_data);
201 return rc;
202}
203
132static void setup_cifs_dentry(struct cifsTconInfo *tcon, 204static void setup_cifs_dentry(struct cifsTconInfo *tcon,
133 struct dentry *direntry, 205 struct dentry *direntry,
134 struct inode *newinode) 206 struct inode *newinode)
@@ -150,7 +222,14 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
150 int xid; 222 int xid;
151 int create_options = CREATE_NOT_DIR; 223 int create_options = CREATE_NOT_DIR;
152 int oplock = 0; 224 int oplock = 0;
153 /* BB below access is too much for the mknod to request */ 225 int oflags;
226 /*
227 * BB below access is probably too much for mknod to request
228 * but we have to do query and setpathinfo so requesting
229 * less could fail (unless we want to request getatr and setatr
230 * permissions (only). At least for POSIX we do not have to
231 * request so much.
232 */
154 int desiredAccess = GENERIC_READ | GENERIC_WRITE; 233 int desiredAccess = GENERIC_READ | GENERIC_WRITE;
155 __u16 fileHandle; 234 __u16 fileHandle;
156 struct cifs_sb_info *cifs_sb; 235 struct cifs_sb_info *cifs_sb;
@@ -174,13 +253,43 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
174 } 253 }
175 254
176 mode &= ~current->fs->umask; 255 mode &= ~current->fs->umask;
256 if (oplockEnabled)
257 oplock = REQ_OPLOCK;
177 258
178 if (nd && (nd->flags & LOOKUP_OPEN)) { 259 if (nd && (nd->flags & LOOKUP_OPEN))
179 int oflags = nd->intent.open.flags; 260 oflags = nd->intent.open.flags;
261 else
262 oflags = FMODE_READ;
263
264 if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) &&
265 (CIFS_UNIX_POSIX_PATH_OPS_CAP &
266 le64_to_cpu(tcon->fsUnixInfo.Capability))) {
267 rc = cifs_posix_open(full_path, &newinode, inode->i_sb,
268 mode, oflags, &oplock, &fileHandle, xid);
269 /* EIO could indicate that (posix open) operation is not
270 supported, despite what server claimed in capability
271 negotation. EREMOTE indicates DFS junction, which is not
272 handled in posix open */
273
274 if ((rc == 0) && (newinode == NULL))
275 goto cifs_create_get_file_info; /* query inode info */
276 else if (rc == 0) /* success, no need to query */
277 goto cifs_create_set_dentry;
278 else if ((rc != -EIO) && (rc != -EREMOTE) &&
279 (rc != -EOPNOTSUPP)) /* path not found or net err */
280 goto cifs_create_out;
281 /* else fallthrough to retry, using older open call, this is
282 case where server does not support this SMB level, and
283 falsely claims capability (also get here for DFS case
284 which should be rare for path not covered on files) */
285 }
180 286
287 if (nd && (nd->flags & LOOKUP_OPEN)) {
288 /* if the file is going to stay open, then we
289 need to set the desired access properly */
181 desiredAccess = 0; 290 desiredAccess = 0;
182 if (oflags & FMODE_READ) 291 if (oflags & FMODE_READ)
183 desiredAccess |= GENERIC_READ; 292 desiredAccess |= GENERIC_READ; /* is this too little? */
184 if (oflags & FMODE_WRITE) { 293 if (oflags & FMODE_WRITE) {
185 desiredAccess |= GENERIC_WRITE; 294 desiredAccess |= GENERIC_WRITE;
186 if (!(oflags & FMODE_READ)) 295 if (!(oflags & FMODE_READ))
@@ -199,8 +308,6 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
199 308
200 /* BB add processing to set equivalent of mode - e.g. via CreateX with 309 /* BB add processing to set equivalent of mode - e.g. via CreateX with
201 ACLs */ 310 ACLs */
202 if (oplockEnabled)
203 oplock = REQ_OPLOCK;
204 311
205 buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); 312 buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
206 if (buf == NULL) { 313 if (buf == NULL) {
@@ -233,116 +340,112 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
233 } 340 }
234 if (rc) { 341 if (rc) {
235 cFYI(1, ("cifs_create returned 0x%x", rc)); 342 cFYI(1, ("cifs_create returned 0x%x", rc));
236 } else { 343 goto cifs_create_out;
237 /* If Open reported that we actually created a file 344 }
238 then we now have to set the mode if possible */ 345
239 if ((tcon->unix_ext) && (oplock & CIFS_CREATE_ACTION)) { 346 /* If Open reported that we actually created a file
240 struct cifs_unix_set_info_args args = { 347 then we now have to set the mode if possible */
348 if ((tcon->unix_ext) && (oplock & CIFS_CREATE_ACTION)) {
349 struct cifs_unix_set_info_args args = {
241 .mode = mode, 350 .mode = mode,
242 .ctime = NO_CHANGE_64, 351 .ctime = NO_CHANGE_64,
243 .atime = NO_CHANGE_64, 352 .atime = NO_CHANGE_64,
244 .mtime = NO_CHANGE_64, 353 .mtime = NO_CHANGE_64,
245 .device = 0, 354 .device = 0,
246 }; 355 };
247 356
248 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { 357 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) {
249 args.uid = (__u64) current_fsuid(); 358 args.uid = (__u64) current_fsuid();
250 if (inode->i_mode & S_ISGID) 359 if (inode->i_mode & S_ISGID)
251 args.gid = (__u64) inode->i_gid; 360 args.gid = (__u64) inode->i_gid;
252 else 361 else
253 args.gid = (__u64) current_fsgid(); 362 args.gid = (__u64) current_fsgid();
254 } else {
255 args.uid = NO_CHANGE_64;
256 args.gid = NO_CHANGE_64;
257 }
258 CIFSSMBUnixSetInfo(xid, tcon, full_path, &args,
259 cifs_sb->local_nls,
260 cifs_sb->mnt_cifs_flags &
261 CIFS_MOUNT_MAP_SPECIAL_CHR);
262 } else { 363 } else {
263 /* BB implement mode setting via Windows security 364 args.uid = NO_CHANGE_64;
264 descriptors e.g. */ 365 args.gid = NO_CHANGE_64;
265 /* CIFSSMBWinSetPerms(xid,tcon,path,mode,-1,-1,nls);*/
266
267 /* Could set r/o dos attribute if mode & 0222 == 0 */
268 } 366 }
367 CIFSSMBUnixSetInfo(xid, tcon, full_path, &args,
368 cifs_sb->local_nls,
369 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
370 } else {
371 /* BB implement mode setting via Windows security
372 descriptors e.g. */
373 /* CIFSSMBWinSetPerms(xid,tcon,path,mode,-1,-1,nls);*/
269 374
270 /* server might mask mode so we have to query for it */ 375 /* Could set r/o dos attribute if mode & 0222 == 0 */
271 if (tcon->unix_ext) 376 }
272 rc = cifs_get_inode_info_unix(&newinode, full_path, 377
273 inode->i_sb, xid); 378cifs_create_get_file_info:
274 else { 379 /* server might mask mode so we have to query for it */
275 rc = cifs_get_inode_info(&newinode, full_path, 380 if (tcon->unix_ext)
276 buf, inode->i_sb, xid, 381 rc = cifs_get_inode_info_unix(&newinode, full_path,
277 &fileHandle); 382 inode->i_sb, xid);
278 if (newinode) { 383 else {
279 if (cifs_sb->mnt_cifs_flags & 384 rc = cifs_get_inode_info(&newinode, full_path, buf,
280 CIFS_MOUNT_DYNPERM) 385 inode->i_sb, xid, &fileHandle);
281 newinode->i_mode = mode; 386 if (newinode) {
282 if ((oplock & CIFS_CREATE_ACTION) && 387 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)
283 (cifs_sb->mnt_cifs_flags & 388 newinode->i_mode = mode;
284 CIFS_MOUNT_SET_UID)) { 389 if ((oplock & CIFS_CREATE_ACTION) &&
285 newinode->i_uid = current_fsuid(); 390 (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)) {
286 if (inode->i_mode & S_ISGID) 391 newinode->i_uid = current_fsuid();
287 newinode->i_gid = 392 if (inode->i_mode & S_ISGID)
288 inode->i_gid; 393 newinode->i_gid = inode->i_gid;
289 else 394 else
290 newinode->i_gid = 395 newinode->i_gid = current_fsgid();
291 current_fsgid();
292 }
293 } 396 }
294 } 397 }
398 }
295 399
296 if (rc != 0) { 400cifs_create_set_dentry:
297 cFYI(1, ("Create worked, get_inode_info failed rc = %d", 401 if (rc == 0)
298 rc)); 402 setup_cifs_dentry(tcon, direntry, newinode);
299 } else 403 else
300 setup_cifs_dentry(tcon, direntry, newinode); 404 cFYI(1, ("Create worked, get_inode_info failed rc = %d", rc));
301 405
302 if ((nd == NULL /* nfsd case - nfs srv does not set nd */) || 406 /* nfsd case - nfs srv does not set nd */
303 (!(nd->flags & LOOKUP_OPEN))) { 407 if ((nd == NULL) || (!(nd->flags & LOOKUP_OPEN))) {
304 /* mknod case - do not leave file open */ 408 /* mknod case - do not leave file open */
305 CIFSSMBClose(xid, tcon, fileHandle); 409 CIFSSMBClose(xid, tcon, fileHandle);
306 } else if (newinode) { 410 } else if (newinode) {
307 struct cifsFileInfo *pCifsFile = 411 struct cifsFileInfo *pCifsFile =
308 kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL); 412 kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
309 413
310 if (pCifsFile == NULL) 414 if (pCifsFile == NULL)
311 goto cifs_create_out; 415 goto cifs_create_out;
312 pCifsFile->netfid = fileHandle; 416 pCifsFile->netfid = fileHandle;
313 pCifsFile->pid = current->tgid; 417 pCifsFile->pid = current->tgid;
314 pCifsFile->pInode = newinode; 418 pCifsFile->pInode = newinode;
315 pCifsFile->invalidHandle = false; 419 pCifsFile->invalidHandle = false;
316 pCifsFile->closePend = false; 420 pCifsFile->closePend = false;
317 init_MUTEX(&pCifsFile->fh_sem); 421 init_MUTEX(&pCifsFile->fh_sem);
318 mutex_init(&pCifsFile->lock_mutex); 422 mutex_init(&pCifsFile->lock_mutex);
319 INIT_LIST_HEAD(&pCifsFile->llist); 423 INIT_LIST_HEAD(&pCifsFile->llist);
320 atomic_set(&pCifsFile->wrtPending, 0); 424 atomic_set(&pCifsFile->wrtPending, 0);
321 425
322 /* set the following in open now 426 /* set the following in open now
323 pCifsFile->pfile = file; */ 427 pCifsFile->pfile = file; */
324 write_lock(&GlobalSMBSeslock); 428 write_lock(&GlobalSMBSeslock);
325 list_add(&pCifsFile->tlist, &tcon->openFileList); 429 list_add(&pCifsFile->tlist, &tcon->openFileList);
326 pCifsInode = CIFS_I(newinode); 430 pCifsInode = CIFS_I(newinode);
327 if (pCifsInode) { 431 if (pCifsInode) {
328 /* if readable file instance put first in list*/ 432 /* if readable file instance put first in list*/
329 if (write_only) { 433 if (write_only) {
330 list_add_tail(&pCifsFile->flist, 434 list_add_tail(&pCifsFile->flist,
331 &pCifsInode->openFileList); 435 &pCifsInode->openFileList);
332 } else { 436 } else {
333 list_add(&pCifsFile->flist, 437 list_add(&pCifsFile->flist,
334 &pCifsInode->openFileList); 438 &pCifsInode->openFileList);
335 }
336 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
337 pCifsInode->clientCanCacheAll = true;
338 pCifsInode->clientCanCacheRead = true;
339 cFYI(1, ("Exclusive Oplock inode %p",
340 newinode));
341 } else if ((oplock & 0xF) == OPLOCK_READ)
342 pCifsInode->clientCanCacheRead = true;
343 } 439 }
344 write_unlock(&GlobalSMBSeslock); 440 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
441 pCifsInode->clientCanCacheAll = true;
442 pCifsInode->clientCanCacheRead = true;
443 cFYI(1, ("Exclusive Oplock inode %p",
444 newinode));
445 } else if ((oplock & 0xF) == OPLOCK_READ)
446 pCifsInode->clientCanCacheRead = true;
345 } 447 }
448 write_unlock(&GlobalSMBSeslock);
346 } 449 }
347cifs_create_out: 450cifs_create_out:
348 kfree(buf); 451 kfree(buf);
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index bcf7b5184664..4690a360c855 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -199,6 +199,49 @@ static void fill_fake_finddataunix(FILE_UNIX_BASIC_INFO *pfnd_dat,
199 pfnd_dat->Gid = cpu_to_le64(pinode->i_gid); 199 pfnd_dat->Gid = cpu_to_le64(pinode->i_gid);
200} 200}
201 201
202/**
203 * cifs_new inode - create new inode, initialize, and hash it
204 * @sb - pointer to superblock
205 * @inum - if valid pointer and serverino is enabled, replace i_ino with val
206 *
207 * Create a new inode, initialize it for CIFS and hash it. Returns the new
208 * inode or NULL if one couldn't be allocated.
209 *
210 * If the share isn't mounted with "serverino" or inum is a NULL pointer then
211 * we'll just use the inode number assigned by new_inode(). Note that this can
212 * mean i_ino collisions since the i_ino assigned by new_inode is not
213 * guaranteed to be unique.
214 */
215struct inode *
216cifs_new_inode(struct super_block *sb, __u64 *inum)
217{
218 struct inode *inode;
219
220 inode = new_inode(sb);
221 if (inode == NULL)
222 return NULL;
223
224 /*
225 * BB: Is i_ino == 0 legal? Here, we assume that it is. If it isn't we
226 * stop passing inum as ptr. Are there sanity checks we can use to
227 * ensure that the server is really filling in that field? Also,
228 * if serverino is disabled, perhaps we should be using iunique()?
229 */
230 if (inum && (CIFS_SB(sb)->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM))
231 inode->i_ino = (unsigned long) *inum;
232
233 /*
234 * must set this here instead of cifs_alloc_inode since VFS will
235 * clobber i_flags
236 */
237 if (sb->s_flags & MS_NOATIME)
238 inode->i_flags |= S_NOATIME | S_NOCMTIME;
239
240 insert_inode_hash(inode);
241
242 return inode;
243}
244
202int cifs_get_inode_info_unix(struct inode **pinode, 245int cifs_get_inode_info_unix(struct inode **pinode,
203 const unsigned char *full_path, struct super_block *sb, int xid) 246 const unsigned char *full_path, struct super_block *sb, int xid)
204{ 247{
@@ -233,22 +276,11 @@ int cifs_get_inode_info_unix(struct inode **pinode,
233 276
234 /* get new inode */ 277 /* get new inode */
235 if (*pinode == NULL) { 278 if (*pinode == NULL) {
236 *pinode = new_inode(sb); 279 *pinode = cifs_new_inode(sb, &find_data.UniqueId);
237 if (*pinode == NULL) { 280 if (*pinode == NULL) {
238 rc = -ENOMEM; 281 rc = -ENOMEM;
239 goto cgiiu_exit; 282 goto cgiiu_exit;
240 } 283 }
241 /* Is an i_ino of zero legal? */
242 /* note ino incremented to unique num in new_inode */
243 /* Are there sanity checks we can use to ensure that
244 the server is really filling in that field? */
245 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)
246 (*pinode)->i_ino = (unsigned long)find_data.UniqueId;
247
248 if (sb->s_flags & MS_NOATIME)
249 (*pinode)->i_flags |= S_NOATIME | S_NOCMTIME;
250
251 insert_inode_hash(*pinode);
252 } 284 }
253 285
254 inode = *pinode; 286 inode = *pinode;
@@ -465,11 +497,9 @@ int cifs_get_inode_info(struct inode **pinode,
465 497
466 /* get new inode */ 498 /* get new inode */
467 if (*pinode == NULL) { 499 if (*pinode == NULL) {
468 *pinode = new_inode(sb); 500 __u64 inode_num;
469 if (*pinode == NULL) { 501 __u64 *pinum = &inode_num;
470 rc = -ENOMEM; 502
471 goto cgii_exit;
472 }
473 /* Is an i_ino of zero legal? Can we use that to check 503 /* Is an i_ino of zero legal? Can we use that to check
474 if the server supports returning inode numbers? Are 504 if the server supports returning inode numbers? Are
475 there other sanity checks we can use to ensure that 505 there other sanity checks we can use to ensure that
@@ -486,22 +516,26 @@ int cifs_get_inode_info(struct inode **pinode,
486 516
487 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { 517 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
488 int rc1 = 0; 518 int rc1 = 0;
489 __u64 inode_num;
490 519
491 rc1 = CIFSGetSrvInodeNumber(xid, pTcon, 520 rc1 = CIFSGetSrvInodeNumber(xid, pTcon,
492 full_path, &inode_num, 521 full_path, pinum,
493 cifs_sb->local_nls, 522 cifs_sb->local_nls,
494 cifs_sb->mnt_cifs_flags & 523 cifs_sb->mnt_cifs_flags &
495 CIFS_MOUNT_MAP_SPECIAL_CHR); 524 CIFS_MOUNT_MAP_SPECIAL_CHR);
496 if (rc1) { 525 if (rc1) {
497 cFYI(1, ("GetSrvInodeNum rc %d", rc1)); 526 cFYI(1, ("GetSrvInodeNum rc %d", rc1));
527 pinum = NULL;
498 /* BB EOPNOSUPP disable SERVER_INUM? */ 528 /* BB EOPNOSUPP disable SERVER_INUM? */
499 } else /* do we need cast or hash to ino? */ 529 }
500 (*pinode)->i_ino = inode_num; 530 } else {
501 } /* else ino incremented to unique num in new_inode*/ 531 pinum = NULL;
502 if (sb->s_flags & MS_NOATIME) 532 }
503 (*pinode)->i_flags |= S_NOATIME | S_NOCMTIME; 533
504 insert_inode_hash(*pinode); 534 *pinode = cifs_new_inode(sb, pinum);
535 if (*pinode == NULL) {
536 rc = -ENOMEM;
537 goto cgii_exit;
538 }
505 } 539 }
506 inode = *pinode; 540 inode = *pinode;
507 cifsInfo = CIFS_I(inode); 541 cifsInfo = CIFS_I(inode);
@@ -621,7 +655,7 @@ static const struct inode_operations cifs_ipc_inode_ops = {
621 .lookup = cifs_lookup, 655 .lookup = cifs_lookup,
622}; 656};
623 657
624static char *build_path_to_root(struct cifs_sb_info *cifs_sb) 658char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb)
625{ 659{
626 int pplen = cifs_sb->prepathlen; 660 int pplen = cifs_sb->prepathlen;
627 int dfsplen; 661 int dfsplen;
@@ -678,7 +712,7 @@ struct inode *cifs_iget(struct super_block *sb, unsigned long ino)
678 return inode; 712 return inode;
679 713
680 cifs_sb = CIFS_SB(inode->i_sb); 714 cifs_sb = CIFS_SB(inode->i_sb);
681 full_path = build_path_to_root(cifs_sb); 715 full_path = cifs_build_path_to_root(cifs_sb);
682 if (full_path == NULL) 716 if (full_path == NULL)
683 return ERR_PTR(-ENOMEM); 717 return ERR_PTR(-ENOMEM);
684 718
@@ -1017,7 +1051,7 @@ out_reval:
1017 return rc; 1051 return rc;
1018} 1052}
1019 1053
1020static void posix_fill_in_inode(struct inode *tmp_inode, 1054void posix_fill_in_inode(struct inode *tmp_inode,
1021 FILE_UNIX_BASIC_INFO *pData, int isNewInode) 1055 FILE_UNIX_BASIC_INFO *pData, int isNewInode)
1022{ 1056{
1023 struct cifsInodeInfo *cifsInfo = CIFS_I(tmp_inode); 1057 struct cifsInodeInfo *cifsInfo = CIFS_I(tmp_inode);
@@ -1114,24 +1148,14 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
1114 else 1148 else
1115 direntry->d_op = &cifs_dentry_ops; 1149 direntry->d_op = &cifs_dentry_ops;
1116 1150
1117 newinode = new_inode(inode->i_sb); 1151 newinode = cifs_new_inode(inode->i_sb,
1152 &pInfo->UniqueId);
1118 if (newinode == NULL) { 1153 if (newinode == NULL) {
1119 kfree(pInfo); 1154 kfree(pInfo);
1120 goto mkdir_get_info; 1155 goto mkdir_get_info;
1121 } 1156 }
1122 1157
1123 /* Is an i_ino of zero legal? */
1124 /* Are there sanity checks we can use to ensure that
1125 the server is really filling in that field? */
1126 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
1127 newinode->i_ino =
1128 (unsigned long)pInfo->UniqueId;
1129 } /* note ino incremented to unique num in new_inode */
1130 if (inode->i_sb->s_flags & MS_NOATIME)
1131 newinode->i_flags |= S_NOATIME | S_NOCMTIME;
1132 newinode->i_nlink = 2; 1158 newinode->i_nlink = 2;
1133
1134 insert_inode_hash(newinode);
1135 d_instantiate(direntry, newinode); 1159 d_instantiate(direntry, newinode);
1136 1160
1137 /* we already checked in POSIXCreate whether 1161 /* we already checked in POSIXCreate whether
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 9f51f9bf0292..c2c01ff4c32c 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -56,35 +56,34 @@ static inline void dump_cifs_file_struct(struct file *file, char *label)
56} 56}
57#endif /* DEBUG2 */ 57#endif /* DEBUG2 */
58 58
59/* Returns one if new inode created (which therefore needs to be hashed) */ 59/* Returns 1 if new inode created, 2 if both dentry and inode were */
60/* Might check in the future if inode number changed so we can rehash inode */ 60/* Might check in the future if inode number changed so we can rehash inode */
61static int construct_dentry(struct qstr *qstring, struct file *file, 61static int
62 struct inode **ptmp_inode, struct dentry **pnew_dentry) 62construct_dentry(struct qstr *qstring, struct file *file,
63 struct inode **ptmp_inode, struct dentry **pnew_dentry,
64 __u64 *inum)
63{ 65{
64 struct dentry *tmp_dentry; 66 struct dentry *tmp_dentry = NULL;
65 struct cifs_sb_info *cifs_sb; 67 struct super_block *sb = file->f_path.dentry->d_sb;
66 struct cifsTconInfo *pTcon;
67 int rc = 0; 68 int rc = 0;
68 69
69 cFYI(1, ("For %s", qstring->name)); 70 cFYI(1, ("For %s", qstring->name));
70 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
71 pTcon = cifs_sb->tcon;
72 71
73 qstring->hash = full_name_hash(qstring->name, qstring->len); 72 qstring->hash = full_name_hash(qstring->name, qstring->len);
74 tmp_dentry = d_lookup(file->f_path.dentry, qstring); 73 tmp_dentry = d_lookup(file->f_path.dentry, qstring);
75 if (tmp_dentry) { 74 if (tmp_dentry) {
75 /* BB: overwrite old name? i.e. tmp_dentry->d_name and
76 * tmp_dentry->d_name.len??
77 */
76 cFYI(0, ("existing dentry with inode 0x%p", 78 cFYI(0, ("existing dentry with inode 0x%p",
77 tmp_dentry->d_inode)); 79 tmp_dentry->d_inode));
78 *ptmp_inode = tmp_dentry->d_inode; 80 *ptmp_inode = tmp_dentry->d_inode;
79/* BB overwrite old name? i.e. tmp_dentry->d_name and tmp_dentry->d_name.len??*/
80 if (*ptmp_inode == NULL) { 81 if (*ptmp_inode == NULL) {
81 *ptmp_inode = new_inode(file->f_path.dentry->d_sb); 82 *ptmp_inode = cifs_new_inode(sb, inum);
82 if (*ptmp_inode == NULL) 83 if (*ptmp_inode == NULL)
83 return rc; 84 return rc;
84 rc = 1; 85 rc = 1;
85 } 86 }
86 if (file->f_path.dentry->d_sb->s_flags & MS_NOATIME)
87 (*ptmp_inode)->i_flags |= S_NOATIME | S_NOCMTIME;
88 } else { 87 } else {
89 tmp_dentry = d_alloc(file->f_path.dentry, qstring); 88 tmp_dentry = d_alloc(file->f_path.dentry, qstring);
90 if (tmp_dentry == NULL) { 89 if (tmp_dentry == NULL) {
@@ -93,15 +92,14 @@ static int construct_dentry(struct qstr *qstring, struct file *file,
93 return rc; 92 return rc;
94 } 93 }
95 94
96 *ptmp_inode = new_inode(file->f_path.dentry->d_sb); 95 if (CIFS_SB(sb)->tcon->nocase)
97 if (pTcon->nocase)
98 tmp_dentry->d_op = &cifs_ci_dentry_ops; 96 tmp_dentry->d_op = &cifs_ci_dentry_ops;
99 else 97 else
100 tmp_dentry->d_op = &cifs_dentry_ops; 98 tmp_dentry->d_op = &cifs_dentry_ops;
99
100 *ptmp_inode = cifs_new_inode(sb, inum);
101 if (*ptmp_inode == NULL) 101 if (*ptmp_inode == NULL)
102 return rc; 102 return rc;
103 if (file->f_path.dentry->d_sb->s_flags & MS_NOATIME)
104 (*ptmp_inode)->i_flags |= S_NOATIME | S_NOCMTIME;
105 rc = 2; 103 rc = 2;
106 } 104 }
107 105
@@ -822,7 +820,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
822/* inode num, inode type and filename returned */ 820/* inode num, inode type and filename returned */
823static int cifs_get_name_from_search_buf(struct qstr *pqst, 821static int cifs_get_name_from_search_buf(struct qstr *pqst,
824 char *current_entry, __u16 level, unsigned int unicode, 822 char *current_entry, __u16 level, unsigned int unicode,
825 struct cifs_sb_info *cifs_sb, int max_len, ino_t *pinum) 823 struct cifs_sb_info *cifs_sb, int max_len, __u64 *pinum)
826{ 824{
827 int rc = 0; 825 int rc = 0;
828 unsigned int len = 0; 826 unsigned int len = 0;
@@ -842,9 +840,7 @@ static int cifs_get_name_from_search_buf(struct qstr *pqst,
842 len = strnlen(filename, PATH_MAX); 840 len = strnlen(filename, PATH_MAX);
843 } 841 }
844 842
845 /* BB fixme - hash low and high 32 bits if not 64 bit arch BB */ 843 *pinum = pFindData->UniqueId;
846 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)
847 *pinum = pFindData->UniqueId;
848 } else if (level == SMB_FIND_FILE_DIRECTORY_INFO) { 844 } else if (level == SMB_FIND_FILE_DIRECTORY_INFO) {
849 FILE_DIRECTORY_INFO *pFindData = 845 FILE_DIRECTORY_INFO *pFindData =
850 (FILE_DIRECTORY_INFO *)current_entry; 846 (FILE_DIRECTORY_INFO *)current_entry;
@@ -907,7 +903,7 @@ static int cifs_filldir(char *pfindEntry, struct file *file,
907 struct qstr qstring; 903 struct qstr qstring;
908 struct cifsFileInfo *pCifsF; 904 struct cifsFileInfo *pCifsF;
909 unsigned int obj_type; 905 unsigned int obj_type;
910 ino_t inum; 906 __u64 inum;
911 struct cifs_sb_info *cifs_sb; 907 struct cifs_sb_info *cifs_sb;
912 struct inode *tmp_inode; 908 struct inode *tmp_inode;
913 struct dentry *tmp_dentry; 909 struct dentry *tmp_dentry;
@@ -940,20 +936,18 @@ static int cifs_filldir(char *pfindEntry, struct file *file,
940 if (rc) 936 if (rc)
941 return rc; 937 return rc;
942 938
943 rc = construct_dentry(&qstring, file, &tmp_inode, &tmp_dentry); 939 /* only these two infolevels return valid inode numbers */
940 if (pCifsF->srch_inf.info_level == SMB_FIND_FILE_UNIX ||
941 pCifsF->srch_inf.info_level == SMB_FIND_FILE_ID_FULL_DIR_INFO)
942 rc = construct_dentry(&qstring, file, &tmp_inode, &tmp_dentry,
943 &inum);
944 else
945 rc = construct_dentry(&qstring, file, &tmp_inode, &tmp_dentry,
946 NULL);
947
944 if ((tmp_inode == NULL) || (tmp_dentry == NULL)) 948 if ((tmp_inode == NULL) || (tmp_dentry == NULL))
945 return -ENOMEM; 949 return -ENOMEM;
946 950
947 if (rc) {
948 /* inode created, we need to hash it with right inode number */
949 if (inum != 0) {
950 /* BB fixme - hash the 2 32 quantities bits together if
951 * necessary BB */
952 tmp_inode->i_ino = inum;
953 }
954 insert_inode_hash(tmp_inode);
955 }
956
957 /* we pass in rc below, indicating whether it is a new inode, 951 /* we pass in rc below, indicating whether it is a new inode,
958 so we can figure out whether to invalidate the inode cached 952 so we can figure out whether to invalidate the inode cached
959 data if the file has changed */ 953 data if the file has changed */
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 5f22de7b79a9..5c68b4282be9 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -34,15 +34,99 @@
34extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8, 34extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8,
35 unsigned char *p24); 35 unsigned char *p24);
36 36
37/* Checks if this is the first smb session to be reconnected after
38 the socket has been reestablished (so we know whether to use vc 0).
39 Called while holding the cifs_tcp_ses_lock, so do not block */
40static bool is_first_ses_reconnect(struct cifsSesInfo *ses)
41{
42 struct list_head *tmp;
43 struct cifsSesInfo *tmp_ses;
44
45 list_for_each(tmp, &ses->server->smb_ses_list) {
46 tmp_ses = list_entry(tmp, struct cifsSesInfo,
47 smb_ses_list);
48 if (tmp_ses->need_reconnect == false)
49 return false;
50 }
51 /* could not find a session that was already connected,
52 this must be the first one we are reconnecting */
53 return true;
54}
55
56/*
57 * vc number 0 is treated specially by some servers, and should be the
58 * first one we request. After that we can use vcnumbers up to maxvcs,
59 * one for each smb session (some Windows versions set maxvcs incorrectly
60 * so maxvc=1 can be ignored). If we have too many vcs, we can reuse
61 * any vc but zero (some servers reset the connection on vcnum zero)
62 *
63 */
64static __le16 get_next_vcnum(struct cifsSesInfo *ses)
65{
66 __u16 vcnum = 0;
67 struct list_head *tmp;
68 struct cifsSesInfo *tmp_ses;
69 __u16 max_vcs = ses->server->max_vcs;
70 __u16 i;
71 int free_vc_found = 0;
72
73 /* Quoting the MS-SMB specification: "Windows-based SMB servers set this
74 field to one but do not enforce this limit, which allows an SMB client
75 to establish more virtual circuits than allowed by this value ... but
76 other server implementations can enforce this limit." */
77 if (max_vcs < 2)
78 max_vcs = 0xFFFF;
79
80 write_lock(&cifs_tcp_ses_lock);
81 if ((ses->need_reconnect) && is_first_ses_reconnect(ses))
82 goto get_vc_num_exit; /* vcnum will be zero */
83 for (i = ses->server->srv_count - 1; i < max_vcs; i++) {
84 if (i == 0) /* this is the only connection, use vc 0 */
85 break;
86
87 free_vc_found = 1;
88
89 list_for_each(tmp, &ses->server->smb_ses_list) {
90 tmp_ses = list_entry(tmp, struct cifsSesInfo,
91 smb_ses_list);
92 if (tmp_ses->vcnum == i) {
93 free_vc_found = 0;
94 break; /* found duplicate, try next vcnum */
95 }
96 }
97 if (free_vc_found)
98 break; /* we found a vcnumber that will work - use it */
99 }
100
101 if (i == 0)
102 vcnum = 0; /* for most common case, ie if one smb session, use
103 vc zero. Also for case when no free vcnum, zero
104 is safest to send (some clients only send zero) */
105 else if (free_vc_found == 0)
106 vcnum = 1; /* we can not reuse vc=0 safely, since some servers
107 reset all uids on that, but 1 is ok. */
108 else
109 vcnum = i;
110 ses->vcnum = vcnum;
111get_vc_num_exit:
112 write_unlock(&cifs_tcp_ses_lock);
113
114 return le16_to_cpu(vcnum);
115}
116
37static __u32 cifs_ssetup_hdr(struct cifsSesInfo *ses, SESSION_SETUP_ANDX *pSMB) 117static __u32 cifs_ssetup_hdr(struct cifsSesInfo *ses, SESSION_SETUP_ANDX *pSMB)
38{ 118{
39 __u32 capabilities = 0; 119 __u32 capabilities = 0;
40 120
41 /* init fields common to all four types of SessSetup */ 121 /* init fields common to all four types of SessSetup */
42 /* note that header is initialized to zero in header_assemble */ 122 /* Note that offsets for first seven fields in req struct are same */
123 /* in CIFS Specs so does not matter which of 3 forms of struct */
124 /* that we use in next few lines */
125 /* Note that header is initialized to zero in header_assemble */
43 pSMB->req.AndXCommand = 0xFF; 126 pSMB->req.AndXCommand = 0xFF;
44 pSMB->req.MaxBufferSize = cpu_to_le16(ses->server->maxBuf); 127 pSMB->req.MaxBufferSize = cpu_to_le16(ses->server->maxBuf);
45 pSMB->req.MaxMpxCount = cpu_to_le16(ses->server->maxReq); 128 pSMB->req.MaxMpxCount = cpu_to_le16(ses->server->maxReq);
129 pSMB->req.VcNumber = get_next_vcnum(ses);
46 130
47 /* Now no need to set SMBFLG_CASELESS or obsolete CANONICAL PATH */ 131 /* Now no need to set SMBFLG_CASELESS or obsolete CANONICAL PATH */
48 132
@@ -71,7 +155,6 @@ static __u32 cifs_ssetup_hdr(struct cifsSesInfo *ses, SESSION_SETUP_ANDX *pSMB)
71 if (ses->capabilities & CAP_UNIX) 155 if (ses->capabilities & CAP_UNIX)
72 capabilities |= CAP_UNIX; 156 capabilities |= CAP_UNIX;
73 157
74 /* BB check whether to init vcnum BB */
75 return capabilities; 158 return capabilities;
76} 159}
77 160
@@ -228,7 +311,7 @@ static int decode_unicode_ssetup(char **pbcc_area, int bleft,
228 311
229 kfree(ses->serverOS); 312 kfree(ses->serverOS);
230 /* UTF-8 string will not grow more than four times as big as UCS-16 */ 313 /* UTF-8 string will not grow more than four times as big as UCS-16 */
231 ses->serverOS = kzalloc(4 * len, GFP_KERNEL); 314 ses->serverOS = kzalloc((4 * len) + 2 /* trailing null */, GFP_KERNEL);
232 if (ses->serverOS != NULL) 315 if (ses->serverOS != NULL)
233 cifs_strfromUCS_le(ses->serverOS, (__le16 *)data, len, nls_cp); 316 cifs_strfromUCS_le(ses->serverOS, (__le16 *)data, len, nls_cp);
234 data += 2 * (len + 1); 317 data += 2 * (len + 1);
@@ -241,7 +324,7 @@ static int decode_unicode_ssetup(char **pbcc_area, int bleft,
241 return rc; 324 return rc;
242 325
243 kfree(ses->serverNOS); 326 kfree(ses->serverNOS);
244 ses->serverNOS = kzalloc(4 * len, GFP_KERNEL); /* BB this is wrong length FIXME BB */ 327 ses->serverNOS = kzalloc((4 * len) + 2 /* trailing null */, GFP_KERNEL);
245 if (ses->serverNOS != NULL) { 328 if (ses->serverNOS != NULL) {
246 cifs_strfromUCS_le(ses->serverNOS, (__le16 *)data, len, 329 cifs_strfromUCS_le(ses->serverNOS, (__le16 *)data, len,
247 nls_cp); 330 nls_cp);
diff --git a/fs/compat.c b/fs/compat.c
index 65a070e705ab..d0145ca27572 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1407,7 +1407,7 @@ int compat_do_execve(char * filename,
1407 bprm->cred = prepare_exec_creds(); 1407 bprm->cred = prepare_exec_creds();
1408 if (!bprm->cred) 1408 if (!bprm->cred)
1409 goto out_unlock; 1409 goto out_unlock;
1410 check_unsafe_exec(bprm); 1410 check_unsafe_exec(bprm, current->files);
1411 1411
1412 file = open_exec(filename); 1412 file = open_exec(filename);
1413 retval = PTR_ERR(file); 1413 retval = PTR_ERR(file);
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index c8f8d5904f5e..45e59d3c7f1f 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -785,7 +785,7 @@ static int sg_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg)
785 785
786 if (copy_in_user(&sgio->status, &sgio32->status, 786 if (copy_in_user(&sgio->status, &sgio32->status,
787 (4 * sizeof(unsigned char)) + 787 (4 * sizeof(unsigned char)) +
788 (2 * sizeof(unsigned (short))) + 788 (2 * sizeof(unsigned short)) +
789 (3 * sizeof(int)))) 789 (3 * sizeof(int))))
790 return -EFAULT; 790 return -EFAULT;
791 791
@@ -1913,6 +1913,9 @@ COMPATIBLE_IOCTL(FIONREAD) /* This is also TIOCINQ */
1913/* 0x00 */ 1913/* 0x00 */
1914COMPATIBLE_IOCTL(FIBMAP) 1914COMPATIBLE_IOCTL(FIBMAP)
1915COMPATIBLE_IOCTL(FIGETBSZ) 1915COMPATIBLE_IOCTL(FIGETBSZ)
1916/* 'X' - originally XFS but some now in the VFS */
1917COMPATIBLE_IOCTL(FIFREEZE)
1918COMPATIBLE_IOCTL(FITHAW)
1916/* RAID */ 1919/* RAID */
1917COMPATIBLE_IOCTL(RAID_VERSION) 1920COMPATIBLE_IOCTL(RAID_VERSION)
1918COMPATIBLE_IOCTL(GET_ARRAY_INFO) 1921COMPATIBLE_IOCTL(GET_ARRAY_INFO)
@@ -1938,6 +1941,8 @@ ULONG_IOCTL(SET_BITMAP_FILE)
1938/* Big K */ 1941/* Big K */
1939COMPATIBLE_IOCTL(PIO_FONT) 1942COMPATIBLE_IOCTL(PIO_FONT)
1940COMPATIBLE_IOCTL(GIO_FONT) 1943COMPATIBLE_IOCTL(GIO_FONT)
1944COMPATIBLE_IOCTL(PIO_CMAP)
1945COMPATIBLE_IOCTL(GIO_CMAP)
1941ULONG_IOCTL(KDSIGACCEPT) 1946ULONG_IOCTL(KDSIGACCEPT)
1942COMPATIBLE_IOCTL(KDGETKEYCODE) 1947COMPATIBLE_IOCTL(KDGETKEYCODE)
1943COMPATIBLE_IOCTL(KDSETKEYCODE) 1948COMPATIBLE_IOCTL(KDSETKEYCODE)
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 9c2358391147..8e93341f3e82 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -553,24 +553,12 @@ static void detach_groups(struct config_group *group)
553 553
554 child = sd->s_dentry; 554 child = sd->s_dentry;
555 555
556 /*
557 * Note: we hide this from lockdep since we have no way
558 * to teach lockdep about recursive
559 * I_MUTEX_PARENT -> I_MUTEX_CHILD patterns along a path
560 * in an inode tree, which are valid as soon as
561 * I_MUTEX_PARENT -> I_MUTEX_CHILD is valid from a
562 * parent inode to one of its children.
563 */
564 lockdep_off();
565 mutex_lock(&child->d_inode->i_mutex); 556 mutex_lock(&child->d_inode->i_mutex);
566 lockdep_on();
567 557
568 configfs_detach_group(sd->s_element); 558 configfs_detach_group(sd->s_element);
569 child->d_inode->i_flags |= S_DEAD; 559 child->d_inode->i_flags |= S_DEAD;
570 560
571 lockdep_off();
572 mutex_unlock(&child->d_inode->i_mutex); 561 mutex_unlock(&child->d_inode->i_mutex);
573 lockdep_on();
574 562
575 d_delete(child); 563 d_delete(child);
576 dput(child); 564 dput(child);
@@ -760,22 +748,11 @@ static int configfs_attach_item(struct config_item *parent_item,
760 * We are going to remove an inode and its dentry but 748 * We are going to remove an inode and its dentry but
761 * the VFS may already have hit and used them. Thus, 749 * the VFS may already have hit and used them. Thus,
762 * we must lock them as rmdir() would. 750 * we must lock them as rmdir() would.
763 *
764 * Note: we hide this from lockdep since we have no way
765 * to teach lockdep about recursive
766 * I_MUTEX_PARENT -> I_MUTEX_CHILD patterns along a path
767 * in an inode tree, which are valid as soon as
768 * I_MUTEX_PARENT -> I_MUTEX_CHILD is valid from a
769 * parent inode to one of its children.
770 */ 751 */
771 lockdep_off();
772 mutex_lock(&dentry->d_inode->i_mutex); 752 mutex_lock(&dentry->d_inode->i_mutex);
773 lockdep_on();
774 configfs_remove_dir(item); 753 configfs_remove_dir(item);
775 dentry->d_inode->i_flags |= S_DEAD; 754 dentry->d_inode->i_flags |= S_DEAD;
776 lockdep_off();
777 mutex_unlock(&dentry->d_inode->i_mutex); 755 mutex_unlock(&dentry->d_inode->i_mutex);
778 lockdep_on();
779 d_delete(dentry); 756 d_delete(dentry);
780 } 757 }
781 } 758 }
@@ -810,25 +787,14 @@ static int configfs_attach_group(struct config_item *parent_item,
810 * 787 *
811 * We must also lock the inode to remove it safely in case of 788 * We must also lock the inode to remove it safely in case of
812 * error, as rmdir() would. 789 * error, as rmdir() would.
813 *
814 * Note: we hide this from lockdep since we have no way
815 * to teach lockdep about recursive
816 * I_MUTEX_PARENT -> I_MUTEX_CHILD patterns along a path
817 * in an inode tree, which are valid as soon as
818 * I_MUTEX_PARENT -> I_MUTEX_CHILD is valid from a
819 * parent inode to one of its children.
820 */ 790 */
821 lockdep_off();
822 mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD); 791 mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD);
823 lockdep_on();
824 ret = populate_groups(to_config_group(item)); 792 ret = populate_groups(to_config_group(item));
825 if (ret) { 793 if (ret) {
826 configfs_detach_item(item); 794 configfs_detach_item(item);
827 dentry->d_inode->i_flags |= S_DEAD; 795 dentry->d_inode->i_flags |= S_DEAD;
828 } 796 }
829 lockdep_off();
830 mutex_unlock(&dentry->d_inode->i_mutex); 797 mutex_unlock(&dentry->d_inode->i_mutex);
831 lockdep_on();
832 if (ret) 798 if (ret)
833 d_delete(dentry); 799 d_delete(dentry);
834 } 800 }
@@ -990,17 +956,7 @@ static int configfs_depend_prep(struct dentry *origin,
990 BUG_ON(!origin || !sd); 956 BUG_ON(!origin || !sd);
991 957
992 /* Lock this guy on the way down */ 958 /* Lock this guy on the way down */
993 /*
994 * Note: we hide this from lockdep since we have no way
995 * to teach lockdep about recursive
996 * I_MUTEX_PARENT -> I_MUTEX_CHILD patterns along a path
997 * in an inode tree, which are valid as soon as
998 * I_MUTEX_PARENT -> I_MUTEX_CHILD is valid from a
999 * parent inode to one of its children.
1000 */
1001 lockdep_off();
1002 mutex_lock(&sd->s_dentry->d_inode->i_mutex); 959 mutex_lock(&sd->s_dentry->d_inode->i_mutex);
1003 lockdep_on();
1004 if (sd->s_element == target) /* Boo-yah */ 960 if (sd->s_element == target) /* Boo-yah */
1005 goto out; 961 goto out;
1006 962
@@ -1014,9 +970,7 @@ static int configfs_depend_prep(struct dentry *origin,
1014 } 970 }
1015 971
1016 /* We looped all our children and didn't find target */ 972 /* We looped all our children and didn't find target */
1017 lockdep_off();
1018 mutex_unlock(&sd->s_dentry->d_inode->i_mutex); 973 mutex_unlock(&sd->s_dentry->d_inode->i_mutex);
1019 lockdep_on();
1020 ret = -ENOENT; 974 ret = -ENOENT;
1021 975
1022out: 976out:
@@ -1036,16 +990,11 @@ static void configfs_depend_rollback(struct dentry *origin,
1036 struct dentry *dentry = item->ci_dentry; 990 struct dentry *dentry = item->ci_dentry;
1037 991
1038 while (dentry != origin) { 992 while (dentry != origin) {
1039 /* See comments in configfs_depend_prep() */
1040 lockdep_off();
1041 mutex_unlock(&dentry->d_inode->i_mutex); 993 mutex_unlock(&dentry->d_inode->i_mutex);
1042 lockdep_on();
1043 dentry = dentry->d_parent; 994 dentry = dentry->d_parent;
1044 } 995 }
1045 996
1046 lockdep_off();
1047 mutex_unlock(&origin->d_inode->i_mutex); 997 mutex_unlock(&origin->d_inode->i_mutex);
1048 lockdep_on();
1049} 998}
1050 999
1051int configfs_depend_item(struct configfs_subsystem *subsys, 1000int configfs_depend_item(struct configfs_subsystem *subsys,
@@ -1380,16 +1329,8 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
1380 } 1329 }
1381 1330
1382 /* Wait until the racing operation terminates */ 1331 /* Wait until the racing operation terminates */
1383 /*
1384 * Note: we hide this from lockdep since we are locked
1385 * with subclass I_MUTEX_NORMAL from vfs_rmdir() (why
1386 * not I_MUTEX_CHILD?), and I_MUTEX_XATTR or
1387 * I_MUTEX_QUOTA are not relevant for the locked inode.
1388 */
1389 lockdep_off();
1390 mutex_lock(wait_mutex); 1332 mutex_lock(wait_mutex);
1391 mutex_unlock(wait_mutex); 1333 mutex_unlock(wait_mutex);
1392 lockdep_on();
1393 } 1334 }
1394 } while (ret == -EAGAIN); 1335 } while (ret == -EAGAIN);
1395 1336
diff --git a/fs/dcache.c b/fs/dcache.c
index 937df0fb0da5..07e2d4a44bda 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1180,7 +1180,7 @@ struct dentry *d_obtain_alias(struct inode *inode)
1180 iput(inode); 1180 iput(inode);
1181 return res; 1181 return res;
1182} 1182}
1183EXPORT_SYMBOL_GPL(d_obtain_alias); 1183EXPORT_SYMBOL(d_obtain_alias);
1184 1184
1185/** 1185/**
1186 * d_splice_alias - splice a disconnected dentry into the tree if one exists 1186 * d_splice_alias - splice a disconnected dentry into the tree if one exists
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 5f3231b9633f..bff4052b05e7 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -198,9 +198,6 @@ static int mknod_ptmx(struct super_block *sb)
198 198
199 fsi->ptmx_dentry = dentry; 199 fsi->ptmx_dentry = dentry;
200 rc = 0; 200 rc = 0;
201
202 printk(KERN_DEBUG "Created ptmx node in devpts ino %lu\n",
203 inode->i_ino);
204out: 201out:
205 mutex_unlock(&root->d_inode->i_mutex); 202 mutex_unlock(&root->d_inode->i_mutex);
206 return rc; 203 return rc;
@@ -369,8 +366,6 @@ static int new_pts_mount(struct file_system_type *fs_type, int flags,
369 struct pts_fs_info *fsi; 366 struct pts_fs_info *fsi;
370 struct pts_mount_opts *opts; 367 struct pts_mount_opts *opts;
371 368
372 printk(KERN_NOTICE "devpts: newinstance mount\n");
373
374 err = get_sb_nodev(fs_type, flags, data, devpts_fill_super, mnt); 369 err = get_sb_nodev(fs_type, flags, data, devpts_fill_super, mnt);
375 if (err) 370 if (err)
376 return err; 371 return err;
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index c01e043670e2..bdca1f4b3a3e 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -946,6 +946,8 @@ static int ecryptfs_copy_mount_wide_sigs_to_inode_sigs(
946 list_for_each_entry(global_auth_tok, 946 list_for_each_entry(global_auth_tok,
947 &mount_crypt_stat->global_auth_tok_list, 947 &mount_crypt_stat->global_auth_tok_list,
948 mount_crypt_stat_list) { 948 mount_crypt_stat_list) {
949 if (global_auth_tok->flags & ECRYPTFS_AUTH_TOK_FNEK)
950 continue;
949 rc = ecryptfs_add_keysig(crypt_stat, global_auth_tok->sig); 951 rc = ecryptfs_add_keysig(crypt_stat, global_auth_tok->sig);
950 if (rc) { 952 if (rc) {
951 printk(KERN_ERR "Error adding keysig; rc = [%d]\n", rc); 953 printk(KERN_ERR "Error adding keysig; rc = [%d]\n", rc);
@@ -1716,7 +1718,7 @@ static int ecryptfs_copy_filename(char **copied_name, size_t *copied_name_size,
1716{ 1718{
1717 int rc = 0; 1719 int rc = 0;
1718 1720
1719 (*copied_name) = kmalloc((name_size + 2), GFP_KERNEL); 1721 (*copied_name) = kmalloc((name_size + 1), GFP_KERNEL);
1720 if (!(*copied_name)) { 1722 if (!(*copied_name)) {
1721 rc = -ENOMEM; 1723 rc = -ENOMEM;
1722 goto out; 1724 goto out;
@@ -1726,7 +1728,7 @@ static int ecryptfs_copy_filename(char **copied_name, size_t *copied_name_size,
1726 * in printing out the 1728 * in printing out the
1727 * string in debug 1729 * string in debug
1728 * messages */ 1730 * messages */
1729 (*copied_name_size) = (name_size + 1); 1731 (*copied_name_size) = name_size;
1730out: 1732out:
1731 return rc; 1733 return rc;
1732} 1734}
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index c11fc95714ab..eb2267eca1fe 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -328,6 +328,7 @@ struct ecryptfs_dentry_info {
328 */ 328 */
329struct ecryptfs_global_auth_tok { 329struct ecryptfs_global_auth_tok {
330#define ECRYPTFS_AUTH_TOK_INVALID 0x00000001 330#define ECRYPTFS_AUTH_TOK_INVALID 0x00000001
331#define ECRYPTFS_AUTH_TOK_FNEK 0x00000002
331 u32 flags; 332 u32 flags;
332 struct list_head mount_crypt_stat_list; 333 struct list_head mount_crypt_stat_list;
333 struct key *global_auth_tok_key; 334 struct key *global_auth_tok_key;
@@ -696,7 +697,7 @@ ecryptfs_write_header_metadata(char *virt,
696int ecryptfs_add_keysig(struct ecryptfs_crypt_stat *crypt_stat, char *sig); 697int ecryptfs_add_keysig(struct ecryptfs_crypt_stat *crypt_stat, char *sig);
697int 698int
698ecryptfs_add_global_auth_tok(struct ecryptfs_mount_crypt_stat *mount_crypt_stat, 699ecryptfs_add_global_auth_tok(struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
699 char *sig); 700 char *sig, u32 global_auth_tok_flags);
700int ecryptfs_get_global_auth_tok_for_sig( 701int ecryptfs_get_global_auth_tok_for_sig(
701 struct ecryptfs_global_auth_tok **global_auth_tok, 702 struct ecryptfs_global_auth_tok **global_auth_tok,
702 struct ecryptfs_mount_crypt_stat *mount_crypt_stat, char *sig); 703 struct ecryptfs_mount_crypt_stat *mount_crypt_stat, char *sig);
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index ff539420cc6f..e4a6223c3145 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -2375,7 +2375,7 @@ struct kmem_cache *ecryptfs_global_auth_tok_cache;
2375 2375
2376int 2376int
2377ecryptfs_add_global_auth_tok(struct ecryptfs_mount_crypt_stat *mount_crypt_stat, 2377ecryptfs_add_global_auth_tok(struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
2378 char *sig) 2378 char *sig, u32 global_auth_tok_flags)
2379{ 2379{
2380 struct ecryptfs_global_auth_tok *new_auth_tok; 2380 struct ecryptfs_global_auth_tok *new_auth_tok;
2381 int rc = 0; 2381 int rc = 0;
@@ -2389,6 +2389,7 @@ ecryptfs_add_global_auth_tok(struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
2389 goto out; 2389 goto out;
2390 } 2390 }
2391 memcpy(new_auth_tok->sig, sig, ECRYPTFS_SIG_SIZE_HEX); 2391 memcpy(new_auth_tok->sig, sig, ECRYPTFS_SIG_SIZE_HEX);
2392 new_auth_tok->flags = global_auth_tok_flags;
2392 new_auth_tok->sig[ECRYPTFS_SIG_SIZE_HEX] = '\0'; 2393 new_auth_tok->sig[ECRYPTFS_SIG_SIZE_HEX] = '\0';
2393 mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex); 2394 mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex);
2394 list_add(&new_auth_tok->mount_crypt_stat_list, 2395 list_add(&new_auth_tok->mount_crypt_stat_list,
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 789cf2e1be1e..aed56c25539b 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -319,7 +319,7 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
319 case ecryptfs_opt_ecryptfs_sig: 319 case ecryptfs_opt_ecryptfs_sig:
320 sig_src = args[0].from; 320 sig_src = args[0].from;
321 rc = ecryptfs_add_global_auth_tok(mount_crypt_stat, 321 rc = ecryptfs_add_global_auth_tok(mount_crypt_stat,
322 sig_src); 322 sig_src, 0);
323 if (rc) { 323 if (rc) {
324 printk(KERN_ERR "Error attempting to register " 324 printk(KERN_ERR "Error attempting to register "
325 "global sig; rc = [%d]\n", rc); 325 "global sig; rc = [%d]\n", rc);
@@ -370,7 +370,8 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
370 ECRYPTFS_SIG_SIZE_HEX] = '\0'; 370 ECRYPTFS_SIG_SIZE_HEX] = '\0';
371 rc = ecryptfs_add_global_auth_tok( 371 rc = ecryptfs_add_global_auth_tok(
372 mount_crypt_stat, 372 mount_crypt_stat,
373 mount_crypt_stat->global_default_fnek_sig); 373 mount_crypt_stat->global_default_fnek_sig,
374 ECRYPTFS_AUTH_TOK_FNEK);
374 if (rc) { 375 if (rc) {
375 printk(KERN_ERR "Error attempting to register " 376 printk(KERN_ERR "Error attempting to register "
376 "global fnek sig [%s]; rc = [%d]\n", 377 "global fnek sig [%s]; rc = [%d]\n",
diff --git a/fs/exec.c b/fs/exec.c
index 0dd60a01f1b4..929b58004b7e 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1049,16 +1049,32 @@ EXPORT_SYMBOL(install_exec_creds);
1049 * - the caller must hold current->cred_exec_mutex to protect against 1049 * - the caller must hold current->cred_exec_mutex to protect against
1050 * PTRACE_ATTACH 1050 * PTRACE_ATTACH
1051 */ 1051 */
1052void check_unsafe_exec(struct linux_binprm *bprm) 1052void check_unsafe_exec(struct linux_binprm *bprm, struct files_struct *files)
1053{ 1053{
1054 struct task_struct *p = current; 1054 struct task_struct *p = current, *t;
1055 unsigned long flags;
1056 unsigned n_fs, n_files, n_sighand;
1055 1057
1056 bprm->unsafe = tracehook_unsafe_exec(p); 1058 bprm->unsafe = tracehook_unsafe_exec(p);
1057 1059
1058 if (atomic_read(&p->fs->count) > 1 || 1060 n_fs = 1;
1059 atomic_read(&p->files->count) > 1 || 1061 n_files = 1;
1060 atomic_read(&p->sighand->count) > 1) 1062 n_sighand = 1;
1063 lock_task_sighand(p, &flags);
1064 for (t = next_thread(p); t != p; t = next_thread(t)) {
1065 if (t->fs == p->fs)
1066 n_fs++;
1067 if (t->files == files)
1068 n_files++;
1069 n_sighand++;
1070 }
1071
1072 if (atomic_read(&p->fs->count) > n_fs ||
1073 atomic_read(&p->files->count) > n_files ||
1074 atomic_read(&p->sighand->count) > n_sighand)
1061 bprm->unsafe |= LSM_UNSAFE_SHARE; 1075 bprm->unsafe |= LSM_UNSAFE_SHARE;
1076
1077 unlock_task_sighand(p, &flags);
1062} 1078}
1063 1079
1064/* 1080/*
@@ -1273,7 +1289,7 @@ int do_execve(char * filename,
1273 bprm->cred = prepare_exec_creds(); 1289 bprm->cred = prepare_exec_creds();
1274 if (!bprm->cred) 1290 if (!bprm->cred)
1275 goto out_unlock; 1291 goto out_unlock;
1276 check_unsafe_exec(bprm); 1292 check_unsafe_exec(bprm, displaced);
1277 1293
1278 file = open_exec(filename); 1294 file = open_exec(filename);
1279 retval = PTR_ERR(file); 1295 retval = PTR_ERR(file);
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index da8bdeaa2e6d..7c6e3606f0ec 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1185,9 +1185,12 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
1185 es = sbi->s_es; 1185 es = sbi->s_es;
1186 if (((sbi->s_mount_opt & EXT2_MOUNT_XIP) != 1186 if (((sbi->s_mount_opt & EXT2_MOUNT_XIP) !=
1187 (old_mount_opt & EXT2_MOUNT_XIP)) && 1187 (old_mount_opt & EXT2_MOUNT_XIP)) &&
1188 invalidate_inodes(sb)) 1188 invalidate_inodes(sb)) {
1189 ext2_warning(sb, __func__, "busy inodes while remounting "\ 1189 ext2_warning(sb, __func__, "refusing change of xip flag "
1190 "xip remain in cache (no functional problem)"); 1190 "with busy inodes while remounting");
1191 sbi->s_mount_opt &= ~EXT2_MOUNT_XIP;
1192 sbi->s_mount_opt |= old_mount_opt & EXT2_MOUNT_XIP;
1193 }
1191 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) 1194 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
1192 return 0; 1195 return 0;
1193 if (*flags & MS_RDONLY) { 1196 if (*flags & MS_RDONLY) {
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index b70d90e08a3c..4a970411a458 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -2428,12 +2428,13 @@ static void ext3_write_super (struct super_block * sb)
2428 2428
2429static int ext3_sync_fs(struct super_block *sb, int wait) 2429static int ext3_sync_fs(struct super_block *sb, int wait)
2430{ 2430{
2431 sb->s_dirt = 0; 2431 tid_t target;
2432 if (wait)
2433 ext3_force_commit(sb);
2434 else
2435 journal_start_commit(EXT3_SB(sb)->s_journal, NULL);
2436 2432
2433 sb->s_dirt = 0;
2434 if (journal_start_commit(EXT3_SB(sb)->s_journal, &target)) {
2435 if (wait)
2436 log_wait_commit(EXT3_SB(sb)->s_journal, target);
2437 }
2437 return 0; 2438 return 0;
2438} 2439}
2439 2440
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 9a50b8052dcf..de9459b4cb94 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -609,7 +609,9 @@ int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
609 */ 609 */
610int ext4_should_retry_alloc(struct super_block *sb, int *retries) 610int ext4_should_retry_alloc(struct super_block *sb, int *retries)
611{ 611{
612 if (!ext4_has_free_blocks(EXT4_SB(sb), 1) || (*retries)++ > 3) 612 if (!ext4_has_free_blocks(EXT4_SB(sb), 1) ||
613 (*retries)++ > 3 ||
614 !EXT4_SB(sb)->s_journal)
613 return 0; 615 return 0;
614 616
615 jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id); 617 jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index aafc9eba1c25..b0c87dce66a3 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -868,7 +868,7 @@ static inline unsigned ext4_rec_len_from_disk(__le16 dlen)
868{ 868{
869 unsigned len = le16_to_cpu(dlen); 869 unsigned len = le16_to_cpu(dlen);
870 870
871 if (len == EXT4_MAX_REC_LEN) 871 if (len == EXT4_MAX_REC_LEN || len == 0)
872 return 1 << 16; 872 return 1 << 16;
873 return len; 873 return len;
874} 874}
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 4fb86a0061d0..627f8c3337a3 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -188,7 +188,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
188 struct ext4_group_desc *gdp; 188 struct ext4_group_desc *gdp;
189 struct ext4_super_block *es; 189 struct ext4_super_block *es;
190 struct ext4_sb_info *sbi; 190 struct ext4_sb_info *sbi;
191 int fatal = 0, err, count; 191 int fatal = 0, err, count, cleared;
192 ext4_group_t flex_group; 192 ext4_group_t flex_group;
193 193
194 if (atomic_read(&inode->i_count) > 1) { 194 if (atomic_read(&inode->i_count) > 1) {
@@ -248,8 +248,10 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
248 goto error_return; 248 goto error_return;
249 249
250 /* Ok, now we can actually update the inode bitmaps.. */ 250 /* Ok, now we can actually update the inode bitmaps.. */
251 if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group), 251 spin_lock(sb_bgl_lock(sbi, block_group));
252 bit, bitmap_bh->b_data)) 252 cleared = ext4_clear_bit(bit, bitmap_bh->b_data);
253 spin_unlock(sb_bgl_lock(sbi, block_group));
254 if (!cleared)
253 ext4_error(sb, "ext4_free_inode", 255 ext4_error(sb, "ext4_free_inode",
254 "bit already cleared for inode %lu", ino); 256 "bit already cleared for inode %lu", ino);
255 else { 257 else {
@@ -715,6 +717,13 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
715 717
716 if (sbi->s_log_groups_per_flex) { 718 if (sbi->s_log_groups_per_flex) {
717 ret2 = find_group_flex(sb, dir, &group); 719 ret2 = find_group_flex(sb, dir, &group);
720 if (ret2 == -1) {
721 ret2 = find_group_other(sb, dir, &group);
722 if (ret2 == 0 && printk_ratelimit())
723 printk(KERN_NOTICE "ext4: find_group_flex "
724 "failed, fallback succeeded dir %lu\n",
725 dir->i_ino);
726 }
718 goto got_group; 727 goto got_group;
719 } 728 }
720 729
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 03ba20be1329..c7fed5b18745 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -47,8 +47,10 @@
47static inline int ext4_begin_ordered_truncate(struct inode *inode, 47static inline int ext4_begin_ordered_truncate(struct inode *inode,
48 loff_t new_size) 48 loff_t new_size)
49{ 49{
50 return jbd2_journal_begin_ordered_truncate(&EXT4_I(inode)->jinode, 50 return jbd2_journal_begin_ordered_truncate(
51 new_size); 51 EXT4_SB(inode->i_sb)->s_journal,
52 &EXT4_I(inode)->jinode,
53 new_size);
52} 54}
53 55
54static void ext4_invalidatepage(struct page *page, unsigned long offset); 56static void ext4_invalidatepage(struct page *page, unsigned long offset);
@@ -1366,6 +1368,10 @@ retry:
1366 goto out; 1368 goto out;
1367 } 1369 }
1368 1370
1371 /* We cannot recurse into the filesystem as the transaction is already
1372 * started */
1373 flags |= AOP_FLAG_NOFS;
1374
1369 page = grab_cache_page_write_begin(mapping, index, flags); 1375 page = grab_cache_page_write_begin(mapping, index, flags);
1370 if (!page) { 1376 if (!page) {
1371 ext4_journal_stop(handle); 1377 ext4_journal_stop(handle);
@@ -1375,7 +1381,7 @@ retry:
1375 *pagep = page; 1381 *pagep = page;
1376 1382
1377 ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 1383 ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
1378 ext4_get_block); 1384 ext4_get_block);
1379 1385
1380 if (!ret && ext4_should_journal_data(inode)) { 1386 if (!ret && ext4_should_journal_data(inode)) {
1381 ret = walk_page_buffers(handle, page_buffers(page), 1387 ret = walk_page_buffers(handle, page_buffers(page),
@@ -2437,6 +2443,7 @@ static int ext4_da_writepages(struct address_space *mapping,
2437 int no_nrwrite_index_update; 2443 int no_nrwrite_index_update;
2438 int pages_written = 0; 2444 int pages_written = 0;
2439 long pages_skipped; 2445 long pages_skipped;
2446 int range_cyclic, cycled = 1, io_done = 0;
2440 int needed_blocks, ret = 0, nr_to_writebump = 0; 2447 int needed_blocks, ret = 0, nr_to_writebump = 0;
2441 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); 2448 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
2442 2449
@@ -2488,9 +2495,15 @@ static int ext4_da_writepages(struct address_space *mapping,
2488 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 2495 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
2489 range_whole = 1; 2496 range_whole = 1;
2490 2497
2491 if (wbc->range_cyclic) 2498 range_cyclic = wbc->range_cyclic;
2499 if (wbc->range_cyclic) {
2492 index = mapping->writeback_index; 2500 index = mapping->writeback_index;
2493 else 2501 if (index)
2502 cycled = 0;
2503 wbc->range_start = index << PAGE_CACHE_SHIFT;
2504 wbc->range_end = LLONG_MAX;
2505 wbc->range_cyclic = 0;
2506 } else
2494 index = wbc->range_start >> PAGE_CACHE_SHIFT; 2507 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2495 2508
2496 mpd.wbc = wbc; 2509 mpd.wbc = wbc;
@@ -2504,6 +2517,7 @@ static int ext4_da_writepages(struct address_space *mapping,
2504 wbc->no_nrwrite_index_update = 1; 2517 wbc->no_nrwrite_index_update = 1;
2505 pages_skipped = wbc->pages_skipped; 2518 pages_skipped = wbc->pages_skipped;
2506 2519
2520retry:
2507 while (!ret && wbc->nr_to_write > 0) { 2521 while (!ret && wbc->nr_to_write > 0) {
2508 2522
2509 /* 2523 /*
@@ -2530,7 +2544,7 @@ static int ext4_da_writepages(struct address_space *mapping,
2530 2544
2531 ext4_journal_stop(handle); 2545 ext4_journal_stop(handle);
2532 2546
2533 if (mpd.retval == -ENOSPC) { 2547 if ((mpd.retval == -ENOSPC) && sbi->s_journal) {
2534 /* commit the transaction which would 2548 /* commit the transaction which would
2535 * free blocks released in the transaction 2549 * free blocks released in the transaction
2536 * and try again 2550 * and try again
@@ -2546,6 +2560,7 @@ static int ext4_da_writepages(struct address_space *mapping,
2546 pages_written += mpd.pages_written; 2560 pages_written += mpd.pages_written;
2547 wbc->pages_skipped = pages_skipped; 2561 wbc->pages_skipped = pages_skipped;
2548 ret = 0; 2562 ret = 0;
2563 io_done = 1;
2549 } else if (wbc->nr_to_write) 2564 } else if (wbc->nr_to_write)
2550 /* 2565 /*
2551 * There is no more writeout needed 2566 * There is no more writeout needed
@@ -2554,6 +2569,13 @@ static int ext4_da_writepages(struct address_space *mapping,
2554 */ 2569 */
2555 break; 2570 break;
2556 } 2571 }
2572 if (!io_done && !cycled) {
2573 cycled = 1;
2574 index = 0;
2575 wbc->range_start = index << PAGE_CACHE_SHIFT;
2576 wbc->range_end = mapping->writeback_index - 1;
2577 goto retry;
2578 }
2557 if (pages_skipped != wbc->pages_skipped) 2579 if (pages_skipped != wbc->pages_skipped)
2558 printk(KERN_EMERG "This should not happen leaving %s " 2580 printk(KERN_EMERG "This should not happen leaving %s "
2559 "with nr_to_write = %ld ret = %d\n", 2581 "with nr_to_write = %ld ret = %d\n",
@@ -2561,6 +2583,7 @@ static int ext4_da_writepages(struct address_space *mapping,
2561 2583
2562 /* Update index */ 2584 /* Update index */
2563 index += pages_written; 2585 index += pages_written;
2586 wbc->range_cyclic = range_cyclic;
2564 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) 2587 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
2565 /* 2588 /*
2566 * set the writeback_index so that range_cyclic 2589 * set the writeback_index so that range_cyclic
@@ -2648,6 +2671,9 @@ retry:
2648 ret = PTR_ERR(handle); 2671 ret = PTR_ERR(handle);
2649 goto out; 2672 goto out;
2650 } 2673 }
2674 /* We cannot recurse into the filesystem as the transaction is already
2675 * started */
2676 flags |= AOP_FLAG_NOFS;
2651 2677
2652 page = grab_cache_page_write_begin(mapping, index, flags); 2678 page = grab_cache_page_write_begin(mapping, index, flags);
2653 if (!page) { 2679 if (!page) {
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index deba54f6cbed..4415beeb0b62 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3693,6 +3693,8 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
3693 pa->pa_free = pa->pa_len; 3693 pa->pa_free = pa->pa_len;
3694 atomic_set(&pa->pa_count, 1); 3694 atomic_set(&pa->pa_count, 1);
3695 spin_lock_init(&pa->pa_lock); 3695 spin_lock_init(&pa->pa_lock);
3696 INIT_LIST_HEAD(&pa->pa_inode_list);
3697 INIT_LIST_HEAD(&pa->pa_group_list);
3696 pa->pa_deleted = 0; 3698 pa->pa_deleted = 0;
3697 pa->pa_linear = 0; 3699 pa->pa_linear = 0;
3698 3700
@@ -3755,6 +3757,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
3755 atomic_set(&pa->pa_count, 1); 3757 atomic_set(&pa->pa_count, 1);
3756 spin_lock_init(&pa->pa_lock); 3758 spin_lock_init(&pa->pa_lock);
3757 INIT_LIST_HEAD(&pa->pa_inode_list); 3759 INIT_LIST_HEAD(&pa->pa_inode_list);
3760 INIT_LIST_HEAD(&pa->pa_group_list);
3758 pa->pa_deleted = 0; 3761 pa->pa_deleted = 0;
3759 pa->pa_linear = 1; 3762 pa->pa_linear = 1;
3760 3763
@@ -4476,23 +4479,26 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
4476 pa->pa_free -= ac->ac_b_ex.fe_len; 4479 pa->pa_free -= ac->ac_b_ex.fe_len;
4477 pa->pa_len -= ac->ac_b_ex.fe_len; 4480 pa->pa_len -= ac->ac_b_ex.fe_len;
4478 spin_unlock(&pa->pa_lock); 4481 spin_unlock(&pa->pa_lock);
4479 /*
4480 * We want to add the pa to the right bucket.
4481 * Remove it from the list and while adding
4482 * make sure the list to which we are adding
4483 * doesn't grow big.
4484 */
4485 if (likely(pa->pa_free)) {
4486 spin_lock(pa->pa_obj_lock);
4487 list_del_rcu(&pa->pa_inode_list);
4488 spin_unlock(pa->pa_obj_lock);
4489 ext4_mb_add_n_trim(ac);
4490 }
4491 } 4482 }
4492 ext4_mb_put_pa(ac, ac->ac_sb, pa);
4493 } 4483 }
4494 if (ac->alloc_semp) 4484 if (ac->alloc_semp)
4495 up_read(ac->alloc_semp); 4485 up_read(ac->alloc_semp);
4486 if (pa) {
4487 /*
4488 * We want to add the pa to the right bucket.
4489 * Remove it from the list and while adding
4490 * make sure the list to which we are adding
4491 * doesn't grow big. We need to release
4492 * alloc_semp before calling ext4_mb_add_n_trim()
4493 */
4494 if (pa->pa_linear && likely(pa->pa_free)) {
4495 spin_lock(pa->pa_obj_lock);
4496 list_del_rcu(&pa->pa_inode_list);
4497 spin_unlock(pa->pa_obj_lock);
4498 ext4_mb_add_n_trim(ac);
4499 }
4500 ext4_mb_put_pa(ac, ac->ac_sb, pa);
4501 }
4496 if (ac->ac_bitmap_page) 4502 if (ac->ac_bitmap_page)
4497 page_cache_release(ac->ac_bitmap_page); 4503 page_cache_release(ac->ac_bitmap_page);
4498 if (ac->ac_buddy_page) 4504 if (ac->ac_buddy_page)
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 734abca25e35..fe64d9f79852 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -481,7 +481,7 @@ int ext4_ext_migrate(struct inode *inode)
481 + 1); 481 + 1);
482 if (IS_ERR(handle)) { 482 if (IS_ERR(handle)) {
483 retval = PTR_ERR(handle); 483 retval = PTR_ERR(handle);
484 goto err_out; 484 return retval;
485 } 485 }
486 tmp_inode = ext4_new_inode(handle, 486 tmp_inode = ext4_new_inode(handle,
487 inode->i_sb->s_root->d_inode, 487 inode->i_sb->s_root->d_inode,
@@ -489,8 +489,7 @@ int ext4_ext_migrate(struct inode *inode)
489 if (IS_ERR(tmp_inode)) { 489 if (IS_ERR(tmp_inode)) {
490 retval = -ENOMEM; 490 retval = -ENOMEM;
491 ext4_journal_stop(handle); 491 ext4_journal_stop(handle);
492 tmp_inode = NULL; 492 return retval;
493 goto err_out;
494 } 493 }
495 i_size_write(tmp_inode, i_size_read(inode)); 494 i_size_write(tmp_inode, i_size_read(inode));
496 /* 495 /*
@@ -618,8 +617,7 @@ err_out:
618 617
619 ext4_journal_stop(handle); 618 ext4_journal_stop(handle);
620 619
621 if (tmp_inode) 620 iput(tmp_inode);
622 iput(tmp_inode);
623 621
624 return retval; 622 return retval;
625} 623}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index e5f06a5f045e..39d1993cfa13 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3046,14 +3046,17 @@ static void ext4_write_super(struct super_block *sb)
3046static int ext4_sync_fs(struct super_block *sb, int wait) 3046static int ext4_sync_fs(struct super_block *sb, int wait)
3047{ 3047{
3048 int ret = 0; 3048 int ret = 0;
3049 tid_t target;
3049 3050
3050 trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait); 3051 trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait);
3051 sb->s_dirt = 0; 3052 sb->s_dirt = 0;
3052 if (EXT4_SB(sb)->s_journal) { 3053 if (EXT4_SB(sb)->s_journal) {
3053 if (wait) 3054 if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal,
3054 ret = ext4_force_commit(sb); 3055 &target)) {
3055 else 3056 if (wait)
3056 jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, NULL); 3057 jbd2_log_wait_commit(EXT4_SB(sb)->s_journal,
3058 target);
3059 }
3057 } else { 3060 } else {
3058 ext4_commit_super(sb, EXT4_SB(sb)->s_es, wait); 3061 ext4_commit_super(sb, EXT4_SB(sb)->s_es, wait);
3059 } 3062 }
@@ -3088,7 +3091,6 @@ static int ext4_freeze(struct super_block *sb)
3088 3091
3089 /* Journal blocked and flushed, clear needs_recovery flag. */ 3092 /* Journal blocked and flushed, clear needs_recovery flag. */
3090 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 3093 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
3091 ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
3092 error = ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1); 3094 error = ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
3093 if (error) 3095 if (error)
3094 goto out; 3096 goto out;
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 6b74d09adbe5..de0004fe6e00 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -202,9 +202,9 @@ static sector_t _fat_bmap(struct address_space *mapping, sector_t block)
202 sector_t blocknr; 202 sector_t blocknr;
203 203
204 /* fat_get_cluster() assumes the requested blocknr isn't truncated. */ 204 /* fat_get_cluster() assumes the requested blocknr isn't truncated. */
205 mutex_lock(&mapping->host->i_mutex); 205 down_read(&mapping->host->i_alloc_sem);
206 blocknr = generic_block_bmap(mapping, block, fat_get_block); 206 blocknr = generic_block_bmap(mapping, block, fat_get_block);
207 mutex_unlock(&mapping->host->i_mutex); 207 up_read(&mapping->host->i_alloc_sem);
208 208
209 return blocknr; 209 return blocknr;
210} 210}
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index e5eaa62fd17f..e3fe9918faaf 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -274,6 +274,7 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc)
274 int ret; 274 int ret;
275 275
276 BUG_ON(inode->i_state & I_SYNC); 276 BUG_ON(inode->i_state & I_SYNC);
277 WARN_ON(inode->i_state & I_NEW);
277 278
278 /* Set I_SYNC, reset I_DIRTY */ 279 /* Set I_SYNC, reset I_DIRTY */
279 dirty = inode->i_state & I_DIRTY; 280 dirty = inode->i_state & I_DIRTY;
@@ -298,6 +299,7 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc)
298 } 299 }
299 300
300 spin_lock(&inode_lock); 301 spin_lock(&inode_lock);
302 WARN_ON(inode->i_state & I_NEW);
301 inode->i_state &= ~I_SYNC; 303 inode->i_state &= ~I_SYNC;
302 if (!(inode->i_state & I_FREEING)) { 304 if (!(inode->i_state & I_FREEING)) {
303 if (!(inode->i_state & I_DIRTY) && 305 if (!(inode->i_state & I_DIRTY) &&
@@ -470,6 +472,11 @@ void generic_sync_sb_inodes(struct super_block *sb,
470 break; 472 break;
471 } 473 }
472 474
475 if (inode->i_state & I_NEW) {
476 requeue_io(inode);
477 continue;
478 }
479
473 if (wbc->nonblocking && bdi_write_congested(bdi)) { 480 if (wbc->nonblocking && bdi_write_congested(bdi)) {
474 wbc->encountered_congestion = 1; 481 wbc->encountered_congestion = 1;
475 if (!sb_is_blkdev_sb(sb)) 482 if (!sb_is_blkdev_sb(sb))
@@ -531,7 +538,7 @@ void generic_sync_sb_inodes(struct super_block *sb,
531 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 538 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
532 struct address_space *mapping; 539 struct address_space *mapping;
533 540
534 if (inode->i_state & (I_FREEING|I_WILL_FREE)) 541 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW))
535 continue; 542 continue;
536 mapping = inode->i_mapping; 543 mapping = inode->i_mapping;
537 if (mapping->nrpages == 0) 544 if (mapping->nrpages == 0)
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 6903d37af037..9b800d97a687 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -108,7 +108,8 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
108 108
109 if (hugetlb_reserve_pages(inode, 109 if (hugetlb_reserve_pages(inode,
110 vma->vm_pgoff >> huge_page_order(h), 110 vma->vm_pgoff >> huge_page_order(h),
111 len >> huge_page_shift(h), vma)) 111 len >> huge_page_shift(h), vma,
112 vma->vm_flags))
112 goto out; 113 goto out;
113 114
114 ret = 0; 115 ret = 0;
@@ -947,7 +948,7 @@ static int can_do_hugetlb_shm(void)
947 can_do_mlock()); 948 can_do_mlock());
948} 949}
949 950
950struct file *hugetlb_file_setup(const char *name, size_t size) 951struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag)
951{ 952{
952 int error = -ENOMEM; 953 int error = -ENOMEM;
953 struct file *file; 954 struct file *file;
@@ -981,7 +982,8 @@ struct file *hugetlb_file_setup(const char *name, size_t size)
981 982
982 error = -ENOMEM; 983 error = -ENOMEM;
983 if (hugetlb_reserve_pages(inode, 0, 984 if (hugetlb_reserve_pages(inode, 0,
984 size >> huge_page_shift(hstate_inode(inode)), NULL)) 985 size >> huge_page_shift(hstate_inode(inode)), NULL,
986 acctflag))
985 goto out_inode; 987 goto out_inode;
986 988
987 d_instantiate(dentry, inode); 989 d_instantiate(dentry, inode);
diff --git a/fs/inode.c b/fs/inode.c
index 913ab2d9a5d1..826fb0b9d1c3 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -359,6 +359,7 @@ static int invalidate_list(struct list_head *head, struct list_head *dispose)
359 invalidate_inode_buffers(inode); 359 invalidate_inode_buffers(inode);
360 if (!atomic_read(&inode->i_count)) { 360 if (!atomic_read(&inode->i_count)) {
361 list_move(&inode->i_list, dispose); 361 list_move(&inode->i_list, dispose);
362 WARN_ON(inode->i_state & I_NEW);
362 inode->i_state |= I_FREEING; 363 inode->i_state |= I_FREEING;
363 count++; 364 count++;
364 continue; 365 continue;
@@ -460,6 +461,7 @@ static void prune_icache(int nr_to_scan)
460 continue; 461 continue;
461 } 462 }
462 list_move(&inode->i_list, &freeable); 463 list_move(&inode->i_list, &freeable);
464 WARN_ON(inode->i_state & I_NEW);
463 inode->i_state |= I_FREEING; 465 inode->i_state |= I_FREEING;
464 nr_pruned++; 466 nr_pruned++;
465 } 467 }
@@ -656,6 +658,7 @@ void unlock_new_inode(struct inode *inode)
656 * just created it (so there can be no old holders 658 * just created it (so there can be no old holders
657 * that haven't tested I_LOCK). 659 * that haven't tested I_LOCK).
658 */ 660 */
661 WARN_ON((inode->i_state & (I_LOCK|I_NEW)) != (I_LOCK|I_NEW));
659 inode->i_state &= ~(I_LOCK|I_NEW); 662 inode->i_state &= ~(I_LOCK|I_NEW);
660 wake_up_inode(inode); 663 wake_up_inode(inode);
661} 664}
@@ -1145,6 +1148,7 @@ void generic_delete_inode(struct inode *inode)
1145 1148
1146 list_del_init(&inode->i_list); 1149 list_del_init(&inode->i_list);
1147 list_del_init(&inode->i_sb_list); 1150 list_del_init(&inode->i_sb_list);
1151 WARN_ON(inode->i_state & I_NEW);
1148 inode->i_state |= I_FREEING; 1152 inode->i_state |= I_FREEING;
1149 inodes_stat.nr_inodes--; 1153 inodes_stat.nr_inodes--;
1150 spin_unlock(&inode_lock); 1154 spin_unlock(&inode_lock);
@@ -1186,16 +1190,19 @@ static void generic_forget_inode(struct inode *inode)
1186 spin_unlock(&inode_lock); 1190 spin_unlock(&inode_lock);
1187 return; 1191 return;
1188 } 1192 }
1193 WARN_ON(inode->i_state & I_NEW);
1189 inode->i_state |= I_WILL_FREE; 1194 inode->i_state |= I_WILL_FREE;
1190 spin_unlock(&inode_lock); 1195 spin_unlock(&inode_lock);
1191 write_inode_now(inode, 1); 1196 write_inode_now(inode, 1);
1192 spin_lock(&inode_lock); 1197 spin_lock(&inode_lock);
1198 WARN_ON(inode->i_state & I_NEW);
1193 inode->i_state &= ~I_WILL_FREE; 1199 inode->i_state &= ~I_WILL_FREE;
1194 inodes_stat.nr_unused--; 1200 inodes_stat.nr_unused--;
1195 hlist_del_init(&inode->i_hash); 1201 hlist_del_init(&inode->i_hash);
1196 } 1202 }
1197 list_del_init(&inode->i_list); 1203 list_del_init(&inode->i_list);
1198 list_del_init(&inode->i_sb_list); 1204 list_del_init(&inode->i_sb_list);
1205 WARN_ON(inode->i_state & I_NEW);
1199 inode->i_state |= I_FREEING; 1206 inode->i_state |= I_FREEING;
1200 inodes_stat.nr_inodes--; 1207 inodes_stat.nr_inodes--;
1201 spin_unlock(&inode_lock); 1208 spin_unlock(&inode_lock);
diff --git a/fs/internal.h b/fs/internal.h
index 53af885f1732..0d8ac497b3d5 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -43,7 +43,7 @@ extern void __init chrdev_init(void);
43/* 43/*
44 * exec.c 44 * exec.c
45 */ 45 */
46extern void check_unsafe_exec(struct linux_binprm *); 46extern void check_unsafe_exec(struct linux_binprm *, struct files_struct *);
47 47
48/* 48/*
49 * namespace.c 49 * namespace.c
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 9e4fa52d7dc8..e79c07812afa 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -427,7 +427,7 @@ int __log_space_left(journal_t *journal)
427} 427}
428 428
429/* 429/*
430 * Called under j_state_lock. Returns true if a transaction was started. 430 * Called under j_state_lock. Returns true if a transaction commit was started.
431 */ 431 */
432int __log_start_commit(journal_t *journal, tid_t target) 432int __log_start_commit(journal_t *journal, tid_t target)
433{ 433{
@@ -495,7 +495,8 @@ int journal_force_commit_nested(journal_t *journal)
495 495
496/* 496/*
497 * Start a commit of the current running transaction (if any). Returns true 497 * Start a commit of the current running transaction (if any). Returns true
498 * if a transaction was started, and fills its tid in at *ptid 498 * if a transaction is going to be committed (or is currently already
499 * committing), and fills its tid in at *ptid
499 */ 500 */
500int journal_start_commit(journal_t *journal, tid_t *ptid) 501int journal_start_commit(journal_t *journal, tid_t *ptid)
501{ 502{
@@ -505,15 +506,19 @@ int journal_start_commit(journal_t *journal, tid_t *ptid)
505 if (journal->j_running_transaction) { 506 if (journal->j_running_transaction) {
506 tid_t tid = journal->j_running_transaction->t_tid; 507 tid_t tid = journal->j_running_transaction->t_tid;
507 508
508 ret = __log_start_commit(journal, tid); 509 __log_start_commit(journal, tid);
509 if (ret && ptid) 510 /* There's a running transaction and we've just made sure
511 * it's commit has been scheduled. */
512 if (ptid)
510 *ptid = tid; 513 *ptid = tid;
511 } else if (journal->j_committing_transaction && ptid) { 514 ret = 1;
515 } else if (journal->j_committing_transaction) {
512 /* 516 /*
513 * If ext3_write_super() recently started a commit, then we 517 * If ext3_write_super() recently started a commit, then we
514 * have to wait for completion of that transaction 518 * have to wait for completion of that transaction
515 */ 519 */
516 *ptid = journal->j_committing_transaction->t_tid; 520 if (ptid)
521 *ptid = journal->j_committing_transaction->t_tid;
517 ret = 1; 522 ret = 1;
518 } 523 }
519 spin_unlock(&journal->j_state_lock); 524 spin_unlock(&journal->j_state_lock);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index eb343008eded..58144102bf25 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -450,7 +450,7 @@ int __jbd2_log_space_left(journal_t *journal)
450} 450}
451 451
452/* 452/*
453 * Called under j_state_lock. Returns true if a transaction was started. 453 * Called under j_state_lock. Returns true if a transaction commit was started.
454 */ 454 */
455int __jbd2_log_start_commit(journal_t *journal, tid_t target) 455int __jbd2_log_start_commit(journal_t *journal, tid_t target)
456{ 456{
@@ -518,7 +518,8 @@ int jbd2_journal_force_commit_nested(journal_t *journal)
518 518
519/* 519/*
520 * Start a commit of the current running transaction (if any). Returns true 520 * Start a commit of the current running transaction (if any). Returns true
521 * if a transaction was started, and fills its tid in at *ptid 521 * if a transaction is going to be committed (or is currently already
522 * committing), and fills its tid in at *ptid
522 */ 523 */
523int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid) 524int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid)
524{ 525{
@@ -528,15 +529,19 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid)
528 if (journal->j_running_transaction) { 529 if (journal->j_running_transaction) {
529 tid_t tid = journal->j_running_transaction->t_tid; 530 tid_t tid = journal->j_running_transaction->t_tid;
530 531
531 ret = __jbd2_log_start_commit(journal, tid); 532 __jbd2_log_start_commit(journal, tid);
532 if (ret && ptid) 533 /* There's a running transaction and we've just made sure
534 * it's commit has been scheduled. */
535 if (ptid)
533 *ptid = tid; 536 *ptid = tid;
534 } else if (journal->j_committing_transaction && ptid) { 537 ret = 1;
538 } else if (journal->j_committing_transaction) {
535 /* 539 /*
536 * If ext3_write_super() recently started a commit, then we 540 * If ext3_write_super() recently started a commit, then we
537 * have to wait for completion of that transaction 541 * have to wait for completion of that transaction
538 */ 542 */
539 *ptid = journal->j_committing_transaction->t_tid; 543 if (ptid)
544 *ptid = journal->j_committing_transaction->t_tid;
540 ret = 1; 545 ret = 1;
541 } 546 }
542 spin_unlock(&journal->j_state_lock); 547 spin_unlock(&journal->j_state_lock);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 46b4e347ed7d..28ce21d8598e 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -2129,26 +2129,46 @@ done:
2129} 2129}
2130 2130
2131/* 2131/*
2132 * This function must be called when inode is journaled in ordered mode 2132 * File truncate and transaction commit interact with each other in a
2133 * before truncation happens. It starts writeout of truncated part in 2133 * non-trivial way. If a transaction writing data block A is
2134 * case it is in the committing transaction so that we stand to ordered 2134 * committing, we cannot discard the data by truncate until we have
2135 * mode consistency guarantees. 2135 * written them. Otherwise if we crashed after the transaction with
2136 * write has committed but before the transaction with truncate has
2137 * committed, we could see stale data in block A. This function is a
2138 * helper to solve this problem. It starts writeout of the truncated
2139 * part in case it is in the committing transaction.
2140 *
2141 * Filesystem code must call this function when inode is journaled in
2142 * ordered mode before truncation happens and after the inode has been
2143 * placed on orphan list with the new inode size. The second condition
2144 * avoids the race that someone writes new data and we start
2145 * committing the transaction after this function has been called but
2146 * before a transaction for truncate is started (and furthermore it
2147 * allows us to optimize the case where the addition to orphan list
2148 * happens in the same transaction as write --- we don't have to write
2149 * any data in such case).
2136 */ 2150 */
2137int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode, 2151int jbd2_journal_begin_ordered_truncate(journal_t *journal,
2152 struct jbd2_inode *jinode,
2138 loff_t new_size) 2153 loff_t new_size)
2139{ 2154{
2140 journal_t *journal; 2155 transaction_t *inode_trans, *commit_trans;
2141 transaction_t *commit_trans;
2142 int ret = 0; 2156 int ret = 0;
2143 2157
2144 if (!inode->i_transaction && !inode->i_next_transaction) 2158 /* This is a quick check to avoid locking if not necessary */
2159 if (!jinode->i_transaction)
2145 goto out; 2160 goto out;
2146 journal = inode->i_transaction->t_journal; 2161 /* Locks are here just to force reading of recent values, it is
2162 * enough that the transaction was not committing before we started
2163 * a transaction adding the inode to orphan list */
2147 spin_lock(&journal->j_state_lock); 2164 spin_lock(&journal->j_state_lock);
2148 commit_trans = journal->j_committing_transaction; 2165 commit_trans = journal->j_committing_transaction;
2149 spin_unlock(&journal->j_state_lock); 2166 spin_unlock(&journal->j_state_lock);
2150 if (inode->i_transaction == commit_trans) { 2167 spin_lock(&journal->j_list_lock);
2151 ret = filemap_fdatawrite_range(inode->i_vfs_inode->i_mapping, 2168 inode_trans = jinode->i_transaction;
2169 spin_unlock(&journal->j_list_lock);
2170 if (inode_trans == commit_trans) {
2171 ret = filemap_fdatawrite_range(jinode->i_vfs_inode->i_mapping,
2152 new_size, LLONG_MAX); 2172 new_size, LLONG_MAX);
2153 if (ret) 2173 if (ret)
2154 jbd2_journal_abort(journal, ret); 2174 jbd2_journal_abort(journal, ret);
diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c
index 3cceef4ad2b7..e9580104b6ba 100644
--- a/fs/jffs2/background.c
+++ b/fs/jffs2/background.c
@@ -95,13 +95,17 @@ static int jffs2_garbage_collect_thread(void *_c)
95 spin_unlock(&c->erase_completion_lock); 95 spin_unlock(&c->erase_completion_lock);
96 96
97 97
98 /* This thread is purely an optimisation. But if it runs when 98 /* Problem - immediately after bootup, the GCD spends a lot
99 other things could be running, it actually makes things a 99 * of time in places like jffs2_kill_fragtree(); so much so
100 lot worse. Use yield() and put it at the back of the runqueue 100 * that userspace processes (like gdm and X) are starved
101 every time. Especially during boot, pulling an inode in 101 * despite plenty of cond_resched()s and renicing. Yield()
102 with read_inode() is much preferable to having the GC thread 102 * doesn't help, either (presumably because userspace and GCD
103 get there first. */ 103 * are generally competing for a higher latency resource -
104 yield(); 104 * disk).
105 * This forces the GCD to slow the hell down. Pulling an
106 * inode in with read_inode() is much preferable to having
107 * the GC thread get there first. */
108 schedule_timeout_interruptible(msecs_to_jiffies(50));
105 109
106 /* Put_super will send a SIGKILL and then wait on the sem. 110 /* Put_super will send a SIGKILL and then wait on the sem.
107 */ 111 */
diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index 6ca08ad887c0..1fc1e92356ee 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -220,7 +220,7 @@ static int jffs2_add_tn_to_tree(struct jffs2_sb_info *c,
220 struct jffs2_tmp_dnode_info *tn) 220 struct jffs2_tmp_dnode_info *tn)
221{ 221{
222 uint32_t fn_end = tn->fn->ofs + tn->fn->size; 222 uint32_t fn_end = tn->fn->ofs + tn->fn->size;
223 struct jffs2_tmp_dnode_info *this; 223 struct jffs2_tmp_dnode_info *this, *ptn;
224 224
225 dbg_readinode("insert fragment %#04x-%#04x, ver %u at %08x\n", tn->fn->ofs, fn_end, tn->version, ref_offset(tn->fn->raw)); 225 dbg_readinode("insert fragment %#04x-%#04x, ver %u at %08x\n", tn->fn->ofs, fn_end, tn->version, ref_offset(tn->fn->raw));
226 226
@@ -251,11 +251,18 @@ static int jffs2_add_tn_to_tree(struct jffs2_sb_info *c,
251 if (this) { 251 if (this) {
252 /* If the node is coincident with another at a lower address, 252 /* If the node is coincident with another at a lower address,
253 back up until the other node is found. It may be relevant */ 253 back up until the other node is found. It may be relevant */
254 while (this->overlapped) 254 while (this->overlapped) {
255 this = tn_prev(this); 255 ptn = tn_prev(this);
256 256 if (!ptn) {
257 /* First node should never be marked overlapped */ 257 /*
258 BUG_ON(!this); 258 * We killed a node which set the overlapped
259 * flags during the scan. Fix it up.
260 */
261 this->overlapped = 0;
262 break;
263 }
264 this = ptn;
265 }
259 dbg_readinode("'this' found %#04x-%#04x (%s)\n", this->fn->ofs, this->fn->ofs + this->fn->size, this->fn ? "data" : "hole"); 266 dbg_readinode("'this' found %#04x-%#04x (%s)\n", this->fn->ofs, this->fn->ofs + this->fn->size, this->fn ? "data" : "hole");
260 } 267 }
261 268
@@ -360,7 +367,17 @@ static int jffs2_add_tn_to_tree(struct jffs2_sb_info *c,
360 } 367 }
361 if (!this->overlapped) 368 if (!this->overlapped)
362 break; 369 break;
363 this = tn_prev(this); 370
371 ptn = tn_prev(this);
372 if (!ptn) {
373 /*
374 * We killed a node which set the overlapped
375 * flags during the scan. Fix it up.
376 */
377 this->overlapped = 0;
378 break;
379 }
380 this = ptn;
364 } 381 }
365 } 382 }
366 383
@@ -456,8 +473,15 @@ static int jffs2_build_inode_fragtree(struct jffs2_sb_info *c,
456 eat_last(&rii->tn_root, &last->rb); 473 eat_last(&rii->tn_root, &last->rb);
457 ver_insert(&ver_root, last); 474 ver_insert(&ver_root, last);
458 475
459 if (unlikely(last->overlapped)) 476 if (unlikely(last->overlapped)) {
460 continue; 477 if (pen)
478 continue;
479 /*
480 * We killed a node which set the overlapped
481 * flags during the scan. Fix it up.
482 */
483 last->overlapped = 0;
484 }
461 485
462 /* Now we have a bunch of nodes in reverse version 486 /* Now we have a bunch of nodes in reverse version
463 order, in the tree at ver_root. Most of the time, 487 order, in the tree at ver_root. Most of the time,
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index 1f3b0fc0d351..aedc47a264c1 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -139,6 +139,55 @@ int nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout)
139 return 0; 139 return 0;
140} 140}
141 141
142#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
143static const struct in6_addr *nlmclnt_map_v4addr(const struct sockaddr *sap,
144 struct in6_addr *addr_mapped)
145{
146 const struct sockaddr_in *sin = (const struct sockaddr_in *)sap;
147
148 switch (sap->sa_family) {
149 case AF_INET6:
150 return &((const struct sockaddr_in6 *)sap)->sin6_addr;
151 case AF_INET:
152 ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, addr_mapped);
153 return addr_mapped;
154 }
155
156 return NULL;
157}
158
159/*
160 * If lockd is using a PF_INET6 listener, all incoming requests appear
161 * to come from AF_INET6 remotes. The address of AF_INET remotes are
162 * mapped to AF_INET6 automatically by the network layer. In case the
163 * user passed an AF_INET server address at mount time, ensure both
164 * addresses are AF_INET6 before comparing them.
165 */
166static int nlmclnt_cmp_addr(const struct nlm_host *host,
167 const struct sockaddr *sap)
168{
169 const struct in6_addr *addr1;
170 const struct in6_addr *addr2;
171 struct in6_addr addr1_mapped;
172 struct in6_addr addr2_mapped;
173
174 addr1 = nlmclnt_map_v4addr(nlm_addr(host), &addr1_mapped);
175 if (likely(addr1 != NULL)) {
176 addr2 = nlmclnt_map_v4addr(sap, &addr2_mapped);
177 if (likely(addr2 != NULL))
178 return ipv6_addr_equal(addr1, addr2);
179 }
180
181 return 0;
182}
183#else /* !(CONFIG_IPV6 || CONFIG_IPV6_MODULE) */
184static int nlmclnt_cmp_addr(const struct nlm_host *host,
185 const struct sockaddr *sap)
186{
187 return nlm_cmp_addr(nlm_addr(host), sap);
188}
189#endif /* !(CONFIG_IPV6 || CONFIG_IPV6_MODULE) */
190
142/* 191/*
143 * The server lockd has called us back to tell us the lock was granted 192 * The server lockd has called us back to tell us the lock was granted
144 */ 193 */
@@ -166,7 +215,7 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock)
166 */ 215 */
167 if (fl_blocked->fl_u.nfs_fl.owner->pid != lock->svid) 216 if (fl_blocked->fl_u.nfs_fl.owner->pid != lock->svid)
168 continue; 217 continue;
169 if (!nlm_cmp_addr(nlm_addr(block->b_host), addr)) 218 if (!nlmclnt_cmp_addr(block->b_host, addr))
170 continue; 219 continue;
171 if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_path.dentry->d_inode) ,fh) != 0) 220 if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_path.dentry->d_inode) ,fh) != 0)
172 continue; 221 continue;
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 6063a8e4b9f3..763b78a6e9de 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -427,7 +427,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
427 goto out; 427 goto out;
428 case -EAGAIN: 428 case -EAGAIN:
429 ret = nlm_lck_denied; 429 ret = nlm_lck_denied;
430 goto out; 430 break;
431 case FILE_LOCK_DEFERRED: 431 case FILE_LOCK_DEFERRED:
432 if (wait) 432 if (wait)
433 break; 433 break;
@@ -443,6 +443,10 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
443 goto out; 443 goto out;
444 } 444 }
445 445
446 ret = nlm_lck_denied;
447 if (!wait)
448 goto out;
449
446 ret = nlm_lck_blocked; 450 ret = nlm_lck_blocked;
447 451
448 /* Append to list of blocked */ 452 /* Append to list of blocked */
diff --git a/fs/namespace.c b/fs/namespace.c
index 228d8c4bfd18..06f8e63f6cb1 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -614,9 +614,11 @@ static inline void __mntput(struct vfsmount *mnt)
614 */ 614 */
615 for_each_possible_cpu(cpu) { 615 for_each_possible_cpu(cpu) {
616 struct mnt_writer *cpu_writer = &per_cpu(mnt_writers, cpu); 616 struct mnt_writer *cpu_writer = &per_cpu(mnt_writers, cpu);
617 if (cpu_writer->mnt != mnt)
618 continue;
619 spin_lock(&cpu_writer->lock); 617 spin_lock(&cpu_writer->lock);
618 if (cpu_writer->mnt != mnt) {
619 spin_unlock(&cpu_writer->lock);
620 continue;
621 }
620 atomic_add(cpu_writer->count, &mnt->__mnt_writers); 622 atomic_add(cpu_writer->count, &mnt->__mnt_writers);
621 cpu_writer->count = 0; 623 cpu_writer->count = 0;
622 /* 624 /*
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 9b728f3565a1..574158ae2398 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -255,6 +255,32 @@ static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
255 } 255 }
256 return 0; 256 return 0;
257} 257}
258
259/*
260 * Test if two ip6 socket addresses refer to the same socket by
261 * comparing relevant fields. The padding bytes specifically, are not
262 * compared. sin6_flowinfo is not compared because it only affects QoS
263 * and sin6_scope_id is only compared if the address is "link local"
264 * because "link local" addresses need only be unique to a specific
265 * link. Conversely, ordinary unicast addresses might have different
266 * sin6_scope_id.
267 *
268 * The caller should ensure both socket addresses are AF_INET6.
269 */
270static int nfs_sockaddr_cmp_ip6(const struct sockaddr *sa1,
271 const struct sockaddr *sa2)
272{
273 const struct sockaddr_in6 *saddr1 = (const struct sockaddr_in6 *)sa1;
274 const struct sockaddr_in6 *saddr2 = (const struct sockaddr_in6 *)sa2;
275
276 if (!ipv6_addr_equal(&saddr1->sin6_addr,
277 &saddr1->sin6_addr))
278 return 0;
279 if (ipv6_addr_scope(&saddr1->sin6_addr) == IPV6_ADDR_SCOPE_LINKLOCAL &&
280 saddr1->sin6_scope_id != saddr2->sin6_scope_id)
281 return 0;
282 return saddr1->sin6_port == saddr2->sin6_port;
283}
258#else 284#else
259static int nfs_sockaddr_match_ipaddr4(const struct sockaddr_in *sa1, 285static int nfs_sockaddr_match_ipaddr4(const struct sockaddr_in *sa1,
260 const struct sockaddr_in *sa2) 286 const struct sockaddr_in *sa2)
@@ -270,9 +296,52 @@ static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
270 return nfs_sockaddr_match_ipaddr4((const struct sockaddr_in *)sa1, 296 return nfs_sockaddr_match_ipaddr4((const struct sockaddr_in *)sa1,
271 (const struct sockaddr_in *)sa2); 297 (const struct sockaddr_in *)sa2);
272} 298}
299
300static int nfs_sockaddr_cmp_ip6(const struct sockaddr * sa1,
301 const struct sockaddr * sa2)
302{
303 return 0;
304}
273#endif 305#endif
274 306
275/* 307/*
308 * Test if two ip4 socket addresses refer to the same socket, by
309 * comparing relevant fields. The padding bytes specifically, are
310 * not compared.
311 *
312 * The caller should ensure both socket addresses are AF_INET.
313 */
314static int nfs_sockaddr_cmp_ip4(const struct sockaddr *sa1,
315 const struct sockaddr *sa2)
316{
317 const struct sockaddr_in *saddr1 = (const struct sockaddr_in *)sa1;
318 const struct sockaddr_in *saddr2 = (const struct sockaddr_in *)sa2;
319
320 if (saddr1->sin_addr.s_addr != saddr2->sin_addr.s_addr)
321 return 0;
322 return saddr1->sin_port == saddr2->sin_port;
323}
324
325/*
326 * Test if two socket addresses represent the same actual socket,
327 * by comparing (only) relevant fields.
328 */
329static int nfs_sockaddr_cmp(const struct sockaddr *sa1,
330 const struct sockaddr *sa2)
331{
332 if (sa1->sa_family != sa2->sa_family)
333 return 0;
334
335 switch (sa1->sa_family) {
336 case AF_INET:
337 return nfs_sockaddr_cmp_ip4(sa1, sa2);
338 case AF_INET6:
339 return nfs_sockaddr_cmp_ip6(sa1, sa2);
340 }
341 return 0;
342}
343
344/*
276 * Find a client by IP address and protocol version 345 * Find a client by IP address and protocol version
277 * - returns NULL if no such client 346 * - returns NULL if no such client
278 */ 347 */
@@ -344,8 +413,10 @@ struct nfs_client *nfs_find_client_next(struct nfs_client *clp)
344static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *data) 413static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *data)
345{ 414{
346 struct nfs_client *clp; 415 struct nfs_client *clp;
416 const struct sockaddr *sap = data->addr;
347 417
348 list_for_each_entry(clp, &nfs_client_list, cl_share_link) { 418 list_for_each_entry(clp, &nfs_client_list, cl_share_link) {
419 const struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr;
349 /* Don't match clients that failed to initialise properly */ 420 /* Don't match clients that failed to initialise properly */
350 if (clp->cl_cons_state < 0) 421 if (clp->cl_cons_state < 0)
351 continue; 422 continue;
@@ -358,7 +429,7 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat
358 continue; 429 continue;
359 430
360 /* Match the full socket address */ 431 /* Match the full socket address */
361 if (memcmp(&clp->cl_addr, data->addr, sizeof(clp->cl_addr)) != 0) 432 if (!nfs_sockaddr_cmp(sap, clap))
362 continue; 433 continue;
363 434
364 atomic_inc(&clp->cl_count); 435 atomic_inc(&clp->cl_count);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index e35c8199f82f..672368f865ca 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1892,8 +1892,14 @@ static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask)
1892 cache.cred = cred; 1892 cache.cred = cred;
1893 cache.jiffies = jiffies; 1893 cache.jiffies = jiffies;
1894 status = NFS_PROTO(inode)->access(inode, &cache); 1894 status = NFS_PROTO(inode)->access(inode, &cache);
1895 if (status != 0) 1895 if (status != 0) {
1896 if (status == -ESTALE) {
1897 nfs_zap_caches(inode);
1898 if (!S_ISDIR(inode->i_mode))
1899 set_bit(NFS_INO_STALE, &NFS_I(inode)->flags);
1900 }
1896 return status; 1901 return status;
1902 }
1897 nfs_access_add_cache(inode, &cache); 1903 nfs_access_add_cache(inode, &cache);
1898out: 1904out:
1899 if ((mask & ~cache.mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0) 1905 if ((mask & ~cache.mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index cef62557c87d..6bbf0e6daad2 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -292,7 +292,7 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
292{ 292{
293 struct nfs_server *server = NFS_SERVER(inode); 293 struct nfs_server *server = NFS_SERVER(inode);
294 struct nfs_fattr fattr; 294 struct nfs_fattr fattr;
295 struct page *pages[NFSACL_MAXPAGES] = { }; 295 struct page *pages[NFSACL_MAXPAGES];
296 struct nfs3_setaclargs args = { 296 struct nfs3_setaclargs args = {
297 .inode = inode, 297 .inode = inode,
298 .mask = NFS_ACL, 298 .mask = NFS_ACL,
@@ -303,7 +303,7 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
303 .rpc_argp = &args, 303 .rpc_argp = &args,
304 .rpc_resp = &fattr, 304 .rpc_resp = &fattr,
305 }; 305 };
306 int status, count; 306 int status;
307 307
308 status = -EOPNOTSUPP; 308 status = -EOPNOTSUPP;
309 if (!nfs_server_capable(inode, NFS_CAP_ACLS)) 309 if (!nfs_server_capable(inode, NFS_CAP_ACLS))
@@ -319,6 +319,20 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
319 if (S_ISDIR(inode->i_mode)) { 319 if (S_ISDIR(inode->i_mode)) {
320 args.mask |= NFS_DFACL; 320 args.mask |= NFS_DFACL;
321 args.acl_default = dfacl; 321 args.acl_default = dfacl;
322 args.len = nfsacl_size(acl, dfacl);
323 } else
324 args.len = nfsacl_size(acl, NULL);
325
326 if (args.len > NFS_ACL_INLINE_BUFSIZE) {
327 unsigned int npages = 1 + ((args.len - 1) >> PAGE_SHIFT);
328
329 status = -ENOMEM;
330 do {
331 args.pages[args.npages] = alloc_page(GFP_KERNEL);
332 if (args.pages[args.npages] == NULL)
333 goto out_freepages;
334 args.npages++;
335 } while (args.npages < npages);
322 } 336 }
323 337
324 dprintk("NFS call setacl\n"); 338 dprintk("NFS call setacl\n");
@@ -329,10 +343,6 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
329 nfs_zap_acl_cache(inode); 343 nfs_zap_acl_cache(inode);
330 dprintk("NFS reply setacl: %d\n", status); 344 dprintk("NFS reply setacl: %d\n", status);
331 345
332 /* pages may have been allocated at the xdr layer. */
333 for (count = 0; count < NFSACL_MAXPAGES && args.pages[count]; count++)
334 __free_page(args.pages[count]);
335
336 switch (status) { 346 switch (status) {
337 case 0: 347 case 0:
338 status = nfs_refresh_inode(inode, &fattr); 348 status = nfs_refresh_inode(inode, &fattr);
@@ -346,6 +356,11 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
346 case -ENOTSUPP: 356 case -ENOTSUPP:
347 status = -EOPNOTSUPP; 357 status = -EOPNOTSUPP;
348 } 358 }
359out_freepages:
360 while (args.npages != 0) {
361 args.npages--;
362 __free_page(args.pages[args.npages]);
363 }
349out: 364out:
350 return status; 365 return status;
351} 366}
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 11cdddec1432..6cdeacffde46 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -82,8 +82,10 @@
82#define NFS3_commitres_sz (1+NFS3_wcc_data_sz+2) 82#define NFS3_commitres_sz (1+NFS3_wcc_data_sz+2)
83 83
84#define ACL3_getaclargs_sz (NFS3_fh_sz+1) 84#define ACL3_getaclargs_sz (NFS3_fh_sz+1)
85#define ACL3_setaclargs_sz (NFS3_fh_sz+1+2*(2+5*3)) 85#define ACL3_setaclargs_sz (NFS3_fh_sz+1+ \
86#define ACL3_getaclres_sz (1+NFS3_post_op_attr_sz+1+2*(2+5*3)) 86 XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE))
87#define ACL3_getaclres_sz (1+NFS3_post_op_attr_sz+1+ \
88 XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE))
87#define ACL3_setaclres_sz (1+NFS3_post_op_attr_sz) 89#define ACL3_setaclres_sz (1+NFS3_post_op_attr_sz)
88 90
89/* 91/*
@@ -703,28 +705,18 @@ nfs3_xdr_setaclargs(struct rpc_rqst *req, __be32 *p,
703 struct nfs3_setaclargs *args) 705 struct nfs3_setaclargs *args)
704{ 706{
705 struct xdr_buf *buf = &req->rq_snd_buf; 707 struct xdr_buf *buf = &req->rq_snd_buf;
706 unsigned int base, len_in_head, len = nfsacl_size( 708 unsigned int base;
707 (args->mask & NFS_ACL) ? args->acl_access : NULL, 709 int err;
708 (args->mask & NFS_DFACL) ? args->acl_default : NULL);
709 int count, err;
710 710
711 p = xdr_encode_fhandle(p, NFS_FH(args->inode)); 711 p = xdr_encode_fhandle(p, NFS_FH(args->inode));
712 *p++ = htonl(args->mask); 712 *p++ = htonl(args->mask);
713 base = (char *)p - (char *)buf->head->iov_base; 713 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
714 /* put as much of the acls into head as possible. */ 714 base = req->rq_slen;
715 len_in_head = min_t(unsigned int, buf->head->iov_len - base, len); 715
716 len -= len_in_head; 716 if (args->npages != 0)
717 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p + (len_in_head >> 2)); 717 xdr_encode_pages(buf, args->pages, 0, args->len);
718 718 else
719 for (count = 0; (count << PAGE_SHIFT) < len; count++) { 719 req->rq_slen += args->len;
720 args->pages[count] = alloc_page(GFP_KERNEL);
721 if (!args->pages[count]) {
722 while (count)
723 __free_page(args->pages[--count]);
724 return -ENOMEM;
725 }
726 }
727 xdr_encode_pages(buf, args->pages, 0, len);
728 720
729 err = nfsacl_encode(buf, base, args->inode, 721 err = nfsacl_encode(buf, base, args->inode,
730 (args->mask & NFS_ACL) ? 722 (args->mask & NFS_ACL) ?
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index 30befc39b3c6..2a2a0a7143ad 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -21,7 +21,9 @@
21#define NFSDBG_FACILITY NFSDBG_VFS 21#define NFSDBG_FACILITY NFSDBG_VFS
22 22
23/* 23/*
24 * Check if fs_root is valid 24 * Convert the NFSv4 pathname components into a standard posix path.
25 *
26 * Note that the resulting string will be placed at the end of the buffer
25 */ 27 */
26static inline char *nfs4_pathname_string(const struct nfs4_pathname *pathname, 28static inline char *nfs4_pathname_string(const struct nfs4_pathname *pathname,
27 char *buffer, ssize_t buflen) 29 char *buffer, ssize_t buflen)
@@ -99,21 +101,20 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
99{ 101{
100 struct vfsmount *mnt = ERR_PTR(-ENOENT); 102 struct vfsmount *mnt = ERR_PTR(-ENOENT);
101 char *mnt_path; 103 char *mnt_path;
102 int page2len; 104 unsigned int maxbuflen;
103 unsigned int s; 105 unsigned int s;
104 106
105 mnt_path = nfs4_pathname_string(&location->rootpath, page2, PAGE_SIZE); 107 mnt_path = nfs4_pathname_string(&location->rootpath, page2, PAGE_SIZE);
106 if (IS_ERR(mnt_path)) 108 if (IS_ERR(mnt_path))
107 return mnt; 109 return mnt;
108 mountdata->mnt_path = mnt_path; 110 mountdata->mnt_path = mnt_path;
109 page2 += strlen(mnt_path) + 1; 111 maxbuflen = mnt_path - 1 - page2;
110 page2len = PAGE_SIZE - strlen(mnt_path) - 1;
111 112
112 for (s = 0; s < location->nservers; s++) { 113 for (s = 0; s < location->nservers; s++) {
113 const struct nfs4_string *buf = &location->servers[s]; 114 const struct nfs4_string *buf = &location->servers[s];
114 struct sockaddr_storage addr; 115 struct sockaddr_storage addr;
115 116
116 if (buf->len <= 0 || buf->len >= PAGE_SIZE) 117 if (buf->len <= 0 || buf->len >= maxbuflen)
117 continue; 118 continue;
118 119
119 mountdata->addr = (struct sockaddr *)&addr; 120 mountdata->addr = (struct sockaddr *)&addr;
@@ -126,8 +127,8 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
126 continue; 127 continue;
127 nfs_set_port(mountdata->addr, NFS_PORT); 128 nfs_set_port(mountdata->addr, NFS_PORT);
128 129
129 strncpy(page2, buf->data, page2len); 130 memcpy(page2, buf->data, buf->len);
130 page2[page2len] = '\0'; 131 page2[buf->len] = '\0';
131 mountdata->hostname = page2; 132 mountdata->hostname = page2;
132 133
133 snprintf(page, PAGE_SIZE, "%s:%s", 134 snprintf(page, PAGE_SIZE, "%s:%s",
diff --git a/fs/notify/inotify/inotify.c b/fs/notify/inotify/inotify.c
index dae3f28f30d4..331f2e88e284 100644
--- a/fs/notify/inotify/inotify.c
+++ b/fs/notify/inotify/inotify.c
@@ -156,7 +156,7 @@ static int inotify_handle_get_wd(struct inotify_handle *ih,
156 int ret; 156 int ret;
157 157
158 do { 158 do {
159 if (unlikely(!idr_pre_get(&ih->idr, GFP_KERNEL))) 159 if (unlikely(!idr_pre_get(&ih->idr, GFP_NOFS)))
160 return -ENOSPC; 160 return -ENOSPC;
161 ret = idr_get_new_above(&ih->idr, watch, ih->last_wd+1, &watch->wd); 161 ret = idr_get_new_above(&ih->idr, watch, ih->last_wd+1, &watch->wd);
162 } while (ret == -EAGAIN); 162 } while (ret == -EAGAIN);
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 60fe74035db5..19e3a96aa02c 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -176,7 +176,8 @@ static int ocfs2_dinode_insert_check(struct inode *inode,
176 176
177 BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL); 177 BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL);
178 mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) && 178 mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
179 (OCFS2_I(inode)->ip_clusters != rec->e_cpos), 179 (OCFS2_I(inode)->ip_clusters !=
180 le32_to_cpu(rec->e_cpos)),
180 "Device %s, asking for sparse allocation: inode %llu, " 181 "Device %s, asking for sparse allocation: inode %llu, "
181 "cpos %u, clusters %u\n", 182 "cpos %u, clusters %u\n",
182 osb->dev_str, 183 osb->dev_str,
@@ -4796,6 +4797,29 @@ out:
4796 return ret; 4797 return ret;
4797} 4798}
4798 4799
4800static int ocfs2_replace_extent_rec(struct inode *inode,
4801 handle_t *handle,
4802 struct ocfs2_path *path,
4803 struct ocfs2_extent_list *el,
4804 int split_index,
4805 struct ocfs2_extent_rec *split_rec)
4806{
4807 int ret;
4808
4809 ret = ocfs2_path_bh_journal_access(handle, inode, path,
4810 path_num_items(path) - 1);
4811 if (ret) {
4812 mlog_errno(ret);
4813 goto out;
4814 }
4815
4816 el->l_recs[split_index] = *split_rec;
4817
4818 ocfs2_journal_dirty(handle, path_leaf_bh(path));
4819out:
4820 return ret;
4821}
4822
4799/* 4823/*
4800 * Mark part or all of the extent record at split_index in the leaf 4824 * Mark part or all of the extent record at split_index in the leaf
4801 * pointed to by path as written. This removes the unwritten 4825 * pointed to by path as written. This removes the unwritten
@@ -4885,7 +4909,9 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
4885 4909
4886 if (ctxt.c_contig_type == CONTIG_NONE) { 4910 if (ctxt.c_contig_type == CONTIG_NONE) {
4887 if (ctxt.c_split_covers_rec) 4911 if (ctxt.c_split_covers_rec)
4888 el->l_recs[split_index] = *split_rec; 4912 ret = ocfs2_replace_extent_rec(inode, handle,
4913 path, el,
4914 split_index, split_rec);
4889 else 4915 else
4890 ret = ocfs2_split_and_insert(inode, handle, path, et, 4916 ret = ocfs2_split_and_insert(inode, handle, path, et,
4891 &last_eb_bh, split_index, 4917 &last_eb_bh, split_index,
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index a067a6cffb01..8e1709a679b7 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -227,7 +227,7 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page,
227 size = i_size_read(inode); 227 size = i_size_read(inode);
228 228
229 if (size > PAGE_CACHE_SIZE || 229 if (size > PAGE_CACHE_SIZE ||
230 size > ocfs2_max_inline_data(inode->i_sb)) { 230 size > ocfs2_max_inline_data_with_xattr(inode->i_sb, di)) {
231 ocfs2_error(inode->i_sb, 231 ocfs2_error(inode->i_sb,
232 "Inode %llu has with inline data has bad size: %Lu", 232 "Inode %llu has with inline data has bad size: %Lu",
233 (unsigned long long)OCFS2_I(inode)->ip_blkno, 233 (unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -1555,6 +1555,7 @@ static int ocfs2_try_to_write_inline_data(struct address_space *mapping,
1555 int ret, written = 0; 1555 int ret, written = 0;
1556 loff_t end = pos + len; 1556 loff_t end = pos + len;
1557 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1557 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1558 struct ocfs2_dinode *di = NULL;
1558 1559
1559 mlog(0, "Inode %llu, write of %u bytes at off %llu. features: 0x%x\n", 1560 mlog(0, "Inode %llu, write of %u bytes at off %llu. features: 0x%x\n",
1560 (unsigned long long)oi->ip_blkno, len, (unsigned long long)pos, 1561 (unsigned long long)oi->ip_blkno, len, (unsigned long long)pos,
@@ -1587,7 +1588,9 @@ static int ocfs2_try_to_write_inline_data(struct address_space *mapping,
1587 /* 1588 /*
1588 * Check whether the write can fit. 1589 * Check whether the write can fit.
1589 */ 1590 */
1590 if (mmap_page || end > ocfs2_max_inline_data(inode->i_sb)) 1591 di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
1592 if (mmap_page ||
1593 end > ocfs2_max_inline_data_with_xattr(inode->i_sb, di))
1591 return 0; 1594 return 0;
1592 1595
1593do_inline_write: 1596do_inline_write:
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 54e182a27caf..0a2813947853 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -1849,12 +1849,12 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data,
1849 if (!mle) { 1849 if (!mle) {
1850 if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN && 1850 if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN &&
1851 res->owner != assert->node_idx) { 1851 res->owner != assert->node_idx) {
1852 mlog(ML_ERROR, "assert_master from " 1852 mlog(ML_ERROR, "DIE! Mastery assert from %u, "
1853 "%u, but current owner is " 1853 "but current owner is %u! (%.*s)\n",
1854 "%u! (%.*s)\n", 1854 assert->node_idx, res->owner, namelen,
1855 assert->node_idx, res->owner, 1855 name);
1856 namelen, name); 1856 __dlm_print_one_lock_resource(res);
1857 goto kill; 1857 BUG();
1858 } 1858 }
1859 } else if (mle->type != DLM_MLE_MIGRATION) { 1859 } else if (mle->type != DLM_MLE_MIGRATION) {
1860 if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) { 1860 if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index d1295203029f..4060bb328bc8 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -181,8 +181,7 @@ static int dlm_purge_lockres(struct dlm_ctxt *dlm,
181 181
182 spin_lock(&res->spinlock); 182 spin_lock(&res->spinlock);
183 /* This ensures that clear refmap is sent after the set */ 183 /* This ensures that clear refmap is sent after the set */
184 __dlm_wait_on_lockres_flags(res, (DLM_LOCK_RES_SETREF_INPROG | 184 __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG);
185 DLM_LOCK_RES_MIGRATING));
186 spin_unlock(&res->spinlock); 185 spin_unlock(&res->spinlock);
187 186
188 /* clear our bit from the master's refmap, ignore errors */ 187 /* clear our bit from the master's refmap, ignore errors */
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index 86ca085ef324..fcf879ed6930 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -117,11 +117,11 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
117 else 117 else
118 BUG_ON(res->owner == dlm->node_num); 118 BUG_ON(res->owner == dlm->node_num);
119 119
120 spin_lock(&dlm->spinlock); 120 spin_lock(&dlm->ast_lock);
121 /* We want to be sure that we're not freeing a lock 121 /* We want to be sure that we're not freeing a lock
122 * that still has AST's pending... */ 122 * that still has AST's pending... */
123 in_use = !list_empty(&lock->ast_list); 123 in_use = !list_empty(&lock->ast_list);
124 spin_unlock(&dlm->spinlock); 124 spin_unlock(&dlm->ast_lock);
125 if (in_use) { 125 if (in_use) {
126 mlog(ML_ERROR, "lockres %.*s: Someone is calling dlmunlock " 126 mlog(ML_ERROR, "lockres %.*s: Someone is calling dlmunlock "
127 "while waiting for an ast!", res->lockname.len, 127 "while waiting for an ast!", res->lockname.len,
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 206a2370876a..7219a86d34cc 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -320,9 +320,14 @@ static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
320 struct ocfs2_lock_res *lockres); 320 struct ocfs2_lock_res *lockres);
321static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres, 321static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
322 int convert); 322 int convert);
323#define ocfs2_log_dlm_error(_func, _err, _lockres) do { \ 323#define ocfs2_log_dlm_error(_func, _err, _lockres) do { \
324 mlog(ML_ERROR, "DLM error %d while calling %s on resource %s\n", \ 324 if ((_lockres)->l_type != OCFS2_LOCK_TYPE_DENTRY) \
325 _err, _func, _lockres->l_name); \ 325 mlog(ML_ERROR, "DLM error %d while calling %s on resource %s\n", \
326 _err, _func, _lockres->l_name); \
327 else \
328 mlog(ML_ERROR, "DLM error %d while calling %s on resource %.*s%08x\n", \
329 _err, _func, OCFS2_DENTRY_LOCK_INO_START - 1, (_lockres)->l_name, \
330 (unsigned int)ocfs2_get_dentry_lock_ino(_lockres)); \
326} while (0) 331} while (0)
327static int ocfs2_downconvert_thread(void *arg); 332static int ocfs2_downconvert_thread(void *arg);
328static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb, 333static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 3c3532e1307c..172850a9a12a 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -513,8 +513,10 @@ static inline int ocfs2_jbd2_file_inode(handle_t *handle, struct inode *inode)
513static inline int ocfs2_begin_ordered_truncate(struct inode *inode, 513static inline int ocfs2_begin_ordered_truncate(struct inode *inode,
514 loff_t new_size) 514 loff_t new_size)
515{ 515{
516 return jbd2_journal_begin_ordered_truncate(&OCFS2_I(inode)->ip_jinode, 516 return jbd2_journal_begin_ordered_truncate(
517 new_size); 517 OCFS2_SB(inode->i_sb)->journal->j_journal,
518 &OCFS2_I(inode)->ip_jinode,
519 new_size);
518} 520}
519 521
520#endif /* OCFS2_JOURNAL_H */ 522#endif /* OCFS2_JOURNAL_H */
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 084aba86c3b2..4b11762f249e 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -532,7 +532,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
532 532
533 fe->i_dyn_features = cpu_to_le16(feat | OCFS2_INLINE_DATA_FL); 533 fe->i_dyn_features = cpu_to_le16(feat | OCFS2_INLINE_DATA_FL);
534 534
535 fe->id2.i_data.id_count = cpu_to_le16(ocfs2_max_inline_data(osb->sb)); 535 fe->id2.i_data.id_count = cpu_to_le16(
536 ocfs2_max_inline_data_with_xattr(osb->sb, fe));
536 } else { 537 } else {
537 fel = &fe->id2.i_list; 538 fel = &fe->id2.i_list;
538 fel->l_tree_depth = 0; 539 fel->l_tree_depth = 0;
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 077384135f4e..946d3c34b90b 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -341,6 +341,9 @@ struct ocfs2_super
341 struct ocfs2_node_map osb_recovering_orphan_dirs; 341 struct ocfs2_node_map osb_recovering_orphan_dirs;
342 unsigned int *osb_orphan_wipes; 342 unsigned int *osb_orphan_wipes;
343 wait_queue_head_t osb_wipe_event; 343 wait_queue_head_t osb_wipe_event;
344
345 /* used to protect metaecc calculation check of xattr. */
346 spinlock_t osb_xattr_lock;
344}; 347};
345 348
346#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) 349#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info)
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index c7ae45aaa36c..2332ef740f4f 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -1070,12 +1070,6 @@ static inline int ocfs2_fast_symlink_chars(struct super_block *sb)
1070 offsetof(struct ocfs2_dinode, id2.i_symlink); 1070 offsetof(struct ocfs2_dinode, id2.i_symlink);
1071} 1071}
1072 1072
1073static inline int ocfs2_max_inline_data(struct super_block *sb)
1074{
1075 return sb->s_blocksize -
1076 offsetof(struct ocfs2_dinode, id2.i_data.id_data);
1077}
1078
1079static inline int ocfs2_max_inline_data_with_xattr(struct super_block *sb, 1073static inline int ocfs2_max_inline_data_with_xattr(struct super_block *sb,
1080 struct ocfs2_dinode *di) 1074 struct ocfs2_dinode *di)
1081{ 1075{
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index b1cb38fbe807..7ac83a81ee55 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1537,6 +1537,13 @@ static int ocfs2_get_sector(struct super_block *sb,
1537 unlock_buffer(*bh); 1537 unlock_buffer(*bh);
1538 ll_rw_block(READ, 1, bh); 1538 ll_rw_block(READ, 1, bh);
1539 wait_on_buffer(*bh); 1539 wait_on_buffer(*bh);
1540 if (!buffer_uptodate(*bh)) {
1541 mlog_errno(-EIO);
1542 brelse(*bh);
1543 *bh = NULL;
1544 return -EIO;
1545 }
1546
1540 return 0; 1547 return 0;
1541} 1548}
1542 1549
@@ -1747,6 +1754,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
1747 INIT_LIST_HEAD(&osb->blocked_lock_list); 1754 INIT_LIST_HEAD(&osb->blocked_lock_list);
1748 osb->blocked_lock_count = 0; 1755 osb->blocked_lock_count = 0;
1749 spin_lock_init(&osb->osb_lock); 1756 spin_lock_init(&osb->osb_lock);
1757 spin_lock_init(&osb->osb_xattr_lock);
1750 ocfs2_init_inode_steal_slot(osb); 1758 ocfs2_init_inode_steal_slot(osb);
1751 1759
1752 atomic_set(&osb->alloc_stats.moves, 0); 1760 atomic_set(&osb->alloc_stats.moves, 0);
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 915039fffe6e..2563df89fc2a 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -82,13 +82,14 @@ struct ocfs2_xattr_set_ctxt {
82 82
83#define OCFS2_XATTR_ROOT_SIZE (sizeof(struct ocfs2_xattr_def_value_root)) 83#define OCFS2_XATTR_ROOT_SIZE (sizeof(struct ocfs2_xattr_def_value_root))
84#define OCFS2_XATTR_INLINE_SIZE 80 84#define OCFS2_XATTR_INLINE_SIZE 80
85#define OCFS2_XATTR_HEADER_GAP 4
85#define OCFS2_XATTR_FREE_IN_IBODY (OCFS2_MIN_XATTR_INLINE_SIZE \ 86#define OCFS2_XATTR_FREE_IN_IBODY (OCFS2_MIN_XATTR_INLINE_SIZE \
86 - sizeof(struct ocfs2_xattr_header) \ 87 - sizeof(struct ocfs2_xattr_header) \
87 - sizeof(__u32)) 88 - OCFS2_XATTR_HEADER_GAP)
88#define OCFS2_XATTR_FREE_IN_BLOCK(ptr) ((ptr)->i_sb->s_blocksize \ 89#define OCFS2_XATTR_FREE_IN_BLOCK(ptr) ((ptr)->i_sb->s_blocksize \
89 - sizeof(struct ocfs2_xattr_block) \ 90 - sizeof(struct ocfs2_xattr_block) \
90 - sizeof(struct ocfs2_xattr_header) \ 91 - sizeof(struct ocfs2_xattr_header) \
91 - sizeof(__u32)) 92 - OCFS2_XATTR_HEADER_GAP)
92 93
93static struct ocfs2_xattr_def_value_root def_xv = { 94static struct ocfs2_xattr_def_value_root def_xv = {
94 .xv.xr_list.l_count = cpu_to_le16(1), 95 .xv.xr_list.l_count = cpu_to_le16(1),
@@ -274,10 +275,12 @@ static int ocfs2_read_xattr_bucket(struct ocfs2_xattr_bucket *bucket,
274 bucket->bu_blocks, bucket->bu_bhs, 0, 275 bucket->bu_blocks, bucket->bu_bhs, 0,
275 NULL); 276 NULL);
276 if (!rc) { 277 if (!rc) {
278 spin_lock(&OCFS2_SB(bucket->bu_inode->i_sb)->osb_xattr_lock);
277 rc = ocfs2_validate_meta_ecc_bhs(bucket->bu_inode->i_sb, 279 rc = ocfs2_validate_meta_ecc_bhs(bucket->bu_inode->i_sb,
278 bucket->bu_bhs, 280 bucket->bu_bhs,
279 bucket->bu_blocks, 281 bucket->bu_blocks,
280 &bucket_xh(bucket)->xh_check); 282 &bucket_xh(bucket)->xh_check);
283 spin_unlock(&OCFS2_SB(bucket->bu_inode->i_sb)->osb_xattr_lock);
281 if (rc) 284 if (rc)
282 mlog_errno(rc); 285 mlog_errno(rc);
283 } 286 }
@@ -310,9 +313,11 @@ static void ocfs2_xattr_bucket_journal_dirty(handle_t *handle,
310{ 313{
311 int i; 314 int i;
312 315
316 spin_lock(&OCFS2_SB(bucket->bu_inode->i_sb)->osb_xattr_lock);
313 ocfs2_compute_meta_ecc_bhs(bucket->bu_inode->i_sb, 317 ocfs2_compute_meta_ecc_bhs(bucket->bu_inode->i_sb,
314 bucket->bu_bhs, bucket->bu_blocks, 318 bucket->bu_bhs, bucket->bu_blocks,
315 &bucket_xh(bucket)->xh_check); 319 &bucket_xh(bucket)->xh_check);
320 spin_unlock(&OCFS2_SB(bucket->bu_inode->i_sb)->osb_xattr_lock);
316 321
317 for (i = 0; i < bucket->bu_blocks; i++) 322 for (i = 0; i < bucket->bu_blocks; i++)
318 ocfs2_journal_dirty(handle, bucket->bu_bhs[i]); 323 ocfs2_journal_dirty(handle, bucket->bu_bhs[i]);
@@ -542,8 +547,12 @@ int ocfs2_calc_xattr_init(struct inode *dir,
542 * when blocksize = 512, may reserve one more cluser for 547 * when blocksize = 512, may reserve one more cluser for
543 * xattr bucket, otherwise reserve one metadata block 548 * xattr bucket, otherwise reserve one metadata block
544 * for them is ok. 549 * for them is ok.
550 * If this is a new directory with inline data,
551 * we choose to reserve the entire inline area for
552 * directory contents and force an external xattr block.
545 */ 553 */
546 if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE || 554 if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE ||
555 (S_ISDIR(mode) && ocfs2_supports_inline_data(osb)) ||
547 (s_size + a_size) > OCFS2_XATTR_FREE_IN_IBODY) { 556 (s_size + a_size) > OCFS2_XATTR_FREE_IN_IBODY) {
548 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, xattr_ac); 557 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, xattr_ac);
549 if (ret) { 558 if (ret) {
@@ -1507,7 +1516,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
1507 last += 1; 1516 last += 1;
1508 } 1517 }
1509 1518
1510 free = min_offs - ((void *)last - xs->base) - sizeof(__u32); 1519 free = min_offs - ((void *)last - xs->base) - OCFS2_XATTR_HEADER_GAP;
1511 if (free < 0) 1520 if (free < 0)
1512 return -EIO; 1521 return -EIO;
1513 1522
@@ -2190,7 +2199,7 @@ static int ocfs2_xattr_can_be_in_inode(struct inode *inode,
2190 last += 1; 2199 last += 1;
2191 } 2200 }
2192 2201
2193 free = min_offs - ((void *)last - xs->base) - sizeof(__u32); 2202 free = min_offs - ((void *)last - xs->base) - OCFS2_XATTR_HEADER_GAP;
2194 if (free < 0) 2203 if (free < 0)
2195 return 0; 2204 return 0;
2196 2205
@@ -2592,8 +2601,9 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
2592 2601
2593 if (!ret) { 2602 if (!ret) {
2594 /* Update inode ctime. */ 2603 /* Update inode ctime. */
2595 ret = ocfs2_journal_access(ctxt->handle, inode, xis->inode_bh, 2604 ret = ocfs2_journal_access_di(ctxt->handle, inode,
2596 OCFS2_JOURNAL_ACCESS_WRITE); 2605 xis->inode_bh,
2606 OCFS2_JOURNAL_ACCESS_WRITE);
2597 if (ret) { 2607 if (ret) {
2598 mlog_errno(ret); 2608 mlog_errno(ret);
2599 goto out; 2609 goto out;
@@ -4785,19 +4795,33 @@ static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode,
4785 char *val, 4795 char *val,
4786 int value_len) 4796 int value_len)
4787{ 4797{
4788 int offset; 4798 int ret, offset, block_off;
4789 struct ocfs2_xattr_value_root *xv; 4799 struct ocfs2_xattr_value_root *xv;
4790 struct ocfs2_xattr_entry *xe = xs->here; 4800 struct ocfs2_xattr_entry *xe = xs->here;
4801 struct ocfs2_xattr_header *xh = bucket_xh(xs->bucket);
4802 void *base;
4791 4803
4792 BUG_ON(!xs->base || !xe || ocfs2_xattr_is_local(xe)); 4804 BUG_ON(!xs->base || !xe || ocfs2_xattr_is_local(xe));
4793 4805
4794 offset = le16_to_cpu(xe->xe_name_offset) + 4806 ret = ocfs2_xattr_bucket_get_name_value(inode, xh,
4795 OCFS2_XATTR_SIZE(xe->xe_name_len); 4807 xe - xh->xh_entries,
4808 &block_off,
4809 &offset);
4810 if (ret) {
4811 mlog_errno(ret);
4812 goto out;
4813 }
4796 4814
4797 xv = (struct ocfs2_xattr_value_root *)(xs->base + offset); 4815 base = bucket_block(xs->bucket, block_off);
4816 xv = (struct ocfs2_xattr_value_root *)(base + offset +
4817 OCFS2_XATTR_SIZE(xe->xe_name_len));
4798 4818
4799 return __ocfs2_xattr_set_value_outside(inode, handle, 4819 ret = __ocfs2_xattr_set_value_outside(inode, handle,
4800 xv, val, value_len); 4820 xv, val, value_len);
4821 if (ret)
4822 mlog_errno(ret);
4823out:
4824 return ret;
4801} 4825}
4802 4826
4803static int ocfs2_rm_xattr_cluster(struct inode *inode, 4827static int ocfs2_rm_xattr_cluster(struct inode *inode,
@@ -5060,8 +5084,8 @@ try_again:
5060 xh_free_start = le16_to_cpu(xh->xh_free_start); 5084 xh_free_start = le16_to_cpu(xh->xh_free_start);
5061 header_size = sizeof(struct ocfs2_xattr_header) + 5085 header_size = sizeof(struct ocfs2_xattr_header) +
5062 count * sizeof(struct ocfs2_xattr_entry); 5086 count * sizeof(struct ocfs2_xattr_entry);
5063 max_free = OCFS2_XATTR_BUCKET_SIZE - 5087 max_free = OCFS2_XATTR_BUCKET_SIZE - header_size -
5064 le16_to_cpu(xh->xh_name_value_len) - header_size; 5088 le16_to_cpu(xh->xh_name_value_len) - OCFS2_XATTR_HEADER_GAP;
5065 5089
5066 mlog_bug_on_msg(header_size > blocksize, "bucket %llu has header size " 5090 mlog_bug_on_msg(header_size > blocksize, "bucket %llu has header size "
5067 "of %u which exceed block size\n", 5091 "of %u which exceed block size\n",
@@ -5094,7 +5118,7 @@ try_again:
5094 need = 0; 5118 need = 0;
5095 } 5119 }
5096 5120
5097 free = xh_free_start - header_size; 5121 free = xh_free_start - header_size - OCFS2_XATTR_HEADER_GAP;
5098 /* 5122 /*
5099 * We need to make sure the new name/value pair 5123 * We need to make sure the new name/value pair
5100 * can exist in the same block. 5124 * can exist in the same block.
@@ -5127,7 +5151,8 @@ try_again:
5127 } 5151 }
5128 5152
5129 xh_free_start = le16_to_cpu(xh->xh_free_start); 5153 xh_free_start = le16_to_cpu(xh->xh_free_start);
5130 free = xh_free_start - header_size; 5154 free = xh_free_start - header_size
5155 - OCFS2_XATTR_HEADER_GAP;
5131 if (xh_free_start % blocksize < need) 5156 if (xh_free_start % blocksize < need)
5132 free -= xh_free_start % blocksize; 5157 free -= xh_free_start % blocksize;
5133 5158
diff --git a/fs/pipe.c b/fs/pipe.c
index 3a48ba5179d5..14f502b89cf5 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -699,12 +699,12 @@ pipe_rdwr_fasync(int fd, struct file *filp, int on)
699 int retval; 699 int retval;
700 700
701 mutex_lock(&inode->i_mutex); 701 mutex_lock(&inode->i_mutex);
702
703 retval = fasync_helper(fd, filp, on, &pipe->fasync_readers); 702 retval = fasync_helper(fd, filp, on, &pipe->fasync_readers);
704 703 if (retval >= 0) {
705 if (retval >= 0)
706 retval = fasync_helper(fd, filp, on, &pipe->fasync_writers); 704 retval = fasync_helper(fd, filp, on, &pipe->fasync_writers);
707 705 if (retval < 0) /* this can happen only if on == T */
706 fasync_helper(-1, filp, 0, &pipe->fasync_readers);
707 }
708 mutex_unlock(&inode->i_mutex); 708 mutex_unlock(&inode->i_mutex);
709 709
710 if (retval < 0) 710 if (retval < 0)
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 3e76bb9b3ad6..d8bb5c671f42 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -485,8 +485,10 @@ struct inode *proc_get_inode(struct super_block *sb, unsigned int ino,
485 } 485 }
486 } 486 }
487 unlock_new_inode(inode); 487 unlock_new_inode(inode);
488 } else 488 } else {
489 module_put(de->owner); 489 module_put(de->owner);
490 de_put(de);
491 }
490 return inode; 492 return inode;
491 493
492out_ino: 494out_ino:
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 767d95a6d1b1..e9983837d08d 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -80,7 +80,7 @@ static const struct file_operations proc_kpagecount_operations = {
80#define KPF_RECLAIM 9 80#define KPF_RECLAIM 9
81#define KPF_BUDDY 10 81#define KPF_BUDDY 10
82 82
83#define kpf_copy_bit(flags, srcpos, dstpos) (((flags >> srcpos) & 1) << dstpos) 83#define kpf_copy_bit(flags, dstpos, srcpos) (((flags >> srcpos) & 1) << dstpos)
84 84
85static ssize_t kpageflags_read(struct file *file, char __user *buf, 85static ssize_t kpageflags_read(struct file *file, char __user *buf,
86 size_t count, loff_t *ppos) 86 size_t count, loff_t *ppos)
@@ -107,7 +107,7 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf,
107 else 107 else
108 kflags = ppage->flags; 108 kflags = ppage->flags;
109 109
110 uflags = kpf_copy_bit(KPF_LOCKED, PG_locked, kflags) | 110 uflags = kpf_copy_bit(kflags, KPF_LOCKED, PG_locked) |
111 kpf_copy_bit(kflags, KPF_ERROR, PG_error) | 111 kpf_copy_bit(kflags, KPF_ERROR, PG_error) |
112 kpf_copy_bit(kflags, KPF_REFERENCED, PG_referenced) | 112 kpf_copy_bit(kflags, KPF_REFERENCED, PG_referenced) |
113 kpf_copy_bit(kflags, KPF_UPTODATE, PG_uptodate) | 113 kpf_copy_bit(kflags, KPF_UPTODATE, PG_uptodate) |
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index b9b567a28376..5d7c7ececa64 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -114,6 +114,9 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
114 if (!pagevec_add(&lru_pvec, page)) 114 if (!pagevec_add(&lru_pvec, page))
115 __pagevec_lru_add_file(&lru_pvec); 115 __pagevec_lru_add_file(&lru_pvec);
116 116
117 /* prevent the page from being discarded on memory pressure */
118 SetPageDirty(page);
119
117 unlock_page(page); 120 unlock_page(page);
118 } 121 }
119 122
@@ -126,6 +129,7 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
126 return -EFBIG; 129 return -EFBIG;
127 130
128 add_error: 131 add_error:
132 pagevec_lru_add_file(&lru_pvec);
129 page_cache_release(pages + loop); 133 page_cache_release(pages + loop);
130 for (loop++; loop < npages; loop++) 134 for (loop++; loop < npages; loop++)
131 __free_page(pages + loop); 135 __free_page(pages + loop);
diff --git a/fs/seq_file.c b/fs/seq_file.c
index b569ff1c4dc8..a1a4cfe19210 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -48,12 +48,78 @@ int seq_open(struct file *file, const struct seq_operations *op)
48 */ 48 */
49 file->f_version = 0; 49 file->f_version = 0;
50 50
51 /* SEQ files support lseek, but not pread/pwrite */ 51 /*
52 file->f_mode &= ~(FMODE_PREAD | FMODE_PWRITE); 52 * seq_files support lseek() and pread(). They do not implement
53 * write() at all, but we clear FMODE_PWRITE here for historical
54 * reasons.
55 *
56 * If a client of seq_files a) implements file.write() and b) wishes to
57 * support pwrite() then that client will need to implement its own
58 * file.open() which calls seq_open() and then sets FMODE_PWRITE.
59 */
60 file->f_mode &= ~FMODE_PWRITE;
53 return 0; 61 return 0;
54} 62}
55EXPORT_SYMBOL(seq_open); 63EXPORT_SYMBOL(seq_open);
56 64
65static int traverse(struct seq_file *m, loff_t offset)
66{
67 loff_t pos = 0, index;
68 int error = 0;
69 void *p;
70
71 m->version = 0;
72 index = 0;
73 m->count = m->from = 0;
74 if (!offset) {
75 m->index = index;
76 return 0;
77 }
78 if (!m->buf) {
79 m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL);
80 if (!m->buf)
81 return -ENOMEM;
82 }
83 p = m->op->start(m, &index);
84 while (p) {
85 error = PTR_ERR(p);
86 if (IS_ERR(p))
87 break;
88 error = m->op->show(m, p);
89 if (error < 0)
90 break;
91 if (unlikely(error)) {
92 error = 0;
93 m->count = 0;
94 }
95 if (m->count == m->size)
96 goto Eoverflow;
97 if (pos + m->count > offset) {
98 m->from = offset - pos;
99 m->count -= m->from;
100 m->index = index;
101 break;
102 }
103 pos += m->count;
104 m->count = 0;
105 if (pos == offset) {
106 index++;
107 m->index = index;
108 break;
109 }
110 p = m->op->next(m, p, &index);
111 }
112 m->op->stop(m, p);
113 m->index = index;
114 return error;
115
116Eoverflow:
117 m->op->stop(m, p);
118 kfree(m->buf);
119 m->buf = kmalloc(m->size <<= 1, GFP_KERNEL);
120 return !m->buf ? -ENOMEM : -EAGAIN;
121}
122
57/** 123/**
58 * seq_read - ->read() method for sequential files. 124 * seq_read - ->read() method for sequential files.
59 * @file: the file to read from 125 * @file: the file to read from
@@ -73,6 +139,22 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
73 int err = 0; 139 int err = 0;
74 140
75 mutex_lock(&m->lock); 141 mutex_lock(&m->lock);
142
143 /* Don't assume *ppos is where we left it */
144 if (unlikely(*ppos != m->read_pos)) {
145 m->read_pos = *ppos;
146 while ((err = traverse(m, *ppos)) == -EAGAIN)
147 ;
148 if (err) {
149 /* With prejudice... */
150 m->read_pos = 0;
151 m->version = 0;
152 m->index = 0;
153 m->count = 0;
154 goto Done;
155 }
156 }
157
76 /* 158 /*
77 * seq_file->op->..m_start/m_stop/m_next may do special actions 159 * seq_file->op->..m_start/m_stop/m_next may do special actions
78 * or optimisations based on the file->f_version, so we want to 160 * or optimisations based on the file->f_version, so we want to
@@ -172,8 +254,10 @@ Fill:
172Done: 254Done:
173 if (!copied) 255 if (!copied)
174 copied = err; 256 copied = err;
175 else 257 else {
176 *ppos += copied; 258 *ppos += copied;
259 m->read_pos += copied;
260 }
177 file->f_version = m->version; 261 file->f_version = m->version;
178 mutex_unlock(&m->lock); 262 mutex_unlock(&m->lock);
179 return copied; 263 return copied;
@@ -186,63 +270,6 @@ Efault:
186} 270}
187EXPORT_SYMBOL(seq_read); 271EXPORT_SYMBOL(seq_read);
188 272
189static int traverse(struct seq_file *m, loff_t offset)
190{
191 loff_t pos = 0, index;
192 int error = 0;
193 void *p;
194
195 m->version = 0;
196 index = 0;
197 m->count = m->from = 0;
198 if (!offset) {
199 m->index = index;
200 return 0;
201 }
202 if (!m->buf) {
203 m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL);
204 if (!m->buf)
205 return -ENOMEM;
206 }
207 p = m->op->start(m, &index);
208 while (p) {
209 error = PTR_ERR(p);
210 if (IS_ERR(p))
211 break;
212 error = m->op->show(m, p);
213 if (error < 0)
214 break;
215 if (unlikely(error)) {
216 error = 0;
217 m->count = 0;
218 }
219 if (m->count == m->size)
220 goto Eoverflow;
221 if (pos + m->count > offset) {
222 m->from = offset - pos;
223 m->count -= m->from;
224 m->index = index;
225 break;
226 }
227 pos += m->count;
228 m->count = 0;
229 if (pos == offset) {
230 index++;
231 m->index = index;
232 break;
233 }
234 p = m->op->next(m, p, &index);
235 }
236 m->op->stop(m, p);
237 return error;
238
239Eoverflow:
240 m->op->stop(m, p);
241 kfree(m->buf);
242 m->buf = kmalloc(m->size <<= 1, GFP_KERNEL);
243 return !m->buf ? -ENOMEM : -EAGAIN;
244}
245
246/** 273/**
247 * seq_lseek - ->llseek() method for sequential files. 274 * seq_lseek - ->llseek() method for sequential files.
248 * @file: the file in question 275 * @file: the file in question
@@ -265,16 +292,18 @@ loff_t seq_lseek(struct file *file, loff_t offset, int origin)
265 if (offset < 0) 292 if (offset < 0)
266 break; 293 break;
267 retval = offset; 294 retval = offset;
268 if (offset != file->f_pos) { 295 if (offset != m->read_pos) {
269 while ((retval=traverse(m, offset)) == -EAGAIN) 296 while ((retval=traverse(m, offset)) == -EAGAIN)
270 ; 297 ;
271 if (retval) { 298 if (retval) {
272 /* with extreme prejudice... */ 299 /* with extreme prejudice... */
273 file->f_pos = 0; 300 file->f_pos = 0;
301 m->read_pos = 0;
274 m->version = 0; 302 m->version = 0;
275 m->index = 0; 303 m->index = 0;
276 m->count = 0; 304 m->count = 0;
277 } else { 305 } else {
306 m->read_pos = offset;
278 retval = file->f_pos = offset; 307 retval = file->f_pos = offset;
279 } 308 }
280 } 309 }
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index c837dfc2b3c6..2a7960310349 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -80,7 +80,7 @@ static struct buffer_head *get_block_length(struct super_block *sb,
80 * generated a larger block - this does occasionally happen with zlib). 80 * generated a larger block - this does occasionally happen with zlib).
81 */ 81 */
82int squashfs_read_data(struct super_block *sb, void **buffer, u64 index, 82int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
83 int length, u64 *next_index, int srclength) 83 int length, u64 *next_index, int srclength, int pages)
84{ 84{
85 struct squashfs_sb_info *msblk = sb->s_fs_info; 85 struct squashfs_sb_info *msblk = sb->s_fs_info;
86 struct buffer_head **bh; 86 struct buffer_head **bh;
@@ -184,7 +184,7 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
184 offset = 0; 184 offset = 0;
185 } 185 }
186 186
187 if (msblk->stream.avail_out == 0) { 187 if (msblk->stream.avail_out == 0 && page < pages) {
188 msblk->stream.next_out = buffer[page++]; 188 msblk->stream.next_out = buffer[page++];
189 msblk->stream.avail_out = PAGE_CACHE_SIZE; 189 msblk->stream.avail_out = PAGE_CACHE_SIZE;
190 } 190 }
@@ -201,25 +201,20 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
201 zlib_init = 1; 201 zlib_init = 1;
202 } 202 }
203 203
204 zlib_err = zlib_inflate(&msblk->stream, Z_NO_FLUSH); 204 zlib_err = zlib_inflate(&msblk->stream, Z_SYNC_FLUSH);
205 205
206 if (msblk->stream.avail_in == 0 && k < b) 206 if (msblk->stream.avail_in == 0 && k < b)
207 put_bh(bh[k++]); 207 put_bh(bh[k++]);
208 } while (zlib_err == Z_OK); 208 } while (zlib_err == Z_OK);
209 209
210 if (zlib_err != Z_STREAM_END) { 210 if (zlib_err != Z_STREAM_END) {
211 ERROR("zlib_inflate returned unexpected result" 211 ERROR("zlib_inflate error, data probably corrupt\n");
212 " 0x%x, srclength %d, avail_in %d,"
213 " avail_out %d\n", zlib_err, srclength,
214 msblk->stream.avail_in,
215 msblk->stream.avail_out);
216 goto release_mutex; 212 goto release_mutex;
217 } 213 }
218 214
219 zlib_err = zlib_inflateEnd(&msblk->stream); 215 zlib_err = zlib_inflateEnd(&msblk->stream);
220 if (zlib_err != Z_OK) { 216 if (zlib_err != Z_OK) {
221 ERROR("zlib_inflateEnd returned unexpected result 0x%x," 217 ERROR("zlib_inflate error, data probably corrupt\n");
222 " srclength %d\n", zlib_err, srclength);
223 goto release_mutex; 218 goto release_mutex;
224 } 219 }
225 length = msblk->stream.total_out; 220 length = msblk->stream.total_out;
@@ -268,7 +263,8 @@ block_release:
268 put_bh(bh[k]); 263 put_bh(bh[k]);
269 264
270read_failure: 265read_failure:
271 ERROR("sb_bread failed reading block 0x%llx\n", cur_index); 266 ERROR("squashfs_read_data failed to read block 0x%llx\n",
267 (unsigned long long) index);
272 kfree(bh); 268 kfree(bh);
273 return -EIO; 269 return -EIO;
274} 270}
diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c
index f29eda16d25e..1c4739e33af6 100644
--- a/fs/squashfs/cache.c
+++ b/fs/squashfs/cache.c
@@ -119,7 +119,7 @@ struct squashfs_cache_entry *squashfs_cache_get(struct super_block *sb,
119 119
120 entry->length = squashfs_read_data(sb, entry->data, 120 entry->length = squashfs_read_data(sb, entry->data,
121 block, length, &entry->next_index, 121 block, length, &entry->next_index,
122 cache->block_size); 122 cache->block_size, cache->pages);
123 123
124 spin_lock(&cache->lock); 124 spin_lock(&cache->lock);
125 125
@@ -406,7 +406,7 @@ int squashfs_read_table(struct super_block *sb, void *buffer, u64 block,
406 for (i = 0; i < pages; i++, buffer += PAGE_CACHE_SIZE) 406 for (i = 0; i < pages; i++, buffer += PAGE_CACHE_SIZE)
407 data[i] = buffer; 407 data[i] = buffer;
408 res = squashfs_read_data(sb, data, block, length | 408 res = squashfs_read_data(sb, data, block, length |
409 SQUASHFS_COMPRESSED_BIT_BLOCK, NULL, length); 409 SQUASHFS_COMPRESSED_BIT_BLOCK, NULL, length, pages);
410 kfree(data); 410 kfree(data);
411 return res; 411 return res;
412} 412}
diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
index 7a63398bb855..9101dbde39ec 100644
--- a/fs/squashfs/inode.c
+++ b/fs/squashfs/inode.c
@@ -133,7 +133,8 @@ int squashfs_read_inode(struct inode *inode, long long ino)
133 type = le16_to_cpu(sqshb_ino->inode_type); 133 type = le16_to_cpu(sqshb_ino->inode_type);
134 switch (type) { 134 switch (type) {
135 case SQUASHFS_REG_TYPE: { 135 case SQUASHFS_REG_TYPE: {
136 unsigned int frag_offset, frag_size, frag; 136 unsigned int frag_offset, frag;
137 int frag_size;
137 u64 frag_blk; 138 u64 frag_blk;
138 struct squashfs_reg_inode *sqsh_ino = &squashfs_ino.reg; 139 struct squashfs_reg_inode *sqsh_ino = &squashfs_ino.reg;
139 140
@@ -175,7 +176,8 @@ int squashfs_read_inode(struct inode *inode, long long ino)
175 break; 176 break;
176 } 177 }
177 case SQUASHFS_LREG_TYPE: { 178 case SQUASHFS_LREG_TYPE: {
178 unsigned int frag_offset, frag_size, frag; 179 unsigned int frag_offset, frag;
180 int frag_size;
179 u64 frag_blk; 181 u64 frag_blk;
180 struct squashfs_lreg_inode *sqsh_ino = &squashfs_ino.lreg; 182 struct squashfs_lreg_inode *sqsh_ino = &squashfs_ino.lreg;
181 183
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index 6b2515d027d5..0e9feb6adf7e 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -34,7 +34,7 @@ static inline struct squashfs_inode_info *squashfs_i(struct inode *inode)
34 34
35/* block.c */ 35/* block.c */
36extern int squashfs_read_data(struct super_block *, void **, u64, int, u64 *, 36extern int squashfs_read_data(struct super_block *, void **, u64, int, u64 *,
37 int); 37 int, int);
38 38
39/* cache.c */ 39/* cache.c */
40extern struct squashfs_cache *squashfs_cache_init(char *, int, int); 40extern struct squashfs_cache *squashfs_cache_init(char *, int, int);
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 071df5b5b491..681ec0d83799 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -389,7 +389,7 @@ static int __init init_squashfs_fs(void)
389 return err; 389 return err;
390 } 390 }
391 391
392 printk(KERN_INFO "squashfs: version 4.0 (2009/01/03) " 392 printk(KERN_INFO "squashfs: version 4.0 (2009/01/31) "
393 "Phillip Lougher\n"); 393 "Phillip Lougher\n");
394 394
395 return 0; 395 return 0;
diff --git a/fs/super.c b/fs/super.c
index 645e5403f2a0..6ce501447ada 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -82,7 +82,22 @@ static struct super_block *alloc_super(struct file_system_type *type)
82 * lock ordering than usbfs: 82 * lock ordering than usbfs:
83 */ 83 */
84 lockdep_set_class(&s->s_lock, &type->s_lock_key); 84 lockdep_set_class(&s->s_lock, &type->s_lock_key);
85 down_write(&s->s_umount); 85 /*
86 * sget() can have s_umount recursion.
87 *
88 * When it cannot find a suitable sb, it allocates a new
89 * one (this one), and tries again to find a suitable old
90 * one.
91 *
92 * In case that succeeds, it will acquire the s_umount
93 * lock of the old one. Since these are clearly distrinct
94 * locks, and this object isn't exposed yet, there's no
95 * risk of deadlocks.
96 *
97 * Annotate this by putting this lock in a different
98 * subclass.
99 */
100 down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING);
86 s->s_count = S_BIAS; 101 s->s_count = S_BIAS;
87 atomic_set(&s->s_active, 1); 102 atomic_set(&s->s_active, 1);
88 mutex_init(&s->s_vfs_rename_mutex); 103 mutex_init(&s->s_vfs_rename_mutex);
@@ -301,7 +316,7 @@ void generic_shutdown_super(struct super_block *sb)
301 /* 316 /*
302 * wait for asynchronous fs operations to finish before going further 317 * wait for asynchronous fs operations to finish before going further
303 */ 318 */
304 async_synchronize_full_special(&sb->s_async_list); 319 async_synchronize_full_domain(&sb->s_async_list);
305 320
306 /* bad name - it should be evict_inodes() */ 321 /* bad name - it should be evict_inodes() */
307 invalidate_inodes(sb); 322 invalidate_inodes(sb);
@@ -356,8 +371,10 @@ retry:
356 continue; 371 continue;
357 if (!grab_super(old)) 372 if (!grab_super(old))
358 goto retry; 373 goto retry;
359 if (s) 374 if (s) {
375 up_write(&s->s_umount);
360 destroy_super(s); 376 destroy_super(s);
377 }
361 return old; 378 return old;
362 } 379 }
363 } 380 }
@@ -372,6 +389,7 @@ retry:
372 err = set(s, data); 389 err = set(s, data);
373 if (err) { 390 if (err) {
374 spin_unlock(&sb_lock); 391 spin_unlock(&sb_lock);
392 up_write(&s->s_umount);
375 destroy_super(s); 393 destroy_super(s);
376 return ERR_PTR(err); 394 return ERR_PTR(err);
377 } 395 }
@@ -470,7 +488,7 @@ restart:
470 sb->s_count++; 488 sb->s_count++;
471 spin_unlock(&sb_lock); 489 spin_unlock(&sb_lock);
472 down_read(&sb->s_umount); 490 down_read(&sb->s_umount);
473 async_synchronize_full_special(&sb->s_async_list); 491 async_synchronize_full_domain(&sb->s_async_list);
474 if (sb->s_root && (wait || sb->s_dirt)) 492 if (sb->s_root && (wait || sb->s_dirt))
475 sb->s_op->sync_fs(sb, wait); 493 sb->s_op->sync_fs(sb, wait);
476 up_read(&sb->s_umount); 494 up_read(&sb->s_umount);
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 6a123b8ff3f5..b042bd7034b1 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -186,10 +186,9 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
186 BUILD_BUG_ON(TFD_CLOEXEC != O_CLOEXEC); 186 BUILD_BUG_ON(TFD_CLOEXEC != O_CLOEXEC);
187 BUILD_BUG_ON(TFD_NONBLOCK != O_NONBLOCK); 187 BUILD_BUG_ON(TFD_NONBLOCK != O_NONBLOCK);
188 188
189 if (flags & ~(TFD_CLOEXEC | TFD_NONBLOCK)) 189 if ((flags & ~TFD_CREATE_FLAGS) ||
190 return -EINVAL; 190 (clockid != CLOCK_MONOTONIC &&
191 if (clockid != CLOCK_MONOTONIC && 191 clockid != CLOCK_REALTIME))
192 clockid != CLOCK_REALTIME)
193 return -EINVAL; 192 return -EINVAL;
194 193
195 ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); 194 ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
@@ -201,7 +200,7 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
201 hrtimer_init(&ctx->tmr, clockid, HRTIMER_MODE_ABS); 200 hrtimer_init(&ctx->tmr, clockid, HRTIMER_MODE_ABS);
202 201
203 ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx, 202 ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx,
204 flags & (O_CLOEXEC | O_NONBLOCK)); 203 flags & TFD_SHARED_FCNTL_FLAGS);
205 if (ufd < 0) 204 if (ufd < 0)
206 kfree(ctx); 205 kfree(ctx);
207 206
@@ -219,7 +218,8 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
219 if (copy_from_user(&ktmr, utmr, sizeof(ktmr))) 218 if (copy_from_user(&ktmr, utmr, sizeof(ktmr)))
220 return -EFAULT; 219 return -EFAULT;
221 220
222 if (!timespec_valid(&ktmr.it_value) || 221 if ((flags & ~TFD_SETTIME_FLAGS) ||
222 !timespec_valid(&ktmr.it_value) ||
223 !timespec_valid(&ktmr.it_interval)) 223 !timespec_valid(&ktmr.it_interval))
224 return -EINVAL; 224 return -EINVAL;
225 225
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index d71dc44e21ed..aa1016bb9134 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -34,6 +34,12 @@
34#include <linux/backing-dev.h> 34#include <linux/backing-dev.h>
35#include <linux/freezer.h> 35#include <linux/freezer.h>
36 36
37#include "xfs_sb.h"
38#include "xfs_inum.h"
39#include "xfs_ag.h"
40#include "xfs_dmapi.h"
41#include "xfs_mount.h"
42
37static kmem_zone_t *xfs_buf_zone; 43static kmem_zone_t *xfs_buf_zone;
38STATIC int xfsbufd(void *); 44STATIC int xfsbufd(void *);
39STATIC int xfsbufd_wakeup(int, gfp_t); 45STATIC int xfsbufd_wakeup(int, gfp_t);
@@ -166,6 +172,75 @@ test_page_region(
166} 172}
167 173
168/* 174/*
175 * Mapping of multi-page buffers into contiguous virtual space
176 */
177
178typedef struct a_list {
179 void *vm_addr;
180 struct a_list *next;
181} a_list_t;
182
183static a_list_t *as_free_head;
184static int as_list_len;
185static DEFINE_SPINLOCK(as_lock);
186
187/*
188 * Try to batch vunmaps because they are costly.
189 */
190STATIC void
191free_address(
192 void *addr)
193{
194 a_list_t *aentry;
195
196#ifdef CONFIG_XEN
197 /*
198 * Xen needs to be able to make sure it can get an exclusive
199 * RO mapping of pages it wants to turn into a pagetable. If
200 * a newly allocated page is also still being vmap()ed by xfs,
201 * it will cause pagetable construction to fail. This is a
202 * quick workaround to always eagerly unmap pages so that Xen
203 * is happy.
204 */
205 vunmap(addr);
206 return;
207#endif
208
209 aentry = kmalloc(sizeof(a_list_t), GFP_NOWAIT);
210 if (likely(aentry)) {
211 spin_lock(&as_lock);
212 aentry->next = as_free_head;
213 aentry->vm_addr = addr;
214 as_free_head = aentry;
215 as_list_len++;
216 spin_unlock(&as_lock);
217 } else {
218 vunmap(addr);
219 }
220}
221
222STATIC void
223purge_addresses(void)
224{
225 a_list_t *aentry, *old;
226
227 if (as_free_head == NULL)
228 return;
229
230 spin_lock(&as_lock);
231 aentry = as_free_head;
232 as_free_head = NULL;
233 as_list_len = 0;
234 spin_unlock(&as_lock);
235
236 while ((old = aentry) != NULL) {
237 vunmap(aentry->vm_addr);
238 aentry = aentry->next;
239 kfree(old);
240 }
241}
242
243/*
169 * Internal xfs_buf_t object manipulation 244 * Internal xfs_buf_t object manipulation
170 */ 245 */
171 246
@@ -264,7 +339,7 @@ xfs_buf_free(
264 uint i; 339 uint i;
265 340
266 if ((bp->b_flags & XBF_MAPPED) && (bp->b_page_count > 1)) 341 if ((bp->b_flags & XBF_MAPPED) && (bp->b_page_count > 1))
267 vm_unmap_ram(bp->b_addr - bp->b_offset, bp->b_page_count); 342 free_address(bp->b_addr - bp->b_offset);
268 343
269 for (i = 0; i < bp->b_page_count; i++) { 344 for (i = 0; i < bp->b_page_count; i++) {
270 struct page *page = bp->b_pages[i]; 345 struct page *page = bp->b_pages[i];
@@ -386,8 +461,10 @@ _xfs_buf_map_pages(
386 bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset; 461 bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
387 bp->b_flags |= XBF_MAPPED; 462 bp->b_flags |= XBF_MAPPED;
388 } else if (flags & XBF_MAPPED) { 463 } else if (flags & XBF_MAPPED) {
389 bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count, 464 if (as_list_len > 64)
390 -1, PAGE_KERNEL); 465 purge_addresses();
466 bp->b_addr = vmap(bp->b_pages, bp->b_page_count,
467 VM_MAP, PAGE_KERNEL);
391 if (unlikely(bp->b_addr == NULL)) 468 if (unlikely(bp->b_addr == NULL))
392 return -ENOMEM; 469 return -ENOMEM;
393 bp->b_addr += bp->b_offset; 470 bp->b_addr += bp->b_offset;
@@ -1364,10 +1441,12 @@ xfs_unregister_buftarg(
1364 1441
1365void 1442void
1366xfs_free_buftarg( 1443xfs_free_buftarg(
1367 xfs_buftarg_t *btp) 1444 struct xfs_mount *mp,
1445 struct xfs_buftarg *btp)
1368{ 1446{
1369 xfs_flush_buftarg(btp, 1); 1447 xfs_flush_buftarg(btp, 1);
1370 xfs_blkdev_issue_flush(btp); 1448 if (mp->m_flags & XFS_MOUNT_BARRIER)
1449 xfs_blkdev_issue_flush(btp);
1371 xfs_free_bufhash(btp); 1450 xfs_free_bufhash(btp);
1372 iput(btp->bt_mapping->host); 1451 iput(btp->bt_mapping->host);
1373 1452
@@ -1672,6 +1751,8 @@ xfsbufd(
1672 count++; 1751 count++;
1673 } 1752 }
1674 1753
1754 if (as_list_len > 0)
1755 purge_addresses();
1675 if (count) 1756 if (count)
1676 blk_run_address_space(target->bt_mapping); 1757 blk_run_address_space(target->bt_mapping);
1677 1758
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 288ae7c4c800..9b4d666ad31f 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -413,7 +413,7 @@ static inline int XFS_bwrite(xfs_buf_t *bp)
413 * Handling of buftargs. 413 * Handling of buftargs.
414 */ 414 */
415extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int); 415extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int);
416extern void xfs_free_buftarg(xfs_buftarg_t *); 416extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *);
417extern void xfs_wait_buftarg(xfs_buftarg_t *); 417extern void xfs_wait_buftarg(xfs_buftarg_t *);
418extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); 418extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int);
419extern int xfs_flush_buftarg(xfs_buftarg_t *, int); 419extern int xfs_flush_buftarg(xfs_buftarg_t *, int);
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index c71e226da7f5..32ae5028e96b 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -734,15 +734,15 @@ xfs_close_devices(
734{ 734{
735 if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) { 735 if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) {
736 struct block_device *logdev = mp->m_logdev_targp->bt_bdev; 736 struct block_device *logdev = mp->m_logdev_targp->bt_bdev;
737 xfs_free_buftarg(mp->m_logdev_targp); 737 xfs_free_buftarg(mp, mp->m_logdev_targp);
738 xfs_blkdev_put(logdev); 738 xfs_blkdev_put(logdev);
739 } 739 }
740 if (mp->m_rtdev_targp) { 740 if (mp->m_rtdev_targp) {
741 struct block_device *rtdev = mp->m_rtdev_targp->bt_bdev; 741 struct block_device *rtdev = mp->m_rtdev_targp->bt_bdev;
742 xfs_free_buftarg(mp->m_rtdev_targp); 742 xfs_free_buftarg(mp, mp->m_rtdev_targp);
743 xfs_blkdev_put(rtdev); 743 xfs_blkdev_put(rtdev);
744 } 744 }
745 xfs_free_buftarg(mp->m_ddev_targp); 745 xfs_free_buftarg(mp, mp->m_ddev_targp);
746} 746}
747 747
748/* 748/*
@@ -811,9 +811,9 @@ xfs_open_devices(
811 811
812 out_free_rtdev_targ: 812 out_free_rtdev_targ:
813 if (mp->m_rtdev_targp) 813 if (mp->m_rtdev_targp)
814 xfs_free_buftarg(mp->m_rtdev_targp); 814 xfs_free_buftarg(mp, mp->m_rtdev_targp);
815 out_free_ddev_targ: 815 out_free_ddev_targ:
816 xfs_free_buftarg(mp->m_ddev_targp); 816 xfs_free_buftarg(mp, mp->m_ddev_targp);
817 out_close_rtdev: 817 out_close_rtdev:
818 if (rtdev) 818 if (rtdev)
819 xfs_blkdev_put(rtdev); 819 xfs_blkdev_put(rtdev);
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index e2fb6210d4c5..478e587087fe 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -246,9 +246,6 @@ xfs_iget_cache_miss(
246 goto out_destroy; 246 goto out_destroy;
247 } 247 }
248 248
249 if (lock_flags)
250 xfs_ilock(ip, lock_flags);
251
252 /* 249 /*
253 * Preload the radix tree so we can insert safely under the 250 * Preload the radix tree so we can insert safely under the
254 * write spinlock. Note that we cannot sleep inside the preload 251 * write spinlock. Note that we cannot sleep inside the preload
@@ -256,7 +253,16 @@ xfs_iget_cache_miss(
256 */ 253 */
257 if (radix_tree_preload(GFP_KERNEL)) { 254 if (radix_tree_preload(GFP_KERNEL)) {
258 error = EAGAIN; 255 error = EAGAIN;
259 goto out_unlock; 256 goto out_destroy;
257 }
258
259 /*
260 * Because the inode hasn't been added to the radix-tree yet it can't
261 * be found by another thread, so we can do the non-sleeping lock here.
262 */
263 if (lock_flags) {
264 if (!xfs_ilock_nowait(ip, lock_flags))
265 BUG();
260 } 266 }
261 267
262 mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1); 268 mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
@@ -284,7 +290,6 @@ xfs_iget_cache_miss(
284out_preload_end: 290out_preload_end:
285 write_unlock(&pag->pag_ici_lock); 291 write_unlock(&pag->pag_ici_lock);
286 radix_tree_preload_end(); 292 radix_tree_preload_end();
287out_unlock:
288 if (lock_flags) 293 if (lock_flags)
289 xfs_iunlock(ip, lock_flags); 294 xfs_iunlock(ip, lock_flags);
290out_destroy: 295out_destroy:
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index b1047de2fffd..61af610d79b3 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1455,10 +1455,19 @@ xlog_recover_add_to_trans(
1455 item = item->ri_prev; 1455 item = item->ri_prev;
1456 1456
1457 if (item->ri_total == 0) { /* first region to be added */ 1457 if (item->ri_total == 0) { /* first region to be added */
1458 item->ri_total = in_f->ilf_size; 1458 if (in_f->ilf_size == 0 ||
1459 ASSERT(item->ri_total <= XLOG_MAX_REGIONS_IN_ITEM); 1459 in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) {
1460 item->ri_buf = kmem_zalloc((item->ri_total * 1460 xlog_warn(
1461 sizeof(xfs_log_iovec_t)), KM_SLEEP); 1461 "XFS: bad number of regions (%d) in inode log format",
1462 in_f->ilf_size);
1463 ASSERT(0);
1464 return XFS_ERROR(EIO);
1465 }
1466
1467 item->ri_total = in_f->ilf_size;
1468 item->ri_buf =
1469 kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t),
1470 KM_SLEEP);
1462 } 1471 }
1463 ASSERT(item->ri_total > item->ri_cnt); 1472 ASSERT(item->ri_total > item->ri_cnt);
1464 /* Description region is ri_buf[0] */ 1473 /* Description region is ri_buf[0] */