author     Stefan Richter <stefanr@s5r6.in-berlin.de>   2011-05-10 14:52:07 -0400
committer  Stefan Richter <stefanr@s5r6.in-berlin.de>   2011-05-10 16:50:41 -0400
commit     020abf03cd659388f94cb328e1e1df0656e0d7ff (patch)
tree       40d05011708ad1b4a05928d167eb120420581aa6 /fs/xfs
parent     0ff8fbc61727c926883eec381fbd3d32d1fab504 (diff)
parent     693d92a1bbc9e42681c42ed190bd42b636ca876f (diff)
Merge tag 'v2.6.39-rc7'
in order to pull in changes in drivers/media/dvb/firewire/ and sound/firewire/.
Diffstat (limited to 'fs/xfs')
-rw-r--r--  fs/xfs/Makefile  13
-rw-r--r--  fs/xfs/linux-2.6/kmem.c  9
-rw-r--r--  fs/xfs/linux-2.6/sv.h  59
-rw-r--r--  fs/xfs/linux-2.6/xfs_acl.c  11
-rw-r--r--  fs/xfs/linux-2.6/xfs_aops.c  437
-rw-r--r--  fs/xfs/linux-2.6/xfs_aops.h  16
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.c  607
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.h  69
-rw-r--r--  fs/xfs/linux-2.6/xfs_discard.c  193
-rw-r--r--  fs/xfs/linux-2.6/xfs_discard.h  8
-rw-r--r--  fs/xfs/linux-2.6/xfs_export.c  16
-rw-r--r--  fs/xfs/linux-2.6/xfs_file.c  591
-rw-r--r--  fs/xfs/linux-2.6/xfs_ioctl.c  38
-rw-r--r--  fs/xfs/linux-2.6/xfs_iops.c  65
-rw-r--r--  fs/xfs/linux-2.6/xfs_linux.h  24
-rw-r--r--  fs/xfs/linux-2.6/xfs_message.c  126
-rw-r--r--  fs/xfs/linux-2.6/xfs_message.h  40
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.c  292
-rw-r--r--  fs/xfs/linux-2.6/xfs_sync.c  366
-rw-r--r--  fs/xfs/linux-2.6/xfs_sync.h  2
-rw-r--r--  fs/xfs/linux-2.6/xfs_sysctl.c  25
-rw-r--r--  fs/xfs/linux-2.6/xfs_trace.h  92
-rw-r--r--  fs/xfs/quota/xfs_dquot.c  51
-rw-r--r--  fs/xfs/quota/xfs_dquot_item.c  5
-rw-r--r--  fs/xfs/quota/xfs_qm.c  102
-rw-r--r--  fs/xfs/quota/xfs_qm.h  5
-rw-r--r--  fs/xfs/quota/xfs_qm_bhv.c  5
-rw-r--r--  fs/xfs/quota/xfs_qm_syscalls.c  91
-rw-r--r--  fs/xfs/quota/xfs_trans_dquot.c  5
-rw-r--r--  fs/xfs/support/debug.c  115
-rw-r--r--  fs/xfs/support/debug.h  54
-rw-r--r--  fs/xfs/xfs_acl.h  2
-rw-r--r--  fs/xfs/xfs_ag.h  2
-rw-r--r--  fs/xfs/xfs_alloc.c  545
-rw-r--r--  fs/xfs/xfs_alloc.h  41
-rw-r--r--  fs/xfs/xfs_attr_leaf.c  4
-rw-r--r--  fs/xfs/xfs_bmap.c  85
-rw-r--r--  fs/xfs/xfs_btree.c  9
-rw-r--r--  fs/xfs/xfs_buf_item.c  200
-rw-r--r--  fs/xfs/xfs_buf_item.h  11
-rw-r--r--  fs/xfs/xfs_da_btree.c  9
-rw-r--r--  fs/xfs/xfs_dfrag.c  4
-rw-r--r--  fs/xfs/xfs_dir2.c  2
-rw-r--r--  fs/xfs/xfs_dir2_node.c  25
-rw-r--r--  fs/xfs/xfs_error.c  53
-rw-r--r--  fs/xfs/xfs_error.h  29
-rw-r--r--  fs/xfs/xfs_extfree_item.c  96
-rw-r--r--  fs/xfs/xfs_extfree_item.h  11
-rw-r--r--  fs/xfs/xfs_fsops.c  20
-rw-r--r--  fs/xfs/xfs_fsops.h  2
-rw-r--r--  fs/xfs/xfs_ialloc.c  82
-rw-r--r--  fs/xfs/xfs_iget.c  90
-rw-r--r--  fs/xfs/xfs_inode.c  187
-rw-r--r--  fs/xfs/xfs_inode.h  42
-rw-r--r--  fs/xfs/xfs_inode_item.c  163
-rw-r--r--  fs/xfs/xfs_iomap.c  250
-rw-r--r--  fs/xfs/xfs_iomap.h  27
-rw-r--r--  fs/xfs/xfs_itable.c  2
-rw-r--r--  fs/xfs/xfs_log.c  885
-rw-r--r--  fs/xfs/xfs_log.h  2
-rw-r--r--  fs/xfs/xfs_log_cil.c  32
-rw-r--r--  fs/xfs/xfs_log_priv.h  132
-rw-r--r--  fs/xfs/xfs_log_recover.c  819
-rw-r--r--  fs/xfs/xfs_mount.c  171
-rw-r--r--  fs/xfs/xfs_mount.h  23
-rw-r--r--  fs/xfs/xfs_mru_cache.c  4
-rw-r--r--  fs/xfs/xfs_quota.h  3
-rw-r--r--  fs/xfs/xfs_rename.c  1
-rw-r--r--  fs/xfs/xfs_rtalloc.c  92
-rw-r--r--  fs/xfs/xfs_rtalloc.h  2
-rw-r--r--  fs/xfs/xfs_rw.c  58
-rw-r--r--  fs/xfs/xfs_trans.c  122
-rw-r--r--  fs/xfs/xfs_trans.h  4
-rw-r--r--  fs/xfs/xfs_trans_ail.c  629
-rw-r--r--  fs/xfs/xfs_trans_buf.c  9
-rw-r--r--  fs/xfs/xfs_trans_extfree.c  8
-rw-r--r--  fs/xfs/xfs_trans_inode.c  24
-rw-r--r--  fs/xfs/xfs_trans_priv.h  57
-rw-r--r--  fs/xfs/xfs_vnodeops.c  140
-rw-r--r--  fs/xfs/xfs_vnodeops.h  1
80 files changed, 4461 insertions, 4282 deletions
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 0dce969d6cad..284a7c89697e 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -16,14 +16,11 @@
16# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17# 17#
18 18
19EXTRA_CFLAGS += -I$(src) -I$(src)/linux-2.6 19ccflags-y := -I$(src) -I$(src)/linux-2.6
20ccflags-$(CONFIG_XFS_DEBUG) += -g
20 21
21XFS_LINUX := linux-2.6 22XFS_LINUX := linux-2.6
22 23
23ifeq ($(CONFIG_XFS_DEBUG),y)
24 EXTRA_CFLAGS += -g
25endif
26
27obj-$(CONFIG_XFS_FS) += xfs.o 24obj-$(CONFIG_XFS_FS) += xfs.o
28 25
29xfs-y += linux-2.6/xfs_trace.o 26xfs-y += linux-2.6/xfs_trace.o
@@ -98,17 +95,17 @@ xfs-y += $(addprefix $(XFS_LINUX)/, \
98 kmem.o \ 95 kmem.o \
99 xfs_aops.o \ 96 xfs_aops.o \
100 xfs_buf.o \ 97 xfs_buf.o \
98 xfs_discard.o \
101 xfs_export.o \ 99 xfs_export.o \
102 xfs_file.o \ 100 xfs_file.o \
103 xfs_fs_subr.o \ 101 xfs_fs_subr.o \
104 xfs_globals.o \ 102 xfs_globals.o \
105 xfs_ioctl.o \ 103 xfs_ioctl.o \
106 xfs_iops.o \ 104 xfs_iops.o \
105 xfs_message.o \
107 xfs_super.o \ 106 xfs_super.o \
108 xfs_sync.o \ 107 xfs_sync.o \
109 xfs_xattr.o) 108 xfs_xattr.o)
110 109
111# Objects in support/ 110# Objects in support/
112xfs-y += $(addprefix support/, \ 111xfs-y += support/uuid.o
113 debug.o \
114 uuid.o)
diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c
index 666c9db48eb6..a907de565db3 100644
--- a/fs/xfs/linux-2.6/kmem.c
+++ b/fs/xfs/linux-2.6/kmem.c
@@ -23,6 +23,7 @@
23#include <linux/backing-dev.h> 23#include <linux/backing-dev.h>
24#include "time.h" 24#include "time.h"
25#include "kmem.h" 25#include "kmem.h"
26#include "xfs_message.h"
26 27
27/* 28/*
28 * Greedy allocation. May fail and may return vmalloced memory. 29 * Greedy allocation. May fail and may return vmalloced memory.
@@ -56,8 +57,8 @@ kmem_alloc(size_t size, unsigned int __nocast flags)
56 if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP))) 57 if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
57 return ptr; 58 return ptr;
58 if (!(++retries % 100)) 59 if (!(++retries % 100))
59 printk(KERN_ERR "XFS: possible memory allocation " 60 xfs_err(NULL,
60 "deadlock in %s (mode:0x%x)\n", 61 "possible memory allocation deadlock in %s (mode:0x%x)",
61 __func__, lflags); 62 __func__, lflags);
62 congestion_wait(BLK_RW_ASYNC, HZ/50); 63 congestion_wait(BLK_RW_ASYNC, HZ/50);
63 } while (1); 64 } while (1);
@@ -112,8 +113,8 @@ kmem_zone_alloc(kmem_zone_t *zone, unsigned int __nocast flags)
112 if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP))) 113 if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
113 return ptr; 114 return ptr;
114 if (!(++retries % 100)) 115 if (!(++retries % 100))
115 printk(KERN_ERR "XFS: possible memory allocation " 116 xfs_err(NULL,
116 "deadlock in %s (mode:0x%x)\n", 117 "possible memory allocation deadlock in %s (mode:0x%x)",
117 __func__, lflags); 118 __func__, lflags);
118 congestion_wait(BLK_RW_ASYNC, HZ/50); 119 congestion_wait(BLK_RW_ASYNC, HZ/50);
119 } while (1); 120 } while (1);
diff --git a/fs/xfs/linux-2.6/sv.h b/fs/xfs/linux-2.6/sv.h
deleted file mode 100644
index 4dfc7c370819..000000000000
--- a/fs/xfs/linux-2.6/sv.h
+++ /dev/null
@@ -1,59 +0,0 @@
1/*
2 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_SUPPORT_SV_H__
19#define __XFS_SUPPORT_SV_H__
20
21#include <linux/wait.h>
22#include <linux/sched.h>
23#include <linux/spinlock.h>
24
25/*
26 * Synchronisation variables.
27 *
28 * (Parameters "pri", "svf" and "rts" are not implemented)
29 */
30
31typedef struct sv_s {
32 wait_queue_head_t waiters;
33} sv_t;
34
35static inline void _sv_wait(sv_t *sv, spinlock_t *lock)
36{
37 DECLARE_WAITQUEUE(wait, current);
38
39 add_wait_queue_exclusive(&sv->waiters, &wait);
40 __set_current_state(TASK_UNINTERRUPTIBLE);
41 spin_unlock(lock);
42
43 schedule();
44
45 remove_wait_queue(&sv->waiters, &wait);
46}
47
48#define sv_init(sv,flag,name) \
49 init_waitqueue_head(&(sv)->waiters)
50#define sv_destroy(sv) \
51 /*NOTHING*/
52#define sv_wait(sv, pri, lock, s) \
53 _sv_wait(sv, lock)
54#define sv_signal(sv) \
55 wake_up(&(sv)->waiters)
56#define sv_broadcast(sv) \
57 wake_up_all(&(sv)->waiters)
58
59#endif /* __XFS_SUPPORT_SV_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c
index b2771862fd3d..39f4f809bb68 100644
--- a/fs/xfs/linux-2.6/xfs_acl.c
+++ b/fs/xfs/linux-2.6/xfs_acl.c
@@ -219,12 +219,13 @@ xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
219} 219}
220 220
221int 221int
222xfs_check_acl(struct inode *inode, int mask) 222xfs_check_acl(struct inode *inode, int mask, unsigned int flags)
223{ 223{
224 struct xfs_inode *ip = XFS_I(inode); 224 struct xfs_inode *ip;
225 struct posix_acl *acl; 225 struct posix_acl *acl;
226 int error = -EAGAIN; 226 int error = -EAGAIN;
227 227
228 ip = XFS_I(inode);
228 trace_xfs_check_acl(ip); 229 trace_xfs_check_acl(ip);
229 230
230 /* 231 /*
@@ -234,6 +235,12 @@ xfs_check_acl(struct inode *inode, int mask)
234 if (!XFS_IFORK_Q(ip)) 235 if (!XFS_IFORK_Q(ip))
235 return -EAGAIN; 236 return -EAGAIN;
236 237
238 if (flags & IPERM_FLAG_RCU) {
239 if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
240 return -ECHILD;
241 return -EAGAIN;
242 }
243
237 acl = xfs_get_acl(inode, ACL_TYPE_ACCESS); 244 acl = xfs_get_acl(inode, ACL_TYPE_ACCESS);
238 if (IS_ERR(acl)) 245 if (IS_ERR(acl))
239 return PTR_ERR(acl); 246 return PTR_ERR(acl);
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 691f61223ed6..79ce38be15a1 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -38,15 +38,6 @@
38#include <linux/pagevec.h> 38#include <linux/pagevec.h>
39#include <linux/writeback.h> 39#include <linux/writeback.h>
40 40
41/*
42 * Types of I/O for bmap clustering and I/O completion tracking.
43 */
44enum {
45 IO_READ, /* mapping for a read */
46 IO_DELAY, /* mapping covers delalloc region */
47 IO_UNWRITTEN, /* mapping covers allocated but uninitialized data */
48 IO_NEW /* just allocated */
49};
50 41
51/* 42/*
52 * Prime number of hash buckets since address is used as the key. 43 * Prime number of hash buckets since address is used as the key.
@@ -182,9 +173,6 @@ xfs_setfilesize(
182 xfs_inode_t *ip = XFS_I(ioend->io_inode); 173 xfs_inode_t *ip = XFS_I(ioend->io_inode);
183 xfs_fsize_t isize; 174 xfs_fsize_t isize;
184 175
185 ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
186 ASSERT(ioend->io_type != IO_READ);
187
188 if (unlikely(ioend->io_error)) 176 if (unlikely(ioend->io_error))
189 return 0; 177 return 0;
190 178
@@ -244,10 +232,8 @@ xfs_end_io(
244 * We might have to update the on-disk file size after extending 232 * We might have to update the on-disk file size after extending
245 * writes. 233 * writes.
246 */ 234 */
247 if (ioend->io_type != IO_READ) { 235 error = xfs_setfilesize(ioend);
248 error = xfs_setfilesize(ioend); 236 ASSERT(!error || error == EAGAIN);
249 ASSERT(!error || error == EAGAIN);
250 }
251 237
252 /* 238 /*
253 * If we didn't complete processing of the ioend, requeue it to the 239 * If we didn't complete processing of the ioend, requeue it to the
@@ -318,14 +304,63 @@ STATIC int
318xfs_map_blocks( 304xfs_map_blocks(
319 struct inode *inode, 305 struct inode *inode,
320 loff_t offset, 306 loff_t offset,
321 ssize_t count,
322 struct xfs_bmbt_irec *imap, 307 struct xfs_bmbt_irec *imap,
323 int flags) 308 int type,
309 int nonblocking)
324{ 310{
325 int nmaps = 1; 311 struct xfs_inode *ip = XFS_I(inode);
326 int new = 0; 312 struct xfs_mount *mp = ip->i_mount;
313 ssize_t count = 1 << inode->i_blkbits;
314 xfs_fileoff_t offset_fsb, end_fsb;
315 int error = 0;
316 int bmapi_flags = XFS_BMAPI_ENTIRE;
317 int nimaps = 1;
318
319 if (XFS_FORCED_SHUTDOWN(mp))
320 return -XFS_ERROR(EIO);
321
322 if (type == IO_UNWRITTEN)
323 bmapi_flags |= XFS_BMAPI_IGSTATE;
324
325 if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
326 if (nonblocking)
327 return -XFS_ERROR(EAGAIN);
328 xfs_ilock(ip, XFS_ILOCK_SHARED);
329 }
327 330
328 return -xfs_iomap(XFS_I(inode), offset, count, flags, imap, &nmaps, &new); 331 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
332 (ip->i_df.if_flags & XFS_IFEXTENTS));
333 ASSERT(offset <= mp->m_maxioffset);
334
335 if (offset + count > mp->m_maxioffset)
336 count = mp->m_maxioffset - offset;
337 end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
338 offset_fsb = XFS_B_TO_FSBT(mp, offset);
339 error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb,
340 bmapi_flags, NULL, 0, imap, &nimaps, NULL);
341 xfs_iunlock(ip, XFS_ILOCK_SHARED);
342
343 if (error)
344 return -XFS_ERROR(error);
345
346 if (type == IO_DELALLOC &&
347 (!nimaps || isnullstartblock(imap->br_startblock))) {
348 error = xfs_iomap_write_allocate(ip, offset, count, imap);
349 if (!error)
350 trace_xfs_map_blocks_alloc(ip, offset, count, type, imap);
351 return -XFS_ERROR(error);
352 }
353
354#ifdef DEBUG
355 if (type == IO_UNWRITTEN) {
356 ASSERT(nimaps);
357 ASSERT(imap->br_startblock != HOLESTARTBLOCK);
358 ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
359 }
360#endif
361 if (nimaps)
362 trace_xfs_map_blocks_found(ip, offset, count, type, imap);
363 return 0;
329} 364}
330 365
331STATIC int 366STATIC int
@@ -378,28 +413,19 @@ xfs_submit_ioend_bio(
378 if (xfs_ioend_new_eof(ioend)) 413 if (xfs_ioend_new_eof(ioend))
379 xfs_mark_inode_dirty(XFS_I(ioend->io_inode)); 414 xfs_mark_inode_dirty(XFS_I(ioend->io_inode));
380 415
381 submit_bio(wbc->sync_mode == WB_SYNC_ALL ? 416 submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, bio);
382 WRITE_SYNC_PLUG : WRITE, bio);
383 ASSERT(!bio_flagged(bio, BIO_EOPNOTSUPP));
384 bio_put(bio);
385} 417}
386 418
387STATIC struct bio * 419STATIC struct bio *
388xfs_alloc_ioend_bio( 420xfs_alloc_ioend_bio(
389 struct buffer_head *bh) 421 struct buffer_head *bh)
390{ 422{
391 struct bio *bio;
392 int nvecs = bio_get_nr_vecs(bh->b_bdev); 423 int nvecs = bio_get_nr_vecs(bh->b_bdev);
393 424 struct bio *bio = bio_alloc(GFP_NOIO, nvecs);
394 do {
395 bio = bio_alloc(GFP_NOIO, nvecs);
396 nvecs >>= 1;
397 } while (!bio);
398 425
399 ASSERT(bio->bi_private == NULL); 426 ASSERT(bio->bi_private == NULL);
400 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); 427 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
401 bio->bi_bdev = bh->b_bdev; 428 bio->bi_bdev = bh->b_bdev;
402 bio_get(bio);
403 return bio; 429 return bio;
404} 430}
405 431
@@ -470,9 +496,8 @@ xfs_submit_ioend(
470 /* Pass 1 - start writeback */ 496 /* Pass 1 - start writeback */
471 do { 497 do {
472 next = ioend->io_list; 498 next = ioend->io_list;
473 for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) { 499 for (bh = ioend->io_buffer_head; bh; bh = bh->b_private)
474 xfs_start_buffer_writeback(bh); 500 xfs_start_buffer_writeback(bh);
475 }
476 } while ((ioend = next) != NULL); 501 } while ((ioend = next) != NULL);
477 502
478 /* Pass 2 - submit I/O */ 503 /* Pass 2 - submit I/O */
@@ -600,117 +625,13 @@ xfs_map_at_offset(
600 ASSERT(imap->br_startblock != HOLESTARTBLOCK); 625 ASSERT(imap->br_startblock != HOLESTARTBLOCK);
601 ASSERT(imap->br_startblock != DELAYSTARTBLOCK); 626 ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
602 627
603 lock_buffer(bh);
604 xfs_map_buffer(inode, bh, imap, offset); 628 xfs_map_buffer(inode, bh, imap, offset);
605 bh->b_bdev = xfs_find_bdev_for_inode(inode);
606 set_buffer_mapped(bh); 629 set_buffer_mapped(bh);
607 clear_buffer_delay(bh); 630 clear_buffer_delay(bh);
608 clear_buffer_unwritten(bh); 631 clear_buffer_unwritten(bh);
609} 632}
610 633
611/* 634/*
612 * Look for a page at index that is suitable for clustering.
613 */
614STATIC unsigned int
615xfs_probe_page(
616 struct page *page,
617 unsigned int pg_offset)
618{
619 struct buffer_head *bh, *head;
620 int ret = 0;
621
622 if (PageWriteback(page))
623 return 0;
624 if (!PageDirty(page))
625 return 0;
626 if (!page->mapping)
627 return 0;
628 if (!page_has_buffers(page))
629 return 0;
630
631 bh = head = page_buffers(page);
632 do {
633 if (!buffer_uptodate(bh))
634 break;
635 if (!buffer_mapped(bh))
636 break;
637 ret += bh->b_size;
638 if (ret >= pg_offset)
639 break;
640 } while ((bh = bh->b_this_page) != head);
641
642 return ret;
643}
644
645STATIC size_t
646xfs_probe_cluster(
647 struct inode *inode,
648 struct page *startpage,
649 struct buffer_head *bh,
650 struct buffer_head *head)
651{
652 struct pagevec pvec;
653 pgoff_t tindex, tlast, tloff;
654 size_t total = 0;
655 int done = 0, i;
656
657 /* First sum forwards in this page */
658 do {
659 if (!buffer_uptodate(bh) || !buffer_mapped(bh))
660 return total;
661 total += bh->b_size;
662 } while ((bh = bh->b_this_page) != head);
663
664 /* if we reached the end of the page, sum forwards in following pages */
665 tlast = i_size_read(inode) >> PAGE_CACHE_SHIFT;
666 tindex = startpage->index + 1;
667
668 /* Prune this back to avoid pathological behavior */
669 tloff = min(tlast, startpage->index + 64);
670
671 pagevec_init(&pvec, 0);
672 while (!done && tindex <= tloff) {
673 unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1);
674
675 if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len))
676 break;
677
678 for (i = 0; i < pagevec_count(&pvec); i++) {
679 struct page *page = pvec.pages[i];
680 size_t pg_offset, pg_len = 0;
681
682 if (tindex == tlast) {
683 pg_offset =
684 i_size_read(inode) & (PAGE_CACHE_SIZE - 1);
685 if (!pg_offset) {
686 done = 1;
687 break;
688 }
689 } else
690 pg_offset = PAGE_CACHE_SIZE;
691
692 if (page->index == tindex && trylock_page(page)) {
693 pg_len = xfs_probe_page(page, pg_offset);
694 unlock_page(page);
695 }
696
697 if (!pg_len) {
698 done = 1;
699 break;
700 }
701
702 total += pg_len;
703 tindex++;
704 }
705
706 pagevec_release(&pvec);
707 cond_resched();
708 }
709
710 return total;
711}
712
713/*
714 * Test if a given page is suitable for writing as part of an unwritten 635 * Test if a given page is suitable for writing as part of an unwritten
715 * or delayed allocate extent. 636 * or delayed allocate extent.
716 */ 637 */
@@ -731,9 +652,9 @@ xfs_is_delayed_page(
731 if (buffer_unwritten(bh)) 652 if (buffer_unwritten(bh))
732 acceptable = (type == IO_UNWRITTEN); 653 acceptable = (type == IO_UNWRITTEN);
733 else if (buffer_delay(bh)) 654 else if (buffer_delay(bh))
734 acceptable = (type == IO_DELAY); 655 acceptable = (type == IO_DELALLOC);
735 else if (buffer_dirty(bh) && buffer_mapped(bh)) 656 else if (buffer_dirty(bh) && buffer_mapped(bh))
736 acceptable = (type == IO_NEW); 657 acceptable = (type == IO_OVERWRITE);
737 else 658 else
738 break; 659 break;
739 } while ((bh = bh->b_this_page) != head); 660 } while ((bh = bh->b_this_page) != head);
@@ -758,8 +679,7 @@ xfs_convert_page(
758 loff_t tindex, 679 loff_t tindex,
759 struct xfs_bmbt_irec *imap, 680 struct xfs_bmbt_irec *imap,
760 xfs_ioend_t **ioendp, 681 xfs_ioend_t **ioendp,
761 struct writeback_control *wbc, 682 struct writeback_control *wbc)
762 int all_bh)
763{ 683{
764 struct buffer_head *bh, *head; 684 struct buffer_head *bh, *head;
765 xfs_off_t end_offset; 685 xfs_off_t end_offset;
@@ -814,37 +734,30 @@ xfs_convert_page(
814 continue; 734 continue;
815 } 735 }
816 736
817 if (buffer_unwritten(bh) || buffer_delay(bh)) { 737 if (buffer_unwritten(bh) || buffer_delay(bh) ||
738 buffer_mapped(bh)) {
818 if (buffer_unwritten(bh)) 739 if (buffer_unwritten(bh))
819 type = IO_UNWRITTEN; 740 type = IO_UNWRITTEN;
741 else if (buffer_delay(bh))
742 type = IO_DELALLOC;
820 else 743 else
821 type = IO_DELAY; 744 type = IO_OVERWRITE;
822 745
823 if (!xfs_imap_valid(inode, imap, offset)) { 746 if (!xfs_imap_valid(inode, imap, offset)) {
824 done = 1; 747 done = 1;
825 continue; 748 continue;
826 } 749 }
827 750
828 ASSERT(imap->br_startblock != HOLESTARTBLOCK); 751 lock_buffer(bh);
829 ASSERT(imap->br_startblock != DELAYSTARTBLOCK); 752 if (type != IO_OVERWRITE)
830 753 xfs_map_at_offset(inode, bh, imap, offset);
831 xfs_map_at_offset(inode, bh, imap, offset);
832 xfs_add_to_ioend(inode, bh, offset, type, 754 xfs_add_to_ioend(inode, bh, offset, type,
833 ioendp, done); 755 ioendp, done);
834 756
835 page_dirty--; 757 page_dirty--;
836 count++; 758 count++;
837 } else { 759 } else {
838 type = IO_NEW; 760 done = 1;
839 if (buffer_mapped(bh) && all_bh) {
840 lock_buffer(bh);
841 xfs_add_to_ioend(inode, bh, offset,
842 type, ioendp, done);
843 count++;
844 page_dirty--;
845 } else {
846 done = 1;
847 }
848 } 761 }
849 } while (offset += len, (bh = bh->b_this_page) != head); 762 } while (offset += len, (bh = bh->b_this_page) != head);
850 763
@@ -876,7 +789,6 @@ xfs_cluster_write(
876 struct xfs_bmbt_irec *imap, 789 struct xfs_bmbt_irec *imap,
877 xfs_ioend_t **ioendp, 790 xfs_ioend_t **ioendp,
878 struct writeback_control *wbc, 791 struct writeback_control *wbc,
879 int all_bh,
880 pgoff_t tlast) 792 pgoff_t tlast)
881{ 793{
882 struct pagevec pvec; 794 struct pagevec pvec;
@@ -891,7 +803,7 @@ xfs_cluster_write(
891 803
892 for (i = 0; i < pagevec_count(&pvec); i++) { 804 for (i = 0; i < pagevec_count(&pvec); i++) {
893 done = xfs_convert_page(inode, pvec.pages[i], tindex++, 805 done = xfs_convert_page(inode, pvec.pages[i], tindex++,
894 imap, ioendp, wbc, all_bh); 806 imap, ioendp, wbc);
895 if (done) 807 if (done)
896 break; 808 break;
897 } 809 }
@@ -935,13 +847,13 @@ xfs_aops_discard_page(
935 struct buffer_head *bh, *head; 847 struct buffer_head *bh, *head;
936 loff_t offset = page_offset(page); 848 loff_t offset = page_offset(page);
937 849
938 if (!xfs_is_delayed_page(page, IO_DELAY)) 850 if (!xfs_is_delayed_page(page, IO_DELALLOC))
939 goto out_invalidate; 851 goto out_invalidate;
940 852
941 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 853 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
942 goto out_invalidate; 854 goto out_invalidate;
943 855
944 xfs_fs_cmn_err(CE_ALERT, ip->i_mount, 856 xfs_alert(ip->i_mount,
945 "page discard on page %p, inode 0x%llx, offset %llu.", 857 "page discard on page %p, inode 0x%llx, offset %llu.",
946 page, ip->i_ino, offset); 858 page, ip->i_ino, offset);
947 859
@@ -959,7 +871,7 @@ xfs_aops_discard_page(
959 if (error) { 871 if (error) {
960 /* something screwed, just bail */ 872 /* something screwed, just bail */
961 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { 873 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
962 xfs_fs_cmn_err(CE_ALERT, ip->i_mount, 874 xfs_alert(ip->i_mount,
963 "page discard unable to remove delalloc mapping."); 875 "page discard unable to remove delalloc mapping.");
964 } 876 }
965 break; 877 break;
@@ -1002,10 +914,10 @@ xfs_vm_writepage(
1002 unsigned int type; 914 unsigned int type;
1003 __uint64_t end_offset; 915 __uint64_t end_offset;
1004 pgoff_t end_index, last_index; 916 pgoff_t end_index, last_index;
1005 ssize_t size, len; 917 ssize_t len;
1006 int flags, err, imap_valid = 0, uptodate = 1; 918 int err, imap_valid = 0, uptodate = 1;
1007 int count = 0; 919 int count = 0;
1008 int all_bh = 0; 920 int nonblocking = 0;
1009 921
1010 trace_xfs_writepage(inode, page, 0); 922 trace_xfs_writepage(inode, page, 0);
1011 923
@@ -1056,10 +968,14 @@ xfs_vm_writepage(
1056 968
1057 bh = head = page_buffers(page); 969 bh = head = page_buffers(page);
1058 offset = page_offset(page); 970 offset = page_offset(page);
1059 flags = BMAPI_READ; 971 type = IO_OVERWRITE;
1060 type = IO_NEW; 972
973 if (wbc->sync_mode == WB_SYNC_NONE && wbc->nonblocking)
974 nonblocking = 1;
1061 975
1062 do { 976 do {
977 int new_ioend = 0;
978
1063 if (offset >= end_offset) 979 if (offset >= end_offset)
1064 break; 980 break;
1065 if (!buffer_uptodate(bh)) 981 if (!buffer_uptodate(bh))
@@ -1076,90 +992,54 @@ xfs_vm_writepage(
1076 continue; 992 continue;
1077 } 993 }
1078 994
1079 if (imap_valid) 995 if (buffer_unwritten(bh)) {
1080 imap_valid = xfs_imap_valid(inode, &imap, offset); 996 if (type != IO_UNWRITTEN) {
1081
1082 if (buffer_unwritten(bh) || buffer_delay(bh)) {
1083 int new_ioend = 0;
1084
1085 /*
1086 * Make sure we don't use a read-only iomap
1087 */
1088 if (flags == BMAPI_READ)
1089 imap_valid = 0;
1090
1091 if (buffer_unwritten(bh)) {
1092 type = IO_UNWRITTEN; 997 type = IO_UNWRITTEN;
1093 flags = BMAPI_WRITE | BMAPI_IGNSTATE; 998 imap_valid = 0;
1094 } else if (buffer_delay(bh)) {
1095 type = IO_DELAY;
1096 flags = BMAPI_ALLOCATE;
1097
1098 if (wbc->sync_mode == WB_SYNC_NONE)
1099 flags |= BMAPI_TRYLOCK;
1100 }
1101
1102 if (!imap_valid) {
1103 /*
1104 * If we didn't have a valid mapping then we
1105 * need to ensure that we put the new mapping
1106 * in a new ioend structure. This needs to be
1107 * done to ensure that the ioends correctly
1108 * reflect the block mappings at io completion
1109 * for unwritten extent conversion.
1110 */
1111 new_ioend = 1;
1112 err = xfs_map_blocks(inode, offset, len,
1113 &imap, flags);
1114 if (err)
1115 goto error;
1116 imap_valid = xfs_imap_valid(inode, &imap,
1117 offset);
1118 } 999 }
1119 if (imap_valid) { 1000 } else if (buffer_delay(bh)) {
1120 xfs_map_at_offset(inode, bh, &imap, offset); 1001 if (type != IO_DELALLOC) {
1121 xfs_add_to_ioend(inode, bh, offset, type, 1002 type = IO_DELALLOC;
1122 &ioend, new_ioend); 1003 imap_valid = 0;
1123 count++;
1124 } 1004 }
1125 } else if (buffer_uptodate(bh)) { 1005 } else if (buffer_uptodate(bh)) {
1126 /* 1006 if (type != IO_OVERWRITE) {
1127 * we got here because the buffer is already mapped. 1007 type = IO_OVERWRITE;
1128 * That means it must already have extents allocated 1008 imap_valid = 0;
1129 * underneath it. Map the extent by reading it.
1130 */
1131 if (!imap_valid || flags != BMAPI_READ) {
1132 flags = BMAPI_READ;
1133 size = xfs_probe_cluster(inode, page, bh, head);
1134 err = xfs_map_blocks(inode, offset, size,
1135 &imap, flags);
1136 if (err)
1137 goto error;
1138 imap_valid = xfs_imap_valid(inode, &imap,
1139 offset);
1140 } 1009 }
1010 } else {
1011 if (PageUptodate(page)) {
1012 ASSERT(buffer_mapped(bh));
1013 imap_valid = 0;
1014 }
1015 continue;
1016 }
1141 1017
1018 if (imap_valid)
1019 imap_valid = xfs_imap_valid(inode, &imap, offset);
1020 if (!imap_valid) {
1142 /* 1021 /*
1143 * We set the type to IO_NEW in case we are doing a 1022 * If we didn't have a valid mapping then we need to
1144 * small write at EOF that is extending the file but 1023 * put the new mapping into a separate ioend structure.
1145 * without needing an allocation. We need to update the 1024 * This ensures non-contiguous extents always have
1146 * file size on I/O completion in this case so it is 1025 * separate ioends, which is particularly important
1147 * the same case as having just allocated a new extent 1026 * for unwritten extent conversion at I/O completion
1148 * that we are writing into for the first time. 1027 * time.
1149 */ 1028 */
1150 type = IO_NEW; 1029 new_ioend = 1;
1151 if (trylock_buffer(bh)) { 1030 err = xfs_map_blocks(inode, offset, &imap, type,
1152 if (imap_valid) 1031 nonblocking);
1153 all_bh = 1; 1032 if (err)
1154 xfs_add_to_ioend(inode, bh, offset, type, 1033 goto error;
1155 &ioend, !imap_valid); 1034 imap_valid = xfs_imap_valid(inode, &imap, offset);
1156 count++; 1035 }
1157 } else { 1036 if (imap_valid) {
1158 imap_valid = 0; 1037 lock_buffer(bh);
1159 } 1038 if (type != IO_OVERWRITE)
1160 } else if (PageUptodate(page)) { 1039 xfs_map_at_offset(inode, bh, &imap, offset);
1161 ASSERT(buffer_mapped(bh)); 1040 xfs_add_to_ioend(inode, bh, offset, type, &ioend,
1162 imap_valid = 0; 1041 new_ioend);
1042 count++;
1163 } 1043 }
1164 1044
1165 if (!iohead) 1045 if (!iohead)
@@ -1188,7 +1068,7 @@ xfs_vm_writepage(
1188 end_index = last_index; 1068 end_index = last_index;
1189 1069
1190 xfs_cluster_write(inode, page->index + 1, &imap, &ioend, 1070 xfs_cluster_write(inode, page->index + 1, &imap, &ioend,
1191 wbc, all_bh, end_index); 1071 wbc, end_index);
1192 } 1072 }
1193 1073
1194 if (iohead) 1074 if (iohead)
@@ -1257,13 +1137,19 @@ __xfs_get_blocks(
1257 int create, 1137 int create,
1258 int direct) 1138 int direct)
1259{ 1139{
1260 int flags = create ? BMAPI_WRITE : BMAPI_READ; 1140 struct xfs_inode *ip = XFS_I(inode);
1141 struct xfs_mount *mp = ip->i_mount;
1142 xfs_fileoff_t offset_fsb, end_fsb;
1143 int error = 0;
1144 int lockmode = 0;
1261 struct xfs_bmbt_irec imap; 1145 struct xfs_bmbt_irec imap;
1146 int nimaps = 1;
1262 xfs_off_t offset; 1147 xfs_off_t offset;
1263 ssize_t size; 1148 ssize_t size;
1264 int nimap = 1;
1265 int new = 0; 1149 int new = 0;
1266 int error; 1150
1151 if (XFS_FORCED_SHUTDOWN(mp))
1152 return -XFS_ERROR(EIO);
1267 1153
1268 offset = (xfs_off_t)iblock << inode->i_blkbits; 1154 offset = (xfs_off_t)iblock << inode->i_blkbits;
1269 ASSERT(bh_result->b_size >= (1 << inode->i_blkbits)); 1155 ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
@@ -1272,15 +1158,45 @@ __xfs_get_blocks(
1272 if (!create && direct && offset >= i_size_read(inode)) 1158 if (!create && direct && offset >= i_size_read(inode))
1273 return 0; 1159 return 0;
1274 1160
1275 if (direct && create) 1161 if (create) {
1276 flags |= BMAPI_DIRECT; 1162 lockmode = XFS_ILOCK_EXCL;
1163 xfs_ilock(ip, lockmode);
1164 } else {
1165 lockmode = xfs_ilock_map_shared(ip);
1166 }
1167
1168 ASSERT(offset <= mp->m_maxioffset);
1169 if (offset + size > mp->m_maxioffset)
1170 size = mp->m_maxioffset - offset;
1171 end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
1172 offset_fsb = XFS_B_TO_FSBT(mp, offset);
1277 1173
1278 error = xfs_iomap(XFS_I(inode), offset, size, flags, &imap, &nimap, 1174 error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb,
1279 &new); 1175 XFS_BMAPI_ENTIRE, NULL, 0, &imap, &nimaps, NULL);
1280 if (error) 1176 if (error)
1281 return -error; 1177 goto out_unlock;
1282 if (nimap == 0) 1178
1283 return 0; 1179 if (create &&
1180 (!nimaps ||
1181 (imap.br_startblock == HOLESTARTBLOCK ||
1182 imap.br_startblock == DELAYSTARTBLOCK))) {
1183 if (direct) {
1184 error = xfs_iomap_write_direct(ip, offset, size,
1185 &imap, nimaps);
1186 } else {
1187 error = xfs_iomap_write_delay(ip, offset, size, &imap);
1188 }
1189 if (error)
1190 goto out_unlock;
1191
1192 trace_xfs_get_blocks_alloc(ip, offset, size, 0, &imap);
1193 } else if (nimaps) {
1194 trace_xfs_get_blocks_found(ip, offset, size, 0, &imap);
1195 } else {
1196 trace_xfs_get_blocks_notfound(ip, offset, size);
1197 goto out_unlock;
1198 }
1199 xfs_iunlock(ip, lockmode);
1284 1200
1285 if (imap.br_startblock != HOLESTARTBLOCK && 1201 if (imap.br_startblock != HOLESTARTBLOCK &&
1286 imap.br_startblock != DELAYSTARTBLOCK) { 1202 imap.br_startblock != DELAYSTARTBLOCK) {
@@ -1347,6 +1263,10 @@ __xfs_get_blocks(
1347 } 1263 }
1348 1264
1349 return 0; 1265 return 0;
1266
1267out_unlock:
1268 xfs_iunlock(ip, lockmode);
1269 return -error;
1350} 1270}
1351 1271
1352int 1272int
@@ -1375,7 +1295,7 @@ xfs_get_blocks_direct(
1375 * If the private argument is non-NULL __xfs_get_blocks signals us that we 1295 * If the private argument is non-NULL __xfs_get_blocks signals us that we
1376 * need to issue a transaction to convert the range from unwritten to written 1296 * need to issue a transaction to convert the range from unwritten to written
1377 * extents. In case this is regular synchronous I/O we just call xfs_end_io 1297 * extents. In case this is regular synchronous I/O we just call xfs_end_io
1378 * to do this and we are done. But in case this was a successfull AIO 1298 * to do this and we are done. But in case this was a successful AIO
1379 * request this handler is called from interrupt context, from which we 1299 * request this handler is called from interrupt context, from which we
1380 * can't start transactions. In that case offload the I/O completion to 1300 * can't start transactions. In that case offload the I/O completion to
1381 * the workqueues we also use for buffered I/O completion. 1301 * the workqueues we also use for buffered I/O completion.
@@ -1434,7 +1354,7 @@ xfs_vm_direct_IO(
1434 ssize_t ret; 1354 ssize_t ret;
1435 1355
1436 if (rw & WRITE) { 1356 if (rw & WRITE) {
1437 iocb->private = xfs_alloc_ioend(inode, IO_NEW); 1357 iocb->private = xfs_alloc_ioend(inode, IO_DIRECT);
1438 1358
1439 ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, 1359 ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
1440 offset, nr_segs, 1360 offset, nr_segs,
@@ -1490,7 +1410,7 @@ xfs_vm_write_failed(
1490 if (error) { 1410 if (error) {
1491 /* something screwed, just bail */ 1411 /* something screwed, just bail */
1492 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { 1412 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
1493 xfs_fs_cmn_err(CE_ALERT, ip->i_mount, 1413 xfs_alert(ip->i_mount,
1494 "xfs_vm_write_failed: unable to clean up ino %lld", 1414 "xfs_vm_write_failed: unable to clean up ino %lld",
1495 ip->i_ino); 1415 ip->i_ino);
1496 } 1416 }
@@ -1574,7 +1494,6 @@ const struct address_space_operations xfs_address_space_operations = {
1574 .readpages = xfs_vm_readpages, 1494 .readpages = xfs_vm_readpages,
1575 .writepage = xfs_vm_writepage, 1495 .writepage = xfs_vm_writepage,
1576 .writepages = xfs_vm_writepages, 1496 .writepages = xfs_vm_writepages,
1577 .sync_page = block_sync_page,
1578 .releasepage = xfs_vm_releasepage, 1497 .releasepage = xfs_vm_releasepage,
1579 .invalidatepage = xfs_vm_invalidatepage, 1498 .invalidatepage = xfs_vm_invalidatepage,
1580 .write_begin = xfs_vm_write_begin, 1499 .write_begin = xfs_vm_write_begin,
diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h
index c5057fb6237a..71f721e1a71f 100644
--- a/fs/xfs/linux-2.6/xfs_aops.h
+++ b/fs/xfs/linux-2.6/xfs_aops.h
@@ -23,6 +23,22 @@ extern struct workqueue_struct *xfsconvertd_workqueue;
23extern mempool_t *xfs_ioend_pool; 23extern mempool_t *xfs_ioend_pool;
24 24
25/* 25/*
26 * Types of I/O for bmap clustering and I/O completion tracking.
27 */
28enum {
29 IO_DIRECT = 0, /* special case for direct I/O ioends */
30 IO_DELALLOC, /* mapping covers delalloc region */
31 IO_UNWRITTEN, /* mapping covers allocated but uninitialized data */
32 IO_OVERWRITE, /* mapping covers already allocated extent */
33};
34
35#define XFS_IO_TYPES \
36 { 0, "" }, \
37 { IO_DELALLOC, "delalloc" }, \
38 { IO_UNWRITTEN, "unwritten" }, \
39 { IO_OVERWRITE, "overwrite" }
40
41/*
26 * xfs_ioend struct manages large extent writes for XFS. 42 * xfs_ioend struct manages large extent writes for XFS.
27 * It can manage several multi-page bio's at once. 43 * It can manage several multi-page bio's at once.
28 */ 44 */
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 4c5deb6e9e31..9ef9ed2cfe2e 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -44,12 +44,7 @@
44 44
45static kmem_zone_t *xfs_buf_zone; 45static kmem_zone_t *xfs_buf_zone;
46STATIC int xfsbufd(void *); 46STATIC int xfsbufd(void *);
47STATIC int xfsbufd_wakeup(struct shrinker *, int, gfp_t);
48STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int); 47STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int);
49static struct shrinker xfs_buf_shake = {
50 .shrink = xfsbufd_wakeup,
51 .seeks = DEFAULT_SEEKS,
52};
53 48
54static struct workqueue_struct *xfslogd_workqueue; 49static struct workqueue_struct *xfslogd_workqueue;
55struct workqueue_struct *xfsdatad_workqueue; 50struct workqueue_struct *xfsdatad_workqueue;
@@ -99,77 +94,79 @@ xfs_buf_vmap_len(
99} 94}
100 95
101/* 96/*
102 * Page Region interfaces. 97 * xfs_buf_lru_add - add a buffer to the LRU.
103 * 98 *
104 * For pages in filesystems where the blocksize is smaller than the 99 * The LRU takes a new reference to the buffer so that it will only be freed
105 * pagesize, we use the page->private field (long) to hold a bitmap 100 * once the shrinker takes the buffer off the LRU.
106 * of uptodate regions within the page.
107 *
108 * Each such region is "bytes per page / bits per long" bytes long.
109 *
110 * NBPPR == number-of-bytes-per-page-region
111 * BTOPR == bytes-to-page-region (rounded up)
112 * BTOPRT == bytes-to-page-region-truncated (rounded down)
113 */ 101 */
114#if (BITS_PER_LONG == 32) 102STATIC void
115#define PRSHIFT (PAGE_CACHE_SHIFT - 5) /* (32 == 1<<5) */ 103xfs_buf_lru_add(
116#elif (BITS_PER_LONG == 64) 104 struct xfs_buf *bp)
117#define PRSHIFT (PAGE_CACHE_SHIFT - 6) /* (64 == 1<<6) */
118#else
119#error BITS_PER_LONG must be 32 or 64
120#endif
121#define NBPPR (PAGE_CACHE_SIZE/BITS_PER_LONG)
122#define BTOPR(b) (((unsigned int)(b) + (NBPPR - 1)) >> PRSHIFT)
123#define BTOPRT(b) (((unsigned int)(b) >> PRSHIFT))
124
125STATIC unsigned long
126page_region_mask(
127 size_t offset,
128 size_t length)
129{ 105{
130 unsigned long mask; 106 struct xfs_buftarg *btp = bp->b_target;
131 int first, final;
132
133 first = BTOPR(offset);
134 final = BTOPRT(offset + length - 1);
135 first = min(first, final);
136
137 mask = ~0UL;
138 mask <<= BITS_PER_LONG - (final - first);
139 mask >>= BITS_PER_LONG - (final);
140
141 ASSERT(offset + length <= PAGE_CACHE_SIZE);
142 ASSERT((final - first) < BITS_PER_LONG && (final - first) >= 0);
143 107
144 return mask; 108 spin_lock(&btp->bt_lru_lock);
109 if (list_empty(&bp->b_lru)) {
110 atomic_inc(&bp->b_hold);
111 list_add_tail(&bp->b_lru, &btp->bt_lru);
112 btp->bt_lru_nr++;
113 }
114 spin_unlock(&btp->bt_lru_lock);
145} 115}
146 116
117/*
118 * xfs_buf_lru_del - remove a buffer from the LRU
119 *
120 * The unlocked check is safe here because it only occurs when there are not
121 * b_lru_ref counts left on the inode under the pag->pag_buf_lock. it is there
122 * to optimise the shrinker removing the buffer from the LRU and calling
123 * xfs_buf_free(). i.e. it removes an unnecessary round trip on the
124 * bt_lru_lock.
125 */
147STATIC void 126STATIC void
148set_page_region( 127xfs_buf_lru_del(
149 struct page *page, 128 struct xfs_buf *bp)
150 size_t offset,
151 size_t length)
152{ 129{
153 set_page_private(page, 130 struct xfs_buftarg *btp = bp->b_target;
154 page_private(page) | page_region_mask(offset, length));
155 if (page_private(page) == ~0UL)
156 SetPageUptodate(page);
157}
158 131
159STATIC int 132 if (list_empty(&bp->b_lru))
160test_page_region( 133 return;
161 struct page *page,
162 size_t offset,
163 size_t length)
164{
165 unsigned long mask = page_region_mask(offset, length);
166 134
167 return (mask && (page_private(page) & mask) == mask); 135 spin_lock(&btp->bt_lru_lock);
136 if (!list_empty(&bp->b_lru)) {
137 list_del_init(&bp->b_lru);
138 btp->bt_lru_nr--;
139 }
140 spin_unlock(&btp->bt_lru_lock);
168} 141}
169 142
170/* 143/*
171 * Internal xfs_buf_t object manipulation 144 * When we mark a buffer stale, we remove the buffer from the LRU and clear the
145 * b_lru_ref count so that the buffer is freed immediately when the buffer
146 * reference count falls to zero. If the buffer is already on the LRU, we need
147 * to remove the reference that LRU holds on the buffer.
148 *
149 * This prevents build-up of stale buffers on the LRU.
172 */ 150 */
151void
152xfs_buf_stale(
153 struct xfs_buf *bp)
154{
155 bp->b_flags |= XBF_STALE;
156 atomic_set(&(bp)->b_lru_ref, 0);
157 if (!list_empty(&bp->b_lru)) {
158 struct xfs_buftarg *btp = bp->b_target;
159
160 spin_lock(&btp->bt_lru_lock);
161 if (!list_empty(&bp->b_lru)) {
162 list_del_init(&bp->b_lru);
163 btp->bt_lru_nr--;
164 atomic_dec(&bp->b_hold);
165 }
166 spin_unlock(&btp->bt_lru_lock);
167 }
168 ASSERT(atomic_read(&bp->b_hold) >= 1);
169}
173 170
174STATIC void 171STATIC void
175_xfs_buf_initialize( 172_xfs_buf_initialize(
@@ -186,7 +183,9 @@ _xfs_buf_initialize(
186 183
187 memset(bp, 0, sizeof(xfs_buf_t)); 184 memset(bp, 0, sizeof(xfs_buf_t));
188 atomic_set(&bp->b_hold, 1); 185 atomic_set(&bp->b_hold, 1);
186 atomic_set(&bp->b_lru_ref, 1);
189 init_completion(&bp->b_iowait); 187 init_completion(&bp->b_iowait);
188 INIT_LIST_HEAD(&bp->b_lru);
190 INIT_LIST_HEAD(&bp->b_list); 189 INIT_LIST_HEAD(&bp->b_list);
191 RB_CLEAR_NODE(&bp->b_rbnode); 190 RB_CLEAR_NODE(&bp->b_rbnode);
192 sema_init(&bp->b_sema, 0); /* held, no waiters */ 191 sema_init(&bp->b_sema, 0); /* held, no waiters */
@@ -262,7 +261,9 @@ xfs_buf_free(
262{ 261{
263 trace_xfs_buf_free(bp, _RET_IP_); 262 trace_xfs_buf_free(bp, _RET_IP_);
264 263
265 if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) { 264 ASSERT(list_empty(&bp->b_lru));
265
266 if (bp->b_flags & _XBF_PAGES) {
266 uint i; 267 uint i;
267 268
268 if (xfs_buf_is_vmapped(bp)) 269 if (xfs_buf_is_vmapped(bp))
@@ -272,56 +273,77 @@ xfs_buf_free(
272 for (i = 0; i < bp->b_page_count; i++) { 273 for (i = 0; i < bp->b_page_count; i++) {
273 struct page *page = bp->b_pages[i]; 274 struct page *page = bp->b_pages[i];
274 275
275 if (bp->b_flags & _XBF_PAGE_CACHE) 276 __free_page(page);
276 ASSERT(!PagePrivate(page));
277 page_cache_release(page);
278 } 277 }
279 } 278 } else if (bp->b_flags & _XBF_KMEM)
279 kmem_free(bp->b_addr);
280 _xfs_buf_free_pages(bp); 280 _xfs_buf_free_pages(bp);
281 xfs_buf_deallocate(bp); 281 xfs_buf_deallocate(bp);
282} 282}
283 283
284/* 284/*
285 * Finds all pages for buffer in question and builds it's page list. 285 * Allocates all the pages for buffer in question and builds it's page list.
286 */ 286 */
287STATIC int 287STATIC int
288_xfs_buf_lookup_pages( 288xfs_buf_allocate_memory(
289 xfs_buf_t *bp, 289 xfs_buf_t *bp,
290 uint flags) 290 uint flags)
291{ 291{
292 struct address_space *mapping = bp->b_target->bt_mapping;
293 size_t blocksize = bp->b_target->bt_bsize;
294 size_t size = bp->b_count_desired; 292 size_t size = bp->b_count_desired;
295 size_t nbytes, offset; 293 size_t nbytes, offset;
296 gfp_t gfp_mask = xb_to_gfp(flags); 294 gfp_t gfp_mask = xb_to_gfp(flags);
297 unsigned short page_count, i; 295 unsigned short page_count, i;
298 pgoff_t first;
299 xfs_off_t end; 296 xfs_off_t end;
300 int error; 297 int error;
301 298
299 /*
300 * for buffers that are contained within a single page, just allocate
301 * the memory from the heap - there's no need for the complexity of
302 * page arrays to keep allocation down to order 0.
303 */
304 if (bp->b_buffer_length < PAGE_SIZE) {
305 bp->b_addr = kmem_alloc(bp->b_buffer_length, xb_to_km(flags));
306 if (!bp->b_addr) {
307 /* low memory - use alloc_page loop instead */
308 goto use_alloc_page;
309 }
310
311 if (((unsigned long)(bp->b_addr + bp->b_buffer_length - 1) &
312 PAGE_MASK) !=
313 ((unsigned long)bp->b_addr & PAGE_MASK)) {
314 /* b_addr spans two pages - use alloc_page instead */
315 kmem_free(bp->b_addr);
316 bp->b_addr = NULL;
317 goto use_alloc_page;
318 }
319 bp->b_offset = offset_in_page(bp->b_addr);
320 bp->b_pages = bp->b_page_array;
321 bp->b_pages[0] = virt_to_page(bp->b_addr);
322 bp->b_page_count = 1;
323 bp->b_flags |= XBF_MAPPED | _XBF_KMEM;
324 return 0;
325 }
326
327use_alloc_page:
302 end = bp->b_file_offset + bp->b_buffer_length; 328 end = bp->b_file_offset + bp->b_buffer_length;
303 page_count = xfs_buf_btoc(end) - xfs_buf_btoct(bp->b_file_offset); 329 page_count = xfs_buf_btoc(end) - xfs_buf_btoct(bp->b_file_offset);
304
305 error = _xfs_buf_get_pages(bp, page_count, flags); 330 error = _xfs_buf_get_pages(bp, page_count, flags);
306 if (unlikely(error)) 331 if (unlikely(error))
307 return error; 332 return error;
308 bp->b_flags |= _XBF_PAGE_CACHE;
309 333
310 offset = bp->b_offset; 334 offset = bp->b_offset;
311 first = bp->b_file_offset >> PAGE_CACHE_SHIFT; 335 bp->b_flags |= _XBF_PAGES;
312 336
313 for (i = 0; i < bp->b_page_count; i++) { 337 for (i = 0; i < bp->b_page_count; i++) {
314 struct page *page; 338 struct page *page;
315 uint retries = 0; 339 uint retries = 0;
316 340retry:
317 retry: 341 page = alloc_page(gfp_mask);
318 page = find_or_create_page(mapping, first + i, gfp_mask);
319 if (unlikely(page == NULL)) { 342 if (unlikely(page == NULL)) {
320 if (flags & XBF_READ_AHEAD) { 343 if (flags & XBF_READ_AHEAD) {
321 bp->b_page_count = i; 344 bp->b_page_count = i;
322 for (i = 0; i < bp->b_page_count; i++) 345 error = ENOMEM;
323 unlock_page(bp->b_pages[i]); 346 goto out_free_pages;
324 return -ENOMEM;
325 } 347 }
326 348
327 /* 349 /*
@@ -331,65 +353,55 @@ _xfs_buf_lookup_pages(
331 * handle buffer allocation failures we can't do much. 353 * handle buffer allocation failures we can't do much.
332 */ 354 */
333 if (!(++retries % 100)) 355 if (!(++retries % 100))
334 printk(KERN_ERR 356 xfs_err(NULL,
335 "XFS: possible memory allocation " 357 "possible memory allocation deadlock in %s (mode:0x%x)",
336 "deadlock in %s (mode:0x%x)\n",
337 __func__, gfp_mask); 358 __func__, gfp_mask);
338 359
339 XFS_STATS_INC(xb_page_retries); 360 XFS_STATS_INC(xb_page_retries);
340 xfsbufd_wakeup(NULL, 0, gfp_mask);
341 congestion_wait(BLK_RW_ASYNC, HZ/50); 361 congestion_wait(BLK_RW_ASYNC, HZ/50);
342 goto retry; 362 goto retry;
343 } 363 }
344 364
345 XFS_STATS_INC(xb_page_found); 365 XFS_STATS_INC(xb_page_found);
346 366
347 nbytes = min_t(size_t, size, PAGE_CACHE_SIZE - offset); 367 nbytes = min_t(size_t, size, PAGE_SIZE - offset);
348 size -= nbytes; 368 size -= nbytes;
349
350 ASSERT(!PagePrivate(page));
351 if (!PageUptodate(page)) {
352 page_count--;
353 if (blocksize >= PAGE_CACHE_SIZE) {
354 if (flags & XBF_READ)
355 bp->b_flags |= _XBF_PAGE_LOCKED;
356 } else if (!PagePrivate(page)) {
357 if (test_page_region(page, offset, nbytes))
358 page_count++;
359 }
360 }
361
362 bp->b_pages[i] = page; 369 bp->b_pages[i] = page;
363 offset = 0; 370 offset = 0;
364 } 371 }
372 return 0;
365 373
366 if (!(bp->b_flags & _XBF_PAGE_LOCKED)) { 374out_free_pages:
367 for (i = 0; i < bp->b_page_count; i++) 375 for (i = 0; i < bp->b_page_count; i++)
368 unlock_page(bp->b_pages[i]); 376 __free_page(bp->b_pages[i]);
369 }
370
371 if (page_count == bp->b_page_count)
372 bp->b_flags |= XBF_DONE;
373
374 return error; 377 return error;
375} 378}
376 379
377/* 380/*
378 * Map buffer into kernel address-space if nessecary. 381 * Map buffer into kernel address-space if necessary.
379 */ 382 */
380STATIC int 383STATIC int
381_xfs_buf_map_pages( 384_xfs_buf_map_pages(
382 xfs_buf_t *bp, 385 xfs_buf_t *bp,
383 uint flags) 386 uint flags)
384{ 387{
385 /* A single page buffer is always mappable */ 388 ASSERT(bp->b_flags & _XBF_PAGES);
386 if (bp->b_page_count == 1) { 389 if (bp->b_page_count == 1) {
390 /* A single page buffer is always mappable */
387 bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset; 391 bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
388 bp->b_flags |= XBF_MAPPED; 392 bp->b_flags |= XBF_MAPPED;
389 } else if (flags & XBF_MAPPED) { 393 } else if (flags & XBF_MAPPED) {
390 bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count, 394 int retried = 0;
391 -1, PAGE_KERNEL); 395
392 if (unlikely(bp->b_addr == NULL)) 396 do {
397 bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
398 -1, PAGE_KERNEL);
399 if (bp->b_addr)
400 break;
401 vm_unmap_aliases();
402 } while (retried++ <= 1);
403
404 if (!bp->b_addr)
393 return -ENOMEM; 405 return -ENOMEM;
394 bp->b_addr += bp->b_offset; 406 bp->b_addr += bp->b_offset;
395 bp->b_flags |= XBF_MAPPED; 407 bp->b_flags |= XBF_MAPPED;
@@ -500,9 +512,14 @@ found:
500 } 512 }
501 } 513 }
502 514
515 /*
516 * if the buffer is stale, clear all the external state associated with
517 * it. We need to keep flags such as how we allocated the buffer memory
518 * intact here.
519 */
503 if (bp->b_flags & XBF_STALE) { 520 if (bp->b_flags & XBF_STALE) {
504 ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); 521 ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
505 bp->b_flags &= XBF_MAPPED; 522 bp->b_flags &= XBF_MAPPED | _XBF_KMEM | _XBF_PAGES;
506 } 523 }
507 524
508 trace_xfs_buf_find(bp, flags, _RET_IP_); 525 trace_xfs_buf_find(bp, flags, _RET_IP_);
@@ -523,7 +540,7 @@ xfs_buf_get(
523 xfs_buf_flags_t flags) 540 xfs_buf_flags_t flags)
524{ 541{
525 xfs_buf_t *bp, *new_bp; 542 xfs_buf_t *bp, *new_bp;
526 int error = 0, i; 543 int error = 0;
527 544
528 new_bp = xfs_buf_allocate(flags); 545 new_bp = xfs_buf_allocate(flags);
529 if (unlikely(!new_bp)) 546 if (unlikely(!new_bp))
@@ -531,7 +548,7 @@ xfs_buf_get(
531 548
532 bp = _xfs_buf_find(target, ioff, isize, flags, new_bp); 549 bp = _xfs_buf_find(target, ioff, isize, flags, new_bp);
533 if (bp == new_bp) { 550 if (bp == new_bp) {
534 error = _xfs_buf_lookup_pages(bp, flags); 551 error = xfs_buf_allocate_memory(bp, flags);
535 if (error) 552 if (error)
536 goto no_buffer; 553 goto no_buffer;
537 } else { 554 } else {
@@ -540,14 +557,11 @@ xfs_buf_get(
540 return NULL; 557 return NULL;
541 } 558 }
542 559
543 for (i = 0; i < bp->b_page_count; i++)
544 mark_page_accessed(bp->b_pages[i]);
545
546 if (!(bp->b_flags & XBF_MAPPED)) { 560 if (!(bp->b_flags & XBF_MAPPED)) {
547 error = _xfs_buf_map_pages(bp, flags); 561 error = _xfs_buf_map_pages(bp, flags);
548 if (unlikely(error)) { 562 if (unlikely(error)) {
549 printk(KERN_WARNING "%s: failed to map pages\n", 563 xfs_warn(target->bt_mount,
550 __func__); 564 "%s: failed to map pages\n", __func__);
551 goto no_buffer; 565 goto no_buffer;
552 } 566 }
553 } 567 }
@@ -641,10 +655,7 @@ xfs_buf_readahead(
641 xfs_off_t ioff, 655 xfs_off_t ioff,
642 size_t isize) 656 size_t isize)
643{ 657{
644 struct backing_dev_info *bdi; 658 if (bdi_read_congested(target->bt_bdi))
645
646 bdi = target->bt_mapping->backing_dev_info;
647 if (bdi_read_congested(bdi))
648 return; 659 return;
649 660
650 xfs_buf_read(target, ioff, isize, 661 xfs_buf_read(target, ioff, isize,
@@ -722,10 +733,10 @@ xfs_buf_associate_memory(
722 size_t buflen; 733 size_t buflen;
723 int page_count; 734 int page_count;
724 735
725 pageaddr = (unsigned long)mem & PAGE_CACHE_MASK; 736 pageaddr = (unsigned long)mem & PAGE_MASK;
726 offset = (unsigned long)mem - pageaddr; 737 offset = (unsigned long)mem - pageaddr;
727 buflen = PAGE_CACHE_ALIGN(len + offset); 738 buflen = PAGE_ALIGN(len + offset);
728 page_count = buflen >> PAGE_CACHE_SHIFT; 739 page_count = buflen >> PAGE_SHIFT;
729 740
730 /* Free any previous set of page pointers */ 741 /* Free any previous set of page pointers */
731 if (bp->b_pages) 742 if (bp->b_pages)
@@ -742,13 +753,12 @@ xfs_buf_associate_memory(
742 753
743 for (i = 0; i < bp->b_page_count; i++) { 754 for (i = 0; i < bp->b_page_count; i++) {
744 bp->b_pages[i] = mem_to_page((void *)pageaddr); 755 bp->b_pages[i] = mem_to_page((void *)pageaddr);
745 pageaddr += PAGE_CACHE_SIZE; 756 pageaddr += PAGE_SIZE;
746 } 757 }
747 758
748 bp->b_count_desired = len; 759 bp->b_count_desired = len;
749 bp->b_buffer_length = buflen; 760 bp->b_buffer_length = buflen;
750 bp->b_flags |= XBF_MAPPED; 761 bp->b_flags |= XBF_MAPPED;
751 bp->b_flags &= ~_XBF_PAGE_LOCKED;
752 762
753 return 0; 763 return 0;
754} 764}
@@ -781,8 +791,8 @@ xfs_buf_get_uncached(
781 791
782 error = _xfs_buf_map_pages(bp, XBF_MAPPED); 792 error = _xfs_buf_map_pages(bp, XBF_MAPPED);
783 if (unlikely(error)) { 793 if (unlikely(error)) {
784 printk(KERN_WARNING "%s: failed to map pages\n", 794 xfs_warn(target->bt_mount,
785 __func__); 795 "%s: failed to map pages\n", __func__);
786 goto fail_free_mem; 796 goto fail_free_mem;
787 } 797 }
788 798
@@ -827,7 +837,7 @@ xfs_buf_rele(
827 trace_xfs_buf_rele(bp, _RET_IP_); 837 trace_xfs_buf_rele(bp, _RET_IP_);
828 838
829 if (!pag) { 839 if (!pag) {
830 ASSERT(!bp->b_relse); 840 ASSERT(list_empty(&bp->b_lru));
831 ASSERT(RB_EMPTY_NODE(&bp->b_rbnode)); 841 ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
832 if (atomic_dec_and_test(&bp->b_hold)) 842 if (atomic_dec_and_test(&bp->b_hold))
833 xfs_buf_free(bp); 843 xfs_buf_free(bp);
@@ -835,13 +845,15 @@ xfs_buf_rele(
835 } 845 }
836 846
837 ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode)); 847 ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));
848
838 ASSERT(atomic_read(&bp->b_hold) > 0); 849 ASSERT(atomic_read(&bp->b_hold) > 0);
839 if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) { 850 if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
840 if (bp->b_relse) { 851 if (!(bp->b_flags & XBF_STALE) &&
841 atomic_inc(&bp->b_hold); 852 atomic_read(&bp->b_lru_ref)) {
853 xfs_buf_lru_add(bp);
842 spin_unlock(&pag->pag_buf_lock); 854 spin_unlock(&pag->pag_buf_lock);
843 bp->b_relse(bp);
844 } else { 855 } else {
856 xfs_buf_lru_del(bp);
845 ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q))); 857 ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)));
846 rb_erase(&bp->b_rbnode, &pag->pag_buf_tree); 858 rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
847 spin_unlock(&pag->pag_buf_lock); 859 spin_unlock(&pag->pag_buf_lock);
@@ -853,20 +865,7 @@ xfs_buf_rele(
853 865
854 866
855/* 867/*
856 * Mutual exclusion on buffers. Locking model: 868 * Lock a buffer object, if it is not already locked.
857 *
858 * Buffers associated with inodes for which buffer locking
859 * is not enabled are not protected by semaphores, and are
860 * assumed to be exclusively owned by the caller. There is a
861 * spinlock in the buffer, used by the caller when concurrent
862 * access is possible.
863 */
864
865/*
866 * Locks a buffer object, if it is not already locked. Note that this in
867 * no way locks the underlying pages, so it is only useful for
868 * synchronizing concurrent use of buffer objects, not for synchronizing
869 * independent access to the underlying pages.
870 * 869 *
871 * If we come across a stale, pinned, locked buffer, we know that we are 870 * If we come across a stale, pinned, locked buffer, we know that we are
872 * being asked to lock a buffer that has been reallocated. Because it is 871 * being asked to lock a buffer that has been reallocated. Because it is
@@ -900,10 +899,7 @@ xfs_buf_lock_value(
900} 899}
901 900
902/* 901/*
903 * Locks a buffer object. 902 * Lock a buffer object.
904 * Note that this in no way locks the underlying pages, so it is only
905 * useful for synchronizing concurrent use of buffer objects, not for
906 * synchronizing independent access to the underlying pages.
907 * 903 *
908 * If we come across a stale, pinned, locked buffer, we know that we 904 * If we come across a stale, pinned, locked buffer, we know that we
909 * are being asked to lock a buffer that has been reallocated. Because 905 * are being asked to lock a buffer that has been reallocated. Because
@@ -919,8 +915,6 @@ xfs_buf_lock(
919 915
920 if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE)) 916 if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
921 xfs_log_force(bp->b_target->bt_mount, 0); 917 xfs_log_force(bp->b_target->bt_mount, 0);
922 if (atomic_read(&bp->b_io_remaining))
923 blk_run_address_space(bp->b_target->bt_mapping);
924 down(&bp->b_sema); 918 down(&bp->b_sema);
925 XB_SET_OWNER(bp); 919 XB_SET_OWNER(bp);
926 920
@@ -964,9 +958,7 @@ xfs_buf_wait_unpin(
964 set_current_state(TASK_UNINTERRUPTIBLE); 958 set_current_state(TASK_UNINTERRUPTIBLE);
965 if (atomic_read(&bp->b_pin_count) == 0) 959 if (atomic_read(&bp->b_pin_count) == 0)
966 break; 960 break;
967 if (atomic_read(&bp->b_io_remaining)) 961 io_schedule();
968 blk_run_address_space(bp->b_target->bt_mapping);
969 schedule();
970 } 962 }
971 remove_wait_queue(&bp->b_waiters, &wait); 963 remove_wait_queue(&bp->b_waiters, &wait);
972 set_current_state(TASK_RUNNING); 964 set_current_state(TASK_RUNNING);
@@ -1178,10 +1170,8 @@ _xfs_buf_ioend(
1178 xfs_buf_t *bp, 1170 xfs_buf_t *bp,
1179 int schedule) 1171 int schedule)
1180{ 1172{
1181 if (atomic_dec_and_test(&bp->b_io_remaining) == 1) { 1173 if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
1182 bp->b_flags &= ~_XBF_PAGE_LOCKED;
1183 xfs_buf_ioend(bp, schedule); 1174 xfs_buf_ioend(bp, schedule);
1184 }
1185} 1175}
1186 1176
1187STATIC void 1177STATIC void
@@ -1190,35 +1180,12 @@ xfs_buf_bio_end_io(
1190 int error) 1180 int error)
1191{ 1181{
1192 xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private; 1182 xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private;
1193 unsigned int blocksize = bp->b_target->bt_bsize;
1194 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1195 1183
1196 xfs_buf_ioerror(bp, -error); 1184 xfs_buf_ioerror(bp, -error);
1197 1185
1198 if (!error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ)) 1186 if (!error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
1199 invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp)); 1187 invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
1200 1188
1201 do {
1202 struct page *page = bvec->bv_page;
1203
1204 ASSERT(!PagePrivate(page));
1205 if (unlikely(bp->b_error)) {
1206 if (bp->b_flags & XBF_READ)
1207 ClearPageUptodate(page);
1208 } else if (blocksize >= PAGE_CACHE_SIZE) {
1209 SetPageUptodate(page);
1210 } else if (!PagePrivate(page) &&
1211 (bp->b_flags & _XBF_PAGE_CACHE)) {
1212 set_page_region(page, bvec->bv_offset, bvec->bv_len);
1213 }
1214
1215 if (--bvec >= bio->bi_io_vec)
1216 prefetchw(&bvec->bv_page->flags);
1217
1218 if (bp->b_flags & _XBF_PAGE_LOCKED)
1219 unlock_page(page);
1220 } while (bvec >= bio->bi_io_vec);
1221
1222 _xfs_buf_ioend(bp, 1); 1189 _xfs_buf_ioend(bp, 1);
1223 bio_put(bio); 1190 bio_put(bio);
1224} 1191}
@@ -1232,7 +1199,6 @@ _xfs_buf_ioapply(
1232 int offset = bp->b_offset; 1199 int offset = bp->b_offset;
1233 int size = bp->b_count_desired; 1200 int size = bp->b_count_desired;
1234 sector_t sector = bp->b_bn; 1201 sector_t sector = bp->b_bn;
1235 unsigned int blocksize = bp->b_target->bt_bsize;
1236 1202
1237 total_nr_pages = bp->b_page_count; 1203 total_nr_pages = bp->b_page_count;
1238 map_i = 0; 1204 map_i = 0;
@@ -1253,29 +1219,6 @@ _xfs_buf_ioapply(
1253 (bp->b_flags & XBF_READ_AHEAD) ? READA : READ; 1219 (bp->b_flags & XBF_READ_AHEAD) ? READA : READ;
1254 } 1220 }
1255 1221
1256 /* Special code path for reading a sub page size buffer in --
1257 * we populate up the whole page, and hence the other metadata
1258 * in the same page. This optimization is only valid when the
1259 * filesystem block size is not smaller than the page size.
1260 */
1261 if ((bp->b_buffer_length < PAGE_CACHE_SIZE) &&
1262 ((bp->b_flags & (XBF_READ|_XBF_PAGE_LOCKED)) ==
1263 (XBF_READ|_XBF_PAGE_LOCKED)) &&
1264 (blocksize >= PAGE_CACHE_SIZE)) {
1265 bio = bio_alloc(GFP_NOIO, 1);
1266
1267 bio->bi_bdev = bp->b_target->bt_bdev;
1268 bio->bi_sector = sector - (offset >> BBSHIFT);
1269 bio->bi_end_io = xfs_buf_bio_end_io;
1270 bio->bi_private = bp;
1271
1272 bio_add_page(bio, bp->b_pages[0], PAGE_CACHE_SIZE, 0);
1273 size = 0;
1274
1275 atomic_inc(&bp->b_io_remaining);
1276
1277 goto submit_io;
1278 }
1279 1222
1280next_chunk: 1223next_chunk:
1281 atomic_inc(&bp->b_io_remaining); 1224 atomic_inc(&bp->b_io_remaining);
@@ -1289,8 +1232,9 @@ next_chunk:
1289 bio->bi_end_io = xfs_buf_bio_end_io; 1232 bio->bi_end_io = xfs_buf_bio_end_io;
1290 bio->bi_private = bp; 1233 bio->bi_private = bp;
1291 1234
1235
1292 for (; size && nr_pages; nr_pages--, map_i++) { 1236 for (; size && nr_pages; nr_pages--, map_i++) {
1293 int rbytes, nbytes = PAGE_CACHE_SIZE - offset; 1237 int rbytes, nbytes = PAGE_SIZE - offset;
1294 1238
1295 if (nbytes > size) 1239 if (nbytes > size)
1296 nbytes = size; 1240 nbytes = size;
@@ -1305,7 +1249,6 @@ next_chunk:
1305 total_nr_pages--; 1249 total_nr_pages--;
1306 } 1250 }
1307 1251
1308submit_io:
1309 if (likely(bio->bi_size)) { 1252 if (likely(bio->bi_size)) {
1310 if (xfs_buf_is_vmapped(bp)) { 1253 if (xfs_buf_is_vmapped(bp)) {
1311 flush_kernel_vmap_range(bp->b_addr, 1254 flush_kernel_vmap_range(bp->b_addr,
@@ -1315,18 +1258,7 @@ submit_io:
1315 if (size) 1258 if (size)
1316 goto next_chunk; 1259 goto next_chunk;
1317 } else { 1260 } else {
1318 /*
1319 * if we get here, no pages were added to the bio. However,
1320 * we can't just error out here - if the pages are locked then
1321 * we have to unlock them otherwise we can hang on a later
1322 * access to the page.
1323 */
1324 xfs_buf_ioerror(bp, EIO); 1261 xfs_buf_ioerror(bp, EIO);
1325 if (bp->b_flags & _XBF_PAGE_LOCKED) {
1326 int i;
1327 for (i = 0; i < bp->b_page_count; i++)
1328 unlock_page(bp->b_pages[i]);
1329 }
1330 bio_put(bio); 1262 bio_put(bio);
1331 } 1263 }
1332} 1264}
@@ -1371,8 +1303,6 @@ xfs_buf_iowait(
1371{ 1303{
1372 trace_xfs_buf_iowait(bp, _RET_IP_); 1304 trace_xfs_buf_iowait(bp, _RET_IP_);
1373 1305
1374 if (atomic_read(&bp->b_io_remaining))
1375 blk_run_address_space(bp->b_target->bt_mapping);
1376 wait_for_completion(&bp->b_iowait); 1306 wait_for_completion(&bp->b_iowait);
1377 1307
1378 trace_xfs_buf_iowait_done(bp, _RET_IP_); 1308 trace_xfs_buf_iowait_done(bp, _RET_IP_);
@@ -1390,8 +1320,8 @@ xfs_buf_offset(
1390 return XFS_BUF_PTR(bp) + offset; 1320 return XFS_BUF_PTR(bp) + offset;
1391 1321
1392 offset += bp->b_offset; 1322 offset += bp->b_offset;
1393 page = bp->b_pages[offset >> PAGE_CACHE_SHIFT]; 1323 page = bp->b_pages[offset >> PAGE_SHIFT];
1394 return (xfs_caddr_t)page_address(page) + (offset & (PAGE_CACHE_SIZE-1)); 1324 return (xfs_caddr_t)page_address(page) + (offset & (PAGE_SIZE-1));
1395} 1325}
1396 1326
1397/* 1327/*
@@ -1413,9 +1343,9 @@ xfs_buf_iomove(
1413 page = bp->b_pages[xfs_buf_btoct(boff + bp->b_offset)]; 1343 page = bp->b_pages[xfs_buf_btoct(boff + bp->b_offset)];
1414 cpoff = xfs_buf_poff(boff + bp->b_offset); 1344 cpoff = xfs_buf_poff(boff + bp->b_offset);
1415 csize = min_t(size_t, 1345 csize = min_t(size_t,
1416 PAGE_CACHE_SIZE-cpoff, bp->b_count_desired-boff); 1346 PAGE_SIZE-cpoff, bp->b_count_desired-boff);
1417 1347
1418 ASSERT(((csize + cpoff) <= PAGE_CACHE_SIZE)); 1348 ASSERT(((csize + cpoff) <= PAGE_SIZE));
1419 1349
1420 switch (mode) { 1350 switch (mode) {
1421 case XBRW_ZERO: 1351 case XBRW_ZERO:
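The two hunks above swap PAGE_CACHE_* for the plain PAGE_* macros while keeping the same page/offset arithmetic: the page index is the buffer offset shifted down by the page shift, and the byte within that page is the offset masked by PAGE_SIZE-1. A standalone sanity-check of that split, assuming 4k pages (names here are illustrative, not kernel API):

/* Sketch of the page index / in-page offset split used by xfs_buf_offset(). */
#include <assert.h>
#include <stddef.h>

#define SKETCH_PAGE_SHIFT	12			/* assume 4k pages */
#define SKETCH_PAGE_SIZE	(1UL << SKETCH_PAGE_SHIFT)

static void split_offset(size_t offset, size_t *page_index, size_t *in_page)
{
	*page_index = offset >> SKETCH_PAGE_SHIFT;		/* which backing page */
	*in_page    = offset & (SKETCH_PAGE_SIZE - 1);		/* byte inside it */
}

int main(void)
{
	size_t idx, off;

	split_offset(9000, &idx, &off);
	assert(idx == 2 && off == 9000 - 2 * SKETCH_PAGE_SIZE);
	return 0;
}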
@@ -1438,51 +1368,84 @@ xfs_buf_iomove(
1438 */ 1368 */
1439 1369
1440/* 1370/*
1441 * Wait for any bufs with callbacks that have been submitted but 1371 * Wait for any bufs with callbacks that have been submitted but have not yet
1442 * have not yet returned... walk the hash list for the target. 1372 * returned. These buffers will have an elevated hold count, so wait on those
1373 * while freeing all the buffers only held by the LRU.
1443 */ 1374 */
1444void 1375void
1445xfs_wait_buftarg( 1376xfs_wait_buftarg(
1446 struct xfs_buftarg *btp) 1377 struct xfs_buftarg *btp)
1447{ 1378{
1448 struct xfs_perag *pag; 1379 struct xfs_buf *bp;
1449 uint i;
1450 1380
1451 for (i = 0; i < btp->bt_mount->m_sb.sb_agcount; i++) { 1381restart:
1452 pag = xfs_perag_get(btp->bt_mount, i); 1382 spin_lock(&btp->bt_lru_lock);
1453 spin_lock(&pag->pag_buf_lock); 1383 while (!list_empty(&btp->bt_lru)) {
1454 while (rb_first(&pag->pag_buf_tree)) { 1384 bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
1455 spin_unlock(&pag->pag_buf_lock); 1385 if (atomic_read(&bp->b_hold) > 1) {
1386 spin_unlock(&btp->bt_lru_lock);
1456 delay(100); 1387 delay(100);
1457 spin_lock(&pag->pag_buf_lock); 1388 goto restart;
1458 } 1389 }
1459 spin_unlock(&pag->pag_buf_lock); 1390 /*
 1460 xfs_perag_put(pag); 1391 * clear the LRU reference count so the buffer doesn't get
1392 * ignored in xfs_buf_rele().
1393 */
1394 atomic_set(&bp->b_lru_ref, 0);
1395 spin_unlock(&btp->bt_lru_lock);
1396 xfs_buf_rele(bp);
1397 spin_lock(&btp->bt_lru_lock);
1461 } 1398 }
1399 spin_unlock(&btp->bt_lru_lock);
1462} 1400}
1463 1401
1464/* 1402int
1465 * buftarg list for delwrite queue processing 1403xfs_buftarg_shrink(
1466 */ 1404 struct shrinker *shrink,
1467static LIST_HEAD(xfs_buftarg_list); 1405 int nr_to_scan,
1468static DEFINE_SPINLOCK(xfs_buftarg_lock); 1406 gfp_t mask)
1469
1470STATIC void
1471xfs_register_buftarg(
1472 xfs_buftarg_t *btp)
1473{ 1407{
1474 spin_lock(&xfs_buftarg_lock); 1408 struct xfs_buftarg *btp = container_of(shrink,
1475 list_add(&btp->bt_list, &xfs_buftarg_list); 1409 struct xfs_buftarg, bt_shrinker);
1476 spin_unlock(&xfs_buftarg_lock); 1410 struct xfs_buf *bp;
1477} 1411 LIST_HEAD(dispose);
1478 1412
1479STATIC void 1413 if (!nr_to_scan)
1480xfs_unregister_buftarg( 1414 return btp->bt_lru_nr;
1481 xfs_buftarg_t *btp) 1415
1482{ 1416 spin_lock(&btp->bt_lru_lock);
1483 spin_lock(&xfs_buftarg_lock); 1417 while (!list_empty(&btp->bt_lru)) {
1484 list_del(&btp->bt_list); 1418 if (nr_to_scan-- <= 0)
1485 spin_unlock(&xfs_buftarg_lock); 1419 break;
1420
1421 bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
1422
1423 /*
1424 * Decrement the b_lru_ref count unless the value is already
1425 * zero. If the value is already zero, we need to reclaim the
1426 * buffer, otherwise it gets another trip through the LRU.
1427 */
1428 if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
1429 list_move_tail(&bp->b_lru, &btp->bt_lru);
1430 continue;
1431 }
1432
1433 /*
1434 * remove the buffer from the LRU now to avoid needing another
1435 * lock round trip inside xfs_buf_rele().
1436 */
1437 list_move(&bp->b_lru, &dispose);
1438 btp->bt_lru_nr--;
1439 }
1440 spin_unlock(&btp->bt_lru_lock);
1441
1442 while (!list_empty(&dispose)) {
1443 bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
1444 list_del_init(&bp->b_lru);
1445 xfs_buf_rele(bp);
1446 }
1447
1448 return btp->bt_lru_nr;
1486} 1449}
1487 1450
1488void 1451void
@@ -1490,17 +1453,13 @@ xfs_free_buftarg(
1490 struct xfs_mount *mp, 1453 struct xfs_mount *mp,
1491 struct xfs_buftarg *btp) 1454 struct xfs_buftarg *btp)
1492{ 1455{
1456 unregister_shrinker(&btp->bt_shrinker);
1457
1493 xfs_flush_buftarg(btp, 1); 1458 xfs_flush_buftarg(btp, 1);
1494 if (mp->m_flags & XFS_MOUNT_BARRIER) 1459 if (mp->m_flags & XFS_MOUNT_BARRIER)
1495 xfs_blkdev_issue_flush(btp); 1460 xfs_blkdev_issue_flush(btp);
1496 iput(btp->bt_mapping->host);
1497 1461
1498 /* Unregister the buftarg first so that we don't get a
1499 * wakeup finding a non-existent task
1500 */
1501 xfs_unregister_buftarg(btp);
1502 kthread_stop(btp->bt_task); 1462 kthread_stop(btp->bt_task);
1503
1504 kmem_free(btp); 1463 kmem_free(btp);
1505} 1464}
1506 1465
@@ -1516,21 +1475,12 @@ xfs_setsize_buftarg_flags(
1516 btp->bt_smask = sectorsize - 1; 1475 btp->bt_smask = sectorsize - 1;
1517 1476
1518 if (set_blocksize(btp->bt_bdev, sectorsize)) { 1477 if (set_blocksize(btp->bt_bdev, sectorsize)) {
1519 printk(KERN_WARNING 1478 xfs_warn(btp->bt_mount,
1520 "XFS: Cannot set_blocksize to %u on device %s\n", 1479 "Cannot set_blocksize to %u on device %s\n",
1521 sectorsize, XFS_BUFTARG_NAME(btp)); 1480 sectorsize, XFS_BUFTARG_NAME(btp));
1522 return EINVAL; 1481 return EINVAL;
1523 } 1482 }
1524 1483
1525 if (verbose &&
1526 (PAGE_CACHE_SIZE / BITS_PER_LONG) > sectorsize) {
1527 printk(KERN_WARNING
1528 "XFS: %u byte sectors in use on device %s. "
1529 "This is suboptimal; %u or greater is ideal.\n",
1530 sectorsize, XFS_BUFTARG_NAME(btp),
1531 (unsigned int)PAGE_CACHE_SIZE / BITS_PER_LONG);
1532 }
1533
1534 return 0; 1484 return 0;
1535} 1485}
1536 1486
@@ -1545,7 +1495,7 @@ xfs_setsize_buftarg_early(
1545 struct block_device *bdev) 1495 struct block_device *bdev)
1546{ 1496{
1547 return xfs_setsize_buftarg_flags(btp, 1497 return xfs_setsize_buftarg_flags(btp,
1548 PAGE_CACHE_SIZE, bdev_logical_block_size(bdev), 0); 1498 PAGE_SIZE, bdev_logical_block_size(bdev), 0);
1549} 1499}
1550 1500
1551int 1501int
@@ -1558,59 +1508,17 @@ xfs_setsize_buftarg(
1558} 1508}
1559 1509
1560STATIC int 1510STATIC int
1561xfs_mapping_buftarg(
1562 xfs_buftarg_t *btp,
1563 struct block_device *bdev)
1564{
1565 struct backing_dev_info *bdi;
1566 struct inode *inode;
1567 struct address_space *mapping;
1568 static const struct address_space_operations mapping_aops = {
1569 .sync_page = block_sync_page,
1570 .migratepage = fail_migrate_page,
1571 };
1572
1573 inode = new_inode(bdev->bd_inode->i_sb);
1574 if (!inode) {
1575 printk(KERN_WARNING
1576 "XFS: Cannot allocate mapping inode for device %s\n",
1577 XFS_BUFTARG_NAME(btp));
1578 return ENOMEM;
1579 }
1580 inode->i_ino = get_next_ino();
1581 inode->i_mode = S_IFBLK;
1582 inode->i_bdev = bdev;
1583 inode->i_rdev = bdev->bd_dev;
1584 bdi = blk_get_backing_dev_info(bdev);
1585 if (!bdi)
1586 bdi = &default_backing_dev_info;
1587 mapping = &inode->i_data;
1588 mapping->a_ops = &mapping_aops;
1589 mapping->backing_dev_info = bdi;
1590 mapping_set_gfp_mask(mapping, GFP_NOFS);
1591 btp->bt_mapping = mapping;
1592 return 0;
1593}
1594
1595STATIC int
1596xfs_alloc_delwrite_queue( 1511xfs_alloc_delwrite_queue(
1597 xfs_buftarg_t *btp, 1512 xfs_buftarg_t *btp,
1598 const char *fsname) 1513 const char *fsname)
1599{ 1514{
1600 int error = 0;
1601
1602 INIT_LIST_HEAD(&btp->bt_list);
1603 INIT_LIST_HEAD(&btp->bt_delwrite_queue); 1515 INIT_LIST_HEAD(&btp->bt_delwrite_queue);
1604 spin_lock_init(&btp->bt_delwrite_lock); 1516 spin_lock_init(&btp->bt_delwrite_lock);
1605 btp->bt_flags = 0; 1517 btp->bt_flags = 0;
1606 btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname); 1518 btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname);
1607 if (IS_ERR(btp->bt_task)) { 1519 if (IS_ERR(btp->bt_task))
1608 error = PTR_ERR(btp->bt_task); 1520 return PTR_ERR(btp->bt_task);
1609 goto out_error; 1521 return 0;
1610 }
1611 xfs_register_buftarg(btp);
1612out_error:
1613 return error;
1614} 1522}
1615 1523
1616xfs_buftarg_t * 1524xfs_buftarg_t *
@@ -1627,12 +1535,19 @@ xfs_alloc_buftarg(
1627 btp->bt_mount = mp; 1535 btp->bt_mount = mp;
1628 btp->bt_dev = bdev->bd_dev; 1536 btp->bt_dev = bdev->bd_dev;
1629 btp->bt_bdev = bdev; 1537 btp->bt_bdev = bdev;
1630 if (xfs_setsize_buftarg_early(btp, bdev)) 1538 btp->bt_bdi = blk_get_backing_dev_info(bdev);
1539 if (!btp->bt_bdi)
1631 goto error; 1540 goto error;
1632 if (xfs_mapping_buftarg(btp, bdev)) 1541
1542 INIT_LIST_HEAD(&btp->bt_lru);
1543 spin_lock_init(&btp->bt_lru_lock);
1544 if (xfs_setsize_buftarg_early(btp, bdev))
1633 goto error; 1545 goto error;
1634 if (xfs_alloc_delwrite_queue(btp, fsname)) 1546 if (xfs_alloc_delwrite_queue(btp, fsname))
1635 goto error; 1547 goto error;
1548 btp->bt_shrinker.shrink = xfs_buftarg_shrink;
1549 btp->bt_shrinker.seeks = DEFAULT_SEEKS;
1550 register_shrinker(&btp->bt_shrinker);
1636 return btp; 1551 return btp;
1637 1552
1638error: 1553error:
@@ -1737,27 +1652,6 @@ xfs_buf_runall_queues(
1737 flush_workqueue(queue); 1652 flush_workqueue(queue);
1738} 1653}
1739 1654
1740STATIC int
1741xfsbufd_wakeup(
1742 struct shrinker *shrink,
1743 int priority,
1744 gfp_t mask)
1745{
1746 xfs_buftarg_t *btp;
1747
1748 spin_lock(&xfs_buftarg_lock);
1749 list_for_each_entry(btp, &xfs_buftarg_list, bt_list) {
1750 if (test_bit(XBT_FORCE_SLEEP, &btp->bt_flags))
1751 continue;
1752 if (list_empty(&btp->bt_delwrite_queue))
1753 continue;
1754 set_bit(XBT_FORCE_FLUSH, &btp->bt_flags);
1755 wake_up_process(btp->bt_task);
1756 }
1757 spin_unlock(&xfs_buftarg_lock);
1758 return 0;
1759}
1760
1761/* 1655/*
1762 * Move as many buffers as specified to the supplied list 1656 * Move as many buffers as specified to the supplied list
 1763 * indicating if we skipped any buffers to prevent deadlocks. 1657 * indicating if we skipped any buffers to prevent deadlocks.
@@ -1845,8 +1739,8 @@ xfsbufd(
1845 do { 1739 do {
1846 long age = xfs_buf_age_centisecs * msecs_to_jiffies(10); 1740 long age = xfs_buf_age_centisecs * msecs_to_jiffies(10);
1847 long tout = xfs_buf_timer_centisecs * msecs_to_jiffies(10); 1741 long tout = xfs_buf_timer_centisecs * msecs_to_jiffies(10);
1848 int count = 0;
1849 struct list_head tmp; 1742 struct list_head tmp;
1743 struct blk_plug plug;
1850 1744
1851 if (unlikely(freezing(current))) { 1745 if (unlikely(freezing(current))) {
1852 set_bit(XBT_FORCE_SLEEP, &target->bt_flags); 1746 set_bit(XBT_FORCE_SLEEP, &target->bt_flags);
@@ -1862,16 +1756,15 @@ xfsbufd(
1862 1756
1863 xfs_buf_delwri_split(target, &tmp, age); 1757 xfs_buf_delwri_split(target, &tmp, age);
1864 list_sort(NULL, &tmp, xfs_buf_cmp); 1758 list_sort(NULL, &tmp, xfs_buf_cmp);
1759
1760 blk_start_plug(&plug);
1865 while (!list_empty(&tmp)) { 1761 while (!list_empty(&tmp)) {
1866 struct xfs_buf *bp; 1762 struct xfs_buf *bp;
1867 bp = list_first_entry(&tmp, struct xfs_buf, b_list); 1763 bp = list_first_entry(&tmp, struct xfs_buf, b_list);
1868 list_del_init(&bp->b_list); 1764 list_del_init(&bp->b_list);
1869 xfs_bdstrat_cb(bp); 1765 xfs_bdstrat_cb(bp);
1870 count++;
1871 } 1766 }
1872 if (count) 1767 blk_finish_plug(&plug);
1873 blk_run_address_space(target->bt_mapping);
1874
1875 } while (!kthread_should_stop()); 1768 } while (!kthread_should_stop());
1876 1769
1877 return 0; 1770 return 0;
@@ -1891,6 +1784,7 @@ xfs_flush_buftarg(
1891 int pincount = 0; 1784 int pincount = 0;
1892 LIST_HEAD(tmp_list); 1785 LIST_HEAD(tmp_list);
1893 LIST_HEAD(wait_list); 1786 LIST_HEAD(wait_list);
1787 struct blk_plug plug;
1894 1788
1895 xfs_buf_runall_queues(xfsconvertd_workqueue); 1789 xfs_buf_runall_queues(xfsconvertd_workqueue);
1896 xfs_buf_runall_queues(xfsdatad_workqueue); 1790 xfs_buf_runall_queues(xfsdatad_workqueue);
@@ -1905,6 +1799,8 @@ xfs_flush_buftarg(
1905 * we do that after issuing all the IO. 1799 * we do that after issuing all the IO.
1906 */ 1800 */
1907 list_sort(NULL, &tmp_list, xfs_buf_cmp); 1801 list_sort(NULL, &tmp_list, xfs_buf_cmp);
1802
1803 blk_start_plug(&plug);
1908 while (!list_empty(&tmp_list)) { 1804 while (!list_empty(&tmp_list)) {
1909 bp = list_first_entry(&tmp_list, struct xfs_buf, b_list); 1805 bp = list_first_entry(&tmp_list, struct xfs_buf, b_list);
1910 ASSERT(target == bp->b_target); 1806 ASSERT(target == bp->b_target);
@@ -1915,10 +1811,10 @@ xfs_flush_buftarg(
1915 } 1811 }
1916 xfs_bdstrat_cb(bp); 1812 xfs_bdstrat_cb(bp);
1917 } 1813 }
1814 blk_finish_plug(&plug);
1918 1815
1919 if (wait) { 1816 if (wait) {
1920 /* Expedite and wait for IO to complete. */ 1817 /* Wait for IO to complete. */
1921 blk_run_address_space(target->bt_mapping);
1922 while (!list_empty(&wait_list)) { 1818 while (!list_empty(&wait_list)) {
1923 bp = list_first_entry(&wait_list, struct xfs_buf, b_list); 1819 bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
1924 1820
@@ -1944,15 +1840,15 @@ xfs_buf_init(void)
1944 if (!xfslogd_workqueue) 1840 if (!xfslogd_workqueue)
1945 goto out_free_buf_zone; 1841 goto out_free_buf_zone;
1946 1842
1947 xfsdatad_workqueue = create_workqueue("xfsdatad"); 1843 xfsdatad_workqueue = alloc_workqueue("xfsdatad", WQ_MEM_RECLAIM, 1);
1948 if (!xfsdatad_workqueue) 1844 if (!xfsdatad_workqueue)
1949 goto out_destroy_xfslogd_workqueue; 1845 goto out_destroy_xfslogd_workqueue;
1950 1846
1951 xfsconvertd_workqueue = create_workqueue("xfsconvertd"); 1847 xfsconvertd_workqueue = alloc_workqueue("xfsconvertd",
1848 WQ_MEM_RECLAIM, 1);
1952 if (!xfsconvertd_workqueue) 1849 if (!xfsconvertd_workqueue)
1953 goto out_destroy_xfsdatad_workqueue; 1850 goto out_destroy_xfsdatad_workqueue;
1954 1851
1955 register_shrinker(&xfs_buf_shake);
1956 return 0; 1852 return 0;
1957 1853
1958 out_destroy_xfsdatad_workqueue: 1854 out_destroy_xfsdatad_workqueue:
@@ -1968,7 +1864,6 @@ xfs_buf_init(void)
1968void 1864void
1969xfs_buf_terminate(void) 1865xfs_buf_terminate(void)
1970{ 1866{
1971 unregister_shrinker(&xfs_buf_shake);
1972 destroy_workqueue(xfsconvertd_workqueue); 1867 destroy_workqueue(xfsconvertd_workqueue);
1973 destroy_workqueue(xfsdatad_workqueue); 1868 destroy_workqueue(xfsdatad_workqueue);
1974 destroy_workqueue(xfslogd_workqueue); 1869 destroy_workqueue(xfslogd_workqueue);
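The buftarg shrinker introduced in this file works in two phases: under bt_lru_lock it decays b_lru_ref, giving recently used buffers another trip around the LRU, and moves expired buffers onto a private dispose list; the xfs_buf_rele() calls then happen after the lock is dropped. A minimal user-space model of that scan, with purely illustrative names (none of this is kernel API):

/* Illustrative user-space model of the scan in xfs_buftarg_shrink(). */
#include <stddef.h>

struct sbuf {
	struct sbuf	*next;
	int		lru_ref;	/* models b_lru_ref */
};

/*
 * Scan up to nr_to_scan buffers.  A buffer with a positive reference
 * count is decremented and re-queued at the tail for another trip
 * through the LRU; a buffer that has already decayed to zero is moved
 * to a private dispose list so it can be released after the lock
 * (not modelled here) has been dropped.
 */
static struct sbuf *lru_scan(struct sbuf **lru, int nr_to_scan)
{
	struct sbuf *dispose = NULL;

	while (*lru && nr_to_scan-- > 0) {
		struct sbuf *bp = *lru;
		struct sbuf **tail;

		*lru = bp->next;
		bp->next = NULL;
		if (bp->lru_ref > 0) {
			bp->lru_ref--;			/* second chance */
			for (tail = lru; *tail; tail = &(*tail)->next)
				;
			*tail = bp;
			continue;
		}
		bp->next = dispose;			/* reclaim candidate */
		dispose = bp;
	}
	return dispose;
}

The kernel code does the same with list_move_tail()/list_move() on bt_lru and then walks the dispose list calling xfs_buf_rele().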
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 383a3f37cf98..a9a1c4512645 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -61,30 +61,11 @@ typedef enum {
61#define XBF_DONT_BLOCK (1 << 16)/* do not block in current thread */ 61#define XBF_DONT_BLOCK (1 << 16)/* do not block in current thread */
62 62
63/* flags used only internally */ 63/* flags used only internally */
64#define _XBF_PAGE_CACHE (1 << 17)/* backed by pagecache */
65#define _XBF_PAGES (1 << 18)/* backed by refcounted pages */ 64#define _XBF_PAGES (1 << 18)/* backed by refcounted pages */
66#define _XBF_RUN_QUEUES (1 << 19)/* run block device task queue */ 65#define _XBF_RUN_QUEUES (1 << 19)/* run block device task queue */
66#define _XBF_KMEM (1 << 20)/* backed by heap memory */
67#define _XBF_DELWRI_Q (1 << 21)/* buffer on delwri queue */ 67#define _XBF_DELWRI_Q (1 << 21)/* buffer on delwri queue */
68 68
69/*
70 * Special flag for supporting metadata blocks smaller than a FSB.
71 *
72 * In this case we can have multiple xfs_buf_t on a single page and
73 * need to lock out concurrent xfs_buf_t readers as they only
74 * serialise access to the buffer.
75 *
76 * If the FSB size >= PAGE_CACHE_SIZE case, we have no serialisation
77 * between reads of the page. Hence we can have one thread read the
78 * page and modify it, but then race with another thread that thinks
79 * the page is not up-to-date and hence reads it again.
80 *
 81 * The result is that the first modification to the page is lost.
82 * This sort of AGF/AGI reading race can happen when unlinking inodes
83 * that require truncation and results in the AGI unlinked list
84 * modifications being lost.
85 */
86#define _XBF_PAGE_LOCKED (1 << 22)
87
88typedef unsigned int xfs_buf_flags_t; 69typedef unsigned int xfs_buf_flags_t;
89 70
90#define XFS_BUF_FLAGS \ 71#define XFS_BUF_FLAGS \
@@ -100,12 +81,10 @@ typedef unsigned int xfs_buf_flags_t;
100 { XBF_LOCK, "LOCK" }, /* should never be set */\ 81 { XBF_LOCK, "LOCK" }, /* should never be set */\
101 { XBF_TRYLOCK, "TRYLOCK" }, /* ditto */\ 82 { XBF_TRYLOCK, "TRYLOCK" }, /* ditto */\
102 { XBF_DONT_BLOCK, "DONT_BLOCK" }, /* ditto */\ 83 { XBF_DONT_BLOCK, "DONT_BLOCK" }, /* ditto */\
103 { _XBF_PAGE_CACHE, "PAGE_CACHE" }, \
104 { _XBF_PAGES, "PAGES" }, \ 84 { _XBF_PAGES, "PAGES" }, \
105 { _XBF_RUN_QUEUES, "RUN_QUEUES" }, \ 85 { _XBF_RUN_QUEUES, "RUN_QUEUES" }, \
106 { _XBF_DELWRI_Q, "DELWRI_Q" }, \ 86 { _XBF_KMEM, "KMEM" }, \
107 { _XBF_PAGE_LOCKED, "PAGE_LOCKED" } 87 { _XBF_DELWRI_Q, "DELWRI_Q" }
108
109 88
110typedef enum { 89typedef enum {
111 XBT_FORCE_SLEEP = 0, 90 XBT_FORCE_SLEEP = 0,
@@ -120,7 +99,7 @@ typedef struct xfs_bufhash {
120typedef struct xfs_buftarg { 99typedef struct xfs_buftarg {
121 dev_t bt_dev; 100 dev_t bt_dev;
122 struct block_device *bt_bdev; 101 struct block_device *bt_bdev;
123 struct address_space *bt_mapping; 102 struct backing_dev_info *bt_bdi;
124 struct xfs_mount *bt_mount; 103 struct xfs_mount *bt_mount;
125 unsigned int bt_bsize; 104 unsigned int bt_bsize;
126 unsigned int bt_sshift; 105 unsigned int bt_sshift;
@@ -128,27 +107,19 @@ typedef struct xfs_buftarg {
128 107
129 /* per device delwri queue */ 108 /* per device delwri queue */
130 struct task_struct *bt_task; 109 struct task_struct *bt_task;
131 struct list_head bt_list;
132 struct list_head bt_delwrite_queue; 110 struct list_head bt_delwrite_queue;
133 spinlock_t bt_delwrite_lock; 111 spinlock_t bt_delwrite_lock;
134 unsigned long bt_flags; 112 unsigned long bt_flags;
135} xfs_buftarg_t;
136 113
137/* 114 /* LRU control structures */
138 * xfs_buf_t: Buffer structure for pagecache-based buffers 115 struct shrinker bt_shrinker;
139 * 116 struct list_head bt_lru;
140 * This buffer structure is used by the pagecache buffer management routines 117 spinlock_t bt_lru_lock;
141 * to refer to an assembly of pages forming a logical buffer. 118 unsigned int bt_lru_nr;
142 * 119} xfs_buftarg_t;
143 * The buffer structure is used on a temporary basis only, and discarded when
144 * released. The real data storage is recorded in the pagecache. Buffers are
145 * hashed to the block device on which the file system resides.
146 */
147 120
148struct xfs_buf; 121struct xfs_buf;
149typedef void (*xfs_buf_iodone_t)(struct xfs_buf *); 122typedef void (*xfs_buf_iodone_t)(struct xfs_buf *);
150typedef void (*xfs_buf_relse_t)(struct xfs_buf *);
151typedef int (*xfs_buf_bdstrat_t)(struct xfs_buf *);
152 123
153#define XB_PAGES 2 124#define XB_PAGES 2
154 125
@@ -164,9 +135,11 @@ typedef struct xfs_buf {
164 xfs_off_t b_file_offset; /* offset in file */ 135 xfs_off_t b_file_offset; /* offset in file */
165 size_t b_buffer_length;/* size of buffer in bytes */ 136 size_t b_buffer_length;/* size of buffer in bytes */
166 atomic_t b_hold; /* reference count */ 137 atomic_t b_hold; /* reference count */
138 atomic_t b_lru_ref; /* lru reclaim ref count */
167 xfs_buf_flags_t b_flags; /* status flags */ 139 xfs_buf_flags_t b_flags; /* status flags */
168 struct semaphore b_sema; /* semaphore for lockables */ 140 struct semaphore b_sema; /* semaphore for lockables */
169 141
142 struct list_head b_lru; /* lru list */
170 wait_queue_head_t b_waiters; /* unpin waiters */ 143 wait_queue_head_t b_waiters; /* unpin waiters */
171 struct list_head b_list; 144 struct list_head b_list;
172 struct xfs_perag *b_pag; /* contains rbtree root */ 145 struct xfs_perag *b_pag; /* contains rbtree root */
@@ -176,7 +149,6 @@ typedef struct xfs_buf {
176 void *b_addr; /* virtual address of buffer */ 149 void *b_addr; /* virtual address of buffer */
177 struct work_struct b_iodone_work; 150 struct work_struct b_iodone_work;
178 xfs_buf_iodone_t b_iodone; /* I/O completion function */ 151 xfs_buf_iodone_t b_iodone; /* I/O completion function */
179 xfs_buf_relse_t b_relse; /* releasing function */
180 struct completion b_iowait; /* queue for I/O waiters */ 152 struct completion b_iowait; /* queue for I/O waiters */
181 void *b_fspriv; 153 void *b_fspriv;
182 void *b_fspriv2; 154 void *b_fspriv2;
@@ -264,7 +236,8 @@ extern void xfs_buf_terminate(void);
264#define XFS_BUF_ZEROFLAGS(bp) ((bp)->b_flags &= \ 236#define XFS_BUF_ZEROFLAGS(bp) ((bp)->b_flags &= \
265 ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI|XBF_ORDERED)) 237 ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI|XBF_ORDERED))
266 238
267#define XFS_BUF_STALE(bp) ((bp)->b_flags |= XBF_STALE) 239void xfs_buf_stale(struct xfs_buf *bp);
240#define XFS_BUF_STALE(bp) xfs_buf_stale(bp);
268#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE) 241#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE)
269#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE) 242#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE)
270#define XFS_BUF_SUPER_STALE(bp) do { \ 243#define XFS_BUF_SUPER_STALE(bp) do { \
@@ -315,7 +288,6 @@ extern void xfs_buf_terminate(void);
315#define XFS_BUF_FSPRIVATE2(bp, type) ((type)(bp)->b_fspriv2) 288#define XFS_BUF_FSPRIVATE2(bp, type) ((type)(bp)->b_fspriv2)
316#define XFS_BUF_SET_FSPRIVATE2(bp, val) ((bp)->b_fspriv2 = (void*)(val)) 289#define XFS_BUF_SET_FSPRIVATE2(bp, val) ((bp)->b_fspriv2 = (void*)(val))
317#define XFS_BUF_SET_START(bp) do { } while (0) 290#define XFS_BUF_SET_START(bp) do { } while (0)
318#define XFS_BUF_SET_BRELSE_FUNC(bp, func) ((bp)->b_relse = (func))
319 291
320#define XFS_BUF_PTR(bp) (xfs_caddr_t)((bp)->b_addr) 292#define XFS_BUF_PTR(bp) (xfs_caddr_t)((bp)->b_addr)
321#define XFS_BUF_SET_PTR(bp, val, cnt) xfs_buf_associate_memory(bp, val, cnt) 293#define XFS_BUF_SET_PTR(bp, val, cnt) xfs_buf_associate_memory(bp, val, cnt)
@@ -328,9 +300,15 @@ extern void xfs_buf_terminate(void);
328#define XFS_BUF_SIZE(bp) ((bp)->b_buffer_length) 300#define XFS_BUF_SIZE(bp) ((bp)->b_buffer_length)
329#define XFS_BUF_SET_SIZE(bp, cnt) ((bp)->b_buffer_length = (cnt)) 301#define XFS_BUF_SET_SIZE(bp, cnt) ((bp)->b_buffer_length = (cnt))
330 302
331#define XFS_BUF_SET_VTYPE_REF(bp, type, ref) do { } while (0) 303static inline void
304xfs_buf_set_ref(
305 struct xfs_buf *bp,
306 int lru_ref)
307{
308 atomic_set(&bp->b_lru_ref, lru_ref);
309}
310#define XFS_BUF_SET_VTYPE_REF(bp, type, ref) xfs_buf_set_ref(bp, ref)
332#define XFS_BUF_SET_VTYPE(bp, type) do { } while (0) 311#define XFS_BUF_SET_VTYPE(bp, type) do { } while (0)
333#define XFS_BUF_SET_REF(bp, ref) do { } while (0)
334 312
335#define XFS_BUF_ISPINNED(bp) atomic_read(&((bp)->b_pin_count)) 313#define XFS_BUF_ISPINNED(bp) atomic_read(&((bp)->b_pin_count))
336 314
@@ -346,8 +324,7 @@ extern void xfs_buf_terminate(void);
346 324
347static inline void xfs_buf_relse(xfs_buf_t *bp) 325static inline void xfs_buf_relse(xfs_buf_t *bp)
348{ 326{
349 if (!bp->b_relse) 327 xfs_buf_unlock(bp);
350 xfs_buf_unlock(bp);
351 xfs_buf_rele(bp); 328 xfs_buf_rele(bp);
352} 329}
353 330
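With XFS_BUF_SET_VTYPE_REF now routed through xfs_buf_set_ref(), a buffer's b_lru_ref value is effectively the number of shrinker passes it survives before it becomes a reclaim candidate: each pass decrements the count, and the pass after it reaches zero disposes of the buffer. A tiny sketch of that arithmetic, assuming the scan visits the buffer once per pass:

/* Sketch only: how many shrinker passes a buffer with a given b_lru_ref survives. */
static int passes_until_reclaim(int lru_ref)
{
	int passes = 0;

	while (lru_ref > 0) {		/* each pass decays the count ... */
		lru_ref--;
		passes++;
	}
	return passes + 1;		/* ... and the next pass reclaims it */
}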
diff --git a/fs/xfs/linux-2.6/xfs_discard.c b/fs/xfs/linux-2.6/xfs_discard.c
new file mode 100644
index 000000000000..d61611c88012
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_discard.c
@@ -0,0 +1,193 @@
1/*
2 * Copyright (C) 2010 Red Hat, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_sb.h"
20#include "xfs_inum.h"
21#include "xfs_log.h"
22#include "xfs_ag.h"
23#include "xfs_mount.h"
24#include "xfs_quota.h"
25#include "xfs_trans.h"
26#include "xfs_alloc_btree.h"
27#include "xfs_bmap_btree.h"
28#include "xfs_ialloc_btree.h"
29#include "xfs_btree.h"
30#include "xfs_inode.h"
31#include "xfs_alloc.h"
32#include "xfs_error.h"
33#include "xfs_discard.h"
34#include "xfs_trace.h"
35
36STATIC int
37xfs_trim_extents(
38 struct xfs_mount *mp,
39 xfs_agnumber_t agno,
40 xfs_fsblock_t start,
41 xfs_fsblock_t len,
42 xfs_fsblock_t minlen,
43 __uint64_t *blocks_trimmed)
44{
45 struct block_device *bdev = mp->m_ddev_targp->bt_bdev;
46 struct xfs_btree_cur *cur;
47 struct xfs_buf *agbp;
48 struct xfs_perag *pag;
49 int error;
50 int i;
51
52 pag = xfs_perag_get(mp, agno);
53
54 error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
55 if (error || !agbp)
56 goto out_put_perag;
57
58 cur = xfs_allocbt_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_CNT);
59
60 /*
61 * Force out the log. This means any transactions that might have freed
62 * space before we took the AGF buffer lock are now on disk, and the
63 * volatile disk cache is flushed.
64 */
65 xfs_log_force(mp, XFS_LOG_SYNC);
66
67 /*
68 * Look up the longest btree in the AGF and start with it.
69 */
70 error = xfs_alloc_lookup_le(cur, 0,
71 XFS_BUF_TO_AGF(agbp)->agf_longest, &i);
72 if (error)
73 goto out_del_cursor;
74
75 /*
76 * Loop until we are done with all extents that are large
77 * enough to be worth discarding.
78 */
79 while (i) {
80 xfs_agblock_t fbno;
81 xfs_extlen_t flen;
82
83 error = xfs_alloc_get_rec(cur, &fbno, &flen, &i);
84 if (error)
85 goto out_del_cursor;
86 XFS_WANT_CORRUPTED_GOTO(i == 1, out_del_cursor);
87 ASSERT(flen <= XFS_BUF_TO_AGF(agbp)->agf_longest);
88
89 /*
90 * Too small? Give up.
91 */
92 if (flen < minlen) {
93 trace_xfs_discard_toosmall(mp, agno, fbno, flen);
94 goto out_del_cursor;
95 }
96
97 /*
98 * If the extent is entirely outside of the range we are
 99 * supposed to discard, skip it. Do not bother to trim
100 * down partially overlapping ranges for now.
101 */
102 if (XFS_AGB_TO_FSB(mp, agno, fbno) + flen < start ||
103 XFS_AGB_TO_FSB(mp, agno, fbno) >= start + len) {
104 trace_xfs_discard_exclude(mp, agno, fbno, flen);
105 goto next_extent;
106 }
107
108 /*
109 * If any blocks in the range are still busy, skip the
110 * discard and try again the next time.
111 */
112 if (xfs_alloc_busy_search(mp, agno, fbno, flen)) {
113 trace_xfs_discard_busy(mp, agno, fbno, flen);
114 goto next_extent;
115 }
116
117 trace_xfs_discard_extent(mp, agno, fbno, flen);
118 error = -blkdev_issue_discard(bdev,
119 XFS_AGB_TO_DADDR(mp, agno, fbno),
120 XFS_FSB_TO_BB(mp, flen),
121 GFP_NOFS, 0);
122 if (error)
123 goto out_del_cursor;
124 *blocks_trimmed += flen;
125
126next_extent:
127 error = xfs_btree_decrement(cur, 0, &i);
128 if (error)
129 goto out_del_cursor;
130 }
131
132out_del_cursor:
133 xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
134 xfs_buf_relse(agbp);
135out_put_perag:
136 xfs_perag_put(pag);
137 return error;
138}
139
140int
141xfs_ioc_trim(
142 struct xfs_mount *mp,
143 struct fstrim_range __user *urange)
144{
145 struct request_queue *q = mp->m_ddev_targp->bt_bdev->bd_disk->queue;
146 unsigned int granularity = q->limits.discard_granularity;
147 struct fstrim_range range;
148 xfs_fsblock_t start, len, minlen;
149 xfs_agnumber_t start_agno, end_agno, agno;
150 __uint64_t blocks_trimmed = 0;
151 int error, last_error = 0;
152
153 if (!capable(CAP_SYS_ADMIN))
154 return -XFS_ERROR(EPERM);
155 if (!blk_queue_discard(q))
156 return -XFS_ERROR(EOPNOTSUPP);
157 if (copy_from_user(&range, urange, sizeof(range)))
158 return -XFS_ERROR(EFAULT);
159
160 /*
161 * Truncating down the len isn't actually quite correct, but using
162 * XFS_B_TO_FSB would mean we trivially get overflows for values
163 * of ULLONG_MAX or slightly lower. And ULLONG_MAX is the default
164 * used by the fstrim application. In the end it really doesn't
165 * matter as trimming blocks is an advisory interface.
166 */
167 start = XFS_B_TO_FSBT(mp, range.start);
168 len = XFS_B_TO_FSBT(mp, range.len);
169 minlen = XFS_B_TO_FSB(mp, max_t(u64, granularity, range.minlen));
170
171 start_agno = XFS_FSB_TO_AGNO(mp, start);
172 if (start_agno >= mp->m_sb.sb_agcount)
173 return -XFS_ERROR(EINVAL);
174
175 end_agno = XFS_FSB_TO_AGNO(mp, start + len);
176 if (end_agno >= mp->m_sb.sb_agcount)
177 end_agno = mp->m_sb.sb_agcount - 1;
178
179 for (agno = start_agno; agno <= end_agno; agno++) {
180 error = -xfs_trim_extents(mp, agno, start, len, minlen,
181 &blocks_trimmed);
182 if (error)
183 last_error = error;
184 }
185
186 if (last_error)
187 return last_error;
188
189 range.len = XFS_FSB_TO_B(mp, blocks_trimmed);
190 if (copy_to_user(urange, &range, sizeof(range)))
191 return -XFS_ERROR(EFAULT);
192 return 0;
193}
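xfs_ioc_trim() above is dispatched from the generic FITRIM ioctl (see the xfs_file_ioctl() hunk later in this diff), so user space drives it through the standard fstrim_range interface from linux/fs.h. A minimal caller, roughly what the fstrim(8) utility does, with error handling kept short for clarity:

/* Minimal FITRIM caller against a mounted filesystem. */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/fs.h>		/* FITRIM, struct fstrim_range */

int main(int argc, char **argv)
{
	struct fstrim_range range;
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <mountpoint>\n", argv[0]);
		return 1;
	}

	fd = open(argv[1], O_RDONLY | O_DIRECTORY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	memset(&range, 0, sizeof(range));
	range.len = UINT64_MAX;		/* trim the whole filesystem */
	range.minlen = 0;		/* raised to the discard granularity by the kernel */

	if (ioctl(fd, FITRIM, &range) < 0) {
		perror("FITRIM");
		close(fd);
		return 1;
	}

	/* range.len comes back as the number of bytes actually trimmed,
	 * matching the copy_to_user() at the end of xfs_ioc_trim(). */
	printf("trimmed %llu bytes\n", (unsigned long long)range.len);
	close(fd);
	return 0;
}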
diff --git a/fs/xfs/linux-2.6/xfs_discard.h b/fs/xfs/linux-2.6/xfs_discard.h
new file mode 100644
index 000000000000..e82b6dd3e127
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_discard.h
@@ -0,0 +1,8 @@
1#ifndef XFS_DISCARD_H
2#define XFS_DISCARD_H 1
3
4struct fstrim_range;
5
6extern int xfs_ioc_trim(struct xfs_mount *, struct fstrim_range __user *);
7
8#endif /* XFS_DISCARD_H */
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
index 3764d74790ec..f4f878fc0083 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -70,8 +70,16 @@ xfs_fs_encode_fh(
70 else 70 else
71 fileid_type = FILEID_INO32_GEN_PARENT; 71 fileid_type = FILEID_INO32_GEN_PARENT;
72 72
73 /* filesystem may contain 64bit inode numbers */ 73 /*
 74 if (!(XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_SMALL_INUMS)) 74 * If the filesystem may contain 64bit inode numbers, we need
75 * to use larger file handles that can represent them.
76 *
77 * While we only allocate inodes that do not fit into 32 bits any
78 * large enough filesystem may contain them, thus the slightly
79 * confusing looking conditional below.
80 */
81 if (!(XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_SMALL_INUMS) ||
82 (XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_32BITINODES))
75 fileid_type |= XFS_FILEID_TYPE_64FLAG; 83 fileid_type |= XFS_FILEID_TYPE_64FLAG;
76 84
77 /* 85 /*
@@ -81,8 +89,10 @@ xfs_fs_encode_fh(
81 * seven combinations work. The real answer is "don't use v2". 89 * seven combinations work. The real answer is "don't use v2".
82 */ 90 */
83 len = xfs_fileid_length(fileid_type); 91 len = xfs_fileid_length(fileid_type);
84 if (*max_len < len) 92 if (*max_len < len) {
93 *max_len = len;
85 return 255; 94 return 255;
95 }
86 *max_len = len; 96 *max_len = len;
87 97
88 switch (fileid_type) { 98 switch (fileid_type) {
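The encode_fh change above matters because a caller probing with a too-small buffer now gets the required handle length back through *max_len along with the 255 return. The user-space pattern that relies on this is the probe-then-retry loop around name_to_handle_at(2), added around this same kernel release; a hedged sketch, assuming a libc that exposes the wrapper and abbreviating error handling:

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
	struct file_handle *fh;
	int mount_id;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <path>\n", argv[0]);
		return 1;
	}

	/* First call with no handle space: expected to fail with EOVERFLOW
	 * while filling in the handle_bytes the filesystem needs. */
	fh = calloc(1, sizeof(*fh));
	if (name_to_handle_at(AT_FDCWD, argv[1], fh, &mount_id, 0) < 0 &&
	    errno != EOVERFLOW) {
		perror("name_to_handle_at");
		return 1;
	}

	/* Retry with the size the kernel reported. */
	fh = realloc(fh, sizeof(*fh) + fh->handle_bytes);
	if (name_to_handle_at(AT_FDCWD, argv[1], fh, &mount_id, 0) < 0) {
		perror("name_to_handle_at");
		return 1;
	}
	printf("handle type %d, %u bytes\n", fh->handle_type, fh->handle_bytes);
	free(fh);
	return 0;
}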
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index ba8ad422a165..f4213ba1ff85 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -37,10 +37,45 @@
37#include "xfs_trace.h" 37#include "xfs_trace.h"
38 38
39#include <linux/dcache.h> 39#include <linux/dcache.h>
40#include <linux/falloc.h>
40 41
41static const struct vm_operations_struct xfs_file_vm_ops; 42static const struct vm_operations_struct xfs_file_vm_ops;
42 43
43/* 44/*
45 * Locking primitives for read and write IO paths to ensure we consistently use
46 * and order the inode->i_mutex, ip->i_lock and ip->i_iolock.
47 */
48static inline void
49xfs_rw_ilock(
50 struct xfs_inode *ip,
51 int type)
52{
53 if (type & XFS_IOLOCK_EXCL)
54 mutex_lock(&VFS_I(ip)->i_mutex);
55 xfs_ilock(ip, type);
56}
57
58static inline void
59xfs_rw_iunlock(
60 struct xfs_inode *ip,
61 int type)
62{
63 xfs_iunlock(ip, type);
64 if (type & XFS_IOLOCK_EXCL)
65 mutex_unlock(&VFS_I(ip)->i_mutex);
66}
67
68static inline void
69xfs_rw_ilock_demote(
70 struct xfs_inode *ip,
71 int type)
72{
73 xfs_ilock_demote(ip, type);
74 if (type & XFS_IOLOCK_EXCL)
75 mutex_unlock(&VFS_I(ip)->i_mutex);
76}
77
78/*
44 * xfs_iozero 79 * xfs_iozero
45 * 80 *
46 * xfs_iozero clears the specified range of buffer supplied, 81 * xfs_iozero clears the specified range of buffer supplied,
@@ -262,22 +297,21 @@ xfs_file_aio_read(
262 if (XFS_FORCED_SHUTDOWN(mp)) 297 if (XFS_FORCED_SHUTDOWN(mp))
263 return -EIO; 298 return -EIO;
264 299
265 if (unlikely(ioflags & IO_ISDIRECT))
266 mutex_lock(&inode->i_mutex);
267 xfs_ilock(ip, XFS_IOLOCK_SHARED);
268
269 if (unlikely(ioflags & IO_ISDIRECT)) { 300 if (unlikely(ioflags & IO_ISDIRECT)) {
301 xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
302
270 if (inode->i_mapping->nrpages) { 303 if (inode->i_mapping->nrpages) {
271 ret = -xfs_flushinval_pages(ip, 304 ret = -xfs_flushinval_pages(ip,
272 (iocb->ki_pos & PAGE_CACHE_MASK), 305 (iocb->ki_pos & PAGE_CACHE_MASK),
273 -1, FI_REMAPF_LOCKED); 306 -1, FI_REMAPF_LOCKED);
307 if (ret) {
308 xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
309 return ret;
310 }
274 } 311 }
275 mutex_unlock(&inode->i_mutex); 312 xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
276 if (ret) { 313 } else
277 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 314 xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
278 return ret;
279 }
280 }
281 315
282 trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags); 316 trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags);
283 317
@@ -285,7 +319,7 @@ xfs_file_aio_read(
285 if (ret > 0) 319 if (ret > 0)
286 XFS_STATS_ADD(xs_read_bytes, ret); 320 XFS_STATS_ADD(xs_read_bytes, ret);
287 321
288 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 322 xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
289 return ret; 323 return ret;
290} 324}
291 325
@@ -309,7 +343,7 @@ xfs_file_splice_read(
309 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 343 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
310 return -EIO; 344 return -EIO;
311 345
312 xfs_ilock(ip, XFS_IOLOCK_SHARED); 346 xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
313 347
314 trace_xfs_file_splice_read(ip, count, *ppos, ioflags); 348 trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
315 349
@@ -317,10 +351,61 @@ xfs_file_splice_read(
317 if (ret > 0) 351 if (ret > 0)
318 XFS_STATS_ADD(xs_read_bytes, ret); 352 XFS_STATS_ADD(xs_read_bytes, ret);
319 353
320 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 354 xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
321 return ret; 355 return ret;
322} 356}
323 357
358STATIC void
359xfs_aio_write_isize_update(
360 struct inode *inode,
361 loff_t *ppos,
362 ssize_t bytes_written)
363{
364 struct xfs_inode *ip = XFS_I(inode);
365 xfs_fsize_t isize = i_size_read(inode);
366
367 if (bytes_written > 0)
368 XFS_STATS_ADD(xs_write_bytes, bytes_written);
369
370 if (unlikely(bytes_written < 0 && bytes_written != -EFAULT &&
371 *ppos > isize))
372 *ppos = isize;
373
374 if (*ppos > ip->i_size) {
375 xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
376 if (*ppos > ip->i_size)
377 ip->i_size = *ppos;
378 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
379 }
380}
381
382/*
383 * If this was a direct or synchronous I/O that failed (such as ENOSPC) then
384 * part of the I/O may have been written to disk before the error occurred. In
385 * this case the on-disk file size may have been adjusted beyond the in-memory
386 * file size and now needs to be truncated back.
387 */
388STATIC void
389xfs_aio_write_newsize_update(
390 struct xfs_inode *ip)
391{
392 if (ip->i_new_size) {
393 xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
394 ip->i_new_size = 0;
395 if (ip->i_d.di_size > ip->i_size)
396 ip->i_d.di_size = ip->i_size;
397 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
398 }
399}
400
401/*
402 * xfs_file_splice_write() does not use xfs_rw_ilock() because
403 * generic_file_splice_write() takes the i_mutex itself. This, in theory,
 404 * could cause lock inversions between the aio_write path and the splice path
405 * if someone is doing concurrent splice(2) based writes and write(2) based
406 * writes to the same inode. The only real way to fix this is to re-implement
407 * the generic code here with correct locking orders.
408 */
324STATIC ssize_t 409STATIC ssize_t
325xfs_file_splice_write( 410xfs_file_splice_write(
326 struct pipe_inode_info *pipe, 411 struct pipe_inode_info *pipe,
@@ -331,7 +416,7 @@ xfs_file_splice_write(
331{ 416{
332 struct inode *inode = outfilp->f_mapping->host; 417 struct inode *inode = outfilp->f_mapping->host;
333 struct xfs_inode *ip = XFS_I(inode); 418 struct xfs_inode *ip = XFS_I(inode);
334 xfs_fsize_t isize, new_size; 419 xfs_fsize_t new_size;
335 int ioflags = 0; 420 int ioflags = 0;
336 ssize_t ret; 421 ssize_t ret;
337 422
@@ -355,27 +440,9 @@ xfs_file_splice_write(
355 trace_xfs_file_splice_write(ip, count, *ppos, ioflags); 440 trace_xfs_file_splice_write(ip, count, *ppos, ioflags);
356 441
357 ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); 442 ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
358 if (ret > 0)
359 XFS_STATS_ADD(xs_write_bytes, ret);
360
361 isize = i_size_read(inode);
362 if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize))
363 *ppos = isize;
364
365 if (*ppos > ip->i_size) {
366 xfs_ilock(ip, XFS_ILOCK_EXCL);
367 if (*ppos > ip->i_size)
368 ip->i_size = *ppos;
369 xfs_iunlock(ip, XFS_ILOCK_EXCL);
370 }
371 443
372 if (ip->i_new_size) { 444 xfs_aio_write_isize_update(inode, ppos, ret);
373 xfs_ilock(ip, XFS_ILOCK_EXCL); 445 xfs_aio_write_newsize_update(ip);
374 ip->i_new_size = 0;
375 if (ip->i_d.di_size > ip->i_size)
376 ip->i_d.di_size = ip->i_size;
377 xfs_iunlock(ip, XFS_ILOCK_EXCL);
378 }
379 xfs_iunlock(ip, XFS_IOLOCK_EXCL); 446 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
380 return ret; 447 return ret;
381} 448}
@@ -562,247 +629,318 @@ out_lock:
562 return error; 629 return error;
563} 630}
564 631
632/*
633 * Common pre-write limit and setup checks.
634 *
635 * Returns with iolock held according to @iolock.
636 */
565STATIC ssize_t 637STATIC ssize_t
566xfs_file_aio_write( 638xfs_file_aio_write_checks(
567 struct kiocb *iocb, 639 struct file *file,
568 const struct iovec *iovp, 640 loff_t *pos,
569 unsigned long nr_segs, 641 size_t *count,
570 loff_t pos) 642 int *iolock)
571{ 643{
572 struct file *file = iocb->ki_filp; 644 struct inode *inode = file->f_mapping->host;
573 struct address_space *mapping = file->f_mapping;
574 struct inode *inode = mapping->host;
575 struct xfs_inode *ip = XFS_I(inode); 645 struct xfs_inode *ip = XFS_I(inode);
576 struct xfs_mount *mp = ip->i_mount; 646 xfs_fsize_t new_size;
577 ssize_t ret = 0, error = 0; 647 int error = 0;
578 int ioflags = 0;
579 xfs_fsize_t isize, new_size;
580 int iolock;
581 size_t ocount = 0, count;
582 int need_i_mutex;
583 648
584 XFS_STATS_INC(xs_write_calls); 649 error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode));
650 if (error) {
651 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock);
652 *iolock = 0;
653 return error;
654 }
585 655
586 BUG_ON(iocb->ki_pos != pos); 656 new_size = *pos + *count;
657 if (new_size > ip->i_size)
658 ip->i_new_size = new_size;
587 659
588 if (unlikely(file->f_flags & O_DIRECT)) 660 if (likely(!(file->f_mode & FMODE_NOCMTIME)))
589 ioflags |= IO_ISDIRECT; 661 file_update_time(file);
590 if (file->f_mode & FMODE_NOCMTIME) 662
591 ioflags |= IO_INVIS; 663 /*
664 * If the offset is beyond the size of the file, we need to zero any
665 * blocks that fall between the existing EOF and the start of this
666 * write.
667 */
668 if (*pos > ip->i_size)
669 error = -xfs_zero_eof(ip, *pos, ip->i_size);
592 670
593 error = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ); 671 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
594 if (error) 672 if (error)
595 return error; 673 return error;
596 674
597 count = ocount; 675 /*
598 if (count == 0) 676 * If we're writing the file then make sure to clear the setuid and
599 return 0; 677 * setgid bits if the process is not being run by root. This keeps
600 678 * people from modifying setuid and setgid binaries.
601 xfs_wait_for_freeze(mp, SB_FREEZE_WRITE); 679 */
680 return file_remove_suid(file);
602 681
603 if (XFS_FORCED_SHUTDOWN(mp)) 682}
604 return -EIO;
605 683
606relock: 684/*
607 if (ioflags & IO_ISDIRECT) { 685 * xfs_file_dio_aio_write - handle direct IO writes
608 iolock = XFS_IOLOCK_SHARED; 686 *
609 need_i_mutex = 0; 687 * Lock the inode appropriately to prepare for and issue a direct IO write.
610 } else { 688 * By separating it from the buffered write path we remove all the tricky to
611 iolock = XFS_IOLOCK_EXCL; 689 * follow locking changes and looping.
612 need_i_mutex = 1; 690 *
613 mutex_lock(&inode->i_mutex); 691 * If there are cached pages or we're extending the file, we need IOLOCK_EXCL
692 * until we're sure the bytes at the new EOF have been zeroed and/or the cached
693 * pages are flushed out.
694 *
695 * In most cases the direct IO writes will be done holding IOLOCK_SHARED
696 * allowing them to be done in parallel with reads and other direct IO writes.
697 * However, if the IO is not aligned to filesystem blocks, the direct IO layer
698 * needs to do sub-block zeroing and that requires serialisation against other
699 * direct IOs to the same block. In this case we need to serialise the
700 * submission of the unaligned IOs so that we don't get racing block zeroing in
701 * the dio layer. To avoid the problem with aio, we also need to wait for
702 * outstanding IOs to complete so that unwritten extent conversion is completed
703 * before we try to map the overlapping block. This is currently implemented by
704 * hitting it with a big hammer (i.e. xfs_ioend_wait()).
705 *
706 * Returns with locks held indicated by @iolock and errors indicated by
707 * negative return values.
708 */
709STATIC ssize_t
710xfs_file_dio_aio_write(
711 struct kiocb *iocb,
712 const struct iovec *iovp,
713 unsigned long nr_segs,
714 loff_t pos,
715 size_t ocount,
716 int *iolock)
717{
718 struct file *file = iocb->ki_filp;
719 struct address_space *mapping = file->f_mapping;
720 struct inode *inode = mapping->host;
721 struct xfs_inode *ip = XFS_I(inode);
722 struct xfs_mount *mp = ip->i_mount;
723 ssize_t ret = 0;
724 size_t count = ocount;
725 int unaligned_io = 0;
726 struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ?
727 mp->m_rtdev_targp : mp->m_ddev_targp;
728
729 *iolock = 0;
730 if ((pos & target->bt_smask) || (count & target->bt_smask))
731 return -XFS_ERROR(EINVAL);
732
733 if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask))
734 unaligned_io = 1;
735
736 if (unaligned_io || mapping->nrpages || pos > ip->i_size)
737 *iolock = XFS_IOLOCK_EXCL;
738 else
739 *iolock = XFS_IOLOCK_SHARED;
740 xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
741
742 ret = xfs_file_aio_write_checks(file, &pos, &count, iolock);
743 if (ret)
744 return ret;
745
746 if (mapping->nrpages) {
747 WARN_ON(*iolock != XFS_IOLOCK_EXCL);
748 ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1,
749 FI_REMAPF_LOCKED);
750 if (ret)
751 return ret;
614 } 752 }
615 753
616 xfs_ilock(ip, XFS_ILOCK_EXCL|iolock); 754 /*
617 755 * If we are doing unaligned IO, wait for all other IO to drain,
618start: 756 * otherwise demote the lock if we had to flush cached pages
619 error = -generic_write_checks(file, &pos, &count, 757 */
620 S_ISBLK(inode->i_mode)); 758 if (unaligned_io)
621 if (error) { 759 xfs_ioend_wait(ip);
622 xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); 760 else if (*iolock == XFS_IOLOCK_EXCL) {
623 goto out_unlock_mutex; 761 xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
762 *iolock = XFS_IOLOCK_SHARED;
624 } 763 }
625 764
626 if (ioflags & IO_ISDIRECT) { 765 trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
627 xfs_buftarg_t *target = 766 ret = generic_file_direct_write(iocb, iovp,
628 XFS_IS_REALTIME_INODE(ip) ? 767 &nr_segs, pos, &iocb->ki_pos, count, ocount);
629 mp->m_rtdev_targp : mp->m_ddev_targp;
630 768
631 if ((pos & target->bt_smask) || (count & target->bt_smask)) { 769 /* No fallback to buffered IO on errors for XFS. */
632 xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); 770 ASSERT(ret < 0 || ret == count);
633 return XFS_ERROR(-EINVAL); 771 return ret;
634 } 772}
635 773
636 if (!need_i_mutex && (mapping->nrpages || pos > ip->i_size)) { 774STATIC ssize_t
637 xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); 775xfs_file_buffered_aio_write(
638 iolock = XFS_IOLOCK_EXCL; 776 struct kiocb *iocb,
639 need_i_mutex = 1; 777 const struct iovec *iovp,
640 mutex_lock(&inode->i_mutex); 778 unsigned long nr_segs,
641 xfs_ilock(ip, XFS_ILOCK_EXCL|iolock); 779 loff_t pos,
642 goto start; 780 size_t ocount,
643 } 781 int *iolock)
644 } 782{
783 struct file *file = iocb->ki_filp;
784 struct address_space *mapping = file->f_mapping;
785 struct inode *inode = mapping->host;
786 struct xfs_inode *ip = XFS_I(inode);
787 ssize_t ret;
788 int enospc = 0;
789 size_t count = ocount;
645 790
646 new_size = pos + count; 791 *iolock = XFS_IOLOCK_EXCL;
647 if (new_size > ip->i_size) 792 xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
648 ip->i_new_size = new_size;
649 793
650 if (likely(!(ioflags & IO_INVIS))) 794 ret = xfs_file_aio_write_checks(file, &pos, &count, iolock);
651 file_update_time(file); 795 if (ret)
796 return ret;
652 797
798 /* We can write back this queue in page reclaim */
799 current->backing_dev_info = mapping->backing_dev_info;
800
801write_retry:
802 trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0);
803 ret = generic_file_buffered_write(iocb, iovp, nr_segs,
804 pos, &iocb->ki_pos, count, ret);
653 /* 805 /*
654 * If the offset is beyond the size of the file, we have a couple 806 * if we just got an ENOSPC, flush the inode now we aren't holding any
655 * of things to do. First, if there is already space allocated 807 * page locks and retry *once*
656 * we need to either create holes or zero the disk or ...
657 *
658 * If there is a page where the previous size lands, we need
659 * to zero it out up to the new size.
660 */ 808 */
661 809 if (ret == -ENOSPC && !enospc) {
662 if (pos > ip->i_size) { 810 ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
663 error = xfs_zero_eof(ip, pos, ip->i_size); 811 if (ret)
664 if (error) { 812 return ret;
665 xfs_iunlock(ip, XFS_ILOCK_EXCL); 813 enospc = 1;
666 goto out_unlock_internal; 814 goto write_retry;
667 }
668 } 815 }
669 xfs_iunlock(ip, XFS_ILOCK_EXCL); 816 current->backing_dev_info = NULL;
817 return ret;
818}
670 819
671 /* 820STATIC ssize_t
672 * If we're writing the file then make sure to clear the 821xfs_file_aio_write(
673 * setuid and setgid bits if the process is not being run 822 struct kiocb *iocb,
674 * by root. This keeps people from modifying setuid and 823 const struct iovec *iovp,
675 * setgid binaries. 824 unsigned long nr_segs,
676 */ 825 loff_t pos)
677 error = -file_remove_suid(file); 826{
678 if (unlikely(error)) 827 struct file *file = iocb->ki_filp;
679 goto out_unlock_internal; 828 struct address_space *mapping = file->f_mapping;
829 struct inode *inode = mapping->host;
830 struct xfs_inode *ip = XFS_I(inode);
831 ssize_t ret;
832 int iolock;
833 size_t ocount = 0;
680 834
681 /* We can write back this queue in page reclaim */ 835 XFS_STATS_INC(xs_write_calls);
682 current->backing_dev_info = mapping->backing_dev_info;
683 836
684 if ((ioflags & IO_ISDIRECT)) { 837 BUG_ON(iocb->ki_pos != pos);
685 if (mapping->nrpages) {
686 WARN_ON(need_i_mutex == 0);
687 error = xfs_flushinval_pages(ip,
688 (pos & PAGE_CACHE_MASK),
689 -1, FI_REMAPF_LOCKED);
690 if (error)
691 goto out_unlock_internal;
692 }
693 838
694 if (need_i_mutex) { 839 ret = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ);
695 /* demote the lock now the cached pages are gone */ 840 if (ret)
696 xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); 841 return ret;
697 mutex_unlock(&inode->i_mutex);
698 842
699 iolock = XFS_IOLOCK_SHARED; 843 if (ocount == 0)
700 need_i_mutex = 0; 844 return 0;
701 }
702 845
703 trace_xfs_file_direct_write(ip, count, iocb->ki_pos, ioflags); 846 xfs_wait_for_freeze(ip->i_mount, SB_FREEZE_WRITE);
704 ret = generic_file_direct_write(iocb, iovp,
705 &nr_segs, pos, &iocb->ki_pos, count, ocount);
706 847
707 /* 848 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
708 * direct-io write to a hole: fall through to buffered I/O 849 return -EIO;
709 * for completing the rest of the request.
710 */
711 if (ret >= 0 && ret != count) {
712 XFS_STATS_ADD(xs_write_bytes, ret);
713 850
714 pos += ret; 851 if (unlikely(file->f_flags & O_DIRECT))
715 count -= ret; 852 ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos,
853 ocount, &iolock);
854 else
855 ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos,
856 ocount, &iolock);
716 857
717 ioflags &= ~IO_ISDIRECT; 858 xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret);
718 xfs_iunlock(ip, iolock);
719 goto relock;
720 }
721 } else {
722 int enospc = 0;
723 ssize_t ret2 = 0;
724 859
725write_retry: 860 if (ret <= 0)
726 trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, ioflags); 861 goto out_unlock;
727 ret2 = generic_file_buffered_write(iocb, iovp, nr_segs,
728 pos, &iocb->ki_pos, count, ret);
729 /*
730 * if we just got an ENOSPC, flush the inode now we
731 * aren't holding any page locks and retry *once*
732 */
733 if (ret2 == -ENOSPC && !enospc) {
734 error = xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
735 if (error)
736 goto out_unlock_internal;
737 enospc = 1;
738 goto write_retry;
739 }
740 ret = ret2;
741 }
742 862
743 current->backing_dev_info = NULL; 863 /* Handle various SYNC-type writes */
864 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
865 loff_t end = pos + ret - 1;
866 int error, error2;
744 867
745 isize = i_size_read(inode); 868 xfs_rw_iunlock(ip, iolock);
746 if (unlikely(ret < 0 && ret != -EFAULT && iocb->ki_pos > isize)) 869 error = filemap_write_and_wait_range(mapping, pos, end);
747 iocb->ki_pos = isize; 870 xfs_rw_ilock(ip, iolock);
748 871
749 if (iocb->ki_pos > ip->i_size) { 872 error2 = -xfs_file_fsync(file,
750 xfs_ilock(ip, XFS_ILOCK_EXCL); 873 (file->f_flags & __O_SYNC) ? 0 : 1);
751 if (iocb->ki_pos > ip->i_size) 874 if (error)
752 ip->i_size = iocb->ki_pos; 875 ret = error;
753 xfs_iunlock(ip, XFS_ILOCK_EXCL); 876 else if (error2)
877 ret = error2;
754 } 878 }
755 879
756 error = -ret; 880out_unlock:
757 if (ret <= 0) 881 xfs_aio_write_newsize_update(ip);
758 goto out_unlock_internal; 882 xfs_rw_iunlock(ip, iolock);
883 return ret;
884}
759 885
760 XFS_STATS_ADD(xs_write_bytes, ret); 886STATIC long
887xfs_file_fallocate(
888 struct file *file,
889 int mode,
890 loff_t offset,
891 loff_t len)
892{
893 struct inode *inode = file->f_path.dentry->d_inode;
894 long error;
895 loff_t new_size = 0;
896 xfs_flock64_t bf;
897 xfs_inode_t *ip = XFS_I(inode);
898 int cmd = XFS_IOC_RESVSP;
899 int attr_flags = XFS_ATTR_NOLOCK;
761 900
762 /* Handle various SYNC-type writes */ 901 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
763 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { 902 return -EOPNOTSUPP;
764 loff_t end = pos + ret - 1;
765 int error2;
766 903
767 xfs_iunlock(ip, iolock); 904 bf.l_whence = 0;
768 if (need_i_mutex) 905 bf.l_start = offset;
769 mutex_unlock(&inode->i_mutex); 906 bf.l_len = len;
770 907
771 error2 = filemap_write_and_wait_range(mapping, pos, end); 908 xfs_ilock(ip, XFS_IOLOCK_EXCL);
772 if (!error)
773 error = error2;
774 if (need_i_mutex)
775 mutex_lock(&inode->i_mutex);
776 xfs_ilock(ip, iolock);
777 909
778 error2 = -xfs_file_fsync(file, 910 if (mode & FALLOC_FL_PUNCH_HOLE)
779 (file->f_flags & __O_SYNC) ? 0 : 1); 911 cmd = XFS_IOC_UNRESVSP;
780 if (!error) 912
781 error = error2; 913 /* check the new inode size is valid before allocating */
914 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
915 offset + len > i_size_read(inode)) {
916 new_size = offset + len;
917 error = inode_newsize_ok(inode, new_size);
918 if (error)
919 goto out_unlock;
782 } 920 }
783 921
784 out_unlock_internal: 922 if (file->f_flags & O_DSYNC)
785 if (ip->i_new_size) { 923 attr_flags |= XFS_ATTR_SYNC;
786 xfs_ilock(ip, XFS_ILOCK_EXCL); 924
787 ip->i_new_size = 0; 925 error = -xfs_change_file_space(ip, cmd, &bf, 0, attr_flags);
788 /* 926 if (error)
789 * If this was a direct or synchronous I/O that failed (such 927 goto out_unlock;
790 * as ENOSPC) then part of the I/O may have been written to 928
 791 * disk before the error occurred. In this case the on-disk 929 /* Change file size if needed */
792 * file size may have been adjusted beyond the in-memory file 930 if (new_size) {
793 * size and now needs to be truncated back. 931 struct iattr iattr;
794 */ 932
795 if (ip->i_d.di_size > ip->i_size) 933 iattr.ia_valid = ATTR_SIZE;
796 ip->i_d.di_size = ip->i_size; 934 iattr.ia_size = new_size;
797 xfs_iunlock(ip, XFS_ILOCK_EXCL); 935 error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK);
798 } 936 }
799 xfs_iunlock(ip, iolock); 937
800 out_unlock_mutex: 938out_unlock:
801 if (need_i_mutex) 939 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
802 mutex_unlock(&inode->i_mutex); 940 return error;
803 return -error;
804} 941}
805 942
943
806STATIC int 944STATIC int
807xfs_file_open( 945xfs_file_open(
808 struct inode *inode, 946 struct inode *inode,
@@ -921,6 +1059,7 @@ const struct file_operations xfs_file_operations = {
921 .open = xfs_file_open, 1059 .open = xfs_file_open,
922 .release = xfs_file_release, 1060 .release = xfs_file_release,
923 .fsync = xfs_file_fsync, 1061 .fsync = xfs_file_fsync,
1062 .fallocate = xfs_file_fallocate,
924}; 1063};
925 1064
926const struct file_operations xfs_dir_file_operations = { 1065const struct file_operations xfs_dir_file_operations = {
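The rewritten write path above splits direct and buffered writes, and the direct IO case picks its iolock up front: unaligned IO, cached pages, or writing beyond EOF all force IOLOCK_EXCL, while everything else runs under IOLOCK_SHARED (and the exclusive lock is demoted back to shared once the page cache has been flushed, unless the IO is unaligned). A small standalone model of that decision, illustrative names only:

/* Illustrative model of the iolock choice in xfs_file_dio_aio_write(). */
#include <stdbool.h>
#include <stddef.h>

enum iolock { IOLOCK_SHARED, IOLOCK_EXCL };

/*
 * Unaligned IO needs sub-block zeroing in the dio layer, cached pages
 * need flushing, and writes beyond EOF need the gap zeroed -- all of
 * which require exclusive access.  Everything else can run shared and
 * therefore in parallel with reads and other direct IO writes.
 */
static enum iolock dio_write_iolock(unsigned long long pos, size_t count,
				    unsigned long long blockmask,
				    bool cached_pages, bool beyond_eof)
{
	bool unaligned = (pos & blockmask) || ((pos + count) & blockmask);

	if (unaligned || cached_pages || beyond_eof)
		return IOLOCK_EXCL;
	return IOLOCK_SHARED;
}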
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index ad442d9e392e..acca2c5ca3fa 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -39,6 +39,7 @@
39#include "xfs_dfrag.h" 39#include "xfs_dfrag.h"
40#include "xfs_fsops.h" 40#include "xfs_fsops.h"
41#include "xfs_vnodeops.h" 41#include "xfs_vnodeops.h"
42#include "xfs_discard.h"
42#include "xfs_quota.h" 43#include "xfs_quota.h"
43#include "xfs_inode_item.h" 44#include "xfs_inode_item.h"
44#include "xfs_export.h" 45#include "xfs_export.h"
@@ -623,6 +624,10 @@ xfs_ioc_space(
623 624
624 if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) 625 if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
625 attr_flags |= XFS_ATTR_NONBLOCK; 626 attr_flags |= XFS_ATTR_NONBLOCK;
627
628 if (filp->f_flags & O_DSYNC)
629 attr_flags |= XFS_ATTR_SYNC;
630
626 if (ioflags & IO_INVIS) 631 if (ioflags & IO_INVIS)
627 attr_flags |= XFS_ATTR_DMI; 632 attr_flags |= XFS_ATTR_DMI;
628 633
@@ -694,14 +699,19 @@ xfs_ioc_fsgeometry_v1(
694 xfs_mount_t *mp, 699 xfs_mount_t *mp,
695 void __user *arg) 700 void __user *arg)
696{ 701{
697 xfs_fsop_geom_v1_t fsgeo; 702 xfs_fsop_geom_t fsgeo;
698 int error; 703 int error;
699 704
700 error = xfs_fs_geometry(mp, (xfs_fsop_geom_t *)&fsgeo, 3); 705 error = xfs_fs_geometry(mp, &fsgeo, 3);
701 if (error) 706 if (error)
702 return -error; 707 return -error;
703 708
704 if (copy_to_user(arg, &fsgeo, sizeof(fsgeo))) 709 /*
710 * Caller should have passed an argument of type
711 * xfs_fsop_geom_v1_t. This is a proper subset of the
712 * xfs_fsop_geom_t that xfs_fs_geometry() fills in.
713 */
714 if (copy_to_user(arg, &fsgeo, sizeof(xfs_fsop_geom_v1_t)))
705 return -XFS_ERROR(EFAULT); 715 return -XFS_ERROR(EFAULT);
706 return 0; 716 return 0;
707} 717}
@@ -984,10 +994,22 @@ xfs_ioctl_setattr(
984 994
985 /* 995 /*
986 * Extent size must be a multiple of the appropriate block 996 * Extent size must be a multiple of the appropriate block
987 * size, if set at all. 997 * size, if set at all. It must also be smaller than the
998 * maximum extent size supported by the filesystem.
999 *
1000 * Also, for non-realtime files, limit the extent size hint to
1001 * half the size of the AGs in the filesystem so alignment
1002 * doesn't result in extents larger than an AG.
988 */ 1003 */
989 if (fa->fsx_extsize != 0) { 1004 if (fa->fsx_extsize != 0) {
990 xfs_extlen_t size; 1005 xfs_extlen_t size;
1006 xfs_fsblock_t extsize_fsb;
1007
1008 extsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_extsize);
1009 if (extsize_fsb > MAXEXTLEN) {
1010 code = XFS_ERROR(EINVAL);
1011 goto error_return;
1012 }
991 1013
992 if (XFS_IS_REALTIME_INODE(ip) || 1014 if (XFS_IS_REALTIME_INODE(ip) ||
993 ((mask & FSX_XFLAGS) && 1015 ((mask & FSX_XFLAGS) &&
@@ -996,6 +1018,10 @@ xfs_ioctl_setattr(
996 mp->m_sb.sb_blocklog; 1018 mp->m_sb.sb_blocklog;
997 } else { 1019 } else {
998 size = mp->m_sb.sb_blocksize; 1020 size = mp->m_sb.sb_blocksize;
1021 if (extsize_fsb > mp->m_sb.sb_agblocks / 2) {
1022 code = XFS_ERROR(EINVAL);
1023 goto error_return;
1024 }
999 } 1025 }
1000 1026
1001 if (fa->fsx_extsize % size) { 1027 if (fa->fsx_extsize % size) {
@@ -1294,6 +1320,8 @@ xfs_file_ioctl(
1294 trace_xfs_file_ioctl(ip); 1320 trace_xfs_file_ioctl(ip);
1295 1321
1296 switch (cmd) { 1322 switch (cmd) {
1323 case FITRIM:
1324 return xfs_ioc_trim(mp, arg);
1297 case XFS_IOC_ALLOCSP: 1325 case XFS_IOC_ALLOCSP:
1298 case XFS_IOC_FREESP: 1326 case XFS_IOC_FREESP:
1299 case XFS_IOC_RESVSP: 1327 case XFS_IOC_RESVSP:
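
The new FITRIM case dispatches batched discard requests to xfs_ioc_trim(). A minimal userspace sketch of how the ioctl might be issued is shown below; the mount-point argument and the choice to trim the whole filesystem are illustrative, and the byte count reported back in range.len is only meaningful when the call succeeds.

#include <fcntl.h>
#include <linux/fs.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(int argc, char **argv)
{
        struct fstrim_range range;
        int fd;

        if (argc != 2) {
                fprintf(stderr, "usage: %s <mountpoint>\n", argv[0]);
                return 1;
        }
        fd = open(argv[1], O_RDONLY);
        if (fd < 0) {
                perror("open");
                return 1;
        }
        memset(&range, 0, sizeof(range));
        range.start = 0;
        range.len = (__u64)-1;  /* trim the whole filesystem */
        range.minlen = 0;       /* let the filesystem pick a minimum extent */
        if (ioctl(fd, FITRIM, &range) < 0)
                perror("FITRIM");
        else
                printf("trimmed %llu bytes\n",
                       (unsigned long long)range.len);
        close(fd);
        return 0;
}
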
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 94d5fd6a2973..dd21784525a8 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -46,7 +46,6 @@
46#include <linux/namei.h> 46#include <linux/namei.h>
47#include <linux/posix_acl.h> 47#include <linux/posix_acl.h>
48#include <linux/security.h> 48#include <linux/security.h>
49#include <linux/falloc.h>
50#include <linux/fiemap.h> 49#include <linux/fiemap.h>
51#include <linux/slab.h> 50#include <linux/slab.h>
52 51
@@ -71,7 +70,7 @@ xfs_synchronize_times(
71 70
72/* 71/*
73 * If the linux inode is valid, mark it dirty. 72 * If the linux inode is valid, mark it dirty.
74 * Used when commiting a dirty inode into a transaction so that 73 * Used when committing a dirty inode into a transaction so that
75 * the inode will get written back by the linux code 74 * the inode will get written back by the linux code
76 */ 75 */
77void 76void
@@ -103,7 +102,8 @@ xfs_mark_inode_dirty(
103STATIC int 102STATIC int
104xfs_init_security( 103xfs_init_security(
105 struct inode *inode, 104 struct inode *inode,
106 struct inode *dir) 105 struct inode *dir,
106 const struct qstr *qstr)
107{ 107{
108 struct xfs_inode *ip = XFS_I(inode); 108 struct xfs_inode *ip = XFS_I(inode);
109 size_t length; 109 size_t length;
@@ -111,7 +111,7 @@ xfs_init_security(
111 unsigned char *name; 111 unsigned char *name;
112 int error; 112 int error;
113 113
114 error = security_inode_init_security(inode, dir, (char **)&name, 114 error = security_inode_init_security(inode, dir, qstr, (char **)&name,
115 &value, &length); 115 &value, &length);
116 if (error) { 116 if (error) {
117 if (error == -EOPNOTSUPP) 117 if (error == -EOPNOTSUPP)
@@ -195,7 +195,7 @@ xfs_vn_mknod(
195 195
196 inode = VFS_I(ip); 196 inode = VFS_I(ip);
197 197
198 error = xfs_init_security(inode, dir); 198 error = xfs_init_security(inode, dir, &dentry->d_name);
199 if (unlikely(error)) 199 if (unlikely(error))
200 goto out_cleanup_inode; 200 goto out_cleanup_inode;
201 201
@@ -368,7 +368,7 @@ xfs_vn_symlink(
368 368
369 inode = VFS_I(cip); 369 inode = VFS_I(cip);
370 370
371 error = xfs_init_security(inode, dir); 371 error = xfs_init_security(inode, dir, &dentry->d_name);
372 if (unlikely(error)) 372 if (unlikely(error))
373 goto out_cleanup_inode; 373 goto out_cleanup_inode;
374 374
@@ -505,58 +505,6 @@ xfs_vn_setattr(
505 return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0); 505 return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0);
506} 506}
507 507
508STATIC long
509xfs_vn_fallocate(
510 struct inode *inode,
511 int mode,
512 loff_t offset,
513 loff_t len)
514{
515 long error;
516 loff_t new_size = 0;
517 xfs_flock64_t bf;
518 xfs_inode_t *ip = XFS_I(inode);
519
520 /* preallocation on directories not yet supported */
521 error = -ENODEV;
522 if (S_ISDIR(inode->i_mode))
523 goto out_error;
524
525 bf.l_whence = 0;
526 bf.l_start = offset;
527 bf.l_len = len;
528
529 xfs_ilock(ip, XFS_IOLOCK_EXCL);
530
531 /* check the new inode size is valid before allocating */
532 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
533 offset + len > i_size_read(inode)) {
534 new_size = offset + len;
535 error = inode_newsize_ok(inode, new_size);
536 if (error)
537 goto out_unlock;
538 }
539
540 error = -xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf,
541 0, XFS_ATTR_NOLOCK);
542 if (error)
543 goto out_unlock;
544
545 /* Change file size if needed */
546 if (new_size) {
547 struct iattr iattr;
548
549 iattr.ia_valid = ATTR_SIZE;
550 iattr.ia_size = new_size;
551 error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK);
552 }
553
554out_unlock:
555 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
556out_error:
557 return error;
558}
559
560#define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) 508#define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
561 509
562/* 510/*
@@ -650,7 +598,6 @@ static const struct inode_operations xfs_inode_operations = {
650 .getxattr = generic_getxattr, 598 .getxattr = generic_getxattr,
651 .removexattr = generic_removexattr, 599 .removexattr = generic_removexattr,
652 .listxattr = xfs_vn_listxattr, 600 .listxattr = xfs_vn_listxattr,
653 .fallocate = xfs_vn_fallocate,
654 .fiemap = xfs_vn_fiemap, 601 .fiemap = xfs_vn_fiemap,
655}; 602};
656 603
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index 214ddd71ff79..244be9cbfe78 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -37,10 +37,8 @@
37 37
38#include <kmem.h> 38#include <kmem.h>
39#include <mrlock.h> 39#include <mrlock.h>
40#include <sv.h>
41#include <time.h> 40#include <time.h>
42 41
43#include <support/debug.h>
44#include <support/uuid.h> 42#include <support/uuid.h>
45 43
46#include <linux/semaphore.h> 44#include <linux/semaphore.h>
@@ -87,6 +85,7 @@
87#include <xfs_aops.h> 85#include <xfs_aops.h>
88#include <xfs_super.h> 86#include <xfs_super.h>
89#include <xfs_buf.h> 87#include <xfs_buf.h>
88#include <xfs_message.h>
90 89
91/* 90/*
92 * Feature macros (disable/enable) 91 * Feature macros (disable/enable)
@@ -281,4 +280,25 @@ static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y)
281#define __arch_pack 280#define __arch_pack
282#endif 281#endif
283 282
283#define ASSERT_ALWAYS(expr) \
284 (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
285
286#ifndef DEBUG
287#define ASSERT(expr) ((void)0)
288
289#ifndef STATIC
290# define STATIC static noinline
291#endif
292
293#else /* DEBUG */
294
295#define ASSERT(expr) \
296 (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
297
298#ifndef STATIC
299# define STATIC noinline
300#endif
301
302#endif /* DEBUG */
303
284#endif /* __XFS_LINUX__ */ 304#endif /* __XFS_LINUX__ */
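
With support/debug.h gone, the ASSERT/ASSERT_ALWAYS macros now live in xfs_linux.h and ASSERT compiles down to nothing unless DEBUG is defined. The userspace sketch below mirrors that compile-away pattern; the MY_* names and the my_assfail() helper are invented for the example, and the kernel's unlikely() annotation is dropped for portability.

#include <stdio.h>
#include <stdlib.h>

static void my_assfail(const char *expr, const char *file, int line)
{
        fprintf(stderr, "Assertion failed: %s, file: %s, line: %d\n",
                expr, file, line);
        abort();
}

#define MY_ASSERT_ALWAYS(expr) \
        ((expr) ? (void)0 : my_assfail(#expr, __FILE__, __LINE__))

#ifdef DEBUG
# define MY_ASSERT(expr)  MY_ASSERT_ALWAYS(expr)
#else
# define MY_ASSERT(expr)  ((void)0)
#endif

int main(void)
{
        MY_ASSERT(1 + 1 == 2);        /* compiled out unless built with -DDEBUG */
        MY_ASSERT_ALWAYS(2 + 2 == 4); /* always checked */
        printf("assertions passed\n");
        return 0;
}
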
diff --git a/fs/xfs/linux-2.6/xfs_message.c b/fs/xfs/linux-2.6/xfs_message.c
new file mode 100644
index 000000000000..9f76cceb678d
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_message.c
@@ -0,0 +1,126 @@
1/*
2 * Copyright (c) 2011 Red Hat, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write the Free Software Foundation,
15 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
16 */
17
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_types.h"
21#include "xfs_log.h"
22#include "xfs_inum.h"
23#include "xfs_trans.h"
24#include "xfs_sb.h"
25#include "xfs_ag.h"
26#include "xfs_mount.h"
27
28/*
29 * XFS logging functions
30 */
31static void
32__xfs_printk(
33 const char *level,
34 const struct xfs_mount *mp,
35 struct va_format *vaf)
36{
37 if (mp && mp->m_fsname) {
38 printk("%sXFS (%s): %pV\n", level, mp->m_fsname, vaf);
39 return;
40 }
41 printk("%sXFS: %pV\n", level, vaf);
42}
43
44void xfs_printk(
45 const char *level,
46 const struct xfs_mount *mp,
47 const char *fmt, ...)
48{
49 struct va_format vaf;
50 va_list args;
51
52 va_start(args, fmt);
53
54 vaf.fmt = fmt;
55 vaf.va = &args;
56
57 __xfs_printk(level, mp, &vaf);
58 va_end(args);
59}
60
61#define define_xfs_printk_level(func, kern_level) \
62void func(const struct xfs_mount *mp, const char *fmt, ...) \
63{ \
64 struct va_format vaf; \
65 va_list args; \
66 \
67 va_start(args, fmt); \
68 \
69 vaf.fmt = fmt; \
70 vaf.va = &args; \
71 \
72 __xfs_printk(kern_level, mp, &vaf); \
73 va_end(args); \
74} \
75
76define_xfs_printk_level(xfs_emerg, KERN_EMERG);
77define_xfs_printk_level(xfs_alert, KERN_ALERT);
78define_xfs_printk_level(xfs_crit, KERN_CRIT);
79define_xfs_printk_level(xfs_err, KERN_ERR);
80define_xfs_printk_level(xfs_warn, KERN_WARNING);
81define_xfs_printk_level(xfs_notice, KERN_NOTICE);
82define_xfs_printk_level(xfs_info, KERN_INFO);
83#ifdef DEBUG
84define_xfs_printk_level(xfs_debug, KERN_DEBUG);
85#endif
86
87void
88xfs_alert_tag(
89 const struct xfs_mount *mp,
90 int panic_tag,
91 const char *fmt, ...)
92{
93 struct va_format vaf;
94 va_list args;
95 int do_panic = 0;
96
97 if (xfs_panic_mask && (xfs_panic_mask & panic_tag)) {
98 xfs_printk(KERN_ALERT, mp,
99 "XFS: Transforming an alert into a BUG.");
100 do_panic = 1;
101 }
102
103 va_start(args, fmt);
104
105 vaf.fmt = fmt;
106 vaf.va = &args;
107
108 __xfs_printk(KERN_ALERT, mp, &vaf);
109 va_end(args);
110
111 BUG_ON(do_panic);
112}
113
114void
115assfail(char *expr, char *file, int line)
116{
117 xfs_emerg(NULL, "Assertion failed: %s, file: %s, line: %d",
118 expr, file, line);
119 BUG();
120}
121
122void
123xfs_hex_dump(void *p, int length)
124{
125 print_hex_dump(KERN_ALERT, "", DUMP_PREFIX_ADDRESS, 16, 1, p, length, 1);
126}
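
The new xfs_message.c centralises logging behind a single __xfs_printk() helper and generates one thin wrapper per severity with a macro. The kernel version forwards the va_list through struct va_format and printk's %pV extension; since there is no userspace equivalent, the sketch below approximates the same wrapper-generation pattern with vfprintf(). All demo_* names and the numeric level prefixes are illustrative only.

#include <stdarg.h>
#include <stdio.h>

/* Userspace stand-in for __xfs_printk(): prefix with level and fs name. */
static void demo_printk(const char *level, const char *fsname,
                        const char *fmt, va_list args)
{
        if (fsname)
                fprintf(stderr, "%sXFS (%s): ", level, fsname);
        else
                fprintf(stderr, "%sXFS: ", level);
        vfprintf(stderr, fmt, args);
        fputc('\n', stderr);
}

/* Generate one variadic wrapper per log level, as define_xfs_printk_level does. */
#define define_demo_level(func, level)                          \
static void func(const char *fsname, const char *fmt, ...)     \
{                                                               \
        va_list args;                                           \
        va_start(args, fmt);                                    \
        demo_printk(level, fsname, fmt, args);                  \
        va_end(args);                                           \
}

define_demo_level(demo_warn, "<4>")
define_demo_level(demo_info, "<6>")

int main(void)
{
        demo_warn("sda1", "%s option requires an argument", "logbufs");
        demo_info(NULL, "mounted with %d log buffers", 8);
        return 0;
}
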
diff --git a/fs/xfs/linux-2.6/xfs_message.h b/fs/xfs/linux-2.6/xfs_message.h
new file mode 100644
index 000000000000..f1b3fc1b6c4e
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_message.h
@@ -0,0 +1,40 @@
1#ifndef __XFS_MESSAGE_H
2#define __XFS_MESSAGE_H 1
3
4struct xfs_mount;
5
6extern void xfs_printk(const char *level, const struct xfs_mount *mp,
7 const char *fmt, ...)
8 __attribute__ ((format (printf, 3, 4)));
9extern void xfs_emerg(const struct xfs_mount *mp, const char *fmt, ...)
10 __attribute__ ((format (printf, 2, 3)));
11extern void xfs_alert(const struct xfs_mount *mp, const char *fmt, ...)
12 __attribute__ ((format (printf, 2, 3)));
13extern void xfs_alert_tag(const struct xfs_mount *mp, int tag,
14 const char *fmt, ...)
15 __attribute__ ((format (printf, 3, 4)));
16extern void xfs_crit(const struct xfs_mount *mp, const char *fmt, ...)
17 __attribute__ ((format (printf, 2, 3)));
18extern void xfs_err(const struct xfs_mount *mp, const char *fmt, ...)
19 __attribute__ ((format (printf, 2, 3)));
20extern void xfs_warn(const struct xfs_mount *mp, const char *fmt, ...)
21 __attribute__ ((format (printf, 2, 3)));
22extern void xfs_notice(const struct xfs_mount *mp, const char *fmt, ...)
23 __attribute__ ((format (printf, 2, 3)));
24extern void xfs_info(const struct xfs_mount *mp, const char *fmt, ...)
25 __attribute__ ((format (printf, 2, 3)));
26
27#ifdef DEBUG
28extern void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...)
29 __attribute__ ((format (printf, 2, 3)));
30#else
31static inline void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...)
32{
33}
34#endif
35
36extern void assfail(char *expr, char *f, int l);
37
38extern void xfs_hex_dump(void *p, int length);
39
40#endif /* __XFS_MESSAGE_H */
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 064f964d4f3c..b38e58d02299 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -173,6 +173,15 @@ xfs_parseargs(
173 __uint8_t iosizelog = 0; 173 __uint8_t iosizelog = 0;
174 174
175 /* 175 /*
176 * set up the mount name first so all the errors will refer to the
177 * correct device.
178 */
179 mp->m_fsname = kstrndup(sb->s_id, MAXNAMELEN, GFP_KERNEL);
180 if (!mp->m_fsname)
181 return ENOMEM;
182 mp->m_fsname_len = strlen(mp->m_fsname) + 1;
183
184 /*
176 * Copy binary VFS mount flags we are interested in. 185 * Copy binary VFS mount flags we are interested in.
177 */ 186 */
178 if (sb->s_flags & MS_RDONLY) 187 if (sb->s_flags & MS_RDONLY)
@@ -189,6 +198,7 @@ xfs_parseargs(
189 mp->m_flags |= XFS_MOUNT_BARRIER; 198 mp->m_flags |= XFS_MOUNT_BARRIER;
190 mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE; 199 mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE;
191 mp->m_flags |= XFS_MOUNT_SMALL_INUMS; 200 mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
201 mp->m_flags |= XFS_MOUNT_DELAYLOG;
192 202
193 /* 203 /*
194 * These can be overridden by the mount option parsing. 204 * These can be overridden by the mount option parsing.
@@ -207,24 +217,21 @@ xfs_parseargs(
207 217
208 if (!strcmp(this_char, MNTOPT_LOGBUFS)) { 218 if (!strcmp(this_char, MNTOPT_LOGBUFS)) {
209 if (!value || !*value) { 219 if (!value || !*value) {
210 cmn_err(CE_WARN, 220 xfs_warn(mp, "%s option requires an argument",
211 "XFS: %s option requires an argument",
212 this_char); 221 this_char);
213 return EINVAL; 222 return EINVAL;
214 } 223 }
215 mp->m_logbufs = simple_strtoul(value, &eov, 10); 224 mp->m_logbufs = simple_strtoul(value, &eov, 10);
216 } else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) { 225 } else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) {
217 if (!value || !*value) { 226 if (!value || !*value) {
218 cmn_err(CE_WARN, 227 xfs_warn(mp, "%s option requires an argument",
219 "XFS: %s option requires an argument",
220 this_char); 228 this_char);
221 return EINVAL; 229 return EINVAL;
222 } 230 }
223 mp->m_logbsize = suffix_strtoul(value, &eov, 10); 231 mp->m_logbsize = suffix_strtoul(value, &eov, 10);
224 } else if (!strcmp(this_char, MNTOPT_LOGDEV)) { 232 } else if (!strcmp(this_char, MNTOPT_LOGDEV)) {
225 if (!value || !*value) { 233 if (!value || !*value) {
226 cmn_err(CE_WARN, 234 xfs_warn(mp, "%s option requires an argument",
227 "XFS: %s option requires an argument",
228 this_char); 235 this_char);
229 return EINVAL; 236 return EINVAL;
230 } 237 }
@@ -232,14 +239,12 @@ xfs_parseargs(
232 if (!mp->m_logname) 239 if (!mp->m_logname)
233 return ENOMEM; 240 return ENOMEM;
234 } else if (!strcmp(this_char, MNTOPT_MTPT)) { 241 } else if (!strcmp(this_char, MNTOPT_MTPT)) {
235 cmn_err(CE_WARN, 242 xfs_warn(mp, "%s option not allowed on this system",
236 "XFS: %s option not allowed on this system",
237 this_char); 243 this_char);
238 return EINVAL; 244 return EINVAL;
239 } else if (!strcmp(this_char, MNTOPT_RTDEV)) { 245 } else if (!strcmp(this_char, MNTOPT_RTDEV)) {
240 if (!value || !*value) { 246 if (!value || !*value) {
241 cmn_err(CE_WARN, 247 xfs_warn(mp, "%s option requires an argument",
242 "XFS: %s option requires an argument",
243 this_char); 248 this_char);
244 return EINVAL; 249 return EINVAL;
245 } 250 }
@@ -248,8 +253,7 @@ xfs_parseargs(
248 return ENOMEM; 253 return ENOMEM;
249 } else if (!strcmp(this_char, MNTOPT_BIOSIZE)) { 254 } else if (!strcmp(this_char, MNTOPT_BIOSIZE)) {
250 if (!value || !*value) { 255 if (!value || !*value) {
251 cmn_err(CE_WARN, 256 xfs_warn(mp, "%s option requires an argument",
252 "XFS: %s option requires an argument",
253 this_char); 257 this_char);
254 return EINVAL; 258 return EINVAL;
255 } 259 }
@@ -257,8 +261,7 @@ xfs_parseargs(
257 iosizelog = ffs(iosize) - 1; 261 iosizelog = ffs(iosize) - 1;
258 } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) { 262 } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) {
259 if (!value || !*value) { 263 if (!value || !*value) {
260 cmn_err(CE_WARN, 264 xfs_warn(mp, "%s option requires an argument",
261 "XFS: %s option requires an argument",
262 this_char); 265 this_char);
263 return EINVAL; 266 return EINVAL;
264 } 267 }
@@ -280,16 +283,14 @@ xfs_parseargs(
280 mp->m_flags |= XFS_MOUNT_SWALLOC; 283 mp->m_flags |= XFS_MOUNT_SWALLOC;
281 } else if (!strcmp(this_char, MNTOPT_SUNIT)) { 284 } else if (!strcmp(this_char, MNTOPT_SUNIT)) {
282 if (!value || !*value) { 285 if (!value || !*value) {
283 cmn_err(CE_WARN, 286 xfs_warn(mp, "%s option requires an argument",
284 "XFS: %s option requires an argument",
285 this_char); 287 this_char);
286 return EINVAL; 288 return EINVAL;
287 } 289 }
288 dsunit = simple_strtoul(value, &eov, 10); 290 dsunit = simple_strtoul(value, &eov, 10);
289 } else if (!strcmp(this_char, MNTOPT_SWIDTH)) { 291 } else if (!strcmp(this_char, MNTOPT_SWIDTH)) {
290 if (!value || !*value) { 292 if (!value || !*value) {
291 cmn_err(CE_WARN, 293 xfs_warn(mp, "%s option requires an argument",
292 "XFS: %s option requires an argument",
293 this_char); 294 this_char);
294 return EINVAL; 295 return EINVAL;
295 } 296 }
@@ -297,8 +298,7 @@ xfs_parseargs(
297 } else if (!strcmp(this_char, MNTOPT_64BITINODE)) { 298 } else if (!strcmp(this_char, MNTOPT_64BITINODE)) {
298 mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS; 299 mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS;
299#if !XFS_BIG_INUMS 300#if !XFS_BIG_INUMS
300 cmn_err(CE_WARN, 301 xfs_warn(mp, "%s option not allowed on this system",
301 "XFS: %s option not allowed on this system",
302 this_char); 302 this_char);
303 return EINVAL; 303 return EINVAL;
304#endif 304#endif
@@ -356,20 +356,19 @@ xfs_parseargs(
356 } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) { 356 } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) {
357 mp->m_flags &= ~XFS_MOUNT_DELAYLOG; 357 mp->m_flags &= ~XFS_MOUNT_DELAYLOG;
358 } else if (!strcmp(this_char, "ihashsize")) { 358 } else if (!strcmp(this_char, "ihashsize")) {
359 cmn_err(CE_WARN, 359 xfs_warn(mp,
360 "XFS: ihashsize no longer used, option is deprecated."); 360 "ihashsize no longer used, option is deprecated.");
361 } else if (!strcmp(this_char, "osyncisdsync")) { 361 } else if (!strcmp(this_char, "osyncisdsync")) {
362 cmn_err(CE_WARN, 362 xfs_warn(mp,
363 "XFS: osyncisdsync has no effect, option is deprecated."); 363 "osyncisdsync has no effect, option is deprecated.");
364 } else if (!strcmp(this_char, "osyncisosync")) { 364 } else if (!strcmp(this_char, "osyncisosync")) {
365 cmn_err(CE_WARN, 365 xfs_warn(mp,
366 "XFS: osyncisosync has no effect, option is deprecated."); 366 "osyncisosync has no effect, option is deprecated.");
367 } else if (!strcmp(this_char, "irixsgid")) { 367 } else if (!strcmp(this_char, "irixsgid")) {
368 cmn_err(CE_WARN, 368 xfs_warn(mp,
369 "XFS: irixsgid is now a sysctl(2) variable, option is deprecated."); 369 "irixsgid is now a sysctl(2) variable, option is deprecated.");
370 } else { 370 } else {
371 cmn_err(CE_WARN, 371 xfs_warn(mp, "unknown mount option [%s].", this_char);
372 "XFS: unknown mount option [%s].", this_char);
373 return EINVAL; 372 return EINVAL;
374 } 373 }
375 } 374 }
@@ -379,40 +378,37 @@ xfs_parseargs(
379 */ 378 */
380 if ((mp->m_flags & XFS_MOUNT_NORECOVERY) && 379 if ((mp->m_flags & XFS_MOUNT_NORECOVERY) &&
381 !(mp->m_flags & XFS_MOUNT_RDONLY)) { 380 !(mp->m_flags & XFS_MOUNT_RDONLY)) {
382 cmn_err(CE_WARN, "XFS: no-recovery mounts must be read-only."); 381 xfs_warn(mp, "no-recovery mounts must be read-only.");
383 return EINVAL; 382 return EINVAL;
384 } 383 }
385 384
386 if ((mp->m_flags & XFS_MOUNT_NOALIGN) && (dsunit || dswidth)) { 385 if ((mp->m_flags & XFS_MOUNT_NOALIGN) && (dsunit || dswidth)) {
387 cmn_err(CE_WARN, 386 xfs_warn(mp,
388 "XFS: sunit and swidth options incompatible with the noalign option"); 387 "sunit and swidth options incompatible with the noalign option");
389 return EINVAL; 388 return EINVAL;
390 } 389 }
391 390
392#ifndef CONFIG_XFS_QUOTA 391#ifndef CONFIG_XFS_QUOTA
393 if (XFS_IS_QUOTA_RUNNING(mp)) { 392 if (XFS_IS_QUOTA_RUNNING(mp)) {
394 cmn_err(CE_WARN, 393 xfs_warn(mp, "quota support not available in this kernel.");
395 "XFS: quota support not available in this kernel.");
396 return EINVAL; 394 return EINVAL;
397 } 395 }
398#endif 396#endif
399 397
400 if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) && 398 if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) &&
401 (mp->m_qflags & (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE))) { 399 (mp->m_qflags & (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE))) {
402 cmn_err(CE_WARN, 400 xfs_warn(mp, "cannot mount with both project and group quota");
403 "XFS: cannot mount with both project and group quota");
404 return EINVAL; 401 return EINVAL;
405 } 402 }
406 403
407 if ((dsunit && !dswidth) || (!dsunit && dswidth)) { 404 if ((dsunit && !dswidth) || (!dsunit && dswidth)) {
408 cmn_err(CE_WARN, 405 xfs_warn(mp, "sunit and swidth must be specified together");
409 "XFS: sunit and swidth must be specified together");
410 return EINVAL; 406 return EINVAL;
411 } 407 }
412 408
413 if (dsunit && (dswidth % dsunit != 0)) { 409 if (dsunit && (dswidth % dsunit != 0)) {
414 cmn_err(CE_WARN, 410 xfs_warn(mp,
415 "XFS: stripe width (%d) must be a multiple of the stripe unit (%d)", 411 "stripe width (%d) must be a multiple of the stripe unit (%d)",
416 dswidth, dsunit); 412 dswidth, dsunit);
417 return EINVAL; 413 return EINVAL;
418 } 414 }
@@ -438,8 +434,7 @@ done:
438 mp->m_logbufs != 0 && 434 mp->m_logbufs != 0 &&
439 (mp->m_logbufs < XLOG_MIN_ICLOGS || 435 (mp->m_logbufs < XLOG_MIN_ICLOGS ||
440 mp->m_logbufs > XLOG_MAX_ICLOGS)) { 436 mp->m_logbufs > XLOG_MAX_ICLOGS)) {
441 cmn_err(CE_WARN, 437 xfs_warn(mp, "invalid logbufs value: %d [not %d-%d]",
442 "XFS: invalid logbufs value: %d [not %d-%d]",
443 mp->m_logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS); 438 mp->m_logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS);
444 return XFS_ERROR(EINVAL); 439 return XFS_ERROR(EINVAL);
445 } 440 }
@@ -448,22 +443,16 @@ done:
448 (mp->m_logbsize < XLOG_MIN_RECORD_BSIZE || 443 (mp->m_logbsize < XLOG_MIN_RECORD_BSIZE ||
449 mp->m_logbsize > XLOG_MAX_RECORD_BSIZE || 444 mp->m_logbsize > XLOG_MAX_RECORD_BSIZE ||
450 !is_power_of_2(mp->m_logbsize))) { 445 !is_power_of_2(mp->m_logbsize))) {
451 cmn_err(CE_WARN, 446 xfs_warn(mp,
452 "XFS: invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]", 447 "invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]",
453 mp->m_logbsize); 448 mp->m_logbsize);
454 return XFS_ERROR(EINVAL); 449 return XFS_ERROR(EINVAL);
455 } 450 }
456 451
457 mp->m_fsname = kstrndup(sb->s_id, MAXNAMELEN, GFP_KERNEL);
458 if (!mp->m_fsname)
459 return ENOMEM;
460 mp->m_fsname_len = strlen(mp->m_fsname) + 1;
461
462 if (iosizelog) { 452 if (iosizelog) {
463 if (iosizelog > XFS_MAX_IO_LOG || 453 if (iosizelog > XFS_MAX_IO_LOG ||
464 iosizelog < XFS_MIN_IO_LOG) { 454 iosizelog < XFS_MIN_IO_LOG) {
465 cmn_err(CE_WARN, 455 xfs_warn(mp, "invalid log iosize: %d [not %d-%d]",
466 "XFS: invalid log iosize: %d [not %d-%d]",
467 iosizelog, XFS_MIN_IO_LOG, 456 iosizelog, XFS_MIN_IO_LOG,
468 XFS_MAX_IO_LOG); 457 XFS_MAX_IO_LOG);
469 return XFS_ERROR(EINVAL); 458 return XFS_ERROR(EINVAL);
@@ -606,10 +595,11 @@ xfs_blkdev_get(
606{ 595{
607 int error = 0; 596 int error = 0;
608 597
609 *bdevp = open_bdev_exclusive(name, FMODE_READ|FMODE_WRITE, mp); 598 *bdevp = blkdev_get_by_path(name, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
599 mp);
610 if (IS_ERR(*bdevp)) { 600 if (IS_ERR(*bdevp)) {
611 error = PTR_ERR(*bdevp); 601 error = PTR_ERR(*bdevp);
612 printk("XFS: Invalid device [%s], error=%d\n", name, error); 602 xfs_warn(mp, "Invalid device [%s], error=%d\n", name, error);
613 } 603 }
614 604
615 return -error; 605 return -error;
@@ -620,7 +610,7 @@ xfs_blkdev_put(
620 struct block_device *bdev) 610 struct block_device *bdev)
621{ 611{
622 if (bdev) 612 if (bdev)
623 close_bdev_exclusive(bdev, FMODE_READ|FMODE_WRITE); 613 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
624} 614}
625 615
626/* 616/*
@@ -663,23 +653,23 @@ xfs_mountfs_check_barriers(xfs_mount_t *mp)
663 int error; 653 int error;
664 654
665 if (mp->m_logdev_targp != mp->m_ddev_targp) { 655 if (mp->m_logdev_targp != mp->m_ddev_targp) {
666 xfs_fs_cmn_err(CE_NOTE, mp, 656 xfs_notice(mp,
667 "Disabling barriers, not supported with external log device"); 657 "Disabling barriers, not supported with external log device");
668 mp->m_flags &= ~XFS_MOUNT_BARRIER; 658 mp->m_flags &= ~XFS_MOUNT_BARRIER;
669 return; 659 return;
670 } 660 }
671 661
672 if (xfs_readonly_buftarg(mp->m_ddev_targp)) { 662 if (xfs_readonly_buftarg(mp->m_ddev_targp)) {
673 xfs_fs_cmn_err(CE_NOTE, mp, 663 xfs_notice(mp,
674 "Disabling barriers, underlying device is readonly"); 664 "Disabling barriers, underlying device is readonly");
675 mp->m_flags &= ~XFS_MOUNT_BARRIER; 665 mp->m_flags &= ~XFS_MOUNT_BARRIER;
676 return; 666 return;
677 } 667 }
678 668
679 error = xfs_barrier_test(mp); 669 error = xfs_barrier_test(mp);
680 if (error) { 670 if (error) {
681 xfs_fs_cmn_err(CE_NOTE, mp, 671 xfs_notice(mp,
682 "Disabling barriers, trial barrier write failed"); 672 "Disabling barriers, trial barrier write failed");
683 mp->m_flags &= ~XFS_MOUNT_BARRIER; 673 mp->m_flags &= ~XFS_MOUNT_BARRIER;
684 return; 674 return;
685 } 675 }
@@ -742,8 +732,8 @@ xfs_open_devices(
742 goto out_close_logdev; 732 goto out_close_logdev;
743 733
744 if (rtdev == ddev || rtdev == logdev) { 734 if (rtdev == ddev || rtdev == logdev) {
745 cmn_err(CE_WARN, 735 xfs_warn(mp,
746 "XFS: Cannot mount filesystem with identical rtdev and ddev/logdev."); 736 "Cannot mount filesystem with identical rtdev and ddev/logdev.");
747 error = EINVAL; 737 error = EINVAL;
748 goto out_close_rtdev; 738 goto out_close_rtdev;
749 } 739 }
@@ -826,63 +816,6 @@ xfs_setup_devices(
826 return 0; 816 return 0;
827} 817}
828 818
829/*
830 * XFS AIL push thread support
831 */
832void
833xfsaild_wakeup(
834 struct xfs_ail *ailp,
835 xfs_lsn_t threshold_lsn)
836{
837 ailp->xa_target = threshold_lsn;
838 wake_up_process(ailp->xa_task);
839}
840
841STATIC int
842xfsaild(
843 void *data)
844{
845 struct xfs_ail *ailp = data;
846 xfs_lsn_t last_pushed_lsn = 0;
847 long tout = 0; /* milliseconds */
848
849 while (!kthread_should_stop()) {
850 schedule_timeout_interruptible(tout ?
851 msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT);
852
853 /* swsusp */
854 try_to_freeze();
855
856 ASSERT(ailp->xa_mount->m_log);
857 if (XFS_FORCED_SHUTDOWN(ailp->xa_mount))
858 continue;
859
860 tout = xfsaild_push(ailp, &last_pushed_lsn);
861 }
862
863 return 0;
864} /* xfsaild */
865
866int
867xfsaild_start(
868 struct xfs_ail *ailp)
869{
870 ailp->xa_target = 0;
871 ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild/%s",
872 ailp->xa_mount->m_fsname);
873 if (IS_ERR(ailp->xa_task))
874 return -PTR_ERR(ailp->xa_task);
875 return 0;
876}
877
878void
879xfsaild_stop(
880 struct xfs_ail *ailp)
881{
882 kthread_stop(ailp->xa_task);
883}
884
885
886/* Catch misguided souls that try to use this interface on XFS */ 819/* Catch misguided souls that try to use this interface on XFS */
887STATIC struct inode * 820STATIC struct inode *
888xfs_fs_alloc_inode( 821xfs_fs_alloc_inode(
@@ -935,7 +868,7 @@ out_reclaim:
935 * Slab object creation initialisation for the XFS inode. 868 * Slab object creation initialisation for the XFS inode.
936 * This covers only the idempotent fields in the XFS inode; 869 * This covers only the idempotent fields in the XFS inode;
937 * all other fields need to be initialised on allocation 870 * all other fields need to be initialised on allocation
938 * from the slab. This avoids the need to repeatedly intialise 871 * from the slab. This avoids the need to repeatedly initialise
939 * fields in the xfs inode that left in the initialise state 872 * fields in the xfs inode that left in the initialise state
940 * when freeing the inode. 873 * when freeing the inode.
941 */ 874 */
@@ -1076,7 +1009,7 @@ xfs_fs_write_inode(
1076 error = 0; 1009 error = 0;
1077 goto out_unlock; 1010 goto out_unlock;
1078 } 1011 }
1079 error = xfs_iflush(ip, 0); 1012 error = xfs_iflush(ip, SYNC_TRYLOCK);
1080 } 1013 }
1081 1014
1082 out_unlock: 1015 out_unlock:
@@ -1118,6 +1051,8 @@ xfs_fs_evict_inode(
1118 */ 1051 */
1119 ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); 1052 ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
1120 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); 1053 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
1054 lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
1055 &xfs_iolock_reclaimable, "xfs_iolock_reclaimable");
1121 1056
1122 xfs_inactive(ip); 1057 xfs_inactive(ip);
1123} 1058}
@@ -1187,22 +1122,12 @@ xfs_fs_sync_fs(
1187 return -error; 1122 return -error;
1188 1123
1189 if (laptop_mode) { 1124 if (laptop_mode) {
1190 int prev_sync_seq = mp->m_sync_seq;
1191
1192 /* 1125 /*
1193 * The disk must be active because we're syncing. 1126 * The disk must be active because we're syncing.
1194 * We schedule xfssyncd now (now that the disk is 1127 * We schedule xfssyncd now (now that the disk is
1195 * active) instead of later (when it might not be). 1128 * active) instead of later (when it might not be).
1196 */ 1129 */
1197 wake_up_process(mp->m_sync_task); 1130 flush_delayed_work_sync(&mp->m_sync_work);
1198 /*
1199 * We have to wait for the sync iteration to complete.
1200 * If we don't, the disk activity caused by the sync
1201 * will come after the sync is completed, and that
1202 * triggers another sync from laptop mode.
1203 */
1204 wait_event(mp->m_wait_single_sync_task,
1205 mp->m_sync_seq != prev_sync_seq);
1206 } 1131 }
1207 1132
1208 return 0; 1133 return 0;
@@ -1330,8 +1255,8 @@ xfs_fs_remount(
1330 * options that we can't actually change. 1255 * options that we can't actually change.
1331 */ 1256 */
1332#if 0 1257#if 0
1333 printk(KERN_INFO 1258 xfs_info(mp,
1334 "XFS: mount option \"%s\" not supported for remount\n", p); 1259 "mount option \"%s\" not supported for remount\n", p);
1335 return -EINVAL; 1260 return -EINVAL;
1336#else 1261#else
1337 break; 1262 break;
@@ -1352,8 +1277,7 @@ xfs_fs_remount(
1352 if (mp->m_update_flags) { 1277 if (mp->m_update_flags) {
1353 error = xfs_mount_log_sb(mp, mp->m_update_flags); 1278 error = xfs_mount_log_sb(mp, mp->m_update_flags);
1354 if (error) { 1279 if (error) {
1355 cmn_err(CE_WARN, 1280 xfs_warn(mp, "failed to write sb changes");
1356 "XFS: failed to write sb changes");
1357 return error; 1281 return error;
1358 } 1282 }
1359 mp->m_update_flags = 0; 1283 mp->m_update_flags = 0;
@@ -1399,7 +1323,7 @@ xfs_fs_freeze(
1399 1323
1400 xfs_save_resvblks(mp); 1324 xfs_save_resvblks(mp);
1401 xfs_quiesce_attr(mp); 1325 xfs_quiesce_attr(mp);
1402 return -xfs_fs_log_dummy(mp, SYNC_WAIT); 1326 return -xfs_fs_log_dummy(mp);
1403} 1327}
1404 1328
1405STATIC int 1329STATIC int
@@ -1437,15 +1361,15 @@ xfs_finish_flags(
1437 mp->m_logbsize = mp->m_sb.sb_logsunit; 1361 mp->m_logbsize = mp->m_sb.sb_logsunit;
1438 } else if (mp->m_logbsize > 0 && 1362 } else if (mp->m_logbsize > 0 &&
1439 mp->m_logbsize < mp->m_sb.sb_logsunit) { 1363 mp->m_logbsize < mp->m_sb.sb_logsunit) {
1440 cmn_err(CE_WARN, 1364 xfs_warn(mp,
1441 "XFS: logbuf size must be greater than or equal to log stripe size"); 1365 "logbuf size must be greater than or equal to log stripe size");
1442 return XFS_ERROR(EINVAL); 1366 return XFS_ERROR(EINVAL);
1443 } 1367 }
1444 } else { 1368 } else {
1445 /* Fail a mount if the logbuf is larger than 32K */ 1369 /* Fail a mount if the logbuf is larger than 32K */
1446 if (mp->m_logbsize > XLOG_BIG_RECORD_BSIZE) { 1370 if (mp->m_logbsize > XLOG_BIG_RECORD_BSIZE) {
1447 cmn_err(CE_WARN, 1371 xfs_warn(mp,
1448 "XFS: logbuf size for version 1 logs must be 16K or 32K"); 1372 "logbuf size for version 1 logs must be 16K or 32K");
1449 return XFS_ERROR(EINVAL); 1373 return XFS_ERROR(EINVAL);
1450 } 1374 }
1451 } 1375 }
@@ -1462,8 +1386,8 @@ xfs_finish_flags(
1462 * prohibit r/w mounts of read-only filesystems 1386 * prohibit r/w mounts of read-only filesystems
1463 */ 1387 */
1464 if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) && !ronly) { 1388 if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) && !ronly) {
1465 cmn_err(CE_WARN, 1389 xfs_warn(mp,
1466 "XFS: cannot mount a read-only filesystem as read-write"); 1390 "cannot mount a read-only filesystem as read-write");
1467 return XFS_ERROR(EROFS); 1391 return XFS_ERROR(EROFS);
1468 } 1392 }
1469 1393
@@ -1487,9 +1411,6 @@ xfs_fs_fill_super(
1487 spin_lock_init(&mp->m_sb_lock); 1411 spin_lock_init(&mp->m_sb_lock);
1488 mutex_init(&mp->m_growlock); 1412 mutex_init(&mp->m_growlock);
1489 atomic_set(&mp->m_active_trans, 0); 1413 atomic_set(&mp->m_active_trans, 0);
1490 INIT_LIST_HEAD(&mp->m_sync_list);
1491 spin_lock_init(&mp->m_sync_lock);
1492 init_waitqueue_head(&mp->m_wait_single_sync_task);
1493 1414
1494 mp->m_super = sb; 1415 mp->m_super = sb;
1495 sb->s_fs_info = mp; 1416 sb->s_fs_info = mp;
@@ -1536,10 +1457,14 @@ xfs_fs_fill_super(
1536 if (error) 1457 if (error)
1537 goto out_free_sb; 1458 goto out_free_sb;
1538 1459
1539 error = xfs_mountfs(mp); 1460 /*
1540 if (error) 1461 * we must configure the block size in the superblock before we run the
1541 goto out_filestream_unmount; 1462 * full mount process as the mount process can lookup and cache inodes.
1542 1463 * For the same reason we must also initialise the syncd and register
1464 * the inode cache shrinker so that inodes can be reclaimed during
1465 * operations like a quotacheck that iterate all inodes in the
1466 * filesystem.
1467 */
1543 sb->s_magic = XFS_SB_MAGIC; 1468 sb->s_magic = XFS_SB_MAGIC;
1544 sb->s_blocksize = mp->m_sb.sb_blocksize; 1469 sb->s_blocksize = mp->m_sb.sb_blocksize;
1545 sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1; 1470 sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1;
@@ -1547,6 +1472,16 @@ xfs_fs_fill_super(
1547 sb->s_time_gran = 1; 1472 sb->s_time_gran = 1;
1548 set_posix_acl_flag(sb); 1473 set_posix_acl_flag(sb);
1549 1474
1475 error = xfs_syncd_init(mp);
1476 if (error)
1477 goto out_filestream_unmount;
1478
1479 xfs_inode_shrinker_register(mp);
1480
1481 error = xfs_mountfs(mp);
1482 if (error)
1483 goto out_syncd_stop;
1484
1550 root = igrab(VFS_I(mp->m_rootip)); 1485 root = igrab(VFS_I(mp->m_rootip));
1551 if (!root) { 1486 if (!root) {
1552 error = ENOENT; 1487 error = ENOENT;
@@ -1562,14 +1497,11 @@ xfs_fs_fill_super(
1562 goto fail_vnrele; 1497 goto fail_vnrele;
1563 } 1498 }
1564 1499
1565 error = xfs_syncd_init(mp);
1566 if (error)
1567 goto fail_vnrele;
1568
1569 xfs_inode_shrinker_register(mp);
1570
1571 return 0; 1500 return 0;
1572 1501
1502 out_syncd_stop:
1503 xfs_inode_shrinker_unregister(mp);
1504 xfs_syncd_stop(mp);
1573 out_filestream_unmount: 1505 out_filestream_unmount:
1574 xfs_filestream_unmount(mp); 1506 xfs_filestream_unmount(mp);
1575 out_free_sb: 1507 out_free_sb:
@@ -1593,6 +1525,9 @@ xfs_fs_fill_super(
1593 } 1525 }
1594 1526
1595 fail_unmount: 1527 fail_unmount:
1528 xfs_inode_shrinker_unregister(mp);
1529 xfs_syncd_stop(mp);
1530
1596 /* 1531 /*
1597 * Blow away any referenced inode in the filestreams cache. 1532 * Blow away any referenced inode in the filestreams cache.
1598 * This can and will cause log traffic as inodes go inactive 1533 * This can and will cause log traffic as inodes go inactive
@@ -1782,6 +1717,38 @@ xfs_destroy_zones(void)
1782} 1717}
1783 1718
1784STATIC int __init 1719STATIC int __init
1720xfs_init_workqueues(void)
1721{
1722 /*
 1723 * max_active is set to 8 to give enough concurrency to allow
1724 * multiple work operations on each CPU to run. This allows multiple
1725 * filesystems to be running sync work concurrently, and scales with
1726 * the number of CPUs in the system.
1727 */
1728 xfs_syncd_wq = alloc_workqueue("xfssyncd", WQ_CPU_INTENSIVE, 8);
1729 if (!xfs_syncd_wq)
1730 goto out;
1731
1732 xfs_ail_wq = alloc_workqueue("xfsail", WQ_CPU_INTENSIVE, 8);
1733 if (!xfs_ail_wq)
1734 goto out_destroy_syncd;
1735
1736 return 0;
1737
1738out_destroy_syncd:
1739 destroy_workqueue(xfs_syncd_wq);
1740out:
1741 return -ENOMEM;
1742}
1743
1744STATIC void
1745xfs_destroy_workqueues(void)
1746{
1747 destroy_workqueue(xfs_ail_wq);
1748 destroy_workqueue(xfs_syncd_wq);
1749}
1750
1751STATIC int __init
1785init_xfs_fs(void) 1752init_xfs_fs(void)
1786{ 1753{
1787 int error; 1754 int error;
@@ -1796,10 +1763,14 @@ init_xfs_fs(void)
1796 if (error) 1763 if (error)
1797 goto out; 1764 goto out;
1798 1765
1799 error = xfs_mru_cache_init(); 1766 error = xfs_init_workqueues();
1800 if (error) 1767 if (error)
1801 goto out_destroy_zones; 1768 goto out_destroy_zones;
1802 1769
1770 error = xfs_mru_cache_init();
1771 if (error)
1772 goto out_destroy_wq;
1773
1803 error = xfs_filestream_init(); 1774 error = xfs_filestream_init();
1804 if (error) 1775 if (error)
1805 goto out_mru_cache_uninit; 1776 goto out_mru_cache_uninit;
@@ -1816,6 +1787,10 @@ init_xfs_fs(void)
1816 if (error) 1787 if (error)
1817 goto out_cleanup_procfs; 1788 goto out_cleanup_procfs;
1818 1789
1790 error = xfs_init_workqueues();
1791 if (error)
1792 goto out_sysctl_unregister;
1793
1819 vfs_initquota(); 1794 vfs_initquota();
1820 1795
1821 error = register_filesystem(&xfs_fs_type); 1796 error = register_filesystem(&xfs_fs_type);
@@ -1833,6 +1808,8 @@ init_xfs_fs(void)
1833 xfs_filestream_uninit(); 1808 xfs_filestream_uninit();
1834 out_mru_cache_uninit: 1809 out_mru_cache_uninit:
1835 xfs_mru_cache_uninit(); 1810 xfs_mru_cache_uninit();
1811 out_destroy_wq:
1812 xfs_destroy_workqueues();
1836 out_destroy_zones: 1813 out_destroy_zones:
1837 xfs_destroy_zones(); 1814 xfs_destroy_zones();
1838 out: 1815 out:
@@ -1849,6 +1826,7 @@ exit_xfs_fs(void)
1849 xfs_buf_terminate(); 1826 xfs_buf_terminate();
1850 xfs_filestream_uninit(); 1827 xfs_filestream_uninit();
1851 xfs_mru_cache_uninit(); 1828 xfs_mru_cache_uninit();
1829 xfs_destroy_workqueues();
1852 xfs_destroy_zones(); 1830 xfs_destroy_zones();
1853} 1831}
1854 1832
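
xfs_init_workqueues()/xfs_destroy_workqueues() and the reworked init_xfs_fs() error paths follow the usual goto-unwind idiom: each successfully initialised resource gains a matching cleanup label, and module exit tears everything down in reverse order. A self-contained userspace sketch of that shape (with made-up step names standing in for the real zone, workqueue and MRU-cache setup) might look like this:

#include <stdio.h>

static int  init_zones(void)         { puts("zones up");   return 0; }
static void destroy_zones(void)      { puts("zones down"); }
static int  init_workqueues(void)    { puts("wq up");      return 0; }
static void destroy_workqueues(void) { puts("wq down");    }
static int  init_mru_cache(void)     { puts("mru up");     return 0; }
static void uninit_mru_cache(void)   { puts("mru down");   }

static int init_all(void)
{
        int error;

        error = init_zones();
        if (error)
                goto out;
        error = init_workqueues();
        if (error)
                goto out_destroy_zones;
        error = init_mru_cache();
        if (error)
                goto out_destroy_wq;
        return 0;

 out_destroy_wq:
        destroy_workqueues();
 out_destroy_zones:
        destroy_zones();
 out:
        return error;
}

static void exit_all(void)
{
        /* teardown mirrors init in reverse order */
        uninit_mru_cache();
        destroy_workqueues();
        destroy_zones();
}

int main(void)
{
        if (init_all())
                return 1;
        exit_all();
        return 0;
}
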
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index afb0d7cfad1c..e4f9c1b0836c 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -22,6 +22,7 @@
22#include "xfs_log.h" 22#include "xfs_log.h"
23#include "xfs_inum.h" 23#include "xfs_inum.h"
24#include "xfs_trans.h" 24#include "xfs_trans.h"
25#include "xfs_trans_priv.h"
25#include "xfs_sb.h" 26#include "xfs_sb.h"
26#include "xfs_ag.h" 27#include "xfs_ag.h"
27#include "xfs_mount.h" 28#include "xfs_mount.h"
@@ -39,6 +40,8 @@
39#include <linux/kthread.h> 40#include <linux/kthread.h>
40#include <linux/freezer.h> 41#include <linux/freezer.h>
41 42
43struct workqueue_struct *xfs_syncd_wq; /* sync workqueue */
44
42/* 45/*
43 * The inode lookup is done in batches to keep the amount of lock traffic and 46 * The inode lookup is done in batches to keep the amount of lock traffic and
44 * radix tree lookups to a minimum. The batch size is a trade off between 47 * radix tree lookups to a minimum. The batch size is a trade off between
@@ -53,14 +56,30 @@ xfs_inode_ag_walk_grab(
53{ 56{
54 struct inode *inode = VFS_I(ip); 57 struct inode *inode = VFS_I(ip);
55 58
59 ASSERT(rcu_read_lock_held());
60
61 /*
62 * check for stale RCU freed inode
63 *
64 * If the inode has been reallocated, it doesn't matter if it's not in
65 * the AG we are walking - we are walking for writeback, so if it
66 * passes all the "valid inode" checks and is dirty, then we'll write
67 * it back anyway. If it has been reallocated and still being
68 * initialised, the XFS_INEW check below will catch it.
69 */
70 spin_lock(&ip->i_flags_lock);
71 if (!ip->i_ino)
72 goto out_unlock_noent;
73
74 /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
75 if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
76 goto out_unlock_noent;
77 spin_unlock(&ip->i_flags_lock);
78
56 /* nothing to sync during shutdown */ 79 /* nothing to sync during shutdown */
57 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 80 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
58 return EFSCORRUPTED; 81 return EFSCORRUPTED;
59 82
60 /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
61 if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
62 return ENOENT;
63
 64 /* If we can't grab the inode, it must be on its way to reclaim. */ 83
65 if (!igrab(inode)) 84 if (!igrab(inode))
66 return ENOENT; 85 return ENOENT;
@@ -72,6 +91,10 @@ xfs_inode_ag_walk_grab(
72 91
73 /* inode is valid */ 92 /* inode is valid */
74 return 0; 93 return 0;
94
95out_unlock_noent:
96 spin_unlock(&ip->i_flags_lock);
97 return ENOENT;
75} 98}
76 99
77STATIC int 100STATIC int
@@ -98,12 +121,12 @@ restart:
98 int error = 0; 121 int error = 0;
99 int i; 122 int i;
100 123
101 read_lock(&pag->pag_ici_lock); 124 rcu_read_lock();
102 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, 125 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
103 (void **)batch, first_index, 126 (void **)batch, first_index,
104 XFS_LOOKUP_BATCH); 127 XFS_LOOKUP_BATCH);
105 if (!nr_found) { 128 if (!nr_found) {
106 read_unlock(&pag->pag_ici_lock); 129 rcu_read_unlock();
107 break; 130 break;
108 } 131 }
109 132
@@ -118,18 +141,26 @@ restart:
118 batch[i] = NULL; 141 batch[i] = NULL;
119 142
120 /* 143 /*
121 * Update the index for the next lookup. Catch overflows 144 * Update the index for the next lookup. Catch
122 * into the next AG range which can occur if we have inodes 145 * overflows into the next AG range which can occur if
123 * in the last block of the AG and we are currently 146 * we have inodes in the last block of the AG and we
124 * pointing to the last inode. 147 * are currently pointing to the last inode.
148 *
149 * Because we may see inodes that are from the wrong AG
150 * due to RCU freeing and reallocation, only update the
 151 * index if it lies in this AG. It was a race that led
152 * us to see this inode, so another lookup from the
153 * same index will not find it again.
125 */ 154 */
155 if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
156 continue;
126 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); 157 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
127 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) 158 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
128 done = 1; 159 done = 1;
129 } 160 }
130 161
131 /* unlock now we've grabbed the inodes. */ 162 /* unlock now we've grabbed the inodes. */
132 read_unlock(&pag->pag_ici_lock); 163 rcu_read_unlock();
133 164
134 for (i = 0; i < nr_found; i++) { 165 for (i = 0; i < nr_found; i++) {
135 if (!batch[i]) 166 if (!batch[i])
@@ -334,7 +365,7 @@ xfs_quiesce_data(
334 365
335 /* mark the log as covered if needed */ 366 /* mark the log as covered if needed */
336 if (xfs_log_need_covered(mp)) 367 if (xfs_log_need_covered(mp))
337 error2 = xfs_fs_log_dummy(mp, SYNC_WAIT); 368 error2 = xfs_fs_log_dummy(mp);
338 369
339 /* flush data-only devices */ 370 /* flush data-only devices */
340 if (mp->m_rtdev_targp) 371 if (mp->m_rtdev_targp)
@@ -373,7 +404,7 @@ xfs_quiesce_fs(
373/* 404/*
374 * Second stage of a quiesce. The data is already synced, now we have to take 405 * Second stage of a quiesce. The data is already synced, now we have to take
375 * care of the metadata. New transactions are already blocked, so we need to 406 * care of the metadata. New transactions are already blocked, so we need to
376 * wait for any remaining transactions to drain out before proceding. 407 * wait for any remaining transactions to drain out before proceeding.
377 */ 408 */
378void 409void
379xfs_quiesce_attr( 410xfs_quiesce_attr(
@@ -397,69 +428,18 @@ xfs_quiesce_attr(
397 /* Push the superblock and write an unmount record */ 428 /* Push the superblock and write an unmount record */
398 error = xfs_log_sbcount(mp, 1); 429 error = xfs_log_sbcount(mp, 1);
399 if (error) 430 if (error)
400 xfs_fs_cmn_err(CE_WARN, mp, 431 xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. "
401 "xfs_attr_quiesce: failed to log sb changes. "
402 "Frozen image may not be consistent."); 432 "Frozen image may not be consistent.");
403 xfs_log_unmount_write(mp); 433 xfs_log_unmount_write(mp);
404 xfs_unmountfs_writesb(mp); 434 xfs_unmountfs_writesb(mp);
405} 435}
406 436
407/* 437static void
408 * Enqueue a work item to be picked up by the vfs xfssyncd thread. 438xfs_syncd_queue_sync(
409 * Doing this has two advantages: 439 struct xfs_mount *mp)
410 * - It saves on stack space, which is tight in certain situations
411 * - It can be used (with care) as a mechanism to avoid deadlocks.
412 * Flushing while allocating in a full filesystem requires both.
413 */
414STATIC void
415xfs_syncd_queue_work(
416 struct xfs_mount *mp,
417 void *data,
418 void (*syncer)(struct xfs_mount *, void *),
419 struct completion *completion)
420{ 440{
421 struct xfs_sync_work *work; 441 queue_delayed_work(xfs_syncd_wq, &mp->m_sync_work,
422 442 msecs_to_jiffies(xfs_syncd_centisecs * 10));
423 work = kmem_alloc(sizeof(struct xfs_sync_work), KM_SLEEP);
424 INIT_LIST_HEAD(&work->w_list);
425 work->w_syncer = syncer;
426 work->w_data = data;
427 work->w_mount = mp;
428 work->w_completion = completion;
429 spin_lock(&mp->m_sync_lock);
430 list_add_tail(&work->w_list, &mp->m_sync_list);
431 spin_unlock(&mp->m_sync_lock);
432 wake_up_process(mp->m_sync_task);
433}
434
435/*
436 * Flush delayed allocate data, attempting to free up reserved space
437 * from existing allocations. At this point a new allocation attempt
438 * has failed with ENOSPC and we are in the process of scratching our
439 * heads, looking about for more room...
440 */
441STATIC void
442xfs_flush_inodes_work(
443 struct xfs_mount *mp,
444 void *arg)
445{
446 struct inode *inode = arg;
447 xfs_sync_data(mp, SYNC_TRYLOCK);
448 xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT);
449 iput(inode);
450}
451
452void
453xfs_flush_inodes(
454 xfs_inode_t *ip)
455{
456 struct inode *inode = VFS_I(ip);
457 DECLARE_COMPLETION_ONSTACK(completion);
458
459 igrab(inode);
460 xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inodes_work, &completion);
461 wait_for_completion(&completion);
462 xfs_log_force(ip->i_mount, XFS_LOG_SYNC);
463} 443}
464 444
465/* 445/*
@@ -469,84 +449,119 @@ xfs_flush_inodes(
469 */ 449 */
470STATIC void 450STATIC void
471xfs_sync_worker( 451xfs_sync_worker(
472 struct xfs_mount *mp, 452 struct work_struct *work)
473 void *unused)
474{ 453{
454 struct xfs_mount *mp = container_of(to_delayed_work(work),
455 struct xfs_mount, m_sync_work);
475 int error; 456 int error;
476 457
477 if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { 458 if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
478 xfs_log_force(mp, 0);
479 xfs_reclaim_inodes(mp, 0);
480 /* dgc: errors ignored here */ 459 /* dgc: errors ignored here */
481 error = xfs_qm_sync(mp, SYNC_TRYLOCK);
482 if (mp->m_super->s_frozen == SB_UNFROZEN && 460 if (mp->m_super->s_frozen == SB_UNFROZEN &&
483 xfs_log_need_covered(mp)) 461 xfs_log_need_covered(mp))
484 error = xfs_fs_log_dummy(mp, 0); 462 error = xfs_fs_log_dummy(mp);
463 else
464 xfs_log_force(mp, 0);
465 error = xfs_qm_sync(mp, SYNC_TRYLOCK);
466
467 /* start pushing all the metadata that is currently dirty */
468 xfs_ail_push_all(mp->m_ail);
485 } 469 }
486 mp->m_sync_seq++; 470
487 wake_up(&mp->m_wait_single_sync_task); 471 /* queue us up again */
472 xfs_syncd_queue_sync(mp);
488} 473}
489 474
490STATIC int 475/*
491xfssyncd( 476 * Queue a new inode reclaim pass if there are reclaimable inodes and there
492 void *arg) 477 * isn't a reclaim pass already in progress. By default it runs every 5s based
 478 * on the xfs syncd work default of 30s. Perhaps this should have its own
479 * tunable, but that can be done if this method proves to be ineffective or too
480 * aggressive.
481 */
482static void
483xfs_syncd_queue_reclaim(
484 struct xfs_mount *mp)
493{ 485{
494 struct xfs_mount *mp = arg;
495 long timeleft;
496 xfs_sync_work_t *work, *n;
497 LIST_HEAD (tmp);
498
499 set_freezable();
500 timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10);
501 for (;;) {
502 if (list_empty(&mp->m_sync_list))
503 timeleft = schedule_timeout_interruptible(timeleft);
504 /* swsusp */
505 try_to_freeze();
506 if (kthread_should_stop() && list_empty(&mp->m_sync_list))
507 break;
508 486
509 spin_lock(&mp->m_sync_lock); 487 /*
510 /* 488 * We can have inodes enter reclaim after we've shut down the syncd
511 * We can get woken by laptop mode, to do a sync - 489 * workqueue during unmount, so don't allow reclaim work to be queued
512 * that's the (only!) case where the list would be 490 * during unmount.
513 * empty with time remaining. 491 */
514 */ 492 if (!(mp->m_super->s_flags & MS_ACTIVE))
515 if (!timeleft || list_empty(&mp->m_sync_list)) { 493 return;
516 if (!timeleft)
517 timeleft = xfs_syncd_centisecs *
518 msecs_to_jiffies(10);
519 INIT_LIST_HEAD(&mp->m_sync_work.w_list);
520 list_add_tail(&mp->m_sync_work.w_list,
521 &mp->m_sync_list);
522 }
523 list_splice_init(&mp->m_sync_list, &tmp);
524 spin_unlock(&mp->m_sync_lock);
525 494
526 list_for_each_entry_safe(work, n, &tmp, w_list) { 495 rcu_read_lock();
527 (*work->w_syncer)(mp, work->w_data); 496 if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
528 list_del(&work->w_list); 497 queue_delayed_work(xfs_syncd_wq, &mp->m_reclaim_work,
529 if (work == &mp->m_sync_work) 498 msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
530 continue;
531 if (work->w_completion)
532 complete(work->w_completion);
533 kmem_free(work);
534 }
535 } 499 }
500 rcu_read_unlock();
501}
536 502
537 return 0; 503/*
504 * This is a fast pass over the inode cache to try to get reclaim moving on as
505 * many inodes as possible in a short period of time. It kicks itself every few
506 * seconds, as well as being kicked by the inode cache shrinker when memory
507 * goes low. It scans as quickly as possible avoiding locked inodes or those
508 * already being flushed, and once done schedules a future pass.
509 */
510STATIC void
511xfs_reclaim_worker(
512 struct work_struct *work)
513{
514 struct xfs_mount *mp = container_of(to_delayed_work(work),
515 struct xfs_mount, m_reclaim_work);
516
517 xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
518 xfs_syncd_queue_reclaim(mp);
519}
520
521/*
522 * Flush delayed allocate data, attempting to free up reserved space
523 * from existing allocations. At this point a new allocation attempt
524 * has failed with ENOSPC and we are in the process of scratching our
525 * heads, looking about for more room.
526 *
527 * Queue a new data flush if there isn't one already in progress and
528 * wait for completion of the flush. This means that we only ever have one
529 * inode flush in progress no matter how many ENOSPC events are occurring and
530 * so will prevent the system from bogging down due to every concurrent
531 * ENOSPC event scanning all the active inodes in the system for writeback.
532 */
533void
534xfs_flush_inodes(
535 struct xfs_inode *ip)
536{
537 struct xfs_mount *mp = ip->i_mount;
538
539 queue_work(xfs_syncd_wq, &mp->m_flush_work);
540 flush_work_sync(&mp->m_flush_work);
541}
542
543STATIC void
544xfs_flush_worker(
545 struct work_struct *work)
546{
547 struct xfs_mount *mp = container_of(work,
548 struct xfs_mount, m_flush_work);
549
550 xfs_sync_data(mp, SYNC_TRYLOCK);
551 xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT);
538} 552}
539 553
540int 554int
541xfs_syncd_init( 555xfs_syncd_init(
542 struct xfs_mount *mp) 556 struct xfs_mount *mp)
543{ 557{
544 mp->m_sync_work.w_syncer = xfs_sync_worker; 558 INIT_WORK(&mp->m_flush_work, xfs_flush_worker);
545 mp->m_sync_work.w_mount = mp; 559 INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker);
546 mp->m_sync_work.w_completion = NULL; 560 INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
547 mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd/%s", mp->m_fsname); 561
548 if (IS_ERR(mp->m_sync_task)) 562 xfs_syncd_queue_sync(mp);
549 return -PTR_ERR(mp->m_sync_task); 563 xfs_syncd_queue_reclaim(mp);
564
550 return 0; 565 return 0;
551} 566}
552 567
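
A quick sanity check of the scheduling intervals described in the xfs_syncd_queue_sync() and xfs_syncd_queue_reclaim() comments above: with the default xfs_syncd_centisecs of 3000 (the 30 seconds the comment refers to), the reclaim work requeues itself at one sixth of that, i.e. every 5 seconds. The arithmetic, in a trivially runnable form:

#include <stdio.h>

int main(void)
{
        /* default syncd interval, in centiseconds (30 seconds) */
        int xfs_syncd_centisecs = 3000;

        int sync_ms    = xfs_syncd_centisecs * 10;      /* 30000 ms */
        int reclaim_ms = xfs_syncd_centisecs / 6 * 10;  /*  5000 ms */

        printf("sync work every %d ms, reclaim work every %d ms\n",
               sync_ms, reclaim_ms);
        return 0;
}
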
@@ -554,7 +569,9 @@ void
554xfs_syncd_stop( 569xfs_syncd_stop(
555 struct xfs_mount *mp) 570 struct xfs_mount *mp)
556{ 571{
557 kthread_stop(mp->m_sync_task); 572 cancel_delayed_work_sync(&mp->m_sync_work);
573 cancel_delayed_work_sync(&mp->m_reclaim_work);
574 cancel_work_sync(&mp->m_flush_work);
558} 575}
559 576
560void 577void
@@ -573,6 +590,10 @@ __xfs_inode_set_reclaim_tag(
573 XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), 590 XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
574 XFS_ICI_RECLAIM_TAG); 591 XFS_ICI_RECLAIM_TAG);
575 spin_unlock(&ip->i_mount->m_perag_lock); 592 spin_unlock(&ip->i_mount->m_perag_lock);
593
594 /* schedule periodic background inode reclaim */
595 xfs_syncd_queue_reclaim(ip->i_mount);
596
576 trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno, 597 trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno,
577 -1, _RET_IP_); 598 -1, _RET_IP_);
578 } 599 }
@@ -592,12 +613,12 @@ xfs_inode_set_reclaim_tag(
592 struct xfs_perag *pag; 613 struct xfs_perag *pag;
593 614
594 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); 615 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
595 write_lock(&pag->pag_ici_lock); 616 spin_lock(&pag->pag_ici_lock);
596 spin_lock(&ip->i_flags_lock); 617 spin_lock(&ip->i_flags_lock);
597 __xfs_inode_set_reclaim_tag(pag, ip); 618 __xfs_inode_set_reclaim_tag(pag, ip);
598 __xfs_iflags_set(ip, XFS_IRECLAIMABLE); 619 __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
599 spin_unlock(&ip->i_flags_lock); 620 spin_unlock(&ip->i_flags_lock);
600 write_unlock(&pag->pag_ici_lock); 621 spin_unlock(&pag->pag_ici_lock);
601 xfs_perag_put(pag); 622 xfs_perag_put(pag);
602} 623}
603 624
@@ -639,9 +660,14 @@ xfs_reclaim_inode_grab(
639 struct xfs_inode *ip, 660 struct xfs_inode *ip,
640 int flags) 661 int flags)
641{ 662{
663 ASSERT(rcu_read_lock_held());
664
665 /* quick check for stale RCU freed inode */
666 if (!ip->i_ino)
667 return 1;
642 668
643 /* 669 /*
644 * do some unlocked checks first to avoid unnecceary lock traffic. 670 * do some unlocked checks first to avoid unnecessary lock traffic.
645 * The first is a flush lock check, the second is a already in reclaim 671 * The first is a flush lock check, the second is a already in reclaim
646 * check. Only do these checks if we are not going to block on locks. 672 * check. Only do these checks if we are not going to block on locks.
647 */ 673 */
@@ -654,11 +680,16 @@ xfs_reclaim_inode_grab(
654 * The radix tree lock here protects a thread in xfs_iget from racing 680 * The radix tree lock here protects a thread in xfs_iget from racing
655 * with us starting reclaim on the inode. Once we have the 681 * with us starting reclaim on the inode. Once we have the
656 * XFS_IRECLAIM flag set it will not touch us. 682 * XFS_IRECLAIM flag set it will not touch us.
683 *
684 * Due to RCU lookup, we may find inodes that have been freed and only
685 * have XFS_IRECLAIM set. Indeed, we may see reallocated inodes that
 686	 * aren't candidates for reclaim at all, so we must check that
 687	 * XFS_IRECLAIMABLE is set before proceeding to reclaim.
657 */ 688 */
658 spin_lock(&ip->i_flags_lock); 689 spin_lock(&ip->i_flags_lock);
659 ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE)); 690 if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
660 if (__xfs_iflags_test(ip, XFS_IRECLAIM)) { 691 __xfs_iflags_test(ip, XFS_IRECLAIM)) {
661 /* ignore as it is already under reclaim */ 692 /* not a reclaim candidate. */
662 spin_unlock(&ip->i_flags_lock); 693 spin_unlock(&ip->i_flags_lock);
663 return 1; 694 return 1;
664 } 695 }
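
The revalidation added to xfs_reclaim_inode_grab() above is the general rule for any lockless RCU lookup: the object you found may already have been freed and reused, so it must be re-checked and claimed under its own lock before anything trusts it. A minimal sketch of that pattern with an invented struct obj and hypothetical OBJ_VALID/OBJ_BUSY flags (not the XFS inode flags):

#include <linux/spinlock.h>
#include <linux/errno.h>

#define OBJ_VALID	0x1	/* object is live and eligible for processing */
#define OBJ_BUSY	0x2	/* another thread has already claimed it */

struct obj {
	spinlock_t	lock;
	unsigned long	key;		/* zeroed when the object is freed */
	unsigned int	flags;
};

/* Called under rcu_read_lock(); returns 0 if the object was claimed. */
static int obj_grab(struct obj *o)
{
	/* cheap unlocked check for an object that has already been freed */
	if (!o->key)
		return -ENOENT;

	spin_lock(&o->lock);
	if (!(o->flags & OBJ_VALID) || (o->flags & OBJ_BUSY)) {
		/* freed, reused, or already being processed: skip it */
		spin_unlock(&o->lock);
		return -ENOENT;
	}
	o->flags |= OBJ_BUSY;	/* claim it so concurrent walkers skip it */
	spin_unlock(&o->lock);
	return 0;
}
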
@@ -723,8 +754,10 @@ xfs_reclaim_inode(
723 struct xfs_perag *pag, 754 struct xfs_perag *pag,
724 int sync_mode) 755 int sync_mode)
725{ 756{
726 int error = 0; 757 int error;
727 758
759restart:
760 error = 0;
728 xfs_ilock(ip, XFS_ILOCK_EXCL); 761 xfs_ilock(ip, XFS_ILOCK_EXCL);
729 if (!xfs_iflock_nowait(ip)) { 762 if (!xfs_iflock_nowait(ip)) {
730 if (!(sync_mode & SYNC_WAIT)) 763 if (!(sync_mode & SYNC_WAIT))
@@ -750,9 +783,31 @@ xfs_reclaim_inode(
750 if (xfs_inode_clean(ip)) 783 if (xfs_inode_clean(ip))
751 goto reclaim; 784 goto reclaim;
752 785
753 /* Now we have an inode that needs flushing */ 786 /*
754 error = xfs_iflush(ip, sync_mode); 787 * Now we have an inode that needs flushing.
788 *
789 * We do a nonblocking flush here even if we are doing a SYNC_WAIT
790 * reclaim as we can deadlock with inode cluster removal.
791 * xfs_ifree_cluster() can lock the inode buffer before it locks the
792 * ip->i_lock, and we are doing the exact opposite here. As a result,
793 * doing a blocking xfs_itobp() to get the cluster buffer will result
794 * in an ABBA deadlock with xfs_ifree_cluster().
795 *
 796	 * As xfs_ifree_cluster() must gather all inodes that are active in the
797 * cache to mark them stale, if we hit this case we don't actually want
798 * to do IO here - we want the inode marked stale so we can simply
799 * reclaim it. Hence if we get an EAGAIN error on a SYNC_WAIT flush,
800 * just unlock the inode, back off and try again. Hopefully the next
801 * pass through will see the stale flag set on the inode.
802 */
803 error = xfs_iflush(ip, SYNC_TRYLOCK | sync_mode);
755 if (sync_mode & SYNC_WAIT) { 804 if (sync_mode & SYNC_WAIT) {
805 if (error == EAGAIN) {
806 xfs_iunlock(ip, XFS_ILOCK_EXCL);
807 /* backoff longer than in xfs_ifree_cluster */
808 delay(2);
809 goto restart;
810 }
756 xfs_iflock(ip); 811 xfs_iflock(ip);
757 goto reclaim; 812 goto reclaim;
758 } 813 }
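
The long comment above is a textbook ABBA-avoidance recipe: never block on the second lock, and on contention drop what you hold, back off, and retry from the top. A condensed sketch of the same idea with two ordinary mutexes; the structure, helpers and delay value are illustrative only, not XFS code:

#include <linux/mutex.h>
#include <linux/delay.h>

struct two_locked {
	struct mutex	a;	/* this path takes a first */
	struct mutex	b;	/* another path takes b first, then wants a */
};

static void touch_both(struct two_locked *r)
{
restart:
	mutex_lock(&r->a);
	if (!mutex_trylock(&r->b)) {
		/*
		 * Contended: the opposite-order path probably holds b and is
		 * waiting for a.  Drop a, back off, and retry rather than
		 * blocking here and completing the ABBA deadlock.
		 */
		mutex_unlock(&r->a);
		msleep(2);
		goto restart;
	}

	/* ... both locks held, do the work ... */

	mutex_unlock(&r->b);
	mutex_unlock(&r->a);
}
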
@@ -767,7 +822,7 @@ xfs_reclaim_inode(
767 * pass on the error. 822 * pass on the error.
768 */ 823 */
769 if (error && error != EAGAIN && !XFS_FORCED_SHUTDOWN(ip->i_mount)) { 824 if (error && error != EAGAIN && !XFS_FORCED_SHUTDOWN(ip->i_mount)) {
770 xfs_fs_cmn_err(CE_WARN, ip->i_mount, 825 xfs_warn(ip->i_mount,
771 "inode 0x%llx background reclaim flush failed with %d", 826 "inode 0x%llx background reclaim flush failed with %d",
772 (long long)ip->i_ino, error); 827 (long long)ip->i_ino, error);
773 } 828 }
@@ -795,12 +850,12 @@ reclaim:
795 * added to the tree assert that it's been there before to catch 850 * added to the tree assert that it's been there before to catch
796 * problems with the inode life time early on. 851 * problems with the inode life time early on.
797 */ 852 */
798 write_lock(&pag->pag_ici_lock); 853 spin_lock(&pag->pag_ici_lock);
799 if (!radix_tree_delete(&pag->pag_ici_root, 854 if (!radix_tree_delete(&pag->pag_ici_root,
800 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino))) 855 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino)))
801 ASSERT(0); 856 ASSERT(0);
802 __xfs_inode_clear_reclaim(pag, ip); 857 __xfs_inode_clear_reclaim(pag, ip);
803 write_unlock(&pag->pag_ici_lock); 858 spin_unlock(&pag->pag_ici_lock);
804 859
805 /* 860 /*
806 * Here we do an (almost) spurious inode lock in order to coordinate 861 * Here we do an (almost) spurious inode lock in order to coordinate
@@ -864,14 +919,14 @@ restart:
864 struct xfs_inode *batch[XFS_LOOKUP_BATCH]; 919 struct xfs_inode *batch[XFS_LOOKUP_BATCH];
865 int i; 920 int i;
866 921
867 write_lock(&pag->pag_ici_lock); 922 rcu_read_lock();
868 nr_found = radix_tree_gang_lookup_tag( 923 nr_found = radix_tree_gang_lookup_tag(
869 &pag->pag_ici_root, 924 &pag->pag_ici_root,
870 (void **)batch, first_index, 925 (void **)batch, first_index,
871 XFS_LOOKUP_BATCH, 926 XFS_LOOKUP_BATCH,
872 XFS_ICI_RECLAIM_TAG); 927 XFS_ICI_RECLAIM_TAG);
873 if (!nr_found) { 928 if (!nr_found) {
874 write_unlock(&pag->pag_ici_lock); 929 rcu_read_unlock();
875 break; 930 break;
876 } 931 }
877 932
@@ -891,14 +946,24 @@ restart:
891 * occur if we have inodes in the last block of 946 * occur if we have inodes in the last block of
892 * the AG and we are currently pointing to the 947 * the AG and we are currently pointing to the
893 * last inode. 948 * last inode.
949 *
950 * Because we may see inodes that are from the
951 * wrong AG due to RCU freeing and
952 * reallocation, only update the index if it
 953	 * lies in this AG. It was a race that led us
954 * to see this inode, so another lookup from
955 * the same index will not find it again.
894 */ 956 */
957 if (XFS_INO_TO_AGNO(mp, ip->i_ino) !=
958 pag->pag_agno)
959 continue;
895 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); 960 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
896 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) 961 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
897 done = 1; 962 done = 1;
898 } 963 }
899 964
900 /* unlock now we've grabbed the inodes. */ 965 /* unlock now we've grabbed the inodes. */
901 write_unlock(&pag->pag_ici_lock); 966 rcu_read_unlock();
902 967
903 for (i = 0; i < nr_found; i++) { 968 for (i = 0; i < nr_found; i++) {
904 if (!batch[i]) 969 if (!batch[i])
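
The hunk above swaps the pag_ici_lock for an RCU read lock around the batched radix tree lookup. A condensed sketch of that walk pattern, assuming the stored objects are RCU-freed and record the index they were inserted under (all names are illustrative):

#include <linux/radix-tree.h>
#include <linux/rcupdate.h>

#define WALK_BATCH	32
#define WALK_TAG	0

struct obj {
	unsigned long	index;	/* key the object was inserted under */
	/* ... */
};

static void walk_tagged(struct radix_tree_root *root)
{
	unsigned long first_index = 0;
	int done = 0;

	while (!done) {
		struct obj *batch[WALK_BATCH];
		unsigned int nr, i;

		rcu_read_lock();
		nr = radix_tree_gang_lookup_tag(root, (void **)batch,
						first_index, WALK_BATCH,
						WALK_TAG);
		if (!nr) {
			rcu_read_unlock();
			break;
		}

		/* advance the cursor before dropping the RCU read lock */
		for (i = 0; i < nr; i++) {
			first_index = batch[i]->index + 1;
			if (first_index == 0)
				done = 1;	/* index wrapped: last batch */
		}
		rcu_read_unlock();

		for (i = 0; i < nr; i++) {
			/*
			 * Each entry may have been freed and reused since the
			 * lookup; revalidate it under its own lock (as the
			 * grab helper above does) before touching it.
			 */
		}
	}
}
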
@@ -945,7 +1010,13 @@ xfs_reclaim_inodes(
945} 1010}
946 1011
947/* 1012/*
948 * Shrinker infrastructure. 1013 * Inode cache shrinker.
1014 *
1015 * When called we make sure that there is a background (fast) inode reclaim in
1016	 * progress, while we will throttle the speed of reclaim by doing synchronous
1017 * reclaim of inodes. That means if we come across dirty inodes, we wait for
1018 * them to be cleaned, which we hope will not be very long due to the
1019 * background walker having already kicked the IO off on those dirty inodes.
949 */ 1020 */
950static int 1021static int
951xfs_reclaim_inode_shrink( 1022xfs_reclaim_inode_shrink(
@@ -960,10 +1031,15 @@ xfs_reclaim_inode_shrink(
960 1031
961 mp = container_of(shrink, struct xfs_mount, m_inode_shrink); 1032 mp = container_of(shrink, struct xfs_mount, m_inode_shrink);
962 if (nr_to_scan) { 1033 if (nr_to_scan) {
1034 /* kick background reclaimer and push the AIL */
1035 xfs_syncd_queue_reclaim(mp);
1036 xfs_ail_push_all(mp->m_ail);
1037
963 if (!(gfp_mask & __GFP_FS)) 1038 if (!(gfp_mask & __GFP_FS))
964 return -1; 1039 return -1;
965 1040
966 xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK, &nr_to_scan); 1041 xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT,
1042 &nr_to_scan);
967 /* terminate if we don't exhaust the scan */ 1043 /* terminate if we don't exhaust the scan */
968 if (nr_to_scan > 0) 1044 if (nr_to_scan > 0)
969 return -1; 1045 return -1;
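
xfs_syncd_init()/xfs_syncd_stop() above replace the old xfssyncd kthread with work items queued on the xfs_syncd_wq workqueue declared in xfs_sync.h below. A stripped-down sketch of that self-rearming delayed-work pattern; the workqueue name and the 30-second period are placeholders, not the values XFS uses:

#include <linux/workqueue.h>
#include <linux/jiffies.h>
#include <linux/errno.h>

static struct workqueue_struct	*sync_wq;
static struct delayed_work	sync_work;

static void sync_worker(struct work_struct *work)
{
	/* ... periodic flush/reclaim pass goes here ... */

	/* re-arm for the next period */
	queue_delayed_work(sync_wq, &sync_work, msecs_to_jiffies(30 * 1000));
}

static int sync_init(void)
{
	sync_wq = alloc_workqueue("example_syncd", 0, 0);
	if (!sync_wq)
		return -ENOMEM;

	INIT_DELAYED_WORK(&sync_work, sync_worker);
	queue_delayed_work(sync_wq, &sync_work, msecs_to_jiffies(30 * 1000));
	return 0;
}

static void sync_stop(void)
{
	/* cancels and waits; safe even though the work re-queues itself */
	cancel_delayed_work_sync(&sync_work);
	destroy_workqueue(sync_wq);
}

Shutdown reduces to cancel_delayed_work_sync() plus tearing down the queue, with no kthread_stop() handshake to get wrong.
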
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index 32ba6628290c..e3a6ad27415f 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -32,6 +32,8 @@ typedef struct xfs_sync_work {
32#define SYNC_WAIT 0x0001 /* wait for i/o to complete */ 32#define SYNC_WAIT 0x0001 /* wait for i/o to complete */
33#define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */ 33#define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */
34 34
35extern struct workqueue_struct *xfs_syncd_wq; /* sync workqueue */
36
35int xfs_syncd_init(struct xfs_mount *mp); 37int xfs_syncd_init(struct xfs_mount *mp);
36void xfs_syncd_stop(struct xfs_mount *mp); 38void xfs_syncd_stop(struct xfs_mount *mp);
37 39
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c
index 7bb5092d6ae4..ee2d2adaa438 100644
--- a/fs/xfs/linux-2.6/xfs_sysctl.c
+++ b/fs/xfs/linux-2.6/xfs_sysctl.c
@@ -18,6 +18,7 @@
18#include "xfs.h" 18#include "xfs.h"
19#include <linux/sysctl.h> 19#include <linux/sysctl.h>
20#include <linux/proc_fs.h> 20#include <linux/proc_fs.h>
21#include "xfs_error.h"
21 22
22static struct ctl_table_header *xfs_table_header; 23static struct ctl_table_header *xfs_table_header;
23 24
@@ -36,7 +37,7 @@ xfs_stats_clear_proc_handler(
36 ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos); 37 ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
37 38
38 if (!ret && write && *valp) { 39 if (!ret && write && *valp) {
39 printk("XFS Clearing xfsstats\n"); 40 xfs_notice(NULL, "Clearing xfsstats");
40 for_each_possible_cpu(c) { 41 for_each_possible_cpu(c) {
41 preempt_disable(); 42 preempt_disable();
42 /* save vn_active, it's a universal truth! */ 43 /* save vn_active, it's a universal truth! */
@@ -51,6 +52,26 @@ xfs_stats_clear_proc_handler(
51 52
52 return ret; 53 return ret;
53} 54}
55
56STATIC int
57xfs_panic_mask_proc_handler(
58 ctl_table *ctl,
59 int write,
60 void __user *buffer,
61 size_t *lenp,
62 loff_t *ppos)
63{
64 int ret, *valp = ctl->data;
65
66 ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
67 if (!ret && write) {
68 xfs_panic_mask = *valp;
69#ifdef DEBUG
70 xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES);
71#endif
72 }
73 return ret;
74}
54#endif /* CONFIG_PROC_FS */ 75#endif /* CONFIG_PROC_FS */
55 76
56static ctl_table xfs_table[] = { 77static ctl_table xfs_table[] = {
@@ -77,7 +98,7 @@ static ctl_table xfs_table[] = {
77 .data = &xfs_params.panic_mask.val, 98 .data = &xfs_params.panic_mask.val,
78 .maxlen = sizeof(int), 99 .maxlen = sizeof(int),
79 .mode = 0644, 100 .mode = 0644,
80 .proc_handler = proc_dointvec_minmax, 101 .proc_handler = xfs_panic_mask_proc_handler,
81 .extra1 = &xfs_params.panic_mask.min, 102 .extra1 = &xfs_params.panic_mask.min,
82 .extra2 = &xfs_params.panic_mask.max 103 .extra2 = &xfs_params.panic_mask.max
83 }, 104 },
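
xfs_panic_mask_proc_handler() above shows the usual way to post-process a sysctl write: let proc_dointvec_minmax() do the parsing and the extra1/extra2 bounds check, then act on the stored value only when the write succeeded. A generic sketch of the same wrapper shape (the tunable, its bounds, and the apply step are hypothetical, and registration of the table is omitted):

#include <linux/sysctl.h>

static int my_tunable;
static int my_tunable_min = 0;
static int my_tunable_max = 255;

static int my_tunable_handler(struct ctl_table *ctl, int write,
			      void __user *buffer, size_t *lenp, loff_t *ppos)
{
	int ret;

	/* parse the write and enforce the extra1/extra2 bounds */
	ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
	if (!ret && write) {
		/* value accepted: propagate it to wherever it takes effect */
	}
	return ret;
}

static struct ctl_table my_table[] = {
	{
		.procname	= "my_tunable",
		.data		= &my_tunable,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= my_tunable_handler,
		.extra1		= &my_tunable_min,
		.extra2		= &my_tunable_max,
	},
	{ }
};
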
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index acef2e98c594..2d0bcb479075 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -766,8 +766,8 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
766 __field(int, curr_res) 766 __field(int, curr_res)
767 __field(int, unit_res) 767 __field(int, unit_res)
768 __field(unsigned int, flags) 768 __field(unsigned int, flags)
769 __field(void *, reserve_headq) 769 __field(int, reserveq)
770 __field(void *, write_headq) 770 __field(int, writeq)
771 __field(int, grant_reserve_cycle) 771 __field(int, grant_reserve_cycle)
772 __field(int, grant_reserve_bytes) 772 __field(int, grant_reserve_bytes)
773 __field(int, grant_write_cycle) 773 __field(int, grant_write_cycle)
@@ -784,19 +784,21 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
784 __entry->curr_res = tic->t_curr_res; 784 __entry->curr_res = tic->t_curr_res;
785 __entry->unit_res = tic->t_unit_res; 785 __entry->unit_res = tic->t_unit_res;
786 __entry->flags = tic->t_flags; 786 __entry->flags = tic->t_flags;
787 __entry->reserve_headq = log->l_reserve_headq; 787 __entry->reserveq = list_empty(&log->l_reserveq);
788 __entry->write_headq = log->l_write_headq; 788 __entry->writeq = list_empty(&log->l_writeq);
789 __entry->grant_reserve_cycle = log->l_grant_reserve_cycle; 789 xlog_crack_grant_head(&log->l_grant_reserve_head,
790 __entry->grant_reserve_bytes = log->l_grant_reserve_bytes; 790 &__entry->grant_reserve_cycle,
791 __entry->grant_write_cycle = log->l_grant_write_cycle; 791 &__entry->grant_reserve_bytes);
792 __entry->grant_write_bytes = log->l_grant_write_bytes; 792 xlog_crack_grant_head(&log->l_grant_write_head,
793 &__entry->grant_write_cycle,
794 &__entry->grant_write_bytes);
793 __entry->curr_cycle = log->l_curr_cycle; 795 __entry->curr_cycle = log->l_curr_cycle;
794 __entry->curr_block = log->l_curr_block; 796 __entry->curr_block = log->l_curr_block;
795 __entry->tail_lsn = log->l_tail_lsn; 797 __entry->tail_lsn = atomic64_read(&log->l_tail_lsn);
796 ), 798 ),
797 TP_printk("dev %d:%d type %s t_ocnt %u t_cnt %u t_curr_res %u " 799 TP_printk("dev %d:%d type %s t_ocnt %u t_cnt %u t_curr_res %u "
798 "t_unit_res %u t_flags %s reserve_headq 0x%p " 800 "t_unit_res %u t_flags %s reserveq %s "
799 "write_headq 0x%p grant_reserve_cycle %d " 801 "writeq %s grant_reserve_cycle %d "
800 "grant_reserve_bytes %d grant_write_cycle %d " 802 "grant_reserve_bytes %d grant_write_cycle %d "
801 "grant_write_bytes %d curr_cycle %d curr_block %d " 803 "grant_write_bytes %d curr_cycle %d curr_block %d "
802 "tail_cycle %d tail_block %d", 804 "tail_cycle %d tail_block %d",
@@ -807,8 +809,8 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
807 __entry->curr_res, 809 __entry->curr_res,
808 __entry->unit_res, 810 __entry->unit_res,
809 __print_flags(__entry->flags, "|", XLOG_TIC_FLAGS), 811 __print_flags(__entry->flags, "|", XLOG_TIC_FLAGS),
810 __entry->reserve_headq, 812 __entry->reserveq ? "empty" : "active",
811 __entry->write_headq, 813 __entry->writeq ? "empty" : "active",
812 __entry->grant_reserve_cycle, 814 __entry->grant_reserve_cycle,
813 __entry->grant_reserve_bytes, 815 __entry->grant_reserve_bytes,
814 __entry->grant_write_cycle, 816 __entry->grant_write_cycle,
@@ -835,6 +837,7 @@ DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep1);
835DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake1); 837DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake1);
836DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep2); 838DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep2);
837DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake2); 839DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake2);
840DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake_up);
838DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_enter); 841DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_enter);
839DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_exit); 842DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_exit);
840DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_error); 843DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_error);
@@ -842,6 +845,7 @@ DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep1);
842DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake1); 845DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake1);
843DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep2); 846DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep2);
844DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake2); 847DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake2);
848DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake_up);
845DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter); 849DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter);
846DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit); 850DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit);
847DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub); 851DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub);
@@ -935,10 +939,10 @@ DEFINE_PAGE_EVENT(xfs_writepage);
935DEFINE_PAGE_EVENT(xfs_releasepage); 939DEFINE_PAGE_EVENT(xfs_releasepage);
936DEFINE_PAGE_EVENT(xfs_invalidatepage); 940DEFINE_PAGE_EVENT(xfs_invalidatepage);
937 941
938DECLARE_EVENT_CLASS(xfs_iomap_class, 942DECLARE_EVENT_CLASS(xfs_imap_class,
939 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, 943 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
940 int flags, struct xfs_bmbt_irec *irec), 944 int type, struct xfs_bmbt_irec *irec),
941 TP_ARGS(ip, offset, count, flags, irec), 945 TP_ARGS(ip, offset, count, type, irec),
942 TP_STRUCT__entry( 946 TP_STRUCT__entry(
943 __field(dev_t, dev) 947 __field(dev_t, dev)
944 __field(xfs_ino_t, ino) 948 __field(xfs_ino_t, ino)
@@ -946,7 +950,7 @@ DECLARE_EVENT_CLASS(xfs_iomap_class,
946 __field(loff_t, new_size) 950 __field(loff_t, new_size)
947 __field(loff_t, offset) 951 __field(loff_t, offset)
948 __field(size_t, count) 952 __field(size_t, count)
949 __field(int, flags) 953 __field(int, type)
950 __field(xfs_fileoff_t, startoff) 954 __field(xfs_fileoff_t, startoff)
951 __field(xfs_fsblock_t, startblock) 955 __field(xfs_fsblock_t, startblock)
952 __field(xfs_filblks_t, blockcount) 956 __field(xfs_filblks_t, blockcount)
@@ -958,13 +962,13 @@ DECLARE_EVENT_CLASS(xfs_iomap_class,
958 __entry->new_size = ip->i_new_size; 962 __entry->new_size = ip->i_new_size;
959 __entry->offset = offset; 963 __entry->offset = offset;
960 __entry->count = count; 964 __entry->count = count;
961 __entry->flags = flags; 965 __entry->type = type;
962 __entry->startoff = irec ? irec->br_startoff : 0; 966 __entry->startoff = irec ? irec->br_startoff : 0;
963 __entry->startblock = irec ? irec->br_startblock : 0; 967 __entry->startblock = irec ? irec->br_startblock : 0;
964 __entry->blockcount = irec ? irec->br_blockcount : 0; 968 __entry->blockcount = irec ? irec->br_blockcount : 0;
965 ), 969 ),
966 TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " 970 TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
967 "offset 0x%llx count %zd flags %s " 971 "offset 0x%llx count %zd type %s "
968 "startoff 0x%llx startblock %lld blockcount 0x%llx", 972 "startoff 0x%llx startblock %lld blockcount 0x%llx",
969 MAJOR(__entry->dev), MINOR(__entry->dev), 973 MAJOR(__entry->dev), MINOR(__entry->dev),
970 __entry->ino, 974 __entry->ino,
@@ -972,20 +976,21 @@ DECLARE_EVENT_CLASS(xfs_iomap_class,
972 __entry->new_size, 976 __entry->new_size,
973 __entry->offset, 977 __entry->offset,
974 __entry->count, 978 __entry->count,
975 __print_flags(__entry->flags, "|", BMAPI_FLAGS), 979 __print_symbolic(__entry->type, XFS_IO_TYPES),
976 __entry->startoff, 980 __entry->startoff,
977 (__int64_t)__entry->startblock, 981 (__int64_t)__entry->startblock,
978 __entry->blockcount) 982 __entry->blockcount)
979) 983)
980 984
981#define DEFINE_IOMAP_EVENT(name) \ 985#define DEFINE_IOMAP_EVENT(name) \
982DEFINE_EVENT(xfs_iomap_class, name, \ 986DEFINE_EVENT(xfs_imap_class, name, \
983 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \ 987 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \
984 int flags, struct xfs_bmbt_irec *irec), \ 988 int type, struct xfs_bmbt_irec *irec), \
985 TP_ARGS(ip, offset, count, flags, irec)) 989 TP_ARGS(ip, offset, count, type, irec))
986DEFINE_IOMAP_EVENT(xfs_iomap_enter); 990DEFINE_IOMAP_EVENT(xfs_map_blocks_found);
987DEFINE_IOMAP_EVENT(xfs_iomap_found); 991DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc);
988DEFINE_IOMAP_EVENT(xfs_iomap_alloc); 992DEFINE_IOMAP_EVENT(xfs_get_blocks_found);
993DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc);
989 994
990DECLARE_EVENT_CLASS(xfs_simple_io_class, 995DECLARE_EVENT_CLASS(xfs_simple_io_class,
991 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), 996 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
@@ -1022,6 +1027,7 @@ DEFINE_EVENT(xfs_simple_io_class, name, \
1022 TP_ARGS(ip, offset, count)) 1027 TP_ARGS(ip, offset, count))
1023DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc); 1028DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc);
1024DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert); 1029DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert);
1030DEFINE_SIMPLE_IO_EVENT(xfs_get_blocks_notfound);
1025 1031
1026 1032
1027TRACE_EVENT(xfs_itruncate_start, 1033TRACE_EVENT(xfs_itruncate_start,
@@ -1420,6 +1426,7 @@ DEFINE_EVENT(xfs_alloc_class, name, \
1420 TP_PROTO(struct xfs_alloc_arg *args), \ 1426 TP_PROTO(struct xfs_alloc_arg *args), \
1421 TP_ARGS(args)) 1427 TP_ARGS(args))
1422DEFINE_ALLOC_EVENT(xfs_alloc_exact_done); 1428DEFINE_ALLOC_EVENT(xfs_alloc_exact_done);
1429DEFINE_ALLOC_EVENT(xfs_alloc_exact_notfound);
1423DEFINE_ALLOC_EVENT(xfs_alloc_exact_error); 1430DEFINE_ALLOC_EVENT(xfs_alloc_exact_error);
1424DEFINE_ALLOC_EVENT(xfs_alloc_near_nominleft); 1431DEFINE_ALLOC_EVENT(xfs_alloc_near_nominleft);
1425DEFINE_ALLOC_EVENT(xfs_alloc_near_first); 1432DEFINE_ALLOC_EVENT(xfs_alloc_near_first);
@@ -1752,6 +1759,39 @@ DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_recover);
1752DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_cancel); 1759DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_cancel);
1753DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_skip); 1760DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_skip);
1754 1761
1762DECLARE_EVENT_CLASS(xfs_discard_class,
1763 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
1764 xfs_agblock_t agbno, xfs_extlen_t len),
1765 TP_ARGS(mp, agno, agbno, len),
1766 TP_STRUCT__entry(
1767 __field(dev_t, dev)
1768 __field(xfs_agnumber_t, agno)
1769 __field(xfs_agblock_t, agbno)
1770 __field(xfs_extlen_t, len)
1771 ),
1772 TP_fast_assign(
1773 __entry->dev = mp->m_super->s_dev;
1774 __entry->agno = agno;
1775 __entry->agbno = agbno;
1776 __entry->len = len;
1777 ),
1778 TP_printk("dev %d:%d agno %u agbno %u len %u\n",
1779 MAJOR(__entry->dev), MINOR(__entry->dev),
1780 __entry->agno,
1781 __entry->agbno,
1782 __entry->len)
1783)
1784
1785#define DEFINE_DISCARD_EVENT(name) \
1786DEFINE_EVENT(xfs_discard_class, name, \
1787 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
1788 xfs_agblock_t agbno, xfs_extlen_t len), \
1789 TP_ARGS(mp, agno, agbno, len))
1790DEFINE_DISCARD_EVENT(xfs_discard_extent);
1791DEFINE_DISCARD_EVENT(xfs_discard_toosmall);
1792DEFINE_DISCARD_EVENT(xfs_discard_exclude);
1793DEFINE_DISCARD_EVENT(xfs_discard_busy);
1794
1755#endif /* _TRACE_XFS_H */ 1795#endif /* _TRACE_XFS_H */
1756 1796
1757#undef TRACE_INCLUDE_PATH 1797#undef TRACE_INCLUDE_PATH
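
The xfs_discard_class events added above follow the standard tracepoint idiom: one DECLARE_EVENT_CLASS() describing the record layout and formatting, then one cheap DEFINE_EVENT() per call site, usually via a small wrapper macro. A minimal sketch of the same idiom for a hypothetical subsystem, assuming the header lives under include/trace/events/ so the default include path applies:

#undef TRACE_SYSTEM
#define TRACE_SYSTEM example

#if !defined(_TRACE_EXAMPLE_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_EXAMPLE_H

#include <linux/tracepoint.h>

DECLARE_EVENT_CLASS(example_range_class,
	TP_PROTO(unsigned int id, unsigned long start, unsigned long len),
	TP_ARGS(id, start, len),
	TP_STRUCT__entry(
		__field(unsigned int, id)
		__field(unsigned long, start)
		__field(unsigned long, len)
	),
	TP_fast_assign(
		__entry->id = id;
		__entry->start = start;
		__entry->len = len;
	),
	TP_printk("id %u start %lu len %lu",
		  __entry->id, __entry->start, __entry->len)
)

#define DEFINE_RANGE_EVENT(name) \
DEFINE_EVENT(example_range_class, name, \
	TP_PROTO(unsigned int id, unsigned long start, unsigned long len), \
	TP_ARGS(id, start, len))
DEFINE_RANGE_EVENT(example_range_start);
DEFINE_RANGE_EVENT(example_range_done);

#endif /* _TRACE_EXAMPLE_H */

/* This part must be outside the multi-read protection */
#include <trace/define_trace.h>

Each DEFINE_EVENT() reuses the class's assign and print code, so adding another probe point costs little more than a new name.
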
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index faf8e1a83a12..6fa214603819 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -149,7 +149,6 @@ xfs_qm_dqdestroy(
149 ASSERT(list_empty(&dqp->q_freelist)); 149 ASSERT(list_empty(&dqp->q_freelist));
150 150
151 mutex_destroy(&dqp->q_qlock); 151 mutex_destroy(&dqp->q_qlock);
152 sv_destroy(&dqp->q_pinwait);
153 kmem_zone_free(xfs_Gqm->qm_dqzone, dqp); 152 kmem_zone_free(xfs_Gqm->qm_dqzone, dqp);
154 153
155 atomic_dec(&xfs_Gqm->qm_totaldquots); 154 atomic_dec(&xfs_Gqm->qm_totaldquots);
@@ -545,9 +544,10 @@ xfs_qm_dqtobp(
545 /* 544 /*
546 * A simple sanity check in case we got a corrupted dquot... 545 * A simple sanity check in case we got a corrupted dquot...
547 */ 546 */
548 if (xfs_qm_dqcheck(ddq, id, dqp->dq_flags & XFS_DQ_ALLTYPES, 547 error = xfs_qm_dqcheck(mp, ddq, id, dqp->dq_flags & XFS_DQ_ALLTYPES,
549 flags & (XFS_QMOPT_DQREPAIR|XFS_QMOPT_DOWARN), 548 flags & (XFS_QMOPT_DQREPAIR|XFS_QMOPT_DOWARN),
550 "dqtobp")) { 549 "dqtobp");
550 if (error) {
551 if (!(flags & XFS_QMOPT_DQREPAIR)) { 551 if (!(flags & XFS_QMOPT_DQREPAIR)) {
552 xfs_trans_brelse(tp, bp); 552 xfs_trans_brelse(tp, bp);
553 return XFS_ERROR(EIO); 553 return XFS_ERROR(EIO);
@@ -600,7 +600,7 @@ xfs_qm_dqread(
600 600
601 /* 601 /*
602 * Reservation counters are defined as reservation plus current usage 602 * Reservation counters are defined as reservation plus current usage
603 * to avoid having to add everytime. 603 * to avoid having to add every time.
604 */ 604 */
605 dqp->q_res_bcount = be64_to_cpu(ddqp->d_bcount); 605 dqp->q_res_bcount = be64_to_cpu(ddqp->d_bcount);
606 dqp->q_res_icount = be64_to_cpu(ddqp->d_icount); 606 dqp->q_res_icount = be64_to_cpu(ddqp->d_icount);
@@ -828,7 +828,7 @@ xfs_qm_dqget(
828 if (xfs_do_dqerror) { 828 if (xfs_do_dqerror) {
829 if ((xfs_dqerror_target == mp->m_ddev_targp) && 829 if ((xfs_dqerror_target == mp->m_ddev_targp) &&
830 (xfs_dqreq_num++ % xfs_dqerror_mod) == 0) { 830 (xfs_dqreq_num++ % xfs_dqerror_mod) == 0) {
831 cmn_err(CE_DEBUG, "Returning error in dqget"); 831 xfs_debug(mp, "Returning error in dqget");
832 return (EIO); 832 return (EIO);
833 } 833 }
834 } 834 }
@@ -1208,8 +1208,9 @@ xfs_qm_dqflush(
1208 /* 1208 /*
1209 * A simple sanity check in case we got a corrupted dquot.. 1209 * A simple sanity check in case we got a corrupted dquot..
1210 */ 1210 */
1211 if (xfs_qm_dqcheck(&dqp->q_core, be32_to_cpu(ddqp->d_id), 0, 1211 error = xfs_qm_dqcheck(mp, &dqp->q_core, be32_to_cpu(ddqp->d_id), 0,
1212 XFS_QMOPT_DOWARN, "dqflush (incore copy)")) { 1212 XFS_QMOPT_DOWARN, "dqflush (incore copy)");
1213 if (error) {
1213 xfs_buf_relse(bp); 1214 xfs_buf_relse(bp);
1214 xfs_dqfunlock(dqp); 1215 xfs_dqfunlock(dqp);
1215 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 1216 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
@@ -1392,8 +1393,8 @@ xfs_qm_dqpurge(
1392 */ 1393 */
1393 error = xfs_qm_dqflush(dqp, SYNC_WAIT); 1394 error = xfs_qm_dqflush(dqp, SYNC_WAIT);
1394 if (error) 1395 if (error)
1395 xfs_fs_cmn_err(CE_WARN, mp, 1396 xfs_warn(mp, "%s: dquot %p flush failed",
1396 "xfs_qm_dqpurge: dquot %p flush failed", dqp); 1397 __func__, dqp);
1397 xfs_dqflock(dqp); 1398 xfs_dqflock(dqp);
1398 } 1399 }
1399 ASSERT(atomic_read(&dqp->q_pincount) == 0); 1400 ASSERT(atomic_read(&dqp->q_pincount) == 0);
@@ -1426,36 +1427,38 @@ xfs_qm_dqpurge(
1426void 1427void
1427xfs_qm_dqprint(xfs_dquot_t *dqp) 1428xfs_qm_dqprint(xfs_dquot_t *dqp)
1428{ 1429{
1429 cmn_err(CE_DEBUG, "-----------KERNEL DQUOT----------------"); 1430 struct xfs_mount *mp = dqp->q_mount;
1430 cmn_err(CE_DEBUG, "---- dquotID = %d", 1431
1432 xfs_debug(mp, "-----------KERNEL DQUOT----------------");
1433 xfs_debug(mp, "---- dquotID = %d",
1431 (int)be32_to_cpu(dqp->q_core.d_id)); 1434 (int)be32_to_cpu(dqp->q_core.d_id));
1432 cmn_err(CE_DEBUG, "---- type = %s", DQFLAGTO_TYPESTR(dqp)); 1435 xfs_debug(mp, "---- type = %s", DQFLAGTO_TYPESTR(dqp));
1433 cmn_err(CE_DEBUG, "---- fs = 0x%p", dqp->q_mount); 1436 xfs_debug(mp, "---- fs = 0x%p", dqp->q_mount);
1434 cmn_err(CE_DEBUG, "---- blkno = 0x%x", (int) dqp->q_blkno); 1437 xfs_debug(mp, "---- blkno = 0x%x", (int) dqp->q_blkno);
1435 cmn_err(CE_DEBUG, "---- boffset = 0x%x", (int) dqp->q_bufoffset); 1438 xfs_debug(mp, "---- boffset = 0x%x", (int) dqp->q_bufoffset);
1436 cmn_err(CE_DEBUG, "---- blkhlimit = %Lu (0x%x)", 1439 xfs_debug(mp, "---- blkhlimit = %Lu (0x%x)",
1437 be64_to_cpu(dqp->q_core.d_blk_hardlimit), 1440 be64_to_cpu(dqp->q_core.d_blk_hardlimit),
1438 (int)be64_to_cpu(dqp->q_core.d_blk_hardlimit)); 1441 (int)be64_to_cpu(dqp->q_core.d_blk_hardlimit));
1439 cmn_err(CE_DEBUG, "---- blkslimit = %Lu (0x%x)", 1442 xfs_debug(mp, "---- blkslimit = %Lu (0x%x)",
1440 be64_to_cpu(dqp->q_core.d_blk_softlimit), 1443 be64_to_cpu(dqp->q_core.d_blk_softlimit),
1441 (int)be64_to_cpu(dqp->q_core.d_blk_softlimit)); 1444 (int)be64_to_cpu(dqp->q_core.d_blk_softlimit));
1442 cmn_err(CE_DEBUG, "---- inohlimit = %Lu (0x%x)", 1445 xfs_debug(mp, "---- inohlimit = %Lu (0x%x)",
1443 be64_to_cpu(dqp->q_core.d_ino_hardlimit), 1446 be64_to_cpu(dqp->q_core.d_ino_hardlimit),
1444 (int)be64_to_cpu(dqp->q_core.d_ino_hardlimit)); 1447 (int)be64_to_cpu(dqp->q_core.d_ino_hardlimit));
1445 cmn_err(CE_DEBUG, "---- inoslimit = %Lu (0x%x)", 1448 xfs_debug(mp, "---- inoslimit = %Lu (0x%x)",
1446 be64_to_cpu(dqp->q_core.d_ino_softlimit), 1449 be64_to_cpu(dqp->q_core.d_ino_softlimit),
1447 (int)be64_to_cpu(dqp->q_core.d_ino_softlimit)); 1450 (int)be64_to_cpu(dqp->q_core.d_ino_softlimit));
1448 cmn_err(CE_DEBUG, "---- bcount = %Lu (0x%x)", 1451 xfs_debug(mp, "---- bcount = %Lu (0x%x)",
1449 be64_to_cpu(dqp->q_core.d_bcount), 1452 be64_to_cpu(dqp->q_core.d_bcount),
1450 (int)be64_to_cpu(dqp->q_core.d_bcount)); 1453 (int)be64_to_cpu(dqp->q_core.d_bcount));
1451 cmn_err(CE_DEBUG, "---- icount = %Lu (0x%x)", 1454 xfs_debug(mp, "---- icount = %Lu (0x%x)",
1452 be64_to_cpu(dqp->q_core.d_icount), 1455 be64_to_cpu(dqp->q_core.d_icount),
1453 (int)be64_to_cpu(dqp->q_core.d_icount)); 1456 (int)be64_to_cpu(dqp->q_core.d_icount));
1454 cmn_err(CE_DEBUG, "---- btimer = %d", 1457 xfs_debug(mp, "---- btimer = %d",
1455 (int)be32_to_cpu(dqp->q_core.d_btimer)); 1458 (int)be32_to_cpu(dqp->q_core.d_btimer));
1456 cmn_err(CE_DEBUG, "---- itimer = %d", 1459 xfs_debug(mp, "---- itimer = %d",
1457 (int)be32_to_cpu(dqp->q_core.d_itimer)); 1460 (int)be32_to_cpu(dqp->q_core.d_itimer));
1458 cmn_err(CE_DEBUG, "---------------------------"); 1461 xfs_debug(mp, "---------------------------");
1459} 1462}
1460#endif 1463#endif
1461 1464
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c
index 2a1f3dc10a02..9e0e2fa3f2c8 100644
--- a/fs/xfs/quota/xfs_dquot_item.c
+++ b/fs/xfs/quota/xfs_dquot_item.c
@@ -136,9 +136,8 @@ xfs_qm_dquot_logitem_push(
136 */ 136 */
137 error = xfs_qm_dqflush(dqp, 0); 137 error = xfs_qm_dqflush(dqp, 0);
138 if (error) 138 if (error)
139 xfs_fs_cmn_err(CE_WARN, dqp->q_mount, 139 xfs_warn(dqp->q_mount, "%s: push error %d on dqp %p",
140 "xfs_qm_dquot_logitem_push: push error %d on dqp %p", 140 __func__, error, dqp);
141 error, dqp);
142 xfs_dqunlock(dqp); 141 xfs_dqunlock(dqp);
143} 142}
144 143
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index f8e854b4fde8..69228aa8605a 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -80,7 +80,7 @@ xfs_qm_dquot_list_print(
80 int i = 0; 80 int i = 0;
81 81
82 list_for_each_entry(dqp, &mp->m_quotainfo->qi_dqlist_lock, qi_mplist) { 82 list_for_each_entry(dqp, &mp->m_quotainfo->qi_dqlist_lock, qi_mplist) {
83 cmn_err(CE_DEBUG, " %d. \"%d (%s)\" " 83 xfs_debug(mp, " %d. \"%d (%s)\" "
84 "bcnt = %lld, icnt = %lld, refs = %d", 84 "bcnt = %lld, icnt = %lld, refs = %d",
85 i++, be32_to_cpu(dqp->q_core.d_id), 85 i++, be32_to_cpu(dqp->q_core.d_id),
86 DQFLAGTO_TYPESTR(dqp), 86 DQFLAGTO_TYPESTR(dqp),
@@ -205,7 +205,7 @@ xfs_qm_destroy(
205 list_for_each_entry_safe(dqp, n, &xqm->qm_dqfrlist, q_freelist) { 205 list_for_each_entry_safe(dqp, n, &xqm->qm_dqfrlist, q_freelist) {
206 xfs_dqlock(dqp); 206 xfs_dqlock(dqp);
207#ifdef QUOTADEBUG 207#ifdef QUOTADEBUG
208 cmn_err(CE_DEBUG, "FREELIST destroy 0x%p", dqp); 208 xfs_debug(dqp->q_mount, "FREELIST destroy 0x%p", dqp);
209#endif 209#endif
210 list_del_init(&dqp->q_freelist); 210 list_del_init(&dqp->q_freelist);
211 xfs_Gqm->qm_dqfrlist_cnt--; 211 xfs_Gqm->qm_dqfrlist_cnt--;
@@ -341,9 +341,7 @@ xfs_qm_mount_quotas(
341 * quotas immediately. 341 * quotas immediately.
342 */ 342 */
343 if (mp->m_sb.sb_rextents) { 343 if (mp->m_sb.sb_rextents) {
344 cmn_err(CE_NOTE, 344 xfs_notice(mp, "Cannot turn on quotas for realtime filesystem");
345 "Cannot turn on quotas for realtime filesystem %s",
346 mp->m_fsname);
347 mp->m_qflags = 0; 345 mp->m_qflags = 0;
348 goto write_changes; 346 goto write_changes;
349 } 347 }
@@ -402,14 +400,13 @@ xfs_qm_mount_quotas(
402 * off, but the on disk superblock doesn't know that ! 400 * off, but the on disk superblock doesn't know that !
403 */ 401 */
404 ASSERT(!(XFS_IS_QUOTA_RUNNING(mp))); 402 ASSERT(!(XFS_IS_QUOTA_RUNNING(mp)));
405 xfs_fs_cmn_err(CE_ALERT, mp, 403 xfs_alert(mp, "%s: Superblock update failed!",
406 "XFS mount_quotas: Superblock update failed!"); 404 __func__);
407 } 405 }
408 } 406 }
409 407
410 if (error) { 408 if (error) {
411 xfs_fs_cmn_err(CE_WARN, mp, 409 xfs_warn(mp, "Failed to initialize disk quotas.");
412 "Failed to initialize disk quotas.");
413 return; 410 return;
414 } 411 }
415 412
@@ -464,12 +461,10 @@ xfs_qm_dqflush_all(
464 struct xfs_quotainfo *q = mp->m_quotainfo; 461 struct xfs_quotainfo *q = mp->m_quotainfo;
465 int recl; 462 int recl;
466 struct xfs_dquot *dqp; 463 struct xfs_dquot *dqp;
467 int niters;
468 int error; 464 int error;
469 465
470 if (!q) 466 if (!q)
471 return 0; 467 return 0;
472 niters = 0;
473again: 468again:
474 mutex_lock(&q->qi_dqlist_lock); 469 mutex_lock(&q->qi_dqlist_lock);
475 list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) { 470 list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) {
@@ -1230,13 +1225,6 @@ xfs_qm_qino_alloc(
1230 } 1225 }
1231 1226
1232 /* 1227 /*
1233 * Keep an extra reference to this quota inode. This inode is
1234 * locked exclusively and joined to the transaction already.
1235 */
1236 ASSERT(xfs_isilocked(*ip, XFS_ILOCK_EXCL));
1237 IHOLD(*ip);
1238
1239 /*
1240 * Make the changes in the superblock, and log those too. 1228 * Make the changes in the superblock, and log those too.
1241 * sbfields arg may contain fields other than *QUOTINO; 1229 * sbfields arg may contain fields other than *QUOTINO;
1242 * VERSIONNUM for example. 1230 * VERSIONNUM for example.
@@ -1264,7 +1252,7 @@ xfs_qm_qino_alloc(
1264 xfs_mod_sb(tp, sbfields); 1252 xfs_mod_sb(tp, sbfields);
1265 1253
1266 if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES))) { 1254 if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES))) {
1267 xfs_fs_cmn_err(CE_ALERT, mp, "XFS qino_alloc failed!"); 1255 xfs_alert(mp, "%s failed (error %d)!", __func__, error);
1268 return error; 1256 return error;
1269 } 1257 }
1270 return 0; 1258 return 0;
@@ -1299,7 +1287,7 @@ xfs_qm_reset_dqcounts(
1299 * output any warnings because it's perfectly possible to 1287 * output any warnings because it's perfectly possible to
1300 * find uninitialised dquot blks. See comment in xfs_qm_dqcheck. 1288 * find uninitialised dquot blks. See comment in xfs_qm_dqcheck.
1301 */ 1289 */
1302 (void) xfs_qm_dqcheck(ddq, id+j, type, XFS_QMOPT_DQREPAIR, 1290 (void) xfs_qm_dqcheck(mp, ddq, id+j, type, XFS_QMOPT_DQREPAIR,
1303 "xfs_quotacheck"); 1291 "xfs_quotacheck");
1304 ddq->d_bcount = 0; 1292 ddq->d_bcount = 0;
1305 ddq->d_icount = 0; 1293 ddq->d_icount = 0;
@@ -1324,14 +1312,9 @@ xfs_qm_dqiter_bufs(
1324{ 1312{
1325 xfs_buf_t *bp; 1313 xfs_buf_t *bp;
1326 int error; 1314 int error;
1327 int notcommitted;
1328 int incr;
1329 int type; 1315 int type;
1330 1316
1331 ASSERT(blkcnt > 0); 1317 ASSERT(blkcnt > 0);
1332 notcommitted = 0;
1333 incr = (blkcnt > XFS_QM_MAX_DQCLUSTER_LOGSZ) ?
1334 XFS_QM_MAX_DQCLUSTER_LOGSZ : blkcnt;
1335 type = flags & XFS_QMOPT_UQUOTA ? XFS_DQ_USER : 1318 type = flags & XFS_QMOPT_UQUOTA ? XFS_DQ_USER :
1336 (flags & XFS_QMOPT_PQUOTA ? XFS_DQ_PROJ : XFS_DQ_GROUP); 1319 (flags & XFS_QMOPT_PQUOTA ? XFS_DQ_PROJ : XFS_DQ_GROUP);
1337 error = 0; 1320 error = 0;
@@ -1676,7 +1659,7 @@ xfs_qm_quotacheck(
1676 */ 1659 */
1677 ASSERT(list_empty(&mp->m_quotainfo->qi_dqlist)); 1660 ASSERT(list_empty(&mp->m_quotainfo->qi_dqlist));
1678 1661
1679 cmn_err(CE_NOTE, "XFS quotacheck %s: Please wait.", mp->m_fsname); 1662 xfs_notice(mp, "Quotacheck needed: Please wait.");
1680 1663
1681 /* 1664 /*
1682 * First we go thru all the dquots on disk, USR and GRP/PRJ, and reset 1665 * First we go thru all the dquots on disk, USR and GRP/PRJ, and reset
@@ -1754,9 +1737,9 @@ xfs_qm_quotacheck(
1754 1737
1755 error_return: 1738 error_return:
1756 if (error) { 1739 if (error) {
1757 cmn_err(CE_WARN, "XFS quotacheck %s: Unsuccessful (Error %d): " 1740 xfs_warn(mp,
1758 "Disabling quotas.", 1741 "Quotacheck: Unsuccessful (Error %d): Disabling quotas.",
1759 mp->m_fsname, error); 1742 error);
1760 /* 1743 /*
1761 * We must turn off quotas. 1744 * We must turn off quotas.
1762 */ 1745 */
@@ -1764,12 +1747,11 @@ xfs_qm_quotacheck(
1764 ASSERT(xfs_Gqm != NULL); 1747 ASSERT(xfs_Gqm != NULL);
1765 xfs_qm_destroy_quotainfo(mp); 1748 xfs_qm_destroy_quotainfo(mp);
1766 if (xfs_mount_reset_sbqflags(mp)) { 1749 if (xfs_mount_reset_sbqflags(mp)) {
1767 cmn_err(CE_WARN, "XFS quotacheck %s: " 1750 xfs_warn(mp,
1768 "Failed to reset quota flags.", mp->m_fsname); 1751 "Quotacheck: Failed to reset quota flags.");
1769 } 1752 }
1770 } else { 1753 } else
1771 cmn_err(CE_NOTE, "XFS quotacheck %s: Done.", mp->m_fsname); 1754 xfs_notice(mp, "Quotacheck: Done.");
1772 }
1773 return (error); 1755 return (error);
1774} 1756}
1775 1757
@@ -1863,12 +1845,14 @@ xfs_qm_dqreclaim_one(void)
1863 xfs_dquot_t *dqpout; 1845 xfs_dquot_t *dqpout;
1864 xfs_dquot_t *dqp; 1846 xfs_dquot_t *dqp;
1865 int restarts; 1847 int restarts;
1848 int startagain;
1866 1849
1867 restarts = 0; 1850 restarts = 0;
1868 dqpout = NULL; 1851 dqpout = NULL;
1869 1852
1870 /* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */ 1853 /* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */
1871startagain: 1854again:
1855 startagain = 0;
1872 mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); 1856 mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
1873 1857
1874 list_for_each_entry(dqp, &xfs_Gqm->qm_dqfrlist, q_freelist) { 1858 list_for_each_entry(dqp, &xfs_Gqm->qm_dqfrlist, q_freelist) {
@@ -1885,13 +1869,10 @@ startagain:
1885 ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE)); 1869 ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE));
1886 1870
1887 trace_xfs_dqreclaim_want(dqp); 1871 trace_xfs_dqreclaim_want(dqp);
1888
1889 xfs_dqunlock(dqp);
1890 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
1891 if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
1892 return NULL;
1893 XQM_STATS_INC(xqmstats.xs_qm_dqwants); 1872 XQM_STATS_INC(xqmstats.xs_qm_dqwants);
1894 goto startagain; 1873 restarts++;
1874 startagain = 1;
1875 goto dqunlock;
1895 } 1876 }
1896 1877
1897 /* 1878 /*
@@ -1906,23 +1887,20 @@ startagain:
1906 ASSERT(list_empty(&dqp->q_mplist)); 1887 ASSERT(list_empty(&dqp->q_mplist));
1907 list_del_init(&dqp->q_freelist); 1888 list_del_init(&dqp->q_freelist);
1908 xfs_Gqm->qm_dqfrlist_cnt--; 1889 xfs_Gqm->qm_dqfrlist_cnt--;
1909 xfs_dqunlock(dqp);
1910 dqpout = dqp; 1890 dqpout = dqp;
1911 XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims); 1891 XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims);
1912 break; 1892 goto dqunlock;
1913 } 1893 }
1914 1894
1915 ASSERT(dqp->q_hash); 1895 ASSERT(dqp->q_hash);
1916 ASSERT(!list_empty(&dqp->q_mplist)); 1896 ASSERT(!list_empty(&dqp->q_mplist));
1917 1897
1918 /* 1898 /*
1919 * Try to grab the flush lock. If this dquot is in the process of 1899 * Try to grab the flush lock. If this dquot is in the process
1920 * getting flushed to disk, we don't want to reclaim it. 1900 * of getting flushed to disk, we don't want to reclaim it.
1921 */ 1901 */
1922 if (!xfs_dqflock_nowait(dqp)) { 1902 if (!xfs_dqflock_nowait(dqp))
1923 xfs_dqunlock(dqp); 1903 goto dqunlock;
1924 continue;
1925 }
1926 1904
1927 /* 1905 /*
1928 * We have the flush lock so we know that this is not in the 1906 * We have the flush lock so we know that this is not in the
@@ -1941,11 +1919,10 @@ startagain:
1941 */ 1919 */
1942 error = xfs_qm_dqflush(dqp, 0); 1920 error = xfs_qm_dqflush(dqp, 0);
1943 if (error) { 1921 if (error) {
1944 xfs_fs_cmn_err(CE_WARN, mp, 1922 xfs_warn(mp, "%s: dquot %p flush failed",
1945 "xfs_qm_dqreclaim: dquot %p flush failed", dqp); 1923 __func__, dqp);
1946 } 1924 }
1947 xfs_dqunlock(dqp); /* dqflush unlocks dqflock */ 1925 goto dqunlock;
1948 continue;
1949 } 1926 }
1950 1927
1951 /* 1928 /*
@@ -1967,13 +1944,8 @@ startagain:
1967 */ 1944 */
1968 if (!mutex_trylock(&mp->m_quotainfo->qi_dqlist_lock)) { 1945 if (!mutex_trylock(&mp->m_quotainfo->qi_dqlist_lock)) {
1969 restarts++; 1946 restarts++;
1970 mutex_unlock(&dqp->q_hash->qh_lock); 1947 startagain = 1;
1971 xfs_dqfunlock(dqp); 1948 goto qhunlock;
1972 xfs_dqunlock(dqp);
1973 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
1974 if (restarts++ >= XFS_QM_RECLAIM_MAX_RESTARTS)
1975 return NULL;
1976 goto startagain;
1977 } 1949 }
1978 1950
1979 ASSERT(dqp->q_nrefs == 0); 1951 ASSERT(dqp->q_nrefs == 0);
@@ -1986,14 +1958,20 @@ startagain:
1986 xfs_Gqm->qm_dqfrlist_cnt--; 1958 xfs_Gqm->qm_dqfrlist_cnt--;
1987 dqpout = dqp; 1959 dqpout = dqp;
1988 mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock); 1960 mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock);
1961qhunlock:
1989 mutex_unlock(&dqp->q_hash->qh_lock); 1962 mutex_unlock(&dqp->q_hash->qh_lock);
1990dqfunlock: 1963dqfunlock:
1991 xfs_dqfunlock(dqp); 1964 xfs_dqfunlock(dqp);
1965dqunlock:
1992 xfs_dqunlock(dqp); 1966 xfs_dqunlock(dqp);
1993 if (dqpout) 1967 if (dqpout)
1994 break; 1968 break;
1995 if (restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) 1969 if (restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
1996 return NULL; 1970 break;
1971 if (startagain) {
1972 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
1973 goto again;
1974 }
1997 } 1975 }
1998 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); 1976 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
1999 return dqpout; 1977 return dqpout;
@@ -2119,7 +2097,7 @@ xfs_qm_write_sb_changes(
2119 int error; 2097 int error;
2120 2098
2121#ifdef QUOTADEBUG 2099#ifdef QUOTADEBUG
2122 cmn_err(CE_NOTE, "Writing superblock quota changes :%s", mp->m_fsname); 2100 xfs_notice(mp, "Writing superblock quota changes");
2123#endif 2101#endif
2124 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE); 2102 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE);
2125 if ((error = xfs_trans_reserve(tp, 0, 2103 if ((error = xfs_trans_reserve(tp, 0,
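
The xfs_qm_dqreclaim_one() rework above collapses several open-coded unlock-and-restart exits into a single ladder of unlock labels plus a startagain flag, so every exit path drops exactly the locks it still holds. A compact sketch of that unlock-ladder style with generic mutexes (nothing dquot-specific):

#include <linux/mutex.h>
#include <linux/errno.h>

struct item {
	struct mutex	a;	/* outermost lock, always taken */
	struct mutex	b;	/* only trylocked */
	struct mutex	c;	/* only trylocked, innermost */
};

static int process_item(struct item *it)
{
	int ret = -EAGAIN;

	mutex_lock(&it->a);
	if (!mutex_trylock(&it->b))
		goto out_a;
	if (!mutex_trylock(&it->c))
		goto out_b;

	/* ... all three locks held: do the real work ... */
	ret = 0;

	mutex_unlock(&it->c);
out_b:
	mutex_unlock(&it->b);
out_a:
	mutex_unlock(&it->a);
	return ret;
}

A caller that sees -EAGAIN can drop any outer list lock, optionally back off, and restart its scan, which is what the startagain flag above arranges.
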
diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h
index c9446f1c726d..567b29b9f1b3 100644
--- a/fs/xfs/quota/xfs_qm.h
+++ b/fs/xfs/quota/xfs_qm.h
@@ -65,11 +65,6 @@ extern kmem_zone_t *qm_dqtrxzone;
65 * block in the dquot/xqm code. 65 * block in the dquot/xqm code.
66 */ 66 */
67#define XFS_DQUOT_CLUSTER_SIZE_FSB (xfs_filblks_t)1 67#define XFS_DQUOT_CLUSTER_SIZE_FSB (xfs_filblks_t)1
68/*
69 * When doing a quotacheck, we log dquot clusters of this many FSBs at most
70 * in a single transaction. We don't want to ask for too huge a log reservation.
71 */
72#define XFS_QM_MAX_DQCLUSTER_LOGSZ 3
73 68
74typedef xfs_dqhash_t xfs_dqlist_t; 69typedef xfs_dqhash_t xfs_dqlist_t;
75 70
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c
index 45b5cb1788ab..a0a829addca9 100644
--- a/fs/xfs/quota/xfs_qm_bhv.c
+++ b/fs/xfs/quota/xfs_qm_bhv.c
@@ -119,8 +119,7 @@ xfs_qm_newmount(
119 (gquotaondisk && !XFS_IS_GQUOTA_ON(mp)) || 119 (gquotaondisk && !XFS_IS_GQUOTA_ON(mp)) ||
120 (!gquotaondisk && XFS_IS_OQUOTA_ON(mp))) && 120 (!gquotaondisk && XFS_IS_OQUOTA_ON(mp))) &&
121 xfs_dev_is_read_only(mp, "changing quota state")) { 121 xfs_dev_is_read_only(mp, "changing quota state")) {
122 cmn_err(CE_WARN, 122 xfs_warn(mp, "please mount with%s%s%s%s.",
123 "XFS: please mount with%s%s%s%s.",
124 (!quotaondisk ? "out quota" : ""), 123 (!quotaondisk ? "out quota" : ""),
125 (uquotaondisk ? " usrquota" : ""), 124 (uquotaondisk ? " usrquota" : ""),
126 (pquotaondisk ? " prjquota" : ""), 125 (pquotaondisk ? " prjquota" : ""),
@@ -135,7 +134,7 @@ xfs_qm_newmount(
135 */ 134 */
136 if (quotaondisk && !XFS_QM_NEED_QUOTACHECK(mp)) { 135 if (quotaondisk && !XFS_QM_NEED_QUOTACHECK(mp)) {
137 /* 136 /*
138 * If an error occured, qm_mount_quotas code 137 * If an error occurred, qm_mount_quotas code
139 * has already disabled quotas. So, just finish 138 * has already disabled quotas. So, just finish
140 * mounting, and get on with the boring life 139 * mounting, and get on with the boring life
141 * without disk quotas. 140 * without disk quotas.
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index bdebc183223e..2dadb15d5ca9 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -41,12 +41,6 @@
41#include "xfs_qm.h" 41#include "xfs_qm.h"
42#include "xfs_trace.h" 42#include "xfs_trace.h"
43 43
44#ifdef DEBUG
45# define qdprintk(s, args...) cmn_err(CE_DEBUG, s, ## args)
46#else
47# define qdprintk(s, args...) do { } while (0)
48#endif
49
50STATIC int xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint); 44STATIC int xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint);
51STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *, 45STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *,
52 uint); 46 uint);
@@ -178,7 +172,7 @@ xfs_qm_scall_quotaoff(
178 /* 172 /*
179 * Next we make the changes in the quota flag in the mount struct. 173 * Next we make the changes in the quota flag in the mount struct.
180 * This isn't protected by a particular lock directly, because we 174 * This isn't protected by a particular lock directly, because we
181 * don't want to take a mrlock everytime we depend on quotas being on. 175 * don't want to take a mrlock every time we depend on quotas being on.
182 */ 176 */
183 mp->m_qflags &= ~(flags); 177 mp->m_qflags &= ~(flags);
184 178
@@ -294,7 +288,8 @@ xfs_qm_scall_trunc_qfiles(
294 int error = 0, error2 = 0; 288 int error = 0, error2 = 0;
295 289
296 if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0) { 290 if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0) {
297 qdprintk("qtrunc flags=%x m_qflags=%x\n", flags, mp->m_qflags); 291 xfs_debug(mp, "%s: flags=%x m_qflags=%x\n",
292 __func__, flags, mp->m_qflags);
298 return XFS_ERROR(EINVAL); 293 return XFS_ERROR(EINVAL);
299 } 294 }
300 295
@@ -318,20 +313,19 @@ xfs_qm_scall_quotaon(
318{ 313{
319 int error; 314 int error;
320 uint qf; 315 uint qf;
321 uint accflags;
322 __int64_t sbflags; 316 __int64_t sbflags;
323 317
324 flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD); 318 flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD);
325 /* 319 /*
326 * Switching on quota accounting must be done at mount time. 320 * Switching on quota accounting must be done at mount time.
327 */ 321 */
328 accflags = flags & XFS_ALL_QUOTA_ACCT;
329 flags &= ~(XFS_ALL_QUOTA_ACCT); 322 flags &= ~(XFS_ALL_QUOTA_ACCT);
330 323
331 sbflags = 0; 324 sbflags = 0;
332 325
333 if (flags == 0) { 326 if (flags == 0) {
334 qdprintk("quotaon: zero flags, m_qflags=%x\n", mp->m_qflags); 327 xfs_debug(mp, "%s: zero flags, m_qflags=%x\n",
328 __func__, mp->m_qflags);
335 return XFS_ERROR(EINVAL); 329 return XFS_ERROR(EINVAL);
336 } 330 }
337 331
@@ -352,12 +346,13 @@ xfs_qm_scall_quotaon(
352 (flags & XFS_GQUOTA_ACCT) == 0 && 346 (flags & XFS_GQUOTA_ACCT) == 0 &&
353 (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 && 347 (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 &&
354 (flags & XFS_OQUOTA_ENFD))) { 348 (flags & XFS_OQUOTA_ENFD))) {
355 qdprintk("Can't enforce without acct, flags=%x sbflags=%x\n", 349 xfs_debug(mp,
356 flags, mp->m_sb.sb_qflags); 350 "%s: Can't enforce without acct, flags=%x sbflags=%x\n",
351 __func__, flags, mp->m_sb.sb_qflags);
357 return XFS_ERROR(EINVAL); 352 return XFS_ERROR(EINVAL);
358 } 353 }
359 /* 354 /*
360	 * If everything's upto-date incore, then don't waste time.	 355	 * If everything's up to date incore, then don't waste time.
361 */ 356 */
362 if ((mp->m_qflags & flags) == flags) 357 if ((mp->m_qflags & flags) == flags)
363 return XFS_ERROR(EEXIST); 358 return XFS_ERROR(EEXIST);
@@ -541,7 +536,7 @@ xfs_qm_scall_setqlim(
541 q->qi_bsoftlimit = soft; 536 q->qi_bsoftlimit = soft;
542 } 537 }
543 } else { 538 } else {
544 qdprintk("blkhard %Ld < blksoft %Ld\n", hard, soft); 539 xfs_debug(mp, "blkhard %Ld < blksoft %Ld\n", hard, soft);
545 } 540 }
546 hard = (newlim->d_fieldmask & FS_DQ_RTBHARD) ? 541 hard = (newlim->d_fieldmask & FS_DQ_RTBHARD) ?
547 (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_rtb_hardlimit) : 542 (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_rtb_hardlimit) :
@@ -557,7 +552,7 @@ xfs_qm_scall_setqlim(
557 q->qi_rtbsoftlimit = soft; 552 q->qi_rtbsoftlimit = soft;
558 } 553 }
559 } else { 554 } else {
560 qdprintk("rtbhard %Ld < rtbsoft %Ld\n", hard, soft); 555 xfs_debug(mp, "rtbhard %Ld < rtbsoft %Ld\n", hard, soft);
561 } 556 }
562 557
563 hard = (newlim->d_fieldmask & FS_DQ_IHARD) ? 558 hard = (newlim->d_fieldmask & FS_DQ_IHARD) ?
@@ -574,7 +569,7 @@ xfs_qm_scall_setqlim(
574 q->qi_isoftlimit = soft; 569 q->qi_isoftlimit = soft;
575 } 570 }
576 } else { 571 } else {
577 qdprintk("ihard %Ld < isoft %Ld\n", hard, soft); 572 xfs_debug(mp, "ihard %Ld < isoft %Ld\n", hard, soft);
578 } 573 }
579 574
580 /* 575 /*
@@ -939,10 +934,11 @@ struct mutex qcheck_lock;
939#define DQTEST_LIST_PRINT(l, NXT, title) \ 934#define DQTEST_LIST_PRINT(l, NXT, title) \
940{ \ 935{ \
941 xfs_dqtest_t *dqp; int i = 0;\ 936 xfs_dqtest_t *dqp; int i = 0;\
942 cmn_err(CE_DEBUG, "%s (#%d)", title, (int) (l)->qh_nelems); \ 937 xfs_debug(NULL, "%s (#%d)", title, (int) (l)->qh_nelems); \
943 for (dqp = (xfs_dqtest_t *)(l)->qh_next; dqp != NULL; \ 938 for (dqp = (xfs_dqtest_t *)(l)->qh_next; dqp != NULL; \
944 dqp = (xfs_dqtest_t *)dqp->NXT) { \ 939 dqp = (xfs_dqtest_t *)dqp->NXT) { \
945 cmn_err(CE_DEBUG, " %d. \"%d (%s)\" bcnt = %d, icnt = %d", \ 940 xfs_debug(dqp->q_mount, \
941 " %d. \"%d (%s)\" bcnt = %d, icnt = %d", \
946 ++i, dqp->d_id, DQFLAGTO_TYPESTR(dqp), \ 942 ++i, dqp->d_id, DQFLAGTO_TYPESTR(dqp), \
947 dqp->d_bcount, dqp->d_icount); } \ 943 dqp->d_bcount, dqp->d_icount); } \
948} 944}
@@ -966,16 +962,17 @@ xfs_qm_hashinsert(xfs_dqhash_t *h, xfs_dqtest_t *dqp)
966} 962}
967STATIC void 963STATIC void
968xfs_qm_dqtest_print( 964xfs_qm_dqtest_print(
969 xfs_dqtest_t *d) 965 struct xfs_mount *mp,
966 struct dqtest *d)
970{ 967{
971 cmn_err(CE_DEBUG, "-----------DQTEST DQUOT----------------"); 968 xfs_debug(mp, "-----------DQTEST DQUOT----------------");
972 cmn_err(CE_DEBUG, "---- dquot ID = %d", d->d_id); 969 xfs_debug(mp, "---- dquot ID = %d", d->d_id);
973 cmn_err(CE_DEBUG, "---- fs = 0x%p", d->q_mount); 970 xfs_debug(mp, "---- fs = 0x%p", d->q_mount);
974 cmn_err(CE_DEBUG, "---- bcount = %Lu (0x%x)", 971 xfs_debug(mp, "---- bcount = %Lu (0x%x)",
975 d->d_bcount, (int)d->d_bcount); 972 d->d_bcount, (int)d->d_bcount);
976 cmn_err(CE_DEBUG, "---- icount = %Lu (0x%x)", 973 xfs_debug(mp, "---- icount = %Lu (0x%x)",
977 d->d_icount, (int)d->d_icount); 974 d->d_icount, (int)d->d_icount);
978 cmn_err(CE_DEBUG, "---------------------------"); 975 xfs_debug(mp, "---------------------------");
979} 976}
980 977
981STATIC void 978STATIC void
@@ -989,12 +986,14 @@ xfs_qm_dqtest_failed(
989{ 986{
990 qmtest_nfails++; 987 qmtest_nfails++;
991 if (error) 988 if (error)
992 cmn_err(CE_DEBUG, "quotacheck failed id=%d, err=%d\nreason: %s", 989 xfs_debug(dqp->q_mount,
993 d->d_id, error, reason); 990 "quotacheck failed id=%d, err=%d\nreason: %s",
991 d->d_id, error, reason);
994 else 992 else
995 cmn_err(CE_DEBUG, "quotacheck failed id=%d (%s) [%d != %d]", 993 xfs_debug(dqp->q_mount,
996 d->d_id, reason, (int)a, (int)b); 994 "quotacheck failed id=%d (%s) [%d != %d]",
997 xfs_qm_dqtest_print(d); 995 d->d_id, reason, (int)a, (int)b);
996 xfs_qm_dqtest_print(dqp->q_mount, d);
998 if (dqp) 997 if (dqp)
999 xfs_qm_dqprint(dqp); 998 xfs_qm_dqprint(dqp);
1000} 999}
@@ -1021,9 +1020,9 @@ xfs_dqtest_cmp2(
1021 be64_to_cpu(dqp->q_core.d_bcount) >= 1020 be64_to_cpu(dqp->q_core.d_bcount) >=
1022 be64_to_cpu(dqp->q_core.d_blk_softlimit)) { 1021 be64_to_cpu(dqp->q_core.d_blk_softlimit)) {
1023 if (!dqp->q_core.d_btimer && dqp->q_core.d_id) { 1022 if (!dqp->q_core.d_btimer && dqp->q_core.d_id) {
1024 cmn_err(CE_DEBUG, 1023 xfs_debug(dqp->q_mount,
1025 "%d [%s] [0x%p] BLK TIMER NOT STARTED", 1024 "%d [%s] BLK TIMER NOT STARTED",
1026 d->d_id, DQFLAGTO_TYPESTR(d), d->q_mount); 1025 d->d_id, DQFLAGTO_TYPESTR(d));
1027 err++; 1026 err++;
1028 } 1027 }
1029 } 1028 }
@@ -1031,16 +1030,16 @@ xfs_dqtest_cmp2(
1031 be64_to_cpu(dqp->q_core.d_icount) >= 1030 be64_to_cpu(dqp->q_core.d_icount) >=
1032 be64_to_cpu(dqp->q_core.d_ino_softlimit)) { 1031 be64_to_cpu(dqp->q_core.d_ino_softlimit)) {
1033 if (!dqp->q_core.d_itimer && dqp->q_core.d_id) { 1032 if (!dqp->q_core.d_itimer && dqp->q_core.d_id) {
1034 cmn_err(CE_DEBUG, 1033 xfs_debug(dqp->q_mount,
1035 "%d [%s] [0x%p] INO TIMER NOT STARTED", 1034 "%d [%s] INO TIMER NOT STARTED",
1036 d->d_id, DQFLAGTO_TYPESTR(d), d->q_mount); 1035 d->d_id, DQFLAGTO_TYPESTR(d));
1037 err++; 1036 err++;
1038 } 1037 }
1039 } 1038 }
1040#ifdef QUOTADEBUG 1039#ifdef QUOTADEBUG
1041 if (!err) { 1040 if (!err) {
1042 cmn_err(CE_DEBUG, "%d [%s] [0x%p] qchecked", 1041 xfs_debug(dqp->q_mount, "%d [%s] qchecked",
1043 d->d_id, DQFLAGTO_TYPESTR(d), d->q_mount); 1042 d->d_id, DQFLAGTO_TYPESTR(d));
1044 } 1043 }
1045#endif 1044#endif
1046 return (err); 1045 return (err);
@@ -1137,8 +1136,8 @@ xfs_qm_internalqcheck_adjust(
1137 1136
1138 if (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino) { 1137 if (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino) {
1139 *res = BULKSTAT_RV_NOTHING; 1138 *res = BULKSTAT_RV_NOTHING;
1140 qdprintk("internalqcheck: ino=%llu, uqino=%llu, gqino=%llu\n", 1139 xfs_debug(mp, "%s: ino=%llu, uqino=%llu, gqino=%llu\n",
1141 (unsigned long long) ino, 1140 __func__, (unsigned long long) ino,
1142 (unsigned long long) mp->m_sb.sb_uquotino, 1141 (unsigned long long) mp->m_sb.sb_uquotino,
1143 (unsigned long long) mp->m_sb.sb_gquotino); 1142 (unsigned long long) mp->m_sb.sb_gquotino);
1144 return XFS_ERROR(EINVAL); 1143 return XFS_ERROR(EINVAL);
@@ -1223,12 +1222,12 @@ xfs_qm_internalqcheck(
1223 xfs_qm_internalqcheck_adjust, 1222 xfs_qm_internalqcheck_adjust,
1224 0, NULL, &done); 1223 0, NULL, &done);
1225 if (error) { 1224 if (error) {
1226 cmn_err(CE_DEBUG, "Bulkstat returned error 0x%x", error); 1225 xfs_debug(mp, "Bulkstat returned error 0x%x", error);
1227 break; 1226 break;
1228 } 1227 }
1229 } while (!done); 1228 } while (!done);
1230 1229
1231 cmn_err(CE_DEBUG, "Checking results against system dquots"); 1230 xfs_debug(mp, "Checking results against system dquots");
1232 for (i = 0; i < qmtest_hashmask; i++) { 1231 for (i = 0; i < qmtest_hashmask; i++) {
1233 xfs_dqtest_t *d, *n; 1232 xfs_dqtest_t *d, *n;
1234 xfs_dqhash_t *h; 1233 xfs_dqhash_t *h;
@@ -1246,10 +1245,10 @@ xfs_qm_internalqcheck(
1246 } 1245 }
1247 1246
1248 if (qmtest_nfails) { 1247 if (qmtest_nfails) {
1249 cmn_err(CE_DEBUG, "******** quotacheck failed ********"); 1248 xfs_debug(mp, "******** quotacheck failed ********");
1250 cmn_err(CE_DEBUG, "failures = %d", qmtest_nfails); 1249 xfs_debug(mp, "failures = %d", qmtest_nfails);
1251 } else { 1250 } else {
1252 cmn_err(CE_DEBUG, "******** quotacheck successful! ********"); 1251 xfs_debug(mp, "******** quotacheck successful! ********");
1253 } 1252 }
1254 kmem_free(qmtest_udqtab); 1253 kmem_free(qmtest_udqtab);
1255 kmem_free(qmtest_gdqtab); 1254 kmem_free(qmtest_gdqtab);
diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c
index 7de91d1b75c0..2a3648731331 100644
--- a/fs/xfs/quota/xfs_trans_dquot.c
+++ b/fs/xfs/quota/xfs_trans_dquot.c
@@ -643,8 +643,9 @@ xfs_trans_dqresv(
643 (XFS_IS_OQUOTA_ENFORCED(dqp->q_mount) && 643 (XFS_IS_OQUOTA_ENFORCED(dqp->q_mount) &&
644 (XFS_QM_ISPDQ(dqp) || XFS_QM_ISGDQ(dqp))))) { 644 (XFS_QM_ISPDQ(dqp) || XFS_QM_ISGDQ(dqp))))) {
645#ifdef QUOTADEBUG 645#ifdef QUOTADEBUG
646 cmn_err(CE_DEBUG, "BLK Res: nblks=%ld + resbcount=%Ld" 646 xfs_debug(mp,
647 " > hardlimit=%Ld?", nblks, *resbcountp, hardlimit); 647 "BLK Res: nblks=%ld + resbcount=%Ld > hardlimit=%Ld?",
648 nblks, *resbcountp, hardlimit);
648#endif 649#endif
649 if (nblks > 0) { 650 if (nblks > 0) {
650 /* 651 /*
diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c
deleted file mode 100644
index 975aa10e1a47..000000000000
--- a/fs/xfs/support/debug.c
+++ /dev/null
@@ -1,115 +0,0 @@
1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include <xfs.h>
19#include "debug.h"
20
21/* xfs_mount.h drags a lot of crap in, sorry.. */
22#include "xfs_sb.h"
23#include "xfs_inum.h"
24#include "xfs_ag.h"
25#include "xfs_mount.h"
26#include "xfs_error.h"
27
28static char message[1024]; /* keep it off the stack */
29static DEFINE_SPINLOCK(xfs_err_lock);
30
31/* Translate from CE_FOO to KERN_FOO, err_level(CE_FOO) == KERN_FOO */
32#define XFS_MAX_ERR_LEVEL 7
33#define XFS_ERR_MASK ((1 << 3) - 1)
34static const char * const err_level[XFS_MAX_ERR_LEVEL+1] =
35 {KERN_EMERG, KERN_ALERT, KERN_CRIT,
36 KERN_ERR, KERN_WARNING, KERN_NOTICE,
37 KERN_INFO, KERN_DEBUG};
38
39void
40cmn_err(register int level, char *fmt, ...)
41{
42 char *fp = fmt;
43 int len;
44 ulong flags;
45 va_list ap;
46
47 level &= XFS_ERR_MASK;
48 if (level > XFS_MAX_ERR_LEVEL)
49 level = XFS_MAX_ERR_LEVEL;
50 spin_lock_irqsave(&xfs_err_lock,flags);
51 va_start(ap, fmt);
52 if (*fmt == '!') fp++;
53 len = vsnprintf(message, sizeof(message), fp, ap);
54 if (len >= sizeof(message))
55 len = sizeof(message) - 1;
56 if (message[len-1] == '\n')
57 message[len-1] = 0;
58 printk("%s%s\n", err_level[level], message);
59 va_end(ap);
60 spin_unlock_irqrestore(&xfs_err_lock,flags);
61 BUG_ON(level == CE_PANIC);
62}
63
64void
65xfs_fs_vcmn_err(
66 int level,
67 struct xfs_mount *mp,
68 char *fmt,
69 va_list ap)
70{
71 unsigned long flags;
72 int len = 0;
73
74 level &= XFS_ERR_MASK;
75 if (level > XFS_MAX_ERR_LEVEL)
76 level = XFS_MAX_ERR_LEVEL;
77
78 spin_lock_irqsave(&xfs_err_lock,flags);
79
80 if (mp) {
81 len = sprintf(message, "Filesystem \"%s\": ", mp->m_fsname);
82
83 /*
84 * Skip the printk if we can't print anything useful
85 * due to an over-long device name.
86 */
87 if (len >= sizeof(message))
88 goto out;
89 }
90
91 len = vsnprintf(message + len, sizeof(message) - len, fmt, ap);
92 if (len >= sizeof(message))
93 len = sizeof(message) - 1;
94 if (message[len-1] == '\n')
95 message[len-1] = 0;
96
97 printk("%s%s\n", err_level[level], message);
98 out:
99 spin_unlock_irqrestore(&xfs_err_lock,flags);
100
101 BUG_ON(level == CE_PANIC);
102}
103
104void
105assfail(char *expr, char *file, int line)
106{
107 printk("Assertion failed: %s, file: %s, line: %d\n", expr, file, line);
108 BUG();
109}
110
111void
112xfs_hex_dump(void *p, int length)
113{
114 print_hex_dump(KERN_ALERT, "", DUMP_PREFIX_ADDRESS, 16, 1, p, length, 1);
115}
diff --git a/fs/xfs/support/debug.h b/fs/xfs/support/debug.h
deleted file mode 100644
index d2d20462fd4f..000000000000
--- a/fs/xfs/support/debug.h
+++ /dev/null
@@ -1,54 +0,0 @@
1/*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_SUPPORT_DEBUG_H__
19#define __XFS_SUPPORT_DEBUG_H__
20
21#include <stdarg.h>
22
23#define CE_DEBUG 7 /* debug */
24#define CE_CONT 6 /* continuation */
25#define CE_NOTE 5 /* notice */
26#define CE_WARN 4 /* warning */
27#define CE_ALERT 1 /* alert */
28#define CE_PANIC 0 /* panic */
29
30extern void cmn_err(int, char *, ...)
31 __attribute__ ((format (printf, 2, 3)));
32extern void assfail(char *expr, char *f, int l);
33
34#define ASSERT_ALWAYS(expr) \
35 (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
36
37#ifndef DEBUG
38#define ASSERT(expr) ((void)0)
39
40#ifndef STATIC
41# define STATIC static noinline
42#endif
43
44#else /* DEBUG */
45
46#define ASSERT(expr) \
47 (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
48
49#ifndef STATIC
50# define STATIC noinline
51#endif
52
53#endif /* DEBUG */
54#endif /* __XFS_SUPPORT_DEBUG_H__ */
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index 0135e2a669d7..11dd72070cbb 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -42,7 +42,7 @@ struct xfs_acl {
 #define SGI_ACL_DEFAULT_SIZE	(sizeof(SGI_ACL_DEFAULT)-1)
 
 #ifdef CONFIG_XFS_POSIX_ACL
-extern int xfs_check_acl(struct inode *inode, int mask);
+extern int xfs_check_acl(struct inode *inode, int mask, unsigned int flags);
 extern struct posix_acl *xfs_get_acl(struct inode *inode, int type);
 extern int xfs_inherit_acl(struct inode *inode, struct posix_acl *default_acl);
 extern int xfs_acl_chmod(struct inode *inode);
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 63c7a1a6c022..58632cc17f2d 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -227,7 +227,7 @@ typedef struct xfs_perag {
 
 	atomic_t	pagf_fstrms;	/* # of filestreams active in this AG */
 
-	rwlock_t	pag_ici_lock;	/* incore inode lock */
+	spinlock_t	pag_ici_lock;	/* incore inode cache lock */
 	struct radix_tree_root pag_ici_root;	/* incore inode cache root */
 	int		pag_ici_reclaimable;	/* reclaimable inodes */
 	struct mutex	pag_ici_reclaim_lock;	/* serialisation point */
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 112abc439ca5..27d64d752eab 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -41,10 +41,6 @@
 #define	XFSA_FIXUP_BNO_OK	1
 #define	XFSA_FIXUP_CNT_OK	2
 
-static int
-xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno,
-		    xfs_agblock_t bno, xfs_extlen_t len);
-
 /*
  * Prototypes for per-ag allocation routines
  */
@@ -94,7 +90,7 @@ xfs_alloc_lookup_ge(
  * Lookup the first record less than or equal to [bno, len]
  * in the btree given by cur.
  */
-STATIC int				/* error */
+int					/* error */
 xfs_alloc_lookup_le(
 	struct xfs_btree_cur	*cur,	/* btree cursor */
 	xfs_agblock_t		bno,	/* starting block of extent */
@@ -127,7 +123,7 @@ xfs_alloc_update(
 /*
  * Get the data from the pointed-to record.
  */
-STATIC int				/* error */
+int					/* error */
 xfs_alloc_get_rec(
 	struct xfs_btree_cur	*cur,	/* btree cursor */
 	xfs_agblock_t		*bno,	/* output: starting block of extent */
@@ -151,10 +147,9 @@ xfs_alloc_get_rec(
151 */ 147 */
152STATIC void 148STATIC void
153xfs_alloc_compute_aligned( 149xfs_alloc_compute_aligned(
150 xfs_alloc_arg_t *args, /* allocation argument structure */
154 xfs_agblock_t foundbno, /* starting block in found extent */ 151 xfs_agblock_t foundbno, /* starting block in found extent */
155 xfs_extlen_t foundlen, /* length in found extent */ 152 xfs_extlen_t foundlen, /* length in found extent */
156 xfs_extlen_t alignment, /* alignment for allocation */
157 xfs_extlen_t minlen, /* minimum length for allocation */
158 xfs_agblock_t *resbno, /* result block number */ 153 xfs_agblock_t *resbno, /* result block number */
159 xfs_extlen_t *reslen) /* result length */ 154 xfs_extlen_t *reslen) /* result length */
160{ 155{
@@ -162,8 +157,8 @@ xfs_alloc_compute_aligned(
162 xfs_extlen_t diff; 157 xfs_extlen_t diff;
163 xfs_extlen_t len; 158 xfs_extlen_t len;
164 159
165 if (alignment > 1 && foundlen >= minlen) { 160 if (args->alignment > 1 && foundlen >= args->minlen) {
166 bno = roundup(foundbno, alignment); 161 bno = roundup(foundbno, args->alignment);
167 diff = bno - foundbno; 162 diff = bno - foundbno;
168 len = diff >= foundlen ? 0 : foundlen - diff; 163 len = diff >= foundlen ? 0 : foundlen - diff;
169 } else { 164 } else {
@@ -468,6 +463,27 @@ xfs_alloc_read_agfl(
468 return 0; 463 return 0;
469} 464}
470 465
466STATIC int
467xfs_alloc_update_counters(
468 struct xfs_trans *tp,
469 struct xfs_perag *pag,
470 struct xfs_buf *agbp,
471 long len)
472{
473 struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
474
475 pag->pagf_freeblks += len;
476 be32_add_cpu(&agf->agf_freeblks, len);
477
478 xfs_trans_agblocks_delta(tp, len);
479 if (unlikely(be32_to_cpu(agf->agf_freeblks) >
480 be32_to_cpu(agf->agf_length)))
481 return EFSCORRUPTED;
482
483 xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS);
484 return 0;
485}
486
471/* 487/*
472 * Allocation group level functions. 488 * Allocation group level functions.
473 */ 489 */
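
The hunk above introduces xfs_alloc_update_counters(), which folds the per-AG and on-disk AGF free-block accounting into one place and reports a bad counter as EFSCORRUPTED instead of asserting. The standalone C sketch below is not kernel code: the struct names, the numeric EFSCORRUPTED value and the missing endian conversion and logging are simplifications used only to show the "apply delta, then validate against the AG length" pattern.

#include <stdint.h>
#include <stdio.h>

#define EFSCORRUPTED 990	/* placeholder for the XFS error value */

struct fake_agf   { uint32_t agf_freeblks; uint32_t agf_length; };
struct fake_perag { long pagf_freeblks; };

/* Apply the delta to both counters, then sanity-check the result. */
static int fake_alloc_update_counters(struct fake_perag *pag,
				      struct fake_agf *agf, long len)
{
	pag->pagf_freeblks += len;	/* in-core per-AG counter */
	agf->agf_freeblks += len;	/* on-disk (logged) counter */

	/* A free-block count above the AG size means the AGF is corrupt. */
	if (agf->agf_freeblks > agf->agf_length)
		return EFSCORRUPTED;
	return 0;
}

int main(void)
{
	struct fake_agf   agf = { .agf_freeblks = 100, .agf_length = 1000 };
	struct fake_perag pag = { .pagf_freeblks = 100 };

	printf("alloc 40 blocks -> %d\n",
	       fake_alloc_update_counters(&pag, &agf, -40));
	printf("free 2000 blocks -> %d (corruption detected)\n",
	       fake_alloc_update_counters(&pag, &agf, 2000));
	return 0;
}
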
@@ -509,49 +525,44 @@ xfs_alloc_ag_vextent(
509 ASSERT(0); 525 ASSERT(0);
510 /* NOTREACHED */ 526 /* NOTREACHED */
511 } 527 }
512 if (error) 528
529 if (error || args->agbno == NULLAGBLOCK)
513 return error; 530 return error;
514 /*
515 * If the allocation worked, need to change the agf structure
516 * (and log it), and the superblock.
517 */
518 if (args->agbno != NULLAGBLOCK) {
519 xfs_agf_t *agf; /* allocation group freelist header */
520 long slen = (long)args->len;
521 531
522 ASSERT(args->len >= args->minlen && args->len <= args->maxlen); 532 ASSERT(args->len >= args->minlen);
523 ASSERT(!(args->wasfromfl) || !args->isfl); 533 ASSERT(args->len <= args->maxlen);
524 ASSERT(args->agbno % args->alignment == 0); 534 ASSERT(!args->wasfromfl || !args->isfl);
525 if (!(args->wasfromfl)) { 535 ASSERT(args->agbno % args->alignment == 0);
526 536
527 agf = XFS_BUF_TO_AGF(args->agbp); 537 if (!args->wasfromfl) {
528 be32_add_cpu(&agf->agf_freeblks, -(args->len)); 538 error = xfs_alloc_update_counters(args->tp, args->pag,
529 xfs_trans_agblocks_delta(args->tp, 539 args->agbp,
530 -((long)(args->len))); 540 -((long)(args->len)));
531 args->pag->pagf_freeblks -= args->len; 541 if (error)
532 ASSERT(be32_to_cpu(agf->agf_freeblks) <= 542 return error;
533 be32_to_cpu(agf->agf_length)); 543
534 xfs_alloc_log_agf(args->tp, args->agbp, 544 /*
535 XFS_AGF_FREEBLKS); 545 * Search the busylist for these blocks and mark the
536 /* 546 * transaction as synchronous if blocks are found. This
537 * Search the busylist for these blocks and mark the 547 * avoids the need to block due to a synchronous log
538 * transaction as synchronous if blocks are found. This 548 * force to ensure correct ordering as the synchronous
539 * avoids the need to block due to a synchronous log 549 * transaction will guarantee that for us.
540 * force to ensure correct ordering as the synchronous 550 */
541 * transaction will guarantee that for us. 551 if (xfs_alloc_busy_search(args->mp, args->agno,
542 */ 552 args->agbno, args->len))
543 if (xfs_alloc_busy_search(args->mp, args->agno, 553 xfs_trans_set_sync(args->tp);
544 args->agbno, args->len))
545 xfs_trans_set_sync(args->tp);
546 }
547 if (!args->isfl)
548 xfs_trans_mod_sb(args->tp,
549 args->wasdel ? XFS_TRANS_SB_RES_FDBLOCKS :
550 XFS_TRANS_SB_FDBLOCKS, -slen);
551 XFS_STATS_INC(xs_allocx);
552 XFS_STATS_ADD(xs_allocb, args->len);
553 } 554 }
554 return 0; 555
556 if (!args->isfl) {
557 xfs_trans_mod_sb(args->tp, args->wasdel ?
558 XFS_TRANS_SB_RES_FDBLOCKS :
559 XFS_TRANS_SB_FDBLOCKS,
560 -((long)(args->len)));
561 }
562
563 XFS_STATS_INC(xs_allocx);
564 XFS_STATS_ADD(xs_allocb, args->len);
565 return error;
555} 566}
556 567
557/* 568/*
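
The reworked tail of xfs_alloc_ag_vextent() above checks the just-allocated range against the busy-extent list and makes the transaction synchronous on overlap, so reuse of recently freed blocks is ordered by the log. The userspace sketch below only illustrates that overlap test; the linear array and names are assumptions for illustration, a simplification of the kernel's per-AG busy extent tracking.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef uint32_t agblock_t;
typedef uint32_t extlen_t;

/* One "busy" extent: freed in a committed but not yet stable transaction. */
struct busy_extent { agblock_t bno; extlen_t len; };

/* Sketch of the search: does [bno, bno + len) overlap any busy extent? */
static bool busy_search(const struct busy_extent *busy, int nbusy,
			agblock_t bno, extlen_t len)
{
	for (int i = 0; i < nbusy; i++) {
		agblock_t bend = busy[i].bno + busy[i].len;
		if (bno < bend && busy[i].bno < bno + len)
			return true;
	}
	return false;
}

int main(void)
{
	struct busy_extent busy[] = { { 100, 16 }, { 400, 8 } };
	bool sync_needed = busy_search(busy, 2, 104, 4);

	/* The real code calls xfs_trans_set_sync() here so the allocating
	 * transaction cannot reach disk before the freeing one. */
	printf("overlaps busy extent: %s -> %s\n",
	       sync_needed ? "yes" : "no",
	       sync_needed ? "commit synchronously" : "async commit is fine");
	return 0;
}
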
@@ -577,61 +588,58 @@ xfs_alloc_ag_vextent_exact(
577 xfs_extlen_t rlen; /* length of returned extent */ 588 xfs_extlen_t rlen; /* length of returned extent */
578 589
579 ASSERT(args->alignment == 1); 590 ASSERT(args->alignment == 1);
591
580 /* 592 /*
581 * Allocate/initialize a cursor for the by-number freespace btree. 593 * Allocate/initialize a cursor for the by-number freespace btree.
582 */ 594 */
583 bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, 595 bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
584 args->agno, XFS_BTNUM_BNO); 596 args->agno, XFS_BTNUM_BNO);
597
585 /* 598 /*
586 * Lookup bno and minlen in the btree (minlen is irrelevant, really). 599 * Lookup bno and minlen in the btree (minlen is irrelevant, really).
587 * Look for the closest free block <= bno, it must contain bno 600 * Look for the closest free block <= bno, it must contain bno
588 * if any free block does. 601 * if any free block does.
589 */ 602 */
590 if ((error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i))) 603 error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i);
604 if (error)
591 goto error0; 605 goto error0;
592 if (!i) { 606 if (!i)
593 /* 607 goto not_found;
594 * Didn't find it, return null. 608
595 */
596 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
597 args->agbno = NULLAGBLOCK;
598 return 0;
599 }
600 /* 609 /*
601 * Grab the freespace record. 610 * Grab the freespace record.
602 */ 611 */
603 if ((error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i))) 612 error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i);
613 if (error)
604 goto error0; 614 goto error0;
605 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 615 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
606 ASSERT(fbno <= args->agbno); 616 ASSERT(fbno <= args->agbno);
607 minend = args->agbno + args->minlen; 617 minend = args->agbno + args->minlen;
608 maxend = args->agbno + args->maxlen; 618 maxend = args->agbno + args->maxlen;
609 fend = fbno + flen; 619 fend = fbno + flen;
620
610 /* 621 /*
611 * Give up if the freespace isn't long enough for the minimum request. 622 * Give up if the freespace isn't long enough for the minimum request.
612 */ 623 */
613 if (fend < minend) { 624 if (fend < minend)
614 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); 625 goto not_found;
615 args->agbno = NULLAGBLOCK; 626
616 return 0;
617 }
618 /* 627 /*
619 * End of extent will be smaller of the freespace end and the 628 * End of extent will be smaller of the freespace end and the
620 * maximal requested end. 629 * maximal requested end.
621 */ 630 *
622 end = XFS_AGBLOCK_MIN(fend, maxend);
623 /*
624 * Fix the length according to mod and prod if given. 631 * Fix the length according to mod and prod if given.
625 */ 632 */
633 end = XFS_AGBLOCK_MIN(fend, maxend);
626 args->len = end - args->agbno; 634 args->len = end - args->agbno;
627 xfs_alloc_fix_len(args); 635 xfs_alloc_fix_len(args);
628 if (!xfs_alloc_fix_minleft(args)) { 636 if (!xfs_alloc_fix_minleft(args))
629 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); 637 goto not_found;
630 return 0; 638
631 }
632 rlen = args->len; 639 rlen = args->len;
633 ASSERT(args->agbno + rlen <= fend); 640 ASSERT(args->agbno + rlen <= fend);
634 end = args->agbno + rlen; 641 end = args->agbno + rlen;
642
635 /* 643 /*
636 * We are allocating agbno for rlen [agbno .. end] 644 * We are allocating agbno for rlen [agbno .. end]
637 * Allocate/initialize a cursor for the by-size btree. 645 * Allocate/initialize a cursor for the by-size btree.
@@ -640,16 +648,25 @@ xfs_alloc_ag_vextent_exact(
640 args->agno, XFS_BTNUM_CNT); 648 args->agno, XFS_BTNUM_CNT);
641 ASSERT(args->agbno + args->len <= 649 ASSERT(args->agbno + args->len <=
642 be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); 650 be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
643 if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, 651 error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno,
644 args->agbno, args->len, XFSA_FIXUP_BNO_OK))) { 652 args->len, XFSA_FIXUP_BNO_OK);
653 if (error) {
645 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR); 654 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
646 goto error0; 655 goto error0;
647 } 656 }
657
648 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); 658 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
649 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 659 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
650 660
651 trace_xfs_alloc_exact_done(args);
652 args->wasfromfl = 0; 661 args->wasfromfl = 0;
662 trace_xfs_alloc_exact_done(args);
663 return 0;
664
665not_found:
666 /* Didn't find it, return null. */
667 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
668 args->agbno = NULLAGBLOCK;
669 trace_xfs_alloc_exact_notfound(args);
653 return 0; 670 return 0;
654 671
655error0: 672error0:
@@ -659,6 +676,94 @@ error0:
659} 676}
660 677
661/* 678/*
679 * Search the btree in a given direction via the search cursor and compare
680 * the records found against the good extent we've already found.
681 */
682STATIC int
683xfs_alloc_find_best_extent(
684 struct xfs_alloc_arg *args, /* allocation argument structure */
685 struct xfs_btree_cur **gcur, /* good cursor */
686 struct xfs_btree_cur **scur, /* searching cursor */
687 xfs_agblock_t gdiff, /* difference for search comparison */
688 xfs_agblock_t *sbno, /* extent found by search */
689 xfs_extlen_t *slen,
690 xfs_extlen_t *slena, /* aligned length */
691 int dir) /* 0 = search right, 1 = search left */
692{
693 xfs_agblock_t bno;
694 xfs_agblock_t new;
695 xfs_agblock_t sdiff;
696 int error;
697 int i;
698
699 /* The good extent is perfect, no need to search. */
700 if (!gdiff)
701 goto out_use_good;
702
703 /*
704 * Look until we find a better one, run out of space or run off the end.
705 */
706 do {
707 error = xfs_alloc_get_rec(*scur, sbno, slen, &i);
708 if (error)
709 goto error0;
710 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
711 xfs_alloc_compute_aligned(args, *sbno, *slen, &bno, slena);
712
713 /*
714 * The good extent is closer than this one.
715 */
716 if (!dir) {
717 if (bno >= args->agbno + gdiff)
718 goto out_use_good;
719 } else {
720 if (bno <= args->agbno - gdiff)
721 goto out_use_good;
722 }
723
724 /*
725 * Same distance, compare length and pick the best.
726 */
727 if (*slena >= args->minlen) {
728 args->len = XFS_EXTLEN_MIN(*slena, args->maxlen);
729 xfs_alloc_fix_len(args);
730
731 sdiff = xfs_alloc_compute_diff(args->agbno, args->len,
732 args->alignment, *sbno,
733 *slen, &new);
734
735 /*
736 * Choose closer size and invalidate other cursor.
737 */
738 if (sdiff < gdiff)
739 goto out_use_search;
740 goto out_use_good;
741 }
742
743 if (!dir)
744 error = xfs_btree_increment(*scur, 0, &i);
745 else
746 error = xfs_btree_decrement(*scur, 0, &i);
747 if (error)
748 goto error0;
749 } while (i);
750
751out_use_good:
752 xfs_btree_del_cursor(*scur, XFS_BTREE_NOERROR);
753 *scur = NULL;
754 return 0;
755
756out_use_search:
757 xfs_btree_del_cursor(*gcur, XFS_BTREE_NOERROR);
758 *gcur = NULL;
759 return 0;
760
761error0:
762 /* caller invalidates cursors */
763 return error;
764}
765
766/*
662 * Allocate a variable extent near bno in the allocation group agno. 767 * Allocate a variable extent near bno in the allocation group agno.
663 * Extent's length (returned in len) will be between minlen and maxlen, 768 * Extent's length (returned in len) will be between minlen and maxlen,
664 * and of the form k * prod + mod unless there's nothing that large. 769 * and of the form k * prod + mod unless there's nothing that large.
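
xfs_alloc_find_best_extent() above keeps searching only while a candidate could still be closer to the wanted block than the extent already in hand. The sketch below reproduces that distance comparison for a left-hand search with made-up block numbers; it deliberately omits the alignment and length trimming the kernel routine also performs, so it is an illustration, not the real function.

#include <stdint.h>
#include <stdio.h>

typedef uint32_t agblock_t;

/* Distance of a candidate start block from the wanted block. */
static agblock_t diff_from_wanted(agblock_t wanted, agblock_t candidate)
{
	return candidate > wanted ? candidate - wanted : wanted - candidate;
}

int main(void)
{
	agblock_t wanted = 1000;
	agblock_t good   = 1020;			/* extent already found, to the right */
	agblock_t gdiff  = diff_from_wanted(wanted, good);	/* 20 */

	/* Candidates to the left, in the order a btree decrement visits them. */
	agblock_t left[] = { 995, 970, 940 };

	for (size_t i = 0; i < sizeof(left) / sizeof(left[0]); i++) {
		/* Searching left: once a candidate starts at or before
		 * wanted - gdiff it can only be farther away; keep "good". */
		if (left[i] <= wanted - gdiff) {
			printf("bno %u: beyond wanted - gdiff, keep the good extent\n",
			       left[i]);
			break;
		}
		agblock_t sdiff = diff_from_wanted(wanted, left[i]);
		if (sdiff < gdiff) {
			printf("bno %u: sdiff %u < gdiff %u, use this candidate\n",
			       left[i], sdiff, gdiff);
			break;
		}
		printf("bno %u: sdiff %u >= gdiff %u, keep searching\n",
		       left[i], sdiff, gdiff);
	}
	return 0;
}
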
@@ -775,8 +880,8 @@ xfs_alloc_ag_vextent_near(
775 if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i))) 880 if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i)))
776 goto error0; 881 goto error0;
777 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 882 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
778 xfs_alloc_compute_aligned(ltbno, ltlen, args->alignment, 883 xfs_alloc_compute_aligned(args, ltbno, ltlen,
779 args->minlen, &ltbnoa, &ltlena); 884 &ltbnoa, &ltlena);
780 if (ltlena < args->minlen) 885 if (ltlena < args->minlen)
781 continue; 886 continue;
782 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); 887 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
@@ -896,8 +1001,8 @@ xfs_alloc_ag_vextent_near(
896 if ((error = xfs_alloc_get_rec(bno_cur_lt, &ltbno, &ltlen, &i))) 1001 if ((error = xfs_alloc_get_rec(bno_cur_lt, &ltbno, &ltlen, &i)))
897 goto error0; 1002 goto error0;
898 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1003 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
899 xfs_alloc_compute_aligned(ltbno, ltlen, args->alignment, 1004 xfs_alloc_compute_aligned(args, ltbno, ltlen,
900 args->minlen, &ltbnoa, &ltlena); 1005 &ltbnoa, &ltlena);
901 if (ltlena >= args->minlen) 1006 if (ltlena >= args->minlen)
902 break; 1007 break;
903 if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i))) 1008 if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i)))
@@ -912,8 +1017,8 @@ xfs_alloc_ag_vextent_near(
912 if ((error = xfs_alloc_get_rec(bno_cur_gt, &gtbno, &gtlen, &i))) 1017 if ((error = xfs_alloc_get_rec(bno_cur_gt, &gtbno, &gtlen, &i)))
913 goto error0; 1018 goto error0;
914 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1019 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
915 xfs_alloc_compute_aligned(gtbno, gtlen, args->alignment, 1020 xfs_alloc_compute_aligned(args, gtbno, gtlen,
916 args->minlen, &gtbnoa, &gtlena); 1021 &gtbnoa, &gtlena);
917 if (gtlena >= args->minlen) 1022 if (gtlena >= args->minlen)
918 break; 1023 break;
919 if ((error = xfs_btree_increment(bno_cur_gt, 0, &i))) 1024 if ((error = xfs_btree_increment(bno_cur_gt, 0, &i)))
@@ -925,203 +1030,45 @@ xfs_alloc_ag_vextent_near(
925 } 1030 }
926 } 1031 }
927 } while (bno_cur_lt || bno_cur_gt); 1032 } while (bno_cur_lt || bno_cur_gt);
1033
928 /* 1034 /*
929 * Got both cursors still active, need to find better entry. 1035 * Got both cursors still active, need to find better entry.
930 */ 1036 */
931 if (bno_cur_lt && bno_cur_gt) { 1037 if (bno_cur_lt && bno_cur_gt) {
932 /*
933 * Left side is long enough, look for a right side entry.
934 */
935 if (ltlena >= args->minlen) { 1038 if (ltlena >= args->minlen) {
936 /* 1039 /*
937 * Fix up the length. 1040 * Left side is good, look for a right side entry.
938 */ 1041 */
939 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); 1042 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
940 xfs_alloc_fix_len(args); 1043 xfs_alloc_fix_len(args);
941 rlen = args->len; 1044 ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
942 ltdiff = xfs_alloc_compute_diff(args->agbno, rlen,
943 args->alignment, ltbno, ltlen, &ltnew); 1045 args->alignment, ltbno, ltlen, &ltnew);
1046
1047 error = xfs_alloc_find_best_extent(args,
1048 &bno_cur_lt, &bno_cur_gt,
1049 ltdiff, &gtbno, &gtlen, &gtlena,
1050 0 /* search right */);
1051 } else {
1052 ASSERT(gtlena >= args->minlen);
1053
944 /* 1054 /*
945 * Not perfect. 1055 * Right side is good, look for a left side entry.
946 */
947 if (ltdiff) {
948 /*
949 * Look until we find a better one, run out of
950 * space, or run off the end.
951 */
952 while (bno_cur_lt && bno_cur_gt) {
953 if ((error = xfs_alloc_get_rec(
954 bno_cur_gt, &gtbno,
955 &gtlen, &i)))
956 goto error0;
957 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
958 xfs_alloc_compute_aligned(gtbno, gtlen,
959 args->alignment, args->minlen,
960 &gtbnoa, &gtlena);
961 /*
962 * The left one is clearly better.
963 */
964 if (gtbnoa >= args->agbno + ltdiff) {
965 xfs_btree_del_cursor(
966 bno_cur_gt,
967 XFS_BTREE_NOERROR);
968 bno_cur_gt = NULL;
969 break;
970 }
971 /*
972 * If we reach a big enough entry,
973 * compare the two and pick the best.
974 */
975 if (gtlena >= args->minlen) {
976 args->len =
977 XFS_EXTLEN_MIN(gtlena,
978 args->maxlen);
979 xfs_alloc_fix_len(args);
980 rlen = args->len;
981 gtdiff = xfs_alloc_compute_diff(
982 args->agbno, rlen,
983 args->alignment,
984 gtbno, gtlen, &gtnew);
985 /*
986 * Right side is better.
987 */
988 if (gtdiff < ltdiff) {
989 xfs_btree_del_cursor(
990 bno_cur_lt,
991 XFS_BTREE_NOERROR);
992 bno_cur_lt = NULL;
993 }
994 /*
995 * Left side is better.
996 */
997 else {
998 xfs_btree_del_cursor(
999 bno_cur_gt,
1000 XFS_BTREE_NOERROR);
1001 bno_cur_gt = NULL;
1002 }
1003 break;
1004 }
1005 /*
1006 * Fell off the right end.
1007 */
1008 if ((error = xfs_btree_increment(
1009 bno_cur_gt, 0, &i)))
1010 goto error0;
1011 if (!i) {
1012 xfs_btree_del_cursor(
1013 bno_cur_gt,
1014 XFS_BTREE_NOERROR);
1015 bno_cur_gt = NULL;
1016 break;
1017 }
1018 }
1019 }
1020 /*
1021 * The left side is perfect, trash the right side.
1022 */
1023 else {
1024 xfs_btree_del_cursor(bno_cur_gt,
1025 XFS_BTREE_NOERROR);
1026 bno_cur_gt = NULL;
1027 }
1028 }
1029 /*
1030 * It's the right side that was found first, look left.
1031 */
1032 else {
1033 /*
1034 * Fix up the length.
1035 */ 1056 */
1036 args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen); 1057 args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen);
1037 xfs_alloc_fix_len(args); 1058 xfs_alloc_fix_len(args);
1038 rlen = args->len; 1059 gtdiff = xfs_alloc_compute_diff(args->agbno, args->len,
1039 gtdiff = xfs_alloc_compute_diff(args->agbno, rlen,
1040 args->alignment, gtbno, gtlen, &gtnew); 1060 args->alignment, gtbno, gtlen, &gtnew);
1041 /* 1061
1042 * Right side entry isn't perfect. 1062 error = xfs_alloc_find_best_extent(args,
1043 */ 1063 &bno_cur_gt, &bno_cur_lt,
1044 if (gtdiff) { 1064 gtdiff, &ltbno, &ltlen, &ltlena,
1045 /* 1065 1 /* search left */);
1046 * Look until we find a better one, run out of
1047 * space, or run off the end.
1048 */
1049 while (bno_cur_lt && bno_cur_gt) {
1050 if ((error = xfs_alloc_get_rec(
1051 bno_cur_lt, &ltbno,
1052 &ltlen, &i)))
1053 goto error0;
1054 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1055 xfs_alloc_compute_aligned(ltbno, ltlen,
1056 args->alignment, args->minlen,
1057 &ltbnoa, &ltlena);
1058 /*
1059 * The right one is clearly better.
1060 */
1061 if (ltbnoa <= args->agbno - gtdiff) {
1062 xfs_btree_del_cursor(
1063 bno_cur_lt,
1064 XFS_BTREE_NOERROR);
1065 bno_cur_lt = NULL;
1066 break;
1067 }
1068 /*
1069 * If we reach a big enough entry,
1070 * compare the two and pick the best.
1071 */
1072 if (ltlena >= args->minlen) {
1073 args->len = XFS_EXTLEN_MIN(
1074 ltlena, args->maxlen);
1075 xfs_alloc_fix_len(args);
1076 rlen = args->len;
1077 ltdiff = xfs_alloc_compute_diff(
1078 args->agbno, rlen,
1079 args->alignment,
1080 ltbno, ltlen, &ltnew);
1081 /*
1082 * Left side is better.
1083 */
1084 if (ltdiff < gtdiff) {
1085 xfs_btree_del_cursor(
1086 bno_cur_gt,
1087 XFS_BTREE_NOERROR);
1088 bno_cur_gt = NULL;
1089 }
1090 /*
1091 * Right side is better.
1092 */
1093 else {
1094 xfs_btree_del_cursor(
1095 bno_cur_lt,
1096 XFS_BTREE_NOERROR);
1097 bno_cur_lt = NULL;
1098 }
1099 break;
1100 }
1101 /*
1102 * Fell off the left end.
1103 */
1104 if ((error = xfs_btree_decrement(
1105 bno_cur_lt, 0, &i)))
1106 goto error0;
1107 if (!i) {
1108 xfs_btree_del_cursor(bno_cur_lt,
1109 XFS_BTREE_NOERROR);
1110 bno_cur_lt = NULL;
1111 break;
1112 }
1113 }
1114 }
1115 /*
1116 * The right side is perfect, trash the left side.
1117 */
1118 else {
1119 xfs_btree_del_cursor(bno_cur_lt,
1120 XFS_BTREE_NOERROR);
1121 bno_cur_lt = NULL;
1122 }
1123 } 1066 }
1067
1068 if (error)
1069 goto error0;
1124 } 1070 }
1071
1125 /* 1072 /*
1126 * If we couldn't get anything, give up. 1073 * If we couldn't get anything, give up.
1127 */ 1074 */
@@ -1130,6 +1077,7 @@ xfs_alloc_ag_vextent_near(
1130 args->agbno = NULLAGBLOCK; 1077 args->agbno = NULLAGBLOCK;
1131 return 0; 1078 return 0;
1132 } 1079 }
1080
1133 /* 1081 /*
1134 * At this point we have selected a freespace entry, either to the 1082 * At this point we have selected a freespace entry, either to the
1135 * left or to the right. If it's on the right, copy all the 1083 * left or to the right. If it's on the right, copy all the
@@ -1146,6 +1094,7 @@ xfs_alloc_ag_vextent_near(
1146 j = 1; 1094 j = 1;
1147 } else 1095 } else
1148 j = 0; 1096 j = 0;
1097
1149 /* 1098 /*
1150 * Fix up the length and compute the useful address. 1099 * Fix up the length and compute the useful address.
1151 */ 1100 */
@@ -1248,8 +1197,7 @@ xfs_alloc_ag_vextent_size(
1248 * once aligned; if not, we search left for something better. 1197 * once aligned; if not, we search left for something better.
1249 * This can't happen in the second case above. 1198 * This can't happen in the second case above.
1250 */ 1199 */
1251 xfs_alloc_compute_aligned(fbno, flen, args->alignment, args->minlen, 1200 xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen);
1252 &rbno, &rlen);
1253 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen); 1201 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
1254 XFS_WANT_CORRUPTED_GOTO(rlen == 0 || 1202 XFS_WANT_CORRUPTED_GOTO(rlen == 0 ||
1255 (rlen <= flen && rbno + rlen <= fbno + flen), error0); 1203 (rlen <= flen && rbno + rlen <= fbno + flen), error0);
@@ -1274,8 +1222,8 @@ xfs_alloc_ag_vextent_size(
1274 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1222 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1275 if (flen < bestrlen) 1223 if (flen < bestrlen)
1276 break; 1224 break;
1277 xfs_alloc_compute_aligned(fbno, flen, args->alignment, 1225 xfs_alloc_compute_aligned(args, fbno, flen,
1278 args->minlen, &rbno, &rlen); 1226 &rbno, &rlen);
1279 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen); 1227 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
1280 XFS_WANT_CORRUPTED_GOTO(rlen == 0 || 1228 XFS_WANT_CORRUPTED_GOTO(rlen == 0 ||
1281 (rlen <= flen && rbno + rlen <= fbno + flen), 1229 (rlen <= flen && rbno + rlen <= fbno + flen),
@@ -1453,6 +1401,7 @@ xfs_free_ag_extent(
1453 xfs_mount_t *mp; /* mount point struct for filesystem */ 1401 xfs_mount_t *mp; /* mount point struct for filesystem */
1454 xfs_agblock_t nbno; /* new starting block of freespace */ 1402 xfs_agblock_t nbno; /* new starting block of freespace */
1455 xfs_extlen_t nlen; /* new length of freespace */ 1403 xfs_extlen_t nlen; /* new length of freespace */
1404 xfs_perag_t *pag; /* per allocation group data */
1456 1405
1457 mp = tp->t_mountp; 1406 mp = tp->t_mountp;
1458 /* 1407 /*
@@ -1651,30 +1600,20 @@ xfs_free_ag_extent(
1651 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1600 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1652 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 1601 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1653 cnt_cur = NULL; 1602 cnt_cur = NULL;
1603
1654 /* 1604 /*
1655 * Update the freespace totals in the ag and superblock. 1605 * Update the freespace totals in the ag and superblock.
1656 */ 1606 */
1657 { 1607 pag = xfs_perag_get(mp, agno);
1658 xfs_agf_t *agf; 1608 error = xfs_alloc_update_counters(tp, pag, agbp, len);
1659 xfs_perag_t *pag; /* per allocation group data */ 1609 xfs_perag_put(pag);
1660 1610 if (error)
1661 pag = xfs_perag_get(mp, agno); 1611 goto error0;
1662 pag->pagf_freeblks += len; 1612
1663 xfs_perag_put(pag); 1613 if (!isfl)
1664 1614 xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (long)len);
1665 agf = XFS_BUF_TO_AGF(agbp); 1615 XFS_STATS_INC(xs_freex);
1666 be32_add_cpu(&agf->agf_freeblks, len); 1616 XFS_STATS_ADD(xs_freeb, len);
1667 xfs_trans_agblocks_delta(tp, len);
1668 XFS_WANT_CORRUPTED_GOTO(
1669 be32_to_cpu(agf->agf_freeblks) <=
1670 be32_to_cpu(agf->agf_length),
1671 error0);
1672 xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS);
1673 if (!isfl)
1674 xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (long)len);
1675 XFS_STATS_INC(xs_freex);
1676 XFS_STATS_ADD(xs_freeb, len);
1677 }
1678 1617
1679 trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright); 1618 trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright);
1680 1619
@@ -2456,17 +2395,33 @@ xfs_free_extent(
2456 memset(&args, 0, sizeof(xfs_alloc_arg_t)); 2395 memset(&args, 0, sizeof(xfs_alloc_arg_t));
2457 args.tp = tp; 2396 args.tp = tp;
2458 args.mp = tp->t_mountp; 2397 args.mp = tp->t_mountp;
2398
2399 /*
 2400 * validate that the block number is legal - this enables us to detect
2401 * and handle a silent filesystem corruption rather than crashing.
2402 */
2459 args.agno = XFS_FSB_TO_AGNO(args.mp, bno); 2403 args.agno = XFS_FSB_TO_AGNO(args.mp, bno);
2460 ASSERT(args.agno < args.mp->m_sb.sb_agcount); 2404 if (args.agno >= args.mp->m_sb.sb_agcount)
2405 return EFSCORRUPTED;
2406
2461 args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno); 2407 args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno);
2408 if (args.agbno >= args.mp->m_sb.sb_agblocks)
2409 return EFSCORRUPTED;
2410
2462 args.pag = xfs_perag_get(args.mp, args.agno); 2411 args.pag = xfs_perag_get(args.mp, args.agno);
2463 if ((error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING))) 2412 ASSERT(args.pag);
2413
2414 error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING);
2415 if (error)
2464 goto error0; 2416 goto error0;
2465#ifdef DEBUG 2417
2466 ASSERT(args.agbp != NULL); 2418 /* validate the extent size is legal now we have the agf locked */
2467 ASSERT((args.agbno + len) <= 2419 if (args.agbno + len >
2468 be32_to_cpu(XFS_BUF_TO_AGF(args.agbp)->agf_length)); 2420 be32_to_cpu(XFS_BUF_TO_AGF(args.agbp)->agf_length)) {
2469#endif 2421 error = EFSCORRUPTED;
2422 goto error0;
2423 }
2424
2470 error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0); 2425 error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0);
2471error0: 2426error0:
2472 xfs_perag_put(args.pag); 2427 xfs_perag_put(args.pag);
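
The xfs_free_extent() change above replaces DEBUG-only ASSERTs with range checks so a corrupt block number is reported as EFSCORRUPTED instead of crashing. A hedged standalone sketch of that validation follows; the geometry struct, the agblklog-based split and the numeric EFSCORRUPTED value are illustrative assumptions standing in for the real XFS_FSB_TO_AGNO/XFS_FSB_TO_AGBNO macros and the AGF-length check done once the AGF is locked.

#include <stdint.h>
#include <stdio.h>

#define EFSCORRUPTED 990	/* placeholder for the XFS error value */

/* Hypothetical geometry: the low agblklog bits of a filesystem block number
 * are the block-within-AG, the remaining bits are the AG number. */
struct geom { uint32_t agcount; uint32_t agblocks; uint32_t agblklog; };

static int check_free_extent(const struct geom *g, uint64_t fsbno, uint32_t len)
{
	uint32_t agno  = (uint32_t)(fsbno >> g->agblklog);
	uint32_t agbno = (uint32_t)(fsbno & ((1ULL << g->agblklog) - 1));

	/* Silent corruption (e.g. a bad btree pointer) shows up as an
	 * out-of-range AG or block number: report it instead of crashing. */
	if (agno >= g->agcount)
		return EFSCORRUPTED;
	if (agbno >= g->agblocks)
		return EFSCORRUPTED;
	/* Approximate the locked AGF length check with sb geometry here. */
	if (agbno + len > g->agblocks)
		return EFSCORRUPTED;
	return 0;
}

int main(void)
{
	struct geom g = { .agcount = 4, .agblocks = 4096, .agblklog = 12 };

	printf("valid extent    -> %d\n", check_free_extent(&g, (2ULL << 12) + 100, 16));
	printf("bad AG number   -> %d\n", check_free_extent(&g, (9ULL << 12) + 100, 16));
	printf("runs off AG end -> %d\n", check_free_extent(&g, (1ULL << 12) + 4090, 16));
	return 0;
}
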
@@ -2676,7 +2631,7 @@ restart:
2676 * will require a synchronous transaction, but it can still be 2631 * will require a synchronous transaction, but it can still be
2677 * used to distinguish between a partial or exact match. 2632 * used to distinguish between a partial or exact match.
2678 */ 2633 */
2679static int 2634int
2680xfs_alloc_busy_search( 2635xfs_alloc_busy_search(
2681 struct xfs_mount *mp, 2636 struct xfs_mount *mp,
2682 xfs_agnumber_t agno, 2637 xfs_agnumber_t agno,
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 895009a97271..d0b3bc72005b 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -19,6 +19,7 @@
19#define __XFS_ALLOC_H__ 19#define __XFS_ALLOC_H__
20 20
21struct xfs_buf; 21struct xfs_buf;
22struct xfs_btree_cur;
22struct xfs_mount; 23struct xfs_mount;
23struct xfs_perag; 24struct xfs_perag;
24struct xfs_trans; 25struct xfs_trans;
@@ -74,6 +75,22 @@ typedef unsigned int xfs_alloctype_t;
74#define XFS_ALLOC_SET_ASIDE(mp) (4 + ((mp)->m_sb.sb_agcount * 4)) 75#define XFS_ALLOC_SET_ASIDE(mp) (4 + ((mp)->m_sb.sb_agcount * 4))
75 76
76/* 77/*
78 * When deciding how much space to allocate out of an AG, we limit the
 79 * allocation maximum size to the size of the AG. However, we cannot use all the
80 * blocks in the AG - some are permanently used by metadata. These
81 * blocks are generally:
82 * - the AG superblock, AGF, AGI and AGFL
83 * - the AGF (bno and cnt) and AGI btree root blocks
84 * - 4 blocks on the AGFL according to XFS_ALLOC_SET_ASIDE() limits
85 *
86 * The AG headers are sector sized, so the amount of space they take up is
87 * dependent on filesystem geometry. The others are all single blocks.
88 */
89#define XFS_ALLOC_AG_MAX_USABLE(mp) \
90 ((mp)->m_sb.sb_agblocks - XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)) - 7)
91
92
93/*
77 * Argument structure for xfs_alloc routines. 94 * Argument structure for xfs_alloc routines.
78 * This is turned into a structure to avoid having 20 arguments passed 95 * This is turned into a structure to avoid having 20 arguments passed
79 * down several levels of the stack. 96 * down several levels of the stack.
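
To make the new XFS_ALLOC_AG_MAX_USABLE() limit concrete, the sketch below evaluates the same idea for a hypothetical geometry (512-byte sectors, 4 KiB blocks, a 1 GiB AG); with those assumptions the four sector-sized headers round up to one block, so eight blocks per AG are set aside. This is an illustration of the comment above, not the kernel macro itself.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical geometry: 4096-byte filesystem blocks, 512-byte sectors. */
#define BLOCK_SIZE	4096u
#define SECTOR_SIZE	512u

/* Rough equivalent of XFS_ALLOC_AG_MAX_USABLE(): take the AG size, subtract
 * the space the four sector-sized AG headers occupy (rounded up to whole
 * filesystem blocks) and the 7 single-block structures (btree roots plus
 * the AGFL blocks that XFS_ALLOC_SET_ASIDE() reserves). */
static uint32_t ag_max_usable(uint32_t agblocks)
{
	uint32_t hdr_bytes  = 4 * SECTOR_SIZE;	/* sb, AGF, AGI, AGFL */
	uint32_t hdr_blocks = (hdr_bytes + BLOCK_SIZE - 1) / BLOCK_SIZE;

	return agblocks - hdr_blocks - 7;
}

int main(void)
{
	/* A 1 GiB allocation group at 4 KiB blocks has 262144 blocks. */
	printf("max usable blocks: %u of %u\n", ag_max_usable(262144), 262144);
	return 0;
}
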
@@ -118,16 +135,16 @@ xfs_alloc_longest_free_extent(struct xfs_mount *mp,
118 struct xfs_perag *pag); 135 struct xfs_perag *pag);
119 136
120#ifdef __KERNEL__ 137#ifdef __KERNEL__
121
122void 138void
123xfs_alloc_busy_insert(xfs_trans_t *tp, 139xfs_alloc_busy_insert(struct xfs_trans *tp, xfs_agnumber_t agno,
124 xfs_agnumber_t agno, 140 xfs_agblock_t bno, xfs_extlen_t len);
125 xfs_agblock_t bno,
126 xfs_extlen_t len);
127 141
128void 142void
129xfs_alloc_busy_clear(struct xfs_mount *mp, struct xfs_busy_extent *busyp); 143xfs_alloc_busy_clear(struct xfs_mount *mp, struct xfs_busy_extent *busyp);
130 144
145int
146xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno,
147 xfs_agblock_t bno, xfs_extlen_t len);
131#endif /* __KERNEL__ */ 148#endif /* __KERNEL__ */
132 149
133/* 150/*
@@ -205,4 +222,18 @@ xfs_free_extent(
205 xfs_fsblock_t bno, /* starting block number of extent */ 222 xfs_fsblock_t bno, /* starting block number of extent */
206 xfs_extlen_t len); /* length of extent */ 223 xfs_extlen_t len); /* length of extent */
207 224
225int /* error */
226xfs_alloc_lookup_le(
227 struct xfs_btree_cur *cur, /* btree cursor */
228 xfs_agblock_t bno, /* starting block of extent */
229 xfs_extlen_t len, /* length of extent */
230 int *stat); /* success/failure */
231
232int /* error */
233xfs_alloc_get_rec(
234 struct xfs_btree_cur *cur, /* btree cursor */
235 xfs_agblock_t *bno, /* output: starting block of extent */
236 xfs_extlen_t *len, /* output: length of extent */
237 int *stat); /* output: success/failure */
238
208#endif /* __XFS_ALLOC_H__ */ 239#endif /* __XFS_ALLOC_H__ */
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index a6cff8edcdb6..71e90dc2aeb1 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -637,7 +637,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
637 * It didn't all fit, so we have to sort everything on hashval. 637 * It didn't all fit, so we have to sort everything on hashval.
638 */ 638 */
639 sbsize = sf->hdr.count * sizeof(*sbuf); 639 sbsize = sf->hdr.count * sizeof(*sbuf);
640 sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP); 640 sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP | KM_NOFS);
641 641
642 /* 642 /*
643 * Scan the attribute list for the rest of the entries, storing 643 * Scan the attribute list for the rest of the entries, storing
@@ -2386,7 +2386,7 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context)
2386 args.dp = context->dp; 2386 args.dp = context->dp;
2387 args.whichfork = XFS_ATTR_FORK; 2387 args.whichfork = XFS_ATTR_FORK;
2388 args.valuelen = valuelen; 2388 args.valuelen = valuelen;
2389 args.value = kmem_alloc(valuelen, KM_SLEEP); 2389 args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS);
2390 args.rmtblkno = be32_to_cpu(name_rmt->valueblk); 2390 args.rmtblkno = be32_to_cpu(name_rmt->valueblk);
2391 args.rmtblkcnt = XFS_B_TO_FSB(args.dp->i_mount, valuelen); 2391 args.rmtblkcnt = XFS_B_TO_FSB(args.dp->i_mount, valuelen);
2392 retval = xfs_attr_rmtval_get(&args); 2392 retval = xfs_attr_rmtval_get(&args);
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 4111cd3966c7..fa00788de2f5 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -1038,17 +1038,34 @@ xfs_bmap_add_extent_delay_real(
1038 * Filling in the middle part of a previous delayed allocation. 1038 * Filling in the middle part of a previous delayed allocation.
1039 * Contiguity is impossible here. 1039 * Contiguity is impossible here.
1040 * This case is avoided almost all the time. 1040 * This case is avoided almost all the time.
1041 *
1042 * We start with a delayed allocation:
1043 *
1044 * +ddddddddddddddddddddddddddddddddddddddddddddddddddddddd+
1045 * PREV @ idx
1046 *
1047 * and we are allocating:
1048 * +rrrrrrrrrrrrrrrrr+
1049 * new
1050 *
1051 * and we set it up for insertion as:
1052 * +ddddddddddddddddddd+rrrrrrrrrrrrrrrrr+ddddddddddddddddd+
1053 * new
1054 * PREV @ idx LEFT RIGHT
1055 * inserted at idx + 1
1041 */ 1056 */
1042 temp = new->br_startoff - PREV.br_startoff; 1057 temp = new->br_startoff - PREV.br_startoff;
1043 trace_xfs_bmap_pre_update(ip, idx, 0, _THIS_IP_);
1044 xfs_bmbt_set_blockcount(ep, temp);
1045 r[0] = *new;
1046 r[1].br_state = PREV.br_state;
1047 r[1].br_startblock = 0;
1048 r[1].br_startoff = new_endoff;
1049 temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff; 1058 temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff;
1050 r[1].br_blockcount = temp2; 1059 trace_xfs_bmap_pre_update(ip, idx, 0, _THIS_IP_);
1051 xfs_iext_insert(ip, idx + 1, 2, &r[0], state); 1060 xfs_bmbt_set_blockcount(ep, temp); /* truncate PREV */
1061 LEFT = *new;
1062 RIGHT.br_state = PREV.br_state;
1063 RIGHT.br_startblock = nullstartblock(
1064 (int)xfs_bmap_worst_indlen(ip, temp2));
1065 RIGHT.br_startoff = new_endoff;
1066 RIGHT.br_blockcount = temp2;
1067 /* insert LEFT (r[0]) and RIGHT (r[1]) at the same time */
1068 xfs_iext_insert(ip, idx + 1, 2, &LEFT, state);
1052 ip->i_df.if_lastex = idx + 1; 1069 ip->i_df.if_lastex = idx + 1;
1053 ip->i_d.di_nextents++; 1070 ip->i_d.di_nextents++;
1054 if (cur == NULL) 1071 if (cur == NULL)
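
The diagram and code above split a delayed extent into a left delayed piece, the newly allocated real extent, and a right delayed piece. The short sketch below computes the same temp/temp2 offsets and lengths for made-up numbers so the three-way split is easy to follow; it deliberately omits the indirect-block reservation (nullstartblock/xfs_bmap_worst_indlen) that the real code attaches to the right piece.

#include <stdint.h>
#include <stdio.h>

typedef uint64_t fileoff_t;
typedef uint64_t filblks_t;

struct ext { fileoff_t startoff; filblks_t blockcount; };

int main(void)
{
	/* A delayed extent PREV and a real allocation landing in its middle,
	 * as in the diagram above (all numbers are made up). */
	struct ext PREV   = { .startoff = 100, .blockcount = 50 };
	struct ext newext = { .startoff = 120, .blockcount = 10 };

	fileoff_t new_endoff = newext.startoff + newext.blockcount;

	/* temp: what is left of PREV in front of the new allocation;
	 * PREV is truncated to this length and becomes the left piece. */
	filblks_t temp  = newext.startoff - PREV.startoff;

	/* temp2: what is left of PREV behind the new allocation; it becomes
	 * the right piece and stays delayed. */
	filblks_t temp2 = PREV.startoff + PREV.blockcount - new_endoff;

	printf("left  (delayed): off=%llu len=%llu\n",
	       (unsigned long long)PREV.startoff, (unsigned long long)temp);
	printf("new   (real)   : off=%llu len=%llu\n",
	       (unsigned long long)newext.startoff,
	       (unsigned long long)newext.blockcount);
	printf("right (delayed): off=%llu len=%llu\n",
	       (unsigned long long)new_endoff, (unsigned long long)temp2);
	return 0;
}
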
@@ -2348,6 +2365,13 @@ xfs_bmap_rtalloc(
2348 */ 2365 */
2349 if (ralen * mp->m_sb.sb_rextsize >= MAXEXTLEN) 2366 if (ralen * mp->m_sb.sb_rextsize >= MAXEXTLEN)
2350 ralen = MAXEXTLEN / mp->m_sb.sb_rextsize; 2367 ralen = MAXEXTLEN / mp->m_sb.sb_rextsize;
2368
2369 /*
2370 * Lock out other modifications to the RT bitmap inode.
2371 */
2372 xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
2373 xfs_trans_ijoin_ref(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL);
2374
2351 /* 2375 /*
2352 * If it's an allocation to an empty file at offset 0, 2376 * If it's an allocation to an empty file at offset 0,
2353 * pick an extent that will space things out in the rt area. 2377 * pick an extent that will space things out in the rt area.
@@ -2430,7 +2454,7 @@ xfs_bmap_btalloc_nullfb(
2430 startag = ag = 0; 2454 startag = ag = 0;
2431 2455
2432 pag = xfs_perag_get(mp, ag); 2456 pag = xfs_perag_get(mp, ag);
2433 while (*blen < ap->alen) { 2457 while (*blen < args->maxlen) {
2434 if (!pag->pagf_init) { 2458 if (!pag->pagf_init) {
2435 error = xfs_alloc_pagf_init(mp, args->tp, ag, 2459 error = xfs_alloc_pagf_init(mp, args->tp, ag,
2436 XFS_ALLOC_FLAG_TRYLOCK); 2460 XFS_ALLOC_FLAG_TRYLOCK);
@@ -2452,7 +2476,7 @@ xfs_bmap_btalloc_nullfb(
2452 notinit = 1; 2476 notinit = 1;
2453 2477
2454 if (xfs_inode_is_filestream(ap->ip)) { 2478 if (xfs_inode_is_filestream(ap->ip)) {
2455 if (*blen >= ap->alen) 2479 if (*blen >= args->maxlen)
2456 break; 2480 break;
2457 2481
2458 if (ap->userdata) { 2482 if (ap->userdata) {
@@ -2498,14 +2522,14 @@ xfs_bmap_btalloc_nullfb(
2498 * If the best seen length is less than the request 2522 * If the best seen length is less than the request
2499 * length, use the best as the minimum. 2523 * length, use the best as the minimum.
2500 */ 2524 */
2501 else if (*blen < ap->alen) 2525 else if (*blen < args->maxlen)
2502 args->minlen = *blen; 2526 args->minlen = *blen;
2503 /* 2527 /*
2504 * Otherwise we've seen an extent as big as alen, 2528 * Otherwise we've seen an extent as big as maxlen,
2505 * use that as the minimum. 2529 * use that as the minimum.
2506 */ 2530 */
2507 else 2531 else
2508 args->minlen = ap->alen; 2532 args->minlen = args->maxlen;
2509 2533
2510 /* 2534 /*
2511 * set the failure fallback case to look in the selected 2535 * set the failure fallback case to look in the selected
@@ -2573,7 +2597,9 @@ xfs_bmap_btalloc(
2573 args.tp = ap->tp; 2597 args.tp = ap->tp;
2574 args.mp = mp; 2598 args.mp = mp;
2575 args.fsbno = ap->rval; 2599 args.fsbno = ap->rval;
2576 args.maxlen = MIN(ap->alen, mp->m_sb.sb_agblocks); 2600
2601 /* Trim the allocation back to the maximum an AG can fit. */
2602 args.maxlen = MIN(ap->alen, XFS_ALLOC_AG_MAX_USABLE(mp));
2577 args.firstblock = ap->firstblock; 2603 args.firstblock = ap->firstblock;
2578 blen = 0; 2604 blen = 0;
2579 if (nullfb) { 2605 if (nullfb) {
@@ -2621,7 +2647,7 @@ xfs_bmap_btalloc(
2621 /* 2647 /*
2622 * Adjust for alignment 2648 * Adjust for alignment
2623 */ 2649 */
2624 if (blen > args.alignment && blen <= ap->alen) 2650 if (blen > args.alignment && blen <= args.maxlen)
2625 args.minlen = blen - args.alignment; 2651 args.minlen = blen - args.alignment;
2626 args.minalignslop = 0; 2652 args.minalignslop = 0;
2627 } else { 2653 } else {
@@ -2640,7 +2666,7 @@ xfs_bmap_btalloc(
2640 * of minlen+alignment+slop doesn't go up 2666 * of minlen+alignment+slop doesn't go up
2641 * between the calls. 2667 * between the calls.
2642 */ 2668 */
2643 if (blen > mp->m_dalign && blen <= ap->alen) 2669 if (blen > mp->m_dalign && blen <= args.maxlen)
2644 nextminlen = blen - mp->m_dalign; 2670 nextminlen = blen - mp->m_dalign;
2645 else 2671 else
2646 nextminlen = args.minlen; 2672 nextminlen = args.minlen;
@@ -3500,7 +3526,7 @@ xfs_bmap_search_extents(
3500 3526
3501 if (unlikely(!(gotp->br_startblock) && (*lastxp != NULLEXTNUM) && 3527 if (unlikely(!(gotp->br_startblock) && (*lastxp != NULLEXTNUM) &&
3502 !(XFS_IS_REALTIME_INODE(ip) && fork == XFS_DATA_FORK))) { 3528 !(XFS_IS_REALTIME_INODE(ip) && fork == XFS_DATA_FORK))) {
3503 xfs_cmn_err(XFS_PTAG_FSBLOCK_ZERO, CE_ALERT, ip->i_mount, 3529 xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
3504 "Access to block zero in inode %llu " 3530 "Access to block zero in inode %llu "
3505 "start_block: %llx start_off: %llx " 3531 "start_block: %llx start_off: %llx "
3506 "blkcnt: %llx extent-state: %x lastx: %x\n", 3532 "blkcnt: %llx extent-state: %x lastx: %x\n",
@@ -4174,12 +4200,11 @@ xfs_bmap_read_extents(
4174 num_recs = xfs_btree_get_numrecs(block); 4200 num_recs = xfs_btree_get_numrecs(block);
4175 if (unlikely(i + num_recs > room)) { 4201 if (unlikely(i + num_recs > room)) {
4176 ASSERT(i + num_recs <= room); 4202 ASSERT(i + num_recs <= room);
4177 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 4203 xfs_warn(ip->i_mount,
4178 "corrupt dinode %Lu, (btree extents).", 4204 "corrupt dinode %Lu, (btree extents).",
4179 (unsigned long long) ip->i_ino); 4205 (unsigned long long) ip->i_ino);
4180 XFS_ERROR_REPORT("xfs_bmap_read_extents(1)", 4206 XFS_CORRUPTION_ERROR("xfs_bmap_read_extents(1)",
4181 XFS_ERRLEVEL_LOW, 4207 XFS_ERRLEVEL_LOW, ip->i_mount, block);
4182 ip->i_mount);
4183 goto error0; 4208 goto error0;
4184 } 4209 }
4185 XFS_WANT_CORRUPTED_GOTO( 4210 XFS_WANT_CORRUPTED_GOTO(
@@ -4485,6 +4510,16 @@ xfs_bmapi(
4485 /* Figure out the extent size, adjust alen */ 4510 /* Figure out the extent size, adjust alen */
4486 extsz = xfs_get_extsz_hint(ip); 4511 extsz = xfs_get_extsz_hint(ip);
4487 if (extsz) { 4512 if (extsz) {
4513 /*
4514 * make sure we don't exceed a single
4515 * extent length when we align the
4516 * extent by reducing length we are
4517 * going to allocate by the maximum
4518 * amount extent size aligment may
4519 * require.
4520 */
4521 alen = XFS_FILBLKS_MIN(len,
4522 MAXEXTLEN - (2 * extsz - 1));
4488 error = xfs_bmap_extsize_align(mp, 4523 error = xfs_bmap_extsize_align(mp,
4489 &got, &prev, extsz, 4524 &got, &prev, extsz,
4490 rt, eof, 4525 rt, eof,
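
The new clamp above, alen = min(len, MAXEXTLEN - (2 * extsz - 1)), keeps the request within a single extent even after extent-size alignment grows it at both ends. The sketch below checks that worst case for an assumed extent size hint; MAXEXTLEN is the 21-bit XFS extent length limit, the other numbers are made up for illustration.

#include <stdint.h>
#include <stdio.h>

#define MAXEXTLEN	((1u << 21) - 1)	/* max blocks in one XFS extent */

int main(void)
{
	uint32_t extsz = 4096;		/* hypothetical extent size hint, in blocks */
	uint64_t len   = 10u << 20;	/* requested length, larger than MAXEXTLEN */

	/* Clamp the request so that even worst-case alignment growth
	 * (up to extsz - 1 blocks at the start and again at the end)
	 * cannot push the aligned extent past MAXEXTLEN. */
	uint64_t alen = len;
	if (alen > MAXEXTLEN - (2 * extsz - 1))
		alen = MAXEXTLEN - (2 * extsz - 1);

	/* Worst-case length after xfs_bmap_extsize_align()-style rounding. */
	uint64_t worst = alen + 2 * (extsz - 1);

	printf("alen=%llu worst-case aligned=%llu (MAXEXTLEN=%u)\n",
	       (unsigned long long)alen, (unsigned long long)worst, MAXEXTLEN);
	return 0;
}
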
@@ -5743,7 +5778,7 @@ xfs_check_block(
5743 else 5778 else
5744 thispa = XFS_BMBT_PTR_ADDR(mp, block, j, dmxr); 5779 thispa = XFS_BMBT_PTR_ADDR(mp, block, j, dmxr);
5745 if (*thispa == *pp) { 5780 if (*thispa == *pp) {
5746 cmn_err(CE_WARN, "%s: thispa(%d) == pp(%d) %Ld", 5781 xfs_warn(mp, "%s: thispa(%d) == pp(%d) %Ld",
5747 __func__, j, i, 5782 __func__, j, i,
5748 (unsigned long long)be64_to_cpu(*thispa)); 5783 (unsigned long long)be64_to_cpu(*thispa));
5749 panic("%s: ptrs are equal in node\n", 5784 panic("%s: ptrs are equal in node\n",
@@ -5908,11 +5943,11 @@ xfs_bmap_check_leaf_extents(
5908 return; 5943 return;
5909 5944
5910error0: 5945error0:
5911 cmn_err(CE_WARN, "%s: at error0", __func__); 5946 xfs_warn(mp, "%s: at error0", __func__);
5912 if (bp_release) 5947 if (bp_release)
5913 xfs_trans_brelse(NULL, bp); 5948 xfs_trans_brelse(NULL, bp);
5914error_norelse: 5949error_norelse:
5915 cmn_err(CE_WARN, "%s: BAD after btree leaves for %d extents", 5950 xfs_warn(mp, "%s: BAD after btree leaves for %d extents",
5916 __func__, i); 5951 __func__, i);
5917 panic("%s: CORRUPTED BTREE OR SOMETHING", __func__); 5952 panic("%s: CORRUPTED BTREE OR SOMETHING", __func__);
5918 return; 5953 return;
@@ -6115,7 +6150,7 @@ xfs_bmap_punch_delalloc_range(
6115 if (error) { 6150 if (error) {
6116 /* something screwed, just bail */ 6151 /* something screwed, just bail */
6117 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { 6152 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
6118 xfs_fs_cmn_err(CE_ALERT, ip->i_mount, 6153 xfs_alert(ip->i_mount,
6119 "Failed delalloc mapping lookup ino %lld fsb %lld.", 6154 "Failed delalloc mapping lookup ino %lld fsb %lld.",
6120 ip->i_ino, start_fsb); 6155 ip->i_ino, start_fsb);
6121 } 6156 }
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 04f9cca8da7e..2f9e97c128a0 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -634,9 +634,8 @@ xfs_btree_read_bufl(
 		return error;
 	}
 	ASSERT(!bp || !XFS_BUF_GETERROR(bp));
-	if (bp != NULL) {
+	if (bp)
 		XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval);
-	}
 	*bpp = bp;
 	return 0;
 }
@@ -944,13 +943,13 @@ xfs_btree_set_refs(
 	switch (cur->bc_btnum) {
 	case XFS_BTNUM_BNO:
 	case XFS_BTNUM_CNT:
-		XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_MAP, XFS_ALLOC_BTREE_REF);
+		XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_ALLOC_BTREE_REF);
 		break;
 	case XFS_BTNUM_INO:
-		XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_INOMAP, XFS_INO_BTREE_REF);
+		XFS_BUF_SET_VTYPE_REF(bp, B_FS_INOMAP, XFS_INO_BTREE_REF);
 		break;
 	case XFS_BTNUM_BMAP:
-		XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_MAP, XFS_BMAP_BTREE_REF);
+		XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_BMAP_BTREE_REF);
 		break;
 	default:
 		ASSERT(0);
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 2686d0d54c5b..7b7e005e3dcc 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -130,10 +130,12 @@ xfs_buf_item_log_check(
130 orig = bip->bli_orig; 130 orig = bip->bli_orig;
131 buffer = XFS_BUF_PTR(bp); 131 buffer = XFS_BUF_PTR(bp);
132 for (x = 0; x < XFS_BUF_COUNT(bp); x++) { 132 for (x = 0; x < XFS_BUF_COUNT(bp); x++) {
133 if (orig[x] != buffer[x] && !btst(bip->bli_logged, x)) 133 if (orig[x] != buffer[x] && !btst(bip->bli_logged, x)) {
134 cmn_err(CE_PANIC, 134 xfs_emerg(bp->b_mount,
135 "xfs_buf_item_log_check bip %x buffer %x orig %x index %d", 135 "%s: bip %x buffer %x orig %x index %d",
136 bip, bp, orig, x); 136 __func__, bip, bp, orig, x);
137 ASSERT(0);
138 }
137 } 139 }
138} 140}
139#else 141#else
@@ -141,8 +143,7 @@ xfs_buf_item_log_check(
141#define xfs_buf_item_log_check(x) 143#define xfs_buf_item_log_check(x)
142#endif 144#endif
143 145
144STATIC void xfs_buf_error_relse(xfs_buf_t *bp); 146STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp);
145STATIC void xfs_buf_do_callbacks(xfs_buf_t *bp, xfs_log_item_t *lip);
146 147
147/* 148/*
148 * This returns the number of log iovecs needed to log the 149 * This returns the number of log iovecs needed to log the
@@ -428,13 +429,15 @@ xfs_buf_item_unpin(
428 429
429 if (remove) { 430 if (remove) {
430 /* 431 /*
431 * We have to remove the log item from the transaction 432 * If we are in a transaction context, we have to
432 * as we are about to release our reference to the 433 * remove the log item from the transaction as we are
433 * buffer. If we don't, the unlock that occurs later 434 * about to release our reference to the buffer. If we
434 * in xfs_trans_uncommit() will ry to reference the 435 * don't, the unlock that occurs later in
436 * xfs_trans_uncommit() will try to reference the
435 * buffer which we no longer have a hold on. 437 * buffer which we no longer have a hold on.
436 */ 438 */
437 xfs_trans_del_item(lip); 439 if (lip->li_desc)
440 xfs_trans_del_item(lip);
438 441
439 /* 442 /*
440 * Since the transaction no longer refers to the buffer, 443 * Since the transaction no longer refers to the buffer,
@@ -450,7 +453,7 @@ xfs_buf_item_unpin(
450 * xfs_trans_ail_delete() drops the AIL lock. 453 * xfs_trans_ail_delete() drops the AIL lock.
451 */ 454 */
452 if (bip->bli_flags & XFS_BLI_STALE_INODE) { 455 if (bip->bli_flags & XFS_BLI_STALE_INODE) {
453 xfs_buf_do_callbacks(bp, (xfs_log_item_t *)bip); 456 xfs_buf_do_callbacks(bp);
454 XFS_BUF_SET_FSPRIVATE(bp, NULL); 457 XFS_BUF_SET_FSPRIVATE(bp, NULL);
455 XFS_BUF_CLR_IODONE_FUNC(bp); 458 XFS_BUF_CLR_IODONE_FUNC(bp);
456 } else { 459 } else {
@@ -918,15 +921,26 @@ xfs_buf_attach_iodone(
918 XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks); 921 XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks);
919} 922}
920 923
924/*
925 * We can have many callbacks on a buffer. Running the callbacks individually
926 * can cause a lot of contention on the AIL lock, so we allow for a single
927 * callback to be able to scan the remaining lip->li_bio_list for other items
928 * of the same type and callback to be processed in the first call.
929 *
930 * As a result, the loop walking the callback list below will also modify the
931 * list. it removes the first item from the list and then runs the callback.
932 * The loop then restarts from the new head of the list. This allows the
933 * callback to scan and modify the list attached to the buffer and we don't
934 * have to care about maintaining a next item pointer.
935 */
921STATIC void 936STATIC void
922xfs_buf_do_callbacks( 937xfs_buf_do_callbacks(
923 xfs_buf_t *bp, 938 struct xfs_buf *bp)
924 xfs_log_item_t *lip)
925{ 939{
926 xfs_log_item_t *nlip; 940 struct xfs_log_item *lip;
927 941
928 while (lip != NULL) { 942 while ((lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *)) != NULL) {
929 nlip = lip->li_bio_list; 943 XFS_BUF_SET_FSPRIVATE(bp, lip->li_bio_list);
930 ASSERT(lip->li_cb != NULL); 944 ASSERT(lip->li_cb != NULL);
931 /* 945 /*
932 * Clear the next pointer so we don't have any 946 * Clear the next pointer so we don't have any
@@ -936,7 +950,6 @@ xfs_buf_do_callbacks(
936 */ 950 */
937 lip->li_bio_list = NULL; 951 lip->li_bio_list = NULL;
938 lip->li_cb(bp, lip); 952 lip->li_cb(bp, lip);
939 lip = nlip;
940 } 953 }
941} 954}
942 955
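
The comment above explains why xfs_buf_do_callbacks() now pops items off the head of the buffer's callback list and restarts from the new head each time, so one callback can consume further items of its own type. The minimal C sketch below (toy structs, no locking, no XFS types) shows just that list-walk shape.

#include <stdio.h>

/* Minimal stand-in for a log item: a singly linked callback list hanging
 * off a buffer, where each callback may itself consume more list entries. */
struct item {
	struct item *next;
	void (*cb)(struct item *);
};

struct buf { struct item *head; };

static void print_cb(struct item *ip) { printf("callback on %p\n", (void *)ip); }

/* Same shape as the reworked xfs_buf_do_callbacks(): re-read the head each
 * iteration, unhook the first item, clear its link, run its callback.
 * Because the head is re-read, a callback is free to strip further items
 * from the list before we come back around. */
static void do_callbacks(struct buf *bp)
{
	struct item *ip;

	while ((ip = bp->head) != NULL) {
		bp->head = ip->next;	/* pop from the head */
		ip->next = NULL;	/* item no longer sees the list */
		ip->cb(ip);
	}
}

int main(void)
{
	struct item c = { NULL, print_cb }, b = { &c, print_cb }, a = { &b, print_cb };
	struct buf bp = { &a };

	do_callbacks(&bp);
	return 0;
}
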
@@ -949,128 +962,75 @@ xfs_buf_do_callbacks(
949 */ 962 */
950void 963void
951xfs_buf_iodone_callbacks( 964xfs_buf_iodone_callbacks(
952 xfs_buf_t *bp) 965 struct xfs_buf *bp)
953{ 966{
954 xfs_log_item_t *lip; 967 struct xfs_log_item *lip = bp->b_fspriv;
955 static ulong lasttime; 968 struct xfs_mount *mp = lip->li_mountp;
956 static xfs_buftarg_t *lasttarg; 969 static ulong lasttime;
957 xfs_mount_t *mp; 970 static xfs_buftarg_t *lasttarg;
958 971
959 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); 972 if (likely(!XFS_BUF_GETERROR(bp)))
960 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); 973 goto do_callbacks;
961 974
962 if (XFS_BUF_GETERROR(bp) != 0) { 975 /*
963 /* 976 * If we've already decided to shutdown the filesystem because of
964 * If we've already decided to shutdown the filesystem 977 * I/O errors, there's no point in giving this a retry.
965 * because of IO errors, there's no point in giving this 978 */
966 * a retry. 979 if (XFS_FORCED_SHUTDOWN(mp)) {
967 */ 980 XFS_BUF_SUPER_STALE(bp);
968 mp = lip->li_mountp; 981 trace_xfs_buf_item_iodone(bp, _RET_IP_);
969 if (XFS_FORCED_SHUTDOWN(mp)) { 982 goto do_callbacks;
970 ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp); 983 }
971 XFS_BUF_SUPER_STALE(bp);
972 trace_xfs_buf_item_iodone(bp, _RET_IP_);
973 xfs_buf_do_callbacks(bp, lip);
974 XFS_BUF_SET_FSPRIVATE(bp, NULL);
975 XFS_BUF_CLR_IODONE_FUNC(bp);
976 xfs_buf_ioend(bp, 0);
977 return;
978 }
979 984
980 if ((XFS_BUF_TARGET(bp) != lasttarg) || 985 if (XFS_BUF_TARGET(bp) != lasttarg ||
981 (time_after(jiffies, (lasttime + 5*HZ)))) { 986 time_after(jiffies, (lasttime + 5*HZ))) {
982 lasttime = jiffies; 987 lasttime = jiffies;
983 cmn_err(CE_ALERT, "Device %s, XFS metadata write error" 988 xfs_alert(mp, "Device %s: metadata write error block 0x%llx",
984 " block 0x%llx in %s", 989 XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)),
985 XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)), 990 (__uint64_t)XFS_BUF_ADDR(bp));
986 (__uint64_t)XFS_BUF_ADDR(bp), mp->m_fsname); 991 }
987 } 992 lasttarg = XFS_BUF_TARGET(bp);
988 lasttarg = XFS_BUF_TARGET(bp);
989 993
990 if (XFS_BUF_ISASYNC(bp)) { 994 /*
991 /* 995 * If the write was asynchronous then no one will be looking for the
992 * If the write was asynchronous then noone will be 996 * error. Clear the error state and write the buffer out again.
993 * looking for the error. Clear the error state 997 *
994 * and write the buffer out again delayed write. 998 * During sync or umount we'll write all pending buffers again
995 * 999 * synchronous, which will catch these errors if they keep hanging
996 * XXXsup This is OK, so long as we catch these 1000 * around.
997 * before we start the umount; we don't want these 1001 */
998 * DELWRI metadata bufs to be hanging around. 1002 if (XFS_BUF_ISASYNC(bp)) {
999 */ 1003 XFS_BUF_ERROR(bp, 0); /* errno of 0 unsets the flag */
1000 XFS_BUF_ERROR(bp,0); /* errno of 0 unsets the flag */ 1004
1001 1005 if (!XFS_BUF_ISSTALE(bp)) {
1002 if (!(XFS_BUF_ISSTALE(bp))) { 1006 XFS_BUF_DELAYWRITE(bp);
1003 XFS_BUF_DELAYWRITE(bp);
1004 XFS_BUF_DONE(bp);
1005 XFS_BUF_SET_START(bp);
1006 }
1007 ASSERT(XFS_BUF_IODONE_FUNC(bp));
1008 trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
1009 xfs_buf_relse(bp);
1010 } else {
1011 /*
1012 * If the write of the buffer was not asynchronous,
1013 * then we want to make sure to return the error
1014 * to the caller of bwrite(). Because of this we
1015 * cannot clear the B_ERROR state at this point.
1016 * Instead we install a callback function that
1017 * will be called when the buffer is released, and
1018 * that routine will clear the error state and
1019 * set the buffer to be written out again after
1020 * some delay.
1021 */
1022 /* We actually overwrite the existing b-relse
1023 function at times, but we're gonna be shutting down
1024 anyway. */
1025 XFS_BUF_SET_BRELSE_FUNC(bp,xfs_buf_error_relse);
1026 XFS_BUF_DONE(bp); 1007 XFS_BUF_DONE(bp);
1027 XFS_BUF_FINISH_IOWAIT(bp); 1008 XFS_BUF_SET_START(bp);
1028 } 1009 }
1010 ASSERT(XFS_BUF_IODONE_FUNC(bp));
1011 trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
1012 xfs_buf_relse(bp);
1029 return; 1013 return;
1030 } 1014 }
1031 1015
1032 xfs_buf_do_callbacks(bp, lip); 1016 /*
1033 XFS_BUF_SET_FSPRIVATE(bp, NULL); 1017 * If the write of the buffer was synchronous, we want to make
1034 XFS_BUF_CLR_IODONE_FUNC(bp); 1018 * sure to return the error to the caller of xfs_bwrite().
1035 xfs_buf_ioend(bp, 0); 1019 */
1036}
1037
1038/*
1039 * This is a callback routine attached to a buffer which gets an error
1040 * when being written out synchronously.
1041 */
1042STATIC void
1043xfs_buf_error_relse(
1044 xfs_buf_t *bp)
1045{
1046 xfs_log_item_t *lip;
1047 xfs_mount_t *mp;
1048
1049 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
1050 mp = (xfs_mount_t *)lip->li_mountp;
1051 ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp);
1052
1053 XFS_BUF_STALE(bp); 1020 XFS_BUF_STALE(bp);
1054 XFS_BUF_DONE(bp); 1021 XFS_BUF_DONE(bp);
1055 XFS_BUF_UNDELAYWRITE(bp); 1022 XFS_BUF_UNDELAYWRITE(bp);
1056 XFS_BUF_ERROR(bp,0);
1057 1023
1058 trace_xfs_buf_error_relse(bp, _RET_IP_); 1024 trace_xfs_buf_error_relse(bp, _RET_IP_);
1025 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1059 1026
1060 if (! XFS_FORCED_SHUTDOWN(mp)) 1027do_callbacks:
1061 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); 1028 xfs_buf_do_callbacks(bp);
1062 /*
1063 * We have to unpin the pinned buffers so do the
1064 * callbacks.
1065 */
1066 xfs_buf_do_callbacks(bp, lip);
1067 XFS_BUF_SET_FSPRIVATE(bp, NULL); 1029 XFS_BUF_SET_FSPRIVATE(bp, NULL);
1068 XFS_BUF_CLR_IODONE_FUNC(bp); 1030 XFS_BUF_CLR_IODONE_FUNC(bp);
1069 XFS_BUF_SET_BRELSE_FUNC(bp,NULL); 1031 xfs_buf_ioend(bp, 0);
1070 xfs_buf_relse(bp);
1071} 1032}
1072 1033
1073
1074/* 1034/*
1075 * This is the iodone() function for buffers which have been 1035 * This is the iodone() function for buffers which have been
1076 * logged. It is called when they are eventually flushed out. 1036 * logged. It is called when they are eventually flushed out.
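
The resulting xfs_buf_iodone_callbacks() folds the old xfs_buf_error_relse() handling into one function behind the do_callbacks label: shutdown buffers are super-staled immediately, async write errors are cleared and requeued, and sync write errors stale the buffer and force a shutdown before the callbacks run. A minimal userspace sketch of that control flow, with invented names (handle_write_error, force_shutdown) standing in for the kernel helpers:

    /*
     * Hypothetical, simplified model of the control flow above. It is not the
     * kernel code: handle_write_error() and force_shutdown() are invented
     * stand-ins for xfs_buf_iodone_callbacks() and xfs_force_shutdown().
     */
    #include <stdbool.h>
    #include <stdio.h>

    struct buf {
        bool async;     /* XFS_BUF_ISASYNC() analogue */
        bool stale;     /* XFS_BUF_ISSTALE() analogue */
        bool error;
    };

    static void force_shutdown(void)
    {
        printf("force filesystem shutdown\n");
    }

    static void handle_write_error(struct buf *bp, bool already_shut_down)
    {
        if (already_shut_down) {
            bp->stale = true;               /* XFS_BUF_SUPER_STALE() analogue */
            goto do_callbacks;
        }

        if (bp->async) {
            /*
             * Nobody waits for an async write, so clear the error and requeue
             * the buffer; sync or unmount writeback catches it if it persists.
             */
            bp->error = false;
            if (!bp->stale)
                printf("requeue buffer as delayed write\n");
            return;
        }

        /*
         * A sync write must return the error to its caller, so stale the
         * buffer and shut down before running the callbacks.
         */
        bp->stale = true;
        force_shutdown();

    do_callbacks:
        printf("run attached log item callbacks, complete the I/O\n");
    }

    int main(void)
    {
        struct buf b = { .async = true, .error = true };

        handle_write_error(&b, false);
        return 0;
    }
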
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index 0e2ed43f16c7..b6ecd2061e7c 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -105,17 +105,6 @@ typedef struct xfs_buf_log_item {
105 xfs_buf_log_format_t bli_format; /* in-log header */ 105 xfs_buf_log_format_t bli_format; /* in-log header */
106} xfs_buf_log_item_t; 106} xfs_buf_log_item_t;
107 107
108/*
109 * This structure is used during recovery to record the buf log
110 * items which have been canceled and should not be replayed.
111 */
112typedef struct xfs_buf_cancel {
113 xfs_daddr_t bc_blkno;
114 uint bc_len;
115 int bc_refcount;
116 struct xfs_buf_cancel *bc_next;
117} xfs_buf_cancel_t;
118
119void xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *); 108void xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *);
120void xfs_buf_item_relse(struct xfs_buf *); 109void xfs_buf_item_relse(struct xfs_buf *);
121void xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint); 110void xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint);
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 1c00bedb3175..6102ac6d1dff 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -1995,13 +1995,12 @@ xfs_da_do_buf(
1995 error = mappedbno == -2 ? 0 : XFS_ERROR(EFSCORRUPTED); 1995 error = mappedbno == -2 ? 0 : XFS_ERROR(EFSCORRUPTED);
1996 if (unlikely(error == EFSCORRUPTED)) { 1996 if (unlikely(error == EFSCORRUPTED)) {
1997 if (xfs_error_level >= XFS_ERRLEVEL_LOW) { 1997 if (xfs_error_level >= XFS_ERRLEVEL_LOW) {
1998 cmn_err(CE_ALERT, "xfs_da_do_buf: bno %lld\n", 1998 xfs_alert(mp, "%s: bno %lld dir: inode %lld",
1999 (long long)bno); 1999 __func__, (long long)bno,
2000 cmn_err(CE_ALERT, "dir: inode %lld\n",
2001 (long long)dp->i_ino); 2000 (long long)dp->i_ino);
2002 for (i = 0; i < nmap; i++) { 2001 for (i = 0; i < nmap; i++) {
2003 cmn_err(CE_ALERT, 2002 xfs_alert(mp,
2004 "[%02d] br_startoff %lld br_startblock %lld br_blockcount %lld br_state %d\n", 2003"[%02d] br_startoff %lld br_startblock %lld br_blockcount %lld br_state %d",
2005 i, 2004 i,
2006 (long long)mapp[i].br_startoff, 2005 (long long)mapp[i].br_startoff,
2007 (long long)mapp[i].br_startblock, 2006 (long long)mapp[i].br_startblock,
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index e60490bc00a6..be628677c288 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -270,9 +270,9 @@ xfs_swap_extents(
270 /* check inode formats now that data is flushed */ 270 /* check inode formats now that data is flushed */
271 error = xfs_swap_extents_check_format(ip, tip); 271 error = xfs_swap_extents_check_format(ip, tip);
272 if (error) { 272 if (error) {
273 xfs_fs_cmn_err(CE_NOTE, mp, 273 xfs_notice(mp,
274 "%s: inode 0x%llx format is incompatible for exchanging.", 274 "%s: inode 0x%llx format is incompatible for exchanging.",
275 __FILE__, ip->i_ino); 275 __func__, ip->i_ino);
276 goto out_unlock; 276 goto out_unlock;
277 } 277 }
278 278
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index a1321bc7f192..dba7a71cedf3 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -159,7 +159,7 @@ xfs_dir_ino_validate(
159 XFS_AGINO_TO_INO(mp, agno, agino) == ino; 159 XFS_AGINO_TO_INO(mp, agno, agino) == ino;
160 if (unlikely(XFS_TEST_ERROR(!ino_ok, mp, XFS_ERRTAG_DIR_INO_VALIDATE, 160 if (unlikely(XFS_TEST_ERROR(!ino_ok, mp, XFS_ERRTAG_DIR_INO_VALIDATE,
161 XFS_RANDOM_DIR_INO_VALIDATE))) { 161 XFS_RANDOM_DIR_INO_VALIDATE))) {
162 xfs_fs_cmn_err(CE_WARN, mp, "Invalid inode number 0x%Lx", 162 xfs_warn(mp, "Invalid inode number 0x%Lx",
163 (unsigned long long) ino); 163 (unsigned long long) ino);
164 XFS_ERROR_REPORT("xfs_dir_ino_validate", XFS_ERRLEVEL_LOW, mp); 164 XFS_ERROR_REPORT("xfs_dir_ino_validate", XFS_ERRLEVEL_LOW, mp);
165 return XFS_ERROR(EFSCORRUPTED); 165 return XFS_ERROR(EFSCORRUPTED);
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index f9a0864b696a..a0aab7d3294f 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -899,10 +899,9 @@ xfs_dir2_leafn_rebalance(
899 if(blk2->index < 0) { 899 if(blk2->index < 0) {
900 state->inleaf = 1; 900 state->inleaf = 1;
901 blk2->index = 0; 901 blk2->index = 0;
902 cmn_err(CE_ALERT, 902 xfs_alert(args->dp->i_mount,
903 "xfs_dir2_leafn_rebalance: picked the wrong leaf? reverting original leaf: " 903 "%s: picked the wrong leaf? reverting original leaf: blk1->index %d\n",
904 "blk1->index %d\n", 904 __func__, blk1->index);
905 blk1->index);
906 } 905 }
907} 906}
908 907
@@ -1641,26 +1640,22 @@ xfs_dir2_node_addname_int(
1641 } 1640 }
1642 1641
1643 if (unlikely(xfs_dir2_db_to_fdb(mp, dbno) != fbno)) { 1642 if (unlikely(xfs_dir2_db_to_fdb(mp, dbno) != fbno)) {
1644 cmn_err(CE_ALERT, 1643 xfs_alert(mp,
1645 "xfs_dir2_node_addname_int: dir ino " 1644 "%s: dir ino " "%llu needed freesp block %lld for\n"
1646 "%llu needed freesp block %lld for\n" 1645 " data block %lld, got %lld ifbno %llu lastfbno %d",
1647 " data block %lld, got %lld\n" 1646 __func__, (unsigned long long)dp->i_ino,
1648 " ifbno %llu lastfbno %d\n",
1649 (unsigned long long)dp->i_ino,
1650 (long long)xfs_dir2_db_to_fdb(mp, dbno), 1647 (long long)xfs_dir2_db_to_fdb(mp, dbno),
1651 (long long)dbno, (long long)fbno, 1648 (long long)dbno, (long long)fbno,
1652 (unsigned long long)ifbno, lastfbno); 1649 (unsigned long long)ifbno, lastfbno);
1653 if (fblk) { 1650 if (fblk) {
1654 cmn_err(CE_ALERT, 1651 xfs_alert(mp,
1655 " fblk 0x%p blkno %llu " 1652 " fblk 0x%p blkno %llu index %d magic 0x%x",
1656 "index %d magic 0x%x\n",
1657 fblk, 1653 fblk,
1658 (unsigned long long)fblk->blkno, 1654 (unsigned long long)fblk->blkno,
1659 fblk->index, 1655 fblk->index,
1660 fblk->magic); 1656 fblk->magic);
1661 } else { 1657 } else {
1662 cmn_err(CE_ALERT, 1658 xfs_alert(mp, " ... fblk is NULL");
1663 " ... fblk is NULL\n");
1664 } 1659 }
1665 XFS_ERROR_REPORT("xfs_dir2_node_addname_int", 1660 XFS_ERROR_REPORT("xfs_dir2_node_addname_int",
1666 XFS_ERRLEVEL_LOW, mp); 1661 XFS_ERRLEVEL_LOW, mp);
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index c78cc6a3d87c..39f06336b99d 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -48,7 +48,7 @@ xfs_error_trap(int e)
48 break; 48 break;
49 if (e != xfs_etrap[i]) 49 if (e != xfs_etrap[i])
50 continue; 50 continue;
51 cmn_err(CE_NOTE, "xfs_error_trap: error %d", e); 51 xfs_notice(NULL, "%s: error %d", __func__, e);
52 BUG(); 52 BUG();
53 break; 53 break;
54 } 54 }
@@ -74,7 +74,7 @@ xfs_error_test(int error_tag, int *fsidp, char *expression,
74 74
75 for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { 75 for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) {
76 if (xfs_etest[i] == error_tag && xfs_etest_fsid[i] == fsid) { 76 if (xfs_etest[i] == error_tag && xfs_etest_fsid[i] == fsid) {
77 cmn_err(CE_WARN, 77 xfs_warn(NULL,
78 "Injecting error (%s) at file %s, line %d, on filesystem \"%s\"", 78 "Injecting error (%s) at file %s, line %d, on filesystem \"%s\"",
79 expression, file, line, xfs_etest_fsname[i]); 79 expression, file, line, xfs_etest_fsname[i]);
80 return 1; 80 return 1;
@@ -95,14 +95,14 @@ xfs_errortag_add(int error_tag, xfs_mount_t *mp)
95 95
96 for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { 96 for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) {
97 if (xfs_etest_fsid[i] == fsid && xfs_etest[i] == error_tag) { 97 if (xfs_etest_fsid[i] == fsid && xfs_etest[i] == error_tag) {
98 cmn_err(CE_WARN, "XFS error tag #%d on", error_tag); 98 xfs_warn(mp, "error tag #%d on", error_tag);
99 return 0; 99 return 0;
100 } 100 }
101 } 101 }
102 102
103 for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { 103 for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) {
104 if (xfs_etest[i] == 0) { 104 if (xfs_etest[i] == 0) {
105 cmn_err(CE_WARN, "Turned on XFS error tag #%d", 105 xfs_warn(mp, "Turned on XFS error tag #%d",
106 error_tag); 106 error_tag);
107 xfs_etest[i] = error_tag; 107 xfs_etest[i] = error_tag;
108 xfs_etest_fsid[i] = fsid; 108 xfs_etest_fsid[i] = fsid;
@@ -114,7 +114,7 @@ xfs_errortag_add(int error_tag, xfs_mount_t *mp)
114 } 114 }
115 } 115 }
116 116
117 cmn_err(CE_WARN, "error tag overflow, too many turned on"); 117 xfs_warn(mp, "error tag overflow, too many turned on");
118 118
119 return 1; 119 return 1;
120} 120}
@@ -133,7 +133,7 @@ xfs_errortag_clearall(xfs_mount_t *mp, int loud)
133 if ((fsid == 0LL || xfs_etest_fsid[i] == fsid) && 133 if ((fsid == 0LL || xfs_etest_fsid[i] == fsid) &&
134 xfs_etest[i] != 0) { 134 xfs_etest[i] != 0) {
135 cleared = 1; 135 cleared = 1;
136 cmn_err(CE_WARN, "Clearing XFS error tag #%d", 136 xfs_warn(mp, "Clearing XFS error tag #%d",
137 xfs_etest[i]); 137 xfs_etest[i]);
138 xfs_etest[i] = 0; 138 xfs_etest[i] = 0;
139 xfs_etest_fsid[i] = 0LL; 139 xfs_etest_fsid[i] = 0LL;
@@ -144,45 +144,12 @@ xfs_errortag_clearall(xfs_mount_t *mp, int loud)
144 } 144 }
145 145
146 if (loud || cleared) 146 if (loud || cleared)
147 cmn_err(CE_WARN, 147 xfs_warn(mp, "Cleared all XFS error tags for filesystem");
148 "Cleared all XFS error tags for filesystem \"%s\"",
149 mp->m_fsname);
150 148
151 return 0; 149 return 0;
152} 150}
153#endif /* DEBUG */ 151#endif /* DEBUG */
154 152
155
156void
157xfs_fs_cmn_err(int level, xfs_mount_t *mp, char *fmt, ...)
158{
159 va_list ap;
160
161 va_start(ap, fmt);
162 xfs_fs_vcmn_err(level, mp, fmt, ap);
163 va_end(ap);
164}
165
166void
167xfs_cmn_err(int panic_tag, int level, xfs_mount_t *mp, char *fmt, ...)
168{
169 va_list ap;
170
171#ifdef DEBUG
172 xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES);
173#endif
174
175 if (xfs_panic_mask && (xfs_panic_mask & panic_tag)
176 && (level & CE_ALERT)) {
177 level &= ~CE_ALERT;
178 level |= CE_PANIC;
179 cmn_err(CE_ALERT, "XFS: Transforming an alert into a BUG.");
180 }
181 va_start(ap, fmt);
182 xfs_fs_vcmn_err(level, mp, fmt, ap);
183 va_end(ap);
184}
185
186void 153void
187xfs_error_report( 154xfs_error_report(
188 const char *tag, 155 const char *tag,
@@ -193,9 +160,8 @@ xfs_error_report(
193 inst_t *ra) 160 inst_t *ra)
194{ 161{
195 if (level <= xfs_error_level) { 162 if (level <= xfs_error_level) {
196 xfs_cmn_err(XFS_PTAG_ERROR_REPORT, 163 xfs_alert_tag(mp, XFS_PTAG_ERROR_REPORT,
197 CE_ALERT, mp, 164 "Internal error %s at line %d of file %s. Caller 0x%p\n",
198 "XFS internal error %s at line %d of file %s. Caller 0x%p\n",
199 tag, linenum, filename, ra); 165 tag, linenum, filename, ra);
200 166
201 xfs_stack_trace(); 167 xfs_stack_trace();
@@ -215,4 +181,5 @@ xfs_corruption_error(
215 if (level <= xfs_error_level) 181 if (level <= xfs_error_level)
216 xfs_hex_dump(p, 16); 182 xfs_hex_dump(p, 16);
217 xfs_error_report(tag, level, mp, filename, linenum, ra); 183 xfs_error_report(tag, level, mp, filename, linenum, ra);
184 xfs_alert(mp, "Corruption detected. Unmount and run xfs_repair");
218} 185}
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index f338847f80b8..079a367f44ee 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -136,8 +136,8 @@ extern int xfs_error_test(int, int *, char *, int, char *, unsigned long);
136 xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \ 136 xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \
137 (rf)))) 137 (rf))))
138 138
139extern int xfs_errortag_add(int error_tag, xfs_mount_t *mp); 139extern int xfs_errortag_add(int error_tag, struct xfs_mount *mp);
140extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud); 140extern int xfs_errortag_clearall(struct xfs_mount *mp, int loud);
141#else 141#else
142#define XFS_TEST_ERROR(expr, mp, tag, rf) (expr) 142#define XFS_TEST_ERROR(expr, mp, tag, rf) (expr)
143#define xfs_errortag_add(tag, mp) (ENOSYS) 143#define xfs_errortag_add(tag, mp) (ENOSYS)
@@ -145,10 +145,8 @@ extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud);
145#endif /* DEBUG */ 145#endif /* DEBUG */
146 146
147/* 147/*
148 * XFS panic tags -- allow a call to xfs_cmn_err() be turned into 148 * XFS panic tags -- allow a call to xfs_alert_tag() be turned into
149 * a panic by setting xfs_panic_mask in a 149 * a panic by setting xfs_panic_mask in a sysctl.
150 * sysctl. update xfs_max[XFS_PARAM] if
151 * more are added.
152 */ 150 */
153#define XFS_NO_PTAG 0 151#define XFS_NO_PTAG 0
154#define XFS_PTAG_IFLUSH 0x00000001 152#define XFS_PTAG_IFLUSH 0x00000001
@@ -160,23 +158,4 @@ extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud);
160#define XFS_PTAG_SHUTDOWN_LOGERROR 0x00000040 158#define XFS_PTAG_SHUTDOWN_LOGERROR 0x00000040
161#define XFS_PTAG_FSBLOCK_ZERO 0x00000080 159#define XFS_PTAG_FSBLOCK_ZERO 0x00000080
162 160
163struct xfs_mount;
164
165extern void xfs_fs_vcmn_err(int level, struct xfs_mount *mp,
166 char *fmt, va_list ap)
167 __attribute__ ((format (printf, 3, 0)));
168extern void xfs_cmn_err(int panic_tag, int level, struct xfs_mount *mp,
169 char *fmt, ...)
170 __attribute__ ((format (printf, 4, 5)));
171extern void xfs_fs_cmn_err(int level, struct xfs_mount *mp, char *fmt, ...)
172 __attribute__ ((format (printf, 3, 4)));
173
174extern void xfs_hex_dump(void *p, int length);
175
176#define xfs_fs_repair_cmn_err(level, mp, fmt, args...) \
177 xfs_fs_cmn_err(level, mp, fmt " Unmount and run xfs_repair.", ## args)
178
179#define xfs_fs_mount_cmn_err(f, fmt, args...) \
180 ((f & XFS_MFSI_QUIET)? (void)0 : cmn_err(CE_WARN, "XFS: " fmt, ## args))
181
182#endif /* __XFS_ERROR_H__ */ 161#endif /* __XFS_ERROR_H__ */
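
The panic tags that remain in xfs_error.h control whether a tagged alert escalates to a kernel BUG() when the corresponding bit is set in xfs_panic_mask. A rough userspace model of that mechanism; all names are invented and abort() stands in for BUG():

    /*
     * Hypothetical model of the panic-tag idea: an alert carrying a tag
     * becomes fatal when that tag is set in a runtime mask.
     */
    #include <stdio.h>
    #include <stdlib.h>

    #define PTAG_IFLUSH         0x00000001u
    #define PTAG_ERROR_REPORT   0x00000010u

    static unsigned int panic_mask;        /* set via sysctl in the kernel */

    static void alert_tag(unsigned int tag, const char *msg)
    {
        fprintf(stderr, "ALERT: %s\n", msg);
        if (panic_mask & tag)
            abort();                       /* BUG() analogue */
    }

    int main(void)
    {
        panic_mask = PTAG_ERROR_REPORT;
        alert_tag(PTAG_IFLUSH, "survivable alert");
        alert_tag(PTAG_ERROR_REPORT, "this tag is configured to panic");
        return 0;
    }
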
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index a55e687bf562..d22e62623437 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -48,6 +48,28 @@ xfs_efi_item_free(
48} 48}
49 49
50/* 50/*
51 * Freeing the efi requires that we remove it from the AIL if it has already
52 * been placed there. However, the EFI may not yet have been placed in the AIL
53 * when called by xfs_efi_release() from EFD processing due to the ordering of
54 * committed vs unpin operations in bulk insert operations. Hence the
55 * test_and_clear_bit(XFS_EFI_COMMITTED) to ensure only the last caller frees
56 * the EFI.
57 */
58STATIC void
59__xfs_efi_release(
60 struct xfs_efi_log_item *efip)
61{
62 struct xfs_ail *ailp = efip->efi_item.li_ailp;
63
64 if (!test_and_clear_bit(XFS_EFI_COMMITTED, &efip->efi_flags)) {
65 spin_lock(&ailp->xa_lock);
66 /* xfs_trans_ail_delete() drops the AIL lock. */
67 xfs_trans_ail_delete(ailp, &efip->efi_item);
68 xfs_efi_item_free(efip);
69 }
70}
71
72/*
51 * This returns the number of iovecs needed to log the given efi item. 73 * This returns the number of iovecs needed to log the given efi item.
52 * We only need 1 iovec for an efi item. It just logs the efi_log_format 74 * We only need 1 iovec for an efi item. It just logs the efi_log_format
53 * structure. 75 * structure.
@@ -74,7 +96,8 @@ xfs_efi_item_format(
74 struct xfs_efi_log_item *efip = EFI_ITEM(lip); 96 struct xfs_efi_log_item *efip = EFI_ITEM(lip);
75 uint size; 97 uint size;
76 98
77 ASSERT(efip->efi_next_extent == efip->efi_format.efi_nextents); 99 ASSERT(atomic_read(&efip->efi_next_extent) ==
100 efip->efi_format.efi_nextents);
78 101
79 efip->efi_format.efi_type = XFS_LI_EFI; 102 efip->efi_format.efi_type = XFS_LI_EFI;
80 103
@@ -99,10 +122,12 @@ xfs_efi_item_pin(
99} 122}
100 123
101/* 124/*
102 * While EFIs cannot really be pinned, the unpin operation is the 125 * While EFIs cannot really be pinned, the unpin operation is the last place at
103 * last place at which the EFI is manipulated during a transaction. 126 * which the EFI is manipulated during a transaction. If we are being asked to
104 * Here we coordinate with xfs_efi_cancel() to determine who gets to 127 * remove the EFI it's because the transaction has been cancelled and by
105 * free the EFI. 128 * definition that means the EFI cannot be in the AIL so remove it from the
129 * transaction and free it. Otherwise coordinate with xfs_efi_release() (via
130 * XFS_EFI_COMMITTED) to determine who gets to free the EFI.
106 */ 131 */
107STATIC void 132STATIC void
108xfs_efi_item_unpin( 133xfs_efi_item_unpin(
@@ -110,20 +135,15 @@ xfs_efi_item_unpin(
110 int remove) 135 int remove)
111{ 136{
112 struct xfs_efi_log_item *efip = EFI_ITEM(lip); 137 struct xfs_efi_log_item *efip = EFI_ITEM(lip);
113 struct xfs_ail *ailp = lip->li_ailp;
114 138
115 spin_lock(&ailp->xa_lock); 139 if (remove) {
116 if (efip->efi_flags & XFS_EFI_CANCELED) { 140 ASSERT(!(lip->li_flags & XFS_LI_IN_AIL));
117 if (remove) 141 if (lip->li_desc)
118 xfs_trans_del_item(lip); 142 xfs_trans_del_item(lip);
119
120 /* xfs_trans_ail_delete() drops the AIL lock. */
121 xfs_trans_ail_delete(ailp, lip);
122 xfs_efi_item_free(efip); 143 xfs_efi_item_free(efip);
123 } else { 144 return;
124 efip->efi_flags |= XFS_EFI_COMMITTED;
125 spin_unlock(&ailp->xa_lock);
126 } 145 }
146 __xfs_efi_release(efip);
127} 147}
128 148
129/* 149/*
@@ -152,16 +172,20 @@ xfs_efi_item_unlock(
152} 172}
153 173
154/* 174/*
155 * The EFI is logged only once and cannot be moved in the log, so 175 * The EFI is logged only once and cannot be moved in the log, so simply return
156 * simply return the lsn at which it's been logged. The canceled 176 * the lsn at which it's been logged. For bulk transaction committed
157 * flag is not paid any attention here. Checking for that is delayed 177 * processing, the EFI may be processed but not yet unpinned prior to the EFD
158 * until the EFI is unpinned. 178 * being processed. Set the XFS_EFI_COMMITTED flag so this case can be detected
179 * when processing the EFD.
159 */ 180 */
160STATIC xfs_lsn_t 181STATIC xfs_lsn_t
161xfs_efi_item_committed( 182xfs_efi_item_committed(
162 struct xfs_log_item *lip, 183 struct xfs_log_item *lip,
163 xfs_lsn_t lsn) 184 xfs_lsn_t lsn)
164{ 185{
186 struct xfs_efi_log_item *efip = EFI_ITEM(lip);
187
188 set_bit(XFS_EFI_COMMITTED, &efip->efi_flags);
165 return lsn; 189 return lsn;
166} 190}
167 191
@@ -230,6 +254,7 @@ xfs_efi_init(
230 xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops); 254 xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops);
231 efip->efi_format.efi_nextents = nextents; 255 efip->efi_format.efi_nextents = nextents;
232 efip->efi_format.efi_id = (__psint_t)(void*)efip; 256 efip->efi_format.efi_id = (__psint_t)(void*)efip;
257 atomic_set(&efip->efi_next_extent, 0);
233 258
234 return efip; 259 return efip;
235} 260}
@@ -289,37 +314,18 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
289} 314}
290 315
291/* 316/*
292 * This is called by the efd item code below to release references to 317 * This is called by the efd item code below to release references to the given
293 * the given efi item. Each efd calls this with the number of 318 * efi item. Each efd calls this with the number of extents that it has
294 * extents that it has logged, and when the sum of these reaches 319 * logged, and when the sum of these reaches the total number of extents logged
295 * the total number of extents logged by this efi item we can free 320 * by this efi item we can free the efi item.
296 * the efi item.
297 *
298 * Freeing the efi item requires that we remove it from the AIL.
299 * We'll use the AIL lock to protect our counters as well as
300 * the removal from the AIL.
301 */ 321 */
302void 322void
303xfs_efi_release(xfs_efi_log_item_t *efip, 323xfs_efi_release(xfs_efi_log_item_t *efip,
304 uint nextents) 324 uint nextents)
305{ 325{
306 struct xfs_ail *ailp = efip->efi_item.li_ailp; 326 ASSERT(atomic_read(&efip->efi_next_extent) >= nextents);
307 int extents_left; 327 if (atomic_sub_and_test(nextents, &efip->efi_next_extent))
308 328 __xfs_efi_release(efip);
309 ASSERT(efip->efi_next_extent > 0);
310 ASSERT(efip->efi_flags & XFS_EFI_COMMITTED);
311
312 spin_lock(&ailp->xa_lock);
313 ASSERT(efip->efi_next_extent >= nextents);
314 efip->efi_next_extent -= nextents;
315 extents_left = efip->efi_next_extent;
316 if (extents_left == 0) {
317 /* xfs_trans_ail_delete() drops the AIL lock. */
318 xfs_trans_ail_delete(ailp, (xfs_log_item_t *)efip);
319 xfs_efi_item_free(efip);
320 } else {
321 spin_unlock(&ailp->xa_lock);
322 }
323} 329}
324 330
325static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip) 331static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip)
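
The EFI now carries an atomic extent count and is freed by whichever caller drives it to zero, instead of counting under the AIL lock. A small C11 sketch of that release pattern, with invented names (efi_like, efi_release); it models only the counter, not the XFS_EFI_COMMITTED bit or the AIL removal:

    /*
     * Sketch, not the kernel API: each EFD-side caller subtracts the extents
     * it logged, and only the caller that drives the counter to zero performs
     * the final free.
     */
    #include <stdatomic.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct efi_like {
        atomic_uint next_extent;    /* extents still to be accounted for */
    };

    static struct efi_like *efi_alloc(unsigned int nextents)
    {
        struct efi_like *efi = malloc(sizeof(*efi));

        atomic_init(&efi->next_extent, nextents);
        return efi;
    }

    static void efi_release(struct efi_like *efi, unsigned int nextents)
    {
        /* atomic_sub_and_test() analogue: true only for the last caller */
        if (atomic_fetch_sub(&efi->next_extent, nextents) == nextents) {
            printf("last reference dropped: remove from AIL and free\n");
            free(efi);
        }
    }

    int main(void)
    {
        struct efi_like *efi = efi_alloc(16);

        efi_release(efi, 10);   /* partial EFD: item stays alive */
        efi_release(efi, 6);    /* final EFD frees the item */
        return 0;
    }

In the hunk above the same role is played by atomic_sub_and_test() on efi_next_extent, with test_and_clear_bit(XFS_EFI_COMMITTED) in __xfs_efi_release() deciding whether AIL removal is still needed.
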
diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h
index 0d22c56fdf64..375f68e42531 100644
--- a/fs/xfs/xfs_extfree_item.h
+++ b/fs/xfs/xfs_extfree_item.h
@@ -111,11 +111,10 @@ typedef struct xfs_efd_log_format_64 {
111#define XFS_EFI_MAX_FAST_EXTENTS 16 111#define XFS_EFI_MAX_FAST_EXTENTS 16
112 112
113/* 113/*
114 * Define EFI flags. 114 * Define EFI flag bits. Manipulated by set/clear/test_bit operators.
115 */ 115 */
116#define XFS_EFI_RECOVERED 0x1 116#define XFS_EFI_RECOVERED 1
117#define XFS_EFI_COMMITTED 0x2 117#define XFS_EFI_COMMITTED 2
118#define XFS_EFI_CANCELED 0x4
119 118
120/* 119/*
121 * This is the "extent free intention" log item. It is used 120 * This is the "extent free intention" log item. It is used
@@ -125,8 +124,8 @@ typedef struct xfs_efd_log_format_64 {
125 */ 124 */
126typedef struct xfs_efi_log_item { 125typedef struct xfs_efi_log_item {
127 xfs_log_item_t efi_item; 126 xfs_log_item_t efi_item;
128 uint efi_flags; /* misc flags */ 127 atomic_t efi_next_extent;
129 uint efi_next_extent; 128 unsigned long efi_flags; /* misc flags */
130 xfs_efi_log_format_t efi_format; 129 xfs_efi_log_format_t efi_format;
131} xfs_efi_log_item_t; 130} xfs_efi_log_item_t;
132 131
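
XFS_EFI_RECOVERED and XFS_EFI_COMMITTED change from bit masks in a uint to bit numbers in an unsigned long so they can be passed to set_bit()/test_and_clear_bit(). A tiny sketch of the difference, using plain shifts rather than the kernel bitops:

    /*
     * Illustrative only: the old values were masks combined with |/&, the new
     * ones are bit numbers for the kernel's atomic bitops.
     */
    #include <assert.h>

    #define EFI_RECOVERED   1   /* bit number, not a mask */
    #define EFI_COMMITTED   2

    int main(void)
    {
        unsigned long flags = 0;

        flags |= 1UL << EFI_COMMITTED;              /* set_bit() analogue */
        assert(flags & (1UL << EFI_COMMITTED));     /* test_bit() analogue */
        assert(!(flags & (1UL << EFI_RECOVERED)));
        return 0;
    }
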
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index a7c116e814af..9153d2c77caf 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -53,6 +53,9 @@ xfs_fs_geometry(
53 xfs_fsop_geom_t *geo, 53 xfs_fsop_geom_t *geo,
54 int new_version) 54 int new_version)
55{ 55{
56
57 memset(geo, 0, sizeof(*geo));
58
56 geo->blocksize = mp->m_sb.sb_blocksize; 59 geo->blocksize = mp->m_sb.sb_blocksize;
57 geo->rtextsize = mp->m_sb.sb_rextsize; 60 geo->rtextsize = mp->m_sb.sb_rextsize;
58 geo->agblocks = mp->m_sb.sb_agblocks; 61 geo->agblocks = mp->m_sb.sb_agblocks;
@@ -374,6 +377,7 @@ xfs_growfs_data_private(
374 mp->m_maxicount = icount << mp->m_sb.sb_inopblog; 377 mp->m_maxicount = icount << mp->m_sb.sb_inopblog;
375 } else 378 } else
376 mp->m_maxicount = 0; 379 mp->m_maxicount = 0;
380 xfs_set_low_space_thresholds(mp);
377 381
378 /* update secondary superblocks. */ 382 /* update secondary superblocks. */
379 for (agno = 1; agno < nagcount; agno++) { 383 for (agno = 1; agno < nagcount; agno++) {
@@ -381,8 +385,8 @@ xfs_growfs_data_private(
381 XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), 385 XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
382 XFS_FSS_TO_BB(mp, 1), 0, &bp); 386 XFS_FSS_TO_BB(mp, 1), 0, &bp);
383 if (error) { 387 if (error) {
384 xfs_fs_cmn_err(CE_WARN, mp, 388 xfs_warn(mp,
385 "error %d reading secondary superblock for ag %d", 389 "error %d reading secondary superblock for ag %d",
386 error, agno); 390 error, agno);
387 break; 391 break;
388 } 392 }
@@ -395,7 +399,7 @@ xfs_growfs_data_private(
395 if (!(error = xfs_bwrite(mp, bp))) { 399 if (!(error = xfs_bwrite(mp, bp))) {
396 continue; 400 continue;
397 } else { 401 } else {
398 xfs_fs_cmn_err(CE_WARN, mp, 402 xfs_warn(mp,
399 "write error %d updating secondary superblock for ag %d", 403 "write error %d updating secondary superblock for ag %d",
400 error, agno); 404 error, agno);
401 break; /* no point in continuing */ 405 break; /* no point in continuing */
@@ -611,12 +615,13 @@ out:
611 * 615 *
612 * We cannot use an inode here for this - that will push dirty state back up 616 * We cannot use an inode here for this - that will push dirty state back up
613 * into the VFS and then periodic inode flushing will prevent log covering from 617 * into the VFS and then periodic inode flushing will prevent log covering from
614 * making progress. Hence we log a field in the superblock instead. 618 * making progress. Hence we log a field in the superblock instead and use a
619 * synchronous transaction to ensure the superblock is immediately unpinned
620 * and can be written back.
615 */ 621 */
616int 622int
617xfs_fs_log_dummy( 623xfs_fs_log_dummy(
618 xfs_mount_t *mp, 624 xfs_mount_t *mp)
619 int flags)
620{ 625{
621 xfs_trans_t *tp; 626 xfs_trans_t *tp;
622 int error; 627 int error;
@@ -631,8 +636,7 @@ xfs_fs_log_dummy(
631 636
632 /* log the UUID because it is an unchanging field */ 637 /* log the UUID because it is an unchanging field */
633 xfs_mod_sb(tp, XFS_SB_UUID); 638 xfs_mod_sb(tp, XFS_SB_UUID);
634 if (flags & SYNC_WAIT) 639 xfs_trans_set_sync(tp);
635 xfs_trans_set_sync(tp);
636 return xfs_trans_commit(tp, 0); 640 return xfs_trans_commit(tp, 0);
637} 641}
638 642
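
The added memset() in xfs_fs_geometry() gives every field of the output structure a defined value before the version-dependent fields are filled in, so nothing uninitialised is copied back to userspace. A generic sketch of the same idea with invented structure and field names:

    /*
     * Sketch only: zero the whole output structure up front so fields that a
     * given version does not set cannot carry indeterminate memory back to
     * the caller.
     */
    #include <stdio.h>
    #include <string.h>

    struct geom_out {
        unsigned int blocksize;
        unsigned int agcount;
        unsigned int v4_only_field;     /* only filled for new_version >= 4 */
    };

    static void fill_geometry(struct geom_out *geo, int new_version)
    {
        memset(geo, 0, sizeof(*geo));   /* defined contents for every field */

        geo->blocksize = 4096;
        geo->agcount = 16;
        if (new_version >= 4)
            geo->v4_only_field = 1;
    }

    int main(void)
    {
        struct geom_out g;

        fill_geometry(&g, 3);
        printf("v4_only_field = %u\n", g.v4_only_field);    /* 0, not garbage */
        return 0;
    }
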
diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h
index a786c5212c1e..1b6a98b66886 100644
--- a/fs/xfs/xfs_fsops.h
+++ b/fs/xfs/xfs_fsops.h
@@ -25,6 +25,6 @@ extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt);
25extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval, 25extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval,
26 xfs_fsop_resblks_t *outval); 26 xfs_fsop_resblks_t *outval);
27extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags); 27extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags);
28extern int xfs_fs_log_dummy(xfs_mount_t *mp, int flags); 28extern int xfs_fs_log_dummy(struct xfs_mount *mp);
29 29
30#endif /* __XFS_FSOPS_H__ */ 30#endif /* __XFS_FSOPS_H__ */
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 0626a32c3447..84ebeec16642 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -1055,28 +1055,23 @@ xfs_difree(
1055 */ 1055 */
1056 agno = XFS_INO_TO_AGNO(mp, inode); 1056 agno = XFS_INO_TO_AGNO(mp, inode);
1057 if (agno >= mp->m_sb.sb_agcount) { 1057 if (agno >= mp->m_sb.sb_agcount) {
1058 cmn_err(CE_WARN, 1058 xfs_warn(mp, "%s: agno >= mp->m_sb.sb_agcount (%d >= %d).",
1059 "xfs_difree: agno >= mp->m_sb.sb_agcount (%d >= %d) on %s. Returning EINVAL.", 1059 __func__, agno, mp->m_sb.sb_agcount);
1060 agno, mp->m_sb.sb_agcount, mp->m_fsname);
1061 ASSERT(0); 1060 ASSERT(0);
1062 return XFS_ERROR(EINVAL); 1061 return XFS_ERROR(EINVAL);
1063 } 1062 }
1064 agino = XFS_INO_TO_AGINO(mp, inode); 1063 agino = XFS_INO_TO_AGINO(mp, inode);
1065 if (inode != XFS_AGINO_TO_INO(mp, agno, agino)) { 1064 if (inode != XFS_AGINO_TO_INO(mp, agno, agino)) {
1066 cmn_err(CE_WARN, 1065 xfs_warn(mp, "%s: inode != XFS_AGINO_TO_INO() (%llu != %llu).",
1067 "xfs_difree: inode != XFS_AGINO_TO_INO() " 1066 __func__, (unsigned long long)inode,
1068 "(%llu != %llu) on %s. Returning EINVAL.", 1067 (unsigned long long)XFS_AGINO_TO_INO(mp, agno, agino));
1069 (unsigned long long)inode,
1070 (unsigned long long)XFS_AGINO_TO_INO(mp, agno, agino),
1071 mp->m_fsname);
1072 ASSERT(0); 1068 ASSERT(0);
1073 return XFS_ERROR(EINVAL); 1069 return XFS_ERROR(EINVAL);
1074 } 1070 }
1075 agbno = XFS_AGINO_TO_AGBNO(mp, agino); 1071 agbno = XFS_AGINO_TO_AGBNO(mp, agino);
1076 if (agbno >= mp->m_sb.sb_agblocks) { 1072 if (agbno >= mp->m_sb.sb_agblocks) {
1077 cmn_err(CE_WARN, 1073 xfs_warn(mp, "%s: agbno >= mp->m_sb.sb_agblocks (%d >= %d).",
1078 "xfs_difree: agbno >= mp->m_sb.sb_agblocks (%d >= %d) on %s. Returning EINVAL.", 1074 __func__, agbno, mp->m_sb.sb_agblocks);
1079 agbno, mp->m_sb.sb_agblocks, mp->m_fsname);
1080 ASSERT(0); 1075 ASSERT(0);
1081 return XFS_ERROR(EINVAL); 1076 return XFS_ERROR(EINVAL);
1082 } 1077 }
@@ -1085,9 +1080,8 @@ xfs_difree(
1085 */ 1080 */
1086 error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); 1081 error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
1087 if (error) { 1082 if (error) {
1088 cmn_err(CE_WARN, 1083 xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.",
1089 "xfs_difree: xfs_ialloc_read_agi() returned an error %d on %s. Returning error.", 1084 __func__, error);
1090 error, mp->m_fsname);
1091 return error; 1085 return error;
1092 } 1086 }
1093 agi = XFS_BUF_TO_AGI(agbp); 1087 agi = XFS_BUF_TO_AGI(agbp);
@@ -1106,17 +1100,15 @@ xfs_difree(
1106 * Look for the entry describing this inode. 1100 * Look for the entry describing this inode.
1107 */ 1101 */
1108 if ((error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i))) { 1102 if ((error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i))) {
1109 cmn_err(CE_WARN, 1103 xfs_warn(mp, "%s: xfs_inobt_lookup() returned error %d.",
1110 "xfs_difree: xfs_inobt_lookup returned() an error %d on %s. Returning error.", 1104 __func__, error);
1111 error, mp->m_fsname);
1112 goto error0; 1105 goto error0;
1113 } 1106 }
1114 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1107 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1115 error = xfs_inobt_get_rec(cur, &rec, &i); 1108 error = xfs_inobt_get_rec(cur, &rec, &i);
1116 if (error) { 1109 if (error) {
1117 cmn_err(CE_WARN, 1110 xfs_warn(mp, "%s: xfs_inobt_get_rec() returned error %d.",
1118 "xfs_difree: xfs_inobt_get_rec() returned an error %d on %s. Returning error.", 1111 __func__, error);
1119 error, mp->m_fsname);
1120 goto error0; 1112 goto error0;
1121 } 1113 }
1122 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1114 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
@@ -1157,8 +1149,8 @@ xfs_difree(
1157 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1)); 1149 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1));
1158 1150
1159 if ((error = xfs_btree_delete(cur, &i))) { 1151 if ((error = xfs_btree_delete(cur, &i))) {
1160 cmn_err(CE_WARN, "xfs_difree: xfs_btree_delete returned an error %d on %s.\n", 1152 xfs_warn(mp, "%s: xfs_btree_delete returned error %d.",
1161 error, mp->m_fsname); 1153 __func__, error);
1162 goto error0; 1154 goto error0;
1163 } 1155 }
1164 1156
@@ -1170,9 +1162,8 @@ xfs_difree(
1170 1162
1171 error = xfs_inobt_update(cur, &rec); 1163 error = xfs_inobt_update(cur, &rec);
1172 if (error) { 1164 if (error) {
1173 cmn_err(CE_WARN, 1165 xfs_warn(mp, "%s: xfs_inobt_update returned error %d.",
1174 "xfs_difree: xfs_inobt_update returned an error %d on %s.", 1166 __func__, error);
1175 error, mp->m_fsname);
1176 goto error0; 1167 goto error0;
1177 } 1168 }
1178 1169
@@ -1218,10 +1209,9 @@ xfs_imap_lookup(
1218 1209
1219 error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); 1210 error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
1220 if (error) { 1211 if (error) {
1221 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: " 1212 xfs_alert(mp,
1222 "xfs_ialloc_read_agi() returned " 1213 "%s: xfs_ialloc_read_agi() returned error %d, agno %d",
1223 "error %d, agno %d", 1214 __func__, error, agno);
1224 error, agno);
1225 return error; 1215 return error;
1226 } 1216 }
1227 1217
@@ -1299,24 +1289,21 @@ xfs_imap(
1299 if (flags & XFS_IGET_UNTRUSTED) 1289 if (flags & XFS_IGET_UNTRUSTED)
1300 return XFS_ERROR(EINVAL); 1290 return XFS_ERROR(EINVAL);
1301 if (agno >= mp->m_sb.sb_agcount) { 1291 if (agno >= mp->m_sb.sb_agcount) {
1302 xfs_fs_cmn_err(CE_ALERT, mp, 1292 xfs_alert(mp,
1303 "xfs_imap: agno (%d) >= " 1293 "%s: agno (%d) >= mp->m_sb.sb_agcount (%d)",
1304 "mp->m_sb.sb_agcount (%d)", 1294 __func__, agno, mp->m_sb.sb_agcount);
1305 agno, mp->m_sb.sb_agcount);
1306 } 1295 }
1307 if (agbno >= mp->m_sb.sb_agblocks) { 1296 if (agbno >= mp->m_sb.sb_agblocks) {
1308 xfs_fs_cmn_err(CE_ALERT, mp, 1297 xfs_alert(mp,
1309 "xfs_imap: agbno (0x%llx) >= " 1298 "%s: agbno (0x%llx) >= mp->m_sb.sb_agblocks (0x%lx)",
1310 "mp->m_sb.sb_agblocks (0x%lx)", 1299 __func__, (unsigned long long)agbno,
1311 (unsigned long long) agbno, 1300 (unsigned long)mp->m_sb.sb_agblocks);
1312 (unsigned long) mp->m_sb.sb_agblocks);
1313 } 1301 }
1314 if (ino != XFS_AGINO_TO_INO(mp, agno, agino)) { 1302 if (ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
1315 xfs_fs_cmn_err(CE_ALERT, mp, 1303 xfs_alert(mp,
1316 "xfs_imap: ino (0x%llx) != " 1304 "%s: ino (0x%llx) != XFS_AGINO_TO_INO() (0x%llx)",
1317 "XFS_AGINO_TO_INO(mp, agno, agino) " 1305 __func__, ino,
1318 "(0x%llx)", 1306 XFS_AGINO_TO_INO(mp, agno, agino));
1319 ino, XFS_AGINO_TO_INO(mp, agno, agino));
1320 } 1307 }
1321 xfs_stack_trace(); 1308 xfs_stack_trace();
1322#endif /* DEBUG */ 1309#endif /* DEBUG */
@@ -1388,10 +1375,9 @@ out_map:
1388 */ 1375 */
1389 if ((imap->im_blkno + imap->im_len) > 1376 if ((imap->im_blkno + imap->im_len) >
1390 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) { 1377 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
1391 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: " 1378 xfs_alert(mp,
1392 "(imap->im_blkno (0x%llx) + imap->im_len (0x%llx)) > " 1379 "%s: (im_blkno (0x%llx) + im_len (0x%llx)) > sb_dblocks (0x%llx)",
1393 " XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) (0x%llx)", 1380 __func__, (unsigned long long) imap->im_blkno,
1394 (unsigned long long) imap->im_blkno,
1395 (unsigned long long) imap->im_len, 1381 (unsigned long long) imap->im_len,
1396 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)); 1382 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
1397 return XFS_ERROR(EINVAL); 1383 return XFS_ERROR(EINVAL);
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 0cdd26932d8e..cb9b6d1469f7 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -43,6 +43,17 @@
43 43
44 44
45/* 45/*
46 * Define xfs inode iolock lockdep classes. We need to ensure that all active
47 * inodes are considered the same for lockdep purposes, including inodes that
48 * are recycled through the XFS_IRECLAIMABLE state. This is the only way to
49 * guarantee the locks are considered the same when there are multiple lock
50 * initialisation sites. Also, define a reclaimable inode class so it is
51 * obvious in lockdep reports which class the report is against.
52 */
53static struct lock_class_key xfs_iolock_active;
54struct lock_class_key xfs_iolock_reclaimable;
55
56/*
46 * Allocate and initialise an xfs_inode. 57 * Allocate and initialise an xfs_inode.
47 */ 58 */
48STATIC struct xfs_inode * 59STATIC struct xfs_inode *
@@ -69,8 +80,11 @@ xfs_inode_alloc(
69 ASSERT(atomic_read(&ip->i_pincount) == 0); 80 ASSERT(atomic_read(&ip->i_pincount) == 0);
70 ASSERT(!spin_is_locked(&ip->i_flags_lock)); 81 ASSERT(!spin_is_locked(&ip->i_flags_lock));
71 ASSERT(completion_done(&ip->i_flush)); 82 ASSERT(completion_done(&ip->i_flush));
83 ASSERT(ip->i_ino == 0);
72 84
73 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); 85 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
86 lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
87 &xfs_iolock_active, "xfs_iolock_active");
74 88
75 /* initialise the xfs inode */ 89 /* initialise the xfs inode */
76 ip->i_ino = ino; 90 ip->i_ino = ino;
@@ -85,12 +99,20 @@ xfs_inode_alloc(
85 ip->i_size = 0; 99 ip->i_size = 0;
86 ip->i_new_size = 0; 100 ip->i_new_size = 0;
87 101
88 /* prevent anyone from using this yet */
89 VFS_I(ip)->i_state = I_NEW;
90
91 return ip; 102 return ip;
92} 103}
93 104
105STATIC void
106xfs_inode_free_callback(
107 struct rcu_head *head)
108{
109 struct inode *inode = container_of(head, struct inode, i_rcu);
110 struct xfs_inode *ip = XFS_I(inode);
111
112 INIT_LIST_HEAD(&inode->i_dentry);
113 kmem_zone_free(xfs_inode_zone, ip);
114}
115
94void 116void
95xfs_inode_free( 117xfs_inode_free(
96 struct xfs_inode *ip) 118 struct xfs_inode *ip)
@@ -134,7 +156,18 @@ xfs_inode_free(
134 ASSERT(!spin_is_locked(&ip->i_flags_lock)); 156 ASSERT(!spin_is_locked(&ip->i_flags_lock));
135 ASSERT(completion_done(&ip->i_flush)); 157 ASSERT(completion_done(&ip->i_flush));
136 158
137 kmem_zone_free(xfs_inode_zone, ip); 159 /*
160 * Because we use RCU freeing we need to ensure the inode always
161 * appears to be reclaimed with an invalid inode number when in the
162 * free state. The ip->i_flags_lock provides the barrier against lookup
163 * races.
164 */
165 spin_lock(&ip->i_flags_lock);
166 ip->i_flags = XFS_IRECLAIM;
167 ip->i_ino = 0;
168 spin_unlock(&ip->i_flags_lock);
169
170 call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
138} 171}
139 172
140/* 173/*
@@ -144,14 +177,29 @@ static int
144xfs_iget_cache_hit( 177xfs_iget_cache_hit(
145 struct xfs_perag *pag, 178 struct xfs_perag *pag,
146 struct xfs_inode *ip, 179 struct xfs_inode *ip,
180 xfs_ino_t ino,
147 int flags, 181 int flags,
148 int lock_flags) __releases(pag->pag_ici_lock) 182 int lock_flags) __releases(RCU)
149{ 183{
150 struct inode *inode = VFS_I(ip); 184 struct inode *inode = VFS_I(ip);
151 struct xfs_mount *mp = ip->i_mount; 185 struct xfs_mount *mp = ip->i_mount;
152 int error; 186 int error;
153 187
188 /*
189 * check for re-use of an inode within an RCU grace period due to the
190 * radix tree nodes not being updated yet. We monitor for this by
191 * setting the inode number to zero before freeing the inode structure.
192 * If the inode has been reallocated and set up, then the inode number
193 * will not match, so check for that, too.
194 */
154 spin_lock(&ip->i_flags_lock); 195 spin_lock(&ip->i_flags_lock);
196 if (ip->i_ino != ino) {
197 trace_xfs_iget_skip(ip);
198 XFS_STATS_INC(xs_ig_frecycle);
199 error = EAGAIN;
200 goto out_error;
201 }
202
155 203
156 /* 204 /*
157 * If we are racing with another cache hit that is currently 205 * If we are racing with another cache hit that is currently
@@ -194,7 +242,7 @@ xfs_iget_cache_hit(
194 ip->i_flags |= XFS_IRECLAIM; 242 ip->i_flags |= XFS_IRECLAIM;
195 243
196 spin_unlock(&ip->i_flags_lock); 244 spin_unlock(&ip->i_flags_lock);
197 read_unlock(&pag->pag_ici_lock); 245 rcu_read_unlock();
198 246
199 error = -inode_init_always(mp->m_super, inode); 247 error = -inode_init_always(mp->m_super, inode);
200 if (error) { 248 if (error) {
@@ -202,7 +250,7 @@ xfs_iget_cache_hit(
202 * Re-initializing the inode failed, and we are in deep 250 * Re-initializing the inode failed, and we are in deep
203 * trouble. Try to re-add it to the reclaim list. 251 * trouble. Try to re-add it to the reclaim list.
204 */ 252 */
205 read_lock(&pag->pag_ici_lock); 253 rcu_read_lock();
206 spin_lock(&ip->i_flags_lock); 254 spin_lock(&ip->i_flags_lock);
207 255
208 ip->i_flags &= ~XFS_INEW; 256 ip->i_flags &= ~XFS_INEW;
@@ -212,14 +260,20 @@ xfs_iget_cache_hit(
212 goto out_error; 260 goto out_error;
213 } 261 }
214 262
215 write_lock(&pag->pag_ici_lock); 263 spin_lock(&pag->pag_ici_lock);
216 spin_lock(&ip->i_flags_lock); 264 spin_lock(&ip->i_flags_lock);
217 ip->i_flags &= ~(XFS_IRECLAIMABLE | XFS_IRECLAIM); 265 ip->i_flags &= ~(XFS_IRECLAIMABLE | XFS_IRECLAIM);
218 ip->i_flags |= XFS_INEW; 266 ip->i_flags |= XFS_INEW;
219 __xfs_inode_clear_reclaim_tag(mp, pag, ip); 267 __xfs_inode_clear_reclaim_tag(mp, pag, ip);
220 inode->i_state = I_NEW; 268 inode->i_state = I_NEW;
269
270 ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
271 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
272 lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
273 &xfs_iolock_active, "xfs_iolock_active");
274
221 spin_unlock(&ip->i_flags_lock); 275 spin_unlock(&ip->i_flags_lock);
222 write_unlock(&pag->pag_ici_lock); 276 spin_unlock(&pag->pag_ici_lock);
223 } else { 277 } else {
224 /* If the VFS inode is being torn down, pause and try again. */ 278 /* If the VFS inode is being torn down, pause and try again. */
225 if (!igrab(inode)) { 279 if (!igrab(inode)) {
@@ -230,7 +284,7 @@ xfs_iget_cache_hit(
230 284
231 /* We've got a live one. */ 285 /* We've got a live one. */
232 spin_unlock(&ip->i_flags_lock); 286 spin_unlock(&ip->i_flags_lock);
233 read_unlock(&pag->pag_ici_lock); 287 rcu_read_unlock();
234 trace_xfs_iget_hit(ip); 288 trace_xfs_iget_hit(ip);
235 } 289 }
236 290
@@ -244,7 +298,7 @@ xfs_iget_cache_hit(
244 298
245out_error: 299out_error:
246 spin_unlock(&ip->i_flags_lock); 300 spin_unlock(&ip->i_flags_lock);
247 read_unlock(&pag->pag_ici_lock); 301 rcu_read_unlock();
248 return error; 302 return error;
249} 303}
250 304
@@ -297,7 +351,7 @@ xfs_iget_cache_miss(
297 BUG(); 351 BUG();
298 } 352 }
299 353
300 write_lock(&pag->pag_ici_lock); 354 spin_lock(&pag->pag_ici_lock);
301 355
302 /* insert the new inode */ 356 /* insert the new inode */
303 error = radix_tree_insert(&pag->pag_ici_root, agino, ip); 357 error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
@@ -312,14 +366,14 @@ xfs_iget_cache_miss(
312 ip->i_udquot = ip->i_gdquot = NULL; 366 ip->i_udquot = ip->i_gdquot = NULL;
313 xfs_iflags_set(ip, XFS_INEW); 367 xfs_iflags_set(ip, XFS_INEW);
314 368
315 write_unlock(&pag->pag_ici_lock); 369 spin_unlock(&pag->pag_ici_lock);
316 radix_tree_preload_end(); 370 radix_tree_preload_end();
317 371
318 *ipp = ip; 372 *ipp = ip;
319 return 0; 373 return 0;
320 374
321out_preload_end: 375out_preload_end:
322 write_unlock(&pag->pag_ici_lock); 376 spin_unlock(&pag->pag_ici_lock);
323 radix_tree_preload_end(); 377 radix_tree_preload_end();
324 if (lock_flags) 378 if (lock_flags)
325 xfs_iunlock(ip, lock_flags); 379 xfs_iunlock(ip, lock_flags);
@@ -366,7 +420,7 @@ xfs_iget(
366 xfs_agino_t agino; 420 xfs_agino_t agino;
367 421
368 /* reject inode numbers outside existing AGs */ 422 /* reject inode numbers outside existing AGs */
369 if (XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount) 423 if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
370 return EINVAL; 424 return EINVAL;
371 425
372 /* get the perag structure and ensure that it's inode capable */ 426 /* get the perag structure and ensure that it's inode capable */
@@ -375,15 +429,15 @@ xfs_iget(
375 429
376again: 430again:
377 error = 0; 431 error = 0;
378 read_lock(&pag->pag_ici_lock); 432 rcu_read_lock();
379 ip = radix_tree_lookup(&pag->pag_ici_root, agino); 433 ip = radix_tree_lookup(&pag->pag_ici_root, agino);
380 434
381 if (ip) { 435 if (ip) {
382 error = xfs_iget_cache_hit(pag, ip, flags, lock_flags); 436 error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
383 if (error) 437 if (error)
384 goto out_error_or_again; 438 goto out_error_or_again;
385 } else { 439 } else {
386 read_unlock(&pag->pag_ici_lock); 440 rcu_read_unlock();
387 XFS_STATS_INC(xs_ig_missed); 441 XFS_STATS_INC(xs_ig_missed);
388 442
389 error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip, 443 error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
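
With the inode cache lookups converted to rcu_read_lock(), a lookup can race with an inode being freed or recycled, so the inode number is zeroed before the RCU free and every cache hit re-checks it under i_flags_lock. A simplified userspace model of that revalidate-after-lookup step; the mutex stands in for the spinlock and there is no real RCU here:

    /*
     * Model only, not the kernel code: the cache frees objects lazily, so a
     * lookup can return a recycled object. The key is zeroed before the object
     * is queued for freeing, and every hit re-checks the key under the
     * object's lock.
     */
    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct cached_inode {
        pthread_mutex_t lock;
        unsigned long ino;      /* 0 means "being freed / recycled" */
    };

    static bool lookup_hit_is_valid(struct cached_inode *ip, unsigned long want_ino)
    {
        bool ok;

        pthread_mutex_lock(&ip->lock);
        ok = (ip->ino == want_ino);     /* stale or reallocated hits fail here */
        pthread_mutex_unlock(&ip->lock);

        if (!ok)
            printf("stale cache hit for inode %lu, retry the lookup\n", want_ino);
        return ok;
    }

    int main(void)
    {
        struct cached_inode ip = { PTHREAD_MUTEX_INITIALIZER, 0 };

        lookup_hit_is_valid(&ip, 42);   /* object already marked freed */
        ip.ino = 42;
        return lookup_hit_is_valid(&ip, 42) ? 0 : 1;
    }

In the hunk above this corresponds to the ip->i_ino != ino check added to xfs_iget_cache_hit() and the i_ino = 0 store done under i_flags_lock in xfs_inode_free().
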
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 108c7a085f94..a37480a6e023 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -110,8 +110,8 @@ xfs_inobp_check(
110 dip = (xfs_dinode_t *)xfs_buf_offset(bp, 110 dip = (xfs_dinode_t *)xfs_buf_offset(bp,
111 i * mp->m_sb.sb_inodesize); 111 i * mp->m_sb.sb_inodesize);
112 if (!dip->di_next_unlinked) { 112 if (!dip->di_next_unlinked) {
113 xfs_fs_cmn_err(CE_ALERT, mp, 113 xfs_alert(mp,
114 "Detected a bogus zero next_unlinked field in incore inode buffer 0x%p. About to pop an ASSERT.", 114 "Detected bogus zero next_unlinked field in incore inode buffer 0x%p.",
115 bp); 115 bp);
116 ASSERT(dip->di_next_unlinked); 116 ASSERT(dip->di_next_unlinked);
117 } 117 }
@@ -142,10 +142,9 @@ xfs_imap_to_bp(
142 (int)imap->im_len, buf_flags, &bp); 142 (int)imap->im_len, buf_flags, &bp);
143 if (error) { 143 if (error) {
144 if (error != EAGAIN) { 144 if (error != EAGAIN) {
145 cmn_err(CE_WARN, 145 xfs_warn(mp,
146 "xfs_imap_to_bp: xfs_trans_read_buf()returned " 146 "%s: xfs_trans_read_buf() returned error %d.",
147 "an error %d on %s. Returning error.", 147 __func__, error);
148 error, mp->m_fsname);
149 } else { 148 } else {
150 ASSERT(buf_flags & XBF_TRYLOCK); 149 ASSERT(buf_flags & XBF_TRYLOCK);
151 } 150 }
@@ -180,12 +179,11 @@ xfs_imap_to_bp(
180 XFS_CORRUPTION_ERROR("xfs_imap_to_bp", 179 XFS_CORRUPTION_ERROR("xfs_imap_to_bp",
181 XFS_ERRLEVEL_HIGH, mp, dip); 180 XFS_ERRLEVEL_HIGH, mp, dip);
182#ifdef DEBUG 181#ifdef DEBUG
183 cmn_err(CE_PANIC, 182 xfs_emerg(mp,
184 "Device %s - bad inode magic/vsn " 183 "bad inode magic/vsn daddr %lld #%d (magic=%x)",
185 "daddr %lld #%d (magic=%x)",
186 XFS_BUFTARG_NAME(mp->m_ddev_targp),
187 (unsigned long long)imap->im_blkno, i, 184 (unsigned long long)imap->im_blkno, i,
188 be16_to_cpu(dip->di_magic)); 185 be16_to_cpu(dip->di_magic));
186 ASSERT(0);
189#endif 187#endif
190 xfs_trans_brelse(tp, bp); 188 xfs_trans_brelse(tp, bp);
191 return XFS_ERROR(EFSCORRUPTED); 189 return XFS_ERROR(EFSCORRUPTED);
@@ -317,7 +315,7 @@ xfs_iformat(
317 if (unlikely(be32_to_cpu(dip->di_nextents) + 315 if (unlikely(be32_to_cpu(dip->di_nextents) +
318 be16_to_cpu(dip->di_anextents) > 316 be16_to_cpu(dip->di_anextents) >
319 be64_to_cpu(dip->di_nblocks))) { 317 be64_to_cpu(dip->di_nblocks))) {
320 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 318 xfs_warn(ip->i_mount,
321 "corrupt dinode %Lu, extent total = %d, nblocks = %Lu.", 319 "corrupt dinode %Lu, extent total = %d, nblocks = %Lu.",
322 (unsigned long long)ip->i_ino, 320 (unsigned long long)ip->i_ino,
323 (int)(be32_to_cpu(dip->di_nextents) + 321 (int)(be32_to_cpu(dip->di_nextents) +
@@ -330,8 +328,7 @@ xfs_iformat(
330 } 328 }
331 329
332 if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) { 330 if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) {
333 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 331 xfs_warn(ip->i_mount, "corrupt dinode %Lu, forkoff = 0x%x.",
334 "corrupt dinode %Lu, forkoff = 0x%x.",
335 (unsigned long long)ip->i_ino, 332 (unsigned long long)ip->i_ino,
336 dip->di_forkoff); 333 dip->di_forkoff);
337 XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW, 334 XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW,
@@ -341,7 +338,7 @@ xfs_iformat(
341 338
342 if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) && 339 if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) &&
343 !ip->i_mount->m_rtdev_targp)) { 340 !ip->i_mount->m_rtdev_targp)) {
344 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 341 xfs_warn(ip->i_mount,
345 "corrupt dinode %Lu, has realtime flag set.", 342 "corrupt dinode %Lu, has realtime flag set.",
346 ip->i_ino); 343 ip->i_ino);
347 XFS_CORRUPTION_ERROR("xfs_iformat(realtime)", 344 XFS_CORRUPTION_ERROR("xfs_iformat(realtime)",
@@ -373,9 +370,8 @@ xfs_iformat(
373 * no local regular files yet 370 * no local regular files yet
374 */ 371 */
375 if (unlikely((be16_to_cpu(dip->di_mode) & S_IFMT) == S_IFREG)) { 372 if (unlikely((be16_to_cpu(dip->di_mode) & S_IFMT) == S_IFREG)) {
376 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 373 xfs_warn(ip->i_mount,
377 "corrupt inode %Lu " 374 "corrupt inode %Lu (local format for regular file).",
378 "(local format for regular file).",
379 (unsigned long long) ip->i_ino); 375 (unsigned long long) ip->i_ino);
380 XFS_CORRUPTION_ERROR("xfs_iformat(4)", 376 XFS_CORRUPTION_ERROR("xfs_iformat(4)",
381 XFS_ERRLEVEL_LOW, 377 XFS_ERRLEVEL_LOW,
@@ -385,9 +381,8 @@ xfs_iformat(
385 381
386 di_size = be64_to_cpu(dip->di_size); 382 di_size = be64_to_cpu(dip->di_size);
387 if (unlikely(di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) { 383 if (unlikely(di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) {
388 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 384 xfs_warn(ip->i_mount,
389 "corrupt inode %Lu " 385 "corrupt inode %Lu (bad size %Ld for local inode).",
390 "(bad size %Ld for local inode).",
391 (unsigned long long) ip->i_ino, 386 (unsigned long long) ip->i_ino,
392 (long long) di_size); 387 (long long) di_size);
393 XFS_CORRUPTION_ERROR("xfs_iformat(5)", 388 XFS_CORRUPTION_ERROR("xfs_iformat(5)",
@@ -431,9 +426,8 @@ xfs_iformat(
431 size = be16_to_cpu(atp->hdr.totsize); 426 size = be16_to_cpu(atp->hdr.totsize);
432 427
433 if (unlikely(size < sizeof(struct xfs_attr_sf_hdr))) { 428 if (unlikely(size < sizeof(struct xfs_attr_sf_hdr))) {
434 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 429 xfs_warn(ip->i_mount,
435 "corrupt inode %Lu " 430 "corrupt inode %Lu (bad attr fork size %Ld).",
436 "(bad attr fork size %Ld).",
437 (unsigned long long) ip->i_ino, 431 (unsigned long long) ip->i_ino,
438 (long long) size); 432 (long long) size);
439 XFS_CORRUPTION_ERROR("xfs_iformat(8)", 433 XFS_CORRUPTION_ERROR("xfs_iformat(8)",
@@ -488,9 +482,8 @@ xfs_iformat_local(
488 * kmem_alloc() or memcpy() below. 482 * kmem_alloc() or memcpy() below.
489 */ 483 */
490 if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) { 484 if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
491 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 485 xfs_warn(ip->i_mount,
492 "corrupt inode %Lu " 486 "corrupt inode %Lu (bad size %d for local fork, size = %d).",
493 "(bad size %d for local fork, size = %d).",
494 (unsigned long long) ip->i_ino, size, 487 (unsigned long long) ip->i_ino, size,
495 XFS_DFORK_SIZE(dip, ip->i_mount, whichfork)); 488 XFS_DFORK_SIZE(dip, ip->i_mount, whichfork));
496 XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW, 489 XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW,
@@ -547,8 +540,7 @@ xfs_iformat_extents(
547 * kmem_alloc() or memcpy() below. 540 * kmem_alloc() or memcpy() below.
548 */ 541 */
549 if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) { 542 if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
550 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 543 xfs_warn(ip->i_mount, "corrupt inode %Lu ((a)extents = %d).",
551 "corrupt inode %Lu ((a)extents = %d).",
552 (unsigned long long) ip->i_ino, nex); 544 (unsigned long long) ip->i_ino, nex);
553 XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW, 545 XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW,
554 ip->i_mount, dip); 546 ip->i_mount, dip);
@@ -623,11 +615,10 @@ xfs_iformat_btree(
623 || XFS_BMDR_SPACE_CALC(nrecs) > 615 || XFS_BMDR_SPACE_CALC(nrecs) >
624 XFS_DFORK_SIZE(dip, ip->i_mount, whichfork) 616 XFS_DFORK_SIZE(dip, ip->i_mount, whichfork)
625 || XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) { 617 || XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) {
626 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 618 xfs_warn(ip->i_mount, "corrupt inode %Lu (btree).",
627 "corrupt inode %Lu (btree).",
628 (unsigned long long) ip->i_ino); 619 (unsigned long long) ip->i_ino);
629 XFS_ERROR_REPORT("xfs_iformat_btree", XFS_ERRLEVEL_LOW, 620 XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW,
630 ip->i_mount); 621 ip->i_mount, dip);
631 return XFS_ERROR(EFSCORRUPTED); 622 return XFS_ERROR(EFSCORRUPTED);
632 } 623 }
633 624
@@ -813,11 +804,9 @@ xfs_iread(
813 */ 804 */
814 if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC) { 805 if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC) {
815#ifdef DEBUG 806#ifdef DEBUG
816 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: " 807 xfs_alert(mp,
817 "dip->di_magic (0x%x) != " 808 "%s: dip->di_magic (0x%x) != XFS_DINODE_MAGIC (0x%x)",
818 "XFS_DINODE_MAGIC (0x%x)", 809 __func__, be16_to_cpu(dip->di_magic), XFS_DINODE_MAGIC);
819 be16_to_cpu(dip->di_magic),
820 XFS_DINODE_MAGIC);
821#endif /* DEBUG */ 810#endif /* DEBUG */
822 error = XFS_ERROR(EINVAL); 811 error = XFS_ERROR(EINVAL);
823 goto out_brelse; 812 goto out_brelse;
@@ -835,9 +824,8 @@ xfs_iread(
835 error = xfs_iformat(ip, dip); 824 error = xfs_iformat(ip, dip);
836 if (error) { 825 if (error) {
837#ifdef DEBUG 826#ifdef DEBUG
838 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: " 827 xfs_alert(mp, "%s: xfs_iformat() returned error %d",
839 "xfs_iformat() returned error %d", 828 __func__, error);
840 error);
841#endif /* DEBUG */ 829#endif /* DEBUG */
842 goto out_brelse; 830 goto out_brelse;
843 } 831 }
@@ -887,7 +875,7 @@ xfs_iread(
887 * around for a while. This helps to keep recently accessed 875 * around for a while. This helps to keep recently accessed
888 * meta-data in-core longer. 876 * meta-data in-core longer.
889 */ 877 */
890 XFS_BUF_SET_REF(bp, XFS_INO_REF); 878 xfs_buf_set_ref(bp, XFS_INO_REF);
891 879
892 /* 880 /*
893 * Use xfs_trans_brelse() to release the buffer containing the 881 * Use xfs_trans_brelse() to release the buffer containing the
@@ -1016,8 +1004,8 @@ xfs_ialloc(
1016 * This is because we're setting fields here we need 1004 * This is because we're setting fields here we need
1017 * to prevent others from looking at until we're done. 1005 * to prevent others from looking at until we're done.
1018 */ 1006 */
1019 error = xfs_trans_iget(tp->t_mountp, tp, ino, 1007 error = xfs_iget(tp->t_mountp, tp, ino, XFS_IGET_CREATE,
1020 XFS_IGET_CREATE, XFS_ILOCK_EXCL, &ip); 1008 XFS_ILOCK_EXCL, &ip);
1021 if (error) 1009 if (error)
1022 return error; 1010 return error;
1023 ASSERT(ip != NULL); 1011 ASSERT(ip != NULL);
@@ -1166,6 +1154,7 @@ xfs_ialloc(
1166 /* 1154 /*
1167 * Log the new values stuffed into the inode. 1155 * Log the new values stuffed into the inode.
1168 */ 1156 */
1157 xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
1169 xfs_trans_log_inode(tp, ip, flags); 1158 xfs_trans_log_inode(tp, ip, flags);
1170 1159
1171 /* now that we have an i_mode we can setup inode ops and unlock */ 1160 /* now that we have an i_mode we can setup inode ops and unlock */
@@ -1820,9 +1809,8 @@ xfs_iunlink_remove(
1820 */ 1809 */
1821 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK); 1810 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK);
1822 if (error) { 1811 if (error) {
1823 cmn_err(CE_WARN, 1812 xfs_warn(mp, "%s: xfs_itobp() returned error %d.",
1824 "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.", 1813 __func__, error);
1825 error, mp->m_fsname);
1826 return error; 1814 return error;
1827 } 1815 }
1828 next_agino = be32_to_cpu(dip->di_next_unlinked); 1816 next_agino = be32_to_cpu(dip->di_next_unlinked);
@@ -1867,9 +1855,9 @@ xfs_iunlink_remove(
1867 error = xfs_inotobp(mp, tp, next_ino, &last_dip, 1855 error = xfs_inotobp(mp, tp, next_ino, &last_dip,
1868 &last_ibp, &last_offset, 0); 1856 &last_ibp, &last_offset, 0);
1869 if (error) { 1857 if (error) {
1870 cmn_err(CE_WARN, 1858 xfs_warn(mp,
1871 "xfs_iunlink_remove: xfs_inotobp() returned an error %d on %s. Returning error.", 1859 "%s: xfs_inotobp() returned error %d.",
1872 error, mp->m_fsname); 1860 __func__, error);
1873 return error; 1861 return error;
1874 } 1862 }
1875 next_agino = be32_to_cpu(last_dip->di_next_unlinked); 1863 next_agino = be32_to_cpu(last_dip->di_next_unlinked);
@@ -1882,9 +1870,8 @@ xfs_iunlink_remove(
1882 */ 1870 */
1883 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK); 1871 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK);
1884 if (error) { 1872 if (error) {
1885 cmn_err(CE_WARN, 1873 xfs_warn(mp, "%s: xfs_itobp(2) returned error %d.",
1886 "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.", 1874 __func__, error);
1887 error, mp->m_fsname);
1888 return error; 1875 return error;
1889 } 1876 }
1890 next_agino = be32_to_cpu(dip->di_next_unlinked); 1877 next_agino = be32_to_cpu(dip->di_next_unlinked);
@@ -2000,17 +1987,33 @@ xfs_ifree_cluster(
2000 */ 1987 */
2001 for (i = 0; i < ninodes; i++) { 1988 for (i = 0; i < ninodes; i++) {
2002retry: 1989retry:
2003 read_lock(&pag->pag_ici_lock); 1990 rcu_read_lock();
2004 ip = radix_tree_lookup(&pag->pag_ici_root, 1991 ip = radix_tree_lookup(&pag->pag_ici_root,
2005 XFS_INO_TO_AGINO(mp, (inum + i))); 1992 XFS_INO_TO_AGINO(mp, (inum + i)));
2006 1993
2007 /* Inode not in memory or stale, nothing to do */ 1994 /* Inode not in memory, nothing to do */
2008 if (!ip || xfs_iflags_test(ip, XFS_ISTALE)) { 1995 if (!ip) {
2009 read_unlock(&pag->pag_ici_lock); 1996 rcu_read_unlock();
2010 continue; 1997 continue;
2011 } 1998 }
2012 1999
2013 /* 2000 /*
2001 * because this is an RCU protected lookup, we could
2002 * find a recently freed or even reallocated inode
2003 * during the lookup. We need to check under the
2004 * i_flags_lock for a valid inode here. Skip it if it
2005 * is not valid, the wrong inode or stale.
2006 */
2007 spin_lock(&ip->i_flags_lock);
2008 if (ip->i_ino != inum + i ||
2009 __xfs_iflags_test(ip, XFS_ISTALE)) {
2010 spin_unlock(&ip->i_flags_lock);
2011 rcu_read_unlock();
2012 continue;
2013 }
2014 spin_unlock(&ip->i_flags_lock);
2015
2016 /*
2014 * Don't try to lock/unlock the current inode, but we 2017 * Don't try to lock/unlock the current inode, but we
2015 * _cannot_ skip the other inodes that we did not find 2018 * _cannot_ skip the other inodes that we did not find
2016 * in the list attached to the buffer and are not 2019 * in the list attached to the buffer and are not
@@ -2019,11 +2022,11 @@ retry:
2019 */ 2022 */
2020 if (ip != free_ip && 2023 if (ip != free_ip &&
2021 !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { 2024 !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
2022 read_unlock(&pag->pag_ici_lock); 2025 rcu_read_unlock();
2023 delay(1); 2026 delay(1);
2024 goto retry; 2027 goto retry;
2025 } 2028 }
2026 read_unlock(&pag->pag_ici_lock); 2029 rcu_read_unlock();
2027 2030
2028 xfs_iflock(ip); 2031 xfs_iflock(ip);
2029 xfs_iflags_set(ip, XFS_ISTALE); 2032 xfs_iflags_set(ip, XFS_ISTALE);
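The change above swaps the pag_ici_lock read lock for an RCU read-side section, so the radix tree lookup only guarantees that the inode memory is not freed while rcu_read_lock() is held; the object it returns may already have been freed and reallocated. The new comment spells out the consequence: identity and staleness must be rechecked under i_flags_lock before the inode is trusted. Below is a minimal kernel-style sketch of that lookup-then-revalidate shape, using made-up names (struct obj, obj->id, OBJ_STALE) rather than the real XFS structures:

/* Sketch only: assumes <linux/radix-tree.h>, <linux/rcupdate.h>, <linux/spinlock.h>. */
#define OBJ_STALE       0x1

struct obj {
        spinlock_t      lock;           /* plays the role of i_flags_lock */
        unsigned long   id;             /* plays the role of i_ino */
        unsigned int    flags;
};

static void mark_obj_stale(struct radix_tree_root *tree, unsigned long id)
{
        struct obj      *o;

        rcu_read_lock();
        o = radix_tree_lookup(tree, id);
        if (!o) {
                rcu_read_unlock();
                return;                 /* not in memory, nothing to do */
        }

        /*
         * The RCU lookup can return a recently freed or even reallocated
         * object, so recheck identity and state under the object's own
         * lock before acting on it.
         */
        spin_lock(&o->lock);
        if (o->id != id || (o->flags & OBJ_STALE)) {
                spin_unlock(&o->lock);
                rcu_read_unlock();
                return;
        }
        o->flags |= OBJ_STALE;
        spin_unlock(&o->lock);
        rcu_read_unlock();
}

The real loop additionally has to back off with delay(1) and retry when xfs_ilock_nowait() fails, exactly as in the hunk above, because dropping rcu_read_lock() invalidates the pointer it looked up.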
@@ -2629,7 +2632,7 @@ xfs_iflush_cluster(
2629 2632
2630 mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1); 2633 mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
2631 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask; 2634 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
2632 read_lock(&pag->pag_ici_lock); 2635 rcu_read_lock();
2633 /* really need a gang lookup range call here */ 2636 /* really need a gang lookup range call here */
2634 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist, 2637 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist,
2635 first_index, inodes_per_cluster); 2638 first_index, inodes_per_cluster);
@@ -2640,9 +2643,21 @@ xfs_iflush_cluster(
2640 iq = ilist[i]; 2643 iq = ilist[i];
2641 if (iq == ip) 2644 if (iq == ip)
2642 continue; 2645 continue;
2643 /* if the inode lies outside this cluster, we're done. */ 2646
2644 if ((XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) 2647 /*
2645 break; 2648 * because this is an RCU protected lookup, we could find a
2649 * recently freed or even reallocated inode during the lookup.
2650 * We need to check under the i_flags_lock for a valid inode
2651 * here. Skip it if it is not valid or the wrong inode.
2652 */
2653 spin_lock(&ip->i_flags_lock);
2654 if (!ip->i_ino ||
2655 (XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) {
2656 spin_unlock(&ip->i_flags_lock);
2657 continue;
2658 }
2659 spin_unlock(&ip->i_flags_lock);
2660
2646 /* 2661 /*
2647 * Do an un-protected check to see if the inode is dirty and 2662 * Do an un-protected check to see if the inode is dirty and
2648 * is a candidate for flushing. These checks will be repeated 2663 * is a candidate for flushing. These checks will be repeated
@@ -2692,7 +2707,7 @@ xfs_iflush_cluster(
2692 } 2707 }
2693 2708
2694out_free: 2709out_free:
2695 read_unlock(&pag->pag_ici_lock); 2710 rcu_read_unlock();
2696 kmem_free(ilist); 2711 kmem_free(ilist);
2697out_put: 2712out_put:
2698 xfs_perag_put(pag); 2713 xfs_perag_put(pag);
@@ -2704,7 +2719,7 @@ cluster_corrupt_out:
2704 * Corruption detected in the clustering loop. Invalidate the 2719 * Corruption detected in the clustering loop. Invalidate the
2705 * inode buffer and shut down the filesystem. 2720 * inode buffer and shut down the filesystem.
2706 */ 2721 */
2707 read_unlock(&pag->pag_ici_lock); 2722 rcu_read_unlock();
2708 /* 2723 /*
2709 * Clean up the buffer. If it was B_DELWRI, just release it -- 2724 * Clean up the buffer. If it was B_DELWRI, just release it --
2710 * brelse can handle it with no problems. If not, shut down the 2725 * brelse can handle it with no problems. If not, shut down the
@@ -2774,7 +2789,7 @@ xfs_iflush(
2774 2789
2775 /* 2790 /*
2776 * We can't flush the inode until it is unpinned, so wait for it if we 2791 * We can't flush the inode until it is unpinned, so wait for it if we
2777 * are allowed to block. We know noone new can pin it, because we are 2792 * are allowed to block. We know no one new can pin it, because we are
2778 * holding the inode lock shared and you need to hold it exclusively to 2793 * holding the inode lock shared and you need to hold it exclusively to
2779 * pin the inode. 2794 * pin the inode.
2780 * 2795 *
@@ -2820,7 +2835,7 @@ xfs_iflush(
2820 * Get the buffer containing the on-disk inode. 2835 * Get the buffer containing the on-disk inode.
2821 */ 2836 */
2822 error = xfs_itobp(mp, NULL, ip, &dip, &bp, 2837 error = xfs_itobp(mp, NULL, ip, &dip, &bp,
2823 (flags & SYNC_WAIT) ? XBF_LOCK : XBF_TRYLOCK); 2838 (flags & SYNC_TRYLOCK) ? XBF_TRYLOCK : XBF_LOCK);
2824 if (error || !bp) { 2839 if (error || !bp) {
2825 xfs_ifunlock(ip); 2840 xfs_ifunlock(ip);
2826 return error; 2841 return error;
@@ -2911,16 +2926,16 @@ xfs_iflush_int(
2911 2926
2912 if (XFS_TEST_ERROR(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC, 2927 if (XFS_TEST_ERROR(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC,
2913 mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) { 2928 mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) {
2914 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 2929 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2915 "xfs_iflush: Bad inode %Lu magic number 0x%x, ptr 0x%p", 2930 "%s: Bad inode %Lu magic number 0x%x, ptr 0x%p",
2916 ip->i_ino, be16_to_cpu(dip->di_magic), dip); 2931 __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip);
2917 goto corrupt_out; 2932 goto corrupt_out;
2918 } 2933 }
2919 if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC, 2934 if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC,
2920 mp, XFS_ERRTAG_IFLUSH_2, XFS_RANDOM_IFLUSH_2)) { 2935 mp, XFS_ERRTAG_IFLUSH_2, XFS_RANDOM_IFLUSH_2)) {
2921 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 2936 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2922 "xfs_iflush: Bad inode %Lu, ptr 0x%p, magic number 0x%x", 2937 "%s: Bad inode %Lu, ptr 0x%p, magic number 0x%x",
2923 ip->i_ino, ip, ip->i_d.di_magic); 2938 __func__, ip->i_ino, ip, ip->i_d.di_magic);
2924 goto corrupt_out; 2939 goto corrupt_out;
2925 } 2940 }
2926 if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) { 2941 if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) {
@@ -2928,9 +2943,9 @@ xfs_iflush_int(
2928 (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) && 2943 (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
2929 (ip->i_d.di_format != XFS_DINODE_FMT_BTREE), 2944 (ip->i_d.di_format != XFS_DINODE_FMT_BTREE),
2930 mp, XFS_ERRTAG_IFLUSH_3, XFS_RANDOM_IFLUSH_3)) { 2945 mp, XFS_ERRTAG_IFLUSH_3, XFS_RANDOM_IFLUSH_3)) {
2931 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 2946 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2932 "xfs_iflush: Bad regular inode %Lu, ptr 0x%p", 2947 "%s: Bad regular inode %Lu, ptr 0x%p",
2933 ip->i_ino, ip); 2948 __func__, ip->i_ino, ip);
2934 goto corrupt_out; 2949 goto corrupt_out;
2935 } 2950 }
2936 } else if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) { 2951 } else if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) {
@@ -2939,28 +2954,28 @@ xfs_iflush_int(
2939 (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) && 2954 (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
2940 (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL), 2955 (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL),
2941 mp, XFS_ERRTAG_IFLUSH_4, XFS_RANDOM_IFLUSH_4)) { 2956 mp, XFS_ERRTAG_IFLUSH_4, XFS_RANDOM_IFLUSH_4)) {
2942 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 2957 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2943 "xfs_iflush: Bad directory inode %Lu, ptr 0x%p", 2958 "%s: Bad directory inode %Lu, ptr 0x%p",
2944 ip->i_ino, ip); 2959 __func__, ip->i_ino, ip);
2945 goto corrupt_out; 2960 goto corrupt_out;
2946 } 2961 }
2947 } 2962 }
2948 if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents > 2963 if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents >
2949 ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5, 2964 ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5,
2950 XFS_RANDOM_IFLUSH_5)) { 2965 XFS_RANDOM_IFLUSH_5)) {
2951 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 2966 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2952 "xfs_iflush: detected corrupt incore inode %Lu, total extents = %d, nblocks = %Ld, ptr 0x%p", 2967 "%s: detected corrupt incore inode %Lu, "
2953 ip->i_ino, 2968 "total extents = %d, nblocks = %Ld, ptr 0x%p",
2969 __func__, ip->i_ino,
2954 ip->i_d.di_nextents + ip->i_d.di_anextents, 2970 ip->i_d.di_nextents + ip->i_d.di_anextents,
2955 ip->i_d.di_nblocks, 2971 ip->i_d.di_nblocks, ip);
2956 ip);
2957 goto corrupt_out; 2972 goto corrupt_out;
2958 } 2973 }
2959 if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize, 2974 if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize,
2960 mp, XFS_ERRTAG_IFLUSH_6, XFS_RANDOM_IFLUSH_6)) { 2975 mp, XFS_ERRTAG_IFLUSH_6, XFS_RANDOM_IFLUSH_6)) {
2961 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 2976 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2962 "xfs_iflush: bad inode %Lu, forkoff 0x%x, ptr 0x%p", 2977 "%s: bad inode %Lu, forkoff 0x%x, ptr 0x%p",
2963 ip->i_ino, ip->i_d.di_forkoff, ip); 2978 __func__, ip->i_ino, ip->i_d.di_forkoff, ip);
2964 goto corrupt_out; 2979 goto corrupt_out;
2965 } 2980 }
2966 /* 2981 /*
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index fb2ca2e4cdc9..ff4e2a30227d 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -111,7 +111,7 @@ struct xfs_imap {
111 * Generally, we do not want to hold the i_rlock while holding the 111 * Generally, we do not want to hold the i_rlock while holding the
112 * i_ilock. Hierarchy is i_iolock followed by i_rlock. 112 * i_ilock. Hierarchy is i_iolock followed by i_rlock.
113 * 113 *
114 * xfs_iptr_t contains all the inode fields upto and including the 114 * xfs_iptr_t contains all the inode fields up to and including the
115 * i_mnext and i_mprev fields, it is used as a marker in the inode 115 * i_mnext and i_mprev fields, it is used as a marker in the inode
116 * chain off the mount structure by xfs_sync calls. 116 * chain off the mount structure by xfs_sync calls.
117 */ 117 */
@@ -336,7 +336,7 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags)
336 336
337/* 337/*
338 * Project quota id helpers (previously projid was 16bit only 338 * Project quota id helpers (previously projid was 16bit only
339 * and using two 16bit values to hold new 32bit projid was choosen 339 * and using two 16bit values to hold new 32bit projid was chosen
340 * to retain compatibility with "old" filesystems). 340 * to retain compatibility with "old" filesystems).
341 */ 341 */
342static inline prid_t 342static inline prid_t
@@ -376,12 +376,13 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
376/* 376/*
377 * In-core inode flags. 377 * In-core inode flags.
378 */ 378 */
379#define XFS_IRECLAIM 0x0001 /* we have started reclaiming this inode */ 379#define XFS_IRECLAIM 0x0001 /* started reclaiming this inode */
380#define XFS_ISTALE 0x0002 /* inode has been staled */ 380#define XFS_ISTALE 0x0002 /* inode has been staled */
381#define XFS_IRECLAIMABLE 0x0004 /* inode can be reclaimed */ 381#define XFS_IRECLAIMABLE 0x0004 /* inode can be reclaimed */
382#define XFS_INEW 0x0008 /* inode has just been allocated */ 382#define XFS_INEW 0x0008 /* inode has just been allocated */
383#define XFS_IFILESTREAM 0x0010 /* inode is in a filestream directory */ 383#define XFS_IFILESTREAM 0x0010 /* inode is in a filestream directory */
384#define XFS_ITRUNCATED 0x0020 /* truncated down so flush-on-close */ 384#define XFS_ITRUNCATED 0x0020 /* truncated down so flush-on-close */
385#define XFS_IDIRTY_RELEASE 0x0040 /* dirty release already seen */
385 386
386/* 387/*
387 * Flags for inode locking. 388 * Flags for inode locking.
@@ -408,28 +409,35 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
408/* 409/*
409 * Flags for lockdep annotations. 410 * Flags for lockdep annotations.
410 * 411 *
411 * XFS_I[O]LOCK_PARENT - for operations that require locking two inodes 412 * XFS_LOCK_PARENT - for directory operations that require locking a
412 * (ie directory operations that require locking a directory inode and 413 * parent directory inode and a child entry inode. The parent gets locked
413 * an entry inode). The first inode gets locked with this flag so it 414 * with this flag so it gets a lockdep subclass of 1 and the child entry
414 * gets a lockdep subclass of 1 and the second lock will have a lockdep 415 * lock will have a lockdep subclass of 0.
415 * subclass of 0. 416 *
417 * XFS_LOCK_RTBITMAP/XFS_LOCK_RTSUM - the realtime device bitmap and summary
418 * inodes do not participate in the normal lock order, and thus have their
419 * own subclasses.
416 * 420 *
 417 * XFS_LOCK_INUMORDER - for locking several inodes at the same time 421 * XFS_LOCK_INUMORDER - for locking several inodes at the same time
418 * with xfs_lock_inodes(). This flag is used as the starting subclass 422 * with xfs_lock_inodes(). This flag is used as the starting subclass
419 * and each subsequent lock acquired will increment the subclass by one. 423 * and each subsequent lock acquired will increment the subclass by one.
420 * So the first lock acquired will have a lockdep subclass of 2, the 424 * So the first lock acquired will have a lockdep subclass of 4, the
421 * second lock will have a lockdep subclass of 3, and so on. It is 425 * second lock will have a lockdep subclass of 5, and so on. It is
422 * the responsibility of the class builder to shift this to the correct 426 * the responsibility of the class builder to shift this to the correct
423 * portion of the lock_mode lockdep mask. 427 * portion of the lock_mode lockdep mask.
424 */ 428 */
425#define XFS_LOCK_PARENT 1 429#define XFS_LOCK_PARENT 1
426#define XFS_LOCK_INUMORDER 2 430#define XFS_LOCK_RTBITMAP 2
431#define XFS_LOCK_RTSUM 3
432#define XFS_LOCK_INUMORDER 4
427 433
428#define XFS_IOLOCK_SHIFT 16 434#define XFS_IOLOCK_SHIFT 16
429#define XFS_IOLOCK_PARENT (XFS_LOCK_PARENT << XFS_IOLOCK_SHIFT) 435#define XFS_IOLOCK_PARENT (XFS_LOCK_PARENT << XFS_IOLOCK_SHIFT)
430 436
431#define XFS_ILOCK_SHIFT 24 437#define XFS_ILOCK_SHIFT 24
432#define XFS_ILOCK_PARENT (XFS_LOCK_PARENT << XFS_ILOCK_SHIFT) 438#define XFS_ILOCK_PARENT (XFS_LOCK_PARENT << XFS_ILOCK_SHIFT)
439#define XFS_ILOCK_RTBITMAP (XFS_LOCK_RTBITMAP << XFS_ILOCK_SHIFT)
440#define XFS_ILOCK_RTSUM (XFS_LOCK_RTSUM << XFS_ILOCK_SHIFT)
433 441
434#define XFS_IOLOCK_DEP_MASK 0x00ff0000 442#define XFS_IOLOCK_DEP_MASK 0x00ff0000
435#define XFS_ILOCK_DEP_MASK 0xff000000 443#define XFS_ILOCK_DEP_MASK 0xff000000
@@ -438,6 +446,8 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
438#define XFS_IOLOCK_DEP(flags) (((flags) & XFS_IOLOCK_DEP_MASK) >> XFS_IOLOCK_SHIFT) 446#define XFS_IOLOCK_DEP(flags) (((flags) & XFS_IOLOCK_DEP_MASK) >> XFS_IOLOCK_SHIFT)
439#define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT) 447#define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT)
440 448
449extern struct lock_class_key xfs_iolock_reclaimable;
450
441/* 451/*
442 * Flags for xfs_itruncate_start(). 452 * Flags for xfs_itruncate_start().
443 */ 453 */
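The functional content of this header change is how lockdep subclasses are packed into the lock_mode word, and that arithmetic can be checked in isolation. A small standalone program using only the constants from the hunk above (nothing beyond those macros is XFS code):

#include <assert.h>
#include <stdio.h>

/* Values copied from the hunk above. */
#define XFS_LOCK_PARENT         1
#define XFS_LOCK_RTBITMAP       2
#define XFS_LOCK_RTSUM          3
#define XFS_LOCK_INUMORDER      4

#define XFS_IOLOCK_SHIFT        16
#define XFS_ILOCK_SHIFT         24

#define XFS_IOLOCK_PARENT       (XFS_LOCK_PARENT << XFS_IOLOCK_SHIFT)
#define XFS_ILOCK_RTBITMAP      (XFS_LOCK_RTBITMAP << XFS_ILOCK_SHIFT)
#define XFS_ILOCK_RTSUM         (XFS_LOCK_RTSUM << XFS_ILOCK_SHIFT)

#define XFS_ILOCK_DEP_MASK      0xff000000
#define XFS_ILOCK_DEP(flags)    (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT)

int main(void)
{
        /* Each subclass lands in its own byte of the lock_mode word... */
        assert(XFS_IOLOCK_PARENT == 0x00010000);
        assert(XFS_ILOCK_RTBITMAP == 0x02000000);
        assert(XFS_ILOCK_RTSUM == 0x03000000);

        /* ...and XFS_ILOCK_DEP() recovers the subclass for lockdep. */
        assert(XFS_ILOCK_DEP(XFS_ILOCK_RTBITMAP) == XFS_LOCK_RTBITMAP);

        printf("lockdep subclass packing checks out\n");
        return 0;
}

With XFS_LOCK_INUMORDER now 4, the first inode locked by xfs_lock_inodes() gets subclass 4, which is why the comment above was updated from "2, 3, ..." to "4, 5, ...".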
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 7c8d30c453c3..576fdfe81d60 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -198,6 +198,41 @@ xfs_inode_item_size(
198} 198}
199 199
200/* 200/*
201 * xfs_inode_item_format_extents - convert in-core extents to on-disk form
202 *
203 * For either the data or attr fork in extent format, we need to endian convert
204 * the in-core extent as we place them into the on-disk inode. In this case, we
205 * need to do this conversion before we write the extents into the log. Because
206 * we don't have the disk inode to write into here, we allocate a buffer and
207 * format the extents into it via xfs_iextents_copy(). We free the buffer in
208 * the unlock routine after the copy for the log has been made.
209 *
210 * In the case of the data fork, the in-core and on-disk fork sizes can be
211 * different due to delayed allocation extents. We only log on-disk extents
212 * here, so always use the physical fork size to determine the size of the
213 * buffer we need to allocate.
214 */
215STATIC void
216xfs_inode_item_format_extents(
217 struct xfs_inode *ip,
218 struct xfs_log_iovec *vecp,
219 int whichfork,
220 int type)
221{
222 xfs_bmbt_rec_t *ext_buffer;
223
224 ext_buffer = kmem_alloc(XFS_IFORK_SIZE(ip, whichfork), KM_SLEEP);
225 if (whichfork == XFS_DATA_FORK)
226 ip->i_itemp->ili_extents_buf = ext_buffer;
227 else
228 ip->i_itemp->ili_aextents_buf = ext_buffer;
229
230 vecp->i_addr = ext_buffer;
231 vecp->i_len = xfs_iextents_copy(ip, ext_buffer, whichfork);
232 vecp->i_type = type;
233}
234
235/*
201 * This is called to fill in the vector of log iovecs for the 236 * This is called to fill in the vector of log iovecs for the
202 * given inode log item. It fills the first item with an inode 237 * given inode log item. It fills the first item with an inode
203 * log format structure, the second with the on-disk inode structure, 238 * log format structure, the second with the on-disk inode structure,
@@ -213,7 +248,6 @@ xfs_inode_item_format(
213 struct xfs_inode *ip = iip->ili_inode; 248 struct xfs_inode *ip = iip->ili_inode;
214 uint nvecs; 249 uint nvecs;
215 size_t data_bytes; 250 size_t data_bytes;
216 xfs_bmbt_rec_t *ext_buffer;
217 xfs_mount_t *mp; 251 xfs_mount_t *mp;
218 252
219 vecp->i_addr = &iip->ili_format; 253 vecp->i_addr = &iip->ili_format;
@@ -320,22 +354,8 @@ xfs_inode_item_format(
320 } else 354 } else
321#endif 355#endif
322 { 356 {
323 /* 357 xfs_inode_item_format_extents(ip, vecp,
324 * There are delayed allocation extents 358 XFS_DATA_FORK, XLOG_REG_TYPE_IEXT);
325 * in the inode, or we need to convert
326 * the extents to on disk format.
327 * Use xfs_iextents_copy()
328 * to copy only the real extents into
329 * a separate buffer. We'll free the
330 * buffer in the unlock routine.
331 */
332 ext_buffer = kmem_alloc(ip->i_df.if_bytes,
333 KM_SLEEP);
334 iip->ili_extents_buf = ext_buffer;
335 vecp->i_addr = ext_buffer;
336 vecp->i_len = xfs_iextents_copy(ip, ext_buffer,
337 XFS_DATA_FORK);
338 vecp->i_type = XLOG_REG_TYPE_IEXT;
339 } 359 }
340 ASSERT(vecp->i_len <= ip->i_df.if_bytes); 360 ASSERT(vecp->i_len <= ip->i_df.if_bytes);
341 iip->ili_format.ilf_dsize = vecp->i_len; 361 iip->ili_format.ilf_dsize = vecp->i_len;
@@ -445,19 +465,12 @@ xfs_inode_item_format(
445 */ 465 */
446 vecp->i_addr = ip->i_afp->if_u1.if_extents; 466 vecp->i_addr = ip->i_afp->if_u1.if_extents;
447 vecp->i_len = ip->i_afp->if_bytes; 467 vecp->i_len = ip->i_afp->if_bytes;
468 vecp->i_type = XLOG_REG_TYPE_IATTR_EXT;
448#else 469#else
449 ASSERT(iip->ili_aextents_buf == NULL); 470 ASSERT(iip->ili_aextents_buf == NULL);
450 /* 471 xfs_inode_item_format_extents(ip, vecp,
451 * Need to endian flip before logging 472 XFS_ATTR_FORK, XLOG_REG_TYPE_IATTR_EXT);
452 */
453 ext_buffer = kmem_alloc(ip->i_afp->if_bytes,
454 KM_SLEEP);
455 iip->ili_aextents_buf = ext_buffer;
456 vecp->i_addr = ext_buffer;
457 vecp->i_len = xfs_iextents_copy(ip, ext_buffer,
458 XFS_ATTR_FORK);
459#endif 473#endif
460 vecp->i_type = XLOG_REG_TYPE_IATTR_EXT;
461 iip->ili_format.ilf_asize = vecp->i_len; 474 iip->ili_format.ilf_asize = vecp->i_len;
462 vecp++; 475 vecp++;
463 nvecs++; 476 nvecs++;
@@ -760,11 +773,11 @@ xfs_inode_item_push(
 760 * Push the inode to its backing buffer. This will not remove the 773 * Push the inode to its backing buffer. This will not remove the
761 * inode from the AIL - a further push will be required to trigger a 774 * inode from the AIL - a further push will be required to trigger a
762 * buffer push. However, this allows all the dirty inodes to be pushed 775 * buffer push. However, this allows all the dirty inodes to be pushed
763 * to the buffer before it is pushed to disk. THe buffer IO completion 776 * to the buffer before it is pushed to disk. The buffer IO completion
764 * will pull th einode from the AIL, mark it clean and unlock the flush 777 * will pull the inode from the AIL, mark it clean and unlock the flush
765 * lock. 778 * lock.
766 */ 779 */
767 (void) xfs_iflush(ip, 0); 780 (void) xfs_iflush(ip, SYNC_TRYLOCK);
768 xfs_iunlock(ip, XFS_ILOCK_SHARED); 781 xfs_iunlock(ip, XFS_ILOCK_SHARED);
769} 782}
770 783
@@ -842,15 +855,64 @@ xfs_inode_item_destroy(
842 * flushed to disk. It is responsible for removing the inode item 855 * flushed to disk. It is responsible for removing the inode item
843 * from the AIL if it has not been re-logged, and unlocking the inode's 856 * from the AIL if it has not been re-logged, and unlocking the inode's
844 * flush lock. 857 * flush lock.
858 *
859 * To reduce AIL lock traffic as much as possible, we scan the buffer log item
860 * list for other inodes that will run this function. We remove them from the
861 * buffer list so we can process all the inode IO completions in one AIL lock
862 * traversal.
845 */ 863 */
846void 864void
847xfs_iflush_done( 865xfs_iflush_done(
848 struct xfs_buf *bp, 866 struct xfs_buf *bp,
849 struct xfs_log_item *lip) 867 struct xfs_log_item *lip)
850{ 868{
851 struct xfs_inode_log_item *iip = INODE_ITEM(lip); 869 struct xfs_inode_log_item *iip;
852 xfs_inode_t *ip = iip->ili_inode; 870 struct xfs_log_item *blip;
871 struct xfs_log_item *next;
872 struct xfs_log_item *prev;
853 struct xfs_ail *ailp = lip->li_ailp; 873 struct xfs_ail *ailp = lip->li_ailp;
874 int need_ail = 0;
875
876 /*
877 * Scan the buffer IO completions for other inodes being completed and
878 * attach them to the current inode log item.
879 */
880 blip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
881 prev = NULL;
882 while (blip != NULL) {
883 if (lip->li_cb != xfs_iflush_done) {
884 prev = blip;
885 blip = blip->li_bio_list;
886 continue;
887 }
888
889 /* remove from list */
890 next = blip->li_bio_list;
891 if (!prev) {
892 XFS_BUF_SET_FSPRIVATE(bp, next);
893 } else {
894 prev->li_bio_list = next;
895 }
896
897 /* add to current list */
898 blip->li_bio_list = lip->li_bio_list;
899 lip->li_bio_list = blip;
900
901 /*
902 * while we have the item, do the unlocked check for needing
903 * the AIL lock.
904 */
905 iip = INODE_ITEM(blip);
906 if (iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn)
907 need_ail++;
908
909 blip = next;
910 }
911
912 /* make sure we capture the state of the initial inode. */
913 iip = INODE_ITEM(lip);
914 if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn)
915 need_ail++;
854 916
855 /* 917 /*
856 * We only want to pull the item from the AIL if it is 918 * We only want to pull the item from the AIL if it is
@@ -861,28 +923,37 @@ xfs_iflush_done(
861 * the lock since it's cheaper, and then we recheck while 923 * the lock since it's cheaper, and then we recheck while
862 * holding the lock before removing the inode from the AIL. 924 * holding the lock before removing the inode from the AIL.
863 */ 925 */
864 if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) { 926 if (need_ail) {
927 struct xfs_log_item *log_items[need_ail];
928 int i = 0;
865 spin_lock(&ailp->xa_lock); 929 spin_lock(&ailp->xa_lock);
866 if (lip->li_lsn == iip->ili_flush_lsn) { 930 for (blip = lip; blip; blip = blip->li_bio_list) {
867 /* xfs_trans_ail_delete() drops the AIL lock. */ 931 iip = INODE_ITEM(blip);
868 xfs_trans_ail_delete(ailp, lip); 932 if (iip->ili_logged &&
869 } else { 933 blip->li_lsn == iip->ili_flush_lsn) {
870 spin_unlock(&ailp->xa_lock); 934 log_items[i++] = blip;
935 }
936 ASSERT(i <= need_ail);
871 } 937 }
938 /* xfs_trans_ail_delete_bulk() drops the AIL lock. */
939 xfs_trans_ail_delete_bulk(ailp, log_items, i);
872 } 940 }
873 941
874 iip->ili_logged = 0;
875 942
876 /* 943 /*
877 * Clear the ili_last_fields bits now that we know that the 944 * clean up and unlock the flush lock now we are done. We can clear the
878 * data corresponding to them is safely on disk. 945 * ili_last_fields bits now that we know that the data corresponding to
946 * them is safely on disk.
879 */ 947 */
880 iip->ili_last_fields = 0; 948 for (blip = lip; blip; blip = next) {
949 next = blip->li_bio_list;
950 blip->li_bio_list = NULL;
881 951
882 /* 952 iip = INODE_ITEM(blip);
883 * Release the inode's flush lock since we're done with it. 953 iip->ili_logged = 0;
884 */ 954 iip->ili_last_fields = 0;
885 xfs_ifunlock(ip); 955 xfs_ifunlock(iip->ili_inode);
956 }
886} 957}
887 958
888/* 959/*
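The reworked xfs_iflush_done() above reduces AIL lock traffic by first splicing every matching completion off the buffer's callback list and counting, without any lock, how many items actually need AIL removal, and only then taking xa_lock once for a bulk delete. The standalone sketch below models that gather-then-one-locked-bulk-operation shape with simplified types; ail_delete_bulk() is a stub standing in for xfs_trans_ail_delete_bulk(), and the recheck of li_lsn that the real code performs under the lock is elided.

#include <pthread.h>
#include <stddef.h>

/* Simplified stand-ins for struct xfs_log_item and the AIL. */
struct item {
        struct item     *next;                  /* li_bio_list analogue */
        int             needs_ail_removal;      /* "logged and lsn matches" */
};

struct ail {
        pthread_mutex_t lock;                   /* xa_lock analogue */
};

/* Stub for xfs_trans_ail_delete_bulk(); the real helper also drops the lock. */
static void ail_delete_bulk(struct ail *ailp, struct item **batch, int count)
{
        (void)ailp; (void)batch; (void)count;
}

static void complete_items(struct ail *ailp, struct item *head)
{
        struct item     *ip;
        int             need_ail = 0;

        /* Pass 1: unlocked scan, only count what needs the AIL lock. */
        for (ip = head; ip; ip = ip->next)
                if (ip->needs_ail_removal)
                        need_ail++;

        /* Pass 2: take the lock once, remove everything in one traversal. */
        if (need_ail) {
                struct item *batch[need_ail];
                int i = 0;

                pthread_mutex_lock(&ailp->lock);
                for (ip = head; ip; ip = ip->next)
                        if (ip->needs_ail_removal)
                                batch[i++] = ip;
                ail_delete_bulk(ailp, batch, i);
                pthread_mutex_unlock(&ailp->lock);
        }

        /* Pass 3 in the real code: clear state and unlock each flush lock. */
}

int main(void)
{
        struct ail  ailp = { .lock = PTHREAD_MUTEX_INITIALIZER };
        struct item b    = { .next = NULL, .needs_ail_removal = 1 };
        struct item a    = { .next = &b,   .needs_ail_removal = 0 };

        complete_items(&ailp, &a);
        return 0;
}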
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 20576146369f..091d82b94c4d 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -47,127 +47,8 @@
47 47
48#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \ 48#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \
49 << mp->m_writeio_log) 49 << mp->m_writeio_log)
50#define XFS_STRAT_WRITE_IMAPS 2
51#define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP 50#define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP
52 51
53STATIC int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
54 int, struct xfs_bmbt_irec *, int *);
55STATIC int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t, int,
56 struct xfs_bmbt_irec *, int *);
57STATIC int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t,
58 struct xfs_bmbt_irec *, int *);
59
60int
61xfs_iomap(
62 struct xfs_inode *ip,
63 xfs_off_t offset,
64 ssize_t count,
65 int flags,
66 struct xfs_bmbt_irec *imap,
67 int *nimaps,
68 int *new)
69{
70 struct xfs_mount *mp = ip->i_mount;
71 xfs_fileoff_t offset_fsb, end_fsb;
72 int error = 0;
73 int lockmode = 0;
74 int bmapi_flags = 0;
75
76 ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
77
78 *new = 0;
79
80 if (XFS_FORCED_SHUTDOWN(mp))
81 return XFS_ERROR(EIO);
82
83 trace_xfs_iomap_enter(ip, offset, count, flags, NULL);
84
85 switch (flags & (BMAPI_READ | BMAPI_WRITE | BMAPI_ALLOCATE)) {
86 case BMAPI_READ:
87 lockmode = xfs_ilock_map_shared(ip);
88 bmapi_flags = XFS_BMAPI_ENTIRE;
89 break;
90 case BMAPI_WRITE:
91 lockmode = XFS_ILOCK_EXCL;
92 if (flags & BMAPI_IGNSTATE)
93 bmapi_flags |= XFS_BMAPI_IGSTATE|XFS_BMAPI_ENTIRE;
94 xfs_ilock(ip, lockmode);
95 break;
96 case BMAPI_ALLOCATE:
97 lockmode = XFS_ILOCK_SHARED;
98 bmapi_flags = XFS_BMAPI_ENTIRE;
99
100 /* Attempt non-blocking lock */
101 if (flags & BMAPI_TRYLOCK) {
102 if (!xfs_ilock_nowait(ip, lockmode))
103 return XFS_ERROR(EAGAIN);
104 } else {
105 xfs_ilock(ip, lockmode);
106 }
107 break;
108 default:
109 BUG();
110 }
111
112 ASSERT(offset <= mp->m_maxioffset);
113 if ((xfs_fsize_t)offset + count > mp->m_maxioffset)
114 count = mp->m_maxioffset - offset;
115 end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
116 offset_fsb = XFS_B_TO_FSBT(mp, offset);
117
118 error = xfs_bmapi(NULL, ip, offset_fsb,
119 (xfs_filblks_t)(end_fsb - offset_fsb),
120 bmapi_flags, NULL, 0, imap,
121 nimaps, NULL);
122
123 if (error)
124 goto out;
125
126 switch (flags & (BMAPI_WRITE|BMAPI_ALLOCATE)) {
127 case BMAPI_WRITE:
128 /* If we found an extent, return it */
129 if (*nimaps &&
130 (imap->br_startblock != HOLESTARTBLOCK) &&
131 (imap->br_startblock != DELAYSTARTBLOCK)) {
132 trace_xfs_iomap_found(ip, offset, count, flags, imap);
133 break;
134 }
135
136 if (flags & BMAPI_DIRECT) {
137 error = xfs_iomap_write_direct(ip, offset, count, flags,
138 imap, nimaps);
139 } else {
140 error = xfs_iomap_write_delay(ip, offset, count, flags,
141 imap, nimaps);
142 }
143 if (!error) {
144 trace_xfs_iomap_alloc(ip, offset, count, flags, imap);
145 }
146 *new = 1;
147 break;
148 case BMAPI_ALLOCATE:
149 /* If we found an extent, return it */
150 xfs_iunlock(ip, lockmode);
151 lockmode = 0;
152
153 if (*nimaps && !isnullstartblock(imap->br_startblock)) {
154 trace_xfs_iomap_found(ip, offset, count, flags, imap);
155 break;
156 }
157
158 error = xfs_iomap_write_allocate(ip, offset, count,
159 imap, nimaps);
160 break;
161 }
162
163 ASSERT(*nimaps <= 1);
164
165out:
166 if (lockmode)
167 xfs_iunlock(ip, lockmode);
168 return XFS_ERROR(error);
169}
170
171STATIC int 52STATIC int
172xfs_iomap_eof_align_last_fsb( 53xfs_iomap_eof_align_last_fsb(
173 xfs_mount_t *mp, 54 xfs_mount_t *mp,
@@ -220,11 +101,11 @@ xfs_iomap_eof_align_last_fsb(
220} 101}
221 102
222STATIC int 103STATIC int
223xfs_cmn_err_fsblock_zero( 104xfs_alert_fsblock_zero(
224 xfs_inode_t *ip, 105 xfs_inode_t *ip,
225 xfs_bmbt_irec_t *imap) 106 xfs_bmbt_irec_t *imap)
226{ 107{
227 xfs_cmn_err(XFS_PTAG_FSBLOCK_ZERO, CE_ALERT, ip->i_mount, 108 xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
228 "Access to block zero in inode %llu " 109 "Access to block zero in inode %llu "
229 "start_block: %llx start_off: %llx " 110 "start_block: %llx start_off: %llx "
230 "blkcnt: %llx extent-state: %x\n", 111 "blkcnt: %llx extent-state: %x\n",
@@ -236,14 +117,13 @@ xfs_cmn_err_fsblock_zero(
236 return EFSCORRUPTED; 117 return EFSCORRUPTED;
237} 118}
238 119
239STATIC int 120int
240xfs_iomap_write_direct( 121xfs_iomap_write_direct(
241 xfs_inode_t *ip, 122 xfs_inode_t *ip,
242 xfs_off_t offset, 123 xfs_off_t offset,
243 size_t count, 124 size_t count,
244 int flags,
245 xfs_bmbt_irec_t *imap, 125 xfs_bmbt_irec_t *imap,
246 int *nmaps) 126 int nmaps)
247{ 127{
248 xfs_mount_t *mp = ip->i_mount; 128 xfs_mount_t *mp = ip->i_mount;
249 xfs_fileoff_t offset_fsb; 129 xfs_fileoff_t offset_fsb;
@@ -279,7 +159,7 @@ xfs_iomap_write_direct(
279 if (error) 159 if (error)
280 goto error_out; 160 goto error_out;
281 } else { 161 } else {
282 if (*nmaps && (imap->br_startblock == HOLESTARTBLOCK)) 162 if (nmaps && (imap->br_startblock == HOLESTARTBLOCK))
283 last_fsb = MIN(last_fsb, (xfs_fileoff_t) 163 last_fsb = MIN(last_fsb, (xfs_fileoff_t)
284 imap->br_blockcount + 164 imap->br_blockcount +
285 imap->br_startoff); 165 imap->br_startoff);
@@ -331,7 +211,7 @@ xfs_iomap_write_direct(
331 xfs_trans_ijoin(tp, ip); 211 xfs_trans_ijoin(tp, ip);
332 212
333 bmapi_flag = XFS_BMAPI_WRITE; 213 bmapi_flag = XFS_BMAPI_WRITE;
334 if ((flags & BMAPI_DIRECT) && (offset < ip->i_size || extsz)) 214 if (offset < ip->i_size || extsz)
335 bmapi_flag |= XFS_BMAPI_PREALLOC; 215 bmapi_flag |= XFS_BMAPI_PREALLOC;
336 216
337 /* 217 /*
@@ -366,11 +246,10 @@ xfs_iomap_write_direct(
366 } 246 }
367 247
368 if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) { 248 if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) {
369 error = xfs_cmn_err_fsblock_zero(ip, imap); 249 error = xfs_alert_fsblock_zero(ip, imap);
370 goto error_out; 250 goto error_out;
371 } 251 }
372 252
373 *nmaps = 1;
374 return 0; 253 return 0;
375 254
376error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */ 255error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
@@ -379,7 +258,6 @@ error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
379 258
380error1: /* Just cancel transaction */ 259error1: /* Just cancel transaction */
381 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); 260 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
382 *nmaps = 0; /* nothing set-up here */
383 261
384error_out: 262error_out:
385 return XFS_ERROR(error); 263 return XFS_ERROR(error);
@@ -389,6 +267,9 @@ error_out:
389 * If the caller is doing a write at the end of the file, then extend the 267 * If the caller is doing a write at the end of the file, then extend the
390 * allocation out to the file system's write iosize. We clean up any extra 268 * allocation out to the file system's write iosize. We clean up any extra
391 * space left over when the file is closed in xfs_inactive(). 269 * space left over when the file is closed in xfs_inactive().
270 *
271 * If we find we already have delalloc preallocation beyond EOF, don't do more
 272 * preallocation as it is not needed.
392 */ 273 */
393STATIC int 274STATIC int
394xfs_iomap_eof_want_preallocate( 275xfs_iomap_eof_want_preallocate(
@@ -396,7 +277,6 @@ xfs_iomap_eof_want_preallocate(
396 xfs_inode_t *ip, 277 xfs_inode_t *ip,
397 xfs_off_t offset, 278 xfs_off_t offset,
398 size_t count, 279 size_t count,
399 int ioflag,
400 xfs_bmbt_irec_t *imap, 280 xfs_bmbt_irec_t *imap,
401 int nimaps, 281 int nimaps,
402 int *prealloc) 282 int *prealloc)
@@ -405,6 +285,7 @@ xfs_iomap_eof_want_preallocate(
405 xfs_filblks_t count_fsb; 285 xfs_filblks_t count_fsb;
406 xfs_fsblock_t firstblock; 286 xfs_fsblock_t firstblock;
407 int n, error, imaps; 287 int n, error, imaps;
288 int found_delalloc = 0;
408 289
409 *prealloc = 0; 290 *prealloc = 0;
410 if ((offset + count) <= ip->i_size) 291 if ((offset + count) <= ip->i_size)
@@ -429,20 +310,71 @@ xfs_iomap_eof_want_preallocate(
429 return 0; 310 return 0;
430 start_fsb += imap[n].br_blockcount; 311 start_fsb += imap[n].br_blockcount;
431 count_fsb -= imap[n].br_blockcount; 312 count_fsb -= imap[n].br_blockcount;
313
314 if (imap[n].br_startblock == DELAYSTARTBLOCK)
315 found_delalloc = 1;
432 } 316 }
433 } 317 }
434 *prealloc = 1; 318 if (!found_delalloc)
319 *prealloc = 1;
435 return 0; 320 return 0;
436} 321}
437 322
438STATIC int 323/*
324 * If we don't have a user specified preallocation size, dynamically increase
325 * the preallocation size as the size of the file grows. Cap the maximum size
326 * at a single extent or less if the filesystem is near full. The closer the
 327 * filesystem is to full, the smaller the maximum preallocation.
328 */
329STATIC xfs_fsblock_t
330xfs_iomap_prealloc_size(
331 struct xfs_mount *mp,
332 struct xfs_inode *ip)
333{
334 xfs_fsblock_t alloc_blocks = 0;
335
336 if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)) {
337 int shift = 0;
338 int64_t freesp;
339
340 /*
341 * rounddown_pow_of_two() returns an undefined result
342 * if we pass in alloc_blocks = 0. Hence the "+ 1" to
343 * ensure we always pass in a non-zero value.
344 */
345 alloc_blocks = XFS_B_TO_FSB(mp, ip->i_size) + 1;
346 alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN,
347 rounddown_pow_of_two(alloc_blocks));
348
349 xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT);
350 freesp = mp->m_sb.sb_fdblocks;
351 if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) {
352 shift = 2;
353 if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT])
354 shift++;
355 if (freesp < mp->m_low_space[XFS_LOWSP_3_PCNT])
356 shift++;
357 if (freesp < mp->m_low_space[XFS_LOWSP_2_PCNT])
358 shift++;
359 if (freesp < mp->m_low_space[XFS_LOWSP_1_PCNT])
360 shift++;
361 }
362 if (shift)
363 alloc_blocks >>= shift;
364 }
365
366 if (alloc_blocks < mp->m_writeio_blocks)
367 alloc_blocks = mp->m_writeio_blocks;
368
369 return alloc_blocks;
370}
371
372int
439xfs_iomap_write_delay( 373xfs_iomap_write_delay(
440 xfs_inode_t *ip, 374 xfs_inode_t *ip,
441 xfs_off_t offset, 375 xfs_off_t offset,
442 size_t count, 376 size_t count,
443 int ioflag, 377 xfs_bmbt_irec_t *ret_imap)
444 xfs_bmbt_irec_t *ret_imap,
445 int *nmaps)
446{ 378{
447 xfs_mount_t *mp = ip->i_mount; 379 xfs_mount_t *mp = ip->i_mount;
448 xfs_fileoff_t offset_fsb; 380 xfs_fileoff_t offset_fsb;
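The new xfs_iomap_prealloc_size() above scales the speculative EOF preallocation with the file size (rounded down to a power of two and capped at MAXEXTLEN) and then halves it repeatedly as free space drops through the m_low_space thresholds. The standalone sketch below reproduces only that arithmetic so it can be checked with concrete numbers; the 5/4/3/2/1-percent thresholds and the 4 KiB block size are assumptions, since m_low_space[] is populated outside this excerpt.

#include <stdint.h>
#include <stdio.h>

/* Assumed constants: 4 KiB filesystem blocks, MAXEXTLEN = 2^21 - 1 blocks. */
#define BLOCK_SIZE      4096ULL
#define MAXEXTLEN       ((1ULL << 21) - 1)

static uint64_t rounddown_pow_of_two(uint64_t n)
{
        while (n & (n - 1))
                n &= n - 1;     /* clear low bits until one remains */
        return n;
}

/*
 * Mirror of the scaling logic in the hunk above.  freesp and dblocks are in
 * filesystem blocks; the percentage thresholds are an assumption about how
 * m_low_space[] is filled in, not something visible in this hunk.
 */
static uint64_t prealloc_size(uint64_t isize_bytes, uint64_t freesp,
                              uint64_t dblocks, uint64_t writeio_blocks)
{
        uint64_t alloc_blocks = isize_bytes / BLOCK_SIZE + 1;
        int shift = 0;

        alloc_blocks = rounddown_pow_of_two(alloc_blocks);
        if (alloc_blocks > MAXEXTLEN)
                alloc_blocks = MAXEXTLEN;

        if (freesp < dblocks * 5 / 100) {
                shift = 2;
                if (freesp < dblocks * 4 / 100) shift++;
                if (freesp < dblocks * 3 / 100) shift++;
                if (freesp < dblocks * 2 / 100) shift++;
                if (freesp < dblocks * 1 / 100) shift++;
        }
        alloc_blocks >>= shift;

        return alloc_blocks < writeio_blocks ? writeio_blocks : alloc_blocks;
}

int main(void)
{
        /* 1 GiB file, 50% free: full-size preallocation, 262144 blocks (1 GiB). */
        printf("%llu\n", (unsigned long long)
               prealloc_size(1ULL << 30, 1000000, 2000000, 16));

        /* Same file, 2.5% free: shift of 4, so 16384 blocks (64 MiB). */
        printf("%llu\n", (unsigned long long)
               prealloc_size(1ULL << 30, 50000, 2000000, 16));
        return 0;
}

In other words, a 1 GiB file keeps a further 1 GiB of delalloc beyond EOF while space is plentiful, but the same write against a filesystem with about 2.5% free only preallocates 64 MiB.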
@@ -469,16 +401,19 @@ xfs_iomap_write_delay(
469 extsz = xfs_get_extsz_hint(ip); 401 extsz = xfs_get_extsz_hint(ip);
470 offset_fsb = XFS_B_TO_FSBT(mp, offset); 402 offset_fsb = XFS_B_TO_FSBT(mp, offset);
471 403
404
472 error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count, 405 error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count,
473 ioflag, imap, XFS_WRITE_IMAPS, &prealloc); 406 imap, XFS_WRITE_IMAPS, &prealloc);
474 if (error) 407 if (error)
475 return error; 408 return error;
476 409
477retry: 410retry:
478 if (prealloc) { 411 if (prealloc) {
412 xfs_fsblock_t alloc_blocks = xfs_iomap_prealloc_size(mp, ip);
413
479 aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1)); 414 aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1));
480 ioalign = XFS_B_TO_FSBT(mp, aligned_offset); 415 ioalign = XFS_B_TO_FSBT(mp, aligned_offset);
481 last_fsb = ioalign + mp->m_writeio_blocks; 416 last_fsb = ioalign + alloc_blocks;
482 } else { 417 } else {
483 last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); 418 last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
484 } 419 }
@@ -496,22 +431,31 @@ retry:
496 XFS_BMAPI_DELAY | XFS_BMAPI_WRITE | 431 XFS_BMAPI_DELAY | XFS_BMAPI_WRITE |
497 XFS_BMAPI_ENTIRE, &firstblock, 1, imap, 432 XFS_BMAPI_ENTIRE, &firstblock, 1, imap,
498 &nimaps, NULL); 433 &nimaps, NULL);
499 if (error && (error != ENOSPC)) 434 switch (error) {
435 case 0:
436 case ENOSPC:
437 case EDQUOT:
438 break;
439 default:
500 return XFS_ERROR(error); 440 return XFS_ERROR(error);
441 }
501 442
502 /* 443 /*
503 * If bmapi returned us nothing, and if we didn't get back EDQUOT, 444 * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. For
 504 * then we must have run out of space - flush all other inodes with 445 * ENOSPC, flush all other inodes with delalloc blocks to free up
505 * delalloc blocks and retry without EOF preallocation. 446 * some of the excess reserved metadata space. For both cases, retry
447 * without EOF preallocation.
506 */ 448 */
507 if (nimaps == 0) { 449 if (nimaps == 0) {
508 trace_xfs_delalloc_enospc(ip, offset, count); 450 trace_xfs_delalloc_enospc(ip, offset, count);
509 if (flushed) 451 if (flushed)
510 return XFS_ERROR(ENOSPC); 452 return XFS_ERROR(error ? error : ENOSPC);
511 453
512 xfs_iunlock(ip, XFS_ILOCK_EXCL); 454 if (error == ENOSPC) {
513 xfs_flush_inodes(ip); 455 xfs_iunlock(ip, XFS_ILOCK_EXCL);
514 xfs_ilock(ip, XFS_ILOCK_EXCL); 456 xfs_flush_inodes(ip);
457 xfs_ilock(ip, XFS_ILOCK_EXCL);
458 }
515 459
516 flushed = 1; 460 flushed = 1;
517 error = 0; 461 error = 0;
@@ -520,11 +464,9 @@ retry:
520 } 464 }
521 465
522 if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip))) 466 if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip)))
523 return xfs_cmn_err_fsblock_zero(ip, &imap[0]); 467 return xfs_alert_fsblock_zero(ip, &imap[0]);
524 468
525 *ret_imap = imap[0]; 469 *ret_imap = imap[0];
526 *nmaps = 1;
527
528 return 0; 470 return 0;
529} 471}
530 472
@@ -538,13 +480,12 @@ retry:
538 * We no longer bother to look at the incoming map - all we have to 480 * We no longer bother to look at the incoming map - all we have to
539 * guarantee is that whatever we allocate fills the required range. 481 * guarantee is that whatever we allocate fills the required range.
540 */ 482 */
541STATIC int 483int
542xfs_iomap_write_allocate( 484xfs_iomap_write_allocate(
543 xfs_inode_t *ip, 485 xfs_inode_t *ip,
544 xfs_off_t offset, 486 xfs_off_t offset,
545 size_t count, 487 size_t count,
546 xfs_bmbt_irec_t *imap, 488 xfs_bmbt_irec_t *imap)
547 int *retmap)
548{ 489{
549 xfs_mount_t *mp = ip->i_mount; 490 xfs_mount_t *mp = ip->i_mount;
550 xfs_fileoff_t offset_fsb, last_block; 491 xfs_fileoff_t offset_fsb, last_block;
@@ -557,8 +498,6 @@ xfs_iomap_write_allocate(
557 int error = 0; 498 int error = 0;
558 int nres; 499 int nres;
559 500
560 *retmap = 0;
561
562 /* 501 /*
563 * Make sure that the dquots are there. 502 * Make sure that the dquots are there.
564 */ 503 */
@@ -675,12 +614,11 @@ xfs_iomap_write_allocate(
675 * covers at least part of the callers request 614 * covers at least part of the callers request
676 */ 615 */
677 if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) 616 if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip)))
678 return xfs_cmn_err_fsblock_zero(ip, imap); 617 return xfs_alert_fsblock_zero(ip, imap);
679 618
680 if ((offset_fsb >= imap->br_startoff) && 619 if ((offset_fsb >= imap->br_startoff) &&
681 (offset_fsb < (imap->br_startoff + 620 (offset_fsb < (imap->br_startoff +
682 imap->br_blockcount))) { 621 imap->br_blockcount))) {
683 *retmap = 1;
684 XFS_STATS_INC(xs_xstrat_quick); 622 XFS_STATS_INC(xs_xstrat_quick);
685 return 0; 623 return 0;
686 } 624 }
@@ -786,7 +724,7 @@ xfs_iomap_write_unwritten(
786 return XFS_ERROR(error); 724 return XFS_ERROR(error);
787 725
788 if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip))) 726 if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip)))
789 return xfs_cmn_err_fsblock_zero(ip, &imap); 727 return xfs_alert_fsblock_zero(ip, &imap);
790 728
791 if ((numblks_fsb = imap.br_blockcount) == 0) { 729 if ((numblks_fsb = imap.br_blockcount) == 0) {
792 /* 730 /*
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 7748a430f50d..80615760959a 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -18,30 +18,15 @@
18#ifndef __XFS_IOMAP_H__ 18#ifndef __XFS_IOMAP_H__
19#define __XFS_IOMAP_H__ 19#define __XFS_IOMAP_H__
20 20
21/* base extent manipulation calls */
22#define BMAPI_READ (1 << 0) /* read extents */
23#define BMAPI_WRITE (1 << 1) /* create extents */
24#define BMAPI_ALLOCATE (1 << 2) /* delayed allocate to real extents */
25
26/* modifiers */
27#define BMAPI_IGNSTATE (1 << 4) /* ignore unwritten state on read */
28#define BMAPI_DIRECT (1 << 5) /* direct instead of buffered write */
29#define BMAPI_MMA (1 << 6) /* allocate for mmap write */
30#define BMAPI_TRYLOCK (1 << 7) /* non-blocking request */
31
32#define BMAPI_FLAGS \
33 { BMAPI_READ, "READ" }, \
34 { BMAPI_WRITE, "WRITE" }, \
35 { BMAPI_ALLOCATE, "ALLOCATE" }, \
36 { BMAPI_IGNSTATE, "IGNSTATE" }, \
37 { BMAPI_DIRECT, "DIRECT" }, \
38 { BMAPI_TRYLOCK, "TRYLOCK" }
39
40struct xfs_inode; 21struct xfs_inode;
41struct xfs_bmbt_irec; 22struct xfs_bmbt_irec;
42 23
43extern int xfs_iomap(struct xfs_inode *, xfs_off_t, ssize_t, int, 24extern int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
44 struct xfs_bmbt_irec *, int *, int *); 25 struct xfs_bmbt_irec *, int);
26extern int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t,
27 struct xfs_bmbt_irec *);
28extern int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t,
29 struct xfs_bmbt_irec *);
45extern int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t); 30extern int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t);
46 31
47#endif /* __XFS_IOMAP_H__*/ 32#endif /* __XFS_IOMAP_H__*/
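With the BMAPI_* multiplexer removed, callers pick one of the three allocators above directly instead of passing mode flags to xfs_iomap(). The fragment below is a hypothetical caller, shown only to illustrate the reworked signatures: example_map_write() and its 'direct' parameter are invented, locking and transaction context are omitted, and the real call sites (the writeback and write paths) are outside this excerpt.

/*
 * Hypothetical caller, not code from this series; it compiles only against
 * the XFS headers and exists to show the new prototypes in use.
 */
static int
example_map_write(
        struct xfs_inode        *ip,
        xfs_off_t               offset,
        size_t                  count,
        int                     direct,
        struct xfs_bmbt_irec    *imap)
{
        if (direct)
                /* nmaps is now passed by value as a hint, not returned. */
                return xfs_iomap_write_direct(ip, offset, count, imap, 1);

        /* Buffered write: set up a delayed allocation mapping instead. */
        return xfs_iomap_write_delay(ip, offset, count, imap);
}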
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index dc1882adaf54..751e94fe1f77 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -204,7 +204,6 @@ xfs_bulkstat(
204 xfs_agi_t *agi; /* agi header data */ 204 xfs_agi_t *agi; /* agi header data */
205 xfs_agino_t agino; /* inode # in allocation group */ 205 xfs_agino_t agino; /* inode # in allocation group */
206 xfs_agnumber_t agno; /* allocation group number */ 206 xfs_agnumber_t agno; /* allocation group number */
207 xfs_daddr_t bno; /* inode cluster start daddr */
208 int chunkidx; /* current index into inode chunk */ 207 int chunkidx; /* current index into inode chunk */
209 int clustidx; /* current index into inode cluster */ 208 int clustidx; /* current index into inode cluster */
210 xfs_btree_cur_t *cur; /* btree cursor for ialloc btree */ 209 xfs_btree_cur_t *cur; /* btree cursor for ialloc btree */
@@ -463,7 +462,6 @@ xfs_bulkstat(
463 mp->m_sb.sb_inopblog); 462 mp->m_sb.sb_inopblog);
464 } 463 }
465 ino = XFS_AGINO_TO_INO(mp, agno, agino); 464 ino = XFS_AGINO_TO_INO(mp, agno, agino);
466 bno = XFS_AGB_TO_DADDR(mp, agno, agbno);
467 /* 465 /*
468 * Skip if this inode is free. 466 * Skip if this inode is free.
469 */ 467 */
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index cee4ab9f8a9e..b612ce4520ae 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -47,7 +47,7 @@ STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp,
47 xfs_buftarg_t *log_target, 47 xfs_buftarg_t *log_target,
48 xfs_daddr_t blk_offset, 48 xfs_daddr_t blk_offset,
49 int num_bblks); 49 int num_bblks);
50STATIC int xlog_space_left(xlog_t *log, int cycle, int bytes); 50STATIC int xlog_space_left(struct log *log, atomic64_t *head);
51STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog); 51STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog);
52STATIC void xlog_dealloc_log(xlog_t *log); 52STATIC void xlog_dealloc_log(xlog_t *log);
53 53
@@ -70,7 +70,7 @@ STATIC void xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog);
70/* local functions to manipulate grant head */ 70/* local functions to manipulate grant head */
71STATIC int xlog_grant_log_space(xlog_t *log, 71STATIC int xlog_grant_log_space(xlog_t *log,
72 xlog_ticket_t *xtic); 72 xlog_ticket_t *xtic);
73STATIC void xlog_grant_push_ail(xfs_mount_t *mp, 73STATIC void xlog_grant_push_ail(struct log *log,
74 int need_bytes); 74 int need_bytes);
75STATIC void xlog_regrant_reserve_log_space(xlog_t *log, 75STATIC void xlog_regrant_reserve_log_space(xlog_t *log,
76 xlog_ticket_t *ticket); 76 xlog_ticket_t *ticket);
@@ -81,98 +81,73 @@ STATIC void xlog_ungrant_log_space(xlog_t *log,
81 81
82#if defined(DEBUG) 82#if defined(DEBUG)
83STATIC void xlog_verify_dest_ptr(xlog_t *log, char *ptr); 83STATIC void xlog_verify_dest_ptr(xlog_t *log, char *ptr);
84STATIC void xlog_verify_grant_head(xlog_t *log, int equals); 84STATIC void xlog_verify_grant_tail(struct log *log);
85STATIC void xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog, 85STATIC void xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog,
86 int count, boolean_t syncing); 86 int count, boolean_t syncing);
87STATIC void xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog, 87STATIC void xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog,
88 xfs_lsn_t tail_lsn); 88 xfs_lsn_t tail_lsn);
89#else 89#else
90#define xlog_verify_dest_ptr(a,b) 90#define xlog_verify_dest_ptr(a,b)
91#define xlog_verify_grant_head(a,b) 91#define xlog_verify_grant_tail(a)
92#define xlog_verify_iclog(a,b,c,d) 92#define xlog_verify_iclog(a,b,c,d)
93#define xlog_verify_tail_lsn(a,b,c) 93#define xlog_verify_tail_lsn(a,b,c)
94#endif 94#endif
95 95
96STATIC int xlog_iclogs_empty(xlog_t *log); 96STATIC int xlog_iclogs_empty(xlog_t *log);
97 97
98
99static void 98static void
100xlog_ins_ticketq(struct xlog_ticket **qp, struct xlog_ticket *tic) 99xlog_grant_sub_space(
100 struct log *log,
101 atomic64_t *head,
102 int bytes)
101{ 103{
102 if (*qp) { 104 int64_t head_val = atomic64_read(head);
103 tic->t_next = (*qp); 105 int64_t new, old;
104 tic->t_prev = (*qp)->t_prev;
105 (*qp)->t_prev->t_next = tic;
106 (*qp)->t_prev = tic;
107 } else {
108 tic->t_prev = tic->t_next = tic;
109 *qp = tic;
110 }
111 106
112 tic->t_flags |= XLOG_TIC_IN_Q; 107 do {
113} 108 int cycle, space;
114 109
115static void 110 xlog_crack_grant_head_val(head_val, &cycle, &space);
116xlog_del_ticketq(struct xlog_ticket **qp, struct xlog_ticket *tic)
117{
118 if (tic == tic->t_next) {
119 *qp = NULL;
120 } else {
121 *qp = tic->t_next;
122 tic->t_next->t_prev = tic->t_prev;
123 tic->t_prev->t_next = tic->t_next;
124 }
125 111
126 tic->t_next = tic->t_prev = NULL; 112 space -= bytes;
127 tic->t_flags &= ~XLOG_TIC_IN_Q; 113 if (space < 0) {
114 space += log->l_logsize;
115 cycle--;
116 }
117
118 old = head_val;
119 new = xlog_assign_grant_head_val(cycle, space);
120 head_val = atomic64_cmpxchg(head, old, new);
121 } while (head_val != old);
128} 122}
129 123
130static void 124static void
131xlog_grant_sub_space(struct log *log, int bytes) 125xlog_grant_add_space(
126 struct log *log,
127 atomic64_t *head,
128 int bytes)
132{ 129{
133 log->l_grant_write_bytes -= bytes; 130 int64_t head_val = atomic64_read(head);
134 if (log->l_grant_write_bytes < 0) { 131 int64_t new, old;
135 log->l_grant_write_bytes += log->l_logsize;
136 log->l_grant_write_cycle--;
137 }
138
139 log->l_grant_reserve_bytes -= bytes;
140 if ((log)->l_grant_reserve_bytes < 0) {
141 log->l_grant_reserve_bytes += log->l_logsize;
142 log->l_grant_reserve_cycle--;
143 }
144 132
145} 133 do {
134 int tmp;
135 int cycle, space;
146 136
147static void 137 xlog_crack_grant_head_val(head_val, &cycle, &space);
148xlog_grant_add_space_write(struct log *log, int bytes)
149{
150 int tmp = log->l_logsize - log->l_grant_write_bytes;
151 if (tmp > bytes)
152 log->l_grant_write_bytes += bytes;
153 else {
154 log->l_grant_write_cycle++;
155 log->l_grant_write_bytes = bytes - tmp;
156 }
157}
158 138
159static void 139 tmp = log->l_logsize - space;
160xlog_grant_add_space_reserve(struct log *log, int bytes) 140 if (tmp > bytes)
161{ 141 space += bytes;
162 int tmp = log->l_logsize - log->l_grant_reserve_bytes; 142 else {
163 if (tmp > bytes) 143 space = bytes - tmp;
164 log->l_grant_reserve_bytes += bytes; 144 cycle++;
165 else { 145 }
166 log->l_grant_reserve_cycle++;
167 log->l_grant_reserve_bytes = bytes - tmp;
168 }
169}
170 146
171static inline void 147 old = head_val;
172xlog_grant_add_space(struct log *log, int bytes) 148 new = xlog_assign_grant_head_val(cycle, space);
173{ 149 head_val = atomic64_cmpxchg(head, old, new);
174 xlog_grant_add_space_write(log, bytes); 150 } while (head_val != old);
175 xlog_grant_add_space_reserve(log, bytes);
176} 151}
177 152
178static void 153static void
@@ -355,7 +330,7 @@ xfs_log_reserve(
355 330
356 trace_xfs_log_reserve(log, internal_ticket); 331 trace_xfs_log_reserve(log, internal_ticket);
357 332
358 xlog_grant_push_ail(mp, internal_ticket->t_unit_res); 333 xlog_grant_push_ail(log, internal_ticket->t_unit_res);
359 retval = xlog_regrant_write_log_space(log, internal_ticket); 334 retval = xlog_regrant_write_log_space(log, internal_ticket);
360 } else { 335 } else {
361 /* may sleep if need to allocate more tickets */ 336 /* may sleep if need to allocate more tickets */
@@ -369,7 +344,7 @@ xfs_log_reserve(
369 344
370 trace_xfs_log_reserve(log, internal_ticket); 345 trace_xfs_log_reserve(log, internal_ticket);
371 346
372 xlog_grant_push_ail(mp, 347 xlog_grant_push_ail(log,
373 (internal_ticket->t_unit_res * 348 (internal_ticket->t_unit_res *
374 internal_ticket->t_cnt)); 349 internal_ticket->t_cnt));
375 retval = xlog_grant_log_space(log, internal_ticket); 350 retval = xlog_grant_log_space(log, internal_ticket);
@@ -399,11 +374,10 @@ xfs_log_mount(
399 int error; 374 int error;
400 375
401 if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) 376 if (!(mp->m_flags & XFS_MOUNT_NORECOVERY))
402 cmn_err(CE_NOTE, "XFS mounting filesystem %s", mp->m_fsname); 377 xfs_notice(mp, "Mounting Filesystem");
403 else { 378 else {
404 cmn_err(CE_NOTE, 379 xfs_notice(mp,
405 "!Mounting filesystem \"%s\" in no-recovery mode. Filesystem will be inconsistent.", 380"Mounting filesystem in no-recovery mode. Filesystem will be inconsistent.");
406 mp->m_fsname);
407 ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); 381 ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
408 } 382 }
409 383
@@ -418,7 +392,7 @@ xfs_log_mount(
418 */ 392 */
419 error = xfs_trans_ail_init(mp); 393 error = xfs_trans_ail_init(mp);
420 if (error) { 394 if (error) {
421 cmn_err(CE_WARN, "XFS: AIL initialisation failed: error %d", error); 395 xfs_warn(mp, "AIL initialisation failed: error %d", error);
422 goto out_free_log; 396 goto out_free_log;
423 } 397 }
424 mp->m_log->l_ailp = mp->m_ail; 398 mp->m_log->l_ailp = mp->m_ail;
@@ -438,7 +412,8 @@ xfs_log_mount(
438 if (readonly) 412 if (readonly)
439 mp->m_flags |= XFS_MOUNT_RDONLY; 413 mp->m_flags |= XFS_MOUNT_RDONLY;
440 if (error) { 414 if (error) {
441 cmn_err(CE_WARN, "XFS: log mount/recovery failed: error %d", error); 415 xfs_warn(mp, "log mount/recovery failed: error %d",
416 error);
442 goto out_destroy_ail; 417 goto out_destroy_ail;
443 } 418 }
444 } 419 }
@@ -567,10 +542,8 @@ xfs_log_unmount_write(xfs_mount_t *mp)
567 */ 542 */
568 } 543 }
569 544
570 if (error) { 545 if (error)
571 xfs_fs_cmn_err(CE_ALERT, mp, 546 xfs_alert(mp, "%s: unmount record failed", __func__);
572 "xfs_log_unmount: unmount record failed");
573 }
574 547
575 548
576 spin_lock(&log->l_icloglock); 549 spin_lock(&log->l_icloglock);
@@ -584,8 +557,8 @@ xfs_log_unmount_write(xfs_mount_t *mp)
584 if (!(iclog->ic_state == XLOG_STATE_ACTIVE || 557 if (!(iclog->ic_state == XLOG_STATE_ACTIVE ||
585 iclog->ic_state == XLOG_STATE_DIRTY)) { 558 iclog->ic_state == XLOG_STATE_DIRTY)) {
586 if (!XLOG_FORCED_SHUTDOWN(log)) { 559 if (!XLOG_FORCED_SHUTDOWN(log)) {
587 sv_wait(&iclog->ic_force_wait, PMEM, 560 xlog_wait(&iclog->ic_force_wait,
588 &log->l_icloglock, s); 561 &log->l_icloglock);
589 } else { 562 } else {
590 spin_unlock(&log->l_icloglock); 563 spin_unlock(&log->l_icloglock);
591 } 564 }
@@ -625,8 +598,8 @@ xfs_log_unmount_write(xfs_mount_t *mp)
625 || iclog->ic_state == XLOG_STATE_DIRTY 598 || iclog->ic_state == XLOG_STATE_DIRTY
626 || iclog->ic_state == XLOG_STATE_IOERROR) ) { 599 || iclog->ic_state == XLOG_STATE_IOERROR) ) {
627 600
628 sv_wait(&iclog->ic_force_wait, PMEM, 601 xlog_wait(&iclog->ic_force_wait,
629 &log->l_icloglock, s); 602 &log->l_icloglock);
630 } else { 603 } else {
631 spin_unlock(&log->l_icloglock); 604 spin_unlock(&log->l_icloglock);
632 } 605 }
@@ -703,55 +676,46 @@ xfs_log_move_tail(xfs_mount_t *mp,
703{ 676{
704 xlog_ticket_t *tic; 677 xlog_ticket_t *tic;
705 xlog_t *log = mp->m_log; 678 xlog_t *log = mp->m_log;
706 int need_bytes, free_bytes, cycle, bytes; 679 int need_bytes, free_bytes;
707 680
708 if (XLOG_FORCED_SHUTDOWN(log)) 681 if (XLOG_FORCED_SHUTDOWN(log))
709 return; 682 return;
710 683
711 if (tail_lsn == 0) { 684 if (tail_lsn == 0)
712 /* needed since sync_lsn is 64 bits */ 685 tail_lsn = atomic64_read(&log->l_last_sync_lsn);
713 spin_lock(&log->l_icloglock);
714 tail_lsn = log->l_last_sync_lsn;
715 spin_unlock(&log->l_icloglock);
716 }
717
718 spin_lock(&log->l_grant_lock);
719 686
720 /* Also an invalid lsn. 1 implies that we aren't passing in a valid 687 /* tail_lsn == 1 implies that we weren't passed a valid value. */
721 * tail_lsn. 688 if (tail_lsn != 1)
722 */ 689 atomic64_set(&log->l_tail_lsn, tail_lsn);
723 if (tail_lsn != 1) {
724 log->l_tail_lsn = tail_lsn;
725 }
726 690
727 if ((tic = log->l_write_headq)) { 691 if (!list_empty_careful(&log->l_writeq)) {
728#ifdef DEBUG 692#ifdef DEBUG
729 if (log->l_flags & XLOG_ACTIVE_RECOVERY) 693 if (log->l_flags & XLOG_ACTIVE_RECOVERY)
730 panic("Recovery problem"); 694 panic("Recovery problem");
731#endif 695#endif
732 cycle = log->l_grant_write_cycle; 696 spin_lock(&log->l_grant_write_lock);
733 bytes = log->l_grant_write_bytes; 697 free_bytes = xlog_space_left(log, &log->l_grant_write_head);
734 free_bytes = xlog_space_left(log, cycle, bytes); 698 list_for_each_entry(tic, &log->l_writeq, t_queue) {
735 do {
736 ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV); 699 ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV);
737 700
738 if (free_bytes < tic->t_unit_res && tail_lsn != 1) 701 if (free_bytes < tic->t_unit_res && tail_lsn != 1)
739 break; 702 break;
740 tail_lsn = 0; 703 tail_lsn = 0;
741 free_bytes -= tic->t_unit_res; 704 free_bytes -= tic->t_unit_res;
742 sv_signal(&tic->t_wait); 705 trace_xfs_log_regrant_write_wake_up(log, tic);
743 tic = tic->t_next; 706 wake_up(&tic->t_wait);
744 } while (tic != log->l_write_headq); 707 }
708 spin_unlock(&log->l_grant_write_lock);
745 } 709 }
746 if ((tic = log->l_reserve_headq)) { 710
711 if (!list_empty_careful(&log->l_reserveq)) {
747#ifdef DEBUG 712#ifdef DEBUG
748 if (log->l_flags & XLOG_ACTIVE_RECOVERY) 713 if (log->l_flags & XLOG_ACTIVE_RECOVERY)
749 panic("Recovery problem"); 714 panic("Recovery problem");
750#endif 715#endif
751 cycle = log->l_grant_reserve_cycle; 716 spin_lock(&log->l_grant_reserve_lock);
752 bytes = log->l_grant_reserve_bytes; 717 free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
753 free_bytes = xlog_space_left(log, cycle, bytes); 718 list_for_each_entry(tic, &log->l_reserveq, t_queue) {
754 do {
755 if (tic->t_flags & XLOG_TIC_PERM_RESERV) 719 if (tic->t_flags & XLOG_TIC_PERM_RESERV)
756 need_bytes = tic->t_unit_res*tic->t_cnt; 720 need_bytes = tic->t_unit_res*tic->t_cnt;
757 else 721 else
@@ -760,12 +724,12 @@ xfs_log_move_tail(xfs_mount_t *mp,
760 break; 724 break;
761 tail_lsn = 0; 725 tail_lsn = 0;
762 free_bytes -= need_bytes; 726 free_bytes -= need_bytes;
763 sv_signal(&tic->t_wait); 727 trace_xfs_log_grant_wake_up(log, tic);
764 tic = tic->t_next; 728 wake_up(&tic->t_wait);
765 } while (tic != log->l_reserve_headq); 729 }
730 spin_unlock(&log->l_grant_reserve_lock);
766 } 731 }
767 spin_unlock(&log->l_grant_lock); 732}
768} /* xfs_log_move_tail */
769 733
770/* 734/*
771 * Determine if we have a transaction that has gone to disk 735 * Determine if we have a transaction that has gone to disk
@@ -797,7 +761,7 @@ xfs_log_need_covered(xfs_mount_t *mp)
797 break; 761 break;
798 case XLOG_STATE_COVER_NEED: 762 case XLOG_STATE_COVER_NEED:
799 case XLOG_STATE_COVER_NEED2: 763 case XLOG_STATE_COVER_NEED2:
800 if (!xfs_trans_ail_tail(log->l_ailp) && 764 if (!xfs_ail_min_lsn(log->l_ailp) &&
801 xlog_iclogs_empty(log)) { 765 xlog_iclogs_empty(log)) {
802 if (log->l_covered_state == XLOG_STATE_COVER_NEED) 766 if (log->l_covered_state == XLOG_STATE_COVER_NEED)
803 log->l_covered_state = XLOG_STATE_COVER_DONE; 767 log->l_covered_state = XLOG_STATE_COVER_DONE;
@@ -831,23 +795,19 @@ xfs_log_need_covered(xfs_mount_t *mp)
831 * We may be holding the log iclog lock upon entering this routine. 795 * We may be holding the log iclog lock upon entering this routine.
832 */ 796 */
833xfs_lsn_t 797xfs_lsn_t
834xlog_assign_tail_lsn(xfs_mount_t *mp) 798xlog_assign_tail_lsn(
799 struct xfs_mount *mp)
835{ 800{
836 xfs_lsn_t tail_lsn; 801 xfs_lsn_t tail_lsn;
837 xlog_t *log = mp->m_log; 802 struct log *log = mp->m_log;
838 803
839 tail_lsn = xfs_trans_ail_tail(mp->m_ail); 804 tail_lsn = xfs_ail_min_lsn(mp->m_ail);
840 spin_lock(&log->l_grant_lock); 805 if (!tail_lsn)
841 if (tail_lsn != 0) { 806 tail_lsn = atomic64_read(&log->l_last_sync_lsn);
842 log->l_tail_lsn = tail_lsn;
843 } else {
844 tail_lsn = log->l_tail_lsn = log->l_last_sync_lsn;
845 }
846 spin_unlock(&log->l_grant_lock);
847 807
808 atomic64_set(&log->l_tail_lsn, tail_lsn);
848 return tail_lsn; 809 return tail_lsn;
849} /* xlog_assign_tail_lsn */ 810}
850
851 811
852/* 812/*
853 * Return the space in the log between the tail and the head. The head 813 * Return the space in the log between the tail and the head. The head
@@ -864,37 +824,42 @@ xlog_assign_tail_lsn(xfs_mount_t *mp)
864 * result is that we return the size of the log as the amount of space left. 824 * result is that we return the size of the log as the amount of space left.
865 */ 825 */
866STATIC int 826STATIC int
867xlog_space_left(xlog_t *log, int cycle, int bytes) 827xlog_space_left(
868{ 828 struct log *log,
869 int free_bytes; 829 atomic64_t *head)
870 int tail_bytes; 830{
871 int tail_cycle; 831 int free_bytes;
872 832 int tail_bytes;
873 tail_bytes = BBTOB(BLOCK_LSN(log->l_tail_lsn)); 833 int tail_cycle;
874 tail_cycle = CYCLE_LSN(log->l_tail_lsn); 834 int head_cycle;
875 if ((tail_cycle == cycle) && (bytes >= tail_bytes)) { 835 int head_bytes;
876 free_bytes = log->l_logsize - (bytes - tail_bytes); 836
877 } else if ((tail_cycle + 1) < cycle) { 837 xlog_crack_grant_head(head, &head_cycle, &head_bytes);
838 xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_bytes);
839 tail_bytes = BBTOB(tail_bytes);
840 if (tail_cycle == head_cycle && head_bytes >= tail_bytes)
841 free_bytes = log->l_logsize - (head_bytes - tail_bytes);
842 else if (tail_cycle + 1 < head_cycle)
878 return 0; 843 return 0;
879 } else if (tail_cycle < cycle) { 844 else if (tail_cycle < head_cycle) {
880 ASSERT(tail_cycle == (cycle - 1)); 845 ASSERT(tail_cycle == (head_cycle - 1));
881 free_bytes = tail_bytes - bytes; 846 free_bytes = tail_bytes - head_bytes;
882 } else { 847 } else {
883 /* 848 /*
884 * The reservation head is behind the tail. 849 * The reservation head is behind the tail.
885 * In this case we just want to return the size of the 850 * In this case we just want to return the size of the
886 * log as the amount of space left. 851 * log as the amount of space left.
887 */ 852 */
888 xfs_fs_cmn_err(CE_ALERT, log->l_mp, 853 xfs_alert(log->l_mp,
889 "xlog_space_left: head behind tail\n" 854 "xlog_space_left: head behind tail\n"
890 " tail_cycle = %d, tail_bytes = %d\n" 855 " tail_cycle = %d, tail_bytes = %d\n"
891 " GH cycle = %d, GH bytes = %d", 856 " GH cycle = %d, GH bytes = %d",
892 tail_cycle, tail_bytes, cycle, bytes); 857 tail_cycle, tail_bytes, head_cycle, head_bytes);
893 ASSERT(0); 858 ASSERT(0);
894 free_bytes = log->l_logsize; 859 free_bytes = log->l_logsize;
895 } 860 }
896 return free_bytes; 861 return free_bytes;
897} /* xlog_space_left */ 862}
898 863
899 864
900/* 865/*
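
The rewritten xlog_space_left() above no longer takes a (cycle, bytes) pair; it cracks a single 64-bit grant head into those two values. A minimal, self-contained sketch of that packing, with the cycle in the high 32 bits and the byte offset in the low 32 bits — the layout is an assumption inferred from xlog_assign_lsn() in the xfs_log_priv.h hunk further down, and grant_head_val()/crack_grant_head() are illustrative stand-ins for the real xlog_assign_grant_head()/xlog_crack_grant_head() helpers:

#include <assert.h>
#include <stdint.h>

/* Pack a grant head: cycle number in the high 32 bits, byte offset low. */
static inline int64_t grant_head_val(int cycle, int bytes)
{
	return ((int64_t)cycle << 32) | (uint32_t)bytes;
}

/* Crack it back apart, the way xlog_space_left() consumes it above. */
static inline void crack_grant_head(int64_t val, int *cycle, int *bytes)
{
	*cycle = val >> 32;
	*bytes = val & 0xffffffff;
}

int main(void)
{
	int cycle, bytes;

	crack_grant_head(grant_head_val(7, 51200), &cycle, &bytes);
	assert(cycle == 7 && bytes == 51200);
	return 0;
}

Packing both halves into one atomic64_t is what lets the space calculation run without l_grant_lock: a single atomic read yields a mutually consistent cycle/bytes pair.
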
@@ -1034,7 +999,7 @@ xlog_alloc_log(xfs_mount_t *mp,
1034 999
1035 log = kmem_zalloc(sizeof(xlog_t), KM_MAYFAIL); 1000 log = kmem_zalloc(sizeof(xlog_t), KM_MAYFAIL);
1036 if (!log) { 1001 if (!log) {
1037 xlog_warn("XFS: Log allocation failed: No memory!"); 1002 xfs_warn(mp, "Log allocation failed: No memory!");
1038 goto out; 1003 goto out;
1039 } 1004 }
1040 1005
@@ -1047,35 +1012,39 @@ xlog_alloc_log(xfs_mount_t *mp,
1047 log->l_flags |= XLOG_ACTIVE_RECOVERY; 1012 log->l_flags |= XLOG_ACTIVE_RECOVERY;
1048 1013
1049 log->l_prev_block = -1; 1014 log->l_prev_block = -1;
1050 log->l_tail_lsn = xlog_assign_lsn(1, 0);
1051 /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */ 1015 /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */
1052 log->l_last_sync_lsn = log->l_tail_lsn; 1016 xlog_assign_atomic_lsn(&log->l_tail_lsn, 1, 0);
1017 xlog_assign_atomic_lsn(&log->l_last_sync_lsn, 1, 0);
1053 log->l_curr_cycle = 1; /* 0 is bad since this is initial value */ 1018 log->l_curr_cycle = 1; /* 0 is bad since this is initial value */
1054 log->l_grant_reserve_cycle = 1; 1019 xlog_assign_grant_head(&log->l_grant_reserve_head, 1, 0);
1055 log->l_grant_write_cycle = 1; 1020 xlog_assign_grant_head(&log->l_grant_write_head, 1, 0);
1021 INIT_LIST_HEAD(&log->l_reserveq);
1022 INIT_LIST_HEAD(&log->l_writeq);
1023 spin_lock_init(&log->l_grant_reserve_lock);
1024 spin_lock_init(&log->l_grant_write_lock);
1056 1025
1057 error = EFSCORRUPTED; 1026 error = EFSCORRUPTED;
1058 if (xfs_sb_version_hassector(&mp->m_sb)) { 1027 if (xfs_sb_version_hassector(&mp->m_sb)) {
1059 log2_size = mp->m_sb.sb_logsectlog; 1028 log2_size = mp->m_sb.sb_logsectlog;
1060 if (log2_size < BBSHIFT) { 1029 if (log2_size < BBSHIFT) {
1061 xlog_warn("XFS: Log sector size too small " 1030 xfs_warn(mp, "Log sector size too small (0x%x < 0x%x)",
1062 "(0x%x < 0x%x)", log2_size, BBSHIFT); 1031 log2_size, BBSHIFT);
1063 goto out_free_log; 1032 goto out_free_log;
1064 } 1033 }
1065 1034
1066 log2_size -= BBSHIFT; 1035 log2_size -= BBSHIFT;
1067 if (log2_size > mp->m_sectbb_log) { 1036 if (log2_size > mp->m_sectbb_log) {
1068 xlog_warn("XFS: Log sector size too large " 1037 xfs_warn(mp, "Log sector size too large (0x%x > 0x%x)",
1069 "(0x%x > 0x%x)", log2_size, mp->m_sectbb_log); 1038 log2_size, mp->m_sectbb_log);
1070 goto out_free_log; 1039 goto out_free_log;
1071 } 1040 }
1072 1041
1073 /* for larger sector sizes, must have v2 or external log */ 1042 /* for larger sector sizes, must have v2 or external log */
1074 if (log2_size && log->l_logBBstart > 0 && 1043 if (log2_size && log->l_logBBstart > 0 &&
1075 !xfs_sb_version_haslogv2(&mp->m_sb)) { 1044 !xfs_sb_version_haslogv2(&mp->m_sb)) {
1076 1045 xfs_warn(mp,
1077 xlog_warn("XFS: log sector size (0x%x) invalid " 1046 "log sector size (0x%x) invalid for configuration.",
1078 "for configuration.", log2_size); 1047 log2_size);
1079 goto out_free_log; 1048 goto out_free_log;
1080 } 1049 }
1081 } 1050 }
@@ -1094,8 +1063,7 @@ xlog_alloc_log(xfs_mount_t *mp,
1094 log->l_xbuf = bp; 1063 log->l_xbuf = bp;
1095 1064
1096 spin_lock_init(&log->l_icloglock); 1065 spin_lock_init(&log->l_icloglock);
1097 spin_lock_init(&log->l_grant_lock); 1066 init_waitqueue_head(&log->l_flush_wait);
1098 sv_init(&log->l_flush_wait, 0, "flush_wait");
1099 1067
1100 /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */ 1068 /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */
1101 ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0); 1069 ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0);
@@ -1151,8 +1119,8 @@ xlog_alloc_log(xfs_mount_t *mp,
1151 1119
1152 ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp)); 1120 ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp));
1153 ASSERT(XFS_BUF_VALUSEMA(iclog->ic_bp) <= 0); 1121 ASSERT(XFS_BUF_VALUSEMA(iclog->ic_bp) <= 0);
1154 sv_init(&iclog->ic_force_wait, SV_DEFAULT, "iclog-force"); 1122 init_waitqueue_head(&iclog->ic_force_wait);
1155 sv_init(&iclog->ic_write_wait, SV_DEFAULT, "iclog-write"); 1123 init_waitqueue_head(&iclog->ic_write_wait);
1156 1124
1157 iclogp = &iclog->ic_next; 1125 iclogp = &iclog->ic_next;
1158 } 1126 }
@@ -1167,15 +1135,11 @@ xlog_alloc_log(xfs_mount_t *mp,
1167out_free_iclog: 1135out_free_iclog:
1168 for (iclog = log->l_iclog; iclog; iclog = prev_iclog) { 1136 for (iclog = log->l_iclog; iclog; iclog = prev_iclog) {
1169 prev_iclog = iclog->ic_next; 1137 prev_iclog = iclog->ic_next;
1170 if (iclog->ic_bp) { 1138 if (iclog->ic_bp)
1171 sv_destroy(&iclog->ic_force_wait);
1172 sv_destroy(&iclog->ic_write_wait);
1173 xfs_buf_free(iclog->ic_bp); 1139 xfs_buf_free(iclog->ic_bp);
1174 }
1175 kmem_free(iclog); 1140 kmem_free(iclog);
1176 } 1141 }
1177 spinlock_destroy(&log->l_icloglock); 1142 spinlock_destroy(&log->l_icloglock);
1178 spinlock_destroy(&log->l_grant_lock);
1179 xfs_buf_free(log->l_xbuf); 1143 xfs_buf_free(log->l_xbuf);
1180out_free_log: 1144out_free_log:
1181 kmem_free(log); 1145 kmem_free(log);
@@ -1223,61 +1187,60 @@ xlog_commit_record(
1223 * water mark. In this manner, we would be creating a low water mark. 1187 * water mark. In this manner, we would be creating a low water mark.
1224 */ 1188 */
1225STATIC void 1189STATIC void
1226xlog_grant_push_ail(xfs_mount_t *mp, 1190xlog_grant_push_ail(
1227 int need_bytes) 1191 struct log *log,
1192 int need_bytes)
1228{ 1193{
1229 xlog_t *log = mp->m_log; /* pointer to the log */ 1194 xfs_lsn_t threshold_lsn = 0;
1230 xfs_lsn_t tail_lsn; /* lsn of the log tail */ 1195 xfs_lsn_t last_sync_lsn;
1231 xfs_lsn_t threshold_lsn = 0; /* lsn we'd like to be at */ 1196 int free_blocks;
1232 int free_blocks; /* free blocks left to write to */ 1197 int free_bytes;
1233 int free_bytes; /* free bytes left to write to */ 1198 int threshold_block;
1234 int threshold_block; /* block in lsn we'd like to be at */ 1199 int threshold_cycle;
1235 int threshold_cycle; /* lsn cycle we'd like to be at */ 1200 int free_threshold;
1236 int free_threshold; 1201
1237 1202 ASSERT(BTOBB(need_bytes) < log->l_logBBsize);
1238 ASSERT(BTOBB(need_bytes) < log->l_logBBsize); 1203
1239 1204 free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
1240 spin_lock(&log->l_grant_lock); 1205 free_blocks = BTOBBT(free_bytes);
1241 free_bytes = xlog_space_left(log, 1206
1242 log->l_grant_reserve_cycle, 1207 /*
1243 log->l_grant_reserve_bytes); 1208 * Set the threshold for the minimum number of free blocks in the
1244 tail_lsn = log->l_tail_lsn; 1209 * log to the maximum of what the caller needs, one quarter of the
1245 free_blocks = BTOBBT(free_bytes); 1210 * log, and 256 blocks.
1246 1211 */
1247 /* 1212 free_threshold = BTOBB(need_bytes);
1248 * Set the threshold for the minimum number of free blocks in the 1213 free_threshold = MAX(free_threshold, (log->l_logBBsize >> 2));
1249 * log to the maximum of what the caller needs, one quarter of the 1214 free_threshold = MAX(free_threshold, 256);
1250 * log, and 256 blocks. 1215 if (free_blocks >= free_threshold)
1251 */ 1216 return;
1252 free_threshold = BTOBB(need_bytes); 1217
1253 free_threshold = MAX(free_threshold, (log->l_logBBsize >> 2)); 1218 xlog_crack_atomic_lsn(&log->l_tail_lsn, &threshold_cycle,
1254 free_threshold = MAX(free_threshold, 256); 1219 &threshold_block);
1255 if (free_blocks < free_threshold) { 1220 threshold_block += free_threshold;
1256 threshold_block = BLOCK_LSN(tail_lsn) + free_threshold;
1257 threshold_cycle = CYCLE_LSN(tail_lsn);
1258 if (threshold_block >= log->l_logBBsize) { 1221 if (threshold_block >= log->l_logBBsize) {
1259 threshold_block -= log->l_logBBsize; 1222 threshold_block -= log->l_logBBsize;
1260 threshold_cycle += 1; 1223 threshold_cycle += 1;
1261 } 1224 }
1262 threshold_lsn = xlog_assign_lsn(threshold_cycle, threshold_block); 1225 threshold_lsn = xlog_assign_lsn(threshold_cycle,
1226 threshold_block);
1227 /*
1228 * Don't pass in an lsn greater than the lsn of the last
1229 * log record known to be on disk. Use a snapshot of the last sync lsn
1230 * so that it doesn't change between the compare and the set.
1231 */
1232 last_sync_lsn = atomic64_read(&log->l_last_sync_lsn);
1233 if (XFS_LSN_CMP(threshold_lsn, last_sync_lsn) > 0)
1234 threshold_lsn = last_sync_lsn;
1263 1235
1264 /* Don't pass in an lsn greater than the lsn of the last 1236 /*
1265 * log record known to be on disk. 1237 * Get the transaction layer to kick the dirty buffers out to
1238 * disk asynchronously. No point in trying to do this if
1239 * the filesystem is shutting down.
1266 */ 1240 */
1267 if (XFS_LSN_CMP(threshold_lsn, log->l_last_sync_lsn) > 0) 1241 if (!XLOG_FORCED_SHUTDOWN(log))
1268 threshold_lsn = log->l_last_sync_lsn; 1242 xfs_ail_push(log->l_ailp, threshold_lsn);
1269 } 1243}
1270 spin_unlock(&log->l_grant_lock);
1271
1272 /*
1273 * Get the transaction layer to kick the dirty buffers out to
1274 * disk asynchronously. No point in trying to do this if
1275 * the filesystem is shutting down.
1276 */
1277 if (threshold_lsn &&
1278 !XLOG_FORCED_SHUTDOWN(log))
1279 xfs_trans_ail_push(log->l_ailp, threshold_lsn);
1280} /* xlog_grant_push_ail */
1281 1244
1282/* 1245/*
1283 * The bdstrat callback function for log bufs. This gives us a central 1246 * The bdstrat callback function for log bufs. This gives us a central
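
The new xlog_grant_push_ail() above clamps its push target against a single snapshot of l_last_sync_lsn, so the value it compares is the same value it assigns. A small sketch of that snapshot-then-clamp pattern, assuming C11 atomics and a plain integer comparison standing in for XFS_LSN_CMP (equivalent here because an LSN packs the cycle above the block number, so a larger packed value is a later LSN):

#include <stdatomic.h>
#include <stdint.h>

/*
 * Clamp a proposed AIL push target to the last LSN known to be on disk.
 * Reading the atomic exactly once into a local keeps the compare and the
 * clamp working on the same value, even if the shared LSN moves meanwhile.
 */
static int64_t clamp_push_target(_Atomic int64_t *last_sync_lsn,
				 int64_t threshold_lsn)
{
	int64_t snapshot = atomic_load(last_sync_lsn);	/* one read only */

	if (threshold_lsn > snapshot)
		threshold_lsn = snapshot;
	return threshold_lsn;
}
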
@@ -1372,9 +1335,8 @@ xlog_sync(xlog_t *log,
1372 roundoff < BBTOB(1))); 1335 roundoff < BBTOB(1)));
1373 1336
1374 /* move grant heads by roundoff in sync */ 1337 /* move grant heads by roundoff in sync */
1375 spin_lock(&log->l_grant_lock); 1338 xlog_grant_add_space(log, &log->l_grant_reserve_head, roundoff);
1376 xlog_grant_add_space(log, roundoff); 1339 xlog_grant_add_space(log, &log->l_grant_write_head, roundoff);
1377 spin_unlock(&log->l_grant_lock);
1378 1340
1379 /* put cycle number in every block */ 1341 /* put cycle number in every block */
1380 xlog_pack_data(log, iclog, roundoff); 1342 xlog_pack_data(log, iclog, roundoff);
@@ -1489,15 +1451,12 @@ xlog_dealloc_log(xlog_t *log)
1489 1451
1490 iclog = log->l_iclog; 1452 iclog = log->l_iclog;
1491 for (i=0; i<log->l_iclog_bufs; i++) { 1453 for (i=0; i<log->l_iclog_bufs; i++) {
1492 sv_destroy(&iclog->ic_force_wait);
1493 sv_destroy(&iclog->ic_write_wait);
1494 xfs_buf_free(iclog->ic_bp); 1454 xfs_buf_free(iclog->ic_bp);
1495 next_iclog = iclog->ic_next; 1455 next_iclog = iclog->ic_next;
1496 kmem_free(iclog); 1456 kmem_free(iclog);
1497 iclog = next_iclog; 1457 iclog = next_iclog;
1498 } 1458 }
1499 spinlock_destroy(&log->l_icloglock); 1459 spinlock_destroy(&log->l_icloglock);
1500 spinlock_destroy(&log->l_grant_lock);
1501 1460
1502 xfs_buf_free(log->l_xbuf); 1461 xfs_buf_free(log->l_xbuf);
1503 log->l_mp->m_log = NULL; 1462 log->l_mp->m_log = NULL;
@@ -1602,38 +1561,36 @@ xlog_print_tic_res(
1602 "SWAPEXT" 1561 "SWAPEXT"
1603 }; 1562 };
1604 1563
1605 xfs_fs_cmn_err(CE_WARN, mp, 1564 xfs_warn(mp,
1606 "xfs_log_write: reservation summary:\n" 1565 "xfs_log_write: reservation summary:\n"
1607 " trans type = %s (%u)\n" 1566 " trans type = %s (%u)\n"
1608 " unit res = %d bytes\n" 1567 " unit res = %d bytes\n"
1609 " current res = %d bytes\n" 1568 " current res = %d bytes\n"
1610 " total reg = %u bytes (o/flow = %u bytes)\n" 1569 " total reg = %u bytes (o/flow = %u bytes)\n"
1611 " ophdrs = %u (ophdr space = %u bytes)\n" 1570 " ophdrs = %u (ophdr space = %u bytes)\n"
1612 " ophdr + reg = %u bytes\n" 1571 " ophdr + reg = %u bytes\n"
1613 " num regions = %u\n", 1572 " num regions = %u\n",
1614 ((ticket->t_trans_type <= 0 || 1573 ((ticket->t_trans_type <= 0 ||
1615 ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ? 1574 ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ?
1616 "bad-trans-type" : trans_type_str[ticket->t_trans_type-1]), 1575 "bad-trans-type" : trans_type_str[ticket->t_trans_type-1]),
1617 ticket->t_trans_type, 1576 ticket->t_trans_type,
1618 ticket->t_unit_res, 1577 ticket->t_unit_res,
1619 ticket->t_curr_res, 1578 ticket->t_curr_res,
1620 ticket->t_res_arr_sum, ticket->t_res_o_flow, 1579 ticket->t_res_arr_sum, ticket->t_res_o_flow,
1621 ticket->t_res_num_ophdrs, ophdr_spc, 1580 ticket->t_res_num_ophdrs, ophdr_spc,
1622 ticket->t_res_arr_sum + 1581 ticket->t_res_arr_sum +
1623 ticket->t_res_o_flow + ophdr_spc, 1582 ticket->t_res_o_flow + ophdr_spc,
1624 ticket->t_res_num); 1583 ticket->t_res_num);
1625 1584
1626 for (i = 0; i < ticket->t_res_num; i++) { 1585 for (i = 0; i < ticket->t_res_num; i++) {
1627 uint r_type = ticket->t_res_arr[i].r_type; 1586 uint r_type = ticket->t_res_arr[i].r_type;
1628 cmn_err(CE_WARN, 1587 xfs_warn(mp, "region[%u]: %s - %u bytes\n", i,
1629 "region[%u]: %s - %u bytes\n",
1630 i,
1631 ((r_type <= 0 || r_type > XLOG_REG_TYPE_MAX) ? 1588 ((r_type <= 0 || r_type > XLOG_REG_TYPE_MAX) ?
1632 "bad-rtype" : res_type_str[r_type-1]), 1589 "bad-rtype" : res_type_str[r_type-1]),
1633 ticket->t_res_arr[i].r_len); 1590 ticket->t_res_arr[i].r_len);
1634 } 1591 }
1635 1592
1636 xfs_cmn_err(XFS_PTAG_LOGRES, CE_ALERT, mp, 1593 xfs_alert_tag(mp, XFS_PTAG_LOGRES,
1637 "xfs_log_write: reservation ran out. Need to up reservation"); 1594 "xfs_log_write: reservation ran out. Need to up reservation");
1638 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 1595 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
1639} 1596}
@@ -1721,7 +1678,7 @@ xlog_write_setup_ophdr(
1721 case XFS_LOG: 1678 case XFS_LOG:
1722 break; 1679 break;
1723 default: 1680 default:
1724 xfs_fs_cmn_err(CE_WARN, log->l_mp, 1681 xfs_warn(log->l_mp,
1725 "Bad XFS transaction clientid 0x%x in ticket 0x%p", 1682 "Bad XFS transaction clientid 0x%x in ticket 0x%p",
1726 ophdr->oh_clientid, ticket); 1683 ophdr->oh_clientid, ticket);
1727 return NULL; 1684 return NULL;
@@ -2232,7 +2189,7 @@ xlog_state_do_callback(
2232 lowest_lsn = xlog_get_lowest_lsn(log); 2189 lowest_lsn = xlog_get_lowest_lsn(log);
2233 if (lowest_lsn && 2190 if (lowest_lsn &&
2234 XFS_LSN_CMP(lowest_lsn, 2191 XFS_LSN_CMP(lowest_lsn,
2235 be64_to_cpu(iclog->ic_header.h_lsn)) < 0) { 2192 be64_to_cpu(iclog->ic_header.h_lsn)) < 0) {
2236 iclog = iclog->ic_next; 2193 iclog = iclog->ic_next;
2237 continue; /* Leave this iclog for 2194 continue; /* Leave this iclog for
2238 * another thread */ 2195 * another thread */
@@ -2240,23 +2197,21 @@ xlog_state_do_callback(
2240 2197
2241 iclog->ic_state = XLOG_STATE_CALLBACK; 2198 iclog->ic_state = XLOG_STATE_CALLBACK;
2242 2199
2243 spin_unlock(&log->l_icloglock);
2244 2200
2245 /* l_last_sync_lsn field protected by 2201 /*
2246 * l_grant_lock. Don't worry about iclog's lsn. 2202 * update the last_sync_lsn before we drop the
2247 * No one else can be here except us. 2203 * icloglock to ensure we are the only one that
2204 * can update it.
2248 */ 2205 */
2249 spin_lock(&log->l_grant_lock); 2206 ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn),
2250 ASSERT(XFS_LSN_CMP(log->l_last_sync_lsn, 2207 be64_to_cpu(iclog->ic_header.h_lsn)) <= 0);
2251 be64_to_cpu(iclog->ic_header.h_lsn)) <= 0); 2208 atomic64_set(&log->l_last_sync_lsn,
2252 log->l_last_sync_lsn = 2209 be64_to_cpu(iclog->ic_header.h_lsn));
2253 be64_to_cpu(iclog->ic_header.h_lsn);
2254 spin_unlock(&log->l_grant_lock);
2255 2210
2256 } else { 2211 } else
2257 spin_unlock(&log->l_icloglock);
2258 ioerrors++; 2212 ioerrors++;
2259 } 2213
2214 spin_unlock(&log->l_icloglock);
2260 2215
2261 /* 2216 /*
2262 * Keep processing entries in the callback list until 2217 * Keep processing entries in the callback list until
@@ -2297,7 +2252,7 @@ xlog_state_do_callback(
2297 xlog_state_clean_log(log); 2252 xlog_state_clean_log(log);
2298 2253
2299 /* wake up threads waiting in xfs_log_force() */ 2254 /* wake up threads waiting in xfs_log_force() */
2300 sv_broadcast(&iclog->ic_force_wait); 2255 wake_up_all(&iclog->ic_force_wait);
2301 2256
2302 iclog = iclog->ic_next; 2257 iclog = iclog->ic_next;
2303 } while (first_iclog != iclog); 2258 } while (first_iclog != iclog);
@@ -2305,7 +2260,7 @@ xlog_state_do_callback(
2305 if (repeats > 5000) { 2260 if (repeats > 5000) {
2306 flushcnt += repeats; 2261 flushcnt += repeats;
2307 repeats = 0; 2262 repeats = 0;
2308 xfs_fs_cmn_err(CE_WARN, log->l_mp, 2263 xfs_warn(log->l_mp,
2309 "%s: possible infinite loop (%d iterations)", 2264 "%s: possible infinite loop (%d iterations)",
2310 __func__, flushcnt); 2265 __func__, flushcnt);
2311 } 2266 }
@@ -2344,7 +2299,7 @@ xlog_state_do_callback(
2344 spin_unlock(&log->l_icloglock); 2299 spin_unlock(&log->l_icloglock);
2345 2300
2346 if (wake) 2301 if (wake)
2347 sv_broadcast(&log->l_flush_wait); 2302 wake_up_all(&log->l_flush_wait);
2348} 2303}
2349 2304
2350 2305
@@ -2395,7 +2350,7 @@ xlog_state_done_syncing(
2395 * iclog buffer, we wake them all, one will get to do the 2350 * iclog buffer, we wake them all, one will get to do the
2396 * I/O, the others get to wait for the result. 2351 * I/O, the others get to wait for the result.
2397 */ 2352 */
2398 sv_broadcast(&iclog->ic_write_wait); 2353 wake_up_all(&iclog->ic_write_wait);
2399 spin_unlock(&log->l_icloglock); 2354 spin_unlock(&log->l_icloglock);
2400 xlog_state_do_callback(log, aborted, iclog); /* also cleans log */ 2355 xlog_state_do_callback(log, aborted, iclog); /* also cleans log */
2401} /* xlog_state_done_syncing */ 2356} /* xlog_state_done_syncing */
@@ -2444,7 +2399,7 @@ restart:
2444 XFS_STATS_INC(xs_log_noiclogs); 2399 XFS_STATS_INC(xs_log_noiclogs);
2445 2400
2446 /* Wait for log writes to have flushed */ 2401 /* Wait for log writes to have flushed */
2447 sv_wait(&log->l_flush_wait, 0, &log->l_icloglock, 0); 2402 xlog_wait(&log->l_flush_wait, &log->l_icloglock);
2448 goto restart; 2403 goto restart;
2449 } 2404 }
2450 2405
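
Every sv_wait() call in this file becomes xlog_wait(), which sleeps on an ordinary wait_queue_head_t while giving up the spinlock that protects the queue. A sketch of what such a helper has to do — the body below is an assumption about the helper this series adds to xfs_log_priv.h, not a quote of it:

#include <linux/sched.h>
#include <linux/spinlock.h>
#include <linux/wait.h>

/*
 * Queue the task exclusively, mark it sleeping, and only then drop the
 * lock and schedule.  Doing it in this order means a concurrent wake_up()
 * cannot be lost between releasing the lock and going to sleep.
 */
static inline void xlog_wait_sketch(wait_queue_head_t *wq, spinlock_t *lock)
	__releases(lock)
{
	DECLARE_WAITQUEUE(wait, current);

	add_wait_queue_exclusive(wq, &wait);
	__set_current_state(TASK_UNINTERRUPTIBLE);
	spin_unlock(lock);
	schedule();
	remove_wait_queue(wq, &wait);
}

The lock is not re-taken on return, which is why the grant paths below jump back to their redo: labels and re-evaluate free space after waking.
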
@@ -2527,6 +2482,18 @@ restart:
2527 * 2482 *
2528 * Once a ticket gets put onto the reserveq, it will only return after 2483 * Once a ticket gets put onto the reserveq, it will only return after
2529 * the needed reservation is satisfied. 2484 * the needed reservation is satisfied.
2485 *
2486 * This function is structured so that it has a lock free fast path. This is
2487 * necessary because every new transaction reservation will come through this
2488 * path. Hence any lock will be globally hot if we take it unconditionally on
2489 * every pass.
2490 *
2491 * As tickets are only ever moved on and off the reserveq under the
2492 * l_grant_reserve_lock, we only need to take that lock if we are going
2493 * to add the ticket to the queue and sleep. We can avoid taking the lock if the
2494 * ticket was never added to the reserveq because the t_queue list head will be
2495 * empty and we hold the only reference to it so it can safely be checked
2496 * unlocked.
2530 */ 2497 */
2531STATIC int 2498STATIC int
2532xlog_grant_log_space(xlog_t *log, 2499xlog_grant_log_space(xlog_t *log,
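
The comment block above describes the lock-free fast path that the following hunks implement: peek at the reserve queue without the lock, and take l_grant_reserve_lock only once queueing looks necessary, rechecking under the lock. The shape of that check, reduced to a sketch (maybe_queue_ticket() is an illustrative name; the list and lock stand in for l_reserveq and l_grant_reserve_lock, and the real code keeps the lock held until it sleeps):

#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/types.h>

/*
 * Unlocked peek first; the lock is only taken on the slow path.  Because
 * the peek can race with the last waiter dequeueing itself, the emptiness
 * test is repeated under the lock before the ticket is added.
 */
static bool maybe_queue_ticket(spinlock_t *lock, struct list_head *queue,
			       struct list_head *ticket)
{
	if (list_empty_careful(queue))
		return false;			/* fast path: no waiters */

	spin_lock(lock);
	if (list_empty(queue)) {		/* raced: queue just drained */
		spin_unlock(lock);
		return false;
	}
	list_add_tail(ticket, queue);		/* join the line of waiters */
	spin_unlock(lock);
	return true;
}
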
@@ -2534,24 +2501,27 @@ xlog_grant_log_space(xlog_t *log,
2534{ 2501{
2535 int free_bytes; 2502 int free_bytes;
2536 int need_bytes; 2503 int need_bytes;
2537#ifdef DEBUG
2538 xfs_lsn_t tail_lsn;
2539#endif
2540
2541 2504
2542#ifdef DEBUG 2505#ifdef DEBUG
2543 if (log->l_flags & XLOG_ACTIVE_RECOVERY) 2506 if (log->l_flags & XLOG_ACTIVE_RECOVERY)
2544 panic("grant Recovery problem"); 2507 panic("grant Recovery problem");
2545#endif 2508#endif
2546 2509
2547 /* Is there space or do we need to sleep? */
2548 spin_lock(&log->l_grant_lock);
2549
2550 trace_xfs_log_grant_enter(log, tic); 2510 trace_xfs_log_grant_enter(log, tic);
2551 2511
2512 need_bytes = tic->t_unit_res;
2513 if (tic->t_flags & XFS_LOG_PERM_RESERV)
2514 need_bytes *= tic->t_ocnt;
2515
2552 /* something is already sleeping; insert new transaction at end */ 2516 /* something is already sleeping; insert new transaction at end */
2553 if (log->l_reserve_headq) { 2517 if (!list_empty_careful(&log->l_reserveq)) {
2554 xlog_ins_ticketq(&log->l_reserve_headq, tic); 2518 spin_lock(&log->l_grant_reserve_lock);
2519 /* recheck the queue now we are locked */
2520 if (list_empty(&log->l_reserveq)) {
2521 spin_unlock(&log->l_grant_reserve_lock);
2522 goto redo;
2523 }
2524 list_add_tail(&tic->t_queue, &log->l_reserveq);
2555 2525
2556 trace_xfs_log_grant_sleep1(log, tic); 2526 trace_xfs_log_grant_sleep1(log, tic);
2557 2527
@@ -2563,72 +2533,57 @@ xlog_grant_log_space(xlog_t *log,
2563 goto error_return; 2533 goto error_return;
2564 2534
2565 XFS_STATS_INC(xs_sleep_logspace); 2535 XFS_STATS_INC(xs_sleep_logspace);
2566 sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s); 2536 xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock);
2537
2567 /* 2538 /*
2568 * If we got an error, and the filesystem is shutting down, 2539 * If we got an error, and the filesystem is shutting down,
2569 * we'll catch it down below. So just continue... 2540 * we'll catch it down below. So just continue...
2570 */ 2541 */
2571 trace_xfs_log_grant_wake1(log, tic); 2542 trace_xfs_log_grant_wake1(log, tic);
2572 spin_lock(&log->l_grant_lock);
2573 } 2543 }
2574 if (tic->t_flags & XFS_LOG_PERM_RESERV)
2575 need_bytes = tic->t_unit_res*tic->t_ocnt;
2576 else
2577 need_bytes = tic->t_unit_res;
2578 2544
2579redo: 2545redo:
2580 if (XLOG_FORCED_SHUTDOWN(log)) 2546 if (XLOG_FORCED_SHUTDOWN(log))
2581 goto error_return; 2547 goto error_return_unlocked;
2582 2548
2583 free_bytes = xlog_space_left(log, log->l_grant_reserve_cycle, 2549 free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
2584 log->l_grant_reserve_bytes);
2585 if (free_bytes < need_bytes) { 2550 if (free_bytes < need_bytes) {
2586 if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) 2551 spin_lock(&log->l_grant_reserve_lock);
2587 xlog_ins_ticketq(&log->l_reserve_headq, tic); 2552 if (list_empty(&tic->t_queue))
2553 list_add_tail(&tic->t_queue, &log->l_reserveq);
2588 2554
2589 trace_xfs_log_grant_sleep2(log, tic); 2555 trace_xfs_log_grant_sleep2(log, tic);
2590 2556
2591 spin_unlock(&log->l_grant_lock);
2592 xlog_grant_push_ail(log->l_mp, need_bytes);
2593 spin_lock(&log->l_grant_lock);
2594
2595 XFS_STATS_INC(xs_sleep_logspace);
2596 sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s);
2597
2598 spin_lock(&log->l_grant_lock);
2599 if (XLOG_FORCED_SHUTDOWN(log)) 2557 if (XLOG_FORCED_SHUTDOWN(log))
2600 goto error_return; 2558 goto error_return;
2601 2559
2602 trace_xfs_log_grant_wake2(log, tic); 2560 xlog_grant_push_ail(log, need_bytes);
2603 2561
2562 XFS_STATS_INC(xs_sleep_logspace);
2563 xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock);
2564
2565 trace_xfs_log_grant_wake2(log, tic);
2604 goto redo; 2566 goto redo;
2605 } else if (tic->t_flags & XLOG_TIC_IN_Q) 2567 }
2606 xlog_del_ticketq(&log->l_reserve_headq, tic);
2607 2568
2608 /* we've got enough space */ 2569 if (!list_empty(&tic->t_queue)) {
2609 xlog_grant_add_space(log, need_bytes); 2570 spin_lock(&log->l_grant_reserve_lock);
2610#ifdef DEBUG 2571 list_del_init(&tic->t_queue);
2611 tail_lsn = log->l_tail_lsn; 2572 spin_unlock(&log->l_grant_reserve_lock);
2612 /*
2613 * Check to make sure the grant write head didn't just over lap the
2614 * tail. If the cycles are the same, we can't be overlapping.
2615 * Otherwise, make sure that the cycles differ by exactly one and
2616 * check the byte count.
2617 */
2618 if (CYCLE_LSN(tail_lsn) != log->l_grant_write_cycle) {
2619 ASSERT(log->l_grant_write_cycle-1 == CYCLE_LSN(tail_lsn));
2620 ASSERT(log->l_grant_write_bytes <= BBTOB(BLOCK_LSN(tail_lsn)));
2621 } 2573 }
2622#endif 2574
2575 /* we've got enough space */
2576 xlog_grant_add_space(log, &log->l_grant_reserve_head, need_bytes);
2577 xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes);
2623 trace_xfs_log_grant_exit(log, tic); 2578 trace_xfs_log_grant_exit(log, tic);
2624 xlog_verify_grant_head(log, 1); 2579 xlog_verify_grant_tail(log);
2625 spin_unlock(&log->l_grant_lock);
2626 return 0; 2580 return 0;
2627 2581
2628 error_return: 2582error_return_unlocked:
2629 if (tic->t_flags & XLOG_TIC_IN_Q) 2583 spin_lock(&log->l_grant_reserve_lock);
2630 xlog_del_ticketq(&log->l_reserve_headq, tic); 2584error_return:
2631 2585 list_del_init(&tic->t_queue);
2586 spin_unlock(&log->l_grant_reserve_lock);
2632 trace_xfs_log_grant_error(log, tic); 2587 trace_xfs_log_grant_error(log, tic);
2633 2588
2634 /* 2589 /*
@@ -2638,7 +2593,6 @@ redo:
2638 */ 2593 */
2639 tic->t_curr_res = 0; 2594 tic->t_curr_res = 0;
2640 tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ 2595 tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
2641 spin_unlock(&log->l_grant_lock);
2642 return XFS_ERROR(EIO); 2596 return XFS_ERROR(EIO);
2643} /* xlog_grant_log_space */ 2597} /* xlog_grant_log_space */
2644 2598
@@ -2646,17 +2600,14 @@ redo:
2646/* 2600/*
2647 * Replenish the byte reservation required by moving the grant write head. 2601 * Replenish the byte reservation required by moving the grant write head.
2648 * 2602 *
2649 * 2603 * Similar to xlog_grant_log_space, the function is structured to have a lock
2604 * free fast path.
2650 */ 2605 */
2651STATIC int 2606STATIC int
2652xlog_regrant_write_log_space(xlog_t *log, 2607xlog_regrant_write_log_space(xlog_t *log,
2653 xlog_ticket_t *tic) 2608 xlog_ticket_t *tic)
2654{ 2609{
2655 int free_bytes, need_bytes; 2610 int free_bytes, need_bytes;
2656 xlog_ticket_t *ntic;
2657#ifdef DEBUG
2658 xfs_lsn_t tail_lsn;
2659#endif
2660 2611
2661 tic->t_curr_res = tic->t_unit_res; 2612 tic->t_curr_res = tic->t_unit_res;
2662 xlog_tic_reset_res(tic); 2613 xlog_tic_reset_res(tic);
@@ -2669,12 +2620,9 @@ xlog_regrant_write_log_space(xlog_t *log,
2669 panic("regrant Recovery problem"); 2620 panic("regrant Recovery problem");
2670#endif 2621#endif
2671 2622
2672 spin_lock(&log->l_grant_lock);
2673
2674 trace_xfs_log_regrant_write_enter(log, tic); 2623 trace_xfs_log_regrant_write_enter(log, tic);
2675
2676 if (XLOG_FORCED_SHUTDOWN(log)) 2624 if (XLOG_FORCED_SHUTDOWN(log))
2677 goto error_return; 2625 goto error_return_unlocked;
2678 2626
2679 /* If there are other waiters on the queue then give them a 2627 /* If there are other waiters on the queue then give them a
2680 * chance at logspace before us. Wake up the first waiters, 2628 * chance at logspace before us. Wake up the first waiters,
@@ -2683,92 +2631,76 @@ xlog_regrant_write_log_space(xlog_t *log,
2683 * this transaction. 2631 * this transaction.
2684 */ 2632 */
2685 need_bytes = tic->t_unit_res; 2633 need_bytes = tic->t_unit_res;
2686 if ((ntic = log->l_write_headq)) { 2634 if (!list_empty_careful(&log->l_writeq)) {
2687 free_bytes = xlog_space_left(log, log->l_grant_write_cycle, 2635 struct xlog_ticket *ntic;
2688 log->l_grant_write_bytes); 2636
2689 do { 2637 spin_lock(&log->l_grant_write_lock);
2638 free_bytes = xlog_space_left(log, &log->l_grant_write_head);
2639 list_for_each_entry(ntic, &log->l_writeq, t_queue) {
2690 ASSERT(ntic->t_flags & XLOG_TIC_PERM_RESERV); 2640 ASSERT(ntic->t_flags & XLOG_TIC_PERM_RESERV);
2691 2641
2692 if (free_bytes < ntic->t_unit_res) 2642 if (free_bytes < ntic->t_unit_res)
2693 break; 2643 break;
2694 free_bytes -= ntic->t_unit_res; 2644 free_bytes -= ntic->t_unit_res;
2695 sv_signal(&ntic->t_wait); 2645 wake_up(&ntic->t_wait);
2696 ntic = ntic->t_next; 2646 }
2697 } while (ntic != log->l_write_headq);
2698
2699 if (ntic != log->l_write_headq) {
2700 if ((tic->t_flags & XLOG_TIC_IN_Q) == 0)
2701 xlog_ins_ticketq(&log->l_write_headq, tic);
2702 2647
2648 if (ntic != list_first_entry(&log->l_writeq,
2649 struct xlog_ticket, t_queue)) {
2650 if (list_empty(&tic->t_queue))
2651 list_add_tail(&tic->t_queue, &log->l_writeq);
2703 trace_xfs_log_regrant_write_sleep1(log, tic); 2652 trace_xfs_log_regrant_write_sleep1(log, tic);
2704 2653
2705 spin_unlock(&log->l_grant_lock); 2654 xlog_grant_push_ail(log, need_bytes);
2706 xlog_grant_push_ail(log->l_mp, need_bytes);
2707 spin_lock(&log->l_grant_lock);
2708 2655
2709 XFS_STATS_INC(xs_sleep_logspace); 2656 XFS_STATS_INC(xs_sleep_logspace);
2710 sv_wait(&tic->t_wait, PINOD|PLTWAIT, 2657 xlog_wait(&tic->t_wait, &log->l_grant_write_lock);
2711 &log->l_grant_lock, s);
2712
2713 /* If we're shutting down, this tic is already
2714 * off the queue */
2715 spin_lock(&log->l_grant_lock);
2716 if (XLOG_FORCED_SHUTDOWN(log))
2717 goto error_return;
2718
2719 trace_xfs_log_regrant_write_wake1(log, tic); 2658 trace_xfs_log_regrant_write_wake1(log, tic);
2720 } 2659 } else
2660 spin_unlock(&log->l_grant_write_lock);
2721 } 2661 }
2722 2662
2723redo: 2663redo:
2724 if (XLOG_FORCED_SHUTDOWN(log)) 2664 if (XLOG_FORCED_SHUTDOWN(log))
2725 goto error_return; 2665 goto error_return_unlocked;
2726 2666
2727 free_bytes = xlog_space_left(log, log->l_grant_write_cycle, 2667 free_bytes = xlog_space_left(log, &log->l_grant_write_head);
2728 log->l_grant_write_bytes);
2729 if (free_bytes < need_bytes) { 2668 if (free_bytes < need_bytes) {
2730 if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) 2669 spin_lock(&log->l_grant_write_lock);
2731 xlog_ins_ticketq(&log->l_write_headq, tic); 2670 if (list_empty(&tic->t_queue))
2732 spin_unlock(&log->l_grant_lock); 2671 list_add_tail(&tic->t_queue, &log->l_writeq);
2733 xlog_grant_push_ail(log->l_mp, need_bytes);
2734 spin_lock(&log->l_grant_lock);
2735
2736 XFS_STATS_INC(xs_sleep_logspace);
2737 trace_xfs_log_regrant_write_sleep2(log, tic);
2738
2739 sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s);
2740 2672
2741 /* If we're shutting down, this tic is already off the queue */
2742 spin_lock(&log->l_grant_lock);
2743 if (XLOG_FORCED_SHUTDOWN(log)) 2673 if (XLOG_FORCED_SHUTDOWN(log))
2744 goto error_return; 2674 goto error_return;
2745 2675
2676 xlog_grant_push_ail(log, need_bytes);
2677
2678 XFS_STATS_INC(xs_sleep_logspace);
2679 trace_xfs_log_regrant_write_sleep2(log, tic);
2680 xlog_wait(&tic->t_wait, &log->l_grant_write_lock);
2681
2746 trace_xfs_log_regrant_write_wake2(log, tic); 2682 trace_xfs_log_regrant_write_wake2(log, tic);
2747 goto redo; 2683 goto redo;
2748 } else if (tic->t_flags & XLOG_TIC_IN_Q) 2684 }
2749 xlog_del_ticketq(&log->l_write_headq, tic);
2750 2685
2751 /* we've got enough space */ 2686 if (!list_empty(&tic->t_queue)) {
2752 xlog_grant_add_space_write(log, need_bytes); 2687 spin_lock(&log->l_grant_write_lock);
2753#ifdef DEBUG 2688 list_del_init(&tic->t_queue);
2754 tail_lsn = log->l_tail_lsn; 2689 spin_unlock(&log->l_grant_write_lock);
2755 if (CYCLE_LSN(tail_lsn) != log->l_grant_write_cycle) {
2756 ASSERT(log->l_grant_write_cycle-1 == CYCLE_LSN(tail_lsn));
2757 ASSERT(log->l_grant_write_bytes <= BBTOB(BLOCK_LSN(tail_lsn)));
2758 } 2690 }
2759#endif
2760 2691
2692 /* we've got enough space */
2693 xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes);
2761 trace_xfs_log_regrant_write_exit(log, tic); 2694 trace_xfs_log_regrant_write_exit(log, tic);
2762 2695 xlog_verify_grant_tail(log);
2763 xlog_verify_grant_head(log, 1);
2764 spin_unlock(&log->l_grant_lock);
2765 return 0; 2696 return 0;
2766 2697
2767 2698
2699 error_return_unlocked:
2700 spin_lock(&log->l_grant_write_lock);
2768 error_return: 2701 error_return:
2769 if (tic->t_flags & XLOG_TIC_IN_Q) 2702 list_del_init(&tic->t_queue);
2770 xlog_del_ticketq(&log->l_reserve_headq, tic); 2703 spin_unlock(&log->l_grant_write_lock);
2771
2772 trace_xfs_log_regrant_write_error(log, tic); 2704 trace_xfs_log_regrant_write_error(log, tic);
2773 2705
2774 /* 2706 /*
@@ -2778,7 +2710,6 @@ redo:
2778 */ 2710 */
2779 tic->t_curr_res = 0; 2711 tic->t_curr_res = 0;
2780 tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ 2712 tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
2781 spin_unlock(&log->l_grant_lock);
2782 return XFS_ERROR(EIO); 2713 return XFS_ERROR(EIO);
2783} /* xlog_regrant_write_log_space */ 2714} /* xlog_regrant_write_log_space */
2784 2715
@@ -2799,27 +2730,24 @@ xlog_regrant_reserve_log_space(xlog_t *log,
2799 if (ticket->t_cnt > 0) 2730 if (ticket->t_cnt > 0)
2800 ticket->t_cnt--; 2731 ticket->t_cnt--;
2801 2732
2802 spin_lock(&log->l_grant_lock); 2733 xlog_grant_sub_space(log, &log->l_grant_reserve_head,
2803 xlog_grant_sub_space(log, ticket->t_curr_res); 2734 ticket->t_curr_res);
2735 xlog_grant_sub_space(log, &log->l_grant_write_head,
2736 ticket->t_curr_res);
2804 ticket->t_curr_res = ticket->t_unit_res; 2737 ticket->t_curr_res = ticket->t_unit_res;
2805 xlog_tic_reset_res(ticket); 2738 xlog_tic_reset_res(ticket);
2806 2739
2807 trace_xfs_log_regrant_reserve_sub(log, ticket); 2740 trace_xfs_log_regrant_reserve_sub(log, ticket);
2808 2741
2809 xlog_verify_grant_head(log, 1);
2810
2811 /* just return if we still have some of the pre-reserved space */ 2742 /* just return if we still have some of the pre-reserved space */
2812 if (ticket->t_cnt > 0) { 2743 if (ticket->t_cnt > 0)
2813 spin_unlock(&log->l_grant_lock);
2814 return; 2744 return;
2815 }
2816 2745
2817 xlog_grant_add_space_reserve(log, ticket->t_unit_res); 2746 xlog_grant_add_space(log, &log->l_grant_reserve_head,
2747 ticket->t_unit_res);
2818 2748
2819 trace_xfs_log_regrant_reserve_exit(log, ticket); 2749 trace_xfs_log_regrant_reserve_exit(log, ticket);
2820 2750
2821 xlog_verify_grant_head(log, 0);
2822 spin_unlock(&log->l_grant_lock);
2823 ticket->t_curr_res = ticket->t_unit_res; 2751 ticket->t_curr_res = ticket->t_unit_res;
2824 xlog_tic_reset_res(ticket); 2752 xlog_tic_reset_res(ticket);
2825} /* xlog_regrant_reserve_log_space */ 2753} /* xlog_regrant_reserve_log_space */
@@ -2843,28 +2771,29 @@ STATIC void
2843xlog_ungrant_log_space(xlog_t *log, 2771xlog_ungrant_log_space(xlog_t *log,
2844 xlog_ticket_t *ticket) 2772 xlog_ticket_t *ticket)
2845{ 2773{
2774 int bytes;
2775
2846 if (ticket->t_cnt > 0) 2776 if (ticket->t_cnt > 0)
2847 ticket->t_cnt--; 2777 ticket->t_cnt--;
2848 2778
2849 spin_lock(&log->l_grant_lock);
2850 trace_xfs_log_ungrant_enter(log, ticket); 2779 trace_xfs_log_ungrant_enter(log, ticket);
2851
2852 xlog_grant_sub_space(log, ticket->t_curr_res);
2853
2854 trace_xfs_log_ungrant_sub(log, ticket); 2780 trace_xfs_log_ungrant_sub(log, ticket);
2855 2781
2856 /* If this is a permanent reservation ticket, we may be able to free 2782 /*
2783 * If this is a permanent reservation ticket, we may be able to free
2857 * up more space based on the remaining count. 2784 * up more space based on the remaining count.
2858 */ 2785 */
2786 bytes = ticket->t_curr_res;
2859 if (ticket->t_cnt > 0) { 2787 if (ticket->t_cnt > 0) {
2860 ASSERT(ticket->t_flags & XLOG_TIC_PERM_RESERV); 2788 ASSERT(ticket->t_flags & XLOG_TIC_PERM_RESERV);
2861 xlog_grant_sub_space(log, ticket->t_unit_res*ticket->t_cnt); 2789 bytes += ticket->t_unit_res*ticket->t_cnt;
2862 } 2790 }
2863 2791
2792 xlog_grant_sub_space(log, &log->l_grant_reserve_head, bytes);
2793 xlog_grant_sub_space(log, &log->l_grant_write_head, bytes);
2794
2864 trace_xfs_log_ungrant_exit(log, ticket); 2795 trace_xfs_log_ungrant_exit(log, ticket);
2865 2796
2866 xlog_verify_grant_head(log, 1);
2867 spin_unlock(&log->l_grant_lock);
2868 xfs_log_move_tail(log->l_mp, 1); 2797 xfs_log_move_tail(log->l_mp, 1);
2869} /* xlog_ungrant_log_space */ 2798} /* xlog_ungrant_log_space */
2870 2799
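
The reworked xlog_ungrant_log_space() above computes the bytes to hand back once and then subtracts that amount from both grant heads. A worked example of the accounting, using the ticket field names from the hunk (the numeric values are made up for illustration):

#include <assert.h>

/*
 * Bytes returned on ungrant: whatever is left of the current reservation,
 * plus one full unit for each count a permanent ticket still holds.
 */
static int ungrant_bytes(int t_curr_res, int t_unit_res, int t_cnt)
{
	int bytes = t_curr_res;

	if (t_cnt > 0)		/* permanent reservation remainder */
		bytes += t_unit_res * t_cnt;
	return bytes;
}

int main(void)
{
	/* 1000 unused bytes plus two outstanding 4096-byte units */
	assert(ungrant_bytes(1000, 4096, 2) == 1000 + 2 * 4096);
	return 0;
}
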
@@ -2901,11 +2830,11 @@ xlog_state_release_iclog(
2901 2830
2902 if (iclog->ic_state == XLOG_STATE_WANT_SYNC) { 2831 if (iclog->ic_state == XLOG_STATE_WANT_SYNC) {
2903 /* update tail before writing to iclog */ 2832 /* update tail before writing to iclog */
2904 xlog_assign_tail_lsn(log->l_mp); 2833 xfs_lsn_t tail_lsn = xlog_assign_tail_lsn(log->l_mp);
2905 sync++; 2834 sync++;
2906 iclog->ic_state = XLOG_STATE_SYNCING; 2835 iclog->ic_state = XLOG_STATE_SYNCING;
2907 iclog->ic_header.h_tail_lsn = cpu_to_be64(log->l_tail_lsn); 2836 iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
2908 xlog_verify_tail_lsn(log, iclog, log->l_tail_lsn); 2837 xlog_verify_tail_lsn(log, iclog, tail_lsn);
2909 /* cycle incremented when incrementing curr_block */ 2838 /* cycle incremented when incrementing curr_block */
2910 } 2839 }
2911 spin_unlock(&log->l_icloglock); 2840 spin_unlock(&log->l_icloglock);
@@ -3088,7 +3017,7 @@ maybe_sleep:
3088 return XFS_ERROR(EIO); 3017 return XFS_ERROR(EIO);
3089 } 3018 }
3090 XFS_STATS_INC(xs_log_force_sleep); 3019 XFS_STATS_INC(xs_log_force_sleep);
3091 sv_wait(&iclog->ic_force_wait, PINOD, &log->l_icloglock, s); 3020 xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
3092 /* 3021 /*
3093 * No need to grab the log lock here since we're 3022 * No need to grab the log lock here since we're
3094 * only deciding whether or not to return EIO 3023 * only deciding whether or not to return EIO
@@ -3119,10 +3048,8 @@ xfs_log_force(
3119 int error; 3048 int error;
3120 3049
3121 error = _xfs_log_force(mp, flags, NULL); 3050 error = _xfs_log_force(mp, flags, NULL);
3122 if (error) { 3051 if (error)
3123 xfs_fs_cmn_err(CE_WARN, mp, "xfs_log_force: " 3052 xfs_warn(mp, "%s: error %d returned.", __func__, error);
3124 "error %d returned.", error);
3125 }
3126} 3053}
3127 3054
3128/* 3055/*
@@ -3206,8 +3133,8 @@ try_again:
3206 3133
3207 XFS_STATS_INC(xs_log_force_sleep); 3134 XFS_STATS_INC(xs_log_force_sleep);
3208 3135
3209 sv_wait(&iclog->ic_prev->ic_write_wait, 3136 xlog_wait(&iclog->ic_prev->ic_write_wait,
3210 PSWP, &log->l_icloglock, s); 3137 &log->l_icloglock);
3211 if (log_flushed) 3138 if (log_flushed)
3212 *log_flushed = 1; 3139 *log_flushed = 1;
3213 already_slept = 1; 3140 already_slept = 1;
@@ -3235,7 +3162,7 @@ try_again:
3235 return XFS_ERROR(EIO); 3162 return XFS_ERROR(EIO);
3236 } 3163 }
3237 XFS_STATS_INC(xs_log_force_sleep); 3164 XFS_STATS_INC(xs_log_force_sleep);
3238 sv_wait(&iclog->ic_force_wait, PSWP, &log->l_icloglock, s); 3165 xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
3239 /* 3166 /*
3240 * No need to grab the log lock here since we're 3167 * No need to grab the log lock here since we're
3241 * only deciding whether or not to return EIO 3168 * only deciding whether or not to return EIO
@@ -3271,10 +3198,8 @@ xfs_log_force_lsn(
3271 int error; 3198 int error;
3272 3199
3273 error = _xfs_log_force_lsn(mp, lsn, flags, NULL); 3200 error = _xfs_log_force_lsn(mp, lsn, flags, NULL);
3274 if (error) { 3201 if (error)
3275 xfs_fs_cmn_err(CE_WARN, mp, "xfs_log_force: " 3202 xfs_warn(mp, "%s: error %d returned.", __func__, error);
3276 "error %d returned.", error);
3277 }
3278} 3203}
3279 3204
3280/* 3205/*
@@ -3310,10 +3235,8 @@ xfs_log_ticket_put(
3310 xlog_ticket_t *ticket) 3235 xlog_ticket_t *ticket)
3311{ 3236{
3312 ASSERT(atomic_read(&ticket->t_ref) > 0); 3237 ASSERT(atomic_read(&ticket->t_ref) > 0);
3313 if (atomic_dec_and_test(&ticket->t_ref)) { 3238 if (atomic_dec_and_test(&ticket->t_ref))
3314 sv_destroy(&ticket->t_wait);
3315 kmem_zone_free(xfs_log_ticket_zone, ticket); 3239 kmem_zone_free(xfs_log_ticket_zone, ticket);
3316 }
3317} 3240}
3318 3241
3319xlog_ticket_t * 3242xlog_ticket_t *
@@ -3435,6 +3358,7 @@ xlog_ticket_alloc(
3435 } 3358 }
3436 3359
3437 atomic_set(&tic->t_ref, 1); 3360 atomic_set(&tic->t_ref, 1);
3361 INIT_LIST_HEAD(&tic->t_queue);
3438 tic->t_unit_res = unit_bytes; 3362 tic->t_unit_res = unit_bytes;
3439 tic->t_curr_res = unit_bytes; 3363 tic->t_curr_res = unit_bytes;
3440 tic->t_cnt = cnt; 3364 tic->t_cnt = cnt;
@@ -3445,7 +3369,7 @@ xlog_ticket_alloc(
3445 tic->t_trans_type = 0; 3369 tic->t_trans_type = 0;
3446 if (xflags & XFS_LOG_PERM_RESERV) 3370 if (xflags & XFS_LOG_PERM_RESERV)
3447 tic->t_flags |= XLOG_TIC_PERM_RESERV; 3371 tic->t_flags |= XLOG_TIC_PERM_RESERV;
3448 sv_init(&tic->t_wait, SV_DEFAULT, "logtick"); 3372 init_waitqueue_head(&tic->t_wait);
3449 3373
3450 xlog_tic_reset_res(tic); 3374 xlog_tic_reset_res(tic);
3451 3375
@@ -3480,22 +3404,45 @@ xlog_verify_dest_ptr(
3480 } 3404 }
3481 3405
3482 if (!good_ptr) 3406 if (!good_ptr)
3483 xlog_panic("xlog_verify_dest_ptr: invalid ptr"); 3407 xfs_emerg(log->l_mp, "%s: invalid ptr", __func__);
3484} 3408}
3485 3409
3410/*
 3411 * Check to make sure the grant write head didn't just overlap the tail. If
3412 * the cycles are the same, we can't be overlapping. Otherwise, make sure that
3413 * the cycles differ by exactly one and check the byte count.
3414 *
3415 * This check is run unlocked, so can give false positives. Rather than assert
3416 * on failures, use a warn-once flag and a panic tag to allow the admin to
3417 * determine if they want to panic the machine when such an error occurs. For
 3418 * debug kernels this will have the same effect as using an assert but, unlike
3419 * an assert, it can be turned off at runtime.
3420 */
3486STATIC void 3421STATIC void
3487xlog_verify_grant_head(xlog_t *log, int equals) 3422xlog_verify_grant_tail(
3488{ 3423 struct log *log)
3489 if (log->l_grant_reserve_cycle == log->l_grant_write_cycle) { 3424{
3490 if (equals) 3425 int tail_cycle, tail_blocks;
3491 ASSERT(log->l_grant_reserve_bytes >= log->l_grant_write_bytes); 3426 int cycle, space;
3492 else 3427
3493 ASSERT(log->l_grant_reserve_bytes > log->l_grant_write_bytes); 3428 xlog_crack_grant_head(&log->l_grant_write_head, &cycle, &space);
3494 } else { 3429 xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_blocks);
3495 ASSERT(log->l_grant_reserve_cycle-1 == log->l_grant_write_cycle); 3430 if (tail_cycle != cycle) {
3496 ASSERT(log->l_grant_write_bytes >= log->l_grant_reserve_bytes); 3431 if (cycle - 1 != tail_cycle &&
3497 } 3432 !(log->l_flags & XLOG_TAIL_WARN)) {
3498} /* xlog_verify_grant_head */ 3433 xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES,
3434 "%s: cycle - 1 != tail_cycle", __func__);
3435 log->l_flags |= XLOG_TAIL_WARN;
3436 }
3437
3438 if (space > BBTOB(tail_blocks) &&
3439 !(log->l_flags & XLOG_TAIL_WARN)) {
3440 xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES,
3441 "%s: space > BBTOB(tail_blocks)", __func__);
3442 log->l_flags |= XLOG_TAIL_WARN;
3443 }
3444 }
3445}
3499 3446
3500/* check if it will fit */ 3447/* check if it will fit */
3501STATIC void 3448STATIC void
@@ -3509,16 +3456,16 @@ xlog_verify_tail_lsn(xlog_t *log,
3509 blocks = 3456 blocks =
3510 log->l_logBBsize - (log->l_prev_block - BLOCK_LSN(tail_lsn)); 3457 log->l_logBBsize - (log->l_prev_block - BLOCK_LSN(tail_lsn));
3511 if (blocks < BTOBB(iclog->ic_offset)+BTOBB(log->l_iclog_hsize)) 3458 if (blocks < BTOBB(iclog->ic_offset)+BTOBB(log->l_iclog_hsize))
3512 xlog_panic("xlog_verify_tail_lsn: ran out of log space"); 3459 xfs_emerg(log->l_mp, "%s: ran out of log space", __func__);
3513 } else { 3460 } else {
3514 ASSERT(CYCLE_LSN(tail_lsn)+1 == log->l_prev_cycle); 3461 ASSERT(CYCLE_LSN(tail_lsn)+1 == log->l_prev_cycle);
3515 3462
3516 if (BLOCK_LSN(tail_lsn) == log->l_prev_block) 3463 if (BLOCK_LSN(tail_lsn) == log->l_prev_block)
3517 xlog_panic("xlog_verify_tail_lsn: tail wrapped"); 3464 xfs_emerg(log->l_mp, "%s: tail wrapped", __func__);
3518 3465
3519 blocks = BLOCK_LSN(tail_lsn) - log->l_prev_block; 3466 blocks = BLOCK_LSN(tail_lsn) - log->l_prev_block;
3520 if (blocks < BTOBB(iclog->ic_offset) + 1) 3467 if (blocks < BTOBB(iclog->ic_offset) + 1)
3521 xlog_panic("xlog_verify_tail_lsn: ran out of log space"); 3468 xfs_emerg(log->l_mp, "%s: ran out of log space", __func__);
3522 } 3469 }
3523} /* xlog_verify_tail_lsn */ 3470} /* xlog_verify_tail_lsn */
3524 3471
@@ -3558,22 +3505,23 @@ xlog_verify_iclog(xlog_t *log,
3558 icptr = log->l_iclog; 3505 icptr = log->l_iclog;
3559 for (i=0; i < log->l_iclog_bufs; i++) { 3506 for (i=0; i < log->l_iclog_bufs; i++) {
3560 if (icptr == NULL) 3507 if (icptr == NULL)
3561 xlog_panic("xlog_verify_iclog: invalid ptr"); 3508 xfs_emerg(log->l_mp, "%s: invalid ptr", __func__);
3562 icptr = icptr->ic_next; 3509 icptr = icptr->ic_next;
3563 } 3510 }
3564 if (icptr != log->l_iclog) 3511 if (icptr != log->l_iclog)
3565 xlog_panic("xlog_verify_iclog: corrupt iclog ring"); 3512 xfs_emerg(log->l_mp, "%s: corrupt iclog ring", __func__);
3566 spin_unlock(&log->l_icloglock); 3513 spin_unlock(&log->l_icloglock);
3567 3514
3568 /* check log magic numbers */ 3515 /* check log magic numbers */
3569 if (be32_to_cpu(iclog->ic_header.h_magicno) != XLOG_HEADER_MAGIC_NUM) 3516 if (be32_to_cpu(iclog->ic_header.h_magicno) != XLOG_HEADER_MAGIC_NUM)
3570 xlog_panic("xlog_verify_iclog: invalid magic num"); 3517 xfs_emerg(log->l_mp, "%s: invalid magic num", __func__);
3571 3518
3572 ptr = (xfs_caddr_t) &iclog->ic_header; 3519 ptr = (xfs_caddr_t) &iclog->ic_header;
3573 for (ptr += BBSIZE; ptr < ((xfs_caddr_t)&iclog->ic_header) + count; 3520 for (ptr += BBSIZE; ptr < ((xfs_caddr_t)&iclog->ic_header) + count;
3574 ptr += BBSIZE) { 3521 ptr += BBSIZE) {
3575 if (be32_to_cpu(*(__be32 *)ptr) == XLOG_HEADER_MAGIC_NUM) 3522 if (be32_to_cpu(*(__be32 *)ptr) == XLOG_HEADER_MAGIC_NUM)
3576 xlog_panic("xlog_verify_iclog: unexpected magic num"); 3523 xfs_emerg(log->l_mp, "%s: unexpected magic num",
3524 __func__);
3577 } 3525 }
3578 3526
3579 /* check fields */ 3527 /* check fields */
@@ -3603,9 +3551,10 @@ xlog_verify_iclog(xlog_t *log,
3603 } 3551 }
3604 } 3552 }
3605 if (clientid != XFS_TRANSACTION && clientid != XFS_LOG) 3553 if (clientid != XFS_TRANSACTION && clientid != XFS_LOG)
3606 cmn_err(CE_WARN, "xlog_verify_iclog: " 3554 xfs_warn(log->l_mp,
3607 "invalid clientid %d op 0x%p offset 0x%lx", 3555 "%s: invalid clientid %d op 0x%p offset 0x%lx",
3608 clientid, ophead, (unsigned long)field_offset); 3556 __func__, clientid, ophead,
3557 (unsigned long)field_offset);
3609 3558
3610 /* check length */ 3559 /* check length */
3611 field_offset = (__psint_t) 3560 field_offset = (__psint_t)
@@ -3716,12 +3665,10 @@ xfs_log_force_umount(
3716 xlog_cil_force(log); 3665 xlog_cil_force(log);
3717 3666
3718 /* 3667 /*
 3719 * We must hold both the GRANT lock and the LOG lock, 3668 * mark the filesystem and the log as being in a shutdown state and wake
3720 * before we mark the filesystem SHUTDOWN and wake 3669 * everybody up to tell them the bad news.
3721 * everybody up to tell the bad news.
3722 */ 3670 */
3723 spin_lock(&log->l_icloglock); 3671 spin_lock(&log->l_icloglock);
3724 spin_lock(&log->l_grant_lock);
3725 mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN; 3672 mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN;
3726 if (mp->m_sb_bp) 3673 if (mp->m_sb_bp)
3727 XFS_BUF_DONE(mp->m_sb_bp); 3674 XFS_BUF_DONE(mp->m_sb_bp);
@@ -3742,27 +3689,21 @@ xfs_log_force_umount(
3742 spin_unlock(&log->l_icloglock); 3689 spin_unlock(&log->l_icloglock);
3743 3690
3744 /* 3691 /*
3745 * We don't want anybody waiting for log reservations 3692 * We don't want anybody waiting for log reservations after this. That
3746 * after this. That means we have to wake up everybody 3693 * means we have to wake up everybody queued up on reserveq as well as
3747 * queued up on reserve_headq as well as write_headq. 3694 * writeq. In addition, we make sure in xlog_{re}grant_log_space that
3748 * In addition, we make sure in xlog_{re}grant_log_space 3695 * we don't enqueue anything once the SHUTDOWN flag is set, and this
3749 * that we don't enqueue anything once the SHUTDOWN flag 3696 * action is protected by the grant locks.
3750 * is set, and this action is protected by the GRANTLOCK.
3751 */ 3697 */
3752 if ((tic = log->l_reserve_headq)) { 3698 spin_lock(&log->l_grant_reserve_lock);
3753 do { 3699 list_for_each_entry(tic, &log->l_reserveq, t_queue)
3754 sv_signal(&tic->t_wait); 3700 wake_up(&tic->t_wait);
3755 tic = tic->t_next; 3701 spin_unlock(&log->l_grant_reserve_lock);
3756 } while (tic != log->l_reserve_headq); 3702
3757 } 3703 spin_lock(&log->l_grant_write_lock);
3758 3704 list_for_each_entry(tic, &log->l_writeq, t_queue)
3759 if ((tic = log->l_write_headq)) { 3705 wake_up(&tic->t_wait);
3760 do { 3706 spin_unlock(&log->l_grant_write_lock);
3761 sv_signal(&tic->t_wait);
3762 tic = tic->t_next;
3763 } while (tic != log->l_write_headq);
3764 }
3765 spin_unlock(&log->l_grant_lock);
3766 3707
3767 if (!(log->l_iclog->ic_state & XLOG_STATE_IOERROR)) { 3708 if (!(log->l_iclog->ic_state & XLOG_STATE_IOERROR)) {
3768 ASSERT(!logerror); 3709 ASSERT(!logerror);
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 916eb7db14d9..3bd3291ef8d2 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -191,7 +191,7 @@ void xfs_log_ticket_put(struct xlog_ticket *ticket);
191 191
192xlog_tid_t xfs_log_get_trans_ident(struct xfs_trans *tp); 192xlog_tid_t xfs_log_get_trans_ident(struct xfs_trans *tp);
193 193
194int xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, 194void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
195 struct xfs_log_vec *log_vector, 195 struct xfs_log_vec *log_vector,
196 xfs_lsn_t *commit_lsn, int flags); 196 xfs_lsn_t *commit_lsn, int flags);
197bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); 197bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 23d6ceb5e97b..9ca59be08977 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -61,7 +61,7 @@ xlog_cil_init(
61 INIT_LIST_HEAD(&cil->xc_committing); 61 INIT_LIST_HEAD(&cil->xc_committing);
62 spin_lock_init(&cil->xc_cil_lock); 62 spin_lock_init(&cil->xc_cil_lock);
63 init_rwsem(&cil->xc_ctx_lock); 63 init_rwsem(&cil->xc_ctx_lock);
64 sv_init(&cil->xc_commit_wait, SV_DEFAULT, "cilwait"); 64 init_waitqueue_head(&cil->xc_commit_wait);
65 65
66 INIT_LIST_HEAD(&ctx->committing); 66 INIT_LIST_HEAD(&ctx->committing);
67 INIT_LIST_HEAD(&ctx->busy_extents); 67 INIT_LIST_HEAD(&ctx->busy_extents);
@@ -361,15 +361,10 @@ xlog_cil_committed(
361 int abort) 361 int abort)
362{ 362{
363 struct xfs_cil_ctx *ctx = args; 363 struct xfs_cil_ctx *ctx = args;
364 struct xfs_log_vec *lv;
365 int abortflag = abort ? XFS_LI_ABORTED : 0;
366 struct xfs_busy_extent *busyp, *n; 364 struct xfs_busy_extent *busyp, *n;
367 365
368 /* unpin all the log items */ 366 xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain,
369 for (lv = ctx->lv_chain; lv; lv = lv->lv_next ) { 367 ctx->start_lsn, abort);
370 xfs_trans_item_committed(lv->lv_item, ctx->start_lsn,
371 abortflag);
372 }
373 368
374 list_for_each_entry_safe(busyp, n, &ctx->busy_extents, list) 369 list_for_each_entry_safe(busyp, n, &ctx->busy_extents, list)
375 xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, busyp); 370 xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, busyp);
@@ -548,7 +543,7 @@ xlog_cil_push(
548 543
549 error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0); 544 error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0);
550 if (error) 545 if (error)
551 goto out_abort; 546 goto out_abort_free_ticket;
552 547
553 /* 548 /*
554 * now that we've written the checkpoint into the log, strictly 549 * now that we've written the checkpoint into the log, strictly
@@ -568,14 +563,15 @@ restart:
568 * It is still being pushed! Wait for the push to 563 * It is still being pushed! Wait for the push to
569 * complete, then start again from the beginning. 564 * complete, then start again from the beginning.
570 */ 565 */
571 sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0); 566 xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock);
572 goto restart; 567 goto restart;
573 } 568 }
574 } 569 }
575 spin_unlock(&cil->xc_cil_lock); 570 spin_unlock(&cil->xc_cil_lock);
576 571
572 /* xfs_log_done always frees the ticket on error. */
577 commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0); 573 commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0);
578 if (error || commit_lsn == -1) 574 if (commit_lsn == -1)
579 goto out_abort; 575 goto out_abort;
580 576
581 /* attach all the transactions w/ busy extents to iclog */ 577 /* attach all the transactions w/ busy extents to iclog */
@@ -592,7 +588,7 @@ restart:
592 */ 588 */
593 spin_lock(&cil->xc_cil_lock); 589 spin_lock(&cil->xc_cil_lock);
594 ctx->commit_lsn = commit_lsn; 590 ctx->commit_lsn = commit_lsn;
595 sv_broadcast(&cil->xc_commit_wait); 591 wake_up_all(&cil->xc_commit_wait);
596 spin_unlock(&cil->xc_cil_lock); 592 spin_unlock(&cil->xc_cil_lock);
597 593
598 /* release the hounds! */ 594 /* release the hounds! */
@@ -605,6 +601,8 @@ out_free_ticket:
605 kmem_free(new_ctx); 601 kmem_free(new_ctx);
606 return 0; 602 return 0;
607 603
604out_abort_free_ticket:
605 xfs_log_ticket_put(tic);
608out_abort: 606out_abort:
609 xlog_cil_committed(ctx, XFS_LI_ABORTED); 607 xlog_cil_committed(ctx, XFS_LI_ABORTED);
610 return XFS_ERROR(EIO); 608 return XFS_ERROR(EIO);
@@ -627,7 +625,7 @@ out_abort:
627 * background commit, returns without it held once background commits are 625 * background commit, returns without it held once background commits are
628 * allowed again. 626 * allowed again.
629 */ 627 */
630int 628void
631xfs_log_commit_cil( 629xfs_log_commit_cil(
632 struct xfs_mount *mp, 630 struct xfs_mount *mp,
633 struct xfs_trans *tp, 631 struct xfs_trans *tp,
@@ -642,11 +640,6 @@ xfs_log_commit_cil(
642 if (flags & XFS_TRANS_RELEASE_LOG_RES) 640 if (flags & XFS_TRANS_RELEASE_LOG_RES)
643 log_flags = XFS_LOG_REL_PERM_RESERV; 641 log_flags = XFS_LOG_REL_PERM_RESERV;
644 642
645 if (XLOG_FORCED_SHUTDOWN(log)) {
646 xlog_cil_free_logvec(log_vector);
647 return XFS_ERROR(EIO);
648 }
649
650 /* 643 /*
651 * do all the hard work of formatting items (including memory 644 * do all the hard work of formatting items (including memory
652 * allocation) outside the CIL context lock. This prevents stalling CIL 645 * allocation) outside the CIL context lock. This prevents stalling CIL
@@ -706,7 +699,6 @@ xfs_log_commit_cil(
706 */ 699 */
707 if (push) 700 if (push)
708 xlog_cil_push(log, 0); 701 xlog_cil_push(log, 0);
709 return 0;
710} 702}
711 703
712/* 704/*
@@ -757,7 +749,7 @@ restart:
757 * It is still being pushed! Wait for the push to 749 * It is still being pushed! Wait for the push to
758 * complete, then start again from the beginning. 750 * complete, then start again from the beginning.
759 */ 751 */
760 sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0); 752 xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock);
761 goto restart; 753 goto restart;
762 } 754 }
763 if (ctx->sequence != sequence) 755 if (ctx->sequence != sequence)
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index edcdfe01617f..5864850e9e34 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -21,7 +21,6 @@
21struct xfs_buf; 21struct xfs_buf;
22struct log; 22struct log;
23struct xlog_ticket; 23struct xlog_ticket;
24struct xfs_buf_cancel;
25struct xfs_mount; 24struct xfs_mount;
26 25
27/* 26/*
@@ -54,7 +53,6 @@ struct xfs_mount;
54 BTOBB(XLOG_MAX_ICLOGS << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \ 53 BTOBB(XLOG_MAX_ICLOGS << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \
55 XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT)) 54 XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT))
56 55
57
58static inline xfs_lsn_t xlog_assign_lsn(uint cycle, uint block) 56static inline xfs_lsn_t xlog_assign_lsn(uint cycle, uint block)
59{ 57{
60 return ((xfs_lsn_t)cycle << 32) | block; 58 return ((xfs_lsn_t)cycle << 32) | block;
@@ -89,10 +87,6 @@ static inline uint xlog_get_client_id(__be32 i)
89 return be32_to_cpu(i) >> 24; 87 return be32_to_cpu(i) >> 24;
90} 88}
91 89
92#define xlog_panic(args...) cmn_err(CE_PANIC, ## args)
93#define xlog_exit(args...) cmn_err(CE_PANIC, ## args)
94#define xlog_warn(args...) cmn_err(CE_WARN, ## args)
95
96/* 90/*
97 * In core log state 91 * In core log state
98 */ 92 */
@@ -133,12 +127,10 @@ static inline uint xlog_get_client_id(__be32 i)
133 */ 127 */
134#define XLOG_TIC_INITED 0x1 /* has been initialized */ 128#define XLOG_TIC_INITED 0x1 /* has been initialized */
135#define XLOG_TIC_PERM_RESERV 0x2 /* permanent reservation */ 129#define XLOG_TIC_PERM_RESERV 0x2 /* permanent reservation */
136#define XLOG_TIC_IN_Q 0x4
137 130
138#define XLOG_TIC_FLAGS \ 131#define XLOG_TIC_FLAGS \
139 { XLOG_TIC_INITED, "XLOG_TIC_INITED" }, \ 132 { XLOG_TIC_INITED, "XLOG_TIC_INITED" }, \
140 { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" }, \ 133 { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" }
141 { XLOG_TIC_IN_Q, "XLOG_TIC_IN_Q" }
142 134
143#endif /* __KERNEL__ */ 135#endif /* __KERNEL__ */
144 136
@@ -152,6 +144,7 @@ static inline uint xlog_get_client_id(__be32 i)
152#define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */ 144#define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */
153#define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being 145#define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being
154 shutdown */ 146 shutdown */
147#define XLOG_TAIL_WARN 0x10 /* log tail verify warning issued */
155 148
156#ifdef __KERNEL__ 149#ifdef __KERNEL__
157/* 150/*
@@ -244,9 +237,8 @@ typedef struct xlog_res {
244} xlog_res_t; 237} xlog_res_t;
245 238
246typedef struct xlog_ticket { 239typedef struct xlog_ticket {
247 sv_t t_wait; /* ticket wait queue : 20 */ 240 wait_queue_head_t t_wait; /* ticket wait queue */
248 struct xlog_ticket *t_next; /* :4|8 */ 241 struct list_head t_queue; /* reserve/write queue */
249 struct xlog_ticket *t_prev; /* :4|8 */
250 xlog_tid_t t_tid; /* transaction identifier : 4 */ 242 xlog_tid_t t_tid; /* transaction identifier : 4 */
251 atomic_t t_ref; /* ticket reference count : 4 */ 243 atomic_t t_ref; /* ticket reference count : 4 */
252 int t_curr_res; /* current reservation in bytes : 4 */ 244 int t_curr_res; /* current reservation in bytes : 4 */
@@ -353,8 +345,8 @@ typedef union xlog_in_core2 {
353 * and move everything else out to subsequent cachelines. 345 * and move everything else out to subsequent cachelines.
354 */ 346 */
355typedef struct xlog_in_core { 347typedef struct xlog_in_core {
356 sv_t ic_force_wait; 348 wait_queue_head_t ic_force_wait;
357 sv_t ic_write_wait; 349 wait_queue_head_t ic_write_wait;
358 struct xlog_in_core *ic_next; 350 struct xlog_in_core *ic_next;
359 struct xlog_in_core *ic_prev; 351 struct xlog_in_core *ic_prev;
360 struct xfs_buf *ic_bp; 352 struct xfs_buf *ic_bp;
@@ -421,7 +413,7 @@ struct xfs_cil {
421 struct xfs_cil_ctx *xc_ctx; 413 struct xfs_cil_ctx *xc_ctx;
422 struct rw_semaphore xc_ctx_lock; 414 struct rw_semaphore xc_ctx_lock;
423 struct list_head xc_committing; 415 struct list_head xc_committing;
424 sv_t xc_commit_wait; 416 wait_queue_head_t xc_commit_wait;
425 xfs_lsn_t xc_current_sequence; 417 xfs_lsn_t xc_current_sequence;
426}; 418};
427 419
@@ -491,7 +483,7 @@ typedef struct log {
491 struct xfs_buftarg *l_targ; /* buftarg of log */ 483 struct xfs_buftarg *l_targ; /* buftarg of log */
492 uint l_flags; 484 uint l_flags;
493 uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */ 485 uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */
494 struct xfs_buf_cancel **l_buf_cancel_table; 486 struct list_head *l_buf_cancel_table;
495 int l_iclog_hsize; /* size of iclog header */ 487 int l_iclog_hsize; /* size of iclog header */
496 int l_iclog_heads; /* # of iclog header sectors */ 488 int l_iclog_heads; /* # of iclog header sectors */
497 uint l_sectBBsize; /* sector size in BBs (2^n) */ 489 uint l_sectBBsize; /* sector size in BBs (2^n) */
@@ -503,29 +495,40 @@ typedef struct log {
503 int l_logBBsize; /* size of log in BB chunks */ 495 int l_logBBsize; /* size of log in BB chunks */
504 496
505 /* The following block of fields are changed while holding icloglock */ 497 /* The following block of fields are changed while holding icloglock */
506 sv_t l_flush_wait ____cacheline_aligned_in_smp; 498 wait_queue_head_t l_flush_wait ____cacheline_aligned_in_smp;
507 /* waiting for iclog flush */ 499 /* waiting for iclog flush */
508 int l_covered_state;/* state of "covering disk 500 int l_covered_state;/* state of "covering disk
509 * log entries" */ 501 * log entries" */
510 xlog_in_core_t *l_iclog; /* head log queue */ 502 xlog_in_core_t *l_iclog; /* head log queue */
511 spinlock_t l_icloglock; /* grab to change iclog state */ 503 spinlock_t l_icloglock; /* grab to change iclog state */
512 xfs_lsn_t l_tail_lsn; /* lsn of 1st LR with unflushed
513 * buffers */
514 xfs_lsn_t l_last_sync_lsn;/* lsn of last LR on disk */
515 int l_curr_cycle; /* Cycle number of log writes */ 504 int l_curr_cycle; /* Cycle number of log writes */
516 int l_prev_cycle; /* Cycle number before last 505 int l_prev_cycle; /* Cycle number before last
517 * block increment */ 506 * block increment */
518 int l_curr_block; /* current logical log block */ 507 int l_curr_block; /* current logical log block */
519 int l_prev_block; /* previous logical log block */ 508 int l_prev_block; /* previous logical log block */
520 509
521 /* The following block of fields are changed while holding grant_lock */ 510 /*
522 spinlock_t l_grant_lock ____cacheline_aligned_in_smp; 511 * l_last_sync_lsn and l_tail_lsn are atomics so they can be set and
523 xlog_ticket_t *l_reserve_headq; 512 * read without needing to hold specific locks. To avoid operations
524 xlog_ticket_t *l_write_headq; 513 * contending with other hot objects, place each of them on a separate
525 int l_grant_reserve_cycle; 514 * cacheline.
526 int l_grant_reserve_bytes; 515 */
527 int l_grant_write_cycle; 516 /* lsn of last LR on disk */
528 int l_grant_write_bytes; 517 atomic64_t l_last_sync_lsn ____cacheline_aligned_in_smp;
 518	/* lsn of 1st LR with unflushed buffers */
519 atomic64_t l_tail_lsn ____cacheline_aligned_in_smp;
520
521 /*
 522	 * ticket grant locks, queues and accounting have their own cachelines
523 * as these are quite hot and can be operated on concurrently.
524 */
525 spinlock_t l_grant_reserve_lock ____cacheline_aligned_in_smp;
526 struct list_head l_reserveq;
527 atomic64_t l_grant_reserve_head;
528
529 spinlock_t l_grant_write_lock ____cacheline_aligned_in_smp;
530 struct list_head l_writeq;
531 atomic64_t l_grant_write_head;
529 532
530 /* The following field are used for debugging; need to hold icloglock */ 533 /* The following field are used for debugging; need to hold icloglock */
531#ifdef DEBUG 534#ifdef DEBUG
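
Editor's note: the comment in the hunk above explains why l_last_sync_lsn, l_tail_lsn and the two grant heads each get ____cacheline_aligned_in_smp — hot atomics are kept on separate cachelines so concurrent updates do not false-share. The following is a minimal userspace sketch of that layout idea (not part of the patch): struct demo_log is hypothetical, the field names are borrowed from the diff, plain long long stands in for atomic64_t, and the 64-byte line size is an assumption.

/*
 * Illustrative userspace sketch: give each hot 64-bit counter its own
 * 64-byte cacheline, analogous to ____cacheline_aligned_in_smp in the
 * new struct log layout. Compile and run to see the padded offsets.
 */
#include <stdalign.h>
#include <stddef.h>
#include <stdio.h>

#define CACHELINE 64	/* assumed cacheline size */

struct demo_log {
	alignas(CACHELINE) long long l_last_sync_lsn;	/* stands in for atomic64_t */
	alignas(CACHELINE) long long l_tail_lsn;
	alignas(CACHELINE) long long l_grant_reserve_head;
	alignas(CACHELINE) long long l_grant_write_head;
};

int main(void)
{
	printf("l_last_sync_lsn      offset %zu\n", offsetof(struct demo_log, l_last_sync_lsn));
	printf("l_tail_lsn           offset %zu\n", offsetof(struct demo_log, l_tail_lsn));
	printf("l_grant_reserve_head offset %zu\n", offsetof(struct demo_log, l_grant_reserve_head));
	printf("l_grant_write_head   offset %zu\n", offsetof(struct demo_log, l_grant_write_head));
	return 0;
}

Each field lands 64 bytes apart, so a writer hammering the tail LSN does not bounce the cacheline holding the grant heads.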
@@ -534,6 +537,9 @@ typedef struct log {
534 537
535} xlog_t; 538} xlog_t;
536 539
540#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \
541 ((log)->l_buf_cancel_table + ((__uint64_t)blkno % XLOG_BC_TABLE_SIZE))
542
537#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR) 543#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR)
538 544
539/* common routines */ 545/* common routines */
@@ -562,6 +568,61 @@ int xlog_write(struct log *log, struct xfs_log_vec *log_vector,
562 xlog_in_core_t **commit_iclog, uint flags); 568 xlog_in_core_t **commit_iclog, uint flags);
563 569
564/* 570/*
571 * When we crack an atomic LSN, we sample it first so that the value will not
572 * change while we are cracking it into the component values. This means we
573 * will always get consistent component values to work from. This should always
574 * be used to sample and crack LSNs that are stored and updated in atomic
575 * variables.
576 */
577static inline void
578xlog_crack_atomic_lsn(atomic64_t *lsn, uint *cycle, uint *block)
579{
580 xfs_lsn_t val = atomic64_read(lsn);
581
582 *cycle = CYCLE_LSN(val);
583 *block = BLOCK_LSN(val);
584}
585
586/*
587 * Calculate and assign a value to an atomic LSN variable from component pieces.
588 */
589static inline void
590xlog_assign_atomic_lsn(atomic64_t *lsn, uint cycle, uint block)
591{
592 atomic64_set(lsn, xlog_assign_lsn(cycle, block));
593}
594
595/*
596 * When we crack the grant head, we sample it first so that the value will not
597 * change while we are cracking it into the component values. This means we
598 * will always get consistent component values to work from.
599 */
600static inline void
601xlog_crack_grant_head_val(int64_t val, int *cycle, int *space)
602{
603 *cycle = val >> 32;
604 *space = val & 0xffffffff;
605}
606
607static inline void
608xlog_crack_grant_head(atomic64_t *head, int *cycle, int *space)
609{
610 xlog_crack_grant_head_val(atomic64_read(head), cycle, space);
611}
612
613static inline int64_t
614xlog_assign_grant_head_val(int cycle, int space)
615{
616 return ((int64_t)cycle << 32) | space;
617}
618
619static inline void
620xlog_assign_grant_head(atomic64_t *head, int cycle, int space)
621{
622 atomic64_set(head, xlog_assign_grant_head_val(cycle, space));
623}
624
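
Editor's note: the inline helpers added above pack a cycle number into the high 32 bits and a block (or space) count into the low 32 bits of one 64-bit word, so an LSN or grant head can be read and updated as a single atomic value. Below is a small self-contained sketch of the same packing (not part of the patch): pack_lsn()/crack_lsn() are hypothetical names, plain uint64_t replaces atomic64_t, and the bit layout matches xlog_assign_lsn()/xlog_crack_atomic_lsn() in the diff.

/*
 * Userspace sketch of the cycle/block packing used by the new
 * xlog_*_atomic_lsn() and grant head helpers: high 32 bits = cycle,
 * low 32 bits = block (or space).
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t pack_lsn(uint32_t cycle, uint32_t block)
{
	return ((uint64_t)cycle << 32) | block;
}

static void crack_lsn(uint64_t lsn, uint32_t *cycle, uint32_t *block)
{
	/* Sample once, then split, so both halves come from the same value. */
	*cycle = (uint32_t)(lsn >> 32);
	*block = (uint32_t)(lsn & 0xffffffff);
}

int main(void)
{
	uint32_t cycle, block;
	uint64_t lsn = pack_lsn(7, 123456);

	crack_lsn(lsn, &cycle, &block);
	assert(cycle == 7 && block == 123456);
	printf("lsn=0x%llx cycle=%u block=%u\n",
	       (unsigned long long)lsn, cycle, block);
	return 0;
}

Sampling the atomic once before cracking is what keeps the cycle/block pair mutually consistent without taking a lock.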
625/*
565 * Committed Item List interfaces 626 * Committed Item List interfaces
566 */ 627 */
567int xlog_cil_init(struct log *log); 628int xlog_cil_init(struct log *log);
@@ -585,6 +646,21 @@ xlog_cil_force(struct log *log)
585 */ 646 */
586#define XLOG_UNMOUNT_REC_TYPE (-1U) 647#define XLOG_UNMOUNT_REC_TYPE (-1U)
587 648
649/*
650 * Wrapper function for waiting on a wait queue serialised against wakeups
651 * by a spinlock. This matches the semantics of all the wait queues used in the
652 * log code.
653 */
654static inline void xlog_wait(wait_queue_head_t *wq, spinlock_t *lock)
655{
656 DECLARE_WAITQUEUE(wait, current);
657
658 add_wait_queue_exclusive(wq, &wait);
659 __set_current_state(TASK_UNINTERRUPTIBLE);
660 spin_unlock(lock);
661 schedule();
662 remove_wait_queue(wq, &wait);
663}
588#endif /* __KERNEL__ */ 664#endif /* __KERNEL__ */
589 665
590#endif /* __XFS_LOG_PRIV_H__ */ 666#endif /* __XFS_LOG_PRIV_H__ */
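
Editor's note: xlog_wait() above queues the caller, drops the spinlock that the waker will also take, and sleeps; the matching wakeup side (for example xlog_cil_push() setting ctx->commit_lsn and calling wake_up_all() under xc_cil_lock earlier in this diff) publishes its state under the same lock before waking waiters, and waiters recheck state and retry ("goto restart"). The sketch below is only a userspace analogue of that discipline using a pthread mutex and condition variable — pusher, commit_wait and the values involved are hypothetical, and pthreads are a stand-in for the kernel wait queue, not the patch's API.

/*
 * Userspace analogue of the xlog_wait()/wake_up_all() pairing: the waker
 * updates state under the lock before broadcasting, and the waiter
 * rechecks the condition after every wakeup.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t commit_wait = PTHREAD_COND_INITIALIZER;
static long long commit_lsn = -1;	/* -1: commit not yet recorded */

static void *pusher(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	commit_lsn = 42;			/* publish state ... */
	pthread_cond_broadcast(&commit_wait);	/* ... then wake everyone */
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, pusher, NULL);

	pthread_mutex_lock(&lock);
	while (commit_lsn == -1)		/* recheck after each wakeup */
		pthread_cond_wait(&commit_wait, &lock);
	pthread_mutex_unlock(&lock);

	printf("saw commit_lsn %lld\n", commit_lsn);
	pthread_join(t, NULL);
	return 0;
}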
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 966d3f97458c..5cc464a17c93 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -53,6 +53,17 @@ STATIC void xlog_recover_check_summary(xlog_t *);
53#endif 53#endif
54 54
55/* 55/*
56 * This structure is used during recovery to record the buf log items which
57 * have been canceled and should not be replayed.
58 */
59struct xfs_buf_cancel {
60 xfs_daddr_t bc_blkno;
61 uint bc_len;
62 int bc_refcount;
63 struct list_head bc_list;
64};
65
66/*
56 * Sector aligned buffer routines for buffer create/read/write/access 67 * Sector aligned buffer routines for buffer create/read/write/access
57 */ 68 */
58 69
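
Editor's note: the struct xfs_buf_cancel introduced above is kept in a small hash table of list heads; the XLOG_BUF_CANCEL_BUCKET() macro added to xfs_log_priv.h earlier in this diff picks a bucket by blkno modulo the table size, pass 1 (xlog_recover_buffer_pass1) adds or refcounts entries, and pass 2 looks them up. A compact sketch of that scheme follows (not part of the patch): a plain singly linked list replaces struct list_head, the names buf_cancel/add_cancel/is_cancelled are hypothetical, and the 64-entry table size is assumed (the real XLOG_BC_TABLE_SIZE is defined elsewhere).

/*
 * Sketch of the recovery cancel table: bucket = blkno % table size,
 * one refcounted record per (blkno, len).
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define BC_TABLE_SIZE 64	/* assumed stand-in for XLOG_BC_TABLE_SIZE */

struct buf_cancel {
	uint64_t blkno;
	unsigned int len;
	int refcount;
	struct buf_cancel *next;
};

static struct buf_cancel *table[BC_TABLE_SIZE];

static struct buf_cancel **bucket_for(uint64_t blkno)
{
	return &table[blkno % BC_TABLE_SIZE];
}

/* Pass 1: remember a cancelled buffer, or bump the refcount of a duplicate. */
static void add_cancel(uint64_t blkno, unsigned int len)
{
	struct buf_cancel **bucket = bucket_for(blkno);
	struct buf_cancel *bcp;

	for (bcp = *bucket; bcp; bcp = bcp->next) {
		if (bcp->blkno == blkno && bcp->len == len) {
			bcp->refcount++;
			return;
		}
	}
	bcp = malloc(sizeof(*bcp));
	bcp->blkno = blkno;
	bcp->len = len;
	bcp->refcount = 1;
	bcp->next = *bucket;
	*bucket = bcp;
}

/* Pass 2: is this buffer cancelled? (lookup only in this sketch) */
static int is_cancelled(uint64_t blkno, unsigned int len)
{
	struct buf_cancel *bcp;

	for (bcp = *bucket_for(blkno); bcp; bcp = bcp->next)
		if (bcp->blkno == blkno && bcp->len == len)
			return 1;
	return 0;
}

int main(void)
{
	add_cancel(1000, 8);
	add_cancel(1000, 8);	/* duplicate: refcount becomes 2 */
	printf("cancelled(1000,8)=%d cancelled(2000,8)=%d\n",
	       is_cancelled(1000, 8), is_cancelled(2000, 8));
	return 0;
}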
@@ -81,7 +92,7 @@ xlog_get_bp(
81 int nbblks) 92 int nbblks)
82{ 93{
83 if (!xlog_buf_bbcount_valid(log, nbblks)) { 94 if (!xlog_buf_bbcount_valid(log, nbblks)) {
84 xlog_warn("XFS: Invalid block length (0x%x) given for buffer", 95 xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
85 nbblks); 96 nbblks);
86 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); 97 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
87 return NULL; 98 return NULL;
@@ -90,7 +101,7 @@ xlog_get_bp(
90 /* 101 /*
91 * We do log I/O in units of log sectors (a power-of-2 102 * We do log I/O in units of log sectors (a power-of-2
92 * multiple of the basic block size), so we round up the 103 * multiple of the basic block size), so we round up the
93 * requested size to acommodate the basic blocks required 104 * requested size to accommodate the basic blocks required
94 * for complete log sectors. 105 * for complete log sectors.
95 * 106 *
96 * In addition, the buffer may be used for a non-sector- 107 * In addition, the buffer may be used for a non-sector-
@@ -101,7 +112,7 @@ xlog_get_bp(
101 * an issue. Nor will this be a problem if the log I/O is 112 * an issue. Nor will this be a problem if the log I/O is
102 * done in basic blocks (sector size 1). But otherwise we 113 * done in basic blocks (sector size 1). But otherwise we
103 * extend the buffer by one extra log sector to ensure 114 * extend the buffer by one extra log sector to ensure
104 * there's space to accomodate this possiblility. 115 * there's space to accommodate this possibility.
105 */ 116 */
106 if (nbblks > 1 && log->l_sectBBsize > 1) 117 if (nbblks > 1 && log->l_sectBBsize > 1)
107 nbblks += log->l_sectBBsize; 118 nbblks += log->l_sectBBsize;
@@ -149,7 +160,7 @@ xlog_bread_noalign(
149 int error; 160 int error;
150 161
151 if (!xlog_buf_bbcount_valid(log, nbblks)) { 162 if (!xlog_buf_bbcount_valid(log, nbblks)) {
152 xlog_warn("XFS: Invalid block length (0x%x) given for buffer", 163 xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
153 nbblks); 164 nbblks);
154 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); 165 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
155 return EFSCORRUPTED; 166 return EFSCORRUPTED;
@@ -208,7 +219,7 @@ xlog_bwrite(
208 int error; 219 int error;
209 220
210 if (!xlog_buf_bbcount_valid(log, nbblks)) { 221 if (!xlog_buf_bbcount_valid(log, nbblks)) {
211 xlog_warn("XFS: Invalid block length (0x%x) given for buffer", 222 xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
212 nbblks); 223 nbblks);
213 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); 224 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
214 return EFSCORRUPTED; 225 return EFSCORRUPTED;
@@ -243,9 +254,9 @@ xlog_header_check_dump(
243 xfs_mount_t *mp, 254 xfs_mount_t *mp,
244 xlog_rec_header_t *head) 255 xlog_rec_header_t *head)
245{ 256{
246 cmn_err(CE_DEBUG, "%s: SB : uuid = %pU, fmt = %d\n", 257 xfs_debug(mp, "%s: SB : uuid = %pU, fmt = %d\n",
247 __func__, &mp->m_sb.sb_uuid, XLOG_FMT); 258 __func__, &mp->m_sb.sb_uuid, XLOG_FMT);
248 cmn_err(CE_DEBUG, " log : uuid = %pU, fmt = %d\n", 259 xfs_debug(mp, " log : uuid = %pU, fmt = %d\n",
249 &head->h_fs_uuid, be32_to_cpu(head->h_fmt)); 260 &head->h_fs_uuid, be32_to_cpu(head->h_fmt));
250} 261}
251#else 262#else
@@ -268,15 +279,15 @@ xlog_header_check_recover(
268 * a dirty log created in IRIX. 279 * a dirty log created in IRIX.
269 */ 280 */
270 if (unlikely(be32_to_cpu(head->h_fmt) != XLOG_FMT)) { 281 if (unlikely(be32_to_cpu(head->h_fmt) != XLOG_FMT)) {
271 xlog_warn( 282 xfs_warn(mp,
272 "XFS: dirty log written in incompatible format - can't recover"); 283 "dirty log written in incompatible format - can't recover");
273 xlog_header_check_dump(mp, head); 284 xlog_header_check_dump(mp, head);
274 XFS_ERROR_REPORT("xlog_header_check_recover(1)", 285 XFS_ERROR_REPORT("xlog_header_check_recover(1)",
275 XFS_ERRLEVEL_HIGH, mp); 286 XFS_ERRLEVEL_HIGH, mp);
276 return XFS_ERROR(EFSCORRUPTED); 287 return XFS_ERROR(EFSCORRUPTED);
277 } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) { 288 } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
278 xlog_warn( 289 xfs_warn(mp,
279 "XFS: dirty log entry has mismatched uuid - can't recover"); 290 "dirty log entry has mismatched uuid - can't recover");
280 xlog_header_check_dump(mp, head); 291 xlog_header_check_dump(mp, head);
281 XFS_ERROR_REPORT("xlog_header_check_recover(2)", 292 XFS_ERROR_REPORT("xlog_header_check_recover(2)",
282 XFS_ERRLEVEL_HIGH, mp); 293 XFS_ERRLEVEL_HIGH, mp);
@@ -301,9 +312,9 @@ xlog_header_check_mount(
301 * h_fs_uuid is nil, we assume this log was last mounted 312 * h_fs_uuid is nil, we assume this log was last mounted
302 * by IRIX and continue. 313 * by IRIX and continue.
303 */ 314 */
304 xlog_warn("XFS: nil uuid in log - IRIX style log"); 315 xfs_warn(mp, "nil uuid in log - IRIX style log");
305 } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) { 316 } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
306 xlog_warn("XFS: log has mismatched uuid - can't recover"); 317 xfs_warn(mp, "log has mismatched uuid - can't recover");
307 xlog_header_check_dump(mp, head); 318 xlog_header_check_dump(mp, head);
308 XFS_ERROR_REPORT("xlog_header_check_mount", 319 XFS_ERROR_REPORT("xlog_header_check_mount",
309 XFS_ERRLEVEL_HIGH, mp); 320 XFS_ERRLEVEL_HIGH, mp);
@@ -479,8 +490,8 @@ xlog_find_verify_log_record(
479 for (i = (*last_blk) - 1; i >= 0; i--) { 490 for (i = (*last_blk) - 1; i >= 0; i--) {
480 if (i < start_blk) { 491 if (i < start_blk) {
481 /* valid log record not found */ 492 /* valid log record not found */
482 xlog_warn( 493 xfs_warn(log->l_mp,
483 "XFS: Log inconsistent (didn't find previous header)"); 494 "Log inconsistent (didn't find previous header)");
484 ASSERT(0); 495 ASSERT(0);
485 error = XFS_ERROR(EIO); 496 error = XFS_ERROR(EIO);
486 goto out; 497 goto out;
@@ -580,12 +591,12 @@ xlog_find_head(
580 * mkfs etc write a dummy unmount record to a fresh 591 * mkfs etc write a dummy unmount record to a fresh
581 * log so we can store the uuid in there 592 * log so we can store the uuid in there
582 */ 593 */
583 xlog_warn("XFS: totally zeroed log"); 594 xfs_warn(log->l_mp, "totally zeroed log");
584 } 595 }
585 596
586 return 0; 597 return 0;
587 } else if (error) { 598 } else if (error) {
588 xlog_warn("XFS: empty log check failed"); 599 xfs_warn(log->l_mp, "empty log check failed");
589 return error; 600 return error;
590 } 601 }
591 602
@@ -808,7 +819,7 @@ validate_head:
808 xlog_put_bp(bp); 819 xlog_put_bp(bp);
809 820
810 if (error) 821 if (error)
811 xlog_warn("XFS: failed to find log head"); 822 xfs_warn(log->l_mp, "failed to find log head");
812 return error; 823 return error;
813} 824}
814 825
@@ -901,7 +912,7 @@ xlog_find_tail(
901 } 912 }
902 } 913 }
903 if (!found) { 914 if (!found) {
904 xlog_warn("XFS: xlog_find_tail: couldn't find sync record"); 915 xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
905 ASSERT(0); 916 ASSERT(0);
906 return XFS_ERROR(EIO); 917 return XFS_ERROR(EIO);
907 } 918 }
@@ -925,12 +936,12 @@ xlog_find_tail(
925 log->l_curr_cycle = be32_to_cpu(rhead->h_cycle); 936 log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
926 if (found == 2) 937 if (found == 2)
927 log->l_curr_cycle++; 938 log->l_curr_cycle++;
928 log->l_tail_lsn = be64_to_cpu(rhead->h_tail_lsn); 939 atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn));
929 log->l_last_sync_lsn = be64_to_cpu(rhead->h_lsn); 940 atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn));
930 log->l_grant_reserve_cycle = log->l_curr_cycle; 941 xlog_assign_grant_head(&log->l_grant_reserve_head, log->l_curr_cycle,
931 log->l_grant_reserve_bytes = BBTOB(log->l_curr_block); 942 BBTOB(log->l_curr_block));
932 log->l_grant_write_cycle = log->l_curr_cycle; 943 xlog_assign_grant_head(&log->l_grant_write_head, log->l_curr_cycle,
933 log->l_grant_write_bytes = BBTOB(log->l_curr_block); 944 BBTOB(log->l_curr_block));
934 945
935 /* 946 /*
936 * Look for unmount record. If we find it, then we know there 947 * Look for unmount record. If we find it, then we know there
@@ -960,7 +971,7 @@ xlog_find_tail(
960 } 971 }
961 after_umount_blk = (i + hblks + (int) 972 after_umount_blk = (i + hblks + (int)
962 BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize; 973 BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize;
963 tail_lsn = log->l_tail_lsn; 974 tail_lsn = atomic64_read(&log->l_tail_lsn);
964 if (*head_blk == after_umount_blk && 975 if (*head_blk == after_umount_blk &&
965 be32_to_cpu(rhead->h_num_logops) == 1) { 976 be32_to_cpu(rhead->h_num_logops) == 1) {
966 umount_data_blk = (i + hblks) % log->l_logBBsize; 977 umount_data_blk = (i + hblks) % log->l_logBBsize;
@@ -975,12 +986,10 @@ xlog_find_tail(
975 * log records will point recovery to after the 986 * log records will point recovery to after the
976 * current unmount record. 987 * current unmount record.
977 */ 988 */
978 log->l_tail_lsn = 989 xlog_assign_atomic_lsn(&log->l_tail_lsn,
979 xlog_assign_lsn(log->l_curr_cycle, 990 log->l_curr_cycle, after_umount_blk);
980 after_umount_blk); 991 xlog_assign_atomic_lsn(&log->l_last_sync_lsn,
981 log->l_last_sync_lsn = 992 log->l_curr_cycle, after_umount_blk);
982 xlog_assign_lsn(log->l_curr_cycle,
983 after_umount_blk);
984 *tail_blk = after_umount_blk; 993 *tail_blk = after_umount_blk;
985 994
986 /* 995 /*
@@ -1019,7 +1028,7 @@ done:
1019 xlog_put_bp(bp); 1028 xlog_put_bp(bp);
1020 1029
1021 if (error) 1030 if (error)
1022 xlog_warn("XFS: failed to locate log tail"); 1031 xfs_warn(log->l_mp, "failed to locate log tail");
1023 return error; 1032 return error;
1024} 1033}
1025 1034
@@ -1083,7 +1092,8 @@ xlog_find_zeroed(
1083 * the first block must be 1. If it's not, maybe we're 1092 * the first block must be 1. If it's not, maybe we're
1084 * not looking at a log... Bail out. 1093 * not looking at a log... Bail out.
1085 */ 1094 */
1086 xlog_warn("XFS: Log inconsistent or not a log (last==0, first!=1)"); 1095 xfs_warn(log->l_mp,
1096 "Log inconsistent or not a log (last==0, first!=1)");
1087 return XFS_ERROR(EINVAL); 1097 return XFS_ERROR(EINVAL);
1088 } 1098 }
1089 1099
@@ -1497,8 +1507,8 @@ xlog_recover_add_to_trans(
1497 if (list_empty(&trans->r_itemq)) { 1507 if (list_empty(&trans->r_itemq)) {
1498 /* we need to catch log corruptions here */ 1508 /* we need to catch log corruptions here */
1499 if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) { 1509 if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) {
1500 xlog_warn("XFS: xlog_recover_add_to_trans: " 1510 xfs_warn(log->l_mp, "%s: bad header magic number",
1501 "bad header magic number"); 1511 __func__);
1502 ASSERT(0); 1512 ASSERT(0);
1503 return XFS_ERROR(EIO); 1513 return XFS_ERROR(EIO);
1504 } 1514 }
@@ -1525,8 +1535,8 @@ xlog_recover_add_to_trans(
1525 if (item->ri_total == 0) { /* first region to be added */ 1535 if (item->ri_total == 0) { /* first region to be added */
1526 if (in_f->ilf_size == 0 || 1536 if (in_f->ilf_size == 0 ||
1527 in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) { 1537 in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) {
1528 xlog_warn( 1538 xfs_warn(log->l_mp,
1529 "XFS: bad number of regions (%d) in inode log format", 1539 "bad number of regions (%d) in inode log format",
1530 in_f->ilf_size); 1540 in_f->ilf_size);
1531 ASSERT(0); 1541 ASSERT(0);
1532 return XFS_ERROR(EIO); 1542 return XFS_ERROR(EIO);
@@ -1583,8 +1593,9 @@ xlog_recover_reorder_trans(
1583 list_move_tail(&item->ri_list, &trans->r_itemq); 1593 list_move_tail(&item->ri_list, &trans->r_itemq);
1584 break; 1594 break;
1585 default: 1595 default:
1586 xlog_warn( 1596 xfs_warn(log->l_mp,
1587 "XFS: xlog_recover_reorder_trans: unrecognized type of log operation"); 1597 "%s: unrecognized type of log operation",
1598 __func__);
1588 ASSERT(0); 1599 ASSERT(0);
1589 return XFS_ERROR(EIO); 1600 return XFS_ERROR(EIO);
1590 } 1601 }
@@ -1605,82 +1616,45 @@ xlog_recover_reorder_trans(
1605 * record in the table to tell us how many times we expect to see this 1616 * record in the table to tell us how many times we expect to see this
1606 * record during the second pass. 1617 * record during the second pass.
1607 */ 1618 */
1608STATIC void 1619STATIC int
1609xlog_recover_do_buffer_pass1( 1620xlog_recover_buffer_pass1(
1610 xlog_t *log, 1621 struct log *log,
1611 xfs_buf_log_format_t *buf_f) 1622 xlog_recover_item_t *item)
1612{ 1623{
1613 xfs_buf_cancel_t *bcp; 1624 xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
1614 xfs_buf_cancel_t *nextp; 1625 struct list_head *bucket;
1615 xfs_buf_cancel_t *prevp; 1626 struct xfs_buf_cancel *bcp;
1616 xfs_buf_cancel_t **bucket;
1617 xfs_daddr_t blkno = 0;
1618 uint len = 0;
1619 ushort flags = 0;
1620
1621 switch (buf_f->blf_type) {
1622 case XFS_LI_BUF:
1623 blkno = buf_f->blf_blkno;
1624 len = buf_f->blf_len;
1625 flags = buf_f->blf_flags;
1626 break;
1627 }
1628 1627
1629 /* 1628 /*
1630 * If this isn't a cancel buffer item, then just return. 1629 * If this isn't a cancel buffer item, then just return.
1631 */ 1630 */
1632 if (!(flags & XFS_BLF_CANCEL)) { 1631 if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) {
1633 trace_xfs_log_recover_buf_not_cancel(log, buf_f); 1632 trace_xfs_log_recover_buf_not_cancel(log, buf_f);
1634 return; 1633 return 0;
1635 }
1636
1637 /*
1638 * Insert an xfs_buf_cancel record into the hash table of
1639 * them. If there is already an identical record, bump
1640 * its reference count.
1641 */
1642 bucket = &log->l_buf_cancel_table[(__uint64_t)blkno %
1643 XLOG_BC_TABLE_SIZE];
1644 /*
1645 * If the hash bucket is empty then just insert a new record into
1646 * the bucket.
1647 */
1648 if (*bucket == NULL) {
1649 bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t),
1650 KM_SLEEP);
1651 bcp->bc_blkno = blkno;
1652 bcp->bc_len = len;
1653 bcp->bc_refcount = 1;
1654 bcp->bc_next = NULL;
1655 *bucket = bcp;
1656 return;
1657 } 1634 }
1658 1635
1659 /* 1636 /*
1660 * The hash bucket is not empty, so search for duplicates of our 1637 * Insert an xfs_buf_cancel record into the hash table of them.
1661 * record. If we find one them just bump its refcount. If not 1638 * If there is already an identical record, bump its reference count.
1662 * then add us at the end of the list.
1663 */ 1639 */
1664 prevp = NULL; 1640 bucket = XLOG_BUF_CANCEL_BUCKET(log, buf_f->blf_blkno);
1665 nextp = *bucket; 1641 list_for_each_entry(bcp, bucket, bc_list) {
1666 while (nextp != NULL) { 1642 if (bcp->bc_blkno == buf_f->blf_blkno &&
1667 if (nextp->bc_blkno == blkno && nextp->bc_len == len) { 1643 bcp->bc_len == buf_f->blf_len) {
1668 nextp->bc_refcount++; 1644 bcp->bc_refcount++;
1669 trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f); 1645 trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f);
1670 return; 1646 return 0;
1671 } 1647 }
1672 prevp = nextp; 1648 }
1673 nextp = nextp->bc_next; 1649
1674 } 1650 bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), KM_SLEEP);
1675 ASSERT(prevp != NULL); 1651 bcp->bc_blkno = buf_f->blf_blkno;
1676 bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t), 1652 bcp->bc_len = buf_f->blf_len;
1677 KM_SLEEP);
1678 bcp->bc_blkno = blkno;
1679 bcp->bc_len = len;
1680 bcp->bc_refcount = 1; 1653 bcp->bc_refcount = 1;
1681 bcp->bc_next = NULL; 1654 list_add_tail(&bcp->bc_list, bucket);
1682 prevp->bc_next = bcp; 1655
1683 trace_xfs_log_recover_buf_cancel_add(log, buf_f); 1656 trace_xfs_log_recover_buf_cancel_add(log, buf_f);
1657 return 0;
1684} 1658}
1685 1659
1686/* 1660/*
@@ -1698,14 +1672,13 @@ xlog_recover_do_buffer_pass1(
1698 */ 1672 */
1699STATIC int 1673STATIC int
1700xlog_check_buffer_cancelled( 1674xlog_check_buffer_cancelled(
1701 xlog_t *log, 1675 struct log *log,
1702 xfs_daddr_t blkno, 1676 xfs_daddr_t blkno,
1703 uint len, 1677 uint len,
1704 ushort flags) 1678 ushort flags)
1705{ 1679{
1706 xfs_buf_cancel_t *bcp; 1680 struct list_head *bucket;
1707 xfs_buf_cancel_t *prevp; 1681 struct xfs_buf_cancel *bcp;
1708 xfs_buf_cancel_t **bucket;
1709 1682
1710 if (log->l_buf_cancel_table == NULL) { 1683 if (log->l_buf_cancel_table == NULL) {
1711 /* 1684 /*
@@ -1716,128 +1689,70 @@ xlog_check_buffer_cancelled(
1716 return 0; 1689 return 0;
1717 } 1690 }
1718 1691
1719 bucket = &log->l_buf_cancel_table[(__uint64_t)blkno %
1720 XLOG_BC_TABLE_SIZE];
1721 bcp = *bucket;
1722 if (bcp == NULL) {
1723 /*
1724 * There is no corresponding entry in the table built
1725 * in pass one, so this buffer has not been cancelled.
1726 */
1727 ASSERT(!(flags & XFS_BLF_CANCEL));
1728 return 0;
1729 }
1730
1731 /* 1692 /*
1732 * Search for an entry in the buffer cancel table that 1693 * Search for an entry in the cancel table that matches our buffer.
1733 * matches our buffer.
1734 */ 1694 */
1735 prevp = NULL; 1695 bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno);
1736 while (bcp != NULL) { 1696 list_for_each_entry(bcp, bucket, bc_list) {
1737 if (bcp->bc_blkno == blkno && bcp->bc_len == len) { 1697 if (bcp->bc_blkno == blkno && bcp->bc_len == len)
1738 /* 1698 goto found;
1739 * We've go a match, so return 1 so that the
1740 * recovery of this buffer is cancelled.
1741 * If this buffer is actually a buffer cancel
1742 * log item, then decrement the refcount on the
1743 * one in the table and remove it if this is the
1744 * last reference.
1745 */
1746 if (flags & XFS_BLF_CANCEL) {
1747 bcp->bc_refcount--;
1748 if (bcp->bc_refcount == 0) {
1749 if (prevp == NULL) {
1750 *bucket = bcp->bc_next;
1751 } else {
1752 prevp->bc_next = bcp->bc_next;
1753 }
1754 kmem_free(bcp);
1755 }
1756 }
1757 return 1;
1758 }
1759 prevp = bcp;
1760 bcp = bcp->bc_next;
1761 } 1699 }
1700
1762 /* 1701 /*
1763 * We didn't find a corresponding entry in the table, so 1702 * We didn't find a corresponding entry in the table, so return 0 so
1764 * return 0 so that the buffer is NOT cancelled. 1703 * that the buffer is NOT cancelled.
1765 */ 1704 */
1766 ASSERT(!(flags & XFS_BLF_CANCEL)); 1705 ASSERT(!(flags & XFS_BLF_CANCEL));
1767 return 0; 1706 return 0;
1768}
1769 1707
1770STATIC int 1708found:
1771xlog_recover_do_buffer_pass2( 1709 /*
1772 	xlog_t			*log, 1710	 * We've got a match, so return 1 so that the recovery of this buffer
1773 xfs_buf_log_format_t *buf_f) 1711 * is cancelled. If this buffer is actually a buffer cancel log
1774{ 1712 * item, then decrement the refcount on the one in the table and
1775 xfs_daddr_t blkno = 0; 1713 * remove it if this is the last reference.
1776 ushort flags = 0; 1714 */
1777 uint len = 0; 1715 if (flags & XFS_BLF_CANCEL) {
1778 1716 if (--bcp->bc_refcount == 0) {
1779 switch (buf_f->blf_type) { 1717 list_del(&bcp->bc_list);
1780 case XFS_LI_BUF: 1718 kmem_free(bcp);
1781 blkno = buf_f->blf_blkno; 1719 }
1782 flags = buf_f->blf_flags;
1783 len = buf_f->blf_len;
1784 break;
1785 } 1720 }
1786 1721 return 1;
1787 return xlog_check_buffer_cancelled(log, blkno, len, flags);
1788} 1722}
1789 1723
1790/* 1724/*
1791 * Perform recovery for a buffer full of inodes. In these buffers, 1725 * Perform recovery for a buffer full of inodes. In these buffers, the only
1792 * the only data which should be recovered is that which corresponds 1726 * data which should be recovered is that which corresponds to the
1793 * to the di_next_unlinked pointers in the on disk inode structures. 1727 * di_next_unlinked pointers in the on disk inode structures. The rest of the
1794 * The rest of the data for the inodes is always logged through the 1728 * data for the inodes is always logged through the inodes themselves rather
1795 * inodes themselves rather than the inode buffer and is recovered 1729 * than the inode buffer and is recovered in xlog_recover_inode_pass2().
1796 * in xlog_recover_do_inode_trans().
1797 * 1730 *
1798 * The only time when buffers full of inodes are fully recovered is 1731 * The only time when buffers full of inodes are fully recovered is when the
1799 * when the buffer is full of newly allocated inodes. In this case 1732 * buffer is full of newly allocated inodes. In this case the buffer will
1800 * the buffer will not be marked as an inode buffer and so will be 1733 * not be marked as an inode buffer and so will be sent to
1801 * sent to xlog_recover_do_reg_buffer() below during recovery. 1734 * xlog_recover_do_reg_buffer() below during recovery.
1802 */ 1735 */
1803STATIC int 1736STATIC int
1804xlog_recover_do_inode_buffer( 1737xlog_recover_do_inode_buffer(
1805 xfs_mount_t *mp, 1738 struct xfs_mount *mp,
1806 xlog_recover_item_t *item, 1739 xlog_recover_item_t *item,
1807 xfs_buf_t *bp, 1740 struct xfs_buf *bp,
1808 xfs_buf_log_format_t *buf_f) 1741 xfs_buf_log_format_t *buf_f)
1809{ 1742{
1810 int i; 1743 int i;
1811 int item_index; 1744 int item_index = 0;
1812 int bit; 1745 int bit = 0;
1813 int nbits; 1746 int nbits = 0;
1814 int reg_buf_offset; 1747 int reg_buf_offset = 0;
1815 int reg_buf_bytes; 1748 int reg_buf_bytes = 0;
1816 int next_unlinked_offset; 1749 int next_unlinked_offset;
1817 int inodes_per_buf; 1750 int inodes_per_buf;
1818 xfs_agino_t *logged_nextp; 1751 xfs_agino_t *logged_nextp;
1819 xfs_agino_t *buffer_nextp; 1752 xfs_agino_t *buffer_nextp;
1820 unsigned int *data_map = NULL;
1821 unsigned int map_size = 0;
1822 1753
1823 trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f); 1754 trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f);
1824 1755
1825 switch (buf_f->blf_type) {
1826 case XFS_LI_BUF:
1827 data_map = buf_f->blf_data_map;
1828 map_size = buf_f->blf_map_size;
1829 break;
1830 }
1831 /*
1832 * Set the variables corresponding to the current region to
1833 * 0 so that we'll initialize them on the first pass through
1834 * the loop.
1835 */
1836 reg_buf_offset = 0;
1837 reg_buf_bytes = 0;
1838 bit = 0;
1839 nbits = 0;
1840 item_index = 0;
1841 inodes_per_buf = XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog; 1756 inodes_per_buf = XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog;
1842 for (i = 0; i < inodes_per_buf; i++) { 1757 for (i = 0; i < inodes_per_buf; i++) {
1843 next_unlinked_offset = (i * mp->m_sb.sb_inodesize) + 1758 next_unlinked_offset = (i * mp->m_sb.sb_inodesize) +
@@ -1852,18 +1767,18 @@ xlog_recover_do_inode_buffer(
1852 * the current di_next_unlinked field. 1767 * the current di_next_unlinked field.
1853 */ 1768 */
1854 bit += nbits; 1769 bit += nbits;
1855 bit = xfs_next_bit(data_map, map_size, bit); 1770 bit = xfs_next_bit(buf_f->blf_data_map,
1771 buf_f->blf_map_size, bit);
1856 1772
1857 /* 1773 /*
1858 * If there are no more logged regions in the 1774 * If there are no more logged regions in the
1859 * buffer, then we're done. 1775 * buffer, then we're done.
1860 */ 1776 */
1861 if (bit == -1) { 1777 if (bit == -1)
1862 return 0; 1778 return 0;
1863 }
1864 1779
1865 nbits = xfs_contig_bits(data_map, map_size, 1780 nbits = xfs_contig_bits(buf_f->blf_data_map,
1866 bit); 1781 buf_f->blf_map_size, bit);
1867 ASSERT(nbits > 0); 1782 ASSERT(nbits > 0);
1868 reg_buf_offset = bit << XFS_BLF_SHIFT; 1783 reg_buf_offset = bit << XFS_BLF_SHIFT;
1869 reg_buf_bytes = nbits << XFS_BLF_SHIFT; 1784 reg_buf_bytes = nbits << XFS_BLF_SHIFT;
@@ -1875,9 +1790,8 @@ xlog_recover_do_inode_buffer(
1875 * di_next_unlinked field, then move on to the next 1790 * di_next_unlinked field, then move on to the next
1876 * di_next_unlinked field. 1791 * di_next_unlinked field.
1877 */ 1792 */
1878 if (next_unlinked_offset < reg_buf_offset) { 1793 if (next_unlinked_offset < reg_buf_offset)
1879 continue; 1794 continue;
1880 }
1881 1795
1882 ASSERT(item->ri_buf[item_index].i_addr != NULL); 1796 ASSERT(item->ri_buf[item_index].i_addr != NULL);
1883 ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0); 1797 ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0);
@@ -1891,8 +1805,9 @@ xlog_recover_do_inode_buffer(
1891 logged_nextp = item->ri_buf[item_index].i_addr + 1805 logged_nextp = item->ri_buf[item_index].i_addr +
1892 next_unlinked_offset - reg_buf_offset; 1806 next_unlinked_offset - reg_buf_offset;
1893 if (unlikely(*logged_nextp == 0)) { 1807 if (unlikely(*logged_nextp == 0)) {
1894 xfs_fs_cmn_err(CE_ALERT, mp, 1808 xfs_alert(mp,
1895 "bad inode buffer log record (ptr = 0x%p, bp = 0x%p). XFS trying to replay bad (0) inode di_next_unlinked field", 1809 "Bad inode buffer log record (ptr = 0x%p, bp = 0x%p). "
1810 "Trying to replay bad (0) inode di_next_unlinked field.",
1896 item, bp); 1811 item, bp);
1897 XFS_ERROR_REPORT("xlog_recover_do_inode_buf", 1812 XFS_ERROR_REPORT("xlog_recover_do_inode_buf",
1898 XFS_ERRLEVEL_LOW, mp); 1813 XFS_ERRLEVEL_LOW, mp);
@@ -1913,36 +1828,29 @@ xlog_recover_do_inode_buffer(
1913 * given buffer. The bitmap in the buf log format structure indicates 1828 * given buffer. The bitmap in the buf log format structure indicates
1914 * where to place the logged data. 1829 * where to place the logged data.
1915 */ 1830 */
1916/*ARGSUSED*/
1917STATIC void 1831STATIC void
1918xlog_recover_do_reg_buffer( 1832xlog_recover_do_reg_buffer(
1919 struct xfs_mount *mp, 1833 struct xfs_mount *mp,
1920 xlog_recover_item_t *item, 1834 xlog_recover_item_t *item,
1921 xfs_buf_t *bp, 1835 struct xfs_buf *bp,
1922 xfs_buf_log_format_t *buf_f) 1836 xfs_buf_log_format_t *buf_f)
1923{ 1837{
1924 int i; 1838 int i;
1925 int bit; 1839 int bit;
1926 int nbits; 1840 int nbits;
1927 unsigned int *data_map = NULL;
1928 unsigned int map_size = 0;
1929 int error; 1841 int error;
1930 1842
1931 trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f); 1843 trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f);
1932 1844
1933 switch (buf_f->blf_type) {
1934 case XFS_LI_BUF:
1935 data_map = buf_f->blf_data_map;
1936 map_size = buf_f->blf_map_size;
1937 break;
1938 }
1939 bit = 0; 1845 bit = 0;
1940 i = 1; /* 0 is the buf format structure */ 1846 i = 1; /* 0 is the buf format structure */
1941 while (1) { 1847 while (1) {
1942 bit = xfs_next_bit(data_map, map_size, bit); 1848 bit = xfs_next_bit(buf_f->blf_data_map,
1849 buf_f->blf_map_size, bit);
1943 if (bit == -1) 1850 if (bit == -1)
1944 break; 1851 break;
1945 nbits = xfs_contig_bits(data_map, map_size, bit); 1852 nbits = xfs_contig_bits(buf_f->blf_data_map,
1853 buf_f->blf_map_size, bit);
1946 ASSERT(nbits > 0); 1854 ASSERT(nbits > 0);
1947 ASSERT(item->ri_buf[i].i_addr != NULL); 1855 ASSERT(item->ri_buf[i].i_addr != NULL);
1948 ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0); 1856 ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0);
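
Editor's note: after this change both xlog_recover_do_inode_buffer() and xlog_recover_do_reg_buffer() read blf_data_map/blf_map_size straight from the buf log format and walk it with xfs_next_bit()/xfs_contig_bits() to locate each logged region; bit << XFS_BLF_SHIFT gives the region's byte offset and nbits << XFS_BLF_SHIFT its length. The sketch below (not part of the patch) reimplements that walk in plain C: next_bit()/contig_bits() are simplified stand-ins for the kernel helpers, and the 128-byte chunk size (shift of 7) is an assumption for illustration.

/*
 * Sketch of the logged-region walk over a buf log item's dirty bitmap:
 * find the next set bit, count the contiguous run, convert to bytes.
 */
#include <stdio.h>

#define CHUNK_SHIFT 7	/* assumed stand-in for XFS_BLF_SHIFT */

/* Return index of next set bit at or after 'start', or -1 if none. */
static int next_bit(const unsigned int *map, int nbits_total, int start)
{
	for (int i = start; i < nbits_total; i++)
		if (map[i / 32] & (1u << (i % 32)))
			return i;
	return -1;
}

/* Count contiguous set bits starting at 'start'. */
static int contig_bits(const unsigned int *map, int nbits_total, int start)
{
	int n = 0;

	while (start + n < nbits_total &&
	       (map[(start + n) / 32] & (1u << ((start + n) % 32))))
		n++;
	return n;
}

int main(void)
{
	/* bits 0-2 and 8-9 set: two logged regions */
	unsigned int data_map[1] = { 0x307 };
	int map_bits = 32;
	int bit = 0;

	while ((bit = next_bit(data_map, map_bits, bit)) != -1) {
		int nbits = contig_bits(data_map, map_bits, bit);

		printf("region at byte %d, length %d bytes\n",
		       bit << CHUNK_SHIFT, nbits << CHUNK_SHIFT);
		bit += nbits;
	}
	return 0;
}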
@@ -1958,17 +1866,17 @@ xlog_recover_do_reg_buffer(
1958 if (buf_f->blf_flags & 1866 if (buf_f->blf_flags &
1959 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { 1867 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
1960 if (item->ri_buf[i].i_addr == NULL) { 1868 if (item->ri_buf[i].i_addr == NULL) {
1961 cmn_err(CE_ALERT, 1869 xfs_alert(mp,
1962 "XFS: NULL dquot in %s.", __func__); 1870 "XFS: NULL dquot in %s.", __func__);
1963 goto next; 1871 goto next;
1964 } 1872 }
1965 if (item->ri_buf[i].i_len < sizeof(xfs_disk_dquot_t)) { 1873 if (item->ri_buf[i].i_len < sizeof(xfs_disk_dquot_t)) {
1966 cmn_err(CE_ALERT, 1874 xfs_alert(mp,
1967 "XFS: dquot too small (%d) in %s.", 1875 "XFS: dquot too small (%d) in %s.",
1968 item->ri_buf[i].i_len, __func__); 1876 item->ri_buf[i].i_len, __func__);
1969 goto next; 1877 goto next;
1970 } 1878 }
1971 error = xfs_qm_dqcheck(item->ri_buf[i].i_addr, 1879 error = xfs_qm_dqcheck(mp, item->ri_buf[i].i_addr,
1972 -1, 0, XFS_QMOPT_DOWARN, 1880 -1, 0, XFS_QMOPT_DOWARN,
1973 "dquot_buf_recover"); 1881 "dquot_buf_recover");
1974 if (error) 1882 if (error)
@@ -1993,6 +1901,7 @@ xlog_recover_do_reg_buffer(
1993 */ 1901 */
1994int 1902int
1995xfs_qm_dqcheck( 1903xfs_qm_dqcheck(
1904 struct xfs_mount *mp,
1996 xfs_disk_dquot_t *ddq, 1905 xfs_disk_dquot_t *ddq,
1997 xfs_dqid_t id, 1906 xfs_dqid_t id,
1998 uint type, /* used only when IO_dorepair is true */ 1907 uint type, /* used only when IO_dorepair is true */
@@ -2019,14 +1928,14 @@ xfs_qm_dqcheck(
2019 */ 1928 */
2020 if (be16_to_cpu(ddq->d_magic) != XFS_DQUOT_MAGIC) { 1929 if (be16_to_cpu(ddq->d_magic) != XFS_DQUOT_MAGIC) {
2021 if (flags & XFS_QMOPT_DOWARN) 1930 if (flags & XFS_QMOPT_DOWARN)
2022 cmn_err(CE_ALERT, 1931 xfs_alert(mp,
2023 "%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x", 1932 "%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x",
2024 str, id, be16_to_cpu(ddq->d_magic), XFS_DQUOT_MAGIC); 1933 str, id, be16_to_cpu(ddq->d_magic), XFS_DQUOT_MAGIC);
2025 errs++; 1934 errs++;
2026 } 1935 }
2027 if (ddq->d_version != XFS_DQUOT_VERSION) { 1936 if (ddq->d_version != XFS_DQUOT_VERSION) {
2028 if (flags & XFS_QMOPT_DOWARN) 1937 if (flags & XFS_QMOPT_DOWARN)
2029 cmn_err(CE_ALERT, 1938 xfs_alert(mp,
2030 "%s : XFS dquot ID 0x%x, version 0x%x != 0x%x", 1939 "%s : XFS dquot ID 0x%x, version 0x%x != 0x%x",
2031 str, id, ddq->d_version, XFS_DQUOT_VERSION); 1940 str, id, ddq->d_version, XFS_DQUOT_VERSION);
2032 errs++; 1941 errs++;
@@ -2036,7 +1945,7 @@ xfs_qm_dqcheck(
2036 ddq->d_flags != XFS_DQ_PROJ && 1945 ddq->d_flags != XFS_DQ_PROJ &&
2037 ddq->d_flags != XFS_DQ_GROUP) { 1946 ddq->d_flags != XFS_DQ_GROUP) {
2038 if (flags & XFS_QMOPT_DOWARN) 1947 if (flags & XFS_QMOPT_DOWARN)
2039 cmn_err(CE_ALERT, 1948 xfs_alert(mp,
2040 "%s : XFS dquot ID 0x%x, unknown flags 0x%x", 1949 "%s : XFS dquot ID 0x%x, unknown flags 0x%x",
2041 str, id, ddq->d_flags); 1950 str, id, ddq->d_flags);
2042 errs++; 1951 errs++;
@@ -2044,7 +1953,7 @@ xfs_qm_dqcheck(
2044 1953
2045 if (id != -1 && id != be32_to_cpu(ddq->d_id)) { 1954 if (id != -1 && id != be32_to_cpu(ddq->d_id)) {
2046 if (flags & XFS_QMOPT_DOWARN) 1955 if (flags & XFS_QMOPT_DOWARN)
2047 cmn_err(CE_ALERT, 1956 xfs_alert(mp,
2048 "%s : ondisk-dquot 0x%p, ID mismatch: " 1957 "%s : ondisk-dquot 0x%p, ID mismatch: "
2049 "0x%x expected, found id 0x%x", 1958 "0x%x expected, found id 0x%x",
2050 str, ddq, id, be32_to_cpu(ddq->d_id)); 1959 str, ddq, id, be32_to_cpu(ddq->d_id));
@@ -2057,9 +1966,8 @@ xfs_qm_dqcheck(
2057 be64_to_cpu(ddq->d_blk_softlimit)) { 1966 be64_to_cpu(ddq->d_blk_softlimit)) {
2058 if (!ddq->d_btimer) { 1967 if (!ddq->d_btimer) {
2059 if (flags & XFS_QMOPT_DOWARN) 1968 if (flags & XFS_QMOPT_DOWARN)
2060 cmn_err(CE_ALERT, 1969 xfs_alert(mp,
2061 "%s : Dquot ID 0x%x (0x%p) " 1970 "%s : Dquot ID 0x%x (0x%p) BLK TIMER NOT STARTED",
2062 "BLK TIMER NOT STARTED",
2063 str, (int)be32_to_cpu(ddq->d_id), ddq); 1971 str, (int)be32_to_cpu(ddq->d_id), ddq);
2064 errs++; 1972 errs++;
2065 } 1973 }
@@ -2069,9 +1977,8 @@ xfs_qm_dqcheck(
2069 be64_to_cpu(ddq->d_ino_softlimit)) { 1977 be64_to_cpu(ddq->d_ino_softlimit)) {
2070 if (!ddq->d_itimer) { 1978 if (!ddq->d_itimer) {
2071 if (flags & XFS_QMOPT_DOWARN) 1979 if (flags & XFS_QMOPT_DOWARN)
2072 cmn_err(CE_ALERT, 1980 xfs_alert(mp,
2073 "%s : Dquot ID 0x%x (0x%p) " 1981 "%s : Dquot ID 0x%x (0x%p) INODE TIMER NOT STARTED",
2074 "INODE TIMER NOT STARTED",
2075 str, (int)be32_to_cpu(ddq->d_id), ddq); 1982 str, (int)be32_to_cpu(ddq->d_id), ddq);
2076 errs++; 1983 errs++;
2077 } 1984 }
@@ -2081,9 +1988,8 @@ xfs_qm_dqcheck(
2081 be64_to_cpu(ddq->d_rtb_softlimit)) { 1988 be64_to_cpu(ddq->d_rtb_softlimit)) {
2082 if (!ddq->d_rtbtimer) { 1989 if (!ddq->d_rtbtimer) {
2083 if (flags & XFS_QMOPT_DOWARN) 1990 if (flags & XFS_QMOPT_DOWARN)
2084 cmn_err(CE_ALERT, 1991 xfs_alert(mp,
2085 "%s : Dquot ID 0x%x (0x%p) " 1992 "%s : Dquot ID 0x%x (0x%p) RTBLK TIMER NOT STARTED",
2086 "RTBLK TIMER NOT STARTED",
2087 str, (int)be32_to_cpu(ddq->d_id), ddq); 1993 str, (int)be32_to_cpu(ddq->d_id), ddq);
2088 errs++; 1994 errs++;
2089 } 1995 }
@@ -2094,7 +2000,7 @@ xfs_qm_dqcheck(
2094 return errs; 2000 return errs;
2095 2001
2096 if (flags & XFS_QMOPT_DOWARN) 2002 if (flags & XFS_QMOPT_DOWARN)
2097 cmn_err(CE_NOTE, "Re-initializing dquot ID 0x%x", id); 2003 xfs_notice(mp, "Re-initializing dquot ID 0x%x", id);
2098 2004
2099 /* 2005 /*
2100 * Typically, a repair is only requested by quotacheck. 2006 * Typically, a repair is only requested by quotacheck.
@@ -2176,77 +2082,46 @@ xlog_recover_do_dquot_buffer(
2176 * for more details on the implementation of the table of cancel records. 2082 * for more details on the implementation of the table of cancel records.
2177 */ 2083 */
2178STATIC int 2084STATIC int
2179xlog_recover_do_buffer_trans( 2085xlog_recover_buffer_pass2(
2180 xlog_t *log, 2086 xlog_t *log,
2181 xlog_recover_item_t *item, 2087 xlog_recover_item_t *item)
2182 int pass)
2183{ 2088{
2184 xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; 2089 xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
2185 xfs_mount_t *mp; 2090 xfs_mount_t *mp = log->l_mp;
2186 xfs_buf_t *bp; 2091 xfs_buf_t *bp;
2187 int error; 2092 int error;
2188 int cancel;
2189 xfs_daddr_t blkno;
2190 int len;
2191 ushort flags;
2192 uint buf_flags; 2093 uint buf_flags;
2193 2094
2194 if (pass == XLOG_RECOVER_PASS1) { 2095 /*
2195 /* 2096 * In this pass we only want to recover all the buffers which have
2196 * In this pass we're only looking for buf items 2097 * not been cancelled and are not cancellation buffers themselves.
2197 * with the XFS_BLF_CANCEL bit set. 2098 */
2198 */ 2099 if (xlog_check_buffer_cancelled(log, buf_f->blf_blkno,
2199 xlog_recover_do_buffer_pass1(log, buf_f); 2100 buf_f->blf_len, buf_f->blf_flags)) {
2101 trace_xfs_log_recover_buf_cancel(log, buf_f);
2200 return 0; 2102 return 0;
2201 } else {
2202 /*
2203 * In this pass we want to recover all the buffers
2204 * which have not been cancelled and are not
2205 * cancellation buffers themselves. The routine
2206 * we call here will tell us whether or not to
2207 * continue with the replay of this buffer.
2208 */
2209 cancel = xlog_recover_do_buffer_pass2(log, buf_f);
2210 if (cancel) {
2211 trace_xfs_log_recover_buf_cancel(log, buf_f);
2212 return 0;
2213 }
2214 } 2103 }
2104
2215 trace_xfs_log_recover_buf_recover(log, buf_f); 2105 trace_xfs_log_recover_buf_recover(log, buf_f);
2216 switch (buf_f->blf_type) {
2217 case XFS_LI_BUF:
2218 blkno = buf_f->blf_blkno;
2219 len = buf_f->blf_len;
2220 flags = buf_f->blf_flags;
2221 break;
2222 default:
2223 xfs_fs_cmn_err(CE_ALERT, log->l_mp,
2224 "xfs_log_recover: unknown buffer type 0x%x, logdev %s",
2225 buf_f->blf_type, log->l_mp->m_logname ?
2226 log->l_mp->m_logname : "internal");
2227 XFS_ERROR_REPORT("xlog_recover_do_buffer_trans",
2228 XFS_ERRLEVEL_LOW, log->l_mp);
2229 return XFS_ERROR(EFSCORRUPTED);
2230 }
2231 2106
2232 mp = log->l_mp;
2233 buf_flags = XBF_LOCK; 2107 buf_flags = XBF_LOCK;
2234 if (!(flags & XFS_BLF_INODE_BUF)) 2108 if (!(buf_f->blf_flags & XFS_BLF_INODE_BUF))
2235 buf_flags |= XBF_MAPPED; 2109 buf_flags |= XBF_MAPPED;
2236 2110
2237 bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, buf_flags); 2111 bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len,
2112 buf_flags);
2238 if (XFS_BUF_ISERROR(bp)) { 2113 if (XFS_BUF_ISERROR(bp)) {
2239 xfs_ioerror_alert("xlog_recover_do..(read#1)", log->l_mp, 2114 xfs_ioerror_alert("xlog_recover_do..(read#1)", mp,
2240 bp, blkno); 2115 bp, buf_f->blf_blkno);
2241 error = XFS_BUF_GETERROR(bp); 2116 error = XFS_BUF_GETERROR(bp);
2242 xfs_buf_relse(bp); 2117 xfs_buf_relse(bp);
2243 return error; 2118 return error;
2244 } 2119 }
2245 2120
2246 error = 0; 2121 error = 0;
2247 if (flags & XFS_BLF_INODE_BUF) { 2122 if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
2248 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); 2123 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
2249 } else if (flags & 2124 } else if (buf_f->blf_flags &
2250 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { 2125 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
2251 xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); 2126 xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
2252 } else { 2127 } else {
@@ -2286,16 +2161,14 @@ xlog_recover_do_buffer_trans(
2286} 2161}
2287 2162
2288STATIC int 2163STATIC int
2289xlog_recover_do_inode_trans( 2164xlog_recover_inode_pass2(
2290 xlog_t *log, 2165 xlog_t *log,
2291 xlog_recover_item_t *item, 2166 xlog_recover_item_t *item)
2292 int pass)
2293{ 2167{
2294 xfs_inode_log_format_t *in_f; 2168 xfs_inode_log_format_t *in_f;
2295 xfs_mount_t *mp; 2169 xfs_mount_t *mp = log->l_mp;
2296 xfs_buf_t *bp; 2170 xfs_buf_t *bp;
2297 xfs_dinode_t *dip; 2171 xfs_dinode_t *dip;
2298 xfs_ino_t ino;
2299 int len; 2172 int len;
2300 xfs_caddr_t src; 2173 xfs_caddr_t src;
2301 xfs_caddr_t dest; 2174 xfs_caddr_t dest;
@@ -2305,10 +2178,6 @@ xlog_recover_do_inode_trans(
2305 xfs_icdinode_t *dicp; 2178 xfs_icdinode_t *dicp;
2306 int need_free = 0; 2179 int need_free = 0;
2307 2180
2308 if (pass == XLOG_RECOVER_PASS1) {
2309 return 0;
2310 }
2311
2312 if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) { 2181 if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) {
2313 in_f = item->ri_buf[0].i_addr; 2182 in_f = item->ri_buf[0].i_addr;
2314 } else { 2183 } else {
@@ -2318,8 +2187,6 @@ xlog_recover_do_inode_trans(
2318 if (error) 2187 if (error)
2319 goto error; 2188 goto error;
2320 } 2189 }
2321 ino = in_f->ilf_ino;
2322 mp = log->l_mp;
2323 2190
2324 /* 2191 /*
2325 * Inode buffers can be freed, look out for it, 2192 * Inode buffers can be freed, look out for it,
@@ -2352,10 +2219,10 @@ xlog_recover_do_inode_trans(
2352 */ 2219 */
2353 if (unlikely(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC)) { 2220 if (unlikely(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC)) {
2354 xfs_buf_relse(bp); 2221 xfs_buf_relse(bp);
2355 xfs_fs_cmn_err(CE_ALERT, mp, 2222 xfs_alert(mp,
2356 "xfs_inode_recover: Bad inode magic number, dino ptr = 0x%p, dino bp = 0x%p, ino = %Ld", 2223 "%s: Bad inode magic number, dip = 0x%p, dino bp = 0x%p, ino = %Ld",
2357 dip, bp, ino); 2224 __func__, dip, bp, in_f->ilf_ino);
2358 XFS_ERROR_REPORT("xlog_recover_do_inode_trans(1)", 2225 XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)",
2359 XFS_ERRLEVEL_LOW, mp); 2226 XFS_ERRLEVEL_LOW, mp);
2360 error = EFSCORRUPTED; 2227 error = EFSCORRUPTED;
2361 goto error; 2228 goto error;
@@ -2363,10 +2230,10 @@ xlog_recover_do_inode_trans(
2363 dicp = item->ri_buf[1].i_addr; 2230 dicp = item->ri_buf[1].i_addr;
2364 if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) { 2231 if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) {
2365 xfs_buf_relse(bp); 2232 xfs_buf_relse(bp);
2366 xfs_fs_cmn_err(CE_ALERT, mp, 2233 xfs_alert(mp,
2367 "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, ino %Ld", 2234 "%s: Bad inode log record, rec ptr 0x%p, ino %Ld",
2368 item, ino); 2235 __func__, item, in_f->ilf_ino);
2369 XFS_ERROR_REPORT("xlog_recover_do_inode_trans(2)", 2236 XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)",
2370 XFS_ERRLEVEL_LOW, mp); 2237 XFS_ERRLEVEL_LOW, mp);
2371 error = EFSCORRUPTED; 2238 error = EFSCORRUPTED;
2372 goto error; 2239 goto error;
@@ -2394,12 +2261,13 @@ xlog_recover_do_inode_trans(
2394 if (unlikely((dicp->di_mode & S_IFMT) == S_IFREG)) { 2261 if (unlikely((dicp->di_mode & S_IFMT) == S_IFREG)) {
2395 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && 2262 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
2396 (dicp->di_format != XFS_DINODE_FMT_BTREE)) { 2263 (dicp->di_format != XFS_DINODE_FMT_BTREE)) {
2397 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(3)", 2264 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)",
2398 XFS_ERRLEVEL_LOW, mp, dicp); 2265 XFS_ERRLEVEL_LOW, mp, dicp);
2399 xfs_buf_relse(bp); 2266 xfs_buf_relse(bp);
2400 xfs_fs_cmn_err(CE_ALERT, mp, 2267 xfs_alert(mp,
2401 "xfs_inode_recover: Bad regular inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", 2268 "%s: Bad regular inode log record, rec ptr 0x%p, "
2402 item, dip, bp, ino); 2269 "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2270 __func__, item, dip, bp, in_f->ilf_ino);
2403 error = EFSCORRUPTED; 2271 error = EFSCORRUPTED;
2404 goto error; 2272 goto error;
2405 } 2273 }
@@ -2407,45 +2275,48 @@ xlog_recover_do_inode_trans(
2407 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && 2275 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
2408 (dicp->di_format != XFS_DINODE_FMT_BTREE) && 2276 (dicp->di_format != XFS_DINODE_FMT_BTREE) &&
2409 (dicp->di_format != XFS_DINODE_FMT_LOCAL)) { 2277 (dicp->di_format != XFS_DINODE_FMT_LOCAL)) {
2410 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(4)", 2278 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)",
2411 XFS_ERRLEVEL_LOW, mp, dicp); 2279 XFS_ERRLEVEL_LOW, mp, dicp);
2412 xfs_buf_relse(bp); 2280 xfs_buf_relse(bp);
2413 xfs_fs_cmn_err(CE_ALERT, mp, 2281 xfs_alert(mp,
2414 "xfs_inode_recover: Bad dir inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", 2282 "%s: Bad dir inode log record, rec ptr 0x%p, "
2415 item, dip, bp, ino); 2283 "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2284 __func__, item, dip, bp, in_f->ilf_ino);
2416 error = EFSCORRUPTED; 2285 error = EFSCORRUPTED;
2417 goto error; 2286 goto error;
2418 } 2287 }
2419 } 2288 }
2420 if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){ 2289 if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){
2421 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(5)", 2290 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)",
2422 XFS_ERRLEVEL_LOW, mp, dicp); 2291 XFS_ERRLEVEL_LOW, mp, dicp);
2423 xfs_buf_relse(bp); 2292 xfs_buf_relse(bp);
2424 xfs_fs_cmn_err(CE_ALERT, mp, 2293 xfs_alert(mp,
2425 "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld", 2294 "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
2426 item, dip, bp, ino, 2295 "dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld",
2296 __func__, item, dip, bp, in_f->ilf_ino,
2427 dicp->di_nextents + dicp->di_anextents, 2297 dicp->di_nextents + dicp->di_anextents,
2428 dicp->di_nblocks); 2298 dicp->di_nblocks);
2429 error = EFSCORRUPTED; 2299 error = EFSCORRUPTED;
2430 goto error; 2300 goto error;
2431 } 2301 }
2432 if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) { 2302 if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) {
2433 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(6)", 2303 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)",
2434 XFS_ERRLEVEL_LOW, mp, dicp); 2304 XFS_ERRLEVEL_LOW, mp, dicp);
2435 xfs_buf_relse(bp); 2305 xfs_buf_relse(bp);
2436 xfs_fs_cmn_err(CE_ALERT, mp, 2306 xfs_alert(mp,
2437 "xfs_inode_recover: Bad inode log rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, forkoff 0x%x", 2307 "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
2438 item, dip, bp, ino, dicp->di_forkoff); 2308 "dino bp 0x%p, ino %Ld, forkoff 0x%x", __func__,
2309 item, dip, bp, in_f->ilf_ino, dicp->di_forkoff);
2439 error = EFSCORRUPTED; 2310 error = EFSCORRUPTED;
2440 goto error; 2311 goto error;
2441 } 2312 }
2442 if (unlikely(item->ri_buf[1].i_len > sizeof(struct xfs_icdinode))) { 2313 if (unlikely(item->ri_buf[1].i_len > sizeof(struct xfs_icdinode))) {
2443 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(7)", 2314 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)",
2444 XFS_ERRLEVEL_LOW, mp, dicp); 2315 XFS_ERRLEVEL_LOW, mp, dicp);
2445 xfs_buf_relse(bp); 2316 xfs_buf_relse(bp);
2446 xfs_fs_cmn_err(CE_ALERT, mp, 2317 xfs_alert(mp,
2447 "xfs_inode_recover: Bad inode log record length %d, rec ptr 0x%p", 2318 "%s: Bad inode log record length %d, rec ptr 0x%p",
2448 item->ri_buf[1].i_len, item); 2319 __func__, item->ri_buf[1].i_len, item);
2449 error = EFSCORRUPTED; 2320 error = EFSCORRUPTED;
2450 goto error; 2321 goto error;
2451 } 2322 }
@@ -2532,7 +2403,7 @@ xlog_recover_do_inode_trans(
2532 break; 2403 break;
2533 2404
2534 default: 2405 default:
2535 xlog_warn("XFS: xlog_recover_do_inode_trans: Invalid flag"); 2406 xfs_warn(log->l_mp, "%s: Invalid flag", __func__);
2536 ASSERT(0); 2407 ASSERT(0);
2537 xfs_buf_relse(bp); 2408 xfs_buf_relse(bp);
2538 error = EIO; 2409 error = EIO;
@@ -2556,18 +2427,11 @@ error:
2556 * of that type. 2427 * of that type.
2557 */ 2428 */
2558STATIC int 2429STATIC int
2559xlog_recover_do_quotaoff_trans( 2430xlog_recover_quotaoff_pass1(
2560 xlog_t *log, 2431 xlog_t *log,
2561 xlog_recover_item_t *item, 2432 xlog_recover_item_t *item)
2562 int pass)
2563{ 2433{
2564 xfs_qoff_logformat_t *qoff_f; 2434 xfs_qoff_logformat_t *qoff_f = item->ri_buf[0].i_addr;
2565
2566 if (pass == XLOG_RECOVER_PASS2) {
2567 return (0);
2568 }
2569
2570 qoff_f = item->ri_buf[0].i_addr;
2571 ASSERT(qoff_f); 2435 ASSERT(qoff_f);
2572 2436
2573 /* 2437 /*
@@ -2588,22 +2452,17 @@ xlog_recover_do_quotaoff_trans(
2588 * Recover a dquot record 2452 * Recover a dquot record
2589 */ 2453 */
2590STATIC int 2454STATIC int
2591xlog_recover_do_dquot_trans( 2455xlog_recover_dquot_pass2(
2592 xlog_t *log, 2456 xlog_t *log,
2593 xlog_recover_item_t *item, 2457 xlog_recover_item_t *item)
2594 int pass)
2595{ 2458{
2596 xfs_mount_t *mp; 2459 xfs_mount_t *mp = log->l_mp;
2597 xfs_buf_t *bp; 2460 xfs_buf_t *bp;
2598 struct xfs_disk_dquot *ddq, *recddq; 2461 struct xfs_disk_dquot *ddq, *recddq;
2599 int error; 2462 int error;
2600 xfs_dq_logformat_t *dq_f; 2463 xfs_dq_logformat_t *dq_f;
2601 uint type; 2464 uint type;
2602 2465
2603 if (pass == XLOG_RECOVER_PASS1) {
2604 return 0;
2605 }
2606 mp = log->l_mp;
2607 2466
2608 /* 2467 /*
2609 * Filesystems are required to send in quota flags at mount time. 2468 * Filesystems are required to send in quota flags at mount time.
@@ -2613,13 +2472,11 @@ xlog_recover_do_dquot_trans(
2613 2472
2614 recddq = item->ri_buf[1].i_addr; 2473 recddq = item->ri_buf[1].i_addr;
2615 if (recddq == NULL) { 2474 if (recddq == NULL) {
2616 cmn_err(CE_ALERT, 2475 xfs_alert(log->l_mp, "NULL dquot in %s.", __func__);
2617 "XFS: NULL dquot in %s.", __func__);
2618 return XFS_ERROR(EIO); 2476 return XFS_ERROR(EIO);
2619 } 2477 }
2620 if (item->ri_buf[1].i_len < sizeof(xfs_disk_dquot_t)) { 2478 if (item->ri_buf[1].i_len < sizeof(xfs_disk_dquot_t)) {
2621 cmn_err(CE_ALERT, 2479 xfs_alert(log->l_mp, "dquot too small (%d) in %s.",
2622 "XFS: dquot too small (%d) in %s.",
2623 item->ri_buf[1].i_len, __func__); 2480 item->ri_buf[1].i_len, __func__);
2624 return XFS_ERROR(EIO); 2481 return XFS_ERROR(EIO);
2625 } 2482 }
@@ -2644,12 +2501,10 @@ xlog_recover_do_dquot_trans(
2644 */ 2501 */
2645 dq_f = item->ri_buf[0].i_addr; 2502 dq_f = item->ri_buf[0].i_addr;
2646 ASSERT(dq_f); 2503 ASSERT(dq_f);
2647 if ((error = xfs_qm_dqcheck(recddq, 2504 error = xfs_qm_dqcheck(mp, recddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
2648 dq_f->qlf_id, 2505 "xlog_recover_dquot_pass2 (log copy)");
2649 0, XFS_QMOPT_DOWARN, 2506 if (error)
2650 "xlog_recover_do_dquot_trans (log copy)"))) {
2651 return XFS_ERROR(EIO); 2507 return XFS_ERROR(EIO);
2652 }
2653 ASSERT(dq_f->qlf_len == 1); 2508 ASSERT(dq_f->qlf_len == 1);
2654 2509
2655 error = xfs_read_buf(mp, mp->m_ddev_targp, 2510 error = xfs_read_buf(mp, mp->m_ddev_targp,
@@ -2669,8 +2524,9 @@ xlog_recover_do_dquot_trans(
2669 * was among a chunk of dquots created earlier, and we did some 2524 * was among a chunk of dquots created earlier, and we did some
2670 * minimal initialization then. 2525 * minimal initialization then.
2671 */ 2526 */
2672 if (xfs_qm_dqcheck(ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN, 2527 error = xfs_qm_dqcheck(mp, ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
2673 "xlog_recover_do_dquot_trans")) { 2528 "xlog_recover_dquot_pass2");
2529 if (error) {
2674 xfs_buf_relse(bp); 2530 xfs_buf_relse(bp);
2675 return XFS_ERROR(EIO); 2531 return XFS_ERROR(EIO);
2676 } 2532 }
@@ -2693,38 +2549,31 @@ xlog_recover_do_dquot_trans(
2693 * LSN. 2549 * LSN.
2694 */ 2550 */
2695STATIC int 2551STATIC int
2696xlog_recover_do_efi_trans( 2552xlog_recover_efi_pass2(
2697 xlog_t *log, 2553 xlog_t *log,
2698 xlog_recover_item_t *item, 2554 xlog_recover_item_t *item,
2699 xfs_lsn_t lsn, 2555 xfs_lsn_t lsn)
2700 int pass)
2701{ 2556{
2702 int error; 2557 int error;
2703 xfs_mount_t *mp; 2558 xfs_mount_t *mp = log->l_mp;
2704 xfs_efi_log_item_t *efip; 2559 xfs_efi_log_item_t *efip;
2705 xfs_efi_log_format_t *efi_formatp; 2560 xfs_efi_log_format_t *efi_formatp;
2706 2561
2707 if (pass == XLOG_RECOVER_PASS1) {
2708 return 0;
2709 }
2710
2711 efi_formatp = item->ri_buf[0].i_addr; 2562 efi_formatp = item->ri_buf[0].i_addr;
2712 2563
2713 mp = log->l_mp;
2714 efip = xfs_efi_init(mp, efi_formatp->efi_nextents); 2564 efip = xfs_efi_init(mp, efi_formatp->efi_nextents);
2715 if ((error = xfs_efi_copy_format(&(item->ri_buf[0]), 2565 if ((error = xfs_efi_copy_format(&(item->ri_buf[0]),
2716 &(efip->efi_format)))) { 2566 &(efip->efi_format)))) {
2717 xfs_efi_item_free(efip); 2567 xfs_efi_item_free(efip);
2718 return error; 2568 return error;
2719 } 2569 }
2720 efip->efi_next_extent = efi_formatp->efi_nextents; 2570 atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents);
2721 efip->efi_flags |= XFS_EFI_COMMITTED;
2722 2571
2723 spin_lock(&log->l_ailp->xa_lock); 2572 spin_lock(&log->l_ailp->xa_lock);
2724 /* 2573 /*
2725 * xfs_trans_ail_update() drops the AIL lock. 2574 * xfs_trans_ail_update() drops the AIL lock.
2726 */ 2575 */
2727 xfs_trans_ail_update(log->l_ailp, (xfs_log_item_t *)efip, lsn); 2576 xfs_trans_ail_update(log->l_ailp, &efip->efi_item, lsn);
2728 return 0; 2577 return 0;
2729} 2578}
2730 2579
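[editor's note] In the EFI hunk above, the pass argument and the old XFS_EFI_COMMITTED flag disappear, the remaining-extent count becomes an atomic_t, and the item is inserted into the AIL via its embedded log item. A condensed reading of the new xlog_recover_efi_pass2() body, reassembled from the right-hand column (context such as mp = log->l_mp is assumed):

	xfs_efi_log_format_t	*efi_formatp = item->ri_buf[0].i_addr;
	xfs_efi_log_item_t	*efip;
	int			error;

	efip = xfs_efi_init(mp, efi_formatp->efi_nextents);
	error = xfs_efi_copy_format(&item->ri_buf[0], &efip->efi_format);
	if (error) {
		xfs_efi_item_free(efip);
		return error;
	}
	/* remaining extent count is now an atomic_t, not a plain integer */
	atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents);

	/* queue the intent in the AIL; xfs_trans_ail_update() drops xa_lock */
	spin_lock(&log->l_ailp->xa_lock);
	xfs_trans_ail_update(log->l_ailp, &efip->efi_item, lsn);

The matching xlog_recover_efd_pass2() in the next hunk then walks the AIL with a cursor and removes the EFI whose efi_id matches the EFD, so only unmatched intents survive to xlog_recover_process_efis().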
@@ -2737,11 +2586,10 @@ xlog_recover_do_efi_trans(
2737 * efd format structure. If we find it, we remove the efi from the 2586 * efd format structure. If we find it, we remove the efi from the
2738 * AIL and free it. 2587 * AIL and free it.
2739 */ 2588 */
2740STATIC void 2589STATIC int
2741xlog_recover_do_efd_trans( 2590xlog_recover_efd_pass2(
2742 xlog_t *log, 2591 xlog_t *log,
2743 xlog_recover_item_t *item, 2592 xlog_recover_item_t *item)
2744 int pass)
2745{ 2593{
2746 xfs_efd_log_format_t *efd_formatp; 2594 xfs_efd_log_format_t *efd_formatp;
2747 xfs_efi_log_item_t *efip = NULL; 2595 xfs_efi_log_item_t *efip = NULL;
@@ -2750,10 +2598,6 @@ xlog_recover_do_efd_trans(
2750 struct xfs_ail_cursor cur; 2598 struct xfs_ail_cursor cur;
2751 struct xfs_ail *ailp = log->l_ailp; 2599 struct xfs_ail *ailp = log->l_ailp;
2752 2600
2753 if (pass == XLOG_RECOVER_PASS1) {
2754 return;
2755 }
2756
2757 efd_formatp = item->ri_buf[0].i_addr; 2601 efd_formatp = item->ri_buf[0].i_addr;
2758 ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) + 2602 ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) +
2759 ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) || 2603 ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) ||
@@ -2785,62 +2629,6 @@ xlog_recover_do_efd_trans(
2785 } 2629 }
2786 xfs_trans_ail_cursor_done(ailp, &cur); 2630 xfs_trans_ail_cursor_done(ailp, &cur);
2787 spin_unlock(&ailp->xa_lock); 2631 spin_unlock(&ailp->xa_lock);
2788}
2789
2790/*
2791 * Perform the transaction
2792 *
2793 * If the transaction modifies a buffer or inode, do it now. Otherwise,
2794 * EFIs and EFDs get queued up by adding entries into the AIL for them.
2795 */
2796STATIC int
2797xlog_recover_do_trans(
2798 xlog_t *log,
2799 xlog_recover_t *trans,
2800 int pass)
2801{
2802 int error = 0;
2803 xlog_recover_item_t *item;
2804
2805 error = xlog_recover_reorder_trans(log, trans, pass);
2806 if (error)
2807 return error;
2808
2809 list_for_each_entry(item, &trans->r_itemq, ri_list) {
2810 trace_xfs_log_recover_item_recover(log, trans, item, pass);
2811 switch (ITEM_TYPE(item)) {
2812 case XFS_LI_BUF:
2813 error = xlog_recover_do_buffer_trans(log, item, pass);
2814 break;
2815 case XFS_LI_INODE:
2816 error = xlog_recover_do_inode_trans(log, item, pass);
2817 break;
2818 case XFS_LI_EFI:
2819 error = xlog_recover_do_efi_trans(log, item,
2820 trans->r_lsn, pass);
2821 break;
2822 case XFS_LI_EFD:
2823 xlog_recover_do_efd_trans(log, item, pass);
2824 error = 0;
2825 break;
2826 case XFS_LI_DQUOT:
2827 error = xlog_recover_do_dquot_trans(log, item, pass);
2828 break;
2829 case XFS_LI_QUOTAOFF:
2830 error = xlog_recover_do_quotaoff_trans(log, item,
2831 pass);
2832 break;
2833 default:
2834 xlog_warn(
2835 "XFS: invalid item type (%d) xlog_recover_do_trans", ITEM_TYPE(item));
2836 ASSERT(0);
2837 error = XFS_ERROR(EIO);
2838 break;
2839 }
2840
2841 if (error)
2842 return error;
2843 }
2844 2632
2845 return 0; 2633 return 0;
2846} 2634}
@@ -2852,7 +2640,7 @@ xlog_recover_do_trans(
2852 */ 2640 */
2853STATIC void 2641STATIC void
2854xlog_recover_free_trans( 2642xlog_recover_free_trans(
2855 xlog_recover_t *trans) 2643 struct xlog_recover *trans)
2856{ 2644{
2857 xlog_recover_item_t *item, *n; 2645 xlog_recover_item_t *item, *n;
2858 int i; 2646 int i;
@@ -2871,26 +2659,103 @@ xlog_recover_free_trans(
2871} 2659}
2872 2660
2873STATIC int 2661STATIC int
2662xlog_recover_commit_pass1(
2663 struct log *log,
2664 struct xlog_recover *trans,
2665 xlog_recover_item_t *item)
2666{
2667 trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS1);
2668
2669 switch (ITEM_TYPE(item)) {
2670 case XFS_LI_BUF:
2671 return xlog_recover_buffer_pass1(log, item);
2672 case XFS_LI_QUOTAOFF:
2673 return xlog_recover_quotaoff_pass1(log, item);
2674 case XFS_LI_INODE:
2675 case XFS_LI_EFI:
2676 case XFS_LI_EFD:
2677 case XFS_LI_DQUOT:
2678 /* nothing to do in pass 1 */
2679 return 0;
2680 default:
2681 xfs_warn(log->l_mp, "%s: invalid item type (%d)",
2682 __func__, ITEM_TYPE(item));
2683 ASSERT(0);
2684 return XFS_ERROR(EIO);
2685 }
2686}
2687
2688STATIC int
2689xlog_recover_commit_pass2(
2690 struct log *log,
2691 struct xlog_recover *trans,
2692 xlog_recover_item_t *item)
2693{
2694 trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS2);
2695
2696 switch (ITEM_TYPE(item)) {
2697 case XFS_LI_BUF:
2698 return xlog_recover_buffer_pass2(log, item);
2699 case XFS_LI_INODE:
2700 return xlog_recover_inode_pass2(log, item);
2701 case XFS_LI_EFI:
2702 return xlog_recover_efi_pass2(log, item, trans->r_lsn);
2703 case XFS_LI_EFD:
2704 return xlog_recover_efd_pass2(log, item);
2705 case XFS_LI_DQUOT:
2706 return xlog_recover_dquot_pass2(log, item);
2707 case XFS_LI_QUOTAOFF:
2708 /* nothing to do in pass2 */
2709 return 0;
2710 default:
2711 xfs_warn(log->l_mp, "%s: invalid item type (%d)",
2712 __func__, ITEM_TYPE(item));
2713 ASSERT(0);
2714 return XFS_ERROR(EIO);
2715 }
2716}
2717
2718/*
2719 * Perform the transaction.
2720 *
2721 * If the transaction modifies a buffer or inode, do it now. Otherwise,
2722 * EFIs and EFDs get queued up by adding entries into the AIL for them.
2723 */
2724STATIC int
2874xlog_recover_commit_trans( 2725xlog_recover_commit_trans(
2875 xlog_t *log, 2726 struct log *log,
2876 xlog_recover_t *trans, 2727 struct xlog_recover *trans,
2877 int pass) 2728 int pass)
2878{ 2729{
2879 int error; 2730 int error = 0;
2731 xlog_recover_item_t *item;
2880 2732
2881 hlist_del(&trans->r_list); 2733 hlist_del(&trans->r_list);
2882 if ((error = xlog_recover_do_trans(log, trans, pass))) 2734
2735 error = xlog_recover_reorder_trans(log, trans, pass);
2736 if (error)
2883 return error; 2737 return error;
2884 xlog_recover_free_trans(trans); /* no error */ 2738
2739 list_for_each_entry(item, &trans->r_itemq, ri_list) {
2740 if (pass == XLOG_RECOVER_PASS1)
2741 error = xlog_recover_commit_pass1(log, trans, item);
2742 else
2743 error = xlog_recover_commit_pass2(log, trans, item);
2744 if (error)
2745 return error;
2746 }
2747
2748 xlog_recover_free_trans(trans);
2885 return 0; 2749 return 0;
2886} 2750}
2887 2751
2888STATIC int 2752STATIC int
2889xlog_recover_unmount_trans( 2753xlog_recover_unmount_trans(
2754 struct log *log,
2890 xlog_recover_t *trans) 2755 xlog_recover_t *trans)
2891{ 2756{
2892 /* Do nothing now */ 2757 /* Do nothing now */
2893 xlog_warn("XFS: xlog_recover_unmount_trans: Unmount LR"); 2758 xfs_warn(log->l_mp, "%s: Unmount LR", __func__);
2894 return 0; 2759 return 0;
2895} 2760}
2896 2761
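[editor's note] The structural change in the hunk above: instead of every item handler taking a pass argument and returning early for the pass it does not care about, xlog_recover_commit_trans() now dispatches each item to a pass-specific helper. Pass 1 only records the state needed to filter pass-2 replay; everything else waits. A condensed sketch of the pass-1 dispatch, paraphrasing xlog_recover_commit_pass1() as introduced here:

	/* pass 1 only records state needed to filter pass 2 replay:
	 *  - cancelled buffer log items go into l_buf_cancel_table
	 *  - quotaoff items note which quota types must not be replayed
	 * inode, EFI, EFD and dquot items are untouched until pass 2 */
	switch (ITEM_TYPE(item)) {
	case XFS_LI_BUF:
		return xlog_recover_buffer_pass1(log, item);
	case XFS_LI_QUOTAOFF:
		return xlog_recover_quotaoff_pass1(log, item);
	default:
		return 0;
	}

xlog_recover_commit_pass2() is the mirror image: it replays buffers, inodes, EFIs, EFDs and dquots, and treats quotaoff as a no-op.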
@@ -2933,8 +2798,8 @@ xlog_recover_process_data(
2933 dp += sizeof(xlog_op_header_t); 2798 dp += sizeof(xlog_op_header_t);
2934 if (ohead->oh_clientid != XFS_TRANSACTION && 2799 if (ohead->oh_clientid != XFS_TRANSACTION &&
2935 ohead->oh_clientid != XFS_LOG) { 2800 ohead->oh_clientid != XFS_LOG) {
2936 xlog_warn( 2801 xfs_warn(log->l_mp, "%s: bad clientid 0x%x",
2937 "XFS: xlog_recover_process_data: bad clientid"); 2802 __func__, ohead->oh_clientid);
2938 ASSERT(0); 2803 ASSERT(0);
2939 return (XFS_ERROR(EIO)); 2804 return (XFS_ERROR(EIO));
2940 } 2805 }
@@ -2947,8 +2812,8 @@ xlog_recover_process_data(
2947 be64_to_cpu(rhead->h_lsn)); 2812 be64_to_cpu(rhead->h_lsn));
2948 } else { 2813 } else {
2949 if (dp + be32_to_cpu(ohead->oh_len) > lp) { 2814 if (dp + be32_to_cpu(ohead->oh_len) > lp) {
2950 xlog_warn( 2815 xfs_warn(log->l_mp, "%s: bad length 0x%x",
2951 "XFS: xlog_recover_process_data: bad length"); 2816 __func__, be32_to_cpu(ohead->oh_len));
2952 WARN_ON(1); 2817 WARN_ON(1);
2953 return (XFS_ERROR(EIO)); 2818 return (XFS_ERROR(EIO));
2954 } 2819 }
@@ -2961,7 +2826,7 @@ xlog_recover_process_data(
2961 trans, pass); 2826 trans, pass);
2962 break; 2827 break;
2963 case XLOG_UNMOUNT_TRANS: 2828 case XLOG_UNMOUNT_TRANS:
2964 error = xlog_recover_unmount_trans(trans); 2829 error = xlog_recover_unmount_trans(log, trans);
2965 break; 2830 break;
2966 case XLOG_WAS_CONT_TRANS: 2831 case XLOG_WAS_CONT_TRANS:
2967 error = xlog_recover_add_to_cont_trans(log, 2832 error = xlog_recover_add_to_cont_trans(log,
@@ -2969,8 +2834,8 @@ xlog_recover_process_data(
2969 be32_to_cpu(ohead->oh_len)); 2834 be32_to_cpu(ohead->oh_len));
2970 break; 2835 break;
2971 case XLOG_START_TRANS: 2836 case XLOG_START_TRANS:
2972 xlog_warn( 2837 xfs_warn(log->l_mp, "%s: bad transaction",
2973 "XFS: xlog_recover_process_data: bad transaction"); 2838 __func__);
2974 ASSERT(0); 2839 ASSERT(0);
2975 error = XFS_ERROR(EIO); 2840 error = XFS_ERROR(EIO);
2976 break; 2841 break;
@@ -2980,8 +2845,8 @@ xlog_recover_process_data(
2980 dp, be32_to_cpu(ohead->oh_len)); 2845 dp, be32_to_cpu(ohead->oh_len));
2981 break; 2846 break;
2982 default: 2847 default:
2983 xlog_warn( 2848 xfs_warn(log->l_mp, "%s: bad flag 0x%x",
2984 "XFS: xlog_recover_process_data: bad flag"); 2849 __func__, flags);
2985 ASSERT(0); 2850 ASSERT(0);
2986 error = XFS_ERROR(EIO); 2851 error = XFS_ERROR(EIO);
2987 break; 2852 break;
@@ -3011,7 +2876,7 @@ xlog_recover_process_efi(
3011 xfs_extent_t *extp; 2876 xfs_extent_t *extp;
3012 xfs_fsblock_t startblock_fsb; 2877 xfs_fsblock_t startblock_fsb;
3013 2878
3014 ASSERT(!(efip->efi_flags & XFS_EFI_RECOVERED)); 2879 ASSERT(!test_bit(XFS_EFI_RECOVERED, &efip->efi_flags));
3015 2880
3016 /* 2881 /*
3017 * First check the validity of the extents described by the 2882 * First check the validity of the extents described by the
@@ -3050,7 +2915,7 @@ xlog_recover_process_efi(
3050 extp->ext_len); 2915 extp->ext_len);
3051 } 2916 }
3052 2917
3053 efip->efi_flags |= XFS_EFI_RECOVERED; 2918 set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
3054 error = xfs_trans_commit(tp, 0); 2919 error = xfs_trans_commit(tp, 0);
3055 return error; 2920 return error;
3056 2921
@@ -3107,7 +2972,7 @@ xlog_recover_process_efis(
3107 * Skip EFIs that we've already processed. 2972 * Skip EFIs that we've already processed.
3108 */ 2973 */
3109 efip = (xfs_efi_log_item_t *)lip; 2974 efip = (xfs_efi_log_item_t *)lip;
3110 if (efip->efi_flags & XFS_EFI_RECOVERED) { 2975 if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) {
3111 lip = xfs_trans_ail_cursor_next(ailp, &cur); 2976 lip = xfs_trans_ail_cursor_next(ailp, &cur);
3112 continue; 2977 continue;
3113 } 2978 }
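[editor's note] XFS_EFI_RECOVERED moves from a plain bitmask OR/test on efi_flags to set_bit()/test_bit(), i.e. efi_flags is now treated as an unsigned long bit field updated with atomic bitops rather than under an external lock. A generic sketch of the pattern; the structure, bit number and function names below are illustrative, not taken from xfs_extfree_item.h:

	#include <linux/bitops.h>

	#define MY_ITEM_RECOVERED	1	/* a bit number, not a mask */

	struct my_item {
		unsigned long	flags;
	};

	static void mark_recovered(struct my_item *ip)
	{
		/* atomic read-modify-write, safe against concurrent updaters */
		set_bit(MY_ITEM_RECOVERED, &ip->flags);
	}

	static int is_recovered(struct my_item *ip)
	{
		return test_bit(MY_ITEM_RECOVERED, &ip->flags);
	}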
@@ -3166,8 +3031,7 @@ xlog_recover_clear_agi_bucket(
3166out_abort: 3031out_abort:
3167 xfs_trans_cancel(tp, XFS_TRANS_ABORT); 3032 xfs_trans_cancel(tp, XFS_TRANS_ABORT);
3168out_error: 3033out_error:
3169 xfs_fs_cmn_err(CE_WARN, mp, "xlog_recover_clear_agi_bucket: " 3034 xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__, agno);
3170 "failed to clear agi %d. Continuing.", agno);
3171 return; 3035 return;
3172} 3036}
3173 3037
@@ -3418,7 +3282,7 @@ xlog_valid_rec_header(
3418 if (unlikely( 3282 if (unlikely(
3419 (!rhead->h_version || 3283 (!rhead->h_version ||
3420 (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) { 3284 (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) {
3421 xlog_warn("XFS: %s: unrecognised log version (%d).", 3285 xfs_warn(log->l_mp, "%s: unrecognised log version (%d).",
3422 __func__, be32_to_cpu(rhead->h_version)); 3286 __func__, be32_to_cpu(rhead->h_version));
3423 return XFS_ERROR(EIO); 3287 return XFS_ERROR(EIO);
3424 } 3288 }
@@ -3724,7 +3588,7 @@ xlog_do_log_recovery(
3724 xfs_daddr_t head_blk, 3588 xfs_daddr_t head_blk,
3725 xfs_daddr_t tail_blk) 3589 xfs_daddr_t tail_blk)
3726{ 3590{
3727 int error; 3591 int error, i;
3728 3592
3729 ASSERT(head_blk != tail_blk); 3593 ASSERT(head_blk != tail_blk);
3730 3594
@@ -3732,10 +3596,12 @@ xlog_do_log_recovery(
3732 * First do a pass to find all of the cancelled buf log items. 3596 * First do a pass to find all of the cancelled buf log items.
3733 * Store them in the buf_cancel_table for use in the second pass. 3597 * Store them in the buf_cancel_table for use in the second pass.
3734 */ 3598 */
3735 log->l_buf_cancel_table = 3599 log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE *
3736 (xfs_buf_cancel_t **)kmem_zalloc(XLOG_BC_TABLE_SIZE * 3600 sizeof(struct list_head),
3737 sizeof(xfs_buf_cancel_t*),
3738 KM_SLEEP); 3601 KM_SLEEP);
3602 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
3603 INIT_LIST_HEAD(&log->l_buf_cancel_table[i]);
3604
3739 error = xlog_do_recovery_pass(log, head_blk, tail_blk, 3605 error = xlog_do_recovery_pass(log, head_blk, tail_blk,
3740 XLOG_RECOVER_PASS1); 3606 XLOG_RECOVER_PASS1);
3741 if (error != 0) { 3607 if (error != 0) {
@@ -3754,7 +3620,7 @@ xlog_do_log_recovery(
3754 int i; 3620 int i;
3755 3621
3756 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) 3622 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
3757 ASSERT(log->l_buf_cancel_table[i] == NULL); 3623 ASSERT(list_empty(&log->l_buf_cancel_table[i]));
3758 } 3624 }
3759#endif /* DEBUG */ 3625#endif /* DEBUG */
3760 3626
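[editor's note] l_buf_cancel_table changes from an array of xfs_buf_cancel_t pointers (hand-chained entries) to an array of struct list_head buckets, which is why the DEBUG check becomes list_empty() instead of a NULL test. A rough sketch of how such a table is typically used; the entry layout and the modulo bucket selection below are assumptions for illustration, the real definitions live in xfs_log_recover.c and xfs_log_priv.h outside this hunk:

	#include <linux/list.h>

	struct xfs_buf_cancel {			/* illustrative layout only */
		xfs_daddr_t		bc_blkno;
		uint			bc_len;
		int			bc_refcount;
		struct list_head	bc_list;
	};

	/* allocation + init, as in xlog_do_log_recovery() above */
	log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE *
					      sizeof(struct list_head), KM_SLEEP);
	for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
		INIT_LIST_HEAD(&log->l_buf_cancel_table[i]);

	/* lookup then becomes a plain list walk over one bucket */
	struct list_head	*bucket;
	struct xfs_buf_cancel	*bcp;

	bucket = &log->l_buf_cancel_table[blkno % XLOG_BC_TABLE_SIZE];
	list_for_each_entry(bcp, bucket, bc_list)
		if (bcp->bc_blkno == blkno && bcp->bc_len == len)
			return bcp;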
@@ -3874,10 +3740,9 @@ xlog_recover(
3874 return error; 3740 return error;
3875 } 3741 }
3876 3742
3877 cmn_err(CE_NOTE, 3743 xfs_notice(log->l_mp, "Starting recovery (logdev: %s)",
3878 "Starting XFS recovery on filesystem: %s (logdev: %s)", 3744 log->l_mp->m_logname ? log->l_mp->m_logname
3879 log->l_mp->m_fsname, log->l_mp->m_logname ? 3745 : "internal");
3880 log->l_mp->m_logname : "internal");
3881 3746
3882 error = xlog_do_recover(log, head_blk, tail_blk); 3747 error = xlog_do_recover(log, head_blk, tail_blk);
3883 log->l_flags |= XLOG_RECOVERY_NEEDED; 3748 log->l_flags |= XLOG_RECOVERY_NEEDED;
@@ -3910,9 +3775,7 @@ xlog_recover_finish(
3910 int error; 3775 int error;
3911 error = xlog_recover_process_efis(log); 3776 error = xlog_recover_process_efis(log);
3912 if (error) { 3777 if (error) {
3913 cmn_err(CE_ALERT, 3778 xfs_alert(log->l_mp, "Failed to recover EFIs");
3914 "Failed to recover EFIs on filesystem: %s",
3915 log->l_mp->m_fsname);
3916 return error; 3779 return error;
3917 } 3780 }
3918 /* 3781 /*
@@ -3927,15 +3790,12 @@ xlog_recover_finish(
3927 3790
3928 xlog_recover_check_summary(log); 3791 xlog_recover_check_summary(log);
3929 3792
3930 cmn_err(CE_NOTE, 3793 xfs_notice(log->l_mp, "Ending recovery (logdev: %s)",
3931 "Ending XFS recovery on filesystem: %s (logdev: %s)", 3794 log->l_mp->m_logname ? log->l_mp->m_logname
3932 log->l_mp->m_fsname, log->l_mp->m_logname ? 3795 : "internal");
3933 log->l_mp->m_logname : "internal");
3934 log->l_flags &= ~XLOG_RECOVERY_NEEDED; 3796 log->l_flags &= ~XLOG_RECOVERY_NEEDED;
3935 } else { 3797 } else {
3936 cmn_err(CE_DEBUG, 3798 xfs_info(log->l_mp, "Ending clean mount");
3937 "!Ending clean XFS mount for filesystem: %s\n",
3938 log->l_mp->m_fsname);
3939 } 3799 }
3940 return 0; 3800 return 0;
3941} 3801}
@@ -3968,10 +3828,8 @@ xlog_recover_check_summary(
3968 for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { 3828 for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
3969 error = xfs_read_agf(mp, NULL, agno, 0, &agfbp); 3829 error = xfs_read_agf(mp, NULL, agno, 0, &agfbp);
3970 if (error) { 3830 if (error) {
3971 xfs_fs_cmn_err(CE_ALERT, mp, 3831 xfs_alert(mp, "%s agf read failed agno %d error %d",
3972 "xlog_recover_check_summary(agf)" 3832 __func__, agno, error);
3973 "agf read failed agno %d error %d",
3974 agno, error);
3975 } else { 3833 } else {
3976 agfp = XFS_BUF_TO_AGF(agfbp); 3834 agfp = XFS_BUF_TO_AGF(agfbp);
3977 freeblks += be32_to_cpu(agfp->agf_freeblks) + 3835 freeblks += be32_to_cpu(agfp->agf_freeblks) +
@@ -3980,7 +3838,10 @@ xlog_recover_check_summary(
3980 } 3838 }
3981 3839
3982 error = xfs_read_agi(mp, NULL, agno, &agibp); 3840 error = xfs_read_agi(mp, NULL, agno, &agibp);
3983 if (!error) { 3841 if (error) {
3842 xfs_alert(mp, "%s agi read failed agno %d error %d",
3843 __func__, agno, error);
3844 } else {
3984 struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp); 3845 struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp);
3985 3846
3986 itotal += be32_to_cpu(agi->agi_count); 3847 itotal += be32_to_cpu(agi->agi_count);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 19e9dfa1c254..bb3f9a7b24ed 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -133,9 +133,7 @@ xfs_uuid_mount(
133 return 0; 133 return 0;
134 134
135 if (uuid_is_nil(uuid)) { 135 if (uuid_is_nil(uuid)) {
136 cmn_err(CE_WARN, 136 xfs_warn(mp, "Filesystem has nil UUID - can't mount");
137 "XFS: Filesystem %s has nil UUID - can't mount",
138 mp->m_fsname);
139 return XFS_ERROR(EINVAL); 137 return XFS_ERROR(EINVAL);
140 } 138 }
141 139
@@ -163,8 +161,7 @@ xfs_uuid_mount(
163 161
164 out_duplicate: 162 out_duplicate:
165 mutex_unlock(&xfs_uuid_table_mutex); 163 mutex_unlock(&xfs_uuid_table_mutex);
166 cmn_err(CE_WARN, "XFS: Filesystem %s has duplicate UUID - can't mount", 164 xfs_warn(mp, "Filesystem has duplicate UUID - can't mount");
167 mp->m_fsname);
168 return XFS_ERROR(EINVAL); 165 return XFS_ERROR(EINVAL);
169} 166}
170 167
@@ -311,6 +308,8 @@ xfs_mount_validate_sb(
311 xfs_sb_t *sbp, 308 xfs_sb_t *sbp,
312 int flags) 309 int flags)
313{ 310{
311 int loud = !(flags & XFS_MFSI_QUIET);
312
314 /* 313 /*
315 * If the log device and data device have the 314 * If the log device and data device have the
316 * same device number, the log is internal. 315 * same device number, the log is internal.
@@ -319,28 +318,32 @@ xfs_mount_validate_sb(
319 * a volume filesystem in a non-volume manner. 318 * a volume filesystem in a non-volume manner.
320 */ 319 */
321 if (sbp->sb_magicnum != XFS_SB_MAGIC) { 320 if (sbp->sb_magicnum != XFS_SB_MAGIC) {
322 xfs_fs_mount_cmn_err(flags, "bad magic number"); 321 if (loud)
322 xfs_warn(mp, "bad magic number");
323 return XFS_ERROR(EWRONGFS); 323 return XFS_ERROR(EWRONGFS);
324 } 324 }
325 325
326 if (!xfs_sb_good_version(sbp)) { 326 if (!xfs_sb_good_version(sbp)) {
327 xfs_fs_mount_cmn_err(flags, "bad version"); 327 if (loud)
328 xfs_warn(mp, "bad version");
328 return XFS_ERROR(EWRONGFS); 329 return XFS_ERROR(EWRONGFS);
329 } 330 }
330 331
331 if (unlikely( 332 if (unlikely(
332 sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) { 333 sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) {
333 xfs_fs_mount_cmn_err(flags, 334 if (loud)
334 "filesystem is marked as having an external log; " 335 xfs_warn(mp,
335 "specify logdev on the\nmount command line."); 336 "filesystem is marked as having an external log; "
337 "specify logdev on the mount command line.");
336 return XFS_ERROR(EINVAL); 338 return XFS_ERROR(EINVAL);
337 } 339 }
338 340
339 if (unlikely( 341 if (unlikely(
340 sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) { 342 sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) {
341 xfs_fs_mount_cmn_err(flags, 343 if (loud)
342 "filesystem is marked as having an internal log; " 344 xfs_warn(mp,
343 "do not specify logdev on\nthe mount command line."); 345 "filesystem is marked as having an internal log; "
346 "do not specify logdev on the mount command line.");
344 return XFS_ERROR(EINVAL); 347 return XFS_ERROR(EINVAL);
345 } 348 }
346 349
@@ -369,7 +372,8 @@ xfs_mount_validate_sb(
369 (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) || 372 (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) ||
370 (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) || 373 (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) ||
371 (sbp->sb_imax_pct > 100 /* zero sb_imax_pct is valid */))) { 374 (sbp->sb_imax_pct > 100 /* zero sb_imax_pct is valid */))) {
372 xfs_fs_mount_cmn_err(flags, "SB sanity check 1 failed"); 375 if (loud)
376 xfs_warn(mp, "SB sanity check 1 failed");
373 return XFS_ERROR(EFSCORRUPTED); 377 return XFS_ERROR(EFSCORRUPTED);
374 } 378 }
375 379
@@ -382,7 +386,8 @@ xfs_mount_validate_sb(
382 (xfs_drfsbno_t)sbp->sb_agcount * sbp->sb_agblocks || 386 (xfs_drfsbno_t)sbp->sb_agcount * sbp->sb_agblocks ||
383 sbp->sb_dblocks < (xfs_drfsbno_t)(sbp->sb_agcount - 1) * 387 sbp->sb_dblocks < (xfs_drfsbno_t)(sbp->sb_agcount - 1) *
384 sbp->sb_agblocks + XFS_MIN_AG_BLOCKS)) { 388 sbp->sb_agblocks + XFS_MIN_AG_BLOCKS)) {
385 xfs_fs_mount_cmn_err(flags, "SB sanity check 2 failed"); 389 if (loud)
390 xfs_warn(mp, "SB sanity check 2 failed");
386 return XFS_ERROR(EFSCORRUPTED); 391 return XFS_ERROR(EFSCORRUPTED);
387 } 392 }
388 393
@@ -390,12 +395,12 @@ xfs_mount_validate_sb(
390 * Until this is fixed only page-sized or smaller data blocks work. 395 * Until this is fixed only page-sized or smaller data blocks work.
391 */ 396 */
392 if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) { 397 if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) {
393 xfs_fs_mount_cmn_err(flags, 398 if (loud) {
394 "file system with blocksize %d bytes", 399 xfs_warn(mp,
395 sbp->sb_blocksize); 400 "File system with blocksize %d bytes. "
396 xfs_fs_mount_cmn_err(flags, 401 "Only pagesize (%ld) or less will currently work.",
397 "only pagesize (%ld) or less will currently work.", 402 sbp->sb_blocksize, PAGE_SIZE);
398 PAGE_SIZE); 403 }
399 return XFS_ERROR(ENOSYS); 404 return XFS_ERROR(ENOSYS);
400 } 405 }
401 406
@@ -409,21 +414,23 @@ xfs_mount_validate_sb(
409 case 2048: 414 case 2048:
410 break; 415 break;
411 default: 416 default:
412 xfs_fs_mount_cmn_err(flags, 417 if (loud)
413 "inode size of %d bytes not supported", 418 xfs_warn(mp, "inode size of %d bytes not supported",
414 sbp->sb_inodesize); 419 sbp->sb_inodesize);
415 return XFS_ERROR(ENOSYS); 420 return XFS_ERROR(ENOSYS);
416 } 421 }
417 422
418 if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) || 423 if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) ||
419 xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) { 424 xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) {
420 xfs_fs_mount_cmn_err(flags, 425 if (loud)
421 "file system too large to be mounted on this system."); 426 xfs_warn(mp,
427 "file system too large to be mounted on this system.");
422 return XFS_ERROR(EFBIG); 428 return XFS_ERROR(EFBIG);
423 } 429 }
424 430
425 if (unlikely(sbp->sb_inprogress)) { 431 if (unlikely(sbp->sb_inprogress)) {
426 xfs_fs_mount_cmn_err(flags, "file system busy"); 432 if (loud)
433 xfs_warn(mp, "file system busy");
427 return XFS_ERROR(EFSCORRUPTED); 434 return XFS_ERROR(EFSCORRUPTED);
428 } 435 }
429 436
@@ -431,8 +438,9 @@ xfs_mount_validate_sb(
431 * Version 1 directory format has never worked on Linux. 438 * Version 1 directory format has never worked on Linux.
432 */ 439 */
433 if (unlikely(!xfs_sb_version_hasdirv2(sbp))) { 440 if (unlikely(!xfs_sb_version_hasdirv2(sbp))) {
434 xfs_fs_mount_cmn_err(flags, 441 if (loud)
435 "file system using version 1 directory format"); 442 xfs_warn(mp,
443 "file system using version 1 directory format");
436 return XFS_ERROR(ENOSYS); 444 return XFS_ERROR(ENOSYS);
437 } 445 }
438 446
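[editor's note] xfs_mount_validate_sb() and xfs_readsb() now compute loud = !(flags & XFS_MFSI_QUIET) once and guard every diagnostic with it, replacing the old xfs_fs_mount_cmn_err() wrapper. The shape of each check, lifted from the hunk above, is simply:

	int loud = !(flags & XFS_MFSI_QUIET);

	if (sbp->sb_magicnum != XFS_SB_MAGIC) {
		if (loud)
			xfs_warn(mp, "bad magic number");
		return XFS_ERROR(EWRONGFS);
	}

Presumably the quiet path exists so that speculative superblock probes (trying XFS among several candidate filesystems) do not fill the log when the device simply is not XFS.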
@@ -472,7 +480,7 @@ xfs_initialize_perag(
472 goto out_unwind; 480 goto out_unwind;
473 pag->pag_agno = index; 481 pag->pag_agno = index;
474 pag->pag_mount = mp; 482 pag->pag_mount = mp;
475 rwlock_init(&pag->pag_ici_lock); 483 spin_lock_init(&pag->pag_ici_lock);
476 mutex_init(&pag->pag_ici_reclaim_lock); 484 mutex_init(&pag->pag_ici_reclaim_lock);
477 INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); 485 INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
478 spin_lock_init(&pag->pag_buf_lock); 486 spin_lock_init(&pag->pag_buf_lock);
@@ -673,6 +681,7 @@ xfs_readsb(xfs_mount_t *mp, int flags)
673 unsigned int sector_size; 681 unsigned int sector_size;
674 xfs_buf_t *bp; 682 xfs_buf_t *bp;
675 int error; 683 int error;
684 int loud = !(flags & XFS_MFSI_QUIET);
676 685
677 ASSERT(mp->m_sb_bp == NULL); 686 ASSERT(mp->m_sb_bp == NULL);
678 ASSERT(mp->m_ddev_targp != NULL); 687 ASSERT(mp->m_ddev_targp != NULL);
@@ -688,7 +697,8 @@ reread:
688 bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp, 697 bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp,
689 XFS_SB_DADDR, sector_size, 0); 698 XFS_SB_DADDR, sector_size, 0);
690 if (!bp) { 699 if (!bp) {
691 xfs_fs_mount_cmn_err(flags, "SB buffer read failed"); 700 if (loud)
701 xfs_warn(mp, "SB buffer read failed");
692 return EIO; 702 return EIO;
693 } 703 }
694 704
@@ -699,7 +709,8 @@ reread:
699 xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp)); 709 xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp));
700 error = xfs_mount_validate_sb(mp, &(mp->m_sb), flags); 710 error = xfs_mount_validate_sb(mp, &(mp->m_sb), flags);
701 if (error) { 711 if (error) {
702 xfs_fs_mount_cmn_err(flags, "SB validate failed"); 712 if (loud)
713 xfs_warn(mp, "SB validate failed");
703 goto release_buf; 714 goto release_buf;
704 } 715 }
705 716
@@ -707,9 +718,9 @@ reread:
707 * We must be able to do sector-sized and sector-aligned IO. 718 * We must be able to do sector-sized and sector-aligned IO.
708 */ 719 */
709 if (sector_size > mp->m_sb.sb_sectsize) { 720 if (sector_size > mp->m_sb.sb_sectsize) {
710 xfs_fs_mount_cmn_err(flags, 721 if (loud)
711 "device supports only %u byte sectors (not %u)", 722 xfs_warn(mp, "device supports %u byte sectors (not %u)",
712 sector_size, mp->m_sb.sb_sectsize); 723 sector_size, mp->m_sb.sb_sectsize);
713 error = ENOSYS; 724 error = ENOSYS;
714 goto release_buf; 725 goto release_buf;
715 } 726 }
@@ -853,8 +864,7 @@ xfs_update_alignment(xfs_mount_t *mp)
853 if ((BBTOB(mp->m_dalign) & mp->m_blockmask) || 864 if ((BBTOB(mp->m_dalign) & mp->m_blockmask) ||
854 (BBTOB(mp->m_swidth) & mp->m_blockmask)) { 865 (BBTOB(mp->m_swidth) & mp->m_blockmask)) {
855 if (mp->m_flags & XFS_MOUNT_RETERR) { 866 if (mp->m_flags & XFS_MOUNT_RETERR) {
856 cmn_err(CE_WARN, 867 xfs_warn(mp, "alignment check 1 failed");
857 "XFS: alignment check 1 failed");
858 return XFS_ERROR(EINVAL); 868 return XFS_ERROR(EINVAL);
859 } 869 }
860 mp->m_dalign = mp->m_swidth = 0; 870 mp->m_dalign = mp->m_swidth = 0;
@@ -867,8 +877,9 @@ xfs_update_alignment(xfs_mount_t *mp)
867 if (mp->m_flags & XFS_MOUNT_RETERR) { 877 if (mp->m_flags & XFS_MOUNT_RETERR) {
868 return XFS_ERROR(EINVAL); 878 return XFS_ERROR(EINVAL);
869 } 879 }
870 xfs_fs_cmn_err(CE_WARN, mp, 880 xfs_warn(mp,
871"stripe alignment turned off: sunit(%d)/swidth(%d) incompatible with agsize(%d)", 881 "stripe alignment turned off: sunit(%d)/swidth(%d) "
882 "incompatible with agsize(%d)",
872 mp->m_dalign, mp->m_swidth, 883 mp->m_dalign, mp->m_swidth,
873 sbp->sb_agblocks); 884 sbp->sb_agblocks);
874 885
@@ -878,9 +889,9 @@ xfs_update_alignment(xfs_mount_t *mp)
878 mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth); 889 mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth);
879 } else { 890 } else {
880 if (mp->m_flags & XFS_MOUNT_RETERR) { 891 if (mp->m_flags & XFS_MOUNT_RETERR) {
881 xfs_fs_cmn_err(CE_WARN, mp, 892 xfs_warn(mp,
882"stripe alignment turned off: sunit(%d) less than bsize(%d)", 893 "stripe alignment turned off: sunit(%d) less than bsize(%d)",
883 mp->m_dalign, 894 mp->m_dalign,
884 mp->m_blockmask +1); 895 mp->m_blockmask +1);
885 return XFS_ERROR(EINVAL); 896 return XFS_ERROR(EINVAL);
886 } 897 }
@@ -975,6 +986,24 @@ xfs_set_rw_sizes(xfs_mount_t *mp)
975} 986}
976 987
977/* 988/*
989 * precalculate the low space thresholds for dynamic speculative preallocation.
990 */
991void
992xfs_set_low_space_thresholds(
993 struct xfs_mount *mp)
994{
995 int i;
996
997 for (i = 0; i < XFS_LOWSP_MAX; i++) {
998 __uint64_t space = mp->m_sb.sb_dblocks;
999
1000 do_div(space, 100);
1001 mp->m_low_space[i] = space * (i + 1);
1002 }
1003}
1004
1005
1006/*
978 * Set whether we're using inode alignment. 1007 * Set whether we're using inode alignment.
979 */ 1008 */
980STATIC void 1009STATIC void
@@ -1008,14 +1037,14 @@ xfs_check_sizes(xfs_mount_t *mp)
1008 1037
1009 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); 1038 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
1010 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) { 1039 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) {
1011 cmn_err(CE_WARN, "XFS: filesystem size mismatch detected"); 1040 xfs_warn(mp, "filesystem size mismatch detected");
1012 return XFS_ERROR(EFBIG); 1041 return XFS_ERROR(EFBIG);
1013 } 1042 }
1014 bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp, 1043 bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp,
1015 d - XFS_FSS_TO_BB(mp, 1), 1044 d - XFS_FSS_TO_BB(mp, 1),
1016 BBTOB(XFS_FSS_TO_BB(mp, 1)), 0); 1045 BBTOB(XFS_FSS_TO_BB(mp, 1)), 0);
1017 if (!bp) { 1046 if (!bp) {
1018 cmn_err(CE_WARN, "XFS: last sector read failed"); 1047 xfs_warn(mp, "last sector read failed");
1019 return EIO; 1048 return EIO;
1020 } 1049 }
1021 xfs_buf_relse(bp); 1050 xfs_buf_relse(bp);
@@ -1023,14 +1052,14 @@ xfs_check_sizes(xfs_mount_t *mp)
1023 if (mp->m_logdev_targp != mp->m_ddev_targp) { 1052 if (mp->m_logdev_targp != mp->m_ddev_targp) {
1024 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks); 1053 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
1025 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) { 1054 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) {
1026 cmn_err(CE_WARN, "XFS: log size mismatch detected"); 1055 xfs_warn(mp, "log size mismatch detected");
1027 return XFS_ERROR(EFBIG); 1056 return XFS_ERROR(EFBIG);
1028 } 1057 }
1029 bp = xfs_buf_read_uncached(mp, mp->m_logdev_targp, 1058 bp = xfs_buf_read_uncached(mp, mp->m_logdev_targp,
1030 d - XFS_FSB_TO_BB(mp, 1), 1059 d - XFS_FSB_TO_BB(mp, 1),
1031 XFS_FSB_TO_B(mp, 1), 0); 1060 XFS_FSB_TO_B(mp, 1), 0);
1032 if (!bp) { 1061 if (!bp) {
1033 cmn_err(CE_WARN, "XFS: log device read failed"); 1062 xfs_warn(mp, "log device read failed");
1034 return EIO; 1063 return EIO;
1035 } 1064 }
1036 xfs_buf_relse(bp); 1065 xfs_buf_relse(bp);
@@ -1068,7 +1097,7 @@ xfs_mount_reset_sbqflags(
1068 return 0; 1097 return 0;
1069 1098
1070#ifdef QUOTADEBUG 1099#ifdef QUOTADEBUG
1071 xfs_fs_cmn_err(CE_NOTE, mp, "Writing superblock quota changes"); 1100 xfs_notice(mp, "Writing superblock quota changes");
1072#endif 1101#endif
1073 1102
1074 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE); 1103 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE);
@@ -1076,8 +1105,7 @@ xfs_mount_reset_sbqflags(
1076 XFS_DEFAULT_LOG_COUNT); 1105 XFS_DEFAULT_LOG_COUNT);
1077 if (error) { 1106 if (error) {
1078 xfs_trans_cancel(tp, 0); 1107 xfs_trans_cancel(tp, 0);
1079 xfs_fs_cmn_err(CE_ALERT, mp, 1108 xfs_alert(mp, "%s: Superblock update failed!", __func__);
1080 "xfs_mount_reset_sbqflags: Superblock update failed!");
1081 return error; 1109 return error;
1082 } 1110 }
1083 1111
@@ -1143,8 +1171,7 @@ xfs_mountfs(
1143 * transaction subsystem is online. 1171 * transaction subsystem is online.
1144 */ 1172 */
1145 if (xfs_sb_has_mismatched_features2(sbp)) { 1173 if (xfs_sb_has_mismatched_features2(sbp)) {
1146 cmn_err(CE_WARN, 1174 xfs_warn(mp, "correcting sb_features alignment problem");
1147 "XFS: correcting sb_features alignment problem");
1148 sbp->sb_features2 |= sbp->sb_bad_features2; 1175 sbp->sb_features2 |= sbp->sb_bad_features2;
1149 sbp->sb_bad_features2 = sbp->sb_features2; 1176 sbp->sb_bad_features2 = sbp->sb_features2;
1150 mp->m_update_flags |= XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2; 1177 mp->m_update_flags |= XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2;
@@ -1196,6 +1223,9 @@ xfs_mountfs(
1196 */ 1223 */
1197 xfs_set_rw_sizes(mp); 1224 xfs_set_rw_sizes(mp);
1198 1225
1226 /* set the low space thresholds for dynamic preallocation */
1227 xfs_set_low_space_thresholds(mp);
1228
1199 /* 1229 /*
1200 * Set the inode cluster size. 1230 * Set the inode cluster size.
1201 * This may still be overridden by the file system 1231 * This may still be overridden by the file system
@@ -1220,7 +1250,7 @@ xfs_mountfs(
1220 */ 1250 */
1221 error = xfs_rtmount_init(mp); 1251 error = xfs_rtmount_init(mp);
1222 if (error) { 1252 if (error) {
1223 cmn_err(CE_WARN, "XFS: RT mount failed"); 1253 xfs_warn(mp, "RT mount failed");
1224 goto out_remove_uuid; 1254 goto out_remove_uuid;
1225 } 1255 }
1226 1256
@@ -1251,12 +1281,12 @@ xfs_mountfs(
1251 INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC); 1281 INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC);
1252 error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi); 1282 error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi);
1253 if (error) { 1283 if (error) {
1254 cmn_err(CE_WARN, "XFS: Failed per-ag init: %d", error); 1284 xfs_warn(mp, "Failed per-ag init: %d", error);
1255 goto out_remove_uuid; 1285 goto out_remove_uuid;
1256 } 1286 }
1257 1287
1258 if (!sbp->sb_logblocks) { 1288 if (!sbp->sb_logblocks) {
1259 cmn_err(CE_WARN, "XFS: no log defined"); 1289 xfs_warn(mp, "no log defined");
1260 XFS_ERROR_REPORT("xfs_mountfs", XFS_ERRLEVEL_LOW, mp); 1290 XFS_ERROR_REPORT("xfs_mountfs", XFS_ERRLEVEL_LOW, mp);
1261 error = XFS_ERROR(EFSCORRUPTED); 1291 error = XFS_ERROR(EFSCORRUPTED);
1262 goto out_free_perag; 1292 goto out_free_perag;
@@ -1269,7 +1299,7 @@ xfs_mountfs(
1269 XFS_FSB_TO_DADDR(mp, sbp->sb_logstart), 1299 XFS_FSB_TO_DADDR(mp, sbp->sb_logstart),
1270 XFS_FSB_TO_BB(mp, sbp->sb_logblocks)); 1300 XFS_FSB_TO_BB(mp, sbp->sb_logblocks));
1271 if (error) { 1301 if (error) {
1272 cmn_err(CE_WARN, "XFS: log mount failed"); 1302 xfs_warn(mp, "log mount failed");
1273 goto out_free_perag; 1303 goto out_free_perag;
1274 } 1304 }
1275 1305
@@ -1306,16 +1336,14 @@ xfs_mountfs(
1306 */ 1336 */
1307 error = xfs_iget(mp, NULL, sbp->sb_rootino, 0, XFS_ILOCK_EXCL, &rip); 1337 error = xfs_iget(mp, NULL, sbp->sb_rootino, 0, XFS_ILOCK_EXCL, &rip);
1308 if (error) { 1338 if (error) {
1309 cmn_err(CE_WARN, "XFS: failed to read root inode"); 1339 xfs_warn(mp, "failed to read root inode");
1310 goto out_log_dealloc; 1340 goto out_log_dealloc;
1311 } 1341 }
1312 1342
1313 ASSERT(rip != NULL); 1343 ASSERT(rip != NULL);
1314 1344
1315 if (unlikely((rip->i_d.di_mode & S_IFMT) != S_IFDIR)) { 1345 if (unlikely((rip->i_d.di_mode & S_IFMT) != S_IFDIR)) {
1316 cmn_err(CE_WARN, "XFS: corrupted root inode"); 1346 xfs_warn(mp, "corrupted root inode %llu: not a directory",
1317 cmn_err(CE_WARN, "Device %s - root %llu is not a directory",
1318 XFS_BUFTARG_NAME(mp->m_ddev_targp),
1319 (unsigned long long)rip->i_ino); 1347 (unsigned long long)rip->i_ino);
1320 xfs_iunlock(rip, XFS_ILOCK_EXCL); 1348 xfs_iunlock(rip, XFS_ILOCK_EXCL);
1321 XFS_ERROR_REPORT("xfs_mountfs_int(2)", XFS_ERRLEVEL_LOW, 1349 XFS_ERROR_REPORT("xfs_mountfs_int(2)", XFS_ERRLEVEL_LOW,
@@ -1335,7 +1363,7 @@ xfs_mountfs(
1335 /* 1363 /*
1336 * Free up the root inode. 1364 * Free up the root inode.
1337 */ 1365 */
1338 cmn_err(CE_WARN, "XFS: failed to read RT inodes"); 1366 xfs_warn(mp, "failed to read RT inodes");
1339 goto out_rele_rip; 1367 goto out_rele_rip;
1340 } 1368 }
1341 1369
@@ -1347,7 +1375,7 @@ xfs_mountfs(
1347 if (mp->m_update_flags && !(mp->m_flags & XFS_MOUNT_RDONLY)) { 1375 if (mp->m_update_flags && !(mp->m_flags & XFS_MOUNT_RDONLY)) {
1348 error = xfs_mount_log_sb(mp, mp->m_update_flags); 1376 error = xfs_mount_log_sb(mp, mp->m_update_flags);
1349 if (error) { 1377 if (error) {
1350 cmn_err(CE_WARN, "XFS: failed to write sb changes"); 1378 xfs_warn(mp, "failed to write sb changes");
1351 goto out_rtunmount; 1379 goto out_rtunmount;
1352 } 1380 }
1353 } 1381 }
@@ -1368,10 +1396,7 @@ xfs_mountfs(
1368 * quotachecked license. 1396 * quotachecked license.
1369 */ 1397 */
1370 if (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT) { 1398 if (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT) {
1371 cmn_err(CE_NOTE, 1399 xfs_notice(mp, "resetting quota flags");
1372 "XFS: resetting qflags for filesystem %s",
1373 mp->m_fsname);
1374
1375 error = xfs_mount_reset_sbqflags(mp); 1400 error = xfs_mount_reset_sbqflags(mp);
1376 if (error) 1401 if (error)
1377 return error; 1402 return error;
@@ -1385,7 +1410,7 @@ xfs_mountfs(
1385 */ 1410 */
1386 error = xfs_log_mount_finish(mp); 1411 error = xfs_log_mount_finish(mp);
1387 if (error) { 1412 if (error) {
1388 cmn_err(CE_WARN, "XFS: log mount finish failed"); 1413 xfs_warn(mp, "log mount finish failed");
1389 goto out_rtunmount; 1414 goto out_rtunmount;
1390 } 1415 }
1391 1416
@@ -1414,8 +1439,8 @@ xfs_mountfs(
1414 resblks = xfs_default_resblks(mp); 1439 resblks = xfs_default_resblks(mp);
1415 error = xfs_reserve_blocks(mp, &resblks, NULL); 1440 error = xfs_reserve_blocks(mp, &resblks, NULL);
1416 if (error) 1441 if (error)
1417 cmn_err(CE_WARN, "XFS: Unable to allocate reserve " 1442 xfs_warn(mp,
1418 "blocks. Continuing without a reserve pool."); 1443 "Unable to allocate reserve blocks. Continuing without reserve pool.");
1419 } 1444 }
1420 1445
1421 return 0; 1446 return 0;
@@ -1504,12 +1529,12 @@ xfs_unmountfs(
1504 resblks = 0; 1529 resblks = 0;
1505 error = xfs_reserve_blocks(mp, &resblks, NULL); 1530 error = xfs_reserve_blocks(mp, &resblks, NULL);
1506 if (error) 1531 if (error)
1507 cmn_err(CE_WARN, "XFS: Unable to free reserved block pool. " 1532 xfs_warn(mp, "Unable to free reserved block pool. "
1508 "Freespace may not be correct on next mount."); 1533 "Freespace may not be correct on next mount.");
1509 1534
1510 error = xfs_log_sbcount(mp, 1); 1535 error = xfs_log_sbcount(mp, 1);
1511 if (error) 1536 if (error)
1512 cmn_err(CE_WARN, "XFS: Unable to update superblock counters. " 1537 xfs_warn(mp, "Unable to update superblock counters. "
1513 "Freespace may not be correct on next mount."); 1538 "Freespace may not be correct on next mount.");
1514 xfs_unmountfs_writesb(mp); 1539 xfs_unmountfs_writesb(mp);
1515 xfs_unmountfs_wait(mp); /* wait for async bufs */ 1540 xfs_unmountfs_wait(mp); /* wait for async bufs */
@@ -1992,10 +2017,8 @@ xfs_dev_is_read_only(
1992 if (xfs_readonly_buftarg(mp->m_ddev_targp) || 2017 if (xfs_readonly_buftarg(mp->m_ddev_targp) ||
1993 xfs_readonly_buftarg(mp->m_logdev_targp) || 2018 xfs_readonly_buftarg(mp->m_logdev_targp) ||
1994 (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) { 2019 (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) {
1995 cmn_err(CE_NOTE, 2020 xfs_notice(mp, "%s required on read-only device.", message);
1996 "XFS: %s required on read-only device.", message); 2021 xfs_notice(mp, "write access unavailable, cannot proceed.");
1997 cmn_err(CE_NOTE,
1998 "XFS: write access unavailable, cannot proceed.");
1999 return EROFS; 2022 return EROFS;
2000 } 2023 }
2001 return 0; 2024 return 0;
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 5861b4980740..19af0ab0d0c6 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -103,6 +103,16 @@ extern int xfs_icsb_modify_counters(struct xfs_mount *, xfs_sb_field_t,
103 xfs_mod_incore_sb(mp, field, delta, rsvd) 103 xfs_mod_incore_sb(mp, field, delta, rsvd)
104#endif 104#endif
105 105
106/* dynamic preallocation free space thresholds, 5% down to 1% */
107enum {
108 XFS_LOWSP_1_PCNT = 0,
109 XFS_LOWSP_2_PCNT,
110 XFS_LOWSP_3_PCNT,
111 XFS_LOWSP_4_PCNT,
112 XFS_LOWSP_5_PCNT,
113 XFS_LOWSP_MAX,
114};
115
106typedef struct xfs_mount { 116typedef struct xfs_mount {
107 struct super_block *m_super; 117 struct super_block *m_super;
108 xfs_tid_t m_tid; /* next unused tid for fs */ 118 xfs_tid_t m_tid; /* next unused tid for fs */
@@ -193,15 +203,14 @@ typedef struct xfs_mount {
193 struct mutex m_icsb_mutex; /* balancer sync lock */ 203 struct mutex m_icsb_mutex; /* balancer sync lock */
194#endif 204#endif
195 struct xfs_mru_cache *m_filestream; /* per-mount filestream data */ 205 struct xfs_mru_cache *m_filestream; /* per-mount filestream data */
196 struct task_struct *m_sync_task; /* generalised sync thread */ 206 struct delayed_work m_sync_work; /* background sync work */
197 xfs_sync_work_t m_sync_work; /* work item for VFS_SYNC */ 207 struct delayed_work m_reclaim_work; /* background inode reclaim */
198 struct list_head m_sync_list; /* sync thread work item list */ 208 struct work_struct m_flush_work; /* background inode flush */
199 spinlock_t m_sync_lock; /* work item list lock */
200 int m_sync_seq; /* sync thread generation no. */
201 wait_queue_head_t m_wait_single_sync_task;
202 __int64_t m_update_flags; /* sb flags we need to update 209 __int64_t m_update_flags; /* sb flags we need to update
203 on the next remount,rw */ 210 on the next remount,rw */
204 struct shrinker m_inode_shrink; /* inode reclaim shrinker */ 211 struct shrinker m_inode_shrink; /* inode reclaim shrinker */
212 int64_t m_low_space[XFS_LOWSP_MAX];
213 /* low free space thresholds */
205} xfs_mount_t; 214} xfs_mount_t;
206 215
207/* 216/*
@@ -379,6 +388,8 @@ extern int xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t);
379 388
380extern int xfs_dev_is_read_only(struct xfs_mount *, char *); 389extern int xfs_dev_is_read_only(struct xfs_mount *, char *);
381 390
391extern void xfs_set_low_space_thresholds(struct xfs_mount *);
392
382#endif /* __KERNEL__ */ 393#endif /* __KERNEL__ */
383 394
384extern void xfs_mod_sb(struct xfs_trans *, __int64_t); 395extern void xfs_mod_sb(struct xfs_trans *, __int64_t);
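[editor's note] The new m_low_space[] array holds the free-space levels at which dynamic speculative preallocation backs off, one entry per percentage from 1% to 5% of sb_dblocks; xfs_set_low_space_thresholds() in the xfs_mount.c hunk earlier fills it with do_div(). A worked example, assuming a hypothetical filesystem of 1,000,000 data blocks:

	/* xfs_set_low_space_thresholds(), in effect:
	 *	space = sb_dblocks / 100;		 one percent of the data blocks
	 *	m_low_space[i] = space * (i + 1);	 i = 0..4  ->  1% .. 5%
	 *
	 * with sb_dblocks = 1,000,000:
	 *	m_low_space[XFS_LOWSP_1_PCNT] = 10,000 blocks
	 *	m_low_space[XFS_LOWSP_2_PCNT] = 20,000 blocks
	 *	m_low_space[XFS_LOWSP_3_PCNT] = 30,000 blocks
	 *	m_low_space[XFS_LOWSP_4_PCNT] = 40,000 blocks
	 *	m_low_space[XFS_LOWSP_5_PCNT] = 50,000 blocks
	 */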
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index 45ce15dc5b2b..4aff56395732 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -309,7 +309,7 @@ xfs_mru_cache_init(void)
309 if (!xfs_mru_elem_zone) 309 if (!xfs_mru_elem_zone)
310 goto out; 310 goto out;
311 311
312 xfs_mru_reap_wq = create_singlethread_workqueue("xfs_mru_cache"); 312 xfs_mru_reap_wq = alloc_workqueue("xfs_mru_cache", WQ_MEM_RECLAIM, 1);
313 if (!xfs_mru_reap_wq) 313 if (!xfs_mru_reap_wq)
314 goto out_destroy_mru_elem_zone; 314 goto out_destroy_mru_elem_zone;
315 315
@@ -408,7 +408,7 @@ xfs_mru_cache_flush(
408 spin_lock(&mru->lock); 408 spin_lock(&mru->lock);
409 if (mru->queued) { 409 if (mru->queued) {
410 spin_unlock(&mru->lock); 410 spin_unlock(&mru->lock);
411 cancel_rearming_delayed_workqueue(xfs_mru_reap_wq, &mru->work); 411 cancel_delayed_work_sync(&mru->work);
412 spin_lock(&mru->lock); 412 spin_lock(&mru->lock);
413 } 413 }
414 414
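[editor's note] Two workqueue API updates in xfs_mru_cache.c: create_singlethread_workqueue() becomes alloc_workqueue() with WQ_MEM_RECLAIM (presumably so the reaper can still make progress under memory pressure via a rescuer thread), and the long-deprecated cancel_rearming_delayed_workqueue() becomes cancel_delayed_work_sync(). A stand-alone sketch of the same pattern with illustrative names:

	#include <linux/workqueue.h>

	static struct workqueue_struct	*my_wq;
	static struct delayed_work	my_work;

	static void my_work_fn(struct work_struct *work)
	{
		/* reap expired entries, possibly queueing ourselves again */
	}

	static int my_setup(void)
	{
		/* max_active = 1 keeps the old single-threaded behaviour;
		 * WQ_MEM_RECLAIM gives the queue a rescuer thread */
		my_wq = alloc_workqueue("my_cache", WQ_MEM_RECLAIM, 1);
		if (!my_wq)
			return -ENOMEM;
		INIT_DELAYED_WORK(&my_work, my_work_fn);
		queue_delayed_work(my_wq, &my_work, HZ);
		return 0;
	}

	static void my_teardown(void)
	{
		/* waits for a running instance and prevents further rearming */
		cancel_delayed_work_sync(&my_work);
		destroy_workqueue(my_wq);
	}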
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index 9bb6eda4cd21..a595f29567fe 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -382,7 +382,8 @@ static inline int xfs_qm_sync(struct xfs_mount *mp, int flags)
382 xfs_trans_reserve_quota_bydquots(tp, mp, ud, gd, nb, ni, \ 382 xfs_trans_reserve_quota_bydquots(tp, mp, ud, gd, nb, ni, \
383 f | XFS_QMOPT_RES_REGBLKS) 383 f | XFS_QMOPT_RES_REGBLKS)
384 384
385extern int xfs_qm_dqcheck(xfs_disk_dquot_t *, xfs_dqid_t, uint, uint, char *); 385extern int xfs_qm_dqcheck(struct xfs_mount *, xfs_disk_dquot_t *,
386 xfs_dqid_t, uint, uint, char *);
386extern int xfs_mount_reset_sbqflags(struct xfs_mount *); 387extern int xfs_mount_reset_sbqflags(struct xfs_mount *);
387 388
388#endif /* __KERNEL__ */ 389#endif /* __KERNEL__ */
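[editor's note] xfs_qm_dqcheck() gains a struct xfs_mount * as its first argument so its diagnostics can go through the mount-aware message helpers, and callers now capture the return value explicitly instead of testing the call inside an if condition. The converted call site in xlog_recover_dquot_pass2() above reads:

	error = xfs_qm_dqcheck(mp, recddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
			       "xlog_recover_dquot_pass2 (log copy)");
	if (error)
		return XFS_ERROR(EIO);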
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index d2af0a8381a6..77a59891734e 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -297,6 +297,7 @@ xfs_rename(
297 * it and some incremental backup programs won't work without it. 297 * it and some incremental backup programs won't work without it.
298 */ 298 */
299 xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG); 299 xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG);
300 xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE);
300 301
301 /* 302 /*
302 * Adjust the link count on src_dp. This is necessary when 303 * Adjust the link count on src_dp. This is necessary when
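[editor's note] The one-line xfs_rename() change: xfs_trans_ichgtime() only updates the in-core timestamp, so unless the inode core is also logged in the same transaction the ctime bump on the source inode can be lost. The pair now reads:

	/* bump ctime on the source inode ... */
	xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG);
	/* ... and log the inode core so the timestamp change is persistent */
	xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE);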
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 12a191385310..8f76fdff4f46 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -76,7 +76,7 @@ xfs_growfs_rt_alloc(
76 xfs_mount_t *mp, /* file system mount point */ 76 xfs_mount_t *mp, /* file system mount point */
77 xfs_extlen_t oblocks, /* old count of blocks */ 77 xfs_extlen_t oblocks, /* old count of blocks */
78 xfs_extlen_t nblocks, /* new count of blocks */ 78 xfs_extlen_t nblocks, /* new count of blocks */
79 xfs_ino_t ino) /* inode number (bitmap/summary) */ 79 xfs_inode_t *ip) /* inode (bitmap/summary) */
80{ 80{
81 xfs_fileoff_t bno; /* block number in file */ 81 xfs_fileoff_t bno; /* block number in file */
82 xfs_buf_t *bp; /* temporary buffer for zeroing */ 82 xfs_buf_t *bp; /* temporary buffer for zeroing */
@@ -86,7 +86,6 @@ xfs_growfs_rt_alloc(
86 xfs_fsblock_t firstblock; /* first block allocated in xaction */ 86 xfs_fsblock_t firstblock; /* first block allocated in xaction */
87 xfs_bmap_free_t flist; /* list of freed blocks */ 87 xfs_bmap_free_t flist; /* list of freed blocks */
88 xfs_fsblock_t fsbno; /* filesystem block for bno */ 88 xfs_fsblock_t fsbno; /* filesystem block for bno */
89 xfs_inode_t *ip; /* pointer to incore inode */
90 xfs_bmbt_irec_t map; /* block map output */ 89 xfs_bmbt_irec_t map; /* block map output */
91 int nmap; /* number of block maps */ 90 int nmap; /* number of block maps */
92 int resblks; /* space reservation */ 91 int resblks; /* space reservation */
@@ -112,9 +111,9 @@ xfs_growfs_rt_alloc(
112 /* 111 /*
113 * Lock the inode. 112 * Lock the inode.
114 */ 113 */
115 if ((error = xfs_trans_iget(mp, tp, ino, 0, 114 xfs_ilock(ip, XFS_ILOCK_EXCL);
116 XFS_ILOCK_EXCL, &ip))) 115 xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
117 goto error_cancel; 116
118 xfs_bmap_init(&flist, &firstblock); 117 xfs_bmap_init(&flist, &firstblock);
119 /* 118 /*
120 * Allocate blocks to the bitmap file. 119 * Allocate blocks to the bitmap file.
@@ -155,9 +154,8 @@ xfs_growfs_rt_alloc(
155 /* 154 /*
156 * Lock the bitmap inode. 155 * Lock the bitmap inode.
157 */ 156 */
158 if ((error = xfs_trans_iget(mp, tp, ino, 0, 157 xfs_ilock(ip, XFS_ILOCK_EXCL);
159 XFS_ILOCK_EXCL, &ip))) 158 xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
160 goto error_cancel;
161 /* 159 /*
162 * Get a buffer for the block. 160 * Get a buffer for the block.
163 */ 161 */
@@ -1854,7 +1852,6 @@ xfs_growfs_rt(
1854 xfs_rtblock_t bmbno; /* bitmap block number */ 1852 xfs_rtblock_t bmbno; /* bitmap block number */
1855 xfs_buf_t *bp; /* temporary buffer */ 1853 xfs_buf_t *bp; /* temporary buffer */
1856 int error; /* error return value */ 1854 int error; /* error return value */
1857 xfs_inode_t *ip; /* bitmap inode, used as lock */
1858 xfs_mount_t *nmp; /* new (fake) mount structure */ 1855 xfs_mount_t *nmp; /* new (fake) mount structure */
1859 xfs_drfsbno_t nrblocks; /* new number of realtime blocks */ 1856 xfs_drfsbno_t nrblocks; /* new number of realtime blocks */
1860 xfs_extlen_t nrbmblocks; /* new number of rt bitmap blocks */ 1857 xfs_extlen_t nrbmblocks; /* new number of rt bitmap blocks */
@@ -1918,11 +1915,11 @@ xfs_growfs_rt(
1918 /* 1915 /*
1919 * Allocate space to the bitmap and summary files, as necessary. 1916 * Allocate space to the bitmap and summary files, as necessary.
1920 */ 1917 */
1921 if ((error = xfs_growfs_rt_alloc(mp, rbmblocks, nrbmblocks, 1918 error = xfs_growfs_rt_alloc(mp, rbmblocks, nrbmblocks, mp->m_rbmip);
1922 mp->m_sb.sb_rbmino))) 1919 if (error)
1923 return error; 1920 return error;
1924 if ((error = xfs_growfs_rt_alloc(mp, rsumblocks, nrsumblocks, 1921 error = xfs_growfs_rt_alloc(mp, rsumblocks, nrsumblocks, mp->m_rsumip);
1925 mp->m_sb.sb_rsumino))) 1922 if (error)
1926 return error; 1923 return error;
1927 /* 1924 /*
1928 * Allocate a new (fake) mount/sb. 1925 * Allocate a new (fake) mount/sb.
@@ -1972,10 +1969,8 @@ xfs_growfs_rt(
1972 /* 1969 /*
1973 * Lock out other callers by grabbing the bitmap inode lock. 1970 * Lock out other callers by grabbing the bitmap inode lock.
1974 */ 1971 */
1975 if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0, 1972 xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
1976 XFS_ILOCK_EXCL, &ip))) 1973 xfs_trans_ijoin_ref(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
1977 goto error_cancel;
1978 ASSERT(ip == mp->m_rbmip);
1979 /* 1974 /*
1980 * Update the bitmap inode's size. 1975 * Update the bitmap inode's size.
1981 */ 1976 */
@@ -1986,10 +1981,8 @@ xfs_growfs_rt(
1986 /* 1981 /*
1987 * Get the summary inode into the transaction. 1982 * Get the summary inode into the transaction.
1988 */ 1983 */
1989 if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rsumino, 0, 1984 xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL);
1990 XFS_ILOCK_EXCL, &ip))) 1985 xfs_trans_ijoin_ref(tp, mp->m_rsumip, XFS_ILOCK_EXCL);
1991 goto error_cancel;
1992 ASSERT(ip == mp->m_rsumip);
1993 /* 1986 /*
1994 * Update the summary inode's size. 1987 * Update the summary inode's size.
1995 */ 1988 */
@@ -2075,15 +2068,15 @@ xfs_rtallocate_extent(
2075 xfs_extlen_t prod, /* extent product factor */ 2068 xfs_extlen_t prod, /* extent product factor */
2076 xfs_rtblock_t *rtblock) /* out: start block allocated */ 2069 xfs_rtblock_t *rtblock) /* out: start block allocated */
2077{ 2070{
2071 xfs_mount_t *mp = tp->t_mountp;
2078 int error; /* error value */ 2072 int error; /* error value */
2079 xfs_inode_t *ip; /* inode for bitmap file */
2080 xfs_mount_t *mp; /* file system mount structure */
2081 xfs_rtblock_t r; /* result allocated block */ 2073 xfs_rtblock_t r; /* result allocated block */
2082 xfs_fsblock_t sb; /* summary file block number */ 2074 xfs_fsblock_t sb; /* summary file block number */
2083 xfs_buf_t *sumbp; /* summary file block buffer */ 2075 xfs_buf_t *sumbp; /* summary file block buffer */
2084 2076
2077 ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
2085 ASSERT(minlen > 0 && minlen <= maxlen); 2078 ASSERT(minlen > 0 && minlen <= maxlen);
2086 mp = tp->t_mountp; 2079
2087 /* 2080 /*
2088 * If prod is set then figure out what to do to minlen and maxlen. 2081 * If prod is set then figure out what to do to minlen and maxlen.
2089 */ 2082 */
@@ -2099,12 +2092,7 @@ xfs_rtallocate_extent(
2099 return 0; 2092 return 0;
2100 } 2093 }
2101 } 2094 }
2102 /* 2095
2103 * Lock out other callers by grabbing the bitmap inode lock.
2104 */
2105 if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0,
2106 XFS_ILOCK_EXCL, &ip)))
2107 return error;
2108 sumbp = NULL; 2096 sumbp = NULL;
2109 /* 2097 /*
2110 * Allocate by size, or near another block, or exactly at some block. 2098 * Allocate by size, or near another block, or exactly at some block.
@@ -2123,11 +2111,12 @@ xfs_rtallocate_extent(
2123 len, &sumbp, &sb, prod, &r); 2111 len, &sumbp, &sb, prod, &r);
2124 break; 2112 break;
2125 default: 2113 default:
2114 error = EIO;
2126 ASSERT(0); 2115 ASSERT(0);
2127 } 2116 }
2128 if (error) { 2117 if (error)
2129 return error; 2118 return error;
2130 } 2119
2131 /* 2120 /*
2132 * If it worked, update the superblock. 2121 * If it worked, update the superblock.
2133 */ 2122 */
@@ -2155,7 +2144,6 @@ xfs_rtfree_extent(
2155 xfs_extlen_t len) /* length of extent freed */ 2144 xfs_extlen_t len) /* length of extent freed */
2156{ 2145{
2157 int error; /* error value */ 2146 int error; /* error value */
2158 xfs_inode_t *ip; /* bitmap file inode */
2159 xfs_mount_t *mp; /* file system mount structure */ 2147 xfs_mount_t *mp; /* file system mount structure */
2160 xfs_fsblock_t sb; /* summary file block number */ 2148 xfs_fsblock_t sb; /* summary file block number */
2161 xfs_buf_t *sumbp; /* summary file block buffer */ 2149 xfs_buf_t *sumbp; /* summary file block buffer */
@@ -2164,9 +2152,9 @@ xfs_rtfree_extent(
2164 /* 2152 /*
2165 * Synchronize by locking the bitmap inode. 2153 * Synchronize by locking the bitmap inode.
2166 */ 2154 */
2167 if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0, 2155 xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
2168 XFS_ILOCK_EXCL, &ip))) 2156 xfs_trans_ijoin_ref(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
2169 return error; 2157
2170#if defined(__KERNEL__) && defined(DEBUG) 2158#if defined(__KERNEL__) && defined(DEBUG)
2171 /* 2159 /*
2172 * Check to see that this whole range is currently allocated. 2160 * Check to see that this whole range is currently allocated.
@@ -2199,10 +2187,10 @@ xfs_rtfree_extent(
2199 */ 2187 */
2200 if (tp->t_frextents_delta + mp->m_sb.sb_frextents == 2188 if (tp->t_frextents_delta + mp->m_sb.sb_frextents ==
2201 mp->m_sb.sb_rextents) { 2189 mp->m_sb.sb_rextents) {
2202 if (!(ip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) 2190 if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM))
2203 ip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM; 2191 mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM;
2204 *(__uint64_t *)&ip->i_d.di_atime = 0; 2192 *(__uint64_t *)&mp->m_rbmip->i_d.di_atime = 0;
2205 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 2193 xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
2206 } 2194 }
2207 return 0; 2195 return 0;
2208} 2196}
@@ -2222,8 +2210,8 @@ xfs_rtmount_init(
2222 if (sbp->sb_rblocks == 0) 2210 if (sbp->sb_rblocks == 0)
2223 return 0; 2211 return 0;
2224 if (mp->m_rtdev_targp == NULL) { 2212 if (mp->m_rtdev_targp == NULL) {
2225 cmn_err(CE_WARN, 2213 xfs_warn(mp,
2226 "XFS: This filesystem has a realtime volume, use rtdev=device option"); 2214 "Filesystem has a realtime volume, use rtdev=device option");
2227 return XFS_ERROR(ENODEV); 2215 return XFS_ERROR(ENODEV);
2228 } 2216 }
2229 mp->m_rsumlevels = sbp->sb_rextslog + 1; 2217 mp->m_rsumlevels = sbp->sb_rextslog + 1;
@@ -2237,7 +2225,7 @@ xfs_rtmount_init(
2237 */ 2225 */
2238 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks); 2226 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks);
2239 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_rblocks) { 2227 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_rblocks) {
2240 cmn_err(CE_WARN, "XFS: realtime mount -- %llu != %llu", 2228 xfs_warn(mp, "realtime mount -- %llu != %llu",
2241 (unsigned long long) XFS_BB_TO_FSB(mp, d), 2229 (unsigned long long) XFS_BB_TO_FSB(mp, d),
2242 (unsigned long long) mp->m_sb.sb_rblocks); 2230 (unsigned long long) mp->m_sb.sb_rblocks);
2243 return XFS_ERROR(EFBIG); 2231 return XFS_ERROR(EFBIG);
@@ -2246,7 +2234,7 @@ xfs_rtmount_init(
2246 d - XFS_FSB_TO_BB(mp, 1), 2234 d - XFS_FSB_TO_BB(mp, 1),
2247 XFS_FSB_TO_B(mp, 1), 0); 2235 XFS_FSB_TO_B(mp, 1), 0);
2248 if (!bp) { 2236 if (!bp) {
2249 cmn_err(CE_WARN, "XFS: realtime device size check failed"); 2237 xfs_warn(mp, "realtime device size check failed");
2250 return EIO; 2238 return EIO;
2251 } 2239 }
2252 xfs_buf_relse(bp); 2240 xfs_buf_relse(bp);
@@ -2306,20 +2294,16 @@ xfs_rtpick_extent(
2306 xfs_rtblock_t *pick) /* result rt extent */ 2294 xfs_rtblock_t *pick) /* result rt extent */
2307{ 2295{
2308 xfs_rtblock_t b; /* result block */ 2296 xfs_rtblock_t b; /* result block */
2309 int error; /* error return value */
2310 xfs_inode_t *ip; /* bitmap incore inode */
2311 int log2; /* log of sequence number */ 2297 int log2; /* log of sequence number */
2312 __uint64_t resid; /* residual after log removed */ 2298 __uint64_t resid; /* residual after log removed */
2313 __uint64_t seq; /* sequence number of file creation */ 2299 __uint64_t seq; /* sequence number of file creation */
2314 __uint64_t *seqp; /* pointer to seqno in inode */ 2300 __uint64_t *seqp; /* pointer to seqno in inode */
2315 2301
2316 if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0, 2302 ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
2317 XFS_ILOCK_EXCL, &ip))) 2303
2318 return error; 2304 seqp = (__uint64_t *)&mp->m_rbmip->i_d.di_atime;
2319 ASSERT(ip == mp->m_rbmip); 2305 if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) {
2320 seqp = (__uint64_t *)&ip->i_d.di_atime; 2306 mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM;
2321 if (!(ip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) {
2322 ip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM;
2323 *seqp = 0; 2307 *seqp = 0;
2324 } 2308 }
2325 seq = *seqp; 2309 seq = *seqp;
@@ -2335,7 +2319,7 @@ xfs_rtpick_extent(
2335 b = mp->m_sb.sb_rextents - len; 2319 b = mp->m_sb.sb_rextents - len;
2336 } 2320 }
2337 *seqp = seq + 1; 2321 *seqp = seq + 1;
2338 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 2322 xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
2339 *pick = b; 2323 *pick = b;
2340 return 0; 2324 return 0;
2341} 2325}
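
The xfs_rtalloc.c hunks above stop looking up the realtime bitmap inode through xfs_trans_iget() on every call and instead lock the inode already cached at mount time in mp->m_rbmip, joining it to the transaction by reference. A minimal sketch of the resulting pattern, using only names that appear in the hunks above (an illustration of the idiom, not additional kernel code):

	/* lock the cached realtime bitmap inode and attach it to the transaction */
	xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin_ref(tp, mp->m_rbmip, XFS_ILOCK_EXCL);

	/* ... bitmap/summary updates ... */
	xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);

Because the inode now comes from the mount structure, the local xfs_inode_t *ip declarations and the error handling for a failed lookup can be dropped, which is what most of the removed lines above are.
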
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index ff614c29b441..09e1f4f35e97 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -154,7 +154,7 @@ xfs_rtmount_init(
154 if (mp->m_sb.sb_rblocks == 0) 154 if (mp->m_sb.sb_rblocks == 0)
155 return 0; 155 return 0;
156 156
157 cmn_err(CE_WARN, "XFS: Not built with CONFIG_XFS_RT"); 157 xfs_warn(mp, "Not built with CONFIG_XFS_RT");
158 return ENOSYS; 158 return ENOSYS;
159} 159}
160# define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS)) 160# define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS))
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
index 56861d5daaef..d6d6fdfe9422 100644
--- a/fs/xfs/xfs_rw.c
+++ b/fs/xfs/xfs_rw.c
@@ -49,9 +49,9 @@ xfs_do_force_shutdown(
49 logerror = flags & SHUTDOWN_LOG_IO_ERROR; 49 logerror = flags & SHUTDOWN_LOG_IO_ERROR;
50 50
51 if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { 51 if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
52 cmn_err(CE_NOTE, "xfs_force_shutdown(%s,0x%x) called from " 52 xfs_notice(mp,
53 "line %d of file %s. Return address = 0x%p", 53 "%s(0x%x) called from line %d of file %s. Return address = 0x%p",
54 mp->m_fsname, flags, lnnum, fname, __return_address); 54 __func__, flags, lnnum, fname, __return_address);
55 } 55 }
56 /* 56 /*
57 * No need to duplicate efforts. 57 * No need to duplicate efforts.
@@ -69,30 +69,25 @@ xfs_do_force_shutdown(
69 return; 69 return;
70 70
71 if (flags & SHUTDOWN_CORRUPT_INCORE) { 71 if (flags & SHUTDOWN_CORRUPT_INCORE) {
72 xfs_cmn_err(XFS_PTAG_SHUTDOWN_CORRUPT, CE_ALERT, mp, 72 xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_CORRUPT,
73 "Corruption of in-memory data detected. Shutting down filesystem: %s", 73 "Corruption of in-memory data detected. Shutting down filesystem");
74 mp->m_fsname); 74 if (XFS_ERRLEVEL_HIGH <= xfs_error_level)
75 if (XFS_ERRLEVEL_HIGH <= xfs_error_level) {
76 xfs_stack_trace(); 75 xfs_stack_trace();
77 }
78 } else if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { 76 } else if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
79 if (logerror) { 77 if (logerror) {
80 xfs_cmn_err(XFS_PTAG_SHUTDOWN_LOGERROR, CE_ALERT, mp, 78 xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_LOGERROR,
81 "Log I/O Error Detected. Shutting down filesystem: %s", 79 "Log I/O Error Detected. Shutting down filesystem");
82 mp->m_fsname);
83 } else if (flags & SHUTDOWN_DEVICE_REQ) { 80 } else if (flags & SHUTDOWN_DEVICE_REQ) {
84 xfs_cmn_err(XFS_PTAG_SHUTDOWN_IOERROR, CE_ALERT, mp, 81 xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR,
85 "All device paths lost. Shutting down filesystem: %s", 82 "All device paths lost. Shutting down filesystem");
86 mp->m_fsname);
87 } else if (!(flags & SHUTDOWN_REMOTE_REQ)) { 83 } else if (!(flags & SHUTDOWN_REMOTE_REQ)) {
88 xfs_cmn_err(XFS_PTAG_SHUTDOWN_IOERROR, CE_ALERT, mp, 84 xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR,
89 "I/O Error Detected. Shutting down filesystem: %s", 85 "I/O Error Detected. Shutting down filesystem");
90 mp->m_fsname);
91 } 86 }
92 } 87 }
93 if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { 88 if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
94 cmn_err(CE_ALERT, "Please umount the filesystem, " 89 xfs_alert(mp,
95 "and rectify the problem(s)"); 90 "Please umount the filesystem and rectify the problem(s)");
96 } 91 }
97} 92}
98 93
@@ -106,10 +101,9 @@ xfs_ioerror_alert(
106 xfs_buf_t *bp, 101 xfs_buf_t *bp,
107 xfs_daddr_t blkno) 102 xfs_daddr_t blkno)
108{ 103{
109 cmn_err(CE_ALERT, 104 xfs_alert(mp,
110 "I/O error in filesystem (\"%s\") meta-data dev %s block 0x%llx" 105 "I/O error occurred: meta-data dev %s block 0x%llx"
111 " (\"%s\") error %d buf count %zd", 106 " (\"%s\") error %d buf count %zd",
112 (!mp || !mp->m_fsname) ? "(fs name not set)" : mp->m_fsname,
113 XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)), 107 XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)),
114 (__uint64_t)blkno, func, 108 (__uint64_t)blkno, func,
115 XFS_BUF_GETERROR(bp), XFS_BUF_COUNT(bp)); 109 XFS_BUF_GETERROR(bp), XFS_BUF_COUNT(bp));
@@ -173,17 +167,9 @@ xfs_extlen_t
173xfs_get_extsz_hint( 167xfs_get_extsz_hint(
174 struct xfs_inode *ip) 168 struct xfs_inode *ip)
175{ 169{
176 xfs_extlen_t extsz; 170 if ((ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) && ip->i_d.di_extsize)
177 171 return ip->i_d.di_extsize;
178 if (unlikely(XFS_IS_REALTIME_INODE(ip))) { 172 if (XFS_IS_REALTIME_INODE(ip))
179 extsz = (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) 173 return ip->i_mount->m_sb.sb_rextsize;
180 ? ip->i_d.di_extsize 174 return 0;
181 : ip->i_mount->m_sb.sb_rextsize;
182 ASSERT(extsz);
183 } else {
184 extsz = (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE)
185 ? ip->i_d.di_extsize : 0;
186 }
187
188 return extsz;
189} 175}
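
The xfs_rw.c hunks continue the conversion from cmn_err()/xfs_cmn_err() to the mount-aware message helpers (xfs_warn, xfs_notice, xfs_alert, xfs_alert_tag) added in fs/xfs/linux-2.6/xfs_message.[ch] per the diffstat; the helpers take the mount point as their first argument, so the "%s", mp->m_fsname boilerplate disappears from the format strings. Call style, copied from the hunks above rather than invented:

	xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_CORRUPT,
		"Corruption of in-memory data detected. Shutting down filesystem");

	xfs_alert(mp,
		"Please umount the filesystem and rectify the problem(s)");

The xfs_get_extsz_hint() rewrite at the end of the file is a pure simplification of the nested ternaries: return the inode's own hint when XFS_DIFLAG_EXTSIZE is set and the hint is non-zero, fall back to the filesystem's realtime extent size for realtime inodes, and return 0 otherwise.
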
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index f6d956b7711e..76922793f64f 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -1137,7 +1137,7 @@ out_undo_fdblocks:
1137 if (blkdelta) 1137 if (blkdelta)
1138 xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, -blkdelta, rsvd); 1138 xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, -blkdelta, rsvd);
1139out: 1139out:
1140 ASSERT(error = 0); 1140 ASSERT(error == 0);
1141 return; 1141 return;
1142} 1142}
1143 1143
@@ -1350,7 +1350,7 @@ xfs_trans_fill_vecs(
1350 * they could be immediately flushed and we'd have to race with the flusher 1350 * they could be immediately flushed and we'd have to race with the flusher
1351 * trying to pull the item from the AIL as we add it. 1351 * trying to pull the item from the AIL as we add it.
1352 */ 1352 */
1353void 1353static void
1354xfs_trans_item_committed( 1354xfs_trans_item_committed(
1355 struct xfs_log_item *lip, 1355 struct xfs_log_item *lip,
1356 xfs_lsn_t commit_lsn, 1356 xfs_lsn_t commit_lsn,
@@ -1425,21 +1425,120 @@ xfs_trans_committed(
1425 xfs_trans_free(tp); 1425 xfs_trans_free(tp);
1426} 1426}
1427 1427
1428static inline void
1429xfs_log_item_batch_insert(
1430 struct xfs_ail *ailp,
1431 struct xfs_log_item **log_items,
1432 int nr_items,
1433 xfs_lsn_t commit_lsn)
1434{
1435 int i;
1436
1437 spin_lock(&ailp->xa_lock);
1438 /* xfs_trans_ail_update_bulk drops ailp->xa_lock */
1439 xfs_trans_ail_update_bulk(ailp, log_items, nr_items, commit_lsn);
1440
1441 for (i = 0; i < nr_items; i++)
1442 IOP_UNPIN(log_items[i], 0);
1443}
1444
1428/* 1445/*
1429 * Called from the trans_commit code when we notice that 1446 * Bulk operation version of xfs_trans_committed that takes a log vector of
1430 * the filesystem is in the middle of a forced shutdown. 1447 * items to insert into the AIL. This uses bulk AIL insertion techniques to
1448 * minimise lock traffic.
1449 *
1450 * If we are called with the aborted flag set, it is because a log write during
1451 * a CIL checkpoint commit has failed. In this case, all the items in the
 1452 * checkpoint have already gone through IOP_COMMITTED and IOP_UNLOCK, which
1453 * means that checkpoint commit abort handling is treated exactly the same
1454 * as an iclog write error even though we haven't started any IO yet. Hence in
1455 * this case all we need to do is IOP_COMMITTED processing, followed by an
1456 * IOP_UNPIN(aborted) call.
1457 */
1458void
1459xfs_trans_committed_bulk(
1460 struct xfs_ail *ailp,
1461 struct xfs_log_vec *log_vector,
1462 xfs_lsn_t commit_lsn,
1463 int aborted)
1464{
1465#define LOG_ITEM_BATCH_SIZE 32
1466 struct xfs_log_item *log_items[LOG_ITEM_BATCH_SIZE];
1467 struct xfs_log_vec *lv;
1468 int i = 0;
1469
1470 /* unpin all the log items */
1471 for (lv = log_vector; lv; lv = lv->lv_next ) {
1472 struct xfs_log_item *lip = lv->lv_item;
1473 xfs_lsn_t item_lsn;
1474
1475 if (aborted)
1476 lip->li_flags |= XFS_LI_ABORTED;
1477 item_lsn = IOP_COMMITTED(lip, commit_lsn);
1478
1479 /* item_lsn of -1 means the item was freed */
1480 if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0)
1481 continue;
1482
1483 /*
1484 * if we are aborting the operation, no point in inserting the
1485 * object into the AIL as we are in a shutdown situation.
1486 */
1487 if (aborted) {
1488 ASSERT(XFS_FORCED_SHUTDOWN(ailp->xa_mount));
1489 IOP_UNPIN(lip, 1);
1490 continue;
1491 }
1492
1493 if (item_lsn != commit_lsn) {
1494
1495 /*
1496 * Not a bulk update option due to unusual item_lsn.
1497 * Push into AIL immediately, rechecking the lsn once
1498 * we have the ail lock. Then unpin the item.
1499 */
1500 spin_lock(&ailp->xa_lock);
1501 if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0)
1502 xfs_trans_ail_update(ailp, lip, item_lsn);
1503 else
1504 spin_unlock(&ailp->xa_lock);
1505 IOP_UNPIN(lip, 0);
1506 continue;
1507 }
1508
1509 /* Item is a candidate for bulk AIL insert. */
1510 log_items[i++] = lv->lv_item;
1511 if (i >= LOG_ITEM_BATCH_SIZE) {
1512 xfs_log_item_batch_insert(ailp, log_items,
1513 LOG_ITEM_BATCH_SIZE, commit_lsn);
1514 i = 0;
1515 }
1516 }
1517
1518 /* make sure we insert the remainder! */
1519 if (i)
1520 xfs_log_item_batch_insert(ailp, log_items, i, commit_lsn);
1521}
1522
1523/*
1524 * Called from the trans_commit code when we notice that the filesystem is in
1525 * the middle of a forced shutdown.
1526 *
1527 * When we are called here, we have already pinned all the items in the
1528 * transaction. However, neither IOP_COMMITTING or IOP_UNLOCK has been called
1529 * so we can simply walk the items in the transaction, unpin them with an abort
1530 * flag and then free the items. Note that unpinning the items can result in
1531 * them being freed immediately, so we need to use a safe list traversal method
1532 * here.
1431 */ 1533 */
1432STATIC void 1534STATIC void
1433xfs_trans_uncommit( 1535xfs_trans_uncommit(
1434 struct xfs_trans *tp, 1536 struct xfs_trans *tp,
1435 uint flags) 1537 uint flags)
1436{ 1538{
1437 struct xfs_log_item_desc *lidp; 1539 struct xfs_log_item_desc *lidp, *n;
1438 1540
1439 list_for_each_entry(lidp, &tp->t_items, lid_trans) { 1541 list_for_each_entry_safe(lidp, n, &tp->t_items, lid_trans) {
1440 /*
1441 * Unpin all but those that aren't dirty.
1442 */
1443 if (lidp->lid_flags & XFS_LID_DIRTY) 1542 if (lidp->lid_flags & XFS_LID_DIRTY)
1444 IOP_UNPIN(lidp->lid_item, 1); 1543 IOP_UNPIN(lidp->lid_item, 1);
1445 } 1544 }
@@ -1656,7 +1755,6 @@ xfs_trans_commit_cil(
1656 int flags) 1755 int flags)
1657{ 1756{
1658 struct xfs_log_vec *log_vector; 1757 struct xfs_log_vec *log_vector;
1659 int error;
1660 1758
1661 /* 1759 /*
1662 * Get each log item to allocate a vector structure for 1760 * Get each log item to allocate a vector structure for
@@ -1667,9 +1765,7 @@ xfs_trans_commit_cil(
1667 if (!log_vector) 1765 if (!log_vector)
1668 return ENOMEM; 1766 return ENOMEM;
1669 1767
1670 error = xfs_log_commit_cil(mp, tp, log_vector, commit_lsn, flags); 1768 xfs_log_commit_cil(mp, tp, log_vector, commit_lsn, flags);
1671 if (error)
1672 return error;
1673 1769
1674 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); 1770 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
1675 xfs_trans_free(tp); 1771 xfs_trans_free(tp);
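
The new xfs_trans_committed_bulk() above replaces per-item AIL insertion during checkpoint completion with batched insertion: candidate items are collected into an on-stack array and xfs_log_item_batch_insert() takes ailp->xa_lock once for up to LOG_ITEM_BATCH_SIZE items, unpinning them after the bulk update. Items with an unusual committed LSN, or items in an aborted checkpoint, are still handled one at a time. A stripped-down sketch of the batching idiom (the full function is in the hunk above; the per-item handling is elided here):

	struct xfs_log_item	*batch[LOG_ITEM_BATCH_SIZE];
	int			n = 0;

	for (lv = log_vector; lv; lv = lv->lv_next) {
		/* ... IOP_COMMITTED / abort / odd-LSN handling elided ... */
		batch[n++] = lv->lv_item;
		if (n == LOG_ITEM_BATCH_SIZE) {
			xfs_log_item_batch_insert(ailp, batch, n, commit_lsn);
			n = 0;
		}
	}
	if (n)	/* flush the partial final batch */
		xfs_log_item_batch_insert(ailp, batch, n, commit_lsn);
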
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 246286b77a86..06a9759b6352 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -294,8 +294,8 @@ struct xfs_log_item_desc {
294#define XFS_ALLOC_BTREE_REF 2 294#define XFS_ALLOC_BTREE_REF 2
295#define XFS_BMAP_BTREE_REF 2 295#define XFS_BMAP_BTREE_REF 2
296#define XFS_DIR_BTREE_REF 2 296#define XFS_DIR_BTREE_REF 2
297#define XFS_INO_REF 2
297#define XFS_ATTR_BTREE_REF 1 298#define XFS_ATTR_BTREE_REF 1
298#define XFS_INO_REF 1
299#define XFS_DQUOT_REF 1 299#define XFS_DQUOT_REF 1
300 300
301#ifdef __KERNEL__ 301#ifdef __KERNEL__
@@ -469,8 +469,6 @@ void xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *);
469void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *); 469void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *);
470void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint); 470void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint);
471void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *); 471void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *);
472int xfs_trans_iget(struct xfs_mount *, xfs_trans_t *,
473 xfs_ino_t , uint, uint, struct xfs_inode **);
474void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int); 472void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int);
475void xfs_trans_ijoin_ref(struct xfs_trans *, struct xfs_inode *, uint); 473void xfs_trans_ijoin_ref(struct xfs_trans *, struct xfs_inode *, uint);
476void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *); 474void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *);
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index dc9069568ff7..acdb92f14d51 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -28,74 +28,138 @@
28#include "xfs_trans_priv.h" 28#include "xfs_trans_priv.h"
29#include "xfs_error.h" 29#include "xfs_error.h"
30 30
31STATIC void xfs_ail_insert(struct xfs_ail *, xfs_log_item_t *); 31struct workqueue_struct *xfs_ail_wq; /* AIL workqueue */
32STATIC xfs_log_item_t * xfs_ail_delete(struct xfs_ail *, xfs_log_item_t *);
33STATIC xfs_log_item_t * xfs_ail_min(struct xfs_ail *);
34STATIC xfs_log_item_t * xfs_ail_next(struct xfs_ail *, xfs_log_item_t *);
35 32
36#ifdef DEBUG 33#ifdef DEBUG
37STATIC void xfs_ail_check(struct xfs_ail *, xfs_log_item_t *); 34/*
38#else 35 * Check that the list is sorted as it should be.
36 */
37STATIC void
38xfs_ail_check(
39 struct xfs_ail *ailp,
40 xfs_log_item_t *lip)
41{
42 xfs_log_item_t *prev_lip;
43
44 if (list_empty(&ailp->xa_ail))
45 return;
46
47 /*
48 * Check the next and previous entries are valid.
49 */
50 ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0);
51 prev_lip = list_entry(lip->li_ail.prev, xfs_log_item_t, li_ail);
52 if (&prev_lip->li_ail != &ailp->xa_ail)
53 ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0);
54
55 prev_lip = list_entry(lip->li_ail.next, xfs_log_item_t, li_ail);
56 if (&prev_lip->li_ail != &ailp->xa_ail)
57 ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) >= 0);
58
59
60#ifdef XFS_TRANS_DEBUG
61 /*
62 * Walk the list checking lsn ordering, and that every entry has the
63 * XFS_LI_IN_AIL flag set. This is really expensive, so only do it
64 * when specifically debugging the transaction subsystem.
65 */
66 prev_lip = list_entry(&ailp->xa_ail, xfs_log_item_t, li_ail);
67 list_for_each_entry(lip, &ailp->xa_ail, li_ail) {
68 if (&prev_lip->li_ail != &ailp->xa_ail)
69 ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0);
70 ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0);
71 prev_lip = lip;
72 }
73#endif /* XFS_TRANS_DEBUG */
74}
75#else /* !DEBUG */
39#define xfs_ail_check(a,l) 76#define xfs_ail_check(a,l)
40#endif /* DEBUG */ 77#endif /* DEBUG */
41 78
79/*
80 * Return a pointer to the first item in the AIL. If the AIL is empty, then
81 * return NULL.
82 */
83static xfs_log_item_t *
84xfs_ail_min(
85 struct xfs_ail *ailp)
86{
87 if (list_empty(&ailp->xa_ail))
88 return NULL;
89
90 return list_first_entry(&ailp->xa_ail, xfs_log_item_t, li_ail);
91}
92
93 /*
94 * Return a pointer to the last item in the AIL. If the AIL is empty, then
95 * return NULL.
96 */
97static xfs_log_item_t *
98xfs_ail_max(
99 struct xfs_ail *ailp)
100{
101 if (list_empty(&ailp->xa_ail))
102 return NULL;
103
104 return list_entry(ailp->xa_ail.prev, xfs_log_item_t, li_ail);
105}
106
107/*
108 * Return a pointer to the item which follows the given item in the AIL. If
109 * the given item is the last item in the list, then return NULL.
110 */
111static xfs_log_item_t *
112xfs_ail_next(
113 struct xfs_ail *ailp,
114 xfs_log_item_t *lip)
115{
116 if (lip->li_ail.next == &ailp->xa_ail)
117 return NULL;
118
119 return list_first_entry(&lip->li_ail, xfs_log_item_t, li_ail);
120}
42 121
43/* 122/*
44 * This is called by the log manager code to determine the LSN 123 * This is called by the log manager code to determine the LSN of the tail of
45 * of the tail of the log. This is exactly the LSN of the first 124 * the log. This is exactly the LSN of the first item in the AIL. If the AIL
46 * item in the AIL. If the AIL is empty, then this function 125 * is empty, then this function returns 0.
47 * returns 0.
48 * 126 *
49 * We need the AIL lock in order to get a coherent read of the 127 * We need the AIL lock in order to get a coherent read of the lsn of the last
50 * lsn of the last item in the AIL. 128 * item in the AIL.
51 */ 129 */
52xfs_lsn_t 130xfs_lsn_t
53xfs_trans_ail_tail( 131xfs_ail_min_lsn(
54 struct xfs_ail *ailp) 132 struct xfs_ail *ailp)
55{ 133{
56 xfs_lsn_t lsn; 134 xfs_lsn_t lsn = 0;
57 xfs_log_item_t *lip; 135 xfs_log_item_t *lip;
58 136
59 spin_lock(&ailp->xa_lock); 137 spin_lock(&ailp->xa_lock);
60 lip = xfs_ail_min(ailp); 138 lip = xfs_ail_min(ailp);
61 if (lip == NULL) { 139 if (lip)
62 lsn = (xfs_lsn_t)0;
63 } else {
64 lsn = lip->li_lsn; 140 lsn = lip->li_lsn;
65 }
66 spin_unlock(&ailp->xa_lock); 141 spin_unlock(&ailp->xa_lock);
67 142
68 return lsn; 143 return lsn;
69} 144}
70 145
71/* 146/*
72 * xfs_trans_push_ail 147 * Return the maximum lsn held in the AIL, or zero if the AIL is empty.
73 *
74 * This routine is called to move the tail of the AIL forward. It does this by
75 * trying to flush items in the AIL whose lsns are below the given
76 * threshold_lsn.
77 *
78 * the push is run asynchronously in a separate thread, so we return the tail
79 * of the log right now instead of the tail after the push. This means we will
80 * either continue right away, or we will sleep waiting on the async thread to
81 * do its work.
82 *
83 * We do this unlocked - we only need to know whether there is anything in the
84 * AIL at the time we are called. We don't need to access the contents of
85 * any of the objects, so the lock is not needed.
86 */ 148 */
87void 149static xfs_lsn_t
88xfs_trans_ail_push( 150xfs_ail_max_lsn(
89 struct xfs_ail *ailp, 151 struct xfs_ail *ailp)
90 xfs_lsn_t threshold_lsn)
91{ 152{
92 xfs_log_item_t *lip; 153 xfs_lsn_t lsn = 0;
154 xfs_log_item_t *lip;
93 155
94 lip = xfs_ail_min(ailp); 156 spin_lock(&ailp->xa_lock);
95 if (lip && !XFS_FORCED_SHUTDOWN(ailp->xa_mount)) { 157 lip = xfs_ail_max(ailp);
96 if (XFS_LSN_CMP(threshold_lsn, ailp->xa_target) > 0) 158 if (lip)
97 xfsaild_wakeup(ailp, threshold_lsn); 159 lsn = lip->li_lsn;
98 } 160 spin_unlock(&ailp->xa_lock);
161
162 return lsn;
99} 163}
100 164
101/* 165/*
@@ -236,16 +300,57 @@ out:
236} 300}
237 301
238/* 302/*
239 * xfsaild_push does the work of pushing on the AIL. Returning a timeout of 303 * splice the log item list into the AIL at the given LSN.
240 * zero indicates that the caller should sleep until woken.
241 */ 304 */
242long 305static void
243xfsaild_push( 306xfs_ail_splice(
244 struct xfs_ail *ailp, 307 struct xfs_ail *ailp,
245 xfs_lsn_t *last_lsn) 308 struct list_head *list,
309 xfs_lsn_t lsn)
310{
311 xfs_log_item_t *next_lip;
312
313 /* If the list is empty, just insert the item. */
314 if (list_empty(&ailp->xa_ail)) {
315 list_splice(list, &ailp->xa_ail);
316 return;
317 }
318
319 list_for_each_entry_reverse(next_lip, &ailp->xa_ail, li_ail) {
320 if (XFS_LSN_CMP(next_lip->li_lsn, lsn) <= 0)
321 break;
322 }
323
324 ASSERT(&next_lip->li_ail == &ailp->xa_ail ||
325 XFS_LSN_CMP(next_lip->li_lsn, lsn) <= 0);
326
327 list_splice_init(list, &next_lip->li_ail);
328}
329
330/*
331 * Delete the given item from the AIL. Return a pointer to the item.
332 */
333static void
334xfs_ail_delete(
335 struct xfs_ail *ailp,
336 xfs_log_item_t *lip)
246{ 337{
247 long tout = 0; 338 xfs_ail_check(ailp, lip);
248 xfs_lsn_t last_pushed_lsn = *last_lsn; 339 list_del(&lip->li_ail);
340 xfs_trans_ail_cursor_clear(ailp, lip);
341}
342
343/*
344 * xfs_ail_worker does the work of pushing on the AIL. It will requeue itself
345 * to run at a later time if there is more work to do to complete the push.
346 */
347STATIC void
348xfs_ail_worker(
349 struct work_struct *work)
350{
351 struct xfs_ail *ailp = container_of(to_delayed_work(work),
352 struct xfs_ail, xa_work);
353 long tout;
249 xfs_lsn_t target = ailp->xa_target; 354 xfs_lsn_t target = ailp->xa_target;
250 xfs_lsn_t lsn; 355 xfs_lsn_t lsn;
251 xfs_log_item_t *lip; 356 xfs_log_item_t *lip;
@@ -256,15 +361,15 @@ xfsaild_push(
256 361
257 spin_lock(&ailp->xa_lock); 362 spin_lock(&ailp->xa_lock);
258 xfs_trans_ail_cursor_init(ailp, cur); 363 xfs_trans_ail_cursor_init(ailp, cur);
259 lip = xfs_trans_ail_cursor_first(ailp, cur, *last_lsn); 364 lip = xfs_trans_ail_cursor_first(ailp, cur, ailp->xa_last_pushed_lsn);
260 if (!lip || XFS_FORCED_SHUTDOWN(mp)) { 365 if (!lip || XFS_FORCED_SHUTDOWN(mp)) {
261 /* 366 /*
262 * AIL is empty or our push has reached the end. 367 * AIL is empty or our push has reached the end.
263 */ 368 */
264 xfs_trans_ail_cursor_done(ailp, cur); 369 xfs_trans_ail_cursor_done(ailp, cur);
265 spin_unlock(&ailp->xa_lock); 370 spin_unlock(&ailp->xa_lock);
266 *last_lsn = 0; 371 ailp->xa_last_pushed_lsn = 0;
267 return tout; 372 return;
268 } 373 }
269 374
270 XFS_STATS_INC(xs_push_ail); 375 XFS_STATS_INC(xs_push_ail);
@@ -301,13 +406,13 @@ xfsaild_push(
301 case XFS_ITEM_SUCCESS: 406 case XFS_ITEM_SUCCESS:
302 XFS_STATS_INC(xs_push_ail_success); 407 XFS_STATS_INC(xs_push_ail_success);
303 IOP_PUSH(lip); 408 IOP_PUSH(lip);
304 last_pushed_lsn = lsn; 409 ailp->xa_last_pushed_lsn = lsn;
305 break; 410 break;
306 411
307 case XFS_ITEM_PUSHBUF: 412 case XFS_ITEM_PUSHBUF:
308 XFS_STATS_INC(xs_push_ail_pushbuf); 413 XFS_STATS_INC(xs_push_ail_pushbuf);
309 IOP_PUSHBUF(lip); 414 IOP_PUSHBUF(lip);
310 last_pushed_lsn = lsn; 415 ailp->xa_last_pushed_lsn = lsn;
311 push_xfsbufd = 1; 416 push_xfsbufd = 1;
312 break; 417 break;
313 418
@@ -319,7 +424,7 @@ xfsaild_push(
319 424
320 case XFS_ITEM_LOCKED: 425 case XFS_ITEM_LOCKED:
321 XFS_STATS_INC(xs_push_ail_locked); 426 XFS_STATS_INC(xs_push_ail_locked);
322 last_pushed_lsn = lsn; 427 ailp->xa_last_pushed_lsn = lsn;
323 stuck++; 428 stuck++;
324 break; 429 break;
325 430
@@ -374,9 +479,23 @@ xfsaild_push(
374 wake_up_process(mp->m_ddev_targp->bt_task); 479 wake_up_process(mp->m_ddev_targp->bt_task);
375 } 480 }
376 481
482 /* assume we have more work to do in a short while */
483 tout = 10;
377 if (!count) { 484 if (!count) {
378 /* We're past our target or empty, so idle */ 485 /* We're past our target or empty, so idle */
379 last_pushed_lsn = 0; 486 ailp->xa_last_pushed_lsn = 0;
487
488 /*
489 * Check for an updated push target before clearing the
490 * XFS_AIL_PUSHING_BIT. If the target changed, we've got more
491 * work to do. Wait a bit longer before starting that work.
492 */
493 smp_rmb();
494 if (ailp->xa_target == target) {
495 clear_bit(XFS_AIL_PUSHING_BIT, &ailp->xa_flags);
496 return;
497 }
498 tout = 50;
380 } else if (XFS_LSN_CMP(lsn, target) >= 0) { 499 } else if (XFS_LSN_CMP(lsn, target) >= 0) {
381 /* 500 /*
382 * We reached the target so wait a bit longer for I/O to 501 * We reached the target so wait a bit longer for I/O to
@@ -384,7 +503,7 @@ xfsaild_push(
384 * start the next scan from the start of the AIL. 503 * start the next scan from the start of the AIL.
385 */ 504 */
386 tout = 50; 505 tout = 50;
387 last_pushed_lsn = 0; 506 ailp->xa_last_pushed_lsn = 0;
388 } else if ((stuck * 100) / count > 90) { 507 } else if ((stuck * 100) / count > 90) {
389 /* 508 /*
390 * Either there is a lot of contention on the AIL or we 509 * Either there is a lot of contention on the AIL or we
@@ -396,14 +515,61 @@ xfsaild_push(
396 * continuing from where we were. 515 * continuing from where we were.
397 */ 516 */
398 tout = 20; 517 tout = 20;
399 } else {
400 /* more to do, but wait a short while before continuing */
401 tout = 10;
402 } 518 }
403 *last_lsn = last_pushed_lsn; 519
404 return tout; 520 /* There is more to do, requeue us. */
521 queue_delayed_work(xfs_syncd_wq, &ailp->xa_work,
522 msecs_to_jiffies(tout));
405} 523}
406 524
525/*
526 * This routine is called to move the tail of the AIL forward. It does this by
527 * trying to flush items in the AIL whose lsns are below the given
528 * threshold_lsn.
529 *
530 * The push is run asynchronously in a workqueue, which means the caller needs
531 * to handle waiting on the async flush for space to become available.
532 * We don't want to interrupt any push that is in progress, hence we only queue
 533 * work if we set the pushing bit appropriately.
534 *
535 * We do this unlocked - we only need to know whether there is anything in the
536 * AIL at the time we are called. We don't need to access the contents of
537 * any of the objects, so the lock is not needed.
538 */
539void
540xfs_ail_push(
541 struct xfs_ail *ailp,
542 xfs_lsn_t threshold_lsn)
543{
544 xfs_log_item_t *lip;
545
546 lip = xfs_ail_min(ailp);
547 if (!lip || XFS_FORCED_SHUTDOWN(ailp->xa_mount) ||
548 XFS_LSN_CMP(threshold_lsn, ailp->xa_target) <= 0)
549 return;
550
551 /*
552 * Ensure that the new target is noticed in push code before it clears
553 * the XFS_AIL_PUSHING_BIT.
554 */
555 smp_wmb();
556 ailp->xa_target = threshold_lsn;
557 if (!test_and_set_bit(XFS_AIL_PUSHING_BIT, &ailp->xa_flags))
558 queue_delayed_work(xfs_syncd_wq, &ailp->xa_work, 0);
559}
560
561/*
562 * Push out all items in the AIL immediately
563 */
564void
565xfs_ail_push_all(
566 struct xfs_ail *ailp)
567{
568 xfs_lsn_t threshold_lsn = xfs_ail_max_lsn(ailp);
569
570 if (threshold_lsn)
571 xfs_ail_push(ailp, threshold_lsn);
572}
407 573
408/* 574/*
409 * This is to be called when an item is unlocked that may have 575 * This is to be called when an item is unlocked that may have
@@ -449,129 +615,152 @@ xfs_trans_unlocked_item(
449 xfs_log_move_tail(ailp->xa_mount, 1); 615 xfs_log_move_tail(ailp->xa_mount, 1);
450} /* xfs_trans_unlocked_item */ 616} /* xfs_trans_unlocked_item */
451 617
452
453/* 618/*
454 * Update the position of the item in the AIL with the new 619 * xfs_trans_ail_update - bulk AIL insertion operation.
455 * lsn. If it is not yet in the AIL, add it. Otherwise, move 620 *
456 * it to its new position by removing it and re-adding it. 621 * @xfs_trans_ail_update takes an array of log items that all need to be
622 * positioned at the same LSN in the AIL. If an item is not in the AIL, it will
623 * be added. Otherwise, it will be repositioned by removing it and re-adding
624 * it to the AIL. If we move the first item in the AIL, update the log tail to
625 * match the new minimum LSN in the AIL.
626 *
627 * This function takes the AIL lock once to execute the update operations on
628 * all the items in the array, and as such should not be called with the AIL
629 * lock held. As a result, once we have the AIL lock, we need to check each log
630 * item LSN to confirm it needs to be moved forward in the AIL.
457 * 631 *
458 * Wakeup anyone with an lsn less than the item's lsn. If the item 632 * To optimise the insert operation, we delete all the items from the AIL in
459 * we move in the AIL is the minimum one, update the tail lsn in the 633 * the first pass, moving them into a temporary list, then splice the temporary
460 * log manager. 634 * list into the correct position in the AIL. This avoids needing to do an
635 * insert operation on every item.
461 * 636 *
462 * This function must be called with the AIL lock held. The lock 637 * This function must be called with the AIL lock held. The lock is dropped
463 * is dropped before returning. 638 * before returning.
464 */ 639 */
465void 640void
466xfs_trans_ail_update( 641xfs_trans_ail_update_bulk(
467 struct xfs_ail *ailp, 642 struct xfs_ail *ailp,
468 xfs_log_item_t *lip, 643 struct xfs_log_item **log_items,
469 xfs_lsn_t lsn) __releases(ailp->xa_lock) 644 int nr_items,
645 xfs_lsn_t lsn) __releases(ailp->xa_lock)
470{ 646{
471 xfs_log_item_t *dlip = NULL; 647 xfs_log_item_t *mlip;
472 xfs_log_item_t *mlip; /* ptr to minimum lip */
473 xfs_lsn_t tail_lsn; 648 xfs_lsn_t tail_lsn;
649 int mlip_changed = 0;
650 int i;
651 LIST_HEAD(tmp);
474 652
475 mlip = xfs_ail_min(ailp); 653 mlip = xfs_ail_min(ailp);
476 654
477 if (lip->li_flags & XFS_LI_IN_AIL) { 655 for (i = 0; i < nr_items; i++) {
478 dlip = xfs_ail_delete(ailp, lip); 656 struct xfs_log_item *lip = log_items[i];
479 ASSERT(dlip == lip); 657 if (lip->li_flags & XFS_LI_IN_AIL) {
480 xfs_trans_ail_cursor_clear(ailp, dlip); 658 /* check if we really need to move the item */
481 } else { 659 if (XFS_LSN_CMP(lsn, lip->li_lsn) <= 0)
482 lip->li_flags |= XFS_LI_IN_AIL; 660 continue;
661
662 xfs_ail_delete(ailp, lip);
663 if (mlip == lip)
664 mlip_changed = 1;
665 } else {
666 lip->li_flags |= XFS_LI_IN_AIL;
667 }
668 lip->li_lsn = lsn;
669 list_add(&lip->li_ail, &tmp);
483 } 670 }
484 671
485 lip->li_lsn = lsn; 672 xfs_ail_splice(ailp, &tmp, lsn);
486 xfs_ail_insert(ailp, lip);
487 673
488 if (mlip == dlip) { 674 if (!mlip_changed) {
489 mlip = xfs_ail_min(ailp);
490 /*
491 * It is not safe to access mlip after the AIL lock is
492 * dropped, so we must get a copy of li_lsn before we do
493 * so. This is especially important on 32-bit platforms
494 * where accessing and updating 64-bit values like li_lsn
495 * is not atomic.
496 */
497 tail_lsn = mlip->li_lsn;
498 spin_unlock(&ailp->xa_lock);
499 xfs_log_move_tail(ailp->xa_mount, tail_lsn);
500 } else {
501 spin_unlock(&ailp->xa_lock); 675 spin_unlock(&ailp->xa_lock);
676 return;
502 } 677 }
503 678
504 679 /*
505} /* xfs_trans_update_ail */ 680 * It is not safe to access mlip after the AIL lock is dropped, so we
681 * must get a copy of li_lsn before we do so. This is especially
682 * important on 32-bit platforms where accessing and updating 64-bit
683 * values like li_lsn is not atomic.
684 */
685 mlip = xfs_ail_min(ailp);
686 tail_lsn = mlip->li_lsn;
687 spin_unlock(&ailp->xa_lock);
688 xfs_log_move_tail(ailp->xa_mount, tail_lsn);
689}
506 690
507/* 691/*
508 * Delete the given item from the AIL. It must already be in 692 * xfs_trans_ail_delete_bulk - remove multiple log items from the AIL
509 * the AIL. 693 *
 694 * @xfs_trans_ail_delete_bulk takes an array of log items that all need to be
 695 * removed from the AIL. The caller is already holding the AIL lock, and has done
696 * all the checks necessary to ensure the items passed in via @log_items are
697 * ready for deletion. This includes checking that the items are in the AIL.
510 * 698 *
511 * Wakeup anyone with an lsn less than item's lsn. If the item 699 * For each log item to be removed, unlink it from the AIL, clear the IN_AIL
512 * we delete in the AIL is the minimum one, update the tail lsn in the 700 * flag from the item and reset the item's lsn to 0. If we remove the first
513 * log manager. 701 * item in the AIL, update the log tail to match the new minimum LSN in the
702 * AIL.
514 * 703 *
515 * Clear the IN_AIL flag from the item, reset its lsn to 0, and 704 * This function will not drop the AIL lock until all items are removed from
516 * bump the AIL's generation count to indicate that the tree 705 * the AIL to minimise the amount of lock traffic on the AIL. This does not
517 * has changed. 706 * greatly increase the AIL hold time, but does significantly reduce the amount
707 * of traffic on the lock, especially during IO completion.
518 * 708 *
519 * This function must be called with the AIL lock held. The lock 709 * This function must be called with the AIL lock held. The lock is dropped
520 * is dropped before returning. 710 * before returning.
521 */ 711 */
522void 712void
523xfs_trans_ail_delete( 713xfs_trans_ail_delete_bulk(
524 struct xfs_ail *ailp, 714 struct xfs_ail *ailp,
525 xfs_log_item_t *lip) __releases(ailp->xa_lock) 715 struct xfs_log_item **log_items,
716 int nr_items) __releases(ailp->xa_lock)
526{ 717{
527 xfs_log_item_t *dlip;
528 xfs_log_item_t *mlip; 718 xfs_log_item_t *mlip;
529 xfs_lsn_t tail_lsn; 719 xfs_lsn_t tail_lsn;
720 int mlip_changed = 0;
721 int i;
530 722
531 if (lip->li_flags & XFS_LI_IN_AIL) { 723 mlip = xfs_ail_min(ailp);
532 mlip = xfs_ail_min(ailp);
533 dlip = xfs_ail_delete(ailp, lip);
534 ASSERT(dlip == lip);
535 xfs_trans_ail_cursor_clear(ailp, dlip);
536 724
725 for (i = 0; i < nr_items; i++) {
726 struct xfs_log_item *lip = log_items[i];
727 if (!(lip->li_flags & XFS_LI_IN_AIL)) {
728 struct xfs_mount *mp = ailp->xa_mount;
537 729
538 lip->li_flags &= ~XFS_LI_IN_AIL;
539 lip->li_lsn = 0;
540
541 if (mlip == dlip) {
542 mlip = xfs_ail_min(ailp);
543 /*
544 * It is not safe to access mlip after the AIL lock
545 * is dropped, so we must get a copy of li_lsn
546 * before we do so. This is especially important
547 * on 32-bit platforms where accessing and updating
548 * 64-bit values like li_lsn is not atomic.
549 */
550 tail_lsn = mlip ? mlip->li_lsn : 0;
551 spin_unlock(&ailp->xa_lock);
552 xfs_log_move_tail(ailp->xa_mount, tail_lsn);
553 } else {
554 spin_unlock(&ailp->xa_lock); 730 spin_unlock(&ailp->xa_lock);
731 if (!XFS_FORCED_SHUTDOWN(mp)) {
732 xfs_alert_tag(mp, XFS_PTAG_AILDELETE,
733 "%s: attempting to delete a log item that is not in the AIL",
734 __func__);
735 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
736 }
737 return;
555 } 738 }
739
740 xfs_ail_delete(ailp, lip);
741 lip->li_flags &= ~XFS_LI_IN_AIL;
742 lip->li_lsn = 0;
743 if (mlip == lip)
744 mlip_changed = 1;
556 } 745 }
557 else {
558 /*
559 * If the file system is not being shutdown, we are in
560 * serious trouble if we get to this stage.
561 */
562 struct xfs_mount *mp = ailp->xa_mount;
563 746
747 if (!mlip_changed) {
564 spin_unlock(&ailp->xa_lock); 748 spin_unlock(&ailp->xa_lock);
565 if (!XFS_FORCED_SHUTDOWN(mp)) { 749 return;
566 xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp,
567 "%s: attempting to delete a log item that is not in the AIL",
568 __func__);
569 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
570 }
571 } 750 }
572}
573
574 751
752 /*
753 * It is not safe to access mlip after the AIL lock is dropped, so we
754 * must get a copy of li_lsn before we do so. This is especially
755 * important on 32-bit platforms where accessing and updating 64-bit
756 * values like li_lsn is not atomic. It is possible we've emptied the
757 * AIL here, so if that is the case, pass an LSN of 0 to the tail move.
758 */
759 mlip = xfs_ail_min(ailp);
760 tail_lsn = mlip ? mlip->li_lsn : 0;
761 spin_unlock(&ailp->xa_lock);
762 xfs_log_move_tail(ailp->xa_mount, tail_lsn);
763}
575 764
576/* 765/*
577 * The active item list (AIL) is a doubly linked list of log 766 * The active item list (AIL) is a doubly linked list of log
@@ -592,7 +781,6 @@ xfs_trans_ail_init(
592 xfs_mount_t *mp) 781 xfs_mount_t *mp)
593{ 782{
594 struct xfs_ail *ailp; 783 struct xfs_ail *ailp;
595 int error;
596 784
597 ailp = kmem_zalloc(sizeof(struct xfs_ail), KM_MAYFAIL); 785 ailp = kmem_zalloc(sizeof(struct xfs_ail), KM_MAYFAIL);
598 if (!ailp) 786 if (!ailp)
@@ -601,15 +789,9 @@ xfs_trans_ail_init(
601 ailp->xa_mount = mp; 789 ailp->xa_mount = mp;
602 INIT_LIST_HEAD(&ailp->xa_ail); 790 INIT_LIST_HEAD(&ailp->xa_ail);
603 spin_lock_init(&ailp->xa_lock); 791 spin_lock_init(&ailp->xa_lock);
604 error = xfsaild_start(ailp); 792 INIT_DELAYED_WORK(&ailp->xa_work, xfs_ail_worker);
605 if (error)
606 goto out_free_ailp;
607 mp->m_ail = ailp; 793 mp->m_ail = ailp;
608 return 0; 794 return 0;
609
610out_free_ailp:
611 kmem_free(ailp);
612 return error;
613} 795}
614 796
615void 797void
@@ -618,135 +800,6 @@ xfs_trans_ail_destroy(
618{ 800{
619 struct xfs_ail *ailp = mp->m_ail; 801 struct xfs_ail *ailp = mp->m_ail;
620 802
621 xfsaild_stop(ailp); 803 cancel_delayed_work_sync(&ailp->xa_work);
622 kmem_free(ailp); 804 kmem_free(ailp);
623} 805}
624
625/*
626 * Insert the given log item into the AIL.
627 * We almost always insert at the end of the list, so on inserts
628 * we search from the end of the list to find where the
629 * new item belongs.
630 */
631STATIC void
632xfs_ail_insert(
633 struct xfs_ail *ailp,
634 xfs_log_item_t *lip)
635/* ARGSUSED */
636{
637 xfs_log_item_t *next_lip;
638
639 /*
640 * If the list is empty, just insert the item.
641 */
642 if (list_empty(&ailp->xa_ail)) {
643 list_add(&lip->li_ail, &ailp->xa_ail);
644 return;
645 }
646
647 list_for_each_entry_reverse(next_lip, &ailp->xa_ail, li_ail) {
648 if (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) <= 0)
649 break;
650 }
651
652 ASSERT((&next_lip->li_ail == &ailp->xa_ail) ||
653 (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) <= 0));
654
655 list_add(&lip->li_ail, &next_lip->li_ail);
656
657 xfs_ail_check(ailp, lip);
658 return;
659}
660
661/*
662 * Delete the given item from the AIL. Return a pointer to the item.
663 */
664/*ARGSUSED*/
665STATIC xfs_log_item_t *
666xfs_ail_delete(
667 struct xfs_ail *ailp,
668 xfs_log_item_t *lip)
669/* ARGSUSED */
670{
671 xfs_ail_check(ailp, lip);
672
673 list_del(&lip->li_ail);
674
675 return lip;
676}
677
678/*
679 * Return a pointer to the first item in the AIL.
680 * If the AIL is empty, then return NULL.
681 */
682STATIC xfs_log_item_t *
683xfs_ail_min(
684 struct xfs_ail *ailp)
685/* ARGSUSED */
686{
687 if (list_empty(&ailp->xa_ail))
688 return NULL;
689
690 return list_first_entry(&ailp->xa_ail, xfs_log_item_t, li_ail);
691}
692
693/*
694 * Return a pointer to the item which follows
695 * the given item in the AIL. If the given item
696 * is the last item in the list, then return NULL.
697 */
698STATIC xfs_log_item_t *
699xfs_ail_next(
700 struct xfs_ail *ailp,
701 xfs_log_item_t *lip)
702/* ARGSUSED */
703{
704 if (lip->li_ail.next == &ailp->xa_ail)
705 return NULL;
706
707 return list_first_entry(&lip->li_ail, xfs_log_item_t, li_ail);
708}
709
710#ifdef DEBUG
711/*
712 * Check that the list is sorted as it should be.
713 */
714STATIC void
715xfs_ail_check(
716 struct xfs_ail *ailp,
717 xfs_log_item_t *lip)
718{
719 xfs_log_item_t *prev_lip;
720
721 if (list_empty(&ailp->xa_ail))
722 return;
723
724 /*
725 * Check the next and previous entries are valid.
726 */
727 ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0);
728 prev_lip = list_entry(lip->li_ail.prev, xfs_log_item_t, li_ail);
729 if (&prev_lip->li_ail != &ailp->xa_ail)
730 ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0);
731
732 prev_lip = list_entry(lip->li_ail.next, xfs_log_item_t, li_ail);
733 if (&prev_lip->li_ail != &ailp->xa_ail)
734 ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) >= 0);
735
736
737#ifdef XFS_TRANS_DEBUG
738 /*
739 * Walk the list checking lsn ordering, and that every entry has the
740 * XFS_LI_IN_AIL flag set. This is really expensive, so only do it
741 * when specifically debugging the transaction subsystem.
742 */
743 prev_lip = list_entry(&ailp->xa_ail, xfs_log_item_t, li_ail);
744 list_for_each_entry(lip, &ailp->xa_ail, li_ail) {
745 if (&prev_lip->li_ail != &ailp->xa_ail)
746 ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0);
747 ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0);
748 prev_lip = lip;
749 }
750#endif /* XFS_TRANS_DEBUG */
751}
752#endif /* DEBUG */
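
The xfs_trans_ail.c rewrite does three related things: it replaces the dedicated xfsaild thread with a self-requeuing delayed work item (ailp->xa_work, run from xfs_syncd_wq), it turns the single-item AIL update/delete entry points into bulk operations that splice a pre-built temporary list into the AIL with xfs_ail_splice() so the lock is taken once per batch, and it moves the push target and last-pushed LSN into struct xfs_ail. The idle/wakeup handshake relies on XFS_AIL_PUSHING_BIT and the smp_wmb()/smp_rmb() pair so that a target raised by xfs_ail_push() just as the worker goes idle is not lost. A condensed sketch of that handshake, using the fields added to struct xfs_ail in the xfs_trans_priv.h hunks below:

	/* producer: publish the new target, then kick the worker if idle */
	smp_wmb();
	ailp->xa_target = threshold_lsn;
	if (!test_and_set_bit(XFS_AIL_PUSHING_BIT, &ailp->xa_flags))
		queue_delayed_work(xfs_syncd_wq, &ailp->xa_work, 0);

	/* worker, after running out of work: only go idle if the target
	 * has not moved since it was sampled at the start of the pass */
	smp_rmb();
	if (ailp->xa_target == target) {
		clear_bit(XFS_AIL_PUSHING_BIT, &ailp->xa_flags);
		return;
	}
	tout = 50;	/* target moved; requeue and keep pushing */
	queue_delayed_work(xfs_syncd_wq, &ailp->xa_work, msecs_to_jiffies(tout));
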
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index c47918c302a5..03b3b7f85a3b 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -305,7 +305,7 @@ xfs_trans_read_buf(
305 if (xfs_error_target == target) { 305 if (xfs_error_target == target) {
306 if (((xfs_req_num++) % xfs_error_mod) == 0) { 306 if (((xfs_req_num++) % xfs_error_mod) == 0) {
307 xfs_buf_relse(bp); 307 xfs_buf_relse(bp);
308 cmn_err(CE_DEBUG, "Returning error!\n"); 308 xfs_debug(mp, "Returning error!");
309 return XFS_ERROR(EIO); 309 return XFS_ERROR(EIO);
310 } 310 }
311 } 311 }
@@ -383,7 +383,8 @@ xfs_trans_read_buf(
383 bp = xfs_buf_read(target, blkno, len, flags | XBF_DONT_BLOCK); 383 bp = xfs_buf_read(target, blkno, len, flags | XBF_DONT_BLOCK);
384 if (bp == NULL) { 384 if (bp == NULL) {
385 *bpp = NULL; 385 *bpp = NULL;
386 return 0; 386 return (flags & XBF_TRYLOCK) ?
387 0 : XFS_ERROR(ENOMEM);
387 } 388 }
388 if (XFS_BUF_GETERROR(bp) != 0) { 389 if (XFS_BUF_GETERROR(bp) != 0) {
389 XFS_BUF_SUPER_STALE(bp); 390 XFS_BUF_SUPER_STALE(bp);
@@ -403,7 +404,7 @@ xfs_trans_read_buf(
403 xfs_force_shutdown(tp->t_mountp, 404 xfs_force_shutdown(tp->t_mountp,
404 SHUTDOWN_META_IO_ERROR); 405 SHUTDOWN_META_IO_ERROR);
405 xfs_buf_relse(bp); 406 xfs_buf_relse(bp);
406 cmn_err(CE_DEBUG, "Returning trans error!\n"); 407 xfs_debug(mp, "Returning trans error!");
407 return XFS_ERROR(EIO); 408 return XFS_ERROR(EIO);
408 } 409 }
409 } 410 }
@@ -427,7 +428,7 @@ shutdown_abort:
427 */ 428 */
428#if defined(DEBUG) 429#if defined(DEBUG)
429 if (XFS_BUF_ISSTALE(bp) && XFS_BUF_ISDELAYWRITE(bp)) 430 if (XFS_BUF_ISSTALE(bp) && XFS_BUF_ISDELAYWRITE(bp))
430 cmn_err(CE_NOTE, "about to pop assert, bp == 0x%p", bp); 431 xfs_notice(mp, "about to pop assert, bp == 0x%p", bp);
431#endif 432#endif
432 ASSERT((XFS_BUF_BFLAGS(bp) & (XBF_STALE|XBF_DELWRI)) != 433 ASSERT((XFS_BUF_BFLAGS(bp) & (XBF_STALE|XBF_DELWRI)) !=
433 (XBF_STALE|XBF_DELWRI)); 434 (XBF_STALE|XBF_DELWRI));
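
In xfs_trans_read_buf() a NULL buffer from xfs_buf_read() is now reported as an error (ENOMEM) unless the caller asked for a non-blocking read; with XBF_TRYLOCK the old behaviour of returning 0 with a NULL buffer is kept, meaning "lock not available, try again later". A caller-side sketch; the argument list is assumed from the calls visible in this hunk rather than quoted from the header:

	error = xfs_trans_read_buf(mp, tp, target, blkno, len,
				   XBF_TRYLOCK, &bp);
	if (error)
		return error;	/* genuine I/O or allocation failure */
	if (!bp)
		return 0;	/* trylock missed; retry on a later pass */
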
diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c
index f783d5e9fa70..f7590f5badea 100644
--- a/fs/xfs/xfs_trans_extfree.c
+++ b/fs/xfs/xfs_trans_extfree.c
@@ -69,12 +69,16 @@ xfs_trans_log_efi_extent(xfs_trans_t *tp,
69 tp->t_flags |= XFS_TRANS_DIRTY; 69 tp->t_flags |= XFS_TRANS_DIRTY;
70 efip->efi_item.li_desc->lid_flags |= XFS_LID_DIRTY; 70 efip->efi_item.li_desc->lid_flags |= XFS_LID_DIRTY;
71 71
72 next_extent = efip->efi_next_extent; 72 /*
73 * atomic_inc_return gives us the value after the increment;
74 * we want to use it as an array index so we need to subtract 1 from
75 * it.
76 */
77 next_extent = atomic_inc_return(&efip->efi_next_extent) - 1;
73 ASSERT(next_extent < efip->efi_format.efi_nextents); 78 ASSERT(next_extent < efip->efi_format.efi_nextents);
74 extp = &(efip->efi_format.efi_extents[next_extent]); 79 extp = &(efip->efi_format.efi_extents[next_extent]);
75 extp->ext_start = start_block; 80 extp->ext_start = start_block;
76 extp->ext_len = ext_len; 81 extp->ext_len = ext_len;
77 efip->efi_next_extent++;
78} 82}
79 83
80 84
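
The xfs_trans_log_efi_extent() change lets concurrent callers claim extent slots without serialising on a lock: efi_next_extent is treated as an atomic counter (the matching type change lives in xfs_extfree_item.h, per the diffstat), and since atomic_inc_return() yields the post-increment value, subtracting one turns it into the zero-based index of the slot just reserved. The same idiom in isolation:

	/* reserve the next slot: post-increment value minus 1 is our index */
	next_extent = atomic_inc_return(&efip->efi_next_extent) - 1;
	ASSERT(next_extent < efip->efi_format.efi_nextents);
	extp = &efip->efi_format.efi_extents[next_extent];
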
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index ccb34532768b..048b0c689d3e 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -44,28 +44,6 @@ xfs_trans_inode_broot_debug(
44#endif 44#endif
45 45
46/* 46/*
47 * Get an inode and join it to the transaction.
48 */
49int
50xfs_trans_iget(
51 xfs_mount_t *mp,
52 xfs_trans_t *tp,
53 xfs_ino_t ino,
54 uint flags,
55 uint lock_flags,
56 xfs_inode_t **ipp)
57{
58 int error;
59
60 error = xfs_iget(mp, tp, ino, flags, lock_flags, ipp);
61 if (!error && tp) {
62 xfs_trans_ijoin(tp, *ipp);
63 (*ipp)->i_itemp->ili_lock_flags = lock_flags;
64 }
65 return error;
66}
67
68/*
69 * Add a locked inode to the transaction. 47 * Add a locked inode to the transaction.
70 * 48 *
71 * The inode must be locked, and it cannot be associated with any transaction. 49 * The inode must be locked, and it cannot be associated with any transaction.
@@ -103,7 +81,7 @@ xfs_trans_ijoin(
103 * 81 *
104 * 82 *
105 * Grabs a reference to the inode which will be dropped when the transaction 83 * Grabs a reference to the inode which will be dropped when the transaction
106 * is commited. The inode will also be unlocked at that point. The inode 84 * is committed. The inode will also be unlocked at that point. The inode
107 * must be locked, and it cannot be associated with any transaction. 85 * must be locked, and it cannot be associated with any transaction.
108 */ 86 */
109void 87void
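
With xfs_trans_iget() deleted here (and its prototype removed from xfs_trans.h above), callers perform the inode lookup and the transaction join as two explicit steps; the removed wrapper body shows what that amounts to. A minimal open-coded equivalent built from the same calls the deleted function made (the rtalloc hunks earlier avoid even the lookup by using the mount's cached bitmap inode with xfs_trans_ijoin_ref()):

	error = xfs_iget(mp, tp, ino, 0, XFS_ILOCK_EXCL, &ip);
	if (error)
		return error;
	/* inode is returned locked; attach it to the transaction */
	xfs_trans_ijoin(tp, ip);
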
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 62da86c90de5..6b164e9e9a1f 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -22,15 +22,17 @@ struct xfs_log_item;
22struct xfs_log_item_desc; 22struct xfs_log_item_desc;
23struct xfs_mount; 23struct xfs_mount;
24struct xfs_trans; 24struct xfs_trans;
25struct xfs_ail;
26struct xfs_log_vec;
25 27
26void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *); 28void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *);
27void xfs_trans_del_item(struct xfs_log_item *); 29void xfs_trans_del_item(struct xfs_log_item *);
28void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn, 30void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn,
29 int flags); 31 int flags);
30void xfs_trans_item_committed(struct xfs_log_item *lip,
31 xfs_lsn_t commit_lsn, int aborted);
32void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp); 32void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp);
33 33
34void xfs_trans_committed_bulk(struct xfs_ail *ailp, struct xfs_log_vec *lv,
35 xfs_lsn_t commit_lsn, int aborted);
34/* 36/*
35 * AIL traversal cursor. 37 * AIL traversal cursor.
36 * 38 *
@@ -63,28 +65,52 @@ struct xfs_ail_cursor {
63struct xfs_ail { 65struct xfs_ail {
64 struct xfs_mount *xa_mount; 66 struct xfs_mount *xa_mount;
65 struct list_head xa_ail; 67 struct list_head xa_ail;
66 uint xa_gen;
67 struct task_struct *xa_task;
68 xfs_lsn_t xa_target; 68 xfs_lsn_t xa_target;
69 struct xfs_ail_cursor xa_cursors; 69 struct xfs_ail_cursor xa_cursors;
70 spinlock_t xa_lock; 70 spinlock_t xa_lock;
71 struct delayed_work xa_work;
72 xfs_lsn_t xa_last_pushed_lsn;
73 unsigned long xa_flags;
71}; 74};
72 75
76#define XFS_AIL_PUSHING_BIT 0
77
73/* 78/*
74 * From xfs_trans_ail.c 79 * From xfs_trans_ail.c
75 */ 80 */
76void xfs_trans_ail_update(struct xfs_ail *ailp, 81
77 struct xfs_log_item *lip, xfs_lsn_t lsn) 82extern struct workqueue_struct *xfs_ail_wq; /* AIL workqueue */
78 __releases(ailp->xa_lock); 83
79void xfs_trans_ail_delete(struct xfs_ail *ailp, 84void xfs_trans_ail_update_bulk(struct xfs_ail *ailp,
80 struct xfs_log_item *lip) 85 struct xfs_log_item **log_items, int nr_items,
81 __releases(ailp->xa_lock); 86 xfs_lsn_t lsn) __releases(ailp->xa_lock);
82void xfs_trans_ail_push(struct xfs_ail *, xfs_lsn_t); 87static inline void
88xfs_trans_ail_update(
89 struct xfs_ail *ailp,
90 struct xfs_log_item *lip,
91 xfs_lsn_t lsn) __releases(ailp->xa_lock)
92{
93 xfs_trans_ail_update_bulk(ailp, &lip, 1, lsn);
94}
95
96void xfs_trans_ail_delete_bulk(struct xfs_ail *ailp,
97 struct xfs_log_item **log_items, int nr_items)
98 __releases(ailp->xa_lock);
99static inline void
100xfs_trans_ail_delete(
101 struct xfs_ail *ailp,
102 xfs_log_item_t *lip) __releases(ailp->xa_lock)
103{
104 xfs_trans_ail_delete_bulk(ailp, &lip, 1);
105}
106
107void xfs_ail_push(struct xfs_ail *, xfs_lsn_t);
108void xfs_ail_push_all(struct xfs_ail *);
109xfs_lsn_t xfs_ail_min_lsn(struct xfs_ail *ailp);
110
83void xfs_trans_unlocked_item(struct xfs_ail *, 111void xfs_trans_unlocked_item(struct xfs_ail *,
84 xfs_log_item_t *); 112 xfs_log_item_t *);
85 113
86xfs_lsn_t xfs_trans_ail_tail(struct xfs_ail *ailp);
87
88struct xfs_log_item *xfs_trans_ail_cursor_first(struct xfs_ail *ailp, 114struct xfs_log_item *xfs_trans_ail_cursor_first(struct xfs_ail *ailp,
89 struct xfs_ail_cursor *cur, 115 struct xfs_ail_cursor *cur,
90 xfs_lsn_t lsn); 116 xfs_lsn_t lsn);
@@ -93,11 +119,6 @@ struct xfs_log_item *xfs_trans_ail_cursor_next(struct xfs_ail *ailp,
93void xfs_trans_ail_cursor_done(struct xfs_ail *ailp, 119void xfs_trans_ail_cursor_done(struct xfs_ail *ailp,
94 struct xfs_ail_cursor *cur); 120 struct xfs_ail_cursor *cur);
95 121
96long xfsaild_push(struct xfs_ail *, xfs_lsn_t *);
97void xfsaild_wakeup(struct xfs_ail *, xfs_lsn_t);
98int xfsaild_start(struct xfs_ail *);
99void xfsaild_stop(struct xfs_ail *);
100
101#if BITS_PER_LONG != 64 122#if BITS_PER_LONG != 64
102static inline void 123static inline void
103xfs_trans_ail_copy_lsn( 124xfs_trans_ail_copy_lsn(
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 8e4a63c4151a..b7a5fe7c52c8 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -953,7 +953,7 @@ xfs_release(
953 * If we previously truncated this file and removed old data 953 * If we previously truncated this file and removed old data
954 * in the process, we want to initiate "early" writeout on 954 * in the process, we want to initiate "early" writeout on
955 * the last close. This is an attempt to combat the notorious 955 * the last close. This is an attempt to combat the notorious
956 * NULL files problem which is particularly noticable from a 956 * NULL files problem which is particularly noticeable from a
957 * truncate down, buffered (re-)write (delalloc), followed by 957 * truncate down, buffered (re-)write (delalloc), followed by
958 * a crash. What we are effectively doing here is 958 * a crash. What we are effectively doing here is
959 * significantly reducing the time window where we'd otherwise 959 * significantly reducing the time window where we'd otherwise
@@ -964,29 +964,48 @@ xfs_release(
964 xfs_flush_pages(ip, 0, -1, XBF_ASYNC, FI_NONE); 964 xfs_flush_pages(ip, 0, -1, XBF_ASYNC, FI_NONE);
965 } 965 }
966 966
967 if (ip->i_d.di_nlink != 0) { 967 if (ip->i_d.di_nlink == 0)
968 if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) && 968 return 0;
969 ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 ||
970 ip->i_delayed_blks > 0)) &&
971 (ip->i_df.if_flags & XFS_IFEXTENTS)) &&
972 (!(ip->i_d.di_flags &
973 (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
974 969
975 /* 970 if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
976 * If we can't get the iolock just skip truncating 971 ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 ||
977 * the blocks past EOF because we could deadlock 972 ip->i_delayed_blks > 0)) &&
978 * with the mmap_sem otherwise. We'll get another 973 (ip->i_df.if_flags & XFS_IFEXTENTS)) &&
979 * chance to drop them once the last reference to 974 (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
980 * the inode is dropped, so we'll never leak blocks 975
981 * permanently. 976 /*
982 */ 977 * If we can't get the iolock just skip truncating the blocks
983 error = xfs_free_eofblocks(mp, ip, 978 * past EOF because we could deadlock with the mmap_sem
984 XFS_FREE_EOF_TRYLOCK); 979 * otherwise. We'll get another chance to drop them once the
985 if (error) 980 * last reference to the inode is dropped, so we'll never leak
986 return error; 981 * blocks permanently.
987 } 982 *
988 } 983 * Further, check if the inode is being opened, written and
984 * closed frequently and we have delayed allocation blocks
985 * outstanding (e.g. streaming writes from the NFS server),
986 * truncating the blocks past EOF will cause fragmentation to
987 * occur.
988 *
989 * In this case don't do the truncation, either, but we have to
990 * be careful how we detect this case. Blocks beyond EOF show
991 * up as i_delayed_blks even when the inode is clean, so we
992 * need to truncate them away first before checking for a dirty
993 * release. Hence on the first dirty close we will still remove
994 * the speculative allocation, but after that we will leave it
995 * in place.
996 */
997 if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
998 return 0;
999
1000 error = xfs_free_eofblocks(mp, ip,
1001 XFS_FREE_EOF_TRYLOCK);
1002 if (error)
1003 return error;
989 1004
1005 /* delalloc blocks after truncation means it really is dirty */
1006 if (ip->i_delayed_blks)
1007 xfs_iflags_set(ip, XFS_IDIRTY_RELEASE);
1008 }
990 return 0; 1009 return 0;
991} 1010}
992 1011
@@ -1170,9 +1189,8 @@ xfs_inactive(
1170 * inode might be lost for a long time or forever. 1189 * inode might be lost for a long time or forever.
1171 */ 1190 */
1172 if (!XFS_FORCED_SHUTDOWN(mp)) { 1191 if (!XFS_FORCED_SHUTDOWN(mp)) {
1173 cmn_err(CE_NOTE, 1192 xfs_notice(mp, "%s: xfs_ifree returned error %d",
1174 "xfs_inactive: xfs_ifree() returned an error = %d on %s", 1193 __func__, error);
1175 error, mp->m_fsname);
1176 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); 1194 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1177 } 1195 }
1178 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); 1196 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
@@ -1189,12 +1207,12 @@ xfs_inactive(
1189 */ 1207 */
1190 error = xfs_bmap_finish(&tp, &free_list, &committed); 1208 error = xfs_bmap_finish(&tp, &free_list, &committed);
1191 if (error) 1209 if (error)
1192 xfs_fs_cmn_err(CE_NOTE, mp, "xfs_inactive: " 1210 xfs_notice(mp, "%s: xfs_bmap_finish returned error %d",
1193 "xfs_bmap_finish() returned error %d", error); 1211 __func__, error);
1194 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 1212 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1195 if (error) 1213 if (error)
1196 xfs_fs_cmn_err(CE_NOTE, mp, "xfs_inactive: " 1214 xfs_notice(mp, "%s: xfs_trans_commit returned error %d",
1197 "xfs_trans_commit() returned error %d", error); 1215 __func__, error);
1198 } 1216 }
1199 1217
1200 /* 1218 /*
@@ -1291,7 +1309,7 @@ xfs_create(
1291 error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid, 1309 error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
1292 XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp); 1310 XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
1293 if (error) 1311 if (error)
1294 goto std_return; 1312 return error;
1295 1313
1296 if (is_dir) { 1314 if (is_dir) {
1297 rdev = 0; 1315 rdev = 0;
@@ -1371,12 +1389,6 @@ xfs_create(
1371 } 1389 }
1372 1390
1373 /* 1391 /*
1374 * At this point, we've gotten a newly allocated inode.
1375 * It is locked (and joined to the transaction).
1376 */
1377 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
1378
1379 /*
1380 * Now we join the directory inode to the transaction. We do not do it 1392 * Now we join the directory inode to the transaction. We do not do it
1381 * earlier because xfs_dir_ialloc might commit the previous transaction 1393 * earlier because xfs_dir_ialloc might commit the previous transaction
1382 * (and release all the locks). An error from here on will result in 1394 * (and release all the locks). An error from here on will result in
@@ -1421,22 +1433,13 @@ xfs_create(
1421 */ 1433 */
1422 xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp); 1434 xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp);
1423 1435
1424 /*
1425 * xfs_trans_commit normally decrements the vnode ref count
1426 * when it unlocks the inode. Since we want to return the
1427 * vnode to the caller, we bump the vnode ref count now.
1428 */
1429 IHOLD(ip);
1430
1431 error = xfs_bmap_finish(&tp, &free_list, &committed); 1436 error = xfs_bmap_finish(&tp, &free_list, &committed);
1432 if (error) 1437 if (error)
1433 goto out_abort_rele; 1438 goto out_bmap_cancel;
1434 1439
1435 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 1440 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1436 if (error) { 1441 if (error)
1437 IRELE(ip); 1442 goto out_release_inode;
1438 goto out_dqrele;
1439 }
1440 1443
1441 xfs_qm_dqrele(udqp); 1444 xfs_qm_dqrele(udqp);
1442 xfs_qm_dqrele(gdqp); 1445 xfs_qm_dqrele(gdqp);
@@ -1450,27 +1453,21 @@ xfs_create(
1450 cancel_flags |= XFS_TRANS_ABORT; 1453 cancel_flags |= XFS_TRANS_ABORT;
1451 out_trans_cancel: 1454 out_trans_cancel:
1452 xfs_trans_cancel(tp, cancel_flags); 1455 xfs_trans_cancel(tp, cancel_flags);
1453 out_dqrele: 1456 out_release_inode:
1457 /*
1458 * Wait until after the current transaction is aborted to
1459 * release the inode. This prevents recursive transactions
1460 * and deadlocks from xfs_inactive.
1461 */
1462 if (ip)
1463 IRELE(ip);
1464
1454 xfs_qm_dqrele(udqp); 1465 xfs_qm_dqrele(udqp);
1455 xfs_qm_dqrele(gdqp); 1466 xfs_qm_dqrele(gdqp);
1456 1467
1457 if (unlock_dp_on_error) 1468 if (unlock_dp_on_error)
1458 xfs_iunlock(dp, XFS_ILOCK_EXCL); 1469 xfs_iunlock(dp, XFS_ILOCK_EXCL);
1459 std_return:
1460 return error; 1470 return error;
1461
1462 out_abort_rele:
1463 /*
1464 * Wait until after the current transaction is aborted to
1465 * release the inode. This prevents recursive transactions
1466 * and deadlocks from xfs_inactive.
1467 */
1468 xfs_bmap_cancel(&free_list);
1469 cancel_flags |= XFS_TRANS_ABORT;
1470 xfs_trans_cancel(tp, cancel_flags);
1471 IRELE(ip);
1472 unlock_dp_on_error = B_FALSE;
1473 goto out_dqrele;
1474} 1471}
1475 1472
1476#ifdef DEBUG 1473#ifdef DEBUG
@@ -2095,9 +2092,8 @@ xfs_symlink(
2095 XFS_BMAPI_WRITE | XFS_BMAPI_METADATA, 2092 XFS_BMAPI_WRITE | XFS_BMAPI_METADATA,
2096 &first_block, resblks, mval, &nmaps, 2093 &first_block, resblks, mval, &nmaps,
2097 &free_list); 2094 &free_list);
2098 if (error) { 2095 if (error)
2099 goto error1; 2096 goto error2;
2100 }
2101 2097
2102 if (resblks) 2098 if (resblks)
2103 resblks -= fs_blocks; 2099 resblks -= fs_blocks;
@@ -2129,7 +2125,7 @@ xfs_symlink(
2129 error = xfs_dir_createname(tp, dp, link_name, ip->i_ino, 2125 error = xfs_dir_createname(tp, dp, link_name, ip->i_ino,
2130 &first_block, &free_list, resblks); 2126 &first_block, &free_list, resblks);
2131 if (error) 2127 if (error)
2132 goto error1; 2128 goto error2;
2133 xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 2129 xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2134 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); 2130 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
2135 2131
@@ -2142,13 +2138,6 @@ xfs_symlink(
2142 xfs_trans_set_sync(tp); 2138 xfs_trans_set_sync(tp);
2143 } 2139 }
2144 2140
2145 /*
2146 * xfs_trans_commit normally decrements the vnode ref count
2147 * when it unlocks the inode. Since we want to return the
2148 * vnode to the caller, we bump the vnode ref count now.
2149 */
2150 IHOLD(ip);
2151
2152 error = xfs_bmap_finish(&tp, &free_list, &committed); 2141 error = xfs_bmap_finish(&tp, &free_list, &committed);
2153 if (error) { 2142 if (error) {
2154 goto error2; 2143 goto error2;
@@ -2842,7 +2831,8 @@ xfs_change_file_space(
2842 ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC; 2831 ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
2843 2832
2844 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 2833 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
2845 xfs_trans_set_sync(tp); 2834 if (attr_flags & XFS_ATTR_SYNC)
2835 xfs_trans_set_sync(tp);
2846 2836
2847 error = xfs_trans_commit(tp, 0); 2837 error = xfs_trans_commit(tp, 0);
2848 2838
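
xfs_change_file_space() now forces a synchronous transaction commit only when the caller passes the new XFS_ATTR_SYNC flag (added to xfs_vnodeops.h below), instead of unconditionally. A hypothetical caller-side sketch: the flag and the xfs_change_file_space() parameters come from this diff, but the ioctl plumbing around it is assumed purely for illustration:

	int	attr_flags = 0;

	/* only force the commit to disk for sync-style opens */
	if (filp->f_flags & O_DSYNC)
		attr_flags |= XFS_ATTR_SYNC;

	error = xfs_change_file_space(ip, cmd, &bf, pos, attr_flags);
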
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index f6702927eee4..3bcd23353d6c 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -18,6 +18,7 @@ int xfs_setattr(struct xfs_inode *ip, struct iattr *vap, int flags);
18#define XFS_ATTR_NONBLOCK 0x02 /* return EAGAIN if operation would block */ 18#define XFS_ATTR_NONBLOCK 0x02 /* return EAGAIN if operation would block */
19#define XFS_ATTR_NOLOCK 0x04 /* Don't grab any conflicting locks */ 19#define XFS_ATTR_NOLOCK 0x04 /* Don't grab any conflicting locks */
20#define XFS_ATTR_NOACL 0x08 /* Don't call xfs_acl_chmod */ 20#define XFS_ATTR_NOACL 0x08 /* Don't call xfs_acl_chmod */
21#define XFS_ATTR_SYNC 0x10 /* synchronous operation required */
21 22
22int xfs_readlink(struct xfs_inode *ip, char *link); 23int xfs_readlink(struct xfs_inode *ip, char *link);
23int xfs_release(struct xfs_inode *ip); 24int xfs_release(struct xfs_inode *ip);