author    Glenn Elliott <gelliott@cs.unc.edu>    2012-03-04 19:47:13 -0500
committer Glenn Elliott <gelliott@cs.unc.edu>    2012-03-04 19:47:13 -0500
commit    c71c03bda1e86c9d5198c5d83f712e695c4f2a1e (patch)
tree      ecb166cb3e2b7e2adb3b5e292245fefd23381ac8 /fs/xfs
parent    ea53c912f8a86a8567697115b6a0d8152beee5c8 (diff)
parent    6a00f206debf8a5c8899055726ad127dbeeed098 (diff)
Merge branch 'mpi-master' into wip-k-fmlp
Conflicts: litmus/sched_cedf.c
Diffstat (limited to 'fs/xfs')
-rw-r--r--  fs/xfs/Kconfig | 1
-rw-r--r--  fs/xfs/Makefile | 13
-rw-r--r--  fs/xfs/linux-2.6/kmem.c | 9
-rw-r--r--  fs/xfs/linux-2.6/sv.h | 59
-rw-r--r--  fs/xfs/linux-2.6/xfs_acl.c | 11
-rw-r--r--  fs/xfs/linux-2.6/xfs_aops.c | 537
-rw-r--r--  fs/xfs/linux-2.6/xfs_aops.h | 16
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.c | 860
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.h | 149
-rw-r--r--  fs/xfs/linux-2.6/xfs_cred.h | 28
-rw-r--r--  fs/xfs/linux-2.6/xfs_discard.c | 222
-rw-r--r--  fs/xfs/linux-2.6/xfs_discard.h | 10
-rw-r--r--  fs/xfs/linux-2.6/xfs_export.c | 16
-rw-r--r--  fs/xfs/linux-2.6/xfs_file.c | 641
-rw-r--r--  fs/xfs/linux-2.6/xfs_fs_subr.c | 31
-rw-r--r--  fs/xfs/linux-2.6/xfs_globals.c | 1
-rw-r--r--  fs/xfs/linux-2.6/xfs_globals.h | 23
-rw-r--r--  fs/xfs/linux-2.6/xfs_ioctl.c | 59
-rw-r--r--  fs/xfs/linux-2.6/xfs_ioctl32.c | 8
-rw-r--r--  fs/xfs/linux-2.6/xfs_ioctl32.h | 7
-rw-r--r--  fs/xfs/linux-2.6/xfs_iops.c | 113
-rw-r--r--  fs/xfs/linux-2.6/xfs_linux.h | 30
-rw-r--r--  fs/xfs/linux-2.6/xfs_message.c | 108
-rw-r--r--  fs/xfs/linux-2.6/xfs_message.h | 39
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.c | 416
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.h | 1
-rw-r--r--  fs/xfs/linux-2.6/xfs_sync.c | 754
-rw-r--r--  fs/xfs/linux-2.6/xfs_sync.h | 6
-rw-r--r--  fs/xfs/linux-2.6/xfs_sysctl.c | 25
-rw-r--r--  fs/xfs/linux-2.6/xfs_trace.h | 175
-rw-r--r--  fs/xfs/linux-2.6/xfs_version.h | 29
-rw-r--r--  fs/xfs/quota/xfs_dquot.c | 211
-rw-r--r--  fs/xfs/quota/xfs_dquot_item.c | 5
-rw-r--r--  fs/xfs/quota/xfs_qm.c | 329
-rw-r--r--  fs/xfs/quota/xfs_qm.h | 5
-rw-r--r--  fs/xfs/quota/xfs_qm_bhv.c | 7
-rw-r--r--  fs/xfs/quota/xfs_qm_syscalls.c | 107
-rw-r--r--  fs/xfs/quota/xfs_trans_dquot.c | 5
-rw-r--r--  fs/xfs/support/debug.c | 115
-rw-r--r--  fs/xfs/support/debug.h | 54
-rw-r--r--  fs/xfs/xfs_acl.h | 2
-rw-r--r--  fs/xfs/xfs_ag.h | 15
-rw-r--r--  fs/xfs/xfs_alloc.c | 1360
-rw-r--r--  fs/xfs/xfs_alloc.h | 55
-rw-r--r--  fs/xfs/xfs_alloc_btree.c | 49
-rw-r--r--  fs/xfs/xfs_attr.c | 44
-rw-r--r--  fs/xfs/xfs_attr_leaf.c | 4
-rw-r--r--  fs/xfs/xfs_bmap.c | 749
-rw-r--r--  fs/xfs/xfs_bmap.h | 16
-rw-r--r--  fs/xfs/xfs_btree.c | 65
-rw-r--r--  fs/xfs/xfs_btree.h | 14
-rw-r--r--  fs/xfs/xfs_buf_item.c | 203
-rw-r--r--  fs/xfs/xfs_buf_item.h | 11
-rw-r--r--  fs/xfs/xfs_da_btree.c | 11
-rw-r--r--  fs/xfs/xfs_dfrag.c | 23
-rw-r--r--  fs/xfs/xfs_dinode.h | 5
-rw-r--r--  fs/xfs/xfs_dir2.c | 2
-rw-r--r--  fs/xfs/xfs_dir2_leaf.c | 2
-rw-r--r--  fs/xfs/xfs_dir2_node.c | 25
-rw-r--r--  fs/xfs/xfs_error.c | 56
-rw-r--r--  fs/xfs/xfs_error.h | 34
-rw-r--r--  fs/xfs/xfs_extfree_item.c | 96
-rw-r--r--  fs/xfs/xfs_extfree_item.h | 11
-rw-r--r--  fs/xfs/xfs_filestream.c | 8
-rw-r--r--  fs/xfs/xfs_fs.h | 7
-rw-r--r--  fs/xfs/xfs_fsops.c | 34
-rw-r--r--  fs/xfs/xfs_fsops.h | 2
-rw-r--r--  fs/xfs/xfs_ialloc.c | 84
-rw-r--r--  fs/xfs/xfs_ialloc_btree.c | 33
-rw-r--r--  fs/xfs/xfs_iget.c | 105
-rw-r--r--  fs/xfs/xfs_inode.c | 223
-rw-r--r--  fs/xfs/xfs_inode.h | 83
-rw-r--r--  fs/xfs/xfs_inode_item.c | 206
-rw-r--r--  fs/xfs/xfs_iomap.c | 250
-rw-r--r--  fs/xfs/xfs_iomap.h | 27
-rw-r--r--  fs/xfs/xfs_itable.c | 5
-rw-r--r--  fs/xfs/xfs_log.c | 929
-rw-r--r--  fs/xfs/xfs_log.h | 4
-rw-r--r--  fs/xfs/xfs_log_cil.c | 280
-rw-r--r--  fs/xfs/xfs_log_priv.h | 134
-rw-r--r--  fs/xfs/xfs_log_recover.c | 919
-rw-r--r--  fs/xfs/xfs_mount.c | 470
-rw-r--r--  fs/xfs/xfs_mount.h | 33
-rw-r--r--  fs/xfs/xfs_mru_cache.c | 4
-rw-r--r--  fs/xfs/xfs_quota.h | 23
-rw-r--r--  fs/xfs/xfs_refcache.h | 52
-rw-r--r--  fs/xfs/xfs_rename.c | 15
-rw-r--r--  fs/xfs/xfs_rtalloc.c | 119
-rw-r--r--  fs/xfs/xfs_rtalloc.h | 2
-rw-r--r--  fs/xfs/xfs_rw.c | 58
-rw-r--r--  fs/xfs/xfs_sb.h | 10
-rw-r--r--  fs/xfs/xfs_trans.c | 219
-rw-r--r--  fs/xfs/xfs_trans.h | 7
-rw-r--r--  fs/xfs/xfs_trans_ail.c | 650
-rw-r--r--  fs/xfs/xfs_trans_buf.c | 11
-rw-r--r--  fs/xfs/xfs_trans_extfree.c | 8
-rw-r--r--  fs/xfs/xfs_trans_inode.c | 54
-rw-r--r--  fs/xfs/xfs_trans_priv.h | 57
-rw-r--r--  fs/xfs/xfs_types.h | 4
-rw-r--r--  fs/xfs/xfs_utils.c | 9
-rw-r--r--  fs/xfs/xfs_utils.h | 3
-rw-r--r--  fs/xfs/xfs_vnodeops.c | 212
-rw-r--r--  fs/xfs/xfs_vnodeops.h | 7
103 files changed, 6926 insertions(+), 6521 deletions(-)
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 480f28127f09..6100ec0fa1d4 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -22,6 +22,7 @@ config XFS_FS
22config XFS_QUOTA 22config XFS_QUOTA
23 bool "XFS Quota support" 23 bool "XFS Quota support"
24 depends on XFS_FS 24 depends on XFS_FS
25 select QUOTACTL
25 help 26 help
26 If you say Y here, you will be able to set limits for disk usage on 27 If you say Y here, you will be able to set limits for disk usage on
27 a per user and/or a per group basis under XFS. XFS considers quota 28 a per user and/or a per group basis under XFS. XFS considers quota
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 0dce969d6cad..284a7c89697e 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -16,14 +16,11 @@
16# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17# 17#
18 18
19EXTRA_CFLAGS += -I$(src) -I$(src)/linux-2.6 19ccflags-y := -I$(src) -I$(src)/linux-2.6
20ccflags-$(CONFIG_XFS_DEBUG) += -g
20 21
21XFS_LINUX := linux-2.6 22XFS_LINUX := linux-2.6
22 23
23ifeq ($(CONFIG_XFS_DEBUG),y)
24 EXTRA_CFLAGS += -g
25endif
26
27obj-$(CONFIG_XFS_FS) += xfs.o 24obj-$(CONFIG_XFS_FS) += xfs.o
28 25
29xfs-y += linux-2.6/xfs_trace.o 26xfs-y += linux-2.6/xfs_trace.o
@@ -98,17 +95,17 @@ xfs-y += $(addprefix $(XFS_LINUX)/, \
98 kmem.o \ 95 kmem.o \
99 xfs_aops.o \ 96 xfs_aops.o \
100 xfs_buf.o \ 97 xfs_buf.o \
98 xfs_discard.o \
101 xfs_export.o \ 99 xfs_export.o \
102 xfs_file.o \ 100 xfs_file.o \
103 xfs_fs_subr.o \ 101 xfs_fs_subr.o \
104 xfs_globals.o \ 102 xfs_globals.o \
105 xfs_ioctl.o \ 103 xfs_ioctl.o \
106 xfs_iops.o \ 104 xfs_iops.o \
105 xfs_message.o \
107 xfs_super.o \ 106 xfs_super.o \
108 xfs_sync.o \ 107 xfs_sync.o \
109 xfs_xattr.o) 108 xfs_xattr.o)
110 109
111# Objects in support/ 110# Objects in support/
112xfs-y += $(addprefix support/, \ 111xfs-y += support/uuid.o
113 debug.o \
114 uuid.o)
diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c
index 666c9db48eb6..a907de565db3 100644
--- a/fs/xfs/linux-2.6/kmem.c
+++ b/fs/xfs/linux-2.6/kmem.c
@@ -23,6 +23,7 @@
23#include <linux/backing-dev.h> 23#include <linux/backing-dev.h>
24#include "time.h" 24#include "time.h"
25#include "kmem.h" 25#include "kmem.h"
26#include "xfs_message.h"
26 27
27/* 28/*
28 * Greedy allocation. May fail and may return vmalloced memory. 29 * Greedy allocation. May fail and may return vmalloced memory.
@@ -56,8 +57,8 @@ kmem_alloc(size_t size, unsigned int __nocast flags)
56 if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP))) 57 if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
57 return ptr; 58 return ptr;
58 if (!(++retries % 100)) 59 if (!(++retries % 100))
59 printk(KERN_ERR "XFS: possible memory allocation " 60 xfs_err(NULL,
60 "deadlock in %s (mode:0x%x)\n", 61 "possible memory allocation deadlock in %s (mode:0x%x)",
61 __func__, lflags); 62 __func__, lflags);
62 congestion_wait(BLK_RW_ASYNC, HZ/50); 63 congestion_wait(BLK_RW_ASYNC, HZ/50);
63 } while (1); 64 } while (1);
@@ -112,8 +113,8 @@ kmem_zone_alloc(kmem_zone_t *zone, unsigned int __nocast flags)
112 if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP))) 113 if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
113 return ptr; 114 return ptr;
114 if (!(++retries % 100)) 115 if (!(++retries % 100))
115 printk(KERN_ERR "XFS: possible memory allocation " 116 xfs_err(NULL,
116 "deadlock in %s (mode:0x%x)\n", 117 "possible memory allocation deadlock in %s (mode:0x%x)",
117 __func__, lflags); 118 __func__, lflags);
118 congestion_wait(BLK_RW_ASYNC, HZ/50); 119 congestion_wait(BLK_RW_ASYNC, HZ/50);
119 } while (1); 120 } while (1);
diff --git a/fs/xfs/linux-2.6/sv.h b/fs/xfs/linux-2.6/sv.h
deleted file mode 100644
index 4dfc7c370819..000000000000
--- a/fs/xfs/linux-2.6/sv.h
+++ /dev/null
@@ -1,59 +0,0 @@
1/*
2 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_SUPPORT_SV_H__
19#define __XFS_SUPPORT_SV_H__
20
21#include <linux/wait.h>
22#include <linux/sched.h>
23#include <linux/spinlock.h>
24
25/*
26 * Synchronisation variables.
27 *
28 * (Parameters "pri", "svf" and "rts" are not implemented)
29 */
30
31typedef struct sv_s {
32 wait_queue_head_t waiters;
33} sv_t;
34
35static inline void _sv_wait(sv_t *sv, spinlock_t *lock)
36{
37 DECLARE_WAITQUEUE(wait, current);
38
39 add_wait_queue_exclusive(&sv->waiters, &wait);
40 __set_current_state(TASK_UNINTERRUPTIBLE);
41 spin_unlock(lock);
42
43 schedule();
44
45 remove_wait_queue(&sv->waiters, &wait);
46}
47
48#define sv_init(sv,flag,name) \
49 init_waitqueue_head(&(sv)->waiters)
50#define sv_destroy(sv) \
51 /*NOTHING*/
52#define sv_wait(sv, pri, lock, s) \
53 _sv_wait(sv, lock)
54#define sv_signal(sv) \
55 wake_up(&(sv)->waiters)
56#define sv_broadcast(sv) \
57 wake_up_all(&(sv)->waiters)
58
59#endif /* __XFS_SUPPORT_SV_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c
index b2771862fd3d..39f4f809bb68 100644
--- a/fs/xfs/linux-2.6/xfs_acl.c
+++ b/fs/xfs/linux-2.6/xfs_acl.c
@@ -219,12 +219,13 @@ xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
219} 219}
220 220
221int 221int
222xfs_check_acl(struct inode *inode, int mask) 222xfs_check_acl(struct inode *inode, int mask, unsigned int flags)
223{ 223{
224 struct xfs_inode *ip = XFS_I(inode); 224 struct xfs_inode *ip;
225 struct posix_acl *acl; 225 struct posix_acl *acl;
226 int error = -EAGAIN; 226 int error = -EAGAIN;
227 227
228 ip = XFS_I(inode);
228 trace_xfs_check_acl(ip); 229 trace_xfs_check_acl(ip);
229 230
230 /* 231 /*
@@ -234,6 +235,12 @@ xfs_check_acl(struct inode *inode, int mask)
234 if (!XFS_IFORK_Q(ip)) 235 if (!XFS_IFORK_Q(ip))
235 return -EAGAIN; 236 return -EAGAIN;
236 237
238 if (flags & IPERM_FLAG_RCU) {
239 if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
240 return -ECHILD;
241 return -EAGAIN;
242 }
243
237 acl = xfs_get_acl(inode, ACL_TYPE_ACCESS); 244 acl = xfs_get_acl(inode, ACL_TYPE_ACCESS);
238 if (IS_ERR(acl)) 245 if (IS_ERR(acl))
239 return PTR_ERR(acl); 246 return PTR_ERR(acl);
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index b552f816de15..79ce38be15a1 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -38,15 +38,6 @@
38#include <linux/pagevec.h> 38#include <linux/pagevec.h>
39#include <linux/writeback.h> 39#include <linux/writeback.h>
40 40
41/*
42 * Types of I/O for bmap clustering and I/O completion tracking.
43 */
44enum {
45 IO_READ, /* mapping for a read */
46 IO_DELAY, /* mapping covers delalloc region */
47 IO_UNWRITTEN, /* mapping covers allocated but uninitialized data */
48 IO_NEW /* just allocated */
49};
50 41
51/* 42/*
52 * Prime number of hash buckets since address is used as the key. 43 * Prime number of hash buckets since address is used as the key.
@@ -182,9 +173,6 @@ xfs_setfilesize(
182 xfs_inode_t *ip = XFS_I(ioend->io_inode); 173 xfs_inode_t *ip = XFS_I(ioend->io_inode);
183 xfs_fsize_t isize; 174 xfs_fsize_t isize;
184 175
185 ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
186 ASSERT(ioend->io_type != IO_READ);
187
188 if (unlikely(ioend->io_error)) 176 if (unlikely(ioend->io_error))
189 return 0; 177 return 0;
190 178
@@ -244,10 +232,8 @@ xfs_end_io(
244 * We might have to update the on-disk file size after extending 232 * We might have to update the on-disk file size after extending
245 * writes. 233 * writes.
246 */ 234 */
247 if (ioend->io_type != IO_READ) { 235 error = xfs_setfilesize(ioend);
248 error = xfs_setfilesize(ioend); 236 ASSERT(!error || error == EAGAIN);
249 ASSERT(!error || error == EAGAIN);
250 }
251 237
252 /* 238 /*
253 * If we didn't complete processing of the ioend, requeue it to the 239 * If we didn't complete processing of the ioend, requeue it to the
@@ -318,14 +304,63 @@ STATIC int
318xfs_map_blocks( 304xfs_map_blocks(
319 struct inode *inode, 305 struct inode *inode,
320 loff_t offset, 306 loff_t offset,
321 ssize_t count,
322 struct xfs_bmbt_irec *imap, 307 struct xfs_bmbt_irec *imap,
323 int flags) 308 int type,
309 int nonblocking)
324{ 310{
325 int nmaps = 1; 311 struct xfs_inode *ip = XFS_I(inode);
326 int new = 0; 312 struct xfs_mount *mp = ip->i_mount;
313 ssize_t count = 1 << inode->i_blkbits;
314 xfs_fileoff_t offset_fsb, end_fsb;
315 int error = 0;
316 int bmapi_flags = XFS_BMAPI_ENTIRE;
317 int nimaps = 1;
318
319 if (XFS_FORCED_SHUTDOWN(mp))
320 return -XFS_ERROR(EIO);
321
322 if (type == IO_UNWRITTEN)
323 bmapi_flags |= XFS_BMAPI_IGSTATE;
324
325 if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
326 if (nonblocking)
327 return -XFS_ERROR(EAGAIN);
328 xfs_ilock(ip, XFS_ILOCK_SHARED);
329 }
330
331 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
332 (ip->i_df.if_flags & XFS_IFEXTENTS));
333 ASSERT(offset <= mp->m_maxioffset);
334
335 if (offset + count > mp->m_maxioffset)
336 count = mp->m_maxioffset - offset;
337 end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
338 offset_fsb = XFS_B_TO_FSBT(mp, offset);
339 error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb,
340 bmapi_flags, NULL, 0, imap, &nimaps, NULL);
341 xfs_iunlock(ip, XFS_ILOCK_SHARED);
327 342
328 return -xfs_iomap(XFS_I(inode), offset, count, flags, imap, &nmaps, &new); 343 if (error)
344 return -XFS_ERROR(error);
345
346 if (type == IO_DELALLOC &&
347 (!nimaps || isnullstartblock(imap->br_startblock))) {
348 error = xfs_iomap_write_allocate(ip, offset, count, imap);
349 if (!error)
350 trace_xfs_map_blocks_alloc(ip, offset, count, type, imap);
351 return -XFS_ERROR(error);
352 }
353
354#ifdef DEBUG
355 if (type == IO_UNWRITTEN) {
356 ASSERT(nimaps);
357 ASSERT(imap->br_startblock != HOLESTARTBLOCK);
358 ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
359 }
360#endif
361 if (nimaps)
362 trace_xfs_map_blocks_found(ip, offset, count, type, imap);
363 return 0;
329} 364}
330 365
331STATIC int 366STATIC int
@@ -378,28 +413,19 @@ xfs_submit_ioend_bio(
378 if (xfs_ioend_new_eof(ioend)) 413 if (xfs_ioend_new_eof(ioend))
379 xfs_mark_inode_dirty(XFS_I(ioend->io_inode)); 414 xfs_mark_inode_dirty(XFS_I(ioend->io_inode));
380 415
381 submit_bio(wbc->sync_mode == WB_SYNC_ALL ? 416 submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, bio);
382 WRITE_SYNC_PLUG : WRITE, bio);
383 ASSERT(!bio_flagged(bio, BIO_EOPNOTSUPP));
384 bio_put(bio);
385} 417}
386 418
387STATIC struct bio * 419STATIC struct bio *
388xfs_alloc_ioend_bio( 420xfs_alloc_ioend_bio(
389 struct buffer_head *bh) 421 struct buffer_head *bh)
390{ 422{
391 struct bio *bio;
392 int nvecs = bio_get_nr_vecs(bh->b_bdev); 423 int nvecs = bio_get_nr_vecs(bh->b_bdev);
393 424 struct bio *bio = bio_alloc(GFP_NOIO, nvecs);
394 do {
395 bio = bio_alloc(GFP_NOIO, nvecs);
396 nvecs >>= 1;
397 } while (!bio);
398 425
399 ASSERT(bio->bi_private == NULL); 426 ASSERT(bio->bi_private == NULL);
400 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); 427 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
401 bio->bi_bdev = bh->b_bdev; 428 bio->bi_bdev = bh->b_bdev;
402 bio_get(bio);
403 return bio; 429 return bio;
404} 430}
405 431
@@ -470,9 +496,8 @@ xfs_submit_ioend(
470 /* Pass 1 - start writeback */ 496 /* Pass 1 - start writeback */
471 do { 497 do {
472 next = ioend->io_list; 498 next = ioend->io_list;
473 for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) { 499 for (bh = ioend->io_buffer_head; bh; bh = bh->b_private)
474 xfs_start_buffer_writeback(bh); 500 xfs_start_buffer_writeback(bh);
475 }
476 } while ((ioend = next) != NULL); 501 } while ((ioend = next) != NULL);
477 502
478 /* Pass 2 - submit I/O */ 503 /* Pass 2 - submit I/O */
@@ -600,117 +625,13 @@ xfs_map_at_offset(
600 ASSERT(imap->br_startblock != HOLESTARTBLOCK); 625 ASSERT(imap->br_startblock != HOLESTARTBLOCK);
601 ASSERT(imap->br_startblock != DELAYSTARTBLOCK); 626 ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
602 627
603 lock_buffer(bh);
604 xfs_map_buffer(inode, bh, imap, offset); 628 xfs_map_buffer(inode, bh, imap, offset);
605 bh->b_bdev = xfs_find_bdev_for_inode(inode);
606 set_buffer_mapped(bh); 629 set_buffer_mapped(bh);
607 clear_buffer_delay(bh); 630 clear_buffer_delay(bh);
608 clear_buffer_unwritten(bh); 631 clear_buffer_unwritten(bh);
609} 632}
610 633
611/* 634/*
612 * Look for a page at index that is suitable for clustering.
613 */
614STATIC unsigned int
615xfs_probe_page(
616 struct page *page,
617 unsigned int pg_offset)
618{
619 struct buffer_head *bh, *head;
620 int ret = 0;
621
622 if (PageWriteback(page))
623 return 0;
624 if (!PageDirty(page))
625 return 0;
626 if (!page->mapping)
627 return 0;
628 if (!page_has_buffers(page))
629 return 0;
630
631 bh = head = page_buffers(page);
632 do {
633 if (!buffer_uptodate(bh))
634 break;
635 if (!buffer_mapped(bh))
636 break;
637 ret += bh->b_size;
638 if (ret >= pg_offset)
639 break;
640 } while ((bh = bh->b_this_page) != head);
641
642 return ret;
643}
644
645STATIC size_t
646xfs_probe_cluster(
647 struct inode *inode,
648 struct page *startpage,
649 struct buffer_head *bh,
650 struct buffer_head *head)
651{
652 struct pagevec pvec;
653 pgoff_t tindex, tlast, tloff;
654 size_t total = 0;
655 int done = 0, i;
656
657 /* First sum forwards in this page */
658 do {
659 if (!buffer_uptodate(bh) || !buffer_mapped(bh))
660 return total;
661 total += bh->b_size;
662 } while ((bh = bh->b_this_page) != head);
663
664 /* if we reached the end of the page, sum forwards in following pages */
665 tlast = i_size_read(inode) >> PAGE_CACHE_SHIFT;
666 tindex = startpage->index + 1;
667
668 /* Prune this back to avoid pathological behavior */
669 tloff = min(tlast, startpage->index + 64);
670
671 pagevec_init(&pvec, 0);
672 while (!done && tindex <= tloff) {
673 unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1);
674
675 if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len))
676 break;
677
678 for (i = 0; i < pagevec_count(&pvec); i++) {
679 struct page *page = pvec.pages[i];
680 size_t pg_offset, pg_len = 0;
681
682 if (tindex == tlast) {
683 pg_offset =
684 i_size_read(inode) & (PAGE_CACHE_SIZE - 1);
685 if (!pg_offset) {
686 done = 1;
687 break;
688 }
689 } else
690 pg_offset = PAGE_CACHE_SIZE;
691
692 if (page->index == tindex && trylock_page(page)) {
693 pg_len = xfs_probe_page(page, pg_offset);
694 unlock_page(page);
695 }
696
697 if (!pg_len) {
698 done = 1;
699 break;
700 }
701
702 total += pg_len;
703 tindex++;
704 }
705
706 pagevec_release(&pvec);
707 cond_resched();
708 }
709
710 return total;
711}
712
713/*
714 * Test if a given page is suitable for writing as part of an unwritten 635 * Test if a given page is suitable for writing as part of an unwritten
715 * or delayed allocate extent. 636 * or delayed allocate extent.
716 */ 637 */
@@ -731,9 +652,9 @@ xfs_is_delayed_page(
731 if (buffer_unwritten(bh)) 652 if (buffer_unwritten(bh))
732 acceptable = (type == IO_UNWRITTEN); 653 acceptable = (type == IO_UNWRITTEN);
733 else if (buffer_delay(bh)) 654 else if (buffer_delay(bh))
734 acceptable = (type == IO_DELAY); 655 acceptable = (type == IO_DELALLOC);
735 else if (buffer_dirty(bh) && buffer_mapped(bh)) 656 else if (buffer_dirty(bh) && buffer_mapped(bh))
736 acceptable = (type == IO_NEW); 657 acceptable = (type == IO_OVERWRITE);
737 else 658 else
738 break; 659 break;
739 } while ((bh = bh->b_this_page) != head); 660 } while ((bh = bh->b_this_page) != head);
@@ -758,8 +679,7 @@ xfs_convert_page(
758 loff_t tindex, 679 loff_t tindex,
759 struct xfs_bmbt_irec *imap, 680 struct xfs_bmbt_irec *imap,
760 xfs_ioend_t **ioendp, 681 xfs_ioend_t **ioendp,
761 struct writeback_control *wbc, 682 struct writeback_control *wbc)
762 int all_bh)
763{ 683{
764 struct buffer_head *bh, *head; 684 struct buffer_head *bh, *head;
765 xfs_off_t end_offset; 685 xfs_off_t end_offset;
@@ -814,37 +734,30 @@ xfs_convert_page(
814 continue; 734 continue;
815 } 735 }
816 736
817 if (buffer_unwritten(bh) || buffer_delay(bh)) { 737 if (buffer_unwritten(bh) || buffer_delay(bh) ||
738 buffer_mapped(bh)) {
818 if (buffer_unwritten(bh)) 739 if (buffer_unwritten(bh))
819 type = IO_UNWRITTEN; 740 type = IO_UNWRITTEN;
741 else if (buffer_delay(bh))
742 type = IO_DELALLOC;
820 else 743 else
821 type = IO_DELAY; 744 type = IO_OVERWRITE;
822 745
823 if (!xfs_imap_valid(inode, imap, offset)) { 746 if (!xfs_imap_valid(inode, imap, offset)) {
824 done = 1; 747 done = 1;
825 continue; 748 continue;
826 } 749 }
827 750
828 ASSERT(imap->br_startblock != HOLESTARTBLOCK); 751 lock_buffer(bh);
829 ASSERT(imap->br_startblock != DELAYSTARTBLOCK); 752 if (type != IO_OVERWRITE)
830 753 xfs_map_at_offset(inode, bh, imap, offset);
831 xfs_map_at_offset(inode, bh, imap, offset);
832 xfs_add_to_ioend(inode, bh, offset, type, 754 xfs_add_to_ioend(inode, bh, offset, type,
833 ioendp, done); 755 ioendp, done);
834 756
835 page_dirty--; 757 page_dirty--;
836 count++; 758 count++;
837 } else { 759 } else {
838 type = IO_NEW; 760 done = 1;
839 if (buffer_mapped(bh) && all_bh) {
840 lock_buffer(bh);
841 xfs_add_to_ioend(inode, bh, offset,
842 type, ioendp, done);
843 count++;
844 page_dirty--;
845 } else {
846 done = 1;
847 }
848 } 761 }
849 } while (offset += len, (bh = bh->b_this_page) != head); 762 } while (offset += len, (bh = bh->b_this_page) != head);
850 763
@@ -876,7 +789,6 @@ xfs_cluster_write(
876 struct xfs_bmbt_irec *imap, 789 struct xfs_bmbt_irec *imap,
877 xfs_ioend_t **ioendp, 790 xfs_ioend_t **ioendp,
878 struct writeback_control *wbc, 791 struct writeback_control *wbc,
879 int all_bh,
880 pgoff_t tlast) 792 pgoff_t tlast)
881{ 793{
882 struct pagevec pvec; 794 struct pagevec pvec;
@@ -891,7 +803,7 @@ xfs_cluster_write(
891 803
892 for (i = 0; i < pagevec_count(&pvec); i++) { 804 for (i = 0; i < pagevec_count(&pvec); i++) {
893 done = xfs_convert_page(inode, pvec.pages[i], tindex++, 805 done = xfs_convert_page(inode, pvec.pages[i], tindex++,
894 imap, ioendp, wbc, all_bh); 806 imap, ioendp, wbc);
895 if (done) 807 if (done)
896 break; 808 break;
897 } 809 }
@@ -934,83 +846,38 @@ xfs_aops_discard_page(
934 struct xfs_inode *ip = XFS_I(inode); 846 struct xfs_inode *ip = XFS_I(inode);
935 struct buffer_head *bh, *head; 847 struct buffer_head *bh, *head;
936 loff_t offset = page_offset(page); 848 loff_t offset = page_offset(page);
937 ssize_t len = 1 << inode->i_blkbits;
938 849
939 if (!xfs_is_delayed_page(page, IO_DELAY)) 850 if (!xfs_is_delayed_page(page, IO_DELALLOC))
940 goto out_invalidate; 851 goto out_invalidate;
941 852
942 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 853 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
943 goto out_invalidate; 854 goto out_invalidate;
944 855
945 xfs_fs_cmn_err(CE_ALERT, ip->i_mount, 856 xfs_alert(ip->i_mount,
946 "page discard on page %p, inode 0x%llx, offset %llu.", 857 "page discard on page %p, inode 0x%llx, offset %llu.",
947 page, ip->i_ino, offset); 858 page, ip->i_ino, offset);
948 859
949 xfs_ilock(ip, XFS_ILOCK_EXCL); 860 xfs_ilock(ip, XFS_ILOCK_EXCL);
950 bh = head = page_buffers(page); 861 bh = head = page_buffers(page);
951 do { 862 do {
952 int done;
953 xfs_fileoff_t offset_fsb;
954 xfs_bmbt_irec_t imap;
955 int nimaps = 1;
956 int error; 863 int error;
957 xfs_fsblock_t firstblock; 864 xfs_fileoff_t start_fsb;
958 xfs_bmap_free_t flist;
959 865
960 if (!buffer_delay(bh)) 866 if (!buffer_delay(bh))
961 goto next_buffer; 867 goto next_buffer;
962 868
963 offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); 869 start_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
964 870 error = xfs_bmap_punch_delalloc_range(ip, start_fsb, 1);
965 /*
966 * Map the range first and check that it is a delalloc extent
967 * before trying to unmap the range. Otherwise we will be
968 * trying to remove a real extent (which requires a
969 * transaction) or a hole, which is probably a bad idea...
970 */
971 error = xfs_bmapi(NULL, ip, offset_fsb, 1,
972 XFS_BMAPI_ENTIRE, NULL, 0, &imap,
973 &nimaps, NULL);
974
975 if (error) {
976 /* something screwed, just bail */
977 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
978 xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
979 "page discard failed delalloc mapping lookup.");
980 }
981 break;
982 }
983 if (!nimaps) {
984 /* nothing there */
985 goto next_buffer;
986 }
987 if (imap.br_startblock != DELAYSTARTBLOCK) {
988 /* been converted, ignore */
989 goto next_buffer;
990 }
991 WARN_ON(imap.br_blockcount == 0);
992
993 /*
994 * Note: while we initialise the firstblock/flist pair, they
995 * should never be used because blocks should never be
996 * allocated or freed for a delalloc extent and hence we need
997 * don't cancel or finish them after the xfs_bunmapi() call.
998 */
999 xfs_bmap_init(&flist, &firstblock);
1000 error = xfs_bunmapi(NULL, ip, offset_fsb, 1, 0, 1, &firstblock,
1001 &flist, &done);
1002
1003 ASSERT(!flist.xbf_count && !flist.xbf_first);
1004 if (error) { 871 if (error) {
1005 /* something screwed, just bail */ 872 /* something screwed, just bail */
1006 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { 873 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
1007 xfs_fs_cmn_err(CE_ALERT, ip->i_mount, 874 xfs_alert(ip->i_mount,
1008 "page discard unable to remove delalloc mapping."); 875 "page discard unable to remove delalloc mapping.");
1009 } 876 }
1010 break; 877 break;
1011 } 878 }
1012next_buffer: 879next_buffer:
1013 offset += len; 880 offset += 1 << inode->i_blkbits;
1014 881
1015 } while ((bh = bh->b_this_page) != head); 882 } while ((bh = bh->b_this_page) != head);
1016 883
@@ -1047,10 +914,10 @@ xfs_vm_writepage(
1047 unsigned int type; 914 unsigned int type;
1048 __uint64_t end_offset; 915 __uint64_t end_offset;
1049 pgoff_t end_index, last_index; 916 pgoff_t end_index, last_index;
1050 ssize_t size, len; 917 ssize_t len;
1051 int flags, err, imap_valid = 0, uptodate = 1; 918 int err, imap_valid = 0, uptodate = 1;
1052 int count = 0; 919 int count = 0;
1053 int all_bh = 0; 920 int nonblocking = 0;
1054 921
1055 trace_xfs_writepage(inode, page, 0); 922 trace_xfs_writepage(inode, page, 0);
1056 923
@@ -1101,110 +968,78 @@ xfs_vm_writepage(
1101 968
1102 bh = head = page_buffers(page); 969 bh = head = page_buffers(page);
1103 offset = page_offset(page); 970 offset = page_offset(page);
1104 flags = BMAPI_READ; 971 type = IO_OVERWRITE;
1105 type = IO_NEW; 972
973 if (wbc->sync_mode == WB_SYNC_NONE && wbc->nonblocking)
974 nonblocking = 1;
1106 975
1107 do { 976 do {
977 int new_ioend = 0;
978
1108 if (offset >= end_offset) 979 if (offset >= end_offset)
1109 break; 980 break;
1110 if (!buffer_uptodate(bh)) 981 if (!buffer_uptodate(bh))
1111 uptodate = 0; 982 uptodate = 0;
1112 983
1113 /* 984 /*
1114 * A hole may still be marked uptodate because discard_buffer 985 * set_page_dirty dirties all buffers in a page, independent
1115 * leaves the flag set. 986 * of their state. The dirty state however is entirely
987 * meaningless for holes (!mapped && uptodate), so skip
988 * buffers covering holes here.
1116 */ 989 */
1117 if (!buffer_mapped(bh) && buffer_uptodate(bh)) { 990 if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
1118 ASSERT(!buffer_dirty(bh));
1119 imap_valid = 0; 991 imap_valid = 0;
1120 continue; 992 continue;
1121 } 993 }
1122 994
1123 if (imap_valid) 995 if (buffer_unwritten(bh)) {
1124 imap_valid = xfs_imap_valid(inode, &imap, offset); 996 if (type != IO_UNWRITTEN) {
1125
1126 if (buffer_unwritten(bh) || buffer_delay(bh)) {
1127 int new_ioend = 0;
1128
1129 /*
1130 * Make sure we don't use a read-only iomap
1131 */
1132 if (flags == BMAPI_READ)
1133 imap_valid = 0;
1134
1135 if (buffer_unwritten(bh)) {
1136 type = IO_UNWRITTEN; 997 type = IO_UNWRITTEN;
1137 flags = BMAPI_WRITE | BMAPI_IGNSTATE; 998 imap_valid = 0;
1138 } else if (buffer_delay(bh)) {
1139 type = IO_DELAY;
1140 flags = BMAPI_ALLOCATE;
1141
1142 if (wbc->sync_mode == WB_SYNC_NONE &&
1143 wbc->nonblocking)
1144 flags |= BMAPI_TRYLOCK;
1145 }
1146
1147 if (!imap_valid) {
1148 /*
1149 * If we didn't have a valid mapping then we
1150 * need to ensure that we put the new mapping
1151 * in a new ioend structure. This needs to be
1152 * done to ensure that the ioends correctly
1153 * reflect the block mappings at io completion
1154 * for unwritten extent conversion.
1155 */
1156 new_ioend = 1;
1157 err = xfs_map_blocks(inode, offset, len,
1158 &imap, flags);
1159 if (err)
1160 goto error;
1161 imap_valid = xfs_imap_valid(inode, &imap,
1162 offset);
1163 } 999 }
1164 if (imap_valid) { 1000 } else if (buffer_delay(bh)) {
1165 xfs_map_at_offset(inode, bh, &imap, offset); 1001 if (type != IO_DELALLOC) {
1166 xfs_add_to_ioend(inode, bh, offset, type, 1002 type = IO_DELALLOC;
1167 &ioend, new_ioend); 1003 imap_valid = 0;
1168 count++;
1169 } 1004 }
1170 } else if (buffer_uptodate(bh)) { 1005 } else if (buffer_uptodate(bh)) {
1171 /* 1006 if (type != IO_OVERWRITE) {
1172 * we got here because the buffer is already mapped. 1007 type = IO_OVERWRITE;
1173 * That means it must already have extents allocated 1008 imap_valid = 0;
1174 * underneath it. Map the extent by reading it. 1009 }
1175 */ 1010 } else {
1176 if (!imap_valid || flags != BMAPI_READ) { 1011 if (PageUptodate(page)) {
1177 flags = BMAPI_READ; 1012 ASSERT(buffer_mapped(bh));
1178 size = xfs_probe_cluster(inode, page, bh, head); 1013 imap_valid = 0;
1179 err = xfs_map_blocks(inode, offset, size,
1180 &imap, flags);
1181 if (err)
1182 goto error;
1183 imap_valid = xfs_imap_valid(inode, &imap,
1184 offset);
1185 } 1014 }
1015 continue;
1016 }
1186 1017
1018 if (imap_valid)
1019 imap_valid = xfs_imap_valid(inode, &imap, offset);
1020 if (!imap_valid) {
1187 /* 1021 /*
1188 * We set the type to IO_NEW in case we are doing a 1022 * If we didn't have a valid mapping then we need to
1189 * small write at EOF that is extending the file but 1023 * put the new mapping into a separate ioend structure.
1190 * without needing an allocation. We need to update the 1024 * This ensures non-contiguous extents always have
1191 * file size on I/O completion in this case so it is 1025 * separate ioends, which is particularly important
1192 * the same case as having just allocated a new extent 1026 * for unwritten extent conversion at I/O completion
1193 * that we are writing into for the first time. 1027 * time.
1194 */ 1028 */
1195 type = IO_NEW; 1029 new_ioend = 1;
1196 if (trylock_buffer(bh)) { 1030 err = xfs_map_blocks(inode, offset, &imap, type,
1197 if (imap_valid) 1031 nonblocking);
1198 all_bh = 1; 1032 if (err)
1199 xfs_add_to_ioend(inode, bh, offset, type, 1033 goto error;
1200 &ioend, !imap_valid); 1034 imap_valid = xfs_imap_valid(inode, &imap, offset);
1201 count++; 1035 }
1202 } else { 1036 if (imap_valid) {
1203 imap_valid = 0; 1037 lock_buffer(bh);
1204 } 1038 if (type != IO_OVERWRITE)
1205 } else if (PageUptodate(page)) { 1039 xfs_map_at_offset(inode, bh, &imap, offset);
1206 ASSERT(buffer_mapped(bh)); 1040 xfs_add_to_ioend(inode, bh, offset, type, &ioend,
1207 imap_valid = 0; 1041 new_ioend);
1042 count++;
1208 } 1043 }
1209 1044
1210 if (!iohead) 1045 if (!iohead)
@@ -1233,7 +1068,7 @@ xfs_vm_writepage(
1233 end_index = last_index; 1068 end_index = last_index;
1234 1069
1235 xfs_cluster_write(inode, page->index + 1, &imap, &ioend, 1070 xfs_cluster_write(inode, page->index + 1, &imap, &ioend,
1236 wbc, all_bh, end_index); 1071 wbc, end_index);
1237 } 1072 }
1238 1073
1239 if (iohead) 1074 if (iohead)
@@ -1302,13 +1137,19 @@ __xfs_get_blocks(
1302 int create, 1137 int create,
1303 int direct) 1138 int direct)
1304{ 1139{
1305 int flags = create ? BMAPI_WRITE : BMAPI_READ; 1140 struct xfs_inode *ip = XFS_I(inode);
1141 struct xfs_mount *mp = ip->i_mount;
1142 xfs_fileoff_t offset_fsb, end_fsb;
1143 int error = 0;
1144 int lockmode = 0;
1306 struct xfs_bmbt_irec imap; 1145 struct xfs_bmbt_irec imap;
1146 int nimaps = 1;
1307 xfs_off_t offset; 1147 xfs_off_t offset;
1308 ssize_t size; 1148 ssize_t size;
1309 int nimap = 1;
1310 int new = 0; 1149 int new = 0;
1311 int error; 1150
1151 if (XFS_FORCED_SHUTDOWN(mp))
1152 return -XFS_ERROR(EIO);
1312 1153
1313 offset = (xfs_off_t)iblock << inode->i_blkbits; 1154 offset = (xfs_off_t)iblock << inode->i_blkbits;
1314 ASSERT(bh_result->b_size >= (1 << inode->i_blkbits)); 1155 ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
@@ -1317,15 +1158,45 @@ __xfs_get_blocks(
1317 if (!create && direct && offset >= i_size_read(inode)) 1158 if (!create && direct && offset >= i_size_read(inode))
1318 return 0; 1159 return 0;
1319 1160
1320 if (direct && create) 1161 if (create) {
1321 flags |= BMAPI_DIRECT; 1162 lockmode = XFS_ILOCK_EXCL;
1163 xfs_ilock(ip, lockmode);
1164 } else {
1165 lockmode = xfs_ilock_map_shared(ip);
1166 }
1322 1167
1323 error = xfs_iomap(XFS_I(inode), offset, size, flags, &imap, &nimap, 1168 ASSERT(offset <= mp->m_maxioffset);
1324 &new); 1169 if (offset + size > mp->m_maxioffset)
1170 size = mp->m_maxioffset - offset;
1171 end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
1172 offset_fsb = XFS_B_TO_FSBT(mp, offset);
1173
1174 error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb,
1175 XFS_BMAPI_ENTIRE, NULL, 0, &imap, &nimaps, NULL);
1325 if (error) 1176 if (error)
1326 return -error; 1177 goto out_unlock;
1327 if (nimap == 0) 1178
1328 return 0; 1179 if (create &&
1180 (!nimaps ||
1181 (imap.br_startblock == HOLESTARTBLOCK ||
1182 imap.br_startblock == DELAYSTARTBLOCK))) {
1183 if (direct) {
1184 error = xfs_iomap_write_direct(ip, offset, size,
1185 &imap, nimaps);
1186 } else {
1187 error = xfs_iomap_write_delay(ip, offset, size, &imap);
1188 }
1189 if (error)
1190 goto out_unlock;
1191
1192 trace_xfs_get_blocks_alloc(ip, offset, size, 0, &imap);
1193 } else if (nimaps) {
1194 trace_xfs_get_blocks_found(ip, offset, size, 0, &imap);
1195 } else {
1196 trace_xfs_get_blocks_notfound(ip, offset, size);
1197 goto out_unlock;
1198 }
1199 xfs_iunlock(ip, lockmode);
1329 1200
1330 if (imap.br_startblock != HOLESTARTBLOCK && 1201 if (imap.br_startblock != HOLESTARTBLOCK &&
1331 imap.br_startblock != DELAYSTARTBLOCK) { 1202 imap.br_startblock != DELAYSTARTBLOCK) {
@@ -1392,6 +1263,10 @@ __xfs_get_blocks(
1392 } 1263 }
1393 1264
1394 return 0; 1265 return 0;
1266
1267out_unlock:
1268 xfs_iunlock(ip, lockmode);
1269 return -error;
1395} 1270}
1396 1271
1397int 1272int
@@ -1420,7 +1295,7 @@ xfs_get_blocks_direct(
1420 * If the private argument is non-NULL __xfs_get_blocks signals us that we 1295 * If the private argument is non-NULL __xfs_get_blocks signals us that we
1421 * need to issue a transaction to convert the range from unwritten to written 1296 * need to issue a transaction to convert the range from unwritten to written
1422 * extents. In case this is regular synchronous I/O we just call xfs_end_io 1297 * extents. In case this is regular synchronous I/O we just call xfs_end_io
1423 * to do this and we are done. But in case this was a successfull AIO 1298 * to do this and we are done. But in case this was a successful AIO
1424 * request this handler is called from interrupt context, from which we 1299 * request this handler is called from interrupt context, from which we
1425 * can't start transactions. In that case offload the I/O completion to 1300 * can't start transactions. In that case offload the I/O completion to
1426 * the workqueues we also use for buffered I/O completion. 1301 * the workqueues we also use for buffered I/O completion.
@@ -1479,7 +1354,7 @@ xfs_vm_direct_IO(
1479 ssize_t ret; 1354 ssize_t ret;
1480 1355
1481 if (rw & WRITE) { 1356 if (rw & WRITE) {
1482 iocb->private = xfs_alloc_ioend(inode, IO_NEW); 1357 iocb->private = xfs_alloc_ioend(inode, IO_DIRECT);
1483 1358
1484 ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, 1359 ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
1485 offset, nr_segs, 1360 offset, nr_segs,
@@ -1505,11 +1380,42 @@ xfs_vm_write_failed(
1505 struct inode *inode = mapping->host; 1380 struct inode *inode = mapping->host;
1506 1381
1507 if (to > inode->i_size) { 1382 if (to > inode->i_size) {
1508 struct iattr ia = { 1383 /*
1509 .ia_valid = ATTR_SIZE | ATTR_FORCE, 1384 * punch out the delalloc blocks we have already allocated. We
1510 .ia_size = inode->i_size, 1385 * don't call xfs_setattr() to do this as we may be in the
1511 }; 1386 * middle of a multi-iovec write and so the vfs inode->i_size
1512 xfs_setattr(XFS_I(inode), &ia, XFS_ATTR_NOLOCK); 1387 * will not match the xfs ip->i_size and so it will zero too
1388 * much. Hence we jus truncate the page cache to zero what is
1389 * necessary and punch the delalloc blocks directly.
1390 */
1391 struct xfs_inode *ip = XFS_I(inode);
1392 xfs_fileoff_t start_fsb;
1393 xfs_fileoff_t end_fsb;
1394 int error;
1395
1396 truncate_pagecache(inode, to, inode->i_size);
1397
1398 /*
1399 * Check if there are any blocks that are outside of i_size
1400 * that need to be trimmed back.
1401 */
1402 start_fsb = XFS_B_TO_FSB(ip->i_mount, inode->i_size) + 1;
1403 end_fsb = XFS_B_TO_FSB(ip->i_mount, to);
1404 if (end_fsb <= start_fsb)
1405 return;
1406
1407 xfs_ilock(ip, XFS_ILOCK_EXCL);
1408 error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
1409 end_fsb - start_fsb);
1410 if (error) {
1411 /* something screwed, just bail */
1412 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
1413 xfs_alert(ip->i_mount,
1414 "xfs_vm_write_failed: unable to clean up ino %lld",
1415 ip->i_ino);
1416 }
1417 }
1418 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1513 } 1419 }
1514} 1420}
1515 1421
@@ -1588,7 +1494,6 @@ const struct address_space_operations xfs_address_space_operations = {
1588 .readpages = xfs_vm_readpages, 1494 .readpages = xfs_vm_readpages,
1589 .writepage = xfs_vm_writepage, 1495 .writepage = xfs_vm_writepage,
1590 .writepages = xfs_vm_writepages, 1496 .writepages = xfs_vm_writepages,
1591 .sync_page = block_sync_page,
1592 .releasepage = xfs_vm_releasepage, 1497 .releasepage = xfs_vm_releasepage,
1593 .invalidatepage = xfs_vm_invalidatepage, 1498 .invalidatepage = xfs_vm_invalidatepage,
1594 .write_begin = xfs_vm_write_begin, 1499 .write_begin = xfs_vm_write_begin,
diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h
index c5057fb6237a..71f721e1a71f 100644
--- a/fs/xfs/linux-2.6/xfs_aops.h
+++ b/fs/xfs/linux-2.6/xfs_aops.h
@@ -23,6 +23,22 @@ extern struct workqueue_struct *xfsconvertd_workqueue;
23extern mempool_t *xfs_ioend_pool; 23extern mempool_t *xfs_ioend_pool;
24 24
25/* 25/*
26 * Types of I/O for bmap clustering and I/O completion tracking.
27 */
28enum {
29 IO_DIRECT = 0, /* special case for direct I/O ioends */
30 IO_DELALLOC, /* mapping covers delalloc region */
31 IO_UNWRITTEN, /* mapping covers allocated but uninitialized data */
32 IO_OVERWRITE, /* mapping covers already allocated extent */
33};
34
35#define XFS_IO_TYPES \
36 { 0, "" }, \
37 { IO_DELALLOC, "delalloc" }, \
38 { IO_UNWRITTEN, "unwritten" }, \
39 { IO_OVERWRITE, "overwrite" }
40
41/*
26 * xfs_ioend struct manages large extent writes for XFS. 42 * xfs_ioend struct manages large extent writes for XFS.
27 * It can manage several multi-page bio's at once. 43 * It can manage several multi-page bio's at once.
28 */ 44 */
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 286e36e21dae..5e68099db2a5 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -33,7 +33,6 @@
33#include <linux/migrate.h> 33#include <linux/migrate.h>
34#include <linux/backing-dev.h> 34#include <linux/backing-dev.h>
35#include <linux/freezer.h> 35#include <linux/freezer.h>
36#include <linux/list_sort.h>
37 36
38#include "xfs_sb.h" 37#include "xfs_sb.h"
39#include "xfs_inum.h" 38#include "xfs_inum.h"
@@ -44,12 +43,7 @@
44 43
45static kmem_zone_t *xfs_buf_zone; 44static kmem_zone_t *xfs_buf_zone;
46STATIC int xfsbufd(void *); 45STATIC int xfsbufd(void *);
47STATIC int xfsbufd_wakeup(struct shrinker *, int, gfp_t);
48STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int); 46STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int);
49static struct shrinker xfs_buf_shake = {
50 .shrink = xfsbufd_wakeup,
51 .seeks = DEFAULT_SEEKS,
52};
53 47
54static struct workqueue_struct *xfslogd_workqueue; 48static struct workqueue_struct *xfslogd_workqueue;
55struct workqueue_struct *xfsdatad_workqueue; 49struct workqueue_struct *xfsdatad_workqueue;
@@ -99,77 +93,79 @@ xfs_buf_vmap_len(
99} 93}
100 94
101/* 95/*
102 * Page Region interfaces. 96 * xfs_buf_lru_add - add a buffer to the LRU.
103 * 97 *
104 * For pages in filesystems where the blocksize is smaller than the 98 * The LRU takes a new reference to the buffer so that it will only be freed
105 * pagesize, we use the page->private field (long) to hold a bitmap 99 * once the shrinker takes the buffer off the LRU.
106 * of uptodate regions within the page.
107 *
108 * Each such region is "bytes per page / bits per long" bytes long.
109 *
110 * NBPPR == number-of-bytes-per-page-region
111 * BTOPR == bytes-to-page-region (rounded up)
112 * BTOPRT == bytes-to-page-region-truncated (rounded down)
113 */ 100 */
114#if (BITS_PER_LONG == 32) 101STATIC void
115#define PRSHIFT (PAGE_CACHE_SHIFT - 5) /* (32 == 1<<5) */ 102xfs_buf_lru_add(
116#elif (BITS_PER_LONG == 64) 103 struct xfs_buf *bp)
117#define PRSHIFT (PAGE_CACHE_SHIFT - 6) /* (64 == 1<<6) */
118#else
119#error BITS_PER_LONG must be 32 or 64
120#endif
121#define NBPPR (PAGE_CACHE_SIZE/BITS_PER_LONG)
122#define BTOPR(b) (((unsigned int)(b) + (NBPPR - 1)) >> PRSHIFT)
123#define BTOPRT(b) (((unsigned int)(b) >> PRSHIFT))
124
125STATIC unsigned long
126page_region_mask(
127 size_t offset,
128 size_t length)
129{ 104{
130 unsigned long mask; 105 struct xfs_buftarg *btp = bp->b_target;
131 int first, final;
132
133 first = BTOPR(offset);
134 final = BTOPRT(offset + length - 1);
135 first = min(first, final);
136
137 mask = ~0UL;
138 mask <<= BITS_PER_LONG - (final - first);
139 mask >>= BITS_PER_LONG - (final);
140
141 ASSERT(offset + length <= PAGE_CACHE_SIZE);
142 ASSERT((final - first) < BITS_PER_LONG && (final - first) >= 0);
143 106
144 return mask; 107 spin_lock(&btp->bt_lru_lock);
108 if (list_empty(&bp->b_lru)) {
109 atomic_inc(&bp->b_hold);
110 list_add_tail(&bp->b_lru, &btp->bt_lru);
111 btp->bt_lru_nr++;
112 }
113 spin_unlock(&btp->bt_lru_lock);
145} 114}
146 115
116/*
117 * xfs_buf_lru_del - remove a buffer from the LRU
118 *
119 * The unlocked check is safe here because it only occurs when there are not
120 * b_lru_ref counts left on the inode under the pag->pag_buf_lock. it is there
121 * to optimise the shrinker removing the buffer from the LRU and calling
122 * xfs_buf_free(). i.e. it removes an unnecessary round trip on the
123 * bt_lru_lock.
124 */
147STATIC void 125STATIC void
148set_page_region( 126xfs_buf_lru_del(
149 struct page *page, 127 struct xfs_buf *bp)
150 size_t offset,
151 size_t length)
152{ 128{
153 set_page_private(page, 129 struct xfs_buftarg *btp = bp->b_target;
154 page_private(page) | page_region_mask(offset, length));
155 if (page_private(page) == ~0UL)
156 SetPageUptodate(page);
157}
158 130
159STATIC int 131 if (list_empty(&bp->b_lru))
160test_page_region( 132 return;
161 struct page *page,
162 size_t offset,
163 size_t length)
164{
165 unsigned long mask = page_region_mask(offset, length);
166 133
167 return (mask && (page_private(page) & mask) == mask); 134 spin_lock(&btp->bt_lru_lock);
135 if (!list_empty(&bp->b_lru)) {
136 list_del_init(&bp->b_lru);
137 btp->bt_lru_nr--;
138 }
139 spin_unlock(&btp->bt_lru_lock);
168} 140}
169 141
170/* 142/*
171 * Internal xfs_buf_t object manipulation 143 * When we mark a buffer stale, we remove the buffer from the LRU and clear the
144 * b_lru_ref count so that the buffer is freed immediately when the buffer
145 * reference count falls to zero. If the buffer is already on the LRU, we need
146 * to remove the reference that LRU holds on the buffer.
147 *
148 * This prevents build-up of stale buffers on the LRU.
172 */ 149 */
150void
151xfs_buf_stale(
152 struct xfs_buf *bp)
153{
154 bp->b_flags |= XBF_STALE;
155 atomic_set(&(bp)->b_lru_ref, 0);
156 if (!list_empty(&bp->b_lru)) {
157 struct xfs_buftarg *btp = bp->b_target;
158
159 spin_lock(&btp->bt_lru_lock);
160 if (!list_empty(&bp->b_lru)) {
161 list_del_init(&bp->b_lru);
162 btp->bt_lru_nr--;
163 atomic_dec(&bp->b_hold);
164 }
165 spin_unlock(&btp->bt_lru_lock);
166 }
167 ASSERT(atomic_read(&bp->b_hold) >= 1);
168}
173 169
174STATIC void 170STATIC void
175_xfs_buf_initialize( 171_xfs_buf_initialize(
@@ -186,10 +182,12 @@ _xfs_buf_initialize(
186 182
187 memset(bp, 0, sizeof(xfs_buf_t)); 183 memset(bp, 0, sizeof(xfs_buf_t));
188 atomic_set(&bp->b_hold, 1); 184 atomic_set(&bp->b_hold, 1);
185 atomic_set(&bp->b_lru_ref, 1);
189 init_completion(&bp->b_iowait); 186 init_completion(&bp->b_iowait);
187 INIT_LIST_HEAD(&bp->b_lru);
190 INIT_LIST_HEAD(&bp->b_list); 188 INIT_LIST_HEAD(&bp->b_list);
191 INIT_LIST_HEAD(&bp->b_hash_list); 189 RB_CLEAR_NODE(&bp->b_rbnode);
192 init_MUTEX_LOCKED(&bp->b_sema); /* held, no waiters */ 190 sema_init(&bp->b_sema, 0); /* held, no waiters */
193 XB_SET_OWNER(bp); 191 XB_SET_OWNER(bp);
194 bp->b_target = target; 192 bp->b_target = target;
195 bp->b_file_offset = range_base; 193 bp->b_file_offset = range_base;
@@ -262,9 +260,9 @@ xfs_buf_free(
262{ 260{
263 trace_xfs_buf_free(bp, _RET_IP_); 261 trace_xfs_buf_free(bp, _RET_IP_);
264 262
265 ASSERT(list_empty(&bp->b_hash_list)); 263 ASSERT(list_empty(&bp->b_lru));
266 264
267 if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) { 265 if (bp->b_flags & _XBF_PAGES) {
268 uint i; 266 uint i;
269 267
270 if (xfs_buf_is_vmapped(bp)) 268 if (xfs_buf_is_vmapped(bp))
@@ -274,56 +272,77 @@ xfs_buf_free(
274 for (i = 0; i < bp->b_page_count; i++) { 272 for (i = 0; i < bp->b_page_count; i++) {
275 struct page *page = bp->b_pages[i]; 273 struct page *page = bp->b_pages[i];
276 274
277 if (bp->b_flags & _XBF_PAGE_CACHE) 275 __free_page(page);
278 ASSERT(!PagePrivate(page));
279 page_cache_release(page);
280 } 276 }
281 } 277 } else if (bp->b_flags & _XBF_KMEM)
278 kmem_free(bp->b_addr);
282 _xfs_buf_free_pages(bp); 279 _xfs_buf_free_pages(bp);
283 xfs_buf_deallocate(bp); 280 xfs_buf_deallocate(bp);
284} 281}
285 282
286/* 283/*
287 * Finds all pages for buffer in question and builds it's page list. 284 * Allocates all the pages for buffer in question and builds it's page list.
288 */ 285 */
289STATIC int 286STATIC int
290_xfs_buf_lookup_pages( 287xfs_buf_allocate_memory(
291 xfs_buf_t *bp, 288 xfs_buf_t *bp,
292 uint flags) 289 uint flags)
293{ 290{
294 struct address_space *mapping = bp->b_target->bt_mapping;
295 size_t blocksize = bp->b_target->bt_bsize;
296 size_t size = bp->b_count_desired; 291 size_t size = bp->b_count_desired;
297 size_t nbytes, offset; 292 size_t nbytes, offset;
298 gfp_t gfp_mask = xb_to_gfp(flags); 293 gfp_t gfp_mask = xb_to_gfp(flags);
299 unsigned short page_count, i; 294 unsigned short page_count, i;
300 pgoff_t first;
301 xfs_off_t end; 295 xfs_off_t end;
302 int error; 296 int error;
303 297
298 /*
299 * for buffers that are contained within a single page, just allocate
300 * the memory from the heap - there's no need for the complexity of
301 * page arrays to keep allocation down to order 0.
302 */
303 if (bp->b_buffer_length < PAGE_SIZE) {
304 bp->b_addr = kmem_alloc(bp->b_buffer_length, xb_to_km(flags));
305 if (!bp->b_addr) {
306 /* low memory - use alloc_page loop instead */
307 goto use_alloc_page;
308 }
309
310 if (((unsigned long)(bp->b_addr + bp->b_buffer_length - 1) &
311 PAGE_MASK) !=
312 ((unsigned long)bp->b_addr & PAGE_MASK)) {
313 /* b_addr spans two pages - use alloc_page instead */
314 kmem_free(bp->b_addr);
315 bp->b_addr = NULL;
316 goto use_alloc_page;
317 }
318 bp->b_offset = offset_in_page(bp->b_addr);
319 bp->b_pages = bp->b_page_array;
320 bp->b_pages[0] = virt_to_page(bp->b_addr);
321 bp->b_page_count = 1;
322 bp->b_flags |= XBF_MAPPED | _XBF_KMEM;
323 return 0;
324 }
325
326use_alloc_page:
304 end = bp->b_file_offset + bp->b_buffer_length; 327 end = bp->b_file_offset + bp->b_buffer_length;
305 page_count = xfs_buf_btoc(end) - xfs_buf_btoct(bp->b_file_offset); 328 page_count = xfs_buf_btoc(end) - xfs_buf_btoct(bp->b_file_offset);
306
307 error = _xfs_buf_get_pages(bp, page_count, flags); 329 error = _xfs_buf_get_pages(bp, page_count, flags);
308 if (unlikely(error)) 330 if (unlikely(error))
309 return error; 331 return error;
310 bp->b_flags |= _XBF_PAGE_CACHE;
311 332
312 offset = bp->b_offset; 333 offset = bp->b_offset;
313 first = bp->b_file_offset >> PAGE_CACHE_SHIFT; 334 bp->b_flags |= _XBF_PAGES;
314 335
315 for (i = 0; i < bp->b_page_count; i++) { 336 for (i = 0; i < bp->b_page_count; i++) {
316 struct page *page; 337 struct page *page;
317 uint retries = 0; 338 uint retries = 0;
318 339retry:
319 retry: 340 page = alloc_page(gfp_mask);
320 page = find_or_create_page(mapping, first + i, gfp_mask);
321 if (unlikely(page == NULL)) { 341 if (unlikely(page == NULL)) {
322 if (flags & XBF_READ_AHEAD) { 342 if (flags & XBF_READ_AHEAD) {
323 bp->b_page_count = i; 343 bp->b_page_count = i;
324 for (i = 0; i < bp->b_page_count; i++) 344 error = ENOMEM;
325 unlock_page(bp->b_pages[i]); 345 goto out_free_pages;
326 return -ENOMEM;
327 } 346 }
328 347
329 /* 348 /*
@@ -333,65 +352,55 @@ _xfs_buf_lookup_pages(
333 * handle buffer allocation failures we can't do much. 352 * handle buffer allocation failures we can't do much.
334 */ 353 */
335 if (!(++retries % 100)) 354 if (!(++retries % 100))
336 printk(KERN_ERR 355 xfs_err(NULL,
337 "XFS: possible memory allocation " 356 "possible memory allocation deadlock in %s (mode:0x%x)",
338 "deadlock in %s (mode:0x%x)\n",
339 __func__, gfp_mask); 357 __func__, gfp_mask);
340 358
341 XFS_STATS_INC(xb_page_retries); 359 XFS_STATS_INC(xb_page_retries);
342 xfsbufd_wakeup(NULL, 0, gfp_mask);
343 congestion_wait(BLK_RW_ASYNC, HZ/50); 360 congestion_wait(BLK_RW_ASYNC, HZ/50);
344 goto retry; 361 goto retry;
345 } 362 }
346 363
347 XFS_STATS_INC(xb_page_found); 364 XFS_STATS_INC(xb_page_found);
348 365
349 nbytes = min_t(size_t, size, PAGE_CACHE_SIZE - offset); 366 nbytes = min_t(size_t, size, PAGE_SIZE - offset);
350 size -= nbytes; 367 size -= nbytes;
351
352 ASSERT(!PagePrivate(page));
353 if (!PageUptodate(page)) {
354 page_count--;
355 if (blocksize >= PAGE_CACHE_SIZE) {
356 if (flags & XBF_READ)
357 bp->b_flags |= _XBF_PAGE_LOCKED;
358 } else if (!PagePrivate(page)) {
359 if (test_page_region(page, offset, nbytes))
360 page_count++;
361 }
362 }
363
364 bp->b_pages[i] = page; 368 bp->b_pages[i] = page;
365 offset = 0; 369 offset = 0;
366 } 370 }
371 return 0;
367 372
368 if (!(bp->b_flags & _XBF_PAGE_LOCKED)) { 373out_free_pages:
369 for (i = 0; i < bp->b_page_count; i++) 374 for (i = 0; i < bp->b_page_count; i++)
370 unlock_page(bp->b_pages[i]); 375 __free_page(bp->b_pages[i]);
371 }
372
373 if (page_count == bp->b_page_count)
374 bp->b_flags |= XBF_DONE;
375
376 return error; 376 return error;
377} 377}
378 378
379/* 379/*
380 * Map buffer into kernel address-space if nessecary. 380 * Map buffer into kernel address-space if necessary.
381 */ 381 */
382STATIC int 382STATIC int
383_xfs_buf_map_pages( 383_xfs_buf_map_pages(
384 xfs_buf_t *bp, 384 xfs_buf_t *bp,
385 uint flags) 385 uint flags)
386{ 386{
387 /* A single page buffer is always mappable */ 387 ASSERT(bp->b_flags & _XBF_PAGES);
388 if (bp->b_page_count == 1) { 388 if (bp->b_page_count == 1) {
389 /* A single page buffer is always mappable */
389 bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset; 390 bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
390 bp->b_flags |= XBF_MAPPED; 391 bp->b_flags |= XBF_MAPPED;
391 } else if (flags & XBF_MAPPED) { 392 } else if (flags & XBF_MAPPED) {
392 bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count, 393 int retried = 0;
393 -1, PAGE_KERNEL); 394
394 if (unlikely(bp->b_addr == NULL)) 395 do {
396 bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
397 -1, PAGE_KERNEL);
398 if (bp->b_addr)
399 break;
400 vm_unmap_aliases();
401 } while (retried++ <= 1);
402
403 if (!bp->b_addr)
395 return -ENOMEM; 404 return -ENOMEM;
396 bp->b_addr += bp->b_offset; 405 bp->b_addr += bp->b_offset;
397 bp->b_flags |= XBF_MAPPED; 406 bp->b_flags |= XBF_MAPPED;
@@ -422,8 +431,10 @@ _xfs_buf_find(
422{ 431{
423 xfs_off_t range_base; 432 xfs_off_t range_base;
424 size_t range_length; 433 size_t range_length;
425 xfs_bufhash_t *hash; 434 struct xfs_perag *pag;
426 xfs_buf_t *bp, *n; 435 struct rb_node **rbp;
436 struct rb_node *parent;
437 xfs_buf_t *bp;
427 438
428 range_base = (ioff << BBSHIFT); 439 range_base = (ioff << BBSHIFT);
429 range_length = (isize << BBSHIFT); 440 range_length = (isize << BBSHIFT);
@@ -432,14 +443,37 @@ _xfs_buf_find(
432 ASSERT(!(range_length < (1 << btp->bt_sshift))); 443 ASSERT(!(range_length < (1 << btp->bt_sshift)));
433 ASSERT(!(range_base & (xfs_off_t)btp->bt_smask)); 444 ASSERT(!(range_base & (xfs_off_t)btp->bt_smask));
434 445
435 hash = &btp->bt_hash[hash_long((unsigned long)ioff, btp->bt_hashshift)]; 446 /* get tree root */
436 447 pag = xfs_perag_get(btp->bt_mount,
437 spin_lock(&hash->bh_lock); 448 xfs_daddr_to_agno(btp->bt_mount, ioff));
438 449
439 list_for_each_entry_safe(bp, n, &hash->bh_list, b_hash_list) { 450 /* walk tree */
440 ASSERT(btp == bp->b_target); 451 spin_lock(&pag->pag_buf_lock);
441 if (bp->b_file_offset == range_base && 452 rbp = &pag->pag_buf_tree.rb_node;
442 bp->b_buffer_length == range_length) { 453 parent = NULL;
454 bp = NULL;
455 while (*rbp) {
456 parent = *rbp;
457 bp = rb_entry(parent, struct xfs_buf, b_rbnode);
458
459 if (range_base < bp->b_file_offset)
460 rbp = &(*rbp)->rb_left;
461 else if (range_base > bp->b_file_offset)
462 rbp = &(*rbp)->rb_right;
463 else {
464 /*
465 * found a block offset match. If the range doesn't
466 * match, the only way this is allowed is if the buffer
467 * in the cache is stale and the transaction that made
468 * it stale has not yet committed. i.e. we are
469 * reallocating a busy extent. Skip this buffer and
470 * continue searching to the right for an exact match.
471 */
472 if (bp->b_buffer_length != range_length) {
473 ASSERT(bp->b_flags & XBF_STALE);
474 rbp = &(*rbp)->rb_right;
475 continue;
476 }
443 atomic_inc(&bp->b_hold); 477 atomic_inc(&bp->b_hold);
444 goto found; 478 goto found;
445 } 479 }
@@ -449,46 +483,42 @@ _xfs_buf_find(
449 if (new_bp) { 483 if (new_bp) {
450 _xfs_buf_initialize(new_bp, btp, range_base, 484 _xfs_buf_initialize(new_bp, btp, range_base,
451 range_length, flags); 485 range_length, flags);
452 new_bp->b_hash = hash; 486 rb_link_node(&new_bp->b_rbnode, parent, rbp);
453 list_add(&new_bp->b_hash_list, &hash->bh_list); 487 rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree);
488 /* the buffer keeps the perag reference until it is freed */
489 new_bp->b_pag = pag;
490 spin_unlock(&pag->pag_buf_lock);
454 } else { 491 } else {
455 XFS_STATS_INC(xb_miss_locked); 492 XFS_STATS_INC(xb_miss_locked);
493 spin_unlock(&pag->pag_buf_lock);
494 xfs_perag_put(pag);
456 } 495 }
457
458 spin_unlock(&hash->bh_lock);
459 return new_bp; 496 return new_bp;
460 497
461found: 498found:
462 spin_unlock(&hash->bh_lock); 499 spin_unlock(&pag->pag_buf_lock);
500 xfs_perag_put(pag);
463 501
464 /* Attempt to get the semaphore without sleeping, 502 if (xfs_buf_cond_lock(bp)) {
465 * if this does not work then we need to drop the 503 /* failed, so wait for the lock if requested. */
466 * spinlock and do a hard attempt on the semaphore.
467 */
468 if (down_trylock(&bp->b_sema)) {
469 if (!(flags & XBF_TRYLOCK)) { 504 if (!(flags & XBF_TRYLOCK)) {
470 /* wait for buffer ownership */
471 xfs_buf_lock(bp); 505 xfs_buf_lock(bp);
472 XFS_STATS_INC(xb_get_locked_waited); 506 XFS_STATS_INC(xb_get_locked_waited);
473 } else { 507 } else {
474 /* We asked for a trylock and failed, no need
475 * to look at file offset and length here, we
476 * know that this buffer at least overlaps our
477 * buffer and is locked, therefore our buffer
478 * either does not exist, or is this buffer.
479 */
480 xfs_buf_rele(bp); 508 xfs_buf_rele(bp);
481 XFS_STATS_INC(xb_busy_locked); 509 XFS_STATS_INC(xb_busy_locked);
482 return NULL; 510 return NULL;
483 } 511 }
484 } else {
485 /* trylock worked */
486 XB_SET_OWNER(bp);
487 } 512 }
488 513
514 /*
515 * if the buffer is stale, clear all the external state associated with
516 * it. We need to keep flags such as how we allocated the buffer memory
517 * intact here.
518 */
489 if (bp->b_flags & XBF_STALE) { 519 if (bp->b_flags & XBF_STALE) {
490 ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); 520 ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
491 bp->b_flags &= XBF_MAPPED; 521 bp->b_flags &= XBF_MAPPED | _XBF_KMEM | _XBF_PAGES;
492 } 522 }
493 523
494 trace_xfs_buf_find(bp, flags, _RET_IP_); 524 trace_xfs_buf_find(bp, flags, _RET_IP_);
@@ -509,7 +539,7 @@ xfs_buf_get(
509 xfs_buf_flags_t flags) 539 xfs_buf_flags_t flags)
510{ 540{
511 xfs_buf_t *bp, *new_bp; 541 xfs_buf_t *bp, *new_bp;
512 int error = 0, i; 542 int error = 0;
513 543
514 new_bp = xfs_buf_allocate(flags); 544 new_bp = xfs_buf_allocate(flags);
515 if (unlikely(!new_bp)) 545 if (unlikely(!new_bp))
@@ -517,7 +547,7 @@ xfs_buf_get(
517 547
518 bp = _xfs_buf_find(target, ioff, isize, flags, new_bp); 548 bp = _xfs_buf_find(target, ioff, isize, flags, new_bp);
519 if (bp == new_bp) { 549 if (bp == new_bp) {
520 error = _xfs_buf_lookup_pages(bp, flags); 550 error = xfs_buf_allocate_memory(bp, flags);
521 if (error) 551 if (error)
522 goto no_buffer; 552 goto no_buffer;
523 } else { 553 } else {
@@ -526,14 +556,11 @@ xfs_buf_get(
526 return NULL; 556 return NULL;
527 } 557 }
528 558
529 for (i = 0; i < bp->b_page_count; i++)
530 mark_page_accessed(bp->b_pages[i]);
531
532 if (!(bp->b_flags & XBF_MAPPED)) { 559 if (!(bp->b_flags & XBF_MAPPED)) {
533 error = _xfs_buf_map_pages(bp, flags); 560 error = _xfs_buf_map_pages(bp, flags);
534 if (unlikely(error)) { 561 if (unlikely(error)) {
535 printk(KERN_WARNING "%s: failed to map pages\n", 562 xfs_warn(target->bt_mount,
536 __func__); 563 "%s: failed to map pages\n", __func__);
537 goto no_buffer; 564 goto no_buffer;
538 } 565 }
539 } 566 }
@@ -625,17 +652,47 @@ void
625xfs_buf_readahead( 652xfs_buf_readahead(
626 xfs_buftarg_t *target, 653 xfs_buftarg_t *target,
627 xfs_off_t ioff, 654 xfs_off_t ioff,
628 size_t isize, 655 size_t isize)
629 xfs_buf_flags_t flags)
630{ 656{
631 struct backing_dev_info *bdi; 657 if (bdi_read_congested(target->bt_bdi))
632
633 bdi = target->bt_mapping->backing_dev_info;
634 if (bdi_read_congested(bdi))
635 return; 658 return;
636 659
637 flags |= (XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD); 660 xfs_buf_read(target, ioff, isize,
638 xfs_buf_read(target, ioff, isize, flags); 661 XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD|XBF_DONT_BLOCK);
662}
663
664/*
665 * Read an uncached buffer from disk. Allocates and returns a locked
666 * buffer containing the disk contents or nothing.
667 */
668struct xfs_buf *
669xfs_buf_read_uncached(
670 struct xfs_mount *mp,
671 struct xfs_buftarg *target,
672 xfs_daddr_t daddr,
673 size_t length,
674 int flags)
675{
676 xfs_buf_t *bp;
677 int error;
678
679 bp = xfs_buf_get_uncached(target, length, flags);
680 if (!bp)
681 return NULL;
682
683 /* set up the buffer for a read IO */
684 xfs_buf_lock(bp);
685 XFS_BUF_SET_ADDR(bp, daddr);
686 XFS_BUF_READ(bp);
687 XFS_BUF_BUSY(bp);
688
689 xfsbdstrat(mp, bp);
690 error = xfs_buf_iowait(bp);
691 if (error || bp->b_error) {
692 xfs_buf_relse(bp);
693 return NULL;
694 }
695 return bp;
639} 696}
640 697
641xfs_buf_t * 698xfs_buf_t *
@@ -651,6 +708,27 @@ xfs_buf_get_empty(
651 return bp; 708 return bp;
652} 709}
653 710
711/*
 712 * Return a buffer allocated as an empty buffer and associated with external
 713 * memory via xfs_buf_associate_memory() back to its empty state.
714 */
715void
716xfs_buf_set_empty(
717 struct xfs_buf *bp,
718 size_t len)
719{
720 if (bp->b_pages)
721 _xfs_buf_free_pages(bp);
722
723 bp->b_pages = NULL;
724 bp->b_page_count = 0;
725 bp->b_addr = NULL;
726 bp->b_file_offset = 0;
727 bp->b_buffer_length = bp->b_count_desired = len;
728 bp->b_bn = XFS_BUF_DADDR_NULL;
729 bp->b_flags &= ~XBF_MAPPED;
730}
731
654static inline struct page * 732static inline struct page *
655mem_to_page( 733mem_to_page(
656 void *addr) 734 void *addr)
@@ -675,10 +753,10 @@ xfs_buf_associate_memory(
675 size_t buflen; 753 size_t buflen;
676 int page_count; 754 int page_count;
677 755
678 pageaddr = (unsigned long)mem & PAGE_CACHE_MASK; 756 pageaddr = (unsigned long)mem & PAGE_MASK;
679 offset = (unsigned long)mem - pageaddr; 757 offset = (unsigned long)mem - pageaddr;
680 buflen = PAGE_CACHE_ALIGN(len + offset); 758 buflen = PAGE_ALIGN(len + offset);
681 page_count = buflen >> PAGE_CACHE_SHIFT; 759 page_count = buflen >> PAGE_SHIFT;
682 760
683 /* Free any previous set of page pointers */ 761 /* Free any previous set of page pointers */
684 if (bp->b_pages) 762 if (bp->b_pages)
@@ -695,21 +773,21 @@ xfs_buf_associate_memory(
695 773
696 for (i = 0; i < bp->b_page_count; i++) { 774 for (i = 0; i < bp->b_page_count; i++) {
697 bp->b_pages[i] = mem_to_page((void *)pageaddr); 775 bp->b_pages[i] = mem_to_page((void *)pageaddr);
698 pageaddr += PAGE_CACHE_SIZE; 776 pageaddr += PAGE_SIZE;
699 } 777 }
700 778
701 bp->b_count_desired = len; 779 bp->b_count_desired = len;
702 bp->b_buffer_length = buflen; 780 bp->b_buffer_length = buflen;
703 bp->b_flags |= XBF_MAPPED; 781 bp->b_flags |= XBF_MAPPED;
704 bp->b_flags &= ~_XBF_PAGE_LOCKED;
705 782
706 return 0; 783 return 0;
707} 784}
708 785
709xfs_buf_t * 786xfs_buf_t *
710xfs_buf_get_noaddr( 787xfs_buf_get_uncached(
788 struct xfs_buftarg *target,
711 size_t len, 789 size_t len,
712 xfs_buftarg_t *target) 790 int flags)
713{ 791{
714 unsigned long page_count = PAGE_ALIGN(len) >> PAGE_SHIFT; 792 unsigned long page_count = PAGE_ALIGN(len) >> PAGE_SHIFT;
715 int error, i; 793 int error, i;
@@ -725,7 +803,7 @@ xfs_buf_get_noaddr(
725 goto fail_free_buf; 803 goto fail_free_buf;
726 804
727 for (i = 0; i < page_count; i++) { 805 for (i = 0; i < page_count; i++) {
728 bp->b_pages[i] = alloc_page(GFP_KERNEL); 806 bp->b_pages[i] = alloc_page(xb_to_gfp(flags));
729 if (!bp->b_pages[i]) 807 if (!bp->b_pages[i])
730 goto fail_free_mem; 808 goto fail_free_mem;
731 } 809 }
@@ -733,14 +811,14 @@ xfs_buf_get_noaddr(
733 811
734 error = _xfs_buf_map_pages(bp, XBF_MAPPED); 812 error = _xfs_buf_map_pages(bp, XBF_MAPPED);
735 if (unlikely(error)) { 813 if (unlikely(error)) {
736 printk(KERN_WARNING "%s: failed to map pages\n", 814 xfs_warn(target->bt_mount,
737 __func__); 815 "%s: failed to map pages\n", __func__);
738 goto fail_free_mem; 816 goto fail_free_mem;
739 } 817 }
740 818
741 xfs_buf_unlock(bp); 819 xfs_buf_unlock(bp);
742 820
743 trace_xfs_buf_get_noaddr(bp, _RET_IP_); 821 trace_xfs_buf_get_uncached(bp, _RET_IP_);
744 return bp; 822 return bp;
745 823
746 fail_free_mem: 824 fail_free_mem:
@@ -774,29 +852,32 @@ void
774xfs_buf_rele( 852xfs_buf_rele(
775 xfs_buf_t *bp) 853 xfs_buf_t *bp)
776{ 854{
777 xfs_bufhash_t *hash = bp->b_hash; 855 struct xfs_perag *pag = bp->b_pag;
778 856
779 trace_xfs_buf_rele(bp, _RET_IP_); 857 trace_xfs_buf_rele(bp, _RET_IP_);
780 858
781 if (unlikely(!hash)) { 859 if (!pag) {
782 ASSERT(!bp->b_relse); 860 ASSERT(list_empty(&bp->b_lru));
861 ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
783 if (atomic_dec_and_test(&bp->b_hold)) 862 if (atomic_dec_and_test(&bp->b_hold))
784 xfs_buf_free(bp); 863 xfs_buf_free(bp);
785 return; 864 return;
786 } 865 }
787 866
867 ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));
868
788 ASSERT(atomic_read(&bp->b_hold) > 0); 869 ASSERT(atomic_read(&bp->b_hold) > 0);
789 if (atomic_dec_and_lock(&bp->b_hold, &hash->bh_lock)) { 870 if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
790 if (bp->b_relse) { 871 if (!(bp->b_flags & XBF_STALE) &&
791 atomic_inc(&bp->b_hold); 872 atomic_read(&bp->b_lru_ref)) {
792 spin_unlock(&hash->bh_lock); 873 xfs_buf_lru_add(bp);
793 (*(bp->b_relse)) (bp); 874 spin_unlock(&pag->pag_buf_lock);
794 } else if (bp->b_flags & XBF_FS_MANAGED) {
795 spin_unlock(&hash->bh_lock);
796 } else { 875 } else {
876 xfs_buf_lru_del(bp);
797 ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q))); 877 ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)));
798 list_del_init(&bp->b_hash_list); 878 rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
799 spin_unlock(&hash->bh_lock); 879 spin_unlock(&pag->pag_buf_lock);
880 xfs_perag_put(pag);
800 xfs_buf_free(bp); 881 xfs_buf_free(bp);
801 } 882 }
802 } 883 }
@@ -804,20 +885,15 @@ xfs_buf_rele(
804 885
805 886
806/* 887/*
807 * Mutual exclusion on buffers. Locking model: 888 * Lock a buffer object, if it is not already locked.
808 * 889 *
809 * Buffers associated with inodes for which buffer locking 890 * If we come across a stale, pinned, locked buffer, we know that we are
810 * is not enabled are not protected by semaphores, and are 891 * being asked to lock a buffer that has been reallocated. Because it is
811 * assumed to be exclusively owned by the caller. There is a 892 * pinned, we know that the log has not been pushed to disk and hence it
812 * spinlock in the buffer, used by the caller when concurrent 893 * will still be locked. Rather than continuing to have trylock attempts
813 * access is possible. 894 * fail until someone else pushes the log, push it ourselves before
814 */ 895 * returning. This means that the xfsaild will not get stuck trying
815 896 * to push on stale inode buffers.
816/*
817 * Locks a buffer object, if it is not already locked.
818 * Note that this in no way locks the underlying pages, so it is only
819 * useful for synchronizing concurrent use of buffer objects, not for
820 * synchronizing independent access to the underlying pages.
821 */ 897 */
822int 898int
823xfs_buf_cond_lock( 899xfs_buf_cond_lock(
@@ -828,6 +904,8 @@ xfs_buf_cond_lock(
828 locked = down_trylock(&bp->b_sema) == 0; 904 locked = down_trylock(&bp->b_sema) == 0;
829 if (locked) 905 if (locked)
830 XB_SET_OWNER(bp); 906 XB_SET_OWNER(bp);
907 else if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
908 xfs_log_force(bp->b_target->bt_mount, 0);
831 909
832 trace_xfs_buf_cond_lock(bp, _RET_IP_); 910 trace_xfs_buf_cond_lock(bp, _RET_IP_);
833 return locked ? 0 : -EBUSY; 911 return locked ? 0 : -EBUSY;
@@ -841,10 +919,7 @@ xfs_buf_lock_value(
841} 919}
842 920
843/* 921/*
844 * Locks a buffer object. 922 * Lock a buffer object.
845 * Note that this in no way locks the underlying pages, so it is only
846 * useful for synchronizing concurrent use of buffer objects, not for
847 * synchronizing independent access to the underlying pages.
848 * 923 *
849 * If we come across a stale, pinned, locked buffer, we know that we 924 * If we come across a stale, pinned, locked buffer, we know that we
850 * are being asked to lock a buffer that has been reallocated. Because 925 * are being asked to lock a buffer that has been reallocated. Because
@@ -859,9 +934,7 @@ xfs_buf_lock(
859 trace_xfs_buf_lock(bp, _RET_IP_); 934 trace_xfs_buf_lock(bp, _RET_IP_);
860 935
861 if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE)) 936 if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
862 xfs_log_force(bp->b_mount, 0); 937 xfs_log_force(bp->b_target->bt_mount, 0);
863 if (atomic_read(&bp->b_io_remaining))
864 blk_run_address_space(bp->b_target->bt_mapping);
865 down(&bp->b_sema); 938 down(&bp->b_sema);
866 XB_SET_OWNER(bp); 939 XB_SET_OWNER(bp);
867 940
@@ -905,9 +978,7 @@ xfs_buf_wait_unpin(
905 set_current_state(TASK_UNINTERRUPTIBLE); 978 set_current_state(TASK_UNINTERRUPTIBLE);
906 if (atomic_read(&bp->b_pin_count) == 0) 979 if (atomic_read(&bp->b_pin_count) == 0)
907 break; 980 break;
908 if (atomic_read(&bp->b_io_remaining)) 981 io_schedule();
909 blk_run_address_space(bp->b_target->bt_mapping);
910 schedule();
911 } 982 }
912 remove_wait_queue(&bp->b_waiters, &wait); 983 remove_wait_queue(&bp->b_waiters, &wait);
913 set_current_state(TASK_RUNNING); 984 set_current_state(TASK_RUNNING);
@@ -924,19 +995,7 @@ xfs_buf_iodone_work(
924 xfs_buf_t *bp = 995 xfs_buf_t *bp =
925 container_of(work, xfs_buf_t, b_iodone_work); 996 container_of(work, xfs_buf_t, b_iodone_work);
926 997
927 /* 998 if (bp->b_iodone)
928 * We can get an EOPNOTSUPP to ordered writes. Here we clear the
929 * ordered flag and reissue them. Because we can't tell the higher
930 * layers directly that they should not issue ordered I/O anymore, they
931 * need to check if the _XFS_BARRIER_FAILED flag was set during I/O completion.
932 */
933 if ((bp->b_error == EOPNOTSUPP) &&
934 (bp->b_flags & (XBF_ORDERED|XBF_ASYNC)) == (XBF_ORDERED|XBF_ASYNC)) {
935 trace_xfs_buf_ordered_retry(bp, _RET_IP_);
936 bp->b_flags &= ~XBF_ORDERED;
937 bp->b_flags |= _XFS_BARRIER_FAILED;
938 xfs_buf_iorequest(bp);
939 } else if (bp->b_iodone)
940 (*(bp->b_iodone))(bp); 999 (*(bp->b_iodone))(bp);
941 else if (bp->b_flags & XBF_ASYNC) 1000 else if (bp->b_flags & XBF_ASYNC)
942 xfs_buf_relse(bp); 1001 xfs_buf_relse(bp);
@@ -982,7 +1041,6 @@ xfs_bwrite(
982{ 1041{
983 int error; 1042 int error;
984 1043
985 bp->b_mount = mp;
986 bp->b_flags |= XBF_WRITE; 1044 bp->b_flags |= XBF_WRITE;
987 bp->b_flags &= ~(XBF_ASYNC | XBF_READ); 1045 bp->b_flags &= ~(XBF_ASYNC | XBF_READ);
988 1046
@@ -1003,8 +1061,6 @@ xfs_bdwrite(
1003{ 1061{
1004 trace_xfs_buf_bdwrite(bp, _RET_IP_); 1062 trace_xfs_buf_bdwrite(bp, _RET_IP_);
1005 1063
1006 bp->b_mount = mp;
1007
1008 bp->b_flags &= ~XBF_READ; 1064 bp->b_flags &= ~XBF_READ;
1009 bp->b_flags |= (XBF_DELWRI | XBF_ASYNC); 1065 bp->b_flags |= (XBF_DELWRI | XBF_ASYNC);
1010 1066
@@ -1013,7 +1069,7 @@ xfs_bdwrite(
1013 1069
1014/* 1070/*
1015 * Called when we want to stop a buffer from getting written or read. 1071 * Called when we want to stop a buffer from getting written or read.
1016 * We attach the EIO error, muck with its flags, and call biodone 1072 * We attach the EIO error, muck with its flags, and call xfs_buf_ioend
1017 * so that the proper iodone callbacks get called. 1073 * so that the proper iodone callbacks get called.
1018 */ 1074 */
1019STATIC int 1075STATIC int
@@ -1030,21 +1086,21 @@ xfs_bioerror(
1030 XFS_BUF_ERROR(bp, EIO); 1086 XFS_BUF_ERROR(bp, EIO);
1031 1087
1032 /* 1088 /*
1033 * We're calling biodone, so delete XBF_DONE flag. 1089 * We're calling xfs_buf_ioend, so delete XBF_DONE flag.
1034 */ 1090 */
1035 XFS_BUF_UNREAD(bp); 1091 XFS_BUF_UNREAD(bp);
1036 XFS_BUF_UNDELAYWRITE(bp); 1092 XFS_BUF_UNDELAYWRITE(bp);
1037 XFS_BUF_UNDONE(bp); 1093 XFS_BUF_UNDONE(bp);
1038 XFS_BUF_STALE(bp); 1094 XFS_BUF_STALE(bp);
1039 1095
1040 xfs_biodone(bp); 1096 xfs_buf_ioend(bp, 0);
1041 1097
1042 return EIO; 1098 return EIO;
1043} 1099}
1044 1100
1045/* 1101/*
1046 * Same as xfs_bioerror, except that we are releasing the buffer 1102 * Same as xfs_bioerror, except that we are releasing the buffer
1047 * here ourselves, and avoiding the biodone call. 1103 * here ourselves, and avoiding the xfs_buf_ioend call.
1048 * This is meant for userdata errors; metadata bufs come with 1104 * This is meant for userdata errors; metadata bufs come with
1049 * iodone functions attached, so that we can track down errors. 1105 * iodone functions attached, so that we can track down errors.
1050 */ 1106 */
@@ -1093,7 +1149,7 @@ int
1093xfs_bdstrat_cb( 1149xfs_bdstrat_cb(
1094 struct xfs_buf *bp) 1150 struct xfs_buf *bp)
1095{ 1151{
1096 if (XFS_FORCED_SHUTDOWN(bp->b_mount)) { 1152 if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
1097 trace_xfs_bdstrat_shut(bp, _RET_IP_); 1153 trace_xfs_bdstrat_shut(bp, _RET_IP_);
1098 /* 1154 /*
1099 * Metadata write that didn't get logged but 1155 * Metadata write that didn't get logged but
@@ -1134,10 +1190,8 @@ _xfs_buf_ioend(
1134 xfs_buf_t *bp, 1190 xfs_buf_t *bp,
1135 int schedule) 1191 int schedule)
1136{ 1192{
1137 if (atomic_dec_and_test(&bp->b_io_remaining) == 1) { 1193 if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
1138 bp->b_flags &= ~_XBF_PAGE_LOCKED;
1139 xfs_buf_ioend(bp, schedule); 1194 xfs_buf_ioend(bp, schedule);
1140 }
1141} 1195}
1142 1196
1143STATIC void 1197STATIC void
@@ -1146,35 +1200,12 @@ xfs_buf_bio_end_io(
1146 int error) 1200 int error)
1147{ 1201{
1148 xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private; 1202 xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private;
1149 unsigned int blocksize = bp->b_target->bt_bsize;
1150 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1151 1203
1152 xfs_buf_ioerror(bp, -error); 1204 xfs_buf_ioerror(bp, -error);
1153 1205
1154 if (!error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ)) 1206 if (!error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
1155 invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp)); 1207 invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
1156 1208
1157 do {
1158 struct page *page = bvec->bv_page;
1159
1160 ASSERT(!PagePrivate(page));
1161 if (unlikely(bp->b_error)) {
1162 if (bp->b_flags & XBF_READ)
1163 ClearPageUptodate(page);
1164 } else if (blocksize >= PAGE_CACHE_SIZE) {
1165 SetPageUptodate(page);
1166 } else if (!PagePrivate(page) &&
1167 (bp->b_flags & _XBF_PAGE_CACHE)) {
1168 set_page_region(page, bvec->bv_offset, bvec->bv_len);
1169 }
1170
1171 if (--bvec >= bio->bi_io_vec)
1172 prefetchw(&bvec->bv_page->flags);
1173
1174 if (bp->b_flags & _XBF_PAGE_LOCKED)
1175 unlock_page(page);
1176 } while (bvec >= bio->bi_io_vec);
1177
1178 _xfs_buf_ioend(bp, 1); 1209 _xfs_buf_ioend(bp, 1);
1179 bio_put(bio); 1210 bio_put(bio);
1180} 1211}
@@ -1188,14 +1219,13 @@ _xfs_buf_ioapply(
1188 int offset = bp->b_offset; 1219 int offset = bp->b_offset;
1189 int size = bp->b_count_desired; 1220 int size = bp->b_count_desired;
1190 sector_t sector = bp->b_bn; 1221 sector_t sector = bp->b_bn;
1191 unsigned int blocksize = bp->b_target->bt_bsize;
1192 1222
1193 total_nr_pages = bp->b_page_count; 1223 total_nr_pages = bp->b_page_count;
1194 map_i = 0; 1224 map_i = 0;
1195 1225
1196 if (bp->b_flags & XBF_ORDERED) { 1226 if (bp->b_flags & XBF_ORDERED) {
1197 ASSERT(!(bp->b_flags & XBF_READ)); 1227 ASSERT(!(bp->b_flags & XBF_READ));
1198 rw = WRITE_BARRIER; 1228 rw = WRITE_FLUSH_FUA;
1199 } else if (bp->b_flags & XBF_LOG_BUFFER) { 1229 } else if (bp->b_flags & XBF_LOG_BUFFER) {
1200 ASSERT(!(bp->b_flags & XBF_READ_AHEAD)); 1230 ASSERT(!(bp->b_flags & XBF_READ_AHEAD));
1201 bp->b_flags &= ~_XBF_RUN_QUEUES; 1231 bp->b_flags &= ~_XBF_RUN_QUEUES;
@@ -1209,29 +1239,6 @@ _xfs_buf_ioapply(
1209 (bp->b_flags & XBF_READ_AHEAD) ? READA : READ; 1239 (bp->b_flags & XBF_READ_AHEAD) ? READA : READ;
1210 } 1240 }
1211 1241
1212 /* Special code path for reading a sub page size buffer in --
1213 * we populate up the whole page, and hence the other metadata
1214 * in the same page. This optimization is only valid when the
1215 * filesystem block size is not smaller than the page size.
1216 */
1217 if ((bp->b_buffer_length < PAGE_CACHE_SIZE) &&
1218 ((bp->b_flags & (XBF_READ|_XBF_PAGE_LOCKED)) ==
1219 (XBF_READ|_XBF_PAGE_LOCKED)) &&
1220 (blocksize >= PAGE_CACHE_SIZE)) {
1221 bio = bio_alloc(GFP_NOIO, 1);
1222
1223 bio->bi_bdev = bp->b_target->bt_bdev;
1224 bio->bi_sector = sector - (offset >> BBSHIFT);
1225 bio->bi_end_io = xfs_buf_bio_end_io;
1226 bio->bi_private = bp;
1227
1228 bio_add_page(bio, bp->b_pages[0], PAGE_CACHE_SIZE, 0);
1229 size = 0;
1230
1231 atomic_inc(&bp->b_io_remaining);
1232
1233 goto submit_io;
1234 }
1235 1242
1236next_chunk: 1243next_chunk:
1237 atomic_inc(&bp->b_io_remaining); 1244 atomic_inc(&bp->b_io_remaining);
@@ -1245,8 +1252,9 @@ next_chunk:
1245 bio->bi_end_io = xfs_buf_bio_end_io; 1252 bio->bi_end_io = xfs_buf_bio_end_io;
1246 bio->bi_private = bp; 1253 bio->bi_private = bp;
1247 1254
1255
1248 for (; size && nr_pages; nr_pages--, map_i++) { 1256 for (; size && nr_pages; nr_pages--, map_i++) {
1249 int rbytes, nbytes = PAGE_CACHE_SIZE - offset; 1257 int rbytes, nbytes = PAGE_SIZE - offset;
1250 1258
1251 if (nbytes > size) 1259 if (nbytes > size)
1252 nbytes = size; 1260 nbytes = size;
@@ -1261,7 +1269,6 @@ next_chunk:
1261 total_nr_pages--; 1269 total_nr_pages--;
1262 } 1270 }
1263 1271
1264submit_io:
1265 if (likely(bio->bi_size)) { 1272 if (likely(bio->bi_size)) {
1266 if (xfs_buf_is_vmapped(bp)) { 1273 if (xfs_buf_is_vmapped(bp)) {
1267 flush_kernel_vmap_range(bp->b_addr, 1274 flush_kernel_vmap_range(bp->b_addr,
@@ -1271,18 +1278,7 @@ submit_io:
1271 if (size) 1278 if (size)
1272 goto next_chunk; 1279 goto next_chunk;
1273 } else { 1280 } else {
1274 /*
1275 * if we get here, no pages were added to the bio. However,
1276 * we can't just error out here - if the pages are locked then
1277 * we have to unlock them otherwise we can hang on a later
1278 * access to the page.
1279 */
1280 xfs_buf_ioerror(bp, EIO); 1281 xfs_buf_ioerror(bp, EIO);
1281 if (bp->b_flags & _XBF_PAGE_LOCKED) {
1282 int i;
1283 for (i = 0; i < bp->b_page_count; i++)
1284 unlock_page(bp->b_pages[i]);
1285 }
1286 bio_put(bio); 1282 bio_put(bio);
1287 } 1283 }
1288} 1284}
@@ -1327,8 +1323,6 @@ xfs_buf_iowait(
1327{ 1323{
1328 trace_xfs_buf_iowait(bp, _RET_IP_); 1324 trace_xfs_buf_iowait(bp, _RET_IP_);
1329 1325
1330 if (atomic_read(&bp->b_io_remaining))
1331 blk_run_address_space(bp->b_target->bt_mapping);
1332 wait_for_completion(&bp->b_iowait); 1326 wait_for_completion(&bp->b_iowait);
1333 1327
1334 trace_xfs_buf_iowait_done(bp, _RET_IP_); 1328 trace_xfs_buf_iowait_done(bp, _RET_IP_);
@@ -1346,8 +1340,8 @@ xfs_buf_offset(
1346 return XFS_BUF_PTR(bp) + offset; 1340 return XFS_BUF_PTR(bp) + offset;
1347 1341
1348 offset += bp->b_offset; 1342 offset += bp->b_offset;
1349 page = bp->b_pages[offset >> PAGE_CACHE_SHIFT]; 1343 page = bp->b_pages[offset >> PAGE_SHIFT];
1350 return (xfs_caddr_t)page_address(page) + (offset & (PAGE_CACHE_SIZE-1)); 1344 return (xfs_caddr_t)page_address(page) + (offset & (PAGE_SIZE-1));
1351} 1345}
1352 1346
1353/* 1347/*
@@ -1369,9 +1363,9 @@ xfs_buf_iomove(
1369 page = bp->b_pages[xfs_buf_btoct(boff + bp->b_offset)]; 1363 page = bp->b_pages[xfs_buf_btoct(boff + bp->b_offset)];
1370 cpoff = xfs_buf_poff(boff + bp->b_offset); 1364 cpoff = xfs_buf_poff(boff + bp->b_offset);
1371 csize = min_t(size_t, 1365 csize = min_t(size_t,
1372 PAGE_CACHE_SIZE-cpoff, bp->b_count_desired-boff); 1366 PAGE_SIZE-cpoff, bp->b_count_desired-boff);
1373 1367
1374 ASSERT(((csize + cpoff) <= PAGE_CACHE_SIZE)); 1368 ASSERT(((csize + cpoff) <= PAGE_SIZE));
1375 1369
1376 switch (mode) { 1370 switch (mode) {
1377 case XBRW_ZERO: 1371 case XBRW_ZERO:
@@ -1394,89 +1388,84 @@ xfs_buf_iomove(
1394 */ 1388 */
1395 1389
1396/* 1390/*
1397 * Wait for any bufs with callbacks that have been submitted but 1391 * Wait for any bufs with callbacks that have been submitted but have not yet
1398 * have not yet returned... walk the hash list for the target. 1392 * returned. These buffers will have an elevated hold count, so wait on those
1393 * while freeing all the buffers only held by the LRU.
1399 */ 1394 */
1400void 1395void
1401xfs_wait_buftarg( 1396xfs_wait_buftarg(
1402 xfs_buftarg_t *btp) 1397 struct xfs_buftarg *btp)
1403{ 1398{
1404 xfs_buf_t *bp, *n; 1399 struct xfs_buf *bp;
1405 xfs_bufhash_t *hash; 1400
1406 uint i; 1401restart:
1407 1402 spin_lock(&btp->bt_lru_lock);
1408 for (i = 0; i < (1 << btp->bt_hashshift); i++) { 1403 while (!list_empty(&btp->bt_lru)) {
1409 hash = &btp->bt_hash[i]; 1404 bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
1410again: 1405 if (atomic_read(&bp->b_hold) > 1) {
1411 spin_lock(&hash->bh_lock); 1406 spin_unlock(&btp->bt_lru_lock);
1412 list_for_each_entry_safe(bp, n, &hash->bh_list, b_hash_list) { 1407 delay(100);
1413 ASSERT(btp == bp->b_target); 1408 goto restart;
1414 if (!(bp->b_flags & XBF_FS_MANAGED)) {
1415 spin_unlock(&hash->bh_lock);
1416 /*
1417 * Catch superblock reference count leaks
1418 * immediately
1419 */
1420 BUG_ON(bp->b_bn == 0);
1421 delay(100);
1422 goto again;
1423 }
1424 } 1409 }
1425 spin_unlock(&hash->bh_lock); 1410 /*
1411 * clear the LRU reference count so the bufer doesn't get
1412 * ignored in xfs_buf_rele().
1413 */
1414 atomic_set(&bp->b_lru_ref, 0);
1415 spin_unlock(&btp->bt_lru_lock);
1416 xfs_buf_rele(bp);
1417 spin_lock(&btp->bt_lru_lock);
1426 } 1418 }
1419 spin_unlock(&btp->bt_lru_lock);
1427} 1420}
1428 1421
1429/* 1422int
1430 * Allocate buffer hash table for a given target. 1423xfs_buftarg_shrink(
1431 * For devices containing metadata (i.e. not the log/realtime devices) 1424 struct shrinker *shrink,
1432 * we need to allocate a much larger hash table. 1425 struct shrink_control *sc)
1433 */
1434STATIC void
1435xfs_alloc_bufhash(
1436 xfs_buftarg_t *btp,
1437 int external)
1438{ 1426{
1439 unsigned int i; 1427 struct xfs_buftarg *btp = container_of(shrink,
1428 struct xfs_buftarg, bt_shrinker);
1429 struct xfs_buf *bp;
1430 int nr_to_scan = sc->nr_to_scan;
1431 LIST_HEAD(dispose);
1440 1432
1441 btp->bt_hashshift = external ? 3 : 12; /* 8 or 4096 buckets */ 1433 if (!nr_to_scan)
1442 btp->bt_hash = kmem_zalloc_large((1 << btp->bt_hashshift) * 1434 return btp->bt_lru_nr;
1443 sizeof(xfs_bufhash_t));
1444 for (i = 0; i < (1 << btp->bt_hashshift); i++) {
1445 spin_lock_init(&btp->bt_hash[i].bh_lock);
1446 INIT_LIST_HEAD(&btp->bt_hash[i].bh_list);
1447 }
1448}
1449 1435
1450STATIC void 1436 spin_lock(&btp->bt_lru_lock);
1451xfs_free_bufhash( 1437 while (!list_empty(&btp->bt_lru)) {
1452 xfs_buftarg_t *btp) 1438 if (nr_to_scan-- <= 0)
1453{ 1439 break;
1454 kmem_free_large(btp->bt_hash);
1455 btp->bt_hash = NULL;
1456}
1457 1440
1458/* 1441 bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
1459 * buftarg list for delwrite queue processing
1460 */
1461static LIST_HEAD(xfs_buftarg_list);
1462static DEFINE_SPINLOCK(xfs_buftarg_lock);
1463 1442
1464STATIC void 1443 /*
1465xfs_register_buftarg( 1444 * Decrement the b_lru_ref count unless the value is already
1466 xfs_buftarg_t *btp) 1445 * zero. If the value is already zero, we need to reclaim the
1467{ 1446 * buffer, otherwise it gets another trip through the LRU.
1468 spin_lock(&xfs_buftarg_lock); 1447 */
1469 list_add(&btp->bt_list, &xfs_buftarg_list); 1448 if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
1470 spin_unlock(&xfs_buftarg_lock); 1449 list_move_tail(&bp->b_lru, &btp->bt_lru);
1471} 1450 continue;
1451 }
1472 1452
1473STATIC void 1453 /*
1474xfs_unregister_buftarg( 1454 * remove the buffer from the LRU now to avoid needing another
1475 xfs_buftarg_t *btp) 1455 * lock round trip inside xfs_buf_rele().
1476{ 1456 */
1477 spin_lock(&xfs_buftarg_lock); 1457 list_move(&bp->b_lru, &dispose);
1478 list_del(&btp->bt_list); 1458 btp->bt_lru_nr--;
1479 spin_unlock(&xfs_buftarg_lock); 1459 }
1460 spin_unlock(&btp->bt_lru_lock);
1461
1462 while (!list_empty(&dispose)) {
1463 bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
1464 list_del_init(&bp->b_lru);
1465 xfs_buf_rele(bp);
1466 }
1467
1468 return btp->bt_lru_nr;
1480} 1469}
1481 1470
1482void 1471void
@@ -1484,18 +1473,13 @@ xfs_free_buftarg(
1484 struct xfs_mount *mp, 1473 struct xfs_mount *mp,
1485 struct xfs_buftarg *btp) 1474 struct xfs_buftarg *btp)
1486{ 1475{
1476 unregister_shrinker(&btp->bt_shrinker);
1477
1487 xfs_flush_buftarg(btp, 1); 1478 xfs_flush_buftarg(btp, 1);
1488 if (mp->m_flags & XFS_MOUNT_BARRIER) 1479 if (mp->m_flags & XFS_MOUNT_BARRIER)
1489 xfs_blkdev_issue_flush(btp); 1480 xfs_blkdev_issue_flush(btp);
1490 xfs_free_bufhash(btp);
1491 iput(btp->bt_mapping->host);
1492 1481
1493 /* Unregister the buftarg first so that we don't get a
1494 * wakeup finding a non-existent task
1495 */
1496 xfs_unregister_buftarg(btp);
1497 kthread_stop(btp->bt_task); 1482 kthread_stop(btp->bt_task);
1498
1499 kmem_free(btp); 1483 kmem_free(btp);
1500} 1484}
1501 1485
@@ -1511,21 +1495,12 @@ xfs_setsize_buftarg_flags(
1511 btp->bt_smask = sectorsize - 1; 1495 btp->bt_smask = sectorsize - 1;
1512 1496
1513 if (set_blocksize(btp->bt_bdev, sectorsize)) { 1497 if (set_blocksize(btp->bt_bdev, sectorsize)) {
1514 printk(KERN_WARNING 1498 xfs_warn(btp->bt_mount,
1515 "XFS: Cannot set_blocksize to %u on device %s\n", 1499 "Cannot set_blocksize to %u on device %s\n",
1516 sectorsize, XFS_BUFTARG_NAME(btp)); 1500 sectorsize, XFS_BUFTARG_NAME(btp));
1517 return EINVAL; 1501 return EINVAL;
1518 } 1502 }
1519 1503
1520 if (verbose &&
1521 (PAGE_CACHE_SIZE / BITS_PER_LONG) > sectorsize) {
1522 printk(KERN_WARNING
1523 "XFS: %u byte sectors in use on device %s. "
1524 "This is suboptimal; %u or greater is ideal.\n",
1525 sectorsize, XFS_BUFTARG_NAME(btp),
1526 (unsigned int)PAGE_CACHE_SIZE / BITS_PER_LONG);
1527 }
1528
1529 return 0; 1504 return 0;
1530} 1505}
1531 1506
@@ -1540,7 +1515,7 @@ xfs_setsize_buftarg_early(
1540 struct block_device *bdev) 1515 struct block_device *bdev)
1541{ 1516{
1542 return xfs_setsize_buftarg_flags(btp, 1517 return xfs_setsize_buftarg_flags(btp,
1543 PAGE_CACHE_SIZE, bdev_logical_block_size(bdev), 0); 1518 PAGE_SIZE, bdev_logical_block_size(bdev), 0);
1544} 1519}
1545 1520
1546int 1521int
@@ -1553,62 +1528,22 @@ xfs_setsize_buftarg(
1553} 1528}
1554 1529
1555STATIC int 1530STATIC int
1556xfs_mapping_buftarg(
1557 xfs_buftarg_t *btp,
1558 struct block_device *bdev)
1559{
1560 struct backing_dev_info *bdi;
1561 struct inode *inode;
1562 struct address_space *mapping;
1563 static const struct address_space_operations mapping_aops = {
1564 .sync_page = block_sync_page,
1565 .migratepage = fail_migrate_page,
1566 };
1567
1568 inode = new_inode(bdev->bd_inode->i_sb);
1569 if (!inode) {
1570 printk(KERN_WARNING
1571 "XFS: Cannot allocate mapping inode for device %s\n",
1572 XFS_BUFTARG_NAME(btp));
1573 return ENOMEM;
1574 }
1575 inode->i_mode = S_IFBLK;
1576 inode->i_bdev = bdev;
1577 inode->i_rdev = bdev->bd_dev;
1578 bdi = blk_get_backing_dev_info(bdev);
1579 if (!bdi)
1580 bdi = &default_backing_dev_info;
1581 mapping = &inode->i_data;
1582 mapping->a_ops = &mapping_aops;
1583 mapping->backing_dev_info = bdi;
1584 mapping_set_gfp_mask(mapping, GFP_NOFS);
1585 btp->bt_mapping = mapping;
1586 return 0;
1587}
1588
1589STATIC int
1590xfs_alloc_delwrite_queue( 1531xfs_alloc_delwrite_queue(
1591 xfs_buftarg_t *btp, 1532 xfs_buftarg_t *btp,
1592 const char *fsname) 1533 const char *fsname)
1593{ 1534{
1594 int error = 0;
1595
1596 INIT_LIST_HEAD(&btp->bt_list);
1597 INIT_LIST_HEAD(&btp->bt_delwrite_queue); 1535 INIT_LIST_HEAD(&btp->bt_delwrite_queue);
1598 spin_lock_init(&btp->bt_delwrite_lock); 1536 spin_lock_init(&btp->bt_delwrite_lock);
1599 btp->bt_flags = 0; 1537 btp->bt_flags = 0;
1600 btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname); 1538 btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname);
1601 if (IS_ERR(btp->bt_task)) { 1539 if (IS_ERR(btp->bt_task))
1602 error = PTR_ERR(btp->bt_task); 1540 return PTR_ERR(btp->bt_task);
1603 goto out_error; 1541 return 0;
1604 }
1605 xfs_register_buftarg(btp);
1606out_error:
1607 return error;
1608} 1542}
1609 1543
1610xfs_buftarg_t * 1544xfs_buftarg_t *
1611xfs_alloc_buftarg( 1545xfs_alloc_buftarg(
1546 struct xfs_mount *mp,
1612 struct block_device *bdev, 1547 struct block_device *bdev,
1613 int external, 1548 int external,
1614 const char *fsname) 1549 const char *fsname)
@@ -1617,15 +1552,22 @@ xfs_alloc_buftarg(
1617 1552
1618 btp = kmem_zalloc(sizeof(*btp), KM_SLEEP); 1553 btp = kmem_zalloc(sizeof(*btp), KM_SLEEP);
1619 1554
1555 btp->bt_mount = mp;
1620 btp->bt_dev = bdev->bd_dev; 1556 btp->bt_dev = bdev->bd_dev;
1621 btp->bt_bdev = bdev; 1557 btp->bt_bdev = bdev;
1622 if (xfs_setsize_buftarg_early(btp, bdev)) 1558 btp->bt_bdi = blk_get_backing_dev_info(bdev);
1559 if (!btp->bt_bdi)
1623 goto error; 1560 goto error;
1624 if (xfs_mapping_buftarg(btp, bdev)) 1561
1562 INIT_LIST_HEAD(&btp->bt_lru);
1563 spin_lock_init(&btp->bt_lru_lock);
1564 if (xfs_setsize_buftarg_early(btp, bdev))
1625 goto error; 1565 goto error;
1626 if (xfs_alloc_delwrite_queue(btp, fsname)) 1566 if (xfs_alloc_delwrite_queue(btp, fsname))
1627 goto error; 1567 goto error;
1628 xfs_alloc_bufhash(btp, external); 1568 btp->bt_shrinker.shrink = xfs_buftarg_shrink;
1569 btp->bt_shrinker.seeks = DEFAULT_SEEKS;
1570 register_shrinker(&btp->bt_shrinker);
1629 return btp; 1571 return btp;
1630 1572
1631error: 1573error:
@@ -1730,27 +1672,6 @@ xfs_buf_runall_queues(
1730 flush_workqueue(queue); 1672 flush_workqueue(queue);
1731} 1673}
1732 1674
1733STATIC int
1734xfsbufd_wakeup(
1735 struct shrinker *shrink,
1736 int priority,
1737 gfp_t mask)
1738{
1739 xfs_buftarg_t *btp;
1740
1741 spin_lock(&xfs_buftarg_lock);
1742 list_for_each_entry(btp, &xfs_buftarg_list, bt_list) {
1743 if (test_bit(XBT_FORCE_SLEEP, &btp->bt_flags))
1744 continue;
1745 if (list_empty(&btp->bt_delwrite_queue))
1746 continue;
1747 set_bit(XBT_FORCE_FLUSH, &btp->bt_flags);
1748 wake_up_process(btp->bt_task);
1749 }
1750 spin_unlock(&xfs_buftarg_lock);
1751 return 0;
1752}
1753
1754/* 1675/*
1755 * Move as many buffers as specified to the supplied list 1676 * Move as many buffers as specified to the supplied list
 1756 * indicating if we skipped any buffers to prevent deadlocks. 1677 * indicating if we skipped any buffers to prevent deadlocks.
@@ -1771,7 +1692,6 @@ xfs_buf_delwri_split(
1771 INIT_LIST_HEAD(list); 1692 INIT_LIST_HEAD(list);
1772 spin_lock(dwlk); 1693 spin_lock(dwlk);
1773 list_for_each_entry_safe(bp, n, dwq, b_list) { 1694 list_for_each_entry_safe(bp, n, dwq, b_list) {
1774 trace_xfs_buf_delwri_split(bp, _RET_IP_);
1775 ASSERT(bp->b_flags & XBF_DELWRI); 1695 ASSERT(bp->b_flags & XBF_DELWRI);
1776 1696
1777 if (!XFS_BUF_ISPINNED(bp) && !xfs_buf_cond_lock(bp)) { 1697 if (!XFS_BUF_ISPINNED(bp) && !xfs_buf_cond_lock(bp)) {
@@ -1785,6 +1705,7 @@ xfs_buf_delwri_split(
1785 _XBF_RUN_QUEUES); 1705 _XBF_RUN_QUEUES);
1786 bp->b_flags |= XBF_WRITE; 1706 bp->b_flags |= XBF_WRITE;
1787 list_move_tail(&bp->b_list, list); 1707 list_move_tail(&bp->b_list, list);
1708 trace_xfs_buf_delwri_split(bp, _RET_IP_);
1788 } else 1709 } else
1789 skipped++; 1710 skipped++;
1790 } 1711 }
@@ -1838,8 +1759,8 @@ xfsbufd(
1838 do { 1759 do {
1839 long age = xfs_buf_age_centisecs * msecs_to_jiffies(10); 1760 long age = xfs_buf_age_centisecs * msecs_to_jiffies(10);
1840 long tout = xfs_buf_timer_centisecs * msecs_to_jiffies(10); 1761 long tout = xfs_buf_timer_centisecs * msecs_to_jiffies(10);
1841 int count = 0;
1842 struct list_head tmp; 1762 struct list_head tmp;
1763 struct blk_plug plug;
1843 1764
1844 if (unlikely(freezing(current))) { 1765 if (unlikely(freezing(current))) {
1845 set_bit(XBT_FORCE_SLEEP, &target->bt_flags); 1766 set_bit(XBT_FORCE_SLEEP, &target->bt_flags);
@@ -1855,16 +1776,15 @@ xfsbufd(
1855 1776
1856 xfs_buf_delwri_split(target, &tmp, age); 1777 xfs_buf_delwri_split(target, &tmp, age);
1857 list_sort(NULL, &tmp, xfs_buf_cmp); 1778 list_sort(NULL, &tmp, xfs_buf_cmp);
1779
1780 blk_start_plug(&plug);
1858 while (!list_empty(&tmp)) { 1781 while (!list_empty(&tmp)) {
1859 struct xfs_buf *bp; 1782 struct xfs_buf *bp;
1860 bp = list_first_entry(&tmp, struct xfs_buf, b_list); 1783 bp = list_first_entry(&tmp, struct xfs_buf, b_list);
1861 list_del_init(&bp->b_list); 1784 list_del_init(&bp->b_list);
1862 xfs_bdstrat_cb(bp); 1785 xfs_bdstrat_cb(bp);
1863 count++;
1864 } 1786 }
1865 if (count) 1787 blk_finish_plug(&plug);
1866 blk_run_address_space(target->bt_mapping);
1867
1868 } while (!kthread_should_stop()); 1788 } while (!kthread_should_stop());
1869 1789
1870 return 0; 1790 return 0;
@@ -1884,6 +1804,7 @@ xfs_flush_buftarg(
1884 int pincount = 0; 1804 int pincount = 0;
1885 LIST_HEAD(tmp_list); 1805 LIST_HEAD(tmp_list);
1886 LIST_HEAD(wait_list); 1806 LIST_HEAD(wait_list);
1807 struct blk_plug plug;
1887 1808
1888 xfs_buf_runall_queues(xfsconvertd_workqueue); 1809 xfs_buf_runall_queues(xfsconvertd_workqueue);
1889 xfs_buf_runall_queues(xfsdatad_workqueue); 1810 xfs_buf_runall_queues(xfsdatad_workqueue);
@@ -1898,6 +1819,8 @@ xfs_flush_buftarg(
1898 * we do that after issuing all the IO. 1819 * we do that after issuing all the IO.
1899 */ 1820 */
1900 list_sort(NULL, &tmp_list, xfs_buf_cmp); 1821 list_sort(NULL, &tmp_list, xfs_buf_cmp);
1822
1823 blk_start_plug(&plug);
1901 while (!list_empty(&tmp_list)) { 1824 while (!list_empty(&tmp_list)) {
1902 bp = list_first_entry(&tmp_list, struct xfs_buf, b_list); 1825 bp = list_first_entry(&tmp_list, struct xfs_buf, b_list);
1903 ASSERT(target == bp->b_target); 1826 ASSERT(target == bp->b_target);
@@ -1908,15 +1831,15 @@ xfs_flush_buftarg(
1908 } 1831 }
1909 xfs_bdstrat_cb(bp); 1832 xfs_bdstrat_cb(bp);
1910 } 1833 }
1834 blk_finish_plug(&plug);
1911 1835
1912 if (wait) { 1836 if (wait) {
1913 /* Expedite and wait for IO to complete. */ 1837 /* Wait for IO to complete. */
1914 blk_run_address_space(target->bt_mapping);
1915 while (!list_empty(&wait_list)) { 1838 while (!list_empty(&wait_list)) {
1916 bp = list_first_entry(&wait_list, struct xfs_buf, b_list); 1839 bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
1917 1840
1918 list_del_init(&bp->b_list); 1841 list_del_init(&bp->b_list);
1919 xfs_iowait(bp); 1842 xfs_buf_iowait(bp);
1920 xfs_buf_relse(bp); 1843 xfs_buf_relse(bp);
1921 } 1844 }
1922 } 1845 }
@@ -1933,19 +1856,19 @@ xfs_buf_init(void)
1933 goto out; 1856 goto out;
1934 1857
1935 xfslogd_workqueue = alloc_workqueue("xfslogd", 1858 xfslogd_workqueue = alloc_workqueue("xfslogd",
1936 WQ_RESCUER | WQ_HIGHPRI, 1); 1859 WQ_MEM_RECLAIM | WQ_HIGHPRI, 1);
1937 if (!xfslogd_workqueue) 1860 if (!xfslogd_workqueue)
1938 goto out_free_buf_zone; 1861 goto out_free_buf_zone;
1939 1862
1940 xfsdatad_workqueue = create_workqueue("xfsdatad"); 1863 xfsdatad_workqueue = alloc_workqueue("xfsdatad", WQ_MEM_RECLAIM, 1);
1941 if (!xfsdatad_workqueue) 1864 if (!xfsdatad_workqueue)
1942 goto out_destroy_xfslogd_workqueue; 1865 goto out_destroy_xfslogd_workqueue;
1943 1866
1944 xfsconvertd_workqueue = create_workqueue("xfsconvertd"); 1867 xfsconvertd_workqueue = alloc_workqueue("xfsconvertd",
1868 WQ_MEM_RECLAIM, 1);
1945 if (!xfsconvertd_workqueue) 1869 if (!xfsconvertd_workqueue)
1946 goto out_destroy_xfsdatad_workqueue; 1870 goto out_destroy_xfsdatad_workqueue;
1947 1871
1948 register_shrinker(&xfs_buf_shake);
1949 return 0; 1872 return 0;
1950 1873
1951 out_destroy_xfsdatad_workqueue: 1874 out_destroy_xfsdatad_workqueue:
@@ -1961,7 +1884,6 @@ xfs_buf_init(void)
1961void 1884void
1962xfs_buf_terminate(void) 1885xfs_buf_terminate(void)
1963{ 1886{
1964 unregister_shrinker(&xfs_buf_shake);
1965 destroy_workqueue(xfsconvertd_workqueue); 1887 destroy_workqueue(xfsconvertd_workqueue);
1966 destroy_workqueue(xfsdatad_workqueue); 1888 destroy_workqueue(xfsdatad_workqueue);
1967 destroy_workqueue(xfslogd_workqueue); 1889 destroy_workqueue(xfslogd_workqueue);
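
The xfs_buf.c hunks above replace the per-target hash table with a per-AG rbtree whose lookup skips stale buffers left behind by busy-extent reallocation: on an offset match with a mismatched length the buffer must be stale, and the search continues to the right for an exact match. A minimal, self-contained userspace sketch of that lookup rule only, with invented names (demo_buf, demo_buf_find) and a plain binary tree standing in for the kernel's rb_node and pag->pag_buf_tree:

/*
 * Sketch of the lookup rule used by _xfs_buf_find() above. Not kernel code:
 * no locking, no per-AG structures, plain pointers instead of rb_node.
 */
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct demo_buf {
	long long	offset;		/* block offset, the tree key */
	size_t		length;		/* buffer length in bytes */
	bool		stale;		/* reallocated busy extent? */
	struct demo_buf	*left, *right;	/* plain BST instead of an rbtree */
};

static struct demo_buf *
demo_buf_find(struct demo_buf *node, long long offset, size_t length)
{
	while (node) {
		if (offset < node->offset)
			node = node->left;
		else if (offset > node->offset)
			node = node->right;
		else if (node->length != length) {
			/*
			 * Offset matches but length does not: only legal for
			 * a stale buffer whose freeing transaction has not
			 * committed yet. Keep searching to the right for an
			 * exact match.
			 */
			assert(node->stale);
			node = node->right;
		} else {
			return node;	/* exact match */
		}
	}
	return NULL;	/* caller would insert the new buffer here */
}

int main(void)
{
	struct demo_buf stale = { .offset = 8, .length = 4096, .stale = true };
	struct demo_buf fresh = { .offset = 8, .length = 8192 };

	stale.right = &fresh;
	printf("found length %zu\n", demo_buf_find(&stale, 8, 8192)->length);
	return 0;
}
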
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 2a05614f0b92..50a7d5fb3b73 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -51,7 +51,6 @@ typedef enum {
51#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */ 51#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */
52#define XBF_DELWRI (1 << 6) /* buffer has dirty pages */ 52#define XBF_DELWRI (1 << 6) /* buffer has dirty pages */
53#define XBF_STALE (1 << 7) /* buffer has been staled, do not find it */ 53#define XBF_STALE (1 << 7) /* buffer has been staled, do not find it */
54#define XBF_FS_MANAGED (1 << 8) /* filesystem controls freeing memory */
55#define XBF_ORDERED (1 << 11)/* use ordered writes */ 54#define XBF_ORDERED (1 << 11)/* use ordered writes */
56#define XBF_READ_AHEAD (1 << 12)/* asynchronous read-ahead */ 55#define XBF_READ_AHEAD (1 << 12)/* asynchronous read-ahead */
57#define XBF_LOG_BUFFER (1 << 13)/* this is a buffer used for the log */ 56#define XBF_LOG_BUFFER (1 << 13)/* this is a buffer used for the log */
@@ -62,38 +61,11 @@ typedef enum {
62#define XBF_DONT_BLOCK (1 << 16)/* do not block in current thread */ 61#define XBF_DONT_BLOCK (1 << 16)/* do not block in current thread */
63 62
64/* flags used only internally */ 63/* flags used only internally */
65#define _XBF_PAGE_CACHE (1 << 17)/* backed by pagecache */
66#define _XBF_PAGES (1 << 18)/* backed by refcounted pages */ 64#define _XBF_PAGES (1 << 18)/* backed by refcounted pages */
67#define _XBF_RUN_QUEUES (1 << 19)/* run block device task queue */ 65#define _XBF_RUN_QUEUES (1 << 19)/* run block device task queue */
66#define _XBF_KMEM (1 << 20)/* backed by heap memory */
68#define _XBF_DELWRI_Q (1 << 21)/* buffer on delwri queue */ 67#define _XBF_DELWRI_Q (1 << 21)/* buffer on delwri queue */
69 68
70/*
71 * Special flag for supporting metadata blocks smaller than a FSB.
72 *
73 * In this case we can have multiple xfs_buf_t on a single page and
74 * need to lock out concurrent xfs_buf_t readers as they only
75 * serialise access to the buffer.
76 *
77 * If the FSB size >= PAGE_CACHE_SIZE case, we have no serialisation
78 * between reads of the page. Hence we can have one thread read the
79 * page and modify it, but then race with another thread that thinks
80 * the page is not up-to-date and hence reads it again.
81 *
 82 * The result is that the first modification to the page is lost.
83 * This sort of AGF/AGI reading race can happen when unlinking inodes
84 * that require truncation and results in the AGI unlinked list
85 * modifications being lost.
86 */
87#define _XBF_PAGE_LOCKED (1 << 22)
88
89/*
90 * If we try a barrier write, but it fails we have to communicate
91 * this to the upper layers. Unfortunately b_error gets overwritten
92 * when the buffer is re-issued so we have to add another flag to
93 * keep this information.
94 */
95#define _XFS_BARRIER_FAILED (1 << 23)
96
97typedef unsigned int xfs_buf_flags_t; 69typedef unsigned int xfs_buf_flags_t;
98 70
99#define XFS_BUF_FLAGS \ 71#define XFS_BUF_FLAGS \
@@ -104,19 +76,15 @@ typedef unsigned int xfs_buf_flags_t;
104 { XBF_DONE, "DONE" }, \ 76 { XBF_DONE, "DONE" }, \
105 { XBF_DELWRI, "DELWRI" }, \ 77 { XBF_DELWRI, "DELWRI" }, \
106 { XBF_STALE, "STALE" }, \ 78 { XBF_STALE, "STALE" }, \
107 { XBF_FS_MANAGED, "FS_MANAGED" }, \
108 { XBF_ORDERED, "ORDERED" }, \ 79 { XBF_ORDERED, "ORDERED" }, \
109 { XBF_READ_AHEAD, "READ_AHEAD" }, \ 80 { XBF_READ_AHEAD, "READ_AHEAD" }, \
110 { XBF_LOCK, "LOCK" }, /* should never be set */\ 81 { XBF_LOCK, "LOCK" }, /* should never be set */\
111 { XBF_TRYLOCK, "TRYLOCK" }, /* ditto */\ 82 { XBF_TRYLOCK, "TRYLOCK" }, /* ditto */\
112 { XBF_DONT_BLOCK, "DONT_BLOCK" }, /* ditto */\ 83 { XBF_DONT_BLOCK, "DONT_BLOCK" }, /* ditto */\
113 { _XBF_PAGE_CACHE, "PAGE_CACHE" }, \
114 { _XBF_PAGES, "PAGES" }, \ 84 { _XBF_PAGES, "PAGES" }, \
115 { _XBF_RUN_QUEUES, "RUN_QUEUES" }, \ 85 { _XBF_RUN_QUEUES, "RUN_QUEUES" }, \
116 { _XBF_DELWRI_Q, "DELWRI_Q" }, \ 86 { _XBF_KMEM, "KMEM" }, \
117 { _XBF_PAGE_LOCKED, "PAGE_LOCKED" }, \ 87 { _XBF_DELWRI_Q, "DELWRI_Q" }
118 { _XFS_BARRIER_FAILED, "BARRIER_FAILED" }
119
120 88
121typedef enum { 89typedef enum {
122 XBT_FORCE_SLEEP = 0, 90 XBT_FORCE_SLEEP = 0,
@@ -131,70 +99,67 @@ typedef struct xfs_bufhash {
131typedef struct xfs_buftarg { 99typedef struct xfs_buftarg {
132 dev_t bt_dev; 100 dev_t bt_dev;
133 struct block_device *bt_bdev; 101 struct block_device *bt_bdev;
134 struct address_space *bt_mapping; 102 struct backing_dev_info *bt_bdi;
103 struct xfs_mount *bt_mount;
135 unsigned int bt_bsize; 104 unsigned int bt_bsize;
136 unsigned int bt_sshift; 105 unsigned int bt_sshift;
137 size_t bt_smask; 106 size_t bt_smask;
138 107
139 /* per device buffer hash table */
140 uint bt_hashshift;
141 xfs_bufhash_t *bt_hash;
142
143 /* per device delwri queue */ 108 /* per device delwri queue */
144 struct task_struct *bt_task; 109 struct task_struct *bt_task;
145 struct list_head bt_list;
146 struct list_head bt_delwrite_queue; 110 struct list_head bt_delwrite_queue;
147 spinlock_t bt_delwrite_lock; 111 spinlock_t bt_delwrite_lock;
148 unsigned long bt_flags; 112 unsigned long bt_flags;
149} xfs_buftarg_t;
150 113
151/* 114 /* LRU control structures */
152 * xfs_buf_t: Buffer structure for pagecache-based buffers 115 struct shrinker bt_shrinker;
153 * 116 struct list_head bt_lru;
154 * This buffer structure is used by the pagecache buffer management routines 117 spinlock_t bt_lru_lock;
155 * to refer to an assembly of pages forming a logical buffer. 118 unsigned int bt_lru_nr;
156 * 119} xfs_buftarg_t;
157 * The buffer structure is used on a temporary basis only, and discarded when
158 * released. The real data storage is recorded in the pagecache. Buffers are
159 * hashed to the block device on which the file system resides.
160 */
161 120
162struct xfs_buf; 121struct xfs_buf;
163typedef void (*xfs_buf_iodone_t)(struct xfs_buf *); 122typedef void (*xfs_buf_iodone_t)(struct xfs_buf *);
164typedef void (*xfs_buf_relse_t)(struct xfs_buf *);
165typedef int (*xfs_buf_bdstrat_t)(struct xfs_buf *);
166 123
167#define XB_PAGES 2 124#define XB_PAGES 2
168 125
169typedef struct xfs_buf { 126typedef struct xfs_buf {
127 /*
128 * first cacheline holds all the fields needed for an uncontended cache
129 * hit to be fully processed. The semaphore straddles the cacheline
130 * boundary, but the counter and lock sits on the first cacheline,
131 * which is the only bit that is touched if we hit the semaphore
132 * fast-path on locking.
133 */
134 struct rb_node b_rbnode; /* rbtree node */
135 xfs_off_t b_file_offset; /* offset in file */
136 size_t b_buffer_length;/* size of buffer in bytes */
137 atomic_t b_hold; /* reference count */
138 atomic_t b_lru_ref; /* lru reclaim ref count */
139 xfs_buf_flags_t b_flags; /* status flags */
170 struct semaphore b_sema; /* semaphore for lockables */ 140 struct semaphore b_sema; /* semaphore for lockables */
171 unsigned long b_queuetime; /* time buffer was queued */ 141
172 atomic_t b_pin_count; /* pin count */ 142 struct list_head b_lru; /* lru list */
173 wait_queue_head_t b_waiters; /* unpin waiters */ 143 wait_queue_head_t b_waiters; /* unpin waiters */
174 struct list_head b_list; 144 struct list_head b_list;
175 xfs_buf_flags_t b_flags; /* status flags */ 145 struct xfs_perag *b_pag; /* contains rbtree root */
176 struct list_head b_hash_list; /* hash table list */
177 xfs_bufhash_t *b_hash; /* hash table list start */
178 xfs_buftarg_t *b_target; /* buffer target (device) */ 146 xfs_buftarg_t *b_target; /* buffer target (device) */
179 atomic_t b_hold; /* reference count */
180 xfs_daddr_t b_bn; /* block number for I/O */ 147 xfs_daddr_t b_bn; /* block number for I/O */
181 xfs_off_t b_file_offset; /* offset in file */
182 size_t b_buffer_length;/* size of buffer in bytes */
183 size_t b_count_desired;/* desired transfer size */ 148 size_t b_count_desired;/* desired transfer size */
184 void *b_addr; /* virtual address of buffer */ 149 void *b_addr; /* virtual address of buffer */
185 struct work_struct b_iodone_work; 150 struct work_struct b_iodone_work;
186 atomic_t b_io_remaining; /* #outstanding I/O requests */
187 xfs_buf_iodone_t b_iodone; /* I/O completion function */ 151 xfs_buf_iodone_t b_iodone; /* I/O completion function */
188 xfs_buf_relse_t b_relse; /* releasing function */
189 struct completion b_iowait; /* queue for I/O waiters */ 152 struct completion b_iowait; /* queue for I/O waiters */
190 void *b_fspriv; 153 void *b_fspriv;
191 void *b_fspriv2; 154 void *b_fspriv2;
192 struct xfs_mount *b_mount;
193 unsigned short b_error; /* error code on I/O */
194 unsigned int b_page_count; /* size of page array */
195 unsigned int b_offset; /* page offset in first page */
196 struct page **b_pages; /* array of page pointers */ 155 struct page **b_pages; /* array of page pointers */
197 struct page *b_page_array[XB_PAGES]; /* inline pages */ 156 struct page *b_page_array[XB_PAGES]; /* inline pages */
157 unsigned long b_queuetime; /* time buffer was queued */
158 atomic_t b_pin_count; /* pin count */
159 atomic_t b_io_remaining; /* #outstanding I/O requests */
160 unsigned int b_page_count; /* size of page array */
161 unsigned int b_offset; /* page offset in first page */
162 unsigned short b_error; /* error code on I/O */
198#ifdef XFS_BUF_LOCK_TRACKING 163#ifdef XFS_BUF_LOCK_TRACKING
199 int b_last_holder; 164 int b_last_holder;
200#endif 165#endif
@@ -213,11 +178,14 @@ extern xfs_buf_t *xfs_buf_read(xfs_buftarg_t *, xfs_off_t, size_t,
213 xfs_buf_flags_t); 178 xfs_buf_flags_t);
214 179
215extern xfs_buf_t *xfs_buf_get_empty(size_t, xfs_buftarg_t *); 180extern xfs_buf_t *xfs_buf_get_empty(size_t, xfs_buftarg_t *);
216extern xfs_buf_t *xfs_buf_get_noaddr(size_t, xfs_buftarg_t *); 181extern void xfs_buf_set_empty(struct xfs_buf *bp, size_t len);
182extern xfs_buf_t *xfs_buf_get_uncached(struct xfs_buftarg *, size_t, int);
217extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t); 183extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t);
218extern void xfs_buf_hold(xfs_buf_t *); 184extern void xfs_buf_hold(xfs_buf_t *);
219extern void xfs_buf_readahead(xfs_buftarg_t *, xfs_off_t, size_t, 185extern void xfs_buf_readahead(xfs_buftarg_t *, xfs_off_t, size_t);
220 xfs_buf_flags_t); 186struct xfs_buf *xfs_buf_read_uncached(struct xfs_mount *mp,
187 struct xfs_buftarg *target,
188 xfs_daddr_t daddr, size_t length, int flags);
221 189
222/* Releasing Buffers */ 190/* Releasing Buffers */
223extern void xfs_buf_free(xfs_buf_t *); 191extern void xfs_buf_free(xfs_buf_t *);
@@ -242,6 +210,8 @@ extern int xfs_buf_iorequest(xfs_buf_t *);
242extern int xfs_buf_iowait(xfs_buf_t *); 210extern int xfs_buf_iowait(xfs_buf_t *);
243extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *, 211extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *,
244 xfs_buf_rw_t); 212 xfs_buf_rw_t);
213#define xfs_buf_zero(bp, off, len) \
214 xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO)
245 215
246static inline int xfs_buf_geterror(xfs_buf_t *bp) 216static inline int xfs_buf_geterror(xfs_buf_t *bp)
247{ 217{
@@ -267,7 +237,8 @@ extern void xfs_buf_terminate(void);
267#define XFS_BUF_ZEROFLAGS(bp) ((bp)->b_flags &= \ 237#define XFS_BUF_ZEROFLAGS(bp) ((bp)->b_flags &= \
268 ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI|XBF_ORDERED)) 238 ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI|XBF_ORDERED))
269 239
270#define XFS_BUF_STALE(bp) ((bp)->b_flags |= XBF_STALE) 240void xfs_buf_stale(struct xfs_buf *bp);
241#define XFS_BUF_STALE(bp) xfs_buf_stale(bp);
271#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE) 242#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE)
272#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE) 243#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE)
273#define XFS_BUF_SUPER_STALE(bp) do { \ 244#define XFS_BUF_SUPER_STALE(bp) do { \
@@ -276,8 +247,6 @@ extern void xfs_buf_terminate(void);
276 XFS_BUF_DONE(bp); \ 247 XFS_BUF_DONE(bp); \
277 } while (0) 248 } while (0)
278 249
279#define XFS_BUF_UNMANAGE(bp) ((bp)->b_flags &= ~XBF_FS_MANAGED)
280
281#define XFS_BUF_DELAYWRITE(bp) ((bp)->b_flags |= XBF_DELWRI) 250#define XFS_BUF_DELAYWRITE(bp) ((bp)->b_flags |= XBF_DELWRI)
282#define XFS_BUF_UNDELAYWRITE(bp) xfs_buf_delwri_dequeue(bp) 251#define XFS_BUF_UNDELAYWRITE(bp) xfs_buf_delwri_dequeue(bp)
283#define XFS_BUF_ISDELAYWRITE(bp) ((bp)->b_flags & XBF_DELWRI) 252#define XFS_BUF_ISDELAYWRITE(bp) ((bp)->b_flags & XBF_DELWRI)
@@ -320,7 +289,6 @@ extern void xfs_buf_terminate(void);
320#define XFS_BUF_FSPRIVATE2(bp, type) ((type)(bp)->b_fspriv2) 289#define XFS_BUF_FSPRIVATE2(bp, type) ((type)(bp)->b_fspriv2)
321#define XFS_BUF_SET_FSPRIVATE2(bp, val) ((bp)->b_fspriv2 = (void*)(val)) 290#define XFS_BUF_SET_FSPRIVATE2(bp, val) ((bp)->b_fspriv2 = (void*)(val))
322#define XFS_BUF_SET_START(bp) do { } while (0) 291#define XFS_BUF_SET_START(bp) do { } while (0)
323#define XFS_BUF_SET_BRELSE_FUNC(bp, func) ((bp)->b_relse = (func))
324 292
325#define XFS_BUF_PTR(bp) (xfs_caddr_t)((bp)->b_addr) 293#define XFS_BUF_PTR(bp) (xfs_caddr_t)((bp)->b_addr)
326#define XFS_BUF_SET_PTR(bp, val, cnt) xfs_buf_associate_memory(bp, val, cnt) 294#define XFS_BUF_SET_PTR(bp, val, cnt) xfs_buf_associate_memory(bp, val, cnt)
@@ -333,9 +301,15 @@ extern void xfs_buf_terminate(void);
333#define XFS_BUF_SIZE(bp) ((bp)->b_buffer_length) 301#define XFS_BUF_SIZE(bp) ((bp)->b_buffer_length)
334#define XFS_BUF_SET_SIZE(bp, cnt) ((bp)->b_buffer_length = (cnt)) 302#define XFS_BUF_SET_SIZE(bp, cnt) ((bp)->b_buffer_length = (cnt))
335 303
336#define XFS_BUF_SET_VTYPE_REF(bp, type, ref) do { } while (0) 304static inline void
305xfs_buf_set_ref(
306 struct xfs_buf *bp,
307 int lru_ref)
308{
309 atomic_set(&bp->b_lru_ref, lru_ref);
310}
311#define XFS_BUF_SET_VTYPE_REF(bp, type, ref) xfs_buf_set_ref(bp, ref)
337#define XFS_BUF_SET_VTYPE(bp, type) do { } while (0) 312#define XFS_BUF_SET_VTYPE(bp, type) do { } while (0)
338#define XFS_BUF_SET_REF(bp, ref) do { } while (0)
339 313
340#define XFS_BUF_ISPINNED(bp) atomic_read(&((bp)->b_pin_count)) 314#define XFS_BUF_ISPINNED(bp) atomic_read(&((bp)->b_pin_count))
341 315
@@ -351,30 +325,15 @@ extern void xfs_buf_terminate(void);
351 325
352static inline void xfs_buf_relse(xfs_buf_t *bp) 326static inline void xfs_buf_relse(xfs_buf_t *bp)
353{ 327{
354 if (!bp->b_relse) 328 xfs_buf_unlock(bp);
355 xfs_buf_unlock(bp);
356 xfs_buf_rele(bp); 329 xfs_buf_rele(bp);
357} 330}
358 331
359#define xfs_biodone(bp) xfs_buf_ioend(bp, 0)
360
361#define xfs_biomove(bp, off, len, data, rw) \
362 xfs_buf_iomove((bp), (off), (len), (data), \
363 ((rw) == XBF_WRITE) ? XBRW_WRITE : XBRW_READ)
364
365#define xfs_biozero(bp, off, len) \
366 xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO)
367
368#define xfs_iowait(bp) xfs_buf_iowait(bp)
369
370#define xfs_baread(target, rablkno, ralen) \
371 xfs_buf_readahead((target), (rablkno), (ralen), XBF_DONT_BLOCK)
372
373
374/* 332/*
375 * Handling of buftargs. 333 * Handling of buftargs.
376 */ 334 */
377extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int, const char *); 335extern xfs_buftarg_t *xfs_alloc_buftarg(struct xfs_mount *,
336 struct block_device *, int, const char *);
378extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *); 337extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *);
379extern void xfs_wait_buftarg(xfs_buftarg_t *); 338extern void xfs_wait_buftarg(xfs_buftarg_t *);
380extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); 339extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int);
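
The buftarg changes above replace the old xfsbufd_wakeup shrinker with a per-target LRU: each buffer carries b_lru_ref, xfs_buftarg_shrink() decrements it on every scan, and only reclaims the buffer once the count reaches zero, otherwise rotating it to the tail of bt_lru for another trip around. A single-threaded sketch of that aging policy with invented names; the locking, atomics and list rotation of the real code are deliberately omitted:

/*
 * Sketch of the reference-count aging behind bt_lru/b_lru_ref. Each pass
 * ages surviving buffers and frees only those whose count has hit zero.
 */
#include <stdio.h>

#define NBUFS 3

struct demo_lru_buf {
	int	id;
	int	lru_ref;	/* trips through the LRU before reclaim */
	int	reclaimed;
};

/* Scan up to nr_to_scan buffers, returning how many were reclaimed. */
static int demo_shrink(struct demo_lru_buf *lru, int nbufs, int nr_to_scan)
{
	int freed = 0;

	for (int i = 0; i < nbufs && nr_to_scan-- > 0; i++) {
		struct demo_lru_buf *bp = &lru[i];

		if (bp->reclaimed)
			continue;
		if (bp->lru_ref > 0) {
			bp->lru_ref--;	/* survives this pass, just age it */
			continue;
		}
		bp->reclaimed = 1;	/* count hit zero: dispose of it */
		freed++;
	}
	return freed;
}

int main(void)
{
	struct demo_lru_buf lru[NBUFS] = {
		{ .id = 0, .lru_ref = 0 },	/* reclaimed on first pass */
		{ .id = 1, .lru_ref = 1 },	/* needs two passes */
		{ .id = 2, .lru_ref = 2 },	/* needs three passes */
	};

	for (int pass = 1; pass <= 3; pass++)
		printf("pass %d reclaimed %d\n", pass,
		       demo_shrink(lru, NBUFS, NBUFS));
	return 0;
}
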
diff --git a/fs/xfs/linux-2.6/xfs_cred.h b/fs/xfs/linux-2.6/xfs_cred.h
deleted file mode 100644
index 55bddf3b6091..000000000000
--- a/fs/xfs/linux-2.6/xfs_cred.h
+++ /dev/null
@@ -1,28 +0,0 @@
1/*
2 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_CRED_H__
19#define __XFS_CRED_H__
20
21#include <linux/capability.h>
22
23/*
24 * Credentials
25 */
26typedef const struct cred cred_t;
27
28#endif /* __XFS_CRED_H__ */
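
The new xfs_discard.c that follows implements the FSTRIM ioctl: xfs_trim_extents() walks each AG's by-size free-space btree from the longest extent downwards, stops once extents fall below minlen, skips extents outside the requested range or still busy, and issues blkdev_issue_discard() on the rest. A userspace sketch of that filtering walk over a longest-first array, with invented names standing in for the btree cursor and discard call:

/*
 * Sketch of the extent filtering in xfs_trim_extents() below. The sorted
 * array replaces the cntbt cursor; no AGF locking or log force is modelled.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct demo_extent {
	uint64_t	start;	/* first block of the free extent */
	uint64_t	len;	/* length in blocks */
	bool		busy;	/* freed by a not-yet-committed transaction */
};

static uint64_t
demo_trim(const struct demo_extent *ext, int nr,
	  uint64_t range_start, uint64_t range_len, uint64_t minlen)
{
	uint64_t trimmed = 0;

	/* ext[] is sorted longest-first, like walking the cntbt downwards */
	for (int i = 0; i < nr; i++) {
		if (ext[i].len < minlen)
			break;		/* everything after this is smaller */
		if (ext[i].start + ext[i].len < range_start ||
		    ext[i].start >= range_start + range_len)
			continue;	/* entirely outside the range */
		if (ext[i].busy)
			continue;	/* try again on the next FSTRIM call */
		/* here the real code issues the discard for this extent */
		trimmed += ext[i].len;
	}
	return trimmed;
}

int main(void)
{
	const struct demo_extent ext[] = {
		{ .start = 100, .len = 64 },		   /* discarded */
		{ .start = 500, .len = 32, .busy = true }, /* skipped: busy */
		{ .start = 9000, .len = 16 },		   /* skipped: outside */
		{ .start = 200, .len = 2 },		   /* too small: stop */
	};

	printf("blocks trimmed: %llu\n",
	       (unsigned long long)demo_trim(ext, 4, 0, 1024, 8));
	return 0;
}
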
diff --git a/fs/xfs/linux-2.6/xfs_discard.c b/fs/xfs/linux-2.6/xfs_discard.c
new file mode 100644
index 000000000000..244e797dae32
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_discard.c
@@ -0,0 +1,222 @@
1/*
2 * Copyright (C) 2010 Red Hat, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_sb.h"
20#include "xfs_inum.h"
21#include "xfs_log.h"
22#include "xfs_ag.h"
23#include "xfs_mount.h"
24#include "xfs_quota.h"
25#include "xfs_trans.h"
26#include "xfs_alloc_btree.h"
27#include "xfs_bmap_btree.h"
28#include "xfs_ialloc_btree.h"
29#include "xfs_btree.h"
30#include "xfs_inode.h"
31#include "xfs_alloc.h"
32#include "xfs_error.h"
33#include "xfs_discard.h"
34#include "xfs_trace.h"
35
36STATIC int
37xfs_trim_extents(
38 struct xfs_mount *mp,
39 xfs_agnumber_t agno,
40 xfs_fsblock_t start,
41 xfs_fsblock_t len,
42 xfs_fsblock_t minlen,
43 __uint64_t *blocks_trimmed)
44{
45 struct block_device *bdev = mp->m_ddev_targp->bt_bdev;
46 struct xfs_btree_cur *cur;
47 struct xfs_buf *agbp;
48 struct xfs_perag *pag;
49 int error;
50 int i;
51
52 pag = xfs_perag_get(mp, agno);
53
54 error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
55 if (error || !agbp)
56 goto out_put_perag;
57
58 cur = xfs_allocbt_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_CNT);
59
60 /*
61 * Force out the log. This means any transactions that might have freed
62 * space before we took the AGF buffer lock are now on disk, and the
63 * volatile disk cache is flushed.
64 */
65 xfs_log_force(mp, XFS_LOG_SYNC);
66
67 /*
68 * Look up the longest btree in the AGF and start with it.
69 */
70 error = xfs_alloc_lookup_le(cur, 0,
71 XFS_BUF_TO_AGF(agbp)->agf_longest, &i);
72 if (error)
73 goto out_del_cursor;
74
75 /*
76 * Loop until we are done with all extents that are large
77 * enough to be worth discarding.
78 */
79 while (i) {
80 xfs_agblock_t fbno;
81 xfs_extlen_t flen;
82
83 error = xfs_alloc_get_rec(cur, &fbno, &flen, &i);
84 if (error)
85 goto out_del_cursor;
86 XFS_WANT_CORRUPTED_GOTO(i == 1, out_del_cursor);
87 ASSERT(flen <= XFS_BUF_TO_AGF(agbp)->agf_longest);
88
89 /*
90 * Too small? Give up.
91 */
92 if (flen < minlen) {
93 trace_xfs_discard_toosmall(mp, agno, fbno, flen);
94 goto out_del_cursor;
95 }
96
97 /*
98 * If the extent is entirely outside of the range we are
 99 * supposed to discard, skip it. Do not bother to trim
100 * down partially overlapping ranges for now.
101 */
102 if (XFS_AGB_TO_FSB(mp, agno, fbno) + flen < start ||
103 XFS_AGB_TO_FSB(mp, agno, fbno) >= start + len) {
104 trace_xfs_discard_exclude(mp, agno, fbno, flen);
105 goto next_extent;
106 }
107
108 /*
109 * If any blocks in the range are still busy, skip the
110 * discard and try again the next time.
111 */
112 if (xfs_alloc_busy_search(mp, agno, fbno, flen)) {
113 trace_xfs_discard_busy(mp, agno, fbno, flen);
114 goto next_extent;
115 }
116
117 trace_xfs_discard_extent(mp, agno, fbno, flen);
118 error = -blkdev_issue_discard(bdev,
119 XFS_AGB_TO_DADDR(mp, agno, fbno),
120 XFS_FSB_TO_BB(mp, flen),
121 GFP_NOFS, 0);
122 if (error)
123 goto out_del_cursor;
124 *blocks_trimmed += flen;
125
126next_extent:
127 error = xfs_btree_decrement(cur, 0, &i);
128 if (error)
129 goto out_del_cursor;
130 }
131
132out_del_cursor:
133 xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
134 xfs_buf_relse(agbp);
135out_put_perag:
136 xfs_perag_put(pag);
137 return error;
138}
139
140int
141xfs_ioc_trim(
142 struct xfs_mount *mp,
143 struct fstrim_range __user *urange)
144{
145 struct request_queue *q = mp->m_ddev_targp->bt_bdev->bd_disk->queue;
146 unsigned int granularity = q->limits.discard_granularity;
147 struct fstrim_range range;
148 xfs_fsblock_t start, len, minlen;
149 xfs_agnumber_t start_agno, end_agno, agno;
150 __uint64_t blocks_trimmed = 0;
151 int error, last_error = 0;
152
153 if (!capable(CAP_SYS_ADMIN))
154 return -XFS_ERROR(EPERM);
155 if (!blk_queue_discard(q))
156 return -XFS_ERROR(EOPNOTSUPP);
157 if (copy_from_user(&range, urange, sizeof(range)))
158 return -XFS_ERROR(EFAULT);
159
160 /*
161 * Truncating down the len isn't actually quite correct, but using
162 * XFS_B_TO_FSB would mean we trivially get overflows for values
163 * of ULLONG_MAX or slightly lower. And ULLONG_MAX is the default
164 * used by the fstrim application. In the end it really doesn't
165 * matter as trimming blocks is an advisory interface.
166 */
167 start = XFS_B_TO_FSBT(mp, range.start);
168 len = XFS_B_TO_FSBT(mp, range.len);
169 minlen = XFS_B_TO_FSB(mp, max_t(u64, granularity, range.minlen));
170
171 start_agno = XFS_FSB_TO_AGNO(mp, start);
172 if (start_agno >= mp->m_sb.sb_agcount)
173 return -XFS_ERROR(EINVAL);
174
175 end_agno = XFS_FSB_TO_AGNO(mp, start + len);
176 if (end_agno >= mp->m_sb.sb_agcount)
177 end_agno = mp->m_sb.sb_agcount - 1;
178
179 for (agno = start_agno; agno <= end_agno; agno++) {
180 error = -xfs_trim_extents(mp, agno, start, len, minlen,
181 &blocks_trimmed);
182 if (error)
183 last_error = error;
184 }
185
186 if (last_error)
187 return last_error;
188
189 range.len = XFS_FSB_TO_B(mp, blocks_trimmed);
190 if (copy_to_user(urange, &range, sizeof(range)))
191 return -XFS_ERROR(EFAULT);
192 return 0;
193}
194
195int
196xfs_discard_extents(
197 struct xfs_mount *mp,
198 struct list_head *list)
199{
200 struct xfs_busy_extent *busyp;
201 int error = 0;
202
203 list_for_each_entry(busyp, list, list) {
204 trace_xfs_discard_extent(mp, busyp->agno, busyp->bno,
205 busyp->length);
206
207 error = -blkdev_issue_discard(mp->m_ddev_targp->bt_bdev,
208 XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno),
209 XFS_FSB_TO_BB(mp, busyp->length),
210 GFP_NOFS, 0);
211 if (error && error != EOPNOTSUPP) {
212 xfs_info(mp,
213 "discard failed for extent [0x%llu,%u], error %d",
214 (unsigned long long)busyp->bno,
215 busyp->length,
216 error);
217 return error;
218 }
219 }
220
221 return 0;
222}
diff --git a/fs/xfs/linux-2.6/xfs_discard.h b/fs/xfs/linux-2.6/xfs_discard.h
new file mode 100644
index 000000000000..344879aea646
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_discard.h
@@ -0,0 +1,10 @@
1#ifndef XFS_DISCARD_H
2#define XFS_DISCARD_H 1
3
4struct fstrim_range;
5struct list_head;
6
7extern int xfs_ioc_trim(struct xfs_mount *, struct fstrim_range __user *);
8extern int xfs_discard_extents(struct xfs_mount *, struct list_head *);
9
10#endif /* XFS_DISCARD_H */
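
For context, xfs_ioc_trim() above is reached through the generic FITRIM ioctl that fstrim(8) issues against the mount point. A minimal userspace sketch of such a caller follows; the mount path is a placeholder and the whole-filesystem defaults simply mirror what fstrim passes by default, so treat this as an illustration rather than part of this change.

/*
 * Hedged sketch: trim all free space on an XFS filesystem via FITRIM,
 * which lands in xfs_ioc_trim().  Needs CAP_SYS_ADMIN per the capable()
 * check above; "/mnt/xfs" is a hypothetical mount point.
 */
#include <fcntl.h>
#include <limits.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>		/* FITRIM, struct fstrim_range */

int main(void)
{
	struct fstrim_range range;
	int fd = open("/mnt/xfs", O_RDONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}

	memset(&range, 0, sizeof(range));
	range.start = 0;
	range.len = ULLONG_MAX;	/* whole filesystem, like fstrim's default */
	range.minlen = 0;	/* xfs_ioc_trim() raises this to the discard granularity */

	if (ioctl(fd, FITRIM, &range) < 0) {
		perror("FITRIM");
		close(fd);
		return 1;
	}

	/* On success the kernel writes back the number of bytes trimmed. */
	printf("trimmed %llu bytes\n", (unsigned long long)range.len);
	close(fd);
	return 0;
}

On return, range.len holds the XFS_FSB_TO_B(mp, blocks_trimmed) value that xfs_ioc_trim() copies back to userspace.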
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
index 3764d74790ec..f4f878fc0083 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -70,8 +70,16 @@ xfs_fs_encode_fh(
70 else 70 else
71 fileid_type = FILEID_INO32_GEN_PARENT; 71 fileid_type = FILEID_INO32_GEN_PARENT;
72 72
73 /* filesystem may contain 64bit inode numbers */ 73 /*
 74 if (!(XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_SMALL_INUMS)) 74 * If the filesystem may contain 64bit inode numbers, we need
75 * to use larger file handles that can represent them.
76 *
77 * While we only allocate inodes that do not fit into 32 bits any
78 * large enough filesystem may contain them, thus the slightly
79 * confusing looking conditional below.
80 */
81 if (!(XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_SMALL_INUMS) ||
82 (XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_32BITINODES))
75 fileid_type |= XFS_FILEID_TYPE_64FLAG; 83 fileid_type |= XFS_FILEID_TYPE_64FLAG;
76 84
77 /* 85 /*
@@ -81,8 +89,10 @@ xfs_fs_encode_fh(
81 * seven combinations work. The real answer is "don't use v2". 89 * seven combinations work. The real answer is "don't use v2".
82 */ 90 */
83 len = xfs_fileid_length(fileid_type); 91 len = xfs_fileid_length(fileid_type);
84 if (*max_len < len) 92 if (*max_len < len) {
93 *max_len = len;
85 return 255; 94 return 255;
95 }
86 *max_len = len; 96 *max_len = len;
87 97
88 switch (fileid_type) { 98 switch (fileid_type) {
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index ba8ad422a165..7f782af286bf 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -37,10 +37,45 @@
37#include "xfs_trace.h" 37#include "xfs_trace.h"
38 38
39#include <linux/dcache.h> 39#include <linux/dcache.h>
40#include <linux/falloc.h>
40 41
41static const struct vm_operations_struct xfs_file_vm_ops; 42static const struct vm_operations_struct xfs_file_vm_ops;
42 43
43/* 44/*
45 * Locking primitives for read and write IO paths to ensure we consistently use
46 * and order the inode->i_mutex, ip->i_lock and ip->i_iolock.
47 */
48static inline void
49xfs_rw_ilock(
50 struct xfs_inode *ip,
51 int type)
52{
53 if (type & XFS_IOLOCK_EXCL)
54 mutex_lock(&VFS_I(ip)->i_mutex);
55 xfs_ilock(ip, type);
56}
57
58static inline void
59xfs_rw_iunlock(
60 struct xfs_inode *ip,
61 int type)
62{
63 xfs_iunlock(ip, type);
64 if (type & XFS_IOLOCK_EXCL)
65 mutex_unlock(&VFS_I(ip)->i_mutex);
66}
67
68static inline void
69xfs_rw_ilock_demote(
70 struct xfs_inode *ip,
71 int type)
72{
73 xfs_ilock_demote(ip, type);
74 if (type & XFS_IOLOCK_EXCL)
75 mutex_unlock(&VFS_I(ip)->i_mutex);
76}
77
78/*
44 * xfs_iozero 79 * xfs_iozero
45 * 80 *
46 * xfs_iozero clears the specified range of buffer supplied, 81 * xfs_iozero clears the specified range of buffer supplied,
@@ -96,19 +131,34 @@ xfs_file_fsync(
96{ 131{
97 struct inode *inode = file->f_mapping->host; 132 struct inode *inode = file->f_mapping->host;
98 struct xfs_inode *ip = XFS_I(inode); 133 struct xfs_inode *ip = XFS_I(inode);
134 struct xfs_mount *mp = ip->i_mount;
99 struct xfs_trans *tp; 135 struct xfs_trans *tp;
100 int error = 0; 136 int error = 0;
101 int log_flushed = 0; 137 int log_flushed = 0;
102 138
103 trace_xfs_file_fsync(ip); 139 trace_xfs_file_fsync(ip);
104 140
105 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 141 if (XFS_FORCED_SHUTDOWN(mp))
106 return -XFS_ERROR(EIO); 142 return -XFS_ERROR(EIO);
107 143
108 xfs_iflags_clear(ip, XFS_ITRUNCATED); 144 xfs_iflags_clear(ip, XFS_ITRUNCATED);
109 145
110 xfs_ioend_wait(ip); 146 xfs_ioend_wait(ip);
111 147
148 if (mp->m_flags & XFS_MOUNT_BARRIER) {
149 /*
150 * If we have an RT and/or log subvolume we need to make sure
 151 * to flush the write cache of the device used for file data
 152 * first. This is to ensure newly written file data makes
153 * it to disk before logging the new inode size in case of
154 * an extending write.
155 */
156 if (XFS_IS_REALTIME_INODE(ip))
157 xfs_blkdev_issue_flush(mp->m_rtdev_targp);
158 else if (mp->m_logdev_targp != mp->m_ddev_targp)
159 xfs_blkdev_issue_flush(mp->m_ddev_targp);
160 }
161
112 /* 162 /*
113 * We always need to make sure that the required inode state is safe on 163 * We always need to make sure that the required inode state is safe on
114 * disk. The inode might be clean but we still might need to force the 164 * disk. The inode might be clean but we still might need to force the
@@ -140,9 +190,9 @@ xfs_file_fsync(
140 * updates. The sync transaction will also force the log. 190 * updates. The sync transaction will also force the log.
141 */ 191 */
142 xfs_iunlock(ip, XFS_ILOCK_SHARED); 192 xfs_iunlock(ip, XFS_ILOCK_SHARED);
143 tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS); 193 tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
144 error = xfs_trans_reserve(tp, 0, 194 error = xfs_trans_reserve(tp, 0,
145 XFS_FSYNC_TS_LOG_RES(ip->i_mount), 0, 0, 0); 195 XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
146 if (error) { 196 if (error) {
147 xfs_trans_cancel(tp, 0); 197 xfs_trans_cancel(tp, 0);
148 return -error; 198 return -error;
@@ -174,28 +224,25 @@ xfs_file_fsync(
174 * force the log. 224 * force the log.
175 */ 225 */
176 if (xfs_ipincount(ip)) { 226 if (xfs_ipincount(ip)) {
177 error = _xfs_log_force_lsn(ip->i_mount, 227 error = _xfs_log_force_lsn(mp,
178 ip->i_itemp->ili_last_lsn, 228 ip->i_itemp->ili_last_lsn,
179 XFS_LOG_SYNC, &log_flushed); 229 XFS_LOG_SYNC, &log_flushed);
180 } 230 }
181 xfs_iunlock(ip, XFS_ILOCK_SHARED); 231 xfs_iunlock(ip, XFS_ILOCK_SHARED);
182 } 232 }
183 233
184 if (ip->i_mount->m_flags & XFS_MOUNT_BARRIER) { 234 /*
185 /* 235 * If we only have a single device, and the log force about was
186 * If the log write didn't issue an ordered tag we need 236 * a no-op we might have to flush the data device cache here.
187 * to flush the disk cache for the data device now. 237 * This can only happen for fdatasync/O_DSYNC if we were overwriting
188 */ 238 * an already allocated file and thus do not have any metadata to
189 if (!log_flushed) 239 * commit.
190 xfs_blkdev_issue_flush(ip->i_mount->m_ddev_targp); 240 */
191 241 if ((mp->m_flags & XFS_MOUNT_BARRIER) &&
192 /* 242 mp->m_logdev_targp == mp->m_ddev_targp &&
193 * If this inode is on the RT dev we need to flush that 243 !XFS_IS_REALTIME_INODE(ip) &&
194 * cache as well. 244 !log_flushed)
195 */ 245 xfs_blkdev_issue_flush(mp->m_ddev_targp);
196 if (XFS_IS_REALTIME_INODE(ip))
197 xfs_blkdev_issue_flush(ip->i_mount->m_rtdev_targp);
198 }
199 246
200 return -error; 247 return -error;
201} 248}
@@ -262,22 +309,21 @@ xfs_file_aio_read(
262 if (XFS_FORCED_SHUTDOWN(mp)) 309 if (XFS_FORCED_SHUTDOWN(mp))
263 return -EIO; 310 return -EIO;
264 311
265 if (unlikely(ioflags & IO_ISDIRECT))
266 mutex_lock(&inode->i_mutex);
267 xfs_ilock(ip, XFS_IOLOCK_SHARED);
268
269 if (unlikely(ioflags & IO_ISDIRECT)) { 312 if (unlikely(ioflags & IO_ISDIRECT)) {
313 xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
314
270 if (inode->i_mapping->nrpages) { 315 if (inode->i_mapping->nrpages) {
271 ret = -xfs_flushinval_pages(ip, 316 ret = -xfs_flushinval_pages(ip,
272 (iocb->ki_pos & PAGE_CACHE_MASK), 317 (iocb->ki_pos & PAGE_CACHE_MASK),
273 -1, FI_REMAPF_LOCKED); 318 -1, FI_REMAPF_LOCKED);
319 if (ret) {
320 xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
321 return ret;
322 }
274 } 323 }
275 mutex_unlock(&inode->i_mutex); 324 xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
276 if (ret) { 325 } else
277 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 326 xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
278 return ret;
279 }
280 }
281 327
282 trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags); 328 trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags);
283 329
@@ -285,7 +331,7 @@ xfs_file_aio_read(
285 if (ret > 0) 331 if (ret > 0)
286 XFS_STATS_ADD(xs_read_bytes, ret); 332 XFS_STATS_ADD(xs_read_bytes, ret);
287 333
288 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 334 xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
289 return ret; 335 return ret;
290} 336}
291 337
@@ -309,7 +355,7 @@ xfs_file_splice_read(
309 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 355 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
310 return -EIO; 356 return -EIO;
311 357
312 xfs_ilock(ip, XFS_IOLOCK_SHARED); 358 xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
313 359
314 trace_xfs_file_splice_read(ip, count, *ppos, ioflags); 360 trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
315 361
@@ -317,10 +363,61 @@ xfs_file_splice_read(
317 if (ret > 0) 363 if (ret > 0)
318 XFS_STATS_ADD(xs_read_bytes, ret); 364 XFS_STATS_ADD(xs_read_bytes, ret);
319 365
320 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 366 xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
321 return ret; 367 return ret;
322} 368}
323 369
370STATIC void
371xfs_aio_write_isize_update(
372 struct inode *inode,
373 loff_t *ppos,
374 ssize_t bytes_written)
375{
376 struct xfs_inode *ip = XFS_I(inode);
377 xfs_fsize_t isize = i_size_read(inode);
378
379 if (bytes_written > 0)
380 XFS_STATS_ADD(xs_write_bytes, bytes_written);
381
382 if (unlikely(bytes_written < 0 && bytes_written != -EFAULT &&
383 *ppos > isize))
384 *ppos = isize;
385
386 if (*ppos > ip->i_size) {
387 xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
388 if (*ppos > ip->i_size)
389 ip->i_size = *ppos;
390 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
391 }
392}
393
394/*
395 * If this was a direct or synchronous I/O that failed (such as ENOSPC) then
396 * part of the I/O may have been written to disk before the error occurred. In
397 * this case the on-disk file size may have been adjusted beyond the in-memory
398 * file size and now needs to be truncated back.
399 */
400STATIC void
401xfs_aio_write_newsize_update(
402 struct xfs_inode *ip)
403{
404 if (ip->i_new_size) {
405 xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
406 ip->i_new_size = 0;
407 if (ip->i_d.di_size > ip->i_size)
408 ip->i_d.di_size = ip->i_size;
409 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
410 }
411}
412
413/*
414 * xfs_file_splice_write() does not use xfs_rw_ilock() because
415 * generic_file_splice_write() takes the i_mutex itself. This, in theory,
 416 could cause lock inversions between the aio_write path and the splice path
417 * if someone is doing concurrent splice(2) based writes and write(2) based
418 * writes to the same inode. The only real way to fix this is to re-implement
419 * the generic code here with correct locking orders.
420 */
324STATIC ssize_t 421STATIC ssize_t
325xfs_file_splice_write( 422xfs_file_splice_write(
326 struct pipe_inode_info *pipe, 423 struct pipe_inode_info *pipe,
@@ -331,7 +428,7 @@ xfs_file_splice_write(
331{ 428{
332 struct inode *inode = outfilp->f_mapping->host; 429 struct inode *inode = outfilp->f_mapping->host;
333 struct xfs_inode *ip = XFS_I(inode); 430 struct xfs_inode *ip = XFS_I(inode);
334 xfs_fsize_t isize, new_size; 431 xfs_fsize_t new_size;
335 int ioflags = 0; 432 int ioflags = 0;
336 ssize_t ret; 433 ssize_t ret;
337 434
@@ -355,27 +452,9 @@ xfs_file_splice_write(
355 trace_xfs_file_splice_write(ip, count, *ppos, ioflags); 452 trace_xfs_file_splice_write(ip, count, *ppos, ioflags);
356 453
357 ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); 454 ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
358 if (ret > 0)
359 XFS_STATS_ADD(xs_write_bytes, ret);
360 455
361 isize = i_size_read(inode); 456 xfs_aio_write_isize_update(inode, ppos, ret);
362 if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize)) 457 xfs_aio_write_newsize_update(ip);
363 *ppos = isize;
364
365 if (*ppos > ip->i_size) {
366 xfs_ilock(ip, XFS_ILOCK_EXCL);
367 if (*ppos > ip->i_size)
368 ip->i_size = *ppos;
369 xfs_iunlock(ip, XFS_ILOCK_EXCL);
370 }
371
372 if (ip->i_new_size) {
373 xfs_ilock(ip, XFS_ILOCK_EXCL);
374 ip->i_new_size = 0;
375 if (ip->i_d.di_size > ip->i_size)
376 ip->i_d.di_size = ip->i_size;
377 xfs_iunlock(ip, XFS_ILOCK_EXCL);
378 }
379 xfs_iunlock(ip, XFS_IOLOCK_EXCL); 458 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
380 return ret; 459 return ret;
381} 460}
@@ -562,247 +641,318 @@ out_lock:
562 return error; 641 return error;
563} 642}
564 643
644/*
645 * Common pre-write limit and setup checks.
646 *
647 * Returns with iolock held according to @iolock.
648 */
565STATIC ssize_t 649STATIC ssize_t
566xfs_file_aio_write( 650xfs_file_aio_write_checks(
567 struct kiocb *iocb, 651 struct file *file,
568 const struct iovec *iovp, 652 loff_t *pos,
569 unsigned long nr_segs, 653 size_t *count,
570 loff_t pos) 654 int *iolock)
571{ 655{
572 struct file *file = iocb->ki_filp; 656 struct inode *inode = file->f_mapping->host;
573 struct address_space *mapping = file->f_mapping;
574 struct inode *inode = mapping->host;
575 struct xfs_inode *ip = XFS_I(inode); 657 struct xfs_inode *ip = XFS_I(inode);
576 struct xfs_mount *mp = ip->i_mount; 658 xfs_fsize_t new_size;
577 ssize_t ret = 0, error = 0; 659 int error = 0;
578 int ioflags = 0;
579 xfs_fsize_t isize, new_size;
580 int iolock;
581 size_t ocount = 0, count;
582 int need_i_mutex;
583 660
584 XFS_STATS_INC(xs_write_calls); 661 error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode));
662 if (error) {
663 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock);
664 *iolock = 0;
665 return error;
666 }
585 667
586 BUG_ON(iocb->ki_pos != pos); 668 new_size = *pos + *count;
669 if (new_size > ip->i_size)
670 ip->i_new_size = new_size;
587 671
588 if (unlikely(file->f_flags & O_DIRECT)) 672 if (likely(!(file->f_mode & FMODE_NOCMTIME)))
589 ioflags |= IO_ISDIRECT; 673 file_update_time(file);
590 if (file->f_mode & FMODE_NOCMTIME)
591 ioflags |= IO_INVIS;
592 674
593 error = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ); 675 /*
676 * If the offset is beyond the size of the file, we need to zero any
677 * blocks that fall between the existing EOF and the start of this
678 * write.
679 */
680 if (*pos > ip->i_size)
681 error = -xfs_zero_eof(ip, *pos, ip->i_size);
682
683 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
594 if (error) 684 if (error)
595 return error; 685 return error;
596 686
597 count = ocount; 687 /*
598 if (count == 0) 688 * If we're writing the file then make sure to clear the setuid and
599 return 0; 689 * setgid bits if the process is not being run by root. This keeps
600 690 * people from modifying setuid and setgid binaries.
601 xfs_wait_for_freeze(mp, SB_FREEZE_WRITE); 691 */
692 return file_remove_suid(file);
602 693
603 if (XFS_FORCED_SHUTDOWN(mp)) 694}
604 return -EIO;
605 695
606relock: 696/*
607 if (ioflags & IO_ISDIRECT) { 697 * xfs_file_dio_aio_write - handle direct IO writes
608 iolock = XFS_IOLOCK_SHARED; 698 *
609 need_i_mutex = 0; 699 * Lock the inode appropriately to prepare for and issue a direct IO write.
610 } else { 700 * By separating it from the buffered write path we remove all the tricky to
611 iolock = XFS_IOLOCK_EXCL; 701 * follow locking changes and looping.
612 need_i_mutex = 1; 702 *
613 mutex_lock(&inode->i_mutex); 703 * If there are cached pages or we're extending the file, we need IOLOCK_EXCL
704 * until we're sure the bytes at the new EOF have been zeroed and/or the cached
705 * pages are flushed out.
706 *
707 * In most cases the direct IO writes will be done holding IOLOCK_SHARED
708 * allowing them to be done in parallel with reads and other direct IO writes.
709 * However, if the IO is not aligned to filesystem blocks, the direct IO layer
710 * needs to do sub-block zeroing and that requires serialisation against other
711 * direct IOs to the same block. In this case we need to serialise the
712 * submission of the unaligned IOs so that we don't get racing block zeroing in
713 * the dio layer. To avoid the problem with aio, we also need to wait for
714 * outstanding IOs to complete so that unwritten extent conversion is completed
715 * before we try to map the overlapping block. This is currently implemented by
716 * hitting it with a big hammer (i.e. xfs_ioend_wait()).
717 *
718 * Returns with locks held indicated by @iolock and errors indicated by
719 * negative return values.
720 */
721STATIC ssize_t
722xfs_file_dio_aio_write(
723 struct kiocb *iocb,
724 const struct iovec *iovp,
725 unsigned long nr_segs,
726 loff_t pos,
727 size_t ocount,
728 int *iolock)
729{
730 struct file *file = iocb->ki_filp;
731 struct address_space *mapping = file->f_mapping;
732 struct inode *inode = mapping->host;
733 struct xfs_inode *ip = XFS_I(inode);
734 struct xfs_mount *mp = ip->i_mount;
735 ssize_t ret = 0;
736 size_t count = ocount;
737 int unaligned_io = 0;
738 struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ?
739 mp->m_rtdev_targp : mp->m_ddev_targp;
740
741 *iolock = 0;
742 if ((pos & target->bt_smask) || (count & target->bt_smask))
743 return -XFS_ERROR(EINVAL);
744
745 if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask))
746 unaligned_io = 1;
747
748 if (unaligned_io || mapping->nrpages || pos > ip->i_size)
749 *iolock = XFS_IOLOCK_EXCL;
750 else
751 *iolock = XFS_IOLOCK_SHARED;
752 xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
753
754 ret = xfs_file_aio_write_checks(file, &pos, &count, iolock);
755 if (ret)
756 return ret;
757
758 if (mapping->nrpages) {
759 WARN_ON(*iolock != XFS_IOLOCK_EXCL);
760 ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1,
761 FI_REMAPF_LOCKED);
762 if (ret)
763 return ret;
614 } 764 }
615 765
616 xfs_ilock(ip, XFS_ILOCK_EXCL|iolock); 766 /*
617 767 * If we are doing unaligned IO, wait for all other IO to drain,
618start: 768 * otherwise demote the lock if we had to flush cached pages
619 error = -generic_write_checks(file, &pos, &count, 769 */
620 S_ISBLK(inode->i_mode)); 770 if (unaligned_io)
621 if (error) { 771 xfs_ioend_wait(ip);
622 xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); 772 else if (*iolock == XFS_IOLOCK_EXCL) {
623 goto out_unlock_mutex; 773 xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
774 *iolock = XFS_IOLOCK_SHARED;
624 } 775 }
625 776
626 if (ioflags & IO_ISDIRECT) { 777 trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
627 xfs_buftarg_t *target = 778 ret = generic_file_direct_write(iocb, iovp,
628 XFS_IS_REALTIME_INODE(ip) ? 779 &nr_segs, pos, &iocb->ki_pos, count, ocount);
629 mp->m_rtdev_targp : mp->m_ddev_targp;
630 780
631 if ((pos & target->bt_smask) || (count & target->bt_smask)) { 781 /* No fallback to buffered IO on errors for XFS. */
632 xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); 782 ASSERT(ret < 0 || ret == count);
633 return XFS_ERROR(-EINVAL); 783 return ret;
634 } 784}
635 785
636 if (!need_i_mutex && (mapping->nrpages || pos > ip->i_size)) { 786STATIC ssize_t
637 xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); 787xfs_file_buffered_aio_write(
638 iolock = XFS_IOLOCK_EXCL; 788 struct kiocb *iocb,
639 need_i_mutex = 1; 789 const struct iovec *iovp,
640 mutex_lock(&inode->i_mutex); 790 unsigned long nr_segs,
641 xfs_ilock(ip, XFS_ILOCK_EXCL|iolock); 791 loff_t pos,
642 goto start; 792 size_t ocount,
643 } 793 int *iolock)
644 } 794{
795 struct file *file = iocb->ki_filp;
796 struct address_space *mapping = file->f_mapping;
797 struct inode *inode = mapping->host;
798 struct xfs_inode *ip = XFS_I(inode);
799 ssize_t ret;
800 int enospc = 0;
801 size_t count = ocount;
645 802
646 new_size = pos + count; 803 *iolock = XFS_IOLOCK_EXCL;
647 if (new_size > ip->i_size) 804 xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
648 ip->i_new_size = new_size;
649 805
650 if (likely(!(ioflags & IO_INVIS))) 806 ret = xfs_file_aio_write_checks(file, &pos, &count, iolock);
651 file_update_time(file); 807 if (ret)
808 return ret;
809
810 /* We can write back this queue in page reclaim */
811 current->backing_dev_info = mapping->backing_dev_info;
652 812
813write_retry:
814 trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0);
815 ret = generic_file_buffered_write(iocb, iovp, nr_segs,
816 pos, &iocb->ki_pos, count, ret);
653 /* 817 /*
654 * If the offset is beyond the size of the file, we have a couple 818 * if we just got an ENOSPC, flush the inode now we aren't holding any
655 * of things to do. First, if there is already space allocated 819 * page locks and retry *once*
656 * we need to either create holes or zero the disk or ...
657 *
658 * If there is a page where the previous size lands, we need
659 * to zero it out up to the new size.
660 */ 820 */
661 821 if (ret == -ENOSPC && !enospc) {
662 if (pos > ip->i_size) { 822 ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
663 error = xfs_zero_eof(ip, pos, ip->i_size); 823 if (ret)
664 if (error) { 824 return ret;
665 xfs_iunlock(ip, XFS_ILOCK_EXCL); 825 enospc = 1;
666 goto out_unlock_internal; 826 goto write_retry;
667 }
668 } 827 }
669 xfs_iunlock(ip, XFS_ILOCK_EXCL); 828 current->backing_dev_info = NULL;
829 return ret;
830}
670 831
671 /* 832STATIC ssize_t
672 * If we're writing the file then make sure to clear the 833xfs_file_aio_write(
673 * setuid and setgid bits if the process is not being run 834 struct kiocb *iocb,
674 * by root. This keeps people from modifying setuid and 835 const struct iovec *iovp,
675 * setgid binaries. 836 unsigned long nr_segs,
676 */ 837 loff_t pos)
677 error = -file_remove_suid(file); 838{
678 if (unlikely(error)) 839 struct file *file = iocb->ki_filp;
679 goto out_unlock_internal; 840 struct address_space *mapping = file->f_mapping;
841 struct inode *inode = mapping->host;
842 struct xfs_inode *ip = XFS_I(inode);
843 ssize_t ret;
844 int iolock;
845 size_t ocount = 0;
680 846
681 /* We can write back this queue in page reclaim */ 847 XFS_STATS_INC(xs_write_calls);
682 current->backing_dev_info = mapping->backing_dev_info;
683 848
684 if ((ioflags & IO_ISDIRECT)) { 849 BUG_ON(iocb->ki_pos != pos);
685 if (mapping->nrpages) {
686 WARN_ON(need_i_mutex == 0);
687 error = xfs_flushinval_pages(ip,
688 (pos & PAGE_CACHE_MASK),
689 -1, FI_REMAPF_LOCKED);
690 if (error)
691 goto out_unlock_internal;
692 }
693 850
694 if (need_i_mutex) { 851 ret = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ);
695 /* demote the lock now the cached pages are gone */ 852 if (ret)
696 xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); 853 return ret;
697 mutex_unlock(&inode->i_mutex);
698 854
699 iolock = XFS_IOLOCK_SHARED; 855 if (ocount == 0)
700 need_i_mutex = 0; 856 return 0;
701 }
702 857
703 trace_xfs_file_direct_write(ip, count, iocb->ki_pos, ioflags); 858 xfs_wait_for_freeze(ip->i_mount, SB_FREEZE_WRITE);
704 ret = generic_file_direct_write(iocb, iovp,
705 &nr_segs, pos, &iocb->ki_pos, count, ocount);
706 859
707 /* 860 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
708 * direct-io write to a hole: fall through to buffered I/O 861 return -EIO;
709 * for completing the rest of the request.
710 */
711 if (ret >= 0 && ret != count) {
712 XFS_STATS_ADD(xs_write_bytes, ret);
713 862
714 pos += ret; 863 if (unlikely(file->f_flags & O_DIRECT))
715 count -= ret; 864 ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos,
865 ocount, &iolock);
866 else
867 ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos,
868 ocount, &iolock);
716 869
717 ioflags &= ~IO_ISDIRECT; 870 xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret);
718 xfs_iunlock(ip, iolock);
719 goto relock;
720 }
721 } else {
722 int enospc = 0;
723 ssize_t ret2 = 0;
724 871
725write_retry: 872 if (ret <= 0)
726 trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, ioflags); 873 goto out_unlock;
727 ret2 = generic_file_buffered_write(iocb, iovp, nr_segs,
728 pos, &iocb->ki_pos, count, ret);
729 /*
730 * if we just got an ENOSPC, flush the inode now we
731 * aren't holding any page locks and retry *once*
732 */
733 if (ret2 == -ENOSPC && !enospc) {
734 error = xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
735 if (error)
736 goto out_unlock_internal;
737 enospc = 1;
738 goto write_retry;
739 }
740 ret = ret2;
741 }
742 874
743 current->backing_dev_info = NULL; 875 /* Handle various SYNC-type writes */
876 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
877 loff_t end = pos + ret - 1;
878 int error, error2;
744 879
745 isize = i_size_read(inode); 880 xfs_rw_iunlock(ip, iolock);
746 if (unlikely(ret < 0 && ret != -EFAULT && iocb->ki_pos > isize)) 881 error = filemap_write_and_wait_range(mapping, pos, end);
747 iocb->ki_pos = isize; 882 xfs_rw_ilock(ip, iolock);
748 883
749 if (iocb->ki_pos > ip->i_size) { 884 error2 = -xfs_file_fsync(file,
750 xfs_ilock(ip, XFS_ILOCK_EXCL); 885 (file->f_flags & __O_SYNC) ? 0 : 1);
751 if (iocb->ki_pos > ip->i_size) 886 if (error)
752 ip->i_size = iocb->ki_pos; 887 ret = error;
753 xfs_iunlock(ip, XFS_ILOCK_EXCL); 888 else if (error2)
889 ret = error2;
754 } 890 }
755 891
756 error = -ret; 892out_unlock:
757 if (ret <= 0) 893 xfs_aio_write_newsize_update(ip);
758 goto out_unlock_internal; 894 xfs_rw_iunlock(ip, iolock);
895 return ret;
896}
759 897
760 XFS_STATS_ADD(xs_write_bytes, ret); 898STATIC long
899xfs_file_fallocate(
900 struct file *file,
901 int mode,
902 loff_t offset,
903 loff_t len)
904{
905 struct inode *inode = file->f_path.dentry->d_inode;
906 long error;
907 loff_t new_size = 0;
908 xfs_flock64_t bf;
909 xfs_inode_t *ip = XFS_I(inode);
910 int cmd = XFS_IOC_RESVSP;
911 int attr_flags = XFS_ATTR_NOLOCK;
761 912
762 /* Handle various SYNC-type writes */ 913 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
763 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { 914 return -EOPNOTSUPP;
764 loff_t end = pos + ret - 1;
765 int error2;
766 915
767 xfs_iunlock(ip, iolock); 916 bf.l_whence = 0;
768 if (need_i_mutex) 917 bf.l_start = offset;
769 mutex_unlock(&inode->i_mutex); 918 bf.l_len = len;
770 919
771 error2 = filemap_write_and_wait_range(mapping, pos, end); 920 xfs_ilock(ip, XFS_IOLOCK_EXCL);
772 if (!error)
773 error = error2;
774 if (need_i_mutex)
775 mutex_lock(&inode->i_mutex);
776 xfs_ilock(ip, iolock);
777 921
778 error2 = -xfs_file_fsync(file, 922 if (mode & FALLOC_FL_PUNCH_HOLE)
779 (file->f_flags & __O_SYNC) ? 0 : 1); 923 cmd = XFS_IOC_UNRESVSP;
780 if (!error) 924
781 error = error2; 925 /* check the new inode size is valid before allocating */
926 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
927 offset + len > i_size_read(inode)) {
928 new_size = offset + len;
929 error = inode_newsize_ok(inode, new_size);
930 if (error)
931 goto out_unlock;
782 } 932 }
783 933
784 out_unlock_internal: 934 if (file->f_flags & O_DSYNC)
785 if (ip->i_new_size) { 935 attr_flags |= XFS_ATTR_SYNC;
786 xfs_ilock(ip, XFS_ILOCK_EXCL); 936
787 ip->i_new_size = 0; 937 error = -xfs_change_file_space(ip, cmd, &bf, 0, attr_flags);
788 /* 938 if (error)
789 * If this was a direct or synchronous I/O that failed (such 939 goto out_unlock;
790 * as ENOSPC) then part of the I/O may have been written to 940
791 * disk before the error occured. In this case the on-disk 941 /* Change file size if needed */
792 * file size may have been adjusted beyond the in-memory file 942 if (new_size) {
793 * size and now needs to be truncated back. 943 struct iattr iattr;
794 */ 944
795 if (ip->i_d.di_size > ip->i_size) 945 iattr.ia_valid = ATTR_SIZE;
796 ip->i_d.di_size = ip->i_size; 946 iattr.ia_size = new_size;
797 xfs_iunlock(ip, XFS_ILOCK_EXCL); 947 error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK);
798 } 948 }
799 xfs_iunlock(ip, iolock); 949
800 out_unlock_mutex: 950out_unlock:
801 if (need_i_mutex) 951 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
802 mutex_unlock(&inode->i_mutex); 952 return error;
803 return -error;
804} 953}
805 954
955
806STATIC int 956STATIC int
807xfs_file_open( 957xfs_file_open(
808 struct inode *inode, 958 struct inode *inode,
@@ -921,6 +1071,7 @@ const struct file_operations xfs_file_operations = {
921 .open = xfs_file_open, 1071 .open = xfs_file_open,
922 .release = xfs_file_release, 1072 .release = xfs_file_release,
923 .fsync = xfs_file_fsync, 1073 .fsync = xfs_file_fsync,
1074 .fallocate = xfs_file_fallocate,
924}; 1075};
925 1076
926const struct file_operations xfs_dir_file_operations = { 1077const struct file_operations xfs_dir_file_operations = {
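
The new .fallocate entry in xfs_file_operations above means preallocation and hole punching are now reached through the fallocate(2) system call rather than the old inode operation removed from xfs_iops.c. The short userspace sketch below is illustrative only: the file path is a placeholder, and pairing FALLOC_FL_PUNCH_HOLE with FALLOC_FL_KEEP_SIZE reflects the usual VFS convention rather than anything specific to this patch.

/*
 * Hedged sketch: exercise the paths that now end up in
 * xfs_file_fallocate().  "/mnt/xfs/testfile" is a placeholder.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <linux/falloc.h>	/* FALLOC_FL_KEEP_SIZE, FALLOC_FL_PUNCH_HOLE */

int main(void)
{
	int fd = open("/mnt/xfs/testfile", O_RDWR | O_CREAT, 0644);

	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* Preallocate 1 MiB without changing i_size (XFS_IOC_RESVSP path). */
	if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 1 << 20))
		perror("fallocate keep-size");

	/* Punch a 64 KiB hole at the start of the file (XFS_IOC_UNRESVSP path). */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, 64 << 10))
		perror("fallocate punch-hole");

	close(fd);
	return 0;
}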
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c
index 1f279b012f94..ed88ed16811c 100644
--- a/fs/xfs/linux-2.6/xfs_fs_subr.c
+++ b/fs/xfs/linux-2.6/xfs_fs_subr.c
@@ -32,10 +32,9 @@ xfs_tosspages(
32 xfs_off_t last, 32 xfs_off_t last,
33 int fiopt) 33 int fiopt)
34{ 34{
35 struct address_space *mapping = VFS_I(ip)->i_mapping; 35 /* can't toss partial tail pages, so mask them out */
36 36 last &= ~(PAGE_SIZE - 1);
37 if (mapping->nrpages) 37 truncate_inode_pages_range(VFS_I(ip)->i_mapping, first, last - 1);
38 truncate_inode_pages(mapping, first);
39} 38}
40 39
41int 40int
@@ -50,12 +49,11 @@ xfs_flushinval_pages(
50 49
51 trace_xfs_pagecache_inval(ip, first, last); 50 trace_xfs_pagecache_inval(ip, first, last);
52 51
53 if (mapping->nrpages) { 52 xfs_iflags_clear(ip, XFS_ITRUNCATED);
54 xfs_iflags_clear(ip, XFS_ITRUNCATED); 53 ret = filemap_write_and_wait_range(mapping, first,
55 ret = filemap_write_and_wait(mapping); 54 last == -1 ? LLONG_MAX : last);
56 if (!ret) 55 if (!ret)
57 truncate_inode_pages(mapping, first); 56 truncate_inode_pages_range(mapping, first, last);
58 }
59 return -ret; 57 return -ret;
60} 58}
61 59
@@ -71,10 +69,9 @@ xfs_flush_pages(
71 int ret = 0; 69 int ret = 0;
72 int ret2; 70 int ret2;
73 71
74 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { 72 xfs_iflags_clear(ip, XFS_ITRUNCATED);
75 xfs_iflags_clear(ip, XFS_ITRUNCATED); 73 ret = -filemap_fdatawrite_range(mapping, first,
76 ret = -filemap_fdatawrite(mapping); 74 last == -1 ? LLONG_MAX : last);
77 }
78 if (flags & XBF_ASYNC) 75 if (flags & XBF_ASYNC)
79 return ret; 76 return ret;
80 ret2 = xfs_wait_on_pages(ip, first, last); 77 ret2 = xfs_wait_on_pages(ip, first, last);
@@ -91,7 +88,9 @@ xfs_wait_on_pages(
91{ 88{
92 struct address_space *mapping = VFS_I(ip)->i_mapping; 89 struct address_space *mapping = VFS_I(ip)->i_mapping;
93 90
94 if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) 91 if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) {
95 return -filemap_fdatawait(mapping); 92 return -filemap_fdatawait_range(mapping, first,
93 last == -1 ? ip->i_size - 1 : last);
94 }
96 return 0; 95 return 0;
97} 96}
diff --git a/fs/xfs/linux-2.6/xfs_globals.c b/fs/xfs/linux-2.6/xfs_globals.c
index 2ae8b1ccb02e..76e81cff70b9 100644
--- a/fs/xfs/linux-2.6/xfs_globals.c
+++ b/fs/xfs/linux-2.6/xfs_globals.c
@@ -16,7 +16,6 @@
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */ 17 */
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_cred.h"
20#include "xfs_sysctl.h" 19#include "xfs_sysctl.h"
21 20
22/* 21/*
diff --git a/fs/xfs/linux-2.6/xfs_globals.h b/fs/xfs/linux-2.6/xfs_globals.h
deleted file mode 100644
index 69f71caf061c..000000000000
--- a/fs/xfs/linux-2.6/xfs_globals.h
+++ /dev/null
@@ -1,23 +0,0 @@
1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_GLOBALS_H__
19#define __XFS_GLOBALS_H__
20
21extern uint64_t xfs_panic_mask; /* set to cause more panics */
22
23#endif /* __XFS_GLOBALS_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 3b9e626f7cd1..acca2c5ca3fa 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -39,6 +39,7 @@
39#include "xfs_dfrag.h" 39#include "xfs_dfrag.h"
40#include "xfs_fsops.h" 40#include "xfs_fsops.h"
41#include "xfs_vnodeops.h" 41#include "xfs_vnodeops.h"
42#include "xfs_discard.h"
42#include "xfs_quota.h" 43#include "xfs_quota.h"
43#include "xfs_inode_item.h" 44#include "xfs_inode_item.h"
44#include "xfs_export.h" 45#include "xfs_export.h"
@@ -416,7 +417,7 @@ xfs_attrlist_by_handle(
416 if (IS_ERR(dentry)) 417 if (IS_ERR(dentry))
417 return PTR_ERR(dentry); 418 return PTR_ERR(dentry);
418 419
419 kbuf = kmalloc(al_hreq.buflen, GFP_KERNEL); 420 kbuf = kzalloc(al_hreq.buflen, GFP_KERNEL);
420 if (!kbuf) 421 if (!kbuf)
421 goto out_dput; 422 goto out_dput;
422 423
@@ -623,6 +624,10 @@ xfs_ioc_space(
623 624
624 if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) 625 if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
625 attr_flags |= XFS_ATTR_NONBLOCK; 626 attr_flags |= XFS_ATTR_NONBLOCK;
627
628 if (filp->f_flags & O_DSYNC)
629 attr_flags |= XFS_ATTR_SYNC;
630
626 if (ioflags & IO_INVIS) 631 if (ioflags & IO_INVIS)
627 attr_flags |= XFS_ATTR_DMI; 632 attr_flags |= XFS_ATTR_DMI;
628 633
@@ -694,14 +699,19 @@ xfs_ioc_fsgeometry_v1(
694 xfs_mount_t *mp, 699 xfs_mount_t *mp,
695 void __user *arg) 700 void __user *arg)
696{ 701{
697 xfs_fsop_geom_v1_t fsgeo; 702 xfs_fsop_geom_t fsgeo;
698 int error; 703 int error;
699 704
700 error = xfs_fs_geometry(mp, (xfs_fsop_geom_t *)&fsgeo, 3); 705 error = xfs_fs_geometry(mp, &fsgeo, 3);
701 if (error) 706 if (error)
702 return -error; 707 return -error;
703 708
704 if (copy_to_user(arg, &fsgeo, sizeof(fsgeo))) 709 /*
710 * Caller should have passed an argument of type
711 * xfs_fsop_geom_v1_t. This is a proper subset of the
712 * xfs_fsop_geom_t that xfs_fs_geometry() fills in.
713 */
714 if (copy_to_user(arg, &fsgeo, sizeof(xfs_fsop_geom_v1_t)))
705 return -XFS_ERROR(EFAULT); 715 return -XFS_ERROR(EFAULT);
706 return 0; 716 return 0;
707} 717}
@@ -790,7 +800,7 @@ xfs_ioc_fsgetxattr(
790 xfs_ilock(ip, XFS_ILOCK_SHARED); 800 xfs_ilock(ip, XFS_ILOCK_SHARED);
791 fa.fsx_xflags = xfs_ip2xflags(ip); 801 fa.fsx_xflags = xfs_ip2xflags(ip);
792 fa.fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog; 802 fa.fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog;
793 fa.fsx_projid = ip->i_d.di_projid; 803 fa.fsx_projid = xfs_get_projid(ip);
794 804
795 if (attr) { 805 if (attr) {
796 if (ip->i_afp) { 806 if (ip->i_afp) {
@@ -909,10 +919,10 @@ xfs_ioctl_setattr(
909 return XFS_ERROR(EIO); 919 return XFS_ERROR(EIO);
910 920
911 /* 921 /*
912 * Disallow 32bit project ids because on-disk structure 922 * Disallow 32bit project ids when projid32bit feature is not enabled.
913 * is 16bit only.
914 */ 923 */
915 if ((mask & FSX_PROJID) && (fa->fsx_projid > (__uint16_t)-1)) 924 if ((mask & FSX_PROJID) && (fa->fsx_projid > (__uint16_t)-1) &&
925 !xfs_sb_version_hasprojid32bit(&ip->i_mount->m_sb))
916 return XFS_ERROR(EINVAL); 926 return XFS_ERROR(EINVAL);
917 927
918 /* 928 /*
@@ -961,7 +971,7 @@ xfs_ioctl_setattr(
961 if (mask & FSX_PROJID) { 971 if (mask & FSX_PROJID) {
962 if (XFS_IS_QUOTA_RUNNING(mp) && 972 if (XFS_IS_QUOTA_RUNNING(mp) &&
963 XFS_IS_PQUOTA_ON(mp) && 973 XFS_IS_PQUOTA_ON(mp) &&
964 ip->i_d.di_projid != fa->fsx_projid) { 974 xfs_get_projid(ip) != fa->fsx_projid) {
965 ASSERT(tp); 975 ASSERT(tp);
966 code = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp, 976 code = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp,
967 capable(CAP_FOWNER) ? 977 capable(CAP_FOWNER) ?
@@ -984,10 +994,22 @@ xfs_ioctl_setattr(
984 994
985 /* 995 /*
986 * Extent size must be a multiple of the appropriate block 996 * Extent size must be a multiple of the appropriate block
987 * size, if set at all. 997 * size, if set at all. It must also be smaller than the
998 * maximum extent size supported by the filesystem.
999 *
1000 * Also, for non-realtime files, limit the extent size hint to
1001 * half the size of the AGs in the filesystem so alignment
1002 * doesn't result in extents larger than an AG.
988 */ 1003 */
989 if (fa->fsx_extsize != 0) { 1004 if (fa->fsx_extsize != 0) {
990 xfs_extlen_t size; 1005 xfs_extlen_t size;
1006 xfs_fsblock_t extsize_fsb;
1007
1008 extsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_extsize);
1009 if (extsize_fsb > MAXEXTLEN) {
1010 code = XFS_ERROR(EINVAL);
1011 goto error_return;
1012 }
991 1013
992 if (XFS_IS_REALTIME_INODE(ip) || 1014 if (XFS_IS_REALTIME_INODE(ip) ||
993 ((mask & FSX_XFLAGS) && 1015 ((mask & FSX_XFLAGS) &&
@@ -996,6 +1018,10 @@ xfs_ioctl_setattr(
996 mp->m_sb.sb_blocklog; 1018 mp->m_sb.sb_blocklog;
997 } else { 1019 } else {
998 size = mp->m_sb.sb_blocksize; 1020 size = mp->m_sb.sb_blocksize;
1021 if (extsize_fsb > mp->m_sb.sb_agblocks / 2) {
1022 code = XFS_ERROR(EINVAL);
1023 goto error_return;
1024 }
999 } 1025 }
1000 1026
1001 if (fa->fsx_extsize % size) { 1027 if (fa->fsx_extsize % size) {
@@ -1063,12 +1089,12 @@ xfs_ioctl_setattr(
1063 * Change the ownerships and register quota modifications 1089 * Change the ownerships and register quota modifications
1064 * in the transaction. 1090 * in the transaction.
1065 */ 1091 */
1066 if (ip->i_d.di_projid != fa->fsx_projid) { 1092 if (xfs_get_projid(ip) != fa->fsx_projid) {
1067 if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) { 1093 if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) {
1068 olddquot = xfs_qm_vop_chown(tp, ip, 1094 olddquot = xfs_qm_vop_chown(tp, ip,
1069 &ip->i_gdquot, gdqp); 1095 &ip->i_gdquot, gdqp);
1070 } 1096 }
1071 ip->i_d.di_projid = fa->fsx_projid; 1097 xfs_set_projid(ip, fa->fsx_projid);
1072 1098
1073 /* 1099 /*
1074 * We may have to rev the inode as well as 1100 * We may have to rev the inode as well as
@@ -1088,8 +1114,8 @@ xfs_ioctl_setattr(
1088 xfs_diflags_to_linux(ip); 1114 xfs_diflags_to_linux(ip);
1089 } 1115 }
1090 1116
1117 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
1091 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 1118 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1092 xfs_ichgtime(ip, XFS_ICHGTIME_CHG);
1093 1119
1094 XFS_STATS_INC(xs_ig_attrchg); 1120 XFS_STATS_INC(xs_ig_attrchg);
1095 1121
@@ -1294,6 +1320,8 @@ xfs_file_ioctl(
1294 trace_xfs_file_ioctl(ip); 1320 trace_xfs_file_ioctl(ip);
1295 1321
1296 switch (cmd) { 1322 switch (cmd) {
1323 case FITRIM:
1324 return xfs_ioc_trim(mp, arg);
1297 case XFS_IOC_ALLOCSP: 1325 case XFS_IOC_ALLOCSP:
1298 case XFS_IOC_FREESP: 1326 case XFS_IOC_FREESP:
1299 case XFS_IOC_RESVSP: 1327 case XFS_IOC_RESVSP:
@@ -1301,7 +1329,8 @@ xfs_file_ioctl(
1301 case XFS_IOC_ALLOCSP64: 1329 case XFS_IOC_ALLOCSP64:
1302 case XFS_IOC_FREESP64: 1330 case XFS_IOC_FREESP64:
1303 case XFS_IOC_RESVSP64: 1331 case XFS_IOC_RESVSP64:
1304 case XFS_IOC_UNRESVSP64: { 1332 case XFS_IOC_UNRESVSP64:
1333 case XFS_IOC_ZERO_RANGE: {
1305 xfs_flock64_t bf; 1334 xfs_flock64_t bf;
1306 1335
1307 if (copy_from_user(&bf, arg, sizeof(bf))) 1336 if (copy_from_user(&bf, arg, sizeof(bf)))
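
The extent size hint validation added to xfs_ioctl_setattr() above is exercised from userspace through XFS_IOC_FSSETXATTR. The sketch below is an assumption-laden illustration: it presumes the xfsprogs development headers provide <xfs/xfs.h> with the fsxattr and XFS_XFLAG_EXTSIZE definitions, and the file path is hypothetical.

/*
 * Hedged sketch: set a 1 MiB extent size hint, the value the new
 * MAXEXTLEN / half-an-AG checks in xfs_ioctl_setattr() validate.
 * Assumes <xfs/xfs.h> from xfsprogs-devel; path is a placeholder.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <xfs/xfs.h>		/* struct fsxattr, XFS_IOC_FSGETXATTR/FSSETXATTR */

int main(void)
{
	struct fsxattr fsx;
	int fd = open("/mnt/xfs/datafile", O_RDWR);

	if (fd < 0) {
		perror("open");
		return 1;
	}

	if (ioctl(fd, XFS_IOC_FSGETXATTR, &fsx)) {
		perror("FSGETXATTR");
		close(fd);
		return 1;
	}

	fsx.fsx_xflags |= XFS_XFLAG_EXTSIZE;	/* honour the hint on this file */
	fsx.fsx_extsize = 1024 * 1024;		/* hint in bytes; must be block aligned */

	/* Expect EINVAL here if the hint exceeds MAXEXTLEN or half the AG size. */
	if (ioctl(fd, XFS_IOC_FSSETXATTR, &fsx))
		perror("FSSETXATTR");

	close(fd);
	return 0;
}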
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index 6c83f7f62dc9..54e623bfbb85 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -164,7 +164,8 @@ xfs_ioctl32_bstat_copyin(
164 get_user(bstat->bs_extsize, &bstat32->bs_extsize) || 164 get_user(bstat->bs_extsize, &bstat32->bs_extsize) ||
165 get_user(bstat->bs_extents, &bstat32->bs_extents) || 165 get_user(bstat->bs_extents, &bstat32->bs_extents) ||
166 get_user(bstat->bs_gen, &bstat32->bs_gen) || 166 get_user(bstat->bs_gen, &bstat32->bs_gen) ||
167 get_user(bstat->bs_projid, &bstat32->bs_projid) || 167 get_user(bstat->bs_projid_lo, &bstat32->bs_projid_lo) ||
168 get_user(bstat->bs_projid_hi, &bstat32->bs_projid_hi) ||
168 get_user(bstat->bs_dmevmask, &bstat32->bs_dmevmask) || 169 get_user(bstat->bs_dmevmask, &bstat32->bs_dmevmask) ||
169 get_user(bstat->bs_dmstate, &bstat32->bs_dmstate) || 170 get_user(bstat->bs_dmstate, &bstat32->bs_dmstate) ||
170 get_user(bstat->bs_aextents, &bstat32->bs_aextents)) 171 get_user(bstat->bs_aextents, &bstat32->bs_aextents))
@@ -218,6 +219,7 @@ xfs_bulkstat_one_fmt_compat(
218 put_user(buffer->bs_extents, &p32->bs_extents) || 219 put_user(buffer->bs_extents, &p32->bs_extents) ||
219 put_user(buffer->bs_gen, &p32->bs_gen) || 220 put_user(buffer->bs_gen, &p32->bs_gen) ||
220 put_user(buffer->bs_projid, &p32->bs_projid) || 221 put_user(buffer->bs_projid, &p32->bs_projid) ||
222 put_user(buffer->bs_projid_hi, &p32->bs_projid_hi) ||
221 put_user(buffer->bs_dmevmask, &p32->bs_dmevmask) || 223 put_user(buffer->bs_dmevmask, &p32->bs_dmevmask) ||
222 put_user(buffer->bs_dmstate, &p32->bs_dmstate) || 224 put_user(buffer->bs_dmstate, &p32->bs_dmstate) ||
223 put_user(buffer->bs_aextents, &p32->bs_aextents)) 225 put_user(buffer->bs_aextents, &p32->bs_aextents))
@@ -574,6 +576,7 @@ xfs_file_compat_ioctl(
574 case XFS_IOC_FSGEOMETRY_V1: 576 case XFS_IOC_FSGEOMETRY_V1:
575 case XFS_IOC_FSGROWFSDATA: 577 case XFS_IOC_FSGROWFSDATA:
576 case XFS_IOC_FSGROWFSRT: 578 case XFS_IOC_FSGROWFSRT:
579 case XFS_IOC_ZERO_RANGE:
577 return xfs_file_ioctl(filp, cmd, p); 580 return xfs_file_ioctl(filp, cmd, p);
578#else 581#else
579 case XFS_IOC_ALLOCSP_32: 582 case XFS_IOC_ALLOCSP_32:
@@ -583,7 +586,8 @@ xfs_file_compat_ioctl(
583 case XFS_IOC_RESVSP_32: 586 case XFS_IOC_RESVSP_32:
584 case XFS_IOC_UNRESVSP_32: 587 case XFS_IOC_UNRESVSP_32:
585 case XFS_IOC_RESVSP64_32: 588 case XFS_IOC_RESVSP64_32:
586 case XFS_IOC_UNRESVSP64_32: { 589 case XFS_IOC_UNRESVSP64_32:
590 case XFS_IOC_ZERO_RANGE_32: {
587 struct xfs_flock64 bf; 591 struct xfs_flock64 bf;
588 592
589 if (xfs_compat_flock64_copyin(&bf, arg)) 593 if (xfs_compat_flock64_copyin(&bf, arg))
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.h b/fs/xfs/linux-2.6/xfs_ioctl32.h
index 1024c4f8ba0d..80f4060e8970 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.h
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.h
@@ -65,8 +65,10 @@ typedef struct compat_xfs_bstat {
65 __s32 bs_extsize; /* extent size */ 65 __s32 bs_extsize; /* extent size */
66 __s32 bs_extents; /* number of extents */ 66 __s32 bs_extents; /* number of extents */
67 __u32 bs_gen; /* generation count */ 67 __u32 bs_gen; /* generation count */
68 __u16 bs_projid; /* project id */ 68 __u16 bs_projid_lo; /* lower part of project id */
69 unsigned char bs_pad[14]; /* pad space, unused */ 69#define bs_projid bs_projid_lo /* (previously just bs_projid) */
70 __u16 bs_projid_hi; /* high part of project id */
71 unsigned char bs_pad[12]; /* pad space, unused */
70 __u32 bs_dmevmask; /* DMIG event mask */ 72 __u32 bs_dmevmask; /* DMIG event mask */
71 __u16 bs_dmstate; /* DMIG state info */ 73 __u16 bs_dmstate; /* DMIG state info */
72 __u16 bs_aextents; /* attribute number of extents */ 74 __u16 bs_aextents; /* attribute number of extents */
@@ -182,6 +184,7 @@ typedef struct compat_xfs_flock64 {
182#define XFS_IOC_UNRESVSP_32 _IOW('X', 41, struct compat_xfs_flock64) 184#define XFS_IOC_UNRESVSP_32 _IOW('X', 41, struct compat_xfs_flock64)
183#define XFS_IOC_RESVSP64_32 _IOW('X', 42, struct compat_xfs_flock64) 185#define XFS_IOC_RESVSP64_32 _IOW('X', 42, struct compat_xfs_flock64)
184#define XFS_IOC_UNRESVSP64_32 _IOW('X', 43, struct compat_xfs_flock64) 186#define XFS_IOC_UNRESVSP64_32 _IOW('X', 43, struct compat_xfs_flock64)
187#define XFS_IOC_ZERO_RANGE_32 _IOW('X', 57, struct compat_xfs_flock64)
185 188
186typedef struct compat_xfs_fsop_geom_v1 { 189typedef struct compat_xfs_fsop_geom_v1 {
187 __u32 blocksize; /* filesystem (data) block size */ 190 __u32 blocksize; /* filesystem (data) block size */
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index b1fc2a6bfe83..d44d92cd12b1 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -46,7 +46,6 @@
46#include <linux/namei.h> 46#include <linux/namei.h>
47#include <linux/posix_acl.h> 47#include <linux/posix_acl.h>
48#include <linux/security.h> 48#include <linux/security.h>
49#include <linux/falloc.h>
50#include <linux/fiemap.h> 49#include <linux/fiemap.h>
51#include <linux/slab.h> 50#include <linux/slab.h>
52 51
@@ -71,7 +70,7 @@ xfs_synchronize_times(
71 70
72/* 71/*
73 * If the linux inode is valid, mark it dirty. 72 * If the linux inode is valid, mark it dirty.
74 * Used when commiting a dirty inode into a transaction so that 73 * Used when committing a dirty inode into a transaction so that
75 * the inode will get written back by the linux code 74 * the inode will get written back by the linux code
76 */ 75 */
77void 76void
@@ -95,41 +94,6 @@ xfs_mark_inode_dirty(
95} 94}
96 95
97/* 96/*
98 * Change the requested timestamp in the given inode.
99 * We don't lock across timestamp updates, and we don't log them but
100 * we do record the fact that there is dirty information in core.
101 */
102void
103xfs_ichgtime(
104 xfs_inode_t *ip,
105 int flags)
106{
107 struct inode *inode = VFS_I(ip);
108 timespec_t tv;
109 int sync_it = 0;
110
111 tv = current_fs_time(inode->i_sb);
112
113 if ((flags & XFS_ICHGTIME_MOD) &&
114 !timespec_equal(&inode->i_mtime, &tv)) {
115 inode->i_mtime = tv;
116 sync_it = 1;
117 }
118 if ((flags & XFS_ICHGTIME_CHG) &&
119 !timespec_equal(&inode->i_ctime, &tv)) {
120 inode->i_ctime = tv;
121 sync_it = 1;
122 }
123
124 /*
125 * Update complete - now make sure everyone knows that the inode
126 * is dirty.
127 */
128 if (sync_it)
129 xfs_mark_inode_dirty_sync(ip);
130}
131
132/*
133 * Hook in SELinux. This is not quite correct yet, what we really need 97 * Hook in SELinux. This is not quite correct yet, what we really need
134 * here (as we do for default ACLs) is a mechanism by which creation of 98 * here (as we do for default ACLs) is a mechanism by which creation of
135 * these attrs can be journalled at inode creation time (along with the 99 * these attrs can be journalled at inode creation time (along with the
@@ -138,7 +102,8 @@ xfs_ichgtime(
138STATIC int 102STATIC int
139xfs_init_security( 103xfs_init_security(
140 struct inode *inode, 104 struct inode *inode,
141 struct inode *dir) 105 struct inode *dir,
106 const struct qstr *qstr)
142{ 107{
143 struct xfs_inode *ip = XFS_I(inode); 108 struct xfs_inode *ip = XFS_I(inode);
144 size_t length; 109 size_t length;
@@ -146,7 +111,7 @@ xfs_init_security(
146 unsigned char *name; 111 unsigned char *name;
147 int error; 112 int error;
148 113
149 error = security_inode_init_security(inode, dir, (char **)&name, 114 error = security_inode_init_security(inode, dir, qstr, (char **)&name,
150 &value, &length); 115 &value, &length);
151 if (error) { 116 if (error) {
152 if (error == -EOPNOTSUPP) 117 if (error == -EOPNOTSUPP)
@@ -217,20 +182,20 @@ xfs_vn_mknod(
217 if (IS_POSIXACL(dir)) { 182 if (IS_POSIXACL(dir)) {
218 default_acl = xfs_get_acl(dir, ACL_TYPE_DEFAULT); 183 default_acl = xfs_get_acl(dir, ACL_TYPE_DEFAULT);
219 if (IS_ERR(default_acl)) 184 if (IS_ERR(default_acl))
220 return -PTR_ERR(default_acl); 185 return PTR_ERR(default_acl);
221 186
222 if (!default_acl) 187 if (!default_acl)
223 mode &= ~current_umask(); 188 mode &= ~current_umask();
224 } 189 }
225 190
226 xfs_dentry_to_name(&name, dentry); 191 xfs_dentry_to_name(&name, dentry);
227 error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip, NULL); 192 error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip);
228 if (unlikely(error)) 193 if (unlikely(error))
229 goto out_free_acl; 194 goto out_free_acl;
230 195
231 inode = VFS_I(ip); 196 inode = VFS_I(ip);
232 197
233 error = xfs_init_security(inode, dir); 198 error = xfs_init_security(inode, dir, &dentry->d_name);
234 if (unlikely(error)) 199 if (unlikely(error))
235 goto out_cleanup_inode; 200 goto out_cleanup_inode;
236 201
@@ -352,7 +317,7 @@ xfs_vn_link(
352 if (unlikely(error)) 317 if (unlikely(error))
353 return -error; 318 return -error;
354 319
355 atomic_inc(&inode->i_count); 320 ihold(inode);
356 d_instantiate(dentry, inode); 321 d_instantiate(dentry, inode);
357 return 0; 322 return 0;
358} 323}
@@ -397,13 +362,13 @@ xfs_vn_symlink(
397 (irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO); 362 (irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO);
398 xfs_dentry_to_name(&name, dentry); 363 xfs_dentry_to_name(&name, dentry);
399 364
400 error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip, NULL); 365 error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip);
401 if (unlikely(error)) 366 if (unlikely(error))
402 goto out; 367 goto out;
403 368
404 inode = VFS_I(cip); 369 inode = VFS_I(cip);
405 370
406 error = xfs_init_security(inode, dir); 371 error = xfs_init_security(inode, dir, &dentry->d_name);
407 if (unlikely(error)) 372 if (unlikely(error))
408 goto out_cleanup_inode; 373 goto out_cleanup_inode;
409 374
@@ -540,58 +505,6 @@ xfs_vn_setattr(
540 return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0); 505 return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0);
541} 506}
542 507
543STATIC long
544xfs_vn_fallocate(
545 struct inode *inode,
546 int mode,
547 loff_t offset,
548 loff_t len)
549{
550 long error;
551 loff_t new_size = 0;
552 xfs_flock64_t bf;
553 xfs_inode_t *ip = XFS_I(inode);
554
555 /* preallocation on directories not yet supported */
556 error = -ENODEV;
557 if (S_ISDIR(inode->i_mode))
558 goto out_error;
559
560 bf.l_whence = 0;
561 bf.l_start = offset;
562 bf.l_len = len;
563
564 xfs_ilock(ip, XFS_IOLOCK_EXCL);
565
566 /* check the new inode size is valid before allocating */
567 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
568 offset + len > i_size_read(inode)) {
569 new_size = offset + len;
570 error = inode_newsize_ok(inode, new_size);
571 if (error)
572 goto out_unlock;
573 }
574
575 error = -xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf,
576 0, XFS_ATTR_NOLOCK);
577 if (error)
578 goto out_unlock;
579
580 /* Change file size if needed */
581 if (new_size) {
582 struct iattr iattr;
583
584 iattr.ia_valid = ATTR_SIZE;
585 iattr.ia_size = new_size;
586 error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK);
587 }
588
589out_unlock:
590 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
591out_error:
592 return error;
593}
594
595#define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) 508#define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
596 509
597/* 510/*
@@ -685,7 +598,6 @@ static const struct inode_operations xfs_inode_operations = {
685 .getxattr = generic_getxattr, 598 .getxattr = generic_getxattr,
686 .removexattr = generic_removexattr, 599 .removexattr = generic_removexattr,
687 .listxattr = xfs_vn_listxattr, 600 .listxattr = xfs_vn_listxattr,
688 .fallocate = xfs_vn_fallocate,
689 .fiemap = xfs_vn_fiemap, 601 .fiemap = xfs_vn_fiemap,
690}; 602};
691 603
@@ -795,7 +707,10 @@ xfs_setup_inode(
795 707
796 inode->i_ino = ip->i_ino; 708 inode->i_ino = ip->i_ino;
797 inode->i_state = I_NEW; 709 inode->i_state = I_NEW;
798 inode_add_to_lists(ip->i_mount->m_super, inode); 710
711 inode_sb_list_add(inode);
712 /* make the inode look hashed for the writeback code */
713 hlist_add_fake(&inode->i_hash);
799 714
800 inode->i_mode = ip->i_d.di_mode; 715 inode->i_mode = ip->i_d.di_mode;
801 inode->i_nlink = ip->i_d.di_nlink; 716 inode->i_nlink = ip->i_d.di_nlink;
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index 2fa0bd9ebc7f..8633521b3b2e 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -37,10 +37,8 @@
37 37
38#include <kmem.h> 38#include <kmem.h>
39#include <mrlock.h> 39#include <mrlock.h>
40#include <sv.h>
41#include <time.h> 40#include <time.h>
42 41
43#include <support/debug.h>
44#include <support/uuid.h> 42#include <support/uuid.h>
45 43
46#include <linux/semaphore.h> 44#include <linux/semaphore.h>
@@ -71,6 +69,8 @@
71#include <linux/random.h> 69#include <linux/random.h>
72#include <linux/ctype.h> 70#include <linux/ctype.h>
73#include <linux/writeback.h> 71#include <linux/writeback.h>
72#include <linux/capability.h>
73#include <linux/list_sort.h>
74 74
75#include <asm/page.h> 75#include <asm/page.h>
76#include <asm/div64.h> 76#include <asm/div64.h>
@@ -79,15 +79,14 @@
79#include <asm/byteorder.h> 79#include <asm/byteorder.h>
80#include <asm/unaligned.h> 80#include <asm/unaligned.h>
81 81
82#include <xfs_cred.h>
83#include <xfs_vnode.h> 82#include <xfs_vnode.h>
84#include <xfs_stats.h> 83#include <xfs_stats.h>
85#include <xfs_sysctl.h> 84#include <xfs_sysctl.h>
86#include <xfs_iops.h> 85#include <xfs_iops.h>
87#include <xfs_aops.h> 86#include <xfs_aops.h>
88#include <xfs_super.h> 87#include <xfs_super.h>
89#include <xfs_globals.h>
90#include <xfs_buf.h> 88#include <xfs_buf.h>
89#include <xfs_message.h>
91 90
92/* 91/*
93 * Feature macros (disable/enable) 92 * Feature macros (disable/enable)
@@ -144,7 +143,7 @@
144#define SYNCHRONIZE() barrier() 143#define SYNCHRONIZE() barrier()
145#define __return_address __builtin_return_address(0) 144#define __return_address __builtin_return_address(0)
146 145
147#define dfltprid 0 146#define XFS_PROJID_DEFAULT 0
148#define MAXPATHLEN 1024 147#define MAXPATHLEN 1024
149 148
150#define MIN(a,b) (min(a,b)) 149#define MIN(a,b) (min(a,b))
@@ -282,4 +281,25 @@ static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y)
282#define __arch_pack 281#define __arch_pack
283#endif 282#endif
284 283
284#define ASSERT_ALWAYS(expr) \
285 (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
286
287#ifndef DEBUG
288#define ASSERT(expr) ((void)0)
289
290#ifndef STATIC
291# define STATIC static noinline
292#endif
293
294#else /* DEBUG */
295
296#define ASSERT(expr) \
297 (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
298
299#ifndef STATIC
300# define STATIC noinline
301#endif
302
303#endif /* DEBUG */
304
285#endif /* __XFS_LINUX__ */ 305#endif /* __XFS_LINUX__ */
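The ASSERT_ALWAYS/ASSERT/STATIC definitions moved into xfs_linux.h above follow the usual kernel pattern: ASSERT_ALWAYS is compiled in unconditionally, ASSERT only in DEBUG builds. A rough userspace sketch of the same behaviour, with assfail() mocked by abort() and the unlikely() annotation omitted:

/*
 * Editorial sketch (not part of the patch): how the ASSERT macros above
 * evaluate, reduced to plain C for illustration.
 */
#include <stdio.h>
#include <stdlib.h>

static void assfail(const char *expr, const char *file, int line)
{
	fprintf(stderr, "Assertion failed: %s, file: %s, line: %d\n",
		expr, file, line);
	abort();
}

/* Always-on check, compiled in regardless of DEBUG. */
#define ASSERT_ALWAYS(expr) \
	((expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))

#ifdef DEBUG
# define ASSERT(expr)	ASSERT_ALWAYS(expr)
#else
# define ASSERT(expr)	((void)0)	/* compiled out in production builds */
#endif

int main(void)
{
	int blocks = 8;

	ASSERT(blocks > 0);		/* no-op unless built with -DDEBUG */
	ASSERT_ALWAYS(blocks % 2 == 0);	/* always evaluated */
	printf("checks passed\n");
	return 0;
}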
diff --git a/fs/xfs/linux-2.6/xfs_message.c b/fs/xfs/linux-2.6/xfs_message.c
new file mode 100644
index 000000000000..bd672def95ac
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_message.c
@@ -0,0 +1,108 @@
1/*
2 * Copyright (c) 2011 Red Hat, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write the Free Software Foundation,
15 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
16 */
17
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_types.h"
21#include "xfs_log.h"
22#include "xfs_inum.h"
23#include "xfs_trans.h"
24#include "xfs_sb.h"
25#include "xfs_ag.h"
26#include "xfs_mount.h"
27
28/*
29 * XFS logging functions
30 */
31static void
32__xfs_printk(
33 const char *level,
34 const struct xfs_mount *mp,
35 struct va_format *vaf)
36{
37 if (mp && mp->m_fsname) {
38 printk("%sXFS (%s): %pV\n", level, mp->m_fsname, vaf);
39 return;
40 }
41 printk("%sXFS: %pV\n", level, vaf);
42}
43
44#define define_xfs_printk_level(func, kern_level) \
45void func(const struct xfs_mount *mp, const char *fmt, ...) \
46{ \
47 struct va_format vaf; \
48 va_list args; \
49 \
50 va_start(args, fmt); \
51 \
52 vaf.fmt = fmt; \
53 vaf.va = &args; \
54 \
55 __xfs_printk(kern_level, mp, &vaf); \
56 va_end(args); \
57} \
58
59define_xfs_printk_level(xfs_emerg, KERN_EMERG);
60define_xfs_printk_level(xfs_alert, KERN_ALERT);
61define_xfs_printk_level(xfs_crit, KERN_CRIT);
62define_xfs_printk_level(xfs_err, KERN_ERR);
63define_xfs_printk_level(xfs_warn, KERN_WARNING);
64define_xfs_printk_level(xfs_notice, KERN_NOTICE);
65define_xfs_printk_level(xfs_info, KERN_INFO);
66#ifdef DEBUG
67define_xfs_printk_level(xfs_debug, KERN_DEBUG);
68#endif
69
70void
71xfs_alert_tag(
72 const struct xfs_mount *mp,
73 int panic_tag,
74 const char *fmt, ...)
75{
76 struct va_format vaf;
77 va_list args;
78 int do_panic = 0;
79
80 if (xfs_panic_mask && (xfs_panic_mask & panic_tag)) {
81 xfs_alert(mp, "Transforming an alert into a BUG.");
82 do_panic = 1;
83 }
84
85 va_start(args, fmt);
86
87 vaf.fmt = fmt;
88 vaf.va = &args;
89
90 __xfs_printk(KERN_ALERT, mp, &vaf);
91 va_end(args);
92
93 BUG_ON(do_panic);
94}
95
96void
97assfail(char *expr, char *file, int line)
98{
99 xfs_emerg(NULL, "Assertion failed: %s, file: %s, line: %d",
100 expr, file, line);
101 BUG();
102}
103
104void
105xfs_hex_dump(void *p, int length)
106{
107 print_hex_dump(KERN_ALERT, "", DUMP_PREFIX_ADDRESS, 16, 1, p, length, 1);
108}
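The define_xfs_printk_level() macro above stamps out one printf-style wrapper per log level, forwarding the caller's va_list through struct va_format so printk's %pV specifier can expand it with the "XFS (fsname):" prefix. A userspace analogue of the same pattern, with %pV approximated by vfprintf() and xfs_mount reduced to a name-only struct purely for illustration:

/*
 * Editorial sketch (not part of the patch): userspace analogue of the
 * per-level wrapper generation in xfs_message.c.
 */
#include <stdarg.h>
#include <stdio.h>

struct xfs_mount { const char *m_fsname; };

static void __xfs_vprintk(const char *level, const struct xfs_mount *mp,
			  const char *fmt, va_list args)
{
	if (mp && mp->m_fsname)
		fprintf(stderr, "%sXFS (%s): ", level, mp->m_fsname);
	else
		fprintf(stderr, "%sXFS: ", level);
	vfprintf(stderr, fmt, args);
	fputc('\n', stderr);
}

#define define_xfs_printk_level(func, level)				\
void func(const struct xfs_mount *mp, const char *fmt, ...)		\
{									\
	va_list args;							\
	va_start(args, fmt);						\
	__xfs_vprintk(level, mp, fmt, args);				\
	va_end(args);							\
}

define_xfs_printk_level(xfs_warn, "<4>")
define_xfs_printk_level(xfs_info, "<6>")

int main(void)
{
	struct xfs_mount mp = { .m_fsname = "sda1" };

	xfs_warn(&mp, "%s option requires an argument", "logbufs");
	xfs_info(NULL, "module loaded");
	return 0;
}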
diff --git a/fs/xfs/linux-2.6/xfs_message.h b/fs/xfs/linux-2.6/xfs_message.h
new file mode 100644
index 000000000000..7fb7ea007672
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_message.h
@@ -0,0 +1,39 @@
1#ifndef __XFS_MESSAGE_H
2#define __XFS_MESSAGE_H 1
3
4struct xfs_mount;
5
6extern void xfs_emerg(const struct xfs_mount *mp, const char *fmt, ...)
7 __attribute__ ((format (printf, 2, 3)));
8extern void xfs_alert(const struct xfs_mount *mp, const char *fmt, ...)
9 __attribute__ ((format (printf, 2, 3)));
10extern void xfs_alert_tag(const struct xfs_mount *mp, int tag,
11 const char *fmt, ...)
12 __attribute__ ((format (printf, 3, 4)));
13extern void xfs_crit(const struct xfs_mount *mp, const char *fmt, ...)
14 __attribute__ ((format (printf, 2, 3)));
15extern void xfs_err(const struct xfs_mount *mp, const char *fmt, ...)
16 __attribute__ ((format (printf, 2, 3)));
17extern void xfs_warn(const struct xfs_mount *mp, const char *fmt, ...)
18 __attribute__ ((format (printf, 2, 3)));
19extern void xfs_notice(const struct xfs_mount *mp, const char *fmt, ...)
20 __attribute__ ((format (printf, 2, 3)));
21extern void xfs_info(const struct xfs_mount *mp, const char *fmt, ...)
22 __attribute__ ((format (printf, 2, 3)));
23
24#ifdef DEBUG
25extern void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...)
26 __attribute__ ((format (printf, 2, 3)));
27#else
28static inline void
29__attribute__ ((format (printf, 2, 3)))
30xfs_debug(const struct xfs_mount *mp, const char *fmt, ...)
31{
32}
33#endif
34
35extern void assfail(char *expr, char *f, int l);
36
37extern void xfs_hex_dump(void *p, int length);
38
39#endif /* __XFS_MESSAGE_H */
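Two conventions in this header are worth noting: the gcc format(printf, 2, 3) attribute gives compile-time checking of the variadic arguments against the format string, and the empty static inline stub lets xfs_debug() vanish in non-DEBUG builds while keeping that checking. A small self-contained sketch of both (the message text below is hypothetical):

/*
 * Editorial sketch (not part of the patch): the format attribute and
 * DEBUG-only stub pattern from xfs_message.h, in standalone form.
 */
#include <stdio.h>
#include <stdarg.h>

struct xfs_mount;

#ifdef DEBUG
static void
__attribute__ ((format (printf, 2, 3)))
xfs_debug(const struct xfs_mount *mp, const char *fmt, ...)
{
	va_list args;

	(void)mp;
	va_start(args, fmt);
	vfprintf(stderr, fmt, args);
	fputc('\n', stderr);
	va_end(args);
}
#else
/* compiles away when DEBUG is off; -Wformat still checks the arguments */
static inline void
__attribute__ ((format (printf, 2, 3)))
xfs_debug(const struct xfs_mount *mp, const char *fmt, ...)
{
}
#endif

int main(void)
{
	xfs_debug(NULL, "reclaimed %d inodes", 42);
	return 0;
}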
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index a4e07974955b..a1a881e68a9a 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -44,7 +44,6 @@
44#include "xfs_buf_item.h" 44#include "xfs_buf_item.h"
45#include "xfs_utils.h" 45#include "xfs_utils.h"
46#include "xfs_vnodeops.h" 46#include "xfs_vnodeops.h"
47#include "xfs_version.h"
48#include "xfs_log_priv.h" 47#include "xfs_log_priv.h"
49#include "xfs_trans_priv.h" 48#include "xfs_trans_priv.h"
50#include "xfs_filestream.h" 49#include "xfs_filestream.h"
@@ -111,8 +110,10 @@ mempool_t *xfs_ioend_pool;
111#define MNTOPT_GQUOTANOENF "gqnoenforce"/* group quota limit enforcement */ 110#define MNTOPT_GQUOTANOENF "gqnoenforce"/* group quota limit enforcement */
112#define MNTOPT_PQUOTANOENF "pqnoenforce"/* project quota limit enforcement */ 111#define MNTOPT_PQUOTANOENF "pqnoenforce"/* project quota limit enforcement */
113#define MNTOPT_QUOTANOENF "qnoenforce" /* same as uqnoenforce */ 112#define MNTOPT_QUOTANOENF "qnoenforce" /* same as uqnoenforce */
114#define MNTOPT_DELAYLOG "delaylog" /* Delayed loging enabled */ 113#define MNTOPT_DELAYLOG "delaylog" /* Delayed logging enabled */
115#define MNTOPT_NODELAYLOG "nodelaylog" /* Delayed loging disabled */ 114#define MNTOPT_NODELAYLOG "nodelaylog" /* Delayed logging disabled */
115#define MNTOPT_DISCARD "discard" /* Discard unused blocks */
116#define MNTOPT_NODISCARD "nodiscard" /* Do not discard unused blocks */
116 117
117/* 118/*
118 * Table driven mount option parser. 119 * Table driven mount option parser.
@@ -174,6 +175,15 @@ xfs_parseargs(
174 __uint8_t iosizelog = 0; 175 __uint8_t iosizelog = 0;
175 176
176 /* 177 /*
178 * set up the mount name first so all the errors will refer to the
179 * correct device.
180 */
181 mp->m_fsname = kstrndup(sb->s_id, MAXNAMELEN, GFP_KERNEL);
182 if (!mp->m_fsname)
183 return ENOMEM;
184 mp->m_fsname_len = strlen(mp->m_fsname) + 1;
185
186 /*
177 * Copy binary VFS mount flags we are interested in. 187 * Copy binary VFS mount flags we are interested in.
178 */ 188 */
179 if (sb->s_flags & MS_RDONLY) 189 if (sb->s_flags & MS_RDONLY)
@@ -190,6 +200,7 @@ xfs_parseargs(
190 mp->m_flags |= XFS_MOUNT_BARRIER; 200 mp->m_flags |= XFS_MOUNT_BARRIER;
191 mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE; 201 mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE;
192 mp->m_flags |= XFS_MOUNT_SMALL_INUMS; 202 mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
203 mp->m_flags |= XFS_MOUNT_DELAYLOG;
193 204
194 /* 205 /*
195 * These can be overridden by the mount option parsing. 206 * These can be overridden by the mount option parsing.
@@ -208,24 +219,21 @@ xfs_parseargs(
208 219
209 if (!strcmp(this_char, MNTOPT_LOGBUFS)) { 220 if (!strcmp(this_char, MNTOPT_LOGBUFS)) {
210 if (!value || !*value) { 221 if (!value || !*value) {
211 cmn_err(CE_WARN, 222 xfs_warn(mp, "%s option requires an argument",
212 "XFS: %s option requires an argument",
213 this_char); 223 this_char);
214 return EINVAL; 224 return EINVAL;
215 } 225 }
216 mp->m_logbufs = simple_strtoul(value, &eov, 10); 226 mp->m_logbufs = simple_strtoul(value, &eov, 10);
217 } else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) { 227 } else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) {
218 if (!value || !*value) { 228 if (!value || !*value) {
219 cmn_err(CE_WARN, 229 xfs_warn(mp, "%s option requires an argument",
220 "XFS: %s option requires an argument",
221 this_char); 230 this_char);
222 return EINVAL; 231 return EINVAL;
223 } 232 }
224 mp->m_logbsize = suffix_strtoul(value, &eov, 10); 233 mp->m_logbsize = suffix_strtoul(value, &eov, 10);
225 } else if (!strcmp(this_char, MNTOPT_LOGDEV)) { 234 } else if (!strcmp(this_char, MNTOPT_LOGDEV)) {
226 if (!value || !*value) { 235 if (!value || !*value) {
227 cmn_err(CE_WARN, 236 xfs_warn(mp, "%s option requires an argument",
228 "XFS: %s option requires an argument",
229 this_char); 237 this_char);
230 return EINVAL; 238 return EINVAL;
231 } 239 }
@@ -233,14 +241,12 @@ xfs_parseargs(
233 if (!mp->m_logname) 241 if (!mp->m_logname)
234 return ENOMEM; 242 return ENOMEM;
235 } else if (!strcmp(this_char, MNTOPT_MTPT)) { 243 } else if (!strcmp(this_char, MNTOPT_MTPT)) {
236 cmn_err(CE_WARN, 244 xfs_warn(mp, "%s option not allowed on this system",
237 "XFS: %s option not allowed on this system",
238 this_char); 245 this_char);
239 return EINVAL; 246 return EINVAL;
240 } else if (!strcmp(this_char, MNTOPT_RTDEV)) { 247 } else if (!strcmp(this_char, MNTOPT_RTDEV)) {
241 if (!value || !*value) { 248 if (!value || !*value) {
242 cmn_err(CE_WARN, 249 xfs_warn(mp, "%s option requires an argument",
243 "XFS: %s option requires an argument",
244 this_char); 250 this_char);
245 return EINVAL; 251 return EINVAL;
246 } 252 }
@@ -249,8 +255,7 @@ xfs_parseargs(
249 return ENOMEM; 255 return ENOMEM;
250 } else if (!strcmp(this_char, MNTOPT_BIOSIZE)) { 256 } else if (!strcmp(this_char, MNTOPT_BIOSIZE)) {
251 if (!value || !*value) { 257 if (!value || !*value) {
252 cmn_err(CE_WARN, 258 xfs_warn(mp, "%s option requires an argument",
253 "XFS: %s option requires an argument",
254 this_char); 259 this_char);
255 return EINVAL; 260 return EINVAL;
256 } 261 }
@@ -258,8 +263,7 @@ xfs_parseargs(
258 iosizelog = ffs(iosize) - 1; 263 iosizelog = ffs(iosize) - 1;
259 } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) { 264 } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) {
260 if (!value || !*value) { 265 if (!value || !*value) {
261 cmn_err(CE_WARN, 266 xfs_warn(mp, "%s option requires an argument",
262 "XFS: %s option requires an argument",
263 this_char); 267 this_char);
264 return EINVAL; 268 return EINVAL;
265 } 269 }
@@ -281,16 +285,14 @@ xfs_parseargs(
281 mp->m_flags |= XFS_MOUNT_SWALLOC; 285 mp->m_flags |= XFS_MOUNT_SWALLOC;
282 } else if (!strcmp(this_char, MNTOPT_SUNIT)) { 286 } else if (!strcmp(this_char, MNTOPT_SUNIT)) {
283 if (!value || !*value) { 287 if (!value || !*value) {
284 cmn_err(CE_WARN, 288 xfs_warn(mp, "%s option requires an argument",
285 "XFS: %s option requires an argument",
286 this_char); 289 this_char);
287 return EINVAL; 290 return EINVAL;
288 } 291 }
289 dsunit = simple_strtoul(value, &eov, 10); 292 dsunit = simple_strtoul(value, &eov, 10);
290 } else if (!strcmp(this_char, MNTOPT_SWIDTH)) { 293 } else if (!strcmp(this_char, MNTOPT_SWIDTH)) {
291 if (!value || !*value) { 294 if (!value || !*value) {
292 cmn_err(CE_WARN, 295 xfs_warn(mp, "%s option requires an argument",
293 "XFS: %s option requires an argument",
294 this_char); 296 this_char);
295 return EINVAL; 297 return EINVAL;
296 } 298 }
@@ -298,8 +300,7 @@ xfs_parseargs(
298 } else if (!strcmp(this_char, MNTOPT_64BITINODE)) { 300 } else if (!strcmp(this_char, MNTOPT_64BITINODE)) {
299 mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS; 301 mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS;
300#if !XFS_BIG_INUMS 302#if !XFS_BIG_INUMS
301 cmn_err(CE_WARN, 303 xfs_warn(mp, "%s option not allowed on this system",
302 "XFS: %s option not allowed on this system",
303 this_char); 304 this_char);
304 return EINVAL; 305 return EINVAL;
305#endif 306#endif
@@ -354,26 +355,26 @@ xfs_parseargs(
354 mp->m_qflags &= ~XFS_OQUOTA_ENFD; 355 mp->m_qflags &= ~XFS_OQUOTA_ENFD;
355 } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) { 356 } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) {
356 mp->m_flags |= XFS_MOUNT_DELAYLOG; 357 mp->m_flags |= XFS_MOUNT_DELAYLOG;
357 cmn_err(CE_WARN,
358 "Enabling EXPERIMENTAL delayed logging feature "
359 "- use at your own risk.\n");
360 } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) { 358 } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) {
361 mp->m_flags &= ~XFS_MOUNT_DELAYLOG; 359 mp->m_flags &= ~XFS_MOUNT_DELAYLOG;
360 } else if (!strcmp(this_char, MNTOPT_DISCARD)) {
361 mp->m_flags |= XFS_MOUNT_DISCARD;
362 } else if (!strcmp(this_char, MNTOPT_NODISCARD)) {
363 mp->m_flags &= ~XFS_MOUNT_DISCARD;
362 } else if (!strcmp(this_char, "ihashsize")) { 364 } else if (!strcmp(this_char, "ihashsize")) {
363 cmn_err(CE_WARN, 365 xfs_warn(mp,
364 "XFS: ihashsize no longer used, option is deprecated."); 366 "ihashsize no longer used, option is deprecated.");
365 } else if (!strcmp(this_char, "osyncisdsync")) { 367 } else if (!strcmp(this_char, "osyncisdsync")) {
366 cmn_err(CE_WARN, 368 xfs_warn(mp,
367 "XFS: osyncisdsync has no effect, option is deprecated."); 369 "osyncisdsync has no effect, option is deprecated.");
368 } else if (!strcmp(this_char, "osyncisosync")) { 370 } else if (!strcmp(this_char, "osyncisosync")) {
369 cmn_err(CE_WARN, 371 xfs_warn(mp,
370 "XFS: osyncisosync has no effect, option is deprecated."); 372 "osyncisosync has no effect, option is deprecated.");
371 } else if (!strcmp(this_char, "irixsgid")) { 373 } else if (!strcmp(this_char, "irixsgid")) {
372 cmn_err(CE_WARN, 374 xfs_warn(mp,
373 "XFS: irixsgid is now a sysctl(2) variable, option is deprecated."); 375 "irixsgid is now a sysctl(2) variable, option is deprecated.");
374 } else { 376 } else {
375 cmn_err(CE_WARN, 377 xfs_warn(mp, "unknown mount option [%s].", this_char);
376 "XFS: unknown mount option [%s].", this_char);
377 return EINVAL; 378 return EINVAL;
378 } 379 }
379 } 380 }
@@ -383,40 +384,44 @@ xfs_parseargs(
383 */ 384 */
384 if ((mp->m_flags & XFS_MOUNT_NORECOVERY) && 385 if ((mp->m_flags & XFS_MOUNT_NORECOVERY) &&
385 !(mp->m_flags & XFS_MOUNT_RDONLY)) { 386 !(mp->m_flags & XFS_MOUNT_RDONLY)) {
386 cmn_err(CE_WARN, "XFS: no-recovery mounts must be read-only."); 387 xfs_warn(mp, "no-recovery mounts must be read-only.");
387 return EINVAL; 388 return EINVAL;
388 } 389 }
389 390
390 if ((mp->m_flags & XFS_MOUNT_NOALIGN) && (dsunit || dswidth)) { 391 if ((mp->m_flags & XFS_MOUNT_NOALIGN) && (dsunit || dswidth)) {
391 cmn_err(CE_WARN, 392 xfs_warn(mp,
392 "XFS: sunit and swidth options incompatible with the noalign option"); 393 "sunit and swidth options incompatible with the noalign option");
394 return EINVAL;
395 }
396
397 if ((mp->m_flags & XFS_MOUNT_DISCARD) &&
398 !(mp->m_flags & XFS_MOUNT_DELAYLOG)) {
399 xfs_warn(mp,
400 "the discard option is incompatible with the nodelaylog option");
393 return EINVAL; 401 return EINVAL;
394 } 402 }
395 403
396#ifndef CONFIG_XFS_QUOTA 404#ifndef CONFIG_XFS_QUOTA
397 if (XFS_IS_QUOTA_RUNNING(mp)) { 405 if (XFS_IS_QUOTA_RUNNING(mp)) {
398 cmn_err(CE_WARN, 406 xfs_warn(mp, "quota support not available in this kernel.");
399 "XFS: quota support not available in this kernel.");
400 return EINVAL; 407 return EINVAL;
401 } 408 }
402#endif 409#endif
403 410
404 if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) && 411 if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) &&
405 (mp->m_qflags & (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE))) { 412 (mp->m_qflags & (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE))) {
406 cmn_err(CE_WARN, 413 xfs_warn(mp, "cannot mount with both project and group quota");
407 "XFS: cannot mount with both project and group quota");
408 return EINVAL; 414 return EINVAL;
409 } 415 }
410 416
411 if ((dsunit && !dswidth) || (!dsunit && dswidth)) { 417 if ((dsunit && !dswidth) || (!dsunit && dswidth)) {
412 cmn_err(CE_WARN, 418 xfs_warn(mp, "sunit and swidth must be specified together");
413 "XFS: sunit and swidth must be specified together");
414 return EINVAL; 419 return EINVAL;
415 } 420 }
416 421
417 if (dsunit && (dswidth % dsunit != 0)) { 422 if (dsunit && (dswidth % dsunit != 0)) {
418 cmn_err(CE_WARN, 423 xfs_warn(mp,
419 "XFS: stripe width (%d) must be a multiple of the stripe unit (%d)", 424 "stripe width (%d) must be a multiple of the stripe unit (%d)",
420 dswidth, dsunit); 425 dswidth, dsunit);
421 return EINVAL; 426 return EINVAL;
422 } 427 }
@@ -442,8 +447,7 @@ done:
442 mp->m_logbufs != 0 && 447 mp->m_logbufs != 0 &&
443 (mp->m_logbufs < XLOG_MIN_ICLOGS || 448 (mp->m_logbufs < XLOG_MIN_ICLOGS ||
444 mp->m_logbufs > XLOG_MAX_ICLOGS)) { 449 mp->m_logbufs > XLOG_MAX_ICLOGS)) {
445 cmn_err(CE_WARN, 450 xfs_warn(mp, "invalid logbufs value: %d [not %d-%d]",
446 "XFS: invalid logbufs value: %d [not %d-%d]",
447 mp->m_logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS); 451 mp->m_logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS);
448 return XFS_ERROR(EINVAL); 452 return XFS_ERROR(EINVAL);
449 } 453 }
@@ -452,22 +456,16 @@ done:
452 (mp->m_logbsize < XLOG_MIN_RECORD_BSIZE || 456 (mp->m_logbsize < XLOG_MIN_RECORD_BSIZE ||
453 mp->m_logbsize > XLOG_MAX_RECORD_BSIZE || 457 mp->m_logbsize > XLOG_MAX_RECORD_BSIZE ||
454 !is_power_of_2(mp->m_logbsize))) { 458 !is_power_of_2(mp->m_logbsize))) {
455 cmn_err(CE_WARN, 459 xfs_warn(mp,
456 "XFS: invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]", 460 "invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]",
457 mp->m_logbsize); 461 mp->m_logbsize);
458 return XFS_ERROR(EINVAL); 462 return XFS_ERROR(EINVAL);
459 } 463 }
460 464
461 mp->m_fsname = kstrndup(sb->s_id, MAXNAMELEN, GFP_KERNEL);
462 if (!mp->m_fsname)
463 return ENOMEM;
464 mp->m_fsname_len = strlen(mp->m_fsname) + 1;
465
466 if (iosizelog) { 465 if (iosizelog) {
467 if (iosizelog > XFS_MAX_IO_LOG || 466 if (iosizelog > XFS_MAX_IO_LOG ||
468 iosizelog < XFS_MIN_IO_LOG) { 467 iosizelog < XFS_MIN_IO_LOG) {
469 cmn_err(CE_WARN, 468 xfs_warn(mp, "invalid log iosize: %d [not %d-%d]",
470 "XFS: invalid log iosize: %d [not %d-%d]",
471 iosizelog, XFS_MIN_IO_LOG, 469 iosizelog, XFS_MIN_IO_LOG,
472 XFS_MAX_IO_LOG); 470 XFS_MAX_IO_LOG);
473 return XFS_ERROR(EINVAL); 471 return XFS_ERROR(EINVAL);
@@ -503,6 +501,7 @@ xfs_showargs(
503 { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM }, 501 { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM },
504 { XFS_MOUNT_GRPID, "," MNTOPT_GRPID }, 502 { XFS_MOUNT_GRPID, "," MNTOPT_GRPID },
505 { XFS_MOUNT_DELAYLOG, "," MNTOPT_DELAYLOG }, 503 { XFS_MOUNT_DELAYLOG, "," MNTOPT_DELAYLOG },
504 { XFS_MOUNT_DISCARD, "," MNTOPT_DISCARD },
506 { 0, NULL } 505 { 0, NULL }
507 }; 506 };
508 static struct proc_xfs_info xfs_info_unset[] = { 507 static struct proc_xfs_info xfs_info_unset[] = {
@@ -577,7 +576,7 @@ xfs_max_file_offset(
577 576
578 /* Figure out maximum filesize, on Linux this can depend on 577 /* Figure out maximum filesize, on Linux this can depend on
579 * the filesystem blocksize (on 32 bit platforms). 578 * the filesystem blocksize (on 32 bit platforms).
580 * __block_prepare_write does this in an [unsigned] long... 579 * __block_write_begin does this in an [unsigned] long...
581 * page->index << (PAGE_CACHE_SHIFT - bbits) 580 * page->index << (PAGE_CACHE_SHIFT - bbits)
582 * So, for page sized blocks (4K on 32 bit platforms), 581 * So, for page sized blocks (4K on 32 bit platforms),
583 * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is 582 * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is
@@ -610,10 +609,11 @@ xfs_blkdev_get(
610{ 609{
611 int error = 0; 610 int error = 0;
612 611
613 *bdevp = open_bdev_exclusive(name, FMODE_READ|FMODE_WRITE, mp); 612 *bdevp = blkdev_get_by_path(name, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
613 mp);
614 if (IS_ERR(*bdevp)) { 614 if (IS_ERR(*bdevp)) {
615 error = PTR_ERR(*bdevp); 615 error = PTR_ERR(*bdevp);
616 printk("XFS: Invalid device [%s], error=%d\n", name, error); 616 xfs_warn(mp, "Invalid device [%s], error=%d\n", name, error);
617 } 617 }
618 618
619 return -error; 619 return -error;
@@ -624,77 +624,14 @@ xfs_blkdev_put(
624 struct block_device *bdev) 624 struct block_device *bdev)
625{ 625{
626 if (bdev) 626 if (bdev)
627 close_bdev_exclusive(bdev, FMODE_READ|FMODE_WRITE); 627 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
628}
629
630/*
631 * Try to write out the superblock using barriers.
632 */
633STATIC int
634xfs_barrier_test(
635 xfs_mount_t *mp)
636{
637 xfs_buf_t *sbp = xfs_getsb(mp, 0);
638 int error;
639
640 XFS_BUF_UNDONE(sbp);
641 XFS_BUF_UNREAD(sbp);
642 XFS_BUF_UNDELAYWRITE(sbp);
643 XFS_BUF_WRITE(sbp);
644 XFS_BUF_UNASYNC(sbp);
645 XFS_BUF_ORDERED(sbp);
646
647 xfsbdstrat(mp, sbp);
648 error = xfs_iowait(sbp);
649
650 /*
651 * Clear all the flags we set and possible error state in the
652 * buffer. We only did the write to try out whether barriers
653 * worked and shouldn't leave any traces in the superblock
654 * buffer.
655 */
656 XFS_BUF_DONE(sbp);
657 XFS_BUF_ERROR(sbp, 0);
658 XFS_BUF_UNORDERED(sbp);
659
660 xfs_buf_relse(sbp);
661 return error;
662}
663
664STATIC void
665xfs_mountfs_check_barriers(xfs_mount_t *mp)
666{
667 int error;
668
669 if (mp->m_logdev_targp != mp->m_ddev_targp) {
670 xfs_fs_cmn_err(CE_NOTE, mp,
671 "Disabling barriers, not supported with external log device");
672 mp->m_flags &= ~XFS_MOUNT_BARRIER;
673 return;
674 }
675
676 if (xfs_readonly_buftarg(mp->m_ddev_targp)) {
677 xfs_fs_cmn_err(CE_NOTE, mp,
678 "Disabling barriers, underlying device is readonly");
679 mp->m_flags &= ~XFS_MOUNT_BARRIER;
680 return;
681 }
682
683 error = xfs_barrier_test(mp);
684 if (error) {
685 xfs_fs_cmn_err(CE_NOTE, mp,
686 "Disabling barriers, trial barrier write failed");
687 mp->m_flags &= ~XFS_MOUNT_BARRIER;
688 return;
689 }
690} 628}
691 629
692void 630void
693xfs_blkdev_issue_flush( 631xfs_blkdev_issue_flush(
694 xfs_buftarg_t *buftarg) 632 xfs_buftarg_t *buftarg)
695{ 633{
696 blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL, 634 blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL);
697 BLKDEV_IFL_WAIT);
698} 635}
699 636
700STATIC void 637STATIC void
@@ -747,8 +684,8 @@ xfs_open_devices(
747 goto out_close_logdev; 684 goto out_close_logdev;
748 685
749 if (rtdev == ddev || rtdev == logdev) { 686 if (rtdev == ddev || rtdev == logdev) {
750 cmn_err(CE_WARN, 687 xfs_warn(mp,
751 "XFS: Cannot mount filesystem with identical rtdev and ddev/logdev."); 688 "Cannot mount filesystem with identical rtdev and ddev/logdev.");
752 error = EINVAL; 689 error = EINVAL;
753 goto out_close_rtdev; 690 goto out_close_rtdev;
754 } 691 }
@@ -758,18 +695,20 @@ xfs_open_devices(
758 * Setup xfs_mount buffer target pointers 695 * Setup xfs_mount buffer target pointers
759 */ 696 */
760 error = ENOMEM; 697 error = ENOMEM;
761 mp->m_ddev_targp = xfs_alloc_buftarg(ddev, 0, mp->m_fsname); 698 mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev, 0, mp->m_fsname);
762 if (!mp->m_ddev_targp) 699 if (!mp->m_ddev_targp)
763 goto out_close_rtdev; 700 goto out_close_rtdev;
764 701
765 if (rtdev) { 702 if (rtdev) {
766 mp->m_rtdev_targp = xfs_alloc_buftarg(rtdev, 1, mp->m_fsname); 703 mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev, 1,
704 mp->m_fsname);
767 if (!mp->m_rtdev_targp) 705 if (!mp->m_rtdev_targp)
768 goto out_free_ddev_targ; 706 goto out_free_ddev_targ;
769 } 707 }
770 708
771 if (logdev && logdev != ddev) { 709 if (logdev && logdev != ddev) {
772 mp->m_logdev_targp = xfs_alloc_buftarg(logdev, 1, mp->m_fsname); 710 mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev, 1,
711 mp->m_fsname);
773 if (!mp->m_logdev_targp) 712 if (!mp->m_logdev_targp)
774 goto out_free_rtdev_targ; 713 goto out_free_rtdev_targ;
775 } else { 714 } else {
@@ -829,63 +768,6 @@ xfs_setup_devices(
829 return 0; 768 return 0;
830} 769}
831 770
832/*
833 * XFS AIL push thread support
834 */
835void
836xfsaild_wakeup(
837 struct xfs_ail *ailp,
838 xfs_lsn_t threshold_lsn)
839{
840 ailp->xa_target = threshold_lsn;
841 wake_up_process(ailp->xa_task);
842}
843
844STATIC int
845xfsaild(
846 void *data)
847{
848 struct xfs_ail *ailp = data;
849 xfs_lsn_t last_pushed_lsn = 0;
850 long tout = 0; /* milliseconds */
851
852 while (!kthread_should_stop()) {
853 schedule_timeout_interruptible(tout ?
854 msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT);
855
856 /* swsusp */
857 try_to_freeze();
858
859 ASSERT(ailp->xa_mount->m_log);
860 if (XFS_FORCED_SHUTDOWN(ailp->xa_mount))
861 continue;
862
863 tout = xfsaild_push(ailp, &last_pushed_lsn);
864 }
865
866 return 0;
867} /* xfsaild */
868
869int
870xfsaild_start(
871 struct xfs_ail *ailp)
872{
873 ailp->xa_target = 0;
874 ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild/%s",
875 ailp->xa_mount->m_fsname);
876 if (IS_ERR(ailp->xa_task))
877 return -PTR_ERR(ailp->xa_task);
878 return 0;
879}
880
881void
882xfsaild_stop(
883 struct xfs_ail *ailp)
884{
885 kthread_stop(ailp->xa_task);
886}
887
888
889/* Catch misguided souls that try to use this interface on XFS */ 771/* Catch misguided souls that try to use this interface on XFS */
890STATIC struct inode * 772STATIC struct inode *
891xfs_fs_alloc_inode( 773xfs_fs_alloc_inode(
@@ -938,7 +820,7 @@ out_reclaim:
938 * Slab object creation initialisation for the XFS inode. 820 * Slab object creation initialisation for the XFS inode.
939 * This covers only the idempotent fields in the XFS inode; 821 * This covers only the idempotent fields in the XFS inode;
940 * all other fields need to be initialised on allocation 822 * all other fields need to be initialised on allocation
941 * from the slab. This avoids the need to repeatedly intialise 823 * from the slab. This avoids the need to repeatedly initialise
942 * fields in the xfs inode that left in the initialise state 824 * fields in the xfs inode that left in the initialise state
943 * when freeing the inode. 825 * when freeing the inode.
944 */ 826 */
@@ -972,12 +854,7 @@ xfs_fs_inode_init_once(
972 854
973/* 855/*
974 * Dirty the XFS inode when mark_inode_dirty_sync() is called so that 856 * Dirty the XFS inode when mark_inode_dirty_sync() is called so that
975 * we catch unlogged VFS level updates to the inode. Care must be taken 857 * we catch unlogged VFS level updates to the inode.
976 * here - the transaction code calls mark_inode_dirty_sync() to mark the
977 * VFS inode dirty in a transaction and clears the i_update_core field;
978 * it must clear the field after calling mark_inode_dirty_sync() to
979 * correctly indicate that the dirty state has been propagated into the
980 * inode log item.
981 * 858 *
982 * We need the barrier() to maintain correct ordering between unlogged 859 * We need the barrier() to maintain correct ordering between unlogged
983 * updates and the transaction commit code that clears the i_update_core 860 * updates and the transaction commit code that clears the i_update_core
@@ -986,7 +863,8 @@ xfs_fs_inode_init_once(
986 */ 863 */
987STATIC void 864STATIC void
988xfs_fs_dirty_inode( 865xfs_fs_dirty_inode(
989 struct inode *inode) 866 struct inode *inode,
867 int flags)
990{ 868{
991 barrier(); 869 barrier();
992 XFS_I(inode)->i_update_core = 1; 870 XFS_I(inode)->i_update_core = 1;
@@ -1084,7 +962,7 @@ xfs_fs_write_inode(
1084 error = 0; 962 error = 0;
1085 goto out_unlock; 963 goto out_unlock;
1086 } 964 }
1087 error = xfs_iflush(ip, 0); 965 error = xfs_iflush(ip, SYNC_TRYLOCK);
1088 } 966 }
1089 967
1090 out_unlock: 968 out_unlock:
@@ -1126,6 +1004,8 @@ xfs_fs_evict_inode(
1126 */ 1004 */
1127 ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); 1005 ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
1128 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); 1006 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
1007 lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
1008 &xfs_iolock_reclaimable, "xfs_iolock_reclaimable");
1129 1009
1130 xfs_inactive(ip); 1010 xfs_inactive(ip);
1131} 1011}
@@ -1195,22 +1075,12 @@ xfs_fs_sync_fs(
1195 return -error; 1075 return -error;
1196 1076
1197 if (laptop_mode) { 1077 if (laptop_mode) {
1198 int prev_sync_seq = mp->m_sync_seq;
1199
1200 /* 1078 /*
1201 * The disk must be active because we're syncing. 1079 * The disk must be active because we're syncing.
1202 * We schedule xfssyncd now (now that the disk is 1080 * We schedule xfssyncd now (now that the disk is
1203 * active) instead of later (when it might not be). 1081 * active) instead of later (when it might not be).
1204 */ 1082 */
1205 wake_up_process(mp->m_sync_task); 1083 flush_delayed_work_sync(&mp->m_sync_work);
1206 /*
1207 * We have to wait for the sync iteration to complete.
1208 * If we don't, the disk activity caused by the sync
1209 * will come after the sync is completed, and that
1210 * triggers another sync from laptop mode.
1211 */
1212 wait_event(mp->m_wait_single_sync_task,
1213 mp->m_sync_seq != prev_sync_seq);
1214 } 1084 }
1215 1085
1216 return 0; 1086 return 0;
@@ -1308,14 +1178,6 @@ xfs_fs_remount(
1308 switch (token) { 1178 switch (token) {
1309 case Opt_barrier: 1179 case Opt_barrier:
1310 mp->m_flags |= XFS_MOUNT_BARRIER; 1180 mp->m_flags |= XFS_MOUNT_BARRIER;
1311
1312 /*
1313 * Test if barriers are actually working if we can,
1314 * else delay this check until the filesystem is
1315 * marked writeable.
1316 */
1317 if (!(mp->m_flags & XFS_MOUNT_RDONLY))
1318 xfs_mountfs_check_barriers(mp);
1319 break; 1181 break;
1320 case Opt_nobarrier: 1182 case Opt_nobarrier:
1321 mp->m_flags &= ~XFS_MOUNT_BARRIER; 1183 mp->m_flags &= ~XFS_MOUNT_BARRIER;
@@ -1338,8 +1200,8 @@ xfs_fs_remount(
1338 * options that we can't actually change. 1200 * options that we can't actually change.
1339 */ 1201 */
1340#if 0 1202#if 0
1341 printk(KERN_INFO 1203 xfs_info(mp,
1342 "XFS: mount option \"%s\" not supported for remount\n", p); 1204 "mount option \"%s\" not supported for remount\n", p);
1343 return -EINVAL; 1205 return -EINVAL;
1344#else 1206#else
1345 break; 1207 break;
@@ -1350,8 +1212,6 @@ xfs_fs_remount(
1350 /* ro -> rw */ 1212 /* ro -> rw */
1351 if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & MS_RDONLY)) { 1213 if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & MS_RDONLY)) {
1352 mp->m_flags &= ~XFS_MOUNT_RDONLY; 1214 mp->m_flags &= ~XFS_MOUNT_RDONLY;
1353 if (mp->m_flags & XFS_MOUNT_BARRIER)
1354 xfs_mountfs_check_barriers(mp);
1355 1215
1356 /* 1216 /*
1357 * If this is the first remount to writeable state we 1217 * If this is the first remount to writeable state we
@@ -1360,8 +1220,7 @@ xfs_fs_remount(
1360 if (mp->m_update_flags) { 1220 if (mp->m_update_flags) {
1361 error = xfs_mount_log_sb(mp, mp->m_update_flags); 1221 error = xfs_mount_log_sb(mp, mp->m_update_flags);
1362 if (error) { 1222 if (error) {
1363 cmn_err(CE_WARN, 1223 xfs_warn(mp, "failed to write sb changes");
1364 "XFS: failed to write sb changes");
1365 return error; 1224 return error;
1366 } 1225 }
1367 mp->m_update_flags = 0; 1226 mp->m_update_flags = 0;
@@ -1407,7 +1266,7 @@ xfs_fs_freeze(
1407 1266
1408 xfs_save_resvblks(mp); 1267 xfs_save_resvblks(mp);
1409 xfs_quiesce_attr(mp); 1268 xfs_quiesce_attr(mp);
1410 return -xfs_fs_log_dummy(mp, SYNC_WAIT); 1269 return -xfs_fs_log_dummy(mp);
1411} 1270}
1412 1271
1413STATIC int 1272STATIC int
@@ -1445,15 +1304,15 @@ xfs_finish_flags(
1445 mp->m_logbsize = mp->m_sb.sb_logsunit; 1304 mp->m_logbsize = mp->m_sb.sb_logsunit;
1446 } else if (mp->m_logbsize > 0 && 1305 } else if (mp->m_logbsize > 0 &&
1447 mp->m_logbsize < mp->m_sb.sb_logsunit) { 1306 mp->m_logbsize < mp->m_sb.sb_logsunit) {
1448 cmn_err(CE_WARN, 1307 xfs_warn(mp,
1449 "XFS: logbuf size must be greater than or equal to log stripe size"); 1308 "logbuf size must be greater than or equal to log stripe size");
1450 return XFS_ERROR(EINVAL); 1309 return XFS_ERROR(EINVAL);
1451 } 1310 }
1452 } else { 1311 } else {
1453 /* Fail a mount if the logbuf is larger than 32K */ 1312 /* Fail a mount if the logbuf is larger than 32K */
1454 if (mp->m_logbsize > XLOG_BIG_RECORD_BSIZE) { 1313 if (mp->m_logbsize > XLOG_BIG_RECORD_BSIZE) {
1455 cmn_err(CE_WARN, 1314 xfs_warn(mp,
1456 "XFS: logbuf size for version 1 logs must be 16K or 32K"); 1315 "logbuf size for version 1 logs must be 16K or 32K");
1457 return XFS_ERROR(EINVAL); 1316 return XFS_ERROR(EINVAL);
1458 } 1317 }
1459 } 1318 }
@@ -1470,8 +1329,8 @@ xfs_finish_flags(
1470 * prohibit r/w mounts of read-only filesystems 1329 * prohibit r/w mounts of read-only filesystems
1471 */ 1330 */
1472 if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) && !ronly) { 1331 if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) && !ronly) {
1473 cmn_err(CE_WARN, 1332 xfs_warn(mp,
1474 "XFS: cannot mount a read-only filesystem as read-write"); 1333 "cannot mount a read-only filesystem as read-write");
1475 return XFS_ERROR(EROFS); 1334 return XFS_ERROR(EROFS);
1476 } 1335 }
1477 1336
@@ -1495,9 +1354,6 @@ xfs_fs_fill_super(
1495 spin_lock_init(&mp->m_sb_lock); 1354 spin_lock_init(&mp->m_sb_lock);
1496 mutex_init(&mp->m_growlock); 1355 mutex_init(&mp->m_growlock);
1497 atomic_set(&mp->m_active_trans, 0); 1356 atomic_set(&mp->m_active_trans, 0);
1498 INIT_LIST_HEAD(&mp->m_sync_list);
1499 spin_lock_init(&mp->m_sync_lock);
1500 init_waitqueue_head(&mp->m_wait_single_sync_task);
1501 1357
1502 mp->m_super = sb; 1358 mp->m_super = sb;
1503 sb->s_fs_info = mp; 1359 sb->s_fs_info = mp;
@@ -1521,8 +1377,9 @@ xfs_fs_fill_super(
1521 if (error) 1377 if (error)
1522 goto out_free_fsname; 1378 goto out_free_fsname;
1523 1379
1524 if (xfs_icsb_init_counters(mp)) 1380 error = xfs_icsb_init_counters(mp);
1525 mp->m_flags |= XFS_MOUNT_NO_PERCPU_SB; 1381 if (error)
1382 goto out_close_devices;
1526 1383
1527 error = xfs_readsb(mp, flags); 1384 error = xfs_readsb(mp, flags);
1528 if (error) 1385 if (error)
@@ -1536,17 +1393,18 @@ xfs_fs_fill_super(
1536 if (error) 1393 if (error)
1537 goto out_free_sb; 1394 goto out_free_sb;
1538 1395
1539 if (mp->m_flags & XFS_MOUNT_BARRIER)
1540 xfs_mountfs_check_barriers(mp);
1541
1542 error = xfs_filestream_mount(mp); 1396 error = xfs_filestream_mount(mp);
1543 if (error) 1397 if (error)
1544 goto out_free_sb; 1398 goto out_free_sb;
1545 1399
1546 error = xfs_mountfs(mp); 1400 /*
1547 if (error) 1401 * we must configure the block size in the superblock before we run the
1548 goto out_filestream_unmount; 1402 * full mount process as the mount process can lookup and cache inodes.
1549 1403 * For the same reason we must also initialise the syncd and register
1404 * the inode cache shrinker so that inodes can be reclaimed during
1405 * operations like a quotacheck that iterate all inodes in the
1406 * filesystem.
1407 */
1550 sb->s_magic = XFS_SB_MAGIC; 1408 sb->s_magic = XFS_SB_MAGIC;
1551 sb->s_blocksize = mp->m_sb.sb_blocksize; 1409 sb->s_blocksize = mp->m_sb.sb_blocksize;
1552 sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1; 1410 sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1;
@@ -1554,6 +1412,16 @@ xfs_fs_fill_super(
1554 sb->s_time_gran = 1; 1412 sb->s_time_gran = 1;
1555 set_posix_acl_flag(sb); 1413 set_posix_acl_flag(sb);
1556 1414
1415 error = xfs_syncd_init(mp);
1416 if (error)
1417 goto out_filestream_unmount;
1418
1419 xfs_inode_shrinker_register(mp);
1420
1421 error = xfs_mountfs(mp);
1422 if (error)
1423 goto out_syncd_stop;
1424
1557 root = igrab(VFS_I(mp->m_rootip)); 1425 root = igrab(VFS_I(mp->m_rootip));
1558 if (!root) { 1426 if (!root) {
1559 error = ENOENT; 1427 error = ENOENT;
@@ -1569,20 +1437,18 @@ xfs_fs_fill_super(
1569 goto fail_vnrele; 1437 goto fail_vnrele;
1570 } 1438 }
1571 1439
1572 error = xfs_syncd_init(mp);
1573 if (error)
1574 goto fail_vnrele;
1575
1576 xfs_inode_shrinker_register(mp);
1577
1578 return 0; 1440 return 0;
1579 1441
1442 out_syncd_stop:
1443 xfs_inode_shrinker_unregister(mp);
1444 xfs_syncd_stop(mp);
1580 out_filestream_unmount: 1445 out_filestream_unmount:
1581 xfs_filestream_unmount(mp); 1446 xfs_filestream_unmount(mp);
1582 out_free_sb: 1447 out_free_sb:
1583 xfs_freesb(mp); 1448 xfs_freesb(mp);
1584 out_destroy_counters: 1449 out_destroy_counters:
1585 xfs_icsb_destroy_counters(mp); 1450 xfs_icsb_destroy_counters(mp);
1451 out_close_devices:
1586 xfs_close_devices(mp); 1452 xfs_close_devices(mp);
1587 out_free_fsname: 1453 out_free_fsname:
1588 xfs_free_fsname(mp); 1454 xfs_free_fsname(mp);
@@ -1599,6 +1465,9 @@ xfs_fs_fill_super(
1599 } 1465 }
1600 1466
1601 fail_unmount: 1467 fail_unmount:
1468 xfs_inode_shrinker_unregister(mp);
1469 xfs_syncd_stop(mp);
1470
1602 /* 1471 /*
1603 * Blow away any referenced inode in the filestreams cache. 1472 * Blow away any referenced inode in the filestreams cache.
1604 * This can and will cause log traffic as inodes go inactive 1473 * This can and will cause log traffic as inodes go inactive
@@ -1612,16 +1481,14 @@ xfs_fs_fill_super(
1612 goto out_free_sb; 1481 goto out_free_sb;
1613} 1482}
1614 1483
1615STATIC int 1484STATIC struct dentry *
1616xfs_fs_get_sb( 1485xfs_fs_mount(
1617 struct file_system_type *fs_type, 1486 struct file_system_type *fs_type,
1618 int flags, 1487 int flags,
1619 const char *dev_name, 1488 const char *dev_name,
1620 void *data, 1489 void *data)
1621 struct vfsmount *mnt)
1622{ 1490{
1623 return get_sb_bdev(fs_type, flags, dev_name, data, xfs_fs_fill_super, 1491 return mount_bdev(fs_type, flags, dev_name, data, xfs_fs_fill_super);
1624 mnt);
1625} 1492}
1626 1493
1627static const struct super_operations xfs_super_operations = { 1494static const struct super_operations xfs_super_operations = {
@@ -1642,7 +1509,7 @@ static const struct super_operations xfs_super_operations = {
1642static struct file_system_type xfs_fs_type = { 1509static struct file_system_type xfs_fs_type = {
1643 .owner = THIS_MODULE, 1510 .owner = THIS_MODULE,
1644 .name = "xfs", 1511 .name = "xfs",
1645 .get_sb = xfs_fs_get_sb, 1512 .mount = xfs_fs_mount,
1646 .kill_sb = kill_block_super, 1513 .kill_sb = kill_block_super,
1647 .fs_flags = FS_REQUIRES_DEV, 1514 .fs_flags = FS_REQUIRES_DEV,
1648}; 1515};
@@ -1790,6 +1657,38 @@ xfs_destroy_zones(void)
1790} 1657}
1791 1658
1792STATIC int __init 1659STATIC int __init
1660xfs_init_workqueues(void)
1661{
1662 /*
 1663 * max_active is set to 8 to give enough concurrency to allow
1664 * multiple work operations on each CPU to run. This allows multiple
1665 * filesystems to be running sync work concurrently, and scales with
1666 * the number of CPUs in the system.
1667 */
1668 xfs_syncd_wq = alloc_workqueue("xfssyncd", WQ_CPU_INTENSIVE, 8);
1669 if (!xfs_syncd_wq)
1670 goto out;
1671
1672 xfs_ail_wq = alloc_workqueue("xfsail", WQ_CPU_INTENSIVE, 8);
1673 if (!xfs_ail_wq)
1674 goto out_destroy_syncd;
1675
1676 return 0;
1677
1678out_destroy_syncd:
1679 destroy_workqueue(xfs_syncd_wq);
1680out:
1681 return -ENOMEM;
1682}
1683
1684STATIC void
1685xfs_destroy_workqueues(void)
1686{
1687 destroy_workqueue(xfs_ail_wq);
1688 destroy_workqueue(xfs_syncd_wq);
1689}
1690
1691STATIC int __init
1793init_xfs_fs(void) 1692init_xfs_fs(void)
1794{ 1693{
1795 int error; 1694 int error;
@@ -1804,10 +1703,14 @@ init_xfs_fs(void)
1804 if (error) 1703 if (error)
1805 goto out; 1704 goto out;
1806 1705
1807 error = xfs_mru_cache_init(); 1706 error = xfs_init_workqueues();
1808 if (error) 1707 if (error)
1809 goto out_destroy_zones; 1708 goto out_destroy_zones;
1810 1709
1710 error = xfs_mru_cache_init();
1711 if (error)
1712 goto out_destroy_wq;
1713
1811 error = xfs_filestream_init(); 1714 error = xfs_filestream_init();
1812 if (error) 1715 if (error)
1813 goto out_mru_cache_uninit; 1716 goto out_mru_cache_uninit;
@@ -1841,6 +1744,8 @@ init_xfs_fs(void)
1841 xfs_filestream_uninit(); 1744 xfs_filestream_uninit();
1842 out_mru_cache_uninit: 1745 out_mru_cache_uninit:
1843 xfs_mru_cache_uninit(); 1746 xfs_mru_cache_uninit();
1747 out_destroy_wq:
1748 xfs_destroy_workqueues();
1844 out_destroy_zones: 1749 out_destroy_zones:
1845 xfs_destroy_zones(); 1750 xfs_destroy_zones();
1846 out: 1751 out:
@@ -1857,6 +1762,7 @@ exit_xfs_fs(void)
1857 xfs_buf_terminate(); 1762 xfs_buf_terminate();
1858 xfs_filestream_uninit(); 1763 xfs_filestream_uninit();
1859 xfs_mru_cache_uninit(); 1764 xfs_mru_cache_uninit();
1765 xfs_destroy_workqueues();
1860 xfs_destroy_zones(); 1766 xfs_destroy_zones();
1861} 1767}
1862 1768
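The workqueue changes above replace the old xfssyncd kthread with a self-rearming delayed work item: alloc_workqueue() at module init, queue_delayed_work() from the handler to re-arm, and teardown on exit. A minimal stand-alone module sketch of that pattern; the module name and the 5-second interval are made up for illustration:

/*
 * Editorial sketch (not part of the patch): the self-rearming
 * delayed-work pattern introduced for xfssyncd, in isolation.
 */
#include <linux/module.h>
#include <linux/workqueue.h>
#include <linux/jiffies.h>

static struct workqueue_struct *demo_wq;
static struct delayed_work demo_work;

static void demo_worker(struct work_struct *work)
{
	pr_info("demo: periodic work ran\n");
	/* re-arm, as xfs_syncd_queue_sync() does for m_sync_work */
	queue_delayed_work(demo_wq, &demo_work, msecs_to_jiffies(5000));
}

static int __init demo_init(void)
{
	/* max_active 8, as in xfs_init_workqueues(), to allow concurrency */
	demo_wq = alloc_workqueue("demo_syncd", WQ_CPU_INTENSIVE, 8);
	if (!demo_wq)
		return -ENOMEM;

	INIT_DELAYED_WORK(&demo_work, demo_worker);
	queue_delayed_work(demo_wq, &demo_work, msecs_to_jiffies(5000));
	return 0;
}

static void __exit demo_exit(void)
{
	cancel_delayed_work_sync(&demo_work);
	destroy_workqueue(demo_wq);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");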
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h
index 1ef4a4d2d997..50a3266c999e 100644
--- a/fs/xfs/linux-2.6/xfs_super.h
+++ b/fs/xfs/linux-2.6/xfs_super.h
@@ -62,6 +62,7 @@ extern void xfs_qm_exit(void);
62# define XFS_DBG_STRING "no debug" 62# define XFS_DBG_STRING "no debug"
63#endif 63#endif
64 64
65#define XFS_VERSION_STRING "SGI XFS"
65#define XFS_BUILD_OPTIONS XFS_ACL_STRING \ 66#define XFS_BUILD_OPTIONS XFS_ACL_STRING \
66 XFS_SECURITY_STRING \ 67 XFS_SECURITY_STRING \
67 XFS_REALTIME_STRING \ 68 XFS_REALTIME_STRING \
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 81976ffed7d6..8ecad5ff9f9b 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -22,6 +22,7 @@
22#include "xfs_log.h" 22#include "xfs_log.h"
23#include "xfs_inum.h" 23#include "xfs_inum.h"
24#include "xfs_trans.h" 24#include "xfs_trans.h"
25#include "xfs_trans_priv.h"
25#include "xfs_sb.h" 26#include "xfs_sb.h"
26#include "xfs_ag.h" 27#include "xfs_ag.h"
27#include "xfs_mount.h" 28#include "xfs_mount.h"
@@ -39,42 +40,61 @@
39#include <linux/kthread.h> 40#include <linux/kthread.h>
40#include <linux/freezer.h> 41#include <linux/freezer.h>
41 42
43struct workqueue_struct *xfs_syncd_wq; /* sync workqueue */
42 44
43STATIC xfs_inode_t * 45/*
44xfs_inode_ag_lookup( 46 * The inode lookup is done in batches to keep the amount of lock traffic and
45 struct xfs_mount *mp, 47 * radix tree lookups to a minimum. The batch size is a trade off between
46 struct xfs_perag *pag, 48 * lookup reduction and stack usage. This is in the reclaim path, so we can't
47 uint32_t *first_index, 49 * be too greedy.
48 int tag) 50 */
51#define XFS_LOOKUP_BATCH 32
52
53STATIC int
54xfs_inode_ag_walk_grab(
55 struct xfs_inode *ip)
49{ 56{
50 int nr_found; 57 struct inode *inode = VFS_I(ip);
51 struct xfs_inode *ip; 58
59 ASSERT(rcu_read_lock_held());
52 60
53 /* 61 /*
54 * use a gang lookup to find the next inode in the tree 62 * check for stale RCU freed inode
55 * as the tree is sparse and a gang lookup walks to find 63 *
56 * the number of objects requested. 64 * If the inode has been reallocated, it doesn't matter if it's not in
65 * the AG we are walking - we are walking for writeback, so if it
66 * passes all the "valid inode" checks and is dirty, then we'll write
67 * it back anyway. If it has been reallocated and still being
68 * initialised, the XFS_INEW check below will catch it.
57 */ 69 */
58 if (tag == XFS_ICI_NO_TAG) { 70 spin_lock(&ip->i_flags_lock);
59 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, 71 if (!ip->i_ino)
60 (void **)&ip, *first_index, 1); 72 goto out_unlock_noent;
61 } else { 73
62 nr_found = radix_tree_gang_lookup_tag(&pag->pag_ici_root, 74 /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
63 (void **)&ip, *first_index, 1, tag); 75 if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
76 goto out_unlock_noent;
77 spin_unlock(&ip->i_flags_lock);
78
79 /* nothing to sync during shutdown */
80 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
81 return EFSCORRUPTED;
82
 83 /* If we can't grab the inode, it must be on its way to reclaim. */
84 if (!igrab(inode))
85 return ENOENT;
86
87 if (is_bad_inode(inode)) {
88 IRELE(ip);
89 return ENOENT;
64 } 90 }
65 if (!nr_found)
66 return NULL;
67 91
68 /* 92 /* inode is valid */
69 * Update the index for the next lookup. Catch overflows 93 return 0;
70 * into the next AG range which can occur if we have inodes 94
71 * in the last block of the AG and we are currently 95out_unlock_noent:
72 * pointing to the last inode. 96 spin_unlock(&ip->i_flags_lock);
73 */ 97 return ENOENT;
74 *first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
75 if (*first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
76 return NULL;
77 return ip;
78} 98}
79 99
80STATIC int 100STATIC int
@@ -83,49 +103,83 @@ xfs_inode_ag_walk(
83 struct xfs_perag *pag, 103 struct xfs_perag *pag,
84 int (*execute)(struct xfs_inode *ip, 104 int (*execute)(struct xfs_inode *ip,
85 struct xfs_perag *pag, int flags), 105 struct xfs_perag *pag, int flags),
86 int flags, 106 int flags)
87 int tag,
88 int exclusive,
89 int *nr_to_scan)
90{ 107{
91 uint32_t first_index; 108 uint32_t first_index;
92 int last_error = 0; 109 int last_error = 0;
93 int skipped; 110 int skipped;
111 int done;
112 int nr_found;
94 113
95restart: 114restart:
115 done = 0;
96 skipped = 0; 116 skipped = 0;
97 first_index = 0; 117 first_index = 0;
118 nr_found = 0;
98 do { 119 do {
120 struct xfs_inode *batch[XFS_LOOKUP_BATCH];
99 int error = 0; 121 int error = 0;
100 xfs_inode_t *ip; 122 int i;
101 123
102 if (exclusive) 124 rcu_read_lock();
103 write_lock(&pag->pag_ici_lock); 125 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
104 else 126 (void **)batch, first_index,
105 read_lock(&pag->pag_ici_lock); 127 XFS_LOOKUP_BATCH);
106 ip = xfs_inode_ag_lookup(mp, pag, &first_index, tag); 128 if (!nr_found) {
107 if (!ip) { 129 rcu_read_unlock();
108 if (exclusive)
109 write_unlock(&pag->pag_ici_lock);
110 else
111 read_unlock(&pag->pag_ici_lock);
112 break; 130 break;
113 } 131 }
114 132
115 /* execute releases pag->pag_ici_lock */ 133 /*
116 error = execute(ip, pag, flags); 134 * Grab the inodes before we drop the lock. if we found
117 if (error == EAGAIN) { 135 * nothing, nr == 0 and the loop will be skipped.
118 skipped++; 136 */
119 continue; 137 for (i = 0; i < nr_found; i++) {
138 struct xfs_inode *ip = batch[i];
139
140 if (done || xfs_inode_ag_walk_grab(ip))
141 batch[i] = NULL;
142
143 /*
144 * Update the index for the next lookup. Catch
145 * overflows into the next AG range which can occur if
146 * we have inodes in the last block of the AG and we
147 * are currently pointing to the last inode.
148 *
149 * Because we may see inodes that are from the wrong AG
150 * due to RCU freeing and reallocation, only update the
 151 * index if it lies in this AG. It was a race that led
152 * us to see this inode, so another lookup from the
153 * same index will not find it again.
154 */
155 if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
156 continue;
157 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
158 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
159 done = 1;
160 }
161
162 /* unlock now we've grabbed the inodes. */
163 rcu_read_unlock();
164
165 for (i = 0; i < nr_found; i++) {
166 if (!batch[i])
167 continue;
168 error = execute(batch[i], pag, flags);
169 IRELE(batch[i]);
170 if (error == EAGAIN) {
171 skipped++;
172 continue;
173 }
174 if (error && last_error != EFSCORRUPTED)
175 last_error = error;
120 } 176 }
121 if (error)
122 last_error = error;
123 177
124 /* bail out if the filesystem is corrupted. */ 178 /* bail out if the filesystem is corrupted. */
125 if (error == EFSCORRUPTED) 179 if (error == EFSCORRUPTED)
126 break; 180 break;
127 181
128 } while ((*nr_to_scan)--); 182 } while (nr_found && !done);
129 183
130 if (skipped) { 184 if (skipped) {
131 delay(1); 185 delay(1);
@@ -134,110 +188,32 @@ restart:
134 return last_error; 188 return last_error;
135} 189}
136 190
137/*
138 * Select the next per-ag structure to iterate during the walk. The reclaim
139 * walk is optimised only to walk AGs with reclaimable inodes in them.
140 */
141static struct xfs_perag *
142xfs_inode_ag_iter_next_pag(
143 struct xfs_mount *mp,
144 xfs_agnumber_t *first,
145 int tag)
146{
147 struct xfs_perag *pag = NULL;
148
149 if (tag == XFS_ICI_RECLAIM_TAG) {
150 int found;
151 int ref;
152
153 spin_lock(&mp->m_perag_lock);
154 found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
155 (void **)&pag, *first, 1, tag);
156 if (found <= 0) {
157 spin_unlock(&mp->m_perag_lock);
158 return NULL;
159 }
160 *first = pag->pag_agno + 1;
161 /* open coded pag reference increment */
162 ref = atomic_inc_return(&pag->pag_ref);
163 spin_unlock(&mp->m_perag_lock);
164 trace_xfs_perag_get_reclaim(mp, pag->pag_agno, ref, _RET_IP_);
165 } else {
166 pag = xfs_perag_get(mp, *first);
167 (*first)++;
168 }
169 return pag;
170}
171
172int 191int
173xfs_inode_ag_iterator( 192xfs_inode_ag_iterator(
174 struct xfs_mount *mp, 193 struct xfs_mount *mp,
175 int (*execute)(struct xfs_inode *ip, 194 int (*execute)(struct xfs_inode *ip,
176 struct xfs_perag *pag, int flags), 195 struct xfs_perag *pag, int flags),
177 int flags, 196 int flags)
178 int tag,
179 int exclusive,
180 int *nr_to_scan)
181{ 197{
182 struct xfs_perag *pag; 198 struct xfs_perag *pag;
183 int error = 0; 199 int error = 0;
184 int last_error = 0; 200 int last_error = 0;
185 xfs_agnumber_t ag; 201 xfs_agnumber_t ag;
186 int nr;
187 202
188 nr = nr_to_scan ? *nr_to_scan : INT_MAX;
189 ag = 0; 203 ag = 0;
190 while ((pag = xfs_inode_ag_iter_next_pag(mp, &ag, tag))) { 204 while ((pag = xfs_perag_get(mp, ag))) {
191 error = xfs_inode_ag_walk(mp, pag, execute, flags, tag, 205 ag = pag->pag_agno + 1;
192 exclusive, &nr); 206 error = xfs_inode_ag_walk(mp, pag, execute, flags);
193 xfs_perag_put(pag); 207 xfs_perag_put(pag);
194 if (error) { 208 if (error) {
195 last_error = error; 209 last_error = error;
196 if (error == EFSCORRUPTED) 210 if (error == EFSCORRUPTED)
197 break; 211 break;
198 } 212 }
199 if (nr <= 0)
200 break;
201 } 213 }
202 if (nr_to_scan)
203 *nr_to_scan = nr;
204 return XFS_ERROR(last_error); 214 return XFS_ERROR(last_error);
205} 215}
206 216
207/* must be called with pag_ici_lock held and releases it */
208int
209xfs_sync_inode_valid(
210 struct xfs_inode *ip,
211 struct xfs_perag *pag)
212{
213 struct inode *inode = VFS_I(ip);
214 int error = EFSCORRUPTED;
215
216 /* nothing to sync during shutdown */
217 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
218 goto out_unlock;
219
220 /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
221 error = ENOENT;
222 if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
223 goto out_unlock;
224
225 /* If we can't grab the inode, it must on it's way to reclaim. */
226 if (!igrab(inode))
227 goto out_unlock;
228
229 if (is_bad_inode(inode)) {
230 IRELE(ip);
231 goto out_unlock;
232 }
233
234 /* inode is valid */
235 error = 0;
236out_unlock:
237 read_unlock(&pag->pag_ici_lock);
238 return error;
239}
240
241STATIC int 217STATIC int
242xfs_sync_inode_data( 218xfs_sync_inode_data(
243 struct xfs_inode *ip, 219 struct xfs_inode *ip,
@@ -248,10 +224,6 @@ xfs_sync_inode_data(
248 struct address_space *mapping = inode->i_mapping; 224 struct address_space *mapping = inode->i_mapping;
249 int error = 0; 225 int error = 0;
250 226
251 error = xfs_sync_inode_valid(ip, pag);
252 if (error)
253 return error;
254
255 if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) 227 if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
256 goto out_wait; 228 goto out_wait;
257 229
@@ -268,7 +240,6 @@ xfs_sync_inode_data(
268 out_wait: 240 out_wait:
269 if (flags & SYNC_WAIT) 241 if (flags & SYNC_WAIT)
270 xfs_ioend_wait(ip); 242 xfs_ioend_wait(ip);
271 IRELE(ip);
272 return error; 243 return error;
273} 244}
274 245
@@ -280,10 +251,6 @@ xfs_sync_inode_attr(
280{ 251{
281 int error = 0; 252 int error = 0;
282 253
283 error = xfs_sync_inode_valid(ip, pag);
284 if (error)
285 return error;
286
287 xfs_ilock(ip, XFS_ILOCK_SHARED); 254 xfs_ilock(ip, XFS_ILOCK_SHARED);
288 if (xfs_inode_clean(ip)) 255 if (xfs_inode_clean(ip))
289 goto out_unlock; 256 goto out_unlock;
@@ -300,9 +267,18 @@ xfs_sync_inode_attr(
300 267
301 error = xfs_iflush(ip, flags); 268 error = xfs_iflush(ip, flags);
302 269
270 /*
271 * We don't want to try again on non-blocking flushes that can't run
272 * again immediately. If an inode really must be written, then that's
273 * what the SYNC_WAIT flag is for.
274 */
275 if (error == EAGAIN) {
276 ASSERT(!(flags & SYNC_WAIT));
277 error = 0;
278 }
279
303 out_unlock: 280 out_unlock:
304 xfs_iunlock(ip, XFS_ILOCK_SHARED); 281 xfs_iunlock(ip, XFS_ILOCK_SHARED);
305 IRELE(ip);
306 return error; 282 return error;
307} 283}
308 284
@@ -318,8 +294,7 @@ xfs_sync_data(
318 294
319 ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0); 295 ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);
320 296
321 error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags, 297 error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags);
322 XFS_ICI_NO_TAG, 0, NULL);
323 if (error) 298 if (error)
324 return XFS_ERROR(error); 299 return XFS_ERROR(error);
325 300
@@ -337,8 +312,7 @@ xfs_sync_attr(
337{ 312{
338 ASSERT((flags & ~SYNC_WAIT) == 0); 313 ASSERT((flags & ~SYNC_WAIT) == 0);
339 314
340 return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags, 315 return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags);
341 XFS_ICI_NO_TAG, 0, NULL);
342} 316}
343 317
344STATIC int 318STATIC int
@@ -401,7 +375,7 @@ xfs_quiesce_data(
401 375
402 /* mark the log as covered if needed */ 376 /* mark the log as covered if needed */
403 if (xfs_log_need_covered(mp)) 377 if (xfs_log_need_covered(mp))
404 error2 = xfs_fs_log_dummy(mp, SYNC_WAIT); 378 error2 = xfs_fs_log_dummy(mp);
405 379
406 /* flush data-only devices */ 380 /* flush data-only devices */
407 if (mp->m_rtdev_targp) 381 if (mp->m_rtdev_targp)
@@ -440,7 +414,7 @@ xfs_quiesce_fs(
440/* 414/*
441 * Second stage of a quiesce. The data is already synced, now we have to take 415 * Second stage of a quiesce. The data is already synced, now we have to take
442 * care of the metadata. New transactions are already blocked, so we need to 416 * care of the metadata. New transactions are already blocked, so we need to
443 * wait for any remaining transactions to drain out before proceding. 417 * wait for any remaining transactions to drain out before proceeding.
444 */ 418 */
445void 419void
446xfs_quiesce_attr( 420xfs_quiesce_attr(
@@ -464,69 +438,18 @@ xfs_quiesce_attr(
464 /* Push the superblock and write an unmount record */ 438 /* Push the superblock and write an unmount record */
465 error = xfs_log_sbcount(mp, 1); 439 error = xfs_log_sbcount(mp, 1);
466 if (error) 440 if (error)
467 xfs_fs_cmn_err(CE_WARN, mp, 441 xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. "
468 "xfs_attr_quiesce: failed to log sb changes. "
469 "Frozen image may not be consistent."); 442 "Frozen image may not be consistent.");
470 xfs_log_unmount_write(mp); 443 xfs_log_unmount_write(mp);
471 xfs_unmountfs_writesb(mp); 444 xfs_unmountfs_writesb(mp);
472} 445}
473 446
474/* 447static void
475 * Enqueue a work item to be picked up by the vfs xfssyncd thread. 448xfs_syncd_queue_sync(
476 * Doing this has two advantages: 449 struct xfs_mount *mp)
477 * - It saves on stack space, which is tight in certain situations
478 * - It can be used (with care) as a mechanism to avoid deadlocks.
479 * Flushing while allocating in a full filesystem requires both.
480 */
481STATIC void
482xfs_syncd_queue_work(
483 struct xfs_mount *mp,
484 void *data,
485 void (*syncer)(struct xfs_mount *, void *),
486 struct completion *completion)
487{
488 struct xfs_sync_work *work;
489
490 work = kmem_alloc(sizeof(struct xfs_sync_work), KM_SLEEP);
491 INIT_LIST_HEAD(&work->w_list);
492 work->w_syncer = syncer;
493 work->w_data = data;
494 work->w_mount = mp;
495 work->w_completion = completion;
496 spin_lock(&mp->m_sync_lock);
497 list_add_tail(&work->w_list, &mp->m_sync_list);
498 spin_unlock(&mp->m_sync_lock);
499 wake_up_process(mp->m_sync_task);
500}
501
502/*
503 * Flush delayed allocate data, attempting to free up reserved space
504 * from existing allocations. At this point a new allocation attempt
505 * has failed with ENOSPC and we are in the process of scratching our
506 * heads, looking about for more room...
507 */
508STATIC void
509xfs_flush_inodes_work(
510 struct xfs_mount *mp,
511 void *arg)
512{
513 struct inode *inode = arg;
514 xfs_sync_data(mp, SYNC_TRYLOCK);
515 xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT);
516 iput(inode);
517}
518
519void
520xfs_flush_inodes(
521 xfs_inode_t *ip)
522{ 450{
523 struct inode *inode = VFS_I(ip); 451 queue_delayed_work(xfs_syncd_wq, &mp->m_sync_work,
524 DECLARE_COMPLETION_ONSTACK(completion); 452 msecs_to_jiffies(xfs_syncd_centisecs * 10));
525
526 igrab(inode);
527 xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inodes_work, &completion);
528 wait_for_completion(&completion);
529 xfs_log_force(ip->i_mount, XFS_LOG_SYNC);
530} 453}
531 454
532/* 455/*
@@ -536,84 +459,119 @@ xfs_flush_inodes(
536 */ 459 */
537STATIC void 460STATIC void
538xfs_sync_worker( 461xfs_sync_worker(
539 struct xfs_mount *mp, 462 struct work_struct *work)
540 void *unused)
541{ 463{
464 struct xfs_mount *mp = container_of(to_delayed_work(work),
465 struct xfs_mount, m_sync_work);
542 int error; 466 int error;
543 467
544 if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { 468 if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
545 xfs_log_force(mp, 0);
546 xfs_reclaim_inodes(mp, 0);
547 /* dgc: errors ignored here */ 469 /* dgc: errors ignored here */
548 error = xfs_qm_sync(mp, SYNC_TRYLOCK);
549 if (mp->m_super->s_frozen == SB_UNFROZEN && 470 if (mp->m_super->s_frozen == SB_UNFROZEN &&
550 xfs_log_need_covered(mp)) 471 xfs_log_need_covered(mp))
551 error = xfs_fs_log_dummy(mp, 0); 472 error = xfs_fs_log_dummy(mp);
473 else
474 xfs_log_force(mp, 0);
475 error = xfs_qm_sync(mp, SYNC_TRYLOCK);
476
477 /* start pushing all the metadata that is currently dirty */
478 xfs_ail_push_all(mp->m_ail);
552 } 479 }
553 mp->m_sync_seq++; 480
554 wake_up(&mp->m_wait_single_sync_task); 481 /* queue us up again */
482 xfs_syncd_queue_sync(mp);
555} 483}
556 484
557STATIC int 485/*
558xfssyncd( 486 * Queue a new inode reclaim pass if there are reclaimable inodes and there
559 void *arg) 487 * isn't a reclaim pass already in progress. By default it runs every 5s based
488 * on the xfs syncd work default of 30s. Perhaps this should have its own
489 * tunable, but that can be done if this method proves to be ineffective or too
490 * aggressive.
491 */
492static void
493xfs_syncd_queue_reclaim(
494 struct xfs_mount *mp)
560{ 495{
561 struct xfs_mount *mp = arg;
562 long timeleft;
563 xfs_sync_work_t *work, *n;
564 LIST_HEAD (tmp);
565
566 set_freezable();
567 timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10);
568 for (;;) {
569 if (list_empty(&mp->m_sync_list))
570 timeleft = schedule_timeout_interruptible(timeleft);
571 /* swsusp */
572 try_to_freeze();
573 if (kthread_should_stop() && list_empty(&mp->m_sync_list))
574 break;
575 496
576 spin_lock(&mp->m_sync_lock); 497 /*
577 /* 498 * We can have inodes enter reclaim after we've shut down the syncd
578 * We can get woken by laptop mode, to do a sync - 499 * workqueue during unmount, so don't allow reclaim work to be queued
579 * that's the (only!) case where the list would be 500 * during unmount.
580 * empty with time remaining. 501 */
581 */ 502 if (!(mp->m_super->s_flags & MS_ACTIVE))
582 if (!timeleft || list_empty(&mp->m_sync_list)) { 503 return;
583 if (!timeleft)
584 timeleft = xfs_syncd_centisecs *
585 msecs_to_jiffies(10);
586 INIT_LIST_HEAD(&mp->m_sync_work.w_list);
587 list_add_tail(&mp->m_sync_work.w_list,
588 &mp->m_sync_list);
589 }
590 list_splice_init(&mp->m_sync_list, &tmp);
591 spin_unlock(&mp->m_sync_lock);
592 504
593 list_for_each_entry_safe(work, n, &tmp, w_list) { 505 rcu_read_lock();
594 (*work->w_syncer)(mp, work->w_data); 506 if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
595 list_del(&work->w_list); 507 queue_delayed_work(xfs_syncd_wq, &mp->m_reclaim_work,
596 if (work == &mp->m_sync_work) 508 msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
597 continue;
598 if (work->w_completion)
599 complete(work->w_completion);
600 kmem_free(work);
601 }
602 } 509 }
510 rcu_read_unlock();
511}
603 512
604 return 0; 513/*
514 * This is a fast pass over the inode cache to try to get reclaim moving on as
515 * many inodes as possible in a short period of time. It kicks itself every few
516 * seconds, as well as being kicked by the inode cache shrinker when memory
517 * goes low. It scans as quickly as possible avoiding locked inodes or those
518 * already being flushed, and once done schedules a future pass.
519 */
520STATIC void
521xfs_reclaim_worker(
522 struct work_struct *work)
523{
524 struct xfs_mount *mp = container_of(to_delayed_work(work),
525 struct xfs_mount, m_reclaim_work);
526
527 xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
528 xfs_syncd_queue_reclaim(mp);
529}
530
531/*
532 * Flush delayed allocate data, attempting to free up reserved space
533 * from existing allocations. At this point a new allocation attempt
534 * has failed with ENOSPC and we are in the process of scratching our
535 * heads, looking about for more room.
536 *
537 * Queue a new data flush if there isn't one already in progress and
538 * wait for completion of the flush. This means that we only ever have one
539 * inode flush in progress no matter how many ENOSPC events are occurring and
540 * so will prevent the system from bogging down due to every concurrent
541 * ENOSPC event scanning all the active inodes in the system for writeback.
542 */
543void
544xfs_flush_inodes(
545 struct xfs_inode *ip)
546{
547 struct xfs_mount *mp = ip->i_mount;
548
549 queue_work(xfs_syncd_wq, &mp->m_flush_work);
550 flush_work_sync(&mp->m_flush_work);
551}
552
553STATIC void
554xfs_flush_worker(
555 struct work_struct *work)
556{
557 struct xfs_mount *mp = container_of(work,
558 struct xfs_mount, m_flush_work);
559
560 xfs_sync_data(mp, SYNC_TRYLOCK);
561 xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT);
605} 562}
606 563
607int 564int
608xfs_syncd_init( 565xfs_syncd_init(
609 struct xfs_mount *mp) 566 struct xfs_mount *mp)
610{ 567{
611 mp->m_sync_work.w_syncer = xfs_sync_worker; 568 INIT_WORK(&mp->m_flush_work, xfs_flush_worker);
612 mp->m_sync_work.w_mount = mp; 569 INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker);
613 mp->m_sync_work.w_completion = NULL; 570 INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
614 mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd/%s", mp->m_fsname); 571
615 if (IS_ERR(mp->m_sync_task)) 572 xfs_syncd_queue_sync(mp);
616 return -PTR_ERR(mp->m_sync_task); 573 xfs_syncd_queue_reclaim(mp);
574
617 return 0; 575 return 0;
618} 576}
619 577
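The hunks above retire the dedicated xfssyncd kthread in favour of work items on a sync workqueue: a self-requeueing delayed work for the periodic sync pass, another for background inode reclaim, and a one-shot flush work for the ENOSPC path. Below is a minimal sketch of the self-requeueing delayed-work pattern, assuming the standard workqueue API of this kernel era; the demo_* names are illustrative only and not part of the patch.

#include <linux/workqueue.h>
#include <linux/jiffies.h>

static struct workqueue_struct *demo_syncd_wq;	/* illustrative workqueue */
static struct delayed_work demo_sync_work;

static void
demo_sync_worker(
	struct work_struct	*work)
{
	struct delayed_work	*dwork = to_delayed_work(work);

	/* ... periodic background work (log cover, AIL push, etc.) ... */

	/* requeue ourselves so the pass repeats, like xfs_syncd_queue_sync() */
	queue_delayed_work(demo_syncd_wq, dwork, msecs_to_jiffies(30 * 1000));
}

static int
demo_syncd_init(void)
{
	demo_syncd_wq = alloc_workqueue("demo_syncd", 0, 0);
	if (!demo_syncd_wq)
		return -ENOMEM;

	INIT_DELAYED_WORK(&demo_sync_work, demo_sync_worker);
	queue_delayed_work(demo_syncd_wq, &demo_sync_work,
			   msecs_to_jiffies(30 * 1000));
	return 0;
}

static void
demo_syncd_stop(void)
{
	/* mirrors xfs_syncd_stop(): stop the work and wait for it to finish */
	cancel_delayed_work_sync(&demo_sync_work);
	destroy_workqueue(demo_syncd_wq);
}

cancel_delayed_work_sync() copes with a work item that requeues itself on the same workqueue, which is why the stop path above can stay this simple.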
@@ -621,7 +579,9 @@ void
621xfs_syncd_stop( 579xfs_syncd_stop(
622 struct xfs_mount *mp) 580 struct xfs_mount *mp)
623{ 581{
624 kthread_stop(mp->m_sync_task); 582 cancel_delayed_work_sync(&mp->m_sync_work);
583 cancel_delayed_work_sync(&mp->m_reclaim_work);
584 cancel_work_sync(&mp->m_flush_work);
625} 585}
626 586
627void 587void
@@ -640,6 +600,10 @@ __xfs_inode_set_reclaim_tag(
640 XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), 600 XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
641 XFS_ICI_RECLAIM_TAG); 601 XFS_ICI_RECLAIM_TAG);
642 spin_unlock(&ip->i_mount->m_perag_lock); 602 spin_unlock(&ip->i_mount->m_perag_lock);
603
604 /* schedule periodic background inode reclaim */
605 xfs_syncd_queue_reclaim(ip->i_mount);
606
643 trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno, 607 trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno,
644 -1, _RET_IP_); 608 -1, _RET_IP_);
645 } 609 }
@@ -659,12 +623,12 @@ xfs_inode_set_reclaim_tag(
659 struct xfs_perag *pag; 623 struct xfs_perag *pag;
660 624
661 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); 625 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
662 write_lock(&pag->pag_ici_lock); 626 spin_lock(&pag->pag_ici_lock);
663 spin_lock(&ip->i_flags_lock); 627 spin_lock(&ip->i_flags_lock);
664 __xfs_inode_set_reclaim_tag(pag, ip); 628 __xfs_inode_set_reclaim_tag(pag, ip);
665 __xfs_iflags_set(ip, XFS_IRECLAIMABLE); 629 __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
666 spin_unlock(&ip->i_flags_lock); 630 spin_unlock(&ip->i_flags_lock);
667 write_unlock(&pag->pag_ici_lock); 631 spin_unlock(&pag->pag_ici_lock);
668 xfs_perag_put(pag); 632 xfs_perag_put(pag);
669} 633}
670 634
@@ -698,6 +662,53 @@ __xfs_inode_clear_reclaim_tag(
698} 662}
699 663
700/* 664/*
665 * Grab the inode for reclaim exclusively.
666 * Return 0 if we grabbed it, non-zero otherwise.
667 */
668STATIC int
669xfs_reclaim_inode_grab(
670 struct xfs_inode *ip,
671 int flags)
672{
673 ASSERT(rcu_read_lock_held());
674
675 /* quick check for stale RCU freed inode */
676 if (!ip->i_ino)
677 return 1;
678
679 /*
680 * do some unlocked checks first to avoid unnecessary lock traffic.
 681 * The first is a flush lock check, the second is an already-in-reclaim
682 * check. Only do these checks if we are not going to block on locks.
683 */
684 if ((flags & SYNC_TRYLOCK) &&
685 (!ip->i_flush.done || __xfs_iflags_test(ip, XFS_IRECLAIM))) {
686 return 1;
687 }
688
689 /*
690 * The radix tree lock here protects a thread in xfs_iget from racing
691 * with us starting reclaim on the inode. Once we have the
692 * XFS_IRECLAIM flag set it will not touch us.
693 *
694 * Due to RCU lookup, we may find inodes that have been freed and only
695 * have XFS_IRECLAIM set. Indeed, we may see reallocated inodes that
 696 * aren't candidates for reclaim at all, so we must check that
697 * XFS_IRECLAIMABLE is set first before proceeding to reclaim.
698 */
699 spin_lock(&ip->i_flags_lock);
700 if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
701 __xfs_iflags_test(ip, XFS_IRECLAIM)) {
702 /* not a reclaim candidate. */
703 spin_unlock(&ip->i_flags_lock);
704 return 1;
705 }
706 __xfs_iflags_set(ip, XFS_IRECLAIM);
707 spin_unlock(&ip->i_flags_lock);
708 return 0;
709}
710
711/*
701 * Inodes in different states need to be treated differently, and the return 712 * Inodes in different states need to be treated differently, and the return
702 * value of xfs_iflush is not sufficient to get this right. The following table 713 * value of xfs_iflush is not sufficient to get this right. The following table
703 * lists the inode states and the reclaim actions necessary for non-blocking 714 * lists the inode states and the reclaim actions necessary for non-blocking
@@ -753,25 +764,10 @@ xfs_reclaim_inode(
753 struct xfs_perag *pag, 764 struct xfs_perag *pag,
754 int sync_mode) 765 int sync_mode)
755{ 766{
756 int error = 0; 767 int error;
757
758 /*
759 * The radix tree lock here protects a thread in xfs_iget from racing
760 * with us starting reclaim on the inode. Once we have the
761 * XFS_IRECLAIM flag set it will not touch us.
762 */
763 spin_lock(&ip->i_flags_lock);
764 ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE));
765 if (__xfs_iflags_test(ip, XFS_IRECLAIM)) {
766 /* ignore as it is already under reclaim */
767 spin_unlock(&ip->i_flags_lock);
768 write_unlock(&pag->pag_ici_lock);
769 return 0;
770 }
771 __xfs_iflags_set(ip, XFS_IRECLAIM);
772 spin_unlock(&ip->i_flags_lock);
773 write_unlock(&pag->pag_ici_lock);
774 768
769restart:
770 error = 0;
775 xfs_ilock(ip, XFS_ILOCK_EXCL); 771 xfs_ilock(ip, XFS_ILOCK_EXCL);
776 if (!xfs_iflock_nowait(ip)) { 772 if (!xfs_iflock_nowait(ip)) {
777 if (!(sync_mode & SYNC_WAIT)) 773 if (!(sync_mode & SYNC_WAIT))
@@ -797,9 +793,31 @@ xfs_reclaim_inode(
797 if (xfs_inode_clean(ip)) 793 if (xfs_inode_clean(ip))
798 goto reclaim; 794 goto reclaim;
799 795
800 /* Now we have an inode that needs flushing */ 796 /*
801 error = xfs_iflush(ip, sync_mode); 797 * Now we have an inode that needs flushing.
798 *
799 * We do a nonblocking flush here even if we are doing a SYNC_WAIT
800 * reclaim as we can deadlock with inode cluster removal.
801 * xfs_ifree_cluster() can lock the inode buffer before it locks the
802 * ip->i_lock, and we are doing the exact opposite here. As a result,
803 * doing a blocking xfs_itobp() to get the cluster buffer will result
804 * in an ABBA deadlock with xfs_ifree_cluster().
805 *
 806 * As xfs_ifree_cluster() must gather all inodes that are active in the
807 * cache to mark them stale, if we hit this case we don't actually want
808 * to do IO here - we want the inode marked stale so we can simply
809 * reclaim it. Hence if we get an EAGAIN error on a SYNC_WAIT flush,
810 * just unlock the inode, back off and try again. Hopefully the next
811 * pass through will see the stale flag set on the inode.
812 */
813 error = xfs_iflush(ip, SYNC_TRYLOCK | sync_mode);
802 if (sync_mode & SYNC_WAIT) { 814 if (sync_mode & SYNC_WAIT) {
815 if (error == EAGAIN) {
816 xfs_iunlock(ip, XFS_ILOCK_EXCL);
817 /* backoff longer than in xfs_ifree_cluster */
818 delay(2);
819 goto restart;
820 }
803 xfs_iflock(ip); 821 xfs_iflock(ip);
804 goto reclaim; 822 goto reclaim;
805 } 823 }
@@ -814,7 +832,7 @@ xfs_reclaim_inode(
814 * pass on the error. 832 * pass on the error.
815 */ 833 */
816 if (error && error != EAGAIN && !XFS_FORCED_SHUTDOWN(ip->i_mount)) { 834 if (error && error != EAGAIN && !XFS_FORCED_SHUTDOWN(ip->i_mount)) {
817 xfs_fs_cmn_err(CE_WARN, ip->i_mount, 835 xfs_warn(ip->i_mount,
818 "inode 0x%llx background reclaim flush failed with %d", 836 "inode 0x%llx background reclaim flush failed with %d",
819 (long long)ip->i_ino, error); 837 (long long)ip->i_ino, error);
820 } 838 }
@@ -842,12 +860,12 @@ reclaim:
842 * added to the tree assert that it's been there before to catch 860 * added to the tree assert that it's been there before to catch
843 * problems with the inode life time early on. 861 * problems with the inode life time early on.
844 */ 862 */
845 write_lock(&pag->pag_ici_lock); 863 spin_lock(&pag->pag_ici_lock);
846 if (!radix_tree_delete(&pag->pag_ici_root, 864 if (!radix_tree_delete(&pag->pag_ici_root,
847 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino))) 865 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino)))
848 ASSERT(0); 866 ASSERT(0);
849 __xfs_inode_clear_reclaim(pag, ip); 867 __xfs_inode_clear_reclaim(pag, ip);
850 write_unlock(&pag->pag_ici_lock); 868 spin_unlock(&pag->pag_ici_lock);
851 869
852 /* 870 /*
853 * Here we do an (almost) spurious inode lock in order to coordinate 871 * Here we do an (almost) spurious inode lock in order to coordinate
@@ -868,45 +886,181 @@ reclaim:
868 886
869} 887}
870 888
889/*
890 * Walk the AGs and reclaim the inodes in them. Even if the filesystem is
891 * corrupted, we still want to try to reclaim all the inodes. If we don't,
892 * then a shut down during filesystem unmount reclaim walk leak all the
893 * unreclaimed inodes.
894 */
895int
896xfs_reclaim_inodes_ag(
897 struct xfs_mount *mp,
898 int flags,
899 int *nr_to_scan)
900{
901 struct xfs_perag *pag;
902 int error = 0;
903 int last_error = 0;
904 xfs_agnumber_t ag;
905 int trylock = flags & SYNC_TRYLOCK;
906 int skipped;
907
908restart:
909 ag = 0;
910 skipped = 0;
911 while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
912 unsigned long first_index = 0;
913 int done = 0;
914 int nr_found = 0;
915
916 ag = pag->pag_agno + 1;
917
918 if (trylock) {
919 if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) {
920 skipped++;
921 xfs_perag_put(pag);
922 continue;
923 }
924 first_index = pag->pag_ici_reclaim_cursor;
925 } else
926 mutex_lock(&pag->pag_ici_reclaim_lock);
927
928 do {
929 struct xfs_inode *batch[XFS_LOOKUP_BATCH];
930 int i;
931
932 rcu_read_lock();
933 nr_found = radix_tree_gang_lookup_tag(
934 &pag->pag_ici_root,
935 (void **)batch, first_index,
936 XFS_LOOKUP_BATCH,
937 XFS_ICI_RECLAIM_TAG);
938 if (!nr_found) {
939 done = 1;
940 rcu_read_unlock();
941 break;
942 }
943
944 /*
 945 * Grab the inodes before we drop the lock. If we found
946 * nothing, nr == 0 and the loop will be skipped.
947 */
948 for (i = 0; i < nr_found; i++) {
949 struct xfs_inode *ip = batch[i];
950
951 if (done || xfs_reclaim_inode_grab(ip, flags))
952 batch[i] = NULL;
953
954 /*
955 * Update the index for the next lookup. Catch
956 * overflows into the next AG range which can
957 * occur if we have inodes in the last block of
958 * the AG and we are currently pointing to the
959 * last inode.
960 *
961 * Because we may see inodes that are from the
962 * wrong AG due to RCU freeing and
963 * reallocation, only update the index if it
 964 * lies in this AG. It was a race that led us
965 * to see this inode, so another lookup from
966 * the same index will not find it again.
967 */
968 if (XFS_INO_TO_AGNO(mp, ip->i_ino) !=
969 pag->pag_agno)
970 continue;
971 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
972 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
973 done = 1;
974 }
975
976 /* unlock now we've grabbed the inodes. */
977 rcu_read_unlock();
978
979 for (i = 0; i < nr_found; i++) {
980 if (!batch[i])
981 continue;
982 error = xfs_reclaim_inode(batch[i], pag, flags);
983 if (error && last_error != EFSCORRUPTED)
984 last_error = error;
985 }
986
987 *nr_to_scan -= XFS_LOOKUP_BATCH;
988
989 } while (nr_found && !done && *nr_to_scan > 0);
990
991 if (trylock && !done)
992 pag->pag_ici_reclaim_cursor = first_index;
993 else
994 pag->pag_ici_reclaim_cursor = 0;
995 mutex_unlock(&pag->pag_ici_reclaim_lock);
996 xfs_perag_put(pag);
997 }
998
999 /*
 1000 * If we skipped any AG, and we still have scan count remaining, do
 1001 * another pass, this time using blocking reclaim semantics (i.e.
 1002 * waiting on the reclaim locks and ignoring the reclaim cursors). This
 1003 * ensures that when we get more reclaimers than AGs we block rather
1004 * than spin trying to execute reclaim.
1005 */
1006 if (trylock && skipped && *nr_to_scan > 0) {
1007 trylock = 0;
1008 goto restart;
1009 }
1010 return XFS_ERROR(last_error);
1011}
1012
871int 1013int
872xfs_reclaim_inodes( 1014xfs_reclaim_inodes(
873 xfs_mount_t *mp, 1015 xfs_mount_t *mp,
874 int mode) 1016 int mode)
875{ 1017{
876 return xfs_inode_ag_iterator(mp, xfs_reclaim_inode, mode, 1018 int nr_to_scan = INT_MAX;
877 XFS_ICI_RECLAIM_TAG, 1, NULL); 1019
1020 return xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan);
878} 1021}
879 1022
880/* 1023/*
881 * Shrinker infrastructure. 1024 * Inode cache shrinker.
1025 *
1026 * When called we make sure that there is a background (fast) inode reclaim in
 1027 * progress, while we will throttle the speed of reclaim via doing synchronous
1028 * reclaim of inodes. That means if we come across dirty inodes, we wait for
1029 * them to be cleaned, which we hope will not be very long due to the
1030 * background walker having already kicked the IO off on those dirty inodes.
882 */ 1031 */
883static int 1032static int
884xfs_reclaim_inode_shrink( 1033xfs_reclaim_inode_shrink(
885 struct shrinker *shrink, 1034 struct shrinker *shrink,
886 int nr_to_scan, 1035 struct shrink_control *sc)
887 gfp_t gfp_mask)
888{ 1036{
889 struct xfs_mount *mp; 1037 struct xfs_mount *mp;
890 struct xfs_perag *pag; 1038 struct xfs_perag *pag;
891 xfs_agnumber_t ag; 1039 xfs_agnumber_t ag;
892 int reclaimable; 1040 int reclaimable;
1041 int nr_to_scan = sc->nr_to_scan;
1042 gfp_t gfp_mask = sc->gfp_mask;
893 1043
894 mp = container_of(shrink, struct xfs_mount, m_inode_shrink); 1044 mp = container_of(shrink, struct xfs_mount, m_inode_shrink);
895 if (nr_to_scan) { 1045 if (nr_to_scan) {
1046 /* kick background reclaimer and push the AIL */
1047 xfs_syncd_queue_reclaim(mp);
1048 xfs_ail_push_all(mp->m_ail);
1049
896 if (!(gfp_mask & __GFP_FS)) 1050 if (!(gfp_mask & __GFP_FS))
897 return -1; 1051 return -1;
898 1052
899 xfs_inode_ag_iterator(mp, xfs_reclaim_inode, 0, 1053 xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT,
900 XFS_ICI_RECLAIM_TAG, 1, &nr_to_scan); 1054 &nr_to_scan);
901 /* if we don't exhaust the scan, don't bother coming back */ 1055 /* terminate if we don't exhaust the scan */
902 if (nr_to_scan > 0) 1056 if (nr_to_scan > 0)
903 return -1; 1057 return -1;
904 } 1058 }
905 1059
906 reclaimable = 0; 1060 reclaimable = 0;
907 ag = 0; 1061 ag = 0;
908 while ((pag = xfs_inode_ag_iter_next_pag(mp, &ag, 1062 while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
909 XFS_ICI_RECLAIM_TAG))) { 1063 ag = pag->pag_agno + 1;
910 reclaimable += pag->pag_ici_reclaimable; 1064 reclaimable += pag->pag_ici_reclaimable;
911 xfs_perag_put(pag); 1065 xfs_perag_put(pag);
912 } 1066 }
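xfs_reclaim_inodes_ag() above walks each AG's inode radix tree in batches under rcu_read_lock(), grabbing candidates before the lock is dropped. The following is a minimal sketch of that batched, tag-filtered lookup pattern under the radix-tree API of this era; the demo_* names and batch size are illustrative, and a real walker must pin or flag each item while still under the RCU lock (as xfs_reclaim_inode_grab() does) before using it afterwards.

#include <linux/radix-tree.h>
#include <linux/rcupdate.h>

#define DEMO_BATCH	32	/* illustrative; XFS uses XFS_LOOKUP_BATCH */

struct demo_item {
	unsigned long		index;	/* key the item is stored under */
	/* ... payload ... */
};

static void
demo_walk_tagged(
	struct radix_tree_root	*root,
	unsigned int		tag,
	int			(*grab)(struct demo_item *item),
	void			(*process)(struct demo_item *item))
{
	unsigned long		first_index = 0;
	struct demo_item	*batch[DEMO_BATCH];
	unsigned int		nr_found;
	unsigned int		i;

	do {
		rcu_read_lock();
		nr_found = radix_tree_gang_lookup_tag(root, (void **)batch,
						      first_index, DEMO_BATCH,
						      tag);
		if (!nr_found) {
			rcu_read_unlock();
			break;
		}

		/*
		 * Advance the cursor past the last key seen before dropping
		 * the RCU lock (XFS derives this from the inode number, with
		 * AG-overflow checks).
		 */
		first_index = batch[nr_found - 1]->index + 1;

		/*
		 * Pin or flag each item while still under the RCU lock, the
		 * way xfs_reclaim_inode_grab() does; items we fail to grab
		 * are skipped.
		 */
		for (i = 0; i < nr_found; i++) {
			if (grab(batch[i]))
				batch[i] = NULL;
		}
		rcu_read_unlock();

		for (i = 0; i < nr_found; i++) {
			if (batch[i])
				process(batch[i]);
		}
	} while (nr_found == DEMO_BATCH);
}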
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index fe78726196f8..e3a6ad27415f 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -32,6 +32,8 @@ typedef struct xfs_sync_work {
32#define SYNC_WAIT 0x0001 /* wait for i/o to complete */ 32#define SYNC_WAIT 0x0001 /* wait for i/o to complete */
33#define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */ 33#define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */
34 34
35extern struct workqueue_struct *xfs_syncd_wq; /* sync workqueue */
36
35int xfs_syncd_init(struct xfs_mount *mp); 37int xfs_syncd_init(struct xfs_mount *mp);
36void xfs_syncd_stop(struct xfs_mount *mp); 38void xfs_syncd_stop(struct xfs_mount *mp);
37 39
@@ -47,10 +49,10 @@ void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip);
47void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag, 49void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
48 struct xfs_inode *ip); 50 struct xfs_inode *ip);
49 51
50int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag); 52int xfs_sync_inode_grab(struct xfs_inode *ip);
51int xfs_inode_ag_iterator(struct xfs_mount *mp, 53int xfs_inode_ag_iterator(struct xfs_mount *mp,
52 int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), 54 int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags),
53 int flags, int tag, int write_lock, int *nr_to_scan); 55 int flags);
54 56
55void xfs_inode_shrinker_register(struct xfs_mount *mp); 57void xfs_inode_shrinker_register(struct xfs_mount *mp);
56void xfs_inode_shrinker_unregister(struct xfs_mount *mp); 58void xfs_inode_shrinker_unregister(struct xfs_mount *mp);
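xfs_inode_shrinker_register()/unregister(), declared here, hook the inode-cache shrinker shown earlier into the VM; with this series the ->shrink callback receives a struct shrink_control instead of separate nr_to_scan/gfp_mask arguments. A minimal registration sketch under that assumption follows (demo_* names are illustrative, not from the patch):

#include <linux/mm.h>		/* struct shrinker, struct shrink_control */

static int demo_nr_cached;	/* stand-in for a real reclaimable-object count */

static int
demo_cache_shrink(
	struct shrinker		*shrink,
	struct shrink_control	*sc)
{
	if (sc->nr_to_scan) {
		/* refuse work that could recurse back into the filesystem */
		if (!(sc->gfp_mask & __GFP_FS))
			return -1;

		/* ... reclaim up to sc->nr_to_scan objects here ... */
	}

	/* report how many objects are left for the VM to target next time */
	return demo_nr_cached;
}

static struct shrinker demo_cache_shrinker = {
	.shrink	= demo_cache_shrink,
	.seeks	= DEFAULT_SEEKS,
};

static void
demo_cache_shrinker_register(void)
{
	register_shrinker(&demo_cache_shrinker);
}

static void
demo_cache_shrinker_unregister(void)
{
	unregister_shrinker(&demo_cache_shrinker);
}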
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c
index 7bb5092d6ae4..ee2d2adaa438 100644
--- a/fs/xfs/linux-2.6/xfs_sysctl.c
+++ b/fs/xfs/linux-2.6/xfs_sysctl.c
@@ -18,6 +18,7 @@
18#include "xfs.h" 18#include "xfs.h"
19#include <linux/sysctl.h> 19#include <linux/sysctl.h>
20#include <linux/proc_fs.h> 20#include <linux/proc_fs.h>
21#include "xfs_error.h"
21 22
22static struct ctl_table_header *xfs_table_header; 23static struct ctl_table_header *xfs_table_header;
23 24
@@ -36,7 +37,7 @@ xfs_stats_clear_proc_handler(
36 ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos); 37 ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
37 38
38 if (!ret && write && *valp) { 39 if (!ret && write && *valp) {
39 printk("XFS Clearing xfsstats\n"); 40 xfs_notice(NULL, "Clearing xfsstats");
40 for_each_possible_cpu(c) { 41 for_each_possible_cpu(c) {
41 preempt_disable(); 42 preempt_disable();
42 /* save vn_active, it's a universal truth! */ 43 /* save vn_active, it's a universal truth! */
@@ -51,6 +52,26 @@ xfs_stats_clear_proc_handler(
51 52
52 return ret; 53 return ret;
53} 54}
55
56STATIC int
57xfs_panic_mask_proc_handler(
58 ctl_table *ctl,
59 int write,
60 void __user *buffer,
61 size_t *lenp,
62 loff_t *ppos)
63{
64 int ret, *valp = ctl->data;
65
66 ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
67 if (!ret && write) {
68 xfs_panic_mask = *valp;
69#ifdef DEBUG
70 xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES);
71#endif
72 }
73 return ret;
74}
54#endif /* CONFIG_PROC_FS */ 75#endif /* CONFIG_PROC_FS */
55 76
56static ctl_table xfs_table[] = { 77static ctl_table xfs_table[] = {
@@ -77,7 +98,7 @@ static ctl_table xfs_table[] = {
77 .data = &xfs_params.panic_mask.val, 98 .data = &xfs_params.panic_mask.val,
78 .maxlen = sizeof(int), 99 .maxlen = sizeof(int),
79 .mode = 0644, 100 .mode = 0644,
80 .proc_handler = proc_dointvec_minmax, 101 .proc_handler = xfs_panic_mask_proc_handler,
81 .extra1 = &xfs_params.panic_mask.min, 102 .extra1 = &xfs_params.panic_mask.min,
82 .extra2 = &xfs_params.panic_mask.max 103 .extra2 = &xfs_params.panic_mask.max
83 }, 104 },
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index be5dffd282a1..d48b7a579ae1 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -124,7 +124,7 @@ DEFINE_EVENT(xfs_perag_class, name, \
124 unsigned long caller_ip), \ 124 unsigned long caller_ip), \
125 TP_ARGS(mp, agno, refcount, caller_ip)) 125 TP_ARGS(mp, agno, refcount, caller_ip))
126DEFINE_PERAG_REF_EVENT(xfs_perag_get); 126DEFINE_PERAG_REF_EVENT(xfs_perag_get);
127DEFINE_PERAG_REF_EVENT(xfs_perag_get_reclaim); 127DEFINE_PERAG_REF_EVENT(xfs_perag_get_tag);
128DEFINE_PERAG_REF_EVENT(xfs_perag_put); 128DEFINE_PERAG_REF_EVENT(xfs_perag_put);
129DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim); 129DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim);
130DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim); 130DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim);
@@ -325,13 +325,12 @@ DEFINE_BUF_EVENT(xfs_buf_lock);
325DEFINE_BUF_EVENT(xfs_buf_lock_done); 325DEFINE_BUF_EVENT(xfs_buf_lock_done);
326DEFINE_BUF_EVENT(xfs_buf_cond_lock); 326DEFINE_BUF_EVENT(xfs_buf_cond_lock);
327DEFINE_BUF_EVENT(xfs_buf_unlock); 327DEFINE_BUF_EVENT(xfs_buf_unlock);
328DEFINE_BUF_EVENT(xfs_buf_ordered_retry);
329DEFINE_BUF_EVENT(xfs_buf_iowait); 328DEFINE_BUF_EVENT(xfs_buf_iowait);
330DEFINE_BUF_EVENT(xfs_buf_iowait_done); 329DEFINE_BUF_EVENT(xfs_buf_iowait_done);
331DEFINE_BUF_EVENT(xfs_buf_delwri_queue); 330DEFINE_BUF_EVENT(xfs_buf_delwri_queue);
332DEFINE_BUF_EVENT(xfs_buf_delwri_dequeue); 331DEFINE_BUF_EVENT(xfs_buf_delwri_dequeue);
333DEFINE_BUF_EVENT(xfs_buf_delwri_split); 332DEFINE_BUF_EVENT(xfs_buf_delwri_split);
334DEFINE_BUF_EVENT(xfs_buf_get_noaddr); 333DEFINE_BUF_EVENT(xfs_buf_get_uncached);
335DEFINE_BUF_EVENT(xfs_bdstrat_shut); 334DEFINE_BUF_EVENT(xfs_bdstrat_shut);
336DEFINE_BUF_EVENT(xfs_buf_item_relse); 335DEFINE_BUF_EVENT(xfs_buf_item_relse);
337DEFINE_BUF_EVENT(xfs_buf_item_iodone); 336DEFINE_BUF_EVENT(xfs_buf_item_iodone);
@@ -767,8 +766,8 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
767 __field(int, curr_res) 766 __field(int, curr_res)
768 __field(int, unit_res) 767 __field(int, unit_res)
769 __field(unsigned int, flags) 768 __field(unsigned int, flags)
770 __field(void *, reserve_headq) 769 __field(int, reserveq)
771 __field(void *, write_headq) 770 __field(int, writeq)
772 __field(int, grant_reserve_cycle) 771 __field(int, grant_reserve_cycle)
773 __field(int, grant_reserve_bytes) 772 __field(int, grant_reserve_bytes)
774 __field(int, grant_write_cycle) 773 __field(int, grant_write_cycle)
@@ -785,19 +784,21 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
785 __entry->curr_res = tic->t_curr_res; 784 __entry->curr_res = tic->t_curr_res;
786 __entry->unit_res = tic->t_unit_res; 785 __entry->unit_res = tic->t_unit_res;
787 __entry->flags = tic->t_flags; 786 __entry->flags = tic->t_flags;
788 __entry->reserve_headq = log->l_reserve_headq; 787 __entry->reserveq = list_empty(&log->l_reserveq);
789 __entry->write_headq = log->l_write_headq; 788 __entry->writeq = list_empty(&log->l_writeq);
790 __entry->grant_reserve_cycle = log->l_grant_reserve_cycle; 789 xlog_crack_grant_head(&log->l_grant_reserve_head,
791 __entry->grant_reserve_bytes = log->l_grant_reserve_bytes; 790 &__entry->grant_reserve_cycle,
792 __entry->grant_write_cycle = log->l_grant_write_cycle; 791 &__entry->grant_reserve_bytes);
793 __entry->grant_write_bytes = log->l_grant_write_bytes; 792 xlog_crack_grant_head(&log->l_grant_write_head,
793 &__entry->grant_write_cycle,
794 &__entry->grant_write_bytes);
794 __entry->curr_cycle = log->l_curr_cycle; 795 __entry->curr_cycle = log->l_curr_cycle;
795 __entry->curr_block = log->l_curr_block; 796 __entry->curr_block = log->l_curr_block;
796 __entry->tail_lsn = log->l_tail_lsn; 797 __entry->tail_lsn = atomic64_read(&log->l_tail_lsn);
797 ), 798 ),
798 TP_printk("dev %d:%d type %s t_ocnt %u t_cnt %u t_curr_res %u " 799 TP_printk("dev %d:%d type %s t_ocnt %u t_cnt %u t_curr_res %u "
799 "t_unit_res %u t_flags %s reserve_headq 0x%p " 800 "t_unit_res %u t_flags %s reserveq %s "
800 "write_headq 0x%p grant_reserve_cycle %d " 801 "writeq %s grant_reserve_cycle %d "
801 "grant_reserve_bytes %d grant_write_cycle %d " 802 "grant_reserve_bytes %d grant_write_cycle %d "
802 "grant_write_bytes %d curr_cycle %d curr_block %d " 803 "grant_write_bytes %d curr_cycle %d curr_block %d "
803 "tail_cycle %d tail_block %d", 804 "tail_cycle %d tail_block %d",
@@ -808,8 +809,8 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
808 __entry->curr_res, 809 __entry->curr_res,
809 __entry->unit_res, 810 __entry->unit_res,
810 __print_flags(__entry->flags, "|", XLOG_TIC_FLAGS), 811 __print_flags(__entry->flags, "|", XLOG_TIC_FLAGS),
811 __entry->reserve_headq, 812 __entry->reserveq ? "empty" : "active",
812 __entry->write_headq, 813 __entry->writeq ? "empty" : "active",
813 __entry->grant_reserve_cycle, 814 __entry->grant_reserve_cycle,
814 __entry->grant_reserve_bytes, 815 __entry->grant_reserve_bytes,
815 __entry->grant_write_cycle, 816 __entry->grant_write_cycle,
@@ -836,6 +837,7 @@ DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep1);
836DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake1); 837DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake1);
837DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep2); 838DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep2);
838DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake2); 839DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake2);
840DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake_up);
839DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_enter); 841DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_enter);
840DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_exit); 842DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_exit);
841DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_error); 843DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_error);
@@ -843,6 +845,7 @@ DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep1);
843DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake1); 845DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake1);
844DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep2); 846DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep2);
845DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake2); 847DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake2);
848DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake_up);
846DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter); 849DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter);
847DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit); 850DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit);
848DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub); 851DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub);
@@ -936,10 +939,10 @@ DEFINE_PAGE_EVENT(xfs_writepage);
936DEFINE_PAGE_EVENT(xfs_releasepage); 939DEFINE_PAGE_EVENT(xfs_releasepage);
937DEFINE_PAGE_EVENT(xfs_invalidatepage); 940DEFINE_PAGE_EVENT(xfs_invalidatepage);
938 941
939DECLARE_EVENT_CLASS(xfs_iomap_class, 942DECLARE_EVENT_CLASS(xfs_imap_class,
940 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, 943 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
941 int flags, struct xfs_bmbt_irec *irec), 944 int type, struct xfs_bmbt_irec *irec),
942 TP_ARGS(ip, offset, count, flags, irec), 945 TP_ARGS(ip, offset, count, type, irec),
943 TP_STRUCT__entry( 946 TP_STRUCT__entry(
944 __field(dev_t, dev) 947 __field(dev_t, dev)
945 __field(xfs_ino_t, ino) 948 __field(xfs_ino_t, ino)
@@ -947,7 +950,7 @@ DECLARE_EVENT_CLASS(xfs_iomap_class,
947 __field(loff_t, new_size) 950 __field(loff_t, new_size)
948 __field(loff_t, offset) 951 __field(loff_t, offset)
949 __field(size_t, count) 952 __field(size_t, count)
950 __field(int, flags) 953 __field(int, type)
951 __field(xfs_fileoff_t, startoff) 954 __field(xfs_fileoff_t, startoff)
952 __field(xfs_fsblock_t, startblock) 955 __field(xfs_fsblock_t, startblock)
953 __field(xfs_filblks_t, blockcount) 956 __field(xfs_filblks_t, blockcount)
@@ -959,13 +962,13 @@ DECLARE_EVENT_CLASS(xfs_iomap_class,
959 __entry->new_size = ip->i_new_size; 962 __entry->new_size = ip->i_new_size;
960 __entry->offset = offset; 963 __entry->offset = offset;
961 __entry->count = count; 964 __entry->count = count;
962 __entry->flags = flags; 965 __entry->type = type;
963 __entry->startoff = irec ? irec->br_startoff : 0; 966 __entry->startoff = irec ? irec->br_startoff : 0;
964 __entry->startblock = irec ? irec->br_startblock : 0; 967 __entry->startblock = irec ? irec->br_startblock : 0;
965 __entry->blockcount = irec ? irec->br_blockcount : 0; 968 __entry->blockcount = irec ? irec->br_blockcount : 0;
966 ), 969 ),
967 TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " 970 TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
968 "offset 0x%llx count %zd flags %s " 971 "offset 0x%llx count %zd type %s "
969 "startoff 0x%llx startblock %lld blockcount 0x%llx", 972 "startoff 0x%llx startblock %lld blockcount 0x%llx",
970 MAJOR(__entry->dev), MINOR(__entry->dev), 973 MAJOR(__entry->dev), MINOR(__entry->dev),
971 __entry->ino, 974 __entry->ino,
@@ -973,20 +976,21 @@ DECLARE_EVENT_CLASS(xfs_iomap_class,
973 __entry->new_size, 976 __entry->new_size,
974 __entry->offset, 977 __entry->offset,
975 __entry->count, 978 __entry->count,
976 __print_flags(__entry->flags, "|", BMAPI_FLAGS), 979 __print_symbolic(__entry->type, XFS_IO_TYPES),
977 __entry->startoff, 980 __entry->startoff,
978 (__int64_t)__entry->startblock, 981 (__int64_t)__entry->startblock,
979 __entry->blockcount) 982 __entry->blockcount)
980) 983)
981 984
982#define DEFINE_IOMAP_EVENT(name) \ 985#define DEFINE_IOMAP_EVENT(name) \
983DEFINE_EVENT(xfs_iomap_class, name, \ 986DEFINE_EVENT(xfs_imap_class, name, \
984 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \ 987 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \
985 int flags, struct xfs_bmbt_irec *irec), \ 988 int type, struct xfs_bmbt_irec *irec), \
986 TP_ARGS(ip, offset, count, flags, irec)) 989 TP_ARGS(ip, offset, count, type, irec))
987DEFINE_IOMAP_EVENT(xfs_iomap_enter); 990DEFINE_IOMAP_EVENT(xfs_map_blocks_found);
988DEFINE_IOMAP_EVENT(xfs_iomap_found); 991DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc);
989DEFINE_IOMAP_EVENT(xfs_iomap_alloc); 992DEFINE_IOMAP_EVENT(xfs_get_blocks_found);
993DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc);
990 994
991DECLARE_EVENT_CLASS(xfs_simple_io_class, 995DECLARE_EVENT_CLASS(xfs_simple_io_class,
992 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), 996 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
@@ -1023,6 +1027,7 @@ DEFINE_EVENT(xfs_simple_io_class, name, \
1023 TP_ARGS(ip, offset, count)) 1027 TP_ARGS(ip, offset, count))
1024DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc); 1028DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc);
1025DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert); 1029DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert);
1030DEFINE_SIMPLE_IO_EVENT(xfs_get_blocks_notfound);
1026 1031
1027 1032
1028TRACE_EVENT(xfs_itruncate_start, 1033TRACE_EVENT(xfs_itruncate_start,
@@ -1146,44 +1151,7 @@ TRACE_EVENT(xfs_bunmap,
1146 1151
1147); 1152);
1148 1153
1149#define XFS_BUSY_SYNC \ 1154DECLARE_EVENT_CLASS(xfs_busy_class,
1150 { 0, "async" }, \
1151 { 1, "sync" }
1152
1153TRACE_EVENT(xfs_alloc_busy,
1154 TP_PROTO(struct xfs_trans *trans, xfs_agnumber_t agno,
1155 xfs_agblock_t agbno, xfs_extlen_t len, int sync),
1156 TP_ARGS(trans, agno, agbno, len, sync),
1157 TP_STRUCT__entry(
1158 __field(dev_t, dev)
1159 __field(struct xfs_trans *, tp)
1160 __field(int, tid)
1161 __field(xfs_agnumber_t, agno)
1162 __field(xfs_agblock_t, agbno)
1163 __field(xfs_extlen_t, len)
1164 __field(int, sync)
1165 ),
1166 TP_fast_assign(
1167 __entry->dev = trans->t_mountp->m_super->s_dev;
1168 __entry->tp = trans;
1169 __entry->tid = trans->t_ticket->t_tid;
1170 __entry->agno = agno;
1171 __entry->agbno = agbno;
1172 __entry->len = len;
1173 __entry->sync = sync;
1174 ),
1175 TP_printk("dev %d:%d trans 0x%p tid 0x%x agno %u agbno %u len %u %s",
1176 MAJOR(__entry->dev), MINOR(__entry->dev),
1177 __entry->tp,
1178 __entry->tid,
1179 __entry->agno,
1180 __entry->agbno,
1181 __entry->len,
1182 __print_symbolic(__entry->sync, XFS_BUSY_SYNC))
1183
1184);
1185
1186TRACE_EVENT(xfs_alloc_unbusy,
1187 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, 1155 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
1188 xfs_agblock_t agbno, xfs_extlen_t len), 1156 xfs_agblock_t agbno, xfs_extlen_t len),
1189 TP_ARGS(mp, agno, agbno, len), 1157 TP_ARGS(mp, agno, agbno, len),
@@ -1205,35 +1173,45 @@ TRACE_EVENT(xfs_alloc_unbusy,
1205 __entry->agbno, 1173 __entry->agbno,
1206 __entry->len) 1174 __entry->len)
1207); 1175);
1208 1176#define DEFINE_BUSY_EVENT(name) \
1209#define XFS_BUSY_STATES \ 1177DEFINE_EVENT(xfs_busy_class, name, \
1210 { 0, "missing" }, \ 1178 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
1211 { 1, "found" } 1179 xfs_agblock_t agbno, xfs_extlen_t len), \
1212 1180 TP_ARGS(mp, agno, agbno, len))
1213TRACE_EVENT(xfs_alloc_busysearch, 1181DEFINE_BUSY_EVENT(xfs_alloc_busy);
1182DEFINE_BUSY_EVENT(xfs_alloc_busy_enomem);
1183DEFINE_BUSY_EVENT(xfs_alloc_busy_force);
1184DEFINE_BUSY_EVENT(xfs_alloc_busy_reuse);
1185DEFINE_BUSY_EVENT(xfs_alloc_busy_clear);
1186
1187TRACE_EVENT(xfs_alloc_busy_trim,
1214 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, 1188 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
1215 xfs_agblock_t agbno, xfs_extlen_t len, int found), 1189 xfs_agblock_t agbno, xfs_extlen_t len,
1216 TP_ARGS(mp, agno, agbno, len, found), 1190 xfs_agblock_t tbno, xfs_extlen_t tlen),
1191 TP_ARGS(mp, agno, agbno, len, tbno, tlen),
1217 TP_STRUCT__entry( 1192 TP_STRUCT__entry(
1218 __field(dev_t, dev) 1193 __field(dev_t, dev)
1219 __field(xfs_agnumber_t, agno) 1194 __field(xfs_agnumber_t, agno)
1220 __field(xfs_agblock_t, agbno) 1195 __field(xfs_agblock_t, agbno)
1221 __field(xfs_extlen_t, len) 1196 __field(xfs_extlen_t, len)
1222 __field(int, found) 1197 __field(xfs_agblock_t, tbno)
1198 __field(xfs_extlen_t, tlen)
1223 ), 1199 ),
1224 TP_fast_assign( 1200 TP_fast_assign(
1225 __entry->dev = mp->m_super->s_dev; 1201 __entry->dev = mp->m_super->s_dev;
1226 __entry->agno = agno; 1202 __entry->agno = agno;
1227 __entry->agbno = agbno; 1203 __entry->agbno = agbno;
1228 __entry->len = len; 1204 __entry->len = len;
1229 __entry->found = found; 1205 __entry->tbno = tbno;
1206 __entry->tlen = tlen;
1230 ), 1207 ),
1231 TP_printk("dev %d:%d agno %u agbno %u len %u %s", 1208 TP_printk("dev %d:%d agno %u agbno %u len %u tbno %u tlen %u",
1232 MAJOR(__entry->dev), MINOR(__entry->dev), 1209 MAJOR(__entry->dev), MINOR(__entry->dev),
1233 __entry->agno, 1210 __entry->agno,
1234 __entry->agbno, 1211 __entry->agbno,
1235 __entry->len, 1212 __entry->len,
1236 __print_symbolic(__entry->found, XFS_BUSY_STATES)) 1213 __entry->tbno,
1214 __entry->tlen)
1237); 1215);
1238 1216
1239TRACE_EVENT(xfs_trans_commit_lsn, 1217TRACE_EVENT(xfs_trans_commit_lsn,
@@ -1413,7 +1391,7 @@ DECLARE_EVENT_CLASS(xfs_alloc_class,
1413 __entry->wasfromfl, 1391 __entry->wasfromfl,
1414 __entry->isfl, 1392 __entry->isfl,
1415 __entry->userdata, 1393 __entry->userdata,
1416 __entry->firstblock) 1394 (unsigned long long)__entry->firstblock)
1417) 1395)
1418 1396
1419#define DEFINE_ALLOC_EVENT(name) \ 1397#define DEFINE_ALLOC_EVENT(name) \
@@ -1421,17 +1399,21 @@ DEFINE_EVENT(xfs_alloc_class, name, \
1421 TP_PROTO(struct xfs_alloc_arg *args), \ 1399 TP_PROTO(struct xfs_alloc_arg *args), \
1422 TP_ARGS(args)) 1400 TP_ARGS(args))
1423DEFINE_ALLOC_EVENT(xfs_alloc_exact_done); 1401DEFINE_ALLOC_EVENT(xfs_alloc_exact_done);
1402DEFINE_ALLOC_EVENT(xfs_alloc_exact_notfound);
1424DEFINE_ALLOC_EVENT(xfs_alloc_exact_error); 1403DEFINE_ALLOC_EVENT(xfs_alloc_exact_error);
1425DEFINE_ALLOC_EVENT(xfs_alloc_near_nominleft); 1404DEFINE_ALLOC_EVENT(xfs_alloc_near_nominleft);
1426DEFINE_ALLOC_EVENT(xfs_alloc_near_first); 1405DEFINE_ALLOC_EVENT(xfs_alloc_near_first);
1427DEFINE_ALLOC_EVENT(xfs_alloc_near_greater); 1406DEFINE_ALLOC_EVENT(xfs_alloc_near_greater);
1428DEFINE_ALLOC_EVENT(xfs_alloc_near_lesser); 1407DEFINE_ALLOC_EVENT(xfs_alloc_near_lesser);
1429DEFINE_ALLOC_EVENT(xfs_alloc_near_error); 1408DEFINE_ALLOC_EVENT(xfs_alloc_near_error);
1409DEFINE_ALLOC_EVENT(xfs_alloc_near_noentry);
1410DEFINE_ALLOC_EVENT(xfs_alloc_near_busy);
1430DEFINE_ALLOC_EVENT(xfs_alloc_size_neither); 1411DEFINE_ALLOC_EVENT(xfs_alloc_size_neither);
1431DEFINE_ALLOC_EVENT(xfs_alloc_size_noentry); 1412DEFINE_ALLOC_EVENT(xfs_alloc_size_noentry);
1432DEFINE_ALLOC_EVENT(xfs_alloc_size_nominleft); 1413DEFINE_ALLOC_EVENT(xfs_alloc_size_nominleft);
1433DEFINE_ALLOC_EVENT(xfs_alloc_size_done); 1414DEFINE_ALLOC_EVENT(xfs_alloc_size_done);
1434DEFINE_ALLOC_EVENT(xfs_alloc_size_error); 1415DEFINE_ALLOC_EVENT(xfs_alloc_size_error);
1416DEFINE_ALLOC_EVENT(xfs_alloc_size_busy);
1435DEFINE_ALLOC_EVENT(xfs_alloc_small_freelist); 1417DEFINE_ALLOC_EVENT(xfs_alloc_small_freelist);
1436DEFINE_ALLOC_EVENT(xfs_alloc_small_notenough); 1418DEFINE_ALLOC_EVENT(xfs_alloc_small_notenough);
1437DEFINE_ALLOC_EVENT(xfs_alloc_small_done); 1419DEFINE_ALLOC_EVENT(xfs_alloc_small_done);
@@ -1753,6 +1735,39 @@ DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_recover);
1753DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_cancel); 1735DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_cancel);
1754DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_skip); 1736DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_skip);
1755 1737
1738DECLARE_EVENT_CLASS(xfs_discard_class,
1739 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
1740 xfs_agblock_t agbno, xfs_extlen_t len),
1741 TP_ARGS(mp, agno, agbno, len),
1742 TP_STRUCT__entry(
1743 __field(dev_t, dev)
1744 __field(xfs_agnumber_t, agno)
1745 __field(xfs_agblock_t, agbno)
1746 __field(xfs_extlen_t, len)
1747 ),
1748 TP_fast_assign(
1749 __entry->dev = mp->m_super->s_dev;
1750 __entry->agno = agno;
1751 __entry->agbno = agbno;
1752 __entry->len = len;
1753 ),
1754 TP_printk("dev %d:%d agno %u agbno %u len %u\n",
1755 MAJOR(__entry->dev), MINOR(__entry->dev),
1756 __entry->agno,
1757 __entry->agbno,
1758 __entry->len)
1759)
1760
1761#define DEFINE_DISCARD_EVENT(name) \
1762DEFINE_EVENT(xfs_discard_class, name, \
1763 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
1764 xfs_agblock_t agbno, xfs_extlen_t len), \
1765 TP_ARGS(mp, agno, agbno, len))
1766DEFINE_DISCARD_EVENT(xfs_discard_extent);
1767DEFINE_DISCARD_EVENT(xfs_discard_toosmall);
1768DEFINE_DISCARD_EVENT(xfs_discard_exclude);
1769DEFINE_DISCARD_EVENT(xfs_discard_busy);
1770
1756#endif /* _TRACE_XFS_H */ 1771#endif /* _TRACE_XFS_H */
1757 1772
1758#undef TRACE_INCLUDE_PATH 1773#undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/linux-2.6/xfs_version.h b/fs/xfs/linux-2.6/xfs_version.h
deleted file mode 100644
index f8d279d7563a..000000000000
--- a/fs/xfs/linux-2.6/xfs_version.h
+++ /dev/null
@@ -1,29 +0,0 @@
1/*
2 * Copyright (c) 2001-2002,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_VERSION_H__
19#define __XFS_VERSION_H__
20
21/*
22 * Dummy file that can contain a timestamp to put into the
23 * XFS init string, to help users keep track of what they're
24 * running
25 */
26
27#define XFS_VERSION_STRING "SGI XFS"
28
29#endif /* __XFS_VERSION_H__ */
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index e1a2f6800e01..6fa214603819 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -149,7 +149,6 @@ xfs_qm_dqdestroy(
149 ASSERT(list_empty(&dqp->q_freelist)); 149 ASSERT(list_empty(&dqp->q_freelist));
150 150
151 mutex_destroy(&dqp->q_qlock); 151 mutex_destroy(&dqp->q_qlock);
152 sv_destroy(&dqp->q_pinwait);
153 kmem_zone_free(xfs_Gqm->qm_dqzone, dqp); 152 kmem_zone_free(xfs_Gqm->qm_dqzone, dqp);
154 153
155 atomic_dec(&xfs_Gqm->qm_totaldquots); 154 atomic_dec(&xfs_Gqm->qm_totaldquots);
@@ -463,87 +462,68 @@ xfs_qm_dqtobp(
463 uint flags) 462 uint flags)
464{ 463{
465 xfs_bmbt_irec_t map; 464 xfs_bmbt_irec_t map;
466 int nmaps, error; 465 int nmaps = 1, error;
467 xfs_buf_t *bp; 466 xfs_buf_t *bp;
468 xfs_inode_t *quotip; 467 xfs_inode_t *quotip = XFS_DQ_TO_QIP(dqp);
469 xfs_mount_t *mp; 468 xfs_mount_t *mp = dqp->q_mount;
470 xfs_disk_dquot_t *ddq; 469 xfs_disk_dquot_t *ddq;
471 xfs_dqid_t id; 470 xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id);
472 boolean_t newdquot;
473 xfs_trans_t *tp = (tpp ? *tpp : NULL); 471 xfs_trans_t *tp = (tpp ? *tpp : NULL);
474 472
475 mp = dqp->q_mount; 473 dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk;
476 id = be32_to_cpu(dqp->q_core.d_id);
477 nmaps = 1;
478 newdquot = B_FALSE;
479 474
480 /* 475 xfs_ilock(quotip, XFS_ILOCK_SHARED);
481 * If we don't know where the dquot lives, find out. 476 if (XFS_IS_THIS_QUOTA_OFF(dqp)) {
482 */
483 if (dqp->q_blkno == (xfs_daddr_t) 0) {
484 /* We use the id as an index */
485 dqp->q_fileoffset = (xfs_fileoff_t)id /
486 mp->m_quotainfo->qi_dqperchunk;
487 nmaps = 1;
488 quotip = XFS_DQ_TO_QIP(dqp);
489 xfs_ilock(quotip, XFS_ILOCK_SHARED);
490 /* 477 /*
491 * Return if this type of quotas is turned off while we didn't 478 * Return if this type of quotas is turned off while we
492 * have an inode lock 479 * didn't have the quota inode lock.
493 */ 480 */
494 if (XFS_IS_THIS_QUOTA_OFF(dqp)) { 481 xfs_iunlock(quotip, XFS_ILOCK_SHARED);
495 xfs_iunlock(quotip, XFS_ILOCK_SHARED); 482 return ESRCH;
496 return (ESRCH); 483 }
497 } 484
485 /*
486 * Find the block map; no allocations yet
487 */
488 error = xfs_bmapi(NULL, quotip, dqp->q_fileoffset,
489 XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA,
490 NULL, 0, &map, &nmaps, NULL);
491
492 xfs_iunlock(quotip, XFS_ILOCK_SHARED);
493 if (error)
494 return error;
495
496 ASSERT(nmaps == 1);
497 ASSERT(map.br_blockcount == 1);
498
499 /*
500 * Offset of dquot in the (fixed sized) dquot chunk.
501 */
502 dqp->q_bufoffset = (id % mp->m_quotainfo->qi_dqperchunk) *
503 sizeof(xfs_dqblk_t);
504
505 ASSERT(map.br_startblock != DELAYSTARTBLOCK);
506 if (map.br_startblock == HOLESTARTBLOCK) {
498 /* 507 /*
499 * Find the block map; no allocations yet 508 * We don't allocate unless we're asked to
500 */ 509 */
501 error = xfs_bmapi(NULL, quotip, dqp->q_fileoffset, 510 if (!(flags & XFS_QMOPT_DQALLOC))
502 XFS_DQUOT_CLUSTER_SIZE_FSB, 511 return ENOENT;
503 XFS_BMAPI_METADATA,
504 NULL, 0, &map, &nmaps, NULL);
505 512
506 xfs_iunlock(quotip, XFS_ILOCK_SHARED); 513 ASSERT(tp);
514 error = xfs_qm_dqalloc(tpp, mp, dqp, quotip,
515 dqp->q_fileoffset, &bp);
507 if (error) 516 if (error)
508 return (error); 517 return error;
509 ASSERT(nmaps == 1); 518 tp = *tpp;
510 ASSERT(map.br_blockcount == 1); 519 } else {
520 trace_xfs_dqtobp_read(dqp);
511 521
512 /* 522 /*
513 * offset of dquot in the (fixed sized) dquot chunk. 523 * store the blkno etc so that we don't have to do the
524 * mapping all the time
514 */ 525 */
515 dqp->q_bufoffset = (id % mp->m_quotainfo->qi_dqperchunk) * 526 dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
516 sizeof(xfs_dqblk_t);
517 if (map.br_startblock == HOLESTARTBLOCK) {
518 /*
519 * We don't allocate unless we're asked to
520 */
521 if (!(flags & XFS_QMOPT_DQALLOC))
522 return (ENOENT);
523
524 ASSERT(tp);
525 if ((error = xfs_qm_dqalloc(tpp, mp, dqp, quotip,
526 dqp->q_fileoffset, &bp)))
527 return (error);
528 tp = *tpp;
529 newdquot = B_TRUE;
530 } else {
531 /*
532 * store the blkno etc so that we don't have to do the
533 * mapping all the time
534 */
535 dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
536 }
537 }
538 ASSERT(dqp->q_blkno != DELAYSTARTBLOCK);
539 ASSERT(dqp->q_blkno != HOLESTARTBLOCK);
540
541 /*
542 * Read in the buffer, unless we've just done the allocation
543 * (in which case we already have the buf).
544 */
545 if (!newdquot) {
546 trace_xfs_dqtobp_read(dqp);
547 527
548 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, 528 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
549 dqp->q_blkno, 529 dqp->q_blkno,
@@ -552,20 +532,22 @@ xfs_qm_dqtobp(
552 if (error || !bp) 532 if (error || !bp)
553 return XFS_ERROR(error); 533 return XFS_ERROR(error);
554 } 534 }
535
555 ASSERT(XFS_BUF_ISBUSY(bp)); 536 ASSERT(XFS_BUF_ISBUSY(bp));
556 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); 537 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
557 538
558 /* 539 /*
559 * calculate the location of the dquot inside the buffer. 540 * calculate the location of the dquot inside the buffer.
560 */ 541 */
561 ddq = (xfs_disk_dquot_t *)((char *)XFS_BUF_PTR(bp) + dqp->q_bufoffset); 542 ddq = (struct xfs_disk_dquot *)(XFS_BUF_PTR(bp) + dqp->q_bufoffset);
562 543
563 /* 544 /*
564 * A simple sanity check in case we got a corrupted dquot... 545 * A simple sanity check in case we got a corrupted dquot...
565 */ 546 */
566 if (xfs_qm_dqcheck(ddq, id, dqp->dq_flags & XFS_DQ_ALLTYPES, 547 error = xfs_qm_dqcheck(mp, ddq, id, dqp->dq_flags & XFS_DQ_ALLTYPES,
567 flags & (XFS_QMOPT_DQREPAIR|XFS_QMOPT_DOWARN), 548 flags & (XFS_QMOPT_DQREPAIR|XFS_QMOPT_DOWARN),
568 "dqtobp")) { 549 "dqtobp");
550 if (error) {
569 if (!(flags & XFS_QMOPT_DQREPAIR)) { 551 if (!(flags & XFS_QMOPT_DQREPAIR)) {
570 xfs_trans_brelse(tp, bp); 552 xfs_trans_brelse(tp, bp);
571 return XFS_ERROR(EIO); 553 return XFS_ERROR(EIO);
@@ -618,7 +600,7 @@ xfs_qm_dqread(
618 600
619 /* 601 /*
620 * Reservation counters are defined as reservation plus current usage 602 * Reservation counters are defined as reservation plus current usage
621 * to avoid having to add everytime. 603 * to avoid having to add every time.
622 */ 604 */
623 dqp->q_res_bcount = be64_to_cpu(ddqp->d_bcount); 605 dqp->q_res_bcount = be64_to_cpu(ddqp->d_bcount);
624 dqp->q_res_icount = be64_to_cpu(ddqp->d_icount); 606 dqp->q_res_icount = be64_to_cpu(ddqp->d_icount);
@@ -846,7 +828,7 @@ xfs_qm_dqget(
846 if (xfs_do_dqerror) { 828 if (xfs_do_dqerror) {
847 if ((xfs_dqerror_target == mp->m_ddev_targp) && 829 if ((xfs_dqerror_target == mp->m_ddev_targp) &&
848 (xfs_dqreq_num++ % xfs_dqerror_mod) == 0) { 830 (xfs_dqreq_num++ % xfs_dqerror_mod) == 0) {
849 cmn_err(CE_DEBUG, "Returning error in dqget"); 831 xfs_debug(mp, "Returning error in dqget");
850 return (EIO); 832 return (EIO);
851 } 833 }
852 } 834 }
@@ -1176,18 +1158,18 @@ xfs_qm_dqflush(
1176 xfs_dquot_t *dqp, 1158 xfs_dquot_t *dqp,
1177 uint flags) 1159 uint flags)
1178{ 1160{
1179 xfs_mount_t *mp; 1161 struct xfs_mount *mp = dqp->q_mount;
1180 xfs_buf_t *bp; 1162 struct xfs_buf *bp;
1181 xfs_disk_dquot_t *ddqp; 1163 struct xfs_disk_dquot *ddqp;
1182 int error; 1164 int error;
1183 1165
1184 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 1166 ASSERT(XFS_DQ_IS_LOCKED(dqp));
1185 ASSERT(!completion_done(&dqp->q_flush)); 1167 ASSERT(!completion_done(&dqp->q_flush));
1168
1186 trace_xfs_dqflush(dqp); 1169 trace_xfs_dqflush(dqp);
1187 1170
1188 /* 1171 /*
1189 * If not dirty, or it's pinned and we are not supposed to 1172 * If not dirty, or it's pinned and we are not supposed to block, nada.
1190 * block, nada.
1191 */ 1173 */
1192 if (!XFS_DQ_IS_DIRTY(dqp) || 1174 if (!XFS_DQ_IS_DIRTY(dqp) ||
1193 (!(flags & SYNC_WAIT) && atomic_read(&dqp->q_pincount) > 0)) { 1175 (!(flags & SYNC_WAIT) && atomic_read(&dqp->q_pincount) > 0)) {
@@ -1201,40 +1183,47 @@ xfs_qm_dqflush(
1201 * down forcibly. If that's the case we must not write this dquot 1183 * down forcibly. If that's the case we must not write this dquot
1202 * to disk, because the log record didn't make it to disk! 1184 * to disk, because the log record didn't make it to disk!
1203 */ 1185 */
1204 if (XFS_FORCED_SHUTDOWN(dqp->q_mount)) { 1186 if (XFS_FORCED_SHUTDOWN(mp)) {
1205 dqp->dq_flags &= ~(XFS_DQ_DIRTY); 1187 dqp->dq_flags &= ~XFS_DQ_DIRTY;
1206 xfs_dqfunlock(dqp); 1188 xfs_dqfunlock(dqp);
1207 return XFS_ERROR(EIO); 1189 return XFS_ERROR(EIO);
1208 } 1190 }
1209 1191
1210 /* 1192 /*
1211 * Get the buffer containing the on-disk dquot 1193 * Get the buffer containing the on-disk dquot
1212 * We don't need a transaction envelope because we know that the
1213 * the ondisk-dquot has already been allocated for.
1214 */ 1194 */
1215 if ((error = xfs_qm_dqtobp(NULL, dqp, &ddqp, &bp, XFS_QMOPT_DOWARN))) { 1195 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno,
1196 mp->m_quotainfo->qi_dqchunklen, 0, &bp);
1197 if (error) {
1216 ASSERT(error != ENOENT); 1198 ASSERT(error != ENOENT);
1217 /*
1218 * Quotas could have gotten turned off (ESRCH)
1219 */
1220 xfs_dqfunlock(dqp); 1199 xfs_dqfunlock(dqp);
1221 return (error); 1200 return error;
1222 } 1201 }
1223 1202
1224 if (xfs_qm_dqcheck(&dqp->q_core, be32_to_cpu(ddqp->d_id), 1203 /*
1225 0, XFS_QMOPT_DOWARN, "dqflush (incore copy)")) { 1204 * Calculate the location of the dquot inside the buffer.
1226 xfs_force_shutdown(dqp->q_mount, SHUTDOWN_CORRUPT_INCORE); 1205 */
1206 ddqp = (struct xfs_disk_dquot *)(XFS_BUF_PTR(bp) + dqp->q_bufoffset);
1207
1208 /*
1209 * A simple sanity check in case we got a corrupted dquot..
1210 */
1211 error = xfs_qm_dqcheck(mp, &dqp->q_core, be32_to_cpu(ddqp->d_id), 0,
1212 XFS_QMOPT_DOWARN, "dqflush (incore copy)");
1213 if (error) {
1214 xfs_buf_relse(bp);
1215 xfs_dqfunlock(dqp);
1216 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
1227 return XFS_ERROR(EIO); 1217 return XFS_ERROR(EIO);
1228 } 1218 }
1229 1219
1230 /* This is the only portion of data that needs to persist */ 1220 /* This is the only portion of data that needs to persist */
1231 memcpy(ddqp, &(dqp->q_core), sizeof(xfs_disk_dquot_t)); 1221 memcpy(ddqp, &dqp->q_core, sizeof(xfs_disk_dquot_t));
1232 1222
1233 /* 1223 /*
1234 * Clear the dirty field and remember the flush lsn for later use. 1224 * Clear the dirty field and remember the flush lsn for later use.
1235 */ 1225 */
1236 dqp->dq_flags &= ~(XFS_DQ_DIRTY); 1226 dqp->dq_flags &= ~XFS_DQ_DIRTY;
1237 mp = dqp->q_mount;
1238 1227
1239 xfs_trans_ail_copy_lsn(mp->m_ail, &dqp->q_logitem.qli_flush_lsn, 1228 xfs_trans_ail_copy_lsn(mp->m_ail, &dqp->q_logitem.qli_flush_lsn,
1240 &dqp->q_logitem.qli_item.li_lsn); 1229 &dqp->q_logitem.qli_item.li_lsn);
@@ -1404,8 +1393,8 @@ xfs_qm_dqpurge(
1404 */ 1393 */
1405 error = xfs_qm_dqflush(dqp, SYNC_WAIT); 1394 error = xfs_qm_dqflush(dqp, SYNC_WAIT);
1406 if (error) 1395 if (error)
1407 xfs_fs_cmn_err(CE_WARN, mp, 1396 xfs_warn(mp, "%s: dquot %p flush failed",
1408 "xfs_qm_dqpurge: dquot %p flush failed", dqp); 1397 __func__, dqp);
1409 xfs_dqflock(dqp); 1398 xfs_dqflock(dqp);
1410 } 1399 }
1411 ASSERT(atomic_read(&dqp->q_pincount) == 0); 1400 ASSERT(atomic_read(&dqp->q_pincount) == 0);
@@ -1438,36 +1427,38 @@ xfs_qm_dqpurge(
1438void 1427void
1439xfs_qm_dqprint(xfs_dquot_t *dqp) 1428xfs_qm_dqprint(xfs_dquot_t *dqp)
1440{ 1429{
1441 cmn_err(CE_DEBUG, "-----------KERNEL DQUOT----------------"); 1430 struct xfs_mount *mp = dqp->q_mount;
1442 cmn_err(CE_DEBUG, "---- dquotID = %d", 1431
1432 xfs_debug(mp, "-----------KERNEL DQUOT----------------");
1433 xfs_debug(mp, "---- dquotID = %d",
1443 (int)be32_to_cpu(dqp->q_core.d_id)); 1434 (int)be32_to_cpu(dqp->q_core.d_id));
1444 cmn_err(CE_DEBUG, "---- type = %s", DQFLAGTO_TYPESTR(dqp)); 1435 xfs_debug(mp, "---- type = %s", DQFLAGTO_TYPESTR(dqp));
1445 cmn_err(CE_DEBUG, "---- fs = 0x%p", dqp->q_mount); 1436 xfs_debug(mp, "---- fs = 0x%p", dqp->q_mount);
1446 cmn_err(CE_DEBUG, "---- blkno = 0x%x", (int) dqp->q_blkno); 1437 xfs_debug(mp, "---- blkno = 0x%x", (int) dqp->q_blkno);
1447 cmn_err(CE_DEBUG, "---- boffset = 0x%x", (int) dqp->q_bufoffset); 1438 xfs_debug(mp, "---- boffset = 0x%x", (int) dqp->q_bufoffset);
1448 cmn_err(CE_DEBUG, "---- blkhlimit = %Lu (0x%x)", 1439 xfs_debug(mp, "---- blkhlimit = %Lu (0x%x)",
1449 be64_to_cpu(dqp->q_core.d_blk_hardlimit), 1440 be64_to_cpu(dqp->q_core.d_blk_hardlimit),
1450 (int)be64_to_cpu(dqp->q_core.d_blk_hardlimit)); 1441 (int)be64_to_cpu(dqp->q_core.d_blk_hardlimit));
1451 cmn_err(CE_DEBUG, "---- blkslimit = %Lu (0x%x)", 1442 xfs_debug(mp, "---- blkslimit = %Lu (0x%x)",
1452 be64_to_cpu(dqp->q_core.d_blk_softlimit), 1443 be64_to_cpu(dqp->q_core.d_blk_softlimit),
1453 (int)be64_to_cpu(dqp->q_core.d_blk_softlimit)); 1444 (int)be64_to_cpu(dqp->q_core.d_blk_softlimit));
1454 cmn_err(CE_DEBUG, "---- inohlimit = %Lu (0x%x)", 1445 xfs_debug(mp, "---- inohlimit = %Lu (0x%x)",
1455 be64_to_cpu(dqp->q_core.d_ino_hardlimit), 1446 be64_to_cpu(dqp->q_core.d_ino_hardlimit),
1456 (int)be64_to_cpu(dqp->q_core.d_ino_hardlimit)); 1447 (int)be64_to_cpu(dqp->q_core.d_ino_hardlimit));
1457 cmn_err(CE_DEBUG, "---- inoslimit = %Lu (0x%x)", 1448 xfs_debug(mp, "---- inoslimit = %Lu (0x%x)",
1458 be64_to_cpu(dqp->q_core.d_ino_softlimit), 1449 be64_to_cpu(dqp->q_core.d_ino_softlimit),
1459 (int)be64_to_cpu(dqp->q_core.d_ino_softlimit)); 1450 (int)be64_to_cpu(dqp->q_core.d_ino_softlimit));
1460 cmn_err(CE_DEBUG, "---- bcount = %Lu (0x%x)", 1451 xfs_debug(mp, "---- bcount = %Lu (0x%x)",
1461 be64_to_cpu(dqp->q_core.d_bcount), 1452 be64_to_cpu(dqp->q_core.d_bcount),
1462 (int)be64_to_cpu(dqp->q_core.d_bcount)); 1453 (int)be64_to_cpu(dqp->q_core.d_bcount));
1463 cmn_err(CE_DEBUG, "---- icount = %Lu (0x%x)", 1454 xfs_debug(mp, "---- icount = %Lu (0x%x)",
1464 be64_to_cpu(dqp->q_core.d_icount), 1455 be64_to_cpu(dqp->q_core.d_icount),
1465 (int)be64_to_cpu(dqp->q_core.d_icount)); 1456 (int)be64_to_cpu(dqp->q_core.d_icount));
1466 cmn_err(CE_DEBUG, "---- btimer = %d", 1457 xfs_debug(mp, "---- btimer = %d",
1467 (int)be32_to_cpu(dqp->q_core.d_btimer)); 1458 (int)be32_to_cpu(dqp->q_core.d_btimer));
1468 cmn_err(CE_DEBUG, "---- itimer = %d", 1459 xfs_debug(mp, "---- itimer = %d",
1469 (int)be32_to_cpu(dqp->q_core.d_itimer)); 1460 (int)be32_to_cpu(dqp->q_core.d_itimer));
1470 cmn_err(CE_DEBUG, "---------------------------"); 1461 xfs_debug(mp, "---------------------------");
1471} 1462}
1472#endif 1463#endif
1473 1464
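The xfs_dquot.c hunks above tighten the dquot flush path: only the on-disk portion (q_core) is copied into the buffer, the in-core dirty flag is cleared, and the flush LSN is recorded so the AIL can tell when the buffer write has made the dquot clean on disk. A minimal, self-contained sketch of that ordering follows; the struct and function names are made up and merely stand in for xfs_dquot_t / xfs_disk_dquot_t and xfs_qm_dqflush().

	/* Illustrative userspace sketch -- not the kernel code. Names are invented. */
	#include <stdio.h>
	#include <string.h>

	struct disk_dquot {		/* stands in for xfs_disk_dquot_t */
		unsigned long long bcount;
		unsigned long long icount;
	};

	struct dquot {			/* stands in for xfs_dquot_t */
		unsigned int flags;	/* in-core state, never written to disk */
		long long flush_lsn;	/* LSN remembered at flush time */
		struct disk_dquot core;	/* the only part that persists */
	};

	#define DQ_DIRTY 0x1

	static void dquot_flush(struct dquot *dqp, struct disk_dquot *bufp,
				long long current_lsn)
	{
		/* 1. copy only the on-disk portion into the buffer */
		memcpy(bufp, &dqp->core, sizeof(*bufp));
		/* 2. the in-core copy is now clean */
		dqp->flags &= ~DQ_DIRTY;
		/* 3. remember the LSN so flush completion can move the log item */
		dqp->flush_lsn = current_lsn;
	}

	int main(void)
	{
		struct dquot dq = { .flags = DQ_DIRTY, .core = { 100, 3 } };
		struct disk_dquot buf;

		dquot_flush(&dq, &buf, 42);
		printf("dirty=%u lsn=%lld bcount=%llu\n",
		       dq.flags & DQ_DIRTY, dq.flush_lsn, buf.bcount);
		return 0;
	}
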
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c
index 2a1f3dc10a02..9e0e2fa3f2c8 100644
--- a/fs/xfs/quota/xfs_dquot_item.c
+++ b/fs/xfs/quota/xfs_dquot_item.c
@@ -136,9 +136,8 @@ xfs_qm_dquot_logitem_push(
136 */ 136 */
137 error = xfs_qm_dqflush(dqp, 0); 137 error = xfs_qm_dqflush(dqp, 0);
138 if (error) 138 if (error)
139 xfs_fs_cmn_err(CE_WARN, dqp->q_mount, 139 xfs_warn(dqp->q_mount, "%s: push error %d on dqp %p",
140 "xfs_qm_dquot_logitem_push: push error %d on dqp %p", 140 __func__, error, dqp);
141 error, dqp);
142 xfs_dqunlock(dqp); 141 xfs_dqunlock(dqp);
143} 142}
144 143
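A recurring pattern in the converted messages above: instead of spelling the function name inside the format string, the new calls pass "%s: ..." with __func__, so the reported name cannot drift when a function is renamed. A tiny standalone sketch of the idea; warn_msg() is a made-up stand-in for the xfs_warn() helper introduced elsewhere in this series.

	/* Illustrative sketch only; warn_msg() is not a real XFS interface. */
	#include <stdarg.h>
	#include <stdio.h>

	static void warn_msg(const char *fmt, ...)
	{
		va_list ap;

		va_start(ap, fmt);
		vfprintf(stderr, fmt, ap);
		va_end(ap);
		fputc('\n', stderr);
	}

	static void push_item(void *item)
	{
		int error = -1;		/* pretend the flush failed */

		if (error)
			/* __func__ expands to "push_item"; no hard-coded name to go stale */
			warn_msg("%s: push error %d on item %p", __func__, error, item);
	}

	int main(void)
	{
		push_item((void *)0);
		return 0;
	}
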
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 9a92407109a1..b94dace4e785 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -55,14 +55,12 @@ uint ndquot;
55kmem_zone_t *qm_dqzone; 55kmem_zone_t *qm_dqzone;
56kmem_zone_t *qm_dqtrxzone; 56kmem_zone_t *qm_dqtrxzone;
57 57
58static cred_t xfs_zerocr;
59
60STATIC void xfs_qm_list_init(xfs_dqlist_t *, char *, int); 58STATIC void xfs_qm_list_init(xfs_dqlist_t *, char *, int);
61STATIC void xfs_qm_list_destroy(xfs_dqlist_t *); 59STATIC void xfs_qm_list_destroy(xfs_dqlist_t *);
62 60
63STATIC int xfs_qm_init_quotainos(xfs_mount_t *); 61STATIC int xfs_qm_init_quotainos(xfs_mount_t *);
64STATIC int xfs_qm_init_quotainfo(xfs_mount_t *); 62STATIC int xfs_qm_init_quotainfo(xfs_mount_t *);
65STATIC int xfs_qm_shake(struct shrinker *, int, gfp_t); 63STATIC int xfs_qm_shake(struct shrinker *, struct shrink_control *);
66 64
67static struct shrinker xfs_qm_shaker = { 65static struct shrinker xfs_qm_shaker = {
68 .shrink = xfs_qm_shake, 66 .shrink = xfs_qm_shake,
@@ -82,7 +80,7 @@ xfs_qm_dquot_list_print(
82 int i = 0; 80 int i = 0;
83 81
84 list_for_each_entry(dqp, &mp->m_quotainfo->qi_dqlist_lock, qi_mplist) { 82 list_for_each_entry(dqp, &mp->m_quotainfo->qi_dqlist_lock, qi_mplist) {
85 cmn_err(CE_DEBUG, " %d. \"%d (%s)\" " 83 xfs_debug(mp, " %d. \"%d (%s)\" "
86 "bcnt = %lld, icnt = %lld, refs = %d", 84 "bcnt = %lld, icnt = %lld, refs = %d",
87 i++, be32_to_cpu(dqp->q_core.d_id), 85 i++, be32_to_cpu(dqp->q_core.d_id),
88 DQFLAGTO_TYPESTR(dqp), 86 DQFLAGTO_TYPESTR(dqp),
@@ -207,7 +205,7 @@ xfs_qm_destroy(
207 list_for_each_entry_safe(dqp, n, &xqm->qm_dqfrlist, q_freelist) { 205 list_for_each_entry_safe(dqp, n, &xqm->qm_dqfrlist, q_freelist) {
208 xfs_dqlock(dqp); 206 xfs_dqlock(dqp);
209#ifdef QUOTADEBUG 207#ifdef QUOTADEBUG
210 cmn_err(CE_DEBUG, "FREELIST destroy 0x%p", dqp); 208 xfs_debug(dqp->q_mount, "FREELIST destroy 0x%p", dqp);
211#endif 209#endif
212 list_del_init(&dqp->q_freelist); 210 list_del_init(&dqp->q_freelist);
213 xfs_Gqm->qm_dqfrlist_cnt--; 211 xfs_Gqm->qm_dqfrlist_cnt--;
@@ -343,9 +341,7 @@ xfs_qm_mount_quotas(
343 * quotas immediately. 341 * quotas immediately.
344 */ 342 */
345 if (mp->m_sb.sb_rextents) { 343 if (mp->m_sb.sb_rextents) {
346 cmn_err(CE_NOTE, 344 xfs_notice(mp, "Cannot turn on quotas for realtime filesystem");
347 "Cannot turn on quotas for realtime filesystem %s",
348 mp->m_fsname);
349 mp->m_qflags = 0; 345 mp->m_qflags = 0;
350 goto write_changes; 346 goto write_changes;
351 } 347 }
@@ -404,14 +400,13 @@ xfs_qm_mount_quotas(
404 * off, but the on disk superblock doesn't know that ! 400 * off, but the on disk superblock doesn't know that !
405 */ 401 */
406 ASSERT(!(XFS_IS_QUOTA_RUNNING(mp))); 402 ASSERT(!(XFS_IS_QUOTA_RUNNING(mp)));
407 xfs_fs_cmn_err(CE_ALERT, mp, 403 xfs_alert(mp, "%s: Superblock update failed!",
408 "XFS mount_quotas: Superblock update failed!"); 404 __func__);
409 } 405 }
410 } 406 }
411 407
412 if (error) { 408 if (error) {
413 xfs_fs_cmn_err(CE_WARN, mp, 409 xfs_warn(mp, "Failed to initialize disk quotas.");
414 "Failed to initialize disk quotas.");
415 return; 410 return;
416 } 411 }
417 412
@@ -466,12 +461,10 @@ xfs_qm_dqflush_all(
466 struct xfs_quotainfo *q = mp->m_quotainfo; 461 struct xfs_quotainfo *q = mp->m_quotainfo;
467 int recl; 462 int recl;
468 struct xfs_dquot *dqp; 463 struct xfs_dquot *dqp;
469 int niters;
470 int error; 464 int error;
471 465
472 if (!q) 466 if (!q)
473 return 0; 467 return 0;
474 niters = 0;
475again: 468again:
476 mutex_lock(&q->qi_dqlist_lock); 469 mutex_lock(&q->qi_dqlist_lock);
477 list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) { 470 list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) {
@@ -837,7 +830,7 @@ xfs_qm_dqattach_locked(
837 xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP, 830 xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP,
838 flags & XFS_QMOPT_DQALLOC, 831 flags & XFS_QMOPT_DQALLOC,
839 ip->i_udquot, &ip->i_gdquot) : 832 ip->i_udquot, &ip->i_gdquot) :
840 xfs_qm_dqattach_one(ip, ip->i_d.di_projid, XFS_DQ_PROJ, 833 xfs_qm_dqattach_one(ip, xfs_get_projid(ip), XFS_DQ_PROJ,
841 flags & XFS_QMOPT_DQALLOC, 834 flags & XFS_QMOPT_DQALLOC,
842 ip->i_udquot, &ip->i_gdquot); 835 ip->i_udquot, &ip->i_gdquot);
843 /* 836 /*
@@ -1199,87 +1192,6 @@ xfs_qm_list_destroy(
1199 mutex_destroy(&(list->qh_lock)); 1192 mutex_destroy(&(list->qh_lock));
1200} 1193}
1201 1194
1202
1203/*
1204 * Stripped down version of dqattach. This doesn't attach, or even look at the
1205 * dquots attached to the inode. The rationale is that there won't be any
1206 * attached at the time this is called from quotacheck.
1207 */
1208STATIC int
1209xfs_qm_dqget_noattach(
1210 xfs_inode_t *ip,
1211 xfs_dquot_t **O_udqpp,
1212 xfs_dquot_t **O_gdqpp)
1213{
1214 int error;
1215 xfs_mount_t *mp;
1216 xfs_dquot_t *udqp, *gdqp;
1217
1218 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
1219 mp = ip->i_mount;
1220 udqp = NULL;
1221 gdqp = NULL;
1222
1223 if (XFS_IS_UQUOTA_ON(mp)) {
1224 ASSERT(ip->i_udquot == NULL);
1225 /*
1226 * We want the dquot allocated if it doesn't exist.
1227 */
1228 if ((error = xfs_qm_dqget(mp, ip, ip->i_d.di_uid, XFS_DQ_USER,
1229 XFS_QMOPT_DQALLOC | XFS_QMOPT_DOWARN,
1230 &udqp))) {
1231 /*
1232 * Shouldn't be able to turn off quotas here.
1233 */
1234 ASSERT(error != ESRCH);
1235 ASSERT(error != ENOENT);
1236 return error;
1237 }
1238 ASSERT(udqp);
1239 }
1240
1241 if (XFS_IS_OQUOTA_ON(mp)) {
1242 ASSERT(ip->i_gdquot == NULL);
1243 if (udqp)
1244 xfs_dqunlock(udqp);
1245 error = XFS_IS_GQUOTA_ON(mp) ?
1246 xfs_qm_dqget(mp, ip,
1247 ip->i_d.di_gid, XFS_DQ_GROUP,
1248 XFS_QMOPT_DQALLOC|XFS_QMOPT_DOWARN,
1249 &gdqp) :
1250 xfs_qm_dqget(mp, ip,
1251 ip->i_d.di_projid, XFS_DQ_PROJ,
1252 XFS_QMOPT_DQALLOC|XFS_QMOPT_DOWARN,
1253 &gdqp);
1254 if (error) {
1255 if (udqp)
1256 xfs_qm_dqrele(udqp);
1257 ASSERT(error != ESRCH);
1258 ASSERT(error != ENOENT);
1259 return error;
1260 }
1261 ASSERT(gdqp);
1262
1263 /* Reacquire the locks in the right order */
1264 if (udqp) {
1265 if (! xfs_qm_dqlock_nowait(udqp)) {
1266 xfs_dqunlock(gdqp);
1267 xfs_dqlock(udqp);
1268 xfs_dqlock(gdqp);
1269 }
1270 }
1271 }
1272
1273 *O_udqpp = udqp;
1274 *O_gdqpp = gdqp;
1275
1276#ifdef QUOTADEBUG
1277 if (udqp) ASSERT(XFS_DQ_IS_LOCKED(udqp));
1278 if (gdqp) ASSERT(XFS_DQ_IS_LOCKED(gdqp));
1279#endif
1280 return 0;
1281}
1282
1283/* 1195/*
1284 * Create an inode and return with a reference already taken, but unlocked 1196 * Create an inode and return with a reference already taken, but unlocked
1285 * This is how we create quota inodes 1197 * This is how we create quota inodes
@@ -1305,21 +1217,14 @@ xfs_qm_qino_alloc(
1305 return error; 1217 return error;
1306 } 1218 }
1307 1219
1308 if ((error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 1220 error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip, &committed);
1309 &xfs_zerocr, 0, 1, ip, &committed))) { 1221 if (error) {
1310 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | 1222 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
1311 XFS_TRANS_ABORT); 1223 XFS_TRANS_ABORT);
1312 return error; 1224 return error;
1313 } 1225 }
1314 1226
1315 /* 1227 /*
1316 * Keep an extra reference to this quota inode. This inode is
1317 * locked exclusively and joined to the transaction already.
1318 */
1319 ASSERT(xfs_isilocked(*ip, XFS_ILOCK_EXCL));
1320 IHOLD(*ip);
1321
1322 /*
1323 * Make the changes in the superblock, and log those too. 1228 * Make the changes in the superblock, and log those too.
1324 * sbfields arg may contain fields other than *QUOTINO; 1229 * sbfields arg may contain fields other than *QUOTINO;
1325 * VERSIONNUM for example. 1230 * VERSIONNUM for example.
@@ -1347,7 +1252,7 @@ xfs_qm_qino_alloc(
1347 xfs_mod_sb(tp, sbfields); 1252 xfs_mod_sb(tp, sbfields);
1348 1253
1349 if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES))) { 1254 if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES))) {
1350 xfs_fs_cmn_err(CE_ALERT, mp, "XFS qino_alloc failed!"); 1255 xfs_alert(mp, "%s failed (error %d)!", __func__, error);
1351 return error; 1256 return error;
1352 } 1257 }
1353 return 0; 1258 return 0;
@@ -1382,7 +1287,7 @@ xfs_qm_reset_dqcounts(
1382 * output any warnings because it's perfectly possible to 1287 * output any warnings because it's perfectly possible to
1383 * find uninitialised dquot blks. See comment in xfs_qm_dqcheck. 1288 * find uninitialised dquot blks. See comment in xfs_qm_dqcheck.
1384 */ 1289 */
1385 (void) xfs_qm_dqcheck(ddq, id+j, type, XFS_QMOPT_DQREPAIR, 1290 (void) xfs_qm_dqcheck(mp, ddq, id+j, type, XFS_QMOPT_DQREPAIR,
1386 "xfs_quotacheck"); 1291 "xfs_quotacheck");
1387 ddq->d_bcount = 0; 1292 ddq->d_bcount = 0;
1388 ddq->d_icount = 0; 1293 ddq->d_icount = 0;
@@ -1407,14 +1312,9 @@ xfs_qm_dqiter_bufs(
1407{ 1312{
1408 xfs_buf_t *bp; 1313 xfs_buf_t *bp;
1409 int error; 1314 int error;
1410 int notcommitted;
1411 int incr;
1412 int type; 1315 int type;
1413 1316
1414 ASSERT(blkcnt > 0); 1317 ASSERT(blkcnt > 0);
1415 notcommitted = 0;
1416 incr = (blkcnt > XFS_QM_MAX_DQCLUSTER_LOGSZ) ?
1417 XFS_QM_MAX_DQCLUSTER_LOGSZ : blkcnt;
1418 type = flags & XFS_QMOPT_UQUOTA ? XFS_DQ_USER : 1318 type = flags & XFS_QMOPT_UQUOTA ? XFS_DQ_USER :
1419 (flags & XFS_QMOPT_PQUOTA ? XFS_DQ_PROJ : XFS_DQ_GROUP); 1319 (flags & XFS_QMOPT_PQUOTA ? XFS_DQ_PROJ : XFS_DQ_GROUP);
1420 error = 0; 1320 error = 0;
@@ -1516,7 +1416,7 @@ xfs_qm_dqiterate(
1516 rablkcnt = map[i+1].br_blockcount; 1416 rablkcnt = map[i+1].br_blockcount;
1517 rablkno = map[i+1].br_startblock; 1417 rablkno = map[i+1].br_startblock;
1518 while (rablkcnt--) { 1418 while (rablkcnt--) {
1519 xfs_baread(mp->m_ddev_targp, 1419 xfs_buf_readahead(mp->m_ddev_targp,
1520 XFS_FSB_TO_DADDR(mp, rablkno), 1420 XFS_FSB_TO_DADDR(mp, rablkno),
1521 mp->m_quotainfo->qi_dqchunklen); 1421 mp->m_quotainfo->qi_dqchunklen);
1522 rablkno++; 1422 rablkno++;
@@ -1546,18 +1446,34 @@ xfs_qm_dqiterate(
1546 1446
1547/* 1447/*
1548 * Called by dqusage_adjust in doing a quotacheck. 1448 * Called by dqusage_adjust in doing a quotacheck.
1549 * Given the inode, and a dquot (either USR or GRP, doesn't matter), 1449 *
1550 * this updates its incore copy as well as the buffer copy. This is 1450 * Given the inode, and a dquot id this updates both the incore dqout as well
1551 * so that once the quotacheck is done, we can just log all the buffers, 1451 * as the buffer copy. This is so that once the quotacheck is done, we can
1552 * as opposed to logging numerous updates to individual dquots. 1452 * just log all the buffers, as opposed to logging numerous updates to
1453 * individual dquots.
1553 */ 1454 */
1554STATIC void 1455STATIC int
1555xfs_qm_quotacheck_dqadjust( 1456xfs_qm_quotacheck_dqadjust(
1556 xfs_dquot_t *dqp, 1457 struct xfs_inode *ip,
1458 xfs_dqid_t id,
1459 uint type,
1557 xfs_qcnt_t nblks, 1460 xfs_qcnt_t nblks,
1558 xfs_qcnt_t rtblks) 1461 xfs_qcnt_t rtblks)
1559{ 1462{
1560 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 1463 struct xfs_mount *mp = ip->i_mount;
1464 struct xfs_dquot *dqp;
1465 int error;
1466
1467 error = xfs_qm_dqget(mp, ip, id, type,
1468 XFS_QMOPT_DQALLOC | XFS_QMOPT_DOWARN, &dqp);
1469 if (error) {
1470 /*
1471 * Shouldn't be able to turn off quotas here.
1472 */
1473 ASSERT(error != ESRCH);
1474 ASSERT(error != ENOENT);
1475 return error;
1476 }
1561 1477
1562 trace_xfs_dqadjust(dqp); 1478 trace_xfs_dqadjust(dqp);
1563 1479
@@ -1582,11 +1498,13 @@ xfs_qm_quotacheck_dqadjust(
1582 * There are no timers for the default values set in the root dquot. 1498 * There are no timers for the default values set in the root dquot.
1583 */ 1499 */
1584 if (dqp->q_core.d_id) { 1500 if (dqp->q_core.d_id) {
1585 xfs_qm_adjust_dqlimits(dqp->q_mount, &dqp->q_core); 1501 xfs_qm_adjust_dqlimits(mp, &dqp->q_core);
1586 xfs_qm_adjust_dqtimers(dqp->q_mount, &dqp->q_core); 1502 xfs_qm_adjust_dqtimers(mp, &dqp->q_core);
1587 } 1503 }
1588 1504
1589 dqp->dq_flags |= XFS_DQ_DIRTY; 1505 dqp->dq_flags |= XFS_DQ_DIRTY;
1506 xfs_qm_dqput(dqp);
1507 return 0;
1590} 1508}
1591 1509
1592STATIC int 1510STATIC int
@@ -1629,8 +1547,7 @@ xfs_qm_dqusage_adjust(
1629 int *res) /* result code value */ 1547 int *res) /* result code value */
1630{ 1548{
1631 xfs_inode_t *ip; 1549 xfs_inode_t *ip;
1632 xfs_dquot_t *udqp, *gdqp; 1550 xfs_qcnt_t nblks, rtblks = 0;
1633 xfs_qcnt_t nblks, rtblks;
1634 int error; 1551 int error;
1635 1552
1636 ASSERT(XFS_IS_QUOTA_RUNNING(mp)); 1553 ASSERT(XFS_IS_QUOTA_RUNNING(mp));
@@ -1650,51 +1567,24 @@ xfs_qm_dqusage_adjust(
1650 * the case in all other instances. It's OK that we do this because 1567 * the case in all other instances. It's OK that we do this because
1651 * quotacheck is done only at mount time. 1568 * quotacheck is done only at mount time.
1652 */ 1569 */
1653 if ((error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_EXCL, &ip))) { 1570 error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_EXCL, &ip);
1571 if (error) {
1654 *res = BULKSTAT_RV_NOTHING; 1572 *res = BULKSTAT_RV_NOTHING;
1655 return error; 1573 return error;
1656 } 1574 }
1657 1575
1658 /* 1576 ASSERT(ip->i_delayed_blks == 0);
1659 * Obtain the locked dquots. In case of an error (eg. allocation
1660 * fails for ENOSPC), we return the negative of the error number
1661 * to bulkstat, so that it can get propagated to quotacheck() and
1662 * making us disable quotas for the file system.
1663 */
1664 if ((error = xfs_qm_dqget_noattach(ip, &udqp, &gdqp))) {
1665 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1666 IRELE(ip);
1667 *res = BULKSTAT_RV_GIVEUP;
1668 return error;
1669 }
1670 1577
1671 rtblks = 0; 1578 if (XFS_IS_REALTIME_INODE(ip)) {
1672 if (! XFS_IS_REALTIME_INODE(ip)) {
1673 nblks = (xfs_qcnt_t)ip->i_d.di_nblocks;
1674 } else {
1675 /* 1579 /*
1676 * Walk thru the extent list and count the realtime blocks. 1580 * Walk thru the extent list and count the realtime blocks.
1677 */ 1581 */
1678 if ((error = xfs_qm_get_rtblks(ip, &rtblks))) { 1582 error = xfs_qm_get_rtblks(ip, &rtblks);
1679 xfs_iunlock(ip, XFS_ILOCK_EXCL); 1583 if (error)
1680 IRELE(ip); 1584 goto error0;
1681 if (udqp)
1682 xfs_qm_dqput(udqp);
1683 if (gdqp)
1684 xfs_qm_dqput(gdqp);
1685 *res = BULKSTAT_RV_GIVEUP;
1686 return error;
1687 }
1688 nblks = (xfs_qcnt_t)ip->i_d.di_nblocks - rtblks;
1689 } 1585 }
1690 ASSERT(ip->i_delayed_blks == 0);
1691 1586
1692 /* 1587 nblks = (xfs_qcnt_t)ip->i_d.di_nblocks - rtblks;
1693 * We can't release the inode while holding its dquot locks.
1694 * The inode can go into inactive and might try to acquire the dquotlocks.
1695 * So, just unlock here and do a vn_rele at the end.
1696 */
1697 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1698 1588
1699 /* 1589 /*
1700 * Add the (disk blocks and inode) resources occupied by this 1590 * Add the (disk blocks and inode) resources occupied by this
@@ -1709,26 +1599,36 @@ xfs_qm_dqusage_adjust(
1709 * and quotaoffs don't race. (Quotachecks happen at mount time only). 1599 * and quotaoffs don't race. (Quotachecks happen at mount time only).
1710 */ 1600 */
1711 if (XFS_IS_UQUOTA_ON(mp)) { 1601 if (XFS_IS_UQUOTA_ON(mp)) {
1712 ASSERT(udqp); 1602 error = xfs_qm_quotacheck_dqadjust(ip, ip->i_d.di_uid,
1713 xfs_qm_quotacheck_dqadjust(udqp, nblks, rtblks); 1603 XFS_DQ_USER, nblks, rtblks);
1714 xfs_qm_dqput(udqp); 1604 if (error)
1605 goto error0;
1715 } 1606 }
1716 if (XFS_IS_OQUOTA_ON(mp)) { 1607
1717 ASSERT(gdqp); 1608 if (XFS_IS_GQUOTA_ON(mp)) {
1718 xfs_qm_quotacheck_dqadjust(gdqp, nblks, rtblks); 1609 error = xfs_qm_quotacheck_dqadjust(ip, ip->i_d.di_gid,
1719 xfs_qm_dqput(gdqp); 1610 XFS_DQ_GROUP, nblks, rtblks);
1611 if (error)
1612 goto error0;
1720 } 1613 }
1721 /*
1722 * Now release the inode. This will send it to 'inactive', and
1723 * possibly even free blocks.
1724 */
1725 IRELE(ip);
1726 1614
1727 /* 1615 if (XFS_IS_PQUOTA_ON(mp)) {
1728 * Goto next inode. 1616 error = xfs_qm_quotacheck_dqadjust(ip, xfs_get_projid(ip),
1729 */ 1617 XFS_DQ_PROJ, nblks, rtblks);
1618 if (error)
1619 goto error0;
1620 }
1621
1622 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1623 IRELE(ip);
1730 *res = BULKSTAT_RV_DIDONE; 1624 *res = BULKSTAT_RV_DIDONE;
1731 return 0; 1625 return 0;
1626
1627error0:
1628 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1629 IRELE(ip);
1630 *res = BULKSTAT_RV_GIVEUP;
1631 return error;
1732} 1632}
1733 1633
1734/* 1634/*
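The restructured quotacheck path above folds the old "get both dquots up front" helper into xfs_qm_quotacheck_dqadjust(): for each quota type that is enabled, the dquot is looked up by (id, type) with allocation allowed, its block and inode counts are bumped, and the reference is dropped again, with a single error0 label unwinding the inode lock and reference. A compact sketch of that per-type loop shape; get_dquot()/put_dquot() and the type names are invented stand-ins for the real dquot API.

	/* Sketch of the control flow only; the dquot API here is invented. */
	#include <stdio.h>

	struct dquot { unsigned long long bcount, icount; };

	enum { Q_USER, Q_GROUP, Q_PROJ, Q_NTYPES };

	static struct dquot table[Q_NTYPES];	/* one fake dquot per quota type */

	static int get_dquot(int type, struct dquot **dqpp)
	{
		if (type < 0 || type >= Q_NTYPES)
			return -1;		/* lookup failed */
		*dqpp = &table[type];
		return 0;
	}

	static void put_dquot(struct dquot *dqp) { (void)dqp; }

	static int adjust_one(int type, unsigned long long nblks)
	{
		struct dquot *dqp;
		int error = get_dquot(type, &dqp);

		if (error)
			return error;
		dqp->bcount += nblks;	/* charge the blocks ... */
		dqp->icount += 1;	/* ... and the inode itself */
		put_dquot(dqp);
		return 0;
	}

	int main(void)
	{
		int type, error = 0;

		for (type = Q_USER; type < Q_NTYPES; type++) {
			error = adjust_one(type, 8);
			if (error)
				goto error0;	/* one unwind point, as in the patch */
		}
		printf("user bcount=%llu\n", table[Q_USER].bcount);
		return 0;
	error0:
		fprintf(stderr, "quotacheck adjust failed: %d\n", error);
		return 1;
	}
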
@@ -1759,7 +1659,7 @@ xfs_qm_quotacheck(
1759 */ 1659 */
1760 ASSERT(list_empty(&mp->m_quotainfo->qi_dqlist)); 1660 ASSERT(list_empty(&mp->m_quotainfo->qi_dqlist));
1761 1661
1762 cmn_err(CE_NOTE, "XFS quotacheck %s: Please wait.", mp->m_fsname); 1662 xfs_notice(mp, "Quotacheck needed: Please wait.");
1763 1663
1764 /* 1664 /*
1765 * First we go thru all the dquots on disk, USR and GRP/PRJ, and reset 1665 * First we go thru all the dquots on disk, USR and GRP/PRJ, and reset
@@ -1837,9 +1737,9 @@ xfs_qm_quotacheck(
1837 1737
1838 error_return: 1738 error_return:
1839 if (error) { 1739 if (error) {
1840 cmn_err(CE_WARN, "XFS quotacheck %s: Unsuccessful (Error %d): " 1740 xfs_warn(mp,
1841 "Disabling quotas.", 1741 "Quotacheck: Unsuccessful (Error %d): Disabling quotas.",
1842 mp->m_fsname, error); 1742 error);
1843 /* 1743 /*
1844 * We must turn off quotas. 1744 * We must turn off quotas.
1845 */ 1745 */
@@ -1847,12 +1747,11 @@ xfs_qm_quotacheck(
1847 ASSERT(xfs_Gqm != NULL); 1747 ASSERT(xfs_Gqm != NULL);
1848 xfs_qm_destroy_quotainfo(mp); 1748 xfs_qm_destroy_quotainfo(mp);
1849 if (xfs_mount_reset_sbqflags(mp)) { 1749 if (xfs_mount_reset_sbqflags(mp)) {
1850 cmn_err(CE_WARN, "XFS quotacheck %s: " 1750 xfs_warn(mp,
1851 "Failed to reset quota flags.", mp->m_fsname); 1751 "Quotacheck: Failed to reset quota flags.");
1852 } 1752 }
1853 } else { 1753 } else
1854 cmn_err(CE_NOTE, "XFS quotacheck %s: Done.", mp->m_fsname); 1754 xfs_notice(mp, "Quotacheck: Done.");
1855 }
1856 return (error); 1755 return (error);
1857} 1756}
1858 1757
@@ -1946,12 +1845,14 @@ xfs_qm_dqreclaim_one(void)
1946 xfs_dquot_t *dqpout; 1845 xfs_dquot_t *dqpout;
1947 xfs_dquot_t *dqp; 1846 xfs_dquot_t *dqp;
1948 int restarts; 1847 int restarts;
1848 int startagain;
1949 1849
1950 restarts = 0; 1850 restarts = 0;
1951 dqpout = NULL; 1851 dqpout = NULL;
1952 1852
1953 /* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */ 1853 /* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */
1954startagain: 1854again:
1855 startagain = 0;
1955 mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); 1856 mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
1956 1857
1957 list_for_each_entry(dqp, &xfs_Gqm->qm_dqfrlist, q_freelist) { 1858 list_for_each_entry(dqp, &xfs_Gqm->qm_dqfrlist, q_freelist) {
@@ -1968,13 +1869,10 @@ startagain:
1968 ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE)); 1869 ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE));
1969 1870
1970 trace_xfs_dqreclaim_want(dqp); 1871 trace_xfs_dqreclaim_want(dqp);
1971
1972 xfs_dqunlock(dqp);
1973 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
1974 if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
1975 return NULL;
1976 XQM_STATS_INC(xqmstats.xs_qm_dqwants); 1872 XQM_STATS_INC(xqmstats.xs_qm_dqwants);
1977 goto startagain; 1873 restarts++;
1874 startagain = 1;
1875 goto dqunlock;
1978 } 1876 }
1979 1877
1980 /* 1878 /*
@@ -1989,23 +1887,20 @@ startagain:
1989 ASSERT(list_empty(&dqp->q_mplist)); 1887 ASSERT(list_empty(&dqp->q_mplist));
1990 list_del_init(&dqp->q_freelist); 1888 list_del_init(&dqp->q_freelist);
1991 xfs_Gqm->qm_dqfrlist_cnt--; 1889 xfs_Gqm->qm_dqfrlist_cnt--;
1992 xfs_dqunlock(dqp);
1993 dqpout = dqp; 1890 dqpout = dqp;
1994 XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims); 1891 XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims);
1995 break; 1892 goto dqunlock;
1996 } 1893 }
1997 1894
1998 ASSERT(dqp->q_hash); 1895 ASSERT(dqp->q_hash);
1999 ASSERT(!list_empty(&dqp->q_mplist)); 1896 ASSERT(!list_empty(&dqp->q_mplist));
2000 1897
2001 /* 1898 /*
2002 * Try to grab the flush lock. If this dquot is in the process of 1899 * Try to grab the flush lock. If this dquot is in the process
2003 * getting flushed to disk, we don't want to reclaim it. 1900 * of getting flushed to disk, we don't want to reclaim it.
2004 */ 1901 */
2005 if (!xfs_dqflock_nowait(dqp)) { 1902 if (!xfs_dqflock_nowait(dqp))
2006 xfs_dqunlock(dqp); 1903 goto dqunlock;
2007 continue;
2008 }
2009 1904
2010 /* 1905 /*
2011 * We have the flush lock so we know that this is not in the 1906 * We have the flush lock so we know that this is not in the
@@ -2024,11 +1919,10 @@ startagain:
2024 */ 1919 */
2025 error = xfs_qm_dqflush(dqp, 0); 1920 error = xfs_qm_dqflush(dqp, 0);
2026 if (error) { 1921 if (error) {
2027 xfs_fs_cmn_err(CE_WARN, mp, 1922 xfs_warn(mp, "%s: dquot %p flush failed",
2028 "xfs_qm_dqreclaim: dquot %p flush failed", dqp); 1923 __func__, dqp);
2029 } 1924 }
2030 xfs_dqunlock(dqp); /* dqflush unlocks dqflock */ 1925 goto dqunlock;
2031 continue;
2032 } 1926 }
2033 1927
2034 /* 1928 /*
@@ -2050,13 +1944,8 @@ startagain:
2050 */ 1944 */
2051 if (!mutex_trylock(&mp->m_quotainfo->qi_dqlist_lock)) { 1945 if (!mutex_trylock(&mp->m_quotainfo->qi_dqlist_lock)) {
2052 restarts++; 1946 restarts++;
2053 mutex_unlock(&dqp->q_hash->qh_lock); 1947 startagain = 1;
2054 xfs_dqfunlock(dqp); 1948 goto qhunlock;
2055 xfs_dqunlock(dqp);
2056 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
2057 if (restarts++ >= XFS_QM_RECLAIM_MAX_RESTARTS)
2058 return NULL;
2059 goto startagain;
2060 } 1949 }
2061 1950
2062 ASSERT(dqp->q_nrefs == 0); 1951 ASSERT(dqp->q_nrefs == 0);
@@ -2069,14 +1958,20 @@ startagain:
2069 xfs_Gqm->qm_dqfrlist_cnt--; 1958 xfs_Gqm->qm_dqfrlist_cnt--;
2070 dqpout = dqp; 1959 dqpout = dqp;
2071 mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock); 1960 mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock);
1961qhunlock:
2072 mutex_unlock(&dqp->q_hash->qh_lock); 1962 mutex_unlock(&dqp->q_hash->qh_lock);
2073dqfunlock: 1963dqfunlock:
2074 xfs_dqfunlock(dqp); 1964 xfs_dqfunlock(dqp);
1965dqunlock:
2075 xfs_dqunlock(dqp); 1966 xfs_dqunlock(dqp);
2076 if (dqpout) 1967 if (dqpout)
2077 break; 1968 break;
2078 if (restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) 1969 if (restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
2079 return NULL; 1970 break;
1971 if (startagain) {
1972 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
1973 goto again;
1974 }
2080 } 1975 }
2081 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); 1976 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
2082 return dqpout; 1977 return dqpout;
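The xfs_qm_dqreclaim_one() rework above replaces several copies of "unlock everything, maybe restart" with fall-through labels (qhunlock, dqfunlock, dqunlock) and a startagain flag checked at the bottom of the loop. A minimal model of that single-exit style, using plain pthread mutexes in place of the dquot, hash and freelist locks; none of the names below are XFS interfaces.

	/* Minimal model of the fall-through unlock labels; not the XFS code itself. */
	#include <pthread.h>
	#include <stdio.h>

	static pthread_mutex_t lock_a = PTHREAD_MUTEX_INITIALIZER;
	static pthread_mutex_t lock_b = PTHREAD_MUTEX_INITIALIZER;

	/* Returns 1 if the caller should restart the outer scan. */
	static int try_reclaim(int want_restart)
	{
		int startagain = 0;

		pthread_mutex_lock(&lock_a);

		if (pthread_mutex_trylock(&lock_b) != 0)
			goto unlock_a;		/* couldn't get the inner lock */

		if (want_restart) {
			startagain = 1;		/* remember, then fall through the labels */
			goto unlock_b;
		}

		/* ... the actual reclaim work would happen here ... */

	unlock_b:
		pthread_mutex_unlock(&lock_b);
	unlock_a:
		pthread_mutex_unlock(&lock_a);
		return startagain;
	}

	int main(void)
	{
		printf("restart=%d\n", try_reclaim(1));
		printf("restart=%d\n", try_reclaim(0));
		return 0;
	}
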
@@ -2114,10 +2009,10 @@ xfs_qm_shake_freelist(
2114STATIC int 2009STATIC int
2115xfs_qm_shake( 2010xfs_qm_shake(
2116 struct shrinker *shrink, 2011 struct shrinker *shrink,
2117 int nr_to_scan, 2012 struct shrink_control *sc)
2118 gfp_t gfp_mask)
2119{ 2013{
2120 int ndqused, nfree, n; 2014 int ndqused, nfree, n;
2015 gfp_t gfp_mask = sc->gfp_mask;
2121 2016
2122 if (!kmem_shake_allow(gfp_mask)) 2017 if (!kmem_shake_allow(gfp_mask))
2123 return 0; 2018 return 0;
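The xfs_qm_shake() hunk above tracks a VM API change: shrinker callbacks now receive a struct shrink_control instead of separate nr_to_scan/gfp_mask arguments, so the callback reads sc->gfp_mask (and, in the real interface, sc->nr_to_scan) from the control structure. A rough userspace sketch of the calling convention, with the structure reduced to the fields this hunk actually touches and the GFP bit value invented for illustration.

	/* Simplified model of the shrink_control calling convention. */
	#include <stdio.h>

	typedef unsigned int gfp_t;

	struct shrink_control {
		gfp_t gfp_mask;			/* allocation context of the caller */
		unsigned long nr_to_scan;	/* how many objects to try to free */
	};

	struct shrinker;			/* opaque here */

	static int cache_shrink(struct shrinker *shrink, struct shrink_control *sc)
	{
		(void)shrink;
		if (!(sc->gfp_mask & 0x80u))	/* pretend bit: "may do filesystem I/O" */
			return 0;		/* cannot make progress in this context */
		printf("scanning up to %lu objects\n", sc->nr_to_scan);
		return 0;			/* the real API returns a remaining count */
	}

	int main(void)
	{
		struct shrink_control sc = { .gfp_mask = 0x80u, .nr_to_scan = 128 };

		return cache_shrink(NULL, &sc);
	}
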
@@ -2202,7 +2097,7 @@ xfs_qm_write_sb_changes(
2202 int error; 2097 int error;
2203 2098
2204#ifdef QUOTADEBUG 2099#ifdef QUOTADEBUG
2205 cmn_err(CE_NOTE, "Writing superblock quota changes :%s", mp->m_fsname); 2100 xfs_notice(mp, "Writing superblock quota changes");
2206#endif 2101#endif
2207 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE); 2102 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE);
2208 if ((error = xfs_trans_reserve(tp, 0, 2103 if ((error = xfs_trans_reserve(tp, 0,
@@ -2224,7 +2119,7 @@ xfs_qm_write_sb_changes(
2224 2119
2225 2120
2226/* 2121/*
2227 * Given an inode, a uid and gid (from cred_t) make sure that we have 2122 * Given an inode, a uid, gid and prid make sure that we have
2228 * allocated relevant dquot(s) on disk, and that we won't exceed inode 2123 * allocated relevant dquot(s) on disk, and that we won't exceed inode
2229 * quotas by creating this file. 2124 * quotas by creating this file.
2230 * This also attaches dquot(s) to the given inode after locking it, 2125 * This also attaches dquot(s) to the given inode after locking it,
@@ -2332,7 +2227,7 @@ xfs_qm_vop_dqalloc(
2332 xfs_dqunlock(gq); 2227 xfs_dqunlock(gq);
2333 } 2228 }
2334 } else if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) { 2229 } else if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) {
2335 if (ip->i_d.di_projid != prid) { 2230 if (xfs_get_projid(ip) != prid) {
2336 xfs_iunlock(ip, lockflags); 2231 xfs_iunlock(ip, lockflags);
2337 if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)prid, 2232 if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)prid,
2338 XFS_DQ_PROJ, 2233 XFS_DQ_PROJ,
@@ -2454,7 +2349,7 @@ xfs_qm_vop_chown_reserve(
2454 } 2349 }
2455 if (XFS_IS_OQUOTA_ON(ip->i_mount) && gdqp) { 2350 if (XFS_IS_OQUOTA_ON(ip->i_mount) && gdqp) {
2456 if (XFS_IS_PQUOTA_ON(ip->i_mount) && 2351 if (XFS_IS_PQUOTA_ON(ip->i_mount) &&
2457 ip->i_d.di_projid != be32_to_cpu(gdqp->q_core.d_id)) 2352 xfs_get_projid(ip) != be32_to_cpu(gdqp->q_core.d_id))
2458 prjflags = XFS_QMOPT_ENOSPC; 2353 prjflags = XFS_QMOPT_ENOSPC;
2459 2354
2460 if (prjflags || 2355 if (prjflags ||
@@ -2558,7 +2453,7 @@ xfs_qm_vop_create_dqattach(
2558 ip->i_gdquot = gdqp; 2453 ip->i_gdquot = gdqp;
2559 ASSERT(XFS_IS_OQUOTA_ON(mp)); 2454 ASSERT(XFS_IS_OQUOTA_ON(mp));
2560 ASSERT((XFS_IS_GQUOTA_ON(mp) ? 2455 ASSERT((XFS_IS_GQUOTA_ON(mp) ?
2561 ip->i_d.di_gid : ip->i_d.di_projid) == 2456 ip->i_d.di_gid : xfs_get_projid(ip)) ==
2562 be32_to_cpu(gdqp->q_core.d_id)); 2457 be32_to_cpu(gdqp->q_core.d_id));
2563 xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1); 2458 xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1);
2564 } 2459 }
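Throughout xfs_qm.c (and below in xfs_qm_bhv.c and xfs_qm_syscalls.c) direct reads of ip->i_d.di_projid are replaced by an xfs_get_projid(ip) accessor, presumably because the project ID is now stored in two on-disk halves and the accessor hides how it is reassembled. The following is a guess at the shape of such a helper, not the actual xfs_inode.h definition; all names are stand-ins.

	/* Hedged sketch: assumes the project ID is stored as two 16-bit halves. */
	#include <stdint.h>
	#include <stdio.h>

	struct icdinode {			/* stand-in for the XFS in-core dinode */
		uint16_t di_projid_hi;
		uint16_t di_projid_lo;
	};

	struct inode_stub { struct icdinode i_d; };

	static inline uint32_t get_projid(const struct inode_stub *ip)
	{
		return ((uint32_t)ip->i_d.di_projid_hi << 16) | ip->i_d.di_projid_lo;
	}

	int main(void)
	{
		struct inode_stub ip = { .i_d = { .di_projid_hi = 1, .di_projid_lo = 2 } };

		printf("projid=%u\n", get_projid(&ip));	/* prints 65538 */
		return 0;
	}
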
diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h
index c9446f1c726d..567b29b9f1b3 100644
--- a/fs/xfs/quota/xfs_qm.h
+++ b/fs/xfs/quota/xfs_qm.h
@@ -65,11 +65,6 @@ extern kmem_zone_t *qm_dqtrxzone;
65 * block in the dquot/xqm code. 65 * block in the dquot/xqm code.
66 */ 66 */
67#define XFS_DQUOT_CLUSTER_SIZE_FSB (xfs_filblks_t)1 67#define XFS_DQUOT_CLUSTER_SIZE_FSB (xfs_filblks_t)1
68/*
69 * When doing a quotacheck, we log dquot clusters of this many FSBs at most
70 * in a single transaction. We don't want to ask for too huge a log reservation.
71 */
72#define XFS_QM_MAX_DQCLUSTER_LOGSZ 3
73 68
74typedef xfs_dqhash_t xfs_dqlist_t; 69typedef xfs_dqhash_t xfs_dqlist_t;
75 70
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c
index bea02d786c5d..a0a829addca9 100644
--- a/fs/xfs/quota/xfs_qm_bhv.c
+++ b/fs/xfs/quota/xfs_qm_bhv.c
@@ -81,7 +81,7 @@ xfs_qm_statvfs(
81 xfs_mount_t *mp = ip->i_mount; 81 xfs_mount_t *mp = ip->i_mount;
82 xfs_dquot_t *dqp; 82 xfs_dquot_t *dqp;
83 83
84 if (!xfs_qm_dqget(mp, NULL, ip->i_d.di_projid, XFS_DQ_PROJ, 0, &dqp)) { 84 if (!xfs_qm_dqget(mp, NULL, xfs_get_projid(ip), XFS_DQ_PROJ, 0, &dqp)) {
85 xfs_fill_statvfs_from_dquot(statp, &dqp->q_core); 85 xfs_fill_statvfs_from_dquot(statp, &dqp->q_core);
86 xfs_qm_dqput(dqp); 86 xfs_qm_dqput(dqp);
87 } 87 }
@@ -119,8 +119,7 @@ xfs_qm_newmount(
119 (gquotaondisk && !XFS_IS_GQUOTA_ON(mp)) || 119 (gquotaondisk && !XFS_IS_GQUOTA_ON(mp)) ||
120 (!gquotaondisk && XFS_IS_OQUOTA_ON(mp))) && 120 (!gquotaondisk && XFS_IS_OQUOTA_ON(mp))) &&
121 xfs_dev_is_read_only(mp, "changing quota state")) { 121 xfs_dev_is_read_only(mp, "changing quota state")) {
122 cmn_err(CE_WARN, 122 xfs_warn(mp, "please mount with%s%s%s%s.",
123 "XFS: please mount with%s%s%s%s.",
124 (!quotaondisk ? "out quota" : ""), 123 (!quotaondisk ? "out quota" : ""),
125 (uquotaondisk ? " usrquota" : ""), 124 (uquotaondisk ? " usrquota" : ""),
126 (pquotaondisk ? " prjquota" : ""), 125 (pquotaondisk ? " prjquota" : ""),
@@ -135,7 +134,7 @@ xfs_qm_newmount(
135 */ 134 */
136 if (quotaondisk && !XFS_QM_NEED_QUOTACHECK(mp)) { 135 if (quotaondisk && !XFS_QM_NEED_QUOTACHECK(mp)) {
137 /* 136 /*
138 * If an error occured, qm_mount_quotas code 137 * If an error occurred, qm_mount_quotas code
139 * has already disabled quotas. So, just finish 138 * has already disabled quotas. So, just finish
140 * mounting, and get on with the boring life 139 * mounting, and get on with the boring life
141 * without disk quotas. 140 * without disk quotas.
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 45e5849df238..2dadb15d5ca9 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -41,12 +41,6 @@
41#include "xfs_qm.h" 41#include "xfs_qm.h"
42#include "xfs_trace.h" 42#include "xfs_trace.h"
43 43
44#ifdef DEBUG
45# define qdprintk(s, args...) cmn_err(CE_DEBUG, s, ## args)
46#else
47# define qdprintk(s, args...) do { } while (0)
48#endif
49
50STATIC int xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint); 44STATIC int xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint);
51STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *, 45STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *,
52 uint); 46 uint);
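The hunk above drops the local qdprintk() macro, which produced output only on DEBUG builds, in favour of the common xfs_debug() helper used throughout this patch. For context, the compile-time-switch pattern being retired looks roughly like this; the sketch below is generic, not the XFS header text.

	/* Generic sketch of a DEBUG-only print macro of the kind being removed. */
	#include <stdio.h>

	#ifdef DEBUG
	/* GNU ## extension, as in the kernel macro, to swallow a trailing comma. */
	# define dprintk(fmt, ...)	fprintf(stderr, fmt "\n", ##__VA_ARGS__)
	#else
	# define dprintk(fmt, ...)	do { } while (0)
	#endif

	int main(void)
	{
		int flags = 0x3;

		/* Compiles to nothing unless built with -DDEBUG. */
		dprintk("qtrunc flags=%x", flags);
		return 0;
	}
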
@@ -178,7 +172,7 @@ xfs_qm_scall_quotaoff(
178 /* 172 /*
179 * Next we make the changes in the quota flag in the mount struct. 173 * Next we make the changes in the quota flag in the mount struct.
180 * This isn't protected by a particular lock directly, because we 174 * This isn't protected by a particular lock directly, because we
181 * don't want to take a mrlock everytime we depend on quotas being on. 175 * don't want to take a mrlock every time we depend on quotas being on.
182 */ 176 */
183 mp->m_qflags &= ~(flags); 177 mp->m_qflags &= ~(flags);
184 178
@@ -276,7 +270,7 @@ xfs_qm_scall_trunc_qfile(
276 goto out_unlock; 270 goto out_unlock;
277 } 271 }
278 272
279 xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 273 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
280 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 274 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
281 275
282out_unlock: 276out_unlock:
@@ -294,7 +288,8 @@ xfs_qm_scall_trunc_qfiles(
294 int error = 0, error2 = 0; 288 int error = 0, error2 = 0;
295 289
296 if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0) { 290 if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0) {
297 qdprintk("qtrunc flags=%x m_qflags=%x\n", flags, mp->m_qflags); 291 xfs_debug(mp, "%s: flags=%x m_qflags=%x\n",
292 __func__, flags, mp->m_qflags);
298 return XFS_ERROR(EINVAL); 293 return XFS_ERROR(EINVAL);
299 } 294 }
300 295
@@ -318,20 +313,19 @@ xfs_qm_scall_quotaon(
318{ 313{
319 int error; 314 int error;
320 uint qf; 315 uint qf;
321 uint accflags;
322 __int64_t sbflags; 316 __int64_t sbflags;
323 317
324 flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD); 318 flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD);
325 /* 319 /*
326 * Switching on quota accounting must be done at mount time. 320 * Switching on quota accounting must be done at mount time.
327 */ 321 */
328 accflags = flags & XFS_ALL_QUOTA_ACCT;
329 flags &= ~(XFS_ALL_QUOTA_ACCT); 322 flags &= ~(XFS_ALL_QUOTA_ACCT);
330 323
331 sbflags = 0; 324 sbflags = 0;
332 325
333 if (flags == 0) { 326 if (flags == 0) {
334 qdprintk("quotaon: zero flags, m_qflags=%x\n", mp->m_qflags); 327 xfs_debug(mp, "%s: zero flags, m_qflags=%x\n",
328 __func__, mp->m_qflags);
335 return XFS_ERROR(EINVAL); 329 return XFS_ERROR(EINVAL);
336 } 330 }
337 331
@@ -352,12 +346,13 @@ xfs_qm_scall_quotaon(
352 (flags & XFS_GQUOTA_ACCT) == 0 && 346 (flags & XFS_GQUOTA_ACCT) == 0 &&
353 (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 && 347 (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 &&
354 (flags & XFS_OQUOTA_ENFD))) { 348 (flags & XFS_OQUOTA_ENFD))) {
355 qdprintk("Can't enforce without acct, flags=%x sbflags=%x\n", 349 xfs_debug(mp,
356 flags, mp->m_sb.sb_qflags); 350 "%s: Can't enforce without acct, flags=%x sbflags=%x\n",
351 __func__, flags, mp->m_sb.sb_qflags);
357 return XFS_ERROR(EINVAL); 352 return XFS_ERROR(EINVAL);
358 } 353 }
359 /* 354 /*
360 * If everything's upto-date incore, then don't waste time. 355 * If everything's up to-date incore, then don't waste time.
361 */ 356 */
362 if ((mp->m_qflags & flags) == flags) 357 if ((mp->m_qflags & flags) == flags)
363 return XFS_ERROR(EEXIST); 358 return XFS_ERROR(EEXIST);
@@ -541,7 +536,7 @@ xfs_qm_scall_setqlim(
541 q->qi_bsoftlimit = soft; 536 q->qi_bsoftlimit = soft;
542 } 537 }
543 } else { 538 } else {
544 qdprintk("blkhard %Ld < blksoft %Ld\n", hard, soft); 539 xfs_debug(mp, "blkhard %Ld < blksoft %Ld\n", hard, soft);
545 } 540 }
546 hard = (newlim->d_fieldmask & FS_DQ_RTBHARD) ? 541 hard = (newlim->d_fieldmask & FS_DQ_RTBHARD) ?
547 (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_rtb_hardlimit) : 542 (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_rtb_hardlimit) :
@@ -557,7 +552,7 @@ xfs_qm_scall_setqlim(
557 q->qi_rtbsoftlimit = soft; 552 q->qi_rtbsoftlimit = soft;
558 } 553 }
559 } else { 554 } else {
560 qdprintk("rtbhard %Ld < rtbsoft %Ld\n", hard, soft); 555 xfs_debug(mp, "rtbhard %Ld < rtbsoft %Ld\n", hard, soft);
561 } 556 }
562 557
563 hard = (newlim->d_fieldmask & FS_DQ_IHARD) ? 558 hard = (newlim->d_fieldmask & FS_DQ_IHARD) ?
@@ -574,7 +569,7 @@ xfs_qm_scall_setqlim(
574 q->qi_isoftlimit = soft; 569 q->qi_isoftlimit = soft;
575 } 570 }
576 } else { 571 } else {
577 qdprintk("ihard %Ld < isoft %Ld\n", hard, soft); 572 xfs_debug(mp, "ihard %Ld < isoft %Ld\n", hard, soft);
578 } 573 }
579 574
580 /* 575 /*
@@ -875,21 +870,14 @@ xfs_dqrele_inode(
875 struct xfs_perag *pag, 870 struct xfs_perag *pag,
876 int flags) 871 int flags)
877{ 872{
878 int error;
879
880 /* skip quota inodes */ 873 /* skip quota inodes */
881 if (ip == ip->i_mount->m_quotainfo->qi_uquotaip || 874 if (ip == ip->i_mount->m_quotainfo->qi_uquotaip ||
882 ip == ip->i_mount->m_quotainfo->qi_gquotaip) { 875 ip == ip->i_mount->m_quotainfo->qi_gquotaip) {
883 ASSERT(ip->i_udquot == NULL); 876 ASSERT(ip->i_udquot == NULL);
884 ASSERT(ip->i_gdquot == NULL); 877 ASSERT(ip->i_gdquot == NULL);
885 read_unlock(&pag->pag_ici_lock);
886 return 0; 878 return 0;
887 } 879 }
888 880
889 error = xfs_sync_inode_valid(ip, pag);
890 if (error)
891 return error;
892
893 xfs_ilock(ip, XFS_ILOCK_EXCL); 881 xfs_ilock(ip, XFS_ILOCK_EXCL);
894 if ((flags & XFS_UQUOTA_ACCT) && ip->i_udquot) { 882 if ((flags & XFS_UQUOTA_ACCT) && ip->i_udquot) {
895 xfs_qm_dqrele(ip->i_udquot); 883 xfs_qm_dqrele(ip->i_udquot);
@@ -900,8 +888,6 @@ xfs_dqrele_inode(
900 ip->i_gdquot = NULL; 888 ip->i_gdquot = NULL;
901 } 889 }
902 xfs_iunlock(ip, XFS_ILOCK_EXCL); 890 xfs_iunlock(ip, XFS_ILOCK_EXCL);
903
904 IRELE(ip);
905 return 0; 891 return 0;
906} 892}
907 893
@@ -918,8 +904,7 @@ xfs_qm_dqrele_all_inodes(
918 uint flags) 904 uint flags)
919{ 905{
920 ASSERT(mp->m_quotainfo); 906 ASSERT(mp->m_quotainfo);
921 xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, 907 xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags);
922 XFS_ICI_NO_TAG, 0, NULL);
923} 908}
924 909
925/*------------------------------------------------------------------------*/ 910/*------------------------------------------------------------------------*/
@@ -949,10 +934,11 @@ struct mutex qcheck_lock;
949#define DQTEST_LIST_PRINT(l, NXT, title) \ 934#define DQTEST_LIST_PRINT(l, NXT, title) \
950{ \ 935{ \
951 xfs_dqtest_t *dqp; int i = 0;\ 936 xfs_dqtest_t *dqp; int i = 0;\
952 cmn_err(CE_DEBUG, "%s (#%d)", title, (int) (l)->qh_nelems); \ 937 xfs_debug(NULL, "%s (#%d)", title, (int) (l)->qh_nelems); \
953 for (dqp = (xfs_dqtest_t *)(l)->qh_next; dqp != NULL; \ 938 for (dqp = (xfs_dqtest_t *)(l)->qh_next; dqp != NULL; \
954 dqp = (xfs_dqtest_t *)dqp->NXT) { \ 939 dqp = (xfs_dqtest_t *)dqp->NXT) { \
955 cmn_err(CE_DEBUG, " %d. \"%d (%s)\" bcnt = %d, icnt = %d", \ 940 xfs_debug(dqp->q_mount, \
941 " %d. \"%d (%s)\" bcnt = %d, icnt = %d", \
956 ++i, dqp->d_id, DQFLAGTO_TYPESTR(dqp), \ 942 ++i, dqp->d_id, DQFLAGTO_TYPESTR(dqp), \
957 dqp->d_bcount, dqp->d_icount); } \ 943 dqp->d_bcount, dqp->d_icount); } \
958} 944}
@@ -976,16 +962,17 @@ xfs_qm_hashinsert(xfs_dqhash_t *h, xfs_dqtest_t *dqp)
976} 962}
977STATIC void 963STATIC void
978xfs_qm_dqtest_print( 964xfs_qm_dqtest_print(
979 xfs_dqtest_t *d) 965 struct xfs_mount *mp,
966 struct dqtest *d)
980{ 967{
981 cmn_err(CE_DEBUG, "-----------DQTEST DQUOT----------------"); 968 xfs_debug(mp, "-----------DQTEST DQUOT----------------");
982 cmn_err(CE_DEBUG, "---- dquot ID = %d", d->d_id); 969 xfs_debug(mp, "---- dquot ID = %d", d->d_id);
983 cmn_err(CE_DEBUG, "---- fs = 0x%p", d->q_mount); 970 xfs_debug(mp, "---- fs = 0x%p", d->q_mount);
984 cmn_err(CE_DEBUG, "---- bcount = %Lu (0x%x)", 971 xfs_debug(mp, "---- bcount = %Lu (0x%x)",
985 d->d_bcount, (int)d->d_bcount); 972 d->d_bcount, (int)d->d_bcount);
986 cmn_err(CE_DEBUG, "---- icount = %Lu (0x%x)", 973 xfs_debug(mp, "---- icount = %Lu (0x%x)",
987 d->d_icount, (int)d->d_icount); 974 d->d_icount, (int)d->d_icount);
988 cmn_err(CE_DEBUG, "---------------------------"); 975 xfs_debug(mp, "---------------------------");
989} 976}
990 977
991STATIC void 978STATIC void
@@ -999,12 +986,14 @@ xfs_qm_dqtest_failed(
999{ 986{
1000 qmtest_nfails++; 987 qmtest_nfails++;
1001 if (error) 988 if (error)
1002 cmn_err(CE_DEBUG, "quotacheck failed id=%d, err=%d\nreason: %s", 989 xfs_debug(dqp->q_mount,
1003 d->d_id, error, reason); 990 "quotacheck failed id=%d, err=%d\nreason: %s",
991 d->d_id, error, reason);
1004 else 992 else
1005 cmn_err(CE_DEBUG, "quotacheck failed id=%d (%s) [%d != %d]", 993 xfs_debug(dqp->q_mount,
1006 d->d_id, reason, (int)a, (int)b); 994 "quotacheck failed id=%d (%s) [%d != %d]",
1007 xfs_qm_dqtest_print(d); 995 d->d_id, reason, (int)a, (int)b);
996 xfs_qm_dqtest_print(dqp->q_mount, d);
1008 if (dqp) 997 if (dqp)
1009 xfs_qm_dqprint(dqp); 998 xfs_qm_dqprint(dqp);
1010} 999}
@@ -1031,9 +1020,9 @@ xfs_dqtest_cmp2(
1031 be64_to_cpu(dqp->q_core.d_bcount) >= 1020 be64_to_cpu(dqp->q_core.d_bcount) >=
1032 be64_to_cpu(dqp->q_core.d_blk_softlimit)) { 1021 be64_to_cpu(dqp->q_core.d_blk_softlimit)) {
1033 if (!dqp->q_core.d_btimer && dqp->q_core.d_id) { 1022 if (!dqp->q_core.d_btimer && dqp->q_core.d_id) {
1034 cmn_err(CE_DEBUG, 1023 xfs_debug(dqp->q_mount,
1035 "%d [%s] [0x%p] BLK TIMER NOT STARTED", 1024 "%d [%s] BLK TIMER NOT STARTED",
1036 d->d_id, DQFLAGTO_TYPESTR(d), d->q_mount); 1025 d->d_id, DQFLAGTO_TYPESTR(d));
1037 err++; 1026 err++;
1038 } 1027 }
1039 } 1028 }
@@ -1041,16 +1030,16 @@ xfs_dqtest_cmp2(
1041 be64_to_cpu(dqp->q_core.d_icount) >= 1030 be64_to_cpu(dqp->q_core.d_icount) >=
1042 be64_to_cpu(dqp->q_core.d_ino_softlimit)) { 1031 be64_to_cpu(dqp->q_core.d_ino_softlimit)) {
1043 if (!dqp->q_core.d_itimer && dqp->q_core.d_id) { 1032 if (!dqp->q_core.d_itimer && dqp->q_core.d_id) {
1044 cmn_err(CE_DEBUG, 1033 xfs_debug(dqp->q_mount,
1045 "%d [%s] [0x%p] INO TIMER NOT STARTED", 1034 "%d [%s] INO TIMER NOT STARTED",
1046 d->d_id, DQFLAGTO_TYPESTR(d), d->q_mount); 1035 d->d_id, DQFLAGTO_TYPESTR(d));
1047 err++; 1036 err++;
1048 } 1037 }
1049 } 1038 }
1050#ifdef QUOTADEBUG 1039#ifdef QUOTADEBUG
1051 if (!err) { 1040 if (!err) {
1052 cmn_err(CE_DEBUG, "%d [%s] [0x%p] qchecked", 1041 xfs_debug(dqp->q_mount, "%d [%s] qchecked",
1053 d->d_id, DQFLAGTO_TYPESTR(d), d->q_mount); 1042 d->d_id, DQFLAGTO_TYPESTR(d));
1054 } 1043 }
1055#endif 1044#endif
1056 return (err); 1045 return (err);
@@ -1147,8 +1136,8 @@ xfs_qm_internalqcheck_adjust(
1147 1136
1148 if (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino) { 1137 if (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino) {
1149 *res = BULKSTAT_RV_NOTHING; 1138 *res = BULKSTAT_RV_NOTHING;
1150 qdprintk("internalqcheck: ino=%llu, uqino=%llu, gqino=%llu\n", 1139 xfs_debug(mp, "%s: ino=%llu, uqino=%llu, gqino=%llu\n",
1151 (unsigned long long) ino, 1140 __func__, (unsigned long long) ino,
1152 (unsigned long long) mp->m_sb.sb_uquotino, 1141 (unsigned long long) mp->m_sb.sb_uquotino,
1153 (unsigned long long) mp->m_sb.sb_gquotino); 1142 (unsigned long long) mp->m_sb.sb_gquotino);
1154 return XFS_ERROR(EINVAL); 1143 return XFS_ERROR(EINVAL);
@@ -1175,7 +1164,7 @@ xfs_qm_internalqcheck_adjust(
1175 } 1164 }
1176 xfs_qm_internalqcheck_get_dquots(mp, 1165 xfs_qm_internalqcheck_get_dquots(mp,
1177 (xfs_dqid_t) ip->i_d.di_uid, 1166 (xfs_dqid_t) ip->i_d.di_uid,
1178 (xfs_dqid_t) ip->i_d.di_projid, 1167 (xfs_dqid_t) xfs_get_projid(ip),
1179 (xfs_dqid_t) ip->i_d.di_gid, 1168 (xfs_dqid_t) ip->i_d.di_gid,
1180 &ud, &gd); 1169 &ud, &gd);
1181 if (XFS_IS_UQUOTA_ON(mp)) { 1170 if (XFS_IS_UQUOTA_ON(mp)) {
@@ -1233,12 +1222,12 @@ xfs_qm_internalqcheck(
1233 xfs_qm_internalqcheck_adjust, 1222 xfs_qm_internalqcheck_adjust,
1234 0, NULL, &done); 1223 0, NULL, &done);
1235 if (error) { 1224 if (error) {
1236 cmn_err(CE_DEBUG, "Bulkstat returned error 0x%x", error); 1225 xfs_debug(mp, "Bulkstat returned error 0x%x", error);
1237 break; 1226 break;
1238 } 1227 }
1239 } while (!done); 1228 } while (!done);
1240 1229
1241 cmn_err(CE_DEBUG, "Checking results against system dquots"); 1230 xfs_debug(mp, "Checking results against system dquots");
1242 for (i = 0; i < qmtest_hashmask; i++) { 1231 for (i = 0; i < qmtest_hashmask; i++) {
1243 xfs_dqtest_t *d, *n; 1232 xfs_dqtest_t *d, *n;
1244 xfs_dqhash_t *h; 1233 xfs_dqhash_t *h;
@@ -1256,10 +1245,10 @@ xfs_qm_internalqcheck(
1256 } 1245 }
1257 1246
1258 if (qmtest_nfails) { 1247 if (qmtest_nfails) {
1259 cmn_err(CE_DEBUG, "******** quotacheck failed ********"); 1248 xfs_debug(mp, "******** quotacheck failed ********");
1260 cmn_err(CE_DEBUG, "failures = %d", qmtest_nfails); 1249 xfs_debug(mp, "failures = %d", qmtest_nfails);
1261 } else { 1250 } else {
1262 cmn_err(CE_DEBUG, "******** quotacheck successful! ********"); 1251 xfs_debug(mp, "******** quotacheck successful! ********");
1263 } 1252 }
1264 kmem_free(qmtest_udqtab); 1253 kmem_free(qmtest_udqtab);
1265 kmem_free(qmtest_gdqtab); 1254 kmem_free(qmtest_gdqtab);
diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c
index 7de91d1b75c0..2a3648731331 100644
--- a/fs/xfs/quota/xfs_trans_dquot.c
+++ b/fs/xfs/quota/xfs_trans_dquot.c
@@ -643,8 +643,9 @@ xfs_trans_dqresv(
643 (XFS_IS_OQUOTA_ENFORCED(dqp->q_mount) && 643 (XFS_IS_OQUOTA_ENFORCED(dqp->q_mount) &&
644 (XFS_QM_ISPDQ(dqp) || XFS_QM_ISGDQ(dqp))))) { 644 (XFS_QM_ISPDQ(dqp) || XFS_QM_ISGDQ(dqp))))) {
645#ifdef QUOTADEBUG 645#ifdef QUOTADEBUG
646 cmn_err(CE_DEBUG, "BLK Res: nblks=%ld + resbcount=%Ld" 646 xfs_debug(mp,
647 " > hardlimit=%Ld?", nblks, *resbcountp, hardlimit); 647 "BLK Res: nblks=%ld + resbcount=%Ld > hardlimit=%Ld?",
648 nblks, *resbcountp, hardlimit);
648#endif 649#endif
649 if (nblks > 0) { 650 if (nblks > 0) {
650 /* 651 /*
diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c
deleted file mode 100644
index 975aa10e1a47..000000000000
--- a/fs/xfs/support/debug.c
+++ /dev/null
@@ -1,115 +0,0 @@
1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include <xfs.h>
19#include "debug.h"
20
21/* xfs_mount.h drags a lot of crap in, sorry.. */
22#include "xfs_sb.h"
23#include "xfs_inum.h"
24#include "xfs_ag.h"
25#include "xfs_mount.h"
26#include "xfs_error.h"
27
28static char message[1024]; /* keep it off the stack */
29static DEFINE_SPINLOCK(xfs_err_lock);
30
31/* Translate from CE_FOO to KERN_FOO, err_level(CE_FOO) == KERN_FOO */
32#define XFS_MAX_ERR_LEVEL 7
33#define XFS_ERR_MASK ((1 << 3) - 1)
34static const char * const err_level[XFS_MAX_ERR_LEVEL+1] =
35 {KERN_EMERG, KERN_ALERT, KERN_CRIT,
36 KERN_ERR, KERN_WARNING, KERN_NOTICE,
37 KERN_INFO, KERN_DEBUG};
38
39void
40cmn_err(register int level, char *fmt, ...)
41{
42 char *fp = fmt;
43 int len;
44 ulong flags;
45 va_list ap;
46
47 level &= XFS_ERR_MASK;
48 if (level > XFS_MAX_ERR_LEVEL)
49 level = XFS_MAX_ERR_LEVEL;
50 spin_lock_irqsave(&xfs_err_lock,flags);
51 va_start(ap, fmt);
52 if (*fmt == '!') fp++;
53 len = vsnprintf(message, sizeof(message), fp, ap);
54 if (len >= sizeof(message))
55 len = sizeof(message) - 1;
56 if (message[len-1] == '\n')
57 message[len-1] = 0;
58 printk("%s%s\n", err_level[level], message);
59 va_end(ap);
60 spin_unlock_irqrestore(&xfs_err_lock,flags);
61 BUG_ON(level == CE_PANIC);
62}
63
64void
65xfs_fs_vcmn_err(
66 int level,
67 struct xfs_mount *mp,
68 char *fmt,
69 va_list ap)
70{
71 unsigned long flags;
72 int len = 0;
73
74 level &= XFS_ERR_MASK;
75 if (level > XFS_MAX_ERR_LEVEL)
76 level = XFS_MAX_ERR_LEVEL;
77
78 spin_lock_irqsave(&xfs_err_lock,flags);
79
80 if (mp) {
81 len = sprintf(message, "Filesystem \"%s\": ", mp->m_fsname);
82
83 /*
84 * Skip the printk if we can't print anything useful
85 * due to an over-long device name.
86 */
87 if (len >= sizeof(message))
88 goto out;
89 }
90
91 len = vsnprintf(message + len, sizeof(message) - len, fmt, ap);
92 if (len >= sizeof(message))
93 len = sizeof(message) - 1;
94 if (message[len-1] == '\n')
95 message[len-1] = 0;
96
97 printk("%s%s\n", err_level[level], message);
98 out:
99 spin_unlock_irqrestore(&xfs_err_lock,flags);
100
101 BUG_ON(level == CE_PANIC);
102}
103
104void
105assfail(char *expr, char *file, int line)
106{
107 printk("Assertion failed: %s, file: %s, line: %d\n", expr, file, line);
108 BUG();
109}
110
111void
112xfs_hex_dump(void *p, int length)
113{
114 print_hex_dump(KERN_ALERT, "", DUMP_PREFIX_ADDRESS, 16, 1, p, length, 1);
115}
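With support/debug.c deleted, the cmn_err()/xfs_fs_vcmn_err() pair above is gone; the xfs_alert()/xfs_warn()/xfs_notice()/xfs_debug() calls used throughout this patch come from the new fs/xfs/linux-2.6/xfs_message.c listed in the diffstat. The sketch below is only a rough userspace approximation of what such a per-filesystem helper does (prefix each message with the filesystem name at a given level); it makes no claim to match the real implementation, and fs_printk/fs_warn/fs_notice are invented names.

	/* Rough approximation of a per-filesystem, per-level message helper. */
	#include <stdarg.h>
	#include <stdio.h>

	struct mount_stub { const char *m_fsname; };

	static void fs_printk(const char *level, const struct mount_stub *mp,
			      const char *fmt, ...)
	{
		va_list ap;

		/* One prefix in one place, instead of "%s" + mp->m_fsname at every call. */
		fprintf(stderr, "%sXFS (%s): ", level, mp ? mp->m_fsname : "?");
		va_start(ap, fmt);
		vfprintf(stderr, fmt, ap);
		va_end(ap);
		fputc('\n', stderr);
	}

	#define fs_warn(mp, fmt, ...)	fs_printk("<4>", mp, fmt, ##__VA_ARGS__)
	#define fs_notice(mp, fmt, ...)	fs_printk("<5>", mp, fmt, ##__VA_ARGS__)

	int main(void)
	{
		struct mount_stub mp = { .m_fsname = "sda1" };

		fs_notice(&mp, "Quotacheck needed: Please wait.");
		fs_warn(&mp, "%s: dquot %p flush failed", __func__, (void *)0);
		return 0;
	}
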
diff --git a/fs/xfs/support/debug.h b/fs/xfs/support/debug.h
deleted file mode 100644
index d2d20462fd4f..000000000000
--- a/fs/xfs/support/debug.h
+++ /dev/null
@@ -1,54 +0,0 @@
1/*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_SUPPORT_DEBUG_H__
19#define __XFS_SUPPORT_DEBUG_H__
20
21#include <stdarg.h>
22
23#define CE_DEBUG 7 /* debug */
24#define CE_CONT 6 /* continuation */
25#define CE_NOTE 5 /* notice */
26#define CE_WARN 4 /* warning */
27#define CE_ALERT 1 /* alert */
28#define CE_PANIC 0 /* panic */
29
30extern void cmn_err(int, char *, ...)
31 __attribute__ ((format (printf, 2, 3)));
32extern void assfail(char *expr, char *f, int l);
33
34#define ASSERT_ALWAYS(expr) \
35 (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
36
37#ifndef DEBUG
38#define ASSERT(expr) ((void)0)
39
40#ifndef STATIC
41# define STATIC static noinline
42#endif
43
44#else /* DEBUG */
45
46#define ASSERT(expr) \
47 (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
48
49#ifndef STATIC
50# define STATIC noinline
51#endif
52
53#endif /* DEBUG */
54#endif /* __XFS_SUPPORT_DEBUG_H__ */
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index 0135e2a669d7..11dd72070cbb 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -42,7 +42,7 @@ struct xfs_acl {
42#define SGI_ACL_DEFAULT_SIZE (sizeof(SGI_ACL_DEFAULT)-1) 42#define SGI_ACL_DEFAULT_SIZE (sizeof(SGI_ACL_DEFAULT)-1)
43 43
44#ifdef CONFIG_XFS_POSIX_ACL 44#ifdef CONFIG_XFS_POSIX_ACL
45extern int xfs_check_acl(struct inode *inode, int mask); 45extern int xfs_check_acl(struct inode *inode, int mask, unsigned int flags);
46extern struct posix_acl *xfs_get_acl(struct inode *inode, int type); 46extern struct posix_acl *xfs_get_acl(struct inode *inode, int type);
47extern int xfs_inherit_acl(struct inode *inode, struct posix_acl *default_acl); 47extern int xfs_inherit_acl(struct inode *inode, struct posix_acl *default_acl);
48extern int xfs_acl_chmod(struct inode *inode); 48extern int xfs_acl_chmod(struct inode *inode);
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 4917d4eed4ed..6530769a999b 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -187,7 +187,9 @@ struct xfs_busy_extent {
187 xfs_agnumber_t agno; 187 xfs_agnumber_t agno;
188 xfs_agblock_t bno; 188 xfs_agblock_t bno;
189 xfs_extlen_t length; 189 xfs_extlen_t length;
190 xlog_tid_t tid; /* transaction that created this */ 190 unsigned int flags;
191#define XFS_ALLOC_BUSY_DISCARDED 0x01 /* undergoing a discard op. */
192#define XFS_ALLOC_BUSY_SKIP_DISCARD 0x02 /* do not discard */
191}; 193};
192 194
193/* 195/*
@@ -227,9 +229,18 @@ typedef struct xfs_perag {
227 229
228 atomic_t pagf_fstrms; /* # of filestreams active in this AG */ 230 atomic_t pagf_fstrms; /* # of filestreams active in this AG */
229 231
230 rwlock_t pag_ici_lock; /* incore inode lock */ 232 spinlock_t pag_ici_lock; /* incore inode cache lock */
231 struct radix_tree_root pag_ici_root; /* incore inode cache root */ 233 struct radix_tree_root pag_ici_root; /* incore inode cache root */
232 int pag_ici_reclaimable; /* reclaimable inodes */ 234 int pag_ici_reclaimable; /* reclaimable inodes */
235 struct mutex pag_ici_reclaim_lock; /* serialisation point */
236 unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */
237
238 /* buffer cache index */
239 spinlock_t pag_buf_lock; /* lock for pag_buf_tree */
240 struct rb_root pag_buf_tree; /* ordered tree of active buffers */
241
242 /* for rcu-safe freeing */
243 struct rcu_head rcu_head;
233#endif 244#endif
234 int pagb_count; /* pagb slots in use */ 245 int pagb_count; /* pagb slots in use */
235} xfs_perag_t; 246} xfs_perag_t;
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index af168faccc7a..95862bbff56b 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -41,23 +41,13 @@
41#define XFSA_FIXUP_BNO_OK 1 41#define XFSA_FIXUP_BNO_OK 1
42#define XFSA_FIXUP_CNT_OK 2 42#define XFSA_FIXUP_CNT_OK 2
43 43
44static int
45xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno,
46 xfs_agblock_t bno, xfs_extlen_t len);
47
48/*
49 * Prototypes for per-ag allocation routines
50 */
51
52STATIC int xfs_alloc_ag_vextent_exact(xfs_alloc_arg_t *); 44STATIC int xfs_alloc_ag_vextent_exact(xfs_alloc_arg_t *);
53STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *); 45STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *);
54STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *); 46STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *);
55STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *, 47STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *,
56 xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *); 48 xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *);
57 49STATIC void xfs_alloc_busy_trim(struct xfs_alloc_arg *,
58/* 50 xfs_agblock_t, xfs_extlen_t, xfs_agblock_t *, xfs_extlen_t *);
59 * Internal functions.
60 */
61 51
62/* 52/*
63 * Lookup the record equal to [bno, len] in the btree given by cur. 53 * Lookup the record equal to [bno, len] in the btree given by cur.
@@ -94,7 +84,7 @@ xfs_alloc_lookup_ge(
94 * Lookup the first record less than or equal to [bno, len] 84 * Lookup the first record less than or equal to [bno, len]
95 * in the btree given by cur. 85 * in the btree given by cur.
96 */ 86 */
97STATIC int /* error */ 87int /* error */
98xfs_alloc_lookup_le( 88xfs_alloc_lookup_le(
99 struct xfs_btree_cur *cur, /* btree cursor */ 89 struct xfs_btree_cur *cur, /* btree cursor */
100 xfs_agblock_t bno, /* starting block of extent */ 90 xfs_agblock_t bno, /* starting block of extent */
@@ -127,7 +117,7 @@ xfs_alloc_update(
127/* 117/*
128 * Get the data from the pointed-to record. 118 * Get the data from the pointed-to record.
129 */ 119 */
130STATIC int /* error */ 120int /* error */
131xfs_alloc_get_rec( 121xfs_alloc_get_rec(
132 struct xfs_btree_cur *cur, /* btree cursor */ 122 struct xfs_btree_cur *cur, /* btree cursor */
133 xfs_agblock_t *bno, /* output: starting block of extent */ 123 xfs_agblock_t *bno, /* output: starting block of extent */
@@ -151,27 +141,28 @@ xfs_alloc_get_rec(
151 */ 141 */
152STATIC void 142STATIC void
153xfs_alloc_compute_aligned( 143xfs_alloc_compute_aligned(
144 xfs_alloc_arg_t *args, /* allocation argument structure */
154 xfs_agblock_t foundbno, /* starting block in found extent */ 145 xfs_agblock_t foundbno, /* starting block in found extent */
155 xfs_extlen_t foundlen, /* length in found extent */ 146 xfs_extlen_t foundlen, /* length in found extent */
156 xfs_extlen_t alignment, /* alignment for allocation */
157 xfs_extlen_t minlen, /* minimum length for allocation */
158 xfs_agblock_t *resbno, /* result block number */ 147 xfs_agblock_t *resbno, /* result block number */
159 xfs_extlen_t *reslen) /* result length */ 148 xfs_extlen_t *reslen) /* result length */
160{ 149{
161 xfs_agblock_t bno; 150 xfs_agblock_t bno;
162 xfs_extlen_t diff;
163 xfs_extlen_t len; 151 xfs_extlen_t len;
164 152
165 if (alignment > 1 && foundlen >= minlen) { 153 /* Trim busy sections out of found extent */
166 bno = roundup(foundbno, alignment); 154 xfs_alloc_busy_trim(args, foundbno, foundlen, &bno, &len);
167 diff = bno - foundbno; 155
168 len = diff >= foundlen ? 0 : foundlen - diff; 156 if (args->alignment > 1 && len >= args->minlen) {
157 xfs_agblock_t aligned_bno = roundup(bno, args->alignment);
158 xfs_extlen_t diff = aligned_bno - bno;
159
160 *resbno = aligned_bno;
161 *reslen = diff >= len ? 0 : len - diff;
169 } else { 162 } else {
170 bno = foundbno; 163 *resbno = bno;
171 len = foundlen; 164 *reslen = len;
172 } 165 }
173 *resbno = bno;
174 *reslen = len;
175} 166}
176 167
177/* 168/*
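The new xfs_alloc_compute_aligned() above first trims busy ranges out of the found extent and only then applies the alignment round-up, returning a zero length when the rounding consumes the whole extent. The alignment arithmetic is easy to check in isolation; below is a small standalone version of just that part (the busy-extent trimming is left out, and the types are local stand-ins).

	/* Standalone version of the alignment round-up from the new code path. */
	#include <stdio.h>

	typedef unsigned int agblock_t;
	typedef unsigned int extlen_t;

	static agblock_t roundup_to(agblock_t x, extlen_t align)
	{
		return ((x + align - 1) / align) * align;
	}

	static void compute_aligned(agblock_t bno, extlen_t len, extlen_t alignment,
				    extlen_t minlen, agblock_t *resbno, extlen_t *reslen)
	{
		if (alignment > 1 && len >= minlen) {
			agblock_t aligned_bno = roundup_to(bno, alignment);
			extlen_t diff = aligned_bno - bno;

			*resbno = aligned_bno;
			/* everything before the aligned start is unusable */
			*reslen = diff >= len ? 0 : len - diff;
		} else {
			*resbno = bno;
			*reslen = len;
		}
	}

	int main(void)
	{
		agblock_t bno;
		extlen_t len;

		compute_aligned(10, 20, 8, 4, &bno, &len);	/* -> bno=16, len=14 */
		printf("bno=%u len=%u\n", bno, len);
		compute_aligned(10, 3, 8, 4, &bno, &len);	/* too short: unchanged */
		printf("bno=%u len=%u\n", bno, len);
		return 0;
	}
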
@@ -285,7 +276,6 @@ xfs_alloc_fix_minleft(
285 return 1; 276 return 1;
286 agf = XFS_BUF_TO_AGF(args->agbp); 277 agf = XFS_BUF_TO_AGF(args->agbp);
287 diff = be32_to_cpu(agf->agf_freeblks) 278 diff = be32_to_cpu(agf->agf_freeblks)
288 + be32_to_cpu(agf->agf_flcount)
289 - args->len - args->minleft; 279 - args->len - args->minleft;
290 if (diff >= 0) 280 if (diff >= 0)
291 return 1; 281 return 1;
@@ -468,6 +458,27 @@ xfs_alloc_read_agfl(
468 return 0; 458 return 0;
469} 459}
470 460
461STATIC int
462xfs_alloc_update_counters(
463 struct xfs_trans *tp,
464 struct xfs_perag *pag,
465 struct xfs_buf *agbp,
466 long len)
467{
468 struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
469
470 pag->pagf_freeblks += len;
471 be32_add_cpu(&agf->agf_freeblks, len);
472
473 xfs_trans_agblocks_delta(tp, len);
474 if (unlikely(be32_to_cpu(agf->agf_freeblks) >
475 be32_to_cpu(agf->agf_length)))
476 return EFSCORRUPTED;
477
478 xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS);
479 return 0;
480}
481
471/* 482/*
472 * Allocation group level functions. 483 * Allocation group level functions.
473 */ 484 */
@@ -509,49 +520,36 @@ xfs_alloc_ag_vextent(
509 ASSERT(0); 520 ASSERT(0);
510 /* NOTREACHED */ 521 /* NOTREACHED */
511 } 522 }
512 if (error) 523
524 if (error || args->agbno == NULLAGBLOCK)
513 return error; 525 return error;
514 /*
515 * If the allocation worked, need to change the agf structure
516 * (and log it), and the superblock.
517 */
518 if (args->agbno != NULLAGBLOCK) {
519 xfs_agf_t *agf; /* allocation group freelist header */
520 long slen = (long)args->len;
521 526
522 ASSERT(args->len >= args->minlen && args->len <= args->maxlen); 527 ASSERT(args->len >= args->minlen);
523 ASSERT(!(args->wasfromfl) || !args->isfl); 528 ASSERT(args->len <= args->maxlen);
524 ASSERT(args->agbno % args->alignment == 0); 529 ASSERT(!args->wasfromfl || !args->isfl);
525 if (!(args->wasfromfl)) { 530 ASSERT(args->agbno % args->alignment == 0);
526 531
527 agf = XFS_BUF_TO_AGF(args->agbp); 532 if (!args->wasfromfl) {
528 be32_add_cpu(&agf->agf_freeblks, -(args->len)); 533 error = xfs_alloc_update_counters(args->tp, args->pag,
529 xfs_trans_agblocks_delta(args->tp, 534 args->agbp,
530 -((long)(args->len))); 535 -((long)(args->len)));
531 args->pag->pagf_freeblks -= args->len; 536 if (error)
532 ASSERT(be32_to_cpu(agf->agf_freeblks) <= 537 return error;
533 be32_to_cpu(agf->agf_length)); 538
534 xfs_alloc_log_agf(args->tp, args->agbp, 539 ASSERT(!xfs_alloc_busy_search(args->mp, args->agno,
535 XFS_AGF_FREEBLKS); 540 args->agbno, args->len));
536 /*
537 * Search the busylist for these blocks and mark the
538 * transaction as synchronous if blocks are found. This
539 * avoids the need to block due to a synchronous log
540 * force to ensure correct ordering as the synchronous
541 * transaction will guarantee that for us.
542 */
543 if (xfs_alloc_busy_search(args->mp, args->agno,
544 args->agbno, args->len))
545 xfs_trans_set_sync(args->tp);
546 }
547 if (!args->isfl)
548 xfs_trans_mod_sb(args->tp,
549 args->wasdel ? XFS_TRANS_SB_RES_FDBLOCKS :
550 XFS_TRANS_SB_FDBLOCKS, -slen);
551 XFS_STATS_INC(xs_allocx);
552 XFS_STATS_ADD(xs_allocb, args->len);
553 } 541 }
554 return 0; 542
543 if (!args->isfl) {
544 xfs_trans_mod_sb(args->tp, args->wasdel ?
545 XFS_TRANS_SB_RES_FDBLOCKS :
546 XFS_TRANS_SB_FDBLOCKS,
547 -((long)(args->len)));
548 }
549
550 XFS_STATS_INC(xs_allocx);
551 XFS_STATS_ADD(xs_allocb, args->len);
552 return error;
555} 553}
556 554
557/* 555/*
@@ -566,72 +564,77 @@ xfs_alloc_ag_vextent_exact(
566{ 564{
567 xfs_btree_cur_t *bno_cur;/* by block-number btree cursor */ 565 xfs_btree_cur_t *bno_cur;/* by block-number btree cursor */
568 xfs_btree_cur_t *cnt_cur;/* by count btree cursor */ 566 xfs_btree_cur_t *cnt_cur;/* by count btree cursor */
569 xfs_agblock_t end; /* end of allocated extent */
570 int error; 567 int error;
571 xfs_agblock_t fbno; /* start block of found extent */ 568 xfs_agblock_t fbno; /* start block of found extent */
572 xfs_agblock_t fend; /* end block of found extent */
573 xfs_extlen_t flen; /* length of found extent */ 569 xfs_extlen_t flen; /* length of found extent */
570 xfs_agblock_t tbno; /* start block of trimmed extent */
571 xfs_extlen_t tlen; /* length of trimmed extent */
572 xfs_agblock_t tend; /* end block of trimmed extent */
573 xfs_agblock_t end; /* end of allocated extent */
574 int i; /* success/failure of operation */ 574 int i; /* success/failure of operation */
575 xfs_agblock_t maxend; /* end of maximal extent */
576 xfs_agblock_t minend; /* end of minimal extent */
577 xfs_extlen_t rlen; /* length of returned extent */ 575 xfs_extlen_t rlen; /* length of returned extent */
578 576
579 ASSERT(args->alignment == 1); 577 ASSERT(args->alignment == 1);
578
580 /* 579 /*
581 * Allocate/initialize a cursor for the by-number freespace btree. 580 * Allocate/initialize a cursor for the by-number freespace btree.
582 */ 581 */
583 bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, 582 bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
584 args->agno, XFS_BTNUM_BNO); 583 args->agno, XFS_BTNUM_BNO);
584
585 /* 585 /*
586 * Lookup bno and minlen in the btree (minlen is irrelevant, really). 586 * Lookup bno and minlen in the btree (minlen is irrelevant, really).
587 * Look for the closest free block <= bno, it must contain bno 587 * Look for the closest free block <= bno, it must contain bno
588 * if any free block does. 588 * if any free block does.
589 */ 589 */
590 if ((error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i))) 590 error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i);
591 if (error)
591 goto error0; 592 goto error0;
592 if (!i) { 593 if (!i)
593 /* 594 goto not_found;
594 * Didn't find it, return null. 595
595 */
596 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
597 args->agbno = NULLAGBLOCK;
598 return 0;
599 }
600 /* 596 /*
601 * Grab the freespace record. 597 * Grab the freespace record.
602 */ 598 */
603 if ((error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i))) 599 error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i);
600 if (error)
604 goto error0; 601 goto error0;
605 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 602 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
606 ASSERT(fbno <= args->agbno); 603 ASSERT(fbno <= args->agbno);
607 minend = args->agbno + args->minlen; 604
608 maxend = args->agbno + args->maxlen;
609 fend = fbno + flen;
610 /* 605 /*
611 * Give up if the freespace isn't long enough for the minimum request. 606 * Check for overlapping busy extents.
612 */ 607 */
613 if (fend < minend) { 608 xfs_alloc_busy_trim(args, fbno, flen, &tbno, &tlen);
614 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); 609
615 args->agbno = NULLAGBLOCK;
616 return 0;
617 }
618 /* 610 /*
619 * End of extent will be smaller of the freespace end and the 611 * Give up if the start of the extent is busy, or the freespace isn't
620 * maximal requested end. 612 * long enough for the minimum request.
621 */ 613 */
622 end = XFS_AGBLOCK_MIN(fend, maxend); 614 if (tbno > args->agbno)
615 goto not_found;
616 if (tlen < args->minlen)
617 goto not_found;
618 tend = tbno + tlen;
619 if (tend < args->agbno + args->minlen)
620 goto not_found;
621
623 /* 622 /*
623 * End of extent will be smaller of the freespace end and the
624 * maximal requested end.
625 *
624 * Fix the length according to mod and prod if given. 626 * Fix the length according to mod and prod if given.
625 */ 627 */
628 end = XFS_AGBLOCK_MIN(tend, args->agbno + args->maxlen);
626 args->len = end - args->agbno; 629 args->len = end - args->agbno;
627 xfs_alloc_fix_len(args); 630 xfs_alloc_fix_len(args);
628 if (!xfs_alloc_fix_minleft(args)) { 631 if (!xfs_alloc_fix_minleft(args))
629 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); 632 goto not_found;
630 return 0; 633
631 }
632 rlen = args->len; 634 rlen = args->len;
633 ASSERT(args->agbno + rlen <= fend); 635 ASSERT(args->agbno + rlen <= tend);
634 end = args->agbno + rlen; 636 end = args->agbno + rlen;
637
635 /* 638 /*
636 * We are allocating agbno for rlen [agbno .. end] 639 * We are allocating agbno for rlen [agbno .. end]
637 * Allocate/initialize a cursor for the by-size btree. 640 * Allocate/initialize a cursor for the by-size btree.
@@ -640,16 +643,25 @@ xfs_alloc_ag_vextent_exact(
640 args->agno, XFS_BTNUM_CNT); 643 args->agno, XFS_BTNUM_CNT);
641 ASSERT(args->agbno + args->len <= 644 ASSERT(args->agbno + args->len <=
642 be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); 645 be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
643 if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, 646 error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno,
644 args->agbno, args->len, XFSA_FIXUP_BNO_OK))) { 647 args->len, XFSA_FIXUP_BNO_OK);
648 if (error) {
645 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR); 649 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
646 goto error0; 650 goto error0;
647 } 651 }
652
648 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); 653 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
649 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 654 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
650 655
651 trace_xfs_alloc_exact_done(args);
652 args->wasfromfl = 0; 656 args->wasfromfl = 0;
657 trace_xfs_alloc_exact_done(args);
658 return 0;
659
660not_found:
661 /* Didn't find it, return null. */
662 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
663 args->agbno = NULLAGBLOCK;
664 trace_xfs_alloc_exact_notfound(args);
653 return 0; 665 return 0;
654 666
655error0: 667error0:
@@ -659,6 +671,94 @@ error0:
659} 671}
660 672
661/* 673/*
674 * Search the btree in a given direction via the search cursor and compare
675 * the records found against the good extent we've already found.
676 */
677STATIC int
678xfs_alloc_find_best_extent(
679 struct xfs_alloc_arg *args, /* allocation argument structure */
680 struct xfs_btree_cur **gcur, /* good cursor */
681 struct xfs_btree_cur **scur, /* searching cursor */
682 xfs_agblock_t gdiff, /* difference for search comparison */
683 xfs_agblock_t *sbno, /* extent found by search */
684 xfs_extlen_t *slen, /* extent length */
685 xfs_agblock_t *sbnoa, /* aligned extent found by search */
686 xfs_extlen_t *slena, /* aligned extent length */
687 int dir) /* 0 = search right, 1 = search left */
688{
689 xfs_agblock_t new;
690 xfs_agblock_t sdiff;
691 int error;
692 int i;
693
694 /* The good extent is perfect, no need to search. */
695 if (!gdiff)
696 goto out_use_good;
697
698 /*
699 * Look until we find a better one, run out of space or run off the end.
700 */
701 do {
702 error = xfs_alloc_get_rec(*scur, sbno, slen, &i);
703 if (error)
704 goto error0;
705 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
706 xfs_alloc_compute_aligned(args, *sbno, *slen, sbnoa, slena);
707
708 /*
709 * The good extent is closer than this one.
710 */
711 if (!dir) {
712 if (*sbnoa >= args->agbno + gdiff)
713 goto out_use_good;
714 } else {
715 if (*sbnoa <= args->agbno - gdiff)
716 goto out_use_good;
717 }
718
719 /*
720 * Same distance, compare length and pick the best.
721 */
722 if (*slena >= args->minlen) {
723 args->len = XFS_EXTLEN_MIN(*slena, args->maxlen);
724 xfs_alloc_fix_len(args);
725
726 sdiff = xfs_alloc_compute_diff(args->agbno, args->len,
727 args->alignment, *sbnoa,
728 *slena, &new);
729
730 /*
731 * Choose closer size and invalidate other cursor.
732 */
733 if (sdiff < gdiff)
734 goto out_use_search;
735 goto out_use_good;
736 }
737
738 if (!dir)
739 error = xfs_btree_increment(*scur, 0, &i);
740 else
741 error = xfs_btree_decrement(*scur, 0, &i);
742 if (error)
743 goto error0;
744 } while (i);
745
746out_use_good:
747 xfs_btree_del_cursor(*scur, XFS_BTREE_NOERROR);
748 *scur = NULL;
749 return 0;
750
751out_use_search:
752 xfs_btree_del_cursor(*gcur, XFS_BTREE_NOERROR);
753 *gcur = NULL;
754 return 0;
755
756error0:
757 /* caller invalidates cursors */
758 return error;
759}
760
761/*
662 * Allocate a variable extent near bno in the allocation group agno. 762 * Allocate a variable extent near bno in the allocation group agno.
663 * Extent's length (returned in len) will be between minlen and maxlen, 763 * Extent's length (returned in len) will be between minlen and maxlen,
664 * and of the form k * prod + mod unless there's nothing that large. 764 * and of the form k * prod + mod unless there's nothing that large.
@@ -687,6 +787,7 @@ xfs_alloc_ag_vextent_near(
687 xfs_extlen_t ltlena; /* aligned ... */ 787 xfs_extlen_t ltlena; /* aligned ... */
688 xfs_agblock_t ltnew; /* useful start bno of left side */ 788 xfs_agblock_t ltnew; /* useful start bno of left side */
689 xfs_extlen_t rlen; /* length of returned extent */ 789 xfs_extlen_t rlen; /* length of returned extent */
790 int forced = 0;
690#if defined(DEBUG) && defined(__KERNEL__) 791#if defined(DEBUG) && defined(__KERNEL__)
691 /* 792 /*
692 * Randomly don't execute the first algorithm. 793 * Randomly don't execute the first algorithm.
@@ -695,13 +796,20 @@ xfs_alloc_ag_vextent_near(
695 796
696 dofirst = random32() & 1; 797 dofirst = random32() & 1;
697#endif 798#endif
799
800restart:
801 bno_cur_lt = NULL;
802 bno_cur_gt = NULL;
803 ltlen = 0;
804 gtlena = 0;
805 ltlena = 0;
806
698 /* 807 /*
699 * Get a cursor for the by-size btree. 808 * Get a cursor for the by-size btree.
700 */ 809 */
701 cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, 810 cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
702 args->agno, XFS_BTNUM_CNT); 811 args->agno, XFS_BTNUM_CNT);
703 ltlen = 0; 812
704 bno_cur_lt = bno_cur_gt = NULL;
705 /* 813 /*
706 * See if there are any free extents as big as maxlen. 814 * See if there are any free extents as big as maxlen.
707 */ 815 */
@@ -717,11 +825,13 @@ xfs_alloc_ag_vextent_near(
717 goto error0; 825 goto error0;
718 if (i == 0 || ltlen == 0) { 826 if (i == 0 || ltlen == 0) {
719 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 827 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
828 trace_xfs_alloc_near_noentry(args);
720 return 0; 829 return 0;
721 } 830 }
722 ASSERT(i == 1); 831 ASSERT(i == 1);
723 } 832 }
724 args->wasfromfl = 0; 833 args->wasfromfl = 0;
834
725 /* 835 /*
726 * First algorithm. 836 * First algorithm.
727 * If the requested extent is large wrt the freespaces available 837 * If the requested extent is large wrt the freespaces available
@@ -775,8 +885,8 @@ xfs_alloc_ag_vextent_near(
775 if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i))) 885 if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i)))
776 goto error0; 886 goto error0;
777 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 887 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
778 xfs_alloc_compute_aligned(ltbno, ltlen, args->alignment, 888 xfs_alloc_compute_aligned(args, ltbno, ltlen,
779 args->minlen, &ltbnoa, &ltlena); 889 &ltbnoa, &ltlena);
780 if (ltlena < args->minlen) 890 if (ltlena < args->minlen)
781 continue; 891 continue;
782 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); 892 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
@@ -785,7 +895,7 @@ xfs_alloc_ag_vextent_near(
785 if (args->len < blen) 895 if (args->len < blen)
786 continue; 896 continue;
787 ltdiff = xfs_alloc_compute_diff(args->agbno, args->len, 897 ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
788 args->alignment, ltbno, ltlen, &ltnew); 898 args->alignment, ltbnoa, ltlena, &ltnew);
789 if (ltnew != NULLAGBLOCK && 899 if (ltnew != NULLAGBLOCK &&
790 (args->len > blen || ltdiff < bdiff)) { 900 (args->len > blen || ltdiff < bdiff)) {
791 bdiff = ltdiff; 901 bdiff = ltdiff;
@@ -896,8 +1006,8 @@ xfs_alloc_ag_vextent_near(
896 if ((error = xfs_alloc_get_rec(bno_cur_lt, &ltbno, &ltlen, &i))) 1006 if ((error = xfs_alloc_get_rec(bno_cur_lt, &ltbno, &ltlen, &i)))
897 goto error0; 1007 goto error0;
898 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1008 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
899 xfs_alloc_compute_aligned(ltbno, ltlen, args->alignment, 1009 xfs_alloc_compute_aligned(args, ltbno, ltlen,
900 args->minlen, &ltbnoa, &ltlena); 1010 &ltbnoa, &ltlena);
901 if (ltlena >= args->minlen) 1011 if (ltlena >= args->minlen)
902 break; 1012 break;
903 if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i))) 1013 if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i)))
@@ -912,8 +1022,8 @@ xfs_alloc_ag_vextent_near(
912 if ((error = xfs_alloc_get_rec(bno_cur_gt, &gtbno, &gtlen, &i))) 1022 if ((error = xfs_alloc_get_rec(bno_cur_gt, &gtbno, &gtlen, &i)))
913 goto error0; 1023 goto error0;
914 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1024 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
915 xfs_alloc_compute_aligned(gtbno, gtlen, args->alignment, 1025 xfs_alloc_compute_aligned(args, gtbno, gtlen,
916 args->minlen, &gtbnoa, &gtlena); 1026 &gtbnoa, &gtlena);
917 if (gtlena >= args->minlen) 1027 if (gtlena >= args->minlen)
918 break; 1028 break;
919 if ((error = xfs_btree_increment(bno_cur_gt, 0, &i))) 1029 if ((error = xfs_btree_increment(bno_cur_gt, 0, &i)))
@@ -925,211 +1035,62 @@ xfs_alloc_ag_vextent_near(
925 } 1035 }
926 } 1036 }
927 } while (bno_cur_lt || bno_cur_gt); 1037 } while (bno_cur_lt || bno_cur_gt);
1038
928 /* 1039 /*
929 * Got both cursors still active, need to find better entry. 1040 * Got both cursors still active, need to find better entry.
930 */ 1041 */
931 if (bno_cur_lt && bno_cur_gt) { 1042 if (bno_cur_lt && bno_cur_gt) {
932 /*
933 * Left side is long enough, look for a right side entry.
934 */
935 if (ltlena >= args->minlen) { 1043 if (ltlena >= args->minlen) {
936 /* 1044 /*
937 * Fix up the length. 1045 * Left side is good, look for a right side entry.
938 */ 1046 */
939 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); 1047 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
940 xfs_alloc_fix_len(args); 1048 xfs_alloc_fix_len(args);
941 rlen = args->len; 1049 ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
942 ltdiff = xfs_alloc_compute_diff(args->agbno, rlen, 1050 args->alignment, ltbnoa, ltlena, &ltnew);
943 args->alignment, ltbno, ltlen, &ltnew); 1051
944 /* 1052 error = xfs_alloc_find_best_extent(args,
945 * Not perfect. 1053 &bno_cur_lt, &bno_cur_gt,
946 */ 1054 ltdiff, &gtbno, &gtlen,
947 if (ltdiff) { 1055 &gtbnoa, &gtlena,
948 /* 1056 0 /* search right */);
949 * Look until we find a better one, run out of 1057 } else {
950 * space, or run off the end. 1058 ASSERT(gtlena >= args->minlen);
951 */ 1059
952 while (bno_cur_lt && bno_cur_gt) {
953 if ((error = xfs_alloc_get_rec(
954 bno_cur_gt, &gtbno,
955 &gtlen, &i)))
956 goto error0;
957 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
958 xfs_alloc_compute_aligned(gtbno, gtlen,
959 args->alignment, args->minlen,
960 &gtbnoa, &gtlena);
961 /*
962 * The left one is clearly better.
963 */
964 if (gtbnoa >= args->agbno + ltdiff) {
965 xfs_btree_del_cursor(
966 bno_cur_gt,
967 XFS_BTREE_NOERROR);
968 bno_cur_gt = NULL;
969 break;
970 }
971 /*
972 * If we reach a big enough entry,
973 * compare the two and pick the best.
974 */
975 if (gtlena >= args->minlen) {
976 args->len =
977 XFS_EXTLEN_MIN(gtlena,
978 args->maxlen);
979 xfs_alloc_fix_len(args);
980 rlen = args->len;
981 gtdiff = xfs_alloc_compute_diff(
982 args->agbno, rlen,
983 args->alignment,
984 gtbno, gtlen, &gtnew);
985 /*
986 * Right side is better.
987 */
988 if (gtdiff < ltdiff) {
989 xfs_btree_del_cursor(
990 bno_cur_lt,
991 XFS_BTREE_NOERROR);
992 bno_cur_lt = NULL;
993 }
994 /*
995 * Left side is better.
996 */
997 else {
998 xfs_btree_del_cursor(
999 bno_cur_gt,
1000 XFS_BTREE_NOERROR);
1001 bno_cur_gt = NULL;
1002 }
1003 break;
1004 }
1005 /*
1006 * Fell off the right end.
1007 */
1008 if ((error = xfs_btree_increment(
1009 bno_cur_gt, 0, &i)))
1010 goto error0;
1011 if (!i) {
1012 xfs_btree_del_cursor(
1013 bno_cur_gt,
1014 XFS_BTREE_NOERROR);
1015 bno_cur_gt = NULL;
1016 break;
1017 }
1018 }
1019 }
1020 /*
1021 * The left side is perfect, trash the right side.
1022 */
1023 else {
1024 xfs_btree_del_cursor(bno_cur_gt,
1025 XFS_BTREE_NOERROR);
1026 bno_cur_gt = NULL;
1027 }
1028 }
1029 /*
1030 * It's the right side that was found first, look left.
1031 */
1032 else {
1033 /* 1060 /*
1034 * Fix up the length. 1061 * Right side is good, look for a left side entry.
1035 */ 1062 */
1036 args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen); 1063 args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen);
1037 xfs_alloc_fix_len(args); 1064 xfs_alloc_fix_len(args);
1038 rlen = args->len; 1065 gtdiff = xfs_alloc_compute_diff(args->agbno, args->len,
1039 gtdiff = xfs_alloc_compute_diff(args->agbno, rlen, 1066 args->alignment, gtbnoa, gtlena, &gtnew);
1040 args->alignment, gtbno, gtlen, &gtnew); 1067
1041 /* 1068 error = xfs_alloc_find_best_extent(args,
1042 * Right side entry isn't perfect. 1069 &bno_cur_gt, &bno_cur_lt,
1043 */ 1070 gtdiff, &ltbno, &ltlen,
1044 if (gtdiff) { 1071 &ltbnoa, &ltlena,
1045 /* 1072 1 /* search left */);
1046 * Look until we find a better one, run out of
1047 * space, or run off the end.
1048 */
1049 while (bno_cur_lt && bno_cur_gt) {
1050 if ((error = xfs_alloc_get_rec(
1051 bno_cur_lt, &ltbno,
1052 &ltlen, &i)))
1053 goto error0;
1054 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1055 xfs_alloc_compute_aligned(ltbno, ltlen,
1056 args->alignment, args->minlen,
1057 &ltbnoa, &ltlena);
1058 /*
1059 * The right one is clearly better.
1060 */
1061 if (ltbnoa <= args->agbno - gtdiff) {
1062 xfs_btree_del_cursor(
1063 bno_cur_lt,
1064 XFS_BTREE_NOERROR);
1065 bno_cur_lt = NULL;
1066 break;
1067 }
1068 /*
1069 * If we reach a big enough entry,
1070 * compare the two and pick the best.
1071 */
1072 if (ltlena >= args->minlen) {
1073 args->len = XFS_EXTLEN_MIN(
1074 ltlena, args->maxlen);
1075 xfs_alloc_fix_len(args);
1076 rlen = args->len;
1077 ltdiff = xfs_alloc_compute_diff(
1078 args->agbno, rlen,
1079 args->alignment,
1080 ltbno, ltlen, &ltnew);
1081 /*
1082 * Left side is better.
1083 */
1084 if (ltdiff < gtdiff) {
1085 xfs_btree_del_cursor(
1086 bno_cur_gt,
1087 XFS_BTREE_NOERROR);
1088 bno_cur_gt = NULL;
1089 }
1090 /*
1091 * Right side is better.
1092 */
1093 else {
1094 xfs_btree_del_cursor(
1095 bno_cur_lt,
1096 XFS_BTREE_NOERROR);
1097 bno_cur_lt = NULL;
1098 }
1099 break;
1100 }
1101 /*
1102 * Fell off the left end.
1103 */
1104 if ((error = xfs_btree_decrement(
1105 bno_cur_lt, 0, &i)))
1106 goto error0;
1107 if (!i) {
1108 xfs_btree_del_cursor(bno_cur_lt,
1109 XFS_BTREE_NOERROR);
1110 bno_cur_lt = NULL;
1111 break;
1112 }
1113 }
1114 }
1115 /*
1116 * The right side is perfect, trash the left side.
1117 */
1118 else {
1119 xfs_btree_del_cursor(bno_cur_lt,
1120 XFS_BTREE_NOERROR);
1121 bno_cur_lt = NULL;
1122 }
1123 } 1073 }
1074
1075 if (error)
1076 goto error0;
1124 } 1077 }
1078
1125 /* 1079 /*
1126 * If we couldn't get anything, give up. 1080 * If we couldn't get anything, give up.
1127 */ 1081 */
1128 if (bno_cur_lt == NULL && bno_cur_gt == NULL) { 1082 if (bno_cur_lt == NULL && bno_cur_gt == NULL) {
1083 if (!forced++) {
1084 trace_xfs_alloc_near_busy(args);
1085 xfs_log_force(args->mp, XFS_LOG_SYNC);
1086 goto restart;
1087 }
1088
1129 trace_xfs_alloc_size_neither(args); 1089 trace_xfs_alloc_size_neither(args);
1130 args->agbno = NULLAGBLOCK; 1090 args->agbno = NULLAGBLOCK;
1131 return 0; 1091 return 0;
1132 } 1092 }
1093
1133 /* 1094 /*
1134 * At this point we have selected a freespace entry, either to the 1095 * At this point we have selected a freespace entry, either to the
1135 * left or to the right. If it's on the right, copy all the 1096 * left or to the right. If it's on the right, copy all the
@@ -1146,6 +1107,7 @@ xfs_alloc_ag_vextent_near(
1146 j = 1; 1107 j = 1;
1147 } else 1108 } else
1148 j = 0; 1109 j = 0;
1110
1149 /* 1111 /*
1150 * Fix up the length and compute the useful address. 1112 * Fix up the length and compute the useful address.
1151 */ 1113 */
@@ -1158,12 +1120,13 @@ xfs_alloc_ag_vextent_near(
1158 return 0; 1120 return 0;
1159 } 1121 }
1160 rlen = args->len; 1122 rlen = args->len;
1161 (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment, ltbno, 1123 (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment,
1162 ltlen, &ltnew); 1124 ltbnoa, ltlena, &ltnew);
1163 ASSERT(ltnew >= ltbno); 1125 ASSERT(ltnew >= ltbno);
1164 ASSERT(ltnew + rlen <= ltbno + ltlen); 1126 ASSERT(ltnew + rlen <= ltbnoa + ltlena);
1165 ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); 1127 ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
1166 args->agbno = ltnew; 1128 args->agbno = ltnew;
1129
1167 if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen, 1130 if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen,
1168 ltnew, rlen, XFSA_FIXUP_BNO_OK))) 1131 ltnew, rlen, XFSA_FIXUP_BNO_OK)))
1169 goto error0; 1132 goto error0;
@@ -1206,26 +1169,35 @@ xfs_alloc_ag_vextent_size(
1206 int i; /* temp status variable */ 1169 int i; /* temp status variable */
1207 xfs_agblock_t rbno; /* returned block number */ 1170 xfs_agblock_t rbno; /* returned block number */
1208 xfs_extlen_t rlen; /* length of returned extent */ 1171 xfs_extlen_t rlen; /* length of returned extent */
1172 int forced = 0;
1209 1173
1174restart:
1210 /* 1175 /*
1211 * Allocate and initialize a cursor for the by-size btree. 1176 * Allocate and initialize a cursor for the by-size btree.
1212 */ 1177 */
1213 cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, 1178 cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
1214 args->agno, XFS_BTNUM_CNT); 1179 args->agno, XFS_BTNUM_CNT);
1215 bno_cur = NULL; 1180 bno_cur = NULL;
1181
1216 /* 1182 /*
1217 * Look for an entry >= maxlen+alignment-1 blocks. 1183 * Look for an entry >= maxlen+alignment-1 blocks.
1218 */ 1184 */
1219 if ((error = xfs_alloc_lookup_ge(cnt_cur, 0, 1185 if ((error = xfs_alloc_lookup_ge(cnt_cur, 0,
1220 args->maxlen + args->alignment - 1, &i))) 1186 args->maxlen + args->alignment - 1, &i)))
1221 goto error0; 1187 goto error0;
1188
1222 /* 1189 /*
1223 * If none, then pick up the last entry in the tree unless the 1190 * If none or we have busy extents that we cannot allocate from, then
1224 * tree is empty. 1191 * we have to settle for a smaller extent. In the case that there are
1192 * no large extents, this will return the last entry in the tree unless
1193 * the tree is empty. In the case that there are only busy large
1194 * extents, this will return the largest small extent unless there
1195 * are no smaller extents available.
1225 */ 1196 */
1226 if (!i) { 1197 if (!i || forced > 1) {
1227 if ((error = xfs_alloc_ag_vextent_small(args, cnt_cur, &fbno, 1198 error = xfs_alloc_ag_vextent_small(args, cnt_cur,
1228 &flen, &i))) 1199 &fbno, &flen, &i);
1200 if (error)
1229 goto error0; 1201 goto error0;
1230 if (i == 0 || flen == 0) { 1202 if (i == 0 || flen == 0) {
1231 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 1203 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
@@ -1233,23 +1205,56 @@ xfs_alloc_ag_vextent_size(
1233 return 0; 1205 return 0;
1234 } 1206 }
1235 ASSERT(i == 1); 1207 ASSERT(i == 1);
1208 xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen);
1209 } else {
1210 /*
1211 * Search for a non-busy extent that is large enough.
 1212	 * If we are at low space, don't check, or if we fall off
1213 * the end of the btree, turn off the busy check and
1214 * restart.
1215 */
1216 for (;;) {
1217 error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i);
1218 if (error)
1219 goto error0;
1220 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1221
1222 xfs_alloc_compute_aligned(args, fbno, flen,
1223 &rbno, &rlen);
1224
1225 if (rlen >= args->maxlen)
1226 break;
1227
1228 error = xfs_btree_increment(cnt_cur, 0, &i);
1229 if (error)
1230 goto error0;
1231 if (i == 0) {
1232 /*
1233 * Our only valid extents must have been busy.
1234 * Make it unbusy by forcing the log out and
1235 * retrying. If we've been here before, forcing
1236 * the log isn't making the extents available,
1237 * which means they have probably been freed in
1238 * this transaction. In that case, we have to
1239 * give up on them and we'll attempt a minlen
1240 * allocation the next time around.
1241 */
1242 xfs_btree_del_cursor(cnt_cur,
1243 XFS_BTREE_NOERROR);
1244 trace_xfs_alloc_size_busy(args);
1245 if (!forced++)
1246 xfs_log_force(args->mp, XFS_LOG_SYNC);
1247 goto restart;
1248 }
1249 }
1236 } 1250 }
1237 /* 1251
1238 * There's a freespace as big as maxlen+alignment-1, get it.
1239 */
1240 else {
1241 if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i)))
1242 goto error0;
1243 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1244 }
1245 /* 1252 /*
1246 * In the first case above, we got the last entry in the 1253 * In the first case above, we got the last entry in the
1247 * by-size btree. Now we check to see if the space hits maxlen 1254 * by-size btree. Now we check to see if the space hits maxlen
1248 * once aligned; if not, we search left for something better. 1255 * once aligned; if not, we search left for something better.
1249 * This can't happen in the second case above. 1256 * This can't happen in the second case above.
1250 */ 1257 */
1251 xfs_alloc_compute_aligned(fbno, flen, args->alignment, args->minlen,
1252 &rbno, &rlen);
1253 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen); 1258 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
1254 XFS_WANT_CORRUPTED_GOTO(rlen == 0 || 1259 XFS_WANT_CORRUPTED_GOTO(rlen == 0 ||
1255 (rlen <= flen && rbno + rlen <= fbno + flen), error0); 1260 (rlen <= flen && rbno + rlen <= fbno + flen), error0);
@@ -1274,8 +1279,8 @@ xfs_alloc_ag_vextent_size(
1274 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1279 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1275 if (flen < bestrlen) 1280 if (flen < bestrlen)
1276 break; 1281 break;
1277 xfs_alloc_compute_aligned(fbno, flen, args->alignment, 1282 xfs_alloc_compute_aligned(args, fbno, flen,
1278 args->minlen, &rbno, &rlen); 1283 &rbno, &rlen);
1279 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen); 1284 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
1280 XFS_WANT_CORRUPTED_GOTO(rlen == 0 || 1285 XFS_WANT_CORRUPTED_GOTO(rlen == 0 ||
1281 (rlen <= flen && rbno + rlen <= fbno + flen), 1286 (rlen <= flen && rbno + rlen <= fbno + flen),
@@ -1303,13 +1308,19 @@ xfs_alloc_ag_vextent_size(
1303 * Fix up the length. 1308 * Fix up the length.
1304 */ 1309 */
1305 args->len = rlen; 1310 args->len = rlen;
1306 xfs_alloc_fix_len(args); 1311 if (rlen < args->minlen) {
1307 if (rlen < args->minlen || !xfs_alloc_fix_minleft(args)) { 1312 if (!forced++) {
1308 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 1313 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1309 trace_xfs_alloc_size_nominleft(args); 1314 trace_xfs_alloc_size_busy(args);
1310 args->agbno = NULLAGBLOCK; 1315 xfs_log_force(args->mp, XFS_LOG_SYNC);
1311 return 0; 1316 goto restart;
1317 }
1318 goto out_nominleft;
1312 } 1319 }
1320 xfs_alloc_fix_len(args);
1321
1322 if (!xfs_alloc_fix_minleft(args))
1323 goto out_nominleft;
1313 rlen = args->len; 1324 rlen = args->len;
1314 XFS_WANT_CORRUPTED_GOTO(rlen <= flen, error0); 1325 XFS_WANT_CORRUPTED_GOTO(rlen <= flen, error0);
1315 /* 1326 /*
@@ -1339,6 +1350,12 @@ error0:
1339 if (bno_cur) 1350 if (bno_cur)
1340 xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR); 1351 xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
1341 return error; 1352 return error;
1353
1354out_nominleft:
1355 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1356 trace_xfs_alloc_size_nominleft(args);
1357 args->agbno = NULLAGBLOCK;
1358 return 0;
1342} 1359}
1343 1360
1344/* 1361/*
@@ -1378,6 +1395,9 @@ xfs_alloc_ag_vextent_small(
1378 if (error) 1395 if (error)
1379 goto error0; 1396 goto error0;
1380 if (fbno != NULLAGBLOCK) { 1397 if (fbno != NULLAGBLOCK) {
1398 xfs_alloc_busy_reuse(args->mp, args->agno, fbno, 1,
1399 args->userdata);
1400
1381 if (args->userdata) { 1401 if (args->userdata) {
1382 xfs_buf_t *bp; 1402 xfs_buf_t *bp;
1383 1403
@@ -1453,6 +1473,7 @@ xfs_free_ag_extent(
1453 xfs_mount_t *mp; /* mount point struct for filesystem */ 1473 xfs_mount_t *mp; /* mount point struct for filesystem */
1454 xfs_agblock_t nbno; /* new starting block of freespace */ 1474 xfs_agblock_t nbno; /* new starting block of freespace */
1455 xfs_extlen_t nlen; /* new length of freespace */ 1475 xfs_extlen_t nlen; /* new length of freespace */
1476 xfs_perag_t *pag; /* per allocation group data */
1456 1477
1457 mp = tp->t_mountp; 1478 mp = tp->t_mountp;
1458 /* 1479 /*
@@ -1651,45 +1672,23 @@ xfs_free_ag_extent(
1651 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1672 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1652 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 1673 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1653 cnt_cur = NULL; 1674 cnt_cur = NULL;
1675
1654 /* 1676 /*
1655 * Update the freespace totals in the ag and superblock. 1677 * Update the freespace totals in the ag and superblock.
1656 */ 1678 */
1657 { 1679 pag = xfs_perag_get(mp, agno);
1658 xfs_agf_t *agf; 1680 error = xfs_alloc_update_counters(tp, pag, agbp, len);
1659 xfs_perag_t *pag; /* per allocation group data */ 1681 xfs_perag_put(pag);
1660 1682 if (error)
1661 pag = xfs_perag_get(mp, agno); 1683 goto error0;
1662 pag->pagf_freeblks += len;
1663 xfs_perag_put(pag);
1664 1684
1665 agf = XFS_BUF_TO_AGF(agbp); 1685 if (!isfl)
1666 be32_add_cpu(&agf->agf_freeblks, len); 1686 xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (long)len);
1667 xfs_trans_agblocks_delta(tp, len); 1687 XFS_STATS_INC(xs_freex);
1668 XFS_WANT_CORRUPTED_GOTO( 1688 XFS_STATS_ADD(xs_freeb, len);
1669 be32_to_cpu(agf->agf_freeblks) <=
1670 be32_to_cpu(agf->agf_length),
1671 error0);
1672 xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS);
1673 if (!isfl)
1674 xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (long)len);
1675 XFS_STATS_INC(xs_freex);
1676 XFS_STATS_ADD(xs_freeb, len);
1677 }
1678 1689
1679 trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright); 1690 trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright);
1680 1691
1681 /*
1682 * Since blocks move to the free list without the coordination
1683 * used in xfs_bmap_finish, we can't allow block to be available
1684 * for reallocation and non-transaction writing (user data)
1685 * until we know that the transaction that moved it to the free
1686 * list is permanently on disk. We track the blocks by declaring
1687 * these blocks as "busy"; the busy list is maintained on a per-ag
1688 * basis and each transaction records which entries should be removed
1689 * when the iclog commits to disk. If a busy block is allocated,
1690 * the iclog is pushed up to the LSN that freed the block.
1691 */
1692 xfs_alloc_busy_insert(tp, agno, bno, len);
1693 return 0; 1692 return 0;
1694 1693
1695 error0: 1694 error0:
@@ -1984,21 +1983,6 @@ xfs_alloc_get_freelist(
1984 xfs_alloc_log_agf(tp, agbp, logflags); 1983 xfs_alloc_log_agf(tp, agbp, logflags);
1985 *bnop = bno; 1984 *bnop = bno;
1986 1985
1987 /*
1988 * As blocks are freed, they are added to the per-ag busy list and
1989 * remain there until the freeing transaction is committed to disk.
1990 * Now that we have allocated blocks, this list must be searched to see
1991 * if a block is being reused. If one is, then the freeing transaction
1992 * must be pushed to disk before this transaction.
1993 *
1994 * We do this by setting the current transaction to a sync transaction
1995 * which guarantees that the freeing transaction is on disk before this
1996 * transaction. This is done instead of a synchronous log force here so
1997 * that we don't sit and wait with the AGF locked in the transaction
1998 * during the log force.
1999 */
2000 if (xfs_alloc_busy_search(mp, be32_to_cpu(agf->agf_seqno), bno, 1))
2001 xfs_trans_set_sync(tp);
2002 return 0; 1986 return 0;
2003} 1987}
2004 1988
@@ -2456,131 +2440,54 @@ xfs_free_extent(
2456 memset(&args, 0, sizeof(xfs_alloc_arg_t)); 2440 memset(&args, 0, sizeof(xfs_alloc_arg_t));
2457 args.tp = tp; 2441 args.tp = tp;
2458 args.mp = tp->t_mountp; 2442 args.mp = tp->t_mountp;
2443
2444 /*
 2445	 * validate that the block number is legal - this enables us to detect
2446 * and handle a silent filesystem corruption rather than crashing.
2447 */
2459 args.agno = XFS_FSB_TO_AGNO(args.mp, bno); 2448 args.agno = XFS_FSB_TO_AGNO(args.mp, bno);
2460 ASSERT(args.agno < args.mp->m_sb.sb_agcount); 2449 if (args.agno >= args.mp->m_sb.sb_agcount)
2450 return EFSCORRUPTED;
2451
2461 args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno); 2452 args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno);
2453 if (args.agbno >= args.mp->m_sb.sb_agblocks)
2454 return EFSCORRUPTED;
2455
2462 args.pag = xfs_perag_get(args.mp, args.agno); 2456 args.pag = xfs_perag_get(args.mp, args.agno);
2463 if ((error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING))) 2457 ASSERT(args.pag);
2458
2459 error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING);
2460 if (error)
2464 goto error0; 2461 goto error0;
2465#ifdef DEBUG 2462
2466 ASSERT(args.agbp != NULL); 2463 /* validate the extent size is legal now we have the agf locked */
2467 ASSERT((args.agbno + len) <= 2464 if (args.agbno + len >
2468 be32_to_cpu(XFS_BUF_TO_AGF(args.agbp)->agf_length)); 2465 be32_to_cpu(XFS_BUF_TO_AGF(args.agbp)->agf_length)) {
2469#endif 2466 error = EFSCORRUPTED;
2467 goto error0;
2468 }
2469
2470 error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0); 2470 error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0);
2471 if (!error)
2472 xfs_alloc_busy_insert(tp, args.agno, args.agbno, len, 0);
2471error0: 2473error0:
2472 xfs_perag_put(args.pag); 2474 xfs_perag_put(args.pag);
2473 return error; 2475 return error;
2474} 2476}
2475 2477
2476
2477/*
2478 * AG Busy list management
2479 * The busy list contains block ranges that have been freed but whose
2480 * transactions have not yet hit disk. If any block listed in a busy
2481 * list is reused, the transaction that freed it must be forced to disk
2482 * before continuing to use the block.
2483 *
2484 * xfs_alloc_busy_insert - add to the per-ag busy list
2485 * xfs_alloc_busy_clear - remove an item from the per-ag busy list
2486 * xfs_alloc_busy_search - search for a busy extent
2487 */
2488
2489/*
2490 * Insert a new extent into the busy tree.
2491 *
2492 * The busy extent tree is indexed by the start block of the busy extent.
2493 * there can be multiple overlapping ranges in the busy extent tree but only
2494 * ever one entry at a given start block. The reason for this is that
2495 * multi-block extents can be freed, then smaller chunks of that extent
2496 * allocated and freed again before the first transaction commit is on disk.
2497 * If the exact same start block is freed a second time, we have to wait for
2498 * that busy extent to pass out of the tree before the new extent is inserted.
2499 * There are two main cases we have to handle here.
2500 *
2501 * The first case is a transaction that triggers a "free - allocate - free"
2502 * cycle. This can occur during btree manipulations as a btree block is freed
2503 * to the freelist, then allocated from the free list, then freed again. In
 2504 * this case, the second extent free is what triggers the duplicate and as
2505 * such the transaction IDs should match. Because the extent was allocated in
2506 * this transaction, the transaction must be marked as synchronous. This is
2507 * true for all cases where the free/alloc/free occurs in the one transaction,
2508 * hence the addition of the ASSERT(tp->t_flags & XFS_TRANS_SYNC) to this case.
2509 * This serves to catch violations of the second case quite effectively.
2510 *
2511 * The second case is where the free/alloc/free occur in different
2512 * transactions. In this case, the thread freeing the extent the second time
2513 * can't mark the extent busy immediately because it is already tracked in a
2514 * transaction that may be committing. When the log commit for the existing
2515 * busy extent completes, the busy extent will be removed from the tree. If we
2516 * allow the second busy insert to continue using that busy extent structure,
2517 * it can be freed before this transaction is safely in the log. Hence our
2518 * only option in this case is to force the log to remove the existing busy
2519 * extent from the list before we insert the new one with the current
2520 * transaction ID.
2521 *
2522 * The problem we are trying to avoid in the free-alloc-free in separate
2523 * transactions is most easily described with a timeline:
2524 *
2525 * Thread 1 Thread 2 Thread 3 xfslogd
2526 * xact alloc
2527 * free X
2528 * mark busy
2529 * commit xact
2530 * free xact
2531 * xact alloc
2532 * alloc X
2533 * busy search
2534 * mark xact sync
2535 * commit xact
2536 * free xact
2537 * force log
2538 * checkpoint starts
2539 * ....
2540 * xact alloc
2541 * free X
2542 * mark busy
2543 * finds match
2544 * *** KABOOM! ***
2545 * ....
2546 * log IO completes
2547 * unbusy X
2548 * checkpoint completes
2549 *
2550 * By issuing a log force in thread 3 @ "KABOOM", the thread will block until
2551 * the checkpoint completes, and the busy extent it matched will have been
2552 * removed from the tree when it is woken. Hence it can then continue safely.
2553 *
2554 * However, to ensure this matching process is robust, we need to use the
2555 * transaction ID for identifying transaction, as delayed logging results in
2556 * the busy extent and transaction lifecycles being different. i.e. the busy
2557 * extent is active for a lot longer than the transaction. Hence the
2558 * transaction structure can be freed and reallocated, then mark the same
2559 * extent busy again in the new transaction. In this case the new transaction
2560 * will have a different tid but can have the same address, and hence we need
2561 * to check against the tid.
2562 *
2563 * Future: for delayed logging, we could avoid the log force if the extent was
2564 * first freed in the current checkpoint sequence. This, however, requires the
2565 * ability to pin the current checkpoint in memory until this transaction
2566 * commits to ensure that both the original free and the current one combine
2567 * logically into the one checkpoint. If the checkpoint sequences are
2568 * different, however, we still need to wait on a log force.
2569 */
2570void 2478void
2571xfs_alloc_busy_insert( 2479xfs_alloc_busy_insert(
2572 struct xfs_trans *tp, 2480 struct xfs_trans *tp,
2573 xfs_agnumber_t agno, 2481 xfs_agnumber_t agno,
2574 xfs_agblock_t bno, 2482 xfs_agblock_t bno,
2575 xfs_extlen_t len) 2483 xfs_extlen_t len,
2484 unsigned int flags)
2576{ 2485{
2577 struct xfs_busy_extent *new; 2486 struct xfs_busy_extent *new;
2578 struct xfs_busy_extent *busyp; 2487 struct xfs_busy_extent *busyp;
2579 struct xfs_perag *pag; 2488 struct xfs_perag *pag;
2580 struct rb_node **rbp; 2489 struct rb_node **rbp;
2581 struct rb_node *parent; 2490 struct rb_node *parent = NULL;
2582 int match;
2583
2584 2491
2585 new = kmem_zalloc(sizeof(struct xfs_busy_extent), KM_MAYFAIL); 2492 new = kmem_zalloc(sizeof(struct xfs_busy_extent), KM_MAYFAIL);
2586 if (!new) { 2493 if (!new) {
@@ -2589,7 +2496,7 @@ xfs_alloc_busy_insert(
2589 * block, make this a synchronous transaction to insure that 2496 * block, make this a synchronous transaction to insure that
2590 * the block is not reused before this transaction commits. 2497 * the block is not reused before this transaction commits.
2591 */ 2498 */
2592 trace_xfs_alloc_busy(tp, agno, bno, len, 1); 2499 trace_xfs_alloc_busy_enomem(tp->t_mountp, agno, bno, len);
2593 xfs_trans_set_sync(tp); 2500 xfs_trans_set_sync(tp);
2594 return; 2501 return;
2595 } 2502 }
@@ -2597,66 +2504,29 @@ xfs_alloc_busy_insert(
2597 new->agno = agno; 2504 new->agno = agno;
2598 new->bno = bno; 2505 new->bno = bno;
2599 new->length = len; 2506 new->length = len;
2600 new->tid = xfs_log_get_trans_ident(tp);
2601
2602 INIT_LIST_HEAD(&new->list); 2507 INIT_LIST_HEAD(&new->list);
2508 new->flags = flags;
2603 2509
2604 /* trace before insert to be able to see failed inserts */ 2510 /* trace before insert to be able to see failed inserts */
2605 trace_xfs_alloc_busy(tp, agno, bno, len, 0); 2511 trace_xfs_alloc_busy(tp->t_mountp, agno, bno, len);
2606 2512
2607 pag = xfs_perag_get(tp->t_mountp, new->agno); 2513 pag = xfs_perag_get(tp->t_mountp, new->agno);
2608restart:
2609 spin_lock(&pag->pagb_lock); 2514 spin_lock(&pag->pagb_lock);
2610 rbp = &pag->pagb_tree.rb_node; 2515 rbp = &pag->pagb_tree.rb_node;
2611 parent = NULL; 2516 while (*rbp) {
2612 busyp = NULL;
2613 match = 0;
2614 while (*rbp && match >= 0) {
2615 parent = *rbp; 2517 parent = *rbp;
2616 busyp = rb_entry(parent, struct xfs_busy_extent, rb_node); 2518 busyp = rb_entry(parent, struct xfs_busy_extent, rb_node);
2617 2519
2618 if (new->bno < busyp->bno) { 2520 if (new->bno < busyp->bno) {
2619 /* may overlap, but exact start block is lower */
2620 rbp = &(*rbp)->rb_left; 2521 rbp = &(*rbp)->rb_left;
2621 if (new->bno + new->length > busyp->bno) 2522 ASSERT(new->bno + new->length <= busyp->bno);
2622 match = busyp->tid == new->tid ? 1 : -1;
2623 } else if (new->bno > busyp->bno) { 2523 } else if (new->bno > busyp->bno) {
2624 /* may overlap, but exact start block is higher */
2625 rbp = &(*rbp)->rb_right; 2524 rbp = &(*rbp)->rb_right;
2626 if (bno < busyp->bno + busyp->length) 2525 ASSERT(bno >= busyp->bno + busyp->length);
2627 match = busyp->tid == new->tid ? 1 : -1;
2628 } else { 2526 } else {
2629 match = busyp->tid == new->tid ? 1 : -1; 2527 ASSERT(0);
2630 break;
2631 } 2528 }
2632 } 2529 }
2633 if (match < 0) {
2634 /* overlap marked busy in different transaction */
2635 spin_unlock(&pag->pagb_lock);
2636 xfs_log_force(tp->t_mountp, XFS_LOG_SYNC);
2637 goto restart;
2638 }
2639 if (match > 0) {
2640 /*
2641 * overlap marked busy in same transaction. Update if exact
2642 * start block match, otherwise combine the busy extents into
2643 * a single range.
2644 */
2645 if (busyp->bno == new->bno) {
2646 busyp->length = max(busyp->length, new->length);
2647 spin_unlock(&pag->pagb_lock);
2648 ASSERT(tp->t_flags & XFS_TRANS_SYNC);
2649 xfs_perag_put(pag);
2650 kmem_free(new);
2651 return;
2652 }
2653 rb_erase(&busyp->rb_node, &pag->pagb_tree);
2654 new->length = max(busyp->bno + busyp->length,
2655 new->bno + new->length) -
2656 min(busyp->bno, new->bno);
2657 new->bno = min(busyp->bno, new->bno);
2658 } else
2659 busyp = NULL;
2660 2530
2661 rb_link_node(&new->rb_node, parent, rbp); 2531 rb_link_node(&new->rb_node, parent, rbp);
2662 rb_insert_color(&new->rb_node, &pag->pagb_tree); 2532 rb_insert_color(&new->rb_node, &pag->pagb_tree);
@@ -2664,7 +2534,6 @@ restart:
2664 list_add(&new->list, &tp->t_busy); 2534 list_add(&new->list, &tp->t_busy);
2665 spin_unlock(&pag->pagb_lock); 2535 spin_unlock(&pag->pagb_lock);
2666 xfs_perag_put(pag); 2536 xfs_perag_put(pag);
2667 kmem_free(busyp);
2668} 2537}
2669 2538
2670/* 2539/*
@@ -2676,7 +2545,7 @@ restart:
2676 * will require a synchronous transaction, but it can still be 2545 * will require a synchronous transaction, but it can still be
2677 * used to distinguish between a partial or exact match. 2546 * used to distinguish between a partial or exact match.
2678 */ 2547 */
2679static int 2548int
2680xfs_alloc_busy_search( 2549xfs_alloc_busy_search(
2681 struct xfs_mount *mp, 2550 struct xfs_mount *mp,
2682 xfs_agnumber_t agno, 2551 xfs_agnumber_t agno,
@@ -2713,31 +2582,466 @@ xfs_alloc_busy_search(
2713 } 2582 }
2714 } 2583 }
2715 spin_unlock(&pag->pagb_lock); 2584 spin_unlock(&pag->pagb_lock);
2716 trace_xfs_alloc_busysearch(mp, agno, bno, len, !!match);
2717 xfs_perag_put(pag); 2585 xfs_perag_put(pag);
2718 return match; 2586 return match;
2719} 2587}
2720 2588
2589/*
2590 * The found free extent [fbno, fend] overlaps part or all of the given busy
2591 * extent. If the overlap covers the beginning, the end, or all of the busy
2592 * extent, the overlapping portion can be made unbusy and used for the
2593 * allocation. We can't split a busy extent because we can't modify a
 2594 * transaction/CIL context busy list, but we can update an entry's block
2595 * number or length.
2596 *
2597 * Returns true if the extent can safely be reused, or false if the search
2598 * needs to be restarted.
2599 */
2600STATIC bool
2601xfs_alloc_busy_update_extent(
2602 struct xfs_mount *mp,
2603 struct xfs_perag *pag,
2604 struct xfs_busy_extent *busyp,
2605 xfs_agblock_t fbno,
2606 xfs_extlen_t flen,
2607 bool userdata)
2608{
2609 xfs_agblock_t fend = fbno + flen;
2610 xfs_agblock_t bbno = busyp->bno;
2611 xfs_agblock_t bend = bbno + busyp->length;
2612
2613 /*
2614 * This extent is currently being discarded. Give the thread
2615 * performing the discard a chance to mark the extent unbusy
2616 * and retry.
2617 */
2618 if (busyp->flags & XFS_ALLOC_BUSY_DISCARDED) {
2619 spin_unlock(&pag->pagb_lock);
2620 delay(1);
2621 spin_lock(&pag->pagb_lock);
2622 return false;
2623 }
2624
2625 /*
2626 * If there is a busy extent overlapping a user allocation, we have
2627 * no choice but to force the log and retry the search.
2628 *
2629 * Fortunately this does not happen during normal operation, but
2630 * only if the filesystem is very low on space and has to dip into
2631 * the AGFL for normal allocations.
2632 */
2633 if (userdata)
2634 goto out_force_log;
2635
2636 if (bbno < fbno && bend > fend) {
2637 /*
2638 * Case 1:
2639 * bbno bend
2640 * +BBBBBBBBBBBBBBBBB+
2641 * +---------+
2642 * fbno fend
2643 */
2644
2645 /*
2646 * We would have to split the busy extent to be able to track
 2647		 * it correctly, which we cannot do because we would have to
2648 * modify the list of busy extents attached to the transaction
2649 * or CIL context, which is immutable.
2650 *
2651 * Force out the log to clear the busy extent and retry the
2652 * search.
2653 */
2654 goto out_force_log;
2655 } else if (bbno >= fbno && bend <= fend) {
2656 /*
2657 * Case 2:
2658 * bbno bend
2659 * +BBBBBBBBBBBBBBBBB+
2660 * +-----------------+
2661 * fbno fend
2662 *
2663 * Case 3:
2664 * bbno bend
2665 * +BBBBBBBBBBBBBBBBB+
2666 * +--------------------------+
2667 * fbno fend
2668 *
2669 * Case 4:
2670 * bbno bend
2671 * +BBBBBBBBBBBBBBBBB+
2672 * +--------------------------+
2673 * fbno fend
2674 *
2675 * Case 5:
2676 * bbno bend
2677 * +BBBBBBBBBBBBBBBBB+
2678 * +-----------------------------------+
2679 * fbno fend
2680 *
2681 */
2682
2683 /*
2684 * The busy extent is fully covered by the extent we are
2685 * allocating, and can simply be removed from the rbtree.
2686 * However we cannot remove it from the immutable list
2687 * tracking busy extents in the transaction or CIL context,
2688 * so set the length to zero to mark it invalid.
2689 *
2690 * We also need to restart the busy extent search from the
2691 * tree root, because erasing the node can rearrange the
2692 * tree topology.
2693 */
2694 rb_erase(&busyp->rb_node, &pag->pagb_tree);
2695 busyp->length = 0;
2696 return false;
2697 } else if (fend < bend) {
2698 /*
2699 * Case 6:
2700 * bbno bend
2701 * +BBBBBBBBBBBBBBBBB+
2702 * +---------+
2703 * fbno fend
2704 *
2705 * Case 7:
2706 * bbno bend
2707 * +BBBBBBBBBBBBBBBBB+
2708 * +------------------+
2709 * fbno fend
2710 *
2711 */
2712 busyp->bno = fend;
2713 } else if (bbno < fbno) {
2714 /*
2715 * Case 8:
2716 * bbno bend
2717 * +BBBBBBBBBBBBBBBBB+
2718 * +-------------+
2719 * fbno fend
2720 *
2721 * Case 9:
2722 * bbno bend
2723 * +BBBBBBBBBBBBBBBBB+
2724 * +----------------------+
2725 * fbno fend
2726 */
2727 busyp->length = fbno - busyp->bno;
2728 } else {
2729 ASSERT(0);
2730 }
2731
2732 trace_xfs_alloc_busy_reuse(mp, pag->pag_agno, fbno, flen);
2733 return true;
2734
2735out_force_log:
2736 spin_unlock(&pag->pagb_lock);
2737 xfs_log_force(mp, XFS_LOG_SYNC);
2738 trace_xfs_alloc_busy_force(mp, pag->pag_agno, fbno, flen);
2739 spin_lock(&pag->pagb_lock);
2740 return false;
2741}
2742
2743
2744/*
2745 * For a given extent [fbno, flen], make sure we can reuse it safely.
2746 */
2721void 2747void
2722xfs_alloc_busy_clear( 2748xfs_alloc_busy_reuse(
2723 struct xfs_mount *mp, 2749 struct xfs_mount *mp,
2724 struct xfs_busy_extent *busyp) 2750 xfs_agnumber_t agno,
2751 xfs_agblock_t fbno,
2752 xfs_extlen_t flen,
2753 bool userdata)
2725{ 2754{
2726 struct xfs_perag *pag; 2755 struct xfs_perag *pag;
2756 struct rb_node *rbp;
2727 2757
2728 trace_xfs_alloc_unbusy(mp, busyp->agno, busyp->bno, 2758 ASSERT(flen > 0);
2729 busyp->length);
2730 2759
2731 ASSERT(xfs_alloc_busy_search(mp, busyp->agno, busyp->bno, 2760 pag = xfs_perag_get(mp, agno);
2732 busyp->length) == 1); 2761 spin_lock(&pag->pagb_lock);
2762restart:
2763 rbp = pag->pagb_tree.rb_node;
2764 while (rbp) {
2765 struct xfs_busy_extent *busyp =
2766 rb_entry(rbp, struct xfs_busy_extent, rb_node);
2767 xfs_agblock_t bbno = busyp->bno;
2768 xfs_agblock_t bend = bbno + busyp->length;
2733 2769
2734 list_del_init(&busyp->list); 2770 if (fbno + flen <= bbno) {
2771 rbp = rbp->rb_left;
2772 continue;
2773 } else if (fbno >= bend) {
2774 rbp = rbp->rb_right;
2775 continue;
2776 }
2735 2777
2736 pag = xfs_perag_get(mp, busyp->agno); 2778 if (!xfs_alloc_busy_update_extent(mp, pag, busyp, fbno, flen,
2737 spin_lock(&pag->pagb_lock); 2779 userdata))
2738 rb_erase(&busyp->rb_node, &pag->pagb_tree); 2780 goto restart;
2781 }
2739 spin_unlock(&pag->pagb_lock); 2782 spin_unlock(&pag->pagb_lock);
2740 xfs_perag_put(pag); 2783 xfs_perag_put(pag);
2784}
2785
2786/*
2787 * For a given extent [fbno, flen], search the busy extent list to find a
2788 * subset of the extent that is not busy. If *rlen is smaller than
2789 * args->minlen no suitable extent could be found, and the higher level
2790 * code needs to force out the log and retry the allocation.
2791 */
2792STATIC void
2793xfs_alloc_busy_trim(
2794 struct xfs_alloc_arg *args,
2795 xfs_agblock_t bno,
2796 xfs_extlen_t len,
2797 xfs_agblock_t *rbno,
2798 xfs_extlen_t *rlen)
2799{
2800 xfs_agblock_t fbno;
2801 xfs_extlen_t flen;
2802 struct rb_node *rbp;
2803
2804 ASSERT(len > 0);
2805
2806 spin_lock(&args->pag->pagb_lock);
2807restart:
2808 fbno = bno;
2809 flen = len;
2810 rbp = args->pag->pagb_tree.rb_node;
2811 while (rbp && flen >= args->minlen) {
2812 struct xfs_busy_extent *busyp =
2813 rb_entry(rbp, struct xfs_busy_extent, rb_node);
2814 xfs_agblock_t fend = fbno + flen;
2815 xfs_agblock_t bbno = busyp->bno;
2816 xfs_agblock_t bend = bbno + busyp->length;
2817
2818 if (fend <= bbno) {
2819 rbp = rbp->rb_left;
2820 continue;
2821 } else if (fbno >= bend) {
2822 rbp = rbp->rb_right;
2823 continue;
2824 }
2825
2826 /*
2827 * If this is a metadata allocation, try to reuse the busy
2828 * extent instead of trimming the allocation.
2829 */
2830 if (!args->userdata &&
2831 !(busyp->flags & XFS_ALLOC_BUSY_DISCARDED)) {
2832 if (!xfs_alloc_busy_update_extent(args->mp, args->pag,
2833 busyp, fbno, flen,
2834 false))
2835 goto restart;
2836 continue;
2837 }
2838
2839 if (bbno <= fbno) {
2840 /* start overlap */
2741 2841
2842 /*
2843 * Case 1:
2844 * bbno bend
2845 * +BBBBBBBBBBBBBBBBB+
2846 * +---------+
2847 * fbno fend
2848 *
2849 * Case 2:
2850 * bbno bend
2851 * +BBBBBBBBBBBBBBBBB+
2852 * +-------------+
2853 * fbno fend
2854 *
2855 * Case 3:
2856 * bbno bend
2857 * +BBBBBBBBBBBBBBBBB+
2858 * +-------------+
2859 * fbno fend
2860 *
2861 * Case 4:
2862 * bbno bend
2863 * +BBBBBBBBBBBBBBBBB+
2864 * +-----------------+
2865 * fbno fend
2866 *
2867 * No unbusy region in extent, return failure.
2868 */
2869 if (fend <= bend)
2870 goto fail;
2871
2872 /*
2873 * Case 5:
2874 * bbno bend
2875 * +BBBBBBBBBBBBBBBBB+
2876 * +----------------------+
2877 * fbno fend
2878 *
2879 * Case 6:
2880 * bbno bend
2881 * +BBBBBBBBBBBBBBBBB+
2882 * +--------------------------+
2883 * fbno fend
2884 *
2885 * Needs to be trimmed to:
2886 * +-------+
2887 * fbno fend
2888 */
2889 fbno = bend;
2890 } else if (bend >= fend) {
2891 /* end overlap */
2892
2893 /*
2894 * Case 7:
2895 * bbno bend
2896 * +BBBBBBBBBBBBBBBBB+
2897 * +------------------+
2898 * fbno fend
2899 *
2900 * Case 8:
2901 * bbno bend
2902 * +BBBBBBBBBBBBBBBBB+
2903 * +--------------------------+
2904 * fbno fend
2905 *
2906 * Needs to be trimmed to:
2907 * +-------+
2908 * fbno fend
2909 */
2910 fend = bbno;
2911 } else {
2912 /* middle overlap */
2913
2914 /*
2915 * Case 9:
2916 * bbno bend
2917 * +BBBBBBBBBBBBBBBBB+
2918 * +-----------------------------------+
2919 * fbno fend
2920 *
2921 * Can be trimmed to:
2922 * +-------+ OR +-------+
2923 * fbno fend fbno fend
2924 *
2925 * Backward allocation leads to significant
2926 * fragmentation of directories, which degrades
2927 * directory performance, therefore we always want to
2928 * choose the option that produces forward allocation
2929 * patterns.
2930 * Preferring the lower bno extent will make the next
2931 * request use "fend" as the start of the next
2932 * allocation; if the segment is no longer busy at
2933 * that point, we'll get a contiguous allocation, but
2934 * even if it is still busy, we will get a forward
2935 * allocation.
2936 * We try to avoid choosing the segment at "bend",
2937 * because that can lead to the next allocation
2938 * taking the segment at "fbno", which would be a
2939 * backward allocation. We only use the segment at
2940 * "fbno" if it is much larger than the current
2941 * requested size, because in that case there's a
2942 * good chance subsequent allocations will be
2943 * contiguous.
2944 */
2945 if (bbno - fbno >= args->maxlen) {
2946 /* left candidate fits perfect */
2947 fend = bbno;
2948 } else if (fend - bend >= args->maxlen * 4) {
2949 /* right candidate has enough free space */
2950 fbno = bend;
2951 } else if (bbno - fbno >= args->minlen) {
2952 /* left candidate fits minimum requirement */
2953 fend = bbno;
2954 } else {
2955 goto fail;
2956 }
2957 }
2958
2959 flen = fend - fbno;
2960 }
2961 spin_unlock(&args->pag->pagb_lock);
2962
2963 if (fbno != bno || flen != len) {
2964 trace_xfs_alloc_busy_trim(args->mp, args->agno, bno, len,
2965 fbno, flen);
2966 }
2967 *rbno = fbno;
2968 *rlen = flen;
2969 return;
2970fail:
2971 /*
 2972	 * Return a zero extent length as the failure indication. All callers
2973 * re-check if the trimmed extent satisfies the minlen requirement.
2974 */
2975 spin_unlock(&args->pag->pagb_lock);
2976 trace_xfs_alloc_busy_trim(args->mp, args->agno, bno, len, fbno, 0);
2977 *rbno = fbno;
2978 *rlen = 0;
2979}
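For illustration only, the geometric part of the trim above can be modelled in plain userspace C: cut a single busy range out of a found free extent and keep the lower surviving segment, matching the forward-allocation preference described in the middle-overlap comment. The sketch ignores the rbtree walk, the metadata-reuse path and the minlen/maxlen heuristics; the trim_one() helper, the fixed-width typedefs and the sample numbers are assumptions made up for this example.

#include <stdio.h>
#include <stdint.h>

typedef uint32_t xfs_agblock_t;
typedef uint32_t xfs_extlen_t;

/* Trim one busy range [bbno, bbno+blen) out of the free extent at *fbno. */
static xfs_extlen_t trim_one(xfs_agblock_t *fbno, xfs_extlen_t flen,
			     xfs_agblock_t bbno, xfs_extlen_t blen)
{
	xfs_agblock_t fend = *fbno + flen;
	xfs_agblock_t bend = bbno + blen;

	if (fend <= bbno || *fbno >= bend)
		return flen;			/* no overlap */
	if (bbno <= *fbno && bend >= fend)
		return 0;			/* extent is entirely busy */
	if (bbno <= *fbno) {
		*fbno = bend;			/* busy start: keep the tail */
		return fend - bend;
	}
	if (bend >= fend)
		return bbno - *fbno;		/* busy end: keep the head */
	return bbno - *fbno;			/* busy middle: keep lower part */
}

int main(void)
{
	xfs_agblock_t fbno = 100;			/* hypothetical free extent 100..199 */
	xfs_extlen_t flen = trim_one(&fbno, 100, 140, 20);	/* busy 140..159 */

	printf("trimmed extent: bno=%u len=%u\n", fbno, flen);	/* bno=100 len=40 */
	return 0;
}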
2980
2981static void
2982xfs_alloc_busy_clear_one(
2983 struct xfs_mount *mp,
2984 struct xfs_perag *pag,
2985 struct xfs_busy_extent *busyp)
2986{
2987 if (busyp->length) {
2988 trace_xfs_alloc_busy_clear(mp, busyp->agno, busyp->bno,
2989 busyp->length);
2990 rb_erase(&busyp->rb_node, &pag->pagb_tree);
2991 }
2992
2993 list_del_init(&busyp->list);
2994	kmem_free(busyp);
2995}
2996
2997/*
2998 * Remove all extents on the passed-in list from the busy extents tree.
2999 * If do_discard is set, skip extents that need to be discarded, and mark
3000 * these as undergoing a discard operation instead.
3001 */
3002void
3003xfs_alloc_busy_clear(
3004 struct xfs_mount *mp,
3005 struct list_head *list,
3006 bool do_discard)
3007{
3008 struct xfs_busy_extent *busyp, *n;
3009 struct xfs_perag *pag = NULL;
3010 xfs_agnumber_t agno = NULLAGNUMBER;
3011
3012 list_for_each_entry_safe(busyp, n, list, list) {
3013 if (busyp->agno != agno) {
3014 if (pag) {
3015 spin_unlock(&pag->pagb_lock);
3016 xfs_perag_put(pag);
3017 }
3018 pag = xfs_perag_get(mp, busyp->agno);
3019 spin_lock(&pag->pagb_lock);
3020 agno = busyp->agno;
3021 }
3022
3023 if (do_discard && busyp->length &&
3024 !(busyp->flags & XFS_ALLOC_BUSY_SKIP_DISCARD))
3025 busyp->flags = XFS_ALLOC_BUSY_DISCARDED;
3026 else
3027 xfs_alloc_busy_clear_one(mp, pag, busyp);
3028 }
3029
3030 if (pag) {
3031 spin_unlock(&pag->pagb_lock);
3032 xfs_perag_put(pag);
3033 }
3034}
3035
3036/*
3037 * Callback for list_sort to sort busy extents by the AG they reside in.
3038 */
3039int
3040xfs_busy_extent_ag_cmp(
3041 void *priv,
3042 struct list_head *a,
3043 struct list_head *b)
3044{
3045 return container_of(a, struct xfs_busy_extent, list)->agno -
3046 container_of(b, struct xfs_busy_extent, list)->agno;
3047}
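
A hedged usage sketch of the two routines above (the caller context that owns the busy list is assumed, not shown in this hunk): sorting by AG first means the clearing loop only has to take each per-AG pagb_lock once.

	static void example_drop_busy_extents(struct xfs_mount *mp,
					      struct list_head *busy_list)
	{
		/* group entries by AG; equivalent to the xfs_alloc_busy_sort() helper */
		list_sort(NULL, busy_list, xfs_busy_extent_ag_cmp);

		/* drop every entry; pass true instead to leave discardable extents marked */
		xfs_alloc_busy_clear(mp, busy_list, false);
	}
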
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 895009a97271..2f52b924be79 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -19,6 +19,7 @@
19#define __XFS_ALLOC_H__ 19#define __XFS_ALLOC_H__
20 20
21struct xfs_buf; 21struct xfs_buf;
22struct xfs_btree_cur;
22struct xfs_mount; 23struct xfs_mount;
23struct xfs_perag; 24struct xfs_perag;
24struct xfs_trans; 25struct xfs_trans;
@@ -74,6 +75,22 @@ typedef unsigned int xfs_alloctype_t;
74#define XFS_ALLOC_SET_ASIDE(mp) (4 + ((mp)->m_sb.sb_agcount * 4)) 75#define XFS_ALLOC_SET_ASIDE(mp) (4 + ((mp)->m_sb.sb_agcount * 4))
75 76
76/* 77/*
78 * When deciding how much space to allocate out of an AG, we limit the
 79 * maximum allocation size to the size of the AG. However, we cannot use all the
80 * blocks in the AG - some are permanently used by metadata. These
81 * blocks are generally:
82 * - the AG superblock, AGF, AGI and AGFL
83 * - the AGF (bno and cnt) and AGI btree root blocks
84 * - 4 blocks on the AGFL according to XFS_ALLOC_SET_ASIDE() limits
85 *
 86 * The AG headers are sector-sized, so the amount of space they take up is
87 * dependent on filesystem geometry. The others are all single blocks.
88 */
89#define XFS_ALLOC_AG_MAX_USABLE(mp) \
90 ((mp)->m_sb.sb_agblocks - XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)) - 7)
91
92
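
A rough worked example of the macro above, for a hypothetical geometry that is not taken from this patch (512-byte sectors, 4 KB blocks):

    XFS_FSS_TO_BB(mp, 4)          ->  4 basic blocks (four 512-byte sector headers)
    XFS_BB_TO_FSB(mp, 4)          ->  1 filesystem block (2048 bytes rounded up to 4096)
    XFS_ALLOC_AG_MAX_USABLE(mp)   ->  sb_agblocks - 1 - 7  =  sb_agblocks - 8

The constant 7 covers the three btree root blocks plus the four AGFL blocks counted by XFS_ALLOC_SET_ASIDE(); only the sector-sized headers vary with geometry.
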
93/*
77 * Argument structure for xfs_alloc routines. 94 * Argument structure for xfs_alloc routines.
78 * This is turned into a structure to avoid having 20 arguments passed 95 * This is turned into a structure to avoid having 20 arguments passed
79 * down several levels of the stack. 96 * down several levels of the stack.
@@ -118,15 +135,29 @@ xfs_alloc_longest_free_extent(struct xfs_mount *mp,
118 struct xfs_perag *pag); 135 struct xfs_perag *pag);
119 136
120#ifdef __KERNEL__ 137#ifdef __KERNEL__
138void
139xfs_alloc_busy_insert(struct xfs_trans *tp, xfs_agnumber_t agno,
140 xfs_agblock_t bno, xfs_extlen_t len, unsigned int flags);
121 141
122void 142void
123xfs_alloc_busy_insert(xfs_trans_t *tp, 143xfs_alloc_busy_clear(struct xfs_mount *mp, struct list_head *list,
124 xfs_agnumber_t agno, 144 bool do_discard);
125 xfs_agblock_t bno, 145
126 xfs_extlen_t len); 146int
147xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno,
148 xfs_agblock_t bno, xfs_extlen_t len);
127 149
128void 150void
129xfs_alloc_busy_clear(struct xfs_mount *mp, struct xfs_busy_extent *busyp); 151xfs_alloc_busy_reuse(struct xfs_mount *mp, xfs_agnumber_t agno,
152 xfs_agblock_t fbno, xfs_extlen_t flen, bool userdata);
153
154int
155xfs_busy_extent_ag_cmp(void *priv, struct list_head *a, struct list_head *b);
156
157static inline void xfs_alloc_busy_sort(struct list_head *list)
158{
159 list_sort(NULL, list, xfs_busy_extent_ag_cmp);
160}
130 161
131#endif /* __KERNEL__ */ 162#endif /* __KERNEL__ */
132 163
@@ -205,4 +236,18 @@ xfs_free_extent(
205 xfs_fsblock_t bno, /* starting block number of extent */ 236 xfs_fsblock_t bno, /* starting block number of extent */
206 xfs_extlen_t len); /* length of extent */ 237 xfs_extlen_t len); /* length of extent */
207 238
239int /* error */
240xfs_alloc_lookup_le(
241 struct xfs_btree_cur *cur, /* btree cursor */
242 xfs_agblock_t bno, /* starting block of extent */
243 xfs_extlen_t len, /* length of extent */
244 int *stat); /* success/failure */
245
246int /* error */
247xfs_alloc_get_rec(
248 struct xfs_btree_cur *cur, /* btree cursor */
249 xfs_agblock_t *bno, /* output: starting block of extent */
250 xfs_extlen_t *len, /* output: length of extent */
251 int *stat); /* output: success/failure */
252
208#endif /* __XFS_ALLOC_H__ */ 253#endif /* __XFS_ALLOC_H__ */
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index 97f7328967fd..2b3518826a69 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -95,6 +95,8 @@ xfs_allocbt_alloc_block(
95 return 0; 95 return 0;
96 } 96 }
97 97
98 xfs_alloc_busy_reuse(cur->bc_mp, cur->bc_private.a.agno, bno, 1, false);
99
98 xfs_trans_agbtree_delta(cur->bc_tp, 1); 100 xfs_trans_agbtree_delta(cur->bc_tp, 1);
99 new->s = cpu_to_be32(bno); 101 new->s = cpu_to_be32(bno);
100 102
@@ -118,18 +120,8 @@ xfs_allocbt_free_block(
118 if (error) 120 if (error)
119 return error; 121 return error;
120 122
121 /* 123 xfs_alloc_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1,
122 * Since blocks move to the free list without the coordination used in 124 XFS_ALLOC_BUSY_SKIP_DISCARD);
123 * xfs_bmap_finish, we can't allow block to be available for
124 * reallocation and non-transaction writing (user data) until we know
125 * that the transaction that moved it to the free list is permanently
126 * on disk. We track the blocks by declaring these blocks as "busy";
127 * the busy list is maintained on a per-ag basis and each transaction
128 * records which entries should be removed when the iclog commits to
129 * disk. If a busy block is allocated, the iclog is pushed up to the
130 * LSN that freed the block.
131 */
132 xfs_alloc_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1);
133 xfs_trans_agbtree_delta(cur->bc_tp, -1); 125 xfs_trans_agbtree_delta(cur->bc_tp, -1);
134 return 0; 126 return 0;
135} 127}
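
The comment removed above explained why freed btree blocks must be tracked as busy; the two hunks in this file now pair a busy insert on free with a busy-reuse check on allocation. A hedged sketch of that pairing (hypothetical helper; in the real code tp, mp, agno and bno come from the btree cursor):

	static void example_btree_block_cycle(struct xfs_trans *tp, struct xfs_mount *mp,
					      xfs_agnumber_t agno, xfs_agblock_t bno)
	{
		/* on free: remember the block is busy, but never discard btree blocks */
		xfs_alloc_busy_insert(tp, agno, bno, 1, XFS_ALLOC_BUSY_SKIP_DISCARD);

		/* on re-allocation: resolve any overlap with still-busy extents first */
		xfs_alloc_busy_reuse(mp, agno, bno, 1, false);
	}
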
@@ -280,38 +272,6 @@ xfs_allocbt_key_diff(
280 return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock; 272 return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock;
281} 273}
282 274
283STATIC int
284xfs_allocbt_kill_root(
285 struct xfs_btree_cur *cur,
286 struct xfs_buf *bp,
287 int level,
288 union xfs_btree_ptr *newroot)
289{
290 int error;
291
292 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
293 XFS_BTREE_STATS_INC(cur, killroot);
294
295 /*
296 * Update the root pointer, decreasing the level by 1 and then
297 * free the old root.
298 */
299 xfs_allocbt_set_root(cur, newroot, -1);
300 error = xfs_allocbt_free_block(cur, bp);
301 if (error) {
302 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
303 return error;
304 }
305
306 XFS_BTREE_STATS_INC(cur, free);
307
308 xfs_btree_setbuf(cur, level, NULL);
309 cur->bc_nlevels--;
310
311 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
312 return 0;
313}
314
315#ifdef DEBUG 275#ifdef DEBUG
316STATIC int 276STATIC int
317xfs_allocbt_keys_inorder( 277xfs_allocbt_keys_inorder(
@@ -423,7 +383,6 @@ static const struct xfs_btree_ops xfs_allocbt_ops = {
423 383
424 .dup_cursor = xfs_allocbt_dup_cursor, 384 .dup_cursor = xfs_allocbt_dup_cursor,
425 .set_root = xfs_allocbt_set_root, 385 .set_root = xfs_allocbt_set_root,
426 .kill_root = xfs_allocbt_kill_root,
427 .alloc_block = xfs_allocbt_alloc_block, 386 .alloc_block = xfs_allocbt_alloc_block,
428 .free_block = xfs_allocbt_free_block, 387 .free_block = xfs_allocbt_free_block,
429 .update_lastrec = xfs_allocbt_update_lastrec, 388 .update_lastrec = xfs_allocbt_update_lastrec,
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index c2568242a901..01d2072fb6d4 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -355,16 +355,15 @@ xfs_attr_set_int(
355 if (mp->m_flags & XFS_MOUNT_WSYNC) { 355 if (mp->m_flags & XFS_MOUNT_WSYNC) {
356 xfs_trans_set_sync(args.trans); 356 xfs_trans_set_sync(args.trans);
357 } 357 }
358
359 if (!error && (flags & ATTR_KERNOTIME) == 0) {
360 xfs_trans_ichgtime(args.trans, dp,
361 XFS_ICHGTIME_CHG);
362 }
358 err2 = xfs_trans_commit(args.trans, 363 err2 = xfs_trans_commit(args.trans,
359 XFS_TRANS_RELEASE_LOG_RES); 364 XFS_TRANS_RELEASE_LOG_RES);
360 xfs_iunlock(dp, XFS_ILOCK_EXCL); 365 xfs_iunlock(dp, XFS_ILOCK_EXCL);
361 366
362 /*
363 * Hit the inode change time.
364 */
365 if (!error && (flags & ATTR_KERNOTIME) == 0) {
366 xfs_ichgtime(dp, XFS_ICHGTIME_CHG);
367 }
368 return(error == 0 ? err2 : error); 367 return(error == 0 ? err2 : error);
369 } 368 }
370 369
@@ -420,6 +419,9 @@ xfs_attr_set_int(
420 xfs_trans_set_sync(args.trans); 419 xfs_trans_set_sync(args.trans);
421 } 420 }
422 421
422 if ((flags & ATTR_KERNOTIME) == 0)
423 xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG);
424
423 /* 425 /*
424 * Commit the last in the sequence of transactions. 426 * Commit the last in the sequence of transactions.
425 */ 427 */
@@ -427,13 +429,6 @@ xfs_attr_set_int(
427 error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES); 429 error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES);
428 xfs_iunlock(dp, XFS_ILOCK_EXCL); 430 xfs_iunlock(dp, XFS_ILOCK_EXCL);
429 431
430 /*
431 * Hit the inode change time.
432 */
433 if (!error && (flags & ATTR_KERNOTIME) == 0) {
434 xfs_ichgtime(dp, XFS_ICHGTIME_CHG);
435 }
436
437 return(error); 432 return(error);
438 433
439out: 434out:
@@ -495,6 +490,13 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags)
495 args.whichfork = XFS_ATTR_FORK; 490 args.whichfork = XFS_ATTR_FORK;
496 491
497 /* 492 /*
493 * we have no control over the attribute names that userspace passes us
494 * to remove, so we have to allow the name lookup prior to attribute
495 * removal to fail.
496 */
497 args.op_flags = XFS_DA_OP_OKNOENT;
498
499 /*
498 * Attach the dquots to the inode. 500 * Attach the dquots to the inode.
499 */ 501 */
500 error = xfs_qm_dqattach(dp, 0); 502 error = xfs_qm_dqattach(dp, 0);
@@ -567,6 +569,9 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags)
567 xfs_trans_set_sync(args.trans); 569 xfs_trans_set_sync(args.trans);
568 } 570 }
569 571
572 if ((flags & ATTR_KERNOTIME) == 0)
573 xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG);
574
570 /* 575 /*
571 * Commit the last in the sequence of transactions. 576 * Commit the last in the sequence of transactions.
572 */ 577 */
@@ -574,13 +579,6 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags)
574 error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES); 579 error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES);
575 xfs_iunlock(dp, XFS_ILOCK_EXCL); 580 xfs_iunlock(dp, XFS_ILOCK_EXCL);
576 581
577 /*
578 * Hit the inode change time.
579 */
580 if (!error && (flags & ATTR_KERNOTIME) == 0) {
581 xfs_ichgtime(dp, XFS_ICHGTIME_CHG);
582 }
583
584 return(error); 582 return(error);
585 583
586out: 584out:
@@ -1995,7 +1993,7 @@ xfs_attr_rmtval_get(xfs_da_args_t *args)
1995 1993
1996 tmp = (valuelen < XFS_BUF_SIZE(bp)) 1994 tmp = (valuelen < XFS_BUF_SIZE(bp))
1997 ? valuelen : XFS_BUF_SIZE(bp); 1995 ? valuelen : XFS_BUF_SIZE(bp);
1998 xfs_biomove(bp, 0, tmp, dst, XBF_READ); 1996 xfs_buf_iomove(bp, 0, tmp, dst, XBRW_READ);
1999 xfs_buf_relse(bp); 1997 xfs_buf_relse(bp);
2000 dst += tmp; 1998 dst += tmp;
2001 valuelen -= tmp; 1999 valuelen -= tmp;
@@ -2125,9 +2123,9 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
2125 2123
2126 tmp = (valuelen < XFS_BUF_SIZE(bp)) ? valuelen : 2124 tmp = (valuelen < XFS_BUF_SIZE(bp)) ? valuelen :
2127 XFS_BUF_SIZE(bp); 2125 XFS_BUF_SIZE(bp);
2128 xfs_biomove(bp, 0, tmp, src, XBF_WRITE); 2126 xfs_buf_iomove(bp, 0, tmp, src, XBRW_WRITE);
2129 if (tmp < XFS_BUF_SIZE(bp)) 2127 if (tmp < XFS_BUF_SIZE(bp))
2130 xfs_biozero(bp, tmp, XFS_BUF_SIZE(bp) - tmp); 2128 xfs_buf_zero(bp, tmp, XFS_BUF_SIZE(bp) - tmp);
2131 if ((error = xfs_bwrite(mp, bp))) {/* GROT: NOTE: synchronous write */ 2129 if ((error = xfs_bwrite(mp, bp))) {/* GROT: NOTE: synchronous write */
2132 return (error); 2130 return (error);
2133 } 2131 }
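
The two hunks above switch the remote-value copy loops to the renamed buffer helpers. A simplified, hedged sketch of the write-side step (error handling and the surrounding loop omitted; the helper name is illustrative only):

	static void example_copy_rmt_chunk(struct xfs_buf *bp, char *src, int valuelen)
	{
		int tmp = (valuelen < XFS_BUF_SIZE(bp)) ? valuelen : XFS_BUF_SIZE(bp);

		/* copy as much of the attribute value as fits in this buffer */
		xfs_buf_iomove(bp, 0, tmp, src, XBRW_WRITE);

		/* zero the tail so stale data never reaches disk */
		if (tmp < XFS_BUF_SIZE(bp))
			xfs_buf_zero(bp, tmp, XFS_BUF_SIZE(bp) - tmp);
	}
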
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index a6cff8edcdb6..71e90dc2aeb1 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -637,7 +637,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
637 * It didn't all fit, so we have to sort everything on hashval. 637 * It didn't all fit, so we have to sort everything on hashval.
638 */ 638 */
639 sbsize = sf->hdr.count * sizeof(*sbuf); 639 sbsize = sf->hdr.count * sizeof(*sbuf);
640 sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP); 640 sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP | KM_NOFS);
641 641
642 /* 642 /*
643 * Scan the attribute list for the rest of the entries, storing 643 * Scan the attribute list for the rest of the entries, storing
@@ -2386,7 +2386,7 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context)
2386 args.dp = context->dp; 2386 args.dp = context->dp;
2387 args.whichfork = XFS_ATTR_FORK; 2387 args.whichfork = XFS_ATTR_FORK;
2388 args.valuelen = valuelen; 2388 args.valuelen = valuelen;
2389 args.value = kmem_alloc(valuelen, KM_SLEEP); 2389 args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS);
2390 args.rmtblkno = be32_to_cpu(name_rmt->valueblk); 2390 args.rmtblkno = be32_to_cpu(name_rmt->valueblk);
2391 args.rmtblkcnt = XFS_B_TO_FSB(args.dp->i_mount, valuelen); 2391 args.rmtblkcnt = XFS_B_TO_FSB(args.dp->i_mount, valuelen);
2392 retval = xfs_attr_rmtval_get(&args); 2392 retval = xfs_attr_rmtval_get(&args);
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index f90dadd5a968..e546a33214c9 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -89,36 +89,19 @@ xfs_bmap_add_attrfork_local(
89 int *flags); /* inode logging flags */ 89 int *flags); /* inode logging flags */
90 90
91/* 91/*
92 * Called by xfs_bmapi to update file extent records and the btree
93 * after allocating space (or doing a delayed allocation).
94 */
95STATIC int /* error */
96xfs_bmap_add_extent(
97 xfs_inode_t *ip, /* incore inode pointer */
98 xfs_extnum_t idx, /* extent number to update/insert */
99 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
100 xfs_bmbt_irec_t *new, /* new data to add to file extents */
101 xfs_fsblock_t *first, /* pointer to firstblock variable */
102 xfs_bmap_free_t *flist, /* list of extents to be freed */
103 int *logflagsp, /* inode logging flags */
104 int whichfork, /* data or attr fork */
105 int rsvd); /* OK to allocate reserved blocks */
106
107/*
108 * Called by xfs_bmap_add_extent to handle cases converting a delayed 92 * Called by xfs_bmap_add_extent to handle cases converting a delayed
109 * allocation to a real allocation. 93 * allocation to a real allocation.
110 */ 94 */
111STATIC int /* error */ 95STATIC int /* error */
112xfs_bmap_add_extent_delay_real( 96xfs_bmap_add_extent_delay_real(
113 xfs_inode_t *ip, /* incore inode pointer */ 97 xfs_inode_t *ip, /* incore inode pointer */
114 xfs_extnum_t idx, /* extent number to update/insert */ 98 xfs_extnum_t *idx, /* extent number to update/insert */
115 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ 99 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
116 xfs_bmbt_irec_t *new, /* new data to add to file extents */ 100 xfs_bmbt_irec_t *new, /* new data to add to file extents */
117 xfs_filblks_t *dnew, /* new delayed-alloc indirect blocks */ 101 xfs_filblks_t *dnew, /* new delayed-alloc indirect blocks */
118 xfs_fsblock_t *first, /* pointer to firstblock variable */ 102 xfs_fsblock_t *first, /* pointer to firstblock variable */
119 xfs_bmap_free_t *flist, /* list of extents to be freed */ 103 xfs_bmap_free_t *flist, /* list of extents to be freed */
120 int *logflagsp, /* inode logging flags */ 104 int *logflagsp); /* inode logging flags */
121 int rsvd); /* OK to allocate reserved blocks */
122 105
123/* 106/*
124 * Called by xfs_bmap_add_extent to handle cases converting a hole 107 * Called by xfs_bmap_add_extent to handle cases converting a hole
@@ -127,10 +110,9 @@ xfs_bmap_add_extent_delay_real(
127STATIC int /* error */ 110STATIC int /* error */
128xfs_bmap_add_extent_hole_delay( 111xfs_bmap_add_extent_hole_delay(
129 xfs_inode_t *ip, /* incore inode pointer */ 112 xfs_inode_t *ip, /* incore inode pointer */
130 xfs_extnum_t idx, /* extent number to update/insert */ 113 xfs_extnum_t *idx, /* extent number to update/insert */
131 xfs_bmbt_irec_t *new, /* new data to add to file extents */ 114 xfs_bmbt_irec_t *new, /* new data to add to file extents */
132 int *logflagsp,/* inode logging flags */ 115 int *logflagsp); /* inode logging flags */
133 int rsvd); /* OK to allocate reserved blocks */
134 116
135/* 117/*
136 * Called by xfs_bmap_add_extent to handle cases converting a hole 118 * Called by xfs_bmap_add_extent to handle cases converting a hole
@@ -139,7 +121,7 @@ xfs_bmap_add_extent_hole_delay(
139STATIC int /* error */ 121STATIC int /* error */
140xfs_bmap_add_extent_hole_real( 122xfs_bmap_add_extent_hole_real(
141 xfs_inode_t *ip, /* incore inode pointer */ 123 xfs_inode_t *ip, /* incore inode pointer */
142 xfs_extnum_t idx, /* extent number to update/insert */ 124 xfs_extnum_t *idx, /* extent number to update/insert */
143 xfs_btree_cur_t *cur, /* if null, not a btree */ 125 xfs_btree_cur_t *cur, /* if null, not a btree */
144 xfs_bmbt_irec_t *new, /* new data to add to file extents */ 126 xfs_bmbt_irec_t *new, /* new data to add to file extents */
145 int *logflagsp, /* inode logging flags */ 127 int *logflagsp, /* inode logging flags */
@@ -152,7 +134,7 @@ xfs_bmap_add_extent_hole_real(
152STATIC int /* error */ 134STATIC int /* error */
153xfs_bmap_add_extent_unwritten_real( 135xfs_bmap_add_extent_unwritten_real(
154 xfs_inode_t *ip, /* incore inode pointer */ 136 xfs_inode_t *ip, /* incore inode pointer */
155 xfs_extnum_t idx, /* extent number to update/insert */ 137 xfs_extnum_t *idx, /* extent number to update/insert */
156 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ 138 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
157 xfs_bmbt_irec_t *new, /* new data to add to file extents */ 139 xfs_bmbt_irec_t *new, /* new data to add to file extents */
158 int *logflagsp); /* inode logging flags */ 140 int *logflagsp); /* inode logging flags */
@@ -180,22 +162,6 @@ xfs_bmap_btree_to_extents(
180 int whichfork); /* data or attr fork */ 162 int whichfork); /* data or attr fork */
181 163
182/* 164/*
183 * Called by xfs_bmapi to update file extent records and the btree
184 * after removing space (or undoing a delayed allocation).
185 */
186STATIC int /* error */
187xfs_bmap_del_extent(
188 xfs_inode_t *ip, /* incore inode pointer */
189 xfs_trans_t *tp, /* current trans pointer */
190 xfs_extnum_t idx, /* extent number to update/insert */
191 xfs_bmap_free_t *flist, /* list of extents to be freed */
192 xfs_btree_cur_t *cur, /* if null, not a btree */
193 xfs_bmbt_irec_t *new, /* new data to add to file extents */
194 int *logflagsp,/* inode logging flags */
195 int whichfork, /* data or attr fork */
196 int rsvd); /* OK to allocate reserved blocks */
197
198/*
199 * Remove the entry "free" from the free item list. Prev points to the 165 * Remove the entry "free" from the free item list. Prev points to the
200 * previous entry, unless "free" is the head of the list. 166 * previous entry, unless "free" is the head of the list.
201 */ 167 */
@@ -474,14 +440,13 @@ xfs_bmap_add_attrfork_local(
474STATIC int /* error */ 440STATIC int /* error */
475xfs_bmap_add_extent( 441xfs_bmap_add_extent(
476 xfs_inode_t *ip, /* incore inode pointer */ 442 xfs_inode_t *ip, /* incore inode pointer */
477 xfs_extnum_t idx, /* extent number to update/insert */ 443 xfs_extnum_t *idx, /* extent number to update/insert */
478 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ 444 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
479 xfs_bmbt_irec_t *new, /* new data to add to file extents */ 445 xfs_bmbt_irec_t *new, /* new data to add to file extents */
480 xfs_fsblock_t *first, /* pointer to firstblock variable */ 446 xfs_fsblock_t *first, /* pointer to firstblock variable */
481 xfs_bmap_free_t *flist, /* list of extents to be freed */ 447 xfs_bmap_free_t *flist, /* list of extents to be freed */
482 int *logflagsp, /* inode logging flags */ 448 int *logflagsp, /* inode logging flags */
483 int whichfork, /* data or attr fork */ 449 int whichfork) /* data or attr fork */
484 int rsvd) /* OK to use reserved data blocks */
485{ 450{
486 xfs_btree_cur_t *cur; /* btree cursor or null */ 451 xfs_btree_cur_t *cur; /* btree cursor or null */
487 xfs_filblks_t da_new; /* new count del alloc blocks used */ 452 xfs_filblks_t da_new; /* new count del alloc blocks used */
@@ -492,23 +457,27 @@ xfs_bmap_add_extent(
492 xfs_extnum_t nextents; /* number of extents in file now */ 457 xfs_extnum_t nextents; /* number of extents in file now */
493 458
494 XFS_STATS_INC(xs_add_exlist); 459 XFS_STATS_INC(xs_add_exlist);
460
495 cur = *curp; 461 cur = *curp;
496 ifp = XFS_IFORK_PTR(ip, whichfork); 462 ifp = XFS_IFORK_PTR(ip, whichfork);
497 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 463 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
498 ASSERT(idx <= nextents);
499 da_old = da_new = 0; 464 da_old = da_new = 0;
500 error = 0; 465 error = 0;
466
467 ASSERT(*idx >= 0);
468 ASSERT(*idx <= nextents);
469
501 /* 470 /*
502 * This is the first extent added to a new/empty file. 471 * This is the first extent added to a new/empty file.
503 * Special case this one, so other routines get to assume there are 472 * Special case this one, so other routines get to assume there are
504 * already extents in the list. 473 * already extents in the list.
505 */ 474 */
506 if (nextents == 0) { 475 if (nextents == 0) {
507 xfs_iext_insert(ip, 0, 1, new, 476 xfs_iext_insert(ip, *idx, 1, new,
508 whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0); 477 whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0);
509 478
510 ASSERT(cur == NULL); 479 ASSERT(cur == NULL);
511 ifp->if_lastex = 0; 480
512 if (!isnullstartblock(new->br_startblock)) { 481 if (!isnullstartblock(new->br_startblock)) {
513 XFS_IFORK_NEXT_SET(ip, whichfork, 1); 482 XFS_IFORK_NEXT_SET(ip, whichfork, 1);
514 logflags = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); 483 logflags = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
@@ -522,27 +491,25 @@ xfs_bmap_add_extent(
522 if (cur) 491 if (cur)
523 ASSERT((cur->bc_private.b.flags & 492 ASSERT((cur->bc_private.b.flags &
524 XFS_BTCUR_BPRV_WASDEL) == 0); 493 XFS_BTCUR_BPRV_WASDEL) == 0);
525 if ((error = xfs_bmap_add_extent_hole_delay(ip, idx, new, 494 error = xfs_bmap_add_extent_hole_delay(ip, idx, new,
526 &logflags, rsvd))) 495 &logflags);
527 goto done;
528 } 496 }
529 /* 497 /*
530 * Real allocation off the end of the file. 498 * Real allocation off the end of the file.
531 */ 499 */
532 else if (idx == nextents) { 500 else if (*idx == nextents) {
533 if (cur) 501 if (cur)
534 ASSERT((cur->bc_private.b.flags & 502 ASSERT((cur->bc_private.b.flags &
535 XFS_BTCUR_BPRV_WASDEL) == 0); 503 XFS_BTCUR_BPRV_WASDEL) == 0);
536 if ((error = xfs_bmap_add_extent_hole_real(ip, idx, cur, new, 504 error = xfs_bmap_add_extent_hole_real(ip, idx, cur, new,
537 &logflags, whichfork))) 505 &logflags, whichfork);
538 goto done;
539 } else { 506 } else {
540 xfs_bmbt_irec_t prev; /* old extent at offset idx */ 507 xfs_bmbt_irec_t prev; /* old extent at offset idx */
541 508
542 /* 509 /*
543 * Get the record referred to by idx. 510 * Get the record referred to by idx.
544 */ 511 */
545 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &prev); 512 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &prev);
546 /* 513 /*
547 * If it's a real allocation record, and the new allocation ends 514 * If it's a real allocation record, and the new allocation ends
548 * after the start of the referred to record, then we're filling 515 * after the start of the referred to record, then we're filling
@@ -557,22 +524,18 @@ xfs_bmap_add_extent(
557 if (cur) 524 if (cur)
558 ASSERT(cur->bc_private.b.flags & 525 ASSERT(cur->bc_private.b.flags &
559 XFS_BTCUR_BPRV_WASDEL); 526 XFS_BTCUR_BPRV_WASDEL);
560 if ((error = xfs_bmap_add_extent_delay_real(ip, 527 error = xfs_bmap_add_extent_delay_real(ip,
561 idx, &cur, new, &da_new, first, flist, 528 idx, &cur, new, &da_new,
562 &logflags, rsvd))) 529 first, flist, &logflags);
563 goto done;
564 } else if (new->br_state == XFS_EXT_NORM) {
565 ASSERT(new->br_state == XFS_EXT_NORM);
566 if ((error = xfs_bmap_add_extent_unwritten_real(
567 ip, idx, &cur, new, &logflags)))
568 goto done;
569 } else { 530 } else {
570 ASSERT(new->br_state == XFS_EXT_UNWRITTEN); 531 ASSERT(new->br_state == XFS_EXT_NORM ||
571 if ((error = xfs_bmap_add_extent_unwritten_real( 532 new->br_state == XFS_EXT_UNWRITTEN);
572 ip, idx, &cur, new, &logflags))) 533
534 error = xfs_bmap_add_extent_unwritten_real(ip,
535 idx, &cur, new, &logflags);
536 if (error)
573 goto done; 537 goto done;
574 } 538 }
575 ASSERT(*curp == cur || *curp == NULL);
576 } 539 }
577 /* 540 /*
578 * Otherwise we're filling in a hole with an allocation. 541 * Otherwise we're filling in a hole with an allocation.
@@ -581,13 +544,15 @@ xfs_bmap_add_extent(
581 if (cur) 544 if (cur)
582 ASSERT((cur->bc_private.b.flags & 545 ASSERT((cur->bc_private.b.flags &
583 XFS_BTCUR_BPRV_WASDEL) == 0); 546 XFS_BTCUR_BPRV_WASDEL) == 0);
584 if ((error = xfs_bmap_add_extent_hole_real(ip, idx, cur, 547 error = xfs_bmap_add_extent_hole_real(ip, idx, cur,
585 new, &logflags, whichfork))) 548 new, &logflags, whichfork);
586 goto done;
587 } 549 }
588 } 550 }
589 551
552 if (error)
553 goto done;
590 ASSERT(*curp == cur || *curp == NULL); 554 ASSERT(*curp == cur || *curp == NULL);
555
591 /* 556 /*
592 * Convert to a btree if necessary. 557 * Convert to a btree if necessary.
593 */ 558 */
@@ -614,8 +579,8 @@ xfs_bmap_add_extent(
614 nblks += cur->bc_private.b.allocated; 579 nblks += cur->bc_private.b.allocated;
615 ASSERT(nblks <= da_old); 580 ASSERT(nblks <= da_old);
616 if (nblks < da_old) 581 if (nblks < da_old)
617 xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS, 582 xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS,
618 (int64_t)(da_old - nblks), rsvd); 583 (int64_t)(da_old - nblks), 0);
619 } 584 }
620 /* 585 /*
621 * Clear out the allocated field, done with it now in any case. 586 * Clear out the allocated field, done with it now in any case.
@@ -640,14 +605,13 @@ done:
640STATIC int /* error */ 605STATIC int /* error */
641xfs_bmap_add_extent_delay_real( 606xfs_bmap_add_extent_delay_real(
642 xfs_inode_t *ip, /* incore inode pointer */ 607 xfs_inode_t *ip, /* incore inode pointer */
643 xfs_extnum_t idx, /* extent number to update/insert */ 608 xfs_extnum_t *idx, /* extent number to update/insert */
644 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ 609 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
645 xfs_bmbt_irec_t *new, /* new data to add to file extents */ 610 xfs_bmbt_irec_t *new, /* new data to add to file extents */
646 xfs_filblks_t *dnew, /* new delayed-alloc indirect blocks */ 611 xfs_filblks_t *dnew, /* new delayed-alloc indirect blocks */
647 xfs_fsblock_t *first, /* pointer to firstblock variable */ 612 xfs_fsblock_t *first, /* pointer to firstblock variable */
648 xfs_bmap_free_t *flist, /* list of extents to be freed */ 613 xfs_bmap_free_t *flist, /* list of extents to be freed */
649 int *logflagsp, /* inode logging flags */ 614 int *logflagsp) /* inode logging flags */
650 int rsvd) /* OK to use reserved data block allocation */
651{ 615{
652 xfs_btree_cur_t *cur; /* btree cursor */ 616 xfs_btree_cur_t *cur; /* btree cursor */
653 int diff; /* temp value */ 617 int diff; /* temp value */
@@ -673,7 +637,7 @@ xfs_bmap_add_extent_delay_real(
673 */ 637 */
674 cur = *curp; 638 cur = *curp;
675 ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); 639 ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
676 ep = xfs_iext_get_ext(ifp, idx); 640 ep = xfs_iext_get_ext(ifp, *idx);
677 xfs_bmbt_get_all(ep, &PREV); 641 xfs_bmbt_get_all(ep, &PREV);
678 new_endoff = new->br_startoff + new->br_blockcount; 642 new_endoff = new->br_startoff + new->br_blockcount;
679 ASSERT(PREV.br_startoff <= new->br_startoff); 643 ASSERT(PREV.br_startoff <= new->br_startoff);
@@ -692,9 +656,9 @@ xfs_bmap_add_extent_delay_real(
692 * Check and set flags if this segment has a left neighbor. 656 * Check and set flags if this segment has a left neighbor.
693 * Don't set contiguous if the combined extent would be too large. 657 * Don't set contiguous if the combined extent would be too large.
694 */ 658 */
695 if (idx > 0) { 659 if (*idx > 0) {
696 state |= BMAP_LEFT_VALID; 660 state |= BMAP_LEFT_VALID;
697 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &LEFT); 661 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &LEFT);
698 662
699 if (isnullstartblock(LEFT.br_startblock)) 663 if (isnullstartblock(LEFT.br_startblock))
700 state |= BMAP_LEFT_DELAY; 664 state |= BMAP_LEFT_DELAY;
@@ -712,9 +676,9 @@ xfs_bmap_add_extent_delay_real(
712 * Don't set contiguous if the combined extent would be too large. 676 * Don't set contiguous if the combined extent would be too large.
713 * Also check for all-three-contiguous being too large. 677 * Also check for all-three-contiguous being too large.
714 */ 678 */
715 if (idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) { 679 if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) {
716 state |= BMAP_RIGHT_VALID; 680 state |= BMAP_RIGHT_VALID;
717 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx + 1), &RIGHT); 681 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT);
718 682
719 if (isnullstartblock(RIGHT.br_startblock)) 683 if (isnullstartblock(RIGHT.br_startblock))
720 state |= BMAP_RIGHT_DELAY; 684 state |= BMAP_RIGHT_DELAY;
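
Both neighbour checks above follow the same pattern; a hedged standalone sketch (hypothetical helper, reusing the flag names from this file) of how the left/right state bits are gathered for the extent at *idx:

	static int example_neighbour_state(xfs_ifork_t *ifp, xfs_extnum_t idx)
	{
		xfs_bmbt_irec_t	rec;
		int		state = 0;

		if (idx > 0) {					/* a record exists to the left */
			state |= BMAP_LEFT_VALID;
			xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &rec);
			if (isnullstartblock(rec.br_startblock))
				state |= BMAP_LEFT_DELAY;	/* left neighbour is delalloc */
		}
		if (idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) {
			state |= BMAP_RIGHT_VALID;		/* a record exists to the right */
			xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx + 1), &rec);
			if (isnullstartblock(rec.br_startblock))
				state |= BMAP_RIGHT_DELAY;	/* right neighbour is delalloc */
		}
		return state;
	}
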
@@ -745,14 +709,14 @@ xfs_bmap_add_extent_delay_real(
745 * Filling in all of a previously delayed allocation extent. 709 * Filling in all of a previously delayed allocation extent.
746 * The left and right neighbors are both contiguous with new. 710 * The left and right neighbors are both contiguous with new.
747 */ 711 */
748 trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); 712 --*idx;
749 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), 713 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
714 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
750 LEFT.br_blockcount + PREV.br_blockcount + 715 LEFT.br_blockcount + PREV.br_blockcount +
751 RIGHT.br_blockcount); 716 RIGHT.br_blockcount);
752 trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); 717 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
753 718
754 xfs_iext_remove(ip, idx, 2, state); 719 xfs_iext_remove(ip, *idx + 1, 2, state);
755 ip->i_df.if_lastex = idx - 1;
756 ip->i_d.di_nextents--; 720 ip->i_d.di_nextents--;
757 if (cur == NULL) 721 if (cur == NULL)
758 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; 722 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -784,13 +748,14 @@ xfs_bmap_add_extent_delay_real(
784 * Filling in all of a previously delayed allocation extent. 748 * Filling in all of a previously delayed allocation extent.
785 * The left neighbor is contiguous, the right is not. 749 * The left neighbor is contiguous, the right is not.
786 */ 750 */
787 trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); 751 --*idx;
788 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), 752
753 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
754 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
789 LEFT.br_blockcount + PREV.br_blockcount); 755 LEFT.br_blockcount + PREV.br_blockcount);
790 trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); 756 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
791 757
792 ip->i_df.if_lastex = idx - 1; 758 xfs_iext_remove(ip, *idx + 1, 1, state);
793 xfs_iext_remove(ip, idx, 1, state);
794 if (cur == NULL) 759 if (cur == NULL)
795 rval = XFS_ILOG_DEXT; 760 rval = XFS_ILOG_DEXT;
796 else { 761 else {
@@ -814,14 +779,13 @@ xfs_bmap_add_extent_delay_real(
814 * Filling in all of a previously delayed allocation extent. 779 * Filling in all of a previously delayed allocation extent.
815 * The right neighbor is contiguous, the left is not. 780 * The right neighbor is contiguous, the left is not.
816 */ 781 */
817 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 782 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
818 xfs_bmbt_set_startblock(ep, new->br_startblock); 783 xfs_bmbt_set_startblock(ep, new->br_startblock);
819 xfs_bmbt_set_blockcount(ep, 784 xfs_bmbt_set_blockcount(ep,
820 PREV.br_blockcount + RIGHT.br_blockcount); 785 PREV.br_blockcount + RIGHT.br_blockcount);
821 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 786 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
822 787
823 ip->i_df.if_lastex = idx; 788 xfs_iext_remove(ip, *idx + 1, 1, state);
824 xfs_iext_remove(ip, idx + 1, 1, state);
825 if (cur == NULL) 789 if (cur == NULL)
826 rval = XFS_ILOG_DEXT; 790 rval = XFS_ILOG_DEXT;
827 else { 791 else {
@@ -837,6 +801,7 @@ xfs_bmap_add_extent_delay_real(
837 RIGHT.br_blockcount, PREV.br_state))) 801 RIGHT.br_blockcount, PREV.br_state)))
838 goto done; 802 goto done;
839 } 803 }
804
840 *dnew = 0; 805 *dnew = 0;
841 break; 806 break;
842 807
@@ -846,11 +811,10 @@ xfs_bmap_add_extent_delay_real(
846 * Neither the left nor right neighbors are contiguous with 811 * Neither the left nor right neighbors are contiguous with
847 * the new one. 812 * the new one.
848 */ 813 */
849 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 814 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
850 xfs_bmbt_set_startblock(ep, new->br_startblock); 815 xfs_bmbt_set_startblock(ep, new->br_startblock);
851 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 816 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
852 817
853 ip->i_df.if_lastex = idx;
854 ip->i_d.di_nextents++; 818 ip->i_d.di_nextents++;
855 if (cur == NULL) 819 if (cur == NULL)
856 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; 820 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -866,6 +830,7 @@ xfs_bmap_add_extent_delay_real(
866 goto done; 830 goto done;
867 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 831 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
868 } 832 }
833
869 *dnew = 0; 834 *dnew = 0;
870 break; 835 break;
871 836
@@ -874,17 +839,16 @@ xfs_bmap_add_extent_delay_real(
874 * Filling in the first part of a previous delayed allocation. 839 * Filling in the first part of a previous delayed allocation.
875 * The left neighbor is contiguous. 840 * The left neighbor is contiguous.
876 */ 841 */
877 trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); 842 trace_xfs_bmap_pre_update(ip, *idx - 1, state, _THIS_IP_);
878 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), 843 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx - 1),
879 LEFT.br_blockcount + new->br_blockcount); 844 LEFT.br_blockcount + new->br_blockcount);
880 xfs_bmbt_set_startoff(ep, 845 xfs_bmbt_set_startoff(ep,
881 PREV.br_startoff + new->br_blockcount); 846 PREV.br_startoff + new->br_blockcount);
882 trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); 847 trace_xfs_bmap_post_update(ip, *idx - 1, state, _THIS_IP_);
883 848
884 temp = PREV.br_blockcount - new->br_blockcount; 849 temp = PREV.br_blockcount - new->br_blockcount;
885 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 850 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
886 xfs_bmbt_set_blockcount(ep, temp); 851 xfs_bmbt_set_blockcount(ep, temp);
887 ip->i_df.if_lastex = idx - 1;
888 if (cur == NULL) 852 if (cur == NULL)
889 rval = XFS_ILOG_DEXT; 853 rval = XFS_ILOG_DEXT;
890 else { 854 else {
@@ -904,7 +868,9 @@ xfs_bmap_add_extent_delay_real(
904 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), 868 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
905 startblockval(PREV.br_startblock)); 869 startblockval(PREV.br_startblock));
906 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); 870 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
907 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 871 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
872
873 --*idx;
908 *dnew = temp; 874 *dnew = temp;
909 break; 875 break;
910 876
@@ -913,12 +879,11 @@ xfs_bmap_add_extent_delay_real(
913 * Filling in the first part of a previous delayed allocation. 879 * Filling in the first part of a previous delayed allocation.
914 * The left neighbor is not contiguous. 880 * The left neighbor is not contiguous.
915 */ 881 */
916 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 882 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
917 xfs_bmbt_set_startoff(ep, new_endoff); 883 xfs_bmbt_set_startoff(ep, new_endoff);
918 temp = PREV.br_blockcount - new->br_blockcount; 884 temp = PREV.br_blockcount - new->br_blockcount;
919 xfs_bmbt_set_blockcount(ep, temp); 885 xfs_bmbt_set_blockcount(ep, temp);
920 xfs_iext_insert(ip, idx, 1, new, state); 886 xfs_iext_insert(ip, *idx, 1, new, state);
921 ip->i_df.if_lastex = idx;
922 ip->i_d.di_nextents++; 887 ip->i_d.di_nextents++;
923 if (cur == NULL) 888 if (cur == NULL)
924 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; 889 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -946,9 +911,10 @@ xfs_bmap_add_extent_delay_real(
946 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), 911 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
947 startblockval(PREV.br_startblock) - 912 startblockval(PREV.br_startblock) -
948 (cur ? cur->bc_private.b.allocated : 0)); 913 (cur ? cur->bc_private.b.allocated : 0));
949 ep = xfs_iext_get_ext(ifp, idx + 1); 914 ep = xfs_iext_get_ext(ifp, *idx + 1);
950 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); 915 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
951 trace_xfs_bmap_post_update(ip, idx + 1, state, _THIS_IP_); 916 trace_xfs_bmap_post_update(ip, *idx + 1, state, _THIS_IP_);
917
952 *dnew = temp; 918 *dnew = temp;
953 break; 919 break;
954 920
@@ -958,15 +924,13 @@ xfs_bmap_add_extent_delay_real(
958 * The right neighbor is contiguous with the new allocation. 924 * The right neighbor is contiguous with the new allocation.
959 */ 925 */
960 temp = PREV.br_blockcount - new->br_blockcount; 926 temp = PREV.br_blockcount - new->br_blockcount;
961 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 927 trace_xfs_bmap_pre_update(ip, *idx + 1, state, _THIS_IP_);
962 trace_xfs_bmap_pre_update(ip, idx + 1, state, _THIS_IP_);
963 xfs_bmbt_set_blockcount(ep, temp); 928 xfs_bmbt_set_blockcount(ep, temp);
964 xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, idx + 1), 929 xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx + 1),
965 new->br_startoff, new->br_startblock, 930 new->br_startoff, new->br_startblock,
966 new->br_blockcount + RIGHT.br_blockcount, 931 new->br_blockcount + RIGHT.br_blockcount,
967 RIGHT.br_state); 932 RIGHT.br_state);
968 trace_xfs_bmap_post_update(ip, idx + 1, state, _THIS_IP_); 933 trace_xfs_bmap_post_update(ip, *idx + 1, state, _THIS_IP_);
969 ip->i_df.if_lastex = idx + 1;
970 if (cur == NULL) 934 if (cur == NULL)
971 rval = XFS_ILOG_DEXT; 935 rval = XFS_ILOG_DEXT;
972 else { 936 else {
@@ -983,10 +947,14 @@ xfs_bmap_add_extent_delay_real(
983 RIGHT.br_state))) 947 RIGHT.br_state)))
984 goto done; 948 goto done;
985 } 949 }
950
986 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), 951 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
987 startblockval(PREV.br_startblock)); 952 startblockval(PREV.br_startblock));
953 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
988 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); 954 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
989 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 955 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
956
957 ++*idx;
990 *dnew = temp; 958 *dnew = temp;
991 break; 959 break;
992 960
@@ -996,10 +964,9 @@ xfs_bmap_add_extent_delay_real(
996 * The right neighbor is not contiguous. 964 * The right neighbor is not contiguous.
997 */ 965 */
998 temp = PREV.br_blockcount - new->br_blockcount; 966 temp = PREV.br_blockcount - new->br_blockcount;
999 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 967 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
1000 xfs_bmbt_set_blockcount(ep, temp); 968 xfs_bmbt_set_blockcount(ep, temp);
1001 xfs_iext_insert(ip, idx + 1, 1, new, state); 969 xfs_iext_insert(ip, *idx + 1, 1, new, state);
1002 ip->i_df.if_lastex = idx + 1;
1003 ip->i_d.di_nextents++; 970 ip->i_d.di_nextents++;
1004 if (cur == NULL) 971 if (cur == NULL)
1005 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; 972 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1027,9 +994,11 @@ xfs_bmap_add_extent_delay_real(
1027 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), 994 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
1028 startblockval(PREV.br_startblock) - 995 startblockval(PREV.br_startblock) -
1029 (cur ? cur->bc_private.b.allocated : 0)); 996 (cur ? cur->bc_private.b.allocated : 0));
1030 ep = xfs_iext_get_ext(ifp, idx); 997 ep = xfs_iext_get_ext(ifp, *idx);
1031 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); 998 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
1032 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 999 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1000
1001 ++*idx;
1033 *dnew = temp; 1002 *dnew = temp;
1034 break; 1003 break;
1035 1004
@@ -1038,18 +1007,34 @@ xfs_bmap_add_extent_delay_real(
1038 * Filling in the middle part of a previous delayed allocation. 1007 * Filling in the middle part of a previous delayed allocation.
1039 * Contiguity is impossible here. 1008 * Contiguity is impossible here.
1040 * This case is avoided almost all the time. 1009 * This case is avoided almost all the time.
1010 *
1011 * We start with a delayed allocation:
1012 *
1013 * +ddddddddddddddddddddddddddddddddddddddddddddddddddddddd+
1014 * PREV @ idx
1015 *
1016 * and we are allocating:
1017 * +rrrrrrrrrrrrrrrrr+
1018 * new
1019 *
1020 * and we set it up for insertion as:
1021 * +ddddddddddddddddddd+rrrrrrrrrrrrrrrrr+ddddddddddddddddd+
1022 * new
1023 * PREV @ idx LEFT RIGHT
1024 * inserted at idx + 1
1041 */ 1025 */
1042 temp = new->br_startoff - PREV.br_startoff; 1026 temp = new->br_startoff - PREV.br_startoff;
1043 trace_xfs_bmap_pre_update(ip, idx, 0, _THIS_IP_);
1044 xfs_bmbt_set_blockcount(ep, temp);
1045 r[0] = *new;
1046 r[1].br_state = PREV.br_state;
1047 r[1].br_startblock = 0;
1048 r[1].br_startoff = new_endoff;
1049 temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff; 1027 temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff;
1050 r[1].br_blockcount = temp2; 1028 trace_xfs_bmap_pre_update(ip, *idx, 0, _THIS_IP_);
1051 xfs_iext_insert(ip, idx + 1, 2, &r[0], state); 1029 xfs_bmbt_set_blockcount(ep, temp); /* truncate PREV */
1052 ip->i_df.if_lastex = idx + 1; 1030 LEFT = *new;
1031 RIGHT.br_state = PREV.br_state;
1032 RIGHT.br_startblock = nullstartblock(
1033 (int)xfs_bmap_worst_indlen(ip, temp2));
1034 RIGHT.br_startoff = new_endoff;
1035 RIGHT.br_blockcount = temp2;
1036 /* insert LEFT (r[0]) and RIGHT (r[1]) at the same time */
1037 xfs_iext_insert(ip, *idx + 1, 2, &LEFT, state);
1053 ip->i_d.di_nextents++; 1038 ip->i_d.di_nextents++;
1054 if (cur == NULL) 1039 if (cur == NULL)
1055 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; 1040 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
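
The diagram above describes splitting one delayed extent into a delayed head, the new real extent, and a delayed tail. A hedged sketch of just the length arithmetic (hypothetical helper, mirroring the temp/temp2 computation in the hunk):

	static void example_split_lengths(const xfs_bmbt_irec_t *prev,	/* PREV, the delayed extent */
					  const xfs_bmbt_irec_t *new,	/* the real allocation inside it */
					  xfs_filblks_t *head_len,
					  xfs_filblks_t *tail_len)
	{
		xfs_fileoff_t new_endoff = new->br_startoff + new->br_blockcount;

		*head_len = new->br_startoff - prev->br_startoff;			/* "temp"  */
		*tail_len = prev->br_startoff + prev->br_blockcount - new_endoff;	/* "temp2" */
	}
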
@@ -1079,7 +1064,8 @@ xfs_bmap_add_extent_delay_real(
1079 diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) - 1064 diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) -
1080 (cur ? cur->bc_private.b.allocated : 0)); 1065 (cur ? cur->bc_private.b.allocated : 0));
1081 if (diff > 0 && 1066 if (diff > 0 &&
1082 xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS, -((int64_t)diff), rsvd)) { 1067 xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS,
1068 -((int64_t)diff), 0)) {
1083 /* 1069 /*
1084 * Ick gross gag me with a spoon. 1070 * Ick gross gag me with a spoon.
1085 */ 1071 */
@@ -1089,27 +1075,31 @@ xfs_bmap_add_extent_delay_real(
1089 temp--; 1075 temp--;
1090 diff--; 1076 diff--;
1091 if (!diff || 1077 if (!diff ||
1092 !xfs_mod_incore_sb(ip->i_mount, 1078 !xfs_icsb_modify_counters(ip->i_mount,
1093 XFS_SBS_FDBLOCKS, -((int64_t)diff), rsvd)) 1079 XFS_SBS_FDBLOCKS,
1080 -((int64_t)diff), 0))
1094 break; 1081 break;
1095 } 1082 }
1096 if (temp2) { 1083 if (temp2) {
1097 temp2--; 1084 temp2--;
1098 diff--; 1085 diff--;
1099 if (!diff || 1086 if (!diff ||
1100 !xfs_mod_incore_sb(ip->i_mount, 1087 !xfs_icsb_modify_counters(ip->i_mount,
1101 XFS_SBS_FDBLOCKS, -((int64_t)diff), rsvd)) 1088 XFS_SBS_FDBLOCKS,
1089 -((int64_t)diff), 0))
1102 break; 1090 break;
1103 } 1091 }
1104 } 1092 }
1105 } 1093 }
1106 ep = xfs_iext_get_ext(ifp, idx); 1094 ep = xfs_iext_get_ext(ifp, *idx);
1107 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); 1095 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
1108 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 1096 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1109 trace_xfs_bmap_pre_update(ip, idx + 2, state, _THIS_IP_); 1097 trace_xfs_bmap_pre_update(ip, *idx + 2, state, _THIS_IP_);
1110 xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx + 2), 1098 xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx + 2),
1111 nullstartblock((int)temp2)); 1099 nullstartblock((int)temp2));
1112 trace_xfs_bmap_post_update(ip, idx + 2, state, _THIS_IP_); 1100 trace_xfs_bmap_post_update(ip, *idx + 2, state, _THIS_IP_);
1101
1102 ++*idx;
1113 *dnew = temp + temp2; 1103 *dnew = temp + temp2;
1114 break; 1104 break;
1115 1105
@@ -1141,7 +1131,7 @@ done:
1141STATIC int /* error */ 1131STATIC int /* error */
1142xfs_bmap_add_extent_unwritten_real( 1132xfs_bmap_add_extent_unwritten_real(
1143 xfs_inode_t *ip, /* incore inode pointer */ 1133 xfs_inode_t *ip, /* incore inode pointer */
1144 xfs_extnum_t idx, /* extent number to update/insert */ 1134 xfs_extnum_t *idx, /* extent number to update/insert */
1145 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ 1135 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
1146 xfs_bmbt_irec_t *new, /* new data to add to file extents */ 1136 xfs_bmbt_irec_t *new, /* new data to add to file extents */
1147 int *logflagsp) /* inode logging flags */ 1137 int *logflagsp) /* inode logging flags */
@@ -1168,7 +1158,7 @@ xfs_bmap_add_extent_unwritten_real(
1168 error = 0; 1158 error = 0;
1169 cur = *curp; 1159 cur = *curp;
1170 ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); 1160 ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
1171 ep = xfs_iext_get_ext(ifp, idx); 1161 ep = xfs_iext_get_ext(ifp, *idx);
1172 xfs_bmbt_get_all(ep, &PREV); 1162 xfs_bmbt_get_all(ep, &PREV);
1173 newext = new->br_state; 1163 newext = new->br_state;
1174 oldext = (newext == XFS_EXT_UNWRITTEN) ? 1164 oldext = (newext == XFS_EXT_UNWRITTEN) ?
@@ -1191,9 +1181,9 @@ xfs_bmap_add_extent_unwritten_real(
1191 * Check and set flags if this segment has a left neighbor. 1181 * Check and set flags if this segment has a left neighbor.
1192 * Don't set contiguous if the combined extent would be too large. 1182 * Don't set contiguous if the combined extent would be too large.
1193 */ 1183 */
1194 if (idx > 0) { 1184 if (*idx > 0) {
1195 state |= BMAP_LEFT_VALID; 1185 state |= BMAP_LEFT_VALID;
1196 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &LEFT); 1186 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &LEFT);
1197 1187
1198 if (isnullstartblock(LEFT.br_startblock)) 1188 if (isnullstartblock(LEFT.br_startblock))
1199 state |= BMAP_LEFT_DELAY; 1189 state |= BMAP_LEFT_DELAY;
@@ -1211,9 +1201,9 @@ xfs_bmap_add_extent_unwritten_real(
1211 * Don't set contiguous if the combined extent would be too large. 1201 * Don't set contiguous if the combined extent would be too large.
1212 * Also check for all-three-contiguous being too large. 1202 * Also check for all-three-contiguous being too large.
1213 */ 1203 */
1214 if (idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) { 1204 if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) {
1215 state |= BMAP_RIGHT_VALID; 1205 state |= BMAP_RIGHT_VALID;
1216 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx + 1), &RIGHT); 1206 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT);
1217 if (isnullstartblock(RIGHT.br_startblock)) 1207 if (isnullstartblock(RIGHT.br_startblock))
1218 state |= BMAP_RIGHT_DELAY; 1208 state |= BMAP_RIGHT_DELAY;
1219 } 1209 }
@@ -1242,14 +1232,15 @@ xfs_bmap_add_extent_unwritten_real(
1242 * Setting all of a previous oldext extent to newext. 1232 * Setting all of a previous oldext extent to newext.
1243 * The left and right neighbors are both contiguous with new. 1233 * The left and right neighbors are both contiguous with new.
1244 */ 1234 */
1245 trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); 1235 --*idx;
1246 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), 1236
1237 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
1238 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
1247 LEFT.br_blockcount + PREV.br_blockcount + 1239 LEFT.br_blockcount + PREV.br_blockcount +
1248 RIGHT.br_blockcount); 1240 RIGHT.br_blockcount);
1249 trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); 1241 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1250 1242
1251 xfs_iext_remove(ip, idx, 2, state); 1243 xfs_iext_remove(ip, *idx + 1, 2, state);
1252 ip->i_df.if_lastex = idx - 1;
1253 ip->i_d.di_nextents -= 2; 1244 ip->i_d.di_nextents -= 2;
1254 if (cur == NULL) 1245 if (cur == NULL)
1255 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; 1246 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1285,13 +1276,14 @@ xfs_bmap_add_extent_unwritten_real(
1285 * Setting all of a previous oldext extent to newext. 1276 * Setting all of a previous oldext extent to newext.
1286 * The left neighbor is contiguous, the right is not. 1277 * The left neighbor is contiguous, the right is not.
1287 */ 1278 */
1288 trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); 1279 --*idx;
1289 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), 1280
1281 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
1282 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
1290 LEFT.br_blockcount + PREV.br_blockcount); 1283 LEFT.br_blockcount + PREV.br_blockcount);
1291 trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); 1284 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1292 1285
1293 ip->i_df.if_lastex = idx - 1; 1286 xfs_iext_remove(ip, *idx + 1, 1, state);
1294 xfs_iext_remove(ip, idx, 1, state);
1295 ip->i_d.di_nextents--; 1287 ip->i_d.di_nextents--;
1296 if (cur == NULL) 1288 if (cur == NULL)
1297 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; 1289 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1321,13 +1313,12 @@ xfs_bmap_add_extent_unwritten_real(
1321 * Setting all of a previous oldext extent to newext. 1313 * Setting all of a previous oldext extent to newext.
1322 * The right neighbor is contiguous, the left is not. 1314 * The right neighbor is contiguous, the left is not.
1323 */ 1315 */
1324 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 1316 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
1325 xfs_bmbt_set_blockcount(ep, 1317 xfs_bmbt_set_blockcount(ep,
1326 PREV.br_blockcount + RIGHT.br_blockcount); 1318 PREV.br_blockcount + RIGHT.br_blockcount);
1327 xfs_bmbt_set_state(ep, newext); 1319 xfs_bmbt_set_state(ep, newext);
1328 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 1320 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1329 ip->i_df.if_lastex = idx; 1321 xfs_iext_remove(ip, *idx + 1, 1, state);
1330 xfs_iext_remove(ip, idx + 1, 1, state);
1331 ip->i_d.di_nextents--; 1322 ip->i_d.di_nextents--;
1332 if (cur == NULL) 1323 if (cur == NULL)
1333 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; 1324 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1358,11 +1349,10 @@ xfs_bmap_add_extent_unwritten_real(
1358 * Neither the left nor right neighbors are contiguous with 1349 * Neither the left nor right neighbors are contiguous with
1359 * the new one. 1350 * the new one.
1360 */ 1351 */
1361 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 1352 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
1362 xfs_bmbt_set_state(ep, newext); 1353 xfs_bmbt_set_state(ep, newext);
1363 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 1354 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1364 1355
1365 ip->i_df.if_lastex = idx;
1366 if (cur == NULL) 1356 if (cur == NULL)
1367 rval = XFS_ILOG_DEXT; 1357 rval = XFS_ILOG_DEXT;
1368 else { 1358 else {
@@ -1384,21 +1374,22 @@ xfs_bmap_add_extent_unwritten_real(
1384 * Setting the first part of a previous oldext extent to newext. 1374 * Setting the first part of a previous oldext extent to newext.
1385 * The left neighbor is contiguous. 1375 * The left neighbor is contiguous.
1386 */ 1376 */
1387 trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); 1377 trace_xfs_bmap_pre_update(ip, *idx - 1, state, _THIS_IP_);
1388 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), 1378 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx - 1),
1389 LEFT.br_blockcount + new->br_blockcount); 1379 LEFT.br_blockcount + new->br_blockcount);
1390 xfs_bmbt_set_startoff(ep, 1380 xfs_bmbt_set_startoff(ep,
1391 PREV.br_startoff + new->br_blockcount); 1381 PREV.br_startoff + new->br_blockcount);
1392 trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); 1382 trace_xfs_bmap_post_update(ip, *idx - 1, state, _THIS_IP_);
1393 1383
1394 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 1384 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
1395 xfs_bmbt_set_startblock(ep, 1385 xfs_bmbt_set_startblock(ep,
1396 new->br_startblock + new->br_blockcount); 1386 new->br_startblock + new->br_blockcount);
1397 xfs_bmbt_set_blockcount(ep, 1387 xfs_bmbt_set_blockcount(ep,
1398 PREV.br_blockcount - new->br_blockcount); 1388 PREV.br_blockcount - new->br_blockcount);
1399 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 1389 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1390
1391 --*idx;
1400 1392
1401 ip->i_df.if_lastex = idx - 1;
1402 if (cur == NULL) 1393 if (cur == NULL)
1403 rval = XFS_ILOG_DEXT; 1394 rval = XFS_ILOG_DEXT;
1404 else { 1395 else {
@@ -1429,17 +1420,16 @@ xfs_bmap_add_extent_unwritten_real(
1429 * Setting the first part of a previous oldext extent to newext. 1420 * Setting the first part of a previous oldext extent to newext.
1430 * The left neighbor is not contiguous. 1421 * The left neighbor is not contiguous.
1431 */ 1422 */
1432 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 1423 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
1433 ASSERT(ep && xfs_bmbt_get_state(ep) == oldext); 1424 ASSERT(ep && xfs_bmbt_get_state(ep) == oldext);
1434 xfs_bmbt_set_startoff(ep, new_endoff); 1425 xfs_bmbt_set_startoff(ep, new_endoff);
1435 xfs_bmbt_set_blockcount(ep, 1426 xfs_bmbt_set_blockcount(ep,
1436 PREV.br_blockcount - new->br_blockcount); 1427 PREV.br_blockcount - new->br_blockcount);
1437 xfs_bmbt_set_startblock(ep, 1428 xfs_bmbt_set_startblock(ep,
1438 new->br_startblock + new->br_blockcount); 1429 new->br_startblock + new->br_blockcount);
1439 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 1430 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1440 1431
1441 xfs_iext_insert(ip, idx, 1, new, state); 1432 xfs_iext_insert(ip, *idx, 1, new, state);
1442 ip->i_df.if_lastex = idx;
1443 ip->i_d.di_nextents++; 1433 ip->i_d.di_nextents++;
1444 if (cur == NULL) 1434 if (cur == NULL)
1445 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; 1435 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1468,17 +1458,19 @@ xfs_bmap_add_extent_unwritten_real(
1468 * Setting the last part of a previous oldext extent to newext. 1458 * Setting the last part of a previous oldext extent to newext.
1469 * The right neighbor is contiguous with the new allocation. 1459 * The right neighbor is contiguous with the new allocation.
1470 */ 1460 */
1471 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 1461 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
1472 trace_xfs_bmap_pre_update(ip, idx + 1, state, _THIS_IP_);
1473 xfs_bmbt_set_blockcount(ep, 1462 xfs_bmbt_set_blockcount(ep,
1474 PREV.br_blockcount - new->br_blockcount); 1463 PREV.br_blockcount - new->br_blockcount);
1475 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 1464 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1476 xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, idx + 1), 1465
1466 ++*idx;
1467
1468 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
1469 xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx),
1477 new->br_startoff, new->br_startblock, 1470 new->br_startoff, new->br_startblock,
1478 new->br_blockcount + RIGHT.br_blockcount, newext); 1471 new->br_blockcount + RIGHT.br_blockcount, newext);
1479 trace_xfs_bmap_post_update(ip, idx + 1, state, _THIS_IP_); 1472 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1480 1473
1481 ip->i_df.if_lastex = idx + 1;
1482 if (cur == NULL) 1474 if (cur == NULL)
1483 rval = XFS_ILOG_DEXT; 1475 rval = XFS_ILOG_DEXT;
1484 else { 1476 else {
@@ -1508,13 +1500,14 @@ xfs_bmap_add_extent_unwritten_real(
1508 * Setting the last part of a previous oldext extent to newext. 1500 * Setting the last part of a previous oldext extent to newext.
1509 * The right neighbor is not contiguous. 1501 * The right neighbor is not contiguous.
1510 */ 1502 */
1511 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 1503 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
1512 xfs_bmbt_set_blockcount(ep, 1504 xfs_bmbt_set_blockcount(ep,
1513 PREV.br_blockcount - new->br_blockcount); 1505 PREV.br_blockcount - new->br_blockcount);
1514 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 1506 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1507
1508 ++*idx;
1509 xfs_iext_insert(ip, *idx, 1, new, state);
1515 1510
1516 xfs_iext_insert(ip, idx + 1, 1, new, state);
1517 ip->i_df.if_lastex = idx + 1;
1518 ip->i_d.di_nextents++; 1511 ip->i_d.di_nextents++;
1519 if (cur == NULL) 1512 if (cur == NULL)
1520 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; 1513 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1548,10 +1541,10 @@ xfs_bmap_add_extent_unwritten_real(
1548 * newext. Contiguity is impossible here. 1541 * newext. Contiguity is impossible here.
1549 * One extent becomes three extents. 1542 * One extent becomes three extents.
1550 */ 1543 */
1551 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 1544 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
1552 xfs_bmbt_set_blockcount(ep, 1545 xfs_bmbt_set_blockcount(ep,
1553 new->br_startoff - PREV.br_startoff); 1546 new->br_startoff - PREV.br_startoff);
1554 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 1547 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1555 1548
1556 r[0] = *new; 1549 r[0] = *new;
1557 r[1].br_startoff = new_endoff; 1550 r[1].br_startoff = new_endoff;
@@ -1559,8 +1552,10 @@ xfs_bmap_add_extent_unwritten_real(
1559 PREV.br_startoff + PREV.br_blockcount - new_endoff; 1552 PREV.br_startoff + PREV.br_blockcount - new_endoff;
1560 r[1].br_startblock = new->br_startblock + new->br_blockcount; 1553 r[1].br_startblock = new->br_startblock + new->br_blockcount;
1561 r[1].br_state = oldext; 1554 r[1].br_state = oldext;
1562 xfs_iext_insert(ip, idx + 1, 2, &r[0], state); 1555
1563 ip->i_df.if_lastex = idx + 1; 1556 ++*idx;
1557 xfs_iext_insert(ip, *idx, 2, &r[0], state);
1558
1564 ip->i_d.di_nextents += 2; 1559 ip->i_d.di_nextents += 2;
1565 if (cur == NULL) 1560 if (cur == NULL)
1566 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; 1561 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1630,12 +1625,10 @@ done:
1630STATIC int /* error */ 1625STATIC int /* error */
1631xfs_bmap_add_extent_hole_delay( 1626xfs_bmap_add_extent_hole_delay(
1632 xfs_inode_t *ip, /* incore inode pointer */ 1627 xfs_inode_t *ip, /* incore inode pointer */
1633 xfs_extnum_t idx, /* extent number to update/insert */ 1628 xfs_extnum_t *idx, /* extent number to update/insert */
1634 xfs_bmbt_irec_t *new, /* new data to add to file extents */ 1629 xfs_bmbt_irec_t *new, /* new data to add to file extents */
1635 int *logflagsp, /* inode logging flags */ 1630 int *logflagsp) /* inode logging flags */
1636 int rsvd) /* OK to allocate reserved blocks */
1637{ 1631{
1638 xfs_bmbt_rec_host_t *ep; /* extent record for idx */
1639 xfs_ifork_t *ifp; /* inode fork pointer */ 1632 xfs_ifork_t *ifp; /* inode fork pointer */
1640 xfs_bmbt_irec_t left; /* left neighbor extent entry */ 1633 xfs_bmbt_irec_t left; /* left neighbor extent entry */
1641 xfs_filblks_t newlen=0; /* new indirect size */ 1634 xfs_filblks_t newlen=0; /* new indirect size */
@@ -1645,16 +1638,15 @@ xfs_bmap_add_extent_hole_delay(
1645 xfs_filblks_t temp=0; /* temp for indirect calculations */ 1638 xfs_filblks_t temp=0; /* temp for indirect calculations */
1646 1639
1647 ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); 1640 ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
1648 ep = xfs_iext_get_ext(ifp, idx);
1649 state = 0; 1641 state = 0;
1650 ASSERT(isnullstartblock(new->br_startblock)); 1642 ASSERT(isnullstartblock(new->br_startblock));
1651 1643
1652 /* 1644 /*
1653 * Check and set flags if this segment has a left neighbor 1645 * Check and set flags if this segment has a left neighbor
1654 */ 1646 */
1655 if (idx > 0) { 1647 if (*idx > 0) {
1656 state |= BMAP_LEFT_VALID; 1648 state |= BMAP_LEFT_VALID;
1657 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &left); 1649 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &left);
1658 1650
1659 if (isnullstartblock(left.br_startblock)) 1651 if (isnullstartblock(left.br_startblock))
1660 state |= BMAP_LEFT_DELAY; 1652 state |= BMAP_LEFT_DELAY;
@@ -1664,9 +1656,9 @@ xfs_bmap_add_extent_hole_delay(
1664 * Check and set flags if the current (right) segment exists. 1656 * Check and set flags if the current (right) segment exists.
1665 * If it doesn't exist, we're converting the hole at end-of-file. 1657 * If it doesn't exist, we're converting the hole at end-of-file.
1666 */ 1658 */
1667 if (idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) { 1659 if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) {
1668 state |= BMAP_RIGHT_VALID; 1660 state |= BMAP_RIGHT_VALID;
1669 xfs_bmbt_get_all(ep, &right); 1661 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right);
1670 1662
1671 if (isnullstartblock(right.br_startblock)) 1663 if (isnullstartblock(right.br_startblock))
1672 state |= BMAP_RIGHT_DELAY; 1664 state |= BMAP_RIGHT_DELAY;
@@ -1699,21 +1691,21 @@ xfs_bmap_add_extent_hole_delay(
1699 * on the left and on the right. 1691 * on the left and on the right.
1700 * Merge all three into a single extent record. 1692 * Merge all three into a single extent record.
1701 */ 1693 */
1694 --*idx;
1702 temp = left.br_blockcount + new->br_blockcount + 1695 temp = left.br_blockcount + new->br_blockcount +
1703 right.br_blockcount; 1696 right.br_blockcount;
1704 1697
1705 trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); 1698 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
1706 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), temp); 1699 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp);
1707 oldlen = startblockval(left.br_startblock) + 1700 oldlen = startblockval(left.br_startblock) +
1708 startblockval(new->br_startblock) + 1701 startblockval(new->br_startblock) +
1709 startblockval(right.br_startblock); 1702 startblockval(right.br_startblock);
1710 newlen = xfs_bmap_worst_indlen(ip, temp); 1703 newlen = xfs_bmap_worst_indlen(ip, temp);
1711 xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx - 1), 1704 xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx),
1712 nullstartblock((int)newlen)); 1705 nullstartblock((int)newlen));
1713 trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); 1706 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1714 1707
1715 xfs_iext_remove(ip, idx, 1, state); 1708 xfs_iext_remove(ip, *idx + 1, 1, state);
1716 ip->i_df.if_lastex = idx - 1;
1717 break; 1709 break;
1718 1710
1719 case BMAP_LEFT_CONTIG: 1711 case BMAP_LEFT_CONTIG:
@@ -1722,17 +1714,17 @@ xfs_bmap_add_extent_hole_delay(
1722 * on the left. 1714 * on the left.
1723 * Merge the new allocation with the left neighbor. 1715 * Merge the new allocation with the left neighbor.
1724 */ 1716 */
1717 --*idx;
1725 temp = left.br_blockcount + new->br_blockcount; 1718 temp = left.br_blockcount + new->br_blockcount;
1726 trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); 1719
1727 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), temp); 1720 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
1721 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp);
1728 oldlen = startblockval(left.br_startblock) + 1722 oldlen = startblockval(left.br_startblock) +
1729 startblockval(new->br_startblock); 1723 startblockval(new->br_startblock);
1730 newlen = xfs_bmap_worst_indlen(ip, temp); 1724 newlen = xfs_bmap_worst_indlen(ip, temp);
1731 xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx - 1), 1725 xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx),
1732 nullstartblock((int)newlen)); 1726 nullstartblock((int)newlen));
1733 trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); 1727 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1734
1735 ip->i_df.if_lastex = idx - 1;
1736 break; 1728 break;
1737 1729
1738 case BMAP_RIGHT_CONTIG: 1730 case BMAP_RIGHT_CONTIG:
@@ -1741,16 +1733,15 @@ xfs_bmap_add_extent_hole_delay(
1741 * on the right. 1733 * on the right.
1742 * Merge the new allocation with the right neighbor. 1734 * Merge the new allocation with the right neighbor.
1743 */ 1735 */
1744 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 1736 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
1745 temp = new->br_blockcount + right.br_blockcount; 1737 temp = new->br_blockcount + right.br_blockcount;
1746 oldlen = startblockval(new->br_startblock) + 1738 oldlen = startblockval(new->br_startblock) +
1747 startblockval(right.br_startblock); 1739 startblockval(right.br_startblock);
1748 newlen = xfs_bmap_worst_indlen(ip, temp); 1740 newlen = xfs_bmap_worst_indlen(ip, temp);
1749 xfs_bmbt_set_allf(ep, new->br_startoff, 1741 xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx),
1742 new->br_startoff,
1750 nullstartblock((int)newlen), temp, right.br_state); 1743 nullstartblock((int)newlen), temp, right.br_state);
1751 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 1744 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1752
1753 ip->i_df.if_lastex = idx;
1754 break; 1745 break;
1755 1746
1756 case 0: 1747 case 0:
@@ -1760,14 +1751,13 @@ xfs_bmap_add_extent_hole_delay(
1760 * Insert a new entry. 1751 * Insert a new entry.
1761 */ 1752 */
1762 oldlen = newlen = 0; 1753 oldlen = newlen = 0;
1763 xfs_iext_insert(ip, idx, 1, new, state); 1754 xfs_iext_insert(ip, *idx, 1, new, state);
1764 ip->i_df.if_lastex = idx;
1765 break; 1755 break;
1766 } 1756 }
1767 if (oldlen != newlen) { 1757 if (oldlen != newlen) {
1768 ASSERT(oldlen > newlen); 1758 ASSERT(oldlen > newlen);
1769 xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS, 1759 xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS,
1770 (int64_t)(oldlen - newlen), rsvd); 1760 (int64_t)(oldlen - newlen), 0);
1771 /* 1761 /*
1772 * Nothing to do for disk quota accounting here. 1762 * Nothing to do for disk quota accounting here.
1773 */ 1763 */
@@ -1783,13 +1773,12 @@ xfs_bmap_add_extent_hole_delay(
1783STATIC int /* error */ 1773STATIC int /* error */
1784xfs_bmap_add_extent_hole_real( 1774xfs_bmap_add_extent_hole_real(
1785 xfs_inode_t *ip, /* incore inode pointer */ 1775 xfs_inode_t *ip, /* incore inode pointer */
1786 xfs_extnum_t idx, /* extent number to update/insert */ 1776 xfs_extnum_t *idx, /* extent number to update/insert */
1787 xfs_btree_cur_t *cur, /* if null, not a btree */ 1777 xfs_btree_cur_t *cur, /* if null, not a btree */
1788 xfs_bmbt_irec_t *new, /* new data to add to file extents */ 1778 xfs_bmbt_irec_t *new, /* new data to add to file extents */
1789 int *logflagsp, /* inode logging flags */ 1779 int *logflagsp, /* inode logging flags */
1790 int whichfork) /* data or attr fork */ 1780 int whichfork) /* data or attr fork */
1791{ 1781{
1792 xfs_bmbt_rec_host_t *ep; /* pointer to extent entry ins. point */
1793 int error; /* error return value */ 1782 int error; /* error return value */
1794 int i; /* temp state */ 1783 int i; /* temp state */
1795 xfs_ifork_t *ifp; /* inode fork pointer */ 1784 xfs_ifork_t *ifp; /* inode fork pointer */
@@ -1799,8 +1788,7 @@ xfs_bmap_add_extent_hole_real(
1799 int state; /* state bits, accessed thru macros */ 1788 int state; /* state bits, accessed thru macros */
1800 1789
1801 ifp = XFS_IFORK_PTR(ip, whichfork); 1790 ifp = XFS_IFORK_PTR(ip, whichfork);
1802 ASSERT(idx <= ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)); 1791 ASSERT(*idx <= ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t));
1803 ep = xfs_iext_get_ext(ifp, idx);
1804 state = 0; 1792 state = 0;
1805 1793
1806 if (whichfork == XFS_ATTR_FORK) 1794 if (whichfork == XFS_ATTR_FORK)
@@ -1809,9 +1797,9 @@ xfs_bmap_add_extent_hole_real(
1809 /* 1797 /*
1810 * Check and set flags if this segment has a left neighbor. 1798 * Check and set flags if this segment has a left neighbor.
1811 */ 1799 */
1812 if (idx > 0) { 1800 if (*idx > 0) {
1813 state |= BMAP_LEFT_VALID; 1801 state |= BMAP_LEFT_VALID;
1814 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &left); 1802 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &left);
1815 if (isnullstartblock(left.br_startblock)) 1803 if (isnullstartblock(left.br_startblock))
1816 state |= BMAP_LEFT_DELAY; 1804 state |= BMAP_LEFT_DELAY;
1817 } 1805 }
@@ -1820,9 +1808,9 @@ xfs_bmap_add_extent_hole_real(
1820 * Check and set flags if this segment has a current value. 1808 * Check and set flags if this segment has a current value.
1821 * Not true if we're inserting into the "hole" at eof. 1809 * Not true if we're inserting into the "hole" at eof.
1822 */ 1810 */
1823 if (idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) { 1811 if (*idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) {
1824 state |= BMAP_RIGHT_VALID; 1812 state |= BMAP_RIGHT_VALID;
1825 xfs_bmbt_get_all(ep, &right); 1813 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right);
1826 if (isnullstartblock(right.br_startblock)) 1814 if (isnullstartblock(right.br_startblock))
1827 state |= BMAP_RIGHT_DELAY; 1815 state |= BMAP_RIGHT_DELAY;
1828 } 1816 }
@@ -1859,14 +1847,15 @@ xfs_bmap_add_extent_hole_real(
1859 * left and on the right. 1847 * left and on the right.
1860 * Merge all three into a single extent record. 1848 * Merge all three into a single extent record.
1861 */ 1849 */
1862 trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); 1850 --*idx;
1863 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), 1851 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
1852 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
1864 left.br_blockcount + new->br_blockcount + 1853 left.br_blockcount + new->br_blockcount +
1865 right.br_blockcount); 1854 right.br_blockcount);
1866 trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); 1855 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1856
1857 xfs_iext_remove(ip, *idx + 1, 1, state);
1867 1858
1868 xfs_iext_remove(ip, idx, 1, state);
1869 ifp->if_lastex = idx - 1;
1870 XFS_IFORK_NEXT_SET(ip, whichfork, 1859 XFS_IFORK_NEXT_SET(ip, whichfork,
1871 XFS_IFORK_NEXTENTS(ip, whichfork) - 1); 1860 XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
1872 if (cur == NULL) { 1861 if (cur == NULL) {
@@ -1901,12 +1890,12 @@ xfs_bmap_add_extent_hole_real(
1901 * on the left. 1890 * on the left.
1902 * Merge the new allocation with the left neighbor. 1891 * Merge the new allocation with the left neighbor.
1903 */ 1892 */
1904 trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); 1893 --*idx;
1905 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), 1894 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
1895 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
1906 left.br_blockcount + new->br_blockcount); 1896 left.br_blockcount + new->br_blockcount);
1907 trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); 1897 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1908 1898
1909 ifp->if_lastex = idx - 1;
1910 if (cur == NULL) { 1899 if (cur == NULL) {
1911 rval = xfs_ilog_fext(whichfork); 1900 rval = xfs_ilog_fext(whichfork);
1912 } else { 1901 } else {
@@ -1932,13 +1921,13 @@ xfs_bmap_add_extent_hole_real(
1932 * on the right. 1921 * on the right.
1933 * Merge the new allocation with the right neighbor. 1922 * Merge the new allocation with the right neighbor.
1934 */ 1923 */
1935 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 1924 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
1936 xfs_bmbt_set_allf(ep, new->br_startoff, new->br_startblock, 1925 xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx),
1926 new->br_startoff, new->br_startblock,
1937 new->br_blockcount + right.br_blockcount, 1927 new->br_blockcount + right.br_blockcount,
1938 right.br_state); 1928 right.br_state);
1939 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 1929 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1940 1930
1941 ifp->if_lastex = idx;
1942 if (cur == NULL) { 1931 if (cur == NULL) {
1943 rval = xfs_ilog_fext(whichfork); 1932 rval = xfs_ilog_fext(whichfork);
1944 } else { 1933 } else {
@@ -1964,8 +1953,7 @@ xfs_bmap_add_extent_hole_real(
1964 * real allocation. 1953 * real allocation.
1965 * Insert a new entry. 1954 * Insert a new entry.
1966 */ 1955 */
1967 xfs_iext_insert(ip, idx, 1, new, state); 1956 xfs_iext_insert(ip, *idx, 1, new, state);
1968 ifp->if_lastex = idx;
1969 XFS_IFORK_NEXT_SET(ip, whichfork, 1957 XFS_IFORK_NEXT_SET(ip, whichfork,
1970 XFS_IFORK_NEXTENTS(ip, whichfork) + 1); 1958 XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
1971 if (cur == NULL) { 1959 if (cur == NULL) {
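The hunks above change the xfs_bmap_add_extent_* helpers to take the extent index by pointer (xfs_extnum_t *idx) so the helper moves the caller's extent cursor itself, replacing the old pattern of writing the position back through ifp->if_lastex for the caller to reload. A minimal standalone sketch of that calling convention, with illustrative names rather than the XFS types:

	#include <stdio.h>

	/*
	 * Illustrative only: a helper that may merge a new record with its
	 * left neighbour and therefore has to move the caller's index
	 * itself, instead of leaving the position in a side field.
	 */
	static void add_record(int *idx, int merges_left)
	{
		if (merges_left)
			--*idx;	/* cursor now points at the merged record */
		/* ... update the record at *idx ... */
	}

	int main(void)
	{
		int idx = 5;

		add_record(&idx, 1);
		printf("cursor now at %d\n", idx);	/* prints 4 */
		return 0;
	}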
@@ -2345,6 +2333,13 @@ xfs_bmap_rtalloc(
2345 */ 2333 */
2346 if (ralen * mp->m_sb.sb_rextsize >= MAXEXTLEN) 2334 if (ralen * mp->m_sb.sb_rextsize >= MAXEXTLEN)
2347 ralen = MAXEXTLEN / mp->m_sb.sb_rextsize; 2335 ralen = MAXEXTLEN / mp->m_sb.sb_rextsize;
2336
2337 /*
2338 * Lock out other modifications to the RT bitmap inode.
2339 */
2340 xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
2341 xfs_trans_ijoin_ref(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL);
2342
2348 /* 2343 /*
2349 * If it's an allocation to an empty file at offset 0, 2344 * If it's an allocation to an empty file at offset 0,
2350 * pick an extent that will space things out in the rt area. 2345 * pick an extent that will space things out in the rt area.
@@ -2427,7 +2422,7 @@ xfs_bmap_btalloc_nullfb(
2427 startag = ag = 0; 2422 startag = ag = 0;
2428 2423
2429 pag = xfs_perag_get(mp, ag); 2424 pag = xfs_perag_get(mp, ag);
2430 while (*blen < ap->alen) { 2425 while (*blen < args->maxlen) {
2431 if (!pag->pagf_init) { 2426 if (!pag->pagf_init) {
2432 error = xfs_alloc_pagf_init(mp, args->tp, ag, 2427 error = xfs_alloc_pagf_init(mp, args->tp, ag,
2433 XFS_ALLOC_FLAG_TRYLOCK); 2428 XFS_ALLOC_FLAG_TRYLOCK);
@@ -2449,7 +2444,7 @@ xfs_bmap_btalloc_nullfb(
2449 notinit = 1; 2444 notinit = 1;
2450 2445
2451 if (xfs_inode_is_filestream(ap->ip)) { 2446 if (xfs_inode_is_filestream(ap->ip)) {
2452 if (*blen >= ap->alen) 2447 if (*blen >= args->maxlen)
2453 break; 2448 break;
2454 2449
2455 if (ap->userdata) { 2450 if (ap->userdata) {
@@ -2495,14 +2490,14 @@ xfs_bmap_btalloc_nullfb(
2495 * If the best seen length is less than the request 2490 * If the best seen length is less than the request
2496 * length, use the best as the minimum. 2491 * length, use the best as the minimum.
2497 */ 2492 */
2498 else if (*blen < ap->alen) 2493 else if (*blen < args->maxlen)
2499 args->minlen = *blen; 2494 args->minlen = *blen;
2500 /* 2495 /*
2501 * Otherwise we've seen an extent as big as alen, 2496 * Otherwise we've seen an extent as big as maxlen,
2502 * use that as the minimum. 2497 * use that as the minimum.
2503 */ 2498 */
2504 else 2499 else
2505 args->minlen = ap->alen; 2500 args->minlen = args->maxlen;
2506 2501
2507 /* 2502 /*
2508 * set the failure fallback case to look in the selected 2503 * set the failure fallback case to look in the selected
@@ -2570,7 +2565,9 @@ xfs_bmap_btalloc(
2570 args.tp = ap->tp; 2565 args.tp = ap->tp;
2571 args.mp = mp; 2566 args.mp = mp;
2572 args.fsbno = ap->rval; 2567 args.fsbno = ap->rval;
2573 args.maxlen = MIN(ap->alen, mp->m_sb.sb_agblocks); 2568
2569 /* Trim the allocation back to the maximum an AG can fit. */
2570 args.maxlen = MIN(ap->alen, XFS_ALLOC_AG_MAX_USABLE(mp));
2574 args.firstblock = ap->firstblock; 2571 args.firstblock = ap->firstblock;
2575 blen = 0; 2572 blen = 0;
2576 if (nullfb) { 2573 if (nullfb) {
@@ -2618,7 +2615,7 @@ xfs_bmap_btalloc(
2618 /* 2615 /*
2619 * Adjust for alignment 2616 * Adjust for alignment
2620 */ 2617 */
2621 if (blen > args.alignment && blen <= ap->alen) 2618 if (blen > args.alignment && blen <= args.maxlen)
2622 args.minlen = blen - args.alignment; 2619 args.minlen = blen - args.alignment;
2623 args.minalignslop = 0; 2620 args.minalignslop = 0;
2624 } else { 2621 } else {
@@ -2637,7 +2634,7 @@ xfs_bmap_btalloc(
2637 * of minlen+alignment+slop doesn't go up 2634 * of minlen+alignment+slop doesn't go up
2638 * between the calls. 2635 * between the calls.
2639 */ 2636 */
2640 if (blen > mp->m_dalign && blen <= ap->alen) 2637 if (blen > mp->m_dalign && blen <= args.maxlen)
2641 nextminlen = blen - mp->m_dalign; 2638 nextminlen = blen - mp->m_dalign;
2642 else 2639 else
2643 nextminlen = args.minlen; 2640 nextminlen = args.minlen;
@@ -2804,13 +2801,12 @@ STATIC int /* error */
2804xfs_bmap_del_extent( 2801xfs_bmap_del_extent(
2805 xfs_inode_t *ip, /* incore inode pointer */ 2802 xfs_inode_t *ip, /* incore inode pointer */
2806 xfs_trans_t *tp, /* current transaction pointer */ 2803 xfs_trans_t *tp, /* current transaction pointer */
2807 xfs_extnum_t idx, /* extent number to update/delete */ 2804 xfs_extnum_t *idx, /* extent number to update/delete */
2808 xfs_bmap_free_t *flist, /* list of extents to be freed */ 2805 xfs_bmap_free_t *flist, /* list of extents to be freed */
2809 xfs_btree_cur_t *cur, /* if null, not a btree */ 2806 xfs_btree_cur_t *cur, /* if null, not a btree */
2810 xfs_bmbt_irec_t *del, /* data to remove from extents */ 2807 xfs_bmbt_irec_t *del, /* data to remove from extents */
2811 int *logflagsp, /* inode logging flags */ 2808 int *logflagsp, /* inode logging flags */
2812 int whichfork, /* data or attr fork */ 2809 int whichfork) /* data or attr fork */
2813 int rsvd) /* OK to allocate reserved blocks */
2814{ 2810{
2815 xfs_filblks_t da_new; /* new delay-alloc indirect blocks */ 2811 xfs_filblks_t da_new; /* new delay-alloc indirect blocks */
2816 xfs_filblks_t da_old; /* old delay-alloc indirect blocks */ 2812 xfs_filblks_t da_old; /* old delay-alloc indirect blocks */
@@ -2841,10 +2837,10 @@ xfs_bmap_del_extent(
2841 2837
2842 mp = ip->i_mount; 2838 mp = ip->i_mount;
2843 ifp = XFS_IFORK_PTR(ip, whichfork); 2839 ifp = XFS_IFORK_PTR(ip, whichfork);
2844 ASSERT((idx >= 0) && (idx < ifp->if_bytes / 2840 ASSERT((*idx >= 0) && (*idx < ifp->if_bytes /
2845 (uint)sizeof(xfs_bmbt_rec_t))); 2841 (uint)sizeof(xfs_bmbt_rec_t)));
2846 ASSERT(del->br_blockcount > 0); 2842 ASSERT(del->br_blockcount > 0);
2847 ep = xfs_iext_get_ext(ifp, idx); 2843 ep = xfs_iext_get_ext(ifp, *idx);
2848 xfs_bmbt_get_all(ep, &got); 2844 xfs_bmbt_get_all(ep, &got);
2849 ASSERT(got.br_startoff <= del->br_startoff); 2845 ASSERT(got.br_startoff <= del->br_startoff);
2850 del_endoff = del->br_startoff + del->br_blockcount; 2846 del_endoff = del->br_startoff + del->br_blockcount;
@@ -2918,11 +2914,12 @@ xfs_bmap_del_extent(
2918 /* 2914 /*
2919 * Matches the whole extent. Delete the entry. 2915 * Matches the whole extent. Delete the entry.
2920 */ 2916 */
2921 xfs_iext_remove(ip, idx, 1, 2917 xfs_iext_remove(ip, *idx, 1,
2922 whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0); 2918 whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0);
2923 ifp->if_lastex = idx; 2919 --*idx;
2924 if (delay) 2920 if (delay)
2925 break; 2921 break;
2922
2926 XFS_IFORK_NEXT_SET(ip, whichfork, 2923 XFS_IFORK_NEXT_SET(ip, whichfork,
2927 XFS_IFORK_NEXTENTS(ip, whichfork) - 1); 2924 XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
2928 flags |= XFS_ILOG_CORE; 2925 flags |= XFS_ILOG_CORE;
@@ -2939,21 +2936,20 @@ xfs_bmap_del_extent(
2939 /* 2936 /*
2940 * Deleting the first part of the extent. 2937 * Deleting the first part of the extent.
2941 */ 2938 */
2942 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 2939 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
2943 xfs_bmbt_set_startoff(ep, del_endoff); 2940 xfs_bmbt_set_startoff(ep, del_endoff);
2944 temp = got.br_blockcount - del->br_blockcount; 2941 temp = got.br_blockcount - del->br_blockcount;
2945 xfs_bmbt_set_blockcount(ep, temp); 2942 xfs_bmbt_set_blockcount(ep, temp);
2946 ifp->if_lastex = idx;
2947 if (delay) { 2943 if (delay) {
2948 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), 2944 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
2949 da_old); 2945 da_old);
2950 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); 2946 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
2951 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 2947 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
2952 da_new = temp; 2948 da_new = temp;
2953 break; 2949 break;
2954 } 2950 }
2955 xfs_bmbt_set_startblock(ep, del_endblock); 2951 xfs_bmbt_set_startblock(ep, del_endblock);
2956 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 2952 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
2957 if (!cur) { 2953 if (!cur) {
2958 flags |= xfs_ilog_fext(whichfork); 2954 flags |= xfs_ilog_fext(whichfork);
2959 break; 2955 break;
@@ -2969,18 +2965,17 @@ xfs_bmap_del_extent(
2969 * Deleting the last part of the extent. 2965 * Deleting the last part of the extent.
2970 */ 2966 */
2971 temp = got.br_blockcount - del->br_blockcount; 2967 temp = got.br_blockcount - del->br_blockcount;
2972 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 2968 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
2973 xfs_bmbt_set_blockcount(ep, temp); 2969 xfs_bmbt_set_blockcount(ep, temp);
2974 ifp->if_lastex = idx;
2975 if (delay) { 2970 if (delay) {
2976 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), 2971 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
2977 da_old); 2972 da_old);
2978 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); 2973 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
2979 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 2974 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
2980 da_new = temp; 2975 da_new = temp;
2981 break; 2976 break;
2982 } 2977 }
2983 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 2978 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
2984 if (!cur) { 2979 if (!cur) {
2985 flags |= xfs_ilog_fext(whichfork); 2980 flags |= xfs_ilog_fext(whichfork);
2986 break; 2981 break;
@@ -2997,7 +2992,7 @@ xfs_bmap_del_extent(
2997 * Deleting the middle of the extent. 2992 * Deleting the middle of the extent.
2998 */ 2993 */
2999 temp = del->br_startoff - got.br_startoff; 2994 temp = del->br_startoff - got.br_startoff;
3000 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 2995 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
3001 xfs_bmbt_set_blockcount(ep, temp); 2996 xfs_bmbt_set_blockcount(ep, temp);
3002 new.br_startoff = del_endoff; 2997 new.br_startoff = del_endoff;
3003 temp2 = got_endoff - del_endoff; 2998 temp2 = got_endoff - del_endoff;
@@ -3084,9 +3079,9 @@ xfs_bmap_del_extent(
3084 } 3079 }
3085 } 3080 }
3086 } 3081 }
3087 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 3082 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
3088 xfs_iext_insert(ip, idx + 1, 1, &new, state); 3083 xfs_iext_insert(ip, *idx + 1, 1, &new, state);
3089 ifp->if_lastex = idx + 1; 3084 ++*idx;
3090 break; 3085 break;
3091 } 3086 }
3092 /* 3087 /*
@@ -3111,9 +3106,10 @@ xfs_bmap_del_extent(
3111 * Nothing to do for disk quota accounting here. 3106 * Nothing to do for disk quota accounting here.
3112 */ 3107 */
3113 ASSERT(da_old >= da_new); 3108 ASSERT(da_old >= da_new);
3114 if (da_old > da_new) 3109 if (da_old > da_new) {
3115 xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, (int64_t)(da_old - da_new), 3110 xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
3116 rsvd); 3111 (int64_t)(da_old - da_new), 0);
3112 }
3117done: 3113done:
3118 *logflagsp = flags; 3114 *logflagsp = flags;
3119 return error; 3115 return error;
@@ -3496,7 +3492,7 @@ xfs_bmap_search_extents(
3496 3492
3497 if (unlikely(!(gotp->br_startblock) && (*lastxp != NULLEXTNUM) && 3493 if (unlikely(!(gotp->br_startblock) && (*lastxp != NULLEXTNUM) &&
3498 !(XFS_IS_REALTIME_INODE(ip) && fork == XFS_DATA_FORK))) { 3494 !(XFS_IS_REALTIME_INODE(ip) && fork == XFS_DATA_FORK))) {
3499 xfs_cmn_err(XFS_PTAG_FSBLOCK_ZERO, CE_ALERT, ip->i_mount, 3495 xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
3500 "Access to block zero in inode %llu " 3496 "Access to block zero in inode %llu "
3501 "start_block: %llx start_off: %llx " 3497 "start_block: %llx start_off: %llx "
3502 "blkcnt: %llx extent-state: %x lastx: %x\n", 3498 "blkcnt: %llx extent-state: %x lastx: %x\n",
@@ -4170,12 +4166,11 @@ xfs_bmap_read_extents(
4170 num_recs = xfs_btree_get_numrecs(block); 4166 num_recs = xfs_btree_get_numrecs(block);
4171 if (unlikely(i + num_recs > room)) { 4167 if (unlikely(i + num_recs > room)) {
4172 ASSERT(i + num_recs <= room); 4168 ASSERT(i + num_recs <= room);
4173 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 4169 xfs_warn(ip->i_mount,
4174 "corrupt dinode %Lu, (btree extents).", 4170 "corrupt dinode %Lu, (btree extents).",
4175 (unsigned long long) ip->i_ino); 4171 (unsigned long long) ip->i_ino);
4176 XFS_ERROR_REPORT("xfs_bmap_read_extents(1)", 4172 XFS_CORRUPTION_ERROR("xfs_bmap_read_extents(1)",
4177 XFS_ERRLEVEL_LOW, 4173 XFS_ERRLEVEL_LOW, ip->i_mount, block);
4178 ip->i_mount);
4179 goto error0; 4174 goto error0;
4180 } 4175 }
4181 XFS_WANT_CORRUPTED_GOTO( 4176 XFS_WANT_CORRUPTED_GOTO(
@@ -4481,6 +4476,16 @@ xfs_bmapi(
4481 /* Figure out the extent size, adjust alen */ 4476 /* Figure out the extent size, adjust alen */
4482 extsz = xfs_get_extsz_hint(ip); 4477 extsz = xfs_get_extsz_hint(ip);
4483 if (extsz) { 4478 if (extsz) {
4479 /*
4480 * make sure we don't exceed a single
4481 * extent length when we align the
4482 					 * extent by reducing the length we are
4483 					 * going to allocate by the maximum
4484 					 * amount extent size alignment may
4485 * require.
4486 */
4487 alen = XFS_FILBLKS_MIN(len,
4488 MAXEXTLEN - (2 * extsz - 1));
4484 error = xfs_bmap_extsize_align(mp, 4489 error = xfs_bmap_extsize_align(mp,
4485 &got, &prev, extsz, 4490 &got, &prev, extsz,
4486 rt, eof, 4491 rt, eof,
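The new comment and the XFS_FILBLKS_MIN() call above cap the delayed-allocation request so that, after extent size alignment, a single extent can never exceed MAXEXTLEN: alignment can grow the range by at most extsz - 1 blocks at each end, hence the 2 * extsz - 1 margin. A small standalone sketch of the arithmetic (MAXEXTLEN is the 21-bit extent length limit from the XFS headers; treat the worked numbers as illustrative):

	#include <stdio.h>

	#define MAXEXTLEN	0x001fffffULL	/* 21 bits, 2^21 - 1 */

	/*
	 * Clamp a requested length so that extent-size alignment cannot
	 * push the final extent past MAXEXTLEN blocks.
	 */
	static unsigned long long clamp_alen(unsigned long long len,
					     unsigned long long extsz)
	{
		unsigned long long limit = MAXEXTLEN - (2 * extsz - 1);

		return len < limit ? len : limit;
	}

	int main(void)
	{
		/* a maximal request with a 4096-block extent size hint */
		printf("%llu\n", clamp_alen(2097151ULL, 4096ULL)); /* 2088960 */
		return 0;
	}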
@@ -4523,29 +4528,24 @@ xfs_bmapi(
4523 if (rt) { 4528 if (rt) {
4524 error = xfs_mod_incore_sb(mp, 4529 error = xfs_mod_incore_sb(mp,
4525 XFS_SBS_FREXTENTS, 4530 XFS_SBS_FREXTENTS,
4526 -((int64_t)extsz), (flags & 4531 -((int64_t)extsz), 0);
4527 XFS_BMAPI_RSVBLOCKS));
4528 } else { 4532 } else {
4529 error = xfs_mod_incore_sb(mp, 4533 error = xfs_icsb_modify_counters(mp,
4530 XFS_SBS_FDBLOCKS, 4534 XFS_SBS_FDBLOCKS,
4531 -((int64_t)alen), (flags & 4535 -((int64_t)alen), 0);
4532 XFS_BMAPI_RSVBLOCKS));
4533 } 4536 }
4534 if (!error) { 4537 if (!error) {
4535 error = xfs_mod_incore_sb(mp, 4538 error = xfs_icsb_modify_counters(mp,
4536 XFS_SBS_FDBLOCKS, 4539 XFS_SBS_FDBLOCKS,
4537 -((int64_t)indlen), (flags & 4540 -((int64_t)indlen), 0);
4538 XFS_BMAPI_RSVBLOCKS));
4539 if (error && rt) 4541 if (error && rt)
4540 xfs_mod_incore_sb(mp, 4542 xfs_mod_incore_sb(mp,
4541 XFS_SBS_FREXTENTS, 4543 XFS_SBS_FREXTENTS,
4542 (int64_t)extsz, (flags & 4544 (int64_t)extsz, 0);
4543 XFS_BMAPI_RSVBLOCKS));
4544 else if (error) 4545 else if (error)
4545 xfs_mod_incore_sb(mp, 4546 xfs_icsb_modify_counters(mp,
4546 XFS_SBS_FDBLOCKS, 4547 XFS_SBS_FDBLOCKS,
4547 (int64_t)alen, (flags & 4548 (int64_t)alen, 0);
4548 XFS_BMAPI_RSVBLOCKS));
4549 } 4549 }
4550 4550
4551 if (error) { 4551 if (error) {
@@ -4662,13 +4662,12 @@ xfs_bmapi(
4662 if (!wasdelay && (flags & XFS_BMAPI_PREALLOC)) 4662 if (!wasdelay && (flags & XFS_BMAPI_PREALLOC))
4663 got.br_state = XFS_EXT_UNWRITTEN; 4663 got.br_state = XFS_EXT_UNWRITTEN;
4664 } 4664 }
4665 error = xfs_bmap_add_extent(ip, lastx, &cur, &got, 4665 error = xfs_bmap_add_extent(ip, &lastx, &cur, &got,
4666 firstblock, flist, &tmp_logflags, 4666 firstblock, flist, &tmp_logflags,
4667 whichfork, (flags & XFS_BMAPI_RSVBLOCKS)); 4667 whichfork);
4668 logflags |= tmp_logflags; 4668 logflags |= tmp_logflags;
4669 if (error) 4669 if (error)
4670 goto error0; 4670 goto error0;
4671 lastx = ifp->if_lastex;
4672 ep = xfs_iext_get_ext(ifp, lastx); 4671 ep = xfs_iext_get_ext(ifp, lastx);
4673 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 4672 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
4674 xfs_bmbt_get_all(ep, &got); 4673 xfs_bmbt_get_all(ep, &got);
@@ -4744,8 +4743,12 @@ xfs_bmapi(
4744 * Check if writing previously allocated but 4743 * Check if writing previously allocated but
4745 * unwritten extents. 4744 * unwritten extents.
4746 */ 4745 */
4747 if (wr && mval->br_state == XFS_EXT_UNWRITTEN && 4746 if (wr &&
4748 ((flags & (XFS_BMAPI_PREALLOC|XFS_BMAPI_DELAY)) == 0)) { 4747 ((mval->br_state == XFS_EXT_UNWRITTEN &&
4748 ((flags & (XFS_BMAPI_PREALLOC|XFS_BMAPI_DELAY)) == 0)) ||
4749 (mval->br_state == XFS_EXT_NORM &&
4750 ((flags & (XFS_BMAPI_PREALLOC|XFS_BMAPI_CONVERT)) ==
4751 (XFS_BMAPI_PREALLOC|XFS_BMAPI_CONVERT))))) {
4749 /* 4752 /*
4750 * Modify (by adding) the state flag, if writing. 4753 * Modify (by adding) the state flag, if writing.
4751 */ 4754 */
@@ -4757,14 +4760,15 @@ xfs_bmapi(
4757 *firstblock; 4760 *firstblock;
4758 cur->bc_private.b.flist = flist; 4761 cur->bc_private.b.flist = flist;
4759 } 4762 }
4760 mval->br_state = XFS_EXT_NORM; 4763 mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN)
4761 error = xfs_bmap_add_extent(ip, lastx, &cur, mval, 4764 ? XFS_EXT_NORM
4765 : XFS_EXT_UNWRITTEN;
4766 error = xfs_bmap_add_extent(ip, &lastx, &cur, mval,
4762 firstblock, flist, &tmp_logflags, 4767 firstblock, flist, &tmp_logflags,
4763 whichfork, (flags & XFS_BMAPI_RSVBLOCKS)); 4768 whichfork);
4764 logflags |= tmp_logflags; 4769 logflags |= tmp_logflags;
4765 if (error) 4770 if (error)
4766 goto error0; 4771 goto error0;
4767 lastx = ifp->if_lastex;
4768 ep = xfs_iext_get_ext(ifp, lastx); 4772 ep = xfs_iext_get_ext(ifp, lastx);
4769 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 4773 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
4770 xfs_bmbt_get_all(ep, &got); 4774 xfs_bmbt_get_all(ep, &got);
@@ -4823,14 +4827,14 @@ xfs_bmapi(
4823 /* 4827 /*
4824 * Else go on to the next record. 4828 * Else go on to the next record.
4825 */ 4829 */
4826 ep = xfs_iext_get_ext(ifp, ++lastx);
4827 prev = got; 4830 prev = got;
4828 if (lastx >= nextents) 4831 if (++lastx < nextents) {
4829 eof = 1; 4832 ep = xfs_iext_get_ext(ifp, lastx);
4830 else
4831 xfs_bmbt_get_all(ep, &got); 4833 xfs_bmbt_get_all(ep, &got);
4834 } else {
4835 eof = 1;
4836 }
4832 } 4837 }
4833 ifp->if_lastex = lastx;
4834 *nmap = n; 4838 *nmap = n;
4835 /* 4839 /*
4836 * Transform from btree to extents, give it cur. 4840 * Transform from btree to extents, give it cur.
@@ -4939,7 +4943,6 @@ xfs_bmapi_single(
4939 ASSERT(!isnullstartblock(got.br_startblock)); 4943 ASSERT(!isnullstartblock(got.br_startblock));
4940 ASSERT(bno < got.br_startoff + got.br_blockcount); 4944 ASSERT(bno < got.br_startoff + got.br_blockcount);
4941 *fsb = got.br_startblock + (bno - got.br_startoff); 4945 *fsb = got.br_startblock + (bno - got.br_startoff);
4942 ifp->if_lastex = lastx;
4943 return 0; 4946 return 0;
4944} 4947}
4945 4948
@@ -4981,7 +4984,6 @@ xfs_bunmapi(
4981 int tmp_logflags; /* partial logging flags */ 4984 int tmp_logflags; /* partial logging flags */
4982 int wasdel; /* was a delayed alloc extent */ 4985 int wasdel; /* was a delayed alloc extent */
4983 int whichfork; /* data or attribute fork */ 4986 int whichfork; /* data or attribute fork */
4984 int rsvd; /* OK to allocate reserved blocks */
4985 xfs_fsblock_t sum; 4987 xfs_fsblock_t sum;
4986 4988
4987 trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_); 4989 trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_);
@@ -4999,7 +5001,7 @@ xfs_bunmapi(
4999 mp = ip->i_mount; 5001 mp = ip->i_mount;
5000 if (XFS_FORCED_SHUTDOWN(mp)) 5002 if (XFS_FORCED_SHUTDOWN(mp))
5001 return XFS_ERROR(EIO); 5003 return XFS_ERROR(EIO);
5002 rsvd = (flags & XFS_BMAPI_RSVBLOCKS) != 0; 5004
5003 ASSERT(len > 0); 5005 ASSERT(len > 0);
5004 ASSERT(nexts >= 0); 5006 ASSERT(nexts >= 0);
5005 ASSERT(ifp->if_ext_max == 5007 ASSERT(ifp->if_ext_max ==
@@ -5115,9 +5117,9 @@ xfs_bunmapi(
5115 del.br_blockcount = mod; 5117 del.br_blockcount = mod;
5116 } 5118 }
5117 del.br_state = XFS_EXT_UNWRITTEN; 5119 del.br_state = XFS_EXT_UNWRITTEN;
5118 error = xfs_bmap_add_extent(ip, lastx, &cur, &del, 5120 error = xfs_bmap_add_extent(ip, &lastx, &cur, &del,
5119 firstblock, flist, &logflags, 5121 firstblock, flist, &logflags,
5120 XFS_DATA_FORK, 0); 5122 XFS_DATA_FORK);
5121 if (error) 5123 if (error)
5122 goto error0; 5124 goto error0;
5123 goto nodelete; 5125 goto nodelete;
@@ -5143,9 +5145,12 @@ xfs_bunmapi(
5143 */ 5145 */
5144 ASSERT(bno >= del.br_blockcount); 5146 ASSERT(bno >= del.br_blockcount);
5145 bno -= del.br_blockcount; 5147 bno -= del.br_blockcount;
5146 if (bno < got.br_startoff) { 5148 if (got.br_startoff > bno) {
5147 if (--lastx >= 0) 5149 if (--lastx >= 0) {
5148 xfs_bmbt_get_all(--ep, &got); 5150 ep = xfs_iext_get_ext(ifp,
5151 lastx);
5152 xfs_bmbt_get_all(ep, &got);
5153 }
5149 } 5154 }
5150 continue; 5155 continue;
5151 } else if (del.br_state == XFS_EXT_UNWRITTEN) { 5156 } else if (del.br_state == XFS_EXT_UNWRITTEN) {
@@ -5169,18 +5174,19 @@ xfs_bunmapi(
5169 prev.br_startoff = start; 5174 prev.br_startoff = start;
5170 } 5175 }
5171 prev.br_state = XFS_EXT_UNWRITTEN; 5176 prev.br_state = XFS_EXT_UNWRITTEN;
5172 error = xfs_bmap_add_extent(ip, lastx - 1, &cur, 5177 lastx--;
5178 error = xfs_bmap_add_extent(ip, &lastx, &cur,
5173 &prev, firstblock, flist, &logflags, 5179 &prev, firstblock, flist, &logflags,
5174 XFS_DATA_FORK, 0); 5180 XFS_DATA_FORK);
5175 if (error) 5181 if (error)
5176 goto error0; 5182 goto error0;
5177 goto nodelete; 5183 goto nodelete;
5178 } else { 5184 } else {
5179 ASSERT(del.br_state == XFS_EXT_NORM); 5185 ASSERT(del.br_state == XFS_EXT_NORM);
5180 del.br_state = XFS_EXT_UNWRITTEN; 5186 del.br_state = XFS_EXT_UNWRITTEN;
5181 error = xfs_bmap_add_extent(ip, lastx, &cur, 5187 error = xfs_bmap_add_extent(ip, &lastx, &cur,
5182 &del, firstblock, flist, &logflags, 5188 &del, firstblock, flist, &logflags,
5183 XFS_DATA_FORK, 0); 5189 XFS_DATA_FORK);
5184 if (error) 5190 if (error)
5185 goto error0; 5191 goto error0;
5186 goto nodelete; 5192 goto nodelete;
@@ -5195,13 +5201,13 @@ xfs_bunmapi(
5195 rtexts = XFS_FSB_TO_B(mp, del.br_blockcount); 5201 rtexts = XFS_FSB_TO_B(mp, del.br_blockcount);
5196 do_div(rtexts, mp->m_sb.sb_rextsize); 5202 do_div(rtexts, mp->m_sb.sb_rextsize);
5197 xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, 5203 xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS,
5198 (int64_t)rtexts, rsvd); 5204 (int64_t)rtexts, 0);
5199 (void)xfs_trans_reserve_quota_nblks(NULL, 5205 (void)xfs_trans_reserve_quota_nblks(NULL,
5200 ip, -((long)del.br_blockcount), 0, 5206 ip, -((long)del.br_blockcount), 0,
5201 XFS_QMOPT_RES_RTBLKS); 5207 XFS_QMOPT_RES_RTBLKS);
5202 } else { 5208 } else {
5203 xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, 5209 xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
5204 (int64_t)del.br_blockcount, rsvd); 5210 (int64_t)del.br_blockcount, 0);
5205 (void)xfs_trans_reserve_quota_nblks(NULL, 5211 (void)xfs_trans_reserve_quota_nblks(NULL,
5206 ip, -((long)del.br_blockcount), 0, 5212 ip, -((long)del.br_blockcount), 0,
5207 XFS_QMOPT_RES_REGBLKS); 5213 XFS_QMOPT_RES_REGBLKS);
@@ -5232,31 +5238,29 @@ xfs_bunmapi(
5232 error = XFS_ERROR(ENOSPC); 5238 error = XFS_ERROR(ENOSPC);
5233 goto error0; 5239 goto error0;
5234 } 5240 }
5235 error = xfs_bmap_del_extent(ip, tp, lastx, flist, cur, &del, 5241 error = xfs_bmap_del_extent(ip, tp, &lastx, flist, cur, &del,
5236 &tmp_logflags, whichfork, rsvd); 5242 &tmp_logflags, whichfork);
5237 logflags |= tmp_logflags; 5243 logflags |= tmp_logflags;
5238 if (error) 5244 if (error)
5239 goto error0; 5245 goto error0;
5240 bno = del.br_startoff - 1; 5246 bno = del.br_startoff - 1;
5241nodelete: 5247nodelete:
5242 lastx = ifp->if_lastex;
5243 /* 5248 /*
5244 * If not done go on to the next (previous) record. 5249 * If not done go on to the next (previous) record.
5245 * Reset ep in case the extents array was re-alloced.
5246 */ 5250 */
5247 ep = xfs_iext_get_ext(ifp, lastx);
5248 if (bno != (xfs_fileoff_t)-1 && bno >= start) { 5251 if (bno != (xfs_fileoff_t)-1 && bno >= start) {
5249 if (lastx >= XFS_IFORK_NEXTENTS(ip, whichfork) || 5252 if (lastx >= 0) {
5250 xfs_bmbt_get_startoff(ep) > bno) { 5253 ep = xfs_iext_get_ext(ifp, lastx);
5251 if (--lastx >= 0) 5254 if (xfs_bmbt_get_startoff(ep) > bno) {
5252 ep = xfs_iext_get_ext(ifp, lastx); 5255 if (--lastx >= 0)
5253 } 5256 ep = xfs_iext_get_ext(ifp,
5254 if (lastx >= 0) 5257 lastx);
5258 }
5255 xfs_bmbt_get_all(ep, &got); 5259 xfs_bmbt_get_all(ep, &got);
5260 }
5256 extno++; 5261 extno++;
5257 } 5262 }
5258 } 5263 }
5259 ifp->if_lastex = lastx;
5260 *done = bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0; 5264 *done = bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0;
5261 ASSERT(ifp->if_ext_max == 5265 ASSERT(ifp->if_ext_max ==
5262 XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); 5266 XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
@@ -5461,8 +5465,13 @@ xfs_getbmap(
5461 if (error) 5465 if (error)
5462 goto out_unlock_iolock; 5466 goto out_unlock_iolock;
5463 } 5467 }
5464 5468 /*
5465 ASSERT(ip->i_delayed_blks == 0); 5469 * even after flushing the inode, there can still be delalloc
5470 * blocks on the inode beyond EOF due to speculative
5471 * preallocation. These are not removed until the release
5472 * function is called or the inode is inactivated. Hence we
5473 * cannot assert here that ip->i_delayed_blks == 0.
5474 */
5466 } 5475 }
5467 5476
5468 lock = xfs_ilock_map_shared(ip); 5477 lock = xfs_ilock_map_shared(ip);
@@ -5728,7 +5737,7 @@ xfs_check_block(
5728 else 5737 else
5729 thispa = XFS_BMBT_PTR_ADDR(mp, block, j, dmxr); 5738 thispa = XFS_BMBT_PTR_ADDR(mp, block, j, dmxr);
5730 if (*thispa == *pp) { 5739 if (*thispa == *pp) {
5731 cmn_err(CE_WARN, "%s: thispa(%d) == pp(%d) %Ld", 5740 xfs_warn(mp, "%s: thispa(%d) == pp(%d) %Ld",
5732 __func__, j, i, 5741 __func__, j, i,
5733 (unsigned long long)be64_to_cpu(*thispa)); 5742 (unsigned long long)be64_to_cpu(*thispa));
5734 panic("%s: ptrs are equal in node\n", 5743 panic("%s: ptrs are equal in node\n",
@@ -5893,11 +5902,11 @@ xfs_bmap_check_leaf_extents(
5893 return; 5902 return;
5894 5903
5895error0: 5904error0:
5896 cmn_err(CE_WARN, "%s: at error0", __func__); 5905 xfs_warn(mp, "%s: at error0", __func__);
5897 if (bp_release) 5906 if (bp_release)
5898 xfs_trans_brelse(NULL, bp); 5907 xfs_trans_brelse(NULL, bp);
5899error_norelse: 5908error_norelse:
5900 cmn_err(CE_WARN, "%s: BAD after btree leaves for %d extents", 5909 xfs_warn(mp, "%s: BAD after btree leaves for %d extents",
5901 __func__, i); 5910 __func__, i);
5902 panic("%s: CORRUPTED BTREE OR SOMETHING", __func__); 5911 panic("%s: CORRUPTED BTREE OR SOMETHING", __func__);
5903 return; 5912 return;
@@ -6060,3 +6069,79 @@ xfs_bmap_disk_count_leaves(
6060 *count += xfs_bmbt_disk_get_blockcount(frp); 6069 *count += xfs_bmbt_disk_get_blockcount(frp);
6061 } 6070 }
6062} 6071}
6072
6073/*
 6074 * dead simple method of punching delayed allocation blocks from a range in
 6075 * the inode. Walks a block at a time so it will be slow, but is only executed in
 6076 * rare error cases so the overhead is not critical. This will always punch out
6077 * both the start and end blocks, even if the ranges only partially overlap
6078 * them, so it is up to the caller to ensure that partial blocks are not
6079 * passed in.
6080 */
6081int
6082xfs_bmap_punch_delalloc_range(
6083 struct xfs_inode *ip,
6084 xfs_fileoff_t start_fsb,
6085 xfs_fileoff_t length)
6086{
6087 xfs_fileoff_t remaining = length;
6088 int error = 0;
6089
6090 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
6091
6092 do {
6093 int done;
6094 xfs_bmbt_irec_t imap;
6095 int nimaps = 1;
6096 xfs_fsblock_t firstblock;
6097 xfs_bmap_free_t flist;
6098
6099 /*
6100 * Map the range first and check that it is a delalloc extent
6101 * before trying to unmap the range. Otherwise we will be
6102 * trying to remove a real extent (which requires a
6103 * transaction) or a hole, which is probably a bad idea...
6104 */
6105 error = xfs_bmapi(NULL, ip, start_fsb, 1,
6106 XFS_BMAPI_ENTIRE, NULL, 0, &imap,
6107 &nimaps, NULL);
6108
6109 if (error) {
6110 /* something screwed, just bail */
6111 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
6112 xfs_alert(ip->i_mount,
6113 "Failed delalloc mapping lookup ino %lld fsb %lld.",
6114 ip->i_ino, start_fsb);
6115 }
6116 break;
6117 }
6118 if (!nimaps) {
6119 /* nothing there */
6120 goto next_block;
6121 }
6122 if (imap.br_startblock != DELAYSTARTBLOCK) {
6123 /* been converted, ignore */
6124 goto next_block;
6125 }
6126 WARN_ON(imap.br_blockcount == 0);
6127
6128 /*
6129 * Note: while we initialise the firstblock/flist pair, they
6130 * should never be used because blocks should never be
 6132 	 * allocated or freed for a delalloc extent, and hence we don't
 6133 	 * need to cancel or finish them after the xfs_bunmapi() call.
6133 */
6134 xfs_bmap_init(&flist, &firstblock);
6135 error = xfs_bunmapi(NULL, ip, start_fsb, 1, 0, 1, &firstblock,
6136 &flist, &done);
6137 if (error)
6138 break;
6139
6140 ASSERT(!flist.xbf_count && !flist.xbf_first);
6141next_block:
6142 start_fsb++;
6143 remaining--;
6144 } while(remaining > 0);
6145
6146 return error;
6147}
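A hedged sketch of how a caller would use the new helper: the inode must already be held in XFS_ILOCK_EXCL (the helper asserts this), and the range is expressed in whole filesystem blocks. The surrounding error-path context, and the ip/start_fsb/end_fsb/error variables, are assumed for illustration and are not taken from this diff:

	/*
	 * Illustrative caller, not part of this patch: drop the delalloc
	 * blocks backing a range that failed to be written. start_fsb and
	 * end_fsb are assumed to be whole filesystem blocks.
	 */
	xfs_ilock(ip, XFS_ILOCK_EXCL);
	error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
					      end_fsb - start_fsb);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	if (error && !XFS_FORCED_SHUTDOWN(ip->i_mount))
		xfs_alert(ip->i_mount,
			  "failed to punch delalloc range, error %d", error);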
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index b13569a6179b..c62234bde053 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -69,14 +69,16 @@ typedef struct xfs_bmap_free
69#define XFS_BMAPI_ENTIRE 0x004 /* return entire extent, not trimmed */ 69#define XFS_BMAPI_ENTIRE 0x004 /* return entire extent, not trimmed */
70#define XFS_BMAPI_METADATA 0x008 /* mapping metadata not user data */ 70#define XFS_BMAPI_METADATA 0x008 /* mapping metadata not user data */
71#define XFS_BMAPI_ATTRFORK 0x010 /* use attribute fork not data */ 71#define XFS_BMAPI_ATTRFORK 0x010 /* use attribute fork not data */
72#define XFS_BMAPI_RSVBLOCKS 0x020 /* OK to alloc. reserved data blocks */
73#define XFS_BMAPI_PREALLOC 0x040 /* preallocation op: unwritten space */ 72#define XFS_BMAPI_PREALLOC 0x040 /* preallocation op: unwritten space */
74#define XFS_BMAPI_IGSTATE 0x080 /* Ignore state - */ 73#define XFS_BMAPI_IGSTATE 0x080 /* Ignore state - */
75 /* combine contig. space */ 74 /* combine contig. space */
76#define XFS_BMAPI_CONTIG 0x100 /* must allocate only one extent */ 75#define XFS_BMAPI_CONTIG 0x100 /* must allocate only one extent */
77#define XFS_BMAPI_CONVERT 0x200 /* unwritten extent conversion - */ 76/*
78 /* need write cache flushing and no */ 77 * unwritten extent conversion - this needs write cache flushing and no additional
79 /* additional allocation alignments */ 78 * allocation alignments. When specified with XFS_BMAPI_PREALLOC it converts
 79 * from written to unwritten, otherwise it converts from unwritten to written.
80 */
81#define XFS_BMAPI_CONVERT 0x200
80 82
81#define XFS_BMAPI_FLAGS \ 83#define XFS_BMAPI_FLAGS \
82 { XFS_BMAPI_WRITE, "WRITE" }, \ 84 { XFS_BMAPI_WRITE, "WRITE" }, \
@@ -84,7 +86,6 @@ typedef struct xfs_bmap_free
84 { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ 86 { XFS_BMAPI_ENTIRE, "ENTIRE" }, \
85 { XFS_BMAPI_METADATA, "METADATA" }, \ 87 { XFS_BMAPI_METADATA, "METADATA" }, \
86 { XFS_BMAPI_ATTRFORK, "ATTRFORK" }, \ 88 { XFS_BMAPI_ATTRFORK, "ATTRFORK" }, \
87 { XFS_BMAPI_RSVBLOCKS, "RSVBLOCKS" }, \
88 { XFS_BMAPI_PREALLOC, "PREALLOC" }, \ 89 { XFS_BMAPI_PREALLOC, "PREALLOC" }, \
89 { XFS_BMAPI_IGSTATE, "IGSTATE" }, \ 90 { XFS_BMAPI_IGSTATE, "IGSTATE" }, \
90 { XFS_BMAPI_CONTIG, "CONTIG" }, \ 91 { XFS_BMAPI_CONTIG, "CONTIG" }, \
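The reworded comment for XFS_BMAPI_CONVERT above gives the flag two meanings depending on whether XFS_BMAPI_PREALLOC is set alongside it, matching the widened condition added in xfs_bmapi(). A hedged sketch of the two combinations as a writing caller would pass them (the remaining xfs_bmapi() arguments are omitted):

	/* unwritten -> written: convert extents once their data is on disk */
	flags = XFS_BMAPI_WRITE | XFS_BMAPI_CONVERT;

	/* written -> unwritten: only when PREALLOC and CONVERT are both set */
	flags = XFS_BMAPI_WRITE | XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT;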
@@ -391,6 +392,11 @@ xfs_bmap_count_blocks(
391 int whichfork, 392 int whichfork,
392 int *count); 393 int *count);
393 394
395int
396xfs_bmap_punch_delalloc_range(
397 struct xfs_inode *ip,
398 xfs_fileoff_t start_fsb,
399 xfs_fileoff_t length);
394#endif /* __KERNEL__ */ 400#endif /* __KERNEL__ */
395 401
396#endif /* __XFS_BMAP_H__ */ 402#endif /* __XFS_BMAP_H__ */
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 829af92f0fba..2f9e97c128a0 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -217,7 +217,7 @@ xfs_btree_del_cursor(
217 */ 217 */
218 for (i = 0; i < cur->bc_nlevels; i++) { 218 for (i = 0; i < cur->bc_nlevels; i++) {
219 if (cur->bc_bufs[i]) 219 if (cur->bc_bufs[i])
220 xfs_btree_setbuf(cur, i, NULL); 220 xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]);
221 else if (!error) 221 else if (!error)
222 break; 222 break;
223 } 223 }
@@ -634,9 +634,8 @@ xfs_btree_read_bufl(
634 return error; 634 return error;
635 } 635 }
636 ASSERT(!bp || !XFS_BUF_GETERROR(bp)); 636 ASSERT(!bp || !XFS_BUF_GETERROR(bp));
637 if (bp != NULL) { 637 if (bp)
638 XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval); 638 XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval);
639 }
640 *bpp = bp; 639 *bpp = bp;
641 return 0; 640 return 0;
642} 641}
@@ -656,7 +655,7 @@ xfs_btree_reada_bufl(
656 655
657 ASSERT(fsbno != NULLFSBLOCK); 656 ASSERT(fsbno != NULLFSBLOCK);
658 d = XFS_FSB_TO_DADDR(mp, fsbno); 657 d = XFS_FSB_TO_DADDR(mp, fsbno);
659 xfs_baread(mp->m_ddev_targp, d, mp->m_bsize * count); 658 xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count);
660} 659}
661 660
662/* 661/*
@@ -676,7 +675,7 @@ xfs_btree_reada_bufs(
676 ASSERT(agno != NULLAGNUMBER); 675 ASSERT(agno != NULLAGNUMBER);
677 ASSERT(agbno != NULLAGBLOCK); 676 ASSERT(agbno != NULLAGBLOCK);
678 d = XFS_AGB_TO_DADDR(mp, agno, agbno); 677 d = XFS_AGB_TO_DADDR(mp, agno, agbno);
679 xfs_baread(mp->m_ddev_targp, d, mp->m_bsize * count); 678 xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count);
680} 679}
681 680
682STATIC int 681STATIC int
@@ -763,22 +762,19 @@ xfs_btree_readahead(
763 * Set the buffer for level "lev" in the cursor to bp, releasing 762 * Set the buffer for level "lev" in the cursor to bp, releasing
764 * any previous buffer. 763 * any previous buffer.
765 */ 764 */
766void 765STATIC void
767xfs_btree_setbuf( 766xfs_btree_setbuf(
768 xfs_btree_cur_t *cur, /* btree cursor */ 767 xfs_btree_cur_t *cur, /* btree cursor */
769 int lev, /* level in btree */ 768 int lev, /* level in btree */
770 xfs_buf_t *bp) /* new buffer to set */ 769 xfs_buf_t *bp) /* new buffer to set */
771{ 770{
772 struct xfs_btree_block *b; /* btree block */ 771 struct xfs_btree_block *b; /* btree block */
773 xfs_buf_t *obp; /* old buffer pointer */
774 772
775 obp = cur->bc_bufs[lev]; 773 if (cur->bc_bufs[lev])
776 if (obp) 774 xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[lev]);
777 xfs_trans_brelse(cur->bc_tp, obp);
778 cur->bc_bufs[lev] = bp; 775 cur->bc_bufs[lev] = bp;
779 cur->bc_ra[lev] = 0; 776 cur->bc_ra[lev] = 0;
780 if (!bp) 777
781 return;
782 b = XFS_BUF_TO_BLOCK(bp); 778 b = XFS_BUF_TO_BLOCK(bp);
783 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { 779 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
784 if (be64_to_cpu(b->bb_u.l.bb_leftsib) == NULLDFSBNO) 780 if (be64_to_cpu(b->bb_u.l.bb_leftsib) == NULLDFSBNO)
@@ -947,13 +943,13 @@ xfs_btree_set_refs(
947 switch (cur->bc_btnum) { 943 switch (cur->bc_btnum) {
948 case XFS_BTNUM_BNO: 944 case XFS_BTNUM_BNO:
949 case XFS_BTNUM_CNT: 945 case XFS_BTNUM_CNT:
950 XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_MAP, XFS_ALLOC_BTREE_REF); 946 XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_ALLOC_BTREE_REF);
951 break; 947 break;
952 case XFS_BTNUM_INO: 948 case XFS_BTNUM_INO:
953 XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_INOMAP, XFS_INO_BTREE_REF); 949 XFS_BUF_SET_VTYPE_REF(bp, B_FS_INOMAP, XFS_INO_BTREE_REF);
954 break; 950 break;
955 case XFS_BTNUM_BMAP: 951 case XFS_BTNUM_BMAP:
956 XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_MAP, XFS_BMAP_BTREE_REF); 952 XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_BMAP_BTREE_REF);
957 break; 953 break;
958 default: 954 default:
959 ASSERT(0); 955 ASSERT(0);
@@ -3011,6 +3007,43 @@ out0:
3011 return 0; 3007 return 0;
3012} 3008}
3013 3009
3010/*
 3011 * Kill the current root node, and replace it with its only child node.
3012 */
3013STATIC int
3014xfs_btree_kill_root(
3015 struct xfs_btree_cur *cur,
3016 struct xfs_buf *bp,
3017 int level,
3018 union xfs_btree_ptr *newroot)
3019{
3020 int error;
3021
3022 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
3023 XFS_BTREE_STATS_INC(cur, killroot);
3024
3025 /*
3026 * Update the root pointer, decreasing the level by 1 and then
3027 * free the old root.
3028 */
3029 cur->bc_ops->set_root(cur, newroot, -1);
3030
3031 error = cur->bc_ops->free_block(cur, bp);
3032 if (error) {
3033 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3034 return error;
3035 }
3036
3037 XFS_BTREE_STATS_INC(cur, free);
3038
3039 cur->bc_bufs[level] = NULL;
3040 cur->bc_ra[level] = 0;
3041 cur->bc_nlevels--;
3042
3043 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3044 return 0;
3045}
3046
3014STATIC int 3047STATIC int
3015xfs_btree_dec_cursor( 3048xfs_btree_dec_cursor(
3016 struct xfs_btree_cur *cur, 3049 struct xfs_btree_cur *cur,
@@ -3195,7 +3228,7 @@ xfs_btree_delrec(
3195 * Make it the new root of the btree. 3228 * Make it the new root of the btree.
3196 */ 3229 */
3197 pp = xfs_btree_ptr_addr(cur, 1, block); 3230 pp = xfs_btree_ptr_addr(cur, 1, block);
3198 error = cur->bc_ops->kill_root(cur, bp, level, pp); 3231 error = xfs_btree_kill_root(cur, bp, level, pp);
3199 if (error) 3232 if (error)
3200 goto error0; 3233 goto error0;
3201 } else if (level > 0) { 3234 } else if (level > 0) {
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
index 7fa07062bdda..82fafc66bd1f 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/xfs_btree.h
@@ -152,9 +152,7 @@ struct xfs_btree_ops {
152 152
153 /* update btree root pointer */ 153 /* update btree root pointer */
154 void (*set_root)(struct xfs_btree_cur *cur, 154 void (*set_root)(struct xfs_btree_cur *cur,
155 union xfs_btree_ptr *nptr, int level_change); 155 union xfs_btree_ptr *nptr, int level_change);
156 int (*kill_root)(struct xfs_btree_cur *cur, struct xfs_buf *bp,
157 int level, union xfs_btree_ptr *newroot);
158 156
159 /* block allocation / freeing */ 157 /* block allocation / freeing */
160 int (*alloc_block)(struct xfs_btree_cur *cur, 158 int (*alloc_block)(struct xfs_btree_cur *cur,
@@ -399,16 +397,6 @@ xfs_btree_reada_bufs(
399 xfs_agblock_t agbno, /* allocation group block number */ 397 xfs_agblock_t agbno, /* allocation group block number */
400 xfs_extlen_t count); /* count of filesystem blocks */ 398 xfs_extlen_t count); /* count of filesystem blocks */
401 399
402/*
403 * Set the buffer for level "lev" in the cursor to bp, releasing
404 * any previous buffer.
405 */
406void
407xfs_btree_setbuf(
408 xfs_btree_cur_t *cur, /* btree cursor */
409 int lev, /* level in btree */
410 struct xfs_buf *bp); /* new buffer to set */
411
412 400
413/* 401/*
414 * Common btree core entry points. 402 * Common btree core entry points.
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 1b09d7a280df..7b7e005e3dcc 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -130,10 +130,12 @@ xfs_buf_item_log_check(
130 orig = bip->bli_orig; 130 orig = bip->bli_orig;
131 buffer = XFS_BUF_PTR(bp); 131 buffer = XFS_BUF_PTR(bp);
132 for (x = 0; x < XFS_BUF_COUNT(bp); x++) { 132 for (x = 0; x < XFS_BUF_COUNT(bp); x++) {
133 if (orig[x] != buffer[x] && !btst(bip->bli_logged, x)) 133 if (orig[x] != buffer[x] && !btst(bip->bli_logged, x)) {
134 cmn_err(CE_PANIC, 134 xfs_emerg(bp->b_mount,
135 "xfs_buf_item_log_check bip %x buffer %x orig %x index %d", 135 "%s: bip %x buffer %x orig %x index %d",
136 bip, bp, orig, x); 136 __func__, bip, bp, orig, x);
137 ASSERT(0);
138 }
137 } 139 }
138} 140}
139#else 141#else
@@ -141,8 +143,7 @@ xfs_buf_item_log_check(
141#define xfs_buf_item_log_check(x) 143#define xfs_buf_item_log_check(x)
142#endif 144#endif
143 145
144STATIC void xfs_buf_error_relse(xfs_buf_t *bp); 146STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp);
145STATIC void xfs_buf_do_callbacks(xfs_buf_t *bp, xfs_log_item_t *lip);
146 147
147/* 148/*
148 * This returns the number of log iovecs needed to log the 149 * This returns the number of log iovecs needed to log the
@@ -428,13 +429,15 @@ xfs_buf_item_unpin(
428 429
429 if (remove) { 430 if (remove) {
430 /* 431 /*
431 * We have to remove the log item from the transaction 432 * If we are in a transaction context, we have to
432 * as we are about to release our reference to the 433 * remove the log item from the transaction as we are
433 * buffer. If we don't, the unlock that occurs later 434 * about to release our reference to the buffer. If we
434 * in xfs_trans_uncommit() will ry to reference the 435 * don't, the unlock that occurs later in
436 * xfs_trans_uncommit() will try to reference the
435 * buffer which we no longer have a hold on. 437 * buffer which we no longer have a hold on.
436 */ 438 */
437 xfs_trans_del_item(lip); 439 if (lip->li_desc)
440 xfs_trans_del_item(lip);
438 441
439 /* 442 /*
440 * Since the transaction no longer refers to the buffer, 443 * Since the transaction no longer refers to the buffer,
@@ -450,7 +453,7 @@ xfs_buf_item_unpin(
450 * xfs_trans_ail_delete() drops the AIL lock. 453 * xfs_trans_ail_delete() drops the AIL lock.
451 */ 454 */
452 if (bip->bli_flags & XFS_BLI_STALE_INODE) { 455 if (bip->bli_flags & XFS_BLI_STALE_INODE) {
453 xfs_buf_do_callbacks(bp, (xfs_log_item_t *)bip); 456 xfs_buf_do_callbacks(bp);
454 XFS_BUF_SET_FSPRIVATE(bp, NULL); 457 XFS_BUF_SET_FSPRIVATE(bp, NULL);
455 XFS_BUF_CLR_IODONE_FUNC(bp); 458 XFS_BUF_CLR_IODONE_FUNC(bp);
456 } else { 459 } else {
@@ -692,8 +695,7 @@ xfs_buf_item_init(
692 * the first. If we do already have one, there is 695 * the first. If we do already have one, there is
693 * nothing to do here so return. 696 * nothing to do here so return.
694 */ 697 */
695 if (bp->b_mount != mp) 698 ASSERT(bp->b_target->bt_mount == mp);
696 bp->b_mount = mp;
697 if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) { 699 if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) {
698 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); 700 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
699 if (lip->li_type == XFS_LI_BUF) { 701 if (lip->li_type == XFS_LI_BUF) {
@@ -919,15 +921,26 @@ xfs_buf_attach_iodone(
919 XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks); 921 XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks);
920} 922}
921 923
924/*
925 * We can have many callbacks on a buffer. Running the callbacks individually
926 * can cause a lot of contention on the AIL lock, so we allow for a single
927 * callback to be able to scan the remaining lip->li_bio_list for other items
928 * of the same type and callback to be processed in the first call.
929 *
930 * As a result, the loop walking the callback list below will also modify the
931 * list. it removes the first item from the list and then runs the callback.
932 * The loop then restarts from the new head of the list. This allows the
933 * callback to scan and modify the list attached to the buffer and we don't
934 * have to care about maintaining a next item pointer.
935 */
922STATIC void 936STATIC void
923xfs_buf_do_callbacks( 937xfs_buf_do_callbacks(
924 xfs_buf_t *bp, 938 struct xfs_buf *bp)
925 xfs_log_item_t *lip)
926{ 939{
927 xfs_log_item_t *nlip; 940 struct xfs_log_item *lip;
928 941
929 while (lip != NULL) { 942 while ((lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *)) != NULL) {
930 nlip = lip->li_bio_list; 943 XFS_BUF_SET_FSPRIVATE(bp, lip->li_bio_list);
931 ASSERT(lip->li_cb != NULL); 944 ASSERT(lip->li_cb != NULL);
932 /* 945 /*
933 * Clear the next pointer so we don't have any 946 * Clear the next pointer so we don't have any
@@ -937,7 +950,6 @@ xfs_buf_do_callbacks(
937 */ 950 */
938 lip->li_bio_list = NULL; 951 lip->li_bio_list = NULL;
939 lip->li_cb(bp, lip); 952 lip->li_cb(bp, lip);
940 lip = nlip;
941 } 953 }
942} 954}
943 955
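The comment block above explains the list-walking discipline: the head item is unhooked before its callback runs, so a callback may itself consume further items of the same type without invalidating the walk. The same pattern, reduced to a self-contained sketch with generic names (not the XFS types):

	struct item {
		struct item	*next;
		void		(*cb)(struct item *);
	};

	static void run_callbacks(struct item **headp)
	{
		struct item *it;

		/* Re-read the head each time; callbacks may have pruned the list. */
		while ((it = *headp) != NULL) {
			*headp = it->next;	/* unlink before calling */
			it->next = NULL;
			it->cb(it);		/* may remove more items via *headp */
		}
	}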
@@ -950,128 +962,75 @@ xfs_buf_do_callbacks(
950 */ 962 */
951void 963void
952xfs_buf_iodone_callbacks( 964xfs_buf_iodone_callbacks(
953 xfs_buf_t *bp) 965 struct xfs_buf *bp)
954{ 966{
955 xfs_log_item_t *lip; 967 struct xfs_log_item *lip = bp->b_fspriv;
956 static ulong lasttime; 968 struct xfs_mount *mp = lip->li_mountp;
957 static xfs_buftarg_t *lasttarg; 969 static ulong lasttime;
958 xfs_mount_t *mp; 970 static xfs_buftarg_t *lasttarg;
959 971
960 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); 972 if (likely(!XFS_BUF_GETERROR(bp)))
961 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); 973 goto do_callbacks;
962 974
963 if (XFS_BUF_GETERROR(bp) != 0) { 975 /*
964 /* 976 * If we've already decided to shutdown the filesystem because of
965 * If we've already decided to shutdown the filesystem 977 * I/O errors, there's no point in giving this a retry.
966 * because of IO errors, there's no point in giving this 978 */
967 * a retry. 979 if (XFS_FORCED_SHUTDOWN(mp)) {
968 */ 980 XFS_BUF_SUPER_STALE(bp);
969 mp = lip->li_mountp; 981 trace_xfs_buf_item_iodone(bp, _RET_IP_);
970 if (XFS_FORCED_SHUTDOWN(mp)) { 982 goto do_callbacks;
971 ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp); 983 }
972 XFS_BUF_SUPER_STALE(bp);
973 trace_xfs_buf_item_iodone(bp, _RET_IP_);
974 xfs_buf_do_callbacks(bp, lip);
975 XFS_BUF_SET_FSPRIVATE(bp, NULL);
976 XFS_BUF_CLR_IODONE_FUNC(bp);
977 xfs_biodone(bp);
978 return;
979 }
980 984
981 if ((XFS_BUF_TARGET(bp) != lasttarg) || 985 if (XFS_BUF_TARGET(bp) != lasttarg ||
982 (time_after(jiffies, (lasttime + 5*HZ)))) { 986 time_after(jiffies, (lasttime + 5*HZ))) {
983 lasttime = jiffies; 987 lasttime = jiffies;
984 cmn_err(CE_ALERT, "Device %s, XFS metadata write error" 988 xfs_alert(mp, "Device %s: metadata write error block 0x%llx",
985 " block 0x%llx in %s", 989 XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)),
986 XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)), 990 (__uint64_t)XFS_BUF_ADDR(bp));
987 (__uint64_t)XFS_BUF_ADDR(bp), mp->m_fsname); 991 }
988 } 992 lasttarg = XFS_BUF_TARGET(bp);
989 lasttarg = XFS_BUF_TARGET(bp);
990 993
991 if (XFS_BUF_ISASYNC(bp)) { 994 /*
992 /* 995 * If the write was asynchronous then no one will be looking for the
993 * If the write was asynchronous then noone will be 996 * error. Clear the error state and write the buffer out again.
994 * looking for the error. Clear the error state 997 *
995 * and write the buffer out again delayed write. 998 * During sync or umount we'll write all pending buffers again
996 * 999 * synchronous, which will catch these errors if they keep hanging
997 * XXXsup This is OK, so long as we catch these 1000 * around.
998 * before we start the umount; we don't want these 1001 */
999 * DELWRI metadata bufs to be hanging around. 1002 if (XFS_BUF_ISASYNC(bp)) {
1000 */ 1003 XFS_BUF_ERROR(bp, 0); /* errno of 0 unsets the flag */
1001 XFS_BUF_ERROR(bp,0); /* errno of 0 unsets the flag */ 1004
1002 1005 if (!XFS_BUF_ISSTALE(bp)) {
1003 if (!(XFS_BUF_ISSTALE(bp))) { 1006 XFS_BUF_DELAYWRITE(bp);
1004 XFS_BUF_DELAYWRITE(bp);
1005 XFS_BUF_DONE(bp);
1006 XFS_BUF_SET_START(bp);
1007 }
1008 ASSERT(XFS_BUF_IODONE_FUNC(bp));
1009 trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
1010 xfs_buf_relse(bp);
1011 } else {
1012 /*
1013 * If the write of the buffer was not asynchronous,
1014 * then we want to make sure to return the error
1015 * to the caller of bwrite(). Because of this we
1016 * cannot clear the B_ERROR state at this point.
1017 * Instead we install a callback function that
1018 * will be called when the buffer is released, and
1019 * that routine will clear the error state and
1020 * set the buffer to be written out again after
1021 * some delay.
1022 */
1023 /* We actually overwrite the existing b-relse
1024 function at times, but we're gonna be shutting down
1025 anyway. */
1026 XFS_BUF_SET_BRELSE_FUNC(bp,xfs_buf_error_relse);
1027 XFS_BUF_DONE(bp); 1007 XFS_BUF_DONE(bp);
1028 XFS_BUF_FINISH_IOWAIT(bp); 1008 XFS_BUF_SET_START(bp);
1029 } 1009 }
1010 ASSERT(XFS_BUF_IODONE_FUNC(bp));
1011 trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
1012 xfs_buf_relse(bp);
1030 return; 1013 return;
1031 } 1014 }
1032 1015
1033 xfs_buf_do_callbacks(bp, lip); 1016 /*
1034 XFS_BUF_SET_FSPRIVATE(bp, NULL); 1017 * If the write of the buffer was synchronous, we want to make
1035 XFS_BUF_CLR_IODONE_FUNC(bp); 1018 * sure to return the error to the caller of xfs_bwrite().
1036 xfs_biodone(bp); 1019 */
1037}
1038
1039/*
1040 * This is a callback routine attached to a buffer which gets an error
1041 * when being written out synchronously.
1042 */
1043STATIC void
1044xfs_buf_error_relse(
1045 xfs_buf_t *bp)
1046{
1047 xfs_log_item_t *lip;
1048 xfs_mount_t *mp;
1049
1050 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
1051 mp = (xfs_mount_t *)lip->li_mountp;
1052 ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp);
1053
1054 XFS_BUF_STALE(bp); 1020 XFS_BUF_STALE(bp);
1055 XFS_BUF_DONE(bp); 1021 XFS_BUF_DONE(bp);
1056 XFS_BUF_UNDELAYWRITE(bp); 1022 XFS_BUF_UNDELAYWRITE(bp);
1057 XFS_BUF_ERROR(bp,0);
1058 1023
1059 trace_xfs_buf_error_relse(bp, _RET_IP_); 1024 trace_xfs_buf_error_relse(bp, _RET_IP_);
1025 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1060 1026
1061 if (! XFS_FORCED_SHUTDOWN(mp)) 1027do_callbacks:
1062 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); 1028 xfs_buf_do_callbacks(bp);
1063 /*
1064 * We have to unpin the pinned buffers so do the
1065 * callbacks.
1066 */
1067 xfs_buf_do_callbacks(bp, lip);
1068 XFS_BUF_SET_FSPRIVATE(bp, NULL); 1029 XFS_BUF_SET_FSPRIVATE(bp, NULL);
1069 XFS_BUF_CLR_IODONE_FUNC(bp); 1030 XFS_BUF_CLR_IODONE_FUNC(bp);
1070 XFS_BUF_SET_BRELSE_FUNC(bp,NULL); 1031 xfs_buf_ioend(bp, 0);
1071 xfs_buf_relse(bp);
1072} 1032}
1073 1033
1074
1075/* 1034/*
1076 * This is the iodone() function for buffers which have been 1035 * This is the iodone() function for buffers which have been
1077 * logged. It is called when they are eventually flushed out. 1036 * logged. It is called when they are eventually flushed out.
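The design choice in the rewritten xfs_buf_iodone_callbacks() above: an asynchronous write error is cleared and the buffer requeued as a delayed write, since a later sync or unmount rewrites it synchronously and will catch a persistent failure, whereas a synchronous write error stales the buffer and forces a shutdown so the error reaches the xfs_bwrite() caller. A condensed shape of the new function, with the rate-limited alert and trace calls omitted (sketch only; the authoritative version is the hunk above):

	void
	xfs_buf_iodone_callbacks(
		struct xfs_buf		*bp)
	{
		struct xfs_log_item	*lip = bp->b_fspriv;
		struct xfs_mount	*mp = lip->li_mountp;

		if (likely(!XFS_BUF_GETERROR(bp)))
			goto do_callbacks;		/* clean completion */

		if (XFS_FORCED_SHUTDOWN(mp)) {
			XFS_BUF_SUPER_STALE(bp);	/* already dead, no retry */
			goto do_callbacks;
		}

		if (XFS_BUF_ISASYNC(bp)) {
			/* nobody waits for this error: clear it and requeue the
			 * buffer as a delayed write so a later sync retries it */
			XFS_BUF_ERROR(bp, 0);
			if (!XFS_BUF_ISSTALE(bp)) {
				XFS_BUF_DELAYWRITE(bp);
				XFS_BUF_DONE(bp);
				XFS_BUF_SET_START(bp);
			}
			xfs_buf_relse(bp);
			return;
		}

		/* synchronous write: the error must reach the xfs_bwrite()
		 * caller, so stale the buffer and shut the filesystem down */
		XFS_BUF_STALE(bp);
		XFS_BUF_DONE(bp);
		XFS_BUF_UNDELAYWRITE(bp);
		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);

	do_callbacks:
		xfs_buf_do_callbacks(bp);
		XFS_BUF_SET_FSPRIVATE(bp, NULL);
		XFS_BUF_CLR_IODONE_FUNC(bp);
		xfs_buf_ioend(bp, 0);
	}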
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index 0e2ed43f16c7..b6ecd2061e7c 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -105,17 +105,6 @@ typedef struct xfs_buf_log_item {
105 xfs_buf_log_format_t bli_format; /* in-log header */ 105 xfs_buf_log_format_t bli_format; /* in-log header */
106} xfs_buf_log_item_t; 106} xfs_buf_log_item_t;
107 107
108/*
109 * This structure is used during recovery to record the buf log
110 * items which have been canceled and should not be replayed.
111 */
112typedef struct xfs_buf_cancel {
113 xfs_daddr_t bc_blkno;
114 uint bc_len;
115 int bc_refcount;
116 struct xfs_buf_cancel *bc_next;
117} xfs_buf_cancel_t;
118
119void xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *); 108void xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *);
120void xfs_buf_item_relse(struct xfs_buf *); 109void xfs_buf_item_relse(struct xfs_buf *);
121void xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint); 110void xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint);
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 30fa0e206fba..6102ac6d1dff 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -1995,13 +1995,12 @@ xfs_da_do_buf(
1995 error = mappedbno == -2 ? 0 : XFS_ERROR(EFSCORRUPTED); 1995 error = mappedbno == -2 ? 0 : XFS_ERROR(EFSCORRUPTED);
1996 if (unlikely(error == EFSCORRUPTED)) { 1996 if (unlikely(error == EFSCORRUPTED)) {
1997 if (xfs_error_level >= XFS_ERRLEVEL_LOW) { 1997 if (xfs_error_level >= XFS_ERRLEVEL_LOW) {
1998 cmn_err(CE_ALERT, "xfs_da_do_buf: bno %lld\n", 1998 xfs_alert(mp, "%s: bno %lld dir: inode %lld",
1999 (long long)bno); 1999 __func__, (long long)bno,
2000 cmn_err(CE_ALERT, "dir: inode %lld\n",
2001 (long long)dp->i_ino); 2000 (long long)dp->i_ino);
2002 for (i = 0; i < nmap; i++) { 2001 for (i = 0; i < nmap; i++) {
2003 cmn_err(CE_ALERT, 2002 xfs_alert(mp,
2004 "[%02d] br_startoff %lld br_startblock %lld br_blockcount %lld br_state %d\n", 2003"[%02d] br_startoff %lld br_startblock %lld br_blockcount %lld br_state %d",
2005 i, 2004 i,
2006 (long long)mapp[i].br_startoff, 2005 (long long)mapp[i].br_startoff,
2007 (long long)mapp[i].br_startblock, 2006 (long long)mapp[i].br_startblock,
@@ -2042,7 +2041,7 @@ xfs_da_do_buf(
2042 mappedbno, nmapped, 0, &bp); 2041 mappedbno, nmapped, 0, &bp);
2043 break; 2042 break;
2044 case 3: 2043 case 3:
2045 xfs_baread(mp->m_ddev_targp, mappedbno, nmapped); 2044 xfs_buf_readahead(mp->m_ddev_targp, mappedbno, nmapped);
2046 error = 0; 2045 error = 0;
2047 bp = NULL; 2046 bp = NULL;
2048 break; 2047 break;
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index 3b9582c60a22..9a84a85c03b1 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -202,7 +202,7 @@ xfs_swap_extents(
202 xfs_inode_t *tip, /* tmp inode */ 202 xfs_inode_t *tip, /* tmp inode */
203 xfs_swapext_t *sxp) 203 xfs_swapext_t *sxp)
204{ 204{
205 xfs_mount_t *mp; 205 xfs_mount_t *mp = ip->i_mount;
206 xfs_trans_t *tp; 206 xfs_trans_t *tp;
207 xfs_bstat_t *sbp = &sxp->sx_stat; 207 xfs_bstat_t *sbp = &sxp->sx_stat;
208 xfs_ifork_t *tempifp, *ifp, *tifp; 208 xfs_ifork_t *tempifp, *ifp, *tifp;
@@ -212,16 +212,12 @@ xfs_swap_extents(
212 int taforkblks = 0; 212 int taforkblks = 0;
213 __uint64_t tmp; 213 __uint64_t tmp;
214 214
215 mp = ip->i_mount;
216
217 tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL); 215 tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL);
218 if (!tempifp) { 216 if (!tempifp) {
219 error = XFS_ERROR(ENOMEM); 217 error = XFS_ERROR(ENOMEM);
220 goto out; 218 goto out;
221 } 219 }
222 220
223 sbp = &sxp->sx_stat;
224
225 /* 221 /*
226 * we have to do two separate lock calls here to keep lockdep 222 * we have to do two separate lock calls here to keep lockdep
227 * happy. If we try to get all the locks in one call, lock will 223 * happy. If we try to get all the locks in one call, lock will
@@ -270,9 +266,9 @@ xfs_swap_extents(
270 /* check inode formats now that data is flushed */ 266 /* check inode formats now that data is flushed */
271 error = xfs_swap_extents_check_format(ip, tip); 267 error = xfs_swap_extents_check_format(ip, tip);
272 if (error) { 268 if (error) {
273 xfs_fs_cmn_err(CE_NOTE, mp, 269 xfs_notice(mp,
274 "%s: inode 0x%llx format is incompatible for exchanging.", 270 "%s: inode 0x%llx format is incompatible for exchanging.",
275 __FILE__, ip->i_ino); 271 __func__, ip->i_ino);
276 goto out_unlock; 272 goto out_unlock;
277 } 273 }
278 274
@@ -377,6 +373,19 @@ xfs_swap_extents(
377 ip->i_d.di_format = tip->i_d.di_format; 373 ip->i_d.di_format = tip->i_d.di_format;
378 tip->i_d.di_format = tmp; 374 tip->i_d.di_format = tmp;
379 375
376 /*
377 * The extents in the source inode could still contain speculative
378 * preallocation beyond EOF (e.g. the file is open but not modified
379 * while defrag is in progress). In that case, we need to copy over the
380 * number of delalloc blocks the data fork in the source inode is
381 * tracking beyond EOF so that when the fork is truncated away when the
382 * temporary inode is unlinked we don't underrun the i_delayed_blks
383 * counter on that inode.
384 */
385 ASSERT(tip->i_delayed_blks == 0);
386 tip->i_delayed_blks = ip->i_delayed_blks;
387 ip->i_delayed_blks = 0;
388
380 ilf_fields = XFS_ILOG_CORE; 389 ilf_fields = XFS_ILOG_CORE;
381 390
382 switch(ip->i_d.di_format) { 391 switch(ip->i_d.di_format) {
diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h
index e5b153b2e6a3..dffba9ba0db6 100644
--- a/fs/xfs/xfs_dinode.h
+++ b/fs/xfs/xfs_dinode.h
@@ -49,8 +49,9 @@ typedef struct xfs_dinode {
49 __be32 di_uid; /* owner's user id */ 49 __be32 di_uid; /* owner's user id */
50 __be32 di_gid; /* owner's group id */ 50 __be32 di_gid; /* owner's group id */
51 __be32 di_nlink; /* number of links to file */ 51 __be32 di_nlink; /* number of links to file */
52 __be16 di_projid; /* owner's project id */ 52 __be16 di_projid_lo; /* lower part of owner's project id */
53 __u8 di_pad[8]; /* unused, zeroed space */ 53 __be16 di_projid_hi; /* higher part owner's project id */
54 __u8 di_pad[6]; /* unused, zeroed space */
54 __be16 di_flushiter; /* incremented on flush */ 55 __be16 di_flushiter; /* incremented on flush */
55 xfs_timestamp_t di_atime; /* time last accessed */ 56 xfs_timestamp_t di_atime; /* time last accessed */
56 xfs_timestamp_t di_mtime; /* time last modified */ 57 xfs_timestamp_t di_mtime; /* time last modified */
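With di_projid split into di_projid_lo/di_projid_hi above, the 32-bit project ID is reassembled from the two 16-bit on-disk halves. A sketch of the accessor pair this implies, operating on the incore copy of the dinode (helper names assumed; the in-tree helpers live in xfs_inode.h):

	/* assumed helper names, shown for illustration of the hi/lo split */
	static inline prid_t
	xfs_get_projid(struct xfs_inode *ip)
	{
		return (prid_t)ip->i_d.di_projid_hi << 16 | ip->i_d.di_projid_lo;
	}

	static inline void
	xfs_set_projid(struct xfs_inode *ip, prid_t projid)
	{
		ip->i_d.di_projid_hi = (__uint16_t)(projid >> 16);
		ip->i_d.di_projid_lo = (__uint16_t)(projid & 0xffff);
	}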
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index a1321bc7f192..dba7a71cedf3 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -159,7 +159,7 @@ xfs_dir_ino_validate(
159 XFS_AGINO_TO_INO(mp, agno, agino) == ino; 159 XFS_AGINO_TO_INO(mp, agno, agino) == ino;
160 if (unlikely(XFS_TEST_ERROR(!ino_ok, mp, XFS_ERRTAG_DIR_INO_VALIDATE, 160 if (unlikely(XFS_TEST_ERROR(!ino_ok, mp, XFS_ERRTAG_DIR_INO_VALIDATE,
161 XFS_RANDOM_DIR_INO_VALIDATE))) { 161 XFS_RANDOM_DIR_INO_VALIDATE))) {
162 xfs_fs_cmn_err(CE_WARN, mp, "Invalid inode number 0x%Lx", 162 xfs_warn(mp, "Invalid inode number 0x%Lx",
163 (unsigned long long) ino); 163 (unsigned long long) ino);
164 XFS_ERROR_REPORT("xfs_dir_ino_validate", XFS_ERRLEVEL_LOW, mp); 164 XFS_ERROR_REPORT("xfs_dir_ino_validate", XFS_ERRLEVEL_LOW, mp);
165 return XFS_ERROR(EFSCORRUPTED); 165 return XFS_ERROR(EFSCORRUPTED);
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index 504be8640e91..ae891223be90 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -961,7 +961,7 @@ xfs_dir2_leaf_getdents(
961 if (i > ra_current && 961 if (i > ra_current &&
962 map[ra_index].br_blockcount >= 962 map[ra_index].br_blockcount >=
963 mp->m_dirblkfsbs) { 963 mp->m_dirblkfsbs) {
964 xfs_baread(mp->m_ddev_targp, 964 xfs_buf_readahead(mp->m_ddev_targp,
965 XFS_FSB_TO_DADDR(mp, 965 XFS_FSB_TO_DADDR(mp,
966 map[ra_index].br_startblock + 966 map[ra_index].br_startblock +
967 ra_offset), 967 ra_offset),
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index f9a0864b696a..a0aab7d3294f 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -899,10 +899,9 @@ xfs_dir2_leafn_rebalance(
899 if(blk2->index < 0) { 899 if(blk2->index < 0) {
900 state->inleaf = 1; 900 state->inleaf = 1;
901 blk2->index = 0; 901 blk2->index = 0;
902 cmn_err(CE_ALERT, 902 xfs_alert(args->dp->i_mount,
903 "xfs_dir2_leafn_rebalance: picked the wrong leaf? reverting original leaf: " 903 "%s: picked the wrong leaf? reverting original leaf: blk1->index %d\n",
904 "blk1->index %d\n", 904 __func__, blk1->index);
905 blk1->index);
906 } 905 }
907} 906}
908 907
@@ -1641,26 +1640,22 @@ xfs_dir2_node_addname_int(
1641 } 1640 }
1642 1641
1643 if (unlikely(xfs_dir2_db_to_fdb(mp, dbno) != fbno)) { 1642 if (unlikely(xfs_dir2_db_to_fdb(mp, dbno) != fbno)) {
1644 cmn_err(CE_ALERT, 1643 xfs_alert(mp,
1645 "xfs_dir2_node_addname_int: dir ino " 1644 "%s: dir ino " "%llu needed freesp block %lld for\n"
1646 "%llu needed freesp block %lld for\n" 1645 " data block %lld, got %lld ifbno %llu lastfbno %d",
1647 " data block %lld, got %lld\n" 1646 __func__, (unsigned long long)dp->i_ino,
1648 " ifbno %llu lastfbno %d\n",
1649 (unsigned long long)dp->i_ino,
1650 (long long)xfs_dir2_db_to_fdb(mp, dbno), 1647 (long long)xfs_dir2_db_to_fdb(mp, dbno),
1651 (long long)dbno, (long long)fbno, 1648 (long long)dbno, (long long)fbno,
1652 (unsigned long long)ifbno, lastfbno); 1649 (unsigned long long)ifbno, lastfbno);
1653 if (fblk) { 1650 if (fblk) {
1654 cmn_err(CE_ALERT, 1651 xfs_alert(mp,
1655 " fblk 0x%p blkno %llu " 1652 " fblk 0x%p blkno %llu index %d magic 0x%x",
1656 "index %d magic 0x%x\n",
1657 fblk, 1653 fblk,
1658 (unsigned long long)fblk->blkno, 1654 (unsigned long long)fblk->blkno,
1659 fblk->index, 1655 fblk->index,
1660 fblk->magic); 1656 fblk->magic);
1661 } else { 1657 } else {
1662 cmn_err(CE_ALERT, 1658 xfs_alert(mp, " ... fblk is NULL");
1663 " ... fblk is NULL\n");
1664 } 1659 }
1665 XFS_ERROR_REPORT("xfs_dir2_node_addname_int", 1660 XFS_ERROR_REPORT("xfs_dir2_node_addname_int",
1666 XFS_ERRLEVEL_LOW, mp); 1661 XFS_ERRLEVEL_LOW, mp);
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index ed9990267661..39f06336b99d 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -48,7 +48,7 @@ xfs_error_trap(int e)
48 break; 48 break;
49 if (e != xfs_etrap[i]) 49 if (e != xfs_etrap[i])
50 continue; 50 continue;
51 cmn_err(CE_NOTE, "xfs_error_trap: error %d", e); 51 xfs_notice(NULL, "%s: error %d", __func__, e);
52 BUG(); 52 BUG();
53 break; 53 break;
54 } 54 }
@@ -58,6 +58,7 @@ xfs_error_trap(int e)
58int xfs_etest[XFS_NUM_INJECT_ERROR]; 58int xfs_etest[XFS_NUM_INJECT_ERROR];
59int64_t xfs_etest_fsid[XFS_NUM_INJECT_ERROR]; 59int64_t xfs_etest_fsid[XFS_NUM_INJECT_ERROR];
60char * xfs_etest_fsname[XFS_NUM_INJECT_ERROR]; 60char * xfs_etest_fsname[XFS_NUM_INJECT_ERROR];
61int xfs_error_test_active;
61 62
62int 63int
63xfs_error_test(int error_tag, int *fsidp, char *expression, 64xfs_error_test(int error_tag, int *fsidp, char *expression,
@@ -73,7 +74,7 @@ xfs_error_test(int error_tag, int *fsidp, char *expression,
73 74
74 for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { 75 for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) {
75 if (xfs_etest[i] == error_tag && xfs_etest_fsid[i] == fsid) { 76 if (xfs_etest[i] == error_tag && xfs_etest_fsid[i] == fsid) {
76 cmn_err(CE_WARN, 77 xfs_warn(NULL,
77 "Injecting error (%s) at file %s, line %d, on filesystem \"%s\"", 78 "Injecting error (%s) at file %s, line %d, on filesystem \"%s\"",
78 expression, file, line, xfs_etest_fsname[i]); 79 expression, file, line, xfs_etest_fsname[i]);
79 return 1; 80 return 1;
@@ -94,25 +95,26 @@ xfs_errortag_add(int error_tag, xfs_mount_t *mp)
94 95
95 for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { 96 for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) {
96 if (xfs_etest_fsid[i] == fsid && xfs_etest[i] == error_tag) { 97 if (xfs_etest_fsid[i] == fsid && xfs_etest[i] == error_tag) {
97 cmn_err(CE_WARN, "XFS error tag #%d on", error_tag); 98 xfs_warn(mp, "error tag #%d on", error_tag);
98 return 0; 99 return 0;
99 } 100 }
100 } 101 }
101 102
102 for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { 103 for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) {
103 if (xfs_etest[i] == 0) { 104 if (xfs_etest[i] == 0) {
104 cmn_err(CE_WARN, "Turned on XFS error tag #%d", 105 xfs_warn(mp, "Turned on XFS error tag #%d",
105 error_tag); 106 error_tag);
106 xfs_etest[i] = error_tag; 107 xfs_etest[i] = error_tag;
107 xfs_etest_fsid[i] = fsid; 108 xfs_etest_fsid[i] = fsid;
108 len = strlen(mp->m_fsname); 109 len = strlen(mp->m_fsname);
109 xfs_etest_fsname[i] = kmem_alloc(len + 1, KM_SLEEP); 110 xfs_etest_fsname[i] = kmem_alloc(len + 1, KM_SLEEP);
110 strcpy(xfs_etest_fsname[i], mp->m_fsname); 111 strcpy(xfs_etest_fsname[i], mp->m_fsname);
112 xfs_error_test_active++;
111 return 0; 113 return 0;
112 } 114 }
113 } 115 }
114 116
115 cmn_err(CE_WARN, "error tag overflow, too many turned on"); 117 xfs_warn(mp, "error tag overflow, too many turned on");
116 118
117 return 1; 119 return 1;
118} 120}
@@ -131,55 +133,23 @@ xfs_errortag_clearall(xfs_mount_t *mp, int loud)
131 if ((fsid == 0LL || xfs_etest_fsid[i] == fsid) && 133 if ((fsid == 0LL || xfs_etest_fsid[i] == fsid) &&
132 xfs_etest[i] != 0) { 134 xfs_etest[i] != 0) {
133 cleared = 1; 135 cleared = 1;
134 cmn_err(CE_WARN, "Clearing XFS error tag #%d", 136 xfs_warn(mp, "Clearing XFS error tag #%d",
135 xfs_etest[i]); 137 xfs_etest[i]);
136 xfs_etest[i] = 0; 138 xfs_etest[i] = 0;
137 xfs_etest_fsid[i] = 0LL; 139 xfs_etest_fsid[i] = 0LL;
138 kmem_free(xfs_etest_fsname[i]); 140 kmem_free(xfs_etest_fsname[i]);
139 xfs_etest_fsname[i] = NULL; 141 xfs_etest_fsname[i] = NULL;
142 xfs_error_test_active--;
140 } 143 }
141 } 144 }
142 145
143 if (loud || cleared) 146 if (loud || cleared)
144 cmn_err(CE_WARN, 147 xfs_warn(mp, "Cleared all XFS error tags for filesystem");
145 "Cleared all XFS error tags for filesystem \"%s\"",
146 mp->m_fsname);
147 148
148 return 0; 149 return 0;
149} 150}
150#endif /* DEBUG */ 151#endif /* DEBUG */
151 152
152
153void
154xfs_fs_cmn_err(int level, xfs_mount_t *mp, char *fmt, ...)
155{
156 va_list ap;
157
158 va_start(ap, fmt);
159 xfs_fs_vcmn_err(level, mp, fmt, ap);
160 va_end(ap);
161}
162
163void
164xfs_cmn_err(int panic_tag, int level, xfs_mount_t *mp, char *fmt, ...)
165{
166 va_list ap;
167
168#ifdef DEBUG
169 xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES);
170#endif
171
172 if (xfs_panic_mask && (xfs_panic_mask & panic_tag)
173 && (level & CE_ALERT)) {
174 level &= ~CE_ALERT;
175 level |= CE_PANIC;
176 cmn_err(CE_ALERT, "XFS: Transforming an alert into a BUG.");
177 }
178 va_start(ap, fmt);
179 xfs_fs_vcmn_err(level, mp, fmt, ap);
180 va_end(ap);
181}
182
183void 153void
184xfs_error_report( 154xfs_error_report(
185 const char *tag, 155 const char *tag,
@@ -190,9 +160,8 @@ xfs_error_report(
190 inst_t *ra) 160 inst_t *ra)
191{ 161{
192 if (level <= xfs_error_level) { 162 if (level <= xfs_error_level) {
193 xfs_cmn_err(XFS_PTAG_ERROR_REPORT, 163 xfs_alert_tag(mp, XFS_PTAG_ERROR_REPORT,
194 CE_ALERT, mp, 164 "Internal error %s at line %d of file %s. Caller 0x%p\n",
195 "XFS internal error %s at line %d of file %s. Caller 0x%p\n",
196 tag, linenum, filename, ra); 165 tag, linenum, filename, ra);
197 166
198 xfs_stack_trace(); 167 xfs_stack_trace();
@@ -212,4 +181,5 @@ xfs_corruption_error(
212 if (level <= xfs_error_level) 181 if (level <= xfs_error_level)
213 xfs_hex_dump(p, 16); 182 xfs_hex_dump(p, 16);
214 xfs_error_report(tag, level, mp, filename, linenum, ra); 183 xfs_error_report(tag, level, mp, filename, linenum, ra);
184 xfs_alert(mp, "Corruption detected. Unmount and run xfs_repair");
215} 185}
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index c2c1a072bb82..079a367f44ee 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -127,16 +127,17 @@ extern void xfs_corruption_error(const char *tag, int level,
127#define XFS_RANDOM_BMAPIFORMAT XFS_RANDOM_DEFAULT 127#define XFS_RANDOM_BMAPIFORMAT XFS_RANDOM_DEFAULT
128 128
129#ifdef DEBUG 129#ifdef DEBUG
130extern int xfs_error_test_active;
130extern int xfs_error_test(int, int *, char *, int, char *, unsigned long); 131extern int xfs_error_test(int, int *, char *, int, char *, unsigned long);
131 132
132#define XFS_NUM_INJECT_ERROR 10 133#define XFS_NUM_INJECT_ERROR 10
133#define XFS_TEST_ERROR(expr, mp, tag, rf) \ 134#define XFS_TEST_ERROR(expr, mp, tag, rf) \
134 ((expr) || \ 135 ((expr) || (xfs_error_test_active && \
135 xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \ 136 xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \
136 (rf))) 137 (rf))))
137 138
138extern int xfs_errortag_add(int error_tag, xfs_mount_t *mp); 139extern int xfs_errortag_add(int error_tag, struct xfs_mount *mp);
139extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud); 140extern int xfs_errortag_clearall(struct xfs_mount *mp, int loud);
140#else 141#else
141#define XFS_TEST_ERROR(expr, mp, tag, rf) (expr) 142#define XFS_TEST_ERROR(expr, mp, tag, rf) (expr)
142#define xfs_errortag_add(tag, mp) (ENOSYS) 143#define xfs_errortag_add(tag, mp) (ENOSYS)
@@ -144,10 +145,8 @@ extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud);
144#endif /* DEBUG */ 145#endif /* DEBUG */
145 146
146/* 147/*
147 * XFS panic tags -- allow a call to xfs_cmn_err() be turned into 148 * XFS panic tags -- allow a call to xfs_alert_tag() be turned into
148 * a panic by setting xfs_panic_mask in a 149 * a panic by setting xfs_panic_mask in a sysctl.
149 * sysctl. update xfs_max[XFS_PARAM] if
150 * more are added.
151 */ 150 */
152#define XFS_NO_PTAG 0 151#define XFS_NO_PTAG 0
153#define XFS_PTAG_IFLUSH 0x00000001 152#define XFS_PTAG_IFLUSH 0x00000001
@@ -159,23 +158,4 @@ extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud);
159#define XFS_PTAG_SHUTDOWN_LOGERROR 0x00000040 158#define XFS_PTAG_SHUTDOWN_LOGERROR 0x00000040
160#define XFS_PTAG_FSBLOCK_ZERO 0x00000080 159#define XFS_PTAG_FSBLOCK_ZERO 0x00000080
161 160
162struct xfs_mount;
163
164extern void xfs_fs_vcmn_err(int level, struct xfs_mount *mp,
165 char *fmt, va_list ap)
166 __attribute__ ((format (printf, 3, 0)));
167extern void xfs_cmn_err(int panic_tag, int level, struct xfs_mount *mp,
168 char *fmt, ...)
169 __attribute__ ((format (printf, 4, 5)));
170extern void xfs_fs_cmn_err(int level, struct xfs_mount *mp, char *fmt, ...)
171 __attribute__ ((format (printf, 3, 4)));
172
173extern void xfs_hex_dump(void *p, int length);
174
175#define xfs_fs_repair_cmn_err(level, mp, fmt, args...) \
176 xfs_fs_cmn_err(level, mp, fmt " Unmount and run xfs_repair.", ## args)
177
178#define xfs_fs_mount_cmn_err(f, fmt, args...) \
179 ((f & XFS_MFSI_QUIET)? (void)0 : cmn_err(CE_WARN, "XFS: " fmt, ## args))
180
181#endif /* __XFS_ERROR_H__ */ 161#endif /* __XFS_ERROR_H__ */
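The xfs_error_test_active counter introduced above exists so the very hot XFS_TEST_ERROR() sites do not pay for a function call unless at least one error tag is armed: xfs_errortag_add() increments it, xfs_errortag_clearall() decrements it, and the macro short-circuits on it. The same guard-variable pattern in isolation, as a minimal sketch with generic names:

	/* Cheap global gate: only take the slow path while injection is armed. */
	static int error_test_active;

	static int error_test_slowpath(int tag)
	{
		/* scan the table of armed tags; nonzero means inject an error */
		return 0;
	}

	#define TEST_ERROR(expr, tag) \
		((expr) || (error_test_active && error_test_slowpath(tag)))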
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index a55e687bf562..d22e62623437 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -48,6 +48,28 @@ xfs_efi_item_free(
48} 48}
49 49
50/* 50/*
51 * Freeing the efi requires that we remove it from the AIL if it has already
52 * been placed there. However, the EFI may not yet have been placed in the AIL
53 * when called by xfs_efi_release() from EFD processing due to the ordering of
54 * committed vs unpin operations in bulk insert operations. Hence the
55 * test_and_clear_bit(XFS_EFI_COMMITTED) to ensure only the last caller frees
56 * the EFI.
57 */
58STATIC void
59__xfs_efi_release(
60 struct xfs_efi_log_item *efip)
61{
62 struct xfs_ail *ailp = efip->efi_item.li_ailp;
63
64 if (!test_and_clear_bit(XFS_EFI_COMMITTED, &efip->efi_flags)) {
65 spin_lock(&ailp->xa_lock);
66 /* xfs_trans_ail_delete() drops the AIL lock. */
67 xfs_trans_ail_delete(ailp, &efip->efi_item);
68 xfs_efi_item_free(efip);
69 }
70}
71
72/*
51 * This returns the number of iovecs needed to log the given efi item. 73 * This returns the number of iovecs needed to log the given efi item.
52 * We only need 1 iovec for an efi item. It just logs the efi_log_format 74 * We only need 1 iovec for an efi item. It just logs the efi_log_format
53 * structure. 75 * structure.
@@ -74,7 +96,8 @@ xfs_efi_item_format(
74 struct xfs_efi_log_item *efip = EFI_ITEM(lip); 96 struct xfs_efi_log_item *efip = EFI_ITEM(lip);
75 uint size; 97 uint size;
76 98
77 ASSERT(efip->efi_next_extent == efip->efi_format.efi_nextents); 99 ASSERT(atomic_read(&efip->efi_next_extent) ==
100 efip->efi_format.efi_nextents);
78 101
79 efip->efi_format.efi_type = XFS_LI_EFI; 102 efip->efi_format.efi_type = XFS_LI_EFI;
80 103
@@ -99,10 +122,12 @@ xfs_efi_item_pin(
99} 122}
100 123
101/* 124/*
102 * While EFIs cannot really be pinned, the unpin operation is the 125 * While EFIs cannot really be pinned, the unpin operation is the last place at
103 * last place at which the EFI is manipulated during a transaction. 126 * which the EFI is manipulated during a transaction. If we are being asked to
104 * Here we coordinate with xfs_efi_cancel() to determine who gets to 127 * remove the EFI it's because the transaction has been cancelled and by
105 * free the EFI. 128 * definition that means the EFI cannot be in the AIL so remove it from the
129 * transaction and free it. Otherwise coordinate with xfs_efi_release() (via
130 * XFS_EFI_COMMITTED) to determine who gets to free the EFI.
106 */ 131 */
107STATIC void 132STATIC void
108xfs_efi_item_unpin( 133xfs_efi_item_unpin(
@@ -110,20 +135,15 @@ xfs_efi_item_unpin(
110 int remove) 135 int remove)
111{ 136{
112 struct xfs_efi_log_item *efip = EFI_ITEM(lip); 137 struct xfs_efi_log_item *efip = EFI_ITEM(lip);
113 struct xfs_ail *ailp = lip->li_ailp;
114 138
115 spin_lock(&ailp->xa_lock); 139 if (remove) {
116 if (efip->efi_flags & XFS_EFI_CANCELED) { 140 ASSERT(!(lip->li_flags & XFS_LI_IN_AIL));
117 if (remove) 141 if (lip->li_desc)
118 xfs_trans_del_item(lip); 142 xfs_trans_del_item(lip);
119
120 /* xfs_trans_ail_delete() drops the AIL lock. */
121 xfs_trans_ail_delete(ailp, lip);
122 xfs_efi_item_free(efip); 143 xfs_efi_item_free(efip);
123 } else { 144 return;
124 efip->efi_flags |= XFS_EFI_COMMITTED;
125 spin_unlock(&ailp->xa_lock);
126 } 145 }
146 __xfs_efi_release(efip);
127} 147}
128 148
129/* 149/*
@@ -152,16 +172,20 @@ xfs_efi_item_unlock(
152} 172}
153 173
154/* 174/*
155 * The EFI is logged only once and cannot be moved in the log, so 175 * The EFI is logged only once and cannot be moved in the log, so simply return
156 * simply return the lsn at which it's been logged. The canceled 176 * the lsn at which it's been logged. For bulk transaction committed
157 * flag is not paid any attention here. Checking for that is delayed 177 * processing, the EFI may be processed but not yet unpinned prior to the EFD
158 * until the EFI is unpinned. 178 * being processed. Set the XFS_EFI_COMMITTED flag so this case can be detected
179 * when processing the EFD.
159 */ 180 */
160STATIC xfs_lsn_t 181STATIC xfs_lsn_t
161xfs_efi_item_committed( 182xfs_efi_item_committed(
162 struct xfs_log_item *lip, 183 struct xfs_log_item *lip,
163 xfs_lsn_t lsn) 184 xfs_lsn_t lsn)
164{ 185{
186 struct xfs_efi_log_item *efip = EFI_ITEM(lip);
187
188 set_bit(XFS_EFI_COMMITTED, &efip->efi_flags);
165 return lsn; 189 return lsn;
166} 190}
167 191
@@ -230,6 +254,7 @@ xfs_efi_init(
230 xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops); 254 xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops);
231 efip->efi_format.efi_nextents = nextents; 255 efip->efi_format.efi_nextents = nextents;
232 efip->efi_format.efi_id = (__psint_t)(void*)efip; 256 efip->efi_format.efi_id = (__psint_t)(void*)efip;
257 atomic_set(&efip->efi_next_extent, 0);
233 258
234 return efip; 259 return efip;
235} 260}
@@ -289,37 +314,18 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
289} 314}
290 315
291/* 316/*
292 * This is called by the efd item code below to release references to 317 * This is called by the efd item code below to release references to the given
293 * the given efi item. Each efd calls this with the number of 318 * efi item. Each efd calls this with the number of extents that it has
294 * extents that it has logged, and when the sum of these reaches 319 * logged, and when the sum of these reaches the total number of extents logged
295 * the total number of extents logged by this efi item we can free 320 * by this efi item we can free the efi item.
296 * the efi item.
297 *
298 * Freeing the efi item requires that we remove it from the AIL.
299 * We'll use the AIL lock to protect our counters as well as
300 * the removal from the AIL.
301 */ 321 */
302void 322void
303xfs_efi_release(xfs_efi_log_item_t *efip, 323xfs_efi_release(xfs_efi_log_item_t *efip,
304 uint nextents) 324 uint nextents)
305{ 325{
306 struct xfs_ail *ailp = efip->efi_item.li_ailp; 326 ASSERT(atomic_read(&efip->efi_next_extent) >= nextents);
307 int extents_left; 327 if (atomic_sub_and_test(nextents, &efip->efi_next_extent))
308 328 __xfs_efi_release(efip);
309 ASSERT(efip->efi_next_extent > 0);
310 ASSERT(efip->efi_flags & XFS_EFI_COMMITTED);
311
312 spin_lock(&ailp->xa_lock);
313 ASSERT(efip->efi_next_extent >= nextents);
314 efip->efi_next_extent -= nextents;
315 extents_left = efip->efi_next_extent;
316 if (extents_left == 0) {
317 /* xfs_trans_ail_delete() drops the AIL lock. */
318 xfs_trans_ail_delete(ailp, (xfs_log_item_t *)efip);
319 xfs_efi_item_free(efip);
320 } else {
321 spin_unlock(&ailp->xa_lock);
322 }
323} 329}
324 330
325static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip) 331static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip)
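Taken together, the EFI changes above replace a lock-protected counter and the XFS_EFI_CANCELED flag with two lock-free pieces: efi_next_extent becomes an atomic_t that each EFD decrements, with atomic_sub_and_test() detecting the final decrement, and XFS_EFI_COMMITTED becomes a bit number where test_and_clear_bit() decides whether unpin or EFD processing frees the item, since bulk commit processing can run them in either order. The core pattern, reduced to a sketch with generic names (free_intent() is a hypothetical teardown helper):

	#include <linux/atomic.h>
	#include <linux/bitops.h>

	struct intent {
		atomic_t	remaining;	/* work units still outstanding */
		unsigned long	flags;		/* bit flags, set/clear/test_bit */
	};
	#define INTENT_COMMITTED 0		/* bit number, not a mask */

	extern void free_intent(struct intent *in);	/* hypothetical teardown */

	/* Reached from both unpin and done-item processing; whichever arrives
	 * second finds the COMMITTED bit already cleared and does the free. */
	static void intent_release_final(struct intent *in)
	{
		if (!test_and_clear_bit(INTENT_COMMITTED, &in->flags))
			free_intent(in);
	}

	static void intent_done(struct intent *in, unsigned int count)
	{
		if (atomic_sub_and_test(count, &in->remaining))
			intent_release_final(in);
	}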
diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h
index 0d22c56fdf64..375f68e42531 100644
--- a/fs/xfs/xfs_extfree_item.h
+++ b/fs/xfs/xfs_extfree_item.h
@@ -111,11 +111,10 @@ typedef struct xfs_efd_log_format_64 {
111#define XFS_EFI_MAX_FAST_EXTENTS 16 111#define XFS_EFI_MAX_FAST_EXTENTS 16
112 112
113/* 113/*
114 * Define EFI flags. 114 * Define EFI flag bits. Manipulated by set/clear/test_bit operators.
115 */ 115 */
116#define XFS_EFI_RECOVERED 0x1 116#define XFS_EFI_RECOVERED 1
117#define XFS_EFI_COMMITTED 0x2 117#define XFS_EFI_COMMITTED 2
118#define XFS_EFI_CANCELED 0x4
119 118
120/* 119/*
121 * This is the "extent free intention" log item. It is used 120 * This is the "extent free intention" log item. It is used
@@ -125,8 +124,8 @@ typedef struct xfs_efd_log_format_64 {
125 */ 124 */
126typedef struct xfs_efi_log_item { 125typedef struct xfs_efi_log_item {
127 xfs_log_item_t efi_item; 126 xfs_log_item_t efi_item;
128 uint efi_flags; /* misc flags */ 127 atomic_t efi_next_extent;
129 uint efi_next_extent; 128 unsigned long efi_flags; /* misc flags */
130 xfs_efi_log_format_t efi_format; 129 xfs_efi_log_format_t efi_format;
131} xfs_efi_log_item_t; 130} xfs_efi_log_item_t;
132 131
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index 9b715dce5699..9124425b7f2f 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -744,9 +744,15 @@ xfs_filestream_new_ag(
744 * If the file's parent directory is known, take its iolock in exclusive 744 * If the file's parent directory is known, take its iolock in exclusive
745 * mode to prevent two sibling files from racing each other to migrate 745 * mode to prevent two sibling files from racing each other to migrate
746 * themselves and their parent to different AGs. 746 * themselves and their parent to different AGs.
747 *
748 * Note that we lock the parent directory iolock inside the child
749 * iolock here. That's fine as we never hold both parent and child
750 * iolock in any other place. This is different from the ilock,
751 * which requires locking of the child after the parent for namespace
752 * operations.
747 */ 753 */
748 if (pip) 754 if (pip)
749 xfs_ilock(pip, XFS_IOLOCK_EXCL); 755 xfs_ilock(pip, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
750 756
751 /* 757 /*
752 * A new AG needs to be found for the file. If the file's parent 758 * A new AG needs to be found for the file. If the file's parent
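The new comment and the XFS_IOLOCK_PARENT flag above encode a lock-ordering exception for lockdep: the filestreams code takes the parent directory's iolock while already holding the child's, the reverse of the usual parent-then-child order, so the parent acquisition is annotated with a separate lockdep subclass. A sketch of how such a nested acquisition reads in a caller (assuming XFS_IOLOCK_PARENT selects a lockdep subclass inside xfs_ilock(), as this patch arranges):

	/* the child's iolock is already held by the caller at this point */
	xfs_ilock(ip, XFS_IOLOCK_EXCL);

	/* parent taken second, in its own lockdep subclass so lockdep does
	 * not report a false deadlock against the normal parent->child order */
	if (pip)
		xfs_ilock(pip, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);

	/* ... pick a new AG for ip, and for pip if present ... */

	if (pip)
		xfs_iunlock(pip, XFS_IOLOCK_EXCL);
	xfs_iunlock(ip, XFS_IOLOCK_EXCL);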
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index 87c2e9d02288..8f6fc1a96386 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -293,9 +293,11 @@ typedef struct xfs_bstat {
293 __s32 bs_extsize; /* extent size */ 293 __s32 bs_extsize; /* extent size */
294 __s32 bs_extents; /* number of extents */ 294 __s32 bs_extents; /* number of extents */
295 __u32 bs_gen; /* generation count */ 295 __u32 bs_gen; /* generation count */
296 __u16 bs_projid; /* project id */ 296 __u16 bs_projid_lo; /* lower part of project id */
297#define bs_projid bs_projid_lo /* (previously just bs_projid) */
297 __u16 bs_forkoff; /* inode fork offset in bytes */ 298 __u16 bs_forkoff; /* inode fork offset in bytes */
298 unsigned char bs_pad[12]; /* pad space, unused */ 299 __u16 bs_projid_hi; /* higher part of project id */
300 unsigned char bs_pad[10]; /* pad space, unused */
299 __u32 bs_dmevmask; /* DMIG event mask */ 301 __u32 bs_dmevmask; /* DMIG event mask */
300 __u16 bs_dmstate; /* DMIG state info */ 302 __u16 bs_dmstate; /* DMIG state info */
301 __u16 bs_aextents; /* attribute number of extents */ 303 __u16 bs_aextents; /* attribute number of extents */
@@ -448,6 +450,7 @@ typedef struct xfs_handle {
448/* XFS_IOC_SETBIOSIZE ---- deprecated 46 */ 450/* XFS_IOC_SETBIOSIZE ---- deprecated 46 */
449/* XFS_IOC_GETBIOSIZE ---- deprecated 47 */ 451/* XFS_IOC_GETBIOSIZE ---- deprecated 47 */
450#define XFS_IOC_GETBMAPX _IOWR('X', 56, struct getbmap) 452#define XFS_IOC_GETBMAPX _IOWR('X', 56, struct getbmap)
453#define XFS_IOC_ZERO_RANGE _IOW ('X', 57, struct xfs_flock64)
451 454
452/* 455/*
453 * ioctl commands that replace IRIX syssgi()'s 456 * ioctl commands that replace IRIX syssgi()'s
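The new XFS_IOC_ZERO_RANGE ioctl above takes the same xfs_flock64 argument as the existing preallocation ioctls; l_whence/l_start/l_len select the byte range to be read back as zeros without writing data blocks. A hedged userspace usage sketch (header path and error handling are assumptions; only the ioctl number and argument type come from the hunk above):

	#include <xfs/xfs.h>		/* assumed xfsprogs header exposing XFS_IOC_ZERO_RANGE */
	#include <sys/ioctl.h>

	/* Zero 'len' bytes starting at 'off' without writing data blocks. */
	static int zero_range(int fd, long long off, long long len)
	{
		struct xfs_flock64 fl = {
			.l_whence = 0,		/* SEEK_SET semantics */
			.l_start  = off,
			.l_len    = len,
		};

		return ioctl(fd, XFS_IOC_ZERO_RANGE, &fl);
	}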
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 43b1d5699335..9153d2c77caf 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -53,6 +53,9 @@ xfs_fs_geometry(
53 xfs_fsop_geom_t *geo, 53 xfs_fsop_geom_t *geo,
54 int new_version) 54 int new_version)
55{ 55{
56
57 memset(geo, 0, sizeof(*geo));
58
56 geo->blocksize = mp->m_sb.sb_blocksize; 59 geo->blocksize = mp->m_sb.sb_blocksize;
57 geo->rtextsize = mp->m_sb.sb_rextsize; 60 geo->rtextsize = mp->m_sb.sb_rextsize;
58 geo->agblocks = mp->m_sb.sb_agblocks; 61 geo->agblocks = mp->m_sb.sb_agblocks;
@@ -144,12 +147,11 @@ xfs_growfs_data_private(
144 if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb))) 147 if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb)))
145 return error; 148 return error;
146 dpct = pct - mp->m_sb.sb_imax_pct; 149 dpct = pct - mp->m_sb.sb_imax_pct;
147 error = xfs_read_buf(mp, mp->m_ddev_targp, 150 bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp,
148 XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1), 151 XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),
149 XFS_FSS_TO_BB(mp, 1), 0, &bp); 152 BBTOB(XFS_FSS_TO_BB(mp, 1)), 0);
150 if (error) 153 if (!bp)
151 return error; 154 return EIO;
152 ASSERT(bp);
153 xfs_buf_relse(bp); 155 xfs_buf_relse(bp);
154 156
155 new = nb; /* use new as a temporary here */ 157 new = nb; /* use new as a temporary here */
@@ -375,6 +377,7 @@ xfs_growfs_data_private(
375 mp->m_maxicount = icount << mp->m_sb.sb_inopblog; 377 mp->m_maxicount = icount << mp->m_sb.sb_inopblog;
376 } else 378 } else
377 mp->m_maxicount = 0; 379 mp->m_maxicount = 0;
380 xfs_set_low_space_thresholds(mp);
378 381
379 /* update secondary superblocks. */ 382 /* update secondary superblocks. */
380 for (agno = 1; agno < nagcount; agno++) { 383 for (agno = 1; agno < nagcount; agno++) {
@@ -382,8 +385,8 @@ xfs_growfs_data_private(
382 XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), 385 XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
383 XFS_FSS_TO_BB(mp, 1), 0, &bp); 386 XFS_FSS_TO_BB(mp, 1), 0, &bp);
384 if (error) { 387 if (error) {
385 xfs_fs_cmn_err(CE_WARN, mp, 388 xfs_warn(mp,
386 "error %d reading secondary superblock for ag %d", 389 "error %d reading secondary superblock for ag %d",
387 error, agno); 390 error, agno);
388 break; 391 break;
389 } 392 }
@@ -396,7 +399,7 @@ xfs_growfs_data_private(
396 if (!(error = xfs_bwrite(mp, bp))) { 399 if (!(error = xfs_bwrite(mp, bp))) {
397 continue; 400 continue;
398 } else { 401 } else {
399 xfs_fs_cmn_err(CE_WARN, mp, 402 xfs_warn(mp,
400 "write error %d updating secondary superblock for ag %d", 403 "write error %d updating secondary superblock for ag %d",
401 error, agno); 404 error, agno);
402 break; /* no point in continuing */ 405 break; /* no point in continuing */
@@ -597,7 +600,8 @@ out:
597 * the extra reserve blocks from the reserve..... 600 * the extra reserve blocks from the reserve.....
598 */ 601 */
599 int error; 602 int error;
600 error = xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, fdblks_delta, 0); 603 error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
604 fdblks_delta, 0);
601 if (error == ENOSPC) 605 if (error == ENOSPC)
602 goto retry; 606 goto retry;
603 } 607 }
@@ -611,12 +615,13 @@ out:
611 * 615 *
612 * We cannot use an inode here for this - that will push dirty state back up 616 * We cannot use an inode here for this - that will push dirty state back up
613 * into the VFS and then periodic inode flushing will prevent log covering from 617 * into the VFS and then periodic inode flushing will prevent log covering from
614 * making progress. Hence we log a field in the superblock instead. 618 * making progress. Hence we log a field in the superblock instead and use a
619 * synchronous transaction to ensure the superblock is immediately unpinned
620 * and can be written back.
615 */ 621 */
616int 622int
617xfs_fs_log_dummy( 623xfs_fs_log_dummy(
618 xfs_mount_t *mp, 624 xfs_mount_t *mp)
619 int flags)
620{ 625{
621 xfs_trans_t *tp; 626 xfs_trans_t *tp;
622 int error; 627 int error;
@@ -631,8 +636,7 @@ xfs_fs_log_dummy(
631 636
632 /* log the UUID because it is an unchanging field */ 637 /* log the UUID because it is an unchanging field */
633 xfs_mod_sb(tp, XFS_SB_UUID); 638 xfs_mod_sb(tp, XFS_SB_UUID);
634 if (flags & SYNC_WAIT) 639 xfs_trans_set_sync(tp);
635 xfs_trans_set_sync(tp);
636 return xfs_trans_commit(tp, 0); 640 return xfs_trans_commit(tp, 0);
637} 641}
638 642
diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h
index a786c5212c1e..1b6a98b66886 100644
--- a/fs/xfs/xfs_fsops.h
+++ b/fs/xfs/xfs_fsops.h
@@ -25,6 +25,6 @@ extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt);
25extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval, 25extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval,
26 xfs_fsop_resblks_t *outval); 26 xfs_fsop_resblks_t *outval);
27extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags); 27extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags);
28extern int xfs_fs_log_dummy(xfs_mount_t *mp, int flags); 28extern int xfs_fs_log_dummy(struct xfs_mount *mp);
29 29
30#endif /* __XFS_FSOPS_H__ */ 30#endif /* __XFS_FSOPS_H__ */
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 5371d2dc360e..84ebeec16642 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -212,7 +212,7 @@ xfs_ialloc_inode_init(
212 * to log a whole cluster of inodes instead of all the 212 * to log a whole cluster of inodes instead of all the
213 * individual transactions causing a lot of log traffic. 213 * individual transactions causing a lot of log traffic.
214 */ 214 */
215 xfs_biozero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog); 215 xfs_buf_zero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog);
216 for (i = 0; i < ninodes; i++) { 216 for (i = 0; i < ninodes; i++) {
217 int ioffset = i << mp->m_sb.sb_inodelog; 217 int ioffset = i << mp->m_sb.sb_inodelog;
218 uint isize = sizeof(struct xfs_dinode); 218 uint isize = sizeof(struct xfs_dinode);
@@ -1055,28 +1055,23 @@ xfs_difree(
1055 */ 1055 */
1056 agno = XFS_INO_TO_AGNO(mp, inode); 1056 agno = XFS_INO_TO_AGNO(mp, inode);
1057 if (agno >= mp->m_sb.sb_agcount) { 1057 if (agno >= mp->m_sb.sb_agcount) {
1058 cmn_err(CE_WARN, 1058 xfs_warn(mp, "%s: agno >= mp->m_sb.sb_agcount (%d >= %d).",
1059 "xfs_difree: agno >= mp->m_sb.sb_agcount (%d >= %d) on %s. Returning EINVAL.", 1059 __func__, agno, mp->m_sb.sb_agcount);
1060 agno, mp->m_sb.sb_agcount, mp->m_fsname);
1061 ASSERT(0); 1060 ASSERT(0);
1062 return XFS_ERROR(EINVAL); 1061 return XFS_ERROR(EINVAL);
1063 } 1062 }
1064 agino = XFS_INO_TO_AGINO(mp, inode); 1063 agino = XFS_INO_TO_AGINO(mp, inode);
1065 if (inode != XFS_AGINO_TO_INO(mp, agno, agino)) { 1064 if (inode != XFS_AGINO_TO_INO(mp, agno, agino)) {
1066 cmn_err(CE_WARN, 1065 xfs_warn(mp, "%s: inode != XFS_AGINO_TO_INO() (%llu != %llu).",
1067 "xfs_difree: inode != XFS_AGINO_TO_INO() " 1066 __func__, (unsigned long long)inode,
1068 "(%llu != %llu) on %s. Returning EINVAL.", 1067 (unsigned long long)XFS_AGINO_TO_INO(mp, agno, agino));
1069 (unsigned long long)inode,
1070 (unsigned long long)XFS_AGINO_TO_INO(mp, agno, agino),
1071 mp->m_fsname);
1072 ASSERT(0); 1068 ASSERT(0);
1073 return XFS_ERROR(EINVAL); 1069 return XFS_ERROR(EINVAL);
1074 } 1070 }
1075 agbno = XFS_AGINO_TO_AGBNO(mp, agino); 1071 agbno = XFS_AGINO_TO_AGBNO(mp, agino);
1076 if (agbno >= mp->m_sb.sb_agblocks) { 1072 if (agbno >= mp->m_sb.sb_agblocks) {
1077 cmn_err(CE_WARN, 1073 xfs_warn(mp, "%s: agbno >= mp->m_sb.sb_agblocks (%d >= %d).",
1078 "xfs_difree: agbno >= mp->m_sb.sb_agblocks (%d >= %d) on %s. Returning EINVAL.", 1074 __func__, agbno, mp->m_sb.sb_agblocks);
1079 agbno, mp->m_sb.sb_agblocks, mp->m_fsname);
1080 ASSERT(0); 1075 ASSERT(0);
1081 return XFS_ERROR(EINVAL); 1076 return XFS_ERROR(EINVAL);
1082 } 1077 }
@@ -1085,9 +1080,8 @@ xfs_difree(
1085 */ 1080 */
1086 error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); 1081 error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
1087 if (error) { 1082 if (error) {
1088 cmn_err(CE_WARN, 1083 xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.",
1089 "xfs_difree: xfs_ialloc_read_agi() returned an error %d on %s. Returning error.", 1084 __func__, error);
1090 error, mp->m_fsname);
1091 return error; 1085 return error;
1092 } 1086 }
1093 agi = XFS_BUF_TO_AGI(agbp); 1087 agi = XFS_BUF_TO_AGI(agbp);
@@ -1106,17 +1100,15 @@ xfs_difree(
1106 * Look for the entry describing this inode. 1100 * Look for the entry describing this inode.
1107 */ 1101 */
1108 if ((error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i))) { 1102 if ((error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i))) {
1109 cmn_err(CE_WARN, 1103 xfs_warn(mp, "%s: xfs_inobt_lookup() returned error %d.",
1110 "xfs_difree: xfs_inobt_lookup returned() an error %d on %s. Returning error.", 1104 __func__, error);
1111 error, mp->m_fsname);
1112 goto error0; 1105 goto error0;
1113 } 1106 }
1114 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1107 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1115 error = xfs_inobt_get_rec(cur, &rec, &i); 1108 error = xfs_inobt_get_rec(cur, &rec, &i);
1116 if (error) { 1109 if (error) {
1117 cmn_err(CE_WARN, 1110 xfs_warn(mp, "%s: xfs_inobt_get_rec() returned error %d.",
1118 "xfs_difree: xfs_inobt_get_rec() returned an error %d on %s. Returning error.", 1111 __func__, error);
1119 error, mp->m_fsname);
1120 goto error0; 1112 goto error0;
1121 } 1113 }
1122 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1114 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
@@ -1157,8 +1149,8 @@ xfs_difree(
1157 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1)); 1149 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1));
1158 1150
1159 if ((error = xfs_btree_delete(cur, &i))) { 1151 if ((error = xfs_btree_delete(cur, &i))) {
1160 cmn_err(CE_WARN, "xfs_difree: xfs_btree_delete returned an error %d on %s.\n", 1152 xfs_warn(mp, "%s: xfs_btree_delete returned error %d.",
1161 error, mp->m_fsname); 1153 __func__, error);
1162 goto error0; 1154 goto error0;
1163 } 1155 }
1164 1156
@@ -1170,9 +1162,8 @@ xfs_difree(
1170 1162
1171 error = xfs_inobt_update(cur, &rec); 1163 error = xfs_inobt_update(cur, &rec);
1172 if (error) { 1164 if (error) {
1173 cmn_err(CE_WARN, 1165 xfs_warn(mp, "%s: xfs_inobt_update returned error %d.",
1174 "xfs_difree: xfs_inobt_update returned an error %d on %s.", 1166 __func__, error);
1175 error, mp->m_fsname);
1176 goto error0; 1167 goto error0;
1177 } 1168 }
1178 1169
@@ -1218,10 +1209,9 @@ xfs_imap_lookup(
1218 1209
1219 error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); 1210 error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
1220 if (error) { 1211 if (error) {
1221 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: " 1212 xfs_alert(mp,
1222 "xfs_ialloc_read_agi() returned " 1213 "%s: xfs_ialloc_read_agi() returned error %d, agno %d",
1223 "error %d, agno %d", 1214 __func__, error, agno);
1224 error, agno);
1225 return error; 1215 return error;
1226 } 1216 }
1227 1217
@@ -1299,24 +1289,21 @@ xfs_imap(
1299 if (flags & XFS_IGET_UNTRUSTED) 1289 if (flags & XFS_IGET_UNTRUSTED)
1300 return XFS_ERROR(EINVAL); 1290 return XFS_ERROR(EINVAL);
1301 if (agno >= mp->m_sb.sb_agcount) { 1291 if (agno >= mp->m_sb.sb_agcount) {
1302 xfs_fs_cmn_err(CE_ALERT, mp, 1292 xfs_alert(mp,
1303 "xfs_imap: agno (%d) >= " 1293 "%s: agno (%d) >= mp->m_sb.sb_agcount (%d)",
1304 "mp->m_sb.sb_agcount (%d)", 1294 __func__, agno, mp->m_sb.sb_agcount);
1305 agno, mp->m_sb.sb_agcount);
1306 } 1295 }
1307 if (agbno >= mp->m_sb.sb_agblocks) { 1296 if (agbno >= mp->m_sb.sb_agblocks) {
1308 xfs_fs_cmn_err(CE_ALERT, mp, 1297 xfs_alert(mp,
1309 "xfs_imap: agbno (0x%llx) >= " 1298 "%s: agbno (0x%llx) >= mp->m_sb.sb_agblocks (0x%lx)",
1310 "mp->m_sb.sb_agblocks (0x%lx)", 1299 __func__, (unsigned long long)agbno,
1311 (unsigned long long) agbno, 1300 (unsigned long)mp->m_sb.sb_agblocks);
1312 (unsigned long) mp->m_sb.sb_agblocks);
1313 } 1301 }
1314 if (ino != XFS_AGINO_TO_INO(mp, agno, agino)) { 1302 if (ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
1315 xfs_fs_cmn_err(CE_ALERT, mp, 1303 xfs_alert(mp,
1316 "xfs_imap: ino (0x%llx) != " 1304 "%s: ino (0x%llx) != XFS_AGINO_TO_INO() (0x%llx)",
1317 "XFS_AGINO_TO_INO(mp, agno, agino) " 1305 __func__, ino,
1318 "(0x%llx)", 1306 XFS_AGINO_TO_INO(mp, agno, agino));
1319 ino, XFS_AGINO_TO_INO(mp, agno, agino));
1320 } 1307 }
1321 xfs_stack_trace(); 1308 xfs_stack_trace();
1322#endif /* DEBUG */ 1309#endif /* DEBUG */
@@ -1388,10 +1375,9 @@ out_map:
1388 */ 1375 */
1389 if ((imap->im_blkno + imap->im_len) > 1376 if ((imap->im_blkno + imap->im_len) >
1390 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) { 1377 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
1391 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: " 1378 xfs_alert(mp,
1392 "(imap->im_blkno (0x%llx) + imap->im_len (0x%llx)) > " 1379 "%s: (im_blkno (0x%llx) + im_len (0x%llx)) > sb_dblocks (0x%llx)",
1393 " XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) (0x%llx)", 1380 __func__, (unsigned long long) imap->im_blkno,
1394 (unsigned long long) imap->im_blkno,
1395 (unsigned long long) imap->im_len, 1381 (unsigned long long) imap->im_len,
1396 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)); 1382 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
1397 return XFS_ERROR(EINVAL); 1383 return XFS_ERROR(EINVAL);
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c
index d352862cefa0..16921f55c542 100644
--- a/fs/xfs/xfs_ialloc_btree.c
+++ b/fs/xfs/xfs_ialloc_btree.c
@@ -183,38 +183,6 @@ xfs_inobt_key_diff(
183 cur->bc_rec.i.ir_startino; 183 cur->bc_rec.i.ir_startino;
184} 184}
185 185
186STATIC int
187xfs_inobt_kill_root(
188 struct xfs_btree_cur *cur,
189 struct xfs_buf *bp,
190 int level,
191 union xfs_btree_ptr *newroot)
192{
193 int error;
194
195 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
196 XFS_BTREE_STATS_INC(cur, killroot);
197
198 /*
199 * Update the root pointer, decreasing the level by 1 and then
200 * free the old root.
201 */
202 xfs_inobt_set_root(cur, newroot, -1);
203 error = xfs_inobt_free_block(cur, bp);
204 if (error) {
205 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
206 return error;
207 }
208
209 XFS_BTREE_STATS_INC(cur, free);
210
211 cur->bc_bufs[level] = NULL;
212 cur->bc_nlevels--;
213
214 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
215 return 0;
216}
217
218#ifdef DEBUG 186#ifdef DEBUG
219STATIC int 187STATIC int
220xfs_inobt_keys_inorder( 188xfs_inobt_keys_inorder(
@@ -309,7 +277,6 @@ static const struct xfs_btree_ops xfs_inobt_ops = {
309 277
310 .dup_cursor = xfs_inobt_dup_cursor, 278 .dup_cursor = xfs_inobt_dup_cursor,
311 .set_root = xfs_inobt_set_root, 279 .set_root = xfs_inobt_set_root,
312 .kill_root = xfs_inobt_kill_root,
313 .alloc_block = xfs_inobt_alloc_block, 280 .alloc_block = xfs_inobt_alloc_block,
314 .free_block = xfs_inobt_free_block, 281 .free_block = xfs_inobt_free_block,
315 .get_minrecs = xfs_inobt_get_minrecs, 282 .get_minrecs = xfs_inobt_get_minrecs,
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index b1ecc6f97ade..3631783b2b53 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -43,6 +43,17 @@
43 43
44 44
45/* 45/*
46 * Define xfs inode iolock lockdep classes. We need to ensure that all active
47 * inodes are considered the same for lockdep purposes, including inodes that
48 * are recycled through the XFS_IRECLAIMABLE state. This is the the only way to
49 * guarantee the locks are considered the same when there are multiple lock
50 * initialisation siteѕ. Also, define a reclaimable inode class so it is
51 * obvious in lockdep reports which class the report is against.
52 */
53static struct lock_class_key xfs_iolock_active;
54struct lock_class_key xfs_iolock_reclaimable;
55
56/*
46 * Allocate and initialise an xfs_inode. 57 * Allocate and initialise an xfs_inode.
47 */ 58 */
48STATIC struct xfs_inode * 59STATIC struct xfs_inode *
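The comment above spells out why the iolock's lockdep class is set explicitly: an inode recycled out of the XFS_IRECLAIMABLE state re-initialises its mrlock, and unless every initialisation site pins the lock to one class, lockdep treats recycled and freshly allocated inodes as different locks. The pattern used at both initialisation sites in this patch (the separate xfs_iolock_reclaimable class is applied elsewhere, to inodes parked for reclaim):

	static struct lock_class_key xfs_iolock_active;	/* all live inodes */

	/* after every mrlock_init() of the iolock, pin it to the shared class */
	mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
	lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
				   &xfs_iolock_active, "xfs_iolock_active");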
@@ -69,8 +80,11 @@ xfs_inode_alloc(
69 ASSERT(atomic_read(&ip->i_pincount) == 0); 80 ASSERT(atomic_read(&ip->i_pincount) == 0);
70 ASSERT(!spin_is_locked(&ip->i_flags_lock)); 81 ASSERT(!spin_is_locked(&ip->i_flags_lock));
71 ASSERT(completion_done(&ip->i_flush)); 82 ASSERT(completion_done(&ip->i_flush));
83 ASSERT(ip->i_ino == 0);
72 84
73 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); 85 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
86 lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
87 &xfs_iolock_active, "xfs_iolock_active");
74 88
75 /* initialise the xfs inode */ 89 /* initialise the xfs inode */
76 ip->i_ino = ino; 90 ip->i_ino = ino;
@@ -85,12 +99,20 @@ xfs_inode_alloc(
85 ip->i_size = 0; 99 ip->i_size = 0;
86 ip->i_new_size = 0; 100 ip->i_new_size = 0;
87 101
88 /* prevent anyone from using this yet */
89 VFS_I(ip)->i_state = I_NEW;
90
91 return ip; 102 return ip;
92} 103}
93 104
105STATIC void
106xfs_inode_free_callback(
107 struct rcu_head *head)
108{
109 struct inode *inode = container_of(head, struct inode, i_rcu);
110 struct xfs_inode *ip = XFS_I(inode);
111
112 INIT_LIST_HEAD(&inode->i_dentry);
113 kmem_zone_free(xfs_inode_zone, ip);
114}
115
94void 116void
95xfs_inode_free( 117xfs_inode_free(
96 struct xfs_inode *ip) 118 struct xfs_inode *ip)
@@ -134,7 +156,18 @@ xfs_inode_free(
134 ASSERT(!spin_is_locked(&ip->i_flags_lock)); 156 ASSERT(!spin_is_locked(&ip->i_flags_lock));
135 ASSERT(completion_done(&ip->i_flush)); 157 ASSERT(completion_done(&ip->i_flush));
136 158
137 kmem_zone_free(xfs_inode_zone, ip); 159 /*
160 * Because we use RCU freeing we need to ensure the inode always
161 * appears to be reclaimed with an invalid inode number when in the
162 * free state. The ip->i_flags_lock provides the barrier against lookup
163 * races.
164 */
165 spin_lock(&ip->i_flags_lock);
166 ip->i_flags = XFS_IRECLAIM;
167 ip->i_ino = 0;
168 spin_unlock(&ip->i_flags_lock);
169
170 call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
138} 171}
139 172
140/* 173/*
@@ -144,14 +177,29 @@ static int
144xfs_iget_cache_hit( 177xfs_iget_cache_hit(
145 struct xfs_perag *pag, 178 struct xfs_perag *pag,
146 struct xfs_inode *ip, 179 struct xfs_inode *ip,
180 xfs_ino_t ino,
147 int flags, 181 int flags,
148 int lock_flags) __releases(pag->pag_ici_lock) 182 int lock_flags) __releases(RCU)
149{ 183{
150 struct inode *inode = VFS_I(ip); 184 struct inode *inode = VFS_I(ip);
151 struct xfs_mount *mp = ip->i_mount; 185 struct xfs_mount *mp = ip->i_mount;
152 int error; 186 int error;
153 187
188 /*
189 * check for re-use of an inode within an RCU grace period due to the
190 * radix tree nodes not being updated yet. We monitor for this by
191 * setting the inode number to zero before freeing the inode structure.
192 * If the inode has been reallocated and set up, then the inode number
193 * will not match, so check for that, too.
194 */
154 spin_lock(&ip->i_flags_lock); 195 spin_lock(&ip->i_flags_lock);
196 if (ip->i_ino != ino) {
197 trace_xfs_iget_skip(ip);
198 XFS_STATS_INC(xs_ig_frecycle);
199 error = EAGAIN;
200 goto out_error;
201 }
202
155 203
156 /* 204 /*
157 * If we are racing with another cache hit that is currently 205 * If we are racing with another cache hit that is currently
@@ -194,7 +242,7 @@ xfs_iget_cache_hit(
194 ip->i_flags |= XFS_IRECLAIM; 242 ip->i_flags |= XFS_IRECLAIM;
195 243
196 spin_unlock(&ip->i_flags_lock); 244 spin_unlock(&ip->i_flags_lock);
197 read_unlock(&pag->pag_ici_lock); 245 rcu_read_unlock();
198 246
199 error = -inode_init_always(mp->m_super, inode); 247 error = -inode_init_always(mp->m_super, inode);
200 if (error) { 248 if (error) {
@@ -202,24 +250,35 @@ xfs_iget_cache_hit(
202 * Re-initializing the inode failed, and we are in deep 250 * Re-initializing the inode failed, and we are in deep
203 * trouble. Try to re-add it to the reclaim list. 251 * trouble. Try to re-add it to the reclaim list.
204 */ 252 */
205 read_lock(&pag->pag_ici_lock); 253 rcu_read_lock();
206 spin_lock(&ip->i_flags_lock); 254 spin_lock(&ip->i_flags_lock);
207 255
208 ip->i_flags &= ~XFS_INEW; 256 ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM);
209 ip->i_flags |= XFS_IRECLAIMABLE; 257 ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
210 __xfs_inode_set_reclaim_tag(pag, ip);
211 trace_xfs_iget_reclaim_fail(ip); 258 trace_xfs_iget_reclaim_fail(ip);
212 goto out_error; 259 goto out_error;
213 } 260 }
214 261
215 write_lock(&pag->pag_ici_lock); 262 spin_lock(&pag->pag_ici_lock);
216 spin_lock(&ip->i_flags_lock); 263 spin_lock(&ip->i_flags_lock);
217 ip->i_flags &= ~(XFS_IRECLAIMABLE | XFS_IRECLAIM); 264
265 /*
266 * Clear the per-lifetime state in the inode as we are now
267 * effectively a new inode and need to return to the initial
268 * state before reuse occurs.
269 */
270 ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
218 ip->i_flags |= XFS_INEW; 271 ip->i_flags |= XFS_INEW;
219 __xfs_inode_clear_reclaim_tag(mp, pag, ip); 272 __xfs_inode_clear_reclaim_tag(mp, pag, ip);
220 inode->i_state = I_NEW; 273 inode->i_state = I_NEW;
274
275 ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
276 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
277 lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
278 &xfs_iolock_active, "xfs_iolock_active");
279
221 spin_unlock(&ip->i_flags_lock); 280 spin_unlock(&ip->i_flags_lock);
222 write_unlock(&pag->pag_ici_lock); 281 spin_unlock(&pag->pag_ici_lock);
223 } else { 282 } else {
224 /* If the VFS inode is being torn down, pause and try again. */ 283 /* If the VFS inode is being torn down, pause and try again. */
225 if (!igrab(inode)) { 284 if (!igrab(inode)) {
@@ -230,7 +289,7 @@ xfs_iget_cache_hit(
230 289
231 /* We've got a live one. */ 290 /* We've got a live one. */
232 spin_unlock(&ip->i_flags_lock); 291 spin_unlock(&ip->i_flags_lock);
233 read_unlock(&pag->pag_ici_lock); 292 rcu_read_unlock();
234 trace_xfs_iget_hit(ip); 293 trace_xfs_iget_hit(ip);
235 } 294 }
236 295
@@ -244,7 +303,7 @@ xfs_iget_cache_hit(
244 303
245out_error: 304out_error:
246 spin_unlock(&ip->i_flags_lock); 305 spin_unlock(&ip->i_flags_lock);
247 read_unlock(&pag->pag_ici_lock); 306 rcu_read_unlock();
248 return error; 307 return error;
249} 308}
250 309
@@ -297,7 +356,7 @@ xfs_iget_cache_miss(
297 BUG(); 356 BUG();
298 } 357 }
299 358
300 write_lock(&pag->pag_ici_lock); 359 spin_lock(&pag->pag_ici_lock);
301 360
302 /* insert the new inode */ 361 /* insert the new inode */
303 error = radix_tree_insert(&pag->pag_ici_root, agino, ip); 362 error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
@@ -312,14 +371,14 @@ xfs_iget_cache_miss(
312 ip->i_udquot = ip->i_gdquot = NULL; 371 ip->i_udquot = ip->i_gdquot = NULL;
313 xfs_iflags_set(ip, XFS_INEW); 372 xfs_iflags_set(ip, XFS_INEW);
314 373
315 write_unlock(&pag->pag_ici_lock); 374 spin_unlock(&pag->pag_ici_lock);
316 radix_tree_preload_end(); 375 radix_tree_preload_end();
317 376
318 *ipp = ip; 377 *ipp = ip;
319 return 0; 378 return 0;
320 379
321out_preload_end: 380out_preload_end:
322 write_unlock(&pag->pag_ici_lock); 381 spin_unlock(&pag->pag_ici_lock);
323 radix_tree_preload_end(); 382 radix_tree_preload_end();
324 if (lock_flags) 383 if (lock_flags)
325 xfs_iunlock(ip, lock_flags); 384 xfs_iunlock(ip, lock_flags);
@@ -365,8 +424,8 @@ xfs_iget(
365 xfs_perag_t *pag; 424 xfs_perag_t *pag;
366 xfs_agino_t agino; 425 xfs_agino_t agino;
367 426
368 /* the radix tree exists only in inode capable AGs */ 427 /* reject inode numbers outside existing AGs */
369 if (XFS_INO_TO_AGNO(mp, ino) >= mp->m_maxagi) 428 if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
370 return EINVAL; 429 return EINVAL;
371 430
372 /* get the perag structure and ensure that it's inode capable */ 431 /* get the perag structure and ensure that it's inode capable */
@@ -375,15 +434,15 @@ xfs_iget(
375 434
376again: 435again:
377 error = 0; 436 error = 0;
378 read_lock(&pag->pag_ici_lock); 437 rcu_read_lock();
379 ip = radix_tree_lookup(&pag->pag_ici_root, agino); 438 ip = radix_tree_lookup(&pag->pag_ici_root, agino);
380 439
381 if (ip) { 440 if (ip) {
382 error = xfs_iget_cache_hit(pag, ip, flags, lock_flags); 441 error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
383 if (error) 442 if (error)
384 goto out_error_or_again; 443 goto out_error_or_again;
385 } else { 444 } else {
386 read_unlock(&pag->pag_ici_lock); 445 rcu_read_unlock();
387 XFS_STATS_INC(xs_ig_missed); 446 XFS_STATS_INC(xs_ig_missed);
388 447
389 error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip, 448 error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
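
The pattern introduced above replaces the pag_ici_lock read lock with rcu_read_lock() around the radix tree lookup and compensates by zeroing ip->i_ino when an inode is freed, so every cache hit must re-validate the inode number under ip->i_flags_lock and return EAGAIN if it changed. A rough user-space analogue of that lockless-lookup-then-revalidate idea, using pthreads and a plain slot table instead of RCU and a radix tree (all names below are invented for illustration, not XFS code):

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical cached object; id == 0 means "freed, do not trust". */
struct obj {
	pthread_mutex_t	lock;		/* plays the role of i_flags_lock */
	uint64_t	id;		/* plays the role of i_ino */
	int		data;
};

#define NSLOTS	16
static struct obj table[NSLOTS];	/* stands in for the radix tree */

/*
 * Lockless lookup followed by revalidation: the slot may have been freed
 * or reused between the unlocked read and the lock acquisition, so confirm
 * the id still matches before trusting the object.  Returns 0 on a hit,
 * -1 if the caller should retry (the EAGAIN case in xfs_iget_cache_hit).
 */
static int lookup(uint64_t id, int *out)
{
	struct obj *o = &table[id % NSLOTS];	/* no lock held here */
	int ret = -1;

	pthread_mutex_lock(&o->lock);
	if (o->id == id) {			/* still the object we wanted */
		*out = o->data;
		ret = 0;
	}
	pthread_mutex_unlock(&o->lock);
	return ret;
}

static void free_obj(struct obj *o)
{
	pthread_mutex_lock(&o->lock);
	o->id = 0;				/* invalidate before reuse */
	pthread_mutex_unlock(&o->lock);
}

int main(void)
{
	int v, i;

	for (i = 0; i < NSLOTS; i++) {
		pthread_mutex_init(&table[i].lock, NULL);
		table[i].id = (uint64_t)i + NSLOTS;	/* ids 16..31 map to slot i */
		table[i].data = i;
	}
	printf("lookup(21) -> %s\n", lookup(21, &v) ? "retry" : "hit");
	free_obj(&table[21 % NSLOTS]);
	printf("lookup(21) -> %s\n", lookup(21, &v) ? "retry" : "hit");
	return 0;
}

The point is only that the cheap lookup is allowed to return stale pointers; the per-object lock plus identity check is what makes the result trustworthy.
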
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 34798f391c49..a098a20ca63e 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -110,8 +110,8 @@ xfs_inobp_check(
110 dip = (xfs_dinode_t *)xfs_buf_offset(bp, 110 dip = (xfs_dinode_t *)xfs_buf_offset(bp,
111 i * mp->m_sb.sb_inodesize); 111 i * mp->m_sb.sb_inodesize);
112 if (!dip->di_next_unlinked) { 112 if (!dip->di_next_unlinked) {
113 xfs_fs_cmn_err(CE_ALERT, mp, 113 xfs_alert(mp,
114 "Detected a bogus zero next_unlinked field in incore inode buffer 0x%p. About to pop an ASSERT.", 114 "Detected bogus zero next_unlinked field in incore inode buffer 0x%p.",
115 bp); 115 bp);
116 ASSERT(dip->di_next_unlinked); 116 ASSERT(dip->di_next_unlinked);
117 } 117 }
@@ -142,10 +142,9 @@ xfs_imap_to_bp(
142 (int)imap->im_len, buf_flags, &bp); 142 (int)imap->im_len, buf_flags, &bp);
143 if (error) { 143 if (error) {
144 if (error != EAGAIN) { 144 if (error != EAGAIN) {
145 cmn_err(CE_WARN, 145 xfs_warn(mp,
146 "xfs_imap_to_bp: xfs_trans_read_buf()returned " 146 "%s: xfs_trans_read_buf() returned error %d.",
147 "an error %d on %s. Returning error.", 147 __func__, error);
148 error, mp->m_fsname);
149 } else { 148 } else {
150 ASSERT(buf_flags & XBF_TRYLOCK); 149 ASSERT(buf_flags & XBF_TRYLOCK);
151 } 150 }
@@ -180,12 +179,11 @@ xfs_imap_to_bp(
180 XFS_CORRUPTION_ERROR("xfs_imap_to_bp", 179 XFS_CORRUPTION_ERROR("xfs_imap_to_bp",
181 XFS_ERRLEVEL_HIGH, mp, dip); 180 XFS_ERRLEVEL_HIGH, mp, dip);
182#ifdef DEBUG 181#ifdef DEBUG
183 cmn_err(CE_PANIC, 182 xfs_emerg(mp,
184 "Device %s - bad inode magic/vsn " 183 "bad inode magic/vsn daddr %lld #%d (magic=%x)",
185 "daddr %lld #%d (magic=%x)",
186 XFS_BUFTARG_NAME(mp->m_ddev_targp),
187 (unsigned long long)imap->im_blkno, i, 184 (unsigned long long)imap->im_blkno, i,
188 be16_to_cpu(dip->di_magic)); 185 be16_to_cpu(dip->di_magic));
186 ASSERT(0);
189#endif 187#endif
190 xfs_trans_brelse(tp, bp); 188 xfs_trans_brelse(tp, bp);
191 return XFS_ERROR(EFSCORRUPTED); 189 return XFS_ERROR(EFSCORRUPTED);
@@ -317,7 +315,7 @@ xfs_iformat(
317 if (unlikely(be32_to_cpu(dip->di_nextents) + 315 if (unlikely(be32_to_cpu(dip->di_nextents) +
318 be16_to_cpu(dip->di_anextents) > 316 be16_to_cpu(dip->di_anextents) >
319 be64_to_cpu(dip->di_nblocks))) { 317 be64_to_cpu(dip->di_nblocks))) {
320 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 318 xfs_warn(ip->i_mount,
321 "corrupt dinode %Lu, extent total = %d, nblocks = %Lu.", 319 "corrupt dinode %Lu, extent total = %d, nblocks = %Lu.",
322 (unsigned long long)ip->i_ino, 320 (unsigned long long)ip->i_ino,
323 (int)(be32_to_cpu(dip->di_nextents) + 321 (int)(be32_to_cpu(dip->di_nextents) +
@@ -330,8 +328,7 @@ xfs_iformat(
330 } 328 }
331 329
332 if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) { 330 if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) {
333 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 331 xfs_warn(ip->i_mount, "corrupt dinode %Lu, forkoff = 0x%x.",
334 "corrupt dinode %Lu, forkoff = 0x%x.",
335 (unsigned long long)ip->i_ino, 332 (unsigned long long)ip->i_ino,
336 dip->di_forkoff); 333 dip->di_forkoff);
337 XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW, 334 XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW,
@@ -341,7 +338,7 @@ xfs_iformat(
341 338
342 if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) && 339 if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) &&
343 !ip->i_mount->m_rtdev_targp)) { 340 !ip->i_mount->m_rtdev_targp)) {
344 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 341 xfs_warn(ip->i_mount,
345 "corrupt dinode %Lu, has realtime flag set.", 342 "corrupt dinode %Lu, has realtime flag set.",
346 ip->i_ino); 343 ip->i_ino);
347 XFS_CORRUPTION_ERROR("xfs_iformat(realtime)", 344 XFS_CORRUPTION_ERROR("xfs_iformat(realtime)",
@@ -373,9 +370,8 @@ xfs_iformat(
373 * no local regular files yet 370 * no local regular files yet
374 */ 371 */
375 if (unlikely((be16_to_cpu(dip->di_mode) & S_IFMT) == S_IFREG)) { 372 if (unlikely((be16_to_cpu(dip->di_mode) & S_IFMT) == S_IFREG)) {
376 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 373 xfs_warn(ip->i_mount,
377 "corrupt inode %Lu " 374 "corrupt inode %Lu (local format for regular file).",
378 "(local format for regular file).",
379 (unsigned long long) ip->i_ino); 375 (unsigned long long) ip->i_ino);
380 XFS_CORRUPTION_ERROR("xfs_iformat(4)", 376 XFS_CORRUPTION_ERROR("xfs_iformat(4)",
381 XFS_ERRLEVEL_LOW, 377 XFS_ERRLEVEL_LOW,
@@ -385,9 +381,8 @@ xfs_iformat(
385 381
386 di_size = be64_to_cpu(dip->di_size); 382 di_size = be64_to_cpu(dip->di_size);
387 if (unlikely(di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) { 383 if (unlikely(di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) {
388 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 384 xfs_warn(ip->i_mount,
389 "corrupt inode %Lu " 385 "corrupt inode %Lu (bad size %Ld for local inode).",
390 "(bad size %Ld for local inode).",
391 (unsigned long long) ip->i_ino, 386 (unsigned long long) ip->i_ino,
392 (long long) di_size); 387 (long long) di_size);
393 XFS_CORRUPTION_ERROR("xfs_iformat(5)", 388 XFS_CORRUPTION_ERROR("xfs_iformat(5)",
@@ -431,9 +426,8 @@ xfs_iformat(
431 size = be16_to_cpu(atp->hdr.totsize); 426 size = be16_to_cpu(atp->hdr.totsize);
432 427
433 if (unlikely(size < sizeof(struct xfs_attr_sf_hdr))) { 428 if (unlikely(size < sizeof(struct xfs_attr_sf_hdr))) {
434 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 429 xfs_warn(ip->i_mount,
435 "corrupt inode %Lu " 430 "corrupt inode %Lu (bad attr fork size %Ld).",
436 "(bad attr fork size %Ld).",
437 (unsigned long long) ip->i_ino, 431 (unsigned long long) ip->i_ino,
438 (long long) size); 432 (long long) size);
439 XFS_CORRUPTION_ERROR("xfs_iformat(8)", 433 XFS_CORRUPTION_ERROR("xfs_iformat(8)",
@@ -488,9 +482,8 @@ xfs_iformat_local(
488 * kmem_alloc() or memcpy() below. 482 * kmem_alloc() or memcpy() below.
489 */ 483 */
490 if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) { 484 if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
491 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 485 xfs_warn(ip->i_mount,
492 "corrupt inode %Lu " 486 "corrupt inode %Lu (bad size %d for local fork, size = %d).",
493 "(bad size %d for local fork, size = %d).",
494 (unsigned long long) ip->i_ino, size, 487 (unsigned long long) ip->i_ino, size,
495 XFS_DFORK_SIZE(dip, ip->i_mount, whichfork)); 488 XFS_DFORK_SIZE(dip, ip->i_mount, whichfork));
496 XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW, 489 XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW,
@@ -547,8 +540,7 @@ xfs_iformat_extents(
547 * kmem_alloc() or memcpy() below. 540 * kmem_alloc() or memcpy() below.
548 */ 541 */
549 if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) { 542 if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
550 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 543 xfs_warn(ip->i_mount, "corrupt inode %Lu ((a)extents = %d).",
551 "corrupt inode %Lu ((a)extents = %d).",
552 (unsigned long long) ip->i_ino, nex); 544 (unsigned long long) ip->i_ino, nex);
553 XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW, 545 XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW,
554 ip->i_mount, dip); 546 ip->i_mount, dip);
@@ -623,11 +615,10 @@ xfs_iformat_btree(
623 || XFS_BMDR_SPACE_CALC(nrecs) > 615 || XFS_BMDR_SPACE_CALC(nrecs) >
624 XFS_DFORK_SIZE(dip, ip->i_mount, whichfork) 616 XFS_DFORK_SIZE(dip, ip->i_mount, whichfork)
625 || XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) { 617 || XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) {
626 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 618 xfs_warn(ip->i_mount, "corrupt inode %Lu (btree).",
627 "corrupt inode %Lu (btree).",
628 (unsigned long long) ip->i_ino); 619 (unsigned long long) ip->i_ino);
629 XFS_ERROR_REPORT("xfs_iformat_btree", XFS_ERRLEVEL_LOW, 620 XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW,
630 ip->i_mount); 621 ip->i_mount, dip);
631 return XFS_ERROR(EFSCORRUPTED); 622 return XFS_ERROR(EFSCORRUPTED);
632 } 623 }
633 624
@@ -660,7 +651,8 @@ xfs_dinode_from_disk(
660 to->di_uid = be32_to_cpu(from->di_uid); 651 to->di_uid = be32_to_cpu(from->di_uid);
661 to->di_gid = be32_to_cpu(from->di_gid); 652 to->di_gid = be32_to_cpu(from->di_gid);
662 to->di_nlink = be32_to_cpu(from->di_nlink); 653 to->di_nlink = be32_to_cpu(from->di_nlink);
663 to->di_projid = be16_to_cpu(from->di_projid); 654 to->di_projid_lo = be16_to_cpu(from->di_projid_lo);
655 to->di_projid_hi = be16_to_cpu(from->di_projid_hi);
664 memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); 656 memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
665 to->di_flushiter = be16_to_cpu(from->di_flushiter); 657 to->di_flushiter = be16_to_cpu(from->di_flushiter);
666 to->di_atime.t_sec = be32_to_cpu(from->di_atime.t_sec); 658 to->di_atime.t_sec = be32_to_cpu(from->di_atime.t_sec);
@@ -695,7 +687,8 @@ xfs_dinode_to_disk(
695 to->di_uid = cpu_to_be32(from->di_uid); 687 to->di_uid = cpu_to_be32(from->di_uid);
696 to->di_gid = cpu_to_be32(from->di_gid); 688 to->di_gid = cpu_to_be32(from->di_gid);
697 to->di_nlink = cpu_to_be32(from->di_nlink); 689 to->di_nlink = cpu_to_be32(from->di_nlink);
698 to->di_projid = cpu_to_be16(from->di_projid); 690 to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
691 to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
699 memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); 692 memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
700 to->di_flushiter = cpu_to_be16(from->di_flushiter); 693 to->di_flushiter = cpu_to_be16(from->di_flushiter);
701 to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec); 694 to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec);
@@ -811,11 +804,9 @@ xfs_iread(
811 */ 804 */
812 if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC) { 805 if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC) {
813#ifdef DEBUG 806#ifdef DEBUG
814 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: " 807 xfs_alert(mp,
815 "dip->di_magic (0x%x) != " 808 "%s: dip->di_magic (0x%x) != XFS_DINODE_MAGIC (0x%x)",
816 "XFS_DINODE_MAGIC (0x%x)", 809 __func__, be16_to_cpu(dip->di_magic), XFS_DINODE_MAGIC);
817 be16_to_cpu(dip->di_magic),
818 XFS_DINODE_MAGIC);
819#endif /* DEBUG */ 810#endif /* DEBUG */
820 error = XFS_ERROR(EINVAL); 811 error = XFS_ERROR(EINVAL);
821 goto out_brelse; 812 goto out_brelse;
@@ -833,9 +824,8 @@ xfs_iread(
833 error = xfs_iformat(ip, dip); 824 error = xfs_iformat(ip, dip);
834 if (error) { 825 if (error) {
835#ifdef DEBUG 826#ifdef DEBUG
836 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: " 827 xfs_alert(mp, "%s: xfs_iformat() returned error %d",
837 "xfs_iformat() returned error %d", 828 __func__, error);
838 error);
839#endif /* DEBUG */ 829#endif /* DEBUG */
840 goto out_brelse; 830 goto out_brelse;
841 } 831 }
@@ -874,7 +864,7 @@ xfs_iread(
874 if (ip->i_d.di_version == 1) { 864 if (ip->i_d.di_version == 1) {
875 ip->i_d.di_nlink = ip->i_d.di_onlink; 865 ip->i_d.di_nlink = ip->i_d.di_onlink;
876 ip->i_d.di_onlink = 0; 866 ip->i_d.di_onlink = 0;
877 ip->i_d.di_projid = 0; 867 xfs_set_projid(ip, 0);
878 } 868 }
879 869
880 ip->i_delayed_blks = 0; 870 ip->i_delayed_blks = 0;
@@ -885,7 +875,7 @@ xfs_iread(
885 * around for a while. This helps to keep recently accessed 875 * around for a while. This helps to keep recently accessed
886 * meta-data in-core longer. 876 * meta-data in-core longer.
887 */ 877 */
888 XFS_BUF_SET_REF(bp, XFS_INO_REF); 878 xfs_buf_set_ref(bp, XFS_INO_REF);
889 879
890 /* 880 /*
891 * Use xfs_trans_brelse() to release the buffer containing the 881 * Use xfs_trans_brelse() to release the buffer containing the
@@ -930,7 +920,6 @@ xfs_iread_extents(
930 /* 920 /*
931 * We know that the size is valid (it's checked in iformat_btree) 921 * We know that the size is valid (it's checked in iformat_btree)
932 */ 922 */
933 ifp->if_lastex = NULLEXTNUM;
934 ifp->if_bytes = ifp->if_real_bytes = 0; 923 ifp->if_bytes = ifp->if_real_bytes = 0;
935 ifp->if_flags |= XFS_IFEXTENTS; 924 ifp->if_flags |= XFS_IFEXTENTS;
936 xfs_iext_add(ifp, 0, nextents); 925 xfs_iext_add(ifp, 0, nextents);
@@ -982,8 +971,7 @@ xfs_ialloc(
982 mode_t mode, 971 mode_t mode,
983 xfs_nlink_t nlink, 972 xfs_nlink_t nlink,
984 xfs_dev_t rdev, 973 xfs_dev_t rdev,
985 cred_t *cr, 974 prid_t prid,
986 xfs_prid_t prid,
987 int okalloc, 975 int okalloc,
988 xfs_buf_t **ialloc_context, 976 xfs_buf_t **ialloc_context,
989 boolean_t *call_again, 977 boolean_t *call_again,
@@ -1015,8 +1003,8 @@ xfs_ialloc(
1015 * This is because we're setting fields here we need 1003 * This is because we're setting fields here we need
1016 * to prevent others from looking at until we're done. 1004 * to prevent others from looking at until we're done.
1017 */ 1005 */
1018 error = xfs_trans_iget(tp->t_mountp, tp, ino, 1006 error = xfs_iget(tp->t_mountp, tp, ino, XFS_IGET_CREATE,
1019 XFS_IGET_CREATE, XFS_ILOCK_EXCL, &ip); 1007 XFS_ILOCK_EXCL, &ip);
1020 if (error) 1008 if (error)
1021 return error; 1009 return error;
1022 ASSERT(ip != NULL); 1010 ASSERT(ip != NULL);
@@ -1027,7 +1015,7 @@ xfs_ialloc(
1027 ASSERT(ip->i_d.di_nlink == nlink); 1015 ASSERT(ip->i_d.di_nlink == nlink);
1028 ip->i_d.di_uid = current_fsuid(); 1016 ip->i_d.di_uid = current_fsuid();
1029 ip->i_d.di_gid = current_fsgid(); 1017 ip->i_d.di_gid = current_fsgid();
1030 ip->i_d.di_projid = prid; 1018 xfs_set_projid(ip, prid);
1031 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); 1019 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
1032 1020
1033 /* 1021 /*
@@ -1165,6 +1153,7 @@ xfs_ialloc(
1165 /* 1153 /*
1166 * Log the new values stuffed into the inode. 1154 * Log the new values stuffed into the inode.
1167 */ 1155 */
1156 xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
1168 xfs_trans_log_inode(tp, ip, flags); 1157 xfs_trans_log_inode(tp, ip, flags);
1169 1158
1170 /* now that we have an i_mode we can setup inode ops and unlock */ 1159 /* now that we have an i_mode we can setup inode ops and unlock */
@@ -1364,7 +1353,7 @@ xfs_itruncate_start(
1364 return 0; 1353 return 0;
1365 } 1354 }
1366 last_byte = xfs_file_last_byte(ip); 1355 last_byte = xfs_file_last_byte(ip);
1367 trace_xfs_itruncate_start(ip, flags, new_size, toss_start, last_byte); 1356 trace_xfs_itruncate_start(ip, new_size, flags, toss_start, last_byte);
1368 if (last_byte > toss_start) { 1357 if (last_byte > toss_start) {
1369 if (flags & XFS_ITRUNC_DEFINITE) { 1358 if (flags & XFS_ITRUNC_DEFINITE) {
1370 xfs_tosspages(ip, toss_start, 1359 xfs_tosspages(ip, toss_start,
@@ -1480,7 +1469,7 @@ xfs_itruncate_finish(
1480 * file but the log buffers containing the free and reallocation 1469 * file but the log buffers containing the free and reallocation
1481 * don't, then we'd end up with garbage in the blocks being freed. 1470 * don't, then we'd end up with garbage in the blocks being freed.
1482 * As long as we make the new_size permanent before actually 1471 * As long as we make the new_size permanent before actually
1483 * freeing any blocks it doesn't matter if they get writtten to. 1472 * freeing any blocks it doesn't matter if they get written to.
1484 * 1473 *
1485 * The callers must signal into us whether or not the size 1474 * The callers must signal into us whether or not the size
1486 * setting here must be synchronous. There are a few cases 1475 * setting here must be synchronous. There are a few cases
@@ -1819,9 +1808,8 @@ xfs_iunlink_remove(
1819 */ 1808 */
1820 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK); 1809 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK);
1821 if (error) { 1810 if (error) {
1822 cmn_err(CE_WARN, 1811 xfs_warn(mp, "%s: xfs_itobp() returned error %d.",
1823 "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.", 1812 __func__, error);
1824 error, mp->m_fsname);
1825 return error; 1813 return error;
1826 } 1814 }
1827 next_agino = be32_to_cpu(dip->di_next_unlinked); 1815 next_agino = be32_to_cpu(dip->di_next_unlinked);
@@ -1866,9 +1854,9 @@ xfs_iunlink_remove(
1866 error = xfs_inotobp(mp, tp, next_ino, &last_dip, 1854 error = xfs_inotobp(mp, tp, next_ino, &last_dip,
1867 &last_ibp, &last_offset, 0); 1855 &last_ibp, &last_offset, 0);
1868 if (error) { 1856 if (error) {
1869 cmn_err(CE_WARN, 1857 xfs_warn(mp,
1870 "xfs_iunlink_remove: xfs_inotobp() returned an error %d on %s. Returning error.", 1858 "%s: xfs_inotobp() returned error %d.",
1871 error, mp->m_fsname); 1859 __func__, error);
1872 return error; 1860 return error;
1873 } 1861 }
1874 next_agino = be32_to_cpu(last_dip->di_next_unlinked); 1862 next_agino = be32_to_cpu(last_dip->di_next_unlinked);
@@ -1881,9 +1869,8 @@ xfs_iunlink_remove(
1881 */ 1869 */
1882 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK); 1870 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK);
1883 if (error) { 1871 if (error) {
1884 cmn_err(CE_WARN, 1872 xfs_warn(mp, "%s: xfs_itobp(2) returned error %d.",
1885 "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.", 1873 __func__, error);
1886 error, mp->m_fsname);
1887 return error; 1874 return error;
1888 } 1875 }
1889 next_agino = be32_to_cpu(dip->di_next_unlinked); 1876 next_agino = be32_to_cpu(dip->di_next_unlinked);
@@ -1999,15 +1986,31 @@ xfs_ifree_cluster(
1999 */ 1986 */
2000 for (i = 0; i < ninodes; i++) { 1987 for (i = 0; i < ninodes; i++) {
2001retry: 1988retry:
2002 read_lock(&pag->pag_ici_lock); 1989 rcu_read_lock();
2003 ip = radix_tree_lookup(&pag->pag_ici_root, 1990 ip = radix_tree_lookup(&pag->pag_ici_root,
2004 XFS_INO_TO_AGINO(mp, (inum + i))); 1991 XFS_INO_TO_AGINO(mp, (inum + i)));
2005 1992
2006 /* Inode not in memory or stale, nothing to do */ 1993 /* Inode not in memory, nothing to do */
2007 if (!ip || xfs_iflags_test(ip, XFS_ISTALE)) { 1994 if (!ip) {
2008 read_unlock(&pag->pag_ici_lock); 1995 rcu_read_unlock();
1996 continue;
1997 }
1998
1999 /*
2000 * because this is an RCU protected lookup, we could
2001 * find a recently freed or even reallocated inode
2002 * during the lookup. We need to check under the
2003 * i_flags_lock for a valid inode here. Skip it if it
2004 * is not valid, the wrong inode or stale.
2005 */
2006 spin_lock(&ip->i_flags_lock);
2007 if (ip->i_ino != inum + i ||
2008 __xfs_iflags_test(ip, XFS_ISTALE)) {
2009 spin_unlock(&ip->i_flags_lock);
2010 rcu_read_unlock();
2009 continue; 2011 continue;
2010 } 2012 }
2013 spin_unlock(&ip->i_flags_lock);
2011 2014
2012 /* 2015 /*
2013 * Don't try to lock/unlock the current inode, but we 2016 * Don't try to lock/unlock the current inode, but we
@@ -2018,11 +2021,11 @@ retry:
2018 */ 2021 */
2019 if (ip != free_ip && 2022 if (ip != free_ip &&
2020 !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { 2023 !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
2021 read_unlock(&pag->pag_ici_lock); 2024 rcu_read_unlock();
2022 delay(1); 2025 delay(1);
2023 goto retry; 2026 goto retry;
2024 } 2027 }
2025 read_unlock(&pag->pag_ici_lock); 2028 rcu_read_unlock();
2026 2029
2027 xfs_iflock(ip); 2030 xfs_iflock(ip);
2028 xfs_iflags_set(ip, XFS_ISTALE); 2031 xfs_iflags_set(ip, XFS_ISTALE);
@@ -2554,12 +2557,9 @@ xfs_iflush_fork(
2554 case XFS_DINODE_FMT_EXTENTS: 2557 case XFS_DINODE_FMT_EXTENTS:
2555 ASSERT((ifp->if_flags & XFS_IFEXTENTS) || 2558 ASSERT((ifp->if_flags & XFS_IFEXTENTS) ||
2556 !(iip->ili_format.ilf_fields & extflag[whichfork])); 2559 !(iip->ili_format.ilf_fields & extflag[whichfork]));
2557 ASSERT((xfs_iext_get_ext(ifp, 0) != NULL) ||
2558 (ifp->if_bytes == 0));
2559 ASSERT((xfs_iext_get_ext(ifp, 0) == NULL) ||
2560 (ifp->if_bytes > 0));
2561 if ((iip->ili_format.ilf_fields & extflag[whichfork]) && 2560 if ((iip->ili_format.ilf_fields & extflag[whichfork]) &&
2562 (ifp->if_bytes > 0)) { 2561 (ifp->if_bytes > 0)) {
2562 ASSERT(xfs_iext_get_ext(ifp, 0));
2563 ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0); 2563 ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0);
2564 (void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp, 2564 (void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp,
2565 whichfork); 2565 whichfork);
@@ -2628,7 +2628,7 @@ xfs_iflush_cluster(
2628 2628
2629 mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1); 2629 mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
2630 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask; 2630 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
2631 read_lock(&pag->pag_ici_lock); 2631 rcu_read_lock();
2632 /* really need a gang lookup range call here */ 2632 /* really need a gang lookup range call here */
2633 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist, 2633 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist,
2634 first_index, inodes_per_cluster); 2634 first_index, inodes_per_cluster);
@@ -2639,9 +2639,21 @@ xfs_iflush_cluster(
2639 iq = ilist[i]; 2639 iq = ilist[i];
2640 if (iq == ip) 2640 if (iq == ip)
2641 continue; 2641 continue;
2642 /* if the inode lies outside this cluster, we're done. */ 2642
2643 if ((XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) 2643 /*
2644 break; 2644 * because this is an RCU protected lookup, we could find a
2645 * recently freed or even reallocated inode during the lookup.
2646 * We need to check under the i_flags_lock for a valid inode
2647 * here. Skip it if it is not valid or the wrong inode.
2648 */
2649 spin_lock(&ip->i_flags_lock);
2650 if (!ip->i_ino ||
2651 (XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) {
2652 spin_unlock(&ip->i_flags_lock);
2653 continue;
2654 }
2655 spin_unlock(&ip->i_flags_lock);
2656
2645 /* 2657 /*
2646 * Do an un-protected check to see if the inode is dirty and 2658 * Do an un-protected check to see if the inode is dirty and
2647 * is a candidate for flushing. These checks will be repeated 2659 * is a candidate for flushing. These checks will be repeated
@@ -2691,7 +2703,7 @@ xfs_iflush_cluster(
2691 } 2703 }
2692 2704
2693out_free: 2705out_free:
2694 read_unlock(&pag->pag_ici_lock); 2706 rcu_read_unlock();
2695 kmem_free(ilist); 2707 kmem_free(ilist);
2696out_put: 2708out_put:
2697 xfs_perag_put(pag); 2709 xfs_perag_put(pag);
@@ -2703,7 +2715,7 @@ cluster_corrupt_out:
2703 * Corruption detected in the clustering loop. Invalidate the 2715 * Corruption detected in the clustering loop. Invalidate the
2704 * inode buffer and shut down the filesystem. 2716 * inode buffer and shut down the filesystem.
2705 */ 2717 */
2706 read_unlock(&pag->pag_ici_lock); 2718 rcu_read_unlock();
2707 /* 2719 /*
2708 * Clean up the buffer. If it was B_DELWRI, just release it -- 2720 * Clean up the buffer. If it was B_DELWRI, just release it --
2709 * brelse can handle it with no problems. If not, shut down the 2721 * brelse can handle it with no problems. If not, shut down the
@@ -2725,7 +2737,7 @@ cluster_corrupt_out:
2725 XFS_BUF_UNDONE(bp); 2737 XFS_BUF_UNDONE(bp);
2726 XFS_BUF_STALE(bp); 2738 XFS_BUF_STALE(bp);
2727 XFS_BUF_ERROR(bp,EIO); 2739 XFS_BUF_ERROR(bp,EIO);
2728 xfs_biodone(bp); 2740 xfs_buf_ioend(bp, 0);
2729 } else { 2741 } else {
2730 XFS_BUF_STALE(bp); 2742 XFS_BUF_STALE(bp);
2731 xfs_buf_relse(bp); 2743 xfs_buf_relse(bp);
@@ -2773,7 +2785,7 @@ xfs_iflush(
2773 2785
2774 /* 2786 /*
2775 * We can't flush the inode until it is unpinned, so wait for it if we 2787 * We can't flush the inode until it is unpinned, so wait for it if we
2776 * are allowed to block. We know noone new can pin it, because we are 2788 * are allowed to block. We know no one new can pin it, because we are
2777 * holding the inode lock shared and you need to hold it exclusively to 2789 * holding the inode lock shared and you need to hold it exclusively to
2778 * pin the inode. 2790 * pin the inode.
2779 * 2791 *
@@ -2819,7 +2831,7 @@ xfs_iflush(
2819 * Get the buffer containing the on-disk inode. 2831 * Get the buffer containing the on-disk inode.
2820 */ 2832 */
2821 error = xfs_itobp(mp, NULL, ip, &dip, &bp, 2833 error = xfs_itobp(mp, NULL, ip, &dip, &bp,
2822 (flags & SYNC_WAIT) ? XBF_LOCK : XBF_TRYLOCK); 2834 (flags & SYNC_TRYLOCK) ? XBF_TRYLOCK : XBF_LOCK);
2823 if (error || !bp) { 2835 if (error || !bp) {
2824 xfs_ifunlock(ip); 2836 xfs_ifunlock(ip);
2825 return error; 2837 return error;
@@ -2910,16 +2922,16 @@ xfs_iflush_int(
2910 2922
2911 if (XFS_TEST_ERROR(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC, 2923 if (XFS_TEST_ERROR(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC,
2912 mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) { 2924 mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) {
2913 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 2925 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2914 "xfs_iflush: Bad inode %Lu magic number 0x%x, ptr 0x%p", 2926 "%s: Bad inode %Lu magic number 0x%x, ptr 0x%p",
2915 ip->i_ino, be16_to_cpu(dip->di_magic), dip); 2927 __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip);
2916 goto corrupt_out; 2928 goto corrupt_out;
2917 } 2929 }
2918 if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC, 2930 if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC,
2919 mp, XFS_ERRTAG_IFLUSH_2, XFS_RANDOM_IFLUSH_2)) { 2931 mp, XFS_ERRTAG_IFLUSH_2, XFS_RANDOM_IFLUSH_2)) {
2920 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 2932 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2921 "xfs_iflush: Bad inode %Lu, ptr 0x%p, magic number 0x%x", 2933 "%s: Bad inode %Lu, ptr 0x%p, magic number 0x%x",
2922 ip->i_ino, ip, ip->i_d.di_magic); 2934 __func__, ip->i_ino, ip, ip->i_d.di_magic);
2923 goto corrupt_out; 2935 goto corrupt_out;
2924 } 2936 }
2925 if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) { 2937 if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) {
@@ -2927,9 +2939,9 @@ xfs_iflush_int(
2927 (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) && 2939 (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
2928 (ip->i_d.di_format != XFS_DINODE_FMT_BTREE), 2940 (ip->i_d.di_format != XFS_DINODE_FMT_BTREE),
2929 mp, XFS_ERRTAG_IFLUSH_3, XFS_RANDOM_IFLUSH_3)) { 2941 mp, XFS_ERRTAG_IFLUSH_3, XFS_RANDOM_IFLUSH_3)) {
2930 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 2942 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2931 "xfs_iflush: Bad regular inode %Lu, ptr 0x%p", 2943 "%s: Bad regular inode %Lu, ptr 0x%p",
2932 ip->i_ino, ip); 2944 __func__, ip->i_ino, ip);
2933 goto corrupt_out; 2945 goto corrupt_out;
2934 } 2946 }
2935 } else if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) { 2947 } else if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) {
@@ -2938,28 +2950,28 @@ xfs_iflush_int(
2938 (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) && 2950 (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
2939 (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL), 2951 (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL),
2940 mp, XFS_ERRTAG_IFLUSH_4, XFS_RANDOM_IFLUSH_4)) { 2952 mp, XFS_ERRTAG_IFLUSH_4, XFS_RANDOM_IFLUSH_4)) {
2941 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 2953 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2942 "xfs_iflush: Bad directory inode %Lu, ptr 0x%p", 2954 "%s: Bad directory inode %Lu, ptr 0x%p",
2943 ip->i_ino, ip); 2955 __func__, ip->i_ino, ip);
2944 goto corrupt_out; 2956 goto corrupt_out;
2945 } 2957 }
2946 } 2958 }
2947 if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents > 2959 if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents >
2948 ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5, 2960 ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5,
2949 XFS_RANDOM_IFLUSH_5)) { 2961 XFS_RANDOM_IFLUSH_5)) {
2950 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 2962 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2951 "xfs_iflush: detected corrupt incore inode %Lu, total extents = %d, nblocks = %Ld, ptr 0x%p", 2963 "%s: detected corrupt incore inode %Lu, "
2952 ip->i_ino, 2964 "total extents = %d, nblocks = %Ld, ptr 0x%p",
2965 __func__, ip->i_ino,
2953 ip->i_d.di_nextents + ip->i_d.di_anextents, 2966 ip->i_d.di_nextents + ip->i_d.di_anextents,
2954 ip->i_d.di_nblocks, 2967 ip->i_d.di_nblocks, ip);
2955 ip);
2956 goto corrupt_out; 2968 goto corrupt_out;
2957 } 2969 }
2958 if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize, 2970 if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize,
2959 mp, XFS_ERRTAG_IFLUSH_6, XFS_RANDOM_IFLUSH_6)) { 2971 mp, XFS_ERRTAG_IFLUSH_6, XFS_RANDOM_IFLUSH_6)) {
2960 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 2972 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2961 "xfs_iflush: bad inode %Lu, forkoff 0x%x, ptr 0x%p", 2973 "%s: bad inode %Lu, forkoff 0x%x, ptr 0x%p",
2962 ip->i_ino, ip->i_d.di_forkoff, ip); 2974 __func__, ip->i_ino, ip->i_d.di_forkoff, ip);
2963 goto corrupt_out; 2975 goto corrupt_out;
2964 } 2976 }
2965 /* 2977 /*
@@ -3008,7 +3020,7 @@ xfs_iflush_int(
3008 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); 3020 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
3009 memset(&(dip->di_pad[0]), 0, 3021 memset(&(dip->di_pad[0]), 0,
3010 sizeof(dip->di_pad)); 3022 sizeof(dip->di_pad));
3011 ASSERT(ip->i_d.di_projid == 0); 3023 ASSERT(xfs_get_projid(ip) == 0);
3012 } 3024 }
3013 } 3025 }
3014 3026
@@ -3096,6 +3108,8 @@ xfs_iext_get_ext(
3096 xfs_extnum_t idx) /* index of target extent */ 3108 xfs_extnum_t idx) /* index of target extent */
3097{ 3109{
3098 ASSERT(idx >= 0); 3110 ASSERT(idx >= 0);
3111 ASSERT(idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
3112
3099 if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) { 3113 if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) {
3100 return ifp->if_u1.if_ext_irec->er_extbuf; 3114 return ifp->if_u1.if_ext_irec->er_extbuf;
3101 } else if (ifp->if_flags & XFS_IFEXTIREC) { 3115 } else if (ifp->if_flags & XFS_IFEXTIREC) {
@@ -3175,7 +3189,6 @@ xfs_iext_add(
3175 } 3189 }
3176 ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; 3190 ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
3177 ifp->if_real_bytes = 0; 3191 ifp->if_real_bytes = 0;
3178 ifp->if_lastex = nextents + ext_diff;
3179 } 3192 }
3180 /* 3193 /*
3181 * Otherwise use a linear (direct) extent list. 3194 * Otherwise use a linear (direct) extent list.
@@ -3870,8 +3883,10 @@ xfs_iext_idx_to_irec(
3870 xfs_extnum_t page_idx = *idxp; /* extent index in target list */ 3883 xfs_extnum_t page_idx = *idxp; /* extent index in target list */
3871 3884
3872 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 3885 ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3873 ASSERT(page_idx >= 0 && page_idx <= 3886 ASSERT(page_idx >= 0);
3874 ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)); 3887 ASSERT(page_idx <= ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
3888 ASSERT(page_idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t) || realloc);
3889
3875 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; 3890 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3876 erp_idx = 0; 3891 erp_idx = 0;
3877 low = 0; 3892 low = 0;
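
Most of the churn in this file is the conversion from cmn_err()/xfs_fs_repair_cmn_err() to the mount-aware xfs_warn()/xfs_alert()/xfs_emerg() helpers, with the call site identified via __func__ instead of a hand-written prefix. A minimal stand-alone sketch of that style of wrapper, assuming nothing beyond the C standard library (fs_warn and struct mount are invented names, not the kernel API):

#include <stdarg.h>
#include <stdio.h>

/* Hypothetical stand-in for struct xfs_mount: just carries a label. */
struct mount { const char *name; };

/* printf-style warning that always prefixes the filesystem identifier. */
static void fs_warn(const struct mount *mp, const char *fmt, ...)
{
	va_list ap;

	fprintf(stderr, "FS (%s): ", mp->name);
	va_start(ap, fmt);
	vfprintf(stderr, fmt, ap);
	va_end(ap);
	fputc('\n', stderr);
}

int main(void)
{
	struct mount m = { "sda1" };
	int error = -5;

	/* callers pass __func__ explicitly, as the converted XFS sites do */
	fs_warn(&m, "%s: xfs_trans_read_buf() returned error %d.",
		__func__, error);
	return 0;
}
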
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 0898c5417d12..964cfea77686 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -67,7 +67,6 @@ typedef struct xfs_ifork {
67 short if_broot_bytes; /* bytes allocated for root */ 67 short if_broot_bytes; /* bytes allocated for root */
68 unsigned char if_flags; /* per-fork flags */ 68 unsigned char if_flags; /* per-fork flags */
69 unsigned char if_ext_max; /* max # of extent records */ 69 unsigned char if_ext_max; /* max # of extent records */
70 xfs_extnum_t if_lastex; /* last if_extents used */
71 union { 70 union {
72 xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */ 71 xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */
73 xfs_ext_irec_t *if_ext_irec; /* irec map file exts */ 72 xfs_ext_irec_t *if_ext_irec; /* irec map file exts */
@@ -111,7 +110,7 @@ struct xfs_imap {
111 * Generally, we do not want to hold the i_rlock while holding the 110 * Generally, we do not want to hold the i_rlock while holding the
112 * i_ilock. Hierarchy is i_iolock followed by i_rlock. 111 * i_ilock. Hierarchy is i_iolock followed by i_rlock.
113 * 112 *
114 * xfs_iptr_t contains all the inode fields upto and including the 113 * xfs_iptr_t contains all the inode fields up to and including the
115 * i_mnext and i_mprev fields, it is used as a marker in the inode 114 * i_mnext and i_mprev fields, it is used as a marker in the inode
116 * chain off the mount structure by xfs_sync calls. 115 * chain off the mount structure by xfs_sync calls.
117 */ 116 */
@@ -134,8 +133,9 @@ typedef struct xfs_icdinode {
134 __uint32_t di_uid; /* owner's user id */ 133 __uint32_t di_uid; /* owner's user id */
135 __uint32_t di_gid; /* owner's group id */ 134 __uint32_t di_gid; /* owner's group id */
136 __uint32_t di_nlink; /* number of links to file */ 135 __uint32_t di_nlink; /* number of links to file */
137 __uint16_t di_projid; /* owner's project id */ 136 __uint16_t di_projid_lo; /* lower part of owner's project id */
138 __uint8_t di_pad[8]; /* unused, zeroed space */ 137 __uint16_t di_projid_hi; /* higher part of owner's project id */
138 __uint8_t di_pad[6]; /* unused, zeroed space */
139 __uint16_t di_flushiter; /* incremented on flush */ 139 __uint16_t di_flushiter; /* incremented on flush */
140 xfs_ictimestamp_t di_atime; /* time last accessed */ 140 xfs_ictimestamp_t di_atime; /* time last accessed */
141 xfs_ictimestamp_t di_mtime; /* time last modified */ 141 xfs_ictimestamp_t di_mtime; /* time last modified */
@@ -212,7 +212,6 @@ typedef struct xfs_icdinode {
212#ifdef __KERNEL__ 212#ifdef __KERNEL__
213 213
214struct bhv_desc; 214struct bhv_desc;
215struct cred;
216struct xfs_buf; 215struct xfs_buf;
217struct xfs_bmap_free; 216struct xfs_bmap_free;
218struct xfs_bmbt_irec; 217struct xfs_bmbt_irec;
@@ -335,6 +334,25 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags)
335} 334}
336 335
337/* 336/*
337 * Project quota id helpers (previously projid was 16bit only
338 * and using two 16bit values to hold new 32bit projid was chosen
339 * to retain compatibility with "old" filesystems).
340 */
341static inline prid_t
342xfs_get_projid(struct xfs_inode *ip)
343{
344 return (prid_t)ip->i_d.di_projid_hi << 16 | ip->i_d.di_projid_lo;
345}
346
347static inline void
348xfs_set_projid(struct xfs_inode *ip,
349 prid_t projid)
350{
351 ip->i_d.di_projid_hi = (__uint16_t) (projid >> 16);
352 ip->i_d.di_projid_lo = (__uint16_t) (projid & 0xffff);
353}
354
355/*
338 * Manage the i_flush queue embedded in the inode. This completion 356 * Manage the i_flush queue embedded in the inode. This completion
339 * queue synchronizes processes attempting to flush the in-core 357 * queue synchronizes processes attempting to flush the in-core
340 * inode back to disk. 358 * inode back to disk.
@@ -357,12 +375,23 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
357/* 375/*
358 * In-core inode flags. 376 * In-core inode flags.
359 */ 377 */
360#define XFS_IRECLAIM 0x0001 /* we have started reclaiming this inode */ 378#define XFS_IRECLAIM 0x0001 /* started reclaiming this inode */
361#define XFS_ISTALE 0x0002 /* inode has been staled */ 379#define XFS_ISTALE 0x0002 /* inode has been staled */
362#define XFS_IRECLAIMABLE 0x0004 /* inode can be reclaimed */ 380#define XFS_IRECLAIMABLE 0x0004 /* inode can be reclaimed */
363#define XFS_INEW 0x0008 /* inode has just been allocated */ 381#define XFS_INEW 0x0008 /* inode has just been allocated */
364#define XFS_IFILESTREAM 0x0010 /* inode is in a filestream directory */ 382#define XFS_IFILESTREAM 0x0010 /* inode is in a filestream directory */
365#define XFS_ITRUNCATED 0x0020 /* truncated down so flush-on-close */ 383#define XFS_ITRUNCATED 0x0020 /* truncated down so flush-on-close */
384#define XFS_IDIRTY_RELEASE 0x0040 /* dirty release already seen */
385
386/*
387 * Per-lifetime flags need to be reset when re-using a reclaimable inode during
 388 * inode lookup. This prevents unintended behaviour on the new inode from
 389 * occurring.
390 */
391#define XFS_IRECLAIM_RESET_FLAGS \
392 (XFS_IRECLAIMABLE | XFS_IRECLAIM | \
393 XFS_IDIRTY_RELEASE | XFS_ITRUNCATED | \
394 XFS_IFILESTREAM);
366 395
367/* 396/*
368 * Flags for inode locking. 397 * Flags for inode locking.
@@ -389,28 +418,35 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
389/* 418/*
390 * Flags for lockdep annotations. 419 * Flags for lockdep annotations.
391 * 420 *
392 * XFS_I[O]LOCK_PARENT - for operations that require locking two inodes 421 * XFS_LOCK_PARENT - for directory operations that require locking a
393 * (ie directory operations that require locking a directory inode and 422 * parent directory inode and a child entry inode. The parent gets locked
394 * an entry inode). The first inode gets locked with this flag so it 423 * with this flag so it gets a lockdep subclass of 1 and the child entry
395 * gets a lockdep subclass of 1 and the second lock will have a lockdep 424 * lock will have a lockdep subclass of 0.
396 * subclass of 0. 425 *
426 * XFS_LOCK_RTBITMAP/XFS_LOCK_RTSUM - the realtime device bitmap and summary
427 * inodes do not participate in the normal lock order, and thus have their
428 * own subclasses.
397 * 429 *
 398 * XFS_LOCK_INUMORDER - for locking several inodes at the same time 430 * XFS_LOCK_INUMORDER - for locking several inodes at the same time
399 * with xfs_lock_inodes(). This flag is used as the starting subclass 431 * with xfs_lock_inodes(). This flag is used as the starting subclass
400 * and each subsequent lock acquired will increment the subclass by one. 432 * and each subsequent lock acquired will increment the subclass by one.
401 * So the first lock acquired will have a lockdep subclass of 2, the 433 * So the first lock acquired will have a lockdep subclass of 4, the
402 * second lock will have a lockdep subclass of 3, and so on. It is 434 * second lock will have a lockdep subclass of 5, and so on. It is
403 * the responsibility of the class builder to shift this to the correct 435 * the responsibility of the class builder to shift this to the correct
404 * portion of the lock_mode lockdep mask. 436 * portion of the lock_mode lockdep mask.
405 */ 437 */
406#define XFS_LOCK_PARENT 1 438#define XFS_LOCK_PARENT 1
407#define XFS_LOCK_INUMORDER 2 439#define XFS_LOCK_RTBITMAP 2
440#define XFS_LOCK_RTSUM 3
441#define XFS_LOCK_INUMORDER 4
408 442
409#define XFS_IOLOCK_SHIFT 16 443#define XFS_IOLOCK_SHIFT 16
410#define XFS_IOLOCK_PARENT (XFS_LOCK_PARENT << XFS_IOLOCK_SHIFT) 444#define XFS_IOLOCK_PARENT (XFS_LOCK_PARENT << XFS_IOLOCK_SHIFT)
411 445
412#define XFS_ILOCK_SHIFT 24 446#define XFS_ILOCK_SHIFT 24
413#define XFS_ILOCK_PARENT (XFS_LOCK_PARENT << XFS_ILOCK_SHIFT) 447#define XFS_ILOCK_PARENT (XFS_LOCK_PARENT << XFS_ILOCK_SHIFT)
448#define XFS_ILOCK_RTBITMAP (XFS_LOCK_RTBITMAP << XFS_ILOCK_SHIFT)
449#define XFS_ILOCK_RTSUM (XFS_LOCK_RTSUM << XFS_ILOCK_SHIFT)
414 450
415#define XFS_IOLOCK_DEP_MASK 0x00ff0000 451#define XFS_IOLOCK_DEP_MASK 0x00ff0000
416#define XFS_ILOCK_DEP_MASK 0xff000000 452#define XFS_ILOCK_DEP_MASK 0xff000000
@@ -419,6 +455,8 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
419#define XFS_IOLOCK_DEP(flags) (((flags) & XFS_IOLOCK_DEP_MASK) >> XFS_IOLOCK_SHIFT) 455#define XFS_IOLOCK_DEP(flags) (((flags) & XFS_IOLOCK_DEP_MASK) >> XFS_IOLOCK_SHIFT)
420#define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT) 456#define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT)
421 457
458extern struct lock_class_key xfs_iolock_reclaimable;
459
422/* 460/*
423 * Flags for xfs_itruncate_start(). 461 * Flags for xfs_itruncate_start().
424 */ 462 */
@@ -456,8 +494,8 @@ void xfs_inode_free(struct xfs_inode *ip);
456 * xfs_inode.c prototypes. 494 * xfs_inode.c prototypes.
457 */ 495 */
458int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, mode_t, 496int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, mode_t,
459 xfs_nlink_t, xfs_dev_t, cred_t *, xfs_prid_t, 497 xfs_nlink_t, xfs_dev_t, prid_t, int,
460 int, struct xfs_buf **, boolean_t *, xfs_inode_t **); 498 struct xfs_buf **, boolean_t *, xfs_inode_t **);
461 499
462uint xfs_ip2xflags(struct xfs_inode *); 500uint xfs_ip2xflags(struct xfs_inode *);
463uint xfs_dic2xflags(struct xfs_dinode *); 501uint xfs_dic2xflags(struct xfs_dinode *);
@@ -471,7 +509,6 @@ int xfs_iunlink(struct xfs_trans *, xfs_inode_t *);
471void xfs_iext_realloc(xfs_inode_t *, int, int); 509void xfs_iext_realloc(xfs_inode_t *, int, int);
472void xfs_iunpin_wait(xfs_inode_t *); 510void xfs_iunpin_wait(xfs_inode_t *);
473int xfs_iflush(xfs_inode_t *, uint); 511int xfs_iflush(xfs_inode_t *, uint);
474void xfs_ichgtime(xfs_inode_t *, int);
475void xfs_lock_inodes(xfs_inode_t **, int, uint); 512void xfs_lock_inodes(xfs_inode_t **, int, uint);
476void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); 513void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
477 514
@@ -482,7 +519,7 @@ void xfs_mark_inode_dirty_sync(xfs_inode_t *);
482#define IHOLD(ip) \ 519#define IHOLD(ip) \
483do { \ 520do { \
484 ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \ 521 ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \
485 atomic_inc(&(VFS_I(ip)->i_count)); \ 522 ihold(VFS_I(ip)); \
486 trace_xfs_ihold(ip, _THIS_IP_); \ 523 trace_xfs_ihold(ip, _THIS_IP_); \
487} while (0) 524} while (0)
488 525
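
The di_projid change above keeps the on-disk inode the same size by splitting the 32-bit project ID across the old 16-bit field and part of the former pad bytes, with xfs_get_projid()/xfs_set_projid() doing the reassembly. A self-contained check of that packing, mirroring the two helpers with simplified types (the struct below is a stand-in, not the real xfs_icdinode):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct icdinode {
	uint16_t di_projid_lo;	/* lower 16 bits of project id */
	uint16_t di_projid_hi;	/* upper 16 bits of project id */
};

static void set_projid(struct icdinode *d, uint32_t projid)
{
	d->di_projid_hi = (uint16_t)(projid >> 16);
	d->di_projid_lo = (uint16_t)(projid & 0xffff);
}

static uint32_t get_projid(const struct icdinode *d)
{
	return (uint32_t)d->di_projid_hi << 16 | d->di_projid_lo;
}

int main(void)
{
	struct icdinode d;
	uint32_t id = 0x0002ABCD;	/* needs more than 16 bits */

	set_projid(&d, id);
	assert(get_projid(&d) == id);
	printf("hi=0x%04x lo=0x%04x -> 0x%08x\n",
	       (unsigned)d.di_projid_hi, (unsigned)d.di_projid_lo,
	       (unsigned)get_projid(&d));
	return 0;
}

Splitting the value this way is what lets old filesystems, which only ever wrote the low 16 bits, stay readable without an on-disk format bump.
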
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index fe00777e2796..b1e88d56069c 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -198,6 +198,41 @@ xfs_inode_item_size(
198} 198}
199 199
200/* 200/*
201 * xfs_inode_item_format_extents - convert in-core extents to on-disk form
202 *
203 * For either the data or attr fork in extent format, we need to endian convert
 204 * the in-core extents as we place them into the on-disk inode. In this case, we
205 * need to do this conversion before we write the extents into the log. Because
206 * we don't have the disk inode to write into here, we allocate a buffer and
207 * format the extents into it via xfs_iextents_copy(). We free the buffer in
208 * the unlock routine after the copy for the log has been made.
209 *
210 * In the case of the data fork, the in-core and on-disk fork sizes can be
211 * different due to delayed allocation extents. We only log on-disk extents
212 * here, so always use the physical fork size to determine the size of the
213 * buffer we need to allocate.
214 */
215STATIC void
216xfs_inode_item_format_extents(
217 struct xfs_inode *ip,
218 struct xfs_log_iovec *vecp,
219 int whichfork,
220 int type)
221{
222 xfs_bmbt_rec_t *ext_buffer;
223
224 ext_buffer = kmem_alloc(XFS_IFORK_SIZE(ip, whichfork), KM_SLEEP);
225 if (whichfork == XFS_DATA_FORK)
226 ip->i_itemp->ili_extents_buf = ext_buffer;
227 else
228 ip->i_itemp->ili_aextents_buf = ext_buffer;
229
230 vecp->i_addr = ext_buffer;
231 vecp->i_len = xfs_iextents_copy(ip, ext_buffer, whichfork);
232 vecp->i_type = type;
233}
234
235/*
201 * This is called to fill in the vector of log iovecs for the 236 * This is called to fill in the vector of log iovecs for the
202 * given inode log item. It fills the first item with an inode 237 * given inode log item. It fills the first item with an inode
203 * log format structure, the second with the on-disk inode structure, 238 * log format structure, the second with the on-disk inode structure,
@@ -213,7 +248,6 @@ xfs_inode_item_format(
213 struct xfs_inode *ip = iip->ili_inode; 248 struct xfs_inode *ip = iip->ili_inode;
214 uint nvecs; 249 uint nvecs;
215 size_t data_bytes; 250 size_t data_bytes;
216 xfs_bmbt_rec_t *ext_buffer;
217 xfs_mount_t *mp; 251 xfs_mount_t *mp;
218 252
219 vecp->i_addr = &iip->ili_format; 253 vecp->i_addr = &iip->ili_format;
@@ -223,15 +257,6 @@ xfs_inode_item_format(
223 nvecs = 1; 257 nvecs = 1;
224 258
225 /* 259 /*
226 * Make sure the linux inode is dirty. We do this before
227 * clearing i_update_core as the VFS will call back into
228 * XFS here and set i_update_core, so we need to dirty the
229 * inode first so that the ordering of i_update_core and
230 * unlogged modifications still works as described below.
231 */
232 xfs_mark_inode_dirty_sync(ip);
233
234 /*
235 * Clear i_update_core if the timestamps (or any other 260 * Clear i_update_core if the timestamps (or any other
236 * non-transactional modification) need flushing/logging 261 * non-transactional modification) need flushing/logging
237 * and we're about to log them with the rest of the core. 262 * and we're about to log them with the rest of the core.
@@ -329,22 +354,8 @@ xfs_inode_item_format(
329 } else 354 } else
330#endif 355#endif
331 { 356 {
332 /* 357 xfs_inode_item_format_extents(ip, vecp,
333 * There are delayed allocation extents 358 XFS_DATA_FORK, XLOG_REG_TYPE_IEXT);
334 * in the inode, or we need to convert
335 * the extents to on disk format.
336 * Use xfs_iextents_copy()
337 * to copy only the real extents into
338 * a separate buffer. We'll free the
339 * buffer in the unlock routine.
340 */
341 ext_buffer = kmem_alloc(ip->i_df.if_bytes,
342 KM_SLEEP);
343 iip->ili_extents_buf = ext_buffer;
344 vecp->i_addr = ext_buffer;
345 vecp->i_len = xfs_iextents_copy(ip, ext_buffer,
346 XFS_DATA_FORK);
347 vecp->i_type = XLOG_REG_TYPE_IEXT;
348 } 359 }
349 ASSERT(vecp->i_len <= ip->i_df.if_bytes); 360 ASSERT(vecp->i_len <= ip->i_df.if_bytes);
350 iip->ili_format.ilf_dsize = vecp->i_len; 361 iip->ili_format.ilf_dsize = vecp->i_len;
@@ -454,19 +465,12 @@ xfs_inode_item_format(
454 */ 465 */
455 vecp->i_addr = ip->i_afp->if_u1.if_extents; 466 vecp->i_addr = ip->i_afp->if_u1.if_extents;
456 vecp->i_len = ip->i_afp->if_bytes; 467 vecp->i_len = ip->i_afp->if_bytes;
468 vecp->i_type = XLOG_REG_TYPE_IATTR_EXT;
457#else 469#else
458 ASSERT(iip->ili_aextents_buf == NULL); 470 ASSERT(iip->ili_aextents_buf == NULL);
459 /* 471 xfs_inode_item_format_extents(ip, vecp,
460 * Need to endian flip before logging 472 XFS_ATTR_FORK, XLOG_REG_TYPE_IATTR_EXT);
461 */
462 ext_buffer = kmem_alloc(ip->i_afp->if_bytes,
463 KM_SLEEP);
464 iip->ili_aextents_buf = ext_buffer;
465 vecp->i_addr = ext_buffer;
466 vecp->i_len = xfs_iextents_copy(ip, ext_buffer,
467 XFS_ATTR_FORK);
468#endif 473#endif
469 vecp->i_type = XLOG_REG_TYPE_IATTR_EXT;
470 iip->ili_format.ilf_asize = vecp->i_len; 474 iip->ili_format.ilf_asize = vecp->i_len;
471 vecp++; 475 vecp++;
472 nvecs++; 476 nvecs++;
@@ -666,18 +670,39 @@ xfs_inode_item_unlock(
666} 670}
667 671
668/* 672/*
669 * This is called to find out where the oldest active copy of the 673 * This is called to find out where the oldest active copy of the inode log
670 * inode log item in the on disk log resides now that the last log 674 * item in the on disk log resides now that the last log write of it completed
671 * write of it completed at the given lsn. Since we always re-log 675 * at the given lsn. Since we always re-log all dirty data in an inode, the
672 * all dirty data in an inode, the latest copy in the on disk log 676 * latest copy in the on disk log is the only one that matters. Therefore,
673 * is the only one that matters. Therefore, simply return the 677 * simply return the given lsn.
674 * given lsn. 678 *
679 * If the inode has been marked stale because the cluster is being freed, we
680 * don't want to (re-)insert this inode into the AIL. There is a race condition
681 * where the cluster buffer may be unpinned before the inode is inserted into
682 * the AIL during transaction committed processing. If the buffer is unpinned
683 * before the inode item has been committed and inserted, then it is possible
684 * for the buffer to be written and IO completes before the inode is inserted
685 * into the AIL. In that case, we'd be inserting a clean, stale inode into the
686 * AIL which will never get removed. It will, however, get reclaimed which
 687 * triggers an assert in xfs_inode_free() complaining about freeing an inode
688 * still in the AIL.
689 *
 690 * To avoid this, just unpin the inode directly and return an LSN of -1 so the
691 * transaction committed code knows that it does not need to do any further
692 * processing on the item.
675 */ 693 */
676STATIC xfs_lsn_t 694STATIC xfs_lsn_t
677xfs_inode_item_committed( 695xfs_inode_item_committed(
678 struct xfs_log_item *lip, 696 struct xfs_log_item *lip,
679 xfs_lsn_t lsn) 697 xfs_lsn_t lsn)
680{ 698{
699 struct xfs_inode_log_item *iip = INODE_ITEM(lip);
700 struct xfs_inode *ip = iip->ili_inode;
701
702 if (xfs_iflags_test(ip, XFS_ISTALE)) {
703 xfs_inode_item_unpin(lip, 0);
704 return -1;
705 }
681 return lsn; 706 return lsn;
682} 707}
683 708
@@ -750,11 +775,11 @@ xfs_inode_item_push(
750 * Push the inode to it's backing buffer. This will not remove the 775 * Push the inode to it's backing buffer. This will not remove the
751 * inode from the AIL - a further push will be required to trigger a 776 * inode from the AIL - a further push will be required to trigger a
752 * buffer push. However, this allows all the dirty inodes to be pushed 777 * buffer push. However, this allows all the dirty inodes to be pushed
753 * to the buffer before it is pushed to disk. THe buffer IO completion 778 * to the buffer before it is pushed to disk. The buffer IO completion
754 * will pull th einode from the AIL, mark it clean and unlock the flush 779 * will pull the inode from the AIL, mark it clean and unlock the flush
755 * lock. 780 * lock.
756 */ 781 */
757 (void) xfs_iflush(ip, 0); 782 (void) xfs_iflush(ip, SYNC_TRYLOCK);
758 xfs_iunlock(ip, XFS_ILOCK_SHARED); 783 xfs_iunlock(ip, XFS_ILOCK_SHARED);
759} 784}
760 785
@@ -832,15 +857,64 @@ xfs_inode_item_destroy(
832 * flushed to disk. It is responsible for removing the inode item 857 * flushed to disk. It is responsible for removing the inode item
833 * from the AIL if it has not been re-logged, and unlocking the inode's 858 * from the AIL if it has not been re-logged, and unlocking the inode's
834 * flush lock. 859 * flush lock.
860 *
861 * To reduce AIL lock traffic as much as possible, we scan the buffer log item
862 * list for other inodes that will run this function. We remove them from the
863 * buffer list so we can process all the inode IO completions in one AIL lock
864 * traversal.
835 */ 865 */
836void 866void
837xfs_iflush_done( 867xfs_iflush_done(
838 struct xfs_buf *bp, 868 struct xfs_buf *bp,
839 struct xfs_log_item *lip) 869 struct xfs_log_item *lip)
840{ 870{
841 struct xfs_inode_log_item *iip = INODE_ITEM(lip); 871 struct xfs_inode_log_item *iip;
842 xfs_inode_t *ip = iip->ili_inode; 872 struct xfs_log_item *blip;
873 struct xfs_log_item *next;
874 struct xfs_log_item *prev;
843 struct xfs_ail *ailp = lip->li_ailp; 875 struct xfs_ail *ailp = lip->li_ailp;
876 int need_ail = 0;
877
878 /*
879 * Scan the buffer IO completions for other inodes being completed and
880 * attach them to the current inode log item.
881 */
882 blip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
883 prev = NULL;
884 while (blip != NULL) {
885 if (lip->li_cb != xfs_iflush_done) {
886 prev = blip;
887 blip = blip->li_bio_list;
888 continue;
889 }
890
891 /* remove from list */
892 next = blip->li_bio_list;
893 if (!prev) {
894 XFS_BUF_SET_FSPRIVATE(bp, next);
895 } else {
896 prev->li_bio_list = next;
897 }
898
899 /* add to current list */
900 blip->li_bio_list = lip->li_bio_list;
901 lip->li_bio_list = blip;
902
903 /*
904 * while we have the item, do the unlocked check for needing
905 * the AIL lock.
906 */
907 iip = INODE_ITEM(blip);
908 if (iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn)
909 need_ail++;
910
911 blip = next;
912 }
913
914 /* make sure we capture the state of the initial inode. */
915 iip = INODE_ITEM(lip);
916 if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn)
917 need_ail++;
844 918
845 /* 919 /*
846 * We only want to pull the item from the AIL if it is 920 * We only want to pull the item from the AIL if it is
@@ -851,28 +925,37 @@ xfs_iflush_done(
851 * the lock since it's cheaper, and then we recheck while 925 * the lock since it's cheaper, and then we recheck while
852 * holding the lock before removing the inode from the AIL. 926 * holding the lock before removing the inode from the AIL.
853 */ 927 */
854 if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) { 928 if (need_ail) {
929 struct xfs_log_item *log_items[need_ail];
930 int i = 0;
855 spin_lock(&ailp->xa_lock); 931 spin_lock(&ailp->xa_lock);
856 if (lip->li_lsn == iip->ili_flush_lsn) { 932 for (blip = lip; blip; blip = blip->li_bio_list) {
857 /* xfs_trans_ail_delete() drops the AIL lock. */ 933 iip = INODE_ITEM(blip);
858 xfs_trans_ail_delete(ailp, lip); 934 if (iip->ili_logged &&
859 } else { 935 blip->li_lsn == iip->ili_flush_lsn) {
860 spin_unlock(&ailp->xa_lock); 936 log_items[i++] = blip;
937 }
938 ASSERT(i <= need_ail);
861 } 939 }
940 /* xfs_trans_ail_delete_bulk() drops the AIL lock. */
941 xfs_trans_ail_delete_bulk(ailp, log_items, i);
862 } 942 }
863 943
864 iip->ili_logged = 0;
865 944
866 /* 945 /*
867 * Clear the ili_last_fields bits now that we know that the 946 * clean up and unlock the flush lock now we are done. We can clear the
868 * data corresponding to them is safely on disk. 947 * ili_last_fields bits now that we know that the data corresponding to
948 * them is safely on disk.
869 */ 949 */
870 iip->ili_last_fields = 0; 950 for (blip = lip; blip; blip = next) {
951 next = blip->li_bio_list;
952 blip->li_bio_list = NULL;
871 953
872 /* 954 iip = INODE_ITEM(blip);
873 * Release the inode's flush lock since we're done with it. 955 iip->ili_logged = 0;
874 */ 956 iip->ili_last_fields = 0;
875 xfs_ifunlock(ip); 957 xfs_ifunlock(iip->ili_inode);
958 }
876} 959}
877 960
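
For illustration only, a minimal userspace C sketch of the batching pattern the rewritten xfs_iflush_done() uses above: scan the completion list without the lock, record which items need AIL removal, then take the lock once for the whole batch. The types, the fixed batch bound, and ail_delete_bulk() are placeholders for the example, not the kernel implementation.

#include <pthread.h>
#include <stddef.h>

struct item {
	struct item	*next;		/* singly linked buffer completion list */
	int		in_ail;		/* needs removal under the AIL lock */
};

static pthread_mutex_t ail_lock = PTHREAD_MUTEX_INITIALIZER;

/* stand-in for xfs_trans_ail_delete_bulk(): drop every batched item */
static void
ail_delete_bulk(struct item **batch, int count)
{
	while (count--)
		batch[count]->in_ail = 0;
}

static void
process_completions(struct item *head)
{
	struct item	*batch[64];	/* assumed bound, for the sketch only */
	struct item	*ip;
	int		n = 0;

	/* pass 1: no locking, just record what needs the AIL lock */
	for (ip = head; ip != NULL && n < 64; ip = ip->next)
		if (ip->in_ail)
			batch[n++] = ip;

	if (n == 0)
		return;

	/* pass 2: one lock round trip for the whole batch */
	pthread_mutex_lock(&ail_lock);
	ail_delete_bulk(batch, n);
	pthread_mutex_unlock(&ail_lock);
}
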
878/* 961/*
@@ -889,7 +972,6 @@ xfs_iflush_abort(
889{ 972{
890 xfs_inode_log_item_t *iip = ip->i_itemp; 973 xfs_inode_log_item_t *iip = ip->i_itemp;
891 974
892 iip = ip->i_itemp;
893 if (iip) { 975 if (iip) {
894 struct xfs_ail *ailp = iip->ili_item.li_ailp; 976 struct xfs_ail *ailp = iip->ili_item.li_ailp;
895 if (iip->ili_item.li_flags & XFS_LI_IN_AIL) { 977 if (iip->ili_item.li_flags & XFS_LI_IN_AIL) {
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 20576146369f..091d82b94c4d 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -47,127 +47,8 @@
47 47
48#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \ 48#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \
49 << mp->m_writeio_log) 49 << mp->m_writeio_log)
50#define XFS_STRAT_WRITE_IMAPS 2
51#define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP 50#define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP
52 51
53STATIC int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
54 int, struct xfs_bmbt_irec *, int *);
55STATIC int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t, int,
56 struct xfs_bmbt_irec *, int *);
57STATIC int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t,
58 struct xfs_bmbt_irec *, int *);
59
60int
61xfs_iomap(
62 struct xfs_inode *ip,
63 xfs_off_t offset,
64 ssize_t count,
65 int flags,
66 struct xfs_bmbt_irec *imap,
67 int *nimaps,
68 int *new)
69{
70 struct xfs_mount *mp = ip->i_mount;
71 xfs_fileoff_t offset_fsb, end_fsb;
72 int error = 0;
73 int lockmode = 0;
74 int bmapi_flags = 0;
75
76 ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
77
78 *new = 0;
79
80 if (XFS_FORCED_SHUTDOWN(mp))
81 return XFS_ERROR(EIO);
82
83 trace_xfs_iomap_enter(ip, offset, count, flags, NULL);
84
85 switch (flags & (BMAPI_READ | BMAPI_WRITE | BMAPI_ALLOCATE)) {
86 case BMAPI_READ:
87 lockmode = xfs_ilock_map_shared(ip);
88 bmapi_flags = XFS_BMAPI_ENTIRE;
89 break;
90 case BMAPI_WRITE:
91 lockmode = XFS_ILOCK_EXCL;
92 if (flags & BMAPI_IGNSTATE)
93 bmapi_flags |= XFS_BMAPI_IGSTATE|XFS_BMAPI_ENTIRE;
94 xfs_ilock(ip, lockmode);
95 break;
96 case BMAPI_ALLOCATE:
97 lockmode = XFS_ILOCK_SHARED;
98 bmapi_flags = XFS_BMAPI_ENTIRE;
99
100 /* Attempt non-blocking lock */
101 if (flags & BMAPI_TRYLOCK) {
102 if (!xfs_ilock_nowait(ip, lockmode))
103 return XFS_ERROR(EAGAIN);
104 } else {
105 xfs_ilock(ip, lockmode);
106 }
107 break;
108 default:
109 BUG();
110 }
111
112 ASSERT(offset <= mp->m_maxioffset);
113 if ((xfs_fsize_t)offset + count > mp->m_maxioffset)
114 count = mp->m_maxioffset - offset;
115 end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
116 offset_fsb = XFS_B_TO_FSBT(mp, offset);
117
118 error = xfs_bmapi(NULL, ip, offset_fsb,
119 (xfs_filblks_t)(end_fsb - offset_fsb),
120 bmapi_flags, NULL, 0, imap,
121 nimaps, NULL);
122
123 if (error)
124 goto out;
125
126 switch (flags & (BMAPI_WRITE|BMAPI_ALLOCATE)) {
127 case BMAPI_WRITE:
128 /* If we found an extent, return it */
129 if (*nimaps &&
130 (imap->br_startblock != HOLESTARTBLOCK) &&
131 (imap->br_startblock != DELAYSTARTBLOCK)) {
132 trace_xfs_iomap_found(ip, offset, count, flags, imap);
133 break;
134 }
135
136 if (flags & BMAPI_DIRECT) {
137 error = xfs_iomap_write_direct(ip, offset, count, flags,
138 imap, nimaps);
139 } else {
140 error = xfs_iomap_write_delay(ip, offset, count, flags,
141 imap, nimaps);
142 }
143 if (!error) {
144 trace_xfs_iomap_alloc(ip, offset, count, flags, imap);
145 }
146 *new = 1;
147 break;
148 case BMAPI_ALLOCATE:
149 /* If we found an extent, return it */
150 xfs_iunlock(ip, lockmode);
151 lockmode = 0;
152
153 if (*nimaps && !isnullstartblock(imap->br_startblock)) {
154 trace_xfs_iomap_found(ip, offset, count, flags, imap);
155 break;
156 }
157
158 error = xfs_iomap_write_allocate(ip, offset, count,
159 imap, nimaps);
160 break;
161 }
162
163 ASSERT(*nimaps <= 1);
164
165out:
166 if (lockmode)
167 xfs_iunlock(ip, lockmode);
168 return XFS_ERROR(error);
169}
170
171STATIC int 52STATIC int
172xfs_iomap_eof_align_last_fsb( 53xfs_iomap_eof_align_last_fsb(
173 xfs_mount_t *mp, 54 xfs_mount_t *mp,
@@ -220,11 +101,11 @@ xfs_iomap_eof_align_last_fsb(
220} 101}
221 102
222STATIC int 103STATIC int
223xfs_cmn_err_fsblock_zero( 104xfs_alert_fsblock_zero(
224 xfs_inode_t *ip, 105 xfs_inode_t *ip,
225 xfs_bmbt_irec_t *imap) 106 xfs_bmbt_irec_t *imap)
226{ 107{
227 xfs_cmn_err(XFS_PTAG_FSBLOCK_ZERO, CE_ALERT, ip->i_mount, 108 xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
228 "Access to block zero in inode %llu " 109 "Access to block zero in inode %llu "
229 "start_block: %llx start_off: %llx " 110 "start_block: %llx start_off: %llx "
230 "blkcnt: %llx extent-state: %x\n", 111 "blkcnt: %llx extent-state: %x\n",
@@ -236,14 +117,13 @@ xfs_cmn_err_fsblock_zero(
236 return EFSCORRUPTED; 117 return EFSCORRUPTED;
237} 118}
238 119
239STATIC int 120int
240xfs_iomap_write_direct( 121xfs_iomap_write_direct(
241 xfs_inode_t *ip, 122 xfs_inode_t *ip,
242 xfs_off_t offset, 123 xfs_off_t offset,
243 size_t count, 124 size_t count,
244 int flags,
245 xfs_bmbt_irec_t *imap, 125 xfs_bmbt_irec_t *imap,
246 int *nmaps) 126 int nmaps)
247{ 127{
248 xfs_mount_t *mp = ip->i_mount; 128 xfs_mount_t *mp = ip->i_mount;
249 xfs_fileoff_t offset_fsb; 129 xfs_fileoff_t offset_fsb;
@@ -279,7 +159,7 @@ xfs_iomap_write_direct(
279 if (error) 159 if (error)
280 goto error_out; 160 goto error_out;
281 } else { 161 } else {
282 if (*nmaps && (imap->br_startblock == HOLESTARTBLOCK)) 162 if (nmaps && (imap->br_startblock == HOLESTARTBLOCK))
283 last_fsb = MIN(last_fsb, (xfs_fileoff_t) 163 last_fsb = MIN(last_fsb, (xfs_fileoff_t)
284 imap->br_blockcount + 164 imap->br_blockcount +
285 imap->br_startoff); 165 imap->br_startoff);
@@ -331,7 +211,7 @@ xfs_iomap_write_direct(
331 xfs_trans_ijoin(tp, ip); 211 xfs_trans_ijoin(tp, ip);
332 212
333 bmapi_flag = XFS_BMAPI_WRITE; 213 bmapi_flag = XFS_BMAPI_WRITE;
334 if ((flags & BMAPI_DIRECT) && (offset < ip->i_size || extsz)) 214 if (offset < ip->i_size || extsz)
335 bmapi_flag |= XFS_BMAPI_PREALLOC; 215 bmapi_flag |= XFS_BMAPI_PREALLOC;
336 216
337 /* 217 /*
@@ -366,11 +246,10 @@ xfs_iomap_write_direct(
366 } 246 }
367 247
368 if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) { 248 if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) {
369 error = xfs_cmn_err_fsblock_zero(ip, imap); 249 error = xfs_alert_fsblock_zero(ip, imap);
370 goto error_out; 250 goto error_out;
371 } 251 }
372 252
373 *nmaps = 1;
374 return 0; 253 return 0;
375 254
376error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */ 255error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
@@ -379,7 +258,6 @@ error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
379 258
380error1: /* Just cancel transaction */ 259error1: /* Just cancel transaction */
381 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); 260 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
382 *nmaps = 0; /* nothing set-up here */
383 261
384error_out: 262error_out:
385 return XFS_ERROR(error); 263 return XFS_ERROR(error);
@@ -389,6 +267,9 @@ error_out:
389 * If the caller is doing a write at the end of the file, then extend the 267 * If the caller is doing a write at the end of the file, then extend the
390 * allocation out to the file system's write iosize. We clean up any extra 268 * allocation out to the file system's write iosize. We clean up any extra
391 * space left over when the file is closed in xfs_inactive(). 269 * space left over when the file is closed in xfs_inactive().
270 *
271 * If we find we already have delalloc preallocation beyond EOF, don't do more
272 * preallocation as it is not needed.
392 */ 273 */
393STATIC int 274STATIC int
394xfs_iomap_eof_want_preallocate( 275xfs_iomap_eof_want_preallocate(
@@ -396,7 +277,6 @@ xfs_iomap_eof_want_preallocate(
396 xfs_inode_t *ip, 277 xfs_inode_t *ip,
397 xfs_off_t offset, 278 xfs_off_t offset,
398 size_t count, 279 size_t count,
399 int ioflag,
400 xfs_bmbt_irec_t *imap, 280 xfs_bmbt_irec_t *imap,
401 int nimaps, 281 int nimaps,
402 int *prealloc) 282 int *prealloc)
@@ -405,6 +285,7 @@ xfs_iomap_eof_want_preallocate(
405 xfs_filblks_t count_fsb; 285 xfs_filblks_t count_fsb;
406 xfs_fsblock_t firstblock; 286 xfs_fsblock_t firstblock;
407 int n, error, imaps; 287 int n, error, imaps;
288 int found_delalloc = 0;
408 289
409 *prealloc = 0; 290 *prealloc = 0;
410 if ((offset + count) <= ip->i_size) 291 if ((offset + count) <= ip->i_size)
@@ -429,20 +310,71 @@ xfs_iomap_eof_want_preallocate(
429 return 0; 310 return 0;
430 start_fsb += imap[n].br_blockcount; 311 start_fsb += imap[n].br_blockcount;
431 count_fsb -= imap[n].br_blockcount; 312 count_fsb -= imap[n].br_blockcount;
313
314 if (imap[n].br_startblock == DELAYSTARTBLOCK)
315 found_delalloc = 1;
432 } 316 }
433 } 317 }
434 *prealloc = 1; 318 if (!found_delalloc)
319 *prealloc = 1;
435 return 0; 320 return 0;
436} 321}
437 322
438STATIC int 323/*
324 * If we don't have a user specified preallocation size, dynamically increase
325 * the preallocation size as the size of the file grows. Cap the maximum size
326 * at a single extent or less if the filesystem is near full. The closer the
327 * filesystem is to full, the smaller the maximum preallocation.
328 */
329STATIC xfs_fsblock_t
330xfs_iomap_prealloc_size(
331 struct xfs_mount *mp,
332 struct xfs_inode *ip)
333{
334 xfs_fsblock_t alloc_blocks = 0;
335
336 if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)) {
337 int shift = 0;
338 int64_t freesp;
339
340 /*
341 * rounddown_pow_of_two() returns an undefined result
342 * if we pass in alloc_blocks = 0. Hence the "+ 1" to
343 * ensure we always pass in a non-zero value.
344 */
345 alloc_blocks = XFS_B_TO_FSB(mp, ip->i_size) + 1;
346 alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN,
347 rounddown_pow_of_two(alloc_blocks));
348
349 xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT);
350 freesp = mp->m_sb.sb_fdblocks;
351 if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) {
352 shift = 2;
353 if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT])
354 shift++;
355 if (freesp < mp->m_low_space[XFS_LOWSP_3_PCNT])
356 shift++;
357 if (freesp < mp->m_low_space[XFS_LOWSP_2_PCNT])
358 shift++;
359 if (freesp < mp->m_low_space[XFS_LOWSP_1_PCNT])
360 shift++;
361 }
362 if (shift)
363 alloc_blocks >>= shift;
364 }
365
366 if (alloc_blocks < mp->m_writeio_blocks)
367 alloc_blocks = mp->m_writeio_blocks;
368
369 return alloc_blocks;
370}
371
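
As a hedged sketch of the sizing policy implemented by xfs_iomap_prealloc_size() above, restated as standalone C: round the file size (in blocks) down to a power of two, halve it once more per low-space threshold crossed, and never go below a floor. The thresholds[] array, min_blocks, and the omission of the single-extent (MAXEXTLEN) cap are simplifications for the example, not the kernel's values.

#include <stdint.h>

/* round v down to a power of two; v must be non-zero */
static uint64_t
rounddown_pow2(uint64_t v)
{
	while (v & (v - 1))
		v &= v - 1;		/* clear the lowest set bit */
	return v;
}

static uint64_t
prealloc_size(uint64_t isize_blocks, uint64_t free_blocks,
	      const uint64_t thresholds[5], uint64_t min_blocks)
{
	/* "+ 1" keeps the argument non-zero, as the comment above explains */
	uint64_t	alloc = rounddown_pow2(isize_blocks + 1);
	int		shift = 0;
	int		i;

	if (free_blocks < thresholds[0]) {	/* below the first (largest) threshold */
		shift = 2;
		for (i = 1; i < 5; i++)
			if (free_blocks < thresholds[i])
				shift++;
	}
	alloc >>= shift;

	return alloc < min_blocks ? min_blocks : alloc;
}
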
372int
439xfs_iomap_write_delay( 373xfs_iomap_write_delay(
440 xfs_inode_t *ip, 374 xfs_inode_t *ip,
441 xfs_off_t offset, 375 xfs_off_t offset,
442 size_t count, 376 size_t count,
443 int ioflag, 377 xfs_bmbt_irec_t *ret_imap)
444 xfs_bmbt_irec_t *ret_imap,
445 int *nmaps)
446{ 378{
447 xfs_mount_t *mp = ip->i_mount; 379 xfs_mount_t *mp = ip->i_mount;
448 xfs_fileoff_t offset_fsb; 380 xfs_fileoff_t offset_fsb;
@@ -469,16 +401,19 @@ xfs_iomap_write_delay(
469 extsz = xfs_get_extsz_hint(ip); 401 extsz = xfs_get_extsz_hint(ip);
470 offset_fsb = XFS_B_TO_FSBT(mp, offset); 402 offset_fsb = XFS_B_TO_FSBT(mp, offset);
471 403
404
472 error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count, 405 error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count,
473 ioflag, imap, XFS_WRITE_IMAPS, &prealloc); 406 imap, XFS_WRITE_IMAPS, &prealloc);
474 if (error) 407 if (error)
475 return error; 408 return error;
476 409
477retry: 410retry:
478 if (prealloc) { 411 if (prealloc) {
412 xfs_fsblock_t alloc_blocks = xfs_iomap_prealloc_size(mp, ip);
413
479 aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1)); 414 aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1));
480 ioalign = XFS_B_TO_FSBT(mp, aligned_offset); 415 ioalign = XFS_B_TO_FSBT(mp, aligned_offset);
481 last_fsb = ioalign + mp->m_writeio_blocks; 416 last_fsb = ioalign + alloc_blocks;
482 } else { 417 } else {
483 last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); 418 last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
484 } 419 }
@@ -496,22 +431,31 @@ retry:
496 XFS_BMAPI_DELAY | XFS_BMAPI_WRITE | 431 XFS_BMAPI_DELAY | XFS_BMAPI_WRITE |
497 XFS_BMAPI_ENTIRE, &firstblock, 1, imap, 432 XFS_BMAPI_ENTIRE, &firstblock, 1, imap,
498 &nimaps, NULL); 433 &nimaps, NULL);
499 if (error && (error != ENOSPC)) 434 switch (error) {
435 case 0:
436 case ENOSPC:
437 case EDQUOT:
438 break;
439 default:
500 return XFS_ERROR(error); 440 return XFS_ERROR(error);
441 }
501 442
502 /* 443 /*
503 * If bmapi returned us nothing, and if we didn't get back EDQUOT, 444 * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. For
504 * then we must have run out of space - flush all other inodes with 445 * ENOSPC, flush all other inodes with delalloc blocks to free up
505 * delalloc blocks and retry without EOF preallocation. 446 * some of the excess reserved metadata space. For both cases, retry
447 * without EOF preallocation.
506 */ 448 */
507 if (nimaps == 0) { 449 if (nimaps == 0) {
508 trace_xfs_delalloc_enospc(ip, offset, count); 450 trace_xfs_delalloc_enospc(ip, offset, count);
509 if (flushed) 451 if (flushed)
510 return XFS_ERROR(ENOSPC); 452 return XFS_ERROR(error ? error : ENOSPC);
511 453
512 xfs_iunlock(ip, XFS_ILOCK_EXCL); 454 if (error == ENOSPC) {
513 xfs_flush_inodes(ip); 455 xfs_iunlock(ip, XFS_ILOCK_EXCL);
514 xfs_ilock(ip, XFS_ILOCK_EXCL); 456 xfs_flush_inodes(ip);
457 xfs_ilock(ip, XFS_ILOCK_EXCL);
458 }
515 459
516 flushed = 1; 460 flushed = 1;
517 error = 0; 461 error = 0;
@@ -520,11 +464,9 @@ retry:
520 } 464 }
521 465
522 if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip))) 466 if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip)))
523 return xfs_cmn_err_fsblock_zero(ip, &imap[0]); 467 return xfs_alert_fsblock_zero(ip, &imap[0]);
524 468
525 *ret_imap = imap[0]; 469 *ret_imap = imap[0];
526 *nmaps = 1;
527
528 return 0; 470 return 0;
529} 471}
530 472
@@ -538,13 +480,12 @@ retry:
538 * We no longer bother to look at the incoming map - all we have to 480 * We no longer bother to look at the incoming map - all we have to
539 * guarantee is that whatever we allocate fills the required range. 481 * guarantee is that whatever we allocate fills the required range.
540 */ 482 */
541STATIC int 483int
542xfs_iomap_write_allocate( 484xfs_iomap_write_allocate(
543 xfs_inode_t *ip, 485 xfs_inode_t *ip,
544 xfs_off_t offset, 486 xfs_off_t offset,
545 size_t count, 487 size_t count,
546 xfs_bmbt_irec_t *imap, 488 xfs_bmbt_irec_t *imap)
547 int *retmap)
548{ 489{
549 xfs_mount_t *mp = ip->i_mount; 490 xfs_mount_t *mp = ip->i_mount;
550 xfs_fileoff_t offset_fsb, last_block; 491 xfs_fileoff_t offset_fsb, last_block;
@@ -557,8 +498,6 @@ xfs_iomap_write_allocate(
557 int error = 0; 498 int error = 0;
558 int nres; 499 int nres;
559 500
560 *retmap = 0;
561
562 /* 501 /*
563 * Make sure that the dquots are there. 502 * Make sure that the dquots are there.
564 */ 503 */
@@ -675,12 +614,11 @@ xfs_iomap_write_allocate(
675 * covers at least part of the callers request 614 * covers at least part of the callers request
676 */ 615 */
677 if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) 616 if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip)))
678 return xfs_cmn_err_fsblock_zero(ip, imap); 617 return xfs_alert_fsblock_zero(ip, imap);
679 618
680 if ((offset_fsb >= imap->br_startoff) && 619 if ((offset_fsb >= imap->br_startoff) &&
681 (offset_fsb < (imap->br_startoff + 620 (offset_fsb < (imap->br_startoff +
682 imap->br_blockcount))) { 621 imap->br_blockcount))) {
683 *retmap = 1;
684 XFS_STATS_INC(xs_xstrat_quick); 622 XFS_STATS_INC(xs_xstrat_quick);
685 return 0; 623 return 0;
686 } 624 }
@@ -786,7 +724,7 @@ xfs_iomap_write_unwritten(
786 return XFS_ERROR(error); 724 return XFS_ERROR(error);
787 725
788 if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip))) 726 if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip)))
789 return xfs_cmn_err_fsblock_zero(ip, &imap); 727 return xfs_alert_fsblock_zero(ip, &imap);
790 728
791 if ((numblks_fsb = imap.br_blockcount) == 0) { 729 if ((numblks_fsb = imap.br_blockcount) == 0) {
792 /* 730 /*
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 7748a430f50d..80615760959a 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -18,30 +18,15 @@
18#ifndef __XFS_IOMAP_H__ 18#ifndef __XFS_IOMAP_H__
19#define __XFS_IOMAP_H__ 19#define __XFS_IOMAP_H__
20 20
21/* base extent manipulation calls */
22#define BMAPI_READ (1 << 0) /* read extents */
23#define BMAPI_WRITE (1 << 1) /* create extents */
24#define BMAPI_ALLOCATE (1 << 2) /* delayed allocate to real extents */
25
26/* modifiers */
27#define BMAPI_IGNSTATE (1 << 4) /* ignore unwritten state on read */
28#define BMAPI_DIRECT (1 << 5) /* direct instead of buffered write */
29#define BMAPI_MMA (1 << 6) /* allocate for mmap write */
30#define BMAPI_TRYLOCK (1 << 7) /* non-blocking request */
31
32#define BMAPI_FLAGS \
33 { BMAPI_READ, "READ" }, \
34 { BMAPI_WRITE, "WRITE" }, \
35 { BMAPI_ALLOCATE, "ALLOCATE" }, \
36 { BMAPI_IGNSTATE, "IGNSTATE" }, \
37 { BMAPI_DIRECT, "DIRECT" }, \
38 { BMAPI_TRYLOCK, "TRYLOCK" }
39
40struct xfs_inode; 21struct xfs_inode;
41struct xfs_bmbt_irec; 22struct xfs_bmbt_irec;
42 23
43extern int xfs_iomap(struct xfs_inode *, xfs_off_t, ssize_t, int, 24extern int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
44 struct xfs_bmbt_irec *, int *, int *); 25 struct xfs_bmbt_irec *, int);
26extern int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t,
27 struct xfs_bmbt_irec *);
28extern int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t,
29 struct xfs_bmbt_irec *);
45extern int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t); 30extern int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t);
46 31
47#endif /* __XFS_IOMAP_H__*/ 32#endif /* __XFS_IOMAP_H__*/
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 7e3626e5925c..751e94fe1f77 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -92,7 +92,8 @@ xfs_bulkstat_one_int(
92 * further change. 92 * further change.
93 */ 93 */
94 buf->bs_nlink = dic->di_nlink; 94 buf->bs_nlink = dic->di_nlink;
95 buf->bs_projid = dic->di_projid; 95 buf->bs_projid_lo = dic->di_projid_lo;
96 buf->bs_projid_hi = dic->di_projid_hi;
96 buf->bs_ino = ino; 97 buf->bs_ino = ino;
97 buf->bs_mode = dic->di_mode; 98 buf->bs_mode = dic->di_mode;
98 buf->bs_uid = dic->di_uid; 99 buf->bs_uid = dic->di_uid;
@@ -203,7 +204,6 @@ xfs_bulkstat(
203 xfs_agi_t *agi; /* agi header data */ 204 xfs_agi_t *agi; /* agi header data */
204 xfs_agino_t agino; /* inode # in allocation group */ 205 xfs_agino_t agino; /* inode # in allocation group */
205 xfs_agnumber_t agno; /* allocation group number */ 206 xfs_agnumber_t agno; /* allocation group number */
206 xfs_daddr_t bno; /* inode cluster start daddr */
207 int chunkidx; /* current index into inode chunk */ 207 int chunkidx; /* current index into inode chunk */
208 int clustidx; /* current index into inode cluster */ 208 int clustidx; /* current index into inode cluster */
209 xfs_btree_cur_t *cur; /* btree cursor for ialloc btree */ 209 xfs_btree_cur_t *cur; /* btree cursor for ialloc btree */
@@ -462,7 +462,6 @@ xfs_bulkstat(
462 mp->m_sb.sb_inopblog); 462 mp->m_sb.sb_inopblog);
463 } 463 }
464 ino = XFS_AGINO_TO_INO(mp, agno, agino); 464 ino = XFS_AGINO_TO_INO(mp, agno, agino);
465 bno = XFS_AGB_TO_DADDR(mp, agno, agbno);
466 /* 465 /*
467 * Skip if this inode is free. 466 * Skip if this inode is free.
468 */ 467 */
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 33f718f92a48..41d5b8f2bf92 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -47,7 +47,7 @@ STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp,
47 xfs_buftarg_t *log_target, 47 xfs_buftarg_t *log_target,
48 xfs_daddr_t blk_offset, 48 xfs_daddr_t blk_offset,
49 int num_bblks); 49 int num_bblks);
50STATIC int xlog_space_left(xlog_t *log, int cycle, int bytes); 50STATIC int xlog_space_left(struct log *log, atomic64_t *head);
51STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog); 51STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog);
52STATIC void xlog_dealloc_log(xlog_t *log); 52STATIC void xlog_dealloc_log(xlog_t *log);
53 53
@@ -70,7 +70,7 @@ STATIC void xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog);
70/* local functions to manipulate grant head */ 70/* local functions to manipulate grant head */
71STATIC int xlog_grant_log_space(xlog_t *log, 71STATIC int xlog_grant_log_space(xlog_t *log,
72 xlog_ticket_t *xtic); 72 xlog_ticket_t *xtic);
73STATIC void xlog_grant_push_ail(xfs_mount_t *mp, 73STATIC void xlog_grant_push_ail(struct log *log,
74 int need_bytes); 74 int need_bytes);
75STATIC void xlog_regrant_reserve_log_space(xlog_t *log, 75STATIC void xlog_regrant_reserve_log_space(xlog_t *log,
76 xlog_ticket_t *ticket); 76 xlog_ticket_t *ticket);
@@ -81,98 +81,73 @@ STATIC void xlog_ungrant_log_space(xlog_t *log,
81 81
82#if defined(DEBUG) 82#if defined(DEBUG)
83STATIC void xlog_verify_dest_ptr(xlog_t *log, char *ptr); 83STATIC void xlog_verify_dest_ptr(xlog_t *log, char *ptr);
84STATIC void xlog_verify_grant_head(xlog_t *log, int equals); 84STATIC void xlog_verify_grant_tail(struct log *log);
85STATIC void xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog, 85STATIC void xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog,
86 int count, boolean_t syncing); 86 int count, boolean_t syncing);
87STATIC void xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog, 87STATIC void xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog,
88 xfs_lsn_t tail_lsn); 88 xfs_lsn_t tail_lsn);
89#else 89#else
90#define xlog_verify_dest_ptr(a,b) 90#define xlog_verify_dest_ptr(a,b)
91#define xlog_verify_grant_head(a,b) 91#define xlog_verify_grant_tail(a)
92#define xlog_verify_iclog(a,b,c,d) 92#define xlog_verify_iclog(a,b,c,d)
93#define xlog_verify_tail_lsn(a,b,c) 93#define xlog_verify_tail_lsn(a,b,c)
94#endif 94#endif
95 95
96STATIC int xlog_iclogs_empty(xlog_t *log); 96STATIC int xlog_iclogs_empty(xlog_t *log);
97 97
98
99static void 98static void
100xlog_ins_ticketq(struct xlog_ticket **qp, struct xlog_ticket *tic) 99xlog_grant_sub_space(
100 struct log *log,
101 atomic64_t *head,
102 int bytes)
101{ 103{
102 if (*qp) { 104 int64_t head_val = atomic64_read(head);
103 tic->t_next = (*qp); 105 int64_t new, old;
104 tic->t_prev = (*qp)->t_prev;
105 (*qp)->t_prev->t_next = tic;
106 (*qp)->t_prev = tic;
107 } else {
108 tic->t_prev = tic->t_next = tic;
109 *qp = tic;
110 }
111 106
112 tic->t_flags |= XLOG_TIC_IN_Q; 107 do {
113} 108 int cycle, space;
114 109
115static void 110 xlog_crack_grant_head_val(head_val, &cycle, &space);
116xlog_del_ticketq(struct xlog_ticket **qp, struct xlog_ticket *tic) 111
117{ 112 space -= bytes;
118 if (tic == tic->t_next) { 113 if (space < 0) {
119 *qp = NULL; 114 space += log->l_logsize;
120 } else { 115 cycle--;
121 *qp = tic->t_next; 116 }
122 tic->t_next->t_prev = tic->t_prev;
123 tic->t_prev->t_next = tic->t_next;
124 }
125 117
126 tic->t_next = tic->t_prev = NULL; 118 old = head_val;
127 tic->t_flags &= ~XLOG_TIC_IN_Q; 119 new = xlog_assign_grant_head_val(cycle, space);
120 head_val = atomic64_cmpxchg(head, old, new);
121 } while (head_val != old);
128} 122}
129 123
130static void 124static void
131xlog_grant_sub_space(struct log *log, int bytes) 125xlog_grant_add_space(
126 struct log *log,
127 atomic64_t *head,
128 int bytes)
132{ 129{
133 log->l_grant_write_bytes -= bytes; 130 int64_t head_val = atomic64_read(head);
134 if (log->l_grant_write_bytes < 0) { 131 int64_t new, old;
135 log->l_grant_write_bytes += log->l_logsize;
136 log->l_grant_write_cycle--;
137 }
138
139 log->l_grant_reserve_bytes -= bytes;
140 if ((log)->l_grant_reserve_bytes < 0) {
141 log->l_grant_reserve_bytes += log->l_logsize;
142 log->l_grant_reserve_cycle--;
143 }
144 132
145} 133 do {
134 int tmp;
135 int cycle, space;
146 136
147static void 137 xlog_crack_grant_head_val(head_val, &cycle, &space);
148xlog_grant_add_space_write(struct log *log, int bytes)
149{
150 int tmp = log->l_logsize - log->l_grant_write_bytes;
151 if (tmp > bytes)
152 log->l_grant_write_bytes += bytes;
153 else {
154 log->l_grant_write_cycle++;
155 log->l_grant_write_bytes = bytes - tmp;
156 }
157}
158 138
159static void 139 tmp = log->l_logsize - space;
160xlog_grant_add_space_reserve(struct log *log, int bytes) 140 if (tmp > bytes)
161{ 141 space += bytes;
162 int tmp = log->l_logsize - log->l_grant_reserve_bytes; 142 else {
163 if (tmp > bytes) 143 space = bytes - tmp;
164 log->l_grant_reserve_bytes += bytes; 144 cycle++;
165 else { 145 }
166 log->l_grant_reserve_cycle++;
167 log->l_grant_reserve_bytes = bytes - tmp;
168 }
169}
170 146
171static inline void 147 old = head_val;
172xlog_grant_add_space(struct log *log, int bytes) 148 new = xlog_assign_grant_head_val(cycle, space);
173{ 149 head_val = atomic64_cmpxchg(head, old, new);
174 xlog_grant_add_space_write(log, bytes); 150 } while (head_val != old);
175 xlog_grant_add_space_reserve(log, bytes);
176} 151}
177 152
178static void 153static void
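
For illustration, a standalone C11 sketch of the lock-free update that replaces the old l_grant_lock-protected arithmetic: cycle and byte offset are packed into one 64-bit word and modified with a compare-and-swap retry loop. The packing layout and helper names here are assumptions for the example, not the kernel's xlog_crack/assign helpers.

#include <stdatomic.h>
#include <stdint.h>

/* pack and crack a (cycle, bytes) pair into one 64-bit word */
static int64_t
grant_pack(int cycle, int bytes)
{
	return ((int64_t)cycle << 32) | (uint32_t)bytes;
}

static void
grant_crack(int64_t val, int *cycle, int *bytes)
{
	*cycle = (int)(val >> 32);
	*bytes = (int)(uint32_t)val;
}

static void
grant_sub_space(_Atomic int64_t *head, int bytes, int logsize)
{
	int64_t old = atomic_load(head);

	for (;;) {
		int cycle, space;

		grant_crack(old, &cycle, &space);
		space -= bytes;
		if (space < 0) {	/* borrow a full log from the cycle count */
			space += logsize;
			cycle--;
		}
		/* on failure 'old' is refreshed with the current value; retry */
		if (atomic_compare_exchange_weak(head, &old,
						 grant_pack(cycle, space)))
			break;
	}
}
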
@@ -355,7 +330,7 @@ xfs_log_reserve(
355 330
356 trace_xfs_log_reserve(log, internal_ticket); 331 trace_xfs_log_reserve(log, internal_ticket);
357 332
358 xlog_grant_push_ail(mp, internal_ticket->t_unit_res); 333 xlog_grant_push_ail(log, internal_ticket->t_unit_res);
359 retval = xlog_regrant_write_log_space(log, internal_ticket); 334 retval = xlog_regrant_write_log_space(log, internal_ticket);
360 } else { 335 } else {
361 /* may sleep if need to allocate more tickets */ 336 /* may sleep if need to allocate more tickets */
@@ -369,7 +344,7 @@ xfs_log_reserve(
369 344
370 trace_xfs_log_reserve(log, internal_ticket); 345 trace_xfs_log_reserve(log, internal_ticket);
371 346
372 xlog_grant_push_ail(mp, 347 xlog_grant_push_ail(log,
373 (internal_ticket->t_unit_res * 348 (internal_ticket->t_unit_res *
374 internal_ticket->t_cnt)); 349 internal_ticket->t_cnt));
375 retval = xlog_grant_log_space(log, internal_ticket); 350 retval = xlog_grant_log_space(log, internal_ticket);
@@ -399,11 +374,10 @@ xfs_log_mount(
399 int error; 374 int error;
400 375
401 if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) 376 if (!(mp->m_flags & XFS_MOUNT_NORECOVERY))
402 cmn_err(CE_NOTE, "XFS mounting filesystem %s", mp->m_fsname); 377 xfs_notice(mp, "Mounting Filesystem");
403 else { 378 else {
404 cmn_err(CE_NOTE, 379 xfs_notice(mp,
405 "!Mounting filesystem \"%s\" in no-recovery mode. Filesystem will be inconsistent.", 380"Mounting filesystem in no-recovery mode. Filesystem will be inconsistent.");
406 mp->m_fsname);
407 ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); 381 ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
408 } 382 }
409 383
@@ -418,7 +392,7 @@ xfs_log_mount(
418 */ 392 */
419 error = xfs_trans_ail_init(mp); 393 error = xfs_trans_ail_init(mp);
420 if (error) { 394 if (error) {
421 cmn_err(CE_WARN, "XFS: AIL initialisation failed: error %d", error); 395 xfs_warn(mp, "AIL initialisation failed: error %d", error);
422 goto out_free_log; 396 goto out_free_log;
423 } 397 }
424 mp->m_log->l_ailp = mp->m_ail; 398 mp->m_log->l_ailp = mp->m_ail;
@@ -438,7 +412,8 @@ xfs_log_mount(
438 if (readonly) 412 if (readonly)
439 mp->m_flags |= XFS_MOUNT_RDONLY; 413 mp->m_flags |= XFS_MOUNT_RDONLY;
440 if (error) { 414 if (error) {
441 cmn_err(CE_WARN, "XFS: log mount/recovery failed: error %d", error); 415 xfs_warn(mp, "log mount/recovery failed: error %d",
416 error);
442 goto out_destroy_ail; 417 goto out_destroy_ail;
443 } 418 }
444 } 419 }
@@ -567,10 +542,8 @@ xfs_log_unmount_write(xfs_mount_t *mp)
567 */ 542 */
568 } 543 }
569 544
570 if (error) { 545 if (error)
571 xfs_fs_cmn_err(CE_ALERT, mp, 546 xfs_alert(mp, "%s: unmount record failed", __func__);
572 "xfs_log_unmount: unmount record failed");
573 }
574 547
575 548
576 spin_lock(&log->l_icloglock); 549 spin_lock(&log->l_icloglock);
@@ -584,8 +557,8 @@ xfs_log_unmount_write(xfs_mount_t *mp)
584 if (!(iclog->ic_state == XLOG_STATE_ACTIVE || 557 if (!(iclog->ic_state == XLOG_STATE_ACTIVE ||
585 iclog->ic_state == XLOG_STATE_DIRTY)) { 558 iclog->ic_state == XLOG_STATE_DIRTY)) {
586 if (!XLOG_FORCED_SHUTDOWN(log)) { 559 if (!XLOG_FORCED_SHUTDOWN(log)) {
587 sv_wait(&iclog->ic_force_wait, PMEM, 560 xlog_wait(&iclog->ic_force_wait,
588 &log->l_icloglock, s); 561 &log->l_icloglock);
589 } else { 562 } else {
590 spin_unlock(&log->l_icloglock); 563 spin_unlock(&log->l_icloglock);
591 } 564 }
@@ -625,8 +598,8 @@ xfs_log_unmount_write(xfs_mount_t *mp)
625 || iclog->ic_state == XLOG_STATE_DIRTY 598 || iclog->ic_state == XLOG_STATE_DIRTY
626 || iclog->ic_state == XLOG_STATE_IOERROR) ) { 599 || iclog->ic_state == XLOG_STATE_IOERROR) ) {
627 600
628 sv_wait(&iclog->ic_force_wait, PMEM, 601 xlog_wait(&iclog->ic_force_wait,
629 &log->l_icloglock, s); 602 &log->l_icloglock);
630 } else { 603 } else {
631 spin_unlock(&log->l_icloglock); 604 spin_unlock(&log->l_icloglock);
632 } 605 }
@@ -703,55 +676,46 @@ xfs_log_move_tail(xfs_mount_t *mp,
703{ 676{
704 xlog_ticket_t *tic; 677 xlog_ticket_t *tic;
705 xlog_t *log = mp->m_log; 678 xlog_t *log = mp->m_log;
706 int need_bytes, free_bytes, cycle, bytes; 679 int need_bytes, free_bytes;
707 680
708 if (XLOG_FORCED_SHUTDOWN(log)) 681 if (XLOG_FORCED_SHUTDOWN(log))
709 return; 682 return;
710 683
711 if (tail_lsn == 0) { 684 if (tail_lsn == 0)
712 /* needed since sync_lsn is 64 bits */ 685 tail_lsn = atomic64_read(&log->l_last_sync_lsn);
713 spin_lock(&log->l_icloglock);
714 tail_lsn = log->l_last_sync_lsn;
715 spin_unlock(&log->l_icloglock);
716 }
717
718 spin_lock(&log->l_grant_lock);
719 686
720 /* Also an invalid lsn. 1 implies that we aren't passing in a valid 687 /* tail_lsn == 1 implies that we weren't passed a valid value. */
721 * tail_lsn. 688 if (tail_lsn != 1)
722 */ 689 atomic64_set(&log->l_tail_lsn, tail_lsn);
723 if (tail_lsn != 1) {
724 log->l_tail_lsn = tail_lsn;
725 }
726 690
727 if ((tic = log->l_write_headq)) { 691 if (!list_empty_careful(&log->l_writeq)) {
728#ifdef DEBUG 692#ifdef DEBUG
729 if (log->l_flags & XLOG_ACTIVE_RECOVERY) 693 if (log->l_flags & XLOG_ACTIVE_RECOVERY)
730 panic("Recovery problem"); 694 panic("Recovery problem");
731#endif 695#endif
732 cycle = log->l_grant_write_cycle; 696 spin_lock(&log->l_grant_write_lock);
733 bytes = log->l_grant_write_bytes; 697 free_bytes = xlog_space_left(log, &log->l_grant_write_head);
734 free_bytes = xlog_space_left(log, cycle, bytes); 698 list_for_each_entry(tic, &log->l_writeq, t_queue) {
735 do {
736 ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV); 699 ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV);
737 700
738 if (free_bytes < tic->t_unit_res && tail_lsn != 1) 701 if (free_bytes < tic->t_unit_res && tail_lsn != 1)
739 break; 702 break;
740 tail_lsn = 0; 703 tail_lsn = 0;
741 free_bytes -= tic->t_unit_res; 704 free_bytes -= tic->t_unit_res;
742 sv_signal(&tic->t_wait); 705 trace_xfs_log_regrant_write_wake_up(log, tic);
743 tic = tic->t_next; 706 wake_up(&tic->t_wait);
744 } while (tic != log->l_write_headq); 707 }
708 spin_unlock(&log->l_grant_write_lock);
745 } 709 }
746 if ((tic = log->l_reserve_headq)) { 710
711 if (!list_empty_careful(&log->l_reserveq)) {
747#ifdef DEBUG 712#ifdef DEBUG
748 if (log->l_flags & XLOG_ACTIVE_RECOVERY) 713 if (log->l_flags & XLOG_ACTIVE_RECOVERY)
749 panic("Recovery problem"); 714 panic("Recovery problem");
750#endif 715#endif
751 cycle = log->l_grant_reserve_cycle; 716 spin_lock(&log->l_grant_reserve_lock);
752 bytes = log->l_grant_reserve_bytes; 717 free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
753 free_bytes = xlog_space_left(log, cycle, bytes); 718 list_for_each_entry(tic, &log->l_reserveq, t_queue) {
754 do {
755 if (tic->t_flags & XLOG_TIC_PERM_RESERV) 719 if (tic->t_flags & XLOG_TIC_PERM_RESERV)
756 need_bytes = tic->t_unit_res*tic->t_cnt; 720 need_bytes = tic->t_unit_res*tic->t_cnt;
757 else 721 else
@@ -760,12 +724,12 @@ xfs_log_move_tail(xfs_mount_t *mp,
760 break; 724 break;
761 tail_lsn = 0; 725 tail_lsn = 0;
762 free_bytes -= need_bytes; 726 free_bytes -= need_bytes;
763 sv_signal(&tic->t_wait); 727 trace_xfs_log_grant_wake_up(log, tic);
764 tic = tic->t_next; 728 wake_up(&tic->t_wait);
765 } while (tic != log->l_reserve_headq); 729 }
730 spin_unlock(&log->l_grant_reserve_lock);
766 } 731 }
767 spin_unlock(&log->l_grant_lock); 732}
768} /* xfs_log_move_tail */
769 733
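
A simplified sketch of the wake-up policy xfs_log_move_tail() now applies to each ticket queue: walk the waiters in order and wake each one while the remaining free space still covers its reservation, stopping at the first that does not fit. The tail_lsn == 1 special case is omitted, and the struct and field names below are placeholders, not the kernel types.

#include <stdbool.h>
#include <stddef.h>

struct waiter {
	struct waiter	*next;
	int		need_bytes;	/* reservation this ticket is waiting for */
	bool		woken;
};

static void
wake_while_space(struct waiter *queue, int free_bytes)
{
	struct waiter	*w;

	for (w = queue; w != NULL; w = w->next) {
		if (free_bytes < w->need_bytes)
			break;			/* preserve queue order: stop here */
		free_bytes -= w->need_bytes;
		w->woken = true;		/* stands in for wake_up(&tic->t_wait) */
	}
}
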
770/* 734/*
771 * Determine if we have a transaction that has gone to disk 735 * Determine if we have a transaction that has gone to disk
@@ -797,7 +761,7 @@ xfs_log_need_covered(xfs_mount_t *mp)
797 break; 761 break;
798 case XLOG_STATE_COVER_NEED: 762 case XLOG_STATE_COVER_NEED:
799 case XLOG_STATE_COVER_NEED2: 763 case XLOG_STATE_COVER_NEED2:
800 if (!xfs_trans_ail_tail(log->l_ailp) && 764 if (!xfs_ail_min_lsn(log->l_ailp) &&
801 xlog_iclogs_empty(log)) { 765 xlog_iclogs_empty(log)) {
802 if (log->l_covered_state == XLOG_STATE_COVER_NEED) 766 if (log->l_covered_state == XLOG_STATE_COVER_NEED)
803 log->l_covered_state = XLOG_STATE_COVER_DONE; 767 log->l_covered_state = XLOG_STATE_COVER_DONE;
@@ -831,23 +795,19 @@ xfs_log_need_covered(xfs_mount_t *mp)
831 * We may be holding the log iclog lock upon entering this routine. 795 * We may be holding the log iclog lock upon entering this routine.
832 */ 796 */
833xfs_lsn_t 797xfs_lsn_t
834xlog_assign_tail_lsn(xfs_mount_t *mp) 798xlog_assign_tail_lsn(
799 struct xfs_mount *mp)
835{ 800{
836 xfs_lsn_t tail_lsn; 801 xfs_lsn_t tail_lsn;
837 xlog_t *log = mp->m_log; 802 struct log *log = mp->m_log;
838 803
839 tail_lsn = xfs_trans_ail_tail(mp->m_ail); 804 tail_lsn = xfs_ail_min_lsn(mp->m_ail);
840 spin_lock(&log->l_grant_lock); 805 if (!tail_lsn)
841 if (tail_lsn != 0) { 806 tail_lsn = atomic64_read(&log->l_last_sync_lsn);
842 log->l_tail_lsn = tail_lsn;
843 } else {
844 tail_lsn = log->l_tail_lsn = log->l_last_sync_lsn;
845 }
846 spin_unlock(&log->l_grant_lock);
847 807
808 atomic64_set(&log->l_tail_lsn, tail_lsn);
848 return tail_lsn; 809 return tail_lsn;
849} /* xlog_assign_tail_lsn */ 810}
850
851 811
852/* 812/*
853 * Return the space in the log between the tail and the head. The head 813 * Return the space in the log between the tail and the head. The head
@@ -864,37 +824,42 @@ xlog_assign_tail_lsn(xfs_mount_t *mp)
864 * result is that we return the size of the log as the amount of space left. 824 * result is that we return the size of the log as the amount of space left.
865 */ 825 */
866STATIC int 826STATIC int
867xlog_space_left(xlog_t *log, int cycle, int bytes) 827xlog_space_left(
868{ 828 struct log *log,
869 int free_bytes; 829 atomic64_t *head)
870 int tail_bytes; 830{
871 int tail_cycle; 831 int free_bytes;
872 832 int tail_bytes;
873 tail_bytes = BBTOB(BLOCK_LSN(log->l_tail_lsn)); 833 int tail_cycle;
874 tail_cycle = CYCLE_LSN(log->l_tail_lsn); 834 int head_cycle;
875 if ((tail_cycle == cycle) && (bytes >= tail_bytes)) { 835 int head_bytes;
876 free_bytes = log->l_logsize - (bytes - tail_bytes); 836
877 } else if ((tail_cycle + 1) < cycle) { 837 xlog_crack_grant_head(head, &head_cycle, &head_bytes);
838 xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_bytes);
839 tail_bytes = BBTOB(tail_bytes);
840 if (tail_cycle == head_cycle && head_bytes >= tail_bytes)
841 free_bytes = log->l_logsize - (head_bytes - tail_bytes);
842 else if (tail_cycle + 1 < head_cycle)
878 return 0; 843 return 0;
879 } else if (tail_cycle < cycle) { 844 else if (tail_cycle < head_cycle) {
880 ASSERT(tail_cycle == (cycle - 1)); 845 ASSERT(tail_cycle == (head_cycle - 1));
881 free_bytes = tail_bytes - bytes; 846 free_bytes = tail_bytes - head_bytes;
882 } else { 847 } else {
883 /* 848 /*
884 * The reservation head is behind the tail. 849 * The reservation head is behind the tail.
885 * In this case we just want to return the size of the 850 * In this case we just want to return the size of the
886 * log as the amount of space left. 851 * log as the amount of space left.
887 */ 852 */
888 xfs_fs_cmn_err(CE_ALERT, log->l_mp, 853 xfs_alert(log->l_mp,
889 "xlog_space_left: head behind tail\n" 854 "xlog_space_left: head behind tail\n"
890 " tail_cycle = %d, tail_bytes = %d\n" 855 " tail_cycle = %d, tail_bytes = %d\n"
891 " GH cycle = %d, GH bytes = %d", 856 " GH cycle = %d, GH bytes = %d",
892 tail_cycle, tail_bytes, cycle, bytes); 857 tail_cycle, tail_bytes, head_cycle, head_bytes);
893 ASSERT(0); 858 ASSERT(0);
894 free_bytes = log->l_logsize; 859 free_bytes = log->l_logsize;
895 } 860 }
896 return free_bytes; 861 return free_bytes;
897} /* xlog_space_left */ 862}
898 863
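
The free-space rule applied by the rewritten xlog_space_left(), restated as a standalone C function over plain cycle/byte pairs; the atomic packing and the alert on the error case are left out of this sketch.

/*
 * free space between the log tail and a grant head, in bytes;
 * logsize is the total length of the (circular) log.
 */
static int
space_left(int logsize, int tail_cycle, int tail_bytes,
	   int head_cycle, int head_bytes)
{
	if (tail_cycle == head_cycle && head_bytes >= tail_bytes)
		return logsize - (head_bytes - tail_bytes);	/* same lap */
	if (tail_cycle + 1 < head_cycle)
		return 0;					/* overrun */
	if (tail_cycle < head_cycle)
		return tail_bytes - head_bytes;			/* head one lap ahead */
	/* head behind tail should not happen; report a full log */
	return logsize;
}
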
899 864
900/* 865/*
@@ -917,19 +882,6 @@ xlog_iodone(xfs_buf_t *bp)
917 l = iclog->ic_log; 882 l = iclog->ic_log;
918 883
919 /* 884 /*
920 * If the _XFS_BARRIER_FAILED flag was set by a lower
921 * layer, it means the underlying device no longer supports
922 * barrier I/O. Warn loudly and turn off barriers.
923 */
924 if (bp->b_flags & _XFS_BARRIER_FAILED) {
925 bp->b_flags &= ~_XFS_BARRIER_FAILED;
926 l->l_mp->m_flags &= ~XFS_MOUNT_BARRIER;
927 xfs_fs_cmn_err(CE_WARN, l->l_mp,
928 "xlog_iodone: Barriers are no longer supported"
929 " by device. Disabling barriers\n");
930 }
931
932 /*
933 * Race to shutdown the filesystem if we see an error. 885 * Race to shutdown the filesystem if we see an error.
934 */ 886 */
935 if (XFS_TEST_ERROR((XFS_BUF_GETERROR(bp)), l->l_mp, 887 if (XFS_TEST_ERROR((XFS_BUF_GETERROR(bp)), l->l_mp,
@@ -1047,7 +999,7 @@ xlog_alloc_log(xfs_mount_t *mp,
1047 999
1048 log = kmem_zalloc(sizeof(xlog_t), KM_MAYFAIL); 1000 log = kmem_zalloc(sizeof(xlog_t), KM_MAYFAIL);
1049 if (!log) { 1001 if (!log) {
1050 xlog_warn("XFS: Log allocation failed: No memory!"); 1002 xfs_warn(mp, "Log allocation failed: No memory!");
1051 goto out; 1003 goto out;
1052 } 1004 }
1053 1005
@@ -1060,35 +1012,39 @@ xlog_alloc_log(xfs_mount_t *mp,
1060 log->l_flags |= XLOG_ACTIVE_RECOVERY; 1012 log->l_flags |= XLOG_ACTIVE_RECOVERY;
1061 1013
1062 log->l_prev_block = -1; 1014 log->l_prev_block = -1;
1063 log->l_tail_lsn = xlog_assign_lsn(1, 0);
1064 /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */ 1015 /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */
1065 log->l_last_sync_lsn = log->l_tail_lsn; 1016 xlog_assign_atomic_lsn(&log->l_tail_lsn, 1, 0);
1017 xlog_assign_atomic_lsn(&log->l_last_sync_lsn, 1, 0);
1066 log->l_curr_cycle = 1; /* 0 is bad since this is initial value */ 1018 log->l_curr_cycle = 1; /* 0 is bad since this is initial value */
1067 log->l_grant_reserve_cycle = 1; 1019 xlog_assign_grant_head(&log->l_grant_reserve_head, 1, 0);
1068 log->l_grant_write_cycle = 1; 1020 xlog_assign_grant_head(&log->l_grant_write_head, 1, 0);
1021 INIT_LIST_HEAD(&log->l_reserveq);
1022 INIT_LIST_HEAD(&log->l_writeq);
1023 spin_lock_init(&log->l_grant_reserve_lock);
1024 spin_lock_init(&log->l_grant_write_lock);
1069 1025
1070 error = EFSCORRUPTED; 1026 error = EFSCORRUPTED;
1071 if (xfs_sb_version_hassector(&mp->m_sb)) { 1027 if (xfs_sb_version_hassector(&mp->m_sb)) {
1072 log2_size = mp->m_sb.sb_logsectlog; 1028 log2_size = mp->m_sb.sb_logsectlog;
1073 if (log2_size < BBSHIFT) { 1029 if (log2_size < BBSHIFT) {
1074 xlog_warn("XFS: Log sector size too small " 1030 xfs_warn(mp, "Log sector size too small (0x%x < 0x%x)",
1075 "(0x%x < 0x%x)", log2_size, BBSHIFT); 1031 log2_size, BBSHIFT);
1076 goto out_free_log; 1032 goto out_free_log;
1077 } 1033 }
1078 1034
1079 log2_size -= BBSHIFT; 1035 log2_size -= BBSHIFT;
1080 if (log2_size > mp->m_sectbb_log) { 1036 if (log2_size > mp->m_sectbb_log) {
1081 xlog_warn("XFS: Log sector size too large " 1037 xfs_warn(mp, "Log sector size too large (0x%x > 0x%x)",
1082 "(0x%x > 0x%x)", log2_size, mp->m_sectbb_log); 1038 log2_size, mp->m_sectbb_log);
1083 goto out_free_log; 1039 goto out_free_log;
1084 } 1040 }
1085 1041
1086 /* for larger sector sizes, must have v2 or external log */ 1042 /* for larger sector sizes, must have v2 or external log */
1087 if (log2_size && log->l_logBBstart > 0 && 1043 if (log2_size && log->l_logBBstart > 0 &&
1088 !xfs_sb_version_haslogv2(&mp->m_sb)) { 1044 !xfs_sb_version_haslogv2(&mp->m_sb)) {
1089 1045 xfs_warn(mp,
1090 xlog_warn("XFS: log sector size (0x%x) invalid " 1046 "log sector size (0x%x) invalid for configuration.",
1091 "for configuration.", log2_size); 1047 log2_size);
1092 goto out_free_log; 1048 goto out_free_log;
1093 } 1049 }
1094 } 1050 }
@@ -1107,8 +1063,7 @@ xlog_alloc_log(xfs_mount_t *mp,
1107 log->l_xbuf = bp; 1063 log->l_xbuf = bp;
1108 1064
1109 spin_lock_init(&log->l_icloglock); 1065 spin_lock_init(&log->l_icloglock);
1110 spin_lock_init(&log->l_grant_lock); 1066 init_waitqueue_head(&log->l_flush_wait);
1111 sv_init(&log->l_flush_wait, 0, "flush_wait");
1112 1067
1113 /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */ 1068 /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */
1114 ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0); 1069 ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0);
@@ -1131,7 +1086,8 @@ xlog_alloc_log(xfs_mount_t *mp,
1131 iclog->ic_prev = prev_iclog; 1086 iclog->ic_prev = prev_iclog;
1132 prev_iclog = iclog; 1087 prev_iclog = iclog;
1133 1088
1134 bp = xfs_buf_get_noaddr(log->l_iclog_size, mp->m_logdev_targp); 1089 bp = xfs_buf_get_uncached(mp->m_logdev_targp,
1090 log->l_iclog_size, 0);
1135 if (!bp) 1091 if (!bp)
1136 goto out_free_iclog; 1092 goto out_free_iclog;
1137 if (!XFS_BUF_CPSEMA(bp)) 1093 if (!XFS_BUF_CPSEMA(bp))
@@ -1163,8 +1119,8 @@ xlog_alloc_log(xfs_mount_t *mp,
1163 1119
1164 ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp)); 1120 ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp));
1165 ASSERT(XFS_BUF_VALUSEMA(iclog->ic_bp) <= 0); 1121 ASSERT(XFS_BUF_VALUSEMA(iclog->ic_bp) <= 0);
1166 sv_init(&iclog->ic_force_wait, SV_DEFAULT, "iclog-force"); 1122 init_waitqueue_head(&iclog->ic_force_wait);
1167 sv_init(&iclog->ic_write_wait, SV_DEFAULT, "iclog-write"); 1123 init_waitqueue_head(&iclog->ic_write_wait);
1168 1124
1169 iclogp = &iclog->ic_next; 1125 iclogp = &iclog->ic_next;
1170 } 1126 }
@@ -1179,15 +1135,11 @@ xlog_alloc_log(xfs_mount_t *mp,
1179out_free_iclog: 1135out_free_iclog:
1180 for (iclog = log->l_iclog; iclog; iclog = prev_iclog) { 1136 for (iclog = log->l_iclog; iclog; iclog = prev_iclog) {
1181 prev_iclog = iclog->ic_next; 1137 prev_iclog = iclog->ic_next;
1182 if (iclog->ic_bp) { 1138 if (iclog->ic_bp)
1183 sv_destroy(&iclog->ic_force_wait);
1184 sv_destroy(&iclog->ic_write_wait);
1185 xfs_buf_free(iclog->ic_bp); 1139 xfs_buf_free(iclog->ic_bp);
1186 }
1187 kmem_free(iclog); 1140 kmem_free(iclog);
1188 } 1141 }
1189 spinlock_destroy(&log->l_icloglock); 1142 spinlock_destroy(&log->l_icloglock);
1190 spinlock_destroy(&log->l_grant_lock);
1191 xfs_buf_free(log->l_xbuf); 1143 xfs_buf_free(log->l_xbuf);
1192out_free_log: 1144out_free_log:
1193 kmem_free(log); 1145 kmem_free(log);
@@ -1235,61 +1187,60 @@ xlog_commit_record(
1235 * water mark. In this manner, we would be creating a low water mark. 1187 * water mark. In this manner, we would be creating a low water mark.
1236 */ 1188 */
1237STATIC void 1189STATIC void
1238xlog_grant_push_ail(xfs_mount_t *mp, 1190xlog_grant_push_ail(
1239 int need_bytes) 1191 struct log *log,
1192 int need_bytes)
1240{ 1193{
1241 xlog_t *log = mp->m_log; /* pointer to the log */ 1194 xfs_lsn_t threshold_lsn = 0;
1242 xfs_lsn_t tail_lsn; /* lsn of the log tail */ 1195 xfs_lsn_t last_sync_lsn;
1243 xfs_lsn_t threshold_lsn = 0; /* lsn we'd like to be at */ 1196 int free_blocks;
1244 int free_blocks; /* free blocks left to write to */ 1197 int free_bytes;
1245 int free_bytes; /* free bytes left to write to */ 1198 int threshold_block;
1246 int threshold_block; /* block in lsn we'd like to be at */ 1199 int threshold_cycle;
1247 int threshold_cycle; /* lsn cycle we'd like to be at */ 1200 int free_threshold;
1248 int free_threshold; 1201
1249 1202 ASSERT(BTOBB(need_bytes) < log->l_logBBsize);
1250 ASSERT(BTOBB(need_bytes) < log->l_logBBsize); 1203
1251 1204 free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
1252 spin_lock(&log->l_grant_lock); 1205 free_blocks = BTOBBT(free_bytes);
1253 free_bytes = xlog_space_left(log, 1206
1254 log->l_grant_reserve_cycle, 1207 /*
1255 log->l_grant_reserve_bytes); 1208 * Set the threshold for the minimum number of free blocks in the
1256 tail_lsn = log->l_tail_lsn; 1209 * log to the maximum of what the caller needs, one quarter of the
1257 free_blocks = BTOBBT(free_bytes); 1210 * log, and 256 blocks.
1258 1211 */
1259 /* 1212 free_threshold = BTOBB(need_bytes);
1260 * Set the threshold for the minimum number of free blocks in the 1213 free_threshold = MAX(free_threshold, (log->l_logBBsize >> 2));
1261 * log to the maximum of what the caller needs, one quarter of the 1214 free_threshold = MAX(free_threshold, 256);
1262 * log, and 256 blocks. 1215 if (free_blocks >= free_threshold)
1263 */ 1216 return;
1264 free_threshold = BTOBB(need_bytes); 1217
1265 free_threshold = MAX(free_threshold, (log->l_logBBsize >> 2)); 1218 xlog_crack_atomic_lsn(&log->l_tail_lsn, &threshold_cycle,
1266 free_threshold = MAX(free_threshold, 256); 1219 &threshold_block);
1267 if (free_blocks < free_threshold) { 1220 threshold_block += free_threshold;
1268 threshold_block = BLOCK_LSN(tail_lsn) + free_threshold;
1269 threshold_cycle = CYCLE_LSN(tail_lsn);
1270 if (threshold_block >= log->l_logBBsize) { 1221 if (threshold_block >= log->l_logBBsize) {
1271 threshold_block -= log->l_logBBsize; 1222 threshold_block -= log->l_logBBsize;
1272 threshold_cycle += 1; 1223 threshold_cycle += 1;
1273 } 1224 }
1274 threshold_lsn = xlog_assign_lsn(threshold_cycle, threshold_block); 1225 threshold_lsn = xlog_assign_lsn(threshold_cycle,
1226 threshold_block);
1227 /*
1228 * Don't pass in an lsn greater than the lsn of the last
1229 * log record known to be on disk. Use a snapshot of the last sync lsn
1230 * so that it doesn't change between the compare and the set.
1231 */
1232 last_sync_lsn = atomic64_read(&log->l_last_sync_lsn);
1233 if (XFS_LSN_CMP(threshold_lsn, last_sync_lsn) > 0)
1234 threshold_lsn = last_sync_lsn;
1275 1235
1276 /* Don't pass in an lsn greater than the lsn of the last 1236 /*
1277 * log record known to be on disk. 1237 * Get the transaction layer to kick the dirty buffers out to
1238 * disk asynchronously. No point in trying to do this if
1239 * the filesystem is shutting down.
1278 */ 1240 */
1279 if (XFS_LSN_CMP(threshold_lsn, log->l_last_sync_lsn) > 0) 1241 if (!XLOG_FORCED_SHUTDOWN(log))
1280 threshold_lsn = log->l_last_sync_lsn; 1242 xfs_ail_push(log->l_ailp, threshold_lsn);
1281 } 1243}
1282 spin_unlock(&log->l_grant_lock);
1283
1284 /*
1285 * Get the transaction layer to kick the dirty buffers out to
1286 * disk asynchronously. No point in trying to do this if
1287 * the filesystem is shutting down.
1288 */
1289 if (threshold_lsn &&
1290 !XLOG_FORCED_SHUTDOWN(log))
1291 xfs_trans_ail_push(log->l_ailp, threshold_lsn);
1292} /* xlog_grant_push_ail */
1293 1244
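
A small sketch of the push-target arithmetic in the rewritten xlog_grant_push_ail(): the threshold is the largest of the caller's need, a quarter of the log, and 256 blocks; it is added to the tail and wrapped into the next cycle if it runs past the end of the log. The clamp against l_last_sync_lsn and the LSN packing are omitted here, and the helper below is an illustration only.

#define MAX2(a, b)	((a) > (b) ? (a) : (b))

/* all values are in basic blocks */
static void
push_target(int log_blocks, int need_blocks,
	    int tail_cycle, int tail_block,
	    int *target_cycle, int *target_block)
{
	int	threshold = MAX2(need_blocks, MAX2(log_blocks >> 2, 256));

	*target_cycle = tail_cycle;
	*target_block = tail_block + threshold;
	if (*target_block >= log_blocks) {	/* wrap into the next cycle */
		*target_block -= log_blocks;
		*target_cycle += 1;
	}
}
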
1294/* 1245/*
1295 * The bdstrat callback function for log bufs. This gives us a central 1246 * The bdstrat callback function for log bufs. This gives us a central
@@ -1309,7 +1260,7 @@ xlog_bdstrat(
1309 if (iclog->ic_state & XLOG_STATE_IOERROR) { 1260 if (iclog->ic_state & XLOG_STATE_IOERROR) {
1310 XFS_BUF_ERROR(bp, EIO); 1261 XFS_BUF_ERROR(bp, EIO);
1311 XFS_BUF_STALE(bp); 1262 XFS_BUF_STALE(bp);
1312 xfs_biodone(bp); 1263 xfs_buf_ioend(bp, 0);
1313 /* 1264 /*
1314 * It would seem logical to return EIO here, but we rely on 1265 * It would seem logical to return EIO here, but we rely on
1315 * the log state machine to propagate I/O errors instead of 1266 * the log state machine to propagate I/O errors instead of
@@ -1384,9 +1335,8 @@ xlog_sync(xlog_t *log,
1384 roundoff < BBTOB(1))); 1335 roundoff < BBTOB(1)));
1385 1336
1386 /* move grant heads by roundoff in sync */ 1337 /* move grant heads by roundoff in sync */
1387 spin_lock(&log->l_grant_lock); 1338 xlog_grant_add_space(log, &log->l_grant_reserve_head, roundoff);
1388 xlog_grant_add_space(log, roundoff); 1339 xlog_grant_add_space(log, &log->l_grant_write_head, roundoff);
1389 spin_unlock(&log->l_grant_lock);
1390 1340
1391 /* put cycle number in every block */ 1341 /* put cycle number in every block */
1392 xlog_pack_data(log, iclog, roundoff); 1342 xlog_pack_data(log, iclog, roundoff);
@@ -1422,8 +1372,17 @@ xlog_sync(xlog_t *log,
1422 XFS_BUF_ASYNC(bp); 1372 XFS_BUF_ASYNC(bp);
1423 bp->b_flags |= XBF_LOG_BUFFER; 1373 bp->b_flags |= XBF_LOG_BUFFER;
1424 1374
1425 if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) 1375 if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) {
1376 /*
1377 * If we have an external log device, flush the data device
1378 * before flushing the log to make sure all meta data
1379 * written back from the AIL actually made it to disk
1380 * before writing out the new log tail LSN in the log buffer.
1381 */
1382 if (log->l_mp->m_logdev_targp != log->l_mp->m_ddev_targp)
1383 xfs_blkdev_issue_flush(log->l_mp->m_ddev_targp);
1426 XFS_BUF_ORDERED(bp); 1384 XFS_BUF_ORDERED(bp);
1385 }
1427 1386
1428 ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); 1387 ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
1429 ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize); 1388 ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize);
@@ -1499,19 +1458,22 @@ xlog_dealloc_log(xlog_t *log)
1499 1458
1500 xlog_cil_destroy(log); 1459 xlog_cil_destroy(log);
1501 1460
1461 /*
1462 * always need to ensure that the extra buffer does not point to memory
1463 * owned by another log buffer before we free it.
1464 */
1465 xfs_buf_set_empty(log->l_xbuf, log->l_iclog_size);
1466 xfs_buf_free(log->l_xbuf);
1467
1502 iclog = log->l_iclog; 1468 iclog = log->l_iclog;
1503 for (i=0; i<log->l_iclog_bufs; i++) { 1469 for (i=0; i<log->l_iclog_bufs; i++) {
1504 sv_destroy(&iclog->ic_force_wait);
1505 sv_destroy(&iclog->ic_write_wait);
1506 xfs_buf_free(iclog->ic_bp); 1470 xfs_buf_free(iclog->ic_bp);
1507 next_iclog = iclog->ic_next; 1471 next_iclog = iclog->ic_next;
1508 kmem_free(iclog); 1472 kmem_free(iclog);
1509 iclog = next_iclog; 1473 iclog = next_iclog;
1510 } 1474 }
1511 spinlock_destroy(&log->l_icloglock); 1475 spinlock_destroy(&log->l_icloglock);
1512 spinlock_destroy(&log->l_grant_lock);
1513 1476
1514 xfs_buf_free(log->l_xbuf);
1515 log->l_mp->m_log = NULL; 1477 log->l_mp->m_log = NULL;
1516 kmem_free(log); 1478 kmem_free(log);
1517} /* xlog_dealloc_log */ 1479} /* xlog_dealloc_log */
@@ -1614,38 +1576,36 @@ xlog_print_tic_res(
1614 "SWAPEXT" 1576 "SWAPEXT"
1615 }; 1577 };
1616 1578
1617 xfs_fs_cmn_err(CE_WARN, mp, 1579 xfs_warn(mp,
1618 "xfs_log_write: reservation summary:\n" 1580 "xfs_log_write: reservation summary:\n"
1619 " trans type = %s (%u)\n" 1581 " trans type = %s (%u)\n"
1620 " unit res = %d bytes\n" 1582 " unit res = %d bytes\n"
1621 " current res = %d bytes\n" 1583 " current res = %d bytes\n"
1622 " total reg = %u bytes (o/flow = %u bytes)\n" 1584 " total reg = %u bytes (o/flow = %u bytes)\n"
1623 " ophdrs = %u (ophdr space = %u bytes)\n" 1585 " ophdrs = %u (ophdr space = %u bytes)\n"
1624 " ophdr + reg = %u bytes\n" 1586 " ophdr + reg = %u bytes\n"
1625 " num regions = %u\n", 1587 " num regions = %u\n",
1626 ((ticket->t_trans_type <= 0 || 1588 ((ticket->t_trans_type <= 0 ||
1627 ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ? 1589 ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ?
1628 "bad-trans-type" : trans_type_str[ticket->t_trans_type-1]), 1590 "bad-trans-type" : trans_type_str[ticket->t_trans_type-1]),
1629 ticket->t_trans_type, 1591 ticket->t_trans_type,
1630 ticket->t_unit_res, 1592 ticket->t_unit_res,
1631 ticket->t_curr_res, 1593 ticket->t_curr_res,
1632 ticket->t_res_arr_sum, ticket->t_res_o_flow, 1594 ticket->t_res_arr_sum, ticket->t_res_o_flow,
1633 ticket->t_res_num_ophdrs, ophdr_spc, 1595 ticket->t_res_num_ophdrs, ophdr_spc,
1634 ticket->t_res_arr_sum + 1596 ticket->t_res_arr_sum +
1635 ticket->t_res_o_flow + ophdr_spc, 1597 ticket->t_res_o_flow + ophdr_spc,
1636 ticket->t_res_num); 1598 ticket->t_res_num);
1637 1599
1638 for (i = 0; i < ticket->t_res_num; i++) { 1600 for (i = 0; i < ticket->t_res_num; i++) {
1639 uint r_type = ticket->t_res_arr[i].r_type; 1601 uint r_type = ticket->t_res_arr[i].r_type;
1640 cmn_err(CE_WARN, 1602 xfs_warn(mp, "region[%u]: %s - %u bytes\n", i,
1641 "region[%u]: %s - %u bytes\n",
1642 i,
1643 ((r_type <= 0 || r_type > XLOG_REG_TYPE_MAX) ? 1603 ((r_type <= 0 || r_type > XLOG_REG_TYPE_MAX) ?
1644 "bad-rtype" : res_type_str[r_type-1]), 1604 "bad-rtype" : res_type_str[r_type-1]),
1645 ticket->t_res_arr[i].r_len); 1605 ticket->t_res_arr[i].r_len);
1646 } 1606 }
1647 1607
1648 xfs_cmn_err(XFS_PTAG_LOGRES, CE_ALERT, mp, 1608 xfs_alert_tag(mp, XFS_PTAG_LOGRES,
1649 "xfs_log_write: reservation ran out. Need to up reservation"); 1609 "xfs_log_write: reservation ran out. Need to up reservation");
1650 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 1610 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
1651} 1611}
@@ -1733,7 +1693,7 @@ xlog_write_setup_ophdr(
1733 case XFS_LOG: 1693 case XFS_LOG:
1734 break; 1694 break;
1735 default: 1695 default:
1736 xfs_fs_cmn_err(CE_WARN, log->l_mp, 1696 xfs_warn(log->l_mp,
1737 "Bad XFS transaction clientid 0x%x in ticket 0x%p", 1697 "Bad XFS transaction clientid 0x%x in ticket 0x%p",
1738 ophdr->oh_clientid, ticket); 1698 ophdr->oh_clientid, ticket);
1739 return NULL; 1699 return NULL;
@@ -2244,7 +2204,7 @@ xlog_state_do_callback(
2244 lowest_lsn = xlog_get_lowest_lsn(log); 2204 lowest_lsn = xlog_get_lowest_lsn(log);
2245 if (lowest_lsn && 2205 if (lowest_lsn &&
2246 XFS_LSN_CMP(lowest_lsn, 2206 XFS_LSN_CMP(lowest_lsn,
2247 be64_to_cpu(iclog->ic_header.h_lsn)) < 0) { 2207 be64_to_cpu(iclog->ic_header.h_lsn)) < 0) {
2248 iclog = iclog->ic_next; 2208 iclog = iclog->ic_next;
2249 continue; /* Leave this iclog for 2209 continue; /* Leave this iclog for
2250 * another thread */ 2210 * another thread */
@@ -2252,23 +2212,21 @@ xlog_state_do_callback(
2252 2212
2253 iclog->ic_state = XLOG_STATE_CALLBACK; 2213 iclog->ic_state = XLOG_STATE_CALLBACK;
2254 2214
2255 spin_unlock(&log->l_icloglock);
2256 2215
2257 /* l_last_sync_lsn field protected by 2216 /*
2258 * l_grant_lock. Don't worry about iclog's lsn. 2217 * update the last_sync_lsn before we drop the
2259 * No one else can be here except us. 2218 * icloglock to ensure we are the only one that
2219 * can update it.
2260 */ 2220 */
2261 spin_lock(&log->l_grant_lock); 2221 ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn),
2262 ASSERT(XFS_LSN_CMP(log->l_last_sync_lsn, 2222 be64_to_cpu(iclog->ic_header.h_lsn)) <= 0);
2263 be64_to_cpu(iclog->ic_header.h_lsn)) <= 0); 2223 atomic64_set(&log->l_last_sync_lsn,
2264 log->l_last_sync_lsn = 2224 be64_to_cpu(iclog->ic_header.h_lsn));
2265 be64_to_cpu(iclog->ic_header.h_lsn);
2266 spin_unlock(&log->l_grant_lock);
2267 2225
2268 } else { 2226 } else
2269 spin_unlock(&log->l_icloglock);
2270 ioerrors++; 2227 ioerrors++;
2271 } 2228
2229 spin_unlock(&log->l_icloglock);
2272 2230
2273 /* 2231 /*
2274 * Keep processing entries in the callback list until 2232 * Keep processing entries in the callback list until
@@ -2309,7 +2267,7 @@ xlog_state_do_callback(
2309 xlog_state_clean_log(log); 2267 xlog_state_clean_log(log);
2310 2268
2311 /* wake up threads waiting in xfs_log_force() */ 2269 /* wake up threads waiting in xfs_log_force() */
2312 sv_broadcast(&iclog->ic_force_wait); 2270 wake_up_all(&iclog->ic_force_wait);
2313 2271
2314 iclog = iclog->ic_next; 2272 iclog = iclog->ic_next;
2315 } while (first_iclog != iclog); 2273 } while (first_iclog != iclog);
@@ -2317,7 +2275,7 @@ xlog_state_do_callback(
2317 if (repeats > 5000) { 2275 if (repeats > 5000) {
2318 flushcnt += repeats; 2276 flushcnt += repeats;
2319 repeats = 0; 2277 repeats = 0;
2320 xfs_fs_cmn_err(CE_WARN, log->l_mp, 2278 xfs_warn(log->l_mp,
2321 "%s: possible infinite loop (%d iterations)", 2279 "%s: possible infinite loop (%d iterations)",
2322 __func__, flushcnt); 2280 __func__, flushcnt);
2323 } 2281 }
@@ -2356,7 +2314,7 @@ xlog_state_do_callback(
2356 spin_unlock(&log->l_icloglock); 2314 spin_unlock(&log->l_icloglock);
2357 2315
2358 if (wake) 2316 if (wake)
2359 sv_broadcast(&log->l_flush_wait); 2317 wake_up_all(&log->l_flush_wait);
2360} 2318}
2361 2319
2362 2320
@@ -2407,7 +2365,7 @@ xlog_state_done_syncing(
2407 * iclog buffer, we wake them all, one will get to do the 2365 * iclog buffer, we wake them all, one will get to do the
2408 * I/O, the others get to wait for the result. 2366 * I/O, the others get to wait for the result.
2409 */ 2367 */
2410 sv_broadcast(&iclog->ic_write_wait); 2368 wake_up_all(&iclog->ic_write_wait);
2411 spin_unlock(&log->l_icloglock); 2369 spin_unlock(&log->l_icloglock);
2412 xlog_state_do_callback(log, aborted, iclog); /* also cleans log */ 2370 xlog_state_do_callback(log, aborted, iclog); /* also cleans log */
2413} /* xlog_state_done_syncing */ 2371} /* xlog_state_done_syncing */
@@ -2456,7 +2414,7 @@ restart:
2456 XFS_STATS_INC(xs_log_noiclogs); 2414 XFS_STATS_INC(xs_log_noiclogs);
2457 2415
2458 /* Wait for log writes to have flushed */ 2416 /* Wait for log writes to have flushed */
2459 sv_wait(&log->l_flush_wait, 0, &log->l_icloglock, 0); 2417 xlog_wait(&log->l_flush_wait, &log->l_icloglock);
2460 goto restart; 2418 goto restart;
2461 } 2419 }
2462 2420
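The sv_wait()/sv_broadcast() pairs in this file are converted to plain Linux wait queues. The xlog_wait() helper used in the new code above lives in xfs_log_priv.h rather than in this hunk; roughly, it queues the caller exclusively and drops the supplied spinlock before sleeping (a sketch for context, not part of this hunk — callers retake the lock or restart themselves afterwards):

    static inline void
    xlog_wait(wait_queue_head_t *wq, spinlock_t *lock)
            __releases(lock)
    {
            DECLARE_WAITQUEUE(wait, current);

            /* queue exclusively so a wake_up() only wakes one waiter */
            add_wait_queue_exclusive(wq, &wait);
            __set_current_state(TASK_UNINTERRUPTIBLE);
            spin_unlock(lock);
            schedule();
            remove_wait_queue(wq, &wait);
    }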
@@ -2539,6 +2497,18 @@ restart:
2539 * 2497 *
2540 * Once a ticket gets put onto the reserveq, it will only return after 2498 * Once a ticket gets put onto the reserveq, it will only return after
2541 * the needed reservation is satisfied. 2499 * the needed reservation is satisfied.
2500 *
2501 * This function is structured so that it has a lock free fast path. This is
2502 * necessary because every new transaction reservation will come through this
2503 * path. Hence any lock will be globally hot if we take it unconditionally on
2504 * every pass.
2505 *
2506 * As tickets are only ever moved on and off the reserveq under the
2507 * l_grant_reserve_lock, we only need to take that lock if we are going
2508 * to add the ticket to the queue and sleep. We can avoid taking the lock if the
2509 * ticket was never added to the reserveq because the t_queue list head will be
2510 * empty and we hold the only reference to it so it can safely be checked
2511 * unlocked.
2542 */ 2512 */
2543STATIC int 2513STATIC int
2544xlog_grant_log_space(xlog_t *log, 2514xlog_grant_log_space(xlog_t *log,
@@ -2546,24 +2516,27 @@ xlog_grant_log_space(xlog_t *log,
2546{ 2516{
2547 int free_bytes; 2517 int free_bytes;
2548 int need_bytes; 2518 int need_bytes;
2549#ifdef DEBUG
2550 xfs_lsn_t tail_lsn;
2551#endif
2552
2553 2519
2554#ifdef DEBUG 2520#ifdef DEBUG
2555 if (log->l_flags & XLOG_ACTIVE_RECOVERY) 2521 if (log->l_flags & XLOG_ACTIVE_RECOVERY)
2556 panic("grant Recovery problem"); 2522 panic("grant Recovery problem");
2557#endif 2523#endif
2558 2524
2559 /* Is there space or do we need to sleep? */
2560 spin_lock(&log->l_grant_lock);
2561
2562 trace_xfs_log_grant_enter(log, tic); 2525 trace_xfs_log_grant_enter(log, tic);
2563 2526
2527 need_bytes = tic->t_unit_res;
2528 if (tic->t_flags & XFS_LOG_PERM_RESERV)
2529 need_bytes *= tic->t_ocnt;
2530
2564 /* something is already sleeping; insert new transaction at end */ 2531 /* something is already sleeping; insert new transaction at end */
2565 if (log->l_reserve_headq) { 2532 if (!list_empty_careful(&log->l_reserveq)) {
2566 xlog_ins_ticketq(&log->l_reserve_headq, tic); 2533 spin_lock(&log->l_grant_reserve_lock);
2534 /* recheck the queue now we are locked */
2535 if (list_empty(&log->l_reserveq)) {
2536 spin_unlock(&log->l_grant_reserve_lock);
2537 goto redo;
2538 }
2539 list_add_tail(&tic->t_queue, &log->l_reserveq);
2567 2540
2568 trace_xfs_log_grant_sleep1(log, tic); 2541 trace_xfs_log_grant_sleep1(log, tic);
2569 2542
@@ -2575,72 +2548,57 @@ xlog_grant_log_space(xlog_t *log,
2575 goto error_return; 2548 goto error_return;
2576 2549
2577 XFS_STATS_INC(xs_sleep_logspace); 2550 XFS_STATS_INC(xs_sleep_logspace);
2578 sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s); 2551 xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock);
2552
2579 /* 2553 /*
2580 * If we got an error, and the filesystem is shutting down, 2554 * If we got an error, and the filesystem is shutting down,
2581 * we'll catch it down below. So just continue... 2555 * we'll catch it down below. So just continue...
2582 */ 2556 */
2583 trace_xfs_log_grant_wake1(log, tic); 2557 trace_xfs_log_grant_wake1(log, tic);
2584 spin_lock(&log->l_grant_lock);
2585 } 2558 }
2586 if (tic->t_flags & XFS_LOG_PERM_RESERV)
2587 need_bytes = tic->t_unit_res*tic->t_ocnt;
2588 else
2589 need_bytes = tic->t_unit_res;
2590 2559
2591redo: 2560redo:
2592 if (XLOG_FORCED_SHUTDOWN(log)) 2561 if (XLOG_FORCED_SHUTDOWN(log))
2593 goto error_return; 2562 goto error_return_unlocked;
2594 2563
2595 free_bytes = xlog_space_left(log, log->l_grant_reserve_cycle, 2564 free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
2596 log->l_grant_reserve_bytes);
2597 if (free_bytes < need_bytes) { 2565 if (free_bytes < need_bytes) {
2598 if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) 2566 spin_lock(&log->l_grant_reserve_lock);
2599 xlog_ins_ticketq(&log->l_reserve_headq, tic); 2567 if (list_empty(&tic->t_queue))
2568 list_add_tail(&tic->t_queue, &log->l_reserveq);
2600 2569
2601 trace_xfs_log_grant_sleep2(log, tic); 2570 trace_xfs_log_grant_sleep2(log, tic);
2602 2571
2603 spin_unlock(&log->l_grant_lock);
2604 xlog_grant_push_ail(log->l_mp, need_bytes);
2605 spin_lock(&log->l_grant_lock);
2606
2607 XFS_STATS_INC(xs_sleep_logspace);
2608 sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s);
2609
2610 spin_lock(&log->l_grant_lock);
2611 if (XLOG_FORCED_SHUTDOWN(log)) 2572 if (XLOG_FORCED_SHUTDOWN(log))
2612 goto error_return; 2573 goto error_return;
2613 2574
2614 trace_xfs_log_grant_wake2(log, tic); 2575 xlog_grant_push_ail(log, need_bytes);
2576
2577 XFS_STATS_INC(xs_sleep_logspace);
2578 xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock);
2615 2579
2580 trace_xfs_log_grant_wake2(log, tic);
2616 goto redo; 2581 goto redo;
2617 } else if (tic->t_flags & XLOG_TIC_IN_Q) 2582 }
2618 xlog_del_ticketq(&log->l_reserve_headq, tic);
2619 2583
2620 /* we've got enough space */ 2584 if (!list_empty(&tic->t_queue)) {
2621 xlog_grant_add_space(log, need_bytes); 2585 spin_lock(&log->l_grant_reserve_lock);
2622#ifdef DEBUG 2586 list_del_init(&tic->t_queue);
2623 tail_lsn = log->l_tail_lsn; 2587 spin_unlock(&log->l_grant_reserve_lock);
2624 /*
2625 * Check to make sure the grant write head didn't just over lap the
2626 * tail. If the cycles are the same, we can't be overlapping.
2627 * Otherwise, make sure that the cycles differ by exactly one and
2628 * check the byte count.
2629 */
2630 if (CYCLE_LSN(tail_lsn) != log->l_grant_write_cycle) {
2631 ASSERT(log->l_grant_write_cycle-1 == CYCLE_LSN(tail_lsn));
2632 ASSERT(log->l_grant_write_bytes <= BBTOB(BLOCK_LSN(tail_lsn)));
2633 } 2588 }
2634#endif 2589
2590 /* we've got enough space */
2591 xlog_grant_add_space(log, &log->l_grant_reserve_head, need_bytes);
2592 xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes);
2635 trace_xfs_log_grant_exit(log, tic); 2593 trace_xfs_log_grant_exit(log, tic);
2636 xlog_verify_grant_head(log, 1); 2594 xlog_verify_grant_tail(log);
2637 spin_unlock(&log->l_grant_lock);
2638 return 0; 2595 return 0;
2639 2596
2640 error_return: 2597error_return_unlocked:
2641 if (tic->t_flags & XLOG_TIC_IN_Q) 2598 spin_lock(&log->l_grant_reserve_lock);
2642 xlog_del_ticketq(&log->l_reserve_headq, tic); 2599error_return:
2643 2600 list_del_init(&tic->t_queue);
2601 spin_unlock(&log->l_grant_reserve_lock);
2644 trace_xfs_log_grant_error(log, tic); 2602 trace_xfs_log_grant_error(log, tic);
2645 2603
2646 /* 2604 /*
@@ -2650,7 +2608,6 @@ redo:
2650 */ 2608 */
2651 tic->t_curr_res = 0; 2609 tic->t_curr_res = 0;
2652 tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ 2610 tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
2653 spin_unlock(&log->l_grant_lock);
2654 return XFS_ERROR(EIO); 2611 return XFS_ERROR(EIO);
2655} /* xlog_grant_log_space */ 2612} /* xlog_grant_log_space */
2656 2613
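The long comment at the top of xlog_grant_log_space() describes why the common path takes no lock: the reserve queue is only modified under l_grant_reserve_lock, so an unlocked list_empty_careful() peek is safe, and the lock is only taken when the ticket actually has to be queued. Stripped of the XFS details, the idiom looks like this sketch (hypothetical structure and names, not the code above; the grant head here is a plain byte counter rather than a cycle/space pair):

    struct resq {
            spinlock_t              lock;           /* protects waiters */
            struct list_head        waiters;        /* queued reservations */
            atomic64_t              free_bytes;     /* updated locklessly */
    };

    /* Fast path: succeed without the lock if nobody is queued ahead of us. */
    static bool try_fast_reserve(struct resq *q, int64_t need)
    {
            if (!list_empty_careful(&q->waiters))
                    return false;   /* be fair: queued waiters go first */

            if (atomic64_sub_return(need, &q->free_bytes) < 0) {
                    atomic64_add(need, &q->free_bytes);     /* lost the race, undo */
                    return false;   /* caller takes q->lock, queues and sleeps */
            }
            return true;
    }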
@@ -2658,17 +2615,14 @@ redo:
2658/* 2615/*
2659 * Replenish the byte reservation required by moving the grant write head. 2616 * Replenish the byte reservation required by moving the grant write head.
2660 * 2617 *
2661 * 2618 * Similar to xlog_grant_log_space, the function is structured to have a lock
2619 * free fast path.
2662 */ 2620 */
2663STATIC int 2621STATIC int
2664xlog_regrant_write_log_space(xlog_t *log, 2622xlog_regrant_write_log_space(xlog_t *log,
2665 xlog_ticket_t *tic) 2623 xlog_ticket_t *tic)
2666{ 2624{
2667 int free_bytes, need_bytes; 2625 int free_bytes, need_bytes;
2668 xlog_ticket_t *ntic;
2669#ifdef DEBUG
2670 xfs_lsn_t tail_lsn;
2671#endif
2672 2626
2673 tic->t_curr_res = tic->t_unit_res; 2627 tic->t_curr_res = tic->t_unit_res;
2674 xlog_tic_reset_res(tic); 2628 xlog_tic_reset_res(tic);
@@ -2681,12 +2635,9 @@ xlog_regrant_write_log_space(xlog_t *log,
2681 panic("regrant Recovery problem"); 2635 panic("regrant Recovery problem");
2682#endif 2636#endif
2683 2637
2684 spin_lock(&log->l_grant_lock);
2685
2686 trace_xfs_log_regrant_write_enter(log, tic); 2638 trace_xfs_log_regrant_write_enter(log, tic);
2687
2688 if (XLOG_FORCED_SHUTDOWN(log)) 2639 if (XLOG_FORCED_SHUTDOWN(log))
2689 goto error_return; 2640 goto error_return_unlocked;
2690 2641
2691 /* If there are other waiters on the queue then give them a 2642 /* If there are other waiters on the queue then give them a
2692 * chance at logspace before us. Wake up the first waiters, 2643 * chance at logspace before us. Wake up the first waiters,
@@ -2695,92 +2646,76 @@ xlog_regrant_write_log_space(xlog_t *log,
2695 * this transaction. 2646 * this transaction.
2696 */ 2647 */
2697 need_bytes = tic->t_unit_res; 2648 need_bytes = tic->t_unit_res;
2698 if ((ntic = log->l_write_headq)) { 2649 if (!list_empty_careful(&log->l_writeq)) {
2699 free_bytes = xlog_space_left(log, log->l_grant_write_cycle, 2650 struct xlog_ticket *ntic;
2700 log->l_grant_write_bytes); 2651
2701 do { 2652 spin_lock(&log->l_grant_write_lock);
2653 free_bytes = xlog_space_left(log, &log->l_grant_write_head);
2654 list_for_each_entry(ntic, &log->l_writeq, t_queue) {
2702 ASSERT(ntic->t_flags & XLOG_TIC_PERM_RESERV); 2655 ASSERT(ntic->t_flags & XLOG_TIC_PERM_RESERV);
2703 2656
2704 if (free_bytes < ntic->t_unit_res) 2657 if (free_bytes < ntic->t_unit_res)
2705 break; 2658 break;
2706 free_bytes -= ntic->t_unit_res; 2659 free_bytes -= ntic->t_unit_res;
2707 sv_signal(&ntic->t_wait); 2660 wake_up(&ntic->t_wait);
2708 ntic = ntic->t_next; 2661 }
2709 } while (ntic != log->l_write_headq);
2710
2711 if (ntic != log->l_write_headq) {
2712 if ((tic->t_flags & XLOG_TIC_IN_Q) == 0)
2713 xlog_ins_ticketq(&log->l_write_headq, tic);
2714 2662
2663 if (ntic != list_first_entry(&log->l_writeq,
2664 struct xlog_ticket, t_queue)) {
2665 if (list_empty(&tic->t_queue))
2666 list_add_tail(&tic->t_queue, &log->l_writeq);
2715 trace_xfs_log_regrant_write_sleep1(log, tic); 2667 trace_xfs_log_regrant_write_sleep1(log, tic);
2716 2668
2717 spin_unlock(&log->l_grant_lock); 2669 xlog_grant_push_ail(log, need_bytes);
2718 xlog_grant_push_ail(log->l_mp, need_bytes);
2719 spin_lock(&log->l_grant_lock);
2720 2670
2721 XFS_STATS_INC(xs_sleep_logspace); 2671 XFS_STATS_INC(xs_sleep_logspace);
2722 sv_wait(&tic->t_wait, PINOD|PLTWAIT, 2672 xlog_wait(&tic->t_wait, &log->l_grant_write_lock);
2723 &log->l_grant_lock, s);
2724
2725 /* If we're shutting down, this tic is already
2726 * off the queue */
2727 spin_lock(&log->l_grant_lock);
2728 if (XLOG_FORCED_SHUTDOWN(log))
2729 goto error_return;
2730
2731 trace_xfs_log_regrant_write_wake1(log, tic); 2673 trace_xfs_log_regrant_write_wake1(log, tic);
2732 } 2674 } else
2675 spin_unlock(&log->l_grant_write_lock);
2733 } 2676 }
2734 2677
2735redo: 2678redo:
2736 if (XLOG_FORCED_SHUTDOWN(log)) 2679 if (XLOG_FORCED_SHUTDOWN(log))
2737 goto error_return; 2680 goto error_return_unlocked;
2738 2681
2739 free_bytes = xlog_space_left(log, log->l_grant_write_cycle, 2682 free_bytes = xlog_space_left(log, &log->l_grant_write_head);
2740 log->l_grant_write_bytes);
2741 if (free_bytes < need_bytes) { 2683 if (free_bytes < need_bytes) {
2742 if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) 2684 spin_lock(&log->l_grant_write_lock);
2743 xlog_ins_ticketq(&log->l_write_headq, tic); 2685 if (list_empty(&tic->t_queue))
2744 spin_unlock(&log->l_grant_lock); 2686 list_add_tail(&tic->t_queue, &log->l_writeq);
2745 xlog_grant_push_ail(log->l_mp, need_bytes);
2746 spin_lock(&log->l_grant_lock);
2747 2687
2748 XFS_STATS_INC(xs_sleep_logspace);
2749 trace_xfs_log_regrant_write_sleep2(log, tic);
2750
2751 sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s);
2752
2753 /* If we're shutting down, this tic is already off the queue */
2754 spin_lock(&log->l_grant_lock);
2755 if (XLOG_FORCED_SHUTDOWN(log)) 2688 if (XLOG_FORCED_SHUTDOWN(log))
2756 goto error_return; 2689 goto error_return;
2757 2690
2691 xlog_grant_push_ail(log, need_bytes);
2692
2693 XFS_STATS_INC(xs_sleep_logspace);
2694 trace_xfs_log_regrant_write_sleep2(log, tic);
2695 xlog_wait(&tic->t_wait, &log->l_grant_write_lock);
2696
2758 trace_xfs_log_regrant_write_wake2(log, tic); 2697 trace_xfs_log_regrant_write_wake2(log, tic);
2759 goto redo; 2698 goto redo;
2760 } else if (tic->t_flags & XLOG_TIC_IN_Q) 2699 }
2761 xlog_del_ticketq(&log->l_write_headq, tic);
2762 2700
2763 /* we've got enough space */ 2701 if (!list_empty(&tic->t_queue)) {
2764 xlog_grant_add_space_write(log, need_bytes); 2702 spin_lock(&log->l_grant_write_lock);
2765#ifdef DEBUG 2703 list_del_init(&tic->t_queue);
2766 tail_lsn = log->l_tail_lsn; 2704 spin_unlock(&log->l_grant_write_lock);
2767 if (CYCLE_LSN(tail_lsn) != log->l_grant_write_cycle) {
2768 ASSERT(log->l_grant_write_cycle-1 == CYCLE_LSN(tail_lsn));
2769 ASSERT(log->l_grant_write_bytes <= BBTOB(BLOCK_LSN(tail_lsn)));
2770 } 2705 }
2771#endif
2772 2706
2707 /* we've got enough space */
2708 xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes);
2773 trace_xfs_log_regrant_write_exit(log, tic); 2709 trace_xfs_log_regrant_write_exit(log, tic);
2774 2710 xlog_verify_grant_tail(log);
2775 xlog_verify_grant_head(log, 1);
2776 spin_unlock(&log->l_grant_lock);
2777 return 0; 2711 return 0;
2778 2712
2779 2713
2714 error_return_unlocked:
2715 spin_lock(&log->l_grant_write_lock);
2780 error_return: 2716 error_return:
2781 if (tic->t_flags & XLOG_TIC_IN_Q) 2717 list_del_init(&tic->t_queue);
2782 xlog_del_ticketq(&log->l_reserve_headq, tic); 2718 spin_unlock(&log->l_grant_write_lock);
2783
2784 trace_xfs_log_regrant_write_error(log, tic); 2719 trace_xfs_log_regrant_write_error(log, tic);
2785 2720
2786 /* 2721 /*
@@ -2790,7 +2725,6 @@ redo:
2790 */ 2725 */
2791 tic->t_curr_res = 0; 2726 tic->t_curr_res = 0;
2792 tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ 2727 tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
2793 spin_unlock(&log->l_grant_lock);
2794 return XFS_ERROR(EIO); 2728 return XFS_ERROR(EIO);
2795} /* xlog_regrant_write_log_space */ 2729} /* xlog_regrant_write_log_space */
2796 2730
@@ -2811,27 +2745,24 @@ xlog_regrant_reserve_log_space(xlog_t *log,
2811 if (ticket->t_cnt > 0) 2745 if (ticket->t_cnt > 0)
2812 ticket->t_cnt--; 2746 ticket->t_cnt--;
2813 2747
2814 spin_lock(&log->l_grant_lock); 2748 xlog_grant_sub_space(log, &log->l_grant_reserve_head,
2815 xlog_grant_sub_space(log, ticket->t_curr_res); 2749 ticket->t_curr_res);
2750 xlog_grant_sub_space(log, &log->l_grant_write_head,
2751 ticket->t_curr_res);
2816 ticket->t_curr_res = ticket->t_unit_res; 2752 ticket->t_curr_res = ticket->t_unit_res;
2817 xlog_tic_reset_res(ticket); 2753 xlog_tic_reset_res(ticket);
2818 2754
2819 trace_xfs_log_regrant_reserve_sub(log, ticket); 2755 trace_xfs_log_regrant_reserve_sub(log, ticket);
2820 2756
2821 xlog_verify_grant_head(log, 1);
2822
2823 /* just return if we still have some of the pre-reserved space */ 2757 /* just return if we still have some of the pre-reserved space */
2824 if (ticket->t_cnt > 0) { 2758 if (ticket->t_cnt > 0)
2825 spin_unlock(&log->l_grant_lock);
2826 return; 2759 return;
2827 }
2828 2760
2829 xlog_grant_add_space_reserve(log, ticket->t_unit_res); 2761 xlog_grant_add_space(log, &log->l_grant_reserve_head,
2762 ticket->t_unit_res);
2830 2763
2831 trace_xfs_log_regrant_reserve_exit(log, ticket); 2764 trace_xfs_log_regrant_reserve_exit(log, ticket);
2832 2765
2833 xlog_verify_grant_head(log, 0);
2834 spin_unlock(&log->l_grant_lock);
2835 ticket->t_curr_res = ticket->t_unit_res; 2766 ticket->t_curr_res = ticket->t_unit_res;
2836 xlog_tic_reset_res(ticket); 2767 xlog_tic_reset_res(ticket);
2837} /* xlog_regrant_reserve_log_space */ 2768} /* xlog_regrant_reserve_log_space */
@@ -2855,28 +2786,29 @@ STATIC void
2855xlog_ungrant_log_space(xlog_t *log, 2786xlog_ungrant_log_space(xlog_t *log,
2856 xlog_ticket_t *ticket) 2787 xlog_ticket_t *ticket)
2857{ 2788{
2789 int bytes;
2790
2858 if (ticket->t_cnt > 0) 2791 if (ticket->t_cnt > 0)
2859 ticket->t_cnt--; 2792 ticket->t_cnt--;
2860 2793
2861 spin_lock(&log->l_grant_lock);
2862 trace_xfs_log_ungrant_enter(log, ticket); 2794 trace_xfs_log_ungrant_enter(log, ticket);
2863
2864 xlog_grant_sub_space(log, ticket->t_curr_res);
2865
2866 trace_xfs_log_ungrant_sub(log, ticket); 2795 trace_xfs_log_ungrant_sub(log, ticket);
2867 2796
2868 /* If this is a permanent reservation ticket, we may be able to free 2797 /*
2798 * If this is a permanent reservation ticket, we may be able to free
2869 * up more space based on the remaining count. 2799 * up more space based on the remaining count.
2870 */ 2800 */
2801 bytes = ticket->t_curr_res;
2871 if (ticket->t_cnt > 0) { 2802 if (ticket->t_cnt > 0) {
2872 ASSERT(ticket->t_flags & XLOG_TIC_PERM_RESERV); 2803 ASSERT(ticket->t_flags & XLOG_TIC_PERM_RESERV);
2873 xlog_grant_sub_space(log, ticket->t_unit_res*ticket->t_cnt); 2804 bytes += ticket->t_unit_res*ticket->t_cnt;
2874 } 2805 }
2875 2806
2807 xlog_grant_sub_space(log, &log->l_grant_reserve_head, bytes);
2808 xlog_grant_sub_space(log, &log->l_grant_write_head, bytes);
2809
2876 trace_xfs_log_ungrant_exit(log, ticket); 2810 trace_xfs_log_ungrant_exit(log, ticket);
2877 2811
2878 xlog_verify_grant_head(log, 1);
2879 spin_unlock(&log->l_grant_lock);
2880 xfs_log_move_tail(log->l_mp, 1); 2812 xfs_log_move_tail(log->l_mp, 1);
2881} /* xlog_ungrant_log_space */ 2813} /* xlog_ungrant_log_space */
2882 2814
@@ -2913,11 +2845,11 @@ xlog_state_release_iclog(
2913 2845
2914 if (iclog->ic_state == XLOG_STATE_WANT_SYNC) { 2846 if (iclog->ic_state == XLOG_STATE_WANT_SYNC) {
2915 /* update tail before writing to iclog */ 2847 /* update tail before writing to iclog */
2916 xlog_assign_tail_lsn(log->l_mp); 2848 xfs_lsn_t tail_lsn = xlog_assign_tail_lsn(log->l_mp);
2917 sync++; 2849 sync++;
2918 iclog->ic_state = XLOG_STATE_SYNCING; 2850 iclog->ic_state = XLOG_STATE_SYNCING;
2919 iclog->ic_header.h_tail_lsn = cpu_to_be64(log->l_tail_lsn); 2851 iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
2920 xlog_verify_tail_lsn(log, iclog, log->l_tail_lsn); 2852 xlog_verify_tail_lsn(log, iclog, tail_lsn);
2921 /* cycle incremented when incrementing curr_block */ 2853 /* cycle incremented when incrementing curr_block */
2922 } 2854 }
2923 spin_unlock(&log->l_icloglock); 2855 spin_unlock(&log->l_icloglock);
@@ -3100,7 +3032,7 @@ maybe_sleep:
3100 return XFS_ERROR(EIO); 3032 return XFS_ERROR(EIO);
3101 } 3033 }
3102 XFS_STATS_INC(xs_log_force_sleep); 3034 XFS_STATS_INC(xs_log_force_sleep);
3103 sv_wait(&iclog->ic_force_wait, PINOD, &log->l_icloglock, s); 3035 xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
3104 /* 3036 /*
3105 * No need to grab the log lock here since we're 3037 * No need to grab the log lock here since we're
3106 * only deciding whether or not to return EIO 3038 * only deciding whether or not to return EIO
@@ -3131,10 +3063,8 @@ xfs_log_force(
3131 int error; 3063 int error;
3132 3064
3133 error = _xfs_log_force(mp, flags, NULL); 3065 error = _xfs_log_force(mp, flags, NULL);
3134 if (error) { 3066 if (error)
3135 xfs_fs_cmn_err(CE_WARN, mp, "xfs_log_force: " 3067 xfs_warn(mp, "%s: error %d returned.", __func__, error);
3136 "error %d returned.", error);
3137 }
3138} 3068}
3139 3069
3140/* 3070/*
@@ -3218,8 +3148,8 @@ try_again:
3218 3148
3219 XFS_STATS_INC(xs_log_force_sleep); 3149 XFS_STATS_INC(xs_log_force_sleep);
3220 3150
3221 sv_wait(&iclog->ic_prev->ic_write_wait, 3151 xlog_wait(&iclog->ic_prev->ic_write_wait,
3222 PSWP, &log->l_icloglock, s); 3152 &log->l_icloglock);
3223 if (log_flushed) 3153 if (log_flushed)
3224 *log_flushed = 1; 3154 *log_flushed = 1;
3225 already_slept = 1; 3155 already_slept = 1;
@@ -3247,7 +3177,7 @@ try_again:
3247 return XFS_ERROR(EIO); 3177 return XFS_ERROR(EIO);
3248 } 3178 }
3249 XFS_STATS_INC(xs_log_force_sleep); 3179 XFS_STATS_INC(xs_log_force_sleep);
3250 sv_wait(&iclog->ic_force_wait, PSWP, &log->l_icloglock, s); 3180 xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
3251 /* 3181 /*
3252 * No need to grab the log lock here since we're 3182 * No need to grab the log lock here since we're
3253 * only deciding whether or not to return EIO 3183 * only deciding whether or not to return EIO
@@ -3283,10 +3213,8 @@ xfs_log_force_lsn(
3283 int error; 3213 int error;
3284 3214
3285 error = _xfs_log_force_lsn(mp, lsn, flags, NULL); 3215 error = _xfs_log_force_lsn(mp, lsn, flags, NULL);
3286 if (error) { 3216 if (error)
3287 xfs_fs_cmn_err(CE_WARN, mp, "xfs_log_force: " 3217 xfs_warn(mp, "%s: error %d returned.", __func__, error);
3288 "error %d returned.", error);
3289 }
3290} 3218}
3291 3219
3292/* 3220/*
@@ -3322,10 +3250,8 @@ xfs_log_ticket_put(
3322 xlog_ticket_t *ticket) 3250 xlog_ticket_t *ticket)
3323{ 3251{
3324 ASSERT(atomic_read(&ticket->t_ref) > 0); 3252 ASSERT(atomic_read(&ticket->t_ref) > 0);
3325 if (atomic_dec_and_test(&ticket->t_ref)) { 3253 if (atomic_dec_and_test(&ticket->t_ref))
3326 sv_destroy(&ticket->t_wait);
3327 kmem_zone_free(xfs_log_ticket_zone, ticket); 3254 kmem_zone_free(xfs_log_ticket_zone, ticket);
3328 }
3329} 3255}
3330 3256
3331xlog_ticket_t * 3257xlog_ticket_t *
@@ -3337,13 +3263,6 @@ xfs_log_ticket_get(
3337 return ticket; 3263 return ticket;
3338} 3264}
3339 3265
3340xlog_tid_t
3341xfs_log_get_trans_ident(
3342 struct xfs_trans *tp)
3343{
3344 return tp->t_ticket->t_tid;
3345}
3346
3347/* 3266/*
3348 * Allocate and initialise a new log ticket. 3267 * Allocate and initialise a new log ticket.
3349 */ 3268 */
@@ -3447,6 +3366,7 @@ xlog_ticket_alloc(
3447 } 3366 }
3448 3367
3449 atomic_set(&tic->t_ref, 1); 3368 atomic_set(&tic->t_ref, 1);
3369 INIT_LIST_HEAD(&tic->t_queue);
3450 tic->t_unit_res = unit_bytes; 3370 tic->t_unit_res = unit_bytes;
3451 tic->t_curr_res = unit_bytes; 3371 tic->t_curr_res = unit_bytes;
3452 tic->t_cnt = cnt; 3372 tic->t_cnt = cnt;
@@ -3457,7 +3377,7 @@ xlog_ticket_alloc(
3457 tic->t_trans_type = 0; 3377 tic->t_trans_type = 0;
3458 if (xflags & XFS_LOG_PERM_RESERV) 3378 if (xflags & XFS_LOG_PERM_RESERV)
3459 tic->t_flags |= XLOG_TIC_PERM_RESERV; 3379 tic->t_flags |= XLOG_TIC_PERM_RESERV;
3460 sv_init(&tic->t_wait, SV_DEFAULT, "logtick"); 3380 init_waitqueue_head(&tic->t_wait);
3461 3381
3462 xlog_tic_reset_res(tic); 3382 xlog_tic_reset_res(tic);
3463 3383
@@ -3492,22 +3412,45 @@ xlog_verify_dest_ptr(
3492 } 3412 }
3493 3413
3494 if (!good_ptr) 3414 if (!good_ptr)
3495 xlog_panic("xlog_verify_dest_ptr: invalid ptr"); 3415 xfs_emerg(log->l_mp, "%s: invalid ptr", __func__);
3496} 3416}
3497 3417
3418/*
 3419 * Check to make sure the grant write head didn't just overlap the tail.  If
3420 * the cycles are the same, we can't be overlapping. Otherwise, make sure that
3421 * the cycles differ by exactly one and check the byte count.
3422 *
3423 * This check is run unlocked, so can give false positives. Rather than assert
3424 * on failures, use a warn-once flag and a panic tag to allow the admin to
3425 * determine if they want to panic the machine when such an error occurs. For
 3426 * debug kernels this will have the same effect as using an assert but, unlike
3427 * an assert, it can be turned off at runtime.
3428 */
3498STATIC void 3429STATIC void
3499xlog_verify_grant_head(xlog_t *log, int equals) 3430xlog_verify_grant_tail(
3500{ 3431 struct log *log)
3501 if (log->l_grant_reserve_cycle == log->l_grant_write_cycle) { 3432{
3502 if (equals) 3433 int tail_cycle, tail_blocks;
3503 ASSERT(log->l_grant_reserve_bytes >= log->l_grant_write_bytes); 3434 int cycle, space;
3504 else 3435
3505 ASSERT(log->l_grant_reserve_bytes > log->l_grant_write_bytes); 3436 xlog_crack_grant_head(&log->l_grant_write_head, &cycle, &space);
3506 } else { 3437 xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_blocks);
3507 ASSERT(log->l_grant_reserve_cycle-1 == log->l_grant_write_cycle); 3438 if (tail_cycle != cycle) {
3508 ASSERT(log->l_grant_write_bytes >= log->l_grant_reserve_bytes); 3439 if (cycle - 1 != tail_cycle &&
3509 } 3440 !(log->l_flags & XLOG_TAIL_WARN)) {
3510} /* xlog_verify_grant_head */ 3441 xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES,
3442 "%s: cycle - 1 != tail_cycle", __func__);
3443 log->l_flags |= XLOG_TAIL_WARN;
3444 }
3445
3446 if (space > BBTOB(tail_blocks) &&
3447 !(log->l_flags & XLOG_TAIL_WARN)) {
3448 xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES,
3449 "%s: space > BBTOB(tail_blocks)", __func__);
3450 log->l_flags |= XLOG_TAIL_WARN;
3451 }
3452 }
3453}
3511 3454
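To make the check above concrete (illustrative numbers): with 512 byte basic blocks, a tail LSN of cycle 6, block 100 corresponds to BBTOB(100) = 51200 bytes into the log. A write grant head cracked to cycle 7 with 40000 bytes used is legal, since the head has wrapped exactly once and is still behind the tail. A head at cycle 7 with 60000 bytes, or any head at cycle 8 or later, means reservations have run over the tail, so the XFS_PTAG_LOGRES alert fires and XLOG_TAIL_WARN keeps it from repeating.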
3512/* check if it will fit */ 3455/* check if it will fit */
3513STATIC void 3456STATIC void
@@ -3521,16 +3464,16 @@ xlog_verify_tail_lsn(xlog_t *log,
3521 blocks = 3464 blocks =
3522 log->l_logBBsize - (log->l_prev_block - BLOCK_LSN(tail_lsn)); 3465 log->l_logBBsize - (log->l_prev_block - BLOCK_LSN(tail_lsn));
3523 if (blocks < BTOBB(iclog->ic_offset)+BTOBB(log->l_iclog_hsize)) 3466 if (blocks < BTOBB(iclog->ic_offset)+BTOBB(log->l_iclog_hsize))
3524 xlog_panic("xlog_verify_tail_lsn: ran out of log space"); 3467 xfs_emerg(log->l_mp, "%s: ran out of log space", __func__);
3525 } else { 3468 } else {
3526 ASSERT(CYCLE_LSN(tail_lsn)+1 == log->l_prev_cycle); 3469 ASSERT(CYCLE_LSN(tail_lsn)+1 == log->l_prev_cycle);
3527 3470
3528 if (BLOCK_LSN(tail_lsn) == log->l_prev_block) 3471 if (BLOCK_LSN(tail_lsn) == log->l_prev_block)
3529 xlog_panic("xlog_verify_tail_lsn: tail wrapped"); 3472 xfs_emerg(log->l_mp, "%s: tail wrapped", __func__);
3530 3473
3531 blocks = BLOCK_LSN(tail_lsn) - log->l_prev_block; 3474 blocks = BLOCK_LSN(tail_lsn) - log->l_prev_block;
3532 if (blocks < BTOBB(iclog->ic_offset) + 1) 3475 if (blocks < BTOBB(iclog->ic_offset) + 1)
3533 xlog_panic("xlog_verify_tail_lsn: ran out of log space"); 3476 xfs_emerg(log->l_mp, "%s: ran out of log space", __func__);
3534 } 3477 }
3535} /* xlog_verify_tail_lsn */ 3478} /* xlog_verify_tail_lsn */
3536 3479
@@ -3570,22 +3513,23 @@ xlog_verify_iclog(xlog_t *log,
3570 icptr = log->l_iclog; 3513 icptr = log->l_iclog;
3571 for (i=0; i < log->l_iclog_bufs; i++) { 3514 for (i=0; i < log->l_iclog_bufs; i++) {
3572 if (icptr == NULL) 3515 if (icptr == NULL)
3573 xlog_panic("xlog_verify_iclog: invalid ptr"); 3516 xfs_emerg(log->l_mp, "%s: invalid ptr", __func__);
3574 icptr = icptr->ic_next; 3517 icptr = icptr->ic_next;
3575 } 3518 }
3576 if (icptr != log->l_iclog) 3519 if (icptr != log->l_iclog)
3577 xlog_panic("xlog_verify_iclog: corrupt iclog ring"); 3520 xfs_emerg(log->l_mp, "%s: corrupt iclog ring", __func__);
3578 spin_unlock(&log->l_icloglock); 3521 spin_unlock(&log->l_icloglock);
3579 3522
3580 /* check log magic numbers */ 3523 /* check log magic numbers */
3581 if (be32_to_cpu(iclog->ic_header.h_magicno) != XLOG_HEADER_MAGIC_NUM) 3524 if (be32_to_cpu(iclog->ic_header.h_magicno) != XLOG_HEADER_MAGIC_NUM)
3582 xlog_panic("xlog_verify_iclog: invalid magic num"); 3525 xfs_emerg(log->l_mp, "%s: invalid magic num", __func__);
3583 3526
3584 ptr = (xfs_caddr_t) &iclog->ic_header; 3527 ptr = (xfs_caddr_t) &iclog->ic_header;
3585 for (ptr += BBSIZE; ptr < ((xfs_caddr_t)&iclog->ic_header) + count; 3528 for (ptr += BBSIZE; ptr < ((xfs_caddr_t)&iclog->ic_header) + count;
3586 ptr += BBSIZE) { 3529 ptr += BBSIZE) {
3587 if (be32_to_cpu(*(__be32 *)ptr) == XLOG_HEADER_MAGIC_NUM) 3530 if (be32_to_cpu(*(__be32 *)ptr) == XLOG_HEADER_MAGIC_NUM)
3588 xlog_panic("xlog_verify_iclog: unexpected magic num"); 3531 xfs_emerg(log->l_mp, "%s: unexpected magic num",
3532 __func__);
3589 } 3533 }
3590 3534
3591 /* check fields */ 3535 /* check fields */
@@ -3615,9 +3559,10 @@ xlog_verify_iclog(xlog_t *log,
3615 } 3559 }
3616 } 3560 }
3617 if (clientid != XFS_TRANSACTION && clientid != XFS_LOG) 3561 if (clientid != XFS_TRANSACTION && clientid != XFS_LOG)
3618 cmn_err(CE_WARN, "xlog_verify_iclog: " 3562 xfs_warn(log->l_mp,
3619 "invalid clientid %d op 0x%p offset 0x%lx", 3563 "%s: invalid clientid %d op 0x%p offset 0x%lx",
3620 clientid, ophead, (unsigned long)field_offset); 3564 __func__, clientid, ophead,
3565 (unsigned long)field_offset);
3621 3566
3622 /* check length */ 3567 /* check length */
3623 field_offset = (__psint_t) 3568 field_offset = (__psint_t)
@@ -3728,12 +3673,10 @@ xfs_log_force_umount(
3728 xlog_cil_force(log); 3673 xlog_cil_force(log);
3729 3674
3730 /* 3675 /*
3731 * We must hold both the GRANT lock and the LOG lock, 3676 * mark the filesystem and the log as in a shutdown state and wake
3732 * before we mark the filesystem SHUTDOWN and wake 3677 * everybody up to tell them the bad news.
3733 * everybody up to tell the bad news.
3734 */ 3678 */
3735 spin_lock(&log->l_icloglock); 3679 spin_lock(&log->l_icloglock);
3736 spin_lock(&log->l_grant_lock);
3737 mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN; 3680 mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN;
3738 if (mp->m_sb_bp) 3681 if (mp->m_sb_bp)
3739 XFS_BUF_DONE(mp->m_sb_bp); 3682 XFS_BUF_DONE(mp->m_sb_bp);
@@ -3754,27 +3697,21 @@ xfs_log_force_umount(
3754 spin_unlock(&log->l_icloglock); 3697 spin_unlock(&log->l_icloglock);
3755 3698
3756 /* 3699 /*
3757 * We don't want anybody waiting for log reservations 3700 * We don't want anybody waiting for log reservations after this. That
3758 * after this. That means we have to wake up everybody 3701 * means we have to wake up everybody queued up on reserveq as well as
3759 * queued up on reserve_headq as well as write_headq. 3702 * writeq. In addition, we make sure in xlog_{re}grant_log_space that
3760 * In addition, we make sure in xlog_{re}grant_log_space 3703 * we don't enqueue anything once the SHUTDOWN flag is set, and this
3761 * that we don't enqueue anything once the SHUTDOWN flag 3704 * action is protected by the grant locks.
3762 * is set, and this action is protected by the GRANTLOCK.
3763 */ 3705 */
3764 if ((tic = log->l_reserve_headq)) { 3706 spin_lock(&log->l_grant_reserve_lock);
3765 do { 3707 list_for_each_entry(tic, &log->l_reserveq, t_queue)
3766 sv_signal(&tic->t_wait); 3708 wake_up(&tic->t_wait);
3767 tic = tic->t_next; 3709 spin_unlock(&log->l_grant_reserve_lock);
3768 } while (tic != log->l_reserve_headq); 3710
3769 } 3711 spin_lock(&log->l_grant_write_lock);
3770 3712 list_for_each_entry(tic, &log->l_writeq, t_queue)
3771 if ((tic = log->l_write_headq)) { 3713 wake_up(&tic->t_wait);
3772 do { 3714 spin_unlock(&log->l_grant_write_lock);
3773 sv_signal(&tic->t_wait);
3774 tic = tic->t_next;
3775 } while (tic != log->l_write_headq);
3776 }
3777 spin_unlock(&log->l_grant_lock);
3778 3715
3779 if (!(log->l_iclog->ic_state & XLOG_STATE_IOERROR)) { 3716 if (!(log->l_iclog->ic_state & XLOG_STATE_IOERROR)) {
3780 ASSERT(!logerror); 3717 ASSERT(!logerror);
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 916eb7db14d9..78c9039994af 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -189,9 +189,7 @@ void xlog_iodone(struct xfs_buf *);
189struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket); 189struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket);
190void xfs_log_ticket_put(struct xlog_ticket *ticket); 190void xfs_log_ticket_put(struct xlog_ticket *ticket);
191 191
192xlog_tid_t xfs_log_get_trans_ident(struct xfs_trans *tp); 192void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
193
194int xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
195 struct xfs_log_vec *log_vector, 193 struct xfs_log_vec *log_vector,
196 xfs_lsn_t *commit_lsn, int flags); 194 xfs_lsn_t *commit_lsn, int flags);
197bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); 195bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 7e206fc1fa36..c7755d5a5fbe 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -29,6 +29,7 @@
29#include "xfs_mount.h" 29#include "xfs_mount.h"
30#include "xfs_error.h" 30#include "xfs_error.h"
31#include "xfs_alloc.h" 31#include "xfs_alloc.h"
32#include "xfs_discard.h"
32 33
33/* 34/*
34 * Perform initial CIL structure initialisation. If the CIL is not 35 * Perform initial CIL structure initialisation. If the CIL is not
@@ -61,7 +62,7 @@ xlog_cil_init(
61 INIT_LIST_HEAD(&cil->xc_committing); 62 INIT_LIST_HEAD(&cil->xc_committing);
62 spin_lock_init(&cil->xc_cil_lock); 63 spin_lock_init(&cil->xc_cil_lock);
63 init_rwsem(&cil->xc_ctx_lock); 64 init_rwsem(&cil->xc_ctx_lock);
64 sv_init(&cil->xc_commit_wait, SV_DEFAULT, "cilwait"); 65 init_waitqueue_head(&cil->xc_commit_wait);
65 66
66 INIT_LIST_HEAD(&ctx->committing); 67 INIT_LIST_HEAD(&ctx->committing);
67 INIT_LIST_HEAD(&ctx->busy_extents); 68 INIT_LIST_HEAD(&ctx->busy_extents);
@@ -146,102 +147,6 @@ xlog_cil_init_post_recovery(
146} 147}
147 148
148/* 149/*
149 * Insert the log item into the CIL and calculate the difference in space
150 * consumed by the item. Add the space to the checkpoint ticket and calculate
151 * if the change requires additional log metadata. If it does, take that space
152 * as well. Remove the amount of space we addded to the checkpoint ticket from
153 * the current transaction ticket so that the accounting works out correctly.
154 *
155 * If this is the first time the item is being placed into the CIL in this
156 * context, pin it so it can't be written to disk until the CIL is flushed to
157 * the iclog and the iclog written to disk.
158 */
159static void
160xlog_cil_insert(
161 struct log *log,
162 struct xlog_ticket *ticket,
163 struct xfs_log_item *item,
164 struct xfs_log_vec *lv)
165{
166 struct xfs_cil *cil = log->l_cilp;
167 struct xfs_log_vec *old = lv->lv_item->li_lv;
168 struct xfs_cil_ctx *ctx = cil->xc_ctx;
169 int len;
170 int diff_iovecs;
171 int iclog_space;
172
173 if (old) {
174 /* existing lv on log item, space used is a delta */
175 ASSERT(!list_empty(&item->li_cil));
176 ASSERT(old->lv_buf && old->lv_buf_len && old->lv_niovecs);
177
178 len = lv->lv_buf_len - old->lv_buf_len;
179 diff_iovecs = lv->lv_niovecs - old->lv_niovecs;
180 kmem_free(old->lv_buf);
181 kmem_free(old);
182 } else {
183 /* new lv, must pin the log item */
184 ASSERT(!lv->lv_item->li_lv);
185 ASSERT(list_empty(&item->li_cil));
186
187 len = lv->lv_buf_len;
188 diff_iovecs = lv->lv_niovecs;
189 IOP_PIN(lv->lv_item);
190
191 }
192 len += diff_iovecs * sizeof(xlog_op_header_t);
193
194 /* attach new log vector to log item */
195 lv->lv_item->li_lv = lv;
196
197 spin_lock(&cil->xc_cil_lock);
198 list_move_tail(&item->li_cil, &cil->xc_cil);
199 ctx->nvecs += diff_iovecs;
200
201 /*
202 * If this is the first time the item is being committed to the CIL,
203 * store the sequence number on the log item so we can tell
204 * in future commits whether this is the first checkpoint the item is
205 * being committed into.
206 */
207 if (!item->li_seq)
208 item->li_seq = ctx->sequence;
209
210 /*
211 * Now transfer enough transaction reservation to the context ticket
212 * for the checkpoint. The context ticket is special - the unit
213 * reservation has to grow as well as the current reservation as we
214 * steal from tickets so we can correctly determine the space used
215 * during the transaction commit.
216 */
217 if (ctx->ticket->t_curr_res == 0) {
218 /* first commit in checkpoint, steal the header reservation */
219 ASSERT(ticket->t_curr_res >= ctx->ticket->t_unit_res + len);
220 ctx->ticket->t_curr_res = ctx->ticket->t_unit_res;
221 ticket->t_curr_res -= ctx->ticket->t_unit_res;
222 }
223
224 /* do we need space for more log record headers? */
225 iclog_space = log->l_iclog_size - log->l_iclog_hsize;
226 if (len > 0 && (ctx->space_used / iclog_space !=
227 (ctx->space_used + len) / iclog_space)) {
228 int hdrs;
229
230 hdrs = (len + iclog_space - 1) / iclog_space;
231 /* need to take into account split region headers, too */
232 hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header);
233 ctx->ticket->t_unit_res += hdrs;
234 ctx->ticket->t_curr_res += hdrs;
235 ticket->t_curr_res -= hdrs;
236 ASSERT(ticket->t_curr_res >= len);
237 }
238 ticket->t_curr_res -= len;
239 ctx->space_used += len;
240
241 spin_unlock(&cil->xc_cil_lock);
242}
243
244/*
245 * Format log item into a flat buffers 150 * Format log item into a flat buffers
246 * 151 *
247 * For delayed logging, we need to hold a formatted buffer containing all the 152 * For delayed logging, we need to hold a formatted buffer containing all the
@@ -286,7 +191,7 @@ xlog_cil_format_items(
286 len += lv->lv_iovecp[index].i_len; 191 len += lv->lv_iovecp[index].i_len;
287 192
288 lv->lv_buf_len = len; 193 lv->lv_buf_len = len;
289 lv->lv_buf = kmem_zalloc(lv->lv_buf_len, KM_SLEEP|KM_NOFS); 194 lv->lv_buf = kmem_alloc(lv->lv_buf_len, KM_SLEEP|KM_NOFS);
290 ptr = lv->lv_buf; 195 ptr = lv->lv_buf;
291 196
292 for (index = 0; index < lv->lv_niovecs; index++) { 197 for (index = 0; index < lv->lv_niovecs; index++) {
@@ -300,21 +205,136 @@ xlog_cil_format_items(
300 } 205 }
301} 206}
302 207
208/*
209 * Prepare the log item for insertion into the CIL. Calculate the difference in
210 * log space and vectors it will consume, and if it is a new item pin it as
211 * well.
212 */
213STATIC void
214xfs_cil_prepare_item(
215 struct log *log,
216 struct xfs_log_vec *lv,
217 int *len,
218 int *diff_iovecs)
219{
220 struct xfs_log_vec *old = lv->lv_item->li_lv;
221
222 if (old) {
223 /* existing lv on log item, space used is a delta */
224 ASSERT(!list_empty(&lv->lv_item->li_cil));
225 ASSERT(old->lv_buf && old->lv_buf_len && old->lv_niovecs);
226
227 *len += lv->lv_buf_len - old->lv_buf_len;
228 *diff_iovecs += lv->lv_niovecs - old->lv_niovecs;
229 kmem_free(old->lv_buf);
230 kmem_free(old);
231 } else {
232 /* new lv, must pin the log item */
233 ASSERT(!lv->lv_item->li_lv);
234 ASSERT(list_empty(&lv->lv_item->li_cil));
235
236 *len += lv->lv_buf_len;
237 *diff_iovecs += lv->lv_niovecs;
238 IOP_PIN(lv->lv_item);
239
240 }
241
242 /* attach new log vector to log item */
243 lv->lv_item->li_lv = lv;
244
245 /*
246 * If this is the first time the item is being committed to the
247 * CIL, store the sequence number on the log item so we can
248 * tell in future commits whether this is the first checkpoint
249 * the item is being committed into.
250 */
251 if (!lv->lv_item->li_seq)
252 lv->lv_item->li_seq = log->l_cilp->xc_ctx->sequence;
253}
254
255/*
256 * Insert the log items into the CIL and calculate the difference in space
257 * consumed by the item. Add the space to the checkpoint ticket and calculate
258 * if the change requires additional log metadata. If it does, take that space
 259 * as well. Remove the amount of space we added to the checkpoint ticket from
260 * the current transaction ticket so that the accounting works out correctly.
261 */
303static void 262static void
304xlog_cil_insert_items( 263xlog_cil_insert_items(
305 struct log *log, 264 struct log *log,
306 struct xfs_log_vec *log_vector, 265 struct xfs_log_vec *log_vector,
307 struct xlog_ticket *ticket, 266 struct xlog_ticket *ticket)
308 xfs_lsn_t *start_lsn)
309{ 267{
310 struct xfs_log_vec *lv; 268 struct xfs_cil *cil = log->l_cilp;
311 269 struct xfs_cil_ctx *ctx = cil->xc_ctx;
312 if (start_lsn) 270 struct xfs_log_vec *lv;
313 *start_lsn = log->l_cilp->xc_ctx->sequence; 271 int len = 0;
272 int diff_iovecs = 0;
273 int iclog_space;
314 274
315 ASSERT(log_vector); 275 ASSERT(log_vector);
276
277 /*
278 * Do all the accounting aggregation and switching of log vectors
279 * around in a separate loop to the insertion of items into the CIL.
280 * Then we can do a separate loop to update the CIL within a single
281 * lock/unlock pair. This reduces the number of round trips on the CIL
282 * lock from O(nr_logvectors) to O(1) and greatly reduces the overall
283 * hold time for the transaction commit.
284 *
285 * If this is the first time the item is being placed into the CIL in
286 * this context, pin it so it can't be written to disk until the CIL is
287 * flushed to the iclog and the iclog written to disk.
288 *
289 * We can do this safely because the context can't checkpoint until we
290 * are done so it doesn't matter exactly how we update the CIL.
291 */
316 for (lv = log_vector; lv; lv = lv->lv_next) 292 for (lv = log_vector; lv; lv = lv->lv_next)
317 xlog_cil_insert(log, ticket, lv->lv_item, lv); 293 xfs_cil_prepare_item(log, lv, &len, &diff_iovecs);
294
295 /* account for space used by new iovec headers */
296 len += diff_iovecs * sizeof(xlog_op_header_t);
297
298 spin_lock(&cil->xc_cil_lock);
299
300 /* move the items to the tail of the CIL */
301 for (lv = log_vector; lv; lv = lv->lv_next)
302 list_move_tail(&lv->lv_item->li_cil, &cil->xc_cil);
303
304 ctx->nvecs += diff_iovecs;
305
306 /*
307 * Now transfer enough transaction reservation to the context ticket
308 * for the checkpoint. The context ticket is special - the unit
309 * reservation has to grow as well as the current reservation as we
310 * steal from tickets so we can correctly determine the space used
311 * during the transaction commit.
312 */
313 if (ctx->ticket->t_curr_res == 0) {
314 /* first commit in checkpoint, steal the header reservation */
315 ASSERT(ticket->t_curr_res >= ctx->ticket->t_unit_res + len);
316 ctx->ticket->t_curr_res = ctx->ticket->t_unit_res;
317 ticket->t_curr_res -= ctx->ticket->t_unit_res;
318 }
319
320 /* do we need space for more log record headers? */
321 iclog_space = log->l_iclog_size - log->l_iclog_hsize;
322 if (len > 0 && (ctx->space_used / iclog_space !=
323 (ctx->space_used + len) / iclog_space)) {
324 int hdrs;
325
326 hdrs = (len + iclog_space - 1) / iclog_space;
327 /* need to take into account split region headers, too */
328 hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header);
329 ctx->ticket->t_unit_res += hdrs;
330 ctx->ticket->t_curr_res += hdrs;
331 ticket->t_curr_res -= hdrs;
332 ASSERT(ticket->t_curr_res >= len);
333 }
334 ticket->t_curr_res -= len;
335 ctx->space_used += len;
336
337 spin_unlock(&cil->xc_cil_lock);
318} 338}
319 339
320static void 340static void
@@ -342,24 +362,28 @@ xlog_cil_committed(
342 int abort) 362 int abort)
343{ 363{
344 struct xfs_cil_ctx *ctx = args; 364 struct xfs_cil_ctx *ctx = args;
345 struct xfs_log_vec *lv; 365 struct xfs_mount *mp = ctx->cil->xc_log->l_mp;
346 int abortflag = abort ? XFS_LI_ABORTED : 0;
347 struct xfs_busy_extent *busyp, *n;
348 366
349 /* unpin all the log items */ 367 xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain,
350 for (lv = ctx->lv_chain; lv; lv = lv->lv_next ) { 368 ctx->start_lsn, abort);
351 xfs_trans_item_committed(lv->lv_item, ctx->start_lsn,
352 abortflag);
353 }
354 369
355 list_for_each_entry_safe(busyp, n, &ctx->busy_extents, list) 370 xfs_alloc_busy_sort(&ctx->busy_extents);
356 xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, busyp); 371 xfs_alloc_busy_clear(mp, &ctx->busy_extents,
372 (mp->m_flags & XFS_MOUNT_DISCARD) && !abort);
357 373
358 spin_lock(&ctx->cil->xc_cil_lock); 374 spin_lock(&ctx->cil->xc_cil_lock);
359 list_del(&ctx->committing); 375 list_del(&ctx->committing);
360 spin_unlock(&ctx->cil->xc_cil_lock); 376 spin_unlock(&ctx->cil->xc_cil_lock);
361 377
362 xlog_cil_free_logvec(ctx->lv_chain); 378 xlog_cil_free_logvec(ctx->lv_chain);
379
380 if (!list_empty(&ctx->busy_extents)) {
381 ASSERT(mp->m_flags & XFS_MOUNT_DISCARD);
382
383 xfs_discard_extents(mp, &ctx->busy_extents);
384 xfs_alloc_busy_clear(mp, &ctx->busy_extents, false);
385 }
386
363 kmem_free(ctx); 387 kmem_free(ctx);
364} 388}
365 389
@@ -529,7 +553,7 @@ xlog_cil_push(
529 553
530 error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0); 554 error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0);
531 if (error) 555 if (error)
532 goto out_abort; 556 goto out_abort_free_ticket;
533 557
534 /* 558 /*
535 * now that we've written the checkpoint into the log, strictly 559 * now that we've written the checkpoint into the log, strictly
@@ -549,14 +573,15 @@ restart:
549 * It is still being pushed! Wait for the push to 573 * It is still being pushed! Wait for the push to
550 * complete, then start again from the beginning. 574 * complete, then start again from the beginning.
551 */ 575 */
552 sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0); 576 xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock);
553 goto restart; 577 goto restart;
554 } 578 }
555 } 579 }
556 spin_unlock(&cil->xc_cil_lock); 580 spin_unlock(&cil->xc_cil_lock);
557 581
582 /* xfs_log_done always frees the ticket on error. */
558 commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0); 583 commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0);
559 if (error || commit_lsn == -1) 584 if (commit_lsn == -1)
560 goto out_abort; 585 goto out_abort;
561 586
562 /* attach all the transactions w/ busy extents to iclog */ 587 /* attach all the transactions w/ busy extents to iclog */
@@ -573,7 +598,7 @@ restart:
573 */ 598 */
574 spin_lock(&cil->xc_cil_lock); 599 spin_lock(&cil->xc_cil_lock);
575 ctx->commit_lsn = commit_lsn; 600 ctx->commit_lsn = commit_lsn;
576 sv_broadcast(&cil->xc_commit_wait); 601 wake_up_all(&cil->xc_commit_wait);
577 spin_unlock(&cil->xc_cil_lock); 602 spin_unlock(&cil->xc_cil_lock);
578 603
579 /* release the hounds! */ 604 /* release the hounds! */
@@ -586,6 +611,8 @@ out_free_ticket:
586 kmem_free(new_ctx); 611 kmem_free(new_ctx);
587 return 0; 612 return 0;
588 613
614out_abort_free_ticket:
615 xfs_log_ticket_put(tic);
589out_abort: 616out_abort:
590 xlog_cil_committed(ctx, XFS_LI_ABORTED); 617 xlog_cil_committed(ctx, XFS_LI_ABORTED);
591 return XFS_ERROR(EIO); 618 return XFS_ERROR(EIO);
@@ -608,7 +635,7 @@ out_abort:
608 * background commit, returns without it held once background commits are 635 * background commit, returns without it held once background commits are
609 * allowed again. 636 * allowed again.
610 */ 637 */
611int 638void
612xfs_log_commit_cil( 639xfs_log_commit_cil(
613 struct xfs_mount *mp, 640 struct xfs_mount *mp,
614 struct xfs_trans *tp, 641 struct xfs_trans *tp,
@@ -623,11 +650,6 @@ xfs_log_commit_cil(
623 if (flags & XFS_TRANS_RELEASE_LOG_RES) 650 if (flags & XFS_TRANS_RELEASE_LOG_RES)
624 log_flags = XFS_LOG_REL_PERM_RESERV; 651 log_flags = XFS_LOG_REL_PERM_RESERV;
625 652
626 if (XLOG_FORCED_SHUTDOWN(log)) {
627 xlog_cil_free_logvec(log_vector);
628 return XFS_ERROR(EIO);
629 }
630
631 /* 653 /*
632 * do all the hard work of formatting items (including memory 654 * do all the hard work of formatting items (including memory
633 * allocation) outside the CIL context lock. This prevents stalling CIL 655 * allocation) outside the CIL context lock. This prevents stalling CIL
@@ -638,7 +660,10 @@ xfs_log_commit_cil(
638 660
639 /* lock out background commit */ 661 /* lock out background commit */
640 down_read(&log->l_cilp->xc_ctx_lock); 662 down_read(&log->l_cilp->xc_ctx_lock);
641 xlog_cil_insert_items(log, log_vector, tp->t_ticket, commit_lsn); 663 if (commit_lsn)
664 *commit_lsn = log->l_cilp->xc_ctx->sequence;
665
666 xlog_cil_insert_items(log, log_vector, tp->t_ticket);
642 667
643 /* check we didn't blow the reservation */ 668 /* check we didn't blow the reservation */
644 if (tp->t_ticket->t_curr_res < 0) 669 if (tp->t_ticket->t_curr_res < 0)
@@ -684,7 +709,6 @@ xfs_log_commit_cil(
684 */ 709 */
685 if (push) 710 if (push)
686 xlog_cil_push(log, 0); 711 xlog_cil_push(log, 0);
687 return 0;
688} 712}
689 713
690/* 714/*
@@ -735,7 +759,7 @@ restart:
735 * It is still being pushed! Wait for the push to 759 * It is still being pushed! Wait for the push to
736 * complete, then start again from the beginning. 760 * complete, then start again from the beginning.
737 */ 761 */
738 sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0); 762 xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock);
739 goto restart; 763 goto restart;
740 } 764 }
741 if (ctx->sequence != sequence) 765 if (ctx->sequence != sequence)
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index edcdfe01617f..2d3b6a498d63 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -21,7 +21,6 @@
21struct xfs_buf; 21struct xfs_buf;
22struct log; 22struct log;
23struct xlog_ticket; 23struct xlog_ticket;
24struct xfs_buf_cancel;
25struct xfs_mount; 24struct xfs_mount;
26 25
27/* 26/*
@@ -54,7 +53,6 @@ struct xfs_mount;
54 BTOBB(XLOG_MAX_ICLOGS << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \ 53 BTOBB(XLOG_MAX_ICLOGS << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \
55 XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT)) 54 XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT))
56 55
57
58static inline xfs_lsn_t xlog_assign_lsn(uint cycle, uint block) 56static inline xfs_lsn_t xlog_assign_lsn(uint cycle, uint block)
59{ 57{
60 return ((xfs_lsn_t)cycle << 32) | block; 58 return ((xfs_lsn_t)cycle << 32) | block;
@@ -89,10 +87,6 @@ static inline uint xlog_get_client_id(__be32 i)
89 return be32_to_cpu(i) >> 24; 87 return be32_to_cpu(i) >> 24;
90} 88}
91 89
92#define xlog_panic(args...) cmn_err(CE_PANIC, ## args)
93#define xlog_exit(args...) cmn_err(CE_PANIC, ## args)
94#define xlog_warn(args...) cmn_err(CE_WARN, ## args)
95
96/* 90/*
97 * In core log state 91 * In core log state
98 */ 92 */
@@ -133,12 +127,10 @@ static inline uint xlog_get_client_id(__be32 i)
133 */ 127 */
134#define XLOG_TIC_INITED 0x1 /* has been initialized */ 128#define XLOG_TIC_INITED 0x1 /* has been initialized */
135#define XLOG_TIC_PERM_RESERV 0x2 /* permanent reservation */ 129#define XLOG_TIC_PERM_RESERV 0x2 /* permanent reservation */
136#define XLOG_TIC_IN_Q 0x4
137 130
138#define XLOG_TIC_FLAGS \ 131#define XLOG_TIC_FLAGS \
139 { XLOG_TIC_INITED, "XLOG_TIC_INITED" }, \ 132 { XLOG_TIC_INITED, "XLOG_TIC_INITED" }, \
140 { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" }, \ 133 { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" }
141 { XLOG_TIC_IN_Q, "XLOG_TIC_IN_Q" }
142 134
143#endif /* __KERNEL__ */ 135#endif /* __KERNEL__ */
144 136
@@ -152,6 +144,9 @@ static inline uint xlog_get_client_id(__be32 i)
152#define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */ 144#define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */
153#define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being 145#define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being
154 shutdown */ 146 shutdown */
147#define XLOG_TAIL_WARN 0x10 /* log tail verify warning issued */
148
149typedef __uint32_t xlog_tid_t;
155 150
156#ifdef __KERNEL__ 151#ifdef __KERNEL__
157/* 152/*
@@ -244,9 +239,8 @@ typedef struct xlog_res {
244} xlog_res_t; 239} xlog_res_t;
245 240
246typedef struct xlog_ticket { 241typedef struct xlog_ticket {
247 sv_t t_wait; /* ticket wait queue : 20 */ 242 wait_queue_head_t t_wait; /* ticket wait queue */
248 struct xlog_ticket *t_next; /* :4|8 */ 243 struct list_head t_queue; /* reserve/write queue */
249 struct xlog_ticket *t_prev; /* :4|8 */
250 xlog_tid_t t_tid; /* transaction identifier : 4 */ 244 xlog_tid_t t_tid; /* transaction identifier : 4 */
251 atomic_t t_ref; /* ticket reference count : 4 */ 245 atomic_t t_ref; /* ticket reference count : 4 */
252 int t_curr_res; /* current reservation in bytes : 4 */ 246 int t_curr_res; /* current reservation in bytes : 4 */
@@ -353,8 +347,8 @@ typedef union xlog_in_core2 {
353 * and move everything else out to subsequent cachelines. 347 * and move everything else out to subsequent cachelines.
354 */ 348 */
355typedef struct xlog_in_core { 349typedef struct xlog_in_core {
356 sv_t ic_force_wait; 350 wait_queue_head_t ic_force_wait;
357 sv_t ic_write_wait; 351 wait_queue_head_t ic_write_wait;
358 struct xlog_in_core *ic_next; 352 struct xlog_in_core *ic_next;
359 struct xlog_in_core *ic_prev; 353 struct xlog_in_core *ic_prev;
360 struct xfs_buf *ic_bp; 354 struct xfs_buf *ic_bp;
@@ -421,7 +415,7 @@ struct xfs_cil {
421 struct xfs_cil_ctx *xc_ctx; 415 struct xfs_cil_ctx *xc_ctx;
422 struct rw_semaphore xc_ctx_lock; 416 struct rw_semaphore xc_ctx_lock;
423 struct list_head xc_committing; 417 struct list_head xc_committing;
424 sv_t xc_commit_wait; 418 wait_queue_head_t xc_commit_wait;
425 xfs_lsn_t xc_current_sequence; 419 xfs_lsn_t xc_current_sequence;
426}; 420};
427 421
@@ -491,7 +485,7 @@ typedef struct log {
491 struct xfs_buftarg *l_targ; /* buftarg of log */ 485 struct xfs_buftarg *l_targ; /* buftarg of log */
492 uint l_flags; 486 uint l_flags;
493 uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */ 487 uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */
494 struct xfs_buf_cancel **l_buf_cancel_table; 488 struct list_head *l_buf_cancel_table;
495 int l_iclog_hsize; /* size of iclog header */ 489 int l_iclog_hsize; /* size of iclog header */
496 int l_iclog_heads; /* # of iclog header sectors */ 490 int l_iclog_heads; /* # of iclog header sectors */
497 uint l_sectBBsize; /* sector size in BBs (2^n) */ 491 uint l_sectBBsize; /* sector size in BBs (2^n) */
@@ -503,29 +497,40 @@ typedef struct log {
503 int l_logBBsize; /* size of log in BB chunks */ 497 int l_logBBsize; /* size of log in BB chunks */
504 498
505 /* The following block of fields are changed while holding icloglock */ 499 /* The following block of fields are changed while holding icloglock */
506 sv_t l_flush_wait ____cacheline_aligned_in_smp; 500 wait_queue_head_t l_flush_wait ____cacheline_aligned_in_smp;
507 /* waiting for iclog flush */ 501 /* waiting for iclog flush */
508 int l_covered_state;/* state of "covering disk 502 int l_covered_state;/* state of "covering disk
509 * log entries" */ 503 * log entries" */
510 xlog_in_core_t *l_iclog; /* head log queue */ 504 xlog_in_core_t *l_iclog; /* head log queue */
511 spinlock_t l_icloglock; /* grab to change iclog state */ 505 spinlock_t l_icloglock; /* grab to change iclog state */
512 xfs_lsn_t l_tail_lsn; /* lsn of 1st LR with unflushed
513 * buffers */
514 xfs_lsn_t l_last_sync_lsn;/* lsn of last LR on disk */
515 int l_curr_cycle; /* Cycle number of log writes */ 506 int l_curr_cycle; /* Cycle number of log writes */
516 int l_prev_cycle; /* Cycle number before last 507 int l_prev_cycle; /* Cycle number before last
517 * block increment */ 508 * block increment */
518 int l_curr_block; /* current logical log block */ 509 int l_curr_block; /* current logical log block */
519 int l_prev_block; /* previous logical log block */ 510 int l_prev_block; /* previous logical log block */
520 511
521 /* The following block of fields are changed while holding grant_lock */ 512 /*
522 spinlock_t l_grant_lock ____cacheline_aligned_in_smp; 513 * l_last_sync_lsn and l_tail_lsn are atomics so they can be set and
523 xlog_ticket_t *l_reserve_headq; 514 * read without needing to hold specific locks. To avoid operations
524 xlog_ticket_t *l_write_headq; 515 * contending with other hot objects, place each of them on a separate
525 int l_grant_reserve_cycle; 516 * cacheline.
526 int l_grant_reserve_bytes; 517 */
527 int l_grant_write_cycle; 518 /* lsn of last LR on disk */
528 int l_grant_write_bytes; 519 atomic64_t l_last_sync_lsn ____cacheline_aligned_in_smp;
520 /* lsn of 1st LR with unflushed buffers */
521 atomic64_t l_tail_lsn ____cacheline_aligned_in_smp;
522
523 /*
524 * ticket grant locks, queues and accounting have their own cachelines
525 * as these are quite hot and can be operated on concurrently.
526 */
527 spinlock_t l_grant_reserve_lock ____cacheline_aligned_in_smp;
528 struct list_head l_reserveq;
529 atomic64_t l_grant_reserve_head;
530
531 spinlock_t l_grant_write_lock ____cacheline_aligned_in_smp;
532 struct list_head l_writeq;
533 atomic64_t l_grant_write_head;
529 534
530 /* The following field are used for debugging; need to hold icloglock */ 535 /* The following field are used for debugging; need to hold icloglock */
531#ifdef DEBUG 536#ifdef DEBUG
@@ -534,6 +539,9 @@ typedef struct log {
534 539
535} xlog_t; 540} xlog_t;
536 541
542#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \
543 ((log)->l_buf_cancel_table + ((__uint64_t)blkno % XLOG_BC_TABLE_SIZE))
544
537#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR) 545#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR)
538 546
539/* common routines */ 547/* common routines */
@@ -562,6 +570,61 @@ int xlog_write(struct log *log, struct xfs_log_vec *log_vector,
562 xlog_in_core_t **commit_iclog, uint flags); 570 xlog_in_core_t **commit_iclog, uint flags);
563 571
564/* 572/*
573 * When we crack an atomic LSN, we sample it first so that the value will not
574 * change while we are cracking it into the component values. This means we
575 * will always get consistent component values to work from. This should always
576 * be used to sample and crack LSNs that are stored and updated in atomic
577 * variables.
578 */
579static inline void
580xlog_crack_atomic_lsn(atomic64_t *lsn, uint *cycle, uint *block)
581{
582 xfs_lsn_t val = atomic64_read(lsn);
583
584 *cycle = CYCLE_LSN(val);
585 *block = BLOCK_LSN(val);
586}
587
588/*
589 * Calculate and assign a value to an atomic LSN variable from component pieces.
590 */
591static inline void
592xlog_assign_atomic_lsn(atomic64_t *lsn, uint cycle, uint block)
593{
594 atomic64_set(lsn, xlog_assign_lsn(cycle, block));
595}
596
597/*
598 * When we crack the grant head, we sample it first so that the value will not
599 * change while we are cracking it into the component values. This means we
600 * will always get consistent component values to work from.
601 */
602static inline void
603xlog_crack_grant_head_val(int64_t val, int *cycle, int *space)
604{
605 *cycle = val >> 32;
606 *space = val & 0xffffffff;
607}
608
609static inline void
610xlog_crack_grant_head(atomic64_t *head, int *cycle, int *space)
611{
612 xlog_crack_grant_head_val(atomic64_read(head), cycle, space);
613}
614
615static inline int64_t
616xlog_assign_grant_head_val(int cycle, int space)
617{
618 return ((int64_t)cycle << 32) | space;
619}
620
621static inline void
622xlog_assign_grant_head(atomic64_t *head, int cycle, int space)
623{
624 atomic64_set(head, xlog_assign_grant_head_val(cycle, space));
625}
626
627/*
565 * Committed Item List interfaces 628 * Committed Item List interfaces
566 */ 629 */
567int xlog_cil_init(struct log *log); 630int xlog_cil_init(struct log *log);
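Side note on the helpers added in this hunk (illustrative fragment only, not part of the patch; it assumes 'log' points at the new struct log). The grant head packs the cycle into bits 63..32 and the byte count into bits 31..0, so cracking and reassigning it round-trips exactly, and the atomic LSN crackers give a consistent cycle/block pair without taking any lock:

	uint	tail_cycle, tail_block;
	int	cycle, space;

	/* consistent snapshot of l_tail_lsn even while other CPUs update it */
	xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_block);

	/* cycle in bits 63..32, space (bytes) in bits 31..0 of the grant head */
	xlog_crack_grant_head(&log->l_grant_reserve_head, &cycle, &space);

	/* writing the same components back reproduces the original value */
	xlog_assign_grant_head(&log->l_grant_reserve_head, cycle, space);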
@@ -585,6 +648,21 @@ xlog_cil_force(struct log *log)
585 */ 648 */
586#define XLOG_UNMOUNT_REC_TYPE (-1U) 649#define XLOG_UNMOUNT_REC_TYPE (-1U)
587 650
651/*
652 * Wrapper function for waiting on a wait queue serialised against wakeups
653 * by a spinlock. This matches the semantics of all the wait queues used in the
654 * log code.
655 */
656static inline void xlog_wait(wait_queue_head_t *wq, spinlock_t *lock)
657{
658 DECLARE_WAITQUEUE(wait, current);
659
660 add_wait_queue_exclusive(wq, &wait);
661 __set_current_state(TASK_UNINTERRUPTIBLE);
662 spin_unlock(lock);
663 schedule();
664 remove_wait_queue(wq, &wait);
665}
588#endif /* __KERNEL__ */ 666#endif /* __KERNEL__ */
589 667
590#endif /* __XFS_LOG_PRIV_H__ */ 668#endif /* __XFS_LOG_PRIV_H__ */
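For context, xlog_wait() above expects the caller to already hold the spinlock that serialises the matching wakeup, and it returns with that lock dropped. A rough sketch of the waiter/waker pairing this implies (hypothetical names, not code from the patch):

	static DEFINE_SPINLOCK(lock);		/* hypothetical */
	static DECLARE_WAIT_QUEUE_HEAD(wq);	/* hypothetical */
	static bool condition;

	static void waiter(void)
	{
		spin_lock(&lock);
		while (!condition) {
			/* xlog_wait() queues the task and marks it
			 * uninterruptible before dropping the lock, so a
			 * wakeup issued between the unlock and schedule()
			 * cannot be lost; it returns with the lock dropped. */
			xlog_wait(&wq, &lock);
			spin_lock(&lock);
		}
		spin_unlock(&lock);
	}

	static void waker(void)
	{
		spin_lock(&lock);
		condition = true;
		wake_up(&wq);		/* wakes one exclusive waiter */
		spin_unlock(&lock);
	}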
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 6f3f5fa37acf..04142caedb2b 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -53,6 +53,17 @@ STATIC void xlog_recover_check_summary(xlog_t *);
53#endif 53#endif
54 54
55/* 55/*
56 * This structure is used during recovery to record the buf log items which
57 * have been canceled and should not be replayed.
58 */
59struct xfs_buf_cancel {
60 xfs_daddr_t bc_blkno;
61 uint bc_len;
62 int bc_refcount;
63 struct list_head bc_list;
64};
65
66/*
56 * Sector aligned buffer routines for buffer create/read/write/access 67 * Sector aligned buffer routines for buffer create/read/write/access
57 */ 68 */
58 69
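The struct xfs_buf_cancel above pairs with the new XLOG_BUF_CANCEL_BUCKET() macro in xfs_log_priv.h: l_buf_cancel_table is now an array of list heads hashed by block number rather than a singly linked chain. A hypothetical sketch of the setup and lookup this implies (function names invented for illustration; the real table initialisation lives elsewhere in this patch):

	STATIC int
	xlog_alloc_buf_cancel_table(struct log *log)
	{
		int	i;

		log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE *
						      sizeof(struct list_head),
						      KM_SLEEP);
		for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
			INIT_LIST_HEAD(&log->l_buf_cancel_table[i]);
		return 0;
	}

	STATIC struct xfs_buf_cancel *
	xlog_find_buf_cancel(struct log *log, xfs_daddr_t blkno, uint len)
	{
		struct list_head	*bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno);
		struct xfs_buf_cancel	*bcp;

		list_for_each_entry(bcp, bucket, bc_list)
			if (bcp->bc_blkno == blkno && bcp->bc_len == len)
				return bcp;
		return NULL;
	}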
@@ -81,7 +92,7 @@ xlog_get_bp(
81 int nbblks) 92 int nbblks)
82{ 93{
83 if (!xlog_buf_bbcount_valid(log, nbblks)) { 94 if (!xlog_buf_bbcount_valid(log, nbblks)) {
84 xlog_warn("XFS: Invalid block length (0x%x) given for buffer", 95 xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
85 nbblks); 96 nbblks);
86 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); 97 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
87 return NULL; 98 return NULL;
@@ -90,7 +101,7 @@ xlog_get_bp(
90 /* 101 /*
91 * We do log I/O in units of log sectors (a power-of-2 102 * We do log I/O in units of log sectors (a power-of-2
92 * multiple of the basic block size), so we round up the 103 * multiple of the basic block size), so we round up the
93 * requested size to acommodate the basic blocks required 104 * requested size to accommodate the basic blocks required
94 * for complete log sectors. 105 * for complete log sectors.
95 * 106 *
96 * In addition, the buffer may be used for a non-sector- 107 * In addition, the buffer may be used for a non-sector-
@@ -101,13 +112,14 @@ xlog_get_bp(
101 * an issue. Nor will this be a problem if the log I/O is 112 * an issue. Nor will this be a problem if the log I/O is
102 * done in basic blocks (sector size 1). But otherwise we 113 * done in basic blocks (sector size 1). But otherwise we
103 * extend the buffer by one extra log sector to ensure 114 * extend the buffer by one extra log sector to ensure
104 * there's space to accomodate this possiblility. 115 * there's space to accommodate this possibility.
105 */ 116 */
106 if (nbblks > 1 && log->l_sectBBsize > 1) 117 if (nbblks > 1 && log->l_sectBBsize > 1)
107 nbblks += log->l_sectBBsize; 118 nbblks += log->l_sectBBsize;
108 nbblks = round_up(nbblks, log->l_sectBBsize); 119 nbblks = round_up(nbblks, log->l_sectBBsize);
109 120
110 return xfs_buf_get_noaddr(BBTOB(nbblks), log->l_mp->m_logdev_targp); 121 return xfs_buf_get_uncached(log->l_mp->m_logdev_targp,
122 BBTOB(nbblks), 0);
111} 123}
112 124
113STATIC void 125STATIC void
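A worked example of the sizing logic above, assuming a log device with 4k sectors (l_sectBBsize == 8) and a request for 10 basic blocks:

	/* illustrative only: nbblks == 10, log->l_sectBBsize == 8 */
	nbblks += log->l_sectBBsize;			/* 10 -> 18 */
	nbblks = round_up(nbblks, log->l_sectBBsize);	/* 18 -> 24 */
	bp = xfs_buf_get_uncached(log->l_mp->m_logdev_targp, BBTOB(24), 0);

so the uncached buffer ends up one sector larger plus rounded to a sector multiple, covering a read that does not start on a sector boundary.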
@@ -148,7 +160,7 @@ xlog_bread_noalign(
148 int error; 160 int error;
149 161
150 if (!xlog_buf_bbcount_valid(log, nbblks)) { 162 if (!xlog_buf_bbcount_valid(log, nbblks)) {
151 xlog_warn("XFS: Invalid block length (0x%x) given for buffer", 163 xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
152 nbblks); 164 nbblks);
153 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); 165 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
154 return EFSCORRUPTED; 166 return EFSCORRUPTED;
@@ -167,7 +179,7 @@ xlog_bread_noalign(
167 XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp); 179 XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp);
168 180
169 xfsbdstrat(log->l_mp, bp); 181 xfsbdstrat(log->l_mp, bp);
170 error = xfs_iowait(bp); 182 error = xfs_buf_iowait(bp);
171 if (error) 183 if (error)
172 xfs_ioerror_alert("xlog_bread", log->l_mp, 184 xfs_ioerror_alert("xlog_bread", log->l_mp,
173 bp, XFS_BUF_ADDR(bp)); 185 bp, XFS_BUF_ADDR(bp));
@@ -193,6 +205,35 @@ xlog_bread(
193} 205}
194 206
195/* 207/*
208 * Read at an offset into the buffer. Returns with the buffer in its original
209 * state regardless of the result of the read.
210 */
211STATIC int
212xlog_bread_offset(
213 xlog_t *log,
214 xfs_daddr_t blk_no, /* block to read from */
215 int nbblks, /* blocks to read */
216 xfs_buf_t *bp,
217 xfs_caddr_t offset)
218{
219 xfs_caddr_t orig_offset = XFS_BUF_PTR(bp);
220 int orig_len = bp->b_buffer_length;
221 int error, error2;
222
223 error = XFS_BUF_SET_PTR(bp, offset, BBTOB(nbblks));
224 if (error)
225 return error;
226
227 error = xlog_bread_noalign(log, blk_no, nbblks, bp);
228
229 /* must reset buffer pointer even on error */
230 error2 = XFS_BUF_SET_PTR(bp, orig_offset, orig_len);
231 if (error)
232 return error;
233 return error2;
234}
235
236/*
196 * Write out the buffer at the given block for the given number of blocks. 237 * Write out the buffer at the given block for the given number of blocks.
197 * The buffer is kept locked across the write and is returned locked. 238 * The buffer is kept locked across the write and is returned locked.
198 * This can only be used for synchronous log writes. 239 * This can only be used for synchronous log writes.
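The xlog_bread_offset() helper above is then used further down in this diff (in the xlog_write_log_records hunk) to replace the open-coded SET_PTR/bread/SET_PTR sequence with a single call, roughly:

	offset = XFS_BUF_PTR(bp) + BBTOB(ealign - start_block);
	error = xlog_bread_offset(log, ealign, sectbb, bp, offset);
	if (error)
		break;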
@@ -207,7 +248,7 @@ xlog_bwrite(
207 int error; 248 int error;
208 249
209 if (!xlog_buf_bbcount_valid(log, nbblks)) { 250 if (!xlog_buf_bbcount_valid(log, nbblks)) {
210 xlog_warn("XFS: Invalid block length (0x%x) given for buffer", 251 xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
211 nbblks); 252 nbblks);
212 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); 253 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
213 return EFSCORRUPTED; 254 return EFSCORRUPTED;
@@ -242,9 +283,9 @@ xlog_header_check_dump(
242 xfs_mount_t *mp, 283 xfs_mount_t *mp,
243 xlog_rec_header_t *head) 284 xlog_rec_header_t *head)
244{ 285{
245 cmn_err(CE_DEBUG, "%s: SB : uuid = %pU, fmt = %d\n", 286 xfs_debug(mp, "%s: SB : uuid = %pU, fmt = %d\n",
246 __func__, &mp->m_sb.sb_uuid, XLOG_FMT); 287 __func__, &mp->m_sb.sb_uuid, XLOG_FMT);
247 cmn_err(CE_DEBUG, " log : uuid = %pU, fmt = %d\n", 288 xfs_debug(mp, " log : uuid = %pU, fmt = %d\n",
248 &head->h_fs_uuid, be32_to_cpu(head->h_fmt)); 289 &head->h_fs_uuid, be32_to_cpu(head->h_fmt));
249} 290}
250#else 291#else
@@ -267,15 +308,15 @@ xlog_header_check_recover(
267 * a dirty log created in IRIX. 308 * a dirty log created in IRIX.
268 */ 309 */
269 if (unlikely(be32_to_cpu(head->h_fmt) != XLOG_FMT)) { 310 if (unlikely(be32_to_cpu(head->h_fmt) != XLOG_FMT)) {
270 xlog_warn( 311 xfs_warn(mp,
271 "XFS: dirty log written in incompatible format - can't recover"); 312 "dirty log written in incompatible format - can't recover");
272 xlog_header_check_dump(mp, head); 313 xlog_header_check_dump(mp, head);
273 XFS_ERROR_REPORT("xlog_header_check_recover(1)", 314 XFS_ERROR_REPORT("xlog_header_check_recover(1)",
274 XFS_ERRLEVEL_HIGH, mp); 315 XFS_ERRLEVEL_HIGH, mp);
275 return XFS_ERROR(EFSCORRUPTED); 316 return XFS_ERROR(EFSCORRUPTED);
276 } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) { 317 } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
277 xlog_warn( 318 xfs_warn(mp,
278 "XFS: dirty log entry has mismatched uuid - can't recover"); 319 "dirty log entry has mismatched uuid - can't recover");
279 xlog_header_check_dump(mp, head); 320 xlog_header_check_dump(mp, head);
280 XFS_ERROR_REPORT("xlog_header_check_recover(2)", 321 XFS_ERROR_REPORT("xlog_header_check_recover(2)",
281 XFS_ERRLEVEL_HIGH, mp); 322 XFS_ERRLEVEL_HIGH, mp);
@@ -300,9 +341,9 @@ xlog_header_check_mount(
300 * h_fs_uuid is nil, we assume this log was last mounted 341 * h_fs_uuid is nil, we assume this log was last mounted
301 * by IRIX and continue. 342 * by IRIX and continue.
302 */ 343 */
303 xlog_warn("XFS: nil uuid in log - IRIX style log"); 344 xfs_warn(mp, "nil uuid in log - IRIX style log");
304 } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) { 345 } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
305 xlog_warn("XFS: log has mismatched uuid - can't recover"); 346 xfs_warn(mp, "log has mismatched uuid - can't recover");
306 xlog_header_check_dump(mp, head); 347 xlog_header_check_dump(mp, head);
307 XFS_ERROR_REPORT("xlog_header_check_mount", 348 XFS_ERROR_REPORT("xlog_header_check_mount",
308 XFS_ERRLEVEL_HIGH, mp); 349 XFS_ERRLEVEL_HIGH, mp);
@@ -321,12 +362,13 @@ xlog_recover_iodone(
321 * this during recovery. One strike! 362 * this during recovery. One strike!
322 */ 363 */
323 xfs_ioerror_alert("xlog_recover_iodone", 364 xfs_ioerror_alert("xlog_recover_iodone",
324 bp->b_mount, bp, XFS_BUF_ADDR(bp)); 365 bp->b_target->bt_mount, bp,
325 xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR); 366 XFS_BUF_ADDR(bp));
367 xfs_force_shutdown(bp->b_target->bt_mount,
368 SHUTDOWN_META_IO_ERROR);
326 } 369 }
327 bp->b_mount = NULL;
328 XFS_BUF_CLR_IODONE_FUNC(bp); 370 XFS_BUF_CLR_IODONE_FUNC(bp);
329 xfs_biodone(bp); 371 xfs_buf_ioend(bp, 0);
330} 372}
331 373
332/* 374/*
@@ -477,8 +519,8 @@ xlog_find_verify_log_record(
477 for (i = (*last_blk) - 1; i >= 0; i--) { 519 for (i = (*last_blk) - 1; i >= 0; i--) {
478 if (i < start_blk) { 520 if (i < start_blk) {
479 /* valid log record not found */ 521 /* valid log record not found */
480 xlog_warn( 522 xfs_warn(log->l_mp,
481 "XFS: Log inconsistent (didn't find previous header)"); 523 "Log inconsistent (didn't find previous header)");
482 ASSERT(0); 524 ASSERT(0);
483 error = XFS_ERROR(EIO); 525 error = XFS_ERROR(EIO);
484 goto out; 526 goto out;
@@ -578,12 +620,12 @@ xlog_find_head(
578 * mkfs etc write a dummy unmount record to a fresh 620 * mkfs etc write a dummy unmount record to a fresh
579 * log so we can store the uuid in there 621 * log so we can store the uuid in there
580 */ 622 */
581 xlog_warn("XFS: totally zeroed log"); 623 xfs_warn(log->l_mp, "totally zeroed log");
582 } 624 }
583 625
584 return 0; 626 return 0;
585 } else if (error) { 627 } else if (error) {
586 xlog_warn("XFS: empty log check failed"); 628 xfs_warn(log->l_mp, "empty log check failed");
587 return error; 629 return error;
588 } 630 }
589 631
@@ -806,7 +848,7 @@ validate_head:
806 xlog_put_bp(bp); 848 xlog_put_bp(bp);
807 849
808 if (error) 850 if (error)
809 xlog_warn("XFS: failed to find log head"); 851 xfs_warn(log->l_mp, "failed to find log head");
810 return error; 852 return error;
811} 853}
812 854
@@ -899,7 +941,7 @@ xlog_find_tail(
899 } 941 }
900 } 942 }
901 if (!found) { 943 if (!found) {
902 xlog_warn("XFS: xlog_find_tail: couldn't find sync record"); 944 xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
903 ASSERT(0); 945 ASSERT(0);
904 return XFS_ERROR(EIO); 946 return XFS_ERROR(EIO);
905 } 947 }
@@ -923,12 +965,12 @@ xlog_find_tail(
923 log->l_curr_cycle = be32_to_cpu(rhead->h_cycle); 965 log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
924 if (found == 2) 966 if (found == 2)
925 log->l_curr_cycle++; 967 log->l_curr_cycle++;
926 log->l_tail_lsn = be64_to_cpu(rhead->h_tail_lsn); 968 atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn));
927 log->l_last_sync_lsn = be64_to_cpu(rhead->h_lsn); 969 atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn));
928 log->l_grant_reserve_cycle = log->l_curr_cycle; 970 xlog_assign_grant_head(&log->l_grant_reserve_head, log->l_curr_cycle,
929 log->l_grant_reserve_bytes = BBTOB(log->l_curr_block); 971 BBTOB(log->l_curr_block));
930 log->l_grant_write_cycle = log->l_curr_cycle; 972 xlog_assign_grant_head(&log->l_grant_write_head, log->l_curr_cycle,
931 log->l_grant_write_bytes = BBTOB(log->l_curr_block); 973 BBTOB(log->l_curr_block));
932 974
933 /* 975 /*
934 * Look for unmount record. If we find it, then we know there 976 * Look for unmount record. If we find it, then we know there
@@ -958,7 +1000,7 @@ xlog_find_tail(
958 } 1000 }
959 after_umount_blk = (i + hblks + (int) 1001 after_umount_blk = (i + hblks + (int)
960 BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize; 1002 BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize;
961 tail_lsn = log->l_tail_lsn; 1003 tail_lsn = atomic64_read(&log->l_tail_lsn);
962 if (*head_blk == after_umount_blk && 1004 if (*head_blk == after_umount_blk &&
963 be32_to_cpu(rhead->h_num_logops) == 1) { 1005 be32_to_cpu(rhead->h_num_logops) == 1) {
964 umount_data_blk = (i + hblks) % log->l_logBBsize; 1006 umount_data_blk = (i + hblks) % log->l_logBBsize;
@@ -973,12 +1015,10 @@ xlog_find_tail(
973 * log records will point recovery to after the 1015 * log records will point recovery to after the
974 * current unmount record. 1016 * current unmount record.
975 */ 1017 */
976 log->l_tail_lsn = 1018 xlog_assign_atomic_lsn(&log->l_tail_lsn,
977 xlog_assign_lsn(log->l_curr_cycle, 1019 log->l_curr_cycle, after_umount_blk);
978 after_umount_blk); 1020 xlog_assign_atomic_lsn(&log->l_last_sync_lsn,
979 log->l_last_sync_lsn = 1021 log->l_curr_cycle, after_umount_blk);
980 xlog_assign_lsn(log->l_curr_cycle,
981 after_umount_blk);
982 *tail_blk = after_umount_blk; 1022 *tail_blk = after_umount_blk;
983 1023
984 /* 1024 /*
@@ -1017,7 +1057,7 @@ done:
1017 xlog_put_bp(bp); 1057 xlog_put_bp(bp);
1018 1058
1019 if (error) 1059 if (error)
1020 xlog_warn("XFS: failed to locate log tail"); 1060 xfs_warn(log->l_mp, "failed to locate log tail");
1021 return error; 1061 return error;
1022} 1062}
1023 1063
@@ -1081,7 +1121,8 @@ xlog_find_zeroed(
1081 * the first block must be 1. If it's not, maybe we're 1121 * the first block must be 1. If it's not, maybe we're
1082 * not looking at a log... Bail out. 1122 * not looking at a log... Bail out.
1083 */ 1123 */
1084 xlog_warn("XFS: Log inconsistent or not a log (last==0, first!=1)"); 1124 xfs_warn(log->l_mp,
1125 "Log inconsistent or not a log (last==0, first!=1)");
1085 return XFS_ERROR(EINVAL); 1126 return XFS_ERROR(EINVAL);
1086 } 1127 }
1087 1128
@@ -1217,20 +1258,12 @@ xlog_write_log_records(
1217 */ 1258 */
1218 ealign = round_down(end_block, sectbb); 1259 ealign = round_down(end_block, sectbb);
1219 if (j == 0 && (start_block + endcount > ealign)) { 1260 if (j == 0 && (start_block + endcount > ealign)) {
1220 offset = XFS_BUF_PTR(bp); 1261 offset = XFS_BUF_PTR(bp) + BBTOB(ealign - start_block);
1221 balign = BBTOB(ealign - start_block); 1262 error = xlog_bread_offset(log, ealign, sectbb,
1222 error = XFS_BUF_SET_PTR(bp, offset + balign, 1263 bp, offset);
1223 BBTOB(sectbb));
1224 if (error)
1225 break;
1226
1227 error = xlog_bread_noalign(log, ealign, sectbb, bp);
1228 if (error) 1264 if (error)
1229 break; 1265 break;
1230 1266
1231 error = XFS_BUF_SET_PTR(bp, offset, bufblks);
1232 if (error)
1233 break;
1234 } 1267 }
1235 1268
1236 offset = xlog_align(log, start_block, endcount, bp); 1269 offset = xlog_align(log, start_block, endcount, bp);
@@ -1495,8 +1528,8 @@ xlog_recover_add_to_trans(
1495 if (list_empty(&trans->r_itemq)) { 1528 if (list_empty(&trans->r_itemq)) {
1496 /* we need to catch log corruptions here */ 1529 /* we need to catch log corruptions here */
1497 if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) { 1530 if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) {
1498 xlog_warn("XFS: xlog_recover_add_to_trans: " 1531 xfs_warn(log->l_mp, "%s: bad header magic number",
1499 "bad header magic number"); 1532 __func__);
1500 ASSERT(0); 1533 ASSERT(0);
1501 return XFS_ERROR(EIO); 1534 return XFS_ERROR(EIO);
1502 } 1535 }
@@ -1523,8 +1556,8 @@ xlog_recover_add_to_trans(
1523 if (item->ri_total == 0) { /* first region to be added */ 1556 if (item->ri_total == 0) { /* first region to be added */
1524 if (in_f->ilf_size == 0 || 1557 if (in_f->ilf_size == 0 ||
1525 in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) { 1558 in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) {
1526 xlog_warn( 1559 xfs_warn(log->l_mp,
1527 "XFS: bad number of regions (%d) in inode log format", 1560 "bad number of regions (%d) in inode log format",
1528 in_f->ilf_size); 1561 in_f->ilf_size);
1529 ASSERT(0); 1562 ASSERT(0);
1530 return XFS_ERROR(EIO); 1563 return XFS_ERROR(EIO);
@@ -1581,8 +1614,9 @@ xlog_recover_reorder_trans(
1581 list_move_tail(&item->ri_list, &trans->r_itemq); 1614 list_move_tail(&item->ri_list, &trans->r_itemq);
1582 break; 1615 break;
1583 default: 1616 default:
1584 xlog_warn( 1617 xfs_warn(log->l_mp,
1585 "XFS: xlog_recover_reorder_trans: unrecognized type of log operation"); 1618 "%s: unrecognized type of log operation",
1619 __func__);
1586 ASSERT(0); 1620 ASSERT(0);
1587 return XFS_ERROR(EIO); 1621 return XFS_ERROR(EIO);
1588 } 1622 }
@@ -1603,82 +1637,45 @@ xlog_recover_reorder_trans(
1603 * record in the table to tell us how many times we expect to see this 1637 * record in the table to tell us how many times we expect to see this
1604 * record during the second pass. 1638 * record during the second pass.
1605 */ 1639 */
1606STATIC void 1640STATIC int
1607xlog_recover_do_buffer_pass1( 1641xlog_recover_buffer_pass1(
1608 xlog_t *log, 1642 struct log *log,
1609 xfs_buf_log_format_t *buf_f) 1643 xlog_recover_item_t *item)
1610{ 1644{
1611 xfs_buf_cancel_t *bcp; 1645 xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
1612 xfs_buf_cancel_t *nextp; 1646 struct list_head *bucket;
1613 xfs_buf_cancel_t *prevp; 1647 struct xfs_buf_cancel *bcp;
1614 xfs_buf_cancel_t **bucket;
1615 xfs_daddr_t blkno = 0;
1616 uint len = 0;
1617 ushort flags = 0;
1618
1619 switch (buf_f->blf_type) {
1620 case XFS_LI_BUF:
1621 blkno = buf_f->blf_blkno;
1622 len = buf_f->blf_len;
1623 flags = buf_f->blf_flags;
1624 break;
1625 }
1626 1648
1627 /* 1649 /*
1628 * If this isn't a cancel buffer item, then just return. 1650 * If this isn't a cancel buffer item, then just return.
1629 */ 1651 */
1630 if (!(flags & XFS_BLF_CANCEL)) { 1652 if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) {
1631 trace_xfs_log_recover_buf_not_cancel(log, buf_f); 1653 trace_xfs_log_recover_buf_not_cancel(log, buf_f);
1632 return; 1654 return 0;
1633 }
1634
1635 /*
1636 * Insert an xfs_buf_cancel record into the hash table of
1637 * them. If there is already an identical record, bump
1638 * its reference count.
1639 */
1640 bucket = &log->l_buf_cancel_table[(__uint64_t)blkno %
1641 XLOG_BC_TABLE_SIZE];
1642 /*
1643 * If the hash bucket is empty then just insert a new record into
1644 * the bucket.
1645 */
1646 if (*bucket == NULL) {
1647 bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t),
1648 KM_SLEEP);
1649 bcp->bc_blkno = blkno;
1650 bcp->bc_len = len;
1651 bcp->bc_refcount = 1;
1652 bcp->bc_next = NULL;
1653 *bucket = bcp;
1654 return;
1655 } 1655 }
1656 1656
1657 /* 1657 /*
1658 * The hash bucket is not empty, so search for duplicates of our 1658 * Insert an xfs_buf_cancel record into the hash table of them.
1659 * record. If we find one them just bump its refcount. If not 1659 * If there is already an identical record, bump its reference count.
1660 * then add us at the end of the list.
1661 */ 1660 */
1662 prevp = NULL; 1661 bucket = XLOG_BUF_CANCEL_BUCKET(log, buf_f->blf_blkno);
1663 nextp = *bucket; 1662 list_for_each_entry(bcp, bucket, bc_list) {
1664 while (nextp != NULL) { 1663 if (bcp->bc_blkno == buf_f->blf_blkno &&
1665 if (nextp->bc_blkno == blkno && nextp->bc_len == len) { 1664 bcp->bc_len == buf_f->blf_len) {
1666 nextp->bc_refcount++; 1665 bcp->bc_refcount++;
1667 trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f); 1666 trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f);
1668 return; 1667 return 0;
1669 } 1668 }
1670 prevp = nextp; 1669 }
1671 nextp = nextp->bc_next; 1670
1672 } 1671 bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), KM_SLEEP);
1673 ASSERT(prevp != NULL); 1672 bcp->bc_blkno = buf_f->blf_blkno;
1674 bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t), 1673 bcp->bc_len = buf_f->blf_len;
1675 KM_SLEEP);
1676 bcp->bc_blkno = blkno;
1677 bcp->bc_len = len;
1678 bcp->bc_refcount = 1; 1674 bcp->bc_refcount = 1;
1679 bcp->bc_next = NULL; 1675 list_add_tail(&bcp->bc_list, bucket);
1680 prevp->bc_next = bcp; 1676
1681 trace_xfs_log_recover_buf_cancel_add(log, buf_f); 1677 trace_xfs_log_recover_buf_cancel_add(log, buf_f);
1678 return 0;
1682} 1679}
1683 1680
1684/* 1681/*
@@ -1696,14 +1693,13 @@ xlog_recover_do_buffer_pass1(
1696 */ 1693 */
1697STATIC int 1694STATIC int
1698xlog_check_buffer_cancelled( 1695xlog_check_buffer_cancelled(
1699 xlog_t *log, 1696 struct log *log,
1700 xfs_daddr_t blkno, 1697 xfs_daddr_t blkno,
1701 uint len, 1698 uint len,
1702 ushort flags) 1699 ushort flags)
1703{ 1700{
1704 xfs_buf_cancel_t *bcp; 1701 struct list_head *bucket;
1705 xfs_buf_cancel_t *prevp; 1702 struct xfs_buf_cancel *bcp;
1706 xfs_buf_cancel_t **bucket;
1707 1703
1708 if (log->l_buf_cancel_table == NULL) { 1704 if (log->l_buf_cancel_table == NULL) {
1709 /* 1705 /*
@@ -1714,128 +1710,70 @@ xlog_check_buffer_cancelled(
1714 return 0; 1710 return 0;
1715 } 1711 }
1716 1712
1717 bucket = &log->l_buf_cancel_table[(__uint64_t)blkno %
1718 XLOG_BC_TABLE_SIZE];
1719 bcp = *bucket;
1720 if (bcp == NULL) {
1721 /*
1722 * There is no corresponding entry in the table built
1723 * in pass one, so this buffer has not been cancelled.
1724 */
1725 ASSERT(!(flags & XFS_BLF_CANCEL));
1726 return 0;
1727 }
1728
1729 /* 1713 /*
1730 * Search for an entry in the buffer cancel table that 1714 * Search for an entry in the cancel table that matches our buffer.
1731 * matches our buffer.
1732 */ 1715 */
1733 prevp = NULL; 1716 bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno);
1734 while (bcp != NULL) { 1717 list_for_each_entry(bcp, bucket, bc_list) {
1735 if (bcp->bc_blkno == blkno && bcp->bc_len == len) { 1718 if (bcp->bc_blkno == blkno && bcp->bc_len == len)
1736 /* 1719 goto found;
1737 * We've go a match, so return 1 so that the
1738 * recovery of this buffer is cancelled.
1739 * If this buffer is actually a buffer cancel
1740 * log item, then decrement the refcount on the
1741 * one in the table and remove it if this is the
1742 * last reference.
1743 */
1744 if (flags & XFS_BLF_CANCEL) {
1745 bcp->bc_refcount--;
1746 if (bcp->bc_refcount == 0) {
1747 if (prevp == NULL) {
1748 *bucket = bcp->bc_next;
1749 } else {
1750 prevp->bc_next = bcp->bc_next;
1751 }
1752 kmem_free(bcp);
1753 }
1754 }
1755 return 1;
1756 }
1757 prevp = bcp;
1758 bcp = bcp->bc_next;
1759 } 1720 }
1721
1760 /* 1722 /*
1761 * We didn't find a corresponding entry in the table, so 1723 * We didn't find a corresponding entry in the table, so return 0 so
1762 * return 0 so that the buffer is NOT cancelled. 1724 * that the buffer is NOT cancelled.
1763 */ 1725 */
1764 ASSERT(!(flags & XFS_BLF_CANCEL)); 1726 ASSERT(!(flags & XFS_BLF_CANCEL));
1765 return 0; 1727 return 0;
1766}
1767 1728
1768STATIC int 1729found:
1769xlog_recover_do_buffer_pass2( 1730 /*
1770 xlog_t *log, 1731 * We've got a match, so return 1 so that the recovery of this buffer
1771 xfs_buf_log_format_t *buf_f) 1732 * is cancelled. If this buffer is actually a buffer cancel log
1772{ 1733 * item, then decrement the refcount on the one in the table and
1773 xfs_daddr_t blkno = 0; 1734 * remove it if this is the last reference.
1774 ushort flags = 0; 1735 */
1775 uint len = 0; 1736 if (flags & XFS_BLF_CANCEL) {
1776 1737 if (--bcp->bc_refcount == 0) {
1777 switch (buf_f->blf_type) { 1738 list_del(&bcp->bc_list);
1778 case XFS_LI_BUF: 1739 kmem_free(bcp);
1779 blkno = buf_f->blf_blkno; 1740 }
1780 flags = buf_f->blf_flags;
1781 len = buf_f->blf_len;
1782 break;
1783 } 1741 }
1784 1742 return 1;
1785 return xlog_check_buffer_cancelled(log, blkno, len, flags);
1786} 1743}
1787 1744
1788/* 1745/*
1789 * Perform recovery for a buffer full of inodes. In these buffers, 1746 * Perform recovery for a buffer full of inodes. In these buffers, the only
1790 * the only data which should be recovered is that which corresponds 1747 * data which should be recovered is that which corresponds to the
1791 * to the di_next_unlinked pointers in the on disk inode structures. 1748 * di_next_unlinked pointers in the on disk inode structures. The rest of the
1792 * The rest of the data for the inodes is always logged through the 1749 * data for the inodes is always logged through the inodes themselves rather
1793 * inodes themselves rather than the inode buffer and is recovered 1750 * than the inode buffer and is recovered in xlog_recover_inode_pass2().
1794 * in xlog_recover_do_inode_trans().
1795 * 1751 *
1796 * The only time when buffers full of inodes are fully recovered is 1752 * The only time when buffers full of inodes are fully recovered is when the
1797 * when the buffer is full of newly allocated inodes. In this case 1753 * buffer is full of newly allocated inodes. In this case the buffer will
1798 * the buffer will not be marked as an inode buffer and so will be 1754 * not be marked as an inode buffer and so will be sent to
1799 * sent to xlog_recover_do_reg_buffer() below during recovery. 1755 * xlog_recover_do_reg_buffer() below during recovery.
1800 */ 1756 */
1801STATIC int 1757STATIC int
1802xlog_recover_do_inode_buffer( 1758xlog_recover_do_inode_buffer(
1803 xfs_mount_t *mp, 1759 struct xfs_mount *mp,
1804 xlog_recover_item_t *item, 1760 xlog_recover_item_t *item,
1805 xfs_buf_t *bp, 1761 struct xfs_buf *bp,
1806 xfs_buf_log_format_t *buf_f) 1762 xfs_buf_log_format_t *buf_f)
1807{ 1763{
1808 int i; 1764 int i;
1809 int item_index; 1765 int item_index = 0;
1810 int bit; 1766 int bit = 0;
1811 int nbits; 1767 int nbits = 0;
1812 int reg_buf_offset; 1768 int reg_buf_offset = 0;
1813 int reg_buf_bytes; 1769 int reg_buf_bytes = 0;
1814 int next_unlinked_offset; 1770 int next_unlinked_offset;
1815 int inodes_per_buf; 1771 int inodes_per_buf;
1816 xfs_agino_t *logged_nextp; 1772 xfs_agino_t *logged_nextp;
1817 xfs_agino_t *buffer_nextp; 1773 xfs_agino_t *buffer_nextp;
1818 unsigned int *data_map = NULL;
1819 unsigned int map_size = 0;
1820 1774
1821 trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f); 1775 trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f);
1822 1776
1823 switch (buf_f->blf_type) {
1824 case XFS_LI_BUF:
1825 data_map = buf_f->blf_data_map;
1826 map_size = buf_f->blf_map_size;
1827 break;
1828 }
1829 /*
1830 * Set the variables corresponding to the current region to
1831 * 0 so that we'll initialize them on the first pass through
1832 * the loop.
1833 */
1834 reg_buf_offset = 0;
1835 reg_buf_bytes = 0;
1836 bit = 0;
1837 nbits = 0;
1838 item_index = 0;
1839 inodes_per_buf = XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog; 1777 inodes_per_buf = XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog;
1840 for (i = 0; i < inodes_per_buf; i++) { 1778 for (i = 0; i < inodes_per_buf; i++) {
1841 next_unlinked_offset = (i * mp->m_sb.sb_inodesize) + 1779 next_unlinked_offset = (i * mp->m_sb.sb_inodesize) +
@@ -1850,18 +1788,18 @@ xlog_recover_do_inode_buffer(
1850 * the current di_next_unlinked field. 1788 * the current di_next_unlinked field.
1851 */ 1789 */
1852 bit += nbits; 1790 bit += nbits;
1853 bit = xfs_next_bit(data_map, map_size, bit); 1791 bit = xfs_next_bit(buf_f->blf_data_map,
1792 buf_f->blf_map_size, bit);
1854 1793
1855 /* 1794 /*
1856 * If there are no more logged regions in the 1795 * If there are no more logged regions in the
1857 * buffer, then we're done. 1796 * buffer, then we're done.
1858 */ 1797 */
1859 if (bit == -1) { 1798 if (bit == -1)
1860 return 0; 1799 return 0;
1861 }
1862 1800
1863 nbits = xfs_contig_bits(data_map, map_size, 1801 nbits = xfs_contig_bits(buf_f->blf_data_map,
1864 bit); 1802 buf_f->blf_map_size, bit);
1865 ASSERT(nbits > 0); 1803 ASSERT(nbits > 0);
1866 reg_buf_offset = bit << XFS_BLF_SHIFT; 1804 reg_buf_offset = bit << XFS_BLF_SHIFT;
1867 reg_buf_bytes = nbits << XFS_BLF_SHIFT; 1805 reg_buf_bytes = nbits << XFS_BLF_SHIFT;
@@ -1873,9 +1811,8 @@ xlog_recover_do_inode_buffer(
1873 * di_next_unlinked field, then move on to the next 1811 * di_next_unlinked field, then move on to the next
1874 * di_next_unlinked field. 1812 * di_next_unlinked field.
1875 */ 1813 */
1876 if (next_unlinked_offset < reg_buf_offset) { 1814 if (next_unlinked_offset < reg_buf_offset)
1877 continue; 1815 continue;
1878 }
1879 1816
1880 ASSERT(item->ri_buf[item_index].i_addr != NULL); 1817 ASSERT(item->ri_buf[item_index].i_addr != NULL);
1881 ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0); 1818 ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0);
@@ -1889,8 +1826,9 @@ xlog_recover_do_inode_buffer(
1889 logged_nextp = item->ri_buf[item_index].i_addr + 1826 logged_nextp = item->ri_buf[item_index].i_addr +
1890 next_unlinked_offset - reg_buf_offset; 1827 next_unlinked_offset - reg_buf_offset;
1891 if (unlikely(*logged_nextp == 0)) { 1828 if (unlikely(*logged_nextp == 0)) {
1892 xfs_fs_cmn_err(CE_ALERT, mp, 1829 xfs_alert(mp,
1893 "bad inode buffer log record (ptr = 0x%p, bp = 0x%p). XFS trying to replay bad (0) inode di_next_unlinked field", 1830 "Bad inode buffer log record (ptr = 0x%p, bp = 0x%p). "
1831 "Trying to replay bad (0) inode di_next_unlinked field.",
1894 item, bp); 1832 item, bp);
1895 XFS_ERROR_REPORT("xlog_recover_do_inode_buf", 1833 XFS_ERROR_REPORT("xlog_recover_do_inode_buf",
1896 XFS_ERRLEVEL_LOW, mp); 1834 XFS_ERRLEVEL_LOW, mp);
@@ -1911,36 +1849,29 @@ xlog_recover_do_inode_buffer(
1911 * given buffer. The bitmap in the buf log format structure indicates 1849 * given buffer. The bitmap in the buf log format structure indicates
1912 * where to place the logged data. 1850 * where to place the logged data.
1913 */ 1851 */
1914/*ARGSUSED*/
1915STATIC void 1852STATIC void
1916xlog_recover_do_reg_buffer( 1853xlog_recover_do_reg_buffer(
1917 struct xfs_mount *mp, 1854 struct xfs_mount *mp,
1918 xlog_recover_item_t *item, 1855 xlog_recover_item_t *item,
1919 xfs_buf_t *bp, 1856 struct xfs_buf *bp,
1920 xfs_buf_log_format_t *buf_f) 1857 xfs_buf_log_format_t *buf_f)
1921{ 1858{
1922 int i; 1859 int i;
1923 int bit; 1860 int bit;
1924 int nbits; 1861 int nbits;
1925 unsigned int *data_map = NULL;
1926 unsigned int map_size = 0;
1927 int error; 1862 int error;
1928 1863
1929 trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f); 1864 trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f);
1930 1865
1931 switch (buf_f->blf_type) {
1932 case XFS_LI_BUF:
1933 data_map = buf_f->blf_data_map;
1934 map_size = buf_f->blf_map_size;
1935 break;
1936 }
1937 bit = 0; 1866 bit = 0;
1938 i = 1; /* 0 is the buf format structure */ 1867 i = 1; /* 0 is the buf format structure */
1939 while (1) { 1868 while (1) {
1940 bit = xfs_next_bit(data_map, map_size, bit); 1869 bit = xfs_next_bit(buf_f->blf_data_map,
1870 buf_f->blf_map_size, bit);
1941 if (bit == -1) 1871 if (bit == -1)
1942 break; 1872 break;
1943 nbits = xfs_contig_bits(data_map, map_size, bit); 1873 nbits = xfs_contig_bits(buf_f->blf_data_map,
1874 buf_f->blf_map_size, bit);
1944 ASSERT(nbits > 0); 1875 ASSERT(nbits > 0);
1945 ASSERT(item->ri_buf[i].i_addr != NULL); 1876 ASSERT(item->ri_buf[i].i_addr != NULL);
1946 ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0); 1877 ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0);
@@ -1956,17 +1887,17 @@ xlog_recover_do_reg_buffer(
1956 if (buf_f->blf_flags & 1887 if (buf_f->blf_flags &
1957 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { 1888 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
1958 if (item->ri_buf[i].i_addr == NULL) { 1889 if (item->ri_buf[i].i_addr == NULL) {
1959 cmn_err(CE_ALERT, 1890 xfs_alert(mp,
1960 "XFS: NULL dquot in %s.", __func__); 1891 "XFS: NULL dquot in %s.", __func__);
1961 goto next; 1892 goto next;
1962 } 1893 }
1963 if (item->ri_buf[i].i_len < sizeof(xfs_disk_dquot_t)) { 1894 if (item->ri_buf[i].i_len < sizeof(xfs_disk_dquot_t)) {
1964 cmn_err(CE_ALERT, 1895 xfs_alert(mp,
1965 "XFS: dquot too small (%d) in %s.", 1896 "XFS: dquot too small (%d) in %s.",
1966 item->ri_buf[i].i_len, __func__); 1897 item->ri_buf[i].i_len, __func__);
1967 goto next; 1898 goto next;
1968 } 1899 }
1969 error = xfs_qm_dqcheck(item->ri_buf[i].i_addr, 1900 error = xfs_qm_dqcheck(mp, item->ri_buf[i].i_addr,
1970 -1, 0, XFS_QMOPT_DOWARN, 1901 -1, 0, XFS_QMOPT_DOWARN,
1971 "dquot_buf_recover"); 1902 "dquot_buf_recover");
1972 if (error) 1903 if (error)
@@ -1991,6 +1922,7 @@ xlog_recover_do_reg_buffer(
1991 */ 1922 */
1992int 1923int
1993xfs_qm_dqcheck( 1924xfs_qm_dqcheck(
1925 struct xfs_mount *mp,
1994 xfs_disk_dquot_t *ddq, 1926 xfs_disk_dquot_t *ddq,
1995 xfs_dqid_t id, 1927 xfs_dqid_t id,
1996 uint type, /* used only when IO_dorepair is true */ 1928 uint type, /* used only when IO_dorepair is true */
@@ -2017,14 +1949,14 @@ xfs_qm_dqcheck(
2017 */ 1949 */
2018 if (be16_to_cpu(ddq->d_magic) != XFS_DQUOT_MAGIC) { 1950 if (be16_to_cpu(ddq->d_magic) != XFS_DQUOT_MAGIC) {
2019 if (flags & XFS_QMOPT_DOWARN) 1951 if (flags & XFS_QMOPT_DOWARN)
2020 cmn_err(CE_ALERT, 1952 xfs_alert(mp,
2021 "%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x", 1953 "%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x",
2022 str, id, be16_to_cpu(ddq->d_magic), XFS_DQUOT_MAGIC); 1954 str, id, be16_to_cpu(ddq->d_magic), XFS_DQUOT_MAGIC);
2023 errs++; 1955 errs++;
2024 } 1956 }
2025 if (ddq->d_version != XFS_DQUOT_VERSION) { 1957 if (ddq->d_version != XFS_DQUOT_VERSION) {
2026 if (flags & XFS_QMOPT_DOWARN) 1958 if (flags & XFS_QMOPT_DOWARN)
2027 cmn_err(CE_ALERT, 1959 xfs_alert(mp,
2028 "%s : XFS dquot ID 0x%x, version 0x%x != 0x%x", 1960 "%s : XFS dquot ID 0x%x, version 0x%x != 0x%x",
2029 str, id, ddq->d_version, XFS_DQUOT_VERSION); 1961 str, id, ddq->d_version, XFS_DQUOT_VERSION);
2030 errs++; 1962 errs++;
@@ -2034,7 +1966,7 @@ xfs_qm_dqcheck(
2034 ddq->d_flags != XFS_DQ_PROJ && 1966 ddq->d_flags != XFS_DQ_PROJ &&
2035 ddq->d_flags != XFS_DQ_GROUP) { 1967 ddq->d_flags != XFS_DQ_GROUP) {
2036 if (flags & XFS_QMOPT_DOWARN) 1968 if (flags & XFS_QMOPT_DOWARN)
2037 cmn_err(CE_ALERT, 1969 xfs_alert(mp,
2038 "%s : XFS dquot ID 0x%x, unknown flags 0x%x", 1970 "%s : XFS dquot ID 0x%x, unknown flags 0x%x",
2039 str, id, ddq->d_flags); 1971 str, id, ddq->d_flags);
2040 errs++; 1972 errs++;
@@ -2042,7 +1974,7 @@ xfs_qm_dqcheck(
2042 1974
2043 if (id != -1 && id != be32_to_cpu(ddq->d_id)) { 1975 if (id != -1 && id != be32_to_cpu(ddq->d_id)) {
2044 if (flags & XFS_QMOPT_DOWARN) 1976 if (flags & XFS_QMOPT_DOWARN)
2045 cmn_err(CE_ALERT, 1977 xfs_alert(mp,
2046 "%s : ondisk-dquot 0x%p, ID mismatch: " 1978 "%s : ondisk-dquot 0x%p, ID mismatch: "
2047 "0x%x expected, found id 0x%x", 1979 "0x%x expected, found id 0x%x",
2048 str, ddq, id, be32_to_cpu(ddq->d_id)); 1980 str, ddq, id, be32_to_cpu(ddq->d_id));
@@ -2055,9 +1987,8 @@ xfs_qm_dqcheck(
2055 be64_to_cpu(ddq->d_blk_softlimit)) { 1987 be64_to_cpu(ddq->d_blk_softlimit)) {
2056 if (!ddq->d_btimer) { 1988 if (!ddq->d_btimer) {
2057 if (flags & XFS_QMOPT_DOWARN) 1989 if (flags & XFS_QMOPT_DOWARN)
2058 cmn_err(CE_ALERT, 1990 xfs_alert(mp,
2059 "%s : Dquot ID 0x%x (0x%p) " 1991 "%s : Dquot ID 0x%x (0x%p) BLK TIMER NOT STARTED",
2060 "BLK TIMER NOT STARTED",
2061 str, (int)be32_to_cpu(ddq->d_id), ddq); 1992 str, (int)be32_to_cpu(ddq->d_id), ddq);
2062 errs++; 1993 errs++;
2063 } 1994 }
@@ -2067,9 +1998,8 @@ xfs_qm_dqcheck(
2067 be64_to_cpu(ddq->d_ino_softlimit)) { 1998 be64_to_cpu(ddq->d_ino_softlimit)) {
2068 if (!ddq->d_itimer) { 1999 if (!ddq->d_itimer) {
2069 if (flags & XFS_QMOPT_DOWARN) 2000 if (flags & XFS_QMOPT_DOWARN)
2070 cmn_err(CE_ALERT, 2001 xfs_alert(mp,
2071 "%s : Dquot ID 0x%x (0x%p) " 2002 "%s : Dquot ID 0x%x (0x%p) INODE TIMER NOT STARTED",
2072 "INODE TIMER NOT STARTED",
2073 str, (int)be32_to_cpu(ddq->d_id), ddq); 2003 str, (int)be32_to_cpu(ddq->d_id), ddq);
2074 errs++; 2004 errs++;
2075 } 2005 }
@@ -2079,9 +2009,8 @@ xfs_qm_dqcheck(
2079 be64_to_cpu(ddq->d_rtb_softlimit)) { 2009 be64_to_cpu(ddq->d_rtb_softlimit)) {
2080 if (!ddq->d_rtbtimer) { 2010 if (!ddq->d_rtbtimer) {
2081 if (flags & XFS_QMOPT_DOWARN) 2011 if (flags & XFS_QMOPT_DOWARN)
2082 cmn_err(CE_ALERT, 2012 xfs_alert(mp,
2083 "%s : Dquot ID 0x%x (0x%p) " 2013 "%s : Dquot ID 0x%x (0x%p) RTBLK TIMER NOT STARTED",
2084 "RTBLK TIMER NOT STARTED",
2085 str, (int)be32_to_cpu(ddq->d_id), ddq); 2014 str, (int)be32_to_cpu(ddq->d_id), ddq);
2086 errs++; 2015 errs++;
2087 } 2016 }
@@ -2092,7 +2021,7 @@ xfs_qm_dqcheck(
2092 return errs; 2021 return errs;
2093 2022
2094 if (flags & XFS_QMOPT_DOWARN) 2023 if (flags & XFS_QMOPT_DOWARN)
2095 cmn_err(CE_NOTE, "Re-initializing dquot ID 0x%x", id); 2024 xfs_notice(mp, "Re-initializing dquot ID 0x%x", id);
2096 2025
2097 /* 2026 /*
2098 * Typically, a repair is only requested by quotacheck. 2027 * Typically, a repair is only requested by quotacheck.
@@ -2174,77 +2103,46 @@ xlog_recover_do_dquot_buffer(
2174 * for more details on the implementation of the table of cancel records. 2103 * for more details on the implementation of the table of cancel records.
2175 */ 2104 */
2176STATIC int 2105STATIC int
2177xlog_recover_do_buffer_trans( 2106xlog_recover_buffer_pass2(
2178 xlog_t *log, 2107 xlog_t *log,
2179 xlog_recover_item_t *item, 2108 xlog_recover_item_t *item)
2180 int pass)
2181{ 2109{
2182 xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; 2110 xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
2183 xfs_mount_t *mp; 2111 xfs_mount_t *mp = log->l_mp;
2184 xfs_buf_t *bp; 2112 xfs_buf_t *bp;
2185 int error; 2113 int error;
2186 int cancel;
2187 xfs_daddr_t blkno;
2188 int len;
2189 ushort flags;
2190 uint buf_flags; 2114 uint buf_flags;
2191 2115
2192 if (pass == XLOG_RECOVER_PASS1) { 2116 /*
2193 /* 2117 * In this pass we only want to recover all the buffers which have
2194 * In this pass we're only looking for buf items 2118 * not been cancelled and are not cancellation buffers themselves.
2195 * with the XFS_BLF_CANCEL bit set. 2119 */
2196 */ 2120 if (xlog_check_buffer_cancelled(log, buf_f->blf_blkno,
2197 xlog_recover_do_buffer_pass1(log, buf_f); 2121 buf_f->blf_len, buf_f->blf_flags)) {
2122 trace_xfs_log_recover_buf_cancel(log, buf_f);
2198 return 0; 2123 return 0;
2199 } else {
2200 /*
2201 * In this pass we want to recover all the buffers
2202 * which have not been cancelled and are not
2203 * cancellation buffers themselves. The routine
2204 * we call here will tell us whether or not to
2205 * continue with the replay of this buffer.
2206 */
2207 cancel = xlog_recover_do_buffer_pass2(log, buf_f);
2208 if (cancel) {
2209 trace_xfs_log_recover_buf_cancel(log, buf_f);
2210 return 0;
2211 }
2212 } 2124 }
2125
2213 trace_xfs_log_recover_buf_recover(log, buf_f); 2126 trace_xfs_log_recover_buf_recover(log, buf_f);
2214 switch (buf_f->blf_type) {
2215 case XFS_LI_BUF:
2216 blkno = buf_f->blf_blkno;
2217 len = buf_f->blf_len;
2218 flags = buf_f->blf_flags;
2219 break;
2220 default:
2221 xfs_fs_cmn_err(CE_ALERT, log->l_mp,
2222 "xfs_log_recover: unknown buffer type 0x%x, logdev %s",
2223 buf_f->blf_type, log->l_mp->m_logname ?
2224 log->l_mp->m_logname : "internal");
2225 XFS_ERROR_REPORT("xlog_recover_do_buffer_trans",
2226 XFS_ERRLEVEL_LOW, log->l_mp);
2227 return XFS_ERROR(EFSCORRUPTED);
2228 }
2229 2127
2230 mp = log->l_mp;
2231 buf_flags = XBF_LOCK; 2128 buf_flags = XBF_LOCK;
2232 if (!(flags & XFS_BLF_INODE_BUF)) 2129 if (!(buf_f->blf_flags & XFS_BLF_INODE_BUF))
2233 buf_flags |= XBF_MAPPED; 2130 buf_flags |= XBF_MAPPED;
2234 2131
2235 bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, buf_flags); 2132 bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len,
2133 buf_flags);
2236 if (XFS_BUF_ISERROR(bp)) { 2134 if (XFS_BUF_ISERROR(bp)) {
2237 xfs_ioerror_alert("xlog_recover_do..(read#1)", log->l_mp, 2135 xfs_ioerror_alert("xlog_recover_do..(read#1)", mp,
2238 bp, blkno); 2136 bp, buf_f->blf_blkno);
2239 error = XFS_BUF_GETERROR(bp); 2137 error = XFS_BUF_GETERROR(bp);
2240 xfs_buf_relse(bp); 2138 xfs_buf_relse(bp);
2241 return error; 2139 return error;
2242 } 2140 }
2243 2141
2244 error = 0; 2142 error = 0;
2245 if (flags & XFS_BLF_INODE_BUF) { 2143 if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
2246 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); 2144 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
2247 } else if (flags & 2145 } else if (buf_f->blf_flags &
2248 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { 2146 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
2249 xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); 2147 xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
2250 } else { 2148 } else {
@@ -2275,8 +2173,7 @@ xlog_recover_do_buffer_trans(
2275 XFS_BUF_STALE(bp); 2173 XFS_BUF_STALE(bp);
2276 error = xfs_bwrite(mp, bp); 2174 error = xfs_bwrite(mp, bp);
2277 } else { 2175 } else {
2278 ASSERT(bp->b_mount == NULL || bp->b_mount == mp); 2176 ASSERT(bp->b_target->bt_mount == mp);
2279 bp->b_mount = mp;
2280 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); 2177 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
2281 xfs_bdwrite(mp, bp); 2178 xfs_bdwrite(mp, bp);
2282 } 2179 }
@@ -2285,16 +2182,14 @@ xlog_recover_do_buffer_trans(
2285} 2182}
2286 2183
2287STATIC int 2184STATIC int
2288xlog_recover_do_inode_trans( 2185xlog_recover_inode_pass2(
2289 xlog_t *log, 2186 xlog_t *log,
2290 xlog_recover_item_t *item, 2187 xlog_recover_item_t *item)
2291 int pass)
2292{ 2188{
2293 xfs_inode_log_format_t *in_f; 2189 xfs_inode_log_format_t *in_f;
2294 xfs_mount_t *mp; 2190 xfs_mount_t *mp = log->l_mp;
2295 xfs_buf_t *bp; 2191 xfs_buf_t *bp;
2296 xfs_dinode_t *dip; 2192 xfs_dinode_t *dip;
2297 xfs_ino_t ino;
2298 int len; 2193 int len;
2299 xfs_caddr_t src; 2194 xfs_caddr_t src;
2300 xfs_caddr_t dest; 2195 xfs_caddr_t dest;
@@ -2304,10 +2199,6 @@ xlog_recover_do_inode_trans(
2304 xfs_icdinode_t *dicp; 2199 xfs_icdinode_t *dicp;
2305 int need_free = 0; 2200 int need_free = 0;
2306 2201
2307 if (pass == XLOG_RECOVER_PASS1) {
2308 return 0;
2309 }
2310
2311 if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) { 2202 if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) {
2312 in_f = item->ri_buf[0].i_addr; 2203 in_f = item->ri_buf[0].i_addr;
2313 } else { 2204 } else {
@@ -2317,8 +2208,6 @@ xlog_recover_do_inode_trans(
2317 if (error) 2208 if (error)
2318 goto error; 2209 goto error;
2319 } 2210 }
2320 ino = in_f->ilf_ino;
2321 mp = log->l_mp;
2322 2211
2323 /* 2212 /*
2324 * Inode buffers can be freed, look out for it, 2213 * Inode buffers can be freed, look out for it,
@@ -2351,10 +2240,10 @@ xlog_recover_do_inode_trans(
2351 */ 2240 */
2352 if (unlikely(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC)) { 2241 if (unlikely(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC)) {
2353 xfs_buf_relse(bp); 2242 xfs_buf_relse(bp);
2354 xfs_fs_cmn_err(CE_ALERT, mp, 2243 xfs_alert(mp,
2355 "xfs_inode_recover: Bad inode magic number, dino ptr = 0x%p, dino bp = 0x%p, ino = %Ld", 2244 "%s: Bad inode magic number, dip = 0x%p, dino bp = 0x%p, ino = %Ld",
2356 dip, bp, ino); 2245 __func__, dip, bp, in_f->ilf_ino);
2357 XFS_ERROR_REPORT("xlog_recover_do_inode_trans(1)", 2246 XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)",
2358 XFS_ERRLEVEL_LOW, mp); 2247 XFS_ERRLEVEL_LOW, mp);
2359 error = EFSCORRUPTED; 2248 error = EFSCORRUPTED;
2360 goto error; 2249 goto error;
@@ -2362,10 +2251,10 @@ xlog_recover_do_inode_trans(
2362 dicp = item->ri_buf[1].i_addr; 2251 dicp = item->ri_buf[1].i_addr;
2363 if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) { 2252 if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) {
2364 xfs_buf_relse(bp); 2253 xfs_buf_relse(bp);
2365 xfs_fs_cmn_err(CE_ALERT, mp, 2254 xfs_alert(mp,
2366 "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, ino %Ld", 2255 "%s: Bad inode log record, rec ptr 0x%p, ino %Ld",
2367 item, ino); 2256 __func__, item, in_f->ilf_ino);
2368 XFS_ERROR_REPORT("xlog_recover_do_inode_trans(2)", 2257 XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)",
2369 XFS_ERRLEVEL_LOW, mp); 2258 XFS_ERRLEVEL_LOW, mp);
2370 error = EFSCORRUPTED; 2259 error = EFSCORRUPTED;
2371 goto error; 2260 goto error;
@@ -2393,12 +2282,13 @@ xlog_recover_do_inode_trans(
2393 if (unlikely((dicp->di_mode & S_IFMT) == S_IFREG)) { 2282 if (unlikely((dicp->di_mode & S_IFMT) == S_IFREG)) {
2394 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && 2283 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
2395 (dicp->di_format != XFS_DINODE_FMT_BTREE)) { 2284 (dicp->di_format != XFS_DINODE_FMT_BTREE)) {
2396 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(3)", 2285 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)",
2397 XFS_ERRLEVEL_LOW, mp, dicp); 2286 XFS_ERRLEVEL_LOW, mp, dicp);
2398 xfs_buf_relse(bp); 2287 xfs_buf_relse(bp);
2399 xfs_fs_cmn_err(CE_ALERT, mp, 2288 xfs_alert(mp,
2400 "xfs_inode_recover: Bad regular inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", 2289 "%s: Bad regular inode log record, rec ptr 0x%p, "
2401 item, dip, bp, ino); 2290 "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2291 __func__, item, dip, bp, in_f->ilf_ino);
2402 error = EFSCORRUPTED; 2292 error = EFSCORRUPTED;
2403 goto error; 2293 goto error;
2404 } 2294 }
@@ -2406,45 +2296,48 @@ xlog_recover_do_inode_trans(
2406 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && 2296 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
2407 (dicp->di_format != XFS_DINODE_FMT_BTREE) && 2297 (dicp->di_format != XFS_DINODE_FMT_BTREE) &&
2408 (dicp->di_format != XFS_DINODE_FMT_LOCAL)) { 2298 (dicp->di_format != XFS_DINODE_FMT_LOCAL)) {
2409 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(4)", 2299 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)",
2410 XFS_ERRLEVEL_LOW, mp, dicp); 2300 XFS_ERRLEVEL_LOW, mp, dicp);
2411 xfs_buf_relse(bp); 2301 xfs_buf_relse(bp);
2412 xfs_fs_cmn_err(CE_ALERT, mp, 2302 xfs_alert(mp,
2413 "xfs_inode_recover: Bad dir inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", 2303 "%s: Bad dir inode log record, rec ptr 0x%p, "
2414 item, dip, bp, ino); 2304 "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2305 __func__, item, dip, bp, in_f->ilf_ino);
2415 error = EFSCORRUPTED; 2306 error = EFSCORRUPTED;
2416 goto error; 2307 goto error;
2417 } 2308 }
2418 } 2309 }
2419 if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){ 2310 if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){
2420 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(5)", 2311 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)",
2421 XFS_ERRLEVEL_LOW, mp, dicp); 2312 XFS_ERRLEVEL_LOW, mp, dicp);
2422 xfs_buf_relse(bp); 2313 xfs_buf_relse(bp);
2423 xfs_fs_cmn_err(CE_ALERT, mp, 2314 xfs_alert(mp,
2424 "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld", 2315 "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
2425 item, dip, bp, ino, 2316 "dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld",
2317 __func__, item, dip, bp, in_f->ilf_ino,
2426 dicp->di_nextents + dicp->di_anextents, 2318 dicp->di_nextents + dicp->di_anextents,
2427 dicp->di_nblocks); 2319 dicp->di_nblocks);
2428 error = EFSCORRUPTED; 2320 error = EFSCORRUPTED;
2429 goto error; 2321 goto error;
2430 } 2322 }
2431 if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) { 2323 if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) {
2432 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(6)", 2324 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)",
2433 XFS_ERRLEVEL_LOW, mp, dicp); 2325 XFS_ERRLEVEL_LOW, mp, dicp);
2434 xfs_buf_relse(bp); 2326 xfs_buf_relse(bp);
2435 xfs_fs_cmn_err(CE_ALERT, mp, 2327 xfs_alert(mp,
2436 "xfs_inode_recover: Bad inode log rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, forkoff 0x%x", 2328 "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
2437 item, dip, bp, ino, dicp->di_forkoff); 2329 "dino bp 0x%p, ino %Ld, forkoff 0x%x", __func__,
2330 item, dip, bp, in_f->ilf_ino, dicp->di_forkoff);
2438 error = EFSCORRUPTED; 2331 error = EFSCORRUPTED;
2439 goto error; 2332 goto error;
2440 } 2333 }
2441 if (unlikely(item->ri_buf[1].i_len > sizeof(struct xfs_icdinode))) { 2334 if (unlikely(item->ri_buf[1].i_len > sizeof(struct xfs_icdinode))) {
2442 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(7)", 2335 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)",
2443 XFS_ERRLEVEL_LOW, mp, dicp); 2336 XFS_ERRLEVEL_LOW, mp, dicp);
2444 xfs_buf_relse(bp); 2337 xfs_buf_relse(bp);
2445 xfs_fs_cmn_err(CE_ALERT, mp, 2338 xfs_alert(mp,
2446 "xfs_inode_recover: Bad inode log record length %d, rec ptr 0x%p", 2339 "%s: Bad inode log record length %d, rec ptr 0x%p",
2447 item->ri_buf[1].i_len, item); 2340 __func__, item->ri_buf[1].i_len, item);
2448 error = EFSCORRUPTED; 2341 error = EFSCORRUPTED;
2449 goto error; 2342 goto error;
2450 } 2343 }
@@ -2531,7 +2424,7 @@ xlog_recover_do_inode_trans(
2531 break; 2424 break;
2532 2425
2533 default: 2426 default:
2534 xlog_warn("XFS: xlog_recover_do_inode_trans: Invalid flag"); 2427 xfs_warn(log->l_mp, "%s: Invalid flag", __func__);
2535 ASSERT(0); 2428 ASSERT(0);
2536 xfs_buf_relse(bp); 2429 xfs_buf_relse(bp);
2537 error = EIO; 2430 error = EIO;
@@ -2540,8 +2433,7 @@ xlog_recover_do_inode_trans(
2540 } 2433 }
2541 2434
2542write_inode_buffer: 2435write_inode_buffer:
2543 ASSERT(bp->b_mount == NULL || bp->b_mount == mp); 2436 ASSERT(bp->b_target->bt_mount == mp);
2544 bp->b_mount = mp;
2545 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); 2437 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
2546 xfs_bdwrite(mp, bp); 2438 xfs_bdwrite(mp, bp);
2547error: 2439error:
@@ -2556,18 +2448,11 @@ error:
2556 * of that type. 2448 * of that type.
2557 */ 2449 */
2558STATIC int 2450STATIC int
2559xlog_recover_do_quotaoff_trans( 2451xlog_recover_quotaoff_pass1(
2560 xlog_t *log, 2452 xlog_t *log,
2561 xlog_recover_item_t *item, 2453 xlog_recover_item_t *item)
2562 int pass)
2563{ 2454{
2564 xfs_qoff_logformat_t *qoff_f; 2455 xfs_qoff_logformat_t *qoff_f = item->ri_buf[0].i_addr;
2565
2566 if (pass == XLOG_RECOVER_PASS2) {
2567 return (0);
2568 }
2569
2570 qoff_f = item->ri_buf[0].i_addr;
2571 ASSERT(qoff_f); 2456 ASSERT(qoff_f);
2572 2457
2573 /* 2458 /*
@@ -2588,22 +2473,17 @@ xlog_recover_do_quotaoff_trans(
2588 * Recover a dquot record 2473 * Recover a dquot record
2589 */ 2474 */
2590STATIC int 2475STATIC int
2591xlog_recover_do_dquot_trans( 2476xlog_recover_dquot_pass2(
2592 xlog_t *log, 2477 xlog_t *log,
2593 xlog_recover_item_t *item, 2478 xlog_recover_item_t *item)
2594 int pass)
2595{ 2479{
2596 xfs_mount_t *mp; 2480 xfs_mount_t *mp = log->l_mp;
2597 xfs_buf_t *bp; 2481 xfs_buf_t *bp;
2598 struct xfs_disk_dquot *ddq, *recddq; 2482 struct xfs_disk_dquot *ddq, *recddq;
2599 int error; 2483 int error;
2600 xfs_dq_logformat_t *dq_f; 2484 xfs_dq_logformat_t *dq_f;
2601 uint type; 2485 uint type;
2602 2486
2603 if (pass == XLOG_RECOVER_PASS1) {
2604 return 0;
2605 }
2606 mp = log->l_mp;
2607 2487
2608 /* 2488 /*
2609 * Filesystems are required to send in quota flags at mount time. 2489 * Filesystems are required to send in quota flags at mount time.
@@ -2613,13 +2493,11 @@ xlog_recover_do_dquot_trans(
2613 2493
2614 recddq = item->ri_buf[1].i_addr; 2494 recddq = item->ri_buf[1].i_addr;
2615 if (recddq == NULL) { 2495 if (recddq == NULL) {
2616 cmn_err(CE_ALERT, 2496 xfs_alert(log->l_mp, "NULL dquot in %s.", __func__);
2617 "XFS: NULL dquot in %s.", __func__);
2618 return XFS_ERROR(EIO); 2497 return XFS_ERROR(EIO);
2619 } 2498 }
2620 if (item->ri_buf[1].i_len < sizeof(xfs_disk_dquot_t)) { 2499 if (item->ri_buf[1].i_len < sizeof(xfs_disk_dquot_t)) {
2621 cmn_err(CE_ALERT, 2500 xfs_alert(log->l_mp, "dquot too small (%d) in %s.",
2622 "XFS: dquot too small (%d) in %s.",
2623 item->ri_buf[1].i_len, __func__); 2501 item->ri_buf[1].i_len, __func__);
2624 return XFS_ERROR(EIO); 2502 return XFS_ERROR(EIO);
2625 } 2503 }
@@ -2644,12 +2522,10 @@ xlog_recover_do_dquot_trans(
2644 */ 2522 */
2645 dq_f = item->ri_buf[0].i_addr; 2523 dq_f = item->ri_buf[0].i_addr;
2646 ASSERT(dq_f); 2524 ASSERT(dq_f);
2647 if ((error = xfs_qm_dqcheck(recddq, 2525 error = xfs_qm_dqcheck(mp, recddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
2648 dq_f->qlf_id, 2526 "xlog_recover_dquot_pass2 (log copy)");
2649 0, XFS_QMOPT_DOWARN, 2527 if (error)
2650 "xlog_recover_do_dquot_trans (log copy)"))) {
2651 return XFS_ERROR(EIO); 2528 return XFS_ERROR(EIO);
2652 }
2653 ASSERT(dq_f->qlf_len == 1); 2529 ASSERT(dq_f->qlf_len == 1);
2654 2530
2655 error = xfs_read_buf(mp, mp->m_ddev_targp, 2531 error = xfs_read_buf(mp, mp->m_ddev_targp,
@@ -2669,8 +2545,9 @@ xlog_recover_do_dquot_trans(
2669 * was among a chunk of dquots created earlier, and we did some 2545 * was among a chunk of dquots created earlier, and we did some
2670 * minimal initialization then. 2546 * minimal initialization then.
2671 */ 2547 */
2672 if (xfs_qm_dqcheck(ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN, 2548 error = xfs_qm_dqcheck(mp, ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
2673 "xlog_recover_do_dquot_trans")) { 2549 "xlog_recover_dquot_pass2");
2550 if (error) {
2674 xfs_buf_relse(bp); 2551 xfs_buf_relse(bp);
2675 return XFS_ERROR(EIO); 2552 return XFS_ERROR(EIO);
2676 } 2553 }
@@ -2678,8 +2555,7 @@ xlog_recover_do_dquot_trans(
2678 memcpy(ddq, recddq, item->ri_buf[1].i_len); 2555 memcpy(ddq, recddq, item->ri_buf[1].i_len);
2679 2556
2680 ASSERT(dq_f->qlf_size == 2); 2557 ASSERT(dq_f->qlf_size == 2);
2681 ASSERT(bp->b_mount == NULL || bp->b_mount == mp); 2558 ASSERT(bp->b_target->bt_mount == mp);
2682 bp->b_mount = mp;
2683 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); 2559 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
2684 xfs_bdwrite(mp, bp); 2560 xfs_bdwrite(mp, bp);
2685 2561
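[Editor's note] Besides dropping the per-pass checks, the dquot recovery hunks above untangle the old `if ((error = xfs_qm_dqcheck(...)))` construct into a separate assignment followed by a test, which is the kernel's preferred style for multi-argument calls. A minimal, hypothetical illustration of that transformation (check_record() is a stand-in, not an XFS function):

	#include <errno.h>

	/* Hypothetical validator, standing in for calls like xfs_qm_dqcheck(). */
	static int check_record(int id)
	{
		return id >= 0 ? 0 : -EINVAL;
	}

	static int recover_record(int id)
	{
		int error;

		/* Old style buried the assignment in the condition:
		 *	if ((error = check_record(id)))
		 *		return error;
		 * The reworked code assigns first, then tests:
		 */
		error = check_record(id);
		if (error)
			return error;

		/* ... apply the recovered record ... */
		return 0;
	}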
@@ -2694,38 +2570,31 @@ xlog_recover_do_dquot_trans(
2694 * LSN. 2570 * LSN.
2695 */ 2571 */
2696STATIC int 2572STATIC int
2697xlog_recover_do_efi_trans( 2573xlog_recover_efi_pass2(
2698 xlog_t *log, 2574 xlog_t *log,
2699 xlog_recover_item_t *item, 2575 xlog_recover_item_t *item,
2700 xfs_lsn_t lsn, 2576 xfs_lsn_t lsn)
2701 int pass)
2702{ 2577{
2703 int error; 2578 int error;
2704 xfs_mount_t *mp; 2579 xfs_mount_t *mp = log->l_mp;
2705 xfs_efi_log_item_t *efip; 2580 xfs_efi_log_item_t *efip;
2706 xfs_efi_log_format_t *efi_formatp; 2581 xfs_efi_log_format_t *efi_formatp;
2707 2582
2708 if (pass == XLOG_RECOVER_PASS1) {
2709 return 0;
2710 }
2711
2712 efi_formatp = item->ri_buf[0].i_addr; 2583 efi_formatp = item->ri_buf[0].i_addr;
2713 2584
2714 mp = log->l_mp;
2715 efip = xfs_efi_init(mp, efi_formatp->efi_nextents); 2585 efip = xfs_efi_init(mp, efi_formatp->efi_nextents);
2716 if ((error = xfs_efi_copy_format(&(item->ri_buf[0]), 2586 if ((error = xfs_efi_copy_format(&(item->ri_buf[0]),
2717 &(efip->efi_format)))) { 2587 &(efip->efi_format)))) {
2718 xfs_efi_item_free(efip); 2588 xfs_efi_item_free(efip);
2719 return error; 2589 return error;
2720 } 2590 }
2721 efip->efi_next_extent = efi_formatp->efi_nextents; 2591 atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents);
2722 efip->efi_flags |= XFS_EFI_COMMITTED;
2723 2592
2724 spin_lock(&log->l_ailp->xa_lock); 2593 spin_lock(&log->l_ailp->xa_lock);
2725 /* 2594 /*
2726 * xfs_trans_ail_update() drops the AIL lock. 2595 * xfs_trans_ail_update() drops the AIL lock.
2727 */ 2596 */
2728 xfs_trans_ail_update(log->l_ailp, (xfs_log_item_t *)efip, lsn); 2597 xfs_trans_ail_update(log->l_ailp, &efip->efi_item, lsn);
2729 return 0; 2598 return 0;
2730} 2599}
2731 2600
@@ -2738,11 +2607,10 @@ xlog_recover_do_efi_trans(
2738 * efd format structure. If we find it, we remove the efi from the 2607 * efd format structure. If we find it, we remove the efi from the
2739 * AIL and free it. 2608 * AIL and free it.
2740 */ 2609 */
2741STATIC void 2610STATIC int
2742xlog_recover_do_efd_trans( 2611xlog_recover_efd_pass2(
2743 xlog_t *log, 2612 xlog_t *log,
2744 xlog_recover_item_t *item, 2613 xlog_recover_item_t *item)
2745 int pass)
2746{ 2614{
2747 xfs_efd_log_format_t *efd_formatp; 2615 xfs_efd_log_format_t *efd_formatp;
2748 xfs_efi_log_item_t *efip = NULL; 2616 xfs_efi_log_item_t *efip = NULL;
@@ -2751,10 +2619,6 @@ xlog_recover_do_efd_trans(
2751 struct xfs_ail_cursor cur; 2619 struct xfs_ail_cursor cur;
2752 struct xfs_ail *ailp = log->l_ailp; 2620 struct xfs_ail *ailp = log->l_ailp;
2753 2621
2754 if (pass == XLOG_RECOVER_PASS1) {
2755 return;
2756 }
2757
2758 efd_formatp = item->ri_buf[0].i_addr; 2622 efd_formatp = item->ri_buf[0].i_addr;
2759 ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) + 2623 ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) +
2760 ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) || 2624 ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) ||
@@ -2786,62 +2650,6 @@ xlog_recover_do_efd_trans(
2786 } 2650 }
2787 xfs_trans_ail_cursor_done(ailp, &cur); 2651 xfs_trans_ail_cursor_done(ailp, &cur);
2788 spin_unlock(&ailp->xa_lock); 2652 spin_unlock(&ailp->xa_lock);
2789}
2790
2791/*
2792 * Perform the transaction
2793 *
2794 * If the transaction modifies a buffer or inode, do it now. Otherwise,
2795 * EFIs and EFDs get queued up by adding entries into the AIL for them.
2796 */
2797STATIC int
2798xlog_recover_do_trans(
2799 xlog_t *log,
2800 xlog_recover_t *trans,
2801 int pass)
2802{
2803 int error = 0;
2804 xlog_recover_item_t *item;
2805
2806 error = xlog_recover_reorder_trans(log, trans, pass);
2807 if (error)
2808 return error;
2809
2810 list_for_each_entry(item, &trans->r_itemq, ri_list) {
2811 trace_xfs_log_recover_item_recover(log, trans, item, pass);
2812 switch (ITEM_TYPE(item)) {
2813 case XFS_LI_BUF:
2814 error = xlog_recover_do_buffer_trans(log, item, pass);
2815 break;
2816 case XFS_LI_INODE:
2817 error = xlog_recover_do_inode_trans(log, item, pass);
2818 break;
2819 case XFS_LI_EFI:
2820 error = xlog_recover_do_efi_trans(log, item,
2821 trans->r_lsn, pass);
2822 break;
2823 case XFS_LI_EFD:
2824 xlog_recover_do_efd_trans(log, item, pass);
2825 error = 0;
2826 break;
2827 case XFS_LI_DQUOT:
2828 error = xlog_recover_do_dquot_trans(log, item, pass);
2829 break;
2830 case XFS_LI_QUOTAOFF:
2831 error = xlog_recover_do_quotaoff_trans(log, item,
2832 pass);
2833 break;
2834 default:
2835 xlog_warn(
2836 "XFS: invalid item type (%d) xlog_recover_do_trans", ITEM_TYPE(item));
2837 ASSERT(0);
2838 error = XFS_ERROR(EIO);
2839 break;
2840 }
2841
2842 if (error)
2843 return error;
2844 }
2845 2653
2846 return 0; 2654 return 0;
2847} 2655}
@@ -2853,7 +2661,7 @@ xlog_recover_do_trans(
2853 */ 2661 */
2854STATIC void 2662STATIC void
2855xlog_recover_free_trans( 2663xlog_recover_free_trans(
2856 xlog_recover_t *trans) 2664 struct xlog_recover *trans)
2857{ 2665{
2858 xlog_recover_item_t *item, *n; 2666 xlog_recover_item_t *item, *n;
2859 int i; 2667 int i;
@@ -2872,26 +2680,103 @@ xlog_recover_free_trans(
2872} 2680}
2873 2681
2874STATIC int 2682STATIC int
2683xlog_recover_commit_pass1(
2684 struct log *log,
2685 struct xlog_recover *trans,
2686 xlog_recover_item_t *item)
2687{
2688 trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS1);
2689
2690 switch (ITEM_TYPE(item)) {
2691 case XFS_LI_BUF:
2692 return xlog_recover_buffer_pass1(log, item);
2693 case XFS_LI_QUOTAOFF:
2694 return xlog_recover_quotaoff_pass1(log, item);
2695 case XFS_LI_INODE:
2696 case XFS_LI_EFI:
2697 case XFS_LI_EFD:
2698 case XFS_LI_DQUOT:
2699 /* nothing to do in pass 1 */
2700 return 0;
2701 default:
2702 xfs_warn(log->l_mp, "%s: invalid item type (%d)",
2703 __func__, ITEM_TYPE(item));
2704 ASSERT(0);
2705 return XFS_ERROR(EIO);
2706 }
2707}
2708
2709STATIC int
2710xlog_recover_commit_pass2(
2711 struct log *log,
2712 struct xlog_recover *trans,
2713 xlog_recover_item_t *item)
2714{
2715 trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS2);
2716
2717 switch (ITEM_TYPE(item)) {
2718 case XFS_LI_BUF:
2719 return xlog_recover_buffer_pass2(log, item);
2720 case XFS_LI_INODE:
2721 return xlog_recover_inode_pass2(log, item);
2722 case XFS_LI_EFI:
2723 return xlog_recover_efi_pass2(log, item, trans->r_lsn);
2724 case XFS_LI_EFD:
2725 return xlog_recover_efd_pass2(log, item);
2726 case XFS_LI_DQUOT:
2727 return xlog_recover_dquot_pass2(log, item);
2728 case XFS_LI_QUOTAOFF:
2729 /* nothing to do in pass2 */
2730 return 0;
2731 default:
2732 xfs_warn(log->l_mp, "%s: invalid item type (%d)",
2733 __func__, ITEM_TYPE(item));
2734 ASSERT(0);
2735 return XFS_ERROR(EIO);
2736 }
2737}
2738
2739/*
2740 * Perform the transaction.
2741 *
2742 * If the transaction modifies a buffer or inode, do it now. Otherwise,
2743 * EFIs and EFDs get queued up by adding entries into the AIL for them.
2744 */
2745STATIC int
2875xlog_recover_commit_trans( 2746xlog_recover_commit_trans(
2876 xlog_t *log, 2747 struct log *log,
2877 xlog_recover_t *trans, 2748 struct xlog_recover *trans,
2878 int pass) 2749 int pass)
2879{ 2750{
2880 int error; 2751 int error = 0;
2752 xlog_recover_item_t *item;
2881 2753
2882 hlist_del(&trans->r_list); 2754 hlist_del(&trans->r_list);
2883 if ((error = xlog_recover_do_trans(log, trans, pass))) 2755
2756 error = xlog_recover_reorder_trans(log, trans, pass);
2757 if (error)
2884 return error; 2758 return error;
2885 xlog_recover_free_trans(trans); /* no error */ 2759
2760 list_for_each_entry(item, &trans->r_itemq, ri_list) {
2761 if (pass == XLOG_RECOVER_PASS1)
2762 error = xlog_recover_commit_pass1(log, trans, item);
2763 else
2764 error = xlog_recover_commit_pass2(log, trans, item);
2765 if (error)
2766 return error;
2767 }
2768
2769 xlog_recover_free_trans(trans);
2886 return 0; 2770 return 0;
2887} 2771}
2888 2772
2889STATIC int 2773STATIC int
2890xlog_recover_unmount_trans( 2774xlog_recover_unmount_trans(
2775 struct log *log,
2891 xlog_recover_t *trans) 2776 xlog_recover_t *trans)
2892{ 2777{
2893 /* Do nothing now */ 2778 /* Do nothing now */
2894 xlog_warn("XFS: xlog_recover_unmount_trans: Unmount LR"); 2779 xfs_warn(log->l_mp, "%s: Unmount LR", __func__);
2895 return 0; 2780 return 0;
2896} 2781}
2897 2782
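[Editor's note] This hunk is the heart of the restructuring: the per-item `pass` argument and the scattered early returns disappear, and xlog_recover_commit_trans() walks the item list once, dispatching to a pass-1 or pass-2 handler per item type. A standalone sketch of the same shape (the enum and struct names below are simplified stand-ins, not the XFS definitions):

	#include <errno.h>
	#include <stddef.h>

	enum recover_pass { PASS1, PASS2 };
	enum item_type { ITEM_BUF, ITEM_INODE, ITEM_QUOTAOFF };

	struct item {
		enum item_type	type;
		struct item	*next;
	};

	static int commit_pass1(struct item *it)
	{
		switch (it->type) {
		case ITEM_BUF:
		case ITEM_QUOTAOFF:
			return 0;	/* pass-1 work (e.g. note cancelled buffers) */
		case ITEM_INODE:
			return 0;	/* nothing to do in pass 1 */
		default:
			return -EIO;	/* unknown item type */
		}
	}

	static int commit_pass2(struct item *it)
	{
		switch (it->type) {
		case ITEM_BUF:
		case ITEM_INODE:
			return 0;	/* pass-2 replay into the filesystem */
		case ITEM_QUOTAOFF:
			return 0;	/* nothing to do in pass 2 */
		default:
			return -EIO;
		}
	}

	/* Walk the transaction's items once, dispatching by pass. */
	static int commit_trans(struct item *head, enum recover_pass pass)
	{
		struct item *it;
		int error;

		for (it = head; it != NULL; it = it->next) {
			error = (pass == PASS1) ? commit_pass1(it) : commit_pass2(it);
			if (error)
				return error;
		}
		return 0;
	}

Keeping the two dispatchers separate means the "nothing to do in this pass" cases are stated once per pass instead of being hidden inside every handler.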
@@ -2934,8 +2819,8 @@ xlog_recover_process_data(
2934 dp += sizeof(xlog_op_header_t); 2819 dp += sizeof(xlog_op_header_t);
2935 if (ohead->oh_clientid != XFS_TRANSACTION && 2820 if (ohead->oh_clientid != XFS_TRANSACTION &&
2936 ohead->oh_clientid != XFS_LOG) { 2821 ohead->oh_clientid != XFS_LOG) {
2937 xlog_warn( 2822 xfs_warn(log->l_mp, "%s: bad clientid 0x%x",
2938 "XFS: xlog_recover_process_data: bad clientid"); 2823 __func__, ohead->oh_clientid);
2939 ASSERT(0); 2824 ASSERT(0);
2940 return (XFS_ERROR(EIO)); 2825 return (XFS_ERROR(EIO));
2941 } 2826 }
@@ -2948,8 +2833,8 @@ xlog_recover_process_data(
2948 be64_to_cpu(rhead->h_lsn)); 2833 be64_to_cpu(rhead->h_lsn));
2949 } else { 2834 } else {
2950 if (dp + be32_to_cpu(ohead->oh_len) > lp) { 2835 if (dp + be32_to_cpu(ohead->oh_len) > lp) {
2951 xlog_warn( 2836 xfs_warn(log->l_mp, "%s: bad length 0x%x",
2952 "XFS: xlog_recover_process_data: bad length"); 2837 __func__, be32_to_cpu(ohead->oh_len));
2953 WARN_ON(1); 2838 WARN_ON(1);
2954 return (XFS_ERROR(EIO)); 2839 return (XFS_ERROR(EIO));
2955 } 2840 }
@@ -2962,7 +2847,7 @@ xlog_recover_process_data(
2962 trans, pass); 2847 trans, pass);
2963 break; 2848 break;
2964 case XLOG_UNMOUNT_TRANS: 2849 case XLOG_UNMOUNT_TRANS:
2965 error = xlog_recover_unmount_trans(trans); 2850 error = xlog_recover_unmount_trans(log, trans);
2966 break; 2851 break;
2967 case XLOG_WAS_CONT_TRANS: 2852 case XLOG_WAS_CONT_TRANS:
2968 error = xlog_recover_add_to_cont_trans(log, 2853 error = xlog_recover_add_to_cont_trans(log,
@@ -2970,8 +2855,8 @@ xlog_recover_process_data(
2970 be32_to_cpu(ohead->oh_len)); 2855 be32_to_cpu(ohead->oh_len));
2971 break; 2856 break;
2972 case XLOG_START_TRANS: 2857 case XLOG_START_TRANS:
2973 xlog_warn( 2858 xfs_warn(log->l_mp, "%s: bad transaction",
2974 "XFS: xlog_recover_process_data: bad transaction"); 2859 __func__);
2975 ASSERT(0); 2860 ASSERT(0);
2976 error = XFS_ERROR(EIO); 2861 error = XFS_ERROR(EIO);
2977 break; 2862 break;
@@ -2981,8 +2866,8 @@ xlog_recover_process_data(
2981 dp, be32_to_cpu(ohead->oh_len)); 2866 dp, be32_to_cpu(ohead->oh_len));
2982 break; 2867 break;
2983 default: 2868 default:
2984 xlog_warn( 2869 xfs_warn(log->l_mp, "%s: bad flag 0x%x",
2985 "XFS: xlog_recover_process_data: bad flag"); 2870 __func__, flags);
2986 ASSERT(0); 2871 ASSERT(0);
2987 error = XFS_ERROR(EIO); 2872 error = XFS_ERROR(EIO);
2988 break; 2873 break;
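[Editor's note] Throughout these hunks the old xlog_warn()/cmn_err() calls become xfs_warn()/xfs_alert()/xfs_notice(), which take the mount so every message is automatically prefixed with the device, and __func__ replaces hand-written function names. A rough userspace illustration of why that helps (a sketch only, not the kernel implementation):

	#include <stdarg.h>
	#include <stdio.h>

	struct mount { const char *devname; };

	/* Simplified stand-in for xfs_warn(mp, fmt, ...): prefix with the device. */
	static void warn(struct mount *mp, const char *fmt, ...)
	{
		va_list ap;

		fprintf(stderr, "XFS (%s): ", mp->devname);
		va_start(ap, fmt);
		vfprintf(stderr, fmt, ap);
		va_end(ap);
		fputc('\n', stderr);
	}

	static int process_data(struct mount *mp, int clientid, int expected)
	{
		if (clientid != expected) {
			/* __func__ keeps the message correct if the function is renamed. */
			warn(mp, "%s: bad clientid 0x%x", __func__, clientid);
			return -1;
		}
		return 0;
	}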
@@ -3012,7 +2897,7 @@ xlog_recover_process_efi(
3012 xfs_extent_t *extp; 2897 xfs_extent_t *extp;
3013 xfs_fsblock_t startblock_fsb; 2898 xfs_fsblock_t startblock_fsb;
3014 2899
3015 ASSERT(!(efip->efi_flags & XFS_EFI_RECOVERED)); 2900 ASSERT(!test_bit(XFS_EFI_RECOVERED, &efip->efi_flags));
3016 2901
3017 /* 2902 /*
3018 * First check the validity of the extents described by the 2903 * First check the validity of the extents described by the
@@ -3051,7 +2936,7 @@ xlog_recover_process_efi(
3051 extp->ext_len); 2936 extp->ext_len);
3052 } 2937 }
3053 2938
3054 efip->efi_flags |= XFS_EFI_RECOVERED; 2939 set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
3055 error = xfs_trans_commit(tp, 0); 2940 error = xfs_trans_commit(tp, 0);
3056 return error; 2941 return error;
3057 2942
@@ -3108,7 +2993,7 @@ xlog_recover_process_efis(
3108 * Skip EFIs that we've already processed. 2993 * Skip EFIs that we've already processed.
3109 */ 2994 */
3110 efip = (xfs_efi_log_item_t *)lip; 2995 efip = (xfs_efi_log_item_t *)lip;
3111 if (efip->efi_flags & XFS_EFI_RECOVERED) { 2996 if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) {
3112 lip = xfs_trans_ail_cursor_next(ailp, &cur); 2997 lip = xfs_trans_ail_cursor_next(ailp, &cur);
3113 continue; 2998 continue;
3114 } 2999 }
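[Editor's note] The EFI "recovered" state moves from an open-coded `efi_flags |=` bit-or to set_bit()/test_bit() on an unsigned long, so the flag can be updated atomically without wrapping a lock around a read-modify-write. A minimal kernel-style sketch of the pattern (the structure and bit name are illustrative, not the XFS definitions):

	#include <linux/bitops.h>
	#include <linux/types.h>

	#define MY_ITEM_RECOVERED	1	/* bit number, as with XFS_EFI_RECOVERED */

	struct my_item {
		unsigned long	flags;
	};

	static int item_process(struct my_item *ip)
	{
		/* Skip items we have already processed. */
		if (test_bit(MY_ITEM_RECOVERED, &ip->flags))
			return 0;

		/* ... replay the item ... */

		/* Atomic read-modify-write; no external lock needed for the flag. */
		set_bit(MY_ITEM_RECOVERED, &ip->flags);
		return 0;
	}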
@@ -3167,8 +3052,7 @@ xlog_recover_clear_agi_bucket(
3167out_abort: 3052out_abort:
3168 xfs_trans_cancel(tp, XFS_TRANS_ABORT); 3053 xfs_trans_cancel(tp, XFS_TRANS_ABORT);
3169out_error: 3054out_error:
3170 xfs_fs_cmn_err(CE_WARN, mp, "xlog_recover_clear_agi_bucket: " 3055 xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__, agno);
3171 "failed to clear agi %d. Continuing.", agno);
3172 return; 3056 return;
3173} 3057}
3174 3058
@@ -3419,7 +3303,7 @@ xlog_valid_rec_header(
3419 if (unlikely( 3303 if (unlikely(
3420 (!rhead->h_version || 3304 (!rhead->h_version ||
3421 (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) { 3305 (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) {
3422 xlog_warn("XFS: %s: unrecognised log version (%d).", 3306 xfs_warn(log->l_mp, "%s: unrecognised log version (%d).",
3423 __func__, be32_to_cpu(rhead->h_version)); 3307 __func__, be32_to_cpu(rhead->h_version));
3424 return XFS_ERROR(EIO); 3308 return XFS_ERROR(EIO);
3425 } 3309 }
@@ -3585,19 +3469,9 @@ xlog_do_recovery_pass(
3585 * - order is important. 3469 * - order is important.
3586 */ 3470 */
3587 wrapped_hblks = hblks - split_hblks; 3471 wrapped_hblks = hblks - split_hblks;
3588 error = XFS_BUF_SET_PTR(hbp, 3472 error = xlog_bread_offset(log, 0,
3589 offset + BBTOB(split_hblks), 3473 wrapped_hblks, hbp,
3590 BBTOB(hblks - split_hblks)); 3474 offset + BBTOB(split_hblks));
3591 if (error)
3592 goto bread_err2;
3593
3594 error = xlog_bread_noalign(log, 0,
3595 wrapped_hblks, hbp);
3596 if (error)
3597 goto bread_err2;
3598
3599 error = XFS_BUF_SET_PTR(hbp, offset,
3600 BBTOB(hblks));
3601 if (error) 3475 if (error)
3602 goto bread_err2; 3476 goto bread_err2;
3603 } 3477 }
@@ -3648,19 +3522,9 @@ xlog_do_recovery_pass(
3648 * _first_, then the log start (LR header end) 3522 * _first_, then the log start (LR header end)
3649 * - order is important. 3523 * - order is important.
3650 */ 3524 */
3651 error = XFS_BUF_SET_PTR(dbp, 3525 error = xlog_bread_offset(log, 0,
3652 offset + BBTOB(split_bblks), 3526 bblks - split_bblks, hbp,
3653 BBTOB(bblks - split_bblks)); 3527 offset + BBTOB(split_bblks));
3654 if (error)
3655 goto bread_err2;
3656
3657 error = xlog_bread_noalign(log, wrapped_hblks,
3658 bblks - split_bblks,
3659 dbp);
3660 if (error)
3661 goto bread_err2;
3662
3663 error = XFS_BUF_SET_PTR(dbp, offset, h_size);
3664 if (error) 3528 if (error)
3665 goto bread_err2; 3529 goto bread_err2;
3666 } 3530 }
@@ -3725,7 +3589,7 @@ xlog_do_log_recovery(
3725 xfs_daddr_t head_blk, 3589 xfs_daddr_t head_blk,
3726 xfs_daddr_t tail_blk) 3590 xfs_daddr_t tail_blk)
3727{ 3591{
3728 int error; 3592 int error, i;
3729 3593
3730 ASSERT(head_blk != tail_blk); 3594 ASSERT(head_blk != tail_blk);
3731 3595
@@ -3733,10 +3597,12 @@ xlog_do_log_recovery(
3733 * First do a pass to find all of the cancelled buf log items. 3597 * First do a pass to find all of the cancelled buf log items.
3734 * Store them in the buf_cancel_table for use in the second pass. 3598 * Store them in the buf_cancel_table for use in the second pass.
3735 */ 3599 */
3736 log->l_buf_cancel_table = 3600 log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE *
3737 (xfs_buf_cancel_t **)kmem_zalloc(XLOG_BC_TABLE_SIZE * 3601 sizeof(struct list_head),
3738 sizeof(xfs_buf_cancel_t*),
3739 KM_SLEEP); 3602 KM_SLEEP);
3603 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
3604 INIT_LIST_HEAD(&log->l_buf_cancel_table[i]);
3605
3740 error = xlog_do_recovery_pass(log, head_blk, tail_blk, 3606 error = xlog_do_recovery_pass(log, head_blk, tail_blk,
3741 XLOG_RECOVER_PASS1); 3607 XLOG_RECOVER_PASS1);
3742 if (error != 0) { 3608 if (error != 0) {
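[Editor's note] The buffer-cancel table changes from an array of xfs_buf_cancel_t pointers (a hand-rolled singly linked hash chain) to an array of struct list_head, so the standard list helpers apply and the DEBUG check in the next hunk becomes a simple list_empty(). A hedged kernel-style sketch of allocating such a table (the size, names and kcalloc() call are illustrative; the XFS code uses kmem_zalloc() with XLOG_BC_TABLE_SIZE as shown above):

	#include <linux/list.h>
	#include <linux/slab.h>

	#define CANCEL_TABLE_SIZE	64	/* illustrative bucket count */

	struct cancel_entry {
		struct list_head	list;	/* chained into one bucket */
		unsigned long long	blkno;
	};

	static struct list_head *cancel_table_alloc(void)
	{
		struct list_head *table;
		int i;

		table = kcalloc(CANCEL_TABLE_SIZE, sizeof(*table), GFP_KERNEL);
		if (!table)
			return NULL;
		for (i = 0; i < CANCEL_TABLE_SIZE; i++)
			INIT_LIST_HEAD(&table[i]);	/* each bucket starts empty */
		return table;
	}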
@@ -3755,7 +3621,7 @@ xlog_do_log_recovery(
3755 int i; 3621 int i;
3756 3622
3757 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) 3623 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
3758 ASSERT(log->l_buf_cancel_table[i] == NULL); 3624 ASSERT(list_empty(&log->l_buf_cancel_table[i]));
3759 } 3625 }
3760#endif /* DEBUG */ 3626#endif /* DEBUG */
3761 3627
@@ -3817,7 +3683,7 @@ xlog_do_recover(
3817 XFS_BUF_READ(bp); 3683 XFS_BUF_READ(bp);
3818 XFS_BUF_UNASYNC(bp); 3684 XFS_BUF_UNASYNC(bp);
3819 xfsbdstrat(log->l_mp, bp); 3685 xfsbdstrat(log->l_mp, bp);
3820 error = xfs_iowait(bp); 3686 error = xfs_buf_iowait(bp);
3821 if (error) { 3687 if (error) {
3822 xfs_ioerror_alert("xlog_do_recover", 3688 xfs_ioerror_alert("xlog_do_recover",
3823 log->l_mp, bp, XFS_BUF_ADDR(bp)); 3689 log->l_mp, bp, XFS_BUF_ADDR(bp));
@@ -3875,10 +3741,9 @@ xlog_recover(
3875 return error; 3741 return error;
3876 } 3742 }
3877 3743
3878 cmn_err(CE_NOTE, 3744 xfs_notice(log->l_mp, "Starting recovery (logdev: %s)",
3879 "Starting XFS recovery on filesystem: %s (logdev: %s)", 3745 log->l_mp->m_logname ? log->l_mp->m_logname
3880 log->l_mp->m_fsname, log->l_mp->m_logname ? 3746 : "internal");
3881 log->l_mp->m_logname : "internal");
3882 3747
3883 error = xlog_do_recover(log, head_blk, tail_blk); 3748 error = xlog_do_recover(log, head_blk, tail_blk);
3884 log->l_flags |= XLOG_RECOVERY_NEEDED; 3749 log->l_flags |= XLOG_RECOVERY_NEEDED;
@@ -3911,9 +3776,7 @@ xlog_recover_finish(
3911 int error; 3776 int error;
3912 error = xlog_recover_process_efis(log); 3777 error = xlog_recover_process_efis(log);
3913 if (error) { 3778 if (error) {
3914 cmn_err(CE_ALERT, 3779 xfs_alert(log->l_mp, "Failed to recover EFIs");
3915 "Failed to recover EFIs on filesystem: %s",
3916 log->l_mp->m_fsname);
3917 return error; 3780 return error;
3918 } 3781 }
3919 /* 3782 /*
@@ -3928,15 +3791,12 @@ xlog_recover_finish(
3928 3791
3929 xlog_recover_check_summary(log); 3792 xlog_recover_check_summary(log);
3930 3793
3931 cmn_err(CE_NOTE, 3794 xfs_notice(log->l_mp, "Ending recovery (logdev: %s)",
3932 "Ending XFS recovery on filesystem: %s (logdev: %s)", 3795 log->l_mp->m_logname ? log->l_mp->m_logname
3933 log->l_mp->m_fsname, log->l_mp->m_logname ? 3796 : "internal");
3934 log->l_mp->m_logname : "internal");
3935 log->l_flags &= ~XLOG_RECOVERY_NEEDED; 3797 log->l_flags &= ~XLOG_RECOVERY_NEEDED;
3936 } else { 3798 } else {
3937 cmn_err(CE_DEBUG, 3799 xfs_info(log->l_mp, "Ending clean mount");
3938 "!Ending clean XFS mount for filesystem: %s\n",
3939 log->l_mp->m_fsname);
3940 } 3800 }
3941 return 0; 3801 return 0;
3942} 3802}
@@ -3969,10 +3829,8 @@ xlog_recover_check_summary(
3969 for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { 3829 for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
3970 error = xfs_read_agf(mp, NULL, agno, 0, &agfbp); 3830 error = xfs_read_agf(mp, NULL, agno, 0, &agfbp);
3971 if (error) { 3831 if (error) {
3972 xfs_fs_cmn_err(CE_ALERT, mp, 3832 xfs_alert(mp, "%s agf read failed agno %d error %d",
3973 "xlog_recover_check_summary(agf)" 3833 __func__, agno, error);
3974 "agf read failed agno %d error %d",
3975 agno, error);
3976 } else { 3834 } else {
3977 agfp = XFS_BUF_TO_AGF(agfbp); 3835 agfp = XFS_BUF_TO_AGF(agfbp);
3978 freeblks += be32_to_cpu(agfp->agf_freeblks) + 3836 freeblks += be32_to_cpu(agfp->agf_freeblks) +
@@ -3981,7 +3839,10 @@ xlog_recover_check_summary(
3981 } 3839 }
3982 3840
3983 error = xfs_read_agi(mp, NULL, agno, &agibp); 3841 error = xfs_read_agi(mp, NULL, agno, &agibp);
3984 if (!error) { 3842 if (error) {
3843 xfs_alert(mp, "%s agi read failed agno %d error %d",
3844 __func__, agno, error);
3845 } else {
3985 struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp); 3846 struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp);
3986 3847
3987 itotal += be32_to_cpu(agi->agi_count); 3848 itotal += be32_to_cpu(agi->agi_count);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index aeb9d72ebf6e..b49b82363d20 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -52,16 +52,11 @@ STATIC void xfs_icsb_balance_counter(xfs_mount_t *, xfs_sb_field_t,
52 int); 52 int);
53STATIC void xfs_icsb_balance_counter_locked(xfs_mount_t *, xfs_sb_field_t, 53STATIC void xfs_icsb_balance_counter_locked(xfs_mount_t *, xfs_sb_field_t,
54 int); 54 int);
55STATIC int xfs_icsb_modify_counters(xfs_mount_t *, xfs_sb_field_t,
56 int64_t, int);
57STATIC void xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t); 55STATIC void xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t);
58
59#else 56#else
60 57
61#define xfs_icsb_balance_counter(mp, a, b) do { } while (0) 58#define xfs_icsb_balance_counter(mp, a, b) do { } while (0)
62#define xfs_icsb_balance_counter_locked(mp, a, b) do { } while (0) 59#define xfs_icsb_balance_counter_locked(mp, a, b) do { } while (0)
63#define xfs_icsb_modify_counters(mp, a, b, c) do { } while (0)
64
65#endif 60#endif
66 61
67static const struct { 62static const struct {
@@ -138,9 +133,7 @@ xfs_uuid_mount(
138 return 0; 133 return 0;
139 134
140 if (uuid_is_nil(uuid)) { 135 if (uuid_is_nil(uuid)) {
141 cmn_err(CE_WARN, 136 xfs_warn(mp, "Filesystem has nil UUID - can't mount");
142 "XFS: Filesystem %s has nil UUID - can't mount",
143 mp->m_fsname);
144 return XFS_ERROR(EINVAL); 137 return XFS_ERROR(EINVAL);
145 } 138 }
146 139
@@ -168,8 +161,7 @@ xfs_uuid_mount(
168 161
169 out_duplicate: 162 out_duplicate:
170 mutex_unlock(&xfs_uuid_table_mutex); 163 mutex_unlock(&xfs_uuid_table_mutex);
171 cmn_err(CE_WARN, "XFS: Filesystem %s has duplicate UUID - can't mount", 164 xfs_warn(mp, "Filesystem has duplicate UUID - can't mount");
172 mp->m_fsname);
173 return XFS_ERROR(EINVAL); 165 return XFS_ERROR(EINVAL);
174} 166}
175 167
@@ -199,6 +191,8 @@ xfs_uuid_unmount(
199 191
200/* 192/*
201 * Reference counting access wrappers to the perag structures. 193 * Reference counting access wrappers to the perag structures.
194 * Because we never free per-ag structures, the only thing we
195 * have to protect against changes is the tree structure itself.
202 */ 196 */
203struct xfs_perag * 197struct xfs_perag *
204xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno) 198xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno)
@@ -206,19 +200,43 @@ xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno)
206 struct xfs_perag *pag; 200 struct xfs_perag *pag;
207 int ref = 0; 201 int ref = 0;
208 202
209 spin_lock(&mp->m_perag_lock); 203 rcu_read_lock();
210 pag = radix_tree_lookup(&mp->m_perag_tree, agno); 204 pag = radix_tree_lookup(&mp->m_perag_tree, agno);
211 if (pag) { 205 if (pag) {
212 ASSERT(atomic_read(&pag->pag_ref) >= 0); 206 ASSERT(atomic_read(&pag->pag_ref) >= 0);
213 /* catch leaks in the positive direction during testing */
214 ASSERT(atomic_read(&pag->pag_ref) < 1000);
215 ref = atomic_inc_return(&pag->pag_ref); 207 ref = atomic_inc_return(&pag->pag_ref);
216 } 208 }
217 spin_unlock(&mp->m_perag_lock); 209 rcu_read_unlock();
218 trace_xfs_perag_get(mp, agno, ref, _RET_IP_); 210 trace_xfs_perag_get(mp, agno, ref, _RET_IP_);
219 return pag; 211 return pag;
220} 212}
221 213
214/*
215 * search from @first to find the next perag with the given tag set.
216 */
217struct xfs_perag *
218xfs_perag_get_tag(
219 struct xfs_mount *mp,
220 xfs_agnumber_t first,
221 int tag)
222{
223 struct xfs_perag *pag;
224 int found;
225 int ref;
226
227 rcu_read_lock();
228 found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
229 (void **)&pag, first, 1, tag);
230 if (found <= 0) {
231 rcu_read_unlock();
232 return NULL;
233 }
234 ref = atomic_inc_return(&pag->pag_ref);
235 rcu_read_unlock();
236 trace_xfs_perag_get_tag(mp, pag->pag_agno, ref, _RET_IP_);
237 return pag;
238}
239
222void 240void
223xfs_perag_put(struct xfs_perag *pag) 241xfs_perag_put(struct xfs_perag *pag)
224{ 242{
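[Editor's note] xfs_perag_get() trades the m_perag_lock spinlock for rcu_read_lock() around the radix-tree lookup; because per-ag structures are only freed through RCU (see the call_rcu() hunk below), a reader that finds one under the read lock can safely pin it with a reference before dropping RCU. A condensed kernel-style sketch of that lookup pattern (structure names abbreviated):

	#include <linux/atomic.h>
	#include <linux/radix-tree.h>
	#include <linux/rcupdate.h>

	struct perag {
		atomic_t	ref;
		/* ... per-AG state ... */
	};

	static struct perag *perag_get(struct radix_tree_root *tree, unsigned long agno)
	{
		struct perag *pag;

		rcu_read_lock();			/* protects the tree structure only */
		pag = radix_tree_lookup(tree, agno);
		if (pag)
			atomic_inc(&pag->ref);		/* pin it before leaving RCU */
		rcu_read_unlock();
		return pag;
	}

The new xfs_perag_get_tag() follows the same pattern, using a tagged gang lookup to find the next AG with work pending.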
@@ -229,10 +247,18 @@ xfs_perag_put(struct xfs_perag *pag)
229 trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_); 247 trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_);
230} 248}
231 249
250STATIC void
251__xfs_free_perag(
252 struct rcu_head *head)
253{
254 struct xfs_perag *pag = container_of(head, struct xfs_perag, rcu_head);
255
256 ASSERT(atomic_read(&pag->pag_ref) == 0);
257 kmem_free(pag);
258}
259
232/* 260/*
233 * Free up the resources associated with a mount structure. Assume that 261 * Free up the per-ag resources associated with the mount structure.
234 * the structure was initially zeroed, so we can tell which fields got
235 * initialized.
236 */ 262 */
237STATIC void 263STATIC void
238xfs_free_perag( 264xfs_free_perag(
@@ -244,10 +270,10 @@ xfs_free_perag(
244 for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { 270 for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
245 spin_lock(&mp->m_perag_lock); 271 spin_lock(&mp->m_perag_lock);
246 pag = radix_tree_delete(&mp->m_perag_tree, agno); 272 pag = radix_tree_delete(&mp->m_perag_tree, agno);
273 spin_unlock(&mp->m_perag_lock);
247 ASSERT(pag); 274 ASSERT(pag);
248 ASSERT(atomic_read(&pag->pag_ref) == 0); 275 ASSERT(atomic_read(&pag->pag_ref) == 0);
249 spin_unlock(&mp->m_perag_lock); 276 call_rcu(&pag->rcu_head, __xfs_free_perag);
250 kmem_free(pag);
251 } 277 }
252} 278}
253 279
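[Editor's note] The flip side of RCU readers: xfs_free_perag() can no longer kmem_free() an entry immediately after radix_tree_delete(), since a lookup may still be traversing it. Instead the free is deferred through call_rcu() and a callback that recovers the containing structure with container_of(). The same shape, sketched outside XFS:

	#include <linux/kernel.h>
	#include <linux/rcupdate.h>
	#include <linux/slab.h>

	struct perag {
		struct rcu_head	rcu_head;
		/* ... payload ... */
	};

	static void perag_free_rcu(struct rcu_head *head)
	{
		/* Runs after a grace period; no reader can still see the object. */
		kfree(container_of(head, struct perag, rcu_head));
	}

	static void perag_free(struct perag *pag)
	{
		/* Caller has already unlinked pag from the radix tree. */
		call_rcu(&pag->rcu_head, perag_free_rcu);
	}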
@@ -282,6 +308,8 @@ xfs_mount_validate_sb(
282 xfs_sb_t *sbp, 308 xfs_sb_t *sbp,
283 int flags) 309 int flags)
284{ 310{
311 int loud = !(flags & XFS_MFSI_QUIET);
312
285 /* 313 /*
286 * If the log device and data device have the 314 * If the log device and data device have the
287 * same device number, the log is internal. 315 * same device number, the log is internal.
@@ -290,28 +318,32 @@ xfs_mount_validate_sb(
290 * a volume filesystem in a non-volume manner. 318 * a volume filesystem in a non-volume manner.
291 */ 319 */
292 if (sbp->sb_magicnum != XFS_SB_MAGIC) { 320 if (sbp->sb_magicnum != XFS_SB_MAGIC) {
293 xfs_fs_mount_cmn_err(flags, "bad magic number"); 321 if (loud)
322 xfs_warn(mp, "bad magic number");
294 return XFS_ERROR(EWRONGFS); 323 return XFS_ERROR(EWRONGFS);
295 } 324 }
296 325
297 if (!xfs_sb_good_version(sbp)) { 326 if (!xfs_sb_good_version(sbp)) {
298 xfs_fs_mount_cmn_err(flags, "bad version"); 327 if (loud)
328 xfs_warn(mp, "bad version");
299 return XFS_ERROR(EWRONGFS); 329 return XFS_ERROR(EWRONGFS);
300 } 330 }
301 331
302 if (unlikely( 332 if (unlikely(
303 sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) { 333 sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) {
304 xfs_fs_mount_cmn_err(flags, 334 if (loud)
305 "filesystem is marked as having an external log; " 335 xfs_warn(mp,
306 "specify logdev on the\nmount command line."); 336 "filesystem is marked as having an external log; "
337 "specify logdev on the mount command line.");
307 return XFS_ERROR(EINVAL); 338 return XFS_ERROR(EINVAL);
308 } 339 }
309 340
310 if (unlikely( 341 if (unlikely(
311 sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) { 342 sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) {
312 xfs_fs_mount_cmn_err(flags, 343 if (loud)
313 "filesystem is marked as having an internal log; " 344 xfs_warn(mp,
314 "do not specify logdev on\nthe mount command line."); 345 "filesystem is marked as having an internal log; "
346 "do not specify logdev on the mount command line.");
315 return XFS_ERROR(EINVAL); 347 return XFS_ERROR(EINVAL);
316 } 348 }
317 349
@@ -340,7 +372,8 @@ xfs_mount_validate_sb(
340 (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) || 372 (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) ||
341 (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) || 373 (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) ||
342 (sbp->sb_imax_pct > 100 /* zero sb_imax_pct is valid */))) { 374 (sbp->sb_imax_pct > 100 /* zero sb_imax_pct is valid */))) {
343 xfs_fs_mount_cmn_err(flags, "SB sanity check 1 failed"); 375 if (loud)
376 xfs_warn(mp, "SB sanity check 1 failed");
344 return XFS_ERROR(EFSCORRUPTED); 377 return XFS_ERROR(EFSCORRUPTED);
345 } 378 }
346 379
@@ -353,7 +386,8 @@ xfs_mount_validate_sb(
353 (xfs_drfsbno_t)sbp->sb_agcount * sbp->sb_agblocks || 386 (xfs_drfsbno_t)sbp->sb_agcount * sbp->sb_agblocks ||
354 sbp->sb_dblocks < (xfs_drfsbno_t)(sbp->sb_agcount - 1) * 387 sbp->sb_dblocks < (xfs_drfsbno_t)(sbp->sb_agcount - 1) *
355 sbp->sb_agblocks + XFS_MIN_AG_BLOCKS)) { 388 sbp->sb_agblocks + XFS_MIN_AG_BLOCKS)) {
356 xfs_fs_mount_cmn_err(flags, "SB sanity check 2 failed"); 389 if (loud)
390 xfs_warn(mp, "SB sanity check 2 failed");
357 return XFS_ERROR(EFSCORRUPTED); 391 return XFS_ERROR(EFSCORRUPTED);
358 } 392 }
359 393
@@ -361,12 +395,12 @@ xfs_mount_validate_sb(
361 * Until this is fixed only page-sized or smaller data blocks work. 395 * Until this is fixed only page-sized or smaller data blocks work.
362 */ 396 */
363 if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) { 397 if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) {
364 xfs_fs_mount_cmn_err(flags, 398 if (loud) {
365 "file system with blocksize %d bytes", 399 xfs_warn(mp,
366 sbp->sb_blocksize); 400 "File system with blocksize %d bytes. "
367 xfs_fs_mount_cmn_err(flags, 401 "Only pagesize (%ld) or less will currently work.",
368 "only pagesize (%ld) or less will currently work.", 402 sbp->sb_blocksize, PAGE_SIZE);
369 PAGE_SIZE); 403 }
370 return XFS_ERROR(ENOSYS); 404 return XFS_ERROR(ENOSYS);
371 } 405 }
372 406
@@ -380,21 +414,23 @@ xfs_mount_validate_sb(
380 case 2048: 414 case 2048:
381 break; 415 break;
382 default: 416 default:
383 xfs_fs_mount_cmn_err(flags, 417 if (loud)
384 "inode size of %d bytes not supported", 418 xfs_warn(mp, "inode size of %d bytes not supported",
385 sbp->sb_inodesize); 419 sbp->sb_inodesize);
386 return XFS_ERROR(ENOSYS); 420 return XFS_ERROR(ENOSYS);
387 } 421 }
388 422
389 if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) || 423 if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) ||
390 xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) { 424 xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) {
391 xfs_fs_mount_cmn_err(flags, 425 if (loud)
392 "file system too large to be mounted on this system."); 426 xfs_warn(mp,
427 "file system too large to be mounted on this system.");
393 return XFS_ERROR(EFBIG); 428 return XFS_ERROR(EFBIG);
394 } 429 }
395 430
396 if (unlikely(sbp->sb_inprogress)) { 431 if (unlikely(sbp->sb_inprogress)) {
397 xfs_fs_mount_cmn_err(flags, "file system busy"); 432 if (loud)
433 xfs_warn(mp, "file system busy");
398 return XFS_ERROR(EFSCORRUPTED); 434 return XFS_ERROR(EFSCORRUPTED);
399 } 435 }
400 436
@@ -402,8 +438,9 @@ xfs_mount_validate_sb(
402 * Version 1 directory format has never worked on Linux. 438 * Version 1 directory format has never worked on Linux.
403 */ 439 */
404 if (unlikely(!xfs_sb_version_hasdirv2(sbp))) { 440 if (unlikely(!xfs_sb_version_hasdirv2(sbp))) {
405 xfs_fs_mount_cmn_err(flags, 441 if (loud)
406 "file system using version 1 directory format"); 442 xfs_warn(mp,
443 "file system using version 1 directory format");
407 return XFS_ERROR(ENOSYS); 444 return XFS_ERROR(ENOSYS);
408 } 445 }
409 446
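[Editor's note] Superblock validation now computes a single `loud = !(flags & XFS_MFSI_QUIET)` up front and gates every warning on it, instead of routing each message through the old xfs_fs_mount_cmn_err() wrapper. That keeps probe-style mounts, where failing to recognise the filesystem is expected, from spamming the log. A small sketch of the idiom (constants are illustrative; 0x58465342 is the ASCII "XFSB" magic shown in the full source):

	#include <stdbool.h>
	#include <stdio.h>

	#define MOUNT_QUIET	0x1	/* stand-in for XFS_MFSI_QUIET */

	static int validate_sb(unsigned int magic, int flags)
	{
		bool loud = !(flags & MOUNT_QUIET);

		if (magic != 0x58465342) {
			if (loud)
				fprintf(stderr, "bad magic number\n");
			return -1;		/* wrong filesystem type */
		}
		return 0;
	}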
@@ -443,8 +480,11 @@ xfs_initialize_perag(
443 goto out_unwind; 480 goto out_unwind;
444 pag->pag_agno = index; 481 pag->pag_agno = index;
445 pag->pag_mount = mp; 482 pag->pag_mount = mp;
446 rwlock_init(&pag->pag_ici_lock); 483 spin_lock_init(&pag->pag_ici_lock);
484 mutex_init(&pag->pag_ici_reclaim_lock);
447 INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); 485 INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
486 spin_lock_init(&pag->pag_buf_lock);
487 pag->pag_buf_tree = RB_ROOT;
448 488
449 if (radix_tree_preload(GFP_NOFS)) 489 if (radix_tree_preload(GFP_NOFS))
450 goto out_unwind; 490 goto out_unwind;
@@ -639,9 +679,9 @@ int
639xfs_readsb(xfs_mount_t *mp, int flags) 679xfs_readsb(xfs_mount_t *mp, int flags)
640{ 680{
641 unsigned int sector_size; 681 unsigned int sector_size;
642 unsigned int extra_flags;
643 xfs_buf_t *bp; 682 xfs_buf_t *bp;
644 int error; 683 int error;
684 int loud = !(flags & XFS_MFSI_QUIET);
645 685
646 ASSERT(mp->m_sb_bp == NULL); 686 ASSERT(mp->m_sb_bp == NULL);
647 ASSERT(mp->m_ddev_targp != NULL); 687 ASSERT(mp->m_ddev_targp != NULL);
@@ -652,39 +692,37 @@ xfs_readsb(xfs_mount_t *mp, int flags)
652 * access to the superblock. 692 * access to the superblock.
653 */ 693 */
654 sector_size = xfs_getsize_buftarg(mp->m_ddev_targp); 694 sector_size = xfs_getsize_buftarg(mp->m_ddev_targp);
655 extra_flags = XBF_LOCK | XBF_FS_MANAGED | XBF_MAPPED;
656 695
657 bp = xfs_buf_read(mp->m_ddev_targp, XFS_SB_DADDR, BTOBB(sector_size), 696reread:
658 extra_flags); 697 bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp,
659 if (!bp || XFS_BUF_ISERROR(bp)) { 698 XFS_SB_DADDR, sector_size, 0);
660 xfs_fs_mount_cmn_err(flags, "SB read failed"); 699 if (!bp) {
661 error = bp ? XFS_BUF_GETERROR(bp) : ENOMEM; 700 if (loud)
662 goto fail; 701 xfs_warn(mp, "SB buffer read failed");
702 return EIO;
663 } 703 }
664 ASSERT(XFS_BUF_ISBUSY(bp));
665 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
666 704
667 /* 705 /*
668 * Initialize the mount structure from the superblock. 706 * Initialize the mount structure from the superblock.
669 * But first do some basic consistency checking. 707 * But first do some basic consistency checking.
670 */ 708 */
671 xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp)); 709 xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp));
672
673 error = xfs_mount_validate_sb(mp, &(mp->m_sb), flags); 710 error = xfs_mount_validate_sb(mp, &(mp->m_sb), flags);
674 if (error) { 711 if (error) {
675 xfs_fs_mount_cmn_err(flags, "SB validate failed"); 712 if (loud)
676 goto fail; 713 xfs_warn(mp, "SB validate failed");
714 goto release_buf;
677 } 715 }
678 716
679 /* 717 /*
680 * We must be able to do sector-sized and sector-aligned IO. 718 * We must be able to do sector-sized and sector-aligned IO.
681 */ 719 */
682 if (sector_size > mp->m_sb.sb_sectsize) { 720 if (sector_size > mp->m_sb.sb_sectsize) {
683 xfs_fs_mount_cmn_err(flags, 721 if (loud)
684 "device supports only %u byte sectors (not %u)", 722 xfs_warn(mp, "device supports %u byte sectors (not %u)",
685 sector_size, mp->m_sb.sb_sectsize); 723 sector_size, mp->m_sb.sb_sectsize);
686 error = ENOSYS; 724 error = ENOSYS;
687 goto fail; 725 goto release_buf;
688 } 726 }
689 727
690 /* 728 /*
@@ -692,33 +730,20 @@ xfs_readsb(xfs_mount_t *mp, int flags)
692 * re-read the superblock so the buffer is correctly sized. 730 * re-read the superblock so the buffer is correctly sized.
693 */ 731 */
694 if (sector_size < mp->m_sb.sb_sectsize) { 732 if (sector_size < mp->m_sb.sb_sectsize) {
695 XFS_BUF_UNMANAGE(bp);
696 xfs_buf_relse(bp); 733 xfs_buf_relse(bp);
697 sector_size = mp->m_sb.sb_sectsize; 734 sector_size = mp->m_sb.sb_sectsize;
698 bp = xfs_buf_read(mp->m_ddev_targp, XFS_SB_DADDR, 735 goto reread;
699 BTOBB(sector_size), extra_flags);
700 if (!bp || XFS_BUF_ISERROR(bp)) {
701 xfs_fs_mount_cmn_err(flags, "SB re-read failed");
702 error = bp ? XFS_BUF_GETERROR(bp) : ENOMEM;
703 goto fail;
704 }
705 ASSERT(XFS_BUF_ISBUSY(bp));
706 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
707 } 736 }
708 737
709 /* Initialize per-cpu counters */ 738 /* Initialize per-cpu counters */
710 xfs_icsb_reinit_counters(mp); 739 xfs_icsb_reinit_counters(mp);
711 740
712 mp->m_sb_bp = bp; 741 mp->m_sb_bp = bp;
713 xfs_buf_relse(bp); 742 xfs_buf_unlock(bp);
714 ASSERT(XFS_BUF_VALUSEMA(bp) > 0);
715 return 0; 743 return 0;
716 744
717 fail: 745release_buf:
718 if (bp) { 746 xfs_buf_relse(bp);
719 XFS_BUF_UNMANAGE(bp);
720 xfs_buf_relse(bp);
721 }
722 return error; 747 return error;
723} 748}
724 749
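[Editor's note] xfs_readsb() is reworked around xfs_buf_read_uncached(): the superblock is first read using the device's hardware sector size, and if the on-disk sb_sectsize turns out to be larger, the buffer is released and control jumps back to the `reread:` label to fetch it again at the correct size, replacing the old duplicated read-and-check block. The control flow reduced to its shape, as a self-contained userspace sketch (all names are placeholders; in XFS the required size comes from the superblock that was just read):

	#include <stdlib.h>

	struct buf { size_t size; char *data; };

	/* Hypothetical stand-in for xfs_buf_read_uncached(): just allocates. */
	static struct buf *read_uncached(size_t size)
	{
		struct buf *bp = malloc(sizeof(*bp));

		if (!bp)
			return NULL;
		bp->data = calloc(1, size);
		if (!bp->data) {
			free(bp);
			return NULL;
		}
		bp->size = size;
		return bp;
	}

	static void buf_release(struct buf *bp)
	{
		free(bp->data);
		free(bp);
	}

	static struct buf *read_superblock(size_t dev_sector_size, size_t sb_sectsize)
	{
		size_t sector_size = dev_sector_size;
		struct buf *bp;

	reread:
		bp = read_uncached(sector_size);
		if (!bp)
			return NULL;

		/* If the sb demands bigger sectors, re-read at the right size. */
		if (sector_size < sb_sectsize) {
			buf_release(bp);
			sector_size = sb_sectsize;
			goto reread;
		}
		return bp;
	}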
@@ -839,8 +864,7 @@ xfs_update_alignment(xfs_mount_t *mp)
839 if ((BBTOB(mp->m_dalign) & mp->m_blockmask) || 864 if ((BBTOB(mp->m_dalign) & mp->m_blockmask) ||
840 (BBTOB(mp->m_swidth) & mp->m_blockmask)) { 865 (BBTOB(mp->m_swidth) & mp->m_blockmask)) {
841 if (mp->m_flags & XFS_MOUNT_RETERR) { 866 if (mp->m_flags & XFS_MOUNT_RETERR) {
842 cmn_err(CE_WARN, 867 xfs_warn(mp, "alignment check 1 failed");
843 "XFS: alignment check 1 failed");
844 return XFS_ERROR(EINVAL); 868 return XFS_ERROR(EINVAL);
845 } 869 }
846 mp->m_dalign = mp->m_swidth = 0; 870 mp->m_dalign = mp->m_swidth = 0;
@@ -853,8 +877,9 @@ xfs_update_alignment(xfs_mount_t *mp)
853 if (mp->m_flags & XFS_MOUNT_RETERR) { 877 if (mp->m_flags & XFS_MOUNT_RETERR) {
854 return XFS_ERROR(EINVAL); 878 return XFS_ERROR(EINVAL);
855 } 879 }
856 xfs_fs_cmn_err(CE_WARN, mp, 880 xfs_warn(mp,
857"stripe alignment turned off: sunit(%d)/swidth(%d) incompatible with agsize(%d)", 881 "stripe alignment turned off: sunit(%d)/swidth(%d) "
882 "incompatible with agsize(%d)",
858 mp->m_dalign, mp->m_swidth, 883 mp->m_dalign, mp->m_swidth,
859 sbp->sb_agblocks); 884 sbp->sb_agblocks);
860 885
@@ -864,9 +889,9 @@ xfs_update_alignment(xfs_mount_t *mp)
864 mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth); 889 mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth);
865 } else { 890 } else {
866 if (mp->m_flags & XFS_MOUNT_RETERR) { 891 if (mp->m_flags & XFS_MOUNT_RETERR) {
867 xfs_fs_cmn_err(CE_WARN, mp, 892 xfs_warn(mp,
868"stripe alignment turned off: sunit(%d) less than bsize(%d)", 893 "stripe alignment turned off: sunit(%d) less than bsize(%d)",
869 mp->m_dalign, 894 mp->m_dalign,
870 mp->m_blockmask +1); 895 mp->m_blockmask +1);
871 return XFS_ERROR(EINVAL); 896 return XFS_ERROR(EINVAL);
872 } 897 }
@@ -961,6 +986,24 @@ xfs_set_rw_sizes(xfs_mount_t *mp)
961} 986}
962 987
963/* 988/*
989 * precalculate the low space thresholds for dynamic speculative preallocation.
990 */
991void
992xfs_set_low_space_thresholds(
993 struct xfs_mount *mp)
994{
995 int i;
996
997 for (i = 0; i < XFS_LOWSP_MAX; i++) {
998 __uint64_t space = mp->m_sb.sb_dblocks;
999
1000 do_div(space, 100);
1001 mp->m_low_space[i] = space * (i + 1);
1002 }
1003}
1004
1005
1006/*
964 * Set whether we're using inode alignment. 1007 * Set whether we're using inode alignment.
965 */ 1008 */
966STATIC void 1009STATIC void
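[Editor's note] The new xfs_set_low_space_thresholds() precalculates five thresholds at 1% through 5% of the filesystem's data blocks; do_div() is used in the kernel because native 64-bit division is not available on all 32-bit architectures. The same arithmetic in plain C, with an example run (the 100 GiB figure is only an illustration):

	#include <stdint.h>
	#include <stdio.h>

	#define LOWSP_MAX 5	/* thresholds at 1%..5%, mirroring XFS_LOWSP_MAX */

	static void set_low_space_thresholds(uint64_t dblocks,
					     uint64_t thresholds[LOWSP_MAX])
	{
		int i;

		for (i = 0; i < LOWSP_MAX; i++) {
			uint64_t space = dblocks / 100;	/* do_div(space, 100) in the kernel */

			thresholds[i] = space * (i + 1);
		}
	}

	int main(void)
	{
		uint64_t t[LOWSP_MAX];
		int i;

		/* e.g. 100 GiB of 4 KiB blocks = 26214400 data blocks */
		set_low_space_thresholds(26214400ULL, t);
		for (i = 0; i < LOWSP_MAX; i++)
			printf("threshold at %d%% = %llu blocks\n", i + 1,
			       (unsigned long long)t[i]);
		return 0;
	}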
@@ -991,42 +1034,35 @@ xfs_check_sizes(xfs_mount_t *mp)
991{ 1034{
992 xfs_buf_t *bp; 1035 xfs_buf_t *bp;
993 xfs_daddr_t d; 1036 xfs_daddr_t d;
994 int error;
995 1037
996 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); 1038 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
997 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) { 1039 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) {
998 cmn_err(CE_WARN, "XFS: size check 1 failed"); 1040 xfs_warn(mp, "filesystem size mismatch detected");
999 return XFS_ERROR(EFBIG); 1041 return XFS_ERROR(EFBIG);
1000 } 1042 }
1001 error = xfs_read_buf(mp, mp->m_ddev_targp, 1043 bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp,
1002 d - XFS_FSS_TO_BB(mp, 1), 1044 d - XFS_FSS_TO_BB(mp, 1),
1003 XFS_FSS_TO_BB(mp, 1), 0, &bp); 1045 BBTOB(XFS_FSS_TO_BB(mp, 1)), 0);
1004 if (!error) { 1046 if (!bp) {
1005 xfs_buf_relse(bp); 1047 xfs_warn(mp, "last sector read failed");
1006 } else { 1048 return EIO;
1007 cmn_err(CE_WARN, "XFS: size check 2 failed");
1008 if (error == ENOSPC)
1009 error = XFS_ERROR(EFBIG);
1010 return error;
1011 } 1049 }
1050 xfs_buf_relse(bp);
1012 1051
1013 if (mp->m_logdev_targp != mp->m_ddev_targp) { 1052 if (mp->m_logdev_targp != mp->m_ddev_targp) {
1014 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks); 1053 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
1015 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) { 1054 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) {
1016 cmn_err(CE_WARN, "XFS: size check 3 failed"); 1055 xfs_warn(mp, "log size mismatch detected");
1017 return XFS_ERROR(EFBIG); 1056 return XFS_ERROR(EFBIG);
1018 } 1057 }
1019 error = xfs_read_buf(mp, mp->m_logdev_targp, 1058 bp = xfs_buf_read_uncached(mp, mp->m_logdev_targp,
1020 d - XFS_FSB_TO_BB(mp, 1), 1059 d - XFS_FSB_TO_BB(mp, 1),
1021 XFS_FSB_TO_BB(mp, 1), 0, &bp); 1060 XFS_FSB_TO_B(mp, 1), 0);
1022 if (!error) { 1061 if (!bp) {
1023 xfs_buf_relse(bp); 1062 xfs_warn(mp, "log device read failed");
1024 } else { 1063 return EIO;
1025 cmn_err(CE_WARN, "XFS: size check 3 failed");
1026 if (error == ENOSPC)
1027 error = XFS_ERROR(EFBIG);
1028 return error;
1029 } 1064 }
1065 xfs_buf_relse(bp);
1030 } 1066 }
1031 return 0; 1067 return 0;
1032} 1068}
@@ -1061,7 +1097,7 @@ xfs_mount_reset_sbqflags(
1061 return 0; 1097 return 0;
1062 1098
1063#ifdef QUOTADEBUG 1099#ifdef QUOTADEBUG
1064 xfs_fs_cmn_err(CE_NOTE, mp, "Writing superblock quota changes"); 1100 xfs_notice(mp, "Writing superblock quota changes");
1065#endif 1101#endif
1066 1102
1067 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE); 1103 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE);
@@ -1069,8 +1105,7 @@ xfs_mount_reset_sbqflags(
1069 XFS_DEFAULT_LOG_COUNT); 1105 XFS_DEFAULT_LOG_COUNT);
1070 if (error) { 1106 if (error) {
1071 xfs_trans_cancel(tp, 0); 1107 xfs_trans_cancel(tp, 0);
1072 xfs_fs_cmn_err(CE_ALERT, mp, 1108 xfs_alert(mp, "%s: Superblock update failed!", __func__);
1073 "xfs_mount_reset_sbqflags: Superblock update failed!");
1074 return error; 1109 return error;
1075 } 1110 }
1076 1111
@@ -1136,8 +1171,7 @@ xfs_mountfs(
1136 * transaction subsystem is online. 1171 * transaction subsystem is online.
1137 */ 1172 */
1138 if (xfs_sb_has_mismatched_features2(sbp)) { 1173 if (xfs_sb_has_mismatched_features2(sbp)) {
1139 cmn_err(CE_WARN, 1174 xfs_warn(mp, "correcting sb_features alignment problem");
1140 "XFS: correcting sb_features alignment problem");
1141 sbp->sb_features2 |= sbp->sb_bad_features2; 1175 sbp->sb_features2 |= sbp->sb_bad_features2;
1142 sbp->sb_bad_features2 = sbp->sb_features2; 1176 sbp->sb_bad_features2 = sbp->sb_features2;
1143 mp->m_update_flags |= XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2; 1177 mp->m_update_flags |= XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2;
@@ -1189,6 +1223,9 @@ xfs_mountfs(
1189 */ 1223 */
1190 xfs_set_rw_sizes(mp); 1224 xfs_set_rw_sizes(mp);
1191 1225
1226 /* set the low space thresholds for dynamic preallocation */
1227 xfs_set_low_space_thresholds(mp);
1228
1192 /* 1229 /*
1193 * Set the inode cluster size. 1230 * Set the inode cluster size.
1194 * This may still be overridden by the file system 1231 * This may still be overridden by the file system
@@ -1213,7 +1250,7 @@ xfs_mountfs(
1213 */ 1250 */
1214 error = xfs_rtmount_init(mp); 1251 error = xfs_rtmount_init(mp);
1215 if (error) { 1252 if (error) {
1216 cmn_err(CE_WARN, "XFS: RT mount failed"); 1253 xfs_warn(mp, "RT mount failed");
1217 goto out_remove_uuid; 1254 goto out_remove_uuid;
1218 } 1255 }
1219 1256
@@ -1244,12 +1281,12 @@ xfs_mountfs(
1244 INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC); 1281 INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC);
1245 error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi); 1282 error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi);
1246 if (error) { 1283 if (error) {
1247 cmn_err(CE_WARN, "XFS: Failed per-ag init: %d", error); 1284 xfs_warn(mp, "Failed per-ag init: %d", error);
1248 goto out_remove_uuid; 1285 goto out_remove_uuid;
1249 } 1286 }
1250 1287
1251 if (!sbp->sb_logblocks) { 1288 if (!sbp->sb_logblocks) {
1252 cmn_err(CE_WARN, "XFS: no log defined"); 1289 xfs_warn(mp, "no log defined");
1253 XFS_ERROR_REPORT("xfs_mountfs", XFS_ERRLEVEL_LOW, mp); 1290 XFS_ERROR_REPORT("xfs_mountfs", XFS_ERRLEVEL_LOW, mp);
1254 error = XFS_ERROR(EFSCORRUPTED); 1291 error = XFS_ERROR(EFSCORRUPTED);
1255 goto out_free_perag; 1292 goto out_free_perag;
@@ -1262,7 +1299,7 @@ xfs_mountfs(
1262 XFS_FSB_TO_DADDR(mp, sbp->sb_logstart), 1299 XFS_FSB_TO_DADDR(mp, sbp->sb_logstart),
1263 XFS_FSB_TO_BB(mp, sbp->sb_logblocks)); 1300 XFS_FSB_TO_BB(mp, sbp->sb_logblocks));
1264 if (error) { 1301 if (error) {
1265 cmn_err(CE_WARN, "XFS: log mount failed"); 1302 xfs_warn(mp, "log mount failed");
1266 goto out_free_perag; 1303 goto out_free_perag;
1267 } 1304 }
1268 1305
@@ -1299,16 +1336,14 @@ xfs_mountfs(
1299 */ 1336 */
1300 error = xfs_iget(mp, NULL, sbp->sb_rootino, 0, XFS_ILOCK_EXCL, &rip); 1337 error = xfs_iget(mp, NULL, sbp->sb_rootino, 0, XFS_ILOCK_EXCL, &rip);
1301 if (error) { 1338 if (error) {
1302 cmn_err(CE_WARN, "XFS: failed to read root inode"); 1339 xfs_warn(mp, "failed to read root inode");
1303 goto out_log_dealloc; 1340 goto out_log_dealloc;
1304 } 1341 }
1305 1342
1306 ASSERT(rip != NULL); 1343 ASSERT(rip != NULL);
1307 1344
1308 if (unlikely((rip->i_d.di_mode & S_IFMT) != S_IFDIR)) { 1345 if (unlikely((rip->i_d.di_mode & S_IFMT) != S_IFDIR)) {
1309 cmn_err(CE_WARN, "XFS: corrupted root inode"); 1346 xfs_warn(mp, "corrupted root inode %llu: not a directory",
1310 cmn_err(CE_WARN, "Device %s - root %llu is not a directory",
1311 XFS_BUFTARG_NAME(mp->m_ddev_targp),
1312 (unsigned long long)rip->i_ino); 1347 (unsigned long long)rip->i_ino);
1313 xfs_iunlock(rip, XFS_ILOCK_EXCL); 1348 xfs_iunlock(rip, XFS_ILOCK_EXCL);
1314 XFS_ERROR_REPORT("xfs_mountfs_int(2)", XFS_ERRLEVEL_LOW, 1349 XFS_ERROR_REPORT("xfs_mountfs_int(2)", XFS_ERRLEVEL_LOW,
@@ -1328,7 +1363,7 @@ xfs_mountfs(
1328 /* 1363 /*
1329 * Free up the root inode. 1364 * Free up the root inode.
1330 */ 1365 */
1331 cmn_err(CE_WARN, "XFS: failed to read RT inodes"); 1366 xfs_warn(mp, "failed to read RT inodes");
1332 goto out_rele_rip; 1367 goto out_rele_rip;
1333 } 1368 }
1334 1369
@@ -1340,7 +1375,7 @@ xfs_mountfs(
1340 if (mp->m_update_flags && !(mp->m_flags & XFS_MOUNT_RDONLY)) { 1375 if (mp->m_update_flags && !(mp->m_flags & XFS_MOUNT_RDONLY)) {
1341 error = xfs_mount_log_sb(mp, mp->m_update_flags); 1376 error = xfs_mount_log_sb(mp, mp->m_update_flags);
1342 if (error) { 1377 if (error) {
1343 cmn_err(CE_WARN, "XFS: failed to write sb changes"); 1378 xfs_warn(mp, "failed to write sb changes");
1344 goto out_rtunmount; 1379 goto out_rtunmount;
1345 } 1380 }
1346 } 1381 }
@@ -1361,10 +1396,7 @@ xfs_mountfs(
1361 * quotachecked license. 1396 * quotachecked license.
1362 */ 1397 */
1363 if (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT) { 1398 if (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT) {
1364 cmn_err(CE_NOTE, 1399 xfs_notice(mp, "resetting quota flags");
1365 "XFS: resetting qflags for filesystem %s",
1366 mp->m_fsname);
1367
1368 error = xfs_mount_reset_sbqflags(mp); 1400 error = xfs_mount_reset_sbqflags(mp);
1369 if (error) 1401 if (error)
1370 return error; 1402 return error;
@@ -1378,7 +1410,7 @@ xfs_mountfs(
1378 */ 1410 */
1379 error = xfs_log_mount_finish(mp); 1411 error = xfs_log_mount_finish(mp);
1380 if (error) { 1412 if (error) {
1381 cmn_err(CE_WARN, "XFS: log mount finish failed"); 1413 xfs_warn(mp, "log mount finish failed");
1382 goto out_rtunmount; 1414 goto out_rtunmount;
1383 } 1415 }
1384 1416
@@ -1407,8 +1439,8 @@ xfs_mountfs(
1407 resblks = xfs_default_resblks(mp); 1439 resblks = xfs_default_resblks(mp);
1408 error = xfs_reserve_blocks(mp, &resblks, NULL); 1440 error = xfs_reserve_blocks(mp, &resblks, NULL);
1409 if (error) 1441 if (error)
1410 cmn_err(CE_WARN, "XFS: Unable to allocate reserve " 1442 xfs_warn(mp,
1411 "blocks. Continuing without a reserve pool."); 1443 "Unable to allocate reserve blocks. Continuing without reserve pool.");
1412 } 1444 }
1413 1445
1414 return 0; 1446 return 0;
@@ -1497,12 +1529,12 @@ xfs_unmountfs(
1497 resblks = 0; 1529 resblks = 0;
1498 error = xfs_reserve_blocks(mp, &resblks, NULL); 1530 error = xfs_reserve_blocks(mp, &resblks, NULL);
1499 if (error) 1531 if (error)
1500 cmn_err(CE_WARN, "XFS: Unable to free reserved block pool. " 1532 xfs_warn(mp, "Unable to free reserved block pool. "
1501 "Freespace may not be correct on next mount."); 1533 "Freespace may not be correct on next mount.");
1502 1534
1503 error = xfs_log_sbcount(mp, 1); 1535 error = xfs_log_sbcount(mp, 1);
1504 if (error) 1536 if (error)
1505 cmn_err(CE_WARN, "XFS: Unable to update superblock counters. " 1537 xfs_warn(mp, "Unable to update superblock counters. "
1506 "Freespace may not be correct on next mount."); 1538 "Freespace may not be correct on next mount.");
1507 xfs_unmountfs_writesb(mp); 1539 xfs_unmountfs_writesb(mp);
1508 xfs_unmountfs_wait(mp); /* wait for async bufs */ 1540 xfs_unmountfs_wait(mp); /* wait for async bufs */
@@ -1601,7 +1633,7 @@ xfs_unmountfs_writesb(xfs_mount_t *mp)
1601 XFS_BUF_UNASYNC(sbp); 1633 XFS_BUF_UNASYNC(sbp);
1602 ASSERT(XFS_BUF_TARGET(sbp) == mp->m_ddev_targp); 1634 ASSERT(XFS_BUF_TARGET(sbp) == mp->m_ddev_targp);
1603 xfsbdstrat(mp, sbp); 1635 xfsbdstrat(mp, sbp);
1604 error = xfs_iowait(sbp); 1636 error = xfs_buf_iowait(sbp);
1605 if (error) 1637 if (error)
1606 xfs_ioerror_alert("xfs_unmountfs_writesb", 1638 xfs_ioerror_alert("xfs_unmountfs_writesb",
1607 mp, sbp, XFS_BUF_ADDR(sbp)); 1639 mp, sbp, XFS_BUF_ADDR(sbp));
@@ -1832,135 +1864,72 @@ xfs_mod_incore_sb_unlocked(
1832 */ 1864 */
1833int 1865int
1834xfs_mod_incore_sb( 1866xfs_mod_incore_sb(
1835 xfs_mount_t *mp, 1867 struct xfs_mount *mp,
1836 xfs_sb_field_t field, 1868 xfs_sb_field_t field,
1837 int64_t delta, 1869 int64_t delta,
1838 int rsvd) 1870 int rsvd)
1839{ 1871{
1840 int status; 1872 int status;
1841 1873
1842 /* check for per-cpu counters */
1843 switch (field) {
1844#ifdef HAVE_PERCPU_SB 1874#ifdef HAVE_PERCPU_SB
1845 case XFS_SBS_ICOUNT: 1875 ASSERT(field < XFS_SBS_ICOUNT || field > XFS_SBS_FDBLOCKS);
1846 case XFS_SBS_IFREE:
1847 case XFS_SBS_FDBLOCKS:
1848 if (!(mp->m_flags & XFS_MOUNT_NO_PERCPU_SB)) {
1849 status = xfs_icsb_modify_counters(mp, field,
1850 delta, rsvd);
1851 break;
1852 }
1853 /* FALLTHROUGH */
1854#endif 1876#endif
1855 default: 1877 spin_lock(&mp->m_sb_lock);
1856 spin_lock(&mp->m_sb_lock); 1878 status = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd);
1857 status = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd); 1879 spin_unlock(&mp->m_sb_lock);
1858 spin_unlock(&mp->m_sb_lock);
1859 break;
1860 }
1861 1880
1862 return status; 1881 return status;
1863} 1882}
1864 1883
1865/* 1884/*
1866 * xfs_mod_incore_sb_batch() is used to change more than one field 1885 * Change more than one field in the in-core superblock structure at a time.
1867 * in the in-core superblock structure at a time. This modification
1868 * is protected by a lock internal to this module. The fields and
1869 * changes to those fields are specified in the array of xfs_mod_sb
1870 * structures passed in.
1871 * 1886 *
1872 * Either all of the specified deltas will be applied or none of 1887 * The fields and changes to those fields are specified in the array of
1873 * them will. If any modified field dips below 0, then all modifications 1888 * xfs_mod_sb structures passed in. Either all of the specified deltas
1874 * will be backed out and EINVAL will be returned. 1889 * will be applied or none of them will. If any modified field dips below 0,
1890 * then all modifications will be backed out and EINVAL will be returned.
1891 *
1892 * Note that this function may not be used for the superblock values that
1893 * are tracked with the in-memory per-cpu counters - a direct call to
1894 * xfs_icsb_modify_counters is required for these.
1875 */ 1895 */
1876int 1896int
1877xfs_mod_incore_sb_batch(xfs_mount_t *mp, xfs_mod_sb_t *msb, uint nmsb, int rsvd) 1897xfs_mod_incore_sb_batch(
1898 struct xfs_mount *mp,
1899 xfs_mod_sb_t *msb,
1900 uint nmsb,
1901 int rsvd)
1878{ 1902{
1879 int status=0; 1903 xfs_mod_sb_t *msbp;
1880 xfs_mod_sb_t *msbp; 1904 int error = 0;
1881 1905
1882 /* 1906 /*
1883 * Loop through the array of mod structures and apply each 1907 * Loop through the array of mod structures and apply each individually.
1884 * individually. If any fail, then back out all those 1908 * If any fail, then back out all those which have already been applied.
1885 * which have already been applied. Do all of this within 1909 * Do all of this within the scope of the m_sb_lock so that all of the
1886 * the scope of the m_sb_lock so that all of the changes will 1910 * changes will be atomic.
1887 * be atomic.
1888 */ 1911 */
1889 spin_lock(&mp->m_sb_lock); 1912 spin_lock(&mp->m_sb_lock);
1890 msbp = &msb[0]; 1913 for (msbp = msb; msbp < (msb + nmsb); msbp++) {
1891 for (msbp = &msbp[0]; msbp < (msb + nmsb); msbp++) { 1914 ASSERT(msbp->msb_field < XFS_SBS_ICOUNT ||
1892 /* 1915 msbp->msb_field > XFS_SBS_FDBLOCKS);
1893 * Apply the delta at index n. If it fails, break
1894 * from the loop so we'll fall into the undo loop
1895 * below.
1896 */
1897 switch (msbp->msb_field) {
1898#ifdef HAVE_PERCPU_SB
1899 case XFS_SBS_ICOUNT:
1900 case XFS_SBS_IFREE:
1901 case XFS_SBS_FDBLOCKS:
1902 if (!(mp->m_flags & XFS_MOUNT_NO_PERCPU_SB)) {
1903 spin_unlock(&mp->m_sb_lock);
1904 status = xfs_icsb_modify_counters(mp,
1905 msbp->msb_field,
1906 msbp->msb_delta, rsvd);
1907 spin_lock(&mp->m_sb_lock);
1908 break;
1909 }
1910 /* FALLTHROUGH */
1911#endif
1912 default:
1913 status = xfs_mod_incore_sb_unlocked(mp,
1914 msbp->msb_field,
1915 msbp->msb_delta, rsvd);
1916 break;
1917 }
1918 1916
1919 if (status != 0) { 1917 error = xfs_mod_incore_sb_unlocked(mp, msbp->msb_field,
1920 break; 1918 msbp->msb_delta, rsvd);
1921 } 1919 if (error)
1920 goto unwind;
1922 } 1921 }
1922 spin_unlock(&mp->m_sb_lock);
1923 return 0;
1923 1924
1924 /* 1925unwind:
1925 * If we didn't complete the loop above, then back out 1926 while (--msbp >= msb) {
1926 * any changes made to the superblock. If you add code 1927 error = xfs_mod_incore_sb_unlocked(mp, msbp->msb_field,
1927 * between the loop above and here, make sure that you 1928 -msbp->msb_delta, rsvd);
1928 * preserve the value of status. Loop back until 1929 ASSERT(error == 0);
1929 * we step below the beginning of the array. Make sure
1930 * we don't touch anything back there.
1931 */
1932 if (status != 0) {
1933 msbp--;
1934 while (msbp >= msb) {
1935 switch (msbp->msb_field) {
1936#ifdef HAVE_PERCPU_SB
1937 case XFS_SBS_ICOUNT:
1938 case XFS_SBS_IFREE:
1939 case XFS_SBS_FDBLOCKS:
1940 if (!(mp->m_flags & XFS_MOUNT_NO_PERCPU_SB)) {
1941 spin_unlock(&mp->m_sb_lock);
1942 status = xfs_icsb_modify_counters(mp,
1943 msbp->msb_field,
1944 -(msbp->msb_delta),
1945 rsvd);
1946 spin_lock(&mp->m_sb_lock);
1947 break;
1948 }
1949 /* FALLTHROUGH */
1950#endif
1951 default:
1952 status = xfs_mod_incore_sb_unlocked(mp,
1953 msbp->msb_field,
1954 -(msbp->msb_delta),
1955 rsvd);
1956 break;
1957 }
1958 ASSERT(status == 0);
1959 msbp--;
1960 }
1961 } 1930 }
1962 spin_unlock(&mp->m_sb_lock); 1931 spin_unlock(&mp->m_sb_lock);
1963 return status; 1932 return error;
1964} 1933}
1965 1934
1966/* 1935/*
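[Editor's note] xfs_mod_incore_sb_batch() collapses into a single apply loop followed by an `unwind:` path: each delta is applied under m_sb_lock, and on the first failure the already-applied entries are walked in reverse with the negated delta, so the whole batch stays all-or-nothing. The structure as a standalone sketch (an index is used instead of the kernel's pointer-walk, but the unwind logic is the same):

	#include <errno.h>
	#include <stddef.h>

	struct mod { int field; long delta; };

	/* Apply one delta; fail if the counter would go negative. */
	static int apply_one(long *counters, const struct mod *m)
	{
		if (counters[m->field] + m->delta < 0)
			return -EINVAL;
		counters[m->field] += m->delta;
		return 0;
	}

	/* Apply all deltas or none: back out on the first failure. */
	static int apply_batch(long *counters, const struct mod *msb, size_t nmsb)
	{
		size_t i;
		int error = 0;

		for (i = 0; i < nmsb; i++) {
			error = apply_one(counters, &msb[i]);
			if (error)
				goto unwind;
		}
		return 0;

	unwind:
		/* Undo entries [0, i) in reverse order. */
		while (i-- > 0) {
			struct mod undo = { msb[i].field, -msb[i].delta };

			apply_one(counters, &undo);	/* restores the prior value */
		}
		return error;
	}

With the per-cpu counter fields excluded from this path (callers must use xfs_icsb_modify_counters() for those, as the new comment in the hunk above states), the batch code no longer needs the nested HAVE_PERCPU_SB switch.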
@@ -1998,18 +1967,13 @@ xfs_getsb(
1998 */ 1967 */
1999void 1968void
2000xfs_freesb( 1969xfs_freesb(
2001 xfs_mount_t *mp) 1970 struct xfs_mount *mp)
2002{ 1971{
2003 xfs_buf_t *bp; 1972 struct xfs_buf *bp = mp->m_sb_bp;
2004 1973
2005 /* 1974 xfs_buf_lock(bp);
2006 * Use xfs_getsb() so that the buffer will be locked
2007 * when we call xfs_buf_relse().
2008 */
2009 bp = xfs_getsb(mp, 0);
2010 XFS_BUF_UNMANAGE(bp);
2011 xfs_buf_relse(bp);
2012 mp->m_sb_bp = NULL; 1975 mp->m_sb_bp = NULL;
1976 xfs_buf_relse(bp);
2013} 1977}
2014 1978
2015/* 1979/*
@@ -2053,10 +2017,8 @@ xfs_dev_is_read_only(
2053 if (xfs_readonly_buftarg(mp->m_ddev_targp) || 2017 if (xfs_readonly_buftarg(mp->m_ddev_targp) ||
2054 xfs_readonly_buftarg(mp->m_logdev_targp) || 2018 xfs_readonly_buftarg(mp->m_logdev_targp) ||
2055 (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) { 2019 (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) {
2056 cmn_err(CE_NOTE, 2020 xfs_notice(mp, "%s required on read-only device.", message);
2057 "XFS: %s required on read-only device.", message); 2021 xfs_notice(mp, "write access unavailable, cannot proceed.");
2058 cmn_err(CE_NOTE,
2059 "XFS: write access unavailable, cannot proceed.");
2060 return EROFS; 2022 return EROFS;
2061 } 2023 }
2062 return 0; 2024 return 0;
@@ -2496,7 +2458,7 @@ xfs_icsb_balance_counter(
2496 spin_unlock(&mp->m_sb_lock); 2458 spin_unlock(&mp->m_sb_lock);
2497} 2459}
2498 2460
2499STATIC int 2461int
2500xfs_icsb_modify_counters( 2462xfs_icsb_modify_counters(
2501 xfs_mount_t *mp, 2463 xfs_mount_t *mp,
2502 xfs_sb_field_t field, 2464 xfs_sb_field_t field,
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 622da2179a57..3d68bb267c5f 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -53,7 +53,6 @@ typedef struct xfs_trans_reservations {
53 53
54#include "xfs_sync.h" 54#include "xfs_sync.h"
55 55
56struct cred;
57struct log; 56struct log;
58struct xfs_mount_args; 57struct xfs_mount_args;
59struct xfs_inode; 58struct xfs_inode;
@@ -91,6 +90,8 @@ extern void xfs_icsb_reinit_counters(struct xfs_mount *);
91extern void xfs_icsb_destroy_counters(struct xfs_mount *); 90extern void xfs_icsb_destroy_counters(struct xfs_mount *);
92extern void xfs_icsb_sync_counters(struct xfs_mount *, int); 91extern void xfs_icsb_sync_counters(struct xfs_mount *, int);
93extern void xfs_icsb_sync_counters_locked(struct xfs_mount *, int); 92extern void xfs_icsb_sync_counters_locked(struct xfs_mount *, int);
93extern int xfs_icsb_modify_counters(struct xfs_mount *, xfs_sb_field_t,
94 int64_t, int);
94 95
95#else 96#else
96#define xfs_icsb_init_counters(mp) (0) 97#define xfs_icsb_init_counters(mp) (0)
@@ -98,8 +99,20 @@ extern void xfs_icsb_sync_counters_locked(struct xfs_mount *, int);
98#define xfs_icsb_reinit_counters(mp) do { } while (0) 99#define xfs_icsb_reinit_counters(mp) do { } while (0)
99#define xfs_icsb_sync_counters(mp, flags) do { } while (0) 100#define xfs_icsb_sync_counters(mp, flags) do { } while (0)
100#define xfs_icsb_sync_counters_locked(mp, flags) do { } while (0) 101#define xfs_icsb_sync_counters_locked(mp, flags) do { } while (0)
102#define xfs_icsb_modify_counters(mp, field, delta, rsvd) \
103 xfs_mod_incore_sb(mp, field, delta, rsvd)
101#endif 104#endif
102 105
106/* dynamic preallocation free space thresholds, 5% down to 1% */
107enum {
108 XFS_LOWSP_1_PCNT = 0,
109 XFS_LOWSP_2_PCNT,
110 XFS_LOWSP_3_PCNT,
111 XFS_LOWSP_4_PCNT,
112 XFS_LOWSP_5_PCNT,
113 XFS_LOWSP_MAX,
114};
115
103typedef struct xfs_mount { 116typedef struct xfs_mount {
104 struct super_block *m_super; 117 struct super_block *m_super;
105 xfs_tid_t m_tid; /* next unused tid for fs */ 118 xfs_tid_t m_tid; /* next unused tid for fs */
@@ -190,15 +203,14 @@ typedef struct xfs_mount {
190 struct mutex m_icsb_mutex; /* balancer sync lock */ 203 struct mutex m_icsb_mutex; /* balancer sync lock */
191#endif 204#endif
192 struct xfs_mru_cache *m_filestream; /* per-mount filestream data */ 205 struct xfs_mru_cache *m_filestream; /* per-mount filestream data */
193 struct task_struct *m_sync_task; /* generalised sync thread */ 206 struct delayed_work m_sync_work; /* background sync work */
194 xfs_sync_work_t m_sync_work; /* work item for VFS_SYNC */ 207 struct delayed_work m_reclaim_work; /* background inode reclaim */
195 struct list_head m_sync_list; /* sync thread work item list */ 208 struct work_struct m_flush_work; /* background inode flush */
196 spinlock_t m_sync_lock; /* work item list lock */
197 int m_sync_seq; /* sync thread generation no. */
198 wait_queue_head_t m_wait_single_sync_task;
199 __int64_t m_update_flags; /* sb flags we need to update 209 __int64_t m_update_flags; /* sb flags we need to update
200 on the next remount,rw */ 210 on the next remount,rw */
201 struct shrinker m_inode_shrink; /* inode reclaim shrinker */ 211 struct shrinker m_inode_shrink; /* inode reclaim shrinker */
212 int64_t m_low_space[XFS_LOWSP_MAX];
213 /* low free space thresholds */
202} xfs_mount_t; 214} xfs_mount_t;
203 215
204/* 216/*
@@ -212,6 +224,7 @@ typedef struct xfs_mount {
212#define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem 224#define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem
213 operations, typically for 225 operations, typically for
214 disk errors in metadata */ 226 disk errors in metadata */
227#define XFS_MOUNT_DISCARD (1ULL << 5) /* discard unused blocks */
215#define XFS_MOUNT_RETERR (1ULL << 6) /* return alignment errors to 228#define XFS_MOUNT_RETERR (1ULL << 6) /* return alignment errors to
216 user */ 229 user */
217#define XFS_MOUNT_NOALIGN (1ULL << 7) /* turn off stripe alignment 230#define XFS_MOUNT_NOALIGN (1ULL << 7) /* turn off stripe alignment
@@ -232,8 +245,6 @@ typedef struct xfs_mount {
232#define XFS_MOUNT_DIRSYNC (1ULL << 21) /* synchronous directory ops */ 245#define XFS_MOUNT_DIRSYNC (1ULL << 21) /* synchronous directory ops */
233#define XFS_MOUNT_COMPAT_IOSIZE (1ULL << 22) /* don't report large preferred 246#define XFS_MOUNT_COMPAT_IOSIZE (1ULL << 22) /* don't report large preferred
234 * I/O size in stat() */ 247 * I/O size in stat() */
235#define XFS_MOUNT_NO_PERCPU_SB (1ULL << 23) /* don't use per-cpu superblock
236 counters */
237#define XFS_MOUNT_FILESTREAMS (1ULL << 24) /* enable the filestreams 248#define XFS_MOUNT_FILESTREAMS (1ULL << 24) /* enable the filestreams
238 allocator */ 249 allocator */
239#define XFS_MOUNT_NOATTR2 (1ULL << 25) /* disable use of attr2 format */ 250#define XFS_MOUNT_NOATTR2 (1ULL << 25) /* disable use of attr2 format */
@@ -327,6 +338,8 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
327 * perag get/put wrappers for ref counting 338 * perag get/put wrappers for ref counting
328 */ 339 */
329struct xfs_perag *xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno); 340struct xfs_perag *xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno);
341struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *mp, xfs_agnumber_t agno,
342 int tag);
330void xfs_perag_put(struct xfs_perag *pag); 343void xfs_perag_put(struct xfs_perag *pag);
331 344
332/* 345/*
@@ -376,6 +389,8 @@ extern int xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t);
376 389
377extern int xfs_dev_is_read_only(struct xfs_mount *, char *); 390extern int xfs_dev_is_read_only(struct xfs_mount *, char *);
378 391
392extern void xfs_set_low_space_thresholds(struct xfs_mount *);
393
379#endif /* __KERNEL__ */ 394#endif /* __KERNEL__ */
380 395
381extern void xfs_mod_sb(struct xfs_trans *, __int64_t); 396extern void xfs_mod_sb(struct xfs_trans *, __int64_t);
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index 45ce15dc5b2b..4aff56395732 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -309,7 +309,7 @@ xfs_mru_cache_init(void)
309 if (!xfs_mru_elem_zone) 309 if (!xfs_mru_elem_zone)
310 goto out; 310 goto out;
311 311
312 xfs_mru_reap_wq = create_singlethread_workqueue("xfs_mru_cache"); 312 xfs_mru_reap_wq = alloc_workqueue("xfs_mru_cache", WQ_MEM_RECLAIM, 1);
313 if (!xfs_mru_reap_wq) 313 if (!xfs_mru_reap_wq)
314 goto out_destroy_mru_elem_zone; 314 goto out_destroy_mru_elem_zone;
315 315
@@ -408,7 +408,7 @@ xfs_mru_cache_flush(
408 spin_lock(&mru->lock); 408 spin_lock(&mru->lock);
409 if (mru->queued) { 409 if (mru->queued) {
410 spin_unlock(&mru->lock); 410 spin_unlock(&mru->lock);
411 cancel_rearming_delayed_workqueue(xfs_mru_reap_wq, &mru->work); 411 cancel_delayed_work_sync(&mru->work);
412 spin_lock(&mru->lock); 412 spin_lock(&mru->lock);
413 } 413 }
414 414
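
The two xfs_mru_cache.c hunks track the workqueue API migration of that era: alloc_workqueue() with WQ_MEM_RECLAIM (so the reaper can make progress under memory pressure) and max_active of 1 (preserving the old single-threaded ordering) replaces create_singlethread_workqueue(), and cancel_delayed_work_sync() replaces the removed cancel_rearming_delayed_workqueue(). A small self-contained usage sketch of that API pairing; the mru_* names are illustrative, not XFS symbols.

	#include <linux/workqueue.h>

	static struct workqueue_struct	*mru_wq;	/* illustrative names */
	static struct delayed_work	mru_work;

	static void mru_reap(struct work_struct *work)
	{
		/* ... reap expired elements ... */
		queue_delayed_work(mru_wq, &mru_work, HZ);	/* re-arm */
	}

	static int mru_start(void)
	{
		mru_wq = alloc_workqueue("mru_reap", WQ_MEM_RECLAIM, 1);
		if (!mru_wq)
			return -ENOMEM;
		INIT_DELAYED_WORK(&mru_work, mru_reap);
		queue_delayed_work(mru_wq, &mru_work, HZ);
		return 0;
	}

	static void mru_stop(void)
	{
		/* waits for the handler and catches its re-arming */
		cancel_delayed_work_sync(&mru_work);
		destroy_workqueue(mru_wq);
	}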
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index e0e64b113bd6..a595f29567fe 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -346,8 +346,17 @@ xfs_qm_vop_dqalloc(struct xfs_inode *ip, uid_t uid, gid_t gid, prid_t prid,
346#define xfs_trans_mod_dquot_byino(tp, ip, fields, delta) 346#define xfs_trans_mod_dquot_byino(tp, ip, fields, delta)
347#define xfs_trans_apply_dquot_deltas(tp) 347#define xfs_trans_apply_dquot_deltas(tp)
348#define xfs_trans_unreserve_and_mod_dquots(tp) 348#define xfs_trans_unreserve_and_mod_dquots(tp)
349#define xfs_trans_reserve_quota_nblks(tp, ip, nblks, ninos, flags) (0) 349static inline int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp,
350#define xfs_trans_reserve_quota_bydquots(tp, mp, u, g, nb, ni, fl) (0) 350 struct xfs_inode *ip, long nblks, long ninos, uint flags)
351{
352 return 0;
353}
354static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp,
355 struct xfs_mount *mp, struct xfs_dquot *udqp,
356 struct xfs_dquot *gdqp, long nblks, long ninos, uint flags)
357{
358 return 0;
359}
351#define xfs_qm_vop_create_dqattach(tp, ip, u, g) 360#define xfs_qm_vop_create_dqattach(tp, ip, u, g)
352#define xfs_qm_vop_rename_dqattach(it) (0) 361#define xfs_qm_vop_rename_dqattach(it) (0)
353#define xfs_qm_vop_chown(tp, ip, old, new) (NULL) 362#define xfs_qm_vop_chown(tp, ip, old, new) (NULL)
@@ -357,11 +366,14 @@ xfs_qm_vop_dqalloc(struct xfs_inode *ip, uid_t uid, gid_t gid, prid_t prid,
357#define xfs_qm_dqdetach(ip) 366#define xfs_qm_dqdetach(ip)
358#define xfs_qm_dqrele(d) 367#define xfs_qm_dqrele(d)
359#define xfs_qm_statvfs(ip, s) 368#define xfs_qm_statvfs(ip, s)
360#define xfs_qm_sync(mp, fl) (0) 369static inline int xfs_qm_sync(struct xfs_mount *mp, int flags)
370{
371 return 0;
372}
361#define xfs_qm_newmount(mp, a, b) (0) 373#define xfs_qm_newmount(mp, a, b) (0)
362#define xfs_qm_mount_quotas(mp) 374#define xfs_qm_mount_quotas(mp)
363#define xfs_qm_unmount(mp) 375#define xfs_qm_unmount(mp)
364#define xfs_qm_unmount_quotas(mp) (0) 376#define xfs_qm_unmount_quotas(mp)
365#endif /* CONFIG_XFS_QUOTA */ 377#endif /* CONFIG_XFS_QUOTA */
366 378
367#define xfs_trans_unreserve_quota_nblks(tp, ip, nblks, ninos, flags) \ 379#define xfs_trans_unreserve_quota_nblks(tp, ip, nblks, ninos, flags) \
@@ -370,7 +382,8 @@ xfs_qm_vop_dqalloc(struct xfs_inode *ip, uid_t uid, gid_t gid, prid_t prid,
370 xfs_trans_reserve_quota_bydquots(tp, mp, ud, gd, nb, ni, \ 382 xfs_trans_reserve_quota_bydquots(tp, mp, ud, gd, nb, ni, \
371 f | XFS_QMOPT_RES_REGBLKS) 383 f | XFS_QMOPT_RES_REGBLKS)
372 384
373extern int xfs_qm_dqcheck(xfs_disk_dquot_t *, xfs_dqid_t, uint, uint, char *); 385extern int xfs_qm_dqcheck(struct xfs_mount *, xfs_disk_dquot_t *,
386 xfs_dqid_t, uint, uint, char *);
374extern int xfs_mount_reset_sbqflags(struct xfs_mount *); 387extern int xfs_mount_reset_sbqflags(struct xfs_mount *);
375 388
376#endif /* __KERNEL__ */ 389#endif /* __KERNEL__ */
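
The xfs_quota.h hunks above convert the CONFIG_XFS_QUOTA=n stubs from bare (0)-valued macros into static inline functions, so the arguments stay type-checked and evaluated even when quota support is compiled out, while the compiler still optimises the calls away. A generic sketch of the pattern, deliberately not tied to the XFS symbols.

	/* Stub pattern sketch: the config-off variant mirrors the real signature. */
	struct foo_ctx;

	#ifdef CONFIG_FEATURE_FOO
	int foo_reserve(struct foo_ctx *ctx, long nblocks, unsigned int flags);
	#else
	static inline int
	foo_reserve(struct foo_ctx *ctx, long nblocks, unsigned int flags)
	{
		return 0;	/* feature compiled out: always succeeds */
	}
	#endif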
diff --git a/fs/xfs/xfs_refcache.h b/fs/xfs/xfs_refcache.h
deleted file mode 100644
index 2dec79edb510..000000000000
--- a/fs/xfs/xfs_refcache.h
+++ /dev/null
@@ -1,52 +0,0 @@
1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_REFCACHE_H__
19#define __XFS_REFCACHE_H__
20
21#ifdef HAVE_REFCACHE
22/*
23 * Maximum size (in inodes) for the NFS reference cache
24 */
25#define XFS_REFCACHE_SIZE_MAX 512
26
27struct xfs_inode;
28struct xfs_mount;
29
30extern void xfs_refcache_insert(struct xfs_inode *);
31extern void xfs_refcache_purge_ip(struct xfs_inode *);
32extern void xfs_refcache_purge_mp(struct xfs_mount *);
33extern void xfs_refcache_purge_some(struct xfs_mount *);
34extern void xfs_refcache_resize(int);
35extern void xfs_refcache_destroy(void);
36
37extern void xfs_refcache_iunlock(struct xfs_inode *, uint);
38
39#else
40
41#define xfs_refcache_insert(ip) do { } while (0)
42#define xfs_refcache_purge_ip(ip) do { } while (0)
43#define xfs_refcache_purge_mp(mp) do { } while (0)
44#define xfs_refcache_purge_some(mp) do { } while (0)
45#define xfs_refcache_resize(size) do { } while (0)
46#define xfs_refcache_destroy() do { } while (0)
47
48#define xfs_refcache_iunlock(ip, flags) xfs_iunlock(ip, flags)
49
50#endif
51
52#endif /* __XFS_REFCACHE_H__ */
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index 8fca957200df..77a59891734e 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -183,7 +183,7 @@ xfs_rename(
183 * tree quota mechanism would be circumvented. 183 * tree quota mechanism would be circumvented.
184 */ 184 */
185 if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && 185 if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
186 (target_dp->i_d.di_projid != src_ip->i_d.di_projid))) { 186 (xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) {
187 error = XFS_ERROR(EXDEV); 187 error = XFS_ERROR(EXDEV);
188 goto error_return; 188 goto error_return;
189 } 189 }
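
xfs_get_projid() replaces the direct di_projid access here because, with the PROJID32BIT feature added in the xfs_sb.h hunk further down, the project id is presumably split across separate high and low on-disk fields and has to be reassembled. A sketch of what such accessors could look like; the di_projid_hi/di_projid_lo field names and the 16-bit split are assumptions for illustration.

	/* Sketch of 32-bit project id accessors (field names assumed). */
	static inline prid_t
	xfs_get_projid(struct xfs_inode *ip)
	{
		return (prid_t)ip->i_d.di_projid_hi << 16 | ip->i_d.di_projid_lo;
	}

	static inline void
	xfs_set_projid(struct xfs_inode *ip, prid_t projid)
	{
		ip->i_d.di_projid_hi = (__uint16_t)(projid >> 16);
		ip->i_d.di_projid_lo = (__uint16_t)(projid & 0xffff);
	}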
@@ -211,7 +211,9 @@ xfs_rename(
211 goto error_return; 211 goto error_return;
212 if (error) 212 if (error)
213 goto abort_return; 213 goto abort_return;
214 xfs_ichgtime(target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 214
215 xfs_trans_ichgtime(tp, target_dp,
216 XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
215 217
216 if (new_parent && src_is_directory) { 218 if (new_parent && src_is_directory) {
217 error = xfs_bumplink(tp, target_dp); 219 error = xfs_bumplink(tp, target_dp);
@@ -249,7 +251,9 @@ xfs_rename(
249 &first_block, &free_list, spaceres); 251 &first_block, &free_list, spaceres);
250 if (error) 252 if (error)
251 goto abort_return; 253 goto abort_return;
252 xfs_ichgtime(target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 254
255 xfs_trans_ichgtime(tp, target_dp,
256 XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
253 257
254 /* 258 /*
255 * Decrement the link count on the target since the target 259 * Decrement the link count on the target since the target
@@ -292,7 +296,8 @@ xfs_rename(
292 * inode isn't really being changed, but old unix file systems did 296 * inode isn't really being changed, but old unix file systems did
293 * it and some incremental backup programs won't work without it. 297 * it and some incremental backup programs won't work without it.
294 */ 298 */
295 xfs_ichgtime(src_ip, XFS_ICHGTIME_CHG); 299 xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG);
300 xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE);
296 301
297 /* 302 /*
298 * Adjust the link count on src_dp. This is necessary when 303 * Adjust the link count on src_dp. This is necessary when
@@ -315,7 +320,7 @@ xfs_rename(
315 if (error) 320 if (error)
316 goto abort_return; 321 goto abort_return;
317 322
318 xfs_ichgtime(src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 323 xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
319 xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE); 324 xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
320 if (new_parent) 325 if (new_parent)
321 xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE); 326 xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 891260fea11e..8f76fdff4f46 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -39,6 +39,7 @@
39#include "xfs_trans_space.h" 39#include "xfs_trans_space.h"
40#include "xfs_utils.h" 40#include "xfs_utils.h"
41#include "xfs_trace.h" 41#include "xfs_trace.h"
42#include "xfs_buf.h"
42 43
43 44
44/* 45/*
@@ -75,7 +76,7 @@ xfs_growfs_rt_alloc(
75 xfs_mount_t *mp, /* file system mount point */ 76 xfs_mount_t *mp, /* file system mount point */
76 xfs_extlen_t oblocks, /* old count of blocks */ 77 xfs_extlen_t oblocks, /* old count of blocks */
77 xfs_extlen_t nblocks, /* new count of blocks */ 78 xfs_extlen_t nblocks, /* new count of blocks */
78 xfs_ino_t ino) /* inode number (bitmap/summary) */ 79 xfs_inode_t *ip) /* inode (bitmap/summary) */
79{ 80{
80 xfs_fileoff_t bno; /* block number in file */ 81 xfs_fileoff_t bno; /* block number in file */
81 xfs_buf_t *bp; /* temporary buffer for zeroing */ 82 xfs_buf_t *bp; /* temporary buffer for zeroing */
@@ -85,7 +86,6 @@ xfs_growfs_rt_alloc(
85 xfs_fsblock_t firstblock; /* first block allocated in xaction */ 86 xfs_fsblock_t firstblock; /* first block allocated in xaction */
86 xfs_bmap_free_t flist; /* list of freed blocks */ 87 xfs_bmap_free_t flist; /* list of freed blocks */
87 xfs_fsblock_t fsbno; /* filesystem block for bno */ 88 xfs_fsblock_t fsbno; /* filesystem block for bno */
88 xfs_inode_t *ip; /* pointer to incore inode */
89 xfs_bmbt_irec_t map; /* block map output */ 89 xfs_bmbt_irec_t map; /* block map output */
90 int nmap; /* number of block maps */ 90 int nmap; /* number of block maps */
91 int resblks; /* space reservation */ 91 int resblks; /* space reservation */
@@ -111,9 +111,9 @@ xfs_growfs_rt_alloc(
111 /* 111 /*
112 * Lock the inode. 112 * Lock the inode.
113 */ 113 */
114 if ((error = xfs_trans_iget(mp, tp, ino, 0, 114 xfs_ilock(ip, XFS_ILOCK_EXCL);
115 XFS_ILOCK_EXCL, &ip))) 115 xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
116 goto error_cancel; 116
117 xfs_bmap_init(&flist, &firstblock); 117 xfs_bmap_init(&flist, &firstblock);
118 /* 118 /*
119 * Allocate blocks to the bitmap file. 119 * Allocate blocks to the bitmap file.
@@ -154,9 +154,8 @@ xfs_growfs_rt_alloc(
154 /* 154 /*
155 * Lock the bitmap inode. 155 * Lock the bitmap inode.
156 */ 156 */
157 if ((error = xfs_trans_iget(mp, tp, ino, 0, 157 xfs_ilock(ip, XFS_ILOCK_EXCL);
158 XFS_ILOCK_EXCL, &ip))) 158 xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
159 goto error_cancel;
160 /* 159 /*
161 * Get a buffer for the block. 160 * Get a buffer for the block.
162 */ 161 */
@@ -1853,7 +1852,6 @@ xfs_growfs_rt(
1853 xfs_rtblock_t bmbno; /* bitmap block number */ 1852 xfs_rtblock_t bmbno; /* bitmap block number */
1854 xfs_buf_t *bp; /* temporary buffer */ 1853 xfs_buf_t *bp; /* temporary buffer */
1855 int error; /* error return value */ 1854 int error; /* error return value */
1856 xfs_inode_t *ip; /* bitmap inode, used as lock */
1857 xfs_mount_t *nmp; /* new (fake) mount structure */ 1855 xfs_mount_t *nmp; /* new (fake) mount structure */
1858 xfs_drfsbno_t nrblocks; /* new number of realtime blocks */ 1856 xfs_drfsbno_t nrblocks; /* new number of realtime blocks */
1859 xfs_extlen_t nrbmblocks; /* new number of rt bitmap blocks */ 1857 xfs_extlen_t nrbmblocks; /* new number of rt bitmap blocks */
@@ -1883,13 +1881,13 @@ xfs_growfs_rt(
1883 /* 1881 /*
1884 * Read in the last block of the device, make sure it exists. 1882 * Read in the last block of the device, make sure it exists.
1885 */ 1883 */
1886 error = xfs_read_buf(mp, mp->m_rtdev_targp, 1884 bp = xfs_buf_read_uncached(mp, mp->m_rtdev_targp,
1887 XFS_FSB_TO_BB(mp, nrblocks - 1), 1885 XFS_FSB_TO_BB(mp, nrblocks - 1),
1888 XFS_FSB_TO_BB(mp, 1), 0, &bp); 1886 XFS_FSB_TO_B(mp, 1), 0);
1889 if (error) 1887 if (!bp)
1890 return error; 1888 return EIO;
1891 ASSERT(bp);
1892 xfs_buf_relse(bp); 1889 xfs_buf_relse(bp);
1890
1893 /* 1891 /*
1894 * Calculate new parameters. These are the final values to be reached. 1892 * Calculate new parameters. These are the final values to be reached.
1895 */ 1893 */
@@ -1917,11 +1915,11 @@ xfs_growfs_rt(
1917 /* 1915 /*
1918 * Allocate space to the bitmap and summary files, as necessary. 1916 * Allocate space to the bitmap and summary files, as necessary.
1919 */ 1917 */
1920 if ((error = xfs_growfs_rt_alloc(mp, rbmblocks, nrbmblocks, 1918 error = xfs_growfs_rt_alloc(mp, rbmblocks, nrbmblocks, mp->m_rbmip);
1921 mp->m_sb.sb_rbmino))) 1919 if (error)
1922 return error; 1920 return error;
1923 if ((error = xfs_growfs_rt_alloc(mp, rsumblocks, nrsumblocks, 1921 error = xfs_growfs_rt_alloc(mp, rsumblocks, nrsumblocks, mp->m_rsumip);
1924 mp->m_sb.sb_rsumino))) 1922 if (error)
1925 return error; 1923 return error;
1926 /* 1924 /*
1927 * Allocate a new (fake) mount/sb. 1925 * Allocate a new (fake) mount/sb.
@@ -1971,10 +1969,8 @@ xfs_growfs_rt(
1971 /* 1969 /*
1972 * Lock out other callers by grabbing the bitmap inode lock. 1970 * Lock out other callers by grabbing the bitmap inode lock.
1973 */ 1971 */
1974 if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0, 1972 xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
1975 XFS_ILOCK_EXCL, &ip))) 1973 xfs_trans_ijoin_ref(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
1976 goto error_cancel;
1977 ASSERT(ip == mp->m_rbmip);
1978 /* 1974 /*
1979 * Update the bitmap inode's size. 1975 * Update the bitmap inode's size.
1980 */ 1976 */
@@ -1985,10 +1981,8 @@ xfs_growfs_rt(
1985 /* 1981 /*
1986 * Get the summary inode into the transaction. 1982 * Get the summary inode into the transaction.
1987 */ 1983 */
1988 if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rsumino, 0, 1984 xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL);
1989 XFS_ILOCK_EXCL, &ip))) 1985 xfs_trans_ijoin_ref(tp, mp->m_rsumip, XFS_ILOCK_EXCL);
1990 goto error_cancel;
1991 ASSERT(ip == mp->m_rsumip);
1992 /* 1986 /*
1993 * Update the summary inode's size. 1987 * Update the summary inode's size.
1994 */ 1988 */
@@ -2074,15 +2068,15 @@ xfs_rtallocate_extent(
2074 xfs_extlen_t prod, /* extent product factor */ 2068 xfs_extlen_t prod, /* extent product factor */
2075 xfs_rtblock_t *rtblock) /* out: start block allocated */ 2069 xfs_rtblock_t *rtblock) /* out: start block allocated */
2076{ 2070{
2071 xfs_mount_t *mp = tp->t_mountp;
2077 int error; /* error value */ 2072 int error; /* error value */
2078 xfs_inode_t *ip; /* inode for bitmap file */
2079 xfs_mount_t *mp; /* file system mount structure */
2080 xfs_rtblock_t r; /* result allocated block */ 2073 xfs_rtblock_t r; /* result allocated block */
2081 xfs_fsblock_t sb; /* summary file block number */ 2074 xfs_fsblock_t sb; /* summary file block number */
2082 xfs_buf_t *sumbp; /* summary file block buffer */ 2075 xfs_buf_t *sumbp; /* summary file block buffer */
2083 2076
2077 ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
2084 ASSERT(minlen > 0 && minlen <= maxlen); 2078 ASSERT(minlen > 0 && minlen <= maxlen);
2085 mp = tp->t_mountp; 2079
2086 /* 2080 /*
2087 * If prod is set then figure out what to do to minlen and maxlen. 2081 * If prod is set then figure out what to do to minlen and maxlen.
2088 */ 2082 */
@@ -2098,12 +2092,7 @@ xfs_rtallocate_extent(
2098 return 0; 2092 return 0;
2099 } 2093 }
2100 } 2094 }
2101 /* 2095
2102 * Lock out other callers by grabbing the bitmap inode lock.
2103 */
2104 if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0,
2105 XFS_ILOCK_EXCL, &ip)))
2106 return error;
2107 sumbp = NULL; 2096 sumbp = NULL;
2108 /* 2097 /*
2109 * Allocate by size, or near another block, or exactly at some block. 2098 * Allocate by size, or near another block, or exactly at some block.
@@ -2122,11 +2111,12 @@ xfs_rtallocate_extent(
2122 len, &sumbp, &sb, prod, &r); 2111 len, &sumbp, &sb, prod, &r);
2123 break; 2112 break;
2124 default: 2113 default:
2114 error = EIO;
2125 ASSERT(0); 2115 ASSERT(0);
2126 } 2116 }
2127 if (error) { 2117 if (error)
2128 return error; 2118 return error;
2129 } 2119
2130 /* 2120 /*
2131 * If it worked, update the superblock. 2121 * If it worked, update the superblock.
2132 */ 2122 */
@@ -2154,7 +2144,6 @@ xfs_rtfree_extent(
2154 xfs_extlen_t len) /* length of extent freed */ 2144 xfs_extlen_t len) /* length of extent freed */
2155{ 2145{
2156 int error; /* error value */ 2146 int error; /* error value */
2157 xfs_inode_t *ip; /* bitmap file inode */
2158 xfs_mount_t *mp; /* file system mount structure */ 2147 xfs_mount_t *mp; /* file system mount structure */
2159 xfs_fsblock_t sb; /* summary file block number */ 2148 xfs_fsblock_t sb; /* summary file block number */
2160 xfs_buf_t *sumbp; /* summary file block buffer */ 2149 xfs_buf_t *sumbp; /* summary file block buffer */
@@ -2163,9 +2152,9 @@ xfs_rtfree_extent(
2163 /* 2152 /*
2164 * Synchronize by locking the bitmap inode. 2153 * Synchronize by locking the bitmap inode.
2165 */ 2154 */
2166 if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0, 2155 xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
2167 XFS_ILOCK_EXCL, &ip))) 2156 xfs_trans_ijoin_ref(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
2168 return error; 2157
2169#if defined(__KERNEL__) && defined(DEBUG) 2158#if defined(__KERNEL__) && defined(DEBUG)
2170 /* 2159 /*
2171 * Check to see that this whole range is currently allocated. 2160 * Check to see that this whole range is currently allocated.
@@ -2198,10 +2187,10 @@ xfs_rtfree_extent(
2198 */ 2187 */
2199 if (tp->t_frextents_delta + mp->m_sb.sb_frextents == 2188 if (tp->t_frextents_delta + mp->m_sb.sb_frextents ==
2200 mp->m_sb.sb_rextents) { 2189 mp->m_sb.sb_rextents) {
2201 if (!(ip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) 2190 if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM))
2202 ip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM; 2191 mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM;
2203 *(__uint64_t *)&ip->i_d.di_atime = 0; 2192 *(__uint64_t *)&mp->m_rbmip->i_d.di_atime = 0;
2204 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 2193 xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
2205 } 2194 }
2206 return 0; 2195 return 0;
2207} 2196}
@@ -2215,15 +2204,14 @@ xfs_rtmount_init(
2215{ 2204{
2216 xfs_buf_t *bp; /* buffer for last block of subvolume */ 2205 xfs_buf_t *bp; /* buffer for last block of subvolume */
2217 xfs_daddr_t d; /* address of last block of subvolume */ 2206 xfs_daddr_t d; /* address of last block of subvolume */
2218 int error; /* error return value */
2219 xfs_sb_t *sbp; /* filesystem superblock copy in mount */ 2207 xfs_sb_t *sbp; /* filesystem superblock copy in mount */
2220 2208
2221 sbp = &mp->m_sb; 2209 sbp = &mp->m_sb;
2222 if (sbp->sb_rblocks == 0) 2210 if (sbp->sb_rblocks == 0)
2223 return 0; 2211 return 0;
2224 if (mp->m_rtdev_targp == NULL) { 2212 if (mp->m_rtdev_targp == NULL) {
2225 cmn_err(CE_WARN, 2213 xfs_warn(mp,
2226 "XFS: This filesystem has a realtime volume, use rtdev=device option"); 2214 "Filesystem has a realtime volume, use rtdev=device option");
2227 return XFS_ERROR(ENODEV); 2215 return XFS_ERROR(ENODEV);
2228 } 2216 }
2229 mp->m_rsumlevels = sbp->sb_rextslog + 1; 2217 mp->m_rsumlevels = sbp->sb_rextslog + 1;
@@ -2237,20 +2225,17 @@ xfs_rtmount_init(
2237 */ 2225 */
2238 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks); 2226 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks);
2239 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_rblocks) { 2227 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_rblocks) {
2240 cmn_err(CE_WARN, "XFS: realtime mount -- %llu != %llu", 2228 xfs_warn(mp, "realtime mount -- %llu != %llu",
2241 (unsigned long long) XFS_BB_TO_FSB(mp, d), 2229 (unsigned long long) XFS_BB_TO_FSB(mp, d),
2242 (unsigned long long) mp->m_sb.sb_rblocks); 2230 (unsigned long long) mp->m_sb.sb_rblocks);
2243 return XFS_ERROR(EFBIG); 2231 return XFS_ERROR(EFBIG);
2244 } 2232 }
2245 error = xfs_read_buf(mp, mp->m_rtdev_targp, 2233 bp = xfs_buf_read_uncached(mp, mp->m_rtdev_targp,
2246 d - XFS_FSB_TO_BB(mp, 1), 2234 d - XFS_FSB_TO_BB(mp, 1),
2247 XFS_FSB_TO_BB(mp, 1), 0, &bp); 2235 XFS_FSB_TO_B(mp, 1), 0);
2248 if (error) { 2236 if (!bp) {
2249 cmn_err(CE_WARN, 2237 xfs_warn(mp, "realtime device size check failed");
2250 "XFS: realtime mount -- xfs_read_buf failed, returned %d", error); 2238 return EIO;
2251 if (error == ENOSPC)
2252 return XFS_ERROR(EFBIG);
2253 return error;
2254 } 2239 }
2255 xfs_buf_relse(bp); 2240 xfs_buf_relse(bp);
2256 return 0; 2241 return 0;
@@ -2309,20 +2294,16 @@ xfs_rtpick_extent(
2309 xfs_rtblock_t *pick) /* result rt extent */ 2294 xfs_rtblock_t *pick) /* result rt extent */
2310{ 2295{
2311 xfs_rtblock_t b; /* result block */ 2296 xfs_rtblock_t b; /* result block */
2312 int error; /* error return value */
2313 xfs_inode_t *ip; /* bitmap incore inode */
2314 int log2; /* log of sequence number */ 2297 int log2; /* log of sequence number */
2315 __uint64_t resid; /* residual after log removed */ 2298 __uint64_t resid; /* residual after log removed */
2316 __uint64_t seq; /* sequence number of file creation */ 2299 __uint64_t seq; /* sequence number of file creation */
2317 __uint64_t *seqp; /* pointer to seqno in inode */ 2300 __uint64_t *seqp; /* pointer to seqno in inode */
2318 2301
2319 if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0, 2302 ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
2320 XFS_ILOCK_EXCL, &ip))) 2303
2321 return error; 2304 seqp = (__uint64_t *)&mp->m_rbmip->i_d.di_atime;
2322 ASSERT(ip == mp->m_rbmip); 2305 if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) {
2323 seqp = (__uint64_t *)&ip->i_d.di_atime; 2306 mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM;
2324 if (!(ip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) {
2325 ip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM;
2326 *seqp = 0; 2307 *seqp = 0;
2327 } 2308 }
2328 seq = *seqp; 2309 seq = *seqp;
@@ -2338,7 +2319,7 @@ xfs_rtpick_extent(
2338 b = mp->m_sb.sb_rextents - len; 2319 b = mp->m_sb.sb_rextents - len;
2339 } 2320 }
2340 *seqp = seq + 1; 2321 *seqp = seq + 1;
2341 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 2322 xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
2342 *pick = b; 2323 *pick = b;
2343 return 0; 2324 return 0;
2344} 2325}
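
With xfs_trans_iget() gone, the xfs_rtalloc.c hunks have callers lock the cached realtime bitmap inode (mp->m_rbmip) directly and join it to the transaction with xfs_trans_ijoin_ref(); the allocator helpers then only ASSERT that the lock is already held. A condensed sketch of that caller-side pattern; the wrapper function below is illustrative, not a real XFS helper.

	/*
	 * Sketch: serialise realtime allocation by locking the cached
	 * bitmap inode and joining it to the transaction up front.
	 */
	STATIC void
	xfs_rtalloc_lock_bitmap(
		struct xfs_mount	*mp,
		struct xfs_trans	*tp)
	{
		xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
		xfs_trans_ijoin_ref(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
		/*
		 * Callees now just check the lock, e.g.
		 * ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
		 */
	}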
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index ff614c29b441..09e1f4f35e97 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -154,7 +154,7 @@ xfs_rtmount_init(
154 if (mp->m_sb.sb_rblocks == 0) 154 if (mp->m_sb.sb_rblocks == 0)
155 return 0; 155 return 0;
156 156
157 cmn_err(CE_WARN, "XFS: Not built with CONFIG_XFS_RT"); 157 xfs_warn(mp, "Not built with CONFIG_XFS_RT");
158 return ENOSYS; 158 return ENOSYS;
159} 159}
160# define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS)) 160# define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS))
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
index 56861d5daaef..d6d6fdfe9422 100644
--- a/fs/xfs/xfs_rw.c
+++ b/fs/xfs/xfs_rw.c
@@ -49,9 +49,9 @@ xfs_do_force_shutdown(
49 logerror = flags & SHUTDOWN_LOG_IO_ERROR; 49 logerror = flags & SHUTDOWN_LOG_IO_ERROR;
50 50
51 if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { 51 if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
52 cmn_err(CE_NOTE, "xfs_force_shutdown(%s,0x%x) called from " 52 xfs_notice(mp,
53 "line %d of file %s. Return address = 0x%p", 53 "%s(0x%x) called from line %d of file %s. Return address = 0x%p",
54 mp->m_fsname, flags, lnnum, fname, __return_address); 54 __func__, flags, lnnum, fname, __return_address);
55 } 55 }
56 /* 56 /*
57 * No need to duplicate efforts. 57 * No need to duplicate efforts.
@@ -69,30 +69,25 @@ xfs_do_force_shutdown(
69 return; 69 return;
70 70
71 if (flags & SHUTDOWN_CORRUPT_INCORE) { 71 if (flags & SHUTDOWN_CORRUPT_INCORE) {
72 xfs_cmn_err(XFS_PTAG_SHUTDOWN_CORRUPT, CE_ALERT, mp, 72 xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_CORRUPT,
73 "Corruption of in-memory data detected. Shutting down filesystem: %s", 73 "Corruption of in-memory data detected. Shutting down filesystem");
74 mp->m_fsname); 74 if (XFS_ERRLEVEL_HIGH <= xfs_error_level)
75 if (XFS_ERRLEVEL_HIGH <= xfs_error_level) {
76 xfs_stack_trace(); 75 xfs_stack_trace();
77 }
78 } else if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { 76 } else if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
79 if (logerror) { 77 if (logerror) {
80 xfs_cmn_err(XFS_PTAG_SHUTDOWN_LOGERROR, CE_ALERT, mp, 78 xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_LOGERROR,
81 "Log I/O Error Detected. Shutting down filesystem: %s", 79 "Log I/O Error Detected. Shutting down filesystem");
82 mp->m_fsname);
83 } else if (flags & SHUTDOWN_DEVICE_REQ) { 80 } else if (flags & SHUTDOWN_DEVICE_REQ) {
84 xfs_cmn_err(XFS_PTAG_SHUTDOWN_IOERROR, CE_ALERT, mp, 81 xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR,
85 "All device paths lost. Shutting down filesystem: %s", 82 "All device paths lost. Shutting down filesystem");
86 mp->m_fsname);
87 } else if (!(flags & SHUTDOWN_REMOTE_REQ)) { 83 } else if (!(flags & SHUTDOWN_REMOTE_REQ)) {
88 xfs_cmn_err(XFS_PTAG_SHUTDOWN_IOERROR, CE_ALERT, mp, 84 xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR,
89 "I/O Error Detected. Shutting down filesystem: %s", 85 "I/O Error Detected. Shutting down filesystem");
90 mp->m_fsname);
91 } 86 }
92 } 87 }
93 if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { 88 if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
94 cmn_err(CE_ALERT, "Please umount the filesystem, " 89 xfs_alert(mp,
95 "and rectify the problem(s)"); 90 "Please umount the filesystem and rectify the problem(s)");
96 } 91 }
97} 92}
98 93
@@ -106,10 +101,9 @@ xfs_ioerror_alert(
106 xfs_buf_t *bp, 101 xfs_buf_t *bp,
107 xfs_daddr_t blkno) 102 xfs_daddr_t blkno)
108{ 103{
109 cmn_err(CE_ALERT, 104 xfs_alert(mp,
110 "I/O error in filesystem (\"%s\") meta-data dev %s block 0x%llx" 105 "I/O error occurred: meta-data dev %s block 0x%llx"
111 " (\"%s\") error %d buf count %zd", 106 " (\"%s\") error %d buf count %zd",
112 (!mp || !mp->m_fsname) ? "(fs name not set)" : mp->m_fsname,
113 XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)), 107 XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)),
114 (__uint64_t)blkno, func, 108 (__uint64_t)blkno, func,
115 XFS_BUF_GETERROR(bp), XFS_BUF_COUNT(bp)); 109 XFS_BUF_GETERROR(bp), XFS_BUF_COUNT(bp));
@@ -173,17 +167,9 @@ xfs_extlen_t
173xfs_get_extsz_hint( 167xfs_get_extsz_hint(
174 struct xfs_inode *ip) 168 struct xfs_inode *ip)
175{ 169{
176 xfs_extlen_t extsz; 170 if ((ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) && ip->i_d.di_extsize)
177 171 return ip->i_d.di_extsize;
178 if (unlikely(XFS_IS_REALTIME_INODE(ip))) { 172 if (XFS_IS_REALTIME_INODE(ip))
179 extsz = (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) 173 return ip->i_mount->m_sb.sb_rextsize;
180 ? ip->i_d.di_extsize 174 return 0;
181 : ip->i_mount->m_sb.sb_rextsize;
182 ASSERT(extsz);
183 } else {
184 extsz = (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE)
185 ? ip->i_d.di_extsize : 0;
186 }
187
188 return extsz;
189} 175}
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h
index 1b017c657494..1eb2ba586814 100644
--- a/fs/xfs/xfs_sb.h
+++ b/fs/xfs/xfs_sb.h
@@ -80,10 +80,12 @@ struct xfs_mount;
80#define XFS_SB_VERSION2_RESERVED4BIT 0x00000004 80#define XFS_SB_VERSION2_RESERVED4BIT 0x00000004
81#define XFS_SB_VERSION2_ATTR2BIT 0x00000008 /* Inline attr rework */ 81#define XFS_SB_VERSION2_ATTR2BIT 0x00000008 /* Inline attr rework */
82#define XFS_SB_VERSION2_PARENTBIT 0x00000010 /* parent pointers */ 82#define XFS_SB_VERSION2_PARENTBIT 0x00000010 /* parent pointers */
83#define XFS_SB_VERSION2_PROJID32BIT 0x00000080 /* 32 bit project id */
83 84
84#define XFS_SB_VERSION2_OKREALFBITS \ 85#define XFS_SB_VERSION2_OKREALFBITS \
85 (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \ 86 (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \
86 XFS_SB_VERSION2_ATTR2BIT) 87 XFS_SB_VERSION2_ATTR2BIT | \
88 XFS_SB_VERSION2_PROJID32BIT)
87#define XFS_SB_VERSION2_OKSASHFBITS \ 89#define XFS_SB_VERSION2_OKSASHFBITS \
88 (0) 90 (0)
89#define XFS_SB_VERSION2_OKREALBITS \ 91#define XFS_SB_VERSION2_OKREALBITS \
@@ -495,6 +497,12 @@ static inline void xfs_sb_version_removeattr2(xfs_sb_t *sbp)
495 sbp->sb_versionnum &= ~XFS_SB_VERSION_MOREBITSBIT; 497 sbp->sb_versionnum &= ~XFS_SB_VERSION_MOREBITSBIT;
496} 498}
497 499
500static inline int xfs_sb_version_hasprojid32bit(xfs_sb_t *sbp)
501{
502 return xfs_sb_version_hasmorebits(sbp) &&
503 (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT);
504}
505
498/* 506/*
499 * end of superblock version macros 507 * end of superblock version macros
500 */ 508 */
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 1c47edaea0d2..c83f63b33aae 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -608,10 +608,8 @@ STATIC void
608xfs_trans_free( 608xfs_trans_free(
609 struct xfs_trans *tp) 609 struct xfs_trans *tp)
610{ 610{
611 struct xfs_busy_extent *busyp, *n; 611 xfs_alloc_busy_sort(&tp->t_busy);
612 612 xfs_alloc_busy_clear(tp->t_mountp, &tp->t_busy, false);
613 list_for_each_entry_safe(busyp, n, &tp->t_busy, list)
614 xfs_alloc_busy_clear(tp->t_mountp, busyp);
615 613
616 atomic_dec(&tp->t_mountp->m_active_trans); 614 atomic_dec(&tp->t_mountp->m_active_trans);
617 xfs_trans_free_dqinfo(tp); 615 xfs_trans_free_dqinfo(tp);
@@ -696,7 +694,7 @@ xfs_trans_reserve(
696 * fail if the count would go below zero. 694 * fail if the count would go below zero.
697 */ 695 */
698 if (blocks > 0) { 696 if (blocks > 0) {
699 error = xfs_mod_incore_sb(tp->t_mountp, XFS_SBS_FDBLOCKS, 697 error = xfs_icsb_modify_counters(tp->t_mountp, XFS_SBS_FDBLOCKS,
700 -((int64_t)blocks), rsvd); 698 -((int64_t)blocks), rsvd);
701 if (error != 0) { 699 if (error != 0) {
702 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); 700 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
@@ -767,7 +765,7 @@ undo_log:
767 765
768undo_blocks: 766undo_blocks:
769 if (blocks > 0) { 767 if (blocks > 0) {
770 (void) xfs_mod_incore_sb(tp->t_mountp, XFS_SBS_FDBLOCKS, 768 xfs_icsb_modify_counters(tp->t_mountp, XFS_SBS_FDBLOCKS,
771 (int64_t)blocks, rsvd); 769 (int64_t)blocks, rsvd);
772 tp->t_blk_res = 0; 770 tp->t_blk_res = 0;
773 } 771 }
@@ -1009,7 +1007,7 @@ void
1009xfs_trans_unreserve_and_mod_sb( 1007xfs_trans_unreserve_and_mod_sb(
1010 xfs_trans_t *tp) 1008 xfs_trans_t *tp)
1011{ 1009{
1012 xfs_mod_sb_t msb[14]; /* If you add cases, add entries */ 1010 xfs_mod_sb_t msb[9]; /* If you add cases, add entries */
1013 xfs_mod_sb_t *msbp; 1011 xfs_mod_sb_t *msbp;
1014 xfs_mount_t *mp = tp->t_mountp; 1012 xfs_mount_t *mp = tp->t_mountp;
1015 /* REFERENCED */ 1013 /* REFERENCED */
@@ -1017,55 +1015,61 @@ xfs_trans_unreserve_and_mod_sb(
1017 int rsvd; 1015 int rsvd;
1018 int64_t blkdelta = 0; 1016 int64_t blkdelta = 0;
1019 int64_t rtxdelta = 0; 1017 int64_t rtxdelta = 0;
1018 int64_t idelta = 0;
1019 int64_t ifreedelta = 0;
1020 1020
1021 msbp = msb; 1021 msbp = msb;
1022 rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; 1022 rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
1023 1023
1024 /* calculate free blocks delta */ 1024 /* calculate deltas */
1025 if (tp->t_blk_res > 0) 1025 if (tp->t_blk_res > 0)
1026 blkdelta = tp->t_blk_res; 1026 blkdelta = tp->t_blk_res;
1027
1028 if ((tp->t_fdblocks_delta != 0) && 1027 if ((tp->t_fdblocks_delta != 0) &&
1029 (xfs_sb_version_haslazysbcount(&mp->m_sb) || 1028 (xfs_sb_version_haslazysbcount(&mp->m_sb) ||
1030 (tp->t_flags & XFS_TRANS_SB_DIRTY))) 1029 (tp->t_flags & XFS_TRANS_SB_DIRTY)))
1031 blkdelta += tp->t_fdblocks_delta; 1030 blkdelta += tp->t_fdblocks_delta;
1032 1031
1033 if (blkdelta != 0) {
1034 msbp->msb_field = XFS_SBS_FDBLOCKS;
1035 msbp->msb_delta = blkdelta;
1036 msbp++;
1037 }
1038
1039 /* calculate free realtime extents delta */
1040 if (tp->t_rtx_res > 0) 1032 if (tp->t_rtx_res > 0)
1041 rtxdelta = tp->t_rtx_res; 1033 rtxdelta = tp->t_rtx_res;
1042
1043 if ((tp->t_frextents_delta != 0) && 1034 if ((tp->t_frextents_delta != 0) &&
1044 (tp->t_flags & XFS_TRANS_SB_DIRTY)) 1035 (tp->t_flags & XFS_TRANS_SB_DIRTY))
1045 rtxdelta += tp->t_frextents_delta; 1036 rtxdelta += tp->t_frextents_delta;
1046 1037
1038 if (xfs_sb_version_haslazysbcount(&mp->m_sb) ||
1039 (tp->t_flags & XFS_TRANS_SB_DIRTY)) {
1040 idelta = tp->t_icount_delta;
1041 ifreedelta = tp->t_ifree_delta;
1042 }
1043
1044 /* apply the per-cpu counters */
1045 if (blkdelta) {
1046 error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
1047 blkdelta, rsvd);
1048 if (error)
1049 goto out;
1050 }
1051
1052 if (idelta) {
1053 error = xfs_icsb_modify_counters(mp, XFS_SBS_ICOUNT,
1054 idelta, rsvd);
1055 if (error)
1056 goto out_undo_fdblocks;
1057 }
1058
1059 if (ifreedelta) {
1060 error = xfs_icsb_modify_counters(mp, XFS_SBS_IFREE,
1061 ifreedelta, rsvd);
1062 if (error)
1063 goto out_undo_icount;
1064 }
1065
1066 /* apply remaining deltas */
1047 if (rtxdelta != 0) { 1067 if (rtxdelta != 0) {
1048 msbp->msb_field = XFS_SBS_FREXTENTS; 1068 msbp->msb_field = XFS_SBS_FREXTENTS;
1049 msbp->msb_delta = rtxdelta; 1069 msbp->msb_delta = rtxdelta;
1050 msbp++; 1070 msbp++;
1051 } 1071 }
1052 1072
1053 /* apply remaining deltas */
1054
1055 if (xfs_sb_version_haslazysbcount(&mp->m_sb) ||
1056 (tp->t_flags & XFS_TRANS_SB_DIRTY)) {
1057 if (tp->t_icount_delta != 0) {
1058 msbp->msb_field = XFS_SBS_ICOUNT;
1059 msbp->msb_delta = tp->t_icount_delta;
1060 msbp++;
1061 }
1062 if (tp->t_ifree_delta != 0) {
1063 msbp->msb_field = XFS_SBS_IFREE;
1064 msbp->msb_delta = tp->t_ifree_delta;
1065 msbp++;
1066 }
1067 }
1068
1069 if (tp->t_flags & XFS_TRANS_SB_DIRTY) { 1073 if (tp->t_flags & XFS_TRANS_SB_DIRTY) {
1070 if (tp->t_dblocks_delta != 0) { 1074 if (tp->t_dblocks_delta != 0) {
1071 msbp->msb_field = XFS_SBS_DBLOCKS; 1075 msbp->msb_field = XFS_SBS_DBLOCKS;
@@ -1115,8 +1119,24 @@ xfs_trans_unreserve_and_mod_sb(
1115 if (msbp > msb) { 1119 if (msbp > msb) {
1116 error = xfs_mod_incore_sb_batch(tp->t_mountp, msb, 1120 error = xfs_mod_incore_sb_batch(tp->t_mountp, msb,
1117 (uint)(msbp - msb), rsvd); 1121 (uint)(msbp - msb), rsvd);
1118 ASSERT(error == 0); 1122 if (error)
1123 goto out_undo_ifreecount;
1119 } 1124 }
1125
1126 return;
1127
1128out_undo_ifreecount:
1129 if (ifreedelta)
1130 xfs_icsb_modify_counters(mp, XFS_SBS_IFREE, -ifreedelta, rsvd);
1131out_undo_icount:
1132 if (idelta)
1133 xfs_icsb_modify_counters(mp, XFS_SBS_ICOUNT, -idelta, rsvd);
1134out_undo_fdblocks:
1135 if (blkdelta)
1136 xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, -blkdelta, rsvd);
1137out:
1138 ASSERT(error == 0);
1139 return;
1120} 1140}
1121 1141
1122/* 1142/*
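
Rather than batching every delta into the msb[] array and asserting success, the reworked xfs_trans_unreserve_and_mod_sb() applies the per-cpu counter deltas one at a time and, if a later step fails, walks back through the out_undo_* labels to reverse exactly what was already applied. The same goto-ladder shape in a generic, self-contained sketch; the counter names and helper are hypothetical.

	#include <errno.h>

	enum { CNT_A, CNT_B, CNT_D, CNT_MAX };

	/* toy counter set with a floor at zero, standing in for the icsb counters */
	static long counters[CNT_MAX];

	static int counter_mod(int idx, long delta)
	{
		if (counters[idx] + delta < 0)
			return -ENOSPC;		/* would underflow: fail */
		counters[idx] += delta;
		return 0;
	}

	static int apply_three_deltas(long a, long b, long d)
	{
		int error;

		error = counter_mod(CNT_A, a);
		if (error)
			goto out;
		error = counter_mod(CNT_B, b);
		if (error)
			goto out_undo_a;
		error = counter_mod(CNT_D, d);
		if (error)
			goto out_undo_b;
		return 0;

	out_undo_b:
		counter_mod(CNT_B, -b);	/* roll back in reverse order */
	out_undo_a:
		counter_mod(CNT_A, -a);
	out:
		return error;
	}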
@@ -1328,7 +1348,7 @@ xfs_trans_fill_vecs(
1328 * they could be immediately flushed and we'd have to race with the flusher 1348 * they could be immediately flushed and we'd have to race with the flusher
1329 * trying to pull the item from the AIL as we add it. 1349 * trying to pull the item from the AIL as we add it.
1330 */ 1350 */
1331void 1351static void
1332xfs_trans_item_committed( 1352xfs_trans_item_committed(
1333 struct xfs_log_item *lip, 1353 struct xfs_log_item *lip,
1334 xfs_lsn_t commit_lsn, 1354 xfs_lsn_t commit_lsn,
@@ -1341,7 +1361,7 @@ xfs_trans_item_committed(
1341 lip->li_flags |= XFS_LI_ABORTED; 1361 lip->li_flags |= XFS_LI_ABORTED;
1342 item_lsn = IOP_COMMITTED(lip, commit_lsn); 1362 item_lsn = IOP_COMMITTED(lip, commit_lsn);
1343 1363
1344 /* If the committed routine returns -1, item has been freed. */ 1364 /* item_lsn of -1 means the item needs no further processing */
1345 if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0) 1365 if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0)
1346 return; 1366 return;
1347 1367
@@ -1389,15 +1409,12 @@ xfs_trans_item_committed(
1389 */ 1409 */
1390STATIC void 1410STATIC void
1391xfs_trans_committed( 1411xfs_trans_committed(
1392 struct xfs_trans *tp, 1412 void *arg,
1393 int abortflag) 1413 int abortflag)
1394{ 1414{
1415 struct xfs_trans *tp = arg;
1395 struct xfs_log_item_desc *lidp, *next; 1416 struct xfs_log_item_desc *lidp, *next;
1396 1417
1397 /* Call the transaction's completion callback if there is one. */
1398 if (tp->t_callback != NULL)
1399 tp->t_callback(tp, tp->t_callarg);
1400
1401 list_for_each_entry_safe(lidp, next, &tp->t_items, lid_trans) { 1418 list_for_each_entry_safe(lidp, next, &tp->t_items, lid_trans) {
1402 xfs_trans_item_committed(lidp->lid_item, tp->t_lsn, abortflag); 1419 xfs_trans_item_committed(lidp->lid_item, tp->t_lsn, abortflag);
1403 xfs_trans_free_item_desc(lidp); 1420 xfs_trans_free_item_desc(lidp);
@@ -1406,21 +1423,120 @@ xfs_trans_committed(
1406 xfs_trans_free(tp); 1423 xfs_trans_free(tp);
1407} 1424}
1408 1425
1426static inline void
1427xfs_log_item_batch_insert(
1428 struct xfs_ail *ailp,
1429 struct xfs_log_item **log_items,
1430 int nr_items,
1431 xfs_lsn_t commit_lsn)
1432{
1433 int i;
1434
1435 spin_lock(&ailp->xa_lock);
1436 /* xfs_trans_ail_update_bulk drops ailp->xa_lock */
1437 xfs_trans_ail_update_bulk(ailp, log_items, nr_items, commit_lsn);
1438
1439 for (i = 0; i < nr_items; i++)
1440 IOP_UNPIN(log_items[i], 0);
1441}
1442
1443/*
1444 * Bulk operation version of xfs_trans_committed that takes a log vector of
1445 * items to insert into the AIL. This uses bulk AIL insertion techniques to
1446 * minimise lock traffic.
1447 *
1448 * If we are called with the aborted flag set, it is because a log write during
1449 * a CIL checkpoint commit has failed. In this case, all the items in the
1450 * checkpoint have already gone through IOP_COMMITTED and IOP_UNLOCK, which
1451 * means that checkpoint commit abort handling is treated exactly the same
1452 * as an iclog write error even though we haven't started any IO yet. Hence in
1453 * this case all we need to do is IOP_COMMITTED processing, followed by an
1454 * IOP_UNPIN(aborted) call.
1455 */
1456void
1457xfs_trans_committed_bulk(
1458 struct xfs_ail *ailp,
1459 struct xfs_log_vec *log_vector,
1460 xfs_lsn_t commit_lsn,
1461 int aborted)
1462{
1463#define LOG_ITEM_BATCH_SIZE 32
1464 struct xfs_log_item *log_items[LOG_ITEM_BATCH_SIZE];
1465 struct xfs_log_vec *lv;
1466 int i = 0;
1467
1468 /* unpin all the log items */
1469 for (lv = log_vector; lv; lv = lv->lv_next ) {
1470 struct xfs_log_item *lip = lv->lv_item;
1471 xfs_lsn_t item_lsn;
1472
1473 if (aborted)
1474 lip->li_flags |= XFS_LI_ABORTED;
1475 item_lsn = IOP_COMMITTED(lip, commit_lsn);
1476
1477 /* item_lsn of -1 means the item needs no further processing */
1478 if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0)
1479 continue;
1480
1481 /*
1482 * if we are aborting the operation, no point in inserting the
1483 * object into the AIL as we are in a shutdown situation.
1484 */
1485 if (aborted) {
1486 ASSERT(XFS_FORCED_SHUTDOWN(ailp->xa_mount));
1487 IOP_UNPIN(lip, 1);
1488 continue;
1489 }
1490
1491 if (item_lsn != commit_lsn) {
1492
1493 /*
1494 * Not a bulk update option due to unusual item_lsn.
1495 * Push into AIL immediately, rechecking the lsn once
1496 * we have the ail lock. Then unpin the item.
1497 */
1498 spin_lock(&ailp->xa_lock);
1499 if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0)
1500 xfs_trans_ail_update(ailp, lip, item_lsn);
1501 else
1502 spin_unlock(&ailp->xa_lock);
1503 IOP_UNPIN(lip, 0);
1504 continue;
1505 }
1506
1507 /* Item is a candidate for bulk AIL insert. */
1508 log_items[i++] = lv->lv_item;
1509 if (i >= LOG_ITEM_BATCH_SIZE) {
1510 xfs_log_item_batch_insert(ailp, log_items,
1511 LOG_ITEM_BATCH_SIZE, commit_lsn);
1512 i = 0;
1513 }
1514 }
1515
1516 /* make sure we insert the remainder! */
1517 if (i)
1518 xfs_log_item_batch_insert(ailp, log_items, i, commit_lsn);
1519}
1520
1409/* 1521/*
1410 * Called from the trans_commit code when we notice that 1522 * Called from the trans_commit code when we notice that the filesystem is in
1411 * the filesystem is in the middle of a forced shutdown. 1523 * the middle of a forced shutdown.
1524 *
1525 * When we are called here, we have already pinned all the items in the
1526 * transaction. However, neither IOP_COMMITTING or IOP_UNLOCK has been called
1527 * so we can simply walk the items in the transaction, unpin them with an abort
1528 * flag and then free the items. Note that unpinning the items can result in
1529 * them being freed immediately, so we need to use a safe list traversal method
1530 * here.
1412 */ 1531 */
1413STATIC void 1532STATIC void
1414xfs_trans_uncommit( 1533xfs_trans_uncommit(
1415 struct xfs_trans *tp, 1534 struct xfs_trans *tp,
1416 uint flags) 1535 uint flags)
1417{ 1536{
1418 struct xfs_log_item_desc *lidp; 1537 struct xfs_log_item_desc *lidp, *n;
1419 1538
1420 list_for_each_entry(lidp, &tp->t_items, lid_trans) { 1539 list_for_each_entry_safe(lidp, n, &tp->t_items, lid_trans) {
1421 /*
1422 * Unpin all but those that aren't dirty.
1423 */
1424 if (lidp->lid_flags & XFS_LID_DIRTY) 1540 if (lidp->lid_flags & XFS_LID_DIRTY)
1425 IOP_UNPIN(lidp->lid_item, 1); 1541 IOP_UNPIN(lidp->lid_item, 1);
1426 } 1542 }
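
xfs_trans_committed_bulk() above buffers up to LOG_ITEM_BATCH_SIZE items and inserts each batch into the AIL with a single lock round-trip via xfs_trans_ail_update_bulk(), pushing items with an unusual item_lsn individually instead. The batching skeleton in a generic, self-contained form; the names are illustrative and the list/lock stand in for the AIL.

	#include <linux/list.h>
	#include <linux/spinlock.h>

	struct item {
		struct list_head	entry;
	};

	#define BATCH_SIZE	32

	/*
	 * Sketch: accumulate items in a small on-stack array and take the
	 * list lock once per batch instead of once per item.
	 */
	static void
	insert_all_batched(struct list_head *dst, spinlock_t *lock,
			   struct item **items, int nr)
	{
		struct item	*batch[BATCH_SIZE];
		int		n = 0, i, j;

		for (i = 0; i < nr; i++) {
			batch[n++] = items[i];
			if (n < BATCH_SIZE && i != nr - 1)
				continue;		/* batch not full yet */
			spin_lock(lock);
			for (j = 0; j < n; j++)
				list_add_tail(&batch[j]->entry, dst);
			spin_unlock(lock);
			n = 0;
		}
	}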
@@ -1525,7 +1641,7 @@ xfs_trans_commit_iclog(
1525 * running in simulation mode (the log is explicitly turned 1641 * running in simulation mode (the log is explicitly turned
1526 * off). 1642 * off).
1527 */ 1643 */
1528 tp->t_logcb.cb_func = (void(*)(void*, int))xfs_trans_committed; 1644 tp->t_logcb.cb_func = xfs_trans_committed;
1529 tp->t_logcb.cb_arg = tp; 1645 tp->t_logcb.cb_arg = tp;
1530 1646
1531 /* 1647 /*
@@ -1637,7 +1753,6 @@ xfs_trans_commit_cil(
1637 int flags) 1753 int flags)
1638{ 1754{
1639 struct xfs_log_vec *log_vector; 1755 struct xfs_log_vec *log_vector;
1640 int error;
1641 1756
1642 /* 1757 /*
1643 * Get each log item to allocate a vector structure for 1758 * Get each log item to allocate a vector structure for
@@ -1648,9 +1763,7 @@ xfs_trans_commit_cil(
1648 if (!log_vector) 1763 if (!log_vector)
1649 return ENOMEM; 1764 return ENOMEM;
1650 1765
1651 error = xfs_log_commit_cil(mp, tp, log_vector, commit_lsn, flags); 1766 xfs_log_commit_cil(mp, tp, log_vector, commit_lsn, flags);
1652 if (error)
1653 return error;
1654 1767
1655 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); 1768 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
1656 xfs_trans_free(tp); 1769 xfs_trans_free(tp);
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index c13c0f97b494..06a9759b6352 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -294,8 +294,8 @@ struct xfs_log_item_desc {
294#define XFS_ALLOC_BTREE_REF 2 294#define XFS_ALLOC_BTREE_REF 2
295#define XFS_BMAP_BTREE_REF 2 295#define XFS_BMAP_BTREE_REF 2
296#define XFS_DIR_BTREE_REF 2 296#define XFS_DIR_BTREE_REF 2
297#define XFS_INO_REF 2
297#define XFS_ATTR_BTREE_REF 1 298#define XFS_ATTR_BTREE_REF 1
298#define XFS_INO_REF 1
299#define XFS_DQUOT_REF 1 299#define XFS_DQUOT_REF 1
300 300
301#ifdef __KERNEL__ 301#ifdef __KERNEL__
@@ -399,8 +399,6 @@ typedef struct xfs_trans {
399 * transaction. */ 399 * transaction. */
400 struct xfs_mount *t_mountp; /* ptr to fs mount struct */ 400 struct xfs_mount *t_mountp; /* ptr to fs mount struct */
401 struct xfs_dquot_acct *t_dqinfo; /* acctg info for dquots */ 401 struct xfs_dquot_acct *t_dqinfo; /* acctg info for dquots */
402 xfs_trans_callback_t t_callback; /* transaction callback */
403 void *t_callarg; /* callback arg */
404 unsigned int t_flags; /* misc flags */ 402 unsigned int t_flags; /* misc flags */
405 int64_t t_icount_delta; /* superblock icount change */ 403 int64_t t_icount_delta; /* superblock icount change */
406 int64_t t_ifree_delta; /* superblock ifree change */ 404 int64_t t_ifree_delta; /* superblock ifree change */
@@ -471,8 +469,7 @@ void xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *);
471void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *); 469void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *);
472void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint); 470void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint);
473void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *); 471void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *);
474int xfs_trans_iget(struct xfs_mount *, xfs_trans_t *, 472void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int);
475 xfs_ino_t , uint, uint, struct xfs_inode **);
476void xfs_trans_ijoin_ref(struct xfs_trans *, struct xfs_inode *, uint); 473void xfs_trans_ijoin_ref(struct xfs_trans *, struct xfs_inode *, uint);
477void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *); 474void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *);
478void xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint); 475void xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint);
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index dc9069568ff7..5fc2380092c8 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -28,74 +28,138 @@
28#include "xfs_trans_priv.h" 28#include "xfs_trans_priv.h"
29#include "xfs_error.h" 29#include "xfs_error.h"
30 30
31STATIC void xfs_ail_insert(struct xfs_ail *, xfs_log_item_t *); 31struct workqueue_struct *xfs_ail_wq; /* AIL workqueue */
32STATIC xfs_log_item_t * xfs_ail_delete(struct xfs_ail *, xfs_log_item_t *);
33STATIC xfs_log_item_t * xfs_ail_min(struct xfs_ail *);
34STATIC xfs_log_item_t * xfs_ail_next(struct xfs_ail *, xfs_log_item_t *);
35 32
36#ifdef DEBUG 33#ifdef DEBUG
37STATIC void xfs_ail_check(struct xfs_ail *, xfs_log_item_t *); 34/*
38#else 35 * Check that the list is sorted as it should be.
36 */
37STATIC void
38xfs_ail_check(
39 struct xfs_ail *ailp,
40 xfs_log_item_t *lip)
41{
42 xfs_log_item_t *prev_lip;
43
44 if (list_empty(&ailp->xa_ail))
45 return;
46
47 /*
48 * Check the next and previous entries are valid.
49 */
50 ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0);
51 prev_lip = list_entry(lip->li_ail.prev, xfs_log_item_t, li_ail);
52 if (&prev_lip->li_ail != &ailp->xa_ail)
53 ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0);
54
55 prev_lip = list_entry(lip->li_ail.next, xfs_log_item_t, li_ail);
56 if (&prev_lip->li_ail != &ailp->xa_ail)
57 ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) >= 0);
58
59
60#ifdef XFS_TRANS_DEBUG
61 /*
62 * Walk the list checking lsn ordering, and that every entry has the
63 * XFS_LI_IN_AIL flag set. This is really expensive, so only do it
64 * when specifically debugging the transaction subsystem.
65 */
66 prev_lip = list_entry(&ailp->xa_ail, xfs_log_item_t, li_ail);
67 list_for_each_entry(lip, &ailp->xa_ail, li_ail) {
68 if (&prev_lip->li_ail != &ailp->xa_ail)
69 ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0);
70 ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0);
71 prev_lip = lip;
72 }
73#endif /* XFS_TRANS_DEBUG */
74}
75#else /* !DEBUG */
39#define xfs_ail_check(a,l) 76#define xfs_ail_check(a,l)
40#endif /* DEBUG */ 77#endif /* DEBUG */
41 78
79/*
80 * Return a pointer to the first item in the AIL. If the AIL is empty, then
81 * return NULL.
82 */
83static xfs_log_item_t *
84xfs_ail_min(
85 struct xfs_ail *ailp)
86{
87 if (list_empty(&ailp->xa_ail))
88 return NULL;
89
90 return list_first_entry(&ailp->xa_ail, xfs_log_item_t, li_ail);
91}
92
93 /*
94 * Return a pointer to the last item in the AIL. If the AIL is empty, then
95 * return NULL.
96 */
97static xfs_log_item_t *
98xfs_ail_max(
99 struct xfs_ail *ailp)
100{
101 if (list_empty(&ailp->xa_ail))
102 return NULL;
103
104 return list_entry(ailp->xa_ail.prev, xfs_log_item_t, li_ail);
105}
106
107/*
108 * Return a pointer to the item which follows the given item in the AIL. If
109 * the given item is the last item in the list, then return NULL.
110 */
111static xfs_log_item_t *
112xfs_ail_next(
113 struct xfs_ail *ailp,
114 xfs_log_item_t *lip)
115{
116 if (lip->li_ail.next == &ailp->xa_ail)
117 return NULL;
118
119 return list_first_entry(&lip->li_ail, xfs_log_item_t, li_ail);
120}
42 121
43/* 122/*
44 * This is called by the log manager code to determine the LSN 123 * This is called by the log manager code to determine the LSN of the tail of
45 * of the tail of the log. This is exactly the LSN of the first 124 * the log. This is exactly the LSN of the first item in the AIL. If the AIL
46 * item in the AIL. If the AIL is empty, then this function 125 * is empty, then this function returns 0.
47 * returns 0.
48 * 126 *
49 * We need the AIL lock in order to get a coherent read of the 127 * We need the AIL lock in order to get a coherent read of the lsn of the last
50 * lsn of the last item in the AIL. 128 * item in the AIL.
51 */ 129 */
52xfs_lsn_t 130xfs_lsn_t
53xfs_trans_ail_tail( 131xfs_ail_min_lsn(
54 struct xfs_ail *ailp) 132 struct xfs_ail *ailp)
55{ 133{
56 xfs_lsn_t lsn; 134 xfs_lsn_t lsn = 0;
57 xfs_log_item_t *lip; 135 xfs_log_item_t *lip;
58 136
59 spin_lock(&ailp->xa_lock); 137 spin_lock(&ailp->xa_lock);
60 lip = xfs_ail_min(ailp); 138 lip = xfs_ail_min(ailp);
61 if (lip == NULL) { 139 if (lip)
62 lsn = (xfs_lsn_t)0;
63 } else {
64 lsn = lip->li_lsn; 140 lsn = lip->li_lsn;
65 }
66 spin_unlock(&ailp->xa_lock); 141 spin_unlock(&ailp->xa_lock);
67 142
68 return lsn; 143 return lsn;
69} 144}
70 145
71/* 146/*
72 * xfs_trans_push_ail 147 * Return the maximum lsn held in the AIL, or zero if the AIL is empty.
73 *
74 * This routine is called to move the tail of the AIL forward. It does this by
75 * trying to flush items in the AIL whose lsns are below the given
76 * threshold_lsn.
77 *
78 * the push is run asynchronously in a separate thread, so we return the tail
79 * of the log right now instead of the tail after the push. This means we will
80 * either continue right away, or we will sleep waiting on the async thread to
81 * do its work.
82 *
83 * We do this unlocked - we only need to know whether there is anything in the
84 * AIL at the time we are called. We don't need to access the contents of
85 * any of the objects, so the lock is not needed.
86 */ 148 */
87void 149static xfs_lsn_t
88xfs_trans_ail_push( 150xfs_ail_max_lsn(
89 struct xfs_ail *ailp, 151 struct xfs_ail *ailp)
90 xfs_lsn_t threshold_lsn)
91{ 152{
92 xfs_log_item_t *lip; 153 xfs_lsn_t lsn = 0;
154 xfs_log_item_t *lip;
93 155
94 lip = xfs_ail_min(ailp); 156 spin_lock(&ailp->xa_lock);
95 if (lip && !XFS_FORCED_SHUTDOWN(ailp->xa_mount)) { 157 lip = xfs_ail_max(ailp);
96 if (XFS_LSN_CMP(threshold_lsn, ailp->xa_target) > 0) 158 if (lip)
97 xfsaild_wakeup(ailp, threshold_lsn); 159 lsn = lip->li_lsn;
98 } 160 spin_unlock(&ailp->xa_lock);
161
162 return lsn;
99} 163}
100 164
101/* 165/*
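
After this hunk the AIL is treated as a plain list_head kept sorted by ascending LSN, with xfs_ail_min()/xfs_ail_max()/xfs_ail_next() reduced to thin list wrappers; the next hunk's xfs_ail_splice() finds its insertion point by walking the list backwards, which is cheap because newly committed items usually carry the largest LSN. A hedged sketch of that reverse-walk insert for a single item (illustrative helper, assuming the surrounding XFS types; it is not the actual splice code).

	/*
	 * Sketch: insert one item keeping the AIL sorted by LSN.  Walk
	 * from the tail, since new LSNs are usually the largest, and
	 * insert after the first entry that is not younger.
	 */
	static void
	ail_insert_sorted(struct list_head *ail, xfs_log_item_t *new)
	{
		xfs_log_item_t	*lip;

		list_for_each_entry_reverse(lip, ail, li_ail) {
			if (XFS_LSN_CMP(lip->li_lsn, new->li_lsn) <= 0) {
				list_add(&new->li_ail, &lip->li_ail);
				return;
			}
		}
		list_add(&new->li_ail, ail);	/* smallest LSN: new list head */
	}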
@@ -236,35 +300,78 @@ out:
 }
 
 /*
- * xfsaild_push does the work of pushing on the AIL. Returning a timeout of
- * zero indicates that the caller should sleep until woken.
+ * splice the log item list into the AIL at the given LSN.
  */
-long
-xfsaild_push(
+static void
+xfs_ail_splice(
 	struct xfs_ail	*ailp,
-	xfs_lsn_t	*last_lsn)
+	struct list_head *list,
+	xfs_lsn_t	lsn)
 {
-	long		tout = 0;
-	xfs_lsn_t	last_pushed_lsn = *last_lsn;
-	xfs_lsn_t	target = ailp->xa_target;
-	xfs_lsn_t	lsn;
-	xfs_log_item_t	*lip;
-	int		flush_log, count, stuck;
-	xfs_mount_t	*mp = ailp->xa_mount;
-	struct xfs_ail_cursor	*cur = &ailp->xa_cursors;
-	int		push_xfsbufd = 0;
+	xfs_log_item_t	*next_lip;
+
+	/* If the list is empty, just insert the item. */
+	if (list_empty(&ailp->xa_ail)) {
+		list_splice(list, &ailp->xa_ail);
+		return;
+	}
+
+	list_for_each_entry_reverse(next_lip, &ailp->xa_ail, li_ail) {
+		if (XFS_LSN_CMP(next_lip->li_lsn, lsn) <= 0)
+			break;
+	}
+
+	ASSERT(&next_lip->li_ail == &ailp->xa_ail ||
+	       XFS_LSN_CMP(next_lip->li_lsn, lsn) <= 0);
+
+	list_splice_init(list, &next_lip->li_ail);
+}
+
+/*
+ * Delete the given item from the AIL.  Return a pointer to the item.
+ */
+static void
+xfs_ail_delete(
+	struct xfs_ail	*ailp,
+	xfs_log_item_t	*lip)
+{
+	xfs_ail_check(ailp, lip);
+	list_del(&lip->li_ail);
+	xfs_trans_ail_cursor_clear(ailp, lip);
+}
+
+/*
+ * xfs_ail_worker does the work of pushing on the AIL. It will requeue itself
+ * to run at a later time if there is more work to do to complete the push.
+ */
+STATIC void
+xfs_ail_worker(
+	struct work_struct *work)
+{
+	struct xfs_ail	*ailp = container_of(to_delayed_work(work),
+					struct xfs_ail, xa_work);
+	xfs_mount_t	*mp = ailp->xa_mount;
+	struct xfs_ail_cursor	*cur = &ailp->xa_cursors;
+	xfs_log_item_t	*lip;
+	xfs_lsn_t	lsn;
+	xfs_lsn_t	target;
+	long		tout = 10;
+	int		flush_log = 0;
+	int		stuck = 0;
+	int		count = 0;
+	int		push_xfsbufd = 0;
 
 	spin_lock(&ailp->xa_lock);
+	target = ailp->xa_target;
 	xfs_trans_ail_cursor_init(ailp, cur);
-	lip = xfs_trans_ail_cursor_first(ailp, cur, *last_lsn);
+	lip = xfs_trans_ail_cursor_first(ailp, cur, ailp->xa_last_pushed_lsn);
 	if (!lip || XFS_FORCED_SHUTDOWN(mp)) {
 		/*
 		 * AIL is empty or our push has reached the end.
 		 */
 		xfs_trans_ail_cursor_done(ailp, cur);
 		spin_unlock(&ailp->xa_lock);
-		*last_lsn = 0;
-		return tout;
+		goto out_done;
 	}
 
 	XFS_STATS_INC(xs_push_ail);
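xfs_ail_splice() above finds the insertion point for a pre-sorted batch by walking the AIL backwards, on the assumption that new items almost always belong at or near the tail. A rough user-space sketch of that reverse-scan insert on a plain doubly linked list; it mirrors the shape of the kernel routine, not the kernel list API, and all names are illustrative:

#include <stdio.h>
#include <stdlib.h>

struct node { long key; struct node *prev, *next; };

/* Sentinel-headed circular list, like the kernel's struct list_head usage. */
static void list_init(struct node *head) { head->prev = head->next = head; }

static void insert_after(struct node *pos, struct node *n)
{
	n->prev = pos;
	n->next = pos->next;
	pos->next->prev = n;
	pos->next = n;
}

/*
 * Splice a new node into a key-ordered list by scanning from the tail: most
 * insertions land at or near the end, so the backwards walk is usually short.
 */
static void splice_sorted(struct node *head, struct node *n)
{
	struct node *pos;

	for (pos = head->prev; pos != head; pos = pos->prev)
		if (pos->key <= n->key)
			break;
	insert_after(pos, n);	/* pos == head means "insert at the front" */
}

int main(void)
{
	struct node head, a = {10}, b = {20}, c = {15}, *p;

	list_init(&head);
	splice_sorted(&head, &a);
	splice_sorted(&head, &b);
	splice_sorted(&head, &c);	/* lands between 10 and 20 */

	for (p = head.next; p != &head; p = p->next)
		printf("%ld ", p->key);
	printf("\n");
	return 0;
}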
@@ -281,8 +388,7 @@ xfsaild_push(
281 * lots of contention on the AIL lists. 388 * lots of contention on the AIL lists.
282 */ 389 */
283 lsn = lip->li_lsn; 390 lsn = lip->li_lsn;
284 flush_log = stuck = count = 0; 391 while ((XFS_LSN_CMP(lip->li_lsn, target) <= 0)) {
285 while ((XFS_LSN_CMP(lip->li_lsn, target) < 0)) {
286 int lock_result; 392 int lock_result;
287 /* 393 /*
288 * If we can lock the item without sleeping, unlock the AIL 394 * If we can lock the item without sleeping, unlock the AIL
@@ -301,13 +407,13 @@ xfsaild_push(
301 case XFS_ITEM_SUCCESS: 407 case XFS_ITEM_SUCCESS:
302 XFS_STATS_INC(xs_push_ail_success); 408 XFS_STATS_INC(xs_push_ail_success);
303 IOP_PUSH(lip); 409 IOP_PUSH(lip);
304 last_pushed_lsn = lsn; 410 ailp->xa_last_pushed_lsn = lsn;
305 break; 411 break;
306 412
307 case XFS_ITEM_PUSHBUF: 413 case XFS_ITEM_PUSHBUF:
308 XFS_STATS_INC(xs_push_ail_pushbuf); 414 XFS_STATS_INC(xs_push_ail_pushbuf);
309 IOP_PUSHBUF(lip); 415 IOP_PUSHBUF(lip);
310 last_pushed_lsn = lsn; 416 ailp->xa_last_pushed_lsn = lsn;
311 push_xfsbufd = 1; 417 push_xfsbufd = 1;
312 break; 418 break;
313 419
@@ -319,7 +425,7 @@ xfsaild_push(
319 425
320 case XFS_ITEM_LOCKED: 426 case XFS_ITEM_LOCKED:
321 XFS_STATS_INC(xs_push_ail_locked); 427 XFS_STATS_INC(xs_push_ail_locked);
322 last_pushed_lsn = lsn; 428 ailp->xa_last_pushed_lsn = lsn;
323 stuck++; 429 stuck++;
324 break; 430 break;
325 431
@@ -374,9 +480,27 @@ xfsaild_push(
374 wake_up_process(mp->m_ddev_targp->bt_task); 480 wake_up_process(mp->m_ddev_targp->bt_task);
375 } 481 }
376 482
483 /* assume we have more work to do in a short while */
484out_done:
377 if (!count) { 485 if (!count) {
378 /* We're past our target or empty, so idle */ 486 /* We're past our target or empty, so idle */
379 last_pushed_lsn = 0; 487 ailp->xa_last_pushed_lsn = 0;
488
489 /*
490 * We clear the XFS_AIL_PUSHING_BIT first before checking
491 * whether the target has changed. If the target has changed,
492 * this pushes the requeue race directly onto the result of the
493 * atomic test/set bit, so we are guaranteed that either the
494 * the pusher that changed the target or ourselves will requeue
495 * the work (but not both).
496 */
497 clear_bit(XFS_AIL_PUSHING_BIT, &ailp->xa_flags);
498 smp_rmb();
499 if (XFS_LSN_CMP(ailp->xa_target, target) == 0 ||
500 test_and_set_bit(XFS_AIL_PUSHING_BIT, &ailp->xa_flags))
501 return;
502
503 tout = 50;
380 } else if (XFS_LSN_CMP(lsn, target) >= 0) { 504 } else if (XFS_LSN_CMP(lsn, target) >= 0) {
381 /* 505 /*
382 * We reached the target so wait a bit longer for I/O to 506 * We reached the target so wait a bit longer for I/O to
@@ -384,7 +508,7 @@ xfsaild_push(
384 * start the next scan from the start of the AIL. 508 * start the next scan from the start of the AIL.
385 */ 509 */
386 tout = 50; 510 tout = 50;
387 last_pushed_lsn = 0; 511 ailp->xa_last_pushed_lsn = 0;
388 } else if ((stuck * 100) / count > 90) { 512 } else if ((stuck * 100) / count > 90) {
389 /* 513 /*
390 * Either there is a lot of contention on the AIL or we 514 * Either there is a lot of contention on the AIL or we
@@ -396,14 +520,61 @@ xfsaild_push(
396 * continuing from where we were. 520 * continuing from where we were.
397 */ 521 */
398 tout = 20; 522 tout = 20;
399 } else {
400 /* more to do, but wait a short while before continuing */
401 tout = 10;
402 } 523 }
403 *last_lsn = last_pushed_lsn; 524
404 return tout; 525 /* There is more to do, requeue us. */
526 queue_delayed_work(xfs_syncd_wq, &ailp->xa_work,
527 msecs_to_jiffies(tout));
405} 528}
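The out_done path above, together with xfs_ail_push() further down, closes the requeue race the comment describes: the worker drops the pushing bit, re-reads the target, and requeues itself only if it wins the bit back, while a pusher that raises the target queues work only if it wins the test-and-set. A compact user-space sketch of that handshake with C11 atomics; sequentially consistent atomics stand in for the explicit smp_rmb()/smp_wmb(), and every name below is made up for illustration:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool pushing;	/* stands in for XFS_AIL_PUSHING_BIT */
static atomic_long target;	/* stands in for ailp->xa_target */

static void queue_worker(void) { puts("worker (re)queued"); }

/* Producer side: raise the target, queue the worker only if it isn't running. */
static void push(long new_target)
{
	if (new_target <= atomic_load(&target))
		return;
	atomic_store(&target, new_target);
	if (!atomic_exchange(&pushing, true))
		queue_worker();
}

/* Worker side, once it has caught up with the target it started with. */
static void worker_done(long seen_target)
{
	/*
	 * Drop the pushing bit first, then re-check the target.  If a pusher
	 * raced in and raised the target, exactly one side re-acquires the
	 * bit and requeues the work: the pusher via push(), or us here.
	 */
	atomic_store(&pushing, false);
	if (atomic_load(&target) == seen_target)
		return;			/* nothing new to do */
	if (!atomic_exchange(&pushing, true))
		queue_worker();		/* target moved and we won the race */
}

int main(void)
{
	push(100);		/* queues the worker */
	push(200);		/* worker still marked pushing: no requeue */
	worker_done(100);	/* sees target 200, requeues itself */
	worker_done(200);	/* caught up, stops */
	return 0;
}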
406 529
530/*
531 * This routine is called to move the tail of the AIL forward. It does this by
532 * trying to flush items in the AIL whose lsns are below the given
533 * threshold_lsn.
534 *
535 * The push is run asynchronously in a workqueue, which means the caller needs
536 * to handle waiting on the async flush for space to become available.
537 * We don't want to interrupt any push that is in progress, hence we only queue
538 * work if we set the pushing bit approriately.
539 *
540 * We do this unlocked - we only need to know whether there is anything in the
541 * AIL at the time we are called. We don't need to access the contents of
542 * any of the objects, so the lock is not needed.
543 */
544void
545xfs_ail_push(
546 struct xfs_ail *ailp,
547 xfs_lsn_t threshold_lsn)
548{
549 xfs_log_item_t *lip;
550
551 lip = xfs_ail_min(ailp);
552 if (!lip || XFS_FORCED_SHUTDOWN(ailp->xa_mount) ||
553 XFS_LSN_CMP(threshold_lsn, ailp->xa_target) <= 0)
554 return;
555
556 /*
557 * Ensure that the new target is noticed in push code before it clears
558 * the XFS_AIL_PUSHING_BIT.
559 */
560 smp_wmb();
561 xfs_trans_ail_copy_lsn(ailp, &ailp->xa_target, &threshold_lsn);
562 if (!test_and_set_bit(XFS_AIL_PUSHING_BIT, &ailp->xa_flags))
563 queue_delayed_work(xfs_syncd_wq, &ailp->xa_work, 0);
564}
565
566/*
567 * Push out all items in the AIL immediately
568 */
569void
570xfs_ail_push_all(
571 struct xfs_ail *ailp)
572{
573 xfs_lsn_t threshold_lsn = xfs_ail_max_lsn(ailp);
574
575 if (threshold_lsn)
576 xfs_ail_push(ailp, threshold_lsn);
577}
407 578
408/* 579/*
409 * This is to be called when an item is unlocked that may have 580 * This is to be called when an item is unlocked that may have
@@ -449,129 +620,152 @@ xfs_trans_unlocked_item(
449 xfs_log_move_tail(ailp->xa_mount, 1); 620 xfs_log_move_tail(ailp->xa_mount, 1);
450} /* xfs_trans_unlocked_item */ 621} /* xfs_trans_unlocked_item */
451 622
452
453/* 623/*
454 * Update the position of the item in the AIL with the new 624 * xfs_trans_ail_update - bulk AIL insertion operation.
455 * lsn. If it is not yet in the AIL, add it. Otherwise, move 625 *
456 * it to its new position by removing it and re-adding it. 626 * @xfs_trans_ail_update takes an array of log items that all need to be
627 * positioned at the same LSN in the AIL. If an item is not in the AIL, it will
628 * be added. Otherwise, it will be repositioned by removing it and re-adding
629 * it to the AIL. If we move the first item in the AIL, update the log tail to
630 * match the new minimum LSN in the AIL.
631 *
632 * This function takes the AIL lock once to execute the update operations on
633 * all the items in the array, and as such should not be called with the AIL
634 * lock held. As a result, once we have the AIL lock, we need to check each log
635 * item LSN to confirm it needs to be moved forward in the AIL.
457 * 636 *
458 * Wakeup anyone with an lsn less than the item's lsn. If the item 637 * To optimise the insert operation, we delete all the items from the AIL in
459 * we move in the AIL is the minimum one, update the tail lsn in the 638 * the first pass, moving them into a temporary list, then splice the temporary
460 * log manager. 639 * list into the correct position in the AIL. This avoids needing to do an
640 * insert operation on every item.
461 * 641 *
462 * This function must be called with the AIL lock held. The lock 642 * This function must be called with the AIL lock held. The lock is dropped
463 * is dropped before returning. 643 * before returning.
464 */ 644 */
465void 645void
466xfs_trans_ail_update( 646xfs_trans_ail_update_bulk(
467 struct xfs_ail *ailp, 647 struct xfs_ail *ailp,
468 xfs_log_item_t *lip, 648 struct xfs_log_item **log_items,
469 xfs_lsn_t lsn) __releases(ailp->xa_lock) 649 int nr_items,
650 xfs_lsn_t lsn) __releases(ailp->xa_lock)
470{ 651{
471 xfs_log_item_t *dlip = NULL; 652 xfs_log_item_t *mlip;
472 xfs_log_item_t *mlip; /* ptr to minimum lip */
473 xfs_lsn_t tail_lsn; 653 xfs_lsn_t tail_lsn;
654 int mlip_changed = 0;
655 int i;
656 LIST_HEAD(tmp);
474 657
475 mlip = xfs_ail_min(ailp); 658 mlip = xfs_ail_min(ailp);
476 659
477 if (lip->li_flags & XFS_LI_IN_AIL) { 660 for (i = 0; i < nr_items; i++) {
478 dlip = xfs_ail_delete(ailp, lip); 661 struct xfs_log_item *lip = log_items[i];
479 ASSERT(dlip == lip); 662 if (lip->li_flags & XFS_LI_IN_AIL) {
480 xfs_trans_ail_cursor_clear(ailp, dlip); 663 /* check if we really need to move the item */
481 } else { 664 if (XFS_LSN_CMP(lsn, lip->li_lsn) <= 0)
482 lip->li_flags |= XFS_LI_IN_AIL; 665 continue;
666
667 xfs_ail_delete(ailp, lip);
668 if (mlip == lip)
669 mlip_changed = 1;
670 } else {
671 lip->li_flags |= XFS_LI_IN_AIL;
672 }
673 lip->li_lsn = lsn;
674 list_add(&lip->li_ail, &tmp);
483 } 675 }
484 676
485 lip->li_lsn = lsn; 677 xfs_ail_splice(ailp, &tmp, lsn);
486 xfs_ail_insert(ailp, lip);
487 678
488 if (mlip == dlip) { 679 if (!mlip_changed) {
489 mlip = xfs_ail_min(ailp);
490 /*
491 * It is not safe to access mlip after the AIL lock is
492 * dropped, so we must get a copy of li_lsn before we do
493 * so. This is especially important on 32-bit platforms
494 * where accessing and updating 64-bit values like li_lsn
495 * is not atomic.
496 */
497 tail_lsn = mlip->li_lsn;
498 spin_unlock(&ailp->xa_lock);
499 xfs_log_move_tail(ailp->xa_mount, tail_lsn);
500 } else {
501 spin_unlock(&ailp->xa_lock); 680 spin_unlock(&ailp->xa_lock);
681 return;
502 } 682 }
503 683
504 684 /*
505} /* xfs_trans_update_ail */ 685 * It is not safe to access mlip after the AIL lock is dropped, so we
686 * must get a copy of li_lsn before we do so. This is especially
687 * important on 32-bit platforms where accessing and updating 64-bit
688 * values like li_lsn is not atomic.
689 */
690 mlip = xfs_ail_min(ailp);
691 tail_lsn = mlip->li_lsn;
692 spin_unlock(&ailp->xa_lock);
693 xfs_log_move_tail(ailp->xa_mount, tail_lsn);
694}
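xfs_trans_ail_update_bulk() above takes the AIL lock once for the whole batch, notes whether the minimum item moved, and copies the new minimum LSN before dropping the lock so the 64-bit value cannot be read torn on 32-bit platforms. A toy user-space sketch of just that locking and notification shape; it tracks a bare minimum value rather than a real item list, and every name in it is invented:

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

/* Toy stand-in for the AIL: a minimum LSN protected by a lock. */
struct ail {
	pthread_mutex_t lock;
	uint64_t	min_lsn;	/* 0 means "empty" */
};

static void log_move_tail(uint64_t tail_lsn)
{
	printf("log tail moved to %llu\n", (unsigned long long)tail_lsn);
}

/*
 * Bulk-style update: take the lock once for the whole batch, remember whether
 * the minimum moved, and if it did, copy the new minimum *before* dropping
 * the lock, then notify without the lock held.
 */
static void ail_update_bulk(struct ail *ailp, const uint64_t *lsns, int n)
{
	uint64_t tail_lsn;
	int min_changed = 0;
	int i;

	pthread_mutex_lock(&ailp->lock);
	for (i = 0; i < n; i++) {
		if (ailp->min_lsn == 0 || lsns[i] < ailp->min_lsn) {
			ailp->min_lsn = lsns[i];
			min_changed = 1;
		}
	}
	if (!min_changed) {
		pthread_mutex_unlock(&ailp->lock);
		return;
	}
	tail_lsn = ailp->min_lsn;		/* copy while still locked */
	pthread_mutex_unlock(&ailp->lock);
	log_move_tail(tail_lsn);
}

int main(void)
{
	struct ail ail = { PTHREAD_MUTEX_INITIALIZER, 0 };
	uint64_t batch[] = { 300, 100, 200 };

	ail_update_bulk(&ail, batch, 3);
	return 0;
}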
506 695
507/* 696/*
508 * Delete the given item from the AIL. It must already be in 697 * xfs_trans_ail_delete_bulk - remove multiple log items from the AIL
509 * the AIL. 698 *
699 * @xfs_trans_ail_delete_bulk takes an array of log items that all need to
700 * removed from the AIL. The caller is already holding the AIL lock, and done
701 * all the checks necessary to ensure the items passed in via @log_items are
702 * ready for deletion. This includes checking that the items are in the AIL.
510 * 703 *
511 * Wakeup anyone with an lsn less than item's lsn. If the item 704 * For each log item to be removed, unlink it from the AIL, clear the IN_AIL
512 * we delete in the AIL is the minimum one, update the tail lsn in the 705 * flag from the item and reset the item's lsn to 0. If we remove the first
513 * log manager. 706 * item in the AIL, update the log tail to match the new minimum LSN in the
707 * AIL.
514 * 708 *
515 * Clear the IN_AIL flag from the item, reset its lsn to 0, and 709 * This function will not drop the AIL lock until all items are removed from
516 * bump the AIL's generation count to indicate that the tree 710 * the AIL to minimise the amount of lock traffic on the AIL. This does not
517 * has changed. 711 * greatly increase the AIL hold time, but does significantly reduce the amount
712 * of traffic on the lock, especially during IO completion.
518 * 713 *
519 * This function must be called with the AIL lock held. The lock 714 * This function must be called with the AIL lock held. The lock is dropped
520 * is dropped before returning. 715 * before returning.
521 */ 716 */
522void 717void
523xfs_trans_ail_delete( 718xfs_trans_ail_delete_bulk(
524 struct xfs_ail *ailp, 719 struct xfs_ail *ailp,
525 xfs_log_item_t *lip) __releases(ailp->xa_lock) 720 struct xfs_log_item **log_items,
721 int nr_items) __releases(ailp->xa_lock)
526{ 722{
527 xfs_log_item_t *dlip;
528 xfs_log_item_t *mlip; 723 xfs_log_item_t *mlip;
529 xfs_lsn_t tail_lsn; 724 xfs_lsn_t tail_lsn;
725 int mlip_changed = 0;
726 int i;
530 727
531 if (lip->li_flags & XFS_LI_IN_AIL) { 728 mlip = xfs_ail_min(ailp);
532 mlip = xfs_ail_min(ailp);
533 dlip = xfs_ail_delete(ailp, lip);
534 ASSERT(dlip == lip);
535 xfs_trans_ail_cursor_clear(ailp, dlip);
536 729
730 for (i = 0; i < nr_items; i++) {
731 struct xfs_log_item *lip = log_items[i];
732 if (!(lip->li_flags & XFS_LI_IN_AIL)) {
733 struct xfs_mount *mp = ailp->xa_mount;
537 734
538 lip->li_flags &= ~XFS_LI_IN_AIL;
539 lip->li_lsn = 0;
540
541 if (mlip == dlip) {
542 mlip = xfs_ail_min(ailp);
543 /*
544 * It is not safe to access mlip after the AIL lock
545 * is dropped, so we must get a copy of li_lsn
546 * before we do so. This is especially important
547 * on 32-bit platforms where accessing and updating
548 * 64-bit values like li_lsn is not atomic.
549 */
550 tail_lsn = mlip ? mlip->li_lsn : 0;
551 spin_unlock(&ailp->xa_lock);
552 xfs_log_move_tail(ailp->xa_mount, tail_lsn);
553 } else {
554 spin_unlock(&ailp->xa_lock); 735 spin_unlock(&ailp->xa_lock);
736 if (!XFS_FORCED_SHUTDOWN(mp)) {
737 xfs_alert_tag(mp, XFS_PTAG_AILDELETE,
738 "%s: attempting to delete a log item that is not in the AIL",
739 __func__);
740 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
741 }
742 return;
555 } 743 }
744
745 xfs_ail_delete(ailp, lip);
746 lip->li_flags &= ~XFS_LI_IN_AIL;
747 lip->li_lsn = 0;
748 if (mlip == lip)
749 mlip_changed = 1;
556 } 750 }
557 else {
558 /*
559 * If the file system is not being shutdown, we are in
560 * serious trouble if we get to this stage.
561 */
562 struct xfs_mount *mp = ailp->xa_mount;
563 751
752 if (!mlip_changed) {
564 spin_unlock(&ailp->xa_lock); 753 spin_unlock(&ailp->xa_lock);
565 if (!XFS_FORCED_SHUTDOWN(mp)) { 754 return;
566 xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp,
567 "%s: attempting to delete a log item that is not in the AIL",
568 __func__);
569 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
570 }
571 } 755 }
572}
573
574 756
757 /*
758 * It is not safe to access mlip after the AIL lock is dropped, so we
759 * must get a copy of li_lsn before we do so. This is especially
760 * important on 32-bit platforms where accessing and updating 64-bit
761 * values like li_lsn is not atomic. It is possible we've emptied the
762 * AIL here, so if that is the case, pass an LSN of 0 to the tail move.
763 */
764 mlip = xfs_ail_min(ailp);
765 tail_lsn = mlip ? mlip->li_lsn : 0;
766 spin_unlock(&ailp->xa_lock);
767 xfs_log_move_tail(ailp->xa_mount, tail_lsn);
768}
575 769
576/* 770/*
577 * The active item list (AIL) is a doubly linked list of log 771 * The active item list (AIL) is a doubly linked list of log
@@ -592,7 +786,6 @@ xfs_trans_ail_init(
592 xfs_mount_t *mp) 786 xfs_mount_t *mp)
593{ 787{
594 struct xfs_ail *ailp; 788 struct xfs_ail *ailp;
595 int error;
596 789
597 ailp = kmem_zalloc(sizeof(struct xfs_ail), KM_MAYFAIL); 790 ailp = kmem_zalloc(sizeof(struct xfs_ail), KM_MAYFAIL);
598 if (!ailp) 791 if (!ailp)
@@ -601,15 +794,9 @@ xfs_trans_ail_init(
601 ailp->xa_mount = mp; 794 ailp->xa_mount = mp;
602 INIT_LIST_HEAD(&ailp->xa_ail); 795 INIT_LIST_HEAD(&ailp->xa_ail);
603 spin_lock_init(&ailp->xa_lock); 796 spin_lock_init(&ailp->xa_lock);
604 error = xfsaild_start(ailp); 797 INIT_DELAYED_WORK(&ailp->xa_work, xfs_ail_worker);
605 if (error)
606 goto out_free_ailp;
607 mp->m_ail = ailp; 798 mp->m_ail = ailp;
608 return 0; 799 return 0;
609
610out_free_ailp:
611 kmem_free(ailp);
612 return error;
613} 800}
614 801
615void 802void
@@ -618,135 +805,6 @@ xfs_trans_ail_destroy(
618{ 805{
619 struct xfs_ail *ailp = mp->m_ail; 806 struct xfs_ail *ailp = mp->m_ail;
620 807
621 xfsaild_stop(ailp); 808 cancel_delayed_work_sync(&ailp->xa_work);
622 kmem_free(ailp); 809 kmem_free(ailp);
623} 810}
624
625/*
626 * Insert the given log item into the AIL.
627 * We almost always insert at the end of the list, so on inserts
628 * we search from the end of the list to find where the
629 * new item belongs.
630 */
631STATIC void
632xfs_ail_insert(
633 struct xfs_ail *ailp,
634 xfs_log_item_t *lip)
635/* ARGSUSED */
636{
637 xfs_log_item_t *next_lip;
638
639 /*
640 * If the list is empty, just insert the item.
641 */
642 if (list_empty(&ailp->xa_ail)) {
643 list_add(&lip->li_ail, &ailp->xa_ail);
644 return;
645 }
646
647 list_for_each_entry_reverse(next_lip, &ailp->xa_ail, li_ail) {
648 if (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) <= 0)
649 break;
650 }
651
652 ASSERT((&next_lip->li_ail == &ailp->xa_ail) ||
653 (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) <= 0));
654
655 list_add(&lip->li_ail, &next_lip->li_ail);
656
657 xfs_ail_check(ailp, lip);
658 return;
659}
660
661/*
662 * Delete the given item from the AIL. Return a pointer to the item.
663 */
664/*ARGSUSED*/
665STATIC xfs_log_item_t *
666xfs_ail_delete(
667 struct xfs_ail *ailp,
668 xfs_log_item_t *lip)
669/* ARGSUSED */
670{
671 xfs_ail_check(ailp, lip);
672
673 list_del(&lip->li_ail);
674
675 return lip;
676}
677
678/*
679 * Return a pointer to the first item in the AIL.
680 * If the AIL is empty, then return NULL.
681 */
682STATIC xfs_log_item_t *
683xfs_ail_min(
684 struct xfs_ail *ailp)
685/* ARGSUSED */
686{
687 if (list_empty(&ailp->xa_ail))
688 return NULL;
689
690 return list_first_entry(&ailp->xa_ail, xfs_log_item_t, li_ail);
691}
692
693/*
694 * Return a pointer to the item which follows
695 * the given item in the AIL. If the given item
696 * is the last item in the list, then return NULL.
697 */
698STATIC xfs_log_item_t *
699xfs_ail_next(
700 struct xfs_ail *ailp,
701 xfs_log_item_t *lip)
702/* ARGSUSED */
703{
704 if (lip->li_ail.next == &ailp->xa_ail)
705 return NULL;
706
707 return list_first_entry(&lip->li_ail, xfs_log_item_t, li_ail);
708}
709
710#ifdef DEBUG
711/*
712 * Check that the list is sorted as it should be.
713 */
714STATIC void
715xfs_ail_check(
716 struct xfs_ail *ailp,
717 xfs_log_item_t *lip)
718{
719 xfs_log_item_t *prev_lip;
720
721 if (list_empty(&ailp->xa_ail))
722 return;
723
724 /*
725 * Check the next and previous entries are valid.
726 */
727 ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0);
728 prev_lip = list_entry(lip->li_ail.prev, xfs_log_item_t, li_ail);
729 if (&prev_lip->li_ail != &ailp->xa_ail)
730 ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0);
731
732 prev_lip = list_entry(lip->li_ail.next, xfs_log_item_t, li_ail);
733 if (&prev_lip->li_ail != &ailp->xa_ail)
734 ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) >= 0);
735
736
737#ifdef XFS_TRANS_DEBUG
738 /*
739 * Walk the list checking lsn ordering, and that every entry has the
740 * XFS_LI_IN_AIL flag set. This is really expensive, so only do it
741 * when specifically debugging the transaction subsystem.
742 */
743 prev_lip = list_entry(&ailp->xa_ail, xfs_log_item_t, li_ail);
744 list_for_each_entry(lip, &ailp->xa_ail, li_ail) {
745 if (&prev_lip->li_ail != &ailp->xa_ail)
746 ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0);
747 ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0);
748 prev_lip = lip;
749 }
750#endif /* XFS_TRANS_DEBUG */
751}
752#endif /* DEBUG */
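The helpers removed above include xfs_ail_check(), a DEBUG-only sanity check that the list stays LSN-ordered, with the expensive full-list walk gated behind XFS_TRANS_DEBUG. A small stand-alone sketch of the same cheap-neighbour-check plus optional-full-walk split, using a plain array instead of a list; this is only an illustration of the debugging pattern, not XFS code:

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

/*
 * Verify that an item's neighbours in a sorted array respect the ordering.
 * The full walk is kept behind a macro because it is O(n) per call and only
 * worth paying for when chasing ordering bugs.
 */
static void check_sorted_around(const long *lsn, size_t n, size_t i)
{
	if (i > 0)
		assert(lsn[i - 1] <= lsn[i]);
	if (i + 1 < n)
		assert(lsn[i] <= lsn[i + 1]);

#ifdef FULL_CHECK
	for (size_t j = 1; j < n; j++)
		assert(lsn[j - 1] <= lsn[j]);
#endif
}

int main(void)
{
	long lsn[] = { 10, 20, 20, 30 };

	for (size_t i = 0; i < 4; i++)
		check_sorted_around(lsn, 4, i);
	puts("ordering ok");
	return 0;
}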
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 90af025e6839..03b3b7f85a3b 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -305,7 +305,7 @@ xfs_trans_read_buf(
305 if (xfs_error_target == target) { 305 if (xfs_error_target == target) {
306 if (((xfs_req_num++) % xfs_error_mod) == 0) { 306 if (((xfs_req_num++) % xfs_error_mod) == 0) {
307 xfs_buf_relse(bp); 307 xfs_buf_relse(bp);
308 cmn_err(CE_DEBUG, "Returning error!\n"); 308 xfs_debug(mp, "Returning error!");
309 return XFS_ERROR(EIO); 309 return XFS_ERROR(EIO);
310 } 310 }
311 } 311 }
@@ -336,7 +336,7 @@ xfs_trans_read_buf(
336 ASSERT(!XFS_BUF_ISASYNC(bp)); 336 ASSERT(!XFS_BUF_ISASYNC(bp));
337 XFS_BUF_READ(bp); 337 XFS_BUF_READ(bp);
338 xfsbdstrat(tp->t_mountp, bp); 338 xfsbdstrat(tp->t_mountp, bp);
339 error = xfs_iowait(bp); 339 error = xfs_buf_iowait(bp);
340 if (error) { 340 if (error) {
341 xfs_ioerror_alert("xfs_trans_read_buf", mp, 341 xfs_ioerror_alert("xfs_trans_read_buf", mp,
342 bp, blkno); 342 bp, blkno);
@@ -383,7 +383,8 @@ xfs_trans_read_buf(
383 bp = xfs_buf_read(target, blkno, len, flags | XBF_DONT_BLOCK); 383 bp = xfs_buf_read(target, blkno, len, flags | XBF_DONT_BLOCK);
384 if (bp == NULL) { 384 if (bp == NULL) {
385 *bpp = NULL; 385 *bpp = NULL;
386 return 0; 386 return (flags & XBF_TRYLOCK) ?
387 0 : XFS_ERROR(ENOMEM);
387 } 388 }
388 if (XFS_BUF_GETERROR(bp) != 0) { 389 if (XFS_BUF_GETERROR(bp) != 0) {
389 XFS_BUF_SUPER_STALE(bp); 390 XFS_BUF_SUPER_STALE(bp);
@@ -403,7 +404,7 @@ xfs_trans_read_buf(
403 xfs_force_shutdown(tp->t_mountp, 404 xfs_force_shutdown(tp->t_mountp,
404 SHUTDOWN_META_IO_ERROR); 405 SHUTDOWN_META_IO_ERROR);
405 xfs_buf_relse(bp); 406 xfs_buf_relse(bp);
406 cmn_err(CE_DEBUG, "Returning trans error!\n"); 407 xfs_debug(mp, "Returning trans error!");
407 return XFS_ERROR(EIO); 408 return XFS_ERROR(EIO);
408 } 409 }
409 } 410 }
@@ -427,7 +428,7 @@ shutdown_abort:
427 */ 428 */
428#if defined(DEBUG) 429#if defined(DEBUG)
429 if (XFS_BUF_ISSTALE(bp) && XFS_BUF_ISDELAYWRITE(bp)) 430 if (XFS_BUF_ISSTALE(bp) && XFS_BUF_ISDELAYWRITE(bp))
430 cmn_err(CE_NOTE, "about to pop assert, bp == 0x%p", bp); 431 xfs_notice(mp, "about to pop assert, bp == 0x%p", bp);
431#endif 432#endif
432 ASSERT((XFS_BUF_BFLAGS(bp) & (XBF_STALE|XBF_DELWRI)) != 433 ASSERT((XFS_BUF_BFLAGS(bp) & (XBF_STALE|XBF_DELWRI)) !=
433 (XBF_STALE|XBF_DELWRI)); 434 (XBF_STALE|XBF_DELWRI));
diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c
index f783d5e9fa70..f7590f5badea 100644
--- a/fs/xfs/xfs_trans_extfree.c
+++ b/fs/xfs/xfs_trans_extfree.c
@@ -69,12 +69,16 @@ xfs_trans_log_efi_extent(xfs_trans_t *tp,
69 tp->t_flags |= XFS_TRANS_DIRTY; 69 tp->t_flags |= XFS_TRANS_DIRTY;
70 efip->efi_item.li_desc->lid_flags |= XFS_LID_DIRTY; 70 efip->efi_item.li_desc->lid_flags |= XFS_LID_DIRTY;
71 71
72 next_extent = efip->efi_next_extent; 72 /*
73 * atomic_inc_return gives us the value after the increment;
74 * we want to use it as an array index so we need to subtract 1 from
75 * it.
76 */
77 next_extent = atomic_inc_return(&efip->efi_next_extent) - 1;
73 ASSERT(next_extent < efip->efi_format.efi_nextents); 78 ASSERT(next_extent < efip->efi_format.efi_nextents);
74 extp = &(efip->efi_format.efi_extents[next_extent]); 79 extp = &(efip->efi_format.efi_extents[next_extent]);
75 extp->ext_start = start_block; 80 extp->ext_start = start_block;
76 extp->ext_len = ext_len; 81 extp->ext_len = ext_len;
77 efip->efi_next_extent++;
78} 82}
79 83
80 84
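The change above reserves an extent slot with atomic_inc_return() minus one, so concurrent callers each claim a distinct array index without a separate counter update. A user-space sketch of the same idea with C11 atomic_fetch_add(), which already returns the pre-increment value so no adjustment is needed; the table and names here are made up:

#include <stdatomic.h>
#include <stdio.h>

#define NEXTENTS 8

struct extent { unsigned long start, len; };

static struct extent extents[NEXTENTS];
static atomic_int next_extent;		/* next free slot in extents[] */

/* Claim a slot and fill it; safe against concurrent callers because the
 * index is reserved atomically before the slot is written. */
static void log_extent(unsigned long start, unsigned long len)
{
	int idx = atomic_fetch_add(&next_extent, 1);	/* old value = our slot */

	if (idx >= NEXTENTS)
		return;		/* table full; real code asserts instead */
	extents[idx].start = start;
	extents[idx].len = len;
}

int main(void)
{
	log_extent(100, 4);
	log_extent(200, 8);
	printf("slot 0: %lu+%lu, slot 1: %lu+%lu\n",
	       extents[0].start, extents[0].len,
	       extents[1].start, extents[1].len);
	return 0;
}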
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index cdc53a1050c5..048b0c689d3e 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -44,28 +44,6 @@ xfs_trans_inode_broot_debug(
44#endif 44#endif
45 45
46/* 46/*
47 * Get an inode and join it to the transaction.
48 */
49int
50xfs_trans_iget(
51 xfs_mount_t *mp,
52 xfs_trans_t *tp,
53 xfs_ino_t ino,
54 uint flags,
55 uint lock_flags,
56 xfs_inode_t **ipp)
57{
58 int error;
59
60 error = xfs_iget(mp, tp, ino, flags, lock_flags, ipp);
61 if (!error && tp) {
62 xfs_trans_ijoin(tp, *ipp);
63 (*ipp)->i_itemp->ili_lock_flags = lock_flags;
64 }
65 return error;
66}
67
68/*
69 * Add a locked inode to the transaction. 47 * Add a locked inode to the transaction.
70 * 48 *
71 * The inode must be locked, and it cannot be associated with any transaction. 49 * The inode must be locked, and it cannot be associated with any transaction.
@@ -103,7 +81,7 @@ xfs_trans_ijoin(
103 * 81 *
104 * 82 *
105 * Grabs a reference to the inode which will be dropped when the transaction 83 * Grabs a reference to the inode which will be dropped when the transaction
106 * is commited. The inode will also be unlocked at that point. The inode 84 * is committed. The inode will also be unlocked at that point. The inode
107 * must be locked, and it cannot be associated with any transaction. 85 * must be locked, and it cannot be associated with any transaction.
108 */ 86 */
109void 87void
@@ -118,6 +96,36 @@ xfs_trans_ijoin_ref(
118} 96}
119 97
120/* 98/*
99 * Transactional inode timestamp update. Requires the inode to be locked and
100 * joined to the transaction supplied. Relies on the transaction subsystem to
101 * track dirty state and update/writeback the inode accordingly.
102 */
103void
104xfs_trans_ichgtime(
105 struct xfs_trans *tp,
106 struct xfs_inode *ip,
107 int flags)
108{
109 struct inode *inode = VFS_I(ip);
110 timespec_t tv;
111
112 ASSERT(tp);
113 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
114 ASSERT(ip->i_transp == tp);
115
116 tv = current_fs_time(inode->i_sb);
117
118 if ((flags & XFS_ICHGTIME_MOD) &&
119 !timespec_equal(&inode->i_mtime, &tv)) {
120 inode->i_mtime = tv;
121 }
122 if ((flags & XFS_ICHGTIME_CHG) &&
123 !timespec_equal(&inode->i_ctime, &tv)) {
124 inode->i_ctime = tv;
125 }
126}
127
128/*
121 * This is called to mark the fields indicated in fieldmask as needing 129 * This is called to mark the fields indicated in fieldmask as needing
122 * to be logged when the transaction is committed. The inode must 130 * to be logged when the transaction is committed. The inode must
123 * already be associated with the given transaction. 131 * already be associated with the given transaction.
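The new xfs_trans_ichgtime() above reads the filesystem time once and only writes back the mtime/ctime fields that actually differ, leaving unchanged fields untouched. A minimal user-space sketch of that only-if-changed update; the structure and flag names are invented for illustration:

#include <stdio.h>
#include <time.h>

/* Stand-in for the inode timestamps touched by the real helper. */
struct toy_inode { struct timespec mtime, ctime; };

#define CHG_MOD 1
#define CHG_CHG 2

static int timespec_eq(const struct timespec *a, const struct timespec *b)
{
	return a->tv_sec == b->tv_sec && a->tv_nsec == b->tv_nsec;
}

/*
 * Update the requested timestamps to "now", but only write the fields that
 * actually change, so fields that are already current are not dirtied.
 */
static void ichgtime(struct toy_inode *ip, int flags)
{
	struct timespec tv;

	clock_gettime(CLOCK_REALTIME, &tv);
	if ((flags & CHG_MOD) && !timespec_eq(&ip->mtime, &tv))
		ip->mtime = tv;
	if ((flags & CHG_CHG) && !timespec_eq(&ip->ctime, &tv))
		ip->ctime = tv;
}

int main(void)
{
	struct toy_inode ino = { 0 };

	ichgtime(&ino, CHG_MOD | CHG_CHG);
	printf("mtime=%ld ctime=%ld\n", (long)ino.mtime.tv_sec,
	       (long)ino.ctime.tv_sec);
	return 0;
}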
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 62da86c90de5..6b164e9e9a1f 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -22,15 +22,17 @@ struct xfs_log_item;
22struct xfs_log_item_desc; 22struct xfs_log_item_desc;
23struct xfs_mount; 23struct xfs_mount;
24struct xfs_trans; 24struct xfs_trans;
25struct xfs_ail;
26struct xfs_log_vec;
25 27
26void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *); 28void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *);
27void xfs_trans_del_item(struct xfs_log_item *); 29void xfs_trans_del_item(struct xfs_log_item *);
28void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn, 30void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn,
29 int flags); 31 int flags);
30void xfs_trans_item_committed(struct xfs_log_item *lip,
31 xfs_lsn_t commit_lsn, int aborted);
32void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp); 32void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp);
33 33
34void xfs_trans_committed_bulk(struct xfs_ail *ailp, struct xfs_log_vec *lv,
35 xfs_lsn_t commit_lsn, int aborted);
34/* 36/*
35 * AIL traversal cursor. 37 * AIL traversal cursor.
36 * 38 *
@@ -63,28 +65,52 @@ struct xfs_ail_cursor {
63struct xfs_ail { 65struct xfs_ail {
64 struct xfs_mount *xa_mount; 66 struct xfs_mount *xa_mount;
65 struct list_head xa_ail; 67 struct list_head xa_ail;
66 uint xa_gen;
67 struct task_struct *xa_task;
68 xfs_lsn_t xa_target; 68 xfs_lsn_t xa_target;
69 struct xfs_ail_cursor xa_cursors; 69 struct xfs_ail_cursor xa_cursors;
70 spinlock_t xa_lock; 70 spinlock_t xa_lock;
71 struct delayed_work xa_work;
72 xfs_lsn_t xa_last_pushed_lsn;
73 unsigned long xa_flags;
71}; 74};
72 75
76#define XFS_AIL_PUSHING_BIT 0
77
73/* 78/*
74 * From xfs_trans_ail.c 79 * From xfs_trans_ail.c
75 */ 80 */
76void xfs_trans_ail_update(struct xfs_ail *ailp, 81
77 struct xfs_log_item *lip, xfs_lsn_t lsn) 82extern struct workqueue_struct *xfs_ail_wq; /* AIL workqueue */
78 __releases(ailp->xa_lock); 83
79void xfs_trans_ail_delete(struct xfs_ail *ailp, 84void xfs_trans_ail_update_bulk(struct xfs_ail *ailp,
80 struct xfs_log_item *lip) 85 struct xfs_log_item **log_items, int nr_items,
81 __releases(ailp->xa_lock); 86 xfs_lsn_t lsn) __releases(ailp->xa_lock);
82void xfs_trans_ail_push(struct xfs_ail *, xfs_lsn_t); 87static inline void
88xfs_trans_ail_update(
89 struct xfs_ail *ailp,
90 struct xfs_log_item *lip,
91 xfs_lsn_t lsn) __releases(ailp->xa_lock)
92{
93 xfs_trans_ail_update_bulk(ailp, &lip, 1, lsn);
94}
95
96void xfs_trans_ail_delete_bulk(struct xfs_ail *ailp,
97 struct xfs_log_item **log_items, int nr_items)
98 __releases(ailp->xa_lock);
99static inline void
100xfs_trans_ail_delete(
101 struct xfs_ail *ailp,
102 xfs_log_item_t *lip) __releases(ailp->xa_lock)
103{
104 xfs_trans_ail_delete_bulk(ailp, &lip, 1);
105}
106
107void xfs_ail_push(struct xfs_ail *, xfs_lsn_t);
108void xfs_ail_push_all(struct xfs_ail *);
109xfs_lsn_t xfs_ail_min_lsn(struct xfs_ail *ailp);
110
83void xfs_trans_unlocked_item(struct xfs_ail *, 111void xfs_trans_unlocked_item(struct xfs_ail *,
84 xfs_log_item_t *); 112 xfs_log_item_t *);
85 113
86xfs_lsn_t xfs_trans_ail_tail(struct xfs_ail *ailp);
87
88struct xfs_log_item *xfs_trans_ail_cursor_first(struct xfs_ail *ailp, 114struct xfs_log_item *xfs_trans_ail_cursor_first(struct xfs_ail *ailp,
89 struct xfs_ail_cursor *cur, 115 struct xfs_ail_cursor *cur,
90 xfs_lsn_t lsn); 116 xfs_lsn_t lsn);
@@ -93,11 +119,6 @@ struct xfs_log_item *xfs_trans_ail_cursor_next(struct xfs_ail *ailp,
93void xfs_trans_ail_cursor_done(struct xfs_ail *ailp, 119void xfs_trans_ail_cursor_done(struct xfs_ail *ailp,
94 struct xfs_ail_cursor *cur); 120 struct xfs_ail_cursor *cur);
95 121
96long xfsaild_push(struct xfs_ail *, xfs_lsn_t *);
97void xfsaild_wakeup(struct xfs_ail *, xfs_lsn_t);
98int xfsaild_start(struct xfs_ail *);
99void xfsaild_stop(struct xfs_ail *);
100
101#if BITS_PER_LONG != 64 122#if BITS_PER_LONG != 64
102static inline void 123static inline void
103xfs_trans_ail_copy_lsn( 124xfs_trans_ail_copy_lsn(
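The header changes above keep the old single-item xfs_trans_ail_update()/xfs_trans_ail_delete() names as static inline wrappers around the new bulk entry points, so existing callers compile unchanged while the locking cost is paid once per batch. A generic user-space sketch of that wrapper pattern, not the XFS prototypes:

#include <stdio.h>

struct item { int id; };

/* The only "real" implementation handles a batch of items. */
static void update_bulk(struct item **items, int nr_items, long lsn)
{
	for (int i = 0; i < nr_items; i++)
		printf("item %d -> lsn %ld\n", items[i]->id, lsn);
}

/* Single-item callers keep their old signature via a thin inline wrapper. */
static inline void update(struct item *item, long lsn)
{
	update_bulk(&item, 1, lsn);
}

int main(void)
{
	struct item a = { 1 };

	update(&a, 42);
	return 0;
}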
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h
index 320775295e32..65584b55607d 100644
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/xfs_types.h
@@ -73,10 +73,6 @@ typedef __int32_t xfs_tid_t; /* transaction identifier */
73typedef __uint32_t xfs_dablk_t; /* dir/attr block number (in file) */ 73typedef __uint32_t xfs_dablk_t; /* dir/attr block number (in file) */
74typedef __uint32_t xfs_dahash_t; /* dir/attr hash value */ 74typedef __uint32_t xfs_dahash_t; /* dir/attr hash value */
75 75
76typedef __uint16_t xfs_prid_t; /* prid_t truncated to 16bits in XFS */
77
78typedef __uint32_t xlog_tid_t; /* transaction ID type */
79
80/* 76/*
81 * These types are 64 bits on disk but are either 32 or 64 bits in memory. 77 * These types are 64 bits on disk but are either 32 or 64 bits in memory.
82 * Disk based types: 78 * Disk based types:
diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c
index b7d5769d2df0..8b32d1a4c5a1 100644
--- a/fs/xfs/xfs_utils.c
+++ b/fs/xfs/xfs_utils.c
@@ -56,7 +56,6 @@ xfs_dir_ialloc(
56 mode_t mode, 56 mode_t mode,
57 xfs_nlink_t nlink, 57 xfs_nlink_t nlink,
58 xfs_dev_t rdev, 58 xfs_dev_t rdev,
59 cred_t *credp,
60 prid_t prid, /* project id */ 59 prid_t prid, /* project id */
61 int okalloc, /* ok to allocate new space */ 60 int okalloc, /* ok to allocate new space */
62 xfs_inode_t **ipp, /* pointer to inode; it will be 61 xfs_inode_t **ipp, /* pointer to inode; it will be
@@ -93,7 +92,7 @@ xfs_dir_ialloc(
93 * transaction commit so that no other process can steal 92 * transaction commit so that no other process can steal
94 * the inode(s) that we've just allocated. 93 * the inode(s) that we've just allocated.
95 */ 94 */
96 code = xfs_ialloc(tp, dp, mode, nlink, rdev, credp, prid, okalloc, 95 code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, okalloc,
97 &ialloc_context, &call_again, &ip); 96 &ialloc_context, &call_again, &ip);
98 97
99 /* 98 /*
@@ -197,7 +196,7 @@ xfs_dir_ialloc(
197 * other allocations in this allocation group, 196 * other allocations in this allocation group,
198 * this call should always succeed. 197 * this call should always succeed.
199 */ 198 */
200 code = xfs_ialloc(tp, dp, mode, nlink, rdev, credp, prid, 199 code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid,
201 okalloc, &ialloc_context, &call_again, &ip); 200 okalloc, &ialloc_context, &call_again, &ip);
202 201
203 /* 202 /*
@@ -235,7 +234,7 @@ xfs_droplink(
235{ 234{
236 int error; 235 int error;
237 236
238 xfs_ichgtime(ip, XFS_ICHGTIME_CHG); 237 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
239 238
240 ASSERT (ip->i_d.di_nlink > 0); 239 ASSERT (ip->i_d.di_nlink > 0);
241 ip->i_d.di_nlink--; 240 ip->i_d.di_nlink--;
@@ -299,7 +298,7 @@ xfs_bumplink(
299{ 298{
300 if (ip->i_d.di_nlink >= XFS_MAXLINK) 299 if (ip->i_d.di_nlink >= XFS_MAXLINK)
301 return XFS_ERROR(EMLINK); 300 return XFS_ERROR(EMLINK);
302 xfs_ichgtime(ip, XFS_ICHGTIME_CHG); 301 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
303 302
304 ASSERT(ip->i_d.di_nlink > 0); 303 ASSERT(ip->i_d.di_nlink > 0);
305 ip->i_d.di_nlink++; 304 ip->i_d.di_nlink++;
diff --git a/fs/xfs/xfs_utils.h b/fs/xfs/xfs_utils.h
index f55b9678264f..456fca314933 100644
--- a/fs/xfs/xfs_utils.h
+++ b/fs/xfs/xfs_utils.h
@@ -19,8 +19,7 @@
19#define __XFS_UTILS_H__ 19#define __XFS_UTILS_H__
20 20
21extern int xfs_dir_ialloc(xfs_trans_t **, xfs_inode_t *, mode_t, xfs_nlink_t, 21extern int xfs_dir_ialloc(xfs_trans_t **, xfs_inode_t *, mode_t, xfs_nlink_t,
22 xfs_dev_t, cred_t *, prid_t, int, 22 xfs_dev_t, prid_t, int, xfs_inode_t **, int *);
23 xfs_inode_t **, int *);
24extern int xfs_droplink(xfs_trans_t *, xfs_inode_t *); 23extern int xfs_droplink(xfs_trans_t *, xfs_inode_t *);
25extern int xfs_bumplink(xfs_trans_t *, xfs_inode_t *); 24extern int xfs_bumplink(xfs_trans_t *, xfs_inode_t *);
26extern void xfs_bump_ino_vers2(xfs_trans_t *, xfs_inode_t *); 25extern void xfs_bump_ino_vers2(xfs_trans_t *, xfs_inode_t *);
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 4c7c7bfb2b2f..619720705bc6 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -114,7 +114,7 @@ xfs_setattr(
114 */ 114 */
115 ASSERT(udqp == NULL); 115 ASSERT(udqp == NULL);
116 ASSERT(gdqp == NULL); 116 ASSERT(gdqp == NULL);
117 code = xfs_qm_vop_dqalloc(ip, uid, gid, ip->i_d.di_projid, 117 code = xfs_qm_vop_dqalloc(ip, uid, gid, xfs_get_projid(ip),
118 qflags, &udqp, &gdqp); 118 qflags, &udqp, &gdqp);
119 if (code) 119 if (code)
120 return code; 120 return code;
@@ -184,8 +184,11 @@ xfs_setattr(
184 ip->i_size == 0 && ip->i_d.di_nextents == 0) { 184 ip->i_size == 0 && ip->i_d.di_nextents == 0) {
185 xfs_iunlock(ip, XFS_ILOCK_EXCL); 185 xfs_iunlock(ip, XFS_ILOCK_EXCL);
186 lock_flags &= ~XFS_ILOCK_EXCL; 186 lock_flags &= ~XFS_ILOCK_EXCL;
187 if (mask & ATTR_CTIME) 187 if (mask & ATTR_CTIME) {
188 xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 188 inode->i_mtime = inode->i_ctime =
189 current_fs_time(inode->i_sb);
190 xfs_mark_inode_dirty_sync(ip);
191 }
189 code = 0; 192 code = 0;
190 goto error_return; 193 goto error_return;
191 } 194 }
@@ -950,40 +953,62 @@ xfs_release(
950 * If we previously truncated this file and removed old data 953 * If we previously truncated this file and removed old data
951 * in the process, we want to initiate "early" writeout on 954 * in the process, we want to initiate "early" writeout on
952 * the last close. This is an attempt to combat the notorious 955 * the last close. This is an attempt to combat the notorious
953 * NULL files problem which is particularly noticable from a 956 * NULL files problem which is particularly noticeable from a
954 * truncate down, buffered (re-)write (delalloc), followed by 957 * truncate down, buffered (re-)write (delalloc), followed by
955 * a crash. What we are effectively doing here is 958 * a crash. What we are effectively doing here is
956 * significantly reducing the time window where we'd otherwise 959 * significantly reducing the time window where we'd otherwise
957 * be exposed to that problem. 960 * be exposed to that problem.
958 */ 961 */
959 truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED); 962 truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
960 if (truncated && VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0) 963 if (truncated) {
961 xfs_flush_pages(ip, 0, -1, XBF_ASYNC, FI_NONE); 964 xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE);
965 if (VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0)
966 xfs_flush_pages(ip, 0, -1, XBF_ASYNC, FI_NONE);
967 }
962 } 968 }
963 969
964 if (ip->i_d.di_nlink != 0) { 970 if (ip->i_d.di_nlink == 0)
965 if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) && 971 return 0;
966 ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 ||
967 ip->i_delayed_blks > 0)) &&
968 (ip->i_df.if_flags & XFS_IFEXTENTS)) &&
969 (!(ip->i_d.di_flags &
970 (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
971 972
972 /* 973 if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
973 * If we can't get the iolock just skip truncating 974 ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 ||
974 * the blocks past EOF because we could deadlock 975 ip->i_delayed_blks > 0)) &&
975 * with the mmap_sem otherwise. We'll get another 976 (ip->i_df.if_flags & XFS_IFEXTENTS)) &&
976 * chance to drop them once the last reference to 977 (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
977 * the inode is dropped, so we'll never leak blocks 978
978 * permanently. 979 /*
979 */ 980 * If we can't get the iolock just skip truncating the blocks
980 error = xfs_free_eofblocks(mp, ip, 981 * past EOF because we could deadlock with the mmap_sem
981 XFS_FREE_EOF_TRYLOCK); 982 * otherwise. We'll get another chance to drop them once the
982 if (error) 983 * last reference to the inode is dropped, so we'll never leak
983 return error; 984 * blocks permanently.
984 } 985 *
985 } 986 * Further, check if the inode is being opened, written and
987 * closed frequently and we have delayed allocation blocks
988 * outstanding (e.g. streaming writes from the NFS server),
989 * truncating the blocks past EOF will cause fragmentation to
990 * occur.
991 *
992 * In this case don't do the truncation, either, but we have to
993 * be careful how we detect this case. Blocks beyond EOF show
994 * up as i_delayed_blks even when the inode is clean, so we
995 * need to truncate them away first before checking for a dirty
996 * release. Hence on the first dirty close we will still remove
997 * the speculative allocation, but after that we will leave it
998 * in place.
999 */
1000 if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
1001 return 0;
1002
1003 error = xfs_free_eofblocks(mp, ip,
1004 XFS_FREE_EOF_TRYLOCK);
1005 if (error)
1006 return error;
986 1007
1008 /* delalloc blocks after truncation means it really is dirty */
1009 if (ip->i_delayed_blks)
1010 xfs_iflags_set(ip, XFS_IDIRTY_RELEASE);
1011 }
987 return 0; 1012 return 0;
988} 1013}
989 1014
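The rewritten xfs_release() logic above trims speculative preallocation beyond EOF only on the first dirty close; if delalloc blocks remain afterwards it sets XFS_IDIRTY_RELEASE, so later open-write-close cycles (the NFS streaming-write case described in the comment) keep their preallocation instead of being repeatedly truncated and fragmented. A stripped-down sketch of that first-time-only behaviour with a plain flag; the inode fields here are stand-ins and the trim itself is not simulated:

#include <stdbool.h>
#include <stdio.h>

struct toy_inode {
	int  delalloc_blocks;	/* speculative blocks beyond EOF */
	bool dirty_release;	/* has a dirty close already trimmed once? */
};

/* Called on the last close of a file descriptor. */
static void release(struct toy_inode *ip)
{
	if (ip->dirty_release) {
		puts("leave preallocation alone (frequent writer)");
		return;
	}

	puts("trim blocks beyond EOF");
	/* Anything still delalloc after the trim means the file really is
	 * being written to; remember that and stop trimming on later closes. */
	if (ip->delalloc_blocks)
		ip->dirty_release = true;
}

int main(void)
{
	struct toy_inode ip = { .delalloc_blocks = 16, .dirty_release = false };

	release(&ip);	/* first close: trims */
	release(&ip);	/* later closes: preallocation preserved */
	return 0;
}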
@@ -1167,9 +1192,8 @@ xfs_inactive(
1167 * inode might be lost for a long time or forever. 1192 * inode might be lost for a long time or forever.
1168 */ 1193 */
1169 if (!XFS_FORCED_SHUTDOWN(mp)) { 1194 if (!XFS_FORCED_SHUTDOWN(mp)) {
1170 cmn_err(CE_NOTE, 1195 xfs_notice(mp, "%s: xfs_ifree returned error %d",
1171 "xfs_inactive: xfs_ifree() returned an error = %d on %s", 1196 __func__, error);
1172 error, mp->m_fsname);
1173 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); 1197 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1174 } 1198 }
1175 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); 1199 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
@@ -1186,12 +1210,12 @@ xfs_inactive(
1186 */ 1210 */
1187 error = xfs_bmap_finish(&tp, &free_list, &committed); 1211 error = xfs_bmap_finish(&tp, &free_list, &committed);
1188 if (error) 1212 if (error)
1189 xfs_fs_cmn_err(CE_NOTE, mp, "xfs_inactive: " 1213 xfs_notice(mp, "%s: xfs_bmap_finish returned error %d",
1190 "xfs_bmap_finish() returned error %d", error); 1214 __func__, error);
1191 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 1215 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1192 if (error) 1216 if (error)
1193 xfs_fs_cmn_err(CE_NOTE, mp, "xfs_inactive: " 1217 xfs_notice(mp, "%s: xfs_trans_commit returned error %d",
1194 "xfs_trans_commit() returned error %d", error); 1218 __func__, error);
1195 } 1219 }
1196 1220
1197 /* 1221 /*
@@ -1253,8 +1277,7 @@ xfs_create(
1253 struct xfs_name *name, 1277 struct xfs_name *name,
1254 mode_t mode, 1278 mode_t mode,
1255 xfs_dev_t rdev, 1279 xfs_dev_t rdev,
1256 xfs_inode_t **ipp, 1280 xfs_inode_t **ipp)
1257 cred_t *credp)
1258{ 1281{
1259 int is_dir = S_ISDIR(mode); 1282 int is_dir = S_ISDIR(mode);
1260 struct xfs_mount *mp = dp->i_mount; 1283 struct xfs_mount *mp = dp->i_mount;
@@ -1266,7 +1289,7 @@ xfs_create(
1266 boolean_t unlock_dp_on_error = B_FALSE; 1289 boolean_t unlock_dp_on_error = B_FALSE;
1267 uint cancel_flags; 1290 uint cancel_flags;
1268 int committed; 1291 int committed;
1269 xfs_prid_t prid; 1292 prid_t prid;
1270 struct xfs_dquot *udqp = NULL; 1293 struct xfs_dquot *udqp = NULL;
1271 struct xfs_dquot *gdqp = NULL; 1294 struct xfs_dquot *gdqp = NULL;
1272 uint resblks; 1295 uint resblks;
@@ -1279,9 +1302,9 @@ xfs_create(
1279 return XFS_ERROR(EIO); 1302 return XFS_ERROR(EIO);
1280 1303
1281 if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) 1304 if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
1282 prid = dp->i_d.di_projid; 1305 prid = xfs_get_projid(dp);
1283 else 1306 else
1284 prid = dfltprid; 1307 prid = XFS_PROJID_DEFAULT;
1285 1308
1286 /* 1309 /*
1287 * Make sure that we have allocated dquot(s) on disk. 1310 * Make sure that we have allocated dquot(s) on disk.
@@ -1289,7 +1312,7 @@ xfs_create(
1289 error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid, 1312 error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
1290 XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp); 1313 XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
1291 if (error) 1314 if (error)
1292 goto std_return; 1315 return error;
1293 1316
1294 if (is_dir) { 1317 if (is_dir) {
1295 rdev = 0; 1318 rdev = 0;
@@ -1360,7 +1383,7 @@ xfs_create(
1360 * entry pointing to them, but a directory also the "." entry 1383 * entry pointing to them, but a directory also the "." entry
1361 * pointing to itself. 1384 * pointing to itself.
1362 */ 1385 */
1363 error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, credp, 1386 error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev,
1364 prid, resblks > 0, &ip, &committed); 1387 prid, resblks > 0, &ip, &committed);
1365 if (error) { 1388 if (error) {
1366 if (error == ENOSPC) 1389 if (error == ENOSPC)
@@ -1369,12 +1392,6 @@ xfs_create(
1369 } 1392 }
1370 1393
1371 /* 1394 /*
1372 * At this point, we've gotten a newly allocated inode.
1373 * It is locked (and joined to the transaction).
1374 */
1375 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
1376
1377 /*
1378 * Now we join the directory inode to the transaction. We do not do it 1395 * Now we join the directory inode to the transaction. We do not do it
1379 * earlier because xfs_dir_ialloc might commit the previous transaction 1396 * earlier because xfs_dir_ialloc might commit the previous transaction
1380 * (and release all the locks). An error from here on will result in 1397 * (and release all the locks). An error from here on will result in
@@ -1391,7 +1408,7 @@ xfs_create(
1391 ASSERT(error != ENOSPC); 1408 ASSERT(error != ENOSPC);
1392 goto out_trans_abort; 1409 goto out_trans_abort;
1393 } 1410 }
1394 xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 1411 xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
1395 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); 1412 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
1396 1413
1397 if (is_dir) { 1414 if (is_dir) {
@@ -1419,22 +1436,13 @@ xfs_create(
1419 */ 1436 */
1420 xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp); 1437 xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp);
1421 1438
1422 /*
1423 * xfs_trans_commit normally decrements the vnode ref count
1424 * when it unlocks the inode. Since we want to return the
1425 * vnode to the caller, we bump the vnode ref count now.
1426 */
1427 IHOLD(ip);
1428
1429 error = xfs_bmap_finish(&tp, &free_list, &committed); 1439 error = xfs_bmap_finish(&tp, &free_list, &committed);
1430 if (error) 1440 if (error)
1431 goto out_abort_rele; 1441 goto out_bmap_cancel;
1432 1442
1433 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 1443 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1434 if (error) { 1444 if (error)
1435 IRELE(ip); 1445 goto out_release_inode;
1436 goto out_dqrele;
1437 }
1438 1446
1439 xfs_qm_dqrele(udqp); 1447 xfs_qm_dqrele(udqp);
1440 xfs_qm_dqrele(gdqp); 1448 xfs_qm_dqrele(gdqp);
@@ -1448,27 +1456,21 @@ xfs_create(
1448 cancel_flags |= XFS_TRANS_ABORT; 1456 cancel_flags |= XFS_TRANS_ABORT;
1449 out_trans_cancel: 1457 out_trans_cancel:
1450 xfs_trans_cancel(tp, cancel_flags); 1458 xfs_trans_cancel(tp, cancel_flags);
1451 out_dqrele: 1459 out_release_inode:
1460 /*
1461 * Wait until after the current transaction is aborted to
1462 * release the inode. This prevents recursive transactions
1463 * and deadlocks from xfs_inactive.
1464 */
1465 if (ip)
1466 IRELE(ip);
1467
1452 xfs_qm_dqrele(udqp); 1468 xfs_qm_dqrele(udqp);
1453 xfs_qm_dqrele(gdqp); 1469 xfs_qm_dqrele(gdqp);
1454 1470
1455 if (unlock_dp_on_error) 1471 if (unlock_dp_on_error)
1456 xfs_iunlock(dp, XFS_ILOCK_EXCL); 1472 xfs_iunlock(dp, XFS_ILOCK_EXCL);
1457 std_return:
1458 return error; 1473 return error;
1459
1460 out_abort_rele:
1461 /*
1462 * Wait until after the current transaction is aborted to
1463 * release the inode. This prevents recursive transactions
1464 * and deadlocks from xfs_inactive.
1465 */
1466 xfs_bmap_cancel(&free_list);
1467 cancel_flags |= XFS_TRANS_ABORT;
1468 xfs_trans_cancel(tp, cancel_flags);
1469 IRELE(ip);
1470 unlock_dp_on_error = B_FALSE;
1471 goto out_dqrele;
1472} 1474}
1473 1475
1474#ifdef DEBUG 1476#ifdef DEBUG
@@ -1742,7 +1744,7 @@ xfs_remove(
1742 ASSERT(error != ENOENT); 1744 ASSERT(error != ENOENT);
1743 goto out_bmap_cancel; 1745 goto out_bmap_cancel;
1744 } 1746 }
1745 xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 1747 xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
1746 1748
1747 if (is_dir) { 1749 if (is_dir) {
1748 /* 1750 /*
@@ -1880,7 +1882,7 @@ xfs_link(
1880 * the tree quota mechanism could be circumvented. 1882 * the tree quota mechanism could be circumvented.
1881 */ 1883 */
1882 if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && 1884 if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
1883 (tdp->i_d.di_projid != sip->i_d.di_projid))) { 1885 (xfs_get_projid(tdp) != xfs_get_projid(sip)))) {
1884 error = XFS_ERROR(EXDEV); 1886 error = XFS_ERROR(EXDEV);
1885 goto error_return; 1887 goto error_return;
1886 } 1888 }
@@ -1895,7 +1897,7 @@ xfs_link(
1895 &first_block, &free_list, resblks); 1897 &first_block, &free_list, resblks);
1896 if (error) 1898 if (error)
1897 goto abort_return; 1899 goto abort_return;
1898 xfs_ichgtime(tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 1900 xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
1899 xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE); 1901 xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
1900 1902
1901 error = xfs_bumplink(tp, sip); 1903 error = xfs_bumplink(tp, sip);
@@ -1933,8 +1935,7 @@ xfs_symlink(
1933 struct xfs_name *link_name, 1935 struct xfs_name *link_name,
1934 const char *target_path, 1936 const char *target_path,
1935 mode_t mode, 1937 mode_t mode,
1936 xfs_inode_t **ipp, 1938 xfs_inode_t **ipp)
1937 cred_t *credp)
1938{ 1939{
1939 xfs_mount_t *mp = dp->i_mount; 1940 xfs_mount_t *mp = dp->i_mount;
1940 xfs_trans_t *tp; 1941 xfs_trans_t *tp;
@@ -1955,7 +1956,7 @@ xfs_symlink(
1955 int byte_cnt; 1956 int byte_cnt;
1956 int n; 1957 int n;
1957 xfs_buf_t *bp; 1958 xfs_buf_t *bp;
1958 xfs_prid_t prid; 1959 prid_t prid;
1959 struct xfs_dquot *udqp, *gdqp; 1960 struct xfs_dquot *udqp, *gdqp;
1960 uint resblks; 1961 uint resblks;
1961 1962
@@ -1978,9 +1979,9 @@ xfs_symlink(
1978 1979
1979 udqp = gdqp = NULL; 1980 udqp = gdqp = NULL;
1980 if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) 1981 if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
1981 prid = dp->i_d.di_projid; 1982 prid = xfs_get_projid(dp);
1982 else 1983 else
1983 prid = (xfs_prid_t)dfltprid; 1984 prid = XFS_PROJID_DEFAULT;
1984 1985
1985 /* 1986 /*
1986 * Make sure that we have allocated dquot(s) on disk. 1987 * Make sure that we have allocated dquot(s) on disk.
@@ -2046,8 +2047,8 @@ xfs_symlink(
2046 /* 2047 /*
2047 * Allocate an inode for the symlink. 2048 * Allocate an inode for the symlink.
2048 */ 2049 */
2049 error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 2050 error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0,
2050 1, 0, credp, prid, resblks > 0, &ip, NULL); 2051 prid, resblks > 0, &ip, NULL);
2051 if (error) { 2052 if (error) {
2052 if (error == ENOSPC) 2053 if (error == ENOSPC)
2053 goto error_return; 2054 goto error_return;
@@ -2094,9 +2095,8 @@ xfs_symlink(
2094 XFS_BMAPI_WRITE | XFS_BMAPI_METADATA, 2095 XFS_BMAPI_WRITE | XFS_BMAPI_METADATA,
2095 &first_block, resblks, mval, &nmaps, 2096 &first_block, resblks, mval, &nmaps,
2096 &free_list); 2097 &free_list);
2097 if (error) { 2098 if (error)
2098 goto error1; 2099 goto error2;
2099 }
2100 2100
2101 if (resblks) 2101 if (resblks)
2102 resblks -= fs_blocks; 2102 resblks -= fs_blocks;
@@ -2128,8 +2128,8 @@ xfs_symlink(
2128 error = xfs_dir_createname(tp, dp, link_name, ip->i_ino, 2128 error = xfs_dir_createname(tp, dp, link_name, ip->i_ino,
2129 &first_block, &free_list, resblks); 2129 &first_block, &free_list, resblks);
2130 if (error) 2130 if (error)
2131 goto error1; 2131 goto error2;
2132 xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 2132 xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2133 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); 2133 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
2134 2134
2135 /* 2135 /*
@@ -2141,13 +2141,6 @@ xfs_symlink(
2141 xfs_trans_set_sync(tp); 2141 xfs_trans_set_sync(tp);
2142 } 2142 }
2143 2143
2144 /*
2145 * xfs_trans_commit normally decrements the vnode ref count
2146 * when it unlocks the inode. Since we want to return the
2147 * vnode to the caller, we bump the vnode ref count now.
2148 */
2149 IHOLD(ip);
2150
2151 error = xfs_bmap_finish(&tp, &free_list, &committed); 2144 error = xfs_bmap_finish(&tp, &free_list, &committed);
2152 if (error) { 2145 if (error) {
2153 goto error2; 2146 goto error2;
@@ -2272,7 +2265,7 @@ xfs_alloc_file_space(
2272 count = len; 2265 count = len;
2273 imapp = &imaps[0]; 2266 imapp = &imaps[0];
2274 nimaps = 1; 2267 nimaps = 1;
2275 bmapi_flag = XFS_BMAPI_WRITE | (alloc_type ? XFS_BMAPI_PREALLOC : 0); 2268 bmapi_flag = XFS_BMAPI_WRITE | alloc_type;
2276 startoffset_fsb = XFS_B_TO_FSBT(mp, offset); 2269 startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
2277 allocatesize_fsb = XFS_B_TO_FSB(mp, count); 2270 allocatesize_fsb = XFS_B_TO_FSB(mp, count);
2278 2271
@@ -2431,9 +2424,9 @@ xfs_zero_remaining_bytes(
2431 if (endoff > ip->i_size) 2424 if (endoff > ip->i_size)
2432 endoff = ip->i_size; 2425 endoff = ip->i_size;
2433 2426
2434 bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize, 2427 bp = xfs_buf_get_uncached(XFS_IS_REALTIME_INODE(ip) ?
2435 XFS_IS_REALTIME_INODE(ip) ? 2428 mp->m_rtdev_targp : mp->m_ddev_targp,
2436 mp->m_rtdev_targp : mp->m_ddev_targp); 2429 mp->m_sb.sb_blocksize, XBF_DONT_BLOCK);
2437 if (!bp) 2430 if (!bp)
2438 return XFS_ERROR(ENOMEM); 2431 return XFS_ERROR(ENOMEM);
2439 2432
@@ -2459,7 +2452,7 @@ xfs_zero_remaining_bytes(
2459 XFS_BUF_READ(bp); 2452 XFS_BUF_READ(bp);
2460 XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock)); 2453 XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock));
2461 xfsbdstrat(mp, bp); 2454 xfsbdstrat(mp, bp);
2462 error = xfs_iowait(bp); 2455 error = xfs_buf_iowait(bp);
2463 if (error) { 2456 if (error) {
2464 xfs_ioerror_alert("xfs_zero_remaining_bytes(read)", 2457 xfs_ioerror_alert("xfs_zero_remaining_bytes(read)",
2465 mp, bp, XFS_BUF_ADDR(bp)); 2458 mp, bp, XFS_BUF_ADDR(bp));
@@ -2472,7 +2465,7 @@ xfs_zero_remaining_bytes(
2472 XFS_BUF_UNREAD(bp); 2465 XFS_BUF_UNREAD(bp);
2473 XFS_BUF_WRITE(bp); 2466 XFS_BUF_WRITE(bp);
2474 xfsbdstrat(mp, bp); 2467 xfsbdstrat(mp, bp);
2475 error = xfs_iowait(bp); 2468 error = xfs_buf_iowait(bp);
2476 if (error) { 2469 if (error) {
2477 xfs_ioerror_alert("xfs_zero_remaining_bytes(write)", 2470 xfs_ioerror_alert("xfs_zero_remaining_bytes(write)",
2478 mp, bp, XFS_BUF_ADDR(bp)); 2471 mp, bp, XFS_BUF_ADDR(bp));
@@ -2711,6 +2704,7 @@ xfs_change_file_space(
2711 xfs_off_t llen; 2704 xfs_off_t llen;
2712 xfs_trans_t *tp; 2705 xfs_trans_t *tp;
2713 struct iattr iattr; 2706 struct iattr iattr;
2707 int prealloc_type;
2714 2708
2715 if (!S_ISREG(ip->i_d.di_mode)) 2709 if (!S_ISREG(ip->i_d.di_mode))
2716 return XFS_ERROR(EINVAL); 2710 return XFS_ERROR(EINVAL);
@@ -2753,12 +2747,17 @@ xfs_change_file_space(
2753 * size to be changed. 2747 * size to be changed.
2754 */ 2748 */
2755 setprealloc = clrprealloc = 0; 2749 setprealloc = clrprealloc = 0;
2750 prealloc_type = XFS_BMAPI_PREALLOC;
2756 2751
2757 switch (cmd) { 2752 switch (cmd) {
2753 case XFS_IOC_ZERO_RANGE:
2754 prealloc_type |= XFS_BMAPI_CONVERT;
2755 xfs_tosspages(ip, startoffset, startoffset + bf->l_len, 0);
2756 /* FALLTHRU */
2758 case XFS_IOC_RESVSP: 2757 case XFS_IOC_RESVSP:
2759 case XFS_IOC_RESVSP64: 2758 case XFS_IOC_RESVSP64:
2760 error = xfs_alloc_file_space(ip, startoffset, bf->l_len, 2759 error = xfs_alloc_file_space(ip, startoffset, bf->l_len,
2761 1, attr_flags); 2760 prealloc_type, attr_flags);
2762 if (error) 2761 if (error)
2763 return error; 2762 return error;
2764 setprealloc = 1; 2763 setprealloc = 1;
@@ -2827,7 +2826,7 @@ xfs_change_file_space(
2827 if (ip->i_d.di_mode & S_IXGRP) 2826 if (ip->i_d.di_mode & S_IXGRP)
2828 ip->i_d.di_mode &= ~S_ISGID; 2827 ip->i_d.di_mode &= ~S_ISGID;
2829 2828
2830 xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 2829 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2831 } 2830 }
2832 if (setprealloc) 2831 if (setprealloc)
2833 ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC; 2832 ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
@@ -2835,7 +2834,8 @@ xfs_change_file_space(
2835 ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC; 2834 ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
2836 2835
2837 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 2836 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
2838 xfs_trans_set_sync(tp); 2837 if (attr_flags & XFS_ATTR_SYNC)
2838 xfs_trans_set_sync(tp);
2839 2839
2840 error = xfs_trans_commit(tp, 0); 2840 error = xfs_trans_commit(tp, 0);
2841 2841
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index d8dfa8d0dadd..3bcd23353d6c 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -2,7 +2,6 @@
2#define _XFS_VNODEOPS_H 1 2#define _XFS_VNODEOPS_H 1
3 3
4struct attrlist_cursor_kern; 4struct attrlist_cursor_kern;
5struct cred;
6struct file; 5struct file;
7struct iattr; 6struct iattr;
8struct inode; 7struct inode;
@@ -19,6 +18,7 @@ int xfs_setattr(struct xfs_inode *ip, struct iattr *vap, int flags);
19#define XFS_ATTR_NONBLOCK 0x02 /* return EAGAIN if operation would block */ 18#define XFS_ATTR_NONBLOCK 0x02 /* return EAGAIN if operation would block */
20#define XFS_ATTR_NOLOCK 0x04 /* Don't grab any conflicting locks */ 19#define XFS_ATTR_NOLOCK 0x04 /* Don't grab any conflicting locks */
21#define XFS_ATTR_NOACL 0x08 /* Don't call xfs_acl_chmod */ 20#define XFS_ATTR_NOACL 0x08 /* Don't call xfs_acl_chmod */
21#define XFS_ATTR_SYNC 0x10 /* synchronous operation required */
22 22
23int xfs_readlink(struct xfs_inode *ip, char *link); 23int xfs_readlink(struct xfs_inode *ip, char *link);
24int xfs_release(struct xfs_inode *ip); 24int xfs_release(struct xfs_inode *ip);
@@ -26,7 +26,7 @@ int xfs_inactive(struct xfs_inode *ip);
26int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name, 26int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,
27 struct xfs_inode **ipp, struct xfs_name *ci_name); 27 struct xfs_inode **ipp, struct xfs_name *ci_name);
28int xfs_create(struct xfs_inode *dp, struct xfs_name *name, mode_t mode, 28int xfs_create(struct xfs_inode *dp, struct xfs_name *name, mode_t mode,
29 xfs_dev_t rdev, struct xfs_inode **ipp, cred_t *credp); 29 xfs_dev_t rdev, struct xfs_inode **ipp);
30int xfs_remove(struct xfs_inode *dp, struct xfs_name *name, 30int xfs_remove(struct xfs_inode *dp, struct xfs_name *name,
31 struct xfs_inode *ip); 31 struct xfs_inode *ip);
32int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip, 32int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
@@ -34,8 +34,7 @@ int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
34int xfs_readdir(struct xfs_inode *dp, void *dirent, size_t bufsize, 34int xfs_readdir(struct xfs_inode *dp, void *dirent, size_t bufsize,
35 xfs_off_t *offset, filldir_t filldir); 35 xfs_off_t *offset, filldir_t filldir);
36int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name, 36int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name,
37 const char *target_path, mode_t mode, struct xfs_inode **ipp, 37 const char *target_path, mode_t mode, struct xfs_inode **ipp);
38 cred_t *credp);
39int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state); 38int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state);
40int xfs_change_file_space(struct xfs_inode *ip, int cmd, 39int xfs_change_file_space(struct xfs_inode *ip, int cmd,
41 xfs_flock64_t *bf, xfs_off_t offset, int attr_flags); 40 xfs_flock64_t *bf, xfs_off_t offset, int attr_flags);