aboutsummaryrefslogtreecommitdiffstats
path: root/fs/xfs
diff options
context:
space:
mode:
authorJens Axboe <jaxboe@fusionio.com>2010-10-19 03:13:04 -0400
committerJens Axboe <jaxboe@fusionio.com>2010-10-19 03:13:04 -0400
commitfa251f89903d73989e2f63e13d0eaed1e07ce0da (patch)
tree3f7fe779941e3b6d67754dd7c44a32f48ea47c74 /fs/xfs
parentdd3932eddf428571762596e17b65f5dc92ca361b (diff)
parentcd07202cc8262e1669edff0d97715f3dd9260917 (diff)
Merge branch 'v2.6.36-rc8' into for-2.6.37/barrier
Conflicts: block/blk-core.c drivers/block/loop.c mm/swapfile.c Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
Diffstat (limited to 'fs/xfs')
-rw-r--r--fs/xfs/linux-2.6/xfs_aops.c13
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.c11
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.h1
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl.c9
-rw-r--r--fs/xfs/linux-2.6/xfs_iops.c2
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c9
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.c61
-rw-r--r--fs/xfs/xfs_bmap.c14
-rw-r--r--fs/xfs/xfs_fs.h4
-rw-r--r--fs/xfs/xfs_fsops.c31
-rw-r--r--fs/xfs/xfs_fsops.h2
-rw-r--r--fs/xfs/xfs_ialloc.c16
-rw-r--r--fs/xfs/xfs_inode.c49
-rw-r--r--fs/xfs/xfs_log.c7
-rw-r--r--fs/xfs/xfs_log_cil.c271
-rw-r--r--fs/xfs/xfs_log_priv.h50
-rw-r--r--fs/xfs/xfs_trans.c5
-rw-r--r--fs/xfs/xfs_trans_priv.h3
-rw-r--r--fs/xfs/xfs_vnodeops.c13
19 files changed, 332 insertions, 239 deletions
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 15412fe15c3a..b552f816de15 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -852,8 +852,8 @@ xfs_convert_page(
852 SetPageUptodate(page); 852 SetPageUptodate(page);
853 853
854 if (count) { 854 if (count) {
855 wbc->nr_to_write--; 855 if (--wbc->nr_to_write <= 0 &&
856 if (wbc->nr_to_write <= 0) 856 wbc->sync_mode == WB_SYNC_NONE)
857 done = 1; 857 done = 1;
858 } 858 }
859 xfs_start_page_writeback(page, !page_dirty, count); 859 xfs_start_page_writeback(page, !page_dirty, count);
@@ -1068,7 +1068,7 @@ xfs_vm_writepage(
1068 * by themselves. 1068 * by themselves.
1069 */ 1069 */
1070 if ((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == PF_MEMALLOC) 1070 if ((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == PF_MEMALLOC)
1071 goto out_fail; 1071 goto redirty;
1072 1072
1073 /* 1073 /*
1074 * We need a transaction if there are delalloc or unwritten buffers 1074 * We need a transaction if there are delalloc or unwritten buffers
@@ -1080,7 +1080,7 @@ xfs_vm_writepage(
1080 */ 1080 */
1081 xfs_count_page_state(page, &delalloc, &unwritten); 1081 xfs_count_page_state(page, &delalloc, &unwritten);
1082 if ((current->flags & PF_FSTRANS) && (delalloc || unwritten)) 1082 if ((current->flags & PF_FSTRANS) && (delalloc || unwritten))
1083 goto out_fail; 1083 goto redirty;
1084 1084
1085 /* Is this page beyond the end of the file? */ 1085 /* Is this page beyond the end of the file? */
1086 offset = i_size_read(inode); 1086 offset = i_size_read(inode);
@@ -1245,12 +1245,15 @@ error:
1245 if (iohead) 1245 if (iohead)
1246 xfs_cancel_ioend(iohead); 1246 xfs_cancel_ioend(iohead);
1247 1247
1248 if (err == -EAGAIN)
1249 goto redirty;
1250
1248 xfs_aops_discard_page(page); 1251 xfs_aops_discard_page(page);
1249 ClearPageUptodate(page); 1252 ClearPageUptodate(page);
1250 unlock_page(page); 1253 unlock_page(page);
1251 return err; 1254 return err;
1252 1255
1253out_fail: 1256redirty:
1254 redirty_page_for_writepage(wbc, page); 1257 redirty_page_for_writepage(wbc, page);
1255 unlock_page(page); 1258 unlock_page(page);
1256 return 0; 1259 return 0;
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index b93ea3342281..1846a0dd7035 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -440,12 +440,7 @@ _xfs_buf_find(
440 ASSERT(btp == bp->b_target); 440 ASSERT(btp == bp->b_target);
441 if (bp->b_file_offset == range_base && 441 if (bp->b_file_offset == range_base &&
442 bp->b_buffer_length == range_length) { 442 bp->b_buffer_length == range_length) {
443 /*
444 * If we look at something, bring it to the
445 * front of the list for next time.
446 */
447 atomic_inc(&bp->b_hold); 443 atomic_inc(&bp->b_hold);
448 list_move(&bp->b_hash_list, &hash->bh_list);
449 goto found; 444 goto found;
450 } 445 }
451 } 446 }
@@ -1431,8 +1426,7 @@ xfs_alloc_bufhash(
1431{ 1426{
1432 unsigned int i; 1427 unsigned int i;
1433 1428
1434 btp->bt_hashshift = external ? 3 : 8; /* 8 or 256 buckets */ 1429 btp->bt_hashshift = external ? 3 : 12; /* 8 or 4096 buckets */
1435 btp->bt_hashmask = (1 << btp->bt_hashshift) - 1;
1436 btp->bt_hash = kmem_zalloc_large((1 << btp->bt_hashshift) * 1430 btp->bt_hash = kmem_zalloc_large((1 << btp->bt_hashshift) *
1437 sizeof(xfs_bufhash_t)); 1431 sizeof(xfs_bufhash_t));
1438 for (i = 0; i < (1 << btp->bt_hashshift); i++) { 1432 for (i = 0; i < (1 << btp->bt_hashshift); i++) {
@@ -1926,7 +1920,8 @@ xfs_buf_init(void)
1926 if (!xfs_buf_zone) 1920 if (!xfs_buf_zone)
1927 goto out; 1921 goto out;
1928 1922
1929 xfslogd_workqueue = create_workqueue("xfslogd"); 1923 xfslogd_workqueue = alloc_workqueue("xfslogd",
1924 WQ_RESCUER | WQ_HIGHPRI, 1);
1930 if (!xfslogd_workqueue) 1925 if (!xfslogd_workqueue)
1931 goto out_free_buf_zone; 1926 goto out_free_buf_zone;
1932 1927
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index d533d64e2c3e..9d021c73ea52 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -128,7 +128,6 @@ typedef struct xfs_buftarg {
128 size_t bt_smask; 128 size_t bt_smask;
129 129
130 /* per device buffer hash table */ 130 /* per device buffer hash table */
131 uint bt_hashmask;
132 uint bt_hashshift; 131 uint bt_hashshift;
133 xfs_bufhash_t *bt_hash; 132 xfs_bufhash_t *bt_hash;
134 133
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 237f5ffb2ee8..3b9e626f7cd1 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -785,6 +785,8 @@ xfs_ioc_fsgetxattr(
785{ 785{
786 struct fsxattr fa; 786 struct fsxattr fa;
787 787
788 memset(&fa, 0, sizeof(struct fsxattr));
789
788 xfs_ilock(ip, XFS_ILOCK_SHARED); 790 xfs_ilock(ip, XFS_ILOCK_SHARED);
789 fa.fsx_xflags = xfs_ip2xflags(ip); 791 fa.fsx_xflags = xfs_ip2xflags(ip);
790 fa.fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog; 792 fa.fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog;
@@ -907,6 +909,13 @@ xfs_ioctl_setattr(
907 return XFS_ERROR(EIO); 909 return XFS_ERROR(EIO);
908 910
909 /* 911 /*
912 * Disallow 32bit project ids because on-disk structure
913 * is 16bit only.
914 */
915 if ((mask & FSX_PROJID) && (fa->fsx_projid > (__uint16_t)-1))
916 return XFS_ERROR(EINVAL);
917
918 /*
910 * If disk quotas is on, we make sure that the dquots do exist on disk, 919 * If disk quotas is on, we make sure that the dquots do exist on disk,
911 * before we start any other transactions. Trying to do this later 920 * before we start any other transactions. Trying to do this later
912 * is messy. We don't care to take a readlock to look at the ids 921 * is messy. We don't care to take a readlock to look at the ids
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 68be25dcd301..b1fc2a6bfe83 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -664,7 +664,7 @@ xfs_vn_fiemap(
664 fieinfo->fi_extents_max + 1; 664 fieinfo->fi_extents_max + 1;
665 bm.bmv_count = min_t(__s32, bm.bmv_count, 665 bm.bmv_count = min_t(__s32, bm.bmv_count,
666 (PAGE_SIZE * 16 / sizeof(struct getbmapx))); 666 (PAGE_SIZE * 16 / sizeof(struct getbmapx)));
667 bm.bmv_iflags = BMV_IF_PREALLOC; 667 bm.bmv_iflags = BMV_IF_PREALLOC | BMV_IF_NO_HOLES;
668 if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) 668 if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR)
669 bm.bmv_iflags |= BMV_IF_ATTRFORK; 669 bm.bmv_iflags |= BMV_IF_ATTRFORK;
670 if (!(fieinfo->fi_flags & FIEMAP_FLAG_SYNC)) 670 if (!(fieinfo->fi_flags & FIEMAP_FLAG_SYNC))
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 5fa7a30cc3f0..08fd3102128c 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1225,6 +1225,7 @@ xfs_fs_statfs(
1225 struct xfs_inode *ip = XFS_I(dentry->d_inode); 1225 struct xfs_inode *ip = XFS_I(dentry->d_inode);
1226 __uint64_t fakeinos, id; 1226 __uint64_t fakeinos, id;
1227 xfs_extlen_t lsize; 1227 xfs_extlen_t lsize;
1228 __int64_t ffree;
1228 1229
1229 statp->f_type = XFS_SB_MAGIC; 1230 statp->f_type = XFS_SB_MAGIC;
1230 statp->f_namelen = MAXNAMELEN - 1; 1231 statp->f_namelen = MAXNAMELEN - 1;
@@ -1248,7 +1249,11 @@ xfs_fs_statfs(
1248 statp->f_files = min_t(typeof(statp->f_files), 1249 statp->f_files = min_t(typeof(statp->f_files),
1249 statp->f_files, 1250 statp->f_files,
1250 mp->m_maxicount); 1251 mp->m_maxicount);
1251 statp->f_ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree); 1252
1253 /* make sure statp->f_ffree does not underflow */
1254 ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree);
1255 statp->f_ffree = max_t(__int64_t, ffree, 0);
1256
1252 spin_unlock(&mp->m_sb_lock); 1257 spin_unlock(&mp->m_sb_lock);
1253 1258
1254 if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) || 1259 if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) ||
@@ -1401,7 +1406,7 @@ xfs_fs_freeze(
1401 1406
1402 xfs_save_resvblks(mp); 1407 xfs_save_resvblks(mp);
1403 xfs_quiesce_attr(mp); 1408 xfs_quiesce_attr(mp);
1404 return -xfs_fs_log_dummy(mp); 1409 return -xfs_fs_log_dummy(mp, SYNC_WAIT);
1405} 1410}
1406 1411
1407STATIC int 1412STATIC int
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index dfcbd98d1599..81976ffed7d6 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -34,6 +34,7 @@
34#include "xfs_inode_item.h" 34#include "xfs_inode_item.h"
35#include "xfs_quota.h" 35#include "xfs_quota.h"
36#include "xfs_trace.h" 36#include "xfs_trace.h"
37#include "xfs_fsops.h"
37 38
38#include <linux/kthread.h> 39#include <linux/kthread.h>
39#include <linux/freezer.h> 40#include <linux/freezer.h>
@@ -341,38 +342,6 @@ xfs_sync_attr(
341} 342}
342 343
343STATIC int 344STATIC int
344xfs_commit_dummy_trans(
345 struct xfs_mount *mp,
346 uint flags)
347{
348 struct xfs_inode *ip = mp->m_rootip;
349 struct xfs_trans *tp;
350 int error;
351
352 /*
353 * Put a dummy transaction in the log to tell recovery
354 * that all others are OK.
355 */
356 tp = xfs_trans_alloc(mp, XFS_TRANS_DUMMY1);
357 error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
358 if (error) {
359 xfs_trans_cancel(tp, 0);
360 return error;
361 }
362
363 xfs_ilock(ip, XFS_ILOCK_EXCL);
364
365 xfs_trans_ijoin(tp, ip);
366 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
367 error = xfs_trans_commit(tp, 0);
368 xfs_iunlock(ip, XFS_ILOCK_EXCL);
369
370 /* the log force ensures this transaction is pushed to disk */
371 xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0);
372 return error;
373}
374
375STATIC int
376xfs_sync_fsdata( 345xfs_sync_fsdata(
377 struct xfs_mount *mp) 346 struct xfs_mount *mp)
378{ 347{
@@ -432,7 +401,7 @@ xfs_quiesce_data(
432 401
433 /* mark the log as covered if needed */ 402 /* mark the log as covered if needed */
434 if (xfs_log_need_covered(mp)) 403 if (xfs_log_need_covered(mp))
435 error2 = xfs_commit_dummy_trans(mp, SYNC_WAIT); 404 error2 = xfs_fs_log_dummy(mp, SYNC_WAIT);
436 405
437 /* flush data-only devices */ 406 /* flush data-only devices */
438 if (mp->m_rtdev_targp) 407 if (mp->m_rtdev_targp)
@@ -563,7 +532,7 @@ xfs_flush_inodes(
563/* 532/*
564 * Every sync period we need to unpin all items, reclaim inodes and sync 533 * Every sync period we need to unpin all items, reclaim inodes and sync
565 * disk quotas. We might need to cover the log to indicate that the 534 * disk quotas. We might need to cover the log to indicate that the
566 * filesystem is idle. 535 * filesystem is idle and not frozen.
567 */ 536 */
568STATIC void 537STATIC void
569xfs_sync_worker( 538xfs_sync_worker(
@@ -577,8 +546,9 @@ xfs_sync_worker(
577 xfs_reclaim_inodes(mp, 0); 546 xfs_reclaim_inodes(mp, 0);
578 /* dgc: errors ignored here */ 547 /* dgc: errors ignored here */
579 error = xfs_qm_sync(mp, SYNC_TRYLOCK); 548 error = xfs_qm_sync(mp, SYNC_TRYLOCK);
580 if (xfs_log_need_covered(mp)) 549 if (mp->m_super->s_frozen == SB_UNFROZEN &&
581 error = xfs_commit_dummy_trans(mp, 0); 550 xfs_log_need_covered(mp))
551 error = xfs_fs_log_dummy(mp, 0);
582 } 552 }
583 mp->m_sync_seq++; 553 mp->m_sync_seq++;
584 wake_up(&mp->m_wait_single_sync_task); 554 wake_up(&mp->m_wait_single_sync_task);
@@ -698,14 +668,11 @@ xfs_inode_set_reclaim_tag(
698 xfs_perag_put(pag); 668 xfs_perag_put(pag);
699} 669}
700 670
701void 671STATIC void
702__xfs_inode_clear_reclaim_tag( 672__xfs_inode_clear_reclaim(
703 xfs_mount_t *mp,
704 xfs_perag_t *pag, 673 xfs_perag_t *pag,
705 xfs_inode_t *ip) 674 xfs_inode_t *ip)
706{ 675{
707 radix_tree_tag_clear(&pag->pag_ici_root,
708 XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
709 pag->pag_ici_reclaimable--; 676 pag->pag_ici_reclaimable--;
710 if (!pag->pag_ici_reclaimable) { 677 if (!pag->pag_ici_reclaimable) {
711 /* clear the reclaim tag from the perag radix tree */ 678 /* clear the reclaim tag from the perag radix tree */
@@ -719,6 +686,17 @@ __xfs_inode_clear_reclaim_tag(
719 } 686 }
720} 687}
721 688
689void
690__xfs_inode_clear_reclaim_tag(
691 xfs_mount_t *mp,
692 xfs_perag_t *pag,
693 xfs_inode_t *ip)
694{
695 radix_tree_tag_clear(&pag->pag_ici_root,
696 XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
697 __xfs_inode_clear_reclaim(pag, ip);
698}
699
722/* 700/*
723 * Inodes in different states need to be treated differently, and the return 701 * Inodes in different states need to be treated differently, and the return
724 * value of xfs_iflush is not sufficient to get this right. The following table 702 * value of xfs_iflush is not sufficient to get this right. The following table
@@ -868,6 +846,7 @@ reclaim:
868 if (!radix_tree_delete(&pag->pag_ici_root, 846 if (!radix_tree_delete(&pag->pag_ici_root,
869 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino))) 847 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino)))
870 ASSERT(0); 848 ASSERT(0);
849 __xfs_inode_clear_reclaim(pag, ip);
871 write_unlock(&pag->pag_ici_lock); 850 write_unlock(&pag->pag_ici_lock);
872 851
873 /* 852 /*
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 23f14e595c18..f90dadd5a968 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -5533,12 +5533,24 @@ xfs_getbmap(
5533 map[i].br_startblock)) 5533 map[i].br_startblock))
5534 goto out_free_map; 5534 goto out_free_map;
5535 5535
5536 nexleft--;
5537 bmv->bmv_offset = 5536 bmv->bmv_offset =
5538 out[cur_ext].bmv_offset + 5537 out[cur_ext].bmv_offset +
5539 out[cur_ext].bmv_length; 5538 out[cur_ext].bmv_length;
5540 bmv->bmv_length = 5539 bmv->bmv_length =
5541 max_t(__int64_t, 0, bmvend - bmv->bmv_offset); 5540 max_t(__int64_t, 0, bmvend - bmv->bmv_offset);
5541
5542 /*
5543 * In case we don't want to return the hole,
5544 * don't increase cur_ext so that we can reuse
5545 * it in the next loop.
5546 */
5547 if ((iflags & BMV_IF_NO_HOLES) &&
5548 map[i].br_startblock == HOLESTARTBLOCK) {
5549 memset(&out[cur_ext], 0, sizeof(out[cur_ext]));
5550 continue;
5551 }
5552
5553 nexleft--;
5542 bmv->bmv_entries++; 5554 bmv->bmv_entries++;
5543 cur_ext++; 5555 cur_ext++;
5544 } 5556 }
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index 7cf7220e7d5f..87c2e9d02288 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -114,8 +114,10 @@ struct getbmapx {
114#define BMV_IF_NO_DMAPI_READ 0x2 /* Do not generate DMAPI read event */ 114#define BMV_IF_NO_DMAPI_READ 0x2 /* Do not generate DMAPI read event */
115#define BMV_IF_PREALLOC 0x4 /* rtn status BMV_OF_PREALLOC if req */ 115#define BMV_IF_PREALLOC 0x4 /* rtn status BMV_OF_PREALLOC if req */
116#define BMV_IF_DELALLOC 0x8 /* rtn status BMV_OF_DELALLOC if req */ 116#define BMV_IF_DELALLOC 0x8 /* rtn status BMV_OF_DELALLOC if req */
117#define BMV_IF_NO_HOLES 0x10 /* Do not return holes */
117#define BMV_IF_VALID \ 118#define BMV_IF_VALID \
118 (BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC|BMV_IF_DELALLOC) 119 (BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC| \
120 BMV_IF_DELALLOC|BMV_IF_NO_HOLES)
119 121
120/* bmv_oflags values - returned for each non-header segment */ 122/* bmv_oflags values - returned for each non-header segment */
121#define BMV_OF_PREALLOC 0x1 /* segment = unwritten pre-allocation */ 123#define BMV_OF_PREALLOC 0x1 /* segment = unwritten pre-allocation */
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index dbca5f5c37ba..43b1d5699335 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -604,31 +604,36 @@ out:
604 return 0; 604 return 0;
605} 605}
606 606
607/*
608 * Dump a transaction into the log that contains no real change. This is needed
609 * to be able to make the log dirty or stamp the current tail LSN into the log
610 * during the covering operation.
611 *
612 * We cannot use an inode here for this - that will push dirty state back up
613 * into the VFS and then periodic inode flushing will prevent log covering from
614 * making progress. Hence we log a field in the superblock instead.
615 */
607int 616int
608xfs_fs_log_dummy( 617xfs_fs_log_dummy(
609 xfs_mount_t *mp) 618 xfs_mount_t *mp,
619 int flags)
610{ 620{
611 xfs_trans_t *tp; 621 xfs_trans_t *tp;
612 xfs_inode_t *ip;
613 int error; 622 int error;
614 623
615 tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1, KM_SLEEP); 624 tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1, KM_SLEEP);
616 error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0); 625 error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
626 XFS_DEFAULT_LOG_COUNT);
617 if (error) { 627 if (error) {
618 xfs_trans_cancel(tp, 0); 628 xfs_trans_cancel(tp, 0);
619 return error; 629 return error;
620 } 630 }
621 631
622 ip = mp->m_rootip; 632 /* log the UUID because it is an unchanging field */
623 xfs_ilock(ip, XFS_ILOCK_EXCL); 633 xfs_mod_sb(tp, XFS_SB_UUID);
624 634 if (flags & SYNC_WAIT)
625 xfs_trans_ijoin(tp, ip); 635 xfs_trans_set_sync(tp);
626 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 636 return xfs_trans_commit(tp, 0);
627 xfs_trans_set_sync(tp);
628 error = xfs_trans_commit(tp, 0);
629
630 xfs_iunlock(ip, XFS_ILOCK_EXCL);
631 return error;
632} 637}
633 638
634int 639int
diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h
index 88435e0a77c9..a786c5212c1e 100644
--- a/fs/xfs/xfs_fsops.h
+++ b/fs/xfs/xfs_fsops.h
@@ -25,6 +25,6 @@ extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt);
25extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval, 25extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval,
26 xfs_fsop_resblks_t *outval); 26 xfs_fsop_resblks_t *outval);
27extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags); 27extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags);
28extern int xfs_fs_log_dummy(xfs_mount_t *mp); 28extern int xfs_fs_log_dummy(xfs_mount_t *mp, int flags);
29 29
30#endif /* __XFS_FSOPS_H__ */ 30#endif /* __XFS_FSOPS_H__ */
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index abf80ae1e95b..5371d2dc360e 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -1213,7 +1213,6 @@ xfs_imap_lookup(
1213 struct xfs_inobt_rec_incore rec; 1213 struct xfs_inobt_rec_incore rec;
1214 struct xfs_btree_cur *cur; 1214 struct xfs_btree_cur *cur;
1215 struct xfs_buf *agbp; 1215 struct xfs_buf *agbp;
1216 xfs_agino_t startino;
1217 int error; 1216 int error;
1218 int i; 1217 int i;
1219 1218
@@ -1227,13 +1226,13 @@ xfs_imap_lookup(
1227 } 1226 }
1228 1227
1229 /* 1228 /*
1230 * derive and lookup the exact inode record for the given agino. If the 1229 * Lookup the inode record for the given agino. If the record cannot be
1231 * record cannot be found, then it's an invalid inode number and we 1230 * found, then it's an invalid inode number and we should abort. Once
1232 * should abort. 1231 * we have a record, we need to ensure it contains the inode number
1232 * we are looking up.
1233 */ 1233 */
1234 cur = xfs_inobt_init_cursor(mp, tp, agbp, agno); 1234 cur = xfs_inobt_init_cursor(mp, tp, agbp, agno);
1235 startino = agino & ~(XFS_IALLOC_INODES(mp) - 1); 1235 error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i);
1236 error = xfs_inobt_lookup(cur, startino, XFS_LOOKUP_EQ, &i);
1237 if (!error) { 1236 if (!error) {
1238 if (i) 1237 if (i)
1239 error = xfs_inobt_get_rec(cur, &rec, &i); 1238 error = xfs_inobt_get_rec(cur, &rec, &i);
@@ -1246,6 +1245,11 @@ xfs_imap_lookup(
1246 if (error) 1245 if (error)
1247 return error; 1246 return error;
1248 1247
1248 /* check that the returned record contains the required inode */
1249 if (rec.ir_startino > agino ||
1250 rec.ir_startino + XFS_IALLOC_INODES(mp) <= agino)
1251 return EINVAL;
1252
1249 /* for untrusted inodes check it is allocated first */ 1253 /* for untrusted inodes check it is allocated first */
1250 if ((flags & XFS_IGET_UNTRUSTED) && 1254 if ((flags & XFS_IGET_UNTRUSTED) &&
1251 (rec.ir_free & XFS_INOBT_MASK(agino - rec.ir_startino))) 1255 (rec.ir_free & XFS_INOBT_MASK(agino - rec.ir_startino)))
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 68415cb4f23c..34798f391c49 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1914,6 +1914,11 @@ xfs_iunlink_remove(
1914 return 0; 1914 return 0;
1915} 1915}
1916 1916
1917/*
1918 * A big issue when freeing the inode cluster is is that we _cannot_ skip any
1919 * inodes that are in memory - they all must be marked stale and attached to
1920 * the cluster buffer.
1921 */
1917STATIC void 1922STATIC void
1918xfs_ifree_cluster( 1923xfs_ifree_cluster(
1919 xfs_inode_t *free_ip, 1924 xfs_inode_t *free_ip,
@@ -1945,8 +1950,6 @@ xfs_ifree_cluster(
1945 } 1950 }
1946 1951
1947 for (j = 0; j < nbufs; j++, inum += ninodes) { 1952 for (j = 0; j < nbufs; j++, inum += ninodes) {
1948 int found = 0;
1949
1950 blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum), 1953 blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
1951 XFS_INO_TO_AGBNO(mp, inum)); 1954 XFS_INO_TO_AGBNO(mp, inum));
1952 1955
@@ -1965,7 +1968,9 @@ xfs_ifree_cluster(
1965 /* 1968 /*
1966 * Walk the inodes already attached to the buffer and mark them 1969 * Walk the inodes already attached to the buffer and mark them
1967 * stale. These will all have the flush locks held, so an 1970 * stale. These will all have the flush locks held, so an
1968 * in-memory inode walk can't lock them. 1971 * in-memory inode walk can't lock them. By marking them all
1972 * stale first, we will not attempt to lock them in the loop
1973 * below as the XFS_ISTALE flag will be set.
1969 */ 1974 */
1970 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); 1975 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
1971 while (lip) { 1976 while (lip) {
@@ -1977,11 +1982,11 @@ xfs_ifree_cluster(
1977 &iip->ili_flush_lsn, 1982 &iip->ili_flush_lsn,
1978 &iip->ili_item.li_lsn); 1983 &iip->ili_item.li_lsn);
1979 xfs_iflags_set(iip->ili_inode, XFS_ISTALE); 1984 xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
1980 found++;
1981 } 1985 }
1982 lip = lip->li_bio_list; 1986 lip = lip->li_bio_list;
1983 } 1987 }
1984 1988
1989
1985 /* 1990 /*
1986 * For each inode in memory attempt to add it to the inode 1991 * For each inode in memory attempt to add it to the inode
1987 * buffer and set it up for being staled on buffer IO 1992 * buffer and set it up for being staled on buffer IO
@@ -1993,6 +1998,7 @@ xfs_ifree_cluster(
1993 * even trying to lock them. 1998 * even trying to lock them.
1994 */ 1999 */
1995 for (i = 0; i < ninodes; i++) { 2000 for (i = 0; i < ninodes; i++) {
2001retry:
1996 read_lock(&pag->pag_ici_lock); 2002 read_lock(&pag->pag_ici_lock);
1997 ip = radix_tree_lookup(&pag->pag_ici_root, 2003 ip = radix_tree_lookup(&pag->pag_ici_root,
1998 XFS_INO_TO_AGINO(mp, (inum + i))); 2004 XFS_INO_TO_AGINO(mp, (inum + i)));
@@ -2003,38 +2009,36 @@ xfs_ifree_cluster(
2003 continue; 2009 continue;
2004 } 2010 }
2005 2011
2006 /* don't try to lock/unlock the current inode */ 2012 /*
2013 * Don't try to lock/unlock the current inode, but we
2014 * _cannot_ skip the other inodes that we did not find
2015 * in the list attached to the buffer and are not
2016 * already marked stale. If we can't lock it, back off
2017 * and retry.
2018 */
2007 if (ip != free_ip && 2019 if (ip != free_ip &&
2008 !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { 2020 !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
2009 read_unlock(&pag->pag_ici_lock); 2021 read_unlock(&pag->pag_ici_lock);
2010 continue; 2022 delay(1);
2023 goto retry;
2011 } 2024 }
2012 read_unlock(&pag->pag_ici_lock); 2025 read_unlock(&pag->pag_ici_lock);
2013 2026
2014 if (!xfs_iflock_nowait(ip)) { 2027 xfs_iflock(ip);
2015 if (ip != free_ip)
2016 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2017 continue;
2018 }
2019
2020 xfs_iflags_set(ip, XFS_ISTALE); 2028 xfs_iflags_set(ip, XFS_ISTALE);
2021 if (xfs_inode_clean(ip)) {
2022 ASSERT(ip != free_ip);
2023 xfs_ifunlock(ip);
2024 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2025 continue;
2026 }
2027 2029
2030 /*
2031 * we don't need to attach clean inodes or those only
2032 * with unlogged changes (which we throw away, anyway).
2033 */
2028 iip = ip->i_itemp; 2034 iip = ip->i_itemp;
2029 if (!iip) { 2035 if (!iip || xfs_inode_clean(ip)) {
2030 /* inode with unlogged changes only */
2031 ASSERT(ip != free_ip); 2036 ASSERT(ip != free_ip);
2032 ip->i_update_core = 0; 2037 ip->i_update_core = 0;
2033 xfs_ifunlock(ip); 2038 xfs_ifunlock(ip);
2034 xfs_iunlock(ip, XFS_ILOCK_EXCL); 2039 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2035 continue; 2040 continue;
2036 } 2041 }
2037 found++;
2038 2042
2039 iip->ili_last_fields = iip->ili_format.ilf_fields; 2043 iip->ili_last_fields = iip->ili_format.ilf_fields;
2040 iip->ili_format.ilf_fields = 0; 2044 iip->ili_format.ilf_fields = 0;
@@ -2049,8 +2053,7 @@ xfs_ifree_cluster(
2049 xfs_iunlock(ip, XFS_ILOCK_EXCL); 2053 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2050 } 2054 }
2051 2055
2052 if (found) 2056 xfs_trans_stale_inode_buf(tp, bp);
2053 xfs_trans_stale_inode_buf(tp, bp);
2054 xfs_trans_binval(tp, bp); 2057 xfs_trans_binval(tp, bp);
2055 } 2058 }
2056 2059
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 430a8fc02c1f..ba8e36e0b4e7 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -3002,7 +3002,8 @@ _xfs_log_force(
3002 3002
3003 XFS_STATS_INC(xs_log_force); 3003 XFS_STATS_INC(xs_log_force);
3004 3004
3005 xlog_cil_push(log, 1); 3005 if (log->l_cilp)
3006 xlog_cil_force(log);
3006 3007
3007 spin_lock(&log->l_icloglock); 3008 spin_lock(&log->l_icloglock);
3008 3009
@@ -3154,7 +3155,7 @@ _xfs_log_force_lsn(
3154 XFS_STATS_INC(xs_log_force); 3155 XFS_STATS_INC(xs_log_force);
3155 3156
3156 if (log->l_cilp) { 3157 if (log->l_cilp) {
3157 lsn = xlog_cil_push_lsn(log, lsn); 3158 lsn = xlog_cil_force_lsn(log, lsn);
3158 if (lsn == NULLCOMMITLSN) 3159 if (lsn == NULLCOMMITLSN)
3159 return 0; 3160 return 0;
3160 } 3161 }
@@ -3711,7 +3712,7 @@ xfs_log_force_umount(
3711 * call below. 3712 * call below.
3712 */ 3713 */
3713 if (!logerror && (mp->m_flags & XFS_MOUNT_DELAYLOG)) 3714 if (!logerror && (mp->m_flags & XFS_MOUNT_DELAYLOG))
3714 xlog_cil_push(log, 1); 3715 xlog_cil_force(log);
3715 3716
3716 /* 3717 /*
3717 * We must hold both the GRANT lock and the LOG lock, 3718 * We must hold both the GRANT lock and the LOG lock,
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 31e4ea2d19ac..7e206fc1fa36 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -68,6 +68,7 @@ xlog_cil_init(
68 ctx->sequence = 1; 68 ctx->sequence = 1;
69 ctx->cil = cil; 69 ctx->cil = cil;
70 cil->xc_ctx = ctx; 70 cil->xc_ctx = ctx;
71 cil->xc_current_sequence = ctx->sequence;
71 72
72 cil->xc_log = log; 73 cil->xc_log = log;
73 log->l_cilp = cil; 74 log->l_cilp = cil;
@@ -269,15 +270,10 @@ xlog_cil_insert(
269static void 270static void
270xlog_cil_format_items( 271xlog_cil_format_items(
271 struct log *log, 272 struct log *log,
272 struct xfs_log_vec *log_vector, 273 struct xfs_log_vec *log_vector)
273 struct xlog_ticket *ticket,
274 xfs_lsn_t *start_lsn)
275{ 274{
276 struct xfs_log_vec *lv; 275 struct xfs_log_vec *lv;
277 276
278 if (start_lsn)
279 *start_lsn = log->l_cilp->xc_ctx->sequence;
280
281 ASSERT(log_vector); 277 ASSERT(log_vector);
282 for (lv = log_vector; lv; lv = lv->lv_next) { 278 for (lv = log_vector; lv; lv = lv->lv_next) {
283 void *ptr; 279 void *ptr;
@@ -301,9 +297,24 @@ xlog_cil_format_items(
301 ptr += vec->i_len; 297 ptr += vec->i_len;
302 } 298 }
303 ASSERT(ptr == lv->lv_buf + lv->lv_buf_len); 299 ASSERT(ptr == lv->lv_buf + lv->lv_buf_len);
300 }
301}
304 302
303static void
304xlog_cil_insert_items(
305 struct log *log,
306 struct xfs_log_vec *log_vector,
307 struct xlog_ticket *ticket,
308 xfs_lsn_t *start_lsn)
309{
310 struct xfs_log_vec *lv;
311
312 if (start_lsn)
313 *start_lsn = log->l_cilp->xc_ctx->sequence;
314
315 ASSERT(log_vector);
316 for (lv = log_vector; lv; lv = lv->lv_next)
305 xlog_cil_insert(log, ticket, lv->lv_item, lv); 317 xlog_cil_insert(log, ticket, lv->lv_item, lv);
306 }
307} 318}
308 319
309static void 320static void
@@ -321,80 +332,6 @@ xlog_cil_free_logvec(
321} 332}
322 333
323/* 334/*
324 * Commit a transaction with the given vector to the Committed Item List.
325 *
326 * To do this, we need to format the item, pin it in memory if required and
327 * account for the space used by the transaction. Once we have done that we
328 * need to release the unused reservation for the transaction, attach the
329 * transaction to the checkpoint context so we carry the busy extents through
330 * to checkpoint completion, and then unlock all the items in the transaction.
331 *
332 * For more specific information about the order of operations in
333 * xfs_log_commit_cil() please refer to the comments in
334 * xfs_trans_commit_iclog().
335 *
336 * Called with the context lock already held in read mode to lock out
337 * background commit, returns without it held once background commits are
338 * allowed again.
339 */
340int
341xfs_log_commit_cil(
342 struct xfs_mount *mp,
343 struct xfs_trans *tp,
344 struct xfs_log_vec *log_vector,
345 xfs_lsn_t *commit_lsn,
346 int flags)
347{
348 struct log *log = mp->m_log;
349 int log_flags = 0;
350 int push = 0;
351
352 if (flags & XFS_TRANS_RELEASE_LOG_RES)
353 log_flags = XFS_LOG_REL_PERM_RESERV;
354
355 if (XLOG_FORCED_SHUTDOWN(log)) {
356 xlog_cil_free_logvec(log_vector);
357 return XFS_ERROR(EIO);
358 }
359
360 /* lock out background commit */
361 down_read(&log->l_cilp->xc_ctx_lock);
362 xlog_cil_format_items(log, log_vector, tp->t_ticket, commit_lsn);
363
364 /* check we didn't blow the reservation */
365 if (tp->t_ticket->t_curr_res < 0)
366 xlog_print_tic_res(log->l_mp, tp->t_ticket);
367
368 /* attach the transaction to the CIL if it has any busy extents */
369 if (!list_empty(&tp->t_busy)) {
370 spin_lock(&log->l_cilp->xc_cil_lock);
371 list_splice_init(&tp->t_busy,
372 &log->l_cilp->xc_ctx->busy_extents);
373 spin_unlock(&log->l_cilp->xc_cil_lock);
374 }
375
376 tp->t_commit_lsn = *commit_lsn;
377 xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
378 xfs_trans_unreserve_and_mod_sb(tp);
379
380 /* check for background commit before unlock */
381 if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log))
382 push = 1;
383 up_read(&log->l_cilp->xc_ctx_lock);
384
385 /*
386 * We need to push CIL every so often so we don't cache more than we
387 * can fit in the log. The limit really is that a checkpoint can't be
388 * more than half the log (the current checkpoint is not allowed to
389 * overwrite the previous checkpoint), but commit latency and memory
390 * usage limit this to a smaller size in most cases.
391 */
392 if (push)
393 xlog_cil_push(log, 0);
394 return 0;
395}
396
397/*
398 * Mark all items committed and clear busy extents. We free the log vector 335 * Mark all items committed and clear busy extents. We free the log vector
399 * chains in a separate pass so that we unpin the log items as quickly as 336 * chains in a separate pass so that we unpin the log items as quickly as
400 * possible. 337 * possible.
@@ -427,13 +364,23 @@ xlog_cil_committed(
427} 364}
428 365
429/* 366/*
430 * Push the Committed Item List to the log. If the push_now flag is not set, 367 * Push the Committed Item List to the log. If @push_seq flag is zero, then it
431 * then it is a background flush and so we can chose to ignore it. 368 * is a background flush and so we can chose to ignore it. Otherwise, if the
369 * current sequence is the same as @push_seq we need to do a flush. If
370 * @push_seq is less than the current sequence, then it has already been
371 * flushed and we don't need to do anything - the caller will wait for it to
372 * complete if necessary.
373 *
374 * @push_seq is a value rather than a flag because that allows us to do an
375 * unlocked check of the sequence number for a match. Hence we can allows log
376 * forces to run racily and not issue pushes for the same sequence twice. If we
377 * get a race between multiple pushes for the same sequence they will block on
378 * the first one and then abort, hence avoiding needless pushes.
432 */ 379 */
433int 380STATIC int
434xlog_cil_push( 381xlog_cil_push(
435 struct log *log, 382 struct log *log,
436 int push_now) 383 xfs_lsn_t push_seq)
437{ 384{
438 struct xfs_cil *cil = log->l_cilp; 385 struct xfs_cil *cil = log->l_cilp;
439 struct xfs_log_vec *lv; 386 struct xfs_log_vec *lv;
@@ -453,12 +400,20 @@ xlog_cil_push(
453 if (!cil) 400 if (!cil)
454 return 0; 401 return 0;
455 402
403 ASSERT(!push_seq || push_seq <= cil->xc_ctx->sequence);
404
456 new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS); 405 new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS);
457 new_ctx->ticket = xlog_cil_ticket_alloc(log); 406 new_ctx->ticket = xlog_cil_ticket_alloc(log);
458 407
459 /* lock out transaction commit, but don't block on background push */ 408 /*
409 * Lock out transaction commit, but don't block for background pushes
410 * unless we are well over the CIL space limit. See the definition of
411 * XLOG_CIL_HARD_SPACE_LIMIT() for the full explanation of the logic
412 * used here.
413 */
460 if (!down_write_trylock(&cil->xc_ctx_lock)) { 414 if (!down_write_trylock(&cil->xc_ctx_lock)) {
461 if (!push_now) 415 if (!push_seq &&
416 cil->xc_ctx->space_used < XLOG_CIL_HARD_SPACE_LIMIT(log))
462 goto out_free_ticket; 417 goto out_free_ticket;
463 down_write(&cil->xc_ctx_lock); 418 down_write(&cil->xc_ctx_lock);
464 } 419 }
@@ -469,7 +424,11 @@ xlog_cil_push(
469 goto out_skip; 424 goto out_skip;
470 425
471 /* check for spurious background flush */ 426 /* check for spurious background flush */
472 if (!push_now && cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) 427 if (!push_seq && cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
428 goto out_skip;
429
430 /* check for a previously pushed seqeunce */
431 if (push_seq && push_seq < cil->xc_ctx->sequence)
473 goto out_skip; 432 goto out_skip;
474 433
475 /* 434 /*
@@ -515,6 +474,13 @@ xlog_cil_push(
515 cil->xc_ctx = new_ctx; 474 cil->xc_ctx = new_ctx;
516 475
517 /* 476 /*
477 * mirror the new sequence into the cil structure so that we can do
478 * unlocked checks against the current sequence in log forces without
479 * risking deferencing a freed context pointer.
480 */
481 cil->xc_current_sequence = new_ctx->sequence;
482
483 /*
518 * The switch is now done, so we can drop the context lock and move out 484 * The switch is now done, so we can drop the context lock and move out
519 * of a shared context. We can't just go straight to the commit record, 485 * of a shared context. We can't just go straight to the commit record,
520 * though - we need to synchronise with previous and future commits so 486 * though - we need to synchronise with previous and future commits so
@@ -626,6 +592,102 @@ out_abort:
626} 592}
627 593
628/* 594/*
595 * Commit a transaction with the given vector to the Committed Item List.
596 *
597 * To do this, we need to format the item, pin it in memory if required and
598 * account for the space used by the transaction. Once we have done that we
599 * need to release the unused reservation for the transaction, attach the
600 * transaction to the checkpoint context so we carry the busy extents through
601 * to checkpoint completion, and then unlock all the items in the transaction.
602 *
603 * For more specific information about the order of operations in
604 * xfs_log_commit_cil() please refer to the comments in
605 * xfs_trans_commit_iclog().
606 *
607 * Called with the context lock already held in read mode to lock out
608 * background commit, returns without it held once background commits are
609 * allowed again.
610 */
611int
612xfs_log_commit_cil(
613 struct xfs_mount *mp,
614 struct xfs_trans *tp,
615 struct xfs_log_vec *log_vector,
616 xfs_lsn_t *commit_lsn,
617 int flags)
618{
619 struct log *log = mp->m_log;
620 int log_flags = 0;
621 int push = 0;
622
623 if (flags & XFS_TRANS_RELEASE_LOG_RES)
624 log_flags = XFS_LOG_REL_PERM_RESERV;
625
626 if (XLOG_FORCED_SHUTDOWN(log)) {
627 xlog_cil_free_logvec(log_vector);
628 return XFS_ERROR(EIO);
629 }
630
631 /*
632 * do all the hard work of formatting items (including memory
633 * allocation) outside the CIL context lock. This prevents stalling CIL
634 * pushes when we are low on memory and a transaction commit spends a
635 * lot of time in memory reclaim.
636 */
637 xlog_cil_format_items(log, log_vector);
638
639 /* lock out background commit */
640 down_read(&log->l_cilp->xc_ctx_lock);
641 xlog_cil_insert_items(log, log_vector, tp->t_ticket, commit_lsn);
642
643 /* check we didn't blow the reservation */
644 if (tp->t_ticket->t_curr_res < 0)
645 xlog_print_tic_res(log->l_mp, tp->t_ticket);
646
647 /* attach the transaction to the CIL if it has any busy extents */
648 if (!list_empty(&tp->t_busy)) {
649 spin_lock(&log->l_cilp->xc_cil_lock);
650 list_splice_init(&tp->t_busy,
651 &log->l_cilp->xc_ctx->busy_extents);
652 spin_unlock(&log->l_cilp->xc_cil_lock);
653 }
654
655 tp->t_commit_lsn = *commit_lsn;
656 xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
657 xfs_trans_unreserve_and_mod_sb(tp);
658
659 /*
660 * Once all the items of the transaction have been copied to the CIL,
661 * the items can be unlocked and freed.
662 *
663 * This needs to be done before we drop the CIL context lock because we
664 * have to update state in the log items and unlock them before they go
665 * to disk. If we don't, then the CIL checkpoint can race with us and
666 * we can run checkpoint completion before we've updated and unlocked
667 * the log items. This affects (at least) processing of stale buffers,
668 * inodes and EFIs.
669 */
670 xfs_trans_free_items(tp, *commit_lsn, 0);
671
672 /* check for background commit before unlock */
673 if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log))
674 push = 1;
675
676 up_read(&log->l_cilp->xc_ctx_lock);
677
678 /*
679 * We need to push CIL every so often so we don't cache more than we
680 * can fit in the log. The limit really is that a checkpoint can't be
681 * more than half the log (the current checkpoint is not allowed to
682 * overwrite the previous checkpoint), but commit latency and memory
683 * usage limit this to a smaller size in most cases.
684 */
685 if (push)
686 xlog_cil_push(log, 0);
687 return 0;
688}
689
690/*
629 * Conditionally push the CIL based on the sequence passed in. 691 * Conditionally push the CIL based on the sequence passed in.
630 * 692 *
631 * We only need to push if we haven't already pushed the sequence 693 * We only need to push if we haven't already pushed the sequence
@@ -639,39 +701,34 @@ out_abort:
639 * commit lsn is there. It'll be empty, so this is broken for now. 701 * commit lsn is there. It'll be empty, so this is broken for now.
640 */ 702 */
641xfs_lsn_t 703xfs_lsn_t
642xlog_cil_push_lsn( 704xlog_cil_force_lsn(
643 struct log *log, 705 struct log *log,
644 xfs_lsn_t push_seq) 706 xfs_lsn_t sequence)
645{ 707{
646 struct xfs_cil *cil = log->l_cilp; 708 struct xfs_cil *cil = log->l_cilp;
647 struct xfs_cil_ctx *ctx; 709 struct xfs_cil_ctx *ctx;
648 xfs_lsn_t commit_lsn = NULLCOMMITLSN; 710 xfs_lsn_t commit_lsn = NULLCOMMITLSN;
649 711
650restart: 712 ASSERT(sequence <= cil->xc_current_sequence);
651 down_write(&cil->xc_ctx_lock); 713
652 ASSERT(push_seq <= cil->xc_ctx->sequence); 714 /*
653 715 * check to see if we need to force out the current context.
654 /* check to see if we need to force out the current context */ 716 * xlog_cil_push() handles racing pushes for the same sequence,
655 if (push_seq == cil->xc_ctx->sequence) { 717 * so no need to deal with it here.
656 up_write(&cil->xc_ctx_lock); 718 */
657 xlog_cil_push(log, 1); 719 if (sequence == cil->xc_current_sequence)
658 goto restart; 720 xlog_cil_push(log, sequence);
659 }
660 721
661 /* 722 /*
662 * See if we can find a previous sequence still committing. 723 * See if we can find a previous sequence still committing.
663 * We can drop the flush lock as soon as we have the cil lock
664 * because we are now only comparing contexts protected by
665 * the cil lock.
666 *
667 * We need to wait for all previous sequence commits to complete 724 * We need to wait for all previous sequence commits to complete
668 * before allowing the force of push_seq to go ahead. Hence block 725 * before allowing the force of push_seq to go ahead. Hence block
669 * on commits for those as well. 726 * on commits for those as well.
670 */ 727 */
728restart:
671 spin_lock(&cil->xc_cil_lock); 729 spin_lock(&cil->xc_cil_lock);
672 up_write(&cil->xc_ctx_lock);
673 list_for_each_entry(ctx, &cil->xc_committing, committing) { 730 list_for_each_entry(ctx, &cil->xc_committing, committing) {
674 if (ctx->sequence > push_seq) 731 if (ctx->sequence > sequence)
675 continue; 732 continue;
676 if (!ctx->commit_lsn) { 733 if (!ctx->commit_lsn) {
677 /* 734 /*
@@ -681,7 +738,7 @@ restart:
681 sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0); 738 sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0);
682 goto restart; 739 goto restart;
683 } 740 }
684 if (ctx->sequence != push_seq) 741 if (ctx->sequence != sequence)
685 continue; 742 continue;
686 /* found it! */ 743 /* found it! */
687 commit_lsn = ctx->commit_lsn; 744 commit_lsn = ctx->commit_lsn;
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 8c072618965c..edcdfe01617f 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -422,16 +422,17 @@ struct xfs_cil {
422 struct rw_semaphore xc_ctx_lock; 422 struct rw_semaphore xc_ctx_lock;
423 struct list_head xc_committing; 423 struct list_head xc_committing;
424 sv_t xc_commit_wait; 424 sv_t xc_commit_wait;
425 xfs_lsn_t xc_current_sequence;
425}; 426};
426 427
427/* 428/*
428 * The amount of log space we should the CIL to aggregate is difficult to size. 429 * The amount of log space we allow the CIL to aggregate is difficult to size.
429 * Whatever we chose we have to make we can get a reservation for the log space 430 * Whatever we choose, we have to make sure we can get a reservation for the
430 * effectively, that it is large enough to capture sufficient relogging to 431 * log space effectively, that it is large enough to capture sufficient
431 * reduce log buffer IO significantly, but it is not too large for the log or 432 * relogging to reduce log buffer IO significantly, but it is not too large for
432 * induces too much latency when writing out through the iclogs. We track both 433 * the log or induces too much latency when writing out through the iclogs. We
433 * space consumed and the number of vectors in the checkpoint context, so we 434 * track both space consumed and the number of vectors in the checkpoint
434 * need to decide which to use for limiting. 435 * context, so we need to decide which to use for limiting.
435 * 436 *
436 * Every log buffer we write out during a push needs a header reserved, which 437 * Every log buffer we write out during a push needs a header reserved, which
437 * is at least one sector and more for v2 logs. Hence we need a reservation of 438 * is at least one sector and more for v2 logs. Hence we need a reservation of
@@ -458,16 +459,21 @@ struct xfs_cil {
458 * checkpoint transaction ticket is specific to the checkpoint context, rather 459 * checkpoint transaction ticket is specific to the checkpoint context, rather
459 * than the CIL itself. 460 * than the CIL itself.
460 * 461 *
461 * With dynamic reservations, we can basically make up arbitrary limits for the 462 * With dynamic reservations, we can effectively make up arbitrary limits for
462 * checkpoint size so long as they don't violate any other size rules. Hence 463 * the checkpoint size so long as they don't violate any other size rules.
463 * the initial maximum size for the checkpoint transaction will be set to a 464 * Recovery imposes a rule that no transaction exceed half the log, so we are
464 * quarter of the log or 8MB, which ever is smaller. 8MB is an arbitrary limit 465 * limited by that. Furthermore, the log transaction reservation subsystem
465 * right now based on the latency of writing out a large amount of data through 466 * tries to keep 25% of the log free, so we need to keep below that limit or we
466 * the circular iclog buffers. 467 * risk running out of free log space to start any new transactions.
468 *
469 * In order to keep background CIL push efficient, we will set a lower
470 * threshold at which background pushing is attempted without blocking current
471 * transaction commits. A separate, higher bound defines when CIL pushes are
472 * enforced to ensure we stay within our maximum checkpoint size bounds.
473 * threshold, yet give us plenty of space for aggregation on large logs.
467 */ 474 */
468 475#define XLOG_CIL_SPACE_LIMIT(log) (log->l_logsize >> 3)
469#define XLOG_CIL_SPACE_LIMIT(log) \ 476#define XLOG_CIL_HARD_SPACE_LIMIT(log) (3 * (log->l_logsize >> 4))
470 (min((log->l_logsize >> 2), (8 * 1024 * 1024)))
471 477
472/* 478/*
473 * The reservation head lsn is not made up of a cycle number and block number. 479 * The reservation head lsn is not made up of a cycle number and block number.
@@ -562,8 +568,16 @@ int xlog_cil_init(struct log *log);
562void xlog_cil_init_post_recovery(struct log *log); 568void xlog_cil_init_post_recovery(struct log *log);
563void xlog_cil_destroy(struct log *log); 569void xlog_cil_destroy(struct log *log);
564 570
565int xlog_cil_push(struct log *log, int push_now); 571/*
566xfs_lsn_t xlog_cil_push_lsn(struct log *log, xfs_lsn_t push_sequence); 572 * CIL force routines
573 */
574xfs_lsn_t xlog_cil_force_lsn(struct log *log, xfs_lsn_t sequence);
575
576static inline void
577xlog_cil_force(struct log *log)
578{
579 xlog_cil_force_lsn(log, log->l_cilp->xc_current_sequence);
580}
567 581
568/* 582/*
569 * Unmount record type is used as a pseudo transaction type for the ticket. 583 * Unmount record type is used as a pseudo transaction type for the ticket.
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index fdca7416c754..1c47edaea0d2 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -1167,7 +1167,7 @@ xfs_trans_del_item(
1167 * Unlock all of the items of a transaction and free all the descriptors 1167 * Unlock all of the items of a transaction and free all the descriptors
1168 * of that transaction. 1168 * of that transaction.
1169 */ 1169 */
1170STATIC void 1170void
1171xfs_trans_free_items( 1171xfs_trans_free_items(
1172 struct xfs_trans *tp, 1172 struct xfs_trans *tp,
1173 xfs_lsn_t commit_lsn, 1173 xfs_lsn_t commit_lsn,
@@ -1653,9 +1653,6 @@ xfs_trans_commit_cil(
1653 return error; 1653 return error;
1654 1654
1655 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); 1655 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
1656
1657 /* xfs_trans_free_items() unlocks them first */
1658 xfs_trans_free_items(tp, *commit_lsn, 0);
1659 xfs_trans_free(tp); 1656 xfs_trans_free(tp);
1660 return 0; 1657 return 0;
1661} 1658}
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index e2d93d8ead7b..62da86c90de5 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -25,7 +25,8 @@ struct xfs_trans;
25 25
26void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *); 26void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *);
27void xfs_trans_del_item(struct xfs_log_item *); 27void xfs_trans_del_item(struct xfs_log_item *);
28 28void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn,
29 int flags);
29void xfs_trans_item_committed(struct xfs_log_item *lip, 30void xfs_trans_item_committed(struct xfs_log_item *lip,
30 xfs_lsn_t commit_lsn, int aborted); 31 xfs_lsn_t commit_lsn, int aborted);
31void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp); 32void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp);
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 66d585c6917c..4c7c7bfb2b2f 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -2299,15 +2299,22 @@ xfs_alloc_file_space(
2299 e = allocatesize_fsb; 2299 e = allocatesize_fsb;
2300 } 2300 }
2301 2301
2302 /*
2303 * The transaction reservation is limited to a 32-bit block
2304 * count, hence we need to limit the number of blocks we are
2305 * trying to reserve to avoid an overflow. We can't allocate
2306 * more than @nimaps extents, and an extent is limited on disk
2307 * to MAXEXTLEN (21 bits), so use that to enforce the limit.
2308 */
2309 resblks = min_t(xfs_fileoff_t, (e - s), (MAXEXTLEN * nimaps));
2302 if (unlikely(rt)) { 2310 if (unlikely(rt)) {
2303 resrtextents = qblocks = (uint)(e - s); 2311 resrtextents = qblocks = resblks;
2304 resrtextents /= mp->m_sb.sb_rextsize; 2312 resrtextents /= mp->m_sb.sb_rextsize;
2305 resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); 2313 resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
2306 quota_flag = XFS_QMOPT_RES_RTBLKS; 2314 quota_flag = XFS_QMOPT_RES_RTBLKS;
2307 } else { 2315 } else {
2308 resrtextents = 0; 2316 resrtextents = 0;
2309 resblks = qblocks = \ 2317 resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks);
2310 XFS_DIOSTRAT_SPACE_RES(mp, (uint)(e - s));
2311 quota_flag = XFS_QMOPT_RES_REGBLKS; 2318 quota_flag = XFS_QMOPT_RES_REGBLKS;
2312 } 2319 }
2313 2320