Diffstat (limited to 'fs/xfs')
-rw-r--r--  fs/xfs/Makefile | 1
-rw-r--r--  fs/xfs/linux-2.6/sv.h | 59
-rw-r--r--  fs/xfs/linux-2.6/xfs_aops.c | 425
-rw-r--r--  fs/xfs/linux-2.6/xfs_aops.h | 16
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.c | 238
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.h | 29
-rw-r--r--  fs/xfs/linux-2.6/xfs_discard.c | 193
-rw-r--r--  fs/xfs/linux-2.6/xfs_discard.h | 8
-rw-r--r--  fs/xfs/linux-2.6/xfs_export.c | 12
-rw-r--r--  fs/xfs/linux-2.6/xfs_file.c | 587
-rw-r--r--  fs/xfs/linux-2.6/xfs_ioctl.c | 34
-rw-r--r--  fs/xfs/linux-2.6/xfs_iops.c | 54
-rw-r--r--  fs/xfs/linux-2.6/xfs_linux.h | 1
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.c | 31
-rw-r--r--  fs/xfs/linux-2.6/xfs_sync.c | 103
-rw-r--r--  fs/xfs/linux-2.6/xfs_sysctl.c | 23
-rw-r--r--  fs/xfs/linux-2.6/xfs_trace.h | 92
-rw-r--r--  fs/xfs/quota/xfs_dquot.c | 1
-rw-r--r--  fs/xfs/quota/xfs_qm.c | 46
-rw-r--r--  fs/xfs/support/debug.c | 112
-rw-r--r--  fs/xfs/support/debug.h | 25
-rw-r--r--  fs/xfs/xfs_ag.h | 2
-rw-r--r--  fs/xfs/xfs_alloc.c | 361
-rw-r--r--  fs/xfs/xfs_alloc.h | 41
-rw-r--r--  fs/xfs/xfs_attr_leaf.c | 4
-rw-r--r--  fs/xfs/xfs_bmap.c | 61
-rw-r--r--  fs/xfs/xfs_btree.c | 9
-rw-r--r--  fs/xfs/xfs_buf_item.c | 191
-rw-r--r--  fs/xfs/xfs_buf_item.h | 11
-rw-r--r--  fs/xfs/xfs_error.c | 31
-rw-r--r--  fs/xfs/xfs_error.h | 18
-rw-r--r--  fs/xfs/xfs_extfree_item.c | 96
-rw-r--r--  fs/xfs/xfs_extfree_item.h | 11
-rw-r--r--  fs/xfs/xfs_fsops.c | 14
-rw-r--r--  fs/xfs/xfs_fsops.h | 2
-rw-r--r--  fs/xfs/xfs_iget.c | 79
-rw-r--r--  fs/xfs/xfs_inode.c | 54
-rw-r--r--  fs/xfs/xfs_inode.h | 15
-rw-r--r--  fs/xfs/xfs_inode_item.c | 90
-rw-r--r--  fs/xfs/xfs_iomap.c | 238
-rw-r--r--  fs/xfs/xfs_iomap.h | 27
-rw-r--r--  fs/xfs/xfs_log.c | 741
-rw-r--r--  fs/xfs/xfs_log.h | 2
-rw-r--r--  fs/xfs/xfs_log_cil.c | 32
-rw-r--r--  fs/xfs/xfs_log_priv.h | 127
-rw-r--r--  fs/xfs/xfs_log_recover.c | 622
-rw-r--r--  fs/xfs/xfs_mount.c | 23
-rw-r--r--  fs/xfs/xfs_mount.h | 14
-rw-r--r--  fs/xfs/xfs_trans.c | 122
-rw-r--r--  fs/xfs/xfs_trans.h | 2
-rw-r--r--  fs/xfs/xfs_trans_ail.c | 232
-rw-r--r--  fs/xfs/xfs_trans_extfree.c | 8
-rw-r--r--  fs/xfs/xfs_trans_priv.h | 35
-rw-r--r--  fs/xfs/xfs_vnodeops.c | 61
54 files changed, 2942 insertions, 2524 deletions
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 0dce969d6cad..faca44997099 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -98,6 +98,7 @@ xfs-y += $(addprefix $(XFS_LINUX)/, \
98 kmem.o \ 98 kmem.o \
99 xfs_aops.o \ 99 xfs_aops.o \
100 xfs_buf.o \ 100 xfs_buf.o \
101 xfs_discard.o \
101 xfs_export.o \ 102 xfs_export.o \
102 xfs_file.o \ 103 xfs_file.o \
103 xfs_fs_subr.o \ 104 xfs_fs_subr.o \
diff --git a/fs/xfs/linux-2.6/sv.h b/fs/xfs/linux-2.6/sv.h
deleted file mode 100644
index 4dfc7c370819..000000000000
--- a/fs/xfs/linux-2.6/sv.h
+++ /dev/null
@@ -1,59 +0,0 @@
1/*
2 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_SUPPORT_SV_H__
19#define __XFS_SUPPORT_SV_H__
20
21#include <linux/wait.h>
22#include <linux/sched.h>
23#include <linux/spinlock.h>
24
25/*
26 * Synchronisation variables.
27 *
28 * (Parameters "pri", "svf" and "rts" are not implemented)
29 */
30
31typedef struct sv_s {
32 wait_queue_head_t waiters;
33} sv_t;
34
35static inline void _sv_wait(sv_t *sv, spinlock_t *lock)
36{
37 DECLARE_WAITQUEUE(wait, current);
38
39 add_wait_queue_exclusive(&sv->waiters, &wait);
40 __set_current_state(TASK_UNINTERRUPTIBLE);
41 spin_unlock(lock);
42
43 schedule();
44
45 remove_wait_queue(&sv->waiters, &wait);
46}
47
48#define sv_init(sv,flag,name) \
49 init_waitqueue_head(&(sv)->waiters)
50#define sv_destroy(sv) \
51 /*NOTHING*/
52#define sv_wait(sv, pri, lock, s) \
53 _sv_wait(sv, lock)
54#define sv_signal(sv) \
55 wake_up(&(sv)->waiters)
56#define sv_broadcast(sv) \
57 wake_up_all(&(sv)->waiters)
58
59#endif /* __XFS_SUPPORT_SV_H__ */
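The sv_t type removed above was only a thin wrapper around a bare wait_queue_head_t, so its callers can be converted to open-coded wait queue usage with no behavioural change. A minimal sketch of the equivalent pattern, assuming a caller that used to pair sv_wait() with sv_signal() under a spinlock (struct foo and its fields are illustrative, not taken from this patch):

    #include <linux/wait.h>
    #include <linux/sched.h>
    #include <linux/spinlock.h>

    struct foo {
            spinlock_t              lock;
            wait_queue_head_t       wait;   /* was: sv_t */
    };

    /* called with f->lock held; drops it, exactly as _sv_wait() did */
    static void foo_wait(struct foo *f)
    {
            DECLARE_WAITQUEUE(wq, current);

            add_wait_queue_exclusive(&f->wait, &wq);
            __set_current_state(TASK_UNINTERRUPTIBLE);
            spin_unlock(&f->lock);

            schedule();

            remove_wait_queue(&f->wait, &wq);
    }

    static void foo_wake(struct foo *f)
    {
            wake_up(&f->wait);      /* was: sv_signal(); use wake_up_all() for sv_broadcast() */
    }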
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 691f61223ed6..ec7bbb5645b6 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -38,15 +38,6 @@
38#include <linux/pagevec.h> 38#include <linux/pagevec.h>
39#include <linux/writeback.h> 39#include <linux/writeback.h>
40 40
41/*
42 * Types of I/O for bmap clustering and I/O completion tracking.
43 */
44enum {
45 IO_READ, /* mapping for a read */
46 IO_DELAY, /* mapping covers delalloc region */
47 IO_UNWRITTEN, /* mapping covers allocated but uninitialized data */
48 IO_NEW /* just allocated */
49};
50 41
51/* 42/*
52 * Prime number of hash buckets since address is used as the key. 43 * Prime number of hash buckets since address is used as the key.
@@ -182,9 +173,6 @@ xfs_setfilesize(
182 xfs_inode_t *ip = XFS_I(ioend->io_inode); 173 xfs_inode_t *ip = XFS_I(ioend->io_inode);
183 xfs_fsize_t isize; 174 xfs_fsize_t isize;
184 175
185 ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
186 ASSERT(ioend->io_type != IO_READ);
187
188 if (unlikely(ioend->io_error)) 176 if (unlikely(ioend->io_error))
189 return 0; 177 return 0;
190 178
@@ -244,10 +232,8 @@ xfs_end_io(
244 * We might have to update the on-disk file size after extending 232 * We might have to update the on-disk file size after extending
245 * writes. 233 * writes.
246 */ 234 */
247 if (ioend->io_type != IO_READ) { 235 error = xfs_setfilesize(ioend);
248 error = xfs_setfilesize(ioend); 236 ASSERT(!error || error == EAGAIN);
249 ASSERT(!error || error == EAGAIN);
250 }
251 237
252 /* 238 /*
253 * If we didn't complete processing of the ioend, requeue it to the 239 * If we didn't complete processing of the ioend, requeue it to the
@@ -318,14 +304,63 @@ STATIC int
318xfs_map_blocks( 304xfs_map_blocks(
319 struct inode *inode, 305 struct inode *inode,
320 loff_t offset, 306 loff_t offset,
321 ssize_t count,
322 struct xfs_bmbt_irec *imap, 307 struct xfs_bmbt_irec *imap,
323 int flags) 308 int type,
309 int nonblocking)
324{ 310{
325 int nmaps = 1; 311 struct xfs_inode *ip = XFS_I(inode);
326 int new = 0; 312 struct xfs_mount *mp = ip->i_mount;
313 ssize_t count = 1 << inode->i_blkbits;
314 xfs_fileoff_t offset_fsb, end_fsb;
315 int error = 0;
316 int bmapi_flags = XFS_BMAPI_ENTIRE;
317 int nimaps = 1;
318
319 if (XFS_FORCED_SHUTDOWN(mp))
320 return -XFS_ERROR(EIO);
321
322 if (type == IO_UNWRITTEN)
323 bmapi_flags |= XFS_BMAPI_IGSTATE;
324
325 if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
326 if (nonblocking)
327 return -XFS_ERROR(EAGAIN);
328 xfs_ilock(ip, XFS_ILOCK_SHARED);
329 }
327 330
328 return -xfs_iomap(XFS_I(inode), offset, count, flags, imap, &nmaps, &new); 331 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
332 (ip->i_df.if_flags & XFS_IFEXTENTS));
333 ASSERT(offset <= mp->m_maxioffset);
334
335 if (offset + count > mp->m_maxioffset)
336 count = mp->m_maxioffset - offset;
337 end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
338 offset_fsb = XFS_B_TO_FSBT(mp, offset);
339 error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb,
340 bmapi_flags, NULL, 0, imap, &nimaps, NULL);
341 xfs_iunlock(ip, XFS_ILOCK_SHARED);
342
343 if (error)
344 return -XFS_ERROR(error);
345
346 if (type == IO_DELALLOC &&
347 (!nimaps || isnullstartblock(imap->br_startblock))) {
348 error = xfs_iomap_write_allocate(ip, offset, count, imap);
349 if (!error)
350 trace_xfs_map_blocks_alloc(ip, offset, count, type, imap);
351 return -XFS_ERROR(error);
352 }
353
354#ifdef DEBUG
355 if (type == IO_UNWRITTEN) {
356 ASSERT(nimaps);
357 ASSERT(imap->br_startblock != HOLESTARTBLOCK);
358 ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
359 }
360#endif
361 if (nimaps)
362 trace_xfs_map_blocks_found(ip, offset, count, type, imap);
363 return 0;
329} 364}
330 365
331STATIC int 366STATIC int
@@ -380,26 +415,18 @@ xfs_submit_ioend_bio(
380 415
381 submit_bio(wbc->sync_mode == WB_SYNC_ALL ? 416 submit_bio(wbc->sync_mode == WB_SYNC_ALL ?
382 WRITE_SYNC_PLUG : WRITE, bio); 417 WRITE_SYNC_PLUG : WRITE, bio);
383 ASSERT(!bio_flagged(bio, BIO_EOPNOTSUPP));
384 bio_put(bio);
385} 418}
386 419
387STATIC struct bio * 420STATIC struct bio *
388xfs_alloc_ioend_bio( 421xfs_alloc_ioend_bio(
389 struct buffer_head *bh) 422 struct buffer_head *bh)
390{ 423{
391 struct bio *bio;
392 int nvecs = bio_get_nr_vecs(bh->b_bdev); 424 int nvecs = bio_get_nr_vecs(bh->b_bdev);
393 425 struct bio *bio = bio_alloc(GFP_NOIO, nvecs);
394 do {
395 bio = bio_alloc(GFP_NOIO, nvecs);
396 nvecs >>= 1;
397 } while (!bio);
398 426
399 ASSERT(bio->bi_private == NULL); 427 ASSERT(bio->bi_private == NULL);
400 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); 428 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
401 bio->bi_bdev = bh->b_bdev; 429 bio->bi_bdev = bh->b_bdev;
402 bio_get(bio);
403 return bio; 430 return bio;
404} 431}
405 432
@@ -470,9 +497,8 @@ xfs_submit_ioend(
470 /* Pass 1 - start writeback */ 497 /* Pass 1 - start writeback */
471 do { 498 do {
472 next = ioend->io_list; 499 next = ioend->io_list;
473 for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) { 500 for (bh = ioend->io_buffer_head; bh; bh = bh->b_private)
474 xfs_start_buffer_writeback(bh); 501 xfs_start_buffer_writeback(bh);
475 }
476 } while ((ioend = next) != NULL); 502 } while ((ioend = next) != NULL);
477 503
478 /* Pass 2 - submit I/O */ 504 /* Pass 2 - submit I/O */
@@ -600,117 +626,13 @@ xfs_map_at_offset(
600 ASSERT(imap->br_startblock != HOLESTARTBLOCK); 626 ASSERT(imap->br_startblock != HOLESTARTBLOCK);
601 ASSERT(imap->br_startblock != DELAYSTARTBLOCK); 627 ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
602 628
603 lock_buffer(bh);
604 xfs_map_buffer(inode, bh, imap, offset); 629 xfs_map_buffer(inode, bh, imap, offset);
605 bh->b_bdev = xfs_find_bdev_for_inode(inode);
606 set_buffer_mapped(bh); 630 set_buffer_mapped(bh);
607 clear_buffer_delay(bh); 631 clear_buffer_delay(bh);
608 clear_buffer_unwritten(bh); 632 clear_buffer_unwritten(bh);
609} 633}
610 634
611/* 635/*
612 * Look for a page at index that is suitable for clustering.
613 */
614STATIC unsigned int
615xfs_probe_page(
616 struct page *page,
617 unsigned int pg_offset)
618{
619 struct buffer_head *bh, *head;
620 int ret = 0;
621
622 if (PageWriteback(page))
623 return 0;
624 if (!PageDirty(page))
625 return 0;
626 if (!page->mapping)
627 return 0;
628 if (!page_has_buffers(page))
629 return 0;
630
631 bh = head = page_buffers(page);
632 do {
633 if (!buffer_uptodate(bh))
634 break;
635 if (!buffer_mapped(bh))
636 break;
637 ret += bh->b_size;
638 if (ret >= pg_offset)
639 break;
640 } while ((bh = bh->b_this_page) != head);
641
642 return ret;
643}
644
645STATIC size_t
646xfs_probe_cluster(
647 struct inode *inode,
648 struct page *startpage,
649 struct buffer_head *bh,
650 struct buffer_head *head)
651{
652 struct pagevec pvec;
653 pgoff_t tindex, tlast, tloff;
654 size_t total = 0;
655 int done = 0, i;
656
657 /* First sum forwards in this page */
658 do {
659 if (!buffer_uptodate(bh) || !buffer_mapped(bh))
660 return total;
661 total += bh->b_size;
662 } while ((bh = bh->b_this_page) != head);
663
664 /* if we reached the end of the page, sum forwards in following pages */
665 tlast = i_size_read(inode) >> PAGE_CACHE_SHIFT;
666 tindex = startpage->index + 1;
667
668 /* Prune this back to avoid pathological behavior */
669 tloff = min(tlast, startpage->index + 64);
670
671 pagevec_init(&pvec, 0);
672 while (!done && tindex <= tloff) {
673 unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1);
674
675 if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len))
676 break;
677
678 for (i = 0; i < pagevec_count(&pvec); i++) {
679 struct page *page = pvec.pages[i];
680 size_t pg_offset, pg_len = 0;
681
682 if (tindex == tlast) {
683 pg_offset =
684 i_size_read(inode) & (PAGE_CACHE_SIZE - 1);
685 if (!pg_offset) {
686 done = 1;
687 break;
688 }
689 } else
690 pg_offset = PAGE_CACHE_SIZE;
691
692 if (page->index == tindex && trylock_page(page)) {
693 pg_len = xfs_probe_page(page, pg_offset);
694 unlock_page(page);
695 }
696
697 if (!pg_len) {
698 done = 1;
699 break;
700 }
701
702 total += pg_len;
703 tindex++;
704 }
705
706 pagevec_release(&pvec);
707 cond_resched();
708 }
709
710 return total;
711}
712
713/*
714 * Test if a given page is suitable for writing as part of an unwritten 636 * Test if a given page is suitable for writing as part of an unwritten
715 * or delayed allocate extent. 637 * or delayed allocate extent.
716 */ 638 */
@@ -731,9 +653,9 @@ xfs_is_delayed_page(
731 if (buffer_unwritten(bh)) 653 if (buffer_unwritten(bh))
732 acceptable = (type == IO_UNWRITTEN); 654 acceptable = (type == IO_UNWRITTEN);
733 else if (buffer_delay(bh)) 655 else if (buffer_delay(bh))
734 acceptable = (type == IO_DELAY); 656 acceptable = (type == IO_DELALLOC);
735 else if (buffer_dirty(bh) && buffer_mapped(bh)) 657 else if (buffer_dirty(bh) && buffer_mapped(bh))
736 acceptable = (type == IO_NEW); 658 acceptable = (type == IO_OVERWRITE);
737 else 659 else
738 break; 660 break;
739 } while ((bh = bh->b_this_page) != head); 661 } while ((bh = bh->b_this_page) != head);
@@ -758,8 +680,7 @@ xfs_convert_page(
758 loff_t tindex, 680 loff_t tindex,
759 struct xfs_bmbt_irec *imap, 681 struct xfs_bmbt_irec *imap,
760 xfs_ioend_t **ioendp, 682 xfs_ioend_t **ioendp,
761 struct writeback_control *wbc, 683 struct writeback_control *wbc)
762 int all_bh)
763{ 684{
764 struct buffer_head *bh, *head; 685 struct buffer_head *bh, *head;
765 xfs_off_t end_offset; 686 xfs_off_t end_offset;
@@ -814,37 +735,30 @@ xfs_convert_page(
814 continue; 735 continue;
815 } 736 }
816 737
817 if (buffer_unwritten(bh) || buffer_delay(bh)) { 738 if (buffer_unwritten(bh) || buffer_delay(bh) ||
739 buffer_mapped(bh)) {
818 if (buffer_unwritten(bh)) 740 if (buffer_unwritten(bh))
819 type = IO_UNWRITTEN; 741 type = IO_UNWRITTEN;
742 else if (buffer_delay(bh))
743 type = IO_DELALLOC;
820 else 744 else
821 type = IO_DELAY; 745 type = IO_OVERWRITE;
822 746
823 if (!xfs_imap_valid(inode, imap, offset)) { 747 if (!xfs_imap_valid(inode, imap, offset)) {
824 done = 1; 748 done = 1;
825 continue; 749 continue;
826 } 750 }
827 751
828 ASSERT(imap->br_startblock != HOLESTARTBLOCK); 752 lock_buffer(bh);
829 ASSERT(imap->br_startblock != DELAYSTARTBLOCK); 753 if (type != IO_OVERWRITE)
830 754 xfs_map_at_offset(inode, bh, imap, offset);
831 xfs_map_at_offset(inode, bh, imap, offset);
832 xfs_add_to_ioend(inode, bh, offset, type, 755 xfs_add_to_ioend(inode, bh, offset, type,
833 ioendp, done); 756 ioendp, done);
834 757
835 page_dirty--; 758 page_dirty--;
836 count++; 759 count++;
837 } else { 760 } else {
838 type = IO_NEW; 761 done = 1;
839 if (buffer_mapped(bh) && all_bh) {
840 lock_buffer(bh);
841 xfs_add_to_ioend(inode, bh, offset,
842 type, ioendp, done);
843 count++;
844 page_dirty--;
845 } else {
846 done = 1;
847 }
848 } 762 }
849 } while (offset += len, (bh = bh->b_this_page) != head); 763 } while (offset += len, (bh = bh->b_this_page) != head);
850 764
@@ -876,7 +790,6 @@ xfs_cluster_write(
876 struct xfs_bmbt_irec *imap, 790 struct xfs_bmbt_irec *imap,
877 xfs_ioend_t **ioendp, 791 xfs_ioend_t **ioendp,
878 struct writeback_control *wbc, 792 struct writeback_control *wbc,
879 int all_bh,
880 pgoff_t tlast) 793 pgoff_t tlast)
881{ 794{
882 struct pagevec pvec; 795 struct pagevec pvec;
@@ -891,7 +804,7 @@ xfs_cluster_write(
891 804
892 for (i = 0; i < pagevec_count(&pvec); i++) { 805 for (i = 0; i < pagevec_count(&pvec); i++) {
893 done = xfs_convert_page(inode, pvec.pages[i], tindex++, 806 done = xfs_convert_page(inode, pvec.pages[i], tindex++,
894 imap, ioendp, wbc, all_bh); 807 imap, ioendp, wbc);
895 if (done) 808 if (done)
896 break; 809 break;
897 } 810 }
@@ -935,7 +848,7 @@ xfs_aops_discard_page(
935 struct buffer_head *bh, *head; 848 struct buffer_head *bh, *head;
936 loff_t offset = page_offset(page); 849 loff_t offset = page_offset(page);
937 850
938 if (!xfs_is_delayed_page(page, IO_DELAY)) 851 if (!xfs_is_delayed_page(page, IO_DELALLOC))
939 goto out_invalidate; 852 goto out_invalidate;
940 853
941 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 854 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
@@ -1002,10 +915,10 @@ xfs_vm_writepage(
1002 unsigned int type; 915 unsigned int type;
1003 __uint64_t end_offset; 916 __uint64_t end_offset;
1004 pgoff_t end_index, last_index; 917 pgoff_t end_index, last_index;
1005 ssize_t size, len; 918 ssize_t len;
1006 int flags, err, imap_valid = 0, uptodate = 1; 919 int err, imap_valid = 0, uptodate = 1;
1007 int count = 0; 920 int count = 0;
1008 int all_bh = 0; 921 int nonblocking = 0;
1009 922
1010 trace_xfs_writepage(inode, page, 0); 923 trace_xfs_writepage(inode, page, 0);
1011 924
@@ -1056,10 +969,14 @@ xfs_vm_writepage(
1056 969
1057 bh = head = page_buffers(page); 970 bh = head = page_buffers(page);
1058 offset = page_offset(page); 971 offset = page_offset(page);
1059 flags = BMAPI_READ; 972 type = IO_OVERWRITE;
1060 type = IO_NEW; 973
974 if (wbc->sync_mode == WB_SYNC_NONE && wbc->nonblocking)
975 nonblocking = 1;
1061 976
1062 do { 977 do {
978 int new_ioend = 0;
979
1063 if (offset >= end_offset) 980 if (offset >= end_offset)
1064 break; 981 break;
1065 if (!buffer_uptodate(bh)) 982 if (!buffer_uptodate(bh))
@@ -1076,90 +993,54 @@ xfs_vm_writepage(
1076 continue; 993 continue;
1077 } 994 }
1078 995
1079 if (imap_valid) 996 if (buffer_unwritten(bh)) {
1080 imap_valid = xfs_imap_valid(inode, &imap, offset); 997 if (type != IO_UNWRITTEN) {
1081
1082 if (buffer_unwritten(bh) || buffer_delay(bh)) {
1083 int new_ioend = 0;
1084
1085 /*
1086 * Make sure we don't use a read-only iomap
1087 */
1088 if (flags == BMAPI_READ)
1089 imap_valid = 0;
1090
1091 if (buffer_unwritten(bh)) {
1092 type = IO_UNWRITTEN; 998 type = IO_UNWRITTEN;
1093 flags = BMAPI_WRITE | BMAPI_IGNSTATE; 999 imap_valid = 0;
1094 } else if (buffer_delay(bh)) {
1095 type = IO_DELAY;
1096 flags = BMAPI_ALLOCATE;
1097
1098 if (wbc->sync_mode == WB_SYNC_NONE)
1099 flags |= BMAPI_TRYLOCK;
1100 }
1101
1102 if (!imap_valid) {
1103 /*
1104 * If we didn't have a valid mapping then we
1105 * need to ensure that we put the new mapping
1106 * in a new ioend structure. This needs to be
1107 * done to ensure that the ioends correctly
1108 * reflect the block mappings at io completion
1109 * for unwritten extent conversion.
1110 */
1111 new_ioend = 1;
1112 err = xfs_map_blocks(inode, offset, len,
1113 &imap, flags);
1114 if (err)
1115 goto error;
1116 imap_valid = xfs_imap_valid(inode, &imap,
1117 offset);
1118 } 1000 }
1119 if (imap_valid) { 1001 } else if (buffer_delay(bh)) {
1120 xfs_map_at_offset(inode, bh, &imap, offset); 1002 if (type != IO_DELALLOC) {
1121 xfs_add_to_ioend(inode, bh, offset, type, 1003 type = IO_DELALLOC;
1122 &ioend, new_ioend); 1004 imap_valid = 0;
1123 count++;
1124 } 1005 }
1125 } else if (buffer_uptodate(bh)) { 1006 } else if (buffer_uptodate(bh)) {
1126 /* 1007 if (type != IO_OVERWRITE) {
1127 * we got here because the buffer is already mapped. 1008 type = IO_OVERWRITE;
1128 * That means it must already have extents allocated 1009 imap_valid = 0;
1129 * underneath it. Map the extent by reading it.
1130 */
1131 if (!imap_valid || flags != BMAPI_READ) {
1132 flags = BMAPI_READ;
1133 size = xfs_probe_cluster(inode, page, bh, head);
1134 err = xfs_map_blocks(inode, offset, size,
1135 &imap, flags);
1136 if (err)
1137 goto error;
1138 imap_valid = xfs_imap_valid(inode, &imap,
1139 offset);
1140 } 1010 }
1011 } else {
1012 if (PageUptodate(page)) {
1013 ASSERT(buffer_mapped(bh));
1014 imap_valid = 0;
1015 }
1016 continue;
1017 }
1141 1018
1019 if (imap_valid)
1020 imap_valid = xfs_imap_valid(inode, &imap, offset);
1021 if (!imap_valid) {
1142 /* 1022 /*
1143 * We set the type to IO_NEW in case we are doing a 1023 * If we didn't have a valid mapping then we need to
1144 * small write at EOF that is extending the file but 1024 * put the new mapping into a separate ioend structure.
1145 * without needing an allocation. We need to update the 1025 * This ensures non-contiguous extents always have
1146 * file size on I/O completion in this case so it is 1026 * separate ioends, which is particularly important
1147 * the same case as having just allocated a new extent 1027 * for unwritten extent conversion at I/O completion
1148 * that we are writing into for the first time. 1028 * time.
1149 */ 1029 */
1150 type = IO_NEW; 1030 new_ioend = 1;
1151 if (trylock_buffer(bh)) { 1031 err = xfs_map_blocks(inode, offset, &imap, type,
1152 if (imap_valid) 1032 nonblocking);
1153 all_bh = 1; 1033 if (err)
1154 xfs_add_to_ioend(inode, bh, offset, type, 1034 goto error;
1155 &ioend, !imap_valid); 1035 imap_valid = xfs_imap_valid(inode, &imap, offset);
1156 count++; 1036 }
1157 } else { 1037 if (imap_valid) {
1158 imap_valid = 0; 1038 lock_buffer(bh);
1159 } 1039 if (type != IO_OVERWRITE)
1160 } else if (PageUptodate(page)) { 1040 xfs_map_at_offset(inode, bh, &imap, offset);
1161 ASSERT(buffer_mapped(bh)); 1041 xfs_add_to_ioend(inode, bh, offset, type, &ioend,
1162 imap_valid = 0; 1042 new_ioend);
1043 count++;
1163 } 1044 }
1164 1045
1165 if (!iohead) 1046 if (!iohead)
@@ -1188,7 +1069,7 @@ xfs_vm_writepage(
1188 end_index = last_index; 1069 end_index = last_index;
1189 1070
1190 xfs_cluster_write(inode, page->index + 1, &imap, &ioend, 1071 xfs_cluster_write(inode, page->index + 1, &imap, &ioend,
1191 wbc, all_bh, end_index); 1072 wbc, end_index);
1192 } 1073 }
1193 1074
1194 if (iohead) 1075 if (iohead)
@@ -1257,13 +1138,19 @@ __xfs_get_blocks(
1257 int create, 1138 int create,
1258 int direct) 1139 int direct)
1259{ 1140{
1260 int flags = create ? BMAPI_WRITE : BMAPI_READ; 1141 struct xfs_inode *ip = XFS_I(inode);
1142 struct xfs_mount *mp = ip->i_mount;
1143 xfs_fileoff_t offset_fsb, end_fsb;
1144 int error = 0;
1145 int lockmode = 0;
1261 struct xfs_bmbt_irec imap; 1146 struct xfs_bmbt_irec imap;
1147 int nimaps = 1;
1262 xfs_off_t offset; 1148 xfs_off_t offset;
1263 ssize_t size; 1149 ssize_t size;
1264 int nimap = 1;
1265 int new = 0; 1150 int new = 0;
1266 int error; 1151
1152 if (XFS_FORCED_SHUTDOWN(mp))
1153 return -XFS_ERROR(EIO);
1267 1154
1268 offset = (xfs_off_t)iblock << inode->i_blkbits; 1155 offset = (xfs_off_t)iblock << inode->i_blkbits;
1269 ASSERT(bh_result->b_size >= (1 << inode->i_blkbits)); 1156 ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
@@ -1272,15 +1159,45 @@ __xfs_get_blocks(
1272 if (!create && direct && offset >= i_size_read(inode)) 1159 if (!create && direct && offset >= i_size_read(inode))
1273 return 0; 1160 return 0;
1274 1161
1275 if (direct && create) 1162 if (create) {
1276 flags |= BMAPI_DIRECT; 1163 lockmode = XFS_ILOCK_EXCL;
1164 xfs_ilock(ip, lockmode);
1165 } else {
1166 lockmode = xfs_ilock_map_shared(ip);
1167 }
1168
1169 ASSERT(offset <= mp->m_maxioffset);
1170 if (offset + size > mp->m_maxioffset)
1171 size = mp->m_maxioffset - offset;
1172 end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
1173 offset_fsb = XFS_B_TO_FSBT(mp, offset);
1277 1174
1278 error = xfs_iomap(XFS_I(inode), offset, size, flags, &imap, &nimap, 1175 error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb,
1279 &new); 1176 XFS_BMAPI_ENTIRE, NULL, 0, &imap, &nimaps, NULL);
1280 if (error) 1177 if (error)
1281 return -error; 1178 goto out_unlock;
1282 if (nimap == 0) 1179
1283 return 0; 1180 if (create &&
1181 (!nimaps ||
1182 (imap.br_startblock == HOLESTARTBLOCK ||
1183 imap.br_startblock == DELAYSTARTBLOCK))) {
1184 if (direct) {
1185 error = xfs_iomap_write_direct(ip, offset, size,
1186 &imap, nimaps);
1187 } else {
1188 error = xfs_iomap_write_delay(ip, offset, size, &imap);
1189 }
1190 if (error)
1191 goto out_unlock;
1192
1193 trace_xfs_get_blocks_alloc(ip, offset, size, 0, &imap);
1194 } else if (nimaps) {
1195 trace_xfs_get_blocks_found(ip, offset, size, 0, &imap);
1196 } else {
1197 trace_xfs_get_blocks_notfound(ip, offset, size);
1198 goto out_unlock;
1199 }
1200 xfs_iunlock(ip, lockmode);
1284 1201
1285 if (imap.br_startblock != HOLESTARTBLOCK && 1202 if (imap.br_startblock != HOLESTARTBLOCK &&
1286 imap.br_startblock != DELAYSTARTBLOCK) { 1203 imap.br_startblock != DELAYSTARTBLOCK) {
@@ -1347,6 +1264,10 @@ __xfs_get_blocks(
1347 } 1264 }
1348 1265
1349 return 0; 1266 return 0;
1267
1268out_unlock:
1269 xfs_iunlock(ip, lockmode);
1270 return -error;
1350} 1271}
1351 1272
1352int 1273int
@@ -1434,7 +1355,7 @@ xfs_vm_direct_IO(
1434 ssize_t ret; 1355 ssize_t ret;
1435 1356
1436 if (rw & WRITE) { 1357 if (rw & WRITE) {
1437 iocb->private = xfs_alloc_ioend(inode, IO_NEW); 1358 iocb->private = xfs_alloc_ioend(inode, IO_DIRECT);
1438 1359
1439 ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, 1360 ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
1440 offset, nr_segs, 1361 offset, nr_segs,
diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h
index c5057fb6237a..71f721e1a71f 100644
--- a/fs/xfs/linux-2.6/xfs_aops.h
+++ b/fs/xfs/linux-2.6/xfs_aops.h
@@ -23,6 +23,22 @@ extern struct workqueue_struct *xfsconvertd_workqueue;
23extern mempool_t *xfs_ioend_pool; 23extern mempool_t *xfs_ioend_pool;
24 24
25/* 25/*
26 * Types of I/O for bmap clustering and I/O completion tracking.
27 */
28enum {
29 IO_DIRECT = 0, /* special case for direct I/O ioends */
30 IO_DELALLOC, /* mapping covers delalloc region */
31 IO_UNWRITTEN, /* mapping covers allocated but uninitialized data */
32 IO_OVERWRITE, /* mapping covers already allocated extent */
33};
34
35#define XFS_IO_TYPES \
36 { 0, "" }, \
37 { IO_DELALLOC, "delalloc" }, \
38 { IO_UNWRITTEN, "unwritten" }, \
39 { IO_OVERWRITE, "overwrite" }
40
41/*
26 * xfs_ioend struct manages large extent writes for XFS. 42 * xfs_ioend struct manages large extent writes for XFS.
27 * It can manage several multi-page bio's at once. 43 * It can manage several multi-page bio's at once.
28 */ 44 */
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 4c5deb6e9e31..ac1c7e8378dd 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -44,12 +44,7 @@
44 44
45static kmem_zone_t *xfs_buf_zone; 45static kmem_zone_t *xfs_buf_zone;
46STATIC int xfsbufd(void *); 46STATIC int xfsbufd(void *);
47STATIC int xfsbufd_wakeup(struct shrinker *, int, gfp_t);
48STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int); 47STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int);
49static struct shrinker xfs_buf_shake = {
50 .shrink = xfsbufd_wakeup,
51 .seeks = DEFAULT_SEEKS,
52};
53 48
54static struct workqueue_struct *xfslogd_workqueue; 49static struct workqueue_struct *xfslogd_workqueue;
55struct workqueue_struct *xfsdatad_workqueue; 50struct workqueue_struct *xfsdatad_workqueue;
@@ -168,8 +163,79 @@ test_page_region(
168} 163}
169 164
170/* 165/*
171 * Internal xfs_buf_t object manipulation 166 * xfs_buf_lru_add - add a buffer to the LRU.
167 *
168 * The LRU takes a new reference to the buffer so that it will only be freed
169 * once the shrinker takes the buffer off the LRU.
172 */ 170 */
171STATIC void
172xfs_buf_lru_add(
173 struct xfs_buf *bp)
174{
175 struct xfs_buftarg *btp = bp->b_target;
176
177 spin_lock(&btp->bt_lru_lock);
178 if (list_empty(&bp->b_lru)) {
179 atomic_inc(&bp->b_hold);
180 list_add_tail(&bp->b_lru, &btp->bt_lru);
181 btp->bt_lru_nr++;
182 }
183 spin_unlock(&btp->bt_lru_lock);
184}
185
186/*
187 * xfs_buf_lru_del - remove a buffer from the LRU
188 *
189 * The unlocked check is safe here because it only occurs when there are no
190 * b_lru_ref counts left on the inode under the pag->pag_buf_lock. It is there
191 * to optimise the shrinker removing the buffer from the LRU and calling
192 * xfs_buf_free(). i.e. it removes an unnecessary round trip on the
193 * bt_lru_lock.
194 */
195STATIC void
196xfs_buf_lru_del(
197 struct xfs_buf *bp)
198{
199 struct xfs_buftarg *btp = bp->b_target;
200
201 if (list_empty(&bp->b_lru))
202 return;
203
204 spin_lock(&btp->bt_lru_lock);
205 if (!list_empty(&bp->b_lru)) {
206 list_del_init(&bp->b_lru);
207 btp->bt_lru_nr--;
208 }
209 spin_unlock(&btp->bt_lru_lock);
210}
211
212/*
213 * When we mark a buffer stale, we remove the buffer from the LRU and clear the
214 * b_lru_ref count so that the buffer is freed immediately when the buffer
215 * reference count falls to zero. If the buffer is already on the LRU, we need
216 * to remove the reference that LRU holds on the buffer.
217 *
218 * This prevents build-up of stale buffers on the LRU.
219 */
220void
221xfs_buf_stale(
222 struct xfs_buf *bp)
223{
224 bp->b_flags |= XBF_STALE;
225 atomic_set(&(bp)->b_lru_ref, 0);
226 if (!list_empty(&bp->b_lru)) {
227 struct xfs_buftarg *btp = bp->b_target;
228
229 spin_lock(&btp->bt_lru_lock);
230 if (!list_empty(&bp->b_lru)) {
231 list_del_init(&bp->b_lru);
232 btp->bt_lru_nr--;
233 atomic_dec(&bp->b_hold);
234 }
235 spin_unlock(&btp->bt_lru_lock);
236 }
237 ASSERT(atomic_read(&bp->b_hold) >= 1);
238}
173 239
174STATIC void 240STATIC void
175_xfs_buf_initialize( 241_xfs_buf_initialize(
@@ -186,7 +252,9 @@ _xfs_buf_initialize(
186 252
187 memset(bp, 0, sizeof(xfs_buf_t)); 253 memset(bp, 0, sizeof(xfs_buf_t));
188 atomic_set(&bp->b_hold, 1); 254 atomic_set(&bp->b_hold, 1);
255 atomic_set(&bp->b_lru_ref, 1);
189 init_completion(&bp->b_iowait); 256 init_completion(&bp->b_iowait);
257 INIT_LIST_HEAD(&bp->b_lru);
190 INIT_LIST_HEAD(&bp->b_list); 258 INIT_LIST_HEAD(&bp->b_list);
191 RB_CLEAR_NODE(&bp->b_rbnode); 259 RB_CLEAR_NODE(&bp->b_rbnode);
192 sema_init(&bp->b_sema, 0); /* held, no waiters */ 260 sema_init(&bp->b_sema, 0); /* held, no waiters */
@@ -262,6 +330,8 @@ xfs_buf_free(
262{ 330{
263 trace_xfs_buf_free(bp, _RET_IP_); 331 trace_xfs_buf_free(bp, _RET_IP_);
264 332
333 ASSERT(list_empty(&bp->b_lru));
334
265 if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) { 335 if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) {
266 uint i; 336 uint i;
267 337
@@ -337,7 +407,6 @@ _xfs_buf_lookup_pages(
337 __func__, gfp_mask); 407 __func__, gfp_mask);
338 408
339 XFS_STATS_INC(xb_page_retries); 409 XFS_STATS_INC(xb_page_retries);
340 xfsbufd_wakeup(NULL, 0, gfp_mask);
341 congestion_wait(BLK_RW_ASYNC, HZ/50); 410 congestion_wait(BLK_RW_ASYNC, HZ/50);
342 goto retry; 411 goto retry;
343 } 412 }
@@ -827,7 +896,7 @@ xfs_buf_rele(
827 trace_xfs_buf_rele(bp, _RET_IP_); 896 trace_xfs_buf_rele(bp, _RET_IP_);
828 897
829 if (!pag) { 898 if (!pag) {
830 ASSERT(!bp->b_relse); 899 ASSERT(list_empty(&bp->b_lru));
831 ASSERT(RB_EMPTY_NODE(&bp->b_rbnode)); 900 ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
832 if (atomic_dec_and_test(&bp->b_hold)) 901 if (atomic_dec_and_test(&bp->b_hold))
833 xfs_buf_free(bp); 902 xfs_buf_free(bp);
@@ -835,13 +904,15 @@ xfs_buf_rele(
835 } 904 }
836 905
837 ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode)); 906 ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));
907
838 ASSERT(atomic_read(&bp->b_hold) > 0); 908 ASSERT(atomic_read(&bp->b_hold) > 0);
839 if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) { 909 if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
840 if (bp->b_relse) { 910 if (!(bp->b_flags & XBF_STALE) &&
841 atomic_inc(&bp->b_hold); 911 atomic_read(&bp->b_lru_ref)) {
912 xfs_buf_lru_add(bp);
842 spin_unlock(&pag->pag_buf_lock); 913 spin_unlock(&pag->pag_buf_lock);
843 bp->b_relse(bp);
844 } else { 914 } else {
915 xfs_buf_lru_del(bp);
845 ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q))); 916 ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)));
846 rb_erase(&bp->b_rbnode, &pag->pag_buf_tree); 917 rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
847 spin_unlock(&pag->pag_buf_lock); 918 spin_unlock(&pag->pag_buf_lock);
@@ -1438,51 +1509,84 @@ xfs_buf_iomove(
1438 */ 1509 */
1439 1510
1440/* 1511/*
1441 * Wait for any bufs with callbacks that have been submitted but 1512 * Wait for any bufs with callbacks that have been submitted but have not yet
1442 * have not yet returned... walk the hash list for the target. 1513 * returned. These buffers will have an elevated hold count, so wait on those
1514 * while freeing all the buffers only held by the LRU.
1443 */ 1515 */
1444void 1516void
1445xfs_wait_buftarg( 1517xfs_wait_buftarg(
1446 struct xfs_buftarg *btp) 1518 struct xfs_buftarg *btp)
1447{ 1519{
1448 struct xfs_perag *pag; 1520 struct xfs_buf *bp;
1449 uint i;
1450 1521
1451 for (i = 0; i < btp->bt_mount->m_sb.sb_agcount; i++) { 1522restart:
1452 pag = xfs_perag_get(btp->bt_mount, i); 1523 spin_lock(&btp->bt_lru_lock);
1453 spin_lock(&pag->pag_buf_lock); 1524 while (!list_empty(&btp->bt_lru)) {
1454 while (rb_first(&pag->pag_buf_tree)) { 1525 bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
1455 spin_unlock(&pag->pag_buf_lock); 1526 if (atomic_read(&bp->b_hold) > 1) {
1527 spin_unlock(&btp->bt_lru_lock);
1456 delay(100); 1528 delay(100);
1457 spin_lock(&pag->pag_buf_lock); 1529 goto restart;
1458 } 1530 }
1459 spin_unlock(&pag->pag_buf_lock); 1531 /*
1460 xfs_perag_put(pag); 1532 * clear the LRU reference count so the buffer doesn't get
1533 * ignored in xfs_buf_rele().
1534 */
1535 atomic_set(&bp->b_lru_ref, 0);
1536 spin_unlock(&btp->bt_lru_lock);
1537 xfs_buf_rele(bp);
1538 spin_lock(&btp->bt_lru_lock);
1461 } 1539 }
1540 spin_unlock(&btp->bt_lru_lock);
1462} 1541}
1463 1542
1464/* 1543int
1465 * buftarg list for delwrite queue processing 1544xfs_buftarg_shrink(
1466 */ 1545 struct shrinker *shrink,
1467static LIST_HEAD(xfs_buftarg_list); 1546 int nr_to_scan,
1468static DEFINE_SPINLOCK(xfs_buftarg_lock); 1547 gfp_t mask)
1469
1470STATIC void
1471xfs_register_buftarg(
1472 xfs_buftarg_t *btp)
1473{ 1548{
1474 spin_lock(&xfs_buftarg_lock); 1549 struct xfs_buftarg *btp = container_of(shrink,
1475 list_add(&btp->bt_list, &xfs_buftarg_list); 1550 struct xfs_buftarg, bt_shrinker);
1476 spin_unlock(&xfs_buftarg_lock); 1551 struct xfs_buf *bp;
1477} 1552 LIST_HEAD(dispose);
1478 1553
1479STATIC void 1554 if (!nr_to_scan)
1480xfs_unregister_buftarg( 1555 return btp->bt_lru_nr;
1481 xfs_buftarg_t *btp) 1556
1482{ 1557 spin_lock(&btp->bt_lru_lock);
1483 spin_lock(&xfs_buftarg_lock); 1558 while (!list_empty(&btp->bt_lru)) {
1484 list_del(&btp->bt_list); 1559 if (nr_to_scan-- <= 0)
1485 spin_unlock(&xfs_buftarg_lock); 1560 break;
1561
1562 bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
1563
1564 /*
1565 * Decrement the b_lru_ref count unless the value is already
1566 * zero. If the value is already zero, we need to reclaim the
1567 * buffer, otherwise it gets another trip through the LRU.
1568 */
1569 if (atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
1570 list_move_tail(&bp->b_lru, &btp->bt_lru);
1571 continue;
1572 }
1573
1574 /*
1575 * remove the buffer from the LRU now to avoid needing another
1576 * lock round trip inside xfs_buf_rele().
1577 */
1578 list_move(&bp->b_lru, &dispose);
1579 btp->bt_lru_nr--;
1580 }
1581 spin_unlock(&btp->bt_lru_lock);
1582
1583 while (!list_empty(&dispose)) {
1584 bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
1585 list_del_init(&bp->b_lru);
1586 xfs_buf_rele(bp);
1587 }
1588
1589 return btp->bt_lru_nr;
1486} 1590}
1487 1591
1488void 1592void
@@ -1490,17 +1594,14 @@ xfs_free_buftarg(
1490 struct xfs_mount *mp, 1594 struct xfs_mount *mp,
1491 struct xfs_buftarg *btp) 1595 struct xfs_buftarg *btp)
1492{ 1596{
1597 unregister_shrinker(&btp->bt_shrinker);
1598
1493 xfs_flush_buftarg(btp, 1); 1599 xfs_flush_buftarg(btp, 1);
1494 if (mp->m_flags & XFS_MOUNT_BARRIER) 1600 if (mp->m_flags & XFS_MOUNT_BARRIER)
1495 xfs_blkdev_issue_flush(btp); 1601 xfs_blkdev_issue_flush(btp);
1496 iput(btp->bt_mapping->host); 1602 iput(btp->bt_mapping->host);
1497 1603
1498 /* Unregister the buftarg first so that we don't get a
1499 * wakeup finding a non-existent task
1500 */
1501 xfs_unregister_buftarg(btp);
1502 kthread_stop(btp->bt_task); 1604 kthread_stop(btp->bt_task);
1503
1504 kmem_free(btp); 1605 kmem_free(btp);
1505} 1606}
1506 1607
@@ -1597,20 +1698,13 @@ xfs_alloc_delwrite_queue(
1597 xfs_buftarg_t *btp, 1698 xfs_buftarg_t *btp,
1598 const char *fsname) 1699 const char *fsname)
1599{ 1700{
1600 int error = 0;
1601
1602 INIT_LIST_HEAD(&btp->bt_list);
1603 INIT_LIST_HEAD(&btp->bt_delwrite_queue); 1701 INIT_LIST_HEAD(&btp->bt_delwrite_queue);
1604 spin_lock_init(&btp->bt_delwrite_lock); 1702 spin_lock_init(&btp->bt_delwrite_lock);
1605 btp->bt_flags = 0; 1703 btp->bt_flags = 0;
1606 btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname); 1704 btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname);
1607 if (IS_ERR(btp->bt_task)) { 1705 if (IS_ERR(btp->bt_task))
1608 error = PTR_ERR(btp->bt_task); 1706 return PTR_ERR(btp->bt_task);
1609 goto out_error; 1707 return 0;
1610 }
1611 xfs_register_buftarg(btp);
1612out_error:
1613 return error;
1614} 1708}
1615 1709
1616xfs_buftarg_t * 1710xfs_buftarg_t *
@@ -1627,12 +1721,17 @@ xfs_alloc_buftarg(
1627 btp->bt_mount = mp; 1721 btp->bt_mount = mp;
1628 btp->bt_dev = bdev->bd_dev; 1722 btp->bt_dev = bdev->bd_dev;
1629 btp->bt_bdev = bdev; 1723 btp->bt_bdev = bdev;
1724 INIT_LIST_HEAD(&btp->bt_lru);
1725 spin_lock_init(&btp->bt_lru_lock);
1630 if (xfs_setsize_buftarg_early(btp, bdev)) 1726 if (xfs_setsize_buftarg_early(btp, bdev))
1631 goto error; 1727 goto error;
1632 if (xfs_mapping_buftarg(btp, bdev)) 1728 if (xfs_mapping_buftarg(btp, bdev))
1633 goto error; 1729 goto error;
1634 if (xfs_alloc_delwrite_queue(btp, fsname)) 1730 if (xfs_alloc_delwrite_queue(btp, fsname))
1635 goto error; 1731 goto error;
1732 btp->bt_shrinker.shrink = xfs_buftarg_shrink;
1733 btp->bt_shrinker.seeks = DEFAULT_SEEKS;
1734 register_shrinker(&btp->bt_shrinker);
1636 return btp; 1735 return btp;
1637 1736
1638error: 1737error:
@@ -1737,27 +1836,6 @@ xfs_buf_runall_queues(
1737 flush_workqueue(queue); 1836 flush_workqueue(queue);
1738} 1837}
1739 1838
1740STATIC int
1741xfsbufd_wakeup(
1742 struct shrinker *shrink,
1743 int priority,
1744 gfp_t mask)
1745{
1746 xfs_buftarg_t *btp;
1747
1748 spin_lock(&xfs_buftarg_lock);
1749 list_for_each_entry(btp, &xfs_buftarg_list, bt_list) {
1750 if (test_bit(XBT_FORCE_SLEEP, &btp->bt_flags))
1751 continue;
1752 if (list_empty(&btp->bt_delwrite_queue))
1753 continue;
1754 set_bit(XBT_FORCE_FLUSH, &btp->bt_flags);
1755 wake_up_process(btp->bt_task);
1756 }
1757 spin_unlock(&xfs_buftarg_lock);
1758 return 0;
1759}
1760
1761/* 1839/*
1762 * Move as many buffers as specified to the supplied list 1840 * Move as many buffers as specified to the supplied list
1763 * indicating if we skipped any buffers to prevent deadlocks.
@@ -1952,7 +2030,6 @@ xfs_buf_init(void)
1952 if (!xfsconvertd_workqueue) 2030 if (!xfsconvertd_workqueue)
1953 goto out_destroy_xfsdatad_workqueue; 2031 goto out_destroy_xfsdatad_workqueue;
1954 2032
1955 register_shrinker(&xfs_buf_shake);
1956 return 0; 2033 return 0;
1957 2034
1958 out_destroy_xfsdatad_workqueue: 2035 out_destroy_xfsdatad_workqueue:
@@ -1968,7 +2045,6 @@ xfs_buf_init(void)
1968void 2045void
1969xfs_buf_terminate(void) 2046xfs_buf_terminate(void)
1970{ 2047{
1971 unregister_shrinker(&xfs_buf_shake);
1972 destroy_workqueue(xfsconvertd_workqueue); 2048 destroy_workqueue(xfsconvertd_workqueue);
1973 destroy_workqueue(xfsdatad_workqueue); 2049 destroy_workqueue(xfsdatad_workqueue);
1974 destroy_workqueue(xfslogd_workqueue); 2050 destroy_workqueue(xfslogd_workqueue);
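Taken together, the b_lru_ref counter and the per-buftarg shrinker replace the old global xfsbufd_wakeup shrinker with a simple multi-pass LRU: every shrinker scan costs a buffer one reference, and only a buffer whose count has already reached zero is pulled off the LRU and released, so callers can keep hot metadata cached longer by granting extra passes via xfs_buf_set_ref(). A condensed sketch of the per-buffer decision, using the same field names as the patch and written to match the intent described in its comments (illustrative, not the literal hunk):

    /* Returns true if @bp was moved to @dispose for release. */
    static bool example_lru_scan_one(struct xfs_buftarg *btp, struct xfs_buf *bp,
                                     struct list_head *dispose)
    {
            /* atomic_add_unless() decrements and returns non-zero while b_lru_ref > 0 */
            if (atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
                    /* still has passes left: rotate to the tail and keep it cached */
                    list_move_tail(&bp->b_lru, &btp->bt_lru);
                    return false;
            }

            /* no passes left: take it off the LRU so xfs_buf_rele() can free it */
            list_move(&bp->b_lru, dispose);
            btp->bt_lru_nr--;
            return true;
    }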
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 383a3f37cf98..cbe65950e524 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -128,10 +128,15 @@ typedef struct xfs_buftarg {
128 128
129 /* per device delwri queue */ 129 /* per device delwri queue */
130 struct task_struct *bt_task; 130 struct task_struct *bt_task;
131 struct list_head bt_list;
132 struct list_head bt_delwrite_queue; 131 struct list_head bt_delwrite_queue;
133 spinlock_t bt_delwrite_lock; 132 spinlock_t bt_delwrite_lock;
134 unsigned long bt_flags; 133 unsigned long bt_flags;
134
135 /* LRU control structures */
136 struct shrinker bt_shrinker;
137 struct list_head bt_lru;
138 spinlock_t bt_lru_lock;
139 unsigned int bt_lru_nr;
135} xfs_buftarg_t; 140} xfs_buftarg_t;
136 141
137/* 142/*
@@ -147,8 +152,6 @@ typedef struct xfs_buftarg {
147 152
148struct xfs_buf; 153struct xfs_buf;
149typedef void (*xfs_buf_iodone_t)(struct xfs_buf *); 154typedef void (*xfs_buf_iodone_t)(struct xfs_buf *);
150typedef void (*xfs_buf_relse_t)(struct xfs_buf *);
151typedef int (*xfs_buf_bdstrat_t)(struct xfs_buf *);
152 155
153#define XB_PAGES 2 156#define XB_PAGES 2
154 157
@@ -164,9 +167,11 @@ typedef struct xfs_buf {
164 xfs_off_t b_file_offset; /* offset in file */ 167 xfs_off_t b_file_offset; /* offset in file */
165 size_t b_buffer_length;/* size of buffer in bytes */ 168 size_t b_buffer_length;/* size of buffer in bytes */
166 atomic_t b_hold; /* reference count */ 169 atomic_t b_hold; /* reference count */
170 atomic_t b_lru_ref; /* lru reclaim ref count */
167 xfs_buf_flags_t b_flags; /* status flags */ 171 xfs_buf_flags_t b_flags; /* status flags */
168 struct semaphore b_sema; /* semaphore for lockables */ 172 struct semaphore b_sema; /* semaphore for lockables */
169 173
174 struct list_head b_lru; /* lru list */
170 wait_queue_head_t b_waiters; /* unpin waiters */ 175 wait_queue_head_t b_waiters; /* unpin waiters */
171 struct list_head b_list; 176 struct list_head b_list;
172 struct xfs_perag *b_pag; /* contains rbtree root */ 177 struct xfs_perag *b_pag; /* contains rbtree root */
@@ -176,7 +181,6 @@ typedef struct xfs_buf {
176 void *b_addr; /* virtual address of buffer */ 181 void *b_addr; /* virtual address of buffer */
177 struct work_struct b_iodone_work; 182 struct work_struct b_iodone_work;
178 xfs_buf_iodone_t b_iodone; /* I/O completion function */ 183 xfs_buf_iodone_t b_iodone; /* I/O completion function */
179 xfs_buf_relse_t b_relse; /* releasing function */
180 struct completion b_iowait; /* queue for I/O waiters */ 184 struct completion b_iowait; /* queue for I/O waiters */
181 void *b_fspriv; 185 void *b_fspriv;
182 void *b_fspriv2; 186 void *b_fspriv2;
@@ -264,7 +268,8 @@ extern void xfs_buf_terminate(void);
264#define XFS_BUF_ZEROFLAGS(bp) ((bp)->b_flags &= \ 268#define XFS_BUF_ZEROFLAGS(bp) ((bp)->b_flags &= \
265 ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI|XBF_ORDERED)) 269 ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI|XBF_ORDERED))
266 270
267#define XFS_BUF_STALE(bp) ((bp)->b_flags |= XBF_STALE) 271void xfs_buf_stale(struct xfs_buf *bp);
272#define XFS_BUF_STALE(bp) xfs_buf_stale(bp);
268#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE) 273#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE)
269#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE) 274#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE)
270#define XFS_BUF_SUPER_STALE(bp) do { \ 275#define XFS_BUF_SUPER_STALE(bp) do { \
@@ -315,7 +320,6 @@ extern void xfs_buf_terminate(void);
315#define XFS_BUF_FSPRIVATE2(bp, type) ((type)(bp)->b_fspriv2) 320#define XFS_BUF_FSPRIVATE2(bp, type) ((type)(bp)->b_fspriv2)
316#define XFS_BUF_SET_FSPRIVATE2(bp, val) ((bp)->b_fspriv2 = (void*)(val)) 321#define XFS_BUF_SET_FSPRIVATE2(bp, val) ((bp)->b_fspriv2 = (void*)(val))
317#define XFS_BUF_SET_START(bp) do { } while (0) 322#define XFS_BUF_SET_START(bp) do { } while (0)
318#define XFS_BUF_SET_BRELSE_FUNC(bp, func) ((bp)->b_relse = (func))
319 323
320#define XFS_BUF_PTR(bp) (xfs_caddr_t)((bp)->b_addr) 324#define XFS_BUF_PTR(bp) (xfs_caddr_t)((bp)->b_addr)
321#define XFS_BUF_SET_PTR(bp, val, cnt) xfs_buf_associate_memory(bp, val, cnt) 325#define XFS_BUF_SET_PTR(bp, val, cnt) xfs_buf_associate_memory(bp, val, cnt)
@@ -328,9 +332,15 @@ extern void xfs_buf_terminate(void);
328#define XFS_BUF_SIZE(bp) ((bp)->b_buffer_length) 332#define XFS_BUF_SIZE(bp) ((bp)->b_buffer_length)
329#define XFS_BUF_SET_SIZE(bp, cnt) ((bp)->b_buffer_length = (cnt)) 333#define XFS_BUF_SET_SIZE(bp, cnt) ((bp)->b_buffer_length = (cnt))
330 334
331#define XFS_BUF_SET_VTYPE_REF(bp, type, ref) do { } while (0) 335static inline void
336xfs_buf_set_ref(
337 struct xfs_buf *bp,
338 int lru_ref)
339{
340 atomic_set(&bp->b_lru_ref, lru_ref);
341}
342#define XFS_BUF_SET_VTYPE_REF(bp, type, ref) xfs_buf_set_ref(bp, ref)
332#define XFS_BUF_SET_VTYPE(bp, type) do { } while (0) 343#define XFS_BUF_SET_VTYPE(bp, type) do { } while (0)
333#define XFS_BUF_SET_REF(bp, ref) do { } while (0)
334 344
335#define XFS_BUF_ISPINNED(bp) atomic_read(&((bp)->b_pin_count)) 345#define XFS_BUF_ISPINNED(bp) atomic_read(&((bp)->b_pin_count))
336 346
@@ -346,8 +356,7 @@ extern void xfs_buf_terminate(void);
346 356
347static inline void xfs_buf_relse(xfs_buf_t *bp) 357static inline void xfs_buf_relse(xfs_buf_t *bp)
348{ 358{
349 if (!bp->b_relse) 359 xfs_buf_unlock(bp);
350 xfs_buf_unlock(bp);
351 xfs_buf_rele(bp); 360 xfs_buf_rele(bp);
352} 361}
353 362
diff --git a/fs/xfs/linux-2.6/xfs_discard.c b/fs/xfs/linux-2.6/xfs_discard.c
new file mode 100644
index 000000000000..d61611c88012
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_discard.c
@@ -0,0 +1,193 @@
1/*
2 * Copyright (C) 2010 Red Hat, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_sb.h"
20#include "xfs_inum.h"
21#include "xfs_log.h"
22#include "xfs_ag.h"
23#include "xfs_mount.h"
24#include "xfs_quota.h"
25#include "xfs_trans.h"
26#include "xfs_alloc_btree.h"
27#include "xfs_bmap_btree.h"
28#include "xfs_ialloc_btree.h"
29#include "xfs_btree.h"
30#include "xfs_inode.h"
31#include "xfs_alloc.h"
32#include "xfs_error.h"
33#include "xfs_discard.h"
34#include "xfs_trace.h"
35
36STATIC int
37xfs_trim_extents(
38 struct xfs_mount *mp,
39 xfs_agnumber_t agno,
40 xfs_fsblock_t start,
41 xfs_fsblock_t len,
42 xfs_fsblock_t minlen,
43 __uint64_t *blocks_trimmed)
44{
45 struct block_device *bdev = mp->m_ddev_targp->bt_bdev;
46 struct xfs_btree_cur *cur;
47 struct xfs_buf *agbp;
48 struct xfs_perag *pag;
49 int error;
50 int i;
51
52 pag = xfs_perag_get(mp, agno);
53
54 error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
55 if (error || !agbp)
56 goto out_put_perag;
57
58 cur = xfs_allocbt_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_CNT);
59
60 /*
61 * Force out the log. This means any transactions that might have freed
62 * space before we took the AGF buffer lock are now on disk, and the
63 * volatile disk cache is flushed.
64 */
65 xfs_log_force(mp, XFS_LOG_SYNC);
66
67 /*
68 * Look up the longest btree in the AGF and start with it.
69 */
70 error = xfs_alloc_lookup_le(cur, 0,
71 XFS_BUF_TO_AGF(agbp)->agf_longest, &i);
72 if (error)
73 goto out_del_cursor;
74
75 /*
76 * Loop until we are done with all extents that are large
77 * enough to be worth discarding.
78 */
79 while (i) {
80 xfs_agblock_t fbno;
81 xfs_extlen_t flen;
82
83 error = xfs_alloc_get_rec(cur, &fbno, &flen, &i);
84 if (error)
85 goto out_del_cursor;
86 XFS_WANT_CORRUPTED_GOTO(i == 1, out_del_cursor);
87 ASSERT(flen <= XFS_BUF_TO_AGF(agbp)->agf_longest);
88
89 /*
90 * Too small? Give up.
91 */
92 if (flen < minlen) {
93 trace_xfs_discard_toosmall(mp, agno, fbno, flen);
94 goto out_del_cursor;
95 }
96
97 /*
98 * If the extent is entirely outside of the range we are
99 * supposed to discard skip it. Do not bother to trim
100 * down partially overlapping ranges for now.
101 */
102 if (XFS_AGB_TO_FSB(mp, agno, fbno) + flen < start ||
103 XFS_AGB_TO_FSB(mp, agno, fbno) >= start + len) {
104 trace_xfs_discard_exclude(mp, agno, fbno, flen);
105 goto next_extent;
106 }
107
108 /*
109 * If any blocks in the range are still busy, skip the
110 * discard and try again the next time.
111 */
112 if (xfs_alloc_busy_search(mp, agno, fbno, flen)) {
113 trace_xfs_discard_busy(mp, agno, fbno, flen);
114 goto next_extent;
115 }
116
117 trace_xfs_discard_extent(mp, agno, fbno, flen);
118 error = -blkdev_issue_discard(bdev,
119 XFS_AGB_TO_DADDR(mp, agno, fbno),
120 XFS_FSB_TO_BB(mp, flen),
121 GFP_NOFS, 0);
122 if (error)
123 goto out_del_cursor;
124 *blocks_trimmed += flen;
125
126next_extent:
127 error = xfs_btree_decrement(cur, 0, &i);
128 if (error)
129 goto out_del_cursor;
130 }
131
132out_del_cursor:
133 xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
134 xfs_buf_relse(agbp);
135out_put_perag:
136 xfs_perag_put(pag);
137 return error;
138}
139
140int
141xfs_ioc_trim(
142 struct xfs_mount *mp,
143 struct fstrim_range __user *urange)
144{
145 struct request_queue *q = mp->m_ddev_targp->bt_bdev->bd_disk->queue;
146 unsigned int granularity = q->limits.discard_granularity;
147 struct fstrim_range range;
148 xfs_fsblock_t start, len, minlen;
149 xfs_agnumber_t start_agno, end_agno, agno;
150 __uint64_t blocks_trimmed = 0;
151 int error, last_error = 0;
152
153 if (!capable(CAP_SYS_ADMIN))
154 return -XFS_ERROR(EPERM);
155 if (!blk_queue_discard(q))
156 return -XFS_ERROR(EOPNOTSUPP);
157 if (copy_from_user(&range, urange, sizeof(range)))
158 return -XFS_ERROR(EFAULT);
159
160 /*
161 * Truncating down the len isn't actually quite correct, but using
162 * XFS_B_TO_FSB would mean we trivially get overflows for values
163 * of ULLONG_MAX or slightly lower. And ULLONG_MAX is the default
164 * used by the fstrim application. In the end it really doesn't
165 * matter as trimming blocks is an advisory interface.
166 */
167 start = XFS_B_TO_FSBT(mp, range.start);
168 len = XFS_B_TO_FSBT(mp, range.len);
169 minlen = XFS_B_TO_FSB(mp, max_t(u64, granularity, range.minlen));
170
171 start_agno = XFS_FSB_TO_AGNO(mp, start);
172 if (start_agno >= mp->m_sb.sb_agcount)
173 return -XFS_ERROR(EINVAL);
174
175 end_agno = XFS_FSB_TO_AGNO(mp, start + len);
176 if (end_agno >= mp->m_sb.sb_agcount)
177 end_agno = mp->m_sb.sb_agcount - 1;
178
179 for (agno = start_agno; agno <= end_agno; agno++) {
180 error = -xfs_trim_extents(mp, agno, start, len, minlen,
181 &blocks_trimmed);
182 if (error)
183 last_error = error;
184 }
185
186 if (last_error)
187 return last_error;
188
189 range.len = XFS_FSB_TO_B(mp, blocks_trimmed);
190 if (copy_to_user(urange, &range, sizeof(range)))
191 return -XFS_ERROR(EFAULT);
192 return 0;
193}
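xfs_ioc_trim() above is the filesystem-side handler for the FITRIM ioctl (wired up via the xfs_ioctl.c changes elsewhere in this series): userspace passes a byte-based struct fstrim_range, the filesystem walks the by-size free space btree of each affected AG and issues discards, and the number of bytes trimmed is copied back in range.len. A minimal user-space sketch of driving it (error handling kept short; "/mnt" is just a placeholder mount point):

    #include <stdio.h>
    #include <string.h>
    #include <limits.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/fs.h>           /* FITRIM, struct fstrim_range */

    int main(void)
    {
            struct fstrim_range range;
            int fd = open("/mnt", O_RDONLY);        /* any object on the filesystem */

            if (fd < 0)
                    return 1;
            memset(&range, 0, sizeof(range));
            range.start = 0;
            range.len = ULLONG_MAX;         /* whole filesystem; see the truncation comment above */
            range.minlen = 0;               /* the kernel raises this to the discard granularity */

            if (ioctl(fd, FITRIM, &range) < 0)
                    perror("FITRIM");
            else
                    printf("trimmed %llu bytes\n", (unsigned long long)range.len);
            close(fd);
            return 0;
    }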
diff --git a/fs/xfs/linux-2.6/xfs_discard.h b/fs/xfs/linux-2.6/xfs_discard.h
new file mode 100644
index 000000000000..e82b6dd3e127
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_discard.h
@@ -0,0 +1,8 @@
1#ifndef XFS_DISCARD_H
2#define XFS_DISCARD_H 1
3
4struct fstrim_range;
5
6extern int xfs_ioc_trim(struct xfs_mount *, struct fstrim_range __user *);
7
8#endif /* XFS_DISCARD_H */
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
index 3764d74790ec..fc0114da7fdd 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -70,8 +70,16 @@ xfs_fs_encode_fh(
70 else 70 else
71 fileid_type = FILEID_INO32_GEN_PARENT; 71 fileid_type = FILEID_INO32_GEN_PARENT;
72 72
73 /* filesystem may contain 64bit inode numbers */ 73 /*
74 if (!(XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_SMALL_INUMS)) 74 * If the filesystem may contain 64bit inode numbers, we need
75 * to use larger file handles that can represent them.
76 *
77 * While we only allocate inodes that do not fit into 32 bits any
78 * large enough filesystem may contain them, thus the slightly
79 * confusing looking conditional below.
80 */
81 if (!(XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_SMALL_INUMS) ||
82 (XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_32BITINODES))
75 fileid_type |= XFS_FILEID_TYPE_64FLAG; 83 fileid_type |= XFS_FILEID_TYPE_64FLAG;
76 84
77 /* 85 /*
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index ba8ad422a165..a55c1b46b219 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -37,10 +37,45 @@
37#include "xfs_trace.h" 37#include "xfs_trace.h"
38 38
39#include <linux/dcache.h> 39#include <linux/dcache.h>
40#include <linux/falloc.h>
40 41
41static const struct vm_operations_struct xfs_file_vm_ops; 42static const struct vm_operations_struct xfs_file_vm_ops;
42 43
43/* 44/*
45 * Locking primitives for read and write IO paths to ensure we consistently use
46 * and order the inode->i_mutex, ip->i_lock and ip->i_iolock.
47 */
48static inline void
49xfs_rw_ilock(
50 struct xfs_inode *ip,
51 int type)
52{
53 if (type & XFS_IOLOCK_EXCL)
54 mutex_lock(&VFS_I(ip)->i_mutex);
55 xfs_ilock(ip, type);
56}
57
58static inline void
59xfs_rw_iunlock(
60 struct xfs_inode *ip,
61 int type)
62{
63 xfs_iunlock(ip, type);
64 if (type & XFS_IOLOCK_EXCL)
65 mutex_unlock(&VFS_I(ip)->i_mutex);
66}
67
68static inline void
69xfs_rw_ilock_demote(
70 struct xfs_inode *ip,
71 int type)
72{
73 xfs_ilock_demote(ip, type);
74 if (type & XFS_IOLOCK_EXCL)
75 mutex_unlock(&VFS_I(ip)->i_mutex);
76}
77
78/*
44 * xfs_iozero 79 * xfs_iozero
45 * 80 *
46 * xfs_iozero clears the specified range of buffer supplied, 81 * xfs_iozero clears the specified range of buffer supplied,
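The three wrappers above encode a single ordering rule for the read/write paths: whenever the exclusive iolock is wanted, the VFS i_mutex is taken first and released last, with the XFS locks always nested inside it. Spelled out, the helpers expand roughly as follows (illustrative; compare with the hunks below that convert the readers and writers over):

    /* exclusive: i_mutex outermost, iolock nested inside it */
    xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
            /* == mutex_lock(&VFS_I(ip)->i_mutex); xfs_ilock(ip, XFS_IOLOCK_EXCL); */
    ...
    xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
            /* == xfs_iunlock(ip, XFS_IOLOCK_EXCL); mutex_unlock(&VFS_I(ip)->i_mutex); */

    /* shared: the plain XFS lock, no i_mutex involvement */
    xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);    /* == xfs_ilock(ip, XFS_IOLOCK_SHARED) */

    /* demote: keep the iolock shared but give the i_mutex back early */
    xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);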
@@ -262,22 +297,21 @@ xfs_file_aio_read(
262 if (XFS_FORCED_SHUTDOWN(mp)) 297 if (XFS_FORCED_SHUTDOWN(mp))
263 return -EIO; 298 return -EIO;
264 299
265 if (unlikely(ioflags & IO_ISDIRECT))
266 mutex_lock(&inode->i_mutex);
267 xfs_ilock(ip, XFS_IOLOCK_SHARED);
268
269 if (unlikely(ioflags & IO_ISDIRECT)) { 300 if (unlikely(ioflags & IO_ISDIRECT)) {
301 xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
302
270 if (inode->i_mapping->nrpages) { 303 if (inode->i_mapping->nrpages) {
271 ret = -xfs_flushinval_pages(ip, 304 ret = -xfs_flushinval_pages(ip,
272 (iocb->ki_pos & PAGE_CACHE_MASK), 305 (iocb->ki_pos & PAGE_CACHE_MASK),
273 -1, FI_REMAPF_LOCKED); 306 -1, FI_REMAPF_LOCKED);
307 if (ret) {
308 xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
309 return ret;
310 }
274 } 311 }
275 mutex_unlock(&inode->i_mutex); 312 xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
276 if (ret) { 313 } else
277 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 314 xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
278 return ret;
279 }
280 }
281 315
282 trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags); 316 trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags);
283 317
@@ -285,7 +319,7 @@ xfs_file_aio_read(
285 if (ret > 0) 319 if (ret > 0)
286 XFS_STATS_ADD(xs_read_bytes, ret); 320 XFS_STATS_ADD(xs_read_bytes, ret);
287 321
288 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 322 xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
289 return ret; 323 return ret;
290} 324}
291 325
@@ -309,7 +343,7 @@ xfs_file_splice_read(
309 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 343 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
310 return -EIO; 344 return -EIO;
311 345
312 xfs_ilock(ip, XFS_IOLOCK_SHARED); 346 xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
313 347
314 trace_xfs_file_splice_read(ip, count, *ppos, ioflags); 348 trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
315 349
@@ -317,10 +351,61 @@ xfs_file_splice_read(
317 if (ret > 0) 351 if (ret > 0)
318 XFS_STATS_ADD(xs_read_bytes, ret); 352 XFS_STATS_ADD(xs_read_bytes, ret);
319 353
320 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 354 xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
321 return ret; 355 return ret;
322} 356}
323 357
358STATIC void
359xfs_aio_write_isize_update(
360 struct inode *inode,
361 loff_t *ppos,
362 ssize_t bytes_written)
363{
364 struct xfs_inode *ip = XFS_I(inode);
365 xfs_fsize_t isize = i_size_read(inode);
366
367 if (bytes_written > 0)
368 XFS_STATS_ADD(xs_write_bytes, bytes_written);
369
370 if (unlikely(bytes_written < 0 && bytes_written != -EFAULT &&
371 *ppos > isize))
372 *ppos = isize;
373
374 if (*ppos > ip->i_size) {
375 xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
376 if (*ppos > ip->i_size)
377 ip->i_size = *ppos;
378 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
379 }
380}
381
382/*
383 * If this was a direct or synchronous I/O that failed (such as ENOSPC) then
384 * part of the I/O may have been written to disk before the error occurred. In
385 * this case the on-disk file size may have been adjusted beyond the in-memory
386 * file size and now needs to be truncated back.
387 */
388STATIC void
389xfs_aio_write_newsize_update(
390 struct xfs_inode *ip)
391{
392 if (ip->i_new_size) {
393 xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
394 ip->i_new_size = 0;
395 if (ip->i_d.di_size > ip->i_size)
396 ip->i_d.di_size = ip->i_size;
397 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
398 }
399}
400
401/*
402 * xfs_file_splice_write() does not use xfs_rw_ilock() because
403 * generic_file_splice_write() takes the i_mutex itself. This, in theory,
 404 * could cause lock inversions between the aio_write path and the splice path
405 * if someone is doing concurrent splice(2) based writes and write(2) based
406 * writes to the same inode. The only real way to fix this is to re-implement
407 * the generic code here with correct locking orders.
408 */
324STATIC ssize_t 409STATIC ssize_t
325xfs_file_splice_write( 410xfs_file_splice_write(
326 struct pipe_inode_info *pipe, 411 struct pipe_inode_info *pipe,
@@ -331,7 +416,7 @@ xfs_file_splice_write(
331{ 416{
332 struct inode *inode = outfilp->f_mapping->host; 417 struct inode *inode = outfilp->f_mapping->host;
333 struct xfs_inode *ip = XFS_I(inode); 418 struct xfs_inode *ip = XFS_I(inode);
334 xfs_fsize_t isize, new_size; 419 xfs_fsize_t new_size;
335 int ioflags = 0; 420 int ioflags = 0;
336 ssize_t ret; 421 ssize_t ret;
337 422
@@ -355,27 +440,9 @@ xfs_file_splice_write(
355 trace_xfs_file_splice_write(ip, count, *ppos, ioflags); 440 trace_xfs_file_splice_write(ip, count, *ppos, ioflags);
356 441
357 ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); 442 ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
358 if (ret > 0)
359 XFS_STATS_ADD(xs_write_bytes, ret);
360
361 isize = i_size_read(inode);
362 if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize))
363 *ppos = isize;
364
365 if (*ppos > ip->i_size) {
366 xfs_ilock(ip, XFS_ILOCK_EXCL);
367 if (*ppos > ip->i_size)
368 ip->i_size = *ppos;
369 xfs_iunlock(ip, XFS_ILOCK_EXCL);
370 }
371 443
372 if (ip->i_new_size) { 444 xfs_aio_write_isize_update(inode, ppos, ret);
373 xfs_ilock(ip, XFS_ILOCK_EXCL); 445 xfs_aio_write_newsize_update(ip);
374 ip->i_new_size = 0;
375 if (ip->i_d.di_size > ip->i_size)
376 ip->i_d.di_size = ip->i_size;
377 xfs_iunlock(ip, XFS_ILOCK_EXCL);
378 }
379 xfs_iunlock(ip, XFS_IOLOCK_EXCL); 446 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
380 return ret; 447 return ret;
381} 448}
@@ -562,247 +629,314 @@ out_lock:
562 return error; 629 return error;
563} 630}
564 631
632/*
633 * Common pre-write limit and setup checks.
634 *
635 * Returns with iolock held according to @iolock.
636 */
565STATIC ssize_t 637STATIC ssize_t
566xfs_file_aio_write( 638xfs_file_aio_write_checks(
567 struct kiocb *iocb, 639 struct file *file,
568 const struct iovec *iovp, 640 loff_t *pos,
569 unsigned long nr_segs, 641 size_t *count,
570 loff_t pos) 642 int *iolock)
571{ 643{
572 struct file *file = iocb->ki_filp; 644 struct inode *inode = file->f_mapping->host;
573 struct address_space *mapping = file->f_mapping;
574 struct inode *inode = mapping->host;
575 struct xfs_inode *ip = XFS_I(inode); 645 struct xfs_inode *ip = XFS_I(inode);
576 struct xfs_mount *mp = ip->i_mount; 646 xfs_fsize_t new_size;
577 ssize_t ret = 0, error = 0; 647 int error = 0;
578 int ioflags = 0;
579 xfs_fsize_t isize, new_size;
580 int iolock;
581 size_t ocount = 0, count;
582 int need_i_mutex;
583 648
584 XFS_STATS_INC(xs_write_calls); 649 error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode));
650 if (error) {
651 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock);
652 *iolock = 0;
653 return error;
654 }
585 655
586 BUG_ON(iocb->ki_pos != pos); 656 new_size = *pos + *count;
657 if (new_size > ip->i_size)
658 ip->i_new_size = new_size;
587 659
588 if (unlikely(file->f_flags & O_DIRECT)) 660 if (likely(!(file->f_mode & FMODE_NOCMTIME)))
589 ioflags |= IO_ISDIRECT; 661 file_update_time(file);
590 if (file->f_mode & FMODE_NOCMTIME) 662
591 ioflags |= IO_INVIS; 663 /*
664 * If the offset is beyond the size of the file, we need to zero any
665 * blocks that fall between the existing EOF and the start of this
666 * write.
667 */
668 if (*pos > ip->i_size)
669 error = -xfs_zero_eof(ip, *pos, ip->i_size);
592 670
593 error = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ); 671 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
594 if (error) 672 if (error)
595 return error; 673 return error;
596 674
597 count = ocount; 675 /*
598 if (count == 0) 676 * If we're writing the file then make sure to clear the setuid and
599 return 0; 677 * setgid bits if the process is not being run by root. This keeps
600 678 * people from modifying setuid and setgid binaries.
601 xfs_wait_for_freeze(mp, SB_FREEZE_WRITE); 679 */
680 return file_remove_suid(file);
602 681
603 if (XFS_FORCED_SHUTDOWN(mp)) 682}
604 return -EIO;
605 683
606relock: 684/*
607 if (ioflags & IO_ISDIRECT) { 685 * xfs_file_dio_aio_write - handle direct IO writes
608 iolock = XFS_IOLOCK_SHARED; 686 *
609 need_i_mutex = 0; 687 * Lock the inode appropriately to prepare for and issue a direct IO write.
610 } else { 688 * By separating it from the buffered write path we remove all the tricky to
611 iolock = XFS_IOLOCK_EXCL; 689 * follow locking changes and looping.
612 need_i_mutex = 1; 690 *
613 mutex_lock(&inode->i_mutex); 691 * If there are cached pages or we're extending the file, we need IOLOCK_EXCL
692 * until we're sure the bytes at the new EOF have been zeroed and/or the cached
693 * pages are flushed out.
694 *
695 * In most cases the direct IO writes will be done holding IOLOCK_SHARED
696 * allowing them to be done in parallel with reads and other direct IO writes.
697 * However, if the IO is not aligned to filesystem blocks, the direct IO layer
698 * needs to do sub-block zeroing and that requires serialisation against other
699 * direct IOs to the same block. In this case we need to serialise the
700 * submission of the unaligned IOs so that we don't get racing block zeroing in
701 * the dio layer. To avoid the problem with aio, we also need to wait for
702 * outstanding IOs to complete so that unwritten extent conversion is completed
703 * before we try to map the overlapping block. This is currently implemented by
704 * hitting it with a big hammer (i.e. xfs_ioend_wait()).
705 *
706 * Returns with locks held indicated by @iolock and errors indicated by
707 * negative return values.
708 */
709STATIC ssize_t
710xfs_file_dio_aio_write(
711 struct kiocb *iocb,
712 const struct iovec *iovp,
713 unsigned long nr_segs,
714 loff_t pos,
715 size_t ocount,
716 int *iolock)
717{
718 struct file *file = iocb->ki_filp;
719 struct address_space *mapping = file->f_mapping;
720 struct inode *inode = mapping->host;
721 struct xfs_inode *ip = XFS_I(inode);
722 struct xfs_mount *mp = ip->i_mount;
723 ssize_t ret = 0;
724 size_t count = ocount;
725 int unaligned_io = 0;
726 struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ?
727 mp->m_rtdev_targp : mp->m_ddev_targp;
728
729 *iolock = 0;
730 if ((pos & target->bt_smask) || (count & target->bt_smask))
731 return -XFS_ERROR(EINVAL);
732
733 if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask))
734 unaligned_io = 1;
735
736 if (unaligned_io || mapping->nrpages || pos > ip->i_size)
737 *iolock = XFS_IOLOCK_EXCL;
738 else
739 *iolock = XFS_IOLOCK_SHARED;
740 xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
741
742 ret = xfs_file_aio_write_checks(file, &pos, &count, iolock);
743 if (ret)
744 return ret;
745
746 if (mapping->nrpages) {
747 WARN_ON(*iolock != XFS_IOLOCK_EXCL);
748 ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1,
749 FI_REMAPF_LOCKED);
750 if (ret)
751 return ret;
614 } 752 }
615 753
616 xfs_ilock(ip, XFS_ILOCK_EXCL|iolock); 754 /*
617 755 * If we are doing unaligned IO, wait for all other IO to drain,
618start: 756 * otherwise demote the lock if we had to flush cached pages
619 error = -generic_write_checks(file, &pos, &count, 757 */
620 S_ISBLK(inode->i_mode)); 758 if (unaligned_io)
621 if (error) { 759 xfs_ioend_wait(ip);
622 xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); 760 else if (*iolock == XFS_IOLOCK_EXCL) {
623 goto out_unlock_mutex; 761 xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
762 *iolock = XFS_IOLOCK_SHARED;
624 } 763 }
625 764
626 if (ioflags & IO_ISDIRECT) { 765 trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
627 xfs_buftarg_t *target = 766 ret = generic_file_direct_write(iocb, iovp,
628 XFS_IS_REALTIME_INODE(ip) ? 767 &nr_segs, pos, &iocb->ki_pos, count, ocount);
629 mp->m_rtdev_targp : mp->m_ddev_targp;
630 768
631 if ((pos & target->bt_smask) || (count & target->bt_smask)) { 769 /* No fallback to buffered IO on errors for XFS. */
632 xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); 770 ASSERT(ret < 0 || ret == count);
633 return XFS_ERROR(-EINVAL); 771 return ret;
634 } 772}
635 773
636 if (!need_i_mutex && (mapping->nrpages || pos > ip->i_size)) { 774STATIC ssize_t
637 xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); 775xfs_file_buffered_aio_write(
638 iolock = XFS_IOLOCK_EXCL; 776 struct kiocb *iocb,
639 need_i_mutex = 1; 777 const struct iovec *iovp,
640 mutex_lock(&inode->i_mutex); 778 unsigned long nr_segs,
641 xfs_ilock(ip, XFS_ILOCK_EXCL|iolock); 779 loff_t pos,
642 goto start; 780 size_t ocount,
643 } 781 int *iolock)
644 } 782{
783 struct file *file = iocb->ki_filp;
784 struct address_space *mapping = file->f_mapping;
785 struct inode *inode = mapping->host;
786 struct xfs_inode *ip = XFS_I(inode);
787 ssize_t ret;
788 int enospc = 0;
789 size_t count = ocount;
645 790
646 new_size = pos + count; 791 *iolock = XFS_IOLOCK_EXCL;
647 if (new_size > ip->i_size) 792 xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
648 ip->i_new_size = new_size;
649 793
650 if (likely(!(ioflags & IO_INVIS))) 794 ret = xfs_file_aio_write_checks(file, &pos, &count, iolock);
651 file_update_time(file); 795 if (ret)
796 return ret;
652 797
798 /* We can write back this queue in page reclaim */
799 current->backing_dev_info = mapping->backing_dev_info;
800
801write_retry:
802 trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0);
803 ret = generic_file_buffered_write(iocb, iovp, nr_segs,
804 pos, &iocb->ki_pos, count, ret);
653 /* 805 /*
654 * If the offset is beyond the size of the file, we have a couple 806 * if we just got an ENOSPC, flush the inode now we aren't holding any
655 * of things to do. First, if there is already space allocated 807 * page locks and retry *once*
656 * we need to either create holes or zero the disk or ...
657 *
658 * If there is a page where the previous size lands, we need
659 * to zero it out up to the new size.
660 */ 808 */
661 809 if (ret == -ENOSPC && !enospc) {
662 if (pos > ip->i_size) { 810 ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
663 error = xfs_zero_eof(ip, pos, ip->i_size); 811 if (ret)
664 if (error) { 812 return ret;
665 xfs_iunlock(ip, XFS_ILOCK_EXCL); 813 enospc = 1;
666 goto out_unlock_internal; 814 goto write_retry;
667 }
668 } 815 }
669 xfs_iunlock(ip, XFS_ILOCK_EXCL); 816 current->backing_dev_info = NULL;
817 return ret;
818}
670 819
671 /* 820STATIC ssize_t
672 * If we're writing the file then make sure to clear the 821xfs_file_aio_write(
673 * setuid and setgid bits if the process is not being run 822 struct kiocb *iocb,
674 * by root. This keeps people from modifying setuid and 823 const struct iovec *iovp,
675 * setgid binaries. 824 unsigned long nr_segs,
676 */ 825 loff_t pos)
677 error = -file_remove_suid(file); 826{
678 if (unlikely(error)) 827 struct file *file = iocb->ki_filp;
679 goto out_unlock_internal; 828 struct address_space *mapping = file->f_mapping;
829 struct inode *inode = mapping->host;
830 struct xfs_inode *ip = XFS_I(inode);
831 ssize_t ret;
832 int iolock;
833 size_t ocount = 0;
680 834
681 /* We can write back this queue in page reclaim */ 835 XFS_STATS_INC(xs_write_calls);
682 current->backing_dev_info = mapping->backing_dev_info;
683 836
684 if ((ioflags & IO_ISDIRECT)) { 837 BUG_ON(iocb->ki_pos != pos);
685 if (mapping->nrpages) {
686 WARN_ON(need_i_mutex == 0);
687 error = xfs_flushinval_pages(ip,
688 (pos & PAGE_CACHE_MASK),
689 -1, FI_REMAPF_LOCKED);
690 if (error)
691 goto out_unlock_internal;
692 }
693 838
694 if (need_i_mutex) { 839 ret = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ);
695 /* demote the lock now the cached pages are gone */ 840 if (ret)
696 xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); 841 return ret;
697 mutex_unlock(&inode->i_mutex);
698 842
699 iolock = XFS_IOLOCK_SHARED; 843 if (ocount == 0)
700 need_i_mutex = 0; 844 return 0;
701 }
702 845
703 trace_xfs_file_direct_write(ip, count, iocb->ki_pos, ioflags); 846 xfs_wait_for_freeze(ip->i_mount, SB_FREEZE_WRITE);
704 ret = generic_file_direct_write(iocb, iovp,
705 &nr_segs, pos, &iocb->ki_pos, count, ocount);
706 847
707 /* 848 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
708 * direct-io write to a hole: fall through to buffered I/O 849 return -EIO;
709 * for completing the rest of the request.
710 */
711 if (ret >= 0 && ret != count) {
712 XFS_STATS_ADD(xs_write_bytes, ret);
713 850
714 pos += ret; 851 if (unlikely(file->f_flags & O_DIRECT))
715 count -= ret; 852 ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos,
853 ocount, &iolock);
854 else
855 ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos,
856 ocount, &iolock);
716 857
717 ioflags &= ~IO_ISDIRECT; 858 xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret);
718 xfs_iunlock(ip, iolock);
719 goto relock;
720 }
721 } else {
722 int enospc = 0;
723 ssize_t ret2 = 0;
724 859
725write_retry: 860 if (ret <= 0)
726 trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, ioflags); 861 goto out_unlock;
727 ret2 = generic_file_buffered_write(iocb, iovp, nr_segs,
728 pos, &iocb->ki_pos, count, ret);
729 /*
730 * if we just got an ENOSPC, flush the inode now we
731 * aren't holding any page locks and retry *once*
732 */
733 if (ret2 == -ENOSPC && !enospc) {
734 error = xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
735 if (error)
736 goto out_unlock_internal;
737 enospc = 1;
738 goto write_retry;
739 }
740 ret = ret2;
741 }
742 862
743 current->backing_dev_info = NULL; 863 /* Handle various SYNC-type writes */
864 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
865 loff_t end = pos + ret - 1;
866 int error, error2;
744 867
745 isize = i_size_read(inode); 868 xfs_rw_iunlock(ip, iolock);
746 if (unlikely(ret < 0 && ret != -EFAULT && iocb->ki_pos > isize)) 869 error = filemap_write_and_wait_range(mapping, pos, end);
747 iocb->ki_pos = isize; 870 xfs_rw_ilock(ip, iolock);
748 871
749 if (iocb->ki_pos > ip->i_size) { 872 error2 = -xfs_file_fsync(file,
750 xfs_ilock(ip, XFS_ILOCK_EXCL); 873 (file->f_flags & __O_SYNC) ? 0 : 1);
751 if (iocb->ki_pos > ip->i_size) 874 if (error)
752 ip->i_size = iocb->ki_pos; 875 ret = error;
753 xfs_iunlock(ip, XFS_ILOCK_EXCL); 876 else if (error2)
877 ret = error2;
754 } 878 }
755 879
756 error = -ret; 880out_unlock:
757 if (ret <= 0) 881 xfs_aio_write_newsize_update(ip);
758 goto out_unlock_internal; 882 xfs_rw_iunlock(ip, iolock);
883 return ret;
884}
759 885
760 XFS_STATS_ADD(xs_write_bytes, ret); 886STATIC long
887xfs_file_fallocate(
888 struct file *file,
889 int mode,
890 loff_t offset,
891 loff_t len)
892{
893 struct inode *inode = file->f_path.dentry->d_inode;
894 long error;
895 loff_t new_size = 0;
896 xfs_flock64_t bf;
897 xfs_inode_t *ip = XFS_I(inode);
898 int cmd = XFS_IOC_RESVSP;
761 899
762 /* Handle various SYNC-type writes */ 900 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
763 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { 901 return -EOPNOTSUPP;
764 loff_t end = pos + ret - 1;
765 int error2;
766 902
767 xfs_iunlock(ip, iolock); 903 bf.l_whence = 0;
768 if (need_i_mutex) 904 bf.l_start = offset;
769 mutex_unlock(&inode->i_mutex); 905 bf.l_len = len;
770 906
771 error2 = filemap_write_and_wait_range(mapping, pos, end); 907 xfs_ilock(ip, XFS_IOLOCK_EXCL);
772 if (!error)
773 error = error2;
774 if (need_i_mutex)
775 mutex_lock(&inode->i_mutex);
776 xfs_ilock(ip, iolock);
777 908
778 error2 = -xfs_file_fsync(file, 909 if (mode & FALLOC_FL_PUNCH_HOLE)
779 (file->f_flags & __O_SYNC) ? 0 : 1); 910 cmd = XFS_IOC_UNRESVSP;
780 if (!error) 911
781 error = error2; 912 /* check the new inode size is valid before allocating */
913 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
914 offset + len > i_size_read(inode)) {
915 new_size = offset + len;
916 error = inode_newsize_ok(inode, new_size);
917 if (error)
918 goto out_unlock;
782 } 919 }
783 920
784 out_unlock_internal: 921 error = -xfs_change_file_space(ip, cmd, &bf, 0, XFS_ATTR_NOLOCK);
785 if (ip->i_new_size) { 922 if (error)
786 xfs_ilock(ip, XFS_ILOCK_EXCL); 923 goto out_unlock;
787 ip->i_new_size = 0; 924
788 /* 925 /* Change file size if needed */
789 * If this was a direct or synchronous I/O that failed (such 926 if (new_size) {
790 * as ENOSPC) then part of the I/O may have been written to 927 struct iattr iattr;
791 * disk before the error occured. In this case the on-disk 928
792 * file size may have been adjusted beyond the in-memory file 929 iattr.ia_valid = ATTR_SIZE;
793 * size and now needs to be truncated back. 930 iattr.ia_size = new_size;
794 */ 931 error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK);
795 if (ip->i_d.di_size > ip->i_size)
796 ip->i_d.di_size = ip->i_size;
797 xfs_iunlock(ip, XFS_ILOCK_EXCL);
798 } 932 }
799 xfs_iunlock(ip, iolock); 933
800 out_unlock_mutex: 934out_unlock:
801 if (need_i_mutex) 935 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
802 mutex_unlock(&inode->i_mutex); 936 return error;
803 return -error;
804} 937}
805 938
939
806STATIC int 940STATIC int
807xfs_file_open( 941xfs_file_open(
808 struct inode *inode, 942 struct inode *inode,
@@ -921,6 +1055,7 @@ const struct file_operations xfs_file_operations = {
921 .open = xfs_file_open, 1055 .open = xfs_file_open,
922 .release = xfs_file_release, 1056 .release = xfs_file_release,
923 .fsync = xfs_file_fsync, 1057 .fsync = xfs_file_fsync,
1058 .fallocate = xfs_file_fallocate,
924}; 1059};
925 1060
926const struct file_operations xfs_dir_file_operations = { 1061const struct file_operations xfs_dir_file_operations = {
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index ad442d9e392e..0ca0e3c024d7 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -39,6 +39,7 @@
39#include "xfs_dfrag.h" 39#include "xfs_dfrag.h"
40#include "xfs_fsops.h" 40#include "xfs_fsops.h"
41#include "xfs_vnodeops.h" 41#include "xfs_vnodeops.h"
42#include "xfs_discard.h"
42#include "xfs_quota.h" 43#include "xfs_quota.h"
43#include "xfs_inode_item.h" 44#include "xfs_inode_item.h"
44#include "xfs_export.h" 45#include "xfs_export.h"
@@ -694,14 +695,19 @@ xfs_ioc_fsgeometry_v1(
694 xfs_mount_t *mp, 695 xfs_mount_t *mp,
695 void __user *arg) 696 void __user *arg)
696{ 697{
697 xfs_fsop_geom_v1_t fsgeo; 698 xfs_fsop_geom_t fsgeo;
698 int error; 699 int error;
699 700
700 error = xfs_fs_geometry(mp, (xfs_fsop_geom_t *)&fsgeo, 3); 701 error = xfs_fs_geometry(mp, &fsgeo, 3);
701 if (error) 702 if (error)
702 return -error; 703 return -error;
703 704
704 if (copy_to_user(arg, &fsgeo, sizeof(fsgeo))) 705 /*
706 * Caller should have passed an argument of type
707 * xfs_fsop_geom_v1_t. This is a proper subset of the
708 * xfs_fsop_geom_t that xfs_fs_geometry() fills in.
709 */
710 if (copy_to_user(arg, &fsgeo, sizeof(xfs_fsop_geom_v1_t)))
705 return -XFS_ERROR(EFAULT); 711 return -XFS_ERROR(EFAULT);
706 return 0; 712 return 0;
707} 713}
@@ -984,10 +990,22 @@ xfs_ioctl_setattr(
984 990
985 /* 991 /*
986 * Extent size must be a multiple of the appropriate block 992 * Extent size must be a multiple of the appropriate block
987 * size, if set at all. 993 * size, if set at all. It must also be smaller than the
994 * maximum extent size supported by the filesystem.
995 *
996 * Also, for non-realtime files, limit the extent size hint to
997 * half the size of the AGs in the filesystem so alignment
998 * doesn't result in extents larger than an AG.
988 */ 999 */
989 if (fa->fsx_extsize != 0) { 1000 if (fa->fsx_extsize != 0) {
990 xfs_extlen_t size; 1001 xfs_extlen_t size;
1002 xfs_fsblock_t extsize_fsb;
1003
1004 extsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_extsize);
1005 if (extsize_fsb > MAXEXTLEN) {
1006 code = XFS_ERROR(EINVAL);
1007 goto error_return;
1008 }
991 1009
992 if (XFS_IS_REALTIME_INODE(ip) || 1010 if (XFS_IS_REALTIME_INODE(ip) ||
993 ((mask & FSX_XFLAGS) && 1011 ((mask & FSX_XFLAGS) &&
@@ -996,6 +1014,10 @@ xfs_ioctl_setattr(
996 mp->m_sb.sb_blocklog; 1014 mp->m_sb.sb_blocklog;
997 } else { 1015 } else {
998 size = mp->m_sb.sb_blocksize; 1016 size = mp->m_sb.sb_blocksize;
1017 if (extsize_fsb > mp->m_sb.sb_agblocks / 2) {
1018 code = XFS_ERROR(EINVAL);
1019 goto error_return;
1020 }
999 } 1021 }
1000 1022
1001 if (fa->fsx_extsize % size) { 1023 if (fa->fsx_extsize % size) {
@@ -1294,6 +1316,8 @@ xfs_file_ioctl(
1294 trace_xfs_file_ioctl(ip); 1316 trace_xfs_file_ioctl(ip);
1295 1317
1296 switch (cmd) { 1318 switch (cmd) {
1319 case FITRIM:
1320 return xfs_ioc_trim(mp, arg);
1297 case XFS_IOC_ALLOCSP: 1321 case XFS_IOC_ALLOCSP:
1298 case XFS_IOC_FREESP: 1322 case XFS_IOC_FREESP:
1299 case XFS_IOC_RESVSP: 1323 case XFS_IOC_RESVSP:
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 94d5fd6a2973..bd5727852fd6 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -46,7 +46,6 @@
46#include <linux/namei.h> 46#include <linux/namei.h>
47#include <linux/posix_acl.h> 47#include <linux/posix_acl.h>
48#include <linux/security.h> 48#include <linux/security.h>
49#include <linux/falloc.h>
50#include <linux/fiemap.h> 49#include <linux/fiemap.h>
51#include <linux/slab.h> 50#include <linux/slab.h>
52 51
@@ -505,58 +504,6 @@ xfs_vn_setattr(
505 return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0); 504 return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0);
506} 505}
507 506
508STATIC long
509xfs_vn_fallocate(
510 struct inode *inode,
511 int mode,
512 loff_t offset,
513 loff_t len)
514{
515 long error;
516 loff_t new_size = 0;
517 xfs_flock64_t bf;
518 xfs_inode_t *ip = XFS_I(inode);
519
520 /* preallocation on directories not yet supported */
521 error = -ENODEV;
522 if (S_ISDIR(inode->i_mode))
523 goto out_error;
524
525 bf.l_whence = 0;
526 bf.l_start = offset;
527 bf.l_len = len;
528
529 xfs_ilock(ip, XFS_IOLOCK_EXCL);
530
531 /* check the new inode size is valid before allocating */
532 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
533 offset + len > i_size_read(inode)) {
534 new_size = offset + len;
535 error = inode_newsize_ok(inode, new_size);
536 if (error)
537 goto out_unlock;
538 }
539
540 error = -xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf,
541 0, XFS_ATTR_NOLOCK);
542 if (error)
543 goto out_unlock;
544
545 /* Change file size if needed */
546 if (new_size) {
547 struct iattr iattr;
548
549 iattr.ia_valid = ATTR_SIZE;
550 iattr.ia_size = new_size;
551 error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK);
552 }
553
554out_unlock:
555 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
556out_error:
557 return error;
558}
559
560#define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) 507#define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
561 508
562/* 509/*
@@ -650,7 +597,6 @@ static const struct inode_operations xfs_inode_operations = {
650 .getxattr = generic_getxattr, 597 .getxattr = generic_getxattr,
651 .removexattr = generic_removexattr, 598 .removexattr = generic_removexattr,
652 .listxattr = xfs_vn_listxattr, 599 .listxattr = xfs_vn_listxattr,
653 .fallocate = xfs_vn_fallocate,
654 .fiemap = xfs_vn_fiemap, 600 .fiemap = xfs_vn_fiemap,
655}; 601};
656 602
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index 214ddd71ff79..096494997747 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -37,7 +37,6 @@
37 37
38#include <kmem.h> 38#include <kmem.h>
39#include <mrlock.h> 39#include <mrlock.h>
40#include <sv.h>
41#include <time.h> 40#include <time.h>
42 41
43#include <support/debug.h> 42#include <support/debug.h>
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 064f964d4f3c..9731898083ae 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -606,7 +606,8 @@ xfs_blkdev_get(
606{ 606{
607 int error = 0; 607 int error = 0;
608 608
609 *bdevp = open_bdev_exclusive(name, FMODE_READ|FMODE_WRITE, mp); 609 *bdevp = blkdev_get_by_path(name, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
610 mp);
610 if (IS_ERR(*bdevp)) { 611 if (IS_ERR(*bdevp)) {
611 error = PTR_ERR(*bdevp); 612 error = PTR_ERR(*bdevp);
612 printk("XFS: Invalid device [%s], error=%d\n", name, error); 613 printk("XFS: Invalid device [%s], error=%d\n", name, error);
@@ -620,7 +621,7 @@ xfs_blkdev_put(
620 struct block_device *bdev) 621 struct block_device *bdev)
621{ 622{
622 if (bdev) 623 if (bdev)
623 close_bdev_exclusive(bdev, FMODE_READ|FMODE_WRITE); 624 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
624} 625}
625 626
626/* 627/*
@@ -834,8 +835,11 @@ xfsaild_wakeup(
834 struct xfs_ail *ailp, 835 struct xfs_ail *ailp,
835 xfs_lsn_t threshold_lsn) 836 xfs_lsn_t threshold_lsn)
836{ 837{
837 ailp->xa_target = threshold_lsn; 838 /* only ever move the target forwards */
838 wake_up_process(ailp->xa_task); 839 if (XFS_LSN_CMP(threshold_lsn, ailp->xa_target) > 0) {
840 ailp->xa_target = threshold_lsn;
841 wake_up_process(ailp->xa_task);
842 }
839} 843}
840 844
841STATIC int 845STATIC int
@@ -847,8 +851,17 @@ xfsaild(
847 long tout = 0; /* milliseconds */ 851 long tout = 0; /* milliseconds */
848 852
849 while (!kthread_should_stop()) { 853 while (!kthread_should_stop()) {
850 schedule_timeout_interruptible(tout ? 854 /*
851 msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT); 855 * for short sleeps indicating congestion, don't allow us to
856 * get woken early. Otherwise all we do is bang on the AIL lock
857 * without making progress.
858 */
859 if (tout && tout <= 20)
860 __set_current_state(TASK_KILLABLE);
861 else
862 __set_current_state(TASK_INTERRUPTIBLE);
863 schedule_timeout(tout ?
864 msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT);
852 865
853 /* swsusp */ 866 /* swsusp */
854 try_to_freeze(); 867 try_to_freeze();
@@ -935,7 +948,7 @@ out_reclaim:
935 * Slab object creation initialisation for the XFS inode. 948 * Slab object creation initialisation for the XFS inode.
936 * This covers only the idempotent fields in the XFS inode; 949 * This covers only the idempotent fields in the XFS inode;
937 * all other fields need to be initialised on allocation 950 * all other fields need to be initialised on allocation
938 * from the slab. This avoids the need to repeatedly intialise 951 * from the slab. This avoids the need to repeatedly initialise
939 * fields in the xfs inode that left in the initialise state 952 * fields in the xfs inode that left in the initialise state
940 * when freeing the inode. 953 * when freeing the inode.
941 */ 954 */
@@ -1118,6 +1131,8 @@ xfs_fs_evict_inode(
1118 */ 1131 */
1119 ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); 1132 ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
1120 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); 1133 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
1134 lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
1135 &xfs_iolock_reclaimable, "xfs_iolock_reclaimable");
1121 1136
1122 xfs_inactive(ip); 1137 xfs_inactive(ip);
1123} 1138}
@@ -1399,7 +1414,7 @@ xfs_fs_freeze(
1399 1414
1400 xfs_save_resvblks(mp); 1415 xfs_save_resvblks(mp);
1401 xfs_quiesce_attr(mp); 1416 xfs_quiesce_attr(mp);
1402 return -xfs_fs_log_dummy(mp, SYNC_WAIT); 1417 return -xfs_fs_log_dummy(mp);
1403} 1418}
1404 1419
1405STATIC int 1420STATIC int
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index afb0d7cfad1c..e22f0057d21f 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -53,14 +53,30 @@ xfs_inode_ag_walk_grab(
53{ 53{
54 struct inode *inode = VFS_I(ip); 54 struct inode *inode = VFS_I(ip);
55 55
56 ASSERT(rcu_read_lock_held());
57
58 /*
59 * check for stale RCU freed inode
60 *
61 * If the inode has been reallocated, it doesn't matter if it's not in
62 * the AG we are walking - we are walking for writeback, so if it
63 * passes all the "valid inode" checks and is dirty, then we'll write
 64 * it back anyway. If it has been reallocated and is still being
65 * initialised, the XFS_INEW check below will catch it.
66 */
67 spin_lock(&ip->i_flags_lock);
68 if (!ip->i_ino)
69 goto out_unlock_noent;
70
71 /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
72 if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
73 goto out_unlock_noent;
74 spin_unlock(&ip->i_flags_lock);
75
56 /* nothing to sync during shutdown */ 76 /* nothing to sync during shutdown */
57 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 77 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
58 return EFSCORRUPTED; 78 return EFSCORRUPTED;
59 79
60 /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
61 if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
62 return ENOENT;
63
64 /* If we can't grab the inode, it must on it's way to reclaim. */ 80 /* If we can't grab the inode, it must on it's way to reclaim. */
65 if (!igrab(inode)) 81 if (!igrab(inode))
66 return ENOENT; 82 return ENOENT;
@@ -72,6 +88,10 @@ xfs_inode_ag_walk_grab(
72 88
73 /* inode is valid */ 89 /* inode is valid */
74 return 0; 90 return 0;
91
92out_unlock_noent:
93 spin_unlock(&ip->i_flags_lock);
94 return ENOENT;
75} 95}
76 96
77STATIC int 97STATIC int
@@ -98,12 +118,12 @@ restart:
98 int error = 0; 118 int error = 0;
99 int i; 119 int i;
100 120
101 read_lock(&pag->pag_ici_lock); 121 rcu_read_lock();
102 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, 122 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
103 (void **)batch, first_index, 123 (void **)batch, first_index,
104 XFS_LOOKUP_BATCH); 124 XFS_LOOKUP_BATCH);
105 if (!nr_found) { 125 if (!nr_found) {
106 read_unlock(&pag->pag_ici_lock); 126 rcu_read_unlock();
107 break; 127 break;
108 } 128 }
109 129
@@ -118,18 +138,26 @@ restart:
118 batch[i] = NULL; 138 batch[i] = NULL;
119 139
120 /* 140 /*
121 * Update the index for the next lookup. Catch overflows 141 * Update the index for the next lookup. Catch
122 * into the next AG range which can occur if we have inodes 142 * overflows into the next AG range which can occur if
123 * in the last block of the AG and we are currently 143 * we have inodes in the last block of the AG and we
124 * pointing to the last inode. 144 * are currently pointing to the last inode.
145 *
146 * Because we may see inodes that are from the wrong AG
147 * due to RCU freeing and reallocation, only update the
 148 * index if it lies in this AG. It was a race that led
149 * us to see this inode, so another lookup from the
150 * same index will not find it again.
125 */ 151 */
152 if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
153 continue;
126 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); 154 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
127 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) 155 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
128 done = 1; 156 done = 1;
129 } 157 }
130 158
131 /* unlock now we've grabbed the inodes. */ 159 /* unlock now we've grabbed the inodes. */
132 read_unlock(&pag->pag_ici_lock); 160 rcu_read_unlock();
133 161
134 for (i = 0; i < nr_found; i++) { 162 for (i = 0; i < nr_found; i++) {
135 if (!batch[i]) 163 if (!batch[i])
@@ -334,7 +362,7 @@ xfs_quiesce_data(
334 362
335 /* mark the log as covered if needed */ 363 /* mark the log as covered if needed */
336 if (xfs_log_need_covered(mp)) 364 if (xfs_log_need_covered(mp))
337 error2 = xfs_fs_log_dummy(mp, SYNC_WAIT); 365 error2 = xfs_fs_log_dummy(mp);
338 366
339 /* flush data-only devices */ 367 /* flush data-only devices */
340 if (mp->m_rtdev_targp) 368 if (mp->m_rtdev_targp)
@@ -475,13 +503,14 @@ xfs_sync_worker(
475 int error; 503 int error;
476 504
477 if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { 505 if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
478 xfs_log_force(mp, 0);
479 xfs_reclaim_inodes(mp, 0);
480 /* dgc: errors ignored here */ 506 /* dgc: errors ignored here */
481 error = xfs_qm_sync(mp, SYNC_TRYLOCK);
482 if (mp->m_super->s_frozen == SB_UNFROZEN && 507 if (mp->m_super->s_frozen == SB_UNFROZEN &&
483 xfs_log_need_covered(mp)) 508 xfs_log_need_covered(mp))
484 error = xfs_fs_log_dummy(mp, 0); 509 error = xfs_fs_log_dummy(mp);
510 else
511 xfs_log_force(mp, 0);
512 xfs_reclaim_inodes(mp, 0);
513 error = xfs_qm_sync(mp, SYNC_TRYLOCK);
485 } 514 }
486 mp->m_sync_seq++; 515 mp->m_sync_seq++;
487 wake_up(&mp->m_wait_single_sync_task); 516 wake_up(&mp->m_wait_single_sync_task);
@@ -592,12 +621,12 @@ xfs_inode_set_reclaim_tag(
592 struct xfs_perag *pag; 621 struct xfs_perag *pag;
593 622
594 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); 623 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
595 write_lock(&pag->pag_ici_lock); 624 spin_lock(&pag->pag_ici_lock);
596 spin_lock(&ip->i_flags_lock); 625 spin_lock(&ip->i_flags_lock);
597 __xfs_inode_set_reclaim_tag(pag, ip); 626 __xfs_inode_set_reclaim_tag(pag, ip);
598 __xfs_iflags_set(ip, XFS_IRECLAIMABLE); 627 __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
599 spin_unlock(&ip->i_flags_lock); 628 spin_unlock(&ip->i_flags_lock);
600 write_unlock(&pag->pag_ici_lock); 629 spin_unlock(&pag->pag_ici_lock);
601 xfs_perag_put(pag); 630 xfs_perag_put(pag);
602} 631}
603 632
@@ -639,9 +668,14 @@ xfs_reclaim_inode_grab(
639 struct xfs_inode *ip, 668 struct xfs_inode *ip,
640 int flags) 669 int flags)
641{ 670{
671 ASSERT(rcu_read_lock_held());
672
673 /* quick check for stale RCU freed inode */
674 if (!ip->i_ino)
675 return 1;
642 676
643 /* 677 /*
644 * do some unlocked checks first to avoid unnecceary lock traffic. 678 * do some unlocked checks first to avoid unnecessary lock traffic.
645 * The first is a flush lock check, the second is a already in reclaim 679 * The first is a flush lock check, the second is a already in reclaim
646 * check. Only do these checks if we are not going to block on locks. 680 * check. Only do these checks if we are not going to block on locks.
647 */ 681 */
@@ -654,11 +688,16 @@ xfs_reclaim_inode_grab(
654 * The radix tree lock here protects a thread in xfs_iget from racing 688 * The radix tree lock here protects a thread in xfs_iget from racing
655 * with us starting reclaim on the inode. Once we have the 689 * with us starting reclaim on the inode. Once we have the
656 * XFS_IRECLAIM flag set it will not touch us. 690 * XFS_IRECLAIM flag set it will not touch us.
691 *
692 * Due to RCU lookup, we may find inodes that have been freed and only
693 * have XFS_IRECLAIM set. Indeed, we may see reallocated inodes that
 694 * aren't candidates for reclaim at all, so we must check that
695 * XFS_IRECLAIMABLE is set first before proceeding to reclaim.
657 */ 696 */
658 spin_lock(&ip->i_flags_lock); 697 spin_lock(&ip->i_flags_lock);
659 ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE)); 698 if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
660 if (__xfs_iflags_test(ip, XFS_IRECLAIM)) { 699 __xfs_iflags_test(ip, XFS_IRECLAIM)) {
661 /* ignore as it is already under reclaim */ 700 /* not a reclaim candidate. */
662 spin_unlock(&ip->i_flags_lock); 701 spin_unlock(&ip->i_flags_lock);
663 return 1; 702 return 1;
664 } 703 }
@@ -795,12 +834,12 @@ reclaim:
795 * added to the tree assert that it's been there before to catch 834 * added to the tree assert that it's been there before to catch
796 * problems with the inode life time early on. 835 * problems with the inode life time early on.
797 */ 836 */
798 write_lock(&pag->pag_ici_lock); 837 spin_lock(&pag->pag_ici_lock);
799 if (!radix_tree_delete(&pag->pag_ici_root, 838 if (!radix_tree_delete(&pag->pag_ici_root,
800 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino))) 839 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino)))
801 ASSERT(0); 840 ASSERT(0);
802 __xfs_inode_clear_reclaim(pag, ip); 841 __xfs_inode_clear_reclaim(pag, ip);
803 write_unlock(&pag->pag_ici_lock); 842 spin_unlock(&pag->pag_ici_lock);
804 843
805 /* 844 /*
806 * Here we do an (almost) spurious inode lock in order to coordinate 845 * Here we do an (almost) spurious inode lock in order to coordinate
@@ -864,14 +903,14 @@ restart:
864 struct xfs_inode *batch[XFS_LOOKUP_BATCH]; 903 struct xfs_inode *batch[XFS_LOOKUP_BATCH];
865 int i; 904 int i;
866 905
867 write_lock(&pag->pag_ici_lock); 906 rcu_read_lock();
868 nr_found = radix_tree_gang_lookup_tag( 907 nr_found = radix_tree_gang_lookup_tag(
869 &pag->pag_ici_root, 908 &pag->pag_ici_root,
870 (void **)batch, first_index, 909 (void **)batch, first_index,
871 XFS_LOOKUP_BATCH, 910 XFS_LOOKUP_BATCH,
872 XFS_ICI_RECLAIM_TAG); 911 XFS_ICI_RECLAIM_TAG);
873 if (!nr_found) { 912 if (!nr_found) {
874 write_unlock(&pag->pag_ici_lock); 913 rcu_read_unlock();
875 break; 914 break;
876 } 915 }
877 916
@@ -891,14 +930,24 @@ restart:
891 * occur if we have inodes in the last block of 930 * occur if we have inodes in the last block of
892 * the AG and we are currently pointing to the 931 * the AG and we are currently pointing to the
893 * last inode. 932 * last inode.
933 *
934 * Because we may see inodes that are from the
935 * wrong AG due to RCU freeing and
936 * reallocation, only update the index if it
 937 * lies in this AG. It was a race that led us
938 * to see this inode, so another lookup from
939 * the same index will not find it again.
894 */ 940 */
941 if (XFS_INO_TO_AGNO(mp, ip->i_ino) !=
942 pag->pag_agno)
943 continue;
895 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); 944 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
896 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) 945 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
897 done = 1; 946 done = 1;
898 } 947 }
899 948
900 /* unlock now we've grabbed the inodes. */ 949 /* unlock now we've grabbed the inodes. */
901 write_unlock(&pag->pag_ici_lock); 950 rcu_read_unlock();
902 951
903 for (i = 0; i < nr_found; i++) { 952 for (i = 0; i < nr_found; i++) {
904 if (!batch[i]) 953 if (!batch[i])
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c
index 7bb5092d6ae4..ee3cee097e7e 100644
--- a/fs/xfs/linux-2.6/xfs_sysctl.c
+++ b/fs/xfs/linux-2.6/xfs_sysctl.c
@@ -18,6 +18,7 @@
18#include "xfs.h" 18#include "xfs.h"
19#include <linux/sysctl.h> 19#include <linux/sysctl.h>
20#include <linux/proc_fs.h> 20#include <linux/proc_fs.h>
21#include "xfs_error.h"
21 22
22static struct ctl_table_header *xfs_table_header; 23static struct ctl_table_header *xfs_table_header;
23 24
@@ -51,6 +52,26 @@ xfs_stats_clear_proc_handler(
51 52
52 return ret; 53 return ret;
53} 54}
55
56STATIC int
57xfs_panic_mask_proc_handler(
58 ctl_table *ctl,
59 int write,
60 void __user *buffer,
61 size_t *lenp,
62 loff_t *ppos)
63{
64 int ret, *valp = ctl->data;
65
66 ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
67 if (!ret && write) {
68 xfs_panic_mask = *valp;
69#ifdef DEBUG
70 xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES);
71#endif
72 }
73 return ret;
74}
54#endif /* CONFIG_PROC_FS */ 75#endif /* CONFIG_PROC_FS */
55 76
56static ctl_table xfs_table[] = { 77static ctl_table xfs_table[] = {
@@ -77,7 +98,7 @@ static ctl_table xfs_table[] = {
77 .data = &xfs_params.panic_mask.val, 98 .data = &xfs_params.panic_mask.val,
78 .maxlen = sizeof(int), 99 .maxlen = sizeof(int),
79 .mode = 0644, 100 .mode = 0644,
80 .proc_handler = proc_dointvec_minmax, 101 .proc_handler = xfs_panic_mask_proc_handler,
81 .extra1 = &xfs_params.panic_mask.min, 102 .extra1 = &xfs_params.panic_mask.min,
82 .extra2 = &xfs_params.panic_mask.max 103 .extra2 = &xfs_params.panic_mask.max
83 }, 104 },
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index acef2e98c594..2d0bcb479075 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -766,8 +766,8 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
766 __field(int, curr_res) 766 __field(int, curr_res)
767 __field(int, unit_res) 767 __field(int, unit_res)
768 __field(unsigned int, flags) 768 __field(unsigned int, flags)
769 __field(void *, reserve_headq) 769 __field(int, reserveq)
770 __field(void *, write_headq) 770 __field(int, writeq)
771 __field(int, grant_reserve_cycle) 771 __field(int, grant_reserve_cycle)
772 __field(int, grant_reserve_bytes) 772 __field(int, grant_reserve_bytes)
773 __field(int, grant_write_cycle) 773 __field(int, grant_write_cycle)
@@ -784,19 +784,21 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
784 __entry->curr_res = tic->t_curr_res; 784 __entry->curr_res = tic->t_curr_res;
785 __entry->unit_res = tic->t_unit_res; 785 __entry->unit_res = tic->t_unit_res;
786 __entry->flags = tic->t_flags; 786 __entry->flags = tic->t_flags;
787 __entry->reserve_headq = log->l_reserve_headq; 787 __entry->reserveq = list_empty(&log->l_reserveq);
788 __entry->write_headq = log->l_write_headq; 788 __entry->writeq = list_empty(&log->l_writeq);
789 __entry->grant_reserve_cycle = log->l_grant_reserve_cycle; 789 xlog_crack_grant_head(&log->l_grant_reserve_head,
790 __entry->grant_reserve_bytes = log->l_grant_reserve_bytes; 790 &__entry->grant_reserve_cycle,
791 __entry->grant_write_cycle = log->l_grant_write_cycle; 791 &__entry->grant_reserve_bytes);
792 __entry->grant_write_bytes = log->l_grant_write_bytes; 792 xlog_crack_grant_head(&log->l_grant_write_head,
793 &__entry->grant_write_cycle,
794 &__entry->grant_write_bytes);
793 __entry->curr_cycle = log->l_curr_cycle; 795 __entry->curr_cycle = log->l_curr_cycle;
794 __entry->curr_block = log->l_curr_block; 796 __entry->curr_block = log->l_curr_block;
795 __entry->tail_lsn = log->l_tail_lsn; 797 __entry->tail_lsn = atomic64_read(&log->l_tail_lsn);
796 ), 798 ),
797 TP_printk("dev %d:%d type %s t_ocnt %u t_cnt %u t_curr_res %u " 799 TP_printk("dev %d:%d type %s t_ocnt %u t_cnt %u t_curr_res %u "
798 "t_unit_res %u t_flags %s reserve_headq 0x%p " 800 "t_unit_res %u t_flags %s reserveq %s "
799 "write_headq 0x%p grant_reserve_cycle %d " 801 "writeq %s grant_reserve_cycle %d "
800 "grant_reserve_bytes %d grant_write_cycle %d " 802 "grant_reserve_bytes %d grant_write_cycle %d "
801 "grant_write_bytes %d curr_cycle %d curr_block %d " 803 "grant_write_bytes %d curr_cycle %d curr_block %d "
802 "tail_cycle %d tail_block %d", 804 "tail_cycle %d tail_block %d",
@@ -807,8 +809,8 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
807 __entry->curr_res, 809 __entry->curr_res,
808 __entry->unit_res, 810 __entry->unit_res,
809 __print_flags(__entry->flags, "|", XLOG_TIC_FLAGS), 811 __print_flags(__entry->flags, "|", XLOG_TIC_FLAGS),
810 __entry->reserve_headq, 812 __entry->reserveq ? "empty" : "active",
811 __entry->write_headq, 813 __entry->writeq ? "empty" : "active",
812 __entry->grant_reserve_cycle, 814 __entry->grant_reserve_cycle,
813 __entry->grant_reserve_bytes, 815 __entry->grant_reserve_bytes,
814 __entry->grant_write_cycle, 816 __entry->grant_write_cycle,
@@ -835,6 +837,7 @@ DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep1);
835DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake1); 837DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake1);
836DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep2); 838DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep2);
837DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake2); 839DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake2);
840DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake_up);
838DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_enter); 841DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_enter);
839DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_exit); 842DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_exit);
840DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_error); 843DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_error);
@@ -842,6 +845,7 @@ DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep1);
842DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake1); 845DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake1);
843DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep2); 846DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep2);
844DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake2); 847DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake2);
848DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake_up);
845DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter); 849DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter);
846DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit); 850DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit);
847DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub); 851DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub);
@@ -935,10 +939,10 @@ DEFINE_PAGE_EVENT(xfs_writepage);
935DEFINE_PAGE_EVENT(xfs_releasepage); 939DEFINE_PAGE_EVENT(xfs_releasepage);
936DEFINE_PAGE_EVENT(xfs_invalidatepage); 940DEFINE_PAGE_EVENT(xfs_invalidatepage);
937 941
938DECLARE_EVENT_CLASS(xfs_iomap_class, 942DECLARE_EVENT_CLASS(xfs_imap_class,
939 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, 943 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
940 int flags, struct xfs_bmbt_irec *irec), 944 int type, struct xfs_bmbt_irec *irec),
941 TP_ARGS(ip, offset, count, flags, irec), 945 TP_ARGS(ip, offset, count, type, irec),
942 TP_STRUCT__entry( 946 TP_STRUCT__entry(
943 __field(dev_t, dev) 947 __field(dev_t, dev)
944 __field(xfs_ino_t, ino) 948 __field(xfs_ino_t, ino)
@@ -946,7 +950,7 @@ DECLARE_EVENT_CLASS(xfs_iomap_class,
946 __field(loff_t, new_size) 950 __field(loff_t, new_size)
947 __field(loff_t, offset) 951 __field(loff_t, offset)
948 __field(size_t, count) 952 __field(size_t, count)
949 __field(int, flags) 953 __field(int, type)
950 __field(xfs_fileoff_t, startoff) 954 __field(xfs_fileoff_t, startoff)
951 __field(xfs_fsblock_t, startblock) 955 __field(xfs_fsblock_t, startblock)
952 __field(xfs_filblks_t, blockcount) 956 __field(xfs_filblks_t, blockcount)
@@ -958,13 +962,13 @@ DECLARE_EVENT_CLASS(xfs_iomap_class,
958 __entry->new_size = ip->i_new_size; 962 __entry->new_size = ip->i_new_size;
959 __entry->offset = offset; 963 __entry->offset = offset;
960 __entry->count = count; 964 __entry->count = count;
961 __entry->flags = flags; 965 __entry->type = type;
962 __entry->startoff = irec ? irec->br_startoff : 0; 966 __entry->startoff = irec ? irec->br_startoff : 0;
963 __entry->startblock = irec ? irec->br_startblock : 0; 967 __entry->startblock = irec ? irec->br_startblock : 0;
964 __entry->blockcount = irec ? irec->br_blockcount : 0; 968 __entry->blockcount = irec ? irec->br_blockcount : 0;
965 ), 969 ),
966 TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " 970 TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
967 "offset 0x%llx count %zd flags %s " 971 "offset 0x%llx count %zd type %s "
968 "startoff 0x%llx startblock %lld blockcount 0x%llx", 972 "startoff 0x%llx startblock %lld blockcount 0x%llx",
969 MAJOR(__entry->dev), MINOR(__entry->dev), 973 MAJOR(__entry->dev), MINOR(__entry->dev),
970 __entry->ino, 974 __entry->ino,
@@ -972,20 +976,21 @@ DECLARE_EVENT_CLASS(xfs_iomap_class,
972 __entry->new_size, 976 __entry->new_size,
973 __entry->offset, 977 __entry->offset,
974 __entry->count, 978 __entry->count,
975 __print_flags(__entry->flags, "|", BMAPI_FLAGS), 979 __print_symbolic(__entry->type, XFS_IO_TYPES),
976 __entry->startoff, 980 __entry->startoff,
977 (__int64_t)__entry->startblock, 981 (__int64_t)__entry->startblock,
978 __entry->blockcount) 982 __entry->blockcount)
979) 983)
980 984
981#define DEFINE_IOMAP_EVENT(name) \ 985#define DEFINE_IOMAP_EVENT(name) \
982DEFINE_EVENT(xfs_iomap_class, name, \ 986DEFINE_EVENT(xfs_imap_class, name, \
983 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \ 987 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \
984 int flags, struct xfs_bmbt_irec *irec), \ 988 int type, struct xfs_bmbt_irec *irec), \
985 TP_ARGS(ip, offset, count, flags, irec)) 989 TP_ARGS(ip, offset, count, type, irec))
986DEFINE_IOMAP_EVENT(xfs_iomap_enter); 990DEFINE_IOMAP_EVENT(xfs_map_blocks_found);
987DEFINE_IOMAP_EVENT(xfs_iomap_found); 991DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc);
988DEFINE_IOMAP_EVENT(xfs_iomap_alloc); 992DEFINE_IOMAP_EVENT(xfs_get_blocks_found);
993DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc);
989 994
990DECLARE_EVENT_CLASS(xfs_simple_io_class, 995DECLARE_EVENT_CLASS(xfs_simple_io_class,
991 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), 996 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
@@ -1022,6 +1027,7 @@ DEFINE_EVENT(xfs_simple_io_class, name, \
1022 TP_ARGS(ip, offset, count)) 1027 TP_ARGS(ip, offset, count))
1023DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc); 1028DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc);
1024DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert); 1029DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert);
1030DEFINE_SIMPLE_IO_EVENT(xfs_get_blocks_notfound);
1025 1031
1026 1032
1027TRACE_EVENT(xfs_itruncate_start, 1033TRACE_EVENT(xfs_itruncate_start,
@@ -1420,6 +1426,7 @@ DEFINE_EVENT(xfs_alloc_class, name, \
1420 TP_PROTO(struct xfs_alloc_arg *args), \ 1426 TP_PROTO(struct xfs_alloc_arg *args), \
1421 TP_ARGS(args)) 1427 TP_ARGS(args))
1422DEFINE_ALLOC_EVENT(xfs_alloc_exact_done); 1428DEFINE_ALLOC_EVENT(xfs_alloc_exact_done);
1429DEFINE_ALLOC_EVENT(xfs_alloc_exact_notfound);
1423DEFINE_ALLOC_EVENT(xfs_alloc_exact_error); 1430DEFINE_ALLOC_EVENT(xfs_alloc_exact_error);
1424DEFINE_ALLOC_EVENT(xfs_alloc_near_nominleft); 1431DEFINE_ALLOC_EVENT(xfs_alloc_near_nominleft);
1425DEFINE_ALLOC_EVENT(xfs_alloc_near_first); 1432DEFINE_ALLOC_EVENT(xfs_alloc_near_first);
@@ -1752,6 +1759,39 @@ DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_recover);
1752DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_cancel); 1759DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_cancel);
1753DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_skip); 1760DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_skip);
1754 1761
1762DECLARE_EVENT_CLASS(xfs_discard_class,
1763 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
1764 xfs_agblock_t agbno, xfs_extlen_t len),
1765 TP_ARGS(mp, agno, agbno, len),
1766 TP_STRUCT__entry(
1767 __field(dev_t, dev)
1768 __field(xfs_agnumber_t, agno)
1769 __field(xfs_agblock_t, agbno)
1770 __field(xfs_extlen_t, len)
1771 ),
1772 TP_fast_assign(
1773 __entry->dev = mp->m_super->s_dev;
1774 __entry->agno = agno;
1775 __entry->agbno = agbno;
1776 __entry->len = len;
1777 ),
1778 TP_printk("dev %d:%d agno %u agbno %u len %u\n",
1779 MAJOR(__entry->dev), MINOR(__entry->dev),
1780 __entry->agno,
1781 __entry->agbno,
1782 __entry->len)
1783)
1784
1785#define DEFINE_DISCARD_EVENT(name) \
1786DEFINE_EVENT(xfs_discard_class, name, \
1787 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
1788 xfs_agblock_t agbno, xfs_extlen_t len), \
1789 TP_ARGS(mp, agno, agbno, len))
1790DEFINE_DISCARD_EVENT(xfs_discard_extent);
1791DEFINE_DISCARD_EVENT(xfs_discard_toosmall);
1792DEFINE_DISCARD_EVENT(xfs_discard_exclude);
1793DEFINE_DISCARD_EVENT(xfs_discard_busy);
1794
1755#endif /* _TRACE_XFS_H */ 1795#endif /* _TRACE_XFS_H */
1756 1796
1757#undef TRACE_INCLUDE_PATH 1797#undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index faf8e1a83a12..d22aa3103106 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -149,7 +149,6 @@ xfs_qm_dqdestroy(
149 ASSERT(list_empty(&dqp->q_freelist)); 149 ASSERT(list_empty(&dqp->q_freelist));
150 150
151 mutex_destroy(&dqp->q_qlock); 151 mutex_destroy(&dqp->q_qlock);
152 sv_destroy(&dqp->q_pinwait);
153 kmem_zone_free(xfs_Gqm->qm_dqzone, dqp); 152 kmem_zone_free(xfs_Gqm->qm_dqzone, dqp);
154 153
155 atomic_dec(&xfs_Gqm->qm_totaldquots); 154 atomic_dec(&xfs_Gqm->qm_totaldquots);
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index f8e854b4fde8..206a2815ced6 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -1863,12 +1863,14 @@ xfs_qm_dqreclaim_one(void)
1863 xfs_dquot_t *dqpout; 1863 xfs_dquot_t *dqpout;
1864 xfs_dquot_t *dqp; 1864 xfs_dquot_t *dqp;
1865 int restarts; 1865 int restarts;
1866 int startagain;
1866 1867
1867 restarts = 0; 1868 restarts = 0;
1868 dqpout = NULL; 1869 dqpout = NULL;
1869 1870
1870 /* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */ 1871 /* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */
1871startagain: 1872again:
1873 startagain = 0;
1872 mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); 1874 mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
1873 1875
1874 list_for_each_entry(dqp, &xfs_Gqm->qm_dqfrlist, q_freelist) { 1876 list_for_each_entry(dqp, &xfs_Gqm->qm_dqfrlist, q_freelist) {
@@ -1885,13 +1887,10 @@ startagain:
1885 ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE)); 1887 ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE));
1886 1888
1887 trace_xfs_dqreclaim_want(dqp); 1889 trace_xfs_dqreclaim_want(dqp);
1888
1889 xfs_dqunlock(dqp);
1890 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
1891 if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
1892 return NULL;
1893 XQM_STATS_INC(xqmstats.xs_qm_dqwants); 1890 XQM_STATS_INC(xqmstats.xs_qm_dqwants);
1894 goto startagain; 1891 restarts++;
1892 startagain = 1;
1893 goto dqunlock;
1895 } 1894 }
1896 1895
1897 /* 1896 /*
@@ -1906,23 +1905,20 @@ startagain:
1906 ASSERT(list_empty(&dqp->q_mplist)); 1905 ASSERT(list_empty(&dqp->q_mplist));
1907 list_del_init(&dqp->q_freelist); 1906 list_del_init(&dqp->q_freelist);
1908 xfs_Gqm->qm_dqfrlist_cnt--; 1907 xfs_Gqm->qm_dqfrlist_cnt--;
1909 xfs_dqunlock(dqp);
1910 dqpout = dqp; 1908 dqpout = dqp;
1911 XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims); 1909 XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims);
1912 break; 1910 goto dqunlock;
1913 } 1911 }
1914 1912
1915 ASSERT(dqp->q_hash); 1913 ASSERT(dqp->q_hash);
1916 ASSERT(!list_empty(&dqp->q_mplist)); 1914 ASSERT(!list_empty(&dqp->q_mplist));
1917 1915
1918 /* 1916 /*
1919 * Try to grab the flush lock. If this dquot is in the process of 1917 * Try to grab the flush lock. If this dquot is in the process
1920 * getting flushed to disk, we don't want to reclaim it. 1918 * of getting flushed to disk, we don't want to reclaim it.
1921 */ 1919 */
1922 if (!xfs_dqflock_nowait(dqp)) { 1920 if (!xfs_dqflock_nowait(dqp))
1923 xfs_dqunlock(dqp); 1921 goto dqunlock;
1924 continue;
1925 }
1926 1922
1927 /* 1923 /*
1928 * We have the flush lock so we know that this is not in the 1924 * We have the flush lock so we know that this is not in the
@@ -1944,8 +1940,7 @@ startagain:
1944 xfs_fs_cmn_err(CE_WARN, mp, 1940 xfs_fs_cmn_err(CE_WARN, mp,
1945 "xfs_qm_dqreclaim: dquot %p flush failed", dqp); 1941 "xfs_qm_dqreclaim: dquot %p flush failed", dqp);
1946 } 1942 }
1947 xfs_dqunlock(dqp); /* dqflush unlocks dqflock */ 1943 goto dqunlock;
1948 continue;
1949 } 1944 }
1950 1945
1951 /* 1946 /*
@@ -1967,13 +1962,8 @@ startagain:
1967 */ 1962 */
1968 if (!mutex_trylock(&mp->m_quotainfo->qi_dqlist_lock)) { 1963 if (!mutex_trylock(&mp->m_quotainfo->qi_dqlist_lock)) {
1969 restarts++; 1964 restarts++;
1970 mutex_unlock(&dqp->q_hash->qh_lock); 1965 startagain = 1;
1971 xfs_dqfunlock(dqp); 1966 goto qhunlock;
1972 xfs_dqunlock(dqp);
1973 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
1974 if (restarts++ >= XFS_QM_RECLAIM_MAX_RESTARTS)
1975 return NULL;
1976 goto startagain;
1977 } 1967 }
1978 1968
1979 ASSERT(dqp->q_nrefs == 0); 1969 ASSERT(dqp->q_nrefs == 0);
@@ -1986,14 +1976,20 @@ startagain:
1986 xfs_Gqm->qm_dqfrlist_cnt--; 1976 xfs_Gqm->qm_dqfrlist_cnt--;
1987 dqpout = dqp; 1977 dqpout = dqp;
1988 mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock); 1978 mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock);
1979qhunlock:
1989 mutex_unlock(&dqp->q_hash->qh_lock); 1980 mutex_unlock(&dqp->q_hash->qh_lock);
1990dqfunlock: 1981dqfunlock:
1991 xfs_dqfunlock(dqp); 1982 xfs_dqfunlock(dqp);
1983dqunlock:
1992 xfs_dqunlock(dqp); 1984 xfs_dqunlock(dqp);
1993 if (dqpout) 1985 if (dqpout)
1994 break; 1986 break;
1995 if (restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) 1987 if (restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
1996 return NULL; 1988 break;
1989 if (startagain) {
1990 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
1991 goto again;
1992 }
1997 } 1993 }
1998 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); 1994 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
1999 return dqpout; 1995 return dqpout;
diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c
index 975aa10e1a47..0df88897ef84 100644
--- a/fs/xfs/support/debug.c
+++ b/fs/xfs/support/debug.c
@@ -25,86 +25,78 @@
25#include "xfs_mount.h" 25#include "xfs_mount.h"
26#include "xfs_error.h" 26#include "xfs_error.h"
27 27
28static char message[1024]; /* keep it off the stack */
29static DEFINE_SPINLOCK(xfs_err_lock);
30
31/* Translate from CE_FOO to KERN_FOO, err_level(CE_FOO) == KERN_FOO */
32#define XFS_MAX_ERR_LEVEL 7
33#define XFS_ERR_MASK ((1 << 3) - 1)
34static const char * const err_level[XFS_MAX_ERR_LEVEL+1] =
35 {KERN_EMERG, KERN_ALERT, KERN_CRIT,
36 KERN_ERR, KERN_WARNING, KERN_NOTICE,
37 KERN_INFO, KERN_DEBUG};
38
39void 28void
40cmn_err(register int level, char *fmt, ...) 29cmn_err(
30 const char *lvl,
31 const char *fmt,
32 ...)
41{ 33{
42 char *fp = fmt; 34 struct va_format vaf;
43 int len; 35 va_list args;
44 ulong flags; 36
45 va_list ap; 37 va_start(args, fmt);
46 38 vaf.fmt = fmt;
47 level &= XFS_ERR_MASK; 39 vaf.va = &args;
48 if (level > XFS_MAX_ERR_LEVEL) 40
49 level = XFS_MAX_ERR_LEVEL; 41 printk("%s%pV", lvl, &vaf);
50 spin_lock_irqsave(&xfs_err_lock,flags); 42 va_end(args);
51 va_start(ap, fmt); 43
52 if (*fmt == '!') fp++; 44 BUG_ON(strncmp(lvl, KERN_EMERG, strlen(KERN_EMERG)) == 0);
53 len = vsnprintf(message, sizeof(message), fp, ap);
54 if (len >= sizeof(message))
55 len = sizeof(message) - 1;
56 if (message[len-1] == '\n')
57 message[len-1] = 0;
58 printk("%s%s\n", err_level[level], message);
59 va_end(ap);
60 spin_unlock_irqrestore(&xfs_err_lock,flags);
61 BUG_ON(level == CE_PANIC);
62} 45}
63 46
64void 47void
65xfs_fs_vcmn_err( 48xfs_fs_cmn_err(
66 int level, 49 const char *lvl,
67 struct xfs_mount *mp, 50 struct xfs_mount *mp,
68 char *fmt, 51 const char *fmt,
69 va_list ap) 52 ...)
70{ 53{
71 unsigned long flags; 54 struct va_format vaf;
72 int len = 0; 55 va_list args;
73 56
74 level &= XFS_ERR_MASK; 57 va_start(args, fmt);
75 if (level > XFS_MAX_ERR_LEVEL) 58 vaf.fmt = fmt;
76 level = XFS_MAX_ERR_LEVEL; 59 vaf.va = &args;
77 60
78 spin_lock_irqsave(&xfs_err_lock,flags); 61 printk("%sFilesystem %s: %pV", lvl, mp->m_fsname, &vaf);
62 va_end(args);
79 63
80 if (mp) { 64 BUG_ON(strncmp(lvl, KERN_EMERG, strlen(KERN_EMERG)) == 0);
81 len = sprintf(message, "Filesystem \"%s\": ", mp->m_fsname); 65}
66
67/* All callers to xfs_cmn_err use CE_ALERT, so don't bother testing lvl */
68void
69xfs_cmn_err(
70 int panic_tag,
71 const char *lvl,
72 struct xfs_mount *mp,
73 const char *fmt,
74 ...)
75{
76 struct va_format vaf;
77 va_list args;
78 int do_panic = 0;
82 79
83 /* 80 if (xfs_panic_mask && (xfs_panic_mask & panic_tag)) {
84 * Skip the printk if we can't print anything useful 81 printk(KERN_ALERT "XFS: Transforming an alert into a BUG.");
85 * due to an over-long device name. 82 do_panic = 1;
86 */
87 if (len >= sizeof(message))
88 goto out;
89 } 83 }
90 84
91 len = vsnprintf(message + len, sizeof(message) - len, fmt, ap); 85 va_start(args, fmt);
92 if (len >= sizeof(message)) 86 vaf.fmt = fmt;
93 len = sizeof(message) - 1; 87 vaf.va = &args;
94 if (message[len-1] == '\n')
95 message[len-1] = 0;
96 88
97 printk("%s%s\n", err_level[level], message); 89 printk(KERN_ALERT "Filesystem %s: %pV", mp->m_fsname, &vaf);
98 out: 90 va_end(args);
99 spin_unlock_irqrestore(&xfs_err_lock,flags);
100 91
101 BUG_ON(level == CE_PANIC); 92 BUG_ON(do_panic);
102} 93}
103 94
104void 95void
105assfail(char *expr, char *file, int line) 96assfail(char *expr, char *file, int line)
106{ 97{
107 printk("Assertion failed: %s, file: %s, line: %d\n", expr, file, line); 98 printk(KERN_CRIT "Assertion failed: %s, file: %s, line: %d\n", expr,
99 file, line);
108 BUG(); 100 BUG();
109} 101}
110 102
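
The rewritten cmn_err()/xfs_fs_cmn_err() above drop the shared 1k static buffer and spinlock by handing the caller's format string and arguments straight to printk() via struct va_format and the %pV extension. A rough userspace analogue of that forwarding, with vfprintf() standing in for %pV and illustrative names only:

#include <stdarg.h>
#include <stdio.h>

static void log_err(const char *lvl, const char *fsname,
                    const char *fmt, ...)
{
        va_list args;

        /* no intermediate buffer: format directly into the output stream */
        va_start(args, fmt);
        fprintf(stderr, "%sFilesystem %s: ", lvl, fsname);
        vfprintf(stderr, fmt, args);
        va_end(args);
}

int main(void)
{
        log_err("ALERT: ", "dm-3", "dquot %p flush failed\n", (void *)0);
        return 0;
}
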
diff --git a/fs/xfs/support/debug.h b/fs/xfs/support/debug.h
index d2d20462fd4f..05699f67d475 100644
--- a/fs/xfs/support/debug.h
+++ b/fs/xfs/support/debug.h
@@ -20,15 +20,22 @@
20 20
21#include <stdarg.h> 21#include <stdarg.h>
22 22
23#define CE_DEBUG 7 /* debug */ 23struct xfs_mount;
24#define CE_CONT 6 /* continuation */ 24
25#define CE_NOTE 5 /* notice */ 25#define CE_DEBUG KERN_DEBUG
26#define CE_WARN 4 /* warning */ 26#define CE_CONT KERN_INFO
27#define CE_ALERT 1 /* alert */ 27#define CE_NOTE KERN_NOTICE
28#define CE_PANIC 0 /* panic */ 28#define CE_WARN KERN_WARNING
29 29#define CE_ALERT KERN_ALERT
30extern void cmn_err(int, char *, ...) 30#define CE_PANIC KERN_EMERG
31 __attribute__ ((format (printf, 2, 3))); 31
32void cmn_err(const char *lvl, const char *fmt, ...)
33 __attribute__ ((format (printf, 2, 3)));
34void xfs_fs_cmn_err( const char *lvl, struct xfs_mount *mp,
35 const char *fmt, ...) __attribute__ ((format (printf, 3, 4)));
36void xfs_cmn_err( int panic_tag, const char *lvl, struct xfs_mount *mp,
37 const char *fmt, ...) __attribute__ ((format (printf, 4, 5)));
38
32extern void assfail(char *expr, char *f, int l); 39extern void assfail(char *expr, char *f, int l);
33 40
34#define ASSERT_ALWAYS(expr) \ 41#define ASSERT_ALWAYS(expr) \
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 63c7a1a6c022..58632cc17f2d 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -227,7 +227,7 @@ typedef struct xfs_perag {
227 227
228 atomic_t pagf_fstrms; /* # of filestreams active in this AG */ 228 atomic_t pagf_fstrms; /* # of filestreams active in this AG */
229 229
230 rwlock_t pag_ici_lock; /* incore inode lock */ 230 spinlock_t pag_ici_lock; /* incore inode cache lock */
231 struct radix_tree_root pag_ici_root; /* incore inode cache root */ 231 struct radix_tree_root pag_ici_root; /* incore inode cache root */
232 int pag_ici_reclaimable; /* reclaimable inodes */ 232 int pag_ici_reclaimable; /* reclaimable inodes */
233 struct mutex pag_ici_reclaim_lock; /* serialisation point */ 233 struct mutex pag_ici_reclaim_lock; /* serialisation point */
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 112abc439ca5..f3227984a9bf 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -41,10 +41,6 @@
41#define XFSA_FIXUP_BNO_OK 1 41#define XFSA_FIXUP_BNO_OK 1
42#define XFSA_FIXUP_CNT_OK 2 42#define XFSA_FIXUP_CNT_OK 2
43 43
44static int
45xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno,
46 xfs_agblock_t bno, xfs_extlen_t len);
47
48/* 44/*
49 * Prototypes for per-ag allocation routines 45 * Prototypes for per-ag allocation routines
50 */ 46 */
@@ -94,7 +90,7 @@ xfs_alloc_lookup_ge(
94 * Lookup the first record less than or equal to [bno, len] 90 * Lookup the first record less than or equal to [bno, len]
95 * in the btree given by cur. 91 * in the btree given by cur.
96 */ 92 */
97STATIC int /* error */ 93int /* error */
98xfs_alloc_lookup_le( 94xfs_alloc_lookup_le(
99 struct xfs_btree_cur *cur, /* btree cursor */ 95 struct xfs_btree_cur *cur, /* btree cursor */
100 xfs_agblock_t bno, /* starting block of extent */ 96 xfs_agblock_t bno, /* starting block of extent */
@@ -127,7 +123,7 @@ xfs_alloc_update(
127/* 123/*
128 * Get the data from the pointed-to record. 124 * Get the data from the pointed-to record.
129 */ 125 */
130STATIC int /* error */ 126int /* error */
131xfs_alloc_get_rec( 127xfs_alloc_get_rec(
132 struct xfs_btree_cur *cur, /* btree cursor */ 128 struct xfs_btree_cur *cur, /* btree cursor */
133 xfs_agblock_t *bno, /* output: starting block of extent */ 129 xfs_agblock_t *bno, /* output: starting block of extent */
@@ -577,61 +573,58 @@ xfs_alloc_ag_vextent_exact(
577 xfs_extlen_t rlen; /* length of returned extent */ 573 xfs_extlen_t rlen; /* length of returned extent */
578 574
579 ASSERT(args->alignment == 1); 575 ASSERT(args->alignment == 1);
576
580 /* 577 /*
581 * Allocate/initialize a cursor for the by-number freespace btree. 578 * Allocate/initialize a cursor for the by-number freespace btree.
582 */ 579 */
583 bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, 580 bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
584 args->agno, XFS_BTNUM_BNO); 581 args->agno, XFS_BTNUM_BNO);
582
585 /* 583 /*
586 * Lookup bno and minlen in the btree (minlen is irrelevant, really). 584 * Lookup bno and minlen in the btree (minlen is irrelevant, really).
587 * Look for the closest free block <= bno, it must contain bno 585 * Look for the closest free block <= bno, it must contain bno
588 * if any free block does. 586 * if any free block does.
589 */ 587 */
590 if ((error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i))) 588 error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i);
589 if (error)
591 goto error0; 590 goto error0;
592 if (!i) { 591 if (!i)
593 /* 592 goto not_found;
594 * Didn't find it, return null. 593
595 */
596 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
597 args->agbno = NULLAGBLOCK;
598 return 0;
599 }
600 /* 594 /*
601 * Grab the freespace record. 595 * Grab the freespace record.
602 */ 596 */
603 if ((error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i))) 597 error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i);
598 if (error)
604 goto error0; 599 goto error0;
605 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 600 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
606 ASSERT(fbno <= args->agbno); 601 ASSERT(fbno <= args->agbno);
607 minend = args->agbno + args->minlen; 602 minend = args->agbno + args->minlen;
608 maxend = args->agbno + args->maxlen; 603 maxend = args->agbno + args->maxlen;
609 fend = fbno + flen; 604 fend = fbno + flen;
605
610 /* 606 /*
611 * Give up if the freespace isn't long enough for the minimum request. 607 * Give up if the freespace isn't long enough for the minimum request.
612 */ 608 */
613 if (fend < minend) { 609 if (fend < minend)
614 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); 610 goto not_found;
615 args->agbno = NULLAGBLOCK; 611
616 return 0;
617 }
618 /* 612 /*
619 * End of extent will be smaller of the freespace end and the 613 * End of extent will be smaller of the freespace end and the
620 * maximal requested end. 614 * maximal requested end.
621 */ 615 *
622 end = XFS_AGBLOCK_MIN(fend, maxend);
623 /*
624 * Fix the length according to mod and prod if given. 616 * Fix the length according to mod and prod if given.
625 */ 617 */
618 end = XFS_AGBLOCK_MIN(fend, maxend);
626 args->len = end - args->agbno; 619 args->len = end - args->agbno;
627 xfs_alloc_fix_len(args); 620 xfs_alloc_fix_len(args);
628 if (!xfs_alloc_fix_minleft(args)) { 621 if (!xfs_alloc_fix_minleft(args))
629 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); 622 goto not_found;
630 return 0; 623
631 }
632 rlen = args->len; 624 rlen = args->len;
633 ASSERT(args->agbno + rlen <= fend); 625 ASSERT(args->agbno + rlen <= fend);
634 end = args->agbno + rlen; 626 end = args->agbno + rlen;
627
635 /* 628 /*
636 * We are allocating agbno for rlen [agbno .. end] 629 * We are allocating agbno for rlen [agbno .. end]
637 * Allocate/initialize a cursor for the by-size btree. 630 * Allocate/initialize a cursor for the by-size btree.
@@ -640,16 +633,25 @@ xfs_alloc_ag_vextent_exact(
640 args->agno, XFS_BTNUM_CNT); 633 args->agno, XFS_BTNUM_CNT);
641 ASSERT(args->agbno + args->len <= 634 ASSERT(args->agbno + args->len <=
642 be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); 635 be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
643 if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, 636 error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno,
644 args->agbno, args->len, XFSA_FIXUP_BNO_OK))) { 637 args->len, XFSA_FIXUP_BNO_OK);
638 if (error) {
645 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR); 639 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
646 goto error0; 640 goto error0;
647 } 641 }
642
648 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); 643 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
649 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 644 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
650 645
651 trace_xfs_alloc_exact_done(args);
652 args->wasfromfl = 0; 646 args->wasfromfl = 0;
647 trace_xfs_alloc_exact_done(args);
648 return 0;
649
650not_found:
651 /* Didn't find it, return null. */
652 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
653 args->agbno = NULLAGBLOCK;
654 trace_xfs_alloc_exact_notfound(args);
653 return 0; 655 return 0;
654 656
655error0: 657error0:
@@ -659,6 +661,95 @@ error0:
659} 661}
660 662
661/* 663/*
664 * Search the btree in a given direction via the search cursor and compare
665 * the records found against the good extent we've already found.
666 */
667STATIC int
668xfs_alloc_find_best_extent(
669 struct xfs_alloc_arg *args, /* allocation argument structure */
670 struct xfs_btree_cur **gcur, /* good cursor */
671 struct xfs_btree_cur **scur, /* searching cursor */
672 xfs_agblock_t gdiff, /* difference for search comparison */
673 xfs_agblock_t *sbno, /* extent found by search */
674 xfs_extlen_t *slen,
675 xfs_extlen_t *slena, /* aligned length */
676 int dir) /* 0 = search right, 1 = search left */
677{
678 xfs_agblock_t bno;
679 xfs_agblock_t new;
680 xfs_agblock_t sdiff;
681 int error;
682 int i;
683
684 /* The good extent is perfect, no need to search. */
685 if (!gdiff)
686 goto out_use_good;
687
688 /*
689 * Look until we find a better one, run out of space or run off the end.
690 */
691 do {
692 error = xfs_alloc_get_rec(*scur, sbno, slen, &i);
693 if (error)
694 goto error0;
695 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
696 xfs_alloc_compute_aligned(*sbno, *slen, args->alignment,
697 args->minlen, &bno, slena);
698
699 /*
700 * The good extent is closer than this one.
701 */
702 if (!dir) {
703 if (bno >= args->agbno + gdiff)
704 goto out_use_good;
705 } else {
706 if (bno <= args->agbno - gdiff)
707 goto out_use_good;
708 }
709
710 /*
711 * Same distance, compare length and pick the best.
712 */
713 if (*slena >= args->minlen) {
714 args->len = XFS_EXTLEN_MIN(*slena, args->maxlen);
715 xfs_alloc_fix_len(args);
716
717 sdiff = xfs_alloc_compute_diff(args->agbno, args->len,
718 args->alignment, *sbno,
719 *slen, &new);
720
721 /*
722 * Choose closer size and invalidate other cursor.
723 */
724 if (sdiff < gdiff)
725 goto out_use_search;
726 goto out_use_good;
727 }
728
729 if (!dir)
730 error = xfs_btree_increment(*scur, 0, &i);
731 else
732 error = xfs_btree_decrement(*scur, 0, &i);
733 if (error)
734 goto error0;
735 } while (i);
736
737out_use_good:
738 xfs_btree_del_cursor(*scur, XFS_BTREE_NOERROR);
739 *scur = NULL;
740 return 0;
741
742out_use_search:
743 xfs_btree_del_cursor(*gcur, XFS_BTREE_NOERROR);
744 *gcur = NULL;
745 return 0;
746
747error0:
748 /* caller invalidates cursors */
749 return error;
750}
751
752/*
662 * Allocate a variable extent near bno in the allocation group agno. 753 * Allocate a variable extent near bno in the allocation group agno.
663 * Extent's length (returned in len) will be between minlen and maxlen, 754 * Extent's length (returned in len) will be between minlen and maxlen,
664 * and of the form k * prod + mod unless there's nothing that large. 755 * and of the form k * prod + mod unless there's nothing that large.
@@ -925,203 +1016,45 @@ xfs_alloc_ag_vextent_near(
925 } 1016 }
926 } 1017 }
927 } while (bno_cur_lt || bno_cur_gt); 1018 } while (bno_cur_lt || bno_cur_gt);
1019
928 /* 1020 /*
929 * Got both cursors still active, need to find better entry. 1021 * Got both cursors still active, need to find better entry.
930 */ 1022 */
931 if (bno_cur_lt && bno_cur_gt) { 1023 if (bno_cur_lt && bno_cur_gt) {
932 /*
933 * Left side is long enough, look for a right side entry.
934 */
935 if (ltlena >= args->minlen) { 1024 if (ltlena >= args->minlen) {
936 /* 1025 /*
937 * Fix up the length. 1026 * Left side is good, look for a right side entry.
938 */ 1027 */
939 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); 1028 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
940 xfs_alloc_fix_len(args); 1029 xfs_alloc_fix_len(args);
941 rlen = args->len; 1030 ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
942 ltdiff = xfs_alloc_compute_diff(args->agbno, rlen,
943 args->alignment, ltbno, ltlen, &ltnew); 1031 args->alignment, ltbno, ltlen, &ltnew);
1032
1033 error = xfs_alloc_find_best_extent(args,
1034 &bno_cur_lt, &bno_cur_gt,
1035 ltdiff, &gtbno, &gtlen, &gtlena,
1036 0 /* search right */);
1037 } else {
1038 ASSERT(gtlena >= args->minlen);
1039
944 /* 1040 /*
945 * Not perfect. 1041 * Right side is good, look for a left side entry.
946 */
947 if (ltdiff) {
948 /*
949 * Look until we find a better one, run out of
950 * space, or run off the end.
951 */
952 while (bno_cur_lt && bno_cur_gt) {
953 if ((error = xfs_alloc_get_rec(
954 bno_cur_gt, &gtbno,
955 &gtlen, &i)))
956 goto error0;
957 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
958 xfs_alloc_compute_aligned(gtbno, gtlen,
959 args->alignment, args->minlen,
960 &gtbnoa, &gtlena);
961 /*
962 * The left one is clearly better.
963 */
964 if (gtbnoa >= args->agbno + ltdiff) {
965 xfs_btree_del_cursor(
966 bno_cur_gt,
967 XFS_BTREE_NOERROR);
968 bno_cur_gt = NULL;
969 break;
970 }
971 /*
972 * If we reach a big enough entry,
973 * compare the two and pick the best.
974 */
975 if (gtlena >= args->minlen) {
976 args->len =
977 XFS_EXTLEN_MIN(gtlena,
978 args->maxlen);
979 xfs_alloc_fix_len(args);
980 rlen = args->len;
981 gtdiff = xfs_alloc_compute_diff(
982 args->agbno, rlen,
983 args->alignment,
984 gtbno, gtlen, &gtnew);
985 /*
986 * Right side is better.
987 */
988 if (gtdiff < ltdiff) {
989 xfs_btree_del_cursor(
990 bno_cur_lt,
991 XFS_BTREE_NOERROR);
992 bno_cur_lt = NULL;
993 }
994 /*
995 * Left side is better.
996 */
997 else {
998 xfs_btree_del_cursor(
999 bno_cur_gt,
1000 XFS_BTREE_NOERROR);
1001 bno_cur_gt = NULL;
1002 }
1003 break;
1004 }
1005 /*
1006 * Fell off the right end.
1007 */
1008 if ((error = xfs_btree_increment(
1009 bno_cur_gt, 0, &i)))
1010 goto error0;
1011 if (!i) {
1012 xfs_btree_del_cursor(
1013 bno_cur_gt,
1014 XFS_BTREE_NOERROR);
1015 bno_cur_gt = NULL;
1016 break;
1017 }
1018 }
1019 }
1020 /*
1021 * The left side is perfect, trash the right side.
1022 */
1023 else {
1024 xfs_btree_del_cursor(bno_cur_gt,
1025 XFS_BTREE_NOERROR);
1026 bno_cur_gt = NULL;
1027 }
1028 }
1029 /*
1030 * It's the right side that was found first, look left.
1031 */
1032 else {
1033 /*
1034 * Fix up the length.
1035 */ 1042 */
1036 args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen); 1043 args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen);
1037 xfs_alloc_fix_len(args); 1044 xfs_alloc_fix_len(args);
1038 rlen = args->len; 1045 gtdiff = xfs_alloc_compute_diff(args->agbno, args->len,
1039 gtdiff = xfs_alloc_compute_diff(args->agbno, rlen,
1040 args->alignment, gtbno, gtlen, &gtnew); 1046 args->alignment, gtbno, gtlen, &gtnew);
1041 /* 1047
1042 * Right side entry isn't perfect. 1048 error = xfs_alloc_find_best_extent(args,
1043 */ 1049 &bno_cur_gt, &bno_cur_lt,
1044 if (gtdiff) { 1050 gtdiff, &ltbno, &ltlen, &ltlena,
1045 /* 1051 1 /* search left */);
1046 * Look until we find a better one, run out of
1047 * space, or run off the end.
1048 */
1049 while (bno_cur_lt && bno_cur_gt) {
1050 if ((error = xfs_alloc_get_rec(
1051 bno_cur_lt, &ltbno,
1052 &ltlen, &i)))
1053 goto error0;
1054 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1055 xfs_alloc_compute_aligned(ltbno, ltlen,
1056 args->alignment, args->minlen,
1057 &ltbnoa, &ltlena);
1058 /*
1059 * The right one is clearly better.
1060 */
1061 if (ltbnoa <= args->agbno - gtdiff) {
1062 xfs_btree_del_cursor(
1063 bno_cur_lt,
1064 XFS_BTREE_NOERROR);
1065 bno_cur_lt = NULL;
1066 break;
1067 }
1068 /*
1069 * If we reach a big enough entry,
1070 * compare the two and pick the best.
1071 */
1072 if (ltlena >= args->minlen) {
1073 args->len = XFS_EXTLEN_MIN(
1074 ltlena, args->maxlen);
1075 xfs_alloc_fix_len(args);
1076 rlen = args->len;
1077 ltdiff = xfs_alloc_compute_diff(
1078 args->agbno, rlen,
1079 args->alignment,
1080 ltbno, ltlen, &ltnew);
1081 /*
1082 * Left side is better.
1083 */
1084 if (ltdiff < gtdiff) {
1085 xfs_btree_del_cursor(
1086 bno_cur_gt,
1087 XFS_BTREE_NOERROR);
1088 bno_cur_gt = NULL;
1089 }
1090 /*
1091 * Right side is better.
1092 */
1093 else {
1094 xfs_btree_del_cursor(
1095 bno_cur_lt,
1096 XFS_BTREE_NOERROR);
1097 bno_cur_lt = NULL;
1098 }
1099 break;
1100 }
1101 /*
1102 * Fell off the left end.
1103 */
1104 if ((error = xfs_btree_decrement(
1105 bno_cur_lt, 0, &i)))
1106 goto error0;
1107 if (!i) {
1108 xfs_btree_del_cursor(bno_cur_lt,
1109 XFS_BTREE_NOERROR);
1110 bno_cur_lt = NULL;
1111 break;
1112 }
1113 }
1114 }
1115 /*
1116 * The right side is perfect, trash the left side.
1117 */
1118 else {
1119 xfs_btree_del_cursor(bno_cur_lt,
1120 XFS_BTREE_NOERROR);
1121 bno_cur_lt = NULL;
1122 }
1123 } 1052 }
1053
1054 if (error)
1055 goto error0;
1124 } 1056 }
1057
1125 /* 1058 /*
1126 * If we couldn't get anything, give up. 1059 * If we couldn't get anything, give up.
1127 */ 1060 */
@@ -1130,6 +1063,7 @@ xfs_alloc_ag_vextent_near(
1130 args->agbno = NULLAGBLOCK; 1063 args->agbno = NULLAGBLOCK;
1131 return 0; 1064 return 0;
1132 } 1065 }
1066
1133 /* 1067 /*
1134 * At this point we have selected a freespace entry, either to the 1068 * At this point we have selected a freespace entry, either to the
1135 * left or to the right. If it's on the right, copy all the 1069 * left or to the right. If it's on the right, copy all the
@@ -1146,6 +1080,7 @@ xfs_alloc_ag_vextent_near(
1146 j = 1; 1080 j = 1;
1147 } else 1081 } else
1148 j = 0; 1082 j = 0;
1083
1149 /* 1084 /*
1150 * Fix up the length and compute the useful address. 1085 * Fix up the length and compute the useful address.
1151 */ 1086 */
@@ -2676,7 +2611,7 @@ restart:
2676 * will require a synchronous transaction, but it can still be 2611 * will require a synchronous transaction, but it can still be
2677 * used to distinguish between a partial or exact match. 2612 * used to distinguish between a partial or exact match.
2678 */ 2613 */
2679static int 2614int
2680xfs_alloc_busy_search( 2615xfs_alloc_busy_search(
2681 struct xfs_mount *mp, 2616 struct xfs_mount *mp,
2682 xfs_agnumber_t agno, 2617 xfs_agnumber_t agno,
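
xfs_alloc_find_best_extent(), added above, factors out the left/right search that was duplicated in xfs_alloc_ag_vextent_near(): given a known-good extent at some distance from the target block, walk the other btree direction only until any further record is provably worse. A toy version of that early-terminating search over a sorted array (data and names are invented for illustration):

#include <stdio.h>
#include <stdlib.h>

/* Return a candidate closer to target than good_diff, or -1 to keep the
 * extent we already have. Candidates are sorted by start block, so once
 * one begins at or beyond target + good_diff nothing later can win. */
static int find_best(const int *cand, int ncand, int target, int good_diff)
{
        int i;

        for (i = 0; i < ncand; i++) {
                if (cand[i] >= target + good_diff)
                        break;                  /* can only get farther away */
                if (abs(cand[i] - target) < good_diff)
                        return cand[i];
        }
        return -1;
}

int main(void)
{
        int cand[] = { 120, 150, 400 };         /* sorted start blocks */

        printf("best = %d\n", find_best(cand, 3, 130, 25));
        return 0;
}
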
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 895009a97271..d0b3bc72005b 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -19,6 +19,7 @@
19#define __XFS_ALLOC_H__ 19#define __XFS_ALLOC_H__
20 20
21struct xfs_buf; 21struct xfs_buf;
22struct xfs_btree_cur;
22struct xfs_mount; 23struct xfs_mount;
23struct xfs_perag; 24struct xfs_perag;
24struct xfs_trans; 25struct xfs_trans;
@@ -74,6 +75,22 @@ typedef unsigned int xfs_alloctype_t;
74#define XFS_ALLOC_SET_ASIDE(mp) (4 + ((mp)->m_sb.sb_agcount * 4)) 75#define XFS_ALLOC_SET_ASIDE(mp) (4 + ((mp)->m_sb.sb_agcount * 4))
75 76
76/* 77/*
78 * When deciding how much space to allocate out of an AG, we limit the
 79 * allocation maximum size to the size of the AG. However, we cannot use all the
80 * blocks in the AG - some are permanently used by metadata. These
81 * blocks are generally:
82 * - the AG superblock, AGF, AGI and AGFL
83 * - the AGF (bno and cnt) and AGI btree root blocks
84 * - 4 blocks on the AGFL according to XFS_ALLOC_SET_ASIDE() limits
85 *
86 * The AG headers are sector sized, so the amount of space they take up is
87 * dependent on filesystem geometry. The others are all single blocks.
88 */
89#define XFS_ALLOC_AG_MAX_USABLE(mp) \
90 ((mp)->m_sb.sb_agblocks - XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)) - 7)
91
92
93/*
77 * Argument structure for xfs_alloc routines. 94 * Argument structure for xfs_alloc routines.
78 * This is turned into a structure to avoid having 20 arguments passed 95 * This is turned into a structure to avoid having 20 arguments passed
79 * down several levels of the stack. 96 * down several levels of the stack.
@@ -118,16 +135,16 @@ xfs_alloc_longest_free_extent(struct xfs_mount *mp,
118 struct xfs_perag *pag); 135 struct xfs_perag *pag);
119 136
120#ifdef __KERNEL__ 137#ifdef __KERNEL__
121
122void 138void
123xfs_alloc_busy_insert(xfs_trans_t *tp, 139xfs_alloc_busy_insert(struct xfs_trans *tp, xfs_agnumber_t agno,
124 xfs_agnumber_t agno, 140 xfs_agblock_t bno, xfs_extlen_t len);
125 xfs_agblock_t bno,
126 xfs_extlen_t len);
127 141
128void 142void
129xfs_alloc_busy_clear(struct xfs_mount *mp, struct xfs_busy_extent *busyp); 143xfs_alloc_busy_clear(struct xfs_mount *mp, struct xfs_busy_extent *busyp);
130 144
145int
146xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno,
147 xfs_agblock_t bno, xfs_extlen_t len);
131#endif /* __KERNEL__ */ 148#endif /* __KERNEL__ */
132 149
133/* 150/*
@@ -205,4 +222,18 @@ xfs_free_extent(
205 xfs_fsblock_t bno, /* starting block number of extent */ 222 xfs_fsblock_t bno, /* starting block number of extent */
206 xfs_extlen_t len); /* length of extent */ 223 xfs_extlen_t len); /* length of extent */
207 224
225int /* error */
226xfs_alloc_lookup_le(
227 struct xfs_btree_cur *cur, /* btree cursor */
228 xfs_agblock_t bno, /* starting block of extent */
229 xfs_extlen_t len, /* length of extent */
230 int *stat); /* success/failure */
231
232int /* error */
233xfs_alloc_get_rec(
234 struct xfs_btree_cur *cur, /* btree cursor */
235 xfs_agblock_t *bno, /* output: starting block of extent */
236 xfs_extlen_t *len, /* output: length of extent */
237 int *stat); /* output: success/failure */
238
208#endif /* __XFS_ALLOC_H__ */ 239#endif /* __XFS_ALLOC_H__ */
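
The new XFS_ALLOC_AG_MAX_USABLE() comment above spells out why a single allocation can never consume a whole AG: the sector-sized headers, the three btree root blocks and the four-block AGFL set-aside are always in use. A back-of-the-envelope userspace version of the same arithmetic, with made-up geometry:

#include <stdio.h>

int main(void)
{
        unsigned int ag_blocks = 1048576;  /* blocks in the AG (example value) */
        unsigned int hdr_blocks = 1;       /* sb/AGF/AGI/AGFL sectors, one fs block here */
        unsigned int btree_roots = 3;      /* bno, cnt and inode btree roots */
        unsigned int agfl_set_aside = 4;   /* with the roots, this is the "- 7" */

        printf("max usable: %u blocks\n",
               ag_blocks - hdr_blocks - btree_roots - agfl_set_aside);
        return 0;
}
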
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index a6cff8edcdb6..71e90dc2aeb1 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -637,7 +637,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
637 * It didn't all fit, so we have to sort everything on hashval. 637 * It didn't all fit, so we have to sort everything on hashval.
638 */ 638 */
639 sbsize = sf->hdr.count * sizeof(*sbuf); 639 sbsize = sf->hdr.count * sizeof(*sbuf);
640 sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP); 640 sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP | KM_NOFS);
641 641
642 /* 642 /*
643 * Scan the attribute list for the rest of the entries, storing 643 * Scan the attribute list for the rest of the entries, storing
@@ -2386,7 +2386,7 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context)
2386 args.dp = context->dp; 2386 args.dp = context->dp;
2387 args.whichfork = XFS_ATTR_FORK; 2387 args.whichfork = XFS_ATTR_FORK;
2388 args.valuelen = valuelen; 2388 args.valuelen = valuelen;
2389 args.value = kmem_alloc(valuelen, KM_SLEEP); 2389 args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS);
2390 args.rmtblkno = be32_to_cpu(name_rmt->valueblk); 2390 args.rmtblkno = be32_to_cpu(name_rmt->valueblk);
2391 args.rmtblkcnt = XFS_B_TO_FSB(args.dp->i_mount, valuelen); 2391 args.rmtblkcnt = XFS_B_TO_FSB(args.dp->i_mount, valuelen);
2392 retval = xfs_attr_rmtval_get(&args); 2392 retval = xfs_attr_rmtval_get(&args);
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 4111cd3966c7..dc3afd7739ff 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -1038,17 +1038,34 @@ xfs_bmap_add_extent_delay_real(
1038 * Filling in the middle part of a previous delayed allocation. 1038 * Filling in the middle part of a previous delayed allocation.
1039 * Contiguity is impossible here. 1039 * Contiguity is impossible here.
1040 * This case is avoided almost all the time. 1040 * This case is avoided almost all the time.
1041 *
1042 * We start with a delayed allocation:
1043 *
1044 * +ddddddddddddddddddddddddddddddddddddddddddddddddddddddd+
1045 * PREV @ idx
1046 *
1047 * and we are allocating:
1048 * +rrrrrrrrrrrrrrrrr+
1049 * new
1050 *
1051 * and we set it up for insertion as:
1052 * +ddddddddddddddddddd+rrrrrrrrrrrrrrrrr+ddddddddddddddddd+
1053 * new
1054 * PREV @ idx LEFT RIGHT
1055 * inserted at idx + 1
1041 */ 1056 */
1042 temp = new->br_startoff - PREV.br_startoff; 1057 temp = new->br_startoff - PREV.br_startoff;
1043 trace_xfs_bmap_pre_update(ip, idx, 0, _THIS_IP_);
1044 xfs_bmbt_set_blockcount(ep, temp);
1045 r[0] = *new;
1046 r[1].br_state = PREV.br_state;
1047 r[1].br_startblock = 0;
1048 r[1].br_startoff = new_endoff;
1049 temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff; 1058 temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff;
1050 r[1].br_blockcount = temp2; 1059 trace_xfs_bmap_pre_update(ip, idx, 0, _THIS_IP_);
1051 xfs_iext_insert(ip, idx + 1, 2, &r[0], state); 1060 xfs_bmbt_set_blockcount(ep, temp); /* truncate PREV */
1061 LEFT = *new;
1062 RIGHT.br_state = PREV.br_state;
1063 RIGHT.br_startblock = nullstartblock(
1064 (int)xfs_bmap_worst_indlen(ip, temp2));
1065 RIGHT.br_startoff = new_endoff;
1066 RIGHT.br_blockcount = temp2;
1067 /* insert LEFT (r[0]) and RIGHT (r[1]) at the same time */
1068 xfs_iext_insert(ip, idx + 1, 2, &LEFT, state);
1052 ip->i_df.if_lastex = idx + 1; 1069 ip->i_df.if_lastex = idx + 1;
1053 ip->i_d.di_nextents++; 1070 ip->i_d.di_nextents++;
1054 if (cur == NULL) 1071 if (cur == NULL)
@@ -2430,7 +2447,7 @@ xfs_bmap_btalloc_nullfb(
2430 startag = ag = 0; 2447 startag = ag = 0;
2431 2448
2432 pag = xfs_perag_get(mp, ag); 2449 pag = xfs_perag_get(mp, ag);
2433 while (*blen < ap->alen) { 2450 while (*blen < args->maxlen) {
2434 if (!pag->pagf_init) { 2451 if (!pag->pagf_init) {
2435 error = xfs_alloc_pagf_init(mp, args->tp, ag, 2452 error = xfs_alloc_pagf_init(mp, args->tp, ag,
2436 XFS_ALLOC_FLAG_TRYLOCK); 2453 XFS_ALLOC_FLAG_TRYLOCK);
@@ -2452,7 +2469,7 @@ xfs_bmap_btalloc_nullfb(
2452 notinit = 1; 2469 notinit = 1;
2453 2470
2454 if (xfs_inode_is_filestream(ap->ip)) { 2471 if (xfs_inode_is_filestream(ap->ip)) {
2455 if (*blen >= ap->alen) 2472 if (*blen >= args->maxlen)
2456 break; 2473 break;
2457 2474
2458 if (ap->userdata) { 2475 if (ap->userdata) {
@@ -2498,14 +2515,14 @@ xfs_bmap_btalloc_nullfb(
2498 * If the best seen length is less than the request 2515 * If the best seen length is less than the request
2499 * length, use the best as the minimum. 2516 * length, use the best as the minimum.
2500 */ 2517 */
2501 else if (*blen < ap->alen) 2518 else if (*blen < args->maxlen)
2502 args->minlen = *blen; 2519 args->minlen = *blen;
2503 /* 2520 /*
2504 * Otherwise we've seen an extent as big as alen, 2521 * Otherwise we've seen an extent as big as maxlen,
2505 * use that as the minimum. 2522 * use that as the minimum.
2506 */ 2523 */
2507 else 2524 else
2508 args->minlen = ap->alen; 2525 args->minlen = args->maxlen;
2509 2526
2510 /* 2527 /*
2511 * set the failure fallback case to look in the selected 2528 * set the failure fallback case to look in the selected
@@ -2573,7 +2590,9 @@ xfs_bmap_btalloc(
2573 args.tp = ap->tp; 2590 args.tp = ap->tp;
2574 args.mp = mp; 2591 args.mp = mp;
2575 args.fsbno = ap->rval; 2592 args.fsbno = ap->rval;
2576 args.maxlen = MIN(ap->alen, mp->m_sb.sb_agblocks); 2593
2594 /* Trim the allocation back to the maximum an AG can fit. */
2595 args.maxlen = MIN(ap->alen, XFS_ALLOC_AG_MAX_USABLE(mp));
2577 args.firstblock = ap->firstblock; 2596 args.firstblock = ap->firstblock;
2578 blen = 0; 2597 blen = 0;
2579 if (nullfb) { 2598 if (nullfb) {
@@ -2621,7 +2640,7 @@ xfs_bmap_btalloc(
2621 /* 2640 /*
2622 * Adjust for alignment 2641 * Adjust for alignment
2623 */ 2642 */
2624 if (blen > args.alignment && blen <= ap->alen) 2643 if (blen > args.alignment && blen <= args.maxlen)
2625 args.minlen = blen - args.alignment; 2644 args.minlen = blen - args.alignment;
2626 args.minalignslop = 0; 2645 args.minalignslop = 0;
2627 } else { 2646 } else {
@@ -2640,7 +2659,7 @@ xfs_bmap_btalloc(
2640 * of minlen+alignment+slop doesn't go up 2659 * of minlen+alignment+slop doesn't go up
2641 * between the calls. 2660 * between the calls.
2642 */ 2661 */
2643 if (blen > mp->m_dalign && blen <= ap->alen) 2662 if (blen > mp->m_dalign && blen <= args.maxlen)
2644 nextminlen = blen - mp->m_dalign; 2663 nextminlen = blen - mp->m_dalign;
2645 else 2664 else
2646 nextminlen = args.minlen; 2665 nextminlen = args.minlen;
@@ -4485,6 +4504,16 @@ xfs_bmapi(
4485 /* Figure out the extent size, adjust alen */ 4504 /* Figure out the extent size, adjust alen */
4486 extsz = xfs_get_extsz_hint(ip); 4505 extsz = xfs_get_extsz_hint(ip);
4487 if (extsz) { 4506 if (extsz) {
4507 /*
4508 * make sure we don't exceed a single
4509 * extent length when we align the
4510 * extent by reducing length we are
4511 * going to allocate by the maximum
4512 * amount extent size aligment may
4513 * require.
4514 */
4515 alen = XFS_FILBLKS_MIN(len,
4516 MAXEXTLEN - (2 * extsz - 1));
4488 error = xfs_bmap_extsize_align(mp, 4517 error = xfs_bmap_extsize_align(mp,
4489 &got, &prev, extsz, 4518 &got, &prev, extsz,
4490 rt, eof, 4519 rt, eof,
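
The clamp added to xfs_bmapi() above keeps a delayed allocation from overflowing a single on-disk extent once it has been rounded out to the extent size hint: alignment can add at most extsz - 1 blocks at each end, hence the 2 * extsz - 1 headroom. A small standalone illustration; the MAXEXTLEN value mirrors the 21-bit on-disk length limit and the inputs are arbitrary:

#include <stdio.h>

#define MAXEXTLEN 2097151U      /* 2^21 - 1 blocks */

static unsigned int clamp_alen(unsigned int len, unsigned int extsz)
{
        unsigned int limit = MAXEXTLEN - (2 * extsz - 1);

        /* even after aligning both ends, the extent stays <= MAXEXTLEN */
        return len < limit ? len : limit;
}

int main(void)
{
        printf("alen = %u\n", clamp_alen(3000000U, 4096U));
        return 0;
}
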
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 04f9cca8da7e..2f9e97c128a0 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -634,9 +634,8 @@ xfs_btree_read_bufl(
634 return error; 634 return error;
635 } 635 }
636 ASSERT(!bp || !XFS_BUF_GETERROR(bp)); 636 ASSERT(!bp || !XFS_BUF_GETERROR(bp));
637 if (bp != NULL) { 637 if (bp)
638 XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval); 638 XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval);
639 }
640 *bpp = bp; 639 *bpp = bp;
641 return 0; 640 return 0;
642} 641}
@@ -944,13 +943,13 @@ xfs_btree_set_refs(
944 switch (cur->bc_btnum) { 943 switch (cur->bc_btnum) {
945 case XFS_BTNUM_BNO: 944 case XFS_BTNUM_BNO:
946 case XFS_BTNUM_CNT: 945 case XFS_BTNUM_CNT:
947 XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_MAP, XFS_ALLOC_BTREE_REF); 946 XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_ALLOC_BTREE_REF);
948 break; 947 break;
949 case XFS_BTNUM_INO: 948 case XFS_BTNUM_INO:
950 XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_INOMAP, XFS_INO_BTREE_REF); 949 XFS_BUF_SET_VTYPE_REF(bp, B_FS_INOMAP, XFS_INO_BTREE_REF);
951 break; 950 break;
952 case XFS_BTNUM_BMAP: 951 case XFS_BTNUM_BMAP:
953 XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_MAP, XFS_BMAP_BTREE_REF); 952 XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_BMAP_BTREE_REF);
954 break; 953 break;
955 default: 954 default:
956 ASSERT(0); 955 ASSERT(0);
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 2686d0d54c5b..6f8c21ce0d6d 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -141,8 +141,7 @@ xfs_buf_item_log_check(
141#define xfs_buf_item_log_check(x) 141#define xfs_buf_item_log_check(x)
142#endif 142#endif
143 143
144STATIC void xfs_buf_error_relse(xfs_buf_t *bp); 144STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp);
145STATIC void xfs_buf_do_callbacks(xfs_buf_t *bp, xfs_log_item_t *lip);
146 145
147/* 146/*
148 * This returns the number of log iovecs needed to log the 147 * This returns the number of log iovecs needed to log the
@@ -428,13 +427,15 @@ xfs_buf_item_unpin(
428 427
429 if (remove) { 428 if (remove) {
430 /* 429 /*
431 * We have to remove the log item from the transaction 430 * If we are in a transaction context, we have to
432 * as we are about to release our reference to the 431 * remove the log item from the transaction as we are
433 * buffer. If we don't, the unlock that occurs later 432 * about to release our reference to the buffer. If we
434 * in xfs_trans_uncommit() will ry to reference the 433 * don't, the unlock that occurs later in
434 * xfs_trans_uncommit() will try to reference the
435 * buffer which we no longer have a hold on. 435 * buffer which we no longer have a hold on.
436 */ 436 */
437 xfs_trans_del_item(lip); 437 if (lip->li_desc)
438 xfs_trans_del_item(lip);
438 439
439 /* 440 /*
440 * Since the transaction no longer refers to the buffer, 441 * Since the transaction no longer refers to the buffer,
@@ -450,7 +451,7 @@ xfs_buf_item_unpin(
450 * xfs_trans_ail_delete() drops the AIL lock. 451 * xfs_trans_ail_delete() drops the AIL lock.
451 */ 452 */
452 if (bip->bli_flags & XFS_BLI_STALE_INODE) { 453 if (bip->bli_flags & XFS_BLI_STALE_INODE) {
453 xfs_buf_do_callbacks(bp, (xfs_log_item_t *)bip); 454 xfs_buf_do_callbacks(bp);
454 XFS_BUF_SET_FSPRIVATE(bp, NULL); 455 XFS_BUF_SET_FSPRIVATE(bp, NULL);
455 XFS_BUF_CLR_IODONE_FUNC(bp); 456 XFS_BUF_CLR_IODONE_FUNC(bp);
456 } else { 457 } else {
@@ -918,15 +919,26 @@ xfs_buf_attach_iodone(
918 XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks); 919 XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks);
919} 920}
920 921
922/*
923 * We can have many callbacks on a buffer. Running the callbacks individually
924 * can cause a lot of contention on the AIL lock, so we allow for a single
925 * callback to be able to scan the remaining lip->li_bio_list for other items
926 * of the same type and callback to be processed in the first call.
927 *
928 * As a result, the loop walking the callback list below will also modify the
 929 * list. It removes the first item from the list and then runs the callback.
930 * The loop then restarts from the new head of the list. This allows the
931 * callback to scan and modify the list attached to the buffer and we don't
932 * have to care about maintaining a next item pointer.
933 */
921STATIC void 934STATIC void
922xfs_buf_do_callbacks( 935xfs_buf_do_callbacks(
923 xfs_buf_t *bp, 936 struct xfs_buf *bp)
924 xfs_log_item_t *lip)
925{ 937{
926 xfs_log_item_t *nlip; 938 struct xfs_log_item *lip;
927 939
928 while (lip != NULL) { 940 while ((lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *)) != NULL) {
929 nlip = lip->li_bio_list; 941 XFS_BUF_SET_FSPRIVATE(bp, lip->li_bio_list);
930 ASSERT(lip->li_cb != NULL); 942 ASSERT(lip->li_cb != NULL);
931 /* 943 /*
932 * Clear the next pointer so we don't have any 944 * Clear the next pointer so we don't have any
@@ -936,7 +948,6 @@ xfs_buf_do_callbacks(
936 */ 948 */
937 lip->li_bio_list = NULL; 949 lip->li_bio_list = NULL;
938 lip->li_cb(bp, lip); 950 lip->li_cb(bp, lip);
939 lip = nlip;
940 } 951 }
941} 952}
942 953
@@ -949,128 +960,76 @@ xfs_buf_do_callbacks(
949 */ 960 */
950void 961void
951xfs_buf_iodone_callbacks( 962xfs_buf_iodone_callbacks(
952 xfs_buf_t *bp) 963 struct xfs_buf *bp)
953{ 964{
954 xfs_log_item_t *lip; 965 struct xfs_log_item *lip = bp->b_fspriv;
955 static ulong lasttime; 966 struct xfs_mount *mp = lip->li_mountp;
956 static xfs_buftarg_t *lasttarg; 967 static ulong lasttime;
957 xfs_mount_t *mp; 968 static xfs_buftarg_t *lasttarg;
958 969
959 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); 970 if (likely(!XFS_BUF_GETERROR(bp)))
960 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); 971 goto do_callbacks;
961 972
962 if (XFS_BUF_GETERROR(bp) != 0) { 973 /*
963 /* 974 * If we've already decided to shutdown the filesystem because of
964 * If we've already decided to shutdown the filesystem 975 * I/O errors, there's no point in giving this a retry.
965 * because of IO errors, there's no point in giving this 976 */
966 * a retry. 977 if (XFS_FORCED_SHUTDOWN(mp)) {
967 */ 978 XFS_BUF_SUPER_STALE(bp);
968 mp = lip->li_mountp; 979 trace_xfs_buf_item_iodone(bp, _RET_IP_);
969 if (XFS_FORCED_SHUTDOWN(mp)) { 980 goto do_callbacks;
970 ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp); 981 }
971 XFS_BUF_SUPER_STALE(bp);
972 trace_xfs_buf_item_iodone(bp, _RET_IP_);
973 xfs_buf_do_callbacks(bp, lip);
974 XFS_BUF_SET_FSPRIVATE(bp, NULL);
975 XFS_BUF_CLR_IODONE_FUNC(bp);
976 xfs_buf_ioend(bp, 0);
977 return;
978 }
979 982
980 if ((XFS_BUF_TARGET(bp) != lasttarg) || 983 if (XFS_BUF_TARGET(bp) != lasttarg ||
981 (time_after(jiffies, (lasttime + 5*HZ)))) { 984 time_after(jiffies, (lasttime + 5*HZ))) {
982 lasttime = jiffies; 985 lasttime = jiffies;
983 cmn_err(CE_ALERT, "Device %s, XFS metadata write error" 986 cmn_err(CE_ALERT, "Device %s, XFS metadata write error"
984 " block 0x%llx in %s", 987 " block 0x%llx in %s",
985 XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)), 988 XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)),
986 (__uint64_t)XFS_BUF_ADDR(bp), mp->m_fsname); 989 (__uint64_t)XFS_BUF_ADDR(bp), mp->m_fsname);
987 } 990 }
988 lasttarg = XFS_BUF_TARGET(bp); 991 lasttarg = XFS_BUF_TARGET(bp);
989 992
990 if (XFS_BUF_ISASYNC(bp)) { 993 /*
 991 /* 994 * If the write was asynchronous then no one will be looking for the
 992 * If the write was asynchronous then no one will be 995 * error. Clear the error state and write the buffer out again.
993 * looking for the error. Clear the error state 996 *
994 * and write the buffer out again delayed write. 997 * During sync or umount we'll write all pending buffers again
 995 * 998 * synchronously, which will catch these errors if they keep hanging
996 * XXXsup This is OK, so long as we catch these 999 * around.
997 * before we start the umount; we don't want these 1000 */
998 * DELWRI metadata bufs to be hanging around. 1001 if (XFS_BUF_ISASYNC(bp)) {
999 */ 1002 XFS_BUF_ERROR(bp, 0); /* errno of 0 unsets the flag */
1000 XFS_BUF_ERROR(bp,0); /* errno of 0 unsets the flag */ 1003
1001 1004 if (!XFS_BUF_ISSTALE(bp)) {
1002 if (!(XFS_BUF_ISSTALE(bp))) { 1005 XFS_BUF_DELAYWRITE(bp);
1003 XFS_BUF_DELAYWRITE(bp);
1004 XFS_BUF_DONE(bp);
1005 XFS_BUF_SET_START(bp);
1006 }
1007 ASSERT(XFS_BUF_IODONE_FUNC(bp));
1008 trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
1009 xfs_buf_relse(bp);
1010 } else {
1011 /*
1012 * If the write of the buffer was not asynchronous,
1013 * then we want to make sure to return the error
1014 * to the caller of bwrite(). Because of this we
1015 * cannot clear the B_ERROR state at this point.
1016 * Instead we install a callback function that
1017 * will be called when the buffer is released, and
1018 * that routine will clear the error state and
1019 * set the buffer to be written out again after
1020 * some delay.
1021 */
1022 /* We actually overwrite the existing b-relse
1023 function at times, but we're gonna be shutting down
1024 anyway. */
1025 XFS_BUF_SET_BRELSE_FUNC(bp,xfs_buf_error_relse);
1026 XFS_BUF_DONE(bp); 1006 XFS_BUF_DONE(bp);
1027 XFS_BUF_FINISH_IOWAIT(bp); 1007 XFS_BUF_SET_START(bp);
1028 } 1008 }
1009 ASSERT(XFS_BUF_IODONE_FUNC(bp));
1010 trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
1011 xfs_buf_relse(bp);
1029 return; 1012 return;
1030 } 1013 }
1031 1014
1032 xfs_buf_do_callbacks(bp, lip); 1015 /*
1033 XFS_BUF_SET_FSPRIVATE(bp, NULL); 1016 * If the write of the buffer was synchronous, we want to make
1034 XFS_BUF_CLR_IODONE_FUNC(bp); 1017 * sure to return the error to the caller of xfs_bwrite().
1035 xfs_buf_ioend(bp, 0); 1018 */
1036}
1037
1038/*
1039 * This is a callback routine attached to a buffer which gets an error
1040 * when being written out synchronously.
1041 */
1042STATIC void
1043xfs_buf_error_relse(
1044 xfs_buf_t *bp)
1045{
1046 xfs_log_item_t *lip;
1047 xfs_mount_t *mp;
1048
1049 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
1050 mp = (xfs_mount_t *)lip->li_mountp;
1051 ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp);
1052
1053 XFS_BUF_STALE(bp); 1019 XFS_BUF_STALE(bp);
1054 XFS_BUF_DONE(bp); 1020 XFS_BUF_DONE(bp);
1055 XFS_BUF_UNDELAYWRITE(bp); 1021 XFS_BUF_UNDELAYWRITE(bp);
1056 XFS_BUF_ERROR(bp,0);
1057 1022
1058 trace_xfs_buf_error_relse(bp, _RET_IP_); 1023 trace_xfs_buf_error_relse(bp, _RET_IP_);
1024 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1059 1025
1060 if (! XFS_FORCED_SHUTDOWN(mp)) 1026do_callbacks:
1061 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); 1027 xfs_buf_do_callbacks(bp);
1062 /*
1063 * We have to unpin the pinned buffers so do the
1064 * callbacks.
1065 */
1066 xfs_buf_do_callbacks(bp, lip);
1067 XFS_BUF_SET_FSPRIVATE(bp, NULL); 1028 XFS_BUF_SET_FSPRIVATE(bp, NULL);
1068 XFS_BUF_CLR_IODONE_FUNC(bp); 1029 XFS_BUF_CLR_IODONE_FUNC(bp);
1069 XFS_BUF_SET_BRELSE_FUNC(bp,NULL); 1030 xfs_buf_ioend(bp, 0);
1070 xfs_buf_relse(bp);
1071} 1031}
1072 1032
1073
1074/* 1033/*
1075 * This is the iodone() function for buffers which have been 1034 * This is the iodone() function for buffers which have been
1076 * logged. It is called when they are eventually flushed out. 1035 * logged. It is called when they are eventually flushed out.
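
The comment block added to xfs_buf_do_callbacks() above describes a walk that always pops the current list head, clears its link, runs the callback and then restarts from whatever the head has become, so a callback may strip further items of its own type itself. A compact userspace sketch of that pop-and-restart loop (types and names are illustrative):

#include <stddef.h>
#include <stdio.h>

struct item {
        struct item *next;
        void (*cb)(struct item **head, struct item *it);
};

static void print_cb(struct item **head, struct item *it)
{
        (void)head;             /* a real callback may unlink more items here */
        printf("callback on %p\n", (void *)it);
}

static void do_callbacks(struct item **head)
{
        struct item *it;

        while ((it = *head) != NULL) {
                *head = it->next;       /* next iteration restarts from the new head */
                it->next = NULL;        /* item no longer belongs to the list */
                it->cb(head, it);
        }
}

int main(void)
{
        struct item b = { NULL, print_cb };
        struct item a = { &b, print_cb };
        struct item *head = &a;

        do_callbacks(&head);
        return 0;
}
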
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index 0e2ed43f16c7..b6ecd2061e7c 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -105,17 +105,6 @@ typedef struct xfs_buf_log_item {
105 xfs_buf_log_format_t bli_format; /* in-log header */ 105 xfs_buf_log_format_t bli_format; /* in-log header */
106} xfs_buf_log_item_t; 106} xfs_buf_log_item_t;
107 107
108/*
109 * This structure is used during recovery to record the buf log
110 * items which have been canceled and should not be replayed.
111 */
112typedef struct xfs_buf_cancel {
113 xfs_daddr_t bc_blkno;
114 uint bc_len;
115 int bc_refcount;
116 struct xfs_buf_cancel *bc_next;
117} xfs_buf_cancel_t;
118
119void xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *); 108void xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *);
120void xfs_buf_item_relse(struct xfs_buf *); 109void xfs_buf_item_relse(struct xfs_buf *);
121void xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint); 110void xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint);
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index c78cc6a3d87c..4c7db74a05f7 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -152,37 +152,6 @@ xfs_errortag_clearall(xfs_mount_t *mp, int loud)
152} 152}
153#endif /* DEBUG */ 153#endif /* DEBUG */
154 154
155
156void
157xfs_fs_cmn_err(int level, xfs_mount_t *mp, char *fmt, ...)
158{
159 va_list ap;
160
161 va_start(ap, fmt);
162 xfs_fs_vcmn_err(level, mp, fmt, ap);
163 va_end(ap);
164}
165
166void
167xfs_cmn_err(int panic_tag, int level, xfs_mount_t *mp, char *fmt, ...)
168{
169 va_list ap;
170
171#ifdef DEBUG
172 xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES);
173#endif
174
175 if (xfs_panic_mask && (xfs_panic_mask & panic_tag)
176 && (level & CE_ALERT)) {
177 level &= ~CE_ALERT;
178 level |= CE_PANIC;
179 cmn_err(CE_ALERT, "XFS: Transforming an alert into a BUG.");
180 }
181 va_start(ap, fmt);
182 xfs_fs_vcmn_err(level, mp, fmt, ap);
183 va_end(ap);
184}
185
186void 155void
187xfs_error_report( 156xfs_error_report(
188 const char *tag, 157 const char *tag,
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index f338847f80b8..10dce5475f02 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -136,8 +136,8 @@ extern int xfs_error_test(int, int *, char *, int, char *, unsigned long);
136 xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \ 136 xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \
137 (rf)))) 137 (rf))))
138 138
139extern int xfs_errortag_add(int error_tag, xfs_mount_t *mp); 139extern int xfs_errortag_add(int error_tag, struct xfs_mount *mp);
140extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud); 140extern int xfs_errortag_clearall(struct xfs_mount *mp, int loud);
141#else 141#else
142#define XFS_TEST_ERROR(expr, mp, tag, rf) (expr) 142#define XFS_TEST_ERROR(expr, mp, tag, rf) (expr)
143#define xfs_errortag_add(tag, mp) (ENOSYS) 143#define xfs_errortag_add(tag, mp) (ENOSYS)
@@ -162,21 +162,15 @@ extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud);
162 162
163struct xfs_mount; 163struct xfs_mount;
164 164
165extern void xfs_fs_vcmn_err(int level, struct xfs_mount *mp,
166 char *fmt, va_list ap)
167 __attribute__ ((format (printf, 3, 0)));
168extern void xfs_cmn_err(int panic_tag, int level, struct xfs_mount *mp,
169 char *fmt, ...)
170 __attribute__ ((format (printf, 4, 5)));
171extern void xfs_fs_cmn_err(int level, struct xfs_mount *mp, char *fmt, ...)
172 __attribute__ ((format (printf, 3, 4)));
173
174extern void xfs_hex_dump(void *p, int length); 165extern void xfs_hex_dump(void *p, int length);
175 166
176#define xfs_fs_repair_cmn_err(level, mp, fmt, args...) \ 167#define xfs_fs_repair_cmn_err(level, mp, fmt, args...) \
177 xfs_fs_cmn_err(level, mp, fmt " Unmount and run xfs_repair.", ## args) 168 xfs_fs_cmn_err(level, mp, fmt " Unmount and run xfs_repair.", ## args)
178 169
179#define xfs_fs_mount_cmn_err(f, fmt, args...) \ 170#define xfs_fs_mount_cmn_err(f, fmt, args...) \
180 ((f & XFS_MFSI_QUIET)? (void)0 : cmn_err(CE_WARN, "XFS: " fmt, ## args)) 171 do { \
172 if (!(f & XFS_MFSI_QUIET)) \
173 cmn_err(CE_WARN, "XFS: " fmt, ## args); \
174 } while (0)
181 175
182#endif /* __XFS_ERROR_H__ */ 176#endif /* __XFS_ERROR_H__ */
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index a55e687bf562..d22e62623437 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -48,6 +48,28 @@ xfs_efi_item_free(
48} 48}
49 49
50/* 50/*
51 * Freeing the efi requires that we remove it from the AIL if it has already
52 * been placed there. However, the EFI may not yet have been placed in the AIL
53 * when called by xfs_efi_release() from EFD processing due to the ordering of
54 * committed vs unpin operations in bulk insert operations. Hence the
55 * test_and_clear_bit(XFS_EFI_COMMITTED) to ensure only the last caller frees
56 * the EFI.
57 */
58STATIC void
59__xfs_efi_release(
60 struct xfs_efi_log_item *efip)
61{
62 struct xfs_ail *ailp = efip->efi_item.li_ailp;
63
64 if (!test_and_clear_bit(XFS_EFI_COMMITTED, &efip->efi_flags)) {
65 spin_lock(&ailp->xa_lock);
66 /* xfs_trans_ail_delete() drops the AIL lock. */
67 xfs_trans_ail_delete(ailp, &efip->efi_item);
68 xfs_efi_item_free(efip);
69 }
70}
71
72/*
51 * This returns the number of iovecs needed to log the given efi item. 73 * This returns the number of iovecs needed to log the given efi item.
52 * We only need 1 iovec for an efi item. It just logs the efi_log_format 74 * We only need 1 iovec for an efi item. It just logs the efi_log_format
53 * structure. 75 * structure.
@@ -74,7 +96,8 @@ xfs_efi_item_format(
74 struct xfs_efi_log_item *efip = EFI_ITEM(lip); 96 struct xfs_efi_log_item *efip = EFI_ITEM(lip);
75 uint size; 97 uint size;
76 98
77 ASSERT(efip->efi_next_extent == efip->efi_format.efi_nextents); 99 ASSERT(atomic_read(&efip->efi_next_extent) ==
100 efip->efi_format.efi_nextents);
78 101
79 efip->efi_format.efi_type = XFS_LI_EFI; 102 efip->efi_format.efi_type = XFS_LI_EFI;
80 103
@@ -99,10 +122,12 @@ xfs_efi_item_pin(
99} 122}
100 123
101/* 124/*
102 * While EFIs cannot really be pinned, the unpin operation is the 125 * While EFIs cannot really be pinned, the unpin operation is the last place at
103 * last place at which the EFI is manipulated during a transaction. 126 * which the EFI is manipulated during a transaction. If we are being asked to
104 * Here we coordinate with xfs_efi_cancel() to determine who gets to 127 * remove the EFI it's because the transaction has been cancelled and by
105 * free the EFI. 128 * definition that means the EFI cannot be in the AIL so remove it from the
129 * transaction and free it. Otherwise coordinate with xfs_efi_release() (via
130 * XFS_EFI_COMMITTED) to determine who gets to free the EFI.
106 */ 131 */
107STATIC void 132STATIC void
108xfs_efi_item_unpin( 133xfs_efi_item_unpin(
@@ -110,20 +135,15 @@ xfs_efi_item_unpin(
110 int remove) 135 int remove)
111{ 136{
112 struct xfs_efi_log_item *efip = EFI_ITEM(lip); 137 struct xfs_efi_log_item *efip = EFI_ITEM(lip);
113 struct xfs_ail *ailp = lip->li_ailp;
114 138
115 spin_lock(&ailp->xa_lock); 139 if (remove) {
116 if (efip->efi_flags & XFS_EFI_CANCELED) { 140 ASSERT(!(lip->li_flags & XFS_LI_IN_AIL));
117 if (remove) 141 if (lip->li_desc)
118 xfs_trans_del_item(lip); 142 xfs_trans_del_item(lip);
119
120 /* xfs_trans_ail_delete() drops the AIL lock. */
121 xfs_trans_ail_delete(ailp, lip);
122 xfs_efi_item_free(efip); 143 xfs_efi_item_free(efip);
123 } else { 144 return;
124 efip->efi_flags |= XFS_EFI_COMMITTED;
125 spin_unlock(&ailp->xa_lock);
126 } 145 }
146 __xfs_efi_release(efip);
127} 147}
128 148
129/* 149/*
@@ -152,16 +172,20 @@ xfs_efi_item_unlock(
152} 172}
153 173
154/* 174/*
155 * The EFI is logged only once and cannot be moved in the log, so 175 * The EFI is logged only once and cannot be moved in the log, so simply return
156 * simply return the lsn at which it's been logged. The canceled 176 * the lsn at which it's been logged. For bulk transaction committed
157 * flag is not paid any attention here. Checking for that is delayed 177 * processing, the EFI may be processed but not yet unpinned prior to the EFD
158 * until the EFI is unpinned. 178 * being processed. Set the XFS_EFI_COMMITTED flag so this case can be detected
179 * when processing the EFD.
159 */ 180 */
160STATIC xfs_lsn_t 181STATIC xfs_lsn_t
161xfs_efi_item_committed( 182xfs_efi_item_committed(
162 struct xfs_log_item *lip, 183 struct xfs_log_item *lip,
163 xfs_lsn_t lsn) 184 xfs_lsn_t lsn)
164{ 185{
186 struct xfs_efi_log_item *efip = EFI_ITEM(lip);
187
188 set_bit(XFS_EFI_COMMITTED, &efip->efi_flags);
165 return lsn; 189 return lsn;
166} 190}
167 191
@@ -230,6 +254,7 @@ xfs_efi_init(
230 xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops); 254 xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops);
231 efip->efi_format.efi_nextents = nextents; 255 efip->efi_format.efi_nextents = nextents;
232 efip->efi_format.efi_id = (__psint_t)(void*)efip; 256 efip->efi_format.efi_id = (__psint_t)(void*)efip;
257 atomic_set(&efip->efi_next_extent, 0);
233 258
234 return efip; 259 return efip;
235} 260}
@@ -289,37 +314,18 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
289} 314}
290 315
291/* 316/*
292 * This is called by the efd item code below to release references to 317 * This is called by the efd item code below to release references to the given
293 * the given efi item. Each efd calls this with the number of 318 * efi item. Each efd calls this with the number of extents that it has
294 * extents that it has logged, and when the sum of these reaches 319 * logged, and when the sum of these reaches the total number of extents logged
295 * the total number of extents logged by this efi item we can free 320 * by this efi item we can free the efi item.
296 * the efi item.
297 *
298 * Freeing the efi item requires that we remove it from the AIL.
299 * We'll use the AIL lock to protect our counters as well as
300 * the removal from the AIL.
301 */ 321 */
302void 322void
303xfs_efi_release(xfs_efi_log_item_t *efip, 323xfs_efi_release(xfs_efi_log_item_t *efip,
304 uint nextents) 324 uint nextents)
305{ 325{
306 struct xfs_ail *ailp = efip->efi_item.li_ailp; 326 ASSERT(atomic_read(&efip->efi_next_extent) >= nextents);
307 int extents_left; 327 if (atomic_sub_and_test(nextents, &efip->efi_next_extent))
308 328 __xfs_efi_release(efip);
309 ASSERT(efip->efi_next_extent > 0);
310 ASSERT(efip->efi_flags & XFS_EFI_COMMITTED);
311
312 spin_lock(&ailp->xa_lock);
313 ASSERT(efip->efi_next_extent >= nextents);
314 efip->efi_next_extent -= nextents;
315 extents_left = efip->efi_next_extent;
316 if (extents_left == 0) {
317 /* xfs_trans_ail_delete() drops the AIL lock. */
318 xfs_trans_ail_delete(ailp, (xfs_log_item_t *)efip);
319 xfs_efi_item_free(efip);
320 } else {
321 spin_unlock(&ailp->xa_lock);
322 }
323} 329}
324 330
325static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip) 331static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip)
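
xfs_efi_release() above now uses atomic_sub_and_test() on efi_next_extent, so the AIL lock is no longer needed just to guard the counter: whichever EFD brings the outstanding extent count to zero frees the item. An equivalent userspace sketch with C11 atomics (structure and names are invented for illustration):

#include <stdatomic.h>
#include <stdio.h>

struct efi {
        atomic_int next_extent;         /* extents still to be released */
};

static void efi_free(struct efi *efip)
{
        printf("freeing efi %p\n", (void *)efip);
}

static void efi_release(struct efi *efip, int nextents)
{
        /* fetch_sub returns the old value; old == nextents means we hit zero */
        if (atomic_fetch_sub(&efip->next_extent, nextents) == nextents)
                efi_free(efip);
}

int main(void)
{
        struct efi e;

        atomic_init(&e.next_extent, 3);
        efi_release(&e, 1);
        efi_release(&e, 2);             /* this caller frees the item */
        return 0;
}
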
diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h
index 0d22c56fdf64..375f68e42531 100644
--- a/fs/xfs/xfs_extfree_item.h
+++ b/fs/xfs/xfs_extfree_item.h
@@ -111,11 +111,10 @@ typedef struct xfs_efd_log_format_64 {
111#define XFS_EFI_MAX_FAST_EXTENTS 16 111#define XFS_EFI_MAX_FAST_EXTENTS 16
112 112
113/* 113/*
114 * Define EFI flags. 114 * Define EFI flag bits. Manipulated by set/clear/test_bit operators.
115 */ 115 */
116#define XFS_EFI_RECOVERED 0x1 116#define XFS_EFI_RECOVERED 1
117#define XFS_EFI_COMMITTED 0x2 117#define XFS_EFI_COMMITTED 2
118#define XFS_EFI_CANCELED 0x4
119 118
120/* 119/*
121 * This is the "extent free intention" log item. It is used 120 * This is the "extent free intention" log item. It is used
@@ -125,8 +124,8 @@ typedef struct xfs_efd_log_format_64 {
125 */ 124 */
126typedef struct xfs_efi_log_item { 125typedef struct xfs_efi_log_item {
127 xfs_log_item_t efi_item; 126 xfs_log_item_t efi_item;
128 uint efi_flags; /* misc flags */ 127 atomic_t efi_next_extent;
129 uint efi_next_extent; 128 unsigned long efi_flags; /* misc flags */
130 xfs_efi_log_format_t efi_format; 129 xfs_efi_log_format_t efi_format;
131} xfs_efi_log_item_t; 130} xfs_efi_log_item_t;
132 131
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index a7c116e814af..85668efb3e3e 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -53,6 +53,9 @@ xfs_fs_geometry(
53 xfs_fsop_geom_t *geo, 53 xfs_fsop_geom_t *geo,
54 int new_version) 54 int new_version)
55{ 55{
56
57 memset(geo, 0, sizeof(*geo));
58
56 geo->blocksize = mp->m_sb.sb_blocksize; 59 geo->blocksize = mp->m_sb.sb_blocksize;
57 geo->rtextsize = mp->m_sb.sb_rextsize; 60 geo->rtextsize = mp->m_sb.sb_rextsize;
58 geo->agblocks = mp->m_sb.sb_agblocks; 61 geo->agblocks = mp->m_sb.sb_agblocks;
@@ -374,6 +377,7 @@ xfs_growfs_data_private(
374 mp->m_maxicount = icount << mp->m_sb.sb_inopblog; 377 mp->m_maxicount = icount << mp->m_sb.sb_inopblog;
375 } else 378 } else
376 mp->m_maxicount = 0; 379 mp->m_maxicount = 0;
380 xfs_set_low_space_thresholds(mp);
377 381
378 /* update secondary superblocks. */ 382 /* update secondary superblocks. */
379 for (agno = 1; agno < nagcount; agno++) { 383 for (agno = 1; agno < nagcount; agno++) {
@@ -611,12 +615,13 @@ out:
611 * 615 *
612 * We cannot use an inode here for this - that will push dirty state back up 616 * We cannot use an inode here for this - that will push dirty state back up
613 * into the VFS and then periodic inode flushing will prevent log covering from 617 * into the VFS and then periodic inode flushing will prevent log covering from
614 * making progress. Hence we log a field in the superblock instead. 618 * making progress. Hence we log a field in the superblock instead and use a
619 * synchronous transaction to ensure the superblock is immediately unpinned
620 * and can be written back.
615 */ 621 */
616int 622int
617xfs_fs_log_dummy( 623xfs_fs_log_dummy(
618 xfs_mount_t *mp, 624 xfs_mount_t *mp)
619 int flags)
620{ 625{
621 xfs_trans_t *tp; 626 xfs_trans_t *tp;
622 int error; 627 int error;
@@ -631,8 +636,7 @@ xfs_fs_log_dummy(
631 636
632 /* log the UUID because it is an unchanging field */ 637 /* log the UUID because it is an unchanging field */
633 xfs_mod_sb(tp, XFS_SB_UUID); 638 xfs_mod_sb(tp, XFS_SB_UUID);
634 if (flags & SYNC_WAIT) 639 xfs_trans_set_sync(tp);
635 xfs_trans_set_sync(tp);
636 return xfs_trans_commit(tp, 0); 640 return xfs_trans_commit(tp, 0);
637} 641}
638 642
diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h
index a786c5212c1e..1b6a98b66886 100644
--- a/fs/xfs/xfs_fsops.h
+++ b/fs/xfs/xfs_fsops.h
@@ -25,6 +25,6 @@ extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt);
25extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval, 25extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval,
26 xfs_fsop_resblks_t *outval); 26 xfs_fsop_resblks_t *outval);
27extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags); 27extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags);
28extern int xfs_fs_log_dummy(xfs_mount_t *mp, int flags); 28extern int xfs_fs_log_dummy(struct xfs_mount *mp);
29 29
30#endif /* __XFS_FSOPS_H__ */ 30#endif /* __XFS_FSOPS_H__ */
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index d7de5a3f7867..cb9b6d1469f7 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -43,6 +43,17 @@
43 43
44 44
45/* 45/*
46 * Define xfs inode iolock lockdep classes. We need to ensure that all active
47 * inodes are considered the same for lockdep purposes, including inodes that
 48 * are recycled through the XFS_IRECLAIMABLE state. This is the only way to
49 * guarantee the locks are considered the same when there are multiple lock
 50 * initialisation sites. Also, define a reclaimable inode class so it is
51 * obvious in lockdep reports which class the report is against.
52 */
53static struct lock_class_key xfs_iolock_active;
54struct lock_class_key xfs_iolock_reclaimable;
55
56/*
46 * Allocate and initialise an xfs_inode. 57 * Allocate and initialise an xfs_inode.
47 */ 58 */
48STATIC struct xfs_inode * 59STATIC struct xfs_inode *
@@ -69,8 +80,11 @@ xfs_inode_alloc(
69 ASSERT(atomic_read(&ip->i_pincount) == 0); 80 ASSERT(atomic_read(&ip->i_pincount) == 0);
70 ASSERT(!spin_is_locked(&ip->i_flags_lock)); 81 ASSERT(!spin_is_locked(&ip->i_flags_lock));
71 ASSERT(completion_done(&ip->i_flush)); 82 ASSERT(completion_done(&ip->i_flush));
83 ASSERT(ip->i_ino == 0);
72 84
73 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); 85 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
86 lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
87 &xfs_iolock_active, "xfs_iolock_active");
74 88
75 /* initialise the xfs inode */ 89 /* initialise the xfs inode */
76 ip->i_ino = ino; 90 ip->i_ino = ino;
@@ -85,9 +99,6 @@ xfs_inode_alloc(
85 ip->i_size = 0; 99 ip->i_size = 0;
86 ip->i_new_size = 0; 100 ip->i_new_size = 0;
87 101
88 /* prevent anyone from using this yet */
89 VFS_I(ip)->i_state = I_NEW;
90
91 return ip; 102 return ip;
92} 103}
93 104
@@ -145,7 +156,18 @@ xfs_inode_free(
145 ASSERT(!spin_is_locked(&ip->i_flags_lock)); 156 ASSERT(!spin_is_locked(&ip->i_flags_lock));
146 ASSERT(completion_done(&ip->i_flush)); 157 ASSERT(completion_done(&ip->i_flush));
147 158
148 call_rcu(&ip->i_vnode.i_rcu, xfs_inode_free_callback); 159 /*
160 * Because we use RCU freeing we need to ensure the inode always
161 * appears to be reclaimed with an invalid inode number when in the
162 * free state. The ip->i_flags_lock provides the barrier against lookup
163 * races.
164 */
165 spin_lock(&ip->i_flags_lock);
166 ip->i_flags = XFS_IRECLAIM;
167 ip->i_ino = 0;
168 spin_unlock(&ip->i_flags_lock);
169
170 call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
149} 171}
150 172
151/* 173/*
@@ -155,14 +177,29 @@ static int
155xfs_iget_cache_hit( 177xfs_iget_cache_hit(
156 struct xfs_perag *pag, 178 struct xfs_perag *pag,
157 struct xfs_inode *ip, 179 struct xfs_inode *ip,
180 xfs_ino_t ino,
158 int flags, 181 int flags,
159 int lock_flags) __releases(pag->pag_ici_lock) 182 int lock_flags) __releases(RCU)
160{ 183{
161 struct inode *inode = VFS_I(ip); 184 struct inode *inode = VFS_I(ip);
162 struct xfs_mount *mp = ip->i_mount; 185 struct xfs_mount *mp = ip->i_mount;
163 int error; 186 int error;
164 187
188 /*
189 * check for re-use of an inode within an RCU grace period due to the
190 * radix tree nodes not being updated yet. We monitor for this by
191 * setting the inode number to zero before freeing the inode structure.
192 * If the inode has been reallocated and set up, then the inode number
193 * will not match, so check for that, too.
194 */
165 spin_lock(&ip->i_flags_lock); 195 spin_lock(&ip->i_flags_lock);
196 if (ip->i_ino != ino) {
197 trace_xfs_iget_skip(ip);
198 XFS_STATS_INC(xs_ig_frecycle);
199 error = EAGAIN;
200 goto out_error;
201 }
202
166 203
167 /* 204 /*
168 * If we are racing with another cache hit that is currently 205 * If we are racing with another cache hit that is currently
@@ -205,7 +242,7 @@ xfs_iget_cache_hit(
205 ip->i_flags |= XFS_IRECLAIM; 242 ip->i_flags |= XFS_IRECLAIM;
206 243
207 spin_unlock(&ip->i_flags_lock); 244 spin_unlock(&ip->i_flags_lock);
208 read_unlock(&pag->pag_ici_lock); 245 rcu_read_unlock();
209 246
210 error = -inode_init_always(mp->m_super, inode); 247 error = -inode_init_always(mp->m_super, inode);
211 if (error) { 248 if (error) {
@@ -213,7 +250,7 @@ xfs_iget_cache_hit(
213 * Re-initializing the inode failed, and we are in deep 250 * Re-initializing the inode failed, and we are in deep
214 * trouble. Try to re-add it to the reclaim list. 251 * trouble. Try to re-add it to the reclaim list.
215 */ 252 */
216 read_lock(&pag->pag_ici_lock); 253 rcu_read_lock();
217 spin_lock(&ip->i_flags_lock); 254 spin_lock(&ip->i_flags_lock);
218 255
219 ip->i_flags &= ~XFS_INEW; 256 ip->i_flags &= ~XFS_INEW;
@@ -223,14 +260,20 @@ xfs_iget_cache_hit(
223 goto out_error; 260 goto out_error;
224 } 261 }
225 262
226 write_lock(&pag->pag_ici_lock); 263 spin_lock(&pag->pag_ici_lock);
227 spin_lock(&ip->i_flags_lock); 264 spin_lock(&ip->i_flags_lock);
228 ip->i_flags &= ~(XFS_IRECLAIMABLE | XFS_IRECLAIM); 265 ip->i_flags &= ~(XFS_IRECLAIMABLE | XFS_IRECLAIM);
229 ip->i_flags |= XFS_INEW; 266 ip->i_flags |= XFS_INEW;
230 __xfs_inode_clear_reclaim_tag(mp, pag, ip); 267 __xfs_inode_clear_reclaim_tag(mp, pag, ip);
231 inode->i_state = I_NEW; 268 inode->i_state = I_NEW;
269
270 ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
271 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
272 lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
273 &xfs_iolock_active, "xfs_iolock_active");
274
232 spin_unlock(&ip->i_flags_lock); 275 spin_unlock(&ip->i_flags_lock);
233 write_unlock(&pag->pag_ici_lock); 276 spin_unlock(&pag->pag_ici_lock);
234 } else { 277 } else {
235 /* If the VFS inode is being torn down, pause and try again. */ 278 /* If the VFS inode is being torn down, pause and try again. */
236 if (!igrab(inode)) { 279 if (!igrab(inode)) {
@@ -241,7 +284,7 @@ xfs_iget_cache_hit(
241 284
242 /* We've got a live one. */ 285 /* We've got a live one. */
243 spin_unlock(&ip->i_flags_lock); 286 spin_unlock(&ip->i_flags_lock);
244 read_unlock(&pag->pag_ici_lock); 287 rcu_read_unlock();
245 trace_xfs_iget_hit(ip); 288 trace_xfs_iget_hit(ip);
246 } 289 }
247 290
@@ -255,7 +298,7 @@ xfs_iget_cache_hit(
255 298
256out_error: 299out_error:
257 spin_unlock(&ip->i_flags_lock); 300 spin_unlock(&ip->i_flags_lock);
258 read_unlock(&pag->pag_ici_lock); 301 rcu_read_unlock();
259 return error; 302 return error;
260} 303}
261 304
@@ -308,7 +351,7 @@ xfs_iget_cache_miss(
308 BUG(); 351 BUG();
309 } 352 }
310 353
311 write_lock(&pag->pag_ici_lock); 354 spin_lock(&pag->pag_ici_lock);
312 355
313 /* insert the new inode */ 356 /* insert the new inode */
314 error = radix_tree_insert(&pag->pag_ici_root, agino, ip); 357 error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
@@ -323,14 +366,14 @@ xfs_iget_cache_miss(
323 ip->i_udquot = ip->i_gdquot = NULL; 366 ip->i_udquot = ip->i_gdquot = NULL;
324 xfs_iflags_set(ip, XFS_INEW); 367 xfs_iflags_set(ip, XFS_INEW);
325 368
326 write_unlock(&pag->pag_ici_lock); 369 spin_unlock(&pag->pag_ici_lock);
327 radix_tree_preload_end(); 370 radix_tree_preload_end();
328 371
329 *ipp = ip; 372 *ipp = ip;
330 return 0; 373 return 0;
331 374
332out_preload_end: 375out_preload_end:
333 write_unlock(&pag->pag_ici_lock); 376 spin_unlock(&pag->pag_ici_lock);
334 radix_tree_preload_end(); 377 radix_tree_preload_end();
335 if (lock_flags) 378 if (lock_flags)
336 xfs_iunlock(ip, lock_flags); 379 xfs_iunlock(ip, lock_flags);
@@ -377,7 +420,7 @@ xfs_iget(
377 xfs_agino_t agino; 420 xfs_agino_t agino;
378 421
379 /* reject inode numbers outside existing AGs */ 422 /* reject inode numbers outside existing AGs */
380 if (XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount) 423 if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
381 return EINVAL; 424 return EINVAL;
382 425
383 /* get the perag structure and ensure that it's inode capable */ 426 /* get the perag structure and ensure that it's inode capable */
@@ -386,15 +429,15 @@ xfs_iget(
386 429
387again: 430again:
388 error = 0; 431 error = 0;
389 read_lock(&pag->pag_ici_lock); 432 rcu_read_lock();
390 ip = radix_tree_lookup(&pag->pag_ici_root, agino); 433 ip = radix_tree_lookup(&pag->pag_ici_root, agino);
391 434
392 if (ip) { 435 if (ip) {
393 error = xfs_iget_cache_hit(pag, ip, flags, lock_flags); 436 error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
394 if (error) 437 if (error)
395 goto out_error_or_again; 438 goto out_error_or_again;
396 } else { 439 } else {
397 read_unlock(&pag->pag_ici_lock); 440 rcu_read_unlock();
398 XFS_STATS_INC(xs_ig_missed); 441 XFS_STATS_INC(xs_ig_missed);
399 442
400 error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip, 443 error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
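Replacing the pag_ici_lock read lock with rcu_read_lock() means a lookup can race with inode freeing and even reallocation inside the same grace period, which is why xfs_inode_free() zeroes i_ino under i_flags_lock before call_rcu() and why every cache hit revalidates the inode number under that lock. A generic sketch of the lookup/revalidation idiom (hypothetical struct and names, not the XFS code):

#include <linux/radix-tree.h>
#include <linux/rcupdate.h>
#include <linux/spinlock.h>

struct obj {
	spinlock_t	lock;
	unsigned long	key;	/* zeroed under ->lock before the object is freed via call_rcu() */
	struct rcu_head	rcu;
};

/* Returns a validated object, or NULL if the caller should treat it as a miss/retry. */
static struct obj *obj_lookup(struct radix_tree_root *root, unsigned long key)
{
	struct obj *o;

	rcu_read_lock();
	o = radix_tree_lookup(root, key);
	if (!o) {
		rcu_read_unlock();
		return NULL;
	}

	/*
	 * The pointer may be stale: the object can have been freed, or freed
	 * and reused, since the radix tree node was read. Revalidate the key
	 * under the object's lock before trusting the hit.
	 */
	spin_lock(&o->lock);
	if (o->key != key) {
		spin_unlock(&o->lock);
		rcu_read_unlock();
		return NULL;
	}
	/* ... take a reference here, while the object is known to be live ... */
	spin_unlock(&o->lock);
	rcu_read_unlock();
	return o;
}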
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 108c7a085f94..be7cf625421f 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -887,7 +887,7 @@ xfs_iread(
887 * around for a while. This helps to keep recently accessed 887 * around for a while. This helps to keep recently accessed
888 * meta-data in-core longer. 888 * meta-data in-core longer.
889 */ 889 */
890 XFS_BUF_SET_REF(bp, XFS_INO_REF); 890 xfs_buf_set_ref(bp, XFS_INO_REF);
891 891
892 /* 892 /*
893 * Use xfs_trans_brelse() to release the buffer containing the 893 * Use xfs_trans_brelse() to release the buffer containing the
@@ -2000,17 +2000,33 @@ xfs_ifree_cluster(
2000 */ 2000 */
2001 for (i = 0; i < ninodes; i++) { 2001 for (i = 0; i < ninodes; i++) {
2002retry: 2002retry:
2003 read_lock(&pag->pag_ici_lock); 2003 rcu_read_lock();
2004 ip = radix_tree_lookup(&pag->pag_ici_root, 2004 ip = radix_tree_lookup(&pag->pag_ici_root,
2005 XFS_INO_TO_AGINO(mp, (inum + i))); 2005 XFS_INO_TO_AGINO(mp, (inum + i)));
2006 2006
2007 /* Inode not in memory or stale, nothing to do */ 2007 /* Inode not in memory, nothing to do */
2008 if (!ip || xfs_iflags_test(ip, XFS_ISTALE)) { 2008 if (!ip) {
2009 read_unlock(&pag->pag_ici_lock); 2009 rcu_read_unlock();
2010 continue; 2010 continue;
2011 } 2011 }
2012 2012
2013 /* 2013 /*
2014 * because this is an RCU protected lookup, we could
2015 * find a recently freed or even reallocated inode
2016 * during the lookup. We need to check under the
2017 * i_flags_lock for a valid inode here. Skip it if it
2018 * is not valid, the wrong inode or stale.
2019 */
2020 spin_lock(&ip->i_flags_lock);
2021 if (ip->i_ino != inum + i ||
2022 __xfs_iflags_test(ip, XFS_ISTALE)) {
2023 spin_unlock(&ip->i_flags_lock);
2024 rcu_read_unlock();
2025 continue;
2026 }
2027 spin_unlock(&ip->i_flags_lock);
2028
2029 /*
2014 * Don't try to lock/unlock the current inode, but we 2030 * Don't try to lock/unlock the current inode, but we
2015 * _cannot_ skip the other inodes that we did not find 2031 * _cannot_ skip the other inodes that we did not find
2016 * in the list attached to the buffer and are not 2032 * in the list attached to the buffer and are not
@@ -2019,11 +2035,11 @@ retry:
2019 */ 2035 */
2020 if (ip != free_ip && 2036 if (ip != free_ip &&
2021 !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { 2037 !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
2022 read_unlock(&pag->pag_ici_lock); 2038 rcu_read_unlock();
2023 delay(1); 2039 delay(1);
2024 goto retry; 2040 goto retry;
2025 } 2041 }
2026 read_unlock(&pag->pag_ici_lock); 2042 rcu_read_unlock();
2027 2043
2028 xfs_iflock(ip); 2044 xfs_iflock(ip);
2029 xfs_iflags_set(ip, XFS_ISTALE); 2045 xfs_iflags_set(ip, XFS_ISTALE);
@@ -2629,7 +2645,7 @@ xfs_iflush_cluster(
2629 2645
2630 mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1); 2646 mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
2631 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask; 2647 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
2632 read_lock(&pag->pag_ici_lock); 2648 rcu_read_lock();
2633 /* really need a gang lookup range call here */ 2649 /* really need a gang lookup range call here */
2634 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist, 2650 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist,
2635 first_index, inodes_per_cluster); 2651 first_index, inodes_per_cluster);
@@ -2640,9 +2656,21 @@ xfs_iflush_cluster(
2640 iq = ilist[i]; 2656 iq = ilist[i];
2641 if (iq == ip) 2657 if (iq == ip)
2642 continue; 2658 continue;
2643 /* if the inode lies outside this cluster, we're done. */ 2659
2644 if ((XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) 2660 /*
2645 break; 2661 * because this is an RCU protected lookup, we could find a
2662 * recently freed or even reallocated inode during the lookup.
2663 * We need to check under the i_flags_lock for a valid inode
2664 * here. Skip it if it is not valid or the wrong inode.
2665 */
2666 spin_lock(&ip->i_flags_lock);
2667 if (!ip->i_ino ||
2668 (XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) {
2669 spin_unlock(&ip->i_flags_lock);
2670 continue;
2671 }
2672 spin_unlock(&ip->i_flags_lock);
2673
2646 /* 2674 /*
2647 * Do an un-protected check to see if the inode is dirty and 2675 * Do an un-protected check to see if the inode is dirty and
2648 * is a candidate for flushing. These checks will be repeated 2676 * is a candidate for flushing. These checks will be repeated
@@ -2692,7 +2720,7 @@ xfs_iflush_cluster(
2692 } 2720 }
2693 2721
2694out_free: 2722out_free:
2695 read_unlock(&pag->pag_ici_lock); 2723 rcu_read_unlock();
2696 kmem_free(ilist); 2724 kmem_free(ilist);
2697out_put: 2725out_put:
2698 xfs_perag_put(pag); 2726 xfs_perag_put(pag);
@@ -2704,7 +2732,7 @@ cluster_corrupt_out:
2704 * Corruption detected in the clustering loop. Invalidate the 2732 * Corruption detected in the clustering loop. Invalidate the
2705 * inode buffer and shut down the filesystem. 2733 * inode buffer and shut down the filesystem.
2706 */ 2734 */
2707 read_unlock(&pag->pag_ici_lock); 2735 rcu_read_unlock();
2708 /* 2736 /*
2709 * Clean up the buffer. If it was B_DELWRI, just release it -- 2737 * Clean up the buffer. If it was B_DELWRI, just release it --
2710 * brelse can handle it with no problems. If not, shut down the 2738 * brelse can handle it with no problems. If not, shut down the
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index fb2ca2e4cdc9..5c95fa8ec11d 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -376,12 +376,13 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
376/* 376/*
377 * In-core inode flags. 377 * In-core inode flags.
378 */ 378 */
379#define XFS_IRECLAIM 0x0001 /* we have started reclaiming this inode */ 379#define XFS_IRECLAIM 0x0001 /* started reclaiming this inode */
380#define XFS_ISTALE 0x0002 /* inode has been staled */ 380#define XFS_ISTALE 0x0002 /* inode has been staled */
381#define XFS_IRECLAIMABLE 0x0004 /* inode can be reclaimed */ 381#define XFS_IRECLAIMABLE 0x0004 /* inode can be reclaimed */
382#define XFS_INEW 0x0008 /* inode has just been allocated */ 382#define XFS_INEW 0x0008 /* inode has just been allocated */
383#define XFS_IFILESTREAM 0x0010 /* inode is in a filestream directory */ 383#define XFS_IFILESTREAM 0x0010 /* inode is in a filestream directory */
384#define XFS_ITRUNCATED 0x0020 /* truncated down so flush-on-close */ 384#define XFS_ITRUNCATED 0x0020 /* truncated down so flush-on-close */
385#define XFS_IDIRTY_RELEASE 0x0040 /* dirty release already seen */
385 386
386/* 387/*
387 * Flags for inode locking. 388 * Flags for inode locking.
@@ -438,6 +439,8 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
438#define XFS_IOLOCK_DEP(flags) (((flags) & XFS_IOLOCK_DEP_MASK) >> XFS_IOLOCK_SHIFT) 439#define XFS_IOLOCK_DEP(flags) (((flags) & XFS_IOLOCK_DEP_MASK) >> XFS_IOLOCK_SHIFT)
439#define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT) 440#define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT)
440 441
442extern struct lock_class_key xfs_iolock_reclaimable;
443
441/* 444/*
442 * Flags for xfs_itruncate_start(). 445 * Flags for xfs_itruncate_start().
443 */ 446 */
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 7c8d30c453c3..fd4f398bd6f1 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -842,15 +842,64 @@ xfs_inode_item_destroy(
842 * flushed to disk. It is responsible for removing the inode item 842 * flushed to disk. It is responsible for removing the inode item
843 * from the AIL if it has not been re-logged, and unlocking the inode's 843 * from the AIL if it has not been re-logged, and unlocking the inode's
844 * flush lock. 844 * flush lock.
845 *
846 * To reduce AIL lock traffic as much as possible, we scan the buffer log item
847 * list for other inodes that will run this function. We remove them from the
848 * buffer list so we can process all the inode IO completions in one AIL lock
849 * traversal.
845 */ 850 */
846void 851void
847xfs_iflush_done( 852xfs_iflush_done(
848 struct xfs_buf *bp, 853 struct xfs_buf *bp,
849 struct xfs_log_item *lip) 854 struct xfs_log_item *lip)
850{ 855{
851 struct xfs_inode_log_item *iip = INODE_ITEM(lip); 856 struct xfs_inode_log_item *iip;
852 xfs_inode_t *ip = iip->ili_inode; 857 struct xfs_log_item *blip;
858 struct xfs_log_item *next;
859 struct xfs_log_item *prev;
853 struct xfs_ail *ailp = lip->li_ailp; 860 struct xfs_ail *ailp = lip->li_ailp;
861 int need_ail = 0;
862
863 /*
864 * Scan the buffer IO completions for other inodes being completed and
865 * attach them to the current inode log item.
866 */
867 blip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
868 prev = NULL;
869 while (blip != NULL) {
870 if (lip->li_cb != xfs_iflush_done) {
871 prev = blip;
872 blip = blip->li_bio_list;
873 continue;
874 }
875
876 /* remove from list */
877 next = blip->li_bio_list;
878 if (!prev) {
879 XFS_BUF_SET_FSPRIVATE(bp, next);
880 } else {
881 prev->li_bio_list = next;
882 }
883
884 /* add to current list */
885 blip->li_bio_list = lip->li_bio_list;
886 lip->li_bio_list = blip;
887
888 /*
889 * while we have the item, do the unlocked check for needing
890 * the AIL lock.
891 */
892 iip = INODE_ITEM(blip);
893 if (iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn)
894 need_ail++;
895
896 blip = next;
897 }
898
899 /* make sure we capture the state of the initial inode. */
900 iip = INODE_ITEM(lip);
901 if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn)
902 need_ail++;
854 903
855 /* 904 /*
856 * We only want to pull the item from the AIL if it is 905 * We only want to pull the item from the AIL if it is
@@ -861,28 +910,37 @@ xfs_iflush_done(
861 * the lock since it's cheaper, and then we recheck while 910 * the lock since it's cheaper, and then we recheck while
862 * holding the lock before removing the inode from the AIL. 911 * holding the lock before removing the inode from the AIL.
863 */ 912 */
864 if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) { 913 if (need_ail) {
914 struct xfs_log_item *log_items[need_ail];
915 int i = 0;
865 spin_lock(&ailp->xa_lock); 916 spin_lock(&ailp->xa_lock);
866 if (lip->li_lsn == iip->ili_flush_lsn) { 917 for (blip = lip; blip; blip = blip->li_bio_list) {
867 /* xfs_trans_ail_delete() drops the AIL lock. */ 918 iip = INODE_ITEM(blip);
868 xfs_trans_ail_delete(ailp, lip); 919 if (iip->ili_logged &&
869 } else { 920 blip->li_lsn == iip->ili_flush_lsn) {
870 spin_unlock(&ailp->xa_lock); 921 log_items[i++] = blip;
922 }
923 ASSERT(i <= need_ail);
871 } 924 }
925 /* xfs_trans_ail_delete_bulk() drops the AIL lock. */
926 xfs_trans_ail_delete_bulk(ailp, log_items, i);
872 } 927 }
873 928
874 iip->ili_logged = 0;
875 929
876 /* 930 /*
877 * Clear the ili_last_fields bits now that we know that the 931 * clean up and unlock the flush lock now we are done. We can clear the
878 * data corresponding to them is safely on disk. 932 * ili_last_fields bits now that we know that the data corresponding to
933 * them is safely on disk.
879 */ 934 */
880 iip->ili_last_fields = 0; 935 for (blip = lip; blip; blip = next) {
936 next = blip->li_bio_list;
937 blip->li_bio_list = NULL;
881 938
882 /* 939 iip = INODE_ITEM(blip);
883 * Release the inode's flush lock since we're done with it. 940 iip->ili_logged = 0;
884 */ 941 iip->ili_last_fields = 0;
885 xfs_ifunlock(ip); 942 xfs_ifunlock(iip->ili_inode);
943 }
886} 944}
887 945
888/* 946/*
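The new xfs_iflush_done() walks the buffer's singly linked li_bio_list, unhooks the other inode items and chains them onto the current item so that all of their AIL deletions happen under a single xa_lock round trip. A stand-alone sketch of that detach-and-collect idiom on a plain singly linked list (hypothetical names):

#include <stddef.h>

struct node {
	struct node	*next;
	int		wants_work;
};

/*
 * Unlink every node that needs processing from *headp and return them as a
 * private list so the caller can handle the whole batch under one lock.
 */
static struct node *collect_matching(struct node **headp)
{
	struct node *cur = *headp;
	struct node *prev = NULL;
	struct node *collected = NULL;

	while (cur) {
		struct node *next = cur->next;

		if (!cur->wants_work) {
			prev = cur;		/* leave it on the source list */
			cur = next;
			continue;
		}

		/* unlink from the source list */
		if (!prev)
			*headp = next;
		else
			prev->next = next;

		/* push onto the private batch list */
		cur->next = collected;
		collected = cur;

		cur = next;
	}
	return collected;
}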
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 20576146369f..8a0f044750c3 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -47,127 +47,8 @@
47 47
48#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \ 48#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \
49 << mp->m_writeio_log) 49 << mp->m_writeio_log)
50#define XFS_STRAT_WRITE_IMAPS 2
51#define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP 50#define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP
52 51
53STATIC int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
54 int, struct xfs_bmbt_irec *, int *);
55STATIC int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t, int,
56 struct xfs_bmbt_irec *, int *);
57STATIC int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t,
58 struct xfs_bmbt_irec *, int *);
59
60int
61xfs_iomap(
62 struct xfs_inode *ip,
63 xfs_off_t offset,
64 ssize_t count,
65 int flags,
66 struct xfs_bmbt_irec *imap,
67 int *nimaps,
68 int *new)
69{
70 struct xfs_mount *mp = ip->i_mount;
71 xfs_fileoff_t offset_fsb, end_fsb;
72 int error = 0;
73 int lockmode = 0;
74 int bmapi_flags = 0;
75
76 ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
77
78 *new = 0;
79
80 if (XFS_FORCED_SHUTDOWN(mp))
81 return XFS_ERROR(EIO);
82
83 trace_xfs_iomap_enter(ip, offset, count, flags, NULL);
84
85 switch (flags & (BMAPI_READ | BMAPI_WRITE | BMAPI_ALLOCATE)) {
86 case BMAPI_READ:
87 lockmode = xfs_ilock_map_shared(ip);
88 bmapi_flags = XFS_BMAPI_ENTIRE;
89 break;
90 case BMAPI_WRITE:
91 lockmode = XFS_ILOCK_EXCL;
92 if (flags & BMAPI_IGNSTATE)
93 bmapi_flags |= XFS_BMAPI_IGSTATE|XFS_BMAPI_ENTIRE;
94 xfs_ilock(ip, lockmode);
95 break;
96 case BMAPI_ALLOCATE:
97 lockmode = XFS_ILOCK_SHARED;
98 bmapi_flags = XFS_BMAPI_ENTIRE;
99
100 /* Attempt non-blocking lock */
101 if (flags & BMAPI_TRYLOCK) {
102 if (!xfs_ilock_nowait(ip, lockmode))
103 return XFS_ERROR(EAGAIN);
104 } else {
105 xfs_ilock(ip, lockmode);
106 }
107 break;
108 default:
109 BUG();
110 }
111
112 ASSERT(offset <= mp->m_maxioffset);
113 if ((xfs_fsize_t)offset + count > mp->m_maxioffset)
114 count = mp->m_maxioffset - offset;
115 end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
116 offset_fsb = XFS_B_TO_FSBT(mp, offset);
117
118 error = xfs_bmapi(NULL, ip, offset_fsb,
119 (xfs_filblks_t)(end_fsb - offset_fsb),
120 bmapi_flags, NULL, 0, imap,
121 nimaps, NULL);
122
123 if (error)
124 goto out;
125
126 switch (flags & (BMAPI_WRITE|BMAPI_ALLOCATE)) {
127 case BMAPI_WRITE:
128 /* If we found an extent, return it */
129 if (*nimaps &&
130 (imap->br_startblock != HOLESTARTBLOCK) &&
131 (imap->br_startblock != DELAYSTARTBLOCK)) {
132 trace_xfs_iomap_found(ip, offset, count, flags, imap);
133 break;
134 }
135
136 if (flags & BMAPI_DIRECT) {
137 error = xfs_iomap_write_direct(ip, offset, count, flags,
138 imap, nimaps);
139 } else {
140 error = xfs_iomap_write_delay(ip, offset, count, flags,
141 imap, nimaps);
142 }
143 if (!error) {
144 trace_xfs_iomap_alloc(ip, offset, count, flags, imap);
145 }
146 *new = 1;
147 break;
148 case BMAPI_ALLOCATE:
149 /* If we found an extent, return it */
150 xfs_iunlock(ip, lockmode);
151 lockmode = 0;
152
153 if (*nimaps && !isnullstartblock(imap->br_startblock)) {
154 trace_xfs_iomap_found(ip, offset, count, flags, imap);
155 break;
156 }
157
158 error = xfs_iomap_write_allocate(ip, offset, count,
159 imap, nimaps);
160 break;
161 }
162
163 ASSERT(*nimaps <= 1);
164
165out:
166 if (lockmode)
167 xfs_iunlock(ip, lockmode);
168 return XFS_ERROR(error);
169}
170
171STATIC int 52STATIC int
172xfs_iomap_eof_align_last_fsb( 53xfs_iomap_eof_align_last_fsb(
173 xfs_mount_t *mp, 54 xfs_mount_t *mp,
@@ -236,14 +117,13 @@ xfs_cmn_err_fsblock_zero(
236 return EFSCORRUPTED; 117 return EFSCORRUPTED;
237} 118}
238 119
239STATIC int 120int
240xfs_iomap_write_direct( 121xfs_iomap_write_direct(
241 xfs_inode_t *ip, 122 xfs_inode_t *ip,
242 xfs_off_t offset, 123 xfs_off_t offset,
243 size_t count, 124 size_t count,
244 int flags,
245 xfs_bmbt_irec_t *imap, 125 xfs_bmbt_irec_t *imap,
246 int *nmaps) 126 int nmaps)
247{ 127{
248 xfs_mount_t *mp = ip->i_mount; 128 xfs_mount_t *mp = ip->i_mount;
249 xfs_fileoff_t offset_fsb; 129 xfs_fileoff_t offset_fsb;
@@ -279,7 +159,7 @@ xfs_iomap_write_direct(
279 if (error) 159 if (error)
280 goto error_out; 160 goto error_out;
281 } else { 161 } else {
282 if (*nmaps && (imap->br_startblock == HOLESTARTBLOCK)) 162 if (nmaps && (imap->br_startblock == HOLESTARTBLOCK))
283 last_fsb = MIN(last_fsb, (xfs_fileoff_t) 163 last_fsb = MIN(last_fsb, (xfs_fileoff_t)
284 imap->br_blockcount + 164 imap->br_blockcount +
285 imap->br_startoff); 165 imap->br_startoff);
@@ -331,7 +211,7 @@ xfs_iomap_write_direct(
331 xfs_trans_ijoin(tp, ip); 211 xfs_trans_ijoin(tp, ip);
332 212
333 bmapi_flag = XFS_BMAPI_WRITE; 213 bmapi_flag = XFS_BMAPI_WRITE;
334 if ((flags & BMAPI_DIRECT) && (offset < ip->i_size || extsz)) 214 if (offset < ip->i_size || extsz)
335 bmapi_flag |= XFS_BMAPI_PREALLOC; 215 bmapi_flag |= XFS_BMAPI_PREALLOC;
336 216
337 /* 217 /*
@@ -370,7 +250,6 @@ xfs_iomap_write_direct(
370 goto error_out; 250 goto error_out;
371 } 251 }
372 252
373 *nmaps = 1;
374 return 0; 253 return 0;
375 254
376error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */ 255error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
@@ -379,7 +258,6 @@ error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
379 258
380error1: /* Just cancel transaction */ 259error1: /* Just cancel transaction */
381 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); 260 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
382 *nmaps = 0; /* nothing set-up here */
383 261
384error_out: 262error_out:
385 return XFS_ERROR(error); 263 return XFS_ERROR(error);
@@ -389,6 +267,9 @@ error_out:
389 * If the caller is doing a write at the end of the file, then extend the 267 * If the caller is doing a write at the end of the file, then extend the
390 * allocation out to the file system's write iosize. We clean up any extra 268 * allocation out to the file system's write iosize. We clean up any extra
391 * space left over when the file is closed in xfs_inactive(). 269 * space left over when the file is closed in xfs_inactive().
270 *
271 * If we find we already have delalloc preallocation beyond EOF, don't do more
 272 * preallocation as it is not needed.
392 */ 273 */
393STATIC int 274STATIC int
394xfs_iomap_eof_want_preallocate( 275xfs_iomap_eof_want_preallocate(
@@ -396,7 +277,6 @@ xfs_iomap_eof_want_preallocate(
396 xfs_inode_t *ip, 277 xfs_inode_t *ip,
397 xfs_off_t offset, 278 xfs_off_t offset,
398 size_t count, 279 size_t count,
399 int ioflag,
400 xfs_bmbt_irec_t *imap, 280 xfs_bmbt_irec_t *imap,
401 int nimaps, 281 int nimaps,
402 int *prealloc) 282 int *prealloc)
@@ -405,6 +285,7 @@ xfs_iomap_eof_want_preallocate(
405 xfs_filblks_t count_fsb; 285 xfs_filblks_t count_fsb;
406 xfs_fsblock_t firstblock; 286 xfs_fsblock_t firstblock;
407 int n, error, imaps; 287 int n, error, imaps;
288 int found_delalloc = 0;
408 289
409 *prealloc = 0; 290 *prealloc = 0;
410 if ((offset + count) <= ip->i_size) 291 if ((offset + count) <= ip->i_size)
@@ -429,20 +310,71 @@ xfs_iomap_eof_want_preallocate(
429 return 0; 310 return 0;
430 start_fsb += imap[n].br_blockcount; 311 start_fsb += imap[n].br_blockcount;
431 count_fsb -= imap[n].br_blockcount; 312 count_fsb -= imap[n].br_blockcount;
313
314 if (imap[n].br_startblock == DELAYSTARTBLOCK)
315 found_delalloc = 1;
432 } 316 }
433 } 317 }
434 *prealloc = 1; 318 if (!found_delalloc)
319 *prealloc = 1;
435 return 0; 320 return 0;
436} 321}
437 322
438STATIC int 323/*
324 * If we don't have a user specified preallocation size, dynamically increase
325 * the preallocation size as the size of the file grows. Cap the maximum size
326 * at a single extent or less if the filesystem is near full. The closer the
 327 * filesystem is to full, the smaller the maximum preallocation.
328 */
329STATIC xfs_fsblock_t
330xfs_iomap_prealloc_size(
331 struct xfs_mount *mp,
332 struct xfs_inode *ip)
333{
334 xfs_fsblock_t alloc_blocks = 0;
335
336 if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)) {
337 int shift = 0;
338 int64_t freesp;
339
340 /*
341 * rounddown_pow_of_two() returns an undefined result
342 * if we pass in alloc_blocks = 0. Hence the "+ 1" to
343 * ensure we always pass in a non-zero value.
344 */
345 alloc_blocks = XFS_B_TO_FSB(mp, ip->i_size) + 1;
346 alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN,
347 rounddown_pow_of_two(alloc_blocks));
348
349 xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT);
350 freesp = mp->m_sb.sb_fdblocks;
351 if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) {
352 shift = 2;
353 if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT])
354 shift++;
355 if (freesp < mp->m_low_space[XFS_LOWSP_3_PCNT])
356 shift++;
357 if (freesp < mp->m_low_space[XFS_LOWSP_2_PCNT])
358 shift++;
359 if (freesp < mp->m_low_space[XFS_LOWSP_1_PCNT])
360 shift++;
361 }
362 if (shift)
363 alloc_blocks >>= shift;
364 }
365
366 if (alloc_blocks < mp->m_writeio_blocks)
367 alloc_blocks = mp->m_writeio_blocks;
368
369 return alloc_blocks;
370}
371
372int
439xfs_iomap_write_delay( 373xfs_iomap_write_delay(
440 xfs_inode_t *ip, 374 xfs_inode_t *ip,
441 xfs_off_t offset, 375 xfs_off_t offset,
442 size_t count, 376 size_t count,
443 int ioflag, 377 xfs_bmbt_irec_t *ret_imap)
444 xfs_bmbt_irec_t *ret_imap,
445 int *nmaps)
446{ 378{
447 xfs_mount_t *mp = ip->i_mount; 379 xfs_mount_t *mp = ip->i_mount;
448 xfs_fileoff_t offset_fsb; 380 xfs_fileoff_t offset_fsb;
@@ -469,16 +401,19 @@ xfs_iomap_write_delay(
469 extsz = xfs_get_extsz_hint(ip); 401 extsz = xfs_get_extsz_hint(ip);
470 offset_fsb = XFS_B_TO_FSBT(mp, offset); 402 offset_fsb = XFS_B_TO_FSBT(mp, offset);
471 403
404
472 error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count, 405 error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count,
473 ioflag, imap, XFS_WRITE_IMAPS, &prealloc); 406 imap, XFS_WRITE_IMAPS, &prealloc);
474 if (error) 407 if (error)
475 return error; 408 return error;
476 409
477retry: 410retry:
478 if (prealloc) { 411 if (prealloc) {
412 xfs_fsblock_t alloc_blocks = xfs_iomap_prealloc_size(mp, ip);
413
479 aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1)); 414 aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1));
480 ioalign = XFS_B_TO_FSBT(mp, aligned_offset); 415 ioalign = XFS_B_TO_FSBT(mp, aligned_offset);
481 last_fsb = ioalign + mp->m_writeio_blocks; 416 last_fsb = ioalign + alloc_blocks;
482 } else { 417 } else {
483 last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); 418 last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
484 } 419 }
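To put numbers on xfs_iomap_prealloc_size() (assuming 4 KiB filesystem blocks and XFS_MOUNT_DFLT_IOSIZE not set): for a 200 MiB file, alloc_blocks starts as 51200 + 1 = 51201 and rounddown_pow_of_two() reduces it to 32768 blocks (128 MiB), well under MAXEXTLEN. If sb_fdblocks has dropped below the 5% low-space threshold but not the 4% one, shift is 2 and the speculative preallocation shrinks to 8192 blocks (32 MiB); below all five thresholds the shift is 6, leaving 512 blocks (2 MiB). The result is never allowed below m_writeio_blocks.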
@@ -496,22 +431,31 @@ retry:
496 XFS_BMAPI_DELAY | XFS_BMAPI_WRITE | 431 XFS_BMAPI_DELAY | XFS_BMAPI_WRITE |
497 XFS_BMAPI_ENTIRE, &firstblock, 1, imap, 432 XFS_BMAPI_ENTIRE, &firstblock, 1, imap,
498 &nimaps, NULL); 433 &nimaps, NULL);
499 if (error && (error != ENOSPC)) 434 switch (error) {
435 case 0:
436 case ENOSPC:
437 case EDQUOT:
438 break;
439 default:
500 return XFS_ERROR(error); 440 return XFS_ERROR(error);
441 }
501 442
502 /* 443 /*
503 * If bmapi returned us nothing, and if we didn't get back EDQUOT, 444 * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. For
 504 * then we must have run out of space - flush all other inodes with 445 * ENOSPC, flush all other inodes with delalloc blocks to free up
505 * delalloc blocks and retry without EOF preallocation. 446 * some of the excess reserved metadata space. For both cases, retry
447 * without EOF preallocation.
506 */ 448 */
507 if (nimaps == 0) { 449 if (nimaps == 0) {
508 trace_xfs_delalloc_enospc(ip, offset, count); 450 trace_xfs_delalloc_enospc(ip, offset, count);
509 if (flushed) 451 if (flushed)
510 return XFS_ERROR(ENOSPC); 452 return XFS_ERROR(error ? error : ENOSPC);
511 453
512 xfs_iunlock(ip, XFS_ILOCK_EXCL); 454 if (error == ENOSPC) {
513 xfs_flush_inodes(ip); 455 xfs_iunlock(ip, XFS_ILOCK_EXCL);
514 xfs_ilock(ip, XFS_ILOCK_EXCL); 456 xfs_flush_inodes(ip);
457 xfs_ilock(ip, XFS_ILOCK_EXCL);
458 }
515 459
516 flushed = 1; 460 flushed = 1;
517 error = 0; 461 error = 0;
@@ -523,8 +467,6 @@ retry:
523 return xfs_cmn_err_fsblock_zero(ip, &imap[0]); 467 return xfs_cmn_err_fsblock_zero(ip, &imap[0]);
524 468
525 *ret_imap = imap[0]; 469 *ret_imap = imap[0];
526 *nmaps = 1;
527
528 return 0; 470 return 0;
529} 471}
530 472
@@ -538,13 +480,12 @@ retry:
538 * We no longer bother to look at the incoming map - all we have to 480 * We no longer bother to look at the incoming map - all we have to
539 * guarantee is that whatever we allocate fills the required range. 481 * guarantee is that whatever we allocate fills the required range.
540 */ 482 */
541STATIC int 483int
542xfs_iomap_write_allocate( 484xfs_iomap_write_allocate(
543 xfs_inode_t *ip, 485 xfs_inode_t *ip,
544 xfs_off_t offset, 486 xfs_off_t offset,
545 size_t count, 487 size_t count,
546 xfs_bmbt_irec_t *imap, 488 xfs_bmbt_irec_t *imap)
547 int *retmap)
548{ 489{
549 xfs_mount_t *mp = ip->i_mount; 490 xfs_mount_t *mp = ip->i_mount;
550 xfs_fileoff_t offset_fsb, last_block; 491 xfs_fileoff_t offset_fsb, last_block;
@@ -557,8 +498,6 @@ xfs_iomap_write_allocate(
557 int error = 0; 498 int error = 0;
558 int nres; 499 int nres;
559 500
560 *retmap = 0;
561
562 /* 501 /*
563 * Make sure that the dquots are there. 502 * Make sure that the dquots are there.
564 */ 503 */
@@ -680,7 +619,6 @@ xfs_iomap_write_allocate(
680 if ((offset_fsb >= imap->br_startoff) && 619 if ((offset_fsb >= imap->br_startoff) &&
681 (offset_fsb < (imap->br_startoff + 620 (offset_fsb < (imap->br_startoff +
682 imap->br_blockcount))) { 621 imap->br_blockcount))) {
683 *retmap = 1;
684 XFS_STATS_INC(xs_xstrat_quick); 622 XFS_STATS_INC(xs_xstrat_quick);
685 return 0; 623 return 0;
686 } 624 }
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 7748a430f50d..80615760959a 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -18,30 +18,15 @@
18#ifndef __XFS_IOMAP_H__ 18#ifndef __XFS_IOMAP_H__
19#define __XFS_IOMAP_H__ 19#define __XFS_IOMAP_H__
20 20
21/* base extent manipulation calls */
22#define BMAPI_READ (1 << 0) /* read extents */
23#define BMAPI_WRITE (1 << 1) /* create extents */
24#define BMAPI_ALLOCATE (1 << 2) /* delayed allocate to real extents */
25
26/* modifiers */
27#define BMAPI_IGNSTATE (1 << 4) /* ignore unwritten state on read */
28#define BMAPI_DIRECT (1 << 5) /* direct instead of buffered write */
29#define BMAPI_MMA (1 << 6) /* allocate for mmap write */
30#define BMAPI_TRYLOCK (1 << 7) /* non-blocking request */
31
32#define BMAPI_FLAGS \
33 { BMAPI_READ, "READ" }, \
34 { BMAPI_WRITE, "WRITE" }, \
35 { BMAPI_ALLOCATE, "ALLOCATE" }, \
36 { BMAPI_IGNSTATE, "IGNSTATE" }, \
37 { BMAPI_DIRECT, "DIRECT" }, \
38 { BMAPI_TRYLOCK, "TRYLOCK" }
39
40struct xfs_inode; 21struct xfs_inode;
41struct xfs_bmbt_irec; 22struct xfs_bmbt_irec;
42 23
43extern int xfs_iomap(struct xfs_inode *, xfs_off_t, ssize_t, int, 24extern int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
44 struct xfs_bmbt_irec *, int *, int *); 25 struct xfs_bmbt_irec *, int);
26extern int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t,
27 struct xfs_bmbt_irec *);
28extern int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t,
29 struct xfs_bmbt_irec *);
45extern int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t); 30extern int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t);
46 31
47#endif /* __XFS_IOMAP_H__*/ 32#endif /* __XFS_IOMAP_H__*/
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index cee4ab9f8a9e..ae6fef1ff563 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -47,7 +47,7 @@ STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp,
47 xfs_buftarg_t *log_target, 47 xfs_buftarg_t *log_target,
48 xfs_daddr_t blk_offset, 48 xfs_daddr_t blk_offset,
49 int num_bblks); 49 int num_bblks);
50STATIC int xlog_space_left(xlog_t *log, int cycle, int bytes); 50STATIC int xlog_space_left(struct log *log, atomic64_t *head);
51STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog); 51STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog);
52STATIC void xlog_dealloc_log(xlog_t *log); 52STATIC void xlog_dealloc_log(xlog_t *log);
53 53
@@ -70,7 +70,7 @@ STATIC void xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog);
70/* local functions to manipulate grant head */ 70/* local functions to manipulate grant head */
71STATIC int xlog_grant_log_space(xlog_t *log, 71STATIC int xlog_grant_log_space(xlog_t *log,
72 xlog_ticket_t *xtic); 72 xlog_ticket_t *xtic);
73STATIC void xlog_grant_push_ail(xfs_mount_t *mp, 73STATIC void xlog_grant_push_ail(struct log *log,
74 int need_bytes); 74 int need_bytes);
75STATIC void xlog_regrant_reserve_log_space(xlog_t *log, 75STATIC void xlog_regrant_reserve_log_space(xlog_t *log,
76 xlog_ticket_t *ticket); 76 xlog_ticket_t *ticket);
@@ -81,98 +81,73 @@ STATIC void xlog_ungrant_log_space(xlog_t *log,
81 81
82#if defined(DEBUG) 82#if defined(DEBUG)
83STATIC void xlog_verify_dest_ptr(xlog_t *log, char *ptr); 83STATIC void xlog_verify_dest_ptr(xlog_t *log, char *ptr);
84STATIC void xlog_verify_grant_head(xlog_t *log, int equals); 84STATIC void xlog_verify_grant_tail(struct log *log);
85STATIC void xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog, 85STATIC void xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog,
86 int count, boolean_t syncing); 86 int count, boolean_t syncing);
87STATIC void xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog, 87STATIC void xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog,
88 xfs_lsn_t tail_lsn); 88 xfs_lsn_t tail_lsn);
89#else 89#else
90#define xlog_verify_dest_ptr(a,b) 90#define xlog_verify_dest_ptr(a,b)
91#define xlog_verify_grant_head(a,b) 91#define xlog_verify_grant_tail(a)
92#define xlog_verify_iclog(a,b,c,d) 92#define xlog_verify_iclog(a,b,c,d)
93#define xlog_verify_tail_lsn(a,b,c) 93#define xlog_verify_tail_lsn(a,b,c)
94#endif 94#endif
95 95
96STATIC int xlog_iclogs_empty(xlog_t *log); 96STATIC int xlog_iclogs_empty(xlog_t *log);
97 97
98
99static void 98static void
100xlog_ins_ticketq(struct xlog_ticket **qp, struct xlog_ticket *tic) 99xlog_grant_sub_space(
100 struct log *log,
101 atomic64_t *head,
102 int bytes)
101{ 103{
102 if (*qp) { 104 int64_t head_val = atomic64_read(head);
103 tic->t_next = (*qp); 105 int64_t new, old;
104 tic->t_prev = (*qp)->t_prev;
105 (*qp)->t_prev->t_next = tic;
106 (*qp)->t_prev = tic;
107 } else {
108 tic->t_prev = tic->t_next = tic;
109 *qp = tic;
110 }
111 106
112 tic->t_flags |= XLOG_TIC_IN_Q; 107 do {
113} 108 int cycle, space;
114 109
115static void 110 xlog_crack_grant_head_val(head_val, &cycle, &space);
116xlog_del_ticketq(struct xlog_ticket **qp, struct xlog_ticket *tic)
117{
118 if (tic == tic->t_next) {
119 *qp = NULL;
120 } else {
121 *qp = tic->t_next;
122 tic->t_next->t_prev = tic->t_prev;
123 tic->t_prev->t_next = tic->t_next;
124 }
125 111
126 tic->t_next = tic->t_prev = NULL; 112 space -= bytes;
127 tic->t_flags &= ~XLOG_TIC_IN_Q; 113 if (space < 0) {
114 space += log->l_logsize;
115 cycle--;
116 }
117
118 old = head_val;
119 new = xlog_assign_grant_head_val(cycle, space);
120 head_val = atomic64_cmpxchg(head, old, new);
121 } while (head_val != old);
128} 122}
129 123
130static void 124static void
131xlog_grant_sub_space(struct log *log, int bytes) 125xlog_grant_add_space(
126 struct log *log,
127 atomic64_t *head,
128 int bytes)
132{ 129{
133 log->l_grant_write_bytes -= bytes; 130 int64_t head_val = atomic64_read(head);
134 if (log->l_grant_write_bytes < 0) { 131 int64_t new, old;
135 log->l_grant_write_bytes += log->l_logsize;
136 log->l_grant_write_cycle--;
137 }
138
139 log->l_grant_reserve_bytes -= bytes;
140 if ((log)->l_grant_reserve_bytes < 0) {
141 log->l_grant_reserve_bytes += log->l_logsize;
142 log->l_grant_reserve_cycle--;
143 }
144 132
145} 133 do {
134 int tmp;
135 int cycle, space;
146 136
147static void 137 xlog_crack_grant_head_val(head_val, &cycle, &space);
148xlog_grant_add_space_write(struct log *log, int bytes)
149{
150 int tmp = log->l_logsize - log->l_grant_write_bytes;
151 if (tmp > bytes)
152 log->l_grant_write_bytes += bytes;
153 else {
154 log->l_grant_write_cycle++;
155 log->l_grant_write_bytes = bytes - tmp;
156 }
157}
158 138
159static void 139 tmp = log->l_logsize - space;
160xlog_grant_add_space_reserve(struct log *log, int bytes) 140 if (tmp > bytes)
161{ 141 space += bytes;
162 int tmp = log->l_logsize - log->l_grant_reserve_bytes; 142 else {
163 if (tmp > bytes) 143 space = bytes - tmp;
164 log->l_grant_reserve_bytes += bytes; 144 cycle++;
165 else { 145 }
166 log->l_grant_reserve_cycle++;
167 log->l_grant_reserve_bytes = bytes - tmp;
168 }
169}
170 146
171static inline void 147 old = head_val;
172xlog_grant_add_space(struct log *log, int bytes) 148 new = xlog_assign_grant_head_val(cycle, space);
173{ 149 head_val = atomic64_cmpxchg(head, old, new);
174 xlog_grant_add_space_write(log, bytes); 150 } while (head_val != old);
175 xlog_grant_add_space_reserve(log, bytes);
176} 151}
177 152
178static void 153static void
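Each grant head is now a single atomic64_t packing the cycle and byte count, so xlog_grant_add_space()/xlog_grant_sub_space() can update both fields without l_grant_lock by looping on atomic64_cmpxchg() until no other CPU has raced with the update. A compact sketch of that idiom; the packing helpers below are illustrative stand-ins for the real xlog_crack/xlog_assign_grant_head helpers:

#include <linux/atomic.h>
#include <linux/types.h>

static inline void crack_head(int64_t val, int *cycle, int *space)
{
	*cycle = val >> 32;
	*space = val & 0xffffffff;
}

static inline int64_t make_head(int cycle, int space)
{
	return ((int64_t)cycle << 32) | space;
}

static void grant_add_space(atomic64_t *head, int bytes, int logsize)
{
	int64_t head_val = atomic64_read(head);
	int64_t old, new;

	do {
		int cycle, space;

		crack_head(head_val, &cycle, &space);
		space += bytes;
		if (space >= logsize) {		/* wrapped past the end of the log */
			space -= logsize;
			cycle++;
		}

		old = head_val;
		new = make_head(cycle, space);
		/* retry if another CPU changed the head since we read it */
		head_val = atomic64_cmpxchg(head, old, new);
	} while (head_val != old);
}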
@@ -355,7 +330,7 @@ xfs_log_reserve(
355 330
356 trace_xfs_log_reserve(log, internal_ticket); 331 trace_xfs_log_reserve(log, internal_ticket);
357 332
358 xlog_grant_push_ail(mp, internal_ticket->t_unit_res); 333 xlog_grant_push_ail(log, internal_ticket->t_unit_res);
359 retval = xlog_regrant_write_log_space(log, internal_ticket); 334 retval = xlog_regrant_write_log_space(log, internal_ticket);
360 } else { 335 } else {
361 /* may sleep if need to allocate more tickets */ 336 /* may sleep if need to allocate more tickets */
@@ -369,7 +344,7 @@ xfs_log_reserve(
369 344
370 trace_xfs_log_reserve(log, internal_ticket); 345 trace_xfs_log_reserve(log, internal_ticket);
371 346
372 xlog_grant_push_ail(mp, 347 xlog_grant_push_ail(log,
373 (internal_ticket->t_unit_res * 348 (internal_ticket->t_unit_res *
374 internal_ticket->t_cnt)); 349 internal_ticket->t_cnt));
375 retval = xlog_grant_log_space(log, internal_ticket); 350 retval = xlog_grant_log_space(log, internal_ticket);
@@ -402,7 +377,7 @@ xfs_log_mount(
402 cmn_err(CE_NOTE, "XFS mounting filesystem %s", mp->m_fsname); 377 cmn_err(CE_NOTE, "XFS mounting filesystem %s", mp->m_fsname);
403 else { 378 else {
404 cmn_err(CE_NOTE, 379 cmn_err(CE_NOTE,
405 "!Mounting filesystem \"%s\" in no-recovery mode. Filesystem will be inconsistent.", 380 "Mounting filesystem \"%s\" in no-recovery mode. Filesystem will be inconsistent.",
406 mp->m_fsname); 381 mp->m_fsname);
407 ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); 382 ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
408 } 383 }
@@ -584,8 +559,8 @@ xfs_log_unmount_write(xfs_mount_t *mp)
584 if (!(iclog->ic_state == XLOG_STATE_ACTIVE || 559 if (!(iclog->ic_state == XLOG_STATE_ACTIVE ||
585 iclog->ic_state == XLOG_STATE_DIRTY)) { 560 iclog->ic_state == XLOG_STATE_DIRTY)) {
586 if (!XLOG_FORCED_SHUTDOWN(log)) { 561 if (!XLOG_FORCED_SHUTDOWN(log)) {
587 sv_wait(&iclog->ic_force_wait, PMEM, 562 xlog_wait(&iclog->ic_force_wait,
588 &log->l_icloglock, s); 563 &log->l_icloglock);
589 } else { 564 } else {
590 spin_unlock(&log->l_icloglock); 565 spin_unlock(&log->l_icloglock);
591 } 566 }
@@ -625,8 +600,8 @@ xfs_log_unmount_write(xfs_mount_t *mp)
625 || iclog->ic_state == XLOG_STATE_DIRTY 600 || iclog->ic_state == XLOG_STATE_DIRTY
626 || iclog->ic_state == XLOG_STATE_IOERROR) ) { 601 || iclog->ic_state == XLOG_STATE_IOERROR) ) {
627 602
628 sv_wait(&iclog->ic_force_wait, PMEM, 603 xlog_wait(&iclog->ic_force_wait,
629 &log->l_icloglock, s); 604 &log->l_icloglock);
630 } else { 605 } else {
631 spin_unlock(&log->l_icloglock); 606 spin_unlock(&log->l_icloglock);
632 } 607 }
@@ -703,55 +678,46 @@ xfs_log_move_tail(xfs_mount_t *mp,
703{ 678{
704 xlog_ticket_t *tic; 679 xlog_ticket_t *tic;
705 xlog_t *log = mp->m_log; 680 xlog_t *log = mp->m_log;
706 int need_bytes, free_bytes, cycle, bytes; 681 int need_bytes, free_bytes;
707 682
708 if (XLOG_FORCED_SHUTDOWN(log)) 683 if (XLOG_FORCED_SHUTDOWN(log))
709 return; 684 return;
710 685
711 if (tail_lsn == 0) { 686 if (tail_lsn == 0)
712 /* needed since sync_lsn is 64 bits */ 687 tail_lsn = atomic64_read(&log->l_last_sync_lsn);
713 spin_lock(&log->l_icloglock);
714 tail_lsn = log->l_last_sync_lsn;
715 spin_unlock(&log->l_icloglock);
716 }
717
718 spin_lock(&log->l_grant_lock);
719 688
720 /* Also an invalid lsn. 1 implies that we aren't passing in a valid 689 /* tail_lsn == 1 implies that we weren't passed a valid value. */
721 * tail_lsn. 690 if (tail_lsn != 1)
722 */ 691 atomic64_set(&log->l_tail_lsn, tail_lsn);
723 if (tail_lsn != 1) {
724 log->l_tail_lsn = tail_lsn;
725 }
726 692
727 if ((tic = log->l_write_headq)) { 693 if (!list_empty_careful(&log->l_writeq)) {
728#ifdef DEBUG 694#ifdef DEBUG
729 if (log->l_flags & XLOG_ACTIVE_RECOVERY) 695 if (log->l_flags & XLOG_ACTIVE_RECOVERY)
730 panic("Recovery problem"); 696 panic("Recovery problem");
731#endif 697#endif
732 cycle = log->l_grant_write_cycle; 698 spin_lock(&log->l_grant_write_lock);
733 bytes = log->l_grant_write_bytes; 699 free_bytes = xlog_space_left(log, &log->l_grant_write_head);
734 free_bytes = xlog_space_left(log, cycle, bytes); 700 list_for_each_entry(tic, &log->l_writeq, t_queue) {
735 do {
736 ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV); 701 ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV);
737 702
738 if (free_bytes < tic->t_unit_res && tail_lsn != 1) 703 if (free_bytes < tic->t_unit_res && tail_lsn != 1)
739 break; 704 break;
740 tail_lsn = 0; 705 tail_lsn = 0;
741 free_bytes -= tic->t_unit_res; 706 free_bytes -= tic->t_unit_res;
742 sv_signal(&tic->t_wait); 707 trace_xfs_log_regrant_write_wake_up(log, tic);
743 tic = tic->t_next; 708 wake_up(&tic->t_wait);
744 } while (tic != log->l_write_headq); 709 }
710 spin_unlock(&log->l_grant_write_lock);
745 } 711 }
746 if ((tic = log->l_reserve_headq)) { 712
713 if (!list_empty_careful(&log->l_reserveq)) {
747#ifdef DEBUG 714#ifdef DEBUG
748 if (log->l_flags & XLOG_ACTIVE_RECOVERY) 715 if (log->l_flags & XLOG_ACTIVE_RECOVERY)
749 panic("Recovery problem"); 716 panic("Recovery problem");
750#endif 717#endif
751 cycle = log->l_grant_reserve_cycle; 718 spin_lock(&log->l_grant_reserve_lock);
752 bytes = log->l_grant_reserve_bytes; 719 free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
753 free_bytes = xlog_space_left(log, cycle, bytes); 720 list_for_each_entry(tic, &log->l_reserveq, t_queue) {
754 do {
755 if (tic->t_flags & XLOG_TIC_PERM_RESERV) 721 if (tic->t_flags & XLOG_TIC_PERM_RESERV)
756 need_bytes = tic->t_unit_res*tic->t_cnt; 722 need_bytes = tic->t_unit_res*tic->t_cnt;
757 else 723 else
@@ -760,12 +726,12 @@ xfs_log_move_tail(xfs_mount_t *mp,
760 break; 726 break;
761 tail_lsn = 0; 727 tail_lsn = 0;
762 free_bytes -= need_bytes; 728 free_bytes -= need_bytes;
763 sv_signal(&tic->t_wait); 729 trace_xfs_log_grant_wake_up(log, tic);
764 tic = tic->t_next; 730 wake_up(&tic->t_wait);
765 } while (tic != log->l_reserve_headq); 731 }
732 spin_unlock(&log->l_grant_reserve_lock);
766 } 733 }
767 spin_unlock(&log->l_grant_lock); 734}
768} /* xfs_log_move_tail */
769 735
770/* 736/*
771 * Determine if we have a transaction that has gone to disk 737 * Determine if we have a transaction that has gone to disk
@@ -831,23 +797,19 @@ xfs_log_need_covered(xfs_mount_t *mp)
831 * We may be holding the log iclog lock upon entering this routine. 797 * We may be holding the log iclog lock upon entering this routine.
832 */ 798 */
833xfs_lsn_t 799xfs_lsn_t
834xlog_assign_tail_lsn(xfs_mount_t *mp) 800xlog_assign_tail_lsn(
801 struct xfs_mount *mp)
835{ 802{
836 xfs_lsn_t tail_lsn; 803 xfs_lsn_t tail_lsn;
837 xlog_t *log = mp->m_log; 804 struct log *log = mp->m_log;
838 805
839 tail_lsn = xfs_trans_ail_tail(mp->m_ail); 806 tail_lsn = xfs_trans_ail_tail(mp->m_ail);
840 spin_lock(&log->l_grant_lock); 807 if (!tail_lsn)
841 if (tail_lsn != 0) { 808 tail_lsn = atomic64_read(&log->l_last_sync_lsn);
842 log->l_tail_lsn = tail_lsn;
843 } else {
844 tail_lsn = log->l_tail_lsn = log->l_last_sync_lsn;
845 }
846 spin_unlock(&log->l_grant_lock);
847 809
810 atomic64_set(&log->l_tail_lsn, tail_lsn);
848 return tail_lsn; 811 return tail_lsn;
849} /* xlog_assign_tail_lsn */ 812}
850
851 813
852/* 814/*
853 * Return the space in the log between the tail and the head. The head 815 * Return the space in the log between the tail and the head. The head
@@ -864,21 +826,26 @@ xlog_assign_tail_lsn(xfs_mount_t *mp)
864 * result is that we return the size of the log as the amount of space left. 826 * result is that we return the size of the log as the amount of space left.
865 */ 827 */
866STATIC int 828STATIC int
867xlog_space_left(xlog_t *log, int cycle, int bytes) 829xlog_space_left(
868{ 830 struct log *log,
869 int free_bytes; 831 atomic64_t *head)
870 int tail_bytes; 832{
871 int tail_cycle; 833 int free_bytes;
872 834 int tail_bytes;
873 tail_bytes = BBTOB(BLOCK_LSN(log->l_tail_lsn)); 835 int tail_cycle;
874 tail_cycle = CYCLE_LSN(log->l_tail_lsn); 836 int head_cycle;
875 if ((tail_cycle == cycle) && (bytes >= tail_bytes)) { 837 int head_bytes;
876 free_bytes = log->l_logsize - (bytes - tail_bytes); 838
877 } else if ((tail_cycle + 1) < cycle) { 839 xlog_crack_grant_head(head, &head_cycle, &head_bytes);
840 xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_bytes);
841 tail_bytes = BBTOB(tail_bytes);
842 if (tail_cycle == head_cycle && head_bytes >= tail_bytes)
843 free_bytes = log->l_logsize - (head_bytes - tail_bytes);
844 else if (tail_cycle + 1 < head_cycle)
878 return 0; 845 return 0;
879 } else if (tail_cycle < cycle) { 846 else if (tail_cycle < head_cycle) {
880 ASSERT(tail_cycle == (cycle - 1)); 847 ASSERT(tail_cycle == (head_cycle - 1));
881 free_bytes = tail_bytes - bytes; 848 free_bytes = tail_bytes - head_bytes;
882 } else { 849 } else {
883 /* 850 /*
884 * The reservation head is behind the tail. 851 * The reservation head is behind the tail.
@@ -889,12 +856,12 @@ xlog_space_left(xlog_t *log, int cycle, int bytes)
889 "xlog_space_left: head behind tail\n" 856 "xlog_space_left: head behind tail\n"
890 " tail_cycle = %d, tail_bytes = %d\n" 857 " tail_cycle = %d, tail_bytes = %d\n"
891 " GH cycle = %d, GH bytes = %d", 858 " GH cycle = %d, GH bytes = %d",
892 tail_cycle, tail_bytes, cycle, bytes); 859 tail_cycle, tail_bytes, head_cycle, head_bytes);
893 ASSERT(0); 860 ASSERT(0);
894 free_bytes = log->l_logsize; 861 free_bytes = log->l_logsize;
895 } 862 }
896 return free_bytes; 863 return free_bytes;
897} /* xlog_space_left */ 864}
898 865
899 866
900/* 867/*
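To put numbers on xlog_space_left(): with a 10 MiB log, a tail whose block offset converts to 2 MiB at cycle 7 and a grant head at cycle 7, 8 MiB, the free space is 10 MiB - (8 MiB - 2 MiB) = 4 MiB; once the head wraps to cycle 8 at 1 MiB it becomes 2 MiB - 1 MiB = 1 MiB. A head more than one full cycle ahead of the tail yields 0, and a head behind the tail trips the ASSERT and reports the whole log size as free.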
@@ -1047,12 +1014,16 @@ xlog_alloc_log(xfs_mount_t *mp,
1047 log->l_flags |= XLOG_ACTIVE_RECOVERY; 1014 log->l_flags |= XLOG_ACTIVE_RECOVERY;
1048 1015
1049 log->l_prev_block = -1; 1016 log->l_prev_block = -1;
1050 log->l_tail_lsn = xlog_assign_lsn(1, 0);
1051 /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */ 1017 /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */
1052 log->l_last_sync_lsn = log->l_tail_lsn; 1018 xlog_assign_atomic_lsn(&log->l_tail_lsn, 1, 0);
1019 xlog_assign_atomic_lsn(&log->l_last_sync_lsn, 1, 0);
1053 log->l_curr_cycle = 1; /* 0 is bad since this is initial value */ 1020 log->l_curr_cycle = 1; /* 0 is bad since this is initial value */
1054 log->l_grant_reserve_cycle = 1; 1021 xlog_assign_grant_head(&log->l_grant_reserve_head, 1, 0);
1055 log->l_grant_write_cycle = 1; 1022 xlog_assign_grant_head(&log->l_grant_write_head, 1, 0);
1023 INIT_LIST_HEAD(&log->l_reserveq);
1024 INIT_LIST_HEAD(&log->l_writeq);
1025 spin_lock_init(&log->l_grant_reserve_lock);
1026 spin_lock_init(&log->l_grant_write_lock);
1056 1027
1057 error = EFSCORRUPTED; 1028 error = EFSCORRUPTED;
1058 if (xfs_sb_version_hassector(&mp->m_sb)) { 1029 if (xfs_sb_version_hassector(&mp->m_sb)) {
@@ -1094,8 +1065,7 @@ xlog_alloc_log(xfs_mount_t *mp,
1094 log->l_xbuf = bp; 1065 log->l_xbuf = bp;
1095 1066
1096 spin_lock_init(&log->l_icloglock); 1067 spin_lock_init(&log->l_icloglock);
1097 spin_lock_init(&log->l_grant_lock); 1068 init_waitqueue_head(&log->l_flush_wait);
1098 sv_init(&log->l_flush_wait, 0, "flush_wait");
1099 1069
1100 /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */ 1070 /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */
1101 ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0); 1071 ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0);
@@ -1151,8 +1121,8 @@ xlog_alloc_log(xfs_mount_t *mp,
1151 1121
1152 ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp)); 1122 ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp));
1153 ASSERT(XFS_BUF_VALUSEMA(iclog->ic_bp) <= 0); 1123 ASSERT(XFS_BUF_VALUSEMA(iclog->ic_bp) <= 0);
1154 sv_init(&iclog->ic_force_wait, SV_DEFAULT, "iclog-force"); 1124 init_waitqueue_head(&iclog->ic_force_wait);
1155 sv_init(&iclog->ic_write_wait, SV_DEFAULT, "iclog-write"); 1125 init_waitqueue_head(&iclog->ic_write_wait);
1156 1126
1157 iclogp = &iclog->ic_next; 1127 iclogp = &iclog->ic_next;
1158 } 1128 }
@@ -1167,15 +1137,11 @@ xlog_alloc_log(xfs_mount_t *mp,
1167out_free_iclog: 1137out_free_iclog:
1168 for (iclog = log->l_iclog; iclog; iclog = prev_iclog) { 1138 for (iclog = log->l_iclog; iclog; iclog = prev_iclog) {
1169 prev_iclog = iclog->ic_next; 1139 prev_iclog = iclog->ic_next;
1170 if (iclog->ic_bp) { 1140 if (iclog->ic_bp)
1171 sv_destroy(&iclog->ic_force_wait);
1172 sv_destroy(&iclog->ic_write_wait);
1173 xfs_buf_free(iclog->ic_bp); 1141 xfs_buf_free(iclog->ic_bp);
1174 }
1175 kmem_free(iclog); 1142 kmem_free(iclog);
1176 } 1143 }
1177 spinlock_destroy(&log->l_icloglock); 1144 spinlock_destroy(&log->l_icloglock);
1178 spinlock_destroy(&log->l_grant_lock);
1179 xfs_buf_free(log->l_xbuf); 1145 xfs_buf_free(log->l_xbuf);
1180out_free_log: 1146out_free_log:
1181 kmem_free(log); 1147 kmem_free(log);
@@ -1223,61 +1189,60 @@ xlog_commit_record(
1223 * water mark. In this manner, we would be creating a low water mark. 1189 * water mark. In this manner, we would be creating a low water mark.
1224 */ 1190 */
1225STATIC void 1191STATIC void
1226xlog_grant_push_ail(xfs_mount_t *mp, 1192xlog_grant_push_ail(
1227 int need_bytes) 1193 struct log *log,
1194 int need_bytes)
1228{ 1195{
1229 xlog_t *log = mp->m_log; /* pointer to the log */ 1196 xfs_lsn_t threshold_lsn = 0;
1230 xfs_lsn_t tail_lsn; /* lsn of the log tail */ 1197 xfs_lsn_t last_sync_lsn;
1231 xfs_lsn_t threshold_lsn = 0; /* lsn we'd like to be at */ 1198 int free_blocks;
1232 int free_blocks; /* free blocks left to write to */ 1199 int free_bytes;
1233 int free_bytes; /* free bytes left to write to */ 1200 int threshold_block;
1234 int threshold_block; /* block in lsn we'd like to be at */ 1201 int threshold_cycle;
1235 int threshold_cycle; /* lsn cycle we'd like to be at */ 1202 int free_threshold;
1236 int free_threshold; 1203
1237 1204 ASSERT(BTOBB(need_bytes) < log->l_logBBsize);
1238 ASSERT(BTOBB(need_bytes) < log->l_logBBsize); 1205
1239 1206 free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
1240 spin_lock(&log->l_grant_lock); 1207 free_blocks = BTOBBT(free_bytes);
1241 free_bytes = xlog_space_left(log, 1208
1242 log->l_grant_reserve_cycle, 1209 /*
1243 log->l_grant_reserve_bytes); 1210 * Set the threshold for the minimum number of free blocks in the
1244 tail_lsn = log->l_tail_lsn; 1211 * log to the maximum of what the caller needs, one quarter of the
1245 free_blocks = BTOBBT(free_bytes); 1212 * log, and 256 blocks.
1246 1213 */
1247 /* 1214 free_threshold = BTOBB(need_bytes);
1248 * Set the threshold for the minimum number of free blocks in the 1215 free_threshold = MAX(free_threshold, (log->l_logBBsize >> 2));
1249 * log to the maximum of what the caller needs, one quarter of the 1216 free_threshold = MAX(free_threshold, 256);
1250 * log, and 256 blocks. 1217 if (free_blocks >= free_threshold)
1251 */ 1218 return;
1252 free_threshold = BTOBB(need_bytes); 1219
1253 free_threshold = MAX(free_threshold, (log->l_logBBsize >> 2)); 1220 xlog_crack_atomic_lsn(&log->l_tail_lsn, &threshold_cycle,
1254 free_threshold = MAX(free_threshold, 256); 1221 &threshold_block);
1255 if (free_blocks < free_threshold) { 1222 threshold_block += free_threshold;
1256 threshold_block = BLOCK_LSN(tail_lsn) + free_threshold;
1257 threshold_cycle = CYCLE_LSN(tail_lsn);
1258 if (threshold_block >= log->l_logBBsize) { 1223 if (threshold_block >= log->l_logBBsize) {
1259 threshold_block -= log->l_logBBsize; 1224 threshold_block -= log->l_logBBsize;
1260 threshold_cycle += 1; 1225 threshold_cycle += 1;
1261 } 1226 }
1262 threshold_lsn = xlog_assign_lsn(threshold_cycle, threshold_block); 1227 threshold_lsn = xlog_assign_lsn(threshold_cycle,
1228 threshold_block);
1229 /*
1230 * Don't pass in an lsn greater than the lsn of the last
1231 * log record known to be on disk. Use a snapshot of the last sync lsn
1232 * so that it doesn't change between the compare and the set.
1233 */
1234 last_sync_lsn = atomic64_read(&log->l_last_sync_lsn);
1235 if (XFS_LSN_CMP(threshold_lsn, last_sync_lsn) > 0)
1236 threshold_lsn = last_sync_lsn;
1263 1237
1264 /* Don't pass in an lsn greater than the lsn of the last 1238 /*
1265 * log record known to be on disk. 1239 * Get the transaction layer to kick the dirty buffers out to
1240 * disk asynchronously. No point in trying to do this if
1241 * the filesystem is shutting down.
1266 */ 1242 */
1267 if (XFS_LSN_CMP(threshold_lsn, log->l_last_sync_lsn) > 0) 1243 if (!XLOG_FORCED_SHUTDOWN(log))
1268 threshold_lsn = log->l_last_sync_lsn; 1244 xfs_trans_ail_push(log->l_ailp, threshold_lsn);
1269 } 1245}
1270 spin_unlock(&log->l_grant_lock);
1271
1272 /*
1273 * Get the transaction layer to kick the dirty buffers out to
1274 * disk asynchronously. No point in trying to do this if
1275 * the filesystem is shutting down.
1276 */
1277 if (threshold_lsn &&
1278 !XLOG_FORCED_SHUTDOWN(log))
1279 xfs_trans_ail_push(log->l_ailp, threshold_lsn);
1280} /* xlog_grant_push_ail */
1281 1246
1282/* 1247/*
1283 * The bdstrat callback function for log bufs. This gives us a central 1248 * The bdstrat callback function for log bufs. This gives us a central
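The rewritten xlog_grant_push_ail() above no longer takes the grant lock; it samples the atomic tail LSN, adds the free-space target in blocks, and wraps the block number into the next cycle when it runs past the physical end of the log. A minimal standalone sketch of that wrap-around arithmetic follows (plain C; the function name and demo values are assumptions for illustration, not the kernel code):

#include <stdint.h>
#include <stdio.h>

/*
 * Illustrative model of the threshold calculation in xlog_grant_push_ail():
 * start from the tail (cycle, block), add the free-space target, and wrap
 * into the next cycle if the block number runs off the end of the log.
 */
static uint64_t
threshold_lsn(unsigned tail_cycle, unsigned tail_block,
	      unsigned log_blocks, unsigned free_threshold)
{
	unsigned cycle = tail_cycle;
	unsigned block = tail_block + free_threshold;

	if (block >= log_blocks) {	/* wrapped past the physical end */
		block -= log_blocks;
		cycle += 1;
	}
	return ((uint64_t)cycle << 32) | block;	/* same packing as an LSN */
}

int main(void)
{
	/* tail at cycle 7, block 900 of a 1024-block log, want 256 blocks free */
	printf("0x%llx\n",
	       (unsigned long long)threshold_lsn(7, 900, 1024, 256));
	return 0;
}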
@@ -1372,9 +1337,8 @@ xlog_sync(xlog_t *log,
1372 roundoff < BBTOB(1))); 1337 roundoff < BBTOB(1)));
1373 1338
1374 /* move grant heads by roundoff in sync */ 1339 /* move grant heads by roundoff in sync */
1375 spin_lock(&log->l_grant_lock); 1340 xlog_grant_add_space(log, &log->l_grant_reserve_head, roundoff);
1376 xlog_grant_add_space(log, roundoff); 1341 xlog_grant_add_space(log, &log->l_grant_write_head, roundoff);
1377 spin_unlock(&log->l_grant_lock);
1378 1342
1379 /* put cycle number in every block */ 1343 /* put cycle number in every block */
1380 xlog_pack_data(log, iclog, roundoff); 1344 xlog_pack_data(log, iclog, roundoff);
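This hunk moves the grant heads by roundoff without the old l_grant_lock, passing each atomic64 head to xlog_grant_add_space() directly. The body of xlog_grant_add_space() is not shown in this part of the diff; one lock-free way to advance a packed (cycle, space) head is a compare-and-swap loop, and the userspace model below is only a sketch of that pattern under that assumption, not the patch's implementation:

#include <stdatomic.h>
#include <stdint.h>

/*
 * Hypothetical userspace model of advancing a packed (cycle, space) grant
 * head without a lock: read the current value, compute the new one with
 * wrap-around, and retry if another thread raced in between.
 */
static void
grant_add_space(_Atomic int64_t *head, int bytes, int log_size_bytes)
{
	int64_t old, newval;

	do {
		old = atomic_load(head);
		int cycle = (int)(old >> 32);
		int space = (int)(old & 0xffffffff) + bytes;

		if (space > log_size_bytes) {	/* wrapped: move to next cycle */
			space -= log_size_bytes;
			cycle++;
		}
		newval = ((int64_t)cycle << 32) | space;
	} while (!atomic_compare_exchange_weak(head, &old, newval));
}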
@@ -1489,15 +1453,12 @@ xlog_dealloc_log(xlog_t *log)
1489 1453
1490 iclog = log->l_iclog; 1454 iclog = log->l_iclog;
1491 for (i=0; i<log->l_iclog_bufs; i++) { 1455 for (i=0; i<log->l_iclog_bufs; i++) {
1492 sv_destroy(&iclog->ic_force_wait);
1493 sv_destroy(&iclog->ic_write_wait);
1494 xfs_buf_free(iclog->ic_bp); 1456 xfs_buf_free(iclog->ic_bp);
1495 next_iclog = iclog->ic_next; 1457 next_iclog = iclog->ic_next;
1496 kmem_free(iclog); 1458 kmem_free(iclog);
1497 iclog = next_iclog; 1459 iclog = next_iclog;
1498 } 1460 }
1499 spinlock_destroy(&log->l_icloglock); 1461 spinlock_destroy(&log->l_icloglock);
1500 spinlock_destroy(&log->l_grant_lock);
1501 1462
1502 xfs_buf_free(log->l_xbuf); 1463 xfs_buf_free(log->l_xbuf);
1503 log->l_mp->m_log = NULL; 1464 log->l_mp->m_log = NULL;
@@ -2232,7 +2193,7 @@ xlog_state_do_callback(
2232 lowest_lsn = xlog_get_lowest_lsn(log); 2193 lowest_lsn = xlog_get_lowest_lsn(log);
2233 if (lowest_lsn && 2194 if (lowest_lsn &&
2234 XFS_LSN_CMP(lowest_lsn, 2195 XFS_LSN_CMP(lowest_lsn,
2235 be64_to_cpu(iclog->ic_header.h_lsn)) < 0) { 2196 be64_to_cpu(iclog->ic_header.h_lsn)) < 0) {
2236 iclog = iclog->ic_next; 2197 iclog = iclog->ic_next;
2237 continue; /* Leave this iclog for 2198 continue; /* Leave this iclog for
2238 * another thread */ 2199 * another thread */
@@ -2240,23 +2201,21 @@ xlog_state_do_callback(
2240 2201
2241 iclog->ic_state = XLOG_STATE_CALLBACK; 2202 iclog->ic_state = XLOG_STATE_CALLBACK;
2242 2203
2243 spin_unlock(&log->l_icloglock);
2244 2204
2245 /* l_last_sync_lsn field protected by 2205 /*
2246 * l_grant_lock. Don't worry about iclog's lsn. 2206 * update the last_sync_lsn before we drop the
2247 * No one else can be here except us. 2207 * icloglock to ensure we are the only one that
2208 * can update it.
2248 */ 2209 */
2249 spin_lock(&log->l_grant_lock); 2210 ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn),
2250 ASSERT(XFS_LSN_CMP(log->l_last_sync_lsn, 2211 be64_to_cpu(iclog->ic_header.h_lsn)) <= 0);
2251 be64_to_cpu(iclog->ic_header.h_lsn)) <= 0); 2212 atomic64_set(&log->l_last_sync_lsn,
2252 log->l_last_sync_lsn = 2213 be64_to_cpu(iclog->ic_header.h_lsn));
2253 be64_to_cpu(iclog->ic_header.h_lsn);
2254 spin_unlock(&log->l_grant_lock);
2255 2214
2256 } else { 2215 } else
2257 spin_unlock(&log->l_icloglock);
2258 ioerrors++; 2216 ioerrors++;
2259 } 2217
2218 spin_unlock(&log->l_icloglock);
2260 2219
2261 /* 2220 /*
2262 * Keep processing entries in the callback list until 2221 * Keep processing entries in the callback list until
@@ -2297,7 +2256,7 @@ xlog_state_do_callback(
2297 xlog_state_clean_log(log); 2256 xlog_state_clean_log(log);
2298 2257
2299 /* wake up threads waiting in xfs_log_force() */ 2258 /* wake up threads waiting in xfs_log_force() */
2300 sv_broadcast(&iclog->ic_force_wait); 2259 wake_up_all(&iclog->ic_force_wait);
2301 2260
2302 iclog = iclog->ic_next; 2261 iclog = iclog->ic_next;
2303 } while (first_iclog != iclog); 2262 } while (first_iclog != iclog);
@@ -2344,7 +2303,7 @@ xlog_state_do_callback(
2344 spin_unlock(&log->l_icloglock); 2303 spin_unlock(&log->l_icloglock);
2345 2304
2346 if (wake) 2305 if (wake)
2347 sv_broadcast(&log->l_flush_wait); 2306 wake_up_all(&log->l_flush_wait);
2348} 2307}
2349 2308
2350 2309
@@ -2395,7 +2354,7 @@ xlog_state_done_syncing(
2395 * iclog buffer, we wake them all, one will get to do the 2354 * iclog buffer, we wake them all, one will get to do the
2396 * I/O, the others get to wait for the result. 2355 * I/O, the others get to wait for the result.
2397 */ 2356 */
2398 sv_broadcast(&iclog->ic_write_wait); 2357 wake_up_all(&iclog->ic_write_wait);
2399 spin_unlock(&log->l_icloglock); 2358 spin_unlock(&log->l_icloglock);
2400 xlog_state_do_callback(log, aborted, iclog); /* also cleans log */ 2359 xlog_state_do_callback(log, aborted, iclog); /* also cleans log */
2401} /* xlog_state_done_syncing */ 2360} /* xlog_state_done_syncing */
@@ -2444,7 +2403,7 @@ restart:
2444 XFS_STATS_INC(xs_log_noiclogs); 2403 XFS_STATS_INC(xs_log_noiclogs);
2445 2404
2446 /* Wait for log writes to have flushed */ 2405 /* Wait for log writes to have flushed */
2447 sv_wait(&log->l_flush_wait, 0, &log->l_icloglock, 0); 2406 xlog_wait(&log->l_flush_wait, &log->l_icloglock);
2448 goto restart; 2407 goto restart;
2449 } 2408 }
2450 2409
@@ -2527,6 +2486,18 @@ restart:
2527 * 2486 *
2528 * Once a ticket gets put onto the reserveq, it will only return after 2487 * Once a ticket gets put onto the reserveq, it will only return after
2529 * the needed reservation is satisfied. 2488 * the needed reservation is satisfied.
2489 *
2490 * This function is structured so that it has a lock free fast path. This is
2491 * necessary because every new transaction reservation will come through this
2492 * path. Hence any lock will be globally hot if we take it unconditionally on
2493 * every pass.
2494 *
2495 * As tickets are only ever moved on and off the reserveq under the
2496 * l_grant_reserve_lock, we only need to take that lock if we are going
2497 * to add the ticket to the queue and sleep. We can avoid taking the lock if the
2498 * ticket was never added to the reserveq because the t_queue list head will be
2499 * empty and we hold the only reference to it so it can safely be checked
2500 * unlocked.
2530 */ 2501 */
2531STATIC int 2502STATIC int
2532xlog_grant_log_space(xlog_t *log, 2503xlog_grant_log_space(xlog_t *log,
@@ -2534,24 +2505,27 @@ xlog_grant_log_space(xlog_t *log,
2534{ 2505{
2535 int free_bytes; 2506 int free_bytes;
2536 int need_bytes; 2507 int need_bytes;
2537#ifdef DEBUG
2538 xfs_lsn_t tail_lsn;
2539#endif
2540
2541 2508
2542#ifdef DEBUG 2509#ifdef DEBUG
2543 if (log->l_flags & XLOG_ACTIVE_RECOVERY) 2510 if (log->l_flags & XLOG_ACTIVE_RECOVERY)
2544 panic("grant Recovery problem"); 2511 panic("grant Recovery problem");
2545#endif 2512#endif
2546 2513
2547 /* Is there space or do we need to sleep? */
2548 spin_lock(&log->l_grant_lock);
2549
2550 trace_xfs_log_grant_enter(log, tic); 2514 trace_xfs_log_grant_enter(log, tic);
2551 2515
2516 need_bytes = tic->t_unit_res;
2517 if (tic->t_flags & XFS_LOG_PERM_RESERV)
2518 need_bytes *= tic->t_ocnt;
2519
2552 /* something is already sleeping; insert new transaction at end */ 2520 /* something is already sleeping; insert new transaction at end */
2553 if (log->l_reserve_headq) { 2521 if (!list_empty_careful(&log->l_reserveq)) {
2554 xlog_ins_ticketq(&log->l_reserve_headq, tic); 2522 spin_lock(&log->l_grant_reserve_lock);
2523 /* recheck the queue now we are locked */
2524 if (list_empty(&log->l_reserveq)) {
2525 spin_unlock(&log->l_grant_reserve_lock);
2526 goto redo;
2527 }
2528 list_add_tail(&tic->t_queue, &log->l_reserveq);
2555 2529
2556 trace_xfs_log_grant_sleep1(log, tic); 2530 trace_xfs_log_grant_sleep1(log, tic);
2557 2531
@@ -2563,72 +2537,57 @@ xlog_grant_log_space(xlog_t *log,
2563 goto error_return; 2537 goto error_return;
2564 2538
2565 XFS_STATS_INC(xs_sleep_logspace); 2539 XFS_STATS_INC(xs_sleep_logspace);
2566 sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s); 2540 xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock);
2541
2567 /* 2542 /*
2568 * If we got an error, and the filesystem is shutting down, 2543 * If we got an error, and the filesystem is shutting down,
2569 * we'll catch it down below. So just continue... 2544 * we'll catch it down below. So just continue...
2570 */ 2545 */
2571 trace_xfs_log_grant_wake1(log, tic); 2546 trace_xfs_log_grant_wake1(log, tic);
2572 spin_lock(&log->l_grant_lock);
2573 } 2547 }
2574 if (tic->t_flags & XFS_LOG_PERM_RESERV)
2575 need_bytes = tic->t_unit_res*tic->t_ocnt;
2576 else
2577 need_bytes = tic->t_unit_res;
2578 2548
2579redo: 2549redo:
2580 if (XLOG_FORCED_SHUTDOWN(log)) 2550 if (XLOG_FORCED_SHUTDOWN(log))
2581 goto error_return; 2551 goto error_return_unlocked;
2582 2552
2583 free_bytes = xlog_space_left(log, log->l_grant_reserve_cycle, 2553 free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
2584 log->l_grant_reserve_bytes);
2585 if (free_bytes < need_bytes) { 2554 if (free_bytes < need_bytes) {
2586 if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) 2555 spin_lock(&log->l_grant_reserve_lock);
2587 xlog_ins_ticketq(&log->l_reserve_headq, tic); 2556 if (list_empty(&tic->t_queue))
2557 list_add_tail(&tic->t_queue, &log->l_reserveq);
2588 2558
2589 trace_xfs_log_grant_sleep2(log, tic); 2559 trace_xfs_log_grant_sleep2(log, tic);
2590 2560
2591 spin_unlock(&log->l_grant_lock);
2592 xlog_grant_push_ail(log->l_mp, need_bytes);
2593 spin_lock(&log->l_grant_lock);
2594
2595 XFS_STATS_INC(xs_sleep_logspace);
2596 sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s);
2597
2598 spin_lock(&log->l_grant_lock);
2599 if (XLOG_FORCED_SHUTDOWN(log)) 2561 if (XLOG_FORCED_SHUTDOWN(log))
2600 goto error_return; 2562 goto error_return;
2601 2563
2602 trace_xfs_log_grant_wake2(log, tic); 2564 xlog_grant_push_ail(log, need_bytes);
2565
2566 XFS_STATS_INC(xs_sleep_logspace);
2567 xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock);
2603 2568
2569 trace_xfs_log_grant_wake2(log, tic);
2604 goto redo; 2570 goto redo;
2605 } else if (tic->t_flags & XLOG_TIC_IN_Q) 2571 }
2606 xlog_del_ticketq(&log->l_reserve_headq, tic);
2607 2572
2608 /* we've got enough space */ 2573 if (!list_empty(&tic->t_queue)) {
2609 xlog_grant_add_space(log, need_bytes); 2574 spin_lock(&log->l_grant_reserve_lock);
2610#ifdef DEBUG 2575 list_del_init(&tic->t_queue);
2611 tail_lsn = log->l_tail_lsn; 2576 spin_unlock(&log->l_grant_reserve_lock);
2612 /*
2613 * Check to make sure the grant write head didn't just over lap the
2614 * tail. If the cycles are the same, we can't be overlapping.
2615 * Otherwise, make sure that the cycles differ by exactly one and
2616 * check the byte count.
2617 */
2618 if (CYCLE_LSN(tail_lsn) != log->l_grant_write_cycle) {
2619 ASSERT(log->l_grant_write_cycle-1 == CYCLE_LSN(tail_lsn));
2620 ASSERT(log->l_grant_write_bytes <= BBTOB(BLOCK_LSN(tail_lsn)));
2621 } 2577 }
2622#endif 2578
2579 /* we've got enough space */
2580 xlog_grant_add_space(log, &log->l_grant_reserve_head, need_bytes);
2581 xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes);
2623 trace_xfs_log_grant_exit(log, tic); 2582 trace_xfs_log_grant_exit(log, tic);
2624 xlog_verify_grant_head(log, 1); 2583 xlog_verify_grant_tail(log);
2625 spin_unlock(&log->l_grant_lock);
2626 return 0; 2584 return 0;
2627 2585
2628 error_return: 2586error_return_unlocked:
2629 if (tic->t_flags & XLOG_TIC_IN_Q) 2587 spin_lock(&log->l_grant_reserve_lock);
2630 xlog_del_ticketq(&log->l_reserve_headq, tic); 2588error_return:
2631 2589 list_del_init(&tic->t_queue);
2590 spin_unlock(&log->l_grant_reserve_lock);
2632 trace_xfs_log_grant_error(log, tic); 2591 trace_xfs_log_grant_error(log, tic);
2633 2592
2634 /* 2593 /*
@@ -2638,7 +2597,6 @@ redo:
2638 */ 2597 */
2639 tic->t_curr_res = 0; 2598 tic->t_curr_res = 0;
2640 tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ 2599 tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
2641 spin_unlock(&log->l_grant_lock);
2642 return XFS_ERROR(EIO); 2600 return XFS_ERROR(EIO);
2643} /* xlog_grant_log_space */ 2601} /* xlog_grant_log_space */
2644 2602
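The comment added ahead of xlog_grant_log_space() describes the lock-free fast path this hunk introduces: peek at the queue without the lock, and only take l_grant_reserve_lock when the ticket actually has to be queued, rechecking under the lock. A condensed kernel-style sketch of that double-checked pattern follows; it is not standalone code, it assumes the kernel list/spinlock APIs and the xlog_wait() helper this series adds, and the function name is hypothetical:

STATIC void
xlog_queue_if_busy(struct log *log, struct xlog_ticket *tic)
{
	if (list_empty_careful(&log->l_reserveq))
		return;				/* fast path: no lock taken */

	spin_lock(&log->l_grant_reserve_lock);
	if (list_empty(&log->l_reserveq)) {	/* raced: the queue drained */
		spin_unlock(&log->l_grant_reserve_lock);
		return;
	}
	list_add_tail(&tic->t_queue, &log->l_reserveq);
	/* xlog_wait() drops l_grant_reserve_lock before sleeping */
	xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock);
}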
@@ -2646,17 +2604,14 @@ redo:
2646/* 2604/*
2647 * Replenish the byte reservation required by moving the grant write head. 2605 * Replenish the byte reservation required by moving the grant write head.
2648 * 2606 *
2649 * 2607 * Similar to xlog_grant_log_space, the function is structured to have a lock
2608 * free fast path.
2650 */ 2609 */
2651STATIC int 2610STATIC int
2652xlog_regrant_write_log_space(xlog_t *log, 2611xlog_regrant_write_log_space(xlog_t *log,
2653 xlog_ticket_t *tic) 2612 xlog_ticket_t *tic)
2654{ 2613{
2655 int free_bytes, need_bytes; 2614 int free_bytes, need_bytes;
2656 xlog_ticket_t *ntic;
2657#ifdef DEBUG
2658 xfs_lsn_t tail_lsn;
2659#endif
2660 2615
2661 tic->t_curr_res = tic->t_unit_res; 2616 tic->t_curr_res = tic->t_unit_res;
2662 xlog_tic_reset_res(tic); 2617 xlog_tic_reset_res(tic);
@@ -2669,12 +2624,9 @@ xlog_regrant_write_log_space(xlog_t *log,
2669 panic("regrant Recovery problem"); 2624 panic("regrant Recovery problem");
2670#endif 2625#endif
2671 2626
2672 spin_lock(&log->l_grant_lock);
2673
2674 trace_xfs_log_regrant_write_enter(log, tic); 2627 trace_xfs_log_regrant_write_enter(log, tic);
2675
2676 if (XLOG_FORCED_SHUTDOWN(log)) 2628 if (XLOG_FORCED_SHUTDOWN(log))
2677 goto error_return; 2629 goto error_return_unlocked;
2678 2630
2679 /* If there are other waiters on the queue then give them a 2631 /* If there are other waiters on the queue then give them a
2680 * chance at logspace before us. Wake up the first waiters, 2632 * chance at logspace before us. Wake up the first waiters,
@@ -2683,92 +2635,76 @@ xlog_regrant_write_log_space(xlog_t *log,
2683 * this transaction. 2635 * this transaction.
2684 */ 2636 */
2685 need_bytes = tic->t_unit_res; 2637 need_bytes = tic->t_unit_res;
2686 if ((ntic = log->l_write_headq)) { 2638 if (!list_empty_careful(&log->l_writeq)) {
2687 free_bytes = xlog_space_left(log, log->l_grant_write_cycle, 2639 struct xlog_ticket *ntic;
2688 log->l_grant_write_bytes); 2640
2689 do { 2641 spin_lock(&log->l_grant_write_lock);
2642 free_bytes = xlog_space_left(log, &log->l_grant_write_head);
2643 list_for_each_entry(ntic, &log->l_writeq, t_queue) {
2690 ASSERT(ntic->t_flags & XLOG_TIC_PERM_RESERV); 2644 ASSERT(ntic->t_flags & XLOG_TIC_PERM_RESERV);
2691 2645
2692 if (free_bytes < ntic->t_unit_res) 2646 if (free_bytes < ntic->t_unit_res)
2693 break; 2647 break;
2694 free_bytes -= ntic->t_unit_res; 2648 free_bytes -= ntic->t_unit_res;
2695 sv_signal(&ntic->t_wait); 2649 wake_up(&ntic->t_wait);
2696 ntic = ntic->t_next; 2650 }
2697 } while (ntic != log->l_write_headq);
2698
2699 if (ntic != log->l_write_headq) {
2700 if ((tic->t_flags & XLOG_TIC_IN_Q) == 0)
2701 xlog_ins_ticketq(&log->l_write_headq, tic);
2702 2651
2652 if (ntic != list_first_entry(&log->l_writeq,
2653 struct xlog_ticket, t_queue)) {
2654 if (list_empty(&tic->t_queue))
2655 list_add_tail(&tic->t_queue, &log->l_writeq);
2703 trace_xfs_log_regrant_write_sleep1(log, tic); 2656 trace_xfs_log_regrant_write_sleep1(log, tic);
2704 2657
2705 spin_unlock(&log->l_grant_lock); 2658 xlog_grant_push_ail(log, need_bytes);
2706 xlog_grant_push_ail(log->l_mp, need_bytes);
2707 spin_lock(&log->l_grant_lock);
2708 2659
2709 XFS_STATS_INC(xs_sleep_logspace); 2660 XFS_STATS_INC(xs_sleep_logspace);
2710 sv_wait(&tic->t_wait, PINOD|PLTWAIT, 2661 xlog_wait(&tic->t_wait, &log->l_grant_write_lock);
2711 &log->l_grant_lock, s);
2712
2713 /* If we're shutting down, this tic is already
2714 * off the queue */
2715 spin_lock(&log->l_grant_lock);
2716 if (XLOG_FORCED_SHUTDOWN(log))
2717 goto error_return;
2718
2719 trace_xfs_log_regrant_write_wake1(log, tic); 2662 trace_xfs_log_regrant_write_wake1(log, tic);
2720 } 2663 } else
2664 spin_unlock(&log->l_grant_write_lock);
2721 } 2665 }
2722 2666
2723redo: 2667redo:
2724 if (XLOG_FORCED_SHUTDOWN(log)) 2668 if (XLOG_FORCED_SHUTDOWN(log))
2725 goto error_return; 2669 goto error_return_unlocked;
2726 2670
2727 free_bytes = xlog_space_left(log, log->l_grant_write_cycle, 2671 free_bytes = xlog_space_left(log, &log->l_grant_write_head);
2728 log->l_grant_write_bytes);
2729 if (free_bytes < need_bytes) { 2672 if (free_bytes < need_bytes) {
2730 if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) 2673 spin_lock(&log->l_grant_write_lock);
2731 xlog_ins_ticketq(&log->l_write_headq, tic); 2674 if (list_empty(&tic->t_queue))
2732 spin_unlock(&log->l_grant_lock); 2675 list_add_tail(&tic->t_queue, &log->l_writeq);
2733 xlog_grant_push_ail(log->l_mp, need_bytes);
2734 spin_lock(&log->l_grant_lock);
2735
2736 XFS_STATS_INC(xs_sleep_logspace);
2737 trace_xfs_log_regrant_write_sleep2(log, tic);
2738
2739 sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s);
2740 2676
2741 /* If we're shutting down, this tic is already off the queue */
2742 spin_lock(&log->l_grant_lock);
2743 if (XLOG_FORCED_SHUTDOWN(log)) 2677 if (XLOG_FORCED_SHUTDOWN(log))
2744 goto error_return; 2678 goto error_return;
2745 2679
2680 xlog_grant_push_ail(log, need_bytes);
2681
2682 XFS_STATS_INC(xs_sleep_logspace);
2683 trace_xfs_log_regrant_write_sleep2(log, tic);
2684 xlog_wait(&tic->t_wait, &log->l_grant_write_lock);
2685
2746 trace_xfs_log_regrant_write_wake2(log, tic); 2686 trace_xfs_log_regrant_write_wake2(log, tic);
2747 goto redo; 2687 goto redo;
2748 } else if (tic->t_flags & XLOG_TIC_IN_Q) 2688 }
2749 xlog_del_ticketq(&log->l_write_headq, tic);
2750 2689
2751 /* we've got enough space */ 2690 if (!list_empty(&tic->t_queue)) {
2752 xlog_grant_add_space_write(log, need_bytes); 2691 spin_lock(&log->l_grant_write_lock);
2753#ifdef DEBUG 2692 list_del_init(&tic->t_queue);
2754 tail_lsn = log->l_tail_lsn; 2693 spin_unlock(&log->l_grant_write_lock);
2755 if (CYCLE_LSN(tail_lsn) != log->l_grant_write_cycle) {
2756 ASSERT(log->l_grant_write_cycle-1 == CYCLE_LSN(tail_lsn));
2757 ASSERT(log->l_grant_write_bytes <= BBTOB(BLOCK_LSN(tail_lsn)));
2758 } 2694 }
2759#endif
2760 2695
2696 /* we've got enough space */
2697 xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes);
2761 trace_xfs_log_regrant_write_exit(log, tic); 2698 trace_xfs_log_regrant_write_exit(log, tic);
2762 2699 xlog_verify_grant_tail(log);
2763 xlog_verify_grant_head(log, 1);
2764 spin_unlock(&log->l_grant_lock);
2765 return 0; 2700 return 0;
2766 2701
2767 2702
2703 error_return_unlocked:
2704 spin_lock(&log->l_grant_write_lock);
2768 error_return: 2705 error_return:
2769 if (tic->t_flags & XLOG_TIC_IN_Q) 2706 list_del_init(&tic->t_queue);
2770 xlog_del_ticketq(&log->l_reserve_headq, tic); 2707 spin_unlock(&log->l_grant_write_lock);
2771
2772 trace_xfs_log_regrant_write_error(log, tic); 2708 trace_xfs_log_regrant_write_error(log, tic);
2773 2709
2774 /* 2710 /*
@@ -2778,7 +2714,6 @@ redo:
2778 */ 2714 */
2779 tic->t_curr_res = 0; 2715 tic->t_curr_res = 0;
2780 tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ 2716 tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
2781 spin_unlock(&log->l_grant_lock);
2782 return XFS_ERROR(EIO); 2717 return XFS_ERROR(EIO);
2783} /* xlog_regrant_write_log_space */ 2718} /* xlog_regrant_write_log_space */
2784 2719
@@ -2799,27 +2734,24 @@ xlog_regrant_reserve_log_space(xlog_t *log,
2799 if (ticket->t_cnt > 0) 2734 if (ticket->t_cnt > 0)
2800 ticket->t_cnt--; 2735 ticket->t_cnt--;
2801 2736
2802 spin_lock(&log->l_grant_lock); 2737 xlog_grant_sub_space(log, &log->l_grant_reserve_head,
2803 xlog_grant_sub_space(log, ticket->t_curr_res); 2738 ticket->t_curr_res);
2739 xlog_grant_sub_space(log, &log->l_grant_write_head,
2740 ticket->t_curr_res);
2804 ticket->t_curr_res = ticket->t_unit_res; 2741 ticket->t_curr_res = ticket->t_unit_res;
2805 xlog_tic_reset_res(ticket); 2742 xlog_tic_reset_res(ticket);
2806 2743
2807 trace_xfs_log_regrant_reserve_sub(log, ticket); 2744 trace_xfs_log_regrant_reserve_sub(log, ticket);
2808 2745
2809 xlog_verify_grant_head(log, 1);
2810
2811 /* just return if we still have some of the pre-reserved space */ 2746 /* just return if we still have some of the pre-reserved space */
2812 if (ticket->t_cnt > 0) { 2747 if (ticket->t_cnt > 0)
2813 spin_unlock(&log->l_grant_lock);
2814 return; 2748 return;
2815 }
2816 2749
2817 xlog_grant_add_space_reserve(log, ticket->t_unit_res); 2750 xlog_grant_add_space(log, &log->l_grant_reserve_head,
2751 ticket->t_unit_res);
2818 2752
2819 trace_xfs_log_regrant_reserve_exit(log, ticket); 2753 trace_xfs_log_regrant_reserve_exit(log, ticket);
2820 2754
2821 xlog_verify_grant_head(log, 0);
2822 spin_unlock(&log->l_grant_lock);
2823 ticket->t_curr_res = ticket->t_unit_res; 2755 ticket->t_curr_res = ticket->t_unit_res;
2824 xlog_tic_reset_res(ticket); 2756 xlog_tic_reset_res(ticket);
2825} /* xlog_regrant_reserve_log_space */ 2757} /* xlog_regrant_reserve_log_space */
@@ -2843,28 +2775,29 @@ STATIC void
2843xlog_ungrant_log_space(xlog_t *log, 2775xlog_ungrant_log_space(xlog_t *log,
2844 xlog_ticket_t *ticket) 2776 xlog_ticket_t *ticket)
2845{ 2777{
2778 int bytes;
2779
2846 if (ticket->t_cnt > 0) 2780 if (ticket->t_cnt > 0)
2847 ticket->t_cnt--; 2781 ticket->t_cnt--;
2848 2782
2849 spin_lock(&log->l_grant_lock);
2850 trace_xfs_log_ungrant_enter(log, ticket); 2783 trace_xfs_log_ungrant_enter(log, ticket);
2851
2852 xlog_grant_sub_space(log, ticket->t_curr_res);
2853
2854 trace_xfs_log_ungrant_sub(log, ticket); 2784 trace_xfs_log_ungrant_sub(log, ticket);
2855 2785
2856 /* If this is a permanent reservation ticket, we may be able to free 2786 /*
2787 * If this is a permanent reservation ticket, we may be able to free
2857 * up more space based on the remaining count. 2788 * up more space based on the remaining count.
2858 */ 2789 */
2790 bytes = ticket->t_curr_res;
2859 if (ticket->t_cnt > 0) { 2791 if (ticket->t_cnt > 0) {
2860 ASSERT(ticket->t_flags & XLOG_TIC_PERM_RESERV); 2792 ASSERT(ticket->t_flags & XLOG_TIC_PERM_RESERV);
2861 xlog_grant_sub_space(log, ticket->t_unit_res*ticket->t_cnt); 2793 bytes += ticket->t_unit_res*ticket->t_cnt;
2862 } 2794 }
2863 2795
2796 xlog_grant_sub_space(log, &log->l_grant_reserve_head, bytes);
2797 xlog_grant_sub_space(log, &log->l_grant_write_head, bytes);
2798
2864 trace_xfs_log_ungrant_exit(log, ticket); 2799 trace_xfs_log_ungrant_exit(log, ticket);
2865 2800
2866 xlog_verify_grant_head(log, 1);
2867 spin_unlock(&log->l_grant_lock);
2868 xfs_log_move_tail(log->l_mp, 1); 2801 xfs_log_move_tail(log->l_mp, 1);
2869} /* xlog_ungrant_log_space */ 2802} /* xlog_ungrant_log_space */
2870 2803
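xlog_ungrant_log_space() now accumulates everything it is giving back into a single byte count before subtracting it from both grant heads, instead of two separate subtractions under the old grant lock. A tiny standalone check of that arithmetic (field names mirror the ticket; the numbers are made up):

#include <assert.h>

/* Model of the single byte count handed to the grant heads above. */
struct ticket { int t_curr_res, t_unit_res, t_cnt; };

static int ungrant_bytes(const struct ticket *t)
{
	int bytes = t->t_curr_res;

	if (t->t_cnt > 0)	/* permanent reservation: give back the rest too */
		bytes += t->t_unit_res * t->t_cnt;
	return bytes;
}

int main(void)
{
	struct ticket t = { .t_curr_res = 1024, .t_unit_res = 4096, .t_cnt = 2 };
	assert(ungrant_bytes(&t) == 1024 + 4096 * 2);
	return 0;
}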
@@ -2901,11 +2834,11 @@ xlog_state_release_iclog(
2901 2834
2902 if (iclog->ic_state == XLOG_STATE_WANT_SYNC) { 2835 if (iclog->ic_state == XLOG_STATE_WANT_SYNC) {
2903 /* update tail before writing to iclog */ 2836 /* update tail before writing to iclog */
2904 xlog_assign_tail_lsn(log->l_mp); 2837 xfs_lsn_t tail_lsn = xlog_assign_tail_lsn(log->l_mp);
2905 sync++; 2838 sync++;
2906 iclog->ic_state = XLOG_STATE_SYNCING; 2839 iclog->ic_state = XLOG_STATE_SYNCING;
2907 iclog->ic_header.h_tail_lsn = cpu_to_be64(log->l_tail_lsn); 2840 iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
2908 xlog_verify_tail_lsn(log, iclog, log->l_tail_lsn); 2841 xlog_verify_tail_lsn(log, iclog, tail_lsn);
2909 /* cycle incremented when incrementing curr_block */ 2842 /* cycle incremented when incrementing curr_block */
2910 } 2843 }
2911 spin_unlock(&log->l_icloglock); 2844 spin_unlock(&log->l_icloglock);
@@ -3088,7 +3021,7 @@ maybe_sleep:
3088 return XFS_ERROR(EIO); 3021 return XFS_ERROR(EIO);
3089 } 3022 }
3090 XFS_STATS_INC(xs_log_force_sleep); 3023 XFS_STATS_INC(xs_log_force_sleep);
3091 sv_wait(&iclog->ic_force_wait, PINOD, &log->l_icloglock, s); 3024 xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
3092 /* 3025 /*
3093 * No need to grab the log lock here since we're 3026 * No need to grab the log lock here since we're
3094 * only deciding whether or not to return EIO 3027 * only deciding whether or not to return EIO
@@ -3206,8 +3139,8 @@ try_again:
3206 3139
3207 XFS_STATS_INC(xs_log_force_sleep); 3140 XFS_STATS_INC(xs_log_force_sleep);
3208 3141
3209 sv_wait(&iclog->ic_prev->ic_write_wait, 3142 xlog_wait(&iclog->ic_prev->ic_write_wait,
3210 PSWP, &log->l_icloglock, s); 3143 &log->l_icloglock);
3211 if (log_flushed) 3144 if (log_flushed)
3212 *log_flushed = 1; 3145 *log_flushed = 1;
3213 already_slept = 1; 3146 already_slept = 1;
@@ -3235,7 +3168,7 @@ try_again:
3235 return XFS_ERROR(EIO); 3168 return XFS_ERROR(EIO);
3236 } 3169 }
3237 XFS_STATS_INC(xs_log_force_sleep); 3170 XFS_STATS_INC(xs_log_force_sleep);
3238 sv_wait(&iclog->ic_force_wait, PSWP, &log->l_icloglock, s); 3171 xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
3239 /* 3172 /*
3240 * No need to grab the log lock here since we're 3173 * No need to grab the log lock here since we're
3241 * only deciding whether or not to return EIO 3174 * only deciding whether or not to return EIO
@@ -3310,10 +3243,8 @@ xfs_log_ticket_put(
3310 xlog_ticket_t *ticket) 3243 xlog_ticket_t *ticket)
3311{ 3244{
3312 ASSERT(atomic_read(&ticket->t_ref) > 0); 3245 ASSERT(atomic_read(&ticket->t_ref) > 0);
3313 if (atomic_dec_and_test(&ticket->t_ref)) { 3246 if (atomic_dec_and_test(&ticket->t_ref))
3314 sv_destroy(&ticket->t_wait);
3315 kmem_zone_free(xfs_log_ticket_zone, ticket); 3247 kmem_zone_free(xfs_log_ticket_zone, ticket);
3316 }
3317} 3248}
3318 3249
3319xlog_ticket_t * 3250xlog_ticket_t *
@@ -3435,6 +3366,7 @@ xlog_ticket_alloc(
3435 } 3366 }
3436 3367
3437 atomic_set(&tic->t_ref, 1); 3368 atomic_set(&tic->t_ref, 1);
3369 INIT_LIST_HEAD(&tic->t_queue);
3438 tic->t_unit_res = unit_bytes; 3370 tic->t_unit_res = unit_bytes;
3439 tic->t_curr_res = unit_bytes; 3371 tic->t_curr_res = unit_bytes;
3440 tic->t_cnt = cnt; 3372 tic->t_cnt = cnt;
@@ -3445,7 +3377,7 @@ xlog_ticket_alloc(
3445 tic->t_trans_type = 0; 3377 tic->t_trans_type = 0;
3446 if (xflags & XFS_LOG_PERM_RESERV) 3378 if (xflags & XFS_LOG_PERM_RESERV)
3447 tic->t_flags |= XLOG_TIC_PERM_RESERV; 3379 tic->t_flags |= XLOG_TIC_PERM_RESERV;
3448 sv_init(&tic->t_wait, SV_DEFAULT, "logtick"); 3380 init_waitqueue_head(&tic->t_wait);
3449 3381
3450 xlog_tic_reset_res(tic); 3382 xlog_tic_reset_res(tic);
3451 3383
@@ -3484,18 +3416,25 @@ xlog_verify_dest_ptr(
3484} 3416}
3485 3417
3486STATIC void 3418STATIC void
3487xlog_verify_grant_head(xlog_t *log, int equals) 3419xlog_verify_grant_tail(
3420 struct log *log)
3488{ 3421{
3489 if (log->l_grant_reserve_cycle == log->l_grant_write_cycle) { 3422 int tail_cycle, tail_blocks;
3490 if (equals) 3423 int cycle, space;
3491 ASSERT(log->l_grant_reserve_bytes >= log->l_grant_write_bytes); 3424
3492 else 3425 /*
3493 ASSERT(log->l_grant_reserve_bytes > log->l_grant_write_bytes); 3426 * Check to make sure the grant write head didn't just overlap the
3494 } else { 3427 * tail. If the cycles are the same, we can't be overlapping.
3495 ASSERT(log->l_grant_reserve_cycle-1 == log->l_grant_write_cycle); 3428 * Otherwise, make sure that the cycles differ by exactly one and
3496 ASSERT(log->l_grant_write_bytes >= log->l_grant_reserve_bytes); 3429 * check the byte count.
3497 } 3430 */
3498} /* xlog_verify_grant_head */ 3431 xlog_crack_grant_head(&log->l_grant_write_head, &cycle, &space);
3432 xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_blocks);
3433 if (tail_cycle != cycle) {
3434 ASSERT(cycle - 1 == tail_cycle);
3435 ASSERT(space <= BBTOB(tail_blocks));
3436 }
3437}
3499 3438
3500/* check if it will fit */ 3439/* check if it will fit */
3501STATIC void 3440STATIC void
@@ -3716,12 +3655,10 @@ xfs_log_force_umount(
3716 xlog_cil_force(log); 3655 xlog_cil_force(log);
3717 3656
3718 /* 3657 /*
3719 * We must hold both the GRANT lock and the LOG lock, 3658 * mark the filesystem and the log as in a shutdown state and wake
3720 * before we mark the filesystem SHUTDOWN and wake 3659 * everybody up to tell them the bad news.
3721 * everybody up to tell the bad news.
3722 */ 3660 */
3723 spin_lock(&log->l_icloglock); 3661 spin_lock(&log->l_icloglock);
3724 spin_lock(&log->l_grant_lock);
3725 mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN; 3662 mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN;
3726 if (mp->m_sb_bp) 3663 if (mp->m_sb_bp)
3727 XFS_BUF_DONE(mp->m_sb_bp); 3664 XFS_BUF_DONE(mp->m_sb_bp);
@@ -3742,27 +3679,21 @@ xfs_log_force_umount(
3742 spin_unlock(&log->l_icloglock); 3679 spin_unlock(&log->l_icloglock);
3743 3680
3744 /* 3681 /*
3745 * We don't want anybody waiting for log reservations 3682 * We don't want anybody waiting for log reservations after this. That
3746 * after this. That means we have to wake up everybody 3683 * means we have to wake up everybody queued up on reserveq as well as
3747 * queued up on reserve_headq as well as write_headq. 3684 * writeq. In addition, we make sure in xlog_{re}grant_log_space that
3748 * In addition, we make sure in xlog_{re}grant_log_space 3685 * we don't enqueue anything once the SHUTDOWN flag is set, and this
3749 * that we don't enqueue anything once the SHUTDOWN flag 3686 * action is protected by the grant locks.
3750 * is set, and this action is protected by the GRANTLOCK.
3751 */ 3687 */
3752 if ((tic = log->l_reserve_headq)) { 3688 spin_lock(&log->l_grant_reserve_lock);
3753 do { 3689 list_for_each_entry(tic, &log->l_reserveq, t_queue)
3754 sv_signal(&tic->t_wait); 3690 wake_up(&tic->t_wait);
3755 tic = tic->t_next; 3691 spin_unlock(&log->l_grant_reserve_lock);
3756 } while (tic != log->l_reserve_headq); 3692
3757 } 3693 spin_lock(&log->l_grant_write_lock);
3758 3694 list_for_each_entry(tic, &log->l_writeq, t_queue)
3759 if ((tic = log->l_write_headq)) { 3695 wake_up(&tic->t_wait);
3760 do { 3696 spin_unlock(&log->l_grant_write_lock);
3761 sv_signal(&tic->t_wait);
3762 tic = tic->t_next;
3763 } while (tic != log->l_write_headq);
3764 }
3765 spin_unlock(&log->l_grant_lock);
3766 3697
3767 if (!(log->l_iclog->ic_state & XLOG_STATE_IOERROR)) { 3698 if (!(log->l_iclog->ic_state & XLOG_STATE_IOERROR)) {
3768 ASSERT(!logerror); 3699 ASSERT(!logerror);
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 916eb7db14d9..3bd3291ef8d2 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -191,7 +191,7 @@ void xfs_log_ticket_put(struct xlog_ticket *ticket);
191 191
192xlog_tid_t xfs_log_get_trans_ident(struct xfs_trans *tp); 192xlog_tid_t xfs_log_get_trans_ident(struct xfs_trans *tp);
193 193
194int xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, 194void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
195 struct xfs_log_vec *log_vector, 195 struct xfs_log_vec *log_vector,
196 xfs_lsn_t *commit_lsn, int flags); 196 xfs_lsn_t *commit_lsn, int flags);
197bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); 197bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 23d6ceb5e97b..9ca59be08977 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -61,7 +61,7 @@ xlog_cil_init(
61 INIT_LIST_HEAD(&cil->xc_committing); 61 INIT_LIST_HEAD(&cil->xc_committing);
62 spin_lock_init(&cil->xc_cil_lock); 62 spin_lock_init(&cil->xc_cil_lock);
63 init_rwsem(&cil->xc_ctx_lock); 63 init_rwsem(&cil->xc_ctx_lock);
64 sv_init(&cil->xc_commit_wait, SV_DEFAULT, "cilwait"); 64 init_waitqueue_head(&cil->xc_commit_wait);
65 65
66 INIT_LIST_HEAD(&ctx->committing); 66 INIT_LIST_HEAD(&ctx->committing);
67 INIT_LIST_HEAD(&ctx->busy_extents); 67 INIT_LIST_HEAD(&ctx->busy_extents);
@@ -361,15 +361,10 @@ xlog_cil_committed(
361 int abort) 361 int abort)
362{ 362{
363 struct xfs_cil_ctx *ctx = args; 363 struct xfs_cil_ctx *ctx = args;
364 struct xfs_log_vec *lv;
365 int abortflag = abort ? XFS_LI_ABORTED : 0;
366 struct xfs_busy_extent *busyp, *n; 364 struct xfs_busy_extent *busyp, *n;
367 365
368 /* unpin all the log items */ 366 xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain,
369 for (lv = ctx->lv_chain; lv; lv = lv->lv_next ) { 367 ctx->start_lsn, abort);
370 xfs_trans_item_committed(lv->lv_item, ctx->start_lsn,
371 abortflag);
372 }
373 368
374 list_for_each_entry_safe(busyp, n, &ctx->busy_extents, list) 369 list_for_each_entry_safe(busyp, n, &ctx->busy_extents, list)
375 xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, busyp); 370 xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, busyp);
@@ -548,7 +543,7 @@ xlog_cil_push(
548 543
549 error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0); 544 error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0);
550 if (error) 545 if (error)
551 goto out_abort; 546 goto out_abort_free_ticket;
552 547
553 /* 548 /*
554 * now that we've written the checkpoint into the log, strictly 549 * now that we've written the checkpoint into the log, strictly
@@ -568,14 +563,15 @@ restart:
568 * It is still being pushed! Wait for the push to 563 * It is still being pushed! Wait for the push to
569 * complete, then start again from the beginning. 564 * complete, then start again from the beginning.
570 */ 565 */
571 sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0); 566 xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock);
572 goto restart; 567 goto restart;
573 } 568 }
574 } 569 }
575 spin_unlock(&cil->xc_cil_lock); 570 spin_unlock(&cil->xc_cil_lock);
576 571
572 /* xfs_log_done always frees the ticket on error. */
577 commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0); 573 commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0);
578 if (error || commit_lsn == -1) 574 if (commit_lsn == -1)
579 goto out_abort; 575 goto out_abort;
580 576
581 /* attach all the transactions w/ busy extents to iclog */ 577 /* attach all the transactions w/ busy extents to iclog */
@@ -592,7 +588,7 @@ restart:
592 */ 588 */
593 spin_lock(&cil->xc_cil_lock); 589 spin_lock(&cil->xc_cil_lock);
594 ctx->commit_lsn = commit_lsn; 590 ctx->commit_lsn = commit_lsn;
595 sv_broadcast(&cil->xc_commit_wait); 591 wake_up_all(&cil->xc_commit_wait);
596 spin_unlock(&cil->xc_cil_lock); 592 spin_unlock(&cil->xc_cil_lock);
597 593
598 /* release the hounds! */ 594 /* release the hounds! */
@@ -605,6 +601,8 @@ out_free_ticket:
605 kmem_free(new_ctx); 601 kmem_free(new_ctx);
606 return 0; 602 return 0;
607 603
604out_abort_free_ticket:
605 xfs_log_ticket_put(tic);
608out_abort: 606out_abort:
609 xlog_cil_committed(ctx, XFS_LI_ABORTED); 607 xlog_cil_committed(ctx, XFS_LI_ABORTED);
610 return XFS_ERROR(EIO); 608 return XFS_ERROR(EIO);
@@ -627,7 +625,7 @@ out_abort:
627 * background commit, returns without it held once background commits are 625 * background commit, returns without it held once background commits are
628 * allowed again. 626 * allowed again.
629 */ 627 */
630int 628void
631xfs_log_commit_cil( 629xfs_log_commit_cil(
632 struct xfs_mount *mp, 630 struct xfs_mount *mp,
633 struct xfs_trans *tp, 631 struct xfs_trans *tp,
@@ -642,11 +640,6 @@ xfs_log_commit_cil(
642 if (flags & XFS_TRANS_RELEASE_LOG_RES) 640 if (flags & XFS_TRANS_RELEASE_LOG_RES)
643 log_flags = XFS_LOG_REL_PERM_RESERV; 641 log_flags = XFS_LOG_REL_PERM_RESERV;
644 642
645 if (XLOG_FORCED_SHUTDOWN(log)) {
646 xlog_cil_free_logvec(log_vector);
647 return XFS_ERROR(EIO);
648 }
649
650 /* 643 /*
651 * do all the hard work of formatting items (including memory 644 * do all the hard work of formatting items (including memory
652 * allocation) outside the CIL context lock. This prevents stalling CIL 645 * allocation) outside the CIL context lock. This prevents stalling CIL
@@ -706,7 +699,6 @@ xfs_log_commit_cil(
706 */ 699 */
707 if (push) 700 if (push)
708 xlog_cil_push(log, 0); 701 xlog_cil_push(log, 0);
709 return 0;
710} 702}
711 703
712/* 704/*
@@ -757,7 +749,7 @@ restart:
757 * It is still being pushed! Wait for the push to 749 * It is still being pushed! Wait for the push to
758 * complete, then start again from the beginning. 750 * complete, then start again from the beginning.
759 */ 751 */
760 sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0); 752 xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock);
761 goto restart; 753 goto restart;
762 } 754 }
763 if (ctx->sequence != sequence) 755 if (ctx->sequence != sequence)
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index edcdfe01617f..d5f8be8f4bf6 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -21,7 +21,6 @@
21struct xfs_buf; 21struct xfs_buf;
22struct log; 22struct log;
23struct xlog_ticket; 23struct xlog_ticket;
24struct xfs_buf_cancel;
25struct xfs_mount; 24struct xfs_mount;
26 25
27/* 26/*
@@ -54,7 +53,6 @@ struct xfs_mount;
54 BTOBB(XLOG_MAX_ICLOGS << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \ 53 BTOBB(XLOG_MAX_ICLOGS << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \
55 XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT)) 54 XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT))
56 55
57
58static inline xfs_lsn_t xlog_assign_lsn(uint cycle, uint block) 56static inline xfs_lsn_t xlog_assign_lsn(uint cycle, uint block)
59{ 57{
60 return ((xfs_lsn_t)cycle << 32) | block; 58 return ((xfs_lsn_t)cycle << 32) | block;
@@ -133,12 +131,10 @@ static inline uint xlog_get_client_id(__be32 i)
133 */ 131 */
134#define XLOG_TIC_INITED 0x1 /* has been initialized */ 132#define XLOG_TIC_INITED 0x1 /* has been initialized */
135#define XLOG_TIC_PERM_RESERV 0x2 /* permanent reservation */ 133#define XLOG_TIC_PERM_RESERV 0x2 /* permanent reservation */
136#define XLOG_TIC_IN_Q 0x4
137 134
138#define XLOG_TIC_FLAGS \ 135#define XLOG_TIC_FLAGS \
139 { XLOG_TIC_INITED, "XLOG_TIC_INITED" }, \ 136 { XLOG_TIC_INITED, "XLOG_TIC_INITED" }, \
140 { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" }, \ 137 { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" }
141 { XLOG_TIC_IN_Q, "XLOG_TIC_IN_Q" }
142 138
143#endif /* __KERNEL__ */ 139#endif /* __KERNEL__ */
144 140
@@ -244,9 +240,8 @@ typedef struct xlog_res {
244} xlog_res_t; 240} xlog_res_t;
245 241
246typedef struct xlog_ticket { 242typedef struct xlog_ticket {
247 sv_t t_wait; /* ticket wait queue : 20 */ 243 wait_queue_head_t t_wait; /* ticket wait queue */
248 struct xlog_ticket *t_next; /* :4|8 */ 244 struct list_head t_queue; /* reserve/write queue */
249 struct xlog_ticket *t_prev; /* :4|8 */
250 xlog_tid_t t_tid; /* transaction identifier : 4 */ 245 xlog_tid_t t_tid; /* transaction identifier : 4 */
251 atomic_t t_ref; /* ticket reference count : 4 */ 246 atomic_t t_ref; /* ticket reference count : 4 */
252 int t_curr_res; /* current reservation in bytes : 4 */ 247 int t_curr_res; /* current reservation in bytes : 4 */
@@ -353,8 +348,8 @@ typedef union xlog_in_core2 {
353 * and move everything else out to subsequent cachelines. 348 * and move everything else out to subsequent cachelines.
354 */ 349 */
355typedef struct xlog_in_core { 350typedef struct xlog_in_core {
356 sv_t ic_force_wait; 351 wait_queue_head_t ic_force_wait;
357 sv_t ic_write_wait; 352 wait_queue_head_t ic_write_wait;
358 struct xlog_in_core *ic_next; 353 struct xlog_in_core *ic_next;
359 struct xlog_in_core *ic_prev; 354 struct xlog_in_core *ic_prev;
360 struct xfs_buf *ic_bp; 355 struct xfs_buf *ic_bp;
@@ -421,7 +416,7 @@ struct xfs_cil {
421 struct xfs_cil_ctx *xc_ctx; 416 struct xfs_cil_ctx *xc_ctx;
422 struct rw_semaphore xc_ctx_lock; 417 struct rw_semaphore xc_ctx_lock;
423 struct list_head xc_committing; 418 struct list_head xc_committing;
424 sv_t xc_commit_wait; 419 wait_queue_head_t xc_commit_wait;
425 xfs_lsn_t xc_current_sequence; 420 xfs_lsn_t xc_current_sequence;
426}; 421};
427 422
@@ -491,7 +486,7 @@ typedef struct log {
491 struct xfs_buftarg *l_targ; /* buftarg of log */ 486 struct xfs_buftarg *l_targ; /* buftarg of log */
492 uint l_flags; 487 uint l_flags;
493 uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */ 488 uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */
494 struct xfs_buf_cancel **l_buf_cancel_table; 489 struct list_head *l_buf_cancel_table;
495 int l_iclog_hsize; /* size of iclog header */ 490 int l_iclog_hsize; /* size of iclog header */
496 int l_iclog_heads; /* # of iclog header sectors */ 491 int l_iclog_heads; /* # of iclog header sectors */
497 uint l_sectBBsize; /* sector size in BBs (2^n) */ 492 uint l_sectBBsize; /* sector size in BBs (2^n) */
@@ -503,29 +498,40 @@ typedef struct log {
503 int l_logBBsize; /* size of log in BB chunks */ 498 int l_logBBsize; /* size of log in BB chunks */
504 499
505 /* The following block of fields are changed while holding icloglock */ 500 /* The following block of fields are changed while holding icloglock */
506 sv_t l_flush_wait ____cacheline_aligned_in_smp; 501 wait_queue_head_t l_flush_wait ____cacheline_aligned_in_smp;
507 /* waiting for iclog flush */ 502 /* waiting for iclog flush */
508 int l_covered_state;/* state of "covering disk 503 int l_covered_state;/* state of "covering disk
509 * log entries" */ 504 * log entries" */
510 xlog_in_core_t *l_iclog; /* head log queue */ 505 xlog_in_core_t *l_iclog; /* head log queue */
511 spinlock_t l_icloglock; /* grab to change iclog state */ 506 spinlock_t l_icloglock; /* grab to change iclog state */
512 xfs_lsn_t l_tail_lsn; /* lsn of 1st LR with unflushed
513 * buffers */
514 xfs_lsn_t l_last_sync_lsn;/* lsn of last LR on disk */
515 int l_curr_cycle; /* Cycle number of log writes */ 507 int l_curr_cycle; /* Cycle number of log writes */
516 int l_prev_cycle; /* Cycle number before last 508 int l_prev_cycle; /* Cycle number before last
517 * block increment */ 509 * block increment */
518 int l_curr_block; /* current logical log block */ 510 int l_curr_block; /* current logical log block */
519 int l_prev_block; /* previous logical log block */ 511 int l_prev_block; /* previous logical log block */
520 512
521 /* The following block of fields are changed while holding grant_lock */ 513 /*
522 spinlock_t l_grant_lock ____cacheline_aligned_in_smp; 514 * l_last_sync_lsn and l_tail_lsn are atomics so they can be set and
523 xlog_ticket_t *l_reserve_headq; 515 * read without needing to hold specific locks. To avoid operations
524 xlog_ticket_t *l_write_headq; 516 * contending with other hot objects, place each of them on a separate
525 int l_grant_reserve_cycle; 517 * cacheline.
526 int l_grant_reserve_bytes; 518 */
527 int l_grant_write_cycle; 519 /* lsn of last LR on disk */
528 int l_grant_write_bytes; 520 atomic64_t l_last_sync_lsn ____cacheline_aligned_in_smp;
521 /* lsn of 1st LR with unflushed buffers */
522 atomic64_t l_tail_lsn ____cacheline_aligned_in_smp;
523
524 /*
525 * ticket grant locks, queues and accounting have their own cachelines
526 * as these are quite hot and can be operated on concurrently.
527 */
528 spinlock_t l_grant_reserve_lock ____cacheline_aligned_in_smp;
529 struct list_head l_reserveq;
530 atomic64_t l_grant_reserve_head;
531
532 spinlock_t l_grant_write_lock ____cacheline_aligned_in_smp;
533 struct list_head l_writeq;
534 atomic64_t l_grant_write_head;
529 535
530 /* The following field are used for debugging; need to hold icloglock */ 536 /* The following field are used for debugging; need to hold icloglock */
531#ifdef DEBUG 537#ifdef DEBUG
@@ -534,6 +540,9 @@ typedef struct log {
534 540
535} xlog_t; 541} xlog_t;
536 542
543#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \
544 ((log)->l_buf_cancel_table + ((__uint64_t)blkno % XLOG_BC_TABLE_SIZE))
545
537#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR) 546#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR)
538 547
539/* common routines */ 548/* common routines */
@@ -562,6 +571,61 @@ int xlog_write(struct log *log, struct xfs_log_vec *log_vector,
562 xlog_in_core_t **commit_iclog, uint flags); 571 xlog_in_core_t **commit_iclog, uint flags);
563 572
564/* 573/*
574 * When we crack an atomic LSN, we sample it first so that the value will not
575 * change while we are cracking it into the component values. This means we
576 * will always get consistent component values to work from. This should always
577 * be used to sample and crack LSNs that are stored and updated in atomic
578 * variables.
579 */
580static inline void
581xlog_crack_atomic_lsn(atomic64_t *lsn, uint *cycle, uint *block)
582{
583 xfs_lsn_t val = atomic64_read(lsn);
584
585 *cycle = CYCLE_LSN(val);
586 *block = BLOCK_LSN(val);
587}
588
589/*
590 * Calculate and assign a value to an atomic LSN variable from component pieces.
591 */
592static inline void
593xlog_assign_atomic_lsn(atomic64_t *lsn, uint cycle, uint block)
594{
595 atomic64_set(lsn, xlog_assign_lsn(cycle, block));
596}
597
598/*
599 * When we crack the grant head, we sample it first so that the value will not
600 * change while we are cracking it into the component values. This means we
601 * will always get consistent component values to work from.
602 */
603static inline void
604xlog_crack_grant_head_val(int64_t val, int *cycle, int *space)
605{
606 *cycle = val >> 32;
607 *space = val & 0xffffffff;
608}
609
610static inline void
611xlog_crack_grant_head(atomic64_t *head, int *cycle, int *space)
612{
613 xlog_crack_grant_head_val(atomic64_read(head), cycle, space);
614}
615
616static inline int64_t
617xlog_assign_grant_head_val(int cycle, int space)
618{
619 return ((int64_t)cycle << 32) | space;
620}
621
622static inline void
623xlog_assign_grant_head(atomic64_t *head, int cycle, int space)
624{
625 atomic64_set(head, xlog_assign_grant_head_val(cycle, space));
626}
627
628/*
565 * Committed Item List interfaces 629 * Committed Item List interfaces
566 */ 630 */
567int xlog_cil_init(struct log *log); 631int xlog_cil_init(struct log *log);
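The helpers added above exist because a 64-bit (cycle, space) or (cycle, block) pair can be stored in a single atomic64_t and sampled consistently without a lock. A standalone round-trip check of that packing (plain C; the names are local to the example):

#include <assert.h>
#include <stdint.h>

/* Round-trip check of the cycle/space encoding used by the new
 * xlog_assign_grant_head_val()/xlog_crack_grant_head_val() helpers. */
static int64_t pack(int cycle, int space)
{
	return ((int64_t)cycle << 32) | space;
}

static void crack(int64_t val, int *cycle, int *space)
{
	*cycle = (int)(val >> 32);
	*space = (int)(val & 0xffffffff);
}

int main(void)
{
	int cycle, space;

	crack(pack(42, 123456), &cycle, &space);
	assert(cycle == 42 && space == 123456);
	return 0;
}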
@@ -585,6 +649,21 @@ xlog_cil_force(struct log *log)
585 */ 649 */
586#define XLOG_UNMOUNT_REC_TYPE (-1U) 650#define XLOG_UNMOUNT_REC_TYPE (-1U)
587 651
652/*
653 * Wrapper function for waiting on a wait queue serialised against wakeups
654 * by a spinlock. This matches the semantics of all the wait queues used in the
655 * log code.
656 */
657static inline void xlog_wait(wait_queue_head_t *wq, spinlock_t *lock)
658{
659 DECLARE_WAITQUEUE(wait, current);
660
661 add_wait_queue_exclusive(wq, &wait);
662 __set_current_state(TASK_UNINTERRUPTIBLE);
663 spin_unlock(lock);
664 schedule();
665 remove_wait_queue(wq, &wait);
666}
588#endif /* __KERNEL__ */ 667#endif /* __KERNEL__ */
589 668
590#endif /* __XFS_LOG_PRIV_H__ */ 669#endif /* __XFS_LOG_PRIV_H__ */
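The xlog_wait() helper added above encapsulates the sleep half of the old sv_wait(): it queues the task exclusively, drops the spinlock passed in, and schedules. Because the lock is gone when it returns, every caller re-takes it and re-tests its condition, which is why the surrounding log code uses goto restart/redo loops. A hedged kernel-style sketch of a caller follows; it is not standalone, and the condition helper is an assumption:

/*
 * Illustrative caller of xlog_wait(): the condition is tested under the
 * lock, xlog_wait() adds the task to the queue and drops that lock before
 * sleeping, and nothing is guaranteed on wakeup, so the caller re-takes
 * the lock and re-tests.
 */
spin_lock(&log->l_icloglock);
while (!xlog_iclogs_idle(log)) {	/* hypothetical condition helper */
	xlog_wait(&log->l_flush_wait, &log->l_icloglock);
	/* returned with l_icloglock dropped */
	spin_lock(&log->l_icloglock);
}
/* ... work under l_icloglock ... */
spin_unlock(&log->l_icloglock);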
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 966d3f97458c..aa0ebb776903 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -53,6 +53,17 @@ STATIC void xlog_recover_check_summary(xlog_t *);
53#endif 53#endif
54 54
55/* 55/*
56 * This structure is used during recovery to record the buf log items which
57 * have been canceled and should not be replayed.
58 */
59struct xfs_buf_cancel {
60 xfs_daddr_t bc_blkno;
61 uint bc_len;
62 int bc_refcount;
63 struct list_head bc_list;
64};
65
66/*
56 * Sector aligned buffer routines for buffer create/read/write/access 67 * Sector aligned buffer routines for buffer create/read/write/access
57 */ 68 */
58 69
@@ -925,12 +936,12 @@ xlog_find_tail(
925 log->l_curr_cycle = be32_to_cpu(rhead->h_cycle); 936 log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
926 if (found == 2) 937 if (found == 2)
927 log->l_curr_cycle++; 938 log->l_curr_cycle++;
928 log->l_tail_lsn = be64_to_cpu(rhead->h_tail_lsn); 939 atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn));
929 log->l_last_sync_lsn = be64_to_cpu(rhead->h_lsn); 940 atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn));
930 log->l_grant_reserve_cycle = log->l_curr_cycle; 941 xlog_assign_grant_head(&log->l_grant_reserve_head, log->l_curr_cycle,
931 log->l_grant_reserve_bytes = BBTOB(log->l_curr_block); 942 BBTOB(log->l_curr_block));
932 log->l_grant_write_cycle = log->l_curr_cycle; 943 xlog_assign_grant_head(&log->l_grant_write_head, log->l_curr_cycle,
933 log->l_grant_write_bytes = BBTOB(log->l_curr_block); 944 BBTOB(log->l_curr_block));
934 945
935 /* 946 /*
936 * Look for unmount record. If we find it, then we know there 947 * Look for unmount record. If we find it, then we know there
@@ -960,7 +971,7 @@ xlog_find_tail(
960 } 971 }
961 after_umount_blk = (i + hblks + (int) 972 after_umount_blk = (i + hblks + (int)
962 BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize; 973 BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize;
963 tail_lsn = log->l_tail_lsn; 974 tail_lsn = atomic64_read(&log->l_tail_lsn);
964 if (*head_blk == after_umount_blk && 975 if (*head_blk == after_umount_blk &&
965 be32_to_cpu(rhead->h_num_logops) == 1) { 976 be32_to_cpu(rhead->h_num_logops) == 1) {
966 umount_data_blk = (i + hblks) % log->l_logBBsize; 977 umount_data_blk = (i + hblks) % log->l_logBBsize;
@@ -975,12 +986,10 @@ xlog_find_tail(
975 * log records will point recovery to after the 986 * log records will point recovery to after the
976 * current unmount record. 987 * current unmount record.
977 */ 988 */
978 log->l_tail_lsn = 989 xlog_assign_atomic_lsn(&log->l_tail_lsn,
979 xlog_assign_lsn(log->l_curr_cycle, 990 log->l_curr_cycle, after_umount_blk);
980 after_umount_blk); 991 xlog_assign_atomic_lsn(&log->l_last_sync_lsn,
981 log->l_last_sync_lsn = 992 log->l_curr_cycle, after_umount_blk);
982 xlog_assign_lsn(log->l_curr_cycle,
983 after_umount_blk);
984 *tail_blk = after_umount_blk; 993 *tail_blk = after_umount_blk;
985 994
986 /* 995 /*
@@ -1605,82 +1614,45 @@ xlog_recover_reorder_trans(
1605 * record in the table to tell us how many times we expect to see this 1614 * record in the table to tell us how many times we expect to see this
1606 * record during the second pass. 1615 * record during the second pass.
1607 */ 1616 */
1608STATIC void 1617STATIC int
1609xlog_recover_do_buffer_pass1( 1618xlog_recover_buffer_pass1(
1610 xlog_t *log, 1619 struct log *log,
1611 xfs_buf_log_format_t *buf_f) 1620 xlog_recover_item_t *item)
1612{ 1621{
1613 xfs_buf_cancel_t *bcp; 1622 xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
1614 xfs_buf_cancel_t *nextp; 1623 struct list_head *bucket;
1615 xfs_buf_cancel_t *prevp; 1624 struct xfs_buf_cancel *bcp;
1616 xfs_buf_cancel_t **bucket;
1617 xfs_daddr_t blkno = 0;
1618 uint len = 0;
1619 ushort flags = 0;
1620
1621 switch (buf_f->blf_type) {
1622 case XFS_LI_BUF:
1623 blkno = buf_f->blf_blkno;
1624 len = buf_f->blf_len;
1625 flags = buf_f->blf_flags;
1626 break;
1627 }
1628 1625
1629 /* 1626 /*
1630 * If this isn't a cancel buffer item, then just return. 1627 * If this isn't a cancel buffer item, then just return.
1631 */ 1628 */
1632 if (!(flags & XFS_BLF_CANCEL)) { 1629 if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) {
1633 trace_xfs_log_recover_buf_not_cancel(log, buf_f); 1630 trace_xfs_log_recover_buf_not_cancel(log, buf_f);
1634 return; 1631 return 0;
1635 }
1636
1637 /*
1638 * Insert an xfs_buf_cancel record into the hash table of
1639 * them. If there is already an identical record, bump
1640 * its reference count.
1641 */
1642 bucket = &log->l_buf_cancel_table[(__uint64_t)blkno %
1643 XLOG_BC_TABLE_SIZE];
1644 /*
1645 * If the hash bucket is empty then just insert a new record into
1646 * the bucket.
1647 */
1648 if (*bucket == NULL) {
1649 bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t),
1650 KM_SLEEP);
1651 bcp->bc_blkno = blkno;
1652 bcp->bc_len = len;
1653 bcp->bc_refcount = 1;
1654 bcp->bc_next = NULL;
1655 *bucket = bcp;
1656 return;
1657 } 1632 }
1658 1633
1659 /* 1634 /*
1660 * The hash bucket is not empty, so search for duplicates of our 1635 * Insert an xfs_buf_cancel record into the hash table of them.
1661 * record. If we find one them just bump its refcount. If not 1636 * If there is already an identical record, bump its reference count.
1662 * then add us at the end of the list.
1663 */ 1637 */
1664 prevp = NULL; 1638 bucket = XLOG_BUF_CANCEL_BUCKET(log, buf_f->blf_blkno);
1665 nextp = *bucket; 1639 list_for_each_entry(bcp, bucket, bc_list) {
1666 while (nextp != NULL) { 1640 if (bcp->bc_blkno == buf_f->blf_blkno &&
1667 if (nextp->bc_blkno == blkno && nextp->bc_len == len) { 1641 bcp->bc_len == buf_f->blf_len) {
1668 nextp->bc_refcount++; 1642 bcp->bc_refcount++;
1669 trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f); 1643 trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f);
1670 return; 1644 return 0;
1671 } 1645 }
1672 prevp = nextp; 1646 }
1673 nextp = nextp->bc_next; 1647
1674 } 1648 bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), KM_SLEEP);
1675 ASSERT(prevp != NULL); 1649 bcp->bc_blkno = buf_f->blf_blkno;
1676 bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t), 1650 bcp->bc_len = buf_f->blf_len;
1677 KM_SLEEP);
1678 bcp->bc_blkno = blkno;
1679 bcp->bc_len = len;
1680 bcp->bc_refcount = 1; 1651 bcp->bc_refcount = 1;
1681 bcp->bc_next = NULL; 1652 list_add_tail(&bcp->bc_list, bucket);
1682 prevp->bc_next = bcp; 1653
1683 trace_xfs_log_recover_buf_cancel_add(log, buf_f); 1654 trace_xfs_log_recover_buf_cancel_add(log, buf_f);
1655 return 0;
1684} 1656}
1685 1657
1686/* 1658/*
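Pass 1 above drops the hand-rolled singly linked cancel chains in favour of list_head buckets indexed by XLOG_BUF_CANCEL_BUCKET(). The standalone model below mirrors that structure with a plain array of chains; the names and the simplified list are assumptions for illustration, not the kernel code:

#include <stdint.h>
#include <stdlib.h>

/*
 * Userspace model of the cancel table: an array of buckets indexed by
 * blkno % TABLE_SIZE, each holding a chain of cancel records. A matching
 * record gets its refcount bumped, otherwise a new record is added.
 */
#define TABLE_SIZE	64

struct buf_cancel {
	uint64_t		blkno;
	unsigned int		len;
	int			refcount;
	struct buf_cancel	*next;
};

static struct buf_cancel *cancel_table[TABLE_SIZE];

static void record_cancel(uint64_t blkno, unsigned int len)
{
	struct buf_cancel **bucket = &cancel_table[blkno % TABLE_SIZE];
	struct buf_cancel *bcp;

	for (bcp = *bucket; bcp; bcp = bcp->next) {
		if (bcp->blkno == blkno && bcp->len == len) {
			bcp->refcount++;	/* duplicate cancel record */
			return;
		}
	}

	bcp = malloc(sizeof(*bcp));
	bcp->blkno = blkno;
	bcp->len = len;
	bcp->refcount = 1;
	bcp->next = *bucket;			/* push onto the bucket chain */
	*bucket = bcp;
}

int main(void)
{
	record_cancel(8192, 16);
	record_cancel(8192, 16);		/* same buffer cancelled twice */
	return cancel_table[8192 % TABLE_SIZE]->refcount == 2 ? 0 : 1;
}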
@@ -1698,14 +1670,13 @@ xlog_recover_do_buffer_pass1(
1698 */ 1670 */
1699STATIC int 1671STATIC int
1700xlog_check_buffer_cancelled( 1672xlog_check_buffer_cancelled(
1701 xlog_t *log, 1673 struct log *log,
1702 xfs_daddr_t blkno, 1674 xfs_daddr_t blkno,
1703 uint len, 1675 uint len,
1704 ushort flags) 1676 ushort flags)
1705{ 1677{
1706 xfs_buf_cancel_t *bcp; 1678 struct list_head *bucket;
1707 xfs_buf_cancel_t *prevp; 1679 struct xfs_buf_cancel *bcp;
1708 xfs_buf_cancel_t **bucket;
1709 1680
1710 if (log->l_buf_cancel_table == NULL) { 1681 if (log->l_buf_cancel_table == NULL) {
1711 /* 1682 /*
@@ -1716,128 +1687,70 @@ xlog_check_buffer_cancelled(
1716 return 0; 1687 return 0;
1717 } 1688 }
1718 1689
1719 bucket = &log->l_buf_cancel_table[(__uint64_t)blkno %
1720 XLOG_BC_TABLE_SIZE];
1721 bcp = *bucket;
1722 if (bcp == NULL) {
1723 /*
1724 * There is no corresponding entry in the table built
1725 * in pass one, so this buffer has not been cancelled.
1726 */
1727 ASSERT(!(flags & XFS_BLF_CANCEL));
1728 return 0;
1729 }
1730
1731 /* 1690 /*
1732 * Search for an entry in the buffer cancel table that 1691 * Search for an entry in the cancel table that matches our buffer.
1733 * matches our buffer.
1734 */ 1692 */
1735 prevp = NULL; 1693 bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno);
1736 while (bcp != NULL) { 1694 list_for_each_entry(bcp, bucket, bc_list) {
1737 if (bcp->bc_blkno == blkno && bcp->bc_len == len) { 1695 if (bcp->bc_blkno == blkno && bcp->bc_len == len)
1738 /* 1696 goto found;
1739 * We've go a match, so return 1 so that the
1740 * recovery of this buffer is cancelled.
1741 * If this buffer is actually a buffer cancel
1742 * log item, then decrement the refcount on the
1743 * one in the table and remove it if this is the
1744 * last reference.
1745 */
1746 if (flags & XFS_BLF_CANCEL) {
1747 bcp->bc_refcount--;
1748 if (bcp->bc_refcount == 0) {
1749 if (prevp == NULL) {
1750 *bucket = bcp->bc_next;
1751 } else {
1752 prevp->bc_next = bcp->bc_next;
1753 }
1754 kmem_free(bcp);
1755 }
1756 }
1757 return 1;
1758 }
1759 prevp = bcp;
1760 bcp = bcp->bc_next;
1761 } 1697 }
1698
1762 /* 1699 /*
1763 * We didn't find a corresponding entry in the table, so 1700 * We didn't find a corresponding entry in the table, so return 0 so
1764 * return 0 so that the buffer is NOT cancelled. 1701 * that the buffer is NOT cancelled.
1765 */ 1702 */
1766 ASSERT(!(flags & XFS_BLF_CANCEL)); 1703 ASSERT(!(flags & XFS_BLF_CANCEL));
1767 return 0; 1704 return 0;
1768}
1769 1705
1770STATIC int 1706found:
1771xlog_recover_do_buffer_pass2( 1707 /*
1772 xlog_t *log, 1708 * We've got a match, so return 1 so that the recovery of this buffer
1773 xfs_buf_log_format_t *buf_f) 1709 * is cancelled. If this buffer is actually a buffer cancel log
1774{ 1710 * item, then decrement the refcount on the one in the table and
1775 xfs_daddr_t blkno = 0; 1711 * remove it if this is the last reference.
1776 ushort flags = 0; 1712 */
1777 uint len = 0; 1713 if (flags & XFS_BLF_CANCEL) {
1778 1714 if (--bcp->bc_refcount == 0) {
1779 switch (buf_f->blf_type) { 1715 list_del(&bcp->bc_list);
1780 case XFS_LI_BUF: 1716 kmem_free(bcp);
1781 blkno = buf_f->blf_blkno; 1717 }
1782 flags = buf_f->blf_flags;
1783 len = buf_f->blf_len;
1784 break;
1785 } 1718 }
1786 1719 return 1;
1787 return xlog_check_buffer_cancelled(log, blkno, len, flags);
1788} 1720}
1789 1721
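
The hunks above replace the open-coded cancel chains with a hash table of list heads keyed by block number: pass 1 either bumps the refcount on an existing entry or links a new one into its bucket, and pass 2 looks the entry up and drops a reference when the cancel item itself comes around again. Below is a small, self-contained C sketch of that add-or-increment / lookup-and-release pattern; the table size, types and helper names are illustrative stand-ins, not the kernel's, and a plain pointer chain stands in for list_head.

#include <stdint.h>
#include <stdlib.h>

#define CANCEL_TABLE_SIZE 64	/* illustrative; XFS uses XLOG_BC_TABLE_SIZE */

struct buf_cancel {
	uint64_t		blkno;
	uint32_t		len;
	int			refcount;
	struct buf_cancel	*next;	/* simple chain instead of list_head */
};

static struct buf_cancel *cancel_table[CANCEL_TABLE_SIZE];

static struct buf_cancel **cancel_bucket(uint64_t blkno)
{
	return &cancel_table[blkno % CANCEL_TABLE_SIZE];
}

/* pass 1: remember a cancelled buffer, or take another reference on it */
static int cancel_add(uint64_t blkno, uint32_t len)
{
	struct buf_cancel *bcp;

	for (bcp = *cancel_bucket(blkno); bcp; bcp = bcp->next) {
		if (bcp->blkno == blkno && bcp->len == len) {
			bcp->refcount++;
			return 0;
		}
	}

	bcp = malloc(sizeof(*bcp));
	if (!bcp)
		return -1;
	bcp->blkno = blkno;
	bcp->len = len;
	bcp->refcount = 1;
	bcp->next = *cancel_bucket(blkno);
	*cancel_bucket(blkno) = bcp;
	return 0;
}

/* pass 2: is this buffer cancelled?  Drop a reference if @is_cancel_item. */
static int cancel_check(uint64_t blkno, uint32_t len, int is_cancel_item)
{
	struct buf_cancel **pp, *bcp;

	for (pp = cancel_bucket(blkno); (bcp = *pp); pp = &bcp->next) {
		if (bcp->blkno != blkno || bcp->len != len)
			continue;
		if (is_cancel_item && --bcp->refcount == 0) {
			*pp = bcp->next;	/* last reference: unlink and free */
			free(bcp);
		}
		return 1;
	}
	return 0;
}

int main(void)
{
	cancel_add(8192, 16);			/* pass 1 saw a cancel item */
	return !cancel_check(8192, 16, 1);	/* pass 2: cancelled, ref dropped */
}
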
1790/* 1722/*
1791 * Perform recovery for a buffer full of inodes. In these buffers, 1723 * Perform recovery for a buffer full of inodes. In these buffers, the only
1792 * the only data which should be recovered is that which corresponds 1724 * data which should be recovered is that which corresponds to the
1793 * to the di_next_unlinked pointers in the on disk inode structures. 1725 * di_next_unlinked pointers in the on disk inode structures. The rest of the
1794 * The rest of the data for the inodes is always logged through the 1726 * data for the inodes is always logged through the inodes themselves rather
1795 * inodes themselves rather than the inode buffer and is recovered 1727 * than the inode buffer and is recovered in xlog_recover_inode_pass2().
1796 * in xlog_recover_do_inode_trans().
1797 * 1728 *
1798 * The only time when buffers full of inodes are fully recovered is 1729 * The only time when buffers full of inodes are fully recovered is when the
1799 * when the buffer is full of newly allocated inodes. In this case 1730 * buffer is full of newly allocated inodes. In this case the buffer will
1800 * the buffer will not be marked as an inode buffer and so will be 1731 * not be marked as an inode buffer and so will be sent to
1801 * sent to xlog_recover_do_reg_buffer() below during recovery. 1732 * xlog_recover_do_reg_buffer() below during recovery.
1802 */ 1733 */
1803STATIC int 1734STATIC int
1804xlog_recover_do_inode_buffer( 1735xlog_recover_do_inode_buffer(
1805 xfs_mount_t *mp, 1736 struct xfs_mount *mp,
1806 xlog_recover_item_t *item, 1737 xlog_recover_item_t *item,
1807 xfs_buf_t *bp, 1738 struct xfs_buf *bp,
1808 xfs_buf_log_format_t *buf_f) 1739 xfs_buf_log_format_t *buf_f)
1809{ 1740{
1810 int i; 1741 int i;
1811 int item_index; 1742 int item_index = 0;
1812 int bit; 1743 int bit = 0;
1813 int nbits; 1744 int nbits = 0;
1814 int reg_buf_offset; 1745 int reg_buf_offset = 0;
1815 int reg_buf_bytes; 1746 int reg_buf_bytes = 0;
1816 int next_unlinked_offset; 1747 int next_unlinked_offset;
1817 int inodes_per_buf; 1748 int inodes_per_buf;
1818 xfs_agino_t *logged_nextp; 1749 xfs_agino_t *logged_nextp;
1819 xfs_agino_t *buffer_nextp; 1750 xfs_agino_t *buffer_nextp;
1820 unsigned int *data_map = NULL;
1821 unsigned int map_size = 0;
1822 1751
1823 trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f); 1752 trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f);
1824 1753
1825 switch (buf_f->blf_type) {
1826 case XFS_LI_BUF:
1827 data_map = buf_f->blf_data_map;
1828 map_size = buf_f->blf_map_size;
1829 break;
1830 }
1831 /*
1832 * Set the variables corresponding to the current region to
1833 * 0 so that we'll initialize them on the first pass through
1834 * the loop.
1835 */
1836 reg_buf_offset = 0;
1837 reg_buf_bytes = 0;
1838 bit = 0;
1839 nbits = 0;
1840 item_index = 0;
1841 inodes_per_buf = XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog; 1754 inodes_per_buf = XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog;
1842 for (i = 0; i < inodes_per_buf; i++) { 1755 for (i = 0; i < inodes_per_buf; i++) {
1843 next_unlinked_offset = (i * mp->m_sb.sb_inodesize) + 1756 next_unlinked_offset = (i * mp->m_sb.sb_inodesize) +
@@ -1852,18 +1765,18 @@ xlog_recover_do_inode_buffer(
1852 * the current di_next_unlinked field. 1765 * the current di_next_unlinked field.
1853 */ 1766 */
1854 bit += nbits; 1767 bit += nbits;
1855 bit = xfs_next_bit(data_map, map_size, bit); 1768 bit = xfs_next_bit(buf_f->blf_data_map,
1769 buf_f->blf_map_size, bit);
1856 1770
1857 /* 1771 /*
1858 * If there are no more logged regions in the 1772 * If there are no more logged regions in the
1859 * buffer, then we're done. 1773 * buffer, then we're done.
1860 */ 1774 */
1861 if (bit == -1) { 1775 if (bit == -1)
1862 return 0; 1776 return 0;
1863 }
1864 1777
1865 nbits = xfs_contig_bits(data_map, map_size, 1778 nbits = xfs_contig_bits(buf_f->blf_data_map,
1866 bit); 1779 buf_f->blf_map_size, bit);
1867 ASSERT(nbits > 0); 1780 ASSERT(nbits > 0);
1868 reg_buf_offset = bit << XFS_BLF_SHIFT; 1781 reg_buf_offset = bit << XFS_BLF_SHIFT;
1869 reg_buf_bytes = nbits << XFS_BLF_SHIFT; 1782 reg_buf_bytes = nbits << XFS_BLF_SHIFT;
@@ -1875,9 +1788,8 @@ xlog_recover_do_inode_buffer(
1875 * di_next_unlinked field, then move on to the next 1788 * di_next_unlinked field, then move on to the next
1876 * di_next_unlinked field. 1789 * di_next_unlinked field.
1877 */ 1790 */
1878 if (next_unlinked_offset < reg_buf_offset) { 1791 if (next_unlinked_offset < reg_buf_offset)
1879 continue; 1792 continue;
1880 }
1881 1793
1882 ASSERT(item->ri_buf[item_index].i_addr != NULL); 1794 ASSERT(item->ri_buf[item_index].i_addr != NULL);
1883 ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0); 1795 ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0);
@@ -1913,36 +1825,29 @@ xlog_recover_do_inode_buffer(
1913 * given buffer. The bitmap in the buf log format structure indicates 1825 * given buffer. The bitmap in the buf log format structure indicates
1914 * where to place the logged data. 1826 * where to place the logged data.
1915 */ 1827 */
1916/*ARGSUSED*/
1917STATIC void 1828STATIC void
1918xlog_recover_do_reg_buffer( 1829xlog_recover_do_reg_buffer(
1919 struct xfs_mount *mp, 1830 struct xfs_mount *mp,
1920 xlog_recover_item_t *item, 1831 xlog_recover_item_t *item,
1921 xfs_buf_t *bp, 1832 struct xfs_buf *bp,
1922 xfs_buf_log_format_t *buf_f) 1833 xfs_buf_log_format_t *buf_f)
1923{ 1834{
1924 int i; 1835 int i;
1925 int bit; 1836 int bit;
1926 int nbits; 1837 int nbits;
1927 unsigned int *data_map = NULL;
1928 unsigned int map_size = 0;
1929 int error; 1838 int error;
1930 1839
1931 trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f); 1840 trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f);
1932 1841
1933 switch (buf_f->blf_type) {
1934 case XFS_LI_BUF:
1935 data_map = buf_f->blf_data_map;
1936 map_size = buf_f->blf_map_size;
1937 break;
1938 }
1939 bit = 0; 1842 bit = 0;
1940 i = 1; /* 0 is the buf format structure */ 1843 i = 1; /* 0 is the buf format structure */
1941 while (1) { 1844 while (1) {
1942 bit = xfs_next_bit(data_map, map_size, bit); 1845 bit = xfs_next_bit(buf_f->blf_data_map,
1846 buf_f->blf_map_size, bit);
1943 if (bit == -1) 1847 if (bit == -1)
1944 break; 1848 break;
1945 nbits = xfs_contig_bits(data_map, map_size, bit); 1849 nbits = xfs_contig_bits(buf_f->blf_data_map,
1850 buf_f->blf_map_size, bit);
1946 ASSERT(nbits > 0); 1851 ASSERT(nbits > 0);
1947 ASSERT(item->ri_buf[i].i_addr != NULL); 1852 ASSERT(item->ri_buf[i].i_addr != NULL);
1948 ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0); 1853 ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0);
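
Both xlog_recover_do_inode_buffer() and xlog_recover_do_reg_buffer() now read the logged-region bitmap straight from buf_f->blf_data_map, walking it with xfs_next_bit()/xfs_contig_bits() and turning bit runs into byte ranges via the XFS_BLF_SHIFT chunk size. The sketch below reproduces that walk with local stand-in helpers and a 128-byte chunk; it illustrates the technique only and is not the kernel implementation.

#include <stdint.h>
#include <stdio.h>

#define CHUNK_SHIFT	7	/* 128-byte regions, like XFS_BLF_SHIFT */

/* next set bit at or after @start, or -1 (stand-in for xfs_next_bit) */
static int next_bit(const uint32_t *map, int nbits, int start)
{
	for (int i = start; i < nbits; i++)
		if (map[i / 32] & (1u << (i % 32)))
			return i;
	return -1;
}

/* length of the set-bit run starting at @start (stand-in for xfs_contig_bits) */
static int contig_bits(const uint32_t *map, int nbits, int start)
{
	int n = 0;

	while (start + n < nbits &&
	       (map[(start + n) / 32] & (1u << ((start + n) % 32))))
		n++;
	return n;
}

int main(void)
{
	uint32_t map[2] = { 0x0000001cu, 0 };	/* bits 2, 3 and 4 logged */
	int nbits = 64;
	int bit = 0;

	while ((bit = next_bit(map, nbits, bit)) != -1) {
		int run    = contig_bits(map, nbits, bit);
		int offset = bit << CHUNK_SHIFT;
		int bytes  = run << CHUNK_SHIFT;

		printf("copy %d bytes at buffer offset %d\n", bytes, offset);
		bit += run;
	}
	return 0;
}
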
@@ -2176,77 +2081,46 @@ xlog_recover_do_dquot_buffer(
2176 * for more details on the implementation of the table of cancel records. 2081 * for more details on the implementation of the table of cancel records.
2177 */ 2082 */
2178STATIC int 2083STATIC int
2179xlog_recover_do_buffer_trans( 2084xlog_recover_buffer_pass2(
2180 xlog_t *log, 2085 xlog_t *log,
2181 xlog_recover_item_t *item, 2086 xlog_recover_item_t *item)
2182 int pass)
2183{ 2087{
2184 xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; 2088 xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
2185 xfs_mount_t *mp; 2089 xfs_mount_t *mp = log->l_mp;
2186 xfs_buf_t *bp; 2090 xfs_buf_t *bp;
2187 int error; 2091 int error;
2188 int cancel;
2189 xfs_daddr_t blkno;
2190 int len;
2191 ushort flags;
2192 uint buf_flags; 2092 uint buf_flags;
2193 2093
2194 if (pass == XLOG_RECOVER_PASS1) { 2094 /*
2195 /* 2095 * In this pass we only want to recover all the buffers which have
2196 * In this pass we're only looking for buf items 2096 * not been cancelled and are not cancellation buffers themselves.
2197 * with the XFS_BLF_CANCEL bit set. 2097 */
2198 */ 2098 if (xlog_check_buffer_cancelled(log, buf_f->blf_blkno,
2199 xlog_recover_do_buffer_pass1(log, buf_f); 2099 buf_f->blf_len, buf_f->blf_flags)) {
2100 trace_xfs_log_recover_buf_cancel(log, buf_f);
2200 return 0; 2101 return 0;
2201 } else {
2202 /*
2203 * In this pass we want to recover all the buffers
2204 * which have not been cancelled and are not
2205 * cancellation buffers themselves. The routine
2206 * we call here will tell us whether or not to
2207 * continue with the replay of this buffer.
2208 */
2209 cancel = xlog_recover_do_buffer_pass2(log, buf_f);
2210 if (cancel) {
2211 trace_xfs_log_recover_buf_cancel(log, buf_f);
2212 return 0;
2213 }
2214 } 2102 }
2103
2215 trace_xfs_log_recover_buf_recover(log, buf_f); 2104 trace_xfs_log_recover_buf_recover(log, buf_f);
2216 switch (buf_f->blf_type) {
2217 case XFS_LI_BUF:
2218 blkno = buf_f->blf_blkno;
2219 len = buf_f->blf_len;
2220 flags = buf_f->blf_flags;
2221 break;
2222 default:
2223 xfs_fs_cmn_err(CE_ALERT, log->l_mp,
2224 "xfs_log_recover: unknown buffer type 0x%x, logdev %s",
2225 buf_f->blf_type, log->l_mp->m_logname ?
2226 log->l_mp->m_logname : "internal");
2227 XFS_ERROR_REPORT("xlog_recover_do_buffer_trans",
2228 XFS_ERRLEVEL_LOW, log->l_mp);
2229 return XFS_ERROR(EFSCORRUPTED);
2230 }
2231 2105
2232 mp = log->l_mp;
2233 buf_flags = XBF_LOCK; 2106 buf_flags = XBF_LOCK;
2234 if (!(flags & XFS_BLF_INODE_BUF)) 2107 if (!(buf_f->blf_flags & XFS_BLF_INODE_BUF))
2235 buf_flags |= XBF_MAPPED; 2108 buf_flags |= XBF_MAPPED;
2236 2109
2237 bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, buf_flags); 2110 bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len,
2111 buf_flags);
2238 if (XFS_BUF_ISERROR(bp)) { 2112 if (XFS_BUF_ISERROR(bp)) {
2239 xfs_ioerror_alert("xlog_recover_do..(read#1)", log->l_mp, 2113 xfs_ioerror_alert("xlog_recover_do..(read#1)", mp,
2240 bp, blkno); 2114 bp, buf_f->blf_blkno);
2241 error = XFS_BUF_GETERROR(bp); 2115 error = XFS_BUF_GETERROR(bp);
2242 xfs_buf_relse(bp); 2116 xfs_buf_relse(bp);
2243 return error; 2117 return error;
2244 } 2118 }
2245 2119
2246 error = 0; 2120 error = 0;
2247 if (flags & XFS_BLF_INODE_BUF) { 2121 if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
2248 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); 2122 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
2249 } else if (flags & 2123 } else if (buf_f->blf_flags &
2250 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { 2124 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
2251 xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); 2125 xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
2252 } else { 2126 } else {
@@ -2286,16 +2160,14 @@ xlog_recover_do_buffer_trans(
2286} 2160}
2287 2161
2288STATIC int 2162STATIC int
2289xlog_recover_do_inode_trans( 2163xlog_recover_inode_pass2(
2290 xlog_t *log, 2164 xlog_t *log,
2291 xlog_recover_item_t *item, 2165 xlog_recover_item_t *item)
2292 int pass)
2293{ 2166{
2294 xfs_inode_log_format_t *in_f; 2167 xfs_inode_log_format_t *in_f;
2295 xfs_mount_t *mp; 2168 xfs_mount_t *mp = log->l_mp;
2296 xfs_buf_t *bp; 2169 xfs_buf_t *bp;
2297 xfs_dinode_t *dip; 2170 xfs_dinode_t *dip;
2298 xfs_ino_t ino;
2299 int len; 2171 int len;
2300 xfs_caddr_t src; 2172 xfs_caddr_t src;
2301 xfs_caddr_t dest; 2173 xfs_caddr_t dest;
@@ -2305,10 +2177,6 @@ xlog_recover_do_inode_trans(
2305 xfs_icdinode_t *dicp; 2177 xfs_icdinode_t *dicp;
2306 int need_free = 0; 2178 int need_free = 0;
2307 2179
2308 if (pass == XLOG_RECOVER_PASS1) {
2309 return 0;
2310 }
2311
2312 if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) { 2180 if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) {
2313 in_f = item->ri_buf[0].i_addr; 2181 in_f = item->ri_buf[0].i_addr;
2314 } else { 2182 } else {
@@ -2318,8 +2186,6 @@ xlog_recover_do_inode_trans(
2318 if (error) 2186 if (error)
2319 goto error; 2187 goto error;
2320 } 2188 }
2321 ino = in_f->ilf_ino;
2322 mp = log->l_mp;
2323 2189
2324 /* 2190 /*
2325 * Inode buffers can be freed, look out for it, 2191 * Inode buffers can be freed, look out for it,
@@ -2354,8 +2220,8 @@ xlog_recover_do_inode_trans(
2354 xfs_buf_relse(bp); 2220 xfs_buf_relse(bp);
2355 xfs_fs_cmn_err(CE_ALERT, mp, 2221 xfs_fs_cmn_err(CE_ALERT, mp,
2356 "xfs_inode_recover: Bad inode magic number, dino ptr = 0x%p, dino bp = 0x%p, ino = %Ld", 2222 "xfs_inode_recover: Bad inode magic number, dino ptr = 0x%p, dino bp = 0x%p, ino = %Ld",
2357 dip, bp, ino); 2223 dip, bp, in_f->ilf_ino);
2358 XFS_ERROR_REPORT("xlog_recover_do_inode_trans(1)", 2224 XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)",
2359 XFS_ERRLEVEL_LOW, mp); 2225 XFS_ERRLEVEL_LOW, mp);
2360 error = EFSCORRUPTED; 2226 error = EFSCORRUPTED;
2361 goto error; 2227 goto error;
@@ -2365,8 +2231,8 @@ xlog_recover_do_inode_trans(
2365 xfs_buf_relse(bp); 2231 xfs_buf_relse(bp);
2366 xfs_fs_cmn_err(CE_ALERT, mp, 2232 xfs_fs_cmn_err(CE_ALERT, mp,
2367 "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, ino %Ld", 2233 "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, ino %Ld",
2368 item, ino); 2234 item, in_f->ilf_ino);
2369 XFS_ERROR_REPORT("xlog_recover_do_inode_trans(2)", 2235 XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)",
2370 XFS_ERRLEVEL_LOW, mp); 2236 XFS_ERRLEVEL_LOW, mp);
2371 error = EFSCORRUPTED; 2237 error = EFSCORRUPTED;
2372 goto error; 2238 goto error;
@@ -2394,12 +2260,12 @@ xlog_recover_do_inode_trans(
2394 if (unlikely((dicp->di_mode & S_IFMT) == S_IFREG)) { 2260 if (unlikely((dicp->di_mode & S_IFMT) == S_IFREG)) {
2395 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && 2261 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
2396 (dicp->di_format != XFS_DINODE_FMT_BTREE)) { 2262 (dicp->di_format != XFS_DINODE_FMT_BTREE)) {
2397 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(3)", 2263 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)",
2398 XFS_ERRLEVEL_LOW, mp, dicp); 2264 XFS_ERRLEVEL_LOW, mp, dicp);
2399 xfs_buf_relse(bp); 2265 xfs_buf_relse(bp);
2400 xfs_fs_cmn_err(CE_ALERT, mp, 2266 xfs_fs_cmn_err(CE_ALERT, mp,
2401 "xfs_inode_recover: Bad regular inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", 2267 "xfs_inode_recover: Bad regular inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2402 item, dip, bp, ino); 2268 item, dip, bp, in_f->ilf_ino);
2403 error = EFSCORRUPTED; 2269 error = EFSCORRUPTED;
2404 goto error; 2270 goto error;
2405 } 2271 }
@@ -2407,40 +2273,40 @@ xlog_recover_do_inode_trans(
2407 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && 2273 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
2408 (dicp->di_format != XFS_DINODE_FMT_BTREE) && 2274 (dicp->di_format != XFS_DINODE_FMT_BTREE) &&
2409 (dicp->di_format != XFS_DINODE_FMT_LOCAL)) { 2275 (dicp->di_format != XFS_DINODE_FMT_LOCAL)) {
2410 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(4)", 2276 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)",
2411 XFS_ERRLEVEL_LOW, mp, dicp); 2277 XFS_ERRLEVEL_LOW, mp, dicp);
2412 xfs_buf_relse(bp); 2278 xfs_buf_relse(bp);
2413 xfs_fs_cmn_err(CE_ALERT, mp, 2279 xfs_fs_cmn_err(CE_ALERT, mp,
2414 "xfs_inode_recover: Bad dir inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", 2280 "xfs_inode_recover: Bad dir inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2415 item, dip, bp, ino); 2281 item, dip, bp, in_f->ilf_ino);
2416 error = EFSCORRUPTED; 2282 error = EFSCORRUPTED;
2417 goto error; 2283 goto error;
2418 } 2284 }
2419 } 2285 }
2420 if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){ 2286 if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){
2421 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(5)", 2287 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)",
2422 XFS_ERRLEVEL_LOW, mp, dicp); 2288 XFS_ERRLEVEL_LOW, mp, dicp);
2423 xfs_buf_relse(bp); 2289 xfs_buf_relse(bp);
2424 xfs_fs_cmn_err(CE_ALERT, mp, 2290 xfs_fs_cmn_err(CE_ALERT, mp,
2425 "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld", 2291 "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld",
2426 item, dip, bp, ino, 2292 item, dip, bp, in_f->ilf_ino,
2427 dicp->di_nextents + dicp->di_anextents, 2293 dicp->di_nextents + dicp->di_anextents,
2428 dicp->di_nblocks); 2294 dicp->di_nblocks);
2429 error = EFSCORRUPTED; 2295 error = EFSCORRUPTED;
2430 goto error; 2296 goto error;
2431 } 2297 }
2432 if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) { 2298 if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) {
2433 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(6)", 2299 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)",
2434 XFS_ERRLEVEL_LOW, mp, dicp); 2300 XFS_ERRLEVEL_LOW, mp, dicp);
2435 xfs_buf_relse(bp); 2301 xfs_buf_relse(bp);
2436 xfs_fs_cmn_err(CE_ALERT, mp, 2302 xfs_fs_cmn_err(CE_ALERT, mp,
2437 "xfs_inode_recover: Bad inode log rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, forkoff 0x%x", 2303 "xfs_inode_recover: Bad inode log rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, forkoff 0x%x",
2438 item, dip, bp, ino, dicp->di_forkoff); 2304 item, dip, bp, in_f->ilf_ino, dicp->di_forkoff);
2439 error = EFSCORRUPTED; 2305 error = EFSCORRUPTED;
2440 goto error; 2306 goto error;
2441 } 2307 }
2442 if (unlikely(item->ri_buf[1].i_len > sizeof(struct xfs_icdinode))) { 2308 if (unlikely(item->ri_buf[1].i_len > sizeof(struct xfs_icdinode))) {
2443 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(7)", 2309 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)",
2444 XFS_ERRLEVEL_LOW, mp, dicp); 2310 XFS_ERRLEVEL_LOW, mp, dicp);
2445 xfs_buf_relse(bp); 2311 xfs_buf_relse(bp);
2446 xfs_fs_cmn_err(CE_ALERT, mp, 2312 xfs_fs_cmn_err(CE_ALERT, mp,
@@ -2532,7 +2398,7 @@ xlog_recover_do_inode_trans(
2532 break; 2398 break;
2533 2399
2534 default: 2400 default:
2535 xlog_warn("XFS: xlog_recover_do_inode_trans: Invalid flag"); 2401 xlog_warn("XFS: xlog_recover_inode_pass2: Invalid flag");
2536 ASSERT(0); 2402 ASSERT(0);
2537 xfs_buf_relse(bp); 2403 xfs_buf_relse(bp);
2538 error = EIO; 2404 error = EIO;
@@ -2556,18 +2422,11 @@ error:
2556 * of that type. 2422 * of that type.
2557 */ 2423 */
2558STATIC int 2424STATIC int
2559xlog_recover_do_quotaoff_trans( 2425xlog_recover_quotaoff_pass1(
2560 xlog_t *log, 2426 xlog_t *log,
2561 xlog_recover_item_t *item, 2427 xlog_recover_item_t *item)
2562 int pass)
2563{ 2428{
2564 xfs_qoff_logformat_t *qoff_f; 2429 xfs_qoff_logformat_t *qoff_f = item->ri_buf[0].i_addr;
2565
2566 if (pass == XLOG_RECOVER_PASS2) {
2567 return (0);
2568 }
2569
2570 qoff_f = item->ri_buf[0].i_addr;
2571 ASSERT(qoff_f); 2430 ASSERT(qoff_f);
2572 2431
2573 /* 2432 /*
@@ -2588,22 +2447,17 @@ xlog_recover_do_quotaoff_trans(
2588 * Recover a dquot record 2447 * Recover a dquot record
2589 */ 2448 */
2590STATIC int 2449STATIC int
2591xlog_recover_do_dquot_trans( 2450xlog_recover_dquot_pass2(
2592 xlog_t *log, 2451 xlog_t *log,
2593 xlog_recover_item_t *item, 2452 xlog_recover_item_t *item)
2594 int pass)
2595{ 2453{
2596 xfs_mount_t *mp; 2454 xfs_mount_t *mp = log->l_mp;
2597 xfs_buf_t *bp; 2455 xfs_buf_t *bp;
2598 struct xfs_disk_dquot *ddq, *recddq; 2456 struct xfs_disk_dquot *ddq, *recddq;
2599 int error; 2457 int error;
2600 xfs_dq_logformat_t *dq_f; 2458 xfs_dq_logformat_t *dq_f;
2601 uint type; 2459 uint type;
2602 2460
2603 if (pass == XLOG_RECOVER_PASS1) {
2604 return 0;
2605 }
2606 mp = log->l_mp;
2607 2461
2608 /* 2462 /*
2609 * Filesystems are required to send in quota flags at mount time. 2463 * Filesystems are required to send in quota flags at mount time.
@@ -2647,7 +2501,7 @@ xlog_recover_do_dquot_trans(
2647 if ((error = xfs_qm_dqcheck(recddq, 2501 if ((error = xfs_qm_dqcheck(recddq,
2648 dq_f->qlf_id, 2502 dq_f->qlf_id,
2649 0, XFS_QMOPT_DOWARN, 2503 0, XFS_QMOPT_DOWARN,
2650 "xlog_recover_do_dquot_trans (log copy)"))) { 2504 "xlog_recover_dquot_pass2 (log copy)"))) {
2651 return XFS_ERROR(EIO); 2505 return XFS_ERROR(EIO);
2652 } 2506 }
2653 ASSERT(dq_f->qlf_len == 1); 2507 ASSERT(dq_f->qlf_len == 1);
@@ -2670,7 +2524,7 @@ xlog_recover_do_dquot_trans(
2670 * minimal initialization then. 2524 * minimal initialization then.
2671 */ 2525 */
2672 if (xfs_qm_dqcheck(ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN, 2526 if (xfs_qm_dqcheck(ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
2673 "xlog_recover_do_dquot_trans")) { 2527 "xlog_recover_dquot_pass2")) {
2674 xfs_buf_relse(bp); 2528 xfs_buf_relse(bp);
2675 return XFS_ERROR(EIO); 2529 return XFS_ERROR(EIO);
2676 } 2530 }
@@ -2693,38 +2547,31 @@ xlog_recover_do_dquot_trans(
2693 * LSN. 2547 * LSN.
2694 */ 2548 */
2695STATIC int 2549STATIC int
2696xlog_recover_do_efi_trans( 2550xlog_recover_efi_pass2(
2697 xlog_t *log, 2551 xlog_t *log,
2698 xlog_recover_item_t *item, 2552 xlog_recover_item_t *item,
2699 xfs_lsn_t lsn, 2553 xfs_lsn_t lsn)
2700 int pass)
2701{ 2554{
2702 int error; 2555 int error;
2703 xfs_mount_t *mp; 2556 xfs_mount_t *mp = log->l_mp;
2704 xfs_efi_log_item_t *efip; 2557 xfs_efi_log_item_t *efip;
2705 xfs_efi_log_format_t *efi_formatp; 2558 xfs_efi_log_format_t *efi_formatp;
2706 2559
2707 if (pass == XLOG_RECOVER_PASS1) {
2708 return 0;
2709 }
2710
2711 efi_formatp = item->ri_buf[0].i_addr; 2560 efi_formatp = item->ri_buf[0].i_addr;
2712 2561
2713 mp = log->l_mp;
2714 efip = xfs_efi_init(mp, efi_formatp->efi_nextents); 2562 efip = xfs_efi_init(mp, efi_formatp->efi_nextents);
2715 if ((error = xfs_efi_copy_format(&(item->ri_buf[0]), 2563 if ((error = xfs_efi_copy_format(&(item->ri_buf[0]),
2716 &(efip->efi_format)))) { 2564 &(efip->efi_format)))) {
2717 xfs_efi_item_free(efip); 2565 xfs_efi_item_free(efip);
2718 return error; 2566 return error;
2719 } 2567 }
2720 efip->efi_next_extent = efi_formatp->efi_nextents; 2568 atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents);
2721 efip->efi_flags |= XFS_EFI_COMMITTED;
2722 2569
2723 spin_lock(&log->l_ailp->xa_lock); 2570 spin_lock(&log->l_ailp->xa_lock);
2724 /* 2571 /*
2725 * xfs_trans_ail_update() drops the AIL lock. 2572 * xfs_trans_ail_update() drops the AIL lock.
2726 */ 2573 */
2727 xfs_trans_ail_update(log->l_ailp, (xfs_log_item_t *)efip, lsn); 2574 xfs_trans_ail_update(log->l_ailp, &efip->efi_item, lsn);
2728 return 0; 2575 return 0;
2729} 2576}
2730 2577
@@ -2737,11 +2584,10 @@ xlog_recover_do_efi_trans(
2737 * efd format structure. If we find it, we remove the efi from the 2584 * efd format structure. If we find it, we remove the efi from the
2738 * AIL and free it. 2585 * AIL and free it.
2739 */ 2586 */
2740STATIC void 2587STATIC int
2741xlog_recover_do_efd_trans( 2588xlog_recover_efd_pass2(
2742 xlog_t *log, 2589 xlog_t *log,
2743 xlog_recover_item_t *item, 2590 xlog_recover_item_t *item)
2744 int pass)
2745{ 2591{
2746 xfs_efd_log_format_t *efd_formatp; 2592 xfs_efd_log_format_t *efd_formatp;
2747 xfs_efi_log_item_t *efip = NULL; 2593 xfs_efi_log_item_t *efip = NULL;
@@ -2750,10 +2596,6 @@ xlog_recover_do_efd_trans(
2750 struct xfs_ail_cursor cur; 2596 struct xfs_ail_cursor cur;
2751 struct xfs_ail *ailp = log->l_ailp; 2597 struct xfs_ail *ailp = log->l_ailp;
2752 2598
2753 if (pass == XLOG_RECOVER_PASS1) {
2754 return;
2755 }
2756
2757 efd_formatp = item->ri_buf[0].i_addr; 2599 efd_formatp = item->ri_buf[0].i_addr;
2758 ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) + 2600 ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) +
2759 ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) || 2601 ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) ||
@@ -2785,62 +2627,6 @@ xlog_recover_do_efd_trans(
2785 } 2627 }
2786 xfs_trans_ail_cursor_done(ailp, &cur); 2628 xfs_trans_ail_cursor_done(ailp, &cur);
2787 spin_unlock(&ailp->xa_lock); 2629 spin_unlock(&ailp->xa_lock);
2788}
2789
2790/*
2791 * Perform the transaction
2792 *
2793 * If the transaction modifies a buffer or inode, do it now. Otherwise,
2794 * EFIs and EFDs get queued up by adding entries into the AIL for them.
2795 */
2796STATIC int
2797xlog_recover_do_trans(
2798 xlog_t *log,
2799 xlog_recover_t *trans,
2800 int pass)
2801{
2802 int error = 0;
2803 xlog_recover_item_t *item;
2804
2805 error = xlog_recover_reorder_trans(log, trans, pass);
2806 if (error)
2807 return error;
2808
2809 list_for_each_entry(item, &trans->r_itemq, ri_list) {
2810 trace_xfs_log_recover_item_recover(log, trans, item, pass);
2811 switch (ITEM_TYPE(item)) {
2812 case XFS_LI_BUF:
2813 error = xlog_recover_do_buffer_trans(log, item, pass);
2814 break;
2815 case XFS_LI_INODE:
2816 error = xlog_recover_do_inode_trans(log, item, pass);
2817 break;
2818 case XFS_LI_EFI:
2819 error = xlog_recover_do_efi_trans(log, item,
2820 trans->r_lsn, pass);
2821 break;
2822 case XFS_LI_EFD:
2823 xlog_recover_do_efd_trans(log, item, pass);
2824 error = 0;
2825 break;
2826 case XFS_LI_DQUOT:
2827 error = xlog_recover_do_dquot_trans(log, item, pass);
2828 break;
2829 case XFS_LI_QUOTAOFF:
2830 error = xlog_recover_do_quotaoff_trans(log, item,
2831 pass);
2832 break;
2833 default:
2834 xlog_warn(
2835 "XFS: invalid item type (%d) xlog_recover_do_trans", ITEM_TYPE(item));
2836 ASSERT(0);
2837 error = XFS_ERROR(EIO);
2838 break;
2839 }
2840
2841 if (error)
2842 return error;
2843 }
2844 2630
2845 return 0; 2631 return 0;
2846} 2632}
@@ -2852,7 +2638,7 @@ xlog_recover_do_trans(
2852 */ 2638 */
2853STATIC void 2639STATIC void
2854xlog_recover_free_trans( 2640xlog_recover_free_trans(
2855 xlog_recover_t *trans) 2641 struct xlog_recover *trans)
2856{ 2642{
2857 xlog_recover_item_t *item, *n; 2643 xlog_recover_item_t *item, *n;
2858 int i; 2644 int i;
@@ -2871,17 +2657,95 @@ xlog_recover_free_trans(
2871} 2657}
2872 2658
2873STATIC int 2659STATIC int
2660xlog_recover_commit_pass1(
2661 struct log *log,
2662 struct xlog_recover *trans,
2663 xlog_recover_item_t *item)
2664{
2665 trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS1);
2666
2667 switch (ITEM_TYPE(item)) {
2668 case XFS_LI_BUF:
2669 return xlog_recover_buffer_pass1(log, item);
2670 case XFS_LI_QUOTAOFF:
2671 return xlog_recover_quotaoff_pass1(log, item);
2672 case XFS_LI_INODE:
2673 case XFS_LI_EFI:
2674 case XFS_LI_EFD:
2675 case XFS_LI_DQUOT:
2676 /* nothing to do in pass 1 */
2677 return 0;
2678 default:
2679 xlog_warn(
2680 "XFS: invalid item type (%d) xlog_recover_commit_pass1",
2681 ITEM_TYPE(item));
2682 ASSERT(0);
2683 return XFS_ERROR(EIO);
2684 }
2685}
2686
2687STATIC int
2688xlog_recover_commit_pass2(
2689 struct log *log,
2690 struct xlog_recover *trans,
2691 xlog_recover_item_t *item)
2692{
2693 trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS2);
2694
2695 switch (ITEM_TYPE(item)) {
2696 case XFS_LI_BUF:
2697 return xlog_recover_buffer_pass2(log, item);
2698 case XFS_LI_INODE:
2699 return xlog_recover_inode_pass2(log, item);
2700 case XFS_LI_EFI:
2701 return xlog_recover_efi_pass2(log, item, trans->r_lsn);
2702 case XFS_LI_EFD:
2703 return xlog_recover_efd_pass2(log, item);
2704 case XFS_LI_DQUOT:
2705 return xlog_recover_dquot_pass2(log, item);
2706 case XFS_LI_QUOTAOFF:
2707 /* nothing to do in pass2 */
2708 return 0;
2709 default:
2710 xlog_warn(
2711 "XFS: invalid item type (%d) xlog_recover_commit_pass2",
2712 ITEM_TYPE(item));
2713 ASSERT(0);
2714 return XFS_ERROR(EIO);
2715 }
2716}
2717
2718/*
2719 * Perform the transaction.
2720 *
2721 * If the transaction modifies a buffer or inode, do it now. Otherwise,
2722 * EFIs and EFDs get queued up by adding entries into the AIL for them.
2723 */
2724STATIC int
2874xlog_recover_commit_trans( 2725xlog_recover_commit_trans(
2875 xlog_t *log, 2726 struct log *log,
2876 xlog_recover_t *trans, 2727 struct xlog_recover *trans,
2877 int pass) 2728 int pass)
2878{ 2729{
2879 int error; 2730 int error = 0;
2731 xlog_recover_item_t *item;
2880 2732
2881 hlist_del(&trans->r_list); 2733 hlist_del(&trans->r_list);
2882 if ((error = xlog_recover_do_trans(log, trans, pass))) 2734
2735 error = xlog_recover_reorder_trans(log, trans, pass);
2736 if (error)
2883 return error; 2737 return error;
2884 xlog_recover_free_trans(trans); /* no error */ 2738
2739 list_for_each_entry(item, &trans->r_itemq, ri_list) {
2740 if (pass == XLOG_RECOVER_PASS1)
2741 error = xlog_recover_commit_pass1(log, trans, item);
2742 else
2743 error = xlog_recover_commit_pass2(log, trans, item);
2744 if (error)
2745 return error;
2746 }
2747
2748 xlog_recover_free_trans(trans);
2885 return 0; 2749 return 0;
2886} 2750}
2887 2751
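
xlog_recover_commit_trans() now drives one dispatch function per pass instead of letting every handler test the pass itself. A stripped-down sketch of that per-pass switch, with made-up item types and print statements standing in for the real handlers:

#include <stdio.h>

enum item_type { ITEM_BUF, ITEM_INODE, ITEM_QUOTAOFF };

struct recover_item { enum item_type type; };

/* pass 1 only records state that later decisions depend on */
static int commit_pass1(struct recover_item *item)
{
	switch (item->type) {
	case ITEM_BUF:
		printf("pass1: record buffer cancel state\n");
		return 0;
	case ITEM_QUOTAOFF:
		printf("pass1: note quotaoff\n");
		return 0;
	default:
		return 0;	/* nothing to do in pass 1 */
	}
}

/* pass 2 replays the actual changes */
static int commit_pass2(struct recover_item *item)
{
	switch (item->type) {
	case ITEM_BUF:
		printf("pass2: replay buffer\n");
		return 0;
	case ITEM_INODE:
		printf("pass2: replay inode\n");
		return 0;
	default:
		return 0;	/* e.g. quotaoff: nothing to do in pass 2 */
	}
}

static int commit_trans(struct recover_item *items, int n, int pass)
{
	for (int i = 0; i < n; i++) {
		int error = pass == 1 ? commit_pass1(&items[i])
				      : commit_pass2(&items[i]);
		if (error)
			return error;
	}
	return 0;
}

int main(void)
{
	struct recover_item items[] = {
		{ ITEM_BUF }, { ITEM_INODE }, { ITEM_QUOTAOFF }
	};

	commit_trans(items, 3, 1);
	commit_trans(items, 3, 2);
	return 0;
}
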
@@ -3011,7 +2875,7 @@ xlog_recover_process_efi(
3011 xfs_extent_t *extp; 2875 xfs_extent_t *extp;
3012 xfs_fsblock_t startblock_fsb; 2876 xfs_fsblock_t startblock_fsb;
3013 2877
3014 ASSERT(!(efip->efi_flags & XFS_EFI_RECOVERED)); 2878 ASSERT(!test_bit(XFS_EFI_RECOVERED, &efip->efi_flags));
3015 2879
3016 /* 2880 /*
3017 * First check the validity of the extents described by the 2881 * First check the validity of the extents described by the
@@ -3050,7 +2914,7 @@ xlog_recover_process_efi(
3050 extp->ext_len); 2914 extp->ext_len);
3051 } 2915 }
3052 2916
3053 efip->efi_flags |= XFS_EFI_RECOVERED; 2917 set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
3054 error = xfs_trans_commit(tp, 0); 2918 error = xfs_trans_commit(tp, 0);
3055 return error; 2919 return error;
3056 2920
@@ -3107,7 +2971,7 @@ xlog_recover_process_efis(
3107 * Skip EFIs that we've already processed. 2971 * Skip EFIs that we've already processed.
3108 */ 2972 */
3109 efip = (xfs_efi_log_item_t *)lip; 2973 efip = (xfs_efi_log_item_t *)lip;
3110 if (efip->efi_flags & XFS_EFI_RECOVERED) { 2974 if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) {
3111 lip = xfs_trans_ail_cursor_next(ailp, &cur); 2975 lip = xfs_trans_ail_cursor_next(ailp, &cur);
3112 continue; 2976 continue;
3113 } 2977 }
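
With efi_flags turned into a bit mask manipulated by set_bit()/test_bit(), marking an EFI as recovered no longer relies on an unlocked read-modify-write of a plain flags word. A rough C11-atomics approximation of that idea follows; the kernel bitops have their own semantics and memory-ordering rules, so treat this only as the shape of the change.

#include <stdatomic.h>
#include <stdio.h>

#define EFI_RECOVERED_BIT	1

static atomic_ulong efi_flags;

static void set_flag_bit(atomic_ulong *flags, int bit)
{
	atomic_fetch_or(flags, 1UL << bit);		/* like set_bit() */
}

static int test_flag_bit(atomic_ulong *flags, int bit)
{
	return (atomic_load(flags) >> bit) & 1;		/* like test_bit() */
}

int main(void)
{
	if (!test_flag_bit(&efi_flags, EFI_RECOVERED_BIT))
		set_flag_bit(&efi_flags, EFI_RECOVERED_BIT);
	printf("recovered: %d\n", test_flag_bit(&efi_flags, EFI_RECOVERED_BIT));
	return 0;
}
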
@@ -3724,7 +3588,7 @@ xlog_do_log_recovery(
3724 xfs_daddr_t head_blk, 3588 xfs_daddr_t head_blk,
3725 xfs_daddr_t tail_blk) 3589 xfs_daddr_t tail_blk)
3726{ 3590{
3727 int error; 3591 int error, i;
3728 3592
3729 ASSERT(head_blk != tail_blk); 3593 ASSERT(head_blk != tail_blk);
3730 3594
@@ -3732,10 +3596,12 @@ xlog_do_log_recovery(
3732 * First do a pass to find all of the cancelled buf log items. 3596 * First do a pass to find all of the cancelled buf log items.
3733 * Store them in the buf_cancel_table for use in the second pass. 3597 * Store them in the buf_cancel_table for use in the second pass.
3734 */ 3598 */
3735 log->l_buf_cancel_table = 3599 log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE *
3736 (xfs_buf_cancel_t **)kmem_zalloc(XLOG_BC_TABLE_SIZE * 3600 sizeof(struct list_head),
3737 sizeof(xfs_buf_cancel_t*),
3738 KM_SLEEP); 3601 KM_SLEEP);
3602 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
3603 INIT_LIST_HEAD(&log->l_buf_cancel_table[i]);
3604
3739 error = xlog_do_recovery_pass(log, head_blk, tail_blk, 3605 error = xlog_do_recovery_pass(log, head_blk, tail_blk,
3740 XLOG_RECOVER_PASS1); 3606 XLOG_RECOVER_PASS1);
3741 if (error != 0) { 3607 if (error != 0) {
@@ -3754,7 +3620,7 @@ xlog_do_log_recovery(
3754 int i; 3620 int i;
3755 3621
3756 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) 3622 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
3757 ASSERT(log->l_buf_cancel_table[i] == NULL); 3623 ASSERT(list_empty(&log->l_buf_cancel_table[i]));
3758 } 3624 }
3759#endif /* DEBUG */ 3625#endif /* DEBUG */
3760 3626
@@ -3934,7 +3800,7 @@ xlog_recover_finish(
3934 log->l_flags &= ~XLOG_RECOVERY_NEEDED; 3800 log->l_flags &= ~XLOG_RECOVERY_NEEDED;
3935 } else { 3801 } else {
3936 cmn_err(CE_DEBUG, 3802 cmn_err(CE_DEBUG,
3937 "!Ending clean XFS mount for filesystem: %s\n", 3803 "Ending clean XFS mount for filesystem: %s\n",
3938 log->l_mp->m_fsname); 3804 log->l_mp->m_fsname);
3939 } 3805 }
3940 return 0; 3806 return 0;
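
The recovery driver above now allocates the cancel table as an array of list heads, initialises every bucket before pass 1 and expects the buckets to be empty again once pass 2 has consumed the cancel records. A compact userspace outline of that allocate/init/two-pass/teardown sequence, with placeholder sizes, stubbed-out passes and a minimal list head that only mimics the kernel's:

#include <assert.h>
#include <stdlib.h>

#define BC_TABLE_SIZE	64	/* placeholder for XLOG_BC_TABLE_SIZE */

struct list_head { struct list_head *prev, *next; };

static void init_list_head(struct list_head *h) { h->prev = h->next = h; }
static int  list_is_empty(const struct list_head *h) { return h->next == h; }

/* stub: pass 1 would fill the table, pass 2 would drain it */
static int do_recovery_pass(struct list_head *table, int pass)
{
	(void)table;
	(void)pass;
	return 0;
}

static int do_log_recovery(void)
{
	struct list_head *table;
	int error, i;

	table = calloc(BC_TABLE_SIZE, sizeof(*table));
	if (!table)
		return -1;
	for (i = 0; i < BC_TABLE_SIZE; i++)
		init_list_head(&table[i]);

	error = do_recovery_pass(table, 1);
	if (error) {
		free(table);
		return error;
	}
	error = do_recovery_pass(table, 2);

	for (i = 0; i < BC_TABLE_SIZE; i++)
		assert(list_is_empty(&table[i]));	/* pass 2 must consume every entry */
	free(table);
	return error;
}

int main(void) { return do_log_recovery(); }
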
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 19e9dfa1c254..d447aef84bc3 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -472,7 +472,7 @@ xfs_initialize_perag(
472 goto out_unwind; 472 goto out_unwind;
473 pag->pag_agno = index; 473 pag->pag_agno = index;
474 pag->pag_mount = mp; 474 pag->pag_mount = mp;
475 rwlock_init(&pag->pag_ici_lock); 475 spin_lock_init(&pag->pag_ici_lock);
476 mutex_init(&pag->pag_ici_reclaim_lock); 476 mutex_init(&pag->pag_ici_reclaim_lock);
477 INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); 477 INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
478 spin_lock_init(&pag->pag_buf_lock); 478 spin_lock_init(&pag->pag_buf_lock);
@@ -975,6 +975,24 @@ xfs_set_rw_sizes(xfs_mount_t *mp)
975} 975}
976 976
977/* 977/*
978 * precalculate the low space thresholds for dynamic speculative preallocation.
979 */
980void
981xfs_set_low_space_thresholds(
982 struct xfs_mount *mp)
983{
984 int i;
985
986 for (i = 0; i < XFS_LOWSP_MAX; i++) {
987 __uint64_t space = mp->m_sb.sb_dblocks;
988
989 do_div(space, 100);
990 mp->m_low_space[i] = space * (i + 1);
991 }
992}
993
994
995/*
978 * Set whether we're using inode alignment. 996 * Set whether we're using inode alignment.
979 */ 997 */
980STATIC void 998STATIC void
@@ -1196,6 +1214,9 @@ xfs_mountfs(
1196 */ 1214 */
1197 xfs_set_rw_sizes(mp); 1215 xfs_set_rw_sizes(mp);
1198 1216
1217 /* set the low space thresholds for dynamic preallocation */
1218 xfs_set_low_space_thresholds(mp);
1219
1199 /* 1220 /*
1200 * Set the inode cluster size. 1221 * Set the inode cluster size.
1201 * This may still be overridden by the file system 1222 * This may still be overridden by the file system
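
xfs_set_low_space_thresholds() precomputes five thresholds at 1% to 5% of sb_dblocks with do_div() so the allocation path never has to divide. The arithmetic is shown below with an example block count; a 100 GiB filesystem with 4 KiB blocks is assumed purely for illustration.

#include <stdint.h>
#include <stdio.h>

#define LOWSP_MAX 5		/* 1% .. 5%, like XFS_LOWSP_*_PCNT */

int main(void)
{
	uint64_t dblocks = 26214400;		/* 100 GiB of 4 KiB blocks */
	int64_t low_space[LOWSP_MAX];

	for (int i = 0; i < LOWSP_MAX; i++) {
		uint64_t space = dblocks / 100;	/* do_div(space, 100) in the kernel */

		low_space[i] = space * (i + 1);
		printf("threshold %d%%: %lld blocks\n", i + 1,
		       (long long)low_space[i]);
	}
	return 0;
}
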
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 5861b4980740..a62e8971539d 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -103,6 +103,16 @@ extern int xfs_icsb_modify_counters(struct xfs_mount *, xfs_sb_field_t,
103 xfs_mod_incore_sb(mp, field, delta, rsvd) 103 xfs_mod_incore_sb(mp, field, delta, rsvd)
104#endif 104#endif
105 105
106/* dynamic preallocation free space thresholds, 5% down to 1% */
107enum {
108 XFS_LOWSP_1_PCNT = 0,
109 XFS_LOWSP_2_PCNT,
110 XFS_LOWSP_3_PCNT,
111 XFS_LOWSP_4_PCNT,
112 XFS_LOWSP_5_PCNT,
113 XFS_LOWSP_MAX,
114};
115
106typedef struct xfs_mount { 116typedef struct xfs_mount {
107 struct super_block *m_super; 117 struct super_block *m_super;
108 xfs_tid_t m_tid; /* next unused tid for fs */ 118 xfs_tid_t m_tid; /* next unused tid for fs */
@@ -202,6 +212,8 @@ typedef struct xfs_mount {
202 __int64_t m_update_flags; /* sb flags we need to update 212 __int64_t m_update_flags; /* sb flags we need to update
203 on the next remount,rw */ 213 on the next remount,rw */
204 struct shrinker m_inode_shrink; /* inode reclaim shrinker */ 214 struct shrinker m_inode_shrink; /* inode reclaim shrinker */
215 int64_t m_low_space[XFS_LOWSP_MAX];
216 /* low free space thresholds */
205} xfs_mount_t; 217} xfs_mount_t;
206 218
207/* 219/*
@@ -379,6 +391,8 @@ extern int xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t);
379 391
380extern int xfs_dev_is_read_only(struct xfs_mount *, char *); 392extern int xfs_dev_is_read_only(struct xfs_mount *, char *);
381 393
394extern void xfs_set_low_space_thresholds(struct xfs_mount *);
395
382#endif /* __KERNEL__ */ 396#endif /* __KERNEL__ */
383 397
384extern void xfs_mod_sb(struct xfs_trans *, __int64_t); 398extern void xfs_mod_sb(struct xfs_trans *, __int64_t);
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index f6d956b7711e..76922793f64f 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -1137,7 +1137,7 @@ out_undo_fdblocks:
1137 if (blkdelta) 1137 if (blkdelta)
1138 xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, -blkdelta, rsvd); 1138 xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, -blkdelta, rsvd);
1139out: 1139out:
1140 ASSERT(error = 0); 1140 ASSERT(error == 0);
1141 return; 1141 return;
1142} 1142}
1143 1143
@@ -1350,7 +1350,7 @@ xfs_trans_fill_vecs(
1350 * they could be immediately flushed and we'd have to race with the flusher 1350 * they could be immediately flushed and we'd have to race with the flusher
1351 * trying to pull the item from the AIL as we add it. 1351 * trying to pull the item from the AIL as we add it.
1352 */ 1352 */
1353void 1353static void
1354xfs_trans_item_committed( 1354xfs_trans_item_committed(
1355 struct xfs_log_item *lip, 1355 struct xfs_log_item *lip,
1356 xfs_lsn_t commit_lsn, 1356 xfs_lsn_t commit_lsn,
@@ -1425,21 +1425,120 @@ xfs_trans_committed(
1425 xfs_trans_free(tp); 1425 xfs_trans_free(tp);
1426} 1426}
1427 1427
1428static inline void
1429xfs_log_item_batch_insert(
1430 struct xfs_ail *ailp,
1431 struct xfs_log_item **log_items,
1432 int nr_items,
1433 xfs_lsn_t commit_lsn)
1434{
1435 int i;
1436
1437 spin_lock(&ailp->xa_lock);
1438 /* xfs_trans_ail_update_bulk drops ailp->xa_lock */
1439 xfs_trans_ail_update_bulk(ailp, log_items, nr_items, commit_lsn);
1440
1441 for (i = 0; i < nr_items; i++)
1442 IOP_UNPIN(log_items[i], 0);
1443}
1444
1428/* 1445/*
1429 * Called from the trans_commit code when we notice that 1446 * Bulk operation version of xfs_trans_committed that takes a log vector of
1430 * the filesystem is in the middle of a forced shutdown. 1447 * items to insert into the AIL. This uses bulk AIL insertion techniques to
1448 * minimise lock traffic.
1449 *
1450 * If we are called with the aborted flag set, it is because a log write during
1451 * a CIL checkpoint commit has failed. In this case, all the items in the
1452 * checkpoint have already gone through IOP_COMMITTED and IOP_UNLOCK, which
1453 * means that checkpoint commit abort handling is treated exactly the same
1454 * as an iclog write error even though we haven't started any IO yet. Hence in
1455 * this case all we need to do is IOP_COMMITTED processing, followed by an
1456 * IOP_UNPIN(aborted) call.
1457 */
1458void
1459xfs_trans_committed_bulk(
1460 struct xfs_ail *ailp,
1461 struct xfs_log_vec *log_vector,
1462 xfs_lsn_t commit_lsn,
1463 int aborted)
1464{
1465#define LOG_ITEM_BATCH_SIZE 32
1466 struct xfs_log_item *log_items[LOG_ITEM_BATCH_SIZE];
1467 struct xfs_log_vec *lv;
1468 int i = 0;
1469
1470 /* unpin all the log items */
1471 for (lv = log_vector; lv; lv = lv->lv_next ) {
1472 struct xfs_log_item *lip = lv->lv_item;
1473 xfs_lsn_t item_lsn;
1474
1475 if (aborted)
1476 lip->li_flags |= XFS_LI_ABORTED;
1477 item_lsn = IOP_COMMITTED(lip, commit_lsn);
1478
1479 /* item_lsn of -1 means the item was freed */
1480 if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0)
1481 continue;
1482
1483 /*
1484 * if we are aborting the operation, no point in inserting the
1485 * object into the AIL as we are in a shutdown situation.
1486 */
1487 if (aborted) {
1488 ASSERT(XFS_FORCED_SHUTDOWN(ailp->xa_mount));
1489 IOP_UNPIN(lip, 1);
1490 continue;
1491 }
1492
1493 if (item_lsn != commit_lsn) {
1494
1495 /*
1496 * Not a bulk update option due to unusual item_lsn.
1497 * Push into AIL immediately, rechecking the lsn once
1498 * we have the ail lock. Then unpin the item.
1499 */
1500 spin_lock(&ailp->xa_lock);
1501 if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0)
1502 xfs_trans_ail_update(ailp, lip, item_lsn);
1503 else
1504 spin_unlock(&ailp->xa_lock);
1505 IOP_UNPIN(lip, 0);
1506 continue;
1507 }
1508
1509 /* Item is a candidate for bulk AIL insert. */
1510 log_items[i++] = lv->lv_item;
1511 if (i >= LOG_ITEM_BATCH_SIZE) {
1512 xfs_log_item_batch_insert(ailp, log_items,
1513 LOG_ITEM_BATCH_SIZE, commit_lsn);
1514 i = 0;
1515 }
1516 }
1517
1518 /* make sure we insert the remainder! */
1519 if (i)
1520 xfs_log_item_batch_insert(ailp, log_items, i, commit_lsn);
1521}
1522
1523/*
1524 * Called from the trans_commit code when we notice that the filesystem is in
1525 * the middle of a forced shutdown.
1526 *
1527 * When we are called here, we have already pinned all the items in the
1528 * transaction. However, neither IOP_COMMITTING nor IOP_UNLOCK has been called
1529 * so we can simply walk the items in the transaction, unpin them with an abort
1530 * flag and then free the items. Note that unpinning the items can result in
1531 * them being freed immediately, so we need to use a safe list traversal method
1532 * here.
1431 */ 1533 */
1432STATIC void 1534STATIC void
1433xfs_trans_uncommit( 1535xfs_trans_uncommit(
1434 struct xfs_trans *tp, 1536 struct xfs_trans *tp,
1435 uint flags) 1537 uint flags)
1436{ 1538{
1437 struct xfs_log_item_desc *lidp; 1539 struct xfs_log_item_desc *lidp, *n;
1438 1540
1439 list_for_each_entry(lidp, &tp->t_items, lid_trans) { 1541 list_for_each_entry_safe(lidp, n, &tp->t_items, lid_trans) {
1440 /*
1441 * Unpin all but those that aren't dirty.
1442 */
1443 if (lidp->lid_flags & XFS_LID_DIRTY) 1542 if (lidp->lid_flags & XFS_LID_DIRTY)
1444 IOP_UNPIN(lidp->lid_item, 1); 1543 IOP_UNPIN(lidp->lid_item, 1);
1445 } 1544 }
@@ -1656,7 +1755,6 @@ xfs_trans_commit_cil(
1656 int flags) 1755 int flags)
1657{ 1756{
1658 struct xfs_log_vec *log_vector; 1757 struct xfs_log_vec *log_vector;
1659 int error;
1660 1758
1661 /* 1759 /*
1662 * Get each log item to allocate a vector structure for 1760 * Get each log item to allocate a vector structure for
@@ -1667,9 +1765,7 @@ xfs_trans_commit_cil(
1667 if (!log_vector) 1765 if (!log_vector)
1668 return ENOMEM; 1766 return ENOMEM;
1669 1767
1670 error = xfs_log_commit_cil(mp, tp, log_vector, commit_lsn, flags); 1768 xfs_log_commit_cil(mp, tp, log_vector, commit_lsn, flags);
1671 if (error)
1672 return error;
1673 1769
1674 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); 1770 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
1675 xfs_trans_free(tp); 1771 xfs_trans_free(tp);
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 246286b77a86..c2042b736b81 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -294,8 +294,8 @@ struct xfs_log_item_desc {
294#define XFS_ALLOC_BTREE_REF 2 294#define XFS_ALLOC_BTREE_REF 2
295#define XFS_BMAP_BTREE_REF 2 295#define XFS_BMAP_BTREE_REF 2
296#define XFS_DIR_BTREE_REF 2 296#define XFS_DIR_BTREE_REF 2
297#define XFS_INO_REF 2
297#define XFS_ATTR_BTREE_REF 1 298#define XFS_ATTR_BTREE_REF 1
298#define XFS_INO_REF 1
299#define XFS_DQUOT_REF 1 299#define XFS_DQUOT_REF 1
300 300
301#ifdef __KERNEL__ 301#ifdef __KERNEL__
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index dc9069568ff7..c5bbbc45db91 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -28,8 +28,8 @@
28#include "xfs_trans_priv.h" 28#include "xfs_trans_priv.h"
29#include "xfs_error.h" 29#include "xfs_error.h"
30 30
31STATIC void xfs_ail_insert(struct xfs_ail *, xfs_log_item_t *); 31STATIC void xfs_ail_splice(struct xfs_ail *, struct list_head *, xfs_lsn_t);
32STATIC xfs_log_item_t * xfs_ail_delete(struct xfs_ail *, xfs_log_item_t *); 32STATIC void xfs_ail_delete(struct xfs_ail *, xfs_log_item_t *);
33STATIC xfs_log_item_t * xfs_ail_min(struct xfs_ail *); 33STATIC xfs_log_item_t * xfs_ail_min(struct xfs_ail *);
34STATIC xfs_log_item_t * xfs_ail_next(struct xfs_ail *, xfs_log_item_t *); 34STATIC xfs_log_item_t * xfs_ail_next(struct xfs_ail *, xfs_log_item_t *);
35 35
@@ -449,129 +449,152 @@ xfs_trans_unlocked_item(
449 xfs_log_move_tail(ailp->xa_mount, 1); 449 xfs_log_move_tail(ailp->xa_mount, 1);
450} /* xfs_trans_unlocked_item */ 450} /* xfs_trans_unlocked_item */
451 451
452
453/* 452/*
454 * Update the position of the item in the AIL with the new 453 * xfs_trans_ail_update_bulk - bulk AIL insertion operation.
455 * lsn. If it is not yet in the AIL, add it. Otherwise, move 454 *
456 * it to its new position by removing it and re-adding it. 455 * @xfs_trans_ail_update_bulk takes an array of log items that all need to be
456 * positioned at the same LSN in the AIL. If an item is not in the AIL, it will
457 * be added. Otherwise, it will be repositioned by removing it and re-adding
458 * it to the AIL. If we move the first item in the AIL, update the log tail to
459 * match the new minimum LSN in the AIL.
457 * 460 *
458 * Wakeup anyone with an lsn less than the item's lsn. If the item 461 * This function takes the AIL lock once to execute the update operations on
459 * we move in the AIL is the minimum one, update the tail lsn in the 462 * all the items in the array, and as such should not be called with the AIL
460 * log manager. 463 * lock held. As a result, once we have the AIL lock, we need to check each log
464 * item LSN to confirm it needs to be moved forward in the AIL.
461 * 465 *
462 * This function must be called with the AIL lock held. The lock 466 * To optimise the insert operation, we delete all the items from the AIL in
463 * is dropped before returning. 467 * the first pass, moving them into a temporary list, then splice the temporary
468 * list into the correct position in the AIL. This avoids needing to do an
469 * insert operation on every item.
470 *
471 * This function must be called with the AIL lock held. The lock is dropped
472 * before returning.
464 */ 473 */
465void 474void
466xfs_trans_ail_update( 475xfs_trans_ail_update_bulk(
467 struct xfs_ail *ailp, 476 struct xfs_ail *ailp,
468 xfs_log_item_t *lip, 477 struct xfs_log_item **log_items,
469 xfs_lsn_t lsn) __releases(ailp->xa_lock) 478 int nr_items,
479 xfs_lsn_t lsn) __releases(ailp->xa_lock)
470{ 480{
471 xfs_log_item_t *dlip = NULL; 481 xfs_log_item_t *mlip;
472 xfs_log_item_t *mlip; /* ptr to minimum lip */
473 xfs_lsn_t tail_lsn; 482 xfs_lsn_t tail_lsn;
483 int mlip_changed = 0;
484 int i;
485 LIST_HEAD(tmp);
474 486
475 mlip = xfs_ail_min(ailp); 487 mlip = xfs_ail_min(ailp);
476 488
477 if (lip->li_flags & XFS_LI_IN_AIL) { 489 for (i = 0; i < nr_items; i++) {
478 dlip = xfs_ail_delete(ailp, lip); 490 struct xfs_log_item *lip = log_items[i];
479 ASSERT(dlip == lip); 491 if (lip->li_flags & XFS_LI_IN_AIL) {
480 xfs_trans_ail_cursor_clear(ailp, dlip); 492 /* check if we really need to move the item */
481 } else { 493 if (XFS_LSN_CMP(lsn, lip->li_lsn) <= 0)
482 lip->li_flags |= XFS_LI_IN_AIL; 494 continue;
495
496 xfs_ail_delete(ailp, lip);
497 if (mlip == lip)
498 mlip_changed = 1;
499 } else {
500 lip->li_flags |= XFS_LI_IN_AIL;
501 }
502 lip->li_lsn = lsn;
503 list_add(&lip->li_ail, &tmp);
483 } 504 }
484 505
485 lip->li_lsn = lsn; 506 xfs_ail_splice(ailp, &tmp, lsn);
486 xfs_ail_insert(ailp, lip);
487 507
488 if (mlip == dlip) { 508 if (!mlip_changed) {
489 mlip = xfs_ail_min(ailp);
490 /*
491 * It is not safe to access mlip after the AIL lock is
492 * dropped, so we must get a copy of li_lsn before we do
493 * so. This is especially important on 32-bit platforms
494 * where accessing and updating 64-bit values like li_lsn
495 * is not atomic.
496 */
497 tail_lsn = mlip->li_lsn;
498 spin_unlock(&ailp->xa_lock);
499 xfs_log_move_tail(ailp->xa_mount, tail_lsn);
500 } else {
501 spin_unlock(&ailp->xa_lock); 509 spin_unlock(&ailp->xa_lock);
510 return;
502 } 511 }
503 512
504 513 /*
505} /* xfs_trans_update_ail */ 514 * It is not safe to access mlip after the AIL lock is dropped, so we
515 * must get a copy of li_lsn before we do so. This is especially
516 * important on 32-bit platforms where accessing and updating 64-bit
517 * values like li_lsn is not atomic.
518 */
519 mlip = xfs_ail_min(ailp);
520 tail_lsn = mlip->li_lsn;
521 spin_unlock(&ailp->xa_lock);
522 xfs_log_move_tail(ailp->xa_mount, tail_lsn);
523}
506 524
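
xfs_trans_ail_update_bulk() unlinks every item that actually has to move, parks it on a local list, and then splices the whole batch into the LSN-sorted AIL after a single insertion-point search from the tail. Below is a self-contained sketch of that delete/collect/splice idea on a plain circular doubly linked list; the helpers are simplified stand-ins for the kernel's list.h, items are assumed to be on the list already, and locking is omitted.

#include <stdio.h>

struct item {
	long		lsn;
	struct item	*prev, *next;
};

static void list_init(struct item *h) { h->prev = h->next = h; }

static void list_unlink(struct item *it)
{
	it->prev->next = it->next;
	it->next->prev = it->prev;
}

static void list_add_tail(struct item *h, struct item *it)
{
	it->prev = h->prev;
	it->next = h;
	h->prev->next = it;
	h->prev = it;
}

/* move the whole (non-empty) batch so it follows @pos */
static void list_splice_after(struct item *pos, struct item *batch)
{
	struct item *first = batch->next, *last = batch->prev;

	first->prev = pos;
	last->next = pos->next;
	pos->next->prev = last;
	pos->next = first;
	list_init(batch);
}

/* move @items (already on @ail) to @lsn, keeping @ail sorted by rising lsn */
static void ail_update_bulk(struct item *ail, struct item **items, int nr, long lsn)
{
	struct item batch, *pos;

	list_init(&batch);
	for (int i = 0; i < nr; i++) {
		if (items[i]->lsn >= lsn)
			continue;		/* no need to move, like the kernel check */
		list_unlink(items[i]);
		items[i]->lsn = lsn;
		list_add_tail(&batch, items[i]);
	}
	if (batch.next == &batch)
		return;

	/* single search from the tail for the last entry with lsn <= target */
	for (pos = ail->prev; pos != ail; pos = pos->prev)
		if (pos->lsn <= lsn)
			break;
	list_splice_after(pos, &batch);
}

int main(void)
{
	struct item ail, a = { 10 }, b = { 20 }, c = { 30 };
	struct item *move[] = { &a, &b };

	list_init(&ail);
	list_add_tail(&ail, &a);
	list_add_tail(&ail, &b);
	list_add_tail(&ail, &c);

	ail_update_bulk(&ail, move, 2, 25);
	for (struct item *it = ail.next; it != &ail; it = it->next)
		printf("%ld ", it->lsn);	/* prints: 25 25 30 */
	printf("\n");
	return 0;
}
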
507/* 525/*
508 * Delete the given item from the AIL. It must already be in 526 * xfs_trans_ail_delete_bulk - remove multiple log items from the AIL
509 * the AIL.
510 * 527 *
511 * Wakeup anyone with an lsn less than item's lsn. If the item 528 * @xfs_trans_ail_delete_bulk takes an array of log items that all need to
512 * we delete in the AIL is the minimum one, update the tail lsn in the 529 * be removed from the AIL. The caller is already holding the AIL lock, and has done
513 * log manager. 530 * all the checks necessary to ensure the items passed in via @log_items are
531 * ready for deletion. This includes checking that the items are in the AIL.
514 * 532 *
515 * Clear the IN_AIL flag from the item, reset its lsn to 0, and 533 * For each log item to be removed, unlink it from the AIL, clear the IN_AIL
516 * bump the AIL's generation count to indicate that the tree 534 * flag from the item and reset the item's lsn to 0. If we remove the first
517 * has changed. 535 * item in the AIL, update the log tail to match the new minimum LSN in the
536 * AIL.
518 * 537 *
519 * This function must be called with the AIL lock held. The lock 538 * This function will not drop the AIL lock until all items are removed from
520 * is dropped before returning. 539 * the AIL to minimise the amount of lock traffic on the AIL. This does not
540 * greatly increase the AIL hold time, but does significantly reduce the amount
541 * of traffic on the lock, especially during IO completion.
542 *
543 * This function must be called with the AIL lock held. The lock is dropped
544 * before returning.
521 */ 545 */
522void 546void
523xfs_trans_ail_delete( 547xfs_trans_ail_delete_bulk(
524 struct xfs_ail *ailp, 548 struct xfs_ail *ailp,
525 xfs_log_item_t *lip) __releases(ailp->xa_lock) 549 struct xfs_log_item **log_items,
550 int nr_items) __releases(ailp->xa_lock)
526{ 551{
527 xfs_log_item_t *dlip;
528 xfs_log_item_t *mlip; 552 xfs_log_item_t *mlip;
529 xfs_lsn_t tail_lsn; 553 xfs_lsn_t tail_lsn;
554 int mlip_changed = 0;
555 int i;
530 556
531 if (lip->li_flags & XFS_LI_IN_AIL) { 557 mlip = xfs_ail_min(ailp);
532 mlip = xfs_ail_min(ailp);
533 dlip = xfs_ail_delete(ailp, lip);
534 ASSERT(dlip == lip);
535 xfs_trans_ail_cursor_clear(ailp, dlip);
536
537 558
538 lip->li_flags &= ~XFS_LI_IN_AIL; 559 for (i = 0; i < nr_items; i++) {
539 lip->li_lsn = 0; 560 struct xfs_log_item *lip = log_items[i];
561 if (!(lip->li_flags & XFS_LI_IN_AIL)) {
562 struct xfs_mount *mp = ailp->xa_mount;
540 563
541 if (mlip == dlip) {
542 mlip = xfs_ail_min(ailp);
543 /*
544 * It is not safe to access mlip after the AIL lock
545 * is dropped, so we must get a copy of li_lsn
546 * before we do so. This is especially important
547 * on 32-bit platforms where accessing and updating
548 * 64-bit values like li_lsn is not atomic.
549 */
550 tail_lsn = mlip ? mlip->li_lsn : 0;
551 spin_unlock(&ailp->xa_lock);
552 xfs_log_move_tail(ailp->xa_mount, tail_lsn);
553 } else {
554 spin_unlock(&ailp->xa_lock); 564 spin_unlock(&ailp->xa_lock);
565 if (!XFS_FORCED_SHUTDOWN(mp)) {
566 xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp,
567 "%s: attempting to delete a log item that is not in the AIL",
568 __func__);
569 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
570 }
571 return;
555 } 572 }
573
574 xfs_ail_delete(ailp, lip);
575 lip->li_flags &= ~XFS_LI_IN_AIL;
576 lip->li_lsn = 0;
577 if (mlip == lip)
578 mlip_changed = 1;
556 } 579 }
557 else {
558 /*
559 * If the file system is not being shutdown, we are in
560 * serious trouble if we get to this stage.
561 */
562 struct xfs_mount *mp = ailp->xa_mount;
563 580
581 if (!mlip_changed) {
564 spin_unlock(&ailp->xa_lock); 582 spin_unlock(&ailp->xa_lock);
565 if (!XFS_FORCED_SHUTDOWN(mp)) { 583 return;
566 xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp,
567 "%s: attempting to delete a log item that is not in the AIL",
568 __func__);
569 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
570 }
571 } 584 }
572}
573
574 585
586 /*
587 * It is not safe to access mlip after the AIL lock is dropped, so we
588 * must get a copy of li_lsn before we do so. This is especially
589 * important on 32-bit platforms where accessing and updating 64-bit
590 * values like li_lsn is not atomic. It is possible we've emptied the
591 * AIL here, so if that is the case, pass an LSN of 0 to the tail move.
592 */
593 mlip = xfs_ail_min(ailp);
594 tail_lsn = mlip ? mlip->li_lsn : 0;
595 spin_unlock(&ailp->xa_lock);
596 xfs_log_move_tail(ailp->xa_mount, tail_lsn);
597}
575 598
576/* 599/*
577 * The active item list (AIL) is a doubly linked list of log 600 * The active item list (AIL) is a doubly linked list of log
@@ -623,16 +646,13 @@ xfs_trans_ail_destroy(
623} 646}
624 647
625/* 648/*
626 * Insert the given log item into the AIL. 649 * splice the log item list into the AIL at the given LSN.
627 * We almost always insert at the end of the list, so on inserts
628 * we search from the end of the list to find where the
629 * new item belongs.
630 */ 650 */
631STATIC void 651STATIC void
632xfs_ail_insert( 652xfs_ail_splice(
633 struct xfs_ail *ailp, 653 struct xfs_ail *ailp,
634 xfs_log_item_t *lip) 654 struct list_head *list,
635/* ARGSUSED */ 655 xfs_lsn_t lsn)
636{ 656{
637 xfs_log_item_t *next_lip; 657 xfs_log_item_t *next_lip;
638 658
@@ -640,39 +660,33 @@ xfs_ail_insert(
640 * If the list is empty, just insert the item. 660 * If the list is empty, just insert the item.
641 */ 661 */
642 if (list_empty(&ailp->xa_ail)) { 662 if (list_empty(&ailp->xa_ail)) {
643 list_add(&lip->li_ail, &ailp->xa_ail); 663 list_splice(list, &ailp->xa_ail);
644 return; 664 return;
645 } 665 }
646 666
647 list_for_each_entry_reverse(next_lip, &ailp->xa_ail, li_ail) { 667 list_for_each_entry_reverse(next_lip, &ailp->xa_ail, li_ail) {
648 if (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) <= 0) 668 if (XFS_LSN_CMP(next_lip->li_lsn, lsn) <= 0)
649 break; 669 break;
650 } 670 }
651 671
652 ASSERT((&next_lip->li_ail == &ailp->xa_ail) || 672 ASSERT((&next_lip->li_ail == &ailp->xa_ail) ||
653 (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) <= 0)); 673 (XFS_LSN_CMP(next_lip->li_lsn, lsn) <= 0));
654
655 list_add(&lip->li_ail, &next_lip->li_ail);
656 674
657 xfs_ail_check(ailp, lip); 675 list_splice_init(list, &next_lip->li_ail);
658 return; 676 return;
659} 677}
660 678
661/* 679/*
662 * Delete the given item from the AIL. Return a pointer to the item. 680 * Delete the given item from the AIL. Return a pointer to the item.
663 */ 681 */
664/*ARGSUSED*/ 682STATIC void
665STATIC xfs_log_item_t *
666xfs_ail_delete( 683xfs_ail_delete(
667 struct xfs_ail *ailp, 684 struct xfs_ail *ailp,
668 xfs_log_item_t *lip) 685 xfs_log_item_t *lip)
669/* ARGSUSED */
670{ 686{
671 xfs_ail_check(ailp, lip); 687 xfs_ail_check(ailp, lip);
672
673 list_del(&lip->li_ail); 688 list_del(&lip->li_ail);
674 689 xfs_trans_ail_cursor_clear(ailp, lip);
675 return lip;
676} 690}
677 691
678/* 692/*
@@ -682,7 +696,6 @@ xfs_ail_delete(
682STATIC xfs_log_item_t * 696STATIC xfs_log_item_t *
683xfs_ail_min( 697xfs_ail_min(
684 struct xfs_ail *ailp) 698 struct xfs_ail *ailp)
685/* ARGSUSED */
686{ 699{
687 if (list_empty(&ailp->xa_ail)) 700 if (list_empty(&ailp->xa_ail))
688 return NULL; 701 return NULL;
@@ -699,7 +712,6 @@ STATIC xfs_log_item_t *
699xfs_ail_next( 712xfs_ail_next(
700 struct xfs_ail *ailp, 713 struct xfs_ail *ailp,
701 xfs_log_item_t *lip) 714 xfs_log_item_t *lip)
702/* ARGSUSED */
703{ 715{
704 if (lip->li_ail.next == &ailp->xa_ail) 716 if (lip->li_ail.next == &ailp->xa_ail)
705 return NULL; 717 return NULL;
diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c
index f783d5e9fa70..f7590f5badea 100644
--- a/fs/xfs/xfs_trans_extfree.c
+++ b/fs/xfs/xfs_trans_extfree.c
@@ -69,12 +69,16 @@ xfs_trans_log_efi_extent(xfs_trans_t *tp,
69 tp->t_flags |= XFS_TRANS_DIRTY; 69 tp->t_flags |= XFS_TRANS_DIRTY;
70 efip->efi_item.li_desc->lid_flags |= XFS_LID_DIRTY; 70 efip->efi_item.li_desc->lid_flags |= XFS_LID_DIRTY;
71 71
72 next_extent = efip->efi_next_extent; 72 /*
73 * atomic_inc_return gives us the value after the increment;
74 * we want to use it as an array index so we need to subtract 1 from
75 * it.
76 */
77 next_extent = atomic_inc_return(&efip->efi_next_extent) - 1;
73 ASSERT(next_extent < efip->efi_format.efi_nextents); 78 ASSERT(next_extent < efip->efi_format.efi_nextents);
74 extp = &(efip->efi_format.efi_extents[next_extent]); 79 extp = &(efip->efi_format.efi_extents[next_extent]);
75 extp->ext_start = start_block; 80 extp->ext_start = start_block;
76 extp->ext_len = ext_len; 81 extp->ext_len = ext_len;
77 efip->efi_next_extent++;
78} 82}
79 83
80 84
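
efi_next_extent becomes an atomic counter and atomic_inc_return() - 1 hands each caller a unique slot in the extent array without taking a lock. In C11 terms the same reservation is a fetch-and-add, which already returns the pre-increment value, so the -1 disappears; the array size below is arbitrary and the names are illustrative.

#include <stdatomic.h>
#include <stdio.h>

#define NEXTENTS 8

struct extent { unsigned long start; unsigned int len; };

static struct extent extents[NEXTENTS];
static atomic_int next_extent;

/* reserve a unique slot; fetch_add returns the value before the increment */
static struct extent *claim_extent_slot(void)
{
	int slot = atomic_fetch_add(&next_extent, 1);

	return slot < NEXTENTS ? &extents[slot] : NULL;
}

int main(void)
{
	struct extent *extp = claim_extent_slot();

	if (extp) {
		extp->start = 4096;
		extp->len = 16;
		printf("logged extent in slot %ld\n", (long)(extp - extents));
	}
	return 0;
}
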
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 62da86c90de5..35162c238fa3 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -22,15 +22,17 @@ struct xfs_log_item;
22struct xfs_log_item_desc; 22struct xfs_log_item_desc;
23struct xfs_mount; 23struct xfs_mount;
24struct xfs_trans; 24struct xfs_trans;
25struct xfs_ail;
26struct xfs_log_vec;
25 27
26void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *); 28void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *);
27void xfs_trans_del_item(struct xfs_log_item *); 29void xfs_trans_del_item(struct xfs_log_item *);
28void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn, 30void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn,
29 int flags); 31 int flags);
30void xfs_trans_item_committed(struct xfs_log_item *lip,
31 xfs_lsn_t commit_lsn, int aborted);
32void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp); 32void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp);
33 33
34void xfs_trans_committed_bulk(struct xfs_ail *ailp, struct xfs_log_vec *lv,
35 xfs_lsn_t commit_lsn, int aborted);
34/* 36/*
35 * AIL traversal cursor. 37 * AIL traversal cursor.
36 * 38 *
@@ -73,12 +75,29 @@ struct xfs_ail {
73/* 75/*
74 * From xfs_trans_ail.c 76 * From xfs_trans_ail.c
75 */ 77 */
76void xfs_trans_ail_update(struct xfs_ail *ailp, 78void xfs_trans_ail_update_bulk(struct xfs_ail *ailp,
77 struct xfs_log_item *lip, xfs_lsn_t lsn) 79 struct xfs_log_item **log_items, int nr_items,
78 __releases(ailp->xa_lock); 80 xfs_lsn_t lsn) __releases(ailp->xa_lock);
79void xfs_trans_ail_delete(struct xfs_ail *ailp, 81static inline void
80 struct xfs_log_item *lip) 82xfs_trans_ail_update(
81 __releases(ailp->xa_lock); 83 struct xfs_ail *ailp,
84 struct xfs_log_item *lip,
85 xfs_lsn_t lsn) __releases(ailp->xa_lock)
86{
87 xfs_trans_ail_update_bulk(ailp, &lip, 1, lsn);
88}
89
90void xfs_trans_ail_delete_bulk(struct xfs_ail *ailp,
91 struct xfs_log_item **log_items, int nr_items)
92 __releases(ailp->xa_lock);
93static inline void
94xfs_trans_ail_delete(
95 struct xfs_ail *ailp,
96 xfs_log_item_t *lip) __releases(ailp->xa_lock)
97{
98 xfs_trans_ail_delete_bulk(ailp, &lip, 1);
99}
100
82void xfs_trans_ail_push(struct xfs_ail *, xfs_lsn_t); 101void xfs_trans_ail_push(struct xfs_ail *, xfs_lsn_t);
83void xfs_trans_unlocked_item(struct xfs_ail *, 102void xfs_trans_unlocked_item(struct xfs_ail *,
84 xfs_log_item_t *); 103 xfs_log_item_t *);
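
The header now exposes bulk AIL update/delete entry points and keeps the old single-item functions as static inline wrappers around them, so existing callers compile unchanged while new callers can amortise the AIL lock and list walk over a whole batch of items committed at the same LSN. A hypothetical caller-side sketch, assuming the declarations above are in scope inside the XFS source (the helper name is made up):

/* Illustrative only: batch-insert a set of items committed at one LSN. */
static void demo_ail_insert_batch(struct xfs_ail *ailp,
				  struct xfs_log_item **items,
				  int count, xfs_lsn_t commit_lsn)
{
	spin_lock(&ailp->xa_lock);
	/* The bulk call drops xa_lock before returning (__releases). */
	xfs_trans_ail_update_bulk(ailp, items, count, commit_lsn);
}
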
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 8e4a63c4151a..d8e6f8cd6f0c 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -964,29 +964,48 @@ xfs_release(
964 xfs_flush_pages(ip, 0, -1, XBF_ASYNC, FI_NONE); 964 xfs_flush_pages(ip, 0, -1, XBF_ASYNC, FI_NONE);
965 } 965 }
966 966
967 if (ip->i_d.di_nlink != 0) { 967 if (ip->i_d.di_nlink == 0)
968 if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) && 968 return 0;
969 ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 ||
970 ip->i_delayed_blks > 0)) &&
971 (ip->i_df.if_flags & XFS_IFEXTENTS)) &&
972 (!(ip->i_d.di_flags &
973 (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
974 969
975 /* 970 if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
976 * If we can't get the iolock just skip truncating 971 ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 ||
977 * the blocks past EOF because we could deadlock 972 ip->i_delayed_blks > 0)) &&
978 * with the mmap_sem otherwise. We'll get another 973 (ip->i_df.if_flags & XFS_IFEXTENTS)) &&
979 * chance to drop them once the last reference to 974 (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
980 * the inode is dropped, so we'll never leak blocks
981 * permanently.
982 */
983 error = xfs_free_eofblocks(mp, ip,
984 XFS_FREE_EOF_TRYLOCK);
985 if (error)
986 return error;
987 }
988 }
989 975
976 /*
977 * If we can't get the iolock just skip truncating the blocks
978 * past EOF because we could deadlock with the mmap_sem
979 * otherwise. We'll get another chance to drop them once the
980 * last reference to the inode is dropped, so we'll never leak
981 * blocks permanently.
982 *
983 * Further, check if the inode is being opened, written and
984 * closed frequently and we have delayed allocation blocks
          985 * outstanding (e.g. streaming writes from the NFS server),
986 * truncating the blocks past EOF will cause fragmentation to
987 * occur.
988 *
989 * In this case don't do the truncation, either, but we have to
990 * be careful how we detect this case. Blocks beyond EOF show
991 * up as i_delayed_blks even when the inode is clean, so we
992 * need to truncate them away first before checking for a dirty
993 * release. Hence on the first dirty close we will still remove
994 * the speculative allocation, but after that we will leave it
995 * in place.
996 */
997 if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
998 return 0;
999
1000 error = xfs_free_eofblocks(mp, ip,
1001 XFS_FREE_EOF_TRYLOCK);
1002 if (error)
1003 return error;
1004
1005 /* delalloc blocks after truncation means it really is dirty */
1006 if (ip->i_delayed_blks)
1007 xfs_iflags_set(ip, XFS_IDIRTY_RELEASE);
1008 }
990 return 0; 1009 return 0;
991} 1010}
992 1011
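
The comment block above describes the heuristic this hunk adds: trim speculative preallocation on the first dirty close, then set XFS_IDIRTY_RELEASE so later closes of a frequently reopened, rewritten file stop truncating past EOF and re-fragmenting it. An illustrative reduction of that control flow follows; the demo_* names are hypothetical stand-ins for the XFS helpers visible in the hunk, not real kernel functions.

static int demo_release(struct demo_inode *ip)
{
	int	error;

	/* A previous dirty close decided to keep the preallocation. */
	if (demo_flag_test(ip, DEMO_IDIRTY_RELEASE))
		return 0;

	/* First close: trim blocks beyond EOF (trylock on the iolock). */
	error = demo_free_eofblocks(ip);
	if (error)
		return error;

	/* Delalloc blocks surviving the trim mean a genuinely dirty release. */
	if (ip->delayed_blks)
		demo_flag_set(ip, DEMO_IDIRTY_RELEASE);
	return 0;
}
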