author		Glenn Elliott <gelliott@cs.unc.edu>	2012-03-04 19:47:13 -0500
committer	Glenn Elliott <gelliott@cs.unc.edu>	2012-03-04 19:47:13 -0500
commit		c71c03bda1e86c9d5198c5d83f712e695c4f2a1e (patch)
tree		ecb166cb3e2b7e2adb3b5e292245fefd23381ac8 /fs/xfs/linux-2.6
parent		ea53c912f8a86a8567697115b6a0d8152beee5c8 (diff)
parent		6a00f206debf8a5c8899055726ad127dbeeed098 (diff)

Merge branch 'mpi-master' into wip-k-fmlp

Conflicts:
	litmus/sched_cedf.c
Diffstat (limited to 'fs/xfs/linux-2.6')
-rw-r--r--  fs/xfs/linux-2.6/kmem.c        |   9
-rw-r--r--  fs/xfs/linux-2.6/sv.h          |  59
-rw-r--r--  fs/xfs/linux-2.6/xfs_acl.c     |  11
-rw-r--r--  fs/xfs/linux-2.6/xfs_aops.c    | 537
-rw-r--r--  fs/xfs/linux-2.6/xfs_aops.h    |  16
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.c     | 860
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.h     | 149
-rw-r--r--  fs/xfs/linux-2.6/xfs_cred.h    |  28
-rw-r--r--  fs/xfs/linux-2.6/xfs_discard.c | 222
-rw-r--r--  fs/xfs/linux-2.6/xfs_discard.h |  10
-rw-r--r--  fs/xfs/linux-2.6/xfs_export.c  |  16
-rw-r--r--  fs/xfs/linux-2.6/xfs_file.c    | 641
-rw-r--r--  fs/xfs/linux-2.6/xfs_fs_subr.c |  31
-rw-r--r--  fs/xfs/linux-2.6/xfs_globals.c |   1
-rw-r--r--  fs/xfs/linux-2.6/xfs_globals.h |  23
-rw-r--r--  fs/xfs/linux-2.6/xfs_ioctl.c   |  59
-rw-r--r--  fs/xfs/linux-2.6/xfs_ioctl32.c |   8
-rw-r--r--  fs/xfs/linux-2.6/xfs_ioctl32.h |   7
-rw-r--r--  fs/xfs/linux-2.6/xfs_iops.c    | 113
-rw-r--r--  fs/xfs/linux-2.6/xfs_linux.h   |  30
-rw-r--r--  fs/xfs/linux-2.6/xfs_message.c | 108
-rw-r--r--  fs/xfs/linux-2.6/xfs_message.h |  39
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.c   | 416
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.h   |   1
-rw-r--r--  fs/xfs/linux-2.6/xfs_sync.c    | 754
-rw-r--r--  fs/xfs/linux-2.6/xfs_sync.h    |   6
-rw-r--r--  fs/xfs/linux-2.6/xfs_sysctl.c  |  25
-rw-r--r--  fs/xfs/linux-2.6/xfs_trace.h   | 175
-rw-r--r--  fs/xfs/linux-2.6/xfs_version.h |  29
29 files changed, 2331 insertions, 2052 deletions
diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c
index 666c9db48eb6..a907de565db3 100644
--- a/fs/xfs/linux-2.6/kmem.c
+++ b/fs/xfs/linux-2.6/kmem.c
@@ -23,6 +23,7 @@
23#include <linux/backing-dev.h> 23#include <linux/backing-dev.h>
24#include "time.h" 24#include "time.h"
25#include "kmem.h" 25#include "kmem.h"
26#include "xfs_message.h"
26 27
27/* 28/*
28 * Greedy allocation. May fail and may return vmalloced memory. 29 * Greedy allocation. May fail and may return vmalloced memory.
@@ -56,8 +57,8 @@ kmem_alloc(size_t size, unsigned int __nocast flags)
56 if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP))) 57 if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
57 return ptr; 58 return ptr;
58 if (!(++retries % 100)) 59 if (!(++retries % 100))
59 printk(KERN_ERR "XFS: possible memory allocation " 60 xfs_err(NULL,
60 "deadlock in %s (mode:0x%x)\n", 61 "possible memory allocation deadlock in %s (mode:0x%x)",
61 __func__, lflags); 62 __func__, lflags);
62 congestion_wait(BLK_RW_ASYNC, HZ/50); 63 congestion_wait(BLK_RW_ASYNC, HZ/50);
63 } while (1); 64 } while (1);
@@ -112,8 +113,8 @@ kmem_zone_alloc(kmem_zone_t *zone, unsigned int __nocast flags)
112 if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP))) 113 if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
113 return ptr; 114 return ptr;
114 if (!(++retries % 100)) 115 if (!(++retries % 100))
115 printk(KERN_ERR "XFS: possible memory allocation " 116 xfs_err(NULL,
116 "deadlock in %s (mode:0x%x)\n", 117 "possible memory allocation deadlock in %s (mode:0x%x)",
117 __func__, lflags); 118 __func__, lflags);
118 congestion_wait(BLK_RW_ASYNC, HZ/50); 119 congestion_wait(BLK_RW_ASYNC, HZ/50);
119 } while (1); 120 } while (1);
diff --git a/fs/xfs/linux-2.6/sv.h b/fs/xfs/linux-2.6/sv.h
deleted file mode 100644
index 4dfc7c370819..000000000000
--- a/fs/xfs/linux-2.6/sv.h
+++ /dev/null
@@ -1,59 +0,0 @@
1/*
2 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_SUPPORT_SV_H__
19#define __XFS_SUPPORT_SV_H__
20
21#include <linux/wait.h>
22#include <linux/sched.h>
23#include <linux/spinlock.h>
24
25/*
26 * Synchronisation variables.
27 *
28 * (Parameters "pri", "svf" and "rts" are not implemented)
29 */
30
31typedef struct sv_s {
32 wait_queue_head_t waiters;
33} sv_t;
34
35static inline void _sv_wait(sv_t *sv, spinlock_t *lock)
36{
37 DECLARE_WAITQUEUE(wait, current);
38
39 add_wait_queue_exclusive(&sv->waiters, &wait);
40 __set_current_state(TASK_UNINTERRUPTIBLE);
41 spin_unlock(lock);
42
43 schedule();
44
45 remove_wait_queue(&sv->waiters, &wait);
46}
47
48#define sv_init(sv,flag,name) \
49 init_waitqueue_head(&(sv)->waiters)
50#define sv_destroy(sv) \
51 /*NOTHING*/
52#define sv_wait(sv, pri, lock, s) \
53 _sv_wait(sv, lock)
54#define sv_signal(sv) \
55 wake_up(&(sv)->waiters)
56#define sv_broadcast(sv) \
57 wake_up_all(&(sv)->waiters)
58
59#endif /* __XFS_SUPPORT_SV_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c
index b2771862fd3d..39f4f809bb68 100644
--- a/fs/xfs/linux-2.6/xfs_acl.c
+++ b/fs/xfs/linux-2.6/xfs_acl.c
@@ -219,12 +219,13 @@ xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
219} 219}
220 220
221int 221int
222xfs_check_acl(struct inode *inode, int mask) 222xfs_check_acl(struct inode *inode, int mask, unsigned int flags)
223{ 223{
224 struct xfs_inode *ip = XFS_I(inode); 224 struct xfs_inode *ip;
225 struct posix_acl *acl; 225 struct posix_acl *acl;
226 int error = -EAGAIN; 226 int error = -EAGAIN;
227 227
228 ip = XFS_I(inode);
228 trace_xfs_check_acl(ip); 229 trace_xfs_check_acl(ip);
229 230
230 /* 231 /*
@@ -234,6 +235,12 @@ xfs_check_acl(struct inode *inode, int mask)
234 if (!XFS_IFORK_Q(ip)) 235 if (!XFS_IFORK_Q(ip))
235 return -EAGAIN; 236 return -EAGAIN;
236 237
238 if (flags & IPERM_FLAG_RCU) {
239 if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
240 return -ECHILD;
241 return -EAGAIN;
242 }
243
237 acl = xfs_get_acl(inode, ACL_TYPE_ACCESS); 244 acl = xfs_get_acl(inode, ACL_TYPE_ACCESS);
238 if (IS_ERR(acl)) 245 if (IS_ERR(acl))
239 return PTR_ERR(acl); 246 return PTR_ERR(acl);
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index b552f816de15..79ce38be15a1 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -38,15 +38,6 @@
38#include <linux/pagevec.h> 38#include <linux/pagevec.h>
39#include <linux/writeback.h> 39#include <linux/writeback.h>
40 40
41/*
42 * Types of I/O for bmap clustering and I/O completion tracking.
43 */
44enum {
45 IO_READ, /* mapping for a read */
46 IO_DELAY, /* mapping covers delalloc region */
47 IO_UNWRITTEN, /* mapping covers allocated but uninitialized data */
48 IO_NEW /* just allocated */
49};
50 41
51/* 42/*
52 * Prime number of hash buckets since address is used as the key. 43 * Prime number of hash buckets since address is used as the key.
@@ -182,9 +173,6 @@ xfs_setfilesize(
182 xfs_inode_t *ip = XFS_I(ioend->io_inode); 173 xfs_inode_t *ip = XFS_I(ioend->io_inode);
183 xfs_fsize_t isize; 174 xfs_fsize_t isize;
184 175
185 ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
186 ASSERT(ioend->io_type != IO_READ);
187
188 if (unlikely(ioend->io_error)) 176 if (unlikely(ioend->io_error))
189 return 0; 177 return 0;
190 178
@@ -244,10 +232,8 @@ xfs_end_io(
244 * We might have to update the on-disk file size after extending 232 * We might have to update the on-disk file size after extending
245 * writes. 233 * writes.
246 */ 234 */
247 if (ioend->io_type != IO_READ) { 235 error = xfs_setfilesize(ioend);
248 error = xfs_setfilesize(ioend); 236 ASSERT(!error || error == EAGAIN);
249 ASSERT(!error || error == EAGAIN);
250 }
251 237
252 /* 238 /*
253 * If we didn't complete processing of the ioend, requeue it to the 239 * If we didn't complete processing of the ioend, requeue it to the
@@ -318,14 +304,63 @@ STATIC int
318xfs_map_blocks( 304xfs_map_blocks(
319 struct inode *inode, 305 struct inode *inode,
320 loff_t offset, 306 loff_t offset,
321 ssize_t count,
322 struct xfs_bmbt_irec *imap, 307 struct xfs_bmbt_irec *imap,
323 int flags) 308 int type,
309 int nonblocking)
324{ 310{
325 int nmaps = 1; 311 struct xfs_inode *ip = XFS_I(inode);
326 int new = 0; 312 struct xfs_mount *mp = ip->i_mount;
313 ssize_t count = 1 << inode->i_blkbits;
314 xfs_fileoff_t offset_fsb, end_fsb;
315 int error = 0;
316 int bmapi_flags = XFS_BMAPI_ENTIRE;
317 int nimaps = 1;
318
319 if (XFS_FORCED_SHUTDOWN(mp))
320 return -XFS_ERROR(EIO);
321
322 if (type == IO_UNWRITTEN)
323 bmapi_flags |= XFS_BMAPI_IGSTATE;
324
325 if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
326 if (nonblocking)
327 return -XFS_ERROR(EAGAIN);
328 xfs_ilock(ip, XFS_ILOCK_SHARED);
329 }
330
331 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
332 (ip->i_df.if_flags & XFS_IFEXTENTS));
333 ASSERT(offset <= mp->m_maxioffset);
334
335 if (offset + count > mp->m_maxioffset)
336 count = mp->m_maxioffset - offset;
337 end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
338 offset_fsb = XFS_B_TO_FSBT(mp, offset);
339 error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb,
340 bmapi_flags, NULL, 0, imap, &nimaps, NULL);
341 xfs_iunlock(ip, XFS_ILOCK_SHARED);
327 342
328 return -xfs_iomap(XFS_I(inode), offset, count, flags, imap, &nmaps, &new); 343 if (error)
344 return -XFS_ERROR(error);
345
346 if (type == IO_DELALLOC &&
347 (!nimaps || isnullstartblock(imap->br_startblock))) {
348 error = xfs_iomap_write_allocate(ip, offset, count, imap);
349 if (!error)
350 trace_xfs_map_blocks_alloc(ip, offset, count, type, imap);
351 return -XFS_ERROR(error);
352 }
353
354#ifdef DEBUG
355 if (type == IO_UNWRITTEN) {
356 ASSERT(nimaps);
357 ASSERT(imap->br_startblock != HOLESTARTBLOCK);
358 ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
359 }
360#endif
361 if (nimaps)
362 trace_xfs_map_blocks_found(ip, offset, count, type, imap);
363 return 0;
329} 364}
330 365
331STATIC int 366STATIC int
@@ -378,28 +413,19 @@ xfs_submit_ioend_bio(
378 if (xfs_ioend_new_eof(ioend)) 413 if (xfs_ioend_new_eof(ioend))
379 xfs_mark_inode_dirty(XFS_I(ioend->io_inode)); 414 xfs_mark_inode_dirty(XFS_I(ioend->io_inode));
380 415
381 submit_bio(wbc->sync_mode == WB_SYNC_ALL ? 416 submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, bio);
382 WRITE_SYNC_PLUG : WRITE, bio);
383 ASSERT(!bio_flagged(bio, BIO_EOPNOTSUPP));
384 bio_put(bio);
385} 417}
386 418
387STATIC struct bio * 419STATIC struct bio *
388xfs_alloc_ioend_bio( 420xfs_alloc_ioend_bio(
389 struct buffer_head *bh) 421 struct buffer_head *bh)
390{ 422{
391 struct bio *bio;
392 int nvecs = bio_get_nr_vecs(bh->b_bdev); 423 int nvecs = bio_get_nr_vecs(bh->b_bdev);
393 424 struct bio *bio = bio_alloc(GFP_NOIO, nvecs);
394 do {
395 bio = bio_alloc(GFP_NOIO, nvecs);
396 nvecs >>= 1;
397 } while (!bio);
398 425
399 ASSERT(bio->bi_private == NULL); 426 ASSERT(bio->bi_private == NULL);
400 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); 427 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
401 bio->bi_bdev = bh->b_bdev; 428 bio->bi_bdev = bh->b_bdev;
402 bio_get(bio);
403 return bio; 429 return bio;
404} 430}
405 431
@@ -470,9 +496,8 @@ xfs_submit_ioend(
470 /* Pass 1 - start writeback */ 496 /* Pass 1 - start writeback */
471 do { 497 do {
472 next = ioend->io_list; 498 next = ioend->io_list;
473 for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) { 499 for (bh = ioend->io_buffer_head; bh; bh = bh->b_private)
474 xfs_start_buffer_writeback(bh); 500 xfs_start_buffer_writeback(bh);
475 }
476 } while ((ioend = next) != NULL); 501 } while ((ioend = next) != NULL);
477 502
478 /* Pass 2 - submit I/O */ 503 /* Pass 2 - submit I/O */
@@ -600,117 +625,13 @@ xfs_map_at_offset(
600 ASSERT(imap->br_startblock != HOLESTARTBLOCK); 625 ASSERT(imap->br_startblock != HOLESTARTBLOCK);
601 ASSERT(imap->br_startblock != DELAYSTARTBLOCK); 626 ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
602 627
603 lock_buffer(bh);
604 xfs_map_buffer(inode, bh, imap, offset); 628 xfs_map_buffer(inode, bh, imap, offset);
605 bh->b_bdev = xfs_find_bdev_for_inode(inode);
606 set_buffer_mapped(bh); 629 set_buffer_mapped(bh);
607 clear_buffer_delay(bh); 630 clear_buffer_delay(bh);
608 clear_buffer_unwritten(bh); 631 clear_buffer_unwritten(bh);
609} 632}
610 633
611/* 634/*
612 * Look for a page at index that is suitable for clustering.
613 */
614STATIC unsigned int
615xfs_probe_page(
616 struct page *page,
617 unsigned int pg_offset)
618{
619 struct buffer_head *bh, *head;
620 int ret = 0;
621
622 if (PageWriteback(page))
623 return 0;
624 if (!PageDirty(page))
625 return 0;
626 if (!page->mapping)
627 return 0;
628 if (!page_has_buffers(page))
629 return 0;
630
631 bh = head = page_buffers(page);
632 do {
633 if (!buffer_uptodate(bh))
634 break;
635 if (!buffer_mapped(bh))
636 break;
637 ret += bh->b_size;
638 if (ret >= pg_offset)
639 break;
640 } while ((bh = bh->b_this_page) != head);
641
642 return ret;
643}
644
645STATIC size_t
646xfs_probe_cluster(
647 struct inode *inode,
648 struct page *startpage,
649 struct buffer_head *bh,
650 struct buffer_head *head)
651{
652 struct pagevec pvec;
653 pgoff_t tindex, tlast, tloff;
654 size_t total = 0;
655 int done = 0, i;
656
657 /* First sum forwards in this page */
658 do {
659 if (!buffer_uptodate(bh) || !buffer_mapped(bh))
660 return total;
661 total += bh->b_size;
662 } while ((bh = bh->b_this_page) != head);
663
664 /* if we reached the end of the page, sum forwards in following pages */
665 tlast = i_size_read(inode) >> PAGE_CACHE_SHIFT;
666 tindex = startpage->index + 1;
667
668 /* Prune this back to avoid pathological behavior */
669 tloff = min(tlast, startpage->index + 64);
670
671 pagevec_init(&pvec, 0);
672 while (!done && tindex <= tloff) {
673 unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1);
674
675 if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len))
676 break;
677
678 for (i = 0; i < pagevec_count(&pvec); i++) {
679 struct page *page = pvec.pages[i];
680 size_t pg_offset, pg_len = 0;
681
682 if (tindex == tlast) {
683 pg_offset =
684 i_size_read(inode) & (PAGE_CACHE_SIZE - 1);
685 if (!pg_offset) {
686 done = 1;
687 break;
688 }
689 } else
690 pg_offset = PAGE_CACHE_SIZE;
691
692 if (page->index == tindex && trylock_page(page)) {
693 pg_len = xfs_probe_page(page, pg_offset);
694 unlock_page(page);
695 }
696
697 if (!pg_len) {
698 done = 1;
699 break;
700 }
701
702 total += pg_len;
703 tindex++;
704 }
705
706 pagevec_release(&pvec);
707 cond_resched();
708 }
709
710 return total;
711}
712
713/*
714 * Test if a given page is suitable for writing as part of an unwritten 635 * Test if a given page is suitable for writing as part of an unwritten
715 * or delayed allocate extent. 636 * or delayed allocate extent.
716 */ 637 */
@@ -731,9 +652,9 @@ xfs_is_delayed_page(
731 if (buffer_unwritten(bh)) 652 if (buffer_unwritten(bh))
732 acceptable = (type == IO_UNWRITTEN); 653 acceptable = (type == IO_UNWRITTEN);
733 else if (buffer_delay(bh)) 654 else if (buffer_delay(bh))
734 acceptable = (type == IO_DELAY); 655 acceptable = (type == IO_DELALLOC);
735 else if (buffer_dirty(bh) && buffer_mapped(bh)) 656 else if (buffer_dirty(bh) && buffer_mapped(bh))
736 acceptable = (type == IO_NEW); 657 acceptable = (type == IO_OVERWRITE);
737 else 658 else
738 break; 659 break;
739 } while ((bh = bh->b_this_page) != head); 660 } while ((bh = bh->b_this_page) != head);
@@ -758,8 +679,7 @@ xfs_convert_page(
758 loff_t tindex, 679 loff_t tindex,
759 struct xfs_bmbt_irec *imap, 680 struct xfs_bmbt_irec *imap,
760 xfs_ioend_t **ioendp, 681 xfs_ioend_t **ioendp,
761 struct writeback_control *wbc, 682 struct writeback_control *wbc)
762 int all_bh)
763{ 683{
764 struct buffer_head *bh, *head; 684 struct buffer_head *bh, *head;
765 xfs_off_t end_offset; 685 xfs_off_t end_offset;
@@ -814,37 +734,30 @@ xfs_convert_page(
814 continue; 734 continue;
815 } 735 }
816 736
817 if (buffer_unwritten(bh) || buffer_delay(bh)) { 737 if (buffer_unwritten(bh) || buffer_delay(bh) ||
738 buffer_mapped(bh)) {
818 if (buffer_unwritten(bh)) 739 if (buffer_unwritten(bh))
819 type = IO_UNWRITTEN; 740 type = IO_UNWRITTEN;
741 else if (buffer_delay(bh))
742 type = IO_DELALLOC;
820 else 743 else
821 type = IO_DELAY; 744 type = IO_OVERWRITE;
822 745
823 if (!xfs_imap_valid(inode, imap, offset)) { 746 if (!xfs_imap_valid(inode, imap, offset)) {
824 done = 1; 747 done = 1;
825 continue; 748 continue;
826 } 749 }
827 750
828 ASSERT(imap->br_startblock != HOLESTARTBLOCK); 751 lock_buffer(bh);
829 ASSERT(imap->br_startblock != DELAYSTARTBLOCK); 752 if (type != IO_OVERWRITE)
830 753 xfs_map_at_offset(inode, bh, imap, offset);
831 xfs_map_at_offset(inode, bh, imap, offset);
832 xfs_add_to_ioend(inode, bh, offset, type, 754 xfs_add_to_ioend(inode, bh, offset, type,
833 ioendp, done); 755 ioendp, done);
834 756
835 page_dirty--; 757 page_dirty--;
836 count++; 758 count++;
837 } else { 759 } else {
838 type = IO_NEW; 760 done = 1;
839 if (buffer_mapped(bh) && all_bh) {
840 lock_buffer(bh);
841 xfs_add_to_ioend(inode, bh, offset,
842 type, ioendp, done);
843 count++;
844 page_dirty--;
845 } else {
846 done = 1;
847 }
848 } 761 }
849 } while (offset += len, (bh = bh->b_this_page) != head); 762 } while (offset += len, (bh = bh->b_this_page) != head);
850 763
@@ -876,7 +789,6 @@ xfs_cluster_write(
876 struct xfs_bmbt_irec *imap, 789 struct xfs_bmbt_irec *imap,
877 xfs_ioend_t **ioendp, 790 xfs_ioend_t **ioendp,
878 struct writeback_control *wbc, 791 struct writeback_control *wbc,
879 int all_bh,
880 pgoff_t tlast) 792 pgoff_t tlast)
881{ 793{
882 struct pagevec pvec; 794 struct pagevec pvec;
@@ -891,7 +803,7 @@ xfs_cluster_write(
891 803
892 for (i = 0; i < pagevec_count(&pvec); i++) { 804 for (i = 0; i < pagevec_count(&pvec); i++) {
893 done = xfs_convert_page(inode, pvec.pages[i], tindex++, 805 done = xfs_convert_page(inode, pvec.pages[i], tindex++,
894 imap, ioendp, wbc, all_bh); 806 imap, ioendp, wbc);
895 if (done) 807 if (done)
896 break; 808 break;
897 } 809 }
@@ -934,83 +846,38 @@ xfs_aops_discard_page(
934 struct xfs_inode *ip = XFS_I(inode); 846 struct xfs_inode *ip = XFS_I(inode);
935 struct buffer_head *bh, *head; 847 struct buffer_head *bh, *head;
936 loff_t offset = page_offset(page); 848 loff_t offset = page_offset(page);
937 ssize_t len = 1 << inode->i_blkbits;
938 849
939 if (!xfs_is_delayed_page(page, IO_DELAY)) 850 if (!xfs_is_delayed_page(page, IO_DELALLOC))
940 goto out_invalidate; 851 goto out_invalidate;
941 852
942 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 853 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
943 goto out_invalidate; 854 goto out_invalidate;
944 855
945 xfs_fs_cmn_err(CE_ALERT, ip->i_mount, 856 xfs_alert(ip->i_mount,
946 "page discard on page %p, inode 0x%llx, offset %llu.", 857 "page discard on page %p, inode 0x%llx, offset %llu.",
947 page, ip->i_ino, offset); 858 page, ip->i_ino, offset);
948 859
949 xfs_ilock(ip, XFS_ILOCK_EXCL); 860 xfs_ilock(ip, XFS_ILOCK_EXCL);
950 bh = head = page_buffers(page); 861 bh = head = page_buffers(page);
951 do { 862 do {
952 int done;
953 xfs_fileoff_t offset_fsb;
954 xfs_bmbt_irec_t imap;
955 int nimaps = 1;
956 int error; 863 int error;
957 xfs_fsblock_t firstblock; 864 xfs_fileoff_t start_fsb;
958 xfs_bmap_free_t flist;
959 865
960 if (!buffer_delay(bh)) 866 if (!buffer_delay(bh))
961 goto next_buffer; 867 goto next_buffer;
962 868
963 offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); 869 start_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
964 870 error = xfs_bmap_punch_delalloc_range(ip, start_fsb, 1);
965 /*
966 * Map the range first and check that it is a delalloc extent
967 * before trying to unmap the range. Otherwise we will be
968 * trying to remove a real extent (which requires a
969 * transaction) or a hole, which is probably a bad idea...
970 */
971 error = xfs_bmapi(NULL, ip, offset_fsb, 1,
972 XFS_BMAPI_ENTIRE, NULL, 0, &imap,
973 &nimaps, NULL);
974
975 if (error) {
976 /* something screwed, just bail */
977 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
978 xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
979 "page discard failed delalloc mapping lookup.");
980 }
981 break;
982 }
983 if (!nimaps) {
984 /* nothing there */
985 goto next_buffer;
986 }
987 if (imap.br_startblock != DELAYSTARTBLOCK) {
988 /* been converted, ignore */
989 goto next_buffer;
990 }
991 WARN_ON(imap.br_blockcount == 0);
992
993 /*
994 * Note: while we initialise the firstblock/flist pair, they
995 * should never be used because blocks should never be
996 * allocated or freed for a delalloc extent and hence we need
997 * don't cancel or finish them after the xfs_bunmapi() call.
998 */
999 xfs_bmap_init(&flist, &firstblock);
1000 error = xfs_bunmapi(NULL, ip, offset_fsb, 1, 0, 1, &firstblock,
1001 &flist, &done);
1002
1003 ASSERT(!flist.xbf_count && !flist.xbf_first);
1004 if (error) { 871 if (error) {
1005 /* something screwed, just bail */ 872 /* something screwed, just bail */
1006 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { 873 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
1007 xfs_fs_cmn_err(CE_ALERT, ip->i_mount, 874 xfs_alert(ip->i_mount,
1008 "page discard unable to remove delalloc mapping."); 875 "page discard unable to remove delalloc mapping.");
1009 } 876 }
1010 break; 877 break;
1011 } 878 }
1012next_buffer: 879next_buffer:
1013 offset += len; 880 offset += 1 << inode->i_blkbits;
1014 881
1015 } while ((bh = bh->b_this_page) != head); 882 } while ((bh = bh->b_this_page) != head);
1016 883
@@ -1047,10 +914,10 @@ xfs_vm_writepage(
1047 unsigned int type; 914 unsigned int type;
1048 __uint64_t end_offset; 915 __uint64_t end_offset;
1049 pgoff_t end_index, last_index; 916 pgoff_t end_index, last_index;
1050 ssize_t size, len; 917 ssize_t len;
1051 int flags, err, imap_valid = 0, uptodate = 1; 918 int err, imap_valid = 0, uptodate = 1;
1052 int count = 0; 919 int count = 0;
1053 int all_bh = 0; 920 int nonblocking = 0;
1054 921
1055 trace_xfs_writepage(inode, page, 0); 922 trace_xfs_writepage(inode, page, 0);
1056 923
@@ -1101,110 +968,78 @@ xfs_vm_writepage(
1101 968
1102 bh = head = page_buffers(page); 969 bh = head = page_buffers(page);
1103 offset = page_offset(page); 970 offset = page_offset(page);
1104 flags = BMAPI_READ; 971 type = IO_OVERWRITE;
1105 type = IO_NEW; 972
973 if (wbc->sync_mode == WB_SYNC_NONE && wbc->nonblocking)
974 nonblocking = 1;
1106 975
1107 do { 976 do {
977 int new_ioend = 0;
978
1108 if (offset >= end_offset) 979 if (offset >= end_offset)
1109 break; 980 break;
1110 if (!buffer_uptodate(bh)) 981 if (!buffer_uptodate(bh))
1111 uptodate = 0; 982 uptodate = 0;
1112 983
1113 /* 984 /*
1114 * A hole may still be marked uptodate because discard_buffer 985 * set_page_dirty dirties all buffers in a page, independent
1115 * leaves the flag set. 986 * of their state. The dirty state however is entirely
987 * meaningless for holes (!mapped && uptodate), so skip
988 * buffers covering holes here.
1116 */ 989 */
1117 if (!buffer_mapped(bh) && buffer_uptodate(bh)) { 990 if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
1118 ASSERT(!buffer_dirty(bh));
1119 imap_valid = 0; 991 imap_valid = 0;
1120 continue; 992 continue;
1121 } 993 }
1122 994
1123 if (imap_valid) 995 if (buffer_unwritten(bh)) {
1124 imap_valid = xfs_imap_valid(inode, &imap, offset); 996 if (type != IO_UNWRITTEN) {
1125
1126 if (buffer_unwritten(bh) || buffer_delay(bh)) {
1127 int new_ioend = 0;
1128
1129 /*
1130 * Make sure we don't use a read-only iomap
1131 */
1132 if (flags == BMAPI_READ)
1133 imap_valid = 0;
1134
1135 if (buffer_unwritten(bh)) {
1136 type = IO_UNWRITTEN; 997 type = IO_UNWRITTEN;
1137 flags = BMAPI_WRITE | BMAPI_IGNSTATE; 998 imap_valid = 0;
1138 } else if (buffer_delay(bh)) {
1139 type = IO_DELAY;
1140 flags = BMAPI_ALLOCATE;
1141
1142 if (wbc->sync_mode == WB_SYNC_NONE &&
1143 wbc->nonblocking)
1144 flags |= BMAPI_TRYLOCK;
1145 }
1146
1147 if (!imap_valid) {
1148 /*
1149 * If we didn't have a valid mapping then we
1150 * need to ensure that we put the new mapping
1151 * in a new ioend structure. This needs to be
1152 * done to ensure that the ioends correctly
1153 * reflect the block mappings at io completion
1154 * for unwritten extent conversion.
1155 */
1156 new_ioend = 1;
1157 err = xfs_map_blocks(inode, offset, len,
1158 &imap, flags);
1159 if (err)
1160 goto error;
1161 imap_valid = xfs_imap_valid(inode, &imap,
1162 offset);
1163 } 999 }
1164 if (imap_valid) { 1000 } else if (buffer_delay(bh)) {
1165 xfs_map_at_offset(inode, bh, &imap, offset); 1001 if (type != IO_DELALLOC) {
1166 xfs_add_to_ioend(inode, bh, offset, type, 1002 type = IO_DELALLOC;
1167 &ioend, new_ioend); 1003 imap_valid = 0;
1168 count++;
1169 } 1004 }
1170 } else if (buffer_uptodate(bh)) { 1005 } else if (buffer_uptodate(bh)) {
1171 /* 1006 if (type != IO_OVERWRITE) {
1172 * we got here because the buffer is already mapped. 1007 type = IO_OVERWRITE;
1173 * That means it must already have extents allocated 1008 imap_valid = 0;
1174 * underneath it. Map the extent by reading it. 1009 }
1175 */ 1010 } else {
1176 if (!imap_valid || flags != BMAPI_READ) { 1011 if (PageUptodate(page)) {
1177 flags = BMAPI_READ; 1012 ASSERT(buffer_mapped(bh));
1178 size = xfs_probe_cluster(inode, page, bh, head); 1013 imap_valid = 0;
1179 err = xfs_map_blocks(inode, offset, size,
1180 &imap, flags);
1181 if (err)
1182 goto error;
1183 imap_valid = xfs_imap_valid(inode, &imap,
1184 offset);
1185 } 1014 }
1015 continue;
1016 }
1186 1017
1018 if (imap_valid)
1019 imap_valid = xfs_imap_valid(inode, &imap, offset);
1020 if (!imap_valid) {
1187 /* 1021 /*
1188 * We set the type to IO_NEW in case we are doing a 1022 * If we didn't have a valid mapping then we need to
1189 * small write at EOF that is extending the file but 1023 * put the new mapping into a separate ioend structure.
1190 * without needing an allocation. We need to update the 1024 * This ensures non-contiguous extents always have
1191 * file size on I/O completion in this case so it is 1025 * separate ioends, which is particularly important
1192 * the same case as having just allocated a new extent 1026 * for unwritten extent conversion at I/O completion
1193 * that we are writing into for the first time. 1027 * time.
1194 */ 1028 */
1195 type = IO_NEW; 1029 new_ioend = 1;
1196 if (trylock_buffer(bh)) { 1030 err = xfs_map_blocks(inode, offset, &imap, type,
1197 if (imap_valid) 1031 nonblocking);
1198 all_bh = 1; 1032 if (err)
1199 xfs_add_to_ioend(inode, bh, offset, type, 1033 goto error;
1200 &ioend, !imap_valid); 1034 imap_valid = xfs_imap_valid(inode, &imap, offset);
1201 count++; 1035 }
1202 } else { 1036 if (imap_valid) {
1203 imap_valid = 0; 1037 lock_buffer(bh);
1204 } 1038 if (type != IO_OVERWRITE)
1205 } else if (PageUptodate(page)) { 1039 xfs_map_at_offset(inode, bh, &imap, offset);
1206 ASSERT(buffer_mapped(bh)); 1040 xfs_add_to_ioend(inode, bh, offset, type, &ioend,
1207 imap_valid = 0; 1041 new_ioend);
1042 count++;
1208 } 1043 }
1209 1044
1210 if (!iohead) 1045 if (!iohead)
@@ -1233,7 +1068,7 @@ xfs_vm_writepage(
1233 end_index = last_index; 1068 end_index = last_index;
1234 1069
1235 xfs_cluster_write(inode, page->index + 1, &imap, &ioend, 1070 xfs_cluster_write(inode, page->index + 1, &imap, &ioend,
1236 wbc, all_bh, end_index); 1071 wbc, end_index);
1237 } 1072 }
1238 1073
1239 if (iohead) 1074 if (iohead)
@@ -1302,13 +1137,19 @@ __xfs_get_blocks(
1302 int create, 1137 int create,
1303 int direct) 1138 int direct)
1304{ 1139{
1305 int flags = create ? BMAPI_WRITE : BMAPI_READ; 1140 struct xfs_inode *ip = XFS_I(inode);
1141 struct xfs_mount *mp = ip->i_mount;
1142 xfs_fileoff_t offset_fsb, end_fsb;
1143 int error = 0;
1144 int lockmode = 0;
1306 struct xfs_bmbt_irec imap; 1145 struct xfs_bmbt_irec imap;
1146 int nimaps = 1;
1307 xfs_off_t offset; 1147 xfs_off_t offset;
1308 ssize_t size; 1148 ssize_t size;
1309 int nimap = 1;
1310 int new = 0; 1149 int new = 0;
1311 int error; 1150
1151 if (XFS_FORCED_SHUTDOWN(mp))
1152 return -XFS_ERROR(EIO);
1312 1153
1313 offset = (xfs_off_t)iblock << inode->i_blkbits; 1154 offset = (xfs_off_t)iblock << inode->i_blkbits;
1314 ASSERT(bh_result->b_size >= (1 << inode->i_blkbits)); 1155 ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
@@ -1317,15 +1158,45 @@ __xfs_get_blocks(
1317 if (!create && direct && offset >= i_size_read(inode)) 1158 if (!create && direct && offset >= i_size_read(inode))
1318 return 0; 1159 return 0;
1319 1160
1320 if (direct && create) 1161 if (create) {
1321 flags |= BMAPI_DIRECT; 1162 lockmode = XFS_ILOCK_EXCL;
1163 xfs_ilock(ip, lockmode);
1164 } else {
1165 lockmode = xfs_ilock_map_shared(ip);
1166 }
1322 1167
1323 error = xfs_iomap(XFS_I(inode), offset, size, flags, &imap, &nimap, 1168 ASSERT(offset <= mp->m_maxioffset);
1324 &new); 1169 if (offset + size > mp->m_maxioffset)
1170 size = mp->m_maxioffset - offset;
1171 end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
1172 offset_fsb = XFS_B_TO_FSBT(mp, offset);
1173
1174 error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb,
1175 XFS_BMAPI_ENTIRE, NULL, 0, &imap, &nimaps, NULL);
1325 if (error) 1176 if (error)
1326 return -error; 1177 goto out_unlock;
1327 if (nimap == 0) 1178
1328 return 0; 1179 if (create &&
1180 (!nimaps ||
1181 (imap.br_startblock == HOLESTARTBLOCK ||
1182 imap.br_startblock == DELAYSTARTBLOCK))) {
1183 if (direct) {
1184 error = xfs_iomap_write_direct(ip, offset, size,
1185 &imap, nimaps);
1186 } else {
1187 error = xfs_iomap_write_delay(ip, offset, size, &imap);
1188 }
1189 if (error)
1190 goto out_unlock;
1191
1192 trace_xfs_get_blocks_alloc(ip, offset, size, 0, &imap);
1193 } else if (nimaps) {
1194 trace_xfs_get_blocks_found(ip, offset, size, 0, &imap);
1195 } else {
1196 trace_xfs_get_blocks_notfound(ip, offset, size);
1197 goto out_unlock;
1198 }
1199 xfs_iunlock(ip, lockmode);
1329 1200
1330 if (imap.br_startblock != HOLESTARTBLOCK && 1201 if (imap.br_startblock != HOLESTARTBLOCK &&
1331 imap.br_startblock != DELAYSTARTBLOCK) { 1202 imap.br_startblock != DELAYSTARTBLOCK) {
@@ -1392,6 +1263,10 @@ __xfs_get_blocks(
1392 } 1263 }
1393 1264
1394 return 0; 1265 return 0;
1266
1267out_unlock:
1268 xfs_iunlock(ip, lockmode);
1269 return -error;
1395} 1270}
1396 1271
1397int 1272int
@@ -1420,7 +1295,7 @@ xfs_get_blocks_direct(
1420 * If the private argument is non-NULL __xfs_get_blocks signals us that we 1295 * If the private argument is non-NULL __xfs_get_blocks signals us that we
1421 * need to issue a transaction to convert the range from unwritten to written 1296 * need to issue a transaction to convert the range from unwritten to written
1422 * extents. In case this is regular synchronous I/O we just call xfs_end_io 1297 * extents. In case this is regular synchronous I/O we just call xfs_end_io
1423 * to do this and we are done. But in case this was a successfull AIO 1298 * to do this and we are done. But in case this was a successful AIO
1424 * request this handler is called from interrupt context, from which we 1299 * request this handler is called from interrupt context, from which we
1425 * can't start transactions. In that case offload the I/O completion to 1300 * can't start transactions. In that case offload the I/O completion to
1426 * the workqueues we also use for buffered I/O completion. 1301 * the workqueues we also use for buffered I/O completion.
@@ -1479,7 +1354,7 @@ xfs_vm_direct_IO(
1479 ssize_t ret; 1354 ssize_t ret;
1480 1355
1481 if (rw & WRITE) { 1356 if (rw & WRITE) {
1482 iocb->private = xfs_alloc_ioend(inode, IO_NEW); 1357 iocb->private = xfs_alloc_ioend(inode, IO_DIRECT);
1483 1358
1484 ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, 1359 ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
1485 offset, nr_segs, 1360 offset, nr_segs,
@@ -1505,11 +1380,42 @@ xfs_vm_write_failed(
1505 struct inode *inode = mapping->host; 1380 struct inode *inode = mapping->host;
1506 1381
1507 if (to > inode->i_size) { 1382 if (to > inode->i_size) {
1508 struct iattr ia = { 1383 /*
1509 .ia_valid = ATTR_SIZE | ATTR_FORCE, 1384 * punch out the delalloc blocks we have already allocated. We
1510 .ia_size = inode->i_size, 1385 * don't call xfs_setattr() to do this as we may be in the
1511 }; 1386 * middle of a multi-iovec write and so the vfs inode->i_size
1512 xfs_setattr(XFS_I(inode), &ia, XFS_ATTR_NOLOCK); 1387 * will not match the xfs ip->i_size and so it will zero too
 1388 * much. Hence we just truncate the page cache to zero what is
1389 * necessary and punch the delalloc blocks directly.
1390 */
1391 struct xfs_inode *ip = XFS_I(inode);
1392 xfs_fileoff_t start_fsb;
1393 xfs_fileoff_t end_fsb;
1394 int error;
1395
1396 truncate_pagecache(inode, to, inode->i_size);
1397
1398 /*
1399 * Check if there are any blocks that are outside of i_size
1400 * that need to be trimmed back.
1401 */
1402 start_fsb = XFS_B_TO_FSB(ip->i_mount, inode->i_size) + 1;
1403 end_fsb = XFS_B_TO_FSB(ip->i_mount, to);
1404 if (end_fsb <= start_fsb)
1405 return;
1406
1407 xfs_ilock(ip, XFS_ILOCK_EXCL);
1408 error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
1409 end_fsb - start_fsb);
1410 if (error) {
1411 /* something screwed, just bail */
1412 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
1413 xfs_alert(ip->i_mount,
1414 "xfs_vm_write_failed: unable to clean up ino %lld",
1415 ip->i_ino);
1416 }
1417 }
1418 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1513 } 1419 }
1514} 1420}
1515 1421
@@ -1588,7 +1494,6 @@ const struct address_space_operations xfs_address_space_operations = {
1588 .readpages = xfs_vm_readpages, 1494 .readpages = xfs_vm_readpages,
1589 .writepage = xfs_vm_writepage, 1495 .writepage = xfs_vm_writepage,
1590 .writepages = xfs_vm_writepages, 1496 .writepages = xfs_vm_writepages,
1591 .sync_page = block_sync_page,
1592 .releasepage = xfs_vm_releasepage, 1497 .releasepage = xfs_vm_releasepage,
1593 .invalidatepage = xfs_vm_invalidatepage, 1498 .invalidatepage = xfs_vm_invalidatepage,
1594 .write_begin = xfs_vm_write_begin, 1499 .write_begin = xfs_vm_write_begin,
diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h
index c5057fb6237a..71f721e1a71f 100644
--- a/fs/xfs/linux-2.6/xfs_aops.h
+++ b/fs/xfs/linux-2.6/xfs_aops.h
@@ -23,6 +23,22 @@ extern struct workqueue_struct *xfsconvertd_workqueue;
23extern mempool_t *xfs_ioend_pool; 23extern mempool_t *xfs_ioend_pool;
24 24
25/* 25/*
26 * Types of I/O for bmap clustering and I/O completion tracking.
27 */
28enum {
29 IO_DIRECT = 0, /* special case for direct I/O ioends */
30 IO_DELALLOC, /* mapping covers delalloc region */
31 IO_UNWRITTEN, /* mapping covers allocated but uninitialized data */
32 IO_OVERWRITE, /* mapping covers already allocated extent */
33};
34
35#define XFS_IO_TYPES \
36 { 0, "" }, \
37 { IO_DELALLOC, "delalloc" }, \
38 { IO_UNWRITTEN, "unwritten" }, \
39 { IO_OVERWRITE, "overwrite" }
40
41/*
26 * xfs_ioend struct manages large extent writes for XFS. 42 * xfs_ioend struct manages large extent writes for XFS.
27 * It can manage several multi-page bio's at once. 43 * It can manage several multi-page bio's at once.
28 */ 44 */
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 286e36e21dae..5e68099db2a5 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -33,7 +33,6 @@
33#include <linux/migrate.h> 33#include <linux/migrate.h>
34#include <linux/backing-dev.h> 34#include <linux/backing-dev.h>
35#include <linux/freezer.h> 35#include <linux/freezer.h>
36#include <linux/list_sort.h>
37 36
38#include "xfs_sb.h" 37#include "xfs_sb.h"
39#include "xfs_inum.h" 38#include "xfs_inum.h"
@@ -44,12 +43,7 @@
44 43
45static kmem_zone_t *xfs_buf_zone; 44static kmem_zone_t *xfs_buf_zone;
46STATIC int xfsbufd(void *); 45STATIC int xfsbufd(void *);
47STATIC int xfsbufd_wakeup(struct shrinker *, int, gfp_t);
48STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int); 46STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int);
49static struct shrinker xfs_buf_shake = {
50 .shrink = xfsbufd_wakeup,
51 .seeks = DEFAULT_SEEKS,
52};
53 47
54static struct workqueue_struct *xfslogd_workqueue; 48static struct workqueue_struct *xfslogd_workqueue;
55struct workqueue_struct *xfsdatad_workqueue; 49struct workqueue_struct *xfsdatad_workqueue;
@@ -99,77 +93,79 @@ xfs_buf_vmap_len(
99} 93}
100 94
101/* 95/*
102 * Page Region interfaces. 96 * xfs_buf_lru_add - add a buffer to the LRU.
103 * 97 *
104 * For pages in filesystems where the blocksize is smaller than the 98 * The LRU takes a new reference to the buffer so that it will only be freed
105 * pagesize, we use the page->private field (long) to hold a bitmap 99 * once the shrinker takes the buffer off the LRU.
106 * of uptodate regions within the page.
107 *
108 * Each such region is "bytes per page / bits per long" bytes long.
109 *
110 * NBPPR == number-of-bytes-per-page-region
111 * BTOPR == bytes-to-page-region (rounded up)
112 * BTOPRT == bytes-to-page-region-truncated (rounded down)
113 */ 100 */
114#if (BITS_PER_LONG == 32) 101STATIC void
115#define PRSHIFT (PAGE_CACHE_SHIFT - 5) /* (32 == 1<<5) */ 102xfs_buf_lru_add(
116#elif (BITS_PER_LONG == 64) 103 struct xfs_buf *bp)
117#define PRSHIFT (PAGE_CACHE_SHIFT - 6) /* (64 == 1<<6) */
118#else
119#error BITS_PER_LONG must be 32 or 64
120#endif
121#define NBPPR (PAGE_CACHE_SIZE/BITS_PER_LONG)
122#define BTOPR(b) (((unsigned int)(b) + (NBPPR - 1)) >> PRSHIFT)
123#define BTOPRT(b) (((unsigned int)(b) >> PRSHIFT))
124
125STATIC unsigned long
126page_region_mask(
127 size_t offset,
128 size_t length)
129{ 104{
130 unsigned long mask; 105 struct xfs_buftarg *btp = bp->b_target;
131 int first, final;
132
133 first = BTOPR(offset);
134 final = BTOPRT(offset + length - 1);
135 first = min(first, final);
136
137 mask = ~0UL;
138 mask <<= BITS_PER_LONG - (final - first);
139 mask >>= BITS_PER_LONG - (final);
140
141 ASSERT(offset + length <= PAGE_CACHE_SIZE);
142 ASSERT((final - first) < BITS_PER_LONG && (final - first) >= 0);
143 106
144 return mask; 107 spin_lock(&btp->bt_lru_lock);
108 if (list_empty(&bp->b_lru)) {
109 atomic_inc(&bp->b_hold);
110 list_add_tail(&bp->b_lru, &btp->bt_lru);
111 btp->bt_lru_nr++;
112 }
113 spin_unlock(&btp->bt_lru_lock);
145} 114}
146 115
116/*
117 * xfs_buf_lru_del - remove a buffer from the LRU
118 *
119 * The unlocked check is safe here because it only occurs when there are not
120 * b_lru_ref counts left on the inode under the pag->pag_buf_lock. it is there
121 * to optimise the shrinker removing the buffer from the LRU and calling
122 * xfs_buf_free(). i.e. it removes an unnecessary round trip on the
123 * bt_lru_lock.
124 */
147STATIC void 125STATIC void
148set_page_region( 126xfs_buf_lru_del(
149 struct page *page, 127 struct xfs_buf *bp)
150 size_t offset,
151 size_t length)
152{ 128{
153 set_page_private(page, 129 struct xfs_buftarg *btp = bp->b_target;
154 page_private(page) | page_region_mask(offset, length));
155 if (page_private(page) == ~0UL)
156 SetPageUptodate(page);
157}
158 130
159STATIC int 131 if (list_empty(&bp->b_lru))
160test_page_region( 132 return;
161 struct page *page,
162 size_t offset,
163 size_t length)
164{
165 unsigned long mask = page_region_mask(offset, length);
166 133
167 return (mask && (page_private(page) & mask) == mask); 134 spin_lock(&btp->bt_lru_lock);
135 if (!list_empty(&bp->b_lru)) {
136 list_del_init(&bp->b_lru);
137 btp->bt_lru_nr--;
138 }
139 spin_unlock(&btp->bt_lru_lock);
168} 140}
169 141
170/* 142/*
171 * Internal xfs_buf_t object manipulation 143 * When we mark a buffer stale, we remove the buffer from the LRU and clear the
144 * b_lru_ref count so that the buffer is freed immediately when the buffer
145 * reference count falls to zero. If the buffer is already on the LRU, we need
146 * to remove the reference that LRU holds on the buffer.
147 *
148 * This prevents build-up of stale buffers on the LRU.
172 */ 149 */
150void
151xfs_buf_stale(
152 struct xfs_buf *bp)
153{
154 bp->b_flags |= XBF_STALE;
155 atomic_set(&(bp)->b_lru_ref, 0);
156 if (!list_empty(&bp->b_lru)) {
157 struct xfs_buftarg *btp = bp->b_target;
158
159 spin_lock(&btp->bt_lru_lock);
160 if (!list_empty(&bp->b_lru)) {
161 list_del_init(&bp->b_lru);
162 btp->bt_lru_nr--;
163 atomic_dec(&bp->b_hold);
164 }
165 spin_unlock(&btp->bt_lru_lock);
166 }
167 ASSERT(atomic_read(&bp->b_hold) >= 1);
168}
173 169
174STATIC void 170STATIC void
175_xfs_buf_initialize( 171_xfs_buf_initialize(
@@ -186,10 +182,12 @@ _xfs_buf_initialize(
186 182
187 memset(bp, 0, sizeof(xfs_buf_t)); 183 memset(bp, 0, sizeof(xfs_buf_t));
188 atomic_set(&bp->b_hold, 1); 184 atomic_set(&bp->b_hold, 1);
185 atomic_set(&bp->b_lru_ref, 1);
189 init_completion(&bp->b_iowait); 186 init_completion(&bp->b_iowait);
187 INIT_LIST_HEAD(&bp->b_lru);
190 INIT_LIST_HEAD(&bp->b_list); 188 INIT_LIST_HEAD(&bp->b_list);
191 INIT_LIST_HEAD(&bp->b_hash_list); 189 RB_CLEAR_NODE(&bp->b_rbnode);
192 init_MUTEX_LOCKED(&bp->b_sema); /* held, no waiters */ 190 sema_init(&bp->b_sema, 0); /* held, no waiters */
193 XB_SET_OWNER(bp); 191 XB_SET_OWNER(bp);
194 bp->b_target = target; 192 bp->b_target = target;
195 bp->b_file_offset = range_base; 193 bp->b_file_offset = range_base;
@@ -262,9 +260,9 @@ xfs_buf_free(
262{ 260{
263 trace_xfs_buf_free(bp, _RET_IP_); 261 trace_xfs_buf_free(bp, _RET_IP_);
264 262
265 ASSERT(list_empty(&bp->b_hash_list)); 263 ASSERT(list_empty(&bp->b_lru));
266 264
267 if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) { 265 if (bp->b_flags & _XBF_PAGES) {
268 uint i; 266 uint i;
269 267
270 if (xfs_buf_is_vmapped(bp)) 268 if (xfs_buf_is_vmapped(bp))
@@ -274,56 +272,77 @@ xfs_buf_free(
274 for (i = 0; i < bp->b_page_count; i++) { 272 for (i = 0; i < bp->b_page_count; i++) {
275 struct page *page = bp->b_pages[i]; 273 struct page *page = bp->b_pages[i];
276 274
277 if (bp->b_flags & _XBF_PAGE_CACHE) 275 __free_page(page);
278 ASSERT(!PagePrivate(page));
279 page_cache_release(page);
280 } 276 }
281 } 277 } else if (bp->b_flags & _XBF_KMEM)
278 kmem_free(bp->b_addr);
282 _xfs_buf_free_pages(bp); 279 _xfs_buf_free_pages(bp);
283 xfs_buf_deallocate(bp); 280 xfs_buf_deallocate(bp);
284} 281}
285 282
286/* 283/*
287 * Finds all pages for buffer in question and builds it's page list. 284 * Allocates all the pages for buffer in question and builds it's page list.
288 */ 285 */
289STATIC int 286STATIC int
290_xfs_buf_lookup_pages( 287xfs_buf_allocate_memory(
291 xfs_buf_t *bp, 288 xfs_buf_t *bp,
292 uint flags) 289 uint flags)
293{ 290{
294 struct address_space *mapping = bp->b_target->bt_mapping;
295 size_t blocksize = bp->b_target->bt_bsize;
296 size_t size = bp->b_count_desired; 291 size_t size = bp->b_count_desired;
297 size_t nbytes, offset; 292 size_t nbytes, offset;
298 gfp_t gfp_mask = xb_to_gfp(flags); 293 gfp_t gfp_mask = xb_to_gfp(flags);
299 unsigned short page_count, i; 294 unsigned short page_count, i;
300 pgoff_t first;
301 xfs_off_t end; 295 xfs_off_t end;
302 int error; 296 int error;
303 297
298 /*
299 * for buffers that are contained within a single page, just allocate
300 * the memory from the heap - there's no need for the complexity of
301 * page arrays to keep allocation down to order 0.
302 */
303 if (bp->b_buffer_length < PAGE_SIZE) {
304 bp->b_addr = kmem_alloc(bp->b_buffer_length, xb_to_km(flags));
305 if (!bp->b_addr) {
306 /* low memory - use alloc_page loop instead */
307 goto use_alloc_page;
308 }
309
310 if (((unsigned long)(bp->b_addr + bp->b_buffer_length - 1) &
311 PAGE_MASK) !=
312 ((unsigned long)bp->b_addr & PAGE_MASK)) {
313 /* b_addr spans two pages - use alloc_page instead */
314 kmem_free(bp->b_addr);
315 bp->b_addr = NULL;
316 goto use_alloc_page;
317 }
318 bp->b_offset = offset_in_page(bp->b_addr);
319 bp->b_pages = bp->b_page_array;
320 bp->b_pages[0] = virt_to_page(bp->b_addr);
321 bp->b_page_count = 1;
322 bp->b_flags |= XBF_MAPPED | _XBF_KMEM;
323 return 0;
324 }
325
326use_alloc_page:
304 end = bp->b_file_offset + bp->b_buffer_length; 327 end = bp->b_file_offset + bp->b_buffer_length;
305 page_count = xfs_buf_btoc(end) - xfs_buf_btoct(bp->b_file_offset); 328 page_count = xfs_buf_btoc(end) - xfs_buf_btoct(bp->b_file_offset);
306
307 error = _xfs_buf_get_pages(bp, page_count, flags); 329 error = _xfs_buf_get_pages(bp, page_count, flags);
308 if (unlikely(error)) 330 if (unlikely(error))
309 return error; 331 return error;
310 bp->b_flags |= _XBF_PAGE_CACHE;
311 332
312 offset = bp->b_offset; 333 offset = bp->b_offset;
313 first = bp->b_file_offset >> PAGE_CACHE_SHIFT; 334 bp->b_flags |= _XBF_PAGES;
314 335
315 for (i = 0; i < bp->b_page_count; i++) { 336 for (i = 0; i < bp->b_page_count; i++) {
316 struct page *page; 337 struct page *page;
317 uint retries = 0; 338 uint retries = 0;
318 339retry:
319 retry: 340 page = alloc_page(gfp_mask);
320 page = find_or_create_page(mapping, first + i, gfp_mask);
321 if (unlikely(page == NULL)) { 341 if (unlikely(page == NULL)) {
322 if (flags & XBF_READ_AHEAD) { 342 if (flags & XBF_READ_AHEAD) {
323 bp->b_page_count = i; 343 bp->b_page_count = i;
324 for (i = 0; i < bp->b_page_count; i++) 344 error = ENOMEM;
325 unlock_page(bp->b_pages[i]); 345 goto out_free_pages;
326 return -ENOMEM;
327 } 346 }
328 347
329 /* 348 /*
@@ -333,65 +352,55 @@ _xfs_buf_lookup_pages(
333 * handle buffer allocation failures we can't do much. 352 * handle buffer allocation failures we can't do much.
334 */ 353 */
335 if (!(++retries % 100)) 354 if (!(++retries % 100))
336 printk(KERN_ERR 355 xfs_err(NULL,
337 "XFS: possible memory allocation " 356 "possible memory allocation deadlock in %s (mode:0x%x)",
338 "deadlock in %s (mode:0x%x)\n",
339 __func__, gfp_mask); 357 __func__, gfp_mask);
340 358
341 XFS_STATS_INC(xb_page_retries); 359 XFS_STATS_INC(xb_page_retries);
342 xfsbufd_wakeup(NULL, 0, gfp_mask);
343 congestion_wait(BLK_RW_ASYNC, HZ/50); 360 congestion_wait(BLK_RW_ASYNC, HZ/50);
344 goto retry; 361 goto retry;
345 } 362 }
346 363
347 XFS_STATS_INC(xb_page_found); 364 XFS_STATS_INC(xb_page_found);
348 365
349 nbytes = min_t(size_t, size, PAGE_CACHE_SIZE - offset); 366 nbytes = min_t(size_t, size, PAGE_SIZE - offset);
350 size -= nbytes; 367 size -= nbytes;
351
352 ASSERT(!PagePrivate(page));
353 if (!PageUptodate(page)) {
354 page_count--;
355 if (blocksize >= PAGE_CACHE_SIZE) {
356 if (flags & XBF_READ)
357 bp->b_flags |= _XBF_PAGE_LOCKED;
358 } else if (!PagePrivate(page)) {
359 if (test_page_region(page, offset, nbytes))
360 page_count++;
361 }
362 }
363
364 bp->b_pages[i] = page; 368 bp->b_pages[i] = page;
365 offset = 0; 369 offset = 0;
366 } 370 }
371 return 0;
367 372
368 if (!(bp->b_flags & _XBF_PAGE_LOCKED)) { 373out_free_pages:
369 for (i = 0; i < bp->b_page_count; i++) 374 for (i = 0; i < bp->b_page_count; i++)
370 unlock_page(bp->b_pages[i]); 375 __free_page(bp->b_pages[i]);
371 }
372
373 if (page_count == bp->b_page_count)
374 bp->b_flags |= XBF_DONE;
375
376 return error; 376 return error;
377} 377}
378 378
379/* 379/*
380 * Map buffer into kernel address-space if nessecary. 380 * Map buffer into kernel address-space if necessary.
381 */ 381 */
382STATIC int 382STATIC int
383_xfs_buf_map_pages( 383_xfs_buf_map_pages(
384 xfs_buf_t *bp, 384 xfs_buf_t *bp,
385 uint flags) 385 uint flags)
386{ 386{
387 /* A single page buffer is always mappable */ 387 ASSERT(bp->b_flags & _XBF_PAGES);
388 if (bp->b_page_count == 1) { 388 if (bp->b_page_count == 1) {
389 /* A single page buffer is always mappable */
389 bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset; 390 bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
390 bp->b_flags |= XBF_MAPPED; 391 bp->b_flags |= XBF_MAPPED;
391 } else if (flags & XBF_MAPPED) { 392 } else if (flags & XBF_MAPPED) {
392 bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count, 393 int retried = 0;
393 -1, PAGE_KERNEL); 394
394 if (unlikely(bp->b_addr == NULL)) 395 do {
396 bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
397 -1, PAGE_KERNEL);
398 if (bp->b_addr)
399 break;
400 vm_unmap_aliases();
401 } while (retried++ <= 1);
402
403 if (!bp->b_addr)
395 return -ENOMEM; 404 return -ENOMEM;
396 bp->b_addr += bp->b_offset; 405 bp->b_addr += bp->b_offset;
397 bp->b_flags |= XBF_MAPPED; 406 bp->b_flags |= XBF_MAPPED;
@@ -422,8 +431,10 @@ _xfs_buf_find(
422{ 431{
423 xfs_off_t range_base; 432 xfs_off_t range_base;
424 size_t range_length; 433 size_t range_length;
425 xfs_bufhash_t *hash; 434 struct xfs_perag *pag;
426 xfs_buf_t *bp, *n; 435 struct rb_node **rbp;
436 struct rb_node *parent;
437 xfs_buf_t *bp;
427 438
428 range_base = (ioff << BBSHIFT); 439 range_base = (ioff << BBSHIFT);
429 range_length = (isize << BBSHIFT); 440 range_length = (isize << BBSHIFT);
@@ -432,14 +443,37 @@ _xfs_buf_find(
432 ASSERT(!(range_length < (1 << btp->bt_sshift))); 443 ASSERT(!(range_length < (1 << btp->bt_sshift)));
433 ASSERT(!(range_base & (xfs_off_t)btp->bt_smask)); 444 ASSERT(!(range_base & (xfs_off_t)btp->bt_smask));
434 445
435 hash = &btp->bt_hash[hash_long((unsigned long)ioff, btp->bt_hashshift)]; 446 /* get tree root */
436 447 pag = xfs_perag_get(btp->bt_mount,
437 spin_lock(&hash->bh_lock); 448 xfs_daddr_to_agno(btp->bt_mount, ioff));
438 449
439 list_for_each_entry_safe(bp, n, &hash->bh_list, b_hash_list) { 450 /* walk tree */
440 ASSERT(btp == bp->b_target); 451 spin_lock(&pag->pag_buf_lock);
441 if (bp->b_file_offset == range_base && 452 rbp = &pag->pag_buf_tree.rb_node;
442 bp->b_buffer_length == range_length) { 453 parent = NULL;
454 bp = NULL;
455 while (*rbp) {
456 parent = *rbp;
457 bp = rb_entry(parent, struct xfs_buf, b_rbnode);
458
459 if (range_base < bp->b_file_offset)
460 rbp = &(*rbp)->rb_left;
461 else if (range_base > bp->b_file_offset)
462 rbp = &(*rbp)->rb_right;
463 else {
464 /*
465 * found a block offset match. If the range doesn't
466 * match, the only way this is allowed is if the buffer
467 * in the cache is stale and the transaction that made
468 * it stale has not yet committed. i.e. we are
469 * reallocating a busy extent. Skip this buffer and
470 * continue searching to the right for an exact match.
471 */
472 if (bp->b_buffer_length != range_length) {
473 ASSERT(bp->b_flags & XBF_STALE);
474 rbp = &(*rbp)->rb_right;
475 continue;
476 }
443 atomic_inc(&bp->b_hold); 477 atomic_inc(&bp->b_hold);
444 goto found; 478 goto found;
445 } 479 }
@@ -449,46 +483,42 @@ _xfs_buf_find(
449 if (new_bp) { 483 if (new_bp) {
450 _xfs_buf_initialize(new_bp, btp, range_base, 484 _xfs_buf_initialize(new_bp, btp, range_base,
451 range_length, flags); 485 range_length, flags);
452 new_bp->b_hash = hash; 486 rb_link_node(&new_bp->b_rbnode, parent, rbp);
453 list_add(&new_bp->b_hash_list, &hash->bh_list); 487 rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree);
488 /* the buffer keeps the perag reference until it is freed */
489 new_bp->b_pag = pag;
490 spin_unlock(&pag->pag_buf_lock);
454 } else { 491 } else {
455 XFS_STATS_INC(xb_miss_locked); 492 XFS_STATS_INC(xb_miss_locked);
493 spin_unlock(&pag->pag_buf_lock);
494 xfs_perag_put(pag);
456 } 495 }
457
458 spin_unlock(&hash->bh_lock);
459 return new_bp; 496 return new_bp;
460 497
461found: 498found:
462 spin_unlock(&hash->bh_lock); 499 spin_unlock(&pag->pag_buf_lock);
500 xfs_perag_put(pag);
463 501
464 /* Attempt to get the semaphore without sleeping, 502 if (xfs_buf_cond_lock(bp)) {
465 * if this does not work then we need to drop the 503 /* failed, so wait for the lock if requested. */
466 * spinlock and do a hard attempt on the semaphore.
467 */
468 if (down_trylock(&bp->b_sema)) {
469 if (!(flags & XBF_TRYLOCK)) { 504 if (!(flags & XBF_TRYLOCK)) {
470 /* wait for buffer ownership */
471 xfs_buf_lock(bp); 505 xfs_buf_lock(bp);
472 XFS_STATS_INC(xb_get_locked_waited); 506 XFS_STATS_INC(xb_get_locked_waited);
473 } else { 507 } else {
474 /* We asked for a trylock and failed, no need
475 * to look at file offset and length here, we
476 * know that this buffer at least overlaps our
477 * buffer and is locked, therefore our buffer
478 * either does not exist, or is this buffer.
479 */
480 xfs_buf_rele(bp); 508 xfs_buf_rele(bp);
481 XFS_STATS_INC(xb_busy_locked); 509 XFS_STATS_INC(xb_busy_locked);
482 return NULL; 510 return NULL;
483 } 511 }
484 } else {
485 /* trylock worked */
486 XB_SET_OWNER(bp);
487 } 512 }
488 513
514 /*
515 * if the buffer is stale, clear all the external state associated with
516 * it. We need to keep flags such as how we allocated the buffer memory
517 * intact here.
518 */
489 if (bp->b_flags & XBF_STALE) { 519 if (bp->b_flags & XBF_STALE) {
490 ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); 520 ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
491 bp->b_flags &= XBF_MAPPED; 521 bp->b_flags &= XBF_MAPPED | _XBF_KMEM | _XBF_PAGES;
492 } 522 }
493 523
494 trace_xfs_buf_find(bp, flags, _RET_IP_); 524 trace_xfs_buf_find(bp, flags, _RET_IP_);
@@ -509,7 +539,7 @@ xfs_buf_get(
509 xfs_buf_flags_t flags) 539 xfs_buf_flags_t flags)
510{ 540{
511 xfs_buf_t *bp, *new_bp; 541 xfs_buf_t *bp, *new_bp;
512 int error = 0, i; 542 int error = 0;
513 543
514 new_bp = xfs_buf_allocate(flags); 544 new_bp = xfs_buf_allocate(flags);
515 if (unlikely(!new_bp)) 545 if (unlikely(!new_bp))
@@ -517,7 +547,7 @@ xfs_buf_get(
517 547
518 bp = _xfs_buf_find(target, ioff, isize, flags, new_bp); 548 bp = _xfs_buf_find(target, ioff, isize, flags, new_bp);
519 if (bp == new_bp) { 549 if (bp == new_bp) {
520 error = _xfs_buf_lookup_pages(bp, flags); 550 error = xfs_buf_allocate_memory(bp, flags);
521 if (error) 551 if (error)
522 goto no_buffer; 552 goto no_buffer;
523 } else { 553 } else {
@@ -526,14 +556,11 @@ xfs_buf_get(
526 return NULL; 556 return NULL;
527 } 557 }
528 558
529 for (i = 0; i < bp->b_page_count; i++)
530 mark_page_accessed(bp->b_pages[i]);
531
532 if (!(bp->b_flags & XBF_MAPPED)) { 559 if (!(bp->b_flags & XBF_MAPPED)) {
533 error = _xfs_buf_map_pages(bp, flags); 560 error = _xfs_buf_map_pages(bp, flags);
534 if (unlikely(error)) { 561 if (unlikely(error)) {
535 printk(KERN_WARNING "%s: failed to map pages\n", 562 xfs_warn(target->bt_mount,
536 __func__); 563 "%s: failed to map pages\n", __func__);
537 goto no_buffer; 564 goto no_buffer;
538 } 565 }
539 } 566 }
@@ -625,17 +652,47 @@ void
625xfs_buf_readahead( 652xfs_buf_readahead(
626 xfs_buftarg_t *target, 653 xfs_buftarg_t *target,
627 xfs_off_t ioff, 654 xfs_off_t ioff,
628 size_t isize, 655 size_t isize)
629 xfs_buf_flags_t flags)
630{ 656{
631 struct backing_dev_info *bdi; 657 if (bdi_read_congested(target->bt_bdi))
632
633 bdi = target->bt_mapping->backing_dev_info;
634 if (bdi_read_congested(bdi))
635 return; 658 return;
636 659
637 flags |= (XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD); 660 xfs_buf_read(target, ioff, isize,
638 xfs_buf_read(target, ioff, isize, flags); 661 XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD|XBF_DONT_BLOCK);
662}
663
664/*
665 * Read an uncached buffer from disk. Allocates and returns a locked
666 * buffer containing the disk contents or nothing.
667 */
668struct xfs_buf *
669xfs_buf_read_uncached(
670 struct xfs_mount *mp,
671 struct xfs_buftarg *target,
672 xfs_daddr_t daddr,
673 size_t length,
674 int flags)
675{
676 xfs_buf_t *bp;
677 int error;
678
679 bp = xfs_buf_get_uncached(target, length, flags);
680 if (!bp)
681 return NULL;
682
683 /* set up the buffer for a read IO */
684 xfs_buf_lock(bp);
685 XFS_BUF_SET_ADDR(bp, daddr);
686 XFS_BUF_READ(bp);
687 XFS_BUF_BUSY(bp);
688
689 xfsbdstrat(mp, bp);
690 error = xfs_buf_iowait(bp);
691 if (error || bp->b_error) {
692 xfs_buf_relse(bp);
693 return NULL;
694 }
695 return bp;
639} 696}
640 697
641xfs_buf_t * 698xfs_buf_t *
@@ -651,6 +708,27 @@ xfs_buf_get_empty(
651 return bp; 708 return bp;
652} 709}
653 710
711/*
712 * Return a buffer allocated as an empty buffer and associated to external
713 * memory via xfs_buf_associate_memory() back to it's empty state.
714 */
715void
716xfs_buf_set_empty(
717 struct xfs_buf *bp,
718 size_t len)
719{
720 if (bp->b_pages)
721 _xfs_buf_free_pages(bp);
722
723 bp->b_pages = NULL;
724 bp->b_page_count = 0;
725 bp->b_addr = NULL;
726 bp->b_file_offset = 0;
727 bp->b_buffer_length = bp->b_count_desired = len;
728 bp->b_bn = XFS_BUF_DADDR_NULL;
729 bp->b_flags &= ~XBF_MAPPED;
730}
731
654static inline struct page * 732static inline struct page *
655mem_to_page( 733mem_to_page(
656 void *addr) 734 void *addr)
@@ -675,10 +753,10 @@ xfs_buf_associate_memory(
675 size_t buflen; 753 size_t buflen;
676 int page_count; 754 int page_count;
677 755
678 pageaddr = (unsigned long)mem & PAGE_CACHE_MASK; 756 pageaddr = (unsigned long)mem & PAGE_MASK;
679 offset = (unsigned long)mem - pageaddr; 757 offset = (unsigned long)mem - pageaddr;
680 buflen = PAGE_CACHE_ALIGN(len + offset); 758 buflen = PAGE_ALIGN(len + offset);
681 page_count = buflen >> PAGE_CACHE_SHIFT; 759 page_count = buflen >> PAGE_SHIFT;
682 760
683 /* Free any previous set of page pointers */ 761 /* Free any previous set of page pointers */
684 if (bp->b_pages) 762 if (bp->b_pages)
@@ -695,21 +773,21 @@ xfs_buf_associate_memory(
695 773
696 for (i = 0; i < bp->b_page_count; i++) { 774 for (i = 0; i < bp->b_page_count; i++) {
697 bp->b_pages[i] = mem_to_page((void *)pageaddr); 775 bp->b_pages[i] = mem_to_page((void *)pageaddr);
698 pageaddr += PAGE_CACHE_SIZE; 776 pageaddr += PAGE_SIZE;
699 } 777 }
700 778
701 bp->b_count_desired = len; 779 bp->b_count_desired = len;
702 bp->b_buffer_length = buflen; 780 bp->b_buffer_length = buflen;
703 bp->b_flags |= XBF_MAPPED; 781 bp->b_flags |= XBF_MAPPED;
704 bp->b_flags &= ~_XBF_PAGE_LOCKED;
705 782
706 return 0; 783 return 0;
707} 784}
708 785
709xfs_buf_t * 786xfs_buf_t *
710xfs_buf_get_noaddr( 787xfs_buf_get_uncached(
788 struct xfs_buftarg *target,
711 size_t len, 789 size_t len,
712 xfs_buftarg_t *target) 790 int flags)
713{ 791{
714 unsigned long page_count = PAGE_ALIGN(len) >> PAGE_SHIFT; 792 unsigned long page_count = PAGE_ALIGN(len) >> PAGE_SHIFT;
715 int error, i; 793 int error, i;
@@ -725,7 +803,7 @@ xfs_buf_get_noaddr(
725 goto fail_free_buf; 803 goto fail_free_buf;
726 804
727 for (i = 0; i < page_count; i++) { 805 for (i = 0; i < page_count; i++) {
728 bp->b_pages[i] = alloc_page(GFP_KERNEL); 806 bp->b_pages[i] = alloc_page(xb_to_gfp(flags));
729 if (!bp->b_pages[i]) 807 if (!bp->b_pages[i])
730 goto fail_free_mem; 808 goto fail_free_mem;
731 } 809 }
@@ -733,14 +811,14 @@ xfs_buf_get_noaddr(
733 811
734 error = _xfs_buf_map_pages(bp, XBF_MAPPED); 812 error = _xfs_buf_map_pages(bp, XBF_MAPPED);
735 if (unlikely(error)) { 813 if (unlikely(error)) {
736 printk(KERN_WARNING "%s: failed to map pages\n", 814 xfs_warn(target->bt_mount,
737 __func__); 815 "%s: failed to map pages\n", __func__);
738 goto fail_free_mem; 816 goto fail_free_mem;
739 } 817 }
740 818
741 xfs_buf_unlock(bp); 819 xfs_buf_unlock(bp);
742 820
743 trace_xfs_buf_get_noaddr(bp, _RET_IP_); 821 trace_xfs_buf_get_uncached(bp, _RET_IP_);
744 return bp; 822 return bp;
745 823
746 fail_free_mem: 824 fail_free_mem:
@@ -774,29 +852,32 @@ void
774xfs_buf_rele( 852xfs_buf_rele(
775 xfs_buf_t *bp) 853 xfs_buf_t *bp)
776{ 854{
777 xfs_bufhash_t *hash = bp->b_hash; 855 struct xfs_perag *pag = bp->b_pag;
778 856
779 trace_xfs_buf_rele(bp, _RET_IP_); 857 trace_xfs_buf_rele(bp, _RET_IP_);
780 858
781 if (unlikely(!hash)) { 859 if (!pag) {
782 ASSERT(!bp->b_relse); 860 ASSERT(list_empty(&bp->b_lru));
861 ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
783 if (atomic_dec_and_test(&bp->b_hold)) 862 if (atomic_dec_and_test(&bp->b_hold))
784 xfs_buf_free(bp); 863 xfs_buf_free(bp);
785 return; 864 return;
786 } 865 }
787 866
867 ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));
868
788 ASSERT(atomic_read(&bp->b_hold) > 0); 869 ASSERT(atomic_read(&bp->b_hold) > 0);
789 if (atomic_dec_and_lock(&bp->b_hold, &hash->bh_lock)) { 870 if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
790 if (bp->b_relse) { 871 if (!(bp->b_flags & XBF_STALE) &&
791 atomic_inc(&bp->b_hold); 872 atomic_read(&bp->b_lru_ref)) {
792 spin_unlock(&hash->bh_lock); 873 xfs_buf_lru_add(bp);
793 (*(bp->b_relse)) (bp); 874 spin_unlock(&pag->pag_buf_lock);
794 } else if (bp->b_flags & XBF_FS_MANAGED) {
795 spin_unlock(&hash->bh_lock);
796 } else { 875 } else {
876 xfs_buf_lru_del(bp);
797 ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q))); 877 ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)));
798 list_del_init(&bp->b_hash_list); 878 rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
799 spin_unlock(&hash->bh_lock); 879 spin_unlock(&pag->pag_buf_lock);
880 xfs_perag_put(pag);
800 xfs_buf_free(bp); 881 xfs_buf_free(bp);
801 } 882 }
802 } 883 }
@@ -804,20 +885,15 @@ xfs_buf_rele(
804 885
805 886
806/* 887/*
807 * Mutual exclusion on buffers. Locking model: 888 * Lock a buffer object, if it is not already locked.
808 * 889 *
809 * Buffers associated with inodes for which buffer locking 890 * If we come across a stale, pinned, locked buffer, we know that we are
810 * is not enabled are not protected by semaphores, and are 891 * being asked to lock a buffer that has been reallocated. Because it is
811 * assumed to be exclusively owned by the caller. There is a 892 * pinned, we know that the log has not been pushed to disk and hence it
812 * spinlock in the buffer, used by the caller when concurrent 893 * will still be locked. Rather than continuing to have trylock attempts
813 * access is possible. 894 * fail until someone else pushes the log, push it ourselves before
814 */ 895 * returning. This means that the xfsaild will not get stuck trying
815 896 * to push on stale inode buffers.
816/*
817 * Locks a buffer object, if it is not already locked.
818 * Note that this in no way locks the underlying pages, so it is only
819 * useful for synchronizing concurrent use of buffer objects, not for
820 * synchronizing independent access to the underlying pages.
821 */ 897 */
822int 898int
823xfs_buf_cond_lock( 899xfs_buf_cond_lock(
@@ -828,6 +904,8 @@ xfs_buf_cond_lock(
828 locked = down_trylock(&bp->b_sema) == 0; 904 locked = down_trylock(&bp->b_sema) == 0;
829 if (locked) 905 if (locked)
830 XB_SET_OWNER(bp); 906 XB_SET_OWNER(bp);
907 else if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
908 xfs_log_force(bp->b_target->bt_mount, 0);
831 909
832 trace_xfs_buf_cond_lock(bp, _RET_IP_); 910 trace_xfs_buf_cond_lock(bp, _RET_IP_);
833 return locked ? 0 : -EBUSY; 911 return locked ? 0 : -EBUSY;
@@ -841,10 +919,7 @@ xfs_buf_lock_value(
841} 919}
842 920
843/* 921/*
844 * Locks a buffer object. 922 * Lock a buffer object.
845 * Note that this in no way locks the underlying pages, so it is only
846 * useful for synchronizing concurrent use of buffer objects, not for
847 * synchronizing independent access to the underlying pages.
848 * 923 *
849 * If we come across a stale, pinned, locked buffer, we know that we 924 * If we come across a stale, pinned, locked buffer, we know that we
850 * are being asked to lock a buffer that has been reallocated. Because 925 * are being asked to lock a buffer that has been reallocated. Because
@@ -859,9 +934,7 @@ xfs_buf_lock(
859 trace_xfs_buf_lock(bp, _RET_IP_); 934 trace_xfs_buf_lock(bp, _RET_IP_);
860 935
861 if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE)) 936 if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
862 xfs_log_force(bp->b_mount, 0); 937 xfs_log_force(bp->b_target->bt_mount, 0);
863 if (atomic_read(&bp->b_io_remaining))
864 blk_run_address_space(bp->b_target->bt_mapping);
865 down(&bp->b_sema); 938 down(&bp->b_sema);
866 XB_SET_OWNER(bp); 939 XB_SET_OWNER(bp);
867 940
@@ -905,9 +978,7 @@ xfs_buf_wait_unpin(
905 set_current_state(TASK_UNINTERRUPTIBLE); 978 set_current_state(TASK_UNINTERRUPTIBLE);
906 if (atomic_read(&bp->b_pin_count) == 0) 979 if (atomic_read(&bp->b_pin_count) == 0)
907 break; 980 break;
908 if (atomic_read(&bp->b_io_remaining)) 981 io_schedule();
909 blk_run_address_space(bp->b_target->bt_mapping);
910 schedule();
911 } 982 }
912 remove_wait_queue(&bp->b_waiters, &wait); 983 remove_wait_queue(&bp->b_waiters, &wait);
913 set_current_state(TASK_RUNNING); 984 set_current_state(TASK_RUNNING);
@@ -924,19 +995,7 @@ xfs_buf_iodone_work(
924 xfs_buf_t *bp = 995 xfs_buf_t *bp =
925 container_of(work, xfs_buf_t, b_iodone_work); 996 container_of(work, xfs_buf_t, b_iodone_work);
926 997
927 /* 998 if (bp->b_iodone)
928 * We can get an EOPNOTSUPP to ordered writes. Here we clear the
929 * ordered flag and reissue them. Because we can't tell the higher
930 * layers directly that they should not issue ordered I/O anymore, they
931 * need to check if the _XFS_BARRIER_FAILED flag was set during I/O completion.
932 */
933 if ((bp->b_error == EOPNOTSUPP) &&
934 (bp->b_flags & (XBF_ORDERED|XBF_ASYNC)) == (XBF_ORDERED|XBF_ASYNC)) {
935 trace_xfs_buf_ordered_retry(bp, _RET_IP_);
936 bp->b_flags &= ~XBF_ORDERED;
937 bp->b_flags |= _XFS_BARRIER_FAILED;
938 xfs_buf_iorequest(bp);
939 } else if (bp->b_iodone)
940 (*(bp->b_iodone))(bp); 999 (*(bp->b_iodone))(bp);
941 else if (bp->b_flags & XBF_ASYNC) 1000 else if (bp->b_flags & XBF_ASYNC)
942 xfs_buf_relse(bp); 1001 xfs_buf_relse(bp);
@@ -982,7 +1041,6 @@ xfs_bwrite(
982{ 1041{
983 int error; 1042 int error;
984 1043
985 bp->b_mount = mp;
986 bp->b_flags |= XBF_WRITE; 1044 bp->b_flags |= XBF_WRITE;
987 bp->b_flags &= ~(XBF_ASYNC | XBF_READ); 1045 bp->b_flags &= ~(XBF_ASYNC | XBF_READ);
988 1046
@@ -1003,8 +1061,6 @@ xfs_bdwrite(
1003{ 1061{
1004 trace_xfs_buf_bdwrite(bp, _RET_IP_); 1062 trace_xfs_buf_bdwrite(bp, _RET_IP_);
1005 1063
1006 bp->b_mount = mp;
1007
1008 bp->b_flags &= ~XBF_READ; 1064 bp->b_flags &= ~XBF_READ;
1009 bp->b_flags |= (XBF_DELWRI | XBF_ASYNC); 1065 bp->b_flags |= (XBF_DELWRI | XBF_ASYNC);
1010 1066
@@ -1013,7 +1069,7 @@ xfs_bdwrite(
1013 1069
1014/* 1070/*
1015 * Called when we want to stop a buffer from getting written or read. 1071 * Called when we want to stop a buffer from getting written or read.
1016 * We attach the EIO error, muck with its flags, and call biodone 1072 * We attach the EIO error, muck with its flags, and call xfs_buf_ioend
1017 * so that the proper iodone callbacks get called. 1073 * so that the proper iodone callbacks get called.
1018 */ 1074 */
1019STATIC int 1075STATIC int
@@ -1030,21 +1086,21 @@ xfs_bioerror(
1030 XFS_BUF_ERROR(bp, EIO); 1086 XFS_BUF_ERROR(bp, EIO);
1031 1087
1032 /* 1088 /*
1033 * We're calling biodone, so delete XBF_DONE flag. 1089 * We're calling xfs_buf_ioend, so delete XBF_DONE flag.
1034 */ 1090 */
1035 XFS_BUF_UNREAD(bp); 1091 XFS_BUF_UNREAD(bp);
1036 XFS_BUF_UNDELAYWRITE(bp); 1092 XFS_BUF_UNDELAYWRITE(bp);
1037 XFS_BUF_UNDONE(bp); 1093 XFS_BUF_UNDONE(bp);
1038 XFS_BUF_STALE(bp); 1094 XFS_BUF_STALE(bp);
1039 1095
1040 xfs_biodone(bp); 1096 xfs_buf_ioend(bp, 0);
1041 1097
1042 return EIO; 1098 return EIO;
1043} 1099}
1044 1100
1045/* 1101/*
1046 * Same as xfs_bioerror, except that we are releasing the buffer 1102 * Same as xfs_bioerror, except that we are releasing the buffer
1047 * here ourselves, and avoiding the biodone call. 1103 * here ourselves, and avoiding the xfs_buf_ioend call.
1048 * This is meant for userdata errors; metadata bufs come with 1104 * This is meant for userdata errors; metadata bufs come with
1049 * iodone functions attached, so that we can track down errors. 1105 * iodone functions attached, so that we can track down errors.
1050 */ 1106 */
@@ -1093,7 +1149,7 @@ int
1093xfs_bdstrat_cb( 1149xfs_bdstrat_cb(
1094 struct xfs_buf *bp) 1150 struct xfs_buf *bp)
1095{ 1151{
1096 if (XFS_FORCED_SHUTDOWN(bp->b_mount)) { 1152 if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
1097 trace_xfs_bdstrat_shut(bp, _RET_IP_); 1153 trace_xfs_bdstrat_shut(bp, _RET_IP_);
1098 /* 1154 /*
1099 * Metadata write that didn't get logged but 1155 * Metadata write that didn't get logged but
@@ -1134,10 +1190,8 @@ _xfs_buf_ioend(
1134 xfs_buf_t *bp, 1190 xfs_buf_t *bp,
1135 int schedule) 1191 int schedule)
1136{ 1192{
1137 if (atomic_dec_and_test(&bp->b_io_remaining) == 1) { 1193 if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
1138 bp->b_flags &= ~_XBF_PAGE_LOCKED;
1139 xfs_buf_ioend(bp, schedule); 1194 xfs_buf_ioend(bp, schedule);
1140 }
1141} 1195}
1142 1196
1143STATIC void 1197STATIC void
@@ -1146,35 +1200,12 @@ xfs_buf_bio_end_io(
1146 int error) 1200 int error)
1147{ 1201{
1148 xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private; 1202 xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private;
1149 unsigned int blocksize = bp->b_target->bt_bsize;
1150 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1151 1203
1152 xfs_buf_ioerror(bp, -error); 1204 xfs_buf_ioerror(bp, -error);
1153 1205
1154 if (!error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ)) 1206 if (!error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
1155 invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp)); 1207 invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
1156 1208
1157 do {
1158 struct page *page = bvec->bv_page;
1159
1160 ASSERT(!PagePrivate(page));
1161 if (unlikely(bp->b_error)) {
1162 if (bp->b_flags & XBF_READ)
1163 ClearPageUptodate(page);
1164 } else if (blocksize >= PAGE_CACHE_SIZE) {
1165 SetPageUptodate(page);
1166 } else if (!PagePrivate(page) &&
1167 (bp->b_flags & _XBF_PAGE_CACHE)) {
1168 set_page_region(page, bvec->bv_offset, bvec->bv_len);
1169 }
1170
1171 if (--bvec >= bio->bi_io_vec)
1172 prefetchw(&bvec->bv_page->flags);
1173
1174 if (bp->b_flags & _XBF_PAGE_LOCKED)
1175 unlock_page(page);
1176 } while (bvec >= bio->bi_io_vec);
1177
1178 _xfs_buf_ioend(bp, 1); 1209 _xfs_buf_ioend(bp, 1);
1179 bio_put(bio); 1210 bio_put(bio);
1180} 1211}
@@ -1188,14 +1219,13 @@ _xfs_buf_ioapply(
1188 int offset = bp->b_offset; 1219 int offset = bp->b_offset;
1189 int size = bp->b_count_desired; 1220 int size = bp->b_count_desired;
1190 sector_t sector = bp->b_bn; 1221 sector_t sector = bp->b_bn;
1191 unsigned int blocksize = bp->b_target->bt_bsize;
1192 1222
1193 total_nr_pages = bp->b_page_count; 1223 total_nr_pages = bp->b_page_count;
1194 map_i = 0; 1224 map_i = 0;
1195 1225
1196 if (bp->b_flags & XBF_ORDERED) { 1226 if (bp->b_flags & XBF_ORDERED) {
1197 ASSERT(!(bp->b_flags & XBF_READ)); 1227 ASSERT(!(bp->b_flags & XBF_READ));
1198 rw = WRITE_BARRIER; 1228 rw = WRITE_FLUSH_FUA;
1199 } else if (bp->b_flags & XBF_LOG_BUFFER) { 1229 } else if (bp->b_flags & XBF_LOG_BUFFER) {
1200 ASSERT(!(bp->b_flags & XBF_READ_AHEAD)); 1230 ASSERT(!(bp->b_flags & XBF_READ_AHEAD));
1201 bp->b_flags &= ~_XBF_RUN_QUEUES; 1231 bp->b_flags &= ~_XBF_RUN_QUEUES;
@@ -1209,29 +1239,6 @@ _xfs_buf_ioapply(
1209 (bp->b_flags & XBF_READ_AHEAD) ? READA : READ; 1239 (bp->b_flags & XBF_READ_AHEAD) ? READA : READ;
1210 } 1240 }
1211 1241
1212 /* Special code path for reading a sub page size buffer in --
1213 * we populate up the whole page, and hence the other metadata
1214 * in the same page. This optimization is only valid when the
1215 * filesystem block size is not smaller than the page size.
1216 */
1217 if ((bp->b_buffer_length < PAGE_CACHE_SIZE) &&
1218 ((bp->b_flags & (XBF_READ|_XBF_PAGE_LOCKED)) ==
1219 (XBF_READ|_XBF_PAGE_LOCKED)) &&
1220 (blocksize >= PAGE_CACHE_SIZE)) {
1221 bio = bio_alloc(GFP_NOIO, 1);
1222
1223 bio->bi_bdev = bp->b_target->bt_bdev;
1224 bio->bi_sector = sector - (offset >> BBSHIFT);
1225 bio->bi_end_io = xfs_buf_bio_end_io;
1226 bio->bi_private = bp;
1227
1228 bio_add_page(bio, bp->b_pages[0], PAGE_CACHE_SIZE, 0);
1229 size = 0;
1230
1231 atomic_inc(&bp->b_io_remaining);
1232
1233 goto submit_io;
1234 }
1235 1242
1236next_chunk: 1243next_chunk:
1237 atomic_inc(&bp->b_io_remaining); 1244 atomic_inc(&bp->b_io_remaining);
@@ -1245,8 +1252,9 @@ next_chunk:
1245 bio->bi_end_io = xfs_buf_bio_end_io; 1252 bio->bi_end_io = xfs_buf_bio_end_io;
1246 bio->bi_private = bp; 1253 bio->bi_private = bp;
1247 1254
1255
1248 for (; size && nr_pages; nr_pages--, map_i++) { 1256 for (; size && nr_pages; nr_pages--, map_i++) {
1249 int rbytes, nbytes = PAGE_CACHE_SIZE - offset; 1257 int rbytes, nbytes = PAGE_SIZE - offset;
1250 1258
1251 if (nbytes > size) 1259 if (nbytes > size)
1252 nbytes = size; 1260 nbytes = size;
@@ -1261,7 +1269,6 @@ next_chunk:
1261 total_nr_pages--; 1269 total_nr_pages--;
1262 } 1270 }
1263 1271
1264submit_io:
1265 if (likely(bio->bi_size)) { 1272 if (likely(bio->bi_size)) {
1266 if (xfs_buf_is_vmapped(bp)) { 1273 if (xfs_buf_is_vmapped(bp)) {
1267 flush_kernel_vmap_range(bp->b_addr, 1274 flush_kernel_vmap_range(bp->b_addr,
@@ -1271,18 +1278,7 @@ submit_io:
1271 if (size) 1278 if (size)
1272 goto next_chunk; 1279 goto next_chunk;
1273 } else { 1280 } else {
1274 /*
1275 * if we get here, no pages were added to the bio. However,
1276 * we can't just error out here - if the pages are locked then
1277 * we have to unlock them otherwise we can hang on a later
1278 * access to the page.
1279 */
1280 xfs_buf_ioerror(bp, EIO); 1281 xfs_buf_ioerror(bp, EIO);
1281 if (bp->b_flags & _XBF_PAGE_LOCKED) {
1282 int i;
1283 for (i = 0; i < bp->b_page_count; i++)
1284 unlock_page(bp->b_pages[i]);
1285 }
1286 bio_put(bio); 1282 bio_put(bio);
1287 } 1283 }
1288} 1284}
@@ -1327,8 +1323,6 @@ xfs_buf_iowait(
1327{ 1323{
1328 trace_xfs_buf_iowait(bp, _RET_IP_); 1324 trace_xfs_buf_iowait(bp, _RET_IP_);
1329 1325
1330 if (atomic_read(&bp->b_io_remaining))
1331 blk_run_address_space(bp->b_target->bt_mapping);
1332 wait_for_completion(&bp->b_iowait); 1326 wait_for_completion(&bp->b_iowait);
1333 1327
1334 trace_xfs_buf_iowait_done(bp, _RET_IP_); 1328 trace_xfs_buf_iowait_done(bp, _RET_IP_);
@@ -1346,8 +1340,8 @@ xfs_buf_offset(
1346 return XFS_BUF_PTR(bp) + offset; 1340 return XFS_BUF_PTR(bp) + offset;
1347 1341
1348 offset += bp->b_offset; 1342 offset += bp->b_offset;
1349 page = bp->b_pages[offset >> PAGE_CACHE_SHIFT]; 1343 page = bp->b_pages[offset >> PAGE_SHIFT];
1350 return (xfs_caddr_t)page_address(page) + (offset & (PAGE_CACHE_SIZE-1)); 1344 return (xfs_caddr_t)page_address(page) + (offset & (PAGE_SIZE-1));
1351} 1345}
1352 1346
1353/* 1347/*
@@ -1369,9 +1363,9 @@ xfs_buf_iomove(
1369 page = bp->b_pages[xfs_buf_btoct(boff + bp->b_offset)]; 1363 page = bp->b_pages[xfs_buf_btoct(boff + bp->b_offset)];
1370 cpoff = xfs_buf_poff(boff + bp->b_offset); 1364 cpoff = xfs_buf_poff(boff + bp->b_offset);
1371 csize = min_t(size_t, 1365 csize = min_t(size_t,
1372 PAGE_CACHE_SIZE-cpoff, bp->b_count_desired-boff); 1366 PAGE_SIZE-cpoff, bp->b_count_desired-boff);
1373 1367
1374 ASSERT(((csize + cpoff) <= PAGE_CACHE_SIZE)); 1368 ASSERT(((csize + cpoff) <= PAGE_SIZE));
1375 1369
1376 switch (mode) { 1370 switch (mode) {
1377 case XBRW_ZERO: 1371 case XBRW_ZERO:
@@ -1394,89 +1388,84 @@ xfs_buf_iomove(
1394 */ 1388 */
1395 1389
1396/* 1390/*
1397 * Wait for any bufs with callbacks that have been submitted but 1391 * Wait for any bufs with callbacks that have been submitted but have not yet
1398 * have not yet returned... walk the hash list for the target. 1392 * returned. These buffers will have an elevated hold count, so wait on those
1393 * while freeing all the buffers only held by the LRU.
1399 */ 1394 */
1400void 1395void
1401xfs_wait_buftarg( 1396xfs_wait_buftarg(
1402 xfs_buftarg_t *btp) 1397 struct xfs_buftarg *btp)
1403{ 1398{
1404 xfs_buf_t *bp, *n; 1399 struct xfs_buf *bp;
1405 xfs_bufhash_t *hash; 1400
1406 uint i; 1401restart:
1407 1402 spin_lock(&btp->bt_lru_lock);
1408 for (i = 0; i < (1 << btp->bt_hashshift); i++) { 1403 while (!list_empty(&btp->bt_lru)) {
1409 hash = &btp->bt_hash[i]; 1404 bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
1410again: 1405 if (atomic_read(&bp->b_hold) > 1) {
1411 spin_lock(&hash->bh_lock); 1406 spin_unlock(&btp->bt_lru_lock);
1412 list_for_each_entry_safe(bp, n, &hash->bh_list, b_hash_list) { 1407 delay(100);
1413 ASSERT(btp == bp->b_target); 1408 goto restart;
1414 if (!(bp->b_flags & XBF_FS_MANAGED)) {
1415 spin_unlock(&hash->bh_lock);
1416 /*
1417 * Catch superblock reference count leaks
1418 * immediately
1419 */
1420 BUG_ON(bp->b_bn == 0);
1421 delay(100);
1422 goto again;
1423 }
1424 } 1409 }
1425 spin_unlock(&hash->bh_lock); 1410 /*
1411 * clear the LRU reference count so the buffer doesn't get
1412 * ignored in xfs_buf_rele().
1413 */
1414 atomic_set(&bp->b_lru_ref, 0);
1415 spin_unlock(&btp->bt_lru_lock);
1416 xfs_buf_rele(bp);
1417 spin_lock(&btp->bt_lru_lock);
1426 } 1418 }
1419 spin_unlock(&btp->bt_lru_lock);
1427} 1420}
1428 1421
1429/* 1422int
1430 * Allocate buffer hash table for a given target. 1423xfs_buftarg_shrink(
1431 * For devices containing metadata (i.e. not the log/realtime devices) 1424 struct shrinker *shrink,
1432 * we need to allocate a much larger hash table. 1425 struct shrink_control *sc)
1433 */
1434STATIC void
1435xfs_alloc_bufhash(
1436 xfs_buftarg_t *btp,
1437 int external)
1438{ 1426{
1439 unsigned int i; 1427 struct xfs_buftarg *btp = container_of(shrink,
1428 struct xfs_buftarg, bt_shrinker);
1429 struct xfs_buf *bp;
1430 int nr_to_scan = sc->nr_to_scan;
1431 LIST_HEAD(dispose);
1440 1432
1441 btp->bt_hashshift = external ? 3 : 12; /* 8 or 4096 buckets */ 1433 if (!nr_to_scan)
1442 btp->bt_hash = kmem_zalloc_large((1 << btp->bt_hashshift) * 1434 return btp->bt_lru_nr;
1443 sizeof(xfs_bufhash_t));
1444 for (i = 0; i < (1 << btp->bt_hashshift); i++) {
1445 spin_lock_init(&btp->bt_hash[i].bh_lock);
1446 INIT_LIST_HEAD(&btp->bt_hash[i].bh_list);
1447 }
1448}
1449 1435
1450STATIC void 1436 spin_lock(&btp->bt_lru_lock);
1451xfs_free_bufhash( 1437 while (!list_empty(&btp->bt_lru)) {
1452 xfs_buftarg_t *btp) 1438 if (nr_to_scan-- <= 0)
1453{ 1439 break;
1454 kmem_free_large(btp->bt_hash);
1455 btp->bt_hash = NULL;
1456}
1457 1440
1458/* 1441 bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
1459 * buftarg list for delwrite queue processing
1460 */
1461static LIST_HEAD(xfs_buftarg_list);
1462static DEFINE_SPINLOCK(xfs_buftarg_lock);
1463 1442
1464STATIC void 1443 /*
1465xfs_register_buftarg( 1444 * Decrement the b_lru_ref count unless the value is already
1466 xfs_buftarg_t *btp) 1445 * zero. If the value is already zero, we need to reclaim the
1467{ 1446 * buffer, otherwise it gets another trip through the LRU.
1468 spin_lock(&xfs_buftarg_lock); 1447 */
1469 list_add(&btp->bt_list, &xfs_buftarg_list); 1448 if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
1470 spin_unlock(&xfs_buftarg_lock); 1449 list_move_tail(&bp->b_lru, &btp->bt_lru);
1471} 1450 continue;
1451 }
1472 1452
1473STATIC void 1453 /*
1474xfs_unregister_buftarg( 1454 * remove the buffer from the LRU now to avoid needing another
1475 xfs_buftarg_t *btp) 1455 * lock round trip inside xfs_buf_rele().
1476{ 1456 */
1477 spin_lock(&xfs_buftarg_lock); 1457 list_move(&bp->b_lru, &dispose);
1478 list_del(&btp->bt_list); 1458 btp->bt_lru_nr--;
1479 spin_unlock(&xfs_buftarg_lock); 1459 }
1460 spin_unlock(&btp->bt_lru_lock);
1461
1462 while (!list_empty(&dispose)) {
1463 bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
1464 list_del_init(&bp->b_lru);
1465 xfs_buf_rele(bp);
1466 }
1467
1468 return btp->bt_lru_nr;
1480} 1469}
1481 1470
1482void 1471void
@@ -1484,18 +1473,13 @@ xfs_free_buftarg(
1484 struct xfs_mount *mp, 1473 struct xfs_mount *mp,
1485 struct xfs_buftarg *btp) 1474 struct xfs_buftarg *btp)
1486{ 1475{
1476 unregister_shrinker(&btp->bt_shrinker);
1477
1487 xfs_flush_buftarg(btp, 1); 1478 xfs_flush_buftarg(btp, 1);
1488 if (mp->m_flags & XFS_MOUNT_BARRIER) 1479 if (mp->m_flags & XFS_MOUNT_BARRIER)
1489 xfs_blkdev_issue_flush(btp); 1480 xfs_blkdev_issue_flush(btp);
1490 xfs_free_bufhash(btp);
1491 iput(btp->bt_mapping->host);
1492 1481
1493 /* Unregister the buftarg first so that we don't get a
1494 * wakeup finding a non-existent task
1495 */
1496 xfs_unregister_buftarg(btp);
1497 kthread_stop(btp->bt_task); 1482 kthread_stop(btp->bt_task);
1498
1499 kmem_free(btp); 1483 kmem_free(btp);
1500} 1484}
1501 1485
@@ -1511,21 +1495,12 @@ xfs_setsize_buftarg_flags(
1511 btp->bt_smask = sectorsize - 1; 1495 btp->bt_smask = sectorsize - 1;
1512 1496
1513 if (set_blocksize(btp->bt_bdev, sectorsize)) { 1497 if (set_blocksize(btp->bt_bdev, sectorsize)) {
1514 printk(KERN_WARNING 1498 xfs_warn(btp->bt_mount,
1515 "XFS: Cannot set_blocksize to %u on device %s\n", 1499 "Cannot set_blocksize to %u on device %s\n",
1516 sectorsize, XFS_BUFTARG_NAME(btp)); 1500 sectorsize, XFS_BUFTARG_NAME(btp));
1517 return EINVAL; 1501 return EINVAL;
1518 } 1502 }
1519 1503
1520 if (verbose &&
1521 (PAGE_CACHE_SIZE / BITS_PER_LONG) > sectorsize) {
1522 printk(KERN_WARNING
1523 "XFS: %u byte sectors in use on device %s. "
1524 "This is suboptimal; %u or greater is ideal.\n",
1525 sectorsize, XFS_BUFTARG_NAME(btp),
1526 (unsigned int)PAGE_CACHE_SIZE / BITS_PER_LONG);
1527 }
1528
1529 return 0; 1504 return 0;
1530} 1505}
1531 1506
@@ -1540,7 +1515,7 @@ xfs_setsize_buftarg_early(
1540 struct block_device *bdev) 1515 struct block_device *bdev)
1541{ 1516{
1542 return xfs_setsize_buftarg_flags(btp, 1517 return xfs_setsize_buftarg_flags(btp,
1543 PAGE_CACHE_SIZE, bdev_logical_block_size(bdev), 0); 1518 PAGE_SIZE, bdev_logical_block_size(bdev), 0);
1544} 1519}
1545 1520
1546int 1521int
@@ -1553,62 +1528,22 @@ xfs_setsize_buftarg(
1553} 1528}
1554 1529
1555STATIC int 1530STATIC int
1556xfs_mapping_buftarg(
1557 xfs_buftarg_t *btp,
1558 struct block_device *bdev)
1559{
1560 struct backing_dev_info *bdi;
1561 struct inode *inode;
1562 struct address_space *mapping;
1563 static const struct address_space_operations mapping_aops = {
1564 .sync_page = block_sync_page,
1565 .migratepage = fail_migrate_page,
1566 };
1567
1568 inode = new_inode(bdev->bd_inode->i_sb);
1569 if (!inode) {
1570 printk(KERN_WARNING
1571 "XFS: Cannot allocate mapping inode for device %s\n",
1572 XFS_BUFTARG_NAME(btp));
1573 return ENOMEM;
1574 }
1575 inode->i_mode = S_IFBLK;
1576 inode->i_bdev = bdev;
1577 inode->i_rdev = bdev->bd_dev;
1578 bdi = blk_get_backing_dev_info(bdev);
1579 if (!bdi)
1580 bdi = &default_backing_dev_info;
1581 mapping = &inode->i_data;
1582 mapping->a_ops = &mapping_aops;
1583 mapping->backing_dev_info = bdi;
1584 mapping_set_gfp_mask(mapping, GFP_NOFS);
1585 btp->bt_mapping = mapping;
1586 return 0;
1587}
1588
1589STATIC int
1590xfs_alloc_delwrite_queue( 1531xfs_alloc_delwrite_queue(
1591 xfs_buftarg_t *btp, 1532 xfs_buftarg_t *btp,
1592 const char *fsname) 1533 const char *fsname)
1593{ 1534{
1594 int error = 0;
1595
1596 INIT_LIST_HEAD(&btp->bt_list);
1597 INIT_LIST_HEAD(&btp->bt_delwrite_queue); 1535 INIT_LIST_HEAD(&btp->bt_delwrite_queue);
1598 spin_lock_init(&btp->bt_delwrite_lock); 1536 spin_lock_init(&btp->bt_delwrite_lock);
1599 btp->bt_flags = 0; 1537 btp->bt_flags = 0;
1600 btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname); 1538 btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname);
1601 if (IS_ERR(btp->bt_task)) { 1539 if (IS_ERR(btp->bt_task))
1602 error = PTR_ERR(btp->bt_task); 1540 return PTR_ERR(btp->bt_task);
1603 goto out_error; 1541 return 0;
1604 }
1605 xfs_register_buftarg(btp);
1606out_error:
1607 return error;
1608} 1542}
1609 1543
1610xfs_buftarg_t * 1544xfs_buftarg_t *
1611xfs_alloc_buftarg( 1545xfs_alloc_buftarg(
1546 struct xfs_mount *mp,
1612 struct block_device *bdev, 1547 struct block_device *bdev,
1613 int external, 1548 int external,
1614 const char *fsname) 1549 const char *fsname)
@@ -1617,15 +1552,22 @@ xfs_alloc_buftarg(
1617 1552
1618 btp = kmem_zalloc(sizeof(*btp), KM_SLEEP); 1553 btp = kmem_zalloc(sizeof(*btp), KM_SLEEP);
1619 1554
1555 btp->bt_mount = mp;
1620 btp->bt_dev = bdev->bd_dev; 1556 btp->bt_dev = bdev->bd_dev;
1621 btp->bt_bdev = bdev; 1557 btp->bt_bdev = bdev;
1622 if (xfs_setsize_buftarg_early(btp, bdev)) 1558 btp->bt_bdi = blk_get_backing_dev_info(bdev);
1559 if (!btp->bt_bdi)
1623 goto error; 1560 goto error;
1624 if (xfs_mapping_buftarg(btp, bdev)) 1561
1562 INIT_LIST_HEAD(&btp->bt_lru);
1563 spin_lock_init(&btp->bt_lru_lock);
1564 if (xfs_setsize_buftarg_early(btp, bdev))
1625 goto error; 1565 goto error;
1626 if (xfs_alloc_delwrite_queue(btp, fsname)) 1566 if (xfs_alloc_delwrite_queue(btp, fsname))
1627 goto error; 1567 goto error;
1628 xfs_alloc_bufhash(btp, external); 1568 btp->bt_shrinker.shrink = xfs_buftarg_shrink;
1569 btp->bt_shrinker.seeks = DEFAULT_SEEKS;
1570 register_shrinker(&btp->bt_shrinker);
1629 return btp; 1571 return btp;
1630 1572
1631error: 1573error:
@@ -1730,27 +1672,6 @@ xfs_buf_runall_queues(
1730 flush_workqueue(queue); 1672 flush_workqueue(queue);
1731} 1673}
1732 1674
1733STATIC int
1734xfsbufd_wakeup(
1735 struct shrinker *shrink,
1736 int priority,
1737 gfp_t mask)
1738{
1739 xfs_buftarg_t *btp;
1740
1741 spin_lock(&xfs_buftarg_lock);
1742 list_for_each_entry(btp, &xfs_buftarg_list, bt_list) {
1743 if (test_bit(XBT_FORCE_SLEEP, &btp->bt_flags))
1744 continue;
1745 if (list_empty(&btp->bt_delwrite_queue))
1746 continue;
1747 set_bit(XBT_FORCE_FLUSH, &btp->bt_flags);
1748 wake_up_process(btp->bt_task);
1749 }
1750 spin_unlock(&xfs_buftarg_lock);
1751 return 0;
1752}
1753
1754/* 1675/*
1755 * Move as many buffers as specified to the supplied list 1676 * Move as many buffers as specified to the supplied list
1756 * indicating if we skipped any buffers to prevent deadlocks. 1677
@@ -1771,7 +1692,6 @@ xfs_buf_delwri_split(
1771 INIT_LIST_HEAD(list); 1692 INIT_LIST_HEAD(list);
1772 spin_lock(dwlk); 1693 spin_lock(dwlk);
1773 list_for_each_entry_safe(bp, n, dwq, b_list) { 1694 list_for_each_entry_safe(bp, n, dwq, b_list) {
1774 trace_xfs_buf_delwri_split(bp, _RET_IP_);
1775 ASSERT(bp->b_flags & XBF_DELWRI); 1695 ASSERT(bp->b_flags & XBF_DELWRI);
1776 1696
1777 if (!XFS_BUF_ISPINNED(bp) && !xfs_buf_cond_lock(bp)) { 1697 if (!XFS_BUF_ISPINNED(bp) && !xfs_buf_cond_lock(bp)) {
@@ -1785,6 +1705,7 @@ xfs_buf_delwri_split(
1785 _XBF_RUN_QUEUES); 1705 _XBF_RUN_QUEUES);
1786 bp->b_flags |= XBF_WRITE; 1706 bp->b_flags |= XBF_WRITE;
1787 list_move_tail(&bp->b_list, list); 1707 list_move_tail(&bp->b_list, list);
1708 trace_xfs_buf_delwri_split(bp, _RET_IP_);
1788 } else 1709 } else
1789 skipped++; 1710 skipped++;
1790 } 1711 }
@@ -1838,8 +1759,8 @@ xfsbufd(
1838 do { 1759 do {
1839 long age = xfs_buf_age_centisecs * msecs_to_jiffies(10); 1760 long age = xfs_buf_age_centisecs * msecs_to_jiffies(10);
1840 long tout = xfs_buf_timer_centisecs * msecs_to_jiffies(10); 1761 long tout = xfs_buf_timer_centisecs * msecs_to_jiffies(10);
1841 int count = 0;
1842 struct list_head tmp; 1762 struct list_head tmp;
1763 struct blk_plug plug;
1843 1764
1844 if (unlikely(freezing(current))) { 1765 if (unlikely(freezing(current))) {
1845 set_bit(XBT_FORCE_SLEEP, &target->bt_flags); 1766 set_bit(XBT_FORCE_SLEEP, &target->bt_flags);
@@ -1855,16 +1776,15 @@ xfsbufd(
1855 1776
1856 xfs_buf_delwri_split(target, &tmp, age); 1777 xfs_buf_delwri_split(target, &tmp, age);
1857 list_sort(NULL, &tmp, xfs_buf_cmp); 1778 list_sort(NULL, &tmp, xfs_buf_cmp);
1779
1780 blk_start_plug(&plug);
1858 while (!list_empty(&tmp)) { 1781 while (!list_empty(&tmp)) {
1859 struct xfs_buf *bp; 1782 struct xfs_buf *bp;
1860 bp = list_first_entry(&tmp, struct xfs_buf, b_list); 1783 bp = list_first_entry(&tmp, struct xfs_buf, b_list);
1861 list_del_init(&bp->b_list); 1784 list_del_init(&bp->b_list);
1862 xfs_bdstrat_cb(bp); 1785 xfs_bdstrat_cb(bp);
1863 count++;
1864 } 1786 }
1865 if (count) 1787 blk_finish_plug(&plug);
1866 blk_run_address_space(target->bt_mapping);
1867
1868 } while (!kthread_should_stop()); 1788 } while (!kthread_should_stop());
1869 1789
1870 return 0; 1790 return 0;
@@ -1884,6 +1804,7 @@ xfs_flush_buftarg(
1884 int pincount = 0; 1804 int pincount = 0;
1885 LIST_HEAD(tmp_list); 1805 LIST_HEAD(tmp_list);
1886 LIST_HEAD(wait_list); 1806 LIST_HEAD(wait_list);
1807 struct blk_plug plug;
1887 1808
1888 xfs_buf_runall_queues(xfsconvertd_workqueue); 1809 xfs_buf_runall_queues(xfsconvertd_workqueue);
1889 xfs_buf_runall_queues(xfsdatad_workqueue); 1810 xfs_buf_runall_queues(xfsdatad_workqueue);
@@ -1898,6 +1819,8 @@ xfs_flush_buftarg(
1898 * we do that after issuing all the IO. 1819 * we do that after issuing all the IO.
1899 */ 1820 */
1900 list_sort(NULL, &tmp_list, xfs_buf_cmp); 1821 list_sort(NULL, &tmp_list, xfs_buf_cmp);
1822
1823 blk_start_plug(&plug);
1901 while (!list_empty(&tmp_list)) { 1824 while (!list_empty(&tmp_list)) {
1902 bp = list_first_entry(&tmp_list, struct xfs_buf, b_list); 1825 bp = list_first_entry(&tmp_list, struct xfs_buf, b_list);
1903 ASSERT(target == bp->b_target); 1826 ASSERT(target == bp->b_target);
@@ -1908,15 +1831,15 @@ xfs_flush_buftarg(
1908 } 1831 }
1909 xfs_bdstrat_cb(bp); 1832 xfs_bdstrat_cb(bp);
1910 } 1833 }
1834 blk_finish_plug(&plug);
1911 1835
1912 if (wait) { 1836 if (wait) {
1913 /* Expedite and wait for IO to complete. */ 1837 /* Wait for IO to complete. */
1914 blk_run_address_space(target->bt_mapping);
1915 while (!list_empty(&wait_list)) { 1838 while (!list_empty(&wait_list)) {
1916 bp = list_first_entry(&wait_list, struct xfs_buf, b_list); 1839 bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
1917 1840
1918 list_del_init(&bp->b_list); 1841 list_del_init(&bp->b_list);
1919 xfs_iowait(bp); 1842 xfs_buf_iowait(bp);
1920 xfs_buf_relse(bp); 1843 xfs_buf_relse(bp);
1921 } 1844 }
1922 } 1845 }
@@ -1933,19 +1856,19 @@ xfs_buf_init(void)
1933 goto out; 1856 goto out;
1934 1857
1935 xfslogd_workqueue = alloc_workqueue("xfslogd", 1858 xfslogd_workqueue = alloc_workqueue("xfslogd",
1936 WQ_RESCUER | WQ_HIGHPRI, 1); 1859 WQ_MEM_RECLAIM | WQ_HIGHPRI, 1);
1937 if (!xfslogd_workqueue) 1860 if (!xfslogd_workqueue)
1938 goto out_free_buf_zone; 1861 goto out_free_buf_zone;
1939 1862
1940 xfsdatad_workqueue = create_workqueue("xfsdatad"); 1863 xfsdatad_workqueue = alloc_workqueue("xfsdatad", WQ_MEM_RECLAIM, 1);
1941 if (!xfsdatad_workqueue) 1864 if (!xfsdatad_workqueue)
1942 goto out_destroy_xfslogd_workqueue; 1865 goto out_destroy_xfslogd_workqueue;
1943 1866
1944 xfsconvertd_workqueue = create_workqueue("xfsconvertd"); 1867 xfsconvertd_workqueue = alloc_workqueue("xfsconvertd",
1868 WQ_MEM_RECLAIM, 1);
1945 if (!xfsconvertd_workqueue) 1869 if (!xfsconvertd_workqueue)
1946 goto out_destroy_xfsdatad_workqueue; 1870 goto out_destroy_xfsdatad_workqueue;
1947 1871
1948 register_shrinker(&xfs_buf_shake);
1949 return 0; 1872 return 0;
1950 1873
1951 out_destroy_xfsdatad_workqueue: 1874 out_destroy_xfsdatad_workqueue:
@@ -1961,7 +1884,6 @@ xfs_buf_init(void)
1961void 1884void
1962xfs_buf_terminate(void) 1885xfs_buf_terminate(void)
1963{ 1886{
1964 unregister_shrinker(&xfs_buf_shake);
1965 destroy_workqueue(xfsconvertd_workqueue); 1887 destroy_workqueue(xfsconvertd_workqueue);
1966 destroy_workqueue(xfsdatad_workqueue); 1888 destroy_workqueue(xfsdatad_workqueue);
1967 destroy_workqueue(xfslogd_workqueue); 1889 destroy_workqueue(xfslogd_workqueue);
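
A standalone aside, not part of the patch: the xfs_buftarg_shrink() hunk above replaces the old per-target hash with an LRU whose entries age out by reference count. Each shrinker pass decrements b_lru_ref via atomic_add_unless() and a buffer is only disposed of once the count has already reached zero; otherwise it earns another trip through the LRU. A minimal userspace sketch of that aging policy (names and the initial count of 2 are illustrative, not the kernel code):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct fake_buf {
	atomic_int lru_ref;		/* stands in for bp->b_lru_ref */
};

/* Mirrors atomic_add_unless(&bp->b_lru_ref, -1, 0): decrement unless zero. */
static bool dec_unless_zero(atomic_int *v)
{
	int old = atomic_load(v);

	while (old != 0) {
		if (atomic_compare_exchange_weak(v, &old, old - 1))
			return true;	/* aged, stays on the LRU */
	}
	return false;			/* already zero, reclaim it now */
}

int main(void)
{
	struct fake_buf bp;
	int pass = 0;

	atomic_init(&bp.lru_ref, 2);	/* give the buffer two free passes */
	while (dec_unless_zero(&bp.lru_ref))
		printf("shrinker pass %d: buffer stays cached\n", ++pass);
	printf("shrinker pass %d: ref hit zero, buffer disposed\n", ++pass);
	return 0;
}
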
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 2a05614f0b92..50a7d5fb3b73 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -51,7 +51,6 @@ typedef enum {
51#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */ 51#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */
52#define XBF_DELWRI (1 << 6) /* buffer has dirty pages */ 52#define XBF_DELWRI (1 << 6) /* buffer has dirty pages */
53#define XBF_STALE (1 << 7) /* buffer has been staled, do not find it */ 53#define XBF_STALE (1 << 7) /* buffer has been staled, do not find it */
54#define XBF_FS_MANAGED (1 << 8) /* filesystem controls freeing memory */
55#define XBF_ORDERED (1 << 11)/* use ordered writes */ 54#define XBF_ORDERED (1 << 11)/* use ordered writes */
56#define XBF_READ_AHEAD (1 << 12)/* asynchronous read-ahead */ 55#define XBF_READ_AHEAD (1 << 12)/* asynchronous read-ahead */
57#define XBF_LOG_BUFFER (1 << 13)/* this is a buffer used for the log */ 56#define XBF_LOG_BUFFER (1 << 13)/* this is a buffer used for the log */
@@ -62,38 +61,11 @@ typedef enum {
62#define XBF_DONT_BLOCK (1 << 16)/* do not block in current thread */ 61#define XBF_DONT_BLOCK (1 << 16)/* do not block in current thread */
63 62
64/* flags used only internally */ 63/* flags used only internally */
65#define _XBF_PAGE_CACHE (1 << 17)/* backed by pagecache */
66#define _XBF_PAGES (1 << 18)/* backed by refcounted pages */ 64#define _XBF_PAGES (1 << 18)/* backed by refcounted pages */
67#define _XBF_RUN_QUEUES (1 << 19)/* run block device task queue */ 65#define _XBF_RUN_QUEUES (1 << 19)/* run block device task queue */
66#define _XBF_KMEM (1 << 20)/* backed by heap memory */
68#define _XBF_DELWRI_Q (1 << 21)/* buffer on delwri queue */ 67#define _XBF_DELWRI_Q (1 << 21)/* buffer on delwri queue */
69 68
70/*
71 * Special flag for supporting metadata blocks smaller than a FSB.
72 *
73 * In this case we can have multiple xfs_buf_t on a single page and
74 * need to lock out concurrent xfs_buf_t readers as they only
75 * serialise access to the buffer.
76 *
77 * If the FSB size >= PAGE_CACHE_SIZE case, we have no serialisation
78 * between reads of the page. Hence we can have one thread read the
79 * page and modify it, but then race with another thread that thinks
80 * the page is not up-to-date and hence reads it again.
81 *
82 * The result is that the first modification to the page is lost.
83 * This sort of AGF/AGI reading race can happen when unlinking inodes
84 * that require truncation and results in the AGI unlinked list
85 * modifications being lost.
86 */
87#define _XBF_PAGE_LOCKED (1 << 22)
88
89/*
90 * If we try a barrier write, but it fails we have to communicate
91 * this to the upper layers. Unfortunately b_error gets overwritten
92 * when the buffer is re-issued so we have to add another flag to
93 * keep this information.
94 */
95#define _XFS_BARRIER_FAILED (1 << 23)
96
97typedef unsigned int xfs_buf_flags_t; 69typedef unsigned int xfs_buf_flags_t;
98 70
99#define XFS_BUF_FLAGS \ 71#define XFS_BUF_FLAGS \
@@ -104,19 +76,15 @@ typedef unsigned int xfs_buf_flags_t;
104 { XBF_DONE, "DONE" }, \ 76 { XBF_DONE, "DONE" }, \
105 { XBF_DELWRI, "DELWRI" }, \ 77 { XBF_DELWRI, "DELWRI" }, \
106 { XBF_STALE, "STALE" }, \ 78 { XBF_STALE, "STALE" }, \
107 { XBF_FS_MANAGED, "FS_MANAGED" }, \
108 { XBF_ORDERED, "ORDERED" }, \ 79 { XBF_ORDERED, "ORDERED" }, \
109 { XBF_READ_AHEAD, "READ_AHEAD" }, \ 80 { XBF_READ_AHEAD, "READ_AHEAD" }, \
110 { XBF_LOCK, "LOCK" }, /* should never be set */\ 81 { XBF_LOCK, "LOCK" }, /* should never be set */\
111 { XBF_TRYLOCK, "TRYLOCK" }, /* ditto */\ 82 { XBF_TRYLOCK, "TRYLOCK" }, /* ditto */\
112 { XBF_DONT_BLOCK, "DONT_BLOCK" }, /* ditto */\ 83 { XBF_DONT_BLOCK, "DONT_BLOCK" }, /* ditto */\
113 { _XBF_PAGE_CACHE, "PAGE_CACHE" }, \
114 { _XBF_PAGES, "PAGES" }, \ 84 { _XBF_PAGES, "PAGES" }, \
115 { _XBF_RUN_QUEUES, "RUN_QUEUES" }, \ 85 { _XBF_RUN_QUEUES, "RUN_QUEUES" }, \
116 { _XBF_DELWRI_Q, "DELWRI_Q" }, \ 86 { _XBF_KMEM, "KMEM" }, \
117 { _XBF_PAGE_LOCKED, "PAGE_LOCKED" }, \ 87 { _XBF_DELWRI_Q, "DELWRI_Q" }
118 { _XFS_BARRIER_FAILED, "BARRIER_FAILED" }
119
120 88
121typedef enum { 89typedef enum {
122 XBT_FORCE_SLEEP = 0, 90 XBT_FORCE_SLEEP = 0,
@@ -131,70 +99,67 @@ typedef struct xfs_bufhash {
131typedef struct xfs_buftarg { 99typedef struct xfs_buftarg {
132 dev_t bt_dev; 100 dev_t bt_dev;
133 struct block_device *bt_bdev; 101 struct block_device *bt_bdev;
134 struct address_space *bt_mapping; 102 struct backing_dev_info *bt_bdi;
103 struct xfs_mount *bt_mount;
135 unsigned int bt_bsize; 104 unsigned int bt_bsize;
136 unsigned int bt_sshift; 105 unsigned int bt_sshift;
137 size_t bt_smask; 106 size_t bt_smask;
138 107
139 /* per device buffer hash table */
140 uint bt_hashshift;
141 xfs_bufhash_t *bt_hash;
142
143 /* per device delwri queue */ 108 /* per device delwri queue */
144 struct task_struct *bt_task; 109 struct task_struct *bt_task;
145 struct list_head bt_list;
146 struct list_head bt_delwrite_queue; 110 struct list_head bt_delwrite_queue;
147 spinlock_t bt_delwrite_lock; 111 spinlock_t bt_delwrite_lock;
148 unsigned long bt_flags; 112 unsigned long bt_flags;
149} xfs_buftarg_t;
150 113
151/* 114 /* LRU control structures */
152 * xfs_buf_t: Buffer structure for pagecache-based buffers 115 struct shrinker bt_shrinker;
153 * 116 struct list_head bt_lru;
154 * This buffer structure is used by the pagecache buffer management routines 117 spinlock_t bt_lru_lock;
155 * to refer to an assembly of pages forming a logical buffer. 118 unsigned int bt_lru_nr;
156 * 119} xfs_buftarg_t;
157 * The buffer structure is used on a temporary basis only, and discarded when
158 * released. The real data storage is recorded in the pagecache. Buffers are
159 * hashed to the block device on which the file system resides.
160 */
161 120
162struct xfs_buf; 121struct xfs_buf;
163typedef void (*xfs_buf_iodone_t)(struct xfs_buf *); 122typedef void (*xfs_buf_iodone_t)(struct xfs_buf *);
164typedef void (*xfs_buf_relse_t)(struct xfs_buf *);
165typedef int (*xfs_buf_bdstrat_t)(struct xfs_buf *);
166 123
167#define XB_PAGES 2 124#define XB_PAGES 2
168 125
169typedef struct xfs_buf { 126typedef struct xfs_buf {
127 /*
128 * The first cacheline holds all the fields needed for an uncontended cache
129 * hit to be fully processed. The semaphore straddles the cacheline
130 * boundary, but the counter and lock sit on the first cacheline,
131 * which is the only bit that is touched if we hit the semaphore
132 * fast-path on locking.
133 */
134 struct rb_node b_rbnode; /* rbtree node */
135 xfs_off_t b_file_offset; /* offset in file */
136 size_t b_buffer_length;/* size of buffer in bytes */
137 atomic_t b_hold; /* reference count */
138 atomic_t b_lru_ref; /* lru reclaim ref count */
139 xfs_buf_flags_t b_flags; /* status flags */
170 struct semaphore b_sema; /* semaphore for lockables */ 140 struct semaphore b_sema; /* semaphore for lockables */
171 unsigned long b_queuetime; /* time buffer was queued */ 141
172 atomic_t b_pin_count; /* pin count */ 142 struct list_head b_lru; /* lru list */
173 wait_queue_head_t b_waiters; /* unpin waiters */ 143 wait_queue_head_t b_waiters; /* unpin waiters */
174 struct list_head b_list; 144 struct list_head b_list;
175 xfs_buf_flags_t b_flags; /* status flags */ 145 struct xfs_perag *b_pag; /* contains rbtree root */
176 struct list_head b_hash_list; /* hash table list */
177 xfs_bufhash_t *b_hash; /* hash table list start */
178 xfs_buftarg_t *b_target; /* buffer target (device) */ 146 xfs_buftarg_t *b_target; /* buffer target (device) */
179 atomic_t b_hold; /* reference count */
180 xfs_daddr_t b_bn; /* block number for I/O */ 147 xfs_daddr_t b_bn; /* block number for I/O */
181 xfs_off_t b_file_offset; /* offset in file */
182 size_t b_buffer_length;/* size of buffer in bytes */
183 size_t b_count_desired;/* desired transfer size */ 148 size_t b_count_desired;/* desired transfer size */
184 void *b_addr; /* virtual address of buffer */ 149 void *b_addr; /* virtual address of buffer */
185 struct work_struct b_iodone_work; 150 struct work_struct b_iodone_work;
186 atomic_t b_io_remaining; /* #outstanding I/O requests */
187 xfs_buf_iodone_t b_iodone; /* I/O completion function */ 151 xfs_buf_iodone_t b_iodone; /* I/O completion function */
188 xfs_buf_relse_t b_relse; /* releasing function */
189 struct completion b_iowait; /* queue for I/O waiters */ 152 struct completion b_iowait; /* queue for I/O waiters */
190 void *b_fspriv; 153 void *b_fspriv;
191 void *b_fspriv2; 154 void *b_fspriv2;
192 struct xfs_mount *b_mount;
193 unsigned short b_error; /* error code on I/O */
194 unsigned int b_page_count; /* size of page array */
195 unsigned int b_offset; /* page offset in first page */
196 struct page **b_pages; /* array of page pointers */ 155 struct page **b_pages; /* array of page pointers */
197 struct page *b_page_array[XB_PAGES]; /* inline pages */ 156 struct page *b_page_array[XB_PAGES]; /* inline pages */
157 unsigned long b_queuetime; /* time buffer was queued */
158 atomic_t b_pin_count; /* pin count */
159 atomic_t b_io_remaining; /* #outstanding I/O requests */
160 unsigned int b_page_count; /* size of page array */
161 unsigned int b_offset; /* page offset in first page */
162 unsigned short b_error; /* error code on I/O */
198#ifdef XFS_BUF_LOCK_TRACKING 163#ifdef XFS_BUF_LOCK_TRACKING
199 int b_last_holder; 164 int b_last_holder;
200#endif 165#endif
@@ -213,11 +178,14 @@ extern xfs_buf_t *xfs_buf_read(xfs_buftarg_t *, xfs_off_t, size_t,
213 xfs_buf_flags_t); 178 xfs_buf_flags_t);
214 179
215extern xfs_buf_t *xfs_buf_get_empty(size_t, xfs_buftarg_t *); 180extern xfs_buf_t *xfs_buf_get_empty(size_t, xfs_buftarg_t *);
216extern xfs_buf_t *xfs_buf_get_noaddr(size_t, xfs_buftarg_t *); 181extern void xfs_buf_set_empty(struct xfs_buf *bp, size_t len);
182extern xfs_buf_t *xfs_buf_get_uncached(struct xfs_buftarg *, size_t, int);
217extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t); 183extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t);
218extern void xfs_buf_hold(xfs_buf_t *); 184extern void xfs_buf_hold(xfs_buf_t *);
219extern void xfs_buf_readahead(xfs_buftarg_t *, xfs_off_t, size_t, 185extern void xfs_buf_readahead(xfs_buftarg_t *, xfs_off_t, size_t);
220 xfs_buf_flags_t); 186struct xfs_buf *xfs_buf_read_uncached(struct xfs_mount *mp,
187 struct xfs_buftarg *target,
188 xfs_daddr_t daddr, size_t length, int flags);
221 189
222/* Releasing Buffers */ 190/* Releasing Buffers */
223extern void xfs_buf_free(xfs_buf_t *); 191extern void xfs_buf_free(xfs_buf_t *);
@@ -242,6 +210,8 @@ extern int xfs_buf_iorequest(xfs_buf_t *);
242extern int xfs_buf_iowait(xfs_buf_t *); 210extern int xfs_buf_iowait(xfs_buf_t *);
243extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *, 211extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *,
244 xfs_buf_rw_t); 212 xfs_buf_rw_t);
213#define xfs_buf_zero(bp, off, len) \
214 xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO)
245 215
246static inline int xfs_buf_geterror(xfs_buf_t *bp) 216static inline int xfs_buf_geterror(xfs_buf_t *bp)
247{ 217{
@@ -267,7 +237,8 @@ extern void xfs_buf_terminate(void);
267#define XFS_BUF_ZEROFLAGS(bp) ((bp)->b_flags &= \ 237#define XFS_BUF_ZEROFLAGS(bp) ((bp)->b_flags &= \
268 ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI|XBF_ORDERED)) 238 ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI|XBF_ORDERED))
269 239
270#define XFS_BUF_STALE(bp) ((bp)->b_flags |= XBF_STALE) 240void xfs_buf_stale(struct xfs_buf *bp);
241#define XFS_BUF_STALE(bp) xfs_buf_stale(bp);
271#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE) 242#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE)
272#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE) 243#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE)
273#define XFS_BUF_SUPER_STALE(bp) do { \ 244#define XFS_BUF_SUPER_STALE(bp) do { \
@@ -276,8 +247,6 @@ extern void xfs_buf_terminate(void);
276 XFS_BUF_DONE(bp); \ 247 XFS_BUF_DONE(bp); \
277 } while (0) 248 } while (0)
278 249
279#define XFS_BUF_UNMANAGE(bp) ((bp)->b_flags &= ~XBF_FS_MANAGED)
280
281#define XFS_BUF_DELAYWRITE(bp) ((bp)->b_flags |= XBF_DELWRI) 250#define XFS_BUF_DELAYWRITE(bp) ((bp)->b_flags |= XBF_DELWRI)
282#define XFS_BUF_UNDELAYWRITE(bp) xfs_buf_delwri_dequeue(bp) 251#define XFS_BUF_UNDELAYWRITE(bp) xfs_buf_delwri_dequeue(bp)
283#define XFS_BUF_ISDELAYWRITE(bp) ((bp)->b_flags & XBF_DELWRI) 252#define XFS_BUF_ISDELAYWRITE(bp) ((bp)->b_flags & XBF_DELWRI)
@@ -320,7 +289,6 @@ extern void xfs_buf_terminate(void);
320#define XFS_BUF_FSPRIVATE2(bp, type) ((type)(bp)->b_fspriv2) 289#define XFS_BUF_FSPRIVATE2(bp, type) ((type)(bp)->b_fspriv2)
321#define XFS_BUF_SET_FSPRIVATE2(bp, val) ((bp)->b_fspriv2 = (void*)(val)) 290#define XFS_BUF_SET_FSPRIVATE2(bp, val) ((bp)->b_fspriv2 = (void*)(val))
322#define XFS_BUF_SET_START(bp) do { } while (0) 291#define XFS_BUF_SET_START(bp) do { } while (0)
323#define XFS_BUF_SET_BRELSE_FUNC(bp, func) ((bp)->b_relse = (func))
324 292
325#define XFS_BUF_PTR(bp) (xfs_caddr_t)((bp)->b_addr) 293#define XFS_BUF_PTR(bp) (xfs_caddr_t)((bp)->b_addr)
326#define XFS_BUF_SET_PTR(bp, val, cnt) xfs_buf_associate_memory(bp, val, cnt) 294#define XFS_BUF_SET_PTR(bp, val, cnt) xfs_buf_associate_memory(bp, val, cnt)
@@ -333,9 +301,15 @@ extern void xfs_buf_terminate(void);
333#define XFS_BUF_SIZE(bp) ((bp)->b_buffer_length) 301#define XFS_BUF_SIZE(bp) ((bp)->b_buffer_length)
334#define XFS_BUF_SET_SIZE(bp, cnt) ((bp)->b_buffer_length = (cnt)) 302#define XFS_BUF_SET_SIZE(bp, cnt) ((bp)->b_buffer_length = (cnt))
335 303
336#define XFS_BUF_SET_VTYPE_REF(bp, type, ref) do { } while (0) 304static inline void
305xfs_buf_set_ref(
306 struct xfs_buf *bp,
307 int lru_ref)
308{
309 atomic_set(&bp->b_lru_ref, lru_ref);
310}
311#define XFS_BUF_SET_VTYPE_REF(bp, type, ref) xfs_buf_set_ref(bp, ref)
337#define XFS_BUF_SET_VTYPE(bp, type) do { } while (0) 312#define XFS_BUF_SET_VTYPE(bp, type) do { } while (0)
338#define XFS_BUF_SET_REF(bp, ref) do { } while (0)
339 313
340#define XFS_BUF_ISPINNED(bp) atomic_read(&((bp)->b_pin_count)) 314#define XFS_BUF_ISPINNED(bp) atomic_read(&((bp)->b_pin_count))
341 315
@@ -351,30 +325,15 @@ extern void xfs_buf_terminate(void);
351 325
352static inline void xfs_buf_relse(xfs_buf_t *bp) 326static inline void xfs_buf_relse(xfs_buf_t *bp)
353{ 327{
354 if (!bp->b_relse) 328 xfs_buf_unlock(bp);
355 xfs_buf_unlock(bp);
356 xfs_buf_rele(bp); 329 xfs_buf_rele(bp);
357} 330}
358 331
359#define xfs_biodone(bp) xfs_buf_ioend(bp, 0)
360
361#define xfs_biomove(bp, off, len, data, rw) \
362 xfs_buf_iomove((bp), (off), (len), (data), \
363 ((rw) == XBF_WRITE) ? XBRW_WRITE : XBRW_READ)
364
365#define xfs_biozero(bp, off, len) \
366 xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO)
367
368#define xfs_iowait(bp) xfs_buf_iowait(bp)
369
370#define xfs_baread(target, rablkno, ralen) \
371 xfs_buf_readahead((target), (rablkno), (ralen), XBF_DONT_BLOCK)
372
373
374/* 332/*
375 * Handling of buftargs. 333 * Handling of buftargs.
376 */ 334 */
377extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int, const char *); 335extern xfs_buftarg_t *xfs_alloc_buftarg(struct xfs_mount *,
336 struct block_device *, int, const char *);
378extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *); 337extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *);
379extern void xfs_wait_buftarg(xfs_buftarg_t *); 338extern void xfs_wait_buftarg(xfs_buftarg_t *);
380extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); 339extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int);
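
Another aside, not part of the patch: the rewritten struct xfs_buf above leads with a comment about keeping the lookup-hot fields in the first cacheline. A userspace sketch of how such a claim can be checked with offsetof (the stand-in struct and the 64-byte line size are assumptions, not the real layout):

#include <stddef.h>
#include <stdio.h>

struct hot_buf {
	struct { void *p[3]; } rbnode;	/* rb_node stand-in */
	long long	file_offset;
	unsigned long	buffer_length;
	int		hold;
	int		lru_ref;
	unsigned int	flags;
	/* colder fields (semaphore, lists, pages, ...) would follow */
};

_Static_assert(offsetof(struct hot_buf, flags) + sizeof(unsigned int) <= 64,
	       "lookup-hot fields must fit in one 64-byte cacheline");

int main(void)
{
	printf("hot fields end at byte %zu of the struct\n",
	       offsetof(struct hot_buf, flags) + sizeof(unsigned int));
	return 0;
}
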
diff --git a/fs/xfs/linux-2.6/xfs_cred.h b/fs/xfs/linux-2.6/xfs_cred.h
deleted file mode 100644
index 55bddf3b6091..000000000000
--- a/fs/xfs/linux-2.6/xfs_cred.h
+++ /dev/null
@@ -1,28 +0,0 @@
1/*
2 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_CRED_H__
19#define __XFS_CRED_H__
20
21#include <linux/capability.h>
22
23/*
24 * Credentials
25 */
26typedef const struct cred cred_t;
27
28#endif /* __XFS_CRED_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_discard.c b/fs/xfs/linux-2.6/xfs_discard.c
new file mode 100644
index 000000000000..244e797dae32
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_discard.c
@@ -0,0 +1,222 @@
1/*
2 * Copyright (C) 2010 Red Hat, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_sb.h"
20#include "xfs_inum.h"
21#include "xfs_log.h"
22#include "xfs_ag.h"
23#include "xfs_mount.h"
24#include "xfs_quota.h"
25#include "xfs_trans.h"
26#include "xfs_alloc_btree.h"
27#include "xfs_bmap_btree.h"
28#include "xfs_ialloc_btree.h"
29#include "xfs_btree.h"
30#include "xfs_inode.h"
31#include "xfs_alloc.h"
32#include "xfs_error.h"
33#include "xfs_discard.h"
34#include "xfs_trace.h"
35
36STATIC int
37xfs_trim_extents(
38 struct xfs_mount *mp,
39 xfs_agnumber_t agno,
40 xfs_fsblock_t start,
41 xfs_fsblock_t len,
42 xfs_fsblock_t minlen,
43 __uint64_t *blocks_trimmed)
44{
45 struct block_device *bdev = mp->m_ddev_targp->bt_bdev;
46 struct xfs_btree_cur *cur;
47 struct xfs_buf *agbp;
48 struct xfs_perag *pag;
49 int error;
50 int i;
51
52 pag = xfs_perag_get(mp, agno);
53
54 error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
55 if (error || !agbp)
56 goto out_put_perag;
57
58 cur = xfs_allocbt_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_CNT);
59
60 /*
61 * Force out the log. This means any transactions that might have freed
62 * space before we took the AGF buffer lock are now on disk, and the
63 * volatile disk cache is flushed.
64 */
65 xfs_log_force(mp, XFS_LOG_SYNC);
66
67 /*
68 * Look up the longest btree in the AGF and start with it.
69 */
70 error = xfs_alloc_lookup_le(cur, 0,
71 XFS_BUF_TO_AGF(agbp)->agf_longest, &i);
72 if (error)
73 goto out_del_cursor;
74
75 /*
76 * Loop until we are done with all extents that are large
77 * enough to be worth discarding.
78 */
79 while (i) {
80 xfs_agblock_t fbno;
81 xfs_extlen_t flen;
82
83 error = xfs_alloc_get_rec(cur, &fbno, &flen, &i);
84 if (error)
85 goto out_del_cursor;
86 XFS_WANT_CORRUPTED_GOTO(i == 1, out_del_cursor);
87 ASSERT(flen <= XFS_BUF_TO_AGF(agbp)->agf_longest);
88
89 /*
90 * Too small? Give up.
91 */
92 if (flen < minlen) {
93 trace_xfs_discard_toosmall(mp, agno, fbno, flen);
94 goto out_del_cursor;
95 }
96
97 /*
98 * If the extent is entirely outside of the range we are
99 * supposed to discard, skip it. Do not bother to trim
100 * down partially overlapping ranges for now.
101 */
102 if (XFS_AGB_TO_FSB(mp, agno, fbno) + flen < start ||
103 XFS_AGB_TO_FSB(mp, agno, fbno) >= start + len) {
104 trace_xfs_discard_exclude(mp, agno, fbno, flen);
105 goto next_extent;
106 }
107
108 /*
109 * If any blocks in the range are still busy, skip the
110 * discard and try again the next time.
111 */
112 if (xfs_alloc_busy_search(mp, agno, fbno, flen)) {
113 trace_xfs_discard_busy(mp, agno, fbno, flen);
114 goto next_extent;
115 }
116
117 trace_xfs_discard_extent(mp, agno, fbno, flen);
118 error = -blkdev_issue_discard(bdev,
119 XFS_AGB_TO_DADDR(mp, agno, fbno),
120 XFS_FSB_TO_BB(mp, flen),
121 GFP_NOFS, 0);
122 if (error)
123 goto out_del_cursor;
124 *blocks_trimmed += flen;
125
126next_extent:
127 error = xfs_btree_decrement(cur, 0, &i);
128 if (error)
129 goto out_del_cursor;
130 }
131
132out_del_cursor:
133 xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
134 xfs_buf_relse(agbp);
135out_put_perag:
136 xfs_perag_put(pag);
137 return error;
138}
139
140int
141xfs_ioc_trim(
142 struct xfs_mount *mp,
143 struct fstrim_range __user *urange)
144{
145 struct request_queue *q = mp->m_ddev_targp->bt_bdev->bd_disk->queue;
146 unsigned int granularity = q->limits.discard_granularity;
147 struct fstrim_range range;
148 xfs_fsblock_t start, len, minlen;
149 xfs_agnumber_t start_agno, end_agno, agno;
150 __uint64_t blocks_trimmed = 0;
151 int error, last_error = 0;
152
153 if (!capable(CAP_SYS_ADMIN))
154 return -XFS_ERROR(EPERM);
155 if (!blk_queue_discard(q))
156 return -XFS_ERROR(EOPNOTSUPP);
157 if (copy_from_user(&range, urange, sizeof(range)))
158 return -XFS_ERROR(EFAULT);
159
160 /*
161 * Truncating down the len isn't actually quite correct, but using
162 * XFS_B_TO_FSB would mean we trivially get overflows for values
163 * of ULLONG_MAX or slightly lower. And ULLONG_MAX is the default
164 * used by the fstrim application. In the end it really doesn't
165 * matter as trimming blocks is an advisory interface.
166 */
167 start = XFS_B_TO_FSBT(mp, range.start);
168 len = XFS_B_TO_FSBT(mp, range.len);
169 minlen = XFS_B_TO_FSB(mp, max_t(u64, granularity, range.minlen));
170
171 start_agno = XFS_FSB_TO_AGNO(mp, start);
172 if (start_agno >= mp->m_sb.sb_agcount)
173 return -XFS_ERROR(EINVAL);
174
175 end_agno = XFS_FSB_TO_AGNO(mp, start + len);
176 if (end_agno >= mp->m_sb.sb_agcount)
177 end_agno = mp->m_sb.sb_agcount - 1;
178
179 for (agno = start_agno; agno <= end_agno; agno++) {
180 error = -xfs_trim_extents(mp, agno, start, len, minlen,
181 &blocks_trimmed);
182 if (error)
183 last_error = error;
184 }
185
186 if (last_error)
187 return last_error;
188
189 range.len = XFS_FSB_TO_B(mp, blocks_trimmed);
190 if (copy_to_user(urange, &range, sizeof(range)))
191 return -XFS_ERROR(EFAULT);
192 return 0;
193}
194
195int
196xfs_discard_extents(
197 struct xfs_mount *mp,
198 struct list_head *list)
199{
200 struct xfs_busy_extent *busyp;
201 int error = 0;
202
203 list_for_each_entry(busyp, list, list) {
204 trace_xfs_discard_extent(mp, busyp->agno, busyp->bno,
205 busyp->length);
206
207 error = -blkdev_issue_discard(mp->m_ddev_targp->bt_bdev,
208 XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno),
209 XFS_FSB_TO_BB(mp, busyp->length),
210 GFP_NOFS, 0);
211 if (error && error != EOPNOTSUPP) {
212 xfs_info(mp,
213 "discard failed for extent [0x%llu,%u], error %d",
214 (unsigned long long)busyp->bno,
215 busyp->length,
216 error);
217 return error;
218 }
219 }
220
221 return 0;
222}
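
The two entry points above back the generic FITRIM ioctl: userspace fills in a struct fstrim_range, the kernel raises minlen to the device's discard granularity, walks each AG's free space btree, and reports back how many bytes it actually trimmed. A minimal user-space sketch of the call side (the mount point path is a placeholder; the constants come from linux/fs.h):

    #include <fcntl.h>
    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/fs.h>           /* FITRIM, struct fstrim_range */

    int main(void)
    {
            struct fstrim_range range;
            int fd = open("/mnt/xfs", O_RDONLY);    /* placeholder mount point */

            if (fd < 0)
                    return 1;

            memset(&range, 0, sizeof(range));
            range.start = 0;
            range.len = UINT64_MAX;   /* whole filesystem, as fstrim does by default */
            range.minlen = 0;         /* raised to the discard granularity by the kernel */

            /* EOPNOTSUPP here means the underlying device has no discard support */
            if (ioctl(fd, FITRIM, &range) < 0) {
                    perror("FITRIM");
                    return 1;
            }
            printf("trimmed %llu bytes\n", (unsigned long long)range.len);
            return 0;
    }
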
diff --git a/fs/xfs/linux-2.6/xfs_discard.h b/fs/xfs/linux-2.6/xfs_discard.h
new file mode 100644
index 000000000000..344879aea646
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_discard.h
@@ -0,0 +1,10 @@
1#ifndef XFS_DISCARD_H
2#define XFS_DISCARD_H 1
3
4struct fstrim_range;
5struct list_head;
6
7extern int xfs_ioc_trim(struct xfs_mount *, struct fstrim_range __user *);
8extern int xfs_discard_extents(struct xfs_mount *, struct list_head *);
9
10#endif /* XFS_DISCARD_H */
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
index 3764d74790ec..f4f878fc0083 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -70,8 +70,16 @@ xfs_fs_encode_fh(
70 else 70 else
71 fileid_type = FILEID_INO32_GEN_PARENT; 71 fileid_type = FILEID_INO32_GEN_PARENT;
72 72
73 /* filesystem may contain 64bit inode numbers */ 73 /*
74 if (!(XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_SMALL_INUMS)) 74 * If the filesystem may contain 64bit inode numbers, we need
75 * to use larger file handles that can represent them.
76 *
77 * While we only allocate inodes that do not fit into 32 bits any
78 * large enough filesystem may contain them, thus the slightly
79 * confusing looking conditional below.
80 */
81 if (!(XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_SMALL_INUMS) ||
82 (XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_32BITINODES))
75 fileid_type |= XFS_FILEID_TYPE_64FLAG; 83 fileid_type |= XFS_FILEID_TYPE_64FLAG;
76 84
77 /* 85 /*
@@ -81,8 +89,10 @@ xfs_fs_encode_fh(
81 * seven combinations work. The real answer is "don't use v2". 89 * seven combinations work. The real answer is "don't use v2".
82 */ 90 */
83 len = xfs_fileid_length(fileid_type); 91 len = xfs_fileid_length(fileid_type);
84 if (*max_len < len) 92 if (*max_len < len) {
93 *max_len = len;
85 return 255; 94 return 255;
95 }
86 *max_len = len; 96 *max_len = len;
87 97
88 switch (fileid_type) { 98 switch (fileid_type) {
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index ba8ad422a165..7f782af286bf 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -37,10 +37,45 @@
37#include "xfs_trace.h" 37#include "xfs_trace.h"
38 38
39#include <linux/dcache.h> 39#include <linux/dcache.h>
40#include <linux/falloc.h>
40 41
41static const struct vm_operations_struct xfs_file_vm_ops; 42static const struct vm_operations_struct xfs_file_vm_ops;
42 43
43/* 44/*
45 * Locking primitives for read and write IO paths to ensure we consistently use
46 * and order the inode->i_mutex, ip->i_lock and ip->i_iolock.
47 */
48static inline void
49xfs_rw_ilock(
50 struct xfs_inode *ip,
51 int type)
52{
53 if (type & XFS_IOLOCK_EXCL)
54 mutex_lock(&VFS_I(ip)->i_mutex);
55 xfs_ilock(ip, type);
56}
57
58static inline void
59xfs_rw_iunlock(
60 struct xfs_inode *ip,
61 int type)
62{
63 xfs_iunlock(ip, type);
64 if (type & XFS_IOLOCK_EXCL)
65 mutex_unlock(&VFS_I(ip)->i_mutex);
66}
67
68static inline void
69xfs_rw_ilock_demote(
70 struct xfs_inode *ip,
71 int type)
72{
73 xfs_ilock_demote(ip, type);
74 if (type & XFS_IOLOCK_EXCL)
75 mutex_unlock(&VFS_I(ip)->i_mutex);
76}
77
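
For orientation, the intended pattern for these wrappers is roughly the following (a sketch, not code from the patch): take the IO lock exclusively while the page cache has to be flushed or new EOF blocks zeroed, then demote to shared for the IO itself so reads and other direct writes can proceed in parallel.

    xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);        /* takes i_mutex, then the iolock */
    /* ... flush/invalidate cached pages, zero blocks up to the new EOF ... */
    xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); /* keeps IOLOCK_SHARED, drops i_mutex */
    /* ... issue the IO under the shared lock ... */
    xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
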
78/*
44 * xfs_iozero 79 * xfs_iozero
45 * 80 *
46 * xfs_iozero clears the specified range of buffer supplied, 81 * xfs_iozero clears the specified range of buffer supplied,
@@ -96,19 +131,34 @@ xfs_file_fsync(
96{ 131{
97 struct inode *inode = file->f_mapping->host; 132 struct inode *inode = file->f_mapping->host;
98 struct xfs_inode *ip = XFS_I(inode); 133 struct xfs_inode *ip = XFS_I(inode);
134 struct xfs_mount *mp = ip->i_mount;
99 struct xfs_trans *tp; 135 struct xfs_trans *tp;
100 int error = 0; 136 int error = 0;
101 int log_flushed = 0; 137 int log_flushed = 0;
102 138
103 trace_xfs_file_fsync(ip); 139 trace_xfs_file_fsync(ip);
104 140
105 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 141 if (XFS_FORCED_SHUTDOWN(mp))
106 return -XFS_ERROR(EIO); 142 return -XFS_ERROR(EIO);
107 143
108 xfs_iflags_clear(ip, XFS_ITRUNCATED); 144 xfs_iflags_clear(ip, XFS_ITRUNCATED);
109 145
110 xfs_ioend_wait(ip); 146 xfs_ioend_wait(ip);
111 147
148 if (mp->m_flags & XFS_MOUNT_BARRIER) {
149 /*
150 * If we have an RT and/or log subvolume we need to make sure
 151 * to flush the write cache of the device used for file data
 152 * first. This is to ensure newly written file data makes
153 * it to disk before logging the new inode size in case of
154 * an extending write.
155 */
156 if (XFS_IS_REALTIME_INODE(ip))
157 xfs_blkdev_issue_flush(mp->m_rtdev_targp);
158 else if (mp->m_logdev_targp != mp->m_ddev_targp)
159 xfs_blkdev_issue_flush(mp->m_ddev_targp);
160 }
161
112 /* 162 /*
113 * We always need to make sure that the required inode state is safe on 163 * We always need to make sure that the required inode state is safe on
114 * disk. The inode might be clean but we still might need to force the 164 * disk. The inode might be clean but we still might need to force the
@@ -140,9 +190,9 @@ xfs_file_fsync(
140 * updates. The sync transaction will also force the log. 190 * updates. The sync transaction will also force the log.
141 */ 191 */
142 xfs_iunlock(ip, XFS_ILOCK_SHARED); 192 xfs_iunlock(ip, XFS_ILOCK_SHARED);
143 tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS); 193 tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
144 error = xfs_trans_reserve(tp, 0, 194 error = xfs_trans_reserve(tp, 0,
145 XFS_FSYNC_TS_LOG_RES(ip->i_mount), 0, 0, 0); 195 XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
146 if (error) { 196 if (error) {
147 xfs_trans_cancel(tp, 0); 197 xfs_trans_cancel(tp, 0);
148 return -error; 198 return -error;
@@ -174,28 +224,25 @@ xfs_file_fsync(
174 * force the log. 224 * force the log.
175 */ 225 */
176 if (xfs_ipincount(ip)) { 226 if (xfs_ipincount(ip)) {
177 error = _xfs_log_force_lsn(ip->i_mount, 227 error = _xfs_log_force_lsn(mp,
178 ip->i_itemp->ili_last_lsn, 228 ip->i_itemp->ili_last_lsn,
179 XFS_LOG_SYNC, &log_flushed); 229 XFS_LOG_SYNC, &log_flushed);
180 } 230 }
181 xfs_iunlock(ip, XFS_ILOCK_SHARED); 231 xfs_iunlock(ip, XFS_ILOCK_SHARED);
182 } 232 }
183 233
184 if (ip->i_mount->m_flags & XFS_MOUNT_BARRIER) { 234 /*
185 /* 235 * If we only have a single device, and the log force about was
186 * If the log write didn't issue an ordered tag we need 236 * a no-op we might have to flush the data device cache here.
187 * to flush the disk cache for the data device now. 237 * This can only happen for fdatasync/O_DSYNC if we were overwriting
188 */ 238 * an already allocated file and thus do not have any metadata to
189 if (!log_flushed) 239 * commit.
190 xfs_blkdev_issue_flush(ip->i_mount->m_ddev_targp); 240 */
191 241 if ((mp->m_flags & XFS_MOUNT_BARRIER) &&
192 /* 242 mp->m_logdev_targp == mp->m_ddev_targp &&
193 * If this inode is on the RT dev we need to flush that 243 !XFS_IS_REALTIME_INODE(ip) &&
194 * cache as well. 244 !log_flushed)
195 */ 245 xfs_blkdev_issue_flush(mp->m_ddev_targp);
196 if (XFS_IS_REALTIME_INODE(ip))
197 xfs_blkdev_issue_flush(ip->i_mount->m_rtdev_targp);
198 }
199 246
200 return -error; 247 return -error;
201} 248}
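
Pulling the two hunks together, the cache flushing in xfs_file_fsync() works out to the following policy when barriers are enabled (a condensed sketch of the combined logic, not literal code): flush the data device up front when it is separate from the log device (RT subvolume or external log), and flush it at the end only in the single-device case where the log force turned out to be a no-op.

    if (mp->m_flags & XFS_MOUNT_BARRIER) {
            if (XFS_IS_REALTIME_INODE(ip))
                    xfs_blkdev_issue_flush(mp->m_rtdev_targp);  /* data lives on the RT device */
            else if (mp->m_logdev_targp != mp->m_ddev_targp)
                    xfs_blkdev_issue_flush(mp->m_ddev_targp);   /* external log: flush data device first */
    }
    /* ... force the log for pinned metadata, setting log_flushed ... */
    if ((mp->m_flags & XFS_MOUNT_BARRIER) &&
        mp->m_logdev_targp == mp->m_ddev_targp &&
        !XFS_IS_REALTIME_INODE(ip) && !log_flushed)
            xfs_blkdev_issue_flush(mp->m_ddev_targp);           /* single device, log force was a no-op */
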
@@ -262,22 +309,21 @@ xfs_file_aio_read(
262 if (XFS_FORCED_SHUTDOWN(mp)) 309 if (XFS_FORCED_SHUTDOWN(mp))
263 return -EIO; 310 return -EIO;
264 311
265 if (unlikely(ioflags & IO_ISDIRECT))
266 mutex_lock(&inode->i_mutex);
267 xfs_ilock(ip, XFS_IOLOCK_SHARED);
268
269 if (unlikely(ioflags & IO_ISDIRECT)) { 312 if (unlikely(ioflags & IO_ISDIRECT)) {
313 xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
314
270 if (inode->i_mapping->nrpages) { 315 if (inode->i_mapping->nrpages) {
271 ret = -xfs_flushinval_pages(ip, 316 ret = -xfs_flushinval_pages(ip,
272 (iocb->ki_pos & PAGE_CACHE_MASK), 317 (iocb->ki_pos & PAGE_CACHE_MASK),
273 -1, FI_REMAPF_LOCKED); 318 -1, FI_REMAPF_LOCKED);
319 if (ret) {
320 xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
321 return ret;
322 }
274 } 323 }
275 mutex_unlock(&inode->i_mutex); 324 xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
276 if (ret) { 325 } else
277 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 326 xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
278 return ret;
279 }
280 }
281 327
282 trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags); 328 trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags);
283 329
@@ -285,7 +331,7 @@ xfs_file_aio_read(
285 if (ret > 0) 331 if (ret > 0)
286 XFS_STATS_ADD(xs_read_bytes, ret); 332 XFS_STATS_ADD(xs_read_bytes, ret);
287 333
288 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 334 xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
289 return ret; 335 return ret;
290} 336}
291 337
@@ -309,7 +355,7 @@ xfs_file_splice_read(
309 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 355 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
310 return -EIO; 356 return -EIO;
311 357
312 xfs_ilock(ip, XFS_IOLOCK_SHARED); 358 xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
313 359
314 trace_xfs_file_splice_read(ip, count, *ppos, ioflags); 360 trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
315 361
@@ -317,10 +363,61 @@ xfs_file_splice_read(
317 if (ret > 0) 363 if (ret > 0)
318 XFS_STATS_ADD(xs_read_bytes, ret); 364 XFS_STATS_ADD(xs_read_bytes, ret);
319 365
320 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 366 xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
321 return ret; 367 return ret;
322} 368}
323 369
370STATIC void
371xfs_aio_write_isize_update(
372 struct inode *inode,
373 loff_t *ppos,
374 ssize_t bytes_written)
375{
376 struct xfs_inode *ip = XFS_I(inode);
377 xfs_fsize_t isize = i_size_read(inode);
378
379 if (bytes_written > 0)
380 XFS_STATS_ADD(xs_write_bytes, bytes_written);
381
382 if (unlikely(bytes_written < 0 && bytes_written != -EFAULT &&
383 *ppos > isize))
384 *ppos = isize;
385
386 if (*ppos > ip->i_size) {
387 xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
388 if (*ppos > ip->i_size)
389 ip->i_size = *ppos;
390 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
391 }
392}
393
394/*
395 * If this was a direct or synchronous I/O that failed (such as ENOSPC) then
396 * part of the I/O may have been written to disk before the error occurred. In
397 * this case the on-disk file size may have been adjusted beyond the in-memory
398 * file size and now needs to be truncated back.
399 */
400STATIC void
401xfs_aio_write_newsize_update(
402 struct xfs_inode *ip)
403{
404 if (ip->i_new_size) {
405 xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
406 ip->i_new_size = 0;
407 if (ip->i_d.di_size > ip->i_size)
408 ip->i_d.di_size = ip->i_size;
409 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
410 }
411}
412
413/*
414 * xfs_file_splice_write() does not use xfs_rw_ilock() because
415 * generic_file_splice_write() takes the i_mutex itself. This, in theory,
 416 * could cause lock inversions between the aio_write path and the splice path
417 * if someone is doing concurrent splice(2) based writes and write(2) based
418 * writes to the same inode. The only real way to fix this is to re-implement
419 * the generic code here with correct locking orders.
420 */
324STATIC ssize_t 421STATIC ssize_t
325xfs_file_splice_write( 422xfs_file_splice_write(
326 struct pipe_inode_info *pipe, 423 struct pipe_inode_info *pipe,
@@ -331,7 +428,7 @@ xfs_file_splice_write(
331{ 428{
332 struct inode *inode = outfilp->f_mapping->host; 429 struct inode *inode = outfilp->f_mapping->host;
333 struct xfs_inode *ip = XFS_I(inode); 430 struct xfs_inode *ip = XFS_I(inode);
334 xfs_fsize_t isize, new_size; 431 xfs_fsize_t new_size;
335 int ioflags = 0; 432 int ioflags = 0;
336 ssize_t ret; 433 ssize_t ret;
337 434
@@ -355,27 +452,9 @@ xfs_file_splice_write(
355 trace_xfs_file_splice_write(ip, count, *ppos, ioflags); 452 trace_xfs_file_splice_write(ip, count, *ppos, ioflags);
356 453
357 ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); 454 ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
358 if (ret > 0)
359 XFS_STATS_ADD(xs_write_bytes, ret);
360 455
361 isize = i_size_read(inode); 456 xfs_aio_write_isize_update(inode, ppos, ret);
362 if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize)) 457 xfs_aio_write_newsize_update(ip);
363 *ppos = isize;
364
365 if (*ppos > ip->i_size) {
366 xfs_ilock(ip, XFS_ILOCK_EXCL);
367 if (*ppos > ip->i_size)
368 ip->i_size = *ppos;
369 xfs_iunlock(ip, XFS_ILOCK_EXCL);
370 }
371
372 if (ip->i_new_size) {
373 xfs_ilock(ip, XFS_ILOCK_EXCL);
374 ip->i_new_size = 0;
375 if (ip->i_d.di_size > ip->i_size)
376 ip->i_d.di_size = ip->i_size;
377 xfs_iunlock(ip, XFS_ILOCK_EXCL);
378 }
379 xfs_iunlock(ip, XFS_IOLOCK_EXCL); 458 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
380 return ret; 459 return ret;
381} 460}
@@ -562,247 +641,318 @@ out_lock:
562 return error; 641 return error;
563} 642}
564 643
644/*
645 * Common pre-write limit and setup checks.
646 *
647 * Returns with iolock held according to @iolock.
648 */
565STATIC ssize_t 649STATIC ssize_t
566xfs_file_aio_write( 650xfs_file_aio_write_checks(
567 struct kiocb *iocb, 651 struct file *file,
568 const struct iovec *iovp, 652 loff_t *pos,
569 unsigned long nr_segs, 653 size_t *count,
570 loff_t pos) 654 int *iolock)
571{ 655{
572 struct file *file = iocb->ki_filp; 656 struct inode *inode = file->f_mapping->host;
573 struct address_space *mapping = file->f_mapping;
574 struct inode *inode = mapping->host;
575 struct xfs_inode *ip = XFS_I(inode); 657 struct xfs_inode *ip = XFS_I(inode);
576 struct xfs_mount *mp = ip->i_mount; 658 xfs_fsize_t new_size;
577 ssize_t ret = 0, error = 0; 659 int error = 0;
578 int ioflags = 0;
579 xfs_fsize_t isize, new_size;
580 int iolock;
581 size_t ocount = 0, count;
582 int need_i_mutex;
583 660
584 XFS_STATS_INC(xs_write_calls); 661 error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode));
662 if (error) {
663 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock);
664 *iolock = 0;
665 return error;
666 }
585 667
586 BUG_ON(iocb->ki_pos != pos); 668 new_size = *pos + *count;
669 if (new_size > ip->i_size)
670 ip->i_new_size = new_size;
587 671
588 if (unlikely(file->f_flags & O_DIRECT)) 672 if (likely(!(file->f_mode & FMODE_NOCMTIME)))
589 ioflags |= IO_ISDIRECT; 673 file_update_time(file);
590 if (file->f_mode & FMODE_NOCMTIME)
591 ioflags |= IO_INVIS;
592 674
593 error = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ); 675 /*
676 * If the offset is beyond the size of the file, we need to zero any
677 * blocks that fall between the existing EOF and the start of this
678 * write.
679 */
680 if (*pos > ip->i_size)
681 error = -xfs_zero_eof(ip, *pos, ip->i_size);
682
683 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
594 if (error) 684 if (error)
595 return error; 685 return error;
596 686
597 count = ocount; 687 /*
598 if (count == 0) 688 * If we're writing the file then make sure to clear the setuid and
599 return 0; 689 * setgid bits if the process is not being run by root. This keeps
600 690 * people from modifying setuid and setgid binaries.
601 xfs_wait_for_freeze(mp, SB_FREEZE_WRITE); 691 */
692 return file_remove_suid(file);
602 693
603 if (XFS_FORCED_SHUTDOWN(mp)) 694}
604 return -EIO;
605 695
606relock: 696/*
607 if (ioflags & IO_ISDIRECT) { 697 * xfs_file_dio_aio_write - handle direct IO writes
608 iolock = XFS_IOLOCK_SHARED; 698 *
609 need_i_mutex = 0; 699 * Lock the inode appropriately to prepare for and issue a direct IO write.
610 } else { 700 * By separating it from the buffered write path we remove all the tricky to
611 iolock = XFS_IOLOCK_EXCL; 701 * follow locking changes and looping.
612 need_i_mutex = 1; 702 *
613 mutex_lock(&inode->i_mutex); 703 * If there are cached pages or we're extending the file, we need IOLOCK_EXCL
704 * until we're sure the bytes at the new EOF have been zeroed and/or the cached
705 * pages are flushed out.
706 *
707 * In most cases the direct IO writes will be done holding IOLOCK_SHARED
708 * allowing them to be done in parallel with reads and other direct IO writes.
709 * However, if the IO is not aligned to filesystem blocks, the direct IO layer
710 * needs to do sub-block zeroing and that requires serialisation against other
711 * direct IOs to the same block. In this case we need to serialise the
712 * submission of the unaligned IOs so that we don't get racing block zeroing in
713 * the dio layer. To avoid the problem with aio, we also need to wait for
714 * outstanding IOs to complete so that unwritten extent conversion is completed
715 * before we try to map the overlapping block. This is currently implemented by
716 * hitting it with a big hammer (i.e. xfs_ioend_wait()).
717 *
718 * Returns with locks held indicated by @iolock and errors indicated by
719 * negative return values.
720 */
721STATIC ssize_t
722xfs_file_dio_aio_write(
723 struct kiocb *iocb,
724 const struct iovec *iovp,
725 unsigned long nr_segs,
726 loff_t pos,
727 size_t ocount,
728 int *iolock)
729{
730 struct file *file = iocb->ki_filp;
731 struct address_space *mapping = file->f_mapping;
732 struct inode *inode = mapping->host;
733 struct xfs_inode *ip = XFS_I(inode);
734 struct xfs_mount *mp = ip->i_mount;
735 ssize_t ret = 0;
736 size_t count = ocount;
737 int unaligned_io = 0;
738 struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ?
739 mp->m_rtdev_targp : mp->m_ddev_targp;
740
741 *iolock = 0;
742 if ((pos & target->bt_smask) || (count & target->bt_smask))
743 return -XFS_ERROR(EINVAL);
744
745 if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask))
746 unaligned_io = 1;
747
748 if (unaligned_io || mapping->nrpages || pos > ip->i_size)
749 *iolock = XFS_IOLOCK_EXCL;
750 else
751 *iolock = XFS_IOLOCK_SHARED;
752 xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
753
754 ret = xfs_file_aio_write_checks(file, &pos, &count, iolock);
755 if (ret)
756 return ret;
757
758 if (mapping->nrpages) {
759 WARN_ON(*iolock != XFS_IOLOCK_EXCL);
760 ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1,
761 FI_REMAPF_LOCKED);
762 if (ret)
763 return ret;
614 } 764 }
615 765
616 xfs_ilock(ip, XFS_ILOCK_EXCL|iolock); 766 /*
617 767 * If we are doing unaligned IO, wait for all other IO to drain,
618start: 768 * otherwise demote the lock if we had to flush cached pages
619 error = -generic_write_checks(file, &pos, &count, 769 */
620 S_ISBLK(inode->i_mode)); 770 if (unaligned_io)
621 if (error) { 771 xfs_ioend_wait(ip);
622 xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); 772 else if (*iolock == XFS_IOLOCK_EXCL) {
623 goto out_unlock_mutex; 773 xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
774 *iolock = XFS_IOLOCK_SHARED;
624 } 775 }
625 776
626 if (ioflags & IO_ISDIRECT) { 777 trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
627 xfs_buftarg_t *target = 778 ret = generic_file_direct_write(iocb, iovp,
628 XFS_IS_REALTIME_INODE(ip) ? 779 &nr_segs, pos, &iocb->ki_pos, count, ocount);
629 mp->m_rtdev_targp : mp->m_ddev_targp;
630 780
631 if ((pos & target->bt_smask) || (count & target->bt_smask)) { 781 /* No fallback to buffered IO on errors for XFS. */
632 xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); 782 ASSERT(ret < 0 || ret == count);
633 return XFS_ERROR(-EINVAL); 783 return ret;
634 } 784}
635 785
636 if (!need_i_mutex && (mapping->nrpages || pos > ip->i_size)) { 786STATIC ssize_t
637 xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); 787xfs_file_buffered_aio_write(
638 iolock = XFS_IOLOCK_EXCL; 788 struct kiocb *iocb,
639 need_i_mutex = 1; 789 const struct iovec *iovp,
640 mutex_lock(&inode->i_mutex); 790 unsigned long nr_segs,
641 xfs_ilock(ip, XFS_ILOCK_EXCL|iolock); 791 loff_t pos,
642 goto start; 792 size_t ocount,
643 } 793 int *iolock)
644 } 794{
795 struct file *file = iocb->ki_filp;
796 struct address_space *mapping = file->f_mapping;
797 struct inode *inode = mapping->host;
798 struct xfs_inode *ip = XFS_I(inode);
799 ssize_t ret;
800 int enospc = 0;
801 size_t count = ocount;
645 802
646 new_size = pos + count; 803 *iolock = XFS_IOLOCK_EXCL;
647 if (new_size > ip->i_size) 804 xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
648 ip->i_new_size = new_size;
649 805
650 if (likely(!(ioflags & IO_INVIS))) 806 ret = xfs_file_aio_write_checks(file, &pos, &count, iolock);
651 file_update_time(file); 807 if (ret)
808 return ret;
809
810 /* We can write back this queue in page reclaim */
811 current->backing_dev_info = mapping->backing_dev_info;
652 812
813write_retry:
814 trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0);
815 ret = generic_file_buffered_write(iocb, iovp, nr_segs,
816 pos, &iocb->ki_pos, count, ret);
653 /* 817 /*
654 * If the offset is beyond the size of the file, we have a couple 818 * if we just got an ENOSPC, flush the inode now we aren't holding any
655 * of things to do. First, if there is already space allocated 819 * page locks and retry *once*
656 * we need to either create holes or zero the disk or ...
657 *
658 * If there is a page where the previous size lands, we need
659 * to zero it out up to the new size.
660 */ 820 */
661 821 if (ret == -ENOSPC && !enospc) {
662 if (pos > ip->i_size) { 822 ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
663 error = xfs_zero_eof(ip, pos, ip->i_size); 823 if (ret)
664 if (error) { 824 return ret;
665 xfs_iunlock(ip, XFS_ILOCK_EXCL); 825 enospc = 1;
666 goto out_unlock_internal; 826 goto write_retry;
667 }
668 } 827 }
669 xfs_iunlock(ip, XFS_ILOCK_EXCL); 828 current->backing_dev_info = NULL;
829 return ret;
830}
670 831
671 /* 832STATIC ssize_t
672 * If we're writing the file then make sure to clear the 833xfs_file_aio_write(
673 * setuid and setgid bits if the process is not being run 834 struct kiocb *iocb,
674 * by root. This keeps people from modifying setuid and 835 const struct iovec *iovp,
675 * setgid binaries. 836 unsigned long nr_segs,
676 */ 837 loff_t pos)
677 error = -file_remove_suid(file); 838{
678 if (unlikely(error)) 839 struct file *file = iocb->ki_filp;
679 goto out_unlock_internal; 840 struct address_space *mapping = file->f_mapping;
841 struct inode *inode = mapping->host;
842 struct xfs_inode *ip = XFS_I(inode);
843 ssize_t ret;
844 int iolock;
845 size_t ocount = 0;
680 846
681 /* We can write back this queue in page reclaim */ 847 XFS_STATS_INC(xs_write_calls);
682 current->backing_dev_info = mapping->backing_dev_info;
683 848
684 if ((ioflags & IO_ISDIRECT)) { 849 BUG_ON(iocb->ki_pos != pos);
685 if (mapping->nrpages) {
686 WARN_ON(need_i_mutex == 0);
687 error = xfs_flushinval_pages(ip,
688 (pos & PAGE_CACHE_MASK),
689 -1, FI_REMAPF_LOCKED);
690 if (error)
691 goto out_unlock_internal;
692 }
693 850
694 if (need_i_mutex) { 851 ret = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ);
695 /* demote the lock now the cached pages are gone */ 852 if (ret)
696 xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); 853 return ret;
697 mutex_unlock(&inode->i_mutex);
698 854
699 iolock = XFS_IOLOCK_SHARED; 855 if (ocount == 0)
700 need_i_mutex = 0; 856 return 0;
701 }
702 857
703 trace_xfs_file_direct_write(ip, count, iocb->ki_pos, ioflags); 858 xfs_wait_for_freeze(ip->i_mount, SB_FREEZE_WRITE);
704 ret = generic_file_direct_write(iocb, iovp,
705 &nr_segs, pos, &iocb->ki_pos, count, ocount);
706 859
707 /* 860 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
708 * direct-io write to a hole: fall through to buffered I/O 861 return -EIO;
709 * for completing the rest of the request.
710 */
711 if (ret >= 0 && ret != count) {
712 XFS_STATS_ADD(xs_write_bytes, ret);
713 862
714 pos += ret; 863 if (unlikely(file->f_flags & O_DIRECT))
715 count -= ret; 864 ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos,
865 ocount, &iolock);
866 else
867 ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos,
868 ocount, &iolock);
716 869
717 ioflags &= ~IO_ISDIRECT; 870 xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret);
718 xfs_iunlock(ip, iolock);
719 goto relock;
720 }
721 } else {
722 int enospc = 0;
723 ssize_t ret2 = 0;
724 871
725write_retry: 872 if (ret <= 0)
726 trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, ioflags); 873 goto out_unlock;
727 ret2 = generic_file_buffered_write(iocb, iovp, nr_segs,
728 pos, &iocb->ki_pos, count, ret);
729 /*
730 * if we just got an ENOSPC, flush the inode now we
731 * aren't holding any page locks and retry *once*
732 */
733 if (ret2 == -ENOSPC && !enospc) {
734 error = xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
735 if (error)
736 goto out_unlock_internal;
737 enospc = 1;
738 goto write_retry;
739 }
740 ret = ret2;
741 }
742 874
743 current->backing_dev_info = NULL; 875 /* Handle various SYNC-type writes */
876 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
877 loff_t end = pos + ret - 1;
878 int error, error2;
744 879
745 isize = i_size_read(inode); 880 xfs_rw_iunlock(ip, iolock);
746 if (unlikely(ret < 0 && ret != -EFAULT && iocb->ki_pos > isize)) 881 error = filemap_write_and_wait_range(mapping, pos, end);
747 iocb->ki_pos = isize; 882 xfs_rw_ilock(ip, iolock);
748 883
749 if (iocb->ki_pos > ip->i_size) { 884 error2 = -xfs_file_fsync(file,
750 xfs_ilock(ip, XFS_ILOCK_EXCL); 885 (file->f_flags & __O_SYNC) ? 0 : 1);
751 if (iocb->ki_pos > ip->i_size) 886 if (error)
752 ip->i_size = iocb->ki_pos; 887 ret = error;
753 xfs_iunlock(ip, XFS_ILOCK_EXCL); 888 else if (error2)
889 ret = error2;
754 } 890 }
755 891
756 error = -ret; 892out_unlock:
757 if (ret <= 0) 893 xfs_aio_write_newsize_update(ip);
758 goto out_unlock_internal; 894 xfs_rw_iunlock(ip, iolock);
895 return ret;
896}
759 897
760 XFS_STATS_ADD(xs_write_bytes, ret); 898STATIC long
899xfs_file_fallocate(
900 struct file *file,
901 int mode,
902 loff_t offset,
903 loff_t len)
904{
905 struct inode *inode = file->f_path.dentry->d_inode;
906 long error;
907 loff_t new_size = 0;
908 xfs_flock64_t bf;
909 xfs_inode_t *ip = XFS_I(inode);
910 int cmd = XFS_IOC_RESVSP;
911 int attr_flags = XFS_ATTR_NOLOCK;
761 912
762 /* Handle various SYNC-type writes */ 913 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
763 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { 914 return -EOPNOTSUPP;
764 loff_t end = pos + ret - 1;
765 int error2;
766 915
767 xfs_iunlock(ip, iolock); 916 bf.l_whence = 0;
768 if (need_i_mutex) 917 bf.l_start = offset;
769 mutex_unlock(&inode->i_mutex); 918 bf.l_len = len;
770 919
771 error2 = filemap_write_and_wait_range(mapping, pos, end); 920 xfs_ilock(ip, XFS_IOLOCK_EXCL);
772 if (!error)
773 error = error2;
774 if (need_i_mutex)
775 mutex_lock(&inode->i_mutex);
776 xfs_ilock(ip, iolock);
777 921
778 error2 = -xfs_file_fsync(file, 922 if (mode & FALLOC_FL_PUNCH_HOLE)
779 (file->f_flags & __O_SYNC) ? 0 : 1); 923 cmd = XFS_IOC_UNRESVSP;
780 if (!error) 924
781 error = error2; 925 /* check the new inode size is valid before allocating */
926 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
927 offset + len > i_size_read(inode)) {
928 new_size = offset + len;
929 error = inode_newsize_ok(inode, new_size);
930 if (error)
931 goto out_unlock;
782 } 932 }
783 933
784 out_unlock_internal: 934 if (file->f_flags & O_DSYNC)
785 if (ip->i_new_size) { 935 attr_flags |= XFS_ATTR_SYNC;
786 xfs_ilock(ip, XFS_ILOCK_EXCL); 936
787 ip->i_new_size = 0; 937 error = -xfs_change_file_space(ip, cmd, &bf, 0, attr_flags);
788 /* 938 if (error)
789 * If this was a direct or synchronous I/O that failed (such 939 goto out_unlock;
790 * as ENOSPC) then part of the I/O may have been written to 940
791 * disk before the error occured. In this case the on-disk 941 /* Change file size if needed */
792 * file size may have been adjusted beyond the in-memory file 942 if (new_size) {
793 * size and now needs to be truncated back. 943 struct iattr iattr;
794 */ 944
795 if (ip->i_d.di_size > ip->i_size) 945 iattr.ia_valid = ATTR_SIZE;
796 ip->i_d.di_size = ip->i_size; 946 iattr.ia_size = new_size;
797 xfs_iunlock(ip, XFS_ILOCK_EXCL); 947 error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK);
798 } 948 }
799 xfs_iunlock(ip, iolock); 949
800 out_unlock_mutex: 950out_unlock:
801 if (need_i_mutex) 951 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
802 mutex_unlock(&inode->i_mutex); 952 return error;
803 return -error;
804} 953}
805 954
955
806STATIC int 956STATIC int
807xfs_file_open( 957xfs_file_open(
808 struct inode *inode, 958 struct inode *inode,
@@ -921,6 +1071,7 @@ const struct file_operations xfs_file_operations = {
921 .open = xfs_file_open, 1071 .open = xfs_file_open,
922 .release = xfs_file_release, 1072 .release = xfs_file_release,
923 .fsync = xfs_file_fsync, 1073 .fsync = xfs_file_fsync,
1074 .fallocate = xfs_file_fallocate,
924}; 1075};
925 1076
926const struct file_operations xfs_dir_file_operations = { 1077const struct file_operations xfs_dir_file_operations = {
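
With .fallocate now in xfs_file_operations, preallocation and hole punching go through the generic fallocate(2) system call instead of the old XFS-only inode operation removed from xfs_iops.c below. A minimal user-space sketch (path and sizes are placeholders):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <linux/falloc.h>       /* FALLOC_FL_KEEP_SIZE, FALLOC_FL_PUNCH_HOLE */

    int main(void)
    {
            int fd = open("/mnt/xfs/file", O_RDWR | O_CREAT, 0644);

            if (fd < 0)
                    return 1;

            /* Preallocate 1 MiB at offset 0 (maps to XFS_IOC_RESVSP). */
            if (fallocate(fd, 0, 0, 1 << 20) < 0)
                    perror("preallocate");

            /* Punch a 64 KiB hole without changing i_size (maps to XFS_IOC_UNRESVSP). */
            if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, 64 << 10) < 0)
                    perror("punch hole");
            return 0;
    }
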
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c
index 1f279b012f94..ed88ed16811c 100644
--- a/fs/xfs/linux-2.6/xfs_fs_subr.c
+++ b/fs/xfs/linux-2.6/xfs_fs_subr.c
@@ -32,10 +32,9 @@ xfs_tosspages(
32 xfs_off_t last, 32 xfs_off_t last,
33 int fiopt) 33 int fiopt)
34{ 34{
35 struct address_space *mapping = VFS_I(ip)->i_mapping; 35 /* can't toss partial tail pages, so mask them out */
36 36 last &= ~(PAGE_SIZE - 1);
37 if (mapping->nrpages) 37 truncate_inode_pages_range(VFS_I(ip)->i_mapping, first, last - 1);
38 truncate_inode_pages(mapping, first);
39} 38}
40 39
41int 40int
@@ -50,12 +49,11 @@ xfs_flushinval_pages(
50 49
51 trace_xfs_pagecache_inval(ip, first, last); 50 trace_xfs_pagecache_inval(ip, first, last);
52 51
53 if (mapping->nrpages) { 52 xfs_iflags_clear(ip, XFS_ITRUNCATED);
54 xfs_iflags_clear(ip, XFS_ITRUNCATED); 53 ret = filemap_write_and_wait_range(mapping, first,
55 ret = filemap_write_and_wait(mapping); 54 last == -1 ? LLONG_MAX : last);
56 if (!ret) 55 if (!ret)
57 truncate_inode_pages(mapping, first); 56 truncate_inode_pages_range(mapping, first, last);
58 }
59 return -ret; 57 return -ret;
60} 58}
61 59
@@ -71,10 +69,9 @@ xfs_flush_pages(
71 int ret = 0; 69 int ret = 0;
72 int ret2; 70 int ret2;
73 71
74 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { 72 xfs_iflags_clear(ip, XFS_ITRUNCATED);
75 xfs_iflags_clear(ip, XFS_ITRUNCATED); 73 ret = -filemap_fdatawrite_range(mapping, first,
76 ret = -filemap_fdatawrite(mapping); 74 last == -1 ? LLONG_MAX : last);
77 }
78 if (flags & XBF_ASYNC) 75 if (flags & XBF_ASYNC)
79 return ret; 76 return ret;
80 ret2 = xfs_wait_on_pages(ip, first, last); 77 ret2 = xfs_wait_on_pages(ip, first, last);
@@ -91,7 +88,9 @@ xfs_wait_on_pages(
91{ 88{
92 struct address_space *mapping = VFS_I(ip)->i_mapping; 89 struct address_space *mapping = VFS_I(ip)->i_mapping;
93 90
94 if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) 91 if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) {
95 return -filemap_fdatawait(mapping); 92 return -filemap_fdatawait_range(mapping, first,
93 last == -1 ? ip->i_size - 1 : last);
94 }
96 return 0; 95 return 0;
97} 96}
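
The new tail-page masking in xfs_tosspages() is easiest to see with numbers (4 KiB pages and first = 0 assumed): for last = 10000, last & ~(PAGE_SIZE - 1) = 8192, so truncate_inode_pages_range() runs with an inclusive end of 8191 and only the fully covered pages [0, 4095] and [4096, 8191] are tossed; the partially covered tail page stays in the cache rather than having live data thrown away with it.
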
diff --git a/fs/xfs/linux-2.6/xfs_globals.c b/fs/xfs/linux-2.6/xfs_globals.c
index 2ae8b1ccb02e..76e81cff70b9 100644
--- a/fs/xfs/linux-2.6/xfs_globals.c
+++ b/fs/xfs/linux-2.6/xfs_globals.c
@@ -16,7 +16,6 @@
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */ 17 */
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_cred.h"
20#include "xfs_sysctl.h" 19#include "xfs_sysctl.h"
21 20
22/* 21/*
diff --git a/fs/xfs/linux-2.6/xfs_globals.h b/fs/xfs/linux-2.6/xfs_globals.h
deleted file mode 100644
index 69f71caf061c..000000000000
--- a/fs/xfs/linux-2.6/xfs_globals.h
+++ /dev/null
@@ -1,23 +0,0 @@
1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_GLOBALS_H__
19#define __XFS_GLOBALS_H__
20
21extern uint64_t xfs_panic_mask; /* set to cause more panics */
22
23#endif /* __XFS_GLOBALS_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 3b9e626f7cd1..acca2c5ca3fa 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -39,6 +39,7 @@
39#include "xfs_dfrag.h" 39#include "xfs_dfrag.h"
40#include "xfs_fsops.h" 40#include "xfs_fsops.h"
41#include "xfs_vnodeops.h" 41#include "xfs_vnodeops.h"
42#include "xfs_discard.h"
42#include "xfs_quota.h" 43#include "xfs_quota.h"
43#include "xfs_inode_item.h" 44#include "xfs_inode_item.h"
44#include "xfs_export.h" 45#include "xfs_export.h"
@@ -416,7 +417,7 @@ xfs_attrlist_by_handle(
416 if (IS_ERR(dentry)) 417 if (IS_ERR(dentry))
417 return PTR_ERR(dentry); 418 return PTR_ERR(dentry);
418 419
419 kbuf = kmalloc(al_hreq.buflen, GFP_KERNEL); 420 kbuf = kzalloc(al_hreq.buflen, GFP_KERNEL);
420 if (!kbuf) 421 if (!kbuf)
421 goto out_dput; 422 goto out_dput;
422 423
@@ -623,6 +624,10 @@ xfs_ioc_space(
623 624
624 if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) 625 if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
625 attr_flags |= XFS_ATTR_NONBLOCK; 626 attr_flags |= XFS_ATTR_NONBLOCK;
627
628 if (filp->f_flags & O_DSYNC)
629 attr_flags |= XFS_ATTR_SYNC;
630
626 if (ioflags & IO_INVIS) 631 if (ioflags & IO_INVIS)
627 attr_flags |= XFS_ATTR_DMI; 632 attr_flags |= XFS_ATTR_DMI;
628 633
@@ -694,14 +699,19 @@ xfs_ioc_fsgeometry_v1(
694 xfs_mount_t *mp, 699 xfs_mount_t *mp,
695 void __user *arg) 700 void __user *arg)
696{ 701{
697 xfs_fsop_geom_v1_t fsgeo; 702 xfs_fsop_geom_t fsgeo;
698 int error; 703 int error;
699 704
700 error = xfs_fs_geometry(mp, (xfs_fsop_geom_t *)&fsgeo, 3); 705 error = xfs_fs_geometry(mp, &fsgeo, 3);
701 if (error) 706 if (error)
702 return -error; 707 return -error;
703 708
704 if (copy_to_user(arg, &fsgeo, sizeof(fsgeo))) 709 /*
710 * Caller should have passed an argument of type
711 * xfs_fsop_geom_v1_t. This is a proper subset of the
712 * xfs_fsop_geom_t that xfs_fs_geometry() fills in.
713 */
714 if (copy_to_user(arg, &fsgeo, sizeof(xfs_fsop_geom_v1_t)))
705 return -XFS_ERROR(EFAULT); 715 return -XFS_ERROR(EFAULT);
706 return 0; 716 return 0;
707} 717}
@@ -790,7 +800,7 @@ xfs_ioc_fsgetxattr(
790 xfs_ilock(ip, XFS_ILOCK_SHARED); 800 xfs_ilock(ip, XFS_ILOCK_SHARED);
791 fa.fsx_xflags = xfs_ip2xflags(ip); 801 fa.fsx_xflags = xfs_ip2xflags(ip);
792 fa.fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog; 802 fa.fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog;
793 fa.fsx_projid = ip->i_d.di_projid; 803 fa.fsx_projid = xfs_get_projid(ip);
794 804
795 if (attr) { 805 if (attr) {
796 if (ip->i_afp) { 806 if (ip->i_afp) {
@@ -909,10 +919,10 @@ xfs_ioctl_setattr(
909 return XFS_ERROR(EIO); 919 return XFS_ERROR(EIO);
910 920
911 /* 921 /*
912 * Disallow 32bit project ids because on-disk structure 922 * Disallow 32bit project ids when projid32bit feature is not enabled.
913 * is 16bit only.
914 */ 923 */
915 if ((mask & FSX_PROJID) && (fa->fsx_projid > (__uint16_t)-1)) 924 if ((mask & FSX_PROJID) && (fa->fsx_projid > (__uint16_t)-1) &&
925 !xfs_sb_version_hasprojid32bit(&ip->i_mount->m_sb))
916 return XFS_ERROR(EINVAL); 926 return XFS_ERROR(EINVAL);
917 927
918 /* 928 /*
@@ -961,7 +971,7 @@ xfs_ioctl_setattr(
961 if (mask & FSX_PROJID) { 971 if (mask & FSX_PROJID) {
962 if (XFS_IS_QUOTA_RUNNING(mp) && 972 if (XFS_IS_QUOTA_RUNNING(mp) &&
963 XFS_IS_PQUOTA_ON(mp) && 973 XFS_IS_PQUOTA_ON(mp) &&
964 ip->i_d.di_projid != fa->fsx_projid) { 974 xfs_get_projid(ip) != fa->fsx_projid) {
965 ASSERT(tp); 975 ASSERT(tp);
966 code = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp, 976 code = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp,
967 capable(CAP_FOWNER) ? 977 capable(CAP_FOWNER) ?
@@ -984,10 +994,22 @@ xfs_ioctl_setattr(
984 994
985 /* 995 /*
986 * Extent size must be a multiple of the appropriate block 996 * Extent size must be a multiple of the appropriate block
987 * size, if set at all. 997 * size, if set at all. It must also be smaller than the
998 * maximum extent size supported by the filesystem.
999 *
1000 * Also, for non-realtime files, limit the extent size hint to
1001 * half the size of the AGs in the filesystem so alignment
1002 * doesn't result in extents larger than an AG.
988 */ 1003 */
989 if (fa->fsx_extsize != 0) { 1004 if (fa->fsx_extsize != 0) {
990 xfs_extlen_t size; 1005 xfs_extlen_t size;
1006 xfs_fsblock_t extsize_fsb;
1007
1008 extsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_extsize);
1009 if (extsize_fsb > MAXEXTLEN) {
1010 code = XFS_ERROR(EINVAL);
1011 goto error_return;
1012 }
991 1013
992 if (XFS_IS_REALTIME_INODE(ip) || 1014 if (XFS_IS_REALTIME_INODE(ip) ||
993 ((mask & FSX_XFLAGS) && 1015 ((mask & FSX_XFLAGS) &&
@@ -996,6 +1018,10 @@ xfs_ioctl_setattr(
996 mp->m_sb.sb_blocklog; 1018 mp->m_sb.sb_blocklog;
997 } else { 1019 } else {
998 size = mp->m_sb.sb_blocksize; 1020 size = mp->m_sb.sb_blocksize;
1021 if (extsize_fsb > mp->m_sb.sb_agblocks / 2) {
1022 code = XFS_ERROR(EINVAL);
1023 goto error_return;
1024 }
999 } 1025 }
1000 1026
1001 if (fa->fsx_extsize % size) { 1027 if (fa->fsx_extsize % size) {
@@ -1063,12 +1089,12 @@ xfs_ioctl_setattr(
1063 * Change the ownerships and register quota modifications 1089 * Change the ownerships and register quota modifications
1064 * in the transaction. 1090 * in the transaction.
1065 */ 1091 */
1066 if (ip->i_d.di_projid != fa->fsx_projid) { 1092 if (xfs_get_projid(ip) != fa->fsx_projid) {
1067 if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) { 1093 if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) {
1068 olddquot = xfs_qm_vop_chown(tp, ip, 1094 olddquot = xfs_qm_vop_chown(tp, ip,
1069 &ip->i_gdquot, gdqp); 1095 &ip->i_gdquot, gdqp);
1070 } 1096 }
1071 ip->i_d.di_projid = fa->fsx_projid; 1097 xfs_set_projid(ip, fa->fsx_projid);
1072 1098
1073 /* 1099 /*
1074 * We may have to rev the inode as well as 1100 * We may have to rev the inode as well as
@@ -1088,8 +1114,8 @@ xfs_ioctl_setattr(
1088 xfs_diflags_to_linux(ip); 1114 xfs_diflags_to_linux(ip);
1089 } 1115 }
1090 1116
1117 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
1091 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 1118 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1092 xfs_ichgtime(ip, XFS_ICHGTIME_CHG);
1093 1119
1094 XFS_STATS_INC(xs_ig_attrchg); 1120 XFS_STATS_INC(xs_ig_attrchg);
1095 1121
@@ -1294,6 +1320,8 @@ xfs_file_ioctl(
1294 trace_xfs_file_ioctl(ip); 1320 trace_xfs_file_ioctl(ip);
1295 1321
1296 switch (cmd) { 1322 switch (cmd) {
1323 case FITRIM:
1324 return xfs_ioc_trim(mp, arg);
1297 case XFS_IOC_ALLOCSP: 1325 case XFS_IOC_ALLOCSP:
1298 case XFS_IOC_FREESP: 1326 case XFS_IOC_FREESP:
1299 case XFS_IOC_RESVSP: 1327 case XFS_IOC_RESVSP:
@@ -1301,7 +1329,8 @@ xfs_file_ioctl(
1301 case XFS_IOC_ALLOCSP64: 1329 case XFS_IOC_ALLOCSP64:
1302 case XFS_IOC_FREESP64: 1330 case XFS_IOC_FREESP64:
1303 case XFS_IOC_RESVSP64: 1331 case XFS_IOC_RESVSP64:
1304 case XFS_IOC_UNRESVSP64: { 1332 case XFS_IOC_UNRESVSP64:
1333 case XFS_IOC_ZERO_RANGE: {
1305 xfs_flock64_t bf; 1334 xfs_flock64_t bf;
1306 1335
1307 if (copy_from_user(&bf, arg, sizeof(bf))) 1336 if (copy_from_user(&bf, arg, sizeof(bf)))
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index 6c83f7f62dc9..54e623bfbb85 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -164,7 +164,8 @@ xfs_ioctl32_bstat_copyin(
164 get_user(bstat->bs_extsize, &bstat32->bs_extsize) || 164 get_user(bstat->bs_extsize, &bstat32->bs_extsize) ||
165 get_user(bstat->bs_extents, &bstat32->bs_extents) || 165 get_user(bstat->bs_extents, &bstat32->bs_extents) ||
166 get_user(bstat->bs_gen, &bstat32->bs_gen) || 166 get_user(bstat->bs_gen, &bstat32->bs_gen) ||
167 get_user(bstat->bs_projid, &bstat32->bs_projid) || 167 get_user(bstat->bs_projid_lo, &bstat32->bs_projid_lo) ||
168 get_user(bstat->bs_projid_hi, &bstat32->bs_projid_hi) ||
168 get_user(bstat->bs_dmevmask, &bstat32->bs_dmevmask) || 169 get_user(bstat->bs_dmevmask, &bstat32->bs_dmevmask) ||
169 get_user(bstat->bs_dmstate, &bstat32->bs_dmstate) || 170 get_user(bstat->bs_dmstate, &bstat32->bs_dmstate) ||
170 get_user(bstat->bs_aextents, &bstat32->bs_aextents)) 171 get_user(bstat->bs_aextents, &bstat32->bs_aextents))
@@ -218,6 +219,7 @@ xfs_bulkstat_one_fmt_compat(
218 put_user(buffer->bs_extents, &p32->bs_extents) || 219 put_user(buffer->bs_extents, &p32->bs_extents) ||
219 put_user(buffer->bs_gen, &p32->bs_gen) || 220 put_user(buffer->bs_gen, &p32->bs_gen) ||
220 put_user(buffer->bs_projid, &p32->bs_projid) || 221 put_user(buffer->bs_projid, &p32->bs_projid) ||
222 put_user(buffer->bs_projid_hi, &p32->bs_projid_hi) ||
221 put_user(buffer->bs_dmevmask, &p32->bs_dmevmask) || 223 put_user(buffer->bs_dmevmask, &p32->bs_dmevmask) ||
222 put_user(buffer->bs_dmstate, &p32->bs_dmstate) || 224 put_user(buffer->bs_dmstate, &p32->bs_dmstate) ||
223 put_user(buffer->bs_aextents, &p32->bs_aextents)) 225 put_user(buffer->bs_aextents, &p32->bs_aextents))
@@ -574,6 +576,7 @@ xfs_file_compat_ioctl(
574 case XFS_IOC_FSGEOMETRY_V1: 576 case XFS_IOC_FSGEOMETRY_V1:
575 case XFS_IOC_FSGROWFSDATA: 577 case XFS_IOC_FSGROWFSDATA:
576 case XFS_IOC_FSGROWFSRT: 578 case XFS_IOC_FSGROWFSRT:
579 case XFS_IOC_ZERO_RANGE:
577 return xfs_file_ioctl(filp, cmd, p); 580 return xfs_file_ioctl(filp, cmd, p);
578#else 581#else
579 case XFS_IOC_ALLOCSP_32: 582 case XFS_IOC_ALLOCSP_32:
@@ -583,7 +586,8 @@ xfs_file_compat_ioctl(
583 case XFS_IOC_RESVSP_32: 586 case XFS_IOC_RESVSP_32:
584 case XFS_IOC_UNRESVSP_32: 587 case XFS_IOC_UNRESVSP_32:
585 case XFS_IOC_RESVSP64_32: 588 case XFS_IOC_RESVSP64_32:
586 case XFS_IOC_UNRESVSP64_32: { 589 case XFS_IOC_UNRESVSP64_32:
590 case XFS_IOC_ZERO_RANGE_32: {
587 struct xfs_flock64 bf; 591 struct xfs_flock64 bf;
588 592
589 if (xfs_compat_flock64_copyin(&bf, arg)) 593 if (xfs_compat_flock64_copyin(&bf, arg))
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.h b/fs/xfs/linux-2.6/xfs_ioctl32.h
index 1024c4f8ba0d..80f4060e8970 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.h
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.h
@@ -65,8 +65,10 @@ typedef struct compat_xfs_bstat {
65 __s32 bs_extsize; /* extent size */ 65 __s32 bs_extsize; /* extent size */
66 __s32 bs_extents; /* number of extents */ 66 __s32 bs_extents; /* number of extents */
67 __u32 bs_gen; /* generation count */ 67 __u32 bs_gen; /* generation count */
68 __u16 bs_projid; /* project id */ 68 __u16 bs_projid_lo; /* lower part of project id */
69 unsigned char bs_pad[14]; /* pad space, unused */ 69#define bs_projid bs_projid_lo /* (previously just bs_projid) */
70 __u16 bs_projid_hi; /* high part of project id */
71 unsigned char bs_pad[12]; /* pad space, unused */
70 __u32 bs_dmevmask; /* DMIG event mask */ 72 __u32 bs_dmevmask; /* DMIG event mask */
71 __u16 bs_dmstate; /* DMIG state info */ 73 __u16 bs_dmstate; /* DMIG state info */
72 __u16 bs_aextents; /* attribute number of extents */ 74 __u16 bs_aextents; /* attribute number of extents */
@@ -182,6 +184,7 @@ typedef struct compat_xfs_flock64 {
182#define XFS_IOC_UNRESVSP_32 _IOW('X', 41, struct compat_xfs_flock64) 184#define XFS_IOC_UNRESVSP_32 _IOW('X', 41, struct compat_xfs_flock64)
183#define XFS_IOC_RESVSP64_32 _IOW('X', 42, struct compat_xfs_flock64) 185#define XFS_IOC_RESVSP64_32 _IOW('X', 42, struct compat_xfs_flock64)
184#define XFS_IOC_UNRESVSP64_32 _IOW('X', 43, struct compat_xfs_flock64) 186#define XFS_IOC_UNRESVSP64_32 _IOW('X', 43, struct compat_xfs_flock64)
187#define XFS_IOC_ZERO_RANGE_32 _IOW('X', 57, struct compat_xfs_flock64)
185 188
186typedef struct compat_xfs_fsop_geom_v1 { 189typedef struct compat_xfs_fsop_geom_v1 {
187 __u32 blocksize; /* filesystem (data) block size */ 190 __u32 blocksize; /* filesystem (data) block size */
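
bs_projid_lo/bs_projid_hi split the 32-bit project ID into two 16-bit halves so the old 16-bit ABI slot keeps working. The xfs_get_projid()/xfs_set_projid() accessors used in the xfs_ioctl.c changes above presumably compose and split the value along these lines (a sketch; the real helpers live in the core XFS inode header):

    static inline prid_t
    xfs_get_projid(struct xfs_inode *ip)
    {
            return (prid_t)ip->i_d.di_projid_hi << 16 | ip->i_d.di_projid_lo;
    }

    static inline void
    xfs_set_projid(struct xfs_inode *ip, prid_t projid)
    {
            ip->i_d.di_projid_hi = (__uint16_t)(projid >> 16);
            ip->i_d.di_projid_lo = (__uint16_t)(projid & 0xffff);
    }
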
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index b1fc2a6bfe83..d44d92cd12b1 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -46,7 +46,6 @@
46#include <linux/namei.h> 46#include <linux/namei.h>
47#include <linux/posix_acl.h> 47#include <linux/posix_acl.h>
48#include <linux/security.h> 48#include <linux/security.h>
49#include <linux/falloc.h>
50#include <linux/fiemap.h> 49#include <linux/fiemap.h>
51#include <linux/slab.h> 50#include <linux/slab.h>
52 51
@@ -71,7 +70,7 @@ xfs_synchronize_times(
71 70
72/* 71/*
73 * If the linux inode is valid, mark it dirty. 72 * If the linux inode is valid, mark it dirty.
74 * Used when commiting a dirty inode into a transaction so that 73 * Used when committing a dirty inode into a transaction so that
75 * the inode will get written back by the linux code 74 * the inode will get written back by the linux code
76 */ 75 */
77void 76void
@@ -95,41 +94,6 @@ xfs_mark_inode_dirty(
95} 94}
96 95
97/* 96/*
98 * Change the requested timestamp in the given inode.
99 * We don't lock across timestamp updates, and we don't log them but
100 * we do record the fact that there is dirty information in core.
101 */
102void
103xfs_ichgtime(
104 xfs_inode_t *ip,
105 int flags)
106{
107 struct inode *inode = VFS_I(ip);
108 timespec_t tv;
109 int sync_it = 0;
110
111 tv = current_fs_time(inode->i_sb);
112
113 if ((flags & XFS_ICHGTIME_MOD) &&
114 !timespec_equal(&inode->i_mtime, &tv)) {
115 inode->i_mtime = tv;
116 sync_it = 1;
117 }
118 if ((flags & XFS_ICHGTIME_CHG) &&
119 !timespec_equal(&inode->i_ctime, &tv)) {
120 inode->i_ctime = tv;
121 sync_it = 1;
122 }
123
124 /*
125 * Update complete - now make sure everyone knows that the inode
126 * is dirty.
127 */
128 if (sync_it)
129 xfs_mark_inode_dirty_sync(ip);
130}
131
132/*
133 * Hook in SELinux. This is not quite correct yet, what we really need 97 * Hook in SELinux. This is not quite correct yet, what we really need
134 * here (as we do for default ACLs) is a mechanism by which creation of 98 * here (as we do for default ACLs) is a mechanism by which creation of
135 * these attrs can be journalled at inode creation time (along with the 99 * these attrs can be journalled at inode creation time (along with the
@@ -138,7 +102,8 @@ xfs_ichgtime(
138STATIC int 102STATIC int
139xfs_init_security( 103xfs_init_security(
140 struct inode *inode, 104 struct inode *inode,
141 struct inode *dir) 105 struct inode *dir,
106 const struct qstr *qstr)
142{ 107{
143 struct xfs_inode *ip = XFS_I(inode); 108 struct xfs_inode *ip = XFS_I(inode);
144 size_t length; 109 size_t length;
@@ -146,7 +111,7 @@ xfs_init_security(
146 unsigned char *name; 111 unsigned char *name;
147 int error; 112 int error;
148 113
149 error = security_inode_init_security(inode, dir, (char **)&name, 114 error = security_inode_init_security(inode, dir, qstr, (char **)&name,
150 &value, &length); 115 &value, &length);
151 if (error) { 116 if (error) {
152 if (error == -EOPNOTSUPP) 117 if (error == -EOPNOTSUPP)
@@ -217,20 +182,20 @@ xfs_vn_mknod(
217 if (IS_POSIXACL(dir)) { 182 if (IS_POSIXACL(dir)) {
218 default_acl = xfs_get_acl(dir, ACL_TYPE_DEFAULT); 183 default_acl = xfs_get_acl(dir, ACL_TYPE_DEFAULT);
219 if (IS_ERR(default_acl)) 184 if (IS_ERR(default_acl))
220 return -PTR_ERR(default_acl); 185 return PTR_ERR(default_acl);
221 186
222 if (!default_acl) 187 if (!default_acl)
223 mode &= ~current_umask(); 188 mode &= ~current_umask();
224 } 189 }
225 190
226 xfs_dentry_to_name(&name, dentry); 191 xfs_dentry_to_name(&name, dentry);
227 error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip, NULL); 192 error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip);
228 if (unlikely(error)) 193 if (unlikely(error))
229 goto out_free_acl; 194 goto out_free_acl;
230 195
231 inode = VFS_I(ip); 196 inode = VFS_I(ip);
232 197
233 error = xfs_init_security(inode, dir); 198 error = xfs_init_security(inode, dir, &dentry->d_name);
234 if (unlikely(error)) 199 if (unlikely(error))
235 goto out_cleanup_inode; 200 goto out_cleanup_inode;
236 201
@@ -352,7 +317,7 @@ xfs_vn_link(
352 if (unlikely(error)) 317 if (unlikely(error))
353 return -error; 318 return -error;
354 319
355 atomic_inc(&inode->i_count); 320 ihold(inode);
356 d_instantiate(dentry, inode); 321 d_instantiate(dentry, inode);
357 return 0; 322 return 0;
358} 323}
@@ -397,13 +362,13 @@ xfs_vn_symlink(
397 (irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO); 362 (irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO);
398 xfs_dentry_to_name(&name, dentry); 363 xfs_dentry_to_name(&name, dentry);
399 364
400 error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip, NULL); 365 error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip);
401 if (unlikely(error)) 366 if (unlikely(error))
402 goto out; 367 goto out;
403 368
404 inode = VFS_I(cip); 369 inode = VFS_I(cip);
405 370
406 error = xfs_init_security(inode, dir); 371 error = xfs_init_security(inode, dir, &dentry->d_name);
407 if (unlikely(error)) 372 if (unlikely(error))
408 goto out_cleanup_inode; 373 goto out_cleanup_inode;
409 374
@@ -540,58 +505,6 @@ xfs_vn_setattr(
540 return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0); 505 return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0);
541} 506}
542 507
543STATIC long
544xfs_vn_fallocate(
545 struct inode *inode,
546 int mode,
547 loff_t offset,
548 loff_t len)
549{
550 long error;
551 loff_t new_size = 0;
552 xfs_flock64_t bf;
553 xfs_inode_t *ip = XFS_I(inode);
554
555 /* preallocation on directories not yet supported */
556 error = -ENODEV;
557 if (S_ISDIR(inode->i_mode))
558 goto out_error;
559
560 bf.l_whence = 0;
561 bf.l_start = offset;
562 bf.l_len = len;
563
564 xfs_ilock(ip, XFS_IOLOCK_EXCL);
565
566 /* check the new inode size is valid before allocating */
567 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
568 offset + len > i_size_read(inode)) {
569 new_size = offset + len;
570 error = inode_newsize_ok(inode, new_size);
571 if (error)
572 goto out_unlock;
573 }
574
575 error = -xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf,
576 0, XFS_ATTR_NOLOCK);
577 if (error)
578 goto out_unlock;
579
580 /* Change file size if needed */
581 if (new_size) {
582 struct iattr iattr;
583
584 iattr.ia_valid = ATTR_SIZE;
585 iattr.ia_size = new_size;
586 error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK);
587 }
588
589out_unlock:
590 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
591out_error:
592 return error;
593}
594
595#define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) 508#define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
596 509
597/* 510/*
@@ -685,7 +598,6 @@ static const struct inode_operations xfs_inode_operations = {
685 .getxattr = generic_getxattr, 598 .getxattr = generic_getxattr,
686 .removexattr = generic_removexattr, 599 .removexattr = generic_removexattr,
687 .listxattr = xfs_vn_listxattr, 600 .listxattr = xfs_vn_listxattr,
688 .fallocate = xfs_vn_fallocate,
689 .fiemap = xfs_vn_fiemap, 601 .fiemap = xfs_vn_fiemap,
690}; 602};
691 603
@@ -795,7 +707,10 @@ xfs_setup_inode(
795 707
796 inode->i_ino = ip->i_ino; 708 inode->i_ino = ip->i_ino;
797 inode->i_state = I_NEW; 709 inode->i_state = I_NEW;
798 inode_add_to_lists(ip->i_mount->m_super, inode); 710
711 inode_sb_list_add(inode);
712 /* make the inode look hashed for the writeback code */
713 hlist_add_fake(&inode->i_hash);
799 714
800 inode->i_mode = ip->i_d.di_mode; 715 inode->i_mode = ip->i_d.di_mode;
801 inode->i_nlink = ip->i_d.di_nlink; 716 inode->i_nlink = ip->i_d.di_nlink;
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index 2fa0bd9ebc7f..8633521b3b2e 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -37,10 +37,8 @@
37 37
38#include <kmem.h> 38#include <kmem.h>
39#include <mrlock.h> 39#include <mrlock.h>
40#include <sv.h>
41#include <time.h> 40#include <time.h>
42 41
43#include <support/debug.h>
44#include <support/uuid.h> 42#include <support/uuid.h>
45 43
46#include <linux/semaphore.h> 44#include <linux/semaphore.h>
@@ -71,6 +69,8 @@
71#include <linux/random.h> 69#include <linux/random.h>
72#include <linux/ctype.h> 70#include <linux/ctype.h>
73#include <linux/writeback.h> 71#include <linux/writeback.h>
72#include <linux/capability.h>
73#include <linux/list_sort.h>
74 74
75#include <asm/page.h> 75#include <asm/page.h>
76#include <asm/div64.h> 76#include <asm/div64.h>
@@ -79,15 +79,14 @@
79#include <asm/byteorder.h> 79#include <asm/byteorder.h>
80#include <asm/unaligned.h> 80#include <asm/unaligned.h>
81 81
82#include <xfs_cred.h>
83#include <xfs_vnode.h> 82#include <xfs_vnode.h>
84#include <xfs_stats.h> 83#include <xfs_stats.h>
85#include <xfs_sysctl.h> 84#include <xfs_sysctl.h>
86#include <xfs_iops.h> 85#include <xfs_iops.h>
87#include <xfs_aops.h> 86#include <xfs_aops.h>
88#include <xfs_super.h> 87#include <xfs_super.h>
89#include <xfs_globals.h>
90#include <xfs_buf.h> 88#include <xfs_buf.h>
89#include <xfs_message.h>
91 90
92/* 91/*
93 * Feature macros (disable/enable) 92 * Feature macros (disable/enable)
@@ -144,7 +143,7 @@
144#define SYNCHRONIZE() barrier() 143#define SYNCHRONIZE() barrier()
145#define __return_address __builtin_return_address(0) 144#define __return_address __builtin_return_address(0)
146 145
147#define dfltprid 0 146#define XFS_PROJID_DEFAULT 0
148#define MAXPATHLEN 1024 147#define MAXPATHLEN 1024
149 148
150#define MIN(a,b) (min(a,b)) 149#define MIN(a,b) (min(a,b))
@@ -282,4 +281,25 @@ static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y)
282#define __arch_pack 281#define __arch_pack
283#endif 282#endif
284 283
284#define ASSERT_ALWAYS(expr) \
285 (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
286
287#ifndef DEBUG
288#define ASSERT(expr) ((void)0)
289
290#ifndef STATIC
291# define STATIC static noinline
292#endif
293
294#else /* DEBUG */
295
296#define ASSERT(expr) \
297 (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
298
299#ifndef STATIC
300# define STATIC noinline
301#endif
302
303#endif /* DEBUG */
304
285#endif /* __XFS_LINUX__ */ 305#endif /* __XFS_LINUX__ */
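
The ASSERT family moved in from the old debug header keeps its usual semantics: ASSERT() compiles out entirely in non-DEBUG builds, ASSERT_ALWAYS() fires in every build, and STATIC stays noinline so the functions remain visible in backtraces. Illustrative (hypothetical) call sites:

    ASSERT(len > 0);                /* free in production builds, calls assfail() under DEBUG */
    ASSERT_ALWAYS(ptr != NULL);     /* checked in all builds */
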
diff --git a/fs/xfs/linux-2.6/xfs_message.c b/fs/xfs/linux-2.6/xfs_message.c
new file mode 100644
index 000000000000..bd672def95ac
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_message.c
@@ -0,0 +1,108 @@
1/*
2 * Copyright (c) 2011 Red Hat, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write the Free Software Foundation,
15 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
16 */
17
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_types.h"
21#include "xfs_log.h"
22#include "xfs_inum.h"
23#include "xfs_trans.h"
24#include "xfs_sb.h"
25#include "xfs_ag.h"
26#include "xfs_mount.h"
27
28/*
29 * XFS logging functions
30 */
31static void
32__xfs_printk(
33 const char *level,
34 const struct xfs_mount *mp,
35 struct va_format *vaf)
36{
37 if (mp && mp->m_fsname) {
38 printk("%sXFS (%s): %pV\n", level, mp->m_fsname, vaf);
39 return;
40 }
41 printk("%sXFS: %pV\n", level, vaf);
42}
43
44#define define_xfs_printk_level(func, kern_level) \
45void func(const struct xfs_mount *mp, const char *fmt, ...) \
46{ \
47 struct va_format vaf; \
48 va_list args; \
49 \
50 va_start(args, fmt); \
51 \
52 vaf.fmt = fmt; \
53 vaf.va = &args; \
54 \
55 __xfs_printk(kern_level, mp, &vaf); \
56 va_end(args); \
57} \
58
59define_xfs_printk_level(xfs_emerg, KERN_EMERG);
60define_xfs_printk_level(xfs_alert, KERN_ALERT);
61define_xfs_printk_level(xfs_crit, KERN_CRIT);
62define_xfs_printk_level(xfs_err, KERN_ERR);
63define_xfs_printk_level(xfs_warn, KERN_WARNING);
64define_xfs_printk_level(xfs_notice, KERN_NOTICE);
65define_xfs_printk_level(xfs_info, KERN_INFO);
66#ifdef DEBUG
67define_xfs_printk_level(xfs_debug, KERN_DEBUG);
68#endif
69
70void
71xfs_alert_tag(
72 const struct xfs_mount *mp,
73 int panic_tag,
74 const char *fmt, ...)
75{
76 struct va_format vaf;
77 va_list args;
78 int do_panic = 0;
79
80 if (xfs_panic_mask && (xfs_panic_mask & panic_tag)) {
81 xfs_alert(mp, "Transforming an alert into a BUG.");
82 do_panic = 1;
83 }
84
85 va_start(args, fmt);
86
87 vaf.fmt = fmt;
88 vaf.va = &args;
89
90 __xfs_printk(KERN_ALERT, mp, &vaf);
91 va_end(args);
92
93 BUG_ON(do_panic);
94}
95
96void
97assfail(char *expr, char *file, int line)
98{
99 xfs_emerg(NULL, "Assertion failed: %s, file: %s, line: %d",
100 expr, file, line);
101 BUG();
102}
103
104void
105xfs_hex_dump(void *p, int length)
106{
107 print_hex_dump(KERN_ALERT, "", DUMP_PREFIX_ADDRESS, 16, 1, p, length, 1);
108}
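
These helpers prefix every message with "XFS (fsname):" (or plain "XFS:" when no mount is passed), so callers supply only the mount pointer and a printf-style format. Hypothetical call sites, for illustration only:

    xfs_warn(mp, "quota usage exceeds soft limit for project %u", prid);
    xfs_notice(NULL, "message logged without a mount context");
    xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "corrupt inode %llu detected", ino);
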
diff --git a/fs/xfs/linux-2.6/xfs_message.h b/fs/xfs/linux-2.6/xfs_message.h
new file mode 100644
index 000000000000..7fb7ea007672
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_message.h
@@ -0,0 +1,39 @@
1#ifndef __XFS_MESSAGE_H
2#define __XFS_MESSAGE_H 1
3
4struct xfs_mount;
5
6extern void xfs_emerg(const struct xfs_mount *mp, const char *fmt, ...)
7 __attribute__ ((format (printf, 2, 3)));
8extern void xfs_alert(const struct xfs_mount *mp, const char *fmt, ...)
9 __attribute__ ((format (printf, 2, 3)));
10extern void xfs_alert_tag(const struct xfs_mount *mp, int tag,
11 const char *fmt, ...)
12 __attribute__ ((format (printf, 3, 4)));
13extern void xfs_crit(const struct xfs_mount *mp, const char *fmt, ...)
14 __attribute__ ((format (printf, 2, 3)));
15extern void xfs_err(const struct xfs_mount *mp, const char *fmt, ...)
16 __attribute__ ((format (printf, 2, 3)));
17extern void xfs_warn(const struct xfs_mount *mp, const char *fmt, ...)
18 __attribute__ ((format (printf, 2, 3)));
19extern void xfs_notice(const struct xfs_mount *mp, const char *fmt, ...)
20 __attribute__ ((format (printf, 2, 3)));
21extern void xfs_info(const struct xfs_mount *mp, const char *fmt, ...)
22 __attribute__ ((format (printf, 2, 3)));
23
24#ifdef DEBUG
25extern void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...)
26 __attribute__ ((format (printf, 2, 3)));
27#else
28static inline void
29__attribute__ ((format (printf, 2, 3)))
30xfs_debug(const struct xfs_mount *mp, const char *fmt, ...)
31{
32}
33#endif
34
35extern void assfail(char *expr, char *f, int l);
36
37extern void xfs_hex_dump(void *p, int length);
38
39#endif /* __XFS_MESSAGE_H */
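
Each declaration above carries __attribute__((format(printf, N, N+1))), which tells gcc/clang to type-check the variadic arguments against the format string at every call site. A stand-alone sketch (hypothetical demo_log()) of what the attribute buys:

#include <stdarg.h>
#include <stdio.h>

/* argument 2 is the format string, variadic arguments start at 3 */
void demo_log(int level, const char *fmt, ...)
	__attribute__ ((format (printf, 2, 3)));

void demo_log(int level, const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	fprintf(stderr, "<%d>", level);
	vfprintf(stderr, fmt, args);
	va_end(args);
}

int main(void)
{
	demo_log(4, "mounted %s with %u log buffers\n", "sda1", 8u);
	/* demo_log(4, "mounted %s\n", 42); -- -Wformat would warn here */
	return 0;
}
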
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index a4e07974955b..a1a881e68a9a 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -44,7 +44,6 @@
44#include "xfs_buf_item.h" 44#include "xfs_buf_item.h"
45#include "xfs_utils.h" 45#include "xfs_utils.h"
46#include "xfs_vnodeops.h" 46#include "xfs_vnodeops.h"
47#include "xfs_version.h"
48#include "xfs_log_priv.h" 47#include "xfs_log_priv.h"
49#include "xfs_trans_priv.h" 48#include "xfs_trans_priv.h"
50#include "xfs_filestream.h" 49#include "xfs_filestream.h"
@@ -111,8 +110,10 @@ mempool_t *xfs_ioend_pool;
111#define MNTOPT_GQUOTANOENF "gqnoenforce"/* group quota limit enforcement */ 110#define MNTOPT_GQUOTANOENF "gqnoenforce"/* group quota limit enforcement */
112#define MNTOPT_PQUOTANOENF "pqnoenforce"/* project quota limit enforcement */ 111#define MNTOPT_PQUOTANOENF "pqnoenforce"/* project quota limit enforcement */
113#define MNTOPT_QUOTANOENF "qnoenforce" /* same as uqnoenforce */ 112#define MNTOPT_QUOTANOENF "qnoenforce" /* same as uqnoenforce */
114#define MNTOPT_DELAYLOG "delaylog" /* Delayed loging enabled */ 113#define MNTOPT_DELAYLOG "delaylog" /* Delayed logging enabled */
115#define MNTOPT_NODELAYLOG "nodelaylog" /* Delayed loging disabled */ 114#define MNTOPT_NODELAYLOG "nodelaylog" /* Delayed logging disabled */
115#define MNTOPT_DISCARD "discard" /* Discard unused blocks */
116#define MNTOPT_NODISCARD "nodiscard" /* Do not discard unused blocks */
116 117
117/* 118/*
118 * Table driven mount option parser. 119 * Table driven mount option parser.
@@ -174,6 +175,15 @@ xfs_parseargs(
174 __uint8_t iosizelog = 0; 175 __uint8_t iosizelog = 0;
175 176
176 /* 177 /*
178 * set up the mount name first so all the errors will refer to the
179 * correct device.
180 */
181 mp->m_fsname = kstrndup(sb->s_id, MAXNAMELEN, GFP_KERNEL);
182 if (!mp->m_fsname)
183 return ENOMEM;
184 mp->m_fsname_len = strlen(mp->m_fsname) + 1;
185
186 /*
177 * Copy binary VFS mount flags we are interested in. 187 * Copy binary VFS mount flags we are interested in.
178 */ 188 */
179 if (sb->s_flags & MS_RDONLY) 189 if (sb->s_flags & MS_RDONLY)
@@ -190,6 +200,7 @@ xfs_parseargs(
190 mp->m_flags |= XFS_MOUNT_BARRIER; 200 mp->m_flags |= XFS_MOUNT_BARRIER;
191 mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE; 201 mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE;
192 mp->m_flags |= XFS_MOUNT_SMALL_INUMS; 202 mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
203 mp->m_flags |= XFS_MOUNT_DELAYLOG;
193 204
194 /* 205 /*
195 * These can be overridden by the mount option parsing. 206 * These can be overridden by the mount option parsing.
@@ -208,24 +219,21 @@ xfs_parseargs(
208 219
209 if (!strcmp(this_char, MNTOPT_LOGBUFS)) { 220 if (!strcmp(this_char, MNTOPT_LOGBUFS)) {
210 if (!value || !*value) { 221 if (!value || !*value) {
211 cmn_err(CE_WARN, 222 xfs_warn(mp, "%s option requires an argument",
212 "XFS: %s option requires an argument",
213 this_char); 223 this_char);
214 return EINVAL; 224 return EINVAL;
215 } 225 }
216 mp->m_logbufs = simple_strtoul(value, &eov, 10); 226 mp->m_logbufs = simple_strtoul(value, &eov, 10);
217 } else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) { 227 } else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) {
218 if (!value || !*value) { 228 if (!value || !*value) {
219 cmn_err(CE_WARN, 229 xfs_warn(mp, "%s option requires an argument",
220 "XFS: %s option requires an argument",
221 this_char); 230 this_char);
222 return EINVAL; 231 return EINVAL;
223 } 232 }
224 mp->m_logbsize = suffix_strtoul(value, &eov, 10); 233 mp->m_logbsize = suffix_strtoul(value, &eov, 10);
225 } else if (!strcmp(this_char, MNTOPT_LOGDEV)) { 234 } else if (!strcmp(this_char, MNTOPT_LOGDEV)) {
226 if (!value || !*value) { 235 if (!value || !*value) {
227 cmn_err(CE_WARN, 236 xfs_warn(mp, "%s option requires an argument",
228 "XFS: %s option requires an argument",
229 this_char); 237 this_char);
230 return EINVAL; 238 return EINVAL;
231 } 239 }
@@ -233,14 +241,12 @@ xfs_parseargs(
233 if (!mp->m_logname) 241 if (!mp->m_logname)
234 return ENOMEM; 242 return ENOMEM;
235 } else if (!strcmp(this_char, MNTOPT_MTPT)) { 243 } else if (!strcmp(this_char, MNTOPT_MTPT)) {
236 cmn_err(CE_WARN, 244 xfs_warn(mp, "%s option not allowed on this system",
237 "XFS: %s option not allowed on this system",
238 this_char); 245 this_char);
239 return EINVAL; 246 return EINVAL;
240 } else if (!strcmp(this_char, MNTOPT_RTDEV)) { 247 } else if (!strcmp(this_char, MNTOPT_RTDEV)) {
241 if (!value || !*value) { 248 if (!value || !*value) {
242 cmn_err(CE_WARN, 249 xfs_warn(mp, "%s option requires an argument",
243 "XFS: %s option requires an argument",
244 this_char); 250 this_char);
245 return EINVAL; 251 return EINVAL;
246 } 252 }
@@ -249,8 +255,7 @@ xfs_parseargs(
249 return ENOMEM; 255 return ENOMEM;
250 } else if (!strcmp(this_char, MNTOPT_BIOSIZE)) { 256 } else if (!strcmp(this_char, MNTOPT_BIOSIZE)) {
251 if (!value || !*value) { 257 if (!value || !*value) {
252 cmn_err(CE_WARN, 258 xfs_warn(mp, "%s option requires an argument",
253 "XFS: %s option requires an argument",
254 this_char); 259 this_char);
255 return EINVAL; 260 return EINVAL;
256 } 261 }
@@ -258,8 +263,7 @@ xfs_parseargs(
258 iosizelog = ffs(iosize) - 1; 263 iosizelog = ffs(iosize) - 1;
259 } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) { 264 } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) {
260 if (!value || !*value) { 265 if (!value || !*value) {
261 cmn_err(CE_WARN, 266 xfs_warn(mp, "%s option requires an argument",
262 "XFS: %s option requires an argument",
263 this_char); 267 this_char);
264 return EINVAL; 268 return EINVAL;
265 } 269 }
@@ -281,16 +285,14 @@ xfs_parseargs(
281 mp->m_flags |= XFS_MOUNT_SWALLOC; 285 mp->m_flags |= XFS_MOUNT_SWALLOC;
282 } else if (!strcmp(this_char, MNTOPT_SUNIT)) { 286 } else if (!strcmp(this_char, MNTOPT_SUNIT)) {
283 if (!value || !*value) { 287 if (!value || !*value) {
284 cmn_err(CE_WARN, 288 xfs_warn(mp, "%s option requires an argument",
285 "XFS: %s option requires an argument",
286 this_char); 289 this_char);
287 return EINVAL; 290 return EINVAL;
288 } 291 }
289 dsunit = simple_strtoul(value, &eov, 10); 292 dsunit = simple_strtoul(value, &eov, 10);
290 } else if (!strcmp(this_char, MNTOPT_SWIDTH)) { 293 } else if (!strcmp(this_char, MNTOPT_SWIDTH)) {
291 if (!value || !*value) { 294 if (!value || !*value) {
292 cmn_err(CE_WARN, 295 xfs_warn(mp, "%s option requires an argument",
293 "XFS: %s option requires an argument",
294 this_char); 296 this_char);
295 return EINVAL; 297 return EINVAL;
296 } 298 }
@@ -298,8 +300,7 @@ xfs_parseargs(
298 } else if (!strcmp(this_char, MNTOPT_64BITINODE)) { 300 } else if (!strcmp(this_char, MNTOPT_64BITINODE)) {
299 mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS; 301 mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS;
300#if !XFS_BIG_INUMS 302#if !XFS_BIG_INUMS
301 cmn_err(CE_WARN, 303 xfs_warn(mp, "%s option not allowed on this system",
302 "XFS: %s option not allowed on this system",
303 this_char); 304 this_char);
304 return EINVAL; 305 return EINVAL;
305#endif 306#endif
@@ -354,26 +355,26 @@ xfs_parseargs(
354 mp->m_qflags &= ~XFS_OQUOTA_ENFD; 355 mp->m_qflags &= ~XFS_OQUOTA_ENFD;
355 } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) { 356 } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) {
356 mp->m_flags |= XFS_MOUNT_DELAYLOG; 357 mp->m_flags |= XFS_MOUNT_DELAYLOG;
357 cmn_err(CE_WARN,
358 "Enabling EXPERIMENTAL delayed logging feature "
359 "- use at your own risk.\n");
360 } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) { 358 } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) {
361 mp->m_flags &= ~XFS_MOUNT_DELAYLOG; 359 mp->m_flags &= ~XFS_MOUNT_DELAYLOG;
360 } else if (!strcmp(this_char, MNTOPT_DISCARD)) {
361 mp->m_flags |= XFS_MOUNT_DISCARD;
362 } else if (!strcmp(this_char, MNTOPT_NODISCARD)) {
363 mp->m_flags &= ~XFS_MOUNT_DISCARD;
362 } else if (!strcmp(this_char, "ihashsize")) { 364 } else if (!strcmp(this_char, "ihashsize")) {
363 cmn_err(CE_WARN, 365 xfs_warn(mp,
364 "XFS: ihashsize no longer used, option is deprecated."); 366 "ihashsize no longer used, option is deprecated.");
365 } else if (!strcmp(this_char, "osyncisdsync")) { 367 } else if (!strcmp(this_char, "osyncisdsync")) {
366 cmn_err(CE_WARN, 368 xfs_warn(mp,
367 "XFS: osyncisdsync has no effect, option is deprecated."); 369 "osyncisdsync has no effect, option is deprecated.");
368 } else if (!strcmp(this_char, "osyncisosync")) { 370 } else if (!strcmp(this_char, "osyncisosync")) {
369 cmn_err(CE_WARN, 371 xfs_warn(mp,
370 "XFS: osyncisosync has no effect, option is deprecated."); 372 "osyncisosync has no effect, option is deprecated.");
371 } else if (!strcmp(this_char, "irixsgid")) { 373 } else if (!strcmp(this_char, "irixsgid")) {
372 cmn_err(CE_WARN, 374 xfs_warn(mp,
373 "XFS: irixsgid is now a sysctl(2) variable, option is deprecated."); 375 "irixsgid is now a sysctl(2) variable, option is deprecated.");
374 } else { 376 } else {
375 cmn_err(CE_WARN, 377 xfs_warn(mp, "unknown mount option [%s].", this_char);
376 "XFS: unknown mount option [%s].", this_char);
377 return EINVAL; 378 return EINVAL;
378 } 379 }
379 } 380 }
@@ -383,40 +384,44 @@ xfs_parseargs(
383 */ 384 */
384 if ((mp->m_flags & XFS_MOUNT_NORECOVERY) && 385 if ((mp->m_flags & XFS_MOUNT_NORECOVERY) &&
385 !(mp->m_flags & XFS_MOUNT_RDONLY)) { 386 !(mp->m_flags & XFS_MOUNT_RDONLY)) {
386 cmn_err(CE_WARN, "XFS: no-recovery mounts must be read-only."); 387 xfs_warn(mp, "no-recovery mounts must be read-only.");
387 return EINVAL; 388 return EINVAL;
388 } 389 }
389 390
390 if ((mp->m_flags & XFS_MOUNT_NOALIGN) && (dsunit || dswidth)) { 391 if ((mp->m_flags & XFS_MOUNT_NOALIGN) && (dsunit || dswidth)) {
391 cmn_err(CE_WARN, 392 xfs_warn(mp,
392 "XFS: sunit and swidth options incompatible with the noalign option"); 393 "sunit and swidth options incompatible with the noalign option");
394 return EINVAL;
395 }
396
397 if ((mp->m_flags & XFS_MOUNT_DISCARD) &&
398 !(mp->m_flags & XFS_MOUNT_DELAYLOG)) {
399 xfs_warn(mp,
400 "the discard option is incompatible with the nodelaylog option");
393 return EINVAL; 401 return EINVAL;
394 } 402 }
395 403
396#ifndef CONFIG_XFS_QUOTA 404#ifndef CONFIG_XFS_QUOTA
397 if (XFS_IS_QUOTA_RUNNING(mp)) { 405 if (XFS_IS_QUOTA_RUNNING(mp)) {
398 cmn_err(CE_WARN, 406 xfs_warn(mp, "quota support not available in this kernel.");
399 "XFS: quota support not available in this kernel.");
400 return EINVAL; 407 return EINVAL;
401 } 408 }
402#endif 409#endif
403 410
404 if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) && 411 if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) &&
405 (mp->m_qflags & (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE))) { 412 (mp->m_qflags & (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE))) {
406 cmn_err(CE_WARN, 413 xfs_warn(mp, "cannot mount with both project and group quota");
407 "XFS: cannot mount with both project and group quota");
408 return EINVAL; 414 return EINVAL;
409 } 415 }
410 416
411 if ((dsunit && !dswidth) || (!dsunit && dswidth)) { 417 if ((dsunit && !dswidth) || (!dsunit && dswidth)) {
412 cmn_err(CE_WARN, 418 xfs_warn(mp, "sunit and swidth must be specified together");
413 "XFS: sunit and swidth must be specified together");
414 return EINVAL; 419 return EINVAL;
415 } 420 }
416 421
417 if (dsunit && (dswidth % dsunit != 0)) { 422 if (dsunit && (dswidth % dsunit != 0)) {
418 cmn_err(CE_WARN, 423 xfs_warn(mp,
419 "XFS: stripe width (%d) must be a multiple of the stripe unit (%d)", 424 "stripe width (%d) must be a multiple of the stripe unit (%d)",
420 dswidth, dsunit); 425 dswidth, dsunit);
421 return EINVAL; 426 return EINVAL;
422 } 427 }
@@ -442,8 +447,7 @@ done:
442 mp->m_logbufs != 0 && 447 mp->m_logbufs != 0 &&
443 (mp->m_logbufs < XLOG_MIN_ICLOGS || 448 (mp->m_logbufs < XLOG_MIN_ICLOGS ||
444 mp->m_logbufs > XLOG_MAX_ICLOGS)) { 449 mp->m_logbufs > XLOG_MAX_ICLOGS)) {
445 cmn_err(CE_WARN, 450 xfs_warn(mp, "invalid logbufs value: %d [not %d-%d]",
446 "XFS: invalid logbufs value: %d [not %d-%d]",
447 mp->m_logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS); 451 mp->m_logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS);
448 return XFS_ERROR(EINVAL); 452 return XFS_ERROR(EINVAL);
449 } 453 }
@@ -452,22 +456,16 @@ done:
452 (mp->m_logbsize < XLOG_MIN_RECORD_BSIZE || 456 (mp->m_logbsize < XLOG_MIN_RECORD_BSIZE ||
453 mp->m_logbsize > XLOG_MAX_RECORD_BSIZE || 457 mp->m_logbsize > XLOG_MAX_RECORD_BSIZE ||
454 !is_power_of_2(mp->m_logbsize))) { 458 !is_power_of_2(mp->m_logbsize))) {
455 cmn_err(CE_WARN, 459 xfs_warn(mp,
456 "XFS: invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]", 460 "invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]",
457 mp->m_logbsize); 461 mp->m_logbsize);
458 return XFS_ERROR(EINVAL); 462 return XFS_ERROR(EINVAL);
459 } 463 }
460 464
461 mp->m_fsname = kstrndup(sb->s_id, MAXNAMELEN, GFP_KERNEL);
462 if (!mp->m_fsname)
463 return ENOMEM;
464 mp->m_fsname_len = strlen(mp->m_fsname) + 1;
465
466 if (iosizelog) { 465 if (iosizelog) {
467 if (iosizelog > XFS_MAX_IO_LOG || 466 if (iosizelog > XFS_MAX_IO_LOG ||
468 iosizelog < XFS_MIN_IO_LOG) { 467 iosizelog < XFS_MIN_IO_LOG) {
469 cmn_err(CE_WARN, 468 xfs_warn(mp, "invalid log iosize: %d [not %d-%d]",
470 "XFS: invalid log iosize: %d [not %d-%d]",
471 iosizelog, XFS_MIN_IO_LOG, 469 iosizelog, XFS_MIN_IO_LOG,
472 XFS_MAX_IO_LOG); 470 XFS_MAX_IO_LOG);
473 return XFS_ERROR(EINVAL); 471 return XFS_ERROR(EINVAL);
@@ -503,6 +501,7 @@ xfs_showargs(
503 { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM }, 501 { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM },
504 { XFS_MOUNT_GRPID, "," MNTOPT_GRPID }, 502 { XFS_MOUNT_GRPID, "," MNTOPT_GRPID },
505 { XFS_MOUNT_DELAYLOG, "," MNTOPT_DELAYLOG }, 503 { XFS_MOUNT_DELAYLOG, "," MNTOPT_DELAYLOG },
504 { XFS_MOUNT_DISCARD, "," MNTOPT_DISCARD },
506 { 0, NULL } 505 { 0, NULL }
507 }; 506 };
508 static struct proc_xfs_info xfs_info_unset[] = { 507 static struct proc_xfs_info xfs_info_unset[] = {
@@ -577,7 +576,7 @@ xfs_max_file_offset(
577 576
578 /* Figure out maximum filesize, on Linux this can depend on 577 /* Figure out maximum filesize, on Linux this can depend on
579 * the filesystem blocksize (on 32 bit platforms). 578 * the filesystem blocksize (on 32 bit platforms).
580 * __block_prepare_write does this in an [unsigned] long... 579 * __block_write_begin does this in an [unsigned] long...
581 * page->index << (PAGE_CACHE_SHIFT - bbits) 580 * page->index << (PAGE_CACHE_SHIFT - bbits)
582 * So, for page sized blocks (4K on 32 bit platforms), 581 * So, for page sized blocks (4K on 32 bit platforms),
583 * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is 582 * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is
@@ -610,10 +609,11 @@ xfs_blkdev_get(
610{ 609{
611 int error = 0; 610 int error = 0;
612 611
613 *bdevp = open_bdev_exclusive(name, FMODE_READ|FMODE_WRITE, mp); 612 *bdevp = blkdev_get_by_path(name, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
613 mp);
614 if (IS_ERR(*bdevp)) { 614 if (IS_ERR(*bdevp)) {
615 error = PTR_ERR(*bdevp); 615 error = PTR_ERR(*bdevp);
616 printk("XFS: Invalid device [%s], error=%d\n", name, error); 616 xfs_warn(mp, "Invalid device [%s], error=%d\n", name, error);
617 } 617 }
618 618
619 return -error; 619 return -error;
@@ -624,77 +624,14 @@ xfs_blkdev_put(
624 struct block_device *bdev) 624 struct block_device *bdev)
625{ 625{
626 if (bdev) 626 if (bdev)
627 close_bdev_exclusive(bdev, FMODE_READ|FMODE_WRITE); 627 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
628}
629
630/*
631 * Try to write out the superblock using barriers.
632 */
633STATIC int
634xfs_barrier_test(
635 xfs_mount_t *mp)
636{
637 xfs_buf_t *sbp = xfs_getsb(mp, 0);
638 int error;
639
640 XFS_BUF_UNDONE(sbp);
641 XFS_BUF_UNREAD(sbp);
642 XFS_BUF_UNDELAYWRITE(sbp);
643 XFS_BUF_WRITE(sbp);
644 XFS_BUF_UNASYNC(sbp);
645 XFS_BUF_ORDERED(sbp);
646
647 xfsbdstrat(mp, sbp);
648 error = xfs_iowait(sbp);
649
650 /*
651 * Clear all the flags we set and possible error state in the
652 * buffer. We only did the write to try out whether barriers
653 * worked and shouldn't leave any traces in the superblock
654 * buffer.
655 */
656 XFS_BUF_DONE(sbp);
657 XFS_BUF_ERROR(sbp, 0);
658 XFS_BUF_UNORDERED(sbp);
659
660 xfs_buf_relse(sbp);
661 return error;
662}
663
664STATIC void
665xfs_mountfs_check_barriers(xfs_mount_t *mp)
666{
667 int error;
668
669 if (mp->m_logdev_targp != mp->m_ddev_targp) {
670 xfs_fs_cmn_err(CE_NOTE, mp,
671 "Disabling barriers, not supported with external log device");
672 mp->m_flags &= ~XFS_MOUNT_BARRIER;
673 return;
674 }
675
676 if (xfs_readonly_buftarg(mp->m_ddev_targp)) {
677 xfs_fs_cmn_err(CE_NOTE, mp,
678 "Disabling barriers, underlying device is readonly");
679 mp->m_flags &= ~XFS_MOUNT_BARRIER;
680 return;
681 }
682
683 error = xfs_barrier_test(mp);
684 if (error) {
685 xfs_fs_cmn_err(CE_NOTE, mp,
686 "Disabling barriers, trial barrier write failed");
687 mp->m_flags &= ~XFS_MOUNT_BARRIER;
688 return;
689 }
690} 628}
691 629
692void 630void
693xfs_blkdev_issue_flush( 631xfs_blkdev_issue_flush(
694 xfs_buftarg_t *buftarg) 632 xfs_buftarg_t *buftarg)
695{ 633{
696 blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL, 634 blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL);
697 BLKDEV_IFL_WAIT);
698} 635}
699 636
700STATIC void 637STATIC void
@@ -747,8 +684,8 @@ xfs_open_devices(
747 goto out_close_logdev; 684 goto out_close_logdev;
748 685
749 if (rtdev == ddev || rtdev == logdev) { 686 if (rtdev == ddev || rtdev == logdev) {
750 cmn_err(CE_WARN, 687 xfs_warn(mp,
751 "XFS: Cannot mount filesystem with identical rtdev and ddev/logdev."); 688 "Cannot mount filesystem with identical rtdev and ddev/logdev.");
752 error = EINVAL; 689 error = EINVAL;
753 goto out_close_rtdev; 690 goto out_close_rtdev;
754 } 691 }
@@ -758,18 +695,20 @@ xfs_open_devices(
758 * Setup xfs_mount buffer target pointers 695 * Setup xfs_mount buffer target pointers
759 */ 696 */
760 error = ENOMEM; 697 error = ENOMEM;
761 mp->m_ddev_targp = xfs_alloc_buftarg(ddev, 0, mp->m_fsname); 698 mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev, 0, mp->m_fsname);
762 if (!mp->m_ddev_targp) 699 if (!mp->m_ddev_targp)
763 goto out_close_rtdev; 700 goto out_close_rtdev;
764 701
765 if (rtdev) { 702 if (rtdev) {
766 mp->m_rtdev_targp = xfs_alloc_buftarg(rtdev, 1, mp->m_fsname); 703 mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev, 1,
704 mp->m_fsname);
767 if (!mp->m_rtdev_targp) 705 if (!mp->m_rtdev_targp)
768 goto out_free_ddev_targ; 706 goto out_free_ddev_targ;
769 } 707 }
770 708
771 if (logdev && logdev != ddev) { 709 if (logdev && logdev != ddev) {
772 mp->m_logdev_targp = xfs_alloc_buftarg(logdev, 1, mp->m_fsname); 710 mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev, 1,
711 mp->m_fsname);
773 if (!mp->m_logdev_targp) 712 if (!mp->m_logdev_targp)
774 goto out_free_rtdev_targ; 713 goto out_free_rtdev_targ;
775 } else { 714 } else {
@@ -829,63 +768,6 @@ xfs_setup_devices(
829 return 0; 768 return 0;
830} 769}
831 770
832/*
833 * XFS AIL push thread support
834 */
835void
836xfsaild_wakeup(
837 struct xfs_ail *ailp,
838 xfs_lsn_t threshold_lsn)
839{
840 ailp->xa_target = threshold_lsn;
841 wake_up_process(ailp->xa_task);
842}
843
844STATIC int
845xfsaild(
846 void *data)
847{
848 struct xfs_ail *ailp = data;
849 xfs_lsn_t last_pushed_lsn = 0;
850 long tout = 0; /* milliseconds */
851
852 while (!kthread_should_stop()) {
853 schedule_timeout_interruptible(tout ?
854 msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT);
855
856 /* swsusp */
857 try_to_freeze();
858
859 ASSERT(ailp->xa_mount->m_log);
860 if (XFS_FORCED_SHUTDOWN(ailp->xa_mount))
861 continue;
862
863 tout = xfsaild_push(ailp, &last_pushed_lsn);
864 }
865
866 return 0;
867} /* xfsaild */
868
869int
870xfsaild_start(
871 struct xfs_ail *ailp)
872{
873 ailp->xa_target = 0;
874 ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild/%s",
875 ailp->xa_mount->m_fsname);
876 if (IS_ERR(ailp->xa_task))
877 return -PTR_ERR(ailp->xa_task);
878 return 0;
879}
880
881void
882xfsaild_stop(
883 struct xfs_ail *ailp)
884{
885 kthread_stop(ailp->xa_task);
886}
887
888
889/* Catch misguided souls that try to use this interface on XFS */ 771/* Catch misguided souls that try to use this interface on XFS */
890STATIC struct inode * 772STATIC struct inode *
891xfs_fs_alloc_inode( 773xfs_fs_alloc_inode(
@@ -938,7 +820,7 @@ out_reclaim:
938 * Slab object creation initialisation for the XFS inode. 820 * Slab object creation initialisation for the XFS inode.
939 * This covers only the idempotent fields in the XFS inode; 821 * This covers only the idempotent fields in the XFS inode;
940 * all other fields need to be initialised on allocation 822 * all other fields need to be initialised on allocation
941 * from the slab. This avoids the need to repeatedly intialise 823 * from the slab. This avoids the need to repeatedly initialise
942 * fields in the xfs inode that left in the initialise state 824 * fields in the xfs inode that left in the initialise state
943 * when freeing the inode. 825 * when freeing the inode.
944 */ 826 */
@@ -972,12 +854,7 @@ xfs_fs_inode_init_once(
972 854
973/* 855/*
974 * Dirty the XFS inode when mark_inode_dirty_sync() is called so that 856 * Dirty the XFS inode when mark_inode_dirty_sync() is called so that
975 * we catch unlogged VFS level updates to the inode. Care must be taken 857 * we catch unlogged VFS level updates to the inode.
976 * here - the transaction code calls mark_inode_dirty_sync() to mark the
977 * VFS inode dirty in a transaction and clears the i_update_core field;
978 * it must clear the field after calling mark_inode_dirty_sync() to
979 * correctly indicate that the dirty state has been propagated into the
980 * inode log item.
981 * 858 *
982 * We need the barrier() to maintain correct ordering between unlogged 859 * We need the barrier() to maintain correct ordering between unlogged
983 * updates and the transaction commit code that clears the i_update_core 860 * updates and the transaction commit code that clears the i_update_core
@@ -986,7 +863,8 @@ xfs_fs_inode_init_once(
986 */ 863 */
987STATIC void 864STATIC void
988xfs_fs_dirty_inode( 865xfs_fs_dirty_inode(
989 struct inode *inode) 866 struct inode *inode,
867 int flags)
990{ 868{
991 barrier(); 869 barrier();
992 XFS_I(inode)->i_update_core = 1; 870 XFS_I(inode)->i_update_core = 1;
@@ -1084,7 +962,7 @@ xfs_fs_write_inode(
1084 error = 0; 962 error = 0;
1085 goto out_unlock; 963 goto out_unlock;
1086 } 964 }
1087 error = xfs_iflush(ip, 0); 965 error = xfs_iflush(ip, SYNC_TRYLOCK);
1088 } 966 }
1089 967
1090 out_unlock: 968 out_unlock:
@@ -1126,6 +1004,8 @@ xfs_fs_evict_inode(
1126 */ 1004 */
1127 ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); 1005 ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
1128 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); 1006 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
1007 lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
1008 &xfs_iolock_reclaimable, "xfs_iolock_reclaimable");
1129 1009
1130 xfs_inactive(ip); 1010 xfs_inactive(ip);
1131} 1011}
@@ -1195,22 +1075,12 @@ xfs_fs_sync_fs(
1195 return -error; 1075 return -error;
1196 1076
1197 if (laptop_mode) { 1077 if (laptop_mode) {
1198 int prev_sync_seq = mp->m_sync_seq;
1199
1200 /* 1078 /*
1201 * The disk must be active because we're syncing. 1079 * The disk must be active because we're syncing.
1202 * We schedule xfssyncd now (now that the disk is 1080 * We schedule xfssyncd now (now that the disk is
1203 * active) instead of later (when it might not be). 1081 * active) instead of later (when it might not be).
1204 */ 1082 */
1205 wake_up_process(mp->m_sync_task); 1083 flush_delayed_work_sync(&mp->m_sync_work);
1206 /*
1207 * We have to wait for the sync iteration to complete.
1208 * If we don't, the disk activity caused by the sync
1209 * will come after the sync is completed, and that
1210 * triggers another sync from laptop mode.
1211 */
1212 wait_event(mp->m_wait_single_sync_task,
1213 mp->m_sync_seq != prev_sync_seq);
1214 } 1084 }
1215 1085
1216 return 0; 1086 return 0;
@@ -1308,14 +1178,6 @@ xfs_fs_remount(
1308 switch (token) { 1178 switch (token) {
1309 case Opt_barrier: 1179 case Opt_barrier:
1310 mp->m_flags |= XFS_MOUNT_BARRIER; 1180 mp->m_flags |= XFS_MOUNT_BARRIER;
1311
1312 /*
1313 * Test if barriers are actually working if we can,
1314 * else delay this check until the filesystem is
1315 * marked writeable.
1316 */
1317 if (!(mp->m_flags & XFS_MOUNT_RDONLY))
1318 xfs_mountfs_check_barriers(mp);
1319 break; 1181 break;
1320 case Opt_nobarrier: 1182 case Opt_nobarrier:
1321 mp->m_flags &= ~XFS_MOUNT_BARRIER; 1183 mp->m_flags &= ~XFS_MOUNT_BARRIER;
@@ -1338,8 +1200,8 @@ xfs_fs_remount(
1338 * options that we can't actually change. 1200 * options that we can't actually change.
1339 */ 1201 */
1340#if 0 1202#if 0
1341 printk(KERN_INFO 1203 xfs_info(mp,
1342 "XFS: mount option \"%s\" not supported for remount\n", p); 1204 "mount option \"%s\" not supported for remount\n", p);
1343 return -EINVAL; 1205 return -EINVAL;
1344#else 1206#else
1345 break; 1207 break;
@@ -1350,8 +1212,6 @@ xfs_fs_remount(
1350 /* ro -> rw */ 1212 /* ro -> rw */
1351 if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & MS_RDONLY)) { 1213 if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & MS_RDONLY)) {
1352 mp->m_flags &= ~XFS_MOUNT_RDONLY; 1214 mp->m_flags &= ~XFS_MOUNT_RDONLY;
1353 if (mp->m_flags & XFS_MOUNT_BARRIER)
1354 xfs_mountfs_check_barriers(mp);
1355 1215
1356 /* 1216 /*
1357 * If this is the first remount to writeable state we 1217 * If this is the first remount to writeable state we
@@ -1360,8 +1220,7 @@ xfs_fs_remount(
1360 if (mp->m_update_flags) { 1220 if (mp->m_update_flags) {
1361 error = xfs_mount_log_sb(mp, mp->m_update_flags); 1221 error = xfs_mount_log_sb(mp, mp->m_update_flags);
1362 if (error) { 1222 if (error) {
1363 cmn_err(CE_WARN, 1223 xfs_warn(mp, "failed to write sb changes");
1364 "XFS: failed to write sb changes");
1365 return error; 1224 return error;
1366 } 1225 }
1367 mp->m_update_flags = 0; 1226 mp->m_update_flags = 0;
@@ -1407,7 +1266,7 @@ xfs_fs_freeze(
1407 1266
1408 xfs_save_resvblks(mp); 1267 xfs_save_resvblks(mp);
1409 xfs_quiesce_attr(mp); 1268 xfs_quiesce_attr(mp);
1410 return -xfs_fs_log_dummy(mp, SYNC_WAIT); 1269 return -xfs_fs_log_dummy(mp);
1411} 1270}
1412 1271
1413STATIC int 1272STATIC int
@@ -1445,15 +1304,15 @@ xfs_finish_flags(
1445 mp->m_logbsize = mp->m_sb.sb_logsunit; 1304 mp->m_logbsize = mp->m_sb.sb_logsunit;
1446 } else if (mp->m_logbsize > 0 && 1305 } else if (mp->m_logbsize > 0 &&
1447 mp->m_logbsize < mp->m_sb.sb_logsunit) { 1306 mp->m_logbsize < mp->m_sb.sb_logsunit) {
1448 cmn_err(CE_WARN, 1307 xfs_warn(mp,
1449 "XFS: logbuf size must be greater than or equal to log stripe size"); 1308 "logbuf size must be greater than or equal to log stripe size");
1450 return XFS_ERROR(EINVAL); 1309 return XFS_ERROR(EINVAL);
1451 } 1310 }
1452 } else { 1311 } else {
1453 /* Fail a mount if the logbuf is larger than 32K */ 1312 /* Fail a mount if the logbuf is larger than 32K */
1454 if (mp->m_logbsize > XLOG_BIG_RECORD_BSIZE) { 1313 if (mp->m_logbsize > XLOG_BIG_RECORD_BSIZE) {
1455 cmn_err(CE_WARN, 1314 xfs_warn(mp,
1456 "XFS: logbuf size for version 1 logs must be 16K or 32K"); 1315 "logbuf size for version 1 logs must be 16K or 32K");
1457 return XFS_ERROR(EINVAL); 1316 return XFS_ERROR(EINVAL);
1458 } 1317 }
1459 } 1318 }
@@ -1470,8 +1329,8 @@ xfs_finish_flags(
1470 * prohibit r/w mounts of read-only filesystems 1329 * prohibit r/w mounts of read-only filesystems
1471 */ 1330 */
1472 if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) && !ronly) { 1331 if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) && !ronly) {
1473 cmn_err(CE_WARN, 1332 xfs_warn(mp,
1474 "XFS: cannot mount a read-only filesystem as read-write"); 1333 "cannot mount a read-only filesystem as read-write");
1475 return XFS_ERROR(EROFS); 1334 return XFS_ERROR(EROFS);
1476 } 1335 }
1477 1336
@@ -1495,9 +1354,6 @@ xfs_fs_fill_super(
1495 spin_lock_init(&mp->m_sb_lock); 1354 spin_lock_init(&mp->m_sb_lock);
1496 mutex_init(&mp->m_growlock); 1355 mutex_init(&mp->m_growlock);
1497 atomic_set(&mp->m_active_trans, 0); 1356 atomic_set(&mp->m_active_trans, 0);
1498 INIT_LIST_HEAD(&mp->m_sync_list);
1499 spin_lock_init(&mp->m_sync_lock);
1500 init_waitqueue_head(&mp->m_wait_single_sync_task);
1501 1357
1502 mp->m_super = sb; 1358 mp->m_super = sb;
1503 sb->s_fs_info = mp; 1359 sb->s_fs_info = mp;
@@ -1521,8 +1377,9 @@ xfs_fs_fill_super(
1521 if (error) 1377 if (error)
1522 goto out_free_fsname; 1378 goto out_free_fsname;
1523 1379
1524 if (xfs_icsb_init_counters(mp)) 1380 error = xfs_icsb_init_counters(mp);
1525 mp->m_flags |= XFS_MOUNT_NO_PERCPU_SB; 1381 if (error)
1382 goto out_close_devices;
1526 1383
1527 error = xfs_readsb(mp, flags); 1384 error = xfs_readsb(mp, flags);
1528 if (error) 1385 if (error)
@@ -1536,17 +1393,18 @@ xfs_fs_fill_super(
1536 if (error) 1393 if (error)
1537 goto out_free_sb; 1394 goto out_free_sb;
1538 1395
1539 if (mp->m_flags & XFS_MOUNT_BARRIER)
1540 xfs_mountfs_check_barriers(mp);
1541
1542 error = xfs_filestream_mount(mp); 1396 error = xfs_filestream_mount(mp);
1543 if (error) 1397 if (error)
1544 goto out_free_sb; 1398 goto out_free_sb;
1545 1399
1546 error = xfs_mountfs(mp); 1400 /*
1547 if (error) 1401 * we must configure the block size in the superblock before we run the
1548 goto out_filestream_unmount; 1402 * full mount process as the mount process can lookup and cache inodes.
1549 1403 * For the same reason we must also initialise the syncd and register
1404 * the inode cache shrinker so that inodes can be reclaimed during
1405 * operations like a quotacheck that iterate all inodes in the
1406 * filesystem.
1407 */
1550 sb->s_magic = XFS_SB_MAGIC; 1408 sb->s_magic = XFS_SB_MAGIC;
1551 sb->s_blocksize = mp->m_sb.sb_blocksize; 1409 sb->s_blocksize = mp->m_sb.sb_blocksize;
1552 sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1; 1410 sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1;
@@ -1554,6 +1412,16 @@ xfs_fs_fill_super(
1554 sb->s_time_gran = 1; 1412 sb->s_time_gran = 1;
1555 set_posix_acl_flag(sb); 1413 set_posix_acl_flag(sb);
1556 1414
1415 error = xfs_syncd_init(mp);
1416 if (error)
1417 goto out_filestream_unmount;
1418
1419 xfs_inode_shrinker_register(mp);
1420
1421 error = xfs_mountfs(mp);
1422 if (error)
1423 goto out_syncd_stop;
1424
1557 root = igrab(VFS_I(mp->m_rootip)); 1425 root = igrab(VFS_I(mp->m_rootip));
1558 if (!root) { 1426 if (!root) {
1559 error = ENOENT; 1427 error = ENOENT;
@@ -1569,20 +1437,18 @@ xfs_fs_fill_super(
1569 goto fail_vnrele; 1437 goto fail_vnrele;
1570 } 1438 }
1571 1439
1572 error = xfs_syncd_init(mp);
1573 if (error)
1574 goto fail_vnrele;
1575
1576 xfs_inode_shrinker_register(mp);
1577
1578 return 0; 1440 return 0;
1579 1441
1442 out_syncd_stop:
1443 xfs_inode_shrinker_unregister(mp);
1444 xfs_syncd_stop(mp);
1580 out_filestream_unmount: 1445 out_filestream_unmount:
1581 xfs_filestream_unmount(mp); 1446 xfs_filestream_unmount(mp);
1582 out_free_sb: 1447 out_free_sb:
1583 xfs_freesb(mp); 1448 xfs_freesb(mp);
1584 out_destroy_counters: 1449 out_destroy_counters:
1585 xfs_icsb_destroy_counters(mp); 1450 xfs_icsb_destroy_counters(mp);
1451 out_close_devices:
1586 xfs_close_devices(mp); 1452 xfs_close_devices(mp);
1587 out_free_fsname: 1453 out_free_fsname:
1588 xfs_free_fsname(mp); 1454 xfs_free_fsname(mp);
@@ -1599,6 +1465,9 @@ xfs_fs_fill_super(
1599 } 1465 }
1600 1466
1601 fail_unmount: 1467 fail_unmount:
1468 xfs_inode_shrinker_unregister(mp);
1469 xfs_syncd_stop(mp);
1470
1602 /* 1471 /*
1603 * Blow away any referenced inode in the filestreams cache. 1472 * Blow away any referenced inode in the filestreams cache.
1604 * This can and will cause log traffic as inodes go inactive 1473 * This can and will cause log traffic as inodes go inactive
@@ -1612,16 +1481,14 @@ xfs_fs_fill_super(
1612 goto out_free_sb; 1481 goto out_free_sb;
1613} 1482}
1614 1483
1615STATIC int 1484STATIC struct dentry *
1616xfs_fs_get_sb( 1485xfs_fs_mount(
1617 struct file_system_type *fs_type, 1486 struct file_system_type *fs_type,
1618 int flags, 1487 int flags,
1619 const char *dev_name, 1488 const char *dev_name,
1620 void *data, 1489 void *data)
1621 struct vfsmount *mnt)
1622{ 1490{
1623 return get_sb_bdev(fs_type, flags, dev_name, data, xfs_fs_fill_super, 1491 return mount_bdev(fs_type, flags, dev_name, data, xfs_fs_fill_super);
1624 mnt);
1625} 1492}
1626 1493
1627static const struct super_operations xfs_super_operations = { 1494static const struct super_operations xfs_super_operations = {
@@ -1642,7 +1509,7 @@ static const struct super_operations xfs_super_operations = {
1642static struct file_system_type xfs_fs_type = { 1509static struct file_system_type xfs_fs_type = {
1643 .owner = THIS_MODULE, 1510 .owner = THIS_MODULE,
1644 .name = "xfs", 1511 .name = "xfs",
1645 .get_sb = xfs_fs_get_sb, 1512 .mount = xfs_fs_mount,
1646 .kill_sb = kill_block_super, 1513 .kill_sb = kill_block_super,
1647 .fs_flags = FS_REQUIRES_DEV, 1514 .fs_flags = FS_REQUIRES_DEV,
1648}; 1515};
@@ -1790,6 +1657,38 @@ xfs_destroy_zones(void)
1790} 1657}
1791 1658
1792STATIC int __init 1659STATIC int __init
1660xfs_init_workqueues(void)
1661{
1662 /*
1663 * max_active is set to 8 to give enough concurrency to allow
1664 * multiple work operations on each CPU to run. This allows multiple
1665 * filesystems to be running sync work concurrently, and scales with
1666 * the number of CPUs in the system.
1667 */
1668 xfs_syncd_wq = alloc_workqueue("xfssyncd", WQ_CPU_INTENSIVE, 8);
1669 if (!xfs_syncd_wq)
1670 goto out;
1671
1672 xfs_ail_wq = alloc_workqueue("xfsail", WQ_CPU_INTENSIVE, 8);
1673 if (!xfs_ail_wq)
1674 goto out_destroy_syncd;
1675
1676 return 0;
1677
1678out_destroy_syncd:
1679 destroy_workqueue(xfs_syncd_wq);
1680out:
1681 return -ENOMEM;
1682}
1683
1684STATIC void
1685xfs_destroy_workqueues(void)
1686{
1687 destroy_workqueue(xfs_ail_wq);
1688 destroy_workqueue(xfs_syncd_wq);
1689}
1690
1691STATIC int __init
1793init_xfs_fs(void) 1692init_xfs_fs(void)
1794{ 1693{
1795 int error; 1694 int error;
@@ -1804,10 +1703,14 @@ init_xfs_fs(void)
1804 if (error) 1703 if (error)
1805 goto out; 1704 goto out;
1806 1705
1807 error = xfs_mru_cache_init(); 1706 error = xfs_init_workqueues();
1808 if (error) 1707 if (error)
1809 goto out_destroy_zones; 1708 goto out_destroy_zones;
1810 1709
1710 error = xfs_mru_cache_init();
1711 if (error)
1712 goto out_destroy_wq;
1713
1811 error = xfs_filestream_init(); 1714 error = xfs_filestream_init();
1812 if (error) 1715 if (error)
1813 goto out_mru_cache_uninit; 1716 goto out_mru_cache_uninit;
@@ -1841,6 +1744,8 @@ init_xfs_fs(void)
1841 xfs_filestream_uninit(); 1744 xfs_filestream_uninit();
1842 out_mru_cache_uninit: 1745 out_mru_cache_uninit:
1843 xfs_mru_cache_uninit(); 1746 xfs_mru_cache_uninit();
1747 out_destroy_wq:
1748 xfs_destroy_workqueues();
1844 out_destroy_zones: 1749 out_destroy_zones:
1845 xfs_destroy_zones(); 1750 xfs_destroy_zones();
1846 out: 1751 out:
@@ -1857,6 +1762,7 @@ exit_xfs_fs(void)
1857 xfs_buf_terminate(); 1762 xfs_buf_terminate();
1858 xfs_filestream_uninit(); 1763 xfs_filestream_uninit();
1859 xfs_mru_cache_uninit(); 1764 xfs_mru_cache_uninit();
1765 xfs_destroy_workqueues();
1860 xfs_destroy_zones(); 1766 xfs_destroy_zones();
1861} 1767}
1862 1768
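
xfs_init_workqueues() above follows the usual kernel shape of allocating resources in order and unwinding them in reverse through goto labels on failure; the reworked xfs_fs_fill_super() error paths (out_syncd_stop, out_filestream_unmount, out_close_devices, ...) use the same idea for a longer chain of setup steps. A minimal user-space sketch of that unwinding pattern (hypothetical resource type; a kernel version would return -ENOMEM rather than -1):

#include <stdlib.h>

struct resource { int dummy; };

static struct resource *resource_create(void) { return calloc(1, sizeof(struct resource)); }
static void resource_destroy(struct resource *r) { free(r); }

static int demo_init(struct resource **a_out, struct resource **b_out)
{
	struct resource *a, *b;

	a = resource_create();
	if (!a)
		goto out;		/* nothing allocated yet */

	b = resource_create();
	if (!b)
		goto out_destroy_a;	/* undo only the first step */

	*a_out = a;
	*b_out = b;
	return 0;

out_destroy_a:
	resource_destroy(a);
out:
	return -1;
}

int main(void)
{
	struct resource *a, *b;
	return demo_init(&a, &b);
}
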
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h
index 1ef4a4d2d997..50a3266c999e 100644
--- a/fs/xfs/linux-2.6/xfs_super.h
+++ b/fs/xfs/linux-2.6/xfs_super.h
@@ -62,6 +62,7 @@ extern void xfs_qm_exit(void);
62# define XFS_DBG_STRING "no debug" 62# define XFS_DBG_STRING "no debug"
63#endif 63#endif
64 64
65#define XFS_VERSION_STRING "SGI XFS"
65#define XFS_BUILD_OPTIONS XFS_ACL_STRING \ 66#define XFS_BUILD_OPTIONS XFS_ACL_STRING \
66 XFS_SECURITY_STRING \ 67 XFS_SECURITY_STRING \
67 XFS_REALTIME_STRING \ 68 XFS_REALTIME_STRING \
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 81976ffed7d6..8ecad5ff9f9b 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -22,6 +22,7 @@
22#include "xfs_log.h" 22#include "xfs_log.h"
23#include "xfs_inum.h" 23#include "xfs_inum.h"
24#include "xfs_trans.h" 24#include "xfs_trans.h"
25#include "xfs_trans_priv.h"
25#include "xfs_sb.h" 26#include "xfs_sb.h"
26#include "xfs_ag.h" 27#include "xfs_ag.h"
27#include "xfs_mount.h" 28#include "xfs_mount.h"
@@ -39,42 +40,61 @@
39#include <linux/kthread.h> 40#include <linux/kthread.h>
40#include <linux/freezer.h> 41#include <linux/freezer.h>
41 42
43struct workqueue_struct *xfs_syncd_wq; /* sync workqueue */
42 44
43STATIC xfs_inode_t * 45/*
44xfs_inode_ag_lookup( 46 * The inode lookup is done in batches to keep the amount of lock traffic and
45 struct xfs_mount *mp, 47 * radix tree lookups to a minimum. The batch size is a trade off between
46 struct xfs_perag *pag, 48 * lookup reduction and stack usage. This is in the reclaim path, so we can't
47 uint32_t *first_index, 49 * be too greedy.
48 int tag) 50 */
51#define XFS_LOOKUP_BATCH 32
52
53STATIC int
54xfs_inode_ag_walk_grab(
55 struct xfs_inode *ip)
49{ 56{
50 int nr_found; 57 struct inode *inode = VFS_I(ip);
51 struct xfs_inode *ip; 58
59 ASSERT(rcu_read_lock_held());
52 60
53 /* 61 /*
54 * use a gang lookup to find the next inode in the tree 62 * check for stale RCU freed inode
55 * as the tree is sparse and a gang lookup walks to find 63 *
56 * the number of objects requested. 64 * If the inode has been reallocated, it doesn't matter if it's not in
65 * the AG we are walking - we are walking for writeback, so if it
66 * passes all the "valid inode" checks and is dirty, then we'll write
67 * it back anyway. If it has been reallocated and still being
68 * initialised, the XFS_INEW check below will catch it.
57 */ 69 */
58 if (tag == XFS_ICI_NO_TAG) { 70 spin_lock(&ip->i_flags_lock);
59 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, 71 if (!ip->i_ino)
60 (void **)&ip, *first_index, 1); 72 goto out_unlock_noent;
61 } else { 73
62 nr_found = radix_tree_gang_lookup_tag(&pag->pag_ici_root, 74 /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
63 (void **)&ip, *first_index, 1, tag); 75 if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
76 goto out_unlock_noent;
77 spin_unlock(&ip->i_flags_lock);
78
79 /* nothing to sync during shutdown */
80 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
81 return EFSCORRUPTED;
82
83 /* If we can't grab the inode, it must be on its way to reclaim. */
84 if (!igrab(inode))
85 return ENOENT;
86
87 if (is_bad_inode(inode)) {
88 IRELE(ip);
89 return ENOENT;
64 } 90 }
65 if (!nr_found)
66 return NULL;
67 91
68 /* 92 /* inode is valid */
69 * Update the index for the next lookup. Catch overflows 93 return 0;
70 * into the next AG range which can occur if we have inodes 94
71 * in the last block of the AG and we are currently 95out_unlock_noent:
72 * pointing to the last inode. 96 spin_unlock(&ip->i_flags_lock);
73 */ 97 return ENOENT;
74 *first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
75 if (*first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
76 return NULL;
77 return ip;
78} 98}
79 99
80STATIC int 100STATIC int
@@ -83,49 +103,83 @@ xfs_inode_ag_walk(
83 struct xfs_perag *pag, 103 struct xfs_perag *pag,
84 int (*execute)(struct xfs_inode *ip, 104 int (*execute)(struct xfs_inode *ip,
85 struct xfs_perag *pag, int flags), 105 struct xfs_perag *pag, int flags),
86 int flags, 106 int flags)
87 int tag,
88 int exclusive,
89 int *nr_to_scan)
90{ 107{
91 uint32_t first_index; 108 uint32_t first_index;
92 int last_error = 0; 109 int last_error = 0;
93 int skipped; 110 int skipped;
111 int done;
112 int nr_found;
94 113
95restart: 114restart:
115 done = 0;
96 skipped = 0; 116 skipped = 0;
97 first_index = 0; 117 first_index = 0;
118 nr_found = 0;
98 do { 119 do {
120 struct xfs_inode *batch[XFS_LOOKUP_BATCH];
99 int error = 0; 121 int error = 0;
100 xfs_inode_t *ip; 122 int i;
101 123
102 if (exclusive) 124 rcu_read_lock();
103 write_lock(&pag->pag_ici_lock); 125 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
104 else 126 (void **)batch, first_index,
105 read_lock(&pag->pag_ici_lock); 127 XFS_LOOKUP_BATCH);
106 ip = xfs_inode_ag_lookup(mp, pag, &first_index, tag); 128 if (!nr_found) {
107 if (!ip) { 129 rcu_read_unlock();
108 if (exclusive)
109 write_unlock(&pag->pag_ici_lock);
110 else
111 read_unlock(&pag->pag_ici_lock);
112 break; 130 break;
113 } 131 }
114 132
115 /* execute releases pag->pag_ici_lock */ 133 /*
116 error = execute(ip, pag, flags); 134 * Grab the inodes before we drop the lock. if we found
117 if (error == EAGAIN) { 135 * nothing, nr == 0 and the loop will be skipped.
118 skipped++; 136 */
119 continue; 137 for (i = 0; i < nr_found; i++) {
138 struct xfs_inode *ip = batch[i];
139
140 if (done || xfs_inode_ag_walk_grab(ip))
141 batch[i] = NULL;
142
143 /*
144 * Update the index for the next lookup. Catch
145 * overflows into the next AG range which can occur if
146 * we have inodes in the last block of the AG and we
147 * are currently pointing to the last inode.
148 *
149 * Because we may see inodes that are from the wrong AG
150 * due to RCU freeing and reallocation, only update the
151 * index if it lies in this AG. It was a race that led
152 * us to see this inode, so another lookup from the
153 * same index will not find it again.
154 */
155 if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
156 continue;
157 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
158 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
159 done = 1;
160 }
161
162 /* unlock now we've grabbed the inodes. */
163 rcu_read_unlock();
164
165 for (i = 0; i < nr_found; i++) {
166 if (!batch[i])
167 continue;
168 error = execute(batch[i], pag, flags);
169 IRELE(batch[i]);
170 if (error == EAGAIN) {
171 skipped++;
172 continue;
173 }
174 if (error && last_error != EFSCORRUPTED)
175 last_error = error;
120 } 176 }
121 if (error)
122 last_error = error;
123 177
124 /* bail out if the filesystem is corrupted. */ 178 /* bail out if the filesystem is corrupted. */
125 if (error == EFSCORRUPTED) 179 if (error == EFSCORRUPTED)
126 break; 180 break;
127 181
128 } while ((*nr_to_scan)--); 182 } while (nr_found && !done);
129 183
130 if (skipped) { 184 if (skipped) {
131 delay(1); 185 delay(1);
@@ -134,110 +188,32 @@ restart:
134 return last_error; 188 return last_error;
135} 189}
136 190
137/*
138 * Select the next per-ag structure to iterate during the walk. The reclaim
139 * walk is optimised only to walk AGs with reclaimable inodes in them.
140 */
141static struct xfs_perag *
142xfs_inode_ag_iter_next_pag(
143 struct xfs_mount *mp,
144 xfs_agnumber_t *first,
145 int tag)
146{
147 struct xfs_perag *pag = NULL;
148
149 if (tag == XFS_ICI_RECLAIM_TAG) {
150 int found;
151 int ref;
152
153 spin_lock(&mp->m_perag_lock);
154 found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
155 (void **)&pag, *first, 1, tag);
156 if (found <= 0) {
157 spin_unlock(&mp->m_perag_lock);
158 return NULL;
159 }
160 *first = pag->pag_agno + 1;
161 /* open coded pag reference increment */
162 ref = atomic_inc_return(&pag->pag_ref);
163 spin_unlock(&mp->m_perag_lock);
164 trace_xfs_perag_get_reclaim(mp, pag->pag_agno, ref, _RET_IP_);
165 } else {
166 pag = xfs_perag_get(mp, *first);
167 (*first)++;
168 }
169 return pag;
170}
171
172int 191int
173xfs_inode_ag_iterator( 192xfs_inode_ag_iterator(
174 struct xfs_mount *mp, 193 struct xfs_mount *mp,
175 int (*execute)(struct xfs_inode *ip, 194 int (*execute)(struct xfs_inode *ip,
176 struct xfs_perag *pag, int flags), 195 struct xfs_perag *pag, int flags),
177 int flags, 196 int flags)
178 int tag,
179 int exclusive,
180 int *nr_to_scan)
181{ 197{
182 struct xfs_perag *pag; 198 struct xfs_perag *pag;
183 int error = 0; 199 int error = 0;
184 int last_error = 0; 200 int last_error = 0;
185 xfs_agnumber_t ag; 201 xfs_agnumber_t ag;
186 int nr;
187 202
188 nr = nr_to_scan ? *nr_to_scan : INT_MAX;
189 ag = 0; 203 ag = 0;
190 while ((pag = xfs_inode_ag_iter_next_pag(mp, &ag, tag))) { 204 while ((pag = xfs_perag_get(mp, ag))) {
191 error = xfs_inode_ag_walk(mp, pag, execute, flags, tag, 205 ag = pag->pag_agno + 1;
192 exclusive, &nr); 206 error = xfs_inode_ag_walk(mp, pag, execute, flags);
193 xfs_perag_put(pag); 207 xfs_perag_put(pag);
194 if (error) { 208 if (error) {
195 last_error = error; 209 last_error = error;
196 if (error == EFSCORRUPTED) 210 if (error == EFSCORRUPTED)
197 break; 211 break;
198 } 212 }
199 if (nr <= 0)
200 break;
201 } 213 }
202 if (nr_to_scan)
203 *nr_to_scan = nr;
204 return XFS_ERROR(last_error); 214 return XFS_ERROR(last_error);
205} 215}
206 216
207/* must be called with pag_ici_lock held and releases it */
208int
209xfs_sync_inode_valid(
210 struct xfs_inode *ip,
211 struct xfs_perag *pag)
212{
213 struct inode *inode = VFS_I(ip);
214 int error = EFSCORRUPTED;
215
216 /* nothing to sync during shutdown */
217 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
218 goto out_unlock;
219
220 /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
221 error = ENOENT;
222 if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
223 goto out_unlock;
224
225 /* If we can't grab the inode, it must on it's way to reclaim. */
226 if (!igrab(inode))
227 goto out_unlock;
228
229 if (is_bad_inode(inode)) {
230 IRELE(ip);
231 goto out_unlock;
232 }
233
234 /* inode is valid */
235 error = 0;
236out_unlock:
237 read_unlock(&pag->pag_ici_lock);
238 return error;
239}
240
241STATIC int 217STATIC int
242xfs_sync_inode_data( 218xfs_sync_inode_data(
243 struct xfs_inode *ip, 219 struct xfs_inode *ip,
@@ -248,10 +224,6 @@ xfs_sync_inode_data(
248 struct address_space *mapping = inode->i_mapping; 224 struct address_space *mapping = inode->i_mapping;
249 int error = 0; 225 int error = 0;
250 226
251 error = xfs_sync_inode_valid(ip, pag);
252 if (error)
253 return error;
254
255 if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) 227 if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
256 goto out_wait; 228 goto out_wait;
257 229
@@ -268,7 +240,6 @@ xfs_sync_inode_data(
268 out_wait: 240 out_wait:
269 if (flags & SYNC_WAIT) 241 if (flags & SYNC_WAIT)
270 xfs_ioend_wait(ip); 242 xfs_ioend_wait(ip);
271 IRELE(ip);
272 return error; 243 return error;
273} 244}
274 245
@@ -280,10 +251,6 @@ xfs_sync_inode_attr(
280{ 251{
281 int error = 0; 252 int error = 0;
282 253
283 error = xfs_sync_inode_valid(ip, pag);
284 if (error)
285 return error;
286
287 xfs_ilock(ip, XFS_ILOCK_SHARED); 254 xfs_ilock(ip, XFS_ILOCK_SHARED);
288 if (xfs_inode_clean(ip)) 255 if (xfs_inode_clean(ip))
289 goto out_unlock; 256 goto out_unlock;
@@ -300,9 +267,18 @@ xfs_sync_inode_attr(
300 267
301 error = xfs_iflush(ip, flags); 268 error = xfs_iflush(ip, flags);
302 269
270 /*
271 * We don't want to try again on non-blocking flushes that can't run
272 * again immediately. If an inode really must be written, then that's
273 * what the SYNC_WAIT flag is for.
274 */
275 if (error == EAGAIN) {
276 ASSERT(!(flags & SYNC_WAIT));
277 error = 0;
278 }
279
303 out_unlock: 280 out_unlock:
304 xfs_iunlock(ip, XFS_ILOCK_SHARED); 281 xfs_iunlock(ip, XFS_ILOCK_SHARED);
305 IRELE(ip);
306 return error; 282 return error;
307} 283}
308 284
@@ -318,8 +294,7 @@ xfs_sync_data(
318 294
319 ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0); 295 ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);
320 296
321 error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags, 297 error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags);
322 XFS_ICI_NO_TAG, 0, NULL);
323 if (error) 298 if (error)
324 return XFS_ERROR(error); 299 return XFS_ERROR(error);
325 300
@@ -337,8 +312,7 @@ xfs_sync_attr(
337{ 312{
338 ASSERT((flags & ~SYNC_WAIT) == 0); 313 ASSERT((flags & ~SYNC_WAIT) == 0);
339 314
340 return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags, 315 return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags);
341 XFS_ICI_NO_TAG, 0, NULL);
342} 316}
343 317
344STATIC int 318STATIC int
@@ -401,7 +375,7 @@ xfs_quiesce_data(
401 375
402 /* mark the log as covered if needed */ 376 /* mark the log as covered if needed */
403 if (xfs_log_need_covered(mp)) 377 if (xfs_log_need_covered(mp))
404 error2 = xfs_fs_log_dummy(mp, SYNC_WAIT); 378 error2 = xfs_fs_log_dummy(mp);
405 379
406 /* flush data-only devices */ 380 /* flush data-only devices */
407 if (mp->m_rtdev_targp) 381 if (mp->m_rtdev_targp)
@@ -440,7 +414,7 @@ xfs_quiesce_fs(
440/* 414/*
441 * Second stage of a quiesce. The data is already synced, now we have to take 415 * Second stage of a quiesce. The data is already synced, now we have to take
442 * care of the metadata. New transactions are already blocked, so we need to 416 * care of the metadata. New transactions are already blocked, so we need to
443 * wait for any remaining transactions to drain out before proceding. 417 * wait for any remaining transactions to drain out before proceeding.
444 */ 418 */
445void 419void
446xfs_quiesce_attr( 420xfs_quiesce_attr(
@@ -464,69 +438,18 @@ xfs_quiesce_attr(
464 /* Push the superblock and write an unmount record */ 438 /* Push the superblock and write an unmount record */
465 error = xfs_log_sbcount(mp, 1); 439 error = xfs_log_sbcount(mp, 1);
466 if (error) 440 if (error)
467 xfs_fs_cmn_err(CE_WARN, mp, 441 xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. "
468 "xfs_attr_quiesce: failed to log sb changes. "
469 "Frozen image may not be consistent."); 442 "Frozen image may not be consistent.");
470 xfs_log_unmount_write(mp); 443 xfs_log_unmount_write(mp);
471 xfs_unmountfs_writesb(mp); 444 xfs_unmountfs_writesb(mp);
472} 445}
473 446
474/* 447static void
475 * Enqueue a work item to be picked up by the vfs xfssyncd thread. 448xfs_syncd_queue_sync(
476 * Doing this has two advantages: 449 struct xfs_mount *mp)
477 * - It saves on stack space, which is tight in certain situations
478 * - It can be used (with care) as a mechanism to avoid deadlocks.
479 * Flushing while allocating in a full filesystem requires both.
480 */
481STATIC void
482xfs_syncd_queue_work(
483 struct xfs_mount *mp,
484 void *data,
485 void (*syncer)(struct xfs_mount *, void *),
486 struct completion *completion)
487{
488 struct xfs_sync_work *work;
489
490 work = kmem_alloc(sizeof(struct xfs_sync_work), KM_SLEEP);
491 INIT_LIST_HEAD(&work->w_list);
492 work->w_syncer = syncer;
493 work->w_data = data;
494 work->w_mount = mp;
495 work->w_completion = completion;
496 spin_lock(&mp->m_sync_lock);
497 list_add_tail(&work->w_list, &mp->m_sync_list);
498 spin_unlock(&mp->m_sync_lock);
499 wake_up_process(mp->m_sync_task);
500}
501
502/*
503 * Flush delayed allocate data, attempting to free up reserved space
504 * from existing allocations. At this point a new allocation attempt
505 * has failed with ENOSPC and we are in the process of scratching our
506 * heads, looking about for more room...
507 */
508STATIC void
509xfs_flush_inodes_work(
510 struct xfs_mount *mp,
511 void *arg)
512{
513 struct inode *inode = arg;
514 xfs_sync_data(mp, SYNC_TRYLOCK);
515 xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT);
516 iput(inode);
517}
518
519void
520xfs_flush_inodes(
521 xfs_inode_t *ip)
522{ 450{
523 struct inode *inode = VFS_I(ip); 451 queue_delayed_work(xfs_syncd_wq, &mp->m_sync_work,
524 DECLARE_COMPLETION_ONSTACK(completion); 452 msecs_to_jiffies(xfs_syncd_centisecs * 10));
525
526 igrab(inode);
527 xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inodes_work, &completion);
528 wait_for_completion(&completion);
529 xfs_log_force(ip->i_mount, XFS_LOG_SYNC);
530} 453}
531 454
532/* 455/*
@@ -536,84 +459,119 @@ xfs_flush_inodes(
536 */ 459 */
537STATIC void 460STATIC void
538xfs_sync_worker( 461xfs_sync_worker(
539 struct xfs_mount *mp, 462 struct work_struct *work)
540 void *unused)
541{ 463{
464 struct xfs_mount *mp = container_of(to_delayed_work(work),
465 struct xfs_mount, m_sync_work);
542 int error; 466 int error;
543 467
544 if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { 468 if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
545 xfs_log_force(mp, 0);
546 xfs_reclaim_inodes(mp, 0);
547 /* dgc: errors ignored here */ 469 /* dgc: errors ignored here */
548 error = xfs_qm_sync(mp, SYNC_TRYLOCK);
549 if (mp->m_super->s_frozen == SB_UNFROZEN && 470 if (mp->m_super->s_frozen == SB_UNFROZEN &&
550 xfs_log_need_covered(mp)) 471 xfs_log_need_covered(mp))
551 error = xfs_fs_log_dummy(mp, 0); 472 error = xfs_fs_log_dummy(mp);
473 else
474 xfs_log_force(mp, 0);
475 error = xfs_qm_sync(mp, SYNC_TRYLOCK);
476
477 /* start pushing all the metadata that is currently dirty */
478 xfs_ail_push_all(mp->m_ail);
552 } 479 }
553 mp->m_sync_seq++; 480
554 wake_up(&mp->m_wait_single_sync_task); 481 /* queue us up again */
482 xfs_syncd_queue_sync(mp);
555} 483}
556 484
557STATIC int 485/*
558xfssyncd( 486 * Queue a new inode reclaim pass if there are reclaimable inodes and there
559 void *arg) 487 * isn't a reclaim pass already in progress. By default it runs every 5s based
488 * on the xfs syncd work default of 30s. Perhaps this should have its own
489 * tunable, but that can be done if this method proves to be ineffective or too
490 * aggressive.
491 */
492static void
493xfs_syncd_queue_reclaim(
494 struct xfs_mount *mp)
560{ 495{
561 struct xfs_mount *mp = arg;
562 long timeleft;
563 xfs_sync_work_t *work, *n;
564 LIST_HEAD (tmp);
565
566 set_freezable();
567 timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10);
568 for (;;) {
569 if (list_empty(&mp->m_sync_list))
570 timeleft = schedule_timeout_interruptible(timeleft);
571 /* swsusp */
572 try_to_freeze();
573 if (kthread_should_stop() && list_empty(&mp->m_sync_list))
574 break;
575 496
576 spin_lock(&mp->m_sync_lock); 497 /*
577 /* 498 * We can have inodes enter reclaim after we've shut down the syncd
578 * We can get woken by laptop mode, to do a sync - 499 * workqueue during unmount, so don't allow reclaim work to be queued
579 * that's the (only!) case where the list would be 500 * during unmount.
580 * empty with time remaining. 501 */
581 */ 502 if (!(mp->m_super->s_flags & MS_ACTIVE))
582 if (!timeleft || list_empty(&mp->m_sync_list)) { 503 return;
583 if (!timeleft)
584 timeleft = xfs_syncd_centisecs *
585 msecs_to_jiffies(10);
586 INIT_LIST_HEAD(&mp->m_sync_work.w_list);
587 list_add_tail(&mp->m_sync_work.w_list,
588 &mp->m_sync_list);
589 }
590 list_splice_init(&mp->m_sync_list, &tmp);
591 spin_unlock(&mp->m_sync_lock);
592 504
593 list_for_each_entry_safe(work, n, &tmp, w_list) { 505 rcu_read_lock();
594 (*work->w_syncer)(mp, work->w_data); 506 if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
595 list_del(&work->w_list); 507 queue_delayed_work(xfs_syncd_wq, &mp->m_reclaim_work,
596 if (work == &mp->m_sync_work) 508 msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
597 continue;
598 if (work->w_completion)
599 complete(work->w_completion);
600 kmem_free(work);
601 }
602 } 509 }
510 rcu_read_unlock();
511}
603 512
604 return 0; 513/*
514 * This is a fast pass over the inode cache to try to get reclaim moving on as
515 * many inodes as possible in a short period of time. It kicks itself every few
516 * seconds, as well as being kicked by the inode cache shrinker when memory
517 * goes low. It scans as quickly as possible avoiding locked inodes or those
518 * already being flushed, and once done schedules a future pass.
519 */
520STATIC void
521xfs_reclaim_worker(
522 struct work_struct *work)
523{
524 struct xfs_mount *mp = container_of(to_delayed_work(work),
525 struct xfs_mount, m_reclaim_work);
526
527 xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
528 xfs_syncd_queue_reclaim(mp);
529}
530
531/*
532 * Flush delayed allocate data, attempting to free up reserved space
533 * from existing allocations. At this point a new allocation attempt
534 * has failed with ENOSPC and we are in the process of scratching our
535 * heads, looking about for more room.
536 *
537 * Queue a new data flush if there isn't one already in progress and
538 * wait for completion of the flush. This means that we only ever have one
539 * inode flush in progress no matter how many ENOSPC events are occurring and
540 * so will prevent the system from bogging down due to every concurrent
541 * ENOSPC event scanning all the active inodes in the system for writeback.
542 */
543void
544xfs_flush_inodes(
545 struct xfs_inode *ip)
546{
547 struct xfs_mount *mp = ip->i_mount;
548
549 queue_work(xfs_syncd_wq, &mp->m_flush_work);
550 flush_work_sync(&mp->m_flush_work);
551}
552
553STATIC void
554xfs_flush_worker(
555 struct work_struct *work)
556{
557 struct xfs_mount *mp = container_of(work,
558 struct xfs_mount, m_flush_work);
559
560 xfs_sync_data(mp, SYNC_TRYLOCK);
561 xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT);
605} 562}
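The xfs_flush_inodes()/xfs_flush_worker() pair above leans on workqueue semantics to serialise ENOSPC flushes: queue_work() is a no-op while m_flush_work is still pending, and flush_work_sync() makes every caller wait for that single execution. A hedged user-space model of the same behaviour, with pthreads standing in for the workqueue (an illustration, not the kernel code):

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t done = PTHREAD_COND_INITIALIZER;
static int queued;               /* is a "flush work item" already pending? */
static unsigned long generation; /* bumped each time the flush completes */

static void *flush_worker(void *arg)
{
	(void)arg;
	usleep(100000);                 /* stand-in for xfs_sync_data() */
	pthread_mutex_lock(&lock);
	queued = 0;
	generation++;
	pthread_cond_broadcast(&done);  /* wake every waiting ENOSPC caller */
	pthread_mutex_unlock(&lock);
	return NULL;
}

static void flush_inodes(void)
{
	pthread_t tid;
	unsigned long gen;

	pthread_mutex_lock(&lock);
	gen = generation;
	if (!queued) {                  /* queue_work(): only one item in flight */
		queued = 1;
		pthread_create(&tid, NULL, flush_worker, NULL);
		pthread_detach(tid);
	}
	while (generation == gen)       /* flush_work_sync(): wait for completion */
		pthread_cond_wait(&done, &lock);
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	flush_inodes();
	printf("flush complete, generation %lu\n", generation);
	return 0;
}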
606 563
607int 564int
608xfs_syncd_init( 565xfs_syncd_init(
609 struct xfs_mount *mp) 566 struct xfs_mount *mp)
610{ 567{
611 mp->m_sync_work.w_syncer = xfs_sync_worker; 568 INIT_WORK(&mp->m_flush_work, xfs_flush_worker);
612 mp->m_sync_work.w_mount = mp; 569 INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker);
613 mp->m_sync_work.w_completion = NULL; 570 INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
614 mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd/%s", mp->m_fsname); 571
615 if (IS_ERR(mp->m_sync_task)) 572 xfs_syncd_queue_sync(mp);
616 return -PTR_ERR(mp->m_sync_task); 573 xfs_syncd_queue_reclaim(mp);
574
617 return 0; 575 return 0;
618} 576}
619 577
@@ -621,7 +579,9 @@ void
621xfs_syncd_stop( 579xfs_syncd_stop(
622 struct xfs_mount *mp) 580 struct xfs_mount *mp)
623{ 581{
624 kthread_stop(mp->m_sync_task); 582 cancel_delayed_work_sync(&mp->m_sync_work);
583 cancel_delayed_work_sync(&mp->m_reclaim_work);
584 cancel_work_sync(&mp->m_flush_work);
625} 585}
626 586
627void 587void
@@ -640,6 +600,10 @@ __xfs_inode_set_reclaim_tag(
640 XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), 600 XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
641 XFS_ICI_RECLAIM_TAG); 601 XFS_ICI_RECLAIM_TAG);
642 spin_unlock(&ip->i_mount->m_perag_lock); 602 spin_unlock(&ip->i_mount->m_perag_lock);
603
604 /* schedule periodic background inode reclaim */
605 xfs_syncd_queue_reclaim(ip->i_mount);
606
643 trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno, 607 trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno,
644 -1, _RET_IP_); 608 -1, _RET_IP_);
645 } 609 }
@@ -659,12 +623,12 @@ xfs_inode_set_reclaim_tag(
659 struct xfs_perag *pag; 623 struct xfs_perag *pag;
660 624
661 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); 625 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
662 write_lock(&pag->pag_ici_lock); 626 spin_lock(&pag->pag_ici_lock);
663 spin_lock(&ip->i_flags_lock); 627 spin_lock(&ip->i_flags_lock);
664 __xfs_inode_set_reclaim_tag(pag, ip); 628 __xfs_inode_set_reclaim_tag(pag, ip);
665 __xfs_iflags_set(ip, XFS_IRECLAIMABLE); 629 __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
666 spin_unlock(&ip->i_flags_lock); 630 spin_unlock(&ip->i_flags_lock);
667 write_unlock(&pag->pag_ici_lock); 631 spin_unlock(&pag->pag_ici_lock);
668 xfs_perag_put(pag); 632 xfs_perag_put(pag);
669} 633}
670 634
@@ -698,6 +662,53 @@ __xfs_inode_clear_reclaim_tag(
698} 662}
699 663
700/* 664/*
665 * Grab the inode for reclaim exclusively.
666 * Return 0 if we grabbed it, non-zero otherwise.
667 */
668STATIC int
669xfs_reclaim_inode_grab(
670 struct xfs_inode *ip,
671 int flags)
672{
673 ASSERT(rcu_read_lock_held());
674
675 /* quick check for stale RCU freed inode */
676 if (!ip->i_ino)
677 return 1;
678
679 /*
680 * do some unlocked checks first to avoid unnecessary lock traffic.
681 * The first is a flush lock check, the second is an already-in-reclaim
682 * check. Only do these checks if we are not going to block on locks.
683 */
684 if ((flags & SYNC_TRYLOCK) &&
685 (!ip->i_flush.done || __xfs_iflags_test(ip, XFS_IRECLAIM))) {
686 return 1;
687 }
688
689 /*
690 * The radix tree lock here protects a thread in xfs_iget from racing
691 * with us starting reclaim on the inode. Once we have the
692 * XFS_IRECLAIM flag set it will not touch us.
693 *
694 * Due to RCU lookup, we may find inodes that have been freed and only
695 * have XFS_IRECLAIM set. Indeed, we may see reallocated inodes that
696 * aren't candidates for reclaim at all, so we must check that
697 * XFS_IRECLAIMABLE is set first before proceeding to reclaim.
698 */
699 spin_lock(&ip->i_flags_lock);
700 if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
701 __xfs_iflags_test(ip, XFS_IRECLAIM)) {
702 /* not a reclaim candidate. */
703 spin_unlock(&ip->i_flags_lock);
704 return 1;
705 }
706 __xfs_iflags_set(ip, XFS_IRECLAIM);
707 spin_unlock(&ip->i_flags_lock);
708 return 0;
709}
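The grab routine above is an instance of the optimistic-check pattern: cheap unlocked tests filter obvious non-candidates, and the decisive test is then repeated under i_flags_lock before the inode is claimed, because the unlocked view may be stale under RCU. A small user-space sketch of that shape, using a generic flags object (an assumption for illustration, not XFS code):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define FLAG_RECLAIMABLE 0x1
#define FLAG_RECLAIM     0x2

struct obj {
	pthread_mutex_t flags_lock;
	unsigned int flags;
};

static bool grab_for_reclaim(struct obj *o)
{
	/* unlocked peek: skip objects that are clearly already claimed */
	if (o->flags & FLAG_RECLAIM)
		return false;

	pthread_mutex_lock(&o->flags_lock);
	if (!(o->flags & FLAG_RECLAIMABLE) || (o->flags & FLAG_RECLAIM)) {
		pthread_mutex_unlock(&o->flags_lock);	/* lost the race or not a candidate */
		return false;
	}
	o->flags |= FLAG_RECLAIM;			/* claimed: other threads now back off */
	pthread_mutex_unlock(&o->flags_lock);
	return true;
}

int main(void)
{
	struct obj o = { PTHREAD_MUTEX_INITIALIZER, FLAG_RECLAIMABLE };

	printf("first grab: %d, second grab: %d\n",
	       grab_for_reclaim(&o), grab_for_reclaim(&o));	/* prints 1 then 0 */
	return 0;
}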
710
711/*
701 * Inodes in different states need to be treated differently, and the return 712 * Inodes in different states need to be treated differently, and the return
702 * value of xfs_iflush is not sufficient to get this right. The following table 713 * value of xfs_iflush is not sufficient to get this right. The following table
703 * lists the inode states and the reclaim actions necessary for non-blocking 714 * lists the inode states and the reclaim actions necessary for non-blocking
@@ -753,25 +764,10 @@ xfs_reclaim_inode(
753 struct xfs_perag *pag, 764 struct xfs_perag *pag,
754 int sync_mode) 765 int sync_mode)
755{ 766{
756 int error = 0; 767 int error;
757
758 /*
759 * The radix tree lock here protects a thread in xfs_iget from racing
760 * with us starting reclaim on the inode. Once we have the
761 * XFS_IRECLAIM flag set it will not touch us.
762 */
763 spin_lock(&ip->i_flags_lock);
764 ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE));
765 if (__xfs_iflags_test(ip, XFS_IRECLAIM)) {
766 /* ignore as it is already under reclaim */
767 spin_unlock(&ip->i_flags_lock);
768 write_unlock(&pag->pag_ici_lock);
769 return 0;
770 }
771 __xfs_iflags_set(ip, XFS_IRECLAIM);
772 spin_unlock(&ip->i_flags_lock);
773 write_unlock(&pag->pag_ici_lock);
774 768
769restart:
770 error = 0;
775 xfs_ilock(ip, XFS_ILOCK_EXCL); 771 xfs_ilock(ip, XFS_ILOCK_EXCL);
776 if (!xfs_iflock_nowait(ip)) { 772 if (!xfs_iflock_nowait(ip)) {
777 if (!(sync_mode & SYNC_WAIT)) 773 if (!(sync_mode & SYNC_WAIT))
@@ -797,9 +793,31 @@ xfs_reclaim_inode(
797 if (xfs_inode_clean(ip)) 793 if (xfs_inode_clean(ip))
798 goto reclaim; 794 goto reclaim;
799 795
800 /* Now we have an inode that needs flushing */ 796 /*
801 error = xfs_iflush(ip, sync_mode); 797 * Now we have an inode that needs flushing.
798 *
799 * We do a nonblocking flush here even if we are doing a SYNC_WAIT
800 * reclaim as we can deadlock with inode cluster removal.
801 * xfs_ifree_cluster() can lock the inode buffer before it locks the
802 * ip->i_lock, and we are doing the exact opposite here. As a result,
803 * doing a blocking xfs_itobp() to get the cluster buffer will result
804 * in an ABBA deadlock with xfs_ifree_cluster().
805 *
806 * As xfs_ifree_cluster() must gather all inodes that are active in the
807 * cache to mark them stale, if we hit this case we don't actually want
808 * to do IO here - we want the inode marked stale so we can simply
809 * reclaim it. Hence if we get an EAGAIN error on a SYNC_WAIT flush,
810 * just unlock the inode, back off and try again. Hopefully the next
811 * pass through will see the stale flag set on the inode.
812 */
813 error = xfs_iflush(ip, SYNC_TRYLOCK | sync_mode);
802 if (sync_mode & SYNC_WAIT) { 814 if (sync_mode & SYNC_WAIT) {
815 if (error == EAGAIN) {
816 xfs_iunlock(ip, XFS_ILOCK_EXCL);
817 /* backoff longer than in xfs_ifree_cluster */
818 delay(2);
819 goto restart;
820 }
803 xfs_iflock(ip); 821 xfs_iflock(ip);
804 goto reclaim; 822 goto reclaim;
805 } 823 }
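A toy user-space illustration of why the SYNC_TRYLOCK flush plus the delay-and-restart in the hunk above avoids the ABBA deadlock described in the comment: one side takes A then B, the other takes B then A, so the reclaim side only trylocks the second lock and backs off on failure instead of blocking (simplified model, not XFS code):

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t ilock = PTHREAD_MUTEX_INITIALIZER;    /* plays ip->i_lock */
static pthread_mutex_t buf_lock = PTHREAD_MUTEX_INITIALIZER; /* plays the cluster buffer lock */

static void *cluster_free(void *arg)          /* locks buffer first, then the inode */
{
	(void)arg;
	pthread_mutex_lock(&buf_lock);
	usleep(1000);
	pthread_mutex_lock(&ilock);
	pthread_mutex_unlock(&ilock);
	pthread_mutex_unlock(&buf_lock);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, cluster_free, NULL);

	/* "reclaim": lock the inode, then only *try* the buffer; on failure drop
	 * the inode lock, back off and restart rather than blocking in ABBA order. */
	for (;;) {
		pthread_mutex_lock(&ilock);
		if (pthread_mutex_trylock(&buf_lock) == 0)
			break;
		pthread_mutex_unlock(&ilock);
		usleep(2000);                 /* cf. delay(2) in the hunk above */
	}
	puts("reclaim got both locks without deadlocking");
	pthread_mutex_unlock(&buf_lock);
	pthread_mutex_unlock(&ilock);
	pthread_join(t, NULL);
	return 0;
}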
@@ -814,7 +832,7 @@ xfs_reclaim_inode(
814 * pass on the error. 832 * pass on the error.
815 */ 833 */
816 if (error && error != EAGAIN && !XFS_FORCED_SHUTDOWN(ip->i_mount)) { 834 if (error && error != EAGAIN && !XFS_FORCED_SHUTDOWN(ip->i_mount)) {
817 xfs_fs_cmn_err(CE_WARN, ip->i_mount, 835 xfs_warn(ip->i_mount,
818 "inode 0x%llx background reclaim flush failed with %d", 836 "inode 0x%llx background reclaim flush failed with %d",
819 (long long)ip->i_ino, error); 837 (long long)ip->i_ino, error);
820 } 838 }
@@ -842,12 +860,12 @@ reclaim:
842 * added to the tree assert that it's been there before to catch 860 * added to the tree assert that it's been there before to catch
843 * problems with the inode life time early on. 861 * problems with the inode life time early on.
844 */ 862 */
845 write_lock(&pag->pag_ici_lock); 863 spin_lock(&pag->pag_ici_lock);
846 if (!radix_tree_delete(&pag->pag_ici_root, 864 if (!radix_tree_delete(&pag->pag_ici_root,
847 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino))) 865 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino)))
848 ASSERT(0); 866 ASSERT(0);
849 __xfs_inode_clear_reclaim(pag, ip); 867 __xfs_inode_clear_reclaim(pag, ip);
850 write_unlock(&pag->pag_ici_lock); 868 spin_unlock(&pag->pag_ici_lock);
851 869
852 /* 870 /*
853 * Here we do an (almost) spurious inode lock in order to coordinate 871 * Here we do an (almost) spurious inode lock in order to coordinate
@@ -868,45 +886,181 @@ reclaim:
868 886
869} 887}
870 888
889/*
890 * Walk the AGs and reclaim the inodes in them. Even if the filesystem is
891 * corrupted, we still want to try to reclaim all the inodes. If we don't,
892 * then a shutdown during the filesystem unmount reclaim walk will leak all the
893 * unreclaimed inodes.
894 */
895int
896xfs_reclaim_inodes_ag(
897 struct xfs_mount *mp,
898 int flags,
899 int *nr_to_scan)
900{
901 struct xfs_perag *pag;
902 int error = 0;
903 int last_error = 0;
904 xfs_agnumber_t ag;
905 int trylock = flags & SYNC_TRYLOCK;
906 int skipped;
907
908restart:
909 ag = 0;
910 skipped = 0;
911 while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
912 unsigned long first_index = 0;
913 int done = 0;
914 int nr_found = 0;
915
916 ag = pag->pag_agno + 1;
917
918 if (trylock) {
919 if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) {
920 skipped++;
921 xfs_perag_put(pag);
922 continue;
923 }
924 first_index = pag->pag_ici_reclaim_cursor;
925 } else
926 mutex_lock(&pag->pag_ici_reclaim_lock);
927
928 do {
929 struct xfs_inode *batch[XFS_LOOKUP_BATCH];
930 int i;
931
932 rcu_read_lock();
933 nr_found = radix_tree_gang_lookup_tag(
934 &pag->pag_ici_root,
935 (void **)batch, first_index,
936 XFS_LOOKUP_BATCH,
937 XFS_ICI_RECLAIM_TAG);
938 if (!nr_found) {
939 done = 1;
940 rcu_read_unlock();
941 break;
942 }
943
944 /*
945 * Grab the inodes before we drop the lock. If we found
946 * nothing, nr == 0 and the loop will be skipped.
947 */
948 for (i = 0; i < nr_found; i++) {
949 struct xfs_inode *ip = batch[i];
950
951 if (done || xfs_reclaim_inode_grab(ip, flags))
952 batch[i] = NULL;
953
954 /*
955 * Update the index for the next lookup. Catch
956 * overflows into the next AG range which can
957 * occur if we have inodes in the last block of
958 * the AG and we are currently pointing to the
959 * last inode.
960 *
961 * Because we may see inodes that are from the
962 * wrong AG due to RCU freeing and
963 * reallocation, only update the index if it
964 * lies in this AG. It was a race that led us
965 * to see this inode, so another lookup from
966 * the same index will not find it again.
967 */
968 if (XFS_INO_TO_AGNO(mp, ip->i_ino) !=
969 pag->pag_agno)
970 continue;
971 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
972 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
973 done = 1;
974 }
975
976 /* unlock now we've grabbed the inodes. */
977 rcu_read_unlock();
978
979 for (i = 0; i < nr_found; i++) {
980 if (!batch[i])
981 continue;
982 error = xfs_reclaim_inode(batch[i], pag, flags);
983 if (error && last_error != EFSCORRUPTED)
984 last_error = error;
985 }
986
987 *nr_to_scan -= XFS_LOOKUP_BATCH;
988
989 } while (nr_found && !done && *nr_to_scan > 0);
990
991 if (trylock && !done)
992 pag->pag_ici_reclaim_cursor = first_index;
993 else
994 pag->pag_ici_reclaim_cursor = 0;
995 mutex_unlock(&pag->pag_ici_reclaim_lock);
996 xfs_perag_put(pag);
997 }
998
999 /*
1000 * If we skipped any AG, and we still have scan count remaining, do
1001 * another pass this time using blocking reclaim semantics (i.e.
1002 * waiting on the reclaim locks and ignoring the reclaim cursors). This
1003 * ensures that when we get more reclaimers than AGs we block rather
1004 * than spin trying to execute reclaim.
1005 */
1006 if (trylock && skipped && *nr_to_scan > 0) {
1007 trylock = 0;
1008 goto restart;
1009 }
1010 return XFS_ERROR(last_error);
1011}
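Stripped of locking, the per-AG walk above reduces to: gang-look-up a batch of tagged entries starting at a cursor, process them, advance the cursor past the last inode seen, and stop when the batch comes back empty or the scan budget is spent. A runnable user-space reduction of that loop, where a plain sorted array stands in for the per-AG radix tree (names are illustrative):

#include <stdio.h>

#define LOOKUP_BATCH 4

static int gang_lookup(const int *tree, int ntree, int first, int *batch, int nbatch)
{
	int found = 0, i;

	for (i = 0; i < ntree && found < nbatch; i++)
		if (tree[i] >= first)
			batch[found++] = tree[i];
	return found;
}

int main(void)
{
	int tree[] = { 3, 7, 8, 15, 21, 22, 40, 41, 57 };	/* "reclaimable inodes" */
	int ntree = sizeof(tree) / sizeof(tree[0]);
	int first_index = 0, nr_to_scan = 6, done = 0;

	while (!done && nr_to_scan > 0) {
		int batch[LOOKUP_BATCH], nr_found, i;

		nr_found = gang_lookup(tree, ntree, first_index, batch, LOOKUP_BATCH);
		if (!nr_found)
			break;
		for (i = 0; i < nr_found; i++) {
			printf("reclaim inode %d\n", batch[i]);
			first_index = batch[i] + 1;	/* cursor moves past the last one seen */
		}
		nr_to_scan -= LOOKUP_BATCH;		/* honour the scan budget */
	}
	printf("cursor left at %d for the next pass\n", first_index);
	return 0;
}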
1012
871int 1013int
872xfs_reclaim_inodes( 1014xfs_reclaim_inodes(
873 xfs_mount_t *mp, 1015 xfs_mount_t *mp,
874 int mode) 1016 int mode)
875{ 1017{
876 return xfs_inode_ag_iterator(mp, xfs_reclaim_inode, mode, 1018 int nr_to_scan = INT_MAX;
877 XFS_ICI_RECLAIM_TAG, 1, NULL); 1019
1020 return xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan);
878} 1021}
879 1022
880/* 1023/*
881 * Shrinker infrastructure. 1024 * Inode cache shrinker.
1025 *
1026 * When called we make sure that there is a background (fast) inode reclaim in
1027 * progress, while we throttle the speed of reclaim via doing synchronous
1028 * reclaim of inodes. That means if we come across dirty inodes, we wait for
1029 * them to be cleaned, which we hope will not be very long due to the
1030 * background walker having already kicked the IO off on those dirty inodes.
882 */ 1031 */
883static int 1032static int
884xfs_reclaim_inode_shrink( 1033xfs_reclaim_inode_shrink(
885 struct shrinker *shrink, 1034 struct shrinker *shrink,
886 int nr_to_scan, 1035 struct shrink_control *sc)
887 gfp_t gfp_mask)
888{ 1036{
889 struct xfs_mount *mp; 1037 struct xfs_mount *mp;
890 struct xfs_perag *pag; 1038 struct xfs_perag *pag;
891 xfs_agnumber_t ag; 1039 xfs_agnumber_t ag;
892 int reclaimable; 1040 int reclaimable;
1041 int nr_to_scan = sc->nr_to_scan;
1042 gfp_t gfp_mask = sc->gfp_mask;
893 1043
894 mp = container_of(shrink, struct xfs_mount, m_inode_shrink); 1044 mp = container_of(shrink, struct xfs_mount, m_inode_shrink);
895 if (nr_to_scan) { 1045 if (nr_to_scan) {
1046 /* kick background reclaimer and push the AIL */
1047 xfs_syncd_queue_reclaim(mp);
1048 xfs_ail_push_all(mp->m_ail);
1049
896 if (!(gfp_mask & __GFP_FS)) 1050 if (!(gfp_mask & __GFP_FS))
897 return -1; 1051 return -1;
898 1052
899 xfs_inode_ag_iterator(mp, xfs_reclaim_inode, 0, 1053 xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT,
900 XFS_ICI_RECLAIM_TAG, 1, &nr_to_scan); 1054 &nr_to_scan);
901 /* if we don't exhaust the scan, don't bother coming back */ 1055 /* terminate if we don't exhaust the scan */
902 if (nr_to_scan > 0) 1056 if (nr_to_scan > 0)
903 return -1; 1057 return -1;
904 } 1058 }
905 1059
906 reclaimable = 0; 1060 reclaimable = 0;
907 ag = 0; 1061 ag = 0;
908 while ((pag = xfs_inode_ag_iter_next_pag(mp, &ag, 1062 while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
909 XFS_ICI_RECLAIM_TAG))) { 1063 ag = pag->pag_agno + 1;
910 reclaimable += pag->pag_ici_reclaimable; 1064 reclaimable += pag->pag_ici_reclaimable;
911 xfs_perag_put(pag); 1065 xfs_perag_put(pag);
912 } 1066 }
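For reference, the shrinker above splits into a scan path and a count path: a non-zero nr_to_scan kicks background reclaim and the AIL and then throttles the caller with synchronous reclaim (turning away callers that lack __GFP_FS), while nr_to_scan == 0 just reports the reclaimable total. A hedged user-space model of that control flow (not the kernel shrinker API):

#include <stdio.h>

#define GFP_FS 0x1

static int reclaimable = 100;	/* stand-in for the summed per-AG reclaim counts */

static int inode_cache_shrink(int nr_to_scan, unsigned int gfp_mask)
{
	if (nr_to_scan) {
		/* the real code also kicks background reclaim and pushes the AIL here */
		if (!(gfp_mask & GFP_FS))
			return -1;	/* caller may not recurse into the filesystem */
		while (nr_to_scan > 0 && reclaimable > 0) {
			reclaimable--;	/* synchronous reclaim throttles the caller */
			nr_to_scan--;
		}
		if (nr_to_scan > 0)
			return -1;	/* scan budget not exhausted: nothing left, don't come back */
	}
	return reclaimable;		/* count pass: how much is left to reclaim */
}

int main(void)
{
	printf("count before: %d\n", inode_cache_shrink(0, GFP_FS));
	inode_cache_shrink(10, GFP_FS);
	printf("count after scanning 10: %d\n", inode_cache_shrink(0, GFP_FS));
	return 0;
}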
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index fe78726196f8..e3a6ad27415f 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -32,6 +32,8 @@ typedef struct xfs_sync_work {
32#define SYNC_WAIT 0x0001 /* wait for i/o to complete */ 32#define SYNC_WAIT 0x0001 /* wait for i/o to complete */
33#define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */ 33#define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */
34 34
35extern struct workqueue_struct *xfs_syncd_wq; /* sync workqueue */
36
35int xfs_syncd_init(struct xfs_mount *mp); 37int xfs_syncd_init(struct xfs_mount *mp);
36void xfs_syncd_stop(struct xfs_mount *mp); 38void xfs_syncd_stop(struct xfs_mount *mp);
37 39
@@ -47,10 +49,10 @@ void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip);
47void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag, 49void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
48 struct xfs_inode *ip); 50 struct xfs_inode *ip);
49 51
50int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag); 52int xfs_sync_inode_grab(struct xfs_inode *ip);
51int xfs_inode_ag_iterator(struct xfs_mount *mp, 53int xfs_inode_ag_iterator(struct xfs_mount *mp,
52 int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), 54 int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags),
53 int flags, int tag, int write_lock, int *nr_to_scan); 55 int flags);
54 56
55void xfs_inode_shrinker_register(struct xfs_mount *mp); 57void xfs_inode_shrinker_register(struct xfs_mount *mp);
56void xfs_inode_shrinker_unregister(struct xfs_mount *mp); 58void xfs_inode_shrinker_unregister(struct xfs_mount *mp);
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c
index 7bb5092d6ae4..ee2d2adaa438 100644
--- a/fs/xfs/linux-2.6/xfs_sysctl.c
+++ b/fs/xfs/linux-2.6/xfs_sysctl.c
@@ -18,6 +18,7 @@
18#include "xfs.h" 18#include "xfs.h"
19#include <linux/sysctl.h> 19#include <linux/sysctl.h>
20#include <linux/proc_fs.h> 20#include <linux/proc_fs.h>
21#include "xfs_error.h"
21 22
22static struct ctl_table_header *xfs_table_header; 23static struct ctl_table_header *xfs_table_header;
23 24
@@ -36,7 +37,7 @@ xfs_stats_clear_proc_handler(
36 ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos); 37 ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
37 38
38 if (!ret && write && *valp) { 39 if (!ret && write && *valp) {
39 printk("XFS Clearing xfsstats\n"); 40 xfs_notice(NULL, "Clearing xfsstats");
40 for_each_possible_cpu(c) { 41 for_each_possible_cpu(c) {
41 preempt_disable(); 42 preempt_disable();
42 /* save vn_active, it's a universal truth! */ 43 /* save vn_active, it's a universal truth! */
@@ -51,6 +52,26 @@ xfs_stats_clear_proc_handler(
51 52
52 return ret; 53 return ret;
53} 54}
55
56STATIC int
57xfs_panic_mask_proc_handler(
58 ctl_table *ctl,
59 int write,
60 void __user *buffer,
61 size_t *lenp,
62 loff_t *ppos)
63{
64 int ret, *valp = ctl->data;
65
66 ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
67 if (!ret && write) {
68 xfs_panic_mask = *valp;
69#ifdef DEBUG
70 xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES);
71#endif
72 }
73 return ret;
74}
54#endif /* CONFIG_PROC_FS */ 75#endif /* CONFIG_PROC_FS */
55 76
56static ctl_table xfs_table[] = { 77static ctl_table xfs_table[] = {
@@ -77,7 +98,7 @@ static ctl_table xfs_table[] = {
77 .data = &xfs_params.panic_mask.val, 98 .data = &xfs_params.panic_mask.val,
78 .maxlen = sizeof(int), 99 .maxlen = sizeof(int),
79 .mode = 0644, 100 .mode = 0644,
80 .proc_handler = proc_dointvec_minmax, 101 .proc_handler = xfs_panic_mask_proc_handler,
81 .extra1 = &xfs_params.panic_mask.min, 102 .extra1 = &xfs_params.panic_mask.min,
82 .extra2 = &xfs_params.panic_mask.max 103 .extra2 = &xfs_params.panic_mask.max
83 }, 104 },
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index be5dffd282a1..d48b7a579ae1 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -124,7 +124,7 @@ DEFINE_EVENT(xfs_perag_class, name, \
124 unsigned long caller_ip), \ 124 unsigned long caller_ip), \
125 TP_ARGS(mp, agno, refcount, caller_ip)) 125 TP_ARGS(mp, agno, refcount, caller_ip))
126DEFINE_PERAG_REF_EVENT(xfs_perag_get); 126DEFINE_PERAG_REF_EVENT(xfs_perag_get);
127DEFINE_PERAG_REF_EVENT(xfs_perag_get_reclaim); 127DEFINE_PERAG_REF_EVENT(xfs_perag_get_tag);
128DEFINE_PERAG_REF_EVENT(xfs_perag_put); 128DEFINE_PERAG_REF_EVENT(xfs_perag_put);
129DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim); 129DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim);
130DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim); 130DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim);
@@ -325,13 +325,12 @@ DEFINE_BUF_EVENT(xfs_buf_lock);
325DEFINE_BUF_EVENT(xfs_buf_lock_done); 325DEFINE_BUF_EVENT(xfs_buf_lock_done);
326DEFINE_BUF_EVENT(xfs_buf_cond_lock); 326DEFINE_BUF_EVENT(xfs_buf_cond_lock);
327DEFINE_BUF_EVENT(xfs_buf_unlock); 327DEFINE_BUF_EVENT(xfs_buf_unlock);
328DEFINE_BUF_EVENT(xfs_buf_ordered_retry);
329DEFINE_BUF_EVENT(xfs_buf_iowait); 328DEFINE_BUF_EVENT(xfs_buf_iowait);
330DEFINE_BUF_EVENT(xfs_buf_iowait_done); 329DEFINE_BUF_EVENT(xfs_buf_iowait_done);
331DEFINE_BUF_EVENT(xfs_buf_delwri_queue); 330DEFINE_BUF_EVENT(xfs_buf_delwri_queue);
332DEFINE_BUF_EVENT(xfs_buf_delwri_dequeue); 331DEFINE_BUF_EVENT(xfs_buf_delwri_dequeue);
333DEFINE_BUF_EVENT(xfs_buf_delwri_split); 332DEFINE_BUF_EVENT(xfs_buf_delwri_split);
334DEFINE_BUF_EVENT(xfs_buf_get_noaddr); 333DEFINE_BUF_EVENT(xfs_buf_get_uncached);
335DEFINE_BUF_EVENT(xfs_bdstrat_shut); 334DEFINE_BUF_EVENT(xfs_bdstrat_shut);
336DEFINE_BUF_EVENT(xfs_buf_item_relse); 335DEFINE_BUF_EVENT(xfs_buf_item_relse);
337DEFINE_BUF_EVENT(xfs_buf_item_iodone); 336DEFINE_BUF_EVENT(xfs_buf_item_iodone);
@@ -767,8 +766,8 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
767 __field(int, curr_res) 766 __field(int, curr_res)
768 __field(int, unit_res) 767 __field(int, unit_res)
769 __field(unsigned int, flags) 768 __field(unsigned int, flags)
770 __field(void *, reserve_headq) 769 __field(int, reserveq)
771 __field(void *, write_headq) 770 __field(int, writeq)
772 __field(int, grant_reserve_cycle) 771 __field(int, grant_reserve_cycle)
773 __field(int, grant_reserve_bytes) 772 __field(int, grant_reserve_bytes)
774 __field(int, grant_write_cycle) 773 __field(int, grant_write_cycle)
@@ -785,19 +784,21 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
785 __entry->curr_res = tic->t_curr_res; 784 __entry->curr_res = tic->t_curr_res;
786 __entry->unit_res = tic->t_unit_res; 785 __entry->unit_res = tic->t_unit_res;
787 __entry->flags = tic->t_flags; 786 __entry->flags = tic->t_flags;
788 __entry->reserve_headq = log->l_reserve_headq; 787 __entry->reserveq = list_empty(&log->l_reserveq);
789 __entry->write_headq = log->l_write_headq; 788 __entry->writeq = list_empty(&log->l_writeq);
790 __entry->grant_reserve_cycle = log->l_grant_reserve_cycle; 789 xlog_crack_grant_head(&log->l_grant_reserve_head,
791 __entry->grant_reserve_bytes = log->l_grant_reserve_bytes; 790 &__entry->grant_reserve_cycle,
792 __entry->grant_write_cycle = log->l_grant_write_cycle; 791 &__entry->grant_reserve_bytes);
793 __entry->grant_write_bytes = log->l_grant_write_bytes; 792 xlog_crack_grant_head(&log->l_grant_write_head,
793 &__entry->grant_write_cycle,
794 &__entry->grant_write_bytes);
794 __entry->curr_cycle = log->l_curr_cycle; 795 __entry->curr_cycle = log->l_curr_cycle;
795 __entry->curr_block = log->l_curr_block; 796 __entry->curr_block = log->l_curr_block;
796 __entry->tail_lsn = log->l_tail_lsn; 797 __entry->tail_lsn = atomic64_read(&log->l_tail_lsn);
797 ), 798 ),
798 TP_printk("dev %d:%d type %s t_ocnt %u t_cnt %u t_curr_res %u " 799 TP_printk("dev %d:%d type %s t_ocnt %u t_cnt %u t_curr_res %u "
799 "t_unit_res %u t_flags %s reserve_headq 0x%p " 800 "t_unit_res %u t_flags %s reserveq %s "
800 "write_headq 0x%p grant_reserve_cycle %d " 801 "writeq %s grant_reserve_cycle %d "
801 "grant_reserve_bytes %d grant_write_cycle %d " 802 "grant_reserve_bytes %d grant_write_cycle %d "
802 "grant_write_bytes %d curr_cycle %d curr_block %d " 803 "grant_write_bytes %d curr_cycle %d curr_block %d "
803 "tail_cycle %d tail_block %d", 804 "tail_cycle %d tail_block %d",
@@ -808,8 +809,8 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
808 __entry->curr_res, 809 __entry->curr_res,
809 __entry->unit_res, 810 __entry->unit_res,
810 __print_flags(__entry->flags, "|", XLOG_TIC_FLAGS), 811 __print_flags(__entry->flags, "|", XLOG_TIC_FLAGS),
811 __entry->reserve_headq, 812 __entry->reserveq ? "empty" : "active",
812 __entry->write_headq, 813 __entry->writeq ? "empty" : "active",
813 __entry->grant_reserve_cycle, 814 __entry->grant_reserve_cycle,
814 __entry->grant_reserve_bytes, 815 __entry->grant_reserve_bytes,
815 __entry->grant_write_cycle, 816 __entry->grant_write_cycle,
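The switch in this hunk from the raw l_grant_reserve_cycle/bytes fields to xlog_crack_grant_head() reflects the grant heads now being kept as a single 64-bit value so they can be sampled atomically, with the cycle in the upper 32 bits and the byte offset in the lower 32. A small user-space sketch of that pack/unpack; the helper names mirror the kernel ones but this is only an illustration of the encoding, not the kernel implementation:

#include <stdint.h>
#include <stdio.h>

static int64_t assign_grant_head(int cycle, int bytes)
{
	return ((int64_t)cycle << 32) | (uint32_t)bytes;	/* cycle high, bytes low */
}

static void crack_grant_head(int64_t head, int *cycle, int *bytes)
{
	*cycle = head >> 32;
	*bytes = head & 0xffffffff;
}

int main(void)
{
	int cycle, bytes;

	crack_grant_head(assign_grant_head(7, 4096), &cycle, &bytes);
	printf("cycle %d bytes %d\n", cycle, bytes);	/* prints: cycle 7 bytes 4096 */
	return 0;
}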
@@ -836,6 +837,7 @@ DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep1);
836DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake1); 837DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake1);
837DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep2); 838DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep2);
838DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake2); 839DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake2);
840DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake_up);
839DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_enter); 841DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_enter);
840DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_exit); 842DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_exit);
841DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_error); 843DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_error);
@@ -843,6 +845,7 @@ DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep1);
843DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake1); 845DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake1);
844DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep2); 846DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep2);
845DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake2); 847DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake2);
848DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake_up);
846DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter); 849DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter);
847DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit); 850DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit);
848DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub); 851DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub);
@@ -936,10 +939,10 @@ DEFINE_PAGE_EVENT(xfs_writepage);
936DEFINE_PAGE_EVENT(xfs_releasepage); 939DEFINE_PAGE_EVENT(xfs_releasepage);
937DEFINE_PAGE_EVENT(xfs_invalidatepage); 940DEFINE_PAGE_EVENT(xfs_invalidatepage);
938 941
939DECLARE_EVENT_CLASS(xfs_iomap_class, 942DECLARE_EVENT_CLASS(xfs_imap_class,
940 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, 943 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
941 int flags, struct xfs_bmbt_irec *irec), 944 int type, struct xfs_bmbt_irec *irec),
942 TP_ARGS(ip, offset, count, flags, irec), 945 TP_ARGS(ip, offset, count, type, irec),
943 TP_STRUCT__entry( 946 TP_STRUCT__entry(
944 __field(dev_t, dev) 947 __field(dev_t, dev)
945 __field(xfs_ino_t, ino) 948 __field(xfs_ino_t, ino)
@@ -947,7 +950,7 @@ DECLARE_EVENT_CLASS(xfs_iomap_class,
947 __field(loff_t, new_size) 950 __field(loff_t, new_size)
948 __field(loff_t, offset) 951 __field(loff_t, offset)
949 __field(size_t, count) 952 __field(size_t, count)
950 __field(int, flags) 953 __field(int, type)
951 __field(xfs_fileoff_t, startoff) 954 __field(xfs_fileoff_t, startoff)
952 __field(xfs_fsblock_t, startblock) 955 __field(xfs_fsblock_t, startblock)
953 __field(xfs_filblks_t, blockcount) 956 __field(xfs_filblks_t, blockcount)
@@ -959,13 +962,13 @@ DECLARE_EVENT_CLASS(xfs_iomap_class,
959 __entry->new_size = ip->i_new_size; 962 __entry->new_size = ip->i_new_size;
960 __entry->offset = offset; 963 __entry->offset = offset;
961 __entry->count = count; 964 __entry->count = count;
962 __entry->flags = flags; 965 __entry->type = type;
963 __entry->startoff = irec ? irec->br_startoff : 0; 966 __entry->startoff = irec ? irec->br_startoff : 0;
964 __entry->startblock = irec ? irec->br_startblock : 0; 967 __entry->startblock = irec ? irec->br_startblock : 0;
965 __entry->blockcount = irec ? irec->br_blockcount : 0; 968 __entry->blockcount = irec ? irec->br_blockcount : 0;
966 ), 969 ),
967 TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " 970 TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
968 "offset 0x%llx count %zd flags %s " 971 "offset 0x%llx count %zd type %s "
969 "startoff 0x%llx startblock %lld blockcount 0x%llx", 972 "startoff 0x%llx startblock %lld blockcount 0x%llx",
970 MAJOR(__entry->dev), MINOR(__entry->dev), 973 MAJOR(__entry->dev), MINOR(__entry->dev),
971 __entry->ino, 974 __entry->ino,
@@ -973,20 +976,21 @@ DECLARE_EVENT_CLASS(xfs_iomap_class,
973 __entry->new_size, 976 __entry->new_size,
974 __entry->offset, 977 __entry->offset,
975 __entry->count, 978 __entry->count,
976 __print_flags(__entry->flags, "|", BMAPI_FLAGS), 979 __print_symbolic(__entry->type, XFS_IO_TYPES),
977 __entry->startoff, 980 __entry->startoff,
978 (__int64_t)__entry->startblock, 981 (__int64_t)__entry->startblock,
979 __entry->blockcount) 982 __entry->blockcount)
980) 983)
981 984
982#define DEFINE_IOMAP_EVENT(name) \ 985#define DEFINE_IOMAP_EVENT(name) \
983DEFINE_EVENT(xfs_iomap_class, name, \ 986DEFINE_EVENT(xfs_imap_class, name, \
984 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \ 987 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \
985 int flags, struct xfs_bmbt_irec *irec), \ 988 int type, struct xfs_bmbt_irec *irec), \
986 TP_ARGS(ip, offset, count, flags, irec)) 989 TP_ARGS(ip, offset, count, type, irec))
987DEFINE_IOMAP_EVENT(xfs_iomap_enter); 990DEFINE_IOMAP_EVENT(xfs_map_blocks_found);
988DEFINE_IOMAP_EVENT(xfs_iomap_found); 991DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc);
989DEFINE_IOMAP_EVENT(xfs_iomap_alloc); 992DEFINE_IOMAP_EVENT(xfs_get_blocks_found);
993DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc);
990 994
991DECLARE_EVENT_CLASS(xfs_simple_io_class, 995DECLARE_EVENT_CLASS(xfs_simple_io_class,
992 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), 996 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
@@ -1023,6 +1027,7 @@ DEFINE_EVENT(xfs_simple_io_class, name, \
1023 TP_ARGS(ip, offset, count)) 1027 TP_ARGS(ip, offset, count))
1024DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc); 1028DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc);
1025DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert); 1029DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert);
1030DEFINE_SIMPLE_IO_EVENT(xfs_get_blocks_notfound);
1026 1031
1027 1032
1028TRACE_EVENT(xfs_itruncate_start, 1033TRACE_EVENT(xfs_itruncate_start,
@@ -1146,44 +1151,7 @@ TRACE_EVENT(xfs_bunmap,
1146 1151
1147); 1152);
1148 1153
1149#define XFS_BUSY_SYNC \ 1154DECLARE_EVENT_CLASS(xfs_busy_class,
1150 { 0, "async" }, \
1151 { 1, "sync" }
1152
1153TRACE_EVENT(xfs_alloc_busy,
1154 TP_PROTO(struct xfs_trans *trans, xfs_agnumber_t agno,
1155 xfs_agblock_t agbno, xfs_extlen_t len, int sync),
1156 TP_ARGS(trans, agno, agbno, len, sync),
1157 TP_STRUCT__entry(
1158 __field(dev_t, dev)
1159 __field(struct xfs_trans *, tp)
1160 __field(int, tid)
1161 __field(xfs_agnumber_t, agno)
1162 __field(xfs_agblock_t, agbno)
1163 __field(xfs_extlen_t, len)
1164 __field(int, sync)
1165 ),
1166 TP_fast_assign(
1167 __entry->dev = trans->t_mountp->m_super->s_dev;
1168 __entry->tp = trans;
1169 __entry->tid = trans->t_ticket->t_tid;
1170 __entry->agno = agno;
1171 __entry->agbno = agbno;
1172 __entry->len = len;
1173 __entry->sync = sync;
1174 ),
1175 TP_printk("dev %d:%d trans 0x%p tid 0x%x agno %u agbno %u len %u %s",
1176 MAJOR(__entry->dev), MINOR(__entry->dev),
1177 __entry->tp,
1178 __entry->tid,
1179 __entry->agno,
1180 __entry->agbno,
1181 __entry->len,
1182 __print_symbolic(__entry->sync, XFS_BUSY_SYNC))
1183
1184);
1185
1186TRACE_EVENT(xfs_alloc_unbusy,
1187 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, 1155 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
1188 xfs_agblock_t agbno, xfs_extlen_t len), 1156 xfs_agblock_t agbno, xfs_extlen_t len),
1189 TP_ARGS(mp, agno, agbno, len), 1157 TP_ARGS(mp, agno, agbno, len),
@@ -1205,35 +1173,45 @@ TRACE_EVENT(xfs_alloc_unbusy,
1205 __entry->agbno, 1173 __entry->agbno,
1206 __entry->len) 1174 __entry->len)
1207); 1175);
1208 1176#define DEFINE_BUSY_EVENT(name) \
1209#define XFS_BUSY_STATES \ 1177DEFINE_EVENT(xfs_busy_class, name, \
1210 { 0, "missing" }, \ 1178 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
1211 { 1, "found" } 1179 xfs_agblock_t agbno, xfs_extlen_t len), \
1212 1180 TP_ARGS(mp, agno, agbno, len))
1213TRACE_EVENT(xfs_alloc_busysearch, 1181DEFINE_BUSY_EVENT(xfs_alloc_busy);
1182DEFINE_BUSY_EVENT(xfs_alloc_busy_enomem);
1183DEFINE_BUSY_EVENT(xfs_alloc_busy_force);
1184DEFINE_BUSY_EVENT(xfs_alloc_busy_reuse);
1185DEFINE_BUSY_EVENT(xfs_alloc_busy_clear);
1186
1187TRACE_EVENT(xfs_alloc_busy_trim,
1214 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, 1188 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
1215 xfs_agblock_t agbno, xfs_extlen_t len, int found), 1189 xfs_agblock_t agbno, xfs_extlen_t len,
1216 TP_ARGS(mp, agno, agbno, len, found), 1190 xfs_agblock_t tbno, xfs_extlen_t tlen),
1191 TP_ARGS(mp, agno, agbno, len, tbno, tlen),
1217 TP_STRUCT__entry( 1192 TP_STRUCT__entry(
1218 __field(dev_t, dev) 1193 __field(dev_t, dev)
1219 __field(xfs_agnumber_t, agno) 1194 __field(xfs_agnumber_t, agno)
1220 __field(xfs_agblock_t, agbno) 1195 __field(xfs_agblock_t, agbno)
1221 __field(xfs_extlen_t, len) 1196 __field(xfs_extlen_t, len)
1222 __field(int, found) 1197 __field(xfs_agblock_t, tbno)
1198 __field(xfs_extlen_t, tlen)
1223 ), 1199 ),
1224 TP_fast_assign( 1200 TP_fast_assign(
1225 __entry->dev = mp->m_super->s_dev; 1201 __entry->dev = mp->m_super->s_dev;
1226 __entry->agno = agno; 1202 __entry->agno = agno;
1227 __entry->agbno = agbno; 1203 __entry->agbno = agbno;
1228 __entry->len = len; 1204 __entry->len = len;
1229 __entry->found = found; 1205 __entry->tbno = tbno;
1206 __entry->tlen = tlen;
1230 ), 1207 ),
1231 TP_printk("dev %d:%d agno %u agbno %u len %u %s", 1208 TP_printk("dev %d:%d agno %u agbno %u len %u tbno %u tlen %u",
1232 MAJOR(__entry->dev), MINOR(__entry->dev), 1209 MAJOR(__entry->dev), MINOR(__entry->dev),
1233 __entry->agno, 1210 __entry->agno,
1234 __entry->agbno, 1211 __entry->agbno,
1235 __entry->len, 1212 __entry->len,
1236 __print_symbolic(__entry->found, XFS_BUSY_STATES)) 1213 __entry->tbno,
1214 __entry->tlen)
1237); 1215);
1238 1216
1239TRACE_EVENT(xfs_trans_commit_lsn, 1217TRACE_EVENT(xfs_trans_commit_lsn,
@@ -1413,7 +1391,7 @@ DECLARE_EVENT_CLASS(xfs_alloc_class,
1413 __entry->wasfromfl, 1391 __entry->wasfromfl,
1414 __entry->isfl, 1392 __entry->isfl,
1415 __entry->userdata, 1393 __entry->userdata,
1416 __entry->firstblock) 1394 (unsigned long long)__entry->firstblock)
1417) 1395)
1418 1396
1419#define DEFINE_ALLOC_EVENT(name) \ 1397#define DEFINE_ALLOC_EVENT(name) \
@@ -1421,17 +1399,21 @@ DEFINE_EVENT(xfs_alloc_class, name, \
1421 TP_PROTO(struct xfs_alloc_arg *args), \ 1399 TP_PROTO(struct xfs_alloc_arg *args), \
1422 TP_ARGS(args)) 1400 TP_ARGS(args))
1423DEFINE_ALLOC_EVENT(xfs_alloc_exact_done); 1401DEFINE_ALLOC_EVENT(xfs_alloc_exact_done);
1402DEFINE_ALLOC_EVENT(xfs_alloc_exact_notfound);
1424DEFINE_ALLOC_EVENT(xfs_alloc_exact_error); 1403DEFINE_ALLOC_EVENT(xfs_alloc_exact_error);
1425DEFINE_ALLOC_EVENT(xfs_alloc_near_nominleft); 1404DEFINE_ALLOC_EVENT(xfs_alloc_near_nominleft);
1426DEFINE_ALLOC_EVENT(xfs_alloc_near_first); 1405DEFINE_ALLOC_EVENT(xfs_alloc_near_first);
1427DEFINE_ALLOC_EVENT(xfs_alloc_near_greater); 1406DEFINE_ALLOC_EVENT(xfs_alloc_near_greater);
1428DEFINE_ALLOC_EVENT(xfs_alloc_near_lesser); 1407DEFINE_ALLOC_EVENT(xfs_alloc_near_lesser);
1429DEFINE_ALLOC_EVENT(xfs_alloc_near_error); 1408DEFINE_ALLOC_EVENT(xfs_alloc_near_error);
1409DEFINE_ALLOC_EVENT(xfs_alloc_near_noentry);
1410DEFINE_ALLOC_EVENT(xfs_alloc_near_busy);
1430DEFINE_ALLOC_EVENT(xfs_alloc_size_neither); 1411DEFINE_ALLOC_EVENT(xfs_alloc_size_neither);
1431DEFINE_ALLOC_EVENT(xfs_alloc_size_noentry); 1412DEFINE_ALLOC_EVENT(xfs_alloc_size_noentry);
1432DEFINE_ALLOC_EVENT(xfs_alloc_size_nominleft); 1413DEFINE_ALLOC_EVENT(xfs_alloc_size_nominleft);
1433DEFINE_ALLOC_EVENT(xfs_alloc_size_done); 1414DEFINE_ALLOC_EVENT(xfs_alloc_size_done);
1434DEFINE_ALLOC_EVENT(xfs_alloc_size_error); 1415DEFINE_ALLOC_EVENT(xfs_alloc_size_error);
1416DEFINE_ALLOC_EVENT(xfs_alloc_size_busy);
1435DEFINE_ALLOC_EVENT(xfs_alloc_small_freelist); 1417DEFINE_ALLOC_EVENT(xfs_alloc_small_freelist);
1436DEFINE_ALLOC_EVENT(xfs_alloc_small_notenough); 1418DEFINE_ALLOC_EVENT(xfs_alloc_small_notenough);
1437DEFINE_ALLOC_EVENT(xfs_alloc_small_done); 1419DEFINE_ALLOC_EVENT(xfs_alloc_small_done);
@@ -1753,6 +1735,39 @@ DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_recover);
1753DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_cancel); 1735DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_cancel);
1754DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_skip); 1736DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_skip);
1755 1737
1738DECLARE_EVENT_CLASS(xfs_discard_class,
1739 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
1740 xfs_agblock_t agbno, xfs_extlen_t len),
1741 TP_ARGS(mp, agno, agbno, len),
1742 TP_STRUCT__entry(
1743 __field(dev_t, dev)
1744 __field(xfs_agnumber_t, agno)
1745 __field(xfs_agblock_t, agbno)
1746 __field(xfs_extlen_t, len)
1747 ),
1748 TP_fast_assign(
1749 __entry->dev = mp->m_super->s_dev;
1750 __entry->agno = agno;
1751 __entry->agbno = agbno;
1752 __entry->len = len;
1753 ),
1754 TP_printk("dev %d:%d agno %u agbno %u len %u\n",
1755 MAJOR(__entry->dev), MINOR(__entry->dev),
1756 __entry->agno,
1757 __entry->agbno,
1758 __entry->len)
1759)
1760
1761#define DEFINE_DISCARD_EVENT(name) \
1762DEFINE_EVENT(xfs_discard_class, name, \
1763 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
1764 xfs_agblock_t agbno, xfs_extlen_t len), \
1765 TP_ARGS(mp, agno, agbno, len))
1766DEFINE_DISCARD_EVENT(xfs_discard_extent);
1767DEFINE_DISCARD_EVENT(xfs_discard_toosmall);
1768DEFINE_DISCARD_EVENT(xfs_discard_exclude);
1769DEFINE_DISCARD_EVENT(xfs_discard_busy);
1770
1756#endif /* _TRACE_XFS_H */ 1771#endif /* _TRACE_XFS_H */
1757 1772
1758#undef TRACE_INCLUDE_PATH 1773#undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/linux-2.6/xfs_version.h b/fs/xfs/linux-2.6/xfs_version.h
deleted file mode 100644
index f8d279d7563a..000000000000
--- a/fs/xfs/linux-2.6/xfs_version.h
+++ /dev/null
@@ -1,29 +0,0 @@
1/*
2 * Copyright (c) 2001-2002,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_VERSION_H__
19#define __XFS_VERSION_H__
20
21/*
22 * Dummy file that can contain a timestamp to put into the
23 * XFS init string, to help users keep track of what they're
24 * running
25 */
26
27#define XFS_VERSION_STRING "SGI XFS"
28
29#endif /* __XFS_VERSION_H__ */