diff options
author | David Chinner <dgc@sgi.com> | 2005-09-04 18:33:35 -0400 |
---|---|---|
committer | Nathan Scott <nathans@sgi.com> | 2005-09-04 18:33:35 -0400 |
commit | 2f926587512869ebf6bc820bd5f030e127aae774 (patch) | |
tree | f5bfb33cea1c6d7d2ababf5f800e63e2ec89a65a /fs | |
parent | ba403ab43e896c57f32995ccba9a6bd6ec8dd1b9 (diff) |
[XFS] Fix racy access to pb_flags. pagebuf_rele() modified pb_flags after
the pagebuf had been unlocked if the buffer was delwri. At high load, this
could result in a race when the superblock was being synced that would
result the flags being incorrect and the iodone functions being executed
incorrectly. This then leads to iclog callback failures or AIL list
corruptions resulting in filesystem shutdowns.
SGI-PV: 923981
SGI-Modid: xfs-linux:xfs-kern:23616a
Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Nathan Scott <nathans@sgi.com>
Diffstat (limited to 'fs')
-rw-r--r-- | fs/xfs/linux-2.6/xfs_buf.c | 66 | ||||
-rw-r--r-- | fs/xfs/linux-2.6/xfs_buf.h | 3 |
2 files changed, 51 insertions, 18 deletions
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index f43689754da..e6340906342 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c | |||
@@ -590,8 +590,10 @@ found: | |||
590 | PB_SET_OWNER(pb); | 590 | PB_SET_OWNER(pb); |
591 | } | 591 | } |
592 | 592 | ||
593 | if (pb->pb_flags & PBF_STALE) | 593 | if (pb->pb_flags & PBF_STALE) { |
594 | ASSERT((pb->pb_flags & _PBF_DELWRI_Q) == 0); | ||
594 | pb->pb_flags &= PBF_MAPPED; | 595 | pb->pb_flags &= PBF_MAPPED; |
596 | } | ||
595 | PB_TRACE(pb, "got_lock", 0); | 597 | PB_TRACE(pb, "got_lock", 0); |
596 | XFS_STATS_INC(pb_get_locked); | 598 | XFS_STATS_INC(pb_get_locked); |
597 | return (pb); | 599 | return (pb); |
@@ -872,6 +874,17 @@ pagebuf_rele( | |||
872 | 874 | ||
873 | PB_TRACE(pb, "rele", pb->pb_relse); | 875 | PB_TRACE(pb, "rele", pb->pb_relse); |
874 | 876 | ||
877 | /* | ||
878 | * pagebuf_lookup buffers are not hashed, not delayed write, | ||
879 | * and don't have their own release routines. Special case. | ||
880 | */ | ||
881 | if (unlikely(!hash)) { | ||
882 | ASSERT(!pb->pb_relse); | ||
883 | if (atomic_dec_and_test(&pb->pb_hold)) | ||
884 | xfs_buf_free(pb); | ||
885 | return; | ||
886 | } | ||
887 | |||
875 | if (atomic_dec_and_lock(&pb->pb_hold, &hash->bh_lock)) { | 888 | if (atomic_dec_and_lock(&pb->pb_hold, &hash->bh_lock)) { |
876 | int do_free = 1; | 889 | int do_free = 1; |
877 | 890 | ||
@@ -883,22 +896,23 @@ pagebuf_rele( | |||
883 | do_free = 0; | 896 | do_free = 0; |
884 | } | 897 | } |
885 | 898 | ||
886 | if (pb->pb_flags & PBF_DELWRI) { | 899 | if (pb->pb_flags & PBF_FS_MANAGED) { |
887 | pb->pb_flags |= PBF_ASYNC; | ||
888 | atomic_inc(&pb->pb_hold); | ||
889 | pagebuf_delwri_queue(pb, 0); | ||
890 | do_free = 0; | ||
891 | } else if (pb->pb_flags & PBF_FS_MANAGED) { | ||
892 | do_free = 0; | 900 | do_free = 0; |
893 | } | 901 | } |
894 | 902 | ||
895 | if (do_free) { | 903 | if (do_free) { |
904 | ASSERT((pb->pb_flags & (PBF_DELWRI|_PBF_DELWRI_Q)) == 0); | ||
896 | list_del_init(&pb->pb_hash_list); | 905 | list_del_init(&pb->pb_hash_list); |
897 | spin_unlock(&hash->bh_lock); | 906 | spin_unlock(&hash->bh_lock); |
898 | pagebuf_free(pb); | 907 | pagebuf_free(pb); |
899 | } else { | 908 | } else { |
900 | spin_unlock(&hash->bh_lock); | 909 | spin_unlock(&hash->bh_lock); |
901 | } | 910 | } |
911 | } else { | ||
912 | /* | ||
913 | * Catch reference count leaks | ||
914 | */ | ||
915 | ASSERT(atomic_read(&pb->pb_hold) >= 0); | ||
902 | } | 916 | } |
903 | } | 917 | } |
904 | 918 | ||
@@ -976,13 +990,24 @@ pagebuf_lock( | |||
976 | * pagebuf_unlock | 990 | * pagebuf_unlock |
977 | * | 991 | * |
978 | * pagebuf_unlock releases the lock on the buffer object created by | 992 | * pagebuf_unlock releases the lock on the buffer object created by |
979 | * pagebuf_lock or pagebuf_cond_lock (not any | 993 | * pagebuf_lock or pagebuf_cond_lock (not any pinning of underlying pages |
980 | * pinning of underlying pages created by pagebuf_pin). | 994 | * created by pagebuf_pin). |
995 | * | ||
996 | * If the buffer is marked delwri but is not queued, do so before we | ||
997 | * unlock the buffer as we need to set flags correctly. We also need to | ||
998 | * take a reference for the delwri queue because the unlocker is going to | ||
999 | * drop their's and they don't know we just queued it. | ||
981 | */ | 1000 | */ |
982 | void | 1001 | void |
983 | pagebuf_unlock( /* unlock buffer */ | 1002 | pagebuf_unlock( /* unlock buffer */ |
984 | xfs_buf_t *pb) /* buffer to unlock */ | 1003 | xfs_buf_t *pb) /* buffer to unlock */ |
985 | { | 1004 | { |
1005 | if ((pb->pb_flags & (PBF_DELWRI|_PBF_DELWRI_Q)) == PBF_DELWRI) { | ||
1006 | atomic_inc(&pb->pb_hold); | ||
1007 | pb->pb_flags |= PBF_ASYNC; | ||
1008 | pagebuf_delwri_queue(pb, 0); | ||
1009 | } | ||
1010 | |||
986 | PB_CLEAR_OWNER(pb); | 1011 | PB_CLEAR_OWNER(pb); |
987 | up(&pb->pb_sema); | 1012 | up(&pb->pb_sema); |
988 | PB_TRACE(pb, "unlock", 0); | 1013 | PB_TRACE(pb, "unlock", 0); |
@@ -1486,6 +1511,11 @@ again: | |||
1486 | ASSERT(btp == bp->pb_target); | 1511 | ASSERT(btp == bp->pb_target); |
1487 | if (!(bp->pb_flags & PBF_FS_MANAGED)) { | 1512 | if (!(bp->pb_flags & PBF_FS_MANAGED)) { |
1488 | spin_unlock(&hash->bh_lock); | 1513 | spin_unlock(&hash->bh_lock); |
1514 | /* | ||
1515 | * Catch superblock reference count leaks | ||
1516 | * immediately | ||
1517 | */ | ||
1518 | BUG_ON(bp->pb_bn == 0); | ||
1489 | delay(100); | 1519 | delay(100); |
1490 | goto again; | 1520 | goto again; |
1491 | } | 1521 | } |
@@ -1661,17 +1691,20 @@ pagebuf_delwri_queue( | |||
1661 | int unlock) | 1691 | int unlock) |
1662 | { | 1692 | { |
1663 | PB_TRACE(pb, "delwri_q", (long)unlock); | 1693 | PB_TRACE(pb, "delwri_q", (long)unlock); |
1664 | ASSERT(pb->pb_flags & PBF_DELWRI); | 1694 | ASSERT((pb->pb_flags & (PBF_DELWRI|PBF_ASYNC)) == |
1695 | (PBF_DELWRI|PBF_ASYNC)); | ||
1665 | 1696 | ||
1666 | spin_lock(&pbd_delwrite_lock); | 1697 | spin_lock(&pbd_delwrite_lock); |
1667 | /* If already in the queue, dequeue and place at tail */ | 1698 | /* If already in the queue, dequeue and place at tail */ |
1668 | if (!list_empty(&pb->pb_list)) { | 1699 | if (!list_empty(&pb->pb_list)) { |
1700 | ASSERT(pb->pb_flags & _PBF_DELWRI_Q); | ||
1669 | if (unlock) { | 1701 | if (unlock) { |
1670 | atomic_dec(&pb->pb_hold); | 1702 | atomic_dec(&pb->pb_hold); |
1671 | } | 1703 | } |
1672 | list_del(&pb->pb_list); | 1704 | list_del(&pb->pb_list); |
1673 | } | 1705 | } |
1674 | 1706 | ||
1707 | pb->pb_flags |= _PBF_DELWRI_Q; | ||
1675 | list_add_tail(&pb->pb_list, &pbd_delwrite_queue); | 1708 | list_add_tail(&pb->pb_list, &pbd_delwrite_queue); |
1676 | pb->pb_queuetime = jiffies; | 1709 | pb->pb_queuetime = jiffies; |
1677 | spin_unlock(&pbd_delwrite_lock); | 1710 | spin_unlock(&pbd_delwrite_lock); |
@@ -1688,10 +1721,11 @@ pagebuf_delwri_dequeue( | |||
1688 | 1721 | ||
1689 | spin_lock(&pbd_delwrite_lock); | 1722 | spin_lock(&pbd_delwrite_lock); |
1690 | if ((pb->pb_flags & PBF_DELWRI) && !list_empty(&pb->pb_list)) { | 1723 | if ((pb->pb_flags & PBF_DELWRI) && !list_empty(&pb->pb_list)) { |
1724 | ASSERT(pb->pb_flags & _PBF_DELWRI_Q); | ||
1691 | list_del_init(&pb->pb_list); | 1725 | list_del_init(&pb->pb_list); |
1692 | dequeued = 1; | 1726 | dequeued = 1; |
1693 | } | 1727 | } |
1694 | pb->pb_flags &= ~PBF_DELWRI; | 1728 | pb->pb_flags &= ~(PBF_DELWRI|_PBF_DELWRI_Q); |
1695 | spin_unlock(&pbd_delwrite_lock); | 1729 | spin_unlock(&pbd_delwrite_lock); |
1696 | 1730 | ||
1697 | if (dequeued) | 1731 | if (dequeued) |
@@ -1770,7 +1804,7 @@ xfsbufd( | |||
1770 | break; | 1804 | break; |
1771 | } | 1805 | } |
1772 | 1806 | ||
1773 | pb->pb_flags &= ~PBF_DELWRI; | 1807 | pb->pb_flags &= ~(PBF_DELWRI|_PBF_DELWRI_Q); |
1774 | pb->pb_flags |= PBF_WRITE; | 1808 | pb->pb_flags |= PBF_WRITE; |
1775 | list_move(&pb->pb_list, &tmp); | 1809 | list_move(&pb->pb_list, &tmp); |
1776 | } | 1810 | } |
@@ -1820,15 +1854,13 @@ xfs_flush_buftarg( | |||
1820 | if (pb->pb_target != target) | 1854 | if (pb->pb_target != target) |
1821 | continue; | 1855 | continue; |
1822 | 1856 | ||
1823 | ASSERT(pb->pb_flags & PBF_DELWRI); | 1857 | ASSERT(pb->pb_flags & (PBF_DELWRI|_PBF_DELWRI_Q)); |
1824 | PB_TRACE(pb, "walkq2", (long)pagebuf_ispin(pb)); | 1858 | PB_TRACE(pb, "walkq2", (long)pagebuf_ispin(pb)); |
1825 | if (pagebuf_ispin(pb)) { | 1859 | if (pagebuf_ispin(pb)) { |
1826 | pincount++; | 1860 | pincount++; |
1827 | continue; | 1861 | continue; |
1828 | } | 1862 | } |
1829 | 1863 | ||
1830 | pb->pb_flags &= ~PBF_DELWRI; | ||
1831 | pb->pb_flags |= PBF_WRITE; | ||
1832 | list_move(&pb->pb_list, &tmp); | 1864 | list_move(&pb->pb_list, &tmp); |
1833 | } | 1865 | } |
1834 | spin_unlock(&pbd_delwrite_lock); | 1866 | spin_unlock(&pbd_delwrite_lock); |
@@ -1837,12 +1869,14 @@ xfs_flush_buftarg( | |||
1837 | * Dropped the delayed write list lock, now walk the temporary list | 1869 | * Dropped the delayed write list lock, now walk the temporary list |
1838 | */ | 1870 | */ |
1839 | list_for_each_entry_safe(pb, n, &tmp, pb_list) { | 1871 | list_for_each_entry_safe(pb, n, &tmp, pb_list) { |
1872 | pagebuf_lock(pb); | ||
1873 | pb->pb_flags &= ~(PBF_DELWRI|_PBF_DELWRI_Q); | ||
1874 | pb->pb_flags |= PBF_WRITE; | ||
1840 | if (wait) | 1875 | if (wait) |
1841 | pb->pb_flags &= ~PBF_ASYNC; | 1876 | pb->pb_flags &= ~PBF_ASYNC; |
1842 | else | 1877 | else |
1843 | list_del_init(&pb->pb_list); | 1878 | list_del_init(&pb->pb_list); |
1844 | 1879 | ||
1845 | pagebuf_lock(pb); | ||
1846 | pagebuf_iostrategy(pb); | 1880 | pagebuf_iostrategy(pb); |
1847 | } | 1881 | } |
1848 | 1882 | ||
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index c322e9d71f3..4b7fe3b5e46 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h | |||
@@ -89,6 +89,7 @@ typedef enum page_buf_flags_e { /* pb_flags values */ | |||
89 | _PBF_PAGE_CACHE = (1 << 17),/* backed by pagecache */ | 89 | _PBF_PAGE_CACHE = (1 << 17),/* backed by pagecache */ |
90 | _PBF_KMEM_ALLOC = (1 << 18),/* backed by kmem_alloc() */ | 90 | _PBF_KMEM_ALLOC = (1 << 18),/* backed by kmem_alloc() */ |
91 | _PBF_RUN_QUEUES = (1 << 19),/* run block device task queue */ | 91 | _PBF_RUN_QUEUES = (1 << 19),/* run block device task queue */ |
92 | _PBF_DELWRI_Q = (1 << 21), /* buffer on delwri queue */ | ||
92 | } page_buf_flags_t; | 93 | } page_buf_flags_t; |
93 | 94 | ||
94 | #define PBF_UPDATE (PBF_READ | PBF_WRITE) | 95 | #define PBF_UPDATE (PBF_READ | PBF_WRITE) |
@@ -337,8 +338,6 @@ extern void pagebuf_trace( | |||
337 | 338 | ||
338 | 339 | ||
339 | 340 | ||
340 | |||
341 | |||
342 | /* These are just for xfs_syncsub... it sets an internal variable | 341 | /* These are just for xfs_syncsub... it sets an internal variable |
343 | * then passes it to VOP_FLUSH_PAGES or adds the flags to a newly gotten buf_t | 342 | * then passes it to VOP_FLUSH_PAGES or adds the flags to a newly gotten buf_t |
344 | */ | 343 | */ |