aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDavid Chinner <dgc@sgi.com>2005-09-04 18:33:35 -0400
committerNathan Scott <nathans@sgi.com>2005-09-04 18:33:35 -0400
commit2f926587512869ebf6bc820bd5f030e127aae774 (patch)
treef5bfb33cea1c6d7d2ababf5f800e63e2ec89a65a
parentba403ab43e896c57f32995ccba9a6bd6ec8dd1b9 (diff)
[XFS] Fix racy access to pb_flags. pagebuf_rele() modified pb_flags after
the pagebuf had been unlocked if the buffer was delwri. At high load, this could result in a race when the superblock was being synced that would result the flags being incorrect and the iodone functions being executed incorrectly. This then leads to iclog callback failures or AIL list corruptions resulting in filesystem shutdowns. SGI-PV: 923981 SGI-Modid: xfs-linux:xfs-kern:23616a Signed-off-by: David Chinner <dgc@sgi.com> Signed-off-by: Nathan Scott <nathans@sgi.com>
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.c66
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.h3
2 files changed, 51 insertions, 18 deletions
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index f43689754dae..e6340906342c 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -590,8 +590,10 @@ found:
590 PB_SET_OWNER(pb); 590 PB_SET_OWNER(pb);
591 } 591 }
592 592
593 if (pb->pb_flags & PBF_STALE) 593 if (pb->pb_flags & PBF_STALE) {
594 ASSERT((pb->pb_flags & _PBF_DELWRI_Q) == 0);
594 pb->pb_flags &= PBF_MAPPED; 595 pb->pb_flags &= PBF_MAPPED;
596 }
595 PB_TRACE(pb, "got_lock", 0); 597 PB_TRACE(pb, "got_lock", 0);
596 XFS_STATS_INC(pb_get_locked); 598 XFS_STATS_INC(pb_get_locked);
597 return (pb); 599 return (pb);
@@ -872,6 +874,17 @@ pagebuf_rele(
872 874
873 PB_TRACE(pb, "rele", pb->pb_relse); 875 PB_TRACE(pb, "rele", pb->pb_relse);
874 876
877 /*
878 * pagebuf_lookup buffers are not hashed, not delayed write,
879 * and don't have their own release routines. Special case.
880 */
881 if (unlikely(!hash)) {
882 ASSERT(!pb->pb_relse);
883 if (atomic_dec_and_test(&pb->pb_hold))
884 xfs_buf_free(pb);
885 return;
886 }
887
875 if (atomic_dec_and_lock(&pb->pb_hold, &hash->bh_lock)) { 888 if (atomic_dec_and_lock(&pb->pb_hold, &hash->bh_lock)) {
876 int do_free = 1; 889 int do_free = 1;
877 890
@@ -883,22 +896,23 @@ pagebuf_rele(
883 do_free = 0; 896 do_free = 0;
884 } 897 }
885 898
886 if (pb->pb_flags & PBF_DELWRI) { 899 if (pb->pb_flags & PBF_FS_MANAGED) {
887 pb->pb_flags |= PBF_ASYNC;
888 atomic_inc(&pb->pb_hold);
889 pagebuf_delwri_queue(pb, 0);
890 do_free = 0;
891 } else if (pb->pb_flags & PBF_FS_MANAGED) {
892 do_free = 0; 900 do_free = 0;
893 } 901 }
894 902
895 if (do_free) { 903 if (do_free) {
904 ASSERT((pb->pb_flags & (PBF_DELWRI|_PBF_DELWRI_Q)) == 0);
896 list_del_init(&pb->pb_hash_list); 905 list_del_init(&pb->pb_hash_list);
897 spin_unlock(&hash->bh_lock); 906 spin_unlock(&hash->bh_lock);
898 pagebuf_free(pb); 907 pagebuf_free(pb);
899 } else { 908 } else {
900 spin_unlock(&hash->bh_lock); 909 spin_unlock(&hash->bh_lock);
901 } 910 }
911 } else {
912 /*
913 * Catch reference count leaks
914 */
915 ASSERT(atomic_read(&pb->pb_hold) >= 0);
902 } 916 }
903} 917}
904 918
@@ -976,13 +990,24 @@ pagebuf_lock(
976 * pagebuf_unlock 990 * pagebuf_unlock
977 * 991 *
978 * pagebuf_unlock releases the lock on the buffer object created by 992 * pagebuf_unlock releases the lock on the buffer object created by
979 * pagebuf_lock or pagebuf_cond_lock (not any 993 * pagebuf_lock or pagebuf_cond_lock (not any pinning of underlying pages
980 * pinning of underlying pages created by pagebuf_pin). 994 * created by pagebuf_pin).
995 *
996 * If the buffer is marked delwri but is not queued, do so before we
997 * unlock the buffer as we need to set flags correctly. We also need to
998 * take a reference for the delwri queue because the unlocker is going to
999 * drop their's and they don't know we just queued it.
981 */ 1000 */
982void 1001void
983pagebuf_unlock( /* unlock buffer */ 1002pagebuf_unlock( /* unlock buffer */
984 xfs_buf_t *pb) /* buffer to unlock */ 1003 xfs_buf_t *pb) /* buffer to unlock */
985{ 1004{
1005 if ((pb->pb_flags & (PBF_DELWRI|_PBF_DELWRI_Q)) == PBF_DELWRI) {
1006 atomic_inc(&pb->pb_hold);
1007 pb->pb_flags |= PBF_ASYNC;
1008 pagebuf_delwri_queue(pb, 0);
1009 }
1010
986 PB_CLEAR_OWNER(pb); 1011 PB_CLEAR_OWNER(pb);
987 up(&pb->pb_sema); 1012 up(&pb->pb_sema);
988 PB_TRACE(pb, "unlock", 0); 1013 PB_TRACE(pb, "unlock", 0);
@@ -1486,6 +1511,11 @@ again:
1486 ASSERT(btp == bp->pb_target); 1511 ASSERT(btp == bp->pb_target);
1487 if (!(bp->pb_flags & PBF_FS_MANAGED)) { 1512 if (!(bp->pb_flags & PBF_FS_MANAGED)) {
1488 spin_unlock(&hash->bh_lock); 1513 spin_unlock(&hash->bh_lock);
1514 /*
1515 * Catch superblock reference count leaks
1516 * immediately
1517 */
1518 BUG_ON(bp->pb_bn == 0);
1489 delay(100); 1519 delay(100);
1490 goto again; 1520 goto again;
1491 } 1521 }
@@ -1661,17 +1691,20 @@ pagebuf_delwri_queue(
1661 int unlock) 1691 int unlock)
1662{ 1692{
1663 PB_TRACE(pb, "delwri_q", (long)unlock); 1693 PB_TRACE(pb, "delwri_q", (long)unlock);
1664 ASSERT(pb->pb_flags & PBF_DELWRI); 1694 ASSERT((pb->pb_flags & (PBF_DELWRI|PBF_ASYNC)) ==
1695 (PBF_DELWRI|PBF_ASYNC));
1665 1696
1666 spin_lock(&pbd_delwrite_lock); 1697 spin_lock(&pbd_delwrite_lock);
1667 /* If already in the queue, dequeue and place at tail */ 1698 /* If already in the queue, dequeue and place at tail */
1668 if (!list_empty(&pb->pb_list)) { 1699 if (!list_empty(&pb->pb_list)) {
1700 ASSERT(pb->pb_flags & _PBF_DELWRI_Q);
1669 if (unlock) { 1701 if (unlock) {
1670 atomic_dec(&pb->pb_hold); 1702 atomic_dec(&pb->pb_hold);
1671 } 1703 }
1672 list_del(&pb->pb_list); 1704 list_del(&pb->pb_list);
1673 } 1705 }
1674 1706
1707 pb->pb_flags |= _PBF_DELWRI_Q;
1675 list_add_tail(&pb->pb_list, &pbd_delwrite_queue); 1708 list_add_tail(&pb->pb_list, &pbd_delwrite_queue);
1676 pb->pb_queuetime = jiffies; 1709 pb->pb_queuetime = jiffies;
1677 spin_unlock(&pbd_delwrite_lock); 1710 spin_unlock(&pbd_delwrite_lock);
@@ -1688,10 +1721,11 @@ pagebuf_delwri_dequeue(
1688 1721
1689 spin_lock(&pbd_delwrite_lock); 1722 spin_lock(&pbd_delwrite_lock);
1690 if ((pb->pb_flags & PBF_DELWRI) && !list_empty(&pb->pb_list)) { 1723 if ((pb->pb_flags & PBF_DELWRI) && !list_empty(&pb->pb_list)) {
1724 ASSERT(pb->pb_flags & _PBF_DELWRI_Q);
1691 list_del_init(&pb->pb_list); 1725 list_del_init(&pb->pb_list);
1692 dequeued = 1; 1726 dequeued = 1;
1693 } 1727 }
1694 pb->pb_flags &= ~PBF_DELWRI; 1728 pb->pb_flags &= ~(PBF_DELWRI|_PBF_DELWRI_Q);
1695 spin_unlock(&pbd_delwrite_lock); 1729 spin_unlock(&pbd_delwrite_lock);
1696 1730
1697 if (dequeued) 1731 if (dequeued)
@@ -1770,7 +1804,7 @@ xfsbufd(
1770 break; 1804 break;
1771 } 1805 }
1772 1806
1773 pb->pb_flags &= ~PBF_DELWRI; 1807 pb->pb_flags &= ~(PBF_DELWRI|_PBF_DELWRI_Q);
1774 pb->pb_flags |= PBF_WRITE; 1808 pb->pb_flags |= PBF_WRITE;
1775 list_move(&pb->pb_list, &tmp); 1809 list_move(&pb->pb_list, &tmp);
1776 } 1810 }
@@ -1820,15 +1854,13 @@ xfs_flush_buftarg(
1820 if (pb->pb_target != target) 1854 if (pb->pb_target != target)
1821 continue; 1855 continue;
1822 1856
1823 ASSERT(pb->pb_flags & PBF_DELWRI); 1857 ASSERT(pb->pb_flags & (PBF_DELWRI|_PBF_DELWRI_Q));
1824 PB_TRACE(pb, "walkq2", (long)pagebuf_ispin(pb)); 1858 PB_TRACE(pb, "walkq2", (long)pagebuf_ispin(pb));
1825 if (pagebuf_ispin(pb)) { 1859 if (pagebuf_ispin(pb)) {
1826 pincount++; 1860 pincount++;
1827 continue; 1861 continue;
1828 } 1862 }
1829 1863
1830 pb->pb_flags &= ~PBF_DELWRI;
1831 pb->pb_flags |= PBF_WRITE;
1832 list_move(&pb->pb_list, &tmp); 1864 list_move(&pb->pb_list, &tmp);
1833 } 1865 }
1834 spin_unlock(&pbd_delwrite_lock); 1866 spin_unlock(&pbd_delwrite_lock);
@@ -1837,12 +1869,14 @@ xfs_flush_buftarg(
1837 * Dropped the delayed write list lock, now walk the temporary list 1869 * Dropped the delayed write list lock, now walk the temporary list
1838 */ 1870 */
1839 list_for_each_entry_safe(pb, n, &tmp, pb_list) { 1871 list_for_each_entry_safe(pb, n, &tmp, pb_list) {
1872 pagebuf_lock(pb);
1873 pb->pb_flags &= ~(PBF_DELWRI|_PBF_DELWRI_Q);
1874 pb->pb_flags |= PBF_WRITE;
1840 if (wait) 1875 if (wait)
1841 pb->pb_flags &= ~PBF_ASYNC; 1876 pb->pb_flags &= ~PBF_ASYNC;
1842 else 1877 else
1843 list_del_init(&pb->pb_list); 1878 list_del_init(&pb->pb_list);
1844 1879
1845 pagebuf_lock(pb);
1846 pagebuf_iostrategy(pb); 1880 pagebuf_iostrategy(pb);
1847 } 1881 }
1848 1882
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index c322e9d71f3d..4b7fe3b5e460 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -89,6 +89,7 @@ typedef enum page_buf_flags_e { /* pb_flags values */
89 _PBF_PAGE_CACHE = (1 << 17),/* backed by pagecache */ 89 _PBF_PAGE_CACHE = (1 << 17),/* backed by pagecache */
90 _PBF_KMEM_ALLOC = (1 << 18),/* backed by kmem_alloc() */ 90 _PBF_KMEM_ALLOC = (1 << 18),/* backed by kmem_alloc() */
91 _PBF_RUN_QUEUES = (1 << 19),/* run block device task queue */ 91 _PBF_RUN_QUEUES = (1 << 19),/* run block device task queue */
92 _PBF_DELWRI_Q = (1 << 21), /* buffer on delwri queue */
92} page_buf_flags_t; 93} page_buf_flags_t;
93 94
94#define PBF_UPDATE (PBF_READ | PBF_WRITE) 95#define PBF_UPDATE (PBF_READ | PBF_WRITE)
@@ -337,8 +338,6 @@ extern void pagebuf_trace(
337 338
338 339
339 340
340
341
342/* These are just for xfs_syncsub... it sets an internal variable 341/* These are just for xfs_syncsub... it sets an internal variable
343 * then passes it to VOP_FLUSH_PAGES or adds the flags to a newly gotten buf_t 342 * then passes it to VOP_FLUSH_PAGES or adds the flags to a newly gotten buf_t
344 */ 343 */