author    Linus Torvalds <torvalds@linux-foundation.org>  2017-08-27 16:55:12 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2017-08-27 16:55:12 -0400
commit    3510ca20ece0150af6b10c77a74ff1b5c198e3e2 (patch)
tree      8d567e360a30985f8e0351bc35e80adea18e5267 /mm/filemap.c
parent    0cc3b0ec23ce4c69e1e890ed2b8d2fa932b14aad (diff)
Minor page waitqueue cleanups
Tim Chen and Kan Liang have been battling a customer load that shows extremely long page wakeup lists. The cause seems to be constant NUMA migration of a hot page that is shared across a lot of threads, but the actual root cause for the exact behavior has not been found.

Tim has a patch that batches the wait list traversal at wakeup time, so that we at least don't get long uninterruptible cases where we traverse and wake up thousands of processes and get nasty latency spikes. That is likely 4.14 material, but we're still discussing the page waitqueue specific parts of it.

In the meantime, I've tried to look at making the page wait queues less expensive, and failed miserably. If you have thousands of threads waiting for the same page, it will be painful. We'll need to try to figure out the NUMA balancing issue some day, in addition to avoiding the excessive spinlock hold times.

That said, having tried to rewrite the page wait queues, I can at least fix up some of the braindamage in the current situation. In particular:

 (a) we don't want to continue walking the page wait list if the bit we're waiting for already got set again (which seems to be one of the patterns of the bad load). That makes no progress and just causes pointless cache pollution chasing the pointers.

 (b) we don't want to put the non-locking waiters always on the front of the queue, and the locking waiters always on the back. Not only is that unfair, it means that we wake up thousands of reading threads that will just end up being blocked by the writer later anyway.

Also add a comment about the layout of 'struct wait_page_key' - there is an external user of it in the cachefiles code that means that it has to match the layout of 'struct wait_bit_key' in the first two members. It so happens to match, because 'struct page *' and 'unsigned long *' end up having the same values simply because the page flags are the first member in struct page.

Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Kan Liang <kan.liang@intel.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Christopher Lameter <cl@linux.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
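For reference, a sketch of the two key structures whose layouts have to line up (paraphrased, assuming the definitions of this era of the tree; only the first two members matter for the aliasing):

	/* mm/filemap.c */
	struct wait_page_key {
		struct page *page;	/* aliases wait_bit_key.flags: 'flags' is the
					 * first member of struct page, so the two
					 * pointer values coincide */
		int bit_nr;		/* aliases wait_bit_key.bit_nr */
		int page_match;
	};

	/* include/linux/wait_bit.h */
	struct wait_bit_key {
		void *flags;
		int bit_nr;
		unsigned long timeout;
	};

The external user is cachefiles_read_waiter() in fs/cachefiles/rdwr.c, a wake function that interprets whatever key it is handed as a wait_bit_key and filters on it, roughly:

	if (key->flags != &page->flags || key->bit_nr != PG_locked)
		return 0;	/* not our page/bit, keep walking */

which only works as long as the two layouts keep matching as described above.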
Diffstat (limited to 'mm/filemap.c')
-rw-r--r--	mm/filemap.c	11
1 file changed, 6 insertions, 5 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index a49702445ce0..baba290c276b 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -885,6 +885,7 @@ void __init pagecache_init(void)
 	page_writeback_init();
 }
 
+/* This has the same layout as wait_bit_key - see fs/cachefiles/rdwr.c */
 struct wait_page_key {
 	struct page *page;
 	int bit_nr;
@@ -909,8 +910,10 @@ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync,
 
 	if (wait_page->bit_nr != key->bit_nr)
 		return 0;
+
+	/* Stop walking if it's locked */
 	if (test_bit(key->bit_nr, &key->page->flags))
-		return 0;
+		return -1;
 
 	return autoremove_wake_function(wait, mode, sync, key);
 }
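A note on the new 'return -1': a negative return from a wake function is the signal to abandon the rest of the wait-list walk, which is what implements point (a) above - if the bit has already been set again, no waiter further down the list can make progress either, so walking on would only chase cold pointers. The waker-side half of that convention lives in __wake_up_common() in kernel/sched/wait.c and is not visible on this page because the diff is filtered to mm/filemap.c; as a sketch (paraphrased, not the verbatim hunk), the walk consumes the return value like this:

	wait_queue_entry_t *curr, *next;

	list_for_each_entry_safe(curr, next, &wq_head->head, entry) {
		unsigned flags = curr->flags;
		int ret = curr->func(curr, mode, wake_flags, key);

		if (ret < 0)		/* e.g. wake_page_function() saw the bit set */
			break;		/* stop the walk entirely */
		if (ret && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
			break;		/* woken enough exclusive waiters */
	}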
@@ -964,6 +967,7 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
 	int ret = 0;
 
 	init_wait(wait);
+	wait->flags = lock ? WQ_FLAG_EXCLUSIVE : 0;
 	wait->func = wake_page_function;
 	wait_page.page = page;
 	wait_page.bit_nr = bit_nr;
@@ -972,10 +976,7 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
 	spin_lock_irq(&q->lock);
 
 	if (likely(list_empty(&wait->entry))) {
-		if (lock)
-			__add_wait_queue_entry_tail_exclusive(q, wait);
-		else
-			__add_wait_queue(q, wait);
+		__add_wait_queue_entry_tail(q, wait);
 		SetPageWaiters(page);
 	}
 
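The last two hunks are two halves of point (b): instead of choosing between __add_wait_queue_entry_tail_exclusive() for lockers and __add_wait_queue() (which inserts at the head) for everyone else, each waiter now records its exclusiveness in wait->flags and everybody is appended at the tail, so the queue becomes strictly FIFO by arrival. The intended effect, sketched with a hypothetical queue where W is a locking (exclusive) waiter and R1..Rn are non-locking ones:

	/*
	 * Before: head -> [Rn ... R2 R1, W]   non-lockers always in front.
	 *         A wakeup walks and wakes every R before reaching W, and
	 *         they all just block again once W takes the page lock.
	 *
	 * After:  head -> [W, R1, R2 ... Rn]   strict arrival order.
	 *         If W arrived first, the walk wakes W, sees WQ_FLAG_EXCLUSIVE
	 *         (the page wakeup asks for a single exclusive wakeup), and
	 *         stops there; the Rs sleep on until W unlocks the page.
	 */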