aboutsummaryrefslogtreecommitdiffstats
path: root/include
diff options
context:
space:
mode:
authorGreg Thelen <gthelen@google.com>2018-04-20 17:55:42 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2018-04-20 20:18:35 -0400
commit2e898e4c0a3897ccd434adac5abb8330194f527b (patch)
treefad1bb9399439b70ee1700d464e70a08449a7eec /include
parent88c28f2469151b031f8cea9b28ed5be1b74a4172 (diff)
writeback: safer lock nesting
lock_page_memcg()/unlock_page_memcg() use spin_lock_irqsave/restore() if the page's memcg is undergoing move accounting, which occurs when a process leaves its memcg for a new one that has memory.move_charge_at_immigrate set. unlocked_inode_to_wb_begin,end() use spin_lock_irq/spin_unlock_irq() if the given inode is switching writeback domains. Switches occur when enough writes are issued from a new domain. This existing pattern is thus suspicious: lock_page_memcg(page); unlocked_inode_to_wb_begin(inode, &locked); ... unlocked_inode_to_wb_end(inode, locked); unlock_page_memcg(page); If inode switch and process memcg migration are both in-flight then unlocked_inode_to_wb_end() will unconditionally enable interrupts while still holding the lock_page_memcg() irq spinlock. This suggests the possibility of deadlock if an interrupt occurs before unlock_page_memcg(). truncate __cancel_dirty_page lock_page_memcg unlocked_inode_to_wb_begin unlocked_inode_to_wb_end <interrupts mistakenly enabled> <interrupt> end_page_writeback test_clear_page_writeback lock_page_memcg <deadlock> unlock_page_memcg Due to configuration limitations this deadlock is not currently possible because we don't mix cgroup writeback (a cgroupv2 feature) and memory.move_charge_at_immigrate (a cgroupv1 feature). If the kernel is hacked to always claim inode switching and memcg moving_account, then this script triggers lockup in less than a minute: cd /mnt/cgroup/memory mkdir a b echo 1 > a/memory.move_charge_at_immigrate echo 1 > b/memory.move_charge_at_immigrate ( echo $BASHPID > a/cgroup.procs while true; do dd if=/dev/zero of=/mnt/big bs=1M count=256 done ) & while true; do sync done & sleep 1h & SLEEP=$! while true; do echo $SLEEP > a/cgroup.procs echo $SLEEP > b/cgroup.procs done The deadlock does not seem possible, so it's debatable if there's any reason to modify the kernel. I suggest we should, to prevent future surprises. 
And Wang Long said "this deadlock occurs three times in our environment", so there's more reason to apply this, even to stable. Stable 4.4 has minor conflicts applying this patch. For a clean 4.4 patch see "[PATCH for-4.4] writeback: safer lock nesting" https://lkml.org/lkml/2018/4/11/146 Wang Long said "this deadlock occurs three times in our environment" [gthelen@google.com: v4] Link: http://lkml.kernel.org/r/20180411084653.254724-1-gthelen@google.com [akpm@linux-foundation.org: comment tweaks, struct initialization simplification] Change-Id: Ibb773e8045852978f6207074491d262f1b3fb613 Link: http://lkml.kernel.org/r/20180410005908.167976-1-gthelen@google.com Fixes: 682aa8e1a6a1 ("writeback: implement unlocked_inode_to_wb transaction and use it for stat updates") Signed-off-by: Greg Thelen <gthelen@google.com> Reported-by: Wang Long <wanglong19@meituan.com> Acked-by: Wang Long <wanglong19@meituan.com> Acked-by: Michal Hocko <mhocko@suse.com> Reviewed-by: Andrew Morton <akpm@linux-foundation.org> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Tejun Heo <tj@kernel.org> Cc: Nicholas Piggin <npiggin@gmail.com> Cc: <stable@vger.kernel.org> [v4.2+] Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'include')
-rw-r--r--include/linux/backing-dev-defs.h5
-rw-r--r--include/linux/backing-dev.h30
2 files changed, 21 insertions, 14 deletions
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index bfe86b54f6c1..0bd432a4d7bd 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -223,6 +223,11 @@ static inline void set_bdi_congested(struct backing_dev_info *bdi, int sync)
223 set_wb_congested(bdi->wb.congested, sync); 223 set_wb_congested(bdi->wb.congested, sync);
224} 224}
225 225
226struct wb_lock_cookie {
227 bool locked;
228 unsigned long flags;
229};
230
226#ifdef CONFIG_CGROUP_WRITEBACK 231#ifdef CONFIG_CGROUP_WRITEBACK
227 232
228/** 233/**
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index f6be4b0b6c18..72ca0f3d39f3 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -347,7 +347,7 @@ static inline struct bdi_writeback *inode_to_wb(const struct inode *inode)
347/** 347/**
348 * unlocked_inode_to_wb_begin - begin unlocked inode wb access transaction 348 * unlocked_inode_to_wb_begin - begin unlocked inode wb access transaction
349 * @inode: target inode 349 * @inode: target inode
350 * @lockedp: temp bool output param, to be passed to the end function 350 * @cookie: output param, to be passed to the end function
351 * 351 *
352 * The caller wants to access the wb associated with @inode but isn't 352 * The caller wants to access the wb associated with @inode but isn't
353 * holding inode->i_lock, the i_pages lock or wb->list_lock. This 353 * holding inode->i_lock, the i_pages lock or wb->list_lock. This
@@ -355,12 +355,12 @@ static inline struct bdi_writeback *inode_to_wb(const struct inode *inode)
355 * association doesn't change until the transaction is finished with 355 * association doesn't change until the transaction is finished with
356 * unlocked_inode_to_wb_end(). 356 * unlocked_inode_to_wb_end().
357 * 357 *
358 * The caller must call unlocked_inode_to_wb_end() with *@lockdep 358 * The caller must call unlocked_inode_to_wb_end() with *@cookie afterwards and
359 * afterwards and can't sleep during transaction. IRQ may or may not be 359 * can't sleep during the transaction. IRQs may or may not be disabled on
360 * disabled on return. 360 * return.
361 */ 361 */
362static inline struct bdi_writeback * 362static inline struct bdi_writeback *
363unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp) 363unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie)
364{ 364{
365 rcu_read_lock(); 365 rcu_read_lock();
366 366
@@ -368,10 +368,10 @@ unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp)
368 * Paired with store_release in inode_switch_wb_work_fn() and 368 * Paired with store_release in inode_switch_wb_work_fn() and
369 * ensures that we see the new wb if we see cleared I_WB_SWITCH. 369 * ensures that we see the new wb if we see cleared I_WB_SWITCH.
370 */ 370 */
371 *lockedp = smp_load_acquire(&inode->i_state) & I_WB_SWITCH; 371 cookie->locked = smp_load_acquire(&inode->i_state) & I_WB_SWITCH;
372 372
373 if (unlikely(*lockedp)) 373 if (unlikely(cookie->locked))
374 xa_lock_irq(&inode->i_mapping->i_pages); 374 xa_lock_irqsave(&inode->i_mapping->i_pages, cookie->flags);
375 375
376 /* 376 /*
377 * Protected by either !I_WB_SWITCH + rcu_read_lock() or the i_pages 377 * Protected by either !I_WB_SWITCH + rcu_read_lock() or the i_pages
@@ -383,12 +383,13 @@ unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp)
383/** 383/**
384 * unlocked_inode_to_wb_end - end inode wb access transaction 384 * unlocked_inode_to_wb_end - end inode wb access transaction
385 * @inode: target inode 385 * @inode: target inode
386 * @locked: *@lockedp from unlocked_inode_to_wb_begin() 386 * @cookie: @cookie from unlocked_inode_to_wb_begin()
387 */ 387 */
388static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked) 388static inline void unlocked_inode_to_wb_end(struct inode *inode,
389 struct wb_lock_cookie *cookie)
389{ 390{
390 if (unlikely(locked)) 391 if (unlikely(cookie->locked))
391 xa_unlock_irq(&inode->i_mapping->i_pages); 392 xa_unlock_irqrestore(&inode->i_mapping->i_pages, cookie->flags);
392 393
393 rcu_read_unlock(); 394 rcu_read_unlock();
394} 395}
@@ -435,12 +436,13 @@ static inline struct bdi_writeback *inode_to_wb(struct inode *inode)
435} 436}
436 437
437static inline struct bdi_writeback * 438static inline struct bdi_writeback *
438unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp) 439unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie)
439{ 440{
440 return inode_to_wb(inode); 441 return inode_to_wb(inode);
441} 442}
442 443
443static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked) 444static inline void unlocked_inode_to_wb_end(struct inode *inode,
445 struct wb_lock_cookie *cookie)
444{ 446{
445} 447}
446 448