-rw-r--r--  Documentation/filesystems/caching/operations.txt | 2
-rw-r--r--  Documentation/trace/ftrace.txt | 2
-rw-r--r--  drivers/md/dm-bufio.c | 41
-rw-r--r--  drivers/md/dm-snap.c | 10
-rw-r--r--  drivers/media/usb/dvb-usb-v2/dvb_usb_core.c | 12
-rw-r--r--  fs/btrfs/extent_io.c | 10
-rw-r--r--  fs/buffer.c | 11
-rw-r--r--  fs/cifs/connect.c | 10
-rw-r--r--  fs/cifs/file.c | 9
-rw-r--r--  fs/cifs/inode.c | 6
-rw-r--r--  fs/cifs/misc.c | 2
-rw-r--r--  fs/fs-writeback.c | 3
-rw-r--r--  fs/fscache/cookie.c | 7
-rw-r--r--  fs/fscache/internal.h | 2
-rw-r--r--  fs/fscache/main.c | 18
-rw-r--r--  fs/fscache/page.c | 4
-rw-r--r--  fs/gfs2/glock.c | 25
-rw-r--r--  fs/gfs2/lock_dlm.c | 8
-rw-r--r--  fs/gfs2/ops_fstype.c | 11
-rw-r--r--  fs/gfs2/recovery.c | 8
-rw-r--r--  fs/gfs2/super.c | 8
-rw-r--r--  fs/inode.c | 7
-rw-r--r--  fs/jbd2/transaction.c | 10
-rw-r--r--  fs/nfs/file.c | 4
-rw-r--r--  fs/nfs/filelayout/filelayoutdev.c | 4
-rw-r--r--  fs/nfs/inode.c | 6
-rw-r--r--  fs/nfs/internal.h | 2
-rw-r--r--  fs/nfs/nfs4state.c | 4
-rw-r--r--  fs/nfs/pagelist.c | 14
-rw-r--r--  fs/nfs/pnfs.c | 2
-rw-r--r--  fs/nfs/write.c | 4
-rw-r--r--  include/linux/irq_work.h | 5
-rw-r--r--  include/linux/sched.h | 8
-rw-r--r--  include/linux/sunrpc/sched.h | 2
-rw-r--r--  include/linux/tick.h | 9
-rw-r--r--  include/linux/wait.h | 125
-rw-r--r--  include/linux/writeback.h | 3
-rw-r--r--  kernel/cpu.c | 33
-rw-r--r--  kernel/fork.c | 1
-rw-r--r--  kernel/irq_work.c | 110
-rw-r--r--  kernel/ptrace.c | 8
-rw-r--r--  kernel/sched/core.c | 119
-rw-r--r--  kernel/sched/deadline.c | 18
-rw-r--r--  kernel/sched/fair.c | 244
-rw-r--r--  kernel/sched/idle.c | 4
-rw-r--r--  kernel/sched/idle_task.c | 2
-rw-r--r--  kernel/sched/rt.c | 30
-rw-r--r--  kernel/sched/sched.h | 38
-rw-r--r--  kernel/sched/wait.c | 30
-rw-r--r--  kernel/smp.c | 9
-rw-r--r--  kernel/time/tick-sched.c | 10
-rw-r--r--  mm/filemap.c | 20
-rw-r--r--  mm/ksm.c | 8
-rw-r--r--  net/bluetooth/hci_core.c | 8
-rw-r--r--  net/sunrpc/sched.c | 4
-rw-r--r--  security/keys/gc.c | 11
-rw-r--r--  security/keys/request_key.c | 23
57 files changed, 588 insertions, 560 deletions
diff --git a/Documentation/filesystems/caching/operations.txt b/Documentation/filesystems/caching/operations.txt
index bee2a5f93d60..a1c052cbba35 100644
--- a/Documentation/filesystems/caching/operations.txt
+++ b/Documentation/filesystems/caching/operations.txt
@@ -90,7 +90,7 @@ operations:
90 to be cleared before proceeding: 90 to be cleared before proceeding:
91 91
92 wait_on_bit(&op->flags, FSCACHE_OP_WAITING, 92 wait_on_bit(&op->flags, FSCACHE_OP_WAITING,
93 fscache_wait_bit, TASK_UNINTERRUPTIBLE); 93 TASK_UNINTERRUPTIBLE);
94 94
95 95
96 (2) The operation may be fast asynchronous (FSCACHE_OP_FAST), in which case it 96 (2) The operation may be fast asynchronous (FSCACHE_OP_FAST), in which case it
diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt
index 2479b2a0c77c..4da42616939f 100644
--- a/Documentation/trace/ftrace.txt
+++ b/Documentation/trace/ftrace.txt
@@ -1515,7 +1515,7 @@ Doing the same with chrt -r 5 and function-trace set.
1515 <idle>-0 3d.h4 1us+: 0:120:R + [003] 2448: 94:R sleep 1515 <idle>-0 3d.h4 1us+: 0:120:R + [003] 2448: 94:R sleep
1516 <idle>-0 3d.h4 2us : ttwu_do_activate.constprop.87 <-try_to_wake_up 1516 <idle>-0 3d.h4 2us : ttwu_do_activate.constprop.87 <-try_to_wake_up
1517 <idle>-0 3d.h3 3us : check_preempt_curr <-ttwu_do_wakeup 1517 <idle>-0 3d.h3 3us : check_preempt_curr <-ttwu_do_wakeup
1518 <idle>-0 3d.h3 3us : resched_task <-check_preempt_curr 1518 <idle>-0 3d.h3 3us : resched_curr <-check_preempt_curr
1519 <idle>-0 3dNh3 4us : task_woken_rt <-ttwu_do_wakeup 1519 <idle>-0 3dNh3 4us : task_woken_rt <-ttwu_do_wakeup
1520 <idle>-0 3dNh3 4us : _raw_spin_unlock <-try_to_wake_up 1520 <idle>-0 3dNh3 4us : _raw_spin_unlock <-try_to_wake_up
1521 <idle>-0 3dNh3 4us : sub_preempt_count <-_raw_spin_unlock 1521 <idle>-0 3dNh3 4us : sub_preempt_count <-_raw_spin_unlock
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index d724459860d9..ab472c557d18 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -615,16 +615,6 @@ static void write_endio(struct bio *bio, int error)
615} 615}
616 616
617/* 617/*
618 * This function is called when wait_on_bit is actually waiting.
619 */
620static int do_io_schedule(void *word)
621{
622 io_schedule();
623
624 return 0;
625}
626
627/*
628 * Initiate a write on a dirty buffer, but don't wait for it. 618 * Initiate a write on a dirty buffer, but don't wait for it.
629 * 619 *
630 * - If the buffer is not dirty, exit. 620 * - If the buffer is not dirty, exit.
@@ -640,8 +630,7 @@ static void __write_dirty_buffer(struct dm_buffer *b,
640 return; 630 return;
641 631
642 clear_bit(B_DIRTY, &b->state); 632 clear_bit(B_DIRTY, &b->state);
643 wait_on_bit_lock(&b->state, B_WRITING, 633 wait_on_bit_lock_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
644 do_io_schedule, TASK_UNINTERRUPTIBLE);
645 634
646 if (!write_list) 635 if (!write_list)
647 submit_io(b, WRITE, b->block, write_endio); 636 submit_io(b, WRITE, b->block, write_endio);
@@ -675,9 +664,9 @@ static void __make_buffer_clean(struct dm_buffer *b)
675 if (!b->state) /* fast case */ 664 if (!b->state) /* fast case */
676 return; 665 return;
677 666
678 wait_on_bit(&b->state, B_READING, do_io_schedule, TASK_UNINTERRUPTIBLE); 667 wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE);
679 __write_dirty_buffer(b, NULL); 668 __write_dirty_buffer(b, NULL);
680 wait_on_bit(&b->state, B_WRITING, do_io_schedule, TASK_UNINTERRUPTIBLE); 669 wait_on_bit_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
681} 670}
682 671
683/* 672/*
@@ -1030,7 +1019,7 @@ static void *new_read(struct dm_bufio_client *c, sector_t block,
1030 if (need_submit) 1019 if (need_submit)
1031 submit_io(b, READ, b->block, read_endio); 1020 submit_io(b, READ, b->block, read_endio);
1032 1021
1033 wait_on_bit(&b->state, B_READING, do_io_schedule, TASK_UNINTERRUPTIBLE); 1022 wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE);
1034 1023
1035 if (b->read_error) { 1024 if (b->read_error) {
1036 int error = b->read_error; 1025 int error = b->read_error;
@@ -1209,15 +1198,13 @@ again:
1209 dropped_lock = 1; 1198 dropped_lock = 1;
1210 b->hold_count++; 1199 b->hold_count++;
1211 dm_bufio_unlock(c); 1200 dm_bufio_unlock(c);
1212 wait_on_bit(&b->state, B_WRITING, 1201 wait_on_bit_io(&b->state, B_WRITING,
1213 do_io_schedule, 1202 TASK_UNINTERRUPTIBLE);
1214 TASK_UNINTERRUPTIBLE);
1215 dm_bufio_lock(c); 1203 dm_bufio_lock(c);
1216 b->hold_count--; 1204 b->hold_count--;
1217 } else 1205 } else
1218 wait_on_bit(&b->state, B_WRITING, 1206 wait_on_bit_io(&b->state, B_WRITING,
1219 do_io_schedule, 1207 TASK_UNINTERRUPTIBLE);
1220 TASK_UNINTERRUPTIBLE);
1221 } 1208 }
1222 1209
1223 if (!test_bit(B_DIRTY, &b->state) && 1210 if (!test_bit(B_DIRTY, &b->state) &&
@@ -1321,15 +1308,15 @@ retry:
1321 1308
1322 __write_dirty_buffer(b, NULL); 1309 __write_dirty_buffer(b, NULL);
1323 if (b->hold_count == 1) { 1310 if (b->hold_count == 1) {
1324 wait_on_bit(&b->state, B_WRITING, 1311 wait_on_bit_io(&b->state, B_WRITING,
1325 do_io_schedule, TASK_UNINTERRUPTIBLE); 1312 TASK_UNINTERRUPTIBLE);
1326 set_bit(B_DIRTY, &b->state); 1313 set_bit(B_DIRTY, &b->state);
1327 __unlink_buffer(b); 1314 __unlink_buffer(b);
1328 __link_buffer(b, new_block, LIST_DIRTY); 1315 __link_buffer(b, new_block, LIST_DIRTY);
1329 } else { 1316 } else {
1330 sector_t old_block; 1317 sector_t old_block;
1331 wait_on_bit_lock(&b->state, B_WRITING, 1318 wait_on_bit_lock_io(&b->state, B_WRITING,
1332 do_io_schedule, TASK_UNINTERRUPTIBLE); 1319 TASK_UNINTERRUPTIBLE);
1333 /* 1320 /*
1334 * Relink buffer to "new_block" so that write_callback 1321 * Relink buffer to "new_block" so that write_callback
1335 * sees "new_block" as a block number. 1322 * sees "new_block" as a block number.
@@ -1341,8 +1328,8 @@ retry:
1341 __unlink_buffer(b); 1328 __unlink_buffer(b);
1342 __link_buffer(b, new_block, b->list_mode); 1329 __link_buffer(b, new_block, b->list_mode);
1343 submit_io(b, WRITE, new_block, write_endio); 1330 submit_io(b, WRITE, new_block, write_endio);
1344 wait_on_bit(&b->state, B_WRITING, 1331 wait_on_bit_io(&b->state, B_WRITING,
1345 do_io_schedule, TASK_UNINTERRUPTIBLE); 1332 TASK_UNINTERRUPTIBLE);
1346 __unlink_buffer(b); 1333 __unlink_buffer(b);
1347 __link_buffer(b, old_block, b->list_mode); 1334 __link_buffer(b, old_block, b->list_mode);
1348 } 1335 }
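
A minimal before/after sketch of the calling convention this change applies throughout dm-bufio (built only from symbols in the hunk above): the per-driver sleep callback goes away, and the new _io variants call io_schedule() internally.

	/* before: each caller supplied its own action function */
	static int do_io_schedule(void *word)
	{
		io_schedule();
		return 0;
	}
	...
	wait_on_bit(&b->state, B_WRITING, do_io_schedule, TASK_UNINTERRUPTIBLE);

	/* after: io-style sleeping is selected by the helper name itself */
	wait_on_bit_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
	wait_on_bit_lock_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
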
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 5bd2290cfb1e..864b03f47727 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -1032,21 +1032,13 @@ static void start_merge(struct dm_snapshot *s)
1032 snapshot_merge_next_chunks(s); 1032 snapshot_merge_next_chunks(s);
1033} 1033}
1034 1034
1035static int wait_schedule(void *ptr)
1036{
1037 schedule();
1038
1039 return 0;
1040}
1041
1042/* 1035/*
1043 * Stop the merging process and wait until it finishes. 1036 * Stop the merging process and wait until it finishes.
1044 */ 1037 */
1045static void stop_merge(struct dm_snapshot *s) 1038static void stop_merge(struct dm_snapshot *s)
1046{ 1039{
1047 set_bit(SHUTDOWN_MERGE, &s->state_bits); 1040 set_bit(SHUTDOWN_MERGE, &s->state_bits);
1048 wait_on_bit(&s->state_bits, RUNNING_MERGE, wait_schedule, 1041 wait_on_bit(&s->state_bits, RUNNING_MERGE, TASK_UNINTERRUPTIBLE);
1049 TASK_UNINTERRUPTIBLE);
1050 clear_bit(SHUTDOWN_MERGE, &s->state_bits); 1042 clear_bit(SHUTDOWN_MERGE, &s->state_bits);
1051} 1043}
1052 1044
diff --git a/drivers/media/usb/dvb-usb-v2/dvb_usb_core.c b/drivers/media/usb/dvb-usb-v2/dvb_usb_core.c
index e35580618936..f296394bb7c5 100644
--- a/drivers/media/usb/dvb-usb-v2/dvb_usb_core.c
+++ b/drivers/media/usb/dvb-usb-v2/dvb_usb_core.c
@@ -253,13 +253,6 @@ static int dvb_usbv2_adapter_stream_exit(struct dvb_usb_adapter *adap)
253 return usb_urb_exitv2(&adap->stream); 253 return usb_urb_exitv2(&adap->stream);
254} 254}
255 255
256static int wait_schedule(void *ptr)
257{
258 schedule();
259
260 return 0;
261}
262
263static int dvb_usb_start_feed(struct dvb_demux_feed *dvbdmxfeed) 256static int dvb_usb_start_feed(struct dvb_demux_feed *dvbdmxfeed)
264{ 257{
265 struct dvb_usb_adapter *adap = dvbdmxfeed->demux->priv; 258 struct dvb_usb_adapter *adap = dvbdmxfeed->demux->priv;
@@ -273,8 +266,7 @@ static int dvb_usb_start_feed(struct dvb_demux_feed *dvbdmxfeed)
273 dvbdmxfeed->pid, dvbdmxfeed->index); 266 dvbdmxfeed->pid, dvbdmxfeed->index);
274 267
275 /* wait init is done */ 268 /* wait init is done */
276 wait_on_bit(&adap->state_bits, ADAP_INIT, wait_schedule, 269 wait_on_bit(&adap->state_bits, ADAP_INIT, TASK_UNINTERRUPTIBLE);
277 TASK_UNINTERRUPTIBLE);
278 270
279 if (adap->active_fe == -1) 271 if (adap->active_fe == -1)
280 return -EINVAL; 272 return -EINVAL;
@@ -568,7 +560,7 @@ static int dvb_usb_fe_sleep(struct dvb_frontend *fe)
568 560
569 if (!adap->suspend_resume_active) { 561 if (!adap->suspend_resume_active) {
570 set_bit(ADAP_SLEEP, &adap->state_bits); 562 set_bit(ADAP_SLEEP, &adap->state_bits);
571 wait_on_bit(&adap->state_bits, ADAP_STREAMING, wait_schedule, 563 wait_on_bit(&adap->state_bits, ADAP_STREAMING,
572 TASK_UNINTERRUPTIBLE); 564 TASK_UNINTERRUPTIBLE);
573 } 565 }
574 566
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index a389820d158b..3e11aab9f391 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3437,16 +3437,10 @@ done_unlocked:
3437 return 0; 3437 return 0;
3438} 3438}
3439 3439
3440static int eb_wait(void *word)
3441{
3442 io_schedule();
3443 return 0;
3444}
3445
3446void wait_on_extent_buffer_writeback(struct extent_buffer *eb) 3440void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
3447{ 3441{
3448 wait_on_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK, eb_wait, 3442 wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_WRITEBACK,
3449 TASK_UNINTERRUPTIBLE); 3443 TASK_UNINTERRUPTIBLE);
3450} 3444}
3451 3445
3452static noinline_for_stack int 3446static noinline_for_stack int
diff --git a/fs/buffer.c b/fs/buffer.c
index eba6e4f621ce..8f05111bbb8b 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -61,16 +61,9 @@ inline void touch_buffer(struct buffer_head *bh)
61} 61}
62EXPORT_SYMBOL(touch_buffer); 62EXPORT_SYMBOL(touch_buffer);
63 63
64static int sleep_on_buffer(void *word)
65{
66 io_schedule();
67 return 0;
68}
69
70void __lock_buffer(struct buffer_head *bh) 64void __lock_buffer(struct buffer_head *bh)
71{ 65{
72 wait_on_bit_lock(&bh->b_state, BH_Lock, sleep_on_buffer, 66 wait_on_bit_lock_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
73 TASK_UNINTERRUPTIBLE);
74} 67}
75EXPORT_SYMBOL(__lock_buffer); 68EXPORT_SYMBOL(__lock_buffer);
76 69
@@ -123,7 +116,7 @@ EXPORT_SYMBOL(buffer_check_dirty_writeback);
123 */ 116 */
124void __wait_on_buffer(struct buffer_head * bh) 117void __wait_on_buffer(struct buffer_head * bh)
125{ 118{
126 wait_on_bit(&bh->b_state, BH_Lock, sleep_on_buffer, TASK_UNINTERRUPTIBLE); 119 wait_on_bit_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
127} 120}
128EXPORT_SYMBOL(__wait_on_buffer); 121EXPORT_SYMBOL(__wait_on_buffer);
129 122
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 20d75b8ddb26..b98366f21f9e 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -3934,13 +3934,6 @@ cifs_sb_master_tcon(struct cifs_sb_info *cifs_sb)
3934 return tlink_tcon(cifs_sb_master_tlink(cifs_sb)); 3934 return tlink_tcon(cifs_sb_master_tlink(cifs_sb));
3935} 3935}
3936 3936
3937static int
3938cifs_sb_tcon_pending_wait(void *unused)
3939{
3940 schedule();
3941 return signal_pending(current) ? -ERESTARTSYS : 0;
3942}
3943
3944/* find and return a tlink with given uid */ 3937/* find and return a tlink with given uid */
3945static struct tcon_link * 3938static struct tcon_link *
3946tlink_rb_search(struct rb_root *root, kuid_t uid) 3939tlink_rb_search(struct rb_root *root, kuid_t uid)
@@ -4039,11 +4032,10 @@ cifs_sb_tlink(struct cifs_sb_info *cifs_sb)
4039 } else { 4032 } else {
4040wait_for_construction: 4033wait_for_construction:
4041 ret = wait_on_bit(&tlink->tl_flags, TCON_LINK_PENDING, 4034 ret = wait_on_bit(&tlink->tl_flags, TCON_LINK_PENDING,
4042 cifs_sb_tcon_pending_wait,
4043 TASK_INTERRUPTIBLE); 4035 TASK_INTERRUPTIBLE);
4044 if (ret) { 4036 if (ret) {
4045 cifs_put_tlink(tlink); 4037 cifs_put_tlink(tlink);
4046 return ERR_PTR(ret); 4038 return ERR_PTR(-ERESTARTSYS);
4047 } 4039 }
4048 4040
4049 /* if it's good, return it */ 4041 /* if it's good, return it */
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index e90a1e9aa627..b88b1ade4d3d 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -3618,13 +3618,6 @@ static int cifs_launder_page(struct page *page)
3618 return rc; 3618 return rc;
3619} 3619}
3620 3620
3621static int
3622cifs_pending_writers_wait(void *unused)
3623{
3624 schedule();
3625 return 0;
3626}
3627
3628void cifs_oplock_break(struct work_struct *work) 3621void cifs_oplock_break(struct work_struct *work)
3629{ 3622{
3630 struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo, 3623 struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo,
@@ -3636,7 +3629,7 @@ void cifs_oplock_break(struct work_struct *work)
3636 int rc = 0; 3629 int rc = 0;
3637 3630
3638 wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS, 3631 wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS,
3639 cifs_pending_writers_wait, TASK_UNINTERRUPTIBLE); 3632 TASK_UNINTERRUPTIBLE);
3640 3633
3641 server->ops->downgrade_oplock(server, cinode, 3634 server->ops->downgrade_oplock(server, cinode,
3642 test_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &cinode->flags)); 3635 test_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &cinode->flags));
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index a174605f6afa..41de3935caa0 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1780,7 +1780,7 @@ cifs_invalidate_mapping(struct inode *inode)
1780 * @word: long word containing the bit lock 1780 * @word: long word containing the bit lock
1781 */ 1781 */
1782static int 1782static int
1783cifs_wait_bit_killable(void *word) 1783cifs_wait_bit_killable(struct wait_bit_key *key)
1784{ 1784{
1785 if (fatal_signal_pending(current)) 1785 if (fatal_signal_pending(current))
1786 return -ERESTARTSYS; 1786 return -ERESTARTSYS;
@@ -1794,8 +1794,8 @@ cifs_revalidate_mapping(struct inode *inode)
1794 int rc; 1794 int rc;
1795 unsigned long *flags = &CIFS_I(inode)->flags; 1795 unsigned long *flags = &CIFS_I(inode)->flags;
1796 1796
1797 rc = wait_on_bit_lock(flags, CIFS_INO_LOCK, cifs_wait_bit_killable, 1797 rc = wait_on_bit_lock_action(flags, CIFS_INO_LOCK, cifs_wait_bit_killable,
1798 TASK_KILLABLE); 1798 TASK_KILLABLE);
1799 if (rc) 1799 if (rc)
1800 return rc; 1800 return rc;
1801 1801
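
Where a subsystem keeps custom sleep behaviour, the action callback is reworked to the new prototype and the caller moves to a *_action() variant. A sketch of that pattern, using a hypothetical helper name since the full body of cifs_wait_bit_killable is not visible in this hunk:

	/* new-style action: sleeps and returns 0, or returns a negative
	 * error (e.g. -ERESTARTSYS) to abort the wait */
	static int example_wait_bit_killable(struct wait_bit_key *key)
	{
		if (fatal_signal_pending(current))
			return -ERESTARTSYS;
		schedule();	/* the real cifs helper may sleep differently */
		return 0;
	}

	rc = wait_on_bit_lock_action(flags, CIFS_INO_LOCK,
				     example_wait_bit_killable, TASK_KILLABLE);
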
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 3b0c62e622da..6bf55d0ed494 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -582,7 +582,7 @@ int cifs_get_writer(struct cifsInodeInfo *cinode)
582 582
583start: 583start:
584 rc = wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_OPLOCK_BREAK, 584 rc = wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_OPLOCK_BREAK,
585 cifs_oplock_break_wait, TASK_KILLABLE); 585 TASK_KILLABLE);
586 if (rc) 586 if (rc)
587 return rc; 587 return rc;
588 588
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index be568b7311d6..ef9bef118342 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -342,7 +342,8 @@ static void __inode_wait_for_writeback(struct inode *inode)
342 wqh = bit_waitqueue(&inode->i_state, __I_SYNC); 342 wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
343 while (inode->i_state & I_SYNC) { 343 while (inode->i_state & I_SYNC) {
344 spin_unlock(&inode->i_lock); 344 spin_unlock(&inode->i_lock);
345 __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE); 345 __wait_on_bit(wqh, &wq, bit_wait,
346 TASK_UNINTERRUPTIBLE);
346 spin_lock(&inode->i_lock); 347 spin_lock(&inode->i_lock);
347 } 348 }
348} 349}
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
index aec01be91b0a..89acec742e0b 100644
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c
@@ -160,7 +160,7 @@ void __fscache_enable_cookie(struct fscache_cookie *cookie,
160 _enter("%p", cookie); 160 _enter("%p", cookie);
161 161
162 wait_on_bit_lock(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK, 162 wait_on_bit_lock(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK,
163 fscache_wait_bit, TASK_UNINTERRUPTIBLE); 163 TASK_UNINTERRUPTIBLE);
164 164
165 if (test_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags)) 165 if (test_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags))
166 goto out_unlock; 166 goto out_unlock;
@@ -255,7 +255,7 @@ static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie)
255 if (!fscache_defer_lookup) { 255 if (!fscache_defer_lookup) {
256 _debug("non-deferred lookup %p", &cookie->flags); 256 _debug("non-deferred lookup %p", &cookie->flags);
257 wait_on_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP, 257 wait_on_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP,
258 fscache_wait_bit, TASK_UNINTERRUPTIBLE); 258 TASK_UNINTERRUPTIBLE);
259 _debug("complete"); 259 _debug("complete");
260 if (test_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags)) 260 if (test_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags))
261 goto unavailable; 261 goto unavailable;
@@ -463,7 +463,6 @@ void __fscache_wait_on_invalidate(struct fscache_cookie *cookie)
463 _enter("%p", cookie); 463 _enter("%p", cookie);
464 464
465 wait_on_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING, 465 wait_on_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING,
466 fscache_wait_bit_interruptible,
467 TASK_UNINTERRUPTIBLE); 466 TASK_UNINTERRUPTIBLE);
468 467
469 _leave(""); 468 _leave("");
@@ -525,7 +524,7 @@ void __fscache_disable_cookie(struct fscache_cookie *cookie, bool invalidate)
525 } 524 }
526 525
527 wait_on_bit_lock(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK, 526 wait_on_bit_lock(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK,
528 fscache_wait_bit, TASK_UNINTERRUPTIBLE); 527 TASK_UNINTERRUPTIBLE);
529 if (!test_and_clear_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags)) 528 if (!test_and_clear_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags))
530 goto out_unlock_enable; 529 goto out_unlock_enable;
531 530
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
index bc6c08fcfddd..7872a62ef30c 100644
--- a/fs/fscache/internal.h
+++ b/fs/fscache/internal.h
@@ -97,8 +97,6 @@ static inline bool fscache_object_congested(void)
97 return workqueue_congested(WORK_CPU_UNBOUND, fscache_object_wq); 97 return workqueue_congested(WORK_CPU_UNBOUND, fscache_object_wq);
98} 98}
99 99
100extern int fscache_wait_bit(void *);
101extern int fscache_wait_bit_interruptible(void *);
102extern int fscache_wait_atomic_t(atomic_t *); 100extern int fscache_wait_atomic_t(atomic_t *);
103 101
104/* 102/*
diff --git a/fs/fscache/main.c b/fs/fscache/main.c
index 63f868e869b9..a31b83c5cbd9 100644
--- a/fs/fscache/main.c
+++ b/fs/fscache/main.c
@@ -197,24 +197,6 @@ static void __exit fscache_exit(void)
197module_exit(fscache_exit); 197module_exit(fscache_exit);
198 198
199/* 199/*
200 * wait_on_bit() sleep function for uninterruptible waiting
201 */
202int fscache_wait_bit(void *flags)
203{
204 schedule();
205 return 0;
206}
207
208/*
209 * wait_on_bit() sleep function for interruptible waiting
210 */
211int fscache_wait_bit_interruptible(void *flags)
212{
213 schedule();
214 return signal_pending(current);
215}
216
217/*
218 * wait_on_atomic_t() sleep function for uninterruptible waiting 200 * wait_on_atomic_t() sleep function for uninterruptible waiting
219 */ 201 */
220int fscache_wait_atomic_t(atomic_t *p) 202int fscache_wait_atomic_t(atomic_t *p)
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index ed70714503fa..85332b9d19d1 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -298,7 +298,6 @@ int fscache_wait_for_deferred_lookup(struct fscache_cookie *cookie)
298 298
299 jif = jiffies; 299 jif = jiffies;
300 if (wait_on_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP, 300 if (wait_on_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP,
301 fscache_wait_bit_interruptible,
302 TASK_INTERRUPTIBLE) != 0) { 301 TASK_INTERRUPTIBLE) != 0) {
303 fscache_stat(&fscache_n_retrievals_intr); 302 fscache_stat(&fscache_n_retrievals_intr);
304 _leave(" = -ERESTARTSYS"); 303 _leave(" = -ERESTARTSYS");
@@ -342,7 +341,6 @@ int fscache_wait_for_operation_activation(struct fscache_object *object,
342 if (stat_op_waits) 341 if (stat_op_waits)
343 fscache_stat(stat_op_waits); 342 fscache_stat(stat_op_waits);
344 if (wait_on_bit(&op->flags, FSCACHE_OP_WAITING, 343 if (wait_on_bit(&op->flags, FSCACHE_OP_WAITING,
345 fscache_wait_bit_interruptible,
346 TASK_INTERRUPTIBLE) != 0) { 344 TASK_INTERRUPTIBLE) != 0) {
347 ret = fscache_cancel_op(op, do_cancel); 345 ret = fscache_cancel_op(op, do_cancel);
348 if (ret == 0) 346 if (ret == 0)
@@ -351,7 +349,7 @@ int fscache_wait_for_operation_activation(struct fscache_object *object,
351 /* it's been removed from the pending queue by another party, 349 /* it's been removed from the pending queue by another party,
352 * so we should get to run shortly */ 350 * so we should get to run shortly */
353 wait_on_bit(&op->flags, FSCACHE_OP_WAITING, 351 wait_on_bit(&op->flags, FSCACHE_OP_WAITING,
354 fscache_wait_bit, TASK_UNINTERRUPTIBLE); 352 TASK_UNINTERRUPTIBLE);
355 } 353 }
356 _debug("<<< GO"); 354 _debug("<<< GO");
357 355
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index ee4e04fe60fc..7f513b1ceb2c 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -856,27 +856,6 @@ void gfs2_holder_uninit(struct gfs2_holder *gh)
856} 856}
857 857
858/** 858/**
859 * gfs2_glock_holder_wait
860 * @word: unused
861 *
862 * This function and gfs2_glock_demote_wait both show up in the WCHAN
863 * field. Thus I've separated these otherwise identical functions in
864 * order to be more informative to the user.
865 */
866
867static int gfs2_glock_holder_wait(void *word)
868{
869 schedule();
870 return 0;
871}
872
873static int gfs2_glock_demote_wait(void *word)
874{
875 schedule();
876 return 0;
877}
878
879/**
880 * gfs2_glock_wait - wait on a glock acquisition 859 * gfs2_glock_wait - wait on a glock acquisition
881 * @gh: the glock holder 860 * @gh: the glock holder
882 * 861 *
@@ -888,7 +867,7 @@ int gfs2_glock_wait(struct gfs2_holder *gh)
888 unsigned long time1 = jiffies; 867 unsigned long time1 = jiffies;
889 868
890 might_sleep(); 869 might_sleep();
891 wait_on_bit(&gh->gh_iflags, HIF_WAIT, gfs2_glock_holder_wait, TASK_UNINTERRUPTIBLE); 870 wait_on_bit(&gh->gh_iflags, HIF_WAIT, TASK_UNINTERRUPTIBLE);
892 if (time_after(jiffies, time1 + HZ)) /* have we waited > a second? */ 871 if (time_after(jiffies, time1 + HZ)) /* have we waited > a second? */
893 /* Lengthen the minimum hold time. */ 872 /* Lengthen the minimum hold time. */
894 gh->gh_gl->gl_hold_time = min(gh->gh_gl->gl_hold_time + 873 gh->gh_gl->gl_hold_time = min(gh->gh_gl->gl_hold_time +
@@ -1128,7 +1107,7 @@ void gfs2_glock_dq_wait(struct gfs2_holder *gh)
1128 struct gfs2_glock *gl = gh->gh_gl; 1107 struct gfs2_glock *gl = gh->gh_gl;
1129 gfs2_glock_dq(gh); 1108 gfs2_glock_dq(gh);
1130 might_sleep(); 1109 might_sleep();
1131 wait_on_bit(&gl->gl_flags, GLF_DEMOTE, gfs2_glock_demote_wait, TASK_UNINTERRUPTIBLE); 1110 wait_on_bit(&gl->gl_flags, GLF_DEMOTE, TASK_UNINTERRUPTIBLE);
1132} 1111}
1133 1112
1134/** 1113/**
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 4fafea1c9ecf..641383a9c1bb 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -936,12 +936,6 @@ fail:
936 return error; 936 return error;
937} 937}
938 938
939static int dlm_recovery_wait(void *word)
940{
941 schedule();
942 return 0;
943}
944
945static int control_first_done(struct gfs2_sbd *sdp) 939static int control_first_done(struct gfs2_sbd *sdp)
946{ 940{
947 struct lm_lockstruct *ls = &sdp->sd_lockstruct; 941 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
@@ -976,7 +970,7 @@ restart:
976 fs_info(sdp, "control_first_done wait gen %u\n", start_gen); 970 fs_info(sdp, "control_first_done wait gen %u\n", start_gen);
977 971
978 wait_on_bit(&ls->ls_recover_flags, DFL_DLM_RECOVERY, 972 wait_on_bit(&ls->ls_recover_flags, DFL_DLM_RECOVERY,
979 dlm_recovery_wait, TASK_UNINTERRUPTIBLE); 973 TASK_UNINTERRUPTIBLE);
980 goto restart; 974 goto restart;
981 } 975 }
982 976
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index bc564c0d6d16..d3eae244076e 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1024,20 +1024,13 @@ void gfs2_lm_unmount(struct gfs2_sbd *sdp)
1024 lm->lm_unmount(sdp); 1024 lm->lm_unmount(sdp);
1025} 1025}
1026 1026
1027static int gfs2_journalid_wait(void *word)
1028{
1029 if (signal_pending(current))
1030 return -EINTR;
1031 schedule();
1032 return 0;
1033}
1034
1035static int wait_on_journal(struct gfs2_sbd *sdp) 1027static int wait_on_journal(struct gfs2_sbd *sdp)
1036{ 1028{
1037 if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) 1029 if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL)
1038 return 0; 1030 return 0;
1039 1031
1040 return wait_on_bit(&sdp->sd_flags, SDF_NOJOURNALID, gfs2_journalid_wait, TASK_INTERRUPTIBLE); 1032 return wait_on_bit(&sdp->sd_flags, SDF_NOJOURNALID, TASK_INTERRUPTIBLE)
1033 ? -EINTR : 0;
1041} 1034}
1042 1035
1043void gfs2_online_uevent(struct gfs2_sbd *sdp) 1036void gfs2_online_uevent(struct gfs2_sbd *sdp)
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 94555d4c5698..573bd3b758fa 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -591,12 +591,6 @@ done:
591 wake_up_bit(&jd->jd_flags, JDF_RECOVERY); 591 wake_up_bit(&jd->jd_flags, JDF_RECOVERY);
592} 592}
593 593
594static int gfs2_recovery_wait(void *word)
595{
596 schedule();
597 return 0;
598}
599
600int gfs2_recover_journal(struct gfs2_jdesc *jd, bool wait) 594int gfs2_recover_journal(struct gfs2_jdesc *jd, bool wait)
601{ 595{
602 int rv; 596 int rv;
@@ -609,7 +603,7 @@ int gfs2_recover_journal(struct gfs2_jdesc *jd, bool wait)
609 BUG_ON(!rv); 603 BUG_ON(!rv);
610 604
611 if (wait) 605 if (wait)
612 wait_on_bit(&jd->jd_flags, JDF_RECOVERY, gfs2_recovery_wait, 606 wait_on_bit(&jd->jd_flags, JDF_RECOVERY,
613 TASK_UNINTERRUPTIBLE); 607 TASK_UNINTERRUPTIBLE);
614 608
615 return wait ? jd->jd_recover_error : 0; 609 return wait ? jd->jd_recover_error : 0;
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 1319b5c4ec68..2607ff13d486 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -864,12 +864,6 @@ static int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
864 return error; 864 return error;
865} 865}
866 866
867static int gfs2_umount_recovery_wait(void *word)
868{
869 schedule();
870 return 0;
871}
872
873/** 867/**
874 * gfs2_put_super - Unmount the filesystem 868 * gfs2_put_super - Unmount the filesystem
875 * @sb: The VFS superblock 869 * @sb: The VFS superblock
@@ -894,7 +888,7 @@ restart:
894 continue; 888 continue;
895 spin_unlock(&sdp->sd_jindex_spin); 889 spin_unlock(&sdp->sd_jindex_spin);
896 wait_on_bit(&jd->jd_flags, JDF_RECOVERY, 890 wait_on_bit(&jd->jd_flags, JDF_RECOVERY,
897 gfs2_umount_recovery_wait, TASK_UNINTERRUPTIBLE); 891 TASK_UNINTERRUPTIBLE);
898 goto restart; 892 goto restart;
899 } 893 }
900 spin_unlock(&sdp->sd_jindex_spin); 894 spin_unlock(&sdp->sd_jindex_spin);
diff --git a/fs/inode.c b/fs/inode.c
index 6eecb7ff0b9a..5938f3928944 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1695,13 +1695,6 @@ int inode_needs_sync(struct inode *inode)
1695} 1695}
1696EXPORT_SYMBOL(inode_needs_sync); 1696EXPORT_SYMBOL(inode_needs_sync);
1697 1697
1698int inode_wait(void *word)
1699{
1700 schedule();
1701 return 0;
1702}
1703EXPORT_SYMBOL(inode_wait);
1704
1705/* 1698/*
1706 * If we try to find an inode in the inode hash while it is being 1699 * If we try to find an inode in the inode hash while it is being
1707 * deleted, we have to wait until the filesystem completes its 1700 * deleted, we have to wait until the filesystem completes its
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 6f0f590cc5a3..5f09370c90a8 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -763,12 +763,6 @@ static void warn_dirty_buffer(struct buffer_head *bh)
763 bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr); 763 bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
764} 764}
765 765
766static int sleep_on_shadow_bh(void *word)
767{
768 io_schedule();
769 return 0;
770}
771
772/* 766/*
773 * If the buffer is already part of the current transaction, then there 767 * If the buffer is already part of the current transaction, then there
774 * is nothing we need to do. If it is already part of a prior 768 * is nothing we need to do. If it is already part of a prior
@@ -906,8 +900,8 @@ repeat:
906 if (buffer_shadow(bh)) { 900 if (buffer_shadow(bh)) {
907 JBUFFER_TRACE(jh, "on shadow: sleep"); 901 JBUFFER_TRACE(jh, "on shadow: sleep");
908 jbd_unlock_bh_state(bh); 902 jbd_unlock_bh_state(bh);
909 wait_on_bit(&bh->b_state, BH_Shadow, 903 wait_on_bit_io(&bh->b_state, BH_Shadow,
910 sleep_on_shadow_bh, TASK_UNINTERRUPTIBLE); 904 TASK_UNINTERRUPTIBLE);
911 goto repeat; 905 goto repeat;
912 } 906 }
913 907
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 4042ff58fe3f..524dd80d1898 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -361,8 +361,8 @@ start:
361 * Prevent starvation issues if someone is doing a consistency 361 * Prevent starvation issues if someone is doing a consistency
362 * sync-to-disk 362 * sync-to-disk
363 */ 363 */
364 ret = wait_on_bit(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING, 364 ret = wait_on_bit_action(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING,
365 nfs_wait_bit_killable, TASK_KILLABLE); 365 nfs_wait_bit_killable, TASK_KILLABLE);
366 if (ret) 366 if (ret)
367 return ret; 367 return ret;
368 368
diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c
index 44bf0140a4c7..e2a0361e24c6 100644
--- a/fs/nfs/filelayout/filelayoutdev.c
+++ b/fs/nfs/filelayout/filelayoutdev.c
@@ -783,8 +783,8 @@ nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j)
783static void nfs4_wait_ds_connect(struct nfs4_pnfs_ds *ds) 783static void nfs4_wait_ds_connect(struct nfs4_pnfs_ds *ds)
784{ 784{
785 might_sleep(); 785 might_sleep();
786 wait_on_bit(&ds->ds_state, NFS4DS_CONNECTING, 786 wait_on_bit_action(&ds->ds_state, NFS4DS_CONNECTING,
787 nfs_wait_bit_killable, TASK_KILLABLE); 787 nfs_wait_bit_killable, TASK_KILLABLE);
788} 788}
789 789
790static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds) 790static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds)
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 9927913c97c2..abd37a380535 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -75,7 +75,7 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fattr)
75 * nfs_wait_bit_killable - helper for functions that are sleeping on bit locks 75 * nfs_wait_bit_killable - helper for functions that are sleeping on bit locks
76 * @word: long word containing the bit lock 76 * @word: long word containing the bit lock
77 */ 77 */
78int nfs_wait_bit_killable(void *word) 78int nfs_wait_bit_killable(struct wait_bit_key *key)
79{ 79{
80 if (fatal_signal_pending(current)) 80 if (fatal_signal_pending(current))
81 return -ERESTARTSYS; 81 return -ERESTARTSYS;
@@ -1074,8 +1074,8 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
1074 * the bit lock here if it looks like we're going to be doing that. 1074 * the bit lock here if it looks like we're going to be doing that.
1075 */ 1075 */
1076 for (;;) { 1076 for (;;) {
1077 ret = wait_on_bit(bitlock, NFS_INO_INVALIDATING, 1077 ret = wait_on_bit_action(bitlock, NFS_INO_INVALIDATING,
1078 nfs_wait_bit_killable, TASK_KILLABLE); 1078 nfs_wait_bit_killable, TASK_KILLABLE);
1079 if (ret) 1079 if (ret)
1080 goto out; 1080 goto out;
1081 spin_lock(&inode->i_lock); 1081 spin_lock(&inode->i_lock);
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index f415cbf9f6c3..617f36611d4a 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -348,7 +348,7 @@ extern int nfs_drop_inode(struct inode *);
348extern void nfs_clear_inode(struct inode *); 348extern void nfs_clear_inode(struct inode *);
349extern void nfs_evict_inode(struct inode *); 349extern void nfs_evict_inode(struct inode *);
350void nfs_zap_acl_cache(struct inode *inode); 350void nfs_zap_acl_cache(struct inode *inode);
351extern int nfs_wait_bit_killable(void *word); 351extern int nfs_wait_bit_killable(struct wait_bit_key *key);
352 352
353/* super.c */ 353/* super.c */
354extern const struct super_operations nfs_sops; 354extern const struct super_operations nfs_sops;
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 848f6853c59e..42f121182167 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1251,8 +1251,8 @@ int nfs4_wait_clnt_recover(struct nfs_client *clp)
1251 might_sleep(); 1251 might_sleep();
1252 1252
1253 atomic_inc(&clp->cl_count); 1253 atomic_inc(&clp->cl_count);
1254 res = wait_on_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING, 1254 res = wait_on_bit_action(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING,
1255 nfs_wait_bit_killable, TASK_KILLABLE); 1255 nfs_wait_bit_killable, TASK_KILLABLE);
1256 if (res) 1256 if (res)
1257 goto out; 1257 goto out;
1258 if (clp->cl_cons_state < 0) 1258 if (clp->cl_cons_state < 0)
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 17fab89f6358..0be5050638f7 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -115,7 +115,7 @@ __nfs_iocounter_wait(struct nfs_io_counter *c)
115 set_bit(NFS_IO_INPROGRESS, &c->flags); 115 set_bit(NFS_IO_INPROGRESS, &c->flags);
116 if (atomic_read(&c->io_count) == 0) 116 if (atomic_read(&c->io_count) == 0)
117 break; 117 break;
118 ret = nfs_wait_bit_killable(&c->flags); 118 ret = nfs_wait_bit_killable(&q.key);
119 } while (atomic_read(&c->io_count) != 0); 119 } while (atomic_read(&c->io_count) != 0);
120 finish_wait(wq, &q.wait); 120 finish_wait(wq, &q.wait);
121 return ret; 121 return ret;
@@ -136,12 +136,6 @@ nfs_iocounter_wait(struct nfs_io_counter *c)
136 return __nfs_iocounter_wait(c); 136 return __nfs_iocounter_wait(c);
137} 137}
138 138
139static int nfs_wait_bit_uninterruptible(void *word)
140{
141 io_schedule();
142 return 0;
143}
144
145/* 139/*
146 * nfs_page_group_lock - lock the head of the page group 140 * nfs_page_group_lock - lock the head of the page group
147 * @req - request in group that is to be locked 141 * @req - request in group that is to be locked
@@ -156,7 +150,6 @@ nfs_page_group_lock(struct nfs_page *req)
156 WARN_ON_ONCE(head != head->wb_head); 150 WARN_ON_ONCE(head != head->wb_head);
157 151
158 wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK, 152 wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK,
159 nfs_wait_bit_uninterruptible,
160 TASK_UNINTERRUPTIBLE); 153 TASK_UNINTERRUPTIBLE);
161} 154}
162 155
@@ -435,9 +428,8 @@ void nfs_release_request(struct nfs_page *req)
435int 428int
436nfs_wait_on_request(struct nfs_page *req) 429nfs_wait_on_request(struct nfs_page *req)
437{ 430{
438 return wait_on_bit(&req->wb_flags, PG_BUSY, 431 return wait_on_bit_io(&req->wb_flags, PG_BUSY,
439 nfs_wait_bit_uninterruptible, 432 TASK_UNINTERRUPTIBLE);
440 TASK_UNINTERRUPTIBLE);
441} 433}
442 434
443/* 435/*
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 6fdcd233d6f7..a8914b335617 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1885,7 +1885,7 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
1885 if (test_and_set_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags)) { 1885 if (test_and_set_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags)) {
1886 if (!sync) 1886 if (!sync)
1887 goto out; 1887 goto out;
1888 status = wait_on_bit_lock(&nfsi->flags, 1888 status = wait_on_bit_lock_action(&nfsi->flags,
1889 NFS_INO_LAYOUTCOMMITTING, 1889 NFS_INO_LAYOUTCOMMITTING,
1890 nfs_wait_bit_killable, 1890 nfs_wait_bit_killable,
1891 TASK_KILLABLE); 1891 TASK_KILLABLE);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 5e2f10304548..962c9ee758be 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -623,7 +623,7 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
623 int err; 623 int err;
624 624
625 /* Stop dirtying of new pages while we sync */ 625 /* Stop dirtying of new pages while we sync */
626 err = wait_on_bit_lock(bitlock, NFS_INO_FLUSHING, 626 err = wait_on_bit_lock_action(bitlock, NFS_INO_FLUSHING,
627 nfs_wait_bit_killable, TASK_KILLABLE); 627 nfs_wait_bit_killable, TASK_KILLABLE);
628 if (err) 628 if (err)
629 goto out_err; 629 goto out_err;
@@ -1703,7 +1703,7 @@ int nfs_commit_inode(struct inode *inode, int how)
1703 return error; 1703 return error;
1704 if (!may_wait) 1704 if (!may_wait)
1705 goto out_mark_dirty; 1705 goto out_mark_dirty;
1706 error = wait_on_bit(&NFS_I(inode)->flags, 1706 error = wait_on_bit_action(&NFS_I(inode)->flags,
1707 NFS_INO_COMMIT, 1707 NFS_INO_COMMIT,
1708 nfs_wait_bit_killable, 1708 nfs_wait_bit_killable,
1709 TASK_KILLABLE); 1709 TASK_KILLABLE);
diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h
index 19ae05d4b8ec..bf9422c3aefe 100644
--- a/include/linux/irq_work.h
+++ b/include/linux/irq_work.h
@@ -33,6 +33,11 @@ void init_irq_work(struct irq_work *work, void (*func)(struct irq_work *))
33#define DEFINE_IRQ_WORK(name, _f) struct irq_work name = { .func = (_f), } 33#define DEFINE_IRQ_WORK(name, _f) struct irq_work name = { .func = (_f), }
34 34
35bool irq_work_queue(struct irq_work *work); 35bool irq_work_queue(struct irq_work *work);
36
37#ifdef CONFIG_SMP
38bool irq_work_queue_on(struct irq_work *work, int cpu);
39#endif
40
36void irq_work_run(void); 41void irq_work_run(void);
37void irq_work_sync(struct irq_work *work); 42void irq_work_sync(struct irq_work *work);
38 43
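
A short usage sketch of the new remote-queueing helper (my_work, my_work_fn and kick_remote_cpu are hypothetical). Per this header and the kernel/irq_work.c hunk further down, irq_work_queue_on() exists only under CONFIG_SMP, returns false if the work is already pending, and warns if the target CPU is offline or if it is called from NMI context:

	#include <linux/irq_work.h>
	#include <linux/printk.h>

	static void my_work_fn(struct irq_work *work)
	{
		/* runs from irq_work/IPI context on the target CPU */
	}

	static DEFINE_IRQ_WORK(my_work, my_work_fn);

	static void kick_remote_cpu(int cpu)
	{
		/* queue on a specific CPU instead of the local one */
		if (!irq_work_queue_on(&my_work, cpu))
			pr_debug("my_work was already pending\n");
	}
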
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b39a671cfd59..42cac4dc2157 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1437,8 +1437,6 @@ struct task_struct {
1437 struct rb_node *pi_waiters_leftmost; 1437 struct rb_node *pi_waiters_leftmost;
1438 /* Deadlock detection and priority inheritance handling */ 1438 /* Deadlock detection and priority inheritance handling */
1439 struct rt_mutex_waiter *pi_blocked_on; 1439 struct rt_mutex_waiter *pi_blocked_on;
1440 /* Top pi_waiters task */
1441 struct task_struct *pi_top_task;
1442#endif 1440#endif
1443 1441
1444#ifdef CONFIG_DEBUG_MUTEXES 1442#ifdef CONFIG_DEBUG_MUTEXES
@@ -2782,7 +2780,7 @@ static inline bool __must_check current_set_polling_and_test(void)
2782 2780
2783 /* 2781 /*
2784 * Polling state must be visible before we test NEED_RESCHED, 2782 * Polling state must be visible before we test NEED_RESCHED,
2785 * paired by resched_task() 2783 * paired by resched_curr()
2786 */ 2784 */
2787 smp_mb__after_atomic(); 2785 smp_mb__after_atomic();
2788 2786
@@ -2800,7 +2798,7 @@ static inline bool __must_check current_clr_polling_and_test(void)
2800 2798
2801 /* 2799 /*
2802 * Polling state must be visible before we test NEED_RESCHED, 2800 * Polling state must be visible before we test NEED_RESCHED,
2803 * paired by resched_task() 2801 * paired by resched_curr()
2804 */ 2802 */
2805 smp_mb__after_atomic(); 2803 smp_mb__after_atomic();
2806 2804
@@ -2832,7 +2830,7 @@ static inline void current_clr_polling(void)
2832 * TIF_NEED_RESCHED and the IPI handler, scheduler_ipi(), will also 2830 * TIF_NEED_RESCHED and the IPI handler, scheduler_ipi(), will also
2833 * fold. 2831 * fold.
2834 */ 2832 */
2835 smp_mb(); /* paired with resched_task() */ 2833 smp_mb(); /* paired with resched_curr() */
2836 2834
2837 preempt_fold_need_resched(); 2835 preempt_fold_need_resched();
2838} 2836}
diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index ad7dbe2cfecd..1a8959944c5f 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -236,7 +236,7 @@ void * rpc_malloc(struct rpc_task *, size_t);
236void rpc_free(void *); 236void rpc_free(void *);
237int rpciod_up(void); 237int rpciod_up(void);
238void rpciod_down(void); 238void rpciod_down(void);
239int __rpc_wait_for_completion_task(struct rpc_task *task, int (*)(void *)); 239int __rpc_wait_for_completion_task(struct rpc_task *task, wait_bit_action_f *);
240#ifdef RPC_DEBUG 240#ifdef RPC_DEBUG
241struct net; 241struct net;
242void rpc_show_tasks(struct net *); 242void rpc_show_tasks(struct net *);
diff --git a/include/linux/tick.h b/include/linux/tick.h
index 06cc093ab7ad..059052306831 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -183,7 +183,13 @@ static inline bool tick_nohz_full_cpu(int cpu)
183 183
184extern void tick_nohz_init(void); 184extern void tick_nohz_init(void);
185extern void __tick_nohz_full_check(void); 185extern void __tick_nohz_full_check(void);
186extern void tick_nohz_full_kick(void); 186extern void tick_nohz_full_kick_cpu(int cpu);
187
188static inline void tick_nohz_full_kick(void)
189{
190 tick_nohz_full_kick_cpu(smp_processor_id());
191}
192
187extern void tick_nohz_full_kick_all(void); 193extern void tick_nohz_full_kick_all(void);
188extern void __tick_nohz_task_switch(struct task_struct *tsk); 194extern void __tick_nohz_task_switch(struct task_struct *tsk);
189#else 195#else
@@ -191,6 +197,7 @@ static inline void tick_nohz_init(void) { }
191static inline bool tick_nohz_full_enabled(void) { return false; } 197static inline bool tick_nohz_full_enabled(void) { return false; }
192static inline bool tick_nohz_full_cpu(int cpu) { return false; } 198static inline bool tick_nohz_full_cpu(int cpu) { return false; }
193static inline void __tick_nohz_full_check(void) { } 199static inline void __tick_nohz_full_check(void) { }
200static inline void tick_nohz_full_kick_cpu(int cpu) { }
194static inline void tick_nohz_full_kick(void) { } 201static inline void tick_nohz_full_kick(void) { }
195static inline void tick_nohz_full_kick_all(void) { } 202static inline void tick_nohz_full_kick_all(void) { }
196static inline void __tick_nohz_task_switch(struct task_struct *tsk) { } 203static inline void __tick_nohz_task_switch(struct task_struct *tsk) { }
diff --git a/include/linux/wait.h b/include/linux/wait.h
index bd68819f0815..6fb1ba5f9b2f 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -25,6 +25,7 @@ struct wait_bit_key {
25 void *flags; 25 void *flags;
26 int bit_nr; 26 int bit_nr;
27#define WAIT_ATOMIC_T_BIT_NR -1 27#define WAIT_ATOMIC_T_BIT_NR -1
28 unsigned long private;
28}; 29};
29 30
30struct wait_bit_queue { 31struct wait_bit_queue {
@@ -141,18 +142,19 @@ __remove_wait_queue(wait_queue_head_t *head, wait_queue_t *old)
141 list_del(&old->task_list); 142 list_del(&old->task_list);
142} 143}
143 144
145typedef int wait_bit_action_f(struct wait_bit_key *);
144void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key); 146void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key);
145void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key); 147void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key);
146void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr, void *key); 148void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr, void *key);
147void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr); 149void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr);
148void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr); 150void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr);
149void __wake_up_bit(wait_queue_head_t *, void *, int); 151void __wake_up_bit(wait_queue_head_t *, void *, int);
150int __wait_on_bit(wait_queue_head_t *, struct wait_bit_queue *, int (*)(void *), unsigned); 152int __wait_on_bit(wait_queue_head_t *, struct wait_bit_queue *, wait_bit_action_f *, unsigned);
151int __wait_on_bit_lock(wait_queue_head_t *, struct wait_bit_queue *, int (*)(void *), unsigned); 153int __wait_on_bit_lock(wait_queue_head_t *, struct wait_bit_queue *, wait_bit_action_f *, unsigned);
152void wake_up_bit(void *, int); 154void wake_up_bit(void *, int);
153void wake_up_atomic_t(atomic_t *); 155void wake_up_atomic_t(atomic_t *);
154int out_of_line_wait_on_bit(void *, int, int (*)(void *), unsigned); 156int out_of_line_wait_on_bit(void *, int, wait_bit_action_f *, unsigned);
155int out_of_line_wait_on_bit_lock(void *, int, int (*)(void *), unsigned); 157int out_of_line_wait_on_bit_lock(void *, int, wait_bit_action_f *, unsigned);
156int out_of_line_wait_on_atomic_t(atomic_t *, int (*)(atomic_t *), unsigned); 158int out_of_line_wait_on_atomic_t(atomic_t *, int (*)(atomic_t *), unsigned);
157wait_queue_head_t *bit_waitqueue(void *, int); 159wait_queue_head_t *bit_waitqueue(void *, int);
158 160
@@ -854,11 +856,14 @@ int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key);
854 (wait)->flags = 0; \ 856 (wait)->flags = 0; \
855 } while (0) 857 } while (0)
856 858
859
860extern int bit_wait(struct wait_bit_key *);
861extern int bit_wait_io(struct wait_bit_key *);
862
857/** 863/**
858 * wait_on_bit - wait for a bit to be cleared 864 * wait_on_bit - wait for a bit to be cleared
859 * @word: the word being waited on, a kernel virtual address 865 * @word: the word being waited on, a kernel virtual address
860 * @bit: the bit of the word being waited on 866 * @bit: the bit of the word being waited on
861 * @action: the function used to sleep, which may take special actions
862 * @mode: the task state to sleep in 867 * @mode: the task state to sleep in
863 * 868 *
864 * There is a standard hashed waitqueue table for generic use. This 869 * There is a standard hashed waitqueue table for generic use. This
@@ -867,9 +872,62 @@ int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key);
867 * call wait_on_bit() in threads waiting for the bit to clear. 872 * call wait_on_bit() in threads waiting for the bit to clear.
868 * One uses wait_on_bit() where one is waiting for the bit to clear, 873 * One uses wait_on_bit() where one is waiting for the bit to clear,
869 * but has no intention of setting it. 874 * but has no intention of setting it.
875 * Returned value will be zero if the bit was cleared, or non-zero
876 * if the process received a signal and the mode permitted wakeup
877 * on that signal.
878 */
879static inline int
880wait_on_bit(void *word, int bit, unsigned mode)
881{
882 if (!test_bit(bit, word))
883 return 0;
884 return out_of_line_wait_on_bit(word, bit,
885 bit_wait,
886 mode);
887}
888
889/**
890 * wait_on_bit_io - wait for a bit to be cleared
891 * @word: the word being waited on, a kernel virtual address
892 * @bit: the bit of the word being waited on
893 * @mode: the task state to sleep in
894 *
895 * Use the standard hashed waitqueue table to wait for a bit
896 * to be cleared. This is similar to wait_on_bit(), but calls
897 * io_schedule() instead of schedule() for the actual waiting.
898 *
899 * Returned value will be zero if the bit was cleared, or non-zero
900 * if the process received a signal and the mode permitted wakeup
901 * on that signal.
902 */
903static inline int
904wait_on_bit_io(void *word, int bit, unsigned mode)
905{
906 if (!test_bit(bit, word))
907 return 0;
908 return out_of_line_wait_on_bit(word, bit,
909 bit_wait_io,
910 mode);
911}
912
913/**
914 * wait_on_bit_action - wait for a bit to be cleared
915 * @word: the word being waited on, a kernel virtual address
916 * @bit: the bit of the word being waited on
917 * @action: the function used to sleep, which may take special actions
918 * @mode: the task state to sleep in
919 *
920 * Use the standard hashed waitqueue table to wait for a bit
921 * to be cleared, and allow the waiting action to be specified.
922 * This is like wait_on_bit() but allows fine control of how the waiting
923 * is done.
924 *
925 * Returned value will be zero if the bit was cleared, or non-zero
926 * if the process received a signal and the mode permitted wakeup
927 * on that signal.
870 */ 928 */
871static inline int 929static inline int
872wait_on_bit(void *word, int bit, int (*action)(void *), unsigned mode) 930wait_on_bit_action(void *word, int bit, wait_bit_action_f *action, unsigned mode)
873{ 931{
874 if (!test_bit(bit, word)) 932 if (!test_bit(bit, word))
875 return 0; 933 return 0;
@@ -880,7 +938,6 @@ wait_on_bit(void *word, int bit, int (*action)(void *), unsigned mode)
880 * wait_on_bit_lock - wait for a bit to be cleared, when wanting to set it 938 * wait_on_bit_lock - wait for a bit to be cleared, when wanting to set it
881 * @word: the word being waited on, a kernel virtual address 939 * @word: the word being waited on, a kernel virtual address
882 * @bit: the bit of the word being waited on 940 * @bit: the bit of the word being waited on
883 * @action: the function used to sleep, which may take special actions
884 * @mode: the task state to sleep in 941 * @mode: the task state to sleep in
885 * 942 *
886 * There is a standard hashed waitqueue table for generic use. This 943 * There is a standard hashed waitqueue table for generic use. This
@@ -891,9 +948,61 @@ wait_on_bit(void *word, int bit, int (*action)(void *), unsigned mode)
891 * wait_on_bit() in threads waiting to be able to set the bit. 948 * wait_on_bit() in threads waiting to be able to set the bit.
892 * One uses wait_on_bit_lock() where one is waiting for the bit to 949 * One uses wait_on_bit_lock() where one is waiting for the bit to
893 * clear with the intention of setting it, and when done, clearing it. 950 * clear with the intention of setting it, and when done, clearing it.
951 *
952 * Returns zero if the bit was (eventually) found to be clear and was
953 * set. Returns non-zero if a signal was delivered to the process and
954 * the @mode allows that signal to wake the process.
955 */
956static inline int
957wait_on_bit_lock(void *word, int bit, unsigned mode)
958{
959 if (!test_and_set_bit(bit, word))
960 return 0;
961 return out_of_line_wait_on_bit_lock(word, bit, bit_wait, mode);
962}
963
964/**
965 * wait_on_bit_lock_io - wait for a bit to be cleared, when wanting to set it
966 * @word: the word being waited on, a kernel virtual address
967 * @bit: the bit of the word being waited on
968 * @mode: the task state to sleep in
969 *
970 * Use the standard hashed waitqueue table to wait for a bit
971 * to be cleared and then to atomically set it. This is similar
972 * to wait_on_bit(), but calls io_schedule() instead of schedule()
973 * for the actual waiting.
974 *
975 * Returns zero if the bit was (eventually) found to be clear and was
976 * set. Returns non-zero if a signal was delivered to the process and
977 * the @mode allows that signal to wake the process.
978 */
979static inline int
980wait_on_bit_lock_io(void *word, int bit, unsigned mode)
981{
982 if (!test_and_set_bit(bit, word))
983 return 0;
984 return out_of_line_wait_on_bit_lock(word, bit, bit_wait_io, mode);
985}
986
987/**
988 * wait_on_bit_lock_action - wait for a bit to be cleared, when wanting to set it
989 * @word: the word being waited on, a kernel virtual address
990 * @bit: the bit of the word being waited on
991 * @action: the function used to sleep, which may take special actions
992 * @mode: the task state to sleep in
993 *
994 * Use the standard hashed waitqueue table to wait for a bit
995 * to be cleared and then to set it, and allow the waiting action
996 * to be specified.
997 * This is like wait_on_bit() but allows fine control of how the waiting
998 * is done.
999 *
1000 * Returns zero if the bit was (eventually) found to be clear and was
1001 * set. Returns non-zero if a signal was delivered to the process and
1002 * the @mode allows that signal to wake the process.
894 */ 1003 */
895static inline int 1004static inline int
896wait_on_bit_lock(void *word, int bit, int (*action)(void *), unsigned mode) 1005wait_on_bit_lock_action(void *word, int bit, wait_bit_action_f *action, unsigned mode)
897{ 1006{
898 if (!test_and_set_bit(bit, word)) 1007 if (!test_and_set_bit(bit, word))
899 return 0; 1008 return 0;
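
The two default actions declared above are defined in kernel/sched/wait.c (a hunk outside this excerpt); they are assumed to be simply the generic counterparts of the per-subsystem sleep helpers deleted elsewhere in the patch, roughly:

	int bit_wait(struct wait_bit_key *word)
	{
		schedule();
		return 0;
	}

	int bit_wait_io(struct wait_bit_key *word)
	{
		io_schedule();
		return 0;
	}
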
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 5777c13849ba..a219be961c0a 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -90,7 +90,6 @@ struct writeback_control {
90 * fs/fs-writeback.c 90 * fs/fs-writeback.c
91 */ 91 */
92struct bdi_writeback; 92struct bdi_writeback;
93int inode_wait(void *);
94void writeback_inodes_sb(struct super_block *, enum wb_reason reason); 93void writeback_inodes_sb(struct super_block *, enum wb_reason reason);
95void writeback_inodes_sb_nr(struct super_block *, unsigned long nr, 94void writeback_inodes_sb_nr(struct super_block *, unsigned long nr,
96 enum wb_reason reason); 95 enum wb_reason reason);
@@ -105,7 +104,7 @@ void inode_wait_for_writeback(struct inode *inode);
105static inline void wait_on_inode(struct inode *inode) 104static inline void wait_on_inode(struct inode *inode)
106{ 105{
107 might_sleep(); 106 might_sleep();
108 wait_on_bit(&inode->i_state, __I_NEW, inode_wait, TASK_UNINTERRUPTIBLE); 107 wait_on_bit(&inode->i_state, __I_NEW, TASK_UNINTERRUPTIBLE);
109} 108}
110 109
111/* 110/*
diff --git a/kernel/cpu.c b/kernel/cpu.c
index a343bde710b1..81e2a388a0f6 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -274,21 +274,28 @@ void clear_tasks_mm_cpumask(int cpu)
274 rcu_read_unlock(); 274 rcu_read_unlock();
275} 275}
276 276
277static inline void check_for_tasks(int cpu) 277static inline void check_for_tasks(int dead_cpu)
278{ 278{
279 struct task_struct *p; 279 struct task_struct *g, *p;
280 cputime_t utime, stime;
281 280
282 write_lock_irq(&tasklist_lock); 281 read_lock_irq(&tasklist_lock);
283 for_each_process(p) { 282 do_each_thread(g, p) {
284 task_cputime(p, &utime, &stime); 283 if (!p->on_rq)
285 if (task_cpu(p) == cpu && p->state == TASK_RUNNING && 284 continue;
286 (utime || stime)) 285 /*
287 pr_warn("Task %s (pid = %d) is on cpu %d (state = %ld, flags = %x)\n", 286 * We do the check with unlocked task_rq(p)->lock.
288 p->comm, task_pid_nr(p), cpu, 287 * Order the reading to do not warn about a task,
289 p->state, p->flags); 288 * which was running on this cpu in the past, and
290 } 289 * it's just been woken on another cpu.
291 write_unlock_irq(&tasklist_lock); 290 */
291 rmb();
292 if (task_cpu(p) != dead_cpu)
293 continue;
294
295 pr_warn("Task %s (pid=%d) is on cpu %d (state=%ld, flags=%x)\n",
296 p->comm, task_pid_nr(p), dead_cpu, p->state, p->flags);
297 } while_each_thread(g, p);
298 read_unlock_irq(&tasklist_lock);
292} 299}
293 300
294struct take_cpu_down_param { 301struct take_cpu_down_param {
diff --git a/kernel/fork.c b/kernel/fork.c
index 6a13c46cd87d..962885edbe53 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1095,7 +1095,6 @@ static void rt_mutex_init_task(struct task_struct *p)
1095 p->pi_waiters = RB_ROOT; 1095 p->pi_waiters = RB_ROOT;
1096 p->pi_waiters_leftmost = NULL; 1096 p->pi_waiters_leftmost = NULL;
1097 p->pi_blocked_on = NULL; 1097 p->pi_blocked_on = NULL;
1098 p->pi_top_task = NULL;
1099#endif 1098#endif
1100} 1099}
1101 1100
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index a82170e2fa78..e6bcbe756663 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -16,11 +16,12 @@
16#include <linux/tick.h> 16#include <linux/tick.h>
17#include <linux/cpu.h> 17#include <linux/cpu.h>
18#include <linux/notifier.h> 18#include <linux/notifier.h>
19#include <linux/smp.h>
19#include <asm/processor.h> 20#include <asm/processor.h>
20 21
21 22
22static DEFINE_PER_CPU(struct llist_head, irq_work_list); 23static DEFINE_PER_CPU(struct llist_head, raised_list);
23static DEFINE_PER_CPU(int, irq_work_raised); 24static DEFINE_PER_CPU(struct llist_head, lazy_list);
24 25
25/* 26/*
26 * Claim the entry so that no one else will poke at it. 27 * Claim the entry so that no one else will poke at it.
@@ -55,12 +56,34 @@ void __weak arch_irq_work_raise(void)
55 */ 56 */
56} 57}
57 58
59#ifdef CONFIG_SMP
58/* 60/*
59 * Enqueue the irq_work @entry unless it's already pending 61 * Enqueue the irq_work @work on @cpu unless it's already pending
60 * somewhere. 62 * somewhere.
61 * 63 *
62 * Can be re-enqueued while the callback is still in progress. 64 * Can be re-enqueued while the callback is still in progress.
63 */ 65 */
66bool irq_work_queue_on(struct irq_work *work, int cpu)
67{
68 /* All work should have been flushed before going offline */
69 WARN_ON_ONCE(cpu_is_offline(cpu));
70
71 /* Arch remote IPI send/receive backend aren't NMI safe */
72 WARN_ON_ONCE(in_nmi());
73
74 /* Only queue if not already pending */
75 if (!irq_work_claim(work))
76 return false;
77
78 if (llist_add(&work->llnode, &per_cpu(raised_list, cpu)))
79 arch_send_call_function_single_ipi(cpu);
80
81 return true;
82}
83EXPORT_SYMBOL_GPL(irq_work_queue_on);
84#endif
85
86/* Enqueue the irq work @work on the current CPU */
64bool irq_work_queue(struct irq_work *work) 87bool irq_work_queue(struct irq_work *work)
65{ 88{
66 /* Only queue if not already pending */ 89 /* Only queue if not already pending */
@@ -70,15 +93,13 @@ bool irq_work_queue(struct irq_work *work)
70 /* Queue the entry and raise the IPI if needed. */ 93 /* Queue the entry and raise the IPI if needed. */
71 preempt_disable(); 94 preempt_disable();
72 95
73 llist_add(&work->llnode, &__get_cpu_var(irq_work_list)); 96 /* If the work is "lazy", handle it from next tick if any */
74 97 if (work->flags & IRQ_WORK_LAZY) {
75 /* 98 if (llist_add(&work->llnode, &__get_cpu_var(lazy_list)) &&
76 * If the work is not "lazy" or the tick is stopped, raise the irq 99 tick_nohz_tick_stopped())
77 * work interrupt (if supported by the arch), otherwise, just wait 100 arch_irq_work_raise();
78 * for the next tick. 101 } else {
79 */ 102 if (llist_add(&work->llnode, &__get_cpu_var(raised_list)))
80 if (!(work->flags & IRQ_WORK_LAZY) || tick_nohz_tick_stopped()) {
81 if (!this_cpu_cmpxchg(irq_work_raised, 0, 1))
82 arch_irq_work_raise(); 103 arch_irq_work_raise();
83 } 104 }
84 105
@@ -90,10 +111,11 @@ EXPORT_SYMBOL_GPL(irq_work_queue);
90 111
91bool irq_work_needs_cpu(void) 112bool irq_work_needs_cpu(void)
92{ 113{
93 struct llist_head *this_list; 114 struct llist_head *raised, *lazy;
94 115
95 this_list = &__get_cpu_var(irq_work_list); 116 raised = &__get_cpu_var(raised_list);
96 if (llist_empty(this_list)) 117 lazy = &__get_cpu_var(lazy_list);
118 if (llist_empty(raised) && llist_empty(lazy))
97 return false; 119 return false;
98 120
99 /* All work should have been flushed before going offline */ 121 /* All work should have been flushed before going offline */
@@ -102,28 +124,18 @@ bool irq_work_needs_cpu(void)
102 return true; 124 return true;
103} 125}
104 126
105static void __irq_work_run(void) 127static void irq_work_run_list(struct llist_head *list)
106{ 128{
107 unsigned long flags; 129 unsigned long flags;
108 struct irq_work *work; 130 struct irq_work *work;
109 struct llist_head *this_list;
110 struct llist_node *llnode; 131 struct llist_node *llnode;
111 132
133 BUG_ON(!irqs_disabled());
112 134
113 /* 135 if (llist_empty(list))
114 * Reset the "raised" state right before we check the list because
115 * an NMI may enqueue after we find the list empty from the runner.
116 */
117 __this_cpu_write(irq_work_raised, 0);
118 barrier();
119
120 this_list = &__get_cpu_var(irq_work_list);
121 if (llist_empty(this_list))
122 return; 136 return;
123 137
124 BUG_ON(!irqs_disabled()); 138 llnode = llist_del_all(list);
125
126 llnode = llist_del_all(this_list);
127 while (llnode != NULL) { 139 while (llnode != NULL) {
128 work = llist_entry(llnode, struct irq_work, llnode); 140 work = llist_entry(llnode, struct irq_work, llnode);
129 141
@@ -149,13 +161,13 @@ static void __irq_work_run(void)
149} 161}
150 162
151/* 163/*
152 * Run the irq_work entries on this cpu. Requires to be ran from hardirq 164 * hotplug calls this through:
153 * context with local IRQs disabled. 165 * hotplug_cfd() -> flush_smp_call_function_queue()
154 */ 166 */
155void irq_work_run(void) 167void irq_work_run(void)
156{ 168{
157 BUG_ON(!in_irq()); 169 irq_work_run_list(&__get_cpu_var(raised_list));
158 __irq_work_run(); 170 irq_work_run_list(&__get_cpu_var(lazy_list));
159} 171}
160EXPORT_SYMBOL_GPL(irq_work_run); 172EXPORT_SYMBOL_GPL(irq_work_run);
161 173
@@ -171,35 +183,3 @@ void irq_work_sync(struct irq_work *work)
171 cpu_relax(); 183 cpu_relax();
172} 184}
173EXPORT_SYMBOL_GPL(irq_work_sync); 185EXPORT_SYMBOL_GPL(irq_work_sync);
174
175#ifdef CONFIG_HOTPLUG_CPU
176static int irq_work_cpu_notify(struct notifier_block *self,
177 unsigned long action, void *hcpu)
178{
179 long cpu = (long)hcpu;
180
181 switch (action) {
182 case CPU_DYING:
183 /* Called from stop_machine */
184 if (WARN_ON_ONCE(cpu != smp_processor_id()))
185 break;
186 __irq_work_run();
187 break;
188 default:
189 break;
190 }
191 return NOTIFY_OK;
192}
193
194static struct notifier_block cpu_notify;
195
196static __init int irq_work_init_cpu_notifier(void)
197{
198 cpu_notify.notifier_call = irq_work_cpu_notify;
199 cpu_notify.priority = 0;
200 register_cpu_notifier(&cpu_notify);
201 return 0;
202}
203device_initcall(irq_work_init_cpu_notifier);
204
205#endif /* CONFIG_HOTPLUG_CPU */
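
With the single irq_work_list split into raised_list and lazy_list, irq_work_queue() keeps its local semantics (lazy work waits for the next tick unless the tick is stopped), and the new irq_work_queue_on() lets a caller queue work on a remote CPU, at the price of a possible IPI and therefore no use from NMI context. A small usage sketch; the work item and callback are made-up names, not part of this patch:

#include <linux/irq_work.h>
#include <linux/smp.h>
#include <linux/printk.h>

/* Runs in hard-IRQ context on the CPU the work was queued for. */
static void my_irq_work_fn(struct irq_work *work)
{
	pr_info("irq_work ran on cpu %d\n", smp_processor_id());
}

static struct irq_work my_work = { .func = my_irq_work_fn };

static void kick_remote_cpu(int cpu)
{
	/* Not NMI-safe: this may send arch_send_call_function_single_ipi(). */
	if (!irq_work_queue_on(&my_work, cpu))
		pr_debug("my_work was already pending somewhere\n");
}
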
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index adf98622cb32..54e75226c2c4 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -28,12 +28,6 @@
28#include <linux/compat.h> 28#include <linux/compat.h>
29 29
30 30
31static int ptrace_trapping_sleep_fn(void *flags)
32{
33 schedule();
34 return 0;
35}
36
37/* 31/*
38 * ptrace a task: make the debugger its new parent and 32 * ptrace a task: make the debugger its new parent and
39 * move it to the ptrace list. 33 * move it to the ptrace list.
@@ -371,7 +365,7 @@ unlock_creds:
371out: 365out:
372 if (!retval) { 366 if (!retval) {
373 wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT, 367 wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT,
374 ptrace_trapping_sleep_fn, TASK_UNINTERRUPTIBLE); 368 TASK_UNINTERRUPTIBLE);
375 proc_ptrace_connector(task, PTRACE_ATTACH); 369 proc_ptrace_connector(task, PTRACE_ATTACH);
376 } 370 }
377 371
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 126f7e3f04e7..1211575a2208 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -139,6 +139,8 @@ void update_rq_clock(struct rq *rq)
139 return; 139 return;
140 140
141 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; 141 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
142 if (delta < 0)
143 return;
142 rq->clock += delta; 144 rq->clock += delta;
143 update_rq_clock_task(rq, delta); 145 update_rq_clock_task(rq, delta);
144} 146}
@@ -243,6 +245,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
243 char buf[64]; 245 char buf[64];
244 char *cmp; 246 char *cmp;
245 int i; 247 int i;
248 struct inode *inode;
246 249
247 if (cnt > 63) 250 if (cnt > 63)
248 cnt = 63; 251 cnt = 63;
@@ -253,7 +256,11 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
253 buf[cnt] = 0; 256 buf[cnt] = 0;
254 cmp = strstrip(buf); 257 cmp = strstrip(buf);
255 258
259 /* Ensure the static_key remains in a consistent state */
260 inode = file_inode(filp);
261 mutex_lock(&inode->i_mutex);
256 i = sched_feat_set(cmp); 262 i = sched_feat_set(cmp);
263 mutex_unlock(&inode->i_mutex);
257 if (i == __SCHED_FEAT_NR) 264 if (i == __SCHED_FEAT_NR)
258 return -EINVAL; 265 return -EINVAL;
259 266
@@ -587,30 +594,31 @@ static bool set_nr_if_polling(struct task_struct *p)
587#endif 594#endif
588 595
589/* 596/*
590 * resched_task - mark a task 'to be rescheduled now'. 597 * resched_curr - mark rq's current task 'to be rescheduled now'.
591 * 598 *
592 * On UP this means the setting of the need_resched flag, on SMP it 599 * On UP this means the setting of the need_resched flag, on SMP it
593 * might also involve a cross-CPU call to trigger the scheduler on 600 * might also involve a cross-CPU call to trigger the scheduler on
594 * the target CPU. 601 * the target CPU.
595 */ 602 */
596void resched_task(struct task_struct *p) 603void resched_curr(struct rq *rq)
597{ 604{
605 struct task_struct *curr = rq->curr;
598 int cpu; 606 int cpu;
599 607
600 lockdep_assert_held(&task_rq(p)->lock); 608 lockdep_assert_held(&rq->lock);
601 609
602 if (test_tsk_need_resched(p)) 610 if (test_tsk_need_resched(curr))
603 return; 611 return;
604 612
605 cpu = task_cpu(p); 613 cpu = cpu_of(rq);
606 614
607 if (cpu == smp_processor_id()) { 615 if (cpu == smp_processor_id()) {
608 set_tsk_need_resched(p); 616 set_tsk_need_resched(curr);
609 set_preempt_need_resched(); 617 set_preempt_need_resched();
610 return; 618 return;
611 } 619 }
612 620
613 if (set_nr_and_not_polling(p)) 621 if (set_nr_and_not_polling(curr))
614 smp_send_reschedule(cpu); 622 smp_send_reschedule(cpu);
615 else 623 else
616 trace_sched_wake_idle_without_ipi(cpu); 624 trace_sched_wake_idle_without_ipi(cpu);
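
resched_curr() takes the runqueue instead of a task: callers already hold rq->lock and were passing the runqueue's current task in one form or another, so the task argument only invited mismatches. A sketch of the before/after calling convention inside a scheduling-class hook; this is illustrative code that only builds inside kernel/sched/, where struct rq and resched_curr() are visible:

/* Hypothetical check_preempt_curr-style hook, shown only for the convention. */
static void my_check_preempt(struct rq *rq, struct task_struct *p, int flags)
{
	lockdep_assert_held(&rq->lock);

	if (p->prio < rq->curr->prio)
		resched_curr(rq);	/* was: resched_task(rq->curr) */
}
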
@@ -623,7 +631,7 @@ void resched_cpu(int cpu)
623 631
624 if (!raw_spin_trylock_irqsave(&rq->lock, flags)) 632 if (!raw_spin_trylock_irqsave(&rq->lock, flags))
625 return; 633 return;
626 resched_task(cpu_curr(cpu)); 634 resched_curr(rq);
627 raw_spin_unlock_irqrestore(&rq->lock, flags); 635 raw_spin_unlock_irqrestore(&rq->lock, flags);
628} 636}
629 637
@@ -684,10 +692,16 @@ static void wake_up_idle_cpu(int cpu)
684 692
685static bool wake_up_full_nohz_cpu(int cpu) 693static bool wake_up_full_nohz_cpu(int cpu)
686{ 694{
695 /*
696 * We just need the target to call irq_exit() and re-evaluate
697 * the next tick. The nohz full kick at least implies that.
698 * If needed we can still optimize that later with an
699 * empty IRQ.
700 */
687 if (tick_nohz_full_cpu(cpu)) { 701 if (tick_nohz_full_cpu(cpu)) {
688 if (cpu != smp_processor_id() || 702 if (cpu != smp_processor_id() ||
689 tick_nohz_tick_stopped()) 703 tick_nohz_tick_stopped())
690 smp_send_reschedule(cpu); 704 tick_nohz_full_kick_cpu(cpu);
691 return true; 705 return true;
692 } 706 }
693 707
@@ -730,18 +744,15 @@ static inline bool got_nohz_idle_kick(void)
730#ifdef CONFIG_NO_HZ_FULL 744#ifdef CONFIG_NO_HZ_FULL
731bool sched_can_stop_tick(void) 745bool sched_can_stop_tick(void)
732{ 746{
733 struct rq *rq; 747 /*
734 748 * More than one running task need preemption.
735 rq = this_rq(); 749 * nr_running update is assumed to be visible
736 750 * after IPI is sent from wakers.
737 /* Make sure rq->nr_running update is visible after the IPI */ 751 */
738 smp_rmb(); 752 if (this_rq()->nr_running > 1)
739 753 return false;
740 /* More than one running task need preemption */
741 if (rq->nr_running > 1)
742 return false;
743 754
744 return true; 755 return true;
745} 756}
746#endif /* CONFIG_NO_HZ_FULL */ 757#endif /* CONFIG_NO_HZ_FULL */
747 758
@@ -1022,7 +1033,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
1022 if (class == rq->curr->sched_class) 1033 if (class == rq->curr->sched_class)
1023 break; 1034 break;
1024 if (class == p->sched_class) { 1035 if (class == p->sched_class) {
1025 resched_task(rq->curr); 1036 resched_curr(rq);
1026 break; 1037 break;
1027 } 1038 }
1028 } 1039 }
@@ -1568,9 +1579,7 @@ void scheduler_ipi(void)
1568 */ 1579 */
1569 preempt_fold_need_resched(); 1580 preempt_fold_need_resched();
1570 1581
1571 if (llist_empty(&this_rq()->wake_list) 1582 if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
1572 && !tick_nohz_full_cpu(smp_processor_id())
1573 && !got_nohz_idle_kick())
1574 return; 1583 return;
1575 1584
1576 /* 1585 /*
@@ -1587,7 +1596,6 @@ void scheduler_ipi(void)
1587 * somewhat pessimize the simple resched case. 1596 * somewhat pessimize the simple resched case.
1588 */ 1597 */
1589 irq_enter(); 1598 irq_enter();
1590 tick_nohz_full_check();
1591 sched_ttwu_pending(); 1599 sched_ttwu_pending();
1592 1600
1593 /* 1601 /*
@@ -2431,7 +2439,12 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
2431{ 2439{
2432 u64 ns = 0; 2440 u64 ns = 0;
2433 2441
2434 if (task_current(rq, p)) { 2442 /*
2443 * Must be ->curr _and_ ->on_rq. If dequeued, we would
2444 * project cycles that may never be accounted to this
2445 * thread, breaking clock_gettime().
2446 */
2447 if (task_current(rq, p) && p->on_rq) {
2435 update_rq_clock(rq); 2448 update_rq_clock(rq);
2436 ns = rq_clock_task(rq) - p->se.exec_start; 2449 ns = rq_clock_task(rq) - p->se.exec_start;
2437 if ((s64)ns < 0) 2450 if ((s64)ns < 0)
@@ -2474,8 +2487,10 @@ unsigned long long task_sched_runtime(struct task_struct *p)
2474 * If we race with it leaving cpu, we'll take a lock. So we're correct. 2487 * If we race with it leaving cpu, we'll take a lock. So we're correct.
2475 * If we race with it entering cpu, unaccounted time is 0. This is 2488 * If we race with it entering cpu, unaccounted time is 0. This is
2476 * indistinguishable from the read occurring a few cycles earlier. 2489 * indistinguishable from the read occurring a few cycles earlier.
2490 * If we see ->on_cpu without ->on_rq, the task is leaving, and has
2491 * been accounted, so we're correct here as well.
2477 */ 2492 */
2478 if (!p->on_cpu) 2493 if (!p->on_cpu || !p->on_rq)
2479 return p->se.sum_exec_runtime; 2494 return p->se.sum_exec_runtime;
2480#endif 2495#endif
2481 2496
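
The do_task_delta_exec()/task_sched_runtime() hunks above require the task to be both ->curr and ->on_rq before charging it the cycles since exec_start; otherwise a dequeued task could be credited time that will never be accounted, and the per-task CPU clock seen through clock_gettime() could jump. For context, a plain userspace reader of that clock (standard POSIX, nothing here is specific to this patch):

#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec ts;

	/* The per-thread CPU clock is backed by task_sched_runtime(). */
	if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts) != 0) {
		perror("clock_gettime");
		return 1;
	}
	printf("thread CPU time: %ld.%09ld s\n", (long)ts.tv_sec, ts.tv_nsec);
	return 0;
}
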
@@ -2971,7 +2986,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
2971 } 2986 }
2972 2987
2973 trace_sched_pi_setprio(p, prio); 2988 trace_sched_pi_setprio(p, prio);
2974 p->pi_top_task = rt_mutex_get_top_task(p);
2975 oldprio = p->prio; 2989 oldprio = p->prio;
2976 prev_class = p->sched_class; 2990 prev_class = p->sched_class;
2977 on_rq = p->on_rq; 2991 on_rq = p->on_rq;
@@ -2991,8 +3005,9 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
2991 * running task 3005 * running task
2992 */ 3006 */
2993 if (dl_prio(prio)) { 3007 if (dl_prio(prio)) {
2994 if (!dl_prio(p->normal_prio) || (p->pi_top_task && 3008 struct task_struct *pi_task = rt_mutex_get_top_task(p);
2995 dl_entity_preempt(&p->pi_top_task->dl, &p->dl))) { 3009 if (!dl_prio(p->normal_prio) ||
3010 (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
2996 p->dl.dl_boosted = 1; 3011 p->dl.dl_boosted = 1;
2997 p->dl.dl_throttled = 0; 3012 p->dl.dl_throttled = 0;
2998 enqueue_flag = ENQUEUE_REPLENISH; 3013 enqueue_flag = ENQUEUE_REPLENISH;
@@ -3064,7 +3079,7 @@ void set_user_nice(struct task_struct *p, long nice)
3064 * lowered its priority, then reschedule its CPU: 3079 * lowered its priority, then reschedule its CPU:
3065 */ 3080 */
3066 if (delta < 0 || (delta > 0 && task_running(rq, p))) 3081 if (delta < 0 || (delta > 0 && task_running(rq, p)))
3067 resched_task(rq->curr); 3082 resched_curr(rq);
3068 } 3083 }
3069out_unlock: 3084out_unlock:
3070 task_rq_unlock(rq, p, &flags); 3085 task_rq_unlock(rq, p, &flags);
@@ -3203,12 +3218,18 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
3203 dl_se->dl_yielded = 0; 3218 dl_se->dl_yielded = 0;
3204} 3219}
3205 3220
3221/*
3222 * sched_setparam() passes in -1 for its policy, to let the functions
3223 * it calls know not to change it.
3224 */
3225#define SETPARAM_POLICY -1
3226
3206static void __setscheduler_params(struct task_struct *p, 3227static void __setscheduler_params(struct task_struct *p,
3207 const struct sched_attr *attr) 3228 const struct sched_attr *attr)
3208{ 3229{
3209 int policy = attr->sched_policy; 3230 int policy = attr->sched_policy;
3210 3231
3211 if (policy == -1) /* setparam */ 3232 if (policy == SETPARAM_POLICY)
3212 policy = p->policy; 3233 policy = p->policy;
3213 3234
3214 p->policy = policy; 3235 p->policy = policy;
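
SETPARAM_POLICY gives a name to the -1 that sched_setparam() has always passed down to mean "keep the current policy, change only the parameters". From userspace the contract looks like this (standard syscall wrapper, unrelated to the new macro name):

#include <stdio.h>
#include <sched.h>

int main(void)
{
	struct sched_param sp = { .sched_priority = 10 };

	/* Adjusts the RT priority of the calling task; its policy is left as-is. */
	if (sched_setparam(0, &sp) != 0) {
		perror("sched_setparam");	/* needs an RT policy and privilege */
		return 1;
	}
	printf("priority set to %d, policy unchanged\n", sp.sched_priority);
	return 0;
}
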
@@ -3557,10 +3578,8 @@ static int _sched_setscheduler(struct task_struct *p, int policy,
3557 .sched_nice = PRIO_TO_NICE(p->static_prio), 3578 .sched_nice = PRIO_TO_NICE(p->static_prio),
3558 }; 3579 };
3559 3580
3560 /* 3581 /* Fixup the legacy SCHED_RESET_ON_FORK hack. */
3561 * Fixup the legacy SCHED_RESET_ON_FORK hack 3582 if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
3562 */
3563 if (policy & SCHED_RESET_ON_FORK) {
3564 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; 3583 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
3565 policy &= ~SCHED_RESET_ON_FORK; 3584 policy &= ~SCHED_RESET_ON_FORK;
3566 attr.sched_policy = policy; 3585 attr.sched_policy = policy;
@@ -3730,7 +3749,7 @@ SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
3730 */ 3749 */
3731SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) 3750SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
3732{ 3751{
3733 return do_sched_setscheduler(pid, -1, param); 3752 return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
3734} 3753}
3735 3754
3736/** 3755/**
@@ -4285,7 +4304,7 @@ again:
4285 * fairness. 4304 * fairness.
4286 */ 4305 */
4287 if (preempt && rq != p_rq) 4306 if (preempt && rq != p_rq)
4288 resched_task(p_rq->curr); 4307 resched_curr(p_rq);
4289 } 4308 }
4290 4309
4291out_unlock: 4310out_unlock:
@@ -6465,6 +6484,20 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
6465 sched_domain_level_max = max(sched_domain_level_max, sd->level); 6484 sched_domain_level_max = max(sched_domain_level_max, sd->level);
6466 child->parent = sd; 6485 child->parent = sd;
6467 sd->child = child; 6486 sd->child = child;
6487
6488 if (!cpumask_subset(sched_domain_span(child),
6489 sched_domain_span(sd))) {
6490 pr_err("BUG: arch topology borken\n");
6491#ifdef CONFIG_SCHED_DEBUG
6492 pr_err(" the %s domain not a subset of the %s domain\n",
6493 child->name, sd->name);
6494#endif
6495 /* Fixup, ensure @sd has at least @child cpus. */
6496 cpumask_or(sched_domain_span(sd),
6497 sched_domain_span(sd),
6498 sched_domain_span(child));
6499 }
6500
6468 } 6501 }
6469 set_domain_attribute(sd, attr); 6502 set_domain_attribute(sd, attr);
6470 6503
@@ -7092,7 +7125,7 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
7092 __setscheduler(rq, p, &attr); 7125 __setscheduler(rq, p, &attr);
7093 if (on_rq) { 7126 if (on_rq) {
7094 enqueue_task(rq, p, 0); 7127 enqueue_task(rq, p, 0);
7095 resched_task(rq->curr); 7128 resched_curr(rq);
7096 } 7129 }
7097 7130
7098 check_class_changed(rq, p, prev_class, old_prio); 7131 check_class_changed(rq, p, prev_class, old_prio);
@@ -7803,6 +7836,11 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
7803 if (period > max_cfs_quota_period) 7836 if (period > max_cfs_quota_period)
7804 return -EINVAL; 7837 return -EINVAL;
7805 7838
7839 /*
7840 * Prevent race between setting of cfs_rq->runtime_enabled and
7841 * unthrottle_offline_cfs_rqs().
7842 */
7843 get_online_cpus();
7806 mutex_lock(&cfs_constraints_mutex); 7844 mutex_lock(&cfs_constraints_mutex);
7807 ret = __cfs_schedulable(tg, period, quota); 7845 ret = __cfs_schedulable(tg, period, quota);
7808 if (ret) 7846 if (ret)
@@ -7828,7 +7866,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
7828 } 7866 }
7829 raw_spin_unlock_irq(&cfs_b->lock); 7867 raw_spin_unlock_irq(&cfs_b->lock);
7830 7868
7831 for_each_possible_cpu(i) { 7869 for_each_online_cpu(i) {
7832 struct cfs_rq *cfs_rq = tg->cfs_rq[i]; 7870 struct cfs_rq *cfs_rq = tg->cfs_rq[i];
7833 struct rq *rq = cfs_rq->rq; 7871 struct rq *rq = cfs_rq->rq;
7834 7872
@@ -7844,6 +7882,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
7844 cfs_bandwidth_usage_dec(); 7882 cfs_bandwidth_usage_dec();
7845out_unlock: 7883out_unlock:
7846 mutex_unlock(&cfs_constraints_mutex); 7884 mutex_unlock(&cfs_constraints_mutex);
7885 put_online_cpus();
7847 7886
7848 return ret; 7887 return ret;
7849} 7888}
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index fc4f98b1258f..255ce138b652 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -306,7 +306,7 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se,
306 * the overrunning entity can't interfere with other entity in the system and 306 * the overrunning entity can't interfere with other entity in the system and
307 * can't make them miss their deadlines. Reasons why this kind of overruns 307 * can't make them miss their deadlines. Reasons why this kind of overruns
308 * could happen are, typically, a entity voluntarily trying to overcome its 308 * could happen are, typically, a entity voluntarily trying to overcome its
309 * runtime, or it just underestimated it during sched_setscheduler_ex(). 309 * runtime, or it just underestimated it during sched_setattr().
310 */ 310 */
311static void replenish_dl_entity(struct sched_dl_entity *dl_se, 311static void replenish_dl_entity(struct sched_dl_entity *dl_se,
312 struct sched_dl_entity *pi_se) 312 struct sched_dl_entity *pi_se)
@@ -535,7 +535,7 @@ again:
535 if (task_has_dl_policy(rq->curr)) 535 if (task_has_dl_policy(rq->curr))
536 check_preempt_curr_dl(rq, p, 0); 536 check_preempt_curr_dl(rq, p, 0);
537 else 537 else
538 resched_task(rq->curr); 538 resched_curr(rq);
539#ifdef CONFIG_SMP 539#ifdef CONFIG_SMP
540 /* 540 /*
541 * Queueing this task back might have overloaded rq, 541 * Queueing this task back might have overloaded rq,
@@ -634,7 +634,7 @@ static void update_curr_dl(struct rq *rq)
634 enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH); 634 enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
635 635
636 if (!is_leftmost(curr, &rq->dl)) 636 if (!is_leftmost(curr, &rq->dl))
637 resched_task(curr); 637 resched_curr(rq);
638 } 638 }
639 639
640 /* 640 /*
@@ -964,7 +964,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
964 cpudl_find(&rq->rd->cpudl, p, NULL) != -1) 964 cpudl_find(&rq->rd->cpudl, p, NULL) != -1)
965 return; 965 return;
966 966
967 resched_task(rq->curr); 967 resched_curr(rq);
968} 968}
969 969
970static int pull_dl_task(struct rq *this_rq); 970static int pull_dl_task(struct rq *this_rq);
@@ -979,7 +979,7 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
979 int flags) 979 int flags)
980{ 980{
981 if (dl_entity_preempt(&p->dl, &rq->curr->dl)) { 981 if (dl_entity_preempt(&p->dl, &rq->curr->dl)) {
982 resched_task(rq->curr); 982 resched_curr(rq);
983 return; 983 return;
984 } 984 }
985 985
@@ -1333,7 +1333,7 @@ retry:
1333 if (dl_task(rq->curr) && 1333 if (dl_task(rq->curr) &&
1334 dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) && 1334 dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) &&
1335 rq->curr->nr_cpus_allowed > 1) { 1335 rq->curr->nr_cpus_allowed > 1) {
1336 resched_task(rq->curr); 1336 resched_curr(rq);
1337 return 0; 1337 return 0;
1338 } 1338 }
1339 1339
@@ -1373,7 +1373,7 @@ retry:
1373 set_task_cpu(next_task, later_rq->cpu); 1373 set_task_cpu(next_task, later_rq->cpu);
1374 activate_task(later_rq, next_task, 0); 1374 activate_task(later_rq, next_task, 0);
1375 1375
1376 resched_task(later_rq->curr); 1376 resched_curr(later_rq);
1377 1377
1378 double_unlock_balance(rq, later_rq); 1378 double_unlock_balance(rq, later_rq);
1379 1379
@@ -1632,14 +1632,14 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
1632 */ 1632 */
1633 if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline) && 1633 if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline) &&
1634 rq->curr == p) 1634 rq->curr == p)
1635 resched_task(p); 1635 resched_curr(rq);
1636#else 1636#else
1637 /* 1637 /*
1638 * Again, we don't know if p has a earlier 1638 * Again, we don't know if p has a earlier
1639 * or later deadline, so let's blindly set a 1639 * or later deadline, so let's blindly set a
1640 * (maybe not needed) rescheduling point. 1640 * (maybe not needed) rescheduling point.
1641 */ 1641 */
1642 resched_task(p); 1642 resched_curr(rq);
1643#endif /* CONFIG_SMP */ 1643#endif /* CONFIG_SMP */
1644 } else 1644 } else
1645 switched_to_dl(rq, p); 1645 switched_to_dl(rq, p);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fea7d3335e1f..bfa3c86d0d68 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1062,7 +1062,6 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
1062 if (!cpus) 1062 if (!cpus)
1063 return; 1063 return;
1064 1064
1065 ns->load = (ns->load * SCHED_CAPACITY_SCALE) / ns->compute_capacity;
1066 ns->task_capacity = 1065 ns->task_capacity =
1067 DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE); 1066 DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE);
1068 ns->has_free_capacity = (ns->nr_running < ns->task_capacity); 1067 ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
@@ -1096,18 +1095,30 @@ static void task_numa_assign(struct task_numa_env *env,
1096 env->best_cpu = env->dst_cpu; 1095 env->best_cpu = env->dst_cpu;
1097} 1096}
1098 1097
1099static bool load_too_imbalanced(long orig_src_load, long orig_dst_load, 1098static bool load_too_imbalanced(long src_load, long dst_load,
1100 long src_load, long dst_load,
1101 struct task_numa_env *env) 1099 struct task_numa_env *env)
1102{ 1100{
1103 long imb, old_imb; 1101 long imb, old_imb;
1102 long orig_src_load, orig_dst_load;
1103 long src_capacity, dst_capacity;
1104
1105 /*
1106 * The load is corrected for the CPU capacity available on each node.
1107 *
1108 * src_load dst_load
1109 * ------------ vs ---------
1110 * src_capacity dst_capacity
1111 */
1112 src_capacity = env->src_stats.compute_capacity;
1113 dst_capacity = env->dst_stats.compute_capacity;
1104 1114
1105 /* We care about the slope of the imbalance, not the direction. */ 1115 /* We care about the slope of the imbalance, not the direction. */
1106 if (dst_load < src_load) 1116 if (dst_load < src_load)
1107 swap(dst_load, src_load); 1117 swap(dst_load, src_load);
1108 1118
1109 /* Is the difference below the threshold? */ 1119 /* Is the difference below the threshold? */
1110 imb = dst_load * 100 - src_load * env->imbalance_pct; 1120 imb = dst_load * src_capacity * 100 -
1121 src_load * dst_capacity * env->imbalance_pct;
1111 if (imb <= 0) 1122 if (imb <= 0)
1112 return false; 1123 return false;
1113 1124
@@ -1115,10 +1126,14 @@ static bool load_too_imbalanced(long orig_src_load, long orig_dst_load,
1115 * The imbalance is above the allowed threshold. 1126 * The imbalance is above the allowed threshold.
1116 * Compare it with the old imbalance. 1127 * Compare it with the old imbalance.
1117 */ 1128 */
1129 orig_src_load = env->src_stats.load;
1130 orig_dst_load = env->dst_stats.load;
1131
1118 if (orig_dst_load < orig_src_load) 1132 if (orig_dst_load < orig_src_load)
1119 swap(orig_dst_load, orig_src_load); 1133 swap(orig_dst_load, orig_src_load);
1120 1134
1121 old_imb = orig_dst_load * 100 - orig_src_load * env->imbalance_pct; 1135 old_imb = orig_dst_load * src_capacity * 100 -
1136 orig_src_load * dst_capacity * env->imbalance_pct;
1122 1137
1123 /* Would this change make things worse? */ 1138 /* Would this change make things worse? */
1124 return (imb > old_imb); 1139 return (imb > old_imb);
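
load_too_imbalanced() now weighs each node's load against its compute capacity by cross-multiplying, so no division is needed: src_load/src_capacity is effectively compared with dst_load/dst_capacity. A worked example with invented numbers (src_load=800, dst_load=1000, src_capacity=1024, dst_capacity=2048, imbalance_pct=125 is only an illustrative threshold):

#include <stdio.h>

/* Stand-alone arithmetic check of the capacity-scaled imbalance test. */
int main(void)
{
	long src_load = 800, dst_load = 1000;
	long src_capacity = 1024, dst_capacity = 2048;	/* SCHED_CAPACITY_SCALE units */
	long imbalance_pct = 125;
	long imb;

	if (dst_load < src_load) {	/* we care about the slope, not the direction */
		long tmp = dst_load;
		dst_load = src_load;
		src_load = tmp;
	}

	imb = dst_load * src_capacity * 100 - src_load * dst_capacity * imbalance_pct;
	printf("imb = %ld (%s)\n", imb,
	       imb <= 0 ? "within threshold" : "too imbalanced");
	return 0;
}

Here imb = 1000*1024*100 - 800*2048*125 = -102400000, so the placement is not flagged as imbalanced even though the raw destination load is higher, because the destination node has twice the capacity.
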
@@ -1136,10 +1151,10 @@ static void task_numa_compare(struct task_numa_env *env,
1136 struct rq *src_rq = cpu_rq(env->src_cpu); 1151 struct rq *src_rq = cpu_rq(env->src_cpu);
1137 struct rq *dst_rq = cpu_rq(env->dst_cpu); 1152 struct rq *dst_rq = cpu_rq(env->dst_cpu);
1138 struct task_struct *cur; 1153 struct task_struct *cur;
1139 long orig_src_load, src_load; 1154 long src_load, dst_load;
1140 long orig_dst_load, dst_load;
1141 long load; 1155 long load;
1142 long imp = (groupimp > 0) ? groupimp : taskimp; 1156 long imp = env->p->numa_group ? groupimp : taskimp;
1157 long moveimp = imp;
1143 1158
1144 rcu_read_lock(); 1159 rcu_read_lock();
1145 cur = ACCESS_ONCE(dst_rq->curr); 1160 cur = ACCESS_ONCE(dst_rq->curr);
@@ -1177,11 +1192,6 @@ static void task_numa_compare(struct task_numa_env *env,
1177 * itself (not part of a group), use the task weight 1192 * itself (not part of a group), use the task weight
1178 * instead. 1193 * instead.
1179 */ 1194 */
1180 if (env->p->numa_group)
1181 imp = groupimp;
1182 else
1183 imp = taskimp;
1184
1185 if (cur->numa_group) 1195 if (cur->numa_group)
1186 imp += group_weight(cur, env->src_nid) - 1196 imp += group_weight(cur, env->src_nid) -
1187 group_weight(cur, env->dst_nid); 1197 group_weight(cur, env->dst_nid);
@@ -1191,7 +1201,7 @@ static void task_numa_compare(struct task_numa_env *env,
1191 } 1201 }
1192 } 1202 }
1193 1203
1194 if (imp < env->best_imp) 1204 if (imp <= env->best_imp && moveimp <= env->best_imp)
1195 goto unlock; 1205 goto unlock;
1196 1206
1197 if (!cur) { 1207 if (!cur) {
@@ -1204,20 +1214,34 @@ static void task_numa_compare(struct task_numa_env *env,
1204 } 1214 }
1205 1215
1206 /* Balance doesn't matter much if we're running a task per cpu */ 1216 /* Balance doesn't matter much if we're running a task per cpu */
1207 if (src_rq->nr_running == 1 && dst_rq->nr_running == 1) 1217 if (imp > env->best_imp && src_rq->nr_running == 1 &&
1218 dst_rq->nr_running == 1)
1208 goto assign; 1219 goto assign;
1209 1220
1210 /* 1221 /*
1211 * In the overloaded case, try and keep the load balanced. 1222 * In the overloaded case, try and keep the load balanced.
1212 */ 1223 */
1213balance: 1224balance:
1214 orig_dst_load = env->dst_stats.load;
1215 orig_src_load = env->src_stats.load;
1216
1217 /* XXX missing capacity terms */
1218 load = task_h_load(env->p); 1225 load = task_h_load(env->p);
1219 dst_load = orig_dst_load + load; 1226 dst_load = env->dst_stats.load + load;
1220 src_load = orig_src_load - load; 1227 src_load = env->src_stats.load - load;
1228
1229 if (moveimp > imp && moveimp > env->best_imp) {
1230 /*
1231 * If the improvement from just moving env->p direction is
1232 * better than swapping tasks around, check if a move is
1233 * possible. Store a slightly smaller score than moveimp,
1234 * so an actually idle CPU will win.
1235 */
1236 if (!load_too_imbalanced(src_load, dst_load, env)) {
1237 imp = moveimp - 1;
1238 cur = NULL;
1239 goto assign;
1240 }
1241 }
1242
1243 if (imp <= env->best_imp)
1244 goto unlock;
1221 1245
1222 if (cur) { 1246 if (cur) {
1223 load = task_h_load(cur); 1247 load = task_h_load(cur);
@@ -1225,8 +1249,7 @@ balance:
1225 src_load += load; 1249 src_load += load;
1226 } 1250 }
1227 1251
1228 if (load_too_imbalanced(orig_src_load, orig_dst_load, 1252 if (load_too_imbalanced(src_load, dst_load, env))
1229 src_load, dst_load, env))
1230 goto unlock; 1253 goto unlock;
1231 1254
1232assign: 1255assign:
@@ -1302,9 +1325,8 @@ static int task_numa_migrate(struct task_struct *p)
1302 groupimp = group_weight(p, env.dst_nid) - groupweight; 1325 groupimp = group_weight(p, env.dst_nid) - groupweight;
1303 update_numa_stats(&env.dst_stats, env.dst_nid); 1326 update_numa_stats(&env.dst_stats, env.dst_nid);
1304 1327
1305 /* If the preferred nid has free capacity, try to use it. */ 1328 /* Try to find a spot on the preferred nid. */
1306 if (env.dst_stats.has_free_capacity) 1329 task_numa_find_cpu(&env, taskimp, groupimp);
1307 task_numa_find_cpu(&env, taskimp, groupimp);
1308 1330
1309 /* No space available on the preferred nid. Look elsewhere. */ 1331 /* No space available on the preferred nid. Look elsewhere. */
1310 if (env.best_cpu == -1) { 1332 if (env.best_cpu == -1) {
@@ -1324,10 +1346,6 @@ static int task_numa_migrate(struct task_struct *p)
1324 } 1346 }
1325 } 1347 }
1326 1348
1327 /* No better CPU than the current one was found. */
1328 if (env.best_cpu == -1)
1329 return -EAGAIN;
1330
1331 /* 1349 /*
1332 * If the task is part of a workload that spans multiple NUMA nodes, 1350 * If the task is part of a workload that spans multiple NUMA nodes,
1333 * and is migrating into one of the workload's active nodes, remember 1351 * and is migrating into one of the workload's active nodes, remember
@@ -1336,8 +1354,19 @@ static int task_numa_migrate(struct task_struct *p)
1336 * A task that migrated to a second choice node will be better off 1354 * A task that migrated to a second choice node will be better off
1337 * trying for a better one later. Do not set the preferred node here. 1355 * trying for a better one later. Do not set the preferred node here.
1338 */ 1356 */
1339 if (p->numa_group && node_isset(env.dst_nid, p->numa_group->active_nodes)) 1357 if (p->numa_group) {
1340 sched_setnuma(p, env.dst_nid); 1358 if (env.best_cpu == -1)
1359 nid = env.src_nid;
1360 else
1361 nid = env.dst_nid;
1362
1363 if (node_isset(nid, p->numa_group->active_nodes))
1364 sched_setnuma(p, env.dst_nid);
1365 }
1366
1367 /* No better CPU than the current one was found. */
1368 if (env.best_cpu == -1)
1369 return -EAGAIN;
1341 1370
1342 /* 1371 /*
1343 * Reset the scan period if the task is being rescheduled on an 1372 * Reset the scan period if the task is being rescheduled on an
@@ -1415,12 +1444,12 @@ static void update_numa_active_node_mask(struct numa_group *numa_group)
1415/* 1444/*
1416 * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS 1445 * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
1417 * increments. The more local the fault statistics are, the higher the scan 1446 * increments. The more local the fault statistics are, the higher the scan
1418 * period will be for the next scan window. If local/remote ratio is below 1447 * period will be for the next scan window. If local/(local+remote) ratio is
1419 * NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) the 1448 * below NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS)
1420 * scan period will decrease 1449 * the scan period will decrease. Aim for 70% local accesses.
1421 */ 1450 */
1422#define NUMA_PERIOD_SLOTS 10 1451#define NUMA_PERIOD_SLOTS 10
1423#define NUMA_PERIOD_THRESHOLD 3 1452#define NUMA_PERIOD_THRESHOLD 7
1424 1453
1425/* 1454/*
1426 * Increase the scan period (slow down scanning) if the majority of 1455 * Increase the scan period (slow down scanning) if the majority of
@@ -1595,30 +1624,17 @@ static void task_numa_placement(struct task_struct *p)
1595 1624
1596 if (p->numa_group) { 1625 if (p->numa_group) {
1597 update_numa_active_node_mask(p->numa_group); 1626 update_numa_active_node_mask(p->numa_group);
1598 /*
1599 * If the preferred task and group nids are different,
1600 * iterate over the nodes again to find the best place.
1601 */
1602 if (max_nid != max_group_nid) {
1603 unsigned long weight, max_weight = 0;
1604
1605 for_each_online_node(nid) {
1606 weight = task_weight(p, nid) + group_weight(p, nid);
1607 if (weight > max_weight) {
1608 max_weight = weight;
1609 max_nid = nid;
1610 }
1611 }
1612 }
1613
1614 spin_unlock_irq(group_lock); 1627 spin_unlock_irq(group_lock);
1628 max_nid = max_group_nid;
1615 } 1629 }
1616 1630
1617 /* Preferred node as the node with the most faults */ 1631 if (max_faults) {
1618 if (max_faults && max_nid != p->numa_preferred_nid) { 1632 /* Set the new preferred node */
1619 /* Update the preferred nid and migrate task if possible */ 1633 if (max_nid != p->numa_preferred_nid)
1620 sched_setnuma(p, max_nid); 1634 sched_setnuma(p, max_nid);
1621 numa_migrate_preferred(p); 1635
1636 if (task_node(p) != p->numa_preferred_nid)
1637 numa_migrate_preferred(p);
1622 } 1638 }
1623} 1639}
1624 1640
@@ -2899,7 +2915,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
2899 ideal_runtime = sched_slice(cfs_rq, curr); 2915 ideal_runtime = sched_slice(cfs_rq, curr);
2900 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; 2916 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
2901 if (delta_exec > ideal_runtime) { 2917 if (delta_exec > ideal_runtime) {
2902 resched_task(rq_of(cfs_rq)->curr); 2918 resched_curr(rq_of(cfs_rq));
2903 /* 2919 /*
2904 * The current task ran long enough, ensure it doesn't get 2920 * The current task ran long enough, ensure it doesn't get
2905 * re-elected due to buddy favours. 2921 * re-elected due to buddy favours.
@@ -2923,7 +2939,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
2923 return; 2939 return;
2924 2940
2925 if (delta > ideal_runtime) 2941 if (delta > ideal_runtime)
2926 resched_task(rq_of(cfs_rq)->curr); 2942 resched_curr(rq_of(cfs_rq));
2927} 2943}
2928 2944
2929static void 2945static void
@@ -3063,7 +3079,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
3063 * validating it and just reschedule. 3079 * validating it and just reschedule.
3064 */ 3080 */
3065 if (queued) { 3081 if (queued) {
3066 resched_task(rq_of(cfs_rq)->curr); 3082 resched_curr(rq_of(cfs_rq));
3067 return; 3083 return;
3068 } 3084 }
3069 /* 3085 /*
@@ -3254,7 +3270,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
3254 * hierarchy can be throttled 3270 * hierarchy can be throttled
3255 */ 3271 */
3256 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr)) 3272 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
3257 resched_task(rq_of(cfs_rq)->curr); 3273 resched_curr(rq_of(cfs_rq));
3258} 3274}
3259 3275
3260static __always_inline 3276static __always_inline
@@ -3360,7 +3376,11 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
3360 cfs_rq->throttled = 1; 3376 cfs_rq->throttled = 1;
3361 cfs_rq->throttled_clock = rq_clock(rq); 3377 cfs_rq->throttled_clock = rq_clock(rq);
3362 raw_spin_lock(&cfs_b->lock); 3378 raw_spin_lock(&cfs_b->lock);
3363 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); 3379 /*
3380 * Add to the _head_ of the list, so that an already-started
3381 * distribute_cfs_runtime will not see us
3382 */
3383 list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
3364 if (!cfs_b->timer_active) 3384 if (!cfs_b->timer_active)
3365 __start_cfs_bandwidth(cfs_b, false); 3385 __start_cfs_bandwidth(cfs_b, false);
3366 raw_spin_unlock(&cfs_b->lock); 3386 raw_spin_unlock(&cfs_b->lock);
@@ -3410,14 +3430,15 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
3410 3430
3411 /* determine whether we need to wake up potentially idle cpu */ 3431 /* determine whether we need to wake up potentially idle cpu */
3412 if (rq->curr == rq->idle && rq->cfs.nr_running) 3432 if (rq->curr == rq->idle && rq->cfs.nr_running)
3413 resched_task(rq->curr); 3433 resched_curr(rq);
3414} 3434}
3415 3435
3416static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, 3436static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
3417 u64 remaining, u64 expires) 3437 u64 remaining, u64 expires)
3418{ 3438{
3419 struct cfs_rq *cfs_rq; 3439 struct cfs_rq *cfs_rq;
3420 u64 runtime = remaining; 3440 u64 runtime;
3441 u64 starting_runtime = remaining;
3421 3442
3422 rcu_read_lock(); 3443 rcu_read_lock();
3423 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, 3444 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
@@ -3448,7 +3469,7 @@ next:
3448 } 3469 }
3449 rcu_read_unlock(); 3470 rcu_read_unlock();
3450 3471
3451 return remaining; 3472 return starting_runtime - remaining;
3452} 3473}
3453 3474
3454/* 3475/*
@@ -3494,22 +3515,17 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
3494 /* account preceding periods in which throttling occurred */ 3515 /* account preceding periods in which throttling occurred */
3495 cfs_b->nr_throttled += overrun; 3516 cfs_b->nr_throttled += overrun;
3496 3517
3497 /*
3498 * There are throttled entities so we must first use the new bandwidth
3499 * to unthrottle them before making it generally available. This
3500 * ensures that all existing debts will be paid before a new cfs_rq is
3501 * allowed to run.
3502 */
3503 runtime = cfs_b->runtime;
3504 runtime_expires = cfs_b->runtime_expires; 3518 runtime_expires = cfs_b->runtime_expires;
3505 cfs_b->runtime = 0;
3506 3519
3507 /* 3520 /*
3508 * This check is repeated as we are holding onto the new bandwidth 3521 * This check is repeated as we are holding onto the new bandwidth while
3509 * while we unthrottle. This can potentially race with an unthrottled 3522 * we unthrottle. This can potentially race with an unthrottled group
3510 * group trying to acquire new bandwidth from the global pool. 3523 * trying to acquire new bandwidth from the global pool. This can result
3524 * in us over-using our runtime if it is all used during this loop, but
3525 * only by limited amounts in that extreme case.
3511 */ 3526 */
3512 while (throttled && runtime > 0) { 3527 while (throttled && cfs_b->runtime > 0) {
3528 runtime = cfs_b->runtime;
3513 raw_spin_unlock(&cfs_b->lock); 3529 raw_spin_unlock(&cfs_b->lock);
3514 /* we can't nest cfs_b->lock while distributing bandwidth */ 3530 /* we can't nest cfs_b->lock while distributing bandwidth */
3515 runtime = distribute_cfs_runtime(cfs_b, runtime, 3531 runtime = distribute_cfs_runtime(cfs_b, runtime,
@@ -3517,10 +3533,10 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
3517 raw_spin_lock(&cfs_b->lock); 3533 raw_spin_lock(&cfs_b->lock);
3518 3534
3519 throttled = !list_empty(&cfs_b->throttled_cfs_rq); 3535 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
3536
3537 cfs_b->runtime -= min(runtime, cfs_b->runtime);
3520 } 3538 }
3521 3539
3522 /* return (any) remaining runtime */
3523 cfs_b->runtime = runtime;
3524 /* 3540 /*
3525 * While we are ensured activity in the period following an 3541 * While we are ensured activity in the period following an
3526 * unthrottle, this also covers the case in which the new bandwidth is 3542 * unthrottle, this also covers the case in which the new bandwidth is
@@ -3631,10 +3647,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
3631 return; 3647 return;
3632 } 3648 }
3633 3649
3634 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) { 3650 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
3635 runtime = cfs_b->runtime; 3651 runtime = cfs_b->runtime;
3636 cfs_b->runtime = 0; 3652
3637 }
3638 expires = cfs_b->runtime_expires; 3653 expires = cfs_b->runtime_expires;
3639 raw_spin_unlock(&cfs_b->lock); 3654 raw_spin_unlock(&cfs_b->lock);
3640 3655
@@ -3645,7 +3660,7 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
3645 3660
3646 raw_spin_lock(&cfs_b->lock); 3661 raw_spin_lock(&cfs_b->lock);
3647 if (expires == cfs_b->runtime_expires) 3662 if (expires == cfs_b->runtime_expires)
3648 cfs_b->runtime = runtime; 3663 cfs_b->runtime -= min(runtime, cfs_b->runtime);
3649 raw_spin_unlock(&cfs_b->lock); 3664 raw_spin_unlock(&cfs_b->lock);
3650} 3665}
3651 3666
@@ -3775,6 +3790,19 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
3775 hrtimer_cancel(&cfs_b->slack_timer); 3790 hrtimer_cancel(&cfs_b->slack_timer);
3776} 3791}
3777 3792
3793static void __maybe_unused update_runtime_enabled(struct rq *rq)
3794{
3795 struct cfs_rq *cfs_rq;
3796
3797 for_each_leaf_cfs_rq(rq, cfs_rq) {
3798 struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth;
3799
3800 raw_spin_lock(&cfs_b->lock);
3801 cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
3802 raw_spin_unlock(&cfs_b->lock);
3803 }
3804}
3805
3778static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) 3806static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
3779{ 3807{
3780 struct cfs_rq *cfs_rq; 3808 struct cfs_rq *cfs_rq;
@@ -3788,6 +3816,12 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
3788 * there's some valid quota amount 3816 * there's some valid quota amount
3789 */ 3817 */
3790 cfs_rq->runtime_remaining = 1; 3818 cfs_rq->runtime_remaining = 1;
3819 /*
3820 * Offline rq is schedulable till cpu is completely disabled
3821 * in take_cpu_down(), so we prevent new cfs throttling here.
3822 */
3823 cfs_rq->runtime_enabled = 0;
3824
3791 if (cfs_rq_throttled(cfs_rq)) 3825 if (cfs_rq_throttled(cfs_rq))
3792 unthrottle_cfs_rq(cfs_rq); 3826 unthrottle_cfs_rq(cfs_rq);
3793 } 3827 }
@@ -3831,6 +3865,7 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
3831 return NULL; 3865 return NULL;
3832} 3866}
3833static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} 3867static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
3868static inline void update_runtime_enabled(struct rq *rq) {}
3834static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {} 3869static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
3835 3870
3836#endif /* CONFIG_CFS_BANDWIDTH */ 3871#endif /* CONFIG_CFS_BANDWIDTH */
@@ -3854,7 +3889,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
3854 3889
3855 if (delta < 0) { 3890 if (delta < 0) {
3856 if (rq->curr == p) 3891 if (rq->curr == p)
3857 resched_task(p); 3892 resched_curr(rq);
3858 return; 3893 return;
3859 } 3894 }
3860 3895
@@ -4723,7 +4758,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
4723 return; 4758 return;
4724 4759
4725preempt: 4760preempt:
4726 resched_task(curr); 4761 resched_curr(rq);
4727 /* 4762 /*
4728 * Only set the backward buddy when the current task is still 4763 * Only set the backward buddy when the current task is still
4729 * on the rq. This can happen when a wakeup gets interleaved 4764 * on the rq. This can happen when a wakeup gets interleaved
@@ -5094,8 +5129,7 @@ static void move_task(struct task_struct *p, struct lb_env *env)
5094/* 5129/*
5095 * Is this task likely cache-hot: 5130 * Is this task likely cache-hot:
5096 */ 5131 */
5097static int 5132static int task_hot(struct task_struct *p, struct lb_env *env)
5098task_hot(struct task_struct *p, u64 now)
5099{ 5133{
5100 s64 delta; 5134 s64 delta;
5101 5135
@@ -5108,7 +5142,7 @@ task_hot(struct task_struct *p, u64 now)
5108 /* 5142 /*
5109 * Buddy candidates are cache hot: 5143 * Buddy candidates are cache hot:
5110 */ 5144 */
5111 if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running && 5145 if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
5112 (&p->se == cfs_rq_of(&p->se)->next || 5146 (&p->se == cfs_rq_of(&p->se)->next ||
5113 &p->se == cfs_rq_of(&p->se)->last)) 5147 &p->se == cfs_rq_of(&p->se)->last))
5114 return 1; 5148 return 1;
@@ -5118,7 +5152,7 @@ task_hot(struct task_struct *p, u64 now)
5118 if (sysctl_sched_migration_cost == 0) 5152 if (sysctl_sched_migration_cost == 0)
5119 return 0; 5153 return 0;
5120 5154
5121 delta = now - p->se.exec_start; 5155 delta = rq_clock_task(env->src_rq) - p->se.exec_start;
5122 5156
5123 return delta < (s64)sysctl_sched_migration_cost; 5157 return delta < (s64)sysctl_sched_migration_cost;
5124} 5158}
@@ -5272,7 +5306,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
5272 * 2) task is cache cold, or 5306 * 2) task is cache cold, or
5273 * 3) too many balance attempts have failed. 5307 * 3) too many balance attempts have failed.
5274 */ 5308 */
5275 tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq)); 5309 tsk_cache_hot = task_hot(p, env);
5276 if (!tsk_cache_hot) 5310 if (!tsk_cache_hot)
5277 tsk_cache_hot = migrate_degrades_locality(p, env); 5311 tsk_cache_hot = migrate_degrades_locality(p, env);
5278 5312
@@ -5864,10 +5898,12 @@ static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *gro
5864 * @load_idx: Load index of sched_domain of this_cpu for load calc. 5898 * @load_idx: Load index of sched_domain of this_cpu for load calc.
5865 * @local_group: Does group contain this_cpu. 5899 * @local_group: Does group contain this_cpu.
5866 * @sgs: variable to hold the statistics for this group. 5900 * @sgs: variable to hold the statistics for this group.
5901 * @overload: Indicate more than one runnable task for any CPU.
5867 */ 5902 */
5868static inline void update_sg_lb_stats(struct lb_env *env, 5903static inline void update_sg_lb_stats(struct lb_env *env,
5869 struct sched_group *group, int load_idx, 5904 struct sched_group *group, int load_idx,
5870 int local_group, struct sg_lb_stats *sgs) 5905 int local_group, struct sg_lb_stats *sgs,
5906 bool *overload)
5871{ 5907{
5872 unsigned long load; 5908 unsigned long load;
5873 int i; 5909 int i;
@@ -5885,6 +5921,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
5885 5921
5886 sgs->group_load += load; 5922 sgs->group_load += load;
5887 sgs->sum_nr_running += rq->nr_running; 5923 sgs->sum_nr_running += rq->nr_running;
5924
5925 if (rq->nr_running > 1)
5926 *overload = true;
5927
5888#ifdef CONFIG_NUMA_BALANCING 5928#ifdef CONFIG_NUMA_BALANCING
5889 sgs->nr_numa_running += rq->nr_numa_running; 5929 sgs->nr_numa_running += rq->nr_numa_running;
5890 sgs->nr_preferred_running += rq->nr_preferred_running; 5930 sgs->nr_preferred_running += rq->nr_preferred_running;
@@ -5995,6 +6035,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
5995 struct sched_group *sg = env->sd->groups; 6035 struct sched_group *sg = env->sd->groups;
5996 struct sg_lb_stats tmp_sgs; 6036 struct sg_lb_stats tmp_sgs;
5997 int load_idx, prefer_sibling = 0; 6037 int load_idx, prefer_sibling = 0;
6038 bool overload = false;
5998 6039
5999 if (child && child->flags & SD_PREFER_SIBLING) 6040 if (child && child->flags & SD_PREFER_SIBLING)
6000 prefer_sibling = 1; 6041 prefer_sibling = 1;
@@ -6015,7 +6056,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
6015 update_group_capacity(env->sd, env->dst_cpu); 6056 update_group_capacity(env->sd, env->dst_cpu);
6016 } 6057 }
6017 6058
6018 update_sg_lb_stats(env, sg, load_idx, local_group, sgs); 6059 update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
6060 &overload);
6019 6061
6020 if (local_group) 6062 if (local_group)
6021 goto next_group; 6063 goto next_group;
@@ -6049,6 +6091,13 @@ next_group:
6049 6091
6050 if (env->sd->flags & SD_NUMA) 6092 if (env->sd->flags & SD_NUMA)
6051 env->fbq_type = fbq_classify_group(&sds->busiest_stat); 6093 env->fbq_type = fbq_classify_group(&sds->busiest_stat);
6094
6095 if (!env->sd->parent) {
6096 /* update overload indicator if we are at root domain */
6097 if (env->dst_rq->rd->overload != overload)
6098 env->dst_rq->rd->overload = overload;
6099 }
6100
6052} 6101}
6053 6102
6054/** 6103/**
@@ -6767,7 +6816,8 @@ static int idle_balance(struct rq *this_rq)
6767 */ 6816 */
6768 this_rq->idle_stamp = rq_clock(this_rq); 6817 this_rq->idle_stamp = rq_clock(this_rq);
6769 6818
6770 if (this_rq->avg_idle < sysctl_sched_migration_cost) { 6819 if (this_rq->avg_idle < sysctl_sched_migration_cost ||
6820 !this_rq->rd->overload) {
6771 rcu_read_lock(); 6821 rcu_read_lock();
6772 sd = rcu_dereference_check_sched_domain(this_rq->sd); 6822 sd = rcu_dereference_check_sched_domain(this_rq->sd);
6773 if (sd) 6823 if (sd)
@@ -7325,6 +7375,8 @@ void trigger_load_balance(struct rq *rq)
7325static void rq_online_fair(struct rq *rq) 7375static void rq_online_fair(struct rq *rq)
7326{ 7376{
7327 update_sysctl(); 7377 update_sysctl();
7378
7379 update_runtime_enabled(rq);
7328} 7380}
7329 7381
7330static void rq_offline_fair(struct rq *rq) 7382static void rq_offline_fair(struct rq *rq)
@@ -7398,7 +7450,7 @@ static void task_fork_fair(struct task_struct *p)
7398 * 'current' within the tree based on its new key value. 7450 * 'current' within the tree based on its new key value.
7399 */ 7451 */
7400 swap(curr->vruntime, se->vruntime); 7452 swap(curr->vruntime, se->vruntime);
7401 resched_task(rq->curr); 7453 resched_curr(rq);
7402 } 7454 }
7403 7455
7404 se->vruntime -= cfs_rq->min_vruntime; 7456 se->vruntime -= cfs_rq->min_vruntime;
@@ -7423,7 +7475,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
7423 */ 7475 */
7424 if (rq->curr == p) { 7476 if (rq->curr == p) {
7425 if (p->prio > oldprio) 7477 if (p->prio > oldprio)
7426 resched_task(rq->curr); 7478 resched_curr(rq);
7427 } else 7479 } else
7428 check_preempt_curr(rq, p, 0); 7480 check_preempt_curr(rq, p, 0);
7429} 7481}
@@ -7486,7 +7538,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)
7486 * if we can still preempt the current task. 7538 * if we can still preempt the current task.
7487 */ 7539 */
7488 if (rq->curr == p) 7540 if (rq->curr == p)
7489 resched_task(rq->curr); 7541 resched_curr(rq);
7490 else 7542 else
7491 check_preempt_curr(rq, p, 0); 7543 check_preempt_curr(rq, p, 0);
7492} 7544}
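
Among the fair.c changes above, the CFS bandwidth timers no longer grab the whole pool up front: distribute_cfs_runtime() now returns the amount it actually handed out, and the callers subtract that from cfs_b->runtime under cfs_b->lock, clamped with min() so the pool never goes negative. A toy model of the new accounting, with invented numbers and none of the real locking or expiry handling:

#include <stdio.h>

int main(void)
{
	long pool = 10000;			/* stands in for cfs_b->runtime */
	long demand[] = { 3000, 4000, 6000 };	/* throttled cfs_rq shortfalls */
	int i;

	for (i = 0; i < 3 && pool > 0; i++) {
		long distributed = demand[i] < pool ? demand[i] : pool;

		/* cfs_b->runtime -= min(distributed, cfs_b->runtime) */
		pool -= distributed;
		printf("cfs_rq[%d] received %ld, pool left %ld\n",
		       i, distributed, pool);
	}
	return 0;
}

The pool shrinks only by what was actually granted and stops at zero, matching the comment in the hunk about over-use being bounded while cfs_b->lock is dropped during distribution.
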
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index cf009fb0bc25..9f1608f99819 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -79,7 +79,7 @@ static void cpuidle_idle_call(void)
79 struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices); 79 struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
80 struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); 80 struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
81 int next_state, entered_state; 81 int next_state, entered_state;
82 bool broadcast; 82 unsigned int broadcast;
83 83
84 /* 84 /*
85 * Check if the idle task must be rescheduled. If it is the 85 * Check if the idle task must be rescheduled. If it is the
@@ -135,7 +135,7 @@ use_default:
135 goto exit_idle; 135 goto exit_idle;
136 } 136 }
137 137
138 broadcast = !!(drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP); 138 broadcast = drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP;
139 139
140 /* 140 /*
141 * Tell the time framework to switch to a broadcast timer 141 * Tell the time framework to switch to a broadcast timer
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 879f2b75266a..67ad4e7f506a 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -20,7 +20,7 @@ select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
20 */ 20 */
21static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags) 21static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)
22{ 22{
23 resched_task(rq->idle); 23 resched_curr(rq);
24} 24}
25 25
26static struct task_struct * 26static struct task_struct *
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index a49083192c64..5f6edca4fafd 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -463,9 +463,10 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
463static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) 463static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
464{ 464{
465 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; 465 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
466 struct rq *rq = rq_of_rt_rq(rt_rq);
466 struct sched_rt_entity *rt_se; 467 struct sched_rt_entity *rt_se;
467 468
468 int cpu = cpu_of(rq_of_rt_rq(rt_rq)); 469 int cpu = cpu_of(rq);
469 470
470 rt_se = rt_rq->tg->rt_se[cpu]; 471 rt_se = rt_rq->tg->rt_se[cpu];
471 472
@@ -476,7 +477,7 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
476 enqueue_rt_entity(rt_se, false); 477 enqueue_rt_entity(rt_se, false);
477 478
478 if (rt_rq->highest_prio.curr < curr->prio) 479 if (rt_rq->highest_prio.curr < curr->prio)
479 resched_task(curr); 480 resched_curr(rq);
480 } 481 }
481} 482}
482 483
@@ -566,7 +567,7 @@ static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
566 return; 567 return;
567 568
568 enqueue_top_rt_rq(rt_rq); 569 enqueue_top_rt_rq(rt_rq);
569 resched_task(rq->curr); 570 resched_curr(rq);
570} 571}
571 572
572static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) 573static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
@@ -740,6 +741,9 @@ balanced:
740 rt_rq->rt_throttled = 0; 741 rt_rq->rt_throttled = 0;
741 raw_spin_unlock(&rt_rq->rt_runtime_lock); 742 raw_spin_unlock(&rt_rq->rt_runtime_lock);
742 raw_spin_unlock(&rt_b->rt_runtime_lock); 743 raw_spin_unlock(&rt_b->rt_runtime_lock);
744
745 /* Make rt_rq available for pick_next_task() */
746 sched_rt_rq_enqueue(rt_rq);
743 } 747 }
744} 748}
745 749
@@ -948,7 +952,7 @@ static void update_curr_rt(struct rq *rq)
948 raw_spin_lock(&rt_rq->rt_runtime_lock); 952 raw_spin_lock(&rt_rq->rt_runtime_lock);
949 rt_rq->rt_time += delta_exec; 953 rt_rq->rt_time += delta_exec;
950 if (sched_rt_runtime_exceeded(rt_rq)) 954 if (sched_rt_runtime_exceeded(rt_rq))
951 resched_task(curr); 955 resched_curr(rq);
952 raw_spin_unlock(&rt_rq->rt_runtime_lock); 956 raw_spin_unlock(&rt_rq->rt_runtime_lock);
953 } 957 }
954 } 958 }
@@ -1363,7 +1367,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
1363 * to try and push current away: 1367 * to try and push current away:
1364 */ 1368 */
1365 requeue_task_rt(rq, p, 1); 1369 requeue_task_rt(rq, p, 1);
1366 resched_task(rq->curr); 1370 resched_curr(rq);
1367} 1371}
1368 1372
1369#endif /* CONFIG_SMP */ 1373#endif /* CONFIG_SMP */
@@ -1374,7 +1378,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
1374static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags) 1378static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
1375{ 1379{
1376 if (p->prio < rq->curr->prio) { 1380 if (p->prio < rq->curr->prio) {
1377 resched_task(rq->curr); 1381 resched_curr(rq);
1378 return; 1382 return;
1379 } 1383 }
1380 1384
@@ -1690,7 +1694,7 @@ retry:
1690 * just reschedule current. 1694 * just reschedule current.
1691 */ 1695 */
1692 if (unlikely(next_task->prio < rq->curr->prio)) { 1696 if (unlikely(next_task->prio < rq->curr->prio)) {
1693 resched_task(rq->curr); 1697 resched_curr(rq);
1694 return 0; 1698 return 0;
1695 } 1699 }
1696 1700
@@ -1737,7 +1741,7 @@ retry:
1737 activate_task(lowest_rq, next_task, 0); 1741 activate_task(lowest_rq, next_task, 0);
1738 ret = 1; 1742 ret = 1;
1739 1743
1740 resched_task(lowest_rq->curr); 1744 resched_curr(lowest_rq);
1741 1745
1742 double_unlock_balance(rq, lowest_rq); 1746 double_unlock_balance(rq, lowest_rq);
1743 1747
@@ -1936,7 +1940,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
1936 return; 1940 return;
1937 1941
1938 if (pull_rt_task(rq)) 1942 if (pull_rt_task(rq))
1939 resched_task(rq->curr); 1943 resched_curr(rq);
1940} 1944}
1941 1945
1942void __init init_sched_rt_class(void) 1946void __init init_sched_rt_class(void)
@@ -1974,7 +1978,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
1974 check_resched = 0; 1978 check_resched = 0;
1975#endif /* CONFIG_SMP */ 1979#endif /* CONFIG_SMP */
1976 if (check_resched && p->prio < rq->curr->prio) 1980 if (check_resched && p->prio < rq->curr->prio)
1977 resched_task(rq->curr); 1981 resched_curr(rq);
1978 } 1982 }
1979} 1983}
1980 1984
@@ -2003,11 +2007,11 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
2003 * Only reschedule if p is still on the same runqueue. 2007 * Only reschedule if p is still on the same runqueue.
2004 */ 2008 */
2005 if (p->prio > rq->rt.highest_prio.curr && rq->curr == p) 2009 if (p->prio > rq->rt.highest_prio.curr && rq->curr == p)
2006 resched_task(p); 2010 resched_curr(rq);
2007#else 2011#else
2008 /* For UP simply resched on drop of prio */ 2012 /* For UP simply resched on drop of prio */
2009 if (oldprio < p->prio) 2013 if (oldprio < p->prio)
2010 resched_task(p); 2014 resched_curr(rq);
2011#endif /* CONFIG_SMP */ 2015#endif /* CONFIG_SMP */
2012 } else { 2016 } else {
2013 /* 2017 /*
@@ -2016,7 +2020,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
2016 * then reschedule. 2020 * then reschedule.
2017 */ 2021 */
2018 if (p->prio < rq->curr->prio) 2022 if (p->prio < rq->curr->prio)
2019 resched_task(rq->curr); 2023 resched_curr(rq);
2020 } 2024 }
2021} 2025}
2022 2026
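
[Editor's note] The idle_task.c and rt.c hunks above convert every resched_task(rq->curr) call into resched_curr(rq); the new prototype appears in the sched.h hunk below. A minimal caller-side sketch of the migration pattern follows. Only resched_curr(struct rq *) comes from this patch; my_check_preempt() and its logic are illustrative, and the callers here already hold rq->lock, which is presumably still expected by the new helper.

    /* Hedged sketch: converting a sched-class hook from resched_task()
     * to resched_curr().  my_check_preempt() is a made-up example.
     */
    #include "sched.h"      /* kernel/sched/sched.h: struct rq, resched_curr() */

    static void my_check_preempt(struct rq *rq, struct task_struct *p)
    {
            /* Old style: name the task explicitly (it was always rq->curr):
             *         resched_task(rq->curr);
             *
             * New style: pass the runqueue and let the callee resolve
             * rq->curr itself.
             */
            if (p->prio < rq->curr->prio)
                    resched_curr(rq);
    }

Every conversion in this series passes the runqueue whose current task used to be passed (including the prio_changed_rt() case, which is guarded by rq->curr == p), so the behaviour is unchanged; only the argument type moves from the task to its runqueue.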
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 31cc02ebc54e..579712f4e9d5 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -477,6 +477,9 @@ struct root_domain {
477 cpumask_var_t span; 477 cpumask_var_t span;
478 cpumask_var_t online; 478 cpumask_var_t online;
479 479
480 /* Indicate more than one runnable task for any CPU */
481 bool overload;
482
480 /* 483 /*
481 * The bit corresponding to a CPU gets set here if such CPU has more 484 * The bit corresponding to a CPU gets set here if such CPU has more
482 * than one runnable -deadline task (as it is below for RT tasks). 485 * than one runnable -deadline task (as it is below for RT tasks).
@@ -884,20 +887,10 @@ enum {
884#undef SCHED_FEAT 887#undef SCHED_FEAT
885 888
886#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL) 889#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)
887static __always_inline bool static_branch__true(struct static_key *key)
888{
889 return static_key_true(key); /* Not out of line branch. */
890}
891
892static __always_inline bool static_branch__false(struct static_key *key)
893{
894 return static_key_false(key); /* Out of line branch. */
895}
896
897#define SCHED_FEAT(name, enabled) \ 890#define SCHED_FEAT(name, enabled) \
898static __always_inline bool static_branch_##name(struct static_key *key) \ 891static __always_inline bool static_branch_##name(struct static_key *key) \
899{ \ 892{ \
900 return static_branch__##enabled(key); \ 893 return static_key_##enabled(key); \
901} 894}
902 895
903#include "features.h" 896#include "features.h"
@@ -1196,7 +1189,7 @@ extern void init_sched_rt_class(void);
1196extern void init_sched_fair_class(void); 1189extern void init_sched_fair_class(void);
1197extern void init_sched_dl_class(void); 1190extern void init_sched_dl_class(void);
1198 1191
1199extern void resched_task(struct task_struct *p); 1192extern void resched_curr(struct rq *rq);
1200extern void resched_cpu(int cpu); 1193extern void resched_cpu(int cpu);
1201 1194
1202extern struct rt_bandwidth def_rt_bandwidth; 1195extern struct rt_bandwidth def_rt_bandwidth;
@@ -1218,15 +1211,26 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
1218 1211
1219 rq->nr_running = prev_nr + count; 1212 rq->nr_running = prev_nr + count;
1220 1213
1221#ifdef CONFIG_NO_HZ_FULL
1222 if (prev_nr < 2 && rq->nr_running >= 2) { 1214 if (prev_nr < 2 && rq->nr_running >= 2) {
1215#ifdef CONFIG_SMP
1216 if (!rq->rd->overload)
1217 rq->rd->overload = true;
1218#endif
1219
1220#ifdef CONFIG_NO_HZ_FULL
1223 if (tick_nohz_full_cpu(rq->cpu)) { 1221 if (tick_nohz_full_cpu(rq->cpu)) {
1224 /* Order rq->nr_running write against the IPI */ 1222 /*
1225 smp_wmb(); 1223 * Tick is needed if more than one task runs on a CPU.
1226 smp_send_reschedule(rq->cpu); 1224 * Send the target an IPI to kick it out of nohz mode.
1225 *
1226 * We assume that IPI implies full memory barrier and the
1227 * new value of rq->nr_running is visible on reception
1228 * from the target.
1229 */
1230 tick_nohz_full_kick_cpu(rq->cpu);
1227 } 1231 }
1228 }
1229#endif 1232#endif
1233 }
1230} 1234}
1231 1235
1232static inline void sub_nr_running(struct rq *rq, unsigned count) 1236static inline void sub_nr_running(struct rq *rq, unsigned count)
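
[Editor's note] Two things happen in add_nr_running() above: the new root_domain.overload flag is raised the first time a runqueue goes from one to two runnable tasks, and the open-coded smp_wmb() plus IPI for nohz-full CPUs is replaced by tick_nohz_full_kick_cpu(), with the comment now recording the assumption that the IPI itself orders the rq->nr_running update. The consumer of rd->overload is not shown on this page, so the sketch below is only a plausible illustration of how a newly idle CPU might use it; my_idle_balance() is an invented name.

    /* Hedged sketch: skip idle balancing when no runqueue in this root
     * domain is overloaded.  Only rq->rd->overload itself is introduced
     * by the hunk above; the surrounding function is illustrative.
     */
    #include "sched.h"      /* kernel/sched/sched.h: struct rq, struct root_domain */

    static int my_idle_balance(struct rq *this_rq)
    {
            int pulled = 0;

            /* No CPU in this root domain has more than one runnable
             * task, so there is nothing worth pulling; bail out cheaply
             * instead of walking the sched domains.
             */
            if (!this_rq->rd->overload)
                    return 0;

            /* ... otherwise iterate the sched domains and try to pull ... */
            return pulled;
    }

Where and how the flag is cleared again belongs to the fair-class changes that are not part of the hunks shown here.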
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 0ffa20ae657b..15cab1a4f84e 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -319,14 +319,14 @@ EXPORT_SYMBOL(wake_bit_function);
319 */ 319 */
320int __sched 320int __sched
321__wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q, 321__wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q,
322 int (*action)(void *), unsigned mode) 322 wait_bit_action_f *action, unsigned mode)
323{ 323{
324 int ret = 0; 324 int ret = 0;
325 325
326 do { 326 do {
327 prepare_to_wait(wq, &q->wait, mode); 327 prepare_to_wait(wq, &q->wait, mode);
328 if (test_bit(q->key.bit_nr, q->key.flags)) 328 if (test_bit(q->key.bit_nr, q->key.flags))
329 ret = (*action)(q->key.flags); 329 ret = (*action)(&q->key);
330 } while (test_bit(q->key.bit_nr, q->key.flags) && !ret); 330 } while (test_bit(q->key.bit_nr, q->key.flags) && !ret);
331 finish_wait(wq, &q->wait); 331 finish_wait(wq, &q->wait);
332 return ret; 332 return ret;
@@ -334,7 +334,7 @@ __wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q,
334EXPORT_SYMBOL(__wait_on_bit); 334EXPORT_SYMBOL(__wait_on_bit);
335 335
336int __sched out_of_line_wait_on_bit(void *word, int bit, 336int __sched out_of_line_wait_on_bit(void *word, int bit,
337 int (*action)(void *), unsigned mode) 337 wait_bit_action_f *action, unsigned mode)
338{ 338{
339 wait_queue_head_t *wq = bit_waitqueue(word, bit); 339 wait_queue_head_t *wq = bit_waitqueue(word, bit);
340 DEFINE_WAIT_BIT(wait, word, bit); 340 DEFINE_WAIT_BIT(wait, word, bit);
@@ -345,7 +345,7 @@ EXPORT_SYMBOL(out_of_line_wait_on_bit);
345 345
346int __sched 346int __sched
347__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, 347__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,
348 int (*action)(void *), unsigned mode) 348 wait_bit_action_f *action, unsigned mode)
349{ 349{
350 do { 350 do {
351 int ret; 351 int ret;
@@ -353,7 +353,7 @@ __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,
353 prepare_to_wait_exclusive(wq, &q->wait, mode); 353 prepare_to_wait_exclusive(wq, &q->wait, mode);
354 if (!test_bit(q->key.bit_nr, q->key.flags)) 354 if (!test_bit(q->key.bit_nr, q->key.flags))
355 continue; 355 continue;
356 ret = action(q->key.flags); 356 ret = action(&q->key);
357 if (!ret) 357 if (!ret)
358 continue; 358 continue;
359 abort_exclusive_wait(wq, &q->wait, mode, &q->key); 359 abort_exclusive_wait(wq, &q->wait, mode, &q->key);
@@ -365,7 +365,7 @@ __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,
365EXPORT_SYMBOL(__wait_on_bit_lock); 365EXPORT_SYMBOL(__wait_on_bit_lock);
366 366
367int __sched out_of_line_wait_on_bit_lock(void *word, int bit, 367int __sched out_of_line_wait_on_bit_lock(void *word, int bit,
368 int (*action)(void *), unsigned mode) 368 wait_bit_action_f *action, unsigned mode)
369{ 369{
370 wait_queue_head_t *wq = bit_waitqueue(word, bit); 370 wait_queue_head_t *wq = bit_waitqueue(word, bit);
371 DEFINE_WAIT_BIT(wait, word, bit); 371 DEFINE_WAIT_BIT(wait, word, bit);
@@ -502,3 +502,21 @@ void wake_up_atomic_t(atomic_t *p)
502 __wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR); 502 __wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR);
503} 503}
504EXPORT_SYMBOL(wake_up_atomic_t); 504EXPORT_SYMBOL(wake_up_atomic_t);
505
506__sched int bit_wait(struct wait_bit_key *word)
507{
508 if (signal_pending_state(current->state, current))
509 return 1;
510 schedule();
511 return 0;
512}
513EXPORT_SYMBOL(bit_wait);
514
515__sched int bit_wait_io(struct wait_bit_key *word)
516{
517 if (signal_pending_state(current->state, current))
518 return 1;
519 io_schedule();
520 return 0;
521}
522EXPORT_SYMBOL(bit_wait_io);
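
[Editor's note] The wait-on-bit core now hands the whole struct wait_bit_key to the action instead of a bare flags pointer, and two stock actions, bit_wait() and bit_wait_io(), are exported so most callers no longer supply their own; the ksm.c, hci_core.c and keys hunks further down simply drop the action argument. A sketch of the two resulting caller styles follows. MY_FLAG_BUSY, my_flags and my_wait_killable are illustrative names; out_of_line_wait_on_bit() is used for the custom-action case because its new signature appears verbatim above, although real callers would more typically go through the wait_on_bit*() wrappers in <linux/wait.h>.

    #include <linux/wait.h>
    #include <linux/sched.h>

    static unsigned long my_flags;          /* illustrative */
    #define MY_FLAG_BUSY 0                  /* illustrative bit number */

    /* Style 1: a plain "sleep until the bit clears" wait needs no
     * callback at all with the simplified API.
     */
    static void my_wait_until_idle(void)
    {
            wait_on_bit(&my_flags, MY_FLAG_BUSY, TASK_UNINTERRUPTIBLE);
    }

    /* Style 2: callers that still need custom behaviour provide a
     * wait_bit_action_f, which now receives the wait_bit_key (compare
     * rpc_wait_bit_killable() in the sunrpc hunk below).
     */
    static int my_wait_killable(struct wait_bit_key *key)
    {
            if (fatal_signal_pending(current))
                    return -ERESTARTSYS;
            schedule();
            return 0;
    }

    static int my_wait_until_idle_killable(void)
    {
            return out_of_line_wait_on_bit(&my_flags, MY_FLAG_BUSY,
                                           my_wait_killable, TASK_KILLABLE);
    }

Note that the stock bit_wait()/bit_wait_io() actions bail out via signal_pending_state(), which only reports signals for interruptible or killable sleep states, so the TASK_UNINTERRUPTIBLE callers in this patch cannot be woken early.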
diff --git a/kernel/smp.c b/kernel/smp.c
index 80c33f8de14f..487653b5844f 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -3,6 +3,7 @@
3 * 3 *
4 * (C) Jens Axboe <jens.axboe@oracle.com> 2008 4 * (C) Jens Axboe <jens.axboe@oracle.com> 2008
5 */ 5 */
6#include <linux/irq_work.h>
6#include <linux/rcupdate.h> 7#include <linux/rcupdate.h>
7#include <linux/rculist.h> 8#include <linux/rculist.h>
8#include <linux/kernel.h> 9#include <linux/kernel.h>
@@ -251,6 +252,14 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
251 csd->func(csd->info); 252 csd->func(csd->info);
252 csd_unlock(csd); 253 csd_unlock(csd);
253 } 254 }
255
256 /*
257 * Handle irq works queued remotely by irq_work_queue_on().
258 * Smp functions above are typically synchronous so they
259 * better run first since some other CPUs may be busy waiting
260 * for them.
261 */
262 irq_work_run();
254} 263}
255 264
256/* 265/*
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index f784d83e29f1..99aa6ee3908f 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -225,13 +225,15 @@ static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
225}; 225};
226 226
227/* 227/*
228 * Kick the current CPU if it's full dynticks in order to force it to 228 * Kick the CPU if it's full dynticks in order to force it to
229 * re-evaluate its dependency on the tick and restart it if necessary. 229 * re-evaluate its dependency on the tick and restart it if necessary.
230 */ 230 */
231void tick_nohz_full_kick(void) 231void tick_nohz_full_kick_cpu(int cpu)
232{ 232{
233 if (tick_nohz_full_cpu(smp_processor_id())) 233 if (!tick_nohz_full_cpu(cpu))
234 irq_work_queue(&__get_cpu_var(nohz_full_kick_work)); 234 return;
235
236 irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu);
235} 237}
236 238
237static void nohz_full_kick_ipi(void *info) 239static void nohz_full_kick_ipi(void *info)
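
[Editor's note] tick_nohz_full_kick_cpu() now reaches a remote CPU through irq_work_queue_on(), and the smp.c hunk above drains the irq_work queue from flush_smp_call_function_queue() so such remotely queued work actually runs. A minimal sketch of the remote-queueing pattern itself; my_kick_work and my_kick_fn are illustrative and unrelated to the nohz code, and the exact return-value semantics of irq_work_queue_on() are not shown on this page.

    #include <linux/irq_work.h>
    #include <linux/smp.h>
    #include <linux/kernel.h>

    /* Runs from the irq_work path on the CPU the work was queued on. */
    static void my_kick_fn(struct irq_work *work)
    {
            pr_info("irq_work ran on CPU %d\n", smp_processor_id());
    }

    static struct irq_work my_kick_work = { .func = my_kick_fn };

    static void my_kick_cpu(int cpu)
    {
            /* Queue the work on @cpu; per the comments in the hunks
             * above, the target is kicked with an IPI and handles the
             * work from its smp-call/irq-work path.
             */
            irq_work_queue_on(&my_kick_work, cpu);
    }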
diff --git a/mm/filemap.c b/mm/filemap.c
index 900edfaf6df5..65d44fd88c78 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -241,18 +241,6 @@ void delete_from_page_cache(struct page *page)
241} 241}
242EXPORT_SYMBOL(delete_from_page_cache); 242EXPORT_SYMBOL(delete_from_page_cache);
243 243
244static int sleep_on_page(void *word)
245{
246 io_schedule();
247 return 0;
248}
249
250static int sleep_on_page_killable(void *word)
251{
252 sleep_on_page(word);
253 return fatal_signal_pending(current) ? -EINTR : 0;
254}
255
256static int filemap_check_errors(struct address_space *mapping) 244static int filemap_check_errors(struct address_space *mapping)
257{ 245{
258 int ret = 0; 246 int ret = 0;
@@ -692,7 +680,7 @@ void wait_on_page_bit(struct page *page, int bit_nr)
692 DEFINE_WAIT_BIT(wait, &page->flags, bit_nr); 680 DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
693 681
694 if (test_bit(bit_nr, &page->flags)) 682 if (test_bit(bit_nr, &page->flags))
695 __wait_on_bit(page_waitqueue(page), &wait, sleep_on_page, 683 __wait_on_bit(page_waitqueue(page), &wait, bit_wait_io,
696 TASK_UNINTERRUPTIBLE); 684 TASK_UNINTERRUPTIBLE);
697} 685}
698EXPORT_SYMBOL(wait_on_page_bit); 686EXPORT_SYMBOL(wait_on_page_bit);
@@ -705,7 +693,7 @@ int wait_on_page_bit_killable(struct page *page, int bit_nr)
705 return 0; 693 return 0;
706 694
707 return __wait_on_bit(page_waitqueue(page), &wait, 695 return __wait_on_bit(page_waitqueue(page), &wait,
708 sleep_on_page_killable, TASK_KILLABLE); 696 bit_wait_io, TASK_KILLABLE);
709} 697}
710 698
711/** 699/**
@@ -806,7 +794,7 @@ void __lock_page(struct page *page)
806{ 794{
807 DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); 795 DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
808 796
809 __wait_on_bit_lock(page_waitqueue(page), &wait, sleep_on_page, 797 __wait_on_bit_lock(page_waitqueue(page), &wait, bit_wait_io,
810 TASK_UNINTERRUPTIBLE); 798 TASK_UNINTERRUPTIBLE);
811} 799}
812EXPORT_SYMBOL(__lock_page); 800EXPORT_SYMBOL(__lock_page);
@@ -816,7 +804,7 @@ int __lock_page_killable(struct page *page)
816 DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); 804 DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
817 805
818 return __wait_on_bit_lock(page_waitqueue(page), &wait, 806 return __wait_on_bit_lock(page_waitqueue(page), &wait,
819 sleep_on_page_killable, TASK_KILLABLE); 807 bit_wait_io, TASK_KILLABLE);
820} 808}
821EXPORT_SYMBOL_GPL(__lock_page_killable); 809EXPORT_SYMBOL_GPL(__lock_page_killable);
822 810
diff --git a/mm/ksm.c b/mm/ksm.c
index 346ddc9e4c0d..fb7590222706 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1978,18 +1978,12 @@ void ksm_migrate_page(struct page *newpage, struct page *oldpage)
1978#endif /* CONFIG_MIGRATION */ 1978#endif /* CONFIG_MIGRATION */
1979 1979
1980#ifdef CONFIG_MEMORY_HOTREMOVE 1980#ifdef CONFIG_MEMORY_HOTREMOVE
1981static int just_wait(void *word)
1982{
1983 schedule();
1984 return 0;
1985}
1986
1987static void wait_while_offlining(void) 1981static void wait_while_offlining(void)
1988{ 1982{
1989 while (ksm_run & KSM_RUN_OFFLINE) { 1983 while (ksm_run & KSM_RUN_OFFLINE) {
1990 mutex_unlock(&ksm_thread_mutex); 1984 mutex_unlock(&ksm_thread_mutex);
1991 wait_on_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE), 1985 wait_on_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE),
1992 just_wait, TASK_UNINTERRUPTIBLE); 1986 TASK_UNINTERRUPTIBLE);
1993 mutex_lock(&ksm_thread_mutex); 1987 mutex_lock(&ksm_thread_mutex);
1994 } 1988 }
1995} 1989}
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index 0a43cce9a914..e090bffe1bf8 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -2186,12 +2186,6 @@ static void hci_inq_req(struct hci_request *req, unsigned long opt)
2186 hci_req_add(req, HCI_OP_INQUIRY, sizeof(cp), &cp); 2186 hci_req_add(req, HCI_OP_INQUIRY, sizeof(cp), &cp);
2187} 2187}
2188 2188
2189static int wait_inquiry(void *word)
2190{
2191 schedule();
2192 return signal_pending(current);
2193}
2194
2195int hci_inquiry(void __user *arg) 2189int hci_inquiry(void __user *arg)
2196{ 2190{
2197 __u8 __user *ptr = arg; 2191 __u8 __user *ptr = arg;
@@ -2242,7 +2236,7 @@ int hci_inquiry(void __user *arg)
2242 /* Wait until Inquiry procedure finishes (HCI_INQUIRY flag is 2236 /* Wait until Inquiry procedure finishes (HCI_INQUIRY flag is
2243 * cleared). If it is interrupted by a signal, return -EINTR. 2237 * cleared). If it is interrupted by a signal, return -EINTR.
2244 */ 2238 */
2245 if (wait_on_bit(&hdev->flags, HCI_INQUIRY, wait_inquiry, 2239 if (wait_on_bit(&hdev->flags, HCI_INQUIRY,
2246 TASK_INTERRUPTIBLE)) 2240 TASK_INTERRUPTIBLE))
2247 return -EINTR; 2241 return -EINTR;
2248 } 2242 }
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index c0365c14b858..9358c79fd589 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -250,7 +250,7 @@ void rpc_destroy_wait_queue(struct rpc_wait_queue *queue)
250} 250}
251EXPORT_SYMBOL_GPL(rpc_destroy_wait_queue); 251EXPORT_SYMBOL_GPL(rpc_destroy_wait_queue);
252 252
253static int rpc_wait_bit_killable(void *word) 253static int rpc_wait_bit_killable(struct wait_bit_key *key)
254{ 254{
255 if (fatal_signal_pending(current)) 255 if (fatal_signal_pending(current))
256 return -ERESTARTSYS; 256 return -ERESTARTSYS;
@@ -309,7 +309,7 @@ static int rpc_complete_task(struct rpc_task *task)
309 * to enforce taking of the wq->lock and hence avoid races with 309 * to enforce taking of the wq->lock and hence avoid races with
310 * rpc_complete_task(). 310 * rpc_complete_task().
311 */ 311 */
312int __rpc_wait_for_completion_task(struct rpc_task *task, int (*action)(void *)) 312int __rpc_wait_for_completion_task(struct rpc_task *task, wait_bit_action_f *action)
313{ 313{
314 if (action == NULL) 314 if (action == NULL)
315 action = rpc_wait_bit_killable; 315 action = rpc_wait_bit_killable;
diff --git a/security/keys/gc.c b/security/keys/gc.c
index d3222b6d7d59..9609a7f0faea 100644
--- a/security/keys/gc.c
+++ b/security/keys/gc.c
@@ -92,15 +92,6 @@ static void key_gc_timer_func(unsigned long data)
92} 92}
93 93
94/* 94/*
95 * wait_on_bit() sleep function for uninterruptible waiting
96 */
97static int key_gc_wait_bit(void *flags)
98{
99 schedule();
100 return 0;
101}
102
103/*
104 * Reap keys of dead type. 95 * Reap keys of dead type.
105 * 96 *
106 * We use three flags to make sure we see three complete cycles of the garbage 97 * We use three flags to make sure we see three complete cycles of the garbage
@@ -123,7 +114,7 @@ void key_gc_keytype(struct key_type *ktype)
123 schedule_work(&key_gc_work); 114 schedule_work(&key_gc_work);
124 115
125 kdebug("sleep"); 116 kdebug("sleep");
126 wait_on_bit(&key_gc_flags, KEY_GC_REAPING_KEYTYPE, key_gc_wait_bit, 117 wait_on_bit(&key_gc_flags, KEY_GC_REAPING_KEYTYPE,
127 TASK_UNINTERRUPTIBLE); 118 TASK_UNINTERRUPTIBLE);
128 119
129 key_gc_dead_keytype = NULL; 120 key_gc_dead_keytype = NULL;
diff --git a/security/keys/request_key.c b/security/keys/request_key.c
index 381411941cc1..26a94f18af94 100644
--- a/security/keys/request_key.c
+++ b/security/keys/request_key.c
@@ -21,24 +21,6 @@
21 21
22#define key_negative_timeout 60 /* default timeout on a negative key's existence */ 22#define key_negative_timeout 60 /* default timeout on a negative key's existence */
23 23
24/*
25 * wait_on_bit() sleep function for uninterruptible waiting
26 */
27static int key_wait_bit(void *flags)
28{
29 schedule();
30 return 0;
31}
32
33/*
34 * wait_on_bit() sleep function for interruptible waiting
35 */
36static int key_wait_bit_intr(void *flags)
37{
38 schedule();
39 return signal_pending(current) ? -ERESTARTSYS : 0;
40}
41
42/** 24/**
43 * complete_request_key - Complete the construction of a key. 25 * complete_request_key - Complete the construction of a key.
44 * @cons: The key construction record. 26 * @cons: The key construction record.
@@ -592,10 +574,9 @@ int wait_for_key_construction(struct key *key, bool intr)
592 int ret; 574 int ret;
593 575
594 ret = wait_on_bit(&key->flags, KEY_FLAG_USER_CONSTRUCT, 576 ret = wait_on_bit(&key->flags, KEY_FLAG_USER_CONSTRUCT,
595 intr ? key_wait_bit_intr : key_wait_bit,
596 intr ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE); 577 intr ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE);
597 if (ret < 0) 578 if (ret)
598 return ret; 579 return -ERESTARTSYS;
599 if (test_bit(KEY_FLAG_NEGATIVE, &key->flags)) { 580 if (test_bit(KEY_FLAG_NEGATIVE, &key->flags)) {
600 smp_rmb(); 581 smp_rmb();
601 return key->type_data.reject_error; 582 return key->type_data.reject_error;
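
[Editor's note] With the per-caller sleep functions gone, wait_for_key_construction() above relies on the stock action chosen by wait_on_bit(): in TASK_INTERRUPTIBLE mode a pending signal makes the wait return non-zero, which the hunk maps to -ERESTARTSYS, while in TASK_UNINTERRUPTIBLE mode signal_pending_state() never fires and the wait only ends when the bit clears. A sketch of the same pattern in caller code; the names are illustrative.

    #include <linux/wait.h>
    #include <linux/sched.h>

    #define MY_OBJ_UNDER_CONSTRUCTION 0     /* illustrative bit number */

    /* Sleep until the construction bit clears.  If @intr is true a
     * signal aborts the wait; the default bit_wait() action reports
     * that as a non-zero return, translated here into an errno.
     */
    static int my_wait_for_object(unsigned long *flags, bool intr)
    {
            int ret;

            ret = wait_on_bit(flags, MY_OBJ_UNDER_CONSTRUCTION,
                              intr ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE);
            if (ret)
                    return -ERESTARTSYS;
            return 0;
    }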