Diffstat (limited to 'kernel/rcutree_plugin.h')
-rw-r--r--  kernel/rcutree_plugin.h | 466
1 files changed, 443 insertions, 23 deletions
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 63098a59216e..130c97b027f2 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -28,7 +28,7 @@
 #include <linux/gfp.h>
 #include <linux/oom.h>
 #include <linux/smpboot.h>
-#include <linux/tick.h>
+#include "time/tick-internal.h"
 
 #define RCU_KTHREAD_PRIO 1
 
@@ -110,9 +110,7 @@ static void __init rcu_bootup_announce_oddness(void)
 
 #ifdef CONFIG_TREE_PREEMPT_RCU
 
-struct rcu_state rcu_preempt_state =
-	RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
-DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
+RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
 static struct rcu_state *rcu_state = &rcu_preempt_state;
 
 static int rcu_preempted_readers_exp(struct rcu_node *rnp);
@@ -169,7 +167,7 @@ static void rcu_preempt_qs(int cpu)
 	struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
 
 	if (rdp->passed_quiesce == 0)
-		trace_rcu_grace_period("rcu_preempt", rdp->gpnum, "cpuqs");
+		trace_rcu_grace_period(TPS("rcu_preempt"), rdp->gpnum, TPS("cpuqs"));
 	rdp->passed_quiesce = 1;
 	current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
 }
@@ -388,7 +386,7 @@ void rcu_read_unlock_special(struct task_struct *t)
 		np = rcu_next_node_entry(t, rnp);
 		list_del_init(&t->rcu_node_entry);
 		t->rcu_blocked_node = NULL;
-		trace_rcu_unlock_preempted_task("rcu_preempt",
+		trace_rcu_unlock_preempted_task(TPS("rcu_preempt"),
 						rnp->gpnum, t->pid);
 		if (&t->rcu_node_entry == rnp->gp_tasks)
 			rnp->gp_tasks = np;
@@ -412,7 +410,7 @@ void rcu_read_unlock_special(struct task_struct *t)
 		 */
 		empty_exp_now = !rcu_preempted_readers_exp(rnp);
 		if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) {
-			trace_rcu_quiescent_state_report("preempt_rcu",
+			trace_rcu_quiescent_state_report(TPS("preempt_rcu"),
 							 rnp->gpnum,
 							 0, rnp->qsmask,
 							 rnp->level,
@@ -1250,12 +1248,12 @@ static int rcu_boost_kthread(void *arg)
 	int spincnt = 0;
 	int more2boost;
 
-	trace_rcu_utilization("Start boost kthread@init");
+	trace_rcu_utilization(TPS("Start boost kthread@init"));
 	for (;;) {
 		rnp->boost_kthread_status = RCU_KTHREAD_WAITING;
-		trace_rcu_utilization("End boost kthread@rcu_wait");
+		trace_rcu_utilization(TPS("End boost kthread@rcu_wait"));
 		rcu_wait(rnp->boost_tasks || rnp->exp_tasks);
-		trace_rcu_utilization("Start boost kthread@rcu_wait");
+		trace_rcu_utilization(TPS("Start boost kthread@rcu_wait"));
 		rnp->boost_kthread_status = RCU_KTHREAD_RUNNING;
 		more2boost = rcu_boost(rnp);
 		if (more2boost)
@@ -1264,14 +1262,14 @@ static int rcu_boost_kthread(void *arg)
 			spincnt = 0;
 		if (spincnt > 10) {
 			rnp->boost_kthread_status = RCU_KTHREAD_YIELDING;
-			trace_rcu_utilization("End boost kthread@rcu_yield");
+			trace_rcu_utilization(TPS("End boost kthread@rcu_yield"));
 			schedule_timeout_interruptible(2);
-			trace_rcu_utilization("Start boost kthread@rcu_yield");
+			trace_rcu_utilization(TPS("Start boost kthread@rcu_yield"));
 			spincnt = 0;
 		}
 	}
 	/* NOTREACHED */
-	trace_rcu_utilization("End boost kthread@notreached");
+	trace_rcu_utilization(TPS("End boost kthread@notreached"));
 	return 0;
 }
 
@@ -1352,7 +1350,7 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
  * already exist. We only create this kthread for preemptible RCU.
  * Returns zero if all is well, a negated errno otherwise.
  */
-static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
+static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
 						 struct rcu_node *rnp)
 {
 	int rnp_index = rnp - &rsp->node[0];
@@ -1419,7 +1417,7 @@ static void rcu_cpu_kthread(unsigned int cpu)
 	int spincnt;
 
 	for (spincnt = 0; spincnt < 10; spincnt++) {
-		trace_rcu_utilization("Start CPU kthread@rcu_wait");
+		trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
 		local_bh_disable();
 		*statusp = RCU_KTHREAD_RUNNING;
 		this_cpu_inc(rcu_cpu_kthread_loops);
@@ -1431,15 +1429,15 @@ static void rcu_cpu_kthread(unsigned int cpu)
 		rcu_kthread_do_work();
 		local_bh_enable();
 		if (*workp == 0) {
-			trace_rcu_utilization("End CPU kthread@rcu_wait");
+			trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
 			*statusp = RCU_KTHREAD_WAITING;
 			return;
 		}
 	}
 	*statusp = RCU_KTHREAD_YIELDING;
-	trace_rcu_utilization("Start CPU kthread@rcu_yield");
+	trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
 	schedule_timeout_interruptible(2);
-	trace_rcu_utilization("End CPU kthread@rcu_yield");
+	trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
 	*statusp = RCU_KTHREAD_WAITING;
 }
 
@@ -1507,7 +1505,7 @@ static int __init rcu_spawn_kthreads(void)
 }
 early_initcall(rcu_spawn_kthreads);
 
-static void __cpuinit rcu_prepare_kthreads(int cpu)
+static void rcu_prepare_kthreads(int cpu)
 {
 	struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
 	struct rcu_node *rnp = rdp->mynode;
@@ -1549,7 +1547,7 @@ static int __init rcu_scheduler_really_started(void)
 }
 early_initcall(rcu_scheduler_really_started);
 
-static void __cpuinit rcu_prepare_kthreads(int cpu)
+static void rcu_prepare_kthreads(int cpu)
 {
 }
 
@@ -2202,7 +2200,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
 	 * Wait for the grace period. Do so interruptibly to avoid messing
 	 * up the load average.
 	 */
-	trace_rcu_future_gp(rnp, rdp, c, "StartWait");
+	trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait"));
 	for (;;) {
 		wait_event_interruptible(
 			rnp->nocb_gp_wq[c & 0x1],
@@ -2210,9 +2208,9 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
 		if (likely(d))
 			break;
 		flush_signals(current);
-		trace_rcu_future_gp(rnp, rdp, c, "ResumeWait");
+		trace_rcu_future_gp(rnp, rdp, c, TPS("ResumeWait"));
 	}
-	trace_rcu_future_gp(rnp, rdp, c, "EndWait");
+	trace_rcu_future_gp(rnp, rdp, c, TPS("EndWait"));
 	smp_mb(); /* Ensure that CB invocation happens after GP end. */
 }
 
@@ -2375,3 +2373,425 @@ static void rcu_kick_nohz_cpu(int cpu)
 	smp_send_reschedule(cpu);
 #endif /* #ifdef CONFIG_NO_HZ_FULL */
 }
+
+
+#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
+
+/*
+ * Define RCU flavor that holds sysidle state. This needs to be the
+ * most active flavor of RCU.
+ */
+#ifdef CONFIG_PREEMPT_RCU
+static struct rcu_state *rcu_sysidle_state = &rcu_preempt_state;
+#else /* #ifdef CONFIG_PREEMPT_RCU */
+static struct rcu_state *rcu_sysidle_state = &rcu_sched_state;
+#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
+
+static int full_sysidle_state;		/* Current system-idle state. */
+#define RCU_SYSIDLE_NOT		0	/* Some CPU is not idle. */
+#define RCU_SYSIDLE_SHORT	1	/* All CPUs idle for brief period. */
+#define RCU_SYSIDLE_LONG	2	/* All CPUs idle for long enough. */
+#define RCU_SYSIDLE_FULL	3	/* All CPUs idle, ready for sysidle. */
+#define RCU_SYSIDLE_FULL_NOTED	4	/* Actually entered sysidle state. */
+
+/*
+ * Invoked to note exit from irq or task transition to idle. Note that
+ * usermode execution does -not- count as idle here! After all, we want
+ * to detect full-system idle states, not RCU quiescent states and grace
+ * periods. The caller must have disabled interrupts.
+ */
+static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq)
+{
+	unsigned long j;
+
+	/* Adjust nesting, check for fully idle. */
+	if (irq) {
+		rdtp->dynticks_idle_nesting--;
+		WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0);
+		if (rdtp->dynticks_idle_nesting != 0)
+			return; /* Still not fully idle. */
+	} else {
+		if ((rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) ==
+		    DYNTICK_TASK_NEST_VALUE) {
+			rdtp->dynticks_idle_nesting = 0;
+		} else {
+			rdtp->dynticks_idle_nesting -= DYNTICK_TASK_NEST_VALUE;
+			WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0);
+			return; /* Still not fully idle. */
+		}
+	}
+
+	/* Record start of fully idle period. */
+	j = jiffies;
+	ACCESS_ONCE(rdtp->dynticks_idle_jiffies) = j;
+	smp_mb__before_atomic_inc();
+	atomic_inc(&rdtp->dynticks_idle);
+	smp_mb__after_atomic_inc();
+	WARN_ON_ONCE(atomic_read(&rdtp->dynticks_idle) & 0x1);
+}
+
+/*
+ * Unconditionally force exit from full system-idle state. This is
+ * invoked when a normal CPU exits idle, but must be called separately
+ * for the timekeeping CPU (tick_do_timer_cpu). The reason for this
+ * is that the timekeeping CPU is permitted to take scheduling-clock
+ * interrupts while the system is in system-idle state, and of course
+ * rcu_sysidle_exit() has no way of distinguishing a scheduling-clock
+ * interrupt from any other type of interrupt.
+ */
+void rcu_sysidle_force_exit(void)
+{
+	int oldstate = ACCESS_ONCE(full_sysidle_state);
+	int newoldstate;
+
+	/*
+	 * Each pass through the following loop attempts to exit full
+	 * system-idle state. If contention proves to be a problem,
+	 * a trylock-based contention tree could be used here.
+	 */
+	while (oldstate > RCU_SYSIDLE_SHORT) {
+		newoldstate = cmpxchg(&full_sysidle_state,
+				      oldstate, RCU_SYSIDLE_NOT);
+		if (oldstate == newoldstate &&
+		    oldstate == RCU_SYSIDLE_FULL_NOTED) {
+			rcu_kick_nohz_cpu(tick_do_timer_cpu);
+			return; /* We cleared it, done! */
+		}
+		oldstate = newoldstate;
+	}
+	smp_mb(); /* Order initial oldstate fetch vs. later non-idle work. */
+}
+
+/*
+ * Invoked to note entry to irq or task transition from idle. Note that
+ * usermode execution does -not- count as idle here! The caller must
+ * have disabled interrupts.
+ */
+static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq)
+{
+	/* Adjust nesting, check for already non-idle. */
+	if (irq) {
+		rdtp->dynticks_idle_nesting++;
+		WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0);
+		if (rdtp->dynticks_idle_nesting != 1)
+			return; /* Already non-idle. */
+	} else {
+		/*
+		 * Allow for irq misnesting. Yes, it really is possible
+		 * to enter an irq handler then never leave it, and maybe
+		 * also vice versa. Handle both possibilities.
+		 */
+		if (rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) {
+			rdtp->dynticks_idle_nesting += DYNTICK_TASK_NEST_VALUE;
+			WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0);
+			return; /* Already non-idle. */
+		} else {
+			rdtp->dynticks_idle_nesting = DYNTICK_TASK_EXIT_IDLE;
+		}
+	}
+
+	/* Record end of idle period. */
+	smp_mb__before_atomic_inc();
+	atomic_inc(&rdtp->dynticks_idle);
+	smp_mb__after_atomic_inc();
+	WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks_idle) & 0x1));
+
+	/*
+	 * If we are the timekeeping CPU, we are permitted to be non-idle
+	 * during a system-idle state. This must be the case, because
+	 * the timekeeping CPU has to take scheduling-clock interrupts
+	 * during the time that the system is transitioning to full
+	 * system-idle state. This means that the timekeeping CPU must
+	 * invoke rcu_sysidle_force_exit() directly if it does anything
+	 * more than take a scheduling-clock interrupt.
+	 */
+	if (smp_processor_id() == tick_do_timer_cpu)
+		return;
+
+	/* Update system-idle state: We are clearly no longer fully idle! */
+	rcu_sysidle_force_exit();
+}
+
+/*
+ * Check to see if the current CPU is idle. Note that usermode execution
+ * does not count as idle. The caller must have disabled interrupts.
+ */
+static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
+				  unsigned long *maxj)
+{
+	int cur;
+	unsigned long j;
+	struct rcu_dynticks *rdtp = rdp->dynticks;
+
+	/*
+	 * If some other CPU has already reported non-idle, if this is
+	 * not the flavor of RCU that tracks sysidle state, or if this
+	 * is an offline or the timekeeping CPU, nothing to do.
+	 */
+	if (!*isidle || rdp->rsp != rcu_sysidle_state ||
+	    cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu)
+		return;
+	if (rcu_gp_in_progress(rdp->rsp))
+		WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu);
+
+	/* Pick up current idle and NMI-nesting counter and check. */
+	cur = atomic_read(&rdtp->dynticks_idle);
+	if (cur & 0x1) {
+		*isidle = false; /* We are not idle! */
+		return;
+	}
+	smp_mb(); /* Read counters before timestamps. */
+
+	/* Pick up timestamps. */
+	j = ACCESS_ONCE(rdtp->dynticks_idle_jiffies);
+	/* If this CPU entered idle more recently, update maxj timestamp. */
+	if (ULONG_CMP_LT(*maxj, j))
+		*maxj = j;
+}
+
+/*
+ * Is this the flavor of RCU that is handling full-system idle?
+ */
+static bool is_sysidle_rcu_state(struct rcu_state *rsp)
+{
+	return rsp == rcu_sysidle_state;
+}
+
+/*
+ * Bind the grace-period kthread for the sysidle flavor of RCU to the
+ * timekeeping CPU.
+ */
+static void rcu_bind_gp_kthread(void)
+{
+	int cpu = ACCESS_ONCE(tick_do_timer_cpu);
+
+	if (cpu < 0 || cpu >= nr_cpu_ids)
+		return;
+	if (raw_smp_processor_id() != cpu)
+		set_cpus_allowed_ptr(current, cpumask_of(cpu));
+}
+
+/*
+ * Return a delay in jiffies based on the number of CPUs, rcu_node
+ * leaf fanout, and jiffies tick rate. The idea is to allow larger
+ * systems more time to transition to full-idle state in order to
+ * avoid the cache thrashing that otherwise occur on the state variable.
+ * Really small systems (less than a couple of tens of CPUs) should
+ * instead use a single global atomically incremented counter, and later
+ * versions of this will automatically reconfigure themselves accordingly.
+ */
+static unsigned long rcu_sysidle_delay(void)
+{
+	if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL)
+		return 0;
+	return DIV_ROUND_UP(nr_cpu_ids * HZ, rcu_fanout_leaf * 1000);
+}
+
+/*
+ * Advance the full-system-idle state. This is invoked when all of
+ * the non-timekeeping CPUs are idle.
+ */
+static void rcu_sysidle(unsigned long j)
+{
+	/* Check the current state. */
+	switch (ACCESS_ONCE(full_sysidle_state)) {
+	case RCU_SYSIDLE_NOT:
+
+		/* First time all are idle, so note a short idle period. */
+		ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_SHORT;
+		break;
+
+	case RCU_SYSIDLE_SHORT:
+
+		/*
+		 * Idle for a bit, time to advance to next state?
+		 * cmpxchg failure means race with non-idle, let them win.
+		 */
+		if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay()))
+			(void)cmpxchg(&full_sysidle_state,
+				      RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG);
+		break;
+
+	case RCU_SYSIDLE_LONG:
+
+		/*
+		 * Do an additional check pass before advancing to full.
+		 * cmpxchg failure means race with non-idle, let them win.
+		 */
+		if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay()))
+			(void)cmpxchg(&full_sysidle_state,
+				      RCU_SYSIDLE_LONG, RCU_SYSIDLE_FULL);
+		break;
+
+	default:
+		break;
+	}
+}
+
+/*
+ * Found a non-idle non-timekeeping CPU, so kick the system-idle state
+ * back to the beginning.
+ */
+static void rcu_sysidle_cancel(void)
+{
+	smp_mb();
+	ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_NOT;
+}
+
+/*
+ * Update the sysidle state based on the results of a force-quiescent-state
+ * scan of the CPUs' dyntick-idle state.
+ */
+static void rcu_sysidle_report(struct rcu_state *rsp, int isidle,
+			       unsigned long maxj, bool gpkt)
+{
+	if (rsp != rcu_sysidle_state)
+		return; /* Wrong flavor, ignore. */
+	if (gpkt && nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL)
+		return; /* Running state machine from timekeeping CPU. */
+	if (isidle)
+		rcu_sysidle(maxj); /* More idle! */
+	else
+		rcu_sysidle_cancel(); /* Idle is over. */
+}
+
+/*
+ * Wrapper for rcu_sysidle_report() when called from the grace-period
+ * kthread's context.
+ */
+static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
+				  unsigned long maxj)
+{
+	rcu_sysidle_report(rsp, isidle, maxj, true);
+}
+
+/* Callback and function for forcing an RCU grace period. */
+struct rcu_sysidle_head {
+	struct rcu_head rh;
+	int inuse;
+};
+
+static void rcu_sysidle_cb(struct rcu_head *rhp)
+{
+	struct rcu_sysidle_head *rshp;
+
+	/*
+	 * The following memory barrier is needed to replace the
+	 * memory barriers that would normally be in the memory
+	 * allocator.
+	 */
+	smp_mb(); /* grace period precedes setting inuse. */
+
+	rshp = container_of(rhp, struct rcu_sysidle_head, rh);
+	ACCESS_ONCE(rshp->inuse) = 0;
+}
+
+/*
+ * Check to see if the system is fully idle, other than the timekeeping CPU.
+ * The caller must have disabled interrupts.
+ */
+bool rcu_sys_is_idle(void)
+{
+	static struct rcu_sysidle_head rsh;
+	int rss = ACCESS_ONCE(full_sysidle_state);
+
+	if (WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu))
+		return false;
+
+	/* Handle small-system case by doing a full scan of CPUs. */
+	if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) {
+		int oldrss = rss - 1;
+
+		/*
+		 * One pass to advance to each state up to _FULL.
+		 * Give up if any pass fails to advance the state.
+		 */
+		while (rss < RCU_SYSIDLE_FULL && oldrss < rss) {
+			int cpu;
+			bool isidle = true;
+			unsigned long maxj = jiffies - ULONG_MAX / 4;
+			struct rcu_data *rdp;
+
+			/* Scan all the CPUs looking for nonidle CPUs. */
+			for_each_possible_cpu(cpu) {
+				rdp = per_cpu_ptr(rcu_sysidle_state->rda, cpu);
+				rcu_sysidle_check_cpu(rdp, &isidle, &maxj);
+				if (!isidle)
+					break;
+			}
+			rcu_sysidle_report(rcu_sysidle_state,
+					   isidle, maxj, false);
+			oldrss = rss;
+			rss = ACCESS_ONCE(full_sysidle_state);
+		}
+	}
+
+	/* If this is the first observation of an idle period, record it. */
+	if (rss == RCU_SYSIDLE_FULL) {
+		rss = cmpxchg(&full_sysidle_state,
+			      RCU_SYSIDLE_FULL, RCU_SYSIDLE_FULL_NOTED);
+		return rss == RCU_SYSIDLE_FULL;
+	}
+
+	smp_mb(); /* ensure rss load happens before later caller actions. */
+
+	/* If already fully idle, tell the caller (in case of races). */
+	if (rss == RCU_SYSIDLE_FULL_NOTED)
+		return true;
+
+	/*
+	 * If we aren't there yet, and a grace period is not in flight,
+	 * initiate a grace period. Either way, tell the caller that
+	 * we are not there yet. We use an xchg() rather than an assignment
+	 * to make up for the memory barriers that would otherwise be
+	 * provided by the memory allocator.
+	 */
+	if (nr_cpu_ids > CONFIG_NO_HZ_FULL_SYSIDLE_SMALL &&
+	    !rcu_gp_in_progress(rcu_sysidle_state) &&
+	    !rsh.inuse && xchg(&rsh.inuse, 1) == 0)
+		call_rcu(&rsh.rh, rcu_sysidle_cb);
+	return false;
+}
+
+/*
+ * Initialize dynticks sysidle state for CPUs coming online.
+ */
+static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp)
+{
+	rdtp->dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE;
+}
+
+#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
+
+static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq)
+{
+}
+
+static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq)
+{
+}
+
+static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
+				  unsigned long *maxj)
+{
+}
+
+static bool is_sysidle_rcu_state(struct rcu_state *rsp)
+{
+	return false;
+}
+
+static void rcu_bind_gp_kthread(void)
+{
+}
+
+static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
+				  unsigned long maxj)
+{
+}
+
+static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp)
+{
+}
+
+#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
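
The heart of the CONFIG_NO_HZ_FULL_SYSIDLE code added above is the small state machine kept in full_sysidle_state: repeated scans that find every non-timekeeping CPU idle advance it NOT -> SHORT -> LONG -> FULL, the timekeeping CPU then notes FULL_NOTED in rcu_sys_is_idle(), and any CPU that goes non-idle yanks it straight back to NOT via a cmpxchg in rcu_sysidle_force_exit(). Below is a minimal user-space sketch of that progression, not kernel code: the names sysidle_advance(), sysidle_cancel(), and state_cmpxchg() are invented for illustration, GCC's __sync_val_compare_and_swap stands in for cmpxchg(), and the jiffies-based rcu_sysidle_delay() checks and memory-ordering details are deliberately omitted.

/*
 * Illustrative user-space model of the full_sysidle_state machine.
 * Assumptions: single-threaded driver loop, no timing delays, no
 * smp_mb()/ACCESS_ONCE() equivalents; names below are hypothetical.
 */
#include <stdbool.h>
#include <stdio.h>

enum {
	RCU_SYSIDLE_NOT,	/* Some CPU is not idle. */
	RCU_SYSIDLE_SHORT,	/* All CPUs idle for brief period. */
	RCU_SYSIDLE_LONG,	/* All CPUs idle for long enough. */
	RCU_SYSIDLE_FULL,	/* All CPUs idle, ready for sysidle. */
	RCU_SYSIDLE_FULL_NOTED,	/* Actually entered sysidle state. */
};

static int full_sysidle_state = RCU_SYSIDLE_NOT;

/* Stand-in for cmpxchg(): advance only if the state is still as expected. */
static int state_cmpxchg(int old, int new)
{
	return __sync_val_compare_and_swap(&full_sysidle_state, old, new);
}

/* All non-timekeeping CPUs were found idle on this scan: try to advance. */
static void sysidle_advance(void)
{
	switch (full_sysidle_state) {
	case RCU_SYSIDLE_NOT:
		full_sysidle_state = RCU_SYSIDLE_SHORT;
		break;
	case RCU_SYSIDLE_SHORT:
		(void)state_cmpxchg(RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG);
		break;
	case RCU_SYSIDLE_LONG:
		(void)state_cmpxchg(RCU_SYSIDLE_LONG, RCU_SYSIDLE_FULL);
		break;
	default:
		break;	/* FULL advances only via the rcu_sys_is_idle() path. */
	}
}

/* Some CPU went non-idle: kick the state machine back to the beginning. */
static void sysidle_cancel(void)
{
	full_sysidle_state = RCU_SYSIDLE_NOT;
}

int main(void)
{
	/* Simulated scan results: 1 = every watched CPU idle, 0 = someone ran. */
	const int scan_all_idle[] = { 1, 1, 1, 0, 1, 1, 1, 1 };
	const char *names[] = { "NOT", "SHORT", "LONG", "FULL", "FULL_NOTED" };
	unsigned int i;

	for (i = 0; i < sizeof(scan_all_idle) / sizeof(scan_all_idle[0]); i++) {
		if (scan_all_idle[i])
			sysidle_advance();
		else
			sysidle_cancel();
		printf("scan %u: all idle=%d -> %s\n",
		       i, scan_all_idle[i], names[full_sysidle_state]);
	}

	/* The timekeeping CPU now notes full idleness, as in rcu_sys_is_idle(). */
	if (state_cmpxchg(RCU_SYSIDLE_FULL, RCU_SYSIDLE_FULL_NOTED) ==
	    RCU_SYSIDLE_FULL)
		printf("system is fully idle (%s)\n", names[full_sysidle_state]);
	return 0;
}

In the actual patch, the SHORT -> LONG and LONG -> FULL steps additionally require rcu_sysidle_delay() jiffies to have elapsed, and on systems larger than CONFIG_NO_HZ_FULL_SYSIDLE_SMALL the advancing is driven from the grace-period kthread (via rcu_sysidle_report_gp()) rather than from the scan loop in rcu_sys_is_idle().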