/*
 * Joe's tiny RCU, for small SMP systems.
 *
 * The main purpose of jRCU is to bring together and execute on a single
 * CPU the RCU end-of-batch operations of all CPUs.  This relieves all but
 * one CPU from this periodic responsibility.  This is important when the
 * system has user-supplied realtime applications that require the full
 * use of the CPUs dedicated to those applications.
 *
 * A secondary purpose is to come up with an RCU implementation that is as
 * simple as possible yet still suitable for SMP platforms, at least the
 * smaller ones.  In this regard it fills the gap between TinyRCU, which
 * runs on uniprocessors only, and TreeRCU, a deeply complex implementation
 * best suited for the largest NUMA boxes on Earth.
 *
 * Algorithm: jRCU is frame based.  That is, it periodically wakes up and
 * either advances jRCU state or it NOPs.  For state to advance, every CPU
 * must have had at least one period, however small, where its
 * preempt_count() was zero, since the last time jRCU state advanced.
 *
 * 'Advancing state' simply means moving the functions queued up by
 * call_rcu() along a FIFO.  Those that drop off the end of the FIFO are
 * invoked before being discarded.  jRCU advances batches of functions
 * through the FIFO rather than individual functions; when a function is
 * queued, call_rcu() puts it into the batch that is at the head of the FIFO.
 *
 * jRCU assumes that the frames are large enough that architecture barrier
 * operations performed in one frame have fully completed by the start of
 * the next.  This period is typically in the tens of microseconds, so
 * it may not be wise to run jRCU with a frame period of less than 100 usecs.
 *
 * Author: Joe Korty
 *
 * Acknowledgements: Paul E. McKenney's 'TinyRCU for uniprocessors' inspired
 * the thought that there could be something similarly simple for SMP.
 * The rcu_list chain operators are from Jim Houston's Alternative RCU.
 *
 * Copyright Concurrent Computer Corporation, 2011
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the
 * Free Software Foundation; either version 2 of the License, or (at your
 * option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 */

/*
 * This RCU maintains three callback lists: the current batch (per cpu),
 * the previous batch (also per cpu), and the pending list (global).
 */

#include <linux/bug.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/ctype.h>
#include <linux/sched.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/stddef.h>
#include <linux/string.h>
#include <linux/preempt.h>
#include <linux/uaccess.h>
#include <linux/compiler.h>
#include <linux/irqflags.h>
#include <linux/rcupdate.h>

/*
 * Define an rcu list type and operators.  An rcu list has only ->next
 * pointers for the chain nodes; the list head however is special and
 * has pointers to both the first and last nodes of the chain.  Tweaked
 * so that null head, tail pointers can be used to signify an empty list.
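 *
 * An illustration (not in the original source): an empty list is
 * {head = NULL, tail = NULL}; rcu_list_add() on an empty list leaves it
 * as {head = h, tail = &h->next} with h->next == NULL, so subsequent
 * adds append through *tail without ever walking the chain.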
 */
struct rcu_list {
	struct rcu_head *head;
	struct rcu_head **tail;
	int count;		/* stats-n-debug */
};

static inline void rcu_list_init(struct rcu_list *l)
{
	l->head = NULL;
	l->tail = NULL;
	l->count = 0;
}

/*
 * Add an element to the tail of an rcu list
 */
static inline void rcu_list_add(struct rcu_list *l, struct rcu_head *h)
{
	if (unlikely(l->tail == NULL))
		l->tail = &l->head;
	*l->tail = h;
	l->tail = &h->next;
	l->count++;
	h->next = NULL;
}

/*
 * Append the contents of one rcu list to another.  The 'from' list is left
 * corrupted on exit; the caller must re-initialize it before it can be used
 * again.
 */
static inline void rcu_list_join(struct rcu_list *to, struct rcu_list *from)
{
	if (from->head) {
		if (unlikely(to->tail == NULL)) {
			to->tail = &to->head;
			to->count = 0;
		}
		*to->tail = from->head;
		to->tail = from->tail;
		to->count += from->count;
	}
}

/*
 * selects, in ->cblist[] below, which is the current callback list and which
 * is the previous.
 */
static u8 rcu_which ____cacheline_aligned_in_smp;

struct rcu_data {
	u8 wait;		/* goes false when this cpu consents to
				 * the retirement of the current batch */
	struct rcu_list cblist[2];	/* current & previous callback lists */
	s64 nqueued;		/* #callbacks queued (stats-n-debug) */
} ____cacheline_aligned_in_smp;

static struct rcu_data rcu_data[NR_CPUS];

/* debug & statistics stuff */
static struct rcu_stats {
	unsigned npasses;	/* #passes made */
	unsigned nlast;		/* #passes since last end-of-batch */
	unsigned nbatches;	/* #end-of-batches (eobs) seen */
	unsigned nmis;		/* #passes discarded due to NMI */
	atomic_t nbarriers;	/* #rcu barriers processed */
	atomic_t nsyncs;	/* #rcu syncs processed */
	s64 ninvoked;		/* #invoked (ie, finished) callbacks */
	unsigned nforced;	/* #forced eobs (should be zero) */
} rcu_stats;

#define RCU_HZ			(20)
#define RCU_HZ_PERIOD_US	(USEC_PER_SEC / RCU_HZ)
#define RCU_HZ_DELTA_US		(USEC_PER_SEC / HZ)

static int rcu_hz_period_us = RCU_HZ_PERIOD_US;
static int rcu_hz_delta_us = RCU_HZ_DELTA_US;
static int rcu_hz_precise;

int rcu_scheduler_active __read_mostly;
int rcu_nmi_seen __read_mostly;

static int rcu_wdog_ctr;	/* time since last end-of-batch, in usecs */
static int rcu_wdog_lim = 10 * USEC_PER_SEC;	/* rcu watchdog interval */

/*
 * Return our CPU id or zero if we are too early in the boot process to
 * know what that is.  For RCU to work correctly, a cpu named '0' must
 * eventually be present (but need not ever be online).
 */
#ifdef HAVE_THREAD_INFO_CPU
static inline int rcu_cpu(void)
{
	return current_thread_info()->cpu;
}

#else

static unsigned rcu_cpu_early_flag __read_mostly = 1;

static inline int rcu_cpu(void)
{
	if (unlikely(rcu_cpu_early_flag)) {
		if (!(rcu_scheduler_active && nr_cpu_ids > 1))
			return 0;
		rcu_cpu_early_flag = 0;
	}
	return raw_smp_processor_id();
}
#endif /* HAVE_THREAD_INFO_CPU */

/*
 * Invoke whenever the calling CPU consents to end-of-batch.  All CPUs
 * must so consent before the batch is truly ended.
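 *
 * Clearing ->wait below is this cpu's quiescent-state report: the
 * frame-driven collector, __rcu_delimit_batches(), polls these per-cpu
 * flags once per frame and declares end-of-batch only after every online
 * cpu has either reported here or been observed with a low enough
 * preempt_count() to imply quiescence.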
 */
static inline void rcu_eob(int cpu)
{
	struct rcu_data *rd = &rcu_data[cpu];

	if (unlikely(rd->wait)) {
		rd->wait = 0;
#ifndef CONFIG_JRCU_LAZY
		smp_mb();
#endif
	}
}

void jrcu_read_unlock(void)
{
	if (preempt_count() == 1)
		rcu_eob(rcu_cpu());
	preempt_enable();
}
EXPORT_SYMBOL_GPL(jrcu_read_unlock);

void rcu_note_context_switch(int cpu)
{
	rcu_eob(cpu);
}
EXPORT_SYMBOL_GPL(rcu_note_context_switch);

void rcu_note_might_resched(void)
{
	unsigned long flags;

	raw_local_irq_save(flags);
	rcu_eob(rcu_cpu());
	raw_local_irq_restore(flags);
}
EXPORT_SYMBOL(rcu_note_might_resched);

void synchronize_sched(void)
{
	struct rcu_synchronize rcu;

	if (!rcu_scheduler_active)
		return;

	init_completion(&rcu.completion);
	call_rcu(&rcu.head, wakeme_after_rcu);
	wait_for_completion(&rcu.completion);
	atomic_inc(&rcu_stats.nsyncs);
}
EXPORT_SYMBOL_GPL(synchronize_sched);

void rcu_barrier(void)
{
	synchronize_sched();
	synchronize_sched();
	atomic_inc(&rcu_stats.nbarriers);
}
EXPORT_SYMBOL_GPL(rcu_barrier);

void rcu_force_quiescent_state(void)
{
}
EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);

/*
 * Insert an RCU callback onto the calling CPU's list of 'current batch'
 * callbacks.  Lockless version, can be invoked anywhere except under NMI.
 */
void call_rcu_sched(struct rcu_head *cb, void (*func)(struct rcu_head *rcu))
{
	unsigned long flags;
	struct rcu_data *rd;
	struct rcu_list *cblist;
	int which;

	cb->func = func;
	cb->next = NULL;

	raw_local_irq_save(flags);
	smp_mb();

	rd = &rcu_data[rcu_cpu()];
	which = ACCESS_ONCE(rcu_which);
	cblist = &rd->cblist[which];

	/* The following is not NMI-safe, therefore call_rcu()
	 * cannot be invoked under NMI. */
	rcu_list_add(cblist, cb);
	rd->nqueued++;
	smp_mb();
	raw_local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(call_rcu_sched);

/*
 * Invoke all callbacks on the passed-in list.
 */
static void rcu_invoke_callbacks(struct rcu_list *pending)
{
	struct rcu_head *curr, *next;

	for (curr = pending->head; curr;) {
		unsigned long offset = (unsigned long)curr->func;

		next = curr->next;
		if (__is_kfree_rcu_offset(offset))
			kfree((void *)curr - offset);
		else
			curr->func(curr);
		curr = next;
		rcu_stats.ninvoked++;
	}
}

/*
 * Check if the conditions for ending the current batch are true.  If
 * so then end it.
 *
 * Must be invoked periodically, and the periodic invocations must be
 * far enough apart in time for the previous batch to become quiescent.
 * This is a few tens of microseconds unless NMIs are involved; an NMI
 * stretches out the requirement by the duration of the NMI.
 *
 * "Quiescent" means the owning cpu is no longer appending callbacks
 * and has completed execution of a trailing write-memory-barrier insn.
 */
static void __rcu_delimit_batches(struct rcu_list *pending)
{
	struct rcu_data *rd;
	struct rcu_list *plist;
	int cpu, eob, prev;

	if (!rcu_scheduler_active)
		return;

	rcu_stats.nlast++;

	/* If an NMI occurred then the previous batch may not yet be
	 * quiescent.  Let's wait till it is. */
	if (rcu_nmi_seen) {
		rcu_nmi_seen = 0;
		rcu_stats.nmis++;
		return;
	}

	/*
	 * Find out if the current batch has ended
	 * (end-of-batch).
	 */
	eob = 1;
	for_each_online_cpu(cpu) {
		rd = &rcu_data[cpu];
		if (rd->wait) {
			rd->wait = preempt_count_cpu(cpu) > idle_cpu(cpu);
			if (rd->wait) {
				eob = 0;
				break;
			}
		}
	}

	/*
	 * Exit if batch has not ended.  But first, tickle all non-cooperating
	 * CPUs if enough time has passed.
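	 *
	 * (Watchdog arithmetic: rcu_wdog_ctr advances by rcu_hz_period_us on
	 * each pass that fails to end the batch; once it reaches rcu_wdog_lim,
	 * 10 seconds by default or 'wdog=' seconds via debugfs, every cpu
	 * still holding out is forced through the scheduler so that it will
	 * report a quiescent point.)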
	 */
	if (eob == 0) {
		if (rcu_wdog_ctr >= rcu_wdog_lim) {
			rcu_wdog_ctr = 0;
			rcu_stats.nforced++;
			for_each_online_cpu(cpu) {
				if (rcu_data[cpu].wait)
					force_cpu_resched(cpu);
			}
		}
		rcu_wdog_ctr += rcu_hz_period_us;
		return;
	}

	/*
	 * End the current RCU batch and start a new one.
	 *
	 * This is a two-step operation: move every cpu's previous list
	 * to the global pending list, then tell every cpu to swap its
	 * current and pending lists (ie, toggle rcu_which).
	 *
	 * We tolerate the cpus taking a bit of time noticing this swap;
	 * we expect them to continue to put callbacks on the old current
	 * list (which is now the previous list) for a while.  That time,
	 * however, cannot exceed one RCU_HZ period.
	 */
	prev = ACCESS_ONCE(rcu_which) ^ 1;

	for_each_present_cpu(cpu) {
		rd = &rcu_data[cpu];
		plist = &rd->cblist[prev];
		/* Chain previous batch of callbacks, if any, to the pending list */
		if (plist->head) {
			rcu_list_join(pending, plist);
			rcu_list_init(plist);
		}
		if (cpu_online(cpu)) /* wins race with offlining every time */
			rd->wait = preempt_count_cpu(cpu) > idle_cpu(cpu);
		else
			rd->wait = 0;
	}
	smp_mb(); /* just paranoia, the below xchg should do this on all archs */

	/*
	 * Swap current and previous lists.  The other cpus must not
	 * see this out-of-order w.r.t. the above emptying of each cpu's
	 * previous list.  The xchg accomplishes that and, as a side (but
	 * seemingly unneeded) bonus, keeps this cpu from advancing its insn
	 * counter until the results of that xchg are visible on other cpus.
	 */
	xchg(&rcu_which, prev); /* only place where rcu_which is written to */

	rcu_stats.nbatches++;
	rcu_stats.nlast = 0;
	rcu_wdog_ctr = 0;
}

static void rcu_delimit_batches(void)
{
	unsigned long flags;
	struct rcu_list pending;

	rcu_list_init(&pending);
	rcu_stats.npasses++;

	raw_local_irq_save(flags);
	smp_mb();
	__rcu_delimit_batches(&pending);
	smp_mb();
	raw_local_irq_restore(flags);

	if (pending.head)
		rcu_invoke_callbacks(&pending);
}

/* ------------------ interrupt driver section ------------------ */

/*
 * We drive RCU from a periodic interrupt during most of boot.  Once boot
 * is complete we (optionally) transition to a daemon.
 */

#include <linux/time.h>
#include <linux/ktime.h>
#include <linux/hrtimer.h>
#include <linux/interrupt.h>

#define rcu_hz_period_ns	(rcu_hz_period_us * NSEC_PER_USEC)
#define rcu_hz_delta_ns		(rcu_hz_delta_us * NSEC_PER_USEC)

static struct hrtimer rcu_timer;

static void rcu_softirq_func(struct softirq_action *h)
{
	rcu_delimit_batches();
}

static enum hrtimer_restart rcu_timer_func(struct hrtimer *t)
{
	ktime_t next;

	raise_softirq(RCU_SOFTIRQ);

	next = ktime_add_ns(ktime_get(), rcu_hz_period_ns);
	hrtimer_set_expires_range_ns(&rcu_timer, next, rcu_hz_precise ?
		0 : rcu_hz_delta_ns);
	return HRTIMER_RESTART;
}

static void rcu_timer_start(void)
{
	hrtimer_forward_now(&rcu_timer, ns_to_ktime(rcu_hz_period_ns));
	hrtimer_start_expires(&rcu_timer, HRTIMER_MODE_ABS);
}

static __init void rcu_timer_init(void)
{
	open_softirq(RCU_SOFTIRQ, rcu_softirq_func);

	hrtimer_init(&rcu_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
	rcu_timer.function = rcu_timer_func;
}

#ifdef CONFIG_JRCU_DAEMON
static void rcu_timer_stop(void)
{
	hrtimer_cancel(&rcu_timer);
}
#endif

void __init rcu_scheduler_starting(void)
{
	rcu_timer_init();
}

#ifndef CONFIG_JRCU_DAEMON

static __init int rcu_start_callback_processing(void)
{
	rcu_timer_start();
	rcu_scheduler_active = 1;
	pr_info("JRCU: callback processing via timer has started.\n");
	return 0;
}

#else /* CONFIG_JRCU_DAEMON */

/* ------------------ daemon driver section --------------------- */

/*
 * Once the system is fully up, we will drive the periodic-polling part
 * of JRCU from a kernel daemon, jrcud.  Until then it is driven by
 * an interrupt.
 */

#include <linux/err.h>
#include <linux/delay.h>
#include <linux/kthread.h>

static int rcu_priority;
static struct task_struct *rcu_daemon;

static int jrcu_set_priority(int priority)
{
	struct sched_param param;

	if (priority == 0) {
		set_user_nice(current, -19);
		return 0;
	}

	if (priority < 0)
		param.sched_priority = MAX_USER_RT_PRIO + priority;
	else
		param.sched_priority = priority;

	sched_setscheduler_nocheck(current, SCHED_RR, &param);
	return param.sched_priority;
}

static int jrcud_func(void *arg)
{
	current->flags |= PF_NOFREEZE;
	rcu_priority = jrcu_set_priority(CONFIG_JRCU_DAEMON_PRIO);
	rcu_timer_stop();

	pr_info("JRCU: callback processing via daemon started.\n");

	while (!kthread_should_stop()) {
		if (rcu_hz_precise) {
			usleep_range(rcu_hz_period_us,
				rcu_hz_period_us);
		} else {
			usleep_range(rcu_hz_period_us,
				rcu_hz_period_us + rcu_hz_delta_us);
		}
		rcu_delimit_batches();
	}

	pr_info("JRCU: replaced callback daemon with a timer.\n");
	rcu_daemon = NULL;
	rcu_timer_start();
	return 0;
}

static __init int rcu_start_callback_processing(void)
{
	struct task_struct *p;

	p = kthread_run(jrcud_func, NULL, "jrcud");
	if (IS_ERR(p)) {
		pr_warn("JRCU: cannot replace callback timer with a daemon\n");
		return -ENODEV;
	}
	rcu_daemon = p;
	rcu_scheduler_active = 1;
	pr_info("JRCU: callback processing now allowed.\n");
	return 0;
}

#endif /* CONFIG_JRCU_DAEMON */

subsys_initcall_sync(rcu_start_callback_processing);

/* ------------------ debug and statistics section -------------- */

#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>
#include <linux/seq_file.h>

static int rcu_hz = RCU_HZ;

static int rcu_debugfs_show(struct seq_file *m, void *unused)
{
	int cpu, q;
	s64 nqueued;

	nqueued = 0;
	for_each_present_cpu(cpu)
		nqueued += rcu_data[cpu].nqueued;

	seq_printf(m, "%14u: hz, %s\n", rcu_hz, rcu_hz_precise ?
"precise" : "sloppy"); seq_printf(m, "%14u: watchdog (secs)\n", rcu_wdog_lim / (int)USEC_PER_SEC); seq_printf(m, "%14d: #secs left on watchdog\n", (rcu_wdog_lim - rcu_wdog_ctr) / (int)USEC_PER_SEC); #ifdef CONFIG_JRCU_DAEMON if (rcu_daemon) seq_printf(m, "%14u: daemon priority\n", rcu_priority); else seq_printf(m, "%14s: daemon priority\n", "none, no daemon"); #endif seq_printf(m, "\n"); seq_printf(m, "%14u: #passes\n", rcu_stats.npasses); seq_printf(m, "%14u: #passes discarded due to NMI\n", rcu_stats.nmis); seq_printf(m, "%14u: #passes resulting in end-of-batch\n", rcu_stats.nbatches); seq_printf(m, "%14u: #passes not resulting in end-of-batch\n", rcu_stats.npasses - rcu_stats.nbatches); seq_printf(m, "%14u: #passes since last end-of-batch\n", rcu_stats.nlast); seq_printf(m, "%14u: #passes forced (0 is best)\n", rcu_stats.nforced); seq_printf(m, "\n"); seq_printf(m, "%14u: #barriers\n", atomic_read(&rcu_stats.nbarriers)); seq_printf(m, "%14u: #syncs\n", atomic_read(&rcu_stats.nsyncs)); seq_printf(m, "%14llu: #callbacks invoked\n", rcu_stats.ninvoked); seq_printf(m, "%14d: #callbacks left to invoke\n", (int)(nqueued - rcu_stats.ninvoked)); seq_printf(m, "\n"); for_each_online_cpu(cpu) seq_printf(m, "%4d ", cpu); seq_printf(m, " CPU\n"); for_each_online_cpu(cpu) { struct rcu_data *rd = &rcu_data[cpu]; seq_printf(m, "--%c%c ", idle_cpu(cpu) ? 'I' : '-', rd->wait ? 'W' : '-'); } seq_printf(m, " FLAGS\n"); for (q = 0; q < 2; q++) { int w = ACCESS_ONCE(rcu_which); for_each_online_cpu(cpu) { struct rcu_data *rd = &rcu_data[cpu]; struct rcu_list *l = &rd->cblist[q]; seq_printf(m, "%4d ", l->count); } seq_printf(m, " Q%d%c\n", q, " *"[q == w]); } seq_printf(m, "\nFLAGS:\n"); seq_printf(m, " I - cpu idle, W - cpu waiting for end-of-batch,\n"); seq_printf(m, " * - the current Q, other is the previous Q.\n"); return 0; } static ssize_t rcu_debugfs_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { int i, j, c; char token[32]; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (count <= 0) return count; if (!access_ok(VERIFY_READ, buffer, count)) return -EFAULT; i = 0; if (__get_user(c, &buffer[i++])) return -EFAULT; next: /* Token extractor -- first, skip leading whitepace */ while (c && isspace(c) && i < count) { if (__get_user(c, &buffer[i++])) return -EFAULT; } if (i >= count || c == 0) return count; /* all done, no more tokens */ j = 0; do { if (j == (sizeof(token) - 1)) return -EINVAL; token[j++] = c; if (__get_user(c, &buffer[i++])) return -EFAULT; } while (c && !isspace(c) && i < count); /* extract next token */ token[j++] = 0; if (!strncmp(token, "hz=", 3)) { int rcu_hz_wanted = -1; sscanf(&token[3], "%d", &rcu_hz_wanted); if (rcu_hz_wanted < 2 || rcu_hz_wanted > 1000) return -EINVAL; rcu_hz = rcu_hz_wanted; rcu_hz_period_us = USEC_PER_SEC / rcu_hz; } else if (!strncmp(token, "precise=", 8)) { sscanf(&token[8], "%d", &rcu_hz_precise); } else if (!strncmp(token, "wdog=", 5)) { int wdog = -1; sscanf(&token[5], "%d", &wdog); if (wdog < 3 || wdog > 1000) return -EINVAL; rcu_wdog_lim = wdog * USEC_PER_SEC; } else return -EINVAL; goto next; } static int rcu_debugfs_open(struct inode *inode, struct file *file) { return single_open(file, rcu_debugfs_show, NULL); } static const struct file_operations rcu_debugfs_fops = { .owner = THIS_MODULE, .open = rcu_debugfs_open, .read = seq_read, .write = rcu_debugfs_write, .llseek = seq_lseek, .release = single_release, }; static struct dentry *rcudir; static int __init rcu_debugfs_init(void) { struct dentry *retval; rcudir = 
debugfs_create_dir("rcu", NULL); if (!rcudir) goto error; retval = debugfs_create_file("rcudata", 0644, rcudir, NULL, &rcu_debugfs_fops); if (!retval) goto error; return 0; error: debugfs_remove_recursive(rcudir); pr_warn("JRCU: Could not create debugfs files.\n"); return -ENOSYS; } late_initcall(rcu_debugfs_init); #endif /* CONFIG_DEBUG_FS */