aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorIngo Molnar <mingo@kernel.org>2012-07-25 15:40:40 -0400
committerIngo Molnar <mingo@kernel.org>2012-07-25 15:40:40 -0400
commitd431adfbc9b7de651f3164c6b7ffcad75805d7e4 (patch)
tree29bce222c81a3a392e51c11e2188659aa6d1bded /kernel
parentd6250a3f12edb3a86db9598ffeca3de8b4a219e9 (diff)
parente2b34e311be3a57c9abcb927e37a57e38913714c (diff)
Merge branch 'linus' into x86/urgent
Merge in Linus's tree to avoid a conflict. Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel')
-rw-r--r--kernel/debug/kdb/kdb_main.c91
-rw-r--r--kernel/debug/kdb/kdb_private.h1
-rw-r--r--kernel/events/core.c49
-rw-r--r--kernel/events/uprobes.c461
-rw-r--r--kernel/fork.c11
-rw-r--r--kernel/hrtimer.c53
-rw-r--r--kernel/power/hibernate.c8
-rw-r--r--kernel/power/user.c2
-rw-r--r--kernel/printk.c285
-rw-r--r--kernel/rcupdate.c44
-rw-r--r--kernel/rcutiny.c4
-rw-r--r--kernel/rcutiny_plugin.h56
-rw-r--r--kernel/rcutorture.c72
-rw-r--r--kernel/rcutree.c479
-rw-r--r--kernel/rcutree.h47
-rw-r--r--kernel/rcutree_plugin.h237
-rw-r--r--kernel/rcutree_trace.c148
-rw-r--r--kernel/sched/core.c276
-rw-r--r--kernel/sched/idle_task.c1
-rw-r--r--kernel/sched/sched.h2
-rw-r--r--kernel/smp.c20
-rw-r--r--kernel/smpboot.h2
-rw-r--r--kernel/sys.c16
-rw-r--r--kernel/time/ntp.c8
-rw-r--r--kernel/time/tick-sched.c194
-rw-r--r--kernel/time/timekeeping.c511
-rw-r--r--kernel/time/timer_list.c4
-rw-r--r--kernel/timer.c110
-rw-r--r--kernel/trace/ftrace.c8
-rw-r--r--kernel/trace/ring_buffer.c10
-rw-r--r--kernel/trace/trace.c33
-rw-r--r--kernel/trace/trace.h8
-rw-r--r--kernel/trace/trace_functions_graph.c2
-rw-r--r--kernel/trace/trace_output.c2
34 files changed, 1838 insertions, 1417 deletions
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 67b847dfa2bb..1f91413edb87 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -14,6 +14,7 @@
14#include <linux/ctype.h> 14#include <linux/ctype.h>
15#include <linux/string.h> 15#include <linux/string.h>
16#include <linux/kernel.h> 16#include <linux/kernel.h>
17#include <linux/kmsg_dump.h>
17#include <linux/reboot.h> 18#include <linux/reboot.h>
18#include <linux/sched.h> 19#include <linux/sched.h>
19#include <linux/sysrq.h> 20#include <linux/sysrq.h>
@@ -2040,8 +2041,15 @@ static int kdb_env(int argc, const char **argv)
2040 */ 2041 */
2041static int kdb_dmesg(int argc, const char **argv) 2042static int kdb_dmesg(int argc, const char **argv)
2042{ 2043{
2043 char *syslog_data[4], *start, *end, c = '\0', *p; 2044 int diag;
2044 int diag, logging, logsize, lines = 0, adjust = 0, n; 2045 int logging;
2046 int lines = 0;
2047 int adjust = 0;
2048 int n = 0;
2049 int skip = 0;
2050 struct kmsg_dumper dumper = { .active = 1 };
2051 size_t len;
2052 char buf[201];
2045 2053
2046 if (argc > 2) 2054 if (argc > 2)
2047 return KDB_ARGCOUNT; 2055 return KDB_ARGCOUNT;
@@ -2064,22 +2072,10 @@ static int kdb_dmesg(int argc, const char **argv)
2064 kdb_set(2, setargs); 2072 kdb_set(2, setargs);
2065 } 2073 }
2066 2074
2067 /* syslog_data[0,1] physical start, end+1. syslog_data[2,3] 2075 kmsg_dump_rewind_nolock(&dumper);
2068 * logical start, end+1. */ 2076 while (kmsg_dump_get_line_nolock(&dumper, 1, NULL, 0, NULL))
2069 kdb_syslog_data(syslog_data); 2077 n++;
2070 if (syslog_data[2] == syslog_data[3]) 2078
2071 return 0;
2072 logsize = syslog_data[1] - syslog_data[0];
2073 start = syslog_data[2];
2074 end = syslog_data[3];
2075#define KDB_WRAP(p) (((p - syslog_data[0]) % logsize) + syslog_data[0])
2076 for (n = 0, p = start; p < end; ++p) {
2077 c = *KDB_WRAP(p);
2078 if (c == '\n')
2079 ++n;
2080 }
2081 if (c != '\n')
2082 ++n;
2083 if (lines < 0) { 2079 if (lines < 0) {
2084 if (adjust >= n) 2080 if (adjust >= n)
2085 kdb_printf("buffer only contains %d lines, nothing " 2081 kdb_printf("buffer only contains %d lines, nothing "
@@ -2087,21 +2083,11 @@ static int kdb_dmesg(int argc, const char **argv)
2087 else if (adjust - lines >= n) 2083 else if (adjust - lines >= n)
2088 kdb_printf("buffer only contains %d lines, last %d " 2084 kdb_printf("buffer only contains %d lines, last %d "
2089 "lines printed\n", n, n - adjust); 2085 "lines printed\n", n, n - adjust);
2090 if (adjust) { 2086 skip = adjust;
2091 for (; start < end && adjust; ++start) { 2087 lines = abs(lines);
2092 if (*KDB_WRAP(start) == '\n')
2093 --adjust;
2094 }
2095 if (start < end)
2096 ++start;
2097 }
2098 for (p = start; p < end && lines; ++p) {
2099 if (*KDB_WRAP(p) == '\n')
2100 ++lines;
2101 }
2102 end = p;
2103 } else if (lines > 0) { 2088 } else if (lines > 0) {
2104 int skip = n - (adjust + lines); 2089 skip = n - lines - adjust;
2090 lines = abs(lines);
2105 if (adjust >= n) { 2091 if (adjust >= n) {
2106 kdb_printf("buffer only contains %d lines, " 2092 kdb_printf("buffer only contains %d lines, "
2107 "nothing printed\n", n); 2093 "nothing printed\n", n);
@@ -2112,35 +2098,24 @@ static int kdb_dmesg(int argc, const char **argv)
2112 kdb_printf("buffer only contains %d lines, first " 2098 kdb_printf("buffer only contains %d lines, first "
2113 "%d lines printed\n", n, lines); 2099 "%d lines printed\n", n, lines);
2114 } 2100 }
2115 for (; start < end && skip; ++start) { 2101 } else {
2116 if (*KDB_WRAP(start) == '\n') 2102 lines = n;
2117 --skip;
2118 }
2119 for (p = start; p < end && lines; ++p) {
2120 if (*KDB_WRAP(p) == '\n')
2121 --lines;
2122 }
2123 end = p;
2124 } 2103 }
2125 /* Do a line at a time (max 200 chars) to reduce protocol overhead */ 2104
2126 c = '\n'; 2105 if (skip >= n || skip < 0)
2127 while (start != end) { 2106 return 0;
2128 char buf[201]; 2107
2129 p = buf; 2108 kmsg_dump_rewind_nolock(&dumper);
2130 if (KDB_FLAG(CMD_INTERRUPT)) 2109 while (kmsg_dump_get_line_nolock(&dumper, 1, buf, sizeof(buf), &len)) {
2131 return 0; 2110 if (skip) {
2132 while (start < end && (c = *KDB_WRAP(start)) && 2111 skip--;
2133 (p - buf) < sizeof(buf)-1) { 2112 continue;
2134 ++start;
2135 *p++ = c;
2136 if (c == '\n')
2137 break;
2138 } 2113 }
2139 *p = '\0'; 2114 if (!lines--)
2140 kdb_printf("%s", buf); 2115 break;
2116
2117 kdb_printf("%.*s\n", (int)len - 1, buf);
2141 } 2118 }
2142 if (c != '\n')
2143 kdb_printf("\n");
2144 2119
2145 return 0; 2120 return 0;
2146} 2121}
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index 47c4e56e513b..392ec6a25844 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -205,7 +205,6 @@ extern char kdb_grep_string[];
205extern int kdb_grep_leading; 205extern int kdb_grep_leading;
206extern int kdb_grep_trailing; 206extern int kdb_grep_trailing;
207extern char *kdb_cmds[]; 207extern char *kdb_cmds[];
208extern void kdb_syslog_data(char *syslog_data[]);
209extern unsigned long kdb_task_state_string(const char *); 208extern unsigned long kdb_task_state_string(const char *);
210extern char kdb_task_state_char (const struct task_struct *); 209extern char kdb_task_state_char (const struct task_struct *);
211extern unsigned long kdb_task_state(const struct task_struct *p, 210extern unsigned long kdb_task_state(const struct task_struct *p,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index d7d71d6ec972..f1cf0edeb39a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1645,6 +1645,8 @@ perf_install_in_context(struct perf_event_context *ctx,
1645 lockdep_assert_held(&ctx->mutex); 1645 lockdep_assert_held(&ctx->mutex);
1646 1646
1647 event->ctx = ctx; 1647 event->ctx = ctx;
1648 if (event->cpu != -1)
1649 event->cpu = cpu;
1648 1650
1649 if (!task) { 1651 if (!task) {
1650 /* 1652 /*
@@ -6252,6 +6254,8 @@ SYSCALL_DEFINE5(perf_event_open,
6252 } 6254 }
6253 } 6255 }
6254 6256
6257 get_online_cpus();
6258
6255 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, 6259 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
6256 NULL, NULL); 6260 NULL, NULL);
6257 if (IS_ERR(event)) { 6261 if (IS_ERR(event)) {
@@ -6304,7 +6308,7 @@ SYSCALL_DEFINE5(perf_event_open,
6304 /* 6308 /*
6305 * Get the target context (task or percpu): 6309 * Get the target context (task or percpu):
6306 */ 6310 */
6307 ctx = find_get_context(pmu, task, cpu); 6311 ctx = find_get_context(pmu, task, event->cpu);
6308 if (IS_ERR(ctx)) { 6312 if (IS_ERR(ctx)) {
6309 err = PTR_ERR(ctx); 6313 err = PTR_ERR(ctx);
6310 goto err_alloc; 6314 goto err_alloc;
@@ -6377,20 +6381,23 @@ SYSCALL_DEFINE5(perf_event_open,
6377 mutex_lock(&ctx->mutex); 6381 mutex_lock(&ctx->mutex);
6378 6382
6379 if (move_group) { 6383 if (move_group) {
6380 perf_install_in_context(ctx, group_leader, cpu); 6384 synchronize_rcu();
6385 perf_install_in_context(ctx, group_leader, event->cpu);
6381 get_ctx(ctx); 6386 get_ctx(ctx);
6382 list_for_each_entry(sibling, &group_leader->sibling_list, 6387 list_for_each_entry(sibling, &group_leader->sibling_list,
6383 group_entry) { 6388 group_entry) {
6384 perf_install_in_context(ctx, sibling, cpu); 6389 perf_install_in_context(ctx, sibling, event->cpu);
6385 get_ctx(ctx); 6390 get_ctx(ctx);
6386 } 6391 }
6387 } 6392 }
6388 6393
6389 perf_install_in_context(ctx, event, cpu); 6394 perf_install_in_context(ctx, event, event->cpu);
6390 ++ctx->generation; 6395 ++ctx->generation;
6391 perf_unpin_context(ctx); 6396 perf_unpin_context(ctx);
6392 mutex_unlock(&ctx->mutex); 6397 mutex_unlock(&ctx->mutex);
6393 6398
6399 put_online_cpus();
6400
6394 event->owner = current; 6401 event->owner = current;
6395 6402
6396 mutex_lock(&current->perf_event_mutex); 6403 mutex_lock(&current->perf_event_mutex);
@@ -6419,6 +6426,7 @@ err_context:
6419err_alloc: 6426err_alloc:
6420 free_event(event); 6427 free_event(event);
6421err_task: 6428err_task:
6429 put_online_cpus();
6422 if (task) 6430 if (task)
6423 put_task_struct(task); 6431 put_task_struct(task);
6424err_group_fd: 6432err_group_fd:
@@ -6479,6 +6487,39 @@ err:
6479} 6487}
6480EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); 6488EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
6481 6489
6490void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
6491{
6492 struct perf_event_context *src_ctx;
6493 struct perf_event_context *dst_ctx;
6494 struct perf_event *event, *tmp;
6495 LIST_HEAD(events);
6496
6497 src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
6498 dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;
6499
6500 mutex_lock(&src_ctx->mutex);
6501 list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
6502 event_entry) {
6503 perf_remove_from_context(event);
6504 put_ctx(src_ctx);
6505 list_add(&event->event_entry, &events);
6506 }
6507 mutex_unlock(&src_ctx->mutex);
6508
6509 synchronize_rcu();
6510
6511 mutex_lock(&dst_ctx->mutex);
6512 list_for_each_entry_safe(event, tmp, &events, event_entry) {
6513 list_del(&event->event_entry);
6514 if (event->state >= PERF_EVENT_STATE_OFF)
6515 event->state = PERF_EVENT_STATE_INACTIVE;
6516 perf_install_in_context(dst_ctx, event, dst_cpu);
6517 get_ctx(dst_ctx);
6518 }
6519 mutex_unlock(&dst_ctx->mutex);
6520}
6521EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
6522
6482static void sync_child_event(struct perf_event *child_event, 6523static void sync_child_event(struct perf_event *child_event,
6483 struct task_struct *child) 6524 struct task_struct *child)
6484{ 6525{
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 985be4d80fe8..f93532748bca 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -38,13 +38,29 @@
38#define UINSNS_PER_PAGE (PAGE_SIZE/UPROBE_XOL_SLOT_BYTES) 38#define UINSNS_PER_PAGE (PAGE_SIZE/UPROBE_XOL_SLOT_BYTES)
39#define MAX_UPROBE_XOL_SLOTS UINSNS_PER_PAGE 39#define MAX_UPROBE_XOL_SLOTS UINSNS_PER_PAGE
40 40
41static struct srcu_struct uprobes_srcu;
42static struct rb_root uprobes_tree = RB_ROOT; 41static struct rb_root uprobes_tree = RB_ROOT;
43 42
44static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */ 43static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */
45 44
46#define UPROBES_HASH_SZ 13 45#define UPROBES_HASH_SZ 13
47 46
47/*
48 * We need separate register/unregister and mmap/munmap lock hashes because
49 * of mmap_sem nesting.
50 *
51 * uprobe_register() needs to install probes on (potentially) all processes
52 * and thus needs to acquire multiple mmap_sems (consequtively, not
53 * concurrently), whereas uprobe_mmap() is called while holding mmap_sem
54 * for the particular process doing the mmap.
55 *
56 * uprobe_register()->register_for_each_vma() needs to drop/acquire mmap_sem
57 * because of lock order against i_mmap_mutex. This means there's a hole in
58 * the register vma iteration where a mmap() can happen.
59 *
60 * Thus uprobe_register() can race with uprobe_mmap() and we can try and
61 * install a probe where one is already installed.
62 */
63
48/* serialize (un)register */ 64/* serialize (un)register */
49static struct mutex uprobes_mutex[UPROBES_HASH_SZ]; 65static struct mutex uprobes_mutex[UPROBES_HASH_SZ];
50 66
@@ -61,17 +77,6 @@ static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
61 */ 77 */
62static atomic_t uprobe_events = ATOMIC_INIT(0); 78static atomic_t uprobe_events = ATOMIC_INIT(0);
63 79
64/*
65 * Maintain a temporary per vma info that can be used to search if a vma
66 * has already been handled. This structure is introduced since extending
67 * vm_area_struct wasnt recommended.
68 */
69struct vma_info {
70 struct list_head probe_list;
71 struct mm_struct *mm;
72 loff_t vaddr;
73};
74
75struct uprobe { 80struct uprobe {
76 struct rb_node rb_node; /* node in the rb tree */ 81 struct rb_node rb_node; /* node in the rb tree */
77 atomic_t ref; 82 atomic_t ref;
@@ -100,7 +105,8 @@ static bool valid_vma(struct vm_area_struct *vma, bool is_register)
100 if (!is_register) 105 if (!is_register)
101 return true; 106 return true;
102 107
103 if ((vma->vm_flags & (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)) == (VM_READ|VM_EXEC)) 108 if ((vma->vm_flags & (VM_HUGETLB|VM_READ|VM_WRITE|VM_EXEC|VM_SHARED))
109 == (VM_READ|VM_EXEC))
104 return true; 110 return true;
105 111
106 return false; 112 return false;
@@ -129,33 +135,17 @@ static loff_t vma_address(struct vm_area_struct *vma, loff_t offset)
129static int __replace_page(struct vm_area_struct *vma, struct page *page, struct page *kpage) 135static int __replace_page(struct vm_area_struct *vma, struct page *page, struct page *kpage)
130{ 136{
131 struct mm_struct *mm = vma->vm_mm; 137 struct mm_struct *mm = vma->vm_mm;
132 pgd_t *pgd;
133 pud_t *pud;
134 pmd_t *pmd;
135 pte_t *ptep;
136 spinlock_t *ptl;
137 unsigned long addr; 138 unsigned long addr;
138 int err = -EFAULT; 139 spinlock_t *ptl;
140 pte_t *ptep;
139 141
140 addr = page_address_in_vma(page, vma); 142 addr = page_address_in_vma(page, vma);
141 if (addr == -EFAULT) 143 if (addr == -EFAULT)
142 goto out; 144 return -EFAULT;
143
144 pgd = pgd_offset(mm, addr);
145 if (!pgd_present(*pgd))
146 goto out;
147
148 pud = pud_offset(pgd, addr);
149 if (!pud_present(*pud))
150 goto out;
151
152 pmd = pmd_offset(pud, addr);
153 if (!pmd_present(*pmd))
154 goto out;
155 145
156 ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); 146 ptep = page_check_address(page, mm, addr, &ptl, 0);
157 if (!ptep) 147 if (!ptep)
158 goto out; 148 return -EAGAIN;
159 149
160 get_page(kpage); 150 get_page(kpage);
161 page_add_new_anon_rmap(kpage, vma, addr); 151 page_add_new_anon_rmap(kpage, vma, addr);
@@ -174,10 +164,8 @@ static int __replace_page(struct vm_area_struct *vma, struct page *page, struct
174 try_to_free_swap(page); 164 try_to_free_swap(page);
175 put_page(page); 165 put_page(page);
176 pte_unmap_unlock(ptep, ptl); 166 pte_unmap_unlock(ptep, ptl);
177 err = 0;
178 167
179out: 168 return 0;
180 return err;
181} 169}
182 170
183/** 171/**
@@ -222,9 +210,8 @@ static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
222 void *vaddr_old, *vaddr_new; 210 void *vaddr_old, *vaddr_new;
223 struct vm_area_struct *vma; 211 struct vm_area_struct *vma;
224 struct uprobe *uprobe; 212 struct uprobe *uprobe;
225 loff_t addr;
226 int ret; 213 int ret;
227 214retry:
228 /* Read the page with vaddr into memory */ 215 /* Read the page with vaddr into memory */
229 ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &old_page, &vma); 216 ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &old_page, &vma);
230 if (ret <= 0) 217 if (ret <= 0)
@@ -246,10 +233,6 @@ static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
246 if (mapping != vma->vm_file->f_mapping) 233 if (mapping != vma->vm_file->f_mapping)
247 goto put_out; 234 goto put_out;
248 235
249 addr = vma_address(vma, uprobe->offset);
250 if (vaddr != (unsigned long)addr)
251 goto put_out;
252
253 ret = -ENOMEM; 236 ret = -ENOMEM;
254 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr); 237 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr);
255 if (!new_page) 238 if (!new_page)
@@ -267,11 +250,7 @@ static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
267 vaddr_new = kmap_atomic(new_page); 250 vaddr_new = kmap_atomic(new_page);
268 251
269 memcpy(vaddr_new, vaddr_old, PAGE_SIZE); 252 memcpy(vaddr_new, vaddr_old, PAGE_SIZE);
270 253 memcpy(vaddr_new + (vaddr & ~PAGE_MASK), &opcode, UPROBE_SWBP_INSN_SIZE);
271 /* poke the new insn in, ASSUMES we don't cross page boundary */
272 vaddr &= ~PAGE_MASK;
273 BUG_ON(vaddr + UPROBE_SWBP_INSN_SIZE > PAGE_SIZE);
274 memcpy(vaddr_new + vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
275 254
276 kunmap_atomic(vaddr_new); 255 kunmap_atomic(vaddr_new);
277 kunmap_atomic(vaddr_old); 256 kunmap_atomic(vaddr_old);
@@ -291,6 +270,8 @@ unlock_out:
291put_out: 270put_out:
292 put_page(old_page); 271 put_page(old_page);
293 272
273 if (unlikely(ret == -EAGAIN))
274 goto retry;
294 return ret; 275 return ret;
295} 276}
296 277
@@ -312,7 +293,7 @@ static int read_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_
312 void *vaddr_new; 293 void *vaddr_new;
313 int ret; 294 int ret;
314 295
315 ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &page, NULL); 296 ret = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &page, NULL);
316 if (ret <= 0) 297 if (ret <= 0)
317 return ret; 298 return ret;
318 299
@@ -333,10 +314,20 @@ static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr)
333 uprobe_opcode_t opcode; 314 uprobe_opcode_t opcode;
334 int result; 315 int result;
335 316
317 if (current->mm == mm) {
318 pagefault_disable();
319 result = __copy_from_user_inatomic(&opcode, (void __user*)vaddr,
320 sizeof(opcode));
321 pagefault_enable();
322
323 if (likely(result == 0))
324 goto out;
325 }
326
336 result = read_opcode(mm, vaddr, &opcode); 327 result = read_opcode(mm, vaddr, &opcode);
337 if (result) 328 if (result)
338 return result; 329 return result;
339 330out:
340 if (is_swbp_insn(&opcode)) 331 if (is_swbp_insn(&opcode))
341 return 1; 332 return 1;
342 333
@@ -355,7 +346,9 @@ static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr)
355int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) 346int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
356{ 347{
357 int result; 348 int result;
358 349 /*
350 * See the comment near uprobes_hash().
351 */
359 result = is_swbp_at_addr(mm, vaddr); 352 result = is_swbp_at_addr(mm, vaddr);
360 if (result == 1) 353 if (result == 1)
361 return -EEXIST; 354 return -EEXIST;
@@ -520,7 +513,6 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
520 uprobe->inode = igrab(inode); 513 uprobe->inode = igrab(inode);
521 uprobe->offset = offset; 514 uprobe->offset = offset;
522 init_rwsem(&uprobe->consumer_rwsem); 515 init_rwsem(&uprobe->consumer_rwsem);
523 INIT_LIST_HEAD(&uprobe->pending_list);
524 516
525 /* add to uprobes_tree, sorted on inode:offset */ 517 /* add to uprobes_tree, sorted on inode:offset */
526 cur_uprobe = insert_uprobe(uprobe); 518 cur_uprobe = insert_uprobe(uprobe);
@@ -588,20 +580,22 @@ static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
588} 580}
589 581
590static int 582static int
591__copy_insn(struct address_space *mapping, struct vm_area_struct *vma, char *insn, 583__copy_insn(struct address_space *mapping, struct file *filp, char *insn,
592 unsigned long nbytes, unsigned long offset) 584 unsigned long nbytes, loff_t offset)
593{ 585{
594 struct file *filp = vma->vm_file;
595 struct page *page; 586 struct page *page;
596 void *vaddr; 587 void *vaddr;
597 unsigned long off1; 588 unsigned long off;
598 unsigned long idx; 589 pgoff_t idx;
599 590
600 if (!filp) 591 if (!filp)
601 return -EINVAL; 592 return -EINVAL;
602 593
603 idx = (unsigned long)(offset >> PAGE_CACHE_SHIFT); 594 if (!mapping->a_ops->readpage)
604 off1 = offset &= ~PAGE_MASK; 595 return -EIO;
596
597 idx = offset >> PAGE_CACHE_SHIFT;
598 off = offset & ~PAGE_MASK;
605 599
606 /* 600 /*
607 * Ensure that the page that has the original instruction is 601 * Ensure that the page that has the original instruction is
@@ -612,22 +606,20 @@ __copy_insn(struct address_space *mapping, struct vm_area_struct *vma, char *ins
612 return PTR_ERR(page); 606 return PTR_ERR(page);
613 607
614 vaddr = kmap_atomic(page); 608 vaddr = kmap_atomic(page);
615 memcpy(insn, vaddr + off1, nbytes); 609 memcpy(insn, vaddr + off, nbytes);
616 kunmap_atomic(vaddr); 610 kunmap_atomic(vaddr);
617 page_cache_release(page); 611 page_cache_release(page);
618 612
619 return 0; 613 return 0;
620} 614}
621 615
622static int 616static int copy_insn(struct uprobe *uprobe, struct file *filp)
623copy_insn(struct uprobe *uprobe, struct vm_area_struct *vma, unsigned long addr)
624{ 617{
625 struct address_space *mapping; 618 struct address_space *mapping;
626 unsigned long nbytes; 619 unsigned long nbytes;
627 int bytes; 620 int bytes;
628 621
629 addr &= ~PAGE_MASK; 622 nbytes = PAGE_SIZE - (uprobe->offset & ~PAGE_MASK);
630 nbytes = PAGE_SIZE - addr;
631 mapping = uprobe->inode->i_mapping; 623 mapping = uprobe->inode->i_mapping;
632 624
633 /* Instruction at end of binary; copy only available bytes */ 625 /* Instruction at end of binary; copy only available bytes */
@@ -638,13 +630,13 @@ copy_insn(struct uprobe *uprobe, struct vm_area_struct *vma, unsigned long addr)
638 630
639 /* Instruction at the page-boundary; copy bytes in second page */ 631 /* Instruction at the page-boundary; copy bytes in second page */
640 if (nbytes < bytes) { 632 if (nbytes < bytes) {
641 if (__copy_insn(mapping, vma, uprobe->arch.insn + nbytes, 633 int err = __copy_insn(mapping, filp, uprobe->arch.insn + nbytes,
642 bytes - nbytes, uprobe->offset + nbytes)) 634 bytes - nbytes, uprobe->offset + nbytes);
643 return -ENOMEM; 635 if (err)
644 636 return err;
645 bytes = nbytes; 637 bytes = nbytes;
646 } 638 }
647 return __copy_insn(mapping, vma, uprobe->arch.insn, bytes, uprobe->offset); 639 return __copy_insn(mapping, filp, uprobe->arch.insn, bytes, uprobe->offset);
648} 640}
649 641
650/* 642/*
@@ -672,9 +664,8 @@ copy_insn(struct uprobe *uprobe, struct vm_area_struct *vma, unsigned long addr)
672 */ 664 */
673static int 665static int
674install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, 666install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
675 struct vm_area_struct *vma, loff_t vaddr) 667 struct vm_area_struct *vma, unsigned long vaddr)
676{ 668{
677 unsigned long addr;
678 int ret; 669 int ret;
679 670
680 /* 671 /*
@@ -687,20 +678,22 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
687 if (!uprobe->consumers) 678 if (!uprobe->consumers)
688 return -EEXIST; 679 return -EEXIST;
689 680
690 addr = (unsigned long)vaddr;
691
692 if (!(uprobe->flags & UPROBE_COPY_INSN)) { 681 if (!(uprobe->flags & UPROBE_COPY_INSN)) {
693 ret = copy_insn(uprobe, vma, addr); 682 ret = copy_insn(uprobe, vma->vm_file);
694 if (ret) 683 if (ret)
695 return ret; 684 return ret;
696 685
697 if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn)) 686 if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn))
698 return -EEXIST; 687 return -ENOTSUPP;
699 688
700 ret = arch_uprobe_analyze_insn(&uprobe->arch, mm); 689 ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr);
701 if (ret) 690 if (ret)
702 return ret; 691 return ret;
703 692
693 /* write_opcode() assumes we don't cross page boundary */
694 BUG_ON((uprobe->offset & ~PAGE_MASK) +
695 UPROBE_SWBP_INSN_SIZE > PAGE_SIZE);
696
704 uprobe->flags |= UPROBE_COPY_INSN; 697 uprobe->flags |= UPROBE_COPY_INSN;
705 } 698 }
706 699
@@ -713,7 +706,7 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
713 * Hence increment before and decrement on failure. 706 * Hence increment before and decrement on failure.
714 */ 707 */
715 atomic_inc(&mm->uprobes_state.count); 708 atomic_inc(&mm->uprobes_state.count);
716 ret = set_swbp(&uprobe->arch, mm, addr); 709 ret = set_swbp(&uprobe->arch, mm, vaddr);
717 if (ret) 710 if (ret)
718 atomic_dec(&mm->uprobes_state.count); 711 atomic_dec(&mm->uprobes_state.count);
719 712
@@ -721,27 +714,21 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
721} 714}
722 715
723static void 716static void
724remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, loff_t vaddr) 717remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr)
725{ 718{
726 if (!set_orig_insn(&uprobe->arch, mm, (unsigned long)vaddr, true)) 719 if (!set_orig_insn(&uprobe->arch, mm, vaddr, true))
727 atomic_dec(&mm->uprobes_state.count); 720 atomic_dec(&mm->uprobes_state.count);
728} 721}
729 722
730/* 723/*
731 * There could be threads that have hit the breakpoint and are entering the 724 * There could be threads that have already hit the breakpoint. They
732 * notifier code and trying to acquire the uprobes_treelock. The thread 725 * will recheck the current insn and restart if find_uprobe() fails.
733 * calling delete_uprobe() that is removing the uprobe from the rb_tree can 726 * See find_active_uprobe().
734 * race with these threads and might acquire the uprobes_treelock compared
735 * to some of the breakpoint hit threads. In such a case, the breakpoint
736 * hit threads will not find the uprobe. The current unregistering thread
737 * waits till all other threads have hit a breakpoint, to acquire the
738 * uprobes_treelock before the uprobe is removed from the rbtree.
739 */ 727 */
740static void delete_uprobe(struct uprobe *uprobe) 728static void delete_uprobe(struct uprobe *uprobe)
741{ 729{
742 unsigned long flags; 730 unsigned long flags;
743 731
744 synchronize_srcu(&uprobes_srcu);
745 spin_lock_irqsave(&uprobes_treelock, flags); 732 spin_lock_irqsave(&uprobes_treelock, flags);
746 rb_erase(&uprobe->rb_node, &uprobes_tree); 733 rb_erase(&uprobe->rb_node, &uprobes_tree);
747 spin_unlock_irqrestore(&uprobes_treelock, flags); 734 spin_unlock_irqrestore(&uprobes_treelock, flags);
@@ -750,139 +737,135 @@ static void delete_uprobe(struct uprobe *uprobe)
750 atomic_dec(&uprobe_events); 737 atomic_dec(&uprobe_events);
751} 738}
752 739
753static struct vma_info * 740struct map_info {
754__find_next_vma_info(struct address_space *mapping, struct list_head *head, 741 struct map_info *next;
755 struct vma_info *vi, loff_t offset, bool is_register) 742 struct mm_struct *mm;
743 unsigned long vaddr;
744};
745
746static inline struct map_info *free_map_info(struct map_info *info)
747{
748 struct map_info *next = info->next;
749 kfree(info);
750 return next;
751}
752
753static struct map_info *
754build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
756{ 755{
756 unsigned long pgoff = offset >> PAGE_SHIFT;
757 struct prio_tree_iter iter; 757 struct prio_tree_iter iter;
758 struct vm_area_struct *vma; 758 struct vm_area_struct *vma;
759 struct vma_info *tmpvi; 759 struct map_info *curr = NULL;
760 unsigned long pgoff; 760 struct map_info *prev = NULL;
761 int existing_vma; 761 struct map_info *info;
762 loff_t vaddr; 762 int more = 0;
763
764 pgoff = offset >> PAGE_SHIFT;
765 763
764 again:
765 mutex_lock(&mapping->i_mmap_mutex);
766 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 766 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
767 if (!valid_vma(vma, is_register)) 767 if (!valid_vma(vma, is_register))
768 continue; 768 continue;
769 769
770 existing_vma = 0; 770 if (!prev && !more) {
771 vaddr = vma_address(vma, offset); 771 /*
772 772 * Needs GFP_NOWAIT to avoid i_mmap_mutex recursion through
773 list_for_each_entry(tmpvi, head, probe_list) { 773 * reclaim. This is optimistic, no harm done if it fails.
774 if (tmpvi->mm == vma->vm_mm && tmpvi->vaddr == vaddr) { 774 */
775 existing_vma = 1; 775 prev = kmalloc(sizeof(struct map_info),
776 break; 776 GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN);
777 } 777 if (prev)
778 prev->next = NULL;
778 } 779 }
779 780 if (!prev) {
780 /* 781 more++;
781 * Another vma needs a probe to be installed. However skip 782 continue;
782 * installing the probe if the vma is about to be unlinked.
783 */
784 if (!existing_vma && atomic_inc_not_zero(&vma->vm_mm->mm_users)) {
785 vi->mm = vma->vm_mm;
786 vi->vaddr = vaddr;
787 list_add(&vi->probe_list, head);
788
789 return vi;
790 } 783 }
791 }
792 784
793 return NULL; 785 if (!atomic_inc_not_zero(&vma->vm_mm->mm_users))
794} 786 continue;
795
796/*
797 * Iterate in the rmap prio tree and find a vma where a probe has not
798 * yet been inserted.
799 */
800static struct vma_info *
801find_next_vma_info(struct address_space *mapping, struct list_head *head,
802 loff_t offset, bool is_register)
803{
804 struct vma_info *vi, *retvi;
805 787
806 vi = kzalloc(sizeof(struct vma_info), GFP_KERNEL); 788 info = prev;
807 if (!vi) 789 prev = prev->next;
808 return ERR_PTR(-ENOMEM); 790 info->next = curr;
791 curr = info;
809 792
810 mutex_lock(&mapping->i_mmap_mutex); 793 info->mm = vma->vm_mm;
811 retvi = __find_next_vma_info(mapping, head, vi, offset, is_register); 794 info->vaddr = vma_address(vma, offset);
795 }
812 mutex_unlock(&mapping->i_mmap_mutex); 796 mutex_unlock(&mapping->i_mmap_mutex);
813 797
814 if (!retvi) 798 if (!more)
815 kfree(vi); 799 goto out;
800
801 prev = curr;
802 while (curr) {
803 mmput(curr->mm);
804 curr = curr->next;
805 }
816 806
817 return retvi; 807 do {
808 info = kmalloc(sizeof(struct map_info), GFP_KERNEL);
809 if (!info) {
810 curr = ERR_PTR(-ENOMEM);
811 goto out;
812 }
813 info->next = prev;
814 prev = info;
815 } while (--more);
816
817 goto again;
818 out:
819 while (prev)
820 prev = free_map_info(prev);
821 return curr;
818} 822}
819 823
820static int register_for_each_vma(struct uprobe *uprobe, bool is_register) 824static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
821{ 825{
822 struct list_head try_list; 826 struct map_info *info;
823 struct vm_area_struct *vma; 827 int err = 0;
824 struct address_space *mapping;
825 struct vma_info *vi, *tmpvi;
826 struct mm_struct *mm;
827 loff_t vaddr;
828 int ret;
829 828
830 mapping = uprobe->inode->i_mapping; 829 info = build_map_info(uprobe->inode->i_mapping,
831 INIT_LIST_HEAD(&try_list); 830 uprobe->offset, is_register);
831 if (IS_ERR(info))
832 return PTR_ERR(info);
832 833
833 ret = 0; 834 while (info) {
835 struct mm_struct *mm = info->mm;
836 struct vm_area_struct *vma;
834 837
835 for (;;) { 838 if (err)
836 vi = find_next_vma_info(mapping, &try_list, uprobe->offset, is_register); 839 goto free;
837 if (!vi)
838 break;
839 840
840 if (IS_ERR(vi)) { 841 down_write(&mm->mmap_sem);
841 ret = PTR_ERR(vi); 842 vma = find_vma(mm, (unsigned long)info->vaddr);
842 break; 843 if (!vma || !valid_vma(vma, is_register))
843 } 844 goto unlock;
844 845
845 mm = vi->mm;
846 down_read(&mm->mmap_sem);
847 vma = find_vma(mm, (unsigned long)vi->vaddr);
848 if (!vma || !valid_vma(vma, is_register)) {
849 list_del(&vi->probe_list);
850 kfree(vi);
851 up_read(&mm->mmap_sem);
852 mmput(mm);
853 continue;
854 }
855 vaddr = vma_address(vma, uprobe->offset);
856 if (vma->vm_file->f_mapping->host != uprobe->inode || 846 if (vma->vm_file->f_mapping->host != uprobe->inode ||
857 vaddr != vi->vaddr) { 847 vma_address(vma, uprobe->offset) != info->vaddr)
858 list_del(&vi->probe_list); 848 goto unlock;
859 kfree(vi);
860 up_read(&mm->mmap_sem);
861 mmput(mm);
862 continue;
863 }
864
865 if (is_register)
866 ret = install_breakpoint(uprobe, mm, vma, vi->vaddr);
867 else
868 remove_breakpoint(uprobe, mm, vi->vaddr);
869 849
870 up_read(&mm->mmap_sem);
871 mmput(mm);
872 if (is_register) { 850 if (is_register) {
873 if (ret && ret == -EEXIST) 851 err = install_breakpoint(uprobe, mm, vma, info->vaddr);
874 ret = 0; 852 /*
875 if (ret) 853 * We can race against uprobe_mmap(), see the
876 break; 854 * comment near uprobe_hash().
855 */
856 if (err == -EEXIST)
857 err = 0;
858 } else {
859 remove_breakpoint(uprobe, mm, info->vaddr);
877 } 860 }
861 unlock:
862 up_write(&mm->mmap_sem);
863 free:
864 mmput(mm);
865 info = free_map_info(info);
878 } 866 }
879 867
880 list_for_each_entry_safe(vi, tmpvi, &try_list, probe_list) { 868 return err;
881 list_del(&vi->probe_list);
882 kfree(vi);
883 }
884
885 return ret;
886} 869}
887 870
888static int __uprobe_register(struct uprobe *uprobe) 871static int __uprobe_register(struct uprobe *uprobe)
@@ -1048,7 +1031,7 @@ static void build_probe_list(struct inode *inode, struct list_head *head)
1048int uprobe_mmap(struct vm_area_struct *vma) 1031int uprobe_mmap(struct vm_area_struct *vma)
1049{ 1032{
1050 struct list_head tmp_list; 1033 struct list_head tmp_list;
1051 struct uprobe *uprobe, *u; 1034 struct uprobe *uprobe;
1052 struct inode *inode; 1035 struct inode *inode;
1053 int ret, count; 1036 int ret, count;
1054 1037
@@ -1066,12 +1049,9 @@ int uprobe_mmap(struct vm_area_struct *vma)
1066 ret = 0; 1049 ret = 0;
1067 count = 0; 1050 count = 0;
1068 1051
1069 list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { 1052 list_for_each_entry(uprobe, &tmp_list, pending_list) {
1070 loff_t vaddr;
1071
1072 list_del(&uprobe->pending_list);
1073 if (!ret) { 1053 if (!ret) {
1074 vaddr = vma_address(vma, uprobe->offset); 1054 loff_t vaddr = vma_address(vma, uprobe->offset);
1075 1055
1076 if (vaddr < vma->vm_start || vaddr >= vma->vm_end) { 1056 if (vaddr < vma->vm_start || vaddr >= vma->vm_end) {
1077 put_uprobe(uprobe); 1057 put_uprobe(uprobe);
@@ -1079,8 +1059,10 @@ int uprobe_mmap(struct vm_area_struct *vma)
1079 } 1059 }
1080 1060
1081 ret = install_breakpoint(uprobe, vma->vm_mm, vma, vaddr); 1061 ret = install_breakpoint(uprobe, vma->vm_mm, vma, vaddr);
1082 1062 /*
1083 /* Ignore double add: */ 1063 * We can race against uprobe_register(), see the
1064 * comment near uprobe_hash().
1065 */
1084 if (ret == -EEXIST) { 1066 if (ret == -EEXIST) {
1085 ret = 0; 1067 ret = 0;
1086 1068
@@ -1115,7 +1097,7 @@ int uprobe_mmap(struct vm_area_struct *vma)
1115void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end) 1097void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end)
1116{ 1098{
1117 struct list_head tmp_list; 1099 struct list_head tmp_list;
1118 struct uprobe *uprobe, *u; 1100 struct uprobe *uprobe;
1119 struct inode *inode; 1101 struct inode *inode;
1120 1102
1121 if (!atomic_read(&uprobe_events) || !valid_vma(vma, false)) 1103 if (!atomic_read(&uprobe_events) || !valid_vma(vma, false))
@@ -1132,11 +1114,8 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon
1132 mutex_lock(uprobes_mmap_hash(inode)); 1114 mutex_lock(uprobes_mmap_hash(inode));
1133 build_probe_list(inode, &tmp_list); 1115 build_probe_list(inode, &tmp_list);
1134 1116
1135 list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { 1117 list_for_each_entry(uprobe, &tmp_list, pending_list) {
1136 loff_t vaddr; 1118 loff_t vaddr = vma_address(vma, uprobe->offset);
1137
1138 list_del(&uprobe->pending_list);
1139 vaddr = vma_address(vma, uprobe->offset);
1140 1119
1141 if (vaddr >= start && vaddr < end) { 1120 if (vaddr >= start && vaddr < end) {
1142 /* 1121 /*
@@ -1378,9 +1357,6 @@ void uprobe_free_utask(struct task_struct *t)
1378{ 1357{
1379 struct uprobe_task *utask = t->utask; 1358 struct uprobe_task *utask = t->utask;
1380 1359
1381 if (t->uprobe_srcu_id != -1)
1382 srcu_read_unlock_raw(&uprobes_srcu, t->uprobe_srcu_id);
1383
1384 if (!utask) 1360 if (!utask)
1385 return; 1361 return;
1386 1362
@@ -1398,7 +1374,6 @@ void uprobe_free_utask(struct task_struct *t)
1398void uprobe_copy_process(struct task_struct *t) 1374void uprobe_copy_process(struct task_struct *t)
1399{ 1375{
1400 t->utask = NULL; 1376 t->utask = NULL;
1401 t->uprobe_srcu_id = -1;
1402} 1377}
1403 1378
1404/* 1379/*
@@ -1417,7 +1392,6 @@ static struct uprobe_task *add_utask(void)
1417 if (unlikely(!utask)) 1392 if (unlikely(!utask))
1418 return NULL; 1393 return NULL;
1419 1394
1420 utask->active_uprobe = NULL;
1421 current->utask = utask; 1395 current->utask = utask;
1422 return utask; 1396 return utask;
1423} 1397}
@@ -1479,41 +1453,64 @@ static bool can_skip_sstep(struct uprobe *uprobe, struct pt_regs *regs)
1479 return false; 1453 return false;
1480} 1454}
1481 1455
1456static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
1457{
1458 struct mm_struct *mm = current->mm;
1459 struct uprobe *uprobe = NULL;
1460 struct vm_area_struct *vma;
1461
1462 down_read(&mm->mmap_sem);
1463 vma = find_vma(mm, bp_vaddr);
1464 if (vma && vma->vm_start <= bp_vaddr) {
1465 if (valid_vma(vma, false)) {
1466 struct inode *inode;
1467 loff_t offset;
1468
1469 inode = vma->vm_file->f_mapping->host;
1470 offset = bp_vaddr - vma->vm_start;
1471 offset += (vma->vm_pgoff << PAGE_SHIFT);
1472 uprobe = find_uprobe(inode, offset);
1473 }
1474
1475 if (!uprobe)
1476 *is_swbp = is_swbp_at_addr(mm, bp_vaddr);
1477 } else {
1478 *is_swbp = -EFAULT;
1479 }
1480 up_read(&mm->mmap_sem);
1481
1482 return uprobe;
1483}
1484
1482/* 1485/*
1483 * Run handler and ask thread to singlestep. 1486 * Run handler and ask thread to singlestep.
1484 * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. 1487 * Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
1485 */ 1488 */
1486static void handle_swbp(struct pt_regs *regs) 1489static void handle_swbp(struct pt_regs *regs)
1487{ 1490{
1488 struct vm_area_struct *vma;
1489 struct uprobe_task *utask; 1491 struct uprobe_task *utask;
1490 struct uprobe *uprobe; 1492 struct uprobe *uprobe;
1491 struct mm_struct *mm;
1492 unsigned long bp_vaddr; 1493 unsigned long bp_vaddr;
1494 int uninitialized_var(is_swbp);
1493 1495
1494 uprobe = NULL;
1495 bp_vaddr = uprobe_get_swbp_addr(regs); 1496 bp_vaddr = uprobe_get_swbp_addr(regs);
1496 mm = current->mm; 1497 uprobe = find_active_uprobe(bp_vaddr, &is_swbp);
1497 down_read(&mm->mmap_sem);
1498 vma = find_vma(mm, bp_vaddr);
1499
1500 if (vma && vma->vm_start <= bp_vaddr && valid_vma(vma, false)) {
1501 struct inode *inode;
1502 loff_t offset;
1503
1504 inode = vma->vm_file->f_mapping->host;
1505 offset = bp_vaddr - vma->vm_start;
1506 offset += (vma->vm_pgoff << PAGE_SHIFT);
1507 uprobe = find_uprobe(inode, offset);
1508 }
1509
1510 srcu_read_unlock_raw(&uprobes_srcu, current->uprobe_srcu_id);
1511 current->uprobe_srcu_id = -1;
1512 up_read(&mm->mmap_sem);
1513 1498
1514 if (!uprobe) { 1499 if (!uprobe) {
1515 /* No matching uprobe; signal SIGTRAP. */ 1500 if (is_swbp > 0) {
1516 send_sig(SIGTRAP, current, 0); 1501 /* No matching uprobe; signal SIGTRAP. */
1502 send_sig(SIGTRAP, current, 0);
1503 } else {
1504 /*
1505 * Either we raced with uprobe_unregister() or we can't
1506 * access this memory. The latter is only possible if
1507 * another thread plays with our ->mm. In both cases
1508 * we can simply restart. If this vma was unmapped we
1509 * can pretend this insn was not executed yet and get
1510 * the (correct) SIGSEGV after restart.
1511 */
1512 instruction_pointer_set(regs, bp_vaddr);
1513 }
1517 return; 1514 return;
1518 } 1515 }
1519 1516
@@ -1620,7 +1617,6 @@ int uprobe_pre_sstep_notifier(struct pt_regs *regs)
1620 utask->state = UTASK_BP_HIT; 1617 utask->state = UTASK_BP_HIT;
1621 1618
1622 set_thread_flag(TIF_UPROBE); 1619 set_thread_flag(TIF_UPROBE);
1623 current->uprobe_srcu_id = srcu_read_lock_raw(&uprobes_srcu);
1624 1620
1625 return 1; 1621 return 1;
1626} 1622}
@@ -1655,7 +1651,6 @@ static int __init init_uprobes(void)
1655 mutex_init(&uprobes_mutex[i]); 1651 mutex_init(&uprobes_mutex[i]);
1656 mutex_init(&uprobes_mmap_mutex[i]); 1652 mutex_init(&uprobes_mmap_mutex[i]);
1657 } 1653 }
1658 init_srcu_struct(&uprobes_srcu);
1659 1654
1660 return register_die_notifier(&uprobe_exception_nb); 1655 return register_die_notifier(&uprobe_exception_nb);
1661} 1656}
diff --git a/kernel/fork.c b/kernel/fork.c
index ab5211b9e622..f00e319d8376 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -304,12 +304,17 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
304 } 304 }
305 305
306 err = arch_dup_task_struct(tsk, orig); 306 err = arch_dup_task_struct(tsk, orig);
307 if (err)
308 goto out;
309 307
308 /*
309 * We defer looking at err, because we will need this setup
310 * for the clean up path to work correctly.
311 */
310 tsk->stack = ti; 312 tsk->stack = ti;
311
312 setup_thread_stack(tsk, orig); 313 setup_thread_stack(tsk, orig);
314
315 if (err)
316 goto out;
317
313 clear_user_return_notifier(tsk); 318 clear_user_return_notifier(tsk);
314 clear_tsk_need_resched(tsk); 319 clear_tsk_need_resched(tsk);
315 stackend = end_of_stack(tsk); 320 stackend = end_of_stack(tsk);
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index ae34bf51682b..6db7a5ed52b5 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -657,6 +657,14 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
657 return 0; 657 return 0;
658} 658}
659 659
660static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
661{
662 ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset;
663 ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
664
665 return ktime_get_update_offsets(offs_real, offs_boot);
666}
667
660/* 668/*
661 * Retrigger next event is called after clock was set 669 * Retrigger next event is called after clock was set
662 * 670 *
@@ -665,22 +673,12 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
665static void retrigger_next_event(void *arg) 673static void retrigger_next_event(void *arg)
666{ 674{
667 struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); 675 struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases);
668 struct timespec realtime_offset, xtim, wtm, sleep;
669 676
670 if (!hrtimer_hres_active()) 677 if (!hrtimer_hres_active())
671 return; 678 return;
672 679
673 /* Optimized out for !HIGH_RES */
674 get_xtime_and_monotonic_and_sleep_offset(&xtim, &wtm, &sleep);
675 set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec);
676
677 /* Adjust CLOCK_REALTIME offset */
678 raw_spin_lock(&base->lock); 680 raw_spin_lock(&base->lock);
679 base->clock_base[HRTIMER_BASE_REALTIME].offset = 681 hrtimer_update_base(base);
680 timespec_to_ktime(realtime_offset);
681 base->clock_base[HRTIMER_BASE_BOOTTIME].offset =
682 timespec_to_ktime(sleep);
683
684 hrtimer_force_reprogram(base, 0); 682 hrtimer_force_reprogram(base, 0);
685 raw_spin_unlock(&base->lock); 683 raw_spin_unlock(&base->lock);
686} 684}
@@ -710,13 +708,25 @@ static int hrtimer_switch_to_hres(void)
710 base->clock_base[i].resolution = KTIME_HIGH_RES; 708 base->clock_base[i].resolution = KTIME_HIGH_RES;
711 709
712 tick_setup_sched_timer(); 710 tick_setup_sched_timer();
713
714 /* "Retrigger" the interrupt to get things going */ 711 /* "Retrigger" the interrupt to get things going */
715 retrigger_next_event(NULL); 712 retrigger_next_event(NULL);
716 local_irq_restore(flags); 713 local_irq_restore(flags);
717 return 1; 714 return 1;
718} 715}
719 716
717/*
718 * Called from timekeeping code to reprogramm the hrtimer interrupt
719 * device. If called from the timer interrupt context we defer it to
720 * softirq context.
721 */
722void clock_was_set_delayed(void)
723{
724 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
725
726 cpu_base->clock_was_set = 1;
727 __raise_softirq_irqoff(HRTIMER_SOFTIRQ);
728}
729
720#else 730#else
721 731
722static inline int hrtimer_hres_active(void) { return 0; } 732static inline int hrtimer_hres_active(void) { return 0; }
@@ -1250,11 +1260,10 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1250 cpu_base->nr_events++; 1260 cpu_base->nr_events++;
1251 dev->next_event.tv64 = KTIME_MAX; 1261 dev->next_event.tv64 = KTIME_MAX;
1252 1262
1253 entry_time = now = ktime_get(); 1263 raw_spin_lock(&cpu_base->lock);
1264 entry_time = now = hrtimer_update_base(cpu_base);
1254retry: 1265retry:
1255 expires_next.tv64 = KTIME_MAX; 1266 expires_next.tv64 = KTIME_MAX;
1256
1257 raw_spin_lock(&cpu_base->lock);
1258 /* 1267 /*
1259 * We set expires_next to KTIME_MAX here with cpu_base->lock 1268 * We set expires_next to KTIME_MAX here with cpu_base->lock
1260 * held to prevent that a timer is enqueued in our queue via 1269 * held to prevent that a timer is enqueued in our queue via
@@ -1330,8 +1339,12 @@ retry:
1330 * We need to prevent that we loop forever in the hrtimer 1339 * We need to prevent that we loop forever in the hrtimer
1331 * interrupt routine. We give it 3 attempts to avoid 1340 * interrupt routine. We give it 3 attempts to avoid
1332 * overreacting on some spurious event. 1341 * overreacting on some spurious event.
1342 *
1343 * Acquire base lock for updating the offsets and retrieving
1344 * the current time.
1333 */ 1345 */
1334 now = ktime_get(); 1346 raw_spin_lock(&cpu_base->lock);
1347 now = hrtimer_update_base(cpu_base);
1335 cpu_base->nr_retries++; 1348 cpu_base->nr_retries++;
1336 if (++retries < 3) 1349 if (++retries < 3)
1337 goto retry; 1350 goto retry;
@@ -1343,6 +1356,7 @@ retry:
1343 */ 1356 */
1344 cpu_base->nr_hangs++; 1357 cpu_base->nr_hangs++;
1345 cpu_base->hang_detected = 1; 1358 cpu_base->hang_detected = 1;
1359 raw_spin_unlock(&cpu_base->lock);
1346 delta = ktime_sub(now, entry_time); 1360 delta = ktime_sub(now, entry_time);
1347 if (delta.tv64 > cpu_base->max_hang_time.tv64) 1361 if (delta.tv64 > cpu_base->max_hang_time.tv64)
1348 cpu_base->max_hang_time = delta; 1362 cpu_base->max_hang_time = delta;
@@ -1395,6 +1409,13 @@ void hrtimer_peek_ahead_timers(void)
1395 1409
1396static void run_hrtimer_softirq(struct softirq_action *h) 1410static void run_hrtimer_softirq(struct softirq_action *h)
1397{ 1411{
1412 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1413
1414 if (cpu_base->clock_was_set) {
1415 cpu_base->clock_was_set = 0;
1416 clock_was_set();
1417 }
1418
1398 hrtimer_peek_ahead_timers(); 1419 hrtimer_peek_ahead_timers();
1399} 1420}
1400 1421
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 8b53db38a279..238025f5472e 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -27,7 +27,6 @@
27#include <linux/syscore_ops.h> 27#include <linux/syscore_ops.h>
28#include <linux/ctype.h> 28#include <linux/ctype.h>
29#include <linux/genhd.h> 29#include <linux/genhd.h>
30#include <scsi/scsi_scan.h>
31 30
32#include "power.h" 31#include "power.h"
33 32
@@ -748,13 +747,6 @@ static int software_resume(void)
748 async_synchronize_full(); 747 async_synchronize_full();
749 } 748 }
750 749
751 /*
752 * We can't depend on SCSI devices being available after loading
753 * one of their modules until scsi_complete_async_scans() is
754 * called and the resume device usually is a SCSI one.
755 */
756 scsi_complete_async_scans();
757
758 swsusp_resume_device = name_to_dev_t(resume_file); 750 swsusp_resume_device = name_to_dev_t(resume_file);
759 if (!swsusp_resume_device) { 751 if (!swsusp_resume_device) {
760 error = -ENODEV; 752 error = -ENODEV;
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 91b0fd021a95..4ed81e74f86f 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -24,7 +24,6 @@
24#include <linux/console.h> 24#include <linux/console.h>
25#include <linux/cpu.h> 25#include <linux/cpu.h>
26#include <linux/freezer.h> 26#include <linux/freezer.h>
27#include <scsi/scsi_scan.h>
28 27
29#include <asm/uaccess.h> 28#include <asm/uaccess.h>
30 29
@@ -84,7 +83,6 @@ static int snapshot_open(struct inode *inode, struct file *filp)
84 * appear. 83 * appear.
85 */ 84 */
86 wait_for_device_probe(); 85 wait_for_device_probe();
87 scsi_complete_async_scans();
88 86
89 data->swap = -1; 87 data->swap = -1;
90 data->mode = O_WRONLY; 88 data->mode = O_WRONLY;
diff --git a/kernel/printk.c b/kernel/printk.c
index dba18211685e..ac4bc9e79465 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -194,8 +194,10 @@ static int console_may_schedule;
194 */ 194 */
195 195
196enum log_flags { 196enum log_flags {
197 LOG_DEFAULT = 0, 197 LOG_NOCONS = 1, /* already flushed, do not print to console */
198 LOG_NOCONS = 1, /* already flushed, do not print to console */ 198 LOG_NEWLINE = 2, /* text ended with a newline */
199 LOG_PREFIX = 4, /* text started with a prefix */
200 LOG_CONT = 8, /* text is a fragment of a continuation line */
199}; 201};
200 202
201struct log { 203struct log {
@@ -217,6 +219,8 @@ static DEFINE_RAW_SPINLOCK(logbuf_lock);
217/* the next printk record to read by syslog(READ) or /proc/kmsg */ 219/* the next printk record to read by syslog(READ) or /proc/kmsg */
218static u64 syslog_seq; 220static u64 syslog_seq;
219static u32 syslog_idx; 221static u32 syslog_idx;
222static enum log_flags syslog_prev;
223static size_t syslog_partial;
220 224
221/* index and sequence number of the first record stored in the buffer */ 225/* index and sequence number of the first record stored in the buffer */
222static u64 log_first_seq; 226static u64 log_first_seq;
@@ -430,20 +434,20 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
430 ret = mutex_lock_interruptible(&user->lock); 434 ret = mutex_lock_interruptible(&user->lock);
431 if (ret) 435 if (ret)
432 return ret; 436 return ret;
433 raw_spin_lock(&logbuf_lock); 437 raw_spin_lock_irq(&logbuf_lock);
434 while (user->seq == log_next_seq) { 438 while (user->seq == log_next_seq) {
435 if (file->f_flags & O_NONBLOCK) { 439 if (file->f_flags & O_NONBLOCK) {
436 ret = -EAGAIN; 440 ret = -EAGAIN;
437 raw_spin_unlock(&logbuf_lock); 441 raw_spin_unlock_irq(&logbuf_lock);
438 goto out; 442 goto out;
439 } 443 }
440 444
441 raw_spin_unlock(&logbuf_lock); 445 raw_spin_unlock_irq(&logbuf_lock);
442 ret = wait_event_interruptible(log_wait, 446 ret = wait_event_interruptible(log_wait,
443 user->seq != log_next_seq); 447 user->seq != log_next_seq);
444 if (ret) 448 if (ret)
445 goto out; 449 goto out;
446 raw_spin_lock(&logbuf_lock); 450 raw_spin_lock_irq(&logbuf_lock);
447 } 451 }
448 452
449 if (user->seq < log_first_seq) { 453 if (user->seq < log_first_seq) {
@@ -451,7 +455,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
451 user->idx = log_first_idx; 455 user->idx = log_first_idx;
452 user->seq = log_first_seq; 456 user->seq = log_first_seq;
453 ret = -EPIPE; 457 ret = -EPIPE;
454 raw_spin_unlock(&logbuf_lock); 458 raw_spin_unlock_irq(&logbuf_lock);
455 goto out; 459 goto out;
456 } 460 }
457 461
@@ -465,7 +469,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
465 for (i = 0; i < msg->text_len; i++) { 469 for (i = 0; i < msg->text_len; i++) {
466 unsigned char c = log_text(msg)[i]; 470 unsigned char c = log_text(msg)[i];
467 471
468 if (c < ' ' || c >= 128) 472 if (c < ' ' || c >= 127 || c == '\\')
469 len += sprintf(user->buf + len, "\\x%02x", c); 473 len += sprintf(user->buf + len, "\\x%02x", c);
470 else 474 else
471 user->buf[len++] = c; 475 user->buf[len++] = c;
@@ -489,7 +493,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
489 continue; 493 continue;
490 } 494 }
491 495
492 if (c < ' ' || c >= 128) { 496 if (c < ' ' || c >= 127 || c == '\\') {
493 len += sprintf(user->buf + len, "\\x%02x", c); 497 len += sprintf(user->buf + len, "\\x%02x", c);
494 continue; 498 continue;
495 } 499 }
@@ -501,7 +505,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
501 505
502 user->idx = log_next(user->idx); 506 user->idx = log_next(user->idx);
503 user->seq++; 507 user->seq++;
504 raw_spin_unlock(&logbuf_lock); 508 raw_spin_unlock_irq(&logbuf_lock);
505 509
506 if (len > count) { 510 if (len > count) {
507 ret = -EINVAL; 511 ret = -EINVAL;
@@ -528,7 +532,7 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
528 if (offset) 532 if (offset)
529 return -ESPIPE; 533 return -ESPIPE;
530 534
531 raw_spin_lock(&logbuf_lock); 535 raw_spin_lock_irq(&logbuf_lock);
532 switch (whence) { 536 switch (whence) {
533 case SEEK_SET: 537 case SEEK_SET:
534 /* the first record */ 538 /* the first record */
@@ -552,7 +556,7 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
552 default: 556 default:
553 ret = -EINVAL; 557 ret = -EINVAL;
554 } 558 }
555 raw_spin_unlock(&logbuf_lock); 559 raw_spin_unlock_irq(&logbuf_lock);
556 return ret; 560 return ret;
557} 561}
558 562
@@ -566,14 +570,14 @@ static unsigned int devkmsg_poll(struct file *file, poll_table *wait)
566 570
567 poll_wait(file, &log_wait, wait); 571 poll_wait(file, &log_wait, wait);
568 572
569 raw_spin_lock(&logbuf_lock); 573 raw_spin_lock_irq(&logbuf_lock);
570 if (user->seq < log_next_seq) { 574 if (user->seq < log_next_seq) {
571 /* return error when data has vanished underneath us */ 575 /* return error when data has vanished underneath us */
572 if (user->seq < log_first_seq) 576 if (user->seq < log_first_seq)
573 ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI; 577 ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI;
574 ret = POLLIN|POLLRDNORM; 578 ret = POLLIN|POLLRDNORM;
575 } 579 }
576 raw_spin_unlock(&logbuf_lock); 580 raw_spin_unlock_irq(&logbuf_lock);
577 581
578 return ret; 582 return ret;
579} 583}
@@ -597,10 +601,10 @@ static int devkmsg_open(struct inode *inode, struct file *file)
597 601
598 mutex_init(&user->lock); 602 mutex_init(&user->lock);
599 603
600 raw_spin_lock(&logbuf_lock); 604 raw_spin_lock_irq(&logbuf_lock);
601 user->idx = log_first_idx; 605 user->idx = log_first_idx;
602 user->seq = log_first_seq; 606 user->seq = log_first_seq;
603 raw_spin_unlock(&logbuf_lock); 607 raw_spin_unlock_irq(&logbuf_lock);
604 608
605 file->private_data = user; 609 file->private_data = user;
606 return 0; 610 return 0;
@@ -818,15 +822,18 @@ static size_t print_time(u64 ts, char *buf)
818static size_t print_prefix(const struct log *msg, bool syslog, char *buf) 822static size_t print_prefix(const struct log *msg, bool syslog, char *buf)
819{ 823{
820 size_t len = 0; 824 size_t len = 0;
825 unsigned int prefix = (msg->facility << 3) | msg->level;
821 826
822 if (syslog) { 827 if (syslog) {
823 if (buf) { 828 if (buf) {
824 len += sprintf(buf, "<%u>", msg->level); 829 len += sprintf(buf, "<%u>", prefix);
825 } else { 830 } else {
826 len += 3; 831 len += 3;
827 if (msg->level > 9) 832 if (prefix > 999)
828 len++; 833 len += 3;
829 if (msg->level > 99) 834 else if (prefix > 99)
835 len += 2;
836 else if (prefix > 9)
830 len++; 837 len++;
831 } 838 }
832 } 839 }
@@ -835,13 +842,26 @@ static size_t print_prefix(const struct log *msg, bool syslog, char *buf)
835 return len; 842 return len;
836} 843}
837 844
838static size_t msg_print_text(const struct log *msg, bool syslog, 845static size_t msg_print_text(const struct log *msg, enum log_flags prev,
839 char *buf, size_t size) 846 bool syslog, char *buf, size_t size)
840{ 847{
841 const char *text = log_text(msg); 848 const char *text = log_text(msg);
842 size_t text_size = msg->text_len; 849 size_t text_size = msg->text_len;
850 bool prefix = true;
851 bool newline = true;
843 size_t len = 0; 852 size_t len = 0;
844 853
854 if ((prev & LOG_CONT) && !(msg->flags & LOG_PREFIX))
855 prefix = false;
856
857 if (msg->flags & LOG_CONT) {
858 if ((prev & LOG_CONT) && !(prev & LOG_NEWLINE))
859 prefix = false;
860
861 if (!(msg->flags & LOG_NEWLINE))
862 newline = false;
863 }
864
845 do { 865 do {
846 const char *next = memchr(text, '\n', text_size); 866 const char *next = memchr(text, '\n', text_size);
847 size_t text_len; 867 size_t text_len;
@@ -859,16 +879,22 @@ static size_t msg_print_text(const struct log *msg, bool syslog,
859 text_len + 1>= size - len) 879 text_len + 1>= size - len)
860 break; 880 break;
861 881
862 len += print_prefix(msg, syslog, buf + len); 882 if (prefix)
883 len += print_prefix(msg, syslog, buf + len);
863 memcpy(buf + len, text, text_len); 884 memcpy(buf + len, text, text_len);
864 len += text_len; 885 len += text_len;
865 buf[len++] = '\n'; 886 if (next || newline)
887 buf[len++] = '\n';
866 } else { 888 } else {
867 /* SYSLOG_ACTION_* buffer size only calculation */ 889 /* SYSLOG_ACTION_* buffer size only calculation */
868 len += print_prefix(msg, syslog, NULL); 890 if (prefix)
869 len += text_len + 1; 891 len += print_prefix(msg, syslog, NULL);
892 len += text_len;
893 if (next || newline)
894 len++;
870 } 895 }
871 896
897 prefix = true;
872 text = next; 898 text = next;
873 } while (text); 899 } while (text);
874 900
@@ -887,22 +913,35 @@ static int syslog_print(char __user *buf, int size)
887 913
888 while (size > 0) { 914 while (size > 0) {
889 size_t n; 915 size_t n;
916 size_t skip;
890 917
891 raw_spin_lock_irq(&logbuf_lock); 918 raw_spin_lock_irq(&logbuf_lock);
892 if (syslog_seq < log_first_seq) { 919 if (syslog_seq < log_first_seq) {
893 /* messages are gone, move to first one */ 920 /* messages are gone, move to first one */
894 syslog_seq = log_first_seq; 921 syslog_seq = log_first_seq;
895 syslog_idx = log_first_idx; 922 syslog_idx = log_first_idx;
923 syslog_prev = 0;
924 syslog_partial = 0;
896 } 925 }
897 if (syslog_seq == log_next_seq) { 926 if (syslog_seq == log_next_seq) {
898 raw_spin_unlock_irq(&logbuf_lock); 927 raw_spin_unlock_irq(&logbuf_lock);
899 break; 928 break;
900 } 929 }
930
931 skip = syslog_partial;
901 msg = log_from_idx(syslog_idx); 932 msg = log_from_idx(syslog_idx);
902 n = msg_print_text(msg, true, text, LOG_LINE_MAX); 933 n = msg_print_text(msg, syslog_prev, true, text, LOG_LINE_MAX);
903 if (n <= size) { 934 if (n - syslog_partial <= size) {
935 /* message fits into buffer, move forward */
904 syslog_idx = log_next(syslog_idx); 936 syslog_idx = log_next(syslog_idx);
905 syslog_seq++; 937 syslog_seq++;
938 syslog_prev = msg->flags;
939 n -= syslog_partial;
940 syslog_partial = 0;
941 } else if (!len){
942 /* partial read(), remember position */
943 n = size;
944 syslog_partial += n;
906 } else 945 } else
907 n = 0; 946 n = 0;
908 raw_spin_unlock_irq(&logbuf_lock); 947 raw_spin_unlock_irq(&logbuf_lock);
@@ -910,17 +949,15 @@ static int syslog_print(char __user *buf, int size)
910 if (!n) 949 if (!n)
911 break; 950 break;
912 951
913 len += n; 952 if (copy_to_user(buf, text + skip, n)) {
914 size -= n;
915 buf += n;
916 n = copy_to_user(buf - n, text, n);
917
918 if (n) {
919 len -= n;
920 if (!len) 953 if (!len)
921 len = -EFAULT; 954 len = -EFAULT;
922 break; 955 break;
923 } 956 }
957
958 len += n;
959 size -= n;
960 buf += n;
924 } 961 }
925 962
926 kfree(text); 963 kfree(text);
@@ -941,6 +978,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
941 u64 next_seq; 978 u64 next_seq;
942 u64 seq; 979 u64 seq;
943 u32 idx; 980 u32 idx;
981 enum log_flags prev;
944 982
945 if (clear_seq < log_first_seq) { 983 if (clear_seq < log_first_seq) {
946 /* messages are gone, move to first available one */ 984 /* messages are gone, move to first available one */
@@ -954,10 +992,11 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
954 */ 992 */
955 seq = clear_seq; 993 seq = clear_seq;
956 idx = clear_idx; 994 idx = clear_idx;
995 prev = 0;
957 while (seq < log_next_seq) { 996 while (seq < log_next_seq) {
958 struct log *msg = log_from_idx(idx); 997 struct log *msg = log_from_idx(idx);
959 998
960 len += msg_print_text(msg, true, NULL, 0); 999 len += msg_print_text(msg, prev, true, NULL, 0);
961 idx = log_next(idx); 1000 idx = log_next(idx);
962 seq++; 1001 seq++;
963 } 1002 }
@@ -965,10 +1004,11 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
965 /* move first record forward until length fits into the buffer */ 1004 /* move first record forward until length fits into the buffer */
966 seq = clear_seq; 1005 seq = clear_seq;
967 idx = clear_idx; 1006 idx = clear_idx;
1007 prev = 0;
968 while (len > size && seq < log_next_seq) { 1008 while (len > size && seq < log_next_seq) {
969 struct log *msg = log_from_idx(idx); 1009 struct log *msg = log_from_idx(idx);
970 1010
971 len -= msg_print_text(msg, true, NULL, 0); 1011 len -= msg_print_text(msg, prev, true, NULL, 0);
972 idx = log_next(idx); 1012 idx = log_next(idx);
973 seq++; 1013 seq++;
974 } 1014 }
@@ -977,17 +1017,19 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
977 next_seq = log_next_seq; 1017 next_seq = log_next_seq;
978 1018
979 len = 0; 1019 len = 0;
1020 prev = 0;
980 while (len >= 0 && seq < next_seq) { 1021 while (len >= 0 && seq < next_seq) {
981 struct log *msg = log_from_idx(idx); 1022 struct log *msg = log_from_idx(idx);
982 int textlen; 1023 int textlen;
983 1024
984 textlen = msg_print_text(msg, true, text, LOG_LINE_MAX); 1025 textlen = msg_print_text(msg, prev, true, text, LOG_LINE_MAX);
985 if (textlen < 0) { 1026 if (textlen < 0) {
986 len = textlen; 1027 len = textlen;
987 break; 1028 break;
988 } 1029 }
989 idx = log_next(idx); 1030 idx = log_next(idx);
990 seq++; 1031 seq++;
1032 prev = msg->flags;
991 1033
992 raw_spin_unlock_irq(&logbuf_lock); 1034 raw_spin_unlock_irq(&logbuf_lock);
993 if (copy_to_user(buf + len, text, textlen)) 1035 if (copy_to_user(buf + len, text, textlen))
@@ -1000,6 +1042,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
1000 /* messages are gone, move to next one */ 1042 /* messages are gone, move to next one */
1001 seq = log_first_seq; 1043 seq = log_first_seq;
1002 idx = log_first_idx; 1044 idx = log_first_idx;
1045 prev = 0;
1003 } 1046 }
1004 } 1047 }
1005 } 1048 }
@@ -1018,7 +1061,6 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
1018{ 1061{
1019 bool clear = false; 1062 bool clear = false;
1020 static int saved_console_loglevel = -1; 1063 static int saved_console_loglevel = -1;
1021 static DEFINE_MUTEX(syslog_mutex);
1022 int error; 1064 int error;
1023 1065
1024 error = check_syslog_permissions(type, from_file); 1066 error = check_syslog_permissions(type, from_file);
@@ -1045,17 +1087,11 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
1045 error = -EFAULT; 1087 error = -EFAULT;
1046 goto out; 1088 goto out;
1047 } 1089 }
1048 error = mutex_lock_interruptible(&syslog_mutex);
1049 if (error)
1050 goto out;
1051 error = wait_event_interruptible(log_wait, 1090 error = wait_event_interruptible(log_wait,
1052 syslog_seq != log_next_seq); 1091 syslog_seq != log_next_seq);
1053 if (error) { 1092 if (error)
1054 mutex_unlock(&syslog_mutex);
1055 goto out; 1093 goto out;
1056 }
1057 error = syslog_print(buf, len); 1094 error = syslog_print(buf, len);
1058 mutex_unlock(&syslog_mutex);
1059 break; 1095 break;
1060 /* Read/clear last kernel messages */ 1096 /* Read/clear last kernel messages */
1061 case SYSLOG_ACTION_READ_CLEAR: 1097 case SYSLOG_ACTION_READ_CLEAR:
@@ -1111,6 +1147,8 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
1111 /* messages are gone, move to first one */ 1147 /* messages are gone, move to first one */
1112 syslog_seq = log_first_seq; 1148 syslog_seq = log_first_seq;
1113 syslog_idx = log_first_idx; 1149 syslog_idx = log_first_idx;
1150 syslog_prev = 0;
1151 syslog_partial = 0;
1114 } 1152 }
1115 if (from_file) { 1153 if (from_file) {
1116 /* 1154 /*
@@ -1120,19 +1158,20 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
1120 */ 1158 */
1121 error = log_next_idx - syslog_idx; 1159 error = log_next_idx - syslog_idx;
1122 } else { 1160 } else {
1123 u64 seq; 1161 u64 seq = syslog_seq;
1124 u32 idx; 1162 u32 idx = syslog_idx;
1163 enum log_flags prev = syslog_prev;
1125 1164
1126 error = 0; 1165 error = 0;
1127 seq = syslog_seq;
1128 idx = syslog_idx;
1129 while (seq < log_next_seq) { 1166 while (seq < log_next_seq) {
1130 struct log *msg = log_from_idx(idx); 1167 struct log *msg = log_from_idx(idx);
1131 1168
1132 error += msg_print_text(msg, true, NULL, 0); 1169 error += msg_print_text(msg, prev, true, NULL, 0);
1133 idx = log_next(idx); 1170 idx = log_next(idx);
1134 seq++; 1171 seq++;
1172 prev = msg->flags;
1135 } 1173 }
1174 error -= syslog_partial;
1136 } 1175 }
1137 raw_spin_unlock_irq(&logbuf_lock); 1176 raw_spin_unlock_irq(&logbuf_lock);
1138 break; 1177 break;
@@ -1153,21 +1192,6 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
1153 return do_syslog(type, buf, len, SYSLOG_FROM_CALL); 1192 return do_syslog(type, buf, len, SYSLOG_FROM_CALL);
1154} 1193}
1155 1194
1156#ifdef CONFIG_KGDB_KDB
1157/* kdb dmesg command needs access to the syslog buffer. do_syslog()
1158 * uses locks so it cannot be used during debugging. Just tell kdb
1159 * where the start and end of the physical and logical logs are. This
1160 * is equivalent to do_syslog(3).
1161 */
1162void kdb_syslog_data(char *syslog_data[4])
1163{
1164 syslog_data[0] = log_buf;
1165 syslog_data[1] = log_buf + log_buf_len;
1166 syslog_data[2] = log_buf + log_first_idx;
1167 syslog_data[3] = log_buf + log_next_idx;
1168}
1169#endif /* CONFIG_KGDB_KDB */
1170
1171static bool __read_mostly ignore_loglevel; 1195static bool __read_mostly ignore_loglevel;
1172 1196
1173static int __init ignore_loglevel_setup(char *str) 1197static int __init ignore_loglevel_setup(char *str)
@@ -1400,10 +1424,9 @@ asmlinkage int vprintk_emit(int facility, int level,
1400 static char textbuf[LOG_LINE_MAX]; 1424 static char textbuf[LOG_LINE_MAX];
1401 char *text = textbuf; 1425 char *text = textbuf;
1402 size_t text_len; 1426 size_t text_len;
1427 enum log_flags lflags = 0;
1403 unsigned long flags; 1428 unsigned long flags;
1404 int this_cpu; 1429 int this_cpu;
1405 bool newline = false;
1406 bool prefix = false;
1407 int printed_len = 0; 1430 int printed_len = 0;
1408 1431
1409 boot_delay_msec(); 1432 boot_delay_msec();
@@ -1442,7 +1465,7 @@ asmlinkage int vprintk_emit(int facility, int level,
1442 recursion_bug = 0; 1465 recursion_bug = 0;
1443 printed_len += strlen(recursion_msg); 1466 printed_len += strlen(recursion_msg);
1444 /* emit KERN_CRIT message */ 1467 /* emit KERN_CRIT message */
1445 log_store(0, 2, LOG_DEFAULT, 0, 1468 log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0,
1446 NULL, 0, recursion_msg, printed_len); 1469 NULL, 0, recursion_msg, printed_len);
1447 } 1470 }
1448 1471
@@ -1455,7 +1478,7 @@ asmlinkage int vprintk_emit(int facility, int level,
1455 /* mark and strip a trailing newline */ 1478 /* mark and strip a trailing newline */
1456 if (text_len && text[text_len-1] == '\n') { 1479 if (text_len && text[text_len-1] == '\n') {
1457 text_len--; 1480 text_len--;
1458 newline = true; 1481 lflags |= LOG_NEWLINE;
1459 } 1482 }
1460 1483
1461 /* strip syslog prefix and extract log level or control flags */ 1484 /* strip syslog prefix and extract log level or control flags */
@@ -1465,7 +1488,7 @@ asmlinkage int vprintk_emit(int facility, int level,
1465 if (level == -1) 1488 if (level == -1)
1466 level = text[1] - '0'; 1489 level = text[1] - '0';
1467 case 'd': /* KERN_DEFAULT */ 1490 case 'd': /* KERN_DEFAULT */
1468 prefix = true; 1491 lflags |= LOG_PREFIX;
1469 case 'c': /* KERN_CONT */ 1492 case 'c': /* KERN_CONT */
1470 text += 3; 1493 text += 3;
1471 text_len -= 3; 1494 text_len -= 3;
@@ -1475,22 +1498,20 @@ asmlinkage int vprintk_emit(int facility, int level,
1475 if (level == -1) 1498 if (level == -1)
1476 level = default_message_loglevel; 1499 level = default_message_loglevel;
1477 1500
1478 if (dict) { 1501 if (dict)
1479 prefix = true; 1502 lflags |= LOG_PREFIX|LOG_NEWLINE;
1480 newline = true;
1481 }
1482 1503
1483 if (!newline) { 1504 if (!(lflags & LOG_NEWLINE)) {
1484 /* 1505 /*
1485 * Flush the conflicting buffer. An earlier newline was missing, 1506 * Flush the conflicting buffer. An earlier newline was missing,
1486 * or another task also prints continuation lines. 1507 * or another task also prints continuation lines.
1487 */ 1508 */
1488 if (cont.len && (prefix || cont.owner != current)) 1509 if (cont.len && (lflags & LOG_PREFIX || cont.owner != current))
1489 cont_flush(); 1510 cont_flush();
1490 1511
1491 /* buffer line if possible, otherwise store it right away */ 1512 /* buffer line if possible, otherwise store it right away */
1492 if (!cont_add(facility, level, text, text_len)) 1513 if (!cont_add(facility, level, text, text_len))
1493 log_store(facility, level, LOG_DEFAULT, 0, 1514 log_store(facility, level, lflags | LOG_CONT, 0,
1494 dict, dictlen, text, text_len); 1515 dict, dictlen, text, text_len);
1495 } else { 1516 } else {
1496 bool stored = false; 1517 bool stored = false;
@@ -1502,13 +1523,13 @@ asmlinkage int vprintk_emit(int facility, int level,
1502 * flush it out and store this line separately. 1523 * flush it out and store this line separately.
1503 */ 1524 */
1504 if (cont.len && cont.owner == current) { 1525 if (cont.len && cont.owner == current) {
1505 if (!prefix) 1526 if (!(lflags & LOG_PREFIX))
1506 stored = cont_add(facility, level, text, text_len); 1527 stored = cont_add(facility, level, text, text_len);
1507 cont_flush(); 1528 cont_flush();
1508 } 1529 }
1509 1530
1510 if (!stored) 1531 if (!stored)
1511 log_store(facility, level, LOG_DEFAULT, 0, 1532 log_store(facility, level, lflags, 0,
1512 dict, dictlen, text, text_len); 1533 dict, dictlen, text, text_len);
1513 } 1534 }
1514 printed_len += text_len; 1535 printed_len += text_len;
@@ -1607,8 +1628,8 @@ static struct cont {
1607static struct log *log_from_idx(u32 idx) { return NULL; } 1628static struct log *log_from_idx(u32 idx) { return NULL; }
1608static u32 log_next(u32 idx) { return 0; } 1629static u32 log_next(u32 idx) { return 0; }
1609static void call_console_drivers(int level, const char *text, size_t len) {} 1630static void call_console_drivers(int level, const char *text, size_t len) {}
1610static size_t msg_print_text(const struct log *msg, bool syslog, 1631static size_t msg_print_text(const struct log *msg, enum log_flags prev,
1611 char *buf, size_t size) { return 0; } 1632 bool syslog, char *buf, size_t size) { return 0; }
1612static size_t cont_print_text(char *text, size_t size) { return 0; } 1633static size_t cont_print_text(char *text, size_t size) { return 0; }
1613 1634
1614#endif /* CONFIG_PRINTK */ 1635#endif /* CONFIG_PRINTK */
@@ -1884,6 +1905,7 @@ void wake_up_klogd(void)
1884/* the next printk record to write to the console */ 1905/* the next printk record to write to the console */
1885static u64 console_seq; 1906static u64 console_seq;
1886static u32 console_idx; 1907static u32 console_idx;
1908static enum log_flags console_prev;
1887 1909
1888/** 1910/**
1889 * console_unlock - unlock the console system 1911 * console_unlock - unlock the console system
@@ -1944,6 +1966,7 @@ again:
1944 /* messages are gone, move to first one */ 1966 /* messages are gone, move to first one */
1945 console_seq = log_first_seq; 1967 console_seq = log_first_seq;
1946 console_idx = log_first_idx; 1968 console_idx = log_first_idx;
1969 console_prev = 0;
1947 } 1970 }
1948skip: 1971skip:
1949 if (console_seq == log_next_seq) 1972 if (console_seq == log_next_seq)
@@ -1957,14 +1980,21 @@ skip:
1957 */ 1980 */
1958 console_idx = log_next(console_idx); 1981 console_idx = log_next(console_idx);
1959 console_seq++; 1982 console_seq++;
1983 /*
1984 * We will get here again when we register a new
1985 * CON_PRINTBUFFER console. Clear the flag so we
1986 * will properly dump everything later.
1987 */
1988 msg->flags &= ~LOG_NOCONS;
1960 goto skip; 1989 goto skip;
1961 } 1990 }
1962 1991
1963 level = msg->level; 1992 level = msg->level;
1964 len = msg_print_text(msg, false, text, sizeof(text)); 1993 len = msg_print_text(msg, console_prev, false,
1965 1994 text, sizeof(text));
1966 console_idx = log_next(console_idx); 1995 console_idx = log_next(console_idx);
1967 console_seq++; 1996 console_seq++;
1997 console_prev = msg->flags;
1968 raw_spin_unlock(&logbuf_lock); 1998 raw_spin_unlock(&logbuf_lock);
1969 1999
1970 stop_critical_timings(); /* don't trace print latency */ 2000 stop_critical_timings(); /* don't trace print latency */
@@ -2227,6 +2257,7 @@ void register_console(struct console *newcon)
2227 raw_spin_lock_irqsave(&logbuf_lock, flags); 2257 raw_spin_lock_irqsave(&logbuf_lock, flags);
2228 console_seq = syslog_seq; 2258 console_seq = syslog_seq;
2229 console_idx = syslog_idx; 2259 console_idx = syslog_idx;
2260 console_prev = syslog_prev;
2230 raw_spin_unlock_irqrestore(&logbuf_lock, flags); 2261 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2231 /* 2262 /*
2232 * We're about to replay the log buffer. Only do this to the 2263 * We're about to replay the log buffer. Only do this to the
@@ -2479,7 +2510,7 @@ void kmsg_dump(enum kmsg_dump_reason reason)
2479} 2510}
2480 2511
2481/** 2512/**
2482 * kmsg_dump_get_line - retrieve one kmsg log line 2513 * kmsg_dump_get_line_nolock - retrieve one kmsg log line (unlocked version)
2483 * @dumper: registered kmsg dumper 2514 * @dumper: registered kmsg dumper
2484 * @syslog: include the "<4>" prefixes 2515 * @syslog: include the "<4>" prefixes
2485 * @line: buffer to copy the line to 2516 * @line: buffer to copy the line to
@@ -2494,11 +2525,12 @@ void kmsg_dump(enum kmsg_dump_reason reason)
2494 * 2525 *
2495 * A return value of FALSE indicates that there are no more records to 2526 * A return value of FALSE indicates that there are no more records to
2496 * read. 2527 * read.
2528 *
2529 * The function is similar to kmsg_dump_get_line(), but grabs no locks.
2497 */ 2530 */
2498bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, 2531bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog,
2499 char *line, size_t size, size_t *len) 2532 char *line, size_t size, size_t *len)
2500{ 2533{
2501 unsigned long flags;
2502 struct log *msg; 2534 struct log *msg;
2503 size_t l = 0; 2535 size_t l = 0;
2504 bool ret = false; 2536 bool ret = false;
@@ -2506,7 +2538,6 @@ bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog,
2506 if (!dumper->active) 2538 if (!dumper->active)
2507 goto out; 2539 goto out;
2508 2540
2509 raw_spin_lock_irqsave(&logbuf_lock, flags);
2510 if (dumper->cur_seq < log_first_seq) { 2541 if (dumper->cur_seq < log_first_seq) {
2511 /* messages are gone, move to first available one */ 2542 /* messages are gone, move to first available one */
2512 dumper->cur_seq = log_first_seq; 2543 dumper->cur_seq = log_first_seq;
@@ -2514,24 +2545,50 @@ bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog,
2514 } 2545 }
2515 2546
2516 /* last entry */ 2547 /* last entry */
2517 if (dumper->cur_seq >= log_next_seq) { 2548 if (dumper->cur_seq >= log_next_seq)
2518 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2519 goto out; 2549 goto out;
2520 }
2521 2550
2522 msg = log_from_idx(dumper->cur_idx); 2551 msg = log_from_idx(dumper->cur_idx);
2523 l = msg_print_text(msg, syslog, 2552 l = msg_print_text(msg, 0, syslog, line, size);
2524 line, size);
2525 2553
2526 dumper->cur_idx = log_next(dumper->cur_idx); 2554 dumper->cur_idx = log_next(dumper->cur_idx);
2527 dumper->cur_seq++; 2555 dumper->cur_seq++;
2528 ret = true; 2556 ret = true;
2529 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2530out: 2557out:
2531 if (len) 2558 if (len)
2532 *len = l; 2559 *len = l;
2533 return ret; 2560 return ret;
2534} 2561}
2562
2563/**
2564 * kmsg_dump_get_line - retrieve one kmsg log line
2565 * @dumper: registered kmsg dumper
2566 * @syslog: include the "<4>" prefixes
2567 * @line: buffer to copy the line to
2568 * @size: maximum size of the buffer
2569 * @len: length of line placed into buffer
2570 *
2571 * Start at the beginning of the kmsg buffer, with the oldest kmsg
2572 * record, and copy one record into the provided buffer.
2573 *
2574 * Consecutive calls will return the next available record moving
2575 * towards the end of the buffer with the youngest messages.
2576 *
2577 * A return value of FALSE indicates that there are no more records to
2578 * read.
2579 */
2580bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog,
2581 char *line, size_t size, size_t *len)
2582{
2583 unsigned long flags;
2584 bool ret;
2585
2586 raw_spin_lock_irqsave(&logbuf_lock, flags);
2587 ret = kmsg_dump_get_line_nolock(dumper, syslog, line, size, len);
2588 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2589
2590 return ret;
2591}
2535EXPORT_SYMBOL_GPL(kmsg_dump_get_line); 2592EXPORT_SYMBOL_GPL(kmsg_dump_get_line);
2536 2593
2537/** 2594/**
@@ -2561,6 +2618,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
2561 u32 idx; 2618 u32 idx;
2562 u64 next_seq; 2619 u64 next_seq;
2563 u32 next_idx; 2620 u32 next_idx;
2621 enum log_flags prev;
2564 size_t l = 0; 2622 size_t l = 0;
2565 bool ret = false; 2623 bool ret = false;
2566 2624
@@ -2583,23 +2641,27 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
2583 /* calculate length of entire buffer */ 2641 /* calculate length of entire buffer */
2584 seq = dumper->cur_seq; 2642 seq = dumper->cur_seq;
2585 idx = dumper->cur_idx; 2643 idx = dumper->cur_idx;
2644 prev = 0;
2586 while (seq < dumper->next_seq) { 2645 while (seq < dumper->next_seq) {
2587 struct log *msg = log_from_idx(idx); 2646 struct log *msg = log_from_idx(idx);
2588 2647
2589 l += msg_print_text(msg, true, NULL, 0); 2648 l += msg_print_text(msg, prev, true, NULL, 0);
2590 idx = log_next(idx); 2649 idx = log_next(idx);
2591 seq++; 2650 seq++;
2651 prev = msg->flags;
2592 } 2652 }
2593 2653
2594 /* move first record forward until length fits into the buffer */ 2654 /* move first record forward until length fits into the buffer */
2595 seq = dumper->cur_seq; 2655 seq = dumper->cur_seq;
2596 idx = dumper->cur_idx; 2656 idx = dumper->cur_idx;
2657 prev = 0;
2597 while (l > size && seq < dumper->next_seq) { 2658 while (l > size && seq < dumper->next_seq) {
2598 struct log *msg = log_from_idx(idx); 2659 struct log *msg = log_from_idx(idx);
2599 2660
2600 l -= msg_print_text(msg, true, NULL, 0); 2661 l -= msg_print_text(msg, prev, true, NULL, 0);
2601 idx = log_next(idx); 2662 idx = log_next(idx);
2602 seq++; 2663 seq++;
2664 prev = msg->flags;
2603 } 2665 }
2604 2666
2605 /* last message in next interation */ 2667 /* last message in next interation */
@@ -2607,14 +2669,14 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
2607 next_idx = idx; 2669 next_idx = idx;
2608 2670
2609 l = 0; 2671 l = 0;
2672 prev = 0;
2610 while (seq < dumper->next_seq) { 2673 while (seq < dumper->next_seq) {
2611 struct log *msg = log_from_idx(idx); 2674 struct log *msg = log_from_idx(idx);
2612 2675
2613 l += msg_print_text(msg, syslog, 2676 l += msg_print_text(msg, prev, syslog, buf + l, size - l);
2614 buf + l, size - l);
2615
2616 idx = log_next(idx); 2677 idx = log_next(idx);
2617 seq++; 2678 seq++;
2679 prev = msg->flags;
2618 } 2680 }
2619 2681
2620 dumper->next_seq = next_seq; 2682 dumper->next_seq = next_seq;
@@ -2629,6 +2691,24 @@ out:
2629EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer); 2691EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer);
2630 2692
2631/** 2693/**
2694 * kmsg_dump_rewind_nolock - reset the interator (unlocked version)
2695 * @dumper: registered kmsg dumper
2696 *
2697 * Reset the dumper's iterator so that kmsg_dump_get_line() and
2698 * kmsg_dump_get_buffer() can be called again and used multiple
2699 * times within the same dumper.dump() callback.
2700 *
2701 * The function is similar to kmsg_dump_rewind(), but grabs no locks.
2702 */
2703void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper)
2704{
2705 dumper->cur_seq = clear_seq;
2706 dumper->cur_idx = clear_idx;
2707 dumper->next_seq = log_next_seq;
2708 dumper->next_idx = log_next_idx;
2709}
2710
2711/**
2632 * kmsg_dump_rewind - reset the interator 2712 * kmsg_dump_rewind - reset the interator
2633 * @dumper: registered kmsg dumper 2713 * @dumper: registered kmsg dumper
2634 * 2714 *
@@ -2641,10 +2721,7 @@ void kmsg_dump_rewind(struct kmsg_dumper *dumper)
2641 unsigned long flags; 2721 unsigned long flags;
2642 2722
2643 raw_spin_lock_irqsave(&logbuf_lock, flags); 2723 raw_spin_lock_irqsave(&logbuf_lock, flags);
2644 dumper->cur_seq = clear_seq; 2724 kmsg_dump_rewind_nolock(dumper);
2645 dumper->cur_idx = clear_idx;
2646 dumper->next_seq = log_next_seq;
2647 dumper->next_idx = log_next_idx;
2648 raw_spin_unlock_irqrestore(&logbuf_lock, flags); 2725 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2649} 2726}
2650EXPORT_SYMBOL_GPL(kmsg_dump_rewind); 2727EXPORT_SYMBOL_GPL(kmsg_dump_rewind);
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 95cba41ce1e9..4e6a61b15e86 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -54,6 +54,50 @@
54#ifdef CONFIG_PREEMPT_RCU 54#ifdef CONFIG_PREEMPT_RCU
55 55
56/* 56/*
57 * Preemptible RCU implementation for rcu_read_lock().
58 * Just increment ->rcu_read_lock_nesting, shared state will be updated
59 * if we block.
60 */
61void __rcu_read_lock(void)
62{
63 current->rcu_read_lock_nesting++;
64 barrier(); /* critical section after entry code. */
65}
66EXPORT_SYMBOL_GPL(__rcu_read_lock);
67
68/*
69 * Preemptible RCU implementation for rcu_read_unlock().
70 * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost
71 * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
72 * invoke rcu_read_unlock_special() to clean up after a context switch
73 * in an RCU read-side critical section and other special cases.
74 */
75void __rcu_read_unlock(void)
76{
77 struct task_struct *t = current;
78
79 if (t->rcu_read_lock_nesting != 1) {
80 --t->rcu_read_lock_nesting;
81 } else {
82 barrier(); /* critical section before exit code. */
83 t->rcu_read_lock_nesting = INT_MIN;
84 barrier(); /* assign before ->rcu_read_unlock_special load */
85 if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
86 rcu_read_unlock_special(t);
87 barrier(); /* ->rcu_read_unlock_special load before assign */
88 t->rcu_read_lock_nesting = 0;
89 }
90#ifdef CONFIG_PROVE_LOCKING
91 {
92 int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting);
93
94 WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2);
95 }
96#endif /* #ifdef CONFIG_PROVE_LOCKING */
97}
98EXPORT_SYMBOL_GPL(__rcu_read_unlock);
99
100/*
57 * Check for a task exiting while in a preemptible-RCU read-side 101 * Check for a task exiting while in a preemptible-RCU read-side
58 * critical section, clean up if so. No need to issue warnings, 102 * critical section, clean up if so. No need to issue warnings,
59 * as debug_check_no_locks_held() already does this if lockdep 103 * as debug_check_no_locks_held() already does this if lockdep
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 37a5444204d2..547b1fe5b052 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -172,7 +172,7 @@ void rcu_irq_enter(void)
172 local_irq_restore(flags); 172 local_irq_restore(flags);
173} 173}
174 174
175#ifdef CONFIG_PROVE_RCU 175#ifdef CONFIG_DEBUG_LOCK_ALLOC
176 176
177/* 177/*
178 * Test whether RCU thinks that the current CPU is idle. 178 * Test whether RCU thinks that the current CPU is idle.
@@ -183,7 +183,7 @@ int rcu_is_cpu_idle(void)
183} 183}
184EXPORT_SYMBOL(rcu_is_cpu_idle); 184EXPORT_SYMBOL(rcu_is_cpu_idle);
185 185
186#endif /* #ifdef CONFIG_PROVE_RCU */ 186#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
187 187
188/* 188/*
189 * Test whether the current CPU was interrupted from idle. Nested 189 * Test whether the current CPU was interrupted from idle. Nested
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index fc31a2d65100..918fd1e8509c 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -132,7 +132,6 @@ static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = {
132 RCU_TRACE(.rcb.name = "rcu_preempt") 132 RCU_TRACE(.rcb.name = "rcu_preempt")
133}; 133};
134 134
135static void rcu_read_unlock_special(struct task_struct *t);
136static int rcu_preempted_readers_exp(void); 135static int rcu_preempted_readers_exp(void);
137static void rcu_report_exp_done(void); 136static void rcu_report_exp_done(void);
138 137
@@ -351,8 +350,9 @@ static int rcu_initiate_boost(void)
351 rcu_preempt_ctrlblk.boost_tasks = 350 rcu_preempt_ctrlblk.boost_tasks =
352 rcu_preempt_ctrlblk.gp_tasks; 351 rcu_preempt_ctrlblk.gp_tasks;
353 invoke_rcu_callbacks(); 352 invoke_rcu_callbacks();
354 } else 353 } else {
355 RCU_TRACE(rcu_initiate_boost_trace()); 354 RCU_TRACE(rcu_initiate_boost_trace());
355 }
356 return 1; 356 return 1;
357} 357}
358 358
@@ -527,23 +527,11 @@ void rcu_preempt_note_context_switch(void)
527} 527}
528 528
529/* 529/*
530 * Tiny-preemptible RCU implementation for rcu_read_lock().
531 * Just increment ->rcu_read_lock_nesting, shared state will be updated
532 * if we block.
533 */
534void __rcu_read_lock(void)
535{
536 current->rcu_read_lock_nesting++;
537 barrier(); /* needed if we ever invoke rcu_read_lock in rcutiny.c */
538}
539EXPORT_SYMBOL_GPL(__rcu_read_lock);
540
541/*
542 * Handle special cases during rcu_read_unlock(), such as needing to 530 * Handle special cases during rcu_read_unlock(), such as needing to
543 * notify RCU core processing or task having blocked during the RCU 531 * notify RCU core processing or task having blocked during the RCU
544 * read-side critical section. 532 * read-side critical section.
545 */ 533 */
546static noinline void rcu_read_unlock_special(struct task_struct *t) 534void rcu_read_unlock_special(struct task_struct *t)
547{ 535{
548 int empty; 536 int empty;
549 int empty_exp; 537 int empty_exp;
@@ -627,38 +615,6 @@ static noinline void rcu_read_unlock_special(struct task_struct *t)
627} 615}
628 616
629/* 617/*
630 * Tiny-preemptible RCU implementation for rcu_read_unlock().
631 * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost
632 * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
633 * invoke rcu_read_unlock_special() to clean up after a context switch
634 * in an RCU read-side critical section and other special cases.
635 */
636void __rcu_read_unlock(void)
637{
638 struct task_struct *t = current;
639
640 barrier(); /* needed if we ever invoke rcu_read_unlock in rcutiny.c */
641 if (t->rcu_read_lock_nesting != 1)
642 --t->rcu_read_lock_nesting;
643 else {
644 t->rcu_read_lock_nesting = INT_MIN;
645 barrier(); /* assign before ->rcu_read_unlock_special load */
646 if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
647 rcu_read_unlock_special(t);
648 barrier(); /* ->rcu_read_unlock_special load before assign */
649 t->rcu_read_lock_nesting = 0;
650 }
651#ifdef CONFIG_PROVE_LOCKING
652 {
653 int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting);
654
655 WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2);
656 }
657#endif /* #ifdef CONFIG_PROVE_LOCKING */
658}
659EXPORT_SYMBOL_GPL(__rcu_read_unlock);
660
661/*
662 * Check for a quiescent state from the current CPU. When a task blocks, 618 * Check for a quiescent state from the current CPU. When a task blocks,
663 * the task is recorded in the rcu_preempt_ctrlblk structure, which is 619 * the task is recorded in the rcu_preempt_ctrlblk structure, which is
664 * checked elsewhere. This is called from the scheduling-clock interrupt. 620 * checked elsewhere. This is called from the scheduling-clock interrupt.
@@ -823,9 +779,9 @@ void synchronize_rcu_expedited(void)
823 rpcp->exp_tasks = NULL; 779 rpcp->exp_tasks = NULL;
824 780
825 /* Wait for tail of ->blkd_tasks list to drain. */ 781 /* Wait for tail of ->blkd_tasks list to drain. */
826 if (!rcu_preempted_readers_exp()) 782 if (!rcu_preempted_readers_exp()) {
827 local_irq_restore(flags); 783 local_irq_restore(flags);
828 else { 784 } else {
829 rcu_initiate_boost(); 785 rcu_initiate_boost();
830 local_irq_restore(flags); 786 local_irq_restore(flags);
831 wait_event(sync_rcu_preempt_exp_wq, 787 wait_event(sync_rcu_preempt_exp_wq,
@@ -846,8 +802,6 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
846 */ 802 */
847int rcu_preempt_needs_cpu(void) 803int rcu_preempt_needs_cpu(void)
848{ 804{
849 if (!rcu_preempt_running_reader())
850 rcu_preempt_cpu_qs();
851 return rcu_preempt_ctrlblk.rcb.rcucblist != NULL; 805 return rcu_preempt_ctrlblk.rcb.rcucblist != NULL;
852} 806}
853 807
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index e66b34ab7555..25b15033c61f 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -49,8 +49,7 @@
49#include <asm/byteorder.h> 49#include <asm/byteorder.h>
50 50
51MODULE_LICENSE("GPL"); 51MODULE_LICENSE("GPL");
52MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " 52MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@freedesktop.org>");
53 "Josh Triplett <josh@freedesktop.org>");
54 53
55static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */ 54static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */
56static int nfakewriters = 4; /* # fake writer threads */ 55static int nfakewriters = 4; /* # fake writer threads */
@@ -206,6 +205,7 @@ static unsigned long boost_starttime; /* jiffies of next boost test start. */
206DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ 205DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
207 /* and boost task create/destroy. */ 206 /* and boost task create/destroy. */
208static atomic_t barrier_cbs_count; /* Barrier callbacks registered. */ 207static atomic_t barrier_cbs_count; /* Barrier callbacks registered. */
208static bool barrier_phase; /* Test phase. */
209static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */ 209static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */
210static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */ 210static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */
211static DECLARE_WAIT_QUEUE_HEAD(barrier_wq); 211static DECLARE_WAIT_QUEUE_HEAD(barrier_wq);
@@ -407,8 +407,9 @@ rcu_torture_cb(struct rcu_head *p)
407 if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { 407 if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
408 rp->rtort_mbtest = 0; 408 rp->rtort_mbtest = 0;
409 rcu_torture_free(rp); 409 rcu_torture_free(rp);
410 } else 410 } else {
411 cur_ops->deferred_free(rp); 411 cur_ops->deferred_free(rp);
412 }
412} 413}
413 414
414static int rcu_no_completed(void) 415static int rcu_no_completed(void)
@@ -635,6 +636,17 @@ static void srcu_torture_synchronize(void)
635 synchronize_srcu(&srcu_ctl); 636 synchronize_srcu(&srcu_ctl);
636} 637}
637 638
639static void srcu_torture_call(struct rcu_head *head,
640 void (*func)(struct rcu_head *head))
641{
642 call_srcu(&srcu_ctl, head, func);
643}
644
645static void srcu_torture_barrier(void)
646{
647 srcu_barrier(&srcu_ctl);
648}
649
638static int srcu_torture_stats(char *page) 650static int srcu_torture_stats(char *page)
639{ 651{
640 int cnt = 0; 652 int cnt = 0;
@@ -661,8 +673,8 @@ static struct rcu_torture_ops srcu_ops = {
661 .completed = srcu_torture_completed, 673 .completed = srcu_torture_completed,
662 .deferred_free = srcu_torture_deferred_free, 674 .deferred_free = srcu_torture_deferred_free,
663 .sync = srcu_torture_synchronize, 675 .sync = srcu_torture_synchronize,
664 .call = NULL, 676 .call = srcu_torture_call,
665 .cb_barrier = NULL, 677 .cb_barrier = srcu_torture_barrier,
666 .stats = srcu_torture_stats, 678 .stats = srcu_torture_stats,
667 .name = "srcu" 679 .name = "srcu"
668}; 680};
@@ -1013,7 +1025,11 @@ rcu_torture_fakewriter(void *arg)
1013 do { 1025 do {
1014 schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); 1026 schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10);
1015 udelay(rcu_random(&rand) & 0x3ff); 1027 udelay(rcu_random(&rand) & 0x3ff);
1016 cur_ops->sync(); 1028 if (cur_ops->cb_barrier != NULL &&
1029 rcu_random(&rand) % (nfakewriters * 8) == 0)
1030 cur_ops->cb_barrier();
1031 else
1032 cur_ops->sync();
1017 rcu_stutter_wait("rcu_torture_fakewriter"); 1033 rcu_stutter_wait("rcu_torture_fakewriter");
1018 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 1034 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
1019 1035
@@ -1183,27 +1199,27 @@ rcu_torture_printk(char *page)
1183 } 1199 }
1184 cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); 1200 cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG);
1185 cnt += sprintf(&page[cnt], 1201 cnt += sprintf(&page[cnt],
1186 "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d " 1202 "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ",
1187 "rtmbe: %d rtbke: %ld rtbre: %ld "
1188 "rtbf: %ld rtb: %ld nt: %ld "
1189 "onoff: %ld/%ld:%ld/%ld "
1190 "barrier: %ld/%ld:%ld",
1191 rcu_torture_current, 1203 rcu_torture_current,
1192 rcu_torture_current_version, 1204 rcu_torture_current_version,
1193 list_empty(&rcu_torture_freelist), 1205 list_empty(&rcu_torture_freelist),
1194 atomic_read(&n_rcu_torture_alloc), 1206 atomic_read(&n_rcu_torture_alloc),
1195 atomic_read(&n_rcu_torture_alloc_fail), 1207 atomic_read(&n_rcu_torture_alloc_fail),
1196 atomic_read(&n_rcu_torture_free), 1208 atomic_read(&n_rcu_torture_free));
1209 cnt += sprintf(&page[cnt], "rtmbe: %d rtbke: %ld rtbre: %ld ",
1197 atomic_read(&n_rcu_torture_mberror), 1210 atomic_read(&n_rcu_torture_mberror),
1198 n_rcu_torture_boost_ktrerror, 1211 n_rcu_torture_boost_ktrerror,
1199 n_rcu_torture_boost_rterror, 1212 n_rcu_torture_boost_rterror);
1213 cnt += sprintf(&page[cnt], "rtbf: %ld rtb: %ld nt: %ld ",
1200 n_rcu_torture_boost_failure, 1214 n_rcu_torture_boost_failure,
1201 n_rcu_torture_boosts, 1215 n_rcu_torture_boosts,
1202 n_rcu_torture_timers, 1216 n_rcu_torture_timers);
1217 cnt += sprintf(&page[cnt], "onoff: %ld/%ld:%ld/%ld ",
1203 n_online_successes, 1218 n_online_successes,
1204 n_online_attempts, 1219 n_online_attempts,
1205 n_offline_successes, 1220 n_offline_successes,
1206 n_offline_attempts, 1221 n_offline_attempts);
1222 cnt += sprintf(&page[cnt], "barrier: %ld/%ld:%ld",
1207 n_barrier_successes, 1223 n_barrier_successes,
1208 n_barrier_attempts, 1224 n_barrier_attempts,
1209 n_rcu_torture_barrier_error); 1225 n_rcu_torture_barrier_error);
@@ -1445,8 +1461,7 @@ rcu_torture_shutdown(void *arg)
1445 delta = shutdown_time - jiffies_snap; 1461 delta = shutdown_time - jiffies_snap;
1446 if (verbose) 1462 if (verbose)
1447 printk(KERN_ALERT "%s" TORTURE_FLAG 1463 printk(KERN_ALERT "%s" TORTURE_FLAG
1448 "rcu_torture_shutdown task: %lu " 1464 "rcu_torture_shutdown task: %lu jiffies remaining\n",
1449 "jiffies remaining\n",
1450 torture_type, delta); 1465 torture_type, delta);
1451 schedule_timeout_interruptible(delta); 1466 schedule_timeout_interruptible(delta);
1452 jiffies_snap = ACCESS_ONCE(jiffies); 1467 jiffies_snap = ACCESS_ONCE(jiffies);
@@ -1498,8 +1513,7 @@ rcu_torture_onoff(void *arg)
1498 if (cpu_down(cpu) == 0) { 1513 if (cpu_down(cpu) == 0) {
1499 if (verbose) 1514 if (verbose)
1500 printk(KERN_ALERT "%s" TORTURE_FLAG 1515 printk(KERN_ALERT "%s" TORTURE_FLAG
1501 "rcu_torture_onoff task: " 1516 "rcu_torture_onoff task: offlined %d\n",
1502 "offlined %d\n",
1503 torture_type, cpu); 1517 torture_type, cpu);
1504 n_offline_successes++; 1518 n_offline_successes++;
1505 } 1519 }
@@ -1512,8 +1526,7 @@ rcu_torture_onoff(void *arg)
1512 if (cpu_up(cpu) == 0) { 1526 if (cpu_up(cpu) == 0) {
1513 if (verbose) 1527 if (verbose)
1514 printk(KERN_ALERT "%s" TORTURE_FLAG 1528 printk(KERN_ALERT "%s" TORTURE_FLAG
1515 "rcu_torture_onoff task: " 1529 "rcu_torture_onoff task: onlined %d\n",
1516 "onlined %d\n",
1517 torture_type, cpu); 1530 torture_type, cpu);
1518 n_online_successes++; 1531 n_online_successes++;
1519 } 1532 }
@@ -1631,6 +1644,7 @@ void rcu_torture_barrier_cbf(struct rcu_head *rcu)
1631static int rcu_torture_barrier_cbs(void *arg) 1644static int rcu_torture_barrier_cbs(void *arg)
1632{ 1645{
1633 long myid = (long)arg; 1646 long myid = (long)arg;
1647 bool lastphase = 0;
1634 struct rcu_head rcu; 1648 struct rcu_head rcu;
1635 1649
1636 init_rcu_head_on_stack(&rcu); 1650 init_rcu_head_on_stack(&rcu);
@@ -1638,9 +1652,11 @@ static int rcu_torture_barrier_cbs(void *arg)
1638 set_user_nice(current, 19); 1652 set_user_nice(current, 19);
1639 do { 1653 do {
1640 wait_event(barrier_cbs_wq[myid], 1654 wait_event(barrier_cbs_wq[myid],
1641 atomic_read(&barrier_cbs_count) == n_barrier_cbs || 1655 barrier_phase != lastphase ||
1642 kthread_should_stop() || 1656 kthread_should_stop() ||
1643 fullstop != FULLSTOP_DONTSTOP); 1657 fullstop != FULLSTOP_DONTSTOP);
1658 lastphase = barrier_phase;
1659 smp_mb(); /* ensure barrier_phase load before ->call(). */
1644 if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) 1660 if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP)
1645 break; 1661 break;
1646 cur_ops->call(&rcu, rcu_torture_barrier_cbf); 1662 cur_ops->call(&rcu, rcu_torture_barrier_cbf);
@@ -1665,7 +1681,8 @@ static int rcu_torture_barrier(void *arg)
1665 do { 1681 do {
1666 atomic_set(&barrier_cbs_invoked, 0); 1682 atomic_set(&barrier_cbs_invoked, 0);
1667 atomic_set(&barrier_cbs_count, n_barrier_cbs); 1683 atomic_set(&barrier_cbs_count, n_barrier_cbs);
1668 /* wake_up() path contains the required barriers. */ 1684 smp_mb(); /* Ensure barrier_phase after prior assignments. */
1685 barrier_phase = !barrier_phase;
1669 for (i = 0; i < n_barrier_cbs; i++) 1686 for (i = 0; i < n_barrier_cbs; i++)
1670 wake_up(&barrier_cbs_wq[i]); 1687 wake_up(&barrier_cbs_wq[i]);
1671 wait_event(barrier_wq, 1688 wait_event(barrier_wq,
@@ -1684,7 +1701,7 @@ static int rcu_torture_barrier(void *arg)
1684 schedule_timeout_interruptible(HZ / 10); 1701 schedule_timeout_interruptible(HZ / 10);
1685 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 1702 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
1686 VERBOSE_PRINTK_STRING("rcu_torture_barrier task stopping"); 1703 VERBOSE_PRINTK_STRING("rcu_torture_barrier task stopping");
1687 rcutorture_shutdown_absorb("rcu_torture_barrier_cbs"); 1704 rcutorture_shutdown_absorb("rcu_torture_barrier");
1688 while (!kthread_should_stop()) 1705 while (!kthread_should_stop())
1689 schedule_timeout_interruptible(1); 1706 schedule_timeout_interruptible(1);
1690 return 0; 1707 return 0;
@@ -1908,8 +1925,8 @@ rcu_torture_init(void)
1908 static struct rcu_torture_ops *torture_ops[] = 1925 static struct rcu_torture_ops *torture_ops[] =
1909 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, 1926 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops,
1910 &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, 1927 &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops,
1911 &srcu_ops, &srcu_sync_ops, &srcu_raw_ops, 1928 &srcu_ops, &srcu_sync_ops, &srcu_expedited_ops,
1912 &srcu_raw_sync_ops, &srcu_expedited_ops, 1929 &srcu_raw_ops, &srcu_raw_sync_ops,
1913 &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; 1930 &sched_ops, &sched_sync_ops, &sched_expedited_ops, };
1914 1931
1915 mutex_lock(&fullstop_mutex); 1932 mutex_lock(&fullstop_mutex);
@@ -1931,8 +1948,7 @@ rcu_torture_init(void)
1931 return -EINVAL; 1948 return -EINVAL;
1932 } 1949 }
1933 if (cur_ops->fqs == NULL && fqs_duration != 0) { 1950 if (cur_ops->fqs == NULL && fqs_duration != 0) {
1934 printk(KERN_ALERT "rcu-torture: ->fqs NULL and non-zero " 1951 printk(KERN_ALERT "rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n");
1935 "fqs_duration, fqs disabled.\n");
1936 fqs_duration = 0; 1952 fqs_duration = 0;
1937 } 1953 }
1938 if (cur_ops->init) 1954 if (cur_ops->init)
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 38ecdda3f55f..f280e542e3e9 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -60,36 +60,44 @@
60 60
61/* Data structures. */ 61/* Data structures. */
62 62
63static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; 63static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
64 64
65#define RCU_STATE_INITIALIZER(structname) { \ 65#define RCU_STATE_INITIALIZER(sname, cr) { \
66 .level = { &structname##_state.node[0] }, \ 66 .level = { &sname##_state.node[0] }, \
67 .levelcnt = { \ 67 .call = cr, \
68 NUM_RCU_LVL_0, /* root of hierarchy. */ \
69 NUM_RCU_LVL_1, \
70 NUM_RCU_LVL_2, \
71 NUM_RCU_LVL_3, \
72 NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \
73 }, \
74 .fqs_state = RCU_GP_IDLE, \ 68 .fqs_state = RCU_GP_IDLE, \
75 .gpnum = -300, \ 69 .gpnum = -300, \
76 .completed = -300, \ 70 .completed = -300, \
77 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \ 71 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.onofflock), \
78 .orphan_nxttail = &structname##_state.orphan_nxtlist, \ 72 .orphan_nxttail = &sname##_state.orphan_nxtlist, \
79 .orphan_donetail = &structname##_state.orphan_donelist, \ 73 .orphan_donetail = &sname##_state.orphan_donelist, \
80 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.fqslock), \ 74 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
81 .n_force_qs = 0, \ 75 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.fqslock), \
82 .n_force_qs_ngp = 0, \ 76 .name = #sname, \
83 .name = #structname, \
84} 77}
85 78
86struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched); 79struct rcu_state rcu_sched_state =
80 RCU_STATE_INITIALIZER(rcu_sched, call_rcu_sched);
87DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); 81DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
88 82
89struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh); 83struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, call_rcu_bh);
90DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 84DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
91 85
92static struct rcu_state *rcu_state; 86static struct rcu_state *rcu_state;
87LIST_HEAD(rcu_struct_flavors);
88
89/* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */
90static int rcu_fanout_leaf = CONFIG_RCU_FANOUT_LEAF;
91module_param(rcu_fanout_leaf, int, 0);
92int rcu_num_lvls __read_mostly = RCU_NUM_LVLS;
93static int num_rcu_lvl[] = { /* Number of rcu_nodes at specified level. */
94 NUM_RCU_LVL_0,
95 NUM_RCU_LVL_1,
96 NUM_RCU_LVL_2,
97 NUM_RCU_LVL_3,
98 NUM_RCU_LVL_4,
99};
100int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
93 101
94/* 102/*
95 * The rcu_scheduler_active variable transitions from zero to one just 103 * The rcu_scheduler_active variable transitions from zero to one just
@@ -147,13 +155,6 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
147unsigned long rcutorture_testseq; 155unsigned long rcutorture_testseq;
148unsigned long rcutorture_vernum; 156unsigned long rcutorture_vernum;
149 157
150/* State information for rcu_barrier() and friends. */
151
152static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
153static atomic_t rcu_barrier_cpu_count;
154static DEFINE_MUTEX(rcu_barrier_mutex);
155static struct completion rcu_barrier_completion;
156
157/* 158/*
158 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s 159 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s
159 * permit this function to be invoked without holding the root rcu_node 160 * permit this function to be invoked without holding the root rcu_node
@@ -201,6 +202,7 @@ void rcu_note_context_switch(int cpu)
201{ 202{
202 trace_rcu_utilization("Start context switch"); 203 trace_rcu_utilization("Start context switch");
203 rcu_sched_qs(cpu); 204 rcu_sched_qs(cpu);
205 rcu_preempt_note_context_switch(cpu);
204 trace_rcu_utilization("End context switch"); 206 trace_rcu_utilization("End context switch");
205} 207}
206EXPORT_SYMBOL_GPL(rcu_note_context_switch); 208EXPORT_SYMBOL_GPL(rcu_note_context_switch);
@@ -357,7 +359,7 @@ static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval)
357 struct task_struct *idle = idle_task(smp_processor_id()); 359 struct task_struct *idle = idle_task(smp_processor_id());
358 360
359 trace_rcu_dyntick("Error on entry: not idle task", oldval, 0); 361 trace_rcu_dyntick("Error on entry: not idle task", oldval, 0);
360 ftrace_dump(DUMP_ALL); 362 ftrace_dump(DUMP_ORIG);
361 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", 363 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
362 current->pid, current->comm, 364 current->pid, current->comm,
363 idle->pid, idle->comm); /* must be idle task! */ 365 idle->pid, idle->comm); /* must be idle task! */
@@ -467,7 +469,7 @@ static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval)
467 469
468 trace_rcu_dyntick("Error on exit: not idle task", 470 trace_rcu_dyntick("Error on exit: not idle task",
469 oldval, rdtp->dynticks_nesting); 471 oldval, rdtp->dynticks_nesting);
470 ftrace_dump(DUMP_ALL); 472 ftrace_dump(DUMP_ORIG);
471 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", 473 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
472 current->pid, current->comm, 474 current->pid, current->comm,
473 idle->pid, idle->comm); /* must be idle task! */ 475 idle->pid, idle->comm); /* must be idle task! */
@@ -584,8 +586,6 @@ void rcu_nmi_exit(void)
584 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); 586 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
585} 587}
586 588
587#ifdef CONFIG_PROVE_RCU
588
589/** 589/**
590 * rcu_is_cpu_idle - see if RCU thinks that the current CPU is idle 590 * rcu_is_cpu_idle - see if RCU thinks that the current CPU is idle
591 * 591 *
@@ -603,7 +603,7 @@ int rcu_is_cpu_idle(void)
603} 603}
604EXPORT_SYMBOL(rcu_is_cpu_idle); 604EXPORT_SYMBOL(rcu_is_cpu_idle);
605 605
606#ifdef CONFIG_HOTPLUG_CPU 606#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU)
607 607
608/* 608/*
609 * Is the current CPU online? Disable preemption to avoid false positives 609 * Is the current CPU online? Disable preemption to avoid false positives
@@ -644,9 +644,7 @@ bool rcu_lockdep_current_cpu_online(void)
644} 644}
645EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online); 645EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);
646 646
647#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 647#endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */
648
649#endif /* #ifdef CONFIG_PROVE_RCU */
650 648
651/** 649/**
652 * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle 650 * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle
@@ -732,7 +730,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
732 int cpu; 730 int cpu;
733 long delta; 731 long delta;
734 unsigned long flags; 732 unsigned long flags;
735 int ndetected; 733 int ndetected = 0;
736 struct rcu_node *rnp = rcu_get_root(rsp); 734 struct rcu_node *rnp = rcu_get_root(rsp);
737 735
738 /* Only let one CPU complain about others per time interval. */ 736 /* Only let one CPU complain about others per time interval. */
@@ -773,7 +771,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
773 */ 771 */
774 rnp = rcu_get_root(rsp); 772 rnp = rcu_get_root(rsp);
775 raw_spin_lock_irqsave(&rnp->lock, flags); 773 raw_spin_lock_irqsave(&rnp->lock, flags);
776 ndetected = rcu_print_task_stall(rnp); 774 ndetected += rcu_print_task_stall(rnp);
777 raw_spin_unlock_irqrestore(&rnp->lock, flags); 775 raw_spin_unlock_irqrestore(&rnp->lock, flags);
778 776
779 print_cpu_stall_info_end(); 777 print_cpu_stall_info_end();
@@ -859,9 +857,10 @@ static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
859 */ 857 */
860void rcu_cpu_stall_reset(void) 858void rcu_cpu_stall_reset(void)
861{ 859{
862 rcu_sched_state.jiffies_stall = jiffies + ULONG_MAX / 2; 860 struct rcu_state *rsp;
863 rcu_bh_state.jiffies_stall = jiffies + ULONG_MAX / 2; 861
864 rcu_preempt_stall_reset(); 862 for_each_rcu_flavor(rsp)
863 rsp->jiffies_stall = jiffies + ULONG_MAX / 2;
865} 864}
866 865
867static struct notifier_block rcu_panic_block = { 866static struct notifier_block rcu_panic_block = {
@@ -893,8 +892,9 @@ static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct
893 if (rnp->qsmask & rdp->grpmask) { 892 if (rnp->qsmask & rdp->grpmask) {
894 rdp->qs_pending = 1; 893 rdp->qs_pending = 1;
895 rdp->passed_quiesce = 0; 894 rdp->passed_quiesce = 0;
896 } else 895 } else {
897 rdp->qs_pending = 0; 896 rdp->qs_pending = 0;
897 }
898 zero_cpu_stall_ticks(rdp); 898 zero_cpu_stall_ticks(rdp);
899 } 899 }
900} 900}
@@ -936,6 +936,18 @@ check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp)
936} 936}
937 937
938/* 938/*
939 * Initialize the specified rcu_data structure's callback list to empty.
940 */
941static void init_callback_list(struct rcu_data *rdp)
942{
943 int i;
944
945 rdp->nxtlist = NULL;
946 for (i = 0; i < RCU_NEXT_SIZE; i++)
947 rdp->nxttail[i] = &rdp->nxtlist;
948}
949
950/*
939 * Advance this CPU's callbacks, but only if the current grace period 951 * Advance this CPU's callbacks, but only if the current grace period
940 * has ended. This may be called only from the CPU to whom the rdp 952 * has ended. This may be called only from the CPU to whom the rdp
941 * belongs. In addition, the corresponding leaf rcu_node structure's 953 * belongs. In addition, the corresponding leaf rcu_node structure's
@@ -1327,8 +1339,6 @@ static void
1327rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, 1339rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
1328 struct rcu_node *rnp, struct rcu_data *rdp) 1340 struct rcu_node *rnp, struct rcu_data *rdp)
1329{ 1341{
1330 int i;
1331
1332 /* 1342 /*
1333 * Orphan the callbacks. First adjust the counts. This is safe 1343 * Orphan the callbacks. First adjust the counts. This is safe
1334 * because ->onofflock excludes _rcu_barrier()'s adoption of 1344 * because ->onofflock excludes _rcu_barrier()'s adoption of
@@ -1339,7 +1349,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
1339 rsp->qlen += rdp->qlen; 1349 rsp->qlen += rdp->qlen;
1340 rdp->n_cbs_orphaned += rdp->qlen; 1350 rdp->n_cbs_orphaned += rdp->qlen;
1341 rdp->qlen_lazy = 0; 1351 rdp->qlen_lazy = 0;
1342 rdp->qlen = 0; 1352 ACCESS_ONCE(rdp->qlen) = 0;
1343 } 1353 }
1344 1354
1345 /* 1355 /*
@@ -1368,9 +1378,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
1368 } 1378 }
1369 1379
1370 /* Finally, initialize the rcu_data structure's list to empty. */ 1380 /* Finally, initialize the rcu_data structure's list to empty. */
1371 rdp->nxtlist = NULL; 1381 init_callback_list(rdp);
1372 for (i = 0; i < RCU_NEXT_SIZE; i++)
1373 rdp->nxttail[i] = &rdp->nxtlist;
1374} 1382}
1375 1383
1376/* 1384/*
@@ -1504,6 +1512,9 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
1504 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1512 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1505 if (need_report & RCU_OFL_TASKS_EXP_GP) 1513 if (need_report & RCU_OFL_TASKS_EXP_GP)
1506 rcu_report_exp_rnp(rsp, rnp, true); 1514 rcu_report_exp_rnp(rsp, rnp, true);
1515 WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL,
1516 "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n",
1517 cpu, rdp->qlen, rdp->nxtlist);
1507} 1518}
1508 1519
1509#else /* #ifdef CONFIG_HOTPLUG_CPU */ 1520#else /* #ifdef CONFIG_HOTPLUG_CPU */
@@ -1591,7 +1602,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1591 } 1602 }
1592 smp_mb(); /* List handling before counting for rcu_barrier(). */ 1603 smp_mb(); /* List handling before counting for rcu_barrier(). */
1593 rdp->qlen_lazy -= count_lazy; 1604 rdp->qlen_lazy -= count_lazy;
1594 rdp->qlen -= count; 1605 ACCESS_ONCE(rdp->qlen) -= count;
1595 rdp->n_cbs_invoked += count; 1606 rdp->n_cbs_invoked += count;
1596 1607
1597 /* Reinstate batch limit if we have worked down the excess. */ 1608 /* Reinstate batch limit if we have worked down the excess. */
@@ -1604,6 +1615,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1604 rdp->n_force_qs_snap = rsp->n_force_qs; 1615 rdp->n_force_qs_snap = rsp->n_force_qs;
1605 } else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark) 1616 } else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark)
1606 rdp->qlen_last_fqs_check = rdp->qlen; 1617 rdp->qlen_last_fqs_check = rdp->qlen;
1618 WARN_ON_ONCE((rdp->nxtlist == NULL) != (rdp->qlen == 0));
1607 1619
1608 local_irq_restore(flags); 1620 local_irq_restore(flags);
1609 1621
@@ -1744,8 +1756,6 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1744 break; /* grace period idle or initializing, ignore. */ 1756 break; /* grace period idle or initializing, ignore. */
1745 1757
1746 case RCU_SAVE_DYNTICK: 1758 case RCU_SAVE_DYNTICK:
1747 if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK)
1748 break; /* So gcc recognizes the dead code. */
1749 1759
1750 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ 1760 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1751 1761
@@ -1787,9 +1797,10 @@ unlock_fqs_ret:
1787 * whom the rdp belongs. 1797 * whom the rdp belongs.
1788 */ 1798 */
1789static void 1799static void
1790__rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) 1800__rcu_process_callbacks(struct rcu_state *rsp)
1791{ 1801{
1792 unsigned long flags; 1802 unsigned long flags;
1803 struct rcu_data *rdp = __this_cpu_ptr(rsp->rda);
1793 1804
1794 WARN_ON_ONCE(rdp->beenonline == 0); 1805 WARN_ON_ONCE(rdp->beenonline == 0);
1795 1806
@@ -1825,11 +1836,11 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1825 */ 1836 */
1826static void rcu_process_callbacks(struct softirq_action *unused) 1837static void rcu_process_callbacks(struct softirq_action *unused)
1827{ 1838{
1839 struct rcu_state *rsp;
1840
1828 trace_rcu_utilization("Start RCU core"); 1841 trace_rcu_utilization("Start RCU core");
1829 __rcu_process_callbacks(&rcu_sched_state, 1842 for_each_rcu_flavor(rsp)
1830 &__get_cpu_var(rcu_sched_data)); 1843 __rcu_process_callbacks(rsp);
1831 __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
1832 rcu_preempt_process_callbacks();
1833 trace_rcu_utilization("End RCU core"); 1844 trace_rcu_utilization("End RCU core");
1834} 1845}
1835 1846
@@ -1856,6 +1867,56 @@ static void invoke_rcu_core(void)
1856 raise_softirq(RCU_SOFTIRQ); 1867 raise_softirq(RCU_SOFTIRQ);
1857} 1868}
1858 1869
1870/*
1871 * Handle any core-RCU processing required by a call_rcu() invocation.
1872 */
1873static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
1874 struct rcu_head *head, unsigned long flags)
1875{
1876 /*
1877 * If called from an extended quiescent state, invoke the RCU
1878 * core in order to force a re-evaluation of RCU's idleness.
1879 */
1880 if (rcu_is_cpu_idle() && cpu_online(smp_processor_id()))
1881 invoke_rcu_core();
1882
1883 /* If interrupts were disabled or CPU offline, don't invoke RCU core. */
1884 if (irqs_disabled_flags(flags) || cpu_is_offline(smp_processor_id()))
1885 return;
1886
1887 /*
1888 * Force the grace period if too many callbacks or too long waiting.
1889 * Enforce hysteresis, and don't invoke force_quiescent_state()
1890 * if some other CPU has recently done so. Also, don't bother
1891 * invoking force_quiescent_state() if the newly enqueued callback
1892 * is the only one waiting for a grace period to complete.
1893 */
1894 if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
1895
1896 /* Are we ignoring a completed grace period? */
1897 rcu_process_gp_end(rsp, rdp);
1898 check_for_new_grace_period(rsp, rdp);
1899
1900 /* Start a new grace period if one not already started. */
1901 if (!rcu_gp_in_progress(rsp)) {
1902 unsigned long nestflag;
1903 struct rcu_node *rnp_root = rcu_get_root(rsp);
1904
1905 raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
1906 rcu_start_gp(rsp, nestflag); /* rlses rnp_root->lock */
1907 } else {
1908 /* Give the grace period a kick. */
1909 rdp->blimit = LONG_MAX;
1910 if (rsp->n_force_qs == rdp->n_force_qs_snap &&
1911 *rdp->nxttail[RCU_DONE_TAIL] != head)
1912 force_quiescent_state(rsp, 0);
1913 rdp->n_force_qs_snap = rsp->n_force_qs;
1914 rdp->qlen_last_fqs_check = rdp->qlen;
1915 }
1916 } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
1917 force_quiescent_state(rsp, 1);
1918}
1919
1859static void 1920static void
1860__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), 1921__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1861 struct rcu_state *rsp, bool lazy) 1922 struct rcu_state *rsp, bool lazy)
@@ -1880,7 +1941,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1880 rdp = this_cpu_ptr(rsp->rda); 1941 rdp = this_cpu_ptr(rsp->rda);
1881 1942
1882 /* Add the callback to our list. */ 1943 /* Add the callback to our list. */
1883 rdp->qlen++; 1944 ACCESS_ONCE(rdp->qlen)++;
1884 if (lazy) 1945 if (lazy)
1885 rdp->qlen_lazy++; 1946 rdp->qlen_lazy++;
1886 else 1947 else
@@ -1895,43 +1956,8 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1895 else 1956 else
1896 trace_rcu_callback(rsp->name, head, rdp->qlen_lazy, rdp->qlen); 1957 trace_rcu_callback(rsp->name, head, rdp->qlen_lazy, rdp->qlen);
1897 1958
1898 /* If interrupts were disabled, don't dive into RCU core. */ 1959 /* Go handle any RCU core processing required. */
1899 if (irqs_disabled_flags(flags)) { 1960 __call_rcu_core(rsp, rdp, head, flags);
1900 local_irq_restore(flags);
1901 return;
1902 }
1903
1904 /*
1905 * Force the grace period if too many callbacks or too long waiting.
1906 * Enforce hysteresis, and don't invoke force_quiescent_state()
1907 * if some other CPU has recently done so. Also, don't bother
1908 * invoking force_quiescent_state() if the newly enqueued callback
1909 * is the only one waiting for a grace period to complete.
1910 */
1911 if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
1912
1913 /* Are we ignoring a completed grace period? */
1914 rcu_process_gp_end(rsp, rdp);
1915 check_for_new_grace_period(rsp, rdp);
1916
1917 /* Start a new grace period if one not already started. */
1918 if (!rcu_gp_in_progress(rsp)) {
1919 unsigned long nestflag;
1920 struct rcu_node *rnp_root = rcu_get_root(rsp);
1921
1922 raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
1923 rcu_start_gp(rsp, nestflag); /* rlses rnp_root->lock */
1924 } else {
1925 /* Give the grace period a kick. */
1926 rdp->blimit = LONG_MAX;
1927 if (rsp->n_force_qs == rdp->n_force_qs_snap &&
1928 *rdp->nxttail[RCU_DONE_TAIL] != head)
1929 force_quiescent_state(rsp, 0);
1930 rdp->n_force_qs_snap = rsp->n_force_qs;
1931 rdp->qlen_last_fqs_check = rdp->qlen;
1932 }
1933 } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
1934 force_quiescent_state(rsp, 1);
1935 local_irq_restore(flags); 1961 local_irq_restore(flags);
1936} 1962}
1937 1963
@@ -1961,28 +1987,16 @@ EXPORT_SYMBOL_GPL(call_rcu_bh);
1961 * occasionally incorrectly indicate that there are multiple CPUs online 1987 * occasionally incorrectly indicate that there are multiple CPUs online
1962 * when there was in fact only one the whole time, as this just adds 1988 * when there was in fact only one the whole time, as this just adds
1963 * some overhead: RCU still operates correctly. 1989 * some overhead: RCU still operates correctly.
1964 *
1965 * Of course, sampling num_online_cpus() with preemption enabled can
1966 * give erroneous results if there are concurrent CPU-hotplug operations.
1967 * For example, given a demonic sequence of preemptions in num_online_cpus()
1968 * and CPU-hotplug operations, there could be two or more CPUs online at
1969 * all times, but num_online_cpus() might well return one (or even zero).
1970 *
1971 * However, all such demonic sequences require at least one CPU-offline
1972 * operation. Furthermore, rcu_blocking_is_gp() giving the wrong answer
1973 * is only a problem if there is an RCU read-side critical section executing
1974 * throughout. But RCU-sched and RCU-bh read-side critical sections
1975 * disable either preemption or bh, which prevents a CPU from going offline.
1976 * Therefore, the only way that rcu_blocking_is_gp() can incorrectly return
1977 * that there is only one CPU when in fact there was more than one throughout
1978 * is when there were no RCU readers in the system. If there are no
1979 * RCU readers, the grace period by definition can be of zero length,
1980 * regardless of the number of online CPUs.
1981 */ 1990 */
1982static inline int rcu_blocking_is_gp(void) 1991static inline int rcu_blocking_is_gp(void)
1983{ 1992{
1993 int ret;
1994
1984 might_sleep(); /* Check for RCU read-side critical section. */ 1995 might_sleep(); /* Check for RCU read-side critical section. */
1985 return num_online_cpus() <= 1; 1996 preempt_disable();
1997 ret = num_online_cpus() <= 1;
1998 preempt_enable();
1999 return ret;
1986} 2000}
1987 2001
1988/** 2002/**
@@ -2117,9 +2131,9 @@ void synchronize_sched_expedited(void)
2117 put_online_cpus(); 2131 put_online_cpus();
2118 2132
2119 /* No joy, try again later. Or just synchronize_sched(). */ 2133 /* No joy, try again later. Or just synchronize_sched(). */
2120 if (trycount++ < 10) 2134 if (trycount++ < 10) {
2121 udelay(trycount * num_online_cpus()); 2135 udelay(trycount * num_online_cpus());
2122 else { 2136 } else {
2123 synchronize_sched(); 2137 synchronize_sched();
2124 return; 2138 return;
2125 } 2139 }
@@ -2240,9 +2254,12 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
2240 */ 2254 */
2241static int rcu_pending(int cpu) 2255static int rcu_pending(int cpu)
2242{ 2256{
2243 return __rcu_pending(&rcu_sched_state, &per_cpu(rcu_sched_data, cpu)) || 2257 struct rcu_state *rsp;
2244 __rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu)) || 2258
2245 rcu_preempt_pending(cpu); 2259 for_each_rcu_flavor(rsp)
2260 if (__rcu_pending(rsp, per_cpu_ptr(rsp->rda, cpu)))
2261 return 1;
2262 return 0;
2246} 2263}
2247 2264
2248/* 2265/*
@@ -2252,20 +2269,41 @@ static int rcu_pending(int cpu)
2252 */ 2269 */
2253static int rcu_cpu_has_callbacks(int cpu) 2270static int rcu_cpu_has_callbacks(int cpu)
2254{ 2271{
2272 struct rcu_state *rsp;
2273
2255 /* RCU callbacks either ready or pending? */ 2274 /* RCU callbacks either ready or pending? */
2256 return per_cpu(rcu_sched_data, cpu).nxtlist || 2275 for_each_rcu_flavor(rsp)
2257 per_cpu(rcu_bh_data, cpu).nxtlist || 2276 if (per_cpu_ptr(rsp->rda, cpu)->nxtlist)
2258 rcu_preempt_cpu_has_callbacks(cpu); 2277 return 1;
2278 return 0;
2279}
2280
2281/*
2282 * Helper function for _rcu_barrier() tracing. If tracing is disabled,
2283 * the compiler is expected to optimize this away.
2284 */
2285static void _rcu_barrier_trace(struct rcu_state *rsp, char *s,
2286 int cpu, unsigned long done)
2287{
2288 trace_rcu_barrier(rsp->name, s, cpu,
2289 atomic_read(&rsp->barrier_cpu_count), done);
2259} 2290}
2260 2291
2261/* 2292/*
2262 * RCU callback function for _rcu_barrier(). If we are last, wake 2293 * RCU callback function for _rcu_barrier(). If we are last, wake
2263 * up the task executing _rcu_barrier(). 2294 * up the task executing _rcu_barrier().
2264 */ 2295 */
2265static void rcu_barrier_callback(struct rcu_head *notused) 2296static void rcu_barrier_callback(struct rcu_head *rhp)
2266{ 2297{
2267 if (atomic_dec_and_test(&rcu_barrier_cpu_count)) 2298 struct rcu_data *rdp = container_of(rhp, struct rcu_data, barrier_head);
2268 complete(&rcu_barrier_completion); 2299 struct rcu_state *rsp = rdp->rsp;
2300
2301 if (atomic_dec_and_test(&rsp->barrier_cpu_count)) {
2302 _rcu_barrier_trace(rsp, "LastCB", -1, rsp->n_barrier_done);
2303 complete(&rsp->barrier_completion);
2304 } else {
2305 _rcu_barrier_trace(rsp, "CB", -1, rsp->n_barrier_done);
2306 }
2269} 2307}
2270 2308
2271/* 2309/*
@@ -2273,35 +2311,63 @@ static void rcu_barrier_callback(struct rcu_head *notused)
2273 */ 2311 */
2274static void rcu_barrier_func(void *type) 2312static void rcu_barrier_func(void *type)
2275{ 2313{
2276 int cpu = smp_processor_id(); 2314 struct rcu_state *rsp = type;
2277 struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu); 2315 struct rcu_data *rdp = __this_cpu_ptr(rsp->rda);
2278 void (*call_rcu_func)(struct rcu_head *head,
2279 void (*func)(struct rcu_head *head));
2280 2316
2281 atomic_inc(&rcu_barrier_cpu_count); 2317 _rcu_barrier_trace(rsp, "IRQ", -1, rsp->n_barrier_done);
2282 call_rcu_func = type; 2318 atomic_inc(&rsp->barrier_cpu_count);
2283 call_rcu_func(head, rcu_barrier_callback); 2319 rsp->call(&rdp->barrier_head, rcu_barrier_callback);
2284} 2320}
2285 2321
2286/* 2322/*
2287 * Orchestrate the specified type of RCU barrier, waiting for all 2323 * Orchestrate the specified type of RCU barrier, waiting for all
2288 * RCU callbacks of the specified type to complete. 2324 * RCU callbacks of the specified type to complete.
2289 */ 2325 */
2290static void _rcu_barrier(struct rcu_state *rsp, 2326static void _rcu_barrier(struct rcu_state *rsp)
2291 void (*call_rcu_func)(struct rcu_head *head,
2292 void (*func)(struct rcu_head *head)))
2293{ 2327{
2294 int cpu; 2328 int cpu;
2295 unsigned long flags; 2329 unsigned long flags;
2296 struct rcu_data *rdp; 2330 struct rcu_data *rdp;
2297 struct rcu_head rh; 2331 struct rcu_data rd;
2332 unsigned long snap = ACCESS_ONCE(rsp->n_barrier_done);
2333 unsigned long snap_done;
2298 2334
2299 init_rcu_head_on_stack(&rh); 2335 init_rcu_head_on_stack(&rd.barrier_head);
2336 _rcu_barrier_trace(rsp, "Begin", -1, snap);
2300 2337
2301 /* Take mutex to serialize concurrent rcu_barrier() requests. */ 2338 /* Take mutex to serialize concurrent rcu_barrier() requests. */
2302 mutex_lock(&rcu_barrier_mutex); 2339 mutex_lock(&rsp->barrier_mutex);
2340
2341 /*
2342 * Ensure that all prior references, including to ->n_barrier_done,
2343 * are ordered before the _rcu_barrier() machinery.
2344 */
2345 smp_mb(); /* See above block comment. */
2346
2347 /*
2348 * Recheck ->n_barrier_done to see if others did our work for us.
2349 * This means checking ->n_barrier_done for an even-to-odd-to-even
2350 * transition. The "if" expression below therefore rounds the old
2351 * value up to the next even number and adds two before comparing.
2352 */
2353 snap_done = ACCESS_ONCE(rsp->n_barrier_done);
2354 _rcu_barrier_trace(rsp, "Check", -1, snap_done);
2355 if (ULONG_CMP_GE(snap_done, ((snap + 1) & ~0x1) + 2)) {
2356 _rcu_barrier_trace(rsp, "EarlyExit", -1, snap_done);
2357 smp_mb(); /* caller's subsequent code after above check. */
2358 mutex_unlock(&rsp->barrier_mutex);
2359 return;
2360 }
2303 2361
2304 smp_mb(); /* Prevent any prior operations from leaking in. */ 2362 /*
2363 * Increment ->n_barrier_done to avoid duplicate work. Use
2364 * ACCESS_ONCE() to prevent the compiler from speculating
2365 * the increment to precede the early-exit check.
2366 */
2367 ACCESS_ONCE(rsp->n_barrier_done)++;
2368 WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 1);
2369 _rcu_barrier_trace(rsp, "Inc1", -1, rsp->n_barrier_done);
2370 smp_mb(); /* Order ->n_barrier_done increment with below mechanism. */
2305 2371
2306 /* 2372 /*
2307 * Initialize the count to one rather than to zero in order to 2373 * Initialize the count to one rather than to zero in order to
@@ -2320,8 +2386,8 @@ static void _rcu_barrier(struct rcu_state *rsp,
2320 * 6. Both rcu_barrier_callback() callbacks are invoked, awakening 2386 * 6. Both rcu_barrier_callback() callbacks are invoked, awakening
2321 * us -- but before CPU 1's orphaned callbacks are invoked!!! 2387 * us -- but before CPU 1's orphaned callbacks are invoked!!!
2322 */ 2388 */
2323 init_completion(&rcu_barrier_completion); 2389 init_completion(&rsp->barrier_completion);
2324 atomic_set(&rcu_barrier_cpu_count, 1); 2390 atomic_set(&rsp->barrier_cpu_count, 1);
2325 raw_spin_lock_irqsave(&rsp->onofflock, flags); 2391 raw_spin_lock_irqsave(&rsp->onofflock, flags);
2326 rsp->rcu_barrier_in_progress = current; 2392 rsp->rcu_barrier_in_progress = current;
2327 raw_spin_unlock_irqrestore(&rsp->onofflock, flags); 2393 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
@@ -2337,14 +2403,19 @@ static void _rcu_barrier(struct rcu_state *rsp,
2337 preempt_disable(); 2403 preempt_disable();
2338 rdp = per_cpu_ptr(rsp->rda, cpu); 2404 rdp = per_cpu_ptr(rsp->rda, cpu);
2339 if (cpu_is_offline(cpu)) { 2405 if (cpu_is_offline(cpu)) {
2406 _rcu_barrier_trace(rsp, "Offline", cpu,
2407 rsp->n_barrier_done);
2340 preempt_enable(); 2408 preempt_enable();
2341 while (cpu_is_offline(cpu) && ACCESS_ONCE(rdp->qlen)) 2409 while (cpu_is_offline(cpu) && ACCESS_ONCE(rdp->qlen))
2342 schedule_timeout_interruptible(1); 2410 schedule_timeout_interruptible(1);
2343 } else if (ACCESS_ONCE(rdp->qlen)) { 2411 } else if (ACCESS_ONCE(rdp->qlen)) {
2344 smp_call_function_single(cpu, rcu_barrier_func, 2412 _rcu_barrier_trace(rsp, "OnlineQ", cpu,
2345 (void *)call_rcu_func, 1); 2413 rsp->n_barrier_done);
2414 smp_call_function_single(cpu, rcu_barrier_func, rsp, 1);
2346 preempt_enable(); 2415 preempt_enable();
2347 } else { 2416 } else {
2417 _rcu_barrier_trace(rsp, "OnlineNQ", cpu,
2418 rsp->n_barrier_done);
2348 preempt_enable(); 2419 preempt_enable();
2349 } 2420 }
2350 } 2421 }
@@ -2361,24 +2432,32 @@ static void _rcu_barrier(struct rcu_state *rsp,
2361 rcu_adopt_orphan_cbs(rsp); 2432 rcu_adopt_orphan_cbs(rsp);
2362 rsp->rcu_barrier_in_progress = NULL; 2433 rsp->rcu_barrier_in_progress = NULL;
2363 raw_spin_unlock_irqrestore(&rsp->onofflock, flags); 2434 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
2364 atomic_inc(&rcu_barrier_cpu_count); 2435 atomic_inc(&rsp->barrier_cpu_count);
2365 smp_mb__after_atomic_inc(); /* Ensure atomic_inc() before callback. */ 2436 smp_mb__after_atomic_inc(); /* Ensure atomic_inc() before callback. */
2366 call_rcu_func(&rh, rcu_barrier_callback); 2437 rd.rsp = rsp;
2438 rsp->call(&rd.barrier_head, rcu_barrier_callback);
2367 2439
2368 /* 2440 /*
2369 * Now that we have an rcu_barrier_callback() callback on each 2441 * Now that we have an rcu_barrier_callback() callback on each
2370 * CPU, and thus each counted, remove the initial count. 2442 * CPU, and thus each counted, remove the initial count.
2371 */ 2443 */
2372 if (atomic_dec_and_test(&rcu_barrier_cpu_count)) 2444 if (atomic_dec_and_test(&rsp->barrier_cpu_count))
2373 complete(&rcu_barrier_completion); 2445 complete(&rsp->barrier_completion);
2446
2447 /* Increment ->n_barrier_done to prevent duplicate work. */
2448 smp_mb(); /* Keep increment after above mechanism. */
2449 ACCESS_ONCE(rsp->n_barrier_done)++;
2450 WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 0);
2451 _rcu_barrier_trace(rsp, "Inc2", -1, rsp->n_barrier_done);
2452 smp_mb(); /* Keep increment before caller's subsequent code. */
2374 2453
2375 /* Wait for all rcu_barrier_callback() callbacks to be invoked. */ 2454 /* Wait for all rcu_barrier_callback() callbacks to be invoked. */
2376 wait_for_completion(&rcu_barrier_completion); 2455 wait_for_completion(&rsp->barrier_completion);
2377 2456
2378 /* Other rcu_barrier() invocations can now safely proceed. */ 2457 /* Other rcu_barrier() invocations can now safely proceed. */
2379 mutex_unlock(&rcu_barrier_mutex); 2458 mutex_unlock(&rsp->barrier_mutex);
2380 2459
2381 destroy_rcu_head_on_stack(&rh); 2460 destroy_rcu_head_on_stack(&rd.barrier_head);
2382} 2461}
2383 2462
2384/** 2463/**
@@ -2386,7 +2465,7 @@ static void _rcu_barrier(struct rcu_state *rsp,
2386 */ 2465 */
2387void rcu_barrier_bh(void) 2466void rcu_barrier_bh(void)
2388{ 2467{
2389 _rcu_barrier(&rcu_bh_state, call_rcu_bh); 2468 _rcu_barrier(&rcu_bh_state);
2390} 2469}
2391EXPORT_SYMBOL_GPL(rcu_barrier_bh); 2470EXPORT_SYMBOL_GPL(rcu_barrier_bh);
2392 2471
@@ -2395,7 +2474,7 @@ EXPORT_SYMBOL_GPL(rcu_barrier_bh);
2395 */ 2474 */
2396void rcu_barrier_sched(void) 2475void rcu_barrier_sched(void)
2397{ 2476{
2398 _rcu_barrier(&rcu_sched_state, call_rcu_sched); 2477 _rcu_barrier(&rcu_sched_state);
2399} 2478}
2400EXPORT_SYMBOL_GPL(rcu_barrier_sched); 2479EXPORT_SYMBOL_GPL(rcu_barrier_sched);
2401 2480
@@ -2406,18 +2485,15 @@ static void __init
2406rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) 2485rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
2407{ 2486{
2408 unsigned long flags; 2487 unsigned long flags;
2409 int i;
2410 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 2488 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
2411 struct rcu_node *rnp = rcu_get_root(rsp); 2489 struct rcu_node *rnp = rcu_get_root(rsp);
2412 2490
2413 /* Set up local state, ensuring consistent view of global state. */ 2491 /* Set up local state, ensuring consistent view of global state. */
2414 raw_spin_lock_irqsave(&rnp->lock, flags); 2492 raw_spin_lock_irqsave(&rnp->lock, flags);
2415 rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); 2493 rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
2416 rdp->nxtlist = NULL; 2494 init_callback_list(rdp);
2417 for (i = 0; i < RCU_NEXT_SIZE; i++)
2418 rdp->nxttail[i] = &rdp->nxtlist;
2419 rdp->qlen_lazy = 0; 2495 rdp->qlen_lazy = 0;
2420 rdp->qlen = 0; 2496 ACCESS_ONCE(rdp->qlen) = 0;
2421 rdp->dynticks = &per_cpu(rcu_dynticks, cpu); 2497 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
2422 WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); 2498 WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE);
2423 WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); 2499 WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
@@ -2491,9 +2567,11 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
2491 2567
2492static void __cpuinit rcu_prepare_cpu(int cpu) 2568static void __cpuinit rcu_prepare_cpu(int cpu)
2493{ 2569{
2494 rcu_init_percpu_data(cpu, &rcu_sched_state, 0); 2570 struct rcu_state *rsp;
2495 rcu_init_percpu_data(cpu, &rcu_bh_state, 0); 2571
2496 rcu_preempt_init_percpu_data(cpu); 2572 for_each_rcu_flavor(rsp)
2573 rcu_init_percpu_data(cpu, rsp,
2574 strcmp(rsp->name, "rcu_preempt") == 0);
2497} 2575}
2498 2576
2499/* 2577/*
@@ -2505,6 +2583,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
2505 long cpu = (long)hcpu; 2583 long cpu = (long)hcpu;
2506 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); 2584 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
2507 struct rcu_node *rnp = rdp->mynode; 2585 struct rcu_node *rnp = rdp->mynode;
2586 struct rcu_state *rsp;
2508 2587
2509 trace_rcu_utilization("Start CPU hotplug"); 2588 trace_rcu_utilization("Start CPU hotplug");
2510 switch (action) { 2589 switch (action) {
@@ -2529,18 +2608,16 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
2529 * touch any data without introducing corruption. We send the 2608 * touch any data without introducing corruption. We send the
2530 * dying CPU's callbacks to an arbitrarily chosen online CPU. 2609 * dying CPU's callbacks to an arbitrarily chosen online CPU.
2531 */ 2610 */
2532 rcu_cleanup_dying_cpu(&rcu_bh_state); 2611 for_each_rcu_flavor(rsp)
2533 rcu_cleanup_dying_cpu(&rcu_sched_state); 2612 rcu_cleanup_dying_cpu(rsp);
2534 rcu_preempt_cleanup_dying_cpu();
2535 rcu_cleanup_after_idle(cpu); 2613 rcu_cleanup_after_idle(cpu);
2536 break; 2614 break;
2537 case CPU_DEAD: 2615 case CPU_DEAD:
2538 case CPU_DEAD_FROZEN: 2616 case CPU_DEAD_FROZEN:
2539 case CPU_UP_CANCELED: 2617 case CPU_UP_CANCELED:
2540 case CPU_UP_CANCELED_FROZEN: 2618 case CPU_UP_CANCELED_FROZEN:
2541 rcu_cleanup_dead_cpu(cpu, &rcu_bh_state); 2619 for_each_rcu_flavor(rsp)
2542 rcu_cleanup_dead_cpu(cpu, &rcu_sched_state); 2620 rcu_cleanup_dead_cpu(cpu, rsp);
2543 rcu_preempt_cleanup_dead_cpu(cpu);
2544 break; 2621 break;
2545 default: 2622 default:
2546 break; 2623 break;
@@ -2573,9 +2650,9 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
2573{ 2650{
2574 int i; 2651 int i;
2575 2652
2576 for (i = NUM_RCU_LVLS - 1; i > 0; i--) 2653 for (i = rcu_num_lvls - 1; i > 0; i--)
2577 rsp->levelspread[i] = CONFIG_RCU_FANOUT; 2654 rsp->levelspread[i] = CONFIG_RCU_FANOUT;
2578 rsp->levelspread[0] = CONFIG_RCU_FANOUT_LEAF; 2655 rsp->levelspread[0] = rcu_fanout_leaf;
2579} 2656}
2580#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ 2657#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
2581static void __init rcu_init_levelspread(struct rcu_state *rsp) 2658static void __init rcu_init_levelspread(struct rcu_state *rsp)
@@ -2585,7 +2662,7 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
2585 int i; 2662 int i;
2586 2663
2587 cprv = NR_CPUS; 2664 cprv = NR_CPUS;
2588 for (i = NUM_RCU_LVLS - 1; i >= 0; i--) { 2665 for (i = rcu_num_lvls - 1; i >= 0; i--) {
2589 ccur = rsp->levelcnt[i]; 2666 ccur = rsp->levelcnt[i];
2590 rsp->levelspread[i] = (cprv + ccur - 1) / ccur; 2667 rsp->levelspread[i] = (cprv + ccur - 1) / ccur;
2591 cprv = ccur; 2668 cprv = ccur;
@@ -2612,13 +2689,15 @@ static void __init rcu_init_one(struct rcu_state *rsp,
2612 2689
2613 /* Initialize the level-tracking arrays. */ 2690 /* Initialize the level-tracking arrays. */
2614 2691
2615 for (i = 1; i < NUM_RCU_LVLS; i++) 2692 for (i = 0; i < rcu_num_lvls; i++)
2693 rsp->levelcnt[i] = num_rcu_lvl[i];
2694 for (i = 1; i < rcu_num_lvls; i++)
2616 rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1]; 2695 rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1];
2617 rcu_init_levelspread(rsp); 2696 rcu_init_levelspread(rsp);
2618 2697
2619 /* Initialize the elements themselves, starting from the leaves. */ 2698 /* Initialize the elements themselves, starting from the leaves. */
2620 2699
2621 for (i = NUM_RCU_LVLS - 1; i >= 0; i--) { 2700 for (i = rcu_num_lvls - 1; i >= 0; i--) {
2622 cpustride *= rsp->levelspread[i]; 2701 cpustride *= rsp->levelspread[i];
2623 rnp = rsp->level[i]; 2702 rnp = rsp->level[i];
2624 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { 2703 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
@@ -2648,13 +2727,74 @@ static void __init rcu_init_one(struct rcu_state *rsp,
2648 } 2727 }
2649 2728
2650 rsp->rda = rda; 2729 rsp->rda = rda;
2651 rnp = rsp->level[NUM_RCU_LVLS - 1]; 2730 rnp = rsp->level[rcu_num_lvls - 1];
2652 for_each_possible_cpu(i) { 2731 for_each_possible_cpu(i) {
2653 while (i > rnp->grphi) 2732 while (i > rnp->grphi)
2654 rnp++; 2733 rnp++;
2655 per_cpu_ptr(rsp->rda, i)->mynode = rnp; 2734 per_cpu_ptr(rsp->rda, i)->mynode = rnp;
2656 rcu_boot_init_percpu_data(i, rsp); 2735 rcu_boot_init_percpu_data(i, rsp);
2657 } 2736 }
2737 list_add(&rsp->flavors, &rcu_struct_flavors);
2738}
2739
2740/*
2741 * Compute the rcu_node tree geometry from kernel parameters. This cannot
2742 * replace the definitions in rcutree.h because those are needed to size
2743 * the ->node array in the rcu_state structure.
2744 */
2745static void __init rcu_init_geometry(void)
2746{
2747 int i;
2748 int j;
2749 int n = nr_cpu_ids;
2750 int rcu_capacity[MAX_RCU_LVLS + 1];
2751
2752 /* If the compile-time values are accurate, just leave. */
2753 if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF)
2754 return;
2755
2756 /*
2757 * Compute number of nodes that can be handled an rcu_node tree
2758 * with the given number of levels. Setting rcu_capacity[0] makes
2759 * some of the arithmetic easier.
2760 */
2761 rcu_capacity[0] = 1;
2762 rcu_capacity[1] = rcu_fanout_leaf;
2763 for (i = 2; i <= MAX_RCU_LVLS; i++)
2764 rcu_capacity[i] = rcu_capacity[i - 1] * CONFIG_RCU_FANOUT;
2765
2766 /*
2767 * The boot-time rcu_fanout_leaf parameter is only permitted
2768 * to increase the leaf-level fanout, not decrease it. Of course,
2769 * the leaf-level fanout cannot exceed the number of bits in
2770 * the rcu_node masks. Finally, the tree must be able to accommodate
2771 * the configured number of CPUs. Complain and fall back to the
2772 * compile-time values if these limits are exceeded.
2773 */
2774 if (rcu_fanout_leaf < CONFIG_RCU_FANOUT_LEAF ||
2775 rcu_fanout_leaf > sizeof(unsigned long) * 8 ||
2776 n > rcu_capacity[MAX_RCU_LVLS]) {
2777 WARN_ON(1);
2778 return;
2779 }
2780
2781 /* Calculate the number of rcu_nodes at each level of the tree. */
2782 for (i = 1; i <= MAX_RCU_LVLS; i++)
2783 if (n <= rcu_capacity[i]) {
2784 for (j = 0; j <= i; j++)
2785 num_rcu_lvl[j] =
2786 DIV_ROUND_UP(n, rcu_capacity[i - j]);
2787 rcu_num_lvls = i;
2788 for (j = i + 1; j <= MAX_RCU_LVLS; j++)
2789 num_rcu_lvl[j] = 0;
2790 break;
2791 }
2792
2793 /* Calculate the total number of rcu_node structures. */
2794 rcu_num_nodes = 0;
2795 for (i = 0; i <= MAX_RCU_LVLS; i++)
2796 rcu_num_nodes += num_rcu_lvl[i];
2797 rcu_num_nodes -= n;
2658} 2798}
2659 2799
2660void __init rcu_init(void) 2800void __init rcu_init(void)
@@ -2662,6 +2802,7 @@ void __init rcu_init(void)
2662 int cpu; 2802 int cpu;
2663 2803
2664 rcu_bootup_announce(); 2804 rcu_bootup_announce();
2805 rcu_init_geometry();
2665 rcu_init_one(&rcu_sched_state, &rcu_sched_data); 2806 rcu_init_one(&rcu_sched_state, &rcu_sched_data);
2666 rcu_init_one(&rcu_bh_state, &rcu_bh_data); 2807 rcu_init_one(&rcu_bh_state, &rcu_bh_data);
2667 __rcu_init_preempt(); 2808 __rcu_init_preempt();
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index ea056495783e..4d29169f2124 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -42,28 +42,28 @@
42#define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT) 42#define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT)
43 43
44#if NR_CPUS <= RCU_FANOUT_1 44#if NR_CPUS <= RCU_FANOUT_1
45# define NUM_RCU_LVLS 1 45# define RCU_NUM_LVLS 1
46# define NUM_RCU_LVL_0 1 46# define NUM_RCU_LVL_0 1
47# define NUM_RCU_LVL_1 (NR_CPUS) 47# define NUM_RCU_LVL_1 (NR_CPUS)
48# define NUM_RCU_LVL_2 0 48# define NUM_RCU_LVL_2 0
49# define NUM_RCU_LVL_3 0 49# define NUM_RCU_LVL_3 0
50# define NUM_RCU_LVL_4 0 50# define NUM_RCU_LVL_4 0
51#elif NR_CPUS <= RCU_FANOUT_2 51#elif NR_CPUS <= RCU_FANOUT_2
52# define NUM_RCU_LVLS 2 52# define RCU_NUM_LVLS 2
53# define NUM_RCU_LVL_0 1 53# define NUM_RCU_LVL_0 1
54# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) 54# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
55# define NUM_RCU_LVL_2 (NR_CPUS) 55# define NUM_RCU_LVL_2 (NR_CPUS)
56# define NUM_RCU_LVL_3 0 56# define NUM_RCU_LVL_3 0
57# define NUM_RCU_LVL_4 0 57# define NUM_RCU_LVL_4 0
58#elif NR_CPUS <= RCU_FANOUT_3 58#elif NR_CPUS <= RCU_FANOUT_3
59# define NUM_RCU_LVLS 3 59# define RCU_NUM_LVLS 3
60# define NUM_RCU_LVL_0 1 60# define NUM_RCU_LVL_0 1
61# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) 61# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
62# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) 62# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
63# define NUM_RCU_LVL_3 (NR_CPUS) 63# define NUM_RCU_LVL_3 (NR_CPUS)
64# define NUM_RCU_LVL_4 0 64# define NUM_RCU_LVL_4 0
65#elif NR_CPUS <= RCU_FANOUT_4 65#elif NR_CPUS <= RCU_FANOUT_4
66# define NUM_RCU_LVLS 4 66# define RCU_NUM_LVLS 4
67# define NUM_RCU_LVL_0 1 67# define NUM_RCU_LVL_0 1
68# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3) 68# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3)
69# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) 69# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
@@ -76,6 +76,9 @@
76#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4) 76#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4)
77#define NUM_RCU_NODES (RCU_SUM - NR_CPUS) 77#define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
78 78
79extern int rcu_num_lvls;
80extern int rcu_num_nodes;
81
79/* 82/*
80 * Dynticks per-CPU state. 83 * Dynticks per-CPU state.
81 */ 84 */
@@ -97,6 +100,7 @@ struct rcu_dynticks {
97 /* # times non-lazy CBs posted to CPU. */ 100 /* # times non-lazy CBs posted to CPU. */
98 unsigned long nonlazy_posted_snap; 101 unsigned long nonlazy_posted_snap;
99 /* idle-period nonlazy_posted snapshot. */ 102 /* idle-period nonlazy_posted snapshot. */
103 int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */
100#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ 104#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
101}; 105};
102 106
@@ -206,7 +210,7 @@ struct rcu_node {
206 */ 210 */
207#define rcu_for_each_node_breadth_first(rsp, rnp) \ 211#define rcu_for_each_node_breadth_first(rsp, rnp) \
208 for ((rnp) = &(rsp)->node[0]; \ 212 for ((rnp) = &(rsp)->node[0]; \
209 (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++) 213 (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++)
210 214
211/* 215/*
212 * Do a breadth-first scan of the non-leaf rcu_node structures for the 216 * Do a breadth-first scan of the non-leaf rcu_node structures for the
@@ -215,7 +219,7 @@ struct rcu_node {
215 */ 219 */
216#define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \ 220#define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \
217 for ((rnp) = &(rsp)->node[0]; \ 221 for ((rnp) = &(rsp)->node[0]; \
218 (rnp) < (rsp)->level[NUM_RCU_LVLS - 1]; (rnp)++) 222 (rnp) < (rsp)->level[rcu_num_lvls - 1]; (rnp)++)
219 223
220/* 224/*
221 * Scan the leaves of the rcu_node hierarchy for the specified rcu_state 225 * Scan the leaves of the rcu_node hierarchy for the specified rcu_state
@@ -224,8 +228,8 @@ struct rcu_node {
224 * It is still a leaf node, even if it is also the root node. 228 * It is still a leaf node, even if it is also the root node.
225 */ 229 */
226#define rcu_for_each_leaf_node(rsp, rnp) \ 230#define rcu_for_each_leaf_node(rsp, rnp) \
227 for ((rnp) = (rsp)->level[NUM_RCU_LVLS - 1]; \ 231 for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \
228 (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++) 232 (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++)
229 233
230/* Index values for nxttail array in struct rcu_data. */ 234/* Index values for nxttail array in struct rcu_data. */
231#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */ 235#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */
@@ -311,6 +315,9 @@ struct rcu_data {
311 unsigned long n_rp_need_fqs; 315 unsigned long n_rp_need_fqs;
312 unsigned long n_rp_need_nothing; 316 unsigned long n_rp_need_nothing;
313 317
318 /* 6) _rcu_barrier() callback. */
319 struct rcu_head barrier_head;
320
314 int cpu; 321 int cpu;
315 struct rcu_state *rsp; 322 struct rcu_state *rsp;
316}; 323};
@@ -357,10 +364,12 @@ do { \
357 */ 364 */
358struct rcu_state { 365struct rcu_state {
359 struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */ 366 struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */
360 struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */ 367 struct rcu_node *level[RCU_NUM_LVLS]; /* Hierarchy levels. */
361 u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ 368 u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */
362 u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */ 369 u8 levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */
363 struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ 370 struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */
371 void (*call)(struct rcu_head *head, /* call_rcu() flavor. */
372 void (*func)(struct rcu_head *head));
364 373
365 /* The following fields are guarded by the root rcu_node's lock. */ 374 /* The following fields are guarded by the root rcu_node's lock. */
366 375
@@ -392,6 +401,11 @@ struct rcu_state {
392 struct task_struct *rcu_barrier_in_progress; 401 struct task_struct *rcu_barrier_in_progress;
393 /* Task doing rcu_barrier(), */ 402 /* Task doing rcu_barrier(), */
394 /* or NULL if no barrier. */ 403 /* or NULL if no barrier. */
404 struct mutex barrier_mutex; /* Guards barrier fields. */
405 atomic_t barrier_cpu_count; /* # CPUs waiting on. */
406 struct completion barrier_completion; /* Wake at barrier end. */
407 unsigned long n_barrier_done; /* ++ at start and end of */
408 /* _rcu_barrier(). */
395 raw_spinlock_t fqslock; /* Only one task forcing */ 409 raw_spinlock_t fqslock; /* Only one task forcing */
396 /* quiescent states. */ 410 /* quiescent states. */
397 unsigned long jiffies_force_qs; /* Time at which to invoke */ 411 unsigned long jiffies_force_qs; /* Time at which to invoke */
@@ -409,8 +423,13 @@ struct rcu_state {
409 unsigned long gp_max; /* Maximum GP duration in */ 423 unsigned long gp_max; /* Maximum GP duration in */
410 /* jiffies. */ 424 /* jiffies. */
411 char *name; /* Name of structure. */ 425 char *name; /* Name of structure. */
426 struct list_head flavors; /* List of RCU flavors. */
412}; 427};
413 428
429extern struct list_head rcu_struct_flavors;
430#define for_each_rcu_flavor(rsp) \
431 list_for_each_entry((rsp), &rcu_struct_flavors, flavors)
432
414/* Return values for rcu_preempt_offline_tasks(). */ 433/* Return values for rcu_preempt_offline_tasks(). */
415 434
416#define RCU_OFL_TASKS_NORM_GP 0x1 /* Tasks blocking normal */ 435#define RCU_OFL_TASKS_NORM_GP 0x1 /* Tasks blocking normal */
@@ -444,6 +463,7 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work);
444/* Forward declarations for rcutree_plugin.h */ 463/* Forward declarations for rcutree_plugin.h */
445static void rcu_bootup_announce(void); 464static void rcu_bootup_announce(void);
446long rcu_batches_completed(void); 465long rcu_batches_completed(void);
466static void rcu_preempt_note_context_switch(int cpu);
447static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); 467static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
448#ifdef CONFIG_HOTPLUG_CPU 468#ifdef CONFIG_HOTPLUG_CPU
449static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, 469static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
@@ -452,25 +472,18 @@ static void rcu_stop_cpu_kthread(int cpu);
452#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 472#endif /* #ifdef CONFIG_HOTPLUG_CPU */
453static void rcu_print_detail_task_stall(struct rcu_state *rsp); 473static void rcu_print_detail_task_stall(struct rcu_state *rsp);
454static int rcu_print_task_stall(struct rcu_node *rnp); 474static int rcu_print_task_stall(struct rcu_node *rnp);
455static void rcu_preempt_stall_reset(void);
456static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); 475static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
457#ifdef CONFIG_HOTPLUG_CPU 476#ifdef CONFIG_HOTPLUG_CPU
458static int rcu_preempt_offline_tasks(struct rcu_state *rsp, 477static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
459 struct rcu_node *rnp, 478 struct rcu_node *rnp,
460 struct rcu_data *rdp); 479 struct rcu_data *rdp);
461#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 480#endif /* #ifdef CONFIG_HOTPLUG_CPU */
462static void rcu_preempt_cleanup_dead_cpu(int cpu);
463static void rcu_preempt_check_callbacks(int cpu); 481static void rcu_preempt_check_callbacks(int cpu);
464static void rcu_preempt_process_callbacks(void);
465void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); 482void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
466#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) 483#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU)
467static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, 484static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
468 bool wake); 485 bool wake);
469#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */ 486#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */
470static int rcu_preempt_pending(int cpu);
471static int rcu_preempt_cpu_has_callbacks(int cpu);
472static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
473static void rcu_preempt_cleanup_dying_cpu(void);
474static void __init __rcu_init_preempt(void); 487static void __init __rcu_init_preempt(void);
475static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); 488static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
476static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); 489static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 5271a020887e..7f3244c0df01 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -68,17 +68,21 @@ static void __init rcu_bootup_announce_oddness(void)
68 printk(KERN_INFO "\tAdditional per-CPU info printed with stalls.\n"); 68 printk(KERN_INFO "\tAdditional per-CPU info printed with stalls.\n");
69#endif 69#endif
70#if NUM_RCU_LVL_4 != 0 70#if NUM_RCU_LVL_4 != 0
71 printk(KERN_INFO "\tExperimental four-level hierarchy is enabled.\n"); 71 printk(KERN_INFO "\tFour-level hierarchy is enabled.\n");
72#endif 72#endif
73 if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF)
74 printk(KERN_INFO "\tExperimental boot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf);
75 if (nr_cpu_ids != NR_CPUS)
76 printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
73} 77}
74 78
75#ifdef CONFIG_TREE_PREEMPT_RCU 79#ifdef CONFIG_TREE_PREEMPT_RCU
76 80
77struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt); 81struct rcu_state rcu_preempt_state =
82 RCU_STATE_INITIALIZER(rcu_preempt, call_rcu);
78DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); 83DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
79static struct rcu_state *rcu_state = &rcu_preempt_state; 84static struct rcu_state *rcu_state = &rcu_preempt_state;
80 85
81static void rcu_read_unlock_special(struct task_struct *t);
82static int rcu_preempted_readers_exp(struct rcu_node *rnp); 86static int rcu_preempted_readers_exp(struct rcu_node *rnp);
83 87
84/* 88/*
@@ -153,7 +157,7 @@ static void rcu_preempt_qs(int cpu)
153 * 157 *
154 * Caller must disable preemption. 158 * Caller must disable preemption.
155 */ 159 */
156void rcu_preempt_note_context_switch(void) 160static void rcu_preempt_note_context_switch(int cpu)
157{ 161{
158 struct task_struct *t = current; 162 struct task_struct *t = current;
159 unsigned long flags; 163 unsigned long flags;
@@ -164,7 +168,7 @@ void rcu_preempt_note_context_switch(void)
164 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { 168 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
165 169
166 /* Possibly blocking in an RCU read-side critical section. */ 170 /* Possibly blocking in an RCU read-side critical section. */
167 rdp = __this_cpu_ptr(rcu_preempt_state.rda); 171 rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu);
168 rnp = rdp->mynode; 172 rnp = rdp->mynode;
169 raw_spin_lock_irqsave(&rnp->lock, flags); 173 raw_spin_lock_irqsave(&rnp->lock, flags);
170 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; 174 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
@@ -228,23 +232,11 @@ void rcu_preempt_note_context_switch(void)
228 * means that we continue to block the current grace period. 232 * means that we continue to block the current grace period.
229 */ 233 */
230 local_irq_save(flags); 234 local_irq_save(flags);
231 rcu_preempt_qs(smp_processor_id()); 235 rcu_preempt_qs(cpu);
232 local_irq_restore(flags); 236 local_irq_restore(flags);
233} 237}
234 238
235/* 239/*
236 * Tree-preemptible RCU implementation for rcu_read_lock().
237 * Just increment ->rcu_read_lock_nesting, shared state will be updated
238 * if we block.
239 */
240void __rcu_read_lock(void)
241{
242 current->rcu_read_lock_nesting++;
243 barrier(); /* needed if we ever invoke rcu_read_lock in rcutree.c */
244}
245EXPORT_SYMBOL_GPL(__rcu_read_lock);
246
247/*
248 * Check for preempted RCU readers blocking the current grace period 240 * Check for preempted RCU readers blocking the current grace period
249 * for the specified rcu_node structure. If the caller needs a reliable 241 * for the specified rcu_node structure. If the caller needs a reliable
250 * answer, it must hold the rcu_node's ->lock. 242 * answer, it must hold the rcu_node's ->lock.
@@ -310,7 +302,7 @@ static struct list_head *rcu_next_node_entry(struct task_struct *t,
310 * notify RCU core processing or task having blocked during the RCU 302 * notify RCU core processing or task having blocked during the RCU
311 * read-side critical section. 303 * read-side critical section.
312 */ 304 */
313static noinline void rcu_read_unlock_special(struct task_struct *t) 305void rcu_read_unlock_special(struct task_struct *t)
314{ 306{
315 int empty; 307 int empty;
316 int empty_exp; 308 int empty_exp;
@@ -398,8 +390,9 @@ static noinline void rcu_read_unlock_special(struct task_struct *t)
398 rnp->grphi, 390 rnp->grphi,
399 !!rnp->gp_tasks); 391 !!rnp->gp_tasks);
400 rcu_report_unblock_qs_rnp(rnp, flags); 392 rcu_report_unblock_qs_rnp(rnp, flags);
401 } else 393 } else {
402 raw_spin_unlock_irqrestore(&rnp->lock, flags); 394 raw_spin_unlock_irqrestore(&rnp->lock, flags);
395 }
403 396
404#ifdef CONFIG_RCU_BOOST 397#ifdef CONFIG_RCU_BOOST
405 /* Unboost if we were boosted. */ 398 /* Unboost if we were boosted. */
@@ -418,38 +411,6 @@ static noinline void rcu_read_unlock_special(struct task_struct *t)
418 } 411 }
419} 412}
420 413
421/*
422 * Tree-preemptible RCU implementation for rcu_read_unlock().
423 * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost
424 * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
425 * invoke rcu_read_unlock_special() to clean up after a context switch
426 * in an RCU read-side critical section and other special cases.
427 */
428void __rcu_read_unlock(void)
429{
430 struct task_struct *t = current;
431
432 if (t->rcu_read_lock_nesting != 1)
433 --t->rcu_read_lock_nesting;
434 else {
435 barrier(); /* critical section before exit code. */
436 t->rcu_read_lock_nesting = INT_MIN;
437 barrier(); /* assign before ->rcu_read_unlock_special load */
438 if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
439 rcu_read_unlock_special(t);
440 barrier(); /* ->rcu_read_unlock_special load before assign */
441 t->rcu_read_lock_nesting = 0;
442 }
443#ifdef CONFIG_PROVE_LOCKING
444 {
445 int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting);
446
447 WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2);
448 }
449#endif /* #ifdef CONFIG_PROVE_LOCKING */
450}
451EXPORT_SYMBOL_GPL(__rcu_read_unlock);
452
453#ifdef CONFIG_RCU_CPU_STALL_VERBOSE 414#ifdef CONFIG_RCU_CPU_STALL_VERBOSE
454 415
455/* 416/*
@@ -540,16 +501,6 @@ static int rcu_print_task_stall(struct rcu_node *rnp)
540} 501}
541 502
542/* 503/*
543 * Suppress preemptible RCU's CPU stall warnings by pushing the
544 * time of the next stall-warning message comfortably far into the
545 * future.
546 */
547static void rcu_preempt_stall_reset(void)
548{
549 rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2;
550}
551
552/*
553 * Check that the list of blocked tasks for the newly completed grace 504 * Check that the list of blocked tasks for the newly completed grace
554 * period is in fact empty. It is a serious bug to complete a grace 505 * period is in fact empty. It is a serious bug to complete a grace
555 * period that still has RCU readers blocked! This function must be 506 * period that still has RCU readers blocked! This function must be
@@ -650,14 +601,6 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
650#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 601#endif /* #ifdef CONFIG_HOTPLUG_CPU */
651 602
652/* 603/*
653 * Do CPU-offline processing for preemptible RCU.
654 */
655static void rcu_preempt_cleanup_dead_cpu(int cpu)
656{
657 rcu_cleanup_dead_cpu(cpu, &rcu_preempt_state);
658}
659
660/*
661 * Check for a quiescent state from the current CPU. When a task blocks, 604 * Check for a quiescent state from the current CPU. When a task blocks,
662 * the task is recorded in the corresponding CPU's rcu_node structure, 605 * the task is recorded in the corresponding CPU's rcu_node structure,
663 * which is checked elsewhere. 606 * which is checked elsewhere.
@@ -677,15 +620,6 @@ static void rcu_preempt_check_callbacks(int cpu)
677 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; 620 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
678} 621}
679 622
680/*
681 * Process callbacks for preemptible RCU.
682 */
683static void rcu_preempt_process_callbacks(void)
684{
685 __rcu_process_callbacks(&rcu_preempt_state,
686 &__get_cpu_var(rcu_preempt_data));
687}
688
689#ifdef CONFIG_RCU_BOOST 623#ifdef CONFIG_RCU_BOOST
690 624
691static void rcu_preempt_do_callbacks(void) 625static void rcu_preempt_do_callbacks(void)
@@ -824,9 +758,9 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
824 int must_wait = 0; 758 int must_wait = 0;
825 759
826 raw_spin_lock_irqsave(&rnp->lock, flags); 760 raw_spin_lock_irqsave(&rnp->lock, flags);
827 if (list_empty(&rnp->blkd_tasks)) 761 if (list_empty(&rnp->blkd_tasks)) {
828 raw_spin_unlock_irqrestore(&rnp->lock, flags); 762 raw_spin_unlock_irqrestore(&rnp->lock, flags);
829 else { 763 } else {
830 rnp->exp_tasks = rnp->blkd_tasks.next; 764 rnp->exp_tasks = rnp->blkd_tasks.next;
831 rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ 765 rcu_initiate_boost(rnp, flags); /* releases rnp->lock */
832 must_wait = 1; 766 must_wait = 1;
@@ -870,9 +804,9 @@ void synchronize_rcu_expedited(void)
870 * expedited grace period for us, just leave. 804 * expedited grace period for us, just leave.
871 */ 805 */
872 while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) { 806 while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) {
873 if (trycount++ < 10) 807 if (trycount++ < 10) {
874 udelay(trycount * num_online_cpus()); 808 udelay(trycount * num_online_cpus());
875 else { 809 } else {
876 synchronize_rcu(); 810 synchronize_rcu();
877 return; 811 return;
878 } 812 }
@@ -917,51 +851,16 @@ mb_ret:
917} 851}
918EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); 852EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
919 853
920/*
921 * Check to see if there is any immediate preemptible-RCU-related work
922 * to be done.
923 */
924static int rcu_preempt_pending(int cpu)
925{
926 return __rcu_pending(&rcu_preempt_state,
927 &per_cpu(rcu_preempt_data, cpu));
928}
929
930/*
931 * Does preemptible RCU have callbacks on this CPU?
932 */
933static int rcu_preempt_cpu_has_callbacks(int cpu)
934{
935 return !!per_cpu(rcu_preempt_data, cpu).nxtlist;
936}
937
938/** 854/**
939 * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. 855 * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete.
940 */ 856 */
941void rcu_barrier(void) 857void rcu_barrier(void)
942{ 858{
943 _rcu_barrier(&rcu_preempt_state, call_rcu); 859 _rcu_barrier(&rcu_preempt_state);
944} 860}
945EXPORT_SYMBOL_GPL(rcu_barrier); 861EXPORT_SYMBOL_GPL(rcu_barrier);
946 862
947/* 863/*
948 * Initialize preemptible RCU's per-CPU data.
949 */
950static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
951{
952 rcu_init_percpu_data(cpu, &rcu_preempt_state, 1);
953}
954
955/*
956 * Move preemptible RCU's callbacks from dying CPU to other online CPU
957 * and record a quiescent state.
958 */
959static void rcu_preempt_cleanup_dying_cpu(void)
960{
961 rcu_cleanup_dying_cpu(&rcu_preempt_state);
962}
963
964/*
965 * Initialize preemptible RCU's state structures. 864 * Initialize preemptible RCU's state structures.
966 */ 865 */
967static void __init __rcu_init_preempt(void) 866static void __init __rcu_init_preempt(void)
@@ -1002,6 +901,14 @@ void rcu_force_quiescent_state(void)
1002EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); 901EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
1003 902
1004/* 903/*
904 * Because preemptible RCU does not exist, we never have to check for
905 * CPUs being in quiescent states.
906 */
907static void rcu_preempt_note_context_switch(int cpu)
908{
909}
910
911/*
1005 * Because preemptible RCU does not exist, there are never any preempted 912 * Because preemptible RCU does not exist, there are never any preempted
1006 * RCU readers. 913 * RCU readers.
1007 */ 914 */
@@ -1038,14 +945,6 @@ static int rcu_print_task_stall(struct rcu_node *rnp)
1038} 945}
1039 946
1040/* 947/*
1041 * Because preemptible RCU does not exist, there is no need to suppress
1042 * its CPU stall warnings.
1043 */
1044static void rcu_preempt_stall_reset(void)
1045{
1046}
1047
1048/*
1049 * Because there is no preemptible RCU, there can be no readers blocked, 948 * Because there is no preemptible RCU, there can be no readers blocked,
1050 * so there is no need to check for blocked tasks. So check only for 949 * so there is no need to check for blocked tasks. So check only for
1051 * bogus qsmask values. 950 * bogus qsmask values.
@@ -1073,14 +972,6 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
1073#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 972#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1074 973
1075/* 974/*
1076 * Because preemptible RCU does not exist, it never needs CPU-offline
1077 * processing.
1078 */
1079static void rcu_preempt_cleanup_dead_cpu(int cpu)
1080{
1081}
1082
1083/*
1084 * Because preemptible RCU does not exist, it never has any callbacks 975 * Because preemptible RCU does not exist, it never has any callbacks
1085 * to check. 976 * to check.
1086 */ 977 */
@@ -1089,14 +980,6 @@ static void rcu_preempt_check_callbacks(int cpu)
1089} 980}
1090 981
1091/* 982/*
1092 * Because preemptible RCU does not exist, it never has any callbacks
1093 * to process.
1094 */
1095static void rcu_preempt_process_callbacks(void)
1096{
1097}
1098
1099/*
1100 * Queue an RCU callback for lazy invocation after a grace period. 983 * Queue an RCU callback for lazy invocation after a grace period.
1101 * This will likely be later named something like "call_rcu_lazy()", 984 * This will likely be later named something like "call_rcu_lazy()",
1102 * but this change will require some way of tagging the lazy RCU 985 * but this change will require some way of tagging the lazy RCU
@@ -1137,22 +1020,6 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
1137#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 1020#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1138 1021
1139/* 1022/*
1140 * Because preemptible RCU does not exist, it never has any work to do.
1141 */
1142static int rcu_preempt_pending(int cpu)
1143{
1144 return 0;
1145}
1146
1147/*
1148 * Because preemptible RCU does not exist, it never has callbacks
1149 */
1150static int rcu_preempt_cpu_has_callbacks(int cpu)
1151{
1152 return 0;
1153}
1154
1155/*
1156 * Because preemptible RCU does not exist, rcu_barrier() is just 1023 * Because preemptible RCU does not exist, rcu_barrier() is just
1157 * another name for rcu_barrier_sched(). 1024 * another name for rcu_barrier_sched().
1158 */ 1025 */
@@ -1163,21 +1030,6 @@ void rcu_barrier(void)
1163EXPORT_SYMBOL_GPL(rcu_barrier); 1030EXPORT_SYMBOL_GPL(rcu_barrier);
1164 1031
1165/* 1032/*
1166 * Because preemptible RCU does not exist, there is no per-CPU
1167 * data to initialize.
1168 */
1169static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
1170{
1171}
1172
1173/*
1174 * Because there is no preemptible RCU, there is no cleanup to do.
1175 */
1176static void rcu_preempt_cleanup_dying_cpu(void)
1177{
1178}
1179
1180/*
1181 * Because preemptible RCU does not exist, it need not be initialized. 1033 * Because preemptible RCU does not exist, it need not be initialized.
1182 */ 1034 */
1183static void __init __rcu_init_preempt(void) 1035static void __init __rcu_init_preempt(void)
@@ -1960,9 +1812,11 @@ static void rcu_idle_count_callbacks_posted(void)
1960 */ 1812 */
1961#define RCU_IDLE_FLUSHES 5 /* Number of dyntick-idle tries. */ 1813#define RCU_IDLE_FLUSHES 5 /* Number of dyntick-idle tries. */
1962#define RCU_IDLE_OPT_FLUSHES 3 /* Optional dyntick-idle tries. */ 1814#define RCU_IDLE_OPT_FLUSHES 3 /* Optional dyntick-idle tries. */
1963#define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */ 1815#define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */
1964#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ 1816#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */
1965 1817
1818extern int tick_nohz_enabled;
1819
1966/* 1820/*
1967 * Does the specified flavor of RCU have non-lazy callbacks pending on 1821 * Does the specified flavor of RCU have non-lazy callbacks pending on
1968 * the specified CPU? Both RCU flavor and CPU are specified by the 1822 * the specified CPU? Both RCU flavor and CPU are specified by the
@@ -2039,10 +1893,13 @@ int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
2039 return 1; 1893 return 1;
2040 } 1894 }
2041 /* Set up for the possibility that RCU will post a timer. */ 1895 /* Set up for the possibility that RCU will post a timer. */
2042 if (rcu_cpu_has_nonlazy_callbacks(cpu)) 1896 if (rcu_cpu_has_nonlazy_callbacks(cpu)) {
2043 *delta_jiffies = RCU_IDLE_GP_DELAY; 1897 *delta_jiffies = round_up(RCU_IDLE_GP_DELAY + jiffies,
2044 else 1898 RCU_IDLE_GP_DELAY) - jiffies;
2045 *delta_jiffies = RCU_IDLE_LAZY_GP_DELAY; 1899 } else {
1900 *delta_jiffies = jiffies + RCU_IDLE_LAZY_GP_DELAY;
1901 *delta_jiffies = round_jiffies(*delta_jiffies) - jiffies;
1902 }
2046 return 0; 1903 return 0;
2047} 1904}
2048 1905
@@ -2101,6 +1958,7 @@ static void rcu_cleanup_after_idle(int cpu)
2101 1958
2102 del_timer(&rdtp->idle_gp_timer); 1959 del_timer(&rdtp->idle_gp_timer);
2103 trace_rcu_prep_idle("Cleanup after idle"); 1960 trace_rcu_prep_idle("Cleanup after idle");
1961 rdtp->tick_nohz_enabled_snap = ACCESS_ONCE(tick_nohz_enabled);
2104} 1962}
2105 1963
2106/* 1964/*
@@ -2126,6 +1984,18 @@ static void rcu_prepare_for_idle(int cpu)
2126{ 1984{
2127 struct timer_list *tp; 1985 struct timer_list *tp;
2128 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); 1986 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
1987 int tne;
1988
1989 /* Handle nohz enablement switches conservatively. */
1990 tne = ACCESS_ONCE(tick_nohz_enabled);
1991 if (tne != rdtp->tick_nohz_enabled_snap) {
1992 if (rcu_cpu_has_callbacks(cpu))
1993 invoke_rcu_core(); /* force nohz to see update. */
1994 rdtp->tick_nohz_enabled_snap = tne;
1995 return;
1996 }
1997 if (!tne)
1998 return;
2129 1999
2130 /* 2000 /*
2131 * If this is an idle re-entry, for example, due to use of 2001 * If this is an idle re-entry, for example, due to use of
@@ -2179,10 +2049,11 @@ static void rcu_prepare_for_idle(int cpu)
2179 if (rcu_cpu_has_nonlazy_callbacks(cpu)) { 2049 if (rcu_cpu_has_nonlazy_callbacks(cpu)) {
2180 trace_rcu_prep_idle("Dyntick with callbacks"); 2050 trace_rcu_prep_idle("Dyntick with callbacks");
2181 rdtp->idle_gp_timer_expires = 2051 rdtp->idle_gp_timer_expires =
2182 jiffies + RCU_IDLE_GP_DELAY; 2052 round_up(jiffies + RCU_IDLE_GP_DELAY,
2053 RCU_IDLE_GP_DELAY);
2183 } else { 2054 } else {
2184 rdtp->idle_gp_timer_expires = 2055 rdtp->idle_gp_timer_expires =
2185 jiffies + RCU_IDLE_LAZY_GP_DELAY; 2056 round_jiffies(jiffies + RCU_IDLE_LAZY_GP_DELAY);
2186 trace_rcu_prep_idle("Dyntick with lazy callbacks"); 2057 trace_rcu_prep_idle("Dyntick with lazy callbacks");
2187 } 2058 }
2188 tp = &rdtp->idle_gp_timer; 2059 tp = &rdtp->idle_gp_timer;
@@ -2223,8 +2094,9 @@ static void rcu_prepare_for_idle(int cpu)
2223 if (rcu_cpu_has_callbacks(cpu)) { 2094 if (rcu_cpu_has_callbacks(cpu)) {
2224 trace_rcu_prep_idle("More callbacks"); 2095 trace_rcu_prep_idle("More callbacks");
2225 invoke_rcu_core(); 2096 invoke_rcu_core();
2226 } else 2097 } else {
2227 trace_rcu_prep_idle("Callbacks drained"); 2098 trace_rcu_prep_idle("Callbacks drained");
2099 }
2228} 2100}
2229 2101
2230/* 2102/*
@@ -2261,6 +2133,7 @@ static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
2261 2133
2262static void print_cpu_stall_fast_no_hz(char *cp, int cpu) 2134static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
2263{ 2135{
2136 *cp = '\0';
2264} 2137}
2265 2138
2266#endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */ 2139#endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index d4bc16ddd1d4..abffb486e94e 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -46,6 +46,31 @@
46#define RCU_TREE_NONCORE 46#define RCU_TREE_NONCORE
47#include "rcutree.h" 47#include "rcutree.h"
48 48
49static int show_rcubarrier(struct seq_file *m, void *unused)
50{
51 struct rcu_state *rsp;
52
53 for_each_rcu_flavor(rsp)
54 seq_printf(m, "%s: %c bcc: %d nbd: %lu\n",
55 rsp->name, rsp->rcu_barrier_in_progress ? 'B' : '.',
56 atomic_read(&rsp->barrier_cpu_count),
57 rsp->n_barrier_done);
58 return 0;
59}
60
61static int rcubarrier_open(struct inode *inode, struct file *file)
62{
63 return single_open(file, show_rcubarrier, NULL);
64}
65
66static const struct file_operations rcubarrier_fops = {
67 .owner = THIS_MODULE,
68 .open = rcubarrier_open,
69 .read = seq_read,
70 .llseek = seq_lseek,
71 .release = single_release,
72};
73
49#ifdef CONFIG_RCU_BOOST 74#ifdef CONFIG_RCU_BOOST
50 75
51static char convert_kthread_status(unsigned int kthread_status) 76static char convert_kthread_status(unsigned int kthread_status)
@@ -95,24 +120,16 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
95 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); 120 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
96} 121}
97 122
98#define PRINT_RCU_DATA(name, func, m) \
99 do { \
100 int _p_r_d_i; \
101 \
102 for_each_possible_cpu(_p_r_d_i) \
103 func(m, &per_cpu(name, _p_r_d_i)); \
104 } while (0)
105
106static int show_rcudata(struct seq_file *m, void *unused) 123static int show_rcudata(struct seq_file *m, void *unused)
107{ 124{
108#ifdef CONFIG_TREE_PREEMPT_RCU 125 int cpu;
109 seq_puts(m, "rcu_preempt:\n"); 126 struct rcu_state *rsp;
110 PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data, m); 127
111#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 128 for_each_rcu_flavor(rsp) {
112 seq_puts(m, "rcu_sched:\n"); 129 seq_printf(m, "%s:\n", rsp->name);
113 PRINT_RCU_DATA(rcu_sched_data, print_one_rcu_data, m); 130 for_each_possible_cpu(cpu)
114 seq_puts(m, "rcu_bh:\n"); 131 print_one_rcu_data(m, per_cpu_ptr(rsp->rda, cpu));
115 PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data, m); 132 }
116 return 0; 133 return 0;
117} 134}
118 135
@@ -166,6 +183,9 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
166 183
167static int show_rcudata_csv(struct seq_file *m, void *unused) 184static int show_rcudata_csv(struct seq_file *m, void *unused)
168{ 185{
186 int cpu;
187 struct rcu_state *rsp;
188
169 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\","); 189 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\",");
170 seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); 190 seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\",");
171 seq_puts(m, "\"of\",\"qll\",\"ql\",\"qs\""); 191 seq_puts(m, "\"of\",\"qll\",\"ql\",\"qs\"");
@@ -173,14 +193,11 @@ static int show_rcudata_csv(struct seq_file *m, void *unused)
173 seq_puts(m, "\"kt\",\"ktl\""); 193 seq_puts(m, "\"kt\",\"ktl\"");
174#endif /* #ifdef CONFIG_RCU_BOOST */ 194#endif /* #ifdef CONFIG_RCU_BOOST */
175 seq_puts(m, ",\"b\",\"ci\",\"co\",\"ca\"\n"); 195 seq_puts(m, ",\"b\",\"ci\",\"co\",\"ca\"\n");
176#ifdef CONFIG_TREE_PREEMPT_RCU 196 for_each_rcu_flavor(rsp) {
177 seq_puts(m, "\"rcu_preempt:\"\n"); 197 seq_printf(m, "\"%s:\"\n", rsp->name);
178 PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m); 198 for_each_possible_cpu(cpu)
179#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 199 print_one_rcu_data_csv(m, per_cpu_ptr(rsp->rda, cpu));
180 seq_puts(m, "\"rcu_sched:\"\n"); 200 }
181 PRINT_RCU_DATA(rcu_sched_data, print_one_rcu_data_csv, m);
182 seq_puts(m, "\"rcu_bh:\"\n");
183 PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data_csv, m);
184 return 0; 201 return 0;
185} 202}
186 203
@@ -201,8 +218,7 @@ static const struct file_operations rcudata_csv_fops = {
201 218
202static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp) 219static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp)
203{ 220{
204 seq_printf(m, "%d:%d tasks=%c%c%c%c kt=%c ntb=%lu neb=%lu nnb=%lu " 221 seq_printf(m, "%d:%d tasks=%c%c%c%c kt=%c ntb=%lu neb=%lu nnb=%lu ",
205 "j=%04x bt=%04x\n",
206 rnp->grplo, rnp->grphi, 222 rnp->grplo, rnp->grphi,
207 "T."[list_empty(&rnp->blkd_tasks)], 223 "T."[list_empty(&rnp->blkd_tasks)],
208 "N."[!rnp->gp_tasks], 224 "N."[!rnp->gp_tasks],
@@ -210,11 +226,11 @@ static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp)
210 "B."[!rnp->boost_tasks], 226 "B."[!rnp->boost_tasks],
211 convert_kthread_status(rnp->boost_kthread_status), 227 convert_kthread_status(rnp->boost_kthread_status),
212 rnp->n_tasks_boosted, rnp->n_exp_boosts, 228 rnp->n_tasks_boosted, rnp->n_exp_boosts,
213 rnp->n_normal_boosts, 229 rnp->n_normal_boosts);
230 seq_printf(m, "j=%04x bt=%04x\n",
214 (int)(jiffies & 0xffff), 231 (int)(jiffies & 0xffff),
215 (int)(rnp->boost_time & 0xffff)); 232 (int)(rnp->boost_time & 0xffff));
216 seq_printf(m, "%s: nt=%lu egt=%lu bt=%lu nb=%lu ny=%lu nos=%lu\n", 233 seq_printf(m, " balk: nt=%lu egt=%lu bt=%lu nb=%lu ny=%lu nos=%lu\n",
217 " balk",
218 rnp->n_balk_blkd_tasks, 234 rnp->n_balk_blkd_tasks,
219 rnp->n_balk_exp_gp_tasks, 235 rnp->n_balk_exp_gp_tasks,
220 rnp->n_balk_boost_tasks, 236 rnp->n_balk_boost_tasks,
@@ -270,15 +286,15 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
270 struct rcu_node *rnp; 286 struct rcu_node *rnp;
271 287
272 gpnum = rsp->gpnum; 288 gpnum = rsp->gpnum;
273 seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " 289 seq_printf(m, "%s: c=%lu g=%lu s=%d jfq=%ld j=%x ",
274 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", 290 rsp->name, rsp->completed, gpnum, rsp->fqs_state,
275 rsp->completed, gpnum, rsp->fqs_state,
276 (long)(rsp->jiffies_force_qs - jiffies), 291 (long)(rsp->jiffies_force_qs - jiffies),
277 (int)(jiffies & 0xffff), 292 (int)(jiffies & 0xffff));
293 seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n",
278 rsp->n_force_qs, rsp->n_force_qs_ngp, 294 rsp->n_force_qs, rsp->n_force_qs_ngp,
279 rsp->n_force_qs - rsp->n_force_qs_ngp, 295 rsp->n_force_qs - rsp->n_force_qs_ngp,
280 rsp->n_force_qs_lh, rsp->qlen_lazy, rsp->qlen); 296 rsp->n_force_qs_lh, rsp->qlen_lazy, rsp->qlen);
281 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) { 297 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) {
282 if (rnp->level != level) { 298 if (rnp->level != level) {
283 seq_puts(m, "\n"); 299 seq_puts(m, "\n");
284 level = rnp->level; 300 level = rnp->level;
@@ -295,14 +311,10 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
295 311
296static int show_rcuhier(struct seq_file *m, void *unused) 312static int show_rcuhier(struct seq_file *m, void *unused)
297{ 313{
298#ifdef CONFIG_TREE_PREEMPT_RCU 314 struct rcu_state *rsp;
299 seq_puts(m, "rcu_preempt:\n"); 315
300 print_one_rcu_state(m, &rcu_preempt_state); 316 for_each_rcu_flavor(rsp)
301#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 317 print_one_rcu_state(m, rsp);
302 seq_puts(m, "rcu_sched:\n");
303 print_one_rcu_state(m, &rcu_sched_state);
304 seq_puts(m, "rcu_bh:\n");
305 print_one_rcu_state(m, &rcu_bh_state);
306 return 0; 318 return 0;
307} 319}
308 320
@@ -343,11 +355,10 @@ static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp)
343 355
344static int show_rcugp(struct seq_file *m, void *unused) 356static int show_rcugp(struct seq_file *m, void *unused)
345{ 357{
346#ifdef CONFIG_TREE_PREEMPT_RCU 358 struct rcu_state *rsp;
347 show_one_rcugp(m, &rcu_preempt_state); 359
348#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 360 for_each_rcu_flavor(rsp)
349 show_one_rcugp(m, &rcu_sched_state); 361 show_one_rcugp(m, rsp);
350 show_one_rcugp(m, &rcu_bh_state);
351 return 0; 362 return 0;
352} 363}
353 364
@@ -366,44 +377,36 @@ static const struct file_operations rcugp_fops = {
366 377
367static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) 378static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
368{ 379{
369 seq_printf(m, "%3d%cnp=%ld " 380 seq_printf(m, "%3d%cnp=%ld ",
370 "qsp=%ld rpq=%ld cbr=%ld cng=%ld "
371 "gpc=%ld gps=%ld nf=%ld nn=%ld\n",
372 rdp->cpu, 381 rdp->cpu,
373 cpu_is_offline(rdp->cpu) ? '!' : ' ', 382 cpu_is_offline(rdp->cpu) ? '!' : ' ',
374 rdp->n_rcu_pending, 383 rdp->n_rcu_pending);
384 seq_printf(m, "qsp=%ld rpq=%ld cbr=%ld cng=%ld ",
375 rdp->n_rp_qs_pending, 385 rdp->n_rp_qs_pending,
376 rdp->n_rp_report_qs, 386 rdp->n_rp_report_qs,
377 rdp->n_rp_cb_ready, 387 rdp->n_rp_cb_ready,
378 rdp->n_rp_cpu_needs_gp, 388 rdp->n_rp_cpu_needs_gp);
389 seq_printf(m, "gpc=%ld gps=%ld nf=%ld nn=%ld\n",
379 rdp->n_rp_gp_completed, 390 rdp->n_rp_gp_completed,
380 rdp->n_rp_gp_started, 391 rdp->n_rp_gp_started,
381 rdp->n_rp_need_fqs, 392 rdp->n_rp_need_fqs,
382 rdp->n_rp_need_nothing); 393 rdp->n_rp_need_nothing);
383} 394}
384 395
385static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp) 396static int show_rcu_pending(struct seq_file *m, void *unused)
386{ 397{
387 int cpu; 398 int cpu;
388 struct rcu_data *rdp; 399 struct rcu_data *rdp;
389 400 struct rcu_state *rsp;
390 for_each_possible_cpu(cpu) { 401
391 rdp = per_cpu_ptr(rsp->rda, cpu); 402 for_each_rcu_flavor(rsp) {
392 if (rdp->beenonline) 403 seq_printf(m, "%s:\n", rsp->name);
393 print_one_rcu_pending(m, rdp); 404 for_each_possible_cpu(cpu) {
405 rdp = per_cpu_ptr(rsp->rda, cpu);
406 if (rdp->beenonline)
407 print_one_rcu_pending(m, rdp);
408 }
394 } 409 }
395}
396
397static int show_rcu_pending(struct seq_file *m, void *unused)
398{
399#ifdef CONFIG_TREE_PREEMPT_RCU
400 seq_puts(m, "rcu_preempt:\n");
401 print_rcu_pendings(m, &rcu_preempt_state);
402#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
403 seq_puts(m, "rcu_sched:\n");
404 print_rcu_pendings(m, &rcu_sched_state);
405 seq_puts(m, "rcu_bh:\n");
406 print_rcu_pendings(m, &rcu_bh_state);
407 return 0; 410 return 0;
408} 411}
409 412
@@ -453,6 +456,11 @@ static int __init rcutree_trace_init(void)
453 if (!rcudir) 456 if (!rcudir)
454 goto free_out; 457 goto free_out;
455 458
459 retval = debugfs_create_file("rcubarrier", 0444, rcudir,
460 NULL, &rcubarrier_fops);
461 if (!retval)
462 goto free_out;
463
456 retval = debugfs_create_file("rcudata", 0444, rcudir, 464 retval = debugfs_create_file("rcudata", 0444, rcudir,
457 NULL, &rcudata_fops); 465 NULL, &rcudata_fops);
458 if (!retval) 466 if (!retval)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d5594a4268d4..468bdd44c1ba 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2081,7 +2081,6 @@ context_switch(struct rq *rq, struct task_struct *prev,
2081#endif 2081#endif
2082 2082
2083 /* Here we just switch the register state and the stack. */ 2083 /* Here we just switch the register state and the stack. */
2084 rcu_switch_from(prev);
2085 switch_to(prev, next, prev); 2084 switch_to(prev, next, prev);
2086 2085
2087 barrier(); 2086 barrier();
@@ -2161,11 +2160,73 @@ unsigned long this_cpu_load(void)
2161} 2160}
2162 2161
2163 2162
2163/*
2164 * Global load-average calculations
2165 *
2166 * We take a distributed and async approach to calculating the global load-avg
2167 * in order to minimize overhead.
2168 *
2169 * The global load average is an exponentially decaying average of nr_running +
2170 * nr_uninterruptible.
2171 *
2172 * Once every LOAD_FREQ:
2173 *
2174 * nr_active = 0;
2175 * for_each_possible_cpu(cpu)
2176 * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible;
2177 *
2178 * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n)
2179 *
2180 * Due to a number of reasons the above turns in the mess below:
2181 *
2182 * - for_each_possible_cpu() is prohibitively expensive on machines with
2183 * serious number of cpus, therefore we need to take a distributed approach
2184 * to calculating nr_active.
2185 *
2186 * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
2187 * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
2188 *
2189 * So assuming nr_active := 0 when we start out -- true per definition, we
2190 * can simply take per-cpu deltas and fold those into a global accumulate
2191 * to obtain the same result. See calc_load_fold_active().
2192 *
2193 * Furthermore, in order to avoid synchronizing all per-cpu delta folding
2194 * across the machine, we assume 10 ticks is sufficient time for every
2195 * cpu to have completed this task.
2196 *
2197 * This places an upper-bound on the IRQ-off latency of the machine. Then
2198 * again, being late doesn't loose the delta, just wrecks the sample.
2199 *
2200 * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because
2201 * this would add another cross-cpu cacheline miss and atomic operation
2202 * to the wakeup path. Instead we increment on whatever cpu the task ran
2203 * when it went into uninterruptible state and decrement on whatever cpu
2204 * did the wakeup. This means that only the sum of nr_uninterruptible over
2205 * all cpus yields the correct result.
2206 *
2207 * This covers the NO_HZ=n code, for extra head-aches, see the comment below.
2208 */
2209
2164/* Variables and functions for calc_load */ 2210/* Variables and functions for calc_load */
2165static atomic_long_t calc_load_tasks; 2211static atomic_long_t calc_load_tasks;
2166static unsigned long calc_load_update; 2212static unsigned long calc_load_update;
2167unsigned long avenrun[3]; 2213unsigned long avenrun[3];
2168EXPORT_SYMBOL(avenrun); 2214EXPORT_SYMBOL(avenrun); /* should be removed */
2215
2216/**
2217 * get_avenrun - get the load average array
2218 * @loads: pointer to dest load array
2219 * @offset: offset to add
2220 * @shift: shift count to shift the result left
2221 *
2222 * These values are estimates at best, so no need for locking.
2223 */
2224void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
2225{
2226 loads[0] = (avenrun[0] + offset) << shift;
2227 loads[1] = (avenrun[1] + offset) << shift;
2228 loads[2] = (avenrun[2] + offset) << shift;
2229}
2169 2230
2170static long calc_load_fold_active(struct rq *this_rq) 2231static long calc_load_fold_active(struct rq *this_rq)
2171{ 2232{
@@ -2182,6 +2243,9 @@ static long calc_load_fold_active(struct rq *this_rq)
2182 return delta; 2243 return delta;
2183} 2244}
2184 2245
2246/*
2247 * a1 = a0 * e + a * (1 - e)
2248 */
2185static unsigned long 2249static unsigned long
2186calc_load(unsigned long load, unsigned long exp, unsigned long active) 2250calc_load(unsigned long load, unsigned long exp, unsigned long active)
2187{ 2251{
@@ -2193,30 +2257,118 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
2193 2257
2194#ifdef CONFIG_NO_HZ 2258#ifdef CONFIG_NO_HZ
2195/* 2259/*
2196 * For NO_HZ we delay the active fold to the next LOAD_FREQ update. 2260 * Handle NO_HZ for the global load-average.
2261 *
2262 * Since the above described distributed algorithm to compute the global
2263 * load-average relies on per-cpu sampling from the tick, it is affected by
2264 * NO_HZ.
2265 *
2266 * The basic idea is to fold the nr_active delta into a global idle-delta upon
2267 * entering NO_HZ state such that we can include this as an 'extra' cpu delta
2268 * when we read the global state.
2269 *
2270 * Obviously reality has to ruin such a delightfully simple scheme:
2271 *
2272 * - When we go NO_HZ idle during the window, we can negate our sample
2273 * contribution, causing under-accounting.
2274 *
2275 * We avoid this by keeping two idle-delta counters and flipping them
2276 * when the window starts, thus separating old and new NO_HZ load.
2277 *
2278 * The only trick is the slight shift in index flip for read vs write.
2279 *
2280 * 0s 5s 10s 15s
2281 * +10 +10 +10 +10
2282 * |-|-----------|-|-----------|-|-----------|-|
2283 * r:0 0 1 1 0 0 1 1 0
2284 * w:0 1 1 0 0 1 1 0 0
2285 *
2286 * This ensures we'll fold the old idle contribution in this window while
2287 * accumlating the new one.
2288 *
2289 * - When we wake up from NO_HZ idle during the window, we push up our
2290 * contribution, since we effectively move our sample point to a known
2291 * busy state.
2292 *
2293 * This is solved by pushing the window forward, and thus skipping the
2294 * sample, for this cpu (effectively using the idle-delta for this cpu which
2295 * was in effect at the time the window opened). This also solves the issue
2296 * of having to deal with a cpu having been in NOHZ idle for multiple
2297 * LOAD_FREQ intervals.
2197 * 2298 *
2198 * When making the ILB scale, we should try to pull this in as well. 2299 * When making the ILB scale, we should try to pull this in as well.
2199 */ 2300 */
2200static atomic_long_t calc_load_tasks_idle; 2301static atomic_long_t calc_load_idle[2];
2302static int calc_load_idx;
2201 2303
2202void calc_load_account_idle(struct rq *this_rq) 2304static inline int calc_load_write_idx(void)
2203{ 2305{
2306 int idx = calc_load_idx;
2307
2308 /*
2309 * See calc_global_nohz(), if we observe the new index, we also
2310 * need to observe the new update time.
2311 */
2312 smp_rmb();
2313
2314 /*
2315 * If the folding window started, make sure we start writing in the
2316 * next idle-delta.
2317 */
2318 if (!time_before(jiffies, calc_load_update))
2319 idx++;
2320
2321 return idx & 1;
2322}
2323
2324static inline int calc_load_read_idx(void)
2325{
2326 return calc_load_idx & 1;
2327}
2328
2329void calc_load_enter_idle(void)
2330{
2331 struct rq *this_rq = this_rq();
2204 long delta; 2332 long delta;
2205 2333
2334 /*
2335 * We're going into NOHZ mode, if there's any pending delta, fold it
2336 * into the pending idle delta.
2337 */
2206 delta = calc_load_fold_active(this_rq); 2338 delta = calc_load_fold_active(this_rq);
2207 if (delta) 2339 if (delta) {
2208 atomic_long_add(delta, &calc_load_tasks_idle); 2340 int idx = calc_load_write_idx();
2341 atomic_long_add(delta, &calc_load_idle[idx]);
2342 }
2209} 2343}
2210 2344
2211static long calc_load_fold_idle(void) 2345void calc_load_exit_idle(void)
2212{ 2346{
2213 long delta = 0; 2347 struct rq *this_rq = this_rq();
2348
2349 /*
2350 * If we're still before the sample window, we're done.
2351 */
2352 if (time_before(jiffies, this_rq->calc_load_update))
2353 return;
2214 2354
2215 /* 2355 /*
2216 * Its got a race, we don't care... 2356 * We woke inside or after the sample window, this means we're already
2357 * accounted through the nohz accounting, so skip the entire deal and
2358 * sync up for the next window.
2217 */ 2359 */
2218 if (atomic_long_read(&calc_load_tasks_idle)) 2360 this_rq->calc_load_update = calc_load_update;
2219 delta = atomic_long_xchg(&calc_load_tasks_idle, 0); 2361 if (time_before(jiffies, this_rq->calc_load_update + 10))
2362 this_rq->calc_load_update += LOAD_FREQ;
2363}
2364
2365static long calc_load_fold_idle(void)
2366{
2367 int idx = calc_load_read_idx();
2368 long delta = 0;
2369
2370 if (atomic_long_read(&calc_load_idle[idx]))
2371 delta = atomic_long_xchg(&calc_load_idle[idx], 0);
2220 2372
2221 return delta; 2373 return delta;
2222} 2374}
@@ -2302,66 +2454,39 @@ static void calc_global_nohz(void)
2302{ 2454{
2303 long delta, active, n; 2455 long delta, active, n;
2304 2456
2305 /* 2457 if (!time_before(jiffies, calc_load_update + 10)) {
2306 * If we crossed a calc_load_update boundary, make sure to fold 2458 /*
2307 * any pending idle changes, the respective CPUs might have 2459 * Catch-up, fold however many we are behind still
2308 * missed the tick driven calc_load_account_active() update 2460 */
2309 * due to NO_HZ. 2461 delta = jiffies - calc_load_update - 10;
2310 */ 2462 n = 1 + (delta / LOAD_FREQ);
2311 delta = calc_load_fold_idle();
2312 if (delta)
2313 atomic_long_add(delta, &calc_load_tasks);
2314
2315 /*
2316 * It could be the one fold was all it took, we done!
2317 */
2318 if (time_before(jiffies, calc_load_update + 10))
2319 return;
2320
2321 /*
2322 * Catch-up, fold however many we are behind still
2323 */
2324 delta = jiffies - calc_load_update - 10;
2325 n = 1 + (delta / LOAD_FREQ);
2326 2463
2327 active = atomic_long_read(&calc_load_tasks); 2464 active = atomic_long_read(&calc_load_tasks);
2328 active = active > 0 ? active * FIXED_1 : 0; 2465 active = active > 0 ? active * FIXED_1 : 0;
2329 2466
2330 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); 2467 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
2331 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); 2468 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
2332 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); 2469 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
2333 2470
2334 calc_load_update += n * LOAD_FREQ; 2471 calc_load_update += n * LOAD_FREQ;
2335} 2472 }
2336#else
2337void calc_load_account_idle(struct rq *this_rq)
2338{
2339}
2340 2473
2341static inline long calc_load_fold_idle(void) 2474 /*
2342{ 2475 * Flip the idle index...
2343 return 0; 2476 *
2477 * Make sure we first write the new time then flip the index, so that
2478 * calc_load_write_idx() will see the new time when it reads the new
2479 * index, this avoids a double flip messing things up.
2480 */
2481 smp_wmb();
2482 calc_load_idx++;
2344} 2483}
2484#else /* !CONFIG_NO_HZ */
2345 2485
2346static void calc_global_nohz(void) 2486static inline long calc_load_fold_idle(void) { return 0; }
2347{ 2487static inline void calc_global_nohz(void) { }
2348}
2349#endif
2350 2488
2351/** 2489#endif /* CONFIG_NO_HZ */
2352 * get_avenrun - get the load average array
2353 * @loads: pointer to dest load array
2354 * @offset: offset to add
2355 * @shift: shift count to shift the result left
2356 *
2357 * These values are estimates at best, so no need for locking.
2358 */
2359void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
2360{
2361 loads[0] = (avenrun[0] + offset) << shift;
2362 loads[1] = (avenrun[1] + offset) << shift;
2363 loads[2] = (avenrun[2] + offset) << shift;
2364}
2365 2490
2366/* 2491/*
2367 * calc_load - update the avenrun load estimates 10 ticks after the 2492 * calc_load - update the avenrun load estimates 10 ticks after the
@@ -2369,11 +2494,18 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
2369 */ 2494 */
2370void calc_global_load(unsigned long ticks) 2495void calc_global_load(unsigned long ticks)
2371{ 2496{
2372 long active; 2497 long active, delta;
2373 2498
2374 if (time_before(jiffies, calc_load_update + 10)) 2499 if (time_before(jiffies, calc_load_update + 10))
2375 return; 2500 return;
2376 2501
2502 /*
2503 * Fold the 'old' idle-delta to include all NO_HZ cpus.
2504 */
2505 delta = calc_load_fold_idle();
2506 if (delta)
2507 atomic_long_add(delta, &calc_load_tasks);
2508
2377 active = atomic_long_read(&calc_load_tasks); 2509 active = atomic_long_read(&calc_load_tasks);
2378 active = active > 0 ? active * FIXED_1 : 0; 2510 active = active > 0 ? active * FIXED_1 : 0;
2379 2511
@@ -2384,12 +2516,7 @@ void calc_global_load(unsigned long ticks)
2384 calc_load_update += LOAD_FREQ; 2516 calc_load_update += LOAD_FREQ;
2385 2517
2386 /* 2518 /*
2387 * Account one period with whatever state we found before 2519 * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.
2388 * folding in the nohz state and ageing the entire idle period.
2389 *
2390 * This avoids loosing a sample when we go idle between
2391 * calc_load_account_active() (10 ticks ago) and now and thus
2392 * under-accounting.
2393 */ 2520 */
2394 calc_global_nohz(); 2521 calc_global_nohz();
2395} 2522}
@@ -2406,7 +2533,6 @@ static void calc_load_account_active(struct rq *this_rq)
2406 return; 2533 return;
2407 2534
2408 delta = calc_load_fold_active(this_rq); 2535 delta = calc_load_fold_active(this_rq);
2409 delta += calc_load_fold_idle();
2410 if (delta) 2536 if (delta)
2411 atomic_long_add(delta, &calc_load_tasks); 2537 atomic_long_add(delta, &calc_load_tasks);
2412 2538
@@ -2414,6 +2540,10 @@ static void calc_load_account_active(struct rq *this_rq)
2414} 2540}
2415 2541
2416/* 2542/*
2543 * End of global load-average stuff
2544 */
2545
2546/*
2417 * The exact cpuload at various idx values, calculated at every tick would be 2547 * The exact cpuload at various idx values, calculated at every tick would be
2418 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load 2548 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
2419 * 2549 *
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index b44d604b35d1..b6baf370cae9 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -25,7 +25,6 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
25static struct task_struct *pick_next_task_idle(struct rq *rq) 25static struct task_struct *pick_next_task_idle(struct rq *rq)
26{ 26{
27 schedstat_inc(rq, sched_goidle); 27 schedstat_inc(rq, sched_goidle);
28 calc_load_account_idle(rq);
29 return rq->idle; 28 return rq->idle;
30} 29}
31 30
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 6d52cea7f33d..55844f24435a 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -942,8 +942,6 @@ static inline u64 sched_avg_period(void)
942 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; 942 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
943} 943}
944 944
945void calc_load_account_idle(struct rq *this_rq);
946
947#ifdef CONFIG_SCHED_HRTICK 945#ifdef CONFIG_SCHED_HRTICK
948 946
949/* 947/*
diff --git a/kernel/smp.c b/kernel/smp.c
index d0ae5b24875e..29dd40a9f2f4 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -581,26 +581,6 @@ int smp_call_function(smp_call_func_t func, void *info, int wait)
581 return 0; 581 return 0;
582} 582}
583EXPORT_SYMBOL(smp_call_function); 583EXPORT_SYMBOL(smp_call_function);
584
585void ipi_call_lock(void)
586{
587 raw_spin_lock(&call_function.lock);
588}
589
590void ipi_call_unlock(void)
591{
592 raw_spin_unlock(&call_function.lock);
593}
594
595void ipi_call_lock_irq(void)
596{
597 raw_spin_lock_irq(&call_function.lock);
598}
599
600void ipi_call_unlock_irq(void)
601{
602 raw_spin_unlock_irq(&call_function.lock);
603}
604#endif /* USE_GENERIC_SMP_HELPERS */ 584#endif /* USE_GENERIC_SMP_HELPERS */
605 585
606/* Setup configured maximum number of CPUs to activate */ 586/* Setup configured maximum number of CPUs to activate */
diff --git a/kernel/smpboot.h b/kernel/smpboot.h
index 80c0acfb8472..6ef9433e1c70 100644
--- a/kernel/smpboot.h
+++ b/kernel/smpboot.h
@@ -3,8 +3,6 @@
3 3
4struct task_struct; 4struct task_struct;
5 5
6int smpboot_prepare(unsigned int cpu);
7
8#ifdef CONFIG_GENERIC_SMP_IDLE_THREAD 6#ifdef CONFIG_GENERIC_SMP_IDLE_THREAD
9struct task_struct *idle_thread_get(unsigned int cpu); 7struct task_struct *idle_thread_get(unsigned int cpu);
10void idle_thread_set_boot_cpu(void); 8void idle_thread_set_boot_cpu(void);
diff --git a/kernel/sys.c b/kernel/sys.c
index e0c8ffc50d7f..2d39a84cd857 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1788,7 +1788,6 @@ SYSCALL_DEFINE1(umask, int, mask)
1788#ifdef CONFIG_CHECKPOINT_RESTORE 1788#ifdef CONFIG_CHECKPOINT_RESTORE
1789static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) 1789static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
1790{ 1790{
1791 struct vm_area_struct *vma;
1792 struct file *exe_file; 1791 struct file *exe_file;
1793 struct dentry *dentry; 1792 struct dentry *dentry;
1794 int err; 1793 int err;
@@ -1816,13 +1815,17 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
1816 down_write(&mm->mmap_sem); 1815 down_write(&mm->mmap_sem);
1817 1816
1818 /* 1817 /*
1819 * Forbid mm->exe_file change if there are mapped other files. 1818 * Forbid mm->exe_file change if old file still mapped.
1820 */ 1819 */
1821 err = -EBUSY; 1820 err = -EBUSY;
1822 for (vma = mm->mmap; vma; vma = vma->vm_next) { 1821 if (mm->exe_file) {
1823 if (vma->vm_file && !path_equal(&vma->vm_file->f_path, 1822 struct vm_area_struct *vma;
1824 &exe_file->f_path)) 1823
1825 goto exit_unlock; 1824 for (vma = mm->mmap; vma; vma = vma->vm_next)
1825 if (vma->vm_file &&
1826 path_equal(&vma->vm_file->f_path,
1827 &mm->exe_file->f_path))
1828 goto exit_unlock;
1826 } 1829 }
1827 1830
1828 /* 1831 /*
@@ -1835,6 +1838,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
1835 if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags)) 1838 if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags))
1836 goto exit_unlock; 1839 goto exit_unlock;
1837 1840
1841 err = 0;
1838 set_mm_exe_file(mm, exe_file); 1842 set_mm_exe_file(mm, exe_file);
1839exit_unlock: 1843exit_unlock:
1840 up_write(&mm->mmap_sem); 1844 up_write(&mm->mmap_sem);
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 70b33abcc7bb..b7fbadc5c973 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -409,7 +409,9 @@ int second_overflow(unsigned long secs)
409 time_state = TIME_DEL; 409 time_state = TIME_DEL;
410 break; 410 break;
411 case TIME_INS: 411 case TIME_INS:
412 if (secs % 86400 == 0) { 412 if (!(time_status & STA_INS))
413 time_state = TIME_OK;
414 else if (secs % 86400 == 0) {
413 leap = -1; 415 leap = -1;
414 time_state = TIME_OOP; 416 time_state = TIME_OOP;
415 time_tai++; 417 time_tai++;
@@ -418,7 +420,9 @@ int second_overflow(unsigned long secs)
418 } 420 }
419 break; 421 break;
420 case TIME_DEL: 422 case TIME_DEL:
421 if ((secs + 1) % 86400 == 0) { 423 if (!(time_status & STA_DEL))
424 time_state = TIME_OK;
425 else if ((secs + 1) % 86400 == 0) {
422 leap = 1; 426 leap = 1;
423 time_tai--; 427 time_tai--;
424 time_state = TIME_WAIT; 428 time_state = TIME_WAIT;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 869997833928..024540f97f74 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -105,7 +105,7 @@ static ktime_t tick_init_jiffy_update(void)
105/* 105/*
106 * NO HZ enabled ? 106 * NO HZ enabled ?
107 */ 107 */
108static int tick_nohz_enabled __read_mostly = 1; 108int tick_nohz_enabled __read_mostly = 1;
109 109
110/* 110/*
111 * Enable / Disable tickless mode 111 * Enable / Disable tickless mode
@@ -271,50 +271,15 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
271} 271}
272EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); 272EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
273 273
274static void tick_nohz_stop_sched_tick(struct tick_sched *ts) 274static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
275 ktime_t now, int cpu)
275{ 276{
276 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; 277 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies;
278 ktime_t last_update, expires, ret = { .tv64 = 0 };
277 unsigned long rcu_delta_jiffies; 279 unsigned long rcu_delta_jiffies;
278 ktime_t last_update, expires, now;
279 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; 280 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
280 u64 time_delta; 281 u64 time_delta;
281 int cpu;
282
283 cpu = smp_processor_id();
284 ts = &per_cpu(tick_cpu_sched, cpu);
285
286 now = tick_nohz_start_idle(cpu, ts);
287
288 /*
289 * If this cpu is offline and it is the one which updates
290 * jiffies, then give up the assignment and let it be taken by
291 * the cpu which runs the tick timer next. If we don't drop
292 * this here the jiffies might be stale and do_timer() never
293 * invoked.
294 */
295 if (unlikely(!cpu_online(cpu))) {
296 if (cpu == tick_do_timer_cpu)
297 tick_do_timer_cpu = TICK_DO_TIMER_NONE;
298 }
299
300 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
301 return;
302 282
303 if (need_resched())
304 return;
305
306 if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
307 static int ratelimit;
308
309 if (ratelimit < 10) {
310 printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
311 (unsigned int) local_softirq_pending());
312 ratelimit++;
313 }
314 return;
315 }
316
317 ts->idle_calls++;
318 /* Read jiffies and the time when jiffies were updated last */ 283 /* Read jiffies and the time when jiffies were updated last */
319 do { 284 do {
320 seq = read_seqbegin(&xtime_lock); 285 seq = read_seqbegin(&xtime_lock);
@@ -397,6 +362,8 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts)
397 if (ts->tick_stopped && ktime_equal(expires, dev->next_event)) 362 if (ts->tick_stopped && ktime_equal(expires, dev->next_event))
398 goto out; 363 goto out;
399 364
365 ret = expires;
366
400 /* 367 /*
401 * nohz_stop_sched_tick can be called several times before 368 * nohz_stop_sched_tick can be called several times before
402 * the nohz_restart_sched_tick is called. This happens when 369 * the nohz_restart_sched_tick is called. This happens when
@@ -406,17 +373,12 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts)
406 */ 373 */
407 if (!ts->tick_stopped) { 374 if (!ts->tick_stopped) {
408 select_nohz_load_balancer(1); 375 select_nohz_load_balancer(1);
376 calc_load_enter_idle();
409 377
410 ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); 378 ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
411 ts->tick_stopped = 1; 379 ts->tick_stopped = 1;
412 ts->idle_jiffies = last_jiffies;
413 } 380 }
414 381
415 ts->idle_sleeps++;
416
417 /* Mark expires */
418 ts->idle_expires = expires;
419
420 /* 382 /*
421 * If the expiration time == KTIME_MAX, then 383 * If the expiration time == KTIME_MAX, then
422 * in this case we simply stop the tick timer. 384 * in this case we simply stop the tick timer.
@@ -447,6 +409,65 @@ out:
447 ts->next_jiffies = next_jiffies; 409 ts->next_jiffies = next_jiffies;
448 ts->last_jiffies = last_jiffies; 410 ts->last_jiffies = last_jiffies;
449 ts->sleep_length = ktime_sub(dev->next_event, now); 411 ts->sleep_length = ktime_sub(dev->next_event, now);
412
413 return ret;
414}
415
416static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
417{
418 /*
419 * If this cpu is offline and it is the one which updates
420 * jiffies, then give up the assignment and let it be taken by
421 * the cpu which runs the tick timer next. If we don't drop
422 * this here the jiffies might be stale and do_timer() never
423 * invoked.
424 */
425 if (unlikely(!cpu_online(cpu))) {
426 if (cpu == tick_do_timer_cpu)
427 tick_do_timer_cpu = TICK_DO_TIMER_NONE;
428 }
429
430 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
431 return false;
432
433 if (need_resched())
434 return false;
435
436 if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
437 static int ratelimit;
438
439 if (ratelimit < 10) {
440 printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
441 (unsigned int) local_softirq_pending());
442 ratelimit++;
443 }
444 return false;
445 }
446
447 return true;
448}
449
450static void __tick_nohz_idle_enter(struct tick_sched *ts)
451{
452 ktime_t now, expires;
453 int cpu = smp_processor_id();
454
455 now = tick_nohz_start_idle(cpu, ts);
456
457 if (can_stop_idle_tick(cpu, ts)) {
458 int was_stopped = ts->tick_stopped;
459
460 ts->idle_calls++;
461
462 expires = tick_nohz_stop_sched_tick(ts, now, cpu);
463 if (expires.tv64 > 0LL) {
464 ts->idle_sleeps++;
465 ts->idle_expires = expires;
466 }
467
468 if (!was_stopped && ts->tick_stopped)
469 ts->idle_jiffies = ts->last_jiffies;
470 }
450} 471}
451 472
452/** 473/**
@@ -484,7 +505,7 @@ void tick_nohz_idle_enter(void)
484 * update of the idle time accounting in tick_nohz_start_idle(). 505 * update of the idle time accounting in tick_nohz_start_idle().
485 */ 506 */
486 ts->inidle = 1; 507 ts->inidle = 1;
487 tick_nohz_stop_sched_tick(ts); 508 __tick_nohz_idle_enter(ts);
488 509
489 local_irq_enable(); 510 local_irq_enable();
490} 511}
@@ -504,7 +525,7 @@ void tick_nohz_irq_exit(void)
504 if (!ts->inidle) 525 if (!ts->inidle)
505 return; 526 return;
506 527
507 tick_nohz_stop_sched_tick(ts); 528 __tick_nohz_idle_enter(ts);
508} 529}
509 530
510/** 531/**
@@ -522,7 +543,7 @@ ktime_t tick_nohz_get_sleep_length(void)
522static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) 543static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
523{ 544{
524 hrtimer_cancel(&ts->sched_timer); 545 hrtimer_cancel(&ts->sched_timer);
525 hrtimer_set_expires(&ts->sched_timer, ts->idle_tick); 546 hrtimer_set_expires(&ts->sched_timer, ts->last_tick);
526 547
527 while (1) { 548 while (1) {
528 /* Forward the time to expire in the future */ 549 /* Forward the time to expire in the future */
@@ -545,6 +566,41 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
545 } 566 }
546} 567}
547 568
569static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
570{
571 /* Update jiffies first */
572 select_nohz_load_balancer(0);
573 tick_do_update_jiffies64(now);
574 update_cpu_load_nohz();
575
576 touch_softlockup_watchdog();
577 /*
578 * Cancel the scheduled timer and restore the tick
579 */
580 ts->tick_stopped = 0;
581 ts->idle_exittime = now;
582
583 tick_nohz_restart(ts, now);
584}
585
586static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
587{
588#ifndef CONFIG_VIRT_CPU_ACCOUNTING
589 unsigned long ticks;
590 /*
591 * We stopped the tick in idle. Update process times would miss the
592 * time we slept as update_process_times does only a 1 tick
593 * accounting. Enforce that this is accounted to idle !
594 */
595 ticks = jiffies - ts->idle_jiffies;
596 /*
597 * We might be one off. Do not randomly account a huge number of ticks!
598 */
599 if (ticks && ticks < LONG_MAX)
600 account_idle_ticks(ticks);
601#endif
602}
603
548/** 604/**
549 * tick_nohz_idle_exit - restart the idle tick from the idle task 605 * tick_nohz_idle_exit - restart the idle tick from the idle task
550 * 606 *
@@ -556,9 +612,6 @@ void tick_nohz_idle_exit(void)
556{ 612{
557 int cpu = smp_processor_id(); 613 int cpu = smp_processor_id();
558 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 614 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
559#ifndef CONFIG_VIRT_CPU_ACCOUNTING
560 unsigned long ticks;
561#endif
562 ktime_t now; 615 ktime_t now;
563 616
564 local_irq_disable(); 617 local_irq_disable();
@@ -573,39 +626,11 @@ void tick_nohz_idle_exit(void)
573 if (ts->idle_active) 626 if (ts->idle_active)
574 tick_nohz_stop_idle(cpu, now); 627 tick_nohz_stop_idle(cpu, now);
575 628
576 if (!ts->tick_stopped) { 629 if (ts->tick_stopped) {
577 local_irq_enable(); 630 tick_nohz_restart_sched_tick(ts, now);
578 return; 631 tick_nohz_account_idle_ticks(ts);
579 } 632 }
580 633
581 /* Update jiffies first */
582 select_nohz_load_balancer(0);
583 tick_do_update_jiffies64(now);
584 update_cpu_load_nohz();
585
586#ifndef CONFIG_VIRT_CPU_ACCOUNTING
587 /*
588 * We stopped the tick in idle. Update process times would miss the
589 * time we slept as update_process_times does only a 1 tick
590 * accounting. Enforce that this is accounted to idle !
591 */
592 ticks = jiffies - ts->idle_jiffies;
593 /*
594 * We might be one off. Do not randomly account a huge number of ticks!
595 */
596 if (ticks && ticks < LONG_MAX)
597 account_idle_ticks(ticks);
598#endif
599
600 touch_softlockup_watchdog();
601 /*
602 * Cancel the scheduled timer and restore the tick
603 */
604 ts->tick_stopped = 0;
605 ts->idle_exittime = now;
606
607 tick_nohz_restart(ts, now);
608
609 local_irq_enable(); 634 local_irq_enable();
610} 635}
611 636
@@ -809,7 +834,8 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
809 */ 834 */
810 if (ts->tick_stopped) { 835 if (ts->tick_stopped) {
811 touch_softlockup_watchdog(); 836 touch_softlockup_watchdog();
812 ts->idle_jiffies++; 837 if (idle_cpu(cpu))
838 ts->idle_jiffies++;
813 } 839 }
814 update_process_times(user_mode(regs)); 840 update_process_times(user_mode(regs));
815 profile_tick(CPU_PROFILING); 841 profile_tick(CPU_PROFILING);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 6f46a00a1e8a..f045cc50832d 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -24,32 +24,32 @@
24/* Structure holding internal timekeeping values. */ 24/* Structure holding internal timekeeping values. */
25struct timekeeper { 25struct timekeeper {
26 /* Current clocksource used for timekeeping. */ 26 /* Current clocksource used for timekeeping. */
27 struct clocksource *clock; 27 struct clocksource *clock;
28 /* NTP adjusted clock multiplier */ 28 /* NTP adjusted clock multiplier */
29 u32 mult; 29 u32 mult;
30 /* The shift value of the current clocksource. */ 30 /* The shift value of the current clocksource. */
31 int shift; 31 u32 shift;
32
33 /* Number of clock cycles in one NTP interval. */ 32 /* Number of clock cycles in one NTP interval. */
34 cycle_t cycle_interval; 33 cycle_t cycle_interval;
35 /* Number of clock shifted nano seconds in one NTP interval. */ 34 /* Number of clock shifted nano seconds in one NTP interval. */
36 u64 xtime_interval; 35 u64 xtime_interval;
37 /* shifted nano seconds left over when rounding cycle_interval */ 36 /* shifted nano seconds left over when rounding cycle_interval */
38 s64 xtime_remainder; 37 s64 xtime_remainder;
39 /* Raw nano seconds accumulated per NTP interval. */ 38 /* Raw nano seconds accumulated per NTP interval. */
40 u32 raw_interval; 39 u32 raw_interval;
40
41 /* Current CLOCK_REALTIME time in seconds */
42 u64 xtime_sec;
43 /* Clock shifted nano seconds */
44 u64 xtime_nsec;
41 45
42 /* Clock shifted nano seconds remainder not stored in xtime.tv_nsec. */
43 u64 xtime_nsec;
44 /* Difference between accumulated time and NTP time in ntp 46 /* Difference between accumulated time and NTP time in ntp
45 * shifted nano seconds. */ 47 * shifted nano seconds. */
46 s64 ntp_error; 48 s64 ntp_error;
47 /* Shift conversion between clock shifted nano seconds and 49 /* Shift conversion between clock shifted nano seconds and
48 * ntp shifted nano seconds. */ 50 * ntp shifted nano seconds. */
49 int ntp_error_shift; 51 u32 ntp_error_shift;
50 52
51 /* The current time */
52 struct timespec xtime;
53 /* 53 /*
54 * wall_to_monotonic is what we need to add to xtime (or xtime corrected 54 * wall_to_monotonic is what we need to add to xtime (or xtime corrected
55 * for sub jiffie times) to get to monotonic time. Monotonic is pegged 55 * for sub jiffie times) to get to monotonic time. Monotonic is pegged
@@ -64,14 +64,17 @@ struct timekeeper {
64 * - wall_to_monotonic is no longer the boot time, getboottime must be 64 * - wall_to_monotonic is no longer the boot time, getboottime must be
65 * used instead. 65 * used instead.
66 */ 66 */
67 struct timespec wall_to_monotonic; 67 struct timespec wall_to_monotonic;
68 /* time spent in suspend */ 68 /* time spent in suspend */
69 struct timespec total_sleep_time; 69 struct timespec total_sleep_time;
70 /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */ 70 /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */
71 struct timespec raw_time; 71 struct timespec raw_time;
72 72 /* Offset clock monotonic -> clock realtime */
73 ktime_t offs_real;
74 /* Offset clock monotonic -> clock boottime */
75 ktime_t offs_boot;
73 /* Seqlock for all timekeeper values */ 76 /* Seqlock for all timekeeper values */
74 seqlock_t lock; 77 seqlock_t lock;
75}; 78};
76 79
77static struct timekeeper timekeeper; 80static struct timekeeper timekeeper;
@@ -82,11 +85,37 @@ static struct timekeeper timekeeper;
82 */ 85 */
83__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); 86__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
84 87
85
86/* flag for if timekeeping is suspended */ 88/* flag for if timekeeping is suspended */
87int __read_mostly timekeeping_suspended; 89int __read_mostly timekeeping_suspended;
88 90
91static inline void tk_normalize_xtime(struct timekeeper *tk)
92{
93 while (tk->xtime_nsec >= ((u64)NSEC_PER_SEC << tk->shift)) {
94 tk->xtime_nsec -= (u64)NSEC_PER_SEC << tk->shift;
95 tk->xtime_sec++;
96 }
97}
98
99static struct timespec tk_xtime(struct timekeeper *tk)
100{
101 struct timespec ts;
102
103 ts.tv_sec = tk->xtime_sec;
104 ts.tv_nsec = (long)(tk->xtime_nsec >> tk->shift);
105 return ts;
106}
107
108static void tk_set_xtime(struct timekeeper *tk, const struct timespec *ts)
109{
110 tk->xtime_sec = ts->tv_sec;
111 tk->xtime_nsec = ts->tv_nsec << tk->shift;
112}
89 113
114static void tk_xtime_add(struct timekeeper *tk, const struct timespec *ts)
115{
116 tk->xtime_sec += ts->tv_sec;
117 tk->xtime_nsec += ts->tv_nsec << tk->shift;
118}
90 119
91/** 120/**
92 * timekeeper_setup_internals - Set up internals to use clocksource clock. 121 * timekeeper_setup_internals - Set up internals to use clocksource clock.
@@ -98,12 +127,14 @@ int __read_mostly timekeeping_suspended;
98 * 127 *
99 * Unless you're the timekeeping code, you should not be using this! 128 * Unless you're the timekeeping code, you should not be using this!
100 */ 129 */
101static void timekeeper_setup_internals(struct clocksource *clock) 130static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
102{ 131{
103 cycle_t interval; 132 cycle_t interval;
104 u64 tmp, ntpinterval; 133 u64 tmp, ntpinterval;
134 struct clocksource *old_clock;
105 135
106 timekeeper.clock = clock; 136 old_clock = tk->clock;
137 tk->clock = clock;
107 clock->cycle_last = clock->read(clock); 138 clock->cycle_last = clock->read(clock);
108 139
109 /* Do the ns -> cycle conversion first, using original mult */ 140 /* Do the ns -> cycle conversion first, using original mult */
@@ -116,71 +147,96 @@ static void timekeeper_setup_internals(struct clocksource *clock)
116 tmp = 1; 147 tmp = 1;
117 148
118 interval = (cycle_t) tmp; 149 interval = (cycle_t) tmp;
119 timekeeper.cycle_interval = interval; 150 tk->cycle_interval = interval;
120 151
121 /* Go back from cycles -> shifted ns */ 152 /* Go back from cycles -> shifted ns */
122 timekeeper.xtime_interval = (u64) interval * clock->mult; 153 tk->xtime_interval = (u64) interval * clock->mult;
123 timekeeper.xtime_remainder = ntpinterval - timekeeper.xtime_interval; 154 tk->xtime_remainder = ntpinterval - tk->xtime_interval;
124 timekeeper.raw_interval = 155 tk->raw_interval =
125 ((u64) interval * clock->mult) >> clock->shift; 156 ((u64) interval * clock->mult) >> clock->shift;
126 157
127 timekeeper.xtime_nsec = 0; 158 /* if changing clocks, convert xtime_nsec shift units */
128 timekeeper.shift = clock->shift; 159 if (old_clock) {
160 int shift_change = clock->shift - old_clock->shift;
161 if (shift_change < 0)
162 tk->xtime_nsec >>= -shift_change;
163 else
164 tk->xtime_nsec <<= shift_change;
165 }
166 tk->shift = clock->shift;
129 167
130 timekeeper.ntp_error = 0; 168 tk->ntp_error = 0;
131 timekeeper.ntp_error_shift = NTP_SCALE_SHIFT - clock->shift; 169 tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift;
132 170
133 /* 171 /*
134 * The timekeeper keeps its own mult values for the currently 172 * The timekeeper keeps its own mult values for the currently
135 * active clocksource. These value will be adjusted via NTP 173 * active clocksource. These value will be adjusted via NTP
136 * to counteract clock drifting. 174 * to counteract clock drifting.
137 */ 175 */
138 timekeeper.mult = clock->mult; 176 tk->mult = clock->mult;
139} 177}
140 178
141/* Timekeeper helper functions. */ 179/* Timekeeper helper functions. */
142static inline s64 timekeeping_get_ns(void) 180static inline s64 timekeeping_get_ns(struct timekeeper *tk)
143{ 181{
144 cycle_t cycle_now, cycle_delta; 182 cycle_t cycle_now, cycle_delta;
145 struct clocksource *clock; 183 struct clocksource *clock;
184 s64 nsec;
146 185
147 /* read clocksource: */ 186 /* read clocksource: */
148 clock = timekeeper.clock; 187 clock = tk->clock;
149 cycle_now = clock->read(clock); 188 cycle_now = clock->read(clock);
150 189
151 /* calculate the delta since the last update_wall_time: */ 190 /* calculate the delta since the last update_wall_time: */
152 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; 191 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
153 192
154 /* return delta convert to nanoseconds using ntp adjusted mult. */ 193 nsec = cycle_delta * tk->mult + tk->xtime_nsec;
155 return clocksource_cyc2ns(cycle_delta, timekeeper.mult, 194 nsec >>= tk->shift;
156 timekeeper.shift); 195
196 /* If arch requires, add in gettimeoffset() */
197 return nsec + arch_gettimeoffset();
157} 198}
158 199
159static inline s64 timekeeping_get_ns_raw(void) 200static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
160{ 201{
161 cycle_t cycle_now, cycle_delta; 202 cycle_t cycle_now, cycle_delta;
162 struct clocksource *clock; 203 struct clocksource *clock;
204 s64 nsec;
163 205
164 /* read clocksource: */ 206 /* read clocksource: */
165 clock = timekeeper.clock; 207 clock = tk->clock;
166 cycle_now = clock->read(clock); 208 cycle_now = clock->read(clock);
167 209
168 /* calculate the delta since the last update_wall_time: */ 210 /* calculate the delta since the last update_wall_time: */
169 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; 211 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
170 212
171 /* return delta convert to nanoseconds. */ 213 /* convert delta to nanoseconds. */
172 return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); 214 nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
215
216 /* If arch requires, add in gettimeoffset() */
217 return nsec + arch_gettimeoffset();
218}
219
220static void update_rt_offset(struct timekeeper *tk)
221{
222 struct timespec tmp, *wtm = &tk->wall_to_monotonic;
223
224 set_normalized_timespec(&tmp, -wtm->tv_sec, -wtm->tv_nsec);
225 tk->offs_real = timespec_to_ktime(tmp);
173} 226}
174 227
175/* must hold write on timekeeper.lock */ 228/* must hold write on timekeeper.lock */
176static void timekeeping_update(bool clearntp) 229static void timekeeping_update(struct timekeeper *tk, bool clearntp)
177{ 230{
231 struct timespec xt;
232
178 if (clearntp) { 233 if (clearntp) {
179 timekeeper.ntp_error = 0; 234 tk->ntp_error = 0;
180 ntp_clear(); 235 ntp_clear();
181 } 236 }
182 update_vsyscall(&timekeeper.xtime, &timekeeper.wall_to_monotonic, 237 update_rt_offset(tk);
183 timekeeper.clock, timekeeper.mult); 238 xt = tk_xtime(tk);
239 update_vsyscall(&xt, &tk->wall_to_monotonic, tk->clock, tk->mult);
184} 240}
185 241
186 242
@@ -191,27 +247,26 @@ static void timekeeping_update(bool clearntp)
191 * update_wall_time(). This is useful before significant clock changes, 247 * update_wall_time(). This is useful before significant clock changes,
192 * as it avoids having to deal with this time offset explicitly. 248 * as it avoids having to deal with this time offset explicitly.
193 */ 249 */
194static void timekeeping_forward_now(void) 250static void timekeeping_forward_now(struct timekeeper *tk)
195{ 251{
196 cycle_t cycle_now, cycle_delta; 252 cycle_t cycle_now, cycle_delta;
197 struct clocksource *clock; 253 struct clocksource *clock;
198 s64 nsec; 254 s64 nsec;
199 255
200 clock = timekeeper.clock; 256 clock = tk->clock;
201 cycle_now = clock->read(clock); 257 cycle_now = clock->read(clock);
202 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; 258 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
203 clock->cycle_last = cycle_now; 259 clock->cycle_last = cycle_now;
204 260
205 nsec = clocksource_cyc2ns(cycle_delta, timekeeper.mult, 261 tk->xtime_nsec += cycle_delta * tk->mult;
206 timekeeper.shift);
207 262
208 /* If arch requires, add in gettimeoffset() */ 263 /* If arch requires, add in gettimeoffset() */
209 nsec += arch_gettimeoffset(); 264 tk->xtime_nsec += arch_gettimeoffset() << tk->shift;
210 265
211 timespec_add_ns(&timekeeper.xtime, nsec); 266 tk_normalize_xtime(tk);
212 267
213 nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); 268 nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
214 timespec_add_ns(&timekeeper.raw_time, nsec); 269 timespec_add_ns(&tk->raw_time, nsec);
215} 270}
216 271
217/** 272/**
@@ -223,18 +278,15 @@ static void timekeeping_forward_now(void)
223void getnstimeofday(struct timespec *ts) 278void getnstimeofday(struct timespec *ts)
224{ 279{
225 unsigned long seq; 280 unsigned long seq;
226 s64 nsecs; 281 s64 nsecs = 0;
227 282
228 WARN_ON(timekeeping_suspended); 283 WARN_ON(timekeeping_suspended);
229 284
230 do { 285 do {
231 seq = read_seqbegin(&timekeeper.lock); 286 seq = read_seqbegin(&timekeeper.lock);
232 287
233 *ts = timekeeper.xtime; 288 ts->tv_sec = timekeeper.xtime_sec;
234 nsecs = timekeeping_get_ns(); 289 ts->tv_nsec = timekeeping_get_ns(&timekeeper);
235
236 /* If arch requires, add in gettimeoffset() */
237 nsecs += arch_gettimeoffset();
238 290
239 } while (read_seqretry(&timekeeper.lock, seq)); 291 } while (read_seqretry(&timekeeper.lock, seq));
240 292
@@ -251,13 +303,10 @@ ktime_t ktime_get(void)
251 303
252 do { 304 do {
253 seq = read_seqbegin(&timekeeper.lock); 305 seq = read_seqbegin(&timekeeper.lock);
254 secs = timekeeper.xtime.tv_sec + 306 secs = timekeeper.xtime_sec +
255 timekeeper.wall_to_monotonic.tv_sec; 307 timekeeper.wall_to_monotonic.tv_sec;
256 nsecs = timekeeper.xtime.tv_nsec + 308 nsecs = timekeeping_get_ns(&timekeeper) +
257 timekeeper.wall_to_monotonic.tv_nsec; 309 timekeeper.wall_to_monotonic.tv_nsec;
258 nsecs += timekeeping_get_ns();
259 /* If arch requires, add in gettimeoffset() */
260 nsecs += arch_gettimeoffset();
261 310
262 } while (read_seqretry(&timekeeper.lock, seq)); 311 } while (read_seqretry(&timekeeper.lock, seq));
263 /* 312 /*
@@ -280,22 +329,19 @@ void ktime_get_ts(struct timespec *ts)
280{ 329{
281 struct timespec tomono; 330 struct timespec tomono;
282 unsigned int seq; 331 unsigned int seq;
283 s64 nsecs;
284 332
285 WARN_ON(timekeeping_suspended); 333 WARN_ON(timekeeping_suspended);
286 334
287 do { 335 do {
288 seq = read_seqbegin(&timekeeper.lock); 336 seq = read_seqbegin(&timekeeper.lock);
289 *ts = timekeeper.xtime; 337 ts->tv_sec = timekeeper.xtime_sec;
338 ts->tv_nsec = timekeeping_get_ns(&timekeeper);
290 tomono = timekeeper.wall_to_monotonic; 339 tomono = timekeeper.wall_to_monotonic;
291 nsecs = timekeeping_get_ns();
292 /* If arch requires, add in gettimeoffset() */
293 nsecs += arch_gettimeoffset();
294 340
295 } while (read_seqretry(&timekeeper.lock, seq)); 341 } while (read_seqretry(&timekeeper.lock, seq));
296 342
297 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec, 343 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
298 ts->tv_nsec + tomono.tv_nsec + nsecs); 344 ts->tv_nsec + tomono.tv_nsec);
299} 345}
300EXPORT_SYMBOL_GPL(ktime_get_ts); 346EXPORT_SYMBOL_GPL(ktime_get_ts);
301 347
@@ -318,20 +364,14 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
318 WARN_ON_ONCE(timekeeping_suspended); 364 WARN_ON_ONCE(timekeeping_suspended);
319 365
320 do { 366 do {
321 u32 arch_offset;
322
323 seq = read_seqbegin(&timekeeper.lock); 367 seq = read_seqbegin(&timekeeper.lock);
324 368
325 *ts_raw = timekeeper.raw_time; 369 *ts_raw = timekeeper.raw_time;
326 *ts_real = timekeeper.xtime; 370 ts_real->tv_sec = timekeeper.xtime_sec;
327 371 ts_real->tv_nsec = 0;
328 nsecs_raw = timekeeping_get_ns_raw();
329 nsecs_real = timekeeping_get_ns();
330 372
331 /* If arch requires, add in gettimeoffset() */ 373 nsecs_raw = timekeeping_get_ns_raw(&timekeeper);
332 arch_offset = arch_gettimeoffset(); 374 nsecs_real = timekeeping_get_ns(&timekeeper);
333 nsecs_raw += arch_offset;
334 nsecs_real += arch_offset;
335 375
336 } while (read_seqretry(&timekeeper.lock, seq)); 376 } while (read_seqretry(&timekeeper.lock, seq));
337 377
@@ -366,7 +406,7 @@ EXPORT_SYMBOL(do_gettimeofday);
366 */ 406 */
367int do_settimeofday(const struct timespec *tv) 407int do_settimeofday(const struct timespec *tv)
368{ 408{
369 struct timespec ts_delta; 409 struct timespec ts_delta, xt;
370 unsigned long flags; 410 unsigned long flags;
371 411
372 if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) 412 if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
@@ -374,15 +414,18 @@ int do_settimeofday(const struct timespec *tv)
374 414
375 write_seqlock_irqsave(&timekeeper.lock, flags); 415 write_seqlock_irqsave(&timekeeper.lock, flags);
376 416
377 timekeeping_forward_now(); 417 timekeeping_forward_now(&timekeeper);
418
419 xt = tk_xtime(&timekeeper);
420 ts_delta.tv_sec = tv->tv_sec - xt.tv_sec;
421 ts_delta.tv_nsec = tv->tv_nsec - xt.tv_nsec;
378 422
379 ts_delta.tv_sec = tv->tv_sec - timekeeper.xtime.tv_sec;
380 ts_delta.tv_nsec = tv->tv_nsec - timekeeper.xtime.tv_nsec;
381 timekeeper.wall_to_monotonic = 423 timekeeper.wall_to_monotonic =
382 timespec_sub(timekeeper.wall_to_monotonic, ts_delta); 424 timespec_sub(timekeeper.wall_to_monotonic, ts_delta);
383 425
384 timekeeper.xtime = *tv; 426 tk_set_xtime(&timekeeper, tv);
385 timekeeping_update(true); 427
428 timekeeping_update(&timekeeper, true);
386 429
387 write_sequnlock_irqrestore(&timekeeper.lock, flags); 430 write_sequnlock_irqrestore(&timekeeper.lock, flags);
388 431
@@ -409,13 +452,14 @@ int timekeeping_inject_offset(struct timespec *ts)
409 452
410 write_seqlock_irqsave(&timekeeper.lock, flags); 453 write_seqlock_irqsave(&timekeeper.lock, flags);
411 454
412 timekeeping_forward_now(); 455 timekeeping_forward_now(&timekeeper);
413 456
414 timekeeper.xtime = timespec_add(timekeeper.xtime, *ts); 457
458 tk_xtime_add(&timekeeper, ts);
415 timekeeper.wall_to_monotonic = 459 timekeeper.wall_to_monotonic =
416 timespec_sub(timekeeper.wall_to_monotonic, *ts); 460 timespec_sub(timekeeper.wall_to_monotonic, *ts);
417 461
418 timekeeping_update(true); 462 timekeeping_update(&timekeeper, true);
419 463
420 write_sequnlock_irqrestore(&timekeeper.lock, flags); 464 write_sequnlock_irqrestore(&timekeeper.lock, flags);
421 465
@@ -440,14 +484,14 @@ static int change_clocksource(void *data)
440 484
441 write_seqlock_irqsave(&timekeeper.lock, flags); 485 write_seqlock_irqsave(&timekeeper.lock, flags);
442 486
443 timekeeping_forward_now(); 487 timekeeping_forward_now(&timekeeper);
444 if (!new->enable || new->enable(new) == 0) { 488 if (!new->enable || new->enable(new) == 0) {
445 old = timekeeper.clock; 489 old = timekeeper.clock;
446 timekeeper_setup_internals(new); 490 tk_setup_internals(&timekeeper, new);
447 if (old->disable) 491 if (old->disable)
448 old->disable(old); 492 old->disable(old);
449 } 493 }
450 timekeeping_update(true); 494 timekeeping_update(&timekeeper, true);
451 495
452 write_sequnlock_irqrestore(&timekeeper.lock, flags); 496 write_sequnlock_irqrestore(&timekeeper.lock, flags);
453 497
@@ -497,7 +541,7 @@ void getrawmonotonic(struct timespec *ts)
497 541
498 do { 542 do {
499 seq = read_seqbegin(&timekeeper.lock); 543 seq = read_seqbegin(&timekeeper.lock);
500 nsecs = timekeeping_get_ns_raw(); 544 nsecs = timekeeping_get_ns_raw(&timekeeper);
501 *ts = timekeeper.raw_time; 545 *ts = timekeeper.raw_time;
502 546
503 } while (read_seqretry(&timekeeper.lock, seq)); 547 } while (read_seqretry(&timekeeper.lock, seq));
@@ -532,6 +576,7 @@ u64 timekeeping_max_deferment(void)
532{ 576{
533 unsigned long seq; 577 unsigned long seq;
534 u64 ret; 578 u64 ret;
579
535 do { 580 do {
536 seq = read_seqbegin(&timekeeper.lock); 581 seq = read_seqbegin(&timekeeper.lock);
537 582
@@ -592,18 +637,17 @@ void __init timekeeping_init(void)
592 clock = clocksource_default_clock(); 637 clock = clocksource_default_clock();
593 if (clock->enable) 638 if (clock->enable)
594 clock->enable(clock); 639 clock->enable(clock);
595 timekeeper_setup_internals(clock); 640 tk_setup_internals(&timekeeper, clock);
596 641
597 timekeeper.xtime.tv_sec = now.tv_sec; 642 tk_set_xtime(&timekeeper, &now);
598 timekeeper.xtime.tv_nsec = now.tv_nsec;
599 timekeeper.raw_time.tv_sec = 0; 643 timekeeper.raw_time.tv_sec = 0;
600 timekeeper.raw_time.tv_nsec = 0; 644 timekeeper.raw_time.tv_nsec = 0;
601 if (boot.tv_sec == 0 && boot.tv_nsec == 0) { 645 if (boot.tv_sec == 0 && boot.tv_nsec == 0)
602 boot.tv_sec = timekeeper.xtime.tv_sec; 646 boot = tk_xtime(&timekeeper);
603 boot.tv_nsec = timekeeper.xtime.tv_nsec; 647
604 }
605 set_normalized_timespec(&timekeeper.wall_to_monotonic, 648 set_normalized_timespec(&timekeeper.wall_to_monotonic,
606 -boot.tv_sec, -boot.tv_nsec); 649 -boot.tv_sec, -boot.tv_nsec);
650 update_rt_offset(&timekeeper);
607 timekeeper.total_sleep_time.tv_sec = 0; 651 timekeeper.total_sleep_time.tv_sec = 0;
608 timekeeper.total_sleep_time.tv_nsec = 0; 652 timekeeper.total_sleep_time.tv_nsec = 0;
609 write_sequnlock_irqrestore(&timekeeper.lock, flags); 653 write_sequnlock_irqrestore(&timekeeper.lock, flags);
@@ -612,6 +656,12 @@ void __init timekeeping_init(void)
612/* time in seconds when suspend began */ 656/* time in seconds when suspend began */
613static struct timespec timekeeping_suspend_time; 657static struct timespec timekeeping_suspend_time;
614 658
659static void update_sleep_time(struct timespec t)
660{
661 timekeeper.total_sleep_time = t;
662 timekeeper.offs_boot = timespec_to_ktime(t);
663}
664
615/** 665/**
616 * __timekeeping_inject_sleeptime - Internal function to add sleep interval 666 * __timekeeping_inject_sleeptime - Internal function to add sleep interval
617 * @delta: pointer to a timespec delta value 667 * @delta: pointer to a timespec delta value
@@ -619,7 +669,8 @@ static struct timespec timekeeping_suspend_time;
619 * Takes a timespec offset measuring a suspend interval and properly 669 * Takes a timespec offset measuring a suspend interval and properly
620 * adds the sleep offset to the timekeeping variables. 670 * adds the sleep offset to the timekeeping variables.
621 */ 671 */
622static void __timekeeping_inject_sleeptime(struct timespec *delta) 672static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
673 struct timespec *delta)
623{ 674{
624 if (!timespec_valid(delta)) { 675 if (!timespec_valid(delta)) {
625 printk(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid " 676 printk(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid "
@@ -627,11 +678,9 @@ static void __timekeeping_inject_sleeptime(struct timespec *delta)
627 return; 678 return;
628 } 679 }
629 680
630 timekeeper.xtime = timespec_add(timekeeper.xtime, *delta); 681 tk_xtime_add(tk, delta);
631 timekeeper.wall_to_monotonic = 682 tk->wall_to_monotonic = timespec_sub(tk->wall_to_monotonic, *delta);
632 timespec_sub(timekeeper.wall_to_monotonic, *delta); 683 update_sleep_time(timespec_add(tk->total_sleep_time, *delta));
633 timekeeper.total_sleep_time = timespec_add(
634 timekeeper.total_sleep_time, *delta);
635} 684}
636 685
637 686
@@ -657,11 +706,11 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
657 706
658 write_seqlock_irqsave(&timekeeper.lock, flags); 707 write_seqlock_irqsave(&timekeeper.lock, flags);
659 708
660 timekeeping_forward_now(); 709 timekeeping_forward_now(&timekeeper);
661 710
662 __timekeeping_inject_sleeptime(delta); 711 __timekeeping_inject_sleeptime(&timekeeper, delta);
663 712
664 timekeeping_update(true); 713 timekeeping_update(&timekeeper, true);
665 714
666 write_sequnlock_irqrestore(&timekeeper.lock, flags); 715 write_sequnlock_irqrestore(&timekeeper.lock, flags);
667 716
@@ -690,12 +739,13 @@ static void timekeeping_resume(void)
690 739
691 if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { 740 if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) {
692 ts = timespec_sub(ts, timekeeping_suspend_time); 741 ts = timespec_sub(ts, timekeeping_suspend_time);
693 __timekeeping_inject_sleeptime(&ts); 742 __timekeeping_inject_sleeptime(&timekeeper, &ts);
694 } 743 }
695 /* re-base the last cycle value */ 744 /* re-base the last cycle value */
696 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); 745 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock);
697 timekeeper.ntp_error = 0; 746 timekeeper.ntp_error = 0;
698 timekeeping_suspended = 0; 747 timekeeping_suspended = 0;
748 timekeeping_update(&timekeeper, false);
699 write_sequnlock_irqrestore(&timekeeper.lock, flags); 749 write_sequnlock_irqrestore(&timekeeper.lock, flags);
700 750
701 touch_softlockup_watchdog(); 751 touch_softlockup_watchdog();
@@ -715,7 +765,7 @@ static int timekeeping_suspend(void)
715 read_persistent_clock(&timekeeping_suspend_time); 765 read_persistent_clock(&timekeeping_suspend_time);
716 766
717 write_seqlock_irqsave(&timekeeper.lock, flags); 767 write_seqlock_irqsave(&timekeeper.lock, flags);
718 timekeeping_forward_now(); 768 timekeeping_forward_now(&timekeeper);
719 timekeeping_suspended = 1; 769 timekeeping_suspended = 1;
720 770
721 /* 771 /*
@@ -724,7 +774,7 @@ static int timekeeping_suspend(void)
724 * try to compensate so the difference in system time 774 * try to compensate so the difference in system time
725 * and persistent_clock time stays close to constant. 775 * and persistent_clock time stays close to constant.
726 */ 776 */
727 delta = timespec_sub(timekeeper.xtime, timekeeping_suspend_time); 777 delta = timespec_sub(tk_xtime(&timekeeper), timekeeping_suspend_time);
728 delta_delta = timespec_sub(delta, old_delta); 778 delta_delta = timespec_sub(delta, old_delta);
729 if (abs(delta_delta.tv_sec) >= 2) { 779 if (abs(delta_delta.tv_sec) >= 2) {
730 /* 780 /*
@@ -763,7 +813,8 @@ device_initcall(timekeeping_init_ops);
763 * If the error is already larger, we look ahead even further 813 * If the error is already larger, we look ahead even further
764 * to compensate for late or lost adjustments. 814 * to compensate for late or lost adjustments.
765 */ 815 */
766static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval, 816static __always_inline int timekeeping_bigadjust(struct timekeeper *tk,
817 s64 error, s64 *interval,
767 s64 *offset) 818 s64 *offset)
768{ 819{
769 s64 tick_error, i; 820 s64 tick_error, i;
@@ -779,7 +830,7 @@ static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval,
779 * here. This is tuned so that an error of about 1 msec is adjusted 830 * here. This is tuned so that an error of about 1 msec is adjusted
780 * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks). 831 * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks).
781 */ 832 */
782 error2 = timekeeper.ntp_error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ); 833 error2 = tk->ntp_error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ);
783 error2 = abs(error2); 834 error2 = abs(error2);
784 for (look_ahead = 0; error2 > 0; look_ahead++) 835 for (look_ahead = 0; error2 > 0; look_ahead++)
785 error2 >>= 2; 836 error2 >>= 2;
@@ -788,8 +839,8 @@ static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval,
788 * Now calculate the error in (1 << look_ahead) ticks, but first 839 * Now calculate the error in (1 << look_ahead) ticks, but first
789 * remove the single look ahead already included in the error. 840 * remove the single look ahead already included in the error.
790 */ 841 */
791 tick_error = ntp_tick_length() >> (timekeeper.ntp_error_shift + 1); 842 tick_error = ntp_tick_length() >> (tk->ntp_error_shift + 1);
792 tick_error -= timekeeper.xtime_interval >> 1; 843 tick_error -= tk->xtime_interval >> 1;
793 error = ((error - tick_error) >> look_ahead) + tick_error; 844 error = ((error - tick_error) >> look_ahead) + tick_error;
794 845
795 /* Finally calculate the adjustment shift value. */ 846 /* Finally calculate the adjustment shift value. */
@@ -814,9 +865,9 @@ static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval,
814 * this is optimized for the most common adjustments of -1,0,1, 865 * this is optimized for the most common adjustments of -1,0,1,
815 * for other values we can do a bit more work. 866 * for other values we can do a bit more work.
816 */ 867 */
817static void timekeeping_adjust(s64 offset) 868static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
818{ 869{
819 s64 error, interval = timekeeper.cycle_interval; 870 s64 error, interval = tk->cycle_interval;
820 int adj; 871 int adj;
821 872
822 /* 873 /*
@@ -832,7 +883,7 @@ static void timekeeping_adjust(s64 offset)
832 * 883 *
833 * Note: It does not "save" on aggravation when reading the code. 884 * Note: It does not "save" on aggravation when reading the code.
834 */ 885 */
835 error = timekeeper.ntp_error >> (timekeeper.ntp_error_shift - 1); 886 error = tk->ntp_error >> (tk->ntp_error_shift - 1);
836 if (error > interval) { 887 if (error > interval) {
837 /* 888 /*
838 * We now divide error by 4(via shift), which checks if 889 * We now divide error by 4(via shift), which checks if
@@ -854,7 +905,8 @@ static void timekeeping_adjust(s64 offset)
854 if (likely(error <= interval)) 905 if (likely(error <= interval))
855 adj = 1; 906 adj = 1;
856 else 907 else
857 adj = timekeeping_bigadjust(error, &interval, &offset); 908 adj = timekeeping_bigadjust(tk, error, &interval,
909 &offset);
858 } else if (error < -interval) { 910 } else if (error < -interval) {
859 /* See comment above, this is just switched for the negative */ 911 /* See comment above, this is just switched for the negative */
860 error >>= 2; 912 error >>= 2;
@@ -863,18 +915,17 @@ static void timekeeping_adjust(s64 offset)
863 interval = -interval; 915 interval = -interval;
864 offset = -offset; 916 offset = -offset;
865 } else 917 } else
866 adj = timekeeping_bigadjust(error, &interval, &offset); 918 adj = timekeeping_bigadjust(tk, error, &interval,
867 } else /* No adjustment needed */ 919 &offset);
920 } else
868 return; 921 return;
869 922
870 if (unlikely(timekeeper.clock->maxadj && 923 if (unlikely(tk->clock->maxadj &&
871 (timekeeper.mult + adj > 924 (tk->mult + adj > tk->clock->mult + tk->clock->maxadj))) {
872 timekeeper.clock->mult + timekeeper.clock->maxadj))) {
873 printk_once(KERN_WARNING 925 printk_once(KERN_WARNING
874 "Adjusting %s more than 11%% (%ld vs %ld)\n", 926 "Adjusting %s more than 11%% (%ld vs %ld)\n",
875 timekeeper.clock->name, (long)timekeeper.mult + adj, 927 tk->clock->name, (long)tk->mult + adj,
876 (long)timekeeper.clock->mult + 928 (long)tk->clock->mult + tk->clock->maxadj);
877 timekeeper.clock->maxadj);
878 } 929 }
879 /* 930 /*
880 * So the following can be confusing. 931 * So the following can be confusing.
@@ -925,11 +976,60 @@ static void timekeeping_adjust(s64 offset)
925 * 976 *
926 * XXX - TODO: Doc ntp_error calculation. 977 * XXX - TODO: Doc ntp_error calculation.
927 */ 978 */
928 timekeeper.mult += adj; 979 tk->mult += adj;
929 timekeeper.xtime_interval += interval; 980 tk->xtime_interval += interval;
930 timekeeper.xtime_nsec -= offset; 981 tk->xtime_nsec -= offset;
931 timekeeper.ntp_error -= (interval - offset) << 982 tk->ntp_error -= (interval - offset) << tk->ntp_error_shift;
932 timekeeper.ntp_error_shift; 983
984 /*
985 * It may be possible that when we entered this function, xtime_nsec
986 * was very small. Further, if we're slightly speeding the clocksource
987 * in the code above, its possible the required corrective factor to
988 * xtime_nsec could cause it to underflow.
989 *
990 * Now, since we already accumulated the second, cannot simply roll
991 * the accumulated second back, since the NTP subsystem has been
992 * notified via second_overflow. So instead we push xtime_nsec forward
993 * by the amount we underflowed, and add that amount into the error.
994 *
995 * We'll correct this error next time through this function, when
996 * xtime_nsec is not as small.
997 */
998 if (unlikely((s64)tk->xtime_nsec < 0)) {
999 s64 neg = -(s64)tk->xtime_nsec;
1000 tk->xtime_nsec = 0;
1001 tk->ntp_error += neg << tk->ntp_error_shift;
1002 }
1003
1004}
1005
1006
1007/**
1008 * accumulate_nsecs_to_secs - Accumulates nsecs into secs
1009 *
1010 * Helper function that accumulates a the nsecs greater then a second
1011 * from the xtime_nsec field to the xtime_secs field.
1012 * It also calls into the NTP code to handle leapsecond processing.
1013 *
1014 */
1015static inline void accumulate_nsecs_to_secs(struct timekeeper *tk)
1016{
1017 u64 nsecps = (u64)NSEC_PER_SEC << tk->shift;
1018
1019 while (tk->xtime_nsec >= nsecps) {
1020 int leap;
1021
1022 tk->xtime_nsec -= nsecps;
1023 tk->xtime_sec++;
1024
1025 /* Figure out if its a leap sec and apply if needed */
1026 leap = second_overflow(tk->xtime_sec);
1027 tk->xtime_sec += leap;
1028 tk->wall_to_monotonic.tv_sec -= leap;
1029 if (leap)
1030 clock_was_set_delayed();
1031
1032 }
933} 1033}
934 1034
935 1035
@@ -942,44 +1042,36 @@ static void timekeeping_adjust(s64 offset)
942 * 1042 *
943 * Returns the unconsumed cycles. 1043 * Returns the unconsumed cycles.
944 */ 1044 */
945static cycle_t logarithmic_accumulation(cycle_t offset, int shift) 1045static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
1046 u32 shift)
946{ 1047{
947 u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift;
948 u64 raw_nsecs; 1048 u64 raw_nsecs;
949 1049
950 /* If the offset is smaller than a shifted interval, do nothing */ 1050 /* If the offset is smaller then a shifted interval, do nothing */
951 if (offset < timekeeper.cycle_interval<<shift) 1051 if (offset < tk->cycle_interval<<shift)
952 return offset; 1052 return offset;
953 1053
954 /* Accumulate one shifted interval */ 1054 /* Accumulate one shifted interval */
955 offset -= timekeeper.cycle_interval << shift; 1055 offset -= tk->cycle_interval << shift;
956 timekeeper.clock->cycle_last += timekeeper.cycle_interval << shift; 1056 tk->clock->cycle_last += tk->cycle_interval << shift;
957 1057
958 timekeeper.xtime_nsec += timekeeper.xtime_interval << shift; 1058 tk->xtime_nsec += tk->xtime_interval << shift;
959 while (timekeeper.xtime_nsec >= nsecps) { 1059 accumulate_nsecs_to_secs(tk);
960 int leap;
961 timekeeper.xtime_nsec -= nsecps;
962 timekeeper.xtime.tv_sec++;
963 leap = second_overflow(timekeeper.xtime.tv_sec);
964 timekeeper.xtime.tv_sec += leap;
965 timekeeper.wall_to_monotonic.tv_sec -= leap;
966 }
967 1060
968 /* Accumulate raw time */ 1061 /* Accumulate raw time */
969 raw_nsecs = timekeeper.raw_interval << shift; 1062 raw_nsecs = tk->raw_interval << shift;
970 raw_nsecs += timekeeper.raw_time.tv_nsec; 1063 raw_nsecs += tk->raw_time.tv_nsec;
971 if (raw_nsecs >= NSEC_PER_SEC) { 1064 if (raw_nsecs >= NSEC_PER_SEC) {
972 u64 raw_secs = raw_nsecs; 1065 u64 raw_secs = raw_nsecs;
973 raw_nsecs = do_div(raw_secs, NSEC_PER_SEC); 1066 raw_nsecs = do_div(raw_secs, NSEC_PER_SEC);
974 timekeeper.raw_time.tv_sec += raw_secs; 1067 tk->raw_time.tv_sec += raw_secs;
975 } 1068 }
976 timekeeper.raw_time.tv_nsec = raw_nsecs; 1069 tk->raw_time.tv_nsec = raw_nsecs;
977 1070
978 /* Accumulate error between NTP and clock interval */ 1071 /* Accumulate error between NTP and clock interval */
979 timekeeper.ntp_error += ntp_tick_length() << shift; 1072 tk->ntp_error += ntp_tick_length() << shift;
980 timekeeper.ntp_error -= 1073 tk->ntp_error -= (tk->xtime_interval + tk->xtime_remainder) <<
981 (timekeeper.xtime_interval + timekeeper.xtime_remainder) << 1074 (tk->ntp_error_shift + shift);
982 (timekeeper.ntp_error_shift + shift);
983 1075
984 return offset; 1076 return offset;
985} 1077}
@@ -995,6 +1087,7 @@ static void update_wall_time(void)
995 cycle_t offset; 1087 cycle_t offset;
996 int shift = 0, maxshift; 1088 int shift = 0, maxshift;
997 unsigned long flags; 1089 unsigned long flags;
1090 s64 remainder;
998 1091
999 write_seqlock_irqsave(&timekeeper.lock, flags); 1092 write_seqlock_irqsave(&timekeeper.lock, flags);
1000 1093
@@ -1009,8 +1102,6 @@ static void update_wall_time(void)
1009#else 1102#else
1010 offset = (clock->read(clock) - clock->cycle_last) & clock->mask; 1103 offset = (clock->read(clock) - clock->cycle_last) & clock->mask;
1011#endif 1104#endif
1012 timekeeper.xtime_nsec = (s64)timekeeper.xtime.tv_nsec <<
1013 timekeeper.shift;
1014 1105
1015 /* 1106 /*
1016 * With NO_HZ we may have to accumulate many cycle_intervals 1107 * With NO_HZ we may have to accumulate many cycle_intervals
@@ -1026,62 +1117,36 @@ static void update_wall_time(void)
1026 maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1; 1117 maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1;
1027 shift = min(shift, maxshift); 1118 shift = min(shift, maxshift);
1028 while (offset >= timekeeper.cycle_interval) { 1119 while (offset >= timekeeper.cycle_interval) {
1029 offset = logarithmic_accumulation(offset, shift); 1120 offset = logarithmic_accumulation(&timekeeper, offset, shift);
1030 if(offset < timekeeper.cycle_interval<<shift) 1121 if(offset < timekeeper.cycle_interval<<shift)
1031 shift--; 1122 shift--;
1032 } 1123 }
1033 1124
1034 /* correct the clock when NTP error is too big */ 1125 /* correct the clock when NTP error is too big */
1035 timekeeping_adjust(offset); 1126 timekeeping_adjust(&timekeeper, offset);
1036
1037 /*
1038 * Since in the loop above, we accumulate any amount of time
1039 * in xtime_nsec over a second into xtime.tv_sec, its possible for
1040 * xtime_nsec to be fairly small after the loop. Further, if we're
1041 * slightly speeding the clocksource up in timekeeping_adjust(),
1042 * its possible the required corrective factor to xtime_nsec could
1043 * cause it to underflow.
1044 *
1045 * Now, we cannot simply roll the accumulated second back, since
1046 * the NTP subsystem has been notified via second_overflow. So
1047 * instead we push xtime_nsec forward by the amount we underflowed,
1048 * and add that amount into the error.
1049 *
1050 * We'll correct this error next time through this function, when
1051 * xtime_nsec is not as small.
1052 */
1053 if (unlikely((s64)timekeeper.xtime_nsec < 0)) {
1054 s64 neg = -(s64)timekeeper.xtime_nsec;
1055 timekeeper.xtime_nsec = 0;
1056 timekeeper.ntp_error += neg << timekeeper.ntp_error_shift;
1057 }
1058 1127
1059 1128
1060 /* 1129 /*
1061 * Store full nanoseconds into xtime after rounding it up and 1130 * Store only full nanoseconds into xtime_nsec after rounding
1062 * add the remainder to the error difference. 1131 * it up and add the remainder to the error difference.
1063 */ 1132 * XXX - This is necessary to avoid small 1ns inconsistnecies caused
1064 timekeeper.xtime.tv_nsec = ((s64)timekeeper.xtime_nsec >> 1133 * by truncating the remainder in vsyscalls. However, it causes
1065 timekeeper.shift) + 1; 1134 * additional work to be done in timekeeping_adjust(). Once
1066 timekeeper.xtime_nsec -= (s64)timekeeper.xtime.tv_nsec << 1135 * the vsyscall implementations are converted to use xtime_nsec
1067 timekeeper.shift; 1136 * (shifted nanoseconds), this can be killed.
1068 timekeeper.ntp_error += timekeeper.xtime_nsec << 1137 */
1069 timekeeper.ntp_error_shift; 1138 remainder = timekeeper.xtime_nsec & ((1 << timekeeper.shift) - 1);
1139 timekeeper.xtime_nsec -= remainder;
1140 timekeeper.xtime_nsec += 1 << timekeeper.shift;
1141 timekeeper.ntp_error += remainder << timekeeper.ntp_error_shift;
1070 1142
1071 /* 1143 /*
1072 * Finally, make sure that after the rounding 1144 * Finally, make sure that after the rounding
1073 * xtime.tv_nsec isn't larger than NSEC_PER_SEC 1145 * xtime_nsec isn't larger than NSEC_PER_SEC
1074 */ 1146 */
1075 if (unlikely(timekeeper.xtime.tv_nsec >= NSEC_PER_SEC)) { 1147 accumulate_nsecs_to_secs(&timekeeper);
1076 int leap;
1077 timekeeper.xtime.tv_nsec -= NSEC_PER_SEC;
1078 timekeeper.xtime.tv_sec++;
1079 leap = second_overflow(timekeeper.xtime.tv_sec);
1080 timekeeper.xtime.tv_sec += leap;
1081 timekeeper.wall_to_monotonic.tv_sec -= leap;
1082 }
1083 1148
1084 timekeeping_update(false); 1149 timekeeping_update(&timekeeper, false);
1085 1150
1086out: 1151out:
1087 write_sequnlock_irqrestore(&timekeeper.lock, flags); 1152 write_sequnlock_irqrestore(&timekeeper.lock, flags);
@@ -1126,21 +1191,20 @@ void get_monotonic_boottime(struct timespec *ts)
1126{ 1191{
1127 struct timespec tomono, sleep; 1192 struct timespec tomono, sleep;
1128 unsigned int seq; 1193 unsigned int seq;
1129 s64 nsecs;
1130 1194
1131 WARN_ON(timekeeping_suspended); 1195 WARN_ON(timekeeping_suspended);
1132 1196
1133 do { 1197 do {
1134 seq = read_seqbegin(&timekeeper.lock); 1198 seq = read_seqbegin(&timekeeper.lock);
1135 *ts = timekeeper.xtime; 1199 ts->tv_sec = timekeeper.xtime_sec;
1200 ts->tv_nsec = timekeeping_get_ns(&timekeeper);
1136 tomono = timekeeper.wall_to_monotonic; 1201 tomono = timekeeper.wall_to_monotonic;
1137 sleep = timekeeper.total_sleep_time; 1202 sleep = timekeeper.total_sleep_time;
1138 nsecs = timekeeping_get_ns();
1139 1203
1140 } while (read_seqretry(&timekeeper.lock, seq)); 1204 } while (read_seqretry(&timekeeper.lock, seq));
1141 1205
1142 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec, 1206 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec,
1143 ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec + nsecs); 1207 ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec);
1144} 1208}
1145EXPORT_SYMBOL_GPL(get_monotonic_boottime); 1209EXPORT_SYMBOL_GPL(get_monotonic_boottime);
1146 1210
@@ -1173,13 +1237,13 @@ EXPORT_SYMBOL_GPL(monotonic_to_bootbased);
1173 1237
1174unsigned long get_seconds(void) 1238unsigned long get_seconds(void)
1175{ 1239{
1176 return timekeeper.xtime.tv_sec; 1240 return timekeeper.xtime_sec;
1177} 1241}
1178EXPORT_SYMBOL(get_seconds); 1242EXPORT_SYMBOL(get_seconds);
1179 1243
1180struct timespec __current_kernel_time(void) 1244struct timespec __current_kernel_time(void)
1181{ 1245{
1182 return timekeeper.xtime; 1246 return tk_xtime(&timekeeper);
1183} 1247}
1184 1248
1185struct timespec current_kernel_time(void) 1249struct timespec current_kernel_time(void)
@@ -1190,7 +1254,7 @@ struct timespec current_kernel_time(void)
1190 do { 1254 do {
1191 seq = read_seqbegin(&timekeeper.lock); 1255 seq = read_seqbegin(&timekeeper.lock);
1192 1256
1193 now = timekeeper.xtime; 1257 now = tk_xtime(&timekeeper);
1194 } while (read_seqretry(&timekeeper.lock, seq)); 1258 } while (read_seqretry(&timekeeper.lock, seq));
1195 1259
1196 return now; 1260 return now;
@@ -1205,7 +1269,7 @@ struct timespec get_monotonic_coarse(void)
1205 do { 1269 do {
1206 seq = read_seqbegin(&timekeeper.lock); 1270 seq = read_seqbegin(&timekeeper.lock);
1207 1271
1208 now = timekeeper.xtime; 1272 now = tk_xtime(&timekeeper);
1209 mono = timekeeper.wall_to_monotonic; 1273 mono = timekeeper.wall_to_monotonic;
1210 } while (read_seqretry(&timekeeper.lock, seq)); 1274 } while (read_seqretry(&timekeeper.lock, seq));
1211 1275
@@ -1240,12 +1304,43 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
1240 1304
1241 do { 1305 do {
1242 seq = read_seqbegin(&timekeeper.lock); 1306 seq = read_seqbegin(&timekeeper.lock);
1243 *xtim = timekeeper.xtime; 1307 *xtim = tk_xtime(&timekeeper);
1244 *wtom = timekeeper.wall_to_monotonic; 1308 *wtom = timekeeper.wall_to_monotonic;
1245 *sleep = timekeeper.total_sleep_time; 1309 *sleep = timekeeper.total_sleep_time;
1246 } while (read_seqretry(&timekeeper.lock, seq)); 1310 } while (read_seqretry(&timekeeper.lock, seq));
1247} 1311}
1248 1312
1313#ifdef CONFIG_HIGH_RES_TIMERS
1314/**
1315 * ktime_get_update_offsets - hrtimer helper
1316 * @offs_real: pointer to storage for monotonic -> realtime offset
1317 * @offs_boot: pointer to storage for monotonic -> boottime offset
1318 *
1319 * Returns current monotonic time and updates the offsets
1320 * Called from hrtimer_interupt() or retrigger_next_event()
1321 */
1322ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot)
1323{
1324 ktime_t now;
1325 unsigned int seq;
1326 u64 secs, nsecs;
1327
1328 do {
1329 seq = read_seqbegin(&timekeeper.lock);
1330
1331 secs = timekeeper.xtime_sec;
1332 nsecs = timekeeping_get_ns(&timekeeper);
1333
1334 *offs_real = timekeeper.offs_real;
1335 *offs_boot = timekeeper.offs_boot;
1336 } while (read_seqretry(&timekeeper.lock, seq));
1337
1338 now = ktime_add_ns(ktime_set(secs, 0), nsecs);
1339 now = ktime_sub(now, *offs_real);
1340 return now;
1341}
1342#endif
1343
1249/** 1344/**
1250 * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format 1345 * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format
1251 */ 1346 */
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 3258455549f4..af5a7e9f164b 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -167,7 +167,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
167 { 167 {
168 struct tick_sched *ts = tick_get_tick_sched(cpu); 168 struct tick_sched *ts = tick_get_tick_sched(cpu);
169 P(nohz_mode); 169 P(nohz_mode);
170 P_ns(idle_tick); 170 P_ns(last_tick);
171 P(tick_stopped); 171 P(tick_stopped);
172 P(idle_jiffies); 172 P(idle_jiffies);
173 P(idle_calls); 173 P(idle_calls);
@@ -259,7 +259,7 @@ static int timer_list_show(struct seq_file *m, void *v)
259 u64 now = ktime_to_ns(ktime_get()); 259 u64 now = ktime_to_ns(ktime_get());
260 int cpu; 260 int cpu;
261 261
262 SEQ_printf(m, "Timer List Version: v0.6\n"); 262 SEQ_printf(m, "Timer List Version: v0.7\n");
263 SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); 263 SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
264 SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); 264 SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
265 265
diff --git a/kernel/timer.c b/kernel/timer.c
index 6ec7e7e0db43..a61c09374eba 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -77,6 +77,7 @@ struct tvec_base {
77 struct timer_list *running_timer; 77 struct timer_list *running_timer;
78 unsigned long timer_jiffies; 78 unsigned long timer_jiffies;
79 unsigned long next_timer; 79 unsigned long next_timer;
80 unsigned long active_timers;
80 struct tvec_root tv1; 81 struct tvec_root tv1;
81 struct tvec tv2; 82 struct tvec tv2;
82 struct tvec tv3; 83 struct tvec tv3;
@@ -330,7 +331,8 @@ void set_timer_slack(struct timer_list *timer, int slack_hz)
330} 331}
331EXPORT_SYMBOL_GPL(set_timer_slack); 332EXPORT_SYMBOL_GPL(set_timer_slack);
332 333
333static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) 334static void
335__internal_add_timer(struct tvec_base *base, struct timer_list *timer)
334{ 336{
335 unsigned long expires = timer->expires; 337 unsigned long expires = timer->expires;
336 unsigned long idx = expires - base->timer_jiffies; 338 unsigned long idx = expires - base->timer_jiffies;
@@ -372,6 +374,19 @@ static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
372 list_add_tail(&timer->entry, vec); 374 list_add_tail(&timer->entry, vec);
373} 375}
374 376
377static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
378{
379 __internal_add_timer(base, timer);
380 /*
381 * Update base->active_timers and base->next_timer
382 */
383 if (!tbase_get_deferrable(timer->base)) {
384 if (time_before(timer->expires, base->next_timer))
385 base->next_timer = timer->expires;
386 base->active_timers++;
387 }
388}
389
375#ifdef CONFIG_TIMER_STATS 390#ifdef CONFIG_TIMER_STATS
376void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr) 391void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr)
377{ 392{
@@ -654,8 +669,7 @@ void init_timer_deferrable_key(struct timer_list *timer,
654} 669}
655EXPORT_SYMBOL(init_timer_deferrable_key); 670EXPORT_SYMBOL(init_timer_deferrable_key);
656 671
657static inline void detach_timer(struct timer_list *timer, 672static inline void detach_timer(struct timer_list *timer, bool clear_pending)
658 int clear_pending)
659{ 673{
660 struct list_head *entry = &timer->entry; 674 struct list_head *entry = &timer->entry;
661 675
@@ -667,6 +681,29 @@ static inline void detach_timer(struct timer_list *timer,
667 entry->prev = LIST_POISON2; 681 entry->prev = LIST_POISON2;
668} 682}
669 683
684static inline void
685detach_expired_timer(struct timer_list *timer, struct tvec_base *base)
686{
687 detach_timer(timer, true);
688 if (!tbase_get_deferrable(timer->base))
689 timer->base->active_timers--;
690}
691
692static int detach_if_pending(struct timer_list *timer, struct tvec_base *base,
693 bool clear_pending)
694{
695 if (!timer_pending(timer))
696 return 0;
697
698 detach_timer(timer, clear_pending);
699 if (!tbase_get_deferrable(timer->base)) {
700 timer->base->active_timers--;
701 if (timer->expires == base->next_timer)
702 base->next_timer = base->timer_jiffies;
703 }
704 return 1;
705}
706
670/* 707/*
671 * We are using hashed locking: holding per_cpu(tvec_bases).lock 708 * We are using hashed locking: holding per_cpu(tvec_bases).lock
672 * means that all timers which are tied to this base via timer->base are 709 * means that all timers which are tied to this base via timer->base are
@@ -712,16 +749,9 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
712 749
713 base = lock_timer_base(timer, &flags); 750 base = lock_timer_base(timer, &flags);
714 751
715 if (timer_pending(timer)) { 752 ret = detach_if_pending(timer, base, false);
716 detach_timer(timer, 0); 753 if (!ret && pending_only)
717 if (timer->expires == base->next_timer && 754 goto out_unlock;
718 !tbase_get_deferrable(timer->base))
719 base->next_timer = base->timer_jiffies;
720 ret = 1;
721 } else {
722 if (pending_only)
723 goto out_unlock;
724 }
725 755
726 debug_activate(timer, expires); 756 debug_activate(timer, expires);
727 757
@@ -752,9 +782,6 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
752 } 782 }
753 783
754 timer->expires = expires; 784 timer->expires = expires;
755 if (time_before(timer->expires, base->next_timer) &&
756 !tbase_get_deferrable(timer->base))
757 base->next_timer = timer->expires;
758 internal_add_timer(base, timer); 785 internal_add_timer(base, timer);
759 786
760out_unlock: 787out_unlock:
@@ -920,9 +947,6 @@ void add_timer_on(struct timer_list *timer, int cpu)
920 spin_lock_irqsave(&base->lock, flags); 947 spin_lock_irqsave(&base->lock, flags);
921 timer_set_base(timer, base); 948 timer_set_base(timer, base);
922 debug_activate(timer, timer->expires); 949 debug_activate(timer, timer->expires);
923 if (time_before(timer->expires, base->next_timer) &&
924 !tbase_get_deferrable(timer->base))
925 base->next_timer = timer->expires;
926 internal_add_timer(base, timer); 950 internal_add_timer(base, timer);
927 /* 951 /*
928 * Check whether the other CPU is idle and needs to be 952 * Check whether the other CPU is idle and needs to be
@@ -959,13 +983,7 @@ int del_timer(struct timer_list *timer)
959 timer_stats_timer_clear_start_info(timer); 983 timer_stats_timer_clear_start_info(timer);
960 if (timer_pending(timer)) { 984 if (timer_pending(timer)) {
961 base = lock_timer_base(timer, &flags); 985 base = lock_timer_base(timer, &flags);
962 if (timer_pending(timer)) { 986 ret = detach_if_pending(timer, base, true);
963 detach_timer(timer, 1);
964 if (timer->expires == base->next_timer &&
965 !tbase_get_deferrable(timer->base))
966 base->next_timer = base->timer_jiffies;
967 ret = 1;
968 }
969 spin_unlock_irqrestore(&base->lock, flags); 987 spin_unlock_irqrestore(&base->lock, flags);
970 } 988 }
971 989
@@ -990,19 +1008,10 @@ int try_to_del_timer_sync(struct timer_list *timer)
990 1008
991 base = lock_timer_base(timer, &flags); 1009 base = lock_timer_base(timer, &flags);
992 1010
993 if (base->running_timer == timer) 1011 if (base->running_timer != timer) {
994 goto out; 1012 timer_stats_timer_clear_start_info(timer);
995 1013 ret = detach_if_pending(timer, base, true);
996 timer_stats_timer_clear_start_info(timer);
997 ret = 0;
998 if (timer_pending(timer)) {
999 detach_timer(timer, 1);
1000 if (timer->expires == base->next_timer &&
1001 !tbase_get_deferrable(timer->base))
1002 base->next_timer = base->timer_jiffies;
1003 ret = 1;
1004 } 1014 }
1005out:
1006 spin_unlock_irqrestore(&base->lock, flags); 1015 spin_unlock_irqrestore(&base->lock, flags);
1007 1016
1008 return ret; 1017 return ret;
@@ -1089,7 +1098,8 @@ static int cascade(struct tvec_base *base, struct tvec *tv, int index)
1089 */ 1098 */
1090 list_for_each_entry_safe(timer, tmp, &tv_list, entry) { 1099 list_for_each_entry_safe(timer, tmp, &tv_list, entry) {
1091 BUG_ON(tbase_get_base(timer->base) != base); 1100 BUG_ON(tbase_get_base(timer->base) != base);
1092 internal_add_timer(base, timer); 1101 /* No accounting, while moving them */
1102 __internal_add_timer(base, timer);
1093 } 1103 }
1094 1104
1095 return index; 1105 return index;
@@ -1178,7 +1188,7 @@ static inline void __run_timers(struct tvec_base *base)
1178 timer_stats_account_timer(timer); 1188 timer_stats_account_timer(timer);
1179 1189
1180 base->running_timer = timer; 1190 base->running_timer = timer;
1181 detach_timer(timer, 1); 1191 detach_expired_timer(timer, base);
1182 1192
1183 spin_unlock_irq(&base->lock); 1193 spin_unlock_irq(&base->lock);
1184 call_timer_fn(timer, fn, data); 1194 call_timer_fn(timer, fn, data);
@@ -1316,18 +1326,21 @@ static unsigned long cmp_next_hrtimer_event(unsigned long now,
1316unsigned long get_next_timer_interrupt(unsigned long now) 1326unsigned long get_next_timer_interrupt(unsigned long now)
1317{ 1327{
1318 struct tvec_base *base = __this_cpu_read(tvec_bases); 1328 struct tvec_base *base = __this_cpu_read(tvec_bases);
1319 unsigned long expires; 1329 unsigned long expires = now + NEXT_TIMER_MAX_DELTA;
1320 1330
1321 /* 1331 /*
1322 * Pretend that there is no timer pending if the cpu is offline. 1332 * Pretend that there is no timer pending if the cpu is offline.
1323 * Possible pending timers will be migrated later to an active cpu. 1333 * Possible pending timers will be migrated later to an active cpu.
1324 */ 1334 */
1325 if (cpu_is_offline(smp_processor_id())) 1335 if (cpu_is_offline(smp_processor_id()))
1326 return now + NEXT_TIMER_MAX_DELTA; 1336 return expires;
1337
1327 spin_lock(&base->lock); 1338 spin_lock(&base->lock);
1328 if (time_before_eq(base->next_timer, base->timer_jiffies)) 1339 if (base->active_timers) {
1329 base->next_timer = __next_timer_interrupt(base); 1340 if (time_before_eq(base->next_timer, base->timer_jiffies))
1330 expires = base->next_timer; 1341 base->next_timer = __next_timer_interrupt(base);
1342 expires = base->next_timer;
1343 }
1331 spin_unlock(&base->lock); 1344 spin_unlock(&base->lock);
1332 1345
1333 if (time_before_eq(expires, now)) 1346 if (time_before_eq(expires, now))
@@ -1704,6 +1717,7 @@ static int __cpuinit init_timers_cpu(int cpu)
1704 1717
1705 base->timer_jiffies = jiffies; 1718 base->timer_jiffies = jiffies;
1706 base->next_timer = base->timer_jiffies; 1719 base->next_timer = base->timer_jiffies;
1720 base->active_timers = 0;
1707 return 0; 1721 return 0;
1708} 1722}
1709 1723
@@ -1714,11 +1728,9 @@ static void migrate_timer_list(struct tvec_base *new_base, struct list_head *hea
1714 1728
1715 while (!list_empty(head)) { 1729 while (!list_empty(head)) {
1716 timer = list_first_entry(head, struct timer_list, entry); 1730 timer = list_first_entry(head, struct timer_list, entry);
1717 detach_timer(timer, 0); 1731 /* We ignore the accounting on the dying cpu */
1732 detach_timer(timer, false);
1718 timer_set_base(timer, new_base); 1733 timer_set_base(timer, new_base);
1719 if (time_before(timer->expires, new_base->next_timer) &&
1720 !tbase_get_deferrable(timer->base))
1721 new_base->next_timer = timer->expires;
1722 internal_add_timer(new_base, timer); 1734 internal_add_timer(new_base, timer);
1723 } 1735 }
1724} 1736}
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index a008663d86c8..b4f20fba09fc 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -312,7 +312,7 @@ static int remove_ftrace_list_ops(struct ftrace_ops **list,
312 312
313static int __register_ftrace_function(struct ftrace_ops *ops) 313static int __register_ftrace_function(struct ftrace_ops *ops)
314{ 314{
315 if (ftrace_disabled) 315 if (unlikely(ftrace_disabled))
316 return -ENODEV; 316 return -ENODEV;
317 317
318 if (FTRACE_WARN_ON(ops == &global_ops)) 318 if (FTRACE_WARN_ON(ops == &global_ops))
@@ -4299,16 +4299,12 @@ int register_ftrace_function(struct ftrace_ops *ops)
4299 4299
4300 mutex_lock(&ftrace_lock); 4300 mutex_lock(&ftrace_lock);
4301 4301
4302 if (unlikely(ftrace_disabled))
4303 goto out_unlock;
4304
4305 ret = __register_ftrace_function(ops); 4302 ret = __register_ftrace_function(ops);
4306 if (!ret) 4303 if (!ret)
4307 ret = ftrace_startup(ops, 0); 4304 ret = ftrace_startup(ops, 0);
4308 4305
4309
4310 out_unlock:
4311 mutex_unlock(&ftrace_lock); 4306 mutex_unlock(&ftrace_lock);
4307
4312 return ret; 4308 return ret;
4313} 4309}
4314EXPORT_SYMBOL_GPL(register_ftrace_function); 4310EXPORT_SYMBOL_GPL(register_ftrace_function);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 1d0f6a8a0e5e..49491fa7daa2 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1075,6 +1075,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu)
1075 rb_init_page(bpage->page); 1075 rb_init_page(bpage->page);
1076 1076
1077 INIT_LIST_HEAD(&cpu_buffer->reader_page->list); 1077 INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
1078 INIT_LIST_HEAD(&cpu_buffer->new_pages);
1078 1079
1079 ret = rb_allocate_pages(cpu_buffer, nr_pages); 1080 ret = rb_allocate_pages(cpu_buffer, nr_pages);
1080 if (ret < 0) 1081 if (ret < 0)
@@ -1346,10 +1347,9 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned int nr_pages)
1346 * If something was added to this page, it was full 1347 * If something was added to this page, it was full
1347 * since it is not the tail page. So we deduct the 1348 * since it is not the tail page. So we deduct the
1348 * bytes consumed in ring buffer from here. 1349 * bytes consumed in ring buffer from here.
1349 * No need to update overruns, since this page is 1350 * Increment overrun to account for the lost events.
1350 * deleted from ring buffer and its entries are
1351 * already accounted for.
1352 */ 1351 */
1352 local_add(page_entries, &cpu_buffer->overrun);
1353 local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes); 1353 local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes);
1354 } 1354 }
1355 1355
@@ -3239,6 +3239,10 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
3239 if (cpu_buffer->commit_page == cpu_buffer->reader_page) 3239 if (cpu_buffer->commit_page == cpu_buffer->reader_page)
3240 goto out; 3240 goto out;
3241 3241
3242 /* Don't bother swapping if the ring buffer is empty */
3243 if (rb_num_of_entries(cpu_buffer) == 0)
3244 goto out;
3245
3242 /* 3246 /*
3243 * Reset the reader page to size zero. 3247 * Reset the reader page to size zero.
3244 */ 3248 */
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index a7fa0702be1c..a120f98c4112 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -830,6 +830,8 @@ int register_tracer(struct tracer *type)
830 current_trace = saved_tracer; 830 current_trace = saved_tracer;
831 if (ret) { 831 if (ret) {
832 printk(KERN_CONT "FAILED!\n"); 832 printk(KERN_CONT "FAILED!\n");
833 /* Add the warning after printing 'FAILED' */
834 WARN_ON(1);
833 goto out; 835 goto out;
834 } 836 }
835 /* Only reset on passing, to avoid touching corrupted buffers */ 837 /* Only reset on passing, to avoid touching corrupted buffers */
@@ -1708,9 +1710,11 @@ EXPORT_SYMBOL_GPL(trace_vprintk);
1708 1710
1709static void trace_iterator_increment(struct trace_iterator *iter) 1711static void trace_iterator_increment(struct trace_iterator *iter)
1710{ 1712{
1713 struct ring_buffer_iter *buf_iter = trace_buffer_iter(iter, iter->cpu);
1714
1711 iter->idx++; 1715 iter->idx++;
1712 if (iter->buffer_iter[iter->cpu]) 1716 if (buf_iter)
1713 ring_buffer_read(iter->buffer_iter[iter->cpu], NULL); 1717 ring_buffer_read(buf_iter, NULL);
1714} 1718}
1715 1719
1716static struct trace_entry * 1720static struct trace_entry *
@@ -1718,7 +1722,7 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts,
1718 unsigned long *lost_events) 1722 unsigned long *lost_events)
1719{ 1723{
1720 struct ring_buffer_event *event; 1724 struct ring_buffer_event *event;
1721 struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu]; 1725 struct ring_buffer_iter *buf_iter = trace_buffer_iter(iter, cpu);
1722 1726
1723 if (buf_iter) 1727 if (buf_iter)
1724 event = ring_buffer_iter_peek(buf_iter, ts); 1728 event = ring_buffer_iter_peek(buf_iter, ts);
@@ -1856,10 +1860,10 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu)
1856 1860
1857 tr->data[cpu]->skipped_entries = 0; 1861 tr->data[cpu]->skipped_entries = 0;
1858 1862
1859 if (!iter->buffer_iter[cpu]) 1863 buf_iter = trace_buffer_iter(iter, cpu);
1864 if (!buf_iter)
1860 return; 1865 return;
1861 1866
1862 buf_iter = iter->buffer_iter[cpu];
1863 ring_buffer_iter_reset(buf_iter); 1867 ring_buffer_iter_reset(buf_iter);
1864 1868
1865 /* 1869 /*
@@ -2205,13 +2209,15 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
2205 2209
2206int trace_empty(struct trace_iterator *iter) 2210int trace_empty(struct trace_iterator *iter)
2207{ 2211{
2212 struct ring_buffer_iter *buf_iter;
2208 int cpu; 2213 int cpu;
2209 2214
2210 /* If we are looking at one CPU buffer, only check that one */ 2215 /* If we are looking at one CPU buffer, only check that one */
2211 if (iter->cpu_file != TRACE_PIPE_ALL_CPU) { 2216 if (iter->cpu_file != TRACE_PIPE_ALL_CPU) {
2212 cpu = iter->cpu_file; 2217 cpu = iter->cpu_file;
2213 if (iter->buffer_iter[cpu]) { 2218 buf_iter = trace_buffer_iter(iter, cpu);
2214 if (!ring_buffer_iter_empty(iter->buffer_iter[cpu])) 2219 if (buf_iter) {
2220 if (!ring_buffer_iter_empty(buf_iter))
2215 return 0; 2221 return 0;
2216 } else { 2222 } else {
2217 if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) 2223 if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu))
@@ -2221,8 +2227,9 @@ int trace_empty(struct trace_iterator *iter)
2221 } 2227 }
2222 2228
2223 for_each_tracing_cpu(cpu) { 2229 for_each_tracing_cpu(cpu) {
2224 if (iter->buffer_iter[cpu]) { 2230 buf_iter = trace_buffer_iter(iter, cpu);
2225 if (!ring_buffer_iter_empty(iter->buffer_iter[cpu])) 2231 if (buf_iter) {
2232 if (!ring_buffer_iter_empty(buf_iter))
2226 return 0; 2233 return 0;
2227 } else { 2234 } else {
2228 if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) 2235 if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu))
@@ -2381,6 +2388,11 @@ __tracing_open(struct inode *inode, struct file *file)
2381 if (!iter) 2388 if (!iter)
2382 return ERR_PTR(-ENOMEM); 2389 return ERR_PTR(-ENOMEM);
2383 2390
2391 iter->buffer_iter = kzalloc(sizeof(*iter->buffer_iter) * num_possible_cpus(),
2392 GFP_KERNEL);
2393 if (!iter->buffer_iter)
2394 goto release;
2395
2384 /* 2396 /*
2385 * We make a copy of the current tracer to avoid concurrent 2397 * We make a copy of the current tracer to avoid concurrent
2386 * changes on it while we are reading. 2398 * changes on it while we are reading.
@@ -2441,6 +2453,8 @@ __tracing_open(struct inode *inode, struct file *file)
2441 fail: 2453 fail:
2442 mutex_unlock(&trace_types_lock); 2454 mutex_unlock(&trace_types_lock);
2443 kfree(iter->trace); 2455 kfree(iter->trace);
2456 kfree(iter->buffer_iter);
2457release:
2444 seq_release_private(inode, file); 2458 seq_release_private(inode, file);
2445 return ERR_PTR(-ENOMEM); 2459 return ERR_PTR(-ENOMEM);
2446} 2460}
@@ -2481,6 +2495,7 @@ static int tracing_release(struct inode *inode, struct file *file)
2481 mutex_destroy(&iter->mutex); 2495 mutex_destroy(&iter->mutex);
2482 free_cpumask_var(iter->started); 2496 free_cpumask_var(iter->started);
2483 kfree(iter->trace); 2497 kfree(iter->trace);
2498 kfree(iter->buffer_iter);
2484 seq_release_private(inode, file); 2499 seq_release_private(inode, file);
2485 return 0; 2500 return 0;
2486} 2501}
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 5aec220d2de0..55e1f7f0db12 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -317,6 +317,14 @@ struct tracer {
317 317
318#define TRACE_PIPE_ALL_CPU -1 318#define TRACE_PIPE_ALL_CPU -1
319 319
320static inline struct ring_buffer_iter *
321trace_buffer_iter(struct trace_iterator *iter, int cpu)
322{
323 if (iter->buffer_iter && iter->buffer_iter[cpu])
324 return iter->buffer_iter[cpu];
325 return NULL;
326}
327
320int tracer_init(struct tracer *t, struct trace_array *tr); 328int tracer_init(struct tracer *t, struct trace_array *tr);
321int tracing_is_enabled(void); 329int tracing_is_enabled(void);
322void trace_wake_up(void); 330void trace_wake_up(void);
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index a7d2a4c653d8..ce27c8ba8d31 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -538,7 +538,7 @@ get_return_for_leaf(struct trace_iterator *iter,
538 next = &data->ret; 538 next = &data->ret;
539 } else { 539 } else {
540 540
541 ring_iter = iter->buffer_iter[iter->cpu]; 541 ring_iter = trace_buffer_iter(iter, iter->cpu);
542 542
543 /* First peek to compare current entry and the next one */ 543 /* First peek to compare current entry and the next one */
544 if (ring_iter) 544 if (ring_iter)
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index df611a0e76c5..123b189c732c 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -1325,4 +1325,4 @@ __init static int init_events(void)
1325 1325
1326 return 0; 1326 return 0;
1327} 1327}
1328device_initcall(init_events); 1328early_initcall(init_events);