Diffstat (limited to 'kernel')
-rw-r--r--  kernel/audit.c                       21
-rw-r--r--  kernel/audit_tree.c                  19
-rw-r--r--  kernel/cpu.c                          2
-rw-r--r--  kernel/debug/kdb/kdb_debugger.c       4
-rw-r--r--  kernel/debug/kdb/kdb_io.c            11
-rw-r--r--  kernel/debug/kdb/kdb_main.c          15
-rw-r--r--  kernel/events/callchain.c             9
-rw-r--r--  kernel/events/core.c                 30
-rw-r--r--  kernel/events/internal.h              3
-rw-r--r--  kernel/events/uprobes.c             213
-rw-r--r--  kernel/exit.c                         2
-rw-r--r--  kernel/fork.c                        50
-rw-r--r--  kernel/futex.c                       17
-rw-r--r--  kernel/irq/handle.c                   7
-rw-r--r--  kernel/irq/irqdomain.c              362
-rw-r--r--  kernel/irq/manage.c                  32
-rw-r--r--  kernel/kexec.c                        2
-rw-r--r--  kernel/kmod.c                        37
-rw-r--r--  kernel/panic.c                        8
-rw-r--r--  kernel/printk.c                      34
-rw-r--r--  kernel/resource.c                    24
-rw-r--r--  kernel/sched/core.c                  41
-rw-r--r--  kernel/sched/cpupri.c                10
-rw-r--r--  kernel/sched/fair.c                  40
-rw-r--r--  kernel/sched/rt.c                    13
-rw-r--r--  kernel/sched/sched.h                  8
-rw-r--r--  kernel/sched/stop_task.c             22
-rw-r--r--  kernel/softirq.c                      9
-rw-r--r--  kernel/sys.c                         57
-rw-r--r--  kernel/sysctl.c                      69
-rw-r--r--  kernel/sysctl_binary.c                2
-rw-r--r--  kernel/task_work.c                    1
-rw-r--r--  kernel/taskstats.c                    5
-rw-r--r--  kernel/time/jiffies.c                 2
-rw-r--r--  kernel/time/ntp.c                     2
-rw-r--r--  kernel/time/timekeeping.c           442
-rw-r--r--  kernel/timer.c                        9
-rw-r--r--  kernel/trace/trace_event_perf.c       2
-rw-r--r--  kernel/trace/trace_kprobe.c           6
-rw-r--r--  kernel/trace/trace_syscalls.c         8
-rw-r--r--  kernel/trace/trace_uprobe.c           2
41 files changed, 983 insertions, 669 deletions
diff --git a/kernel/audit.c b/kernel/audit.c
index 4a3f28d2ca65..ea3b7b6191c7 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1456,6 +1456,27 @@ void audit_log_key(struct audit_buffer *ab, char *key)
1456} 1456}
1457 1457
1458/** 1458/**
1459 * audit_log_link_denied - report a link restriction denial
1460 * @operation: specific link operation
1461 * @link: the path that triggered the restriction
1462 */
1463void audit_log_link_denied(const char *operation, struct path *link)
1464{
1465 struct audit_buffer *ab;
1466
1467 ab = audit_log_start(current->audit_context, GFP_KERNEL,
1468 AUDIT_ANOM_LINK);
1469 audit_log_format(ab, "op=%s action=denied", operation);
1470 audit_log_format(ab, " pid=%d comm=", current->pid);
1471 audit_log_untrustedstring(ab, current->comm);
1472 audit_log_d_path(ab, " path=", link);
1473 audit_log_format(ab, " dev=");
1474 audit_log_untrustedstring(ab, link->dentry->d_inode->i_sb->s_id);
1475 audit_log_format(ab, " ino=%lu", link->dentry->d_inode->i_ino);
1476 audit_log_end(ab);
1477}
1478
1479/**
1459 * audit_log_end - end one audit record 1480 * audit_log_end - end one audit record
1460 * @ab: the audit_buffer 1481 * @ab: the audit_buffer
1461 * 1482 *
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 3a5ca582ba1e..ed206fd88cca 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -250,7 +250,6 @@ static void untag_chunk(struct node *p)
250 spin_unlock(&hash_lock); 250 spin_unlock(&hash_lock);
251 spin_unlock(&entry->lock); 251 spin_unlock(&entry->lock);
252 fsnotify_destroy_mark(entry); 252 fsnotify_destroy_mark(entry);
253 fsnotify_put_mark(entry);
254 goto out; 253 goto out;
255 } 254 }
256 255
@@ -259,7 +258,7 @@ static void untag_chunk(struct node *p)
259 258
260 fsnotify_duplicate_mark(&new->mark, entry); 259 fsnotify_duplicate_mark(&new->mark, entry);
261 if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) { 260 if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) {
262 free_chunk(new); 261 fsnotify_put_mark(&new->mark);
263 goto Fallback; 262 goto Fallback;
264 } 263 }
265 264
@@ -293,7 +292,7 @@ static void untag_chunk(struct node *p)
293 spin_unlock(&hash_lock); 292 spin_unlock(&hash_lock);
294 spin_unlock(&entry->lock); 293 spin_unlock(&entry->lock);
295 fsnotify_destroy_mark(entry); 294 fsnotify_destroy_mark(entry);
296 fsnotify_put_mark(entry); 295 fsnotify_put_mark(&new->mark); /* drop initial reference */
297 goto out; 296 goto out;
298 297
299Fallback: 298Fallback:
@@ -322,7 +321,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
322 321
323 entry = &chunk->mark; 322 entry = &chunk->mark;
324 if (fsnotify_add_mark(entry, audit_tree_group, inode, NULL, 0)) { 323 if (fsnotify_add_mark(entry, audit_tree_group, inode, NULL, 0)) {
325 free_chunk(chunk); 324 fsnotify_put_mark(entry);
326 return -ENOSPC; 325 return -ENOSPC;
327 } 326 }
328 327
@@ -347,6 +346,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
347 insert_hash(chunk); 346 insert_hash(chunk);
348 spin_unlock(&hash_lock); 347 spin_unlock(&hash_lock);
349 spin_unlock(&entry->lock); 348 spin_unlock(&entry->lock);
349 fsnotify_put_mark(entry); /* drop initial reference */
350 return 0; 350 return 0;
351} 351}
352 352
@@ -396,7 +396,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
396 fsnotify_duplicate_mark(chunk_entry, old_entry); 396 fsnotify_duplicate_mark(chunk_entry, old_entry);
397 if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->i.inode, NULL, 1)) { 397 if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->i.inode, NULL, 1)) {
398 spin_unlock(&old_entry->lock); 398 spin_unlock(&old_entry->lock);
399 free_chunk(chunk); 399 fsnotify_put_mark(chunk_entry);
400 fsnotify_put_mark(old_entry); 400 fsnotify_put_mark(old_entry);
401 return -ENOSPC; 401 return -ENOSPC;
402 } 402 }
@@ -444,8 +444,8 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
444 spin_unlock(&chunk_entry->lock); 444 spin_unlock(&chunk_entry->lock);
445 spin_unlock(&old_entry->lock); 445 spin_unlock(&old_entry->lock);
446 fsnotify_destroy_mark(old_entry); 446 fsnotify_destroy_mark(old_entry);
447 fsnotify_put_mark(chunk_entry); /* drop initial reference */
447 fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */ 448 fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */
448 fsnotify_put_mark(old_entry); /* and kill it */
449 return 0; 449 return 0;
450} 450}
451 451
@@ -916,7 +916,12 @@ static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify
916 struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark); 916 struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark);
917 917
918 evict_chunk(chunk); 918 evict_chunk(chunk);
919 fsnotify_put_mark(entry); 919
920 /*
921 * We are guaranteed to have at least one reference to the mark from
922 * either the inode or the caller of fsnotify_destroy_mark().
923 */
924 BUG_ON(atomic_read(&entry->refcnt) < 1);
920} 925}
921 926
922static bool audit_tree_send_event(struct fsnotify_group *group, struct inode *inode, 927static bool audit_tree_send_event(struct fsnotify_group *group, struct inode *inode,
diff --git a/kernel/cpu.c b/kernel/cpu.c
index a4eb5227a19e..14d32588cccd 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -416,7 +416,7 @@ int __cpuinit cpu_up(unsigned int cpu)
416 416
417 if (pgdat->node_zonelists->_zonerefs->zone == NULL) { 417 if (pgdat->node_zonelists->_zonerefs->zone == NULL) {
418 mutex_lock(&zonelists_mutex); 418 mutex_lock(&zonelists_mutex);
419 build_all_zonelists(NULL); 419 build_all_zonelists(NULL, NULL);
420 mutex_unlock(&zonelists_mutex); 420 mutex_unlock(&zonelists_mutex);
421 } 421 }
422#endif 422#endif
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
index 8b68ce78ff17..be7b33b73d30 100644
--- a/kernel/debug/kdb/kdb_debugger.c
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -12,6 +12,7 @@
12#include <linux/kdb.h> 12#include <linux/kdb.h>
13#include <linux/kdebug.h> 13#include <linux/kdebug.h>
14#include <linux/export.h> 14#include <linux/export.h>
15#include <linux/hardirq.h>
15#include "kdb_private.h" 16#include "kdb_private.h"
16#include "../debug_core.h" 17#include "../debug_core.h"
17 18
@@ -52,6 +53,9 @@ int kdb_stub(struct kgdb_state *ks)
52 if (atomic_read(&kgdb_setting_breakpoint)) 53 if (atomic_read(&kgdb_setting_breakpoint))
53 reason = KDB_REASON_KEYBOARD; 54 reason = KDB_REASON_KEYBOARD;
54 55
56 if (in_nmi())
57 reason = KDB_REASON_NMI;
58
55 for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) { 59 for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) {
56 if ((bp->bp_enabled) && (bp->bp_addr == addr)) { 60 if ((bp->bp_enabled) && (bp->bp_addr == addr)) {
57 reason = KDB_REASON_BREAK; 61 reason = KDB_REASON_BREAK;
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index bb9520f0f6ff..0a69d2adc4f3 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -715,9 +715,6 @@ kdb_printit:
715 /* check for having reached the LINES number of printed lines */ 715 /* check for having reached the LINES number of printed lines */
716 if (kdb_nextline == linecount) { 716 if (kdb_nextline == linecount) {
717 char buf1[16] = ""; 717 char buf1[16] = "";
718#if defined(CONFIG_SMP)
719 char buf2[32];
720#endif
721 718
722 /* Watch out for recursion here. Any routine that calls 719 /* Watch out for recursion here. Any routine that calls
723 * kdb_printf will come back through here. And kdb_read 720 * kdb_printf will come back through here. And kdb_read
@@ -732,14 +729,6 @@ kdb_printit:
732 if (moreprompt == NULL) 729 if (moreprompt == NULL)
733 moreprompt = "more> "; 730 moreprompt = "more> ";
734 731
735#if defined(CONFIG_SMP)
736 if (strchr(moreprompt, '%')) {
737 sprintf(buf2, moreprompt, get_cpu());
738 put_cpu();
739 moreprompt = buf2;
740 }
741#endif
742
743 kdb_input_flush(); 732 kdb_input_flush();
744 c = console_drivers; 733 c = console_drivers;
745 734
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 1f91413edb87..31df1706b9a9 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -139,11 +139,10 @@ static const int __nkdb_err = sizeof(kdbmsgs) / sizeof(kdbmsg_t);
139static char *__env[] = { 139static char *__env[] = {
140#if defined(CONFIG_SMP) 140#if defined(CONFIG_SMP)
141 "PROMPT=[%d]kdb> ", 141 "PROMPT=[%d]kdb> ",
142 "MOREPROMPT=[%d]more> ",
143#else 142#else
144 "PROMPT=kdb> ", 143 "PROMPT=kdb> ",
145 "MOREPROMPT=more> ",
146#endif 144#endif
145 "MOREPROMPT=more> ",
147 "RADIX=16", 146 "RADIX=16",
148 "MDCOUNT=8", /* lines of md output */ 147 "MDCOUNT=8", /* lines of md output */
149 KDB_PLATFORM_ENV, 148 KDB_PLATFORM_ENV,
@@ -1236,18 +1235,6 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
1236 *cmdbuf = '\0'; 1235 *cmdbuf = '\0';
1237 *(cmd_hist[cmd_head]) = '\0'; 1236 *(cmd_hist[cmd_head]) = '\0';
1238 1237
1239 if (KDB_FLAG(ONLY_DO_DUMP)) {
1240 /* kdb is off but a catastrophic error requires a dump.
1241 * Take the dump and reboot.
1242 * Turn on logging so the kdb output appears in the log
1243 * buffer in the dump.
1244 */
1245 const char *setargs[] = { "set", "LOGGING", "1" };
1246 kdb_set(2, setargs);
1247 kdb_reboot(0, NULL);
1248 /*NOTREACHED*/
1249 }
1250
1251do_full_getstr: 1238do_full_getstr:
1252#if defined(CONFIG_SMP) 1239#if defined(CONFIG_SMP)
1253 snprintf(kdb_prompt_str, CMD_BUFLEN, kdbgetenv("PROMPT"), 1240 snprintf(kdb_prompt_str, CMD_BUFLEN, kdbgetenv("PROMPT"),
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 6581a040f399..98d4597f43d6 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -153,7 +153,8 @@ put_callchain_entry(int rctx)
153 put_recursion_context(__get_cpu_var(callchain_recursion), rctx); 153 put_recursion_context(__get_cpu_var(callchain_recursion), rctx);
154} 154}
155 155
156struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) 156struct perf_callchain_entry *
157perf_callchain(struct perf_event *event, struct pt_regs *regs)
157{ 158{
158 int rctx; 159 int rctx;
159 struct perf_callchain_entry *entry; 160 struct perf_callchain_entry *entry;
@@ -178,6 +179,12 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
178 } 179 }
179 180
180 if (regs) { 181 if (regs) {
182 /*
183 * Disallow cross-task user callchains.
184 */
185 if (event->ctx->task && event->ctx->task != current)
186 goto exit_put;
187
181 perf_callchain_store(entry, PERF_CONTEXT_USER); 188 perf_callchain_store(entry, PERF_CONTEXT_USER);
182 perf_callchain_user(entry, regs); 189 perf_callchain_user(entry, regs);
183 } 190 }
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f1cf0edeb39a..b7935fcec7d9 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4039,7 +4039,7 @@ void perf_prepare_sample(struct perf_event_header *header,
4039 if (sample_type & PERF_SAMPLE_CALLCHAIN) { 4039 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
4040 int size = 1; 4040 int size = 1;
4041 4041
4042 data->callchain = perf_callchain(regs); 4042 data->callchain = perf_callchain(event, regs);
4043 4043
4044 if (data->callchain) 4044 if (data->callchain)
4045 size += data->callchain->nr; 4045 size += data->callchain->nr;
@@ -5209,7 +5209,8 @@ static int perf_tp_event_match(struct perf_event *event,
5209} 5209}
5210 5210
5211void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, 5211void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
5212 struct pt_regs *regs, struct hlist_head *head, int rctx) 5212 struct pt_regs *regs, struct hlist_head *head, int rctx,
5213 struct task_struct *task)
5213{ 5214{
5214 struct perf_sample_data data; 5215 struct perf_sample_data data;
5215 struct perf_event *event; 5216 struct perf_event *event;
@@ -5228,6 +5229,31 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
5228 perf_swevent_event(event, count, &data, regs); 5229 perf_swevent_event(event, count, &data, regs);
5229 } 5230 }
5230 5231
5232 /*
5233 * If we got specified a target task, also iterate its context and
5234 * deliver this event there too.
5235 */
5236 if (task && task != current) {
5237 struct perf_event_context *ctx;
5238 struct trace_entry *entry = record;
5239
5240 rcu_read_lock();
5241 ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
5242 if (!ctx)
5243 goto unlock;
5244
5245 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
5246 if (event->attr.type != PERF_TYPE_TRACEPOINT)
5247 continue;
5248 if (event->attr.config != entry->type)
5249 continue;
5250 if (perf_tp_event_match(event, &data, regs))
5251 perf_swevent_event(event, count, &data, regs);
5252 }
5253unlock:
5254 rcu_read_unlock();
5255 }
5256
5231 perf_swevent_put_recursion_context(rctx); 5257 perf_swevent_put_recursion_context(rctx);
5232} 5258}
5233EXPORT_SYMBOL_GPL(perf_tp_event); 5259EXPORT_SYMBOL_GPL(perf_tp_event);
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index b0b107f90afc..a096c19f2c2a 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -101,7 +101,8 @@ __output_copy(struct perf_output_handle *handle,
101} 101}
102 102
103/* Callchain handling */ 103/* Callchain handling */
104extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs); 104extern struct perf_callchain_entry *
105perf_callchain(struct perf_event *event, struct pt_regs *regs);
105extern int get_callchain_buffers(void); 106extern int get_callchain_buffers(void);
106extern void put_callchain_buffers(void); 107extern void put_callchain_buffers(void);
107 108
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index f93532748bca..c08a22d02f72 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -32,6 +32,7 @@
32#include <linux/swap.h> /* try_to_free_swap */ 32#include <linux/swap.h> /* try_to_free_swap */
33#include <linux/ptrace.h> /* user_enable_single_step */ 33#include <linux/ptrace.h> /* user_enable_single_step */
34#include <linux/kdebug.h> /* notifier mechanism */ 34#include <linux/kdebug.h> /* notifier mechanism */
35#include "../../mm/internal.h" /* munlock_vma_page */
35 36
36#include <linux/uprobes.h> 37#include <linux/uprobes.h>
37 38
@@ -112,14 +113,14 @@ static bool valid_vma(struct vm_area_struct *vma, bool is_register)
112 return false; 113 return false;
113} 114}
114 115
115static loff_t vma_address(struct vm_area_struct *vma, loff_t offset) 116static unsigned long offset_to_vaddr(struct vm_area_struct *vma, loff_t offset)
116{ 117{
117 loff_t vaddr; 118 return vma->vm_start + offset - ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
118 119}
119 vaddr = vma->vm_start + offset;
120 vaddr -= vma->vm_pgoff << PAGE_SHIFT;
121 120
122 return vaddr; 121static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr)
122{
123 return ((loff_t)vma->vm_pgoff << PAGE_SHIFT) + (vaddr - vma->vm_start);
123} 124}
124 125
125/** 126/**
@@ -127,25 +128,27 @@ static loff_t vma_address(struct vm_area_struct *vma, loff_t offset)
127 * based on replace_page in mm/ksm.c 128 * based on replace_page in mm/ksm.c
128 * 129 *
129 * @vma: vma that holds the pte pointing to page 130 * @vma: vma that holds the pte pointing to page
131 * @addr: address the old @page is mapped at
130 * @page: the cowed page we are replacing by kpage 132 * @page: the cowed page we are replacing by kpage
131 * @kpage: the modified page we replace page by 133 * @kpage: the modified page we replace page by
132 * 134 *
133 * Returns 0 on success, -EFAULT on failure. 135 * Returns 0 on success, -EFAULT on failure.
134 */ 136 */
135static int __replace_page(struct vm_area_struct *vma, struct page *page, struct page *kpage) 137static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
138 struct page *page, struct page *kpage)
136{ 139{
137 struct mm_struct *mm = vma->vm_mm; 140 struct mm_struct *mm = vma->vm_mm;
138 unsigned long addr;
139 spinlock_t *ptl; 141 spinlock_t *ptl;
140 pte_t *ptep; 142 pte_t *ptep;
143 int err;
141 144
142 addr = page_address_in_vma(page, vma); 145 /* For try_to_free_swap() and munlock_vma_page() below */
143 if (addr == -EFAULT) 146 lock_page(page);
144 return -EFAULT;
145 147
148 err = -EAGAIN;
146 ptep = page_check_address(page, mm, addr, &ptl, 0); 149 ptep = page_check_address(page, mm, addr, &ptl, 0);
147 if (!ptep) 150 if (!ptep)
148 return -EAGAIN; 151 goto unlock;
149 152
150 get_page(kpage); 153 get_page(kpage);
151 page_add_new_anon_rmap(kpage, vma, addr); 154 page_add_new_anon_rmap(kpage, vma, addr);
@@ -162,10 +165,16 @@ static int __replace_page(struct vm_area_struct *vma, struct page *page, struct
162 page_remove_rmap(page); 165 page_remove_rmap(page);
163 if (!page_mapped(page)) 166 if (!page_mapped(page))
164 try_to_free_swap(page); 167 try_to_free_swap(page);
165 put_page(page);
166 pte_unmap_unlock(ptep, ptl); 168 pte_unmap_unlock(ptep, ptl);
167 169
168 return 0; 170 if (vma->vm_flags & VM_LOCKED)
171 munlock_vma_page(page);
172 put_page(page);
173
174 err = 0;
175 unlock:
176 unlock_page(page);
177 return err;
169} 178}
170 179
171/** 180/**
@@ -206,45 +215,23 @@ static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
206 unsigned long vaddr, uprobe_opcode_t opcode) 215 unsigned long vaddr, uprobe_opcode_t opcode)
207{ 216{
208 struct page *old_page, *new_page; 217 struct page *old_page, *new_page;
209 struct address_space *mapping;
210 void *vaddr_old, *vaddr_new; 218 void *vaddr_old, *vaddr_new;
211 struct vm_area_struct *vma; 219 struct vm_area_struct *vma;
212 struct uprobe *uprobe;
213 int ret; 220 int ret;
221
214retry: 222retry:
215 /* Read the page with vaddr into memory */ 223 /* Read the page with vaddr into memory */
216 ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &old_page, &vma); 224 ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &old_page, &vma);
217 if (ret <= 0) 225 if (ret <= 0)
218 return ret; 226 return ret;
219 227
220 ret = -EINVAL;
221
222 /*
223 * We are interested in text pages only. Our pages of interest
224 * should be mapped for read and execute only. We desist from
225 * adding probes in write mapped pages since the breakpoints
226 * might end up in the file copy.
227 */
228 if (!valid_vma(vma, is_swbp_insn(&opcode)))
229 goto put_out;
230
231 uprobe = container_of(auprobe, struct uprobe, arch);
232 mapping = uprobe->inode->i_mapping;
233 if (mapping != vma->vm_file->f_mapping)
234 goto put_out;
235
236 ret = -ENOMEM; 228 ret = -ENOMEM;
237 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr); 229 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr);
238 if (!new_page) 230 if (!new_page)
239 goto put_out; 231 goto put_old;
240 232
241 __SetPageUptodate(new_page); 233 __SetPageUptodate(new_page);
242 234
243 /*
244 * lock page will serialize against do_wp_page()'s
245 * PageAnon() handling
246 */
247 lock_page(old_page);
248 /* copy the page now that we've got it stable */ 235 /* copy the page now that we've got it stable */
249 vaddr_old = kmap_atomic(old_page); 236 vaddr_old = kmap_atomic(old_page);
250 vaddr_new = kmap_atomic(new_page); 237 vaddr_new = kmap_atomic(new_page);
@@ -257,17 +244,13 @@ retry:
257 244
258 ret = anon_vma_prepare(vma); 245 ret = anon_vma_prepare(vma);
259 if (ret) 246 if (ret)
260 goto unlock_out; 247 goto put_new;
261 248
262 lock_page(new_page); 249 ret = __replace_page(vma, vaddr, old_page, new_page);
263 ret = __replace_page(vma, old_page, new_page);
264 unlock_page(new_page);
265 250
266unlock_out: 251put_new:
267 unlock_page(old_page);
268 page_cache_release(new_page); 252 page_cache_release(new_page);
269 253put_old:
270put_out:
271 put_page(old_page); 254 put_page(old_page);
272 255
273 if (unlikely(ret == -EAGAIN)) 256 if (unlikely(ret == -EAGAIN))
@@ -791,7 +774,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
791 curr = info; 774 curr = info;
792 775
793 info->mm = vma->vm_mm; 776 info->mm = vma->vm_mm;
794 info->vaddr = vma_address(vma, offset); 777 info->vaddr = offset_to_vaddr(vma, offset);
795 } 778 }
796 mutex_unlock(&mapping->i_mmap_mutex); 779 mutex_unlock(&mapping->i_mmap_mutex);
797 780
@@ -839,12 +822,13 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
839 goto free; 822 goto free;
840 823
841 down_write(&mm->mmap_sem); 824 down_write(&mm->mmap_sem);
842 vma = find_vma(mm, (unsigned long)info->vaddr); 825 vma = find_vma(mm, info->vaddr);
843 if (!vma || !valid_vma(vma, is_register)) 826 if (!vma || !valid_vma(vma, is_register) ||
827 vma->vm_file->f_mapping->host != uprobe->inode)
844 goto unlock; 828 goto unlock;
845 829
846 if (vma->vm_file->f_mapping->host != uprobe->inode || 830 if (vma->vm_start > info->vaddr ||
847 vma_address(vma, uprobe->offset) != info->vaddr) 831 vaddr_to_offset(vma, info->vaddr) != uprobe->offset)
848 goto unlock; 832 goto unlock;
849 833
850 if (is_register) { 834 if (is_register) {
@@ -960,59 +944,66 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume
960 put_uprobe(uprobe); 944 put_uprobe(uprobe);
961} 945}
962 946
963/* 947static struct rb_node *
964 * Of all the nodes that correspond to the given inode, return the node 948find_node_in_range(struct inode *inode, loff_t min, loff_t max)
965 * with the least offset.
966 */
967static struct rb_node *find_least_offset_node(struct inode *inode)
968{ 949{
969 struct uprobe u = { .inode = inode, .offset = 0};
970 struct rb_node *n = uprobes_tree.rb_node; 950 struct rb_node *n = uprobes_tree.rb_node;
971 struct rb_node *close_node = NULL;
972 struct uprobe *uprobe;
973 int match;
974 951
975 while (n) { 952 while (n) {
976 uprobe = rb_entry(n, struct uprobe, rb_node); 953 struct uprobe *u = rb_entry(n, struct uprobe, rb_node);
977 match = match_uprobe(&u, uprobe);
978
979 if (uprobe->inode == inode)
980 close_node = n;
981
982 if (!match)
983 return close_node;
984 954
985 if (match < 0) 955 if (inode < u->inode) {
986 n = n->rb_left; 956 n = n->rb_left;
987 else 957 } else if (inode > u->inode) {
988 n = n->rb_right; 958 n = n->rb_right;
959 } else {
960 if (max < u->offset)
961 n = n->rb_left;
962 else if (min > u->offset)
963 n = n->rb_right;
964 else
965 break;
966 }
989 } 967 }
990 968
991 return close_node; 969 return n;
992} 970}
993 971
994/* 972/*
995 * For a given inode, build a list of probes that need to be inserted. 973 * For a given range in vma, build a list of probes that need to be inserted.
996 */ 974 */
997static void build_probe_list(struct inode *inode, struct list_head *head) 975static void build_probe_list(struct inode *inode,
976 struct vm_area_struct *vma,
977 unsigned long start, unsigned long end,
978 struct list_head *head)
998{ 979{
999 struct uprobe *uprobe; 980 loff_t min, max;
1000 unsigned long flags; 981 unsigned long flags;
1001 struct rb_node *n; 982 struct rb_node *n, *t;
1002 983 struct uprobe *u;
1003 spin_lock_irqsave(&uprobes_treelock, flags);
1004
1005 n = find_least_offset_node(inode);
1006 984
1007 for (; n; n = rb_next(n)) { 985 INIT_LIST_HEAD(head);
1008 uprobe = rb_entry(n, struct uprobe, rb_node); 986 min = vaddr_to_offset(vma, start);
1009 if (uprobe->inode != inode) 987 max = min + (end - start) - 1;
1010 break;
1011 988
1012 list_add(&uprobe->pending_list, head); 989 spin_lock_irqsave(&uprobes_treelock, flags);
1013 atomic_inc(&uprobe->ref); 990 n = find_node_in_range(inode, min, max);
991 if (n) {
992 for (t = n; t; t = rb_prev(t)) {
993 u = rb_entry(t, struct uprobe, rb_node);
994 if (u->inode != inode || u->offset < min)
995 break;
996 list_add(&u->pending_list, head);
997 atomic_inc(&u->ref);
998 }
999 for (t = n; (t = rb_next(t)); ) {
1000 u = rb_entry(t, struct uprobe, rb_node);
1001 if (u->inode != inode || u->offset > max)
1002 break;
1003 list_add(&u->pending_list, head);
1004 atomic_inc(&u->ref);
1005 }
1014 } 1006 }
1015
1016 spin_unlock_irqrestore(&uprobes_treelock, flags); 1007 spin_unlock_irqrestore(&uprobes_treelock, flags);
1017} 1008}
1018 1009
@@ -1031,7 +1022,7 @@ static void build_probe_list(struct inode *inode, struct list_head *head)
1031int uprobe_mmap(struct vm_area_struct *vma) 1022int uprobe_mmap(struct vm_area_struct *vma)
1032{ 1023{
1033 struct list_head tmp_list; 1024 struct list_head tmp_list;
1034 struct uprobe *uprobe; 1025 struct uprobe *uprobe, *u;
1035 struct inode *inode; 1026 struct inode *inode;
1036 int ret, count; 1027 int ret, count;
1037 1028
@@ -1042,21 +1033,15 @@ int uprobe_mmap(struct vm_area_struct *vma)
1042 if (!inode) 1033 if (!inode)
1043 return 0; 1034 return 0;
1044 1035
1045 INIT_LIST_HEAD(&tmp_list);
1046 mutex_lock(uprobes_mmap_hash(inode)); 1036 mutex_lock(uprobes_mmap_hash(inode));
1047 build_probe_list(inode, &tmp_list); 1037 build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list);
1048 1038
1049 ret = 0; 1039 ret = 0;
1050 count = 0; 1040 count = 0;
1051 1041
1052 list_for_each_entry(uprobe, &tmp_list, pending_list) { 1042 list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
1053 if (!ret) { 1043 if (!ret) {
1054 loff_t vaddr = vma_address(vma, uprobe->offset); 1044 unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset);
1055
1056 if (vaddr < vma->vm_start || vaddr >= vma->vm_end) {
1057 put_uprobe(uprobe);
1058 continue;
1059 }
1060 1045
1061 ret = install_breakpoint(uprobe, vma->vm_mm, vma, vaddr); 1046 ret = install_breakpoint(uprobe, vma->vm_mm, vma, vaddr);
1062 /* 1047 /*
@@ -1097,12 +1082,15 @@ int uprobe_mmap(struct vm_area_struct *vma)
1097void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end) 1082void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end)
1098{ 1083{
1099 struct list_head tmp_list; 1084 struct list_head tmp_list;
1100 struct uprobe *uprobe; 1085 struct uprobe *uprobe, *u;
1101 struct inode *inode; 1086 struct inode *inode;
1102 1087
1103 if (!atomic_read(&uprobe_events) || !valid_vma(vma, false)) 1088 if (!atomic_read(&uprobe_events) || !valid_vma(vma, false))
1104 return; 1089 return;
1105 1090
1091 if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */
1092 return;
1093
1106 if (!atomic_read(&vma->vm_mm->uprobes_state.count)) 1094 if (!atomic_read(&vma->vm_mm->uprobes_state.count))
1107 return; 1095 return;
1108 1096
@@ -1110,21 +1098,17 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon
1110 if (!inode) 1098 if (!inode)
1111 return; 1099 return;
1112 1100
1113 INIT_LIST_HEAD(&tmp_list);
1114 mutex_lock(uprobes_mmap_hash(inode)); 1101 mutex_lock(uprobes_mmap_hash(inode));
1115 build_probe_list(inode, &tmp_list); 1102 build_probe_list(inode, vma, start, end, &tmp_list);
1116 1103
1117 list_for_each_entry(uprobe, &tmp_list, pending_list) { 1104 list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
1118 loff_t vaddr = vma_address(vma, uprobe->offset); 1105 unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset);
1119 1106 /*
1120 if (vaddr >= start && vaddr < end) { 1107 * An unregister could have removed the probe before
1121 /* 1108 * unmap. So check before we decrement the count.
1122 * An unregister could have removed the probe before 1109 */
1123 * unmap. So check before we decrement the count. 1110 if (is_swbp_at_addr(vma->vm_mm, vaddr) == 1)
1124 */ 1111 atomic_dec(&vma->vm_mm->uprobes_state.count);
1125 if (is_swbp_at_addr(vma->vm_mm, vaddr) == 1)
1126 atomic_dec(&vma->vm_mm->uprobes_state.count);
1127 }
1128 put_uprobe(uprobe); 1112 put_uprobe(uprobe);
1129 } 1113 }
1130 mutex_unlock(uprobes_mmap_hash(inode)); 1114 mutex_unlock(uprobes_mmap_hash(inode));
@@ -1463,12 +1447,9 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
1463 vma = find_vma(mm, bp_vaddr); 1447 vma = find_vma(mm, bp_vaddr);
1464 if (vma && vma->vm_start <= bp_vaddr) { 1448 if (vma && vma->vm_start <= bp_vaddr) {
1465 if (valid_vma(vma, false)) { 1449 if (valid_vma(vma, false)) {
1466 struct inode *inode; 1450 struct inode *inode = vma->vm_file->f_mapping->host;
1467 loff_t offset; 1451 loff_t offset = vaddr_to_offset(vma, bp_vaddr);
1468 1452
1469 inode = vma->vm_file->f_mapping->host;
1470 offset = bp_vaddr - vma->vm_start;
1471 offset += (vma->vm_pgoff << PAGE_SHIFT);
1472 uprobe = find_uprobe(inode, offset); 1453 uprobe = find_uprobe(inode, offset);
1473 } 1454 }
1474 1455
diff --git a/kernel/exit.c b/kernel/exit.c
index d17f6c4ddfa9..f65345f9e5bb 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -483,7 +483,7 @@ static void close_files(struct files_struct * files)
483 rcu_read_unlock(); 483 rcu_read_unlock();
484 for (;;) { 484 for (;;) {
485 unsigned long set; 485 unsigned long set;
486 i = j * __NFDBITS; 486 i = j * BITS_PER_LONG;
487 if (i >= fdt->max_fds) 487 if (i >= fdt->max_fds)
488 break; 488 break;
489 set = fdt->open_fds[j++]; 489 set = fdt->open_fds[j++];
diff --git a/kernel/fork.c b/kernel/fork.c
index ff1cad3b7bdc..2c8857e12855 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -114,6 +114,10 @@ int nr_processes(void)
114 return total; 114 return total;
115} 115}
116 116
117void __weak arch_release_task_struct(struct task_struct *tsk)
118{
119}
120
117#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR 121#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
118static struct kmem_cache *task_struct_cachep; 122static struct kmem_cache *task_struct_cachep;
119 123
@@ -122,17 +126,17 @@ static inline struct task_struct *alloc_task_struct_node(int node)
122 return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node); 126 return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node);
123} 127}
124 128
125void __weak arch_release_task_struct(struct task_struct *tsk) { }
126
127static inline void free_task_struct(struct task_struct *tsk) 129static inline void free_task_struct(struct task_struct *tsk)
128{ 130{
129 arch_release_task_struct(tsk);
130 kmem_cache_free(task_struct_cachep, tsk); 131 kmem_cache_free(task_struct_cachep, tsk);
131} 132}
132#endif 133#endif
133 134
135void __weak arch_release_thread_info(struct thread_info *ti)
136{
137}
138
134#ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR 139#ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR
135void __weak arch_release_thread_info(struct thread_info *ti) { }
136 140
137/* 141/*
138 * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a 142 * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
@@ -150,7 +154,6 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
150 154
151static inline void free_thread_info(struct thread_info *ti) 155static inline void free_thread_info(struct thread_info *ti)
152{ 156{
153 arch_release_thread_info(ti);
154 free_pages((unsigned long)ti, THREAD_SIZE_ORDER); 157 free_pages((unsigned long)ti, THREAD_SIZE_ORDER);
155} 158}
156# else 159# else
@@ -164,7 +167,6 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
164 167
165static void free_thread_info(struct thread_info *ti) 168static void free_thread_info(struct thread_info *ti)
166{ 169{
167 arch_release_thread_info(ti);
168 kmem_cache_free(thread_info_cache, ti); 170 kmem_cache_free(thread_info_cache, ti);
169} 171}
170 172
@@ -205,10 +207,12 @@ static void account_kernel_stack(struct thread_info *ti, int account)
205void free_task(struct task_struct *tsk) 207void free_task(struct task_struct *tsk)
206{ 208{
207 account_kernel_stack(tsk->stack, -1); 209 account_kernel_stack(tsk->stack, -1);
210 arch_release_thread_info(tsk->stack);
208 free_thread_info(tsk->stack); 211 free_thread_info(tsk->stack);
209 rt_mutex_debug_task_free(tsk); 212 rt_mutex_debug_task_free(tsk);
210 ftrace_graph_exit_task(tsk); 213 ftrace_graph_exit_task(tsk);
211 put_seccomp_filter(tsk); 214 put_seccomp_filter(tsk);
215 arch_release_task_struct(tsk);
212 free_task_struct(tsk); 216 free_task_struct(tsk);
213} 217}
214EXPORT_SYMBOL(free_task); 218EXPORT_SYMBOL(free_task);
@@ -298,23 +302,16 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
298 return NULL; 302 return NULL;
299 303
300 ti = alloc_thread_info_node(tsk, node); 304 ti = alloc_thread_info_node(tsk, node);
301 if (!ti) { 305 if (!ti)
302 free_task_struct(tsk); 306 goto free_tsk;
303 return NULL;
304 }
305 307
306 err = arch_dup_task_struct(tsk, orig); 308 err = arch_dup_task_struct(tsk, orig);
309 if (err)
310 goto free_ti;
307 311
308 /*
309 * We defer looking at err, because we will need this setup
310 * for the clean up path to work correctly.
311 */
312 tsk->stack = ti; 312 tsk->stack = ti;
313 setup_thread_stack(tsk, orig);
314
315 if (err)
316 goto out;
317 313
314 setup_thread_stack(tsk, orig);
318 clear_user_return_notifier(tsk); 315 clear_user_return_notifier(tsk);
319 clear_tsk_need_resched(tsk); 316 clear_tsk_need_resched(tsk);
320 stackend = end_of_stack(tsk); 317 stackend = end_of_stack(tsk);
@@ -338,8 +335,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
338 335
339 return tsk; 336 return tsk;
340 337
341out: 338free_ti:
342 free_thread_info(ti); 339 free_thread_info(ti);
340free_tsk:
343 free_task_struct(tsk); 341 free_task_struct(tsk);
344 return NULL; 342 return NULL;
345} 343}
@@ -383,16 +381,14 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
383 struct file *file; 381 struct file *file;
384 382
385 if (mpnt->vm_flags & VM_DONTCOPY) { 383 if (mpnt->vm_flags & VM_DONTCOPY) {
386 long pages = vma_pages(mpnt);
387 mm->total_vm -= pages;
388 vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file, 384 vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
389 -pages); 385 -vma_pages(mpnt));
390 continue; 386 continue;
391 } 387 }
392 charge = 0; 388 charge = 0;
393 if (mpnt->vm_flags & VM_ACCOUNT) { 389 if (mpnt->vm_flags & VM_ACCOUNT) {
394 unsigned long len; 390 unsigned long len = vma_pages(mpnt);
395 len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; 391
396 if (security_vm_enough_memory_mm(oldmm, len)) /* sic */ 392 if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
397 goto fail_nomem; 393 goto fail_nomem;
398 charge = len; 394 charge = len;
@@ -459,8 +455,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
459 if (retval) 455 if (retval)
460 goto out; 456 goto out;
461 457
462 if (file && uprobe_mmap(tmp)) 458 if (file)
463 goto out; 459 uprobe_mmap(tmp);
464 } 460 }
465 /* a new mm has just been created */ 461 /* a new mm has just been created */
466 arch_dup_mmap(oldmm, mm); 462 arch_dup_mmap(oldmm, mm);
@@ -1310,7 +1306,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1310#ifdef CONFIG_DEBUG_MUTEXES 1306#ifdef CONFIG_DEBUG_MUTEXES
1311 p->blocked_on = NULL; /* not blocked yet */ 1307 p->blocked_on = NULL; /* not blocked yet */
1312#endif 1308#endif
1313#ifdef CONFIG_CGROUP_MEM_RES_CTLR 1309#ifdef CONFIG_MEMCG
1314 p->memcg_batch.do_batch = 0; 1310 p->memcg_batch.do_batch = 0;
1315 p->memcg_batch.memcg = NULL; 1311 p->memcg_batch.memcg = NULL;
1316#endif 1312#endif
diff --git a/kernel/futex.c b/kernel/futex.c
index e2b0fb9a0b3b..3717e7b306e0 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2231,11 +2231,11 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2231 * @uaddr2: the pi futex we will take prior to returning to user-space 2231 * @uaddr2: the pi futex we will take prior to returning to user-space
2232 * 2232 *
2233 * The caller will wait on uaddr and will be requeued by futex_requeue() to 2233 * The caller will wait on uaddr and will be requeued by futex_requeue() to
2234 * uaddr2 which must be PI aware. Normal wakeup will wake on uaddr2 and 2234 * uaddr2 which must be PI aware and unique from uaddr. Normal wakeup will wake
2235 * complete the acquisition of the rt_mutex prior to returning to userspace. 2235 * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to
2236 * This ensures the rt_mutex maintains an owner when it has waiters; without 2236 * userspace. This ensures the rt_mutex maintains an owner when it has waiters;
2237 * one, the pi logic wouldn't know which task to boost/deboost, if there was a 2237 * without one, the pi logic would not know which task to boost/deboost, if
2238 * need to. 2238 * there was a need to.
2239 * 2239 *
2240 * We call schedule in futex_wait_queue_me() when we enqueue and return there 2240 * We call schedule in futex_wait_queue_me() when we enqueue and return there
2241 * via the following: 2241 * via the following:
@@ -2272,6 +2272,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2272 struct futex_q q = futex_q_init; 2272 struct futex_q q = futex_q_init;
2273 int res, ret; 2273 int res, ret;
2274 2274
2275 if (uaddr == uaddr2)
2276 return -EINVAL;
2277
2275 if (!bitset) 2278 if (!bitset)
2276 return -EINVAL; 2279 return -EINVAL;
2277 2280
@@ -2343,7 +2346,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2343 * signal. futex_unlock_pi() will not destroy the lock_ptr nor 2346 * signal. futex_unlock_pi() will not destroy the lock_ptr nor
2344 * the pi_state. 2347 * the pi_state.
2345 */ 2348 */
2346 WARN_ON(!&q.pi_state); 2349 WARN_ON(!q.pi_state);
2347 pi_mutex = &q.pi_state->pi_mutex; 2350 pi_mutex = &q.pi_state->pi_mutex;
2348 ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1); 2351 ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1);
2349 debug_rt_mutex_free_waiter(&rt_waiter); 2352 debug_rt_mutex_free_waiter(&rt_waiter);
@@ -2370,7 +2373,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2370 * fault, unlock the rt_mutex and return the fault to userspace. 2373 * fault, unlock the rt_mutex and return the fault to userspace.
2371 */ 2374 */
2372 if (ret == -EFAULT) { 2375 if (ret == -EFAULT) {
2373 if (rt_mutex_owner(pi_mutex) == current) 2376 if (pi_mutex && rt_mutex_owner(pi_mutex) == current)
2374 rt_mutex_unlock(pi_mutex); 2377 rt_mutex_unlock(pi_mutex);
2375 } else if (ret == -EINTR) { 2378 } else if (ret == -EINTR) {
2376 /* 2379 /*
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index bdb180325551..131ca176b497 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -133,7 +133,7 @@ irqreturn_t
133handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) 133handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
134{ 134{
135 irqreturn_t retval = IRQ_NONE; 135 irqreturn_t retval = IRQ_NONE;
136 unsigned int random = 0, irq = desc->irq_data.irq; 136 unsigned int flags = 0, irq = desc->irq_data.irq;
137 137
138 do { 138 do {
139 irqreturn_t res; 139 irqreturn_t res;
@@ -161,7 +161,7 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
161 161
162 /* Fall through to add to randomness */ 162 /* Fall through to add to randomness */
163 case IRQ_HANDLED: 163 case IRQ_HANDLED:
164 random |= action->flags; 164 flags |= action->flags;
165 break; 165 break;
166 166
167 default: 167 default:
@@ -172,8 +172,7 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
172 action = action->next; 172 action = action->next;
173 } while (action); 173 } while (action);
174 174
175 if (random & IRQF_SAMPLE_RANDOM) 175 add_interrupt_randomness(irq, flags);
176 add_interrupt_randomness(irq);
177 176
178 if (!noirqdebug) 177 if (!noirqdebug)
179 note_interrupt(irq, desc, retval); 178 note_interrupt(irq, desc, retval);
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 38c5eb839c92..49a77727db42 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -10,6 +10,7 @@
10#include <linux/mutex.h> 10#include <linux/mutex.h>
11#include <linux/of.h> 11#include <linux/of.h>
12#include <linux/of_address.h> 12#include <linux/of_address.h>
13#include <linux/topology.h>
13#include <linux/seq_file.h> 14#include <linux/seq_file.h>
14#include <linux/slab.h> 15#include <linux/slab.h>
15#include <linux/smp.h> 16#include <linux/smp.h>
@@ -45,7 +46,8 @@ static struct irq_domain *irq_domain_alloc(struct device_node *of_node,
45{ 46{
46 struct irq_domain *domain; 47 struct irq_domain *domain;
47 48
48 domain = kzalloc(sizeof(*domain), GFP_KERNEL); 49 domain = kzalloc_node(sizeof(*domain), GFP_KERNEL,
50 of_node_to_nid(of_node));
49 if (WARN_ON(!domain)) 51 if (WARN_ON(!domain))
50 return NULL; 52 return NULL;
51 53
@@ -138,6 +140,36 @@ static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain,
138} 140}
139 141
140/** 142/**
143 * irq_domain_add_simple() - Allocate and register a simple irq_domain.
144 * @of_node: pointer to interrupt controller's device tree node.
145 * @size: total number of irqs in mapping
146 * @first_irq: first number of irq block assigned to the domain
147 * @ops: map/unmap domain callbacks
148 * @host_data: Controller private data pointer
149 *
150 * Allocates a legacy irq_domain if first_irq is positive or a linear
151 * domain otherwise.
152 *
153 * This is intended to implement the expected behaviour for most
154 * interrupt controllers which is that a linear mapping should
155 * normally be used unless the system requires a legacy mapping in
156 * order to support supplying interrupt numbers during non-DT
157 * registration of devices.
158 */
159struct irq_domain *irq_domain_add_simple(struct device_node *of_node,
160 unsigned int size,
161 unsigned int first_irq,
162 const struct irq_domain_ops *ops,
163 void *host_data)
164{
165 if (first_irq > 0)
166 return irq_domain_add_legacy(of_node, size, first_irq, 0,
167 ops, host_data);
168 else
169 return irq_domain_add_linear(of_node, size, ops, host_data);
170}
171
172/**
141 * irq_domain_add_legacy() - Allocate and register a legacy revmap irq_domain. 173 * irq_domain_add_legacy() - Allocate and register a legacy revmap irq_domain.
142 * @of_node: pointer to interrupt controller's device tree node. 174 * @of_node: pointer to interrupt controller's device tree node.
143 * @size: total number of irqs in legacy mapping 175 * @size: total number of irqs in legacy mapping
@@ -203,7 +235,8 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node,
203 * one can then use irq_create_mapping() to 235 * one can then use irq_create_mapping() to
204 * explicitly change them 236 * explicitly change them
205 */ 237 */
206 ops->map(domain, irq, hwirq); 238 if (ops->map)
239 ops->map(domain, irq, hwirq);
207 240
208 /* Clear norequest flags */ 241 /* Clear norequest flags */
209 irq_clear_status_flags(irq, IRQ_NOREQUEST); 242 irq_clear_status_flags(irq, IRQ_NOREQUEST);
@@ -215,7 +248,7 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node,
215EXPORT_SYMBOL_GPL(irq_domain_add_legacy); 248EXPORT_SYMBOL_GPL(irq_domain_add_legacy);
216 249
217/** 250/**
218 * irq_domain_add_linear() - Allocate and register a legacy revmap irq_domain. 251 * irq_domain_add_linear() - Allocate and register a linear revmap irq_domain.
219 * @of_node: pointer to interrupt controller's device tree node. 252 * @of_node: pointer to interrupt controller's device tree node.
220 * @size: Number of interrupts in the domain. 253 * @size: Number of interrupts in the domain.
221 * @ops: map/unmap domain callbacks 254 * @ops: map/unmap domain callbacks
@@ -229,7 +262,8 @@ struct irq_domain *irq_domain_add_linear(struct device_node *of_node,
229 struct irq_domain *domain; 262 struct irq_domain *domain;
230 unsigned int *revmap; 263 unsigned int *revmap;
231 264
232 revmap = kzalloc(sizeof(*revmap) * size, GFP_KERNEL); 265 revmap = kzalloc_node(sizeof(*revmap) * size, GFP_KERNEL,
266 of_node_to_nid(of_node));
233 if (WARN_ON(!revmap)) 267 if (WARN_ON(!revmap))
234 return NULL; 268 return NULL;
235 269
@@ -330,24 +364,112 @@ void irq_set_default_host(struct irq_domain *domain)
330} 364}
331EXPORT_SYMBOL_GPL(irq_set_default_host); 365EXPORT_SYMBOL_GPL(irq_set_default_host);
332 366
333static int irq_setup_virq(struct irq_domain *domain, unsigned int virq, 367static void irq_domain_disassociate_many(struct irq_domain *domain,
334 irq_hw_number_t hwirq) 368 unsigned int irq_base, int count)
335{ 369{
336 struct irq_data *irq_data = irq_get_irq_data(virq); 370 /*
371 * disassociate in reverse order;
372 * not strictly necessary, but nice for unwinding
373 */
374 while (count--) {
375 int irq = irq_base + count;
376 struct irq_data *irq_data = irq_get_irq_data(irq);
377 irq_hw_number_t hwirq = irq_data->hwirq;
378
379 if (WARN_ON(!irq_data || irq_data->domain != domain))
380 continue;
381
382 irq_set_status_flags(irq, IRQ_NOREQUEST);
383
384 /* remove chip and handler */
385 irq_set_chip_and_handler(irq, NULL, NULL);
386
387 /* Make sure it's completed */
388 synchronize_irq(irq);
389
390 /* Tell the PIC about it */
391 if (domain->ops->unmap)
392 domain->ops->unmap(domain, irq);
393 smp_mb();
337 394
338 irq_data->hwirq = hwirq;
339 irq_data->domain = domain;
340 if (domain->ops->map(domain, virq, hwirq)) {
341 pr_debug("irq-%i==>hwirq-0x%lx mapping failed\n", virq, hwirq);
342 irq_data->domain = NULL; 395 irq_data->domain = NULL;
343 irq_data->hwirq = 0; 396 irq_data->hwirq = 0;
344 return -1; 397
398 /* Clear reverse map */
399 switch(domain->revmap_type) {
400 case IRQ_DOMAIN_MAP_LINEAR:
401 if (hwirq < domain->revmap_data.linear.size)
402 domain->revmap_data.linear.revmap[hwirq] = 0;
403 break;
404 case IRQ_DOMAIN_MAP_TREE:
405 mutex_lock(&revmap_trees_mutex);
406 radix_tree_delete(&domain->revmap_data.tree, hwirq);
407 mutex_unlock(&revmap_trees_mutex);
408 break;
409 }
345 } 410 }
411}
412
413int irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base,
414 irq_hw_number_t hwirq_base, int count)
415{
416 unsigned int virq = irq_base;
417 irq_hw_number_t hwirq = hwirq_base;
418 int i, ret;
419
420 pr_debug("%s(%s, irqbase=%i, hwbase=%i, count=%i)\n", __func__,
421 of_node_full_name(domain->of_node), irq_base, (int)hwirq_base, count);
422
423 for (i = 0; i < count; i++) {
424 struct irq_data *irq_data = irq_get_irq_data(virq + i);
425
426 if (WARN(!irq_data, "error: irq_desc not allocated; "
427 "irq=%i hwirq=0x%x\n", virq + i, (int)hwirq + i))
428 return -EINVAL;
429 if (WARN(irq_data->domain, "error: irq_desc already associated; "
430 "irq=%i hwirq=0x%x\n", virq + i, (int)hwirq + i))
431 return -EINVAL;
432 };
433
434 for (i = 0; i < count; i++, virq++, hwirq++) {
435 struct irq_data *irq_data = irq_get_irq_data(virq);
436
437 irq_data->hwirq = hwirq;
438 irq_data->domain = domain;
439 if (domain->ops->map) {
440 ret = domain->ops->map(domain, virq, hwirq);
441 if (ret != 0) {
442 pr_err("irq-%i==>hwirq-0x%lx mapping failed: %d\n",
443 virq, hwirq, ret);
444 WARN_ON(1);
445 irq_data->domain = NULL;
446 irq_data->hwirq = 0;
447 goto err_unmap;
448 }
449 }
346 450
347 irq_clear_status_flags(virq, IRQ_NOREQUEST); 451 switch (domain->revmap_type) {
452 case IRQ_DOMAIN_MAP_LINEAR:
453 if (hwirq < domain->revmap_data.linear.size)
454 domain->revmap_data.linear.revmap[hwirq] = virq;
455 break;
456 case IRQ_DOMAIN_MAP_TREE:
457 mutex_lock(&revmap_trees_mutex);
458 radix_tree_insert(&domain->revmap_data.tree, hwirq, irq_data);
459 mutex_unlock(&revmap_trees_mutex);
460 break;
461 }
462
463 irq_clear_status_flags(virq, IRQ_NOREQUEST);
464 }
348 465
349 return 0; 466 return 0;
467
468 err_unmap:
469 irq_domain_disassociate_many(domain, irq_base, i);
470 return -EINVAL;
350} 471}
472EXPORT_SYMBOL_GPL(irq_domain_associate_many);
351 473
352/** 474/**
353 * irq_create_direct_mapping() - Allocate an irq for direct mapping 475 * irq_create_direct_mapping() - Allocate an irq for direct mapping
@@ -364,10 +486,10 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain)
364 if (domain == NULL) 486 if (domain == NULL)
365 domain = irq_default_domain; 487 domain = irq_default_domain;
366 488
367 BUG_ON(domain == NULL); 489 if (WARN_ON(!domain || domain->revmap_type != IRQ_DOMAIN_MAP_NOMAP))
368 WARN_ON(domain->revmap_type != IRQ_DOMAIN_MAP_NOMAP); 490 return 0;
369 491
370 virq = irq_alloc_desc_from(1, 0); 492 virq = irq_alloc_desc_from(1, of_node_to_nid(domain->of_node));
371 if (!virq) { 493 if (!virq) {
372 pr_debug("create_direct virq allocation failed\n"); 494 pr_debug("create_direct virq allocation failed\n");
373 return 0; 495 return 0;
@@ -380,7 +502,7 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain)
380 } 502 }
381 pr_debug("create_direct obtained virq %d\n", virq); 503 pr_debug("create_direct obtained virq %d\n", virq);
382 504
383 if (irq_setup_virq(domain, virq, virq)) { 505 if (irq_domain_associate(domain, virq, virq)) {
384 irq_free_desc(virq); 506 irq_free_desc(virq);
385 return 0; 507 return 0;
386 } 508 }
@@ -433,17 +555,16 @@ unsigned int irq_create_mapping(struct irq_domain *domain,
433 hint = hwirq % nr_irqs; 555 hint = hwirq % nr_irqs;
434 if (hint == 0) 556 if (hint == 0)
435 hint++; 557 hint++;
436 virq = irq_alloc_desc_from(hint, 0); 558 virq = irq_alloc_desc_from(hint, of_node_to_nid(domain->of_node));
437 if (virq <= 0) 559 if (virq <= 0)
438 virq = irq_alloc_desc_from(1, 0); 560 virq = irq_alloc_desc_from(1, of_node_to_nid(domain->of_node));
439 if (virq <= 0) { 561 if (virq <= 0) {
440 pr_debug("-> virq allocation failed\n"); 562 pr_debug("-> virq allocation failed\n");
441 return 0; 563 return 0;
442 } 564 }
443 565
444 if (irq_setup_virq(domain, virq, hwirq)) { 566 if (irq_domain_associate(domain, virq, hwirq)) {
445 if (domain->revmap_type != IRQ_DOMAIN_MAP_LEGACY) 567 irq_free_desc(virq);
446 irq_free_desc(virq);
447 return 0; 568 return 0;
448 } 569 }
449 570
@@ -454,6 +575,44 @@ unsigned int irq_create_mapping(struct irq_domain *domain,
454} 575}
455EXPORT_SYMBOL_GPL(irq_create_mapping); 576EXPORT_SYMBOL_GPL(irq_create_mapping);
456 577
578/**
579 * irq_create_strict_mappings() - Map a range of hw irqs to fixed linux irqs
580 * @domain: domain owning the interrupt range
581 * @irq_base: beginning of linux IRQ range
582 * @hwirq_base: beginning of hardware IRQ range
583 * @count: Number of interrupts to map
584 *
585 * This routine is used for allocating and mapping a range of hardware
586 * irqs to linux irqs where the linux irq numbers are at pre-defined
587 * locations. For use by controllers that already have static mappings
588 * to insert in to the domain.
589 *
590 * Non-linear users can use irq_create_identity_mapping() for IRQ-at-a-time
591 * domain insertion.
592 *
593 * 0 is returned upon success, while any failure to establish a static
594 * mapping is treated as an error.
595 */
596int irq_create_strict_mappings(struct irq_domain *domain, unsigned int irq_base,
597 irq_hw_number_t hwirq_base, int count)
598{
599 int ret;
600
601 ret = irq_alloc_descs(irq_base, irq_base, count,
602 of_node_to_nid(domain->of_node));
603 if (unlikely(ret < 0))
604 return ret;
605
606 ret = irq_domain_associate_many(domain, irq_base, hwirq_base, count);
607 if (unlikely(ret < 0)) {
608 irq_free_descs(irq_base, count);
609 return ret;
610 }
611
612 return 0;
613}
614EXPORT_SYMBOL_GPL(irq_create_strict_mappings);
615
457unsigned int irq_create_of_mapping(struct device_node *controller, 616unsigned int irq_create_of_mapping(struct device_node *controller,
458 const u32 *intspec, unsigned int intsize) 617 const u32 *intspec, unsigned int intsize)
459{ 618{
@@ -511,7 +670,6 @@ void irq_dispose_mapping(unsigned int virq)
511{ 670{
512 struct irq_data *irq_data = irq_get_irq_data(virq); 671 struct irq_data *irq_data = irq_get_irq_data(virq);
513 struct irq_domain *domain; 672 struct irq_domain *domain;
514 irq_hw_number_t hwirq;
515 673
516 if (!virq || !irq_data) 674 if (!virq || !irq_data)
517 return; 675 return;
@@ -524,33 +682,7 @@ void irq_dispose_mapping(unsigned int virq)
524 if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY) 682 if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY)
525 return; 683 return;
526 684
527 irq_set_status_flags(virq, IRQ_NOREQUEST); 685 irq_domain_disassociate_many(domain, virq, 1);
528
529 /* remove chip and handler */
530 irq_set_chip_and_handler(virq, NULL, NULL);
531
532 /* Make sure it's completed */
533 synchronize_irq(virq);
534
535 /* Tell the PIC about it */
536 if (domain->ops->unmap)
537 domain->ops->unmap(domain, virq);
538 smp_mb();
539
540 /* Clear reverse map */
541 hwirq = irq_data->hwirq;
542 switch(domain->revmap_type) {
543 case IRQ_DOMAIN_MAP_LINEAR:
544 if (hwirq < domain->revmap_data.linear.size)
545 domain->revmap_data.linear.revmap[hwirq] = 0;
546 break;
547 case IRQ_DOMAIN_MAP_TREE:
548 mutex_lock(&revmap_trees_mutex);
549 radix_tree_delete(&domain->revmap_data.tree, hwirq);
550 mutex_unlock(&revmap_trees_mutex);
551 break;
552 }
553
554 irq_free_desc(virq); 686 irq_free_desc(virq);
555} 687}
556EXPORT_SYMBOL_GPL(irq_dispose_mapping); 688EXPORT_SYMBOL_GPL(irq_dispose_mapping);
@@ -559,16 +691,11 @@ EXPORT_SYMBOL_GPL(irq_dispose_mapping);
559 * irq_find_mapping() - Find a linux irq from an hw irq number. 691 * irq_find_mapping() - Find a linux irq from an hw irq number.
560 * @domain: domain owning this hardware interrupt 692 * @domain: domain owning this hardware interrupt
561 * @hwirq: hardware irq number in that domain space 693 * @hwirq: hardware irq number in that domain space
562 *
563 * This is a slow path, for use by generic code. It's expected that an
564 * irq controller implementation directly calls the appropriate low level
565 * mapping function.
566 */ 694 */
567unsigned int irq_find_mapping(struct irq_domain *domain, 695unsigned int irq_find_mapping(struct irq_domain *domain,
568 irq_hw_number_t hwirq) 696 irq_hw_number_t hwirq)
569{ 697{
570 unsigned int i; 698 struct irq_data *data;
571 unsigned int hint = hwirq % nr_irqs;
572 699
573 /* Look for default domain if necessary */ 700 /* Look for default domain if necessary */
574 if (domain == NULL) 701 if (domain == NULL)
@@ -576,115 +703,47 @@ unsigned int irq_find_mapping(struct irq_domain *domain,
576 if (domain == NULL) 703 if (domain == NULL)
577 return 0; 704 return 0;
578 705
579 /* legacy -> bail early */ 706 switch (domain->revmap_type) {
580 if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY) 707 case IRQ_DOMAIN_MAP_LEGACY:
581 return irq_domain_legacy_revmap(domain, hwirq); 708 return irq_domain_legacy_revmap(domain, hwirq);
582 709 case IRQ_DOMAIN_MAP_LINEAR:
583 /* Slow path does a linear search of the map */ 710 return irq_linear_revmap(domain, hwirq);
584 if (hint == 0) 711 case IRQ_DOMAIN_MAP_TREE:
585 hint = 1; 712 rcu_read_lock();
586 i = hint; 713 data = radix_tree_lookup(&domain->revmap_data.tree, hwirq);
587 do { 714 rcu_read_unlock();
588 struct irq_data *data = irq_get_irq_data(i); 715 if (data)
716 return data->irq;
717 break;
718 case IRQ_DOMAIN_MAP_NOMAP:
719 data = irq_get_irq_data(hwirq);
589 if (data && (data->domain == domain) && (data->hwirq == hwirq)) 720 if (data && (data->domain == domain) && (data->hwirq == hwirq))
590 return i; 721 return hwirq;
591 i++; 722 break;
592 if (i >= nr_irqs) 723 }
593 i = 1; 724
594 } while(i != hint);
595 return 0; 725 return 0;
596} 726}
597EXPORT_SYMBOL_GPL(irq_find_mapping); 727EXPORT_SYMBOL_GPL(irq_find_mapping);
598 728
599/** 729/**
600 * irq_radix_revmap_lookup() - Find a linux irq from a hw irq number.
601 * @domain: domain owning this hardware interrupt
602 * @hwirq: hardware irq number in that domain space
603 *
604 * This is a fast path, for use by irq controller code that uses radix tree
605 * revmaps
606 */
607unsigned int irq_radix_revmap_lookup(struct irq_domain *domain,
608 irq_hw_number_t hwirq)
609{
610 struct irq_data *irq_data;
611
612 if (WARN_ON_ONCE(domain->revmap_type != IRQ_DOMAIN_MAP_TREE))
613 return irq_find_mapping(domain, hwirq);
614
615 /*
616 * Freeing an irq can delete nodes along the path to
617 * do the lookup via call_rcu.
618 */
619 rcu_read_lock();
620 irq_data = radix_tree_lookup(&domain->revmap_data.tree, hwirq);
621 rcu_read_unlock();
622
623 /*
624 * If found in radix tree, then fine.
625 * Else fallback to linear lookup - this should not happen in practice
626 * as it means that we failed to insert the node in the radix tree.
627 */
628 return irq_data ? irq_data->irq : irq_find_mapping(domain, hwirq);
629}
630EXPORT_SYMBOL_GPL(irq_radix_revmap_lookup);
631
632/**
633 * irq_radix_revmap_insert() - Insert a hw irq to linux irq number mapping.
634 * @domain: domain owning this hardware interrupt
635 * @virq: linux irq number
636 * @hwirq: hardware irq number in that domain space
637 *
638 * This is for use by irq controllers that use a radix tree reverse
639 * mapping for fast lookup.
640 */
641void irq_radix_revmap_insert(struct irq_domain *domain, unsigned int virq,
642 irq_hw_number_t hwirq)
643{
644 struct irq_data *irq_data = irq_get_irq_data(virq);
645
646 if (WARN_ON(domain->revmap_type != IRQ_DOMAIN_MAP_TREE))
647 return;
648
649 if (virq) {
650 mutex_lock(&revmap_trees_mutex);
651 radix_tree_insert(&domain->revmap_data.tree, hwirq, irq_data);
652 mutex_unlock(&revmap_trees_mutex);
653 }
654}
655EXPORT_SYMBOL_GPL(irq_radix_revmap_insert);
656
657/**
658 * irq_linear_revmap() - Find a linux irq from a hw irq number. 730 * irq_linear_revmap() - Find a linux irq from a hw irq number.
659 * @domain: domain owning this hardware interrupt 731 * @domain: domain owning this hardware interrupt
660 * @hwirq: hardware irq number in that domain space 732 * @hwirq: hardware irq number in that domain space
661 * 733 *
662 * This is a fast path, for use by irq controller code that uses linear 734 * This is a fast path that can be called directly by irq controller code to
663 * revmaps. It does fallback to the slow path if the revmap doesn't exist 735 * save a handful of instructions.
664 * yet and will create the revmap entry with appropriate locking
665 */ 736 */
666unsigned int irq_linear_revmap(struct irq_domain *domain, 737unsigned int irq_linear_revmap(struct irq_domain *domain,
667 irq_hw_number_t hwirq) 738 irq_hw_number_t hwirq)
668{ 739{
669 unsigned int *revmap; 740 BUG_ON(domain->revmap_type != IRQ_DOMAIN_MAP_LINEAR);
670
671 if (WARN_ON_ONCE(domain->revmap_type != IRQ_DOMAIN_MAP_LINEAR))
672 return irq_find_mapping(domain, hwirq);
673 741
674 /* Check revmap bounds */ 742 /* Check revmap bounds; complain if exceeded */
675 if (unlikely(hwirq >= domain->revmap_data.linear.size)) 743 if (WARN_ON(hwirq >= domain->revmap_data.linear.size))
676 return irq_find_mapping(domain, hwirq); 744 return 0;
677
678 /* Check if revmap was allocated */
679 revmap = domain->revmap_data.linear.revmap;
680 if (unlikely(revmap == NULL))
681 return irq_find_mapping(domain, hwirq);
682
683 /* Fill up revmap with slow path if no mapping found */
684 if (unlikely(!revmap[hwirq]))
685 revmap[hwirq] = irq_find_mapping(domain, hwirq);
686 745
687 return revmap[hwirq]; 746 return domain->revmap_data.linear.revmap[hwirq];
688} 747}
689EXPORT_SYMBOL_GPL(irq_linear_revmap); 748EXPORT_SYMBOL_GPL(irq_linear_revmap);
690 749
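
The rewritten irq_find_mapping() above replaces the old linear scan over every irq descriptor with a per-domain reverse map: linear domains index an array by hwirq, tree domains do a radix-tree lookup, and nomap domains check the descriptor directly. A minimal userspace sketch of the linear case, using a plain array and hypothetical names rather than the kernel's revmap_data, shows why the lookup becomes constant time:

#include <stdio.h>

#define NR_HWIRQS 16

/* hypothetical stand-in for domain->revmap_data.linear.revmap */
static unsigned int revmap[NR_HWIRQS];

/* record hwirq -> virq at mapping time */
static void revmap_insert(unsigned int hwirq, unsigned int virq)
{
        if (hwirq < NR_HWIRQS)
                revmap[hwirq] = virq;
}

/* constant-time lookup; 0 means "no mapping", as in the kernel */
static unsigned int revmap_lookup(unsigned int hwirq)
{
        if (hwirq >= NR_HWIRQS)
                return 0;
        return revmap[hwirq];
}

int main(void)
{
        revmap_insert(3, 42);
        printf("hwirq 3 -> virq %u\n", revmap_lookup(3));   /* 42 */
        printf("hwirq 7 -> virq %u\n", revmap_lookup(7));   /* 0: unmapped */
        return 0;
}
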
@@ -761,12 +820,6 @@ static int __init irq_debugfs_init(void)
761__initcall(irq_debugfs_init); 820__initcall(irq_debugfs_init);
762#endif /* CONFIG_IRQ_DOMAIN_DEBUG */ 821#endif /* CONFIG_IRQ_DOMAIN_DEBUG */
763 822
764static int irq_domain_simple_map(struct irq_domain *d, unsigned int irq,
765 irq_hw_number_t hwirq)
766{
767 return 0;
768}
769
770/** 823/**
771 * irq_domain_xlate_onecell() - Generic xlate for direct one cell bindings 824 * irq_domain_xlate_onecell() - Generic xlate for direct one cell bindings
772 * 825 *
@@ -829,7 +882,6 @@ int irq_domain_xlate_onetwocell(struct irq_domain *d,
829EXPORT_SYMBOL_GPL(irq_domain_xlate_onetwocell); 882EXPORT_SYMBOL_GPL(irq_domain_xlate_onetwocell);
830 883
831const struct irq_domain_ops irq_domain_simple_ops = { 884const struct irq_domain_ops irq_domain_simple_ops = {
832 .map = irq_domain_simple_map,
833 .xlate = irq_domain_xlate_onetwocell, 885 .xlate = irq_domain_xlate_onetwocell,
834}; 886};
835EXPORT_SYMBOL_GPL(irq_domain_simple_ops); 887EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 814c9ef6bba1..4c69326aa773 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -893,22 +893,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
893 return -ENOSYS; 893 return -ENOSYS;
894 if (!try_module_get(desc->owner)) 894 if (!try_module_get(desc->owner))
895 return -ENODEV; 895 return -ENODEV;
896 /*
897 * Some drivers like serial.c use request_irq() heavily,
898 * so we have to be careful not to interfere with a
899 * running system.
900 */
901 if (new->flags & IRQF_SAMPLE_RANDOM) {
902 /*
903 * This function might sleep, we want to call it first,
904 * outside of the atomic block.
905 * Yes, this might clear the entropy pool if the wrong
906 * driver is attempted to be loaded, without actually
907 * installing a new handler, but is this really a problem,
908 * only the sysadmin is able to do this.
909 */
910 rand_initialize_irq(irq);
911 }
912 896
913 /* 897 /*
914 * Check whether the interrupt nests into another interrupt 898 * Check whether the interrupt nests into another interrupt
@@ -960,6 +944,18 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
960 } 944 }
961 945
962 /* 946 /*
947 * Drivers are often written to work w/o knowledge about the
948 * underlying irq chip implementation, so a request for a
949 * threaded irq without a primary hard irq context handler
950 * requires the ONESHOT flag to be set. Some irq chips like
951 * MSI based interrupts are per se one shot safe. Check the
952 * chip flags, so we can avoid the unmask dance at the end of
953 * the threaded handler for those.
954 */
955 if (desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE)
956 new->flags &= ~IRQF_ONESHOT;
957
958 /*
963 * The following block of code has to be executed atomically 959 * The following block of code has to be executed atomically
964 */ 960 */
965 raw_spin_lock_irqsave(&desc->lock, flags); 961 raw_spin_lock_irqsave(&desc->lock, flags);
@@ -1033,7 +1029,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1033 */ 1029 */
1034 new->thread_mask = 1 << ffz(thread_mask); 1030 new->thread_mask = 1 << ffz(thread_mask);
1035 1031
1036 } else if (new->handler == irq_default_primary_handler) { 1032 } else if (new->handler == irq_default_primary_handler &&
1033 !(desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE)) {
1037 /* 1034 /*
1038 * The interrupt was requested with handler = NULL, so 1035 * The interrupt was requested with handler = NULL, so
1039 * we use the default primary handler for it. But it 1036 * we use the default primary handler for it. But it
@@ -1354,7 +1351,6 @@ EXPORT_SYMBOL(free_irq);
1354 * Flags: 1351 * Flags:
1355 * 1352 *
1356 * IRQF_SHARED Interrupt is shared 1353 * IRQF_SHARED Interrupt is shared
1357 * IRQF_SAMPLE_RANDOM The interrupt can be used for entropy
1358 * IRQF_TRIGGER_* Specify active edge(s) or level 1354 * IRQF_TRIGGER_* Specify active edge(s) or level
1359 * 1355 *
1360 */ 1356 */
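
The IRQCHIP_ONESHOT_SAFE handling added to __setup_irq() above boils down to two flag decisions: a chip that is inherently one-shot safe never needs IRQF_ONESHOT, while a threaded request relying on the default primary handler must carry it unless the chip is safe. A small sketch with made-up flag values (not the kernel's real bit definitions) captures the decision table:

#include <stdio.h>
#include <stdbool.h>

/* illustrative flag bits only; the kernel uses different values */
#define IRQF_ONESHOT            0x1
#define IRQCHIP_ONESHOT_SAFE    0x2

static int check_oneshot(unsigned int *irq_flags, unsigned int chip_flags,
                         bool default_primary_handler)
{
        /* a one-shot safe chip never needs the unmask dance */
        if (chip_flags & IRQCHIP_ONESHOT_SAFE)
                *irq_flags &= ~IRQF_ONESHOT;

        /* threaded irq with the default primary handler must be one-shot */
        if (default_primary_handler &&
            !(chip_flags & IRQCHIP_ONESHOT_SAFE) &&
            !(*irq_flags & IRQF_ONESHOT))
                return -1;      /* the kernel rejects such a request */

        return 0;
}

int main(void)
{
        unsigned int flags = 0;

        printf("safe chip: %d\n", check_oneshot(&flags, IRQCHIP_ONESHOT_SAFE, true));
        printf("unsafe chip, no ONESHOT: %d\n", check_oneshot(&flags, 0, true));
        return 0;
}
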
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 4e2e472f6aeb..0668d58d6413 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1424,7 +1424,7 @@ static void update_vmcoreinfo_note(void)
1424 1424
1425void crash_save_vmcoreinfo(void) 1425void crash_save_vmcoreinfo(void)
1426{ 1426{
1427 vmcoreinfo_append_str("CRASHTIME=%ld", get_seconds()); 1427 vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
1428 update_vmcoreinfo_note(); 1428 update_vmcoreinfo_note();
1429} 1429}
1430 1430
diff --git a/kernel/kmod.c b/kernel/kmod.c
index ff2c7cb86d77..6f99aead66c6 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -45,6 +45,13 @@ extern int max_threads;
45 45
46static struct workqueue_struct *khelper_wq; 46static struct workqueue_struct *khelper_wq;
47 47
48/*
49 * kmod_thread_locker is used for deadlock avoidance. There is no explicit
50 * locking to protect this global - it is private to the singleton khelper
51 * thread and should only ever be modified by that thread.
52 */
53static const struct task_struct *kmod_thread_locker;
54
48#define CAP_BSET (void *)1 55#define CAP_BSET (void *)1
49#define CAP_PI (void *)2 56#define CAP_PI (void *)2
50 57
@@ -221,6 +228,13 @@ fail:
221 return 0; 228 return 0;
222} 229}
223 230
231static int call_helper(void *data)
232{
233 /* Worker thread started blocking khelper thread. */
234 kmod_thread_locker = current;
235 return ____call_usermodehelper(data);
236}
237
224static void call_usermodehelper_freeinfo(struct subprocess_info *info) 238static void call_usermodehelper_freeinfo(struct subprocess_info *info)
225{ 239{
226 if (info->cleanup) 240 if (info->cleanup)
@@ -295,9 +309,12 @@ static void __call_usermodehelper(struct work_struct *work)
295 if (wait == UMH_WAIT_PROC) 309 if (wait == UMH_WAIT_PROC)
296 pid = kernel_thread(wait_for_helper, sub_info, 310 pid = kernel_thread(wait_for_helper, sub_info,
297 CLONE_FS | CLONE_FILES | SIGCHLD); 311 CLONE_FS | CLONE_FILES | SIGCHLD);
298 else 312 else {
299 pid = kernel_thread(____call_usermodehelper, sub_info, 313 pid = kernel_thread(call_helper, sub_info,
300 CLONE_VFORK | SIGCHLD); 314 CLONE_VFORK | SIGCHLD);
315 /* Worker thread stopped blocking khelper thread. */
316 kmod_thread_locker = NULL;
317 }
301 318
302 switch (wait) { 319 switch (wait) {
303 case UMH_NO_WAIT: 320 case UMH_NO_WAIT:
@@ -548,6 +565,16 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
548 retval = -EBUSY; 565 retval = -EBUSY;
549 goto out; 566 goto out;
550 } 567 }
568 /*
569 * The worker thread must not wait for the khelper thread at the
570 * wait_for_completion() below if it was created with the CLONE_VFORK
571 * flag, because the khelper thread is already waiting for that thread
572 * at wait_for_completion() in do_fork().
573 */
574 if (wait != UMH_NO_WAIT && current == kmod_thread_locker) {
575 retval = -EBUSY;
576 goto out;
577 }
551 578
552 sub_info->complete = &done; 579 sub_info->complete = &done;
553 sub_info->wait = wait; 580 sub_info->wait = wait;
@@ -577,6 +604,12 @@ unlock:
577 return retval; 604 return retval;
578} 605}
579 606
607/*
608 * call_usermodehelper_fns() will not run the caller-provided cleanup function
609 * if a memory allocation failure is experienced. So the caller might need to
610 * check the call_usermodehelper_fns() return value: if it is -ENOMEM, perform
611 * the necessaary cleanup within the caller.
612 */
580int call_usermodehelper_fns( 613int call_usermodehelper_fns(
581 char *path, char **argv, char **envp, int wait, 614 char *path, char **argv, char **envp, int wait,
582 int (*init)(struct subprocess_info *info, struct cred *new), 615 int (*init)(struct subprocess_info *info, struct cred *new),
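
The kmod_thread_locker bookkeeping above exists so the khelper worker cannot wait on itself: when a helper is spawned with CLONE_VFORK, the khelper thread is already blocked in do_fork(), and a nested call_usermodehelper_exec() that would wait must bail out with -EBUSY. A minimal sketch of that guard, with simplified types standing in for the kernel's task pointers:

#include <stdio.h>
#include <stddef.h>

struct task { const char *name; };

/* set while the khelper thread is blocked in a CLONE_VFORK helper */
static const struct task *kmod_thread_locker;

static int helper_exec(const struct task *current_task, int wait)
{
        /* waiting from the locked thread would deadlock: refuse instead */
        if (wait && current_task == kmod_thread_locker)
                return -16;     /* -EBUSY */
        /* ... queue the work and optionally wait for completion ... */
        return 0;
}

int main(void)
{
        struct task khelper_child = { "khelper-child" };
        struct task other = { "other" };

        kmod_thread_locker = &khelper_child;
        printf("nested wait from child: %d\n", helper_exec(&khelper_child, 1));
        printf("wait from other task:   %d\n", helper_exec(&other, 1));
        return 0;
}
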
diff --git a/kernel/panic.c b/kernel/panic.c
index d2a5f4ecc6dd..e1b2822fff97 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -75,6 +75,14 @@ void panic(const char *fmt, ...)
75 int state = 0; 75 int state = 0;
76 76
77 /* 77 /*
78 * Disable local interrupts. This will prevent panic_smp_self_stop
79 * from deadlocking the first cpu that invokes the panic, since
80 * there is nothing to prevent an interrupt handler (that runs
81 * after the panic_lock is acquired) from invoking panic again.
82 */
83 local_irq_disable();
84
85 /*
78 * It's possible to come here directly from a panic-assertion and 86 * It's possible to come here directly from a panic-assertion and
79 * not have preempt disabled. Some functions called from here want 87 * not have preempt disabled. Some functions called from here want
80 * preempt to be disabled. No point enabling it later though... 88 * preempt to be disabled. No point enabling it later though...
diff --git a/kernel/printk.c b/kernel/printk.c
index 50c96b5651b6..66a2ea37b576 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -389,8 +389,10 @@ static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv,
389 389
390 line = buf; 390 line = buf;
391 for (i = 0; i < count; i++) { 391 for (i = 0; i < count; i++) {
392 if (copy_from_user(line, iv[i].iov_base, iv[i].iov_len)) 392 if (copy_from_user(line, iv[i].iov_base, iv[i].iov_len)) {
393 ret = -EFAULT;
393 goto out; 394 goto out;
395 }
394 line += iv[i].iov_len; 396 line += iv[i].iov_len;
395 } 397 }
396 398
@@ -1032,6 +1034,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
1032 struct log *msg = log_from_idx(idx); 1034 struct log *msg = log_from_idx(idx);
1033 1035
1034 len += msg_print_text(msg, prev, true, NULL, 0); 1036 len += msg_print_text(msg, prev, true, NULL, 0);
1037 prev = msg->flags;
1035 idx = log_next(idx); 1038 idx = log_next(idx);
1036 seq++; 1039 seq++;
1037 } 1040 }
@@ -1044,6 +1047,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
1044 struct log *msg = log_from_idx(idx); 1047 struct log *msg = log_from_idx(idx);
1045 1048
1046 len -= msg_print_text(msg, prev, true, NULL, 0); 1049 len -= msg_print_text(msg, prev, true, NULL, 0);
1050 prev = msg->flags;
1047 idx = log_next(idx); 1051 idx = log_next(idx);
1048 seq++; 1052 seq++;
1049 } 1053 }
@@ -1540,17 +1544,23 @@ asmlinkage int vprintk_emit(int facility, int level,
1540 lflags |= LOG_NEWLINE; 1544 lflags |= LOG_NEWLINE;
1541 } 1545 }
1542 1546
1543 /* strip syslog prefix and extract log level or control flags */ 1547 /* strip kernel syslog prefix and extract log level or control flags */
1544 if (text[0] == '<' && text[1] && text[2] == '>') { 1548 if (facility == 0) {
1545 switch (text[1]) { 1549 int kern_level = printk_get_level(text);
1546 case '0' ... '7': 1550
1547 if (level == -1) 1551 if (kern_level) {
1548 level = text[1] - '0'; 1552 const char *end_of_header = printk_skip_level(text);
1549 case 'd': /* KERN_DEFAULT */ 1553 switch (kern_level) {
1550 lflags |= LOG_PREFIX; 1554 case '0' ... '7':
1551 case 'c': /* KERN_CONT */ 1555 if (level == -1)
1552 text += 3; 1556 level = kern_level - '0';
1553 text_len -= 3; 1557 case 'd': /* KERN_DEFAULT */
1558 lflags |= LOG_PREFIX;
1559 case 'c': /* KERN_CONT */
1560 break;
1561 }
1562 text_len -= end_of_header - text;
1563 text = (char *)end_of_header;
1554 } 1564 }
1555 } 1565 }
1556 1566
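
The vprintk_emit() change above switches from open-coded "<N>" parsing to the printk_get_level()/printk_skip_level() helpers and only strips the prefix for kernel-originated messages (facility == 0). A standalone sketch of what those helpers do, written here from scratch rather than copied from the kernel:

#include <stdio.h>

/* return the level character of a "<c>" prefix, or 0 if there is none */
static char get_level(const char *s)
{
        if (s[0] == '<' && s[1] && s[2] == '>')
                return s[1];
        return 0;
}

/* skip past the "<c>" header if present */
static const char *skip_level(const char *s)
{
        return get_level(s) ? s + 3 : s;
}

int main(void)
{
        const char *msg = "<3>disk error";
        char level = get_level(msg);

        if (level >= '0' && level <= '7')
                printf("level %c, text \"%s\"\n", level, skip_level(msg));
        else
                printf("no level, text \"%s\"\n", msg);
        return 0;
}
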
diff --git a/kernel/resource.c b/kernel/resource.c
index dc8b47764443..34d45886ee84 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -7,6 +7,8 @@
7 * Arbitrary resource management. 7 * Arbitrary resource management.
8 */ 8 */
9 9
10#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11
10#include <linux/export.h> 12#include <linux/export.h>
11#include <linux/errno.h> 13#include <linux/errno.h>
12#include <linux/ioport.h> 14#include <linux/ioport.h>
@@ -791,8 +793,28 @@ void __init reserve_region_with_split(struct resource *root,
791 resource_size_t start, resource_size_t end, 793 resource_size_t start, resource_size_t end,
792 const char *name) 794 const char *name)
793{ 795{
796 int abort = 0;
797
794 write_lock(&resource_lock); 798 write_lock(&resource_lock);
795 __reserve_region_with_split(root, start, end, name); 799 if (root->start > start || root->end < end) {
800 pr_err("requested range [0x%llx-0x%llx] not in root %pr\n",
801 (unsigned long long)start, (unsigned long long)end,
802 root);
803 if (start > root->end || end < root->start)
804 abort = 1;
805 else {
806 if (end > root->end)
807 end = root->end;
808 if (start < root->start)
809 start = root->start;
810 pr_err("fixing request to [0x%llx-0x%llx]\n",
811 (unsigned long long)start,
812 (unsigned long long)end);
813 }
814 dump_stack();
815 }
816 if (!abort)
817 __reserve_region_with_split(root, start, end, name);
796 write_unlock(&resource_lock); 818 write_unlock(&resource_lock);
797} 819}
798 820
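
The reserve_region_with_split() hunk above clamps an out-of-range request to the root resource and aborts only when the two ranges do not overlap at all. The interval logic in isolation, with plain integers instead of struct resource:

#include <stdio.h>

/* clamp [*start, *end] to [root_start, root_end]; return 0 if disjoint */
static int clamp_to_root(unsigned long long *start, unsigned long long *end,
                         unsigned long long root_start,
                         unsigned long long root_end)
{
        if (*start > root_end || *end < root_start)
                return 0;               /* no overlap: abort the reservation */
        if (*end > root_end)
                *end = root_end;
        if (*start < root_start)
                *start = root_start;
        return 1;
}

int main(void)
{
        unsigned long long s = 0x90000, e = 0x110000;

        if (clamp_to_root(&s, &e, 0x100000, 0x1fffff))
                printf("fixing request to [0x%llx-0x%llx]\n", s, e);
        else
                printf("request entirely outside root, aborted\n");
        return 0;
}
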
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5d011ef4c0df..fbf1fd098dc6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1910,12 +1910,12 @@ static inline void
1910prepare_task_switch(struct rq *rq, struct task_struct *prev, 1910prepare_task_switch(struct rq *rq, struct task_struct *prev,
1911 struct task_struct *next) 1911 struct task_struct *next)
1912{ 1912{
1913 trace_sched_switch(prev, next);
1913 sched_info_switch(prev, next); 1914 sched_info_switch(prev, next);
1914 perf_event_task_sched_out(prev, next); 1915 perf_event_task_sched_out(prev, next);
1915 fire_sched_out_preempt_notifiers(prev, next); 1916 fire_sched_out_preempt_notifiers(prev, next);
1916 prepare_lock_switch(rq, next); 1917 prepare_lock_switch(rq, next);
1917 prepare_arch_switch(next); 1918 prepare_arch_switch(next);
1918 trace_sched_switch(prev, next);
1919} 1919}
1920 1920
1921/** 1921/**
@@ -3142,6 +3142,20 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3142# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) 3142# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
3143#endif 3143#endif
3144 3144
3145static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total)
3146{
3147 u64 temp = (__force u64) rtime;
3148
3149 temp *= (__force u64) utime;
3150
3151 if (sizeof(cputime_t) == 4)
3152 temp = div_u64(temp, (__force u32) total);
3153 else
3154 temp = div64_u64(temp, (__force u64) total);
3155
3156 return (__force cputime_t) temp;
3157}
3158
3145void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 3159void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3146{ 3160{
3147 cputime_t rtime, utime = p->utime, total = utime + p->stime; 3161 cputime_t rtime, utime = p->utime, total = utime + p->stime;
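
scale_utime() above folds the duplicated scaling code from task_times() and thread_group_times() into one helper: utime is scaled so that user plus system time adds up to the scheduler's measured runtime, i.e. scaled = rtime * utime / total, using a full 64-by-64 divide when cputime_t is 64 bits wide. The arithmetic on its own, in plain C with uint64_t:

#include <stdio.h>
#include <stdint.h>

/* scaled utime keeps the utime:stime ratio but sums to rtime */
static uint64_t scale_utime(uint64_t utime, uint64_t rtime, uint64_t total)
{
        /* note: rtime * utime can overflow for huge values; the kernel
         * picks div_u64() or div64_u64() based on the cputime_t width */
        return total ? (rtime * utime) / total : rtime;
}

int main(void)
{
        uint64_t utime = 300, stime = 100, rtime = 1000;
        uint64_t u = scale_utime(utime, rtime, utime + stime);

        /* 3:1 ratio preserved: 750 user + 250 system = 1000 */
        printf("scaled utime = %llu, stime = %llu\n",
               (unsigned long long)u, (unsigned long long)(rtime - u));
        return 0;
}
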
@@ -3151,13 +3165,9 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3151 */ 3165 */
3152 rtime = nsecs_to_cputime(p->se.sum_exec_runtime); 3166 rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
3153 3167
3154 if (total) { 3168 if (total)
3155 u64 temp = (__force u64) rtime; 3169 utime = scale_utime(utime, rtime, total);
3156 3170 else
3157 temp *= (__force u64) utime;
3158 do_div(temp, (__force u32) total);
3159 utime = (__force cputime_t) temp;
3160 } else
3161 utime = rtime; 3171 utime = rtime;
3162 3172
3163 /* 3173 /*
@@ -3184,13 +3194,9 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3184 total = cputime.utime + cputime.stime; 3194 total = cputime.utime + cputime.stime;
3185 rtime = nsecs_to_cputime(cputime.sum_exec_runtime); 3195 rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
3186 3196
3187 if (total) { 3197 if (total)
3188 u64 temp = (__force u64) rtime; 3198 utime = scale_utime(cputime.utime, rtime, total);
3189 3199 else
3190 temp *= (__force u64) cputime.utime;
3191 do_div(temp, (__force u32) total);
3192 utime = (__force cputime_t) temp;
3193 } else
3194 utime = rtime; 3200 utime = rtime;
3195 3201
3196 sig->prev_utime = max(sig->prev_utime, utime); 3202 sig->prev_utime = max(sig->prev_utime, utime);
@@ -4340,9 +4346,7 @@ recheck:
4340 */ 4346 */
4341 if (unlikely(policy == p->policy && (!rt_policy(policy) || 4347 if (unlikely(policy == p->policy && (!rt_policy(policy) ||
4342 param->sched_priority == p->rt_priority))) { 4348 param->sched_priority == p->rt_priority))) {
4343 4349 task_rq_unlock(rq, p, &flags);
4344 __task_rq_unlock(rq);
4345 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4346 return 0; 4350 return 0;
4347 } 4351 }
4348 4352
@@ -7248,6 +7252,7 @@ int in_sched_functions(unsigned long addr)
7248 7252
7249#ifdef CONFIG_CGROUP_SCHED 7253#ifdef CONFIG_CGROUP_SCHED
7250struct task_group root_task_group; 7254struct task_group root_task_group;
7255LIST_HEAD(task_groups);
7251#endif 7256#endif
7252 7257
7253DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask); 7258DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index d72586fdf660..23aa789c53ee 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -65,8 +65,8 @@ static int convert_prio(int prio)
65int cpupri_find(struct cpupri *cp, struct task_struct *p, 65int cpupri_find(struct cpupri *cp, struct task_struct *p,
66 struct cpumask *lowest_mask) 66 struct cpumask *lowest_mask)
67{ 67{
68 int idx = 0; 68 int idx = 0;
69 int task_pri = convert_prio(p->prio); 69 int task_pri = convert_prio(p->prio);
70 70
71 if (task_pri >= MAX_RT_PRIO) 71 if (task_pri >= MAX_RT_PRIO)
72 return 0; 72 return 0;
@@ -137,9 +137,9 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
137 */ 137 */
138void cpupri_set(struct cpupri *cp, int cpu, int newpri) 138void cpupri_set(struct cpupri *cp, int cpu, int newpri)
139{ 139{
140 int *currpri = &cp->cpu_to_pri[cpu]; 140 int *currpri = &cp->cpu_to_pri[cpu];
141 int oldpri = *currpri; 141 int oldpri = *currpri;
142 int do_mb = 0; 142 int do_mb = 0;
143 143
144 newpri = convert_prio(newpri); 144 newpri = convert_prio(newpri);
145 145
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 22321db64952..c219bf8d704c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3069,6 +3069,9 @@ struct lb_env {
3069 int new_dst_cpu; 3069 int new_dst_cpu;
3070 enum cpu_idle_type idle; 3070 enum cpu_idle_type idle;
3071 long imbalance; 3071 long imbalance;
3072 /* The set of CPUs under consideration for load-balancing */
3073 struct cpumask *cpus;
3074
3072 unsigned int flags; 3075 unsigned int flags;
3073 3076
3074 unsigned int loop; 3077 unsigned int loop;
@@ -3384,6 +3387,14 @@ static int tg_load_down(struct task_group *tg, void *data)
3384 3387
3385static void update_h_load(long cpu) 3388static void update_h_load(long cpu)
3386{ 3389{
3390 struct rq *rq = cpu_rq(cpu);
3391 unsigned long now = jiffies;
3392
3393 if (rq->h_load_throttle == now)
3394 return;
3395
3396 rq->h_load_throttle = now;
3397
3387 rcu_read_lock(); 3398 rcu_read_lock();
3388 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); 3399 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
3389 rcu_read_unlock(); 3400 rcu_read_unlock();
@@ -3653,8 +3664,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
3653 */ 3664 */
3654static inline void update_sg_lb_stats(struct lb_env *env, 3665static inline void update_sg_lb_stats(struct lb_env *env,
3655 struct sched_group *group, int load_idx, 3666 struct sched_group *group, int load_idx,
3656 int local_group, const struct cpumask *cpus, 3667 int local_group, int *balance, struct sg_lb_stats *sgs)
3657 int *balance, struct sg_lb_stats *sgs)
3658{ 3668{
3659 unsigned long nr_running, max_nr_running, min_nr_running; 3669 unsigned long nr_running, max_nr_running, min_nr_running;
3660 unsigned long load, max_cpu_load, min_cpu_load; 3670 unsigned long load, max_cpu_load, min_cpu_load;
@@ -3671,7 +3681,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
3671 max_nr_running = 0; 3681 max_nr_running = 0;
3672 min_nr_running = ~0UL; 3682 min_nr_running = ~0UL;
3673 3683
3674 for_each_cpu_and(i, sched_group_cpus(group), cpus) { 3684 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
3675 struct rq *rq = cpu_rq(i); 3685 struct rq *rq = cpu_rq(i);
3676 3686
3677 nr_running = rq->nr_running; 3687 nr_running = rq->nr_running;
@@ -3800,8 +3810,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
3800 * @sds: variable to hold the statistics for this sched_domain. 3810 * @sds: variable to hold the statistics for this sched_domain.
3801 */ 3811 */
3802static inline void update_sd_lb_stats(struct lb_env *env, 3812static inline void update_sd_lb_stats(struct lb_env *env,
3803 const struct cpumask *cpus, 3813 int *balance, struct sd_lb_stats *sds)
3804 int *balance, struct sd_lb_stats *sds)
3805{ 3814{
3806 struct sched_domain *child = env->sd->child; 3815 struct sched_domain *child = env->sd->child;
3807 struct sched_group *sg = env->sd->groups; 3816 struct sched_group *sg = env->sd->groups;
@@ -3818,8 +3827,7 @@ static inline void update_sd_lb_stats(struct lb_env *env,
3818 3827
3819 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg)); 3828 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
3820 memset(&sgs, 0, sizeof(sgs)); 3829 memset(&sgs, 0, sizeof(sgs));
3821 update_sg_lb_stats(env, sg, load_idx, local_group, 3830 update_sg_lb_stats(env, sg, load_idx, local_group, balance, &sgs);
3822 cpus, balance, &sgs);
3823 3831
3824 if (local_group && !(*balance)) 3832 if (local_group && !(*balance))
3825 return; 3833 return;
@@ -4055,7 +4063,6 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
4055 * to restore balance. 4063 * to restore balance.
4056 * 4064 *
4057 * @env: The load balancing environment. 4065 * @env: The load balancing environment.
4058 * @cpus: The set of CPUs under consideration for load-balancing.
4059 * @balance: Pointer to a variable indicating if this_cpu 4066 * @balance: Pointer to a variable indicating if this_cpu
4060 * is the appropriate cpu to perform load balancing at this_level. 4067 * is the appropriate cpu to perform load balancing at this_level.
4061 * 4068 *
@@ -4065,7 +4072,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
4065 * put to idle by rebalancing its tasks onto our group. 4072 * put to idle by rebalancing its tasks onto our group.
4066 */ 4073 */
4067static struct sched_group * 4074static struct sched_group *
4068find_busiest_group(struct lb_env *env, const struct cpumask *cpus, int *balance) 4075find_busiest_group(struct lb_env *env, int *balance)
4069{ 4076{
4070 struct sd_lb_stats sds; 4077 struct sd_lb_stats sds;
4071 4078
@@ -4075,7 +4082,7 @@ find_busiest_group(struct lb_env *env, const struct cpumask *cpus, int *balance)
4075 * Compute the various statistics relevant for load balancing at 4082
4076 * this level. 4083 * this level.
4077 */ 4084 */
4078 update_sd_lb_stats(env, cpus, balance, &sds); 4085 update_sd_lb_stats(env, balance, &sds);
4079 4086
4080 /* 4087 /*
4081 * this_cpu is not the appropriate cpu to perform load balancing at 4088 * this_cpu is not the appropriate cpu to perform load balancing at
@@ -4155,8 +4162,7 @@ ret:
4155 * find_busiest_queue - find the busiest runqueue among the cpus in group. 4162 * find_busiest_queue - find the busiest runqueue among the cpus in group.
4156 */ 4163 */
4157static struct rq *find_busiest_queue(struct lb_env *env, 4164static struct rq *find_busiest_queue(struct lb_env *env,
4158 struct sched_group *group, 4165 struct sched_group *group)
4159 const struct cpumask *cpus)
4160{ 4166{
4161 struct rq *busiest = NULL, *rq; 4167 struct rq *busiest = NULL, *rq;
4162 unsigned long max_load = 0; 4168 unsigned long max_load = 0;
@@ -4171,7 +4177,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
4171 if (!capacity) 4177 if (!capacity)
4172 capacity = fix_small_capacity(env->sd, group); 4178 capacity = fix_small_capacity(env->sd, group);
4173 4179
4174 if (!cpumask_test_cpu(i, cpus)) 4180 if (!cpumask_test_cpu(i, env->cpus))
4175 continue; 4181 continue;
4176 4182
4177 rq = cpu_rq(i); 4183 rq = cpu_rq(i);
@@ -4252,6 +4258,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
4252 .dst_grpmask = sched_group_cpus(sd->groups), 4258 .dst_grpmask = sched_group_cpus(sd->groups),
4253 .idle = idle, 4259 .idle = idle,
4254 .loop_break = sched_nr_migrate_break, 4260 .loop_break = sched_nr_migrate_break,
4261 .cpus = cpus,
4255 }; 4262 };
4256 4263
4257 cpumask_copy(cpus, cpu_active_mask); 4264 cpumask_copy(cpus, cpu_active_mask);
@@ -4260,7 +4267,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
4260 schedstat_inc(sd, lb_count[idle]); 4267 schedstat_inc(sd, lb_count[idle]);
4261 4268
4262redo: 4269redo:
4263 group = find_busiest_group(&env, cpus, balance); 4270 group = find_busiest_group(&env, balance);
4264 4271
4265 if (*balance == 0) 4272 if (*balance == 0)
4266 goto out_balanced; 4273 goto out_balanced;
@@ -4270,7 +4277,7 @@ redo:
4270 goto out_balanced; 4277 goto out_balanced;
4271 } 4278 }
4272 4279
4273 busiest = find_busiest_queue(&env, group, cpus); 4280 busiest = find_busiest_queue(&env, group);
4274 if (!busiest) { 4281 if (!busiest) {
4275 schedstat_inc(sd, lb_nobusyq[idle]); 4282 schedstat_inc(sd, lb_nobusyq[idle]);
4276 goto out_balanced; 4283 goto out_balanced;
@@ -4294,11 +4301,10 @@ redo:
4294 env.src_rq = busiest; 4301 env.src_rq = busiest;
4295 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); 4302 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
4296 4303
4304 update_h_load(env.src_cpu);
4297more_balance: 4305more_balance:
4298 local_irq_save(flags); 4306 local_irq_save(flags);
4299 double_rq_lock(this_rq, busiest); 4307 double_rq_lock(this_rq, busiest);
4300 if (!env.loop)
4301 update_h_load(env.src_cpu);
4302 4308
4303 /* 4309 /*
4304 * cur_ld_moved - load moved in current iteration 4310 * cur_ld_moved - load moved in current iteration
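
The update_h_load() change above moves the hierarchical-load refresh out of the rq-locked retry loop and rate-limits it with a per-rq timestamp: if it already ran in the current jiffy, it returns immediately. The throttle pattern by itself, using a plain counter in place of jiffies:

#include <stdio.h>

static unsigned long jiffies_now;       /* stand-in for the kernel's jiffies */
static unsigned long h_load_throttle;   /* last tick the work was done */

static void update_h_load(void)
{
        if (h_load_throttle == jiffies_now)
                return;                 /* already refreshed this tick */
        h_load_throttle = jiffies_now;
        printf("recomputing hierarchical load at tick %lu\n", jiffies_now);
}

int main(void)
{
        jiffies_now = 100;
        update_h_load();        /* does the work */
        update_h_load();        /* throttled: same tick */
        jiffies_now = 101;
        update_h_load();        /* does the work again */
        return 0;
}
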
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 573e1ca01102..944cb68420e9 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -788,6 +788,19 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
788 const struct cpumask *span; 788 const struct cpumask *span;
789 789
790 span = sched_rt_period_mask(); 790 span = sched_rt_period_mask();
791#ifdef CONFIG_RT_GROUP_SCHED
792 /*
793 * FIXME: isolated CPUs should really leave the root task group,
794 * whether they are isolcpus or were isolated via cpusets, lest
795 * the timer run on a CPU which does not service all runqueues,
796 * potentially leaving other CPUs indefinitely throttled. If
797 * isolation is really required, the user will turn the throttle
798 * off to kill the perturbations it causes anyway. Meanwhile,
799 * this maintains functionality for boot and/or troubleshooting.
800 */
801 if (rt_b == &root_task_group.rt_bandwidth)
802 span = cpu_online_mask;
803#endif
791 for_each_cpu(i, span) { 804 for_each_cpu(i, span) {
792 int enqueue = 0; 805 int enqueue = 0;
793 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); 806 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c35a1a7dd4d6..f6714d009e77 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -80,7 +80,7 @@ extern struct mutex sched_domains_mutex;
80struct cfs_rq; 80struct cfs_rq;
81struct rt_rq; 81struct rt_rq;
82 82
83static LIST_HEAD(task_groups); 83extern struct list_head task_groups;
84 84
85struct cfs_bandwidth { 85struct cfs_bandwidth {
86#ifdef CONFIG_CFS_BANDWIDTH 86#ifdef CONFIG_CFS_BANDWIDTH
@@ -374,7 +374,11 @@ struct rq {
374#ifdef CONFIG_FAIR_GROUP_SCHED 374#ifdef CONFIG_FAIR_GROUP_SCHED
375 /* list of leaf cfs_rq on this cpu: */ 375 /* list of leaf cfs_rq on this cpu: */
376 struct list_head leaf_cfs_rq_list; 376 struct list_head leaf_cfs_rq_list;
377#endif 377#ifdef CONFIG_SMP
378 unsigned long h_load_throttle;
379#endif /* CONFIG_SMP */
380#endif /* CONFIG_FAIR_GROUP_SCHED */
381
378#ifdef CONFIG_RT_GROUP_SCHED 382#ifdef CONFIG_RT_GROUP_SCHED
379 struct list_head leaf_rt_rq_list; 383 struct list_head leaf_rt_rq_list;
380#endif 384#endif
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 7b386e86fd23..da5eb5bed84a 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -27,8 +27,10 @@ static struct task_struct *pick_next_task_stop(struct rq *rq)
27{ 27{
28 struct task_struct *stop = rq->stop; 28 struct task_struct *stop = rq->stop;
29 29
30 if (stop && stop->on_rq) 30 if (stop && stop->on_rq) {
31 stop->se.exec_start = rq->clock_task;
31 return stop; 32 return stop;
33 }
32 34
33 return NULL; 35 return NULL;
34} 36}
@@ -52,6 +54,21 @@ static void yield_task_stop(struct rq *rq)
52 54
53static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) 55static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
54{ 56{
57 struct task_struct *curr = rq->curr;
58 u64 delta_exec;
59
60 delta_exec = rq->clock_task - curr->se.exec_start;
61 if (unlikely((s64)delta_exec < 0))
62 delta_exec = 0;
63
64 schedstat_set(curr->se.statistics.exec_max,
65 max(curr->se.statistics.exec_max, delta_exec));
66
67 curr->se.sum_exec_runtime += delta_exec;
68 account_group_exec_runtime(curr, delta_exec);
69
70 curr->se.exec_start = rq->clock_task;
71 cpuacct_charge(curr, delta_exec);
55} 72}
56 73
57static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued) 74static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
@@ -60,6 +77,9 @@ static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
60 77
61static void set_curr_task_stop(struct rq *rq) 78static void set_curr_task_stop(struct rq *rq)
62{ 79{
80 struct task_struct *stop = rq->stop;
81
82 stop->se.exec_start = rq->clock_task;
63} 83}
64 84
65static void switched_to_stop(struct rq *rq, struct task_struct *p) 85static void switched_to_stop(struct rq *rq, struct task_struct *p)
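
put_prev_task_stop() above starts charging runtime to the stop task the same way other scheduling classes do: take the clock delta since exec_start, clamp a negative delta to zero, accumulate it into sum_exec_runtime, and reset the start stamp. The accounting step in isolation:

#include <stdio.h>
#include <stdint.h>

struct se_stats {
        uint64_t exec_start;            /* when the task last started running */
        uint64_t sum_exec_runtime;      /* accumulated runtime in ns */
};

static void account_runtime(struct se_stats *se, uint64_t clock_task)
{
        int64_t delta = (int64_t)(clock_task - se->exec_start);

        if (delta < 0)                  /* clock glitches are clamped, not charged */
                delta = 0;
        se->sum_exec_runtime += (uint64_t)delta;
        se->exec_start = clock_task;    /* next slice starts now */
}

int main(void)
{
        struct se_stats se = { .exec_start = 1000, .sum_exec_runtime = 0 };

        account_runtime(&se, 4000);
        printf("runtime = %llu ns\n", (unsigned long long)se.sum_exec_runtime);
        return 0;
}
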
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 671f9594e368..b73e681df09e 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -210,6 +210,14 @@ asmlinkage void __do_softirq(void)
210 __u32 pending; 210 __u32 pending;
211 int max_restart = MAX_SOFTIRQ_RESTART; 211 int max_restart = MAX_SOFTIRQ_RESTART;
212 int cpu; 212 int cpu;
213 unsigned long old_flags = current->flags;
214
215 /*
216 * Mask out PF_MEMALLOC, as the current task context is borrowed for the
217 * softirq. A softirq handler such as network RX might set PF_MEMALLOC
218 * again if the socket is related to swap.
219 */
220 current->flags &= ~PF_MEMALLOC;
213 221
214 pending = local_softirq_pending(); 222 pending = local_softirq_pending();
215 account_system_vtime(current); 223 account_system_vtime(current);
@@ -265,6 +273,7 @@ restart:
265 273
266 account_system_vtime(current); 274 account_system_vtime(current);
267 __local_bh_enable(SOFTIRQ_OFFSET); 275 __local_bh_enable(SOFTIRQ_OFFSET);
276 tsk_restore_flags(current, old_flags, PF_MEMALLOC);
268} 277}
269 278
270#ifndef __ARCH_HAS_DO_SOFTIRQ 279#ifndef __ARCH_HAS_DO_SOFTIRQ
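
The __do_softirq() hunk above borrows the interrupted task's context, so it clears PF_MEMALLOC up front and restores only that bit afterwards via tsk_restore_flags(); any other flag the softirq changed is kept. The save/mask/restore pattern in isolation, with an invented flag value:

#include <stdio.h>

#define PF_MEMALLOC 0x0800      /* illustrative value only */

/* restore just the bits in 'mask' from 'saved', leave the rest alone */
static unsigned int restore_flags(unsigned int cur, unsigned int saved,
                                  unsigned int mask)
{
        return (cur & ~mask) | (saved & mask);
}

int main(void)
{
        unsigned int flags = PF_MEMALLOC | 0x1; /* task had PF_MEMALLOC set */
        unsigned int saved = flags;

        flags &= ~PF_MEMALLOC;                  /* cleared for the softirq */
        flags |= 0x4;                           /* softirq sets some other flag */
        flags = restore_flags(flags, saved, PF_MEMALLOC);

        printf("flags = 0x%x\n", flags);        /* 0x805: PF_MEMALLOC back, 0x4 kept */
        return 0;
}
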
diff --git a/kernel/sys.c b/kernel/sys.c
index 2d39a84cd857..241507f23eca 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2015,7 +2015,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2015 break; 2015 break;
2016 } 2016 }
2017 me->pdeath_signal = arg2; 2017 me->pdeath_signal = arg2;
2018 error = 0;
2019 break; 2018 break;
2020 case PR_GET_PDEATHSIG: 2019 case PR_GET_PDEATHSIG:
2021 error = put_user(me->pdeath_signal, (int __user *)arg2); 2020 error = put_user(me->pdeath_signal, (int __user *)arg2);
@@ -2029,7 +2028,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2029 break; 2028 break;
2030 } 2029 }
2031 set_dumpable(me->mm, arg2); 2030 set_dumpable(me->mm, arg2);
2032 error = 0;
2033 break; 2031 break;
2034 2032
2035 case PR_SET_UNALIGN: 2033 case PR_SET_UNALIGN:
@@ -2056,10 +2054,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2056 case PR_SET_TIMING: 2054 case PR_SET_TIMING:
2057 if (arg2 != PR_TIMING_STATISTICAL) 2055 if (arg2 != PR_TIMING_STATISTICAL)
2058 error = -EINVAL; 2056 error = -EINVAL;
2059 else
2060 error = 0;
2061 break; 2057 break;
2062
2063 case PR_SET_NAME: 2058 case PR_SET_NAME:
2064 comm[sizeof(me->comm)-1] = 0; 2059 comm[sizeof(me->comm)-1] = 0;
2065 if (strncpy_from_user(comm, (char __user *)arg2, 2060 if (strncpy_from_user(comm, (char __user *)arg2,
@@ -2067,20 +2062,19 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2067 return -EFAULT; 2062 return -EFAULT;
2068 set_task_comm(me, comm); 2063 set_task_comm(me, comm);
2069 proc_comm_connector(me); 2064 proc_comm_connector(me);
2070 return 0; 2065 break;
2071 case PR_GET_NAME: 2066 case PR_GET_NAME:
2072 get_task_comm(comm, me); 2067 get_task_comm(comm, me);
2073 if (copy_to_user((char __user *)arg2, comm, 2068 if (copy_to_user((char __user *)arg2, comm,
2074 sizeof(comm))) 2069 sizeof(comm)))
2075 return -EFAULT; 2070 return -EFAULT;
2076 return 0; 2071 break;
2077 case PR_GET_ENDIAN: 2072 case PR_GET_ENDIAN:
2078 error = GET_ENDIAN(me, arg2); 2073 error = GET_ENDIAN(me, arg2);
2079 break; 2074 break;
2080 case PR_SET_ENDIAN: 2075 case PR_SET_ENDIAN:
2081 error = SET_ENDIAN(me, arg2); 2076 error = SET_ENDIAN(me, arg2);
2082 break; 2077 break;
2083
2084 case PR_GET_SECCOMP: 2078 case PR_GET_SECCOMP:
2085 error = prctl_get_seccomp(); 2079 error = prctl_get_seccomp();
2086 break; 2080 break;
@@ -2108,7 +2102,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2108 current->default_timer_slack_ns; 2102 current->default_timer_slack_ns;
2109 else 2103 else
2110 current->timer_slack_ns = arg2; 2104 current->timer_slack_ns = arg2;
2111 error = 0;
2112 break; 2105 break;
2113 case PR_MCE_KILL: 2106 case PR_MCE_KILL:
2114 if (arg4 | arg5) 2107 if (arg4 | arg5)
@@ -2134,7 +2127,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2134 default: 2127 default:
2135 return -EINVAL; 2128 return -EINVAL;
2136 } 2129 }
2137 error = 0;
2138 break; 2130 break;
2139 case PR_MCE_KILL_GET: 2131 case PR_MCE_KILL_GET:
2140 if (arg2 | arg3 | arg4 | arg5) 2132 if (arg2 | arg3 | arg4 | arg5)
@@ -2153,7 +2145,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2153 break; 2145 break;
2154 case PR_SET_CHILD_SUBREAPER: 2146 case PR_SET_CHILD_SUBREAPER:
2155 me->signal->is_child_subreaper = !!arg2; 2147 me->signal->is_child_subreaper = !!arg2;
2156 error = 0;
2157 break; 2148 break;
2158 case PR_GET_CHILD_SUBREAPER: 2149 case PR_GET_CHILD_SUBREAPER:
2159 error = put_user(me->signal->is_child_subreaper, 2150 error = put_user(me->signal->is_child_subreaper,
@@ -2195,46 +2186,52 @@ static void argv_cleanup(struct subprocess_info *info)
2195 argv_free(info->argv); 2186 argv_free(info->argv);
2196} 2187}
2197 2188
2198/** 2189static int __orderly_poweroff(void)
2199 * orderly_poweroff - Trigger an orderly system poweroff
2200 * @force: force poweroff if command execution fails
2201 *
2202 * This may be called from any context to trigger a system shutdown.
2203 * If the orderly shutdown fails, it will force an immediate shutdown.
2204 */
2205int orderly_poweroff(bool force)
2206{ 2190{
2207 int argc; 2191 int argc;
2208 char **argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc); 2192 char **argv;
2209 static char *envp[] = { 2193 static char *envp[] = {
2210 "HOME=/", 2194 "HOME=/",
2211 "PATH=/sbin:/bin:/usr/sbin:/usr/bin", 2195 "PATH=/sbin:/bin:/usr/sbin:/usr/bin",
2212 NULL 2196 NULL
2213 }; 2197 };
2214 int ret = -ENOMEM; 2198 int ret;
2215 2199
2200 argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc);
2216 if (argv == NULL) { 2201 if (argv == NULL) {
2217 printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", 2202 printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n",
2218 __func__, poweroff_cmd); 2203 __func__, poweroff_cmd);
2219 goto out; 2204 return -ENOMEM;
2220 } 2205 }
2221 2206
2222 ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_NO_WAIT, 2207 ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_NO_WAIT,
2223 NULL, argv_cleanup, NULL); 2208 NULL, argv_cleanup, NULL);
2224out:
2225 if (likely(!ret))
2226 return 0;
2227
2228 if (ret == -ENOMEM) 2209 if (ret == -ENOMEM)
2229 argv_free(argv); 2210 argv_free(argv);
2230 2211
2231 if (force) { 2212 return ret;
2213}
2214
2215/**
2216 * orderly_poweroff - Trigger an orderly system poweroff
2217 * @force: force poweroff if command execution fails
2218 *
2219 * This may be called from any context to trigger a system shutdown.
2220 * If the orderly shutdown fails, it will force an immediate shutdown.
2221 */
2222int orderly_poweroff(bool force)
2223{
2224 int ret = __orderly_poweroff();
2225
2226 if (ret && force) {
2232 printk(KERN_WARNING "Failed to start orderly shutdown: " 2227 printk(KERN_WARNING "Failed to start orderly shutdown: "
2233 "forcing the issue\n"); 2228 "forcing the issue\n");
2234 2229
2235 /* I guess this should try to kick off some daemon to 2230 /*
2236 sync and poweroff asap. Or not even bother syncing 2231 * I guess this should try to kick off some daemon to sync and
2237 if we're doing an emergency shutdown? */ 2232 * poweroff asap. Or not even bother syncing if we're doing an
2233 * emergency shutdown?
2234 */
2238 emergency_sync(); 2235 emergency_sync();
2239 kernel_power_off(); 2236 kernel_power_off();
2240 } 2237 }
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 4ab11879aeb4..87174ef59161 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -30,6 +30,7 @@
30#include <linux/security.h> 30#include <linux/security.h>
31#include <linux/ctype.h> 31#include <linux/ctype.h>
32#include <linux/kmemcheck.h> 32#include <linux/kmemcheck.h>
33#include <linux/kmemleak.h>
33#include <linux/fs.h> 34#include <linux/fs.h>
34#include <linux/init.h> 35#include <linux/init.h>
35#include <linux/kernel.h> 36#include <linux/kernel.h>
@@ -174,6 +175,11 @@ static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write,
174 void __user *buffer, size_t *lenp, loff_t *ppos); 175 void __user *buffer, size_t *lenp, loff_t *ppos);
175#endif 176#endif
176 177
178static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write,
179 void __user *buffer, size_t *lenp, loff_t *ppos);
180static int proc_dostring_coredump(struct ctl_table *table, int write,
181 void __user *buffer, size_t *lenp, loff_t *ppos);
182
177#ifdef CONFIG_MAGIC_SYSRQ 183#ifdef CONFIG_MAGIC_SYSRQ
178/* Note: sysrq code uses it's own private copy */ 184/* Note: sysrq code uses it's own private copy */
179static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE; 185static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE;
@@ -410,7 +416,7 @@ static struct ctl_table kern_table[] = {
410 .data = core_pattern, 416 .data = core_pattern,
411 .maxlen = CORENAME_MAX_SIZE, 417 .maxlen = CORENAME_MAX_SIZE,
412 .mode = 0644, 418 .mode = 0644,
413 .proc_handler = proc_dostring, 419 .proc_handler = proc_dostring_coredump,
414 }, 420 },
415 { 421 {
416 .procname = "core_pipe_limit", 422 .procname = "core_pipe_limit",
@@ -1095,11 +1101,9 @@ static struct ctl_table vm_table[] = {
1095 .extra1 = &zero, 1101 .extra1 = &zero,
1096 }, 1102 },
1097 { 1103 {
1098 .procname = "nr_pdflush_threads", 1104 .procname = "nr_pdflush_threads",
1099 .data = &nr_pdflush_threads, 1105 .mode = 0444 /* read-only */,
1100 .maxlen = sizeof nr_pdflush_threads, 1106 .proc_handler = pdflush_proc_obsolete,
1101 .mode = 0444 /* read-only*/,
1102 .proc_handler = proc_dointvec,
1103 }, 1107 },
1104 { 1108 {
1105 .procname = "swappiness", 1109 .procname = "swappiness",
@@ -1494,11 +1498,29 @@ static struct ctl_table fs_table[] = {
1494#endif 1498#endif
1495#endif 1499#endif
1496 { 1500 {
1501 .procname = "protected_symlinks",
1502 .data = &sysctl_protected_symlinks,
1503 .maxlen = sizeof(int),
1504 .mode = 0600,
1505 .proc_handler = proc_dointvec_minmax,
1506 .extra1 = &zero,
1507 .extra2 = &one,
1508 },
1509 {
1510 .procname = "protected_hardlinks",
1511 .data = &sysctl_protected_hardlinks,
1512 .maxlen = sizeof(int),
1513 .mode = 0600,
1514 .proc_handler = proc_dointvec_minmax,
1515 .extra1 = &zero,
1516 .extra2 = &one,
1517 },
1518 {
1497 .procname = "suid_dumpable", 1519 .procname = "suid_dumpable",
1498 .data = &suid_dumpable, 1520 .data = &suid_dumpable,
1499 .maxlen = sizeof(int), 1521 .maxlen = sizeof(int),
1500 .mode = 0644, 1522 .mode = 0644,
1501 .proc_handler = proc_dointvec_minmax, 1523 .proc_handler = proc_dointvec_minmax_coredump,
1502 .extra1 = &zero, 1524 .extra1 = &zero,
1503 .extra2 = &two, 1525 .extra2 = &two,
1504 }, 1526 },
@@ -1551,7 +1573,10 @@ static struct ctl_table dev_table[] = {
1551 1573
1552int __init sysctl_init(void) 1574int __init sysctl_init(void)
1553{ 1575{
1554 register_sysctl_table(sysctl_base_table); 1576 struct ctl_table_header *hdr;
1577
1578 hdr = register_sysctl_table(sysctl_base_table);
1579 kmemleak_not_leak(hdr);
1555 return 0; 1580 return 0;
1556} 1581}
1557 1582
@@ -2009,6 +2034,34 @@ int proc_dointvec_minmax(struct ctl_table *table, int write,
2009 do_proc_dointvec_minmax_conv, &param); 2034 do_proc_dointvec_minmax_conv, &param);
2010} 2035}
2011 2036
2037static void validate_coredump_safety(void)
2038{
2039 if (suid_dumpable == SUID_DUMPABLE_SAFE &&
2040 core_pattern[0] != '/' && core_pattern[0] != '|') {
2041 printk(KERN_WARNING "Unsafe core_pattern used with "\
2042 "suid_dumpable=2. Pipe handler or fully qualified "\
2043 "core dump path required.\n");
2044 }
2045}
2046
2047static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write,
2048 void __user *buffer, size_t *lenp, loff_t *ppos)
2049{
2050 int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
2051 if (!error)
2052 validate_coredump_safety();
2053 return error;
2054}
2055
2056static int proc_dostring_coredump(struct ctl_table *table, int write,
2057 void __user *buffer, size_t *lenp, loff_t *ppos)
2058{
2059 int error = proc_dostring(table, write, buffer, lenp, ppos);
2060 if (!error)
2061 validate_coredump_safety();
2062 return error;
2063}
2064
2012static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write, 2065static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write,
2013 void __user *buffer, 2066 void __user *buffer,
2014 size_t *lenp, loff_t *ppos, 2067 size_t *lenp, loff_t *ppos,
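
validate_coredump_safety() above warns only when suid_dumpable is set to the "safe" mode while core_pattern is a bare relative path, since a relative pattern would drop root-owned cores into whatever the current directory happens to be. The check itself is a two-character test:

#include <stdio.h>

#define SUID_DUMPABLE_SAFE 2    /* matches the sysctl value used above */

static int core_pattern_is_safe(const char *pattern)
{
        /* absolute path or pipe helper is fine; anything else is relative */
        return pattern[0] == '/' || pattern[0] == '|';
}

static void validate(int suid_dumpable, const char *core_pattern)
{
        if (suid_dumpable == SUID_DUMPABLE_SAFE &&
            !core_pattern_is_safe(core_pattern))
                printf("warning: unsafe core_pattern \"%s\" with suid_dumpable=2\n",
                       core_pattern);
}

int main(void)
{
        validate(2, "core");                            /* warns */
        validate(2, "/var/crash/core.%p");              /* silent */
        validate(2, "|/usr/bin/coredump-helper %p");    /* silent */
        return 0;
}
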
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index a650694883a1..65bdcf198d4e 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -147,7 +147,7 @@ static const struct bin_table bin_vm_table[] = {
147 { CTL_INT, VM_DIRTY_RATIO, "dirty_ratio" }, 147 { CTL_INT, VM_DIRTY_RATIO, "dirty_ratio" },
148 /* VM_DIRTY_WB_CS "dirty_writeback_centisecs" no longer used */ 148 /* VM_DIRTY_WB_CS "dirty_writeback_centisecs" no longer used */
149 /* VM_DIRTY_EXPIRE_CS "dirty_expire_centisecs" no longer used */ 149 /* VM_DIRTY_EXPIRE_CS "dirty_expire_centisecs" no longer used */
150 { CTL_INT, VM_NR_PDFLUSH_THREADS, "nr_pdflush_threads" }, 150 /* VM_NR_PDFLUSH_THREADS "nr_pdflush_threads" no longer used */
151 { CTL_INT, VM_OVERCOMMIT_RATIO, "overcommit_ratio" }, 151 { CTL_INT, VM_OVERCOMMIT_RATIO, "overcommit_ratio" },
152 /* VM_PAGEBUF unused */ 152 /* VM_PAGEBUF unused */
153 /* VM_HUGETLB_PAGES "nr_hugepages" no longer used */ 153 /* VM_HUGETLB_PAGES "nr_hugepages" no longer used */
diff --git a/kernel/task_work.c b/kernel/task_work.c
index 91d4e1742a0c..d320d44903bd 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -75,6 +75,7 @@ void task_work_run(void)
75 p = q->next; 75 p = q->next;
76 q->func(q); 76 q->func(q);
77 q = p; 77 q = p;
78 cond_resched();
78 } 79 }
79 } 80 }
80} 81}
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index e66046456f4f..d0a32796550f 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -436,6 +436,11 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
436 436
437 na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS, 437 na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS,
438 sizeof(struct cgroupstats)); 438 sizeof(struct cgroupstats));
439 if (na == NULL) {
440 rc = -EMSGSIZE;
441 goto err;
442 }
443
439 stats = nla_data(na); 444 stats = nla_data(na);
440 memset(stats, 0, sizeof(*stats)); 445 memset(stats, 0, sizeof(*stats));
441 446
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index a470154e0408..46da0537c10b 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -37,7 +37,7 @@
37 * requested HZ value. It is also not recommended 37 * requested HZ value. It is also not recommended
38 * for "tick-less" systems. 38 * for "tick-less" systems.
39 */ 39 */
40#define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ)) 40#define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/SHIFTED_HZ))
41 41
42/* Since jiffies uses a simple NSEC_PER_JIFFY multiplier 42/* Since jiffies uses a simple NSEC_PER_JIFFY multiplier
43 * conversion, the .shift value could be zero. However 43 * conversion, the .shift value could be zero. However
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index b7fbadc5c973..24174b4d669b 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -28,7 +28,7 @@ DEFINE_SPINLOCK(ntp_lock);
28/* USER_HZ period (usecs): */ 28/* USER_HZ period (usecs): */
29unsigned long tick_usec = TICK_USEC; 29unsigned long tick_usec = TICK_USEC;
30 30
31/* ACTHZ period (nsecs): */ 31/* SHIFTED_HZ period (nsecs): */
32unsigned long tick_nsec; 32unsigned long tick_nsec;
33 33
34static u64 tick_length; 34static u64 tick_length;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index f045cc50832d..34e5eac81424 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -65,14 +65,14 @@ struct timekeeper {
65 * used instead. 65 * used instead.
66 */ 66 */
67 struct timespec wall_to_monotonic; 67 struct timespec wall_to_monotonic;
68 /* time spent in suspend */
69 struct timespec total_sleep_time;
70 /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */
71 struct timespec raw_time;
72 /* Offset clock monotonic -> clock realtime */ 68 /* Offset clock monotonic -> clock realtime */
73 ktime_t offs_real; 69 ktime_t offs_real;
70 /* time spent in suspend */
71 struct timespec total_sleep_time;
74 /* Offset clock monotonic -> clock boottime */ 72 /* Offset clock monotonic -> clock boottime */
75 ktime_t offs_boot; 73 ktime_t offs_boot;
74 /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */
75 struct timespec raw_time;
76 /* Seqlock for all timekeeper values */ 76 /* Seqlock for all timekeeper values */
77 seqlock_t lock; 77 seqlock_t lock;
78}; 78};
@@ -108,13 +108,39 @@ static struct timespec tk_xtime(struct timekeeper *tk)
108static void tk_set_xtime(struct timekeeper *tk, const struct timespec *ts) 108static void tk_set_xtime(struct timekeeper *tk, const struct timespec *ts)
109{ 109{
110 tk->xtime_sec = ts->tv_sec; 110 tk->xtime_sec = ts->tv_sec;
111 tk->xtime_nsec = ts->tv_nsec << tk->shift; 111 tk->xtime_nsec = (u64)ts->tv_nsec << tk->shift;
112} 112}
113 113
114static void tk_xtime_add(struct timekeeper *tk, const struct timespec *ts) 114static void tk_xtime_add(struct timekeeper *tk, const struct timespec *ts)
115{ 115{
116 tk->xtime_sec += ts->tv_sec; 116 tk->xtime_sec += ts->tv_sec;
117 tk->xtime_nsec += ts->tv_nsec << tk->shift; 117 tk->xtime_nsec += (u64)ts->tv_nsec << tk->shift;
118 tk_normalize_xtime(tk);
119}
120
121static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec wtm)
122{
123 struct timespec tmp;
124
125 /*
126 * Verify consistency of: offset_real = -wall_to_monotonic
127 * before modifying anything
128 */
129 set_normalized_timespec(&tmp, -tk->wall_to_monotonic.tv_sec,
130 -tk->wall_to_monotonic.tv_nsec);
131 WARN_ON_ONCE(tk->offs_real.tv64 != timespec_to_ktime(tmp).tv64);
132 tk->wall_to_monotonic = wtm;
133 set_normalized_timespec(&tmp, -wtm.tv_sec, -wtm.tv_nsec);
134 tk->offs_real = timespec_to_ktime(tmp);
135}
136
137static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t)
138{
139 /* Verify consistency before modifying */
140 WARN_ON_ONCE(tk->offs_boot.tv64 != timespec_to_ktime(tk->total_sleep_time).tv64);
141
142 tk->total_sleep_time = t;
143 tk->offs_boot = timespec_to_ktime(t);
118} 144}
119 145
120/** 146/**
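
tk_set_wall_to_mono() and tk_set_sleep_time() above centralize two derived offsets so they cannot drift from their source values: offs_real must always equal the negated wall_to_monotonic, and offs_boot must equal total_sleep_time. A sketch of the negation with nanosecond normalization, using a plain struct instead of the kernel's timespec/ktime types:

#include <stdio.h>

#define NSEC_PER_SEC 1000000000LL

struct ts { long long sec; long long nsec; };

/* keep nsec within [0, NSEC_PER_SEC) while preserving the total value */
static struct ts normalize(long long sec, long long nsec)
{
        while (nsec >= NSEC_PER_SEC) { nsec -= NSEC_PER_SEC; sec++; }
        while (nsec < 0)             { nsec += NSEC_PER_SEC; sec--; }
        return (struct ts){ sec, nsec };
}

int main(void)
{
        /* wall_to_monotonic is typically negative; offs_real is its negation */
        struct ts wtm = { -10, 250000000 };
        struct ts offs_real = normalize(-wtm.sec, -wtm.nsec);

        printf("offs_real = %llds %lldns\n", offs_real.sec, offs_real.nsec);
        return 0;
}
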
@@ -217,14 +243,6 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
217 return nsec + arch_gettimeoffset(); 243 return nsec + arch_gettimeoffset();
218} 244}
219 245
220static void update_rt_offset(struct timekeeper *tk)
221{
222 struct timespec tmp, *wtm = &tk->wall_to_monotonic;
223
224 set_normalized_timespec(&tmp, -wtm->tv_sec, -wtm->tv_nsec);
225 tk->offs_real = timespec_to_ktime(tmp);
226}
227
228/* must hold write on timekeeper.lock */ 246/* must hold write on timekeeper.lock */
229static void timekeeping_update(struct timekeeper *tk, bool clearntp) 247static void timekeeping_update(struct timekeeper *tk, bool clearntp)
230{ 248{
@@ -234,12 +252,10 @@ static void timekeeping_update(struct timekeeper *tk, bool clearntp)
234 tk->ntp_error = 0; 252 tk->ntp_error = 0;
235 ntp_clear(); 253 ntp_clear();
236 } 254 }
237 update_rt_offset(tk);
238 xt = tk_xtime(tk); 255 xt = tk_xtime(tk);
239 update_vsyscall(&xt, &tk->wall_to_monotonic, tk->clock, tk->mult); 256 update_vsyscall(&xt, &tk->wall_to_monotonic, tk->clock, tk->mult);
240} 257}
241 258
242
243/** 259/**
244 * timekeeping_forward_now - update clock to the current time 260 * timekeeping_forward_now - update clock to the current time
245 * 261 *
@@ -261,7 +277,7 @@ static void timekeeping_forward_now(struct timekeeper *tk)
261 tk->xtime_nsec += cycle_delta * tk->mult; 277 tk->xtime_nsec += cycle_delta * tk->mult;
262 278
263 /* If arch requires, add in gettimeoffset() */ 279 /* If arch requires, add in gettimeoffset() */
264 tk->xtime_nsec += arch_gettimeoffset() << tk->shift; 280 tk->xtime_nsec += (u64)arch_gettimeoffset() << tk->shift;
265 281
266 tk_normalize_xtime(tk); 282 tk_normalize_xtime(tk);
267 283
@@ -277,18 +293,19 @@ static void timekeeping_forward_now(struct timekeeper *tk)
277 */ 293 */
278void getnstimeofday(struct timespec *ts) 294void getnstimeofday(struct timespec *ts)
279{ 295{
296 struct timekeeper *tk = &timekeeper;
280 unsigned long seq; 297 unsigned long seq;
281 s64 nsecs = 0; 298 s64 nsecs = 0;
282 299
283 WARN_ON(timekeeping_suspended); 300 WARN_ON(timekeeping_suspended);
284 301
285 do { 302 do {
286 seq = read_seqbegin(&timekeeper.lock); 303 seq = read_seqbegin(&tk->lock);
287 304
288 ts->tv_sec = timekeeper.xtime_sec; 305 ts->tv_sec = tk->xtime_sec;
289 ts->tv_nsec = timekeeping_get_ns(&timekeeper); 306 ts->tv_nsec = timekeeping_get_ns(tk);
290 307
291 } while (read_seqretry(&timekeeper.lock, seq)); 308 } while (read_seqretry(&tk->lock, seq));
292 309
293 timespec_add_ns(ts, nsecs); 310 timespec_add_ns(ts, nsecs);
294} 311}
@@ -296,19 +313,18 @@ EXPORT_SYMBOL(getnstimeofday);
296 313
297ktime_t ktime_get(void) 314ktime_t ktime_get(void)
298{ 315{
316 struct timekeeper *tk = &timekeeper;
299 unsigned int seq; 317 unsigned int seq;
300 s64 secs, nsecs; 318 s64 secs, nsecs;
301 319
302 WARN_ON(timekeeping_suspended); 320 WARN_ON(timekeeping_suspended);
303 321
304 do { 322 do {
305 seq = read_seqbegin(&timekeeper.lock); 323 seq = read_seqbegin(&tk->lock);
306 secs = timekeeper.xtime_sec + 324 secs = tk->xtime_sec + tk->wall_to_monotonic.tv_sec;
307 timekeeper.wall_to_monotonic.tv_sec; 325 nsecs = timekeeping_get_ns(tk) + tk->wall_to_monotonic.tv_nsec;
308 nsecs = timekeeping_get_ns(&timekeeper) +
309 timekeeper.wall_to_monotonic.tv_nsec;
310 326
311 } while (read_seqretry(&timekeeper.lock, seq)); 327 } while (read_seqretry(&tk->lock, seq));
312 /* 328 /*
313 * Use ktime_set/ktime_add_ns to create a proper ktime on 329 * Use ktime_set/ktime_add_ns to create a proper ktime on
314 * 32-bit architectures without CONFIG_KTIME_SCALAR. 330 * 32-bit architectures without CONFIG_KTIME_SCALAR.
@@ -327,18 +343,19 @@ EXPORT_SYMBOL_GPL(ktime_get);
327 */ 343 */
328void ktime_get_ts(struct timespec *ts) 344void ktime_get_ts(struct timespec *ts)
329{ 345{
346 struct timekeeper *tk = &timekeeper;
330 struct timespec tomono; 347 struct timespec tomono;
331 unsigned int seq; 348 unsigned int seq;
332 349
333 WARN_ON(timekeeping_suspended); 350 WARN_ON(timekeeping_suspended);
334 351
335 do { 352 do {
336 seq = read_seqbegin(&timekeeper.lock); 353 seq = read_seqbegin(&tk->lock);
337 ts->tv_sec = timekeeper.xtime_sec; 354 ts->tv_sec = tk->xtime_sec;
338 ts->tv_nsec = timekeeping_get_ns(&timekeeper); 355 ts->tv_nsec = timekeeping_get_ns(tk);
339 tomono = timekeeper.wall_to_monotonic; 356 tomono = tk->wall_to_monotonic;
340 357
341 } while (read_seqretry(&timekeeper.lock, seq)); 358 } while (read_seqretry(&tk->lock, seq));
342 359
343 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec, 360 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
344 ts->tv_nsec + tomono.tv_nsec); 361 ts->tv_nsec + tomono.tv_nsec);
@@ -358,22 +375,23 @@ EXPORT_SYMBOL_GPL(ktime_get_ts);
358 */ 375 */
359void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) 376void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
360{ 377{
378 struct timekeeper *tk = &timekeeper;
361 unsigned long seq; 379 unsigned long seq;
362 s64 nsecs_raw, nsecs_real; 380 s64 nsecs_raw, nsecs_real;
363 381
364 WARN_ON_ONCE(timekeeping_suspended); 382 WARN_ON_ONCE(timekeeping_suspended);
365 383
366 do { 384 do {
367 seq = read_seqbegin(&timekeeper.lock); 385 seq = read_seqbegin(&tk->lock);
368 386
369 *ts_raw = timekeeper.raw_time; 387 *ts_raw = tk->raw_time;
370 ts_real->tv_sec = timekeeper.xtime_sec; 388 ts_real->tv_sec = tk->xtime_sec;
371 ts_real->tv_nsec = 0; 389 ts_real->tv_nsec = 0;
372 390
373 nsecs_raw = timekeeping_get_ns_raw(&timekeeper); 391 nsecs_raw = timekeeping_get_ns_raw(tk);
374 nsecs_real = timekeeping_get_ns(&timekeeper); 392 nsecs_real = timekeeping_get_ns(tk);
375 393
376 } while (read_seqretry(&timekeeper.lock, seq)); 394 } while (read_seqretry(&tk->lock, seq));
377 395
378 timespec_add_ns(ts_raw, nsecs_raw); 396 timespec_add_ns(ts_raw, nsecs_raw);
379 timespec_add_ns(ts_real, nsecs_real); 397 timespec_add_ns(ts_real, nsecs_real);
@@ -406,28 +424,28 @@ EXPORT_SYMBOL(do_gettimeofday);
406 */ 424 */
407int do_settimeofday(const struct timespec *tv) 425int do_settimeofday(const struct timespec *tv)
408{ 426{
427 struct timekeeper *tk = &timekeeper;
409 struct timespec ts_delta, xt; 428 struct timespec ts_delta, xt;
410 unsigned long flags; 429 unsigned long flags;
411 430
412 if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) 431 if (!timespec_valid_strict(tv))
413 return -EINVAL; 432 return -EINVAL;
414 433
415 write_seqlock_irqsave(&timekeeper.lock, flags); 434 write_seqlock_irqsave(&tk->lock, flags);
416 435
417 timekeeping_forward_now(&timekeeper); 436 timekeeping_forward_now(tk);
418 437
419 xt = tk_xtime(&timekeeper); 438 xt = tk_xtime(tk);
420 ts_delta.tv_sec = tv->tv_sec - xt.tv_sec; 439 ts_delta.tv_sec = tv->tv_sec - xt.tv_sec;
421 ts_delta.tv_nsec = tv->tv_nsec - xt.tv_nsec; 440 ts_delta.tv_nsec = tv->tv_nsec - xt.tv_nsec;
422 441
423 timekeeper.wall_to_monotonic = 442 tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, ts_delta));
424 timespec_sub(timekeeper.wall_to_monotonic, ts_delta);
425 443
426 tk_set_xtime(&timekeeper, tv); 444 tk_set_xtime(tk, tv);
427 445
428 timekeeping_update(&timekeeper, true); 446 timekeeping_update(tk, true);
429 447
430 write_sequnlock_irqrestore(&timekeeper.lock, flags); 448 write_sequnlock_irqrestore(&tk->lock, flags);
431 449
432 /* signal hrtimers about time change */ 450 /* signal hrtimers about time change */
433 clock_was_set(); 451 clock_was_set();
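
The open-coded nanosecond range check is replaced by timespec_valid_strict(), which additionally rejects negative seconds and values big enough to overflow ktime_t once converted. Its definition in include/linux/time.h is roughly the following (quoted from memory, treat it as a sketch):

        static inline bool timespec_valid(const struct timespec *ts)
        {
                /* Dates before 1970 are bogus */
                if (ts->tv_sec < 0)
                        return false;
                /* Can't have more nanoseconds than a second */
                if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC)
                        return false;
                return true;
        }

        static inline bool timespec_valid_strict(const struct timespec *ts)
        {
                if (!timespec_valid(ts))
                        return false;
                /* Disallow values that would overflow ktime_t */
                if ((unsigned long long)ts->tv_sec >= KTIME_SEC_MAX)
                        return false;
                return true;
        }
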
@@ -436,7 +454,6 @@ int do_settimeofday(const struct timespec *tv)
436} 454}
437EXPORT_SYMBOL(do_settimeofday); 455EXPORT_SYMBOL(do_settimeofday);
438 456
439
440/** 457/**
441 * timekeeping_inject_offset - Adds or subtracts from the current time. 458 * timekeeping_inject_offset - Adds or subtracts from the current time.
442 * @tv: pointer to the timespec variable containing the offset 459 * @tv: pointer to the timespec variable containing the offset
@@ -445,28 +462,37 @@ EXPORT_SYMBOL(do_settimeofday);
445 */ 462 */
446int timekeeping_inject_offset(struct timespec *ts) 463int timekeeping_inject_offset(struct timespec *ts)
447{ 464{
465 struct timekeeper *tk = &timekeeper;
448 unsigned long flags; 466 unsigned long flags;
467 struct timespec tmp;
468 int ret = 0;
449 469
450 if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) 470 if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC)
451 return -EINVAL; 471 return -EINVAL;
452 472
453 write_seqlock_irqsave(&timekeeper.lock, flags); 473 write_seqlock_irqsave(&tk->lock, flags);
454 474
455 timekeeping_forward_now(&timekeeper); 475 timekeeping_forward_now(tk);
456 476
477 /* Make sure the proposed value is valid */
478 tmp = timespec_add(tk_xtime(tk), *ts);
479 if (!timespec_valid_strict(&tmp)) {
480 ret = -EINVAL;
481 goto error;
482 }
457 483
458 tk_xtime_add(&timekeeper, ts); 484 tk_xtime_add(tk, ts);
459 timekeeper.wall_to_monotonic = 485 tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts));
460 timespec_sub(timekeeper.wall_to_monotonic, *ts);
461 486
462 timekeeping_update(&timekeeper, true); 487error: /* even if we error out, we forwarded the time, so call update */
488 timekeeping_update(tk, true);
463 489
464 write_sequnlock_irqrestore(&timekeeper.lock, flags); 490 write_sequnlock_irqrestore(&tk->lock, flags);
465 491
466 /* signal hrtimers about time change */ 492 /* signal hrtimers about time change */
467 clock_was_set(); 493 clock_was_set();
468 494
469 return 0; 495 return ret;
470} 496}
471EXPORT_SYMBOL(timekeeping_inject_offset); 497EXPORT_SYMBOL(timekeeping_inject_offset);
472 498
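
Note the changed control flow: once timekeeping_forward_now() has run, the function must still reach timekeeping_update() even when the proposed offset is rejected, hence the error label instead of an early return. For context, a caller looks like this (the in-tree caller is the ADJ_SETOFFSET path of adjtimex; the values below are made up):

        struct timespec delta = { .tv_sec = 0, .tv_nsec = 500000 };    /* +0.5 ms */
        int err;

        err = timekeeping_inject_offset(&delta);
        if (err)
                pr_warn("time offset rejected: %d\n", err);
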
@@ -477,23 +503,24 @@ EXPORT_SYMBOL(timekeeping_inject_offset);
477 */ 503 */
478static int change_clocksource(void *data) 504static int change_clocksource(void *data)
479{ 505{
506 struct timekeeper *tk = &timekeeper;
480 struct clocksource *new, *old; 507 struct clocksource *new, *old;
481 unsigned long flags; 508 unsigned long flags;
482 509
483 new = (struct clocksource *) data; 510 new = (struct clocksource *) data;
484 511
485 write_seqlock_irqsave(&timekeeper.lock, flags); 512 write_seqlock_irqsave(&tk->lock, flags);
486 513
487 timekeeping_forward_now(&timekeeper); 514 timekeeping_forward_now(tk);
488 if (!new->enable || new->enable(new) == 0) { 515 if (!new->enable || new->enable(new) == 0) {
489 old = timekeeper.clock; 516 old = tk->clock;
490 tk_setup_internals(&timekeeper, new); 517 tk_setup_internals(tk, new);
491 if (old->disable) 518 if (old->disable)
492 old->disable(old); 519 old->disable(old);
493 } 520 }
494 timekeeping_update(&timekeeper, true); 521 timekeeping_update(tk, true);
495 522
496 write_sequnlock_irqrestore(&timekeeper.lock, flags); 523 write_sequnlock_irqrestore(&tk->lock, flags);
497 524
498 return 0; 525 return 0;
499} 526}
@@ -507,7 +534,9 @@ static int change_clocksource(void *data)
507 */ 534 */
508void timekeeping_notify(struct clocksource *clock) 535void timekeeping_notify(struct clocksource *clock)
509{ 536{
510 if (timekeeper.clock == clock) 537 struct timekeeper *tk = &timekeeper;
538
539 if (tk->clock == clock)
511 return; 540 return;
512 stop_machine(change_clocksource, clock, NULL); 541 stop_machine(change_clocksource, clock, NULL);
513 tick_clock_notify(); 542 tick_clock_notify();
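
change_clocksource() installs the new source only if it has no enable() hook or the hook returns 0, and it calls the outgoing source's disable() hook if present; timekeeping_notify() runs the swap under stop_machine() so no CPU can be mid-update while tk->clock changes. A toy clocksource exercising those optional hooks might look like this (everything below is hypothetical):

        #include <linux/clocksource.h>

        static u64 fake_counter;

        static cycle_t fake_cs_read(struct clocksource *cs)
        {
                /* stand-in for reading a free-running hardware counter */
                return (cycle_t)fake_counter++;
        }

        static int fake_cs_enable(struct clocksource *cs)
        {
                /* power up / start the counter; non-zero vetoes the switch */
                return 0;
        }

        static void fake_cs_disable(struct clocksource *cs)
        {
                /* counterpart, called on the clocksource being replaced */
        }

        static struct clocksource fake_cs = {
                .name           = "fake-timer",
                .rating         = 300,
                .read           = fake_cs_read,
                .mask           = CLOCKSOURCE_MASK(64),
                .flags          = CLOCK_SOURCE_IS_CONTINUOUS,
                .enable         = fake_cs_enable,
                .disable        = fake_cs_disable,
        };

Registering it (e.g. with clocksource_register_hz()) would reach timekeeping_notify() through the normal clocksource selection logic if it became the best-rated source.
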
@@ -536,35 +565,36 @@ EXPORT_SYMBOL_GPL(ktime_get_real);
536 */ 565 */
537void getrawmonotonic(struct timespec *ts) 566void getrawmonotonic(struct timespec *ts)
538{ 567{
568 struct timekeeper *tk = &timekeeper;
539 unsigned long seq; 569 unsigned long seq;
540 s64 nsecs; 570 s64 nsecs;
541 571
542 do { 572 do {
543 seq = read_seqbegin(&timekeeper.lock); 573 seq = read_seqbegin(&tk->lock);
544 nsecs = timekeeping_get_ns_raw(&timekeeper); 574 nsecs = timekeeping_get_ns_raw(tk);
545 *ts = timekeeper.raw_time; 575 *ts = tk->raw_time;
546 576
547 } while (read_seqretry(&timekeeper.lock, seq)); 577 } while (read_seqretry(&tk->lock, seq));
548 578
549 timespec_add_ns(ts, nsecs); 579 timespec_add_ns(ts, nsecs);
550} 580}
551EXPORT_SYMBOL(getrawmonotonic); 581EXPORT_SYMBOL(getrawmonotonic);
552 582
553
554/** 583/**
555 * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres 584 * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres
556 */ 585 */
557int timekeeping_valid_for_hres(void) 586int timekeeping_valid_for_hres(void)
558{ 587{
588 struct timekeeper *tk = &timekeeper;
559 unsigned long seq; 589 unsigned long seq;
560 int ret; 590 int ret;
561 591
562 do { 592 do {
563 seq = read_seqbegin(&timekeeper.lock); 593 seq = read_seqbegin(&tk->lock);
564 594
565 ret = timekeeper.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; 595 ret = tk->clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
566 596
567 } while (read_seqretry(&timekeeper.lock, seq)); 597 } while (read_seqretry(&tk->lock, seq));
568 598
569 return ret; 599 return ret;
570} 600}
@@ -574,15 +604,16 @@ int timekeeping_valid_for_hres(void)
574 */ 604 */
575u64 timekeeping_max_deferment(void) 605u64 timekeeping_max_deferment(void)
576{ 606{
607 struct timekeeper *tk = &timekeeper;
577 unsigned long seq; 608 unsigned long seq;
578 u64 ret; 609 u64 ret;
579 610
580 do { 611 do {
581 seq = read_seqbegin(&timekeeper.lock); 612 seq = read_seqbegin(&tk->lock);
582 613
583 ret = timekeeper.clock->max_idle_ns; 614 ret = tk->clock->max_idle_ns;
584 615
585 } while (read_seqretry(&timekeeper.lock, seq)); 616 } while (read_seqretry(&tk->lock, seq));
586 617
587 return ret; 618 return ret;
588} 619}
@@ -622,46 +653,56 @@ void __attribute__((weak)) read_boot_clock(struct timespec *ts)
622 */ 653 */
623void __init timekeeping_init(void) 654void __init timekeeping_init(void)
624{ 655{
656 struct timekeeper *tk = &timekeeper;
625 struct clocksource *clock; 657 struct clocksource *clock;
626 unsigned long flags; 658 unsigned long flags;
627 struct timespec now, boot; 659 struct timespec now, boot, tmp;
628 660
629 read_persistent_clock(&now); 661 read_persistent_clock(&now);
662 if (!timespec_valid_strict(&now)) {
663 pr_warn("WARNING: Persistent clock returned invalid value!\n"
664 " Check your CMOS/BIOS settings.\n");
665 now.tv_sec = 0;
666 now.tv_nsec = 0;
667 }
668
630 read_boot_clock(&boot); 669 read_boot_clock(&boot);
670 if (!timespec_valid_strict(&boot)) {
671 pr_warn("WARNING: Boot clock returned invalid value!\n"
672 " Check your CMOS/BIOS settings.\n");
673 boot.tv_sec = 0;
674 boot.tv_nsec = 0;
675 }
631 676
632 seqlock_init(&timekeeper.lock); 677 seqlock_init(&tk->lock);
633 678
634 ntp_init(); 679 ntp_init();
635 680
636 write_seqlock_irqsave(&timekeeper.lock, flags); 681 write_seqlock_irqsave(&tk->lock, flags);
637 clock = clocksource_default_clock(); 682 clock = clocksource_default_clock();
638 if (clock->enable) 683 if (clock->enable)
639 clock->enable(clock); 684 clock->enable(clock);
640 tk_setup_internals(&timekeeper, clock); 685 tk_setup_internals(tk, clock);
641 686
642 tk_set_xtime(&timekeeper, &now); 687 tk_set_xtime(tk, &now);
643 timekeeper.raw_time.tv_sec = 0; 688 tk->raw_time.tv_sec = 0;
644 timekeeper.raw_time.tv_nsec = 0; 689 tk->raw_time.tv_nsec = 0;
645 if (boot.tv_sec == 0 && boot.tv_nsec == 0) 690 if (boot.tv_sec == 0 && boot.tv_nsec == 0)
646 boot = tk_xtime(&timekeeper); 691 boot = tk_xtime(tk);
647 692
648 set_normalized_timespec(&timekeeper.wall_to_monotonic, 693 set_normalized_timespec(&tmp, -boot.tv_sec, -boot.tv_nsec);
649 -boot.tv_sec, -boot.tv_nsec); 694 tk_set_wall_to_mono(tk, tmp);
650 update_rt_offset(&timekeeper); 695
651 timekeeper.total_sleep_time.tv_sec = 0; 696 tmp.tv_sec = 0;
652 timekeeper.total_sleep_time.tv_nsec = 0; 697 tmp.tv_nsec = 0;
653 write_sequnlock_irqrestore(&timekeeper.lock, flags); 698 tk_set_sleep_time(tk, tmp);
699
700 write_sequnlock_irqrestore(&tk->lock, flags);
654} 701}
655 702
656/* time in seconds when suspend began */ 703/* time in seconds when suspend began */
657static struct timespec timekeeping_suspend_time; 704static struct timespec timekeeping_suspend_time;
658 705
659static void update_sleep_time(struct timespec t)
660{
661 timekeeper.total_sleep_time = t;
662 timekeeper.offs_boot = timespec_to_ktime(t);
663}
664
665/** 706/**
666 * __timekeeping_inject_sleeptime - Internal function to add sleep interval 707 * __timekeeping_inject_sleeptime - Internal function to add sleep interval
667 * @delta: pointer to a timespec delta value 708 * @delta: pointer to a timespec delta value
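
The new tk_set_wall_to_mono() and tk_set_sleep_time() helpers keep the derived offs_real/offs_boot fields coherent, but the arithmetic in timekeeping_init() is unchanged: wall_to_monotonic is initialised to the negated boot wall time so that CLOCK_MONOTONIC = xtime + wall_to_monotonic starts near zero. A worked example with made-up numbers:

        /* Suppose read_persistent_clock() reported 1000000000.250000000 s: */
        struct timespec boot = { .tv_sec = 1000000000, .tv_nsec = 250000000 };
        struct timespec wtm;

        set_normalized_timespec(&wtm, -boot.tv_sec, -boot.tv_nsec);
        /* wtm == { .tv_sec = -1000000001, .tv_nsec = 750000000 }, i.e.
         * -1000000000.25 s, so xtime + wtm reads roughly 0 right after boot.
         */
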
@@ -672,18 +713,16 @@ static void update_sleep_time(struct timespec t)
672static void __timekeeping_inject_sleeptime(struct timekeeper *tk, 713static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
673 struct timespec *delta) 714 struct timespec *delta)
674{ 715{
675 if (!timespec_valid(delta)) { 716 if (!timespec_valid_strict(delta)) {
676 printk(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid " 717 printk(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid "
677 "sleep delta value!\n"); 718 "sleep delta value!\n");
678 return; 719 return;
679 } 720 }
680
681 tk_xtime_add(tk, delta); 721 tk_xtime_add(tk, delta);
682 tk->wall_to_monotonic = timespec_sub(tk->wall_to_monotonic, *delta); 722 tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *delta));
683 update_sleep_time(timespec_add(tk->total_sleep_time, *delta)); 723 tk_set_sleep_time(tk, timespec_add(tk->total_sleep_time, *delta));
684} 724}
685 725
686
687/** 726/**
688 * timekeeping_inject_sleeptime - Adds suspend interval to timekeeping values 727 * timekeeping_inject_sleeptime - Adds suspend interval to timekeeping values
689 * @delta: pointer to a timespec delta value 728 * @delta: pointer to a timespec delta value
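
Switching to tk_set_wall_to_mono()/tk_set_sleep_time() does not alter the invariant maintained here. For a hypothetical 30 second suspend the bookkeeping works out as:

        /*
         *   xtime             += 30 s  (CLOCK_REALTIME catches up with the RTC)
         *   wall_to_monotonic -= 30 s  (xtime + wall_to_monotonic is unchanged,
         *                               so CLOCK_MONOTONIC does not jump)
         *   total_sleep_time  += 30 s  (so CLOCK_BOOTTIME does count the sleep)
         */
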
@@ -696,6 +735,7 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
696 */ 735 */
697void timekeeping_inject_sleeptime(struct timespec *delta) 736void timekeeping_inject_sleeptime(struct timespec *delta)
698{ 737{
738 struct timekeeper *tk = &timekeeper;
699 unsigned long flags; 739 unsigned long flags;
700 struct timespec ts; 740 struct timespec ts;
701 741
@@ -704,21 +744,20 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
704 if (!(ts.tv_sec == 0 && ts.tv_nsec == 0)) 744 if (!(ts.tv_sec == 0 && ts.tv_nsec == 0))
705 return; 745 return;
706 746
707 write_seqlock_irqsave(&timekeeper.lock, flags); 747 write_seqlock_irqsave(&tk->lock, flags);
708 748
709 timekeeping_forward_now(&timekeeper); 749 timekeeping_forward_now(tk);
710 750
711 __timekeeping_inject_sleeptime(&timekeeper, delta); 751 __timekeeping_inject_sleeptime(tk, delta);
712 752
713 timekeeping_update(&timekeeper, true); 753 timekeeping_update(tk, true);
714 754
715 write_sequnlock_irqrestore(&timekeeper.lock, flags); 755 write_sequnlock_irqrestore(&tk->lock, flags);
716 756
717 /* signal hrtimers about time change */ 757 /* signal hrtimers about time change */
718 clock_was_set(); 758 clock_was_set();
719} 759}
720 760
721
722/** 761/**
723 * timekeeping_resume - Resumes the generic timekeeping subsystem. 762 * timekeeping_resume - Resumes the generic timekeeping subsystem.
724 * 763 *
@@ -728,6 +767,7 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
728 */ 767 */
729static void timekeeping_resume(void) 768static void timekeeping_resume(void)
730{ 769{
770 struct timekeeper *tk = &timekeeper;
731 unsigned long flags; 771 unsigned long flags;
732 struct timespec ts; 772 struct timespec ts;
733 773
@@ -735,18 +775,18 @@ static void timekeeping_resume(void)
735 775
736 clocksource_resume(); 776 clocksource_resume();
737 777
738 write_seqlock_irqsave(&timekeeper.lock, flags); 778 write_seqlock_irqsave(&tk->lock, flags);
739 779
740 if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { 780 if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) {
741 ts = timespec_sub(ts, timekeeping_suspend_time); 781 ts = timespec_sub(ts, timekeeping_suspend_time);
742 __timekeeping_inject_sleeptime(&timekeeper, &ts); 782 __timekeeping_inject_sleeptime(tk, &ts);
743 } 783 }
744 /* re-base the last cycle value */ 784 /* re-base the last cycle value */
745 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); 785 tk->clock->cycle_last = tk->clock->read(tk->clock);
746 timekeeper.ntp_error = 0; 786 tk->ntp_error = 0;
747 timekeeping_suspended = 0; 787 timekeeping_suspended = 0;
748 timekeeping_update(&timekeeper, false); 788 timekeeping_update(tk, false);
749 write_sequnlock_irqrestore(&timekeeper.lock, flags); 789 write_sequnlock_irqrestore(&tk->lock, flags);
750 790
751 touch_softlockup_watchdog(); 791 touch_softlockup_watchdog();
752 792
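
Re-basing cycle_last right after resume matters because the next accumulation measures elapsed cycles relative to it; a sketch of that delta computation (the same expression update_wall_time() uses further down):

        cycle_t now   = tk->clock->read(tk->clock);
        cycle_t delta = (now - tk->clock->cycle_last) & tk->clock->mask;
        /* With cycle_last refreshed here, delta restarts near zero, so the time
         * spent suspended is accounted only once, via
         * __timekeeping_inject_sleeptime() above, not a second time here.
         */
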
@@ -758,14 +798,15 @@ static void timekeeping_resume(void)
758 798
759static int timekeeping_suspend(void) 799static int timekeeping_suspend(void)
760{ 800{
801 struct timekeeper *tk = &timekeeper;
761 unsigned long flags; 802 unsigned long flags;
762 struct timespec delta, delta_delta; 803 struct timespec delta, delta_delta;
763 static struct timespec old_delta; 804 static struct timespec old_delta;
764 805
765 read_persistent_clock(&timekeeping_suspend_time); 806 read_persistent_clock(&timekeeping_suspend_time);
766 807
767 write_seqlock_irqsave(&timekeeper.lock, flags); 808 write_seqlock_irqsave(&tk->lock, flags);
768 timekeeping_forward_now(&timekeeper); 809 timekeeping_forward_now(tk);
769 timekeeping_suspended = 1; 810 timekeeping_suspended = 1;
770 811
771 /* 812 /*
@@ -774,7 +815,7 @@ static int timekeeping_suspend(void)
774 * try to compensate so the difference in system time 815 * try to compensate so the difference in system time
775 * and persistent_clock time stays close to constant. 816 * and persistent_clock time stays close to constant.
776 */ 817 */
777 delta = timespec_sub(tk_xtime(&timekeeper), timekeeping_suspend_time); 818 delta = timespec_sub(tk_xtime(tk), timekeeping_suspend_time);
778 delta_delta = timespec_sub(delta, old_delta); 819 delta_delta = timespec_sub(delta, old_delta);
779 if (abs(delta_delta.tv_sec) >= 2) { 820 if (abs(delta_delta.tv_sec) >= 2) {
780 /* 821 /*
@@ -787,7 +828,7 @@ static int timekeeping_suspend(void)
787 timekeeping_suspend_time = 828 timekeeping_suspend_time =
788 timespec_add(timekeeping_suspend_time, delta_delta); 829 timespec_add(timekeeping_suspend_time, delta_delta);
789 } 830 }
790 write_sequnlock_irqrestore(&timekeeper.lock, flags); 831 write_sequnlock_irqrestore(&tk->lock, flags);
791 832
792 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); 833 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
793 clocksource_suspend(); 834 clocksource_suspend();
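
The delta/delta_delta logic compensates for the persistent clock's coarse (typically one second) read resolution: a small change in the xtime-versus-persistent-clock difference is folded into timekeeping_suspend_time so it is not misread as sleep on resume, while a jump of two seconds or more is treated as the clock having been set. A made-up example of the sub-2s case:

        /*
         * xtime at suspend        = 1000.7 s
         * persistent clock read   = 1000.0 s  -> delta       = 0.7 s
         * delta at last suspend   =    0.2 s  -> delta_delta = 0.5 s (< 2 s)
         * => timekeeping_suspend_time += 0.5 s, keeping the system-time /
         *    persistent-time difference roughly constant across cycles.
         */
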
@@ -898,27 +939,29 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
898 * the error. This causes the likely below to be unlikely. 939 * the error. This causes the likely below to be unlikely.
899 * 940 *
900 * The proper fix is to avoid rounding up by using 941 * The proper fix is to avoid rounding up by using
901 * the high precision timekeeper.xtime_nsec instead of 942 * the high precision tk->xtime_nsec instead of
902 * xtime.tv_nsec everywhere. Fixing this will take some 943 * xtime.tv_nsec everywhere. Fixing this will take some
903 * time. 944 * time.
904 */ 945 */
905 if (likely(error <= interval)) 946 if (likely(error <= interval))
906 adj = 1; 947 adj = 1;
907 else 948 else
908 adj = timekeeping_bigadjust(tk, error, &interval, 949 adj = timekeeping_bigadjust(tk, error, &interval, &offset);
909 &offset); 950 } else {
910 } else if (error < -interval) { 951 if (error < -interval) {
911 /* See comment above, this is just switched for the negative */ 952 /* See comment above, this is just switched for the negative */
912 error >>= 2; 953 error >>= 2;
913 if (likely(error >= -interval)) { 954 if (likely(error >= -interval)) {
914 adj = -1; 955 adj = -1;
915 interval = -interval; 956 interval = -interval;
916 offset = -offset; 957 offset = -offset;
917 } else 958 } else {
918 adj = timekeeping_bigadjust(tk, error, &interval, 959 adj = timekeeping_bigadjust(tk, error, &interval, &offset);
919 &offset); 960 }
920 } else 961 } else {
921 return; 962 goto out_adjust;
963 }
964 }
922 965
923 if (unlikely(tk->clock->maxadj && 966 if (unlikely(tk->clock->maxadj &&
924 (tk->mult + adj > tk->clock->mult + tk->clock->maxadj))) { 967 (tk->mult + adj > tk->clock->mult + tk->clock->maxadj))) {
@@ -981,6 +1024,7 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
981 tk->xtime_nsec -= offset; 1024 tk->xtime_nsec -= offset;
982 tk->ntp_error -= (interval - offset) << tk->ntp_error_shift; 1025 tk->ntp_error -= (interval - offset) << tk->ntp_error_shift;
983 1026
1027out_adjust:
984 /* 1028 /*
985 * It may be possible that when we entered this function, xtime_nsec 1029 * It may be possible that when we entered this function, xtime_nsec
986 * was very small. Further, if we're slightly speeding the clocksource 1030 * was very small. Further, if we're slightly speeding the clocksource
@@ -1003,7 +1047,6 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
1003 1047
1004} 1048}
1005 1049
1006
1007/** 1050/**
1008 * accumulate_nsecs_to_secs - Accumulates nsecs into secs 1051 * accumulate_nsecs_to_secs - Accumulates nsecs into secs
1009 * 1052 *
@@ -1024,15 +1067,21 @@ static inline void accumulate_nsecs_to_secs(struct timekeeper *tk)
1024 1067
1025 /* Figure out if it's a leap sec and apply if needed */ 1068 /* Figure out if it's a leap sec and apply if needed */
1026 leap = second_overflow(tk->xtime_sec); 1069 leap = second_overflow(tk->xtime_sec);
1027 tk->xtime_sec += leap; 1070 if (unlikely(leap)) {
1028 tk->wall_to_monotonic.tv_sec -= leap; 1071 struct timespec ts;
1029 if (leap) 1072
1030 clock_was_set_delayed(); 1073 tk->xtime_sec += leap;
1074
1075 ts.tv_sec = leap;
1076 ts.tv_nsec = 0;
1077 tk_set_wall_to_mono(tk,
1078 timespec_sub(tk->wall_to_monotonic, ts));
1031 1079
1080 clock_was_set_delayed();
1081 }
1032 } 1082 }
1033} 1083}
1034 1084
1035
1036/** 1085/**
1037 * logarithmic_accumulation - shifted accumulation of cycles 1086 * logarithmic_accumulation - shifted accumulation of cycles
1038 * 1087 *
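
With the leap second application now under unlikely(), the common once-per-second path no longer touches wall_to_monotonic at all. When a leap does fire, the update stays equal and opposite (the sign below is illustrative; second_overflow() decides between insert and delete):

        /*
         *   xtime_sec         += leap     (UTC steps by one second)
         *   wall_to_monotonic -= leap s   (xtime + wall_to_monotonic unchanged,
         *                                  CLOCK_MONOTONIC stays continuous)
         * clock_was_set_delayed() then lets hrtimers reprogram their clock
         * bases from a context that does not hold the timekeeper write seqlock.
         */
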
@@ -1076,7 +1125,6 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
1076 return offset; 1125 return offset;
1077} 1126}
1078 1127
1079
1080/** 1128/**
1081 * update_wall_time - Uses the current clocksource to increment the wall time 1129 * update_wall_time - Uses the current clocksource to increment the wall time
1082 * 1130 *
@@ -1084,25 +1132,30 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
1084static void update_wall_time(void) 1132static void update_wall_time(void)
1085{ 1133{
1086 struct clocksource *clock; 1134 struct clocksource *clock;
1135 struct timekeeper *tk = &timekeeper;
1087 cycle_t offset; 1136 cycle_t offset;
1088 int shift = 0, maxshift; 1137 int shift = 0, maxshift;
1089 unsigned long flags; 1138 unsigned long flags;
1090 s64 remainder; 1139 s64 remainder;
1091 1140
1092 write_seqlock_irqsave(&timekeeper.lock, flags); 1141 write_seqlock_irqsave(&tk->lock, flags);
1093 1142
1094 /* Make sure we're fully resumed: */ 1143 /* Make sure we're fully resumed: */
1095 if (unlikely(timekeeping_suspended)) 1144 if (unlikely(timekeeping_suspended))
1096 goto out; 1145 goto out;
1097 1146
1098 clock = timekeeper.clock; 1147 clock = tk->clock;
1099 1148
1100#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET 1149#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
1101 offset = timekeeper.cycle_interval; 1150 offset = tk->cycle_interval;
1102#else 1151#else
1103 offset = (clock->read(clock) - clock->cycle_last) & clock->mask; 1152 offset = (clock->read(clock) - clock->cycle_last) & clock->mask;
1104#endif 1153#endif
1105 1154
1155 /* Check if there's really nothing to do */
1156 if (offset < tk->cycle_interval)
1157 goto out;
1158
1106 /* 1159 /*
1107 * With NO_HZ we may have to accumulate many cycle_intervals 1160 * With NO_HZ we may have to accumulate many cycle_intervals
1108 * (think "ticks") worth of time at once. To do this efficiently, 1161 * (think "ticks") worth of time at once. To do this efficiently,
@@ -1111,19 +1164,19 @@ static void update_wall_time(void)
1111 * chunk in one go, and then try to consume the next smaller 1164 * chunk in one go, and then try to consume the next smaller
1112 * doubled multiple. 1165 * doubled multiple.
1113 */ 1166 */
1114 shift = ilog2(offset) - ilog2(timekeeper.cycle_interval); 1167 shift = ilog2(offset) - ilog2(tk->cycle_interval);
1115 shift = max(0, shift); 1168 shift = max(0, shift);
1116 /* Bound shift to one less than what overflows tick_length */ 1169 /* Bound shift to one less than what overflows tick_length */
1117 maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1; 1170 maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1;
1118 shift = min(shift, maxshift); 1171 shift = min(shift, maxshift);
1119 while (offset >= timekeeper.cycle_interval) { 1172 while (offset >= tk->cycle_interval) {
1120 offset = logarithmic_accumulation(&timekeeper, offset, shift); 1173 offset = logarithmic_accumulation(tk, offset, shift);
1121 if(offset < timekeeper.cycle_interval<<shift) 1174 if (offset < tk->cycle_interval<<shift)
1122 shift--; 1175 shift--;
1123 } 1176 }
1124 1177
1125 /* correct the clock when NTP error is too big */ 1178 /* correct the clock when NTP error is too big */
1126 timekeeping_adjust(&timekeeper, offset); 1179 timekeeping_adjust(tk, offset);
1127 1180
1128 1181
1129 /* 1182 /*
@@ -1135,21 +1188,21 @@ static void update_wall_time(void)
1135 * the vsyscall implementations are converted to use xtime_nsec 1188 * the vsyscall implementations are converted to use xtime_nsec
1136 * (shifted nanoseconds), this can be killed. 1189 * (shifted nanoseconds), this can be killed.
1137 */ 1190 */
1138 remainder = timekeeper.xtime_nsec & ((1 << timekeeper.shift) - 1); 1191 remainder = tk->xtime_nsec & ((1ULL << tk->shift) - 1);
1139 timekeeper.xtime_nsec -= remainder; 1192 tk->xtime_nsec -= remainder;
1140 timekeeper.xtime_nsec += 1 << timekeeper.shift; 1193 tk->xtime_nsec += 1ULL << tk->shift;
1141 timekeeper.ntp_error += remainder << timekeeper.ntp_error_shift; 1194 tk->ntp_error += remainder << tk->ntp_error_shift;
1142 1195
1143 /* 1196 /*
1144 * Finally, make sure that after the rounding 1197 * Finally, make sure that after the rounding
1145 * xtime_nsec isn't larger than NSEC_PER_SEC 1198 * xtime_nsec isn't larger than NSEC_PER_SEC
1146 */ 1199 */
1147 accumulate_nsecs_to_secs(&timekeeper); 1200 accumulate_nsecs_to_secs(tk);
1148 1201
1149 timekeeping_update(&timekeeper, false); 1202 timekeeping_update(tk, false);
1150 1203
1151out: 1204out:
1152 write_sequnlock_irqrestore(&timekeeper.lock, flags); 1205 write_sequnlock_irqrestore(&tk->lock, flags);
1153 1206
1154} 1207}
1155 1208
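
Two details here are worth a worked example. First, the shift selection: when several ticks' worth of cycles are pending (NO_HZ), the backlog is consumed in power-of-two sized chunks. Second, the remainder trick now uses 1ULL so the mask and increment are computed in 64 bits even where tk->shift is large on a 32-bit build. Made-up numbers for the first point:

        /*
         * cycle_interval = 1,000,000 cycles per tick,
         * offset         = 4,000,000 cycles pending (a 4-tick idle period):
         *
         *   shift = ilog2(4000000) - ilog2(1000000) = 21 - 19 = 2
         *   cycle_interval << shift = 4,000,000 <= offset
         *
         * so a single logarithmic_accumulation(tk, offset, 2) call accumulates
         * all four ticks, and the loop exits with offset < cycle_interval.
         */
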
@@ -1166,18 +1219,18 @@ out:
1166 */ 1219 */
1167void getboottime(struct timespec *ts) 1220void getboottime(struct timespec *ts)
1168{ 1221{
1222 struct timekeeper *tk = &timekeeper;
1169 struct timespec boottime = { 1223 struct timespec boottime = {
1170 .tv_sec = timekeeper.wall_to_monotonic.tv_sec + 1224 .tv_sec = tk->wall_to_monotonic.tv_sec +
1171 timekeeper.total_sleep_time.tv_sec, 1225 tk->total_sleep_time.tv_sec,
1172 .tv_nsec = timekeeper.wall_to_monotonic.tv_nsec + 1226 .tv_nsec = tk->wall_to_monotonic.tv_nsec +
1173 timekeeper.total_sleep_time.tv_nsec 1227 tk->total_sleep_time.tv_nsec
1174 }; 1228 };
1175 1229
1176 set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec); 1230 set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec);
1177} 1231}
1178EXPORT_SYMBOL_GPL(getboottime); 1232EXPORT_SYMBOL_GPL(getboottime);
1179 1233
1180
1181/** 1234/**
1182 * get_monotonic_boottime - Returns monotonic time since boot 1235 * get_monotonic_boottime - Returns monotonic time since boot
1183 * @ts: pointer to the timespec to be set 1236 * @ts: pointer to the timespec to be set
@@ -1189,19 +1242,20 @@ EXPORT_SYMBOL_GPL(getboottime);
1189 */ 1242 */
1190void get_monotonic_boottime(struct timespec *ts) 1243void get_monotonic_boottime(struct timespec *ts)
1191{ 1244{
1245 struct timekeeper *tk = &timekeeper;
1192 struct timespec tomono, sleep; 1246 struct timespec tomono, sleep;
1193 unsigned int seq; 1247 unsigned int seq;
1194 1248
1195 WARN_ON(timekeeping_suspended); 1249 WARN_ON(timekeeping_suspended);
1196 1250
1197 do { 1251 do {
1198 seq = read_seqbegin(&timekeeper.lock); 1252 seq = read_seqbegin(&tk->lock);
1199 ts->tv_sec = timekeeper.xtime_sec; 1253 ts->tv_sec = tk->xtime_sec;
1200 ts->tv_nsec = timekeeping_get_ns(&timekeeper); 1254 ts->tv_nsec = timekeeping_get_ns(tk);
1201 tomono = timekeeper.wall_to_monotonic; 1255 tomono = tk->wall_to_monotonic;
1202 sleep = timekeeper.total_sleep_time; 1256 sleep = tk->total_sleep_time;
1203 1257
1204 } while (read_seqretry(&timekeeper.lock, seq)); 1258 } while (read_seqretry(&tk->lock, seq));
1205 1259
1206 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec, 1260 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec,
1207 ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec); 1261 ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec);
@@ -1231,31 +1285,38 @@ EXPORT_SYMBOL_GPL(ktime_get_boottime);
1231 */ 1285 */
1232void monotonic_to_bootbased(struct timespec *ts) 1286void monotonic_to_bootbased(struct timespec *ts)
1233{ 1287{
1234 *ts = timespec_add(*ts, timekeeper.total_sleep_time); 1288 struct timekeeper *tk = &timekeeper;
1289
1290 *ts = timespec_add(*ts, tk->total_sleep_time);
1235} 1291}
1236EXPORT_SYMBOL_GPL(monotonic_to_bootbased); 1292EXPORT_SYMBOL_GPL(monotonic_to_bootbased);
1237 1293
1238unsigned long get_seconds(void) 1294unsigned long get_seconds(void)
1239{ 1295{
1240 return timekeeper.xtime_sec; 1296 struct timekeeper *tk = &timekeeper;
1297
1298 return tk->xtime_sec;
1241} 1299}
1242EXPORT_SYMBOL(get_seconds); 1300EXPORT_SYMBOL(get_seconds);
1243 1301
1244struct timespec __current_kernel_time(void) 1302struct timespec __current_kernel_time(void)
1245{ 1303{
1246 return tk_xtime(&timekeeper); 1304 struct timekeeper *tk = &timekeeper;
1305
1306 return tk_xtime(tk);
1247} 1307}
1248 1308
1249struct timespec current_kernel_time(void) 1309struct timespec current_kernel_time(void)
1250{ 1310{
1311 struct timekeeper *tk = &timekeeper;
1251 struct timespec now; 1312 struct timespec now;
1252 unsigned long seq; 1313 unsigned long seq;
1253 1314
1254 do { 1315 do {
1255 seq = read_seqbegin(&timekeeper.lock); 1316 seq = read_seqbegin(&tk->lock);
1256 1317
1257 now = tk_xtime(&timekeeper); 1318 now = tk_xtime(tk);
1258 } while (read_seqretry(&timekeeper.lock, seq)); 1319 } while (read_seqretry(&tk->lock, seq));
1259 1320
1260 return now; 1321 return now;
1261} 1322}
@@ -1263,15 +1324,16 @@ EXPORT_SYMBOL(current_kernel_time);
1263 1324
1264struct timespec get_monotonic_coarse(void) 1325struct timespec get_monotonic_coarse(void)
1265{ 1326{
1327 struct timekeeper *tk = &timekeeper;
1266 struct timespec now, mono; 1328 struct timespec now, mono;
1267 unsigned long seq; 1329 unsigned long seq;
1268 1330
1269 do { 1331 do {
1270 seq = read_seqbegin(&timekeeper.lock); 1332 seq = read_seqbegin(&tk->lock);
1271 1333
1272 now = tk_xtime(&timekeeper); 1334 now = tk_xtime(tk);
1273 mono = timekeeper.wall_to_monotonic; 1335 mono = tk->wall_to_monotonic;
1274 } while (read_seqretry(&timekeeper.lock, seq)); 1336 } while (read_seqretry(&tk->lock, seq));
1275 1337
1276 set_normalized_timespec(&now, now.tv_sec + mono.tv_sec, 1338 set_normalized_timespec(&now, now.tv_sec + mono.tv_sec,
1277 now.tv_nsec + mono.tv_nsec); 1339 now.tv_nsec + mono.tv_nsec);
@@ -1300,14 +1362,15 @@ void do_timer(unsigned long ticks)
1300void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, 1362void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
1301 struct timespec *wtom, struct timespec *sleep) 1363 struct timespec *wtom, struct timespec *sleep)
1302{ 1364{
1365 struct timekeeper *tk = &timekeeper;
1303 unsigned long seq; 1366 unsigned long seq;
1304 1367
1305 do { 1368 do {
1306 seq = read_seqbegin(&timekeeper.lock); 1369 seq = read_seqbegin(&tk->lock);
1307 *xtim = tk_xtime(&timekeeper); 1370 *xtim = tk_xtime(tk);
1308 *wtom = timekeeper.wall_to_monotonic; 1371 *wtom = tk->wall_to_monotonic;
1309 *sleep = timekeeper.total_sleep_time; 1372 *sleep = tk->total_sleep_time;
1310 } while (read_seqretry(&timekeeper.lock, seq)); 1373 } while (read_seqretry(&tk->lock, seq));
1311} 1374}
1312 1375
1313#ifdef CONFIG_HIGH_RES_TIMERS 1376#ifdef CONFIG_HIGH_RES_TIMERS
@@ -1321,19 +1384,20 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
1321 */ 1384 */
1322ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot) 1385ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot)
1323{ 1386{
1387 struct timekeeper *tk = &timekeeper;
1324 ktime_t now; 1388 ktime_t now;
1325 unsigned int seq; 1389 unsigned int seq;
1326 u64 secs, nsecs; 1390 u64 secs, nsecs;
1327 1391
1328 do { 1392 do {
1329 seq = read_seqbegin(&timekeeper.lock); 1393 seq = read_seqbegin(&tk->lock);
1330 1394
1331 secs = timekeeper.xtime_sec; 1395 secs = tk->xtime_sec;
1332 nsecs = timekeeping_get_ns(&timekeeper); 1396 nsecs = timekeeping_get_ns(tk);
1333 1397
1334 *offs_real = timekeeper.offs_real; 1398 *offs_real = tk->offs_real;
1335 *offs_boot = timekeeper.offs_boot; 1399 *offs_boot = tk->offs_boot;
1336 } while (read_seqretry(&timekeeper.lock, seq)); 1400 } while (read_seqretry(&tk->lock, seq));
1337 1401
1338 now = ktime_add_ns(ktime_set(secs, 0), nsecs); 1402 now = ktime_add_ns(ktime_set(secs, 0), nsecs);
1339 now = ktime_sub(now, *offs_real); 1403 now = ktime_sub(now, *offs_real);
@@ -1346,19 +1410,19 @@ ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot)
1346 */ 1410 */
1347ktime_t ktime_get_monotonic_offset(void) 1411ktime_t ktime_get_monotonic_offset(void)
1348{ 1412{
1413 struct timekeeper *tk = &timekeeper;
1349 unsigned long seq; 1414 unsigned long seq;
1350 struct timespec wtom; 1415 struct timespec wtom;
1351 1416
1352 do { 1417 do {
1353 seq = read_seqbegin(&timekeeper.lock); 1418 seq = read_seqbegin(&tk->lock);
1354 wtom = timekeeper.wall_to_monotonic; 1419 wtom = tk->wall_to_monotonic;
1355 } while (read_seqretry(&timekeeper.lock, seq)); 1420 } while (read_seqretry(&tk->lock, seq));
1356 1421
1357 return timespec_to_ktime(wtom); 1422 return timespec_to_ktime(wtom);
1358} 1423}
1359EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset); 1424EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset);
1360 1425
1361
1362/** 1426/**
1363 * xtime_update() - advances the timekeeping infrastructure 1427 * xtime_update() - advances the timekeeping infrastructure
1364 * @ticks: number of ticks, that have elapsed since the last call. 1428 * @ticks: number of ticks, that have elapsed since the last call.
diff --git a/kernel/timer.c b/kernel/timer.c
index a61c09374eba..8c5e7b908c68 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1407,13 +1407,6 @@ SYSCALL_DEFINE1(alarm, unsigned int, seconds)
1407 1407
1408#endif 1408#endif
1409 1409
1410#ifndef __alpha__
1411
1412/*
1413 * The Alpha uses getxpid, getxuid, and getxgid instead. Maybe this
1414 * should be moved into arch/i386 instead?
1415 */
1416
1417/** 1410/**
1418 * sys_getpid - return the thread group id of the current process 1411 * sys_getpid - return the thread group id of the current process
1419 * 1412 *
@@ -1469,8 +1462,6 @@ SYSCALL_DEFINE0(getegid)
1469 return from_kgid_munged(current_user_ns(), current_egid()); 1462 return from_kgid_munged(current_user_ns(), current_egid());
1470} 1463}
1471 1464
1472#endif
1473
1474static void process_timeout(unsigned long __data) 1465static void process_timeout(unsigned long __data)
1475{ 1466{
1476 wake_up_process((struct task_struct *)__data); 1467 wake_up_process((struct task_struct *)__data);
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index fee3752ae8f6..8a6d2ee2086c 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -281,7 +281,7 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip)
281 281
282 head = this_cpu_ptr(event_function.perf_events); 282 head = this_cpu_ptr(event_function.perf_events);
283 perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0, 283 perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0,
284 1, &regs, head); 284 1, &regs, head, NULL);
285 285
286#undef ENTRY_SIZE 286#undef ENTRY_SIZE
287} 287}
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index b31d3d5699fe..1a2117043bb1 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1002,7 +1002,8 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp,
1002 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); 1002 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
1003 1003
1004 head = this_cpu_ptr(call->perf_events); 1004 head = this_cpu_ptr(call->perf_events);
1005 perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head); 1005 perf_trace_buf_submit(entry, size, rctx,
1006 entry->ip, 1, regs, head, NULL);
1006} 1007}
1007 1008
1008/* Kretprobe profile handler */ 1009/* Kretprobe profile handler */
@@ -1033,7 +1034,8 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
1033 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); 1034 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
1034 1035
1035 head = this_cpu_ptr(call->perf_events); 1036 head = this_cpu_ptr(call->perf_events);
1036 perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head); 1037 perf_trace_buf_submit(entry, size, rctx,
1038 entry->ret_ip, 1, regs, head, NULL);
1037} 1039}
1038#endif /* CONFIG_PERF_EVENTS */ 1040#endif /* CONFIG_PERF_EVENTS */
1039 1041
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 96fc73369099..6b245f64c8dd 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -506,6 +506,8 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
506 int size; 506 int size;
507 507
508 syscall_nr = syscall_get_nr(current, regs); 508 syscall_nr = syscall_get_nr(current, regs);
509 if (syscall_nr < 0)
510 return;
509 if (!test_bit(syscall_nr, enabled_perf_enter_syscalls)) 511 if (!test_bit(syscall_nr, enabled_perf_enter_syscalls))
510 return; 512 return;
511 513
@@ -532,7 +534,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
532 (unsigned long *)&rec->args); 534 (unsigned long *)&rec->args);
533 535
534 head = this_cpu_ptr(sys_data->enter_event->perf_events); 536 head = this_cpu_ptr(sys_data->enter_event->perf_events);
535 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head); 537 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL);
536} 538}
537 539
538int perf_sysenter_enable(struct ftrace_event_call *call) 540int perf_sysenter_enable(struct ftrace_event_call *call)
@@ -580,6 +582,8 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
580 int size; 582 int size;
581 583
582 syscall_nr = syscall_get_nr(current, regs); 584 syscall_nr = syscall_get_nr(current, regs);
585 if (syscall_nr < 0)
586 return;
583 if (!test_bit(syscall_nr, enabled_perf_exit_syscalls)) 587 if (!test_bit(syscall_nr, enabled_perf_exit_syscalls))
584 return; 588 return;
585 589
@@ -608,7 +612,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
608 rec->ret = syscall_get_return_value(current, regs); 612 rec->ret = syscall_get_return_value(current, regs);
609 613
610 head = this_cpu_ptr(sys_data->exit_event->perf_events); 614 head = this_cpu_ptr(sys_data->exit_event->perf_events);
611 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head); 615 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL);
612} 616}
613 617
614int perf_sysexit_enable(struct ftrace_event_call *call) 618int perf_sysexit_enable(struct ftrace_event_call *call)
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 2b36ac68549e..03003cd7dd96 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -670,7 +670,7 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
670 call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); 670 call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
671 671
672 head = this_cpu_ptr(call->perf_events); 672 head = this_cpu_ptr(call->perf_events);
673 perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head); 673 perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head, NULL);
674 674
675 out: 675 out:
676 preempt_enable(); 676 preempt_enable();
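
All four tracer call sites in this series gain the same trailing NULL argument. To the best of my recollection (an assumption, not something this diff states), the new last parameter of perf_trace_buf_submit() is a struct task_struct * that lets a trace event be accounted to a specific target task; passing NULL keeps the previous behaviour. The updated call shape, with placeholder arguments:

        /* "addr" stands for the probe/return address each call site passes;
         * the final argument is the (assumed) task pointer, NULL = no target.
         */
        perf_trace_buf_submit(entry, size, rctx, addr, 1, regs, head, NULL);
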