author    Ingo Molnar <mingo@kernel.org>    2014-08-24 16:32:24 -0400
committer Ingo Molnar <mingo@kernel.org>    2014-08-24 16:32:24 -0400
commit    83bc90e11576f9c100f8ef4ba2bcd0b89212e3fb (patch)
tree      e59186b4d315c80255851e0d204143ecc21399a0 /kernel
parent    e21ded5ecc531a64d6fc0c1693285e890b4e9569 (diff)
parent    451fd72219dd6f3355e2d036c598544c760ee532 (diff)
Merge branch 'linus' into perf/core, to fix conflicts
Conflicts:
	arch/x86/kernel/cpu/perf_event_intel_uncore*.c

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 28
-rw-r--r--  kernel/acct.c | 494
-rw-r--r--  kernel/audit.c | 2
-rw-r--r--  kernel/auditfilter.c | 4
-rw-r--r--  kernel/bounds.c | 2
-rw-r--r--  kernel/bpf/Makefile | 1
-rw-r--r--  kernel/bpf/core.c | 534
-rw-r--r--  kernel/capability.c | 4
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 2
-rw-r--r--  kernel/delayacct.c | 62
-rw-r--r--  kernel/events/uprobes.c | 15
-rw-r--r--  kernel/exit.c | 50
-rw-r--r--  kernel/fork.c | 133
-rw-r--r--  kernel/gcov/fs.c | 3
-rw-r--r--  kernel/irq/generic-chip.c | 5
-rw-r--r--  kernel/irq/irqdomain.c | 2
-rw-r--r--  kernel/kallsyms.c | 2
-rw-r--r--  kernel/kexec.c | 1291
-rw-r--r--  kernel/module.c | 19
-rw-r--r--  kernel/nsproxy.c | 15
-rw-r--r--  kernel/panic.c | 1
-rw-r--r--  kernel/params.c | 1
-rw-r--r--  kernel/power/Kconfig | 3
-rw-r--r--  kernel/power/main.c | 25
-rw-r--r--  kernel/power/power.h | 7
-rw-r--r--  kernel/power/snapshot.c | 515
-rw-r--r--  kernel/power/suspend.c | 152
-rw-r--r--  kernel/power/suspend_test.c | 12
-rw-r--r--  kernel/printk/printk.c | 169
-rw-r--r--  kernel/resource.c | 101
-rw-r--r--  kernel/sched/core.c | 7
-rw-r--r--  kernel/sched/idle.c | 4
-rw-r--r--  kernel/sched/proc.c | 7
-rw-r--r--  kernel/seccomp.c | 430
-rw-r--r--  kernel/signal.c | 46
-rw-r--r--  kernel/smp.c | 2
-rw-r--r--  kernel/sys.c | 4
-rw-r--r--  kernel/sys_ni.c | 5
-rw-r--r--  kernel/sysctl.c | 9
-rw-r--r--  kernel/sysctl_binary.c | 1
-rw-r--r--  kernel/system_keyring.c | 1
-rw-r--r--  kernel/test_kprobes.c | 87
-rw-r--r--  kernel/time/Kconfig | 9
-rw-r--r--  kernel/time/Makefile | 19
-rw-r--r--  kernel/time/alarmtimer.c | 2
-rw-r--r--  kernel/time/clocksource.c | 12
-rw-r--r--  kernel/time/hrtimer.c (renamed from kernel/hrtimer.c) | 125
-rw-r--r--  kernel/time/itimer.c (renamed from kernel/itimer.c) | 0
-rw-r--r--  kernel/time/ntp.c | 15
-rw-r--r--  kernel/time/ntp_internal.h | 2
-rw-r--r--  kernel/time/posix-cpu-timers.c (renamed from kernel/posix-cpu-timers.c) | 0
-rw-r--r--  kernel/time/posix-timers.c (renamed from kernel/posix-timers.c) | 2
-rw-r--r--  kernel/time/tick-internal.h | 2
-rw-r--r--  kernel/time/time.c (renamed from kernel/time.c) | 64
-rw-r--r--  kernel/time/timeconst.bc (renamed from kernel/timeconst.bc) | 0
-rw-r--r--  kernel/time/timekeeping.c | 1148
-rw-r--r--  kernel/time/timekeeping.h | 20
-rw-r--r--  kernel/time/timekeeping_debug.c | 2
-rw-r--r--  kernel/time/timekeeping_internal.h | 17
-rw-r--r--  kernel/time/timer.c (renamed from kernel/timer.c) | 34
-rw-r--r--  kernel/time/udelay_test.c | 168
-rw-r--r--  kernel/trace/ring_buffer.c | 31
-rw-r--r--  kernel/trace/trace.c | 11
-rw-r--r--  kernel/tsacct.c | 19
-rw-r--r--  kernel/user_namespace.c | 6
-rw-r--r--  kernel/utsname.c | 6
-rw-r--r--  kernel/watchdog.c | 11
67 files changed, 4265 insertions, 1717 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index f2a8b6246ce9..dc5c77544fd6 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -3,12 +3,11 @@
 #
 
 obj-y     = fork.o exec_domain.o panic.o \
-	    cpu.o exit.o itimer.o time.o softirq.o resource.o \
-	    sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \
+	    cpu.o exit.o softirq.o resource.o \
+	    sysctl.o sysctl_binary.o capability.o ptrace.o user.o \
 	    signal.o sys.o kmod.o workqueue.o pid.o task_work.o \
-	    extable.o params.o posix-timers.o \
-	    kthread.o sys_ni.o posix-cpu-timers.o \
-	    hrtimer.o nsproxy.o \
+	    extable.o params.o \
+	    kthread.o sys_ni.o nsproxy.o \
 	    notifier.o ksysfs.o cred.o reboot.o \
 	    async.o range.o groups.o smpboot.o
 
@@ -87,6 +86,7 @@ obj-$(CONFIG_RING_BUFFER) += trace/
 obj-$(CONFIG_TRACEPOINTS) += trace/
 obj-$(CONFIG_IRQ_WORK) += irq_work.o
 obj-$(CONFIG_CPU_PM) += cpu_pm.o
+obj-$(CONFIG_NET) += bpf/
 
 obj-$(CONFIG_PERF_EVENTS) += events/
 
@@ -105,27 +105,11 @@ targets += config_data.gz
 $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE
 	$(call if_changed,gzip)
 
-      filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;")
+      filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/basic/bin2c; echo "MAGIC_END;")
 targets += config_data.h
 $(obj)/config_data.h: $(obj)/config_data.gz FORCE
 	$(call filechk,ikconfiggz)
 
-$(obj)/time.o: $(obj)/timeconst.h
-
-quiet_cmd_hzfile = HZFILE $@
-      cmd_hzfile = echo "hz=$(CONFIG_HZ)" > $@
-
-targets += hz.bc
-$(obj)/hz.bc: $(objtree)/include/config/hz.h FORCE
-	$(call if_changed,hzfile)
-
-quiet_cmd_bc = BC $@
-      cmd_bc = bc -q $(filter-out FORCE,$^) > $@
-
-targets += timeconst.h
-$(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE
-	$(call if_changed,bc)
-
 ###############################################################################
 #
 # Roll all the X.509 certificates that we can find together and pull them into
diff --git a/kernel/acct.c b/kernel/acct.c
index 808a86ff229d..b4c667d22e79 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -59,6 +59,7 @@
59#include <asm/div64.h> 59#include <asm/div64.h>
60#include <linux/blkdev.h> /* sector_div */ 60#include <linux/blkdev.h> /* sector_div */
61#include <linux/pid_namespace.h> 61#include <linux/pid_namespace.h>
62#include <linux/fs_pin.h>
62 63
63/* 64/*
64 * These constants control the amount of freespace that suspend and 65 * These constants control the amount of freespace that suspend and
@@ -75,172 +76,190 @@ int acct_parm[3] = {4, 2, 30};
75/* 76/*
76 * External references and all of the globals. 77 * External references and all of the globals.
77 */ 78 */
78static void do_acct_process(struct bsd_acct_struct *acct, 79static void do_acct_process(struct bsd_acct_struct *acct);
79 struct pid_namespace *ns, struct file *);
80 80
81/*
82 * This structure is used so that all the data protected by lock
83 * can be placed in the same cache line as the lock. This primes
84 * the cache line to have the data after getting the lock.
85 */
86struct bsd_acct_struct { 81struct bsd_acct_struct {
82 struct fs_pin pin;
83 struct mutex lock;
87 int active; 84 int active;
88 unsigned long needcheck; 85 unsigned long needcheck;
89 struct file *file; 86 struct file *file;
90 struct pid_namespace *ns; 87 struct pid_namespace *ns;
91 struct list_head list; 88 struct work_struct work;
89 struct completion done;
92}; 90};
93 91
94static DEFINE_SPINLOCK(acct_lock);
95static LIST_HEAD(acct_list);
96
97/* 92/*
98 * Check the amount of free space and suspend/resume accordingly. 93 * Check the amount of free space and suspend/resume accordingly.
99 */ 94 */
100static int check_free_space(struct bsd_acct_struct *acct, struct file *file) 95static int check_free_space(struct bsd_acct_struct *acct)
101{ 96{
102 struct kstatfs sbuf; 97 struct kstatfs sbuf;
103 int res; 98
104 int act; 99 if (time_is_before_jiffies(acct->needcheck))
105 u64 resume;
106 u64 suspend;
107
108 spin_lock(&acct_lock);
109 res = acct->active;
110 if (!file || time_is_before_jiffies(acct->needcheck))
111 goto out; 100 goto out;
112 spin_unlock(&acct_lock);
113 101
114 /* May block */ 102 /* May block */
115 if (vfs_statfs(&file->f_path, &sbuf)) 103 if (vfs_statfs(&acct->file->f_path, &sbuf))
116 return res;
117 suspend = sbuf.f_blocks * SUSPEND;
118 resume = sbuf.f_blocks * RESUME;
119
120 do_div(suspend, 100);
121 do_div(resume, 100);
122
123 if (sbuf.f_bavail <= suspend)
124 act = -1;
125 else if (sbuf.f_bavail >= resume)
126 act = 1;
127 else
128 act = 0;
129
130 /*
131 * If some joker switched acct->file under us we'ld better be
132 * silent and _not_ touch anything.
133 */
134 spin_lock(&acct_lock);
135 if (file != acct->file) {
136 if (act)
137 res = act > 0;
138 goto out; 104 goto out;
139 }
140 105
141 if (acct->active) { 106 if (acct->active) {
142 if (act < 0) { 107 u64 suspend = sbuf.f_blocks * SUSPEND;
108 do_div(suspend, 100);
109 if (sbuf.f_bavail <= suspend) {
143 acct->active = 0; 110 acct->active = 0;
144 printk(KERN_INFO "Process accounting paused\n"); 111 pr_info("Process accounting paused\n");
145 } 112 }
146 } else { 113 } else {
147 if (act > 0) { 114 u64 resume = sbuf.f_blocks * RESUME;
115 do_div(resume, 100);
116 if (sbuf.f_bavail >= resume) {
148 acct->active = 1; 117 acct->active = 1;
149 printk(KERN_INFO "Process accounting resumed\n"); 118 pr_info("Process accounting resumed\n");
150 } 119 }
151 } 120 }
152 121
153 acct->needcheck = jiffies + ACCT_TIMEOUT*HZ; 122 acct->needcheck = jiffies + ACCT_TIMEOUT*HZ;
154 res = acct->active;
155out: 123out:
156 spin_unlock(&acct_lock); 124 return acct->active;
125}
126
127static struct bsd_acct_struct *acct_get(struct pid_namespace *ns)
128{
129 struct bsd_acct_struct *res;
130again:
131 smp_rmb();
132 rcu_read_lock();
133 res = ACCESS_ONCE(ns->bacct);
134 if (!res) {
135 rcu_read_unlock();
136 return NULL;
137 }
138 if (!atomic_long_inc_not_zero(&res->pin.count)) {
139 rcu_read_unlock();
140 cpu_relax();
141 goto again;
142 }
143 rcu_read_unlock();
144 mutex_lock(&res->lock);
145 if (!res->ns) {
146 mutex_unlock(&res->lock);
147 pin_put(&res->pin);
148 goto again;
149 }
157 return res; 150 return res;
158} 151}
159 152
160/* 153static void close_work(struct work_struct *work)
161 * Close the old accounting file (if currently open) and then replace 154{
162 * it with file (if non-NULL). 155 struct bsd_acct_struct *acct = container_of(work, struct bsd_acct_struct, work);
163 * 156 struct file *file = acct->file;
164 * NOTE: acct_lock MUST be held on entry and exit. 157 if (file->f_op->flush)
165 */ 158 file->f_op->flush(file, NULL);
166static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file, 159 __fput_sync(file);
167 struct pid_namespace *ns) 160 complete(&acct->done);
161}
162
163static void acct_kill(struct bsd_acct_struct *acct,
164 struct bsd_acct_struct *new)
168{ 165{
169 struct file *old_acct = NULL; 166 if (acct) {
170 struct pid_namespace *old_ns = NULL; 167 struct pid_namespace *ns = acct->ns;
171 168 do_acct_process(acct);
172 if (acct->file) { 169 INIT_WORK(&acct->work, close_work);
173 old_acct = acct->file; 170 init_completion(&acct->done);
174 old_ns = acct->ns; 171 schedule_work(&acct->work);
175 acct->active = 0; 172 wait_for_completion(&acct->done);
176 acct->file = NULL; 173 pin_remove(&acct->pin);
174 ns->bacct = new;
177 acct->ns = NULL; 175 acct->ns = NULL;
178 list_del(&acct->list); 176 atomic_long_dec(&acct->pin.count);
177 mutex_unlock(&acct->lock);
178 pin_put(&acct->pin);
179 } 179 }
180 if (file) { 180}
181 acct->file = file; 181
182 acct->ns = ns; 182static void acct_pin_kill(struct fs_pin *pin)
183 acct->needcheck = jiffies + ACCT_TIMEOUT*HZ; 183{
184 acct->active = 1; 184 struct bsd_acct_struct *acct;
185 list_add(&acct->list, &acct_list); 185 acct = container_of(pin, struct bsd_acct_struct, pin);
186 } 186 mutex_lock(&acct->lock);
187 if (old_acct) { 187 if (!acct->ns) {
188 mnt_unpin(old_acct->f_path.mnt); 188 mutex_unlock(&acct->lock);
189 spin_unlock(&acct_lock); 189 pin_put(pin);
190 do_acct_process(acct, old_ns, old_acct); 190 acct = NULL;
191 filp_close(old_acct, NULL);
192 spin_lock(&acct_lock);
193 } 191 }
192 acct_kill(acct, NULL);
194} 193}
195 194
196static int acct_on(struct filename *pathname) 195static int acct_on(struct filename *pathname)
197{ 196{
198 struct file *file; 197 struct file *file;
199 struct vfsmount *mnt; 198 struct vfsmount *mnt, *internal;
200 struct pid_namespace *ns; 199 struct pid_namespace *ns = task_active_pid_ns(current);
201 struct bsd_acct_struct *acct = NULL; 200 struct bsd_acct_struct *acct, *old;
201 int err;
202
203 acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL);
204 if (!acct)
205 return -ENOMEM;
202 206
203 /* Difference from BSD - they don't do O_APPEND */ 207 /* Difference from BSD - they don't do O_APPEND */
204 file = file_open_name(pathname, O_WRONLY|O_APPEND|O_LARGEFILE, 0); 208 file = file_open_name(pathname, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
205 if (IS_ERR(file)) 209 if (IS_ERR(file)) {
210 kfree(acct);
206 return PTR_ERR(file); 211 return PTR_ERR(file);
212 }
207 213
208 if (!S_ISREG(file_inode(file)->i_mode)) { 214 if (!S_ISREG(file_inode(file)->i_mode)) {
215 kfree(acct);
209 filp_close(file, NULL); 216 filp_close(file, NULL);
210 return -EACCES; 217 return -EACCES;
211 } 218 }
212 219
213 if (!file->f_op->write) { 220 if (!file->f_op->write) {
221 kfree(acct);
214 filp_close(file, NULL); 222 filp_close(file, NULL);
215 return -EIO; 223 return -EIO;
216 } 224 }
217 225 internal = mnt_clone_internal(&file->f_path);
218 ns = task_active_pid_ns(current); 226 if (IS_ERR(internal)) {
219 if (ns->bacct == NULL) { 227 kfree(acct);
220 acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL); 228 filp_close(file, NULL);
221 if (acct == NULL) { 229 return PTR_ERR(internal);
222 filp_close(file, NULL);
223 return -ENOMEM;
224 }
225 } 230 }
226 231 err = mnt_want_write(internal);
227 spin_lock(&acct_lock); 232 if (err) {
228 if (ns->bacct == NULL) { 233 mntput(internal);
229 ns->bacct = acct; 234 kfree(acct);
230 acct = NULL; 235 filp_close(file, NULL);
236 return err;
231 } 237 }
232
233 mnt = file->f_path.mnt; 238 mnt = file->f_path.mnt;
234 mnt_pin(mnt); 239 file->f_path.mnt = internal;
235 acct_file_reopen(ns->bacct, file, ns); 240
236 spin_unlock(&acct_lock); 241 atomic_long_set(&acct->pin.count, 1);
237 242 acct->pin.kill = acct_pin_kill;
238 mntput(mnt); /* it's pinned, now give up active reference */ 243 acct->file = file;
239 kfree(acct); 244 acct->needcheck = jiffies;
240 245 acct->ns = ns;
246 mutex_init(&acct->lock);
247 mutex_lock_nested(&acct->lock, 1); /* nobody has seen it yet */
248 pin_insert(&acct->pin, mnt);
249
250 old = acct_get(ns);
251 if (old)
252 acct_kill(old, acct);
253 else
254 ns->bacct = acct;
255 mutex_unlock(&acct->lock);
256 mnt_drop_write(mnt);
257 mntput(mnt);
241 return 0; 258 return 0;
242} 259}
243 260
261static DEFINE_MUTEX(acct_on_mutex);
262
244/** 263/**
245 * sys_acct - enable/disable process accounting 264 * sys_acct - enable/disable process accounting
246 * @name: file name for accounting records or NULL to shutdown accounting 265 * @name: file name for accounting records or NULL to shutdown accounting
@@ -261,80 +280,23 @@ SYSCALL_DEFINE1(acct, const char __user *, name)
261 280
262 if (name) { 281 if (name) {
263 struct filename *tmp = getname(name); 282 struct filename *tmp = getname(name);
283
264 if (IS_ERR(tmp)) 284 if (IS_ERR(tmp))
265 return PTR_ERR(tmp); 285 return PTR_ERR(tmp);
286 mutex_lock(&acct_on_mutex);
266 error = acct_on(tmp); 287 error = acct_on(tmp);
288 mutex_unlock(&acct_on_mutex);
267 putname(tmp); 289 putname(tmp);
268 } else { 290 } else {
269 struct bsd_acct_struct *acct; 291 acct_kill(acct_get(task_active_pid_ns(current)), NULL);
270
271 acct = task_active_pid_ns(current)->bacct;
272 if (acct == NULL)
273 return 0;
274
275 spin_lock(&acct_lock);
276 acct_file_reopen(acct, NULL, NULL);
277 spin_unlock(&acct_lock);
278 } 292 }
279 293
280 return error; 294 return error;
281} 295}
282 296
283/**
284 * acct_auto_close - turn off a filesystem's accounting if it is on
285 * @m: vfsmount being shut down
286 *
287 * If the accounting is turned on for a file in the subtree pointed to
288 * to by m, turn accounting off. Done when m is about to die.
289 */
290void acct_auto_close_mnt(struct vfsmount *m)
291{
292 struct bsd_acct_struct *acct;
293
294 spin_lock(&acct_lock);
295restart:
296 list_for_each_entry(acct, &acct_list, list)
297 if (acct->file && acct->file->f_path.mnt == m) {
298 acct_file_reopen(acct, NULL, NULL);
299 goto restart;
300 }
301 spin_unlock(&acct_lock);
302}
303
304/**
305 * acct_auto_close - turn off a filesystem's accounting if it is on
306 * @sb: super block for the filesystem
307 *
308 * If the accounting is turned on for a file in the filesystem pointed
309 * to by sb, turn accounting off.
310 */
311void acct_auto_close(struct super_block *sb)
312{
313 struct bsd_acct_struct *acct;
314
315 spin_lock(&acct_lock);
316restart:
317 list_for_each_entry(acct, &acct_list, list)
318 if (acct->file && acct->file->f_path.dentry->d_sb == sb) {
319 acct_file_reopen(acct, NULL, NULL);
320 goto restart;
321 }
322 spin_unlock(&acct_lock);
323}
324
325void acct_exit_ns(struct pid_namespace *ns) 297void acct_exit_ns(struct pid_namespace *ns)
326{ 298{
327 struct bsd_acct_struct *acct = ns->bacct; 299 acct_kill(acct_get(ns), NULL);
328
329 if (acct == NULL)
330 return;
331
332 spin_lock(&acct_lock);
333 if (acct->file != NULL)
334 acct_file_reopen(acct, NULL, NULL);
335 spin_unlock(&acct_lock);
336
337 kfree(acct);
338} 300}
339 301
340/* 302/*
@@ -376,7 +338,7 @@ static comp_t encode_comp_t(unsigned long value)
376 return exp; 338 return exp;
377} 339}
378 340
379#if ACCT_VERSION==1 || ACCT_VERSION==2 341#if ACCT_VERSION == 1 || ACCT_VERSION == 2
380/* 342/*
381 * encode an u64 into a comp2_t (24 bits) 343 * encode an u64 into a comp2_t (24 bits)
382 * 344 *
@@ -389,7 +351,7 @@ static comp_t encode_comp_t(unsigned long value)
389#define MANTSIZE2 20 /* 20 bit mantissa. */ 351#define MANTSIZE2 20 /* 20 bit mantissa. */
390#define EXPSIZE2 5 /* 5 bit base 2 exponent. */ 352#define EXPSIZE2 5 /* 5 bit base 2 exponent. */
391#define MAXFRACT2 ((1ul << MANTSIZE2) - 1) /* Maximum fractional value. */ 353#define MAXFRACT2 ((1ul << MANTSIZE2) - 1) /* Maximum fractional value. */
392#define MAXEXP2 ((1 <<EXPSIZE2) - 1) /* Maximum exponent. */ 354#define MAXEXP2 ((1 << EXPSIZE2) - 1) /* Maximum exponent. */
393 355
394static comp2_t encode_comp2_t(u64 value) 356static comp2_t encode_comp2_t(u64 value)
395{ 357{
@@ -420,7 +382,7 @@ static comp2_t encode_comp2_t(u64 value)
420} 382}
421#endif 383#endif
422 384
423#if ACCT_VERSION==3 385#if ACCT_VERSION == 3
424/* 386/*
425 * encode an u64 into a 32 bit IEEE float 387 * encode an u64 into a 32 bit IEEE float
426 */ 388 */
@@ -429,8 +391,9 @@ static u32 encode_float(u64 value)
429 unsigned exp = 190; 391 unsigned exp = 190;
430 unsigned u; 392 unsigned u;
431 393
432 if (value==0) return 0; 394 if (value == 0)
433 while ((s64)value > 0){ 395 return 0;
396 while ((s64)value > 0) {
434 value <<= 1; 397 value <<= 1;
435 exp--; 398 exp--;
436 } 399 }
@@ -448,120 +411,112 @@ static u32 encode_float(u64 value)
448 * do_exit() or when switching to a different output file. 411 * do_exit() or when switching to a different output file.
449 */ 412 */
450 413
451/* 414static void fill_ac(acct_t *ac)
452 * do_acct_process does all actual work. Caller holds the reference to file.
453 */
454static void do_acct_process(struct bsd_acct_struct *acct,
455 struct pid_namespace *ns, struct file *file)
456{ 415{
457 struct pacct_struct *pacct = &current->signal->pacct; 416 struct pacct_struct *pacct = &current->signal->pacct;
458 acct_t ac; 417 u64 elapsed, run_time;
459 mm_segment_t fs;
460 unsigned long flim;
461 u64 elapsed;
462 u64 run_time;
463 struct timespec uptime;
464 struct tty_struct *tty; 418 struct tty_struct *tty;
465 const struct cred *orig_cred;
466
467 /* Perform file operations on behalf of whoever enabled accounting */
468 orig_cred = override_creds(file->f_cred);
469
470 /*
471 * First check to see if there is enough free_space to continue
472 * the process accounting system.
473 */
474 if (!check_free_space(acct, file))
475 goto out;
476 419
477 /* 420 /*
478 * Fill the accounting struct with the needed info as recorded 421 * Fill the accounting struct with the needed info as recorded
479 * by the different kernel functions. 422 * by the different kernel functions.
480 */ 423 */
481 memset(&ac, 0, sizeof(acct_t)); 424 memset(ac, 0, sizeof(acct_t));
482 425
483 ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER; 426 ac->ac_version = ACCT_VERSION | ACCT_BYTEORDER;
484 strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm)); 427 strlcpy(ac->ac_comm, current->comm, sizeof(ac->ac_comm));
485 428
486 /* calculate run_time in nsec*/ 429 /* calculate run_time in nsec*/
487 do_posix_clock_monotonic_gettime(&uptime); 430 run_time = ktime_get_ns();
488 run_time = (u64)uptime.tv_sec*NSEC_PER_SEC + uptime.tv_nsec; 431 run_time -= current->group_leader->start_time;
489 run_time -= (u64)current->group_leader->start_time.tv_sec * NSEC_PER_SEC
490 + current->group_leader->start_time.tv_nsec;
491 /* convert nsec -> AHZ */ 432 /* convert nsec -> AHZ */
492 elapsed = nsec_to_AHZ(run_time); 433 elapsed = nsec_to_AHZ(run_time);
493#if ACCT_VERSION==3 434#if ACCT_VERSION == 3
494 ac.ac_etime = encode_float(elapsed); 435 ac->ac_etime = encode_float(elapsed);
495#else 436#else
496 ac.ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ? 437 ac->ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ?
497 (unsigned long) elapsed : (unsigned long) -1l); 438 (unsigned long) elapsed : (unsigned long) -1l);
498#endif 439#endif
499#if ACCT_VERSION==1 || ACCT_VERSION==2 440#if ACCT_VERSION == 1 || ACCT_VERSION == 2
500 { 441 {
501 /* new enlarged etime field */ 442 /* new enlarged etime field */
502 comp2_t etime = encode_comp2_t(elapsed); 443 comp2_t etime = encode_comp2_t(elapsed);
503 ac.ac_etime_hi = etime >> 16; 444
504 ac.ac_etime_lo = (u16) etime; 445 ac->ac_etime_hi = etime >> 16;
446 ac->ac_etime_lo = (u16) etime;
505 } 447 }
506#endif 448#endif
507 do_div(elapsed, AHZ); 449 do_div(elapsed, AHZ);
508 ac.ac_btime = get_seconds() - elapsed; 450 ac->ac_btime = get_seconds() - elapsed;
451#if ACCT_VERSION==2
452 ac->ac_ahz = AHZ;
453#endif
454
455 spin_lock_irq(&current->sighand->siglock);
456 tty = current->signal->tty; /* Safe as we hold the siglock */
457 ac->ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0;
458 ac->ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
459 ac->ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
460 ac->ac_flag = pacct->ac_flag;
461 ac->ac_mem = encode_comp_t(pacct->ac_mem);
462 ac->ac_minflt = encode_comp_t(pacct->ac_minflt);
463 ac->ac_majflt = encode_comp_t(pacct->ac_majflt);
464 ac->ac_exitcode = pacct->ac_exitcode;
465 spin_unlock_irq(&current->sighand->siglock);
466}
467/*
468 * do_acct_process does all actual work. Caller holds the reference to file.
469 */
470static void do_acct_process(struct bsd_acct_struct *acct)
471{
472 acct_t ac;
473 unsigned long flim;
474 const struct cred *orig_cred;
475 struct pid_namespace *ns = acct->ns;
476 struct file *file = acct->file;
477
478 /*
479 * Accounting records are not subject to resource limits.
480 */
481 flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
482 current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
483 /* Perform file operations on behalf of whoever enabled accounting */
484 orig_cred = override_creds(file->f_cred);
485
486 /*
487 * First check to see if there is enough free_space to continue
488 * the process accounting system.
489 */
490 if (!check_free_space(acct))
491 goto out;
492
493 fill_ac(&ac);
509 /* we really need to bite the bullet and change layout */ 494 /* we really need to bite the bullet and change layout */
510 ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid); 495 ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid);
511 ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid); 496 ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid);
512#if ACCT_VERSION==2 497#if ACCT_VERSION == 1 || ACCT_VERSION == 2
513 ac.ac_ahz = AHZ;
514#endif
515#if ACCT_VERSION==1 || ACCT_VERSION==2
516 /* backward-compatible 16 bit fields */ 498 /* backward-compatible 16 bit fields */
517 ac.ac_uid16 = ac.ac_uid; 499 ac.ac_uid16 = ac.ac_uid;
518 ac.ac_gid16 = ac.ac_gid; 500 ac.ac_gid16 = ac.ac_gid;
519#endif 501#endif
520#if ACCT_VERSION==3 502#if ACCT_VERSION == 3
521 ac.ac_pid = task_tgid_nr_ns(current, ns); 503 ac.ac_pid = task_tgid_nr_ns(current, ns);
522 rcu_read_lock(); 504 rcu_read_lock();
523 ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns); 505 ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns);
524 rcu_read_unlock(); 506 rcu_read_unlock();
525#endif 507#endif
526
527 spin_lock_irq(&current->sighand->siglock);
528 tty = current->signal->tty; /* Safe as we hold the siglock */
529 ac.ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0;
530 ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
531 ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
532 ac.ac_flag = pacct->ac_flag;
533 ac.ac_mem = encode_comp_t(pacct->ac_mem);
534 ac.ac_minflt = encode_comp_t(pacct->ac_minflt);
535 ac.ac_majflt = encode_comp_t(pacct->ac_majflt);
536 ac.ac_exitcode = pacct->ac_exitcode;
537 spin_unlock_irq(&current->sighand->siglock);
538 ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */
539 ac.ac_rw = encode_comp_t(ac.ac_io / 1024);
540 ac.ac_swaps = encode_comp_t(0);
541
542 /* 508 /*
543 * Get freeze protection. If the fs is frozen, just skip the write 509 * Get freeze protection. If the fs is frozen, just skip the write
544 * as we could deadlock the system otherwise. 510 * as we could deadlock the system otherwise.
545 */ 511 */
546 if (!file_start_write_trylock(file)) 512 if (file_start_write_trylock(file)) {
547 goto out; 513 /* it's been opened O_APPEND, so position is irrelevant */
548 /* 514 loff_t pos = 0;
549 * Kernel segment override to datasegment and write it 515 __kernel_write(file, (char *)&ac, sizeof(acct_t), &pos);
550 * to the accounting file. 516 file_end_write(file);
551 */ 517 }
552 fs = get_fs();
553 set_fs(KERNEL_DS);
554 /*
555 * Accounting records are not subject to resource limits.
556 */
557 flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
558 current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
559 file->f_op->write(file, (char *)&ac,
560 sizeof(acct_t), &file->f_pos);
561 current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
562 set_fs(fs);
563 file_end_write(file);
564out: 518out:
519 current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
565 revert_creds(orig_cred); 520 revert_creds(orig_cred);
566} 521}
567 522
@@ -578,6 +533,7 @@ void acct_collect(long exitcode, int group_dead)
578 533
579 if (group_dead && current->mm) { 534 if (group_dead && current->mm) {
580 struct vm_area_struct *vma; 535 struct vm_area_struct *vma;
536
581 down_read(&current->mm->mmap_sem); 537 down_read(&current->mm->mmap_sem);
582 vma = current->mm->mmap; 538 vma = current->mm->mmap;
583 while (vma) { 539 while (vma) {
@@ -609,34 +565,20 @@ void acct_collect(long exitcode, int group_dead)
609 spin_unlock_irq(&current->sighand->siglock); 565 spin_unlock_irq(&current->sighand->siglock);
610} 566}
611 567
612static void acct_process_in_ns(struct pid_namespace *ns) 568static void slow_acct_process(struct pid_namespace *ns)
613{ 569{
614 struct file *file = NULL; 570 for ( ; ns; ns = ns->parent) {
615 struct bsd_acct_struct *acct; 571 struct bsd_acct_struct *acct = acct_get(ns);
616 572 if (acct) {
617 acct = ns->bacct; 573 do_acct_process(acct);
618 /* 574 mutex_unlock(&acct->lock);
619 * accelerate the common fastpath: 575 pin_put(&acct->pin);
620 */ 576 }
621 if (!acct || !acct->file)
622 return;
623
624 spin_lock(&acct_lock);
625 file = acct->file;
626 if (unlikely(!file)) {
627 spin_unlock(&acct_lock);
628 return;
629 } 577 }
630 get_file(file);
631 spin_unlock(&acct_lock);
632
633 do_acct_process(acct, ns, file);
634 fput(file);
635} 578}
636 579
637/** 580/**
638 * acct_process - now just a wrapper around acct_process_in_ns, 581 * acct_process
639 * which in turn is a wrapper around do_acct_process.
640 * 582 *
641 * handles process accounting for an exiting task 583 * handles process accounting for an exiting task
642 */ 584 */
@@ -649,6 +591,10 @@ void acct_process(void)
649 * alive and holds its namespace, which in turn holds 591 * alive and holds its namespace, which in turn holds
650 * its parent. 592 * its parent.
651 */ 593 */
652 for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent) 594 for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent) {
653 acct_process_in_ns(ns); 595 if (ns->bacct)
596 break;
597 }
598 if (unlikely(ns))
599 slow_acct_process(ns);
654} 600}
diff --git a/kernel/audit.c b/kernel/audit.c
index 3ef2e0e797e8..ba2ff5a5c600 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1677,7 +1677,7 @@ void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap)
 	audit_log_format(ab, " %s=", prefix);
 	CAP_FOR_EACH_U32(i) {
 		audit_log_format(ab, "%08x",
-				 cap->cap[(_KERNEL_CAPABILITY_U32S-1) - i]);
+				 cap->cap[CAP_LAST_U32 - i]);
 	}
 }
 
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 8e9bc9c3dbb7..c447cd9848d1 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -106,7 +106,7 @@ static inline struct audit_entry *audit_init_entry(u32 field_count)
 	if (unlikely(!entry))
 		return NULL;
 
-	fields = kzalloc(sizeof(*fields) * field_count, GFP_KERNEL);
+	fields = kcalloc(field_count, sizeof(*fields), GFP_KERNEL);
 	if (unlikely(!fields)) {
 		kfree(entry);
 		return NULL;
@@ -160,7 +160,7 @@ static __u32 *classes[AUDIT_SYSCALL_CLASSES];
 
 int __init audit_register_class(int class, unsigned *list)
 {
-	__u32 *p = kzalloc(AUDIT_BITMASK_SIZE * sizeof(__u32), GFP_KERNEL);
+	__u32 *p = kcalloc(AUDIT_BITMASK_SIZE, sizeof(__u32), GFP_KERNEL);
 	if (!p)
 		return -ENOMEM;
 	while (*list != ~0U) {
diff --git a/kernel/bounds.c b/kernel/bounds.c
index 9fd4246b04b8..e1d1d1952bfa 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -9,7 +9,6 @@
 #include <linux/page-flags.h>
 #include <linux/mmzone.h>
 #include <linux/kbuild.h>
-#include <linux/page_cgroup.h>
 #include <linux/log2.h>
 #include <linux/spinlock_types.h>
 
@@ -18,7 +17,6 @@ void foo(void)
 	/* The enum constants to put into include/generated/bounds.h */
 	DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS);
 	DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES);
-	DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS);
 #ifdef CONFIG_SMP
 	DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
 #endif
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
new file mode 100644
index 000000000000..6a71145e2769
--- /dev/null
+++ b/kernel/bpf/Makefile
@@ -0,0 +1 @@
obj-y := core.o
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
new file mode 100644
index 000000000000..7f0dbcbb34af
--- /dev/null
+++ b/kernel/bpf/core.c
@@ -0,0 +1,534 @@
1/*
2 * Linux Socket Filter - Kernel level socket filtering
3 *
4 * Based on the design of the Berkeley Packet Filter. The new
5 * internal format has been designed by PLUMgrid:
6 *
7 * Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com
8 *
9 * Authors:
10 *
11 * Jay Schulist <jschlst@samba.org>
12 * Alexei Starovoitov <ast@plumgrid.com>
13 * Daniel Borkmann <dborkman@redhat.com>
14 *
15 * This program is free software; you can redistribute it and/or
16 * modify it under the terms of the GNU General Public License
17 * as published by the Free Software Foundation; either version
18 * 2 of the License, or (at your option) any later version.
19 *
20 * Andi Kleen - Fix a few bad bugs and races.
21 * Kris Katterjohn - Added many additional checks in bpf_check_classic()
22 */
23#include <linux/filter.h>
24#include <linux/skbuff.h>
25#include <asm/unaligned.h>
26
27/* Registers */
28#define BPF_R0 regs[BPF_REG_0]
29#define BPF_R1 regs[BPF_REG_1]
30#define BPF_R2 regs[BPF_REG_2]
31#define BPF_R3 regs[BPF_REG_3]
32#define BPF_R4 regs[BPF_REG_4]
33#define BPF_R5 regs[BPF_REG_5]
34#define BPF_R6 regs[BPF_REG_6]
35#define BPF_R7 regs[BPF_REG_7]
36#define BPF_R8 regs[BPF_REG_8]
37#define BPF_R9 regs[BPF_REG_9]
38#define BPF_R10 regs[BPF_REG_10]
39
40/* Named registers */
41#define DST regs[insn->dst_reg]
42#define SRC regs[insn->src_reg]
43#define FP regs[BPF_REG_FP]
44#define ARG1 regs[BPF_REG_ARG1]
45#define CTX regs[BPF_REG_CTX]
46#define IMM insn->imm
47
48/* No hurry in this branch
49 *
50 * Exported for the bpf jit load helper.
51 */
52void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, unsigned int size)
53{
54 u8 *ptr = NULL;
55
56 if (k >= SKF_NET_OFF)
57 ptr = skb_network_header(skb) + k - SKF_NET_OFF;
58 else if (k >= SKF_LL_OFF)
59 ptr = skb_mac_header(skb) + k - SKF_LL_OFF;
60 if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb))
61 return ptr;
62
63 return NULL;
64}
65
66/* Base function for offset calculation. Needs to go into .text section,
67 * therefore keeping it non-static as well; will also be used by JITs
68 * anyway later on, so do not let the compiler omit it.
69 */
70noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
71{
72 return 0;
73}
74
75/**
76 * __bpf_prog_run - run eBPF program on a given context
77 * @ctx: is the data we are operating on
78 * @insn: is the array of eBPF instructions
79 *
80 * Decode and execute eBPF instructions.
81 */
82static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
83{
84 u64 stack[MAX_BPF_STACK / sizeof(u64)];
85 u64 regs[MAX_BPF_REG], tmp;
86 static const void *jumptable[256] = {
87 [0 ... 255] = &&default_label,
88 /* Now overwrite non-defaults ... */
89 /* 32 bit ALU operations */
90 [BPF_ALU | BPF_ADD | BPF_X] = &&ALU_ADD_X,
91 [BPF_ALU | BPF_ADD | BPF_K] = &&ALU_ADD_K,
92 [BPF_ALU | BPF_SUB | BPF_X] = &&ALU_SUB_X,
93 [BPF_ALU | BPF_SUB | BPF_K] = &&ALU_SUB_K,
94 [BPF_ALU | BPF_AND | BPF_X] = &&ALU_AND_X,
95 [BPF_ALU | BPF_AND | BPF_K] = &&ALU_AND_K,
96 [BPF_ALU | BPF_OR | BPF_X] = &&ALU_OR_X,
97 [BPF_ALU | BPF_OR | BPF_K] = &&ALU_OR_K,
98 [BPF_ALU | BPF_LSH | BPF_X] = &&ALU_LSH_X,
99 [BPF_ALU | BPF_LSH | BPF_K] = &&ALU_LSH_K,
100 [BPF_ALU | BPF_RSH | BPF_X] = &&ALU_RSH_X,
101 [BPF_ALU | BPF_RSH | BPF_K] = &&ALU_RSH_K,
102 [BPF_ALU | BPF_XOR | BPF_X] = &&ALU_XOR_X,
103 [BPF_ALU | BPF_XOR | BPF_K] = &&ALU_XOR_K,
104 [BPF_ALU | BPF_MUL | BPF_X] = &&ALU_MUL_X,
105 [BPF_ALU | BPF_MUL | BPF_K] = &&ALU_MUL_K,
106 [BPF_ALU | BPF_MOV | BPF_X] = &&ALU_MOV_X,
107 [BPF_ALU | BPF_MOV | BPF_K] = &&ALU_MOV_K,
108 [BPF_ALU | BPF_DIV | BPF_X] = &&ALU_DIV_X,
109 [BPF_ALU | BPF_DIV | BPF_K] = &&ALU_DIV_K,
110 [BPF_ALU | BPF_MOD | BPF_X] = &&ALU_MOD_X,
111 [BPF_ALU | BPF_MOD | BPF_K] = &&ALU_MOD_K,
112 [BPF_ALU | BPF_NEG] = &&ALU_NEG,
113 [BPF_ALU | BPF_END | BPF_TO_BE] = &&ALU_END_TO_BE,
114 [BPF_ALU | BPF_END | BPF_TO_LE] = &&ALU_END_TO_LE,
115 /* 64 bit ALU operations */
116 [BPF_ALU64 | BPF_ADD | BPF_X] = &&ALU64_ADD_X,
117 [BPF_ALU64 | BPF_ADD | BPF_K] = &&ALU64_ADD_K,
118 [BPF_ALU64 | BPF_SUB | BPF_X] = &&ALU64_SUB_X,
119 [BPF_ALU64 | BPF_SUB | BPF_K] = &&ALU64_SUB_K,
120 [BPF_ALU64 | BPF_AND | BPF_X] = &&ALU64_AND_X,
121 [BPF_ALU64 | BPF_AND | BPF_K] = &&ALU64_AND_K,
122 [BPF_ALU64 | BPF_OR | BPF_X] = &&ALU64_OR_X,
123 [BPF_ALU64 | BPF_OR | BPF_K] = &&ALU64_OR_K,
124 [BPF_ALU64 | BPF_LSH | BPF_X] = &&ALU64_LSH_X,
125 [BPF_ALU64 | BPF_LSH | BPF_K] = &&ALU64_LSH_K,
126 [BPF_ALU64 | BPF_RSH | BPF_X] = &&ALU64_RSH_X,
127 [BPF_ALU64 | BPF_RSH | BPF_K] = &&ALU64_RSH_K,
128 [BPF_ALU64 | BPF_XOR | BPF_X] = &&ALU64_XOR_X,
129 [BPF_ALU64 | BPF_XOR | BPF_K] = &&ALU64_XOR_K,
130 [BPF_ALU64 | BPF_MUL | BPF_X] = &&ALU64_MUL_X,
131 [BPF_ALU64 | BPF_MUL | BPF_K] = &&ALU64_MUL_K,
132 [BPF_ALU64 | BPF_MOV | BPF_X] = &&ALU64_MOV_X,
133 [BPF_ALU64 | BPF_MOV | BPF_K] = &&ALU64_MOV_K,
134 [BPF_ALU64 | BPF_ARSH | BPF_X] = &&ALU64_ARSH_X,
135 [BPF_ALU64 | BPF_ARSH | BPF_K] = &&ALU64_ARSH_K,
136 [BPF_ALU64 | BPF_DIV | BPF_X] = &&ALU64_DIV_X,
137 [BPF_ALU64 | BPF_DIV | BPF_K] = &&ALU64_DIV_K,
138 [BPF_ALU64 | BPF_MOD | BPF_X] = &&ALU64_MOD_X,
139 [BPF_ALU64 | BPF_MOD | BPF_K] = &&ALU64_MOD_K,
140 [BPF_ALU64 | BPF_NEG] = &&ALU64_NEG,
141 /* Call instruction */
142 [BPF_JMP | BPF_CALL] = &&JMP_CALL,
143 /* Jumps */
144 [BPF_JMP | BPF_JA] = &&JMP_JA,
145 [BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X,
146 [BPF_JMP | BPF_JEQ | BPF_K] = &&JMP_JEQ_K,
147 [BPF_JMP | BPF_JNE | BPF_X] = &&JMP_JNE_X,
148 [BPF_JMP | BPF_JNE | BPF_K] = &&JMP_JNE_K,
149 [BPF_JMP | BPF_JGT | BPF_X] = &&JMP_JGT_X,
150 [BPF_JMP | BPF_JGT | BPF_K] = &&JMP_JGT_K,
151 [BPF_JMP | BPF_JGE | BPF_X] = &&JMP_JGE_X,
152 [BPF_JMP | BPF_JGE | BPF_K] = &&JMP_JGE_K,
153 [BPF_JMP | BPF_JSGT | BPF_X] = &&JMP_JSGT_X,
154 [BPF_JMP | BPF_JSGT | BPF_K] = &&JMP_JSGT_K,
155 [BPF_JMP | BPF_JSGE | BPF_X] = &&JMP_JSGE_X,
156 [BPF_JMP | BPF_JSGE | BPF_K] = &&JMP_JSGE_K,
157 [BPF_JMP | BPF_JSET | BPF_X] = &&JMP_JSET_X,
158 [BPF_JMP | BPF_JSET | BPF_K] = &&JMP_JSET_K,
159 /* Program return */
160 [BPF_JMP | BPF_EXIT] = &&JMP_EXIT,
161 /* Store instructions */
162 [BPF_STX | BPF_MEM | BPF_B] = &&STX_MEM_B,
163 [BPF_STX | BPF_MEM | BPF_H] = &&STX_MEM_H,
164 [BPF_STX | BPF_MEM | BPF_W] = &&STX_MEM_W,
165 [BPF_STX | BPF_MEM | BPF_DW] = &&STX_MEM_DW,
166 [BPF_STX | BPF_XADD | BPF_W] = &&STX_XADD_W,
167 [BPF_STX | BPF_XADD | BPF_DW] = &&STX_XADD_DW,
168 [BPF_ST | BPF_MEM | BPF_B] = &&ST_MEM_B,
169 [BPF_ST | BPF_MEM | BPF_H] = &&ST_MEM_H,
170 [BPF_ST | BPF_MEM | BPF_W] = &&ST_MEM_W,
171 [BPF_ST | BPF_MEM | BPF_DW] = &&ST_MEM_DW,
172 /* Load instructions */
173 [BPF_LDX | BPF_MEM | BPF_B] = &&LDX_MEM_B,
174 [BPF_LDX | BPF_MEM | BPF_H] = &&LDX_MEM_H,
175 [BPF_LDX | BPF_MEM | BPF_W] = &&LDX_MEM_W,
176 [BPF_LDX | BPF_MEM | BPF_DW] = &&LDX_MEM_DW,
177 [BPF_LD | BPF_ABS | BPF_W] = &&LD_ABS_W,
178 [BPF_LD | BPF_ABS | BPF_H] = &&LD_ABS_H,
179 [BPF_LD | BPF_ABS | BPF_B] = &&LD_ABS_B,
180 [BPF_LD | BPF_IND | BPF_W] = &&LD_IND_W,
181 [BPF_LD | BPF_IND | BPF_H] = &&LD_IND_H,
182 [BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B,
183 };
184 void *ptr;
185 int off;
186
187#define CONT ({ insn++; goto select_insn; })
188#define CONT_JMP ({ insn++; goto select_insn; })
189
190 FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)];
191 ARG1 = (u64) (unsigned long) ctx;
192
193 /* Registers used in classic BPF programs need to be reset first. */
194 regs[BPF_REG_A] = 0;
195 regs[BPF_REG_X] = 0;
196
197select_insn:
198 goto *jumptable[insn->code];
199
200 /* ALU */
201#define ALU(OPCODE, OP) \
202 ALU64_##OPCODE##_X: \
203 DST = DST OP SRC; \
204 CONT; \
205 ALU_##OPCODE##_X: \
206 DST = (u32) DST OP (u32) SRC; \
207 CONT; \
208 ALU64_##OPCODE##_K: \
209 DST = DST OP IMM; \
210 CONT; \
211 ALU_##OPCODE##_K: \
212 DST = (u32) DST OP (u32) IMM; \
213 CONT;
214
215 ALU(ADD, +)
216 ALU(SUB, -)
217 ALU(AND, &)
218 ALU(OR, |)
219 ALU(LSH, <<)
220 ALU(RSH, >>)
221 ALU(XOR, ^)
222 ALU(MUL, *)
223#undef ALU
224 ALU_NEG:
225 DST = (u32) -DST;
226 CONT;
227 ALU64_NEG:
228 DST = -DST;
229 CONT;
230 ALU_MOV_X:
231 DST = (u32) SRC;
232 CONT;
233 ALU_MOV_K:
234 DST = (u32) IMM;
235 CONT;
236 ALU64_MOV_X:
237 DST = SRC;
238 CONT;
239 ALU64_MOV_K:
240 DST = IMM;
241 CONT;
242 ALU64_ARSH_X:
243 (*(s64 *) &DST) >>= SRC;
244 CONT;
245 ALU64_ARSH_K:
246 (*(s64 *) &DST) >>= IMM;
247 CONT;
248 ALU64_MOD_X:
249 if (unlikely(SRC == 0))
250 return 0;
251 tmp = DST;
252 DST = do_div(tmp, SRC);
253 CONT;
254 ALU_MOD_X:
255 if (unlikely(SRC == 0))
256 return 0;
257 tmp = (u32) DST;
258 DST = do_div(tmp, (u32) SRC);
259 CONT;
260 ALU64_MOD_K:
261 tmp = DST;
262 DST = do_div(tmp, IMM);
263 CONT;
264 ALU_MOD_K:
265 tmp = (u32) DST;
266 DST = do_div(tmp, (u32) IMM);
267 CONT;
268 ALU64_DIV_X:
269 if (unlikely(SRC == 0))
270 return 0;
271 do_div(DST, SRC);
272 CONT;
273 ALU_DIV_X:
274 if (unlikely(SRC == 0))
275 return 0;
276 tmp = (u32) DST;
277 do_div(tmp, (u32) SRC);
278 DST = (u32) tmp;
279 CONT;
280 ALU64_DIV_K:
281 do_div(DST, IMM);
282 CONT;
283 ALU_DIV_K:
284 tmp = (u32) DST;
285 do_div(tmp, (u32) IMM);
286 DST = (u32) tmp;
287 CONT;
288 ALU_END_TO_BE:
289 switch (IMM) {
290 case 16:
291 DST = (__force u16) cpu_to_be16(DST);
292 break;
293 case 32:
294 DST = (__force u32) cpu_to_be32(DST);
295 break;
296 case 64:
297 DST = (__force u64) cpu_to_be64(DST);
298 break;
299 }
300 CONT;
301 ALU_END_TO_LE:
302 switch (IMM) {
303 case 16:
304 DST = (__force u16) cpu_to_le16(DST);
305 break;
306 case 32:
307 DST = (__force u32) cpu_to_le32(DST);
308 break;
309 case 64:
310 DST = (__force u64) cpu_to_le64(DST);
311 break;
312 }
313 CONT;
314
315 /* CALL */
316 JMP_CALL:
317 /* Function call scratches BPF_R1-BPF_R5 registers,
318 * preserves BPF_R6-BPF_R9, and stores return value
319 * into BPF_R0.
320 */
321 BPF_R0 = (__bpf_call_base + insn->imm)(BPF_R1, BPF_R2, BPF_R3,
322 BPF_R4, BPF_R5);
323 CONT;
324
325 /* JMP */
326 JMP_JA:
327 insn += insn->off;
328 CONT;
329 JMP_JEQ_X:
330 if (DST == SRC) {
331 insn += insn->off;
332 CONT_JMP;
333 }
334 CONT;
335 JMP_JEQ_K:
336 if (DST == IMM) {
337 insn += insn->off;
338 CONT_JMP;
339 }
340 CONT;
341 JMP_JNE_X:
342 if (DST != SRC) {
343 insn += insn->off;
344 CONT_JMP;
345 }
346 CONT;
347 JMP_JNE_K:
348 if (DST != IMM) {
349 insn += insn->off;
350 CONT_JMP;
351 }
352 CONT;
353 JMP_JGT_X:
354 if (DST > SRC) {
355 insn += insn->off;
356 CONT_JMP;
357 }
358 CONT;
359 JMP_JGT_K:
360 if (DST > IMM) {
361 insn += insn->off;
362 CONT_JMP;
363 }
364 CONT;
365 JMP_JGE_X:
366 if (DST >= SRC) {
367 insn += insn->off;
368 CONT_JMP;
369 }
370 CONT;
371 JMP_JGE_K:
372 if (DST >= IMM) {
373 insn += insn->off;
374 CONT_JMP;
375 }
376 CONT;
377 JMP_JSGT_X:
378 if (((s64) DST) > ((s64) SRC)) {
379 insn += insn->off;
380 CONT_JMP;
381 }
382 CONT;
383 JMP_JSGT_K:
384 if (((s64) DST) > ((s64) IMM)) {
385 insn += insn->off;
386 CONT_JMP;
387 }
388 CONT;
389 JMP_JSGE_X:
390 if (((s64) DST) >= ((s64) SRC)) {
391 insn += insn->off;
392 CONT_JMP;
393 }
394 CONT;
395 JMP_JSGE_K:
396 if (((s64) DST) >= ((s64) IMM)) {
397 insn += insn->off;
398 CONT_JMP;
399 }
400 CONT;
401 JMP_JSET_X:
402 if (DST & SRC) {
403 insn += insn->off;
404 CONT_JMP;
405 }
406 CONT;
407 JMP_JSET_K:
408 if (DST & IMM) {
409 insn += insn->off;
410 CONT_JMP;
411 }
412 CONT;
413 JMP_EXIT:
414 return BPF_R0;
415
416 /* STX and ST and LDX*/
417#define LDST(SIZEOP, SIZE) \
418 STX_MEM_##SIZEOP: \
419 *(SIZE *)(unsigned long) (DST + insn->off) = SRC; \
420 CONT; \
421 ST_MEM_##SIZEOP: \
422 *(SIZE *)(unsigned long) (DST + insn->off) = IMM; \
423 CONT; \
424 LDX_MEM_##SIZEOP: \
425 DST = *(SIZE *)(unsigned long) (SRC + insn->off); \
426 CONT;
427
428 LDST(B, u8)
429 LDST(H, u16)
430 LDST(W, u32)
431 LDST(DW, u64)
432#undef LDST
433 STX_XADD_W: /* lock xadd *(u32 *)(dst_reg + off16) += src_reg */
434 atomic_add((u32) SRC, (atomic_t *)(unsigned long)
435 (DST + insn->off));
436 CONT;
437 STX_XADD_DW: /* lock xadd *(u64 *)(dst_reg + off16) += src_reg */
438 atomic64_add((u64) SRC, (atomic64_t *)(unsigned long)
439 (DST + insn->off));
440 CONT;
441 LD_ABS_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + imm32)) */
442 off = IMM;
443load_word:
444 /* BPF_LD + BPD_ABS and BPF_LD + BPF_IND insns are
445 * only appearing in the programs where ctx ==
446 * skb. All programs keep 'ctx' in regs[BPF_REG_CTX]
447 * == BPF_R6, bpf_convert_filter() saves it in BPF_R6,
448 * internal BPF verifier will check that BPF_R6 ==
449 * ctx.
450 *
451 * BPF_ABS and BPF_IND are wrappers of function calls,
452 * so they scratch BPF_R1-BPF_R5 registers, preserve
453 * BPF_R6-BPF_R9, and store return value into BPF_R0.
454 *
455 * Implicit input:
456 * ctx == skb == BPF_R6 == CTX
457 *
458 * Explicit input:
459 * SRC == any register
460 * IMM == 32-bit immediate
461 *
462 * Output:
463 * BPF_R0 - 8/16/32-bit skb data converted to cpu endianness
464 */
465
466 ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 4, &tmp);
467 if (likely(ptr != NULL)) {
468 BPF_R0 = get_unaligned_be32(ptr);
469 CONT;
470 }
471
472 return 0;
473 LD_ABS_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + imm32)) */
474 off = IMM;
475load_half:
476 ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 2, &tmp);
477 if (likely(ptr != NULL)) {
478 BPF_R0 = get_unaligned_be16(ptr);
479 CONT;
480 }
481
482 return 0;
483 LD_ABS_B: /* BPF_R0 = *(u8 *) (skb->data + imm32) */
484 off = IMM;
485load_byte:
486 ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 1, &tmp);
487 if (likely(ptr != NULL)) {
488 BPF_R0 = *(u8 *)ptr;
489 CONT;
490 }
491
492 return 0;
493 LD_IND_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + src_reg + imm32)) */
494 off = IMM + SRC;
495 goto load_word;
496 LD_IND_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + src_reg + imm32)) */
497 off = IMM + SRC;
498 goto load_half;
499 LD_IND_B: /* BPF_R0 = *(u8 *) (skb->data + src_reg + imm32) */
500 off = IMM + SRC;
501 goto load_byte;
502
503 default_label:
504 /* If we ever reach this, we have a bug somewhere. */
505 WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code);
506 return 0;
507}
508
509void __weak bpf_int_jit_compile(struct bpf_prog *prog)
510{
511}
512
513/**
514 * bpf_prog_select_runtime - select execution runtime for BPF program
515 * @fp: bpf_prog populated with internal BPF program
516 *
517 * try to JIT internal BPF program, if JIT is not available select interpreter
518 * BPF program will be executed via BPF_PROG_RUN() macro
519 */
520void bpf_prog_select_runtime(struct bpf_prog *fp)
521{
522 fp->bpf_func = (void *) __bpf_prog_run;
523
524 /* Probe if internal BPF can be JITed */
525 bpf_int_jit_compile(fp);
526}
527EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
528
529/* free internal BPF program */
530void bpf_prog_free(struct bpf_prog *fp)
531{
532 bpf_jit_free(fp);
533}
534EXPORT_SYMBOL_GPL(bpf_prog_free);
diff --git a/kernel/capability.c b/kernel/capability.c
index a5cf13c018ce..989f5bfc57dc 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -258,6 +258,10 @@ SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
 		i++;
 	}
 
+	effective.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
+	permitted.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
+	inheritable.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
+
 	new = prepare_creds();
 	if (!new)
 		return -ENOMEM;
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 2f7c760305ca..379650b984f8 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -2472,7 +2472,7 @@ static void kdb_gmtime(struct timespec *tv, struct kdb_tm *tm)
 static void kdb_sysinfo(struct sysinfo *val)
 {
 	struct timespec uptime;
-	do_posix_clock_monotonic_gettime(&uptime);
+	ktime_get_ts(&uptime);
 	memset(val, 0, sizeof(*val));
 	val->uptime = uptime.tv_sec;
 	val->loads[0] = avenrun[0];
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 54996b71e66d..ef90b04d783f 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -46,42 +46,25 @@ void __delayacct_tsk_init(struct task_struct *tsk)
46} 46}
47 47
48/* 48/*
49 * Start accounting for a delay statistic using 49 * Finish delay accounting for a statistic using its timestamps (@start),
50 * its starting timestamp (@start) 50 * accumalator (@total) and @count
51 */ 51 */
52 52static void delayacct_end(u64 *start, u64 *total, u32 *count)
53static inline void delayacct_start(struct timespec *start)
54{ 53{
55 do_posix_clock_monotonic_gettime(start); 54 s64 ns = ktime_get_ns() - *start;
56}
57
58/*
59 * Finish delay accounting for a statistic using
60 * its timestamps (@start, @end), accumalator (@total) and @count
61 */
62
63static void delayacct_end(struct timespec *start, struct timespec *end,
64 u64 *total, u32 *count)
65{
66 struct timespec ts;
67 s64 ns;
68 unsigned long flags; 55 unsigned long flags;
69 56
70 do_posix_clock_monotonic_gettime(end); 57 if (ns > 0) {
71 ts = timespec_sub(*end, *start); 58 spin_lock_irqsave(&current->delays->lock, flags);
72 ns = timespec_to_ns(&ts); 59 *total += ns;
73 if (ns < 0) 60 (*count)++;
74 return; 61 spin_unlock_irqrestore(&current->delays->lock, flags);
75 62 }
76 spin_lock_irqsave(&current->delays->lock, flags);
77 *total += ns;
78 (*count)++;
79 spin_unlock_irqrestore(&current->delays->lock, flags);
80} 63}
81 64
82void __delayacct_blkio_start(void) 65void __delayacct_blkio_start(void)
83{ 66{
84 delayacct_start(&current->delays->blkio_start); 67 current->delays->blkio_start = ktime_get_ns();
85} 68}
86 69
87void __delayacct_blkio_end(void) 70void __delayacct_blkio_end(void)
@@ -89,35 +72,29 @@ void __delayacct_blkio_end(void)
89 if (current->delays->flags & DELAYACCT_PF_SWAPIN) 72 if (current->delays->flags & DELAYACCT_PF_SWAPIN)
90 /* Swapin block I/O */ 73 /* Swapin block I/O */
91 delayacct_end(&current->delays->blkio_start, 74 delayacct_end(&current->delays->blkio_start,
92 &current->delays->blkio_end,
93 &current->delays->swapin_delay, 75 &current->delays->swapin_delay,
94 &current->delays->swapin_count); 76 &current->delays->swapin_count);
95 else /* Other block I/O */ 77 else /* Other block I/O */
96 delayacct_end(&current->delays->blkio_start, 78 delayacct_end(&current->delays->blkio_start,
97 &current->delays->blkio_end,
98 &current->delays->blkio_delay, 79 &current->delays->blkio_delay,
99 &current->delays->blkio_count); 80 &current->delays->blkio_count);
100} 81}
101 82
102int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) 83int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
103{ 84{
104 s64 tmp;
105 unsigned long t1;
106 unsigned long long t2, t3;
107 unsigned long flags;
108 struct timespec ts;
109 cputime_t utime, stime, stimescaled, utimescaled; 85 cputime_t utime, stime, stimescaled, utimescaled;
86 unsigned long long t2, t3;
87 unsigned long flags, t1;
88 s64 tmp;
110 89
111 tmp = (s64)d->cpu_run_real_total;
112 task_cputime(tsk, &utime, &stime); 90 task_cputime(tsk, &utime, &stime);
113 cputime_to_timespec(utime + stime, &ts); 91 tmp = (s64)d->cpu_run_real_total;
114 tmp += timespec_to_ns(&ts); 92 tmp += cputime_to_nsecs(utime + stime);
115 d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp; 93 d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp;
116 94
117 tmp = (s64)d->cpu_scaled_run_real_total;
118 task_cputime_scaled(tsk, &utimescaled, &stimescaled); 95 task_cputime_scaled(tsk, &utimescaled, &stimescaled);
119 cputime_to_timespec(utimescaled + stimescaled, &ts); 96 tmp = (s64)d->cpu_scaled_run_real_total;
120 tmp += timespec_to_ns(&ts); 97 tmp += cputime_to_nsecs(utimescaled + stimescaled);
121 d->cpu_scaled_run_real_total = 98 d->cpu_scaled_run_real_total =
122 (tmp < (s64)d->cpu_scaled_run_real_total) ? 0 : tmp; 99 (tmp < (s64)d->cpu_scaled_run_real_total) ? 0 : tmp;
123 100
@@ -169,13 +146,12 @@ __u64 __delayacct_blkio_ticks(struct task_struct *tsk)
169 146
170void __delayacct_freepages_start(void) 147void __delayacct_freepages_start(void)
171{ 148{
172 delayacct_start(&current->delays->freepages_start); 149 current->delays->freepages_start = ktime_get_ns();
173} 150}
174 151
175void __delayacct_freepages_end(void) 152void __delayacct_freepages_end(void)
176{ 153{
177 delayacct_end(&current->delays->freepages_start, 154 delayacct_end(&current->delays->freepages_start,
178 &current->delays->freepages_end,
179 &current->delays->freepages_delay, 155 &current->delays->freepages_delay,
180 &current->delays->freepages_count); 156 &current->delays->freepages_count);
181} 157}
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 6f3254e8c137..1d0af8a2c646 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -167,6 +167,11 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
 	/* For mmu_notifiers */
 	const unsigned long mmun_start = addr;
 	const unsigned long mmun_end = addr + PAGE_SIZE;
+	struct mem_cgroup *memcg;
+
+	err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg);
+	if (err)
+		return err;
 
 	/* For try_to_free_swap() and munlock_vma_page() below */
 	lock_page(page);
@@ -179,6 +184,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
 
 	get_page(kpage);
 	page_add_new_anon_rmap(kpage, vma, addr);
+	mem_cgroup_commit_charge(kpage, memcg, false);
+	lru_cache_add_active_or_unevictable(kpage, vma);
 
 	if (!PageAnon(page)) {
 		dec_mm_counter(mm, MM_FILEPAGES);
@@ -200,6 +207,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
 
 	err = 0;
  unlock:
+	mem_cgroup_cancel_charge(kpage, memcg);
 	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 	unlock_page(page);
 	return err;
@@ -315,18 +323,11 @@ retry:
 	if (!new_page)
 		goto put_old;
 
-	if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL))
-		goto put_new;
-
 	__SetPageUptodate(new_page);
 	copy_highpage(new_page, old_page);
 	copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
 
 	ret = __replace_page(vma, vaddr, old_page, new_page);
-	if (ret)
-		mem_cgroup_uncharge_page(new_page);
-
-put_new:
 	page_cache_release(new_page);
 put_old:
 	put_page(old_page);
diff --git a/kernel/exit.c b/kernel/exit.c
index e5c4668f1799..32c58f7433a3 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -59,7 +59,7 @@
59#include <asm/pgtable.h> 59#include <asm/pgtable.h>
60#include <asm/mmu_context.h> 60#include <asm/mmu_context.h>
61 61
62static void exit_mm(struct task_struct * tsk); 62static void exit_mm(struct task_struct *tsk);
63 63
64static void __unhash_process(struct task_struct *p, bool group_dead) 64static void __unhash_process(struct task_struct *p, bool group_dead)
65{ 65{
@@ -151,7 +151,7 @@ static void __exit_signal(struct task_struct *tsk)
151 spin_unlock(&sighand->siglock); 151 spin_unlock(&sighand->siglock);
152 152
153 __cleanup_sighand(sighand); 153 __cleanup_sighand(sighand);
154 clear_tsk_thread_flag(tsk,TIF_SIGPENDING); 154 clear_tsk_thread_flag(tsk, TIF_SIGPENDING);
155 if (group_dead) { 155 if (group_dead) {
156 flush_sigqueue(&sig->shared_pending); 156 flush_sigqueue(&sig->shared_pending);
157 tty_kref_put(tty); 157 tty_kref_put(tty);
@@ -168,7 +168,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
168} 168}
169 169
170 170
171void release_task(struct task_struct * p) 171void release_task(struct task_struct *p)
172{ 172{
173 struct task_struct *leader; 173 struct task_struct *leader;
174 int zap_leader; 174 int zap_leader;
@@ -192,7 +192,8 @@ repeat:
192 */ 192 */
193 zap_leader = 0; 193 zap_leader = 0;
194 leader = p->group_leader; 194 leader = p->group_leader;
195 if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { 195 if (leader != p && thread_group_empty(leader)
196 && leader->exit_state == EXIT_ZOMBIE) {
196 /* 197 /*
197 * If we were the last child thread and the leader has 198 * If we were the last child thread and the leader has
198 * exited already, and the leader's parent ignores SIGCHLD, 199 * exited already, and the leader's parent ignores SIGCHLD,
@@ -241,7 +242,8 @@ struct pid *session_of_pgrp(struct pid *pgrp)
241 * 242 *
242 * "I ask you, have you ever known what it is to be an orphan?" 243 * "I ask you, have you ever known what it is to be an orphan?"
243 */ 244 */
244static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task) 245static int will_become_orphaned_pgrp(struct pid *pgrp,
246 struct task_struct *ignored_task)
245{ 247{
246 struct task_struct *p; 248 struct task_struct *p;
247 249
@@ -294,9 +296,9 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
294 struct task_struct *ignored_task = tsk; 296 struct task_struct *ignored_task = tsk;
295 297
296 if (!parent) 298 if (!parent)
297 /* exit: our father is in a different pgrp than 299 /* exit: our father is in a different pgrp than
298 * we are and we were the only connection outside. 300 * we are and we were the only connection outside.
299 */ 301 */
300 parent = tsk->real_parent; 302 parent = tsk->real_parent;
301 else 303 else
302 /* reparent: our child is in a different pgrp than 304 /* reparent: our child is in a different pgrp than
@@ -405,7 +407,7 @@ assign_new_owner:
405 * Turn us into a lazy TLB process if we 407 * Turn us into a lazy TLB process if we
406 * aren't already.. 408 * aren't already..
407 */ 409 */
408static void exit_mm(struct task_struct * tsk) 410static void exit_mm(struct task_struct *tsk)
409{ 411{
410 struct mm_struct *mm = tsk->mm; 412 struct mm_struct *mm = tsk->mm;
411 struct core_state *core_state; 413 struct core_state *core_state;
@@ -425,6 +427,7 @@ static void exit_mm(struct task_struct * tsk)
425 core_state = mm->core_state; 427 core_state = mm->core_state;
426 if (core_state) { 428 if (core_state) {
427 struct core_thread self; 429 struct core_thread self;
430
428 up_read(&mm->mmap_sem); 431 up_read(&mm->mmap_sem);
429 432
430 self.task = tsk; 433 self.task = tsk;
@@ -455,6 +458,7 @@ static void exit_mm(struct task_struct * tsk)
455 task_unlock(tsk); 458 task_unlock(tsk);
456 mm_update_next_owner(mm); 459 mm_update_next_owner(mm);
457 mmput(mm); 460 mmput(mm);
461 clear_thread_flag(TIF_MEMDIE);
458} 462}
459 463
460/* 464/*
@@ -565,6 +569,7 @@ static void forget_original_parent(struct task_struct *father)
565 569
566 list_for_each_entry_safe(p, n, &father->children, sibling) { 570 list_for_each_entry_safe(p, n, &father->children, sibling) {
567 struct task_struct *t = p; 571 struct task_struct *t = p;
572
568 do { 573 do {
569 t->real_parent = reaper; 574 t->real_parent = reaper;
570 if (t->parent == father) { 575 if (t->parent == father) {
@@ -598,7 +603,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
598 /* 603 /*
599 * This does two things: 604 * This does two things:
600 * 605 *
601 * A. Make init inherit all the child processes 606 * A. Make init inherit all the child processes
602 * B. Check to see if any process groups have become orphaned 607 * B. Check to see if any process groups have become orphaned
603 * as a result of our exiting, and if they have any stopped 608 * as a result of our exiting, and if they have any stopped
604 * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) 609 * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
@@ -648,9 +653,8 @@ static void check_stack_usage(void)
648 653
649 spin_lock(&low_water_lock); 654 spin_lock(&low_water_lock);
650 if (free < lowest_to_date) { 655 if (free < lowest_to_date) {
651 printk(KERN_WARNING "%s (%d) used greatest stack depth: " 656 pr_warn("%s (%d) used greatest stack depth: %lu bytes left\n",
652 "%lu bytes left\n", 657 current->comm, task_pid_nr(current), free);
653 current->comm, task_pid_nr(current), free);
654 lowest_to_date = free; 658 lowest_to_date = free;
655 } 659 }
656 spin_unlock(&low_water_lock); 660 spin_unlock(&low_water_lock);
@@ -691,8 +695,7 @@ void do_exit(long code)
691 * leave this task alone and wait for reboot. 695 * leave this task alone and wait for reboot.
692 */ 696 */
693 if (unlikely(tsk->flags & PF_EXITING)) { 697 if (unlikely(tsk->flags & PF_EXITING)) {
694 printk(KERN_ALERT 698 pr_alert("Fixing recursive fault but reboot is needed!\n");
695 "Fixing recursive fault but reboot is needed!\n");
696 /* 699 /*
697 * We can do this unlocked here. The futex code uses 700 * We can do this unlocked here. The futex code uses
698 * this flag just to verify whether the pi state 701 * this flag just to verify whether the pi state
@@ -716,9 +719,9 @@ void do_exit(long code)
716 raw_spin_unlock_wait(&tsk->pi_lock); 719 raw_spin_unlock_wait(&tsk->pi_lock);
717 720
718 if (unlikely(in_atomic())) 721 if (unlikely(in_atomic()))
719 printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", 722 pr_info("note: %s[%d] exited with preempt_count %d\n",
720 current->comm, task_pid_nr(current), 723 current->comm, task_pid_nr(current),
721 preempt_count()); 724 preempt_count());
722 725
723 acct_update_integrals(tsk); 726 acct_update_integrals(tsk);
724 /* sync mm's RSS info before statistics gathering */ 727 /* sync mm's RSS info before statistics gathering */
@@ -836,7 +839,6 @@ void do_exit(long code)
836 for (;;) 839 for (;;)
837 cpu_relax(); /* For when BUG is null */ 840 cpu_relax(); /* For when BUG is null */
838} 841}
839
840EXPORT_SYMBOL_GPL(do_exit); 842EXPORT_SYMBOL_GPL(do_exit);
841 843
842void complete_and_exit(struct completion *comp, long code) 844void complete_and_exit(struct completion *comp, long code)
@@ -846,7 +848,6 @@ void complete_and_exit(struct completion *comp, long code)
846 848
847 do_exit(code); 849 do_exit(code);
848} 850}
849
850EXPORT_SYMBOL(complete_and_exit); 851EXPORT_SYMBOL(complete_and_exit);
851 852
852SYSCALL_DEFINE1(exit, int, error_code) 853SYSCALL_DEFINE1(exit, int, error_code)
@@ -869,6 +870,7 @@ do_group_exit(int exit_code)
869 exit_code = sig->group_exit_code; 870 exit_code = sig->group_exit_code;
870 else if (!thread_group_empty(current)) { 871 else if (!thread_group_empty(current)) {
871 struct sighand_struct *const sighand = current->sighand; 872 struct sighand_struct *const sighand = current->sighand;
873
872 spin_lock_irq(&sighand->siglock); 874 spin_lock_irq(&sighand->siglock);
873 if (signal_group_exit(sig)) 875 if (signal_group_exit(sig))
874 /* Another thread got here before we took the lock. */ 876 /* Another thread got here before we took the lock. */
@@ -1033,9 +1035,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1033 * as other threads in the parent group can be right 1035 * as other threads in the parent group can be right
1034 * here reaping other children at the same time. 1036 * here reaping other children at the same time.
1035 * 1037 *
1036 * We use thread_group_cputime_adjusted() to get times for the thread 1038 * We use thread_group_cputime_adjusted() to get times for
1037 * group, which consolidates times for all threads in the 1039 * the thread group, which consolidates times for all threads
1038 * group including the group leader. 1040 * in the group including the group leader.
1039 */ 1041 */
1040 thread_group_cputime_adjusted(p, &tgutime, &tgstime); 1042 thread_group_cputime_adjusted(p, &tgutime, &tgstime);
1041 spin_lock_irq(&p->real_parent->sighand->siglock); 1043 spin_lock_irq(&p->real_parent->sighand->siglock);
@@ -1417,6 +1419,7 @@ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
1417 1419
1418 list_for_each_entry(p, &tsk->children, sibling) { 1420 list_for_each_entry(p, &tsk->children, sibling) {
1419 int ret = wait_consider_task(wo, 0, p); 1421 int ret = wait_consider_task(wo, 0, p);
1422
1420 if (ret) 1423 if (ret)
1421 return ret; 1424 return ret;
1422 } 1425 }
@@ -1430,6 +1433,7 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
1430 1433
1431 list_for_each_entry(p, &tsk->ptraced, ptrace_entry) { 1434 list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
1432 int ret = wait_consider_task(wo, 1, p); 1435 int ret = wait_consider_task(wo, 1, p);
1436
1433 if (ret) 1437 if (ret)
1434 return ret; 1438 return ret;
1435 } 1439 }
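
The kernel/exit.c hunks above are style cleanups: pointer-declaration spacing, comment reflow, dropped blank lines before EXPORT_SYMBOL, and conversion of raw printk() calls to the pr_*() helpers. The two forms below, taken from the check_stack_usage() hunk, are equivalent (pr_warn() would additionally pick up a pr_fmt() prefix if the file defined one):

	printk(KERN_WARNING "%s (%d) used greatest stack depth: %lu bytes left\n",
	       current->comm, task_pid_nr(current), free);

	pr_warn("%s (%d) used greatest stack depth: %lu bytes left\n",
		current->comm, task_pid_nr(current), free);
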
diff --git a/kernel/fork.c b/kernel/fork.c
index 962885edbe53..0cf9cdb6e491 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -315,6 +315,15 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
315 goto free_ti; 315 goto free_ti;
316 316
317 tsk->stack = ti; 317 tsk->stack = ti;
318#ifdef CONFIG_SECCOMP
319 /*
320 * We must handle setting up seccomp filters once we're under
321 * the sighand lock in case orig has changed between now and
322 * then. Until then, filter must be NULL to avoid messing up
323 * the usage counts on the error path calling free_task.
324 */
325 tsk->seccomp.filter = NULL;
326#endif
318 327
319 setup_thread_stack(tsk, orig); 328 setup_thread_stack(tsk, orig);
320 clear_user_return_notifier(tsk); 329 clear_user_return_notifier(tsk);
@@ -365,12 +374,11 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
365 */ 374 */
366 down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING); 375 down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);
367 376
368 mm->locked_vm = 0; 377 mm->total_vm = oldmm->total_vm;
369 mm->mmap = NULL; 378 mm->shared_vm = oldmm->shared_vm;
370 mm->vmacache_seqnum = 0; 379 mm->exec_vm = oldmm->exec_vm;
371 mm->map_count = 0; 380 mm->stack_vm = oldmm->stack_vm;
372 cpumask_clear(mm_cpumask(mm)); 381
373 mm->mm_rb = RB_ROOT;
374 rb_link = &mm->mm_rb.rb_node; 382 rb_link = &mm->mm_rb.rb_node;
375 rb_parent = NULL; 383 rb_parent = NULL;
376 pprev = &mm->mmap; 384 pprev = &mm->mmap;
@@ -421,7 +429,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
421 atomic_dec(&inode->i_writecount); 429 atomic_dec(&inode->i_writecount);
422 mutex_lock(&mapping->i_mmap_mutex); 430 mutex_lock(&mapping->i_mmap_mutex);
423 if (tmp->vm_flags & VM_SHARED) 431 if (tmp->vm_flags & VM_SHARED)
424 mapping->i_mmap_writable++; 432 atomic_inc(&mapping->i_mmap_writable);
425 flush_dcache_mmap_lock(mapping); 433 flush_dcache_mmap_lock(mapping);
426 /* insert tmp into the share list, just after mpnt */ 434 /* insert tmp into the share list, just after mpnt */
427 if (unlikely(tmp->vm_flags & VM_NONLINEAR)) 435 if (unlikely(tmp->vm_flags & VM_NONLINEAR))
@@ -527,19 +535,37 @@ static void mm_init_aio(struct mm_struct *mm)
527#endif 535#endif
528} 536}
529 537
538static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
539{
540#ifdef CONFIG_MEMCG
541 mm->owner = p;
542#endif
543}
544
530static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) 545static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
531{ 546{
547 mm->mmap = NULL;
548 mm->mm_rb = RB_ROOT;
549 mm->vmacache_seqnum = 0;
532 atomic_set(&mm->mm_users, 1); 550 atomic_set(&mm->mm_users, 1);
533 atomic_set(&mm->mm_count, 1); 551 atomic_set(&mm->mm_count, 1);
534 init_rwsem(&mm->mmap_sem); 552 init_rwsem(&mm->mmap_sem);
535 INIT_LIST_HEAD(&mm->mmlist); 553 INIT_LIST_HEAD(&mm->mmlist);
536 mm->core_state = NULL; 554 mm->core_state = NULL;
537 atomic_long_set(&mm->nr_ptes, 0); 555 atomic_long_set(&mm->nr_ptes, 0);
556 mm->map_count = 0;
557 mm->locked_vm = 0;
558 mm->pinned_vm = 0;
538 memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); 559 memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
539 spin_lock_init(&mm->page_table_lock); 560 spin_lock_init(&mm->page_table_lock);
561 mm_init_cpumask(mm);
540 mm_init_aio(mm); 562 mm_init_aio(mm);
541 mm_init_owner(mm, p); 563 mm_init_owner(mm, p);
564 mmu_notifier_mm_init(mm);
542 clear_tlb_flush_pending(mm); 565 clear_tlb_flush_pending(mm);
566#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
567 mm->pmd_huge_pte = NULL;
568#endif
543 569
544 if (current->mm) { 570 if (current->mm) {
545 mm->flags = current->mm->flags & MMF_INIT_MASK; 571 mm->flags = current->mm->flags & MMF_INIT_MASK;
@@ -549,11 +575,17 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
549 mm->def_flags = 0; 575 mm->def_flags = 0;
550 } 576 }
551 577
552 if (likely(!mm_alloc_pgd(mm))) { 578 if (mm_alloc_pgd(mm))
553 mmu_notifier_mm_init(mm); 579 goto fail_nopgd;
554 return mm; 580
555 } 581 if (init_new_context(p, mm))
582 goto fail_nocontext;
556 583
584 return mm;
585
586fail_nocontext:
587 mm_free_pgd(mm);
588fail_nopgd:
557 free_mm(mm); 589 free_mm(mm);
558 return NULL; 590 return NULL;
559} 591}
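
The mm_init() rewrite above gathers the field initialization that used to be scattered across dup_mm() and mm_alloc(), and switches the failure handling to the usual goto-unwind ladder: each step that can fail gets a label, and an error jumps to the label that releases everything acquired so far, in reverse order. A generic sketch of the idiom (names are illustrative, not the real mm hooks):

static struct foo *foo_init(struct foo *f)
{
	if (foo_alloc_tables(f))	/* hypothetical, like mm_alloc_pgd() */
		goto fail_tables;
	if (foo_init_context(f))	/* hypothetical, like init_new_context() */
		goto fail_context;
	return f;

fail_context:
	foo_free_tables(f);		/* undo in reverse order of acquisition */
fail_tables:
	foo_free(f);
	return NULL;
}
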
@@ -587,7 +619,6 @@ struct mm_struct *mm_alloc(void)
587 return NULL; 619 return NULL;
588 620
589 memset(mm, 0, sizeof(*mm)); 621 memset(mm, 0, sizeof(*mm));
590 mm_init_cpumask(mm);
591 return mm_init(mm, current); 622 return mm_init(mm, current);
592} 623}
593 624
@@ -819,17 +850,10 @@ static struct mm_struct *dup_mm(struct task_struct *tsk)
819 goto fail_nomem; 850 goto fail_nomem;
820 851
821 memcpy(mm, oldmm, sizeof(*mm)); 852 memcpy(mm, oldmm, sizeof(*mm));
822 mm_init_cpumask(mm);
823 853
824#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
825 mm->pmd_huge_pte = NULL;
826#endif
827 if (!mm_init(mm, tsk)) 854 if (!mm_init(mm, tsk))
828 goto fail_nomem; 855 goto fail_nomem;
829 856
830 if (init_new_context(tsk, mm))
831 goto fail_nocontext;
832
833 dup_mm_exe_file(oldmm, mm); 857 dup_mm_exe_file(oldmm, mm);
834 858
835 err = dup_mmap(mm, oldmm); 859 err = dup_mmap(mm, oldmm);
@@ -851,15 +875,6 @@ free_pt:
851 875
852fail_nomem: 876fail_nomem:
853 return NULL; 877 return NULL;
854
855fail_nocontext:
856 /*
857 * If init_new_context() failed, we cannot use mmput() to free the mm
858 * because it calls destroy_context()
859 */
860 mm_free_pgd(mm);
861 free_mm(mm);
862 return NULL;
863} 878}
864 879
865static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) 880static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
@@ -1081,6 +1096,39 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
1081 return 0; 1096 return 0;
1082} 1097}
1083 1098
1099static void copy_seccomp(struct task_struct *p)
1100{
1101#ifdef CONFIG_SECCOMP
1102 /*
1103 * Must be called with sighand->lock held, which is common to
1104 * all threads in the group. Holding cred_guard_mutex is not
1105 * needed because this new task is not yet running and cannot
1106 * be racing exec.
1107 */
1108 assert_spin_locked(&current->sighand->siglock);
1109
1110 /* Ref-count the new filter user, and assign it. */
1111 get_seccomp_filter(current);
1112 p->seccomp = current->seccomp;
1113
1114 /*
1115 * Explicitly enable no_new_privs here in case it got set
1116 * between the task_struct being duplicated and holding the
1117 * sighand lock. The seccomp state and nnp must be in sync.
1118 */
1119 if (task_no_new_privs(current))
1120 task_set_no_new_privs(p);
1121
1122 /*
1123 * If the parent gained a seccomp mode after copying thread
1124 * flags and between before we held the sighand lock, we have
1125 * to manually enable the seccomp thread flag here.
1126 */
1127 if (p->seccomp.mode != SECCOMP_MODE_DISABLED)
1128 set_tsk_thread_flag(p, TIF_SECCOMP);
1129#endif
1130}
1131
1084SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr) 1132SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
1085{ 1133{
1086 current->clear_child_tid = tidptr; 1134 current->clear_child_tid = tidptr;
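
copy_seccomp() above runs with sighand->siglock held so that the child's filter reference, the no_new_privs bit and TIF_SECCOMP cannot go stale against a concurrent seccomp change in another thread of the parent. The user-visible behaviour is unchanged: a forked child inherits the parent's filter and nnp state. A small illustrative program (allow-all BPF filter, standard uapi headers assumed):

#include <stdio.h>
#include <unistd.h>
#include <sys/prctl.h>
#include <linux/filter.h>
#include <linux/seccomp.h>

int main(void)
{
	struct sock_filter allow = BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW);
	struct sock_fprog prog = { .len = 1, .filter = &allow };

	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
		return 1;
	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog))
		return 1;

	if (fork() == 0) {
		/* child: filter and no_new_privs were copied at fork time */
		printf("child seccomp mode: %d\n", prctl(PR_GET_SECCOMP));
		_exit(0);
	}
	return 0;
}
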
@@ -1098,13 +1146,6 @@ static void rt_mutex_init_task(struct task_struct *p)
1098#endif 1146#endif
1099} 1147}
1100 1148
1101#ifdef CONFIG_MEMCG
1102void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
1103{
1104 mm->owner = p;
1105}
1106#endif /* CONFIG_MEMCG */
1107
1108/* 1149/*
1109 * Initialize POSIX timer handling for a single task. 1150 * Initialize POSIX timer handling for a single task.
1110 */ 1151 */
@@ -1195,7 +1236,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1195 goto fork_out; 1236 goto fork_out;
1196 1237
1197 ftrace_graph_init_task(p); 1238 ftrace_graph_init_task(p);
1198 get_seccomp_filter(p);
1199 1239
1200 rt_mutex_init_task(p); 1240 rt_mutex_init_task(p);
1201 1241
@@ -1261,9 +1301,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1261 1301
1262 posix_cpu_timers_init(p); 1302 posix_cpu_timers_init(p);
1263 1303
1264 do_posix_clock_monotonic_gettime(&p->start_time); 1304 p->start_time = ktime_get_ns();
1265 p->real_start_time = p->start_time; 1305 p->real_start_time = ktime_get_boot_ns();
1266 monotonic_to_bootbased(&p->real_start_time);
1267 p->io_context = NULL; 1306 p->io_context = NULL;
1268 p->audit_context = NULL; 1307 p->audit_context = NULL;
1269 if (clone_flags & CLONE_THREAD) 1308 if (clone_flags & CLONE_THREAD)
@@ -1306,10 +1345,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1306#ifdef CONFIG_DEBUG_MUTEXES 1345#ifdef CONFIG_DEBUG_MUTEXES
1307 p->blocked_on = NULL; /* not blocked yet */ 1346 p->blocked_on = NULL; /* not blocked yet */
1308#endif 1347#endif
1309#ifdef CONFIG_MEMCG
1310 p->memcg_batch.do_batch = 0;
1311 p->memcg_batch.memcg = NULL;
1312#endif
1313#ifdef CONFIG_BCACHE 1348#ifdef CONFIG_BCACHE
1314 p->sequential_io = 0; 1349 p->sequential_io = 0;
1315 p->sequential_io_avg = 0; 1350 p->sequential_io_avg = 0;
@@ -1327,6 +1362,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1327 if (retval) 1362 if (retval)
1328 goto bad_fork_cleanup_policy; 1363 goto bad_fork_cleanup_policy;
1329 /* copy all the process information */ 1364 /* copy all the process information */
1365 shm_init_task(p);
1330 retval = copy_semundo(clone_flags, p); 1366 retval = copy_semundo(clone_flags, p);
1331 if (retval) 1367 if (retval)
1332 goto bad_fork_cleanup_audit; 1368 goto bad_fork_cleanup_audit;
@@ -1436,6 +1472,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1436 spin_lock(&current->sighand->siglock); 1472 spin_lock(&current->sighand->siglock);
1437 1473
1438 /* 1474 /*
1475 * Copy seccomp details explicitly here, in case they were changed
1476 * before holding sighand lock.
1477 */
1478 copy_seccomp(p);
1479
1480 /*
1439 * Process group and session signals need to be delivered to just the 1481 * Process group and session signals need to be delivered to just the
1440 * parent before the fork or both the parent and the child after the 1482 * parent before the fork or both the parent and the child after the
1441 * fork. Restart if a signal comes in before we add the new process to 1483 * fork. Restart if a signal comes in before we add the new process to
@@ -1872,6 +1914,11 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1872 */ 1914 */
1873 exit_sem(current); 1915 exit_sem(current);
1874 } 1916 }
1917 if (unshare_flags & CLONE_NEWIPC) {
1918 /* Orphan segments in old ns (see sem above). */
1919 exit_shm(current);
1920 shm_init_task(current);
1921 }
1875 1922
1876 if (new_nsproxy) 1923 if (new_nsproxy)
1877 switch_task_namespaces(current, new_nsproxy); 1924 switch_task_namespaces(current, new_nsproxy);
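
The unshare() hunk just above pairs exit_shm()/shm_init_task() with CLONE_NEWIPC so that SysV shm segments created before the call are left behind in the old namespace rather than following the task. A minimal user-space counterpart (requires CAP_SYS_ADMIN; shmget() after the call lands in the fresh, empty namespace):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <sys/ipc.h>
#include <sys/shm.h>

int main(void)
{
	int before = shmget(IPC_PRIVATE, 4096, IPC_CREAT | 0600);

	if (unshare(CLONE_NEWIPC)) {
		perror("unshare");
		return 1;
	}

	/* 'before' stays with the previous namespace; this one is new */
	int after = shmget(IPC_PRIVATE, 4096, IPC_CREAT | 0600);

	printf("old id %d, new id %d\n", before, after);
	return 0;
}
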
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c
index 15ff01a76379..edf67c493a8e 100644
--- a/kernel/gcov/fs.c
+++ b/kernel/gcov/fs.c
@@ -784,8 +784,7 @@ static __init int gcov_fs_init(void)
784 784
785err_remove: 785err_remove:
786 pr_err("init failed\n"); 786 pr_err("init failed\n");
787 if (root_node.dentry) 787 debugfs_remove(root_node.dentry);
788 debugfs_remove(root_node.dentry);
789 788
790 return rc; 789 return rc;
791} 790}
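
The gcov hunk drops the NULL check because debugfs_remove() already tolerates a NULL (or ERR_PTR) dentry, so the call is safe unconditionally; inside the kernel the following is a no-op:

	struct dentry *d = NULL;

	debugfs_remove(d);	/* returns immediately for NULL/ERR_PTR dentries */
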
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index 452d6f2ba21d..cf80e7b0ddab 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -341,8 +341,8 @@ static struct lock_class_key irq_nested_lock_class;
341/* 341/*
342 * irq_map_generic_chip - Map a generic chip for an irq domain 342 * irq_map_generic_chip - Map a generic chip for an irq domain
343 */ 343 */
344static int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, 344int irq_map_generic_chip(struct irq_domain *d, unsigned int virq,
345 irq_hw_number_t hw_irq) 345 irq_hw_number_t hw_irq)
346{ 346{
347 struct irq_data *data = irq_get_irq_data(virq); 347 struct irq_data *data = irq_get_irq_data(virq);
348 struct irq_domain_chip_generic *dgc = d->gc; 348 struct irq_domain_chip_generic *dgc = d->gc;
@@ -394,6 +394,7 @@ static int irq_map_generic_chip(struct irq_domain *d, unsigned int virq,
394 irq_modify_status(virq, dgc->irq_flags_to_clear, dgc->irq_flags_to_set); 394 irq_modify_status(virq, dgc->irq_flags_to_clear, dgc->irq_flags_to_set);
395 return 0; 395 return 0;
396} 396}
397EXPORT_SYMBOL_GPL(irq_map_generic_chip);
397 398
398struct irq_domain_ops irq_generic_chip_ops = { 399struct irq_domain_ops irq_generic_chip_ops = {
399 .map = irq_map_generic_chip, 400 .map = irq_map_generic_chip,
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index eb5e10e32e05..6534ff6ce02e 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -231,7 +231,7 @@ void irq_set_default_host(struct irq_domain *domain)
231} 231}
232EXPORT_SYMBOL_GPL(irq_set_default_host); 232EXPORT_SYMBOL_GPL(irq_set_default_host);
233 233
234static void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq) 234void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq)
235{ 235{
236 struct irq_data *irq_data = irq_get_irq_data(irq); 236 struct irq_data *irq_data = irq_get_irq_data(irq);
237 irq_hw_number_t hwirq; 237 irq_hw_number_t hwirq;
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index cb0cf37dac3a..ae5167087845 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -364,7 +364,7 @@ static int __sprint_symbol(char *buffer, unsigned long address,
364 address += symbol_offset; 364 address += symbol_offset;
365 name = kallsyms_lookup(address, &size, &offset, &modname, buffer); 365 name = kallsyms_lookup(address, &size, &offset, &modname, buffer);
366 if (!name) 366 if (!name)
367 return sprintf(buffer, "0x%lx", address); 367 return sprintf(buffer, "0x%lx", address - symbol_offset);
368 368
369 if (name != buffer) 369 if (name != buffer)
370 strcpy(buffer, name); 370 strcpy(buffer, name);
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 4b8f0c925884..0b49a0a58102 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -6,6 +6,8 @@
6 * Version 2. See the file COPYING for more details. 6 * Version 2. See the file COPYING for more details.
7 */ 7 */
8 8
9#define pr_fmt(fmt) "kexec: " fmt
10
9#include <linux/capability.h> 11#include <linux/capability.h>
10#include <linux/mm.h> 12#include <linux/mm.h>
11#include <linux/file.h> 13#include <linux/file.h>
@@ -40,6 +42,9 @@
40#include <asm/io.h> 42#include <asm/io.h>
41#include <asm/sections.h> 43#include <asm/sections.h>
42 44
45#include <crypto/hash.h>
46#include <crypto/sha.h>
47
43/* Per cpu memory for storing cpu states in case of system crash. */ 48/* Per cpu memory for storing cpu states in case of system crash. */
44note_buf_t __percpu *crash_notes; 49note_buf_t __percpu *crash_notes;
45 50
@@ -52,6 +57,15 @@ size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
52/* Flag to indicate we are going to kexec a new kernel */ 57/* Flag to indicate we are going to kexec a new kernel */
53bool kexec_in_progress = false; 58bool kexec_in_progress = false;
54 59
60/*
61 * Declare these symbols weak so that if architecture provides a purgatory,
62 * these will be overridden.
63 */
64char __weak kexec_purgatory[0];
65size_t __weak kexec_purgatory_size = 0;
66
67static int kexec_calculate_store_digests(struct kimage *image);
68
55/* Location of the reserved area for the crash kernel */ 69/* Location of the reserved area for the crash kernel */
56struct resource crashk_res = { 70struct resource crashk_res = {
57 .name = "Crash kernel", 71 .name = "Crash kernel",
@@ -125,45 +139,27 @@ static struct page *kimage_alloc_page(struct kimage *image,
125 gfp_t gfp_mask, 139 gfp_t gfp_mask,
126 unsigned long dest); 140 unsigned long dest);
127 141
128static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, 142static int copy_user_segment_list(struct kimage *image,
129 unsigned long nr_segments, 143 unsigned long nr_segments,
130 struct kexec_segment __user *segments) 144 struct kexec_segment __user *segments)
131{ 145{
146 int ret;
132 size_t segment_bytes; 147 size_t segment_bytes;
133 struct kimage *image;
134 unsigned long i;
135 int result;
136
137 /* Allocate a controlling structure */
138 result = -ENOMEM;
139 image = kzalloc(sizeof(*image), GFP_KERNEL);
140 if (!image)
141 goto out;
142
143 image->head = 0;
144 image->entry = &image->head;
145 image->last_entry = &image->head;
146 image->control_page = ~0; /* By default this does not apply */
147 image->start = entry;
148 image->type = KEXEC_TYPE_DEFAULT;
149
150 /* Initialize the list of control pages */
151 INIT_LIST_HEAD(&image->control_pages);
152
153 /* Initialize the list of destination pages */
154 INIT_LIST_HEAD(&image->dest_pages);
155
156 /* Initialize the list of unusable pages */
157 INIT_LIST_HEAD(&image->unuseable_pages);
158 148
159 /* Read in the segments */ 149 /* Read in the segments */
160 image->nr_segments = nr_segments; 150 image->nr_segments = nr_segments;
161 segment_bytes = nr_segments * sizeof(*segments); 151 segment_bytes = nr_segments * sizeof(*segments);
162 result = copy_from_user(image->segment, segments, segment_bytes); 152 ret = copy_from_user(image->segment, segments, segment_bytes);
163 if (result) { 153 if (ret)
164 result = -EFAULT; 154 ret = -EFAULT;
165 goto out; 155
166 } 156 return ret;
157}
158
159static int sanity_check_segment_list(struct kimage *image)
160{
161 int result, i;
162 unsigned long nr_segments = image->nr_segments;
167 163
168 /* 164 /*
169 * Verify we have good destination addresses. The caller is 165 * Verify we have good destination addresses. The caller is
@@ -185,9 +181,9 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
185 mstart = image->segment[i].mem; 181 mstart = image->segment[i].mem;
186 mend = mstart + image->segment[i].memsz; 182 mend = mstart + image->segment[i].memsz;
187 if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK)) 183 if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
188 goto out; 184 return result;
189 if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT) 185 if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
190 goto out; 186 return result;
191 } 187 }
192 188
193 /* Verify our destination addresses do not overlap. 189 /* Verify our destination addresses do not overlap.
@@ -208,7 +204,7 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
208 pend = pstart + image->segment[j].memsz; 204 pend = pstart + image->segment[j].memsz;
209 /* Do the segments overlap ? */ 205 /* Do the segments overlap ? */
210 if ((mend > pstart) && (mstart < pend)) 206 if ((mend > pstart) && (mstart < pend))
211 goto out; 207 return result;
212 } 208 }
213 } 209 }
214 210
@@ -220,130 +216,401 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
220 result = -EINVAL; 216 result = -EINVAL;
221 for (i = 0; i < nr_segments; i++) { 217 for (i = 0; i < nr_segments; i++) {
222 if (image->segment[i].bufsz > image->segment[i].memsz) 218 if (image->segment[i].bufsz > image->segment[i].memsz)
223 goto out; 219 return result;
224 } 220 }
225 221
226 result = 0; 222 /*
227out: 223 * Verify we have good destination addresses. Normally
228 if (result == 0) 224 * the caller is responsible for making certain we don't
229 *rimage = image; 225 * attempt to load the new image into invalid or reserved
230 else 226 * areas of RAM. But crash kernels are preloaded into a
231 kfree(image); 227 * reserved area of ram. We must ensure the addresses
228 * are in the reserved area otherwise preloading the
229 * kernel could corrupt things.
230 */
232 231
233 return result; 232 if (image->type == KEXEC_TYPE_CRASH) {
233 result = -EADDRNOTAVAIL;
234 for (i = 0; i < nr_segments; i++) {
235 unsigned long mstart, mend;
236
237 mstart = image->segment[i].mem;
238 mend = mstart + image->segment[i].memsz - 1;
239 /* Ensure we are within the crash kernel limits */
240 if ((mstart < crashk_res.start) ||
241 (mend > crashk_res.end))
242 return result;
243 }
244 }
234 245
246 return 0;
247}
248
249static struct kimage *do_kimage_alloc_init(void)
250{
251 struct kimage *image;
252
253 /* Allocate a controlling structure */
254 image = kzalloc(sizeof(*image), GFP_KERNEL);
255 if (!image)
256 return NULL;
257
258 image->head = 0;
259 image->entry = &image->head;
260 image->last_entry = &image->head;
261 image->control_page = ~0; /* By default this does not apply */
262 image->type = KEXEC_TYPE_DEFAULT;
263
264 /* Initialize the list of control pages */
265 INIT_LIST_HEAD(&image->control_pages);
266
267 /* Initialize the list of destination pages */
268 INIT_LIST_HEAD(&image->dest_pages);
269
270 /* Initialize the list of unusable pages */
271 INIT_LIST_HEAD(&image->unusable_pages);
272
273 return image;
235} 274}
236 275
237static void kimage_free_page_list(struct list_head *list); 276static void kimage_free_page_list(struct list_head *list);
238 277
239static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, 278static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
240 unsigned long nr_segments, 279 unsigned long nr_segments,
241 struct kexec_segment __user *segments) 280 struct kexec_segment __user *segments,
281 unsigned long flags)
242{ 282{
243 int result; 283 int ret;
244 struct kimage *image; 284 struct kimage *image;
285 bool kexec_on_panic = flags & KEXEC_ON_CRASH;
286
287 if (kexec_on_panic) {
288 /* Verify we have a valid entry point */
289 if ((entry < crashk_res.start) || (entry > crashk_res.end))
290 return -EADDRNOTAVAIL;
291 }
245 292
246 /* Allocate and initialize a controlling structure */ 293 /* Allocate and initialize a controlling structure */
247 image = NULL; 294 image = do_kimage_alloc_init();
248 result = do_kimage_alloc(&image, entry, nr_segments, segments); 295 if (!image)
249 if (result) 296 return -ENOMEM;
250 goto out; 297
298 image->start = entry;
299
300 ret = copy_user_segment_list(image, nr_segments, segments);
301 if (ret)
302 goto out_free_image;
303
304 ret = sanity_check_segment_list(image);
305 if (ret)
306 goto out_free_image;
307
308 /* Enable the special crash kernel control page allocation policy. */
309 if (kexec_on_panic) {
310 image->control_page = crashk_res.start;
311 image->type = KEXEC_TYPE_CRASH;
312 }
251 313
252 /* 314 /*
253 * Find a location for the control code buffer, and add it 315 * Find a location for the control code buffer, and add it
254 * the vector of segments so that it's pages will also be 316 * the vector of segments so that it's pages will also be
255 * counted as destination pages. 317 * counted as destination pages.
256 */ 318 */
257 result = -ENOMEM; 319 ret = -ENOMEM;
258 image->control_code_page = kimage_alloc_control_pages(image, 320 image->control_code_page = kimage_alloc_control_pages(image,
259 get_order(KEXEC_CONTROL_PAGE_SIZE)); 321 get_order(KEXEC_CONTROL_PAGE_SIZE));
260 if (!image->control_code_page) { 322 if (!image->control_code_page) {
261 pr_err("Could not allocate control_code_buffer\n"); 323 pr_err("Could not allocate control_code_buffer\n");
262 goto out_free; 324 goto out_free_image;
263 } 325 }
264 326
265 image->swap_page = kimage_alloc_control_pages(image, 0); 327 if (!kexec_on_panic) {
266 if (!image->swap_page) { 328 image->swap_page = kimage_alloc_control_pages(image, 0);
267 pr_err("Could not allocate swap buffer\n"); 329 if (!image->swap_page) {
268 goto out_free; 330 pr_err("Could not allocate swap buffer\n");
331 goto out_free_control_pages;
332 }
269 } 333 }
270 334
271 *rimage = image; 335 *rimage = image;
272 return 0; 336 return 0;
273 337out_free_control_pages:
274out_free:
275 kimage_free_page_list(&image->control_pages); 338 kimage_free_page_list(&image->control_pages);
339out_free_image:
276 kfree(image); 340 kfree(image);
277out: 341 return ret;
278 return result;
279} 342}
280 343
281static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry, 344static int copy_file_from_fd(int fd, void **buf, unsigned long *buf_len)
282 unsigned long nr_segments,
283 struct kexec_segment __user *segments)
284{ 345{
285 int result; 346 struct fd f = fdget(fd);
286 struct kimage *image; 347 int ret;
287 unsigned long i; 348 struct kstat stat;
349 loff_t pos;
350 ssize_t bytes = 0;
288 351
289 image = NULL; 352 if (!f.file)
290 /* Verify we have a valid entry point */ 353 return -EBADF;
291 if ((entry < crashk_res.start) || (entry > crashk_res.end)) { 354
292 result = -EADDRNOTAVAIL; 355 ret = vfs_getattr(&f.file->f_path, &stat);
356 if (ret)
357 goto out;
358
359 if (stat.size > INT_MAX) {
360 ret = -EFBIG;
293 goto out; 361 goto out;
294 } 362 }
295 363
296 /* Allocate and initialize a controlling structure */ 364 /* Don't hand 0 to vmalloc, it whines. */
297 result = do_kimage_alloc(&image, entry, nr_segments, segments); 365 if (stat.size == 0) {
298 if (result) 366 ret = -EINVAL;
299 goto out; 367 goto out;
368 }
300 369
301 /* Enable the special crash kernel control page 370 *buf = vmalloc(stat.size);
302 * allocation policy. 371 if (!*buf) {
303 */ 372 ret = -ENOMEM;
304 image->control_page = crashk_res.start; 373 goto out;
305 image->type = KEXEC_TYPE_CRASH; 374 }
306 375
307 /* 376 pos = 0;
308 * Verify we have good destination addresses. Normally 377 while (pos < stat.size) {
309 * the caller is responsible for making certain we don't 378 bytes = kernel_read(f.file, pos, (char *)(*buf) + pos,
310 * attempt to load the new image into invalid or reserved 379 stat.size - pos);
311 * areas of RAM. But crash kernels are preloaded into a 380 if (bytes < 0) {
312 * reserved area of ram. We must ensure the addresses 381 vfree(*buf);
313 * are in the reserved area otherwise preloading the 382 ret = bytes;
314 * kernel could corrupt things. 383 goto out;
315 */ 384 }
316 result = -EADDRNOTAVAIL;
317 for (i = 0; i < nr_segments; i++) {
318 unsigned long mstart, mend;
319 385
320 mstart = image->segment[i].mem; 386 if (bytes == 0)
321 mend = mstart + image->segment[i].memsz - 1; 387 break;
322 /* Ensure we are within the crash kernel limits */ 388 pos += bytes;
323 if ((mstart < crashk_res.start) || (mend > crashk_res.end))
324 goto out_free;
325 } 389 }
326 390
391 if (pos != stat.size) {
392 ret = -EBADF;
393 vfree(*buf);
394 goto out;
395 }
396
397 *buf_len = pos;
398out:
399 fdput(f);
400 return ret;
401}
402
403/* Architectures can provide this probe function */
404int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
405 unsigned long buf_len)
406{
407 return -ENOEXEC;
408}
409
410void * __weak arch_kexec_kernel_image_load(struct kimage *image)
411{
412 return ERR_PTR(-ENOEXEC);
413}
414
415void __weak arch_kimage_file_post_load_cleanup(struct kimage *image)
416{
417}
418
419int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf,
420 unsigned long buf_len)
421{
422 return -EKEYREJECTED;
423}
424
425/* Apply relocations of type RELA */
426int __weak
427arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
428 unsigned int relsec)
429{
430 pr_err("RELA relocation unsupported.\n");
431 return -ENOEXEC;
432}
433
434/* Apply relocations of type REL */
435int __weak
436arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
437 unsigned int relsec)
438{
439 pr_err("REL relocation unsupported.\n");
440 return -ENOEXEC;
441}
442
443/*
444 * Free up memory used by kernel, initrd, and command line. This is temporary
445 * memory allocation which is not needed any more after these buffers have
446 * been loaded into separate segments and have been copied elsewhere.
447 */
448static void kimage_file_post_load_cleanup(struct kimage *image)
449{
450 struct purgatory_info *pi = &image->purgatory_info;
451
452 vfree(image->kernel_buf);
453 image->kernel_buf = NULL;
454
455 vfree(image->initrd_buf);
456 image->initrd_buf = NULL;
457
458 kfree(image->cmdline_buf);
459 image->cmdline_buf = NULL;
460
461 vfree(pi->purgatory_buf);
462 pi->purgatory_buf = NULL;
463
464 vfree(pi->sechdrs);
465 pi->sechdrs = NULL;
466
467 /* See if architecture has anything to cleanup post load */
468 arch_kimage_file_post_load_cleanup(image);
469
327 /* 470 /*
328 * Find a location for the control code buffer, and add 471 * Above call should have called into bootloader to free up
329 * the vector of segments so that it's pages will also be 472 * any data stored in kimage->image_loader_data. It should
330 * counted as destination pages. 473 * be ok now to free it up.
331 */ 474 */
332 result = -ENOMEM; 475 kfree(image->image_loader_data);
476 image->image_loader_data = NULL;
477}
478
479/*
480 * In file mode list of segments is prepared by kernel. Copy relevant
481 * data from user space, do error checking, prepare segment list
482 */
483static int
484kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
485 const char __user *cmdline_ptr,
486 unsigned long cmdline_len, unsigned flags)
487{
488 int ret = 0;
489 void *ldata;
490
491 ret = copy_file_from_fd(kernel_fd, &image->kernel_buf,
492 &image->kernel_buf_len);
493 if (ret)
494 return ret;
495
496 /* Call arch image probe handlers */
497 ret = arch_kexec_kernel_image_probe(image, image->kernel_buf,
498 image->kernel_buf_len);
499
500 if (ret)
501 goto out;
502
503#ifdef CONFIG_KEXEC_VERIFY_SIG
504 ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf,
505 image->kernel_buf_len);
506 if (ret) {
507 pr_debug("kernel signature verification failed.\n");
508 goto out;
509 }
510 pr_debug("kernel signature verification successful.\n");
511#endif
512	/* It is possible that no initramfs is being loaded */
513 if (!(flags & KEXEC_FILE_NO_INITRAMFS)) {
514 ret = copy_file_from_fd(initrd_fd, &image->initrd_buf,
515 &image->initrd_buf_len);
516 if (ret)
517 goto out;
518 }
519
520 if (cmdline_len) {
521 image->cmdline_buf = kzalloc(cmdline_len, GFP_KERNEL);
522 if (!image->cmdline_buf) {
523 ret = -ENOMEM;
524 goto out;
525 }
526
527 ret = copy_from_user(image->cmdline_buf, cmdline_ptr,
528 cmdline_len);
529 if (ret) {
530 ret = -EFAULT;
531 goto out;
532 }
533
534 image->cmdline_buf_len = cmdline_len;
535
536 /* command line should be a string with last byte null */
537 if (image->cmdline_buf[cmdline_len - 1] != '\0') {
538 ret = -EINVAL;
539 goto out;
540 }
541 }
542
543 /* Call arch image load handlers */
544 ldata = arch_kexec_kernel_image_load(image);
545
546 if (IS_ERR(ldata)) {
547 ret = PTR_ERR(ldata);
548 goto out;
549 }
550
551 image->image_loader_data = ldata;
552out:
553 /* In case of error, free up all allocated memory in this function */
554 if (ret)
555 kimage_file_post_load_cleanup(image);
556 return ret;
557}
558
559static int
560kimage_file_alloc_init(struct kimage **rimage, int kernel_fd,
561 int initrd_fd, const char __user *cmdline_ptr,
562 unsigned long cmdline_len, unsigned long flags)
563{
564 int ret;
565 struct kimage *image;
566 bool kexec_on_panic = flags & KEXEC_FILE_ON_CRASH;
567
568 image = do_kimage_alloc_init();
569 if (!image)
570 return -ENOMEM;
571
572 image->file_mode = 1;
573
574 if (kexec_on_panic) {
575 /* Enable special crash kernel control page alloc policy. */
576 image->control_page = crashk_res.start;
577 image->type = KEXEC_TYPE_CRASH;
578 }
579
580 ret = kimage_file_prepare_segments(image, kernel_fd, initrd_fd,
581 cmdline_ptr, cmdline_len, flags);
582 if (ret)
583 goto out_free_image;
584
585 ret = sanity_check_segment_list(image);
586 if (ret)
587 goto out_free_post_load_bufs;
588
589 ret = -ENOMEM;
333 image->control_code_page = kimage_alloc_control_pages(image, 590 image->control_code_page = kimage_alloc_control_pages(image,
334 get_order(KEXEC_CONTROL_PAGE_SIZE)); 591 get_order(KEXEC_CONTROL_PAGE_SIZE));
335 if (!image->control_code_page) { 592 if (!image->control_code_page) {
336 pr_err("Could not allocate control_code_buffer\n"); 593 pr_err("Could not allocate control_code_buffer\n");
337 goto out_free; 594 goto out_free_post_load_bufs;
595 }
596
597 if (!kexec_on_panic) {
598 image->swap_page = kimage_alloc_control_pages(image, 0);
599 if (!image->swap_page) {
600			pr_err("Could not allocate swap buffer\n");
601 goto out_free_control_pages;
602 }
338 } 603 }
339 604
340 *rimage = image; 605 *rimage = image;
341 return 0; 606 return 0;
342 607out_free_control_pages:
343out_free: 608 kimage_free_page_list(&image->control_pages);
609out_free_post_load_bufs:
610 kimage_file_post_load_cleanup(image);
611out_free_image:
344 kfree(image); 612 kfree(image);
345out: 613 return ret;
346 return result;
347} 614}
348 615
349static int kimage_is_destination_range(struct kimage *image, 616static int kimage_is_destination_range(struct kimage *image,
@@ -609,7 +876,7 @@ static void kimage_free_extra_pages(struct kimage *image)
609 kimage_free_page_list(&image->dest_pages); 876 kimage_free_page_list(&image->dest_pages);
610 877
611 /* Walk through and free any unusable pages I have cached */ 878 /* Walk through and free any unusable pages I have cached */
612 kimage_free_page_list(&image->unuseable_pages); 879 kimage_free_page_list(&image->unusable_pages);
613 880
614} 881}
615static void kimage_terminate(struct kimage *image) 882static void kimage_terminate(struct kimage *image)
@@ -663,6 +930,14 @@ static void kimage_free(struct kimage *image)
663 930
664 /* Free the kexec control pages... */ 931 /* Free the kexec control pages... */
665 kimage_free_page_list(&image->control_pages); 932 kimage_free_page_list(&image->control_pages);
933
934 /*
935 * Free up any temporary buffers allocated. This might hit if
936 * error occurred much later after buffer allocation.
937 */
938 if (image->file_mode)
939 kimage_file_post_load_cleanup(image);
940
666 kfree(image); 941 kfree(image);
667} 942}
668 943
@@ -732,7 +1007,7 @@ static struct page *kimage_alloc_page(struct kimage *image,
732 /* If the page cannot be used file it away */ 1007 /* If the page cannot be used file it away */
733 if (page_to_pfn(page) > 1008 if (page_to_pfn(page) >
734 (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) { 1009 (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
735 list_add(&page->lru, &image->unuseable_pages); 1010 list_add(&page->lru, &image->unusable_pages);
736 continue; 1011 continue;
737 } 1012 }
738 addr = page_to_pfn(page) << PAGE_SHIFT; 1013 addr = page_to_pfn(page) << PAGE_SHIFT;
@@ -791,10 +1066,14 @@ static int kimage_load_normal_segment(struct kimage *image,
791 unsigned long maddr; 1066 unsigned long maddr;
792 size_t ubytes, mbytes; 1067 size_t ubytes, mbytes;
793 int result; 1068 int result;
794 unsigned char __user *buf; 1069 unsigned char __user *buf = NULL;
1070 unsigned char *kbuf = NULL;
795 1071
796 result = 0; 1072 result = 0;
797 buf = segment->buf; 1073 if (image->file_mode)
1074 kbuf = segment->kbuf;
1075 else
1076 buf = segment->buf;
798 ubytes = segment->bufsz; 1077 ubytes = segment->bufsz;
799 mbytes = segment->memsz; 1078 mbytes = segment->memsz;
800 maddr = segment->mem; 1079 maddr = segment->mem;
@@ -826,7 +1105,11 @@ static int kimage_load_normal_segment(struct kimage *image,
826 PAGE_SIZE - (maddr & ~PAGE_MASK)); 1105 PAGE_SIZE - (maddr & ~PAGE_MASK));
827 uchunk = min(ubytes, mchunk); 1106 uchunk = min(ubytes, mchunk);
828 1107
829 result = copy_from_user(ptr, buf, uchunk); 1108 /* For file based kexec, source pages are in kernel memory */
1109 if (image->file_mode)
1110 memcpy(ptr, kbuf, uchunk);
1111 else
1112 result = copy_from_user(ptr, buf, uchunk);
830 kunmap(page); 1113 kunmap(page);
831 if (result) { 1114 if (result) {
832 result = -EFAULT; 1115 result = -EFAULT;
@@ -834,7 +1117,10 @@ static int kimage_load_normal_segment(struct kimage *image,
834 } 1117 }
835 ubytes -= uchunk; 1118 ubytes -= uchunk;
836 maddr += mchunk; 1119 maddr += mchunk;
837 buf += mchunk; 1120 if (image->file_mode)
1121 kbuf += mchunk;
1122 else
1123 buf += mchunk;
838 mbytes -= mchunk; 1124 mbytes -= mchunk;
839 } 1125 }
840out: 1126out:
@@ -851,10 +1137,14 @@ static int kimage_load_crash_segment(struct kimage *image,
851 unsigned long maddr; 1137 unsigned long maddr;
852 size_t ubytes, mbytes; 1138 size_t ubytes, mbytes;
853 int result; 1139 int result;
854 unsigned char __user *buf; 1140 unsigned char __user *buf = NULL;
1141 unsigned char *kbuf = NULL;
855 1142
856 result = 0; 1143 result = 0;
857 buf = segment->buf; 1144 if (image->file_mode)
1145 kbuf = segment->kbuf;
1146 else
1147 buf = segment->buf;
858 ubytes = segment->bufsz; 1148 ubytes = segment->bufsz;
859 mbytes = segment->memsz; 1149 mbytes = segment->memsz;
860 maddr = segment->mem; 1150 maddr = segment->mem;
@@ -877,7 +1167,12 @@ static int kimage_load_crash_segment(struct kimage *image,
877 /* Zero the trailing part of the page */ 1167 /* Zero the trailing part of the page */
878 memset(ptr + uchunk, 0, mchunk - uchunk); 1168 memset(ptr + uchunk, 0, mchunk - uchunk);
879 } 1169 }
880 result = copy_from_user(ptr, buf, uchunk); 1170
1171 /* For file based kexec, source pages are in kernel memory */
1172 if (image->file_mode)
1173 memcpy(ptr, kbuf, uchunk);
1174 else
1175 result = copy_from_user(ptr, buf, uchunk);
881 kexec_flush_icache_page(page); 1176 kexec_flush_icache_page(page);
882 kunmap(page); 1177 kunmap(page);
883 if (result) { 1178 if (result) {
@@ -886,7 +1181,10 @@ static int kimage_load_crash_segment(struct kimage *image,
886 } 1181 }
887 ubytes -= uchunk; 1182 ubytes -= uchunk;
888 maddr += mchunk; 1183 maddr += mchunk;
889 buf += mchunk; 1184 if (image->file_mode)
1185 kbuf += mchunk;
1186 else
1187 buf += mchunk;
890 mbytes -= mchunk; 1188 mbytes -= mchunk;
891 } 1189 }
892out: 1190out:
@@ -986,16 +1284,16 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
986 1284
987 /* Loading another kernel to reboot into */ 1285 /* Loading another kernel to reboot into */
988 if ((flags & KEXEC_ON_CRASH) == 0) 1286 if ((flags & KEXEC_ON_CRASH) == 0)
989 result = kimage_normal_alloc(&image, entry, 1287 result = kimage_alloc_init(&image, entry, nr_segments,
990 nr_segments, segments); 1288 segments, flags);
991 /* Loading another kernel to switch to if this one crashes */ 1289 /* Loading another kernel to switch to if this one crashes */
992 else if (flags & KEXEC_ON_CRASH) { 1290 else if (flags & KEXEC_ON_CRASH) {
993 /* Free any current crash dump kernel before 1291 /* Free any current crash dump kernel before
994 * we corrupt it. 1292 * we corrupt it.
995 */ 1293 */
996 kimage_free(xchg(&kexec_crash_image, NULL)); 1294 kimage_free(xchg(&kexec_crash_image, NULL));
997 result = kimage_crash_alloc(&image, entry, 1295 result = kimage_alloc_init(&image, entry, nr_segments,
998 nr_segments, segments); 1296 segments, flags);
999 crash_map_reserved_pages(); 1297 crash_map_reserved_pages();
1000 } 1298 }
1001 if (result) 1299 if (result)
@@ -1077,6 +1375,82 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry,
1077} 1375}
1078#endif 1376#endif
1079 1377
1378SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
1379 unsigned long, cmdline_len, const char __user *, cmdline_ptr,
1380 unsigned long, flags)
1381{
1382 int ret = 0, i;
1383 struct kimage **dest_image, *image;
1384
1385 /* We only trust the superuser with rebooting the system. */
1386 if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
1387 return -EPERM;
1388
1389 /* Make sure we have a legal set of flags */
1390 if (flags != (flags & KEXEC_FILE_FLAGS))
1391 return -EINVAL;
1392
1393 image = NULL;
1394
1395 if (!mutex_trylock(&kexec_mutex))
1396 return -EBUSY;
1397
1398 dest_image = &kexec_image;
1399 if (flags & KEXEC_FILE_ON_CRASH)
1400 dest_image = &kexec_crash_image;
1401
1402 if (flags & KEXEC_FILE_UNLOAD)
1403 goto exchange;
1404
1405 /*
1406 * In case of crash, new kernel gets loaded in reserved region. It is
1407 * same memory where old crash kernel might be loaded. Free any
1408 * current crash dump kernel before we corrupt it.
1409 */
1410 if (flags & KEXEC_FILE_ON_CRASH)
1411 kimage_free(xchg(&kexec_crash_image, NULL));
1412
1413 ret = kimage_file_alloc_init(&image, kernel_fd, initrd_fd, cmdline_ptr,
1414 cmdline_len, flags);
1415 if (ret)
1416 goto out;
1417
1418 ret = machine_kexec_prepare(image);
1419 if (ret)
1420 goto out;
1421
1422 ret = kexec_calculate_store_digests(image);
1423 if (ret)
1424 goto out;
1425
1426 for (i = 0; i < image->nr_segments; i++) {
1427 struct kexec_segment *ksegment;
1428
1429 ksegment = &image->segment[i];
1430 pr_debug("Loading segment %d: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n",
1431 i, ksegment->buf, ksegment->bufsz, ksegment->mem,
1432 ksegment->memsz);
1433
1434 ret = kimage_load_segment(image, &image->segment[i]);
1435 if (ret)
1436 goto out;
1437 }
1438
1439 kimage_terminate(image);
1440
1441 /*
1442 * Free up any temporary buffers allocated which are not needed
1443 * after image has been loaded
1444 */
1445 kimage_file_post_load_cleanup(image);
1446exchange:
1447 image = xchg(dest_image, image);
1448out:
1449 mutex_unlock(&kexec_mutex);
1450 kimage_free(image);
1451 return ret;
1452}
1453
1080void crash_kexec(struct pt_regs *regs) 1454void crash_kexec(struct pt_regs *regs)
1081{ 1455{
1082 /* Take the kexec_mutex here to prevent sys_kexec_load 1456 /* Take the kexec_mutex here to prevent sys_kexec_load
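
kexec_file_load(2), added above, takes file descriptors instead of user-assembled segments, which is what allows the kernel itself to read, verify and hash the image. A sketch of a caller (there is no glibc wrapper, so syscall(2) is used directly; __NR_kexec_file_load and the file paths are assumptions for illustration, and CAP_SYS_BOOT is required):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

int main(void)
{
	const char cmdline[] = "console=ttyS0 root=/dev/sda1";
	int kernel_fd = open("/boot/vmlinuz", O_RDONLY);
	int initrd_fd = open("/boot/initrd.img", O_RDONLY);

	if (kernel_fd < 0 || initrd_fd < 0)
		return 1;

	/* args: kernel fd, initrd fd, cmdline length (incl. NUL), cmdline, flags */
	if (syscall(__NR_kexec_file_load, kernel_fd, initrd_fd,
		    sizeof(cmdline), cmdline, 0UL)) {
		perror("kexec_file_load");
		return 1;
	}
	return 0;
}
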
@@ -1632,6 +2006,683 @@ static int __init crash_save_vmcoreinfo_init(void)
1632 2006
1633subsys_initcall(crash_save_vmcoreinfo_init); 2007subsys_initcall(crash_save_vmcoreinfo_init);
1634 2008
2009static int __kexec_add_segment(struct kimage *image, char *buf,
2010 unsigned long bufsz, unsigned long mem,
2011 unsigned long memsz)
2012{
2013 struct kexec_segment *ksegment;
2014
2015 ksegment = &image->segment[image->nr_segments];
2016 ksegment->kbuf = buf;
2017 ksegment->bufsz = bufsz;
2018 ksegment->mem = mem;
2019 ksegment->memsz = memsz;
2020 image->nr_segments++;
2021
2022 return 0;
2023}
2024
2025static int locate_mem_hole_top_down(unsigned long start, unsigned long end,
2026 struct kexec_buf *kbuf)
2027{
2028 struct kimage *image = kbuf->image;
2029 unsigned long temp_start, temp_end;
2030
2031 temp_end = min(end, kbuf->buf_max);
2032 temp_start = temp_end - kbuf->memsz;
2033
2034 do {
2035 /* align down start */
2036 temp_start = temp_start & (~(kbuf->buf_align - 1));
2037
2038 if (temp_start < start || temp_start < kbuf->buf_min)
2039 return 0;
2040
2041 temp_end = temp_start + kbuf->memsz - 1;
2042
2043 /*
2044 * Make sure this does not conflict with any of existing
2045 * segments
2046 */
2047 if (kimage_is_destination_range(image, temp_start, temp_end)) {
2048 temp_start = temp_start - PAGE_SIZE;
2049 continue;
2050 }
2051
2052 /* We found a suitable memory range */
2053 break;
2054 } while (1);
2055
2056 /* If we are here, we found a suitable memory range */
2057 __kexec_add_segment(image, kbuf->buffer, kbuf->bufsz, temp_start,
2058 kbuf->memsz);
2059
2060 /* Success, stop navigating through remaining System RAM ranges */
2061 return 1;
2062}
2063
2064static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end,
2065 struct kexec_buf *kbuf)
2066{
2067 struct kimage *image = kbuf->image;
2068 unsigned long temp_start, temp_end;
2069
2070 temp_start = max(start, kbuf->buf_min);
2071
2072 do {
2073 temp_start = ALIGN(temp_start, kbuf->buf_align);
2074 temp_end = temp_start + kbuf->memsz - 1;
2075
2076 if (temp_end > end || temp_end > kbuf->buf_max)
2077 return 0;
2078 /*
2079 * Make sure this does not conflict with any of existing
2080 * segments
2081 */
2082 if (kimage_is_destination_range(image, temp_start, temp_end)) {
2083 temp_start = temp_start + PAGE_SIZE;
2084 continue;
2085 }
2086
2087 /* We found a suitable memory range */
2088 break;
2089 } while (1);
2090
2091 /* If we are here, we found a suitable memory range */
2092 __kexec_add_segment(image, kbuf->buffer, kbuf->bufsz, temp_start,
2093 kbuf->memsz);
2094
2095 /* Success, stop navigating through remaining System RAM ranges */
2096 return 1;
2097}
2098
2099static int locate_mem_hole_callback(u64 start, u64 end, void *arg)
2100{
2101 struct kexec_buf *kbuf = (struct kexec_buf *)arg;
2102 unsigned long sz = end - start + 1;
2103
2104	/* Returning 0 moves on to the next memory range */
2105 if (sz < kbuf->memsz)
2106 return 0;
2107
2108 if (end < kbuf->buf_min || start > kbuf->buf_max)
2109 return 0;
2110
2111 /*
2112	 * Allocate memory top down within the RAM range. Otherwise bottom up
2113 * allocation.
2114 */
2115 if (kbuf->top_down)
2116 return locate_mem_hole_top_down(start, end, kbuf);
2117 return locate_mem_hole_bottom_up(start, end, kbuf);
2118}
2119
2120/*
2121 * Helper function for placing a buffer in a kexec segment. This assumes
2122 * that kexec_mutex is held.
2123 */
2124int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz,
2125 unsigned long memsz, unsigned long buf_align,
2126 unsigned long buf_min, unsigned long buf_max,
2127 bool top_down, unsigned long *load_addr)
2128{
2129
2130 struct kexec_segment *ksegment;
2131 struct kexec_buf buf, *kbuf;
2132 int ret;
2133
2134 /* Currently adding segment this way is allowed only in file mode */
2135 if (!image->file_mode)
2136 return -EINVAL;
2137
2138 if (image->nr_segments >= KEXEC_SEGMENT_MAX)
2139 return -EINVAL;
2140
2141 /*
2142 * Make sure we are not trying to add buffer after allocating
2143 * control pages. All segments need to be placed first before
2144 * any control pages are allocated. As control page allocation
2145 * logic goes through list of segments to make sure there are
2146 * no destination overlaps.
2147 */
2148 if (!list_empty(&image->control_pages)) {
2149 WARN_ON(1);
2150 return -EINVAL;
2151 }
2152
2153 memset(&buf, 0, sizeof(struct kexec_buf));
2154 kbuf = &buf;
2155 kbuf->image = image;
2156 kbuf->buffer = buffer;
2157 kbuf->bufsz = bufsz;
2158
2159 kbuf->memsz = ALIGN(memsz, PAGE_SIZE);
2160 kbuf->buf_align = max(buf_align, PAGE_SIZE);
2161 kbuf->buf_min = buf_min;
2162 kbuf->buf_max = buf_max;
2163 kbuf->top_down = top_down;
2164
2165 /* Walk the RAM ranges and allocate a suitable range for the buffer */
2166 if (image->type == KEXEC_TYPE_CRASH)
2167 ret = walk_iomem_res("Crash kernel",
2168 IORESOURCE_MEM | IORESOURCE_BUSY,
2169 crashk_res.start, crashk_res.end, kbuf,
2170 locate_mem_hole_callback);
2171 else
2172 ret = walk_system_ram_res(0, -1, kbuf,
2173 locate_mem_hole_callback);
2174 if (ret != 1) {
2175 /* A suitable memory range could not be found for buffer */
2176 return -EADDRNOTAVAIL;
2177 }
2178
2179 /* Found a suitable memory range */
2180 ksegment = &image->segment[image->nr_segments - 1];
2181 *load_addr = ksegment->mem;
2182 return 0;
2183}
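
kexec_add_buffer() above is the hook an architecture's image loader uses to place a kernel-memory blob into the segment list; the hole search, alignment and overlap checks are handled by the callbacks just defined. A sketch of a caller, with the blob assumed to come from an arch_kexec_kernel_image_load() implementation (placement limits are illustrative):

static int example_place_blob(struct kimage *image, char *blob,
			      unsigned long blob_len)
{
	unsigned long load_addr;
	int ret;

	ret = kexec_add_buffer(image, blob, blob_len,
			       blob_len,	/* memsz: no trailing zero fill */
			       PAGE_SIZE,	/* buf_align */
			       0,		/* buf_min */
			       ULONG_MAX,	/* buf_max: no upper limit */
			       true,		/* search memory top down */
			       &load_addr);
	if (ret)
		return ret;

	pr_debug("blob placed at 0x%lx\n", load_addr);
	return 0;
}
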
2184
2185/* Calculate and store the digest of segments */
2186static int kexec_calculate_store_digests(struct kimage *image)
2187{
2188 struct crypto_shash *tfm;
2189 struct shash_desc *desc;
2190 int ret = 0, i, j, zero_buf_sz, sha_region_sz;
2191 size_t desc_size, nullsz;
2192 char *digest;
2193 void *zero_buf;
2194 struct kexec_sha_region *sha_regions;
2195 struct purgatory_info *pi = &image->purgatory_info;
2196
2197 zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT);
2198 zero_buf_sz = PAGE_SIZE;
2199
2200 tfm = crypto_alloc_shash("sha256", 0, 0);
2201 if (IS_ERR(tfm)) {
2202 ret = PTR_ERR(tfm);
2203 goto out;
2204 }
2205
2206 desc_size = crypto_shash_descsize(tfm) + sizeof(*desc);
2207 desc = kzalloc(desc_size, GFP_KERNEL);
2208 if (!desc) {
2209 ret = -ENOMEM;
2210 goto out_free_tfm;
2211 }
2212
2213 sha_region_sz = KEXEC_SEGMENT_MAX * sizeof(struct kexec_sha_region);
2214 sha_regions = vzalloc(sha_region_sz);
2215 if (!sha_regions)
2216 goto out_free_desc;
2217
2218 desc->tfm = tfm;
2219 desc->flags = 0;
2220
2221 ret = crypto_shash_init(desc);
2222 if (ret < 0)
2223 goto out_free_sha_regions;
2224
2225 digest = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL);
2226 if (!digest) {
2227 ret = -ENOMEM;
2228 goto out_free_sha_regions;
2229 }
2230
2231 for (j = i = 0; i < image->nr_segments; i++) {
2232 struct kexec_segment *ksegment;
2233
2234 ksegment = &image->segment[i];
2235 /*
2236 * Skip purgatory as it will be modified once we put digest
2237 * info in purgatory.
2238 */
2239 if (ksegment->kbuf == pi->purgatory_buf)
2240 continue;
2241
2242 ret = crypto_shash_update(desc, ksegment->kbuf,
2243 ksegment->bufsz);
2244 if (ret)
2245 break;
2246
2247 /*
2248 * Assume rest of the buffer is filled with zero and
2249 * update digest accordingly.
2250 */
2251 nullsz = ksegment->memsz - ksegment->bufsz;
2252 while (nullsz) {
2253 unsigned long bytes = nullsz;
2254
2255 if (bytes > zero_buf_sz)
2256 bytes = zero_buf_sz;
2257 ret = crypto_shash_update(desc, zero_buf, bytes);
2258 if (ret)
2259 break;
2260 nullsz -= bytes;
2261 }
2262
2263 if (ret)
2264 break;
2265
2266 sha_regions[j].start = ksegment->mem;
2267 sha_regions[j].len = ksegment->memsz;
2268 j++;
2269 }
2270
2271 if (!ret) {
2272 ret = crypto_shash_final(desc, digest);
2273 if (ret)
2274 goto out_free_digest;
2275 ret = kexec_purgatory_get_set_symbol(image, "sha_regions",
2276 sha_regions, sha_region_sz, 0);
2277 if (ret)
2278 goto out_free_digest;
2279
2280 ret = kexec_purgatory_get_set_symbol(image, "sha256_digest",
2281 digest, SHA256_DIGEST_SIZE, 0);
2282 if (ret)
2283 goto out_free_digest;
2284 }
2285
2286out_free_digest:
2287 kfree(digest);
2288out_free_sha_regions:
2289 vfree(sha_regions);
2290out_free_desc:
2291 kfree(desc);
2292out_free_tfm:
2293 kfree(tfm);
2294out:
2295 return ret;
2296}
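
The digest code follows the usual synchronous-hash ("shash") pattern of the kernel crypto API: allocate a transform, allocate a descriptor sized by crypto_shash_descsize(), then init/update/final. Note that it also hashes ZERO_PAGE data for the memsz - bufsz tail of every segment, so the recorded digest covers the segment as it will exist in memory, not just the bytes copied from the buffer. A stripped-down sketch of the same shash pattern for a single buffer (assumes "sha256" is available and @out holds SHA256_DIGEST_SIZE bytes):

	#include <crypto/hash.h>
	#include <crypto/sha.h>
	#include <linux/err.h>
	#include <linux/slab.h>

	static int sha256_one_buffer(const void *data, unsigned int len,
				     u8 out[SHA256_DIGEST_SIZE])
	{
		struct crypto_shash *tfm;
		struct shash_desc *desc;
		int ret;

		tfm = crypto_alloc_shash("sha256", 0, 0);
		if (IS_ERR(tfm))
			return PTR_ERR(tfm);

		desc = kzalloc(sizeof(*desc) + crypto_shash_descsize(tfm),
			       GFP_KERNEL);
		if (!desc) {
			crypto_free_shash(tfm);
			return -ENOMEM;
		}
		desc->tfm = tfm;	/* kzalloc() already cleared the rest */

		ret = crypto_shash_init(desc);
		if (!ret)
			ret = crypto_shash_update(desc, data, len);
		if (!ret)
			ret = crypto_shash_final(desc, out);

		kfree(desc);
		crypto_free_shash(tfm);
		return ret;
	}
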
2297
2298/* Actually load purgatory. Lot of code taken from kexec-tools */
2299static int __kexec_load_purgatory(struct kimage *image, unsigned long min,
2300 unsigned long max, int top_down)
2301{
2302 struct purgatory_info *pi = &image->purgatory_info;
2303 unsigned long align, buf_align, bss_align, buf_sz, bss_sz, bss_pad;
2304 unsigned long memsz, entry, load_addr, curr_load_addr, bss_addr, offset;
2305 unsigned char *buf_addr, *src;
2306 int i, ret = 0, entry_sidx = -1;
2307 const Elf_Shdr *sechdrs_c;
2308 Elf_Shdr *sechdrs = NULL;
2309 void *purgatory_buf = NULL;
2310
2311 /*
 2312	 * sechdrs_c points to the section headers in purgatory, which are
 2313	 * read-only. No modifications allowed.
2314 */
2315 sechdrs_c = (void *)pi->ehdr + pi->ehdr->e_shoff;
2316
2317 /*
2318 * We can not modify sechdrs_c[] and its fields. It is read only.
2319 * Copy it over to a local copy where one can store some temporary
2320 * data and free it at the end. We need to modify ->sh_addr and
2321 * ->sh_offset fields to keep track of permanent and temporary
2322 * locations of sections.
2323 */
2324 sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr));
2325 if (!sechdrs)
2326 return -ENOMEM;
2327
2328 memcpy(sechdrs, sechdrs_c, pi->ehdr->e_shnum * sizeof(Elf_Shdr));
2329
2330 /*
 2331	 * We seem to have multiple copies of sections. The first copy is the one
 2332	 * embedded in the kernel in a read-only section. Some of these sections
2333 * will be copied to a temporary buffer and relocated. And these
2334 * sections will finally be copied to their final destination at
2335 * segment load time.
2336 *
2337 * Use ->sh_offset to reflect section address in memory. It will
2338 * point to original read only copy if section is not allocatable.
2339 * Otherwise it will point to temporary copy which will be relocated.
2340 *
2341 * Use ->sh_addr to contain final address of the section where it
2342 * will go during execution time.
2343 */
2344 for (i = 0; i < pi->ehdr->e_shnum; i++) {
2345 if (sechdrs[i].sh_type == SHT_NOBITS)
2346 continue;
2347
2348 sechdrs[i].sh_offset = (unsigned long)pi->ehdr +
2349 sechdrs[i].sh_offset;
2350 }
2351
2352 /*
2353 * Identify entry point section and make entry relative to section
2354 * start.
2355 */
2356 entry = pi->ehdr->e_entry;
2357 for (i = 0; i < pi->ehdr->e_shnum; i++) {
2358 if (!(sechdrs[i].sh_flags & SHF_ALLOC))
2359 continue;
2360
2361 if (!(sechdrs[i].sh_flags & SHF_EXECINSTR))
2362 continue;
2363
2364 /* Make entry section relative */
2365 if (sechdrs[i].sh_addr <= pi->ehdr->e_entry &&
2366 ((sechdrs[i].sh_addr + sechdrs[i].sh_size) >
2367 pi->ehdr->e_entry)) {
2368 entry_sidx = i;
2369 entry -= sechdrs[i].sh_addr;
2370 break;
2371 }
2372 }
2373
2374 /* Determine how much memory is needed to load relocatable object. */
2375 buf_align = 1;
2376 bss_align = 1;
2377 buf_sz = 0;
2378 bss_sz = 0;
2379
2380 for (i = 0; i < pi->ehdr->e_shnum; i++) {
2381 if (!(sechdrs[i].sh_flags & SHF_ALLOC))
2382 continue;
2383
2384 align = sechdrs[i].sh_addralign;
2385 if (sechdrs[i].sh_type != SHT_NOBITS) {
2386 if (buf_align < align)
2387 buf_align = align;
2388 buf_sz = ALIGN(buf_sz, align);
2389 buf_sz += sechdrs[i].sh_size;
2390 } else {
2391 /* bss section */
2392 if (bss_align < align)
2393 bss_align = align;
2394 bss_sz = ALIGN(bss_sz, align);
2395 bss_sz += sechdrs[i].sh_size;
2396 }
2397 }
2398
2399 /* Determine the bss padding required to align bss properly */
2400 bss_pad = 0;
2401 if (buf_sz & (bss_align - 1))
2402 bss_pad = bss_align - (buf_sz & (bss_align - 1));
2403
2404 memsz = buf_sz + bss_pad + bss_sz;
2405
2406 /* Allocate buffer for purgatory */
2407 purgatory_buf = vzalloc(buf_sz);
2408 if (!purgatory_buf) {
2409 ret = -ENOMEM;
2410 goto out;
2411 }
2412
2413 if (buf_align < bss_align)
2414 buf_align = bss_align;
2415
2416 /* Add buffer to segment list */
2417 ret = kexec_add_buffer(image, purgatory_buf, buf_sz, memsz,
2418 buf_align, min, max, top_down,
2419 &pi->purgatory_load_addr);
2420 if (ret)
2421 goto out;
2422
2423 /* Load SHF_ALLOC sections */
2424 buf_addr = purgatory_buf;
2425 load_addr = curr_load_addr = pi->purgatory_load_addr;
2426 bss_addr = load_addr + buf_sz + bss_pad;
2427
2428 for (i = 0; i < pi->ehdr->e_shnum; i++) {
2429 if (!(sechdrs[i].sh_flags & SHF_ALLOC))
2430 continue;
2431
2432 align = sechdrs[i].sh_addralign;
2433 if (sechdrs[i].sh_type != SHT_NOBITS) {
2434 curr_load_addr = ALIGN(curr_load_addr, align);
2435 offset = curr_load_addr - load_addr;
 2436			/* We already modified ->sh_offset to keep src addr */
2437 src = (char *) sechdrs[i].sh_offset;
2438 memcpy(buf_addr + offset, src, sechdrs[i].sh_size);
2439
2440 /* Store load address and source address of section */
2441 sechdrs[i].sh_addr = curr_load_addr;
2442
2443 /*
2444 * This section got copied to temporary buffer. Update
2445 * ->sh_offset accordingly.
2446 */
2447 sechdrs[i].sh_offset = (unsigned long)(buf_addr + offset);
2448
2449 /* Advance to the next address */
2450 curr_load_addr += sechdrs[i].sh_size;
2451 } else {
2452 bss_addr = ALIGN(bss_addr, align);
2453 sechdrs[i].sh_addr = bss_addr;
2454 bss_addr += sechdrs[i].sh_size;
2455 }
2456 }
2457
2458 /* Update entry point based on load address of text section */
2459 if (entry_sidx >= 0)
2460 entry += sechdrs[entry_sidx].sh_addr;
2461
2462 /* Make kernel jump to purgatory after shutdown */
2463 image->start = entry;
2464
2465 /* Used later to get/set symbol values */
2466 pi->sechdrs = sechdrs;
2467
2468 /*
2469 * Used later to identify which section is purgatory and skip it
2470 * from checksumming.
2471 */
2472 pi->purgatory_buf = purgatory_buf;
2473 return ret;
2474out:
2475 vfree(sechdrs);
2476 vfree(purgatory_buf);
2477 return ret;
2478}
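
The loaded purgatory image ends up laid out as progbits sections, then bss_pad, then bss; bss_pad exists only to round the end of the copied data up to bss_align before kexec_add_buffer() rounds the whole allocation to page size. With made-up numbers the padding arithmetic looks like this:

	/* Hypothetical sizes, purely to illustrate the padding computed above. */
	unsigned long buf_sz    = 0x1230;	/* bytes of SHF_ALLOC progbits */
	unsigned long bss_align = 0x100;
	unsigned long bss_sz    = 0x480;
	unsigned long bss_pad   = 0;

	if (buf_sz & (bss_align - 1))		/* 0x1230 & 0xff = 0x30        */
		bss_pad = bss_align - (buf_sz & (bss_align - 1));	/* 0xd0 */

	/* memsz = 0x1230 + 0xd0 + 0x480 = 0x1780; bss begins at load_addr + 0x1300 */

Note also the double bookkeeping that the later helpers rely on: after this function, sechdrs[i].sh_offset points into the temporary purgatory_buf copy while sh_addr holds the section's final run-time address.
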
2479
2480static int kexec_apply_relocations(struct kimage *image)
2481{
2482 int i, ret;
2483 struct purgatory_info *pi = &image->purgatory_info;
2484 Elf_Shdr *sechdrs = pi->sechdrs;
2485
2486 /* Apply relocations */
2487 for (i = 0; i < pi->ehdr->e_shnum; i++) {
2488 Elf_Shdr *section, *symtab;
2489
2490 if (sechdrs[i].sh_type != SHT_RELA &&
2491 sechdrs[i].sh_type != SHT_REL)
2492 continue;
2493
2494 /*
2495 * For section of type SHT_RELA/SHT_REL,
2496 * ->sh_link contains section header index of associated
2497 * symbol table. And ->sh_info contains section header
2498 * index of section to which relocations apply.
2499 */
2500 if (sechdrs[i].sh_info >= pi->ehdr->e_shnum ||
2501 sechdrs[i].sh_link >= pi->ehdr->e_shnum)
2502 return -ENOEXEC;
2503
2504 section = &sechdrs[sechdrs[i].sh_info];
2505 symtab = &sechdrs[sechdrs[i].sh_link];
2506
2507 if (!(section->sh_flags & SHF_ALLOC))
2508 continue;
2509
2510 /*
 2511		 * symtab->sh_link contains the section header index of the associated
2512 * string table.
2513 */
2514 if (symtab->sh_link >= pi->ehdr->e_shnum)
2515 /* Invalid section number? */
2516 continue;
2517
2518 /*
 2519		 * The respective architecture needs to provide support for applying
2520 * relocations of type SHT_RELA/SHT_REL.
2521 */
2522 if (sechdrs[i].sh_type == SHT_RELA)
2523 ret = arch_kexec_apply_relocations_add(pi->ehdr,
2524 sechdrs, i);
2525 else if (sechdrs[i].sh_type == SHT_REL)
2526 ret = arch_kexec_apply_relocations(pi->ehdr,
2527 sechdrs, i);
2528 if (ret)
2529 return ret;
2530 }
2531
2532 return 0;
2533}
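
kexec_apply_relocations() only sanity-checks the section indices and then defers to the architecture, which receives the ELF header, the adjusted section header copies and the index of the relocation section. A rough, purely illustrative shape of such a hook for RELA entries is sketched below; the function name is invented, and a real implementation (x86_64, for instance) handles each relocation type explicitly rather than rejecting it:

	/* Illustrative sketch only -- not an in-tree implementation. */
	static int example_apply_relocations_add(const Elf_Ehdr *ehdr,
						 Elf_Shdr *sechdrs,
						 unsigned int relsec)
	{
		Elf64_Rela *rel = (void *)sechdrs[relsec].sh_offset;
		Elf_Shdr *target = &sechdrs[sechdrs[relsec].sh_info];
		Elf_Sym *symtab = (void *)sechdrs[sechdrs[relsec].sh_link].sh_offset;
		unsigned long i, nr = sechdrs[relsec].sh_size / sizeof(*rel);

		for (i = 0; i < nr; i++) {
			/* Patch location in the temporary copy (sh_offset)... */
			void *loc = (void *)(target->sh_offset + rel[i].r_offset);
			/* ...with the symbol's final run-time address (sh_addr). */
			Elf_Sym *sym = &symtab[ELF64_R_SYM(rel[i].r_info)];
			unsigned long val = sechdrs[sym->st_shndx].sh_addr +
					    sym->st_value + rel[i].r_addend;

			switch (ELF64_R_TYPE(rel[i].r_info)) {
			/* e.g. case R_X86_64_64: *(u64 *)loc = val; break;   */
			default:
				pr_err("unsupported relocation %lu (loc %p val %lx)\n",
				       (unsigned long)ELF64_R_TYPE(rel[i].r_info),
				       loc, val);
				return -ENOEXEC;
			}
		}
		return 0;
	}
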
2534
2535/* Load relocatable purgatory object and relocate it appropriately */
2536int kexec_load_purgatory(struct kimage *image, unsigned long min,
2537 unsigned long max, int top_down,
2538 unsigned long *load_addr)
2539{
2540 struct purgatory_info *pi = &image->purgatory_info;
2541 int ret;
2542
2543 if (kexec_purgatory_size <= 0)
2544 return -EINVAL;
2545
2546 if (kexec_purgatory_size < sizeof(Elf_Ehdr))
2547 return -ENOEXEC;
2548
2549 pi->ehdr = (Elf_Ehdr *)kexec_purgatory;
2550
2551 if (memcmp(pi->ehdr->e_ident, ELFMAG, SELFMAG) != 0
2552 || pi->ehdr->e_type != ET_REL
2553 || !elf_check_arch(pi->ehdr)
2554 || pi->ehdr->e_shentsize != sizeof(Elf_Shdr))
2555 return -ENOEXEC;
2556
2557 if (pi->ehdr->e_shoff >= kexec_purgatory_size
2558 || (pi->ehdr->e_shnum * sizeof(Elf_Shdr) >
2559 kexec_purgatory_size - pi->ehdr->e_shoff))
2560 return -ENOEXEC;
2561
2562 ret = __kexec_load_purgatory(image, min, max, top_down);
2563 if (ret)
2564 return ret;
2565
2566 ret = kexec_apply_relocations(image);
2567 if (ret)
2568 goto out;
2569
2570 *load_addr = pi->purgatory_load_addr;
2571 return 0;
2572out:
2573 vfree(pi->sechdrs);
2574 vfree(pi->purgatory_buf);
2575 return ret;
2576}
2577
2578static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi,
2579 const char *name)
2580{
2581 Elf_Sym *syms;
2582 Elf_Shdr *sechdrs;
2583 Elf_Ehdr *ehdr;
2584 int i, k;
2585 const char *strtab;
2586
2587 if (!pi->sechdrs || !pi->ehdr)
2588 return NULL;
2589
2590 sechdrs = pi->sechdrs;
2591 ehdr = pi->ehdr;
2592
2593 for (i = 0; i < ehdr->e_shnum; i++) {
2594 if (sechdrs[i].sh_type != SHT_SYMTAB)
2595 continue;
2596
2597 if (sechdrs[i].sh_link >= ehdr->e_shnum)
2598 /* Invalid strtab section number */
2599 continue;
2600 strtab = (char *)sechdrs[sechdrs[i].sh_link].sh_offset;
2601 syms = (Elf_Sym *)sechdrs[i].sh_offset;
2602
2603 /* Go through symbols for a match */
2604 for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) {
2605 if (ELF_ST_BIND(syms[k].st_info) != STB_GLOBAL)
2606 continue;
2607
2608 if (strcmp(strtab + syms[k].st_name, name) != 0)
2609 continue;
2610
2611 if (syms[k].st_shndx == SHN_UNDEF ||
2612 syms[k].st_shndx >= ehdr->e_shnum) {
2613 pr_debug("Symbol: %s has bad section index %d.\n",
2614 name, syms[k].st_shndx);
2615 return NULL;
2616 }
2617
2618 /* Found the symbol we are looking for */
2619 return &syms[k];
2620 }
2621 }
2622
2623 return NULL;
2624}
2625
2626void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name)
2627{
2628 struct purgatory_info *pi = &image->purgatory_info;
2629 Elf_Sym *sym;
2630 Elf_Shdr *sechdr;
2631
2632 sym = kexec_purgatory_find_symbol(pi, name);
2633 if (!sym)
2634 return ERR_PTR(-EINVAL);
2635
2636 sechdr = &pi->sechdrs[sym->st_shndx];
2637
2638 /*
2639 * Returns the address where symbol will finally be loaded after
2640 * kexec_load_segment()
2641 */
2642 return (void *)(sechdr->sh_addr + sym->st_value);
2643}
2644
2645/*
2646 * Get or set value of a symbol. If "get_value" is true, symbol value is
2647 * returned in buf otherwise symbol value is set based on value in buf.
2648 */
2649int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name,
2650 void *buf, unsigned int size, bool get_value)
2651{
2652 Elf_Sym *sym;
2653 Elf_Shdr *sechdrs;
2654 struct purgatory_info *pi = &image->purgatory_info;
2655 char *sym_buf;
2656
2657 sym = kexec_purgatory_find_symbol(pi, name);
2658 if (!sym)
2659 return -EINVAL;
2660
2661 if (sym->st_size != size) {
2662 pr_err("symbol %s size mismatch: expected %lu actual %u\n",
2663 name, (unsigned long)sym->st_size, size);
2664 return -EINVAL;
2665 }
2666
2667 sechdrs = pi->sechdrs;
2668
2669 if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) {
2670 pr_err("symbol %s is in a bss section. Cannot %s\n", name,
2671 get_value ? "get" : "set");
2672 return -EINVAL;
2673 }
2674
2675 sym_buf = (unsigned char *)sechdrs[sym->st_shndx].sh_offset +
2676 sym->st_value;
2677
2678 if (get_value)
2679 memcpy((void *)buf, sym_buf, size);
2680 else
2681 memcpy((void *)sym_buf, buf, size);
2682
2683 return 0;
2684}
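
Putting the helpers together, an architecture's kexec_file loader is expected to load purgatory into a given physical window first and then patch purgatory's global variables by name, roughly as in the fragment below. The symbol "entry_point", the variables and the address window are hypothetical; the digest code above uses the very same helper for "sha_regions" and "sha256_digest":

	/* Fragment of a hypothetical arch loader; image, min_addr and max_addr
	 * are assumed to come from the surrounding code. */
	unsigned long purgatory_load_addr;
	u64 next_entry = 0;			/* value to patch in */
	int ret;

	ret = kexec_load_purgatory(image, min_addr, max_addr, 1 /* top_down */,
				   &purgatory_load_addr);
	if (ret)
		return ret;

	/* get_value == false: copy @next_entry into purgatory's "entry_point". */
	ret = kexec_purgatory_get_set_symbol(image, "entry_point", &next_entry,
					     sizeof(next_entry), false);
	if (ret)
		return ret;

	/* get_value == true would copy the symbol's current value back instead. */
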
2685
1635/* 2686/*
1636 * Move into place and start executing a preloaded standalone 2687 * Move into place and start executing a preloaded standalone
1637 * executable. If nothing was preloaded return an error. 2688 * executable. If nothing was preloaded return an error.
diff --git a/kernel/module.c b/kernel/module.c
index ae79ce615cb9..03214bd288e9 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3304,6 +3304,11 @@ static int load_module(struct load_info *info, const char __user *uargs,
3304 mutex_lock(&module_mutex); 3304 mutex_lock(&module_mutex);
3305 module_bug_cleanup(mod); 3305 module_bug_cleanup(mod);
3306 mutex_unlock(&module_mutex); 3306 mutex_unlock(&module_mutex);
3307
3308 /* we can't deallocate the module until we clear memory protection */
3309 unset_module_init_ro_nx(mod);
3310 unset_module_core_ro_nx(mod);
3311
3307 ddebug_cleanup: 3312 ddebug_cleanup:
3308 dynamic_debug_remove(info->debug); 3313 dynamic_debug_remove(info->debug);
3309 synchronize_sched(); 3314 synchronize_sched();
@@ -3381,6 +3386,8 @@ static inline int within(unsigned long addr, void *start, unsigned long size)
3381 */ 3386 */
3382static inline int is_arm_mapping_symbol(const char *str) 3387static inline int is_arm_mapping_symbol(const char *str)
3383{ 3388{
3389 if (str[0] == '.' && str[1] == 'L')
3390 return true;
3384 return str[0] == '$' && strchr("atd", str[1]) 3391 return str[0] == '$' && strchr("atd", str[1])
3385 && (str[2] == '\0' || str[2] == '.'); 3392 && (str[2] == '\0' || str[2] == '.');
3386} 3393}
@@ -3444,8 +3451,7 @@ const char *module_address_lookup(unsigned long addr,
3444 list_for_each_entry_rcu(mod, &modules, list) { 3451 list_for_each_entry_rcu(mod, &modules, list) {
3445 if (mod->state == MODULE_STATE_UNFORMED) 3452 if (mod->state == MODULE_STATE_UNFORMED)
3446 continue; 3453 continue;
3447 if (within_module_init(addr, mod) || 3454 if (within_module(addr, mod)) {
3448 within_module_core(addr, mod)) {
3449 if (modname) 3455 if (modname)
3450 *modname = mod->name; 3456 *modname = mod->name;
3451 ret = get_ksymbol(mod, addr, size, offset); 3457 ret = get_ksymbol(mod, addr, size, offset);
@@ -3469,8 +3475,7 @@ int lookup_module_symbol_name(unsigned long addr, char *symname)
3469 list_for_each_entry_rcu(mod, &modules, list) { 3475 list_for_each_entry_rcu(mod, &modules, list) {
3470 if (mod->state == MODULE_STATE_UNFORMED) 3476 if (mod->state == MODULE_STATE_UNFORMED)
3471 continue; 3477 continue;
3472 if (within_module_init(addr, mod) || 3478 if (within_module(addr, mod)) {
3473 within_module_core(addr, mod)) {
3474 const char *sym; 3479 const char *sym;
3475 3480
3476 sym = get_ksymbol(mod, addr, NULL, NULL); 3481 sym = get_ksymbol(mod, addr, NULL, NULL);
@@ -3495,8 +3500,7 @@ int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size,
3495 list_for_each_entry_rcu(mod, &modules, list) { 3500 list_for_each_entry_rcu(mod, &modules, list) {
3496 if (mod->state == MODULE_STATE_UNFORMED) 3501 if (mod->state == MODULE_STATE_UNFORMED)
3497 continue; 3502 continue;
3498 if (within_module_init(addr, mod) || 3503 if (within_module(addr, mod)) {
3499 within_module_core(addr, mod)) {
3500 const char *sym; 3504 const char *sym;
3501 3505
3502 sym = get_ksymbol(mod, addr, size, offset); 3506 sym = get_ksymbol(mod, addr, size, offset);
@@ -3760,8 +3764,7 @@ struct module *__module_address(unsigned long addr)
3760 list_for_each_entry_rcu(mod, &modules, list) { 3764 list_for_each_entry_rcu(mod, &modules, list) {
3761 if (mod->state == MODULE_STATE_UNFORMED) 3765 if (mod->state == MODULE_STATE_UNFORMED)
3762 continue; 3766 continue;
3763 if (within_module_core(addr, mod) 3767 if (within_module(addr, mod))
3764 || within_module_init(addr, mod))
3765 return mod; 3768 return mod;
3766 } 3769 }
3767 return NULL; 3770 return NULL;
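
All four hunks replace the open-coded pair of range checks with a single within_module() test; the helper is presumably introduced alongside this change in include/linux/module.h as nothing more than:

	static inline bool within_module(unsigned long addr,
					 const struct module *mod)
	{
		return within_module_init(addr, mod) ||
		       within_module_core(addr, mod);
	}
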
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 8e7811086b82..ef42d0ab3115 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -204,20 +204,13 @@ void switch_task_namespaces(struct task_struct *p, struct nsproxy *new)
204 204
205 might_sleep(); 205 might_sleep();
206 206
207 task_lock(p);
207 ns = p->nsproxy; 208 ns = p->nsproxy;
209 p->nsproxy = new;
210 task_unlock(p);
208 211
209 rcu_assign_pointer(p->nsproxy, new); 212 if (ns && atomic_dec_and_test(&ns->count))
210
211 if (ns && atomic_dec_and_test(&ns->count)) {
212 /*
213 * wait for others to get what they want from this nsproxy.
214 *
215 * cannot release this nsproxy via the call_rcu() since
216 * put_mnt_ns() will want to sleep
217 */
218 synchronize_rcu();
219 free_nsproxy(ns); 213 free_nsproxy(ns);
220 }
221} 214}
222 215
223void exit_task_namespaces(struct task_struct *p) 216void exit_task_namespaces(struct task_struct *p)
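
After this hunk the function reads roughly as below: the nsproxy pointer is swapped under task_lock() instead of with rcu_assign_pointer(), and since readers are now expected to take the task lock rather than rely on RCU here, the synchronize_rcu() before free_nsproxy() can be dropped and the last reference is freed directly:

	void switch_task_namespaces(struct task_struct *p, struct nsproxy *new)
	{
		struct nsproxy *ns;

		might_sleep();

		task_lock(p);
		ns = p->nsproxy;
		p->nsproxy = new;
		task_unlock(p);

		if (ns && atomic_dec_and_test(&ns->count))
			free_nsproxy(ns);
	}
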
diff --git a/kernel/panic.c b/kernel/panic.c
index 62e16cef9cc2..d09dc5c32c67 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -224,6 +224,7 @@ static const struct tnt tnts[] = {
224 { TAINT_FIRMWARE_WORKAROUND, 'I', ' ' }, 224 { TAINT_FIRMWARE_WORKAROUND, 'I', ' ' },
225 { TAINT_OOT_MODULE, 'O', ' ' }, 225 { TAINT_OOT_MODULE, 'O', ' ' },
226 { TAINT_UNSIGNED_MODULE, 'E', ' ' }, 226 { TAINT_UNSIGNED_MODULE, 'E', ' ' },
227 { TAINT_SOFTLOCKUP, 'L', ' ' },
227}; 228};
228 229
229/** 230/**
diff --git a/kernel/params.c b/kernel/params.c
index 1e52ca233fd9..34f527023794 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -256,6 +256,7 @@ STANDARD_PARAM_DEF(int, int, "%i", kstrtoint);
256STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint); 256STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint);
257STANDARD_PARAM_DEF(long, long, "%li", kstrtol); 257STANDARD_PARAM_DEF(long, long, "%li", kstrtol);
258STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul); 258STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul);
259STANDARD_PARAM_DEF(ullong, unsigned long long, "%llu", kstrtoull);
259 260
260int param_set_charp(const char *val, const struct kernel_param *kp) 261int param_set_charp(const char *val, const struct kernel_param *kp)
261{ 262{
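
STANDARD_PARAM_DEF(ullong, ...) generates param_set_ullong(), param_get_ullong() and param_ops_ullong, so a 64-bit tunable can now be declared directly as a module parameter; a made-up example:

	/* Hypothetical module parameter using the new "ullong" type. */
	static unsigned long long my_threshold = 1ULL << 32;
	module_param(my_threshold, ullong, 0644);
	MODULE_PARM_DESC(my_threshold, "example 64-bit tunable, parsed with kstrtoull()");
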
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 9a83d780facd..e4e4121fa327 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -253,9 +253,6 @@ config APM_EMULATION
253 anything, try disabling/enabling this option (or disabling/enabling 253 anything, try disabling/enabling this option (or disabling/enabling
254 APM in your BIOS). 254 APM in your BIOS).
255 255
256config ARCH_HAS_OPP
257 bool
258
259config PM_OPP 256config PM_OPP
260 bool 257 bool
261 ---help--- 258 ---help---
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 8e90f330f139..9a59d042ea84 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -296,8 +296,8 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
296 suspend_state_t i; 296 suspend_state_t i;
297 297
298 for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) 298 for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++)
299 if (pm_states[i].state) 299 if (pm_states[i])
300 s += sprintf(s,"%s ", pm_states[i].label); 300 s += sprintf(s,"%s ", pm_states[i]);
301 301
302#endif 302#endif
303 if (hibernation_available()) 303 if (hibernation_available())
@@ -311,8 +311,7 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
311static suspend_state_t decode_state(const char *buf, size_t n) 311static suspend_state_t decode_state(const char *buf, size_t n)
312{ 312{
313#ifdef CONFIG_SUSPEND 313#ifdef CONFIG_SUSPEND
314 suspend_state_t state = PM_SUSPEND_MIN; 314 suspend_state_t state;
315 struct pm_sleep_state *s;
316#endif 315#endif
317 char *p; 316 char *p;
318 int len; 317 int len;
@@ -325,10 +324,12 @@ static suspend_state_t decode_state(const char *buf, size_t n)
325 return PM_SUSPEND_MAX; 324 return PM_SUSPEND_MAX;
326 325
327#ifdef CONFIG_SUSPEND 326#ifdef CONFIG_SUSPEND
328 for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) 327 for (state = PM_SUSPEND_MIN; state < PM_SUSPEND_MAX; state++) {
329 if (s->state && len == strlen(s->label) 328 const char *label = pm_states[state];
330 && !strncmp(buf, s->label, len)) 329
331 return s->state; 330 if (label && len == strlen(label) && !strncmp(buf, label, len))
331 return state;
332 }
332#endif 333#endif
333 334
334 return PM_SUSPEND_ON; 335 return PM_SUSPEND_ON;
@@ -446,8 +447,8 @@ static ssize_t autosleep_show(struct kobject *kobj,
446 447
447#ifdef CONFIG_SUSPEND 448#ifdef CONFIG_SUSPEND
448 if (state < PM_SUSPEND_MAX) 449 if (state < PM_SUSPEND_MAX)
449 return sprintf(buf, "%s\n", pm_states[state].state ? 450 return sprintf(buf, "%s\n", pm_states[state] ?
450 pm_states[state].label : "error"); 451 pm_states[state] : "error");
451#endif 452#endif
452#ifdef CONFIG_HIBERNATION 453#ifdef CONFIG_HIBERNATION
453 return sprintf(buf, "disk\n"); 454 return sprintf(buf, "disk\n");
@@ -615,7 +616,6 @@ static struct attribute_group attr_group = {
615 .attrs = g, 616 .attrs = g,
616}; 617};
617 618
618#ifdef CONFIG_PM_RUNTIME
619struct workqueue_struct *pm_wq; 619struct workqueue_struct *pm_wq;
620EXPORT_SYMBOL_GPL(pm_wq); 620EXPORT_SYMBOL_GPL(pm_wq);
621 621
@@ -625,9 +625,6 @@ static int __init pm_start_workqueue(void)
625 625
626 return pm_wq ? 0 : -ENOMEM; 626 return pm_wq ? 0 : -ENOMEM;
627} 627}
628#else
629static inline int pm_start_workqueue(void) { return 0; }
630#endif
631 628
632static int __init pm_init(void) 629static int __init pm_init(void)
633{ 630{
diff --git a/kernel/power/power.h b/kernel/power/power.h
index c60f13b5270a..5d49dcac2537 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -178,13 +178,8 @@ extern void swsusp_show_speed(struct timeval *, struct timeval *,
178 unsigned int, char *); 178 unsigned int, char *);
179 179
180#ifdef CONFIG_SUSPEND 180#ifdef CONFIG_SUSPEND
181struct pm_sleep_state {
182 const char *label;
183 suspend_state_t state;
184};
185
186/* kernel/power/suspend.c */ 181/* kernel/power/suspend.c */
187extern struct pm_sleep_state pm_states[]; 182extern const char *pm_states[];
188 183
189extern int suspend_devices_and_enter(suspend_state_t state); 184extern int suspend_devices_and_enter(suspend_state_t state);
190#else /* !CONFIG_SUSPEND */ 185#else /* !CONFIG_SUSPEND */
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 1ea328aafdc9..c4b8093c80b3 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -248,33 +248,61 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
248 * information is stored (in the form of a block of bitmap) 248 * information is stored (in the form of a block of bitmap)
249 * It also contains the pfns that correspond to the start and end of 249 * It also contains the pfns that correspond to the start and end of
250 * the represented memory area. 250 * the represented memory area.
251 *
252 * The memory bitmap is organized as a radix tree to guarantee fast random
253 * access to the bits. There is one radix tree for each zone (as returned
254 * from create_mem_extents).
255 *
256 * One radix tree is represented by one struct mem_zone_bm_rtree. There are
257 * two linked lists for the nodes of the tree, one for the inner nodes and
 258 * one for the leaf nodes. The linked leaf nodes are used for fast linear
259 * access of the memory bitmap.
260 *
261 * The struct rtree_node represents one node of the radix tree.
251 */ 262 */
252 263
253#define BM_END_OF_MAP (~0UL) 264#define BM_END_OF_MAP (~0UL)
254 265
255#define BM_BITS_PER_BLOCK (PAGE_SIZE * BITS_PER_BYTE) 266#define BM_BITS_PER_BLOCK (PAGE_SIZE * BITS_PER_BYTE)
267#define BM_BLOCK_SHIFT (PAGE_SHIFT + 3)
268#define BM_BLOCK_MASK ((1UL << BM_BLOCK_SHIFT) - 1)
256 269
257struct bm_block { 270/*
258 struct list_head hook; /* hook into a list of bitmap blocks */ 271 * struct rtree_node is a wrapper struct to link the nodes
259 unsigned long start_pfn; /* pfn represented by the first bit */ 272 * of the rtree together for easy linear iteration over
260 unsigned long end_pfn; /* pfn represented by the last bit plus 1 */ 273 * bits and easy freeing
261 unsigned long *data; /* bitmap representing pages */ 274 */
275struct rtree_node {
276 struct list_head list;
277 unsigned long *data;
262}; 278};
263 279
264static inline unsigned long bm_block_bits(struct bm_block *bb) 280/*
265{ 281 * struct mem_zone_bm_rtree represents a bitmap used for one
266 return bb->end_pfn - bb->start_pfn; 282 * populated memory zone.
267} 283 */
284struct mem_zone_bm_rtree {
285 struct list_head list; /* Link Zones together */
286 struct list_head nodes; /* Radix Tree inner nodes */
287 struct list_head leaves; /* Radix Tree leaves */
288 unsigned long start_pfn; /* Zone start page frame */
289 unsigned long end_pfn; /* Zone end page frame + 1 */
290 struct rtree_node *rtree; /* Radix Tree Root */
291 int levels; /* Number of Radix Tree Levels */
292 unsigned int blocks; /* Number of Bitmap Blocks */
293};
268 294
269/* strcut bm_position is used for browsing memory bitmaps */ 295/* strcut bm_position is used for browsing memory bitmaps */
270 296
271struct bm_position { 297struct bm_position {
272 struct bm_block *block; 298 struct mem_zone_bm_rtree *zone;
273 int bit; 299 struct rtree_node *node;
300 unsigned long node_pfn;
301 int node_bit;
274}; 302};
275 303
276struct memory_bitmap { 304struct memory_bitmap {
277 struct list_head blocks; /* list of bitmap blocks */ 305 struct list_head zones;
278 struct linked_page *p_list; /* list of pages used to store zone 306 struct linked_page *p_list; /* list of pages used to store zone
279 * bitmap objects and bitmap block 307 * bitmap objects and bitmap block
280 * objects 308 * objects
@@ -284,38 +312,178 @@ struct memory_bitmap {
284 312
285/* Functions that operate on memory bitmaps */ 313/* Functions that operate on memory bitmaps */
286 314
287static void memory_bm_position_reset(struct memory_bitmap *bm) 315#define BM_ENTRIES_PER_LEVEL (PAGE_SIZE / sizeof(unsigned long))
316#if BITS_PER_LONG == 32
317#define BM_RTREE_LEVEL_SHIFT (PAGE_SHIFT - 2)
318#else
319#define BM_RTREE_LEVEL_SHIFT (PAGE_SHIFT - 3)
320#endif
321#define BM_RTREE_LEVEL_MASK ((1UL << BM_RTREE_LEVEL_SHIFT) - 1)
322
323/*
324 * alloc_rtree_node - Allocate a new node and add it to the radix tree.
325 *
326 * This function is used to allocate inner nodes as well as the
 327 * leaf nodes of the radix tree. It also adds the node to the
328 * corresponding linked list passed in by the *list parameter.
329 */
330static struct rtree_node *alloc_rtree_node(gfp_t gfp_mask, int safe_needed,
331 struct chain_allocator *ca,
332 struct list_head *list)
288{ 333{
289 bm->cur.block = list_entry(bm->blocks.next, struct bm_block, hook); 334 struct rtree_node *node;
290 bm->cur.bit = 0;
291}
292 335
293static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free); 336 node = chain_alloc(ca, sizeof(struct rtree_node));
337 if (!node)
338 return NULL;
294 339
295/** 340 node->data = get_image_page(gfp_mask, safe_needed);
296 * create_bm_block_list - create a list of block bitmap objects 341 if (!node->data)
297 * @pages - number of pages to track 342 return NULL;
298 * @list - list to put the allocated blocks into 343
299 * @ca - chain allocator to be used for allocating memory 344 list_add_tail(&node->list, list);
345
346 return node;
347}
348
349/*
 350 * add_rtree_block - Add a new leaf node to the radix tree
351 *
 352 * The leaf nodes need to be allocated in order to keep the leaves
353 * linked list in order. This is guaranteed by the zone->blocks
354 * counter.
300 */ 355 */
301static int create_bm_block_list(unsigned long pages, 356static int add_rtree_block(struct mem_zone_bm_rtree *zone, gfp_t gfp_mask,
302 struct list_head *list, 357 int safe_needed, struct chain_allocator *ca)
303 struct chain_allocator *ca)
304{ 358{
305 unsigned int nr_blocks = DIV_ROUND_UP(pages, BM_BITS_PER_BLOCK); 359 struct rtree_node *node, *block, **dst;
360 unsigned int levels_needed, block_nr;
361 int i;
306 362
307 while (nr_blocks-- > 0) { 363 block_nr = zone->blocks;
308 struct bm_block *bb; 364 levels_needed = 0;
309 365
310 bb = chain_alloc(ca, sizeof(struct bm_block)); 366 /* How many levels do we need for this block nr? */
311 if (!bb) 367 while (block_nr) {
368 levels_needed += 1;
369 block_nr >>= BM_RTREE_LEVEL_SHIFT;
370 }
371
372 /* Make sure the rtree has enough levels */
373 for (i = zone->levels; i < levels_needed; i++) {
374 node = alloc_rtree_node(gfp_mask, safe_needed, ca,
375 &zone->nodes);
376 if (!node)
312 return -ENOMEM; 377 return -ENOMEM;
313 list_add(&bb->hook, list); 378
379 node->data[0] = (unsigned long)zone->rtree;
380 zone->rtree = node;
381 zone->levels += 1;
314 } 382 }
315 383
384 /* Allocate new block */
385 block = alloc_rtree_node(gfp_mask, safe_needed, ca, &zone->leaves);
386 if (!block)
387 return -ENOMEM;
388
389 /* Now walk the rtree to insert the block */
390 node = zone->rtree;
391 dst = &zone->rtree;
392 block_nr = zone->blocks;
393 for (i = zone->levels; i > 0; i--) {
394 int index;
395
396 if (!node) {
397 node = alloc_rtree_node(gfp_mask, safe_needed, ca,
398 &zone->nodes);
399 if (!node)
400 return -ENOMEM;
401 *dst = node;
402 }
403
404 index = block_nr >> ((i - 1) * BM_RTREE_LEVEL_SHIFT);
405 index &= BM_RTREE_LEVEL_MASK;
406 dst = (struct rtree_node **)&((*dst)->data[index]);
407 node = *dst;
408 }
409
410 zone->blocks += 1;
411 *dst = block;
412
316 return 0; 413 return 0;
317} 414}
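
Each leaf page of the bitmap covers BM_BITS_PER_BLOCK page frames, and every inner level of the radix tree consumes BM_RTREE_LEVEL_SHIFT bits of the leaf block number (9 bits per level on 64-bit with 4 KiB pages, since one page holds 512 unsigned longs). A worked example of the index arithmetic used here and in memory_bm_find_bit() further down, under those assumptions:

	/*
	 * Assumptions: 64-bit, PAGE_SIZE = 4096, so
	 *   BM_BLOCK_SHIFT       = 12 + 3 = 15   (32768 pfns per leaf page)
	 *   BM_RTREE_LEVEL_SHIFT = 12 - 3 = 9    (512 slots per inner node)
	 *
	 * For a pfn that lies 0x123456 pages into its zone:
	 *   block_nr = 0x123456 >> 15            = 0x24   (leaf page #36)
	 *   bit      = 0x123456 & BM_BLOCK_MASK  = 0x3456 (bit inside that page)
	 *
	 * Because 36 < 512 a single tree level suffices, and the lookup walk
	 * ends up indexing node->data[0x24 & BM_RTREE_LEVEL_MASK].
	 */
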
318 415
416static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone,
417 int clear_nosave_free);
418
419/*
420 * create_zone_bm_rtree - create a radix tree for one zone
421 *
 422 * Allocates the mem_zone_bm_rtree structure and initializes it.
 423 * This function also allocates and builds the radix tree for the
424 * zone.
425 */
426static struct mem_zone_bm_rtree *
427create_zone_bm_rtree(gfp_t gfp_mask, int safe_needed,
428 struct chain_allocator *ca,
429 unsigned long start, unsigned long end)
430{
431 struct mem_zone_bm_rtree *zone;
432 unsigned int i, nr_blocks;
433 unsigned long pages;
434
435 pages = end - start;
436 zone = chain_alloc(ca, sizeof(struct mem_zone_bm_rtree));
437 if (!zone)
438 return NULL;
439
440 INIT_LIST_HEAD(&zone->nodes);
441 INIT_LIST_HEAD(&zone->leaves);
442 zone->start_pfn = start;
443 zone->end_pfn = end;
444 nr_blocks = DIV_ROUND_UP(pages, BM_BITS_PER_BLOCK);
445
446 for (i = 0; i < nr_blocks; i++) {
447 if (add_rtree_block(zone, gfp_mask, safe_needed, ca)) {
448 free_zone_bm_rtree(zone, PG_UNSAFE_CLEAR);
449 return NULL;
450 }
451 }
452
453 return zone;
454}
455
456/*
457 * free_zone_bm_rtree - Free the memory of the radix tree
458 *
459 * Free all node pages of the radix tree. The mem_zone_bm_rtree
460 * structure itself is not freed here nor are the rtree_node
461 * structs.
462 */
463static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone,
464 int clear_nosave_free)
465{
466 struct rtree_node *node;
467
468 list_for_each_entry(node, &zone->nodes, list)
469 free_image_page(node->data, clear_nosave_free);
470
471 list_for_each_entry(node, &zone->leaves, list)
472 free_image_page(node->data, clear_nosave_free);
473}
474
475static void memory_bm_position_reset(struct memory_bitmap *bm)
476{
477 bm->cur.zone = list_entry(bm->zones.next, struct mem_zone_bm_rtree,
478 list);
479 bm->cur.node = list_entry(bm->cur.zone->leaves.next,
480 struct rtree_node, list);
481 bm->cur.node_pfn = 0;
482 bm->cur.node_bit = 0;
483}
484
485static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free);
486
319struct mem_extent { 487struct mem_extent {
320 struct list_head hook; 488 struct list_head hook;
321 unsigned long start; 489 unsigned long start;
@@ -407,40 +575,22 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
407 int error; 575 int error;
408 576
409 chain_init(&ca, gfp_mask, safe_needed); 577 chain_init(&ca, gfp_mask, safe_needed);
410 INIT_LIST_HEAD(&bm->blocks); 578 INIT_LIST_HEAD(&bm->zones);
411 579
412 error = create_mem_extents(&mem_extents, gfp_mask); 580 error = create_mem_extents(&mem_extents, gfp_mask);
413 if (error) 581 if (error)
414 return error; 582 return error;
415 583
416 list_for_each_entry(ext, &mem_extents, hook) { 584 list_for_each_entry(ext, &mem_extents, hook) {
417 struct bm_block *bb; 585 struct mem_zone_bm_rtree *zone;
418 unsigned long pfn = ext->start;
419 unsigned long pages = ext->end - ext->start;
420 586
421 bb = list_entry(bm->blocks.prev, struct bm_block, hook); 587 zone = create_zone_bm_rtree(gfp_mask, safe_needed, &ca,
422 588 ext->start, ext->end);
423 error = create_bm_block_list(pages, bm->blocks.prev, &ca); 589 if (!zone) {
424 if (error) 590 error = -ENOMEM;
425 goto Error; 591 goto Error;
426
427 list_for_each_entry_continue(bb, &bm->blocks, hook) {
428 bb->data = get_image_page(gfp_mask, safe_needed);
429 if (!bb->data) {
430 error = -ENOMEM;
431 goto Error;
432 }
433
434 bb->start_pfn = pfn;
435 if (pages >= BM_BITS_PER_BLOCK) {
436 pfn += BM_BITS_PER_BLOCK;
437 pages -= BM_BITS_PER_BLOCK;
438 } else {
439 /* This is executed only once in the loop */
440 pfn += pages;
441 }
442 bb->end_pfn = pfn;
443 } 592 }
593 list_add_tail(&zone->list, &bm->zones);
444 } 594 }
445 595
446 bm->p_list = ca.chain; 596 bm->p_list = ca.chain;
@@ -460,51 +610,83 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
460 */ 610 */
461static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free) 611static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free)
462{ 612{
463 struct bm_block *bb; 613 struct mem_zone_bm_rtree *zone;
464 614
465 list_for_each_entry(bb, &bm->blocks, hook) 615 list_for_each_entry(zone, &bm->zones, list)
466 if (bb->data) 616 free_zone_bm_rtree(zone, clear_nosave_free);
467 free_image_page(bb->data, clear_nosave_free);
468 617
469 free_list_of_pages(bm->p_list, clear_nosave_free); 618 free_list_of_pages(bm->p_list, clear_nosave_free);
470 619
471 INIT_LIST_HEAD(&bm->blocks); 620 INIT_LIST_HEAD(&bm->zones);
472} 621}
473 622
474/** 623/**
475 * memory_bm_find_bit - find the bit in the bitmap @bm that corresponds 624 * memory_bm_find_bit - Find the bit for pfn in the memory
476 * to given pfn. The cur_zone_bm member of @bm and the cur_block member 625 * bitmap
477 * of @bm->cur_zone_bm are updated. 626 *
627 * Find the bit in the bitmap @bm that corresponds to given pfn.
628 * The cur.zone, cur.block and cur.node_pfn member of @bm are
629 * updated.
630 * It walks the radix tree to find the page which contains the bit for
631 * pfn and returns the bit position in **addr and *bit_nr.
478 */ 632 */
479static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, 633static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
480 void **addr, unsigned int *bit_nr) 634 void **addr, unsigned int *bit_nr)
481{ 635{
482 struct bm_block *bb; 636 struct mem_zone_bm_rtree *curr, *zone;
637 struct rtree_node *node;
638 int i, block_nr;
639
640 zone = bm->cur.zone;
641
642 if (pfn >= zone->start_pfn && pfn < zone->end_pfn)
643 goto zone_found;
644
645 zone = NULL;
646
647 /* Find the right zone */
648 list_for_each_entry(curr, &bm->zones, list) {
649 if (pfn >= curr->start_pfn && pfn < curr->end_pfn) {
650 zone = curr;
651 break;
652 }
653 }
483 654
655 if (!zone)
656 return -EFAULT;
657
658zone_found:
484 /* 659 /*
485 * Check if the pfn corresponds to the current bitmap block and find 660 * We have a zone. Now walk the radix tree to find the leave
486 * the block where it fits if this is not the case. 661 * node for our pfn.
487 */ 662 */
488 bb = bm->cur.block;
489 if (pfn < bb->start_pfn)
490 list_for_each_entry_continue_reverse(bb, &bm->blocks, hook)
491 if (pfn >= bb->start_pfn)
492 break;
493 663
494 if (pfn >= bb->end_pfn) 664 node = bm->cur.node;
495 list_for_each_entry_continue(bb, &bm->blocks, hook) 665 if (((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur.node_pfn)
496 if (pfn >= bb->start_pfn && pfn < bb->end_pfn) 666 goto node_found;
497 break;
498 667
499 if (&bb->hook == &bm->blocks) 668 node = zone->rtree;
500 return -EFAULT; 669 block_nr = (pfn - zone->start_pfn) >> BM_BLOCK_SHIFT;
670
671 for (i = zone->levels; i > 0; i--) {
672 int index;
673
674 index = block_nr >> ((i - 1) * BM_RTREE_LEVEL_SHIFT);
675 index &= BM_RTREE_LEVEL_MASK;
676 BUG_ON(node->data[index] == 0);
677 node = (struct rtree_node *)node->data[index];
678 }
679
680node_found:
681 /* Update last position */
682 bm->cur.zone = zone;
683 bm->cur.node = node;
684 bm->cur.node_pfn = (pfn - zone->start_pfn) & ~BM_BLOCK_MASK;
685
686 /* Set return values */
687 *addr = node->data;
688 *bit_nr = (pfn - zone->start_pfn) & BM_BLOCK_MASK;
501 689
502 /* The block has been found */
503 bm->cur.block = bb;
504 pfn -= bb->start_pfn;
505 bm->cur.bit = pfn + 1;
506 *bit_nr = pfn;
507 *addr = bb->data;
508 return 0; 690 return 0;
509} 691}
510 692
@@ -528,6 +710,7 @@ static int mem_bm_set_bit_check(struct memory_bitmap *bm, unsigned long pfn)
528 error = memory_bm_find_bit(bm, pfn, &addr, &bit); 710 error = memory_bm_find_bit(bm, pfn, &addr, &bit);
529 if (!error) 711 if (!error)
530 set_bit(bit, addr); 712 set_bit(bit, addr);
713
531 return error; 714 return error;
532} 715}
533 716
@@ -542,6 +725,14 @@ static void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn)
542 clear_bit(bit, addr); 725 clear_bit(bit, addr);
543} 726}
544 727
728static void memory_bm_clear_current(struct memory_bitmap *bm)
729{
730 int bit;
731
732 bit = max(bm->cur.node_bit - 1, 0);
733 clear_bit(bit, bm->cur.node->data);
734}
735
545static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn) 736static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn)
546{ 737{
547 void *addr; 738 void *addr;
@@ -561,38 +752,70 @@ static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn)
561 return !memory_bm_find_bit(bm, pfn, &addr, &bit); 752 return !memory_bm_find_bit(bm, pfn, &addr, &bit);
562} 753}
563 754
564/** 755/*
565 * memory_bm_next_pfn - find the pfn that corresponds to the next set bit 756 * rtree_next_node - Jumps to the next leave node
566 * in the bitmap @bm. If the pfn cannot be found, BM_END_OF_MAP is
567 * returned.
568 * 757 *
569 * It is required to run memory_bm_position_reset() before the first call to 758 * Sets the position to the beginning of the next node in the
570 * this function. 759 * memory bitmap. This is either the next node in the current
760 * zone's radix tree or the first node in the radix tree of the
761 * next zone.
762 *
763 * Returns true if there is a next node, false otherwise.
571 */ 764 */
765static bool rtree_next_node(struct memory_bitmap *bm)
766{
767 bm->cur.node = list_entry(bm->cur.node->list.next,
768 struct rtree_node, list);
769 if (&bm->cur.node->list != &bm->cur.zone->leaves) {
770 bm->cur.node_pfn += BM_BITS_PER_BLOCK;
771 bm->cur.node_bit = 0;
772 touch_softlockup_watchdog();
773 return true;
774 }
775
776 /* No more nodes, goto next zone */
777 bm->cur.zone = list_entry(bm->cur.zone->list.next,
778 struct mem_zone_bm_rtree, list);
779 if (&bm->cur.zone->list != &bm->zones) {
780 bm->cur.node = list_entry(bm->cur.zone->leaves.next,
781 struct rtree_node, list);
782 bm->cur.node_pfn = 0;
783 bm->cur.node_bit = 0;
784 return true;
785 }
786
787 /* No more zones */
788 return false;
789}
572 790
791/**
792 * memory_bm_rtree_next_pfn - Find the next set bit in the bitmap @bm
793 *
794 * Starting from the last returned position this function searches
795 * for the next set bit in the memory bitmap and returns its
796 * number. If no more bit is set BM_END_OF_MAP is returned.
797 *
798 * It is required to run memory_bm_position_reset() before the
799 * first call to this function.
800 */
573static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm) 801static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)
574{ 802{
575 struct bm_block *bb; 803 unsigned long bits, pfn, pages;
576 int bit; 804 int bit;
577 805
578 bb = bm->cur.block;
579 do { 806 do {
580 bit = bm->cur.bit; 807 pages = bm->cur.zone->end_pfn - bm->cur.zone->start_pfn;
581 bit = find_next_bit(bb->data, bm_block_bits(bb), bit); 808 bits = min(pages - bm->cur.node_pfn, BM_BITS_PER_BLOCK);
582 if (bit < bm_block_bits(bb)) 809 bit = find_next_bit(bm->cur.node->data, bits,
583 goto Return_pfn; 810 bm->cur.node_bit);
584 811 if (bit < bits) {
585 bb = list_entry(bb->hook.next, struct bm_block, hook); 812 pfn = bm->cur.zone->start_pfn + bm->cur.node_pfn + bit;
586 bm->cur.block = bb; 813 bm->cur.node_bit = bit + 1;
587 bm->cur.bit = 0; 814 return pfn;
588 } while (&bb->hook != &bm->blocks); 815 }
816 } while (rtree_next_node(bm));
589 817
590 memory_bm_position_reset(bm);
591 return BM_END_OF_MAP; 818 return BM_END_OF_MAP;
592
593 Return_pfn:
594 bm->cur.bit = bit + 1;
595 return bb->start_pfn + bit;
596} 819}
597 820
598/** 821/**
@@ -731,6 +954,25 @@ static void mark_nosave_pages(struct memory_bitmap *bm)
731 } 954 }
732} 955}
733 956
957static bool is_nosave_page(unsigned long pfn)
958{
959 struct nosave_region *region;
960
961 list_for_each_entry(region, &nosave_regions, list) {
962 if (pfn >= region->start_pfn && pfn < region->end_pfn) {
963 pr_err("PM: %#010llx in e820 nosave region: "
964 "[mem %#010llx-%#010llx]\n",
965 (unsigned long long) pfn << PAGE_SHIFT,
966 (unsigned long long) region->start_pfn << PAGE_SHIFT,
967 ((unsigned long long) region->end_pfn << PAGE_SHIFT)
968 - 1);
969 return true;
970 }
971 }
972
973 return false;
974}
975
734/** 976/**
735 * create_basic_memory_bitmaps - create bitmaps needed for marking page 977 * create_basic_memory_bitmaps - create bitmaps needed for marking page
736 * frames that should not be saved and free page frames. The pointers 978 * frames that should not be saved and free page frames. The pointers
@@ -816,12 +1058,17 @@ void free_basic_memory_bitmaps(void)
816 1058
817unsigned int snapshot_additional_pages(struct zone *zone) 1059unsigned int snapshot_additional_pages(struct zone *zone)
818{ 1060{
819 unsigned int res; 1061 unsigned int rtree, nodes;
820 1062
821 res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK); 1063 rtree = nodes = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK);
822 res += DIV_ROUND_UP(res * sizeof(struct bm_block), 1064 rtree += DIV_ROUND_UP(rtree * sizeof(struct rtree_node),
823 LINKED_PAGE_DATA_SIZE); 1065 LINKED_PAGE_DATA_SIZE);
824 return 2 * res; 1066 while (nodes > 1) {
1067 nodes = DIV_ROUND_UP(nodes, BM_ENTRIES_PER_LEVEL);
1068 rtree += nodes;
1069 }
1070
1071 return 2 * rtree;
825} 1072}
826 1073
827#ifdef CONFIG_HIGHMEM 1074#ifdef CONFIG_HIGHMEM
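
The new estimate charges one bitmap page per BM_BITS_PER_BLOCK page frames, adds space for the rtree_node bookkeeping, and then adds the inner nodes level by level. A worked example, assuming 4 KiB pages, a 64-bit build and a zone spanning 2^20 page frames:

	/*
	 *   rtree = nodes = DIV_ROUND_UP(1048576, 32768) = 32 leaf bitmap pages
	 *   rtree += 1     (the 32 rtree_node structs fit in one linked page)
	 *   loop:  nodes = DIV_ROUND_UP(32, 512) = 1  ->  rtree = 34, then stop
	 * so snapshot_additional_pages() reports 2 * 34 = 68 pages for this zone.
	 */
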
@@ -1094,23 +1341,35 @@ static struct memory_bitmap copy_bm;
1094 1341
1095void swsusp_free(void) 1342void swsusp_free(void)
1096{ 1343{
1097 struct zone *zone; 1344 unsigned long fb_pfn, fr_pfn;
1098 unsigned long pfn, max_zone_pfn;
1099 1345
1100 for_each_populated_zone(zone) { 1346 memory_bm_position_reset(forbidden_pages_map);
1101 max_zone_pfn = zone_end_pfn(zone); 1347 memory_bm_position_reset(free_pages_map);
1102 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 1348
1103 if (pfn_valid(pfn)) { 1349loop:
1104 struct page *page = pfn_to_page(pfn); 1350 fr_pfn = memory_bm_next_pfn(free_pages_map);
1105 1351 fb_pfn = memory_bm_next_pfn(forbidden_pages_map);
1106 if (swsusp_page_is_forbidden(page) && 1352
1107 swsusp_page_is_free(page)) { 1353 /*
1108 swsusp_unset_page_forbidden(page); 1354 * Find the next bit set in both bitmaps. This is guaranteed to
1109 swsusp_unset_page_free(page); 1355 * terminate when fb_pfn == fr_pfn == BM_END_OF_MAP.
1110 __free_page(page); 1356 */
1111 } 1357 do {
1112 } 1358 if (fb_pfn < fr_pfn)
1359 fb_pfn = memory_bm_next_pfn(forbidden_pages_map);
1360 if (fr_pfn < fb_pfn)
1361 fr_pfn = memory_bm_next_pfn(free_pages_map);
1362 } while (fb_pfn != fr_pfn);
1363
1364 if (fr_pfn != BM_END_OF_MAP && pfn_valid(fr_pfn)) {
1365 struct page *page = pfn_to_page(fr_pfn);
1366
1367 memory_bm_clear_current(forbidden_pages_map);
1368 memory_bm_clear_current(free_pages_map);
1369 __free_page(page);
1370 goto loop;
1113 } 1371 }
1372
1114 nr_copy_pages = 0; 1373 nr_copy_pages = 0;
1115 nr_meta_pages = 0; 1374 nr_meta_pages = 0;
1116 restore_pblist = NULL; 1375 restore_pblist = NULL;
@@ -1775,7 +2034,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm)
1775 do { 2034 do {
1776 pfn = memory_bm_next_pfn(bm); 2035 pfn = memory_bm_next_pfn(bm);
1777 if (likely(pfn != BM_END_OF_MAP)) { 2036 if (likely(pfn != BM_END_OF_MAP)) {
1778 if (likely(pfn_valid(pfn))) 2037 if (likely(pfn_valid(pfn)) && !is_nosave_page(pfn))
1779 swsusp_set_page_free(pfn_to_page(pfn)); 2038 swsusp_set_page_free(pfn_to_page(pfn));
1780 else 2039 else
1781 return -EFAULT; 2040 return -EFAULT;
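
The rewritten swsusp_free() no longer scans every valid pfn of every zone; it walks the forbidden and free bitmaps in lockstep, always advancing whichever stream is behind, and only frees pages whose bit is set in both. The same merge-style intersection reduced to its skeleton, over two plain sorted arrays (illustrative only):

	#include <linux/types.h>

	/* Visit every value present in both sorted arrays, the way swsusp_free()
	 * now visits pfns that are set in both bitmaps. */
	static void for_each_common_pfn(const unsigned long *a, size_t na,
					const unsigned long *b, size_t nb,
					void (*fn)(unsigned long pfn))
	{
		size_t i = 0, j = 0;

		while (i < na && j < nb) {
			if (a[i] < b[j]) {
				i++;		/* advance the stream that is behind */
			} else if (b[j] < a[i]) {
				j++;
			} else {
				fn(a[i]);	/* set in both: handle it ...        */
				i++;		/* ... and advance both streams      */
				j++;
			}
		}
	}
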
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 4b736b4dfa96..6dadb25cb0d8 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -31,20 +31,11 @@
31 31
32#include "power.h" 32#include "power.h"
33 33
34struct pm_sleep_state pm_states[PM_SUSPEND_MAX] = { 34static const char *pm_labels[] = { "mem", "standby", "freeze", };
35 [PM_SUSPEND_FREEZE] = { .label = "freeze", .state = PM_SUSPEND_FREEZE }, 35const char *pm_states[PM_SUSPEND_MAX];
36 [PM_SUSPEND_STANDBY] = { .label = "standby", },
37 [PM_SUSPEND_MEM] = { .label = "mem", },
38};
39 36
40static const struct platform_suspend_ops *suspend_ops; 37static const struct platform_suspend_ops *suspend_ops;
41static const struct platform_freeze_ops *freeze_ops; 38static const struct platform_freeze_ops *freeze_ops;
42
43static bool need_suspend_ops(suspend_state_t state)
44{
45 return state > PM_SUSPEND_FREEZE;
46}
47
48static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head); 39static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head);
49static bool suspend_freeze_wake; 40static bool suspend_freeze_wake;
50 41
@@ -97,10 +88,7 @@ static bool relative_states;
97static int __init sleep_states_setup(char *str) 88static int __init sleep_states_setup(char *str)
98{ 89{
99 relative_states = !strncmp(str, "1", 1); 90 relative_states = !strncmp(str, "1", 1);
100 if (relative_states) { 91 pm_states[PM_SUSPEND_FREEZE] = pm_labels[relative_states ? 0 : 2];
101 pm_states[PM_SUSPEND_MEM].state = PM_SUSPEND_FREEZE;
102 pm_states[PM_SUSPEND_FREEZE].state = 0;
103 }
104 return 1; 92 return 1;
105} 93}
106 94
@@ -113,20 +101,20 @@ __setup("relative_sleep_states=", sleep_states_setup);
113void suspend_set_ops(const struct platform_suspend_ops *ops) 101void suspend_set_ops(const struct platform_suspend_ops *ops)
114{ 102{
115 suspend_state_t i; 103 suspend_state_t i;
116 int j = PM_SUSPEND_MAX - 1; 104 int j = 0;
117 105
118 lock_system_sleep(); 106 lock_system_sleep();
119 107
120 suspend_ops = ops; 108 suspend_ops = ops;
121 for (i = PM_SUSPEND_MEM; i >= PM_SUSPEND_STANDBY; i--) 109 for (i = PM_SUSPEND_MEM; i >= PM_SUSPEND_STANDBY; i--)
122 if (valid_state(i)) 110 if (valid_state(i)) {
123 pm_states[j--].state = i; 111 pm_states[i] = pm_labels[j++];
124 else if (!relative_states) 112 } else if (!relative_states) {
125 pm_states[j--].state = 0; 113 pm_states[i] = NULL;
114 j++;
115 }
126 116
127 pm_states[j--].state = PM_SUSPEND_FREEZE; 117 pm_states[PM_SUSPEND_FREEZE] = pm_labels[j];
128 while (j >= PM_SUSPEND_MIN)
129 pm_states[j--].state = 0;
130 118
131 unlock_system_sleep(); 119 unlock_system_sleep();
132} 120}
@@ -145,6 +133,65 @@ int suspend_valid_only_mem(suspend_state_t state)
145} 133}
146EXPORT_SYMBOL_GPL(suspend_valid_only_mem); 134EXPORT_SYMBOL_GPL(suspend_valid_only_mem);
147 135
136static bool sleep_state_supported(suspend_state_t state)
137{
138 return state == PM_SUSPEND_FREEZE || (suspend_ops && suspend_ops->enter);
139}
140
141static int platform_suspend_prepare(suspend_state_t state)
142{
143 return state != PM_SUSPEND_FREEZE && suspend_ops->prepare ?
144 suspend_ops->prepare() : 0;
145}
146
147static int platform_suspend_prepare_late(suspend_state_t state)
148{
149 return state != PM_SUSPEND_FREEZE && suspend_ops->prepare_late ?
150 suspend_ops->prepare_late() : 0;
151}
152
153static void platform_suspend_wake(suspend_state_t state)
154{
155 if (state != PM_SUSPEND_FREEZE && suspend_ops->wake)
156 suspend_ops->wake();
157}
158
159static void platform_suspend_finish(suspend_state_t state)
160{
161 if (state != PM_SUSPEND_FREEZE && suspend_ops->finish)
162 suspend_ops->finish();
163}
164
165static int platform_suspend_begin(suspend_state_t state)
166{
167 if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->begin)
168 return freeze_ops->begin();
169 else if (suspend_ops->begin)
170 return suspend_ops->begin(state);
171 else
172 return 0;
173}
174
175static void platform_suspend_end(suspend_state_t state)
176{
177 if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end)
178 freeze_ops->end();
179 else if (suspend_ops->end)
180 suspend_ops->end();
181}
182
183static void platform_suspend_recover(suspend_state_t state)
184{
185 if (state != PM_SUSPEND_FREEZE && suspend_ops->recover)
186 suspend_ops->recover();
187}
188
189static bool platform_suspend_again(suspend_state_t state)
190{
191 return state != PM_SUSPEND_FREEZE && suspend_ops->suspend_again ?
192 suspend_ops->suspend_again() : false;
193}
194
148static int suspend_test(int level) 195static int suspend_test(int level)
149{ 196{
150#ifdef CONFIG_PM_DEBUG 197#ifdef CONFIG_PM_DEBUG
@@ -168,7 +215,7 @@ static int suspend_prepare(suspend_state_t state)
168{ 215{
169 int error; 216 int error;
170 217
171 if (need_suspend_ops(state) && (!suspend_ops || !suspend_ops->enter)) 218 if (!sleep_state_supported(state))
172 return -EPERM; 219 return -EPERM;
173 220
174 pm_prepare_console(); 221 pm_prepare_console();
@@ -214,23 +261,18 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
214{ 261{
215 int error; 262 int error;
216 263
217 if (need_suspend_ops(state) && suspend_ops->prepare) { 264 error = platform_suspend_prepare(state);
218 error = suspend_ops->prepare(); 265 if (error)
219 if (error) 266 goto Platform_finish;
220 goto Platform_finish;
221 }
222 267
223 error = dpm_suspend_end(PMSG_SUSPEND); 268 error = dpm_suspend_end(PMSG_SUSPEND);
224 if (error) { 269 if (error) {
225 printk(KERN_ERR "PM: Some devices failed to power down\n"); 270 printk(KERN_ERR "PM: Some devices failed to power down\n");
226 goto Platform_finish; 271 goto Platform_finish;
227 } 272 }
228 273 error = platform_suspend_prepare_late(state);
229 if (need_suspend_ops(state) && suspend_ops->prepare_late) { 274 if (error)
230 error = suspend_ops->prepare_late(); 275 goto Platform_wake;
231 if (error)
232 goto Platform_wake;
233 }
234 276
235 if (suspend_test(TEST_PLATFORM)) 277 if (suspend_test(TEST_PLATFORM))
236 goto Platform_wake; 278 goto Platform_wake;
@@ -276,15 +318,11 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
276 enable_nonboot_cpus(); 318 enable_nonboot_cpus();
277 319
278 Platform_wake: 320 Platform_wake:
279 if (need_suspend_ops(state) && suspend_ops->wake) 321 platform_suspend_wake(state);
280 suspend_ops->wake();
281
282 dpm_resume_start(PMSG_RESUME); 322 dpm_resume_start(PMSG_RESUME);
283 323
284 Platform_finish: 324 Platform_finish:
285 if (need_suspend_ops(state) && suspend_ops->finish) 325 platform_suspend_finish(state);
286 suspend_ops->finish();
287
288 return error; 326 return error;
289} 327}
290 328
@@ -297,18 +335,13 @@ int suspend_devices_and_enter(suspend_state_t state)
297 int error; 335 int error;
298 bool wakeup = false; 336 bool wakeup = false;
299 337
300 if (need_suspend_ops(state) && !suspend_ops) 338 if (!sleep_state_supported(state))
301 return -ENOSYS; 339 return -ENOSYS;
302 340
303 if (need_suspend_ops(state) && suspend_ops->begin) { 341 error = platform_suspend_begin(state);
304 error = suspend_ops->begin(state); 342 if (error)
305 if (error) 343 goto Close;
306 goto Close; 344
307 } else if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->begin) {
308 error = freeze_ops->begin();
309 if (error)
310 goto Close;
311 }
312 suspend_console(); 345 suspend_console();
313 suspend_test_start(); 346 suspend_test_start();
314 error = dpm_suspend_start(PMSG_SUSPEND); 347 error = dpm_suspend_start(PMSG_SUSPEND);
@@ -322,25 +355,20 @@ int suspend_devices_and_enter(suspend_state_t state)
322 355
323 do { 356 do {
324 error = suspend_enter(state, &wakeup); 357 error = suspend_enter(state, &wakeup);
325 } while (!error && !wakeup && need_suspend_ops(state) 358 } while (!error && !wakeup && platform_suspend_again(state));
326 && suspend_ops->suspend_again && suspend_ops->suspend_again());
327 359
328 Resume_devices: 360 Resume_devices:
329 suspend_test_start(); 361 suspend_test_start();
330 dpm_resume_end(PMSG_RESUME); 362 dpm_resume_end(PMSG_RESUME);
331 suspend_test_finish("resume devices"); 363 suspend_test_finish("resume devices");
332 resume_console(); 364 resume_console();
333 Close:
334 if (need_suspend_ops(state) && suspend_ops->end)
335 suspend_ops->end();
336 else if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end)
337 freeze_ops->end();
338 365
366 Close:
367 platform_suspend_end(state);
339 return error; 368 return error;
340 369
341 Recover_platform: 370 Recover_platform:
342 if (need_suspend_ops(state) && suspend_ops->recover) 371 platform_suspend_recover(state);
343 suspend_ops->recover();
344 goto Resume_devices; 372 goto Resume_devices;
345} 373}
346 374
@@ -393,7 +421,7 @@ static int enter_state(suspend_state_t state)
393 printk("done.\n"); 421 printk("done.\n");
394 trace_suspend_resume(TPS("sync_filesystems"), 0, false); 422 trace_suspend_resume(TPS("sync_filesystems"), 0, false);
395 423
396 pr_debug("PM: Preparing system for %s sleep\n", pm_states[state].label); 424 pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
397 error = suspend_prepare(state); 425 error = suspend_prepare(state);
398 if (error) 426 if (error)
399 goto Unlock; 427 goto Unlock;
@@ -402,7 +430,7 @@ static int enter_state(suspend_state_t state)
402 goto Finish; 430 goto Finish;
403 431
404 trace_suspend_resume(TPS("suspend_enter"), state, false); 432 trace_suspend_resume(TPS("suspend_enter"), state, false);
405 pr_debug("PM: Entering %s sleep\n", pm_states[state].label); 433 pr_debug("PM: Entering %s sleep\n", pm_states[state]);
406 pm_restrict_gfp_mask(); 434 pm_restrict_gfp_mask();
407 error = suspend_devices_and_enter(state); 435 error = suspend_devices_and_enter(state);
408 pm_restore_gfp_mask(); 436 pm_restore_gfp_mask();
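
With pm_states[] reduced to an array of labels, suspend_set_ops() now only decides which slots get a name. Tracing the new loop for a platform whose ->valid() accepts PM_SUSPEND_MEM but not PM_SUSPEND_STANDBY, with the default relative_sleep_states=0:

	/*
	 * pm_labels[] = { "mem", "standby", "freeze" }, j starts at 0:
	 *   i = PM_SUSPEND_MEM:     valid   -> pm_states[MEM]     = "mem",  j = 1
	 *   i = PM_SUSPEND_STANDBY: invalid -> pm_states[STANDBY] = NULL,   j = 2
	 * and finally pm_states[PM_SUSPEND_FREEZE] = pm_labels[2] = "freeze".
	 * state_show() therefore advertises "freeze mem" (plus "disk" when
	 * hibernation is available), and decode_state() matches writes against
	 * exactly the same strings.
	 */
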
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index 269b097e78ea..2f524928b6aa 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -92,13 +92,13 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
92 } 92 }
93 93
94 if (state == PM_SUSPEND_MEM) { 94 if (state == PM_SUSPEND_MEM) {
95 printk(info_test, pm_states[state].label); 95 printk(info_test, pm_states[state]);
96 status = pm_suspend(state); 96 status = pm_suspend(state);
97 if (status == -ENODEV) 97 if (status == -ENODEV)
98 state = PM_SUSPEND_STANDBY; 98 state = PM_SUSPEND_STANDBY;
99 } 99 }
100 if (state == PM_SUSPEND_STANDBY) { 100 if (state == PM_SUSPEND_STANDBY) {
101 printk(info_test, pm_states[state].label); 101 printk(info_test, pm_states[state]);
102 status = pm_suspend(state); 102 status = pm_suspend(state);
103 } 103 }
104 if (status < 0) 104 if (status < 0)
@@ -141,8 +141,8 @@ static int __init setup_test_suspend(char *value)
141 /* "=mem" ==> "mem" */ 141 /* "=mem" ==> "mem" */
142 value++; 142 value++;
143 for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) 143 for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++)
144 if (!strcmp(pm_states[i].label, value)) { 144 if (!strcmp(pm_states[i], value)) {
145 test_state = pm_states[i].state; 145 test_state = i;
146 return 0; 146 return 0;
147 } 147 }
148 148
@@ -162,8 +162,8 @@ static int __init test_suspend(void)
162 /* PM is initialized by now; is that state testable? */ 162 /* PM is initialized by now; is that state testable? */
163 if (test_state == PM_SUSPEND_ON) 163 if (test_state == PM_SUSPEND_ON)
164 goto done; 164 goto done;
165 if (!pm_states[test_state].state) { 165 if (!pm_states[test_state]) {
166 printk(warn_bad_state, pm_states[test_state].label); 166 printk(warn_bad_state, pm_states[test_state]);
167 goto done; 167 goto done;
168 } 168 }
169 169
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 13e839dbca07..e04c455a0e38 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -45,6 +45,7 @@
45#include <linux/poll.h> 45#include <linux/poll.h>
46#include <linux/irq_work.h> 46#include <linux/irq_work.h>
47#include <linux/utsname.h> 47#include <linux/utsname.h>
48#include <linux/ctype.h>
48 49
49#include <asm/uaccess.h> 50#include <asm/uaccess.h>
50 51
@@ -56,7 +57,7 @@
56 57
57int console_printk[4] = { 58int console_printk[4] = {
58 CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */ 59 CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */
59 DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */ 60 MESSAGE_LOGLEVEL_DEFAULT, /* default_message_loglevel */
60 CONSOLE_LOGLEVEL_MIN, /* minimum_console_loglevel */ 61 CONSOLE_LOGLEVEL_MIN, /* minimum_console_loglevel */
61 CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */ 62 CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */
62}; 63};
@@ -113,9 +114,9 @@ static int __down_trylock_console_sem(unsigned long ip)
113 * This is used for debugging the mess that is the VT code by 114 * This is used for debugging the mess that is the VT code by
114 * keeping track if we have the console semaphore held. It's 115 * keeping track if we have the console semaphore held. It's
115 * definitely not the perfect debug tool (we don't know if _WE_ 116 * definitely not the perfect debug tool (we don't know if _WE_
116 * hold it are racing, but it helps tracking those weird code 117 * hold it and are racing, but it helps tracking those weird code
117 * path in the console code where we end up in places I want 118 * paths in the console code where we end up in places I want
118 * locked without the console sempahore held 119 * locked without the console sempahore held).
119 */ 120 */
120static int console_locked, console_suspended; 121static int console_locked, console_suspended;
121 122
@@ -146,8 +147,8 @@ static int console_may_schedule;
146 * the overall length of the record. 147 * the overall length of the record.
147 * 148 *
148 * The heads to the first and last entry in the buffer, as well as the 149 * The heads to the first and last entry in the buffer, as well as the
149 * sequence numbers of these both entries are maintained when messages 150 * sequence numbers of these entries are maintained when messages are
150 * are stored.. 151 * stored.
151 * 152 *
152 * If the heads indicate available messages, the length in the header 153 * If the heads indicate available messages, the length in the header
153 * tells the start next message. A length == 0 for the next message 154 * tells the start next message. A length == 0 for the next message
@@ -257,7 +258,7 @@ static u64 clear_seq;
257static u32 clear_idx; 258static u32 clear_idx;
258 259
259#define PREFIX_MAX 32 260#define PREFIX_MAX 32
260#define LOG_LINE_MAX 1024 - PREFIX_MAX 261#define LOG_LINE_MAX (1024 - PREFIX_MAX)
261 262
262/* record buffer */ 263/* record buffer */
263#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) 264#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
@@ -266,10 +267,23 @@ static u32 clear_idx;
266#define LOG_ALIGN __alignof__(struct printk_log) 267#define LOG_ALIGN __alignof__(struct printk_log)
267#endif 268#endif
268#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) 269#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
270#define __LOG_CPU_MAX_BUF_LEN (1 << CONFIG_LOG_CPU_MAX_BUF_SHIFT)
269static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); 271static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
270static char *log_buf = __log_buf; 272static char *log_buf = __log_buf;
271static u32 log_buf_len = __LOG_BUF_LEN; 273static u32 log_buf_len = __LOG_BUF_LEN;
272 274
275/* Return log buffer address */
276char *log_buf_addr_get(void)
277{
278 return log_buf;
279}
280
281/* Return log buffer size */
282u32 log_buf_len_get(void)
283{
284 return log_buf_len;
285}
286
273/* human readable text of the record */ 287/* human readable text of the record */
274static char *log_text(const struct printk_log *msg) 288static char *log_text(const struct printk_log *msg)
275{ 289{
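
log_buf_addr_get() and log_buf_len_get() do nothing more than expose the (possibly reallocated) ring buffer to other subsystems; in this merge the consumer is the kexec/crash-dump side, which needs the buffer's location and size. A hypothetical in-kernel caller, assuming the declarations exported alongside these helpers:

    /* illustrative only: record where the printk ring buffer lives */
    static void note_log_buf_location(void)
    {
            char *buf = log_buf_addr_get();
            u32 len = log_buf_len_get();

            pr_debug("printk log_buf at %p, %u bytes\n", buf, len);
    }
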
@@ -344,7 +358,7 @@ static int log_make_free_space(u32 msg_size)
344 while (log_first_seq < log_next_seq) { 358 while (log_first_seq < log_next_seq) {
345 if (logbuf_has_space(msg_size, false)) 359 if (logbuf_has_space(msg_size, false))
346 return 0; 360 return 0;
347 /* drop old messages until we have enough continuous space */ 361 /* drop old messages until we have enough contiguous space */
348 log_first_idx = log_next(log_first_idx); 362 log_first_idx = log_next(log_first_idx);
349 log_first_seq++; 363 log_first_seq++;
350 } 364 }
@@ -453,11 +467,7 @@ static int log_store(int facility, int level,
453 return msg->text_len; 467 return msg->text_len;
454} 468}
455 469
456#ifdef CONFIG_SECURITY_DMESG_RESTRICT 470int dmesg_restrict = IS_ENABLED(CONFIG_SECURITY_DMESG_RESTRICT);
457int dmesg_restrict = 1;
458#else
459int dmesg_restrict;
460#endif
461 471
462static int syslog_action_restricted(int type) 472static int syslog_action_restricted(int type)
463{ 473{
@@ -828,34 +838,74 @@ void log_buf_kexec_setup(void)
828/* requested log_buf_len from kernel cmdline */ 838/* requested log_buf_len from kernel cmdline */
829static unsigned long __initdata new_log_buf_len; 839static unsigned long __initdata new_log_buf_len;
830 840
831/* save requested log_buf_len since it's too early to process it */ 841/* we practice scaling the ring buffer by powers of 2 */
832static int __init log_buf_len_setup(char *str) 842static void __init log_buf_len_update(unsigned size)
833{ 843{
834 unsigned size = memparse(str, &str);
835
836 if (size) 844 if (size)
837 size = roundup_pow_of_two(size); 845 size = roundup_pow_of_two(size);
838 if (size > log_buf_len) 846 if (size > log_buf_len)
839 new_log_buf_len = size; 847 new_log_buf_len = size;
848}
849
850/* save requested log_buf_len since it's too early to process it */
851static int __init log_buf_len_setup(char *str)
852{
853 unsigned size = memparse(str, &str);
854
855 log_buf_len_update(size);
840 856
841 return 0; 857 return 0;
842} 858}
843early_param("log_buf_len", log_buf_len_setup); 859early_param("log_buf_len", log_buf_len_setup);
844 860
861static void __init log_buf_add_cpu(void)
862{
863 unsigned int cpu_extra;
864
865 /*
866 * archs should set up cpu_possible_bits properly with
867 * set_cpu_possible() after setup_arch() but just in
868 * case lets ensure this is valid.
869 */
870 if (num_possible_cpus() == 1)
871 return;
872
873 cpu_extra = (num_possible_cpus() - 1) * __LOG_CPU_MAX_BUF_LEN;
874
875 /* by default this will only continue through for large > 64 CPUs */
876 if (cpu_extra <= __LOG_BUF_LEN / 2)
877 return;
878
879 pr_info("log_buf_len individual max cpu contribution: %d bytes\n",
880 __LOG_CPU_MAX_BUF_LEN);
881 pr_info("log_buf_len total cpu_extra contributions: %d bytes\n",
882 cpu_extra);
883 pr_info("log_buf_len min size: %d bytes\n", __LOG_BUF_LEN);
884
885 log_buf_len_update(cpu_extra + __LOG_BUF_LEN);
886}
887
845void __init setup_log_buf(int early) 888void __init setup_log_buf(int early)
846{ 889{
847 unsigned long flags; 890 unsigned long flags;
848 char *new_log_buf; 891 char *new_log_buf;
849 int free; 892 int free;
850 893
894 if (log_buf != __log_buf)
895 return;
896
897 if (!early && !new_log_buf_len)
898 log_buf_add_cpu();
899
851 if (!new_log_buf_len) 900 if (!new_log_buf_len)
852 return; 901 return;
853 902
854 if (early) { 903 if (early) {
855 new_log_buf = 904 new_log_buf =
856 memblock_virt_alloc(new_log_buf_len, PAGE_SIZE); 905 memblock_virt_alloc(new_log_buf_len, LOG_ALIGN);
857 } else { 906 } else {
858 new_log_buf = memblock_virt_alloc_nopanic(new_log_buf_len, 0); 907 new_log_buf = memblock_virt_alloc_nopanic(new_log_buf_len,
908 LOG_ALIGN);
859 } 909 }
860 910
861 if (unlikely(!new_log_buf)) { 911 if (unlikely(!new_log_buf)) {
@@ -872,7 +922,7 @@ void __init setup_log_buf(int early)
872 memcpy(log_buf, __log_buf, __LOG_BUF_LEN); 922 memcpy(log_buf, __log_buf, __LOG_BUF_LEN);
873 raw_spin_unlock_irqrestore(&logbuf_lock, flags); 923 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
874 924
875 pr_info("log_buf_len: %d\n", log_buf_len); 925 pr_info("log_buf_len: %d bytes\n", log_buf_len);
876 pr_info("early log buf free: %d(%d%%)\n", 926 pr_info("early log buf free: %d(%d%%)\n",
877 free, (free * 100) / __LOG_BUF_LEN); 927 free, (free * 100) / __LOG_BUF_LEN);
878} 928}
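
For a sense of the scaling above: assuming a 256 KiB static buffer (CONFIG_LOG_BUF_SHIFT=18) and a 4 KiB per-CPU contribution (CONFIG_LOG_CPU_MAX_BUF_SHIFT=12), a 64-CPU machine computes cpu_extra = 63 * 4 KiB = 252 KiB, which exceeds half of 256 KiB, so setup_log_buf() allocates roundup_pow_of_two(252 KiB + 256 KiB) = 512 KiB; a 16-CPU machine gets cpu_extra = 60 KiB, stays under the 128 KiB threshold, and keeps the static __log_buf. An explicit log_buf_len= on the command line always wins, since log_buf_add_cpu() is skipped whenever new_log_buf_len is already set.
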
@@ -881,7 +931,7 @@ static bool __read_mostly ignore_loglevel;
881 931
882static int __init ignore_loglevel_setup(char *str) 932static int __init ignore_loglevel_setup(char *str)
883{ 933{
884 ignore_loglevel = 1; 934 ignore_loglevel = true;
885 pr_info("debug: ignoring loglevel setting.\n"); 935 pr_info("debug: ignoring loglevel setting.\n");
886 936
887 return 0; 937 return 0;
@@ -947,11 +997,7 @@ static inline void boot_delay_msec(int level)
947} 997}
948#endif 998#endif
949 999
950#if defined(CONFIG_PRINTK_TIME) 1000static bool printk_time = IS_ENABLED(CONFIG_PRINTK_TIME);
951static bool printk_time = 1;
952#else
953static bool printk_time;
954#endif
955module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); 1001module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR);
956 1002
957static size_t print_time(u64 ts, char *buf) 1003static size_t print_time(u64 ts, char *buf)
@@ -1310,7 +1356,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
1310 * for pending data, not the size; return the count of 1356 * for pending data, not the size; return the count of
1311 * records, not the length. 1357 * records, not the length.
1312 */ 1358 */
1313 error = log_next_idx - syslog_idx; 1359 error = log_next_seq - syslog_seq;
1314 } else { 1360 } else {
1315 u64 seq = syslog_seq; 1361 u64 seq = syslog_seq;
1316 u32 idx = syslog_idx; 1362 u32 idx = syslog_idx;
@@ -1416,10 +1462,9 @@ static int have_callable_console(void)
1416/* 1462/*
1417 * Can we actually use the console at this time on this cpu? 1463 * Can we actually use the console at this time on this cpu?
1418 * 1464 *
1419 * Console drivers may assume that per-cpu resources have 1465 * Console drivers may assume that per-cpu resources have been allocated. So
1420 * been allocated. So unless they're explicitly marked as 1466 * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't
1421 * being able to cope (CON_ANYTIME) don't call them until 1467 * call them until this CPU is officially up.
1422 * this CPU is officially up.
1423 */ 1468 */
1424static inline int can_use_console(unsigned int cpu) 1469static inline int can_use_console(unsigned int cpu)
1425{ 1470{
@@ -1432,8 +1477,10 @@ static inline int can_use_console(unsigned int cpu)
1432 * console_lock held, and 'console_locked' set) if it 1477 * console_lock held, and 'console_locked' set) if it
1433 * is successful, false otherwise. 1478 * is successful, false otherwise.
1434 */ 1479 */
1435static int console_trylock_for_printk(unsigned int cpu) 1480static int console_trylock_for_printk(void)
1436{ 1481{
1482 unsigned int cpu = smp_processor_id();
1483
1437 if (!console_trylock()) 1484 if (!console_trylock())
1438 return 0; 1485 return 0;
1439 /* 1486 /*
@@ -1476,7 +1523,7 @@ static struct cont {
1476 struct task_struct *owner; /* task of first print*/ 1523 struct task_struct *owner; /* task of first print*/
1477 u64 ts_nsec; /* time of first print */ 1524 u64 ts_nsec; /* time of first print */
1478 u8 level; /* log level of first message */ 1525 u8 level; /* log level of first message */
1479 u8 facility; /* log level of first message */ 1526 u8 facility; /* log facility of first message */
1480 enum log_flags flags; /* prefix, newline flags */ 1527 enum log_flags flags; /* prefix, newline flags */
1481 bool flushed:1; /* buffer sealed and committed */ 1528 bool flushed:1; /* buffer sealed and committed */
1482} cont; 1529} cont;
@@ -1608,7 +1655,8 @@ asmlinkage int vprintk_emit(int facility, int level,
1608 */ 1655 */
1609 if (!oops_in_progress && !lockdep_recursing(current)) { 1656 if (!oops_in_progress && !lockdep_recursing(current)) {
1610 recursion_bug = 1; 1657 recursion_bug = 1;
1611 goto out_restore_irqs; 1658 local_irq_restore(flags);
1659 return 0;
1612 } 1660 }
1613 zap_locks(); 1661 zap_locks();
1614 } 1662 }
@@ -1716,21 +1764,30 @@ asmlinkage int vprintk_emit(int facility, int level,
1716 1764
1717 logbuf_cpu = UINT_MAX; 1765 logbuf_cpu = UINT_MAX;
1718 raw_spin_unlock(&logbuf_lock); 1766 raw_spin_unlock(&logbuf_lock);
1767 lockdep_on();
1768 local_irq_restore(flags);
1719 1769
1720 /* If called from the scheduler, we can not call up(). */ 1770 /* If called from the scheduler, we can not call up(). */
1721 if (!in_sched) { 1771 if (!in_sched) {
1772 lockdep_off();
1773 /*
1774 * Disable preemption to avoid being preempted while holding
1775 * console_sem which would prevent anyone from printing to
1776 * console
1777 */
1778 preempt_disable();
1779
1722 /* 1780 /*
1723 * Try to acquire and then immediately release the console 1781 * Try to acquire and then immediately release the console
1724 * semaphore. The release will print out buffers and wake up 1782 * semaphore. The release will print out buffers and wake up
1725 * /dev/kmsg and syslog() users. 1783 * /dev/kmsg and syslog() users.
1726 */ 1784 */
1727 if (console_trylock_for_printk(this_cpu)) 1785 if (console_trylock_for_printk())
1728 console_unlock(); 1786 console_unlock();
1787 preempt_enable();
1788 lockdep_on();
1729 } 1789 }
1730 1790
1731 lockdep_on();
1732out_restore_irqs:
1733 local_irq_restore(flags);
1734 return printed_len; 1791 return printed_len;
1735} 1792}
1736EXPORT_SYMBOL(vprintk_emit); 1793EXPORT_SYMBOL(vprintk_emit);
@@ -1802,7 +1859,7 @@ EXPORT_SYMBOL(printk);
1802 1859
1803#define LOG_LINE_MAX 0 1860#define LOG_LINE_MAX 0
1804#define PREFIX_MAX 0 1861#define PREFIX_MAX 0
1805#define LOG_LINE_MAX 0 1862
1806static u64 syslog_seq; 1863static u64 syslog_seq;
1807static u32 syslog_idx; 1864static u32 syslog_idx;
1808static u64 console_seq; 1865static u64 console_seq;
@@ -1881,11 +1938,12 @@ static int __add_preferred_console(char *name, int idx, char *options,
1881 return 0; 1938 return 0;
1882} 1939}
1883/* 1940/*
1884 * Set up a list of consoles. Called from init/main.c 1941 * Set up a console. Called via do_early_param() in init/main.c
1942 * for each "console=" parameter in the boot command line.
1885 */ 1943 */
1886static int __init console_setup(char *str) 1944static int __init console_setup(char *str)
1887{ 1945{
1888 char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for index */ 1946 char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for "ttyS" */
1889 char *s, *options, *brl_options = NULL; 1947 char *s, *options, *brl_options = NULL;
1890 int idx; 1948 int idx;
1891 1949
@@ -1902,7 +1960,8 @@ static int __init console_setup(char *str)
1902 strncpy(buf, str, sizeof(buf) - 1); 1960 strncpy(buf, str, sizeof(buf) - 1);
1903 } 1961 }
1904 buf[sizeof(buf) - 1] = 0; 1962 buf[sizeof(buf) - 1] = 0;
1905 if ((options = strchr(str, ',')) != NULL) 1963 options = strchr(str, ',');
1964 if (options)
1906 *(options++) = 0; 1965 *(options++) = 0;
1907#ifdef __sparc__ 1966#ifdef __sparc__
1908 if (!strcmp(str, "ttya")) 1967 if (!strcmp(str, "ttya"))
@@ -1911,7 +1970,7 @@ static int __init console_setup(char *str)
1911 strcpy(buf, "ttyS1"); 1970 strcpy(buf, "ttyS1");
1912#endif 1971#endif
1913 for (s = buf; *s; s++) 1972 for (s = buf; *s; s++)
1914 if ((*s >= '0' && *s <= '9') || *s == ',') 1973 if (isdigit(*s) || *s == ',')
1915 break; 1974 break;
1916 idx = simple_strtoul(s, NULL, 10); 1975 idx = simple_strtoul(s, NULL, 10);
1917 *s = 0; 1976 *s = 0;
@@ -1950,7 +2009,6 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha
1950 i++, c++) 2009 i++, c++)
1951 if (strcmp(c->name, name) == 0 && c->index == idx) { 2010 if (strcmp(c->name, name) == 0 && c->index == idx) {
1952 strlcpy(c->name, name_new, sizeof(c->name)); 2011 strlcpy(c->name, name_new, sizeof(c->name));
1953 c->name[sizeof(c->name) - 1] = 0;
1954 c->options = options; 2012 c->options = options;
1955 c->index = idx_new; 2013 c->index = idx_new;
1956 return i; 2014 return i;
@@ -1959,12 +2017,12 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha
1959 return -1; 2017 return -1;
1960} 2018}
1961 2019
1962bool console_suspend_enabled = 1; 2020bool console_suspend_enabled = true;
1963EXPORT_SYMBOL(console_suspend_enabled); 2021EXPORT_SYMBOL(console_suspend_enabled);
1964 2022
1965static int __init console_suspend_disable(char *str) 2023static int __init console_suspend_disable(char *str)
1966{ 2024{
1967 console_suspend_enabled = 0; 2025 console_suspend_enabled = false;
1968 return 1; 2026 return 1;
1969} 2027}
1970__setup("no_console_suspend", console_suspend_disable); 2028__setup("no_console_suspend", console_suspend_disable);
@@ -2045,8 +2103,8 @@ EXPORT_SYMBOL(console_lock);
2045/** 2103/**
2046 * console_trylock - try to lock the console system for exclusive use. 2104 * console_trylock - try to lock the console system for exclusive use.
2047 * 2105 *
2048 * Tried to acquire a lock which guarantees that the caller has 2106 * Try to acquire a lock which guarantees that the caller has exclusive
2049 * exclusive access to the console system and the console_drivers list. 2107 * access to the console system and the console_drivers list.
2050 * 2108 *
2051 * returns 1 on success, and 0 on failure to acquire the lock. 2109 * returns 1 on success, and 0 on failure to acquire the lock.
2052 */ 2110 */
@@ -2618,14 +2676,13 @@ EXPORT_SYMBOL(__printk_ratelimit);
2618bool printk_timed_ratelimit(unsigned long *caller_jiffies, 2676bool printk_timed_ratelimit(unsigned long *caller_jiffies,
2619 unsigned int interval_msecs) 2677 unsigned int interval_msecs)
2620{ 2678{
2621 if (*caller_jiffies == 0 2679 unsigned long elapsed = jiffies - *caller_jiffies;
2622 || !time_in_range(jiffies, *caller_jiffies, 2680
2623 *caller_jiffies 2681 if (*caller_jiffies && elapsed <= msecs_to_jiffies(interval_msecs))
2624 + msecs_to_jiffies(interval_msecs))) { 2682 return false;
2625 *caller_jiffies = jiffies; 2683
2626 return true; 2684 *caller_jiffies = jiffies;
2627 } 2685 return true;
2628 return false;
2629} 2686}
2630EXPORT_SYMBOL(printk_timed_ratelimit); 2687EXPORT_SYMBOL(printk_timed_ratelimit);
2631 2688
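
The rewrite keeps the contract: the helper returns true on first use or once the interval has elapsed, storing the current jiffies in the caller-supplied variable, and false otherwise. A typical (illustrative) call site:

    static unsigned long last_complaint;    /* must persist between calls */

    if (printk_timed_ratelimit(&last_complaint, 5 * MSEC_PER_SEC))
            pr_warn("device still not responding, retrying\n");
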
diff --git a/kernel/resource.c b/kernel/resource.c
index 3c2237ac32db..da14b8d09296 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -59,10 +59,12 @@ static DEFINE_RWLOCK(resource_lock);
59static struct resource *bootmem_resource_free; 59static struct resource *bootmem_resource_free;
60static DEFINE_SPINLOCK(bootmem_resource_lock); 60static DEFINE_SPINLOCK(bootmem_resource_lock);
61 61
62static void *r_next(struct seq_file *m, void *v, loff_t *pos) 62static struct resource *next_resource(struct resource *p, bool sibling_only)
63{ 63{
64 struct resource *p = v; 64 /* Caller wants to traverse through siblings only */
65 (*pos)++; 65 if (sibling_only)
66 return p->sibling;
67
66 if (p->child) 68 if (p->child)
67 return p->child; 69 return p->child;
68 while (!p->sibling && p->parent) 70 while (!p->sibling && p->parent)
@@ -70,6 +72,13 @@ static void *r_next(struct seq_file *m, void *v, loff_t *pos)
70 return p->sibling; 72 return p->sibling;
71} 73}
72 74
75static void *r_next(struct seq_file *m, void *v, loff_t *pos)
76{
77 struct resource *p = v;
78 (*pos)++;
79 return (void *)next_resource(p, false);
80}
81
73#ifdef CONFIG_PROC_FS 82#ifdef CONFIG_PROC_FS
74 83
75enum { MAX_IORES_LEVEL = 5 }; 84enum { MAX_IORES_LEVEL = 5 };
@@ -322,16 +331,19 @@ int release_resource(struct resource *old)
322 331
323EXPORT_SYMBOL(release_resource); 332EXPORT_SYMBOL(release_resource);
324 333
325#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY)
326/* 334/*
327 * Finds the lowest memory reosurce exists within [res->start.res->end) 335 * Finds the lowest iomem reosurce exists with-in [res->start.res->end)
328 * the caller must specify res->start, res->end, res->flags and "name". 336 * the caller must specify res->start, res->end, res->flags and "name".
329 * If found, returns 0, res is overwritten, if not found, returns -1. 337 * If found, returns 0, res is overwritten, if not found, returns -1.
338 * This walks through whole tree and not just first level children
339 * until and unless first_level_children_only is true.
330 */ 340 */
331static int find_next_system_ram(struct resource *res, char *name) 341static int find_next_iomem_res(struct resource *res, char *name,
342 bool first_level_children_only)
332{ 343{
333 resource_size_t start, end; 344 resource_size_t start, end;
334 struct resource *p; 345 struct resource *p;
346 bool sibling_only = false;
335 347
336 BUG_ON(!res); 348 BUG_ON(!res);
337 349
@@ -340,8 +352,14 @@ static int find_next_system_ram(struct resource *res, char *name)
340 BUG_ON(start >= end); 352 BUG_ON(start >= end);
341 353
342 read_lock(&resource_lock); 354 read_lock(&resource_lock);
343 for (p = iomem_resource.child; p ; p = p->sibling) { 355
344 /* system ram is just marked as IORESOURCE_MEM */ 356 if (first_level_children_only) {
357 p = iomem_resource.child;
358 sibling_only = true;
359 } else
360 p = &iomem_resource;
361
362 while ((p = next_resource(p, sibling_only))) {
345 if (p->flags != res->flags) 363 if (p->flags != res->flags)
346 continue; 364 continue;
347 if (name && strcmp(p->name, name)) 365 if (name && strcmp(p->name, name))
@@ -353,6 +371,7 @@ static int find_next_system_ram(struct resource *res, char *name)
353 if ((p->end >= start) && (p->start < end)) 371 if ((p->end >= start) && (p->start < end))
354 break; 372 break;
355 } 373 }
374
356 read_unlock(&resource_lock); 375 read_unlock(&resource_lock);
357 if (!p) 376 if (!p)
358 return -1; 377 return -1;
@@ -365,6 +384,70 @@ static int find_next_system_ram(struct resource *res, char *name)
365} 384}
366 385
367/* 386/*
387 * Walks through iomem resources and calls func() with matching resource
388 * ranges. This walks through whole tree and not just first level children.
389 * All the memory ranges which overlap start,end and also match flags and
390 * name are valid candidates.
391 *
392 * @name: name of resource
393 * @flags: resource flags
394 * @start: start addr
395 * @end: end addr
396 */
397int walk_iomem_res(char *name, unsigned long flags, u64 start, u64 end,
398 void *arg, int (*func)(u64, u64, void *))
399{
400 struct resource res;
401 u64 orig_end;
402 int ret = -1;
403
404 res.start = start;
405 res.end = end;
406 res.flags = flags;
407 orig_end = res.end;
408 while ((res.start < res.end) &&
409 (!find_next_iomem_res(&res, name, false))) {
410 ret = (*func)(res.start, res.end, arg);
411 if (ret)
412 break;
413 res.start = res.end + 1;
414 res.end = orig_end;
415 }
416 return ret;
417}
418
419/*
420 * This function calls callback against all memory range of "System RAM"
421 * which are marked as IORESOURCE_MEM and IORESOUCE_BUSY.
422 * Now, this function is only for "System RAM". This function deals with
423 * full ranges and not pfn. If resources are not pfn aligned, dealing
424 * with pfn can truncate ranges.
425 */
426int walk_system_ram_res(u64 start, u64 end, void *arg,
427 int (*func)(u64, u64, void *))
428{
429 struct resource res;
430 u64 orig_end;
431 int ret = -1;
432
433 res.start = start;
434 res.end = end;
435 res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
436 orig_end = res.end;
437 while ((res.start < res.end) &&
438 (!find_next_iomem_res(&res, "System RAM", true))) {
439 ret = (*func)(res.start, res.end, arg);
440 if (ret)
441 break;
442 res.start = res.end + 1;
443 res.end = orig_end;
444 }
445 return ret;
446}
447
448#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY)
449
450/*
368 * This function calls callback against all memory range of "System RAM" 451 * This function calls callback against all memory range of "System RAM"
369 * which are marked as IORESOURCE_MEM and IORESOUCE_BUSY. 452 * which are marked as IORESOURCE_MEM and IORESOUCE_BUSY.
370 * Now, this function is only for "System RAM". 453 * Now, this function is only for "System RAM".
@@ -382,7 +465,7 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
382 res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; 465 res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
383 orig_end = res.end; 466 orig_end = res.end;
384 while ((res.start < res.end) && 467 while ((res.start < res.end) &&
385 (find_next_system_ram(&res, "System RAM") >= 0)) { 468 (find_next_iomem_res(&res, "System RAM", true) >= 0)) {
386 pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT; 469 pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT;
387 end_pfn = (res.end + 1) >> PAGE_SHIFT; 470 end_pfn = (res.end + 1) >> PAGE_SHIFT;
388 if (end_pfn > pfn) 471 if (end_pfn > pfn)
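
walk_iomem_res() and walk_system_ram_res() exist for callers (kexec_file_load in this merge) that want full byte ranges instead of pfns. A hypothetical user that just totals up System RAM, to show the callback contract (resource ranges are inclusive, and a non-zero return stops the walk):

    static int add_ram(u64 start, u64 end, void *arg)
    {
            u64 *total = arg;

            *total += end - start + 1;      /* [start, end] is inclusive */
            return 0;                       /* keep walking */
    }

    static u64 total_system_ram(void)
    {
            u64 total = 0;

            walk_system_ram_res(0, ULLONG_MAX, &total, add_ram);
            return total;
    }
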
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1211575a2208..ec1a286684a5 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2393,6 +2393,13 @@ unsigned long nr_iowait_cpu(int cpu)
2393 return atomic_read(&this->nr_iowait); 2393 return atomic_read(&this->nr_iowait);
2394} 2394}
2395 2395
2396void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
2397{
2398 struct rq *this = this_rq();
2399 *nr_waiters = atomic_read(&this->nr_iowait);
2400 *load = this->cpu_load[0];
2401}
2402
2396#ifdef CONFIG_SMP 2403#ifdef CONFIG_SMP
2397 2404
2398/* 2405/*
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 9f1608f99819..11e7bc434f43 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -147,8 +147,6 @@ use_default:
147 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu)) 147 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu))
148 goto use_default; 148 goto use_default;
149 149
150 trace_cpu_idle_rcuidle(next_state, dev->cpu);
151
152 /* 150 /*
153 * Enter the idle state previously returned by the governor decision. 151 * Enter the idle state previously returned by the governor decision.
154 * This function will block until an interrupt occurs and will take 152 * This function will block until an interrupt occurs and will take
@@ -156,8 +154,6 @@ use_default:
156 */ 154 */
157 entered_state = cpuidle_enter(drv, dev, next_state); 155 entered_state = cpuidle_enter(drv, dev, next_state);
158 156
159 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu);
160
161 if (broadcast) 157 if (broadcast)
162 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu); 158 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu);
163 159
diff --git a/kernel/sched/proc.c b/kernel/sched/proc.c
index 16f5a30f9c88..8ecd552fe4f2 100644
--- a/kernel/sched/proc.c
+++ b/kernel/sched/proc.c
@@ -8,13 +8,6 @@
8 8
9#include "sched.h" 9#include "sched.h"
10 10
11unsigned long this_cpu_load(void)
12{
13 struct rq *this = this_rq();
14 return this->cpu_load[0];
15}
16
17
18/* 11/*
19 * Global load-average calculations 12 * Global load-average calculations
20 * 13 *
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 301bbc24739c..44eb005c6695 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -18,15 +18,17 @@
18#include <linux/compat.h> 18#include <linux/compat.h>
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/seccomp.h> 20#include <linux/seccomp.h>
21#include <linux/slab.h>
22#include <linux/syscalls.h>
21 23
22/* #define SECCOMP_DEBUG 1 */ 24/* #define SECCOMP_DEBUG 1 */
23 25
24#ifdef CONFIG_SECCOMP_FILTER 26#ifdef CONFIG_SECCOMP_FILTER
25#include <asm/syscall.h> 27#include <asm/syscall.h>
26#include <linux/filter.h> 28#include <linux/filter.h>
29#include <linux/pid.h>
27#include <linux/ptrace.h> 30#include <linux/ptrace.h>
28#include <linux/security.h> 31#include <linux/security.h>
29#include <linux/slab.h>
30#include <linux/tracehook.h> 32#include <linux/tracehook.h>
31#include <linux/uaccess.h> 33#include <linux/uaccess.h>
32 34
@@ -54,7 +56,7 @@
54struct seccomp_filter { 56struct seccomp_filter {
55 atomic_t usage; 57 atomic_t usage;
56 struct seccomp_filter *prev; 58 struct seccomp_filter *prev;
57 struct sk_filter *prog; 59 struct bpf_prog *prog;
58}; 60};
59 61
60/* Limit any path through the tree to 256KB worth of instructions. */ 62/* Limit any path through the tree to 256KB worth of instructions. */
@@ -87,7 +89,7 @@ static void populate_seccomp_data(struct seccomp_data *sd)
87 * @filter: filter to verify 89 * @filter: filter to verify
88 * @flen: length of filter 90 * @flen: length of filter
89 * 91 *
90 * Takes a previously checked filter (by sk_chk_filter) and 92 * Takes a previously checked filter (by bpf_check_classic) and
91 * redirects all filter code that loads struct sk_buff data 93 * redirects all filter code that loads struct sk_buff data
92 * and related data through seccomp_bpf_load. It also 94 * and related data through seccomp_bpf_load. It also
93 * enforces length and alignment checking of those loads. 95 * enforces length and alignment checking of those loads.
@@ -172,51 +174,184 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
172 */ 174 */
173static u32 seccomp_run_filters(int syscall) 175static u32 seccomp_run_filters(int syscall)
174{ 176{
175 struct seccomp_filter *f; 177 struct seccomp_filter *f = ACCESS_ONCE(current->seccomp.filter);
176 struct seccomp_data sd; 178 struct seccomp_data sd;
177 u32 ret = SECCOMP_RET_ALLOW; 179 u32 ret = SECCOMP_RET_ALLOW;
178 180
179 /* Ensure unexpected behavior doesn't result in failing open. */ 181 /* Ensure unexpected behavior doesn't result in failing open. */
180 if (WARN_ON(current->seccomp.filter == NULL)) 182 if (unlikely(WARN_ON(f == NULL)))
181 return SECCOMP_RET_KILL; 183 return SECCOMP_RET_KILL;
182 184
185 /* Make sure cross-thread synced filter points somewhere sane. */
186 smp_read_barrier_depends();
187
183 populate_seccomp_data(&sd); 188 populate_seccomp_data(&sd);
184 189
185 /* 190 /*
186 * All filters in the list are evaluated and the lowest BPF return 191 * All filters in the list are evaluated and the lowest BPF return
187 * value always takes priority (ignoring the DATA). 192 * value always takes priority (ignoring the DATA).
188 */ 193 */
189 for (f = current->seccomp.filter; f; f = f->prev) { 194 for (; f; f = f->prev) {
190 u32 cur_ret = SK_RUN_FILTER(f->prog, (void *)&sd); 195 u32 cur_ret = BPF_PROG_RUN(f->prog, (void *)&sd);
191 196
192 if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) 197 if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
193 ret = cur_ret; 198 ret = cur_ret;
194 } 199 }
195 return ret; 200 return ret;
196} 201}
202#endif /* CONFIG_SECCOMP_FILTER */
203
204static inline bool seccomp_may_assign_mode(unsigned long seccomp_mode)
205{
206 assert_spin_locked(&current->sighand->siglock);
207
208 if (current->seccomp.mode && current->seccomp.mode != seccomp_mode)
209 return false;
210
211 return true;
212}
213
214static inline void seccomp_assign_mode(struct task_struct *task,
215 unsigned long seccomp_mode)
216{
217 assert_spin_locked(&task->sighand->siglock);
218
219 task->seccomp.mode = seccomp_mode;
220 /*
221 * Make sure TIF_SECCOMP cannot be set before the mode (and
222 * filter) is set.
223 */
224 smp_mb__before_atomic();
225 set_tsk_thread_flag(task, TIF_SECCOMP);
226}
227
228#ifdef CONFIG_SECCOMP_FILTER
229/* Returns 1 if the parent is an ancestor of the child. */
230static int is_ancestor(struct seccomp_filter *parent,
231 struct seccomp_filter *child)
232{
233 /* NULL is the root ancestor. */
234 if (parent == NULL)
235 return 1;
236 for (; child; child = child->prev)
237 if (child == parent)
238 return 1;
239 return 0;
240}
197 241
198/** 242/**
199 * seccomp_attach_filter: Attaches a seccomp filter to current. 243 * seccomp_can_sync_threads: checks if all threads can be synchronized
244 *
245 * Expects sighand and cred_guard_mutex locks to be held.
246 *
247 * Returns 0 on success, -ve on error, or the pid of a thread which was
248 * either not in the correct seccomp mode or it did not have an ancestral
249 * seccomp filter.
250 */
251static inline pid_t seccomp_can_sync_threads(void)
252{
253 struct task_struct *thread, *caller;
254
255 BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex));
256 assert_spin_locked(&current->sighand->siglock);
257
258 /* Validate all threads being eligible for synchronization. */
259 caller = current;
260 for_each_thread(caller, thread) {
261 pid_t failed;
262
263 /* Skip current, since it is initiating the sync. */
264 if (thread == caller)
265 continue;
266
267 if (thread->seccomp.mode == SECCOMP_MODE_DISABLED ||
268 (thread->seccomp.mode == SECCOMP_MODE_FILTER &&
269 is_ancestor(thread->seccomp.filter,
270 caller->seccomp.filter)))
271 continue;
272
273 /* Return the first thread that cannot be synchronized. */
274 failed = task_pid_vnr(thread);
275 /* If the pid cannot be resolved, then return -ESRCH */
276 if (unlikely(WARN_ON(failed == 0)))
277 failed = -ESRCH;
278 return failed;
279 }
280
281 return 0;
282}
283
284/**
285 * seccomp_sync_threads: sets all threads to use current's filter
286 *
287 * Expects sighand and cred_guard_mutex locks to be held, and for
288 * seccomp_can_sync_threads() to have returned success already
289 * without dropping the locks.
290 *
291 */
292static inline void seccomp_sync_threads(void)
293{
294 struct task_struct *thread, *caller;
295
296 BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex));
297 assert_spin_locked(&current->sighand->siglock);
298
299 /* Synchronize all threads. */
300 caller = current;
301 for_each_thread(caller, thread) {
302 /* Skip current, since it needs no changes. */
303 if (thread == caller)
304 continue;
305
306 /* Get a task reference for the new leaf node. */
307 get_seccomp_filter(caller);
308 /*
309 * Drop the task reference to the shared ancestor since
310 * current's path will hold a reference. (This also
311 * allows a put before the assignment.)
312 */
313 put_seccomp_filter(thread);
314 smp_store_release(&thread->seccomp.filter,
315 caller->seccomp.filter);
316 /*
317 * Opt the other thread into seccomp if needed.
318 * As threads are considered to be trust-realm
319 * equivalent (see ptrace_may_access), it is safe to
320 * allow one thread to transition the other.
321 */
322 if (thread->seccomp.mode == SECCOMP_MODE_DISABLED) {
323 /*
324 * Don't let an unprivileged task work around
325 * the no_new_privs restriction by creating
326 * a thread that sets it up, enters seccomp,
327 * then dies.
328 */
329 if (task_no_new_privs(caller))
330 task_set_no_new_privs(thread);
331
332 seccomp_assign_mode(thread, SECCOMP_MODE_FILTER);
333 }
334 }
335}
336
337/**
338 * seccomp_prepare_filter: Prepares a seccomp filter for use.
200 * @fprog: BPF program to install 339 * @fprog: BPF program to install
201 * 340 *
202 * Returns 0 on success or an errno on failure. 341 * Returns filter on success or an ERR_PTR on failure.
203 */ 342 */
204static long seccomp_attach_filter(struct sock_fprog *fprog) 343static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
205{ 344{
206 struct seccomp_filter *filter; 345 struct seccomp_filter *filter;
207 unsigned long fp_size = fprog->len * sizeof(struct sock_filter); 346 unsigned long fp_size;
208 unsigned long total_insns = fprog->len;
209 struct sock_filter *fp; 347 struct sock_filter *fp;
210 int new_len; 348 int new_len;
211 long ret; 349 long ret;
212 350
213 if (fprog->len == 0 || fprog->len > BPF_MAXINSNS) 351 if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
214 return -EINVAL; 352 return ERR_PTR(-EINVAL);
215 353 BUG_ON(INT_MAX / fprog->len < sizeof(struct sock_filter));
216 for (filter = current->seccomp.filter; filter; filter = filter->prev) 354 fp_size = fprog->len * sizeof(struct sock_filter);
217 total_insns += filter->prog->len + 4; /* include a 4 instr penalty */
218 if (total_insns > MAX_INSNS_PER_PATH)
219 return -ENOMEM;
220 355
221 /* 356 /*
222 * Installing a seccomp filter requires that the task has 357 * Installing a seccomp filter requires that the task has
@@ -224,14 +359,14 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
224 * This avoids scenarios where unprivileged tasks can affect the 359 * This avoids scenarios where unprivileged tasks can affect the
225 * behavior of privileged children. 360 * behavior of privileged children.
226 */ 361 */
227 if (!current->no_new_privs && 362 if (!task_no_new_privs(current) &&
228 security_capable_noaudit(current_cred(), current_user_ns(), 363 security_capable_noaudit(current_cred(), current_user_ns(),
229 CAP_SYS_ADMIN) != 0) 364 CAP_SYS_ADMIN) != 0)
230 return -EACCES; 365 return ERR_PTR(-EACCES);
231 366
232 fp = kzalloc(fp_size, GFP_KERNEL|__GFP_NOWARN); 367 fp = kzalloc(fp_size, GFP_KERNEL|__GFP_NOWARN);
233 if (!fp) 368 if (!fp)
234 return -ENOMEM; 369 return ERR_PTR(-ENOMEM);
235 370
236 /* Copy the instructions from fprog. */ 371 /* Copy the instructions from fprog. */
237 ret = -EFAULT; 372 ret = -EFAULT;
@@ -239,7 +374,7 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
239 goto free_prog; 374 goto free_prog;
240 375
241 /* Check and rewrite the fprog via the skb checker */ 376 /* Check and rewrite the fprog via the skb checker */
242 ret = sk_chk_filter(fp, fprog->len); 377 ret = bpf_check_classic(fp, fprog->len);
243 if (ret) 378 if (ret)
244 goto free_prog; 379 goto free_prog;
245 380
@@ -248,8 +383,8 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
248 if (ret) 383 if (ret)
249 goto free_prog; 384 goto free_prog;
250 385
251 /* Convert 'sock_filter' insns to 'sock_filter_int' insns */ 386 /* Convert 'sock_filter' insns to 'bpf_insn' insns */
252 ret = sk_convert_filter(fp, fprog->len, NULL, &new_len); 387 ret = bpf_convert_filter(fp, fprog->len, NULL, &new_len);
253 if (ret) 388 if (ret)
254 goto free_prog; 389 goto free_prog;
255 390
@@ -260,12 +395,12 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
260 if (!filter) 395 if (!filter)
261 goto free_prog; 396 goto free_prog;
262 397
263 filter->prog = kzalloc(sk_filter_size(new_len), 398 filter->prog = kzalloc(bpf_prog_size(new_len),
264 GFP_KERNEL|__GFP_NOWARN); 399 GFP_KERNEL|__GFP_NOWARN);
265 if (!filter->prog) 400 if (!filter->prog)
266 goto free_filter; 401 goto free_filter;
267 402
268 ret = sk_convert_filter(fp, fprog->len, filter->prog->insnsi, &new_len); 403 ret = bpf_convert_filter(fp, fprog->len, filter->prog->insnsi, &new_len);
269 if (ret) 404 if (ret)
270 goto free_filter_prog; 405 goto free_filter_prog;
271 kfree(fp); 406 kfree(fp);
@@ -273,15 +408,9 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
273 atomic_set(&filter->usage, 1); 408 atomic_set(&filter->usage, 1);
274 filter->prog->len = new_len; 409 filter->prog->len = new_len;
275 410
276 sk_filter_select_runtime(filter->prog); 411 bpf_prog_select_runtime(filter->prog);
277 412
278 /* 413 return filter;
279 * If there is an existing filter, make it the prev and don't drop its
280 * task reference.
281 */
282 filter->prev = current->seccomp.filter;
283 current->seccomp.filter = filter;
284 return 0;
285 414
286free_filter_prog: 415free_filter_prog:
287 kfree(filter->prog); 416 kfree(filter->prog);
@@ -289,19 +418,20 @@ free_filter:
289 kfree(filter); 418 kfree(filter);
290free_prog: 419free_prog:
291 kfree(fp); 420 kfree(fp);
292 return ret; 421 return ERR_PTR(ret);
293} 422}
294 423
295/** 424/**
296 * seccomp_attach_user_filter - attaches a user-supplied sock_fprog 425 * seccomp_prepare_user_filter - prepares a user-supplied sock_fprog
297 * @user_filter: pointer to the user data containing a sock_fprog. 426 * @user_filter: pointer to the user data containing a sock_fprog.
298 * 427 *
299 * Returns 0 on success and non-zero otherwise. 428 * Returns 0 on success and non-zero otherwise.
300 */ 429 */
301static long seccomp_attach_user_filter(char __user *user_filter) 430static struct seccomp_filter *
431seccomp_prepare_user_filter(const char __user *user_filter)
302{ 432{
303 struct sock_fprog fprog; 433 struct sock_fprog fprog;
304 long ret = -EFAULT; 434 struct seccomp_filter *filter = ERR_PTR(-EFAULT);
305 435
306#ifdef CONFIG_COMPAT 436#ifdef CONFIG_COMPAT
307 if (is_compat_task()) { 437 if (is_compat_task()) {
@@ -314,9 +444,56 @@ static long seccomp_attach_user_filter(char __user *user_filter)
314#endif 444#endif
315 if (copy_from_user(&fprog, user_filter, sizeof(fprog))) 445 if (copy_from_user(&fprog, user_filter, sizeof(fprog)))
316 goto out; 446 goto out;
317 ret = seccomp_attach_filter(&fprog); 447 filter = seccomp_prepare_filter(&fprog);
318out: 448out:
319 return ret; 449 return filter;
450}
451
452/**
453 * seccomp_attach_filter: validate and attach filter
454 * @flags: flags to change filter behavior
455 * @filter: seccomp filter to add to the current process
456 *
457 * Caller must be holding current->sighand->siglock lock.
458 *
459 * Returns 0 on success, -ve on error.
460 */
461static long seccomp_attach_filter(unsigned int flags,
462 struct seccomp_filter *filter)
463{
464 unsigned long total_insns;
465 struct seccomp_filter *walker;
466
467 assert_spin_locked(&current->sighand->siglock);
468
469 /* Validate resulting filter length. */
470 total_insns = filter->prog->len;
471 for (walker = current->seccomp.filter; walker; walker = walker->prev)
472 total_insns += walker->prog->len + 4; /* 4 instr penalty */
473 if (total_insns > MAX_INSNS_PER_PATH)
474 return -ENOMEM;
475
476 /* If thread sync has been requested, check that it is possible. */
477 if (flags & SECCOMP_FILTER_FLAG_TSYNC) {
478 int ret;
479
480 ret = seccomp_can_sync_threads();
481 if (ret)
482 return ret;
483 }
484
485 /*
486 * If there is an existing filter, make it the prev and don't drop its
487 * task reference.
488 */
489 filter->prev = current->seccomp.filter;
490 current->seccomp.filter = filter;
491
492 /* Now that the new filter is in place, synchronize to all threads. */
493 if (flags & SECCOMP_FILTER_FLAG_TSYNC)
494 seccomp_sync_threads();
495
496 return 0;
320} 497}
321 498
322/* get_seccomp_filter - increments the reference count of the filter on @tsk */ 499/* get_seccomp_filter - increments the reference count of the filter on @tsk */
@@ -329,6 +506,14 @@ void get_seccomp_filter(struct task_struct *tsk)
329 atomic_inc(&orig->usage); 506 atomic_inc(&orig->usage);
330} 507}
331 508
509static inline void seccomp_filter_free(struct seccomp_filter *filter)
510{
511 if (filter) {
512 bpf_prog_free(filter->prog);
513 kfree(filter);
514 }
515}
516
332/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */ 517/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */
333void put_seccomp_filter(struct task_struct *tsk) 518void put_seccomp_filter(struct task_struct *tsk)
334{ 519{
@@ -337,8 +522,7 @@ void put_seccomp_filter(struct task_struct *tsk)
337 while (orig && atomic_dec_and_test(&orig->usage)) { 522 while (orig && atomic_dec_and_test(&orig->usage)) {
338 struct seccomp_filter *freeme = orig; 523 struct seccomp_filter *freeme = orig;
339 orig = orig->prev; 524 orig = orig->prev;
340 sk_filter_free(freeme->prog); 525 seccomp_filter_free(freeme);
341 kfree(freeme);
342 } 526 }
343} 527}
344 528
@@ -382,12 +566,17 @@ static int mode1_syscalls_32[] = {
382 566
383int __secure_computing(int this_syscall) 567int __secure_computing(int this_syscall)
384{ 568{
385 int mode = current->seccomp.mode;
386 int exit_sig = 0; 569 int exit_sig = 0;
387 int *syscall; 570 int *syscall;
388 u32 ret; 571 u32 ret;
389 572
390 switch (mode) { 573 /*
574 * Make sure that any changes to mode from another thread have
575 * been seen after TIF_SECCOMP was seen.
576 */
577 rmb();
578
579 switch (current->seccomp.mode) {
391 case SECCOMP_MODE_STRICT: 580 case SECCOMP_MODE_STRICT:
392 syscall = mode1_syscalls; 581 syscall = mode1_syscalls;
393#ifdef CONFIG_COMPAT 582#ifdef CONFIG_COMPAT
@@ -473,47 +662,152 @@ long prctl_get_seccomp(void)
473} 662}
474 663
475/** 664/**
476 * prctl_set_seccomp: configures current->seccomp.mode 665 * seccomp_set_mode_strict: internal function for setting strict seccomp
477 * @seccomp_mode: requested mode to use
478 * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER
479 * 666 *
480 * This function may be called repeatedly with a @seccomp_mode of 667 * Once current->seccomp.mode is non-zero, it may not be changed.
481 * SECCOMP_MODE_FILTER to install additional filters. Every filter 668 *
482 * successfully installed will be evaluated (in reverse order) for each system 669 * Returns 0 on success or -EINVAL on failure.
483 * call the task makes. 670 */
671static long seccomp_set_mode_strict(void)
672{
673 const unsigned long seccomp_mode = SECCOMP_MODE_STRICT;
674 long ret = -EINVAL;
675
676 spin_lock_irq(&current->sighand->siglock);
677
678 if (!seccomp_may_assign_mode(seccomp_mode))
679 goto out;
680
681#ifdef TIF_NOTSC
682 disable_TSC();
683#endif
684 seccomp_assign_mode(current, seccomp_mode);
685 ret = 0;
686
687out:
688 spin_unlock_irq(&current->sighand->siglock);
689
690 return ret;
691}
692
693#ifdef CONFIG_SECCOMP_FILTER
694/**
695 * seccomp_set_mode_filter: internal function for setting seccomp filter
696 * @flags: flags to change filter behavior
697 * @filter: struct sock_fprog containing filter
698 *
699 * This function may be called repeatedly to install additional filters.
700 * Every filter successfully installed will be evaluated (in reverse order)
701 * for each system call the task makes.
484 * 702 *
485 * Once current->seccomp.mode is non-zero, it may not be changed. 703 * Once current->seccomp.mode is non-zero, it may not be changed.
486 * 704 *
487 * Returns 0 on success or -EINVAL on failure. 705 * Returns 0 on success or -EINVAL on failure.
488 */ 706 */
489long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter) 707static long seccomp_set_mode_filter(unsigned int flags,
708 const char __user *filter)
490{ 709{
710 const unsigned long seccomp_mode = SECCOMP_MODE_FILTER;
711 struct seccomp_filter *prepared = NULL;
491 long ret = -EINVAL; 712 long ret = -EINVAL;
492 713
493 if (current->seccomp.mode && 714 /* Validate flags. */
494 current->seccomp.mode != seccomp_mode) 715 if (flags & ~SECCOMP_FILTER_FLAG_MASK)
716 return -EINVAL;
717
718 /* Prepare the new filter before holding any locks. */
719 prepared = seccomp_prepare_user_filter(filter);
720 if (IS_ERR(prepared))
721 return PTR_ERR(prepared);
722
723 /*
724 * Make sure we cannot change seccomp or nnp state via TSYNC
725 * while another thread is in the middle of calling exec.
726 */
727 if (flags & SECCOMP_FILTER_FLAG_TSYNC &&
728 mutex_lock_killable(&current->signal->cred_guard_mutex))
729 goto out_free;
730
731 spin_lock_irq(&current->sighand->siglock);
732
733 if (!seccomp_may_assign_mode(seccomp_mode))
734 goto out;
735
736 ret = seccomp_attach_filter(flags, prepared);
737 if (ret)
495 goto out; 738 goto out;
739 /* Do not free the successfully attached filter. */
740 prepared = NULL;
741
742 seccomp_assign_mode(current, seccomp_mode);
743out:
744 spin_unlock_irq(&current->sighand->siglock);
745 if (flags & SECCOMP_FILTER_FLAG_TSYNC)
746 mutex_unlock(&current->signal->cred_guard_mutex);
747out_free:
748 seccomp_filter_free(prepared);
749 return ret;
750}
751#else
752static inline long seccomp_set_mode_filter(unsigned int flags,
753 const char __user *filter)
754{
755 return -EINVAL;
756}
757#endif
758
759/* Common entry point for both prctl and syscall. */
760static long do_seccomp(unsigned int op, unsigned int flags,
761 const char __user *uargs)
762{
763 switch (op) {
764 case SECCOMP_SET_MODE_STRICT:
765 if (flags != 0 || uargs != NULL)
766 return -EINVAL;
767 return seccomp_set_mode_strict();
768 case SECCOMP_SET_MODE_FILTER:
769 return seccomp_set_mode_filter(flags, uargs);
770 default:
771 return -EINVAL;
772 }
773}
774
775SYSCALL_DEFINE3(seccomp, unsigned int, op, unsigned int, flags,
776 const char __user *, uargs)
777{
778 return do_seccomp(op, flags, uargs);
779}
780
781/**
782 * prctl_set_seccomp: configures current->seccomp.mode
783 * @seccomp_mode: requested mode to use
784 * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER
785 *
786 * Returns 0 on success or -EINVAL on failure.
787 */
788long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
789{
790 unsigned int op;
791 char __user *uargs;
496 792
497 switch (seccomp_mode) { 793 switch (seccomp_mode) {
498 case SECCOMP_MODE_STRICT: 794 case SECCOMP_MODE_STRICT:
499 ret = 0; 795 op = SECCOMP_SET_MODE_STRICT;
500#ifdef TIF_NOTSC 796 /*
501 disable_TSC(); 797 * Setting strict mode through prctl always ignored filter,
502#endif 798 * so make sure it is always NULL here to pass the internal
799 * check in do_seccomp().
800 */
801 uargs = NULL;
503 break; 802 break;
504#ifdef CONFIG_SECCOMP_FILTER
505 case SECCOMP_MODE_FILTER: 803 case SECCOMP_MODE_FILTER:
506 ret = seccomp_attach_user_filter(filter); 804 op = SECCOMP_SET_MODE_FILTER;
507 if (ret) 805 uargs = filter;
508 goto out;
509 break; 806 break;
510#endif
511 default: 807 default:
512 goto out; 808 return -EINVAL;
513 } 809 }
514 810
515 current->seccomp.mode = seccomp_mode; 811 /* prctl interface doesn't have flags, so they are always zero. */
516 set_thread_flag(TIF_SECCOMP); 812 return do_seccomp(op, 0, uargs);
517out:
518 return ret;
519} 813}
diff --git a/kernel/signal.c b/kernel/signal.c
index 40b76e351e64..8f0876f9f6dd 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2170,8 +2170,7 @@ static int ptrace_signal(int signr, siginfo_t *info)
2170 return signr; 2170 return signr;
2171} 2171}
2172 2172
2173int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, 2173int get_signal(struct ksignal *ksig)
2174 struct pt_regs *regs, void *cookie)
2175{ 2174{
2176 struct sighand_struct *sighand = current->sighand; 2175 struct sighand_struct *sighand = current->sighand;
2177 struct signal_struct *signal = current->signal; 2176 struct signal_struct *signal = current->signal;
@@ -2241,13 +2240,13 @@ relock:
2241 goto relock; 2240 goto relock;
2242 } 2241 }
2243 2242
2244 signr = dequeue_signal(current, &current->blocked, info); 2243 signr = dequeue_signal(current, &current->blocked, &ksig->info);
2245 2244
2246 if (!signr) 2245 if (!signr)
2247 break; /* will return 0 */ 2246 break; /* will return 0 */
2248 2247
2249 if (unlikely(current->ptrace) && signr != SIGKILL) { 2248 if (unlikely(current->ptrace) && signr != SIGKILL) {
2250 signr = ptrace_signal(signr, info); 2249 signr = ptrace_signal(signr, &ksig->info);
2251 if (!signr) 2250 if (!signr)
2252 continue; 2251 continue;
2253 } 2252 }
@@ -2255,13 +2254,13 @@ relock:
2255 ka = &sighand->action[signr-1]; 2254 ka = &sighand->action[signr-1];
2256 2255
2257 /* Trace actually delivered signals. */ 2256 /* Trace actually delivered signals. */
2258 trace_signal_deliver(signr, info, ka); 2257 trace_signal_deliver(signr, &ksig->info, ka);
2259 2258
2260 if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */ 2259 if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */
2261 continue; 2260 continue;
2262 if (ka->sa.sa_handler != SIG_DFL) { 2261 if (ka->sa.sa_handler != SIG_DFL) {
2263 /* Run the handler. */ 2262 /* Run the handler. */
2264 *return_ka = *ka; 2263 ksig->ka = *ka;
2265 2264
2266 if (ka->sa.sa_flags & SA_ONESHOT) 2265 if (ka->sa.sa_flags & SA_ONESHOT)
2267 ka->sa.sa_handler = SIG_DFL; 2266 ka->sa.sa_handler = SIG_DFL;
@@ -2311,7 +2310,7 @@ relock:
2311 spin_lock_irq(&sighand->siglock); 2310 spin_lock_irq(&sighand->siglock);
2312 } 2311 }
2313 2312
2314 if (likely(do_signal_stop(info->si_signo))) { 2313 if (likely(do_signal_stop(ksig->info.si_signo))) {
2315 /* It released the siglock. */ 2314 /* It released the siglock. */
2316 goto relock; 2315 goto relock;
2317 } 2316 }
@@ -2332,7 +2331,7 @@ relock:
2332 2331
2333 if (sig_kernel_coredump(signr)) { 2332 if (sig_kernel_coredump(signr)) {
2334 if (print_fatal_signals) 2333 if (print_fatal_signals)
2335 print_fatal_signal(info->si_signo); 2334 print_fatal_signal(ksig->info.si_signo);
2336 proc_coredump_connector(current); 2335 proc_coredump_connector(current);
2337 /* 2336 /*
2338 * If it was able to dump core, this kills all 2337 * If it was able to dump core, this kills all
@@ -2342,34 +2341,32 @@ relock:
2342 * first and our do_group_exit call below will use 2341 * first and our do_group_exit call below will use
2343 * that value and ignore the one we pass it. 2342 * that value and ignore the one we pass it.
2344 */ 2343 */
2345 do_coredump(info); 2344 do_coredump(&ksig->info);
2346 } 2345 }
2347 2346
2348 /* 2347 /*
2349 * Death signals, no core dump. 2348 * Death signals, no core dump.
2350 */ 2349 */
2351 do_group_exit(info->si_signo); 2350 do_group_exit(ksig->info.si_signo);
2352 /* NOTREACHED */ 2351 /* NOTREACHED */
2353 } 2352 }
2354 spin_unlock_irq(&sighand->siglock); 2353 spin_unlock_irq(&sighand->siglock);
2355 return signr; 2354
2355 ksig->sig = signr;
2356 return ksig->sig > 0;
2356} 2357}
2357 2358
2358/** 2359/**
2359 * signal_delivered - 2360 * signal_delivered -
2360 * @sig: number of signal being delivered 2361 * @ksig: kernel signal struct
2361 * @info: siginfo_t of signal being delivered
2362 * @ka: sigaction setting that chose the handler
2363 * @regs: user register state
2364 * @stepping: nonzero if debugger single-step or block-step in use 2362 * @stepping: nonzero if debugger single-step or block-step in use
2365 * 2363 *
2366 * This function should be called when a signal has successfully been 2364 * This function should be called when a signal has successfully been
2367 * delivered. It updates the blocked signals accordingly (@ka->sa.sa_mask 2365 * delivered. It updates the blocked signals accordingly (@ksig->ka.sa.sa_mask
2368 * is always blocked, and the signal itself is blocked unless %SA_NODEFER 2366 * is always blocked, and the signal itself is blocked unless %SA_NODEFER
2369 * is set in @ka->sa.sa_flags. Tracing is notified. 2367 * is set in @ksig->ka.sa.sa_flags. Tracing is notified.
2370 */ 2368 */
2371void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka, 2369static void signal_delivered(struct ksignal *ksig, int stepping)
2372 struct pt_regs *regs, int stepping)
2373{ 2370{
2374 sigset_t blocked; 2371 sigset_t blocked;
2375 2372
@@ -2379,11 +2376,11 @@ void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka,
2379 simply clear the restore sigmask flag. */ 2376 simply clear the restore sigmask flag. */
2380 clear_restore_sigmask(); 2377 clear_restore_sigmask();
2381 2378
2382 sigorsets(&blocked, &current->blocked, &ka->sa.sa_mask); 2379 sigorsets(&blocked, &current->blocked, &ksig->ka.sa.sa_mask);
2383 if (!(ka->sa.sa_flags & SA_NODEFER)) 2380 if (!(ksig->ka.sa.sa_flags & SA_NODEFER))
2384 sigaddset(&blocked, sig); 2381 sigaddset(&blocked, ksig->sig);
2385 set_current_blocked(&blocked); 2382 set_current_blocked(&blocked);
2386 tracehook_signal_handler(sig, info, ka, regs, stepping); 2383 tracehook_signal_handler(stepping);
2387} 2384}
2388 2385
2389void signal_setup_done(int failed, struct ksignal *ksig, int stepping) 2386void signal_setup_done(int failed, struct ksignal *ksig, int stepping)
@@ -2391,8 +2388,7 @@ void signal_setup_done(int failed, struct ksignal *ksig, int stepping)
2391 if (failed) 2388 if (failed)
2392 force_sigsegv(ksig->sig, current); 2389 force_sigsegv(ksig->sig, current);
2393 else 2390 else
2394 signal_delivered(ksig->sig, &ksig->info, &ksig->ka, 2391 signal_delivered(ksig, stepping);
2395 signal_pt_regs(), stepping);
2396} 2392}
2397 2393
2398/* 2394/*
diff --git a/kernel/smp.c b/kernel/smp.c
index 487653b5844f..aff8aa14f547 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -670,7 +670,7 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
670 if (cond_func(cpu, info)) { 670 if (cond_func(cpu, info)) {
671 ret = smp_call_function_single(cpu, func, 671 ret = smp_call_function_single(cpu, func,
672 info, wait); 672 info, wait);
673 WARN_ON_ONCE(!ret); 673 WARN_ON_ONCE(ret);
674 } 674 }
675 preempt_enable(); 675 preempt_enable();
676 } 676 }
diff --git a/kernel/sys.c b/kernel/sys.c
index 66a751ebf9d9..ce8129192a26 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1990,12 +1990,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
1990 if (arg2 != 1 || arg3 || arg4 || arg5) 1990 if (arg2 != 1 || arg3 || arg4 || arg5)
1991 return -EINVAL; 1991 return -EINVAL;
1992 1992
1993 current->no_new_privs = 1; 1993 task_set_no_new_privs(current);
1994 break; 1994 break;
1995 case PR_GET_NO_NEW_PRIVS: 1995 case PR_GET_NO_NEW_PRIVS:
1996 if (arg2 || arg3 || arg4 || arg5) 1996 if (arg2 || arg3 || arg4 || arg5)
1997 return -EINVAL; 1997 return -EINVAL;
1998 return current->no_new_privs ? 1 : 0; 1998 return task_no_new_privs(current) ? 1 : 0;
1999 case PR_GET_THP_DISABLE: 1999 case PR_GET_THP_DISABLE:
2000 if (arg2 || arg3 || arg4 || arg5) 2000 if (arg2 || arg3 || arg4 || arg5)
2001 return -EINVAL; 2001 return -EINVAL;
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 36441b51b5df..391d4ddb6f4b 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -25,6 +25,7 @@ cond_syscall(sys_swapon);
25cond_syscall(sys_swapoff); 25cond_syscall(sys_swapoff);
26cond_syscall(sys_kexec_load); 26cond_syscall(sys_kexec_load);
27cond_syscall(compat_sys_kexec_load); 27cond_syscall(compat_sys_kexec_load);
28cond_syscall(sys_kexec_file_load);
28cond_syscall(sys_init_module); 29cond_syscall(sys_init_module);
29cond_syscall(sys_finit_module); 30cond_syscall(sys_finit_module);
30cond_syscall(sys_delete_module); 31cond_syscall(sys_delete_module);
@@ -197,6 +198,7 @@ cond_syscall(compat_sys_timerfd_settime);
197cond_syscall(compat_sys_timerfd_gettime); 198cond_syscall(compat_sys_timerfd_gettime);
198cond_syscall(sys_eventfd); 199cond_syscall(sys_eventfd);
199cond_syscall(sys_eventfd2); 200cond_syscall(sys_eventfd2);
201cond_syscall(sys_memfd_create);
200 202
201/* performance counters: */ 203/* performance counters: */
202cond_syscall(sys_perf_event_open); 204cond_syscall(sys_perf_event_open);
@@ -213,3 +215,6 @@ cond_syscall(compat_sys_open_by_handle_at);
213 215
214/* compare kernel pointers */ 216/* compare kernel pointers */
215cond_syscall(sys_kcmp); 217cond_syscall(sys_kcmp);
218
219/* operate on Secure Computing state */
220cond_syscall(sys_seccomp);
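The sys_ni.c additions register sys_kexec_file_load, sys_memfd_create and sys_seccomp as conditional syscalls: when the implementing code is not built in, the symbol resolves to the "not implemented" stub so the syscall table still links and the call returns -ENOSYS. A small sketch of the underlying weak-symbol idea; the kernel does this with a weak alias to sys_ni_syscall, and the function name below is only an example:

    #include <errno.h>
    #include <stdio.h>

    /* Weak fallback: a strong definition elsewhere (the real syscall,
     * when its subsystem is compiled in) overrides this at link time. */
    long __attribute__((weak)) sys_seccomp(void)
    {
            return -ENOSYS;
    }

    int main(void)
    {
            printf("%ld\n", sys_seccomp());  /* -ENOSYS unless overridden */
            return 0;
    }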
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 75b22e22a72c..75875a741b5e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1240,8 +1240,7 @@ static struct ctl_table vm_table[] = {
1240 .maxlen = sizeof(unsigned long), 1240 .maxlen = sizeof(unsigned long),
1241 .mode = 0644, 1241 .mode = 0644,
1242 .proc_handler = hugetlb_sysctl_handler, 1242 .proc_handler = hugetlb_sysctl_handler,
1243 .extra1 = (void *)&hugetlb_zero, 1243 .extra1 = &zero,
1244 .extra2 = (void *)&hugetlb_infinity,
1245 }, 1244 },
1246#ifdef CONFIG_NUMA 1245#ifdef CONFIG_NUMA
1247 { 1246 {
@@ -1250,8 +1249,7 @@ static struct ctl_table vm_table[] = {
1250 .maxlen = sizeof(unsigned long), 1249 .maxlen = sizeof(unsigned long),
1251 .mode = 0644, 1250 .mode = 0644,
1252 .proc_handler = &hugetlb_mempolicy_sysctl_handler, 1251 .proc_handler = &hugetlb_mempolicy_sysctl_handler,
1253 .extra1 = (void *)&hugetlb_zero, 1252 .extra1 = &zero,
1254 .extra2 = (void *)&hugetlb_infinity,
1255 }, 1253 },
1256#endif 1254#endif
1257 { 1255 {
@@ -1274,8 +1272,7 @@ static struct ctl_table vm_table[] = {
1274 .maxlen = sizeof(unsigned long), 1272 .maxlen = sizeof(unsigned long),
1275 .mode = 0644, 1273 .mode = 0644,
1276 .proc_handler = hugetlb_overcommit_handler, 1274 .proc_handler = hugetlb_overcommit_handler,
1277 .extra1 = (void *)&hugetlb_zero, 1275 .extra1 = &zero,
1278 .extra2 = (void *)&hugetlb_infinity,
1279 }, 1276 },
1280#endif 1277#endif
1281 { 1278 {
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 653cbbd9e7ad..e4ba9a5a5ccb 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -522,6 +522,7 @@ static const struct bin_table bin_net_ipv6_conf_var_table[] = {
522 { CTL_INT, NET_IPV6_ACCEPT_RA_RT_INFO_MAX_PLEN, "accept_ra_rt_info_max_plen" }, 522 { CTL_INT, NET_IPV6_ACCEPT_RA_RT_INFO_MAX_PLEN, "accept_ra_rt_info_max_plen" },
523 { CTL_INT, NET_IPV6_PROXY_NDP, "proxy_ndp" }, 523 { CTL_INT, NET_IPV6_PROXY_NDP, "proxy_ndp" },
524 { CTL_INT, NET_IPV6_ACCEPT_SOURCE_ROUTE, "accept_source_route" }, 524 { CTL_INT, NET_IPV6_ACCEPT_SOURCE_ROUTE, "accept_source_route" },
525 { CTL_INT, NET_IPV6_ACCEPT_RA_FROM_LOCAL, "accept_ra_from_local" },
525 {} 526 {}
526}; 527};
527 528
diff --git a/kernel/system_keyring.c b/kernel/system_keyring.c
index 52ebc70263f4..875f64e8935b 100644
--- a/kernel/system_keyring.c
+++ b/kernel/system_keyring.c
@@ -89,6 +89,7 @@ static __init int load_system_certificate_list(void)
89 pr_err("Problem loading in-kernel X.509 certificate (%ld)\n", 89 pr_err("Problem loading in-kernel X.509 certificate (%ld)\n",
90 PTR_ERR(key)); 90 PTR_ERR(key));
91 } else { 91 } else {
92 set_bit(KEY_FLAG_BUILTIN, &key_ref_to_ptr(key)->flags);
92 pr_notice("Loaded X.509 cert '%s'\n", 93 pr_notice("Loaded X.509 cert '%s'\n",
93 key_ref_to_ptr(key)->description); 94 key_ref_to_ptr(key)->description);
94 key_ref_put(key); 95 key_ref_put(key);
diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c
index 12d6ebbfdd83..0dbab6d1acb4 100644
--- a/kernel/test_kprobes.c
+++ b/kernel/test_kprobes.c
@@ -14,6 +14,8 @@
14 * the GNU General Public License for more details. 14 * the GNU General Public License for more details.
15 */ 15 */
16 16
17#define pr_fmt(fmt) "Kprobe smoke test: " fmt
18
17#include <linux/kernel.h> 19#include <linux/kernel.h>
18#include <linux/kprobes.h> 20#include <linux/kprobes.h>
19#include <linux/random.h> 21#include <linux/random.h>
@@ -41,8 +43,7 @@ static void kp_post_handler(struct kprobe *p, struct pt_regs *regs,
41{ 43{
42 if (preh_val != (rand1 / div_factor)) { 44 if (preh_val != (rand1 / div_factor)) {
43 handler_errors++; 45 handler_errors++;
44 printk(KERN_ERR "Kprobe smoke test failed: " 46 pr_err("incorrect value in post_handler\n");
45 "incorrect value in post_handler\n");
46 } 47 }
47 posth_val = preh_val + div_factor; 48 posth_val = preh_val + div_factor;
48} 49}
@@ -59,8 +60,7 @@ static int test_kprobe(void)
59 60
60 ret = register_kprobe(&kp); 61 ret = register_kprobe(&kp);
61 if (ret < 0) { 62 if (ret < 0) {
62 printk(KERN_ERR "Kprobe smoke test failed: " 63 pr_err("register_kprobe returned %d\n", ret);
63 "register_kprobe returned %d\n", ret);
64 return ret; 64 return ret;
65 } 65 }
66 66
@@ -68,14 +68,12 @@ static int test_kprobe(void)
68 unregister_kprobe(&kp); 68 unregister_kprobe(&kp);
69 69
70 if (preh_val == 0) { 70 if (preh_val == 0) {
71 printk(KERN_ERR "Kprobe smoke test failed: " 71 pr_err("kprobe pre_handler not called\n");
72 "kprobe pre_handler not called\n");
73 handler_errors++; 72 handler_errors++;
74 } 73 }
75 74
76 if (posth_val == 0) { 75 if (posth_val == 0) {
77 printk(KERN_ERR "Kprobe smoke test failed: " 76 pr_err("kprobe post_handler not called\n");
78 "kprobe post_handler not called\n");
79 handler_errors++; 77 handler_errors++;
80 } 78 }
81 79
@@ -98,8 +96,7 @@ static void kp_post_handler2(struct kprobe *p, struct pt_regs *regs,
98{ 96{
99 if (preh_val != (rand1 / div_factor) + 1) { 97 if (preh_val != (rand1 / div_factor) + 1) {
100 handler_errors++; 98 handler_errors++;
101 printk(KERN_ERR "Kprobe smoke test failed: " 99 pr_err("incorrect value in post_handler2\n");
102 "incorrect value in post_handler2\n");
103 } 100 }
104 posth_val = preh_val + div_factor; 101 posth_val = preh_val + div_factor;
105} 102}
@@ -120,8 +117,7 @@ static int test_kprobes(void)
120 kp.flags = 0; 117 kp.flags = 0;
121 ret = register_kprobes(kps, 2); 118 ret = register_kprobes(kps, 2);
122 if (ret < 0) { 119 if (ret < 0) {
123 printk(KERN_ERR "Kprobe smoke test failed: " 120 pr_err("register_kprobes returned %d\n", ret);
124 "register_kprobes returned %d\n", ret);
125 return ret; 121 return ret;
126 } 122 }
127 123
@@ -130,14 +126,12 @@ static int test_kprobes(void)
130 ret = target(rand1); 126 ret = target(rand1);
131 127
132 if (preh_val == 0) { 128 if (preh_val == 0) {
133 printk(KERN_ERR "Kprobe smoke test failed: " 129 pr_err("kprobe pre_handler not called\n");
134 "kprobe pre_handler not called\n");
135 handler_errors++; 130 handler_errors++;
136 } 131 }
137 132
138 if (posth_val == 0) { 133 if (posth_val == 0) {
139 printk(KERN_ERR "Kprobe smoke test failed: " 134 pr_err("kprobe post_handler not called\n");
140 "kprobe post_handler not called\n");
141 handler_errors++; 135 handler_errors++;
142 } 136 }
143 137
@@ -146,14 +140,12 @@ static int test_kprobes(void)
146 ret = target2(rand1); 140 ret = target2(rand1);
147 141
148 if (preh_val == 0) { 142 if (preh_val == 0) {
149 printk(KERN_ERR "Kprobe smoke test failed: " 143 pr_err("kprobe pre_handler2 not called\n");
150 "kprobe pre_handler2 not called\n");
151 handler_errors++; 144 handler_errors++;
152 } 145 }
153 146
154 if (posth_val == 0) { 147 if (posth_val == 0) {
155 printk(KERN_ERR "Kprobe smoke test failed: " 148 pr_err("kprobe post_handler2 not called\n");
156 "kprobe post_handler2 not called\n");
157 handler_errors++; 149 handler_errors++;
158 } 150 }
159 151
@@ -166,8 +158,7 @@ static u32 j_kprobe_target(u32 value)
166{ 158{
167 if (value != rand1) { 159 if (value != rand1) {
168 handler_errors++; 160 handler_errors++;
169 printk(KERN_ERR "Kprobe smoke test failed: " 161 pr_err("incorrect value in jprobe handler\n");
170 "incorrect value in jprobe handler\n");
171 } 162 }
172 163
173 jph_val = rand1; 164 jph_val = rand1;
@@ -186,16 +177,14 @@ static int test_jprobe(void)
186 177
187 ret = register_jprobe(&jp); 178 ret = register_jprobe(&jp);
188 if (ret < 0) { 179 if (ret < 0) {
189 printk(KERN_ERR "Kprobe smoke test failed: " 180 pr_err("register_jprobe returned %d\n", ret);
190 "register_jprobe returned %d\n", ret);
191 return ret; 181 return ret;
192 } 182 }
193 183
194 ret = target(rand1); 184 ret = target(rand1);
195 unregister_jprobe(&jp); 185 unregister_jprobe(&jp);
196 if (jph_val == 0) { 186 if (jph_val == 0) {
197 printk(KERN_ERR "Kprobe smoke test failed: " 187 pr_err("jprobe handler not called\n");
198 "jprobe handler not called\n");
199 handler_errors++; 188 handler_errors++;
200 } 189 }
201 190
@@ -217,24 +206,21 @@ static int test_jprobes(void)
217 jp.kp.flags = 0; 206 jp.kp.flags = 0;
218 ret = register_jprobes(jps, 2); 207 ret = register_jprobes(jps, 2);
219 if (ret < 0) { 208 if (ret < 0) {
220 printk(KERN_ERR "Kprobe smoke test failed: " 209 pr_err("register_jprobes returned %d\n", ret);
221 "register_jprobes returned %d\n", ret);
222 return ret; 210 return ret;
223 } 211 }
224 212
225 jph_val = 0; 213 jph_val = 0;
226 ret = target(rand1); 214 ret = target(rand1);
227 if (jph_val == 0) { 215 if (jph_val == 0) {
228 printk(KERN_ERR "Kprobe smoke test failed: " 216 pr_err("jprobe handler not called\n");
229 "jprobe handler not called\n");
230 handler_errors++; 217 handler_errors++;
231 } 218 }
232 219
233 jph_val = 0; 220 jph_val = 0;
234 ret = target2(rand1); 221 ret = target2(rand1);
235 if (jph_val == 0) { 222 if (jph_val == 0) {
236 printk(KERN_ERR "Kprobe smoke test failed: " 223 pr_err("jprobe handler2 not called\n");
237 "jprobe handler2 not called\n");
238 handler_errors++; 224 handler_errors++;
239 } 225 }
240 unregister_jprobes(jps, 2); 226 unregister_jprobes(jps, 2);
@@ -256,13 +242,11 @@ static int return_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
256 242
257 if (ret != (rand1 / div_factor)) { 243 if (ret != (rand1 / div_factor)) {
258 handler_errors++; 244 handler_errors++;
259 printk(KERN_ERR "Kprobe smoke test failed: " 245 pr_err("incorrect value in kretprobe handler\n");
260 "incorrect value in kretprobe handler\n");
261 } 246 }
262 if (krph_val == 0) { 247 if (krph_val == 0) {
263 handler_errors++; 248 handler_errors++;
264 printk(KERN_ERR "Kprobe smoke test failed: " 249 pr_err("call to kretprobe entry handler failed\n");
265 "call to kretprobe entry handler failed\n");
266 } 250 }
267 251
268 krph_val = rand1; 252 krph_val = rand1;
@@ -281,16 +265,14 @@ static int test_kretprobe(void)
281 265
282 ret = register_kretprobe(&rp); 266 ret = register_kretprobe(&rp);
283 if (ret < 0) { 267 if (ret < 0) {
284 printk(KERN_ERR "Kprobe smoke test failed: " 268 pr_err("register_kretprobe returned %d\n", ret);
285 "register_kretprobe returned %d\n", ret);
286 return ret; 269 return ret;
287 } 270 }
288 271
289 ret = target(rand1); 272 ret = target(rand1);
290 unregister_kretprobe(&rp); 273 unregister_kretprobe(&rp);
291 if (krph_val != rand1) { 274 if (krph_val != rand1) {
292 printk(KERN_ERR "Kprobe smoke test failed: " 275 pr_err("kretprobe handler not called\n");
293 "kretprobe handler not called\n");
294 handler_errors++; 276 handler_errors++;
295 } 277 }
296 278
@@ -303,13 +285,11 @@ static int return_handler2(struct kretprobe_instance *ri, struct pt_regs *regs)
303 285
304 if (ret != (rand1 / div_factor) + 1) { 286 if (ret != (rand1 / div_factor) + 1) {
305 handler_errors++; 287 handler_errors++;
306 printk(KERN_ERR "Kprobe smoke test failed: " 288 pr_err("incorrect value in kretprobe handler2\n");
307 "incorrect value in kretprobe handler2\n");
308 } 289 }
309 if (krph_val == 0) { 290 if (krph_val == 0) {
310 handler_errors++; 291 handler_errors++;
311 printk(KERN_ERR "Kprobe smoke test failed: " 292 pr_err("call to kretprobe entry handler failed\n");
312 "call to kretprobe entry handler failed\n");
313 } 293 }
314 294
315 krph_val = rand1; 295 krph_val = rand1;
@@ -332,24 +312,21 @@ static int test_kretprobes(void)
332 rp.kp.flags = 0; 312 rp.kp.flags = 0;
333 ret = register_kretprobes(rps, 2); 313 ret = register_kretprobes(rps, 2);
334 if (ret < 0) { 314 if (ret < 0) {
335 printk(KERN_ERR "Kprobe smoke test failed: " 315 pr_err("register_kretprobe returned %d\n", ret);
336 "register_kretprobe returned %d\n", ret);
337 return ret; 316 return ret;
338 } 317 }
339 318
340 krph_val = 0; 319 krph_val = 0;
341 ret = target(rand1); 320 ret = target(rand1);
342 if (krph_val != rand1) { 321 if (krph_val != rand1) {
343 printk(KERN_ERR "Kprobe smoke test failed: " 322 pr_err("kretprobe handler not called\n");
344 "kretprobe handler not called\n");
345 handler_errors++; 323 handler_errors++;
346 } 324 }
347 325
348 krph_val = 0; 326 krph_val = 0;
349 ret = target2(rand1); 327 ret = target2(rand1);
350 if (krph_val != rand1) { 328 if (krph_val != rand1) {
351 printk(KERN_ERR "Kprobe smoke test failed: " 329 pr_err("kretprobe handler2 not called\n");
352 "kretprobe handler2 not called\n");
353 handler_errors++; 330 handler_errors++;
354 } 331 }
355 unregister_kretprobes(rps, 2); 332 unregister_kretprobes(rps, 2);
@@ -368,7 +345,7 @@ int init_test_probes(void)
368 rand1 = prandom_u32(); 345 rand1 = prandom_u32();
369 } while (rand1 <= div_factor); 346 } while (rand1 <= div_factor);
370 347
371 printk(KERN_INFO "Kprobe smoke test started\n"); 348 pr_info("started\n");
372 num_tests++; 349 num_tests++;
373 ret = test_kprobe(); 350 ret = test_kprobe();
374 if (ret < 0) 351 if (ret < 0)
@@ -402,13 +379,11 @@ int init_test_probes(void)
402#endif /* CONFIG_KRETPROBES */ 379#endif /* CONFIG_KRETPROBES */
403 380
404 if (errors) 381 if (errors)
405 printk(KERN_ERR "BUG: Kprobe smoke test: %d out of " 382 pr_err("BUG: %d out of %d tests failed\n", errors, num_tests);
406 "%d tests failed\n", errors, num_tests);
407 else if (handler_errors) 383 else if (handler_errors)
408 printk(KERN_ERR "BUG: Kprobe smoke test: %d error(s) " 384 pr_err("BUG: %d error(s) running handlers\n", handler_errors);
409 "running handlers\n", handler_errors);
410 else 385 else
411 printk(KERN_INFO "Kprobe smoke test passed successfully\n"); 386 pr_info("passed successfully\n");
412 387
413 return 0; 388 return 0;
414} 389}
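The test_kprobes.c conversion defines pr_fmt() once at the top of the file and then relies on pr_err()/pr_info() to prepend the "Kprobe smoke test: " prefix, which is why the repeated prefix strings could be deleted from every message. A hedged userspace sketch of how that composition works; printf/fprintf stand in for printk here:

    #include <stdio.h>

    /* Must be defined before the helpers that expand it. */
    #define pr_fmt(fmt) "Kprobe smoke test: " fmt

    /* Userspace stand-ins; in the kernel these wrap printk(KERN_ERR
     * pr_fmt(fmt), ...) and printk(KERN_INFO pr_fmt(fmt), ...). */
    #define pr_err(fmt, ...)  fprintf(stderr, pr_fmt(fmt), ##__VA_ARGS__)
    #define pr_info(fmt, ...) printf(pr_fmt(fmt), ##__VA_ARGS__)

    int main(void)
    {
            pr_info("started\n");   /* prints "Kprobe smoke test: started" */
            pr_err("register_kprobe returned %d\n", -22);
            return 0;
    }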
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index f448513a45ed..d626dc98e8df 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -12,6 +12,11 @@ config CLOCKSOURCE_WATCHDOG
12config ARCH_CLOCKSOURCE_DATA 12config ARCH_CLOCKSOURCE_DATA
13 bool 13 bool
14 14
15# Clocksources require validation of the clocksource against the last
16# cycle update - x86/TSC misfeature
17config CLOCKSOURCE_VALIDATE_LAST_CYCLE
18 bool
19
15# Timekeeping vsyscall support 20# Timekeeping vsyscall support
16config GENERIC_TIME_VSYSCALL 21config GENERIC_TIME_VSYSCALL
17 bool 22 bool
@@ -20,10 +25,6 @@ config GENERIC_TIME_VSYSCALL
20config GENERIC_TIME_VSYSCALL_OLD 25config GENERIC_TIME_VSYSCALL_OLD
21 bool 26 bool
22 27
23# ktime_t scalar 64bit nsec representation
24config KTIME_SCALAR
25 bool
26
27# Old style timekeeping 28# Old style timekeeping
28config ARCH_USES_GETTIMEOFFSET 29config ARCH_USES_GETTIMEOFFSET
29 bool 30 bool
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index 57a413fd0ebf..7347426fa68d 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1,3 +1,4 @@
1obj-y += time.o timer.o hrtimer.o itimer.o posix-timers.o posix-cpu-timers.o
1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o 2obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o
2obj-y += timeconv.o posix-clock.o alarmtimer.o 3obj-y += timeconv.o posix-clock.o alarmtimer.o
3 4
@@ -12,3 +13,21 @@ obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o
12obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o 13obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o
13obj-$(CONFIG_TIMER_STATS) += timer_stats.o 14obj-$(CONFIG_TIMER_STATS) += timer_stats.o
14obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o 15obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o
16obj-$(CONFIG_TEST_UDELAY) += udelay_test.o
17
18$(obj)/time.o: $(obj)/timeconst.h
19
20quiet_cmd_hzfile = HZFILE $@
21 cmd_hzfile = echo "hz=$(CONFIG_HZ)" > $@
22
23targets += hz.bc
24$(obj)/hz.bc: $(objtree)/include/config/hz.h FORCE
25 $(call if_changed,hzfile)
26
27quiet_cmd_bc = BC $@
28 cmd_bc = bc -q $(filter-out FORCE,$^) > $@
29
30targets += timeconst.h
31$(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE
32 $(call if_changed,bc)
33
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index fe75444ae7ec..4aec4a457431 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -71,7 +71,7 @@ struct rtc_device *alarmtimer_get_rtcdev(void)
71 71
72 return ret; 72 return ret;
73} 73}
74 74EXPORT_SYMBOL_GPL(alarmtimer_get_rtcdev);
75 75
76static int alarmtimer_rtc_add_device(struct device *dev, 76static int alarmtimer_rtc_add_device(struct device *dev,
77 struct class_interface *class_intf) 77 struct class_interface *class_intf)
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index ba3e502c955a..2e949cc9c9f1 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -32,6 +32,7 @@
32#include <linux/kthread.h> 32#include <linux/kthread.h>
33 33
34#include "tick-internal.h" 34#include "tick-internal.h"
35#include "timekeeping_internal.h"
35 36
36void timecounter_init(struct timecounter *tc, 37void timecounter_init(struct timecounter *tc,
37 const struct cyclecounter *cc, 38 const struct cyclecounter *cc,
@@ -249,7 +250,7 @@ void clocksource_mark_unstable(struct clocksource *cs)
249static void clocksource_watchdog(unsigned long data) 250static void clocksource_watchdog(unsigned long data)
250{ 251{
251 struct clocksource *cs; 252 struct clocksource *cs;
252 cycle_t csnow, wdnow; 253 cycle_t csnow, wdnow, delta;
253 int64_t wd_nsec, cs_nsec; 254 int64_t wd_nsec, cs_nsec;
254 int next_cpu, reset_pending; 255 int next_cpu, reset_pending;
255 256
@@ -282,11 +283,12 @@ static void clocksource_watchdog(unsigned long data)
282 continue; 283 continue;
283 } 284 }
284 285
285 wd_nsec = clocksource_cyc2ns((wdnow - cs->wd_last) & watchdog->mask, 286 delta = clocksource_delta(wdnow, cs->wd_last, watchdog->mask);
286 watchdog->mult, watchdog->shift); 287 wd_nsec = clocksource_cyc2ns(delta, watchdog->mult,
288 watchdog->shift);
287 289
288 cs_nsec = clocksource_cyc2ns((csnow - cs->cs_last) & 290 delta = clocksource_delta(csnow, cs->cs_last, cs->mask);
289 cs->mask, cs->mult, cs->shift); 291 cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift);
290 cs->cs_last = csnow; 292 cs->cs_last = csnow;
291 cs->wd_last = wdnow; 293 cs->wd_last = wdnow;
292 294
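The clocksource.c hunk replaces the two open-coded "(now - last) & mask" computations in the watchdog with the clocksource_delta() helper now provided by timekeeping_internal.h. A sketch of the masked-delta idea, showing why the mask makes wrap-around of a narrow hardware counter harmless; the 24-bit mask is just an example:

    #include <stdint.h>
    #include <stdio.h>

    typedef uint64_t cycle_t;

    /* Sketch of clocksource_delta(): masking keeps the delta correct for
     * counters narrower than 64 bits, even across a counter wrap. */
    static cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask)
    {
            return (now - last) & mask;
    }

    int main(void)
    {
            cycle_t mask = 0xffffff;               /* e.g. a 24-bit counter */
            cycle_t last = 0xfffff0, now = 0x10;   /* counter wrapped */

            printf("delta=%llu\n",
                   (unsigned long long)clocksource_delta(now, last, mask)); /* 32 */
            return 0;
    }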
diff --git a/kernel/hrtimer.c b/kernel/time/hrtimer.c
index 3ab28993f6e0..1c2fe7de2842 100644
--- a/kernel/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -54,6 +54,8 @@
54 54
55#include <trace/events/timer.h> 55#include <trace/events/timer.h>
56 56
57#include "timekeeping.h"
58
57/* 59/*
58 * The timer bases: 60 * The timer bases:
59 * 61 *
@@ -114,21 +116,18 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id)
114 */ 116 */
115static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) 117static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
116{ 118{
117 ktime_t xtim, mono, boot; 119 ktime_t xtim, mono, boot, tai;
118 struct timespec xts, tom, slp; 120 ktime_t off_real, off_boot, off_tai;
119 s32 tai_offset;
120 121
121 get_xtime_and_monotonic_and_sleep_offset(&xts, &tom, &slp); 122 mono = ktime_get_update_offsets_tick(&off_real, &off_boot, &off_tai);
122 tai_offset = timekeeping_get_tai_offset(); 123 boot = ktime_add(mono, off_boot);
124 xtim = ktime_add(mono, off_real);
125 tai = ktime_add(xtim, off_tai);
123 126
124 xtim = timespec_to_ktime(xts);
125 mono = ktime_add(xtim, timespec_to_ktime(tom));
126 boot = ktime_add(mono, timespec_to_ktime(slp));
127 base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim; 127 base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim;
128 base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono; 128 base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono;
129 base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot; 129 base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot;
130 base->clock_base[HRTIMER_BASE_TAI].softirq_time = 130 base->clock_base[HRTIMER_BASE_TAI].softirq_time = tai;
131 ktime_add(xtim, ktime_set(tai_offset, 0));
132} 131}
133 132
134/* 133/*
@@ -264,60 +263,6 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
264 * too large for inlining: 263 * too large for inlining:
265 */ 264 */
266#if BITS_PER_LONG < 64 265#if BITS_PER_LONG < 64
267# ifndef CONFIG_KTIME_SCALAR
268/**
269 * ktime_add_ns - Add a scalar nanoseconds value to a ktime_t variable
270 * @kt: addend
271 * @nsec: the scalar nsec value to add
272 *
273 * Returns the sum of kt and nsec in ktime_t format
274 */
275ktime_t ktime_add_ns(const ktime_t kt, u64 nsec)
276{
277 ktime_t tmp;
278
279 if (likely(nsec < NSEC_PER_SEC)) {
280 tmp.tv64 = nsec;
281 } else {
282 unsigned long rem = do_div(nsec, NSEC_PER_SEC);
283
284 /* Make sure nsec fits into long */
285 if (unlikely(nsec > KTIME_SEC_MAX))
286 return (ktime_t){ .tv64 = KTIME_MAX };
287
288 tmp = ktime_set((long)nsec, rem);
289 }
290
291 return ktime_add(kt, tmp);
292}
293
294EXPORT_SYMBOL_GPL(ktime_add_ns);
295
296/**
297 * ktime_sub_ns - Subtract a scalar nanoseconds value from a ktime_t variable
298 * @kt: minuend
299 * @nsec: the scalar nsec value to subtract
300 *
301 * Returns the subtraction of @nsec from @kt in ktime_t format
302 */
303ktime_t ktime_sub_ns(const ktime_t kt, u64 nsec)
304{
305 ktime_t tmp;
306
307 if (likely(nsec < NSEC_PER_SEC)) {
308 tmp.tv64 = nsec;
309 } else {
310 unsigned long rem = do_div(nsec, NSEC_PER_SEC);
311
312 tmp = ktime_set((long)nsec, rem);
313 }
314
315 return ktime_sub(kt, tmp);
316}
317
318EXPORT_SYMBOL_GPL(ktime_sub_ns);
319# endif /* !CONFIG_KTIME_SCALAR */
320
321/* 266/*
322 * Divide a ktime value by a nanosecond value 267 * Divide a ktime value by a nanosecond value
323 */ 268 */
@@ -337,6 +282,7 @@ u64 ktime_divns(const ktime_t kt, s64 div)
337 282
338 return dclc; 283 return dclc;
339} 284}
285EXPORT_SYMBOL_GPL(ktime_divns);
340#endif /* BITS_PER_LONG >= 64 */ 286#endif /* BITS_PER_LONG >= 64 */
341 287
342/* 288/*
@@ -602,6 +548,11 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
602 * timers, we have to check, whether it expires earlier than the timer for 548 * timers, we have to check, whether it expires earlier than the timer for
603 * which the clock event device was armed. 549 * which the clock event device was armed.
604 * 550 *
551 * Note, that in case the state has HRTIMER_STATE_CALLBACK set, no reprogramming
552 * and no expiry check happens. The timer gets enqueued into the rbtree. The
553 * reprogramming and expiry check is done in the hrtimer_interrupt or in the
554 * softirq.
555 *
605 * Called with interrupts disabled and base->cpu_base.lock held 556 * Called with interrupts disabled and base->cpu_base.lock held
606 */ 557 */
607static int hrtimer_reprogram(struct hrtimer *timer, 558static int hrtimer_reprogram(struct hrtimer *timer,
@@ -662,25 +613,13 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
662 base->hres_active = 0; 613 base->hres_active = 0;
663} 614}
664 615
665/*
666 * When High resolution timers are active, try to reprogram. Note, that in case
667 * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry
668 * check happens. The timer gets enqueued into the rbtree. The reprogramming
669 * and expiry check is done in the hrtimer_interrupt or in the softirq.
670 */
671static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
672 struct hrtimer_clock_base *base)
673{
674 return base->cpu_base->hres_active && hrtimer_reprogram(timer, base);
675}
676
677static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) 616static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
678{ 617{
679 ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; 618 ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset;
680 ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; 619 ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
681 ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset; 620 ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset;
682 621
683 return ktime_get_update_offsets(offs_real, offs_boot, offs_tai); 622 return ktime_get_update_offsets_now(offs_real, offs_boot, offs_tai);
684} 623}
685 624
686/* 625/*
@@ -755,8 +694,8 @@ static inline int hrtimer_is_hres_enabled(void) { return 0; }
755static inline int hrtimer_switch_to_hres(void) { return 0; } 694static inline int hrtimer_switch_to_hres(void) { return 0; }
756static inline void 695static inline void
757hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { } 696hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { }
758static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, 697static inline int hrtimer_reprogram(struct hrtimer *timer,
759 struct hrtimer_clock_base *base) 698 struct hrtimer_clock_base *base)
760{ 699{
761 return 0; 700 return 0;
762} 701}
@@ -1013,14 +952,25 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
1013 952
1014 leftmost = enqueue_hrtimer(timer, new_base); 953 leftmost = enqueue_hrtimer(timer, new_base);
1015 954
1016 /* 955 if (!leftmost) {
1017 * Only allow reprogramming if the new base is on this CPU. 956 unlock_hrtimer_base(timer, &flags);
1018 * (it might still be on another CPU if the timer was pending) 957 return ret;
1019 * 958 }
1020 * XXX send_remote_softirq() ? 959
1021 */ 960 if (!hrtimer_is_hres_active(timer)) {
1022 if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases) 961 /*
1023 && hrtimer_enqueue_reprogram(timer, new_base)) { 962 * Kick to reschedule the next tick to handle the new timer
963 * on dynticks target.
964 */
965 wake_up_nohz_cpu(new_base->cpu_base->cpu);
966 } else if (new_base->cpu_base == &__get_cpu_var(hrtimer_bases) &&
967 hrtimer_reprogram(timer, new_base)) {
968 /*
969 * Only allow reprogramming if the new base is on this CPU.
970 * (it might still be on another CPU if the timer was pending)
971 *
972 * XXX send_remote_softirq() ?
973 */
1024 if (wakeup) { 974 if (wakeup) {
1025 /* 975 /*
1026 * We need to drop cpu_base->lock to avoid a 976 * We need to drop cpu_base->lock to avoid a
@@ -1680,6 +1630,7 @@ static void init_hrtimers_cpu(int cpu)
1680 timerqueue_init_head(&cpu_base->clock_base[i].active); 1630 timerqueue_init_head(&cpu_base->clock_base[i].active);
1681 } 1631 }
1682 1632
1633 cpu_base->cpu = cpu;
1683 hrtimer_init_hres(cpu_base); 1634 hrtimer_init_hres(cpu_base);
1684} 1635}
1685 1636
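With CONFIG_KTIME_SCALAR removed from kernel/time/Kconfig, the 32-bit fallback definitions of ktime_add_ns()/ktime_sub_ns() deleted above are no longer needed: ktime_t is a plain 64-bit nanosecond count on every architecture, so adding a scalar nanosecond value is ordinary integer arithmetic. A minimal sketch of that representation; the type and helper names are illustrative, not the <linux/ktime.h> definitions:

    #include <stdint.h>
    #include <stdio.h>

    /* ktime_t as a 64-bit nanosecond scalar (sketch). */
    typedef int64_t ktime_sketch_t;

    static ktime_sketch_t ktime_add_ns_sketch(ktime_sketch_t kt, uint64_t nsec)
    {
            return kt + (int64_t)nsec;
    }

    int main(void)
    {
            ktime_sketch_t t = 1500000000;           /* 1.5 s in ns */

            t = ktime_add_ns_sketch(t, 250000000);   /* + 0.25 s    */
            printf("%lld ns\n", (long long)t);       /* 1750000000  */
            return 0;
    }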
diff --git a/kernel/itimer.c b/kernel/time/itimer.c
index 8d262b467573..8d262b467573 100644
--- a/kernel/itimer.c
+++ b/kernel/time/itimer.c
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 33db43a39515..87a346fd6d61 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -466,7 +466,8 @@ static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock);
466 466
467static void sync_cmos_clock(struct work_struct *work) 467static void sync_cmos_clock(struct work_struct *work)
468{ 468{
469 struct timespec now, next; 469 struct timespec64 now;
470 struct timespec next;
470 int fail = 1; 471 int fail = 1;
471 472
472 /* 473 /*
@@ -485,9 +486,9 @@ static void sync_cmos_clock(struct work_struct *work)
485 return; 486 return;
486 } 487 }
487 488
488 getnstimeofday(&now); 489 getnstimeofday64(&now);
489 if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec * 5) { 490 if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec * 5) {
490 struct timespec adjust = now; 491 struct timespec adjust = timespec64_to_timespec(now);
491 492
492 fail = -ENODEV; 493 fail = -ENODEV;
493 if (persistent_clock_is_local) 494 if (persistent_clock_is_local)
@@ -531,7 +532,7 @@ void ntp_notify_cmos_timer(void) { }
531/* 532/*
532 * Propagate a new txc->status value into the NTP state: 533 * Propagate a new txc->status value into the NTP state:
533 */ 534 */
534static inline void process_adj_status(struct timex *txc, struct timespec *ts) 535static inline void process_adj_status(struct timex *txc, struct timespec64 *ts)
535{ 536{
536 if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) { 537 if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) {
537 time_state = TIME_OK; 538 time_state = TIME_OK;
@@ -554,7 +555,7 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts)
554 555
555 556
556static inline void process_adjtimex_modes(struct timex *txc, 557static inline void process_adjtimex_modes(struct timex *txc,
557 struct timespec *ts, 558 struct timespec64 *ts,
558 s32 *time_tai) 559 s32 *time_tai)
559{ 560{
560 if (txc->modes & ADJ_STATUS) 561 if (txc->modes & ADJ_STATUS)
@@ -640,7 +641,7 @@ int ntp_validate_timex(struct timex *txc)
640 * adjtimex mainly allows reading (and writing, if superuser) of 641 * adjtimex mainly allows reading (and writing, if superuser) of
641 * kernel time-keeping variables. used by xntpd. 642 * kernel time-keeping variables. used by xntpd.
642 */ 643 */
643int __do_adjtimex(struct timex *txc, struct timespec *ts, s32 *time_tai) 644int __do_adjtimex(struct timex *txc, struct timespec64 *ts, s32 *time_tai)
644{ 645{
645 int result; 646 int result;
646 647
@@ -684,7 +685,7 @@ int __do_adjtimex(struct timex *txc, struct timespec *ts, s32 *time_tai)
684 /* fill PPS status fields */ 685 /* fill PPS status fields */
685 pps_fill_timex(txc); 686 pps_fill_timex(txc);
686 687
687 txc->time.tv_sec = ts->tv_sec; 688 txc->time.tv_sec = (time_t)ts->tv_sec;
688 txc->time.tv_usec = ts->tv_nsec; 689 txc->time.tv_usec = ts->tv_nsec;
689 if (!(time_status & STA_NANO)) 690 if (!(time_status & STA_NANO))
690 txc->time.tv_usec /= NSEC_PER_USEC; 691 txc->time.tv_usec /= NSEC_PER_USEC;
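sync_cmos_clock() now reads the time with getnstimeofday64() into a struct timespec64 and converts to the legacy struct timespec only where the RTC path still wants it, part of the wider y2038 conversion in this merge. A rough sketch of the two layouts and of what the narrowing conversion does on a 32-bit architecture; the _sketch/_legacy names are placeholders, not the kernel definitions:

    #include <stdint.h>
    #include <stdio.h>

    /* timespec64 always carries 64-bit seconds; the legacy struct keeps
     * the platform 'long' (32 bits on 32-bit architectures). */
    struct timespec_legacy   { long    tv_sec; long tv_nsec; };
    struct timespec64_sketch { int64_t tv_sec; long tv_nsec; };

    static struct timespec_legacy
    timespec64_to_timespec_sketch(struct timespec64_sketch ts64)
    {
            struct timespec_legacy ts;

            ts.tv_sec  = (long)ts64.tv_sec;  /* truncates past 2038 on 32-bit */
            ts.tv_nsec = ts64.tv_nsec;
            return ts;
    }

    int main(void)
    {
            struct timespec64_sketch now = { .tv_sec = 2147483648LL, .tv_nsec = 0 };
            struct timespec_legacy legacy = timespec64_to_timespec_sketch(now);

            printf("%lld -> %ld\n", (long long)now.tv_sec, legacy.tv_sec);
            return 0;
    }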
diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h
index 1950cb4ca2a4..bbd102ad9df7 100644
--- a/kernel/time/ntp_internal.h
+++ b/kernel/time/ntp_internal.h
@@ -7,6 +7,6 @@ extern void ntp_clear(void);
7extern u64 ntp_tick_length(void); 7extern u64 ntp_tick_length(void);
8extern int second_overflow(unsigned long secs); 8extern int second_overflow(unsigned long secs);
9extern int ntp_validate_timex(struct timex *); 9extern int ntp_validate_timex(struct timex *);
10extern int __do_adjtimex(struct timex *, struct timespec *, s32 *); 10extern int __do_adjtimex(struct timex *, struct timespec64 *, s32 *);
11extern void __hardpps(const struct timespec *, const struct timespec *); 11extern void __hardpps(const struct timespec *, const struct timespec *);
12#endif /* _LINUX_NTP_INTERNAL_H */ 12#endif /* _LINUX_NTP_INTERNAL_H */
diff --git a/kernel/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 3b8946416a5f..3b8946416a5f 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
diff --git a/kernel/posix-timers.c b/kernel/time/posix-timers.c
index 424c2d4265c9..42b463ad90f2 100644
--- a/kernel/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -49,6 +49,8 @@
49#include <linux/export.h> 49#include <linux/export.h>
50#include <linux/hashtable.h> 50#include <linux/hashtable.h>
51 51
52#include "timekeeping.h"
53
52/* 54/*
53 * Management arrays for POSIX timers. Timers are now kept in static hash table 55 * Management arrays for POSIX timers. Timers are now kept in static hash table
54 * with 512 entries. 56 * with 512 entries.
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 7ab92b19965a..c19c1d84b6f3 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -4,6 +4,8 @@
4#include <linux/hrtimer.h> 4#include <linux/hrtimer.h>
5#include <linux/tick.h> 5#include <linux/tick.h>
6 6
7#include "timekeeping.h"
8
7extern seqlock_t jiffies_lock; 9extern seqlock_t jiffies_lock;
8 10
9#define CS_NAME_LEN 32 11#define CS_NAME_LEN 32
diff --git a/kernel/time.c b/kernel/time/time.c
index 7c7964c33ae7..f0294ba14634 100644
--- a/kernel/time.c
+++ b/kernel/time/time.c
@@ -42,6 +42,7 @@
42#include <asm/unistd.h> 42#include <asm/unistd.h>
43 43
44#include "timeconst.h" 44#include "timeconst.h"
45#include "timekeeping.h"
45 46
46/* 47/*
47 * The timezone where the local system is located. Used as a default by some 48 * The timezone where the local system is located. Used as a default by some
@@ -420,6 +421,68 @@ struct timeval ns_to_timeval(const s64 nsec)
420} 421}
421EXPORT_SYMBOL(ns_to_timeval); 422EXPORT_SYMBOL(ns_to_timeval);
422 423
424#if BITS_PER_LONG == 32
425/**
426 * set_normalized_timespec - set timespec sec and nsec parts and normalize
427 *
428 * @ts: pointer to timespec variable to be set
429 * @sec: seconds to set
430 * @nsec: nanoseconds to set
431 *
432 * Set seconds and nanoseconds field of a timespec variable and
433 * normalize to the timespec storage format
434 *
435 * Note: The tv_nsec part is always in the range of
436 * 0 <= tv_nsec < NSEC_PER_SEC
437 * For negative values only the tv_sec field is negative !
438 */
439void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 nsec)
440{
441 while (nsec >= NSEC_PER_SEC) {
442 /*
443 * The following asm() prevents the compiler from
444 * optimising this loop into a modulo operation. See
445 * also __iter_div_u64_rem() in include/linux/time.h
446 */
447 asm("" : "+rm"(nsec));
448 nsec -= NSEC_PER_SEC;
449 ++sec;
450 }
451 while (nsec < 0) {
452 asm("" : "+rm"(nsec));
453 nsec += NSEC_PER_SEC;
454 --sec;
455 }
456 ts->tv_sec = sec;
457 ts->tv_nsec = nsec;
458}
459EXPORT_SYMBOL(set_normalized_timespec64);
460
461/**
462 * ns_to_timespec64 - Convert nanoseconds to timespec64
463 * @nsec: the nanoseconds value to be converted
464 *
465 * Returns the timespec64 representation of the nsec parameter.
466 */
467struct timespec64 ns_to_timespec64(const s64 nsec)
468{
469 struct timespec64 ts;
470 s32 rem;
471
472 if (!nsec)
473 return (struct timespec64) {0, 0};
474
475 ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem);
476 if (unlikely(rem < 0)) {
477 ts.tv_sec--;
478 rem += NSEC_PER_SEC;
479 }
480 ts.tv_nsec = rem;
481
482 return ts;
483}
484EXPORT_SYMBOL(ns_to_timespec64);
485#endif
423/* 486/*
424 * When we convert to jiffies then we interpret incoming values 487 * When we convert to jiffies then we interpret incoming values
425 * the following way: 488 * the following way:
@@ -694,6 +757,7 @@ unsigned long nsecs_to_jiffies(u64 n)
694{ 757{
695 return (unsigned long)nsecs_to_jiffies64(n); 758 return (unsigned long)nsecs_to_jiffies64(n);
696} 759}
760EXPORT_SYMBOL_GPL(nsecs_to_jiffies);
697 761
698/* 762/*
699 * Add two timespec values and do a safety check for overflow. 763 * Add two timespec values and do a safety check for overflow.
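Worked example for the ns_to_timespec64() helper added above: for nsec = -1, div_s64_rem(-1, NSEC_PER_SEC, &rem) yields tv_sec = 0 with rem = -1; because rem is negative the function adjusts to tv_sec = -1 and rem = 999999999, so tv_nsec always stays within [0, NSEC_PER_SEC) and only tv_sec carries the sign, matching the set_normalized_timespec64() convention documented a few lines earlier. (The numbers are worked by hand for illustration.)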
diff --git a/kernel/timeconst.bc b/kernel/time/timeconst.bc
index 511bdf2cafda..511bdf2cafda 100644
--- a/kernel/timeconst.bc
+++ b/kernel/time/timeconst.bc
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 32d8d6aaedb8..fb4a9c2cf8d9 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -32,11 +32,34 @@
32#define TK_MIRROR (1 << 1) 32#define TK_MIRROR (1 << 1)
33#define TK_CLOCK_WAS_SET (1 << 2) 33#define TK_CLOCK_WAS_SET (1 << 2)
34 34
35static struct timekeeper timekeeper; 35/*
36 * The most important data for readout fits into a single 64 byte
37 * cache line.
38 */
39static struct {
40 seqcount_t seq;
41 struct timekeeper timekeeper;
42} tk_core ____cacheline_aligned;
43
36static DEFINE_RAW_SPINLOCK(timekeeper_lock); 44static DEFINE_RAW_SPINLOCK(timekeeper_lock);
37static seqcount_t timekeeper_seq;
38static struct timekeeper shadow_timekeeper; 45static struct timekeeper shadow_timekeeper;
39 46
47/**
48 * struct tk_fast - NMI safe timekeeper
49 * @seq: Sequence counter for protecting updates. The lowest bit
50 * is the index for the tk_read_base array
51 * @base: tk_read_base array. Access is indexed by the lowest bit of
52 * @seq.
53 *
54 * See @update_fast_timekeeper() below.
55 */
56struct tk_fast {
57 seqcount_t seq;
58 struct tk_read_base base[2];
59};
60
61static struct tk_fast tk_fast_mono ____cacheline_aligned;
62
40/* flag for if timekeeping is suspended */ 63/* flag for if timekeeping is suspended */
41int __read_mostly timekeeping_suspended; 64int __read_mostly timekeeping_suspended;
42 65
@@ -45,49 +68,54 @@ bool __read_mostly persistent_clock_exist = false;
45 68
46static inline void tk_normalize_xtime(struct timekeeper *tk) 69static inline void tk_normalize_xtime(struct timekeeper *tk)
47{ 70{
48 while (tk->xtime_nsec >= ((u64)NSEC_PER_SEC << tk->shift)) { 71 while (tk->tkr.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr.shift)) {
49 tk->xtime_nsec -= (u64)NSEC_PER_SEC << tk->shift; 72 tk->tkr.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr.shift;
50 tk->xtime_sec++; 73 tk->xtime_sec++;
51 } 74 }
52} 75}
53 76
54static void tk_set_xtime(struct timekeeper *tk, const struct timespec *ts) 77static inline struct timespec64 tk_xtime(struct timekeeper *tk)
78{
79 struct timespec64 ts;
80
81 ts.tv_sec = tk->xtime_sec;
82 ts.tv_nsec = (long)(tk->tkr.xtime_nsec >> tk->tkr.shift);
83 return ts;
84}
85
86static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts)
55{ 87{
56 tk->xtime_sec = ts->tv_sec; 88 tk->xtime_sec = ts->tv_sec;
57 tk->xtime_nsec = (u64)ts->tv_nsec << tk->shift; 89 tk->tkr.xtime_nsec = (u64)ts->tv_nsec << tk->tkr.shift;
58} 90}
59 91
60static void tk_xtime_add(struct timekeeper *tk, const struct timespec *ts) 92static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts)
61{ 93{
62 tk->xtime_sec += ts->tv_sec; 94 tk->xtime_sec += ts->tv_sec;
63 tk->xtime_nsec += (u64)ts->tv_nsec << tk->shift; 95 tk->tkr.xtime_nsec += (u64)ts->tv_nsec << tk->tkr.shift;
64 tk_normalize_xtime(tk); 96 tk_normalize_xtime(tk);
65} 97}
66 98
67static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec wtm) 99static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm)
68{ 100{
69 struct timespec tmp; 101 struct timespec64 tmp;
70 102
71 /* 103 /*
72 * Verify consistency of: offset_real = -wall_to_monotonic 104 * Verify consistency of: offset_real = -wall_to_monotonic
73 * before modifying anything 105 * before modifying anything
74 */ 106 */
75 set_normalized_timespec(&tmp, -tk->wall_to_monotonic.tv_sec, 107 set_normalized_timespec64(&tmp, -tk->wall_to_monotonic.tv_sec,
76 -tk->wall_to_monotonic.tv_nsec); 108 -tk->wall_to_monotonic.tv_nsec);
77 WARN_ON_ONCE(tk->offs_real.tv64 != timespec_to_ktime(tmp).tv64); 109 WARN_ON_ONCE(tk->offs_real.tv64 != timespec64_to_ktime(tmp).tv64);
78 tk->wall_to_monotonic = wtm; 110 tk->wall_to_monotonic = wtm;
79 set_normalized_timespec(&tmp, -wtm.tv_sec, -wtm.tv_nsec); 111 set_normalized_timespec64(&tmp, -wtm.tv_sec, -wtm.tv_nsec);
80 tk->offs_real = timespec_to_ktime(tmp); 112 tk->offs_real = timespec64_to_ktime(tmp);
81 tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tk->tai_offset, 0)); 113 tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tk->tai_offset, 0));
82} 114}
83 115
84static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t) 116static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
85{ 117{
86 /* Verify consistency before modifying */ 118 tk->offs_boot = ktime_add(tk->offs_boot, delta);
87 WARN_ON_ONCE(tk->offs_boot.tv64 != timespec_to_ktime(tk->total_sleep_time).tv64);
88
89 tk->total_sleep_time = t;
90 tk->offs_boot = timespec_to_ktime(t);
91} 119}
92 120
93/** 121/**
@@ -107,9 +135,11 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
107 u64 tmp, ntpinterval; 135 u64 tmp, ntpinterval;
108 struct clocksource *old_clock; 136 struct clocksource *old_clock;
109 137
110 old_clock = tk->clock; 138 old_clock = tk->tkr.clock;
111 tk->clock = clock; 139 tk->tkr.clock = clock;
112 tk->cycle_last = clock->cycle_last = clock->read(clock); 140 tk->tkr.read = clock->read;
141 tk->tkr.mask = clock->mask;
142 tk->tkr.cycle_last = tk->tkr.read(clock);
113 143
114 /* Do the ns -> cycle conversion first, using original mult */ 144 /* Do the ns -> cycle conversion first, using original mult */
115 tmp = NTP_INTERVAL_LENGTH; 145 tmp = NTP_INTERVAL_LENGTH;
@@ -133,78 +163,213 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
133 if (old_clock) { 163 if (old_clock) {
134 int shift_change = clock->shift - old_clock->shift; 164 int shift_change = clock->shift - old_clock->shift;
135 if (shift_change < 0) 165 if (shift_change < 0)
136 tk->xtime_nsec >>= -shift_change; 166 tk->tkr.xtime_nsec >>= -shift_change;
137 else 167 else
138 tk->xtime_nsec <<= shift_change; 168 tk->tkr.xtime_nsec <<= shift_change;
139 } 169 }
140 tk->shift = clock->shift; 170 tk->tkr.shift = clock->shift;
141 171
142 tk->ntp_error = 0; 172 tk->ntp_error = 0;
143 tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift; 173 tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift;
174 tk->ntp_tick = ntpinterval << tk->ntp_error_shift;
144 175
145 /* 176 /*
146 * The timekeeper keeps its own mult values for the currently 177 * The timekeeper keeps its own mult values for the currently
147 * active clocksource. These values will be adjusted via NTP 178 * active clocksource. These values will be adjusted via NTP

148 * to counteract clock drifting. 179 * to counteract clock drifting.
149 */ 180 */
150 tk->mult = clock->mult; 181 tk->tkr.mult = clock->mult;
182 tk->ntp_err_mult = 0;
151} 183}
152 184
153/* Timekeeper helper functions. */ 185/* Timekeeper helper functions. */
154 186
155#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET 187#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
156u32 (*arch_gettimeoffset)(void); 188static u32 default_arch_gettimeoffset(void) { return 0; }
157 189u32 (*arch_gettimeoffset)(void) = default_arch_gettimeoffset;
158u32 get_arch_timeoffset(void)
159{
160 if (likely(arch_gettimeoffset))
161 return arch_gettimeoffset();
162 return 0;
163}
164#else 190#else
165static inline u32 get_arch_timeoffset(void) { return 0; } 191static inline u32 arch_gettimeoffset(void) { return 0; }
166#endif 192#endif
167 193
168static inline s64 timekeeping_get_ns(struct timekeeper *tk) 194static inline s64 timekeeping_get_ns(struct tk_read_base *tkr)
169{ 195{
170 cycle_t cycle_now, cycle_delta; 196 cycle_t cycle_now, delta;
171 struct clocksource *clock;
172 s64 nsec; 197 s64 nsec;
173 198
174 /* read clocksource: */ 199 /* read clocksource: */
175 clock = tk->clock; 200 cycle_now = tkr->read(tkr->clock);
176 cycle_now = clock->read(clock);
177 201
178 /* calculate the delta since the last update_wall_time: */ 202 /* calculate the delta since the last update_wall_time: */
179 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; 203 delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask);
180 204
181 nsec = cycle_delta * tk->mult + tk->xtime_nsec; 205 nsec = delta * tkr->mult + tkr->xtime_nsec;
182 nsec >>= tk->shift; 206 nsec >>= tkr->shift;
183 207
184 /* If arch requires, add in get_arch_timeoffset() */ 208 /* If arch requires, add in get_arch_timeoffset() */
185 return nsec + get_arch_timeoffset(); 209 return nsec + arch_gettimeoffset();
186} 210}
187 211
188static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) 212static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
189{ 213{
190 cycle_t cycle_now, cycle_delta; 214 struct clocksource *clock = tk->tkr.clock;
191 struct clocksource *clock; 215 cycle_t cycle_now, delta;
192 s64 nsec; 216 s64 nsec;
193 217
194 /* read clocksource: */ 218 /* read clocksource: */
195 clock = tk->clock; 219 cycle_now = tk->tkr.read(clock);
196 cycle_now = clock->read(clock);
197 220
198 /* calculate the delta since the last update_wall_time: */ 221 /* calculate the delta since the last update_wall_time: */
199 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; 222 delta = clocksource_delta(cycle_now, tk->tkr.cycle_last, tk->tkr.mask);
200 223
201 /* convert delta to nanoseconds. */ 224 /* convert delta to nanoseconds. */
202 nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); 225 nsec = clocksource_cyc2ns(delta, clock->mult, clock->shift);
203 226
204 /* If arch requires, add in get_arch_timeoffset() */ 227 /* If arch requires, add in get_arch_timeoffset() */
205 return nsec + get_arch_timeoffset(); 228 return nsec + arch_gettimeoffset();
229}
230
231/**
232 * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper.
233 * @tk: The timekeeper from which we take the update
234 * @tkf: The fast timekeeper to update
235 * @tbase: The time base for the fast timekeeper (mono/raw)
236 *
237 * We want to use this from any context including NMI and tracing /
238 * instrumenting the timekeeping code itself.
239 *
240 * So we handle this differently than the other timekeeping accessor
241 * functions which retry when the sequence count has changed. The
242 * update side does:
243 *
244 * smp_wmb(); <- Ensure that the last base[1] update is visible
245 * tkf->seq++;
246 * smp_wmb(); <- Ensure that the seqcount update is visible
247 * update(tkf->base[0], tk);
248 * smp_wmb(); <- Ensure that the base[0] update is visible
249 * tkf->seq++;
250 * smp_wmb(); <- Ensure that the seqcount update is visible
251 * update(tkf->base[1], tk);
252 *
253 * The reader side does:
254 *
255 * do {
256 * seq = tkf->seq;
257 * smp_rmb();
258 * idx = seq & 0x01;
259 * now = now(tkf->base[idx]);
260 * smp_rmb();
261 * } while (seq != tkf->seq)
262 *
263 * As long as we update base[0] readers are forced off to
264 * base[1]. Once base[0] is updated readers are redirected to base[0]
265 * and the base[1] update takes place.
266 *
 267 * So if an NMI hits the update of base[0] then it will use base[1]
 268 * which is still consistent. In the worst case this can result in a
269 * slightly wrong timestamp (a few nanoseconds). See
270 * @ktime_get_mono_fast_ns.
271 */
272static void update_fast_timekeeper(struct timekeeper *tk)
273{
274 struct tk_read_base *base = tk_fast_mono.base;
275
276 /* Force readers off to base[1] */
277 raw_write_seqcount_latch(&tk_fast_mono.seq);
278
279 /* Update base[0] */
280 memcpy(base, &tk->tkr, sizeof(*base));
281
282 /* Force readers back to base[0] */
283 raw_write_seqcount_latch(&tk_fast_mono.seq);
284
285 /* Update base[1] */
286 memcpy(base + 1, base, sizeof(*base));
206} 287}
207 288
289/**
290 * ktime_get_mono_fast_ns - Fast NMI safe access to clock monotonic
291 *
292 * This timestamp is not guaranteed to be monotonic across an update.
293 * The timestamp is calculated by:
294 *
295 * now = base_mono + clock_delta * slope
296 *
297 * So if the update lowers the slope, readers who are forced to the
298 * not yet updated second array are still using the old steeper slope.
299 *
300 * tmono
301 * ^
302 * | o n
303 * | o n
304 * | u
305 * | o
306 * |o
307 * |12345678---> reader order
308 *
309 * o = old slope
310 * u = update
311 * n = new slope
312 *
313 * So reader 6 will observe time going backwards versus reader 5.
314 *
 315 * While other CPUs are likely to be able to observe that, the only way
316 * for a CPU local observation is when an NMI hits in the middle of
317 * the update. Timestamps taken from that NMI context might be ahead
318 * of the following timestamps. Callers need to be aware of that and
319 * deal with it.
320 */
321u64 notrace ktime_get_mono_fast_ns(void)
322{
323 struct tk_read_base *tkr;
324 unsigned int seq;
325 u64 now;
326
327 do {
328 seq = raw_read_seqcount(&tk_fast_mono.seq);
329 tkr = tk_fast_mono.base + (seq & 0x01);
330 now = ktime_to_ns(tkr->base_mono) + timekeeping_get_ns(tkr);
331
332 } while (read_seqcount_retry(&tk_fast_mono.seq, seq));
333 return now;
334}
335EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns);
336
337#ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD
338
339static inline void update_vsyscall(struct timekeeper *tk)
340{
341 struct timespec xt, wm;
342
343 xt = timespec64_to_timespec(tk_xtime(tk));
344 wm = timespec64_to_timespec(tk->wall_to_monotonic);
345 update_vsyscall_old(&xt, &wm, tk->tkr.clock, tk->tkr.mult,
346 tk->tkr.cycle_last);
347}
348
349static inline void old_vsyscall_fixup(struct timekeeper *tk)
350{
351 s64 remainder;
352
353 /*
354 * Store only full nanoseconds into xtime_nsec after rounding
355 * it up and add the remainder to the error difference.
 356 * XXX - This is necessary to avoid small 1ns inconsistencies caused
357 * by truncating the remainder in vsyscalls. However, it causes
358 * additional work to be done in timekeeping_adjust(). Once
359 * the vsyscall implementations are converted to use xtime_nsec
360 * (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD
361 * users are removed, this can be killed.
362 */
363 remainder = tk->tkr.xtime_nsec & ((1ULL << tk->tkr.shift) - 1);
364 tk->tkr.xtime_nsec -= remainder;
365 tk->tkr.xtime_nsec += 1ULL << tk->tkr.shift;
366 tk->ntp_error += remainder << tk->ntp_error_shift;
367 tk->ntp_error -= (1ULL << tk->tkr.shift) << tk->ntp_error_shift;
368}
369#else
370#define old_vsyscall_fixup(tk)
371#endif
372
208static RAW_NOTIFIER_HEAD(pvclock_gtod_chain); 373static RAW_NOTIFIER_HEAD(pvclock_gtod_chain);
209 374
210static void update_pvclock_gtod(struct timekeeper *tk, bool was_set) 375static void update_pvclock_gtod(struct timekeeper *tk, bool was_set)
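The comment block above describes the seqcount-latch protocol behind update_fast_timekeeper() and ktime_get_mono_fast_ns(): the writer bumps the sequence count around each half of a two-entry base array, and readers index the array with the low bit of the count, so an NMI that interrupts the update still finds a consistent (if slightly stale) entry. A compressed userspace sketch of that protocol follows; it uses C11 atomics and deliberately glosses over the exact barrier placement and data-race handling that raw_write_seqcount_latch() provides in the kernel:

    #include <stdatomic.h>
    #include <stdio.h>

    struct base_sketch { long mult; long offset; };

    static _Atomic unsigned int latch_seq;
    static struct base_sketch bases[2];

    /* Writer: bump seq (readers move to the other half), update this half,
     * bump seq again, update the second half. */
    static void writer_update(struct base_sketch next)
    {
            atomic_fetch_add(&latch_seq, 1);   /* readers now use bases[1] */
            bases[0] = next;
            atomic_fetch_add(&latch_seq, 1);   /* readers now use bases[0] */
            bases[1] = next;
    }

    /* Reader: pick the half selected by the low bit of seq; retry if the
     * writer moved on underneath us. */
    static struct base_sketch reader_snapshot(void)
    {
            struct base_sketch b;
            unsigned int seq;

            do {
                    seq = atomic_load(&latch_seq);
                    b = bases[seq & 1];
            } while (seq != atomic_load(&latch_seq));
            return b;
    }

    int main(void)
    {
            writer_update((struct base_sketch){ .mult = 3, .offset = 42 });
            printf("%ld %ld\n", reader_snapshot().mult, reader_snapshot().offset);
            return 0;
    }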
@@ -217,7 +382,7 @@ static void update_pvclock_gtod(struct timekeeper *tk, bool was_set)
217 */ 382 */
218int pvclock_gtod_register_notifier(struct notifier_block *nb) 383int pvclock_gtod_register_notifier(struct notifier_block *nb)
219{ 384{
220 struct timekeeper *tk = &timekeeper; 385 struct timekeeper *tk = &tk_core.timekeeper;
221 unsigned long flags; 386 unsigned long flags;
222 int ret; 387 int ret;
223 388
@@ -247,6 +412,29 @@ int pvclock_gtod_unregister_notifier(struct notifier_block *nb)
247} 412}
248EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); 413EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier);
249 414
415/*
416 * Update the ktime_t based scalar nsec members of the timekeeper
417 */
418static inline void tk_update_ktime_data(struct timekeeper *tk)
419{
420 s64 nsec;
421
422 /*
423 * The xtime based monotonic readout is:
424 * nsec = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec + now();
425 * The ktime based monotonic readout is:
426 * nsec = base_mono + now();
427 * ==> base_mono = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec
428 */
429 nsec = (s64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec);
430 nsec *= NSEC_PER_SEC;
431 nsec += tk->wall_to_monotonic.tv_nsec;
432 tk->tkr.base_mono = ns_to_ktime(nsec);
433
434 /* Update the monotonic raw base */
435 tk->base_raw = timespec64_to_ktime(tk->raw_time);
436}
437
250/* must hold timekeeper_lock */ 438/* must hold timekeeper_lock */
251static void timekeeping_update(struct timekeeper *tk, unsigned int action) 439static void timekeeping_update(struct timekeeper *tk, unsigned int action)
252{ 440{
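A quick worked check of the identity in tk_update_ktime_data() above: with xtime_sec = 1000 and wall_to_monotonic = { -300, 0 }, base_mono becomes (1000 - 300) * NSEC_PER_SEC = 700 * 10^9 ns, so a monotonic readout is simply base_mono plus the current clocksource delta instead of re-adding wall_to_monotonic on every call; this is what lets ktime_get() further down collapse to ktime_add_ns(tk->tkr.base_mono, nsecs). (The figures are illustrative, not taken from a running system.)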
@@ -257,8 +445,13 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
257 update_vsyscall(tk); 445 update_vsyscall(tk);
258 update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); 446 update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET);
259 447
448 tk_update_ktime_data(tk);
449
260 if (action & TK_MIRROR) 450 if (action & TK_MIRROR)
261 memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper)); 451 memcpy(&shadow_timekeeper, &tk_core.timekeeper,
452 sizeof(tk_core.timekeeper));
453
454 update_fast_timekeeper(tk);
262} 455}
263 456
264/** 457/**
@@ -270,49 +463,48 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
270 */ 463 */
271static void timekeeping_forward_now(struct timekeeper *tk) 464static void timekeeping_forward_now(struct timekeeper *tk)
272{ 465{
273 cycle_t cycle_now, cycle_delta; 466 struct clocksource *clock = tk->tkr.clock;
274 struct clocksource *clock; 467 cycle_t cycle_now, delta;
275 s64 nsec; 468 s64 nsec;
276 469
277 clock = tk->clock; 470 cycle_now = tk->tkr.read(clock);
278 cycle_now = clock->read(clock); 471 delta = clocksource_delta(cycle_now, tk->tkr.cycle_last, tk->tkr.mask);
279 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; 472 tk->tkr.cycle_last = cycle_now;
280 tk->cycle_last = clock->cycle_last = cycle_now;
281 473
282 tk->xtime_nsec += cycle_delta * tk->mult; 474 tk->tkr.xtime_nsec += delta * tk->tkr.mult;
283 475
284 /* If arch requires, add in get_arch_timeoffset() */ 476 /* If arch requires, add in get_arch_timeoffset() */
285 tk->xtime_nsec += (u64)get_arch_timeoffset() << tk->shift; 477 tk->tkr.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr.shift;
286 478
287 tk_normalize_xtime(tk); 479 tk_normalize_xtime(tk);
288 480
289 nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); 481 nsec = clocksource_cyc2ns(delta, clock->mult, clock->shift);
290 timespec_add_ns(&tk->raw_time, nsec); 482 timespec64_add_ns(&tk->raw_time, nsec);
291} 483}
292 484
293/** 485/**
294 * __getnstimeofday - Returns the time of day in a timespec. 486 * __getnstimeofday64 - Returns the time of day in a timespec64.
295 * @ts: pointer to the timespec to be set 487 * @ts: pointer to the timespec to be set
296 * 488 *
297 * Updates the time of day in the timespec. 489 * Updates the time of day in the timespec.
298 * Returns 0 on success, or -ve when suspended (timespec will be undefined). 490 * Returns 0 on success, or -ve when suspended (timespec will be undefined).
299 */ 491 */
300int __getnstimeofday(struct timespec *ts) 492int __getnstimeofday64(struct timespec64 *ts)
301{ 493{
302 struct timekeeper *tk = &timekeeper; 494 struct timekeeper *tk = &tk_core.timekeeper;
303 unsigned long seq; 495 unsigned long seq;
304 s64 nsecs = 0; 496 s64 nsecs = 0;
305 497
306 do { 498 do {
307 seq = read_seqcount_begin(&timekeeper_seq); 499 seq = read_seqcount_begin(&tk_core.seq);
308 500
309 ts->tv_sec = tk->xtime_sec; 501 ts->tv_sec = tk->xtime_sec;
310 nsecs = timekeeping_get_ns(tk); 502 nsecs = timekeeping_get_ns(&tk->tkr);
311 503
312 } while (read_seqcount_retry(&timekeeper_seq, seq)); 504 } while (read_seqcount_retry(&tk_core.seq, seq));
313 505
314 ts->tv_nsec = 0; 506 ts->tv_nsec = 0;
315 timespec_add_ns(ts, nsecs); 507 timespec64_add_ns(ts, nsecs);
316 508
317 /* 509 /*
318 * Do not bail out early, in case there were callers still using 510 * Do not bail out early, in case there were callers still using
@@ -322,116 +514,138 @@ int __getnstimeofday(struct timespec *ts)
322 return -EAGAIN; 514 return -EAGAIN;
323 return 0; 515 return 0;
324} 516}
325EXPORT_SYMBOL(__getnstimeofday); 517EXPORT_SYMBOL(__getnstimeofday64);
326 518
327/** 519/**
328 * getnstimeofday - Returns the time of day in a timespec. 520 * getnstimeofday64 - Returns the time of day in a timespec64.
329 * @ts: pointer to the timespec to be set 521 * @ts: pointer to the timespec to be set
330 * 522 *
331 * Returns the time of day in a timespec (WARN if suspended). 523 * Returns the time of day in a timespec (WARN if suspended).
332 */ 524 */
333void getnstimeofday(struct timespec *ts) 525void getnstimeofday64(struct timespec64 *ts)
334{ 526{
335 WARN_ON(__getnstimeofday(ts)); 527 WARN_ON(__getnstimeofday64(ts));
336} 528}
337EXPORT_SYMBOL(getnstimeofday); 529EXPORT_SYMBOL(getnstimeofday64);
338 530
339ktime_t ktime_get(void) 531ktime_t ktime_get(void)
340{ 532{
341 struct timekeeper *tk = &timekeeper; 533 struct timekeeper *tk = &tk_core.timekeeper;
342 unsigned int seq; 534 unsigned int seq;
343 s64 secs, nsecs; 535 ktime_t base;
536 s64 nsecs;
344 537
345 WARN_ON(timekeeping_suspended); 538 WARN_ON(timekeeping_suspended);
346 539
347 do { 540 do {
348 seq = read_seqcount_begin(&timekeeper_seq); 541 seq = read_seqcount_begin(&tk_core.seq);
349 secs = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; 542 base = tk->tkr.base_mono;
350 nsecs = timekeeping_get_ns(tk) + tk->wall_to_monotonic.tv_nsec; 543 nsecs = timekeeping_get_ns(&tk->tkr);
351 544
352 } while (read_seqcount_retry(&timekeeper_seq, seq)); 545 } while (read_seqcount_retry(&tk_core.seq, seq));
353 /* 546
354 * Use ktime_set/ktime_add_ns to create a proper ktime on 547 return ktime_add_ns(base, nsecs);
355 * 32-bit architectures without CONFIG_KTIME_SCALAR.
356 */
357 return ktime_add_ns(ktime_set(secs, 0), nsecs);
358} 548}
359EXPORT_SYMBOL_GPL(ktime_get); 549EXPORT_SYMBOL_GPL(ktime_get);
360 550
361/** 551static ktime_t *offsets[TK_OFFS_MAX] = {
362 * ktime_get_ts - get the monotonic clock in timespec format 552 [TK_OFFS_REAL] = &tk_core.timekeeper.offs_real,
363 * @ts: pointer to timespec variable 553 [TK_OFFS_BOOT] = &tk_core.timekeeper.offs_boot,
364 * 554 [TK_OFFS_TAI] = &tk_core.timekeeper.offs_tai,
365 * The function calculates the monotonic clock from the realtime 555};
366 * clock and the wall_to_monotonic offset and stores the result 556
367 * in normalized timespec format in the variable pointed to by @ts. 557ktime_t ktime_get_with_offset(enum tk_offsets offs)
368 */
369void ktime_get_ts(struct timespec *ts)
370{ 558{
371 struct timekeeper *tk = &timekeeper; 559 struct timekeeper *tk = &tk_core.timekeeper;
372 struct timespec tomono;
373 s64 nsec;
374 unsigned int seq; 560 unsigned int seq;
561 ktime_t base, *offset = offsets[offs];
562 s64 nsecs;
375 563
376 WARN_ON(timekeeping_suspended); 564 WARN_ON(timekeeping_suspended);
377 565
378 do { 566 do {
379 seq = read_seqcount_begin(&timekeeper_seq); 567 seq = read_seqcount_begin(&tk_core.seq);
380 ts->tv_sec = tk->xtime_sec; 568 base = ktime_add(tk->tkr.base_mono, *offset);
381 nsec = timekeeping_get_ns(tk); 569 nsecs = timekeeping_get_ns(&tk->tkr);
382 tomono = tk->wall_to_monotonic;
383 570
384 } while (read_seqcount_retry(&timekeeper_seq, seq)); 571 } while (read_seqcount_retry(&tk_core.seq, seq));
385 572
386 ts->tv_sec += tomono.tv_sec; 573 return ktime_add_ns(base, nsecs);
387 ts->tv_nsec = 0;
388 timespec_add_ns(ts, nsec + tomono.tv_nsec);
389}
390EXPORT_SYMBOL_GPL(ktime_get_ts);
391 574
575}
576EXPORT_SYMBOL_GPL(ktime_get_with_offset);
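
ktime_get_with_offset() makes the other clock flavours one-liners: each is the monotonic base plus the matching entry in the offsets[] array above. A sketch of how such wrappers could look (the my_ prefix marks these as illustrations, not the interface this series actually exports):

static inline ktime_t my_ktime_get_real(void)
{
        return ktime_get_with_offset(TK_OFFS_REAL);     /* wall-clock time */
}

static inline ktime_t my_ktime_get_boottime(void)
{
        return ktime_get_with_offset(TK_OFFS_BOOT);     /* monotonic incl. suspend */
}

static inline ktime_t my_ktime_get_clocktai(void)
{
        return ktime_get_with_offset(TK_OFFS_TAI);      /* TAI */
}
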
392 577
393/** 578/**
394 * timekeeping_clocktai - Returns the TAI time of day in a timespec 579 * ktime_mono_to_any() - convert monotonic time to any other time
395 * @ts: pointer to the timespec to be set 580 * @tmono: time to convert.
396 * 581 * @offs: which offset to use
397 * Returns the time of day in a timespec.
398 */ 582 */
399void timekeeping_clocktai(struct timespec *ts) 583ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs)
400{ 584{
401 struct timekeeper *tk = &timekeeper; 585 ktime_t *offset = offsets[offs];
402 unsigned long seq; 586 unsigned long seq;
403 u64 nsecs; 587 ktime_t tconv;
404
405 WARN_ON(timekeeping_suspended);
406 588
407 do { 589 do {
408 seq = read_seqcount_begin(&timekeeper_seq); 590 seq = read_seqcount_begin(&tk_core.seq);
591 tconv = ktime_add(tmono, *offset);
592 } while (read_seqcount_retry(&tk_core.seq, seq));
409 593
410 ts->tv_sec = tk->xtime_sec + tk->tai_offset; 594 return tconv;
411 nsecs = timekeeping_get_ns(tk); 595}
596EXPORT_SYMBOL_GPL(ktime_mono_to_any);
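
ktime_mono_to_any() is the conversion helper for timestamps taken on the monotonic clock that need to be reported on another clock; the offset is sampled under the seqcount, so the conversion is consistent. An illustrative helper, not part of this patch:

/* Convert a stored CLOCK_MONOTONIC stamp to CLOCK_REALTIME for logging. */
static ktime_t event_stamp_to_realtime(ktime_t mono_stamp)
{
        return ktime_mono_to_any(mono_stamp, TK_OFFS_REAL);
}
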
412 597
413 } while (read_seqcount_retry(&timekeeper_seq, seq)); 598/**
599 * ktime_get_raw - Returns the raw monotonic time in ktime_t format
600 */
601ktime_t ktime_get_raw(void)
602{
603 struct timekeeper *tk = &tk_core.timekeeper;
604 unsigned int seq;
605 ktime_t base;
606 s64 nsecs;
414 607
415 ts->tv_nsec = 0; 608 do {
416 timespec_add_ns(ts, nsecs); 609 seq = read_seqcount_begin(&tk_core.seq);
610 base = tk->base_raw;
611 nsecs = timekeeping_get_ns_raw(tk);
417 612
418} 613 } while (read_seqcount_retry(&tk_core.seq, seq));
419EXPORT_SYMBOL(timekeeping_clocktai);
420 614
615 return ktime_add_ns(base, nsecs);
616}
617EXPORT_SYMBOL_GPL(ktime_get_raw);
421 618
422/** 619/**
423 * ktime_get_clocktai - Returns the TAI time of day in a ktime 620 * ktime_get_ts64 - get the monotonic clock in timespec64 format
621 * @ts: pointer to timespec variable
424 * 622 *
425 * Returns the time of day in a ktime. 623 * The function calculates the monotonic clock from the realtime
624 * clock and the wall_to_monotonic offset and stores the result
625 * in normalized timespec format in the variable pointed to by @ts.
426 */ 626 */
427ktime_t ktime_get_clocktai(void) 627void ktime_get_ts64(struct timespec64 *ts)
428{ 628{
429 struct timespec ts; 629 struct timekeeper *tk = &tk_core.timekeeper;
630 struct timespec64 tomono;
631 s64 nsec;
632 unsigned int seq;
633
634 WARN_ON(timekeeping_suspended);
430 635
431 timekeeping_clocktai(&ts); 636 do {
432 return timespec_to_ktime(ts); 637 seq = read_seqcount_begin(&tk_core.seq);
638 ts->tv_sec = tk->xtime_sec;
639 nsec = timekeeping_get_ns(&tk->tkr);
640 tomono = tk->wall_to_monotonic;
641
642 } while (read_seqcount_retry(&tk_core.seq, seq));
643
644 ts->tv_sec += tomono.tv_sec;
645 ts->tv_nsec = 0;
646 timespec64_add_ns(ts, nsec + tomono.tv_nsec);
433} 647}
434EXPORT_SYMBOL(ktime_get_clocktai); 648EXPORT_SYMBOL_GPL(ktime_get_ts64);
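
ktime_get_ts64() is the timespec64 counterpart of ktime_get(); timespec64 keeps a 64-bit tv_sec so 32-bit kernels survive 2038, and the old timespec interfaces become conversion shims at the boundary. A sketch of such a shim, assuming the timespec64_to_timespec() helper used elsewhere in this diff:

/* Legacy-style wrapper kept at the boundary (illustrative only). */
static void my_get_monotonic_legacy(struct timespec *ts)
{
        struct timespec64 ts64;

        ktime_get_ts64(&ts64);
        *ts = timespec64_to_timespec(ts64);     /* tv_sec may truncate on 32-bit */
}
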
435 649
436#ifdef CONFIG_NTP_PPS 650#ifdef CONFIG_NTP_PPS
437 651
@@ -446,23 +660,23 @@ EXPORT_SYMBOL(ktime_get_clocktai);
446 */ 660 */
447void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) 661void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
448{ 662{
449 struct timekeeper *tk = &timekeeper; 663 struct timekeeper *tk = &tk_core.timekeeper;
450 unsigned long seq; 664 unsigned long seq;
451 s64 nsecs_raw, nsecs_real; 665 s64 nsecs_raw, nsecs_real;
452 666
453 WARN_ON_ONCE(timekeeping_suspended); 667 WARN_ON_ONCE(timekeeping_suspended);
454 668
455 do { 669 do {
456 seq = read_seqcount_begin(&timekeeper_seq); 670 seq = read_seqcount_begin(&tk_core.seq);
457 671
458 *ts_raw = tk->raw_time; 672 *ts_raw = timespec64_to_timespec(tk->raw_time);
459 ts_real->tv_sec = tk->xtime_sec; 673 ts_real->tv_sec = tk->xtime_sec;
460 ts_real->tv_nsec = 0; 674 ts_real->tv_nsec = 0;
461 675
462 nsecs_raw = timekeeping_get_ns_raw(tk); 676 nsecs_raw = timekeeping_get_ns_raw(tk);
463 nsecs_real = timekeeping_get_ns(tk); 677 nsecs_real = timekeeping_get_ns(&tk->tkr);
464 678
465 } while (read_seqcount_retry(&timekeeper_seq, seq)); 679 } while (read_seqcount_retry(&tk_core.seq, seq));
466 680
467 timespec_add_ns(ts_raw, nsecs_raw); 681 timespec_add_ns(ts_raw, nsecs_raw);
468 timespec_add_ns(ts_real, nsecs_real); 682 timespec_add_ns(ts_real, nsecs_real);
@@ -479,9 +693,9 @@ EXPORT_SYMBOL(getnstime_raw_and_real);
479 */ 693 */
480void do_gettimeofday(struct timeval *tv) 694void do_gettimeofday(struct timeval *tv)
481{ 695{
482 struct timespec now; 696 struct timespec64 now;
483 697
484 getnstimeofday(&now); 698 getnstimeofday64(&now);
485 tv->tv_sec = now.tv_sec; 699 tv->tv_sec = now.tv_sec;
486 tv->tv_usec = now.tv_nsec/1000; 700 tv->tv_usec = now.tv_nsec/1000;
487} 701}
@@ -495,15 +709,15 @@ EXPORT_SYMBOL(do_gettimeofday);
495 */ 709 */
496int do_settimeofday(const struct timespec *tv) 710int do_settimeofday(const struct timespec *tv)
497{ 711{
498 struct timekeeper *tk = &timekeeper; 712 struct timekeeper *tk = &tk_core.timekeeper;
499 struct timespec ts_delta, xt; 713 struct timespec64 ts_delta, xt, tmp;
500 unsigned long flags; 714 unsigned long flags;
501 715
502 if (!timespec_valid_strict(tv)) 716 if (!timespec_valid_strict(tv))
503 return -EINVAL; 717 return -EINVAL;
504 718
505 raw_spin_lock_irqsave(&timekeeper_lock, flags); 719 raw_spin_lock_irqsave(&timekeeper_lock, flags);
506 write_seqcount_begin(&timekeeper_seq); 720 write_seqcount_begin(&tk_core.seq);
507 721
508 timekeeping_forward_now(tk); 722 timekeeping_forward_now(tk);
509 723
@@ -511,13 +725,14 @@ int do_settimeofday(const struct timespec *tv)
511 ts_delta.tv_sec = tv->tv_sec - xt.tv_sec; 725 ts_delta.tv_sec = tv->tv_sec - xt.tv_sec;
512 ts_delta.tv_nsec = tv->tv_nsec - xt.tv_nsec; 726 ts_delta.tv_nsec = tv->tv_nsec - xt.tv_nsec;
513 727
514 tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, ts_delta)); 728 tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts_delta));
515 729
516 tk_set_xtime(tk, tv); 730 tmp = timespec_to_timespec64(*tv);
731 tk_set_xtime(tk, &tmp);
517 732
518 timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); 733 timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
519 734
520 write_seqcount_end(&timekeeper_seq); 735 write_seqcount_end(&tk_core.seq);
521 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 736 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
522 737
523 /* signal hrtimers about time change */ 738 /* signal hrtimers about time change */
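
Every setter in this file follows the write-side discipline seen here: take timekeeper_lock with interrupts disabled, bump the seqcount, update, then release in reverse order so concurrent readers see either the old or the new state, never a mix. A minimal sketch with illustrative names:

static DEFINE_RAW_SPINLOCK(my_lock);
static seqcount_t my_seq = SEQCNT_ZERO(my_seq);
static u64 my_data;

static void update_my_data(u64 new_val)
{
        unsigned long flags;

        raw_spin_lock_irqsave(&my_lock, flags); /* serialize writers, block IRQs */
        write_seqcount_begin(&my_seq);          /* readers retry from here */

        my_data = new_val;

        write_seqcount_end(&my_seq);
        raw_spin_unlock_irqrestore(&my_lock, flags);
}
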
@@ -535,33 +750,35 @@ EXPORT_SYMBOL(do_settimeofday);
535 */ 750 */
536int timekeeping_inject_offset(struct timespec *ts) 751int timekeeping_inject_offset(struct timespec *ts)
537{ 752{
538 struct timekeeper *tk = &timekeeper; 753 struct timekeeper *tk = &tk_core.timekeeper;
539 unsigned long flags; 754 unsigned long flags;
540 struct timespec tmp; 755 struct timespec64 ts64, tmp;
541 int ret = 0; 756 int ret = 0;
542 757
543 if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) 758 if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC)
544 return -EINVAL; 759 return -EINVAL;
545 760
761 ts64 = timespec_to_timespec64(*ts);
762
546 raw_spin_lock_irqsave(&timekeeper_lock, flags); 763 raw_spin_lock_irqsave(&timekeeper_lock, flags);
547 write_seqcount_begin(&timekeeper_seq); 764 write_seqcount_begin(&tk_core.seq);
548 765
549 timekeeping_forward_now(tk); 766 timekeeping_forward_now(tk);
550 767
551 /* Make sure the proposed value is valid */ 768 /* Make sure the proposed value is valid */
552 tmp = timespec_add(tk_xtime(tk), *ts); 769 tmp = timespec64_add(tk_xtime(tk), ts64);
553 if (!timespec_valid_strict(&tmp)) { 770 if (!timespec64_valid_strict(&tmp)) {
554 ret = -EINVAL; 771 ret = -EINVAL;
555 goto error; 772 goto error;
556 } 773 }
557 774
558 tk_xtime_add(tk, ts); 775 tk_xtime_add(tk, &ts64);
559 tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts)); 776 tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts64));
560 777
561error: /* even if we error out, we forwarded the time, so call update */ 778error: /* even if we error out, we forwarded the time, so call update */
562 timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); 779 timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
563 780
564 write_seqcount_end(&timekeeper_seq); 781 write_seqcount_end(&tk_core.seq);
565 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 782 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
566 783
567 /* signal hrtimers about time change */ 784 /* signal hrtimers about time change */
@@ -578,14 +795,14 @@ EXPORT_SYMBOL(timekeeping_inject_offset);
578 */ 795 */
579s32 timekeeping_get_tai_offset(void) 796s32 timekeeping_get_tai_offset(void)
580{ 797{
581 struct timekeeper *tk = &timekeeper; 798 struct timekeeper *tk = &tk_core.timekeeper;
582 unsigned int seq; 799 unsigned int seq;
583 s32 ret; 800 s32 ret;
584 801
585 do { 802 do {
586 seq = read_seqcount_begin(&timekeeper_seq); 803 seq = read_seqcount_begin(&tk_core.seq);
587 ret = tk->tai_offset; 804 ret = tk->tai_offset;
588 } while (read_seqcount_retry(&timekeeper_seq, seq)); 805 } while (read_seqcount_retry(&tk_core.seq, seq));
589 806
590 return ret; 807 return ret;
591} 808}
@@ -606,14 +823,14 @@ static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset)
606 */ 823 */
607void timekeeping_set_tai_offset(s32 tai_offset) 824void timekeeping_set_tai_offset(s32 tai_offset)
608{ 825{
609 struct timekeeper *tk = &timekeeper; 826 struct timekeeper *tk = &tk_core.timekeeper;
610 unsigned long flags; 827 unsigned long flags;
611 828
612 raw_spin_lock_irqsave(&timekeeper_lock, flags); 829 raw_spin_lock_irqsave(&timekeeper_lock, flags);
613 write_seqcount_begin(&timekeeper_seq); 830 write_seqcount_begin(&tk_core.seq);
614 __timekeeping_set_tai_offset(tk, tai_offset); 831 __timekeeping_set_tai_offset(tk, tai_offset);
615 timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); 832 timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
616 write_seqcount_end(&timekeeper_seq); 833 write_seqcount_end(&tk_core.seq);
617 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 834 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
618 clock_was_set(); 835 clock_was_set();
619} 836}
@@ -625,14 +842,14 @@ void timekeeping_set_tai_offset(s32 tai_offset)
625 */ 842 */
626static int change_clocksource(void *data) 843static int change_clocksource(void *data)
627{ 844{
628 struct timekeeper *tk = &timekeeper; 845 struct timekeeper *tk = &tk_core.timekeeper;
629 struct clocksource *new, *old; 846 struct clocksource *new, *old;
630 unsigned long flags; 847 unsigned long flags;
631 848
632 new = (struct clocksource *) data; 849 new = (struct clocksource *) data;
633 850
634 raw_spin_lock_irqsave(&timekeeper_lock, flags); 851 raw_spin_lock_irqsave(&timekeeper_lock, flags);
635 write_seqcount_begin(&timekeeper_seq); 852 write_seqcount_begin(&tk_core.seq);
636 853
637 timekeeping_forward_now(tk); 854 timekeeping_forward_now(tk);
638 /* 855 /*
@@ -641,7 +858,7 @@ static int change_clocksource(void *data)
641 */ 858 */
642 if (try_module_get(new->owner)) { 859 if (try_module_get(new->owner)) {
643 if (!new->enable || new->enable(new) == 0) { 860 if (!new->enable || new->enable(new) == 0) {
644 old = tk->clock; 861 old = tk->tkr.clock;
645 tk_setup_internals(tk, new); 862 tk_setup_internals(tk, new);
646 if (old->disable) 863 if (old->disable)
647 old->disable(old); 864 old->disable(old);
@@ -652,7 +869,7 @@ static int change_clocksource(void *data)
652 } 869 }
653 timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); 870 timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
654 871
655 write_seqcount_end(&timekeeper_seq); 872 write_seqcount_end(&tk_core.seq);
656 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 873 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
657 874
658 return 0; 875 return 0;
@@ -667,29 +884,14 @@ static int change_clocksource(void *data)
667 */ 884 */
668int timekeeping_notify(struct clocksource *clock) 885int timekeeping_notify(struct clocksource *clock)
669{ 886{
670 struct timekeeper *tk = &timekeeper; 887 struct timekeeper *tk = &tk_core.timekeeper;
671 888
672 if (tk->clock == clock) 889 if (tk->tkr.clock == clock)
673 return 0; 890 return 0;
674 stop_machine(change_clocksource, clock, NULL); 891 stop_machine(change_clocksource, clock, NULL);
675 tick_clock_notify(); 892 tick_clock_notify();
676 return tk->clock == clock ? 0 : -1; 893 return tk->tkr.clock == clock ? 0 : -1;
677}
678
679/**
680 * ktime_get_real - get the real (wall-) time in ktime_t format
681 *
682 * returns the time in ktime_t format
683 */
684ktime_t ktime_get_real(void)
685{
686 struct timespec now;
687
688 getnstimeofday(&now);
689
690 return timespec_to_ktime(now);
691} 894}
692EXPORT_SYMBOL_GPL(ktime_get_real);
693 895
694/** 896/**
695 * getrawmonotonic - Returns the raw monotonic time in a timespec 897 * getrawmonotonic - Returns the raw monotonic time in a timespec
@@ -699,18 +901,20 @@ EXPORT_SYMBOL_GPL(ktime_get_real);
699 */ 901 */
700void getrawmonotonic(struct timespec *ts) 902void getrawmonotonic(struct timespec *ts)
701{ 903{
702 struct timekeeper *tk = &timekeeper; 904 struct timekeeper *tk = &tk_core.timekeeper;
905 struct timespec64 ts64;
703 unsigned long seq; 906 unsigned long seq;
704 s64 nsecs; 907 s64 nsecs;
705 908
706 do { 909 do {
707 seq = read_seqcount_begin(&timekeeper_seq); 910 seq = read_seqcount_begin(&tk_core.seq);
708 nsecs = timekeeping_get_ns_raw(tk); 911 nsecs = timekeeping_get_ns_raw(tk);
709 *ts = tk->raw_time; 912 ts64 = tk->raw_time;
710 913
711 } while (read_seqcount_retry(&timekeeper_seq, seq)); 914 } while (read_seqcount_retry(&tk_core.seq, seq));
712 915
713 timespec_add_ns(ts, nsecs); 916 timespec64_add_ns(&ts64, nsecs);
917 *ts = timespec64_to_timespec(ts64);
714} 918}
715EXPORT_SYMBOL(getrawmonotonic); 919EXPORT_SYMBOL(getrawmonotonic);
716 920
@@ -719,16 +923,16 @@ EXPORT_SYMBOL(getrawmonotonic);
719 */ 923 */
720int timekeeping_valid_for_hres(void) 924int timekeeping_valid_for_hres(void)
721{ 925{
722 struct timekeeper *tk = &timekeeper; 926 struct timekeeper *tk = &tk_core.timekeeper;
723 unsigned long seq; 927 unsigned long seq;
724 int ret; 928 int ret;
725 929
726 do { 930 do {
727 seq = read_seqcount_begin(&timekeeper_seq); 931 seq = read_seqcount_begin(&tk_core.seq);
728 932
729 ret = tk->clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; 933 ret = tk->tkr.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
730 934
731 } while (read_seqcount_retry(&timekeeper_seq, seq)); 935 } while (read_seqcount_retry(&tk_core.seq, seq));
732 936
733 return ret; 937 return ret;
734} 938}
@@ -738,16 +942,16 @@ int timekeeping_valid_for_hres(void)
738 */ 942 */
739u64 timekeeping_max_deferment(void) 943u64 timekeeping_max_deferment(void)
740{ 944{
741 struct timekeeper *tk = &timekeeper; 945 struct timekeeper *tk = &tk_core.timekeeper;
742 unsigned long seq; 946 unsigned long seq;
743 u64 ret; 947 u64 ret;
744 948
745 do { 949 do {
746 seq = read_seqcount_begin(&timekeeper_seq); 950 seq = read_seqcount_begin(&tk_core.seq);
747 951
748 ret = tk->clock->max_idle_ns; 952 ret = tk->tkr.clock->max_idle_ns;
749 953
750 } while (read_seqcount_retry(&timekeeper_seq, seq)); 954 } while (read_seqcount_retry(&tk_core.seq, seq));
751 955
752 return ret; 956 return ret;
753} 957}
@@ -787,14 +991,15 @@ void __weak read_boot_clock(struct timespec *ts)
787 */ 991 */
788void __init timekeeping_init(void) 992void __init timekeeping_init(void)
789{ 993{
790 struct timekeeper *tk = &timekeeper; 994 struct timekeeper *tk = &tk_core.timekeeper;
791 struct clocksource *clock; 995 struct clocksource *clock;
792 unsigned long flags; 996 unsigned long flags;
793 struct timespec now, boot, tmp; 997 struct timespec64 now, boot, tmp;
794 998 struct timespec ts;
795 read_persistent_clock(&now);
796 999
797 if (!timespec_valid_strict(&now)) { 1000 read_persistent_clock(&ts);
1001 now = timespec_to_timespec64(ts);
1002 if (!timespec64_valid_strict(&now)) {
798 pr_warn("WARNING: Persistent clock returned invalid value!\n" 1003 pr_warn("WARNING: Persistent clock returned invalid value!\n"
799 " Check your CMOS/BIOS settings.\n"); 1004 " Check your CMOS/BIOS settings.\n");
800 now.tv_sec = 0; 1005 now.tv_sec = 0;
@@ -802,8 +1007,9 @@ void __init timekeeping_init(void)
802 } else if (now.tv_sec || now.tv_nsec) 1007 } else if (now.tv_sec || now.tv_nsec)
803 persistent_clock_exist = true; 1008 persistent_clock_exist = true;
804 1009
805 read_boot_clock(&boot); 1010 read_boot_clock(&ts);
806 if (!timespec_valid_strict(&boot)) { 1011 boot = timespec_to_timespec64(ts);
1012 if (!timespec64_valid_strict(&boot)) {
807 pr_warn("WARNING: Boot clock returned invalid value!\n" 1013 pr_warn("WARNING: Boot clock returned invalid value!\n"
808 " Check your CMOS/BIOS settings.\n"); 1014 " Check your CMOS/BIOS settings.\n");
809 boot.tv_sec = 0; 1015 boot.tv_sec = 0;
@@ -811,7 +1017,7 @@ void __init timekeeping_init(void)
811 } 1017 }
812 1018
813 raw_spin_lock_irqsave(&timekeeper_lock, flags); 1019 raw_spin_lock_irqsave(&timekeeper_lock, flags);
814 write_seqcount_begin(&timekeeper_seq); 1020 write_seqcount_begin(&tk_core.seq);
815 ntp_init(); 1021 ntp_init();
816 1022
817 clock = clocksource_default_clock(); 1023 clock = clocksource_default_clock();
@@ -822,24 +1028,21 @@ void __init timekeeping_init(void)
822 tk_set_xtime(tk, &now); 1028 tk_set_xtime(tk, &now);
823 tk->raw_time.tv_sec = 0; 1029 tk->raw_time.tv_sec = 0;
824 tk->raw_time.tv_nsec = 0; 1030 tk->raw_time.tv_nsec = 0;
1031 tk->base_raw.tv64 = 0;
825 if (boot.tv_sec == 0 && boot.tv_nsec == 0) 1032 if (boot.tv_sec == 0 && boot.tv_nsec == 0)
826 boot = tk_xtime(tk); 1033 boot = tk_xtime(tk);
827 1034
828 set_normalized_timespec(&tmp, -boot.tv_sec, -boot.tv_nsec); 1035 set_normalized_timespec64(&tmp, -boot.tv_sec, -boot.tv_nsec);
829 tk_set_wall_to_mono(tk, tmp); 1036 tk_set_wall_to_mono(tk, tmp);
830 1037
831 tmp.tv_sec = 0; 1038 timekeeping_update(tk, TK_MIRROR);
832 tmp.tv_nsec = 0;
833 tk_set_sleep_time(tk, tmp);
834
835 memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper));
836 1039
837 write_seqcount_end(&timekeeper_seq); 1040 write_seqcount_end(&tk_core.seq);
838 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 1041 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
839} 1042}
840 1043
841/* time in seconds when suspend began */ 1044/* time in seconds when suspend began */
842static struct timespec timekeeping_suspend_time; 1045static struct timespec64 timekeeping_suspend_time;
843 1046
844/** 1047/**
845 * __timekeeping_inject_sleeptime - Internal function to add sleep interval 1048 * __timekeeping_inject_sleeptime - Internal function to add sleep interval
@@ -849,17 +1052,17 @@ static struct timespec timekeeping_suspend_time;
849 * adds the sleep offset to the timekeeping variables. 1052 * adds the sleep offset to the timekeeping variables.
850 */ 1053 */
851static void __timekeeping_inject_sleeptime(struct timekeeper *tk, 1054static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
852 struct timespec *delta) 1055 struct timespec64 *delta)
853{ 1056{
854 if (!timespec_valid_strict(delta)) { 1057 if (!timespec64_valid_strict(delta)) {
855 printk_deferred(KERN_WARNING 1058 printk_deferred(KERN_WARNING
856 "__timekeeping_inject_sleeptime: Invalid " 1059 "__timekeeping_inject_sleeptime: Invalid "
857 "sleep delta value!\n"); 1060 "sleep delta value!\n");
858 return; 1061 return;
859 } 1062 }
860 tk_xtime_add(tk, delta); 1063 tk_xtime_add(tk, delta);
861 tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *delta)); 1064 tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, *delta));
862 tk_set_sleep_time(tk, timespec_add(tk->total_sleep_time, *delta)); 1065 tk_update_sleep_time(tk, timespec64_to_ktime(*delta));
863 tk_debug_account_sleep_time(delta); 1066 tk_debug_account_sleep_time(delta);
864} 1067}
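
The bookkeeping above is easiest to read as three signed adjustments: the wall clock moves forward by the time spent asleep, wall_to_monotonic moves back by the same amount so CLOCK_MONOTONIC does not jump, and the boot offset grows so CLOCK_BOOTTIME does account for the sleep. A toy model with simplified field names, purely illustrative:

struct toy_timekeeper {
        s64 xtime_ns;           /* wall time */
        s64 wall_to_mono_ns;    /* wall -> monotonic offset */
        s64 offs_boot_ns;       /* monotonic -> boottime offset */
};

static void toy_inject_sleeptime(struct toy_timekeeper *tk, s64 delta_ns)
{
        tk->xtime_ns        += delta_ns;        /* wall clock advances */
        tk->wall_to_mono_ns -= delta_ns;        /* monotonic stays put */
        tk->offs_boot_ns    += delta_ns;        /* boottime includes the sleep */
}
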
865 1068
@@ -875,7 +1078,8 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
875 */ 1078 */
876void timekeeping_inject_sleeptime(struct timespec *delta) 1079void timekeeping_inject_sleeptime(struct timespec *delta)
877{ 1080{
878 struct timekeeper *tk = &timekeeper; 1081 struct timekeeper *tk = &tk_core.timekeeper;
1082 struct timespec64 tmp;
879 unsigned long flags; 1083 unsigned long flags;
880 1084
881 /* 1085 /*
@@ -886,15 +1090,16 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
886 return; 1090 return;
887 1091
888 raw_spin_lock_irqsave(&timekeeper_lock, flags); 1092 raw_spin_lock_irqsave(&timekeeper_lock, flags);
889 write_seqcount_begin(&timekeeper_seq); 1093 write_seqcount_begin(&tk_core.seq);
890 1094
891 timekeeping_forward_now(tk); 1095 timekeeping_forward_now(tk);
892 1096
893 __timekeeping_inject_sleeptime(tk, delta); 1097 tmp = timespec_to_timespec64(*delta);
1098 __timekeeping_inject_sleeptime(tk, &tmp);
894 1099
895 timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); 1100 timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
896 1101
897 write_seqcount_end(&timekeeper_seq); 1102 write_seqcount_end(&tk_core.seq);
898 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 1103 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
899 1104
900 /* signal hrtimers about time change */ 1105 /* signal hrtimers about time change */
@@ -910,20 +1115,22 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
910 */ 1115 */
911static void timekeeping_resume(void) 1116static void timekeeping_resume(void)
912{ 1117{
913 struct timekeeper *tk = &timekeeper; 1118 struct timekeeper *tk = &tk_core.timekeeper;
914 struct clocksource *clock = tk->clock; 1119 struct clocksource *clock = tk->tkr.clock;
915 unsigned long flags; 1120 unsigned long flags;
916 struct timespec ts_new, ts_delta; 1121 struct timespec64 ts_new, ts_delta;
1122 struct timespec tmp;
917 cycle_t cycle_now, cycle_delta; 1123 cycle_t cycle_now, cycle_delta;
918 bool suspendtime_found = false; 1124 bool suspendtime_found = false;
919 1125
920 read_persistent_clock(&ts_new); 1126 read_persistent_clock(&tmp);
1127 ts_new = timespec_to_timespec64(tmp);
921 1128
922 clockevents_resume(); 1129 clockevents_resume();
923 clocksource_resume(); 1130 clocksource_resume();
924 1131
925 raw_spin_lock_irqsave(&timekeeper_lock, flags); 1132 raw_spin_lock_irqsave(&timekeeper_lock, flags);
926 write_seqcount_begin(&timekeeper_seq); 1133 write_seqcount_begin(&tk_core.seq);
927 1134
928 /* 1135 /*
929 * After system resumes, we need to calculate the suspended time and 1136 * After system resumes, we need to calculate the suspended time and
@@ -937,15 +1144,16 @@ static void timekeeping_resume(void)
937 * The less preferred source will only be tried if there is no better 1144 * The less preferred source will only be tried if there is no better
938 * usable source. The rtc part is handled separately in rtc core code. 1145 * usable source. The rtc part is handled separately in rtc core code.
939 */ 1146 */
940 cycle_now = clock->read(clock); 1147 cycle_now = tk->tkr.read(clock);
941 if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) && 1148 if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) &&
942 cycle_now > clock->cycle_last) { 1149 cycle_now > tk->tkr.cycle_last) {
943 u64 num, max = ULLONG_MAX; 1150 u64 num, max = ULLONG_MAX;
944 u32 mult = clock->mult; 1151 u32 mult = clock->mult;
945 u32 shift = clock->shift; 1152 u32 shift = clock->shift;
946 s64 nsec = 0; 1153 s64 nsec = 0;
947 1154
948 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; 1155 cycle_delta = clocksource_delta(cycle_now, tk->tkr.cycle_last,
1156 tk->tkr.mask);
949 1157
950 /* 1158 /*
951 * "cycle_delta * mutl" may cause 64 bits overflow, if the 1159 * "cycle_delta * mutl" may cause 64 bits overflow, if the
@@ -960,10 +1168,10 @@ static void timekeeping_resume(void)
960 } 1168 }
961 nsec += ((u64) cycle_delta * mult) >> shift; 1169 nsec += ((u64) cycle_delta * mult) >> shift;
962 1170
963 ts_delta = ns_to_timespec(nsec); 1171 ts_delta = ns_to_timespec64(nsec);
964 suspendtime_found = true; 1172 suspendtime_found = true;
965 } else if (timespec_compare(&ts_new, &timekeeping_suspend_time) > 0) { 1173 } else if (timespec64_compare(&ts_new, &timekeeping_suspend_time) > 0) {
966 ts_delta = timespec_sub(ts_new, timekeeping_suspend_time); 1174 ts_delta = timespec64_sub(ts_new, timekeeping_suspend_time);
967 suspendtime_found = true; 1175 suspendtime_found = true;
968 } 1176 }
969 1177
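
The branch above converts the raw cycle delta to nanoseconds with the clocksource's mult/shift pair, chunking the multiplication so a long suspend interval cannot overflow 64 bits. The same idea as a standalone sketch (illustrative; it mirrors the loop above rather than replacing it):

#include <linux/math64.h>

static u64 toy_suspend_cycles_to_ns(u64 delta, u32 mult, u32 shift)
{
        u64 max = div_u64(ULLONG_MAX, mult);    /* largest chunk that cannot overflow */
        u64 nsec = 0;

        while (delta > max) {
                nsec += (max * mult) >> shift;
                delta -= max;
        }
        return nsec + ((delta * mult) >> shift);
}
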
@@ -971,11 +1179,11 @@ static void timekeeping_resume(void)
971 __timekeeping_inject_sleeptime(tk, &ts_delta); 1179 __timekeeping_inject_sleeptime(tk, &ts_delta);
972 1180
973 /* Re-base the last cycle value */ 1181 /* Re-base the last cycle value */
974 tk->cycle_last = clock->cycle_last = cycle_now; 1182 tk->tkr.cycle_last = cycle_now;
975 tk->ntp_error = 0; 1183 tk->ntp_error = 0;
976 timekeeping_suspended = 0; 1184 timekeeping_suspended = 0;
977 timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); 1185 timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
978 write_seqcount_end(&timekeeper_seq); 1186 write_seqcount_end(&tk_core.seq);
979 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 1187 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
980 1188
981 touch_softlockup_watchdog(); 1189 touch_softlockup_watchdog();
@@ -988,12 +1196,14 @@ static void timekeeping_resume(void)
988 1196
989static int timekeeping_suspend(void) 1197static int timekeeping_suspend(void)
990{ 1198{
991 struct timekeeper *tk = &timekeeper; 1199 struct timekeeper *tk = &tk_core.timekeeper;
992 unsigned long flags; 1200 unsigned long flags;
993 struct timespec delta, delta_delta; 1201 struct timespec64 delta, delta_delta;
994 static struct timespec old_delta; 1202 static struct timespec64 old_delta;
1203 struct timespec tmp;
995 1204
996 read_persistent_clock(&timekeeping_suspend_time); 1205 read_persistent_clock(&tmp);
1206 timekeeping_suspend_time = timespec_to_timespec64(tmp);
997 1207
998 /* 1208 /*
999 * On some systems the persistent_clock can not be detected at 1209 * On some systems the persistent_clock can not be detected at
@@ -1004,7 +1214,7 @@ static int timekeeping_suspend(void)
1004 persistent_clock_exist = true; 1214 persistent_clock_exist = true;
1005 1215
1006 raw_spin_lock_irqsave(&timekeeper_lock, flags); 1216 raw_spin_lock_irqsave(&timekeeper_lock, flags);
1007 write_seqcount_begin(&timekeeper_seq); 1217 write_seqcount_begin(&tk_core.seq);
1008 timekeeping_forward_now(tk); 1218 timekeeping_forward_now(tk);
1009 timekeeping_suspended = 1; 1219 timekeeping_suspended = 1;
1010 1220
@@ -1014,8 +1224,8 @@ static int timekeeping_suspend(void)
1014 * try to compensate so the difference in system time 1224 * try to compensate so the difference in system time
1015 * and persistent_clock time stays close to constant. 1225 * and persistent_clock time stays close to constant.
1016 */ 1226 */
1017 delta = timespec_sub(tk_xtime(tk), timekeeping_suspend_time); 1227 delta = timespec64_sub(tk_xtime(tk), timekeeping_suspend_time);
1018 delta_delta = timespec_sub(delta, old_delta); 1228 delta_delta = timespec64_sub(delta, old_delta);
1019 if (abs(delta_delta.tv_sec) >= 2) { 1229 if (abs(delta_delta.tv_sec) >= 2) {
1020 /* 1230 /*
1021 * if delta_delta is too large, assume time correction 1231 * if delta_delta is too large, assume time correction
@@ -1025,11 +1235,11 @@ static int timekeeping_suspend(void)
1025 } else { 1235 } else {
1026 /* Otherwise try to adjust old_system to compensate */ 1236 /* Otherwise try to adjust old_system to compensate */
1027 timekeeping_suspend_time = 1237 timekeeping_suspend_time =
1028 timespec_add(timekeeping_suspend_time, delta_delta); 1238 timespec64_add(timekeeping_suspend_time, delta_delta);
1029 } 1239 }
1030 1240
1031 timekeeping_update(tk, TK_MIRROR); 1241 timekeeping_update(tk, TK_MIRROR);
1032 write_seqcount_end(&timekeeper_seq); 1242 write_seqcount_end(&tk_core.seq);
1033 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 1243 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
1034 1244
1035 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); 1245 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
@@ -1050,125 +1260,34 @@ static int __init timekeeping_init_ops(void)
1050 register_syscore_ops(&timekeeping_syscore_ops); 1260 register_syscore_ops(&timekeeping_syscore_ops);
1051 return 0; 1261 return 0;
1052} 1262}
1053
1054device_initcall(timekeeping_init_ops); 1263device_initcall(timekeeping_init_ops);
1055 1264
1056/* 1265/*
1057 * If the error is already larger, we look ahead even further 1266 * Apply a multiplier adjustment to the timekeeper
1058 * to compensate for late or lost adjustments.
1059 */ 1267 */
1060static __always_inline int timekeeping_bigadjust(struct timekeeper *tk, 1268static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk,
1061 s64 error, s64 *interval, 1269 s64 offset,
1062 s64 *offset) 1270 bool negative,
1271 int adj_scale)
1063{ 1272{
1064 s64 tick_error, i; 1273 s64 interval = tk->cycle_interval;
1065 u32 look_ahead, adj; 1274 s32 mult_adj = 1;
1066 s32 error2, mult;
1067
1068 /*
1069 * Use the current error value to determine how much to look ahead.
1070 * The larger the error the slower we adjust for it to avoid problems
1071 * with losing too many ticks, otherwise we would overadjust and
1072 * produce an even larger error. The smaller the adjustment the
1073 * faster we try to adjust for it, as lost ticks can do less harm
1074 * here. This is tuned so that an error of about 1 msec is adjusted
1075 * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks).
1076 */
1077 error2 = tk->ntp_error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ);
1078 error2 = abs(error2);
1079 for (look_ahead = 0; error2 > 0; look_ahead++)
1080 error2 >>= 2;
1081 1275
1082 /* 1276 if (negative) {
1083 * Now calculate the error in (1 << look_ahead) ticks, but first 1277 mult_adj = -mult_adj;
1084 * remove the single look ahead already included in the error. 1278 interval = -interval;
1085 */ 1279 offset = -offset;
1086 tick_error = ntp_tick_length() >> (tk->ntp_error_shift + 1);
1087 tick_error -= tk->xtime_interval >> 1;
1088 error = ((error - tick_error) >> look_ahead) + tick_error;
1089
1090 /* Finally calculate the adjustment shift value. */
1091 i = *interval;
1092 mult = 1;
1093 if (error < 0) {
1094 error = -error;
1095 *interval = -*interval;
1096 *offset = -*offset;
1097 mult = -1;
1098 } 1280 }
1099 for (adj = 0; error > i; adj++) 1281 mult_adj <<= adj_scale;
1100 error >>= 1; 1282 interval <<= adj_scale;
1101 1283 offset <<= adj_scale;
1102 *interval <<= adj;
1103 *offset <<= adj;
1104 return mult << adj;
1105}
1106
1107/*
1108 * Adjust the multiplier to reduce the error value,
1109 * this is optimized for the most common adjustments of -1,0,1,
1110 * for other values we can do a bit more work.
1111 */
1112static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
1113{
1114 s64 error, interval = tk->cycle_interval;
1115 int adj;
1116 1284
1117 /* 1285 /*
1118 * The point of this is to check if the error is greater than half
1119 * an interval.
1120 *
1121 * First we shift it down from NTP_SHIFT to clocksource->shifted nsecs.
1122 *
1123 * Note we subtract one in the shift, so that error is really error*2.
1124 * This "saves" dividing(shifting) interval twice, but keeps the
1125 * (error > interval) comparison as still measuring if error is
1126 * larger than half an interval.
1127 *
1128 * Note: It does not "save" on aggravation when reading the code.
1129 */
1130 error = tk->ntp_error >> (tk->ntp_error_shift - 1);
1131 if (error > interval) {
1132 /*
1133 * We now divide error by 4(via shift), which checks if
1134 * the error is greater than twice the interval.
1135 * If it is greater, we need a bigadjust, if its smaller,
1136 * we can adjust by 1.
1137 */
1138 error >>= 2;
1139 if (likely(error <= interval))
1140 adj = 1;
1141 else
1142 adj = timekeeping_bigadjust(tk, error, &interval, &offset);
1143 } else {
1144 if (error < -interval) {
1145 /* See comment above, this is just switched for the negative */
1146 error >>= 2;
1147 if (likely(error >= -interval)) {
1148 adj = -1;
1149 interval = -interval;
1150 offset = -offset;
1151 } else {
1152 adj = timekeeping_bigadjust(tk, error, &interval, &offset);
1153 }
1154 } else {
1155 goto out_adjust;
1156 }
1157 }
1158
1159 if (unlikely(tk->clock->maxadj &&
1160 (tk->mult + adj > tk->clock->mult + tk->clock->maxadj))) {
1161 printk_deferred_once(KERN_WARNING
1162 "Adjusting %s more than 11%% (%ld vs %ld)\n",
1163 tk->clock->name, (long)tk->mult + adj,
1164 (long)tk->clock->mult + tk->clock->maxadj);
1165 }
1166 /*
1167 * So the following can be confusing. 1286 * So the following can be confusing.
1168 * 1287 *
1169 * To keep things simple, lets assume adj == 1 for now. 1288 * To keep things simple, lets assume mult_adj == 1 for now.
1170 * 1289 *
1171 * When adj != 1, remember that the interval and offset values 1290 * When mult_adj != 1, remember that the interval and offset values
1172 * have been appropriately scaled so the math is the same. 1291 * have been appropriately scaled so the math is the same.
1173 * 1292 *
1174 * The basic idea here is that we're increasing the multiplier 1293 * The basic idea here is that we're increasing the multiplier
@@ -1212,12 +1331,78 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
1212 * 1331 *
1213 * XXX - TODO: Doc ntp_error calculation. 1332 * XXX - TODO: Doc ntp_error calculation.
1214 */ 1333 */
1215 tk->mult += adj; 1334 tk->tkr.mult += mult_adj;
1216 tk->xtime_interval += interval; 1335 tk->xtime_interval += interval;
1217 tk->xtime_nsec -= offset; 1336 tk->tkr.xtime_nsec -= offset;
1218 tk->ntp_error -= (interval - offset) << tk->ntp_error_shift; 1337 tk->ntp_error -= (interval - offset) << tk->ntp_error_shift;
1338}
1339
1340/*
1341 * Calculate the multiplier adjustment needed to match the frequency
1342 * specified by NTP
1343 */
1344static __always_inline void timekeeping_freqadjust(struct timekeeper *tk,
1345 s64 offset)
1346{
1347 s64 interval = tk->cycle_interval;
1348 s64 xinterval = tk->xtime_interval;
1349 s64 tick_error;
1350 bool negative;
1351 u32 adj;
1352
1353 /* Remove any current error adj from freq calculation */
1354 if (tk->ntp_err_mult)
1355 xinterval -= tk->cycle_interval;
1356
1357 tk->ntp_tick = ntp_tick_length();
1358
1359 /* Calculate current error per tick */
1360 tick_error = ntp_tick_length() >> tk->ntp_error_shift;
1361 tick_error -= (xinterval + tk->xtime_remainder);
1362
1363 /* Don't worry about correcting it if it's small */
1364 if (likely((tick_error >= 0) && (tick_error <= interval)))
1365 return;
1366
1367 /* preserve the direction of correction */
1368 negative = (tick_error < 0);
1369
1370 /* Sort out the magnitude of the correction */
1371 tick_error = abs(tick_error);
1372 for (adj = 0; tick_error > interval; adj++)
1373 tick_error >>= 1;
1374
1375 /* scale the corrections */
1376 timekeeping_apply_adjustment(tk, offset, negative, adj);
1377}
1378
1379/*
1380 * Adjust the timekeeper's multiplier to the correct frequency
1381 * and also to reduce the accumulated error value.
1382 */
1383static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
1384{
1385 /* Correct for the current frequency error */
1386 timekeeping_freqadjust(tk, offset);
1387
1388 /* Next make a small adjustment to fix any cumulative error */
1389 if (!tk->ntp_err_mult && (tk->ntp_error > 0)) {
1390 tk->ntp_err_mult = 1;
1391 timekeeping_apply_adjustment(tk, offset, 0, 0);
1392 } else if (tk->ntp_err_mult && (tk->ntp_error <= 0)) {
1393 /* Undo any existing error adjustment */
1394 timekeeping_apply_adjustment(tk, offset, 1, 0);
1395 tk->ntp_err_mult = 0;
1396 }
1397
1398 if (unlikely(tk->tkr.clock->maxadj &&
1399 (tk->tkr.mult > tk->tkr.clock->mult + tk->tkr.clock->maxadj))) {
1400 printk_once(KERN_WARNING
1401 "Adjusting %s more than 11%% (%ld vs %ld)\n",
1402 tk->tkr.clock->name, (long)tk->tkr.mult,
1403 (long)tk->tkr.clock->mult + tk->tkr.clock->maxadj);
1404 }
1219 1405
1220out_adjust:
1221 /* 1406 /*
1222 * It may be possible that when we entered this function, xtime_nsec 1407 * It may be possible that when we entered this function, xtime_nsec
1223 * was very small. Further, if we're slightly speeding the clocksource 1408 * was very small. Further, if we're slightly speeding the clocksource
@@ -1232,12 +1417,11 @@ out_adjust:
1232 * We'll correct this error next time through this function, when 1417 * We'll correct this error next time through this function, when
1233 * xtime_nsec is not as small. 1418 * xtime_nsec is not as small.
1234 */ 1419 */
1235 if (unlikely((s64)tk->xtime_nsec < 0)) { 1420 if (unlikely((s64)tk->tkr.xtime_nsec < 0)) {
1236 s64 neg = -(s64)tk->xtime_nsec; 1421 s64 neg = -(s64)tk->tkr.xtime_nsec;
1237 tk->xtime_nsec = 0; 1422 tk->tkr.xtime_nsec = 0;
1238 tk->ntp_error += neg << tk->ntp_error_shift; 1423 tk->ntp_error += neg << tk->ntp_error_shift;
1239 } 1424 }
1240
1241} 1425}
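
timekeeping_freqadjust() picks the adjustment magnitude by halving the per-tick error until it fits within one cycle interval; the resulting shift count then scales a +/-1 multiplier step in timekeeping_apply_adjustment(). The magnitude search on its own, as an illustrative helper:

static u32 toy_adj_scale(s64 tick_error, s64 interval)
{
        u32 adj = 0;

        tick_error = abs(tick_error);           /* direction handled separately */
        while (tick_error > interval) {
                tick_error >>= 1;               /* halve until it fits one interval */
                adj++;
        }
        return adj;                             /* mult changes by +/- (1 << adj) */
}
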
1242 1426
1243/** 1427/**
@@ -1250,26 +1434,26 @@ out_adjust:
1250 */ 1434 */
1251static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) 1435static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk)
1252{ 1436{
1253 u64 nsecps = (u64)NSEC_PER_SEC << tk->shift; 1437 u64 nsecps = (u64)NSEC_PER_SEC << tk->tkr.shift;
1254 unsigned int clock_set = 0; 1438 unsigned int clock_set = 0;
1255 1439
1256 while (tk->xtime_nsec >= nsecps) { 1440 while (tk->tkr.xtime_nsec >= nsecps) {
1257 int leap; 1441 int leap;
1258 1442
1259 tk->xtime_nsec -= nsecps; 1443 tk->tkr.xtime_nsec -= nsecps;
1260 tk->xtime_sec++; 1444 tk->xtime_sec++;
1261 1445
1262 /* Figure out if its a leap sec and apply if needed */ 1446 /* Figure out if its a leap sec and apply if needed */
1263 leap = second_overflow(tk->xtime_sec); 1447 leap = second_overflow(tk->xtime_sec);
1264 if (unlikely(leap)) { 1448 if (unlikely(leap)) {
1265 struct timespec ts; 1449 struct timespec64 ts;
1266 1450
1267 tk->xtime_sec += leap; 1451 tk->xtime_sec += leap;
1268 1452
1269 ts.tv_sec = leap; 1453 ts.tv_sec = leap;
1270 ts.tv_nsec = 0; 1454 ts.tv_nsec = 0;
1271 tk_set_wall_to_mono(tk, 1455 tk_set_wall_to_mono(tk,
1272 timespec_sub(tk->wall_to_monotonic, ts)); 1456 timespec64_sub(tk->wall_to_monotonic, ts));
1273 1457
1274 __timekeeping_set_tai_offset(tk, tk->tai_offset - leap); 1458 __timekeeping_set_tai_offset(tk, tk->tai_offset - leap);
1275 1459
@@ -1301,9 +1485,9 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
1301 1485
1302 /* Accumulate one shifted interval */ 1486 /* Accumulate one shifted interval */
1303 offset -= interval; 1487 offset -= interval;
1304 tk->cycle_last += interval; 1488 tk->tkr.cycle_last += interval;
1305 1489
1306 tk->xtime_nsec += tk->xtime_interval << shift; 1490 tk->tkr.xtime_nsec += tk->xtime_interval << shift;
1307 *clock_set |= accumulate_nsecs_to_secs(tk); 1491 *clock_set |= accumulate_nsecs_to_secs(tk);
1308 1492
1309 /* Accumulate raw time */ 1493 /* Accumulate raw time */
@@ -1317,48 +1501,20 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
1317 tk->raw_time.tv_nsec = raw_nsecs; 1501 tk->raw_time.tv_nsec = raw_nsecs;
1318 1502
1319 /* Accumulate error between NTP and clock interval */ 1503 /* Accumulate error between NTP and clock interval */
1320 tk->ntp_error += ntp_tick_length() << shift; 1504 tk->ntp_error += tk->ntp_tick << shift;
1321 tk->ntp_error -= (tk->xtime_interval + tk->xtime_remainder) << 1505 tk->ntp_error -= (tk->xtime_interval + tk->xtime_remainder) <<
1322 (tk->ntp_error_shift + shift); 1506 (tk->ntp_error_shift + shift);
1323 1507
1324 return offset; 1508 return offset;
1325} 1509}
1326 1510
1327#ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD
1328static inline void old_vsyscall_fixup(struct timekeeper *tk)
1329{
1330 s64 remainder;
1331
1332 /*
1333 * Store only full nanoseconds into xtime_nsec after rounding
1334 * it up and add the remainder to the error difference.
1335 * XXX - This is necessary to avoid small 1ns inconsistencies caused
1336 * by truncating the remainder in vsyscalls. However, it causes
1337 * additional work to be done in timekeeping_adjust(). Once
1338 * the vsyscall implementations are converted to use xtime_nsec
1339 * (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD
1340 * users are removed, this can be killed.
1341 */
1342 remainder = tk->xtime_nsec & ((1ULL << tk->shift) - 1);
1343 tk->xtime_nsec -= remainder;
1344 tk->xtime_nsec += 1ULL << tk->shift;
1345 tk->ntp_error += remainder << tk->ntp_error_shift;
1346 tk->ntp_error -= (1ULL << tk->shift) << tk->ntp_error_shift;
1347}
1348#else
1349#define old_vsyscall_fixup(tk)
1350#endif
1351
1352
1353
1354/** 1511/**
1355 * update_wall_time - Uses the current clocksource to increment the wall time 1512 * update_wall_time - Uses the current clocksource to increment the wall time
1356 * 1513 *
1357 */ 1514 */
1358void update_wall_time(void) 1515void update_wall_time(void)
1359{ 1516{
1360 struct clocksource *clock; 1517 struct timekeeper *real_tk = &tk_core.timekeeper;
1361 struct timekeeper *real_tk = &timekeeper;
1362 struct timekeeper *tk = &shadow_timekeeper; 1518 struct timekeeper *tk = &shadow_timekeeper;
1363 cycle_t offset; 1519 cycle_t offset;
1364 int shift = 0, maxshift; 1520 int shift = 0, maxshift;
@@ -1371,12 +1527,11 @@ void update_wall_time(void)
1371 if (unlikely(timekeeping_suspended)) 1527 if (unlikely(timekeeping_suspended))
1372 goto out; 1528 goto out;
1373 1529
1374 clock = real_tk->clock;
1375
1376#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET 1530#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
1377 offset = real_tk->cycle_interval; 1531 offset = real_tk->cycle_interval;
1378#else 1532#else
1379 offset = (clock->read(clock) - clock->cycle_last) & clock->mask; 1533 offset = clocksource_delta(tk->tkr.read(tk->tkr.clock),
1534 tk->tkr.cycle_last, tk->tkr.mask);
1380#endif 1535#endif
1381 1536
1382 /* Check if there's really nothing to do */ 1537 /* Check if there's really nothing to do */
@@ -1418,9 +1573,7 @@ void update_wall_time(void)
1418 */ 1573 */
1419 clock_set |= accumulate_nsecs_to_secs(tk); 1574 clock_set |= accumulate_nsecs_to_secs(tk);
1420 1575
1421 write_seqcount_begin(&timekeeper_seq); 1576 write_seqcount_begin(&tk_core.seq);
1422 /* Update clock->cycle_last with the new value */
1423 clock->cycle_last = tk->cycle_last;
1424 /* 1577 /*
1425 * Update the real timekeeper. 1578 * Update the real timekeeper.
1426 * 1579 *
@@ -1428,12 +1581,12 @@ void update_wall_time(void)
1428 * requires changes to all other timekeeper usage sites as 1581 * requires changes to all other timekeeper usage sites as
1429 * well, i.e. move the timekeeper pointer getter into the 1582 * well, i.e. move the timekeeper pointer getter into the
1430 * spinlocked/seqcount protected sections. And we trade this 1583 * spinlocked/seqcount protected sections. And we trade this
1431 * memcpy under the timekeeper_seq against one before we start 1584 * memcpy under the tk_core.seq against one before we start
1432 * updating. 1585 * updating.
1433 */ 1586 */
1434 memcpy(real_tk, tk, sizeof(*tk)); 1587 memcpy(real_tk, tk, sizeof(*tk));
1435 timekeeping_update(real_tk, clock_set); 1588 timekeeping_update(real_tk, clock_set);
1436 write_seqcount_end(&timekeeper_seq); 1589 write_seqcount_end(&tk_core.seq);
1437out: 1590out:
1438 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 1591 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
1439 if (clock_set) 1592 if (clock_set)
@@ -1454,83 +1607,16 @@ out:
1454 */ 1607 */
1455void getboottime(struct timespec *ts) 1608void getboottime(struct timespec *ts)
1456{ 1609{
1457 struct timekeeper *tk = &timekeeper; 1610 struct timekeeper *tk = &tk_core.timekeeper;
1458 struct timespec boottime = { 1611 ktime_t t = ktime_sub(tk->offs_real, tk->offs_boot);
1459 .tv_sec = tk->wall_to_monotonic.tv_sec +
1460 tk->total_sleep_time.tv_sec,
1461 .tv_nsec = tk->wall_to_monotonic.tv_nsec +
1462 tk->total_sleep_time.tv_nsec
1463 };
1464
1465 set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec);
1466}
1467EXPORT_SYMBOL_GPL(getboottime);
1468
1469/**
1470 * get_monotonic_boottime - Returns monotonic time since boot
1471 * @ts: pointer to the timespec to be set
1472 *
1473 * Returns the monotonic time since boot in a timespec.
1474 *
1475 * This is similar to CLOCK_MONOTONIC/ktime_get_ts, but also
1476 * includes the time spent in suspend.
1477 */
1478void get_monotonic_boottime(struct timespec *ts)
1479{
1480 struct timekeeper *tk = &timekeeper;
1481 struct timespec tomono, sleep;
1482 s64 nsec;
1483 unsigned int seq;
1484
1485 WARN_ON(timekeeping_suspended);
1486
1487 do {
1488 seq = read_seqcount_begin(&timekeeper_seq);
1489 ts->tv_sec = tk->xtime_sec;
1490 nsec = timekeeping_get_ns(tk);
1491 tomono = tk->wall_to_monotonic;
1492 sleep = tk->total_sleep_time;
1493
1494 } while (read_seqcount_retry(&timekeeper_seq, seq));
1495
1496 ts->tv_sec += tomono.tv_sec + sleep.tv_sec;
1497 ts->tv_nsec = 0;
1498 timespec_add_ns(ts, nsec + tomono.tv_nsec + sleep.tv_nsec);
1499}
1500EXPORT_SYMBOL_GPL(get_monotonic_boottime);
1501
1502/**
1503 * ktime_get_boottime - Returns monotonic time since boot in a ktime
1504 *
1505 * Returns the monotonic time since boot in a ktime
1506 *
1507 * This is similar to CLOCK_MONOTONIC/ktime_get, but also
1508 * includes the time spent in suspend.
1509 */
1510ktime_t ktime_get_boottime(void)
1511{
1512 struct timespec ts;
1513
1514 get_monotonic_boottime(&ts);
1515 return timespec_to_ktime(ts);
1516}
1517EXPORT_SYMBOL_GPL(ktime_get_boottime);
1518
1519/**
1520 * monotonic_to_bootbased - Convert the monotonic time to boot based.
1521 * @ts: pointer to the timespec to be converted
1522 */
1523void monotonic_to_bootbased(struct timespec *ts)
1524{
1525 struct timekeeper *tk = &timekeeper;
1526 1612
1527 *ts = timespec_add(*ts, tk->total_sleep_time); 1613 *ts = ktime_to_timespec(t);
1528} 1614}
1529EXPORT_SYMBOL_GPL(monotonic_to_bootbased); 1615EXPORT_SYMBOL_GPL(getboottime);
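
The rewritten getboottime() leans on a simple identity: for any monotonic instant t, wall = t + offs_real and boot = t + offs_boot, so the wall-clock value of the boot instant (where boot == 0) is just offs_real - offs_boot, independent of the current monotonic reading. A one-line restatement, as an illustration:

/* Wall-clock time of the boot instant, given the two offsets (sketch). */
static ktime_t toy_boot_epoch(ktime_t offs_real, ktime_t offs_boot)
{
        return ktime_sub(offs_real, offs_boot);
}
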
1530 1616
1531unsigned long get_seconds(void) 1617unsigned long get_seconds(void)
1532{ 1618{
1533 struct timekeeper *tk = &timekeeper; 1619 struct timekeeper *tk = &tk_core.timekeeper;
1534 1620
1535 return tk->xtime_sec; 1621 return tk->xtime_sec;
1536} 1622}
@@ -1538,43 +1624,44 @@ EXPORT_SYMBOL(get_seconds);
1538 1624
1539struct timespec __current_kernel_time(void) 1625struct timespec __current_kernel_time(void)
1540{ 1626{
1541 struct timekeeper *tk = &timekeeper; 1627 struct timekeeper *tk = &tk_core.timekeeper;
1542 1628
1543 return tk_xtime(tk); 1629 return timespec64_to_timespec(tk_xtime(tk));
1544} 1630}
1545 1631
1546struct timespec current_kernel_time(void) 1632struct timespec current_kernel_time(void)
1547{ 1633{
1548 struct timekeeper *tk = &timekeeper; 1634 struct timekeeper *tk = &tk_core.timekeeper;
1549 struct timespec now; 1635 struct timespec64 now;
1550 unsigned long seq; 1636 unsigned long seq;
1551 1637
1552 do { 1638 do {
1553 seq = read_seqcount_begin(&timekeeper_seq); 1639 seq = read_seqcount_begin(&tk_core.seq);
1554 1640
1555 now = tk_xtime(tk); 1641 now = tk_xtime(tk);
1556 } while (read_seqcount_retry(&timekeeper_seq, seq)); 1642 } while (read_seqcount_retry(&tk_core.seq, seq));
1557 1643
1558 return now; 1644 return timespec64_to_timespec(now);
1559} 1645}
1560EXPORT_SYMBOL(current_kernel_time); 1646EXPORT_SYMBOL(current_kernel_time);
1561 1647
1562struct timespec get_monotonic_coarse(void) 1648struct timespec get_monotonic_coarse(void)
1563{ 1649{
1564 struct timekeeper *tk = &timekeeper; 1650 struct timekeeper *tk = &tk_core.timekeeper;
1565 struct timespec now, mono; 1651 struct timespec64 now, mono;
1566 unsigned long seq; 1652 unsigned long seq;
1567 1653
1568 do { 1654 do {
1569 seq = read_seqcount_begin(&timekeeper_seq); 1655 seq = read_seqcount_begin(&tk_core.seq);
1570 1656
1571 now = tk_xtime(tk); 1657 now = tk_xtime(tk);
1572 mono = tk->wall_to_monotonic; 1658 mono = tk->wall_to_monotonic;
1573 } while (read_seqcount_retry(&timekeeper_seq, seq)); 1659 } while (read_seqcount_retry(&tk_core.seq, seq));
1574 1660
1575 set_normalized_timespec(&now, now.tv_sec + mono.tv_sec, 1661 set_normalized_timespec64(&now, now.tv_sec + mono.tv_sec,
1576 now.tv_nsec + mono.tv_nsec); 1662 now.tv_nsec + mono.tv_nsec);
1577 return now; 1663
1664 return timespec64_to_timespec(now);
1578} 1665}
1579 1666
1580/* 1667/*
@@ -1587,29 +1674,38 @@ void do_timer(unsigned long ticks)
1587} 1674}
1588 1675
1589/** 1676/**
1590 * get_xtime_and_monotonic_and_sleep_offset() - get xtime, wall_to_monotonic, 1677 * ktime_get_update_offsets_tick - hrtimer helper
1591 * and sleep offsets. 1678 * @offs_real: pointer to storage for monotonic -> realtime offset
1592 * @xtim: pointer to timespec to be set with xtime 1679 * @offs_boot: pointer to storage for monotonic -> boottime offset
1593 * @wtom: pointer to timespec to be set with wall_to_monotonic 1680 * @offs_tai: pointer to storage for monotonic -> clock tai offset
1594 * @sleep: pointer to timespec to be set with time in suspend 1681 *
1682 * Returns monotonic time at last tick and various offsets
1595 */ 1683 */
1596void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, 1684ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real, ktime_t *offs_boot,
1597 struct timespec *wtom, struct timespec *sleep) 1685 ktime_t *offs_tai)
1598{ 1686{
1599 struct timekeeper *tk = &timekeeper; 1687 struct timekeeper *tk = &tk_core.timekeeper;
1600 unsigned long seq; 1688 unsigned int seq;
1689 ktime_t base;
1690 u64 nsecs;
1601 1691
1602 do { 1692 do {
1603 seq = read_seqcount_begin(&timekeeper_seq); 1693 seq = read_seqcount_begin(&tk_core.seq);
1604 *xtim = tk_xtime(tk); 1694
1605 *wtom = tk->wall_to_monotonic; 1695 base = tk->tkr.base_mono;
1606 *sleep = tk->total_sleep_time; 1696 nsecs = tk->tkr.xtime_nsec >> tk->tkr.shift;
1607 } while (read_seqcount_retry(&timekeeper_seq, seq)); 1697
1698 *offs_real = tk->offs_real;
1699 *offs_boot = tk->offs_boot;
1700 *offs_tai = tk->offs_tai;
1701 } while (read_seqcount_retry(&tk_core.seq, seq));
1702
1703 return ktime_add_ns(base, nsecs);
1608} 1704}
1609 1705
1610#ifdef CONFIG_HIGH_RES_TIMERS 1706#ifdef CONFIG_HIGH_RES_TIMERS
1611/** 1707/**
1612 * ktime_get_update_offsets - hrtimer helper 1708 * ktime_get_update_offsets_now - hrtimer helper
1613 * @offs_real: pointer to storage for monotonic -> realtime offset 1709 * @offs_real: pointer to storage for monotonic -> realtime offset
1614 * @offs_boot: pointer to storage for monotonic -> boottime offset 1710 * @offs_boot: pointer to storage for monotonic -> boottime offset
1615 * @offs_tai: pointer to storage for monotonic -> clock tai offset 1711 * @offs_tai: pointer to storage for monotonic -> clock tai offset
@@ -1617,57 +1713,37 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
1617 * Returns current monotonic time and updates the offsets 1713 * Returns current monotonic time and updates the offsets
1618 * Called from hrtimer_interrupt() or retrigger_next_event() 1714 * Called from hrtimer_interrupt() or retrigger_next_event()
1619 */ 1715 */
1620ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot, 1716ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, ktime_t *offs_boot,
1621 ktime_t *offs_tai) 1717 ktime_t *offs_tai)
1622{ 1718{
1623 struct timekeeper *tk = &timekeeper; 1719 struct timekeeper *tk = &tk_core.timekeeper;
1624 ktime_t now;
1625 unsigned int seq; 1720 unsigned int seq;
1626 u64 secs, nsecs; 1721 ktime_t base;
1722 u64 nsecs;
1627 1723
1628 do { 1724 do {
1629 seq = read_seqcount_begin(&timekeeper_seq); 1725 seq = read_seqcount_begin(&tk_core.seq);
1630 1726
1631 secs = tk->xtime_sec; 1727 base = tk->tkr.base_mono;
1632 nsecs = timekeeping_get_ns(tk); 1728 nsecs = timekeeping_get_ns(&tk->tkr);
1633 1729
1634 *offs_real = tk->offs_real; 1730 *offs_real = tk->offs_real;
1635 *offs_boot = tk->offs_boot; 1731 *offs_boot = tk->offs_boot;
1636 *offs_tai = tk->offs_tai; 1732 *offs_tai = tk->offs_tai;
1637 } while (read_seqcount_retry(&timekeeper_seq, seq)); 1733 } while (read_seqcount_retry(&tk_core.seq, seq));
1638 1734
1639 now = ktime_add_ns(ktime_set(secs, 0), nsecs); 1735 return ktime_add_ns(base, nsecs);
1640 now = ktime_sub(now, *offs_real);
1641 return now;
1642} 1736}
1643#endif 1737#endif
1644 1738
1645/** 1739/**
1646 * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format
1647 */
1648ktime_t ktime_get_monotonic_offset(void)
1649{
1650 struct timekeeper *tk = &timekeeper;
1651 unsigned long seq;
1652 struct timespec wtom;
1653
1654 do {
1655 seq = read_seqcount_begin(&timekeeper_seq);
1656 wtom = tk->wall_to_monotonic;
1657 } while (read_seqcount_retry(&timekeeper_seq, seq));
1658
1659 return timespec_to_ktime(wtom);
1660}
1661EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset);
1662
1663/**
1664 * do_adjtimex() - Accessor function to NTP __do_adjtimex function 1740 * do_adjtimex() - Accessor function to NTP __do_adjtimex function
1665 */ 1741 */
1666int do_adjtimex(struct timex *txc) 1742int do_adjtimex(struct timex *txc)
1667{ 1743{
1668 struct timekeeper *tk = &timekeeper; 1744 struct timekeeper *tk = &tk_core.timekeeper;
1669 unsigned long flags; 1745 unsigned long flags;
1670 struct timespec ts; 1746 struct timespec64 ts;
1671 s32 orig_tai, tai; 1747 s32 orig_tai, tai;
1672 int ret; 1748 int ret;
1673 1749
@@ -1687,10 +1763,10 @@ int do_adjtimex(struct timex *txc)
1687 return ret; 1763 return ret;
1688 } 1764 }
1689 1765
1690 getnstimeofday(&ts); 1766 getnstimeofday64(&ts);
1691 1767
1692 raw_spin_lock_irqsave(&timekeeper_lock, flags); 1768 raw_spin_lock_irqsave(&timekeeper_lock, flags);
1693 write_seqcount_begin(&timekeeper_seq); 1769 write_seqcount_begin(&tk_core.seq);
1694 1770
1695 orig_tai = tai = tk->tai_offset; 1771 orig_tai = tai = tk->tai_offset;
1696 ret = __do_adjtimex(txc, &ts, &tai); 1772 ret = __do_adjtimex(txc, &ts, &tai);
@@ -1699,7 +1775,7 @@ int do_adjtimex(struct timex *txc)
1699 __timekeeping_set_tai_offset(tk, tai); 1775 __timekeeping_set_tai_offset(tk, tai);
1700 timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); 1776 timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
1701 } 1777 }
1702 write_seqcount_end(&timekeeper_seq); 1778 write_seqcount_end(&tk_core.seq);
1703 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 1779 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
1704 1780
1705 if (tai != orig_tai) 1781 if (tai != orig_tai)
@@ -1719,11 +1795,11 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
1719 unsigned long flags; 1795 unsigned long flags;
1720 1796
1721 raw_spin_lock_irqsave(&timekeeper_lock, flags); 1797 raw_spin_lock_irqsave(&timekeeper_lock, flags);
1722 write_seqcount_begin(&timekeeper_seq); 1798 write_seqcount_begin(&tk_core.seq);
1723 1799
1724 __hardpps(phase_ts, raw_ts); 1800 __hardpps(phase_ts, raw_ts);
1725 1801
1726 write_seqcount_end(&timekeeper_seq); 1802 write_seqcount_end(&tk_core.seq);
1727 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 1803 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
1728} 1804}
1729EXPORT_SYMBOL(hardpps); 1805EXPORT_SYMBOL(hardpps);
diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h
new file mode 100644
index 000000000000..adc1fc98bde3
--- /dev/null
+++ b/kernel/time/timekeeping.h
@@ -0,0 +1,20 @@
1#ifndef _KERNEL_TIME_TIMEKEEPING_H
2#define _KERNEL_TIME_TIMEKEEPING_H
3/*
4 * Internal interfaces for kernel/time/
5 */
6extern ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real,
7 ktime_t *offs_boot,
8 ktime_t *offs_tai);
9extern ktime_t ktime_get_update_offsets_now(ktime_t *offs_real,
10 ktime_t *offs_boot,
11 ktime_t *offs_tai);
12
13extern int timekeeping_valid_for_hres(void);
14extern u64 timekeeping_max_deferment(void);
15extern int timekeeping_inject_offset(struct timespec *ts);
16extern s32 timekeeping_get_tai_offset(void);
17extern void timekeeping_set_tai_offset(s32 tai_offset);
18extern void timekeeping_clocktai(struct timespec *ts);
19
20#endif
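These prototypes are intended for in-tree consumers such as the hrtimer and tick code. The sketch below shows one plausible way ktime_get_update_offsets_now() could be consumed; the surrounding function and variable names are invented for illustration, not taken from this patch.

/* Illustrative consumer: fetch monotonic "now" plus the offsets needed
 * to derive CLOCK_REALTIME / CLOCK_BOOTTIME / CLOCK_TAI expiry times. */
static void clock_event_sketch(void)
{
	ktime_t offs_real, offs_boot, offs_tai;
	ktime_t now_mono, now_real, now_boot;

	now_mono = ktime_get_update_offsets_now(&offs_real, &offs_boot, &offs_tai);
	now_real = ktime_add(now_mono, offs_real);	/* wall clock */
	now_boot = ktime_add(now_mono, offs_boot);	/* boot-based clock */

	/* ... expire MONOTONIC timers against now_mono, REALTIME timers
	 * against now_real, and so on ... */
}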
diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c
index 4d54f97558df..f6bd65236712 100644
--- a/kernel/time/timekeeping_debug.c
+++ b/kernel/time/timekeeping_debug.c
@@ -67,7 +67,7 @@ static int __init tk_debug_sleep_time_init(void)
67} 67}
68late_initcall(tk_debug_sleep_time_init); 68late_initcall(tk_debug_sleep_time_init);
69 69
70void tk_debug_account_sleep_time(struct timespec *t) 70void tk_debug_account_sleep_time(struct timespec64 *t)
71{ 71{
72 sleep_time_bin[fls(t->tv_sec)]++; 72 sleep_time_bin[fls(t->tv_sec)]++;
73} 73}
diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h
index 13323ea08ffa..4ea005a7f9da 100644
--- a/kernel/time/timekeeping_internal.h
+++ b/kernel/time/timekeeping_internal.h
@@ -3,12 +3,27 @@
3/* 3/*
4 * timekeeping debug functions 4 * timekeeping debug functions
5 */ 5 */
6#include <linux/clocksource.h>
6#include <linux/time.h> 7#include <linux/time.h>
7 8
8#ifdef CONFIG_DEBUG_FS 9#ifdef CONFIG_DEBUG_FS
9extern void tk_debug_account_sleep_time(struct timespec *t); 10extern void tk_debug_account_sleep_time(struct timespec64 *t);
10#else 11#else
11#define tk_debug_account_sleep_time(x) 12#define tk_debug_account_sleep_time(x)
12#endif 13#endif
13 14
15#ifdef CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE
16static inline cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask)
17{
18 cycle_t ret = (now - last) & mask;
19
20 return (s64) ret > 0 ? ret : 0;
21}
22#else
23static inline cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask)
24{
25 return (now - last) & mask;
26}
27#endif
28
14#endif /* _TIMEKEEPING_INTERNAL_H */ 29#endif /* _TIMEKEEPING_INTERNAL_H */
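The new clocksource_delta() helper masks the raw counter difference and, when CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE is set, clamps an apparently backward-moving counter to zero instead of letting the subtraction wrap into a huge forward jump. A stand-alone user-space sketch of the same arithmetic, with cycle_t replaced by uint64_t and a full 64-bit mask (as a TSC-like clocksource would use):

#include <stdint.h>
#include <stdio.h>

/* Same arithmetic as the validating clocksource_delta() above. */
static uint64_t demo_delta(uint64_t now, uint64_t last, uint64_t mask)
{
	uint64_t ret = (now - last) & mask;

	/* With a full-width mask a backward step sets the top bit,
	 * so treat it as "no time elapsed" rather than ~2^64 cycles. */
	return (int64_t)ret > 0 ? ret : 0;
}

int main(void)
{
	uint64_t mask = ~0ULL;	/* 64-bit counter, e.g. TSC */

	printf("%llu\n", (unsigned long long)demo_delta(1010, 1000, mask));	/* 10 */
	printf("%llu\n", (unsigned long long)demo_delta(1000, 1010, mask));	/* 0  */
	return 0;
}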
diff --git a/kernel/timer.c b/kernel/time/timer.c
index 3bb01a323b2a..aca5dfe2fa3d 100644
--- a/kernel/timer.c
+++ b/kernel/time/timer.c
@@ -82,6 +82,7 @@ struct tvec_base {
82 unsigned long next_timer; 82 unsigned long next_timer;
83 unsigned long active_timers; 83 unsigned long active_timers;
84 unsigned long all_timers; 84 unsigned long all_timers;
85 int cpu;
85 struct tvec_root tv1; 86 struct tvec_root tv1;
86 struct tvec tv2; 87 struct tvec tv2;
87 struct tvec tv3; 88 struct tvec tv3;
@@ -409,6 +410,22 @@ static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
409 base->next_timer = timer->expires; 410 base->next_timer = timer->expires;
410 } 411 }
411 base->all_timers++; 412 base->all_timers++;
413
414 /*
415 * Check whether the other CPU is in dynticks mode and needs
416 * to be triggered to reevaluate the timer wheel.
417 * We are protected against the other CPU fiddling
418 * with the timer by holding the timer base lock. This also
419 * makes sure that a CPU on the way to stop its tick can not
420 * evaluate the timer wheel.
421 *
422 * Spare the IPI for deferrable timers on idle targets though.
423 * The next busy ticks will take care of it. Except full dynticks
424 * require special care against races with idle_cpu(), lets deal
425 * with that later.
426 */
427 if (!tbase_get_deferrable(base) || tick_nohz_full_cpu(base->cpu))
428 wake_up_nohz_cpu(base->cpu);
412} 429}
413 430
414#ifdef CONFIG_TIMER_STATS 431#ifdef CONFIG_TIMER_STATS
@@ -948,22 +965,6 @@ void add_timer_on(struct timer_list *timer, int cpu)
948 timer_set_base(timer, base); 965 timer_set_base(timer, base);
949 debug_activate(timer, timer->expires); 966 debug_activate(timer, timer->expires);
950 internal_add_timer(base, timer); 967 internal_add_timer(base, timer);
951 /*
952 * Check whether the other CPU is in dynticks mode and needs
953 * to be triggered to reevaluate the timer wheel.
954 * We are protected against the other CPU fiddling
955 * with the timer by holding the timer base lock. This also
956 * makes sure that a CPU on the way to stop its tick can not
957 * evaluate the timer wheel.
958 *
959 * Spare the IPI for deferrable timers on idle targets though.
960 * The next busy ticks will take care of it. Except full dynticks
961 * require special care against races with idle_cpu(), lets deal
962 * with that later.
963 */
964 if (!tbase_get_deferrable(timer->base) || tick_nohz_full_cpu(cpu))
965 wake_up_nohz_cpu(cpu);
966
967 spin_unlock_irqrestore(&base->lock, flags); 968 spin_unlock_irqrestore(&base->lock, flags);
968} 969}
969EXPORT_SYMBOL_GPL(add_timer_on); 970EXPORT_SYMBOL_GPL(add_timer_on);
@@ -1568,6 +1569,7 @@ static int init_timers_cpu(int cpu)
1568 } 1569 }
1569 spin_lock_init(&base->lock); 1570 spin_lock_init(&base->lock);
1570 tvec_base_done[cpu] = 1; 1571 tvec_base_done[cpu] = 1;
1572 base->cpu = cpu;
1571 } else { 1573 } else {
1572 base = per_cpu(tvec_bases, cpu); 1574 base = per_cpu(tvec_bases, cpu);
1573 } 1575 }
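With base->cpu recorded at init time, the dynticks wake-up moves from add_timer_on() into internal_add_timer(), so every enqueue path, not only add_timer_on(), can kick an idle nohz target. A throwaway test module in the spirit of the change is sketched below; it assumes CPU 1 exists and is online, and uses the 3.x-era timer API from this file.

#include <linux/module.h>
#include <linux/timer.h>
#include <linux/smp.h>

static struct timer_list demo_timer;

static void demo_timer_fn(unsigned long data)
{
	pr_info("demo timer fired on CPU %d\n", smp_processor_id());
}

static int __init demo_init(void)
{
	setup_timer(&demo_timer, demo_timer_fn, 0);
	demo_timer.expires = jiffies + HZ;
	/* If CPU 1 sits idle with its tick stopped, the wake-up IPI is
	 * now issued from internal_add_timer() common code. */
	add_timer_on(&demo_timer, 1);
	return 0;
}

static void __exit demo_exit(void)
{
	del_timer_sync(&demo_timer);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");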
diff --git a/kernel/time/udelay_test.c b/kernel/time/udelay_test.c
new file mode 100644
index 000000000000..e622ba365a13
--- /dev/null
+++ b/kernel/time/udelay_test.c
@@ -0,0 +1,168 @@
1/*
2 * udelay() test kernel module
3 *
4 * Test is executed by writing and reading to /sys/kernel/debug/udelay_test
5 * Tests are configured by writing: USECS ITERATIONS
6 * Tests are executed by reading from the same file.
7 * Specifying usecs of 0 or negative values will run multiple tests.
8 *
9 * Copyright (C) 2014 Google, Inc.
10 *
11 * This software is licensed under the terms of the GNU General Public
12 * License version 2, as published by the Free Software Foundation, and
13 * may be copied, distributed, and modified under those terms.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 */
20
21#include <linux/debugfs.h>
22#include <linux/delay.h>
23#include <linux/ktime.h>
24#include <linux/module.h>
25#include <linux/uaccess.h>
26
27#define DEFAULT_ITERATIONS 100
28
29#define DEBUGFS_FILENAME "udelay_test"
30
31static DEFINE_MUTEX(udelay_test_lock);
32static struct dentry *udelay_test_debugfs_file;
33static int udelay_test_usecs;
34static int udelay_test_iterations = DEFAULT_ITERATIONS;
35
36static int udelay_test_single(struct seq_file *s, int usecs, uint32_t iters)
37{
38 int min = 0, max = 0, fail_count = 0;
39 uint64_t sum = 0;
40 uint64_t avg;
41 int i;
42 /* Allow udelay to be up to 0.5% fast */
43 int allowed_error_ns = usecs * 5;
44
45 for (i = 0; i < iters; ++i) {
46 struct timespec ts1, ts2;
47 int time_passed;
48
49 ktime_get_ts(&ts1);
50 udelay(usecs);
51 ktime_get_ts(&ts2);
52 time_passed = timespec_to_ns(&ts2) - timespec_to_ns(&ts1);
53
54 if (i == 0 || time_passed < min)
55 min = time_passed;
56 if (i == 0 || time_passed > max)
57 max = time_passed;
58 if ((time_passed + allowed_error_ns) / 1000 < usecs)
59 ++fail_count;
60 WARN_ON(time_passed < 0);
61 sum += time_passed;
62 }
63
64 avg = sum;
65 do_div(avg, iters);
66 seq_printf(s, "%d usecs x %d: exp=%d allowed=%d min=%d avg=%lld max=%d",
67 usecs, iters, usecs * 1000,
68 (usecs * 1000) - allowed_error_ns, min, avg, max);
69 if (fail_count)
70 seq_printf(s, " FAIL=%d", fail_count);
71 seq_puts(s, "\n");
72
73 return 0;
74}
75
76static int udelay_test_show(struct seq_file *s, void *v)
77{
78 int usecs;
79 int iters;
80 int ret = 0;
81
82 mutex_lock(&udelay_test_lock);
83 usecs = udelay_test_usecs;
84 iters = udelay_test_iterations;
85 mutex_unlock(&udelay_test_lock);
86
87 if (usecs > 0 && iters > 0) {
88 return udelay_test_single(s, usecs, iters);
89 } else if (usecs == 0) {
90 struct timespec ts;
91
92 ktime_get_ts(&ts);
93 seq_printf(s, "udelay() test (lpj=%ld kt=%ld.%09ld)\n",
94 loops_per_jiffy, ts.tv_sec, ts.tv_nsec);
95 seq_puts(s, "usage:\n");
96 seq_puts(s, "echo USECS [ITERS] > " DEBUGFS_FILENAME "\n");
97 seq_puts(s, "cat " DEBUGFS_FILENAME "\n");
98 }
99
100 return ret;
101}
102
103static int udelay_test_open(struct inode *inode, struct file *file)
104{
105 return single_open(file, udelay_test_show, inode->i_private);
106}
107
108static ssize_t udelay_test_write(struct file *file, const char __user *buf,
109 size_t count, loff_t *pos)
110{
111 char lbuf[32];
112 int ret;
113 int usecs;
114 int iters;
115
116 if (count >= sizeof(lbuf))
117 return -EINVAL;
118
119 if (copy_from_user(lbuf, buf, count))
120 return -EFAULT;
121 lbuf[count] = '\0';
122
123 ret = sscanf(lbuf, "%d %d", &usecs, &iters);
124 if (ret < 1)
125 return -EINVAL;
126 else if (ret < 2)
127 iters = DEFAULT_ITERATIONS;
128
129 mutex_lock(&udelay_test_lock);
130 udelay_test_usecs = usecs;
131 udelay_test_iterations = iters;
132 mutex_unlock(&udelay_test_lock);
133
134 return count;
135}
136
137static const struct file_operations udelay_test_debugfs_ops = {
138 .owner = THIS_MODULE,
139 .open = udelay_test_open,
140 .read = seq_read,
141 .write = udelay_test_write,
142 .llseek = seq_lseek,
143 .release = single_release,
144};
145
146static int __init udelay_test_init(void)
147{
148 mutex_lock(&udelay_test_lock);
149 udelay_test_debugfs_file = debugfs_create_file(DEBUGFS_FILENAME,
150 S_IRUSR, NULL, NULL, &udelay_test_debugfs_ops);
151 mutex_unlock(&udelay_test_lock);
152
153 return 0;
154}
155
156module_init(udelay_test_init);
157
158static void __exit udelay_test_exit(void)
159{
160 mutex_lock(&udelay_test_lock);
161 debugfs_remove(udelay_test_debugfs_file);
162 mutex_unlock(&udelay_test_lock);
163}
164
165module_exit(udelay_test_exit);
166
167MODULE_AUTHOR("David Riley <davidriley@chromium.org>");
168MODULE_LICENSE("GPL");
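The interface documented in the header comment is plain debugfs I/O: write "USECS ITERATIONS", then read the same file to run the test. A small user-space driver might look like the sketch below; it assumes the module is loaded (or built in), debugfs is mounted at /sys/kernel/debug, and it is run as root.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/sys/kernel/debug/udelay_test";
	char result[4096];
	ssize_t n;
	int fd;

	fd = open(path, O_WRONLY);
	if (fd < 0) { perror("open for write"); return 1; }
	if (write(fd, "100 1000", 8) < 0) {	/* 100 us delays, 1000 iterations */
		perror("write");
		return 1;
	}
	close(fd);

	fd = open(path, O_RDONLY);
	if (fd < 0) { perror("open for read"); return 1; }
	n = read(fd, result, sizeof(result) - 1);
	if (n < 0) { perror("read"); return 1; }
	result[n] = '\0';
	fputs(result, stdout);	/* "100 usecs x 1000: exp=... min=... avg=... max=..." */
	close(fd);
	return 0;
}

Writing a usecs value of 0 and then reading prints the usage text instead of running a test, per udelay_test_show() above.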
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 925f629658d6..afb04b9b818a 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1968,7 +1968,7 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
1968 1968
1969/** 1969/**
1970 * rb_update_event - update event type and data 1970 * rb_update_event - update event type and data
1971 * @event: the even to update 1971 * @event: the event to update
1972 * @type: the type of event 1972 * @type: the type of event
1973 * @length: the size of the event field in the ring buffer 1973 * @length: the size of the event field in the ring buffer
1974 * 1974 *
@@ -3341,21 +3341,16 @@ static void rb_iter_reset(struct ring_buffer_iter *iter)
3341 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; 3341 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
3342 3342
3343 /* Iterator usage is expected to have record disabled */ 3343 /* Iterator usage is expected to have record disabled */
3344 if (list_empty(&cpu_buffer->reader_page->list)) { 3344 iter->head_page = cpu_buffer->reader_page;
3345 iter->head_page = rb_set_head_page(cpu_buffer); 3345 iter->head = cpu_buffer->reader_page->read;
3346 if (unlikely(!iter->head_page)) 3346
3347 return; 3347 iter->cache_reader_page = iter->head_page;
3348 iter->head = iter->head_page->read; 3348 iter->cache_read = iter->head;
3349 } else { 3349
3350 iter->head_page = cpu_buffer->reader_page;
3351 iter->head = cpu_buffer->reader_page->read;
3352 }
3353 if (iter->head) 3350 if (iter->head)
3354 iter->read_stamp = cpu_buffer->read_stamp; 3351 iter->read_stamp = cpu_buffer->read_stamp;
3355 else 3352 else
3356 iter->read_stamp = iter->head_page->page->time_stamp; 3353 iter->read_stamp = iter->head_page->page->time_stamp;
3357 iter->cache_reader_page = cpu_buffer->reader_page;
3358 iter->cache_read = cpu_buffer->read;
3359} 3354}
3360 3355
3361/** 3356/**
@@ -3748,12 +3743,14 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
3748 return NULL; 3743 return NULL;
3749 3744
3750 /* 3745 /*
3751 * We repeat when a time extend is encountered. 3746 * We repeat when a time extend is encountered or we hit
3752 * Since the time extend is always attached to a data event, 3747 * the end of the page. Since the time extend is always attached
3753 * we should never loop more than once. 3748 * to a data event, we should never loop more than three times.
3754 * (We never hit the following condition more than twice). 3749 * Once for going to next page, once on time extend, and
3750 * finally once to get the event.
3751 * (We never hit the following condition more than thrice).
3755 */ 3752 */
3756 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2)) 3753 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 3))
3757 return NULL; 3754 return NULL;
3758 3755
3759 if (rb_per_cpu_empty(cpu_buffer)) 3756 if (rb_per_cpu_empty(cpu_buffer))
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8bb80fe08767..8a528392b1f4 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -820,11 +820,12 @@ static struct {
820 const char *name; 820 const char *name;
821 int in_ns; /* is this clock in nanoseconds? */ 821 int in_ns; /* is this clock in nanoseconds? */
822} trace_clocks[] = { 822} trace_clocks[] = {
823 { trace_clock_local, "local", 1 }, 823 { trace_clock_local, "local", 1 },
824 { trace_clock_global, "global", 1 }, 824 { trace_clock_global, "global", 1 },
825 { trace_clock_counter, "counter", 0 }, 825 { trace_clock_counter, "counter", 0 },
826 { trace_clock_jiffies, "uptime", 0 }, 826 { trace_clock_jiffies, "uptime", 0 },
827 { trace_clock, "perf", 1 }, 827 { trace_clock, "perf", 1 },
828 { ktime_get_mono_fast_ns, "mono", 1 },
828 ARCH_TRACE_CLOCKS 829 ARCH_TRACE_CLOCKS
829}; 830};
830 831
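The new table entry exposes ktime_get_mono_fast_ns() to the tracing core as a clock named "mono", selectable like any other trace_clocks[] entry through the trace_clock file. A minimal user-space selector, assuming debugfs is mounted at /sys/kernel/debug and the program is run as root:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/sys/kernel/debug/tracing/trace_clock", O_WRONLY);

	if (fd < 0) {
		perror("open trace_clock");
		return 1;
	}
	if (write(fd, "mono", 4) < 0) {		/* select the new clock */
		perror("write");
		close(fd);
		return 1;
	}
	close(fd);
	return 0;
}

Reading trace_clock back lists the available clocks with the active one in brackets.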
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index a1dd9a1b1327..975cb49e32bf 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -31,20 +31,19 @@ void bacct_add_tsk(struct user_namespace *user_ns,
31 struct taskstats *stats, struct task_struct *tsk) 31 struct taskstats *stats, struct task_struct *tsk)
32{ 32{
33 const struct cred *tcred; 33 const struct cred *tcred;
34 struct timespec uptime, ts;
35 cputime_t utime, stime, utimescaled, stimescaled; 34 cputime_t utime, stime, utimescaled, stimescaled;
36 u64 ac_etime; 35 u64 delta;
37 36
38 BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN); 37 BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN);
39 38
40 /* calculate task elapsed time in timespec */ 39 /* calculate task elapsed time in nsec */
41 do_posix_clock_monotonic_gettime(&uptime); 40 delta = ktime_get_ns() - tsk->start_time;
42 ts = timespec_sub(uptime, tsk->start_time); 41 /* Convert to micro seconds */
43 /* rebase elapsed time to usec (should never be negative) */ 42 do_div(delta, NSEC_PER_USEC);
44 ac_etime = timespec_to_ns(&ts); 43 stats->ac_etime = delta;
45 do_div(ac_etime, NSEC_PER_USEC); 44 /* Convert to seconds for btime */
46 stats->ac_etime = ac_etime; 45 do_div(delta, USEC_PER_SEC);
47 stats->ac_btime = get_seconds() - ts.tv_sec; 46 stats->ac_btime = get_seconds() - delta;
48 if (thread_group_leader(tsk)) { 47 if (thread_group_leader(tsk)) {
49 stats->ac_exitcode = tsk->exit_code; 48 stats->ac_exitcode = tsk->exit_code;
50 if (tsk->flags & PF_FORKNOEXEC) 49 if (tsk->flags & PF_FORKNOEXEC)
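The rewritten bacct_add_tsk() hunk derives everything from a single nanosecond delta: ktime_get_ns() minus tsk->start_time, scaled once to microseconds for ac_etime and once more to seconds for the ac_btime offset. The user-space sketch below redoes that arithmetic with made-up input values, using plain 64-bit division where the kernel uses do_div():

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t now_ns   = 7200123456789ULL;	/* hypothetical ktime_get_ns() */
	uint64_t start_ns = 5400000000000ULL;	/* hypothetical tsk->start_time */
	uint64_t delta = now_ns - start_ns;

	delta /= 1000;		/* NSEC_PER_USEC: do_div(delta, NSEC_PER_USEC) */
	printf("ac_etime = %llu us\n", (unsigned long long)delta);

	delta /= 1000000;	/* USEC_PER_SEC: do_div(delta, USEC_PER_SEC) */
	printf("elapsed  = %llu s (subtracted from get_seconds() for ac_btime)\n",
	       (unsigned long long)delta);
	return 0;
}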
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index fcc02560fd6b..aa312b0dc3ec 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -526,21 +526,21 @@ static void m_stop(struct seq_file *seq, void *v)
526 return; 526 return;
527} 527}
528 528
529struct seq_operations proc_uid_seq_operations = { 529const struct seq_operations proc_uid_seq_operations = {
530 .start = uid_m_start, 530 .start = uid_m_start,
531 .stop = m_stop, 531 .stop = m_stop,
532 .next = m_next, 532 .next = m_next,
533 .show = uid_m_show, 533 .show = uid_m_show,
534}; 534};
535 535
536struct seq_operations proc_gid_seq_operations = { 536const struct seq_operations proc_gid_seq_operations = {
537 .start = gid_m_start, 537 .start = gid_m_start,
538 .stop = m_stop, 538 .stop = m_stop,
539 .next = m_next, 539 .next = m_next,
540 .show = gid_m_show, 540 .show = gid_m_show,
541}; 541};
542 542
543struct seq_operations proc_projid_seq_operations = { 543const struct seq_operations proc_projid_seq_operations = {
544 .start = projid_m_start, 544 .start = projid_m_start,
545 .stop = m_stop, 545 .stop = m_stop,
546 .next = m_next, 546 .next = m_next,
diff --git a/kernel/utsname.c b/kernel/utsname.c
index fd393124e507..883aaaa7de8a 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -93,13 +93,13 @@ static void *utsns_get(struct task_struct *task)
93 struct uts_namespace *ns = NULL; 93 struct uts_namespace *ns = NULL;
94 struct nsproxy *nsproxy; 94 struct nsproxy *nsproxy;
95 95
96 rcu_read_lock(); 96 task_lock(task);
97 nsproxy = task_nsproxy(task); 97 nsproxy = task->nsproxy;
98 if (nsproxy) { 98 if (nsproxy) {
99 ns = nsproxy->uts_ns; 99 ns = nsproxy->uts_ns;
100 get_uts_ns(ns); 100 get_uts_ns(ns);
101 } 101 }
102 rcu_read_unlock(); 102 task_unlock(task);
103 103
104 return ns; 104 return ns;
105} 105}
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index c3319bd1b040..a8d6914030fe 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -260,9 +260,11 @@ static void watchdog_overflow_callback(struct perf_event *event,
260 return; 260 return;
261 261
262 if (hardlockup_panic) 262 if (hardlockup_panic)
263 panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu); 263 panic("Watchdog detected hard LOCKUP on cpu %d",
264 this_cpu);
264 else 265 else
265 WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu); 266 WARN(1, "Watchdog detected hard LOCKUP on cpu %d",
267 this_cpu);
266 268
267 __this_cpu_write(hard_watchdog_warn, true); 269 __this_cpu_write(hard_watchdog_warn, true);
268 return; 270 return;
@@ -345,7 +347,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
345 } 347 }
346 } 348 }
347 349
348 printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", 350 pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
349 smp_processor_id(), duration, 351 smp_processor_id(), duration,
350 current->comm, task_pid_nr(current)); 352 current->comm, task_pid_nr(current));
351 print_modules(); 353 print_modules();
@@ -366,6 +368,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
366 smp_mb__after_atomic(); 368 smp_mb__after_atomic();
367 } 369 }
368 370
371 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
369 if (softlockup_panic) 372 if (softlockup_panic)
370 panic("softlockup: hung tasks"); 373 panic("softlockup: hung tasks");
371 __this_cpu_write(soft_watchdog_warn, true); 374 __this_cpu_write(soft_watchdog_warn, true);
@@ -484,7 +487,7 @@ static int watchdog_nmi_enable(unsigned int cpu)
484 if (PTR_ERR(event) == -EOPNOTSUPP) 487 if (PTR_ERR(event) == -EOPNOTSUPP)
485 pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu); 488 pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu);
486 else if (PTR_ERR(event) == -ENOENT) 489 else if (PTR_ERR(event) == -ENOENT)
487 pr_warning("disabled (cpu%i): hardware events not enabled\n", 490 pr_warn("disabled (cpu%i): hardware events not enabled\n",
488 cpu); 491 cpu);
489 else 492 else
490 pr_err("disabled (cpu%i): unable to create perf event: %ld\n", 493 pr_err("disabled (cpu%i): unable to create perf event: %ld\n",