author     Ingo Molnar <mingo@kernel.org>    2014-08-24 16:32:24 -0400
committer  Ingo Molnar <mingo@kernel.org>    2014-08-24 16:32:24 -0400
commit     83bc90e11576f9c100f8ef4ba2bcd0b89212e3fb (patch)
tree       e59186b4d315c80255851e0d204143ecc21399a0 /kernel
parent     e21ded5ecc531a64d6fc0c1693285e890b4e9569 (diff)
parent     451fd72219dd6f3355e2d036c598544c760ee532 (diff)
Merge branch 'linus' into perf/core, to fix conflicts
Conflicts:
arch/x86/kernel/cpu/perf_event_intel_uncore*.c
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel')
67 files changed, 4265 insertions, 1717 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index f2a8b6246ce9..dc5c77544fd6 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -3,12 +3,11 @@ | |||
3 | # | 3 | # |
4 | 4 | ||
5 | obj-y = fork.o exec_domain.o panic.o \ | 5 | obj-y = fork.o exec_domain.o panic.o \ |
6 | cpu.o exit.o itimer.o time.o softirq.o resource.o \ | 6 | cpu.o exit.o softirq.o resource.o \ |
7 | sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ | 7 | sysctl.o sysctl_binary.o capability.o ptrace.o user.o \ |
8 | signal.o sys.o kmod.o workqueue.o pid.o task_work.o \ | 8 | signal.o sys.o kmod.o workqueue.o pid.o task_work.o \ |
9 | extable.o params.o posix-timers.o \ | 9 | extable.o params.o \ |
10 | kthread.o sys_ni.o posix-cpu-timers.o \ | 10 | kthread.o sys_ni.o nsproxy.o \ |
11 | hrtimer.o nsproxy.o \ | ||
12 | notifier.o ksysfs.o cred.o reboot.o \ | 11 | notifier.o ksysfs.o cred.o reboot.o \ |
13 | async.o range.o groups.o smpboot.o | 12 | async.o range.o groups.o smpboot.o |
14 | 13 | ||
@@ -87,6 +86,7 @@ obj-$(CONFIG_RING_BUFFER) += trace/ | |||
87 | obj-$(CONFIG_TRACEPOINTS) += trace/ | 86 | obj-$(CONFIG_TRACEPOINTS) += trace/ |
88 | obj-$(CONFIG_IRQ_WORK) += irq_work.o | 87 | obj-$(CONFIG_IRQ_WORK) += irq_work.o |
89 | obj-$(CONFIG_CPU_PM) += cpu_pm.o | 88 | obj-$(CONFIG_CPU_PM) += cpu_pm.o |
89 | obj-$(CONFIG_NET) += bpf/ | ||
90 | 90 | ||
91 | obj-$(CONFIG_PERF_EVENTS) += events/ | 91 | obj-$(CONFIG_PERF_EVENTS) += events/ |
92 | 92 | ||
@@ -105,27 +105,11 @@ targets += config_data.gz | |||
105 | $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE | 105 | $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE |
106 | $(call if_changed,gzip) | 106 | $(call if_changed,gzip) |
107 | 107 | ||
108 | filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") | 108 | filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/basic/bin2c; echo "MAGIC_END;") |
109 | targets += config_data.h | 109 | targets += config_data.h |
110 | $(obj)/config_data.h: $(obj)/config_data.gz FORCE | 110 | $(obj)/config_data.h: $(obj)/config_data.gz FORCE |
111 | $(call filechk,ikconfiggz) | 111 | $(call filechk,ikconfiggz) |
112 | 112 | ||
113 | $(obj)/time.o: $(obj)/timeconst.h | ||
114 | |||
115 | quiet_cmd_hzfile = HZFILE $@ | ||
116 | cmd_hzfile = echo "hz=$(CONFIG_HZ)" > $@ | ||
117 | |||
118 | targets += hz.bc | ||
119 | $(obj)/hz.bc: $(objtree)/include/config/hz.h FORCE | ||
120 | $(call if_changed,hzfile) | ||
121 | |||
122 | quiet_cmd_bc = BC $@ | ||
123 | cmd_bc = bc -q $(filter-out FORCE,$^) > $@ | ||
124 | |||
125 | targets += timeconst.h | ||
126 | $(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE | ||
127 | $(call if_changed,bc) | ||
128 | |||
129 | ############################################################################### | 113 | ############################################################################### |
130 | # | 114 | # |
131 | # Roll all the X.509 certificates that we can find together and pull them into | 115 | # Roll all the X.509 certificates that we can find together and pull them into |
diff --git a/kernel/acct.c b/kernel/acct.c
index 808a86ff229d..b4c667d22e79 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -59,6 +59,7 @@ | |||
59 | #include <asm/div64.h> | 59 | #include <asm/div64.h> |
60 | #include <linux/blkdev.h> /* sector_div */ | 60 | #include <linux/blkdev.h> /* sector_div */ |
61 | #include <linux/pid_namespace.h> | 61 | #include <linux/pid_namespace.h> |
62 | #include <linux/fs_pin.h> | ||
62 | 63 | ||
63 | /* | 64 | /* |
64 | * These constants control the amount of freespace that suspend and | 65 | * These constants control the amount of freespace that suspend and |
@@ -75,172 +76,190 @@ int acct_parm[3] = {4, 2, 30}; | |||
75 | /* | 76 | /* |
76 | * External references and all of the globals. | 77 | * External references and all of the globals. |
77 | */ | 78 | */ |
78 | static void do_acct_process(struct bsd_acct_struct *acct, | 79 | static void do_acct_process(struct bsd_acct_struct *acct); |
79 | struct pid_namespace *ns, struct file *); | ||
80 | 80 | ||
81 | /* | ||
82 | * This structure is used so that all the data protected by lock | ||
83 | * can be placed in the same cache line as the lock. This primes | ||
84 | * the cache line to have the data after getting the lock. | ||
85 | */ | ||
86 | struct bsd_acct_struct { | 81 | struct bsd_acct_struct { |
82 | struct fs_pin pin; | ||
83 | struct mutex lock; | ||
87 | int active; | 84 | int active; |
88 | unsigned long needcheck; | 85 | unsigned long needcheck; |
89 | struct file *file; | 86 | struct file *file; |
90 | struct pid_namespace *ns; | 87 | struct pid_namespace *ns; |
91 | struct list_head list; | 88 | struct work_struct work; |
89 | struct completion done; | ||
92 | }; | 90 | }; |
93 | 91 | ||
94 | static DEFINE_SPINLOCK(acct_lock); | ||
95 | static LIST_HEAD(acct_list); | ||
96 | |||
97 | /* | 92 | /* |
98 | * Check the amount of free space and suspend/resume accordingly. | 93 | * Check the amount of free space and suspend/resume accordingly. |
99 | */ | 94 | */ |
100 | static int check_free_space(struct bsd_acct_struct *acct, struct file *file) | 95 | static int check_free_space(struct bsd_acct_struct *acct) |
101 | { | 96 | { |
102 | struct kstatfs sbuf; | 97 | struct kstatfs sbuf; |
103 | int res; | 98 | |
104 | int act; | 99 | if (time_is_before_jiffies(acct->needcheck)) |
105 | u64 resume; | ||
106 | u64 suspend; | ||
107 | |||
108 | spin_lock(&acct_lock); | ||
109 | res = acct->active; | ||
110 | if (!file || time_is_before_jiffies(acct->needcheck)) | ||
111 | goto out; | 100 | goto out; |
112 | spin_unlock(&acct_lock); | ||
113 | 101 | ||
114 | /* May block */ | 102 | /* May block */ |
115 | if (vfs_statfs(&file->f_path, &sbuf)) | 103 | if (vfs_statfs(&acct->file->f_path, &sbuf)) |
116 | return res; | ||
117 | suspend = sbuf.f_blocks * SUSPEND; | ||
118 | resume = sbuf.f_blocks * RESUME; | ||
119 | |||
120 | do_div(suspend, 100); | ||
121 | do_div(resume, 100); | ||
122 | |||
123 | if (sbuf.f_bavail <= suspend) | ||
124 | act = -1; | ||
125 | else if (sbuf.f_bavail >= resume) | ||
126 | act = 1; | ||
127 | else | ||
128 | act = 0; | ||
129 | |||
130 | /* | ||
131 | * If some joker switched acct->file under us we'ld better be | ||
132 | * silent and _not_ touch anything. | ||
133 | */ | ||
134 | spin_lock(&acct_lock); | ||
135 | if (file != acct->file) { | ||
136 | if (act) | ||
137 | res = act > 0; | ||
138 | goto out; | 104 | goto out; |
139 | } | ||
140 | 105 | ||
141 | if (acct->active) { | 106 | if (acct->active) { |
142 | if (act < 0) { | 107 | u64 suspend = sbuf.f_blocks * SUSPEND; |
108 | do_div(suspend, 100); | ||
109 | if (sbuf.f_bavail <= suspend) { | ||
143 | acct->active = 0; | 110 | acct->active = 0; |
144 | printk(KERN_INFO "Process accounting paused\n"); | 111 | pr_info("Process accounting paused\n"); |
145 | } | 112 | } |
146 | } else { | 113 | } else { |
147 | if (act > 0) { | 114 | u64 resume = sbuf.f_blocks * RESUME; |
115 | do_div(resume, 100); | ||
116 | if (sbuf.f_bavail >= resume) { | ||
148 | acct->active = 1; | 117 | acct->active = 1; |
149 | printk(KERN_INFO "Process accounting resumed\n"); | 118 | pr_info("Process accounting resumed\n"); |
150 | } | 119 | } |
151 | } | 120 | } |
152 | 121 | ||
153 | acct->needcheck = jiffies + ACCT_TIMEOUT*HZ; | 122 | acct->needcheck = jiffies + ACCT_TIMEOUT*HZ; |
154 | res = acct->active; | ||
155 | out: | 123 | out: |
156 | spin_unlock(&acct_lock); | 124 | return acct->active; |
125 | } | ||
126 | |||
127 | static struct bsd_acct_struct *acct_get(struct pid_namespace *ns) | ||
128 | { | ||
129 | struct bsd_acct_struct *res; | ||
130 | again: | ||
131 | smp_rmb(); | ||
132 | rcu_read_lock(); | ||
133 | res = ACCESS_ONCE(ns->bacct); | ||
134 | if (!res) { | ||
135 | rcu_read_unlock(); | ||
136 | return NULL; | ||
137 | } | ||
138 | if (!atomic_long_inc_not_zero(&res->pin.count)) { | ||
139 | rcu_read_unlock(); | ||
140 | cpu_relax(); | ||
141 | goto again; | ||
142 | } | ||
143 | rcu_read_unlock(); | ||
144 | mutex_lock(&res->lock); | ||
145 | if (!res->ns) { | ||
146 | mutex_unlock(&res->lock); | ||
147 | pin_put(&res->pin); | ||
148 | goto again; | ||
149 | } | ||
157 | return res; | 150 | return res; |
158 | } | 151 | } |
159 | 152 | ||
160 | /* | 153 | static void close_work(struct work_struct *work) |
161 | * Close the old accounting file (if currently open) and then replace | 154 | { |
162 | * it with file (if non-NULL). | 155 | struct bsd_acct_struct *acct = container_of(work, struct bsd_acct_struct, work); |
163 | * | 156 | struct file *file = acct->file; |
164 | * NOTE: acct_lock MUST be held on entry and exit. | 157 | if (file->f_op->flush) |
165 | */ | 158 | file->f_op->flush(file, NULL); |
166 | static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file, | 159 | __fput_sync(file); |
167 | struct pid_namespace *ns) | 160 | complete(&acct->done); |
161 | } | ||
162 | |||
163 | static void acct_kill(struct bsd_acct_struct *acct, | ||
164 | struct bsd_acct_struct *new) | ||
168 | { | 165 | { |
169 | struct file *old_acct = NULL; | 166 | if (acct) { |
170 | struct pid_namespace *old_ns = NULL; | 167 | struct pid_namespace *ns = acct->ns; |
171 | 168 | do_acct_process(acct); | |
172 | if (acct->file) { | 169 | INIT_WORK(&acct->work, close_work); |
173 | old_acct = acct->file; | 170 | init_completion(&acct->done); |
174 | old_ns = acct->ns; | 171 | schedule_work(&acct->work); |
175 | acct->active = 0; | 172 | wait_for_completion(&acct->done); |
176 | acct->file = NULL; | 173 | pin_remove(&acct->pin); |
174 | ns->bacct = new; | ||
177 | acct->ns = NULL; | 175 | acct->ns = NULL; |
178 | list_del(&acct->list); | 176 | atomic_long_dec(&acct->pin.count); |
177 | mutex_unlock(&acct->lock); | ||
178 | pin_put(&acct->pin); | ||
179 | } | 179 | } |
180 | if (file) { | 180 | } |
181 | acct->file = file; | 181 | |
182 | acct->ns = ns; | 182 | static void acct_pin_kill(struct fs_pin *pin) |
183 | acct->needcheck = jiffies + ACCT_TIMEOUT*HZ; | 183 | { |
184 | acct->active = 1; | 184 | struct bsd_acct_struct *acct; |
185 | list_add(&acct->list, &acct_list); | 185 | acct = container_of(pin, struct bsd_acct_struct, pin); |
186 | } | 186 | mutex_lock(&acct->lock); |
187 | if (old_acct) { | 187 | if (!acct->ns) { |
188 | mnt_unpin(old_acct->f_path.mnt); | 188 | mutex_unlock(&acct->lock); |
189 | spin_unlock(&acct_lock); | 189 | pin_put(pin); |
190 | do_acct_process(acct, old_ns, old_acct); | 190 | acct = NULL; |
191 | filp_close(old_acct, NULL); | ||
192 | spin_lock(&acct_lock); | ||
193 | } | 191 | } |
192 | acct_kill(acct, NULL); | ||
194 | } | 193 | } |
195 | 194 | ||
196 | static int acct_on(struct filename *pathname) | 195 | static int acct_on(struct filename *pathname) |
197 | { | 196 | { |
198 | struct file *file; | 197 | struct file *file; |
199 | struct vfsmount *mnt; | 198 | struct vfsmount *mnt, *internal; |
200 | struct pid_namespace *ns; | 199 | struct pid_namespace *ns = task_active_pid_ns(current); |
201 | struct bsd_acct_struct *acct = NULL; | 200 | struct bsd_acct_struct *acct, *old; |
201 | int err; | ||
202 | |||
203 | acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL); | ||
204 | if (!acct) | ||
205 | return -ENOMEM; | ||
202 | 206 | ||
203 | /* Difference from BSD - they don't do O_APPEND */ | 207 | /* Difference from BSD - they don't do O_APPEND */ |
204 | file = file_open_name(pathname, O_WRONLY|O_APPEND|O_LARGEFILE, 0); | 208 | file = file_open_name(pathname, O_WRONLY|O_APPEND|O_LARGEFILE, 0); |
205 | if (IS_ERR(file)) | 209 | if (IS_ERR(file)) { |
210 | kfree(acct); | ||
206 | return PTR_ERR(file); | 211 | return PTR_ERR(file); |
212 | } | ||
207 | 213 | ||
208 | if (!S_ISREG(file_inode(file)->i_mode)) { | 214 | if (!S_ISREG(file_inode(file)->i_mode)) { |
215 | kfree(acct); | ||
209 | filp_close(file, NULL); | 216 | filp_close(file, NULL); |
210 | return -EACCES; | 217 | return -EACCES; |
211 | } | 218 | } |
212 | 219 | ||
213 | if (!file->f_op->write) { | 220 | if (!file->f_op->write) { |
221 | kfree(acct); | ||
214 | filp_close(file, NULL); | 222 | filp_close(file, NULL); |
215 | return -EIO; | 223 | return -EIO; |
216 | } | 224 | } |
217 | 225 | internal = mnt_clone_internal(&file->f_path); | |
218 | ns = task_active_pid_ns(current); | 226 | if (IS_ERR(internal)) { |
219 | if (ns->bacct == NULL) { | 227 | kfree(acct); |
220 | acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL); | 228 | filp_close(file, NULL); |
221 | if (acct == NULL) { | 229 | return PTR_ERR(internal); |
222 | filp_close(file, NULL); | ||
223 | return -ENOMEM; | ||
224 | } | ||
225 | } | 230 | } |
226 | 231 | err = mnt_want_write(internal); | |
227 | spin_lock(&acct_lock); | 232 | if (err) { |
228 | if (ns->bacct == NULL) { | 233 | mntput(internal); |
229 | ns->bacct = acct; | 234 | kfree(acct); |
230 | acct = NULL; | 235 | filp_close(file, NULL); |
236 | return err; | ||
231 | } | 237 | } |
232 | |||
233 | mnt = file->f_path.mnt; | 238 | mnt = file->f_path.mnt; |
234 | mnt_pin(mnt); | 239 | file->f_path.mnt = internal; |
235 | acct_file_reopen(ns->bacct, file, ns); | 240 | |
236 | spin_unlock(&acct_lock); | 241 | atomic_long_set(&acct->pin.count, 1); |
237 | 242 | acct->pin.kill = acct_pin_kill; | |
238 | mntput(mnt); /* it's pinned, now give up active reference */ | 243 | acct->file = file; |
239 | kfree(acct); | 244 | acct->needcheck = jiffies; |
240 | 245 | acct->ns = ns; | |
246 | mutex_init(&acct->lock); | ||
247 | mutex_lock_nested(&acct->lock, 1); /* nobody has seen it yet */ | ||
248 | pin_insert(&acct->pin, mnt); | ||
249 | |||
250 | old = acct_get(ns); | ||
251 | if (old) | ||
252 | acct_kill(old, acct); | ||
253 | else | ||
254 | ns->bacct = acct; | ||
255 | mutex_unlock(&acct->lock); | ||
256 | mnt_drop_write(mnt); | ||
257 | mntput(mnt); | ||
241 | return 0; | 258 | return 0; |
242 | } | 259 | } |
243 | 260 | ||
261 | static DEFINE_MUTEX(acct_on_mutex); | ||
262 | |||
244 | /** | 263 | /** |
245 | * sys_acct - enable/disable process accounting | 264 | * sys_acct - enable/disable process accounting |
246 | * @name: file name for accounting records or NULL to shutdown accounting | 265 | * @name: file name for accounting records or NULL to shutdown accounting |
@@ -261,80 +280,23 @@ SYSCALL_DEFINE1(acct, const char __user *, name) | |||
261 | 280 | ||
262 | if (name) { | 281 | if (name) { |
263 | struct filename *tmp = getname(name); | 282 | struct filename *tmp = getname(name); |
283 | |||
264 | if (IS_ERR(tmp)) | 284 | if (IS_ERR(tmp)) |
265 | return PTR_ERR(tmp); | 285 | return PTR_ERR(tmp); |
286 | mutex_lock(&acct_on_mutex); | ||
266 | error = acct_on(tmp); | 287 | error = acct_on(tmp); |
288 | mutex_unlock(&acct_on_mutex); | ||
267 | putname(tmp); | 289 | putname(tmp); |
268 | } else { | 290 | } else { |
269 | struct bsd_acct_struct *acct; | 291 | acct_kill(acct_get(task_active_pid_ns(current)), NULL); |
270 | |||
271 | acct = task_active_pid_ns(current)->bacct; | ||
272 | if (acct == NULL) | ||
273 | return 0; | ||
274 | |||
275 | spin_lock(&acct_lock); | ||
276 | acct_file_reopen(acct, NULL, NULL); | ||
277 | spin_unlock(&acct_lock); | ||
278 | } | 292 | } |
279 | 293 | ||
280 | return error; | 294 | return error; |
281 | } | 295 | } |
282 | 296 | ||
283 | /** | ||
284 | * acct_auto_close - turn off a filesystem's accounting if it is on | ||
285 | * @m: vfsmount being shut down | ||
286 | * | ||
287 | * If the accounting is turned on for a file in the subtree pointed to | ||
288 | * to by m, turn accounting off. Done when m is about to die. | ||
289 | */ | ||
290 | void acct_auto_close_mnt(struct vfsmount *m) | ||
291 | { | ||
292 | struct bsd_acct_struct *acct; | ||
293 | |||
294 | spin_lock(&acct_lock); | ||
295 | restart: | ||
296 | list_for_each_entry(acct, &acct_list, list) | ||
297 | if (acct->file && acct->file->f_path.mnt == m) { | ||
298 | acct_file_reopen(acct, NULL, NULL); | ||
299 | goto restart; | ||
300 | } | ||
301 | spin_unlock(&acct_lock); | ||
302 | } | ||
303 | |||
304 | /** | ||
305 | * acct_auto_close - turn off a filesystem's accounting if it is on | ||
306 | * @sb: super block for the filesystem | ||
307 | * | ||
308 | * If the accounting is turned on for a file in the filesystem pointed | ||
309 | * to by sb, turn accounting off. | ||
310 | */ | ||
311 | void acct_auto_close(struct super_block *sb) | ||
312 | { | ||
313 | struct bsd_acct_struct *acct; | ||
314 | |||
315 | spin_lock(&acct_lock); | ||
316 | restart: | ||
317 | list_for_each_entry(acct, &acct_list, list) | ||
318 | if (acct->file && acct->file->f_path.dentry->d_sb == sb) { | ||
319 | acct_file_reopen(acct, NULL, NULL); | ||
320 | goto restart; | ||
321 | } | ||
322 | spin_unlock(&acct_lock); | ||
323 | } | ||
324 | |||
325 | void acct_exit_ns(struct pid_namespace *ns) | 297 | void acct_exit_ns(struct pid_namespace *ns) |
326 | { | 298 | { |
327 | struct bsd_acct_struct *acct = ns->bacct; | 299 | acct_kill(acct_get(ns), NULL); |
328 | |||
329 | if (acct == NULL) | ||
330 | return; | ||
331 | |||
332 | spin_lock(&acct_lock); | ||
333 | if (acct->file != NULL) | ||
334 | acct_file_reopen(acct, NULL, NULL); | ||
335 | spin_unlock(&acct_lock); | ||
336 | |||
337 | kfree(acct); | ||
338 | } | 300 | } |
339 | 301 | ||
340 | /* | 302 | /* |
@@ -376,7 +338,7 @@ static comp_t encode_comp_t(unsigned long value) | |||
376 | return exp; | 338 | return exp; |
377 | } | 339 | } |
378 | 340 | ||
379 | #if ACCT_VERSION==1 || ACCT_VERSION==2 | 341 | #if ACCT_VERSION == 1 || ACCT_VERSION == 2 |
380 | /* | 342 | /* |
381 | * encode an u64 into a comp2_t (24 bits) | 343 | * encode an u64 into a comp2_t (24 bits) |
382 | * | 344 | * |
@@ -389,7 +351,7 @@ static comp_t encode_comp_t(unsigned long value) | |||
389 | #define MANTSIZE2 20 /* 20 bit mantissa. */ | 351 | #define MANTSIZE2 20 /* 20 bit mantissa. */ |
390 | #define EXPSIZE2 5 /* 5 bit base 2 exponent. */ | 352 | #define EXPSIZE2 5 /* 5 bit base 2 exponent. */ |
391 | #define MAXFRACT2 ((1ul << MANTSIZE2) - 1) /* Maximum fractional value. */ | 353 | #define MAXFRACT2 ((1ul << MANTSIZE2) - 1) /* Maximum fractional value. */ |
392 | #define MAXEXP2 ((1 <<EXPSIZE2) - 1) /* Maximum exponent. */ | 354 | #define MAXEXP2 ((1 << EXPSIZE2) - 1) /* Maximum exponent. */ |
393 | 355 | ||
394 | static comp2_t encode_comp2_t(u64 value) | 356 | static comp2_t encode_comp2_t(u64 value) |
395 | { | 357 | { |
@@ -420,7 +382,7 @@ static comp2_t encode_comp2_t(u64 value) | |||
420 | } | 382 | } |
421 | #endif | 383 | #endif |
422 | 384 | ||
423 | #if ACCT_VERSION==3 | 385 | #if ACCT_VERSION == 3 |
424 | /* | 386 | /* |
425 | * encode an u64 into a 32 bit IEEE float | 387 | * encode an u64 into a 32 bit IEEE float |
426 | */ | 388 | */ |
@@ -429,8 +391,9 @@ static u32 encode_float(u64 value) | |||
429 | unsigned exp = 190; | 391 | unsigned exp = 190; |
430 | unsigned u; | 392 | unsigned u; |
431 | 393 | ||
432 | if (value==0) return 0; | 394 | if (value == 0) |
433 | while ((s64)value > 0){ | 395 | return 0; |
396 | while ((s64)value > 0) { | ||
434 | value <<= 1; | 397 | value <<= 1; |
435 | exp--; | 398 | exp--; |
436 | } | 399 | } |
@@ -448,120 +411,112 @@ static u32 encode_float(u64 value) | |||
448 | * do_exit() or when switching to a different output file. | 411 | * do_exit() or when switching to a different output file. |
449 | */ | 412 | */ |
450 | 413 | ||
451 | /* | 414 | static void fill_ac(acct_t *ac) |
452 | * do_acct_process does all actual work. Caller holds the reference to file. | ||
453 | */ | ||
454 | static void do_acct_process(struct bsd_acct_struct *acct, | ||
455 | struct pid_namespace *ns, struct file *file) | ||
456 | { | 415 | { |
457 | struct pacct_struct *pacct = ¤t->signal->pacct; | 416 | struct pacct_struct *pacct = ¤t->signal->pacct; |
458 | acct_t ac; | 417 | u64 elapsed, run_time; |
459 | mm_segment_t fs; | ||
460 | unsigned long flim; | ||
461 | u64 elapsed; | ||
462 | u64 run_time; | ||
463 | struct timespec uptime; | ||
464 | struct tty_struct *tty; | 418 | struct tty_struct *tty; |
465 | const struct cred *orig_cred; | ||
466 | |||
467 | /* Perform file operations on behalf of whoever enabled accounting */ | ||
468 | orig_cred = override_creds(file->f_cred); | ||
469 | |||
470 | /* | ||
471 | * First check to see if there is enough free_space to continue | ||
472 | * the process accounting system. | ||
473 | */ | ||
474 | if (!check_free_space(acct, file)) | ||
475 | goto out; | ||
476 | 419 | ||
477 | /* | 420 | /* |
478 | * Fill the accounting struct with the needed info as recorded | 421 | * Fill the accounting struct with the needed info as recorded |
479 | * by the different kernel functions. | 422 | * by the different kernel functions. |
480 | */ | 423 | */ |
481 | memset(&ac, 0, sizeof(acct_t)); | 424 | memset(ac, 0, sizeof(acct_t)); |
482 | 425 | ||
483 | ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER; | 426 | ac->ac_version = ACCT_VERSION | ACCT_BYTEORDER; |
484 | strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm)); | 427 | strlcpy(ac->ac_comm, current->comm, sizeof(ac->ac_comm)); |
485 | 428 | ||
486 | /* calculate run_time in nsec*/ | 429 | /* calculate run_time in nsec*/ |
487 | do_posix_clock_monotonic_gettime(&uptime); | 430 | run_time = ktime_get_ns(); |
488 | run_time = (u64)uptime.tv_sec*NSEC_PER_SEC + uptime.tv_nsec; | 431 | run_time -= current->group_leader->start_time; |
489 | run_time -= (u64)current->group_leader->start_time.tv_sec * NSEC_PER_SEC | ||
490 | + current->group_leader->start_time.tv_nsec; | ||
491 | /* convert nsec -> AHZ */ | 432 | /* convert nsec -> AHZ */ |
492 | elapsed = nsec_to_AHZ(run_time); | 433 | elapsed = nsec_to_AHZ(run_time); |
493 | #if ACCT_VERSION==3 | 434 | #if ACCT_VERSION == 3 |
494 | ac.ac_etime = encode_float(elapsed); | 435 | ac->ac_etime = encode_float(elapsed); |
495 | #else | 436 | #else |
496 | ac.ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ? | 437 | ac->ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ? |
497 | (unsigned long) elapsed : (unsigned long) -1l); | 438 | (unsigned long) elapsed : (unsigned long) -1l); |
498 | #endif | 439 | #endif |
499 | #if ACCT_VERSION==1 || ACCT_VERSION==2 | 440 | #if ACCT_VERSION == 1 || ACCT_VERSION == 2 |
500 | { | 441 | { |
501 | /* new enlarged etime field */ | 442 | /* new enlarged etime field */ |
502 | comp2_t etime = encode_comp2_t(elapsed); | 443 | comp2_t etime = encode_comp2_t(elapsed); |
503 | ac.ac_etime_hi = etime >> 16; | 444 | |
504 | ac.ac_etime_lo = (u16) etime; | 445 | ac->ac_etime_hi = etime >> 16; |
446 | ac->ac_etime_lo = (u16) etime; | ||
505 | } | 447 | } |
506 | #endif | 448 | #endif |
507 | do_div(elapsed, AHZ); | 449 | do_div(elapsed, AHZ); |
508 | ac.ac_btime = get_seconds() - elapsed; | 450 | ac->ac_btime = get_seconds() - elapsed; |
451 | #if ACCT_VERSION==2 | ||
452 | ac->ac_ahz = AHZ; | ||
453 | #endif | ||
454 | |||
455 | spin_lock_irq(¤t->sighand->siglock); | ||
456 | tty = current->signal->tty; /* Safe as we hold the siglock */ | ||
457 | ac->ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0; | ||
458 | ac->ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime))); | ||
459 | ac->ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime))); | ||
460 | ac->ac_flag = pacct->ac_flag; | ||
461 | ac->ac_mem = encode_comp_t(pacct->ac_mem); | ||
462 | ac->ac_minflt = encode_comp_t(pacct->ac_minflt); | ||
463 | ac->ac_majflt = encode_comp_t(pacct->ac_majflt); | ||
464 | ac->ac_exitcode = pacct->ac_exitcode; | ||
465 | spin_unlock_irq(¤t->sighand->siglock); | ||
466 | } | ||
467 | /* | ||
468 | * do_acct_process does all actual work. Caller holds the reference to file. | ||
469 | */ | ||
470 | static void do_acct_process(struct bsd_acct_struct *acct) | ||
471 | { | ||
472 | acct_t ac; | ||
473 | unsigned long flim; | ||
474 | const struct cred *orig_cred; | ||
475 | struct pid_namespace *ns = acct->ns; | ||
476 | struct file *file = acct->file; | ||
477 | |||
478 | /* | ||
479 | * Accounting records are not subject to resource limits. | ||
480 | */ | ||
481 | flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; | ||
482 | current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; | ||
483 | /* Perform file operations on behalf of whoever enabled accounting */ | ||
484 | orig_cred = override_creds(file->f_cred); | ||
485 | |||
486 | /* | ||
487 | * First check to see if there is enough free_space to continue | ||
488 | * the process accounting system. | ||
489 | */ | ||
490 | if (!check_free_space(acct)) | ||
491 | goto out; | ||
492 | |||
493 | fill_ac(&ac); | ||
509 | /* we really need to bite the bullet and change layout */ | 494 | /* we really need to bite the bullet and change layout */ |
510 | ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid); | 495 | ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid); |
511 | ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid); | 496 | ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid); |
512 | #if ACCT_VERSION==2 | 497 | #if ACCT_VERSION == 1 || ACCT_VERSION == 2 |
513 | ac.ac_ahz = AHZ; | ||
514 | #endif | ||
515 | #if ACCT_VERSION==1 || ACCT_VERSION==2 | ||
516 | /* backward-compatible 16 bit fields */ | 498 | /* backward-compatible 16 bit fields */ |
517 | ac.ac_uid16 = ac.ac_uid; | 499 | ac.ac_uid16 = ac.ac_uid; |
518 | ac.ac_gid16 = ac.ac_gid; | 500 | ac.ac_gid16 = ac.ac_gid; |
519 | #endif | 501 | #endif |
520 | #if ACCT_VERSION==3 | 502 | #if ACCT_VERSION == 3 |
521 | ac.ac_pid = task_tgid_nr_ns(current, ns); | 503 | ac.ac_pid = task_tgid_nr_ns(current, ns); |
522 | rcu_read_lock(); | 504 | rcu_read_lock(); |
523 | ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns); | 505 | ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns); |
524 | rcu_read_unlock(); | 506 | rcu_read_unlock(); |
525 | #endif | 507 | #endif |
526 | |||
527 | spin_lock_irq(¤t->sighand->siglock); | ||
528 | tty = current->signal->tty; /* Safe as we hold the siglock */ | ||
529 | ac.ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0; | ||
530 | ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime))); | ||
531 | ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime))); | ||
532 | ac.ac_flag = pacct->ac_flag; | ||
533 | ac.ac_mem = encode_comp_t(pacct->ac_mem); | ||
534 | ac.ac_minflt = encode_comp_t(pacct->ac_minflt); | ||
535 | ac.ac_majflt = encode_comp_t(pacct->ac_majflt); | ||
536 | ac.ac_exitcode = pacct->ac_exitcode; | ||
537 | spin_unlock_irq(¤t->sighand->siglock); | ||
538 | ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */ | ||
539 | ac.ac_rw = encode_comp_t(ac.ac_io / 1024); | ||
540 | ac.ac_swaps = encode_comp_t(0); | ||
541 | |||
542 | /* | 508 | /* |
543 | * Get freeze protection. If the fs is frozen, just skip the write | 509 | * Get freeze protection. If the fs is frozen, just skip the write |
544 | * as we could deadlock the system otherwise. | 510 | * as we could deadlock the system otherwise. |
545 | */ | 511 | */ |
546 | if (!file_start_write_trylock(file)) | 512 | if (file_start_write_trylock(file)) { |
547 | goto out; | 513 | /* it's been opened O_APPEND, so position is irrelevant */ |
548 | /* | 514 | loff_t pos = 0; |
549 | * Kernel segment override to datasegment and write it | 515 | __kernel_write(file, (char *)&ac, sizeof(acct_t), &pos); |
550 | * to the accounting file. | 516 | file_end_write(file); |
551 | */ | 517 | } |
552 | fs = get_fs(); | ||
553 | set_fs(KERNEL_DS); | ||
554 | /* | ||
555 | * Accounting records are not subject to resource limits. | ||
556 | */ | ||
557 | flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; | ||
558 | current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; | ||
559 | file->f_op->write(file, (char *)&ac, | ||
560 | sizeof(acct_t), &file->f_pos); | ||
561 | current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim; | ||
562 | set_fs(fs); | ||
563 | file_end_write(file); | ||
564 | out: | 518 | out: |
519 | current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim; | ||
565 | revert_creds(orig_cred); | 520 | revert_creds(orig_cred); |
566 | } | 521 | } |
567 | 522 | ||
@@ -578,6 +533,7 @@ void acct_collect(long exitcode, int group_dead) | |||
578 | 533 | ||
579 | if (group_dead && current->mm) { | 534 | if (group_dead && current->mm) { |
580 | struct vm_area_struct *vma; | 535 | struct vm_area_struct *vma; |
536 | |||
581 | down_read(¤t->mm->mmap_sem); | 537 | down_read(¤t->mm->mmap_sem); |
582 | vma = current->mm->mmap; | 538 | vma = current->mm->mmap; |
583 | while (vma) { | 539 | while (vma) { |
@@ -609,34 +565,20 @@ void acct_collect(long exitcode, int group_dead) | |||
609 | spin_unlock_irq(¤t->sighand->siglock); | 565 | spin_unlock_irq(¤t->sighand->siglock); |
610 | } | 566 | } |
611 | 567 | ||
612 | static void acct_process_in_ns(struct pid_namespace *ns) | 568 | static void slow_acct_process(struct pid_namespace *ns) |
613 | { | 569 | { |
614 | struct file *file = NULL; | 570 | for ( ; ns; ns = ns->parent) { |
615 | struct bsd_acct_struct *acct; | 571 | struct bsd_acct_struct *acct = acct_get(ns); |
616 | 572 | if (acct) { | |
617 | acct = ns->bacct; | 573 | do_acct_process(acct); |
618 | /* | 574 | mutex_unlock(&acct->lock); |
619 | * accelerate the common fastpath: | 575 | pin_put(&acct->pin); |
620 | */ | 576 | } |
621 | if (!acct || !acct->file) | ||
622 | return; | ||
623 | |||
624 | spin_lock(&acct_lock); | ||
625 | file = acct->file; | ||
626 | if (unlikely(!file)) { | ||
627 | spin_unlock(&acct_lock); | ||
628 | return; | ||
629 | } | 577 | } |
630 | get_file(file); | ||
631 | spin_unlock(&acct_lock); | ||
632 | |||
633 | do_acct_process(acct, ns, file); | ||
634 | fput(file); | ||
635 | } | 578 | } |
636 | 579 | ||
637 | /** | 580 | /** |
638 | * acct_process - now just a wrapper around acct_process_in_ns, | 581 | * acct_process |
639 | * which in turn is a wrapper around do_acct_process. | ||
640 | * | 582 | * |
641 | * handles process accounting for an exiting task | 583 | * handles process accounting for an exiting task |
642 | */ | 584 | */ |
@@ -649,6 +591,10 @@ void acct_process(void) | |||
649 | * alive and holds its namespace, which in turn holds | 591 | * alive and holds its namespace, which in turn holds |
650 | * its parent. | 592 | * its parent. |
651 | */ | 593 | */ |
652 | for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent) | 594 | for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent) { |
653 | acct_process_in_ns(ns); | 595 | if (ns->bacct) |
596 | break; | ||
597 | } | ||
598 | if (unlikely(ns)) | ||
599 | slow_acct_process(ns); | ||
654 | } | 600 | } |
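For reference, a minimal userspace sketch of the pause/resume hysteresis that the reworked check_free_space() above implements: accounting is paused once free blocks fall to the SUSPEND percentage of the filesystem and resumed once they climb back to the RESUME percentage. The 2%/4% values mirror the acct_parm defaults visible in the hunk context; the helper and driver below are illustrative userspace code, not part of the patch.

/* Userspace model of check_free_space()'s suspend/resume hysteresis. */
#include <stdio.h>
#include <stdint.h>

#define RESUME  4       /* resume when >= 4% of blocks are free */
#define SUSPEND 2       /* pause when <= 2% of blocks are free  */

static int active = 1;

static int check_free_space(uint64_t f_blocks, uint64_t f_bavail)
{
        if (active) {
                uint64_t suspend = f_blocks * SUSPEND / 100;

                if (f_bavail <= suspend) {
                        active = 0;
                        printf("Process accounting paused\n");
                }
        } else {
                uint64_t resume = f_blocks * RESUME / 100;

                if (f_bavail >= resume) {
                        active = 1;
                        printf("Process accounting resumed\n");
                }
        }
        return active;
}

int main(void)
{
        check_free_space(1000000, 15000);       /* 1.5% free -> pauses       */
        check_free_space(1000000, 30000);       /* 3.0% free -> stays paused */
        check_free_space(1000000, 50000);       /* 5.0% free -> resumes      */
        return 0;
}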
diff --git a/kernel/audit.c b/kernel/audit.c
index 3ef2e0e797e8..ba2ff5a5c600 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1677,7 +1677,7 @@ void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap) | |||
1677 | audit_log_format(ab, " %s=", prefix); | 1677 | audit_log_format(ab, " %s=", prefix); |
1678 | CAP_FOR_EACH_U32(i) { | 1678 | CAP_FOR_EACH_U32(i) { |
1679 | audit_log_format(ab, "%08x", | 1679 | audit_log_format(ab, "%08x", |
1680 | cap->cap[(_KERNEL_CAPABILITY_U32S-1) - i]); | 1680 | cap->cap[CAP_LAST_U32 - i]); |
1681 | } | 1681 | } |
1682 | } | 1682 | } |
1683 | 1683 | ||
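The audit_log_cap() hunk above only renames the index expression (CAP_LAST_U32 is defined as _KERNEL_CAPABILITY_U32S - 1), but the reversed indexing it preserves is easy to miss: the capability words are emitted most-significant first so the hex string reads as one contiguous mask. A hedged userspace sketch, with made-up array contents for illustration:

/* Print a capability mask held as 32-bit words, most significant word
 * first, matching audit_log_cap()'s cap->cap[CAP_LAST_U32 - i] indexing.
 * The array contents below are illustrative only. */
#include <stdio.h>
#include <stdint.h>

#define _KERNEL_CAPABILITY_U32S 2
#define CAP_LAST_U32 (_KERNEL_CAPABILITY_U32S - 1)

int main(void)
{
        uint32_t cap[_KERNEL_CAPABILITY_U32S] = { 0xffffffff, 0x0000001f };
        int i;

        /* Word 1 (bits 32..63) prints before word 0 (bits 0..31). */
        for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++)
                printf("%08x", cap[CAP_LAST_U32 - i]);
        printf("\n");                           /* -> 0000001fffffffff */
        return 0;
}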
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 8e9bc9c3dbb7..c447cd9848d1 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -106,7 +106,7 @@ static inline struct audit_entry *audit_init_entry(u32 field_count) | |||
106 | if (unlikely(!entry)) | 106 | if (unlikely(!entry)) |
107 | return NULL; | 107 | return NULL; |
108 | 108 | ||
109 | fields = kzalloc(sizeof(*fields) * field_count, GFP_KERNEL); | 109 | fields = kcalloc(field_count, sizeof(*fields), GFP_KERNEL); |
110 | if (unlikely(!fields)) { | 110 | if (unlikely(!fields)) { |
111 | kfree(entry); | 111 | kfree(entry); |
112 | return NULL; | 112 | return NULL; |
@@ -160,7 +160,7 @@ static __u32 *classes[AUDIT_SYSCALL_CLASSES]; | |||
160 | 160 | ||
161 | int __init audit_register_class(int class, unsigned *list) | 161 | int __init audit_register_class(int class, unsigned *list) |
162 | { | 162 | { |
163 | __u32 *p = kzalloc(AUDIT_BITMASK_SIZE * sizeof(__u32), GFP_KERNEL); | 163 | __u32 *p = kcalloc(AUDIT_BITMASK_SIZE, sizeof(__u32), GFP_KERNEL); |
164 | if (!p) | 164 | if (!p) |
165 | return -ENOMEM; | 165 | return -ENOMEM; |
166 | while (*list != ~0U) { | 166 | while (*list != ~0U) { |
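Both auditfilter.c hunks swap kzalloc(n * size, ...) for kcalloc(n, size, ...); the functional difference is that kcalloc checks the count-times-size multiplication for overflow before allocating, rather than letting it silently wrap. A userspace analogue of that difference using calloc() versus malloc() (illustrative only):

/* calloc() rejects an overflowing n * size; a hand-written
 * multiplication wraps before malloc() ever sees it. */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

int main(void)
{
        size_t n = SIZE_MAX / 2 + 1;    /* n * sizeof(uint32_t) overflows */

        uint32_t *ok = calloc(n, sizeof(*ok));          /* overflow-checked */
        if (!ok)
                printf("calloc refused the overflowing request\n");

        uint32_t *bad = malloc(n * sizeof(*bad));       /* product wraps to 0 */
        if (bad)
                printf("malloc returned a buffer far too small for n elements\n");

        free(ok);
        free(bad);
        return 0;
}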
diff --git a/kernel/bounds.c b/kernel/bounds.c
index 9fd4246b04b8..e1d1d1952bfa 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -9,7 +9,6 @@ | |||
9 | #include <linux/page-flags.h> | 9 | #include <linux/page-flags.h> |
10 | #include <linux/mmzone.h> | 10 | #include <linux/mmzone.h> |
11 | #include <linux/kbuild.h> | 11 | #include <linux/kbuild.h> |
12 | #include <linux/page_cgroup.h> | ||
13 | #include <linux/log2.h> | 12 | #include <linux/log2.h> |
14 | #include <linux/spinlock_types.h> | 13 | #include <linux/spinlock_types.h> |
15 | 14 | ||
@@ -18,7 +17,6 @@ void foo(void) | |||
18 | /* The enum constants to put into include/generated/bounds.h */ | 17 | /* The enum constants to put into include/generated/bounds.h */ |
19 | DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); | 18 | DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); |
20 | DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); | 19 | DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); |
21 | DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS); | ||
22 | #ifdef CONFIG_SMP | 20 | #ifdef CONFIG_SMP |
23 | DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS)); | 21 | DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS)); |
24 | #endif | 22 | #endif |
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
new file mode 100644
index 000000000000..6a71145e2769
--- /dev/null
+++ b/kernel/bpf/Makefile
@@ -0,0 +1 @@ | |||
obj-y := core.o | |||
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
new file mode 100644
index 000000000000..7f0dbcbb34af
--- /dev/null
+++ b/kernel/bpf/core.c
@@ -0,0 +1,534 @@ | |||
1 | /* | ||
2 | * Linux Socket Filter - Kernel level socket filtering | ||
3 | * | ||
4 | * Based on the design of the Berkeley Packet Filter. The new | ||
5 | * internal format has been designed by PLUMgrid: | ||
6 | * | ||
7 | * Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com | ||
8 | * | ||
9 | * Authors: | ||
10 | * | ||
11 | * Jay Schulist <jschlst@samba.org> | ||
12 | * Alexei Starovoitov <ast@plumgrid.com> | ||
13 | * Daniel Borkmann <dborkman@redhat.com> | ||
14 | * | ||
15 | * This program is free software; you can redistribute it and/or | ||
16 | * modify it under the terms of the GNU General Public License | ||
17 | * as published by the Free Software Foundation; either version | ||
18 | * 2 of the License, or (at your option) any later version. | ||
19 | * | ||
20 | * Andi Kleen - Fix a few bad bugs and races. | ||
21 | * Kris Katterjohn - Added many additional checks in bpf_check_classic() | ||
22 | */ | ||
23 | #include <linux/filter.h> | ||
24 | #include <linux/skbuff.h> | ||
25 | #include <asm/unaligned.h> | ||
26 | |||
27 | /* Registers */ | ||
28 | #define BPF_R0 regs[BPF_REG_0] | ||
29 | #define BPF_R1 regs[BPF_REG_1] | ||
30 | #define BPF_R2 regs[BPF_REG_2] | ||
31 | #define BPF_R3 regs[BPF_REG_3] | ||
32 | #define BPF_R4 regs[BPF_REG_4] | ||
33 | #define BPF_R5 regs[BPF_REG_5] | ||
34 | #define BPF_R6 regs[BPF_REG_6] | ||
35 | #define BPF_R7 regs[BPF_REG_7] | ||
36 | #define BPF_R8 regs[BPF_REG_8] | ||
37 | #define BPF_R9 regs[BPF_REG_9] | ||
38 | #define BPF_R10 regs[BPF_REG_10] | ||
39 | |||
40 | /* Named registers */ | ||
41 | #define DST regs[insn->dst_reg] | ||
42 | #define SRC regs[insn->src_reg] | ||
43 | #define FP regs[BPF_REG_FP] | ||
44 | #define ARG1 regs[BPF_REG_ARG1] | ||
45 | #define CTX regs[BPF_REG_CTX] | ||
46 | #define IMM insn->imm | ||
47 | |||
48 | /* No hurry in this branch | ||
49 | * | ||
50 | * Exported for the bpf jit load helper. | ||
51 | */ | ||
52 | void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, unsigned int size) | ||
53 | { | ||
54 | u8 *ptr = NULL; | ||
55 | |||
56 | if (k >= SKF_NET_OFF) | ||
57 | ptr = skb_network_header(skb) + k - SKF_NET_OFF; | ||
58 | else if (k >= SKF_LL_OFF) | ||
59 | ptr = skb_mac_header(skb) + k - SKF_LL_OFF; | ||
60 | if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb)) | ||
61 | return ptr; | ||
62 | |||
63 | return NULL; | ||
64 | } | ||
65 | |||
66 | /* Base function for offset calculation. Needs to go into .text section, | ||
67 | * therefore keeping it non-static as well; will also be used by JITs | ||
68 | * anyway later on, so do not let the compiler omit it. | ||
69 | */ | ||
70 | noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | ||
71 | { | ||
72 | return 0; | ||
73 | } | ||
74 | |||
75 | /** | ||
76 | * __bpf_prog_run - run eBPF program on a given context | ||
77 | * @ctx: is the data we are operating on | ||
78 | * @insn: is the array of eBPF instructions | ||
79 | * | ||
80 | * Decode and execute eBPF instructions. | ||
81 | */ | ||
82 | static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn) | ||
83 | { | ||
84 | u64 stack[MAX_BPF_STACK / sizeof(u64)]; | ||
85 | u64 regs[MAX_BPF_REG], tmp; | ||
86 | static const void *jumptable[256] = { | ||
87 | [0 ... 255] = &&default_label, | ||
88 | /* Now overwrite non-defaults ... */ | ||
89 | /* 32 bit ALU operations */ | ||
90 | [BPF_ALU | BPF_ADD | BPF_X] = &&ALU_ADD_X, | ||
91 | [BPF_ALU | BPF_ADD | BPF_K] = &&ALU_ADD_K, | ||
92 | [BPF_ALU | BPF_SUB | BPF_X] = &&ALU_SUB_X, | ||
93 | [BPF_ALU | BPF_SUB | BPF_K] = &&ALU_SUB_K, | ||
94 | [BPF_ALU | BPF_AND | BPF_X] = &&ALU_AND_X, | ||
95 | [BPF_ALU | BPF_AND | BPF_K] = &&ALU_AND_K, | ||
96 | [BPF_ALU | BPF_OR | BPF_X] = &&ALU_OR_X, | ||
97 | [BPF_ALU | BPF_OR | BPF_K] = &&ALU_OR_K, | ||
98 | [BPF_ALU | BPF_LSH | BPF_X] = &&ALU_LSH_X, | ||
99 | [BPF_ALU | BPF_LSH | BPF_K] = &&ALU_LSH_K, | ||
100 | [BPF_ALU | BPF_RSH | BPF_X] = &&ALU_RSH_X, | ||
101 | [BPF_ALU | BPF_RSH | BPF_K] = &&ALU_RSH_K, | ||
102 | [BPF_ALU | BPF_XOR | BPF_X] = &&ALU_XOR_X, | ||
103 | [BPF_ALU | BPF_XOR | BPF_K] = &&ALU_XOR_K, | ||
104 | [BPF_ALU | BPF_MUL | BPF_X] = &&ALU_MUL_X, | ||
105 | [BPF_ALU | BPF_MUL | BPF_K] = &&ALU_MUL_K, | ||
106 | [BPF_ALU | BPF_MOV | BPF_X] = &&ALU_MOV_X, | ||
107 | [BPF_ALU | BPF_MOV | BPF_K] = &&ALU_MOV_K, | ||
108 | [BPF_ALU | BPF_DIV | BPF_X] = &&ALU_DIV_X, | ||
109 | [BPF_ALU | BPF_DIV | BPF_K] = &&ALU_DIV_K, | ||
110 | [BPF_ALU | BPF_MOD | BPF_X] = &&ALU_MOD_X, | ||
111 | [BPF_ALU | BPF_MOD | BPF_K] = &&ALU_MOD_K, | ||
112 | [BPF_ALU | BPF_NEG] = &&ALU_NEG, | ||
113 | [BPF_ALU | BPF_END | BPF_TO_BE] = &&ALU_END_TO_BE, | ||
114 | [BPF_ALU | BPF_END | BPF_TO_LE] = &&ALU_END_TO_LE, | ||
115 | /* 64 bit ALU operations */ | ||
116 | [BPF_ALU64 | BPF_ADD | BPF_X] = &&ALU64_ADD_X, | ||
117 | [BPF_ALU64 | BPF_ADD | BPF_K] = &&ALU64_ADD_K, | ||
118 | [BPF_ALU64 | BPF_SUB | BPF_X] = &&ALU64_SUB_X, | ||
119 | [BPF_ALU64 | BPF_SUB | BPF_K] = &&ALU64_SUB_K, | ||
120 | [BPF_ALU64 | BPF_AND | BPF_X] = &&ALU64_AND_X, | ||
121 | [BPF_ALU64 | BPF_AND | BPF_K] = &&ALU64_AND_K, | ||
122 | [BPF_ALU64 | BPF_OR | BPF_X] = &&ALU64_OR_X, | ||
123 | [BPF_ALU64 | BPF_OR | BPF_K] = &&ALU64_OR_K, | ||
124 | [BPF_ALU64 | BPF_LSH | BPF_X] = &&ALU64_LSH_X, | ||
125 | [BPF_ALU64 | BPF_LSH | BPF_K] = &&ALU64_LSH_K, | ||
126 | [BPF_ALU64 | BPF_RSH | BPF_X] = &&ALU64_RSH_X, | ||
127 | [BPF_ALU64 | BPF_RSH | BPF_K] = &&ALU64_RSH_K, | ||
128 | [BPF_ALU64 | BPF_XOR | BPF_X] = &&ALU64_XOR_X, | ||
129 | [BPF_ALU64 | BPF_XOR | BPF_K] = &&ALU64_XOR_K, | ||
130 | [BPF_ALU64 | BPF_MUL | BPF_X] = &&ALU64_MUL_X, | ||
131 | [BPF_ALU64 | BPF_MUL | BPF_K] = &&ALU64_MUL_K, | ||
132 | [BPF_ALU64 | BPF_MOV | BPF_X] = &&ALU64_MOV_X, | ||
133 | [BPF_ALU64 | BPF_MOV | BPF_K] = &&ALU64_MOV_K, | ||
134 | [BPF_ALU64 | BPF_ARSH | BPF_X] = &&ALU64_ARSH_X, | ||
135 | [BPF_ALU64 | BPF_ARSH | BPF_K] = &&ALU64_ARSH_K, | ||
136 | [BPF_ALU64 | BPF_DIV | BPF_X] = &&ALU64_DIV_X, | ||
137 | [BPF_ALU64 | BPF_DIV | BPF_K] = &&ALU64_DIV_K, | ||
138 | [BPF_ALU64 | BPF_MOD | BPF_X] = &&ALU64_MOD_X, | ||
139 | [BPF_ALU64 | BPF_MOD | BPF_K] = &&ALU64_MOD_K, | ||
140 | [BPF_ALU64 | BPF_NEG] = &&ALU64_NEG, | ||
141 | /* Call instruction */ | ||
142 | [BPF_JMP | BPF_CALL] = &&JMP_CALL, | ||
143 | /* Jumps */ | ||
144 | [BPF_JMP | BPF_JA] = &&JMP_JA, | ||
145 | [BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X, | ||
146 | [BPF_JMP | BPF_JEQ | BPF_K] = &&JMP_JEQ_K, | ||
147 | [BPF_JMP | BPF_JNE | BPF_X] = &&JMP_JNE_X, | ||
148 | [BPF_JMP | BPF_JNE | BPF_K] = &&JMP_JNE_K, | ||
149 | [BPF_JMP | BPF_JGT | BPF_X] = &&JMP_JGT_X, | ||
150 | [BPF_JMP | BPF_JGT | BPF_K] = &&JMP_JGT_K, | ||
151 | [BPF_JMP | BPF_JGE | BPF_X] = &&JMP_JGE_X, | ||
152 | [BPF_JMP | BPF_JGE | BPF_K] = &&JMP_JGE_K, | ||
153 | [BPF_JMP | BPF_JSGT | BPF_X] = &&JMP_JSGT_X, | ||
154 | [BPF_JMP | BPF_JSGT | BPF_K] = &&JMP_JSGT_K, | ||
155 | [BPF_JMP | BPF_JSGE | BPF_X] = &&JMP_JSGE_X, | ||
156 | [BPF_JMP | BPF_JSGE | BPF_K] = &&JMP_JSGE_K, | ||
157 | [BPF_JMP | BPF_JSET | BPF_X] = &&JMP_JSET_X, | ||
158 | [BPF_JMP | BPF_JSET | BPF_K] = &&JMP_JSET_K, | ||
159 | /* Program return */ | ||
160 | [BPF_JMP | BPF_EXIT] = &&JMP_EXIT, | ||
161 | /* Store instructions */ | ||
162 | [BPF_STX | BPF_MEM | BPF_B] = &&STX_MEM_B, | ||
163 | [BPF_STX | BPF_MEM | BPF_H] = &&STX_MEM_H, | ||
164 | [BPF_STX | BPF_MEM | BPF_W] = &&STX_MEM_W, | ||
165 | [BPF_STX | BPF_MEM | BPF_DW] = &&STX_MEM_DW, | ||
166 | [BPF_STX | BPF_XADD | BPF_W] = &&STX_XADD_W, | ||
167 | [BPF_STX | BPF_XADD | BPF_DW] = &&STX_XADD_DW, | ||
168 | [BPF_ST | BPF_MEM | BPF_B] = &&ST_MEM_B, | ||
169 | [BPF_ST | BPF_MEM | BPF_H] = &&ST_MEM_H, | ||
170 | [BPF_ST | BPF_MEM | BPF_W] = &&ST_MEM_W, | ||
171 | [BPF_ST | BPF_MEM | BPF_DW] = &&ST_MEM_DW, | ||
172 | /* Load instructions */ | ||
173 | [BPF_LDX | BPF_MEM | BPF_B] = &&LDX_MEM_B, | ||
174 | [BPF_LDX | BPF_MEM | BPF_H] = &&LDX_MEM_H, | ||
175 | [BPF_LDX | BPF_MEM | BPF_W] = &&LDX_MEM_W, | ||
176 | [BPF_LDX | BPF_MEM | BPF_DW] = &&LDX_MEM_DW, | ||
177 | [BPF_LD | BPF_ABS | BPF_W] = &&LD_ABS_W, | ||
178 | [BPF_LD | BPF_ABS | BPF_H] = &&LD_ABS_H, | ||
179 | [BPF_LD | BPF_ABS | BPF_B] = &&LD_ABS_B, | ||
180 | [BPF_LD | BPF_IND | BPF_W] = &&LD_IND_W, | ||
181 | [BPF_LD | BPF_IND | BPF_H] = &&LD_IND_H, | ||
182 | [BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B, | ||
183 | }; | ||
184 | void *ptr; | ||
185 | int off; | ||
186 | |||
187 | #define CONT ({ insn++; goto select_insn; }) | ||
188 | #define CONT_JMP ({ insn++; goto select_insn; }) | ||
189 | |||
190 | FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; | ||
191 | ARG1 = (u64) (unsigned long) ctx; | ||
192 | |||
193 | /* Registers used in classic BPF programs need to be reset first. */ | ||
194 | regs[BPF_REG_A] = 0; | ||
195 | regs[BPF_REG_X] = 0; | ||
196 | |||
197 | select_insn: | ||
198 | goto *jumptable[insn->code]; | ||
199 | |||
200 | /* ALU */ | ||
201 | #define ALU(OPCODE, OP) \ | ||
202 | ALU64_##OPCODE##_X: \ | ||
203 | DST = DST OP SRC; \ | ||
204 | CONT; \ | ||
205 | ALU_##OPCODE##_X: \ | ||
206 | DST = (u32) DST OP (u32) SRC; \ | ||
207 | CONT; \ | ||
208 | ALU64_##OPCODE##_K: \ | ||
209 | DST = DST OP IMM; \ | ||
210 | CONT; \ | ||
211 | ALU_##OPCODE##_K: \ | ||
212 | DST = (u32) DST OP (u32) IMM; \ | ||
213 | CONT; | ||
214 | |||
215 | ALU(ADD, +) | ||
216 | ALU(SUB, -) | ||
217 | ALU(AND, &) | ||
218 | ALU(OR, |) | ||
219 | ALU(LSH, <<) | ||
220 | ALU(RSH, >>) | ||
221 | ALU(XOR, ^) | ||
222 | ALU(MUL, *) | ||
223 | #undef ALU | ||
224 | ALU_NEG: | ||
225 | DST = (u32) -DST; | ||
226 | CONT; | ||
227 | ALU64_NEG: | ||
228 | DST = -DST; | ||
229 | CONT; | ||
230 | ALU_MOV_X: | ||
231 | DST = (u32) SRC; | ||
232 | CONT; | ||
233 | ALU_MOV_K: | ||
234 | DST = (u32) IMM; | ||
235 | CONT; | ||
236 | ALU64_MOV_X: | ||
237 | DST = SRC; | ||
238 | CONT; | ||
239 | ALU64_MOV_K: | ||
240 | DST = IMM; | ||
241 | CONT; | ||
242 | ALU64_ARSH_X: | ||
243 | (*(s64 *) &DST) >>= SRC; | ||
244 | CONT; | ||
245 | ALU64_ARSH_K: | ||
246 | (*(s64 *) &DST) >>= IMM; | ||
247 | CONT; | ||
248 | ALU64_MOD_X: | ||
249 | if (unlikely(SRC == 0)) | ||
250 | return 0; | ||
251 | tmp = DST; | ||
252 | DST = do_div(tmp, SRC); | ||
253 | CONT; | ||
254 | ALU_MOD_X: | ||
255 | if (unlikely(SRC == 0)) | ||
256 | return 0; | ||
257 | tmp = (u32) DST; | ||
258 | DST = do_div(tmp, (u32) SRC); | ||
259 | CONT; | ||
260 | ALU64_MOD_K: | ||
261 | tmp = DST; | ||
262 | DST = do_div(tmp, IMM); | ||
263 | CONT; | ||
264 | ALU_MOD_K: | ||
265 | tmp = (u32) DST; | ||
266 | DST = do_div(tmp, (u32) IMM); | ||
267 | CONT; | ||
268 | ALU64_DIV_X: | ||
269 | if (unlikely(SRC == 0)) | ||
270 | return 0; | ||
271 | do_div(DST, SRC); | ||
272 | CONT; | ||
273 | ALU_DIV_X: | ||
274 | if (unlikely(SRC == 0)) | ||
275 | return 0; | ||
276 | tmp = (u32) DST; | ||
277 | do_div(tmp, (u32) SRC); | ||
278 | DST = (u32) tmp; | ||
279 | CONT; | ||
280 | ALU64_DIV_K: | ||
281 | do_div(DST, IMM); | ||
282 | CONT; | ||
283 | ALU_DIV_K: | ||
284 | tmp = (u32) DST; | ||
285 | do_div(tmp, (u32) IMM); | ||
286 | DST = (u32) tmp; | ||
287 | CONT; | ||
288 | ALU_END_TO_BE: | ||
289 | switch (IMM) { | ||
290 | case 16: | ||
291 | DST = (__force u16) cpu_to_be16(DST); | ||
292 | break; | ||
293 | case 32: | ||
294 | DST = (__force u32) cpu_to_be32(DST); | ||
295 | break; | ||
296 | case 64: | ||
297 | DST = (__force u64) cpu_to_be64(DST); | ||
298 | break; | ||
299 | } | ||
300 | CONT; | ||
301 | ALU_END_TO_LE: | ||
302 | switch (IMM) { | ||
303 | case 16: | ||
304 | DST = (__force u16) cpu_to_le16(DST); | ||
305 | break; | ||
306 | case 32: | ||
307 | DST = (__force u32) cpu_to_le32(DST); | ||
308 | break; | ||
309 | case 64: | ||
310 | DST = (__force u64) cpu_to_le64(DST); | ||
311 | break; | ||
312 | } | ||
313 | CONT; | ||
314 | |||
315 | /* CALL */ | ||
316 | JMP_CALL: | ||
317 | /* Function call scratches BPF_R1-BPF_R5 registers, | ||
318 | * preserves BPF_R6-BPF_R9, and stores return value | ||
319 | * into BPF_R0. | ||
320 | */ | ||
321 | BPF_R0 = (__bpf_call_base + insn->imm)(BPF_R1, BPF_R2, BPF_R3, | ||
322 | BPF_R4, BPF_R5); | ||
323 | CONT; | ||
324 | |||
325 | /* JMP */ | ||
326 | JMP_JA: | ||
327 | insn += insn->off; | ||
328 | CONT; | ||
329 | JMP_JEQ_X: | ||
330 | if (DST == SRC) { | ||
331 | insn += insn->off; | ||
332 | CONT_JMP; | ||
333 | } | ||
334 | CONT; | ||
335 | JMP_JEQ_K: | ||
336 | if (DST == IMM) { | ||
337 | insn += insn->off; | ||
338 | CONT_JMP; | ||
339 | } | ||
340 | CONT; | ||
341 | JMP_JNE_X: | ||
342 | if (DST != SRC) { | ||
343 | insn += insn->off; | ||
344 | CONT_JMP; | ||
345 | } | ||
346 | CONT; | ||
347 | JMP_JNE_K: | ||
348 | if (DST != IMM) { | ||
349 | insn += insn->off; | ||
350 | CONT_JMP; | ||
351 | } | ||
352 | CONT; | ||
353 | JMP_JGT_X: | ||
354 | if (DST > SRC) { | ||
355 | insn += insn->off; | ||
356 | CONT_JMP; | ||
357 | } | ||
358 | CONT; | ||
359 | JMP_JGT_K: | ||
360 | if (DST > IMM) { | ||
361 | insn += insn->off; | ||
362 | CONT_JMP; | ||
363 | } | ||
364 | CONT; | ||
365 | JMP_JGE_X: | ||
366 | if (DST >= SRC) { | ||
367 | insn += insn->off; | ||
368 | CONT_JMP; | ||
369 | } | ||
370 | CONT; | ||
371 | JMP_JGE_K: | ||
372 | if (DST >= IMM) { | ||
373 | insn += insn->off; | ||
374 | CONT_JMP; | ||
375 | } | ||
376 | CONT; | ||
377 | JMP_JSGT_X: | ||
378 | if (((s64) DST) > ((s64) SRC)) { | ||
379 | insn += insn->off; | ||
380 | CONT_JMP; | ||
381 | } | ||
382 | CONT; | ||
383 | JMP_JSGT_K: | ||
384 | if (((s64) DST) > ((s64) IMM)) { | ||
385 | insn += insn->off; | ||
386 | CONT_JMP; | ||
387 | } | ||
388 | CONT; | ||
389 | JMP_JSGE_X: | ||
390 | if (((s64) DST) >= ((s64) SRC)) { | ||
391 | insn += insn->off; | ||
392 | CONT_JMP; | ||
393 | } | ||
394 | CONT; | ||
395 | JMP_JSGE_K: | ||
396 | if (((s64) DST) >= ((s64) IMM)) { | ||
397 | insn += insn->off; | ||
398 | CONT_JMP; | ||
399 | } | ||
400 | CONT; | ||
401 | JMP_JSET_X: | ||
402 | if (DST & SRC) { | ||
403 | insn += insn->off; | ||
404 | CONT_JMP; | ||
405 | } | ||
406 | CONT; | ||
407 | JMP_JSET_K: | ||
408 | if (DST & IMM) { | ||
409 | insn += insn->off; | ||
410 | CONT_JMP; | ||
411 | } | ||
412 | CONT; | ||
413 | JMP_EXIT: | ||
414 | return BPF_R0; | ||
415 | |||
416 | /* STX and ST and LDX*/ | ||
417 | #define LDST(SIZEOP, SIZE) \ | ||
418 | STX_MEM_##SIZEOP: \ | ||
419 | *(SIZE *)(unsigned long) (DST + insn->off) = SRC; \ | ||
420 | CONT; \ | ||
421 | ST_MEM_##SIZEOP: \ | ||
422 | *(SIZE *)(unsigned long) (DST + insn->off) = IMM; \ | ||
423 | CONT; \ | ||
424 | LDX_MEM_##SIZEOP: \ | ||
425 | DST = *(SIZE *)(unsigned long) (SRC + insn->off); \ | ||
426 | CONT; | ||
427 | |||
428 | LDST(B, u8) | ||
429 | LDST(H, u16) | ||
430 | LDST(W, u32) | ||
431 | LDST(DW, u64) | ||
432 | #undef LDST | ||
433 | STX_XADD_W: /* lock xadd *(u32 *)(dst_reg + off16) += src_reg */ | ||
434 | atomic_add((u32) SRC, (atomic_t *)(unsigned long) | ||
435 | (DST + insn->off)); | ||
436 | CONT; | ||
437 | STX_XADD_DW: /* lock xadd *(u64 *)(dst_reg + off16) += src_reg */ | ||
438 | atomic64_add((u64) SRC, (atomic64_t *)(unsigned long) | ||
439 | (DST + insn->off)); | ||
440 | CONT; | ||
441 | LD_ABS_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + imm32)) */ | ||
442 | off = IMM; | ||
443 | load_word: | ||
444 | /* BPF_LD + BPD_ABS and BPF_LD + BPF_IND insns are | ||
445 | * only appearing in the programs where ctx == | ||
446 | * skb. All programs keep 'ctx' in regs[BPF_REG_CTX] | ||
447 | * == BPF_R6, bpf_convert_filter() saves it in BPF_R6, | ||
448 | * internal BPF verifier will check that BPF_R6 == | ||
449 | * ctx. | ||
450 | * | ||
451 | * BPF_ABS and BPF_IND are wrappers of function calls, | ||
452 | * so they scratch BPF_R1-BPF_R5 registers, preserve | ||
453 | * BPF_R6-BPF_R9, and store return value into BPF_R0. | ||
454 | * | ||
455 | * Implicit input: | ||
456 | * ctx == skb == BPF_R6 == CTX | ||
457 | * | ||
458 | * Explicit input: | ||
459 | * SRC == any register | ||
460 | * IMM == 32-bit immediate | ||
461 | * | ||
462 | * Output: | ||
463 | * BPF_R0 - 8/16/32-bit skb data converted to cpu endianness | ||
464 | */ | ||
465 | |||
466 | ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 4, &tmp); | ||
467 | if (likely(ptr != NULL)) { | ||
468 | BPF_R0 = get_unaligned_be32(ptr); | ||
469 | CONT; | ||
470 | } | ||
471 | |||
472 | return 0; | ||
473 | LD_ABS_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + imm32)) */ | ||
474 | off = IMM; | ||
475 | load_half: | ||
476 | ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 2, &tmp); | ||
477 | if (likely(ptr != NULL)) { | ||
478 | BPF_R0 = get_unaligned_be16(ptr); | ||
479 | CONT; | ||
480 | } | ||
481 | |||
482 | return 0; | ||
483 | LD_ABS_B: /* BPF_R0 = *(u8 *) (skb->data + imm32) */ | ||
484 | off = IMM; | ||
485 | load_byte: | ||
486 | ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 1, &tmp); | ||
487 | if (likely(ptr != NULL)) { | ||
488 | BPF_R0 = *(u8 *)ptr; | ||
489 | CONT; | ||
490 | } | ||
491 | |||
492 | return 0; | ||
493 | LD_IND_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + src_reg + imm32)) */ | ||
494 | off = IMM + SRC; | ||
495 | goto load_word; | ||
496 | LD_IND_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + src_reg + imm32)) */ | ||
497 | off = IMM + SRC; | ||
498 | goto load_half; | ||
499 | LD_IND_B: /* BPF_R0 = *(u8 *) (skb->data + src_reg + imm32) */ | ||
500 | off = IMM + SRC; | ||
501 | goto load_byte; | ||
502 | |||
503 | default_label: | ||
504 | /* If we ever reach this, we have a bug somewhere. */ | ||
505 | WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code); | ||
506 | return 0; | ||
507 | } | ||
508 | |||
509 | void __weak bpf_int_jit_compile(struct bpf_prog *prog) | ||
510 | { | ||
511 | } | ||
512 | |||
513 | /** | ||
514 | * bpf_prog_select_runtime - select execution runtime for BPF program | ||
515 | * @fp: bpf_prog populated with internal BPF program | ||
516 | * | ||
517 | * try to JIT internal BPF program, if JIT is not available select interpreter | ||
518 | * BPF program will be executed via BPF_PROG_RUN() macro | ||
519 | */ | ||
520 | void bpf_prog_select_runtime(struct bpf_prog *fp) | ||
521 | { | ||
522 | fp->bpf_func = (void *) __bpf_prog_run; | ||
523 | |||
524 | /* Probe if internal BPF can be JITed */ | ||
525 | bpf_int_jit_compile(fp); | ||
526 | } | ||
527 | EXPORT_SYMBOL_GPL(bpf_prog_select_runtime); | ||
528 | |||
529 | /* free internal BPF program */ | ||
530 | void bpf_prog_free(struct bpf_prog *fp) | ||
531 | { | ||
532 | bpf_jit_free(fp); | ||
533 | } | ||
534 | EXPORT_SYMBOL_GPL(bpf_prog_free); | ||
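The new interpreter in kernel/bpf/core.c dispatches instructions through a computed-goto jump table (GCC's labels-as-values extension) rather than a switch: every handler ends by advancing the instruction pointer and jumping straight to the next handler's label. Below is a minimal standalone sketch of that dispatch technique; the toy opcodes are made up and are not part of the eBPF instruction set.

/* Computed-goto dispatch, in the style of __bpf_prog_run(). */
#include <stdio.h>

enum { OP_INC, OP_DBL, OP_END };

static long run(const unsigned char *insn)
{
        static const void *jumptable[256] = {
                [0 ... 255] = &&default_label,  /* unknown opcodes */
                [OP_INC] = &&do_inc,
                [OP_DBL] = &&do_dbl,
                [OP_END] = &&do_end,
        };
        long acc = 1;

#define CONT ({ insn++; goto select_insn; })

select_insn:
        goto *jumptable[*insn];

do_inc:
        acc += 1;
        CONT;
do_dbl:
        acc *= 2;
        CONT;
do_end:
        return acc;
default_label:
        fprintf(stderr, "unknown opcode %02x\n", *insn);
        return 0;
}

int main(void)
{
        const unsigned char prog[] = { OP_INC, OP_DBL, OP_INC, OP_END };

        printf("%ld\n", run(prog));     /* ((1 + 1) * 2) + 1 = 5 */
        return 0;
}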
diff --git a/kernel/capability.c b/kernel/capability.c
index a5cf13c018ce..989f5bfc57dc 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -258,6 +258,10 @@ SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data) | |||
258 | i++; | 258 | i++; |
259 | } | 259 | } |
260 | 260 | ||
261 | effective.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK; | ||
262 | permitted.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK; | ||
263 | inheritable.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK; | ||
264 | |||
261 | new = prepare_creds(); | 265 | new = prepare_creds(); |
262 | if (!new) | 266 | if (!new) |
263 | return -ENOMEM; | 267 | return -ENOMEM; |
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 2f7c760305ca..379650b984f8 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -2472,7 +2472,7 @@ static void kdb_gmtime(struct timespec *tv, struct kdb_tm *tm) | |||
2472 | static void kdb_sysinfo(struct sysinfo *val) | 2472 | static void kdb_sysinfo(struct sysinfo *val) |
2473 | { | 2473 | { |
2474 | struct timespec uptime; | 2474 | struct timespec uptime; |
2475 | do_posix_clock_monotonic_gettime(&uptime); | 2475 | ktime_get_ts(&uptime); |
2476 | memset(val, 0, sizeof(*val)); | 2476 | memset(val, 0, sizeof(*val)); |
2477 | val->uptime = uptime.tv_sec; | 2477 | val->uptime = uptime.tv_sec; |
2478 | val->loads[0] = avenrun[0]; | 2478 | val->loads[0] = avenrun[0]; |
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 54996b71e66d..ef90b04d783f 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -46,42 +46,25 @@ void __delayacct_tsk_init(struct task_struct *tsk) | |||
46 | } | 46 | } |
47 | 47 | ||
48 | /* | 48 | /* |
49 | * Start accounting for a delay statistic using | 49 | * Finish delay accounting for a statistic using its timestamps (@start), |
50 | * its starting timestamp (@start) | 50 | * accumalator (@total) and @count |
51 | */ | 51 | */ |
52 | 52 | static void delayacct_end(u64 *start, u64 *total, u32 *count) | |
53 | static inline void delayacct_start(struct timespec *start) | ||
54 | { | 53 | { |
55 | do_posix_clock_monotonic_gettime(start); | 54 | s64 ns = ktime_get_ns() - *start; |
56 | } | ||
57 | |||
58 | /* | ||
59 | * Finish delay accounting for a statistic using | ||
60 | * its timestamps (@start, @end), accumalator (@total) and @count | ||
61 | */ | ||
62 | |||
63 | static void delayacct_end(struct timespec *start, struct timespec *end, | ||
64 | u64 *total, u32 *count) | ||
65 | { | ||
66 | struct timespec ts; | ||
67 | s64 ns; | ||
68 | unsigned long flags; | 55 | unsigned long flags; |
69 | 56 | ||
70 | do_posix_clock_monotonic_gettime(end); | 57 | if (ns > 0) { |
71 | ts = timespec_sub(*end, *start); | 58 | spin_lock_irqsave(¤t->delays->lock, flags); |
72 | ns = timespec_to_ns(&ts); | 59 | *total += ns; |
73 | if (ns < 0) | 60 | (*count)++; |
74 | return; | 61 | spin_unlock_irqrestore(¤t->delays->lock, flags); |
75 | 62 | } | |
76 | spin_lock_irqsave(¤t->delays->lock, flags); | ||
77 | *total += ns; | ||
78 | (*count)++; | ||
79 | spin_unlock_irqrestore(¤t->delays->lock, flags); | ||
80 | } | 63 | } |
81 | 64 | ||
82 | void __delayacct_blkio_start(void) | 65 | void __delayacct_blkio_start(void) |
83 | { | 66 | { |
84 | delayacct_start(¤t->delays->blkio_start); | 67 | current->delays->blkio_start = ktime_get_ns(); |
85 | } | 68 | } |
86 | 69 | ||
87 | void __delayacct_blkio_end(void) | 70 | void __delayacct_blkio_end(void) |
@@ -89,35 +72,29 @@ void __delayacct_blkio_end(void) | |||
89 | if (current->delays->flags & DELAYACCT_PF_SWAPIN) | 72 | if (current->delays->flags & DELAYACCT_PF_SWAPIN) |
90 | /* Swapin block I/O */ | 73 | /* Swapin block I/O */ |
91 | delayacct_end(¤t->delays->blkio_start, | 74 | delayacct_end(¤t->delays->blkio_start, |
92 | ¤t->delays->blkio_end, | ||
93 | ¤t->delays->swapin_delay, | 75 | ¤t->delays->swapin_delay, |
94 | ¤t->delays->swapin_count); | 76 | ¤t->delays->swapin_count); |
95 | else /* Other block I/O */ | 77 | else /* Other block I/O */ |
96 | delayacct_end(¤t->delays->blkio_start, | 78 | delayacct_end(¤t->delays->blkio_start, |
97 | ¤t->delays->blkio_end, | ||
98 | ¤t->delays->blkio_delay, | 79 | ¤t->delays->blkio_delay, |
99 | ¤t->delays->blkio_count); | 80 | ¤t->delays->blkio_count); |
100 | } | 81 | } |
101 | 82 | ||
102 | int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) | 83 | int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) |
103 | { | 84 | { |
104 | s64 tmp; | ||
105 | unsigned long t1; | ||
106 | unsigned long long t2, t3; | ||
107 | unsigned long flags; | ||
108 | struct timespec ts; | ||
109 | cputime_t utime, stime, stimescaled, utimescaled; | 85 | cputime_t utime, stime, stimescaled, utimescaled; |
86 | unsigned long long t2, t3; | ||
87 | unsigned long flags, t1; | ||
88 | s64 tmp; | ||
110 | 89 | ||
111 | tmp = (s64)d->cpu_run_real_total; | ||
112 | task_cputime(tsk, &utime, &stime); | 90 | task_cputime(tsk, &utime, &stime); |
113 | cputime_to_timespec(utime + stime, &ts); | 91 | tmp = (s64)d->cpu_run_real_total; |
114 | tmp += timespec_to_ns(&ts); | 92 | tmp += cputime_to_nsecs(utime + stime); |
115 | d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp; | 93 | d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp; |
116 | 94 | ||
117 | tmp = (s64)d->cpu_scaled_run_real_total; | ||
118 | task_cputime_scaled(tsk, &utimescaled, &stimescaled); | 95 | task_cputime_scaled(tsk, &utimescaled, &stimescaled); |
119 | cputime_to_timespec(utimescaled + stimescaled, &ts); | 96 | tmp = (s64)d->cpu_scaled_run_real_total; |
120 | tmp += timespec_to_ns(&ts); | 97 | tmp += cputime_to_nsecs(utimescaled + stimescaled); |
121 | d->cpu_scaled_run_real_total = | 98 | d->cpu_scaled_run_real_total = |
122 | (tmp < (s64)d->cpu_scaled_run_real_total) ? 0 : tmp; | 99 | (tmp < (s64)d->cpu_scaled_run_real_total) ? 0 : tmp; |
123 | 100 | ||
@@ -169,13 +146,12 @@ __u64 __delayacct_blkio_ticks(struct task_struct *tsk) | |||
169 | 146 | ||
170 | void __delayacct_freepages_start(void) | 147 | void __delayacct_freepages_start(void) |
171 | { | 148 | { |
172 | delayacct_start(¤t->delays->freepages_start); | 149 | current->delays->freepages_start = ktime_get_ns(); |
173 | } | 150 | } |
174 | 151 | ||
175 | void __delayacct_freepages_end(void) | 152 | void __delayacct_freepages_end(void) |
176 | { | 153 | { |
177 | delayacct_end(¤t->delays->freepages_start, | 154 | delayacct_end(¤t->delays->freepages_start, |
178 | ¤t->delays->freepages_end, | ||
179 | ¤t->delays->freepages_delay, | 155 | ¤t->delays->freepages_delay, |
180 | ¤t->delays->freepages_count); | 156 | ¤t->delays->freepages_count); |
181 | } | 157 | } |
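
The rewritten delayacct_end() replaces the timespec pair with a single nanosecond start stamp: record ktime_get_ns() at the start, subtract it at the end, and fold the positive delta into an accumulator and count under a lock. The sketch below models that pattern in userspace with a mutex and clock_gettime(); the names are illustrative, not the kernel API.

	#include <pthread.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <time.h>
	#include <unistd.h>

	static pthread_mutex_t delays_lock = PTHREAD_MUTEX_INITIALIZER;
	static uint64_t blkio_delay;   /* accumulated nanoseconds */
	static uint32_t blkio_count;   /* number of completed delays */

	static uint64_t now_ns(void)
	{
	        struct timespec ts;

	        clock_gettime(CLOCK_MONOTONIC, &ts);
	        return (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
	}

	/* Mirror of the simplified delayacct_end(): one start stamp, one delta. */
	static void delay_end(uint64_t start, uint64_t *total, uint32_t *count)
	{
	        int64_t ns = (int64_t)(now_ns() - start);

	        if (ns > 0) {
	                pthread_mutex_lock(&delays_lock);
	                *total += ns;
	                (*count)++;
	                pthread_mutex_unlock(&delays_lock);
	        }
	}

	int main(void)
	{
	        uint64_t start = now_ns();      /* analogue of __delayacct_blkio_start() */

	        usleep(10000);                  /* pretend we waited on block I/O */
	        delay_end(start, &blkio_delay, &blkio_count);

	        printf("delayed %llu ns over %u events\n",
	               (unsigned long long)blkio_delay, blkio_count);
	        return 0;
	}
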
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 6f3254e8c137..1d0af8a2c646 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c | |||
@@ -167,6 +167,11 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, | |||
167 | /* For mmu_notifiers */ | 167 | /* For mmu_notifiers */ |
168 | const unsigned long mmun_start = addr; | 168 | const unsigned long mmun_start = addr; |
169 | const unsigned long mmun_end = addr + PAGE_SIZE; | 169 | const unsigned long mmun_end = addr + PAGE_SIZE; |
170 | struct mem_cgroup *memcg; | ||
171 | |||
172 | err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg); | ||
173 | if (err) | ||
174 | return err; | ||
170 | 175 | ||
171 | /* For try_to_free_swap() and munlock_vma_page() below */ | 176 | /* For try_to_free_swap() and munlock_vma_page() below */ |
172 | lock_page(page); | 177 | lock_page(page); |
@@ -179,6 +184,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, | |||
179 | 184 | ||
180 | get_page(kpage); | 185 | get_page(kpage); |
181 | page_add_new_anon_rmap(kpage, vma, addr); | 186 | page_add_new_anon_rmap(kpage, vma, addr); |
187 | mem_cgroup_commit_charge(kpage, memcg, false); | ||
188 | lru_cache_add_active_or_unevictable(kpage, vma); | ||
182 | 189 | ||
183 | if (!PageAnon(page)) { | 190 | if (!PageAnon(page)) { |
184 | dec_mm_counter(mm, MM_FILEPAGES); | 191 | dec_mm_counter(mm, MM_FILEPAGES); |
@@ -200,6 +207,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, | |||
200 | 207 | ||
201 | err = 0; | 208 | err = 0; |
202 | unlock: | 209 | unlock: |
210 | mem_cgroup_cancel_charge(kpage, memcg); | ||
203 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | 211 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); |
204 | unlock_page(page); | 212 | unlock_page(page); |
205 | return err; | 213 | return err; |
@@ -315,18 +323,11 @@ retry: | |||
315 | if (!new_page) | 323 | if (!new_page) |
316 | goto put_old; | 324 | goto put_old; |
317 | 325 | ||
318 | if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL)) | ||
319 | goto put_new; | ||
320 | |||
321 | __SetPageUptodate(new_page); | 326 | __SetPageUptodate(new_page); |
322 | copy_highpage(new_page, old_page); | 327 | copy_highpage(new_page, old_page); |
323 | copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); | 328 | copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); |
324 | 329 | ||
325 | ret = __replace_page(vma, vaddr, old_page, new_page); | 330 | ret = __replace_page(vma, vaddr, old_page, new_page); |
326 | if (ret) | ||
327 | mem_cgroup_uncharge_page(new_page); | ||
328 | |||
329 | put_new: | ||
330 | page_cache_release(new_page); | 331 | page_cache_release(new_page); |
331 | put_old: | 332 | put_old: |
332 | put_page(old_page); | 333 | put_page(old_page); |
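
The uprobes hunk moves page charging to the newer memcg API: try the charge before the replacement page is installed, commit it once the page is wired into the rmap, and cancel on the unwind path. The generic try/commit/cancel shape is sketched below with stub functions defined in the example itself; these are not the kernel's memcg calls, just a model of the two-phase pattern.

	#include <stdbool.h>
	#include <stdio.h>

	/* Illustrative stand-ins, NOT the kernel's memcg functions. */
	static long charged_pages;

	static bool try_charge(void)
	{
	        charged_pages++;                /* reserve the accounting up front */
	        return true;
	}

	static void commit_charge(void)
	{
	        /* nothing to do: the reservation becomes permanent */
	}

	static void cancel_charge(void)
	{
	        charged_pages--;                /* give the reservation back */
	}

	static int replace_page(bool install_ok)
	{
	        if (!try_charge())
	                return -1;              /* fail early, nothing to unwind */

	        if (!install_ok) {
	                cancel_charge();        /* unwind the reservation on error */
	                return -1;
	        }

	        commit_charge();                /* the new page now owns the charge */
	        return 0;
	}

	int main(void)
	{
	        replace_page(true);
	        replace_page(false);
	        printf("pages still charged: %ld\n", charged_pages);
	        return 0;
	}
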
diff --git a/kernel/exit.c b/kernel/exit.c index e5c4668f1799..32c58f7433a3 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
@@ -59,7 +59,7 @@ | |||
59 | #include <asm/pgtable.h> | 59 | #include <asm/pgtable.h> |
60 | #include <asm/mmu_context.h> | 60 | #include <asm/mmu_context.h> |
61 | 61 | ||
62 | static void exit_mm(struct task_struct * tsk); | 62 | static void exit_mm(struct task_struct *tsk); |
63 | 63 | ||
64 | static void __unhash_process(struct task_struct *p, bool group_dead) | 64 | static void __unhash_process(struct task_struct *p, bool group_dead) |
65 | { | 65 | { |
@@ -151,7 +151,7 @@ static void __exit_signal(struct task_struct *tsk) | |||
151 | spin_unlock(&sighand->siglock); | 151 | spin_unlock(&sighand->siglock); |
152 | 152 | ||
153 | __cleanup_sighand(sighand); | 153 | __cleanup_sighand(sighand); |
154 | clear_tsk_thread_flag(tsk,TIF_SIGPENDING); | 154 | clear_tsk_thread_flag(tsk, TIF_SIGPENDING); |
155 | if (group_dead) { | 155 | if (group_dead) { |
156 | flush_sigqueue(&sig->shared_pending); | 156 | flush_sigqueue(&sig->shared_pending); |
157 | tty_kref_put(tty); | 157 | tty_kref_put(tty); |
@@ -168,7 +168,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp) | |||
168 | } | 168 | } |
169 | 169 | ||
170 | 170 | ||
171 | void release_task(struct task_struct * p) | 171 | void release_task(struct task_struct *p) |
172 | { | 172 | { |
173 | struct task_struct *leader; | 173 | struct task_struct *leader; |
174 | int zap_leader; | 174 | int zap_leader; |
@@ -192,7 +192,8 @@ repeat: | |||
192 | */ | 192 | */ |
193 | zap_leader = 0; | 193 | zap_leader = 0; |
194 | leader = p->group_leader; | 194 | leader = p->group_leader; |
195 | if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { | 195 | if (leader != p && thread_group_empty(leader) |
196 | && leader->exit_state == EXIT_ZOMBIE) { | ||
196 | /* | 197 | /* |
197 | * If we were the last child thread and the leader has | 198 | * If we were the last child thread and the leader has |
198 | * exited already, and the leader's parent ignores SIGCHLD, | 199 | * exited already, and the leader's parent ignores SIGCHLD, |
@@ -241,7 +242,8 @@ struct pid *session_of_pgrp(struct pid *pgrp) | |||
241 | * | 242 | * |
242 | * "I ask you, have you ever known what it is to be an orphan?" | 243 | * "I ask you, have you ever known what it is to be an orphan?" |
243 | */ | 244 | */ |
244 | static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task) | 245 | static int will_become_orphaned_pgrp(struct pid *pgrp, |
246 | struct task_struct *ignored_task) | ||
245 | { | 247 | { |
246 | struct task_struct *p; | 248 | struct task_struct *p; |
247 | 249 | ||
@@ -294,9 +296,9 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent) | |||
294 | struct task_struct *ignored_task = tsk; | 296 | struct task_struct *ignored_task = tsk; |
295 | 297 | ||
296 | if (!parent) | 298 | if (!parent) |
297 | /* exit: our father is in a different pgrp than | 299 | /* exit: our father is in a different pgrp than |
298 | * we are and we were the only connection outside. | 300 | * we are and we were the only connection outside. |
299 | */ | 301 | */ |
300 | parent = tsk->real_parent; | 302 | parent = tsk->real_parent; |
301 | else | 303 | else |
302 | /* reparent: our child is in a different pgrp than | 304 | /* reparent: our child is in a different pgrp than |
@@ -405,7 +407,7 @@ assign_new_owner: | |||
405 | * Turn us into a lazy TLB process if we | 407 | * Turn us into a lazy TLB process if we |
406 | * aren't already.. | 408 | * aren't already.. |
407 | */ | 409 | */ |
408 | static void exit_mm(struct task_struct * tsk) | 410 | static void exit_mm(struct task_struct *tsk) |
409 | { | 411 | { |
410 | struct mm_struct *mm = tsk->mm; | 412 | struct mm_struct *mm = tsk->mm; |
411 | struct core_state *core_state; | 413 | struct core_state *core_state; |
@@ -425,6 +427,7 @@ static void exit_mm(struct task_struct * tsk) | |||
425 | core_state = mm->core_state; | 427 | core_state = mm->core_state; |
426 | if (core_state) { | 428 | if (core_state) { |
427 | struct core_thread self; | 429 | struct core_thread self; |
430 | |||
428 | up_read(&mm->mmap_sem); | 431 | up_read(&mm->mmap_sem); |
429 | 432 | ||
430 | self.task = tsk; | 433 | self.task = tsk; |
@@ -455,6 +458,7 @@ static void exit_mm(struct task_struct * tsk) | |||
455 | task_unlock(tsk); | 458 | task_unlock(tsk); |
456 | mm_update_next_owner(mm); | 459 | mm_update_next_owner(mm); |
457 | mmput(mm); | 460 | mmput(mm); |
461 | clear_thread_flag(TIF_MEMDIE); | ||
458 | } | 462 | } |
459 | 463 | ||
460 | /* | 464 | /* |
@@ -565,6 +569,7 @@ static void forget_original_parent(struct task_struct *father) | |||
565 | 569 | ||
566 | list_for_each_entry_safe(p, n, &father->children, sibling) { | 570 | list_for_each_entry_safe(p, n, &father->children, sibling) { |
567 | struct task_struct *t = p; | 571 | struct task_struct *t = p; |
572 | |||
568 | do { | 573 | do { |
569 | t->real_parent = reaper; | 574 | t->real_parent = reaper; |
570 | if (t->parent == father) { | 575 | if (t->parent == father) { |
@@ -598,7 +603,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead) | |||
598 | /* | 603 | /* |
599 | * This does two things: | 604 | * This does two things: |
600 | * | 605 | * |
601 | * A. Make init inherit all the child processes | 606 | * A. Make init inherit all the child processes |
602 | * B. Check to see if any process groups have become orphaned | 607 | * B. Check to see if any process groups have become orphaned |
603 | * as a result of our exiting, and if they have any stopped | 608 | * as a result of our exiting, and if they have any stopped |
604 | * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) | 609 | * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) |
@@ -648,9 +653,8 @@ static void check_stack_usage(void) | |||
648 | 653 | ||
649 | spin_lock(&low_water_lock); | 654 | spin_lock(&low_water_lock); |
650 | if (free < lowest_to_date) { | 655 | if (free < lowest_to_date) { |
651 | printk(KERN_WARNING "%s (%d) used greatest stack depth: " | 656 | pr_warn("%s (%d) used greatest stack depth: %lu bytes left\n", |
652 | "%lu bytes left\n", | 657 | current->comm, task_pid_nr(current), free); |
653 | current->comm, task_pid_nr(current), free); | ||
654 | lowest_to_date = free; | 658 | lowest_to_date = free; |
655 | } | 659 | } |
656 | spin_unlock(&low_water_lock); | 660 | spin_unlock(&low_water_lock); |
@@ -691,8 +695,7 @@ void do_exit(long code) | |||
691 | * leave this task alone and wait for reboot. | 695 | * leave this task alone and wait for reboot. |
692 | */ | 696 | */ |
693 | if (unlikely(tsk->flags & PF_EXITING)) { | 697 | if (unlikely(tsk->flags & PF_EXITING)) { |
694 | printk(KERN_ALERT | 698 | pr_alert("Fixing recursive fault but reboot is needed!\n"); |
695 | "Fixing recursive fault but reboot is needed!\n"); | ||
696 | /* | 699 | /* |
697 | * We can do this unlocked here. The futex code uses | 700 | * We can do this unlocked here. The futex code uses |
698 | * this flag just to verify whether the pi state | 701 | * this flag just to verify whether the pi state |
@@ -716,9 +719,9 @@ void do_exit(long code) | |||
716 | raw_spin_unlock_wait(&tsk->pi_lock); | 719 | raw_spin_unlock_wait(&tsk->pi_lock); |
717 | 720 | ||
718 | if (unlikely(in_atomic())) | 721 | if (unlikely(in_atomic())) |
719 | printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", | 722 | pr_info("note: %s[%d] exited with preempt_count %d\n", |
720 | current->comm, task_pid_nr(current), | 723 | current->comm, task_pid_nr(current), |
721 | preempt_count()); | 724 | preempt_count()); |
722 | 725 | ||
723 | acct_update_integrals(tsk); | 726 | acct_update_integrals(tsk); |
724 | /* sync mm's RSS info before statistics gathering */ | 727 | /* sync mm's RSS info before statistics gathering */ |
@@ -836,7 +839,6 @@ void do_exit(long code) | |||
836 | for (;;) | 839 | for (;;) |
837 | cpu_relax(); /* For when BUG is null */ | 840 | cpu_relax(); /* For when BUG is null */ |
838 | } | 841 | } |
839 | |||
840 | EXPORT_SYMBOL_GPL(do_exit); | 842 | EXPORT_SYMBOL_GPL(do_exit); |
841 | 843 | ||
842 | void complete_and_exit(struct completion *comp, long code) | 844 | void complete_and_exit(struct completion *comp, long code) |
@@ -846,7 +848,6 @@ void complete_and_exit(struct completion *comp, long code) | |||
846 | 848 | ||
847 | do_exit(code); | 849 | do_exit(code); |
848 | } | 850 | } |
849 | |||
850 | EXPORT_SYMBOL(complete_and_exit); | 851 | EXPORT_SYMBOL(complete_and_exit); |
851 | 852 | ||
852 | SYSCALL_DEFINE1(exit, int, error_code) | 853 | SYSCALL_DEFINE1(exit, int, error_code) |
@@ -869,6 +870,7 @@ do_group_exit(int exit_code) | |||
869 | exit_code = sig->group_exit_code; | 870 | exit_code = sig->group_exit_code; |
870 | else if (!thread_group_empty(current)) { | 871 | else if (!thread_group_empty(current)) { |
871 | struct sighand_struct *const sighand = current->sighand; | 872 | struct sighand_struct *const sighand = current->sighand; |
873 | |||
872 | spin_lock_irq(&sighand->siglock); | 874 | spin_lock_irq(&sighand->siglock); |
873 | if (signal_group_exit(sig)) | 875 | if (signal_group_exit(sig)) |
874 | /* Another thread got here before we took the lock. */ | 876 | /* Another thread got here before we took the lock. */ |
@@ -1033,9 +1035,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) | |||
1033 | * as other threads in the parent group can be right | 1035 | * as other threads in the parent group can be right |
1034 | * here reaping other children at the same time. | 1036 | * here reaping other children at the same time. |
1035 | * | 1037 | * |
1036 | * We use thread_group_cputime_adjusted() to get times for the thread | 1038 | * We use thread_group_cputime_adjusted() to get times for |
1037 | * group, which consolidates times for all threads in the | 1039 | * the thread group, which consolidates times for all threads |
1038 | * group including the group leader. | 1040 | * in the group including the group leader. |
1039 | */ | 1041 | */ |
1040 | thread_group_cputime_adjusted(p, &tgutime, &tgstime); | 1042 | thread_group_cputime_adjusted(p, &tgutime, &tgstime); |
1041 | spin_lock_irq(&p->real_parent->sighand->siglock); | 1043 | spin_lock_irq(&p->real_parent->sighand->siglock); |
@@ -1417,6 +1419,7 @@ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk) | |||
1417 | 1419 | ||
1418 | list_for_each_entry(p, &tsk->children, sibling) { | 1420 | list_for_each_entry(p, &tsk->children, sibling) { |
1419 | int ret = wait_consider_task(wo, 0, p); | 1421 | int ret = wait_consider_task(wo, 0, p); |
1422 | |||
1420 | if (ret) | 1423 | if (ret) |
1421 | return ret; | 1424 | return ret; |
1422 | } | 1425 | } |
@@ -1430,6 +1433,7 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk) | |||
1430 | 1433 | ||
1431 | list_for_each_entry(p, &tsk->ptraced, ptrace_entry) { | 1434 | list_for_each_entry(p, &tsk->ptraced, ptrace_entry) { |
1432 | int ret = wait_consider_task(wo, 1, p); | 1435 | int ret = wait_consider_task(wo, 1, p); |
1436 | |||
1433 | if (ret) | 1437 | if (ret) |
1434 | return ret; | 1438 | return ret; |
1435 | } | 1439 | } |
diff --git a/kernel/fork.c b/kernel/fork.c index 962885edbe53..0cf9cdb6e491 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -315,6 +315,15 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) | |||
315 | goto free_ti; | 315 | goto free_ti; |
316 | 316 | ||
317 | tsk->stack = ti; | 317 | tsk->stack = ti; |
318 | #ifdef CONFIG_SECCOMP | ||
319 | /* | ||
320 | * We must handle setting up seccomp filters once we're under | ||
321 | * the sighand lock in case orig has changed between now and | ||
322 | * then. Until then, filter must be NULL to avoid messing up | ||
323 | * the usage counts on the error path calling free_task. | ||
324 | */ | ||
325 | tsk->seccomp.filter = NULL; | ||
326 | #endif | ||
318 | 327 | ||
319 | setup_thread_stack(tsk, orig); | 328 | setup_thread_stack(tsk, orig); |
320 | clear_user_return_notifier(tsk); | 329 | clear_user_return_notifier(tsk); |
@@ -365,12 +374,11 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
365 | */ | 374 | */ |
366 | down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING); | 375 | down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING); |
367 | 376 | ||
368 | mm->locked_vm = 0; | 377 | mm->total_vm = oldmm->total_vm; |
369 | mm->mmap = NULL; | 378 | mm->shared_vm = oldmm->shared_vm; |
370 | mm->vmacache_seqnum = 0; | 379 | mm->exec_vm = oldmm->exec_vm; |
371 | mm->map_count = 0; | 380 | mm->stack_vm = oldmm->stack_vm; |
372 | cpumask_clear(mm_cpumask(mm)); | 381 | |
373 | mm->mm_rb = RB_ROOT; | ||
374 | rb_link = &mm->mm_rb.rb_node; | 382 | rb_link = &mm->mm_rb.rb_node; |
375 | rb_parent = NULL; | 383 | rb_parent = NULL; |
376 | pprev = &mm->mmap; | 384 | pprev = &mm->mmap; |
@@ -421,7 +429,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
421 | atomic_dec(&inode->i_writecount); | 429 | atomic_dec(&inode->i_writecount); |
422 | mutex_lock(&mapping->i_mmap_mutex); | 430 | mutex_lock(&mapping->i_mmap_mutex); |
423 | if (tmp->vm_flags & VM_SHARED) | 431 | if (tmp->vm_flags & VM_SHARED) |
424 | mapping->i_mmap_writable++; | 432 | atomic_inc(&mapping->i_mmap_writable); |
425 | flush_dcache_mmap_lock(mapping); | 433 | flush_dcache_mmap_lock(mapping); |
426 | /* insert tmp into the share list, just after mpnt */ | 434 | /* insert tmp into the share list, just after mpnt */ |
427 | if (unlikely(tmp->vm_flags & VM_NONLINEAR)) | 435 | if (unlikely(tmp->vm_flags & VM_NONLINEAR)) |
@@ -527,19 +535,37 @@ static void mm_init_aio(struct mm_struct *mm) | |||
527 | #endif | 535 | #endif |
528 | } | 536 | } |
529 | 537 | ||
538 | static void mm_init_owner(struct mm_struct *mm, struct task_struct *p) | ||
539 | { | ||
540 | #ifdef CONFIG_MEMCG | ||
541 | mm->owner = p; | ||
542 | #endif | ||
543 | } | ||
544 | |||
530 | static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) | 545 | static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) |
531 | { | 546 | { |
547 | mm->mmap = NULL; | ||
548 | mm->mm_rb = RB_ROOT; | ||
549 | mm->vmacache_seqnum = 0; | ||
532 | atomic_set(&mm->mm_users, 1); | 550 | atomic_set(&mm->mm_users, 1); |
533 | atomic_set(&mm->mm_count, 1); | 551 | atomic_set(&mm->mm_count, 1); |
534 | init_rwsem(&mm->mmap_sem); | 552 | init_rwsem(&mm->mmap_sem); |
535 | INIT_LIST_HEAD(&mm->mmlist); | 553 | INIT_LIST_HEAD(&mm->mmlist); |
536 | mm->core_state = NULL; | 554 | mm->core_state = NULL; |
537 | atomic_long_set(&mm->nr_ptes, 0); | 555 | atomic_long_set(&mm->nr_ptes, 0); |
556 | mm->map_count = 0; | ||
557 | mm->locked_vm = 0; | ||
558 | mm->pinned_vm = 0; | ||
538 | memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); | 559 | memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); |
539 | spin_lock_init(&mm->page_table_lock); | 560 | spin_lock_init(&mm->page_table_lock); |
561 | mm_init_cpumask(mm); | ||
540 | mm_init_aio(mm); | 562 | mm_init_aio(mm); |
541 | mm_init_owner(mm, p); | 563 | mm_init_owner(mm, p); |
564 | mmu_notifier_mm_init(mm); | ||
542 | clear_tlb_flush_pending(mm); | 565 | clear_tlb_flush_pending(mm); |
566 | #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS | ||
567 | mm->pmd_huge_pte = NULL; | ||
568 | #endif | ||
543 | 569 | ||
544 | if (current->mm) { | 570 | if (current->mm) { |
545 | mm->flags = current->mm->flags & MMF_INIT_MASK; | 571 | mm->flags = current->mm->flags & MMF_INIT_MASK; |
@@ -549,11 +575,17 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) | |||
549 | mm->def_flags = 0; | 575 | mm->def_flags = 0; |
550 | } | 576 | } |
551 | 577 | ||
552 | if (likely(!mm_alloc_pgd(mm))) { | 578 | if (mm_alloc_pgd(mm)) |
553 | mmu_notifier_mm_init(mm); | 579 | goto fail_nopgd; |
554 | return mm; | 580 | |
555 | } | 581 | if (init_new_context(p, mm)) |
582 | goto fail_nocontext; | ||
556 | 583 | ||
584 | return mm; | ||
585 | |||
586 | fail_nocontext: | ||
587 | mm_free_pgd(mm); | ||
588 | fail_nopgd: | ||
557 | free_mm(mm); | 589 | free_mm(mm); |
558 | return NULL; | 590 | return NULL; |
559 | } | 591 | } |
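
The reworked mm_init() replaces the nested "if (likely(!mm_alloc_pgd(mm)))" success path with the usual kernel goto-unwind style: each allocation that can fail gets a label that undoes everything allocated before it, in reverse order. A self-contained sketch of that error-unwinding shape (with made-up allocation steps, not the real mm ones) follows.

	#include <stdio.h>
	#include <stdlib.h>

	struct ctx {
	        void *pgd;      /* stand-in for the page-table allocation */
	        void *mmu;      /* stand-in for the arch context allocation */
	};

	/* goto-based unwinding: later failures free earlier allocations in reverse. */
	static struct ctx *ctx_init(void)
	{
	        struct ctx *c = calloc(1, sizeof(*c));

	        if (!c)
	                return NULL;

	        c->pgd = malloc(64);
	        if (!c->pgd)
	                goto fail_nopgd;

	        c->mmu = malloc(64);
	        if (!c->mmu)
	                goto fail_nocontext;

	        return c;

	fail_nocontext:
	        free(c->pgd);
	fail_nopgd:
	        free(c);
	        return NULL;
	}

	int main(void)
	{
	        struct ctx *c = ctx_init();

	        printf("init %s\n", c ? "succeeded" : "failed");
	        if (c) {
	                free(c->mmu);
	                free(c->pgd);
	                free(c);
	        }
	        return 0;
	}
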
@@ -587,7 +619,6 @@ struct mm_struct *mm_alloc(void) | |||
587 | return NULL; | 619 | return NULL; |
588 | 620 | ||
589 | memset(mm, 0, sizeof(*mm)); | 621 | memset(mm, 0, sizeof(*mm)); |
590 | mm_init_cpumask(mm); | ||
591 | return mm_init(mm, current); | 622 | return mm_init(mm, current); |
592 | } | 623 | } |
593 | 624 | ||
@@ -819,17 +850,10 @@ static struct mm_struct *dup_mm(struct task_struct *tsk) | |||
819 | goto fail_nomem; | 850 | goto fail_nomem; |
820 | 851 | ||
821 | memcpy(mm, oldmm, sizeof(*mm)); | 852 | memcpy(mm, oldmm, sizeof(*mm)); |
822 | mm_init_cpumask(mm); | ||
823 | 853 | ||
824 | #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS | ||
825 | mm->pmd_huge_pte = NULL; | ||
826 | #endif | ||
827 | if (!mm_init(mm, tsk)) | 854 | if (!mm_init(mm, tsk)) |
828 | goto fail_nomem; | 855 | goto fail_nomem; |
829 | 856 | ||
830 | if (init_new_context(tsk, mm)) | ||
831 | goto fail_nocontext; | ||
832 | |||
833 | dup_mm_exe_file(oldmm, mm); | 857 | dup_mm_exe_file(oldmm, mm); |
834 | 858 | ||
835 | err = dup_mmap(mm, oldmm); | 859 | err = dup_mmap(mm, oldmm); |
@@ -851,15 +875,6 @@ free_pt: | |||
851 | 875 | ||
852 | fail_nomem: | 876 | fail_nomem: |
853 | return NULL; | 877 | return NULL; |
854 | |||
855 | fail_nocontext: | ||
856 | /* | ||
857 | * If init_new_context() failed, we cannot use mmput() to free the mm | ||
858 | * because it calls destroy_context() | ||
859 | */ | ||
860 | mm_free_pgd(mm); | ||
861 | free_mm(mm); | ||
862 | return NULL; | ||
863 | } | 878 | } |
864 | 879 | ||
865 | static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) | 880 | static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) |
@@ -1081,6 +1096,39 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) | |||
1081 | return 0; | 1096 | return 0; |
1082 | } | 1097 | } |
1083 | 1098 | ||
1099 | static void copy_seccomp(struct task_struct *p) | ||
1100 | { | ||
1101 | #ifdef CONFIG_SECCOMP | ||
1102 | /* | ||
1103 | * Must be called with sighand->lock held, which is common to | ||
1104 | * all threads in the group. Holding cred_guard_mutex is not | ||
1105 | * needed because this new task is not yet running and cannot | ||
1106 | * be racing exec. | ||
1107 | */ | ||
1108 | assert_spin_locked(¤t->sighand->siglock); | ||
1109 | |||
1110 | /* Ref-count the new filter user, and assign it. */ | ||
1111 | get_seccomp_filter(current); | ||
1112 | p->seccomp = current->seccomp; | ||
1113 | |||
1114 | /* | ||
1115 | * Explicitly enable no_new_privs here in case it got set | ||
1116 | * between the task_struct being duplicated and holding the | ||
1117 | * sighand lock. The seccomp state and nnp must be in sync. | ||
1118 | */ | ||
1119 | if (task_no_new_privs(current)) | ||
1120 | task_set_no_new_privs(p); | ||
1121 | |||
1122 | /* | ||
1123 | * If the parent gained a seccomp mode after copying thread | ||
1124 | * flags and between before we held the sighand lock, we have | ||
1125 | * to manually enable the seccomp thread flag here. | ||
1126 | */ | ||
1127 | if (p->seccomp.mode != SECCOMP_MODE_DISABLED) | ||
1128 | set_tsk_thread_flag(p, TIF_SECCOMP); | ||
1129 | #endif | ||
1130 | } | ||
1131 | |||
1084 | SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr) | 1132 | SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr) |
1085 | { | 1133 | { |
1086 | current->clear_child_tid = tidptr; | 1134 | current->clear_child_tid = tidptr; |
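
copy_seccomp() is deliberately invoked later in copy_process(), once the parent's sighand->siglock is held, so the child cannot snapshot a filter or no_new_privs state that changed between dup_task_struct() and that point. As a very rough userspace illustration of "copy shared state only under the lock that serialises changes to it", consider the sketch below (plain pthreads, nothing seccomp-specific).

	#include <pthread.h>
	#include <stdio.h>

	static pthread_mutex_t siglock = PTHREAD_MUTEX_INITIALIZER;

	struct task {
	        int seccomp_mode;       /* stand-in for p->seccomp */
	        int no_new_privs;       /* stand-in for the NNP task flag */
	};

	static struct task parent = { .seccomp_mode = 1, .no_new_privs = 1 };

	/* Copy the parent's filter state only while holding the lock writers take. */
	static void copy_seccomp_state(struct task *child)
	{
	        pthread_mutex_lock(&siglock);
	        child->seccomp_mode = parent.seccomp_mode;
	        child->no_new_privs = parent.no_new_privs;
	        pthread_mutex_unlock(&siglock);
	}

	int main(void)
	{
	        struct task child = { 0 };

	        copy_seccomp_state(&child);
	        printf("child mode=%d nnp=%d\n", child.seccomp_mode, child.no_new_privs);
	        return 0;
	}
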
@@ -1098,13 +1146,6 @@ static void rt_mutex_init_task(struct task_struct *p) | |||
1098 | #endif | 1146 | #endif |
1099 | } | 1147 | } |
1100 | 1148 | ||
1101 | #ifdef CONFIG_MEMCG | ||
1102 | void mm_init_owner(struct mm_struct *mm, struct task_struct *p) | ||
1103 | { | ||
1104 | mm->owner = p; | ||
1105 | } | ||
1106 | #endif /* CONFIG_MEMCG */ | ||
1107 | |||
1108 | /* | 1149 | /* |
1109 | * Initialize POSIX timer handling for a single task. | 1150 | * Initialize POSIX timer handling for a single task. |
1110 | */ | 1151 | */ |
@@ -1195,7 +1236,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1195 | goto fork_out; | 1236 | goto fork_out; |
1196 | 1237 | ||
1197 | ftrace_graph_init_task(p); | 1238 | ftrace_graph_init_task(p); |
1198 | get_seccomp_filter(p); | ||
1199 | 1239 | ||
1200 | rt_mutex_init_task(p); | 1240 | rt_mutex_init_task(p); |
1201 | 1241 | ||
@@ -1261,9 +1301,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1261 | 1301 | ||
1262 | posix_cpu_timers_init(p); | 1302 | posix_cpu_timers_init(p); |
1263 | 1303 | ||
1264 | do_posix_clock_monotonic_gettime(&p->start_time); | 1304 | p->start_time = ktime_get_ns(); |
1265 | p->real_start_time = p->start_time; | 1305 | p->real_start_time = ktime_get_boot_ns(); |
1266 | monotonic_to_bootbased(&p->real_start_time); | ||
1267 | p->io_context = NULL; | 1306 | p->io_context = NULL; |
1268 | p->audit_context = NULL; | 1307 | p->audit_context = NULL; |
1269 | if (clone_flags & CLONE_THREAD) | 1308 | if (clone_flags & CLONE_THREAD) |
@@ -1306,10 +1345,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1306 | #ifdef CONFIG_DEBUG_MUTEXES | 1345 | #ifdef CONFIG_DEBUG_MUTEXES |
1307 | p->blocked_on = NULL; /* not blocked yet */ | 1346 | p->blocked_on = NULL; /* not blocked yet */ |
1308 | #endif | 1347 | #endif |
1309 | #ifdef CONFIG_MEMCG | ||
1310 | p->memcg_batch.do_batch = 0; | ||
1311 | p->memcg_batch.memcg = NULL; | ||
1312 | #endif | ||
1313 | #ifdef CONFIG_BCACHE | 1348 | #ifdef CONFIG_BCACHE |
1314 | p->sequential_io = 0; | 1349 | p->sequential_io = 0; |
1315 | p->sequential_io_avg = 0; | 1350 | p->sequential_io_avg = 0; |
@@ -1327,6 +1362,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1327 | if (retval) | 1362 | if (retval) |
1328 | goto bad_fork_cleanup_policy; | 1363 | goto bad_fork_cleanup_policy; |
1329 | /* copy all the process information */ | 1364 | /* copy all the process information */ |
1365 | shm_init_task(p); | ||
1330 | retval = copy_semundo(clone_flags, p); | 1366 | retval = copy_semundo(clone_flags, p); |
1331 | if (retval) | 1367 | if (retval) |
1332 | goto bad_fork_cleanup_audit; | 1368 | goto bad_fork_cleanup_audit; |
@@ -1436,6 +1472,12 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1436 | spin_lock(¤t->sighand->siglock); | 1472 | spin_lock(¤t->sighand->siglock); |
1437 | 1473 | ||
1438 | /* | 1474 | /* |
1475 | * Copy seccomp details explicitly here, in case they were changed | ||
1476 | * before holding sighand lock. | ||
1477 | */ | ||
1478 | copy_seccomp(p); | ||
1479 | |||
1480 | /* | ||
1439 | * Process group and session signals need to be delivered to just the | 1481 | * Process group and session signals need to be delivered to just the |
1440 | * parent before the fork or both the parent and the child after the | 1482 | * parent before the fork or both the parent and the child after the |
1441 | * fork. Restart if a signal comes in before we add the new process to | 1483 | * fork. Restart if a signal comes in before we add the new process to |
@@ -1872,6 +1914,11 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) | |||
1872 | */ | 1914 | */ |
1873 | exit_sem(current); | 1915 | exit_sem(current); |
1874 | } | 1916 | } |
1917 | if (unshare_flags & CLONE_NEWIPC) { | ||
1918 | /* Orphan segments in old ns (see sem above). */ | ||
1919 | exit_shm(current); | ||
1920 | shm_init_task(current); | ||
1921 | } | ||
1875 | 1922 | ||
1876 | if (new_nsproxy) | 1923 | if (new_nsproxy) |
1877 | switch_task_namespaces(current, new_nsproxy); | 1924 | switch_task_namespaces(current, new_nsproxy); |
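
The unshare() hunk makes CLONE_NEWIPC drop the caller's SysV shared-memory attachments from the old namespace (exit_shm) before re-initialising shm state for the new one. A minimal userspace call into that path might look like the following; it requires privilege (CAP_SYS_ADMIN), so treat it as a sketch rather than a drop-in test.

	#define _GNU_SOURCE
	#include <sched.h>
	#include <stdio.h>

	int main(void)
	{
	        /* Move this process into a fresh IPC namespace (needs privilege). */
	        if (unshare(CLONE_NEWIPC) != 0) {
	                perror("unshare(CLONE_NEWIPC)");
	                return 1;
	        }
	        puts("now in a new IPC namespace; attachments to the old namespace's shm are gone");
	        return 0;
	}
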
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c index 15ff01a76379..edf67c493a8e 100644 --- a/kernel/gcov/fs.c +++ b/kernel/gcov/fs.c | |||
@@ -784,8 +784,7 @@ static __init int gcov_fs_init(void) | |||
784 | 784 | ||
785 | err_remove: | 785 | err_remove: |
786 | pr_err("init failed\n"); | 786 | pr_err("init failed\n"); |
787 | if (root_node.dentry) | 787 | debugfs_remove(root_node.dentry); |
788 | debugfs_remove(root_node.dentry); | ||
789 | 788 | ||
790 | return rc; | 789 | return rc; |
791 | } | 790 | } |
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index 452d6f2ba21d..cf80e7b0ddab 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c | |||
@@ -341,8 +341,8 @@ static struct lock_class_key irq_nested_lock_class; | |||
341 | /* | 341 | /* |
342 | * irq_map_generic_chip - Map a generic chip for an irq domain | 342 | * irq_map_generic_chip - Map a generic chip for an irq domain |
343 | */ | 343 | */ |
344 | static int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, | 344 | int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, |
345 | irq_hw_number_t hw_irq) | 345 | irq_hw_number_t hw_irq) |
346 | { | 346 | { |
347 | struct irq_data *data = irq_get_irq_data(virq); | 347 | struct irq_data *data = irq_get_irq_data(virq); |
348 | struct irq_domain_chip_generic *dgc = d->gc; | 348 | struct irq_domain_chip_generic *dgc = d->gc; |
@@ -394,6 +394,7 @@ static int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, | |||
394 | irq_modify_status(virq, dgc->irq_flags_to_clear, dgc->irq_flags_to_set); | 394 | irq_modify_status(virq, dgc->irq_flags_to_clear, dgc->irq_flags_to_set); |
395 | return 0; | 395 | return 0; |
396 | } | 396 | } |
397 | EXPORT_SYMBOL_GPL(irq_map_generic_chip); | ||
397 | 398 | ||
398 | struct irq_domain_ops irq_generic_chip_ops = { | 399 | struct irq_domain_ops irq_generic_chip_ops = { |
399 | .map = irq_map_generic_chip, | 400 | .map = irq_map_generic_chip, |
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index eb5e10e32e05..6534ff6ce02e 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c | |||
@@ -231,7 +231,7 @@ void irq_set_default_host(struct irq_domain *domain) | |||
231 | } | 231 | } |
232 | EXPORT_SYMBOL_GPL(irq_set_default_host); | 232 | EXPORT_SYMBOL_GPL(irq_set_default_host); |
233 | 233 | ||
234 | static void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq) | 234 | void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq) |
235 | { | 235 | { |
236 | struct irq_data *irq_data = irq_get_irq_data(irq); | 236 | struct irq_data *irq_data = irq_get_irq_data(irq); |
237 | irq_hw_number_t hwirq; | 237 | irq_hw_number_t hwirq; |
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index cb0cf37dac3a..ae5167087845 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c | |||
@@ -364,7 +364,7 @@ static int __sprint_symbol(char *buffer, unsigned long address, | |||
364 | address += symbol_offset; | 364 | address += symbol_offset; |
365 | name = kallsyms_lookup(address, &size, &offset, &modname, buffer); | 365 | name = kallsyms_lookup(address, &size, &offset, &modname, buffer); |
366 | if (!name) | 366 | if (!name) |
367 | return sprintf(buffer, "0x%lx", address); | 367 | return sprintf(buffer, "0x%lx", address - symbol_offset); |
368 | 368 | ||
369 | if (name != buffer) | 369 | if (name != buffer) |
370 | strcpy(buffer, name); | 370 | strcpy(buffer, name); |
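
The kallsyms fix matters when a caller passed an address biased by symbol_offset and the lookup fails: the fallback string should print the caller's original address, so the offset added at the top of __sprint_symbol() has to be subtracted back out. A toy version of that correction is below; the helper names are made up for illustration.

	#include <stdio.h>

	/* Pretend lookup: only addresses below 0x1000 have a "name". */
	static const char *toy_lookup(unsigned long addr)
	{
	        return addr < 0x1000 ? "known_symbol" : NULL;
	}

	static int toy_sprint_symbol(char *buf, unsigned long address, long symbol_offset)
	{
	        const char *name;

	        address += symbol_offset;               /* bias used for the lookup */
	        name = toy_lookup(address);
	        if (!name)                              /* fall back to the caller's address */
	                return sprintf(buf, "0x%lx", address - symbol_offset);

	        return sprintf(buf, "%s", name);
	}

	int main(void)
	{
	        char buf[64];

	        toy_sprint_symbol(buf, 0x2000, -1);     /* lookup fails, prints 0x2000 */
	        printf("%s\n", buf);
	        return 0;
	}
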
diff --git a/kernel/kexec.c b/kernel/kexec.c index 4b8f0c925884..0b49a0a58102 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c | |||
@@ -6,6 +6,8 @@ | |||
6 | * Version 2. See the file COPYING for more details. | 6 | * Version 2. See the file COPYING for more details. |
7 | */ | 7 | */ |
8 | 8 | ||
9 | #define pr_fmt(fmt) "kexec: " fmt | ||
10 | |||
9 | #include <linux/capability.h> | 11 | #include <linux/capability.h> |
10 | #include <linux/mm.h> | 12 | #include <linux/mm.h> |
11 | #include <linux/file.h> | 13 | #include <linux/file.h> |
@@ -40,6 +42,9 @@ | |||
40 | #include <asm/io.h> | 42 | #include <asm/io.h> |
41 | #include <asm/sections.h> | 43 | #include <asm/sections.h> |
42 | 44 | ||
45 | #include <crypto/hash.h> | ||
46 | #include <crypto/sha.h> | ||
47 | |||
43 | /* Per cpu memory for storing cpu states in case of system crash. */ | 48 | /* Per cpu memory for storing cpu states in case of system crash. */ |
44 | note_buf_t __percpu *crash_notes; | 49 | note_buf_t __percpu *crash_notes; |
45 | 50 | ||
@@ -52,6 +57,15 @@ size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); | |||
52 | /* Flag to indicate we are going to kexec a new kernel */ | 57 | /* Flag to indicate we are going to kexec a new kernel */ |
53 | bool kexec_in_progress = false; | 58 | bool kexec_in_progress = false; |
54 | 59 | ||
60 | /* | ||
61 | * Declare these symbols weak so that if an architecture provides a purgatory, | ||
62 | * these will be overridden. | ||
63 | */ | ||
64 | char __weak kexec_purgatory[0]; | ||
65 | size_t __weak kexec_purgatory_size = 0; | ||
66 | |||
67 | static int kexec_calculate_store_digests(struct kimage *image); | ||
68 | |||
55 | /* Location of the reserved area for the crash kernel */ | 69 | /* Location of the reserved area for the crash kernel */ |
56 | struct resource crashk_res = { | 70 | struct resource crashk_res = { |
57 | .name = "Crash kernel", | 71 | .name = "Crash kernel", |
@@ -125,45 +139,27 @@ static struct page *kimage_alloc_page(struct kimage *image, | |||
125 | gfp_t gfp_mask, | 139 | gfp_t gfp_mask, |
126 | unsigned long dest); | 140 | unsigned long dest); |
127 | 141 | ||
128 | static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, | 142 | static int copy_user_segment_list(struct kimage *image, |
129 | unsigned long nr_segments, | 143 | unsigned long nr_segments, |
130 | struct kexec_segment __user *segments) | 144 | struct kexec_segment __user *segments) |
131 | { | 145 | { |
146 | int ret; | ||
132 | size_t segment_bytes; | 147 | size_t segment_bytes; |
133 | struct kimage *image; | ||
134 | unsigned long i; | ||
135 | int result; | ||
136 | |||
137 | /* Allocate a controlling structure */ | ||
138 | result = -ENOMEM; | ||
139 | image = kzalloc(sizeof(*image), GFP_KERNEL); | ||
140 | if (!image) | ||
141 | goto out; | ||
142 | |||
143 | image->head = 0; | ||
144 | image->entry = &image->head; | ||
145 | image->last_entry = &image->head; | ||
146 | image->control_page = ~0; /* By default this does not apply */ | ||
147 | image->start = entry; | ||
148 | image->type = KEXEC_TYPE_DEFAULT; | ||
149 | |||
150 | /* Initialize the list of control pages */ | ||
151 | INIT_LIST_HEAD(&image->control_pages); | ||
152 | |||
153 | /* Initialize the list of destination pages */ | ||
154 | INIT_LIST_HEAD(&image->dest_pages); | ||
155 | |||
156 | /* Initialize the list of unusable pages */ | ||
157 | INIT_LIST_HEAD(&image->unuseable_pages); | ||
158 | 148 | ||
159 | /* Read in the segments */ | 149 | /* Read in the segments */ |
160 | image->nr_segments = nr_segments; | 150 | image->nr_segments = nr_segments; |
161 | segment_bytes = nr_segments * sizeof(*segments); | 151 | segment_bytes = nr_segments * sizeof(*segments); |
162 | result = copy_from_user(image->segment, segments, segment_bytes); | 152 | ret = copy_from_user(image->segment, segments, segment_bytes); |
163 | if (result) { | 153 | if (ret) |
164 | result = -EFAULT; | 154 | ret = -EFAULT; |
165 | goto out; | 155 | |
166 | } | 156 | return ret; |
157 | } | ||
158 | |||
159 | static int sanity_check_segment_list(struct kimage *image) | ||
160 | { | ||
161 | int result, i; | ||
162 | unsigned long nr_segments = image->nr_segments; | ||
167 | 163 | ||
168 | /* | 164 | /* |
169 | * Verify we have good destination addresses. The caller is | 165 | * Verify we have good destination addresses. The caller is |
@@ -185,9 +181,9 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, | |||
185 | mstart = image->segment[i].mem; | 181 | mstart = image->segment[i].mem; |
186 | mend = mstart + image->segment[i].memsz; | 182 | mend = mstart + image->segment[i].memsz; |
187 | if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK)) | 183 | if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK)) |
188 | goto out; | 184 | return result; |
189 | if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT) | 185 | if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT) |
190 | goto out; | 186 | return result; |
191 | } | 187 | } |
192 | 188 | ||
193 | /* Verify our destination addresses do not overlap. | 189 | /* Verify our destination addresses do not overlap. |
@@ -208,7 +204,7 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, | |||
208 | pend = pstart + image->segment[j].memsz; | 204 | pend = pstart + image->segment[j].memsz; |
209 | /* Do the segments overlap ? */ | 205 | /* Do the segments overlap ? */ |
210 | if ((mend > pstart) && (mstart < pend)) | 206 | if ((mend > pstart) && (mstart < pend)) |
211 | goto out; | 207 | return result; |
212 | } | 208 | } |
213 | } | 209 | } |
214 | 210 | ||
@@ -220,130 +216,401 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, | |||
220 | result = -EINVAL; | 216 | result = -EINVAL; |
221 | for (i = 0; i < nr_segments; i++) { | 217 | for (i = 0; i < nr_segments; i++) { |
222 | if (image->segment[i].bufsz > image->segment[i].memsz) | 218 | if (image->segment[i].bufsz > image->segment[i].memsz) |
223 | goto out; | 219 | return result; |
224 | } | 220 | } |
225 | 221 | ||
226 | result = 0; | 222 | /* |
227 | out: | 223 | * Verify we have good destination addresses. Normally |
228 | if (result == 0) | 224 | * the caller is responsible for making certain we don't |
229 | *rimage = image; | 225 | * attempt to load the new image into invalid or reserved |
230 | else | 226 | * areas of RAM. But crash kernels are preloaded into a |
231 | kfree(image); | 227 | * reserved area of ram. We must ensure the addresses |
228 | * are in the reserved area otherwise preloading the | ||
229 | * kernel could corrupt things. | ||
230 | */ | ||
232 | 231 | ||
233 | return result; | 232 | if (image->type == KEXEC_TYPE_CRASH) { |
233 | result = -EADDRNOTAVAIL; | ||
234 | for (i = 0; i < nr_segments; i++) { | ||
235 | unsigned long mstart, mend; | ||
236 | |||
237 | mstart = image->segment[i].mem; | ||
238 | mend = mstart + image->segment[i].memsz - 1; | ||
239 | /* Ensure we are within the crash kernel limits */ | ||
240 | if ((mstart < crashk_res.start) || | ||
241 | (mend > crashk_res.end)) | ||
242 | return result; | ||
243 | } | ||
244 | } | ||
234 | 245 | ||
246 | return 0; | ||
247 | } | ||
248 | |||
249 | static struct kimage *do_kimage_alloc_init(void) | ||
250 | { | ||
251 | struct kimage *image; | ||
252 | |||
253 | /* Allocate a controlling structure */ | ||
254 | image = kzalloc(sizeof(*image), GFP_KERNEL); | ||
255 | if (!image) | ||
256 | return NULL; | ||
257 | |||
258 | image->head = 0; | ||
259 | image->entry = &image->head; | ||
260 | image->last_entry = &image->head; | ||
261 | image->control_page = ~0; /* By default this does not apply */ | ||
262 | image->type = KEXEC_TYPE_DEFAULT; | ||
263 | |||
264 | /* Initialize the list of control pages */ | ||
265 | INIT_LIST_HEAD(&image->control_pages); | ||
266 | |||
267 | /* Initialize the list of destination pages */ | ||
268 | INIT_LIST_HEAD(&image->dest_pages); | ||
269 | |||
270 | /* Initialize the list of unusable pages */ | ||
271 | INIT_LIST_HEAD(&image->unusable_pages); | ||
272 | |||
273 | return image; | ||
235 | } | 274 | } |
236 | 275 | ||
237 | static void kimage_free_page_list(struct list_head *list); | 276 | static void kimage_free_page_list(struct list_head *list); |
238 | 277 | ||
239 | static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, | 278 | static int kimage_alloc_init(struct kimage **rimage, unsigned long entry, |
240 | unsigned long nr_segments, | 279 | unsigned long nr_segments, |
241 | struct kexec_segment __user *segments) | 280 | struct kexec_segment __user *segments, |
281 | unsigned long flags) | ||
242 | { | 282 | { |
243 | int result; | 283 | int ret; |
244 | struct kimage *image; | 284 | struct kimage *image; |
285 | bool kexec_on_panic = flags & KEXEC_ON_CRASH; | ||
286 | |||
287 | if (kexec_on_panic) { | ||
288 | /* Verify we have a valid entry point */ | ||
289 | if ((entry < crashk_res.start) || (entry > crashk_res.end)) | ||
290 | return -EADDRNOTAVAIL; | ||
291 | } | ||
245 | 292 | ||
246 | /* Allocate and initialize a controlling structure */ | 293 | /* Allocate and initialize a controlling structure */ |
247 | image = NULL; | 294 | image = do_kimage_alloc_init(); |
248 | result = do_kimage_alloc(&image, entry, nr_segments, segments); | 295 | if (!image) |
249 | if (result) | 296 | return -ENOMEM; |
250 | goto out; | 297 | |
298 | image->start = entry; | ||
299 | |||
300 | ret = copy_user_segment_list(image, nr_segments, segments); | ||
301 | if (ret) | ||
302 | goto out_free_image; | ||
303 | |||
304 | ret = sanity_check_segment_list(image); | ||
305 | if (ret) | ||
306 | goto out_free_image; | ||
307 | |||
308 | /* Enable the special crash kernel control page allocation policy. */ | ||
309 | if (kexec_on_panic) { | ||
310 | image->control_page = crashk_res.start; | ||
311 | image->type = KEXEC_TYPE_CRASH; | ||
312 | } | ||
251 | 313 | ||
252 | /* | 314 | /* |
253 | * Find a location for the control code buffer, and add it | 315 | * Find a location for the control code buffer, and add it |
254 | * the vector of segments so that its pages will also be | 316 | * counted as destination pages. |
255 | * counted as destination pages. | 317 | * counted as destination pages. |
256 | */ | 318 | */ |
257 | result = -ENOMEM; | 319 | ret = -ENOMEM; |
258 | image->control_code_page = kimage_alloc_control_pages(image, | 320 | image->control_code_page = kimage_alloc_control_pages(image, |
259 | get_order(KEXEC_CONTROL_PAGE_SIZE)); | 321 | get_order(KEXEC_CONTROL_PAGE_SIZE)); |
260 | if (!image->control_code_page) { | 322 | if (!image->control_code_page) { |
261 | pr_err("Could not allocate control_code_buffer\n"); | 323 | pr_err("Could not allocate control_code_buffer\n"); |
262 | goto out_free; | 324 | goto out_free_image; |
263 | } | 325 | } |
264 | 326 | ||
265 | image->swap_page = kimage_alloc_control_pages(image, 0); | 327 | if (!kexec_on_panic) { |
266 | if (!image->swap_page) { | 328 | image->swap_page = kimage_alloc_control_pages(image, 0); |
267 | pr_err("Could not allocate swap buffer\n"); | 329 | if (!image->swap_page) { |
268 | goto out_free; | 330 | pr_err("Could not allocate swap buffer\n"); |
331 | goto out_free_control_pages; | ||
332 | } | ||
269 | } | 333 | } |
270 | 334 | ||
271 | *rimage = image; | 335 | *rimage = image; |
272 | return 0; | 336 | return 0; |
273 | 337 | out_free_control_pages: | |
274 | out_free: | ||
275 | kimage_free_page_list(&image->control_pages); | 338 | kimage_free_page_list(&image->control_pages); |
339 | out_free_image: | ||
276 | kfree(image); | 340 | kfree(image); |
277 | out: | 341 | return ret; |
278 | return result; | ||
279 | } | 342 | } |
280 | 343 | ||
281 | static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry, | 344 | static int copy_file_from_fd(int fd, void **buf, unsigned long *buf_len) |
282 | unsigned long nr_segments, | ||
283 | struct kexec_segment __user *segments) | ||
284 | { | 345 | { |
285 | int result; | 346 | struct fd f = fdget(fd); |
286 | struct kimage *image; | 347 | int ret; |
287 | unsigned long i; | 348 | struct kstat stat; |
349 | loff_t pos; | ||
350 | ssize_t bytes = 0; | ||
288 | 351 | ||
289 | image = NULL; | 352 | if (!f.file) |
290 | /* Verify we have a valid entry point */ | 353 | return -EBADF; |
291 | if ((entry < crashk_res.start) || (entry > crashk_res.end)) { | 354 | |
292 | result = -EADDRNOTAVAIL; | 355 | ret = vfs_getattr(&f.file->f_path, &stat); |
356 | if (ret) | ||
357 | goto out; | ||
358 | |||
359 | if (stat.size > INT_MAX) { | ||
360 | ret = -EFBIG; | ||
293 | goto out; | 361 | goto out; |
294 | } | 362 | } |
295 | 363 | ||
296 | /* Allocate and initialize a controlling structure */ | 364 | /* Don't hand 0 to vmalloc, it whines. */ |
297 | result = do_kimage_alloc(&image, entry, nr_segments, segments); | 365 | if (stat.size == 0) { |
298 | if (result) | 366 | ret = -EINVAL; |
299 | goto out; | 367 | goto out; |
368 | } | ||
300 | 369 | ||
301 | /* Enable the special crash kernel control page | 370 | *buf = vmalloc(stat.size); |
302 | * allocation policy. | 371 | if (!*buf) { |
303 | */ | 372 | ret = -ENOMEM; |
304 | image->control_page = crashk_res.start; | 373 | goto out; |
305 | image->type = KEXEC_TYPE_CRASH; | 374 | } |
306 | 375 | ||
307 | /* | 376 | pos = 0; |
308 | * Verify we have good destination addresses. Normally | 377 | while (pos < stat.size) { |
309 | * the caller is responsible for making certain we don't | 378 | bytes = kernel_read(f.file, pos, (char *)(*buf) + pos, |
310 | * attempt to load the new image into invalid or reserved | 379 | stat.size - pos); |
311 | * areas of RAM. But crash kernels are preloaded into a | 380 | if (bytes < 0) { |
312 | * reserved area of ram. We must ensure the addresses | 381 | vfree(*buf); |
313 | * are in the reserved area otherwise preloading the | 382 | ret = bytes; |
314 | * kernel could corrupt things. | 383 | goto out; |
315 | */ | 384 | } |
316 | result = -EADDRNOTAVAIL; | ||
317 | for (i = 0; i < nr_segments; i++) { | ||
318 | unsigned long mstart, mend; | ||
319 | 385 | ||
320 | mstart = image->segment[i].mem; | 386 | if (bytes == 0) |
321 | mend = mstart + image->segment[i].memsz - 1; | 387 | break; |
322 | /* Ensure we are within the crash kernel limits */ | 388 | pos += bytes; |
323 | if ((mstart < crashk_res.start) || (mend > crashk_res.end)) | ||
324 | goto out_free; | ||
325 | } | 389 | } |
326 | 390 | ||
391 | if (pos != stat.size) { | ||
392 | ret = -EBADF; | ||
393 | vfree(*buf); | ||
394 | goto out; | ||
395 | } | ||
396 | |||
397 | *buf_len = pos; | ||
398 | out: | ||
399 | fdput(f); | ||
400 | return ret; | ||
401 | } | ||
402 | |||
403 | /* Architectures can provide this probe function */ | ||
404 | int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf, | ||
405 | unsigned long buf_len) | ||
406 | { | ||
407 | return -ENOEXEC; | ||
408 | } | ||
409 | |||
410 | void * __weak arch_kexec_kernel_image_load(struct kimage *image) | ||
411 | { | ||
412 | return ERR_PTR(-ENOEXEC); | ||
413 | } | ||
414 | |||
415 | void __weak arch_kimage_file_post_load_cleanup(struct kimage *image) | ||
416 | { | ||
417 | } | ||
418 | |||
419 | int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf, | ||
420 | unsigned long buf_len) | ||
421 | { | ||
422 | return -EKEYREJECTED; | ||
423 | } | ||
424 | |||
425 | /* Apply relocations of type RELA */ | ||
426 | int __weak | ||
427 | arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, | ||
428 | unsigned int relsec) | ||
429 | { | ||
430 | pr_err("RELA relocation unsupported.\n"); | ||
431 | return -ENOEXEC; | ||
432 | } | ||
433 | |||
434 | /* Apply relocations of type REL */ | ||
435 | int __weak | ||
436 | arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, | ||
437 | unsigned int relsec) | ||
438 | { | ||
439 | pr_err("REL relocation unsupported.\n"); | ||
440 | return -ENOEXEC; | ||
441 | } | ||
442 | |||
443 | /* | ||
444 | * Free up memory used by kernel, initrd, and command line. This is temporary | ||
445 | * memory allocation which is not needed any more after these buffers have | ||
446 | * been loaded into separate segments and have been copied elsewhere. | ||
447 | */ | ||
448 | static void kimage_file_post_load_cleanup(struct kimage *image) | ||
449 | { | ||
450 | struct purgatory_info *pi = &image->purgatory_info; | ||
451 | |||
452 | vfree(image->kernel_buf); | ||
453 | image->kernel_buf = NULL; | ||
454 | |||
455 | vfree(image->initrd_buf); | ||
456 | image->initrd_buf = NULL; | ||
457 | |||
458 | kfree(image->cmdline_buf); | ||
459 | image->cmdline_buf = NULL; | ||
460 | |||
461 | vfree(pi->purgatory_buf); | ||
462 | pi->purgatory_buf = NULL; | ||
463 | |||
464 | vfree(pi->sechdrs); | ||
465 | pi->sechdrs = NULL; | ||
466 | |||
467 | /* See if architecture has anything to cleanup post load */ | ||
468 | arch_kimage_file_post_load_cleanup(image); | ||
469 | |||
327 | /* | 470 | /* |
328 | * Find a location for the control code buffer, and add | 471 | * Above call should have called into bootloader to free up |
329 | * the vector of segments so that it's pages will also be | 472 | * any data stored in kimage->image_loader_data. It should |
330 | * counted as destination pages. | 473 | * be ok now to free it up. |
331 | */ | 474 | */ |
332 | result = -ENOMEM; | 475 | kfree(image->image_loader_data); |
476 | image->image_loader_data = NULL; | ||
477 | } | ||
478 | |||
479 | /* | ||
480 | * In file mode list of segments is prepared by kernel. Copy relevant | ||
481 | * data from user space, do error checking, prepare segment list | ||
482 | */ | ||
483 | static int | ||
484 | kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, | ||
485 | const char __user *cmdline_ptr, | ||
486 | unsigned long cmdline_len, unsigned flags) | ||
487 | { | ||
488 | int ret = 0; | ||
489 | void *ldata; | ||
490 | |||
491 | ret = copy_file_from_fd(kernel_fd, &image->kernel_buf, | ||
492 | &image->kernel_buf_len); | ||
493 | if (ret) | ||
494 | return ret; | ||
495 | |||
496 | /* Call arch image probe handlers */ | ||
497 | ret = arch_kexec_kernel_image_probe(image, image->kernel_buf, | ||
498 | image->kernel_buf_len); | ||
499 | |||
500 | if (ret) | ||
501 | goto out; | ||
502 | |||
503 | #ifdef CONFIG_KEXEC_VERIFY_SIG | ||
504 | ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf, | ||
505 | image->kernel_buf_len); | ||
506 | if (ret) { | ||
507 | pr_debug("kernel signature verification failed.\n"); | ||
508 | goto out; | ||
509 | } | ||
510 | pr_debug("kernel signature verification successful.\n"); | ||
511 | #endif | ||
512 | /* It is possible that no initramfs is being loaded */ | ||
513 | if (!(flags & KEXEC_FILE_NO_INITRAMFS)) { | ||
514 | ret = copy_file_from_fd(initrd_fd, &image->initrd_buf, | ||
515 | &image->initrd_buf_len); | ||
516 | if (ret) | ||
517 | goto out; | ||
518 | } | ||
519 | |||
520 | if (cmdline_len) { | ||
521 | image->cmdline_buf = kzalloc(cmdline_len, GFP_KERNEL); | ||
522 | if (!image->cmdline_buf) { | ||
523 | ret = -ENOMEM; | ||
524 | goto out; | ||
525 | } | ||
526 | |||
527 | ret = copy_from_user(image->cmdline_buf, cmdline_ptr, | ||
528 | cmdline_len); | ||
529 | if (ret) { | ||
530 | ret = -EFAULT; | ||
531 | goto out; | ||
532 | } | ||
533 | |||
534 | image->cmdline_buf_len = cmdline_len; | ||
535 | |||
536 | /* command line should be a string with last byte null */ | ||
537 | if (image->cmdline_buf[cmdline_len - 1] != '\0') { | ||
538 | ret = -EINVAL; | ||
539 | goto out; | ||
540 | } | ||
541 | } | ||
542 | |||
543 | /* Call arch image load handlers */ | ||
544 | ldata = arch_kexec_kernel_image_load(image); | ||
545 | |||
546 | if (IS_ERR(ldata)) { | ||
547 | ret = PTR_ERR(ldata); | ||
548 | goto out; | ||
549 | } | ||
550 | |||
551 | image->image_loader_data = ldata; | ||
552 | out: | ||
553 | /* In case of error, free up all allocated memory in this function */ | ||
554 | if (ret) | ||
555 | kimage_file_post_load_cleanup(image); | ||
556 | return ret; | ||
557 | } | ||
558 | |||
559 | static int | ||
560 | kimage_file_alloc_init(struct kimage **rimage, int kernel_fd, | ||
561 | int initrd_fd, const char __user *cmdline_ptr, | ||
562 | unsigned long cmdline_len, unsigned long flags) | ||
563 | { | ||
564 | int ret; | ||
565 | struct kimage *image; | ||
566 | bool kexec_on_panic = flags & KEXEC_FILE_ON_CRASH; | ||
567 | |||
568 | image = do_kimage_alloc_init(); | ||
569 | if (!image) | ||
570 | return -ENOMEM; | ||
571 | |||
572 | image->file_mode = 1; | ||
573 | |||
574 | if (kexec_on_panic) { | ||
575 | /* Enable special crash kernel control page alloc policy. */ | ||
576 | image->control_page = crashk_res.start; | ||
577 | image->type = KEXEC_TYPE_CRASH; | ||
578 | } | ||
579 | |||
580 | ret = kimage_file_prepare_segments(image, kernel_fd, initrd_fd, | ||
581 | cmdline_ptr, cmdline_len, flags); | ||
582 | if (ret) | ||
583 | goto out_free_image; | ||
584 | |||
585 | ret = sanity_check_segment_list(image); | ||
586 | if (ret) | ||
587 | goto out_free_post_load_bufs; | ||
588 | |||
589 | ret = -ENOMEM; | ||
333 | image->control_code_page = kimage_alloc_control_pages(image, | 590 | image->control_code_page = kimage_alloc_control_pages(image, |
334 | get_order(KEXEC_CONTROL_PAGE_SIZE)); | 591 | get_order(KEXEC_CONTROL_PAGE_SIZE)); |
335 | if (!image->control_code_page) { | 592 | if (!image->control_code_page) { |
336 | pr_err("Could not allocate control_code_buffer\n"); | 593 | pr_err("Could not allocate control_code_buffer\n"); |
337 | goto out_free; | 594 | goto out_free_post_load_bufs; |
595 | } | ||
596 | |||
597 | if (!kexec_on_panic) { | ||
598 | image->swap_page = kimage_alloc_control_pages(image, 0); | ||
599 | if (!image->swap_page) { | ||
600 | pr_err("Could not allocate swap buffer\n"); | ||
601 | goto out_free_control_pages; | ||
602 | } | ||
338 | } | 603 | } |
339 | 604 | ||
340 | *rimage = image; | 605 | *rimage = image; |
341 | return 0; | 606 | return 0; |
342 | 607 | out_free_control_pages: | |
343 | out_free: | 608 | kimage_free_page_list(&image->control_pages); |
609 | out_free_post_load_bufs: | ||
610 | kimage_file_post_load_cleanup(image); | ||
611 | out_free_image: | ||
344 | kfree(image); | 612 | kfree(image); |
345 | out: | 613 | return ret; |
346 | return result; | ||
347 | } | 614 | } |
348 | 615 | ||
349 | static int kimage_is_destination_range(struct kimage *image, | 616 | static int kimage_is_destination_range(struct kimage *image, |
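
copy_file_from_fd() above is the in-kernel "slurp a whole file descriptor into one buffer" helper used for the kernel, initrd and command line in file-based kexec: stat for the size, allocate, then loop until the read position reaches that size. A hedged userspace equivalent of the same loop, using ordinary fstat()/read(), is sketched here.

	#include <fcntl.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <sys/stat.h>
	#include <unistd.h>

	/* Read the whole file behind fd into a malloc'd buffer; caller frees *buf. */
	static int copy_whole_fd(int fd, void **buf, size_t *buf_len)
	{
	        struct stat st;
	        char *p;
	        size_t pos = 0;

	        if (fstat(fd, &st) != 0 || st.st_size <= 0)
	                return -1;

	        p = malloc((size_t)st.st_size);
	        if (!p)
	                return -1;

	        while (pos < (size_t)st.st_size) {
	                ssize_t n = read(fd, p + pos, (size_t)st.st_size - pos);

	                if (n < 0) {            /* read error: unwind the allocation */
	                        free(p);
	                        return -1;
	                }
	                if (n == 0)             /* unexpected EOF: file shrank underneath us */
	                        break;
	                pos += (size_t)n;
	        }

	        if (pos != (size_t)st.st_size) {
	                free(p);
	                return -1;
	        }

	        *buf = p;
	        *buf_len = pos;
	        return 0;
	}

	int main(int argc, char **argv)
	{
	        void *buf;
	        size_t len;
	        int fd = open(argc > 1 ? argv[1] : "/etc/hostname", O_RDONLY);

	        if (fd < 0 || copy_whole_fd(fd, &buf, &len) != 0) {
	                fprintf(stderr, "failed to read file\n");
	                return 1;
	        }
	        printf("read %zu bytes\n", len);
	        free(buf);
	        return 0;
	}
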
@@ -609,7 +876,7 @@ static void kimage_free_extra_pages(struct kimage *image) | |||
609 | kimage_free_page_list(&image->dest_pages); | 876 | kimage_free_page_list(&image->dest_pages); |
610 | 877 | ||
611 | /* Walk through and free any unusable pages I have cached */ | 878 | /* Walk through and free any unusable pages I have cached */ |
612 | kimage_free_page_list(&image->unuseable_pages); | 879 | kimage_free_page_list(&image->unusable_pages); |
613 | 880 | ||
614 | } | 881 | } |
615 | static void kimage_terminate(struct kimage *image) | 882 | static void kimage_terminate(struct kimage *image) |
@@ -663,6 +930,14 @@ static void kimage_free(struct kimage *image) | |||
663 | 930 | ||
664 | /* Free the kexec control pages... */ | 931 | /* Free the kexec control pages... */ |
665 | kimage_free_page_list(&image->control_pages); | 932 | kimage_free_page_list(&image->control_pages); |
933 | |||
934 | /* | ||
935 | * Free up any temporary buffers allocated. This might hit if | ||
936 | * error occurred much later after buffer allocation. | ||
937 | */ | ||
938 | if (image->file_mode) | ||
939 | kimage_file_post_load_cleanup(image); | ||
940 | |||
666 | kfree(image); | 941 | kfree(image); |
667 | } | 942 | } |
668 | 943 | ||
@@ -732,7 +1007,7 @@ static struct page *kimage_alloc_page(struct kimage *image, | |||
732 | /* If the page cannot be used file it away */ | 1007 | /* If the page cannot be used file it away */ |
733 | if (page_to_pfn(page) > | 1008 | if (page_to_pfn(page) > |
734 | (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) { | 1009 | (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) { |
735 | list_add(&page->lru, &image->unuseable_pages); | 1010 | list_add(&page->lru, &image->unusable_pages); |
736 | continue; | 1011 | continue; |
737 | } | 1012 | } |
738 | addr = page_to_pfn(page) << PAGE_SHIFT; | 1013 | addr = page_to_pfn(page) << PAGE_SHIFT; |
@@ -791,10 +1066,14 @@ static int kimage_load_normal_segment(struct kimage *image, | |||
791 | unsigned long maddr; | 1066 | unsigned long maddr; |
792 | size_t ubytes, mbytes; | 1067 | size_t ubytes, mbytes; |
793 | int result; | 1068 | int result; |
794 | unsigned char __user *buf; | 1069 | unsigned char __user *buf = NULL; |
1070 | unsigned char *kbuf = NULL; | ||
795 | 1071 | ||
796 | result = 0; | 1072 | result = 0; |
797 | buf = segment->buf; | 1073 | if (image->file_mode) |
1074 | kbuf = segment->kbuf; | ||
1075 | else | ||
1076 | buf = segment->buf; | ||
798 | ubytes = segment->bufsz; | 1077 | ubytes = segment->bufsz; |
799 | mbytes = segment->memsz; | 1078 | mbytes = segment->memsz; |
800 | maddr = segment->mem; | 1079 | maddr = segment->mem; |
@@ -826,7 +1105,11 @@ static int kimage_load_normal_segment(struct kimage *image, | |||
826 | PAGE_SIZE - (maddr & ~PAGE_MASK)); | 1105 | PAGE_SIZE - (maddr & ~PAGE_MASK)); |
827 | uchunk = min(ubytes, mchunk); | 1106 | uchunk = min(ubytes, mchunk); |
828 | 1107 | ||
829 | result = copy_from_user(ptr, buf, uchunk); | 1108 | /* For file based kexec, source pages are in kernel memory */ |
1109 | if (image->file_mode) | ||
1110 | memcpy(ptr, kbuf, uchunk); | ||
1111 | else | ||
1112 | result = copy_from_user(ptr, buf, uchunk); | ||
830 | kunmap(page); | 1113 | kunmap(page); |
831 | if (result) { | 1114 | if (result) { |
832 | result = -EFAULT; | 1115 | result = -EFAULT; |
@@ -834,7 +1117,10 @@ static int kimage_load_normal_segment(struct kimage *image, | |||
834 | } | 1117 | } |
835 | ubytes -= uchunk; | 1118 | ubytes -= uchunk; |
836 | maddr += mchunk; | 1119 | maddr += mchunk; |
837 | buf += mchunk; | 1120 | if (image->file_mode) |
1121 | kbuf += mchunk; | ||
1122 | else | ||
1123 | buf += mchunk; | ||
838 | mbytes -= mchunk; | 1124 | mbytes -= mchunk; |
839 | } | 1125 | } |
840 | out: | 1126 | out: |
@@ -851,10 +1137,14 @@ static int kimage_load_crash_segment(struct kimage *image, | |||
851 | unsigned long maddr; | 1137 | unsigned long maddr; |
852 | size_t ubytes, mbytes; | 1138 | size_t ubytes, mbytes; |
853 | int result; | 1139 | int result; |
854 | unsigned char __user *buf; | 1140 | unsigned char __user *buf = NULL; |
1141 | unsigned char *kbuf = NULL; | ||
855 | 1142 | ||
856 | result = 0; | 1143 | result = 0; |
857 | buf = segment->buf; | 1144 | if (image->file_mode) |
1145 | kbuf = segment->kbuf; | ||
1146 | else | ||
1147 | buf = segment->buf; | ||
858 | ubytes = segment->bufsz; | 1148 | ubytes = segment->bufsz; |
859 | mbytes = segment->memsz; | 1149 | mbytes = segment->memsz; |
860 | maddr = segment->mem; | 1150 | maddr = segment->mem; |
@@ -877,7 +1167,12 @@ static int kimage_load_crash_segment(struct kimage *image, | |||
877 | /* Zero the trailing part of the page */ | 1167 | /* Zero the trailing part of the page */ |
878 | memset(ptr + uchunk, 0, mchunk - uchunk); | 1168 | memset(ptr + uchunk, 0, mchunk - uchunk); |
879 | } | 1169 | } |
880 | result = copy_from_user(ptr, buf, uchunk); | 1170 | |
1171 | /* For file based kexec, source pages are in kernel memory */ | ||
1172 | if (image->file_mode) | ||
1173 | memcpy(ptr, kbuf, uchunk); | ||
1174 | else | ||
1175 | result = copy_from_user(ptr, buf, uchunk); | ||
881 | kexec_flush_icache_page(page); | 1176 | kexec_flush_icache_page(page); |
882 | kunmap(page); | 1177 | kunmap(page); |
883 | if (result) { | 1178 | if (result) { |
@@ -886,7 +1181,10 @@ static int kimage_load_crash_segment(struct kimage *image, | |||
886 | } | 1181 | } |
887 | ubytes -= uchunk; | 1182 | ubytes -= uchunk; |
888 | maddr += mchunk; | 1183 | maddr += mchunk; |
889 | buf += mchunk; | 1184 | if (image->file_mode) |
1185 | kbuf += mchunk; | ||
1186 | else | ||
1187 | buf += mchunk; | ||
890 | mbytes -= mchunk; | 1188 | mbytes -= mchunk; |
891 | } | 1189 | } |
892 | out: | 1190 | out: |
@@ -986,16 +1284,16 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, | |||
986 | 1284 | ||
987 | /* Loading another kernel to reboot into */ | 1285 | /* Loading another kernel to reboot into */ |
988 | if ((flags & KEXEC_ON_CRASH) == 0) | 1286 | if ((flags & KEXEC_ON_CRASH) == 0) |
989 | result = kimage_normal_alloc(&image, entry, | 1287 | result = kimage_alloc_init(&image, entry, nr_segments, |
990 | nr_segments, segments); | 1288 | segments, flags); |
991 | /* Loading another kernel to switch to if this one crashes */ | 1289 | /* Loading another kernel to switch to if this one crashes */ |
992 | else if (flags & KEXEC_ON_CRASH) { | 1290 | else if (flags & KEXEC_ON_CRASH) { |
993 | /* Free any current crash dump kernel before | 1291 | /* Free any current crash dump kernel before |
994 | * we corrupt it. | 1292 | * we corrupt it. |
995 | */ | 1293 | */ |
996 | kimage_free(xchg(&kexec_crash_image, NULL)); | 1294 | kimage_free(xchg(&kexec_crash_image, NULL)); |
997 | result = kimage_crash_alloc(&image, entry, | 1295 | result = kimage_alloc_init(&image, entry, nr_segments, |
998 | nr_segments, segments); | 1296 | segments, flags); |
999 | crash_map_reserved_pages(); | 1297 | crash_map_reserved_pages(); |
1000 | } | 1298 | } |
1001 | if (result) | 1299 | if (result) |
@@ -1077,6 +1375,82 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry, | |||
1077 | } | 1375 | } |
1078 | #endif | 1376 | #endif |
1079 | 1377 | ||
1378 | SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, | ||
1379 | unsigned long, cmdline_len, const char __user *, cmdline_ptr, | ||
1380 | unsigned long, flags) | ||
1381 | { | ||
1382 | int ret = 0, i; | ||
1383 | struct kimage **dest_image, *image; | ||
1384 | |||
1385 | /* We only trust the superuser with rebooting the system. */ | ||
1386 | if (!capable(CAP_SYS_BOOT) || kexec_load_disabled) | ||
1387 | return -EPERM; | ||
1388 | |||
1389 | /* Make sure we have a legal set of flags */ | ||
1390 | if (flags != (flags & KEXEC_FILE_FLAGS)) | ||
1391 | return -EINVAL; | ||
1392 | |||
1393 | image = NULL; | ||
1394 | |||
1395 | if (!mutex_trylock(&kexec_mutex)) | ||
1396 | return -EBUSY; | ||
1397 | |||
1398 | dest_image = &kexec_image; | ||
1399 | if (flags & KEXEC_FILE_ON_CRASH) | ||
1400 | dest_image = &kexec_crash_image; | ||
1401 | |||
1402 | if (flags & KEXEC_FILE_UNLOAD) | ||
1403 | goto exchange; | ||
1404 | |||
1405 | /* | ||
1406 | * In the crash case, the new kernel is loaded into the reserved region, | ||
1407 | * the same memory where an old crash kernel may already be loaded. Free any | ||
1408 | * current crash dump kernel before we corrupt it. | ||
1409 | */ | ||
1410 | if (flags & KEXEC_FILE_ON_CRASH) | ||
1411 | kimage_free(xchg(&kexec_crash_image, NULL)); | ||
1412 | |||
1413 | ret = kimage_file_alloc_init(&image, kernel_fd, initrd_fd, cmdline_ptr, | ||
1414 | cmdline_len, flags); | ||
1415 | if (ret) | ||
1416 | goto out; | ||
1417 | |||
1418 | ret = machine_kexec_prepare(image); | ||
1419 | if (ret) | ||
1420 | goto out; | ||
1421 | |||
1422 | ret = kexec_calculate_store_digests(image); | ||
1423 | if (ret) | ||
1424 | goto out; | ||
1425 | |||
1426 | for (i = 0; i < image->nr_segments; i++) { | ||
1427 | struct kexec_segment *ksegment; | ||
1428 | |||
1429 | ksegment = &image->segment[i]; | ||
1430 | pr_debug("Loading segment %d: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n", | ||
1431 | i, ksegment->buf, ksegment->bufsz, ksegment->mem, | ||
1432 | ksegment->memsz); | ||
1433 | |||
1434 | ret = kimage_load_segment(image, &image->segment[i]); | ||
1435 | if (ret) | ||
1436 | goto out; | ||
1437 | } | ||
1438 | |||
1439 | kimage_terminate(image); | ||
1440 | |||
1441 | /* | ||
1442 | * Free up any temporary buffers which are no longer needed | ||
1443 | * after the image has been loaded. | ||
1444 | */ | ||
1445 | kimage_file_post_load_cleanup(image); | ||
1446 | exchange: | ||
1447 | image = xchg(dest_image, image); | ||
1448 | out: | ||
1449 | mutex_unlock(&kexec_mutex); | ||
1450 | kimage_free(image); | ||
1451 | return ret; | ||
1452 | } | ||
1453 | |||
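For context, a user-space loader would drive this new syscall roughly as in the sketch below. This is only an illustration: the file paths and command line are placeholders, and invoking the syscall via syscall(2) with __NR_kexec_file_load assumes suitably recent UAPI headers; real tooling (kexec-tools) adds far more error handling.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/kexec.h>

int main(void)
{
	const char *cmdline = "root=/dev/sda1 ro";          /* illustrative */
	int kernel_fd = open("/boot/vmlinuz", O_RDONLY);    /* illustrative path */
	int initrd_fd = open("/boot/initrd.img", O_RDONLY); /* illustrative path */

	if (kernel_fd < 0 || initrd_fd < 0)
		return 1;

	/* Argument order matches the SYSCALL_DEFINE5 above; cmdline_len
	 * must include the trailing NUL. Flags could also be
	 * KEXEC_FILE_ON_CRASH or KEXEC_FILE_UNLOAD. */
	if (syscall(__NR_kexec_file_load, kernel_fd, initrd_fd,
		    strlen(cmdline) + 1, cmdline, 0) < 0) {
		perror("kexec_file_load");
		return 1;
	}
	return 0;
}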
1080 | void crash_kexec(struct pt_regs *regs) | 1454 | void crash_kexec(struct pt_regs *regs) |
1081 | { | 1455 | { |
1082 | /* Take the kexec_mutex here to prevent sys_kexec_load | 1456 | /* Take the kexec_mutex here to prevent sys_kexec_load |
@@ -1632,6 +2006,683 @@ static int __init crash_save_vmcoreinfo_init(void) | |||
1632 | 2006 | ||
1633 | subsys_initcall(crash_save_vmcoreinfo_init); | 2007 | subsys_initcall(crash_save_vmcoreinfo_init); |
1634 | 2008 | ||
2009 | static int __kexec_add_segment(struct kimage *image, char *buf, | ||
2010 | unsigned long bufsz, unsigned long mem, | ||
2011 | unsigned long memsz) | ||
2012 | { | ||
2013 | struct kexec_segment *ksegment; | ||
2014 | |||
2015 | ksegment = &image->segment[image->nr_segments]; | ||
2016 | ksegment->kbuf = buf; | ||
2017 | ksegment->bufsz = bufsz; | ||
2018 | ksegment->mem = mem; | ||
2019 | ksegment->memsz = memsz; | ||
2020 | image->nr_segments++; | ||
2021 | |||
2022 | return 0; | ||
2023 | } | ||
2024 | |||
2025 | static int locate_mem_hole_top_down(unsigned long start, unsigned long end, | ||
2026 | struct kexec_buf *kbuf) | ||
2027 | { | ||
2028 | struct kimage *image = kbuf->image; | ||
2029 | unsigned long temp_start, temp_end; | ||
2030 | |||
2031 | temp_end = min(end, kbuf->buf_max); | ||
2032 | temp_start = temp_end - kbuf->memsz; | ||
2033 | |||
2034 | do { | ||
2035 | /* align down start */ | ||
2036 | temp_start = temp_start & (~(kbuf->buf_align - 1)); | ||
2037 | |||
2038 | if (temp_start < start || temp_start < kbuf->buf_min) | ||
2039 | return 0; | ||
2040 | |||
2041 | temp_end = temp_start + kbuf->memsz - 1; | ||
2042 | |||
2043 | /* | ||
2044 | * Make sure this does not conflict with any of existing | ||
2045 | * segments | ||
2046 | */ | ||
2047 | if (kimage_is_destination_range(image, temp_start, temp_end)) { | ||
2048 | temp_start = temp_start - PAGE_SIZE; | ||
2049 | continue; | ||
2050 | } | ||
2051 | |||
2052 | /* We found a suitable memory range */ | ||
2053 | break; | ||
2054 | } while (1); | ||
2055 | |||
2056 | /* If we are here, we found a suitable memory range */ | ||
2057 | __kexec_add_segment(image, kbuf->buffer, kbuf->bufsz, temp_start, | ||
2058 | kbuf->memsz); | ||
2059 | |||
2060 | /* Success, stop navigating through remaining System RAM ranges */ | ||
2061 | return 1; | ||
2062 | } | ||
2063 | |||
2064 | static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end, | ||
2065 | struct kexec_buf *kbuf) | ||
2066 | { | ||
2067 | struct kimage *image = kbuf->image; | ||
2068 | unsigned long temp_start, temp_end; | ||
2069 | |||
2070 | temp_start = max(start, kbuf->buf_min); | ||
2071 | |||
2072 | do { | ||
2073 | temp_start = ALIGN(temp_start, kbuf->buf_align); | ||
2074 | temp_end = temp_start + kbuf->memsz - 1; | ||
2075 | |||
2076 | if (temp_end > end || temp_end > kbuf->buf_max) | ||
2077 | return 0; | ||
2078 | /* | ||
2079 | * Make sure this does not conflict with any of existing | ||
2080 | * segments | ||
2081 | */ | ||
2082 | if (kimage_is_destination_range(image, temp_start, temp_end)) { | ||
2083 | temp_start = temp_start + PAGE_SIZE; | ||
2084 | continue; | ||
2085 | } | ||
2086 | |||
2087 | /* We found a suitable memory range */ | ||
2088 | break; | ||
2089 | } while (1); | ||
2090 | |||
2091 | /* If we are here, we found a suitable memory range */ | ||
2092 | __kexec_add_segment(image, kbuf->buffer, kbuf->bufsz, temp_start, | ||
2093 | kbuf->memsz); | ||
2094 | |||
2095 | /* Success, stop navigating through remaining System RAM ranges */ | ||
2096 | return 1; | ||
2097 | } | ||
2098 | |||
2099 | static int locate_mem_hole_callback(u64 start, u64 end, void *arg) | ||
2100 | { | ||
2101 | struct kexec_buf *kbuf = (struct kexec_buf *)arg; | ||
2102 | unsigned long sz = end - start + 1; | ||
2103 | |||
2104 | /* Returning 0 will move on to the next memory range */ | ||
2105 | if (sz < kbuf->memsz) | ||
2106 | return 0; | ||
2107 | |||
2108 | if (end < kbuf->buf_min || start > kbuf->buf_max) | ||
2109 | return 0; | ||
2110 | |||
2111 | /* | ||
2112 | * Allocate memory top-down within the RAM range. Otherwise allocate | ||
2113 | * bottom-up. | ||
2114 | */ | ||
2115 | if (kbuf->top_down) | ||
2116 | return locate_mem_hole_top_down(start, end, kbuf); | ||
2117 | return locate_mem_hole_bottom_up(start, end, kbuf); | ||
2118 | } | ||
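A quick worked example of the alignment arithmetic used by the two helpers above; the values are illustrative, and buf_align is assumed to be a power of two (callers pass one, and kexec_add_buffer() below raises it to at least PAGE_SIZE).

	unsigned long align = 0x1000;                         /* example alignment */
	unsigned long addr  = 0x12345;

	/* The top-down search aligns the candidate start downwards: */
	unsigned long down = addr & ~(align - 1);             /* 0x12000 */

	/* The bottom-up search uses ALIGN(), i.e. rounds upwards: */
	unsigned long up = (addr + align - 1) & ~(align - 1); /* 0x13000 */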
2119 | |||
2120 | /* | ||
2121 | * Helper function for placing a buffer in a kexec segment. This assumes | ||
2122 | * that kexec_mutex is held. | ||
2123 | */ | ||
2124 | int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz, | ||
2125 | unsigned long memsz, unsigned long buf_align, | ||
2126 | unsigned long buf_min, unsigned long buf_max, | ||
2127 | bool top_down, unsigned long *load_addr) | ||
2128 | { | ||
2129 | |||
2130 | struct kexec_segment *ksegment; | ||
2131 | struct kexec_buf buf, *kbuf; | ||
2132 | int ret; | ||
2133 | |||
2134 | /* Currently adding segment this way is allowed only in file mode */ | ||
2135 | if (!image->file_mode) | ||
2136 | return -EINVAL; | ||
2137 | |||
2138 | if (image->nr_segments >= KEXEC_SEGMENT_MAX) | ||
2139 | return -EINVAL; | ||
2140 | |||
2141 | /* | ||
2142 | * Make sure we are not trying to add a buffer after the control | ||
2143 | * pages have been allocated. All segments need to be placed before | ||
2144 | * any control pages are allocated, because the control page | ||
2145 | * allocation logic walks the list of segments to make sure there | ||
2146 | * are no destination overlaps. | ||
2147 | */ | ||
2148 | if (!list_empty(&image->control_pages)) { | ||
2149 | WARN_ON(1); | ||
2150 | return -EINVAL; | ||
2151 | } | ||
2152 | |||
2153 | memset(&buf, 0, sizeof(struct kexec_buf)); | ||
2154 | kbuf = &buf; | ||
2155 | kbuf->image = image; | ||
2156 | kbuf->buffer = buffer; | ||
2157 | kbuf->bufsz = bufsz; | ||
2158 | |||
2159 | kbuf->memsz = ALIGN(memsz, PAGE_SIZE); | ||
2160 | kbuf->buf_align = max(buf_align, PAGE_SIZE); | ||
2161 | kbuf->buf_min = buf_min; | ||
2162 | kbuf->buf_max = buf_max; | ||
2163 | kbuf->top_down = top_down; | ||
2164 | |||
2165 | /* Walk the RAM ranges and allocate a suitable range for the buffer */ | ||
2166 | if (image->type == KEXEC_TYPE_CRASH) | ||
2167 | ret = walk_iomem_res("Crash kernel", | ||
2168 | IORESOURCE_MEM | IORESOURCE_BUSY, | ||
2169 | crashk_res.start, crashk_res.end, kbuf, | ||
2170 | locate_mem_hole_callback); | ||
2171 | else | ||
2172 | ret = walk_system_ram_res(0, -1, kbuf, | ||
2173 | locate_mem_hole_callback); | ||
2174 | if (ret != 1) { | ||
2175 | /* A suitable memory range could not be found for buffer */ | ||
2176 | return -EADDRNOTAVAIL; | ||
2177 | } | ||
2178 | |||
2179 | /* Found a suitable memory range */ | ||
2180 | ksegment = &image->segment[image->nr_segments - 1]; | ||
2181 | *load_addr = ksegment->mem; | ||
2182 | return 0; | ||
2183 | } | ||
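A hypothetical architecture-side caller would use this helper roughly as below; the blob, the below-4G limit and the top-down placement are purely illustrative and not something this patch mandates.

static int example_load_blob(struct kimage *image, char *blob,
			     unsigned long blob_len)
{
	unsigned long load_addr;
	int ret;

	/* Place the blob page-aligned anywhere below 4G, searching top-down. */
	ret = kexec_add_buffer(image, blob, blob_len, blob_len,
			       PAGE_SIZE, 0, (1ULL << 32) - 1, true, &load_addr);
	if (ret)
		return ret;

	pr_debug("blob placed at 0x%lx\n", load_addr);
	return 0;
}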
2184 | |||
2185 | /* Calculate and store the digest of segments */ | ||
2186 | static int kexec_calculate_store_digests(struct kimage *image) | ||
2187 | { | ||
2188 | struct crypto_shash *tfm; | ||
2189 | struct shash_desc *desc; | ||
2190 | int ret = 0, i, j, zero_buf_sz, sha_region_sz; | ||
2191 | size_t desc_size, nullsz; | ||
2192 | char *digest; | ||
2193 | void *zero_buf; | ||
2194 | struct kexec_sha_region *sha_regions; | ||
2195 | struct purgatory_info *pi = &image->purgatory_info; | ||
2196 | |||
2197 | zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT); | ||
2198 | zero_buf_sz = PAGE_SIZE; | ||
2199 | |||
2200 | tfm = crypto_alloc_shash("sha256", 0, 0); | ||
2201 | if (IS_ERR(tfm)) { | ||
2202 | ret = PTR_ERR(tfm); | ||
2203 | goto out; | ||
2204 | } | ||
2205 | |||
2206 | desc_size = crypto_shash_descsize(tfm) + sizeof(*desc); | ||
2207 | desc = kzalloc(desc_size, GFP_KERNEL); | ||
2208 | if (!desc) { | ||
2209 | ret = -ENOMEM; | ||
2210 | goto out_free_tfm; | ||
2211 | } | ||
2212 | |||
2213 | sha_region_sz = KEXEC_SEGMENT_MAX * sizeof(struct kexec_sha_region); | ||
2214 | sha_regions = vzalloc(sha_region_sz); | ||
2215 | if (!sha_regions) | ||
2216 | goto out_free_desc; | ||
2217 | |||
2218 | desc->tfm = tfm; | ||
2219 | desc->flags = 0; | ||
2220 | |||
2221 | ret = crypto_shash_init(desc); | ||
2222 | if (ret < 0) | ||
2223 | goto out_free_sha_regions; | ||
2224 | |||
2225 | digest = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL); | ||
2226 | if (!digest) { | ||
2227 | ret = -ENOMEM; | ||
2228 | goto out_free_sha_regions; | ||
2229 | } | ||
2230 | |||
2231 | for (j = i = 0; i < image->nr_segments; i++) { | ||
2232 | struct kexec_segment *ksegment; | ||
2233 | |||
2234 | ksegment = &image->segment[i]; | ||
2235 | /* | ||
2236 | * Skip purgatory as it will be modified once we put digest | ||
2237 | * info in purgatory. | ||
2238 | */ | ||
2239 | if (ksegment->kbuf == pi->purgatory_buf) | ||
2240 | continue; | ||
2241 | |||
2242 | ret = crypto_shash_update(desc, ksegment->kbuf, | ||
2243 | ksegment->bufsz); | ||
2244 | if (ret) | ||
2245 | break; | ||
2246 | |||
2247 | /* | ||
2248 | * Assume the rest of the buffer is filled with zeros and | ||
2249 | * update the digest accordingly. | ||
2250 | */ | ||
2251 | nullsz = ksegment->memsz - ksegment->bufsz; | ||
2252 | while (nullsz) { | ||
2253 | unsigned long bytes = nullsz; | ||
2254 | |||
2255 | if (bytes > zero_buf_sz) | ||
2256 | bytes = zero_buf_sz; | ||
2257 | ret = crypto_shash_update(desc, zero_buf, bytes); | ||
2258 | if (ret) | ||
2259 | break; | ||
2260 | nullsz -= bytes; | ||
2261 | } | ||
2262 | |||
2263 | if (ret) | ||
2264 | break; | ||
2265 | |||
2266 | sha_regions[j].start = ksegment->mem; | ||
2267 | sha_regions[j].len = ksegment->memsz; | ||
2268 | j++; | ||
2269 | } | ||
2270 | |||
2271 | if (!ret) { | ||
2272 | ret = crypto_shash_final(desc, digest); | ||
2273 | if (ret) | ||
2274 | goto out_free_digest; | ||
2275 | ret = kexec_purgatory_get_set_symbol(image, "sha_regions", | ||
2276 | sha_regions, sha_region_sz, 0); | ||
2277 | if (ret) | ||
2278 | goto out_free_digest; | ||
2279 | |||
2280 | ret = kexec_purgatory_get_set_symbol(image, "sha256_digest", | ||
2281 | digest, SHA256_DIGEST_SIZE, 0); | ||
2282 | if (ret) | ||
2283 | goto out_free_digest; | ||
2284 | } | ||
2285 | |||
2286 | out_free_digest: | ||
2287 | kfree(digest); | ||
2288 | out_free_sha_regions: | ||
2289 | vfree(sha_regions); | ||
2290 | out_free_desc: | ||
2291 | kfree(desc); | ||
2292 | out_free_tfm: | ||
2293 | crypto_free_shash(tfm); | ||
2294 | out: | ||
2295 | return ret; | ||
2296 | } | ||
2297 | |||
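The two symbols patched above are consumed by the purgatory blob itself: before control is handed to the new kernel, it recomputes SHA-256 over every sha_regions[] entry and compares the result with sha256_digest. Conceptually it looks like the sketch below; the sha256_*() helpers are hypothetical stand-ins for whatever the purgatory actually links against.

struct kexec_sha_region sha_regions[KEXEC_SEGMENT_MAX]; /* patched by the kernel */
u8 sha256_digest[SHA256_DIGEST_SIZE];                   /* patched by the kernel */

static int verify_sha256_digest(void)
{
	struct sha256_state state;
	u8 digest[SHA256_DIGEST_SIZE];
	int i;

	sha256_init(&state);
	for (i = 0; i < KEXEC_SEGMENT_MAX; i++) {
		if (!sha_regions[i].len)
			continue;
		sha256_update(&state, (u8 *)sha_regions[i].start,
			      sha_regions[i].len);
	}
	sha256_final(&state, digest);

	return memcmp(digest, sha256_digest, sizeof(digest)) ? -1 : 0;
}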
2298 | /* Actually load purgatory. Lot of code taken from kexec-tools */ | ||
2299 | static int __kexec_load_purgatory(struct kimage *image, unsigned long min, | ||
2300 | unsigned long max, int top_down) | ||
2301 | { | ||
2302 | struct purgatory_info *pi = &image->purgatory_info; | ||
2303 | unsigned long align, buf_align, bss_align, buf_sz, bss_sz, bss_pad; | ||
2304 | unsigned long memsz, entry, load_addr, curr_load_addr, bss_addr, offset; | ||
2305 | unsigned char *buf_addr, *src; | ||
2306 | int i, ret = 0, entry_sidx = -1; | ||
2307 | const Elf_Shdr *sechdrs_c; | ||
2308 | Elf_Shdr *sechdrs = NULL; | ||
2309 | void *purgatory_buf = NULL; | ||
2310 | |||
2311 | /* | ||
2312 | * sechdrs_c points to the section headers in purgatory and is read | ||
2313 | * only. No modifications allowed. | ||
2314 | */ | ||
2315 | sechdrs_c = (void *)pi->ehdr + pi->ehdr->e_shoff; | ||
2316 | |||
2317 | /* | ||
2318 | * We cannot modify sechdrs_c[] and its fields. It is read-only. | ||
2319 | * Copy it over to a local copy where one can store some temporary | ||
2320 | * data and free it at the end. We need to modify ->sh_addr and | ||
2321 | * ->sh_offset fields to keep track of permanent and temporary | ||
2322 | * locations of sections. | ||
2323 | */ | ||
2324 | sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr)); | ||
2325 | if (!sechdrs) | ||
2326 | return -ENOMEM; | ||
2327 | |||
2328 | memcpy(sechdrs, sechdrs_c, pi->ehdr->e_shnum * sizeof(Elf_Shdr)); | ||
2329 | |||
2330 | /* | ||
2331 | * There are multiple copies of the sections. The first copy is the | ||
2332 | * one embedded in the kernel's read-only section. Some of these | ||
2333 | * sections will be copied to a temporary buffer and relocated, and | ||
2334 | * those sections will finally be copied to their destination at | ||
2335 | * segment load time. | ||
2336 | * | ||
2337 | * Use ->sh_offset to reflect section address in memory. It will | ||
2338 | * point to original read only copy if section is not allocatable. | ||
2339 | * Otherwise it will point to temporary copy which will be relocated. | ||
2340 | * | ||
2341 | * Use ->sh_addr to contain final address of the section where it | ||
2342 | * will go during execution time. | ||
2343 | */ | ||
2344 | for (i = 0; i < pi->ehdr->e_shnum; i++) { | ||
2345 | if (sechdrs[i].sh_type == SHT_NOBITS) | ||
2346 | continue; | ||
2347 | |||
2348 | sechdrs[i].sh_offset = (unsigned long)pi->ehdr + | ||
2349 | sechdrs[i].sh_offset; | ||
2350 | } | ||
2351 | |||
2352 | /* | ||
2353 | * Identify entry point section and make entry relative to section | ||
2354 | * start. | ||
2355 | */ | ||
2356 | entry = pi->ehdr->e_entry; | ||
2357 | for (i = 0; i < pi->ehdr->e_shnum; i++) { | ||
2358 | if (!(sechdrs[i].sh_flags & SHF_ALLOC)) | ||
2359 | continue; | ||
2360 | |||
2361 | if (!(sechdrs[i].sh_flags & SHF_EXECINSTR)) | ||
2362 | continue; | ||
2363 | |||
2364 | /* Make entry section relative */ | ||
2365 | if (sechdrs[i].sh_addr <= pi->ehdr->e_entry && | ||
2366 | ((sechdrs[i].sh_addr + sechdrs[i].sh_size) > | ||
2367 | pi->ehdr->e_entry)) { | ||
2368 | entry_sidx = i; | ||
2369 | entry -= sechdrs[i].sh_addr; | ||
2370 | break; | ||
2371 | } | ||
2372 | } | ||
2373 | |||
2374 | /* Determine how much memory is needed to load relocatable object. */ | ||
2375 | buf_align = 1; | ||
2376 | bss_align = 1; | ||
2377 | buf_sz = 0; | ||
2378 | bss_sz = 0; | ||
2379 | |||
2380 | for (i = 0; i < pi->ehdr->e_shnum; i++) { | ||
2381 | if (!(sechdrs[i].sh_flags & SHF_ALLOC)) | ||
2382 | continue; | ||
2383 | |||
2384 | align = sechdrs[i].sh_addralign; | ||
2385 | if (sechdrs[i].sh_type != SHT_NOBITS) { | ||
2386 | if (buf_align < align) | ||
2387 | buf_align = align; | ||
2388 | buf_sz = ALIGN(buf_sz, align); | ||
2389 | buf_sz += sechdrs[i].sh_size; | ||
2390 | } else { | ||
2391 | /* bss section */ | ||
2392 | if (bss_align < align) | ||
2393 | bss_align = align; | ||
2394 | bss_sz = ALIGN(bss_sz, align); | ||
2395 | bss_sz += sechdrs[i].sh_size; | ||
2396 | } | ||
2397 | } | ||
2398 | |||
2399 | /* Determine the bss padding required to align bss properly */ | ||
2400 | bss_pad = 0; | ||
2401 | if (buf_sz & (bss_align - 1)) | ||
2402 | bss_pad = bss_align - (buf_sz & (bss_align - 1)); | ||
2403 | |||
2404 | memsz = buf_sz + bss_pad + bss_sz; | ||
2405 | |||
2406 | /* Allocate buffer for purgatory */ | ||
2407 | purgatory_buf = vzalloc(buf_sz); | ||
2408 | if (!purgatory_buf) { | ||
2409 | ret = -ENOMEM; | ||
2410 | goto out; | ||
2411 | } | ||
2412 | |||
2413 | if (buf_align < bss_align) | ||
2414 | buf_align = bss_align; | ||
2415 | |||
2416 | /* Add buffer to segment list */ | ||
2417 | ret = kexec_add_buffer(image, purgatory_buf, buf_sz, memsz, | ||
2418 | buf_align, min, max, top_down, | ||
2419 | &pi->purgatory_load_addr); | ||
2420 | if (ret) | ||
2421 | goto out; | ||
2422 | |||
2423 | /* Load SHF_ALLOC sections */ | ||
2424 | buf_addr = purgatory_buf; | ||
2425 | load_addr = curr_load_addr = pi->purgatory_load_addr; | ||
2426 | bss_addr = load_addr + buf_sz + bss_pad; | ||
2427 | |||
2428 | for (i = 0; i < pi->ehdr->e_shnum; i++) { | ||
2429 | if (!(sechdrs[i].sh_flags & SHF_ALLOC)) | ||
2430 | continue; | ||
2431 | |||
2432 | align = sechdrs[i].sh_addralign; | ||
2433 | if (sechdrs[i].sh_type != SHT_NOBITS) { | ||
2434 | curr_load_addr = ALIGN(curr_load_addr, align); | ||
2435 | offset = curr_load_addr - load_addr; | ||
2436 | /* We already modified ->sh_offset to keep the src addr */ | ||
2437 | src = (char *) sechdrs[i].sh_offset; | ||
2438 | memcpy(buf_addr + offset, src, sechdrs[i].sh_size); | ||
2439 | |||
2440 | /* Store load address and source address of section */ | ||
2441 | sechdrs[i].sh_addr = curr_load_addr; | ||
2442 | |||
2443 | /* | ||
2444 | * This section got copied to temporary buffer. Update | ||
2445 | * ->sh_offset accordingly. | ||
2446 | */ | ||
2447 | sechdrs[i].sh_offset = (unsigned long)(buf_addr + offset); | ||
2448 | |||
2449 | /* Advance to the next address */ | ||
2450 | curr_load_addr += sechdrs[i].sh_size; | ||
2451 | } else { | ||
2452 | bss_addr = ALIGN(bss_addr, align); | ||
2453 | sechdrs[i].sh_addr = bss_addr; | ||
2454 | bss_addr += sechdrs[i].sh_size; | ||
2455 | } | ||
2456 | } | ||
2457 | |||
2458 | /* Update entry point based on load address of text section */ | ||
2459 | if (entry_sidx >= 0) | ||
2460 | entry += sechdrs[entry_sidx].sh_addr; | ||
2461 | |||
2462 | /* Make kernel jump to purgatory after shutdown */ | ||
2463 | image->start = entry; | ||
2464 | |||
2465 | /* Used later to get/set symbol values */ | ||
2466 | pi->sechdrs = sechdrs; | ||
2467 | |||
2468 | /* | ||
2469 | * Used later to identify which section is purgatory and skip it | ||
2470 | * from checksumming. | ||
2471 | */ | ||
2472 | pi->purgatory_buf = purgatory_buf; | ||
2473 | return ret; | ||
2474 | out: | ||
2475 | vfree(sechdrs); | ||
2476 | vfree(purgatory_buf); | ||
2477 | return ret; | ||
2478 | } | ||
2479 | |||
2480 | static int kexec_apply_relocations(struct kimage *image) | ||
2481 | { | ||
2482 | int i, ret; | ||
2483 | struct purgatory_info *pi = &image->purgatory_info; | ||
2484 | Elf_Shdr *sechdrs = pi->sechdrs; | ||
2485 | |||
2486 | /* Apply relocations */ | ||
2487 | for (i = 0; i < pi->ehdr->e_shnum; i++) { | ||
2488 | Elf_Shdr *section, *symtab; | ||
2489 | |||
2490 | if (sechdrs[i].sh_type != SHT_RELA && | ||
2491 | sechdrs[i].sh_type != SHT_REL) | ||
2492 | continue; | ||
2493 | |||
2494 | /* | ||
2495 | * For section of type SHT_RELA/SHT_REL, | ||
2496 | * ->sh_link contains section header index of associated | ||
2497 | * symbol table. And ->sh_info contains section header | ||
2498 | * index of section to which relocations apply. | ||
2499 | */ | ||
2500 | if (sechdrs[i].sh_info >= pi->ehdr->e_shnum || | ||
2501 | sechdrs[i].sh_link >= pi->ehdr->e_shnum) | ||
2502 | return -ENOEXEC; | ||
2503 | |||
2504 | section = &sechdrs[sechdrs[i].sh_info]; | ||
2505 | symtab = &sechdrs[sechdrs[i].sh_link]; | ||
2506 | |||
2507 | if (!(section->sh_flags & SHF_ALLOC)) | ||
2508 | continue; | ||
2509 | |||
2510 | /* | ||
2511 | * symtab->sh_link contains the section header index of the associated | ||
2512 | * string table. | ||
2513 | */ | ||
2514 | if (symtab->sh_link >= pi->ehdr->e_shnum) | ||
2515 | /* Invalid section number? */ | ||
2516 | continue; | ||
2517 | |||
2518 | /* | ||
2519 | * The respective architecture needs to provide support for applying | ||
2520 | * relocations of type SHT_RELA/SHT_REL. | ||
2521 | */ | ||
2522 | if (sechdrs[i].sh_type == SHT_RELA) | ||
2523 | ret = arch_kexec_apply_relocations_add(pi->ehdr, | ||
2524 | sechdrs, i); | ||
2525 | else if (sechdrs[i].sh_type == SHT_REL) | ||
2526 | ret = arch_kexec_apply_relocations(pi->ehdr, | ||
2527 | sechdrs, i); | ||
2528 | if (ret) | ||
2529 | return ret; | ||
2530 | } | ||
2531 | |||
2532 | return 0; | ||
2533 | } | ||
2534 | |||
2535 | /* Load relocatable purgatory object and relocate it appropriately */ | ||
2536 | int kexec_load_purgatory(struct kimage *image, unsigned long min, | ||
2537 | unsigned long max, int top_down, | ||
2538 | unsigned long *load_addr) | ||
2539 | { | ||
2540 | struct purgatory_info *pi = &image->purgatory_info; | ||
2541 | int ret; | ||
2542 | |||
2543 | if (kexec_purgatory_size <= 0) | ||
2544 | return -EINVAL; | ||
2545 | |||
2546 | if (kexec_purgatory_size < sizeof(Elf_Ehdr)) | ||
2547 | return -ENOEXEC; | ||
2548 | |||
2549 | pi->ehdr = (Elf_Ehdr *)kexec_purgatory; | ||
2550 | |||
2551 | if (memcmp(pi->ehdr->e_ident, ELFMAG, SELFMAG) != 0 | ||
2552 | || pi->ehdr->e_type != ET_REL | ||
2553 | || !elf_check_arch(pi->ehdr) | ||
2554 | || pi->ehdr->e_shentsize != sizeof(Elf_Shdr)) | ||
2555 | return -ENOEXEC; | ||
2556 | |||
2557 | if (pi->ehdr->e_shoff >= kexec_purgatory_size | ||
2558 | || (pi->ehdr->e_shnum * sizeof(Elf_Shdr) > | ||
2559 | kexec_purgatory_size - pi->ehdr->e_shoff)) | ||
2560 | return -ENOEXEC; | ||
2561 | |||
2562 | ret = __kexec_load_purgatory(image, min, max, top_down); | ||
2563 | if (ret) | ||
2564 | return ret; | ||
2565 | |||
2566 | ret = kexec_apply_relocations(image); | ||
2567 | if (ret) | ||
2568 | goto out; | ||
2569 | |||
2570 | *load_addr = pi->purgatory_load_addr; | ||
2571 | return 0; | ||
2572 | out: | ||
2573 | vfree(pi->sechdrs); | ||
2574 | vfree(pi->purgatory_buf); | ||
2575 | return ret; | ||
2576 | } | ||
2577 | |||
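Tying the purgatory interfaces together, an architecture image loader would, under these assumptions, first load the purgatory and then patch whatever symbols its blob exports; the address range and the patched symbol name below are illustrative only.

	unsigned long purgatory_addr;
	unsigned long stack_top = 0;	/* illustrative value to patch in */
	int ret;

	ret = kexec_load_purgatory(image, min_addr, max_addr, 1 /* top_down */,
				   &purgatory_addr);
	if (ret)
		return ret;

	/* Patch a hypothetical 'stack' variable exported by the purgatory. */
	ret = kexec_purgatory_get_set_symbol(image, "stack", &stack_top,
					     sizeof(stack_top), 0);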
2578 | static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi, | ||
2579 | const char *name) | ||
2580 | { | ||
2581 | Elf_Sym *syms; | ||
2582 | Elf_Shdr *sechdrs; | ||
2583 | Elf_Ehdr *ehdr; | ||
2584 | int i, k; | ||
2585 | const char *strtab; | ||
2586 | |||
2587 | if (!pi->sechdrs || !pi->ehdr) | ||
2588 | return NULL; | ||
2589 | |||
2590 | sechdrs = pi->sechdrs; | ||
2591 | ehdr = pi->ehdr; | ||
2592 | |||
2593 | for (i = 0; i < ehdr->e_shnum; i++) { | ||
2594 | if (sechdrs[i].sh_type != SHT_SYMTAB) | ||
2595 | continue; | ||
2596 | |||
2597 | if (sechdrs[i].sh_link >= ehdr->e_shnum) | ||
2598 | /* Invalid strtab section number */ | ||
2599 | continue; | ||
2600 | strtab = (char *)sechdrs[sechdrs[i].sh_link].sh_offset; | ||
2601 | syms = (Elf_Sym *)sechdrs[i].sh_offset; | ||
2602 | |||
2603 | /* Go through symbols for a match */ | ||
2604 | for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) { | ||
2605 | if (ELF_ST_BIND(syms[k].st_info) != STB_GLOBAL) | ||
2606 | continue; | ||
2607 | |||
2608 | if (strcmp(strtab + syms[k].st_name, name) != 0) | ||
2609 | continue; | ||
2610 | |||
2611 | if (syms[k].st_shndx == SHN_UNDEF || | ||
2612 | syms[k].st_shndx >= ehdr->e_shnum) { | ||
2613 | pr_debug("Symbol: %s has bad section index %d.\n", | ||
2614 | name, syms[k].st_shndx); | ||
2615 | return NULL; | ||
2616 | } | ||
2617 | |||
2618 | /* Found the symbol we are looking for */ | ||
2619 | return &syms[k]; | ||
2620 | } | ||
2621 | } | ||
2622 | |||
2623 | return NULL; | ||
2624 | } | ||
2625 | |||
2626 | void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name) | ||
2627 | { | ||
2628 | struct purgatory_info *pi = &image->purgatory_info; | ||
2629 | Elf_Sym *sym; | ||
2630 | Elf_Shdr *sechdr; | ||
2631 | |||
2632 | sym = kexec_purgatory_find_symbol(pi, name); | ||
2633 | if (!sym) | ||
2634 | return ERR_PTR(-EINVAL); | ||
2635 | |||
2636 | sechdr = &pi->sechdrs[sym->st_shndx]; | ||
2637 | |||
2638 | /* | ||
2639 | * Returns the address where symbol will finally be loaded after | ||
2640 | * kexec_load_segment() | ||
2641 | */ | ||
2642 | return (void *)(sechdr->sh_addr + sym->st_value); | ||
2643 | } | ||
2644 | |||
2645 | /* | ||
2646 | * Get or set value of a symbol. If "get_value" is true, symbol value is | ||
2647 | * returned in buf otherwise symbol value is set based on value in buf. | ||
2648 | */ | ||
2649 | int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name, | ||
2650 | void *buf, unsigned int size, bool get_value) | ||
2651 | { | ||
2652 | Elf_Sym *sym; | ||
2653 | Elf_Shdr *sechdrs; | ||
2654 | struct purgatory_info *pi = &image->purgatory_info; | ||
2655 | char *sym_buf; | ||
2656 | |||
2657 | sym = kexec_purgatory_find_symbol(pi, name); | ||
2658 | if (!sym) | ||
2659 | return -EINVAL; | ||
2660 | |||
2661 | if (sym->st_size != size) { | ||
2662 | pr_err("symbol %s size mismatch: expected %lu actual %u\n", | ||
2663 | name, (unsigned long)sym->st_size, size); | ||
2664 | return -EINVAL; | ||
2665 | } | ||
2666 | |||
2667 | sechdrs = pi->sechdrs; | ||
2668 | |||
2669 | if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) { | ||
2670 | pr_err("symbol %s is in a bss section. Cannot %s\n", name, | ||
2671 | get_value ? "get" : "set"); | ||
2672 | return -EINVAL; | ||
2673 | } | ||
2674 | |||
2675 | sym_buf = (unsigned char *)sechdrs[sym->st_shndx].sh_offset + | ||
2676 | sym->st_value; | ||
2677 | |||
2678 | if (get_value) | ||
2679 | memcpy((void *)buf, sym_buf, size); | ||
2680 | else | ||
2681 | memcpy((void *)sym_buf, buf, size); | ||
2682 | |||
2683 | return 0; | ||
2684 | } | ||
2685 | |||
1635 | /* | 2686 | /* |
1636 | * Move into place and start executing a preloaded standalone | 2687 | * Move into place and start executing a preloaded standalone |
1637 | * executable. If nothing was preloaded return an error. | 2688 | * executable. If nothing was preloaded return an error. |
diff --git a/kernel/module.c b/kernel/module.c index ae79ce615cb9..03214bd288e9 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
@@ -3304,6 +3304,11 @@ static int load_module(struct load_info *info, const char __user *uargs, | |||
3304 | mutex_lock(&module_mutex); | 3304 | mutex_lock(&module_mutex); |
3305 | module_bug_cleanup(mod); | 3305 | module_bug_cleanup(mod); |
3306 | mutex_unlock(&module_mutex); | 3306 | mutex_unlock(&module_mutex); |
3307 | |||
3308 | /* we can't deallocate the module until we clear memory protection */ | ||
3309 | unset_module_init_ro_nx(mod); | ||
3310 | unset_module_core_ro_nx(mod); | ||
3311 | |||
3307 | ddebug_cleanup: | 3312 | ddebug_cleanup: |
3308 | dynamic_debug_remove(info->debug); | 3313 | dynamic_debug_remove(info->debug); |
3309 | synchronize_sched(); | 3314 | synchronize_sched(); |
@@ -3381,6 +3386,8 @@ static inline int within(unsigned long addr, void *start, unsigned long size) | |||
3381 | */ | 3386 | */ |
3382 | static inline int is_arm_mapping_symbol(const char *str) | 3387 | static inline int is_arm_mapping_symbol(const char *str) |
3383 | { | 3388 | { |
3389 | if (str[0] == '.' && str[1] == 'L') | ||
3390 | return true; | ||
3384 | return str[0] == '$' && strchr("atd", str[1]) | 3391 | return str[0] == '$' && strchr("atd", str[1]) |
3385 | && (str[2] == '\0' || str[2] == '.'); | 3392 | && (str[2] == '\0' || str[2] == '.'); |
3386 | } | 3393 | } |
@@ -3444,8 +3451,7 @@ const char *module_address_lookup(unsigned long addr, | |||
3444 | list_for_each_entry_rcu(mod, &modules, list) { | 3451 | list_for_each_entry_rcu(mod, &modules, list) { |
3445 | if (mod->state == MODULE_STATE_UNFORMED) | 3452 | if (mod->state == MODULE_STATE_UNFORMED) |
3446 | continue; | 3453 | continue; |
3447 | if (within_module_init(addr, mod) || | 3454 | if (within_module(addr, mod)) { |
3448 | within_module_core(addr, mod)) { | ||
3449 | if (modname) | 3455 | if (modname) |
3450 | *modname = mod->name; | 3456 | *modname = mod->name; |
3451 | ret = get_ksymbol(mod, addr, size, offset); | 3457 | ret = get_ksymbol(mod, addr, size, offset); |
@@ -3469,8 +3475,7 @@ int lookup_module_symbol_name(unsigned long addr, char *symname) | |||
3469 | list_for_each_entry_rcu(mod, &modules, list) { | 3475 | list_for_each_entry_rcu(mod, &modules, list) { |
3470 | if (mod->state == MODULE_STATE_UNFORMED) | 3476 | if (mod->state == MODULE_STATE_UNFORMED) |
3471 | continue; | 3477 | continue; |
3472 | if (within_module_init(addr, mod) || | 3478 | if (within_module(addr, mod)) { |
3473 | within_module_core(addr, mod)) { | ||
3474 | const char *sym; | 3479 | const char *sym; |
3475 | 3480 | ||
3476 | sym = get_ksymbol(mod, addr, NULL, NULL); | 3481 | sym = get_ksymbol(mod, addr, NULL, NULL); |
@@ -3495,8 +3500,7 @@ int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size, | |||
3495 | list_for_each_entry_rcu(mod, &modules, list) { | 3500 | list_for_each_entry_rcu(mod, &modules, list) { |
3496 | if (mod->state == MODULE_STATE_UNFORMED) | 3501 | if (mod->state == MODULE_STATE_UNFORMED) |
3497 | continue; | 3502 | continue; |
3498 | if (within_module_init(addr, mod) || | 3503 | if (within_module(addr, mod)) { |
3499 | within_module_core(addr, mod)) { | ||
3500 | const char *sym; | 3504 | const char *sym; |
3501 | 3505 | ||
3502 | sym = get_ksymbol(mod, addr, size, offset); | 3506 | sym = get_ksymbol(mod, addr, size, offset); |
@@ -3760,8 +3764,7 @@ struct module *__module_address(unsigned long addr) | |||
3760 | list_for_each_entry_rcu(mod, &modules, list) { | 3764 | list_for_each_entry_rcu(mod, &modules, list) { |
3761 | if (mod->state == MODULE_STATE_UNFORMED) | 3765 | if (mod->state == MODULE_STATE_UNFORMED) |
3762 | continue; | 3766 | continue; |
3763 | if (within_module_core(addr, mod) | 3767 | if (within_module(addr, mod)) |
3764 | || within_module_init(addr, mod)) | ||
3765 | return mod; | 3768 | return mod; |
3766 | } | 3769 | } |
3767 | return NULL; | 3770 | return NULL; |
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 8e7811086b82..ef42d0ab3115 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c | |||
@@ -204,20 +204,13 @@ void switch_task_namespaces(struct task_struct *p, struct nsproxy *new) | |||
204 | 204 | ||
205 | might_sleep(); | 205 | might_sleep(); |
206 | 206 | ||
207 | task_lock(p); | ||
207 | ns = p->nsproxy; | 208 | ns = p->nsproxy; |
209 | p->nsproxy = new; | ||
210 | task_unlock(p); | ||
208 | 211 | ||
209 | rcu_assign_pointer(p->nsproxy, new); | 212 | if (ns && atomic_dec_and_test(&ns->count)) |
210 | |||
211 | if (ns && atomic_dec_and_test(&ns->count)) { | ||
212 | /* | ||
213 | * wait for others to get what they want from this nsproxy. | ||
214 | * | ||
215 | * cannot release this nsproxy via the call_rcu() since | ||
216 | * put_mnt_ns() will want to sleep | ||
217 | */ | ||
218 | synchronize_rcu(); | ||
219 | free_nsproxy(ns); | 213 | free_nsproxy(ns); |
220 | } | ||
221 | } | 214 | } |
222 | 215 | ||
223 | void exit_task_namespaces(struct task_struct *p) | 216 | void exit_task_namespaces(struct task_struct *p) |
diff --git a/kernel/panic.c b/kernel/panic.c index 62e16cef9cc2..d09dc5c32c67 100644 --- a/kernel/panic.c +++ b/kernel/panic.c | |||
@@ -224,6 +224,7 @@ static const struct tnt tnts[] = { | |||
224 | { TAINT_FIRMWARE_WORKAROUND, 'I', ' ' }, | 224 | { TAINT_FIRMWARE_WORKAROUND, 'I', ' ' }, |
225 | { TAINT_OOT_MODULE, 'O', ' ' }, | 225 | { TAINT_OOT_MODULE, 'O', ' ' }, |
226 | { TAINT_UNSIGNED_MODULE, 'E', ' ' }, | 226 | { TAINT_UNSIGNED_MODULE, 'E', ' ' }, |
227 | { TAINT_SOFTLOCKUP, 'L', ' ' }, | ||
227 | }; | 228 | }; |
228 | 229 | ||
229 | /** | 230 | /** |
diff --git a/kernel/params.c b/kernel/params.c index 1e52ca233fd9..34f527023794 100644 --- a/kernel/params.c +++ b/kernel/params.c | |||
@@ -256,6 +256,7 @@ STANDARD_PARAM_DEF(int, int, "%i", kstrtoint); | |||
256 | STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint); | 256 | STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint); |
257 | STANDARD_PARAM_DEF(long, long, "%li", kstrtol); | 257 | STANDARD_PARAM_DEF(long, long, "%li", kstrtol); |
258 | STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul); | 258 | STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul); |
259 | STANDARD_PARAM_DEF(ullong, unsigned long long, "%llu", kstrtoull); | ||
259 | 260 | ||
260 | int param_set_charp(const char *val, const struct kernel_param *kp) | 261 | int param_set_charp(const char *val, const struct kernel_param *kp) |
261 | { | 262 | { |
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 9a83d780facd..e4e4121fa327 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig | |||
@@ -253,9 +253,6 @@ config APM_EMULATION | |||
253 | anything, try disabling/enabling this option (or disabling/enabling | 253 | anything, try disabling/enabling this option (or disabling/enabling |
254 | APM in your BIOS). | 254 | APM in your BIOS). |
255 | 255 | ||
256 | config ARCH_HAS_OPP | ||
257 | bool | ||
258 | |||
259 | config PM_OPP | 256 | config PM_OPP |
260 | bool | 257 | bool |
261 | ---help--- | 258 | ---help--- |
diff --git a/kernel/power/main.c b/kernel/power/main.c index 8e90f330f139..9a59d042ea84 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c | |||
@@ -296,8 +296,8 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, | |||
296 | suspend_state_t i; | 296 | suspend_state_t i; |
297 | 297 | ||
298 | for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) | 298 | for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) |
299 | if (pm_states[i].state) | 299 | if (pm_states[i]) |
300 | s += sprintf(s,"%s ", pm_states[i].label); | 300 | s += sprintf(s,"%s ", pm_states[i]); |
301 | 301 | ||
302 | #endif | 302 | #endif |
303 | if (hibernation_available()) | 303 | if (hibernation_available()) |
@@ -311,8 +311,7 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, | |||
311 | static suspend_state_t decode_state(const char *buf, size_t n) | 311 | static suspend_state_t decode_state(const char *buf, size_t n) |
312 | { | 312 | { |
313 | #ifdef CONFIG_SUSPEND | 313 | #ifdef CONFIG_SUSPEND |
314 | suspend_state_t state = PM_SUSPEND_MIN; | 314 | suspend_state_t state; |
315 | struct pm_sleep_state *s; | ||
316 | #endif | 315 | #endif |
317 | char *p; | 316 | char *p; |
318 | int len; | 317 | int len; |
@@ -325,10 +324,12 @@ static suspend_state_t decode_state(const char *buf, size_t n) | |||
325 | return PM_SUSPEND_MAX; | 324 | return PM_SUSPEND_MAX; |
326 | 325 | ||
327 | #ifdef CONFIG_SUSPEND | 326 | #ifdef CONFIG_SUSPEND |
328 | for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) | 327 | for (state = PM_SUSPEND_MIN; state < PM_SUSPEND_MAX; state++) { |
329 | if (s->state && len == strlen(s->label) | 328 | const char *label = pm_states[state]; |
330 | && !strncmp(buf, s->label, len)) | 329 | |
331 | return s->state; | 330 | if (label && len == strlen(label) && !strncmp(buf, label, len)) |
331 | return state; | ||
332 | } | ||
332 | #endif | 333 | #endif |
333 | 334 | ||
334 | return PM_SUSPEND_ON; | 335 | return PM_SUSPEND_ON; |
@@ -446,8 +447,8 @@ static ssize_t autosleep_show(struct kobject *kobj, | |||
446 | 447 | ||
447 | #ifdef CONFIG_SUSPEND | 448 | #ifdef CONFIG_SUSPEND |
448 | if (state < PM_SUSPEND_MAX) | 449 | if (state < PM_SUSPEND_MAX) |
449 | return sprintf(buf, "%s\n", pm_states[state].state ? | 450 | return sprintf(buf, "%s\n", pm_states[state] ? |
450 | pm_states[state].label : "error"); | 451 | pm_states[state] : "error"); |
451 | #endif | 452 | #endif |
452 | #ifdef CONFIG_HIBERNATION | 453 | #ifdef CONFIG_HIBERNATION |
453 | return sprintf(buf, "disk\n"); | 454 | return sprintf(buf, "disk\n"); |
@@ -615,7 +616,6 @@ static struct attribute_group attr_group = { | |||
615 | .attrs = g, | 616 | .attrs = g, |
616 | }; | 617 | }; |
617 | 618 | ||
618 | #ifdef CONFIG_PM_RUNTIME | ||
619 | struct workqueue_struct *pm_wq; | 619 | struct workqueue_struct *pm_wq; |
620 | EXPORT_SYMBOL_GPL(pm_wq); | 620 | EXPORT_SYMBOL_GPL(pm_wq); |
621 | 621 | ||
@@ -625,9 +625,6 @@ static int __init pm_start_workqueue(void) | |||
625 | 625 | ||
626 | return pm_wq ? 0 : -ENOMEM; | 626 | return pm_wq ? 0 : -ENOMEM; |
627 | } | 627 | } |
628 | #else | ||
629 | static inline int pm_start_workqueue(void) { return 0; } | ||
630 | #endif | ||
631 | 628 | ||
632 | static int __init pm_init(void) | 629 | static int __init pm_init(void) |
633 | { | 630 | { |
diff --git a/kernel/power/power.h b/kernel/power/power.h index c60f13b5270a..5d49dcac2537 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h | |||
@@ -178,13 +178,8 @@ extern void swsusp_show_speed(struct timeval *, struct timeval *, | |||
178 | unsigned int, char *); | 178 | unsigned int, char *); |
179 | 179 | ||
180 | #ifdef CONFIG_SUSPEND | 180 | #ifdef CONFIG_SUSPEND |
181 | struct pm_sleep_state { | ||
182 | const char *label; | ||
183 | suspend_state_t state; | ||
184 | }; | ||
185 | |||
186 | /* kernel/power/suspend.c */ | 181 | /* kernel/power/suspend.c */ |
187 | extern struct pm_sleep_state pm_states[]; | 182 | extern const char *pm_states[]; |
188 | 183 | ||
189 | extern int suspend_devices_and_enter(suspend_state_t state); | 184 | extern int suspend_devices_and_enter(suspend_state_t state); |
190 | #else /* !CONFIG_SUSPEND */ | 185 | #else /* !CONFIG_SUSPEND */ |
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 1ea328aafdc9..c4b8093c80b3 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c | |||
@@ -248,33 +248,61 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size) | |||
248 | * information is stored (in the form of a block of bitmap) | 248 | * information is stored (in the form of a block of bitmap) |
249 | * It also contains the pfns that correspond to the start and end of | 249 | * It also contains the pfns that correspond to the start and end of |
250 | * the represented memory area. | 250 | * the represented memory area. |
251 | * | ||
252 | * The memory bitmap is organized as a radix tree to guarantee fast random | ||
253 | * access to the bits. There is one radix tree for each zone (as returned | ||
254 | * from create_mem_extents). | ||
255 | * | ||
256 | * One radix tree is represented by one struct mem_zone_bm_rtree. There are | ||
257 | * two linked lists for the nodes of the tree, one for the inner nodes and | ||
258 | * one for the leave nodes. The linked leave nodes are used for fast linear | ||
259 | * access of the memory bitmap. | ||
260 | * | ||
261 | * The struct rtree_node represents one node of the radix tree. | ||
251 | */ | 262 | */ |
252 | 263 | ||
253 | #define BM_END_OF_MAP (~0UL) | 264 | #define BM_END_OF_MAP (~0UL) |
254 | 265 | ||
255 | #define BM_BITS_PER_BLOCK (PAGE_SIZE * BITS_PER_BYTE) | 266 | #define BM_BITS_PER_BLOCK (PAGE_SIZE * BITS_PER_BYTE) |
267 | #define BM_BLOCK_SHIFT (PAGE_SHIFT + 3) | ||
268 | #define BM_BLOCK_MASK ((1UL << BM_BLOCK_SHIFT) - 1) | ||
256 | 269 | ||
257 | struct bm_block { | 270 | /* |
258 | struct list_head hook; /* hook into a list of bitmap blocks */ | 271 | * struct rtree_node is a wrapper struct to link the nodes |
259 | unsigned long start_pfn; /* pfn represented by the first bit */ | 272 | * of the rtree together for easy linear iteration over |
260 | unsigned long end_pfn; /* pfn represented by the last bit plus 1 */ | 273 | * bits and easy freeing |
261 | unsigned long *data; /* bitmap representing pages */ | 274 | */ |
275 | struct rtree_node { | ||
276 | struct list_head list; | ||
277 | unsigned long *data; | ||
262 | }; | 278 | }; |
263 | 279 | ||
264 | static inline unsigned long bm_block_bits(struct bm_block *bb) | 280 | /* |
265 | { | 281 | * struct mem_zone_bm_rtree represents a bitmap used for one |
266 | return bb->end_pfn - bb->start_pfn; | 282 | * populated memory zone. |
267 | } | 283 | */ |
284 | struct mem_zone_bm_rtree { | ||
285 | struct list_head list; /* Link Zones together */ | ||
286 | struct list_head nodes; /* Radix Tree inner nodes */ | ||
287 | struct list_head leaves; /* Radix Tree leaves */ | ||
288 | unsigned long start_pfn; /* Zone start page frame */ | ||
289 | unsigned long end_pfn; /* Zone end page frame + 1 */ | ||
290 | struct rtree_node *rtree; /* Radix Tree Root */ | ||
291 | int levels; /* Number of Radix Tree Levels */ | ||
292 | unsigned int blocks; /* Number of Bitmap Blocks */ | ||
293 | }; | ||
268 | 294 | ||
269 | /* struct bm_position is used for browsing memory bitmaps */ | 295 | /* struct bm_position is used for browsing memory bitmaps */ |
270 | 296 | ||
271 | struct bm_position { | 297 | struct bm_position { |
272 | struct bm_block *block; | 298 | struct mem_zone_bm_rtree *zone; |
273 | int bit; | 299 | struct rtree_node *node; |
300 | unsigned long node_pfn; | ||
301 | int node_bit; | ||
274 | }; | 302 | }; |
275 | 303 | ||
276 | struct memory_bitmap { | 304 | struct memory_bitmap { |
277 | struct list_head blocks; /* list of bitmap blocks */ | 305 | struct list_head zones; |
278 | struct linked_page *p_list; /* list of pages used to store zone | 306 | struct linked_page *p_list; /* list of pages used to store zone |
279 | * bitmap objects and bitmap block | 307 | * bitmap objects and bitmap block |
280 | * objects | 308 | * objects |
@@ -284,38 +312,178 @@ struct memory_bitmap { | |||
284 | 312 | ||
285 | /* Functions that operate on memory bitmaps */ | 313 | /* Functions that operate on memory bitmaps */ |
286 | 314 | ||
287 | static void memory_bm_position_reset(struct memory_bitmap *bm) | 315 | #define BM_ENTRIES_PER_LEVEL (PAGE_SIZE / sizeof(unsigned long)) |
316 | #if BITS_PER_LONG == 32 | ||
317 | #define BM_RTREE_LEVEL_SHIFT (PAGE_SHIFT - 2) | ||
318 | #else | ||
319 | #define BM_RTREE_LEVEL_SHIFT (PAGE_SHIFT - 3) | ||
320 | #endif | ||
321 | #define BM_RTREE_LEVEL_MASK ((1UL << BM_RTREE_LEVEL_SHIFT) - 1) | ||
322 | |||
323 | /* | ||
324 | * alloc_rtree_node - Allocate a new node and add it to the radix tree. | ||
325 | * | ||
326 | * This function is used to allocate inner nodes as well as the | ||
327 | * leave nodes of the radix tree. It also adds the node to the | ||
328 | * corresponding linked list passed in by the *list parameter. | ||
329 | */ | ||
330 | static struct rtree_node *alloc_rtree_node(gfp_t gfp_mask, int safe_needed, | ||
331 | struct chain_allocator *ca, | ||
332 | struct list_head *list) | ||
288 | { | 333 | { |
289 | bm->cur.block = list_entry(bm->blocks.next, struct bm_block, hook); | 334 | struct rtree_node *node; |
290 | bm->cur.bit = 0; | ||
291 | } | ||
292 | 335 | ||
293 | static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free); | 336 | node = chain_alloc(ca, sizeof(struct rtree_node)); |
337 | if (!node) | ||
338 | return NULL; | ||
294 | 339 | ||
295 | /** | 340 | node->data = get_image_page(gfp_mask, safe_needed); |
296 | * create_bm_block_list - create a list of block bitmap objects | 341 | if (!node->data) |
297 | * @pages - number of pages to track | 342 | return NULL; |
298 | * @list - list to put the allocated blocks into | 343 | |
299 | * @ca - chain allocator to be used for allocating memory | 344 | list_add_tail(&node->list, list); |
345 | |||
346 | return node; | ||
347 | } | ||
348 | |||
349 | /* | ||
350 | * add_rtree_block - Add a new leave node to the radix tree | ||
351 | * | ||
352 | * The leave nodes need to be allocated in order to keep the leaves | ||
353 | * linked list in order. This is guaranteed by the zone->blocks | ||
354 | * counter. | ||
300 | */ | 355 | */ |
301 | static int create_bm_block_list(unsigned long pages, | 356 | static int add_rtree_block(struct mem_zone_bm_rtree *zone, gfp_t gfp_mask, |
302 | struct list_head *list, | 357 | int safe_needed, struct chain_allocator *ca) |
303 | struct chain_allocator *ca) | ||
304 | { | 358 | { |
305 | unsigned int nr_blocks = DIV_ROUND_UP(pages, BM_BITS_PER_BLOCK); | 359 | struct rtree_node *node, *block, **dst; |
360 | unsigned int levels_needed, block_nr; | ||
361 | int i; | ||
306 | 362 | ||
307 | while (nr_blocks-- > 0) { | 363 | block_nr = zone->blocks; |
308 | struct bm_block *bb; | 364 | levels_needed = 0; |
309 | 365 | ||
310 | bb = chain_alloc(ca, sizeof(struct bm_block)); | 366 | /* How many levels do we need for this block nr? */ |
311 | if (!bb) | 367 | while (block_nr) { |
368 | levels_needed += 1; | ||
369 | block_nr >>= BM_RTREE_LEVEL_SHIFT; | ||
370 | } | ||
371 | |||
372 | /* Make sure the rtree has enough levels */ | ||
373 | for (i = zone->levels; i < levels_needed; i++) { | ||
374 | node = alloc_rtree_node(gfp_mask, safe_needed, ca, | ||
375 | &zone->nodes); | ||
376 | if (!node) | ||
312 | return -ENOMEM; | 377 | return -ENOMEM; |
313 | list_add(&bb->hook, list); | 378 | |
379 | node->data[0] = (unsigned long)zone->rtree; | ||
380 | zone->rtree = node; | ||
381 | zone->levels += 1; | ||
314 | } | 382 | } |
315 | 383 | ||
384 | /* Allocate new block */ | ||
385 | block = alloc_rtree_node(gfp_mask, safe_needed, ca, &zone->leaves); | ||
386 | if (!block) | ||
387 | return -ENOMEM; | ||
388 | |||
389 | /* Now walk the rtree to insert the block */ | ||
390 | node = zone->rtree; | ||
391 | dst = &zone->rtree; | ||
392 | block_nr = zone->blocks; | ||
393 | for (i = zone->levels; i > 0; i--) { | ||
394 | int index; | ||
395 | |||
396 | if (!node) { | ||
397 | node = alloc_rtree_node(gfp_mask, safe_needed, ca, | ||
398 | &zone->nodes); | ||
399 | if (!node) | ||
400 | return -ENOMEM; | ||
401 | *dst = node; | ||
402 | } | ||
403 | |||
404 | index = block_nr >> ((i - 1) * BM_RTREE_LEVEL_SHIFT); | ||
405 | index &= BM_RTREE_LEVEL_MASK; | ||
406 | dst = (struct rtree_node **)&((*dst)->data[index]); | ||
407 | node = *dst; | ||
408 | } | ||
409 | |||
410 | zone->blocks += 1; | ||
411 | *dst = block; | ||
412 | |||
316 | return 0; | 413 | return 0; |
317 | } | 414 | } |
318 | 415 | ||
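To make the tree walk above concrete, the sketch below (an illustration, not part of the patch) shows how a pfn decomposes into per-level indices and a bit offset, using the macros defined earlier in this file.

static void example_rtree_indices(struct mem_zone_bm_rtree *zone,
				  unsigned long pfn)
{
	unsigned long block_nr = (pfn - zone->start_pfn) >> BM_BLOCK_SHIFT;
	unsigned int bit = (pfn - zone->start_pfn) & BM_BLOCK_MASK;
	int i;

	/* Same walk as add_rtree_block()/memory_bm_find_bit() perform. */
	for (i = zone->levels; i > 0; i--) {
		int index = (block_nr >> ((i - 1) * BM_RTREE_LEVEL_SHIFT)) &
			    BM_RTREE_LEVEL_MASK;
		pr_debug("level %d: index %d\n", i, index);
	}
	pr_debug("bit %u within the leaf page\n", bit);
}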
416 | static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone, | ||
417 | int clear_nosave_free); | ||
418 | |||
419 | /* | ||
420 | * create_zone_bm_rtree - create a radix tree for one zone | ||
421 | * | ||
422 | * Allocates the mem_zone_bm_rtree structure and initializes it. | ||
423 | * This function also allocates and builds the radix tree for the | ||
424 | * zone. | ||
425 | */ | ||
426 | static struct mem_zone_bm_rtree * | ||
427 | create_zone_bm_rtree(gfp_t gfp_mask, int safe_needed, | ||
428 | struct chain_allocator *ca, | ||
429 | unsigned long start, unsigned long end) | ||
430 | { | ||
431 | struct mem_zone_bm_rtree *zone; | ||
432 | unsigned int i, nr_blocks; | ||
433 | unsigned long pages; | ||
434 | |||
435 | pages = end - start; | ||
436 | zone = chain_alloc(ca, sizeof(struct mem_zone_bm_rtree)); | ||
437 | if (!zone) | ||
438 | return NULL; | ||
439 | |||
440 | INIT_LIST_HEAD(&zone->nodes); | ||
441 | INIT_LIST_HEAD(&zone->leaves); | ||
442 | zone->start_pfn = start; | ||
443 | zone->end_pfn = end; | ||
444 | nr_blocks = DIV_ROUND_UP(pages, BM_BITS_PER_BLOCK); | ||
445 | |||
446 | for (i = 0; i < nr_blocks; i++) { | ||
447 | if (add_rtree_block(zone, gfp_mask, safe_needed, ca)) { | ||
448 | free_zone_bm_rtree(zone, PG_UNSAFE_CLEAR); | ||
449 | return NULL; | ||
450 | } | ||
451 | } | ||
452 | |||
453 | return zone; | ||
454 | } | ||
455 | |||
456 | /* | ||
457 | * free_zone_bm_rtree - Free the memory of the radix tree | ||
458 | * | ||
459 | * Free all node pages of the radix tree. The mem_zone_bm_rtree | ||
460 | * structure itself is not freed here nor are the rtree_node | ||
461 | * structs. | ||
462 | */ | ||
463 | static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone, | ||
464 | int clear_nosave_free) | ||
465 | { | ||
466 | struct rtree_node *node; | ||
467 | |||
468 | list_for_each_entry(node, &zone->nodes, list) | ||
469 | free_image_page(node->data, clear_nosave_free); | ||
470 | |||
471 | list_for_each_entry(node, &zone->leaves, list) | ||
472 | free_image_page(node->data, clear_nosave_free); | ||
473 | } | ||
474 | |||
475 | static void memory_bm_position_reset(struct memory_bitmap *bm) | ||
476 | { | ||
477 | bm->cur.zone = list_entry(bm->zones.next, struct mem_zone_bm_rtree, | ||
478 | list); | ||
479 | bm->cur.node = list_entry(bm->cur.zone->leaves.next, | ||
480 | struct rtree_node, list); | ||
481 | bm->cur.node_pfn = 0; | ||
482 | bm->cur.node_bit = 0; | ||
483 | } | ||
484 | |||
485 | static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free); | ||
486 | |||
319 | struct mem_extent { | 487 | struct mem_extent { |
320 | struct list_head hook; | 488 | struct list_head hook; |
321 | unsigned long start; | 489 | unsigned long start; |
@@ -407,40 +575,22 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) | |||
407 | int error; | 575 | int error; |
408 | 576 | ||
409 | chain_init(&ca, gfp_mask, safe_needed); | 577 | chain_init(&ca, gfp_mask, safe_needed); |
410 | INIT_LIST_HEAD(&bm->blocks); | 578 | INIT_LIST_HEAD(&bm->zones); |
411 | 579 | ||
412 | error = create_mem_extents(&mem_extents, gfp_mask); | 580 | error = create_mem_extents(&mem_extents, gfp_mask); |
413 | if (error) | 581 | if (error) |
414 | return error; | 582 | return error; |
415 | 583 | ||
416 | list_for_each_entry(ext, &mem_extents, hook) { | 584 | list_for_each_entry(ext, &mem_extents, hook) { |
417 | struct bm_block *bb; | 585 | struct mem_zone_bm_rtree *zone; |
418 | unsigned long pfn = ext->start; | ||
419 | unsigned long pages = ext->end - ext->start; | ||
420 | 586 | ||
421 | bb = list_entry(bm->blocks.prev, struct bm_block, hook); | 587 | zone = create_zone_bm_rtree(gfp_mask, safe_needed, &ca, |
422 | 588 | ext->start, ext->end); | |
423 | error = create_bm_block_list(pages, bm->blocks.prev, &ca); | 589 | if (!zone) { |
424 | if (error) | 590 | error = -ENOMEM; |
425 | goto Error; | 591 | goto Error; |
426 | |||
427 | list_for_each_entry_continue(bb, &bm->blocks, hook) { | ||
428 | bb->data = get_image_page(gfp_mask, safe_needed); | ||
429 | if (!bb->data) { | ||
430 | error = -ENOMEM; | ||
431 | goto Error; | ||
432 | } | ||
433 | |||
434 | bb->start_pfn = pfn; | ||
435 | if (pages >= BM_BITS_PER_BLOCK) { | ||
436 | pfn += BM_BITS_PER_BLOCK; | ||
437 | pages -= BM_BITS_PER_BLOCK; | ||
438 | } else { | ||
439 | /* This is executed only once in the loop */ | ||
440 | pfn += pages; | ||
441 | } | ||
442 | bb->end_pfn = pfn; | ||
443 | } | 592 | } |
593 | list_add_tail(&zone->list, &bm->zones); | ||
444 | } | 594 | } |
445 | 595 | ||
446 | bm->p_list = ca.chain; | 596 | bm->p_list = ca.chain; |
@@ -460,51 +610,83 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) | |||
460 | */ | 610 | */ |
461 | static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free) | 611 | static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free) |
462 | { | 612 | { |
463 | struct bm_block *bb; | 613 | struct mem_zone_bm_rtree *zone; |
464 | 614 | ||
465 | list_for_each_entry(bb, &bm->blocks, hook) | 615 | list_for_each_entry(zone, &bm->zones, list) |
466 | if (bb->data) | 616 | free_zone_bm_rtree(zone, clear_nosave_free); |
467 | free_image_page(bb->data, clear_nosave_free); | ||
468 | 617 | ||
469 | free_list_of_pages(bm->p_list, clear_nosave_free); | 618 | free_list_of_pages(bm->p_list, clear_nosave_free); |
470 | 619 | ||
471 | INIT_LIST_HEAD(&bm->blocks); | 620 | INIT_LIST_HEAD(&bm->zones); |
472 | } | 621 | } |
473 | 622 | ||
474 | /** | 623 | /** |
475 | * memory_bm_find_bit - find the bit in the bitmap @bm that corresponds | 624 | * memory_bm_find_bit - Find the bit for pfn in the memory |
476 | * to given pfn. The cur_zone_bm member of @bm and the cur_block member | 625 | * bitmap |
477 | * of @bm->cur_zone_bm are updated. | 626 | * |
627 | * Find the bit in the bitmap @bm that corresponds to given pfn. | ||
628 | * The cur.zone, cur.block and cur.node_pfn member of @bm are | ||
629 | * updated. | ||
630 | * It walks the radix tree to find the page which contains the bit for | ||
631 | * pfn and returns the bit position in **addr and *bit_nr. | ||
478 | */ | 632 | */ |
479 | static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, | 633 | static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, |
480 | void **addr, unsigned int *bit_nr) | 634 | void **addr, unsigned int *bit_nr) |
481 | { | 635 | { |
482 | struct bm_block *bb; | 636 | struct mem_zone_bm_rtree *curr, *zone; |
637 | struct rtree_node *node; | ||
638 | int i, block_nr; | ||
639 | |||
640 | zone = bm->cur.zone; | ||
641 | |||
642 | if (pfn >= zone->start_pfn && pfn < zone->end_pfn) | ||
643 | goto zone_found; | ||
644 | |||
645 | zone = NULL; | ||
646 | |||
647 | /* Find the right zone */ | ||
648 | list_for_each_entry(curr, &bm->zones, list) { | ||
649 | if (pfn >= curr->start_pfn && pfn < curr->end_pfn) { | ||
650 | zone = curr; | ||
651 | break; | ||
652 | } | ||
653 | } | ||
483 | 654 | ||
655 | if (!zone) | ||
656 | return -EFAULT; | ||
657 | |||
658 | zone_found: | ||
484 | /* | 659 | /* |
485 | * Check if the pfn corresponds to the current bitmap block and find | 660 | * We have a zone. Now walk the radix tree to find the leave |
486 | * the block where it fits if this is not the case. | 661 | * node for our pfn. |
487 | */ | 662 | */ |
488 | bb = bm->cur.block; | ||
489 | if (pfn < bb->start_pfn) | ||
490 | list_for_each_entry_continue_reverse(bb, &bm->blocks, hook) | ||
491 | if (pfn >= bb->start_pfn) | ||
492 | break; | ||
493 | 663 | ||
494 | if (pfn >= bb->end_pfn) | 664 | node = bm->cur.node; |
495 | list_for_each_entry_continue(bb, &bm->blocks, hook) | 665 | if (((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur.node_pfn) |
496 | if (pfn >= bb->start_pfn && pfn < bb->end_pfn) | 666 | goto node_found; |
497 | break; | ||
498 | 667 | ||
499 | if (&bb->hook == &bm->blocks) | 668 | node = zone->rtree; |
500 | return -EFAULT; | 669 | block_nr = (pfn - zone->start_pfn) >> BM_BLOCK_SHIFT; |
670 | |||
671 | for (i = zone->levels; i > 0; i--) { | ||
672 | int index; | ||
673 | |||
674 | index = block_nr >> ((i - 1) * BM_RTREE_LEVEL_SHIFT); | ||
675 | index &= BM_RTREE_LEVEL_MASK; | ||
676 | BUG_ON(node->data[index] == 0); | ||
677 | node = (struct rtree_node *)node->data[index]; | ||
678 | } | ||
679 | |||
680 | node_found: | ||
681 | /* Update last position */ | ||
682 | bm->cur.zone = zone; | ||
683 | bm->cur.node = node; | ||
684 | bm->cur.node_pfn = (pfn - zone->start_pfn) & ~BM_BLOCK_MASK; | ||
685 | |||
686 | /* Set return values */ | ||
687 | *addr = node->data; | ||
688 | *bit_nr = (pfn - zone->start_pfn) & BM_BLOCK_MASK; | ||
501 | 689 | ||
502 | /* The block has been found */ | ||
503 | bm->cur.block = bb; | ||
504 | pfn -= bb->start_pfn; | ||
505 | bm->cur.bit = pfn + 1; | ||
506 | *bit_nr = pfn; | ||
507 | *addr = bb->data; | ||
508 | return 0; | 690 | return 0; |
509 | } | 691 | } |
510 | 692 | ||
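The bit lookup above boils down to two pieces of index arithmetic: the low BM_BLOCK_SHIFT bits of the zone-relative pfn select a bit inside a leaf page, and the remaining bits, consumed BM_RTREE_LEVEL_SHIFT at a time from the top level down, select a slot in each internal node. Below is a minimal userspace sketch of that arithmetic, not the kernel code itself; the constants assume 4 KiB pages and 8-byte pointers, and start_pfn, pfn and levels are made-up inputs.

/* Userspace sketch of the radix-tree index math used by memory_bm_find_bit().
 * The macro names mirror the kernel's, but the values are illustrative
 * assumptions, not taken from any particular .config.
 */
#include <stdio.h>

#define PAGE_SIZE              4096UL
#define BM_BITS_PER_BLOCK      (PAGE_SIZE * 8)               /* bits in one leaf page   */
#define BM_BLOCK_SHIFT         15                            /* log2(BM_BITS_PER_BLOCK) */
#define BM_BLOCK_MASK          (BM_BITS_PER_BLOCK - 1)
#define BM_ENTRIES_PER_LEVEL   (PAGE_SIZE / sizeof(void *))  /* 512 on 64-bit           */
#define BM_RTREE_LEVEL_SHIFT   9                             /* log2(entries per level) */
#define BM_RTREE_LEVEL_MASK    (BM_ENTRIES_PER_LEVEL - 1)

int main(void)
{
	unsigned long start_pfn = 0x1000;     /* assumed zone start */
	unsigned long pfn       = 0x9abcdef;  /* pfn we want to look up */
	int levels = 2;                       /* assumed tree depth */
	int i;

	unsigned long block_nr = (pfn - start_pfn) >> BM_BLOCK_SHIFT;
	unsigned long bit_nr   = (pfn - start_pfn) & BM_BLOCK_MASK;

	printf("leaf block %lu, bit %lu inside that block\n", block_nr, bit_nr);

	for (i = levels; i > 0; i--) {
		unsigned long index = block_nr >> ((i - 1) * BM_RTREE_LEVEL_SHIFT);

		index &= BM_RTREE_LEVEL_MASK;
		printf("level %d: follow slot %lu\n", i, index);
	}
	return 0;
}

With these constants a two-level tree can address 2^33 page frames per zone, which is why two levels suffice for the example pfn above.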
@@ -528,6 +710,7 @@ static int mem_bm_set_bit_check(struct memory_bitmap *bm, unsigned long pfn) | |||
528 | error = memory_bm_find_bit(bm, pfn, &addr, &bit); | 710 | error = memory_bm_find_bit(bm, pfn, &addr, &bit); |
529 | if (!error) | 711 | if (!error) |
530 | set_bit(bit, addr); | 712 | set_bit(bit, addr); |
713 | |||
531 | return error; | 714 | return error; |
532 | } | 715 | } |
533 | 716 | ||
@@ -542,6 +725,14 @@ static void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn) | |||
542 | clear_bit(bit, addr); | 725 | clear_bit(bit, addr); |
543 | } | 726 | } |
544 | 727 | ||
728 | static void memory_bm_clear_current(struct memory_bitmap *bm) | ||
729 | { | ||
730 | int bit; | ||
731 | |||
732 | bit = max(bm->cur.node_bit - 1, 0); | ||
733 | clear_bit(bit, bm->cur.node->data); | ||
734 | } | ||
735 | |||
545 | static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn) | 736 | static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn) |
546 | { | 737 | { |
547 | void *addr; | 738 | void *addr; |
@@ -561,38 +752,70 @@ static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn) | |||
561 | return !memory_bm_find_bit(bm, pfn, &addr, &bit); | 752 | return !memory_bm_find_bit(bm, pfn, &addr, &bit); |
562 | } | 753 | } |
563 | 754 | ||
564 | /** | 755 | /* |
565 | * memory_bm_next_pfn - find the pfn that corresponds to the next set bit | 756 | * rtree_next_node - Jumps to the next leaf node |
566 | * in the bitmap @bm. If the pfn cannot be found, BM_END_OF_MAP is | ||
567 | * returned. | ||
568 | * | 757 | * |
569 | * It is required to run memory_bm_position_reset() before the first call to | 758 | * Sets the position to the beginning of the next node in the |
570 | * this function. | 759 | * memory bitmap. This is either the next node in the current |
760 | * zone's radix tree or the first node in the radix tree of the | ||
761 | * next zone. | ||
762 | * | ||
763 | * Returns true if there is a next node, false otherwise. | ||
571 | */ | 764 | */ |
765 | static bool rtree_next_node(struct memory_bitmap *bm) | ||
766 | { | ||
767 | bm->cur.node = list_entry(bm->cur.node->list.next, | ||
768 | struct rtree_node, list); | ||
769 | if (&bm->cur.node->list != &bm->cur.zone->leaves) { | ||
770 | bm->cur.node_pfn += BM_BITS_PER_BLOCK; | ||
771 | bm->cur.node_bit = 0; | ||
772 | touch_softlockup_watchdog(); | ||
773 | return true; | ||
774 | } | ||
775 | |||
776 | /* No more nodes, goto next zone */ | ||
777 | bm->cur.zone = list_entry(bm->cur.zone->list.next, | ||
778 | struct mem_zone_bm_rtree, list); | ||
779 | if (&bm->cur.zone->list != &bm->zones) { | ||
780 | bm->cur.node = list_entry(bm->cur.zone->leaves.next, | ||
781 | struct rtree_node, list); | ||
782 | bm->cur.node_pfn = 0; | ||
783 | bm->cur.node_bit = 0; | ||
784 | return true; | ||
785 | } | ||
786 | |||
787 | /* No more zones */ | ||
788 | return false; | ||
789 | } | ||
572 | 790 | ||
791 | /** | ||
792 | * memory_bm_rtree_next_pfn - Find the next set bit in the bitmap @bm | ||
793 | * | ||
794 | * Starting from the last returned position this function searches | ||
795 | * for the next set bit in the memory bitmap and returns its | ||
796 | * number. If no more bits are set, BM_END_OF_MAP is returned. | ||
797 | * | ||
798 | * It is required to run memory_bm_position_reset() before the | ||
799 | * first call to this function. | ||
800 | */ | ||
573 | static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm) | 801 | static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm) |
574 | { | 802 | { |
575 | struct bm_block *bb; | 803 | unsigned long bits, pfn, pages; |
576 | int bit; | 804 | int bit; |
577 | 805 | ||
578 | bb = bm->cur.block; | ||
579 | do { | 806 | do { |
580 | bit = bm->cur.bit; | 807 | pages = bm->cur.zone->end_pfn - bm->cur.zone->start_pfn; |
581 | bit = find_next_bit(bb->data, bm_block_bits(bb), bit); | 808 | bits = min(pages - bm->cur.node_pfn, BM_BITS_PER_BLOCK); |
582 | if (bit < bm_block_bits(bb)) | 809 | bit = find_next_bit(bm->cur.node->data, bits, |
583 | goto Return_pfn; | 810 | bm->cur.node_bit); |
584 | 811 | if (bit < bits) { | |
585 | bb = list_entry(bb->hook.next, struct bm_block, hook); | 812 | pfn = bm->cur.zone->start_pfn + bm->cur.node_pfn + bit; |
586 | bm->cur.block = bb; | 813 | bm->cur.node_bit = bit + 1; |
587 | bm->cur.bit = 0; | 814 | return pfn; |
588 | } while (&bb->hook != &bm->blocks); | 815 | } |
816 | } while (rtree_next_node(bm)); | ||
589 | 817 | ||
590 | memory_bm_position_reset(bm); | ||
591 | return BM_END_OF_MAP; | 818 | return BM_END_OF_MAP; |
592 | |||
593 | Return_pfn: | ||
594 | bm->cur.bit = bit + 1; | ||
595 | return bb->start_pfn + bit; | ||
596 | } | 819 | } |
597 | 820 | ||
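The scan above is a cursor walk: look for the next set bit in the current leaf starting at the remembered bit position, and when the leaf is exhausted step to the next leaf (or zone) and restart at bit 0. Here is a toy userspace model of the same pattern, using one 64-bit word per "block" so it stays small; the blocks array and sizes are invented for the example.

/* Userspace model of the cursor-based "next set pfn" walk: a zone is an
 * array of fixed-size bit blocks and the cursor remembers block and bit
 * position.  A simplified illustration, not the kernel implementation.
 */
#include <stdio.h>
#include <stdint.h>

#define BITS_PER_BLOCK 64   /* one uint64_t per "block" keeps the model tiny */
#define NR_BLOCKS      4

static uint64_t blocks[NR_BLOCKS] = { 0x10, 0, 0x8000000000000001ULL, 0 };

struct cursor { int block; int bit; };

/* Returns the next set bit number in the zone, or -1 when exhausted. */
static long next_set_bit(struct cursor *cur)
{
	while (cur->block < NR_BLOCKS) {
		for (int bit = cur->bit; bit < BITS_PER_BLOCK; bit++) {
			if (blocks[cur->block] & (1ULL << bit)) {
				cur->bit = bit + 1;   /* resume after this bit */
				return (long)cur->block * BITS_PER_BLOCK + bit;
			}
		}
		cur->block++;        /* move on to the next leaf block */
		cur->bit = 0;
	}
	return -1;               /* models BM_END_OF_MAP */
}

int main(void)
{
	struct cursor cur = { 0, 0 };

	for (long bit; (bit = next_set_bit(&cur)) >= 0; )
		printf("set bit at position %ld\n", bit);
	return 0;
}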
598 | /** | 821 | /** |
@@ -731,6 +954,25 @@ static void mark_nosave_pages(struct memory_bitmap *bm) | |||
731 | } | 954 | } |
732 | } | 955 | } |
733 | 956 | ||
957 | static bool is_nosave_page(unsigned long pfn) | ||
958 | { | ||
959 | struct nosave_region *region; | ||
960 | |||
961 | list_for_each_entry(region, &nosave_regions, list) { | ||
962 | if (pfn >= region->start_pfn && pfn < region->end_pfn) { | ||
963 | pr_err("PM: %#010llx in e820 nosave region: " | ||
964 | "[mem %#010llx-%#010llx]\n", | ||
965 | (unsigned long long) pfn << PAGE_SHIFT, | ||
966 | (unsigned long long) region->start_pfn << PAGE_SHIFT, | ||
967 | ((unsigned long long) region->end_pfn << PAGE_SHIFT) | ||
968 | - 1); | ||
969 | return true; | ||
970 | } | ||
971 | } | ||
972 | |||
973 | return false; | ||
974 | } | ||
975 | |||
734 | /** | 976 | /** |
735 | * create_basic_memory_bitmaps - create bitmaps needed for marking page | 977 | * create_basic_memory_bitmaps - create bitmaps needed for marking page |
736 | * frames that should not be saved and free page frames. The pointers | 978 | * frames that should not be saved and free page frames. The pointers |
@@ -816,12 +1058,17 @@ void free_basic_memory_bitmaps(void) | |||
816 | 1058 | ||
817 | unsigned int snapshot_additional_pages(struct zone *zone) | 1059 | unsigned int snapshot_additional_pages(struct zone *zone) |
818 | { | 1060 | { |
819 | unsigned int res; | 1061 | unsigned int rtree, nodes; |
820 | 1062 | ||
821 | res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK); | 1063 | rtree = nodes = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK); |
822 | res += DIV_ROUND_UP(res * sizeof(struct bm_block), | 1064 | rtree += DIV_ROUND_UP(rtree * sizeof(struct rtree_node), |
823 | LINKED_PAGE_DATA_SIZE); | 1065 | LINKED_PAGE_DATA_SIZE); |
824 | return 2 * res; | 1066 | while (nodes > 1) { |
1067 | nodes = DIV_ROUND_UP(nodes, BM_ENTRIES_PER_LEVEL); | ||
1068 | rtree += nodes; | ||
1069 | } | ||
1070 | |||
1071 | return 2 * rtree; | ||
825 | } | 1072 | } |
826 | 1073 | ||
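The estimate above counts one bitmap page per BM_BITS_PER_BLOCK page frames, the list pages needed to hold that many rtree_node descriptors, plus one page per radix-tree level, and doubles the total because two bitmaps are kept. A rough userspace re-creation of the same arithmetic follows; PAGE_SIZE-derived constants, LINKED_PAGE_DATA_SIZE and RTREE_NODE_SIZE are assumed values for illustration only.

/* Back-of-the-envelope version of snapshot_additional_pages() for one zone. */
#include <stdio.h>

#define BM_BITS_PER_BLOCK     32768UL  /* assumed: 4 KiB page, 8 bits per byte */
#define BM_ENTRIES_PER_LEVEL  512UL    /* assumed: 4 KiB page / 8-byte pointer */
#define LINKED_PAGE_DATA_SIZE 4088UL   /* assumed: PAGE_SIZE minus a next pointer */
#define RTREE_NODE_SIZE       24UL     /* assumed sizeof(struct rtree_node) on 64-bit */

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	unsigned long spanned_pages = 1UL << 20;  /* a 4 GiB zone of 4 KiB pages */
	unsigned long nodes, rtree;

	rtree = nodes = DIV_ROUND_UP(spanned_pages, BM_BITS_PER_BLOCK);
	rtree += DIV_ROUND_UP(rtree * RTREE_NODE_SIZE, LINKED_PAGE_DATA_SIZE);
	while (nodes > 1) {
		nodes = DIV_ROUND_UP(nodes, BM_ENTRIES_PER_LEVEL);
		rtree += nodes;
	}
	printf("extra pages for both bitmaps: %lu\n", 2 * rtree);  /* 68 here */
	return 0;
}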
827 | #ifdef CONFIG_HIGHMEM | 1074 | #ifdef CONFIG_HIGHMEM |
@@ -1094,23 +1341,35 @@ static struct memory_bitmap copy_bm; | |||
1094 | 1341 | ||
1095 | void swsusp_free(void) | 1342 | void swsusp_free(void) |
1096 | { | 1343 | { |
1097 | struct zone *zone; | 1344 | unsigned long fb_pfn, fr_pfn; |
1098 | unsigned long pfn, max_zone_pfn; | ||
1099 | 1345 | ||
1100 | for_each_populated_zone(zone) { | 1346 | memory_bm_position_reset(forbidden_pages_map); |
1101 | max_zone_pfn = zone_end_pfn(zone); | 1347 | memory_bm_position_reset(free_pages_map); |
1102 | for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) | 1348 | |
1103 | if (pfn_valid(pfn)) { | 1349 | loop: |
1104 | struct page *page = pfn_to_page(pfn); | 1350 | fr_pfn = memory_bm_next_pfn(free_pages_map); |
1105 | 1351 | fb_pfn = memory_bm_next_pfn(forbidden_pages_map); | |
1106 | if (swsusp_page_is_forbidden(page) && | 1352 | |
1107 | swsusp_page_is_free(page)) { | 1353 | /* |
1108 | swsusp_unset_page_forbidden(page); | 1354 | * Find the next bit set in both bitmaps. This is guaranteed to |
1109 | swsusp_unset_page_free(page); | 1355 | * terminate when fb_pfn == fr_pfn == BM_END_OF_MAP. |
1110 | __free_page(page); | 1356 | */ |
1111 | } | 1357 | do { |
1112 | } | 1358 | if (fb_pfn < fr_pfn) |
1359 | fb_pfn = memory_bm_next_pfn(forbidden_pages_map); | ||
1360 | if (fr_pfn < fb_pfn) | ||
1361 | fr_pfn = memory_bm_next_pfn(free_pages_map); | ||
1362 | } while (fb_pfn != fr_pfn); | ||
1363 | |||
1364 | if (fr_pfn != BM_END_OF_MAP && pfn_valid(fr_pfn)) { | ||
1365 | struct page *page = pfn_to_page(fr_pfn); | ||
1366 | |||
1367 | memory_bm_clear_current(forbidden_pages_map); | ||
1368 | memory_bm_clear_current(free_pages_map); | ||
1369 | __free_page(page); | ||
1370 | goto loop; | ||
1113 | } | 1371 | } |
1372 | |||
1114 | nr_copy_pages = 0; | 1373 | nr_copy_pages = 0; |
1115 | nr_meta_pages = 0; | 1374 | nr_meta_pages = 0; |
1116 | restore_pblist = NULL; | 1375 | restore_pblist = NULL; |
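The rewritten swsusp_free() no longer scans every pfn of every zone; it walks the two bitmaps as sorted streams and only acts on pfns reported by both, which is why the loop keeps advancing whichever iterator is behind until the values meet. A standalone model of that merge-intersection walk is below; the two sorted arrays are invented stand-ins for the forbidden and free bitmaps, and END models BM_END_OF_MAP.

#include <stdio.h>

#define END (~0UL)

static unsigned long forbidden[] = { 3, 7, 9, 12, END };
static unsigned long freelist[]  = { 1, 7, 12, 15, END };

int main(void)
{
	unsigned long fb, fr;
	int i = 0, j = 0;

	fb = forbidden[i++];
	fr = freelist[j++];

	while (!(fb == END && fr == END)) {
		/* advance the iterator that is behind until both match */
		while (fb != fr) {
			if (fb < fr)
				fb = forbidden[i++];
			else
				fr = freelist[j++];
		}
		if (fb == END)
			break;
		printf("pfn %lu is set in both bitmaps -> free it\n", fb);
		fb = forbidden[i++];
		fr = freelist[j++];
	}
	return 0;
}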
@@ -1775,7 +2034,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm) | |||
1775 | do { | 2034 | do { |
1776 | pfn = memory_bm_next_pfn(bm); | 2035 | pfn = memory_bm_next_pfn(bm); |
1777 | if (likely(pfn != BM_END_OF_MAP)) { | 2036 | if (likely(pfn != BM_END_OF_MAP)) { |
1778 | if (likely(pfn_valid(pfn))) | 2037 | if (likely(pfn_valid(pfn)) && !is_nosave_page(pfn)) |
1779 | swsusp_set_page_free(pfn_to_page(pfn)); | 2038 | swsusp_set_page_free(pfn_to_page(pfn)); |
1780 | else | 2039 | else |
1781 | return -EFAULT; | 2040 | return -EFAULT; |
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 4b736b4dfa96..6dadb25cb0d8 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c | |||
@@ -31,20 +31,11 @@ | |||
31 | 31 | ||
32 | #include "power.h" | 32 | #include "power.h" |
33 | 33 | ||
34 | struct pm_sleep_state pm_states[PM_SUSPEND_MAX] = { | 34 | static const char *pm_labels[] = { "mem", "standby", "freeze", }; |
35 | [PM_SUSPEND_FREEZE] = { .label = "freeze", .state = PM_SUSPEND_FREEZE }, | 35 | const char *pm_states[PM_SUSPEND_MAX]; |
36 | [PM_SUSPEND_STANDBY] = { .label = "standby", }, | ||
37 | [PM_SUSPEND_MEM] = { .label = "mem", }, | ||
38 | }; | ||
39 | 36 | ||
40 | static const struct platform_suspend_ops *suspend_ops; | 37 | static const struct platform_suspend_ops *suspend_ops; |
41 | static const struct platform_freeze_ops *freeze_ops; | 38 | static const struct platform_freeze_ops *freeze_ops; |
42 | |||
43 | static bool need_suspend_ops(suspend_state_t state) | ||
44 | { | ||
45 | return state > PM_SUSPEND_FREEZE; | ||
46 | } | ||
47 | |||
48 | static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head); | 39 | static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head); |
49 | static bool suspend_freeze_wake; | 40 | static bool suspend_freeze_wake; |
50 | 41 | ||
@@ -97,10 +88,7 @@ static bool relative_states; | |||
97 | static int __init sleep_states_setup(char *str) | 88 | static int __init sleep_states_setup(char *str) |
98 | { | 89 | { |
99 | relative_states = !strncmp(str, "1", 1); | 90 | relative_states = !strncmp(str, "1", 1); |
100 | if (relative_states) { | 91 | pm_states[PM_SUSPEND_FREEZE] = pm_labels[relative_states ? 0 : 2]; |
101 | pm_states[PM_SUSPEND_MEM].state = PM_SUSPEND_FREEZE; | ||
102 | pm_states[PM_SUSPEND_FREEZE].state = 0; | ||
103 | } | ||
104 | return 1; | 92 | return 1; |
105 | } | 93 | } |
106 | 94 | ||
@@ -113,20 +101,20 @@ __setup("relative_sleep_states=", sleep_states_setup); | |||
113 | void suspend_set_ops(const struct platform_suspend_ops *ops) | 101 | void suspend_set_ops(const struct platform_suspend_ops *ops) |
114 | { | 102 | { |
115 | suspend_state_t i; | 103 | suspend_state_t i; |
116 | int j = PM_SUSPEND_MAX - 1; | 104 | int j = 0; |
117 | 105 | ||
118 | lock_system_sleep(); | 106 | lock_system_sleep(); |
119 | 107 | ||
120 | suspend_ops = ops; | 108 | suspend_ops = ops; |
121 | for (i = PM_SUSPEND_MEM; i >= PM_SUSPEND_STANDBY; i--) | 109 | for (i = PM_SUSPEND_MEM; i >= PM_SUSPEND_STANDBY; i--) |
122 | if (valid_state(i)) | 110 | if (valid_state(i)) { |
123 | pm_states[j--].state = i; | 111 | pm_states[i] = pm_labels[j++]; |
124 | else if (!relative_states) | 112 | } else if (!relative_states) { |
125 | pm_states[j--].state = 0; | 113 | pm_states[i] = NULL; |
114 | j++; | ||
115 | } | ||
126 | 116 | ||
127 | pm_states[j--].state = PM_SUSPEND_FREEZE; | 117 | pm_states[PM_SUSPEND_FREEZE] = pm_labels[j]; |
128 | while (j >= PM_SUSPEND_MIN) | ||
129 | pm_states[j--].state = 0; | ||
130 | 118 | ||
131 | unlock_system_sleep(); | 119 | unlock_system_sleep(); |
132 | } | 120 | } |
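The j bookkeeping above is the subtle part: labels are handed out from pm_labels[] while walking from the deepest state down, and invalid states either consume a label slot (absolute mode) or are skipped (relative mode), so PM_SUSPEND_FREEZE picks up whichever label slot is next. A self-contained sketch of that assignment follows; valid_state() is a stand-in that pretends only PM_SUSPEND_MEM is supported, and the enum values are illustrative.

#include <stdio.h>
#include <stdbool.h>

enum { PM_SUSPEND_FREEZE = 1, PM_SUSPEND_STANDBY, PM_SUSPEND_MEM, PM_SUSPEND_MAX };

static const char *pm_labels[] = { "mem", "standby", "freeze" };
static const char *pm_states[PM_SUSPEND_MAX];
static bool relative_states;   /* false: absolute label assignment */

static bool valid_state(int state)
{
	return state == PM_SUSPEND_MEM;   /* assume only "mem" is supported */
}

int main(void)
{
	int i, j = 0;

	/* what sleep_states_setup() would have done at boot */
	pm_states[PM_SUSPEND_FREEZE] = pm_labels[relative_states ? 0 : 2];

	/* what the new suspend_set_ops() does */
	for (i = PM_SUSPEND_MEM; i >= PM_SUSPEND_STANDBY; i--) {
		if (valid_state(i)) {
			pm_states[i] = pm_labels[j++];
		} else if (!relative_states) {
			pm_states[i] = NULL;
			j++;
		}
	}
	pm_states[PM_SUSPEND_FREEZE] = pm_labels[j];

	for (i = PM_SUSPEND_FREEZE; i < PM_SUSPEND_MAX; i++)
		printf("state %d -> %s\n", i, pm_states[i] ? pm_states[i] : "(none)");
	return 0;
}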
@@ -145,6 +133,65 @@ int suspend_valid_only_mem(suspend_state_t state) | |||
145 | } | 133 | } |
146 | EXPORT_SYMBOL_GPL(suspend_valid_only_mem); | 134 | EXPORT_SYMBOL_GPL(suspend_valid_only_mem); |
147 | 135 | ||
136 | static bool sleep_state_supported(suspend_state_t state) | ||
137 | { | ||
138 | return state == PM_SUSPEND_FREEZE || (suspend_ops && suspend_ops->enter); | ||
139 | } | ||
140 | |||
141 | static int platform_suspend_prepare(suspend_state_t state) | ||
142 | { | ||
143 | return state != PM_SUSPEND_FREEZE && suspend_ops->prepare ? | ||
144 | suspend_ops->prepare() : 0; | ||
145 | } | ||
146 | |||
147 | static int platform_suspend_prepare_late(suspend_state_t state) | ||
148 | { | ||
149 | return state != PM_SUSPEND_FREEZE && suspend_ops->prepare_late ? | ||
150 | suspend_ops->prepare_late() : 0; | ||
151 | } | ||
152 | |||
153 | static void platform_suspend_wake(suspend_state_t state) | ||
154 | { | ||
155 | if (state != PM_SUSPEND_FREEZE && suspend_ops->wake) | ||
156 | suspend_ops->wake(); | ||
157 | } | ||
158 | |||
159 | static void platform_suspend_finish(suspend_state_t state) | ||
160 | { | ||
161 | if (state != PM_SUSPEND_FREEZE && suspend_ops->finish) | ||
162 | suspend_ops->finish(); | ||
163 | } | ||
164 | |||
165 | static int platform_suspend_begin(suspend_state_t state) | ||
166 | { | ||
167 | if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->begin) | ||
168 | return freeze_ops->begin(); | ||
169 | else if (suspend_ops->begin) | ||
170 | return suspend_ops->begin(state); | ||
171 | else | ||
172 | return 0; | ||
173 | } | ||
174 | |||
175 | static void platform_suspend_end(suspend_state_t state) | ||
176 | { | ||
177 | if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end) | ||
178 | freeze_ops->end(); | ||
179 | else if (suspend_ops->end) | ||
180 | suspend_ops->end(); | ||
181 | } | ||
182 | |||
183 | static void platform_suspend_recover(suspend_state_t state) | ||
184 | { | ||
185 | if (state != PM_SUSPEND_FREEZE && suspend_ops->recover) | ||
186 | suspend_ops->recover(); | ||
187 | } | ||
188 | |||
189 | static bool platform_suspend_again(suspend_state_t state) | ||
190 | { | ||
191 | return state != PM_SUSPEND_FREEZE && suspend_ops->suspend_again ? | ||
192 | suspend_ops->suspend_again() : false; | ||
193 | } | ||
194 | |||
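The wrappers added above all follow one pattern: do nothing for the freeze state, otherwise call the platform callback only if it was provided. A minimal compilable model of that pattern with an invented ops structure (the real suspend_ops/freeze_ops hooks differ):

#include <stdio.h>

enum state { STATE_FREEZE, STATE_MEM };

struct ops {
	int (*prepare)(void);
	void (*wake)(void);
};

static int demo_prepare(void) { printf("platform prepare\n"); return 0; }

static const struct ops *suspend_ops;

static int platform_prepare(enum state state)
{
	return state != STATE_FREEZE && suspend_ops->prepare ?
		suspend_ops->prepare() : 0;
}

static void platform_wake(enum state state)
{
	if (state != STATE_FREEZE && suspend_ops->wake)
		suspend_ops->wake();   /* skipped: the demo ops has no wake hook */
}

int main(void)
{
	static const struct ops demo_ops = { .prepare = demo_prepare };

	suspend_ops = &demo_ops;
	platform_prepare(STATE_FREEZE);  /* freeze: nothing happens */
	platform_prepare(STATE_MEM);     /* mem: calls demo_prepare() */
	platform_wake(STATE_MEM);
	return 0;
}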
148 | static int suspend_test(int level) | 195 | static int suspend_test(int level) |
149 | { | 196 | { |
150 | #ifdef CONFIG_PM_DEBUG | 197 | #ifdef CONFIG_PM_DEBUG |
@@ -168,7 +215,7 @@ static int suspend_prepare(suspend_state_t state) | |||
168 | { | 215 | { |
169 | int error; | 216 | int error; |
170 | 217 | ||
171 | if (need_suspend_ops(state) && (!suspend_ops || !suspend_ops->enter)) | 218 | if (!sleep_state_supported(state)) |
172 | return -EPERM; | 219 | return -EPERM; |
173 | 220 | ||
174 | pm_prepare_console(); | 221 | pm_prepare_console(); |
@@ -214,23 +261,18 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) | |||
214 | { | 261 | { |
215 | int error; | 262 | int error; |
216 | 263 | ||
217 | if (need_suspend_ops(state) && suspend_ops->prepare) { | 264 | error = platform_suspend_prepare(state); |
218 | error = suspend_ops->prepare(); | 265 | if (error) |
219 | if (error) | 266 | goto Platform_finish; |
220 | goto Platform_finish; | ||
221 | } | ||
222 | 267 | ||
223 | error = dpm_suspend_end(PMSG_SUSPEND); | 268 | error = dpm_suspend_end(PMSG_SUSPEND); |
224 | if (error) { | 269 | if (error) { |
225 | printk(KERN_ERR "PM: Some devices failed to power down\n"); | 270 | printk(KERN_ERR "PM: Some devices failed to power down\n"); |
226 | goto Platform_finish; | 271 | goto Platform_finish; |
227 | } | 272 | } |
228 | 273 | error = platform_suspend_prepare_late(state); | |
229 | if (need_suspend_ops(state) && suspend_ops->prepare_late) { | 274 | if (error) |
230 | error = suspend_ops->prepare_late(); | 275 | goto Platform_wake; |
231 | if (error) | ||
232 | goto Platform_wake; | ||
233 | } | ||
234 | 276 | ||
235 | if (suspend_test(TEST_PLATFORM)) | 277 | if (suspend_test(TEST_PLATFORM)) |
236 | goto Platform_wake; | 278 | goto Platform_wake; |
@@ -276,15 +318,11 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) | |||
276 | enable_nonboot_cpus(); | 318 | enable_nonboot_cpus(); |
277 | 319 | ||
278 | Platform_wake: | 320 | Platform_wake: |
279 | if (need_suspend_ops(state) && suspend_ops->wake) | 321 | platform_suspend_wake(state); |
280 | suspend_ops->wake(); | ||
281 | |||
282 | dpm_resume_start(PMSG_RESUME); | 322 | dpm_resume_start(PMSG_RESUME); |
283 | 323 | ||
284 | Platform_finish: | 324 | Platform_finish: |
285 | if (need_suspend_ops(state) && suspend_ops->finish) | 325 | platform_suspend_finish(state); |
286 | suspend_ops->finish(); | ||
287 | |||
288 | return error; | 326 | return error; |
289 | } | 327 | } |
290 | 328 | ||
@@ -297,18 +335,13 @@ int suspend_devices_and_enter(suspend_state_t state) | |||
297 | int error; | 335 | int error; |
298 | bool wakeup = false; | 336 | bool wakeup = false; |
299 | 337 | ||
300 | if (need_suspend_ops(state) && !suspend_ops) | 338 | if (!sleep_state_supported(state)) |
301 | return -ENOSYS; | 339 | return -ENOSYS; |
302 | 340 | ||
303 | if (need_suspend_ops(state) && suspend_ops->begin) { | 341 | error = platform_suspend_begin(state); |
304 | error = suspend_ops->begin(state); | 342 | if (error) |
305 | if (error) | 343 | goto Close; |
306 | goto Close; | 344 | |
307 | } else if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->begin) { | ||
308 | error = freeze_ops->begin(); | ||
309 | if (error) | ||
310 | goto Close; | ||
311 | } | ||
312 | suspend_console(); | 345 | suspend_console(); |
313 | suspend_test_start(); | 346 | suspend_test_start(); |
314 | error = dpm_suspend_start(PMSG_SUSPEND); | 347 | error = dpm_suspend_start(PMSG_SUSPEND); |
@@ -322,25 +355,20 @@ int suspend_devices_and_enter(suspend_state_t state) | |||
322 | 355 | ||
323 | do { | 356 | do { |
324 | error = suspend_enter(state, &wakeup); | 357 | error = suspend_enter(state, &wakeup); |
325 | } while (!error && !wakeup && need_suspend_ops(state) | 358 | } while (!error && !wakeup && platform_suspend_again(state)); |
326 | && suspend_ops->suspend_again && suspend_ops->suspend_again()); | ||
327 | 359 | ||
328 | Resume_devices: | 360 | Resume_devices: |
329 | suspend_test_start(); | 361 | suspend_test_start(); |
330 | dpm_resume_end(PMSG_RESUME); | 362 | dpm_resume_end(PMSG_RESUME); |
331 | suspend_test_finish("resume devices"); | 363 | suspend_test_finish("resume devices"); |
332 | resume_console(); | 364 | resume_console(); |
333 | Close: | ||
334 | if (need_suspend_ops(state) && suspend_ops->end) | ||
335 | suspend_ops->end(); | ||
336 | else if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end) | ||
337 | freeze_ops->end(); | ||
338 | 365 | ||
366 | Close: | ||
367 | platform_suspend_end(state); | ||
339 | return error; | 368 | return error; |
340 | 369 | ||
341 | Recover_platform: | 370 | Recover_platform: |
342 | if (need_suspend_ops(state) && suspend_ops->recover) | 371 | platform_suspend_recover(state); |
343 | suspend_ops->recover(); | ||
344 | goto Resume_devices; | 372 | goto Resume_devices; |
345 | } | 373 | } |
346 | 374 | ||
@@ -393,7 +421,7 @@ static int enter_state(suspend_state_t state) | |||
393 | printk("done.\n"); | 421 | printk("done.\n"); |
394 | trace_suspend_resume(TPS("sync_filesystems"), 0, false); | 422 | trace_suspend_resume(TPS("sync_filesystems"), 0, false); |
395 | 423 | ||
396 | pr_debug("PM: Preparing system for %s sleep\n", pm_states[state].label); | 424 | pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); |
397 | error = suspend_prepare(state); | 425 | error = suspend_prepare(state); |
398 | if (error) | 426 | if (error) |
399 | goto Unlock; | 427 | goto Unlock; |
@@ -402,7 +430,7 @@ static int enter_state(suspend_state_t state) | |||
402 | goto Finish; | 430 | goto Finish; |
403 | 431 | ||
404 | trace_suspend_resume(TPS("suspend_enter"), state, false); | 432 | trace_suspend_resume(TPS("suspend_enter"), state, false); |
405 | pr_debug("PM: Entering %s sleep\n", pm_states[state].label); | 433 | pr_debug("PM: Entering %s sleep\n", pm_states[state]); |
406 | pm_restrict_gfp_mask(); | 434 | pm_restrict_gfp_mask(); |
407 | error = suspend_devices_and_enter(state); | 435 | error = suspend_devices_and_enter(state); |
408 | pm_restore_gfp_mask(); | 436 | pm_restore_gfp_mask(); |
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c index 269b097e78ea..2f524928b6aa 100644 --- a/kernel/power/suspend_test.c +++ b/kernel/power/suspend_test.c | |||
@@ -92,13 +92,13 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state) | |||
92 | } | 92 | } |
93 | 93 | ||
94 | if (state == PM_SUSPEND_MEM) { | 94 | if (state == PM_SUSPEND_MEM) { |
95 | printk(info_test, pm_states[state].label); | 95 | printk(info_test, pm_states[state]); |
96 | status = pm_suspend(state); | 96 | status = pm_suspend(state); |
97 | if (status == -ENODEV) | 97 | if (status == -ENODEV) |
98 | state = PM_SUSPEND_STANDBY; | 98 | state = PM_SUSPEND_STANDBY; |
99 | } | 99 | } |
100 | if (state == PM_SUSPEND_STANDBY) { | 100 | if (state == PM_SUSPEND_STANDBY) { |
101 | printk(info_test, pm_states[state].label); | 101 | printk(info_test, pm_states[state]); |
102 | status = pm_suspend(state); | 102 | status = pm_suspend(state); |
103 | } | 103 | } |
104 | if (status < 0) | 104 | if (status < 0) |
@@ -141,8 +141,8 @@ static int __init setup_test_suspend(char *value) | |||
141 | /* "=mem" ==> "mem" */ | 141 | /* "=mem" ==> "mem" */ |
142 | value++; | 142 | value++; |
143 | for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) | 143 | for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) |
144 | if (!strcmp(pm_states[i].label, value)) { | 144 | if (!strcmp(pm_states[i], value)) { |
145 | test_state = pm_states[i].state; | 145 | test_state = i; |
146 | return 0; | 146 | return 0; |
147 | } | 147 | } |
148 | 148 | ||
@@ -162,8 +162,8 @@ static int __init test_suspend(void) | |||
162 | /* PM is initialized by now; is that state testable? */ | 162 | /* PM is initialized by now; is that state testable? */ |
163 | if (test_state == PM_SUSPEND_ON) | 163 | if (test_state == PM_SUSPEND_ON) |
164 | goto done; | 164 | goto done; |
165 | if (!pm_states[test_state].state) { | 165 | if (!pm_states[test_state]) { |
166 | printk(warn_bad_state, pm_states[test_state].label); | 166 | printk(warn_bad_state, pm_states[test_state]); |
167 | goto done; | 167 | goto done; |
168 | } | 168 | } |
169 | 169 | ||
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 13e839dbca07..e04c455a0e38 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c | |||
@@ -45,6 +45,7 @@ | |||
45 | #include <linux/poll.h> | 45 | #include <linux/poll.h> |
46 | #include <linux/irq_work.h> | 46 | #include <linux/irq_work.h> |
47 | #include <linux/utsname.h> | 47 | #include <linux/utsname.h> |
48 | #include <linux/ctype.h> | ||
48 | 49 | ||
49 | #include <asm/uaccess.h> | 50 | #include <asm/uaccess.h> |
50 | 51 | ||
@@ -56,7 +57,7 @@ | |||
56 | 57 | ||
57 | int console_printk[4] = { | 58 | int console_printk[4] = { |
58 | CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */ | 59 | CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */ |
59 | DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */ | 60 | MESSAGE_LOGLEVEL_DEFAULT, /* default_message_loglevel */ |
60 | CONSOLE_LOGLEVEL_MIN, /* minimum_console_loglevel */ | 61 | CONSOLE_LOGLEVEL_MIN, /* minimum_console_loglevel */ |
61 | CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */ | 62 | CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */ |
62 | }; | 63 | }; |
@@ -113,9 +114,9 @@ static int __down_trylock_console_sem(unsigned long ip) | |||
113 | * This is used for debugging the mess that is the VT code by | 114 | * This is used for debugging the mess that is the VT code by |
114 | * keeping track if we have the console semaphore held. It's | 115 | * keeping track if we have the console semaphore held. It's |
115 | * definitely not the perfect debug tool (we don't know if _WE_ | 116 | * definitely not the perfect debug tool (we don't know if _WE_ |
116 | * hold it are racing, but it helps tracking those weird code | 117 | * hold it and are racing, but it helps tracking those weird code |
117 | * path in the console code where we end up in places I want | 118 | * paths in the console code where we end up in places I want |
118 | * locked without the console semaphore held | 119 | * locked without the console semaphore held). |
119 | */ | 120 | */ |
120 | static int console_locked, console_suspended; | 121 | static int console_locked, console_suspended; |
121 | 122 | ||
@@ -146,8 +147,8 @@ static int console_may_schedule; | |||
146 | * the overall length of the record. | 147 | * the overall length of the record. |
147 | * | 148 | * |
148 | * The heads to the first and last entry in the buffer, as well as the | 149 | * The heads to the first and last entry in the buffer, as well as the |
149 | * sequence numbers of these both entries are maintained when messages | 150 | * sequence numbers of these entries are maintained when messages are |
150 | * are stored.. | 151 | * stored. |
151 | * | 152 | * |
152 | * If the heads indicate available messages, the length in the header | 153 | * If the heads indicate available messages, the length in the header |
153 | * tells the start of the next message. A length == 0 for the next message | 154 | * tells the start of the next message. A length == 0 for the next message |
@@ -257,7 +258,7 @@ static u64 clear_seq; | |||
257 | static u32 clear_idx; | 258 | static u32 clear_idx; |
258 | 259 | ||
259 | #define PREFIX_MAX 32 | 260 | #define PREFIX_MAX 32 |
260 | #define LOG_LINE_MAX 1024 - PREFIX_MAX | 261 | #define LOG_LINE_MAX (1024 - PREFIX_MAX) |
261 | 262 | ||
262 | /* record buffer */ | 263 | /* record buffer */ |
263 | #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) | 264 | #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) |
@@ -266,10 +267,23 @@ static u32 clear_idx; | |||
266 | #define LOG_ALIGN __alignof__(struct printk_log) | 267 | #define LOG_ALIGN __alignof__(struct printk_log) |
267 | #endif | 268 | #endif |
268 | #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) | 269 | #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) |
270 | #define __LOG_CPU_MAX_BUF_LEN (1 << CONFIG_LOG_CPU_MAX_BUF_SHIFT) | ||
269 | static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); | 271 | static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); |
270 | static char *log_buf = __log_buf; | 272 | static char *log_buf = __log_buf; |
271 | static u32 log_buf_len = __LOG_BUF_LEN; | 273 | static u32 log_buf_len = __LOG_BUF_LEN; |
272 | 274 | ||
275 | /* Return log buffer address */ | ||
276 | char *log_buf_addr_get(void) | ||
277 | { | ||
278 | return log_buf; | ||
279 | } | ||
280 | |||
281 | /* Return log buffer size */ | ||
282 | u32 log_buf_len_get(void) | ||
283 | { | ||
284 | return log_buf_len; | ||
285 | } | ||
286 | |||
273 | /* human readable text of the record */ | 287 | /* human readable text of the record */ |
274 | static char *log_text(const struct printk_log *msg) | 288 | static char *log_text(const struct printk_log *msg) |
275 | { | 289 | { |
@@ -344,7 +358,7 @@ static int log_make_free_space(u32 msg_size) | |||
344 | while (log_first_seq < log_next_seq) { | 358 | while (log_first_seq < log_next_seq) { |
345 | if (logbuf_has_space(msg_size, false)) | 359 | if (logbuf_has_space(msg_size, false)) |
346 | return 0; | 360 | return 0; |
347 | /* drop old messages until we have enough continuous space */ | 361 | /* drop old messages until we have enough contiguous space */ |
348 | log_first_idx = log_next(log_first_idx); | 362 | log_first_idx = log_next(log_first_idx); |
349 | log_first_seq++; | 363 | log_first_seq++; |
350 | } | 364 | } |
@@ -453,11 +467,7 @@ static int log_store(int facility, int level, | |||
453 | return msg->text_len; | 467 | return msg->text_len; |
454 | } | 468 | } |
455 | 469 | ||
456 | #ifdef CONFIG_SECURITY_DMESG_RESTRICT | 470 | int dmesg_restrict = IS_ENABLED(CONFIG_SECURITY_DMESG_RESTRICT); |
457 | int dmesg_restrict = 1; | ||
458 | #else | ||
459 | int dmesg_restrict; | ||
460 | #endif | ||
461 | 471 | ||
462 | static int syslog_action_restricted(int type) | 472 | static int syslog_action_restricted(int type) |
463 | { | 473 | { |
@@ -828,34 +838,74 @@ void log_buf_kexec_setup(void) | |||
828 | /* requested log_buf_len from kernel cmdline */ | 838 | /* requested log_buf_len from kernel cmdline */ |
829 | static unsigned long __initdata new_log_buf_len; | 839 | static unsigned long __initdata new_log_buf_len; |
830 | 840 | ||
831 | /* save requested log_buf_len since it's too early to process it */ | 841 | /* we practice scaling the ring buffer by powers of 2 */ |
832 | static int __init log_buf_len_setup(char *str) | 842 | static void __init log_buf_len_update(unsigned size) |
833 | { | 843 | { |
834 | unsigned size = memparse(str, &str); | ||
835 | |||
836 | if (size) | 844 | if (size) |
837 | size = roundup_pow_of_two(size); | 845 | size = roundup_pow_of_two(size); |
838 | if (size > log_buf_len) | 846 | if (size > log_buf_len) |
839 | new_log_buf_len = size; | 847 | new_log_buf_len = size; |
848 | } | ||
849 | |||
850 | /* save requested log_buf_len since it's too early to process it */ | ||
851 | static int __init log_buf_len_setup(char *str) | ||
852 | { | ||
853 | unsigned size = memparse(str, &str); | ||
854 | |||
855 | log_buf_len_update(size); | ||
840 | 856 | ||
841 | return 0; | 857 | return 0; |
842 | } | 858 | } |
843 | early_param("log_buf_len", log_buf_len_setup); | 859 | early_param("log_buf_len", log_buf_len_setup); |
844 | 860 | ||
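log_buf_len_update() only records a request if it would actually grow the buffer, after rounding it up to a power of two. A userspace sketch of that bookkeeping follows, with an assumed 128 KiB default and a hand-rolled roundup_pow_of_two() for illustration.

#include <stdio.h>

static unsigned long log_buf_len = 1UL << 17;   /* assumed 128 KiB default */
static unsigned long new_log_buf_len;

static unsigned long roundup_pow_of_two(unsigned long n)
{
	unsigned long p = 1;

	while (p < n)
		p <<= 1;
	return p;
}

static void log_buf_len_update(unsigned long size)
{
	if (size)
		size = roundup_pow_of_two(size);
	if (size > log_buf_len)
		new_log_buf_len = size;
}

int main(void)
{
	log_buf_len_update(200 * 1024);   /* e.g. log_buf_len=200k on the cmdline */
	printf("new_log_buf_len = %lu bytes\n", new_log_buf_len);  /* 262144 */
	return 0;
}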
861 | static void __init log_buf_add_cpu(void) | ||
862 | { | ||
863 | unsigned int cpu_extra; | ||
864 | |||
865 | /* | ||
866 | * archs should set up cpu_possible_bits properly with | ||
867 | * set_cpu_possible() after setup_arch() but just in | ||
868 | * case lets ensure this is valid. | ||
869 | */ | ||
870 | if (num_possible_cpus() == 1) | ||
871 | return; | ||
872 | |||
873 | cpu_extra = (num_possible_cpus() - 1) * __LOG_CPU_MAX_BUF_LEN; | ||
874 | |||
875 | /* by default this only takes effect on large systems with > 64 CPUs */ | ||
876 | if (cpu_extra <= __LOG_BUF_LEN / 2) | ||
877 | return; | ||
878 | |||
879 | pr_info("log_buf_len individual max cpu contribution: %d bytes\n", | ||
880 | __LOG_CPU_MAX_BUF_LEN); | ||
881 | pr_info("log_buf_len total cpu_extra contributions: %d bytes\n", | ||
882 | cpu_extra); | ||
883 | pr_info("log_buf_len min size: %d bytes\n", __LOG_BUF_LEN); | ||
884 | |||
885 | log_buf_len_update(cpu_extra + __LOG_BUF_LEN); | ||
886 | } | ||
887 | |||
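The per-CPU sizing is a simple threshold check: each possible CPU beyond the first contributes __LOG_CPU_MAX_BUF_LEN bytes, and the buffer is only grown when that sum exceeds half of the static buffer. A back-of-the-envelope version with assumed defaults (128 KiB static buffer, 4 KiB per CPU) and a hypothetical 64-CPU machine:

#include <stdio.h>

#define LOG_BUF_LEN          (1UL << 17)   /* assumed 128 KiB static buffer */
#define LOG_CPU_MAX_BUF_LEN  (1UL << 12)   /* assumed 4 KiB per extra CPU */

int main(void)
{
	unsigned long ncpus = 64;      /* hypothetical machine */
	unsigned long cpu_extra = (ncpus - 1) * LOG_CPU_MAX_BUF_LEN;

	if (cpu_extra <= LOG_BUF_LEN / 2)
		printf("static buffer is kept (%lu extra <= %lu)\n",
		       cpu_extra, LOG_BUF_LEN / 2);
	else
		printf("grow the buffer to %lu bytes\n", cpu_extra + LOG_BUF_LEN);
	return 0;
}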
845 | void __init setup_log_buf(int early) | 888 | void __init setup_log_buf(int early) |
846 | { | 889 | { |
847 | unsigned long flags; | 890 | unsigned long flags; |
848 | char *new_log_buf; | 891 | char *new_log_buf; |
849 | int free; | 892 | int free; |
850 | 893 | ||
894 | if (log_buf != __log_buf) | ||
895 | return; | ||
896 | |||
897 | if (!early && !new_log_buf_len) | ||
898 | log_buf_add_cpu(); | ||
899 | |||
851 | if (!new_log_buf_len) | 900 | if (!new_log_buf_len) |
852 | return; | 901 | return; |
853 | 902 | ||
854 | if (early) { | 903 | if (early) { |
855 | new_log_buf = | 904 | new_log_buf = |
856 | memblock_virt_alloc(new_log_buf_len, PAGE_SIZE); | 905 | memblock_virt_alloc(new_log_buf_len, LOG_ALIGN); |
857 | } else { | 906 | } else { |
858 | new_log_buf = memblock_virt_alloc_nopanic(new_log_buf_len, 0); | 907 | new_log_buf = memblock_virt_alloc_nopanic(new_log_buf_len, |
908 | LOG_ALIGN); | ||
859 | } | 909 | } |
860 | 910 | ||
861 | if (unlikely(!new_log_buf)) { | 911 | if (unlikely(!new_log_buf)) { |
@@ -872,7 +922,7 @@ void __init setup_log_buf(int early) | |||
872 | memcpy(log_buf, __log_buf, __LOG_BUF_LEN); | 922 | memcpy(log_buf, __log_buf, __LOG_BUF_LEN); |
873 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); | 923 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); |
874 | 924 | ||
875 | pr_info("log_buf_len: %d\n", log_buf_len); | 925 | pr_info("log_buf_len: %d bytes\n", log_buf_len); |
876 | pr_info("early log buf free: %d(%d%%)\n", | 926 | pr_info("early log buf free: %d(%d%%)\n", |
877 | free, (free * 100) / __LOG_BUF_LEN); | 927 | free, (free * 100) / __LOG_BUF_LEN); |
878 | } | 928 | } |
@@ -881,7 +931,7 @@ static bool __read_mostly ignore_loglevel; | |||
881 | 931 | ||
882 | static int __init ignore_loglevel_setup(char *str) | 932 | static int __init ignore_loglevel_setup(char *str) |
883 | { | 933 | { |
884 | ignore_loglevel = 1; | 934 | ignore_loglevel = true; |
885 | pr_info("debug: ignoring loglevel setting.\n"); | 935 | pr_info("debug: ignoring loglevel setting.\n"); |
886 | 936 | ||
887 | return 0; | 937 | return 0; |
@@ -947,11 +997,7 @@ static inline void boot_delay_msec(int level) | |||
947 | } | 997 | } |
948 | #endif | 998 | #endif |
949 | 999 | ||
950 | #if defined(CONFIG_PRINTK_TIME) | 1000 | static bool printk_time = IS_ENABLED(CONFIG_PRINTK_TIME); |
951 | static bool printk_time = 1; | ||
952 | #else | ||
953 | static bool printk_time; | ||
954 | #endif | ||
955 | module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); | 1001 | module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); |
956 | 1002 | ||
957 | static size_t print_time(u64 ts, char *buf) | 1003 | static size_t print_time(u64 ts, char *buf) |
@@ -1310,7 +1356,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) | |||
1310 | * for pending data, not the size; return the count of | 1356 | * for pending data, not the size; return the count of |
1311 | * records, not the length. | 1357 | * records, not the length. |
1312 | */ | 1358 | */ |
1313 | error = log_next_idx - syslog_idx; | 1359 | error = log_next_seq - syslog_seq; |
1314 | } else { | 1360 | } else { |
1315 | u64 seq = syslog_seq; | 1361 | u64 seq = syslog_seq; |
1316 | u32 idx = syslog_idx; | 1362 | u32 idx = syslog_idx; |
@@ -1416,10 +1462,9 @@ static int have_callable_console(void) | |||
1416 | /* | 1462 | /* |
1417 | * Can we actually use the console at this time on this cpu? | 1463 | * Can we actually use the console at this time on this cpu? |
1418 | * | 1464 | * |
1419 | * Console drivers may assume that per-cpu resources have | 1465 | * Console drivers may assume that per-cpu resources have been allocated. So |
1420 | * been allocated. So unless they're explicitly marked as | 1466 | * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't |
1421 | * being able to cope (CON_ANYTIME) don't call them until | 1467 | * call them until this CPU is officially up. |
1422 | * this CPU is officially up. | ||
1423 | */ | 1468 | */ |
1424 | static inline int can_use_console(unsigned int cpu) | 1469 | static inline int can_use_console(unsigned int cpu) |
1425 | { | 1470 | { |
@@ -1432,8 +1477,10 @@ static inline int can_use_console(unsigned int cpu) | |||
1432 | * console_lock held, and 'console_locked' set) if it | 1477 | * console_lock held, and 'console_locked' set) if it |
1433 | * is successful, false otherwise. | 1478 | * is successful, false otherwise. |
1434 | */ | 1479 | */ |
1435 | static int console_trylock_for_printk(unsigned int cpu) | 1480 | static int console_trylock_for_printk(void) |
1436 | { | 1481 | { |
1482 | unsigned int cpu = smp_processor_id(); | ||
1483 | |||
1437 | if (!console_trylock()) | 1484 | if (!console_trylock()) |
1438 | return 0; | 1485 | return 0; |
1439 | /* | 1486 | /* |
@@ -1476,7 +1523,7 @@ static struct cont { | |||
1476 | struct task_struct *owner; /* task of first print*/ | 1523 | struct task_struct *owner; /* task of first print*/ |
1477 | u64 ts_nsec; /* time of first print */ | 1524 | u64 ts_nsec; /* time of first print */ |
1478 | u8 level; /* log level of first message */ | 1525 | u8 level; /* log level of first message */ |
1479 | u8 facility; /* log level of first message */ | 1526 | u8 facility; /* log facility of first message */ |
1480 | enum log_flags flags; /* prefix, newline flags */ | 1527 | enum log_flags flags; /* prefix, newline flags */ |
1481 | bool flushed:1; /* buffer sealed and committed */ | 1528 | bool flushed:1; /* buffer sealed and committed */ |
1482 | } cont; | 1529 | } cont; |
@@ -1608,7 +1655,8 @@ asmlinkage int vprintk_emit(int facility, int level, | |||
1608 | */ | 1655 | */ |
1609 | if (!oops_in_progress && !lockdep_recursing(current)) { | 1656 | if (!oops_in_progress && !lockdep_recursing(current)) { |
1610 | recursion_bug = 1; | 1657 | recursion_bug = 1; |
1611 | goto out_restore_irqs; | 1658 | local_irq_restore(flags); |
1659 | return 0; | ||
1612 | } | 1660 | } |
1613 | zap_locks(); | 1661 | zap_locks(); |
1614 | } | 1662 | } |
@@ -1716,21 +1764,30 @@ asmlinkage int vprintk_emit(int facility, int level, | |||
1716 | 1764 | ||
1717 | logbuf_cpu = UINT_MAX; | 1765 | logbuf_cpu = UINT_MAX; |
1718 | raw_spin_unlock(&logbuf_lock); | 1766 | raw_spin_unlock(&logbuf_lock); |
1767 | lockdep_on(); | ||
1768 | local_irq_restore(flags); | ||
1719 | 1769 | ||
1720 | /* If called from the scheduler, we can not call up(). */ | 1770 | /* If called from the scheduler, we can not call up(). */ |
1721 | if (!in_sched) { | 1771 | if (!in_sched) { |
1772 | lockdep_off(); | ||
1773 | /* | ||
1774 | * Disable preemption to avoid being preempted while holding | ||
1775 | * console_sem which would prevent anyone from printing to | ||
1776 | * console | ||
1777 | */ | ||
1778 | preempt_disable(); | ||
1779 | |||
1722 | /* | 1780 | /* |
1723 | * Try to acquire and then immediately release the console | 1781 | * Try to acquire and then immediately release the console |
1724 | * semaphore. The release will print out buffers and wake up | 1782 | * semaphore. The release will print out buffers and wake up |
1725 | * /dev/kmsg and syslog() users. | 1783 | * /dev/kmsg and syslog() users. |
1726 | */ | 1784 | */ |
1727 | if (console_trylock_for_printk(this_cpu)) | 1785 | if (console_trylock_for_printk()) |
1728 | console_unlock(); | 1786 | console_unlock(); |
1787 | preempt_enable(); | ||
1788 | lockdep_on(); | ||
1729 | } | 1789 | } |
1730 | 1790 | ||
1731 | lockdep_on(); | ||
1732 | out_restore_irqs: | ||
1733 | local_irq_restore(flags); | ||
1734 | return printed_len; | 1791 | return printed_len; |
1735 | } | 1792 | } |
1736 | EXPORT_SYMBOL(vprintk_emit); | 1793 | EXPORT_SYMBOL(vprintk_emit); |
@@ -1802,7 +1859,7 @@ EXPORT_SYMBOL(printk); | |||
1802 | 1859 | ||
1803 | #define LOG_LINE_MAX 0 | 1860 | #define LOG_LINE_MAX 0 |
1804 | #define PREFIX_MAX 0 | 1861 | #define PREFIX_MAX 0 |
1805 | #define LOG_LINE_MAX 0 | 1862 | |
1806 | static u64 syslog_seq; | 1863 | static u64 syslog_seq; |
1807 | static u32 syslog_idx; | 1864 | static u32 syslog_idx; |
1808 | static u64 console_seq; | 1865 | static u64 console_seq; |
@@ -1881,11 +1938,12 @@ static int __add_preferred_console(char *name, int idx, char *options, | |||
1881 | return 0; | 1938 | return 0; |
1882 | } | 1939 | } |
1883 | /* | 1940 | /* |
1884 | * Set up a list of consoles. Called from init/main.c | 1941 | * Set up a console. Called via do_early_param() in init/main.c |
1942 | * for each "console=" parameter in the boot command line. | ||
1885 | */ | 1943 | */ |
1886 | static int __init console_setup(char *str) | 1944 | static int __init console_setup(char *str) |
1887 | { | 1945 | { |
1888 | char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for index */ | 1946 | char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for "ttyS" */ |
1889 | char *s, *options, *brl_options = NULL; | 1947 | char *s, *options, *brl_options = NULL; |
1890 | int idx; | 1948 | int idx; |
1891 | 1949 | ||
@@ -1902,7 +1960,8 @@ static int __init console_setup(char *str) | |||
1902 | strncpy(buf, str, sizeof(buf) - 1); | 1960 | strncpy(buf, str, sizeof(buf) - 1); |
1903 | } | 1961 | } |
1904 | buf[sizeof(buf) - 1] = 0; | 1962 | buf[sizeof(buf) - 1] = 0; |
1905 | if ((options = strchr(str, ',')) != NULL) | 1963 | options = strchr(str, ','); |
1964 | if (options) | ||
1906 | *(options++) = 0; | 1965 | *(options++) = 0; |
1907 | #ifdef __sparc__ | 1966 | #ifdef __sparc__ |
1908 | if (!strcmp(str, "ttya")) | 1967 | if (!strcmp(str, "ttya")) |
@@ -1911,7 +1970,7 @@ static int __init console_setup(char *str) | |||
1911 | strcpy(buf, "ttyS1"); | 1970 | strcpy(buf, "ttyS1"); |
1912 | #endif | 1971 | #endif |
1913 | for (s = buf; *s; s++) | 1972 | for (s = buf; *s; s++) |
1914 | if ((*s >= '0' && *s <= '9') || *s == ',') | 1973 | if (isdigit(*s) || *s == ',') |
1915 | break; | 1974 | break; |
1916 | idx = simple_strtoul(s, NULL, 10); | 1975 | idx = simple_strtoul(s, NULL, 10); |
1917 | *s = 0; | 1976 | *s = 0; |
@@ -1950,7 +2009,6 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha | |||
1950 | i++, c++) | 2009 | i++, c++) |
1951 | if (strcmp(c->name, name) == 0 && c->index == idx) { | 2010 | if (strcmp(c->name, name) == 0 && c->index == idx) { |
1952 | strlcpy(c->name, name_new, sizeof(c->name)); | 2011 | strlcpy(c->name, name_new, sizeof(c->name)); |
1953 | c->name[sizeof(c->name) - 1] = 0; | ||
1954 | c->options = options; | 2012 | c->options = options; |
1955 | c->index = idx_new; | 2013 | c->index = idx_new; |
1956 | return i; | 2014 | return i; |
@@ -1959,12 +2017,12 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha | |||
1959 | return -1; | 2017 | return -1; |
1960 | } | 2018 | } |
1961 | 2019 | ||
1962 | bool console_suspend_enabled = 1; | 2020 | bool console_suspend_enabled = true; |
1963 | EXPORT_SYMBOL(console_suspend_enabled); | 2021 | EXPORT_SYMBOL(console_suspend_enabled); |
1964 | 2022 | ||
1965 | static int __init console_suspend_disable(char *str) | 2023 | static int __init console_suspend_disable(char *str) |
1966 | { | 2024 | { |
1967 | console_suspend_enabled = 0; | 2025 | console_suspend_enabled = false; |
1968 | return 1; | 2026 | return 1; |
1969 | } | 2027 | } |
1970 | __setup("no_console_suspend", console_suspend_disable); | 2028 | __setup("no_console_suspend", console_suspend_disable); |
@@ -2045,8 +2103,8 @@ EXPORT_SYMBOL(console_lock); | |||
2045 | /** | 2103 | /** |
2046 | * console_trylock - try to lock the console system for exclusive use. | 2104 | * console_trylock - try to lock the console system for exclusive use. |
2047 | * | 2105 | * |
2048 | * Tried to acquire a lock which guarantees that the caller has | 2106 | * Try to acquire a lock which guarantees that the caller has exclusive |
2049 | * exclusive access to the console system and the console_drivers list. | 2107 | * access to the console system and the console_drivers list. |
2050 | * | 2108 | * |
2051 | * returns 1 on success, and 0 on failure to acquire the lock. | 2109 | * returns 1 on success, and 0 on failure to acquire the lock. |
2052 | */ | 2110 | */ |
@@ -2618,14 +2676,13 @@ EXPORT_SYMBOL(__printk_ratelimit); | |||
2618 | bool printk_timed_ratelimit(unsigned long *caller_jiffies, | 2676 | bool printk_timed_ratelimit(unsigned long *caller_jiffies, |
2619 | unsigned int interval_msecs) | 2677 | unsigned int interval_msecs) |
2620 | { | 2678 | { |
2621 | if (*caller_jiffies == 0 | 2679 | unsigned long elapsed = jiffies - *caller_jiffies; |
2622 | || !time_in_range(jiffies, *caller_jiffies, | 2680 | |
2623 | *caller_jiffies | 2681 | if (*caller_jiffies && elapsed <= msecs_to_jiffies(interval_msecs)) |
2624 | + msecs_to_jiffies(interval_msecs))) { | 2682 | return false; |
2625 | *caller_jiffies = jiffies; | 2683 | |
2626 | return true; | 2684 | *caller_jiffies = jiffies; |
2627 | } | 2685 | return true; |
2628 | return false; | ||
2629 | } | 2686 | } |
2630 | EXPORT_SYMBOL(printk_timed_ratelimit); | 2687 | EXPORT_SYMBOL(printk_timed_ratelimit); |
2631 | 2688 | ||
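The rewritten ratelimit test reads more naturally once jiffies are replaced by plain millisecond timestamps: allow the event if nothing has been recorded yet or if more than the interval has elapsed, and record the time of every allowed event. A userspace model is below; timed_ratelimit() and its millisecond inputs are invented for the example.

#include <stdio.h>
#include <stdbool.h>

static bool timed_ratelimit(unsigned long *caller_ms, unsigned long now_ms,
			    unsigned long interval_ms)
{
	unsigned long elapsed = now_ms - *caller_ms;

	if (*caller_ms && elapsed <= interval_ms)
		return false;          /* too soon since the last allowed event */

	*caller_ms = now_ms;           /* record this event's time */
	return true;
}

int main(void)
{
	unsigned long last = 0;

	printf("%d\n", timed_ratelimit(&last, 1000, 500));  /* 1: first event */
	printf("%d\n", timed_ratelimit(&last, 1200, 500));  /* 0: only 200 ms later */
	printf("%d\n", timed_ratelimit(&last, 1600, 500));  /* 1: 600 ms later */
	return 0;
}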
diff --git a/kernel/resource.c b/kernel/resource.c index 3c2237ac32db..da14b8d09296 100644 --- a/kernel/resource.c +++ b/kernel/resource.c | |||
@@ -59,10 +59,12 @@ static DEFINE_RWLOCK(resource_lock); | |||
59 | static struct resource *bootmem_resource_free; | 59 | static struct resource *bootmem_resource_free; |
60 | static DEFINE_SPINLOCK(bootmem_resource_lock); | 60 | static DEFINE_SPINLOCK(bootmem_resource_lock); |
61 | 61 | ||
62 | static void *r_next(struct seq_file *m, void *v, loff_t *pos) | 62 | static struct resource *next_resource(struct resource *p, bool sibling_only) |
63 | { | 63 | { |
64 | struct resource *p = v; | 64 | /* Caller wants to traverse through siblings only */ |
65 | (*pos)++; | 65 | if (sibling_only) |
66 | return p->sibling; | ||
67 | |||
66 | if (p->child) | 68 | if (p->child) |
67 | return p->child; | 69 | return p->child; |
68 | while (!p->sibling && p->parent) | 70 | while (!p->sibling && p->parent) |
@@ -70,6 +72,13 @@ static void *r_next(struct seq_file *m, void *v, loff_t *pos) | |||
70 | return p->sibling; | 72 | return p->sibling; |
71 | } | 73 | } |
72 | 74 | ||
75 | static void *r_next(struct seq_file *m, void *v, loff_t *pos) | ||
76 | { | ||
77 | struct resource *p = v; | ||
78 | (*pos)++; | ||
79 | return (void *)next_resource(p, false); | ||
80 | } | ||
81 | |||
73 | #ifdef CONFIG_PROC_FS | 82 | #ifdef CONFIG_PROC_FS |
74 | 83 | ||
75 | enum { MAX_IORES_LEVEL = 5 }; | 84 | enum { MAX_IORES_LEVEL = 5 }; |
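next_resource() is a classic pre-order traversal over the parent/sibling/child pointers: descend into a child if there is one, otherwise climb toward the root until a sibling exists; the sibling_only flag short-circuits all of that and stays on one level. A standalone model over a made-up three-node tree:

#include <stdio.h>
#include <stdbool.h>
#include <stddef.h>

struct res {
	const char *name;
	struct res *parent, *sibling, *child;
};

static struct res *next_resource(struct res *p, bool sibling_only)
{
	if (sibling_only)
		return p->sibling;     /* stay on the first level */
	if (p->child)
		return p->child;       /* descend first */
	while (!p->sibling && p->parent)
		p = p->parent;         /* climb until a sibling exists */
	return p->sibling;
}

int main(void)
{
	static struct res root = { "root" };
	static struct res a = { "a" }, a1 = { "a1" }, b = { "b" };

	root.child = &a;
	a.parent = &root; a.child = &a1; a.sibling = &b;
	a1.parent = &a;
	b.parent = &root;

	for (struct res *p = root.child; p; p = next_resource(p, false))
		printf("%s\n", p->name);   /* prints: a, a1, b */
	return 0;
}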
@@ -322,16 +331,19 @@ int release_resource(struct resource *old) | |||
322 | 331 | ||
323 | EXPORT_SYMBOL(release_resource); | 332 | EXPORT_SYMBOL(release_resource); |
324 | 333 | ||
325 | #if !defined(CONFIG_ARCH_HAS_WALK_MEMORY) | ||
326 | /* | 334 | /* |
327 | * Finds the lowest memory resource that exists within [res->start, res->end) | 335 | * Finds the lowest iomem resource that exists within [res->start, res->end) |
328 | * the caller must specify res->start, res->end, res->flags and "name". | 336 | * the caller must specify res->start, res->end, res->flags and "name". |
329 | * If found, returns 0, res is overwritten, if not found, returns -1. | 337 | * If found, returns 0, res is overwritten, if not found, returns -1. |
338 | * This walks through the whole tree and not just first level children | ||
339 | * unless first_level_children_only is true. | ||
330 | */ | 340 | */ |
331 | static int find_next_system_ram(struct resource *res, char *name) | 341 | static int find_next_iomem_res(struct resource *res, char *name, |
342 | bool first_level_children_only) | ||
332 | { | 343 | { |
333 | resource_size_t start, end; | 344 | resource_size_t start, end; |
334 | struct resource *p; | 345 | struct resource *p; |
346 | bool sibling_only = false; | ||
335 | 347 | ||
336 | BUG_ON(!res); | 348 | BUG_ON(!res); |
337 | 349 | ||
@@ -340,8 +352,14 @@ static int find_next_system_ram(struct resource *res, char *name) | |||
340 | BUG_ON(start >= end); | 352 | BUG_ON(start >= end); |
341 | 353 | ||
342 | read_lock(&resource_lock); | 354 | read_lock(&resource_lock); |
343 | for (p = iomem_resource.child; p ; p = p->sibling) { | 355 | |
344 | /* system ram is just marked as IORESOURCE_MEM */ | 356 | if (first_level_children_only) { |
357 | p = iomem_resource.child; | ||
358 | sibling_only = true; | ||
359 | } else | ||
360 | p = &iomem_resource; | ||
361 | |||
362 | while ((p = next_resource(p, sibling_only))) { | ||
345 | if (p->flags != res->flags) | 363 | if (p->flags != res->flags) |
346 | continue; | 364 | continue; |
347 | if (name && strcmp(p->name, name)) | 365 | if (name && strcmp(p->name, name)) |
@@ -353,6 +371,7 @@ static int find_next_system_ram(struct resource *res, char *name) | |||
353 | if ((p->end >= start) && (p->start < end)) | 371 | if ((p->end >= start) && (p->start < end)) |
354 | break; | 372 | break; |
355 | } | 373 | } |
374 | |||
356 | read_unlock(&resource_lock); | 375 | read_unlock(&resource_lock); |
357 | if (!p) | 376 | if (!p) |
358 | return -1; | 377 | return -1; |
@@ -365,6 +384,70 @@ static int find_next_system_ram(struct resource *res, char *name) | |||
365 | } | 384 | } |
366 | 385 | ||
367 | /* | 386 | /* |
387 | * Walks through iomem resources and calls func() with matching resource | ||
388 | * ranges. This walks through the whole tree and not just first level children. | ||
389 | * All the memory ranges which overlap start,end and also match flags and | ||
390 | * name are valid candidates. | ||
391 | * | ||
392 | * @name: name of resource | ||
393 | * @flags: resource flags | ||
394 | * @start: start addr | ||
395 | * @end: end addr | ||
396 | */ | ||
397 | int walk_iomem_res(char *name, unsigned long flags, u64 start, u64 end, | ||
398 | void *arg, int (*func)(u64, u64, void *)) | ||
399 | { | ||
400 | struct resource res; | ||
401 | u64 orig_end; | ||
402 | int ret = -1; | ||
403 | |||
404 | res.start = start; | ||
405 | res.end = end; | ||
406 | res.flags = flags; | ||
407 | orig_end = res.end; | ||
408 | while ((res.start < res.end) && | ||
409 | (!find_next_iomem_res(&res, name, false))) { | ||
410 | ret = (*func)(res.start, res.end, arg); | ||
411 | if (ret) | ||
412 | break; | ||
413 | res.start = res.end + 1; | ||
414 | res.end = orig_end; | ||
415 | } | ||
416 | return ret; | ||
417 | } | ||
418 | |||
419 | /* | ||
420 | * This function calls the callback against all memory ranges of "System RAM" | ||
421 | * which are marked as IORESOURCE_MEM and IORESOURCE_BUSY. | ||
422 | * Now, this function is only for "System RAM". This function deals with | ||
423 | * full ranges and not pfn. If resources are not pfn aligned, dealing | ||
424 | * with pfn can truncate ranges. | ||
425 | */ | ||
426 | int walk_system_ram_res(u64 start, u64 end, void *arg, | ||
427 | int (*func)(u64, u64, void *)) | ||
428 | { | ||
429 | struct resource res; | ||
430 | u64 orig_end; | ||
431 | int ret = -1; | ||
432 | |||
433 | res.start = start; | ||
434 | res.end = end; | ||
435 | res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; | ||
436 | orig_end = res.end; | ||
437 | while ((res.start < res.end) && | ||
438 | (!find_next_iomem_res(&res, "System RAM", true))) { | ||
439 | ret = (*func)(res.start, res.end, arg); | ||
440 | if (ret) | ||
441 | break; | ||
442 | res.start = res.end + 1; | ||
443 | res.end = orig_end; | ||
444 | } | ||
445 | return ret; | ||
446 | } | ||
447 | |||
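Both walk_iomem_res() and walk_system_ram_res() drive the same loop: clip the search window to the next matching range, hand it to the callback, then restart the search just past that range with the original end restored. A simplified userspace model of the loop is below; find_next_res() and the ram[] table are invented stand-ins for find_next_iomem_res() and the resource tree.

#include <stdio.h>

struct range { unsigned long start, end; };

static const struct range ram[] = {
	{ 0x1000, 0x7fff }, { 0x100000, 0x1fffff },
};
#define NR_RANGES (sizeof(ram) / sizeof(ram[0]))

/* stand-in for find_next_iomem_res(): clip [*start, *end] to the first
 * overlapping range in the (sorted) table, or report -1 if none is left */
static int find_next_res(unsigned long *start, unsigned long *end)
{
	for (unsigned i = 0; i < NR_RANGES; i++) {
		if (ram[i].end >= *start && ram[i].start < *end) {
			if (ram[i].start > *start)
				*start = ram[i].start;
			if (ram[i].end < *end)
				*end = ram[i].end;
			return 0;
		}
	}
	return -1;
}

static int print_range(unsigned long s, unsigned long e, void *arg)
{
	printf("callback: %#lx-%#lx\n", s, e);
	return 0;
}

int main(void)
{
	unsigned long start = 0, end = 0x3fffff, orig_end = end;

	while (start < end && !find_next_res(&start, &end)) {
		if (print_range(start, end, NULL))
			break;
		start = end + 1;   /* continue just past the range we handled */
		end = orig_end;
	}
	return 0;
}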
448 | #if !defined(CONFIG_ARCH_HAS_WALK_MEMORY) | ||
449 | |||
450 | /* | ||
368 | * This function calls the callback against all memory ranges of "System RAM" | 451 | * This function calls the callback against all memory ranges of "System RAM" |
369 | * which are marked as IORESOURCE_MEM and IORESOURCE_BUSY. | 452 | * which are marked as IORESOURCE_MEM and IORESOURCE_BUSY. |
370 | * Now, this function is only for "System RAM". | 453 | * Now, this function is only for "System RAM". |
@@ -382,7 +465,7 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, | |||
382 | res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; | 465 | res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; |
383 | orig_end = res.end; | 466 | orig_end = res.end; |
384 | while ((res.start < res.end) && | 467 | while ((res.start < res.end) && |
385 | (find_next_system_ram(&res, "System RAM") >= 0)) { | 468 | (find_next_iomem_res(&res, "System RAM", true) >= 0)) { |
386 | pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT; | 469 | pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT; |
387 | end_pfn = (res.end + 1) >> PAGE_SHIFT; | 470 | end_pfn = (res.end + 1) >> PAGE_SHIFT; |
388 | if (end_pfn > pfn) | 471 | if (end_pfn > pfn) |
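The resource.c hunk above adds walk_iomem_res() and walk_system_ram_res(), which repeatedly call find_next_iomem_res() and hand each matching range to a caller-supplied callback, stopping on the first non-zero return. As a hedged illustration (the callback and wrapper names below are hypothetical, not part of this patch), a caller that sums up "System RAM" bytes in a window would look roughly like:

    /* Hypothetical callback: sum the size of every matching range.
     * Returning 0 tells the walker to keep going; a non-zero return
     * stops the walk (see the loop in walk_system_ram_res() above). */
    static int count_ram_bytes(u64 start, u64 end, void *arg)
    {
            u64 *total = arg;

            *total += end - start + 1;      /* ranges are inclusive */
            return 0;
    }

    static u64 system_ram_bytes_in(u64 start, u64 end)
    {
            u64 total = 0;

            walk_system_ram_res(start, end, &total, count_ram_bytes);
            return total;
    }
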
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1211575a2208..ec1a286684a5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -2393,6 +2393,13 @@ unsigned long nr_iowait_cpu(int cpu) | |||
2393 | return atomic_read(&this->nr_iowait); | 2393 | return atomic_read(&this->nr_iowait); |
2394 | } | 2394 | } |
2395 | 2395 | ||
2396 | void get_iowait_load(unsigned long *nr_waiters, unsigned long *load) | ||
2397 | { | ||
2398 | struct rq *this = this_rq(); | ||
2399 | *nr_waiters = atomic_read(&this->nr_iowait); | ||
2400 | *load = this->cpu_load[0]; | ||
2401 | } | ||
2402 | |||
2396 | #ifdef CONFIG_SMP | 2403 | #ifdef CONFIG_SMP |
2397 | 2404 | ||
2398 | /* | 2405 | /* |
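get_iowait_load() added above bundles the per-CPU iowait count and cpu_load[0] into one call; this pairs with the removal of this_cpu_load() from kernel/sched/proc.c further down, so an idle/cpufreq-style governor can fetch both values at once. A hedged sketch of that kind of caller (the heuristic and names are purely illustrative):

    /* Hypothetical governor helper: scale a latency-tolerance multiplier
     * by how busy this CPU is and how many tasks are blocked on I/O. */
    static unsigned int performance_multiplier(void)
    {
            unsigned long nr_iowaiters, cpu_load;
            unsigned int mult = 1;

            get_iowait_load(&nr_iowaiters, &cpu_load);

            if (cpu_load)                   /* busy CPU: stay shallower */
                    mult += 10;
            mult += 10 * nr_iowaiters;      /* pending I/O: expect wakeups */

            return mult;
    }
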
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 9f1608f99819..11e7bc434f43 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c | |||
@@ -147,8 +147,6 @@ use_default: | |||
147 | clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu)) | 147 | clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu)) |
148 | goto use_default; | 148 | goto use_default; |
149 | 149 | ||
150 | trace_cpu_idle_rcuidle(next_state, dev->cpu); | ||
151 | |||
152 | /* | 150 | /* |
153 | * Enter the idle state previously returned by the governor decision. | 151 | * Enter the idle state previously returned by the governor decision. |
154 | * This function will block until an interrupt occurs and will take | 152 | * This function will block until an interrupt occurs and will take |
@@ -156,8 +154,6 @@ use_default: | |||
156 | */ | 154 | */ |
157 | entered_state = cpuidle_enter(drv, dev, next_state); | 155 | entered_state = cpuidle_enter(drv, dev, next_state); |
158 | 156 | ||
159 | trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu); | ||
160 | |||
161 | if (broadcast) | 157 | if (broadcast) |
162 | clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu); | 158 | clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu); |
163 | 159 | ||
diff --git a/kernel/sched/proc.c b/kernel/sched/proc.c index 16f5a30f9c88..8ecd552fe4f2 100644 --- a/kernel/sched/proc.c +++ b/kernel/sched/proc.c | |||
@@ -8,13 +8,6 @@ | |||
8 | 8 | ||
9 | #include "sched.h" | 9 | #include "sched.h" |
10 | 10 | ||
11 | unsigned long this_cpu_load(void) | ||
12 | { | ||
13 | struct rq *this = this_rq(); | ||
14 | return this->cpu_load[0]; | ||
15 | } | ||
16 | |||
17 | |||
18 | /* | 11 | /* |
19 | * Global load-average calculations | 12 | * Global load-average calculations |
20 | * | 13 | * |
diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 301bbc24739c..44eb005c6695 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c | |||
@@ -18,15 +18,17 @@ | |||
18 | #include <linux/compat.h> | 18 | #include <linux/compat.h> |
19 | #include <linux/sched.h> | 19 | #include <linux/sched.h> |
20 | #include <linux/seccomp.h> | 20 | #include <linux/seccomp.h> |
21 | #include <linux/slab.h> | ||
22 | #include <linux/syscalls.h> | ||
21 | 23 | ||
22 | /* #define SECCOMP_DEBUG 1 */ | 24 | /* #define SECCOMP_DEBUG 1 */ |
23 | 25 | ||
24 | #ifdef CONFIG_SECCOMP_FILTER | 26 | #ifdef CONFIG_SECCOMP_FILTER |
25 | #include <asm/syscall.h> | 27 | #include <asm/syscall.h> |
26 | #include <linux/filter.h> | 28 | #include <linux/filter.h> |
29 | #include <linux/pid.h> | ||
27 | #include <linux/ptrace.h> | 30 | #include <linux/ptrace.h> |
28 | #include <linux/security.h> | 31 | #include <linux/security.h> |
29 | #include <linux/slab.h> | ||
30 | #include <linux/tracehook.h> | 32 | #include <linux/tracehook.h> |
31 | #include <linux/uaccess.h> | 33 | #include <linux/uaccess.h> |
32 | 34 | ||
@@ -54,7 +56,7 @@ | |||
54 | struct seccomp_filter { | 56 | struct seccomp_filter { |
55 | atomic_t usage; | 57 | atomic_t usage; |
56 | struct seccomp_filter *prev; | 58 | struct seccomp_filter *prev; |
57 | struct sk_filter *prog; | 59 | struct bpf_prog *prog; |
58 | }; | 60 | }; |
59 | 61 | ||
60 | /* Limit any path through the tree to 256KB worth of instructions. */ | 62 | /* Limit any path through the tree to 256KB worth of instructions. */ |
@@ -87,7 +89,7 @@ static void populate_seccomp_data(struct seccomp_data *sd) | |||
87 | * @filter: filter to verify | 89 | * @filter: filter to verify |
88 | * @flen: length of filter | 90 | * @flen: length of filter |
89 | * | 91 | * |
90 | * Takes a previously checked filter (by sk_chk_filter) and | 92 | * Takes a previously checked filter (by bpf_check_classic) and |
91 | * redirects all filter code that loads struct sk_buff data | 93 | * redirects all filter code that loads struct sk_buff data |
92 | * and related data through seccomp_bpf_load. It also | 94 | * and related data through seccomp_bpf_load. It also |
93 | * enforces length and alignment checking of those loads. | 95 | * enforces length and alignment checking of those loads. |
@@ -172,51 +174,184 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) | |||
172 | */ | 174 | */ |
173 | static u32 seccomp_run_filters(int syscall) | 175 | static u32 seccomp_run_filters(int syscall) |
174 | { | 176 | { |
175 | struct seccomp_filter *f; | 177 | struct seccomp_filter *f = ACCESS_ONCE(current->seccomp.filter); |
176 | struct seccomp_data sd; | 178 | struct seccomp_data sd; |
177 | u32 ret = SECCOMP_RET_ALLOW; | 179 | u32 ret = SECCOMP_RET_ALLOW; |
178 | 180 | ||
179 | /* Ensure unexpected behavior doesn't result in failing open. */ | 181 | /* Ensure unexpected behavior doesn't result in failing open. */ |
180 | if (WARN_ON(current->seccomp.filter == NULL)) | 182 | if (unlikely(WARN_ON(f == NULL))) |
181 | return SECCOMP_RET_KILL; | 183 | return SECCOMP_RET_KILL; |
182 | 184 | ||
185 | /* Make sure cross-thread synced filter points somewhere sane. */ | ||
186 | smp_read_barrier_depends(); | ||
187 | |||
183 | populate_seccomp_data(&sd); | 188 | populate_seccomp_data(&sd); |
184 | 189 | ||
185 | /* | 190 | /* |
186 | * All filters in the list are evaluated and the lowest BPF return | 191 | * All filters in the list are evaluated and the lowest BPF return |
187 | * value always takes priority (ignoring the DATA). | 192 | * value always takes priority (ignoring the DATA). |
188 | */ | 193 | */ |
189 | for (f = current->seccomp.filter; f; f = f->prev) { | 194 | for (; f; f = f->prev) { |
190 | u32 cur_ret = SK_RUN_FILTER(f->prog, (void *)&sd); | 195 | u32 cur_ret = BPF_PROG_RUN(f->prog, (void *)&sd); |
191 | 196 | ||
192 | if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) | 197 | if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) |
193 | ret = cur_ret; | 198 | ret = cur_ret; |
194 | } | 199 | } |
195 | return ret; | 200 | return ret; |
196 | } | 201 | } |
202 | #endif /* CONFIG_SECCOMP_FILTER */ | ||
203 | |||
204 | static inline bool seccomp_may_assign_mode(unsigned long seccomp_mode) | ||
205 | { | ||
206 | assert_spin_locked(¤t->sighand->siglock); | ||
207 | |||
208 | if (current->seccomp.mode && current->seccomp.mode != seccomp_mode) | ||
209 | return false; | ||
210 | |||
211 | return true; | ||
212 | } | ||
213 | |||
214 | static inline void seccomp_assign_mode(struct task_struct *task, | ||
215 | unsigned long seccomp_mode) | ||
216 | { | ||
217 | assert_spin_locked(&task->sighand->siglock); | ||
218 | |||
219 | task->seccomp.mode = seccomp_mode; | ||
220 | /* | ||
221 | * Make sure TIF_SECCOMP cannot be set before the mode (and | ||
222 | * filter) is set. | ||
223 | */ | ||
224 | smp_mb__before_atomic(); | ||
225 | set_tsk_thread_flag(task, TIF_SECCOMP); | ||
226 | } | ||
227 | |||
228 | #ifdef CONFIG_SECCOMP_FILTER | ||
229 | /* Returns 1 if the parent is an ancestor of the child. */ | ||
230 | static int is_ancestor(struct seccomp_filter *parent, | ||
231 | struct seccomp_filter *child) | ||
232 | { | ||
233 | /* NULL is the root ancestor. */ | ||
234 | if (parent == NULL) | ||
235 | return 1; | ||
236 | for (; child; child = child->prev) | ||
237 | if (child == parent) | ||
238 | return 1; | ||
239 | return 0; | ||
240 | } | ||
197 | 241 | ||
198 | /** | 242 | /** |
199 | * seccomp_attach_filter: Attaches a seccomp filter to current. | 243 | * seccomp_can_sync_threads: checks if all threads can be synchronized |
244 | * | ||
245 | * Expects sighand and cred_guard_mutex locks to be held. | ||
246 | * | ||
247 | * Returns 0 on success, -ve on error, or the pid of a thread which was | ||
248 | * either not in the correct seccomp mode or it did not have an ancestral | ||
249 | * seccomp filter. | ||
250 | */ | ||
251 | static inline pid_t seccomp_can_sync_threads(void) | ||
252 | { | ||
253 | struct task_struct *thread, *caller; | ||
254 | |||
255 | BUG_ON(!mutex_is_locked(¤t->signal->cred_guard_mutex)); | ||
256 | assert_spin_locked(¤t->sighand->siglock); | ||
257 | |||
258 | /* Validate all threads being eligible for synchronization. */ | ||
259 | caller = current; | ||
260 | for_each_thread(caller, thread) { | ||
261 | pid_t failed; | ||
262 | |||
263 | /* Skip current, since it is initiating the sync. */ | ||
264 | if (thread == caller) | ||
265 | continue; | ||
266 | |||
267 | if (thread->seccomp.mode == SECCOMP_MODE_DISABLED || | ||
268 | (thread->seccomp.mode == SECCOMP_MODE_FILTER && | ||
269 | is_ancestor(thread->seccomp.filter, | ||
270 | caller->seccomp.filter))) | ||
271 | continue; | ||
272 | |||
273 | /* Return the first thread that cannot be synchronized. */ | ||
274 | failed = task_pid_vnr(thread); | ||
275 | /* If the pid cannot be resolved, then return -ESRCH */ | ||
276 | if (unlikely(WARN_ON(failed == 0))) | ||
277 | failed = -ESRCH; | ||
278 | return failed; | ||
279 | } | ||
280 | |||
281 | return 0; | ||
282 | } | ||
283 | |||
284 | /** | ||
285 | * seccomp_sync_threads: sets all threads to use current's filter | ||
286 | * | ||
287 | * Expects sighand and cred_guard_mutex locks to be held, and for | ||
288 | * seccomp_can_sync_threads() to have returned success already | ||
289 | * without dropping the locks. | ||
290 | * | ||
291 | */ | ||
292 | static inline void seccomp_sync_threads(void) | ||
293 | { | ||
294 | struct task_struct *thread, *caller; | ||
295 | |||
296 | BUG_ON(!mutex_is_locked(¤t->signal->cred_guard_mutex)); | ||
297 | assert_spin_locked(¤t->sighand->siglock); | ||
298 | |||
299 | /* Synchronize all threads. */ | ||
300 | caller = current; | ||
301 | for_each_thread(caller, thread) { | ||
302 | /* Skip current, since it needs no changes. */ | ||
303 | if (thread == caller) | ||
304 | continue; | ||
305 | |||
306 | /* Get a task reference for the new leaf node. */ | ||
307 | get_seccomp_filter(caller); | ||
308 | /* | ||
309 | * Drop the task reference to the shared ancestor since | ||
310 | * current's path will hold a reference. (This also | ||
311 | * allows a put before the assignment.) | ||
312 | */ | ||
313 | put_seccomp_filter(thread); | ||
314 | smp_store_release(&thread->seccomp.filter, | ||
315 | caller->seccomp.filter); | ||
316 | /* | ||
317 | * Opt the other thread into seccomp if needed. | ||
318 | * As threads are considered to be trust-realm | ||
319 | * equivalent (see ptrace_may_access), it is safe to | ||
320 | * allow one thread to transition the other. | ||
321 | */ | ||
322 | if (thread->seccomp.mode == SECCOMP_MODE_DISABLED) { | ||
323 | /* | ||
324 | * Don't let an unprivileged task work around | ||
325 | * the no_new_privs restriction by creating | ||
326 | * a thread that sets it up, enters seccomp, | ||
327 | * then dies. | ||
328 | */ | ||
329 | if (task_no_new_privs(caller)) | ||
330 | task_set_no_new_privs(thread); | ||
331 | |||
332 | seccomp_assign_mode(thread, SECCOMP_MODE_FILTER); | ||
333 | } | ||
334 | } | ||
335 | } | ||
336 | |||
337 | /** | ||
338 | * seccomp_prepare_filter: Prepares a seccomp filter for use. | ||
200 | * @fprog: BPF program to install | 339 | * @fprog: BPF program to install |
201 | * | 340 | * |
202 | * Returns 0 on success or an errno on failure. | 341 | * Returns filter on success or an ERR_PTR on failure. |
203 | */ | 342 | */ |
204 | static long seccomp_attach_filter(struct sock_fprog *fprog) | 343 | static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) |
205 | { | 344 | { |
206 | struct seccomp_filter *filter; | 345 | struct seccomp_filter *filter; |
207 | unsigned long fp_size = fprog->len * sizeof(struct sock_filter); | 346 | unsigned long fp_size; |
208 | unsigned long total_insns = fprog->len; | ||
209 | struct sock_filter *fp; | 347 | struct sock_filter *fp; |
210 | int new_len; | 348 | int new_len; |
211 | long ret; | 349 | long ret; |
212 | 350 | ||
213 | if (fprog->len == 0 || fprog->len > BPF_MAXINSNS) | 351 | if (fprog->len == 0 || fprog->len > BPF_MAXINSNS) |
214 | return -EINVAL; | 352 | return ERR_PTR(-EINVAL); |
215 | 353 | BUG_ON(INT_MAX / fprog->len < sizeof(struct sock_filter)); | |
216 | for (filter = current->seccomp.filter; filter; filter = filter->prev) | 354 | fp_size = fprog->len * sizeof(struct sock_filter); |
217 | total_insns += filter->prog->len + 4; /* include a 4 instr penalty */ | ||
218 | if (total_insns > MAX_INSNS_PER_PATH) | ||
219 | return -ENOMEM; | ||
220 | 355 | ||
221 | /* | 356 | /* |
222 | * Installing a seccomp filter requires that the task has | 357 | * Installing a seccomp filter requires that the task has |
@@ -224,14 +359,14 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) | |||
224 | * This avoids scenarios where unprivileged tasks can affect the | 359 | * This avoids scenarios where unprivileged tasks can affect the |
225 | * behavior of privileged children. | 360 | * behavior of privileged children. |
226 | */ | 361 | */ |
227 | if (!current->no_new_privs && | 362 | if (!task_no_new_privs(current) && |
228 | security_capable_noaudit(current_cred(), current_user_ns(), | 363 | security_capable_noaudit(current_cred(), current_user_ns(), |
229 | CAP_SYS_ADMIN) != 0) | 364 | CAP_SYS_ADMIN) != 0) |
230 | return -EACCES; | 365 | return ERR_PTR(-EACCES); |
231 | 366 | ||
232 | fp = kzalloc(fp_size, GFP_KERNEL|__GFP_NOWARN); | 367 | fp = kzalloc(fp_size, GFP_KERNEL|__GFP_NOWARN); |
233 | if (!fp) | 368 | if (!fp) |
234 | return -ENOMEM; | 369 | return ERR_PTR(-ENOMEM); |
235 | 370 | ||
236 | /* Copy the instructions from fprog. */ | 371 | /* Copy the instructions from fprog. */ |
237 | ret = -EFAULT; | 372 | ret = -EFAULT; |
@@ -239,7 +374,7 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) | |||
239 | goto free_prog; | 374 | goto free_prog; |
240 | 375 | ||
241 | /* Check and rewrite the fprog via the skb checker */ | 376 | /* Check and rewrite the fprog via the skb checker */ |
242 | ret = sk_chk_filter(fp, fprog->len); | 377 | ret = bpf_check_classic(fp, fprog->len); |
243 | if (ret) | 378 | if (ret) |
244 | goto free_prog; | 379 | goto free_prog; |
245 | 380 | ||
@@ -248,8 +383,8 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) | |||
248 | if (ret) | 383 | if (ret) |
249 | goto free_prog; | 384 | goto free_prog; |
250 | 385 | ||
251 | /* Convert 'sock_filter' insns to 'sock_filter_int' insns */ | 386 | /* Convert 'sock_filter' insns to 'bpf_insn' insns */ |
252 | ret = sk_convert_filter(fp, fprog->len, NULL, &new_len); | 387 | ret = bpf_convert_filter(fp, fprog->len, NULL, &new_len); |
253 | if (ret) | 388 | if (ret) |
254 | goto free_prog; | 389 | goto free_prog; |
255 | 390 | ||
@@ -260,12 +395,12 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) | |||
260 | if (!filter) | 395 | if (!filter) |
261 | goto free_prog; | 396 | goto free_prog; |
262 | 397 | ||
263 | filter->prog = kzalloc(sk_filter_size(new_len), | 398 | filter->prog = kzalloc(bpf_prog_size(new_len), |
264 | GFP_KERNEL|__GFP_NOWARN); | 399 | GFP_KERNEL|__GFP_NOWARN); |
265 | if (!filter->prog) | 400 | if (!filter->prog) |
266 | goto free_filter; | 401 | goto free_filter; |
267 | 402 | ||
268 | ret = sk_convert_filter(fp, fprog->len, filter->prog->insnsi, &new_len); | 403 | ret = bpf_convert_filter(fp, fprog->len, filter->prog->insnsi, &new_len); |
269 | if (ret) | 404 | if (ret) |
270 | goto free_filter_prog; | 405 | goto free_filter_prog; |
271 | kfree(fp); | 406 | kfree(fp); |
@@ -273,15 +408,9 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) | |||
273 | atomic_set(&filter->usage, 1); | 408 | atomic_set(&filter->usage, 1); |
274 | filter->prog->len = new_len; | 409 | filter->prog->len = new_len; |
275 | 410 | ||
276 | sk_filter_select_runtime(filter->prog); | 411 | bpf_prog_select_runtime(filter->prog); |
277 | 412 | ||
278 | /* | 413 | return filter; |
279 | * If there is an existing filter, make it the prev and don't drop its | ||
280 | * task reference. | ||
281 | */ | ||
282 | filter->prev = current->seccomp.filter; | ||
283 | current->seccomp.filter = filter; | ||
284 | return 0; | ||
285 | 414 | ||
286 | free_filter_prog: | 415 | free_filter_prog: |
287 | kfree(filter->prog); | 416 | kfree(filter->prog); |
@@ -289,19 +418,20 @@ free_filter: | |||
289 | kfree(filter); | 418 | kfree(filter); |
290 | free_prog: | 419 | free_prog: |
291 | kfree(fp); | 420 | kfree(fp); |
292 | return ret; | 421 | return ERR_PTR(ret); |
293 | } | 422 | } |
294 | 423 | ||
295 | /** | 424 | /** |
296 | * seccomp_attach_user_filter - attaches a user-supplied sock_fprog | 425 | * seccomp_prepare_user_filter - prepares a user-supplied sock_fprog |
297 | * @user_filter: pointer to the user data containing a sock_fprog. | 426 | * @user_filter: pointer to the user data containing a sock_fprog. |
298 | * | 427 | * |
299 | * Returns 0 on success and non-zero otherwise. | 428 | * Returns 0 on success and non-zero otherwise. |
300 | */ | 429 | */ |
301 | static long seccomp_attach_user_filter(char __user *user_filter) | 430 | static struct seccomp_filter * |
431 | seccomp_prepare_user_filter(const char __user *user_filter) | ||
302 | { | 432 | { |
303 | struct sock_fprog fprog; | 433 | struct sock_fprog fprog; |
304 | long ret = -EFAULT; | 434 | struct seccomp_filter *filter = ERR_PTR(-EFAULT); |
305 | 435 | ||
306 | #ifdef CONFIG_COMPAT | 436 | #ifdef CONFIG_COMPAT |
307 | if (is_compat_task()) { | 437 | if (is_compat_task()) { |
@@ -314,9 +444,56 @@ static long seccomp_attach_user_filter(char __user *user_filter) | |||
314 | #endif | 444 | #endif |
315 | if (copy_from_user(&fprog, user_filter, sizeof(fprog))) | 445 | if (copy_from_user(&fprog, user_filter, sizeof(fprog))) |
316 | goto out; | 446 | goto out; |
317 | ret = seccomp_attach_filter(&fprog); | 447 | filter = seccomp_prepare_filter(&fprog); |
318 | out: | 448 | out: |
319 | return ret; | 449 | return filter; |
450 | } | ||
451 | |||
452 | /** | ||
453 | * seccomp_attach_filter: validate and attach filter | ||
454 | * @flags: flags to change filter behavior | ||
455 | * @filter: seccomp filter to add to the current process | ||
456 | * | ||
457 | * Caller must be holding current->sighand->siglock lock. | ||
458 | * | ||
459 | * Returns 0 on success, -ve on error. | ||
460 | */ | ||
461 | static long seccomp_attach_filter(unsigned int flags, | ||
462 | struct seccomp_filter *filter) | ||
463 | { | ||
464 | unsigned long total_insns; | ||
465 | struct seccomp_filter *walker; | ||
466 | |||
467 | assert_spin_locked(¤t->sighand->siglock); | ||
468 | |||
469 | /* Validate resulting filter length. */ | ||
470 | total_insns = filter->prog->len; | ||
471 | for (walker = current->seccomp.filter; walker; walker = walker->prev) | ||
472 | total_insns += walker->prog->len + 4; /* 4 instr penalty */ | ||
473 | if (total_insns > MAX_INSNS_PER_PATH) | ||
474 | return -ENOMEM; | ||
475 | |||
476 | /* If thread sync has been requested, check that it is possible. */ | ||
477 | if (flags & SECCOMP_FILTER_FLAG_TSYNC) { | ||
478 | int ret; | ||
479 | |||
480 | ret = seccomp_can_sync_threads(); | ||
481 | if (ret) | ||
482 | return ret; | ||
483 | } | ||
484 | |||
485 | /* | ||
486 | * If there is an existing filter, make it the prev and don't drop its | ||
487 | * task reference. | ||
488 | */ | ||
489 | filter->prev = current->seccomp.filter; | ||
490 | current->seccomp.filter = filter; | ||
491 | |||
492 | /* Now that the new filter is in place, synchronize to all threads. */ | ||
493 | if (flags & SECCOMP_FILTER_FLAG_TSYNC) | ||
494 | seccomp_sync_threads(); | ||
495 | |||
496 | return 0; | ||
320 | } | 497 | } |
321 | 498 | ||
322 | /* get_seccomp_filter - increments the reference count of the filter on @tsk */ | 499 | /* get_seccomp_filter - increments the reference count of the filter on @tsk */ |
@@ -329,6 +506,14 @@ void get_seccomp_filter(struct task_struct *tsk) | |||
329 | atomic_inc(&orig->usage); | 506 | atomic_inc(&orig->usage); |
330 | } | 507 | } |
331 | 508 | ||
509 | static inline void seccomp_filter_free(struct seccomp_filter *filter) | ||
510 | { | ||
511 | if (filter) { | ||
512 | bpf_prog_free(filter->prog); | ||
513 | kfree(filter); | ||
514 | } | ||
515 | } | ||
516 | |||
332 | /* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */ | 517 | /* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */ |
333 | void put_seccomp_filter(struct task_struct *tsk) | 518 | void put_seccomp_filter(struct task_struct *tsk) |
334 | { | 519 | { |
@@ -337,8 +522,7 @@ void put_seccomp_filter(struct task_struct *tsk) | |||
337 | while (orig && atomic_dec_and_test(&orig->usage)) { | 522 | while (orig && atomic_dec_and_test(&orig->usage)) { |
338 | struct seccomp_filter *freeme = orig; | 523 | struct seccomp_filter *freeme = orig; |
339 | orig = orig->prev; | 524 | orig = orig->prev; |
340 | sk_filter_free(freeme->prog); | 525 | seccomp_filter_free(freeme); |
341 | kfree(freeme); | ||
342 | } | 526 | } |
343 | } | 527 | } |
344 | 528 | ||
@@ -382,12 +566,17 @@ static int mode1_syscalls_32[] = { | |||
382 | 566 | ||
383 | int __secure_computing(int this_syscall) | 567 | int __secure_computing(int this_syscall) |
384 | { | 568 | { |
385 | int mode = current->seccomp.mode; | ||
386 | int exit_sig = 0; | 569 | int exit_sig = 0; |
387 | int *syscall; | 570 | int *syscall; |
388 | u32 ret; | 571 | u32 ret; |
389 | 572 | ||
390 | switch (mode) { | 573 | /* |
574 | * Make sure that any changes to mode from another thread have | ||
575 | * been seen after TIF_SECCOMP was seen. | ||
576 | */ | ||
577 | rmb(); | ||
578 | |||
579 | switch (current->seccomp.mode) { | ||
391 | case SECCOMP_MODE_STRICT: | 580 | case SECCOMP_MODE_STRICT: |
392 | syscall = mode1_syscalls; | 581 | syscall = mode1_syscalls; |
393 | #ifdef CONFIG_COMPAT | 582 | #ifdef CONFIG_COMPAT |
@@ -473,47 +662,152 @@ long prctl_get_seccomp(void) | |||
473 | } | 662 | } |
474 | 663 | ||
475 | /** | 664 | /** |
476 | * prctl_set_seccomp: configures current->seccomp.mode | 665 | * seccomp_set_mode_strict: internal function for setting strict seccomp |
477 | * @seccomp_mode: requested mode to use | ||
478 | * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER | ||
479 | * | 666 | * |
480 | * This function may be called repeatedly with a @seccomp_mode of | 667 | * Once current->seccomp.mode is non-zero, it may not be changed. |
481 | * SECCOMP_MODE_FILTER to install additional filters. Every filter | 668 | * |
482 | * successfully installed will be evaluated (in reverse order) for each system | 669 | * Returns 0 on success or -EINVAL on failure. |
483 | * call the task makes. | 670 | */ |
671 | static long seccomp_set_mode_strict(void) | ||
672 | { | ||
673 | const unsigned long seccomp_mode = SECCOMP_MODE_STRICT; | ||
674 | long ret = -EINVAL; | ||
675 | |||
676 | spin_lock_irq(¤t->sighand->siglock); | ||
677 | |||
678 | if (!seccomp_may_assign_mode(seccomp_mode)) | ||
679 | goto out; | ||
680 | |||
681 | #ifdef TIF_NOTSC | ||
682 | disable_TSC(); | ||
683 | #endif | ||
684 | seccomp_assign_mode(current, seccomp_mode); | ||
685 | ret = 0; | ||
686 | |||
687 | out: | ||
688 | spin_unlock_irq(¤t->sighand->siglock); | ||
689 | |||
690 | return ret; | ||
691 | } | ||
692 | |||
693 | #ifdef CONFIG_SECCOMP_FILTER | ||
694 | /** | ||
695 | * seccomp_set_mode_filter: internal function for setting seccomp filter | ||
696 | * @flags: flags to change filter behavior | ||
697 | * @filter: struct sock_fprog containing filter | ||
698 | * | ||
699 | * This function may be called repeatedly to install additional filters. | ||
700 | * Every filter successfully installed will be evaluated (in reverse order) | ||
701 | * for each system call the task makes. | ||
484 | * | 702 | * |
485 | * Once current->seccomp.mode is non-zero, it may not be changed. | 703 | * Once current->seccomp.mode is non-zero, it may not be changed. |
486 | * | 704 | * |
487 | * Returns 0 on success or -EINVAL on failure. | 705 | * Returns 0 on success or -EINVAL on failure. |
488 | */ | 706 | */ |
489 | long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter) | 707 | static long seccomp_set_mode_filter(unsigned int flags, |
708 | const char __user *filter) | ||
490 | { | 709 | { |
710 | const unsigned long seccomp_mode = SECCOMP_MODE_FILTER; | ||
711 | struct seccomp_filter *prepared = NULL; | ||
491 | long ret = -EINVAL; | 712 | long ret = -EINVAL; |
492 | 713 | ||
493 | if (current->seccomp.mode && | 714 | /* Validate flags. */ |
494 | current->seccomp.mode != seccomp_mode) | 715 | if (flags & ~SECCOMP_FILTER_FLAG_MASK) |
716 | return -EINVAL; | ||
717 | |||
718 | /* Prepare the new filter before holding any locks. */ | ||
719 | prepared = seccomp_prepare_user_filter(filter); | ||
720 | if (IS_ERR(prepared)) | ||
721 | return PTR_ERR(prepared); | ||
722 | |||
723 | /* | ||
724 | * Make sure we cannot change seccomp or nnp state via TSYNC | ||
725 | * while another thread is in the middle of calling exec. | ||
726 | */ | ||
727 | if (flags & SECCOMP_FILTER_FLAG_TSYNC && | ||
728 | mutex_lock_killable(¤t->signal->cred_guard_mutex)) | ||
729 | goto out_free; | ||
730 | |||
731 | spin_lock_irq(¤t->sighand->siglock); | ||
732 | |||
733 | if (!seccomp_may_assign_mode(seccomp_mode)) | ||
734 | goto out; | ||
735 | |||
736 | ret = seccomp_attach_filter(flags, prepared); | ||
737 | if (ret) | ||
495 | goto out; | 738 | goto out; |
739 | /* Do not free the successfully attached filter. */ | ||
740 | prepared = NULL; | ||
741 | |||
742 | seccomp_assign_mode(current, seccomp_mode); | ||
743 | out: | ||
744 | spin_unlock_irq(¤t->sighand->siglock); | ||
745 | if (flags & SECCOMP_FILTER_FLAG_TSYNC) | ||
746 | mutex_unlock(¤t->signal->cred_guard_mutex); | ||
747 | out_free: | ||
748 | seccomp_filter_free(prepared); | ||
749 | return ret; | ||
750 | } | ||
751 | #else | ||
752 | static inline long seccomp_set_mode_filter(unsigned int flags, | ||
753 | const char __user *filter) | ||
754 | { | ||
755 | return -EINVAL; | ||
756 | } | ||
757 | #endif | ||
758 | |||
759 | /* Common entry point for both prctl and syscall. */ | ||
760 | static long do_seccomp(unsigned int op, unsigned int flags, | ||
761 | const char __user *uargs) | ||
762 | { | ||
763 | switch (op) { | ||
764 | case SECCOMP_SET_MODE_STRICT: | ||
765 | if (flags != 0 || uargs != NULL) | ||
766 | return -EINVAL; | ||
767 | return seccomp_set_mode_strict(); | ||
768 | case SECCOMP_SET_MODE_FILTER: | ||
769 | return seccomp_set_mode_filter(flags, uargs); | ||
770 | default: | ||
771 | return -EINVAL; | ||
772 | } | ||
773 | } | ||
774 | |||
775 | SYSCALL_DEFINE3(seccomp, unsigned int, op, unsigned int, flags, | ||
776 | const char __user *, uargs) | ||
777 | { | ||
778 | return do_seccomp(op, flags, uargs); | ||
779 | } | ||
780 | |||
781 | /** | ||
782 | * prctl_set_seccomp: configures current->seccomp.mode | ||
783 | * @seccomp_mode: requested mode to use | ||
784 | * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER | ||
785 | * | ||
786 | * Returns 0 on success or -EINVAL on failure. | ||
787 | */ | ||
788 | long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter) | ||
789 | { | ||
790 | unsigned int op; | ||
791 | char __user *uargs; | ||
496 | 792 | ||
497 | switch (seccomp_mode) { | 793 | switch (seccomp_mode) { |
498 | case SECCOMP_MODE_STRICT: | 794 | case SECCOMP_MODE_STRICT: |
499 | ret = 0; | 795 | op = SECCOMP_SET_MODE_STRICT; |
500 | #ifdef TIF_NOTSC | 796 | /* |
501 | disable_TSC(); | 797 | * Setting strict mode through prctl always ignored filter, |
502 | #endif | 798 | * so make sure it is always NULL here to pass the internal |
799 | * check in do_seccomp(). | ||
800 | */ | ||
801 | uargs = NULL; | ||
503 | break; | 802 | break; |
504 | #ifdef CONFIG_SECCOMP_FILTER | ||
505 | case SECCOMP_MODE_FILTER: | 803 | case SECCOMP_MODE_FILTER: |
506 | ret = seccomp_attach_user_filter(filter); | 804 | op = SECCOMP_SET_MODE_FILTER; |
507 | if (ret) | 805 | uargs = filter; |
508 | goto out; | ||
509 | break; | 806 | break; |
510 | #endif | ||
511 | default: | 807 | default: |
512 | goto out; | 808 | return -EINVAL; |
513 | } | 809 | } |
514 | 810 | ||
515 | current->seccomp.mode = seccomp_mode; | 811 | /* prctl interface doesn't have flags, so they are always zero. */ |
516 | set_thread_flag(TIF_SECCOMP); | 812 | return do_seccomp(op, 0, uargs); |
517 | out: | ||
518 | return ret; | ||
519 | } | 813 | } |
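Taken together, the seccomp changes split filter preparation from attachment, add SECCOMP_FILTER_FLAG_TSYNC, and expose both through the new seccomp(2) syscall that prctl(PR_SET_SECCOMP) now forwards to. A hedged userspace sketch of installing a filter on all threads through the new entry point (this assumes headers that define __NR_seccomp and the SECCOMP_SET_MODE_FILTER / SECCOMP_FILTER_FLAG_TSYNC constants from this series; the always-allow filter is a placeholder):

    #include <errno.h>
    #include <linux/filter.h>
    #include <linux/seccomp.h>
    #include <stdio.h>
    #include <sys/prctl.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    int install_filter_all_threads(void)
    {
            struct sock_filter insns[] = {
                    /* Placeholder: allow everything. A real filter would
                     * inspect seccomp_data.nr and return RET_ERRNO/KILL
                     * for unwanted syscalls. */
                    BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
            };
            struct sock_fprog prog = {
                    .len = sizeof(insns) / sizeof(insns[0]),
                    .filter = insns,
            };
            long ret;

            /* Required unless the caller has CAP_SYS_ADMIN, see the
             * check in seccomp_prepare_filter() above. */
            if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
                    return -errno;

            /* With TSYNC, a positive return is the TID of a thread that
             * could not be synchronized (see seccomp_can_sync_threads()). */
            ret = syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER,
                          SECCOMP_FILTER_FLAG_TSYNC, &prog);
            if (ret > 0)
                    fprintf(stderr, "thread %ld blocked TSYNC\n", ret);
            return ret;
    }
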
diff --git a/kernel/signal.c b/kernel/signal.c index 40b76e351e64..8f0876f9f6dd 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -2170,8 +2170,7 @@ static int ptrace_signal(int signr, siginfo_t *info) | |||
2170 | return signr; | 2170 | return signr; |
2171 | } | 2171 | } |
2172 | 2172 | ||
2173 | int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, | 2173 | int get_signal(struct ksignal *ksig) |
2174 | struct pt_regs *regs, void *cookie) | ||
2175 | { | 2174 | { |
2176 | struct sighand_struct *sighand = current->sighand; | 2175 | struct sighand_struct *sighand = current->sighand; |
2177 | struct signal_struct *signal = current->signal; | 2176 | struct signal_struct *signal = current->signal; |
@@ -2241,13 +2240,13 @@ relock: | |||
2241 | goto relock; | 2240 | goto relock; |
2242 | } | 2241 | } |
2243 | 2242 | ||
2244 | signr = dequeue_signal(current, ¤t->blocked, info); | 2243 | signr = dequeue_signal(current, ¤t->blocked, &ksig->info); |
2245 | 2244 | ||
2246 | if (!signr) | 2245 | if (!signr) |
2247 | break; /* will return 0 */ | 2246 | break; /* will return 0 */ |
2248 | 2247 | ||
2249 | if (unlikely(current->ptrace) && signr != SIGKILL) { | 2248 | if (unlikely(current->ptrace) && signr != SIGKILL) { |
2250 | signr = ptrace_signal(signr, info); | 2249 | signr = ptrace_signal(signr, &ksig->info); |
2251 | if (!signr) | 2250 | if (!signr) |
2252 | continue; | 2251 | continue; |
2253 | } | 2252 | } |
@@ -2255,13 +2254,13 @@ relock: | |||
2255 | ka = &sighand->action[signr-1]; | 2254 | ka = &sighand->action[signr-1]; |
2256 | 2255 | ||
2257 | /* Trace actually delivered signals. */ | 2256 | /* Trace actually delivered signals. */ |
2258 | trace_signal_deliver(signr, info, ka); | 2257 | trace_signal_deliver(signr, &ksig->info, ka); |
2259 | 2258 | ||
2260 | if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */ | 2259 | if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */ |
2261 | continue; | 2260 | continue; |
2262 | if (ka->sa.sa_handler != SIG_DFL) { | 2261 | if (ka->sa.sa_handler != SIG_DFL) { |
2263 | /* Run the handler. */ | 2262 | /* Run the handler. */ |
2264 | *return_ka = *ka; | 2263 | ksig->ka = *ka; |
2265 | 2264 | ||
2266 | if (ka->sa.sa_flags & SA_ONESHOT) | 2265 | if (ka->sa.sa_flags & SA_ONESHOT) |
2267 | ka->sa.sa_handler = SIG_DFL; | 2266 | ka->sa.sa_handler = SIG_DFL; |
@@ -2311,7 +2310,7 @@ relock: | |||
2311 | spin_lock_irq(&sighand->siglock); | 2310 | spin_lock_irq(&sighand->siglock); |
2312 | } | 2311 | } |
2313 | 2312 | ||
2314 | if (likely(do_signal_stop(info->si_signo))) { | 2313 | if (likely(do_signal_stop(ksig->info.si_signo))) { |
2315 | /* It released the siglock. */ | 2314 | /* It released the siglock. */ |
2316 | goto relock; | 2315 | goto relock; |
2317 | } | 2316 | } |
@@ -2332,7 +2331,7 @@ relock: | |||
2332 | 2331 | ||
2333 | if (sig_kernel_coredump(signr)) { | 2332 | if (sig_kernel_coredump(signr)) { |
2334 | if (print_fatal_signals) | 2333 | if (print_fatal_signals) |
2335 | print_fatal_signal(info->si_signo); | 2334 | print_fatal_signal(ksig->info.si_signo); |
2336 | proc_coredump_connector(current); | 2335 | proc_coredump_connector(current); |
2337 | /* | 2336 | /* |
2338 | * If it was able to dump core, this kills all | 2337 | * If it was able to dump core, this kills all |
@@ -2342,34 +2341,32 @@ relock: | |||
2342 | * first and our do_group_exit call below will use | 2341 | * first and our do_group_exit call below will use |
2343 | * that value and ignore the one we pass it. | 2342 | * that value and ignore the one we pass it. |
2344 | */ | 2343 | */ |
2345 | do_coredump(info); | 2344 | do_coredump(&ksig->info); |
2346 | } | 2345 | } |
2347 | 2346 | ||
2348 | /* | 2347 | /* |
2349 | * Death signals, no core dump. | 2348 | * Death signals, no core dump. |
2350 | */ | 2349 | */ |
2351 | do_group_exit(info->si_signo); | 2350 | do_group_exit(ksig->info.si_signo); |
2352 | /* NOTREACHED */ | 2351 | /* NOTREACHED */ |
2353 | } | 2352 | } |
2354 | spin_unlock_irq(&sighand->siglock); | 2353 | spin_unlock_irq(&sighand->siglock); |
2355 | return signr; | 2354 | |
2355 | ksig->sig = signr; | ||
2356 | return ksig->sig > 0; | ||
2356 | } | 2357 | } |
2357 | 2358 | ||
2358 | /** | 2359 | /** |
2359 | * signal_delivered - | 2360 | * signal_delivered - |
2360 | * @sig: number of signal being delivered | 2361 | * @ksig: kernel signal struct |
2361 | * @info: siginfo_t of signal being delivered | ||
2362 | * @ka: sigaction setting that chose the handler | ||
2363 | * @regs: user register state | ||
2364 | * @stepping: nonzero if debugger single-step or block-step in use | 2362 | * @stepping: nonzero if debugger single-step or block-step in use |
2365 | * | 2363 | * |
2366 | * This function should be called when a signal has successfully been | 2364 | * This function should be called when a signal has successfully been |
2367 | * delivered. It updates the blocked signals accordingly (@ka->sa.sa_mask | 2365 | * delivered. It updates the blocked signals accordingly (@ksig->ka.sa.sa_mask |
2368 | * is always blocked, and the signal itself is blocked unless %SA_NODEFER | 2366 | * is always blocked, and the signal itself is blocked unless %SA_NODEFER |
2369 | * is set in @ka->sa.sa_flags. Tracing is notified. | 2367 | * is set in @ksig->ka.sa.sa_flags. Tracing is notified. |
2370 | */ | 2368 | */ |
2371 | void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka, | 2369 | static void signal_delivered(struct ksignal *ksig, int stepping) |
2372 | struct pt_regs *regs, int stepping) | ||
2373 | { | 2370 | { |
2374 | sigset_t blocked; | 2371 | sigset_t blocked; |
2375 | 2372 | ||
@@ -2379,11 +2376,11 @@ void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka, | |||
2379 | simply clear the restore sigmask flag. */ | 2376 | simply clear the restore sigmask flag. */ |
2380 | clear_restore_sigmask(); | 2377 | clear_restore_sigmask(); |
2381 | 2378 | ||
2382 | sigorsets(&blocked, ¤t->blocked, &ka->sa.sa_mask); | 2379 | sigorsets(&blocked, ¤t->blocked, &ksig->ka.sa.sa_mask); |
2383 | if (!(ka->sa.sa_flags & SA_NODEFER)) | 2380 | if (!(ksig->ka.sa.sa_flags & SA_NODEFER)) |
2384 | sigaddset(&blocked, sig); | 2381 | sigaddset(&blocked, ksig->sig); |
2385 | set_current_blocked(&blocked); | 2382 | set_current_blocked(&blocked); |
2386 | tracehook_signal_handler(sig, info, ka, regs, stepping); | 2383 | tracehook_signal_handler(stepping); |
2387 | } | 2384 | } |
2388 | 2385 | ||
2389 | void signal_setup_done(int failed, struct ksignal *ksig, int stepping) | 2386 | void signal_setup_done(int failed, struct ksignal *ksig, int stepping) |
@@ -2391,8 +2388,7 @@ void signal_setup_done(int failed, struct ksignal *ksig, int stepping) | |||
2391 | if (failed) | 2388 | if (failed) |
2392 | force_sigsegv(ksig->sig, current); | 2389 | force_sigsegv(ksig->sig, current); |
2393 | else | 2390 | else |
2394 | signal_delivered(ksig->sig, &ksig->info, &ksig->ka, | 2391 | signal_delivered(ksig, stepping); |
2395 | signal_pt_regs(), stepping); | ||
2396 | } | 2392 | } |
2397 | 2393 | ||
2398 | /* | 2394 | /* |
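The signal.c hunk folds the old get_signal_to_deliver() arguments into struct ksignal: get_signal() fills ksig and returns true when a handler should run, and signal_setup_done() replaces the direct signal_delivered() call. A hedged sketch of the arch-side do_signal() shape this implies (setup_rt_frame() and the single-step flag are arch-specific and only named for illustration):

    /* Sketch of an arch do_signal() against the new API; real per-arch
     * code also handles syscall restart and the TIF work flags. */
    static void do_signal(struct pt_regs *regs)
    {
            struct ksignal ksig;

            if (get_signal(&ksig)) {
                    /* Deliver the signal: build the user-space frame. */
                    int failed = setup_rt_frame(&ksig, regs);

                    /* On failure this forces SIGSEGV; on success it blocks
                     * ksig.ka.sa_mask (and the signal itself unless
                     * SA_NODEFER) and notifies tracing. */
                    signal_setup_done(failed, &ksig,
                                      test_thread_flag(TIF_SINGLESTEP));
                    return;
            }

            /* No handler ran: restore the saved sigmask if needed. */
            restore_saved_sigmask();
    }
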
diff --git a/kernel/smp.c b/kernel/smp.c index 487653b5844f..aff8aa14f547 100644 --- a/kernel/smp.c +++ b/kernel/smp.c | |||
@@ -670,7 +670,7 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), | |||
670 | if (cond_func(cpu, info)) { | 670 | if (cond_func(cpu, info)) { |
671 | ret = smp_call_function_single(cpu, func, | 671 | ret = smp_call_function_single(cpu, func, |
672 | info, wait); | 672 | info, wait); |
673 | WARN_ON_ONCE(!ret); | 673 | WARN_ON_ONCE(ret); |
674 | } | 674 | } |
675 | preempt_enable(); | 675 | preempt_enable(); |
676 | } | 676 | } |
diff --git a/kernel/sys.c b/kernel/sys.c index 66a751ebf9d9..ce8129192a26 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -1990,12 +1990,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, | |||
1990 | if (arg2 != 1 || arg3 || arg4 || arg5) | 1990 | if (arg2 != 1 || arg3 || arg4 || arg5) |
1991 | return -EINVAL; | 1991 | return -EINVAL; |
1992 | 1992 | ||
1993 | current->no_new_privs = 1; | 1993 | task_set_no_new_privs(current); |
1994 | break; | 1994 | break; |
1995 | case PR_GET_NO_NEW_PRIVS: | 1995 | case PR_GET_NO_NEW_PRIVS: |
1996 | if (arg2 || arg3 || arg4 || arg5) | 1996 | if (arg2 || arg3 || arg4 || arg5) |
1997 | return -EINVAL; | 1997 | return -EINVAL; |
1998 | return current->no_new_privs ? 1 : 0; | 1998 | return task_no_new_privs(current) ? 1 : 0; |
1999 | case PR_GET_THP_DISABLE: | 1999 | case PR_GET_THP_DISABLE: |
2000 | if (arg2 || arg3 || arg4 || arg5) | 2000 | if (arg2 || arg3 || arg4 || arg5) |
2001 | return -EINVAL; | 2001 | return -EINVAL; |
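The sys.c hunk stops poking current->no_new_privs directly and goes through task_set_no_new_privs()/task_no_new_privs(); the seccomp TSYNC path above needs to set the flag on another thread, so accessors built on atomic bit operations avoid a racy bitfield read-modify-write. A hedged sketch of what such accessors could look like (the atomic_flags field name and bit number are assumptions for illustration):

    /* Sketch: accessor pair over an atomic flags word so one thread can
     * safely set no_new_privs on another, as seccomp_sync_threads() does. */
    #define TASK_FLAG_NO_NEW_PRIVS  0       /* bit number: illustrative */

    static inline bool task_no_new_privs(struct task_struct *p)
    {
            return test_bit(TASK_FLAG_NO_NEW_PRIVS, &p->atomic_flags);
    }

    static inline void task_set_no_new_privs(struct task_struct *p)
    {
            set_bit(TASK_FLAG_NO_NEW_PRIVS, &p->atomic_flags);
    }
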
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 36441b51b5df..391d4ddb6f4b 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c | |||
@@ -25,6 +25,7 @@ cond_syscall(sys_swapon); | |||
25 | cond_syscall(sys_swapoff); | 25 | cond_syscall(sys_swapoff); |
26 | cond_syscall(sys_kexec_load); | 26 | cond_syscall(sys_kexec_load); |
27 | cond_syscall(compat_sys_kexec_load); | 27 | cond_syscall(compat_sys_kexec_load); |
28 | cond_syscall(sys_kexec_file_load); | ||
28 | cond_syscall(sys_init_module); | 29 | cond_syscall(sys_init_module); |
29 | cond_syscall(sys_finit_module); | 30 | cond_syscall(sys_finit_module); |
30 | cond_syscall(sys_delete_module); | 31 | cond_syscall(sys_delete_module); |
@@ -197,6 +198,7 @@ cond_syscall(compat_sys_timerfd_settime); | |||
197 | cond_syscall(compat_sys_timerfd_gettime); | 198 | cond_syscall(compat_sys_timerfd_gettime); |
198 | cond_syscall(sys_eventfd); | 199 | cond_syscall(sys_eventfd); |
199 | cond_syscall(sys_eventfd2); | 200 | cond_syscall(sys_eventfd2); |
201 | cond_syscall(sys_memfd_create); | ||
200 | 202 | ||
201 | /* performance counters: */ | 203 | /* performance counters: */ |
202 | cond_syscall(sys_perf_event_open); | 204 | cond_syscall(sys_perf_event_open); |
@@ -213,3 +215,6 @@ cond_syscall(compat_sys_open_by_handle_at); | |||
213 | 215 | ||
214 | /* compare kernel pointers */ | 216 | /* compare kernel pointers */ |
215 | cond_syscall(sys_kcmp); | 217 | cond_syscall(sys_kcmp); |
218 | |||
219 | /* operate on Secure Computing state */ | ||
220 | cond_syscall(sys_seccomp); | ||
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 75b22e22a72c..75875a741b5e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -1240,8 +1240,7 @@ static struct ctl_table vm_table[] = { | |||
1240 | .maxlen = sizeof(unsigned long), | 1240 | .maxlen = sizeof(unsigned long), |
1241 | .mode = 0644, | 1241 | .mode = 0644, |
1242 | .proc_handler = hugetlb_sysctl_handler, | 1242 | .proc_handler = hugetlb_sysctl_handler, |
1243 | .extra1 = (void *)&hugetlb_zero, | 1243 | .extra1 = &zero, |
1244 | .extra2 = (void *)&hugetlb_infinity, | ||
1245 | }, | 1244 | }, |
1246 | #ifdef CONFIG_NUMA | 1245 | #ifdef CONFIG_NUMA |
1247 | { | 1246 | { |
@@ -1250,8 +1249,7 @@ static struct ctl_table vm_table[] = { | |||
1250 | .maxlen = sizeof(unsigned long), | 1249 | .maxlen = sizeof(unsigned long), |
1251 | .mode = 0644, | 1250 | .mode = 0644, |
1252 | .proc_handler = &hugetlb_mempolicy_sysctl_handler, | 1251 | .proc_handler = &hugetlb_mempolicy_sysctl_handler, |
1253 | .extra1 = (void *)&hugetlb_zero, | 1252 | .extra1 = &zero, |
1254 | .extra2 = (void *)&hugetlb_infinity, | ||
1255 | }, | 1253 | }, |
1256 | #endif | 1254 | #endif |
1257 | { | 1255 | { |
@@ -1274,8 +1272,7 @@ static struct ctl_table vm_table[] = { | |||
1274 | .maxlen = sizeof(unsigned long), | 1272 | .maxlen = sizeof(unsigned long), |
1275 | .mode = 0644, | 1273 | .mode = 0644, |
1276 | .proc_handler = hugetlb_overcommit_handler, | 1274 | .proc_handler = hugetlb_overcommit_handler, |
1277 | .extra1 = (void *)&hugetlb_zero, | 1275 | .extra1 = &zero, |
1278 | .extra2 = (void *)&hugetlb_infinity, | ||
1279 | }, | 1276 | }, |
1280 | #endif | 1277 | #endif |
1281 | { | 1278 | { |
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 653cbbd9e7ad..e4ba9a5a5ccb 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c | |||
@@ -522,6 +522,7 @@ static const struct bin_table bin_net_ipv6_conf_var_table[] = { | |||
522 | { CTL_INT, NET_IPV6_ACCEPT_RA_RT_INFO_MAX_PLEN, "accept_ra_rt_info_max_plen" }, | 522 | { CTL_INT, NET_IPV6_ACCEPT_RA_RT_INFO_MAX_PLEN, "accept_ra_rt_info_max_plen" }, |
523 | { CTL_INT, NET_IPV6_PROXY_NDP, "proxy_ndp" }, | 523 | { CTL_INT, NET_IPV6_PROXY_NDP, "proxy_ndp" }, |
524 | { CTL_INT, NET_IPV6_ACCEPT_SOURCE_ROUTE, "accept_source_route" }, | 524 | { CTL_INT, NET_IPV6_ACCEPT_SOURCE_ROUTE, "accept_source_route" }, |
525 | { CTL_INT, NET_IPV6_ACCEPT_RA_FROM_LOCAL, "accept_ra_from_local" }, | ||
525 | {} | 526 | {} |
526 | }; | 527 | }; |
527 | 528 | ||
diff --git a/kernel/system_keyring.c b/kernel/system_keyring.c index 52ebc70263f4..875f64e8935b 100644 --- a/kernel/system_keyring.c +++ b/kernel/system_keyring.c | |||
@@ -89,6 +89,7 @@ static __init int load_system_certificate_list(void) | |||
89 | pr_err("Problem loading in-kernel X.509 certificate (%ld)\n", | 89 | pr_err("Problem loading in-kernel X.509 certificate (%ld)\n", |
90 | PTR_ERR(key)); | 90 | PTR_ERR(key)); |
91 | } else { | 91 | } else { |
92 | set_bit(KEY_FLAG_BUILTIN, &key_ref_to_ptr(key)->flags); | ||
92 | pr_notice("Loaded X.509 cert '%s'\n", | 93 | pr_notice("Loaded X.509 cert '%s'\n", |
93 | key_ref_to_ptr(key)->description); | 94 | key_ref_to_ptr(key)->description); |
94 | key_ref_put(key); | 95 | key_ref_put(key); |
diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c index 12d6ebbfdd83..0dbab6d1acb4 100644 --- a/kernel/test_kprobes.c +++ b/kernel/test_kprobes.c | |||
@@ -14,6 +14,8 @@ | |||
14 | * the GNU General Public License for more details. | 14 | * the GNU General Public License for more details. |
15 | */ | 15 | */ |
16 | 16 | ||
17 | #define pr_fmt(fmt) "Kprobe smoke test: " fmt | ||
18 | |||
17 | #include <linux/kernel.h> | 19 | #include <linux/kernel.h> |
18 | #include <linux/kprobes.h> | 20 | #include <linux/kprobes.h> |
19 | #include <linux/random.h> | 21 | #include <linux/random.h> |
@@ -41,8 +43,7 @@ static void kp_post_handler(struct kprobe *p, struct pt_regs *regs, | |||
41 | { | 43 | { |
42 | if (preh_val != (rand1 / div_factor)) { | 44 | if (preh_val != (rand1 / div_factor)) { |
43 | handler_errors++; | 45 | handler_errors++; |
44 | printk(KERN_ERR "Kprobe smoke test failed: " | 46 | pr_err("incorrect value in post_handler\n"); |
45 | "incorrect value in post_handler\n"); | ||
46 | } | 47 | } |
47 | posth_val = preh_val + div_factor; | 48 | posth_val = preh_val + div_factor; |
48 | } | 49 | } |
@@ -59,8 +60,7 @@ static int test_kprobe(void) | |||
59 | 60 | ||
60 | ret = register_kprobe(&kp); | 61 | ret = register_kprobe(&kp); |
61 | if (ret < 0) { | 62 | if (ret < 0) { |
62 | printk(KERN_ERR "Kprobe smoke test failed: " | 63 | pr_err("register_kprobe returned %d\n", ret); |
63 | "register_kprobe returned %d\n", ret); | ||
64 | return ret; | 64 | return ret; |
65 | } | 65 | } |
66 | 66 | ||
@@ -68,14 +68,12 @@ static int test_kprobe(void) | |||
68 | unregister_kprobe(&kp); | 68 | unregister_kprobe(&kp); |
69 | 69 | ||
70 | if (preh_val == 0) { | 70 | if (preh_val == 0) { |
71 | printk(KERN_ERR "Kprobe smoke test failed: " | 71 | pr_err("kprobe pre_handler not called\n"); |
72 | "kprobe pre_handler not called\n"); | ||
73 | handler_errors++; | 72 | handler_errors++; |
74 | } | 73 | } |
75 | 74 | ||
76 | if (posth_val == 0) { | 75 | if (posth_val == 0) { |
77 | printk(KERN_ERR "Kprobe smoke test failed: " | 76 | pr_err("kprobe post_handler not called\n"); |
78 | "kprobe post_handler not called\n"); | ||
79 | handler_errors++; | 77 | handler_errors++; |
80 | } | 78 | } |
81 | 79 | ||
@@ -98,8 +96,7 @@ static void kp_post_handler2(struct kprobe *p, struct pt_regs *regs, | |||
98 | { | 96 | { |
99 | if (preh_val != (rand1 / div_factor) + 1) { | 97 | if (preh_val != (rand1 / div_factor) + 1) { |
100 | handler_errors++; | 98 | handler_errors++; |
101 | printk(KERN_ERR "Kprobe smoke test failed: " | 99 | pr_err("incorrect value in post_handler2\n"); |
102 | "incorrect value in post_handler2\n"); | ||
103 | } | 100 | } |
104 | posth_val = preh_val + div_factor; | 101 | posth_val = preh_val + div_factor; |
105 | } | 102 | } |
@@ -120,8 +117,7 @@ static int test_kprobes(void) | |||
120 | kp.flags = 0; | 117 | kp.flags = 0; |
121 | ret = register_kprobes(kps, 2); | 118 | ret = register_kprobes(kps, 2); |
122 | if (ret < 0) { | 119 | if (ret < 0) { |
123 | printk(KERN_ERR "Kprobe smoke test failed: " | 120 | pr_err("register_kprobes returned %d\n", ret); |
124 | "register_kprobes returned %d\n", ret); | ||
125 | return ret; | 121 | return ret; |
126 | } | 122 | } |
127 | 123 | ||
@@ -130,14 +126,12 @@ static int test_kprobes(void) | |||
130 | ret = target(rand1); | 126 | ret = target(rand1); |
131 | 127 | ||
132 | if (preh_val == 0) { | 128 | if (preh_val == 0) { |
133 | printk(KERN_ERR "Kprobe smoke test failed: " | 129 | pr_err("kprobe pre_handler not called\n"); |
134 | "kprobe pre_handler not called\n"); | ||
135 | handler_errors++; | 130 | handler_errors++; |
136 | } | 131 | } |
137 | 132 | ||
138 | if (posth_val == 0) { | 133 | if (posth_val == 0) { |
139 | printk(KERN_ERR "Kprobe smoke test failed: " | 134 | pr_err("kprobe post_handler not called\n"); |
140 | "kprobe post_handler not called\n"); | ||
141 | handler_errors++; | 135 | handler_errors++; |
142 | } | 136 | } |
143 | 137 | ||
@@ -146,14 +140,12 @@ static int test_kprobes(void) | |||
146 | ret = target2(rand1); | 140 | ret = target2(rand1); |
147 | 141 | ||
148 | if (preh_val == 0) { | 142 | if (preh_val == 0) { |
149 | printk(KERN_ERR "Kprobe smoke test failed: " | 143 | pr_err("kprobe pre_handler2 not called\n"); |
150 | "kprobe pre_handler2 not called\n"); | ||
151 | handler_errors++; | 144 | handler_errors++; |
152 | } | 145 | } |
153 | 146 | ||
154 | if (posth_val == 0) { | 147 | if (posth_val == 0) { |
155 | printk(KERN_ERR "Kprobe smoke test failed: " | 148 | pr_err("kprobe post_handler2 not called\n"); |
156 | "kprobe post_handler2 not called\n"); | ||
157 | handler_errors++; | 149 | handler_errors++; |
158 | } | 150 | } |
159 | 151 | ||
@@ -166,8 +158,7 @@ static u32 j_kprobe_target(u32 value) | |||
166 | { | 158 | { |
167 | if (value != rand1) { | 159 | if (value != rand1) { |
168 | handler_errors++; | 160 | handler_errors++; |
169 | printk(KERN_ERR "Kprobe smoke test failed: " | 161 | pr_err("incorrect value in jprobe handler\n"); |
170 | "incorrect value in jprobe handler\n"); | ||
171 | } | 162 | } |
172 | 163 | ||
173 | jph_val = rand1; | 164 | jph_val = rand1; |
@@ -186,16 +177,14 @@ static int test_jprobe(void) | |||
186 | 177 | ||
187 | ret = register_jprobe(&jp); | 178 | ret = register_jprobe(&jp); |
188 | if (ret < 0) { | 179 | if (ret < 0) { |
189 | printk(KERN_ERR "Kprobe smoke test failed: " | 180 | pr_err("register_jprobe returned %d\n", ret); |
190 | "register_jprobe returned %d\n", ret); | ||
191 | return ret; | 181 | return ret; |
192 | } | 182 | } |
193 | 183 | ||
194 | ret = target(rand1); | 184 | ret = target(rand1); |
195 | unregister_jprobe(&jp); | 185 | unregister_jprobe(&jp); |
196 | if (jph_val == 0) { | 186 | if (jph_val == 0) { |
197 | printk(KERN_ERR "Kprobe smoke test failed: " | 187 | pr_err("jprobe handler not called\n"); |
198 | "jprobe handler not called\n"); | ||
199 | handler_errors++; | 188 | handler_errors++; |
200 | } | 189 | } |
201 | 190 | ||
@@ -217,24 +206,21 @@ static int test_jprobes(void) | |||
217 | jp.kp.flags = 0; | 206 | jp.kp.flags = 0; |
218 | ret = register_jprobes(jps, 2); | 207 | ret = register_jprobes(jps, 2); |
219 | if (ret < 0) { | 208 | if (ret < 0) { |
220 | printk(KERN_ERR "Kprobe smoke test failed: " | 209 | pr_err("register_jprobes returned %d\n", ret); |
221 | "register_jprobes returned %d\n", ret); | ||
222 | return ret; | 210 | return ret; |
223 | } | 211 | } |
224 | 212 | ||
225 | jph_val = 0; | 213 | jph_val = 0; |
226 | ret = target(rand1); | 214 | ret = target(rand1); |
227 | if (jph_val == 0) { | 215 | if (jph_val == 0) { |
228 | printk(KERN_ERR "Kprobe smoke test failed: " | 216 | pr_err("jprobe handler not called\n"); |
229 | "jprobe handler not called\n"); | ||
230 | handler_errors++; | 217 | handler_errors++; |
231 | } | 218 | } |
232 | 219 | ||
233 | jph_val = 0; | 220 | jph_val = 0; |
234 | ret = target2(rand1); | 221 | ret = target2(rand1); |
235 | if (jph_val == 0) { | 222 | if (jph_val == 0) { |
236 | printk(KERN_ERR "Kprobe smoke test failed: " | 223 | pr_err("jprobe handler2 not called\n"); |
237 | "jprobe handler2 not called\n"); | ||
238 | handler_errors++; | 224 | handler_errors++; |
239 | } | 225 | } |
240 | unregister_jprobes(jps, 2); | 226 | unregister_jprobes(jps, 2); |
@@ -256,13 +242,11 @@ static int return_handler(struct kretprobe_instance *ri, struct pt_regs *regs) | |||
256 | 242 | ||
257 | if (ret != (rand1 / div_factor)) { | 243 | if (ret != (rand1 / div_factor)) { |
258 | handler_errors++; | 244 | handler_errors++; |
259 | printk(KERN_ERR "Kprobe smoke test failed: " | 245 | pr_err("incorrect value in kretprobe handler\n"); |
260 | "incorrect value in kretprobe handler\n"); | ||
261 | } | 246 | } |
262 | if (krph_val == 0) { | 247 | if (krph_val == 0) { |
263 | handler_errors++; | 248 | handler_errors++; |
264 | printk(KERN_ERR "Kprobe smoke test failed: " | 249 | pr_err("call to kretprobe entry handler failed\n"); |
265 | "call to kretprobe entry handler failed\n"); | ||
266 | } | 250 | } |
267 | 251 | ||
268 | krph_val = rand1; | 252 | krph_val = rand1; |
@@ -281,16 +265,14 @@ static int test_kretprobe(void) | |||
281 | 265 | ||
282 | ret = register_kretprobe(&rp); | 266 | ret = register_kretprobe(&rp); |
283 | if (ret < 0) { | 267 | if (ret < 0) { |
284 | printk(KERN_ERR "Kprobe smoke test failed: " | 268 | pr_err("register_kretprobe returned %d\n", ret); |
285 | "register_kretprobe returned %d\n", ret); | ||
286 | return ret; | 269 | return ret; |
287 | } | 270 | } |
288 | 271 | ||
289 | ret = target(rand1); | 272 | ret = target(rand1); |
290 | unregister_kretprobe(&rp); | 273 | unregister_kretprobe(&rp); |
291 | if (krph_val != rand1) { | 274 | if (krph_val != rand1) { |
292 | printk(KERN_ERR "Kprobe smoke test failed: " | 275 | pr_err("kretprobe handler not called\n"); |
293 | "kretprobe handler not called\n"); | ||
294 | handler_errors++; | 276 | handler_errors++; |
295 | } | 277 | } |
296 | 278 | ||
@@ -303,13 +285,11 @@ static int return_handler2(struct kretprobe_instance *ri, struct pt_regs *regs) | |||
303 | 285 | ||
304 | if (ret != (rand1 / div_factor) + 1) { | 286 | if (ret != (rand1 / div_factor) + 1) { |
305 | handler_errors++; | 287 | handler_errors++; |
306 | printk(KERN_ERR "Kprobe smoke test failed: " | 288 | pr_err("incorrect value in kretprobe handler2\n"); |
307 | "incorrect value in kretprobe handler2\n"); | ||
308 | } | 289 | } |
309 | if (krph_val == 0) { | 290 | if (krph_val == 0) { |
310 | handler_errors++; | 291 | handler_errors++; |
311 | printk(KERN_ERR "Kprobe smoke test failed: " | 292 | pr_err("call to kretprobe entry handler failed\n"); |
312 | "call to kretprobe entry handler failed\n"); | ||
313 | } | 293 | } |
314 | 294 | ||
315 | krph_val = rand1; | 295 | krph_val = rand1; |
@@ -332,24 +312,21 @@ static int test_kretprobes(void) | |||
332 | rp.kp.flags = 0; | 312 | rp.kp.flags = 0; |
333 | ret = register_kretprobes(rps, 2); | 313 | ret = register_kretprobes(rps, 2); |
334 | if (ret < 0) { | 314 | if (ret < 0) { |
335 | printk(KERN_ERR "Kprobe smoke test failed: " | 315 | pr_err("register_kretprobe returned %d\n", ret); |
336 | "register_kretprobe returned %d\n", ret); | ||
337 | return ret; | 316 | return ret; |
338 | } | 317 | } |
339 | 318 | ||
340 | krph_val = 0; | 319 | krph_val = 0; |
341 | ret = target(rand1); | 320 | ret = target(rand1); |
342 | if (krph_val != rand1) { | 321 | if (krph_val != rand1) { |
343 | printk(KERN_ERR "Kprobe smoke test failed: " | 322 | pr_err("kretprobe handler not called\n"); |
344 | "kretprobe handler not called\n"); | ||
345 | handler_errors++; | 323 | handler_errors++; |
346 | } | 324 | } |
347 | 325 | ||
348 | krph_val = 0; | 326 | krph_val = 0; |
349 | ret = target2(rand1); | 327 | ret = target2(rand1); |
350 | if (krph_val != rand1) { | 328 | if (krph_val != rand1) { |
351 | printk(KERN_ERR "Kprobe smoke test failed: " | 329 | pr_err("kretprobe handler2 not called\n"); |
352 | "kretprobe handler2 not called\n"); | ||
353 | handler_errors++; | 330 | handler_errors++; |
354 | } | 331 | } |
355 | unregister_kretprobes(rps, 2); | 332 | unregister_kretprobes(rps, 2); |
@@ -368,7 +345,7 @@ int init_test_probes(void) | |||
368 | rand1 = prandom_u32(); | 345 | rand1 = prandom_u32(); |
369 | } while (rand1 <= div_factor); | 346 | } while (rand1 <= div_factor); |
370 | 347 | ||
371 | printk(KERN_INFO "Kprobe smoke test started\n"); | 348 | pr_info("started\n"); |
372 | num_tests++; | 349 | num_tests++; |
373 | ret = test_kprobe(); | 350 | ret = test_kprobe(); |
374 | if (ret < 0) | 351 | if (ret < 0) |
@@ -402,13 +379,11 @@ int init_test_probes(void) | |||
402 | #endif /* CONFIG_KRETPROBES */ | 379 | #endif /* CONFIG_KRETPROBES */ |
403 | 380 | ||
404 | if (errors) | 381 | if (errors) |
405 | printk(KERN_ERR "BUG: Kprobe smoke test: %d out of " | 382 | pr_err("BUG: %d out of %d tests failed\n", errors, num_tests); |
406 | "%d tests failed\n", errors, num_tests); | ||
407 | else if (handler_errors) | 383 | else if (handler_errors) |
408 | printk(KERN_ERR "BUG: Kprobe smoke test: %d error(s) " | 384 | pr_err("BUG: %d error(s) running handlers\n", handler_errors); |
409 | "running handlers\n", handler_errors); | ||
410 | else | 385 | else |
411 | printk(KERN_INFO "Kprobe smoke test passed successfully\n"); | 386 | pr_info("passed successfully\n"); |
412 | 387 | ||
413 | return 0; | 388 | return 0; |
414 | } | 389 | } |
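The test_kprobes.c conversion leans on the pr_fmt() convention: defining pr_fmt() before the printk-using includes makes pr_err()/pr_info() prepend the prefix automatically, which is why the repeated "Kprobe smoke test ..." strings can be dropped from every call site. A minimal sketch of the mechanism:

    /* pr_err(fmt, ...) expands to printk(KERN_ERR pr_fmt(fmt), ...), so a
     * per-file pr_fmt definition prefixes every message in that file. */
    #define pr_fmt(fmt) "Kprobe smoke test: " fmt

    #include <linux/kernel.h>
    #include <linux/printk.h>

    static void example(void)
    {
            /* Emits: "Kprobe smoke test: register_kprobe returned -22" */
            pr_err("register_kprobe returned %d\n", -22);
    }
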
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index f448513a45ed..d626dc98e8df 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig | |||
@@ -12,6 +12,11 @@ config CLOCKSOURCE_WATCHDOG | |||
12 | config ARCH_CLOCKSOURCE_DATA | 12 | config ARCH_CLOCKSOURCE_DATA |
13 | bool | 13 | bool |
14 | 14 | ||
15 | # Clocksources require validation of the clocksource against the last | ||
16 | # cycle update - x86/TSC misfeature | ||
17 | config CLOCKSOURCE_VALIDATE_LAST_CYCLE | ||
18 | bool | ||
19 | |||
15 | # Timekeeping vsyscall support | 20 | # Timekeeping vsyscall support |
16 | config GENERIC_TIME_VSYSCALL | 21 | config GENERIC_TIME_VSYSCALL |
17 | bool | 22 | bool |
@@ -20,10 +25,6 @@ config GENERIC_TIME_VSYSCALL | |||
20 | config GENERIC_TIME_VSYSCALL_OLD | 25 | config GENERIC_TIME_VSYSCALL_OLD |
21 | bool | 26 | bool |
22 | 27 | ||
23 | # ktime_t scalar 64bit nsec representation | ||
24 | config KTIME_SCALAR | ||
25 | bool | ||
26 | |||
27 | # Old style timekeeping | 28 | # Old style timekeeping |
28 | config ARCH_USES_GETTIMEOFFSET | 29 | config ARCH_USES_GETTIMEOFFSET |
29 | bool | 30 | bool |
diff --git a/kernel/time/Makefile b/kernel/time/Makefile index 57a413fd0ebf..7347426fa68d 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile | |||
@@ -1,3 +1,4 @@ | |||
1 | obj-y += time.o timer.o hrtimer.o itimer.o posix-timers.o posix-cpu-timers.o | ||
1 | obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o | 2 | obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o |
2 | obj-y += timeconv.o posix-clock.o alarmtimer.o | 3 | obj-y += timeconv.o posix-clock.o alarmtimer.o |
3 | 4 | ||
@@ -12,3 +13,21 @@ obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o | |||
12 | obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o | 13 | obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o |
13 | obj-$(CONFIG_TIMER_STATS) += timer_stats.o | 14 | obj-$(CONFIG_TIMER_STATS) += timer_stats.o |
14 | obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o | 15 | obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o |
16 | obj-$(CONFIG_TEST_UDELAY) += udelay_test.o | ||
17 | |||
18 | $(obj)/time.o: $(obj)/timeconst.h | ||
19 | |||
20 | quiet_cmd_hzfile = HZFILE $@ | ||
21 | cmd_hzfile = echo "hz=$(CONFIG_HZ)" > $@ | ||
22 | |||
23 | targets += hz.bc | ||
24 | $(obj)/hz.bc: $(objtree)/include/config/hz.h FORCE | ||
25 | $(call if_changed,hzfile) | ||
26 | |||
27 | quiet_cmd_bc = BC $@ | ||
28 | cmd_bc = bc -q $(filter-out FORCE,$^) > $@ | ||
29 | |||
30 | targets += timeconst.h | ||
31 | $(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE | ||
32 | $(call if_changed,bc) | ||
33 | |||
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index fe75444ae7ec..4aec4a457431 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c | |||
@@ -71,7 +71,7 @@ struct rtc_device *alarmtimer_get_rtcdev(void) | |||
71 | 71 | ||
72 | return ret; | 72 | return ret; |
73 | } | 73 | } |
74 | 74 | EXPORT_SYMBOL_GPL(alarmtimer_get_rtcdev); | |
75 | 75 | ||
76 | static int alarmtimer_rtc_add_device(struct device *dev, | 76 | static int alarmtimer_rtc_add_device(struct device *dev, |
77 | struct class_interface *class_intf) | 77 | struct class_interface *class_intf) |
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index ba3e502c955a..2e949cc9c9f1 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c | |||
@@ -32,6 +32,7 @@ | |||
32 | #include <linux/kthread.h> | 32 | #include <linux/kthread.h> |
33 | 33 | ||
34 | #include "tick-internal.h" | 34 | #include "tick-internal.h" |
35 | #include "timekeeping_internal.h" | ||
35 | 36 | ||
36 | void timecounter_init(struct timecounter *tc, | 37 | void timecounter_init(struct timecounter *tc, |
37 | const struct cyclecounter *cc, | 38 | const struct cyclecounter *cc, |
@@ -249,7 +250,7 @@ void clocksource_mark_unstable(struct clocksource *cs) | |||
249 | static void clocksource_watchdog(unsigned long data) | 250 | static void clocksource_watchdog(unsigned long data) |
250 | { | 251 | { |
251 | struct clocksource *cs; | 252 | struct clocksource *cs; |
252 | cycle_t csnow, wdnow; | 253 | cycle_t csnow, wdnow, delta; |
253 | int64_t wd_nsec, cs_nsec; | 254 | int64_t wd_nsec, cs_nsec; |
254 | int next_cpu, reset_pending; | 255 | int next_cpu, reset_pending; |
255 | 256 | ||
@@ -282,11 +283,12 @@ static void clocksource_watchdog(unsigned long data) | |||
282 | continue; | 283 | continue; |
283 | } | 284 | } |
284 | 285 | ||
285 | wd_nsec = clocksource_cyc2ns((wdnow - cs->wd_last) & watchdog->mask, | 286 | delta = clocksource_delta(wdnow, cs->wd_last, watchdog->mask); |
286 | watchdog->mult, watchdog->shift); | 287 | wd_nsec = clocksource_cyc2ns(delta, watchdog->mult, |
288 | watchdog->shift); | ||
287 | 289 | ||
288 | cs_nsec = clocksource_cyc2ns((csnow - cs->cs_last) & | 290 | delta = clocksource_delta(csnow, cs->cs_last, cs->mask); |
289 | cs->mask, cs->mult, cs->shift); | 291 | cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift); |
290 | cs->cs_last = csnow; | 292 | cs->cs_last = csnow; |
291 | cs->wd_last = wdnow; | 293 | cs->wd_last = wdnow; |
292 | 294 | ||
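The watchdog hunk above replaces the open-coded '(wdnow - cs->wd_last) & watchdog->mask' arithmetic with clocksource_delta(), pulled in through the new timekeeping_internal.h include. A minimal sketch of what such a helper plausibly looks like, with the clamp gated by the CLOCKSOURCE_VALIDATE_LAST_CYCLE option introduced in the Kconfig hunk earlier (kernel context assumed; not the verbatim implementation):

static inline cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask)
{
	cycle_t ret = (now - last) & mask;

#ifdef CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE
	/* A readout that appears to have gone backwards (the x86/TSC
	 * misfeature mentioned in the Kconfig comment) would otherwise turn
	 * into a huge bogus delta; treat it as no elapsed cycles instead. */
	return (s64) ret > 0 ? ret : 0;
#else
	return ret;
#endif
}

Centralising the masking in one helper keeps the watchdog and the timekeeping core (changed further down in this series) in agreement about how a cycle delta is computed.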
diff --git a/kernel/hrtimer.c b/kernel/time/hrtimer.c index 3ab28993f6e0..1c2fe7de2842 100644 --- a/kernel/hrtimer.c +++ b/kernel/time/hrtimer.c | |||
@@ -54,6 +54,8 @@ | |||
54 | 54 | ||
55 | #include <trace/events/timer.h> | 55 | #include <trace/events/timer.h> |
56 | 56 | ||
57 | #include "timekeeping.h" | ||
58 | |||
57 | /* | 59 | /* |
58 | * The timer bases: | 60 | * The timer bases: |
59 | * | 61 | * |
@@ -114,21 +116,18 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id) | |||
114 | */ | 116 | */ |
115 | static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) | 117 | static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) |
116 | { | 118 | { |
117 | ktime_t xtim, mono, boot; | 119 | ktime_t xtim, mono, boot, tai; |
118 | struct timespec xts, tom, slp; | 120 | ktime_t off_real, off_boot, off_tai; |
119 | s32 tai_offset; | ||
120 | 121 | ||
121 | get_xtime_and_monotonic_and_sleep_offset(&xts, &tom, &slp); | 122 | mono = ktime_get_update_offsets_tick(&off_real, &off_boot, &off_tai); |
122 | tai_offset = timekeeping_get_tai_offset(); | 123 | boot = ktime_add(mono, off_boot); |
124 | xtim = ktime_add(mono, off_real); | ||
125 | tai = ktime_add(xtim, off_tai); | ||
123 | 126 | ||
124 | xtim = timespec_to_ktime(xts); | ||
125 | mono = ktime_add(xtim, timespec_to_ktime(tom)); | ||
126 | boot = ktime_add(mono, timespec_to_ktime(slp)); | ||
127 | base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim; | 127 | base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim; |
128 | base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono; | 128 | base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono; |
129 | base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot; | 129 | base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot; |
130 | base->clock_base[HRTIMER_BASE_TAI].softirq_time = | 130 | base->clock_base[HRTIMER_BASE_TAI].softirq_time = tai; |
131 | ktime_add(xtim, ktime_set(tai_offset, 0)); | ||
132 | } | 131 | } |
133 | 132 | ||
134 | /* | 133 | /* |
@@ -264,60 +263,6 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) | |||
264 | * too large for inlining: | 263 | * too large for inlining: |
265 | */ | 264 | */ |
266 | #if BITS_PER_LONG < 64 | 265 | #if BITS_PER_LONG < 64 |
267 | # ifndef CONFIG_KTIME_SCALAR | ||
268 | /** | ||
269 | * ktime_add_ns - Add a scalar nanoseconds value to a ktime_t variable | ||
270 | * @kt: addend | ||
271 | * @nsec: the scalar nsec value to add | ||
272 | * | ||
273 | * Returns the sum of kt and nsec in ktime_t format | ||
274 | */ | ||
275 | ktime_t ktime_add_ns(const ktime_t kt, u64 nsec) | ||
276 | { | ||
277 | ktime_t tmp; | ||
278 | |||
279 | if (likely(nsec < NSEC_PER_SEC)) { | ||
280 | tmp.tv64 = nsec; | ||
281 | } else { | ||
282 | unsigned long rem = do_div(nsec, NSEC_PER_SEC); | ||
283 | |||
284 | /* Make sure nsec fits into long */ | ||
285 | if (unlikely(nsec > KTIME_SEC_MAX)) | ||
286 | return (ktime_t){ .tv64 = KTIME_MAX }; | ||
287 | |||
288 | tmp = ktime_set((long)nsec, rem); | ||
289 | } | ||
290 | |||
291 | return ktime_add(kt, tmp); | ||
292 | } | ||
293 | |||
294 | EXPORT_SYMBOL_GPL(ktime_add_ns); | ||
295 | |||
296 | /** | ||
297 | * ktime_sub_ns - Subtract a scalar nanoseconds value from a ktime_t variable | ||
298 | * @kt: minuend | ||
299 | * @nsec: the scalar nsec value to subtract | ||
300 | * | ||
301 | * Returns the subtraction of @nsec from @kt in ktime_t format | ||
302 | */ | ||
303 | ktime_t ktime_sub_ns(const ktime_t kt, u64 nsec) | ||
304 | { | ||
305 | ktime_t tmp; | ||
306 | |||
307 | if (likely(nsec < NSEC_PER_SEC)) { | ||
308 | tmp.tv64 = nsec; | ||
309 | } else { | ||
310 | unsigned long rem = do_div(nsec, NSEC_PER_SEC); | ||
311 | |||
312 | tmp = ktime_set((long)nsec, rem); | ||
313 | } | ||
314 | |||
315 | return ktime_sub(kt, tmp); | ||
316 | } | ||
317 | |||
318 | EXPORT_SYMBOL_GPL(ktime_sub_ns); | ||
319 | # endif /* !CONFIG_KTIME_SCALAR */ | ||
320 | |||
321 | /* | 266 | /* |
322 | * Divide a ktime value by a nanosecond value | 267 | * Divide a ktime value by a nanosecond value |
323 | */ | 268 | */ |
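The ktime_add_ns()/ktime_sub_ns() fallbacks deleted above only existed for 32-bit builds without KTIME_SCALAR. With that option removed in the Kconfig hunk earlier, ktime_t is a 64-bit nanosecond scalar on every architecture, so the generic helpers reduce to plain integer arithmetic, roughly like this illustrative sketch (made-up name, not the kernel's actual inline):

static inline ktime_t sketch_ktime_add_ns(ktime_t kt, u64 nsec)
{
	ktime_t res;

	/* No seconds/nanoseconds split to maintain any more: just add,
	 * accepting the same overflow behaviour as 64-bit addition. */
	res.tv64 = kt.tv64 + nsec;
	return res;
}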
@@ -337,6 +282,7 @@ u64 ktime_divns(const ktime_t kt, s64 div) | |||
337 | 282 | ||
338 | return dclc; | 283 | return dclc; |
339 | } | 284 | } |
285 | EXPORT_SYMBOL_GPL(ktime_divns); | ||
340 | #endif /* BITS_PER_LONG >= 64 */ | 286 | #endif /* BITS_PER_LONG >= 64 */ |
341 | 287 | ||
342 | /* | 288 | /* |
@@ -602,6 +548,11 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) | |||
602 | * timers, we have to check, whether it expires earlier than the timer for | 548 | * timers, we have to check, whether it expires earlier than the timer for |
603 | * which the clock event device was armed. | 549 | * which the clock event device was armed. |
604 | * | 550 | * |
551 | * Note that in case the state has HRTIMER_STATE_CALLBACK set, no reprogramming | ||
552 | * and no expiry check happens. The timer gets enqueued into the rbtree. The | ||
553 | * reprogramming and expiry check is done in the hrtimer_interrupt or in the | ||
554 | * softirq. | ||
555 | * | ||
605 | * Called with interrupts disabled and base->cpu_base.lock held | 556 | * Called with interrupts disabled and base->cpu_base.lock held |
606 | */ | 557 | */ |
607 | static int hrtimer_reprogram(struct hrtimer *timer, | 558 | static int hrtimer_reprogram(struct hrtimer *timer, |
@@ -662,25 +613,13 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) | |||
662 | base->hres_active = 0; | 613 | base->hres_active = 0; |
663 | } | 614 | } |
664 | 615 | ||
665 | /* | ||
666 | * When High resolution timers are active, try to reprogram. Note, that in case | ||
667 | * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry | ||
668 | * check happens. The timer gets enqueued into the rbtree. The reprogramming | ||
669 | * and expiry check is done in the hrtimer_interrupt or in the softirq. | ||
670 | */ | ||
671 | static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, | ||
672 | struct hrtimer_clock_base *base) | ||
673 | { | ||
674 | return base->cpu_base->hres_active && hrtimer_reprogram(timer, base); | ||
675 | } | ||
676 | |||
677 | static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) | 616 | static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) |
678 | { | 617 | { |
679 | ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; | 618 | ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; |
680 | ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; | 619 | ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; |
681 | ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset; | 620 | ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset; |
682 | 621 | ||
683 | return ktime_get_update_offsets(offs_real, offs_boot, offs_tai); | 622 | return ktime_get_update_offsets_now(offs_real, offs_boot, offs_tai); |
684 | } | 623 | } |
685 | 624 | ||
686 | /* | 625 | /* |
@@ -755,8 +694,8 @@ static inline int hrtimer_is_hres_enabled(void) { return 0; } | |||
755 | static inline int hrtimer_switch_to_hres(void) { return 0; } | 694 | static inline int hrtimer_switch_to_hres(void) { return 0; } |
756 | static inline void | 695 | static inline void |
757 | hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { } | 696 | hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { } |
758 | static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, | 697 | static inline int hrtimer_reprogram(struct hrtimer *timer, |
759 | struct hrtimer_clock_base *base) | 698 | struct hrtimer_clock_base *base) |
760 | { | 699 | { |
761 | return 0; | 700 | return 0; |
762 | } | 701 | } |
@@ -1013,14 +952,25 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, | |||
1013 | 952 | ||
1014 | leftmost = enqueue_hrtimer(timer, new_base); | 953 | leftmost = enqueue_hrtimer(timer, new_base); |
1015 | 954 | ||
1016 | /* | 955 | if (!leftmost) { |
1017 | * Only allow reprogramming if the new base is on this CPU. | 956 | unlock_hrtimer_base(timer, &flags); |
1018 | * (it might still be on another CPU if the timer was pending) | 957 | return ret; |
1019 | * | 958 | } |
1020 | * XXX send_remote_softirq() ? | 959 | |
1021 | */ | 960 | if (!hrtimer_is_hres_active(timer)) { |
1022 | if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases) | 961 | /* |
1023 | && hrtimer_enqueue_reprogram(timer, new_base)) { | 962 | * Kick to reschedule the next tick to handle the new timer |
963 | * on dynticks target. | ||
964 | */ | ||
965 | wake_up_nohz_cpu(new_base->cpu_base->cpu); | ||
966 | } else if (new_base->cpu_base == &__get_cpu_var(hrtimer_bases) && | ||
967 | hrtimer_reprogram(timer, new_base)) { | ||
968 | /* | ||
969 | * Only allow reprogramming if the new base is on this CPU. | ||
970 | * (it might still be on another CPU if the timer was pending) | ||
971 | * | ||
972 | * XXX send_remote_softirq() ? | ||
973 | */ | ||
1024 | if (wakeup) { | 974 | if (wakeup) { |
1025 | /* | 975 | /* |
1026 | * We need to drop cpu_base->lock to avoid a | 976 | * We need to drop cpu_base->lock to avoid a |
@@ -1680,6 +1630,7 @@ static void init_hrtimers_cpu(int cpu) | |||
1680 | timerqueue_init_head(&cpu_base->clock_base[i].active); | 1630 | timerqueue_init_head(&cpu_base->clock_base[i].active); |
1681 | } | 1631 | } |
1682 | 1632 | ||
1633 | cpu_base->cpu = cpu; | ||
1683 | hrtimer_init_hres(cpu_base); | 1634 | hrtimer_init_hres(cpu_base); |
1684 | } | 1635 | } |
1685 | 1636 | ||
diff --git a/kernel/itimer.c b/kernel/time/itimer.c index 8d262b467573..8d262b467573 100644 --- a/kernel/itimer.c +++ b/kernel/time/itimer.c | |||
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 33db43a39515..87a346fd6d61 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c | |||
@@ -466,7 +466,8 @@ static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock); | |||
466 | 466 | ||
467 | static void sync_cmos_clock(struct work_struct *work) | 467 | static void sync_cmos_clock(struct work_struct *work) |
468 | { | 468 | { |
469 | struct timespec now, next; | 469 | struct timespec64 now; |
470 | struct timespec next; | ||
470 | int fail = 1; | 471 | int fail = 1; |
471 | 472 | ||
472 | /* | 473 | /* |
@@ -485,9 +486,9 @@ static void sync_cmos_clock(struct work_struct *work) | |||
485 | return; | 486 | return; |
486 | } | 487 | } |
487 | 488 | ||
488 | getnstimeofday(&now); | 489 | getnstimeofday64(&now); |
489 | if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec * 5) { | 490 | if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec * 5) { |
490 | struct timespec adjust = now; | 491 | struct timespec adjust = timespec64_to_timespec(now); |
491 | 492 | ||
492 | fail = -ENODEV; | 493 | fail = -ENODEV; |
493 | if (persistent_clock_is_local) | 494 | if (persistent_clock_is_local) |
@@ -531,7 +532,7 @@ void ntp_notify_cmos_timer(void) { } | |||
531 | /* | 532 | /* |
532 | * Propagate a new txc->status value into the NTP state: | 533 | * Propagate a new txc->status value into the NTP state: |
533 | */ | 534 | */ |
534 | static inline void process_adj_status(struct timex *txc, struct timespec *ts) | 535 | static inline void process_adj_status(struct timex *txc, struct timespec64 *ts) |
535 | { | 536 | { |
536 | if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) { | 537 | if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) { |
537 | time_state = TIME_OK; | 538 | time_state = TIME_OK; |
@@ -554,7 +555,7 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts) | |||
554 | 555 | ||
555 | 556 | ||
556 | static inline void process_adjtimex_modes(struct timex *txc, | 557 | static inline void process_adjtimex_modes(struct timex *txc, |
557 | struct timespec *ts, | 558 | struct timespec64 *ts, |
558 | s32 *time_tai) | 559 | s32 *time_tai) |
559 | { | 560 | { |
560 | if (txc->modes & ADJ_STATUS) | 561 | if (txc->modes & ADJ_STATUS) |
@@ -640,7 +641,7 @@ int ntp_validate_timex(struct timex *txc) | |||
640 | * adjtimex mainly allows reading (and writing, if superuser) of | 641 | * adjtimex mainly allows reading (and writing, if superuser) of |
641 | * kernel time-keeping variables. used by xntpd. | 642 | * kernel time-keeping variables. used by xntpd. |
642 | */ | 643 | */ |
643 | int __do_adjtimex(struct timex *txc, struct timespec *ts, s32 *time_tai) | 644 | int __do_adjtimex(struct timex *txc, struct timespec64 *ts, s32 *time_tai) |
644 | { | 645 | { |
645 | int result; | 646 | int result; |
646 | 647 | ||
@@ -684,7 +685,7 @@ int __do_adjtimex(struct timex *txc, struct timespec *ts, s32 *time_tai) | |||
684 | /* fill PPS status fields */ | 685 | /* fill PPS status fields */ |
685 | pps_fill_timex(txc); | 686 | pps_fill_timex(txc); |
686 | 687 | ||
687 | txc->time.tv_sec = ts->tv_sec; | 688 | txc->time.tv_sec = (time_t)ts->tv_sec; |
688 | txc->time.tv_usec = ts->tv_nsec; | 689 | txc->time.tv_usec = ts->tv_nsec; |
689 | if (!(time_status & STA_NANO)) | 690 | if (!(time_status & STA_NANO)) |
690 | txc->time.tv_usec /= NSEC_PER_USEC; | 691 | txc->time.tv_usec /= NSEC_PER_USEC; |
diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h index 1950cb4ca2a4..bbd102ad9df7 100644 --- a/kernel/time/ntp_internal.h +++ b/kernel/time/ntp_internal.h | |||
@@ -7,6 +7,6 @@ extern void ntp_clear(void); | |||
7 | extern u64 ntp_tick_length(void); | 7 | extern u64 ntp_tick_length(void); |
8 | extern int second_overflow(unsigned long secs); | 8 | extern int second_overflow(unsigned long secs); |
9 | extern int ntp_validate_timex(struct timex *); | 9 | extern int ntp_validate_timex(struct timex *); |
10 | extern int __do_adjtimex(struct timex *, struct timespec *, s32 *); | 10 | extern int __do_adjtimex(struct timex *, struct timespec64 *, s32 *); |
11 | extern void __hardpps(const struct timespec *, const struct timespec *); | 11 | extern void __hardpps(const struct timespec *, const struct timespec *); |
12 | #endif /* _LINUX_NTP_INTERNAL_H */ | 12 | #endif /* _LINUX_NTP_INTERNAL_H */ |
diff --git a/kernel/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 3b8946416a5f..3b8946416a5f 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c | |||
diff --git a/kernel/posix-timers.c b/kernel/time/posix-timers.c index 424c2d4265c9..42b463ad90f2 100644 --- a/kernel/posix-timers.c +++ b/kernel/time/posix-timers.c | |||
@@ -49,6 +49,8 @@ | |||
49 | #include <linux/export.h> | 49 | #include <linux/export.h> |
50 | #include <linux/hashtable.h> | 50 | #include <linux/hashtable.h> |
51 | 51 | ||
52 | #include "timekeeping.h" | ||
53 | |||
52 | /* | 54 | /* |
53 | * Management arrays for POSIX timers. Timers are now kept in static hash table | 55 | * Management arrays for POSIX timers. Timers are now kept in static hash table |
54 | * with 512 entries. | 56 | * with 512 entries. |
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 7ab92b19965a..c19c1d84b6f3 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h | |||
@@ -4,6 +4,8 @@ | |||
4 | #include <linux/hrtimer.h> | 4 | #include <linux/hrtimer.h> |
5 | #include <linux/tick.h> | 5 | #include <linux/tick.h> |
6 | 6 | ||
7 | #include "timekeeping.h" | ||
8 | |||
7 | extern seqlock_t jiffies_lock; | 9 | extern seqlock_t jiffies_lock; |
8 | 10 | ||
9 | #define CS_NAME_LEN 32 | 11 | #define CS_NAME_LEN 32 |
diff --git a/kernel/time.c b/kernel/time/time.c index 7c7964c33ae7..f0294ba14634 100644 --- a/kernel/time.c +++ b/kernel/time/time.c | |||
@@ -42,6 +42,7 @@ | |||
42 | #include <asm/unistd.h> | 42 | #include <asm/unistd.h> |
43 | 43 | ||
44 | #include "timeconst.h" | 44 | #include "timeconst.h" |
45 | #include "timekeeping.h" | ||
45 | 46 | ||
46 | /* | 47 | /* |
47 | * The timezone where the local system is located. Used as a default by some | 48 | * The timezone where the local system is located. Used as a default by some |
@@ -420,6 +421,68 @@ struct timeval ns_to_timeval(const s64 nsec) | |||
420 | } | 421 | } |
421 | EXPORT_SYMBOL(ns_to_timeval); | 422 | EXPORT_SYMBOL(ns_to_timeval); |
422 | 423 | ||
424 | #if BITS_PER_LONG == 32 | ||
425 | /** | ||
426 | * set_normalized_timespec64 - set timespec64 sec and nsec parts and normalize | ||
427 | * | ||
428 | * @ts: pointer to timespec variable to be set | ||
429 | * @sec: seconds to set | ||
430 | * @nsec: nanoseconds to set | ||
431 | * | ||
432 | * Set seconds and nanoseconds field of a timespec variable and | ||
433 | * normalize to the timespec storage format | ||
434 | * | ||
435 | * Note: The tv_nsec part is always in the range of | ||
436 | * 0 <= tv_nsec < NSEC_PER_SEC | ||
437 | * For negative values only the tv_sec field is negative ! | ||
438 | */ | ||
439 | void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 nsec) | ||
440 | { | ||
441 | while (nsec >= NSEC_PER_SEC) { | ||
442 | /* | ||
443 | * The following asm() prevents the compiler from | ||
444 | * optimising this loop into a modulo operation. See | ||
445 | * also __iter_div_u64_rem() in include/linux/time.h | ||
446 | */ | ||
447 | asm("" : "+rm"(nsec)); | ||
448 | nsec -= NSEC_PER_SEC; | ||
449 | ++sec; | ||
450 | } | ||
451 | while (nsec < 0) { | ||
452 | asm("" : "+rm"(nsec)); | ||
453 | nsec += NSEC_PER_SEC; | ||
454 | --sec; | ||
455 | } | ||
456 | ts->tv_sec = sec; | ||
457 | ts->tv_nsec = nsec; | ||
458 | } | ||
459 | EXPORT_SYMBOL(set_normalized_timespec64); | ||
460 | |||
461 | /** | ||
462 | * ns_to_timespec64 - Convert nanoseconds to timespec64 | ||
463 | * @nsec: the nanoseconds value to be converted | ||
464 | * | ||
465 | * Returns the timespec64 representation of the nsec parameter. | ||
466 | */ | ||
467 | struct timespec64 ns_to_timespec64(const s64 nsec) | ||
468 | { | ||
469 | struct timespec64 ts; | ||
470 | s32 rem; | ||
471 | |||
472 | if (!nsec) | ||
473 | return (struct timespec64) {0, 0}; | ||
474 | |||
475 | ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem); | ||
476 | if (unlikely(rem < 0)) { | ||
477 | ts.tv_sec--; | ||
478 | rem += NSEC_PER_SEC; | ||
479 | } | ||
480 | ts.tv_nsec = rem; | ||
481 | |||
482 | return ts; | ||
483 | } | ||
484 | EXPORT_SYMBOL(ns_to_timespec64); | ||
485 | #endif | ||
423 | /* | 486 | /* |
424 | * When we convert to jiffies then we interpret incoming values | 487 | * When we convert to jiffies then we interpret incoming values |
425 | * the following way: | 488 | * the following way: |
@@ -694,6 +757,7 @@ unsigned long nsecs_to_jiffies(u64 n) | |||
694 | { | 757 | { |
695 | return (unsigned long)nsecs_to_jiffies64(n); | 758 | return (unsigned long)nsecs_to_jiffies64(n); |
696 | } | 759 | } |
760 | EXPORT_SYMBOL_GPL(nsecs_to_jiffies); | ||
697 | 761 | ||
698 | /* | 762 | /* |
699 | * Add two timespec values and do a safety check for overflow. | 763 | * Add two timespec values and do a safety check for overflow. |
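The 32-bit-only helpers added above keep tv_nsec in the range [0, NSEC_PER_SEC) and let tv_sec alone carry the sign. A small usage sketch (hypothetical caller; the expected values follow directly from the code above):

static void example_ns_split(void)
{
	struct timespec64 ts = ns_to_timespec64(-1500000000LL);	/* -1.5 s */
	/* ts.tv_sec == -2, ts.tv_nsec == 500000000: only tv_sec is negative */

	set_normalized_timespec64(&ts, 0, 2 * NSEC_PER_SEC + 1);
	/* normalized to ts.tv_sec == 2, ts.tv_nsec == 1 */
}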
diff --git a/kernel/timeconst.bc b/kernel/time/timeconst.bc index 511bdf2cafda..511bdf2cafda 100644 --- a/kernel/timeconst.bc +++ b/kernel/time/timeconst.bc | |||
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 32d8d6aaedb8..fb4a9c2cf8d9 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
@@ -32,11 +32,34 @@ | |||
32 | #define TK_MIRROR (1 << 1) | 32 | #define TK_MIRROR (1 << 1) |
33 | #define TK_CLOCK_WAS_SET (1 << 2) | 33 | #define TK_CLOCK_WAS_SET (1 << 2) |
34 | 34 | ||
35 | static struct timekeeper timekeeper; | 35 | /* |
36 | * The most important data for readout fits into a single 64 byte | ||
37 | * cache line. | ||
38 | */ | ||
39 | static struct { | ||
40 | seqcount_t seq; | ||
41 | struct timekeeper timekeeper; | ||
42 | } tk_core ____cacheline_aligned; | ||
43 | |||
36 | static DEFINE_RAW_SPINLOCK(timekeeper_lock); | 44 | static DEFINE_RAW_SPINLOCK(timekeeper_lock); |
37 | static seqcount_t timekeeper_seq; | ||
38 | static struct timekeeper shadow_timekeeper; | 45 | static struct timekeeper shadow_timekeeper; |
39 | 46 | ||
47 | /** | ||
48 | * struct tk_fast - NMI safe timekeeper | ||
49 | * @seq: Sequence counter for protecting updates. The lowest bit | ||
50 | * is the index for the tk_read_base array | ||
51 | * @base: tk_read_base array. Access is indexed by the lowest bit of | ||
52 | * @seq. | ||
53 | * | ||
54 | * See @update_fast_timekeeper() below. | ||
55 | */ | ||
56 | struct tk_fast { | ||
57 | seqcount_t seq; | ||
58 | struct tk_read_base base[2]; | ||
59 | }; | ||
60 | |||
61 | static struct tk_fast tk_fast_mono ____cacheline_aligned; | ||
62 | |||
40 | /* flag for if timekeeping is suspended */ | 63 | /* flag for if timekeeping is suspended */ |
41 | int __read_mostly timekeeping_suspended; | 64 | int __read_mostly timekeeping_suspended; |
42 | 65 | ||
@@ -45,49 +68,54 @@ bool __read_mostly persistent_clock_exist = false; | |||
45 | 68 | ||
46 | static inline void tk_normalize_xtime(struct timekeeper *tk) | 69 | static inline void tk_normalize_xtime(struct timekeeper *tk) |
47 | { | 70 | { |
48 | while (tk->xtime_nsec >= ((u64)NSEC_PER_SEC << tk->shift)) { | 71 | while (tk->tkr.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr.shift)) { |
49 | tk->xtime_nsec -= (u64)NSEC_PER_SEC << tk->shift; | 72 | tk->tkr.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr.shift; |
50 | tk->xtime_sec++; | 73 | tk->xtime_sec++; |
51 | } | 74 | } |
52 | } | 75 | } |
53 | 76 | ||
54 | static void tk_set_xtime(struct timekeeper *tk, const struct timespec *ts) | 77 | static inline struct timespec64 tk_xtime(struct timekeeper *tk) |
78 | { | ||
79 | struct timespec64 ts; | ||
80 | |||
81 | ts.tv_sec = tk->xtime_sec; | ||
82 | ts.tv_nsec = (long)(tk->tkr.xtime_nsec >> tk->tkr.shift); | ||
83 | return ts; | ||
84 | } | ||
85 | |||
86 | static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts) | ||
55 | { | 87 | { |
56 | tk->xtime_sec = ts->tv_sec; | 88 | tk->xtime_sec = ts->tv_sec; |
57 | tk->xtime_nsec = (u64)ts->tv_nsec << tk->shift; | 89 | tk->tkr.xtime_nsec = (u64)ts->tv_nsec << tk->tkr.shift; |
58 | } | 90 | } |
59 | 91 | ||
60 | static void tk_xtime_add(struct timekeeper *tk, const struct timespec *ts) | 92 | static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts) |
61 | { | 93 | { |
62 | tk->xtime_sec += ts->tv_sec; | 94 | tk->xtime_sec += ts->tv_sec; |
63 | tk->xtime_nsec += (u64)ts->tv_nsec << tk->shift; | 95 | tk->tkr.xtime_nsec += (u64)ts->tv_nsec << tk->tkr.shift; |
64 | tk_normalize_xtime(tk); | 96 | tk_normalize_xtime(tk); |
65 | } | 97 | } |
66 | 98 | ||
67 | static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec wtm) | 99 | static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm) |
68 | { | 100 | { |
69 | struct timespec tmp; | 101 | struct timespec64 tmp; |
70 | 102 | ||
71 | /* | 103 | /* |
72 | * Verify consistency of: offset_real = -wall_to_monotonic | 104 | * Verify consistency of: offset_real = -wall_to_monotonic |
73 | * before modifying anything | 105 | * before modifying anything |
74 | */ | 106 | */ |
75 | set_normalized_timespec(&tmp, -tk->wall_to_monotonic.tv_sec, | 107 | set_normalized_timespec64(&tmp, -tk->wall_to_monotonic.tv_sec, |
76 | -tk->wall_to_monotonic.tv_nsec); | 108 | -tk->wall_to_monotonic.tv_nsec); |
77 | WARN_ON_ONCE(tk->offs_real.tv64 != timespec_to_ktime(tmp).tv64); | 109 | WARN_ON_ONCE(tk->offs_real.tv64 != timespec64_to_ktime(tmp).tv64); |
78 | tk->wall_to_monotonic = wtm; | 110 | tk->wall_to_monotonic = wtm; |
79 | set_normalized_timespec(&tmp, -wtm.tv_sec, -wtm.tv_nsec); | 111 | set_normalized_timespec64(&tmp, -wtm.tv_sec, -wtm.tv_nsec); |
80 | tk->offs_real = timespec_to_ktime(tmp); | 112 | tk->offs_real = timespec64_to_ktime(tmp); |
81 | tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tk->tai_offset, 0)); | 113 | tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tk->tai_offset, 0)); |
82 | } | 114 | } |
83 | 115 | ||
84 | static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t) | 116 | static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) |
85 | { | 117 | { |
86 | /* Verify consistency before modifying */ | 118 | tk->offs_boot = ktime_add(tk->offs_boot, delta); |
87 | WARN_ON_ONCE(tk->offs_boot.tv64 != timespec_to_ktime(tk->total_sleep_time).tv64); | ||
88 | |||
89 | tk->total_sleep_time = t; | ||
90 | tk->offs_boot = timespec_to_ktime(t); | ||
91 | } | 119 | } |
92 | 120 | ||
93 | /** | 121 | /** |
@@ -107,9 +135,11 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) | |||
107 | u64 tmp, ntpinterval; | 135 | u64 tmp, ntpinterval; |
108 | struct clocksource *old_clock; | 136 | struct clocksource *old_clock; |
109 | 137 | ||
110 | old_clock = tk->clock; | 138 | old_clock = tk->tkr.clock; |
111 | tk->clock = clock; | 139 | tk->tkr.clock = clock; |
112 | tk->cycle_last = clock->cycle_last = clock->read(clock); | 140 | tk->tkr.read = clock->read; |
141 | tk->tkr.mask = clock->mask; | ||
142 | tk->tkr.cycle_last = tk->tkr.read(clock); | ||
113 | 143 | ||
114 | /* Do the ns -> cycle conversion first, using original mult */ | 144 | /* Do the ns -> cycle conversion first, using original mult */ |
115 | tmp = NTP_INTERVAL_LENGTH; | 145 | tmp = NTP_INTERVAL_LENGTH; |
@@ -133,78 +163,213 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) | |||
133 | if (old_clock) { | 163 | if (old_clock) { |
134 | int shift_change = clock->shift - old_clock->shift; | 164 | int shift_change = clock->shift - old_clock->shift; |
135 | if (shift_change < 0) | 165 | if (shift_change < 0) |
136 | tk->xtime_nsec >>= -shift_change; | 166 | tk->tkr.xtime_nsec >>= -shift_change; |
137 | else | 167 | else |
138 | tk->xtime_nsec <<= shift_change; | 168 | tk->tkr.xtime_nsec <<= shift_change; |
139 | } | 169 | } |
140 | tk->shift = clock->shift; | 170 | tk->tkr.shift = clock->shift; |
141 | 171 | ||
142 | tk->ntp_error = 0; | 172 | tk->ntp_error = 0; |
143 | tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift; | 173 | tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift; |
174 | tk->ntp_tick = ntpinterval << tk->ntp_error_shift; | ||
144 | 175 | ||
145 | /* | 176 | /* |
146 | * The timekeeper keeps its own mult values for the currently | 177 | * The timekeeper keeps its own mult values for the currently |
147 | * active clocksource. These value will be adjusted via NTP | 178 | * active clocksource. These value will be adjusted via NTP |
148 | * to counteract clock drifting. | 179 | * to counteract clock drifting. |
149 | */ | 180 | */ |
150 | tk->mult = clock->mult; | 181 | tk->tkr.mult = clock->mult; |
182 | tk->ntp_err_mult = 0; | ||
151 | } | 183 | } |
152 | 184 | ||
153 | /* Timekeeper helper functions. */ | 185 | /* Timekeeper helper functions. */ |
154 | 186 | ||
155 | #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET | 187 | #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET |
156 | u32 (*arch_gettimeoffset)(void); | 188 | static u32 default_arch_gettimeoffset(void) { return 0; } |
157 | 189 | u32 (*arch_gettimeoffset)(void) = default_arch_gettimeoffset; | |
158 | u32 get_arch_timeoffset(void) | ||
159 | { | ||
160 | if (likely(arch_gettimeoffset)) | ||
161 | return arch_gettimeoffset(); | ||
162 | return 0; | ||
163 | } | ||
164 | #else | 190 | #else |
165 | static inline u32 get_arch_timeoffset(void) { return 0; } | 191 | static inline u32 arch_gettimeoffset(void) { return 0; } |
166 | #endif | 192 | #endif |
167 | 193 | ||
168 | static inline s64 timekeeping_get_ns(struct timekeeper *tk) | 194 | static inline s64 timekeeping_get_ns(struct tk_read_base *tkr) |
169 | { | 195 | { |
170 | cycle_t cycle_now, cycle_delta; | 196 | cycle_t cycle_now, delta; |
171 | struct clocksource *clock; | ||
172 | s64 nsec; | 197 | s64 nsec; |
173 | 198 | ||
174 | /* read clocksource: */ | 199 | /* read clocksource: */ |
175 | clock = tk->clock; | 200 | cycle_now = tkr->read(tkr->clock); |
176 | cycle_now = clock->read(clock); | ||
177 | 201 | ||
178 | /* calculate the delta since the last update_wall_time: */ | 202 | /* calculate the delta since the last update_wall_time: */ |
179 | cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; | 203 | delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask); |
180 | 204 | ||
181 | nsec = cycle_delta * tk->mult + tk->xtime_nsec; | 205 | nsec = delta * tkr->mult + tkr->xtime_nsec; |
182 | nsec >>= tk->shift; | 206 | nsec >>= tkr->shift; |
183 | 207 | ||
184 | /* If arch requires, add in get_arch_timeoffset() */ | 208 | /* If arch requires, add in get_arch_timeoffset() */ |
185 | return nsec + get_arch_timeoffset(); | 209 | return nsec + arch_gettimeoffset(); |
186 | } | 210 | } |
187 | 211 | ||
188 | static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) | 212 | static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) |
189 | { | 213 | { |
190 | cycle_t cycle_now, cycle_delta; | 214 | struct clocksource *clock = tk->tkr.clock; |
191 | struct clocksource *clock; | 215 | cycle_t cycle_now, delta; |
192 | s64 nsec; | 216 | s64 nsec; |
193 | 217 | ||
194 | /* read clocksource: */ | 218 | /* read clocksource: */ |
195 | clock = tk->clock; | 219 | cycle_now = tk->tkr.read(clock); |
196 | cycle_now = clock->read(clock); | ||
197 | 220 | ||
198 | /* calculate the delta since the last update_wall_time: */ | 221 | /* calculate the delta since the last update_wall_time: */ |
199 | cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; | 222 | delta = clocksource_delta(cycle_now, tk->tkr.cycle_last, tk->tkr.mask); |
200 | 223 | ||
201 | /* convert delta to nanoseconds. */ | 224 | /* convert delta to nanoseconds. */ |
202 | nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); | 225 | nsec = clocksource_cyc2ns(delta, clock->mult, clock->shift); |
203 | 226 | ||
204 | /* If arch requires, add in get_arch_timeoffset() */ | 227 | /* If arch requires, add in get_arch_timeoffset() */ |
205 | return nsec + get_arch_timeoffset(); | 228 | return nsec + arch_gettimeoffset(); |
229 | } | ||
230 | |||
231 | /** | ||
232 | * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper. | ||
233 | * @tk: The timekeeper from which we take the update | ||
234 | * @tkf: The fast timekeeper to update | ||
235 | * @tbase: The time base for the fast timekeeper (mono/raw) | ||
236 | * | ||
237 | * We want to use this from any context including NMI and tracing / | ||
238 | * instrumenting the timekeeping code itself. | ||
239 | * | ||
240 | * So we handle this differently than the other timekeeping accessor | ||
241 | * functions which retry when the sequence count has changed. The | ||
242 | * update side does: | ||
243 | * | ||
244 | * smp_wmb(); <- Ensure that the last base[1] update is visible | ||
245 | * tkf->seq++; | ||
246 | * smp_wmb(); <- Ensure that the seqcount update is visible | ||
247 | * update(tkf->base[0], tk); | ||
248 | * smp_wmb(); <- Ensure that the base[0] update is visible | ||
249 | * tkf->seq++; | ||
250 | * smp_wmb(); <- Ensure that the seqcount update is visible | ||
251 | * update(tkf->base[1], tk); | ||
252 | * | ||
253 | * The reader side does: | ||
254 | * | ||
255 | * do { | ||
256 | * seq = tkf->seq; | ||
257 | * smp_rmb(); | ||
258 | * idx = seq & 0x01; | ||
259 | * now = now(tkf->base[idx]); | ||
260 | * smp_rmb(); | ||
261 | * } while (seq != tkf->seq) | ||
262 | * | ||
263 | * As long as we update base[0] readers are forced off to | ||
264 | * base[1]. Once base[0] is updated readers are redirected to base[0] | ||
265 | * and the base[1] update takes place. | ||
266 | * | ||
267 | * So if an NMI hits the update of base[0] then it will use base[1] | ||
268 | * which is still consistent. In the worst case this can result in a | ||
269 | * slightly wrong timestamp (a few nanoseconds). See | ||
270 | * @ktime_get_mono_fast_ns. | ||
271 | */ | ||
272 | static void update_fast_timekeeper(struct timekeeper *tk) | ||
273 | { | ||
274 | struct tk_read_base *base = tk_fast_mono.base; | ||
275 | |||
276 | /* Force readers off to base[1] */ | ||
277 | raw_write_seqcount_latch(&tk_fast_mono.seq); | ||
278 | |||
279 | /* Update base[0] */ | ||
280 | memcpy(base, &tk->tkr, sizeof(*base)); | ||
281 | |||
282 | /* Force readers back to base[0] */ | ||
283 | raw_write_seqcount_latch(&tk_fast_mono.seq); | ||
284 | |||
285 | /* Update base[1] */ | ||
286 | memcpy(base + 1, base, sizeof(*base)); | ||
206 | } | 287 | } |
207 | 288 | ||
289 | /** | ||
290 | * ktime_get_mono_fast_ns - Fast NMI safe access to clock monotonic | ||
291 | * | ||
292 | * This timestamp is not guaranteed to be monotonic across an update. | ||
293 | * The timestamp is calculated by: | ||
294 | * | ||
295 | * now = base_mono + clock_delta * slope | ||
296 | * | ||
297 | * So if the update lowers the slope, readers who are forced to the | ||
298 | * not yet updated second array are still using the old steeper slope. | ||
299 | * | ||
300 | * tmono | ||
301 | * ^ | ||
302 | * | o n | ||
303 | * | o n | ||
304 | * | u | ||
305 | * | o | ||
306 | * |o | ||
307 | * |12345678---> reader order | ||
308 | * | ||
309 | * o = old slope | ||
310 | * u = update | ||
311 | * n = new slope | ||
312 | * | ||
313 | * So reader 6 will observe time going backwards versus reader 5. | ||
314 | * | ||
315 | * While other CPUs are likely to be able to observe that, the only way | ||
316 | * for a CPU local observation is when an NMI hits in the middle of | ||
317 | * the update. Timestamps taken from that NMI context might be ahead | ||
318 | * of the following timestamps. Callers need to be aware of that and | ||
319 | * deal with it. | ||
320 | */ | ||
321 | u64 notrace ktime_get_mono_fast_ns(void) | ||
322 | { | ||
323 | struct tk_read_base *tkr; | ||
324 | unsigned int seq; | ||
325 | u64 now; | ||
326 | |||
327 | do { | ||
328 | seq = raw_read_seqcount(&tk_fast_mono.seq); | ||
329 | tkr = tk_fast_mono.base + (seq & 0x01); | ||
330 | now = ktime_to_ns(tkr->base_mono) + timekeeping_get_ns(tkr); | ||
331 | |||
332 | } while (read_seqcount_retry(&tk_fast_mono.seq, seq)); | ||
333 | return now; | ||
334 | } | ||
335 | EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns); | ||
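The update/reader protocol in the comment above is a latch scheme: the writer ping-pongs between two copies so a reader, even one running in NMI context, always finds a copy that is not being modified. A minimal stand-alone sketch of the same idea, using generic names and compiler/CPU fences in place of the kernel's raw_write_seqcount_latch() and smp_*mb() primitives (illustrative only):

struct sample { long long val; };

struct latch {
	unsigned int seq;		/* lowest bit selects the copy readers use */
	struct sample copy[2];
};

static void latch_write(struct latch *l, struct sample s)
{
	__atomic_thread_fence(__ATOMIC_SEQ_CST);	/* last copy[1] write visible */
	l->seq++;					/* odd: readers use copy[1] */
	__atomic_thread_fence(__ATOMIC_SEQ_CST);
	l->copy[0] = s;
	__atomic_thread_fence(__ATOMIC_SEQ_CST);
	l->seq++;					/* even: readers use copy[0] */
	__atomic_thread_fence(__ATOMIC_SEQ_CST);
	l->copy[1] = s;
}

static struct sample latch_read(const struct latch *l)
{
	struct sample s;
	unsigned int seq;

	do {
		seq = l->seq;
		__atomic_thread_fence(__ATOMIC_SEQ_CST);
		s = l->copy[seq & 1];	/* the copy the writer is not touching */
		__atomic_thread_fence(__ATOMIC_SEQ_CST);
	} while (seq != l->seq);

	return s;
}

In the timekeeping code, copy[] corresponds to tk_fast_mono.base[] and latch_read() to the retry loop in ktime_get_mono_fast_ns() above.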
336 | |||
337 | #ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD | ||
338 | |||
339 | static inline void update_vsyscall(struct timekeeper *tk) | ||
340 | { | ||
341 | struct timespec xt, wm; | ||
342 | |||
343 | xt = timespec64_to_timespec(tk_xtime(tk)); | ||
344 | wm = timespec64_to_timespec(tk->wall_to_monotonic); | ||
345 | update_vsyscall_old(&xt, &wm, tk->tkr.clock, tk->tkr.mult, | ||
346 | tk->tkr.cycle_last); | ||
347 | } | ||
348 | |||
349 | static inline void old_vsyscall_fixup(struct timekeeper *tk) | ||
350 | { | ||
351 | s64 remainder; | ||
352 | |||
353 | /* | ||
354 | * Store only full nanoseconds into xtime_nsec after rounding | ||
355 | * it up and add the remainder to the error difference. | ||
356 | * XXX - This is necessary to avoid small 1ns inconsistencies caused | ||
357 | * by truncating the remainder in vsyscalls. However, it causes | ||
358 | * additional work to be done in timekeeping_adjust(). Once | ||
359 | * the vsyscall implementations are converted to use xtime_nsec | ||
360 | * (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD | ||
361 | * users are removed, this can be killed. | ||
362 | */ | ||
363 | remainder = tk->tkr.xtime_nsec & ((1ULL << tk->tkr.shift) - 1); | ||
364 | tk->tkr.xtime_nsec -= remainder; | ||
365 | tk->tkr.xtime_nsec += 1ULL << tk->tkr.shift; | ||
366 | tk->ntp_error += remainder << tk->ntp_error_shift; | ||
367 | tk->ntp_error -= (1ULL << tk->tkr.shift) << tk->ntp_error_shift; | ||
368 | } | ||
369 | #else | ||
370 | #define old_vsyscall_fixup(tk) | ||
371 | #endif | ||
372 | |||
208 | static RAW_NOTIFIER_HEAD(pvclock_gtod_chain); | 373 | static RAW_NOTIFIER_HEAD(pvclock_gtod_chain); |
209 | 374 | ||
210 | static void update_pvclock_gtod(struct timekeeper *tk, bool was_set) | 375 | static void update_pvclock_gtod(struct timekeeper *tk, bool was_set) |
@@ -217,7 +382,7 @@ static void update_pvclock_gtod(struct timekeeper *tk, bool was_set) | |||
217 | */ | 382 | */ |
218 | int pvclock_gtod_register_notifier(struct notifier_block *nb) | 383 | int pvclock_gtod_register_notifier(struct notifier_block *nb) |
219 | { | 384 | { |
220 | struct timekeeper *tk = &timekeeper; | 385 | struct timekeeper *tk = &tk_core.timekeeper; |
221 | unsigned long flags; | 386 | unsigned long flags; |
222 | int ret; | 387 | int ret; |
223 | 388 | ||
@@ -247,6 +412,29 @@ int pvclock_gtod_unregister_notifier(struct notifier_block *nb) | |||
247 | } | 412 | } |
248 | EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); | 413 | EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); |
249 | 414 | ||
415 | /* | ||
416 | * Update the ktime_t based scalar nsec members of the timekeeper | ||
417 | */ | ||
418 | static inline void tk_update_ktime_data(struct timekeeper *tk) | ||
419 | { | ||
420 | s64 nsec; | ||
421 | |||
422 | /* | ||
423 | * The xtime based monotonic readout is: | ||
424 | * nsec = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec + now(); | ||
425 | * The ktime based monotonic readout is: | ||
426 | * nsec = base_mono + now(); | ||
427 | * ==> base_mono = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec | ||
428 | */ | ||
429 | nsec = (s64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec); | ||
430 | nsec *= NSEC_PER_SEC; | ||
431 | nsec += tk->wall_to_monotonic.tv_nsec; | ||
432 | tk->tkr.base_mono = ns_to_ktime(nsec); | ||
433 | |||
434 | /* Update the monotonic raw base */ | ||
435 | tk->base_raw = timespec64_to_ktime(tk->raw_time); | ||
436 | } | ||
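As a worked example of the identity in the comment above: with xtime_sec == 1000 and wall_to_monotonic == { -900, 0 } (illustrative numbers), base_mono = (1000 - 900) * NSEC_PER_SEC = 100 * 10^9 ns, so ktime_get() only has to add the nanoseconds accumulated since the last update (timekeeping_get_ns()) rather than summing xtime and wall_to_monotonic on every read, as the old ktime_get() did.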
437 | |||
250 | /* must hold timekeeper_lock */ | 438 | /* must hold timekeeper_lock */ |
251 | static void timekeeping_update(struct timekeeper *tk, unsigned int action) | 439 | static void timekeeping_update(struct timekeeper *tk, unsigned int action) |
252 | { | 440 | { |
@@ -257,8 +445,13 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) | |||
257 | update_vsyscall(tk); | 445 | update_vsyscall(tk); |
258 | update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); | 446 | update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); |
259 | 447 | ||
448 | tk_update_ktime_data(tk); | ||
449 | |||
260 | if (action & TK_MIRROR) | 450 | if (action & TK_MIRROR) |
261 | memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper)); | 451 | memcpy(&shadow_timekeeper, &tk_core.timekeeper, |
452 | sizeof(tk_core.timekeeper)); | ||
453 | |||
454 | update_fast_timekeeper(tk); | ||
262 | } | 455 | } |
263 | 456 | ||
264 | /** | 457 | /** |
@@ -270,49 +463,48 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) | |||
270 | */ | 463 | */ |
271 | static void timekeeping_forward_now(struct timekeeper *tk) | 464 | static void timekeeping_forward_now(struct timekeeper *tk) |
272 | { | 465 | { |
273 | cycle_t cycle_now, cycle_delta; | 466 | struct clocksource *clock = tk->tkr.clock; |
274 | struct clocksource *clock; | 467 | cycle_t cycle_now, delta; |
275 | s64 nsec; | 468 | s64 nsec; |
276 | 469 | ||
277 | clock = tk->clock; | 470 | cycle_now = tk->tkr.read(clock); |
278 | cycle_now = clock->read(clock); | 471 | delta = clocksource_delta(cycle_now, tk->tkr.cycle_last, tk->tkr.mask); |
279 | cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; | 472 | tk->tkr.cycle_last = cycle_now; |
280 | tk->cycle_last = clock->cycle_last = cycle_now; | ||
281 | 473 | ||
282 | tk->xtime_nsec += cycle_delta * tk->mult; | 474 | tk->tkr.xtime_nsec += delta * tk->tkr.mult; |
283 | 475 | ||
284 | /* If arch requires, add in get_arch_timeoffset() */ | 476 | /* If arch requires, add in get_arch_timeoffset() */ |
285 | tk->xtime_nsec += (u64)get_arch_timeoffset() << tk->shift; | 477 | tk->tkr.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr.shift; |
286 | 478 | ||
287 | tk_normalize_xtime(tk); | 479 | tk_normalize_xtime(tk); |
288 | 480 | ||
289 | nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); | 481 | nsec = clocksource_cyc2ns(delta, clock->mult, clock->shift); |
290 | timespec_add_ns(&tk->raw_time, nsec); | 482 | timespec64_add_ns(&tk->raw_time, nsec); |
291 | } | 483 | } |
292 | 484 | ||
293 | /** | 485 | /** |
294 | * __getnstimeofday - Returns the time of day in a timespec. | 486 | * __getnstimeofday64 - Returns the time of day in a timespec64. |
295 | * @ts: pointer to the timespec to be set | 487 | * @ts: pointer to the timespec to be set |
296 | * | 488 | * |
297 | * Updates the time of day in the timespec. | 489 | * Updates the time of day in the timespec. |
298 | * Returns 0 on success, or -ve when suspended (timespec will be undefined). | 490 | * Returns 0 on success, or -ve when suspended (timespec will be undefined). |
299 | */ | 491 | */ |
300 | int __getnstimeofday(struct timespec *ts) | 492 | int __getnstimeofday64(struct timespec64 *ts) |
301 | { | 493 | { |
302 | struct timekeeper *tk = &timekeeper; | 494 | struct timekeeper *tk = &tk_core.timekeeper; |
303 | unsigned long seq; | 495 | unsigned long seq; |
304 | s64 nsecs = 0; | 496 | s64 nsecs = 0; |
305 | 497 | ||
306 | do { | 498 | do { |
307 | seq = read_seqcount_begin(&timekeeper_seq); | 499 | seq = read_seqcount_begin(&tk_core.seq); |
308 | 500 | ||
309 | ts->tv_sec = tk->xtime_sec; | 501 | ts->tv_sec = tk->xtime_sec; |
310 | nsecs = timekeeping_get_ns(tk); | 502 | nsecs = timekeeping_get_ns(&tk->tkr); |
311 | 503 | ||
312 | } while (read_seqcount_retry(&timekeeper_seq, seq)); | 504 | } while (read_seqcount_retry(&tk_core.seq, seq)); |
313 | 505 | ||
314 | ts->tv_nsec = 0; | 506 | ts->tv_nsec = 0; |
315 | timespec_add_ns(ts, nsecs); | 507 | timespec64_add_ns(ts, nsecs); |
316 | 508 | ||
317 | /* | 509 | /* |
318 | * Do not bail out early, in case there were callers still using | 510 | * Do not bail out early, in case there were callers still using |
@@ -322,116 +514,138 @@ int __getnstimeofday(struct timespec *ts) | |||
322 | return -EAGAIN; | 514 | return -EAGAIN; |
323 | return 0; | 515 | return 0; |
324 | } | 516 | } |
325 | EXPORT_SYMBOL(__getnstimeofday); | 517 | EXPORT_SYMBOL(__getnstimeofday64); |
326 | 518 | ||
327 | /** | 519 | /** |
328 | * getnstimeofday - Returns the time of day in a timespec. | 520 | * getnstimeofday64 - Returns the time of day in a timespec64. |
329 | * @ts: pointer to the timespec to be set | 521 | * @ts: pointer to the timespec to be set |
330 | * | 522 | * |
331 | * Returns the time of day in a timespec (WARN if suspended). | 523 | * Returns the time of day in a timespec (WARN if suspended). |
332 | */ | 524 | */ |
333 | void getnstimeofday(struct timespec *ts) | 525 | void getnstimeofday64(struct timespec64 *ts) |
334 | { | 526 | { |
335 | WARN_ON(__getnstimeofday(ts)); | 527 | WARN_ON(__getnstimeofday64(ts)); |
336 | } | 528 | } |
337 | EXPORT_SYMBOL(getnstimeofday); | 529 | EXPORT_SYMBOL(getnstimeofday64); |
338 | 530 | ||
339 | ktime_t ktime_get(void) | 531 | ktime_t ktime_get(void) |
340 | { | 532 | { |
341 | struct timekeeper *tk = &timekeeper; | 533 | struct timekeeper *tk = &tk_core.timekeeper; |
342 | unsigned int seq; | 534 | unsigned int seq; |
343 | s64 secs, nsecs; | 535 | ktime_t base; |
536 | s64 nsecs; | ||
344 | 537 | ||
345 | WARN_ON(timekeeping_suspended); | 538 | WARN_ON(timekeeping_suspended); |
346 | 539 | ||
347 | do { | 540 | do { |
348 | seq = read_seqcount_begin(&timekeeper_seq); | 541 | seq = read_seqcount_begin(&tk_core.seq); |
349 | secs = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; | 542 | base = tk->tkr.base_mono; |
350 | nsecs = timekeeping_get_ns(tk) + tk->wall_to_monotonic.tv_nsec; | 543 | nsecs = timekeeping_get_ns(&tk->tkr); |
351 | 544 | ||
352 | } while (read_seqcount_retry(&timekeeper_seq, seq)); | 545 | } while (read_seqcount_retry(&tk_core.seq, seq)); |
353 | /* | 546 | |
354 | * Use ktime_set/ktime_add_ns to create a proper ktime on | 547 | return ktime_add_ns(base, nsecs); |
355 | * 32-bit architectures without CONFIG_KTIME_SCALAR. | ||
356 | */ | ||
357 | return ktime_add_ns(ktime_set(secs, 0), nsecs); | ||
358 | } | 548 | } |
359 | EXPORT_SYMBOL_GPL(ktime_get); | 549 | EXPORT_SYMBOL_GPL(ktime_get); |
360 | 550 | ||
361 | /** | 551 | static ktime_t *offsets[TK_OFFS_MAX] = { |
362 | * ktime_get_ts - get the monotonic clock in timespec format | 552 | [TK_OFFS_REAL] = &tk_core.timekeeper.offs_real, |
363 | * @ts: pointer to timespec variable | 553 | [TK_OFFS_BOOT] = &tk_core.timekeeper.offs_boot, |
364 | * | 554 | [TK_OFFS_TAI] = &tk_core.timekeeper.offs_tai, |
365 | * The function calculates the monotonic clock from the realtime | 555 | }; |
366 | * clock and the wall_to_monotonic offset and stores the result | 556 | |
367 | * in normalized timespec format in the variable pointed to by @ts. | 557 | ktime_t ktime_get_with_offset(enum tk_offsets offs) |
368 | */ | ||
369 | void ktime_get_ts(struct timespec *ts) | ||
370 | { | 558 | { |
371 | struct timekeeper *tk = &timekeeper; | 559 | struct timekeeper *tk = &tk_core.timekeeper; |
372 | struct timespec tomono; | ||
373 | s64 nsec; | ||
374 | unsigned int seq; | 560 | unsigned int seq; |
561 | ktime_t base, *offset = offsets[offs]; | ||
562 | s64 nsecs; | ||
375 | 563 | ||
376 | WARN_ON(timekeeping_suspended); | 564 | WARN_ON(timekeeping_suspended); |
377 | 565 | ||
378 | do { | 566 | do { |
379 | seq = read_seqcount_begin(&timekeeper_seq); | 567 | seq = read_seqcount_begin(&tk_core.seq); |
380 | ts->tv_sec = tk->xtime_sec; | 568 | base = ktime_add(tk->tkr.base_mono, *offset); |
381 | nsec = timekeeping_get_ns(tk); | 569 | nsecs = timekeeping_get_ns(&tk->tkr); |
382 | tomono = tk->wall_to_monotonic; | ||
383 | 570 | ||
384 | } while (read_seqcount_retry(&timekeeper_seq, seq)); | 571 | } while (read_seqcount_retry(&tk_core.seq, seq)); |
385 | 572 | ||
386 | ts->tv_sec += tomono.tv_sec; | 573 | return ktime_add_ns(base, nsecs); |
387 | ts->tv_nsec = 0; | ||
388 | timespec_add_ns(ts, nsec + tomono.tv_nsec); | ||
389 | } | ||
390 | EXPORT_SYMBOL_GPL(ktime_get_ts); | ||
391 | 574 | ||
575 | } | ||
576 | EXPORT_SYMBOL_GPL(ktime_get_with_offset); | ||
392 | 577 | ||
393 | /** | 578 | /** |
394 | * timekeeping_clocktai - Returns the TAI time of day in a timespec | 579 | * ktime_mono_to_any() - convert monotonic time to any other time |
395 | * @ts: pointer to the timespec to be set | 580 | * @tmono: time to convert. |
396 | * | 581 | * @offs: which offset to use |
397 | * Returns the time of day in a timespec. | ||
398 | */ | 582 | */ |
399 | void timekeeping_clocktai(struct timespec *ts) | 583 | ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs) |
400 | { | 584 | { |
401 | struct timekeeper *tk = &timekeeper; | 585 | ktime_t *offset = offsets[offs]; |
402 | unsigned long seq; | 586 | unsigned long seq; |
403 | u64 nsecs; | 587 | ktime_t tconv; |
404 | |||
405 | WARN_ON(timekeeping_suspended); | ||
406 | 588 | ||
407 | do { | 589 | do { |
408 | seq = read_seqcount_begin(&timekeeper_seq); | 590 | seq = read_seqcount_begin(&tk_core.seq); |
591 | tconv = ktime_add(tmono, *offset); | ||
592 | } while (read_seqcount_retry(&tk_core.seq, seq)); | ||
409 | 593 | ||
410 | ts->tv_sec = tk->xtime_sec + tk->tai_offset; | 594 | return tconv; |
411 | nsecs = timekeeping_get_ns(tk); | 595 | } |
596 | EXPORT_SYMBOL_GPL(ktime_mono_to_any); | ||
412 | 597 | ||
413 | } while (read_seqcount_retry(&timekeeper_seq, seq)); | 598 | /** |
599 | * ktime_get_raw - Returns the raw monotonic time in ktime_t format | ||
600 | */ | ||
601 | ktime_t ktime_get_raw(void) | ||
602 | { | ||
603 | struct timekeeper *tk = &tk_core.timekeeper; | ||
604 | unsigned int seq; | ||
605 | ktime_t base; | ||
606 | s64 nsecs; | ||
414 | 607 | ||
415 | ts->tv_nsec = 0; | 608 | do { |
416 | timespec_add_ns(ts, nsecs); | 609 | seq = read_seqcount_begin(&tk_core.seq); |
610 | base = tk->base_raw; | ||
611 | nsecs = timekeeping_get_ns_raw(tk); | ||
417 | 612 | ||
418 | } | 613 | } while (read_seqcount_retry(&tk_core.seq, seq)); |
419 | EXPORT_SYMBOL(timekeeping_clocktai); | ||
420 | 614 | ||
615 | return ktime_add_ns(base, nsecs); | ||
616 | } | ||
617 | EXPORT_SYMBOL_GPL(ktime_get_raw); | ||
421 | 618 | ||
422 | /** | 619 | /** |
423 | * ktime_get_clocktai - Returns the TAI time of day in a ktime | 620 | * ktime_get_ts64 - get the monotonic clock in timespec64 format |
621 | * @ts: pointer to timespec variable | ||
424 | * | 622 | * |
425 | * Returns the time of day in a ktime. | 623 | * The function calculates the monotonic clock from the realtime |
624 | * clock and the wall_to_monotonic offset and stores the result | ||
625 | * in normalized timespec format in the variable pointed to by @ts. | ||
426 | */ | 626 | */ |
427 | ktime_t ktime_get_clocktai(void) | 627 | void ktime_get_ts64(struct timespec64 *ts) |
428 | { | 628 | { |
429 | struct timespec ts; | 629 | struct timekeeper *tk = &tk_core.timekeeper; |
630 | struct timespec64 tomono; | ||
631 | s64 nsec; | ||
632 | unsigned int seq; | ||
633 | |||
634 | WARN_ON(timekeeping_suspended); | ||
430 | 635 | ||
431 | timekeeping_clocktai(&ts); | 636 | do { |
432 | return timespec_to_ktime(ts); | 637 | seq = read_seqcount_begin(&tk_core.seq); |
638 | ts->tv_sec = tk->xtime_sec; | ||
639 | nsec = timekeeping_get_ns(&tk->tkr); | ||
640 | tomono = tk->wall_to_monotonic; | ||
641 | |||
642 | } while (read_seqcount_retry(&tk_core.seq, seq)); | ||
643 | |||
644 | ts->tv_sec += tomono.tv_sec; | ||
645 | ts->tv_nsec = 0; | ||
646 | timespec64_add_ns(ts, nsec + tomono.tv_nsec); | ||
433 | } | 647 | } |
434 | EXPORT_SYMBOL(ktime_get_clocktai); | 648 | EXPORT_SYMBOL_GPL(ktime_get_ts64); |
435 | 649 | ||
436 | #ifdef CONFIG_NTP_PPS | 650 | #ifdef CONFIG_NTP_PPS |
437 | 651 | ||
@@ -446,23 +660,23 @@ EXPORT_SYMBOL(ktime_get_clocktai); | |||
446 | */ | 660 | */ |
447 | void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) | 661 | void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) |
448 | { | 662 | { |
449 | struct timekeeper *tk = &timekeeper; | 663 | struct timekeeper *tk = &tk_core.timekeeper; |
450 | unsigned long seq; | 664 | unsigned long seq; |
451 | s64 nsecs_raw, nsecs_real; | 665 | s64 nsecs_raw, nsecs_real; |
452 | 666 | ||
453 | WARN_ON_ONCE(timekeeping_suspended); | 667 | WARN_ON_ONCE(timekeeping_suspended); |
454 | 668 | ||
455 | do { | 669 | do { |
456 | seq = read_seqcount_begin(&timekeeper_seq); | 670 | seq = read_seqcount_begin(&tk_core.seq); |
457 | 671 | ||
458 | *ts_raw = tk->raw_time; | 672 | *ts_raw = timespec64_to_timespec(tk->raw_time); |
459 | ts_real->tv_sec = tk->xtime_sec; | 673 | ts_real->tv_sec = tk->xtime_sec; |
460 | ts_real->tv_nsec = 0; | 674 | ts_real->tv_nsec = 0; |
461 | 675 | ||
462 | nsecs_raw = timekeeping_get_ns_raw(tk); | 676 | nsecs_raw = timekeeping_get_ns_raw(tk); |
463 | nsecs_real = timekeeping_get_ns(tk); | 677 | nsecs_real = timekeeping_get_ns(&tk->tkr); |
464 | 678 | ||
465 | } while (read_seqcount_retry(&timekeeper_seq, seq)); | 679 | } while (read_seqcount_retry(&tk_core.seq, seq)); |
466 | 680 | ||
467 | timespec_add_ns(ts_raw, nsecs_raw); | 681 | timespec_add_ns(ts_raw, nsecs_raw); |
468 | timespec_add_ns(ts_real, nsecs_real); | 682 | timespec_add_ns(ts_real, nsecs_real); |
@@ -479,9 +693,9 @@ EXPORT_SYMBOL(getnstime_raw_and_real); | |||
479 | */ | 693 | */ |
480 | void do_gettimeofday(struct timeval *tv) | 694 | void do_gettimeofday(struct timeval *tv) |
481 | { | 695 | { |
482 | struct timespec now; | 696 | struct timespec64 now; |
483 | 697 | ||
484 | getnstimeofday(&now); | 698 | getnstimeofday64(&now); |
485 | tv->tv_sec = now.tv_sec; | 699 | tv->tv_sec = now.tv_sec; |
486 | tv->tv_usec = now.tv_nsec/1000; | 700 | tv->tv_usec = now.tv_nsec/1000; |
487 | } | 701 | } |
@@ -495,15 +709,15 @@ EXPORT_SYMBOL(do_gettimeofday); | |||
495 | */ | 709 | */ |
496 | int do_settimeofday(const struct timespec *tv) | 710 | int do_settimeofday(const struct timespec *tv) |
497 | { | 711 | { |
498 | struct timekeeper *tk = &timekeeper; | 712 | struct timekeeper *tk = &tk_core.timekeeper; |
499 | struct timespec ts_delta, xt; | 713 | struct timespec64 ts_delta, xt, tmp; |
500 | unsigned long flags; | 714 | unsigned long flags; |
501 | 715 | ||
502 | if (!timespec_valid_strict(tv)) | 716 | if (!timespec_valid_strict(tv)) |
503 | return -EINVAL; | 717 | return -EINVAL; |
504 | 718 | ||
505 | raw_spin_lock_irqsave(&timekeeper_lock, flags); | 719 | raw_spin_lock_irqsave(&timekeeper_lock, flags); |
506 | write_seqcount_begin(&timekeeper_seq); | 720 | write_seqcount_begin(&tk_core.seq); |
507 | 721 | ||
508 | timekeeping_forward_now(tk); | 722 | timekeeping_forward_now(tk); |
509 | 723 | ||
@@ -511,13 +725,14 @@ int do_settimeofday(const struct timespec *tv) | |||
511 | ts_delta.tv_sec = tv->tv_sec - xt.tv_sec; | 725 | ts_delta.tv_sec = tv->tv_sec - xt.tv_sec; |
512 | ts_delta.tv_nsec = tv->tv_nsec - xt.tv_nsec; | 726 | ts_delta.tv_nsec = tv->tv_nsec - xt.tv_nsec; |
513 | 727 | ||
514 | tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, ts_delta)); | 728 | tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts_delta)); |
515 | 729 | ||
516 | tk_set_xtime(tk, tv); | 730 | tmp = timespec_to_timespec64(*tv); |
731 | tk_set_xtime(tk, &tmp); | ||
517 | 732 | ||
518 | timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); | 733 | timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); |
519 | 734 | ||
520 | write_seqcount_end(&timekeeper_seq); | 735 | write_seqcount_end(&tk_core.seq); |
521 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); | 736 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); |
522 | 737 | ||
523 | /* signal hrtimers about time change */ | 738 | /* signal hrtimers about time change */ |
@@ -535,33 +750,35 @@ EXPORT_SYMBOL(do_settimeofday); | |||
535 | */ | 750 | */ |
536 | int timekeeping_inject_offset(struct timespec *ts) | 751 | int timekeeping_inject_offset(struct timespec *ts) |
537 | { | 752 | { |
538 | struct timekeeper *tk = &timekeeper; | 753 | struct timekeeper *tk = &tk_core.timekeeper; |
539 | unsigned long flags; | 754 | unsigned long flags; |
540 | struct timespec tmp; | 755 | struct timespec64 ts64, tmp; |
541 | int ret = 0; | 756 | int ret = 0; |
542 | 757 | ||
543 | if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) | 758 | if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) |
544 | return -EINVAL; | 759 | return -EINVAL; |
545 | 760 | ||
761 | ts64 = timespec_to_timespec64(*ts); | ||
762 | |||
546 | raw_spin_lock_irqsave(&timekeeper_lock, flags); | 763 | raw_spin_lock_irqsave(&timekeeper_lock, flags); |
547 | write_seqcount_begin(&timekeeper_seq); | 764 | write_seqcount_begin(&tk_core.seq); |
548 | 765 | ||
549 | timekeeping_forward_now(tk); | 766 | timekeeping_forward_now(tk); |
550 | 767 | ||
551 | /* Make sure the proposed value is valid */ | 768 | /* Make sure the proposed value is valid */ |
552 | tmp = timespec_add(tk_xtime(tk), *ts); | 769 | tmp = timespec64_add(tk_xtime(tk), ts64); |
553 | if (!timespec_valid_strict(&tmp)) { | 770 | if (!timespec64_valid_strict(&tmp)) { |
554 | ret = -EINVAL; | 771 | ret = -EINVAL; |
555 | goto error; | 772 | goto error; |
556 | } | 773 | } |
557 | 774 | ||
558 | tk_xtime_add(tk, ts); | 775 | tk_xtime_add(tk, &ts64); |
559 | tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts)); | 776 | tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts64)); |
560 | 777 | ||
561 | error: /* even if we error out, we forwarded the time, so call update */ | 778 | error: /* even if we error out, we forwarded the time, so call update */ |
562 | timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); | 779 | timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); |
563 | 780 | ||
564 | write_seqcount_end(&timekeeper_seq); | 781 | write_seqcount_end(&tk_core.seq); |
565 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); | 782 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); |
566 | 783 | ||
567 | /* signal hrtimers about time change */ | 784 | /* signal hrtimers about time change */ |
@@ -578,14 +795,14 @@ EXPORT_SYMBOL(timekeeping_inject_offset); | |||
578 | */ | 795 | */ |
579 | s32 timekeeping_get_tai_offset(void) | 796 | s32 timekeeping_get_tai_offset(void) |
580 | { | 797 | { |
581 | struct timekeeper *tk = &timekeeper; | 798 | struct timekeeper *tk = &tk_core.timekeeper; |
582 | unsigned int seq; | 799 | unsigned int seq; |
583 | s32 ret; | 800 | s32 ret; |
584 | 801 | ||
585 | do { | 802 | do { |
586 | seq = read_seqcount_begin(&timekeeper_seq); | 803 | seq = read_seqcount_begin(&tk_core.seq); |
587 | ret = tk->tai_offset; | 804 | ret = tk->tai_offset; |
588 | } while (read_seqcount_retry(&timekeeper_seq, seq)); | 805 | } while (read_seqcount_retry(&tk_core.seq, seq)); |
589 | 806 | ||
590 | return ret; | 807 | return ret; |
591 | } | 808 | } |
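Every reader in this file follows the same lockless pattern seen in timekeeping_get_tai_offset() above: sample tk_core.seq, copy the fields, and retry if the sequence changed (or was odd) while reading. Below is a minimal userspace sketch of that idea using C11 atomics; the names (sample_clock, read_time, write_time) and the simplified memory ordering are illustrative assumptions, not the kernel's seqlock API.

#include <stdatomic.h>
#include <stdio.h>

struct sample_clock {
	atomic_uint seq;	/* even: stable, odd: writer in progress */
	long long sec;
	long nsec;
};

/* Single writer: make seq odd, update the data, make seq even again. */
static void write_time(struct sample_clock *c, long long sec, long nsec)
{
	atomic_fetch_add(&c->seq, 1);
	atomic_thread_fence(memory_order_seq_cst);
	c->sec = sec;
	c->nsec = nsec;
	atomic_thread_fence(memory_order_seq_cst);
	atomic_fetch_add(&c->seq, 1);
}

/* Readers retry until they observe the same, even sequence twice. */
static void read_time(struct sample_clock *c, long long *sec, long *nsec)
{
	unsigned int start;

	do {
		start = atomic_load(&c->seq);
		atomic_thread_fence(memory_order_seq_cst);
		*sec = c->sec;
		*nsec = c->nsec;
		atomic_thread_fence(memory_order_seq_cst);
	} while ((start & 1) || start != atomic_load(&c->seq));
}

int main(void)
{
	struct sample_clock c = { 0 };
	long long sec;
	long nsec;

	write_time(&c, 100, 500000000);
	read_time(&c, &sec, &nsec);
	printf("%lld.%09ld\n", sec, nsec);
	return 0;
}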
@@ -606,14 +823,14 @@ static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset) | |||
606 | */ | 823 | */ |
607 | void timekeeping_set_tai_offset(s32 tai_offset) | 824 | void timekeeping_set_tai_offset(s32 tai_offset) |
608 | { | 825 | { |
609 | struct timekeeper *tk = &timekeeper; | 826 | struct timekeeper *tk = &tk_core.timekeeper; |
610 | unsigned long flags; | 827 | unsigned long flags; |
611 | 828 | ||
612 | raw_spin_lock_irqsave(&timekeeper_lock, flags); | 829 | raw_spin_lock_irqsave(&timekeeper_lock, flags); |
613 | write_seqcount_begin(&timekeeper_seq); | 830 | write_seqcount_begin(&tk_core.seq); |
614 | __timekeeping_set_tai_offset(tk, tai_offset); | 831 | __timekeeping_set_tai_offset(tk, tai_offset); |
615 | timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); | 832 | timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); |
616 | write_seqcount_end(&timekeeper_seq); | 833 | write_seqcount_end(&tk_core.seq); |
617 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); | 834 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); |
618 | clock_was_set(); | 835 | clock_was_set(); |
619 | } | 836 | } |
@@ -625,14 +842,14 @@ void timekeeping_set_tai_offset(s32 tai_offset) | |||
625 | */ | 842 | */ |
626 | static int change_clocksource(void *data) | 843 | static int change_clocksource(void *data) |
627 | { | 844 | { |
628 | struct timekeeper *tk = &timekeeper; | 845 | struct timekeeper *tk = &tk_core.timekeeper; |
629 | struct clocksource *new, *old; | 846 | struct clocksource *new, *old; |
630 | unsigned long flags; | 847 | unsigned long flags; |
631 | 848 | ||
632 | new = (struct clocksource *) data; | 849 | new = (struct clocksource *) data; |
633 | 850 | ||
634 | raw_spin_lock_irqsave(&timekeeper_lock, flags); | 851 | raw_spin_lock_irqsave(&timekeeper_lock, flags); |
635 | write_seqcount_begin(&timekeeper_seq); | 852 | write_seqcount_begin(&tk_core.seq); |
636 | 853 | ||
637 | timekeeping_forward_now(tk); | 854 | timekeeping_forward_now(tk); |
638 | /* | 855 | /* |
@@ -641,7 +858,7 @@ static int change_clocksource(void *data) | |||
641 | */ | 858 | */ |
642 | if (try_module_get(new->owner)) { | 859 | if (try_module_get(new->owner)) { |
643 | if (!new->enable || new->enable(new) == 0) { | 860 | if (!new->enable || new->enable(new) == 0) { |
644 | old = tk->clock; | 861 | old = tk->tkr.clock; |
645 | tk_setup_internals(tk, new); | 862 | tk_setup_internals(tk, new); |
646 | if (old->disable) | 863 | if (old->disable) |
647 | old->disable(old); | 864 | old->disable(old); |
@@ -652,7 +869,7 @@ static int change_clocksource(void *data) | |||
652 | } | 869 | } |
653 | timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); | 870 | timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); |
654 | 871 | ||
655 | write_seqcount_end(&timekeeper_seq); | 872 | write_seqcount_end(&tk_core.seq); |
656 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); | 873 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); |
657 | 874 | ||
658 | return 0; | 875 | return 0; |
@@ -667,29 +884,14 @@ static int change_clocksource(void *data) | |||
667 | */ | 884 | */ |
668 | int timekeeping_notify(struct clocksource *clock) | 885 | int timekeeping_notify(struct clocksource *clock) |
669 | { | 886 | { |
670 | struct timekeeper *tk = &timekeeper; | 887 | struct timekeeper *tk = &tk_core.timekeeper; |
671 | 888 | ||
672 | if (tk->clock == clock) | 889 | if (tk->tkr.clock == clock) |
673 | return 0; | 890 | return 0; |
674 | stop_machine(change_clocksource, clock, NULL); | 891 | stop_machine(change_clocksource, clock, NULL); |
675 | tick_clock_notify(); | 892 | tick_clock_notify(); |
676 | return tk->clock == clock ? 0 : -1; | 893 | return tk->tkr.clock == clock ? 0 : -1; |
677 | } | ||
678 | |||
679 | /** | ||
680 | * ktime_get_real - get the real (wall-) time in ktime_t format | ||
681 | * | ||
682 | * returns the time in ktime_t format | ||
683 | */ | ||
684 | ktime_t ktime_get_real(void) | ||
685 | { | ||
686 | struct timespec now; | ||
687 | |||
688 | getnstimeofday(&now); | ||
689 | |||
690 | return timespec_to_ktime(now); | ||
691 | } | 894 | } |
692 | EXPORT_SYMBOL_GPL(ktime_get_real); | ||
693 | 895 | ||
694 | /** | 896 | /** |
695 | * getrawmonotonic - Returns the raw monotonic time in a timespec | 897 | * getrawmonotonic - Returns the raw monotonic time in a timespec |
@@ -699,18 +901,20 @@ EXPORT_SYMBOL_GPL(ktime_get_real); | |||
699 | */ | 901 | */ |
700 | void getrawmonotonic(struct timespec *ts) | 902 | void getrawmonotonic(struct timespec *ts) |
701 | { | 903 | { |
702 | struct timekeeper *tk = &timekeeper; | 904 | struct timekeeper *tk = &tk_core.timekeeper; |
905 | struct timespec64 ts64; | ||
703 | unsigned long seq; | 906 | unsigned long seq; |
704 | s64 nsecs; | 907 | s64 nsecs; |
705 | 908 | ||
706 | do { | 909 | do { |
707 | seq = read_seqcount_begin(&timekeeper_seq); | 910 | seq = read_seqcount_begin(&tk_core.seq); |
708 | nsecs = timekeeping_get_ns_raw(tk); | 911 | nsecs = timekeeping_get_ns_raw(tk); |
709 | *ts = tk->raw_time; | 912 | ts64 = tk->raw_time; |
710 | 913 | ||
711 | } while (read_seqcount_retry(&timekeeper_seq, seq)); | 914 | } while (read_seqcount_retry(&tk_core.seq, seq)); |
712 | 915 | ||
713 | timespec_add_ns(ts, nsecs); | 916 | timespec64_add_ns(&ts64, nsecs); |
917 | *ts = timespec64_to_timespec(ts64); | ||
714 | } | 918 | } |
715 | EXPORT_SYMBOL(getrawmonotonic); | 919 | EXPORT_SYMBOL(getrawmonotonic); |
716 | 920 | ||
@@ -719,16 +923,16 @@ EXPORT_SYMBOL(getrawmonotonic); | |||
719 | */ | 923 | */ |
720 | int timekeeping_valid_for_hres(void) | 924 | int timekeeping_valid_for_hres(void) |
721 | { | 925 | { |
722 | struct timekeeper *tk = &timekeeper; | 926 | struct timekeeper *tk = &tk_core.timekeeper; |
723 | unsigned long seq; | 927 | unsigned long seq; |
724 | int ret; | 928 | int ret; |
725 | 929 | ||
726 | do { | 930 | do { |
727 | seq = read_seqcount_begin(&timekeeper_seq); | 931 | seq = read_seqcount_begin(&tk_core.seq); |
728 | 932 | ||
729 | ret = tk->clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; | 933 | ret = tk->tkr.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; |
730 | 934 | ||
731 | } while (read_seqcount_retry(&timekeeper_seq, seq)); | 935 | } while (read_seqcount_retry(&tk_core.seq, seq)); |
732 | 936 | ||
733 | return ret; | 937 | return ret; |
734 | } | 938 | } |
@@ -738,16 +942,16 @@ int timekeeping_valid_for_hres(void) | |||
738 | */ | 942 | */ |
739 | u64 timekeeping_max_deferment(void) | 943 | u64 timekeeping_max_deferment(void) |
740 | { | 944 | { |
741 | struct timekeeper *tk = &timekeeper; | 945 | struct timekeeper *tk = &tk_core.timekeeper; |
742 | unsigned long seq; | 946 | unsigned long seq; |
743 | u64 ret; | 947 | u64 ret; |
744 | 948 | ||
745 | do { | 949 | do { |
746 | seq = read_seqcount_begin(&timekeeper_seq); | 950 | seq = read_seqcount_begin(&tk_core.seq); |
747 | 951 | ||
748 | ret = tk->clock->max_idle_ns; | 952 | ret = tk->tkr.clock->max_idle_ns; |
749 | 953 | ||
750 | } while (read_seqcount_retry(&timekeeper_seq, seq)); | 954 | } while (read_seqcount_retry(&tk_core.seq, seq)); |
751 | 955 | ||
752 | return ret; | 956 | return ret; |
753 | } | 957 | } |
@@ -787,14 +991,15 @@ void __weak read_boot_clock(struct timespec *ts) | |||
787 | */ | 991 | */ |
788 | void __init timekeeping_init(void) | 992 | void __init timekeeping_init(void) |
789 | { | 993 | { |
790 | struct timekeeper *tk = &timekeeper; | 994 | struct timekeeper *tk = &tk_core.timekeeper; |
791 | struct clocksource *clock; | 995 | struct clocksource *clock; |
792 | unsigned long flags; | 996 | unsigned long flags; |
793 | struct timespec now, boot, tmp; | 997 | struct timespec64 now, boot, tmp; |
794 | 998 | struct timespec ts; | |
795 | read_persistent_clock(&now); | ||
796 | 999 | ||
797 | if (!timespec_valid_strict(&now)) { | 1000 | read_persistent_clock(&ts); |
1001 | now = timespec_to_timespec64(ts); | ||
1002 | if (!timespec64_valid_strict(&now)) { | ||
798 | pr_warn("WARNING: Persistent clock returned invalid value!\n" | 1003 | pr_warn("WARNING: Persistent clock returned invalid value!\n" |
799 | " Check your CMOS/BIOS settings.\n"); | 1004 | " Check your CMOS/BIOS settings.\n"); |
800 | now.tv_sec = 0; | 1005 | now.tv_sec = 0; |
@@ -802,8 +1007,9 @@ void __init timekeeping_init(void) | |||
802 | } else if (now.tv_sec || now.tv_nsec) | 1007 | } else if (now.tv_sec || now.tv_nsec) |
803 | persistent_clock_exist = true; | 1008 | persistent_clock_exist = true; |
804 | 1009 | ||
805 | read_boot_clock(&boot); | 1010 | read_boot_clock(&ts); |
806 | if (!timespec_valid_strict(&boot)) { | 1011 | boot = timespec_to_timespec64(ts); |
1012 | if (!timespec64_valid_strict(&boot)) { | ||
807 | pr_warn("WARNING: Boot clock returned invalid value!\n" | 1013 | pr_warn("WARNING: Boot clock returned invalid value!\n" |
808 | " Check your CMOS/BIOS settings.\n"); | 1014 | " Check your CMOS/BIOS settings.\n"); |
809 | boot.tv_sec = 0; | 1015 | boot.tv_sec = 0; |
@@ -811,7 +1017,7 @@ void __init timekeeping_init(void) | |||
811 | } | 1017 | } |
812 | 1018 | ||
813 | raw_spin_lock_irqsave(&timekeeper_lock, flags); | 1019 | raw_spin_lock_irqsave(&timekeeper_lock, flags); |
814 | write_seqcount_begin(&timekeeper_seq); | 1020 | write_seqcount_begin(&tk_core.seq); |
815 | ntp_init(); | 1021 | ntp_init(); |
816 | 1022 | ||
817 | clock = clocksource_default_clock(); | 1023 | clock = clocksource_default_clock(); |
@@ -822,24 +1028,21 @@ void __init timekeeping_init(void) | |||
822 | tk_set_xtime(tk, &now); | 1028 | tk_set_xtime(tk, &now); |
823 | tk->raw_time.tv_sec = 0; | 1029 | tk->raw_time.tv_sec = 0; |
824 | tk->raw_time.tv_nsec = 0; | 1030 | tk->raw_time.tv_nsec = 0; |
1031 | tk->base_raw.tv64 = 0; | ||
825 | if (boot.tv_sec == 0 && boot.tv_nsec == 0) | 1032 | if (boot.tv_sec == 0 && boot.tv_nsec == 0) |
826 | boot = tk_xtime(tk); | 1033 | boot = tk_xtime(tk); |
827 | 1034 | ||
828 | set_normalized_timespec(&tmp, -boot.tv_sec, -boot.tv_nsec); | 1035 | set_normalized_timespec64(&tmp, -boot.tv_sec, -boot.tv_nsec); |
829 | tk_set_wall_to_mono(tk, tmp); | 1036 | tk_set_wall_to_mono(tk, tmp); |
830 | 1037 | ||
831 | tmp.tv_sec = 0; | 1038 | timekeeping_update(tk, TK_MIRROR); |
832 | tmp.tv_nsec = 0; | ||
833 | tk_set_sleep_time(tk, tmp); | ||
834 | |||
835 | memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper)); | ||
836 | 1039 | ||
837 | write_seqcount_end(&timekeeper_seq); | 1040 | write_seqcount_end(&tk_core.seq); |
838 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); | 1041 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); |
839 | } | 1042 | } |
840 | 1043 | ||
841 | /* time in seconds when suspend began */ | 1044 | /* time in seconds when suspend began */ |
842 | static struct timespec timekeeping_suspend_time; | 1045 | static struct timespec64 timekeeping_suspend_time; |
843 | 1046 | ||
844 | /** | 1047 | /** |
845 | * __timekeeping_inject_sleeptime - Internal function to add sleep interval | 1048 | * __timekeeping_inject_sleeptime - Internal function to add sleep interval |
@@ -849,17 +1052,17 @@ static struct timespec timekeeping_suspend_time; | |||
849 | * adds the sleep offset to the timekeeping variables. | 1052 | * adds the sleep offset to the timekeeping variables. |
850 | */ | 1053 | */ |
851 | static void __timekeeping_inject_sleeptime(struct timekeeper *tk, | 1054 | static void __timekeeping_inject_sleeptime(struct timekeeper *tk, |
852 | struct timespec *delta) | 1055 | struct timespec64 *delta) |
853 | { | 1056 | { |
854 | if (!timespec_valid_strict(delta)) { | 1057 | if (!timespec64_valid_strict(delta)) { |
855 | printk_deferred(KERN_WARNING | 1058 | printk_deferred(KERN_WARNING |
856 | "__timekeeping_inject_sleeptime: Invalid " | 1059 | "__timekeeping_inject_sleeptime: Invalid " |
857 | "sleep delta value!\n"); | 1060 | "sleep delta value!\n"); |
858 | return; | 1061 | return; |
859 | } | 1062 | } |
860 | tk_xtime_add(tk, delta); | 1063 | tk_xtime_add(tk, delta); |
861 | tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *delta)); | 1064 | tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, *delta)); |
862 | tk_set_sleep_time(tk, timespec_add(tk->total_sleep_time, *delta)); | 1065 | tk_update_sleep_time(tk, timespec64_to_ktime(*delta)); |
863 | tk_debug_account_sleep_time(delta); | 1066 | tk_debug_account_sleep_time(delta); |
864 | } | 1067 | } |
865 | 1068 | ||
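Injecting sleep time above is plain normalized timespec arithmetic: add or subtract the second and nanosecond fields, then carry so tv_nsec ends up in [0, NSEC_PER_SEC). The sketch below shows that normalization with made-up type and helper names (ts64, ts64_add, ts64_sub); it is illustrative only, not the kernel's timespec64 implementation.

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000L

struct ts64 {
	int64_t tv_sec;
	long	tv_nsec;
};

/* Carry nanoseconds into seconds so tv_nsec ends up in [0, NSEC_PER_SEC). */
static struct ts64 ts64_normalize(struct ts64 t)
{
	while (t.tv_nsec >= NSEC_PER_SEC) {
		t.tv_nsec -= NSEC_PER_SEC;
		t.tv_sec++;
	}
	while (t.tv_nsec < 0) {
		t.tv_nsec += NSEC_PER_SEC;
		t.tv_sec--;
	}
	return t;
}

static struct ts64 ts64_add(struct ts64 a, struct ts64 b)
{
	struct ts64 r = { a.tv_sec + b.tv_sec, a.tv_nsec + b.tv_nsec };
	return ts64_normalize(r);
}

static struct ts64 ts64_sub(struct ts64 a, struct ts64 b)
{
	struct ts64 r = { a.tv_sec - b.tv_sec, a.tv_nsec - b.tv_nsec };
	return ts64_normalize(r);
}

int main(void)
{
	struct ts64 xtime = { 1000, 900000000 };	/* 1000.9 s */
	struct ts64 slept = { 3, 200000000 };		/* 3.2 s asleep */

	struct ts64 fwd  = ts64_add(xtime, slept);	/* 1004.1 s */
	struct ts64 back = ts64_sub(fwd, slept);	/* 1000.9 s again */

	printf("%lld.%09ld\n", (long long)fwd.tv_sec, fwd.tv_nsec);
	printf("%lld.%09ld\n", (long long)back.tv_sec, back.tv_nsec);
	return 0;
}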
@@ -875,7 +1078,8 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk, | |||
875 | */ | 1078 | */ |
876 | void timekeeping_inject_sleeptime(struct timespec *delta) | 1079 | void timekeeping_inject_sleeptime(struct timespec *delta) |
877 | { | 1080 | { |
878 | struct timekeeper *tk = &timekeeper; | 1081 | struct timekeeper *tk = &tk_core.timekeeper; |
1082 | struct timespec64 tmp; | ||
879 | unsigned long flags; | 1083 | unsigned long flags; |
880 | 1084 | ||
881 | /* | 1085 | /* |
@@ -886,15 +1090,16 @@ void timekeeping_inject_sleeptime(struct timespec *delta) | |||
886 | return; | 1090 | return; |
887 | 1091 | ||
888 | raw_spin_lock_irqsave(&timekeeper_lock, flags); | 1092 | raw_spin_lock_irqsave(&timekeeper_lock, flags); |
889 | write_seqcount_begin(&timekeeper_seq); | 1093 | write_seqcount_begin(&tk_core.seq); |
890 | 1094 | ||
891 | timekeeping_forward_now(tk); | 1095 | timekeeping_forward_now(tk); |
892 | 1096 | ||
893 | __timekeeping_inject_sleeptime(tk, delta); | 1097 | tmp = timespec_to_timespec64(*delta); |
1098 | __timekeeping_inject_sleeptime(tk, &tmp); | ||
894 | 1099 | ||
895 | timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); | 1100 | timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); |
896 | 1101 | ||
897 | write_seqcount_end(&timekeeper_seq); | 1102 | write_seqcount_end(&tk_core.seq); |
898 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); | 1103 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); |
899 | 1104 | ||
900 | /* signal hrtimers about time change */ | 1105 | /* signal hrtimers about time change */ |
@@ -910,20 +1115,22 @@ void timekeeping_inject_sleeptime(struct timespec *delta) | |||
910 | */ | 1115 | */ |
911 | static void timekeeping_resume(void) | 1116 | static void timekeeping_resume(void) |
912 | { | 1117 | { |
913 | struct timekeeper *tk = &timekeeper; | 1118 | struct timekeeper *tk = &tk_core.timekeeper; |
914 | struct clocksource *clock = tk->clock; | 1119 | struct clocksource *clock = tk->tkr.clock; |
915 | unsigned long flags; | 1120 | unsigned long flags; |
916 | struct timespec ts_new, ts_delta; | 1121 | struct timespec64 ts_new, ts_delta; |
1122 | struct timespec tmp; | ||
917 | cycle_t cycle_now, cycle_delta; | 1123 | cycle_t cycle_now, cycle_delta; |
918 | bool suspendtime_found = false; | 1124 | bool suspendtime_found = false; |
919 | 1125 | ||
920 | read_persistent_clock(&ts_new); | 1126 | read_persistent_clock(&tmp); |
1127 | ts_new = timespec_to_timespec64(tmp); | ||
921 | 1128 | ||
922 | clockevents_resume(); | 1129 | clockevents_resume(); |
923 | clocksource_resume(); | 1130 | clocksource_resume(); |
924 | 1131 | ||
925 | raw_spin_lock_irqsave(&timekeeper_lock, flags); | 1132 | raw_spin_lock_irqsave(&timekeeper_lock, flags); |
926 | write_seqcount_begin(&timekeeper_seq); | 1133 | write_seqcount_begin(&tk_core.seq); |
927 | 1134 | ||
928 | /* | 1135 | /* |
929 | * After system resumes, we need to calculate the suspended time and | 1136 | * After system resumes, we need to calculate the suspended time and |
@@ -937,15 +1144,16 @@ static void timekeeping_resume(void) | |||
937 | * The less preferred source will only be tried if there is no better | 1144 | * The less preferred source will only be tried if there is no better |
938 | * usable source. The rtc part is handled separately in rtc core code. | 1145 | * usable source. The rtc part is handled separately in rtc core code. |
939 | */ | 1146 | */ |
940 | cycle_now = clock->read(clock); | 1147 | cycle_now = tk->tkr.read(clock); |
941 | if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) && | 1148 | if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) && |
942 | cycle_now > clock->cycle_last) { | 1149 | cycle_now > tk->tkr.cycle_last) { |
943 | u64 num, max = ULLONG_MAX; | 1150 | u64 num, max = ULLONG_MAX; |
944 | u32 mult = clock->mult; | 1151 | u32 mult = clock->mult; |
945 | u32 shift = clock->shift; | 1152 | u32 shift = clock->shift; |
946 | s64 nsec = 0; | 1153 | s64 nsec = 0; |
947 | 1154 | ||
948 | cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; | 1155 | cycle_delta = clocksource_delta(cycle_now, tk->tkr.cycle_last, |
1156 | tk->tkr.mask); | ||
949 | 1157 | ||
950 | /* | 1158 | /* |
951 | * "cycle_delta * mutl" may cause 64 bits overflow, if the | 1159 | * "cycle_delta * mutl" may cause 64 bits overflow, if the |
@@ -960,10 +1168,10 @@ static void timekeeping_resume(void) | |||
960 | } | 1168 | } |
961 | nsec += ((u64) cycle_delta * mult) >> shift; | 1169 | nsec += ((u64) cycle_delta * mult) >> shift; |
962 | 1170 | ||
963 | ts_delta = ns_to_timespec(nsec); | 1171 | ts_delta = ns_to_timespec64(nsec); |
964 | suspendtime_found = true; | 1172 | suspendtime_found = true; |
965 | } else if (timespec_compare(&ts_new, &timekeeping_suspend_time) > 0) { | 1173 | } else if (timespec64_compare(&ts_new, &timekeeping_suspend_time) > 0) { |
966 | ts_delta = timespec_sub(ts_new, timekeeping_suspend_time); | 1174 | ts_delta = timespec64_sub(ts_new, timekeeping_suspend_time); |
967 | suspendtime_found = true; | 1175 | suspendtime_found = true; |
968 | } | 1176 | } |
969 | 1177 | ||
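The nonstop-clocksource branch above has to turn a potentially huge cycle delta (a whole suspend period) into nanoseconds without letting cycle_delta * mult overflow 64 bits, so it converts whole "max"-sized chunks first. The standalone sketch below captures that idea; the loop form, the sample mult/shift pair and the 10 MHz counter are assumptions for illustration (the in-kernel code computes the chunk count with a 64-bit division instead of a loop).

#include <stdint.h>
#include <stdio.h>

/* Convert counter cycles to ns as (delta * mult) >> shift, chunked so the
 * multiplication never overflows 64 bits even after a long suspend. */
static uint64_t cycles_to_ns(uint64_t delta, uint32_t mult, uint32_t shift)
{
	uint64_t max = UINT64_MAX / mult;	/* largest safe chunk */
	uint64_t ns = 0;

	while (delta > max) {
		ns += (max * mult) >> shift;
		delta -= max;
	}
	return ns + ((delta * mult) >> shift);
}

int main(void)
{
	/* Assumed 10 MHz counter: mult/shift chosen so one cycle is 100 ns. */
	uint32_t mult = 100u << 8, shift = 8;

	/* 10,000,000 cycles -> exactly one second. */
	printf("%llu ns\n",
	       (unsigned long long)cycles_to_ns(10000000ULL, mult, shift));
	return 0;
}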
@@ -971,11 +1179,11 @@ static void timekeeping_resume(void) | |||
971 | __timekeeping_inject_sleeptime(tk, &ts_delta); | 1179 | __timekeeping_inject_sleeptime(tk, &ts_delta); |
972 | 1180 | ||
973 | /* Re-base the last cycle value */ | 1181 | /* Re-base the last cycle value */ |
974 | tk->cycle_last = clock->cycle_last = cycle_now; | 1182 | tk->tkr.cycle_last = cycle_now; |
975 | tk->ntp_error = 0; | 1183 | tk->ntp_error = 0; |
976 | timekeeping_suspended = 0; | 1184 | timekeeping_suspended = 0; |
977 | timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); | 1185 | timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); |
978 | write_seqcount_end(&timekeeper_seq); | 1186 | write_seqcount_end(&tk_core.seq); |
979 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); | 1187 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); |
980 | 1188 | ||
981 | touch_softlockup_watchdog(); | 1189 | touch_softlockup_watchdog(); |
@@ -988,12 +1196,14 @@ static void timekeeping_resume(void) | |||
988 | 1196 | ||
989 | static int timekeeping_suspend(void) | 1197 | static int timekeeping_suspend(void) |
990 | { | 1198 | { |
991 | struct timekeeper *tk = &timekeeper; | 1199 | struct timekeeper *tk = &tk_core.timekeeper; |
992 | unsigned long flags; | 1200 | unsigned long flags; |
993 | struct timespec delta, delta_delta; | 1201 | struct timespec64 delta, delta_delta; |
994 | static struct timespec old_delta; | 1202 | static struct timespec64 old_delta; |
1203 | struct timespec tmp; | ||
995 | 1204 | ||
996 | read_persistent_clock(&timekeeping_suspend_time); | 1205 | read_persistent_clock(&tmp); |
1206 | timekeeping_suspend_time = timespec_to_timespec64(tmp); | ||
997 | 1207 | ||
998 | /* | 1208 | /* |
999 | * On some systems the persistent_clock can not be detected at | 1209 | * On some systems the persistent_clock can not be detected at |
@@ -1004,7 +1214,7 @@ static int timekeeping_suspend(void) | |||
1004 | persistent_clock_exist = true; | 1214 | persistent_clock_exist = true; |
1005 | 1215 | ||
1006 | raw_spin_lock_irqsave(&timekeeper_lock, flags); | 1216 | raw_spin_lock_irqsave(&timekeeper_lock, flags); |
1007 | write_seqcount_begin(&timekeeper_seq); | 1217 | write_seqcount_begin(&tk_core.seq); |
1008 | timekeeping_forward_now(tk); | 1218 | timekeeping_forward_now(tk); |
1009 | timekeeping_suspended = 1; | 1219 | timekeeping_suspended = 1; |
1010 | 1220 | ||
@@ -1014,8 +1224,8 @@ static int timekeeping_suspend(void) | |||
1014 | * try to compensate so the difference in system time | 1224 | * try to compensate so the difference in system time |
1015 | * and persistent_clock time stays close to constant. | 1225 | * and persistent_clock time stays close to constant. |
1016 | */ | 1226 | */ |
1017 | delta = timespec_sub(tk_xtime(tk), timekeeping_suspend_time); | 1227 | delta = timespec64_sub(tk_xtime(tk), timekeeping_suspend_time); |
1018 | delta_delta = timespec_sub(delta, old_delta); | 1228 | delta_delta = timespec64_sub(delta, old_delta); |
1019 | if (abs(delta_delta.tv_sec) >= 2) { | 1229 | if (abs(delta_delta.tv_sec) >= 2) { |
1020 | /* | 1230 | /* |
1021 | * if delta_delta is too large, assume time correction | 1231 | * if delta_delta is too large, assume time correction |
@@ -1025,11 +1235,11 @@ static int timekeeping_suspend(void) | |||
1025 | } else { | 1235 | } else { |
1026 | /* Otherwise try to adjust old_system to compensate */ | 1236 | /* Otherwise try to adjust old_system to compensate */ |
1027 | timekeeping_suspend_time = | 1237 | timekeeping_suspend_time = |
1028 | timespec_add(timekeeping_suspend_time, delta_delta); | 1238 | timespec64_add(timekeeping_suspend_time, delta_delta); |
1029 | } | 1239 | } |
1030 | 1240 | ||
1031 | timekeeping_update(tk, TK_MIRROR); | 1241 | timekeeping_update(tk, TK_MIRROR); |
1032 | write_seqcount_end(&timekeeper_seq); | 1242 | write_seqcount_end(&tk_core.seq); |
1033 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); | 1243 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); |
1034 | 1244 | ||
1035 | clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); | 1245 | clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); |
@@ -1050,125 +1260,34 @@ static int __init timekeeping_init_ops(void) | |||
1050 | register_syscore_ops(&timekeeping_syscore_ops); | 1260 | register_syscore_ops(&timekeeping_syscore_ops); |
1051 | return 0; | 1261 | return 0; |
1052 | } | 1262 | } |
1053 | |||
1054 | device_initcall(timekeeping_init_ops); | 1263 | device_initcall(timekeeping_init_ops); |
1055 | 1264 | ||
1056 | /* | 1265 | /* |
1057 | * If the error is already larger, we look ahead even further | 1266 | * Apply a multiplier adjustment to the timekeeper |
1058 | * to compensate for late or lost adjustments. | ||
1059 | */ | 1267 | */ |
1060 | static __always_inline int timekeeping_bigadjust(struct timekeeper *tk, | 1268 | static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk, |
1061 | s64 error, s64 *interval, | 1269 | s64 offset, |
1062 | s64 *offset) | 1270 | bool negative, |
1271 | int adj_scale) | ||
1063 | { | 1272 | { |
1064 | s64 tick_error, i; | 1273 | s64 interval = tk->cycle_interval; |
1065 | u32 look_ahead, adj; | 1274 | s32 mult_adj = 1; |
1066 | s32 error2, mult; | ||
1067 | |||
1068 | /* | ||
1069 | * Use the current error value to determine how much to look ahead. | ||
1070 | * The larger the error the slower we adjust for it to avoid problems | ||
1071 | * with losing too many ticks, otherwise we would overadjust and | ||
1072 | * produce an even larger error. The smaller the adjustment the | ||
1073 | * faster we try to adjust for it, as lost ticks can do less harm | ||
1074 | * here. This is tuned so that an error of about 1 msec is adjusted | ||
1075 | * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks). | ||
1076 | */ | ||
1077 | error2 = tk->ntp_error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ); | ||
1078 | error2 = abs(error2); | ||
1079 | for (look_ahead = 0; error2 > 0; look_ahead++) | ||
1080 | error2 >>= 2; | ||
1081 | 1275 | ||
1082 | /* | 1276 | if (negative) { |
1083 | * Now calculate the error in (1 << look_ahead) ticks, but first | 1277 | mult_adj = -mult_adj; |
1084 | * remove the single look ahead already included in the error. | 1278 | interval = -interval; |
1085 | */ | 1279 | offset = -offset; |
1086 | tick_error = ntp_tick_length() >> (tk->ntp_error_shift + 1); | ||
1087 | tick_error -= tk->xtime_interval >> 1; | ||
1088 | error = ((error - tick_error) >> look_ahead) + tick_error; | ||
1089 | |||
1090 | /* Finally calculate the adjustment shift value. */ | ||
1091 | i = *interval; | ||
1092 | mult = 1; | ||
1093 | if (error < 0) { | ||
1094 | error = -error; | ||
1095 | *interval = -*interval; | ||
1096 | *offset = -*offset; | ||
1097 | mult = -1; | ||
1098 | } | 1280 | } |
1099 | for (adj = 0; error > i; adj++) | 1281 | mult_adj <<= adj_scale; |
1100 | error >>= 1; | 1282 | interval <<= adj_scale; |
1101 | 1283 | offset <<= adj_scale; | |
1102 | *interval <<= adj; | ||
1103 | *offset <<= adj; | ||
1104 | return mult << adj; | ||
1105 | } | ||
1106 | |||
1107 | /* | ||
1108 | * Adjust the multiplier to reduce the error value, | ||
1109 | * this is optimized for the most common adjustments of -1,0,1, | ||
1110 | * for other values we can do a bit more work. | ||
1111 | */ | ||
1112 | static void timekeeping_adjust(struct timekeeper *tk, s64 offset) | ||
1113 | { | ||
1114 | s64 error, interval = tk->cycle_interval; | ||
1115 | int adj; | ||
1116 | 1284 | ||
1117 | /* | 1285 | /* |
1118 | * The point of this is to check if the error is greater than half | ||
1119 | * an interval. | ||
1120 | * | ||
1121 | * First we shift it down from NTP_SHIFT to clocksource->shifted nsecs. | ||
1122 | * | ||
1123 | * Note we subtract one in the shift, so that error is really error*2. | ||
1124 | * This "saves" dividing(shifting) interval twice, but keeps the | ||
1125 | * (error > interval) comparison as still measuring if error is | ||
1126 | * larger than half an interval. | ||
1127 | * | ||
1128 | * Note: It does not "save" on aggravation when reading the code. | ||
1129 | */ | ||
1130 | error = tk->ntp_error >> (tk->ntp_error_shift - 1); | ||
1131 | if (error > interval) { | ||
1132 | /* | ||
1133 | * We now divide error by 4(via shift), which checks if | ||
1134 | * the error is greater than twice the interval. | ||
1135 | * If it is greater, we need a bigadjust, if its smaller, | ||
1136 | * we can adjust by 1. | ||
1137 | */ | ||
1138 | error >>= 2; | ||
1139 | if (likely(error <= interval)) | ||
1140 | adj = 1; | ||
1141 | else | ||
1142 | adj = timekeeping_bigadjust(tk, error, &interval, &offset); | ||
1143 | } else { | ||
1144 | if (error < -interval) { | ||
1145 | /* See comment above, this is just switched for the negative */ | ||
1146 | error >>= 2; | ||
1147 | if (likely(error >= -interval)) { | ||
1148 | adj = -1; | ||
1149 | interval = -interval; | ||
1150 | offset = -offset; | ||
1151 | } else { | ||
1152 | adj = timekeeping_bigadjust(tk, error, &interval, &offset); | ||
1153 | } | ||
1154 | } else { | ||
1155 | goto out_adjust; | ||
1156 | } | ||
1157 | } | ||
1158 | |||
1159 | if (unlikely(tk->clock->maxadj && | ||
1160 | (tk->mult + adj > tk->clock->mult + tk->clock->maxadj))) { | ||
1161 | printk_deferred_once(KERN_WARNING | ||
1162 | "Adjusting %s more than 11%% (%ld vs %ld)\n", | ||
1163 | tk->clock->name, (long)tk->mult + adj, | ||
1164 | (long)tk->clock->mult + tk->clock->maxadj); | ||
1165 | } | ||
1166 | /* | ||
1167 | * So the following can be confusing. | 1286 | * So the following can be confusing. |
1168 | * | 1287 | * |
1169 | * To keep things simple, lets assume adj == 1 for now. | 1288 | * To keep things simple, lets assume mult_adj == 1 for now. |
1170 | * | 1289 | * |
1171 | * When adj != 1, remember that the interval and offset values | 1290 | * When mult_adj != 1, remember that the interval and offset values |
1172 | * have been appropriately scaled so the math is the same. | 1291 | * have been appropriately scaled so the math is the same. |
1173 | * | 1292 | * |
1174 | * The basic idea here is that we're increasing the multiplier | 1293 | * The basic idea here is that we're increasing the multiplier |
@@ -1212,12 +1331,78 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) | |||
1212 | * | 1331 | * |
1213 | * XXX - TODO: Doc ntp_error calculation. | 1332 | * XXX - TODO: Doc ntp_error calculation. |
1214 | */ | 1333 | */ |
1215 | tk->mult += adj; | 1334 | tk->tkr.mult += mult_adj; |
1216 | tk->xtime_interval += interval; | 1335 | tk->xtime_interval += interval; |
1217 | tk->xtime_nsec -= offset; | 1336 | tk->tkr.xtime_nsec -= offset; |
1218 | tk->ntp_error -= (interval - offset) << tk->ntp_error_shift; | 1337 | tk->ntp_error -= (interval - offset) << tk->ntp_error_shift; |
1338 | } | ||
1339 | |||
1340 | /* | ||
1341 | * Calculate the multiplier adjustment needed to match the frequency | ||
1342 | * specified by NTP | ||
1343 | */ | ||
1344 | static __always_inline void timekeeping_freqadjust(struct timekeeper *tk, | ||
1345 | s64 offset) | ||
1346 | { | ||
1347 | s64 interval = tk->cycle_interval; | ||
1348 | s64 xinterval = tk->xtime_interval; | ||
1349 | s64 tick_error; | ||
1350 | bool negative; | ||
1351 | u32 adj; | ||
1352 | |||
1353 | /* Remove any current error adj from freq calculation */ | ||
1354 | if (tk->ntp_err_mult) | ||
1355 | xinterval -= tk->cycle_interval; | ||
1356 | |||
1357 | tk->ntp_tick = ntp_tick_length(); | ||
1358 | |||
1359 | /* Calculate current error per tick */ | ||
1360 | tick_error = ntp_tick_length() >> tk->ntp_error_shift; | ||
1361 | tick_error -= (xinterval + tk->xtime_remainder); | ||
1362 | |||
1363 | /* Don't worry about correcting it if its small */ | ||
1364 | if (likely((tick_error >= 0) && (tick_error <= interval))) | ||
1365 | return; | ||
1366 | |||
1367 | /* preserve the direction of correction */ | ||
1368 | negative = (tick_error < 0); | ||
1369 | |||
1370 | /* Sort out the magnitude of the correction */ | ||
1371 | tick_error = abs(tick_error); | ||
1372 | for (adj = 0; tick_error > interval; adj++) | ||
1373 | tick_error >>= 1; | ||
1374 | |||
1375 | /* scale the corrections */ | ||
1376 | timekeeping_apply_adjustment(tk, offset, negative, adj); | ||
1377 | } | ||
1378 | |||
1379 | /* | ||
1380 | * Adjust the timekeeper's multiplier to the correct frequency | ||
1381 | * and also to reduce the accumulated error value. | ||
1382 | */ | ||
1383 | static void timekeeping_adjust(struct timekeeper *tk, s64 offset) | ||
1384 | { | ||
1385 | /* Correct for the current frequency error */ | ||
1386 | timekeeping_freqadjust(tk, offset); | ||
1387 | |||
1388 | /* Next make a small adjustment to fix any cumulative error */ | ||
1389 | if (!tk->ntp_err_mult && (tk->ntp_error > 0)) { | ||
1390 | tk->ntp_err_mult = 1; | ||
1391 | timekeeping_apply_adjustment(tk, offset, 0, 0); | ||
1392 | } else if (tk->ntp_err_mult && (tk->ntp_error <= 0)) { | ||
1393 | /* Undo any existing error adjustment */ | ||
1394 | timekeeping_apply_adjustment(tk, offset, 1, 0); | ||
1395 | tk->ntp_err_mult = 0; | ||
1396 | } | ||
1397 | |||
1398 | if (unlikely(tk->tkr.clock->maxadj && | ||
1399 | (tk->tkr.mult > tk->tkr.clock->mult + tk->tkr.clock->maxadj))) { | ||
1400 | printk_once(KERN_WARNING | ||
1401 | "Adjusting %s more than 11%% (%ld vs %ld)\n", | ||
1402 | tk->tkr.clock->name, (long)tk->tkr.mult, | ||
1403 | (long)tk->tkr.clock->mult + tk->tkr.clock->maxadj); | ||
1404 | } | ||
1219 | 1405 | ||
1220 | out_adjust: | ||
1221 | /* | 1406 | /* |
1222 | * It may be possible that when we entered this function, xtime_nsec | 1407 | * It may be possible that when we entered this function, xtime_nsec |
1223 | * was very small. Further, if we're slightly speeding the clocksource | 1408 | * was very small. Further, if we're slightly speeding the clocksource |
@@ -1232,12 +1417,11 @@ out_adjust: | |||
1232 | * We'll correct this error next time through this function, when | 1417 | * We'll correct this error next time through this function, when |
1233 | * xtime_nsec is not as small. | 1418 | * xtime_nsec is not as small. |
1234 | */ | 1419 | */ |
1235 | if (unlikely((s64)tk->xtime_nsec < 0)) { | 1420 | if (unlikely((s64)tk->tkr.xtime_nsec < 0)) { |
1236 | s64 neg = -(s64)tk->xtime_nsec; | 1421 | s64 neg = -(s64)tk->tkr.xtime_nsec; |
1237 | tk->xtime_nsec = 0; | 1422 | tk->tkr.xtime_nsec = 0; |
1238 | tk->ntp_error += neg << tk->ntp_error_shift; | 1423 | tk->ntp_error += neg << tk->ntp_error_shift; |
1239 | } | 1424 | } |
1240 | |||
1241 | } | 1425 | } |
1242 | 1426 | ||
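For scale, it helps to see what one step of the multiplier above actually does to the clock. Accumulated nanoseconds are (cycles * mult) >> shift, so bumping mult by 1 changes the rate by a factor of roughly 1/mult, typically a few tens of parts per billion. The numbers below (a 400 MHz counter with shift 24) are assumptions chosen only to make the arithmetic come out round.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t shift  = 24;
	uint32_t mult   = 41943040;	/* 2.5 * 2^24: one cycle = 2.5 ns */
	uint64_t cycles = 400000000ULL;	/* one second worth of cycles */

	uint64_t ns_base = (cycles * mult) >> shift;
	uint64_t ns_adj  = (cycles * (uint64_t)(mult + 1)) >> shift;

	/* One mult step here is worth ~23 ns per second, i.e. ~23 ppb. */
	printf("base: %llu ns, mult+1: %llu ns, difference: %llu ns/sec\n",
	       (unsigned long long)ns_base, (unsigned long long)ns_adj,
	       (unsigned long long)(ns_adj - ns_base));
	return 0;
}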
1243 | /** | 1427 | /** |
@@ -1250,26 +1434,26 @@ out_adjust: | |||
1250 | */ | 1434 | */ |
1251 | static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) | 1435 | static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) |
1252 | { | 1436 | { |
1253 | u64 nsecps = (u64)NSEC_PER_SEC << tk->shift; | 1437 | u64 nsecps = (u64)NSEC_PER_SEC << tk->tkr.shift; |
1254 | unsigned int clock_set = 0; | 1438 | unsigned int clock_set = 0; |
1255 | 1439 | ||
1256 | while (tk->xtime_nsec >= nsecps) { | 1440 | while (tk->tkr.xtime_nsec >= nsecps) { |
1257 | int leap; | 1441 | int leap; |
1258 | 1442 | ||
1259 | tk->xtime_nsec -= nsecps; | 1443 | tk->tkr.xtime_nsec -= nsecps; |
1260 | tk->xtime_sec++; | 1444 | tk->xtime_sec++; |
1261 | 1445 | ||
1262 | /* Figure out if its a leap sec and apply if needed */ | 1446 | /* Figure out if its a leap sec and apply if needed */ |
1263 | leap = second_overflow(tk->xtime_sec); | 1447 | leap = second_overflow(tk->xtime_sec); |
1264 | if (unlikely(leap)) { | 1448 | if (unlikely(leap)) { |
1265 | struct timespec ts; | 1449 | struct timespec64 ts; |
1266 | 1450 | ||
1267 | tk->xtime_sec += leap; | 1451 | tk->xtime_sec += leap; |
1268 | 1452 | ||
1269 | ts.tv_sec = leap; | 1453 | ts.tv_sec = leap; |
1270 | ts.tv_nsec = 0; | 1454 | ts.tv_nsec = 0; |
1271 | tk_set_wall_to_mono(tk, | 1455 | tk_set_wall_to_mono(tk, |
1272 | timespec_sub(tk->wall_to_monotonic, ts)); | 1456 | timespec64_sub(tk->wall_to_monotonic, ts)); |
1273 | 1457 | ||
1274 | __timekeeping_set_tai_offset(tk, tk->tai_offset - leap); | 1458 | __timekeeping_set_tai_offset(tk, tk->tai_offset - leap); |
1275 | 1459 | ||
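The loop above works on "shifted" nanoseconds: tk->tkr.xtime_nsec stores nanoseconds scaled up by the clocksource shift, so one full second is NSEC_PER_SEC << shift and whole seconds are peeled off one unit at a time. A small standalone sketch of that bookkeeping follows; the struct name, field subset and shift value are illustrative assumptions (leap-second and wall_to_monotonic handling are omitted).

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL

struct tk_sketch {
	uint64_t xtime_sec;	/* whole seconds */
	uint64_t xtime_nsec;	/* nanoseconds << shift */
	uint32_t shift;
};

static unsigned int accumulate_secs(struct tk_sketch *tk)
{
	uint64_t nsecps = NSEC_PER_SEC << tk->shift;	/* one second, shifted */
	unsigned int secs = 0;

	while (tk->xtime_nsec >= nsecps) {
		tk->xtime_nsec -= nsecps;
		tk->xtime_sec++;
		secs++;
	}
	return secs;
}

int main(void)
{
	struct tk_sketch tk = { .xtime_sec = 0, .shift = 10 };

	/* Pretend 2.5 seconds worth of shifted nanoseconds accumulated. */
	tk.xtime_nsec = (5 * NSEC_PER_SEC / 2) << tk.shift;

	printf("rolled over %u secs, %llu shifted ns left\n",
	       accumulate_secs(&tk), (unsigned long long)tk.xtime_nsec);
	return 0;
}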
@@ -1301,9 +1485,9 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, | |||
1301 | 1485 | ||
1302 | /* Accumulate one shifted interval */ | 1486 | /* Accumulate one shifted interval */ |
1303 | offset -= interval; | 1487 | offset -= interval; |
1304 | tk->cycle_last += interval; | 1488 | tk->tkr.cycle_last += interval; |
1305 | 1489 | ||
1306 | tk->xtime_nsec += tk->xtime_interval << shift; | 1490 | tk->tkr.xtime_nsec += tk->xtime_interval << shift; |
1307 | *clock_set |= accumulate_nsecs_to_secs(tk); | 1491 | *clock_set |= accumulate_nsecs_to_secs(tk); |
1308 | 1492 | ||
1309 | /* Accumulate raw time */ | 1493 | /* Accumulate raw time */ |
@@ -1317,48 +1501,20 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, | |||
1317 | tk->raw_time.tv_nsec = raw_nsecs; | 1501 | tk->raw_time.tv_nsec = raw_nsecs; |
1318 | 1502 | ||
1319 | /* Accumulate error between NTP and clock interval */ | 1503 | /* Accumulate error between NTP and clock interval */ |
1320 | tk->ntp_error += ntp_tick_length() << shift; | 1504 | tk->ntp_error += tk->ntp_tick << shift; |
1321 | tk->ntp_error -= (tk->xtime_interval + tk->xtime_remainder) << | 1505 | tk->ntp_error -= (tk->xtime_interval + tk->xtime_remainder) << |
1322 | (tk->ntp_error_shift + shift); | 1506 | (tk->ntp_error_shift + shift); |
1323 | 1507 | ||
1324 | return offset; | 1508 | return offset; |
1325 | } | 1509 | } |
1326 | 1510 | ||
1327 | #ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD | ||
1328 | static inline void old_vsyscall_fixup(struct timekeeper *tk) | ||
1329 | { | ||
1330 | s64 remainder; | ||
1331 | |||
1332 | /* | ||
1333 | * Store only full nanoseconds into xtime_nsec after rounding | ||
1334 | * it up and add the remainder to the error difference. | ||
1335 | * XXX - This is necessary to avoid small 1ns inconsistencies caused | ||
1336 | * by truncating the remainder in vsyscalls. However, it causes | ||
1337 | * additional work to be done in timekeeping_adjust(). Once | ||
1338 | * the vsyscall implementations are converted to use xtime_nsec | ||
1339 | * (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD | ||
1340 | * users are removed, this can be killed. | ||
1341 | */ | ||
1342 | remainder = tk->xtime_nsec & ((1ULL << tk->shift) - 1); | ||
1343 | tk->xtime_nsec -= remainder; | ||
1344 | tk->xtime_nsec += 1ULL << tk->shift; | ||
1345 | tk->ntp_error += remainder << tk->ntp_error_shift; | ||
1346 | tk->ntp_error -= (1ULL << tk->shift) << tk->ntp_error_shift; | ||
1347 | } | ||
1348 | #else | ||
1349 | #define old_vsyscall_fixup(tk) | ||
1350 | #endif | ||
1351 | |||
1352 | |||
1353 | |||
1354 | /** | 1511 | /** |
1355 | * update_wall_time - Uses the current clocksource to increment the wall time | 1512 | * update_wall_time - Uses the current clocksource to increment the wall time |
1356 | * | 1513 | * |
1357 | */ | 1514 | */ |
1358 | void update_wall_time(void) | 1515 | void update_wall_time(void) |
1359 | { | 1516 | { |
1360 | struct clocksource *clock; | 1517 | struct timekeeper *real_tk = &tk_core.timekeeper; |
1361 | struct timekeeper *real_tk = &timekeeper; | ||
1362 | struct timekeeper *tk = &shadow_timekeeper; | 1518 | struct timekeeper *tk = &shadow_timekeeper; |
1363 | cycle_t offset; | 1519 | cycle_t offset; |
1364 | int shift = 0, maxshift; | 1520 | int shift = 0, maxshift; |
@@ -1371,12 +1527,11 @@ void update_wall_time(void) | |||
1371 | if (unlikely(timekeeping_suspended)) | 1527 | if (unlikely(timekeeping_suspended)) |
1372 | goto out; | 1528 | goto out; |
1373 | 1529 | ||
1374 | clock = real_tk->clock; | ||
1375 | |||
1376 | #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET | 1530 | #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET |
1377 | offset = real_tk->cycle_interval; | 1531 | offset = real_tk->cycle_interval; |
1378 | #else | 1532 | #else |
1379 | offset = (clock->read(clock) - clock->cycle_last) & clock->mask; | 1533 | offset = clocksource_delta(tk->tkr.read(tk->tkr.clock), |
1534 | tk->tkr.cycle_last, tk->tkr.mask); | ||
1380 | #endif | 1535 | #endif |
1381 | 1536 | ||
1382 | /* Check if there's really nothing to do */ | 1537 | /* Check if there's really nothing to do */ |
@@ -1418,9 +1573,7 @@ void update_wall_time(void) | |||
1418 | */ | 1573 | */ |
1419 | clock_set |= accumulate_nsecs_to_secs(tk); | 1574 | clock_set |= accumulate_nsecs_to_secs(tk); |
1420 | 1575 | ||
1421 | write_seqcount_begin(&timekeeper_seq); | 1576 | write_seqcount_begin(&tk_core.seq); |
1422 | /* Update clock->cycle_last with the new value */ | ||
1423 | clock->cycle_last = tk->cycle_last; | ||
1424 | /* | 1577 | /* |
1425 | * Update the real timekeeper. | 1578 | * Update the real timekeeper. |
1426 | * | 1579 | * |
@@ -1428,12 +1581,12 @@ void update_wall_time(void) | |||
1428 | * requires changes to all other timekeeper usage sites as | 1581 | * requires changes to all other timekeeper usage sites as |
1429 | * well, i.e. move the timekeeper pointer getter into the | 1582 | * well, i.e. move the timekeeper pointer getter into the |
1430 | * spinlocked/seqcount protected sections. And we trade this | 1583 | * spinlocked/seqcount protected sections. And we trade this |
1431 | * memcpy under the timekeeper_seq against one before we start | 1584 | * memcpy under the tk_core.seq against one before we start |
1432 | * updating. | 1585 | * updating. |
1433 | */ | 1586 | */ |
1434 | memcpy(real_tk, tk, sizeof(*tk)); | 1587 | memcpy(real_tk, tk, sizeof(*tk)); |
1435 | timekeeping_update(real_tk, clock_set); | 1588 | timekeeping_update(real_tk, clock_set); |
1436 | write_seqcount_end(&timekeeper_seq); | 1589 | write_seqcount_end(&tk_core.seq); |
1437 | out: | 1590 | out: |
1438 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); | 1591 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); |
1439 | if (clock_set) | 1592 | if (clock_set) |
@@ -1454,83 +1607,16 @@ out: | |||
1454 | */ | 1607 | */ |
1455 | void getboottime(struct timespec *ts) | 1608 | void getboottime(struct timespec *ts) |
1456 | { | 1609 | { |
1457 | struct timekeeper *tk = &timekeeper; | 1610 | struct timekeeper *tk = &tk_core.timekeeper; |
1458 | struct timespec boottime = { | 1611 | ktime_t t = ktime_sub(tk->offs_real, tk->offs_boot); |
1459 | .tv_sec = tk->wall_to_monotonic.tv_sec + | ||
1460 | tk->total_sleep_time.tv_sec, | ||
1461 | .tv_nsec = tk->wall_to_monotonic.tv_nsec + | ||
1462 | tk->total_sleep_time.tv_nsec | ||
1463 | }; | ||
1464 | |||
1465 | set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec); | ||
1466 | } | ||
1467 | EXPORT_SYMBOL_GPL(getboottime); | ||
1468 | |||
1469 | /** | ||
1470 | * get_monotonic_boottime - Returns monotonic time since boot | ||
1471 | * @ts: pointer to the timespec to be set | ||
1472 | * | ||
1473 | * Returns the monotonic time since boot in a timespec. | ||
1474 | * | ||
1475 | * This is similar to CLOCK_MONTONIC/ktime_get_ts, but also | ||
1476 | * includes the time spent in suspend. | ||
1477 | */ | ||
1478 | void get_monotonic_boottime(struct timespec *ts) | ||
1479 | { | ||
1480 | struct timekeeper *tk = &timekeeper; | ||
1481 | struct timespec tomono, sleep; | ||
1482 | s64 nsec; | ||
1483 | unsigned int seq; | ||
1484 | |||
1485 | WARN_ON(timekeeping_suspended); | ||
1486 | |||
1487 | do { | ||
1488 | seq = read_seqcount_begin(&timekeeper_seq); | ||
1489 | ts->tv_sec = tk->xtime_sec; | ||
1490 | nsec = timekeeping_get_ns(tk); | ||
1491 | tomono = tk->wall_to_monotonic; | ||
1492 | sleep = tk->total_sleep_time; | ||
1493 | |||
1494 | } while (read_seqcount_retry(&timekeeper_seq, seq)); | ||
1495 | |||
1496 | ts->tv_sec += tomono.tv_sec + sleep.tv_sec; | ||
1497 | ts->tv_nsec = 0; | ||
1498 | timespec_add_ns(ts, nsec + tomono.tv_nsec + sleep.tv_nsec); | ||
1499 | } | ||
1500 | EXPORT_SYMBOL_GPL(get_monotonic_boottime); | ||
1501 | |||
1502 | /** | ||
1503 | * ktime_get_boottime - Returns monotonic time since boot in a ktime | ||
1504 | * | ||
1505 | * Returns the monotonic time since boot in a ktime | ||
1506 | * | ||
1507 | * This is similar to CLOCK_MONTONIC/ktime_get, but also | ||
1508 | * includes the time spent in suspend. | ||
1509 | */ | ||
1510 | ktime_t ktime_get_boottime(void) | ||
1511 | { | ||
1512 | struct timespec ts; | ||
1513 | |||
1514 | get_monotonic_boottime(&ts); | ||
1515 | return timespec_to_ktime(ts); | ||
1516 | } | ||
1517 | EXPORT_SYMBOL_GPL(ktime_get_boottime); | ||
1518 | |||
1519 | /** | ||
1520 | * monotonic_to_bootbased - Convert the monotonic time to boot based. | ||
1521 | * @ts: pointer to the timespec to be converted | ||
1522 | */ | ||
1523 | void monotonic_to_bootbased(struct timespec *ts) | ||
1524 | { | ||
1525 | struct timekeeper *tk = &timekeeper; | ||
1526 | 1612 | ||
1527 | *ts = timespec_add(*ts, tk->total_sleep_time); | 1613 | *ts = ktime_to_timespec(t); |
1528 | } | 1614 | } |
1529 | EXPORT_SYMBOL_GPL(monotonic_to_bootbased); | 1615 | EXPORT_SYMBOL_GPL(getboottime); |
1530 | 1616 | ||
1531 | unsigned long get_seconds(void) | 1617 | unsigned long get_seconds(void) |
1532 | { | 1618 | { |
1533 | struct timekeeper *tk = &timekeeper; | 1619 | struct timekeeper *tk = &tk_core.timekeeper; |
1534 | 1620 | ||
1535 | return tk->xtime_sec; | 1621 | return tk->xtime_sec; |
1536 | } | 1622 | } |
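The rewritten getboottime() above leans on two ktime offsets kept by the timekeeper: realtime = monotonic + offs_real and boottime = monotonic + offs_boot, so the realtime moment of boot is simply offs_real - offs_boot. A worked example with plain 64-bit nanosecond values follows; the particular numbers are invented for illustration.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Assumed values: realtime is ~1.4e9 s ahead of the monotonic clock,
	 * and 30 s have been spent suspended since boot. */
	int64_t offs_real = 1400000000LL * 1000000000LL; /* realtime - monotonic */
	int64_t offs_boot = 30LL * 1000000000LL;	 /* boottime - monotonic */

	int64_t boot_epoch_ns = offs_real - offs_boot;	 /* realtime at boot */

	printf("boot happened at %lld s (realtime)\n",
	       (long long)(boot_epoch_ns / 1000000000LL));
	return 0;
}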
@@ -1538,43 +1624,44 @@ EXPORT_SYMBOL(get_seconds); | |||
1538 | 1624 | ||
1539 | struct timespec __current_kernel_time(void) | 1625 | struct timespec __current_kernel_time(void) |
1540 | { | 1626 | { |
1541 | struct timekeeper *tk = &timekeeper; | 1627 | struct timekeeper *tk = &tk_core.timekeeper; |
1542 | 1628 | ||
1543 | return tk_xtime(tk); | 1629 | return timespec64_to_timespec(tk_xtime(tk)); |
1544 | } | 1630 | } |
1545 | 1631 | ||
1546 | struct timespec current_kernel_time(void) | 1632 | struct timespec current_kernel_time(void) |
1547 | { | 1633 | { |
1548 | struct timekeeper *tk = &timekeeper; | 1634 | struct timekeeper *tk = &tk_core.timekeeper; |
1549 | struct timespec now; | 1635 | struct timespec64 now; |
1550 | unsigned long seq; | 1636 | unsigned long seq; |
1551 | 1637 | ||
1552 | do { | 1638 | do { |
1553 | seq = read_seqcount_begin(&timekeeper_seq); | 1639 | seq = read_seqcount_begin(&tk_core.seq); |
1554 | 1640 | ||
1555 | now = tk_xtime(tk); | 1641 | now = tk_xtime(tk); |
1556 | } while (read_seqcount_retry(&timekeeper_seq, seq)); | 1642 | } while (read_seqcount_retry(&tk_core.seq, seq)); |
1557 | 1643 | ||
1558 | return now; | 1644 | return timespec64_to_timespec(now); |
1559 | } | 1645 | } |
1560 | EXPORT_SYMBOL(current_kernel_time); | 1646 | EXPORT_SYMBOL(current_kernel_time); |
1561 | 1647 | ||
1562 | struct timespec get_monotonic_coarse(void) | 1648 | struct timespec get_monotonic_coarse(void) |
1563 | { | 1649 | { |
1564 | struct timekeeper *tk = &timekeeper; | 1650 | struct timekeeper *tk = &tk_core.timekeeper; |
1565 | struct timespec now, mono; | 1651 | struct timespec64 now, mono; |
1566 | unsigned long seq; | 1652 | unsigned long seq; |
1567 | 1653 | ||
1568 | do { | 1654 | do { |
1569 | seq = read_seqcount_begin(&timekeeper_seq); | 1655 | seq = read_seqcount_begin(&tk_core.seq); |
1570 | 1656 | ||
1571 | now = tk_xtime(tk); | 1657 | now = tk_xtime(tk); |
1572 | mono = tk->wall_to_monotonic; | 1658 | mono = tk->wall_to_monotonic; |
1573 | } while (read_seqcount_retry(&timekeeper_seq, seq)); | 1659 | } while (read_seqcount_retry(&tk_core.seq, seq)); |
1574 | 1660 | ||
1575 | set_normalized_timespec(&now, now.tv_sec + mono.tv_sec, | 1661 | set_normalized_timespec64(&now, now.tv_sec + mono.tv_sec, |
1576 | now.tv_nsec + mono.tv_nsec); | 1662 | now.tv_nsec + mono.tv_nsec); |
1577 | return now; | 1663 | |
1664 | return timespec64_to_timespec(now); | ||
1578 | } | 1665 | } |
1579 | 1666 | ||
1580 | /* | 1667 | /* |
@@ -1587,29 +1674,38 @@ void do_timer(unsigned long ticks) | |||
1587 | } | 1674 | } |
1588 | 1675 | ||
1589 | /** | 1676 | /** |
1590 | * get_xtime_and_monotonic_and_sleep_offset() - get xtime, wall_to_monotonic, | 1677 | * ktime_get_update_offsets_tick - hrtimer helper |
1591 | * and sleep offsets. | 1678 | * @offs_real: pointer to storage for monotonic -> realtime offset |
1592 | * @xtim: pointer to timespec to be set with xtime | 1679 | * @offs_boot: pointer to storage for monotonic -> boottime offset |
1593 | * @wtom: pointer to timespec to be set with wall_to_monotonic | 1680 | * @offs_tai: pointer to storage for monotonic -> clock tai offset |
1594 | * @sleep: pointer to timespec to be set with time in suspend | 1681 | * |
1682 | * Returns monotonic time at last tick and various offsets | ||
1595 | */ | 1683 | */ |
1596 | void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, | 1684 | ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real, ktime_t *offs_boot, |
1597 | struct timespec *wtom, struct timespec *sleep) | 1685 | ktime_t *offs_tai) |
1598 | { | 1686 | { |
1599 | struct timekeeper *tk = &timekeeper; | 1687 | struct timekeeper *tk = &tk_core.timekeeper; |
1600 | unsigned long seq; | 1688 | unsigned int seq; |
1689 | ktime_t base; | ||
1690 | u64 nsecs; | ||
1601 | 1691 | ||
1602 | do { | 1692 | do { |
1603 | seq = read_seqcount_begin(&timekeeper_seq); | 1693 | seq = read_seqcount_begin(&tk_core.seq); |
1604 | *xtim = tk_xtime(tk); | 1694 | |
1605 | *wtom = tk->wall_to_monotonic; | 1695 | base = tk->tkr.base_mono; |
1606 | *sleep = tk->total_sleep_time; | 1696 | nsecs = tk->tkr.xtime_nsec >> tk->tkr.shift; |
1607 | } while (read_seqcount_retry(&timekeeper_seq, seq)); | 1697 | |
1698 | *offs_real = tk->offs_real; | ||
1699 | *offs_boot = tk->offs_boot; | ||
1700 | *offs_tai = tk->offs_tai; | ||
1701 | } while (read_seqcount_retry(&tk_core.seq, seq)); | ||
1702 | |||
1703 | return ktime_add_ns(base, nsecs); | ||
1608 | } | 1704 | } |
1609 | 1705 | ||
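ktime_get_update_offsets_tick() above (and the _now variant that follows) compose the monotonic result as a 64-bit nanosecond base (tk->tkr.base_mono) plus a nanosecond remainder: the stored shifted nanoseconds at the last tick here, or a freshly computed delta in the _now case. The tiny example below shows that composition with invented numbers.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int64_t  base_mono  = 5000000000LL;	/* ns as of the last accumulation */
	uint32_t shift      = 8;		/* assumed clocksource shift */
	uint64_t xtime_nsec = 123456ULL << 8;	/* shifted ns since then */

	int64_t now = base_mono + (int64_t)(xtime_nsec >> shift);

	printf("monotonic now = %lld ns\n", (long long)now);	/* 5000123456 */
	return 0;
}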
1610 | #ifdef CONFIG_HIGH_RES_TIMERS | 1706 | #ifdef CONFIG_HIGH_RES_TIMERS |
1611 | /** | 1707 | /** |
1612 | * ktime_get_update_offsets - hrtimer helper | 1708 | * ktime_get_update_offsets_now - hrtimer helper |
1613 | * @offs_real: pointer to storage for monotonic -> realtime offset | 1709 | * @offs_real: pointer to storage for monotonic -> realtime offset |
1614 | * @offs_boot: pointer to storage for monotonic -> boottime offset | 1710 | * @offs_boot: pointer to storage for monotonic -> boottime offset |
1615 | * @offs_tai: pointer to storage for monotonic -> clock tai offset | 1711 | * @offs_tai: pointer to storage for monotonic -> clock tai offset |
@@ -1617,57 +1713,37 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, | |||
1617 | * Returns current monotonic time and updates the offsets | 1713 | * Returns current monotonic time and updates the offsets |
1618 | * Called from hrtimer_interrupt() or retrigger_next_event() | 1714 | * Called from hrtimer_interrupt() or retrigger_next_event() |
1619 | */ | 1715 | */ |
1620 | ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot, | 1716 | ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, ktime_t *offs_boot, |
1621 | ktime_t *offs_tai) | 1717 | ktime_t *offs_tai) |
1622 | { | 1718 | { |
1623 | struct timekeeper *tk = &timekeeper; | 1719 | struct timekeeper *tk = &tk_core.timekeeper; |
1624 | ktime_t now; | ||
1625 | unsigned int seq; | 1720 | unsigned int seq; |
1626 | u64 secs, nsecs; | 1721 | ktime_t base; |
1722 | u64 nsecs; | ||
1627 | 1723 | ||
1628 | do { | 1724 | do { |
1629 | seq = read_seqcount_begin(&timekeeper_seq); | 1725 | seq = read_seqcount_begin(&tk_core.seq); |
1630 | 1726 | ||
1631 | secs = tk->xtime_sec; | 1727 | base = tk->tkr.base_mono; |
1632 | nsecs = timekeeping_get_ns(tk); | 1728 | nsecs = timekeeping_get_ns(&tk->tkr); |
1633 | 1729 | ||
1634 | *offs_real = tk->offs_real; | 1730 | *offs_real = tk->offs_real; |
1635 | *offs_boot = tk->offs_boot; | 1731 | *offs_boot = tk->offs_boot; |
1636 | *offs_tai = tk->offs_tai; | 1732 | *offs_tai = tk->offs_tai; |
1637 | } while (read_seqcount_retry(&timekeeper_seq, seq)); | 1733 | } while (read_seqcount_retry(&tk_core.seq, seq)); |
1638 | 1734 | ||
1639 | now = ktime_add_ns(ktime_set(secs, 0), nsecs); | 1735 | return ktime_add_ns(base, nsecs); |
1640 | now = ktime_sub(now, *offs_real); | ||
1641 | return now; | ||
1642 | } | 1736 | } |
1643 | #endif | 1737 | #endif |
1644 | 1738 | ||
1645 | /** | 1739 | /** |
1646 | * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format | ||
1647 | */ | ||
1648 | ktime_t ktime_get_monotonic_offset(void) | ||
1649 | { | ||
1650 | struct timekeeper *tk = &timekeeper; | ||
1651 | unsigned long seq; | ||
1652 | struct timespec wtom; | ||
1653 | |||
1654 | do { | ||
1655 | seq = read_seqcount_begin(&timekeeper_seq); | ||
1656 | wtom = tk->wall_to_monotonic; | ||
1657 | } while (read_seqcount_retry(&timekeeper_seq, seq)); | ||
1658 | |||
1659 | return timespec_to_ktime(wtom); | ||
1660 | } | ||
1661 | EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset); | ||
1662 | |||
1663 | /** | ||
1664 | * do_adjtimex() - Accessor function to NTP __do_adjtimex function | 1740 | * do_adjtimex() - Accessor function to NTP __do_adjtimex function |
1665 | */ | 1741 | */ |
1666 | int do_adjtimex(struct timex *txc) | 1742 | int do_adjtimex(struct timex *txc) |
1667 | { | 1743 | { |
1668 | struct timekeeper *tk = &timekeeper; | 1744 | struct timekeeper *tk = &tk_core.timekeeper; |
1669 | unsigned long flags; | 1745 | unsigned long flags; |
1670 | struct timespec ts; | 1746 | struct timespec64 ts; |
1671 | s32 orig_tai, tai; | 1747 | s32 orig_tai, tai; |
1672 | int ret; | 1748 | int ret; |
1673 | 1749 | ||
@@ -1687,10 +1763,10 @@ int do_adjtimex(struct timex *txc) | |||
1687 | return ret; | 1763 | return ret; |
1688 | } | 1764 | } |
1689 | 1765 | ||
1690 | getnstimeofday(&ts); | 1766 | getnstimeofday64(&ts); |
1691 | 1767 | ||
1692 | raw_spin_lock_irqsave(&timekeeper_lock, flags); | 1768 | raw_spin_lock_irqsave(&timekeeper_lock, flags); |
1693 | write_seqcount_begin(&timekeeper_seq); | 1769 | write_seqcount_begin(&tk_core.seq); |
1694 | 1770 | ||
1695 | orig_tai = tai = tk->tai_offset; | 1771 | orig_tai = tai = tk->tai_offset; |
1696 | ret = __do_adjtimex(txc, &ts, &tai); | 1772 | ret = __do_adjtimex(txc, &ts, &tai); |
@@ -1699,7 +1775,7 @@ int do_adjtimex(struct timex *txc) | |||
1699 | __timekeeping_set_tai_offset(tk, tai); | 1775 | __timekeeping_set_tai_offset(tk, tai); |
1700 | timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); | 1776 | timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); |
1701 | } | 1777 | } |
1702 | write_seqcount_end(&timekeeper_seq); | 1778 | write_seqcount_end(&tk_core.seq); |
1703 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); | 1779 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); |
1704 | 1780 | ||
1705 | if (tai != orig_tai) | 1781 | if (tai != orig_tai) |
@@ -1719,11 +1795,11 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) | |||
1719 | unsigned long flags; | 1795 | unsigned long flags; |
1720 | 1796 | ||
1721 | raw_spin_lock_irqsave(&timekeeper_lock, flags); | 1797 | raw_spin_lock_irqsave(&timekeeper_lock, flags); |
1722 | write_seqcount_begin(&timekeeper_seq); | 1798 | write_seqcount_begin(&tk_core.seq); |
1723 | 1799 | ||
1724 | __hardpps(phase_ts, raw_ts); | 1800 | __hardpps(phase_ts, raw_ts); |
1725 | 1801 | ||
1726 | write_seqcount_end(&timekeeper_seq); | 1802 | write_seqcount_end(&tk_core.seq); |
1727 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); | 1803 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); |
1728 | } | 1804 | } |
1729 | EXPORT_SYMBOL(hardpps); | 1805 | EXPORT_SYMBOL(hardpps); |
diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h new file mode 100644 index 000000000000..adc1fc98bde3 --- /dev/null +++ b/kernel/time/timekeeping.h | |||
@@ -0,0 +1,20 @@ | |||
1 | #ifndef _KERNEL_TIME_TIMEKEEPING_H | ||
2 | #define _KERNEL_TIME_TIMEKEEPING_H | ||
3 | /* | ||
4 | * Internal interfaces for kernel/time/ | ||
5 | */ | ||
6 | extern ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real, | ||
7 | ktime_t *offs_boot, | ||
8 | ktime_t *offs_tai); | ||
9 | extern ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, | ||
10 | ktime_t *offs_boot, | ||
11 | ktime_t *offs_tai); | ||
12 | |||
13 | extern int timekeeping_valid_for_hres(void); | ||
14 | extern u64 timekeeping_max_deferment(void); | ||
15 | extern int timekeeping_inject_offset(struct timespec *ts); | ||
16 | extern s32 timekeeping_get_tai_offset(void); | ||
17 | extern void timekeeping_set_tai_offset(s32 tai_offset); | ||
18 | extern void timekeeping_clocktai(struct timespec *ts); | ||
19 | |||
20 | #endif | ||
diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c index 4d54f97558df..f6bd65236712 100644 --- a/kernel/time/timekeeping_debug.c +++ b/kernel/time/timekeeping_debug.c | |||
@@ -67,7 +67,7 @@ static int __init tk_debug_sleep_time_init(void) | |||
67 | } | 67 | } |
68 | late_initcall(tk_debug_sleep_time_init); | 68 | late_initcall(tk_debug_sleep_time_init); |
69 | 69 | ||
70 | void tk_debug_account_sleep_time(struct timespec *t) | 70 | void tk_debug_account_sleep_time(struct timespec64 *t) |
71 | { | 71 | { |
72 | sleep_time_bin[fls(t->tv_sec)]++; | 72 | sleep_time_bin[fls(t->tv_sec)]++; |
73 | } | 73 | } |
diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h index 13323ea08ffa..4ea005a7f9da 100644 --- a/kernel/time/timekeeping_internal.h +++ b/kernel/time/timekeeping_internal.h | |||
@@ -3,12 +3,27 @@ | |||
3 | /* | 3 | /* |
4 | * timekeeping debug functions | 4 | * timekeeping debug functions |
5 | */ | 5 | */ |
6 | #include <linux/clocksource.h> | ||
6 | #include <linux/time.h> | 7 | #include <linux/time.h> |
7 | 8 | ||
8 | #ifdef CONFIG_DEBUG_FS | 9 | #ifdef CONFIG_DEBUG_FS |
9 | extern void tk_debug_account_sleep_time(struct timespec *t); | 10 | extern void tk_debug_account_sleep_time(struct timespec64 *t); |
10 | #else | 11 | #else |
11 | #define tk_debug_account_sleep_time(x) | 12 | #define tk_debug_account_sleep_time(x) |
12 | #endif | 13 | #endif |
13 | 14 | ||
15 | #ifdef CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE | ||
16 | static inline cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask) | ||
17 | { | ||
18 | cycle_t ret = (now - last) & mask; | ||
19 | |||
20 | return (s64) ret > 0 ? ret : 0; | ||
21 | } | ||
22 | #else | ||
23 | static inline cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask) | ||
24 | { | ||
25 | return (now - last) & mask; | ||
26 | } | ||
27 | #endif | ||
28 | |||
14 | #endif /* _TIMEKEEPING_INTERNAL_H */ | 29 | #endif /* _TIMEKEEPING_INTERNAL_H */ |
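The new clocksource_delta() helper masks the raw counter difference and, when CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE is enabled, clamps a delta that appears negative (a counter read that went backwards) to zero rather than feeding it into the timekeeper. A small user-space sketch of the same arithmetic, with uint64_t standing in for cycle_t and a full 64-bit mask in the cases where the sign check matters:

	#include <stdint.h>
	#include <stdio.h>

	typedef uint64_t cycle_t;

	/* masked counter delta, clamped to 0 if it looks like time went backwards */
	static cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask)
	{
		cycle_t ret = (now - last) & mask;

		return (int64_t)ret > 0 ? ret : 0;
	}

	int main(void)
	{
		cycle_t full = ~0ULL;		/* 64-bit counter, e.g. a TSC-like source */

		/* normal forward motion: prints 5 */
		printf("%llu\n", (unsigned long long)clocksource_delta(105, 100, full));
		/* counter observed going backwards: clamped, prints 0 */
		printf("%llu\n", (unsigned long long)clocksource_delta(100, 105, full));
		/* 32-bit counter that wrapped: masking still yields the short delta, prints 5 */
		printf("%llu\n", (unsigned long long)clocksource_delta(3, 0xfffffffeULL, 0xffffffffULL));
		return 0;
	}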
diff --git a/kernel/timer.c b/kernel/time/timer.c index 3bb01a323b2a..aca5dfe2fa3d 100644 --- a/kernel/timer.c +++ b/kernel/time/timer.c | |||
@@ -82,6 +82,7 @@ struct tvec_base { | |||
82 | unsigned long next_timer; | 82 | unsigned long next_timer; |
83 | unsigned long active_timers; | 83 | unsigned long active_timers; |
84 | unsigned long all_timers; | 84 | unsigned long all_timers; |
85 | int cpu; | ||
85 | struct tvec_root tv1; | 86 | struct tvec_root tv1; |
86 | struct tvec tv2; | 87 | struct tvec tv2; |
87 | struct tvec tv3; | 88 | struct tvec tv3; |
@@ -409,6 +410,22 @@ static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) | |||
409 | base->next_timer = timer->expires; | 410 | base->next_timer = timer->expires; |
410 | } | 411 | } |
411 | base->all_timers++; | 412 | base->all_timers++; |
413 | |||
414 | /* | ||
415 | * Check whether the other CPU is in dynticks mode and needs | ||
416 | * to be triggered to reevaluate the timer wheel. | ||
417 | * We are protected against the other CPU fiddling | ||
418 | * with the timer by holding the timer base lock. This also | ||
419 | * makes sure that a CPU on the way to stop its tick can not | ||
420 | * evaluate the timer wheel. | ||
421 | * | ||
422 | * Spare the IPI for deferrable timers on idle targets though. | ||
423 | * The next busy ticks will take care of it. Except full dynticks | ||
424 | * require special care against races with idle_cpu(), lets deal | ||
425 | * with that later. | ||
426 | */ | ||
427 | if (!tbase_get_deferrable(base) || tick_nohz_full_cpu(base->cpu)) | ||
428 | wake_up_nohz_cpu(base->cpu); | ||
412 | } | 429 | } |
413 | 430 | ||
414 | #ifdef CONFIG_TIMER_STATS | 431 | #ifdef CONFIG_TIMER_STATS |
@@ -948,22 +965,6 @@ void add_timer_on(struct timer_list *timer, int cpu) | |||
948 | timer_set_base(timer, base); | 965 | timer_set_base(timer, base); |
949 | debug_activate(timer, timer->expires); | 966 | debug_activate(timer, timer->expires); |
950 | internal_add_timer(base, timer); | 967 | internal_add_timer(base, timer); |
951 | /* | ||
952 | * Check whether the other CPU is in dynticks mode and needs | ||
953 | * to be triggered to reevaluate the timer wheel. | ||
954 | * We are protected against the other CPU fiddling | ||
955 | * with the timer by holding the timer base lock. This also | ||
956 | * makes sure that a CPU on the way to stop its tick can not | ||
957 | * evaluate the timer wheel. | ||
958 | * | ||
959 | * Spare the IPI for deferrable timers on idle targets though. | ||
960 | * The next busy ticks will take care of it. Except full dynticks | ||
961 | * require special care against races with idle_cpu(), lets deal | ||
962 | * with that later. | ||
963 | */ | ||
964 | if (!tbase_get_deferrable(timer->base) || tick_nohz_full_cpu(cpu)) | ||
965 | wake_up_nohz_cpu(cpu); | ||
966 | |||
967 | spin_unlock_irqrestore(&base->lock, flags); | 968 | spin_unlock_irqrestore(&base->lock, flags); |
968 | } | 969 | } |
969 | EXPORT_SYMBOL_GPL(add_timer_on); | 970 | EXPORT_SYMBOL_GPL(add_timer_on); |
@@ -1568,6 +1569,7 @@ static int init_timers_cpu(int cpu) | |||
1568 | } | 1569 | } |
1569 | spin_lock_init(&base->lock); | 1570 | spin_lock_init(&base->lock); |
1570 | tvec_base_done[cpu] = 1; | 1571 | tvec_base_done[cpu] = 1; |
1572 | base->cpu = cpu; | ||
1571 | } else { | 1573 | } else { |
1572 | base = per_cpu(tvec_bases, cpu); | 1574 | base = per_cpu(tvec_bases, cpu); |
1573 | } | 1575 | } |
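With the owning CPU now recorded in each tvec_base at init time, the nohz wakeup decision that previously lived only in add_timer_on() is made in internal_add_timer() for every enqueue. The predicate itself is small; a sketch of that decision, where the parameter names are illustrative stand-ins for tbase_get_deferrable() and tick_nohz_full_cpu():

	#include <stdbool.h>
	#include <stdio.h>

	/* kick the target CPU unless the timer is deferrable and the target is an
	 * ordinary (non full-dynticks) CPU that will handle it on its next busy tick */
	static bool needs_nohz_kick(bool deferrable, bool target_is_full_nohz)
	{
		return !deferrable || target_is_full_nohz;
	}

	int main(void)
	{
		printf("%d\n", needs_nohz_kick(false, false));	/* 1: regular timer, always kick */
		printf("%d\n", needs_nohz_kick(true,  false));	/* 0: deferrable, defer to next tick */
		printf("%d\n", needs_nohz_kick(true,  true));	/* 1: full dynticks target, kick anyway */
		return 0;
	}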
diff --git a/kernel/time/udelay_test.c b/kernel/time/udelay_test.c new file mode 100644 index 000000000000..e622ba365a13 --- /dev/null +++ b/kernel/time/udelay_test.c | |||
@@ -0,0 +1,168 @@ | |||
1 | /* | ||
2 | * udelay() test kernel module | ||
3 | * | ||
4 | * Test is executed by writing and reading to /sys/kernel/debug/udelay_test | ||
5 | * Tests are configured by writing: USECS ITERATIONS | ||
6 | * Tests are executed by reading from the same file. | ||
7 | * Specifying usecs of 0 or negative values will run multiples tests. | ||
8 | * | ||
9 | * Copyright (C) 2014 Google, Inc. | ||
10 | * | ||
11 | * This software is licensed under the terms of the GNU General Public | ||
12 | * License version 2, as published by the Free Software Foundation, and | ||
13 | * may be copied, distributed, and modified under those terms. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
18 | * GNU General Public License for more details. | ||
19 | */ | ||
20 | |||
21 | #include <linux/debugfs.h> | ||
22 | #include <linux/delay.h> | ||
23 | #include <linux/ktime.h> | ||
24 | #include <linux/module.h> | ||
25 | #include <linux/uaccess.h> | ||
26 | |||
27 | #define DEFAULT_ITERATIONS 100 | ||
28 | |||
29 | #define DEBUGFS_FILENAME "udelay_test" | ||
30 | |||
31 | static DEFINE_MUTEX(udelay_test_lock); | ||
32 | static struct dentry *udelay_test_debugfs_file; | ||
33 | static int udelay_test_usecs; | ||
34 | static int udelay_test_iterations = DEFAULT_ITERATIONS; | ||
35 | |||
36 | static int udelay_test_single(struct seq_file *s, int usecs, uint32_t iters) | ||
37 | { | ||
38 | int min = 0, max = 0, fail_count = 0; | ||
39 | uint64_t sum = 0; | ||
40 | uint64_t avg; | ||
41 | int i; | ||
42 | /* Allow udelay to be up to 0.5% fast */ | ||
43 | int allowed_error_ns = usecs * 5; | ||
44 | |||
45 | for (i = 0; i < iters; ++i) { | ||
46 | struct timespec ts1, ts2; | ||
47 | int time_passed; | ||
48 | |||
49 | ktime_get_ts(&ts1); | ||
50 | udelay(usecs); | ||
51 | ktime_get_ts(&ts2); | ||
52 | time_passed = timespec_to_ns(&ts2) - timespec_to_ns(&ts1); | ||
53 | |||
54 | if (i == 0 || time_passed < min) | ||
55 | min = time_passed; | ||
56 | if (i == 0 || time_passed > max) | ||
57 | max = time_passed; | ||
58 | if ((time_passed + allowed_error_ns) / 1000 < usecs) | ||
59 | ++fail_count; | ||
60 | WARN_ON(time_passed < 0); | ||
61 | sum += time_passed; | ||
62 | } | ||
63 | |||
64 | avg = sum; | ||
65 | do_div(avg, iters); | ||
66 | seq_printf(s, "%d usecs x %d: exp=%d allowed=%d min=%d avg=%lld max=%d", | ||
67 | usecs, iters, usecs * 1000, | ||
68 | (usecs * 1000) - allowed_error_ns, min, avg, max); | ||
69 | if (fail_count) | ||
70 | seq_printf(s, " FAIL=%d", fail_count); | ||
71 | seq_puts(s, "\n"); | ||
72 | |||
73 | return 0; | ||
74 | } | ||
75 | |||
76 | static int udelay_test_show(struct seq_file *s, void *v) | ||
77 | { | ||
78 | int usecs; | ||
79 | int iters; | ||
80 | int ret = 0; | ||
81 | |||
82 | mutex_lock(&udelay_test_lock); | ||
83 | usecs = udelay_test_usecs; | ||
84 | iters = udelay_test_iterations; | ||
85 | mutex_unlock(&udelay_test_lock); | ||
86 | |||
87 | if (usecs > 0 && iters > 0) { | ||
88 | return udelay_test_single(s, usecs, iters); | ||
89 | } else if (usecs == 0) { | ||
90 | struct timespec ts; | ||
91 | |||
92 | ktime_get_ts(&ts); | ||
93 | seq_printf(s, "udelay() test (lpj=%ld kt=%ld.%09ld)\n", | ||
94 | loops_per_jiffy, ts.tv_sec, ts.tv_nsec); | ||
95 | seq_puts(s, "usage:\n"); | ||
96 | seq_puts(s, "echo USECS [ITERS] > " DEBUGFS_FILENAME "\n"); | ||
97 | seq_puts(s, "cat " DEBUGFS_FILENAME "\n"); | ||
98 | } | ||
99 | |||
100 | return ret; | ||
101 | } | ||
102 | |||
103 | static int udelay_test_open(struct inode *inode, struct file *file) | ||
104 | { | ||
105 | return single_open(file, udelay_test_show, inode->i_private); | ||
106 | } | ||
107 | |||
108 | static ssize_t udelay_test_write(struct file *file, const char __user *buf, | ||
109 | size_t count, loff_t *pos) | ||
110 | { | ||
111 | char lbuf[32]; | ||
112 | int ret; | ||
113 | int usecs; | ||
114 | int iters; | ||
115 | |||
116 | if (count >= sizeof(lbuf)) | ||
117 | return -EINVAL; | ||
118 | |||
119 | if (copy_from_user(lbuf, buf, count)) | ||
120 | return -EFAULT; | ||
121 | lbuf[count] = '\0'; | ||
122 | |||
123 | ret = sscanf(lbuf, "%d %d", &usecs, &iters); | ||
124 | if (ret < 1) | ||
125 | return -EINVAL; | ||
126 | else if (ret < 2) | ||
127 | iters = DEFAULT_ITERATIONS; | ||
128 | |||
129 | mutex_lock(&udelay_test_lock); | ||
130 | udelay_test_usecs = usecs; | ||
131 | udelay_test_iterations = iters; | ||
132 | mutex_unlock(&udelay_test_lock); | ||
133 | |||
134 | return count; | ||
135 | } | ||
136 | |||
137 | static const struct file_operations udelay_test_debugfs_ops = { | ||
138 | .owner = THIS_MODULE, | ||
139 | .open = udelay_test_open, | ||
140 | .read = seq_read, | ||
141 | .write = udelay_test_write, | ||
142 | .llseek = seq_lseek, | ||
143 | .release = single_release, | ||
144 | }; | ||
145 | |||
146 | static int __init udelay_test_init(void) | ||
147 | { | ||
148 | mutex_lock(&udelay_test_lock); | ||
149 | udelay_test_debugfs_file = debugfs_create_file(DEBUGFS_FILENAME, | ||
150 | S_IRUSR, NULL, NULL, &udelay_test_debugfs_ops); | ||
151 | mutex_unlock(&udelay_test_lock); | ||
152 | |||
153 | return 0; | ||
154 | } | ||
155 | |||
156 | module_init(udelay_test_init); | ||
157 | |||
158 | static void __exit udelay_test_exit(void) | ||
159 | { | ||
160 | mutex_lock(&udelay_test_lock); | ||
161 | debugfs_remove(udelay_test_debugfs_file); | ||
162 | mutex_unlock(&udelay_test_lock); | ||
163 | } | ||
164 | |||
165 | module_exit(udelay_test_exit); | ||
166 | |||
167 | MODULE_AUTHOR("David Riley <davidriley@chromium.org>"); | ||
168 | MODULE_LICENSE("GPL"); | ||
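As the module's own help text describes, a test is configured by writing "USECS [ITERS]" to the debugfs file and executed by reading it back, with the iteration count defaulting to 100 when omitted. The tolerance is allowed_error_ns = usecs * 5, i.e. udelay() may be up to 0.5% fast: for a 100 us delay, an iteration is counted as a failure only if it completes in under 99500 ns. Assuming debugfs is mounted at /sys/kernel/debug (as the header comment assumes), a run looks like:

	echo 100 50 > /sys/kernel/debug/udelay_test
	cat /sys/kernel/debug/udelay_test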
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 925f629658d6..afb04b9b818a 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c | |||
@@ -1968,7 +1968,7 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta) | |||
1968 | 1968 | ||
1969 | /** | 1969 | /** |
1970 | * rb_update_event - update event type and data | 1970 | * rb_update_event - update event type and data |
1971 | * @event: the even to update | 1971 | * @event: the event to update |
1972 | * @type: the type of event | 1972 | * @type: the type of event |
1973 | * @length: the size of the event field in the ring buffer | 1973 | * @length: the size of the event field in the ring buffer |
1974 | * | 1974 | * |
@@ -3341,21 +3341,16 @@ static void rb_iter_reset(struct ring_buffer_iter *iter) | |||
3341 | struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; | 3341 | struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; |
3342 | 3342 | ||
3343 | /* Iterator usage is expected to have record disabled */ | 3343 | /* Iterator usage is expected to have record disabled */ |
3344 | if (list_empty(&cpu_buffer->reader_page->list)) { | 3344 | iter->head_page = cpu_buffer->reader_page; |
3345 | iter->head_page = rb_set_head_page(cpu_buffer); | 3345 | iter->head = cpu_buffer->reader_page->read; |
3346 | if (unlikely(!iter->head_page)) | 3346 | |
3347 | return; | 3347 | iter->cache_reader_page = iter->head_page; |
3348 | iter->head = iter->head_page->read; | 3348 | iter->cache_read = iter->head; |
3349 | } else { | 3349 | |
3350 | iter->head_page = cpu_buffer->reader_page; | ||
3351 | iter->head = cpu_buffer->reader_page->read; | ||
3352 | } | ||
3353 | if (iter->head) | 3350 | if (iter->head) |
3354 | iter->read_stamp = cpu_buffer->read_stamp; | 3351 | iter->read_stamp = cpu_buffer->read_stamp; |
3355 | else | 3352 | else |
3356 | iter->read_stamp = iter->head_page->page->time_stamp; | 3353 | iter->read_stamp = iter->head_page->page->time_stamp; |
3357 | iter->cache_reader_page = cpu_buffer->reader_page; | ||
3358 | iter->cache_read = cpu_buffer->read; | ||
3359 | } | 3354 | } |
3360 | 3355 | ||
3361 | /** | 3356 | /** |
@@ -3748,12 +3743,14 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) | |||
3748 | return NULL; | 3743 | return NULL; |
3749 | 3744 | ||
3750 | /* | 3745 | /* |
3751 | * We repeat when a time extend is encountered. | 3746 | * We repeat when a time extend is encountered or we hit |
3752 | * Since the time extend is always attached to a data event, | 3747 | * the end of the page. Since the time extend is always attached |
3753 | * we should never loop more than once. | 3748 | * to a data event, we should never loop more than three times. |
3754 | * (We never hit the following condition more than twice). | 3749 | * Once for going to next page, once on time extend, and |
3750 | * finally once to get the event. | ||
3751 | * (We never hit the following condition more than thrice). | ||
3755 | */ | 3752 | */ |
3756 | if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2)) | 3753 | if (RB_WARN_ON(cpu_buffer, ++nr_loops > 3)) |
3757 | return NULL; | 3754 | return NULL; |
3758 | 3755 | ||
3759 | if (rb_per_cpu_empty(cpu_buffer)) | 3756 | if (rb_per_cpu_empty(cpu_buffer)) |
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8bb80fe08767..8a528392b1f4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
@@ -820,11 +820,12 @@ static struct { | |||
820 | const char *name; | 820 | const char *name; |
821 | int in_ns; /* is this clock in nanoseconds? */ | 821 | int in_ns; /* is this clock in nanoseconds? */ |
822 | } trace_clocks[] = { | 822 | } trace_clocks[] = { |
823 | { trace_clock_local, "local", 1 }, | 823 | { trace_clock_local, "local", 1 }, |
824 | { trace_clock_global, "global", 1 }, | 824 | { trace_clock_global, "global", 1 }, |
825 | { trace_clock_counter, "counter", 0 }, | 825 | { trace_clock_counter, "counter", 0 }, |
826 | { trace_clock_jiffies, "uptime", 0 }, | 826 | { trace_clock_jiffies, "uptime", 0 }, |
827 | { trace_clock, "perf", 1 }, | 827 | { trace_clock, "perf", 1 }, |
828 | { ktime_get_mono_fast_ns, "mono", 1 }, | ||
828 | ARCH_TRACE_CLOCKS | 829 | ARCH_TRACE_CLOCKS |
829 | }; | 830 | }; |
830 | 831 | ||
diff --git a/kernel/tsacct.c b/kernel/tsacct.c index a1dd9a1b1327..975cb49e32bf 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c | |||
@@ -31,20 +31,19 @@ void bacct_add_tsk(struct user_namespace *user_ns, | |||
31 | struct taskstats *stats, struct task_struct *tsk) | 31 | struct taskstats *stats, struct task_struct *tsk) |
32 | { | 32 | { |
33 | const struct cred *tcred; | 33 | const struct cred *tcred; |
34 | struct timespec uptime, ts; | ||
35 | cputime_t utime, stime, utimescaled, stimescaled; | 34 | cputime_t utime, stime, utimescaled, stimescaled; |
36 | u64 ac_etime; | 35 | u64 delta; |
37 | 36 | ||
38 | BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN); | 37 | BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN); |
39 | 38 | ||
40 | /* calculate task elapsed time in timespec */ | 39 | /* calculate task elapsed time in nsec */ |
41 | do_posix_clock_monotonic_gettime(&uptime); | 40 | delta = ktime_get_ns() - tsk->start_time; |
42 | ts = timespec_sub(uptime, tsk->start_time); | 41 | /* Convert to micro seconds */ |
43 | /* rebase elapsed time to usec (should never be negative) */ | 42 | do_div(delta, NSEC_PER_USEC); |
44 | ac_etime = timespec_to_ns(&ts); | 43 | stats->ac_etime = delta; |
45 | do_div(ac_etime, NSEC_PER_USEC); | 44 | /* Convert to seconds for btime */ |
46 | stats->ac_etime = ac_etime; | 45 | do_div(delta, USEC_PER_SEC); |
47 | stats->ac_btime = get_seconds() - ts.tv_sec; | 46 | stats->ac_btime = get_seconds() - delta; |
48 | if (thread_group_leader(tsk)) { | 47 | if (thread_group_leader(tsk)) { |
49 | stats->ac_exitcode = tsk->exit_code; | 48 | stats->ac_exitcode = tsk->exit_code; |
50 | if (tsk->flags & PF_FORKNOEXEC) | 49 | if (tsk->flags & PF_FORKNOEXEC) |
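The rewritten accounting path derives everything from one nanosecond delta: ktime_get_ns() minus the task's start_time gives elapsed nanoseconds, a do_div() by NSEC_PER_USEC yields ac_etime in microseconds, and a second do_div() by USEC_PER_SEC leaves whole seconds for the ac_btime calculation. A quick user-space sketch of that conversion chain, with made-up sample values:

	#include <stdint.h>
	#include <stdio.h>

	#define NSEC_PER_USEC 1000ULL
	#define USEC_PER_SEC  1000000ULL

	int main(void)
	{
		/* hypothetical readings: the task started 90 s before "now" */
		uint64_t now_ns   = 500000000000ULL;	/* stand-in for ktime_get_ns() */
		uint64_t start_ns = 410000000000ULL;	/* stand-in for tsk->start_time */
		uint64_t delta = now_ns - start_ns;	/* elapsed time in ns */

		delta /= NSEC_PER_USEC;			/* ac_etime, in usec */
		printf("ac_etime = %llu usec\n", (unsigned long long)delta);	/* 90000000 */

		delta /= USEC_PER_SEC;			/* whole seconds, subtracted for btime */
		printf("elapsed  = %llu s\n", (unsigned long long)delta);	/* 90 */
		return 0;
	}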
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index fcc02560fd6b..aa312b0dc3ec 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c | |||
@@ -526,21 +526,21 @@ static void m_stop(struct seq_file *seq, void *v) | |||
526 | return; | 526 | return; |
527 | } | 527 | } |
528 | 528 | ||
529 | struct seq_operations proc_uid_seq_operations = { | 529 | const struct seq_operations proc_uid_seq_operations = { |
530 | .start = uid_m_start, | 530 | .start = uid_m_start, |
531 | .stop = m_stop, | 531 | .stop = m_stop, |
532 | .next = m_next, | 532 | .next = m_next, |
533 | .show = uid_m_show, | 533 | .show = uid_m_show, |
534 | }; | 534 | }; |
535 | 535 | ||
536 | struct seq_operations proc_gid_seq_operations = { | 536 | const struct seq_operations proc_gid_seq_operations = { |
537 | .start = gid_m_start, | 537 | .start = gid_m_start, |
538 | .stop = m_stop, | 538 | .stop = m_stop, |
539 | .next = m_next, | 539 | .next = m_next, |
540 | .show = gid_m_show, | 540 | .show = gid_m_show, |
541 | }; | 541 | }; |
542 | 542 | ||
543 | struct seq_operations proc_projid_seq_operations = { | 543 | const struct seq_operations proc_projid_seq_operations = { |
544 | .start = projid_m_start, | 544 | .start = projid_m_start, |
545 | .stop = m_stop, | 545 | .stop = m_stop, |
546 | .next = m_next, | 546 | .next = m_next, |
diff --git a/kernel/utsname.c b/kernel/utsname.c index fd393124e507..883aaaa7de8a 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c | |||
@@ -93,13 +93,13 @@ static void *utsns_get(struct task_struct *task) | |||
93 | struct uts_namespace *ns = NULL; | 93 | struct uts_namespace *ns = NULL; |
94 | struct nsproxy *nsproxy; | 94 | struct nsproxy *nsproxy; |
95 | 95 | ||
96 | rcu_read_lock(); | 96 | task_lock(task); |
97 | nsproxy = task_nsproxy(task); | 97 | nsproxy = task->nsproxy; |
98 | if (nsproxy) { | 98 | if (nsproxy) { |
99 | ns = nsproxy->uts_ns; | 99 | ns = nsproxy->uts_ns; |
100 | get_uts_ns(ns); | 100 | get_uts_ns(ns); |
101 | } | 101 | } |
102 | rcu_read_unlock(); | 102 | task_unlock(task); |
103 | 103 | ||
104 | return ns; | 104 | return ns; |
105 | } | 105 | } |
diff --git a/kernel/watchdog.c b/kernel/watchdog.c index c3319bd1b040..a8d6914030fe 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c | |||
@@ -260,9 +260,11 @@ static void watchdog_overflow_callback(struct perf_event *event, | |||
260 | return; | 260 | return; |
261 | 261 | ||
262 | if (hardlockup_panic) | 262 | if (hardlockup_panic) |
263 | panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu); | 263 | panic("Watchdog detected hard LOCKUP on cpu %d", |
264 | this_cpu); | ||
264 | else | 265 | else |
265 | WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu); | 266 | WARN(1, "Watchdog detected hard LOCKUP on cpu %d", |
267 | this_cpu); | ||
266 | 268 | ||
267 | __this_cpu_write(hard_watchdog_warn, true); | 269 | __this_cpu_write(hard_watchdog_warn, true); |
268 | return; | 270 | return; |
@@ -345,7 +347,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) | |||
345 | } | 347 | } |
346 | } | 348 | } |
347 | 349 | ||
348 | printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", | 350 | pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", |
349 | smp_processor_id(), duration, | 351 | smp_processor_id(), duration, |
350 | current->comm, task_pid_nr(current)); | 352 | current->comm, task_pid_nr(current)); |
351 | print_modules(); | 353 | print_modules(); |
@@ -366,6 +368,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) | |||
366 | smp_mb__after_atomic(); | 368 | smp_mb__after_atomic(); |
367 | } | 369 | } |
368 | 370 | ||
371 | add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); | ||
369 | if (softlockup_panic) | 372 | if (softlockup_panic) |
370 | panic("softlockup: hung tasks"); | 373 | panic("softlockup: hung tasks"); |
371 | __this_cpu_write(soft_watchdog_warn, true); | 374 | __this_cpu_write(soft_watchdog_warn, true); |
@@ -484,7 +487,7 @@ static int watchdog_nmi_enable(unsigned int cpu) | |||
484 | if (PTR_ERR(event) == -EOPNOTSUPP) | 487 | if (PTR_ERR(event) == -EOPNOTSUPP) |
485 | pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu); | 488 | pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu); |
486 | else if (PTR_ERR(event) == -ENOENT) | 489 | else if (PTR_ERR(event) == -ENOENT) |
487 | pr_warning("disabled (cpu%i): hardware events not enabled\n", | 490 | pr_warn("disabled (cpu%i): hardware events not enabled\n", |
488 | cpu); | 491 | cpu); |
489 | else | 492 | else |
490 | pr_err("disabled (cpu%i): unable to create perf event: %ld\n", | 493 | pr_err("disabled (cpu%i): unable to create perf event: %ld\n", |