author    Ingo Molnar <mingo@kernel.org>    2014-08-24 16:32:24 -0400
committer Ingo Molnar <mingo@kernel.org>    2014-08-24 16:32:24 -0400
commit    83bc90e11576f9c100f8ef4ba2bcd0b89212e3fb (patch)
tree      e59186b4d315c80255851e0d204143ecc21399a0 /kernel
parent    e21ded5ecc531a64d6fc0c1693285e890b4e9569 (diff)
parent    451fd72219dd6f3355e2d036c598544c760ee532 (diff)
Merge branch 'linus' into perf/core, to fix conflicts
Conflicts:
	arch/x86/kernel/cpu/perf_event_intel_uncore*.c

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 28
-rw-r--r--  kernel/acct.c | 494
-rw-r--r--  kernel/audit.c | 2
-rw-r--r--  kernel/auditfilter.c | 4
-rw-r--r--  kernel/bounds.c | 2
-rw-r--r--  kernel/bpf/Makefile | 1
-rw-r--r--  kernel/bpf/core.c | 534
-rw-r--r--  kernel/capability.c | 4
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 2
-rw-r--r--  kernel/delayacct.c | 62
-rw-r--r--  kernel/events/uprobes.c | 15
-rw-r--r--  kernel/exit.c | 50
-rw-r--r--  kernel/fork.c | 133
-rw-r--r--  kernel/gcov/fs.c | 3
-rw-r--r--  kernel/irq/generic-chip.c | 5
-rw-r--r--  kernel/irq/irqdomain.c | 2
-rw-r--r--  kernel/kallsyms.c | 2
-rw-r--r--  kernel/kexec.c | 1291
-rw-r--r--  kernel/module.c | 19
-rw-r--r--  kernel/nsproxy.c | 15
-rw-r--r--  kernel/panic.c | 1
-rw-r--r--  kernel/params.c | 1
-rw-r--r--  kernel/power/Kconfig | 3
-rw-r--r--  kernel/power/main.c | 25
-rw-r--r--  kernel/power/power.h | 7
-rw-r--r--  kernel/power/snapshot.c | 515
-rw-r--r--  kernel/power/suspend.c | 152
-rw-r--r--  kernel/power/suspend_test.c | 12
-rw-r--r--  kernel/printk/printk.c | 169
-rw-r--r--  kernel/resource.c | 101
-rw-r--r--  kernel/sched/core.c | 7
-rw-r--r--  kernel/sched/idle.c | 4
-rw-r--r--  kernel/sched/proc.c | 7
-rw-r--r--  kernel/seccomp.c | 430
-rw-r--r--  kernel/signal.c | 46
-rw-r--r--  kernel/smp.c | 2
-rw-r--r--  kernel/sys.c | 4
-rw-r--r--  kernel/sys_ni.c | 5
-rw-r--r--  kernel/sysctl.c | 9
-rw-r--r--  kernel/sysctl_binary.c | 1
-rw-r--r--  kernel/system_keyring.c | 1
-rw-r--r--  kernel/test_kprobes.c | 87
-rw-r--r--  kernel/time/Kconfig | 9
-rw-r--r--  kernel/time/Makefile | 19
-rw-r--r--  kernel/time/alarmtimer.c | 2
-rw-r--r--  kernel/time/clocksource.c | 12
-rw-r--r--  kernel/time/hrtimer.c (renamed from kernel/hrtimer.c) | 125
-rw-r--r--  kernel/time/itimer.c (renamed from kernel/itimer.c) | 0
-rw-r--r--  kernel/time/ntp.c | 15
-rw-r--r--  kernel/time/ntp_internal.h | 2
-rw-r--r--  kernel/time/posix-cpu-timers.c (renamed from kernel/posix-cpu-timers.c) | 0
-rw-r--r--  kernel/time/posix-timers.c (renamed from kernel/posix-timers.c) | 2
-rw-r--r--  kernel/time/tick-internal.h | 2
-rw-r--r--  kernel/time/time.c (renamed from kernel/time.c) | 64
-rw-r--r--  kernel/time/timeconst.bc (renamed from kernel/timeconst.bc) | 0
-rw-r--r--  kernel/time/timekeeping.c | 1148
-rw-r--r--  kernel/time/timekeeping.h | 20
-rw-r--r--  kernel/time/timekeeping_debug.c | 2
-rw-r--r--  kernel/time/timekeeping_internal.h | 17
-rw-r--r--  kernel/time/timer.c (renamed from kernel/timer.c) | 34
-rw-r--r--  kernel/time/udelay_test.c | 168
-rw-r--r--  kernel/trace/ring_buffer.c | 31
-rw-r--r--  kernel/trace/trace.c | 11
-rw-r--r--  kernel/tsacct.c | 19
-rw-r--r--  kernel/user_namespace.c | 6
-rw-r--r--  kernel/utsname.c | 6
-rw-r--r--  kernel/watchdog.c | 11
67 files changed, 4265 insertions, 1717 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index f2a8b6246ce9..dc5c77544fd6 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -3,12 +3,11 @@
 #
 
 obj-y     = fork.o exec_domain.o panic.o \
-	    cpu.o exit.o itimer.o time.o softirq.o resource.o \
-	    sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \
+	    cpu.o exit.o softirq.o resource.o \
+	    sysctl.o sysctl_binary.o capability.o ptrace.o user.o \
 	    signal.o sys.o kmod.o workqueue.o pid.o task_work.o \
-	    extable.o params.o posix-timers.o \
-	    kthread.o sys_ni.o posix-cpu-timers.o \
-	    hrtimer.o nsproxy.o \
+	    extable.o params.o \
+	    kthread.o sys_ni.o nsproxy.o \
 	    notifier.o ksysfs.o cred.o reboot.o \
 	    async.o range.o groups.o smpboot.o
 
@@ -87,6 +86,7 @@ obj-$(CONFIG_RING_BUFFER) += trace/
 obj-$(CONFIG_TRACEPOINTS) += trace/
 obj-$(CONFIG_IRQ_WORK) += irq_work.o
 obj-$(CONFIG_CPU_PM) += cpu_pm.o
+obj-$(CONFIG_NET) += bpf/
 
 obj-$(CONFIG_PERF_EVENTS) += events/
 
@@ -105,27 +105,11 @@ targets += config_data.gz
 $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE
 	$(call if_changed,gzip)
 
-      filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;")
+      filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/basic/bin2c; echo "MAGIC_END;")
 targets += config_data.h
 $(obj)/config_data.h: $(obj)/config_data.gz FORCE
 	$(call filechk,ikconfiggz)
 
-$(obj)/time.o: $(obj)/timeconst.h
-
-quiet_cmd_hzfile = HZFILE $@
-      cmd_hzfile = echo "hz=$(CONFIG_HZ)" > $@
-
-targets += hz.bc
-$(obj)/hz.bc: $(objtree)/include/config/hz.h FORCE
-	$(call if_changed,hzfile)
-
-quiet_cmd_bc = BC $@
-      cmd_bc = bc -q $(filter-out FORCE,$^) > $@
-
-targets += timeconst.h
-$(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE
-	$(call if_changed,bc)
-
 ###############################################################################
 #
 # Roll all the X.509 certificates that we can find together and pull them into
diff --git a/kernel/acct.c b/kernel/acct.c
index 808a86ff229d..b4c667d22e79 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -59,6 +59,7 @@
59#include <asm/div64.h> 59#include <asm/div64.h>
60#include <linux/blkdev.h> /* sector_div */ 60#include <linux/blkdev.h> /* sector_div */
61#include <linux/pid_namespace.h> 61#include <linux/pid_namespace.h>
62#include <linux/fs_pin.h>
62 63
63/* 64/*
64 * These constants control the amount of freespace that suspend and 65 * These constants control the amount of freespace that suspend and
@@ -75,172 +76,190 @@ int acct_parm[3] = {4, 2, 30};
75/* 76/*
76 * External references and all of the globals. 77 * External references and all of the globals.
77 */ 78 */
78static void do_acct_process(struct bsd_acct_struct *acct, 79static void do_acct_process(struct bsd_acct_struct *acct);
79 struct pid_namespace *ns, struct file *);
80 80
81/*
82 * This structure is used so that all the data protected by lock
83 * can be placed in the same cache line as the lock. This primes
84 * the cache line to have the data after getting the lock.
85 */
86struct bsd_acct_struct { 81struct bsd_acct_struct {
82 struct fs_pin pin;
83 struct mutex lock;
87 int active; 84 int active;
88 unsigned long needcheck; 85 unsigned long needcheck;
89 struct file *file; 86 struct file *file;
90 struct pid_namespace *ns; 87 struct pid_namespace *ns;
91 struct list_head list; 88 struct work_struct work;
89 struct completion done;
92}; 90};
93 91
94static DEFINE_SPINLOCK(acct_lock);
95static LIST_HEAD(acct_list);
96
97/* 92/*
98 * Check the amount of free space and suspend/resume accordingly. 93 * Check the amount of free space and suspend/resume accordingly.
99 */ 94 */
100static int check_free_space(struct bsd_acct_struct *acct, struct file *file) 95static int check_free_space(struct bsd_acct_struct *acct)
101{ 96{
102 struct kstatfs sbuf; 97 struct kstatfs sbuf;
103 int res; 98
104 int act; 99 if (time_is_before_jiffies(acct->needcheck))
105 u64 resume;
106 u64 suspend;
107
108 spin_lock(&acct_lock);
109 res = acct->active;
110 if (!file || time_is_before_jiffies(acct->needcheck))
111 goto out; 100 goto out;
112 spin_unlock(&acct_lock);
113 101
114 /* May block */ 102 /* May block */
115 if (vfs_statfs(&file->f_path, &sbuf)) 103 if (vfs_statfs(&acct->file->f_path, &sbuf))
116 return res;
117 suspend = sbuf.f_blocks * SUSPEND;
118 resume = sbuf.f_blocks * RESUME;
119
120 do_div(suspend, 100);
121 do_div(resume, 100);
122
123 if (sbuf.f_bavail <= suspend)
124 act = -1;
125 else if (sbuf.f_bavail >= resume)
126 act = 1;
127 else
128 act = 0;
129
130 /*
131 * If some joker switched acct->file under us we'ld better be
132 * silent and _not_ touch anything.
133 */
134 spin_lock(&acct_lock);
135 if (file != acct->file) {
136 if (act)
137 res = act > 0;
138 goto out; 104 goto out;
139 }
140 105
141 if (acct->active) { 106 if (acct->active) {
142 if (act < 0) { 107 u64 suspend = sbuf.f_blocks * SUSPEND;
108 do_div(suspend, 100);
109 if (sbuf.f_bavail <= suspend) {
143 acct->active = 0; 110 acct->active = 0;
144 printk(KERN_INFO "Process accounting paused\n"); 111 pr_info("Process accounting paused\n");
145 } 112 }
146 } else { 113 } else {
147 if (act > 0) { 114 u64 resume = sbuf.f_blocks * RESUME;
115 do_div(resume, 100);
116 if (sbuf.f_bavail >= resume) {
148 acct->active = 1; 117 acct->active = 1;
149 printk(KERN_INFO "Process accounting resumed\n"); 118 pr_info("Process accounting resumed\n");
150 } 119 }
151 } 120 }
152 121
153 acct->needcheck = jiffies + ACCT_TIMEOUT*HZ; 122 acct->needcheck = jiffies + ACCT_TIMEOUT*HZ;
154 res = acct->active;
155out: 123out:
156 spin_unlock(&acct_lock); 124 return acct->active;
125}
126
127static struct bsd_acct_struct *acct_get(struct pid_namespace *ns)
128{
129 struct bsd_acct_struct *res;
130again:
131 smp_rmb();
132 rcu_read_lock();
133 res = ACCESS_ONCE(ns->bacct);
134 if (!res) {
135 rcu_read_unlock();
136 return NULL;
137 }
138 if (!atomic_long_inc_not_zero(&res->pin.count)) {
139 rcu_read_unlock();
140 cpu_relax();
141 goto again;
142 }
143 rcu_read_unlock();
144 mutex_lock(&res->lock);
145 if (!res->ns) {
146 mutex_unlock(&res->lock);
147 pin_put(&res->pin);
148 goto again;
149 }
157 return res; 150 return res;
158} 151}
159 152
160/* 153static void close_work(struct work_struct *work)
161 * Close the old accounting file (if currently open) and then replace 154{
162 * it with file (if non-NULL). 155 struct bsd_acct_struct *acct = container_of(work, struct bsd_acct_struct, work);
163 * 156 struct file *file = acct->file;
164 * NOTE: acct_lock MUST be held on entry and exit. 157 if (file->f_op->flush)
165 */ 158 file->f_op->flush(file, NULL);
166static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file, 159 __fput_sync(file);
167 struct pid_namespace *ns) 160 complete(&acct->done);
161}
162
163static void acct_kill(struct bsd_acct_struct *acct,
164 struct bsd_acct_struct *new)
168{ 165{
169 struct file *old_acct = NULL; 166 if (acct) {
170 struct pid_namespace *old_ns = NULL; 167 struct pid_namespace *ns = acct->ns;
171 168 do_acct_process(acct);
172 if (acct->file) { 169 INIT_WORK(&acct->work, close_work);
173 old_acct = acct->file; 170 init_completion(&acct->done);
174 old_ns = acct->ns; 171 schedule_work(&acct->work);
175 acct->active = 0; 172 wait_for_completion(&acct->done);
176 acct->file = NULL; 173 pin_remove(&acct->pin);
174 ns->bacct = new;
177 acct->ns = NULL; 175 acct->ns = NULL;
178 list_del(&acct->list); 176 atomic_long_dec(&acct->pin.count);
177 mutex_unlock(&acct->lock);
178 pin_put(&acct->pin);
179 } 179 }
180 if (file) { 180}
181 acct->file = file; 181
182 acct->ns = ns; 182static void acct_pin_kill(struct fs_pin *pin)
183 acct->needcheck = jiffies + ACCT_TIMEOUT*HZ; 183{
184 acct->active = 1; 184 struct bsd_acct_struct *acct;
185 list_add(&acct->list, &acct_list); 185 acct = container_of(pin, struct bsd_acct_struct, pin);
186 } 186 mutex_lock(&acct->lock);
187 if (old_acct) { 187 if (!acct->ns) {
188 mnt_unpin(old_acct->f_path.mnt); 188 mutex_unlock(&acct->lock);
189 spin_unlock(&acct_lock); 189 pin_put(pin);
190 do_acct_process(acct, old_ns, old_acct); 190 acct = NULL;
191 filp_close(old_acct, NULL);
192 spin_lock(&acct_lock);
193 } 191 }
192 acct_kill(acct, NULL);
194} 193}
195 194
196static int acct_on(struct filename *pathname) 195static int acct_on(struct filename *pathname)
197{ 196{
198 struct file *file; 197 struct file *file;
199 struct vfsmount *mnt; 198 struct vfsmount *mnt, *internal;
200 struct pid_namespace *ns; 199 struct pid_namespace *ns = task_active_pid_ns(current);
201 struct bsd_acct_struct *acct = NULL; 200 struct bsd_acct_struct *acct, *old;
201 int err;
202
203 acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL);
204 if (!acct)
205 return -ENOMEM;
202 206
203 /* Difference from BSD - they don't do O_APPEND */ 207 /* Difference from BSD - they don't do O_APPEND */
204 file = file_open_name(pathname, O_WRONLY|O_APPEND|O_LARGEFILE, 0); 208 file = file_open_name(pathname, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
205 if (IS_ERR(file)) 209 if (IS_ERR(file)) {
210 kfree(acct);
206 return PTR_ERR(file); 211 return PTR_ERR(file);
212 }
207 213
208 if (!S_ISREG(file_inode(file)->i_mode)) { 214 if (!S_ISREG(file_inode(file)->i_mode)) {
215 kfree(acct);
209 filp_close(file, NULL); 216 filp_close(file, NULL);
210 return -EACCES; 217 return -EACCES;
211 } 218 }
212 219
213 if (!file->f_op->write) { 220 if (!file->f_op->write) {
221 kfree(acct);
214 filp_close(file, NULL); 222 filp_close(file, NULL);
215 return -EIO; 223 return -EIO;
216 } 224 }
217 225 internal = mnt_clone_internal(&file->f_path);
218 ns = task_active_pid_ns(current); 226 if (IS_ERR(internal)) {
219 if (ns->bacct == NULL) { 227 kfree(acct);
220 acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL); 228 filp_close(file, NULL);
221 if (acct == NULL) { 229 return PTR_ERR(internal);
222 filp_close(file, NULL);
223 return -ENOMEM;
224 }
225 } 230 }
226 231 err = mnt_want_write(internal);
227 spin_lock(&acct_lock); 232 if (err) {
228 if (ns->bacct == NULL) { 233 mntput(internal);
229 ns->bacct = acct; 234 kfree(acct);
230 acct = NULL; 235 filp_close(file, NULL);
236 return err;
231 } 237 }
232
233 mnt = file->f_path.mnt; 238 mnt = file->f_path.mnt;
234 mnt_pin(mnt); 239 file->f_path.mnt = internal;
235 acct_file_reopen(ns->bacct, file, ns); 240
236 spin_unlock(&acct_lock); 241 atomic_long_set(&acct->pin.count, 1);
237 242 acct->pin.kill = acct_pin_kill;
238 mntput(mnt); /* it's pinned, now give up active reference */ 243 acct->file = file;
239 kfree(acct); 244 acct->needcheck = jiffies;
240 245 acct->ns = ns;
246 mutex_init(&acct->lock);
247 mutex_lock_nested(&acct->lock, 1); /* nobody has seen it yet */
248 pin_insert(&acct->pin, mnt);
249
250 old = acct_get(ns);
251 if (old)
252 acct_kill(old, acct);
253 else
254 ns->bacct = acct;
255 mutex_unlock(&acct->lock);
256 mnt_drop_write(mnt);
257 mntput(mnt);
241 return 0; 258 return 0;
242} 259}
243 260
261static DEFINE_MUTEX(acct_on_mutex);
262
244/** 263/**
245 * sys_acct - enable/disable process accounting 264 * sys_acct - enable/disable process accounting
246 * @name: file name for accounting records or NULL to shutdown accounting 265 * @name: file name for accounting records or NULL to shutdown accounting
@@ -261,80 +280,23 @@ SYSCALL_DEFINE1(acct, const char __user *, name)
261 280
262 if (name) { 281 if (name) {
263 struct filename *tmp = getname(name); 282 struct filename *tmp = getname(name);
283
264 if (IS_ERR(tmp)) 284 if (IS_ERR(tmp))
265 return PTR_ERR(tmp); 285 return PTR_ERR(tmp);
286 mutex_lock(&acct_on_mutex);
266 error = acct_on(tmp); 287 error = acct_on(tmp);
288 mutex_unlock(&acct_on_mutex);
267 putname(tmp); 289 putname(tmp);
268 } else { 290 } else {
269 struct bsd_acct_struct *acct; 291 acct_kill(acct_get(task_active_pid_ns(current)), NULL);
270
271 acct = task_active_pid_ns(current)->bacct;
272 if (acct == NULL)
273 return 0;
274
275 spin_lock(&acct_lock);
276 acct_file_reopen(acct, NULL, NULL);
277 spin_unlock(&acct_lock);
278 } 292 }
279 293
280 return error; 294 return error;
281} 295}
282 296
283/**
284 * acct_auto_close - turn off a filesystem's accounting if it is on
285 * @m: vfsmount being shut down
286 *
287 * If the accounting is turned on for a file in the subtree pointed to
288 * to by m, turn accounting off. Done when m is about to die.
289 */
290void acct_auto_close_mnt(struct vfsmount *m)
291{
292 struct bsd_acct_struct *acct;
293
294 spin_lock(&acct_lock);
295restart:
296 list_for_each_entry(acct, &acct_list, list)
297 if (acct->file && acct->file->f_path.mnt == m) {
298 acct_file_reopen(acct, NULL, NULL);
299 goto restart;
300 }
301 spin_unlock(&acct_lock);
302}
303
304/**
305 * acct_auto_close - turn off a filesystem's accounting if it is on
306 * @sb: super block for the filesystem
307 *
308 * If the accounting is turned on for a file in the filesystem pointed
309 * to by sb, turn accounting off.
310 */
311void acct_auto_close(struct super_block *sb)
312{
313 struct bsd_acct_struct *acct;
314
315 spin_lock(&acct_lock);
316restart:
317 list_for_each_entry(acct, &acct_list, list)
318 if (acct->file && acct->file->f_path.dentry->d_sb == sb) {
319 acct_file_reopen(acct, NULL, NULL);
320 goto restart;
321 }
322 spin_unlock(&acct_lock);
323}
324
325void acct_exit_ns(struct pid_namespace *ns) 297void acct_exit_ns(struct pid_namespace *ns)
326{ 298{
327 struct bsd_acct_struct *acct = ns->bacct; 299 acct_kill(acct_get(ns), NULL);
328
329 if (acct == NULL)
330 return;
331
332 spin_lock(&acct_lock);
333 if (acct->file != NULL)
334 acct_file_reopen(acct, NULL, NULL);
335 spin_unlock(&acct_lock);
336
337 kfree(acct);
338} 300}
339 301
340/* 302/*
@@ -376,7 +338,7 @@ static comp_t encode_comp_t(unsigned long value)
376 return exp; 338 return exp;
377} 339}
378 340
379#if ACCT_VERSION==1 || ACCT_VERSION==2 341#if ACCT_VERSION == 1 || ACCT_VERSION == 2
380/* 342/*
381 * encode an u64 into a comp2_t (24 bits) 343 * encode an u64 into a comp2_t (24 bits)
382 * 344 *
@@ -389,7 +351,7 @@ static comp_t encode_comp_t(unsigned long value)
389#define MANTSIZE2 20 /* 20 bit mantissa. */ 351#define MANTSIZE2 20 /* 20 bit mantissa. */
390#define EXPSIZE2 5 /* 5 bit base 2 exponent. */ 352#define EXPSIZE2 5 /* 5 bit base 2 exponent. */
391#define MAXFRACT2 ((1ul << MANTSIZE2) - 1) /* Maximum fractional value. */ 353#define MAXFRACT2 ((1ul << MANTSIZE2) - 1) /* Maximum fractional value. */
392#define MAXEXP2 ((1 <<EXPSIZE2) - 1) /* Maximum exponent. */ 354#define MAXEXP2 ((1 << EXPSIZE2) - 1) /* Maximum exponent. */
393 355
394static comp2_t encode_comp2_t(u64 value) 356static comp2_t encode_comp2_t(u64 value)
395{ 357{
@@ -420,7 +382,7 @@ static comp2_t encode_comp2_t(u64 value)
420} 382}
421#endif 383#endif
422 384
423#if ACCT_VERSION==3 385#if ACCT_VERSION == 3
424/* 386/*
425 * encode an u64 into a 32 bit IEEE float 387 * encode an u64 into a 32 bit IEEE float
426 */ 388 */
@@ -429,8 +391,9 @@ static u32 encode_float(u64 value)
429 unsigned exp = 190; 391 unsigned exp = 190;
430 unsigned u; 392 unsigned u;
431 393
432 if (value==0) return 0; 394 if (value == 0)
433 while ((s64)value > 0){ 395 return 0;
396 while ((s64)value > 0) {
434 value <<= 1; 397 value <<= 1;
435 exp--; 398 exp--;
436 } 399 }
@@ -448,120 +411,112 @@ static u32 encode_float(u64 value)
448 * do_exit() or when switching to a different output file. 411 * do_exit() or when switching to a different output file.
449 */ 412 */
450 413
451/* 414static void fill_ac(acct_t *ac)
452 * do_acct_process does all actual work. Caller holds the reference to file.
453 */
454static void do_acct_process(struct bsd_acct_struct *acct,
455 struct pid_namespace *ns, struct file *file)
456{ 415{
457 struct pacct_struct *pacct = &current->signal->pacct; 416 struct pacct_struct *pacct = &current->signal->pacct;
458 acct_t ac; 417 u64 elapsed, run_time;
459 mm_segment_t fs;
460 unsigned long flim;
461 u64 elapsed;
462 u64 run_time;
463 struct timespec uptime;
464 struct tty_struct *tty; 418 struct tty_struct *tty;
465 const struct cred *orig_cred;
466
467 /* Perform file operations on behalf of whoever enabled accounting */
468 orig_cred = override_creds(file->f_cred);
469
470 /*
471 * First check to see if there is enough free_space to continue
472 * the process accounting system.
473 */
474 if (!check_free_space(acct, file))
475 goto out;
476 419
477 /* 420 /*
478 * Fill the accounting struct with the needed info as recorded 421 * Fill the accounting struct with the needed info as recorded
479 * by the different kernel functions. 422 * by the different kernel functions.
480 */ 423 */
481 memset(&ac, 0, sizeof(acct_t)); 424 memset(ac, 0, sizeof(acct_t));
482 425
483 ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER; 426 ac->ac_version = ACCT_VERSION | ACCT_BYTEORDER;
484 strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm)); 427 strlcpy(ac->ac_comm, current->comm, sizeof(ac->ac_comm));
485 428
486 /* calculate run_time in nsec*/ 429 /* calculate run_time in nsec*/
487 do_posix_clock_monotonic_gettime(&uptime); 430 run_time = ktime_get_ns();
488 run_time = (u64)uptime.tv_sec*NSEC_PER_SEC + uptime.tv_nsec; 431 run_time -= current->group_leader->start_time;
489 run_time -= (u64)current->group_leader->start_time.tv_sec * NSEC_PER_SEC
490 + current->group_leader->start_time.tv_nsec;
491 /* convert nsec -> AHZ */ 432 /* convert nsec -> AHZ */
492 elapsed = nsec_to_AHZ(run_time); 433 elapsed = nsec_to_AHZ(run_time);
493#if ACCT_VERSION==3 434#if ACCT_VERSION == 3
494 ac.ac_etime = encode_float(elapsed); 435 ac->ac_etime = encode_float(elapsed);
495#else 436#else
496 ac.ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ? 437 ac->ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ?
497 (unsigned long) elapsed : (unsigned long) -1l); 438 (unsigned long) elapsed : (unsigned long) -1l);
498#endif 439#endif
499#if ACCT_VERSION==1 || ACCT_VERSION==2 440#if ACCT_VERSION == 1 || ACCT_VERSION == 2
500 { 441 {
501 /* new enlarged etime field */ 442 /* new enlarged etime field */
502 comp2_t etime = encode_comp2_t(elapsed); 443 comp2_t etime = encode_comp2_t(elapsed);
503 ac.ac_etime_hi = etime >> 16; 444
504 ac.ac_etime_lo = (u16) etime; 445 ac->ac_etime_hi = etime >> 16;
446 ac->ac_etime_lo = (u16) etime;
505 } 447 }
506#endif 448#endif
507 do_div(elapsed, AHZ); 449 do_div(elapsed, AHZ);
508 ac.ac_btime = get_seconds() - elapsed; 450 ac->ac_btime = get_seconds() - elapsed;
451#if ACCT_VERSION==2
452 ac->ac_ahz = AHZ;
453#endif
454
455 spin_lock_irq(&current->sighand->siglock);
456 tty = current->signal->tty; /* Safe as we hold the siglock */
457 ac->ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0;
458 ac->ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
459 ac->ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
460 ac->ac_flag = pacct->ac_flag;
461 ac->ac_mem = encode_comp_t(pacct->ac_mem);
462 ac->ac_minflt = encode_comp_t(pacct->ac_minflt);
463 ac->ac_majflt = encode_comp_t(pacct->ac_majflt);
464 ac->ac_exitcode = pacct->ac_exitcode;
465 spin_unlock_irq(&current->sighand->siglock);
466}
467/*
468 * do_acct_process does all actual work. Caller holds the reference to file.
469 */
470static void do_acct_process(struct bsd_acct_struct *acct)
471{
472 acct_t ac;
473 unsigned long flim;
474 const struct cred *orig_cred;
475 struct pid_namespace *ns = acct->ns;
476 struct file *file = acct->file;
477
478 /*
479 * Accounting records are not subject to resource limits.
480 */
481 flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
482 current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
483 /* Perform file operations on behalf of whoever enabled accounting */
484 orig_cred = override_creds(file->f_cred);
485
486 /*
487 * First check to see if there is enough free_space to continue
488 * the process accounting system.
489 */
490 if (!check_free_space(acct))
491 goto out;
492
493 fill_ac(&ac);
509 /* we really need to bite the bullet and change layout */ 494 /* we really need to bite the bullet and change layout */
510 ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid); 495 ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid);
511 ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid); 496 ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid);
512#if ACCT_VERSION==2 497#if ACCT_VERSION == 1 || ACCT_VERSION == 2
513 ac.ac_ahz = AHZ;
514#endif
515#if ACCT_VERSION==1 || ACCT_VERSION==2
516 /* backward-compatible 16 bit fields */ 498 /* backward-compatible 16 bit fields */
517 ac.ac_uid16 = ac.ac_uid; 499 ac.ac_uid16 = ac.ac_uid;
518 ac.ac_gid16 = ac.ac_gid; 500 ac.ac_gid16 = ac.ac_gid;
519#endif 501#endif
520#if ACCT_VERSION==3 502#if ACCT_VERSION == 3
521 ac.ac_pid = task_tgid_nr_ns(current, ns); 503 ac.ac_pid = task_tgid_nr_ns(current, ns);
522 rcu_read_lock(); 504 rcu_read_lock();
523 ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns); 505 ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns);
524 rcu_read_unlock(); 506 rcu_read_unlock();
525#endif 507#endif
526
527 spin_lock_irq(&current->sighand->siglock);
528 tty = current->signal->tty; /* Safe as we hold the siglock */
529 ac.ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0;
530 ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
531 ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
532 ac.ac_flag = pacct->ac_flag;
533 ac.ac_mem = encode_comp_t(pacct->ac_mem);
534 ac.ac_minflt = encode_comp_t(pacct->ac_minflt);
535 ac.ac_majflt = encode_comp_t(pacct->ac_majflt);
536 ac.ac_exitcode = pacct->ac_exitcode;
537 spin_unlock_irq(&current->sighand->siglock);
538 ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */
539 ac.ac_rw = encode_comp_t(ac.ac_io / 1024);
540 ac.ac_swaps = encode_comp_t(0);
541
542 /* 508 /*
543 * Get freeze protection. If the fs is frozen, just skip the write 509 * Get freeze protection. If the fs is frozen, just skip the write
544 * as we could deadlock the system otherwise. 510 * as we could deadlock the system otherwise.
545 */ 511 */
546 if (!file_start_write_trylock(file)) 512 if (file_start_write_trylock(file)) {
547 goto out; 513 /* it's been opened O_APPEND, so position is irrelevant */
548 /* 514 loff_t pos = 0;
549 * Kernel segment override to datasegment and write it 515 __kernel_write(file, (char *)&ac, sizeof(acct_t), &pos);
550 * to the accounting file. 516 file_end_write(file);
551 */ 517 }
552 fs = get_fs();
553 set_fs(KERNEL_DS);
554 /*
555 * Accounting records are not subject to resource limits.
556 */
557 flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
558 current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
559 file->f_op->write(file, (char *)&ac,
560 sizeof(acct_t), &file->f_pos);
561 current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
562 set_fs(fs);
563 file_end_write(file);
564out: 518out:
519 current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
565 revert_creds(orig_cred); 520 revert_creds(orig_cred);
566} 521}
567 522
@@ -578,6 +533,7 @@ void acct_collect(long exitcode, int group_dead)
578 533
579 if (group_dead && current->mm) { 534 if (group_dead && current->mm) {
580 struct vm_area_struct *vma; 535 struct vm_area_struct *vma;
536
581 down_read(&current->mm->mmap_sem); 537 down_read(&current->mm->mmap_sem);
582 vma = current->mm->mmap; 538 vma = current->mm->mmap;
583 while (vma) { 539 while (vma) {
@@ -609,34 +565,20 @@ void acct_collect(long exitcode, int group_dead)
609 spin_unlock_irq(&current->sighand->siglock); 565 spin_unlock_irq(&current->sighand->siglock);
610} 566}
611 567
612static void acct_process_in_ns(struct pid_namespace *ns) 568static void slow_acct_process(struct pid_namespace *ns)
613{ 569{
614 struct file *file = NULL; 570 for ( ; ns; ns = ns->parent) {
615 struct bsd_acct_struct *acct; 571 struct bsd_acct_struct *acct = acct_get(ns);
616 572 if (acct) {
617 acct = ns->bacct; 573 do_acct_process(acct);
618 /* 574 mutex_unlock(&acct->lock);
619 * accelerate the common fastpath: 575 pin_put(&acct->pin);
620 */ 576 }
621 if (!acct || !acct->file)
622 return;
623
624 spin_lock(&acct_lock);
625 file = acct->file;
626 if (unlikely(!file)) {
627 spin_unlock(&acct_lock);
628 return;
629 } 577 }
630 get_file(file);
631 spin_unlock(&acct_lock);
632
633 do_acct_process(acct, ns, file);
634 fput(file);
635} 578}
636 579
637/** 580/**
638 * acct_process - now just a wrapper around acct_process_in_ns, 581 * acct_process
639 * which in turn is a wrapper around do_acct_process.
640 * 582 *
641 * handles process accounting for an exiting task 583 * handles process accounting for an exiting task
642 */ 584 */
@@ -649,6 +591,10 @@ void acct_process(void)
649 * alive and holds its namespace, which in turn holds 591 * alive and holds its namespace, which in turn holds
650 * its parent. 592 * its parent.
651 */ 593 */
652 for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent) 594 for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent) {
653 acct_process_in_ns(ns); 595 if (ns->bacct)
596 break;
597 }
598 if (unlikely(ns))
599 slow_acct_process(ns);
654} 600}
diff --git a/kernel/audit.c b/kernel/audit.c
index 3ef2e0e797e8..ba2ff5a5c600 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1677,7 +1677,7 @@ void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap)
 	audit_log_format(ab, " %s=", prefix);
 	CAP_FOR_EACH_U32(i) {
 		audit_log_format(ab, "%08x",
-				 cap->cap[(_KERNEL_CAPABILITY_U32S-1) - i]);
+				 cap->cap[CAP_LAST_U32 - i]);
 	}
 }
 
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 8e9bc9c3dbb7..c447cd9848d1 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -106,7 +106,7 @@ static inline struct audit_entry *audit_init_entry(u32 field_count)
 	if (unlikely(!entry))
 		return NULL;
 
-	fields = kzalloc(sizeof(*fields) * field_count, GFP_KERNEL);
+	fields = kcalloc(field_count, sizeof(*fields), GFP_KERNEL);
 	if (unlikely(!fields)) {
 		kfree(entry);
 		return NULL;
@@ -160,7 +160,7 @@ static __u32 *classes[AUDIT_SYSCALL_CLASSES];
 
 int __init audit_register_class(int class, unsigned *list)
 {
-	__u32 *p = kzalloc(AUDIT_BITMASK_SIZE * sizeof(__u32), GFP_KERNEL);
+	__u32 *p = kcalloc(AUDIT_BITMASK_SIZE, sizeof(__u32), GFP_KERNEL);
 	if (!p)
 		return -ENOMEM;
 	while (*list != ~0U) {
diff --git a/kernel/bounds.c b/kernel/bounds.c
index 9fd4246b04b8..e1d1d1952bfa 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -9,7 +9,6 @@
 #include <linux/page-flags.h>
 #include <linux/mmzone.h>
 #include <linux/kbuild.h>
-#include <linux/page_cgroup.h>
 #include <linux/log2.h>
 #include <linux/spinlock_types.h>
 
@@ -18,7 +17,6 @@ void foo(void)
 	/* The enum constants to put into include/generated/bounds.h */
 	DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS);
 	DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES);
-	DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS);
 #ifdef CONFIG_SMP
 	DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
 #endif
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
new file mode 100644
index 000000000000..6a71145e2769
--- /dev/null
+++ b/kernel/bpf/Makefile
@@ -0,0 +1 @@
obj-y := core.o
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
new file mode 100644
index 000000000000..7f0dbcbb34af
--- /dev/null
+++ b/kernel/bpf/core.c
@@ -0,0 +1,534 @@
1/*
2 * Linux Socket Filter - Kernel level socket filtering
3 *
4 * Based on the design of the Berkeley Packet Filter. The new
5 * internal format has been designed by PLUMgrid:
6 *
7 * Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com
8 *
9 * Authors:
10 *
11 * Jay Schulist <jschlst@samba.org>
12 * Alexei Starovoitov <ast@plumgrid.com>
13 * Daniel Borkmann <dborkman@redhat.com>
14 *
15 * This program is free software; you can redistribute it and/or
16 * modify it under the terms of the GNU General Public License
17 * as published by the Free Software Foundation; either version
18 * 2 of the License, or (at your option) any later version.
19 *
20 * Andi Kleen - Fix a few bad bugs and races.
21 * Kris Katterjohn - Added many additional checks in bpf_check_classic()
22 */
23#include <linux/filter.h>
24#include <linux/skbuff.h>
25#include <asm/unaligned.h>
26
27/* Registers */
28#define BPF_R0 regs[BPF_REG_0]
29#define BPF_R1 regs[BPF_REG_1]
30#define BPF_R2 regs[BPF_REG_2]
31#define BPF_R3 regs[BPF_REG_3]
32#define BPF_R4 regs[BPF_REG_4]
33#define BPF_R5 regs[BPF_REG_5]
34#define BPF_R6 regs[BPF_REG_6]
35#define BPF_R7 regs[BPF_REG_7]
36#define BPF_R8 regs[BPF_REG_8]
37#define BPF_R9 regs[BPF_REG_9]
38#define BPF_R10 regs[BPF_REG_10]
39
40/* Named registers */
41#define DST regs[insn->dst_reg]
42#define SRC regs[insn->src_reg]
43#define FP regs[BPF_REG_FP]
44#define ARG1 regs[BPF_REG_ARG1]
45#define CTX regs[BPF_REG_CTX]
46#define IMM insn->imm
47
48/* No hurry in this branch
49 *
50 * Exported for the bpf jit load helper.
51 */
52void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, unsigned int size)
53{
54 u8 *ptr = NULL;
55
56 if (k >= SKF_NET_OFF)
57 ptr = skb_network_header(skb) + k - SKF_NET_OFF;
58 else if (k >= SKF_LL_OFF)
59 ptr = skb_mac_header(skb) + k - SKF_LL_OFF;
60 if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb))
61 return ptr;
62
63 return NULL;
64}
65
66/* Base function for offset calculation. Needs to go into .text section,
67 * therefore keeping it non-static as well; will also be used by JITs
68 * anyway later on, so do not let the compiler omit it.
69 */
70noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
71{
72 return 0;
73}
74
75/**
76 * __bpf_prog_run - run eBPF program on a given context
77 * @ctx: is the data we are operating on
78 * @insn: is the array of eBPF instructions
79 *
80 * Decode and execute eBPF instructions.
81 */
82static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
83{
84 u64 stack[MAX_BPF_STACK / sizeof(u64)];
85 u64 regs[MAX_BPF_REG], tmp;
86 static const void *jumptable[256] = {
87 [0 ... 255] = &&default_label,
88 /* Now overwrite non-defaults ... */
89 /* 32 bit ALU operations */
90 [BPF_ALU | BPF_ADD | BPF_X] = &&ALU_ADD_X,
91 [BPF_ALU | BPF_ADD | BPF_K] = &&ALU_ADD_K,
92 [BPF_ALU | BPF_SUB | BPF_X] = &&ALU_SUB_X,
93 [BPF_ALU | BPF_SUB | BPF_K] = &&ALU_SUB_K,
94 [BPF_ALU | BPF_AND | BPF_X] = &&ALU_AND_X,
95 [BPF_ALU | BPF_AND | BPF_K] = &&ALU_AND_K,
96 [BPF_ALU | BPF_OR | BPF_X] = &&ALU_OR_X,
97 [BPF_ALU | BPF_OR | BPF_K] = &&ALU_OR_K,
98 [BPF_ALU | BPF_LSH | BPF_X] = &&ALU_LSH_X,
99 [BPF_ALU | BPF_LSH | BPF_K] = &&ALU_LSH_K,
100 [BPF_ALU | BPF_RSH | BPF_X] = &&ALU_RSH_X,
101 [BPF_ALU | BPF_RSH | BPF_K] = &&ALU_RSH_K,
102 [BPF_ALU | BPF_XOR | BPF_X] = &&ALU_XOR_X,
103 [BPF_ALU | BPF_XOR | BPF_K] = &&ALU_XOR_K,
104 [BPF_ALU | BPF_MUL | BPF_X] = &&ALU_MUL_X,
105 [BPF_ALU | BPF_MUL | BPF_K] = &&ALU_MUL_K,
106 [BPF_ALU | BPF_MOV | BPF_X] = &&ALU_MOV_X,
107 [BPF_ALU | BPF_MOV | BPF_K] = &&ALU_MOV_K,
108 [BPF_ALU | BPF_DIV | BPF_X] = &&ALU_DIV_X,
109 [BPF_ALU | BPF_DIV | BPF_K] = &&ALU_DIV_K,
110 [BPF_ALU | BPF_MOD | BPF_X] = &&ALU_MOD_X,
111 [BPF_ALU | BPF_MOD | BPF_K] = &&ALU_MOD_K,
112 [BPF_ALU | BPF_NEG] = &&ALU_NEG,
113 [BPF_ALU | BPF_END | BPF_TO_BE] = &&ALU_END_TO_BE,
114 [BPF_ALU | BPF_END | BPF_TO_LE] = &&ALU_END_TO_LE,
115 /* 64 bit ALU operations */
116 [BPF_ALU64 | BPF_ADD | BPF_X] = &&ALU64_ADD_X,
117 [BPF_ALU64 | BPF_ADD | BPF_K] = &&ALU64_ADD_K,
118 [BPF_ALU64 | BPF_SUB | BPF_X] = &&ALU64_SUB_X,
119 [BPF_ALU64 | BPF_SUB | BPF_K] = &&ALU64_SUB_K,
120 [BPF_ALU64 | BPF_AND | BPF_X] = &&ALU64_AND_X,
121 [BPF_ALU64 | BPF_AND | BPF_K] = &&ALU64_AND_K,
122 [BPF_ALU64 | BPF_OR | BPF_X] = &&ALU64_OR_X,
123 [BPF_ALU64 | BPF_OR | BPF_K] = &&ALU64_OR_K,
124 [BPF_ALU64 | BPF_LSH | BPF_X] = &&ALU64_LSH_X,
125 [BPF_ALU64 | BPF_LSH | BPF_K] = &&ALU64_LSH_K,
126 [BPF_ALU64 | BPF_RSH | BPF_X] = &&ALU64_RSH_X,
127 [BPF_ALU64 | BPF_RSH | BPF_K] = &&ALU64_RSH_K,
128 [BPF_ALU64 | BPF_XOR | BPF_X] = &&ALU64_XOR_X,
129 [BPF_ALU64 | BPF_XOR | BPF_K] = &&ALU64_XOR_K,
130 [BPF_ALU64 | BPF_MUL | BPF_X] = &&ALU64_MUL_X,
131 [BPF_ALU64 | BPF_MUL | BPF_K] = &&ALU64_MUL_K,
132 [BPF_ALU64 | BPF_MOV | BPF_X] = &&ALU64_MOV_X,
133 [BPF_ALU64 | BPF_MOV | BPF_K] = &&ALU64_MOV_K,
134 [BPF_ALU64 | BPF_ARSH | BPF_X] = &&ALU64_ARSH_X,
135 [BPF_ALU64 | BPF_ARSH | BPF_K] = &&ALU64_ARSH_K,
136 [BPF_ALU64 | BPF_DIV | BPF_X] = &&ALU64_DIV_X,
137 [BPF_ALU64 | BPF_DIV | BPF_K] = &&ALU64_DIV_K,
138 [BPF_ALU64 | BPF_MOD | BPF_X] = &&ALU64_MOD_X,
139 [BPF_ALU64 | BPF_MOD | BPF_K] = &&ALU64_MOD_K,
140 [BPF_ALU64 | BPF_NEG] = &&ALU64_NEG,
141 /* Call instruction */
142 [BPF_JMP | BPF_CALL] = &&JMP_CALL,
143 /* Jumps */
144 [BPF_JMP | BPF_JA] = &&JMP_JA,
145 [BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X,
146 [BPF_JMP | BPF_JEQ | BPF_K] = &&JMP_JEQ_K,
147 [BPF_JMP | BPF_JNE | BPF_X] = &&JMP_JNE_X,
148 [BPF_JMP | BPF_JNE | BPF_K] = &&JMP_JNE_K,
149 [BPF_JMP | BPF_JGT | BPF_X] = &&JMP_JGT_X,
150 [BPF_JMP | BPF_JGT | BPF_K] = &&JMP_JGT_K,
151 [BPF_JMP | BPF_JGE | BPF_X] = &&JMP_JGE_X,
152 [BPF_JMP | BPF_JGE | BPF_K] = &&JMP_JGE_K,
153 [BPF_JMP | BPF_JSGT | BPF_X] = &&JMP_JSGT_X,
154 [BPF_JMP | BPF_JSGT | BPF_K] = &&JMP_JSGT_K,
155 [BPF_JMP | BPF_JSGE | BPF_X] = &&JMP_JSGE_X,
156 [BPF_JMP | BPF_JSGE | BPF_K] = &&JMP_JSGE_K,
157 [BPF_JMP | BPF_JSET | BPF_X] = &&JMP_JSET_X,
158 [BPF_JMP | BPF_JSET | BPF_K] = &&JMP_JSET_K,
159 /* Program return */
160 [BPF_JMP | BPF_EXIT] = &&JMP_EXIT,
161 /* Store instructions */
162 [BPF_STX | BPF_MEM | BPF_B] = &&STX_MEM_B,
163 [BPF_STX | BPF_MEM | BPF_H] = &&STX_MEM_H,
164 [BPF_STX | BPF_MEM | BPF_W] = &&STX_MEM_W,
165 [BPF_STX | BPF_MEM | BPF_DW] = &&STX_MEM_DW,
166 [BPF_STX | BPF_XADD | BPF_W] = &&STX_XADD_W,
167 [BPF_STX | BPF_XADD | BPF_DW] = &&STX_XADD_DW,
168 [BPF_ST | BPF_MEM | BPF_B] = &&ST_MEM_B,
169 [BPF_ST | BPF_MEM | BPF_H] = &&ST_MEM_H,
170 [BPF_ST | BPF_MEM | BPF_W] = &&ST_MEM_W,
171 [BPF_ST | BPF_MEM | BPF_DW] = &&ST_MEM_DW,
172 /* Load instructions */
173 [BPF_LDX | BPF_MEM | BPF_B] = &&LDX_MEM_B,
174 [BPF_LDX | BPF_MEM | BPF_H] = &&LDX_MEM_H,
175 [BPF_LDX | BPF_MEM | BPF_W] = &&LDX_MEM_W,
176 [BPF_LDX | BPF_MEM | BPF_DW] = &&LDX_MEM_DW,
177 [BPF_LD | BPF_ABS | BPF_W] = &&LD_ABS_W,
178 [BPF_LD | BPF_ABS | BPF_H] = &&LD_ABS_H,
179 [BPF_LD | BPF_ABS | BPF_B] = &&LD_ABS_B,
180 [BPF_LD | BPF_IND | BPF_W] = &&LD_IND_W,
181 [BPF_LD | BPF_IND | BPF_H] = &&LD_IND_H,
182 [BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B,
183 };
184 void *ptr;
185 int off;
186
187#define CONT ({ insn++; goto select_insn; })
188#define CONT_JMP ({ insn++; goto select_insn; })
189
190 FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)];
191 ARG1 = (u64) (unsigned long) ctx;
192
193 /* Registers used in classic BPF programs need to be reset first. */
194 regs[BPF_REG_A] = 0;
195 regs[BPF_REG_X] = 0;
196
197select_insn:
198 goto *jumptable[insn->code];
199
200 /* ALU */
201#define ALU(OPCODE, OP) \
202 ALU64_##OPCODE##_X: \
203 DST = DST OP SRC; \
204 CONT; \
205 ALU_##OPCODE##_X: \
206 DST = (u32) DST OP (u32) SRC; \
207 CONT; \
208 ALU64_##OPCODE##_K: \
209 DST = DST OP IMM; \
210 CONT; \
211 ALU_##OPCODE##_K: \
212 DST = (u32) DST OP (u32) IMM; \
213 CONT;
214
215 ALU(ADD, +)
216 ALU(SUB, -)
217 ALU(AND, &)
218 ALU(OR, |)
219 ALU(LSH, <<)
220 ALU(RSH, >>)
221 ALU(XOR, ^)
222 ALU(MUL, *)
223#undef ALU
224 ALU_NEG:
225 DST = (u32) -DST;
226 CONT;
227 ALU64_NEG:
228 DST = -DST;
229 CONT;
230 ALU_MOV_X:
231 DST = (u32) SRC;
232 CONT;
233 ALU_MOV_K:
234 DST = (u32) IMM;
235 CONT;
236 ALU64_MOV_X:
237 DST = SRC;
238 CONT;
239 ALU64_MOV_K:
240 DST = IMM;
241 CONT;
242 ALU64_ARSH_X:
243 (*(s64 *) &DST) >>= SRC;
244 CONT;
245 ALU64_ARSH_K:
246 (*(s64 *) &DST) >>= IMM;
247 CONT;
248 ALU64_MOD_X:
249 if (unlikely(SRC == 0))
250 return 0;
251 tmp = DST;
252 DST = do_div(tmp, SRC);
253 CONT;
254 ALU_MOD_X:
255 if (unlikely(SRC == 0))
256 return 0;
257 tmp = (u32) DST;
258 DST = do_div(tmp, (u32) SRC);
259 CONT;
260 ALU64_MOD_K:
261 tmp = DST;
262 DST = do_div(tmp, IMM);
263 CONT;
264 ALU_MOD_K:
265 tmp = (u32) DST;
266 DST = do_div(tmp, (u32) IMM);
267 CONT;
268 ALU64_DIV_X:
269 if (unlikely(SRC == 0))
270 return 0;
271 do_div(DST, SRC);
272 CONT;
273 ALU_DIV_X:
274 if (unlikely(SRC == 0))
275 return 0;
276 tmp = (u32) DST;
277 do_div(tmp, (u32) SRC);
278 DST = (u32) tmp;
279 CONT;
280 ALU64_DIV_K:
281 do_div(DST, IMM);
282 CONT;
283 ALU_DIV_K:
284 tmp = (u32) DST;
285 do_div(tmp, (u32) IMM);
286 DST = (u32) tmp;
287 CONT;
288 ALU_END_TO_BE:
289 switch (IMM) {
290 case 16:
291 DST = (__force u16) cpu_to_be16(DST);
292 break;
293 case 32:
294 DST = (__force u32) cpu_to_be32(DST);
295 break;
296 case 64:
297 DST = (__force u64) cpu_to_be64(DST);
298 break;
299 }
300 CONT;
301 ALU_END_TO_LE:
302 switch (IMM) {
303 case 16:
304 DST = (__force u16) cpu_to_le16(DST);
305 break;
306 case 32:
307 DST = (__force u32) cpu_to_le32(DST);
308 break;
309 case 64:
310 DST = (__force u64) cpu_to_le64(DST);
311 break;
312 }
313 CONT;
314
315 /* CALL */
316 JMP_CALL:
317 /* Function call scratches BPF_R1-BPF_R5 registers,
318 * preserves BPF_R6-BPF_R9, and stores return value
319 * into BPF_R0.
320 */
321 BPF_R0 = (__bpf_call_base + insn->imm)(BPF_R1, BPF_R2, BPF_R3,
322 BPF_R4, BPF_R5);
323 CONT;
324
325 /* JMP */
326 JMP_JA:
327 insn += insn->off;
328 CONT;
329 JMP_JEQ_X:
330 if (DST == SRC) {
331 insn += insn->off;
332 CONT_JMP;
333 }
334 CONT;
335 JMP_JEQ_K:
336 if (DST == IMM) {
337 insn += insn->off;
338 CONT_JMP;
339 }
340 CONT;
341 JMP_JNE_X:
342 if (DST != SRC) {
343 insn += insn->off;
344 CONT_JMP;
345 }
346 CONT;
347 JMP_JNE_K:
348 if (DST != IMM) {
349 insn += insn->off;
350 CONT_JMP;
351 }
352 CONT;
353 JMP_JGT_X:
354 if (DST > SRC) {
355 insn += insn->off;
356 CONT_JMP;
357 }
358 CONT;
359 JMP_JGT_K:
360 if (DST > IMM) {
361 insn += insn->off;
362 CONT_JMP;
363 }
364 CONT;
365 JMP_JGE_X:
366 if (DST >= SRC) {
367 insn += insn->off;
368 CONT_JMP;
369 }
370 CONT;
371 JMP_JGE_K:
372 if (DST >= IMM) {
373 insn += insn->off;
374 CONT_JMP;
375 }
376 CONT;
377 JMP_JSGT_X:
378 if (((s64) DST) > ((s64) SRC)) {
379 insn += insn->off;
380 CONT_JMP;
381 }
382 CONT;
383 JMP_JSGT_K:
384 if (((s64) DST) > ((s64) IMM)) {
385 insn += insn->off;
386 CONT_JMP;
387 }
388 CONT;
389 JMP_JSGE_X:
390 if (((s64) DST) >= ((s64) SRC)) {
391 insn += insn->off;
392 CONT_JMP;
393 }
394 CONT;
395 JMP_JSGE_K:
396 if (((s64) DST) >= ((s64) IMM)) {
397 insn += insn->off;
398 CONT_JMP;
399 }
400 CONT;
401 JMP_JSET_X:
402 if (DST & SRC) {
403 insn += insn->off;
404 CONT_JMP;
405 }
406 CONT;
407 JMP_JSET_K:
408 if (DST & IMM) {
409 insn += insn->off;
410 CONT_JMP;
411 }
412 CONT;
413 JMP_EXIT:
414 return BPF_R0;
415
416 /* STX and ST and LDX*/
417#define LDST(SIZEOP, SIZE) \
418 STX_MEM_##SIZEOP: \
419 *(SIZE *)(unsigned long) (DST + insn->off) = SRC; \
420 CONT; \
421 ST_MEM_##SIZEOP: \
422 *(SIZE *)(unsigned long) (DST + insn->off) = IMM; \
423 CONT; \
424 LDX_MEM_##SIZEOP: \
425 DST = *(SIZE *)(unsigned long) (SRC + insn->off); \
426 CONT;
427
428 LDST(B, u8)
429 LDST(H, u16)
430 LDST(W, u32)
431 LDST(DW, u64)
432#undef LDST
433 STX_XADD_W: /* lock xadd *(u32 *)(dst_reg + off16) += src_reg */
434 atomic_add((u32) SRC, (atomic_t *)(unsigned long)
435 (DST + insn->off));
436 CONT;
437 STX_XADD_DW: /* lock xadd *(u64 *)(dst_reg + off16) += src_reg */
438 atomic64_add((u64) SRC, (atomic64_t *)(unsigned long)
439 (DST + insn->off));
440 CONT;
441 LD_ABS_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + imm32)) */
442 off = IMM;
443load_word:
444 /* BPF_LD + BPD_ABS and BPF_LD + BPF_IND insns are
445 * only appearing in the programs where ctx ==
446 * skb. All programs keep 'ctx' in regs[BPF_REG_CTX]
447 * == BPF_R6, bpf_convert_filter() saves it in BPF_R6,
448 * internal BPF verifier will check that BPF_R6 ==
449 * ctx.
450 *
451 * BPF_ABS and BPF_IND are wrappers of function calls,
452 * so they scratch BPF_R1-BPF_R5 registers, preserve
453 * BPF_R6-BPF_R9, and store return value into BPF_R0.
454 *
455 * Implicit input:
456 * ctx == skb == BPF_R6 == CTX
457 *
458 * Explicit input:
459 * SRC == any register
460 * IMM == 32-bit immediate
461 *
462 * Output:
463 * BPF_R0 - 8/16/32-bit skb data converted to cpu endianness
464 */
465
466 ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 4, &tmp);
467 if (likely(ptr != NULL)) {
468 BPF_R0 = get_unaligned_be32(ptr);
469 CONT;
470 }
471
472 return 0;
473 LD_ABS_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + imm32)) */
474 off = IMM;
475load_half:
476 ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 2, &tmp);
477 if (likely(ptr != NULL)) {
478 BPF_R0 = get_unaligned_be16(ptr);
479 CONT;
480 }
481
482 return 0;
483 LD_ABS_B: /* BPF_R0 = *(u8 *) (skb->data + imm32) */
484 off = IMM;
485load_byte:
486 ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 1, &tmp);
487 if (likely(ptr != NULL)) {
488 BPF_R0 = *(u8 *)ptr;
489 CONT;
490 }
491
492 return 0;
493 LD_IND_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + src_reg + imm32)) */
494 off = IMM + SRC;
495 goto load_word;
496 LD_IND_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + src_reg + imm32)) */
497 off = IMM + SRC;
498 goto load_half;
499 LD_IND_B: /* BPF_R0 = *(u8 *) (skb->data + src_reg + imm32) */
500 off = IMM + SRC;
501 goto load_byte;
502
503 default_label:
504 /* If we ever reach this, we have a bug somewhere. */
505 WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code);
506 return 0;
507}
508
509void __weak bpf_int_jit_compile(struct bpf_prog *prog)
510{
511}
512
513/**
514 * bpf_prog_select_runtime - select execution runtime for BPF program
515 * @fp: bpf_prog populated with internal BPF program
516 *
517 * try to JIT internal BPF program, if JIT is not available select interpreter
518 * BPF program will be executed via BPF_PROG_RUN() macro
519 */
520void bpf_prog_select_runtime(struct bpf_prog *fp)
521{
522 fp->bpf_func = (void *) __bpf_prog_run;
523
524 /* Probe if internal BPF can be JITed */
525 bpf_int_jit_compile(fp);
526}
527EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
528
529/* free internal BPF program */
530void bpf_prog_free(struct bpf_prog *fp)
531{
532 bpf_jit_free(fp);
533}
534EXPORT_SYMBOL_GPL(bpf_prog_free);
diff --git a/kernel/capability.c b/kernel/capability.c
index a5cf13c018ce..989f5bfc57dc 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -258,6 +258,10 @@ SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
 		i++;
 	}
 
+	effective.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
+	permitted.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
+	inheritable.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
+
 	new = prepare_creds();
 	if (!new)
 		return -ENOMEM;
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 2f7c760305ca..379650b984f8 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -2472,7 +2472,7 @@ static void kdb_gmtime(struct timespec *tv, struct kdb_tm *tm)
 static void kdb_sysinfo(struct sysinfo *val)
 {
 	struct timespec uptime;
-	do_posix_clock_monotonic_gettime(&uptime);
+	ktime_get_ts(&uptime);
 	memset(val, 0, sizeof(*val));
 	val->uptime = uptime.tv_sec;
 	val->loads[0] = avenrun[0];
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 54996b71e66d..ef90b04d783f 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -46,42 +46,25 @@ void __delayacct_tsk_init(struct task_struct *tsk)
46} 46}
47 47
48/* 48/*
49 * Start accounting for a delay statistic using 49 * Finish delay accounting for a statistic using its timestamps (@start),
50 * its starting timestamp (@start) 50 * accumalator (@total) and @count
51 */ 51 */
52 52static void delayacct_end(u64 *start, u64 *total, u32 *count)
53static inline void delayacct_start(struct timespec *start)
54{ 53{
55 do_posix_clock_monotonic_gettime(start); 54 s64 ns = ktime_get_ns() - *start;
56}
57
58/*
59 * Finish delay accounting for a statistic using
60 * its timestamps (@start, @end), accumalator (@total) and @count
61 */
62
63static void delayacct_end(struct timespec *start, struct timespec *end,
64 u64 *total, u32 *count)
65{
66 struct timespec ts;
67 s64 ns;
68 unsigned long flags; 55 unsigned long flags;
69 56
70 do_posix_clock_monotonic_gettime(end); 57 if (ns > 0) {
71 ts = timespec_sub(*end, *start); 58 spin_lock_irqsave(&current->delays->lock, flags);
72 ns = timespec_to_ns(&ts); 59 *total += ns;
73 if (ns < 0) 60 (*count)++;
74 return; 61 spin_unlock_irqrestore(&current->delays->lock, flags);
75 62 }
76 spin_lock_irqsave(&current->delays->lock, flags);
77 *total += ns;
78 (*count)++;
79 spin_unlock_irqrestore(&current->delays->lock, flags);
80} 63}
81 64
82void __delayacct_blkio_start(void) 65void __delayacct_blkio_start(void)
83{ 66{
84 delayacct_start(&current->delays->blkio_start); 67 current->delays->blkio_start = ktime_get_ns();
85} 68}
86 69
87void __delayacct_blkio_end(void) 70void __delayacct_blkio_end(void)
@@ -89,35 +72,29 @@ void __delayacct_blkio_end(void)
89 if (current->delays->flags & DELAYACCT_PF_SWAPIN) 72 if (current->delays->flags & DELAYACCT_PF_SWAPIN)
90 /* Swapin block I/O */ 73 /* Swapin block I/O */
91 delayacct_end(&current->delays->blkio_start, 74 delayacct_end(&current->delays->blkio_start,
92 &current->delays->blkio_end,
93 &current->delays->swapin_delay, 75 &current->delays->swapin_delay,
94 &current->delays->swapin_count); 76 &current->delays->swapin_count);
95 else /* Other block I/O */ 77 else /* Other block I/O */
96 delayacct_end(&current->delays->blkio_start, 78 delayacct_end(&current->delays->blkio_start,
97 &current->delays->blkio_end,
98 &current->delays->blkio_delay, 79 &current->delays->blkio_delay,
99 &current->delays->blkio_count); 80 &current->delays->blkio_count);
100} 81}
101 82
102int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) 83int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
103{ 84{
104 s64 tmp;
105 unsigned long t1;
106 unsigned long long t2, t3;
107 unsigned long flags;
108 struct timespec ts;
109 cputime_t utime, stime, stimescaled, utimescaled; 85 cputime_t utime, stime, stimescaled, utimescaled;
86 unsigned long long t2, t3;
87 unsigned long flags, t1;
88 s64 tmp;
110 89
111 tmp = (s64)d->cpu_run_real_total;
112 task_cputime(tsk, &utime, &stime); 90 task_cputime(tsk, &utime, &stime);
113 cputime_to_timespec(utime + stime, &ts); 91 tmp = (s64)d->cpu_run_real_total;
114 tmp += timespec_to_ns(&ts); 92 tmp += cputime_to_nsecs(utime + stime);
115 d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp; 93 d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp;
116 94
117 tmp = (s64)d->cpu_scaled_run_real_total;
118 task_cputime_scaled(tsk, &utimescaled, &stimescaled); 95 task_cputime_scaled(tsk, &utimescaled, &stimescaled);
119 cputime_to_timespec(utimescaled + stimescaled, &ts); 96 tmp = (s64)d->cpu_scaled_run_real_total;
120 tmp += timespec_to_ns(&ts); 97 tmp += cputime_to_nsecs(utimescaled + stimescaled);
121 d->cpu_scaled_run_real_total = 98 d->cpu_scaled_run_real_total =
122 (tmp < (s64)d->cpu_scaled_run_real_total) ? 0 : tmp; 99 (tmp < (s64)d->cpu_scaled_run_real_total) ? 0 : tmp;
123 100
@@ -169,13 +146,12 @@ __u64 __delayacct_blkio_ticks(struct task_struct *tsk)
169 146
170void __delayacct_freepages_start(void) 147void __delayacct_freepages_start(void)
171{ 148{
172 delayacct_start(&current->delays->freepages_start); 149 current->delays->freepages_start = ktime_get_ns();
173} 150}
174 151
175void __delayacct_freepages_end(void) 152void __delayacct_freepages_end(void)
176{ 153{
177 delayacct_end(&current->delays->freepages_start, 154 delayacct_end(&current->delays->freepages_start,
178 &current->delays->freepages_end,
179 &current->delays->freepages_delay, 155 &current->delays->freepages_delay,
180 &current->delays->freepages_count); 156 &current->delays->freepages_count);
181} 157}
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 6f3254e8c137..1d0af8a2c646 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -167,6 +167,11 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
 	/* For mmu_notifiers */
 	const unsigned long mmun_start = addr;
 	const unsigned long mmun_end = addr + PAGE_SIZE;
+	struct mem_cgroup *memcg;
+
+	err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg);
+	if (err)
+		return err;
 
 	/* For try_to_free_swap() and munlock_vma_page() below */
 	lock_page(page);
@@ -179,6 +184,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
 
 	get_page(kpage);
 	page_add_new_anon_rmap(kpage, vma, addr);
+	mem_cgroup_commit_charge(kpage, memcg, false);
+	lru_cache_add_active_or_unevictable(kpage, vma);
 
 	if (!PageAnon(page)) {
 		dec_mm_counter(mm, MM_FILEPAGES);
@@ -200,6 +207,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
 
 	err = 0;
  unlock:
+	mem_cgroup_cancel_charge(kpage, memcg);
 	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 	unlock_page(page);
 	return err;
@@ -315,18 +323,11 @@ retry:
 	if (!new_page)
 		goto put_old;
 
-	if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL))
-		goto put_new;
-
 	__SetPageUptodate(new_page);
 	copy_highpage(new_page, old_page);
 	copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
 
 	ret = __replace_page(vma, vaddr, old_page, new_page);
-	if (ret)
-		mem_cgroup_uncharge_page(new_page);
-
-put_new:
 	page_cache_release(new_page);
 put_old:
 	put_page(old_page);
diff --git a/kernel/exit.c b/kernel/exit.c
index e5c4668f1799..32c58f7433a3 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -59,7 +59,7 @@
59#include <asm/pgtable.h> 59#include <asm/pgtable.h>
60#include <asm/mmu_context.h> 60#include <asm/mmu_context.h>
61 61
62static void exit_mm(struct task_struct * tsk); 62static void exit_mm(struct task_struct *tsk);
63 63
64static void __unhash_process(struct task_struct *p, bool group_dead) 64static void __unhash_process(struct task_struct *p, bool group_dead)
65{ 65{
@@ -151,7 +151,7 @@ static void __exit_signal(struct task_struct *tsk)
151 spin_unlock(&sighand->siglock); 151 spin_unlock(&sighand->siglock);
152 152
153 __cleanup_sighand(sighand); 153 __cleanup_sighand(sighand);
154 clear_tsk_thread_flag(tsk,TIF_SIGPENDING); 154 clear_tsk_thread_flag(tsk, TIF_SIGPENDING);
155 if (group_dead) { 155 if (group_dead) {
156 flush_sigqueue(&sig->shared_pending); 156 flush_sigqueue(&sig->shared_pending);
157 tty_kref_put(tty); 157 tty_kref_put(tty);
@@ -168,7 +168,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
168} 168}
169 169
170 170
171void release_task(struct task_struct * p) 171void release_task(struct task_struct *p)
172{ 172{
173 struct task_struct *leader; 173 struct task_struct *leader;
174 int zap_leader; 174 int zap_leader;
@@ -192,7 +192,8 @@ repeat:
192 */ 192 */
193 zap_leader = 0; 193 zap_leader = 0;
194 leader = p->group_leader; 194 leader = p->group_leader;
195 if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { 195 if (leader != p && thread_group_empty(leader)
196 && leader->exit_state == EXIT_ZOMBIE) {
196 /* 197 /*
197 * If we were the last child thread and the leader has 198 * If we were the last child thread and the leader has
198 * exited already, and the leader's parent ignores SIGCHLD, 199 * exited already, and the leader's parent ignores SIGCHLD,
@@ -241,7 +242,8 @@ struct pid *session_of_pgrp(struct pid *pgrp)
241 * 242 *
242 * "I ask you, have you ever known what it is to be an orphan?" 243 * "I ask you, have you ever known what it is to be an orphan?"
243 */ 244 */
244static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task) 245static int will_become_orphaned_pgrp(struct pid *pgrp,
246 struct task_struct *ignored_task)
245{ 247{
246 struct task_struct *p; 248 struct task_struct *p;
247 249
@@ -294,9 +296,9 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
294 struct task_struct *ignored_task = tsk; 296 struct task_struct *ignored_task = tsk;
295 297
296 if (!parent) 298 if (!parent)
297 /* exit: our father is in a different pgrp than 299 /* exit: our father is in a different pgrp than
298 * we are and we were the only connection outside. 300 * we are and we were the only connection outside.
299 */ 301 */
300 parent = tsk->real_parent; 302 parent = tsk->real_parent;
301 else 303 else
302 /* reparent: our child is in a different pgrp than 304 /* reparent: our child is in a different pgrp than
@@ -405,7 +407,7 @@ assign_new_owner:
405 * Turn us into a lazy TLB process if we 407 * Turn us into a lazy TLB process if we
406 * aren't already.. 408 * aren't already..
407 */ 409 */
408static void exit_mm(struct task_struct * tsk) 410static void exit_mm(struct task_struct *tsk)
409{ 411{
410 struct mm_struct *mm = tsk->mm; 412 struct mm_struct *mm = tsk->mm;
411 struct core_state *core_state; 413 struct core_state *core_state;
@@ -425,6 +427,7 @@ static void exit_mm(struct task_struct * tsk)
425 core_state = mm->core_state; 427 core_state = mm->core_state;
426 if (core_state) { 428 if (core_state) {
427 struct core_thread self; 429 struct core_thread self;
430
428 up_read(&mm->mmap_sem); 431 up_read(&mm->mmap_sem);
429 432
430 self.task = tsk; 433 self.task = tsk;
@@ -455,6 +458,7 @@ static void exit_mm(struct task_struct * tsk)
455 task_unlock(tsk); 458 task_unlock(tsk);
456 mm_update_next_owner(mm); 459 mm_update_next_owner(mm);
457 mmput(mm); 460 mmput(mm);
461 clear_thread_flag(TIF_MEMDIE);
458} 462}
459 463
460/* 464/*
@@ -565,6 +569,7 @@ static void forget_original_parent(struct task_struct *father)
565 569
566 list_for_each_entry_safe(p, n, &father->children, sibling) { 570 list_for_each_entry_safe(p, n, &father->children, sibling) {
567 struct task_struct *t = p; 571 struct task_struct *t = p;
572
568 do { 573 do {
569 t->real_parent = reaper; 574 t->real_parent = reaper;
570 if (t->parent == father) { 575 if (t->parent == father) {
@@ -598,7 +603,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
598 /* 603 /*
599 * This does two things: 604 * This does two things:
600 * 605 *
601 * A. Make init inherit all the child processes 606 * A. Make init inherit all the child processes
602 * B. Check to see if any process groups have become orphaned 607 * B. Check to see if any process groups have become orphaned
603 * as a result of our exiting, and if they have any stopped 608 * as a result of our exiting, and if they have any stopped
604 * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) 609 * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
@@ -648,9 +653,8 @@ static void check_stack_usage(void)
648 653
649 spin_lock(&low_water_lock); 654 spin_lock(&low_water_lock);
650 if (free < lowest_to_date) { 655 if (free < lowest_to_date) {
651 printk(KERN_WARNING "%s (%d) used greatest stack depth: " 656 pr_warn("%s (%d) used greatest stack depth: %lu bytes left\n",
652 "%lu bytes left\n", 657 current->comm, task_pid_nr(current), free);
653 current->comm, task_pid_nr(current), free);
654 lowest_to_date = free; 658 lowest_to_date = free;
655 } 659 }
656 spin_unlock(&low_water_lock); 660 spin_unlock(&low_water_lock);
@@ -691,8 +695,7 @@ void do_exit(long code)
691 * leave this task alone and wait for reboot. 695 * leave this task alone and wait for reboot.
692 */ 696 */
693 if (unlikely(tsk->flags & PF_EXITING)) { 697 if (unlikely(tsk->flags & PF_EXITING)) {
694 printk(KERN_ALERT 698 pr_alert("Fixing recursive fault but reboot is needed!\n");
695 "Fixing recursive fault but reboot is needed!\n");
696 /* 699 /*
697 * We can do this unlocked here. The futex code uses 700 * We can do this unlocked here. The futex code uses
698 * this flag just to verify whether the pi state 701 * this flag just to verify whether the pi state
@@ -716,9 +719,9 @@ void do_exit(long code)
716 raw_spin_unlock_wait(&tsk->pi_lock); 719 raw_spin_unlock_wait(&tsk->pi_lock);
717 720
718 if (unlikely(in_atomic())) 721 if (unlikely(in_atomic()))
719 printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", 722 pr_info("note: %s[%d] exited with preempt_count %d\n",
720 current->comm, task_pid_nr(current), 723 current->comm, task_pid_nr(current),
721 preempt_count()); 724 preempt_count());
722 725
723 acct_update_integrals(tsk); 726 acct_update_integrals(tsk);
724 /* sync mm's RSS info before statistics gathering */ 727 /* sync mm's RSS info before statistics gathering */
@@ -836,7 +839,6 @@ void do_exit(long code)
836 for (;;) 839 for (;;)
837 cpu_relax(); /* For when BUG is null */ 840 cpu_relax(); /* For when BUG is null */
838} 841}
839
840EXPORT_SYMBOL_GPL(do_exit); 842EXPORT_SYMBOL_GPL(do_exit);
841 843
842void complete_and_exit(struct completion *comp, long code) 844void complete_and_exit(struct completion *comp, long code)
@@ -846,7 +848,6 @@ void complete_and_exit(struct completion *comp, long code)
846 848
847 do_exit(code); 849 do_exit(code);
848} 850}
849
850EXPORT_SYMBOL(complete_and_exit); 851EXPORT_SYMBOL(complete_and_exit);
851 852
852SYSCALL_DEFINE1(exit, int, error_code) 853SYSCALL_DEFINE1(exit, int, error_code)
@@ -869,6 +870,7 @@ do_group_exit(int exit_code)
869 exit_code = sig->group_exit_code; 870 exit_code = sig->group_exit_code;
870 else if (!thread_group_empty(current)) { 871 else if (!thread_group_empty(current)) {
871 struct sighand_struct *const sighand = current->sighand; 872 struct sighand_struct *const sighand = current->sighand;
873
872 spin_lock_irq(&sighand->siglock); 874 spin_lock_irq(&sighand->siglock);
873 if (signal_group_exit(sig)) 875 if (signal_group_exit(sig))
874 /* Another thread got here before we took the lock. */ 876 /* Another thread got here before we took the lock. */
@@ -1033,9 +1035,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1033 * as other threads in the parent group can be right 1035 * as other threads in the parent group can be right
1034 * here reaping other children at the same time. 1036 * here reaping other children at the same time.
1035 * 1037 *
1036 * We use thread_group_cputime_adjusted() to get times for the thread 1038 * We use thread_group_cputime_adjusted() to get times for
1037 * group, which consolidates times for all threads in the 1039 * the thread group, which consolidates times for all threads
1038 * group including the group leader. 1040 * in the group including the group leader.
1039 */ 1041 */
1040 thread_group_cputime_adjusted(p, &tgutime, &tgstime); 1042 thread_group_cputime_adjusted(p, &tgutime, &tgstime);
1041 spin_lock_irq(&p->real_parent->sighand->siglock); 1043 spin_lock_irq(&p->real_parent->sighand->siglock);
@@ -1417,6 +1419,7 @@ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
1417 1419
1418 list_for_each_entry(p, &tsk->children, sibling) { 1420 list_for_each_entry(p, &tsk->children, sibling) {
1419 int ret = wait_consider_task(wo, 0, p); 1421 int ret = wait_consider_task(wo, 0, p);
1422
1420 if (ret) 1423 if (ret)
1421 return ret; 1424 return ret;
1422 } 1425 }
@@ -1430,6 +1433,7 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
1430 1433
1431 list_for_each_entry(p, &tsk->ptraced, ptrace_entry) { 1434 list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
1432 int ret = wait_consider_task(wo, 1, p); 1435 int ret = wait_consider_task(wo, 1, p);
1436
1433 if (ret) 1437 if (ret)
1434 return ret; 1438 return ret;
1435 } 1439 }
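
The kernel/exit.c hunks above are style cleanups: pointer-declaration spacing, comment reflow, dropped blank lines before EXPORT_SYMBOL, and conversion of raw printk() calls to the pr_*() helpers. The two forms below, taken from the check_stack_usage() hunk, are equivalent (pr_warn() would additionally pick up a pr_fmt() prefix if the file defined one):

	printk(KERN_WARNING "%s (%d) used greatest stack depth: %lu bytes left\n",
	       current->comm, task_pid_nr(current), free);

	pr_warn("%s (%d) used greatest stack depth: %lu bytes left\n",
		current->comm, task_pid_nr(current), free);
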
diff --git a/kernel/fork.c b/kernel/fork.c
index 962885edbe53..0cf9cdb6e491 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -315,6 +315,15 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
315 goto free_ti; 315 goto free_ti;
316 316
317 tsk->stack = ti; 317 tsk->stack = ti;
318#ifdef CONFIG_SECCOMP
319 /*
320 * We must handle setting up seccomp filters once we're under
321 * the sighand lock in case orig has changed between now and
322 * then. Until then, filter must be NULL to avoid messing up
323 * the usage counts on the error path calling free_task.
324 */
325 tsk->seccomp.filter = NULL;
326#endif
318 327
319 setup_thread_stack(tsk, orig); 328 setup_thread_stack(tsk, orig);
320 clear_user_return_notifier(tsk); 329 clear_user_return_notifier(tsk);
@@ -365,12 +374,11 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
365 */ 374 */
366 down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING); 375 down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);
367 376
368 mm->locked_vm = 0; 377 mm->total_vm = oldmm->total_vm;
369 mm->mmap = NULL; 378 mm->shared_vm = oldmm->shared_vm;
370 mm->vmacache_seqnum = 0; 379 mm->exec_vm = oldmm->exec_vm;
371 mm->map_count = 0; 380 mm->stack_vm = oldmm->stack_vm;
372 cpumask_clear(mm_cpumask(mm)); 381
373 mm->mm_rb = RB_ROOT;
374 rb_link = &mm->mm_rb.rb_node; 382 rb_link = &mm->mm_rb.rb_node;
375 rb_parent = NULL; 383 rb_parent = NULL;
376 pprev = &mm->mmap; 384 pprev = &mm->mmap;
@@ -421,7 +429,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
421 atomic_dec(&inode->i_writecount); 429 atomic_dec(&inode->i_writecount);
422 mutex_lock(&mapping->i_mmap_mutex); 430 mutex_lock(&mapping->i_mmap_mutex);
423 if (tmp->vm_flags & VM_SHARED) 431 if (tmp->vm_flags & VM_SHARED)
424 mapping->i_mmap_writable++; 432 atomic_inc(&mapping->i_mmap_writable);
425 flush_dcache_mmap_lock(mapping); 433 flush_dcache_mmap_lock(mapping);
426 /* insert tmp into the share list, just after mpnt */ 434 /* insert tmp into the share list, just after mpnt */
427 if (unlikely(tmp->vm_flags & VM_NONLINEAR)) 435 if (unlikely(tmp->vm_flags & VM_NONLINEAR))
@@ -527,19 +535,37 @@ static void mm_init_aio(struct mm_struct *mm)
527#endif 535#endif
528} 536}
529 537
538static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
539{
540#ifdef CONFIG_MEMCG
541 mm->owner = p;
542#endif
543}
544
530static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) 545static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
531{ 546{
547 mm->mmap = NULL;
548 mm->mm_rb = RB_ROOT;
549 mm->vmacache_seqnum = 0;
532 atomic_set(&mm->mm_users, 1); 550 atomic_set(&mm->mm_users, 1);
533 atomic_set(&mm->mm_count, 1); 551 atomic_set(&mm->mm_count, 1);
534 init_rwsem(&mm->mmap_sem); 552 init_rwsem(&mm->mmap_sem);
535 INIT_LIST_HEAD(&mm->mmlist); 553 INIT_LIST_HEAD(&mm->mmlist);
536 mm->core_state = NULL; 554 mm->core_state = NULL;
537 atomic_long_set(&mm->nr_ptes, 0); 555 atomic_long_set(&mm->nr_ptes, 0);
556 mm->map_count = 0;
557 mm->locked_vm = 0;
558 mm->pinned_vm = 0;
538 memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); 559 memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
539 spin_lock_init(&mm->page_table_lock); 560 spin_lock_init(&mm->page_table_lock);
561 mm_init_cpumask(mm);
540 mm_init_aio(mm); 562 mm_init_aio(mm);
541 mm_init_owner(mm, p); 563 mm_init_owner(mm, p);
564 mmu_notifier_mm_init(mm);
542 clear_tlb_flush_pending(mm); 565 clear_tlb_flush_pending(mm);
566#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
567 mm->pmd_huge_pte = NULL;
568#endif
543 569
544 if (current->mm) { 570 if (current->mm) {
545 mm->flags = current->mm->flags & MMF_INIT_MASK; 571 mm->flags = current->mm->flags & MMF_INIT_MASK;
@@ -549,11 +575,17 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
549 mm->def_flags = 0; 575 mm->def_flags = 0;
550 } 576 }
551 577
552 if (likely(!mm_alloc_pgd(mm))) { 578 if (mm_alloc_pgd(mm))
553 mmu_notifier_mm_init(mm); 579 goto fail_nopgd;
554 return mm; 580
555 } 581 if (init_new_context(p, mm))
582 goto fail_nocontext;
556 583
584 return mm;
585
586fail_nocontext:
587 mm_free_pgd(mm);
588fail_nopgd:
557 free_mm(mm); 589 free_mm(mm);
558 return NULL; 590 return NULL;
559} 591}
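
The mm_init() rewrite above gathers the field initialization that used to be scattered across dup_mm() and mm_alloc(), and switches the failure handling to the usual goto-unwind ladder: each step that can fail gets a label, and an error jumps to the label that releases everything acquired so far, in reverse order. A generic sketch of the idiom (names are illustrative, not the real mm hooks):

static struct foo *foo_init(struct foo *f)
{
	if (foo_alloc_tables(f))	/* hypothetical, like mm_alloc_pgd() */
		goto fail_tables;
	if (foo_init_context(f))	/* hypothetical, like init_new_context() */
		goto fail_context;
	return f;

fail_context:
	foo_free_tables(f);		/* undo in reverse order of acquisition */
fail_tables:
	foo_free(f);
	return NULL;
}
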
@@ -587,7 +619,6 @@ struct mm_struct *mm_alloc(void)
587 return NULL; 619 return NULL;
588 620
589 memset(mm, 0, sizeof(*mm)); 621 memset(mm, 0, sizeof(*mm));
590 mm_init_cpumask(mm);
591 return mm_init(mm, current); 622 return mm_init(mm, current);
592} 623}
593 624
@@ -819,17 +850,10 @@ static struct mm_struct *dup_mm(struct task_struct *tsk)
819 goto fail_nomem; 850 goto fail_nomem;
820 851
821 memcpy(mm, oldmm, sizeof(*mm)); 852 memcpy(mm, oldmm, sizeof(*mm));
822 mm_init_cpumask(mm);
823 853
824#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
825 mm->pmd_huge_pte = NULL;
826#endif
827 if (!mm_init(mm, tsk)) 854 if (!mm_init(mm, tsk))
828 goto fail_nomem; 855 goto fail_nomem;
829 856
830 if (init_new_context(tsk, mm))
831 goto fail_nocontext;
832
833 dup_mm_exe_file(oldmm, mm); 857 dup_mm_exe_file(oldmm, mm);
834 858
835 err = dup_mmap(mm, oldmm); 859 err = dup_mmap(mm, oldmm);
@@ -851,15 +875,6 @@ free_pt:
851 875
852fail_nomem: 876fail_nomem:
853 return NULL; 877 return NULL;
854
855fail_nocontext:
856 /*
857 * If init_new_context() failed, we cannot use mmput() to free the mm
858 * because it calls destroy_context()
859 */
860 mm_free_pgd(mm);
861 free_mm(mm);
862 return NULL;
863} 878}
864 879
865static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) 880static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
@@ -1081,6 +1096,39 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
1081 return 0; 1096 return 0;
1082} 1097}
1083 1098
1099static void copy_seccomp(struct task_struct *p)
1100{
1101#ifdef CONFIG_SECCOMP
1102 /*
1103 * Must be called with sighand->lock held, which is common to
1104 * all threads in the group. Holding cred_guard_mutex is not
1105 * needed because this new task is not yet running and cannot
1106 * be racing exec.
1107 */
1108 assert_spin_locked(&current->sighand->siglock);
1109
1110 /* Ref-count the new filter user, and assign it. */
1111 get_seccomp_filter(current);
1112 p->seccomp = current->seccomp;
1113
1114 /*
1115 * Explicitly enable no_new_privs here in case it got set
1116 * between the task_struct being duplicated and holding the
1117 * sighand lock. The seccomp state and nnp must be in sync.
1118 */
1119 if (task_no_new_privs(current))
1120 task_set_no_new_privs(p);
1121
1122 /*
1123 * If the parent gained a seccomp mode after copying thread
1124 * flags and between before we held the sighand lock, we have
1125 * to manually enable the seccomp thread flag here.
1126 */
1127 if (p->seccomp.mode != SECCOMP_MODE_DISABLED)
1128 set_tsk_thread_flag(p, TIF_SECCOMP);
1129#endif
1130}
1131
1084SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr) 1132SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
1085{ 1133{
1086 current->clear_child_tid = tidptr; 1134 current->clear_child_tid = tidptr;
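
copy_seccomp() above runs with sighand->siglock held so that the child's filter reference, the no_new_privs bit and TIF_SECCOMP cannot go stale against a concurrent seccomp change in another thread of the parent. The user-visible behaviour is unchanged: a forked child inherits the parent's filter and nnp state. A small illustrative program (allow-all BPF filter, standard uapi headers assumed):

#include <stdio.h>
#include <unistd.h>
#include <sys/prctl.h>
#include <linux/filter.h>
#include <linux/seccomp.h>

int main(void)
{
	struct sock_filter allow = BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW);
	struct sock_fprog prog = { .len = 1, .filter = &allow };

	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
		return 1;
	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog))
		return 1;

	if (fork() == 0) {
		/* child: filter and no_new_privs were copied at fork time */
		printf("child seccomp mode: %d\n", prctl(PR_GET_SECCOMP));
		_exit(0);
	}
	return 0;
}
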
@@ -1098,13 +1146,6 @@ static void rt_mutex_init_task(struct task_struct *p)
1098#endif 1146#endif
1099} 1147}
1100 1148
1101#ifdef CONFIG_MEMCG
1102void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
1103{
1104 mm->owner = p;
1105}
1106#endif /* CONFIG_MEMCG */
1107
1108/* 1149/*
1109 * Initialize POSIX timer handling for a single task. 1150 * Initialize POSIX timer handling for a single task.
1110 */ 1151 */
@@ -1195,7 +1236,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1195 goto fork_out; 1236 goto fork_out;
1196 1237
1197 ftrace_graph_init_task(p); 1238 ftrace_graph_init_task(p);
1198 get_seccomp_filter(p);
1199 1239
1200 rt_mutex_init_task(p); 1240 rt_mutex_init_task(p);
1201 1241
@@ -1261,9 +1301,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1261 1301
1262 posix_cpu_timers_init(p); 1302 posix_cpu_timers_init(p);
1263 1303
1264 do_posix_clock_monotonic_gettime(&p->start_time); 1304 p->start_time = ktime_get_ns();
1265 p->real_start_time = p->start_time; 1305 p->real_start_time = ktime_get_boot_ns();
1266 monotonic_to_bootbased(&p->real_start_time);
1267 p->io_context = NULL; 1306 p->io_context = NULL;
1268 p->audit_context = NULL; 1307 p->audit_context = NULL;
1269 if (clone_flags & CLONE_THREAD) 1308 if (clone_flags & CLONE_THREAD)
@@ -1306,10 +1345,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1306#ifdef CONFIG_DEBUG_MUTEXES 1345#ifdef CONFIG_DEBUG_MUTEXES
1307 p->blocked_on = NULL; /* not blocked yet */ 1346 p->blocked_on = NULL; /* not blocked yet */
1308#endif 1347#endif
1309#ifdef CONFIG_MEMCG
1310 p->memcg_batch.do_batch = 0;
1311 p->memcg_batch.memcg = NULL;
1312#endif
1313#ifdef CONFIG_BCACHE 1348#ifdef CONFIG_BCACHE
1314 p->sequential_io = 0; 1349 p->sequential_io = 0;
1315 p->sequential_io_avg = 0; 1350 p->sequential_io_avg = 0;
@@ -1327,6 +1362,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1327 if (retval) 1362 if (retval)
1328 goto bad_fork_cleanup_policy; 1363 goto bad_fork_cleanup_policy;
1329 /* copy all the process information */ 1364 /* copy all the process information */
1365 shm_init_task(p);
1330 retval = copy_semundo(clone_flags, p); 1366 retval = copy_semundo(clone_flags, p);
1331 if (retval) 1367 if (retval)
1332 goto bad_fork_cleanup_audit; 1368 goto bad_fork_cleanup_audit;
@@ -1436,6 +1472,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1436 spin_lock(&current->sighand->siglock); 1472 spin_lock(&current->sighand->siglock);
1437 1473
1438 /* 1474 /*
1475 * Copy seccomp details explicitly here, in case they were changed
1476 * before holding sighand lock.
1477 */
1478 copy_seccomp(p);
1479
1480 /*
1439 * Process group and session signals need to be delivered to just the 1481 * Process group and session signals need to be delivered to just the
1440 * parent before the fork or both the parent and the child after the 1482 * parent before the fork or both the parent and the child after the
1441 * fork. Restart if a signal comes in before we add the new process to 1483 * fork. Restart if a signal comes in before we add the new process to
@@ -1872,6 +1914,11 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1872 */ 1914 */
1873 exit_sem(current); 1915 exit_sem(current);
1874 } 1916 }
1917 if (unshare_flags & CLONE_NEWIPC) {
1918 /* Orphan segments in old ns (see sem above). */
1919 exit_shm(current);
1920 shm_init_task(current);
1921 }
1875 1922
1876 if (new_nsproxy) 1923 if (new_nsproxy)
1877 switch_task_namespaces(current, new_nsproxy); 1924 switch_task_namespaces(current, new_nsproxy);
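
The unshare() hunk just above pairs exit_shm()/shm_init_task() with CLONE_NEWIPC so that SysV shm segments created before the call are left behind in the old namespace rather than following the task. A minimal user-space counterpart (requires CAP_SYS_ADMIN; shmget() after the call lands in the fresh, empty namespace):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <sys/ipc.h>
#include <sys/shm.h>

int main(void)
{
	int before = shmget(IPC_PRIVATE, 4096, IPC_CREAT | 0600);

	if (unshare(CLONE_NEWIPC)) {
		perror("unshare");
		return 1;
	}

	/* 'before' stays with the previous namespace; this one is new */
	int after = shmget(IPC_PRIVATE, 4096, IPC_CREAT | 0600);

	printf("old id %d, new id %d\n", before, after);
	return 0;
}
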
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c
index 15ff01a76379..edf67c493a8e 100644
--- a/kernel/gcov/fs.c
+++ b/kernel/gcov/fs.c
@@ -784,8 +784,7 @@ static __init int gcov_fs_init(void)
784 784
785err_remove: 785err_remove:
786 pr_err("init failed\n"); 786 pr_err("init failed\n");
787 if (root_node.dentry) 787 debugfs_remove(root_node.dentry);
788 debugfs_remove(root_node.dentry);
789 788
790 return rc; 789 return rc;
791} 790}
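
The gcov hunk drops the NULL check because debugfs_remove() already tolerates a NULL (or ERR_PTR) dentry, so the call is safe unconditionally; inside the kernel the following is a no-op:

	struct dentry *d = NULL;

	debugfs_remove(d);	/* returns immediately for NULL/ERR_PTR dentries */
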
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index 452d6f2ba21d..cf80e7b0ddab 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -341,8 +341,8 @@ static struct lock_class_key irq_nested_lock_class;
341/* 341/*
342 * irq_map_generic_chip - Map a generic chip for an irq domain 342 * irq_map_generic_chip - Map a generic chip for an irq domain
343 */ 343 */
344static int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, 344int irq_map_generic_chip(struct irq_domain *d, unsigned int virq,
345 irq_hw_number_t hw_irq) 345 irq_hw_number_t hw_irq)
346{ 346{
347 struct irq_data *data = irq_get_irq_data(virq); 347 struct irq_data *data = irq_get_irq_data(virq);
348 struct irq_domain_chip_generic *dgc = d->gc; 348 struct irq_domain_chip_generic *dgc = d->gc;
@@ -394,6 +394,7 @@ static int irq_map_generic_chip(struct irq_domain *d, unsigned int virq,
394 irq_modify_status(virq, dgc->irq_flags_to_clear, dgc->irq_flags_to_set); 394 irq_modify_status(virq, dgc->irq_flags_to_clear, dgc->irq_flags_to_set);
395 return 0; 395 return 0;
396} 396}
397EXPORT_SYMBOL_GPL(irq_map_generic_chip);
397 398
398struct irq_domain_ops irq_generic_chip_ops = { 399struct irq_domain_ops irq_generic_chip_ops = {
399 .map = irq_map_generic_chip, 400 .map = irq_map_generic_chip,
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index eb5e10e32e05..6534ff6ce02e 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -231,7 +231,7 @@ void irq_set_default_host(struct irq_domain *domain)
231} 231}
232EXPORT_SYMBOL_GPL(irq_set_default_host); 232EXPORT_SYMBOL_GPL(irq_set_default_host);
233 233
234static void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq) 234void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq)
235{ 235{
236 struct irq_data *irq_data = irq_get_irq_data(irq); 236 struct irq_data *irq_data = irq_get_irq_data(irq);
237 irq_hw_number_t hwirq; 237 irq_hw_number_t hwirq;
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index cb0cf37dac3a..ae5167087845 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -364,7 +364,7 @@ static int __sprint_symbol(char *buffer, unsigned long address,
364 address += symbol_offset; 364 address += symbol_offset;
365 name = kallsyms_lookup(address, &size, &offset, &modname, buffer); 365 name = kallsyms_lookup(address, &size, &offset, &modname, buffer);
366 if (!name) 366 if (!name)
367 return sprintf(buffer, "0x%lx", address); 367 return sprintf(buffer, "0x%lx", address - symbol_offset);
368 368
369 if (name != buffer) 369 if (name != buffer)
370 strcpy(buffer, name); 370 strcpy(buffer, name);
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 4b8f0c925884..0b49a0a58102 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -6,6 +6,8 @@
6 * Version 2. See the file COPYING for more details. 6 * Version 2. See the file COPYING for more details.
7 */ 7 */
8 8
9#define pr_fmt(fmt) "kexec: " fmt
10
9#include <linux/capability.h> 11#include <linux/capability.h>
10#include <linux/mm.h> 12#include <linux/mm.h>
11#include <linux/file.h> 13#include <linux/file.h>
@@ -40,6 +42,9 @@
40#include <asm/io.h> 42#include <asm/io.h>
41#include <asm/sections.h> 43#include <asm/sections.h>
42 44
45#include <crypto/hash.h>
46#include <crypto/sha.h>
47
43/* Per cpu memory for storing cpu states in case of system crash. */ 48/* Per cpu memory for storing cpu states in case of system crash. */
44note_buf_t __percpu *crash_notes; 49note_buf_t __percpu *crash_notes;
45 50
@@ -52,6 +57,15 @@ size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
52/* Flag to indicate we are going to kexec a new kernel */ 57/* Flag to indicate we are going to kexec a new kernel */
53bool kexec_in_progress = false; 58bool kexec_in_progress = false;
54 59
60/*
61 * Declare these symbols weak so that if architecture provides a purgatory,
62 * these will be overridden.
63 */
64char __weak kexec_purgatory[0];
65size_t __weak kexec_purgatory_size = 0;
66
67static int kexec_calculate_store_digests(struct kimage *image);
68
55/* Location of the reserved area for the crash kernel */ 69/* Location of the reserved area for the crash kernel */
56struct resource crashk_res = { 70struct resource crashk_res = {
57 .name = "Crash kernel", 71 .name = "Crash kernel",
@@ -125,45 +139,27 @@ static struct page *kimage_alloc_page(struct kimage *image,
125 gfp_t gfp_mask, 139 gfp_t gfp_mask,
126 unsigned long dest); 140 unsigned long dest);
127 141
128static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, 142static int copy_user_segment_list(struct kimage *image,
129 unsigned long nr_segments, 143 unsigned long nr_segments,
130 struct kexec_segment __user *segments) 144 struct kexec_segment __user *segments)
131{ 145{
146 int ret;
132 size_t segment_bytes; 147 size_t segment_bytes;
133 struct kimage *image;
134 unsigned long i;
135 int result;
136
137 /* Allocate a controlling structure */
138 result = -ENOMEM;
139 image = kzalloc(sizeof(*image), GFP_KERNEL);
140 if (!image)
141 goto out;
142
143 image->head = 0;
144 image->entry = &image->head;
145 image->last_entry = &image->head;
146 image->control_page = ~0; /* By default this does not apply */
147 image->start = entry;
148 image->type = KEXEC_TYPE_DEFAULT;
149
150 /* Initialize the list of control pages */
151 INIT_LIST_HEAD(&image->control_pages);
152
153 /* Initialize the list of destination pages */
154 INIT_LIST_HEAD(&image->dest_pages);
155
156 /* Initialize the list of unusable pages */
157 INIT_LIST_HEAD(&image->unuseable_pages);
158 148
159 /* Read in the segments */ 149 /* Read in the segments */
160 image->nr_segments = nr_segments; 150 image->nr_segments = nr_segments;
161 segment_bytes = nr_segments * sizeof(*segments); 151 segment_bytes = nr_segments * sizeof(*segments);
162 result = copy_from_user(image->segment, segments, segment_bytes); 152 ret = copy_from_user(image->segment, segments, segment_bytes);
163 if (result) { 153 if (ret)
164 result = -EFAULT; 154 ret = -EFAULT;
165 goto out; 155
166 } 156 return ret;
157}
158
159static int sanity_check_segment_list(struct kimage *image)
160{
161 int result, i;
162 unsigned long nr_segments = image->nr_segments;
167 163
168 /* 164 /*
169 * Verify we have good destination addresses. The caller is 165 * Verify we have good destination addresses. The caller is
@@ -185,9 +181,9 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
185 mstart = image->segment[i].mem; 181 mstart = image->segment[i].mem;
186 mend = mstart + image->segment[i].memsz; 182 mend = mstart + image->segment[i].memsz;
187 if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK)) 183 if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
188 goto out; 184 return result;
189 if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT) 185 if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
190 goto out; 186 return result;
191 } 187 }
192 188
193 /* Verify our destination addresses do not overlap. 189 /* Verify our destination addresses do not overlap.
@@ -208,7 +204,7 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
208 pend = pstart + image->segment[j].memsz; 204 pend = pstart + image->segment[j].memsz;
209 /* Do the segments overlap ? */ 205 /* Do the segments overlap ? */
210 if ((mend > pstart) && (mstart < pend)) 206 if ((mend > pstart) && (mstart < pend))
211 goto out; 207 return result;
212 } 208 }
213 } 209 }
214 210
@@ -220,130 +216,401 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
220 result = -EINVAL; 216 result = -EINVAL;
221 for (i = 0; i < nr_segments; i++) { 217 for (i = 0; i < nr_segments; i++) {
222 if (image->segment[i].bufsz > image->segment[i].memsz) 218 if (image->segment[i].bufsz > image->segment[i].memsz)
223 goto out; 219 return result;
224 } 220 }
225 221
226 result = 0; 222 /*
227out: 223 * Verify we have good destination addresses. Normally
228 if (result == 0) 224 * the caller is responsible for making certain we don't
229 *rimage = image; 225 * attempt to load the new image into invalid or reserved
230 else 226 * areas of RAM. But crash kernels are preloaded into a
231 kfree(image); 227 * reserved area of ram. We must ensure the addresses
228 * are in the reserved area otherwise preloading the
229 * kernel could corrupt things.
230 */
232 231
233 return result; 232 if (image->type == KEXEC_TYPE_CRASH) {
233 result = -EADDRNOTAVAIL;
234 for (i = 0; i < nr_segments; i++) {
235 unsigned long mstart, mend;
236
237 mstart = image->segment[i].mem;
238 mend = mstart + image->segment[i].memsz - 1;
239 /* Ensure we are within the crash kernel limits */
240 if ((mstart < crashk_res.start) ||
241 (mend > crashk_res.end))
242 return result;
243 }
244 }
234 245
246 return 0;
247}
248
249static struct kimage *do_kimage_alloc_init(void)
250{
251 struct kimage *image;
252
253 /* Allocate a controlling structure */
254 image = kzalloc(sizeof(*image), GFP_KERNEL);
255 if (!image)
256 return NULL;
257
258 image->head = 0;
259 image->entry = &image->head;
260 image->last_entry = &image->head;
261 image->control_page = ~0; /* By default this does not apply */
262 image->type = KEXEC_TYPE_DEFAULT;
263
264 /* Initialize the list of control pages */
265 INIT_LIST_HEAD(&image->control_pages);
266
267 /* Initialize the list of destination pages */
268 INIT_LIST_HEAD(&image->dest_pages);
269
270 /* Initialize the list of unusable pages */
271 INIT_LIST_HEAD(&image->unusable_pages);
272
273 return image;
235} 274}
236 275
237static void kimage_free_page_list(struct list_head *list); 276static void kimage_free_page_list(struct list_head *list);
238 277
239static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, 278static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
240 unsigned long nr_segments, 279 unsigned long nr_segments,
241 struct kexec_segment __user *segments) 280 struct kexec_segment __user *segments,
281 unsigned long flags)
242{ 282{
243 int result; 283 int ret;
244 struct kimage *image; 284 struct kimage *image;
285 bool kexec_on_panic = flags & KEXEC_ON_CRASH;
286
287 if (kexec_on_panic) {
288 /* Verify we have a valid entry point */
289 if ((entry < crashk_res.start) || (entry > crashk_res.end))
290 return -EADDRNOTAVAIL;
291 }
245 292
246 /* Allocate and initialize a controlling structure */ 293 /* Allocate and initialize a controlling structure */
247 image = NULL; 294 image = do_kimage_alloc_init();
248 result = do_kimage_alloc(&image, entry, nr_segments, segments); 295 if (!image)
249 if (result) 296 return -ENOMEM;
250 goto out; 297
298 image->start = entry;
299
300 ret = copy_user_segment_list(image, nr_segments, segments);
301 if (ret)
302 goto out_free_image;
303
304 ret = sanity_check_segment_list(image);
305 if (ret)
306 goto out_free_image;
307
308 /* Enable the special crash kernel control page allocation policy. */
309 if (kexec_on_panic) {
310 image->control_page = crashk_res.start;
311 image->type = KEXEC_TYPE_CRASH;
312 }
251 313
252 /* 314 /*
253 * Find a location for the control code buffer, and add it 315 * Find a location for the control code buffer, and add it
254 * the vector of segments so that it's pages will also be 316 * the vector of segments so that it's pages will also be
255 * counted as destination pages. 317 * counted as destination pages.
256 */ 318 */
257 result = -ENOMEM; 319 ret = -ENOMEM;
258 image->control_code_page = kimage_alloc_control_pages(image, 320 image->control_code_page = kimage_alloc_control_pages(image,
259 get_order(KEXEC_CONTROL_PAGE_SIZE)); 321 get_order(KEXEC_CONTROL_PAGE_SIZE));
260 if (!image->control_code_page) { 322 if (!image->control_code_page) {
261 pr_err("Could not allocate control_code_buffer\n"); 323 pr_err("Could not allocate control_code_buffer\n");
262 goto out_free; 324 goto out_free_image;
263 } 325 }
264 326
265 image->swap_page = kimage_alloc_control_pages(image, 0); 327 if (!kexec_on_panic) {
266 if (!image->swap_page) { 328 image->swap_page = kimage_alloc_control_pages(image, 0);
267 pr_err("Could not allocate swap buffer\n"); 329 if (!image->swap_page) {
268 goto out_free; 330 pr_err("Could not allocate swap buffer\n");
331 goto out_free_control_pages;
332 }
269 } 333 }
270 334
271 *rimage = image; 335 *rimage = image;
272 return 0; 336 return 0;
273 337out_free_control_pages:
274out_free:
275 kimage_free_page_list(&image->control_pages); 338 kimage_free_page_list(&image->control_pages);
339out_free_image:
276 kfree(image); 340 kfree(image);
277out: 341 return ret;
278 return result;
279} 342}
280 343
281static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry, 344static int copy_file_from_fd(int fd, void **buf, unsigned long *buf_len)
282 unsigned long nr_segments,
283 struct kexec_segment __user *segments)
284{ 345{
285 int result; 346 struct fd f = fdget(fd);
286 struct kimage *image; 347 int ret;
287 unsigned long i; 348 struct kstat stat;
349 loff_t pos;
350 ssize_t bytes = 0;
288 351
289 image = NULL; 352 if (!f.file)
290 /* Verify we have a valid entry point */ 353 return -EBADF;
291 if ((entry < crashk_res.start) || (entry > crashk_res.end)) { 354
292 result = -EADDRNOTAVAIL; 355 ret = vfs_getattr(&f.file->f_path, &stat);
356 if (ret)
357 goto out;
358
359 if (stat.size > INT_MAX) {
360 ret = -EFBIG;
293 goto out; 361 goto out;
294 } 362 }
295 363
296 /* Allocate and initialize a controlling structure */ 364 /* Don't hand 0 to vmalloc, it whines. */
297 result = do_kimage_alloc(&image, entry, nr_segments, segments); 365 if (stat.size == 0) {
298 if (result) 366 ret = -EINVAL;
299 goto out; 367 goto out;
368 }
300 369
301 /* Enable the special crash kernel control page 370 *buf = vmalloc(stat.size);
302 * allocation policy. 371 if (!*buf) {
303 */ 372 ret = -ENOMEM;
304 image->control_page = crashk_res.start; 373 goto out;
305 image->type = KEXEC_TYPE_CRASH; 374 }
306 375
307 /* 376 pos = 0;
308 * Verify we have good destination addresses. Normally 377 while (pos < stat.size) {
309 * the caller is responsible for making certain we don't 378 bytes = kernel_read(f.file, pos, (char *)(*buf) + pos,
310 * attempt to load the new image into invalid or reserved 379 stat.size - pos);
311 * areas of RAM. But crash kernels are preloaded into a 380 if (bytes < 0) {
312 * reserved area of ram. We must ensure the addresses 381 vfree(*buf);
313 * are in the reserved area otherwise preloading the 382 ret = bytes;
314 * kernel could corrupt things. 383 goto out;
315 */ 384 }
316 result = -EADDRNOTAVAIL;
317 for (i = 0; i < nr_segments; i++) {
318 unsigned long mstart, mend;
319 385
320 mstart = image->segment[i].mem; 386 if (bytes == 0)
321 mend = mstart + image->segment[i].memsz - 1; 387 break;
322 /* Ensure we are within the crash kernel limits */ 388 pos += bytes;
323 if ((mstart < crashk_res.start) || (mend > crashk_res.end))
324 goto out_free;
325 } 389 }
326 390
391 if (pos != stat.size) {
392 ret = -EBADF;
393 vfree(*buf);
394 goto out;
395 }
396
397 *buf_len = pos;
398out:
399 fdput(f);
400 return ret;
401}
402
403/* Architectures can provide this probe function */
404int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
405 unsigned long buf_len)
406{
407 return -ENOEXEC;
408}
409
410void * __weak arch_kexec_kernel_image_load(struct kimage *image)
411{
412 return ERR_PTR(-ENOEXEC);
413}
414
415void __weak arch_kimage_file_post_load_cleanup(struct kimage *image)
416{
417}
418
419int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf,
420 unsigned long buf_len)
421{
422 return -EKEYREJECTED;
423}
424
425/* Apply relocations of type RELA */
426int __weak
427arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
428 unsigned int relsec)
429{
430 pr_err("RELA relocation unsupported.\n");
431 return -ENOEXEC;
432}
433
434/* Apply relocations of type REL */
435int __weak
436arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
437 unsigned int relsec)
438{
439 pr_err("REL relocation unsupported.\n");
440 return -ENOEXEC;
441}
442
443/*
444 * Free up memory used by kernel, initrd, and command line. This is temporary
445 * memory allocation which is not needed any more after these buffers have
446 * been loaded into separate segments and have been copied elsewhere.
447 */
448static void kimage_file_post_load_cleanup(struct kimage *image)
449{
450 struct purgatory_info *pi = &image->purgatory_info;
451
452 vfree(image->kernel_buf);
453 image->kernel_buf = NULL;
454
455 vfree(image->initrd_buf);
456 image->initrd_buf = NULL;
457
458 kfree(image->cmdline_buf);
459 image->cmdline_buf = NULL;
460
461 vfree(pi->purgatory_buf);
462 pi->purgatory_buf = NULL;
463
464 vfree(pi->sechdrs);
465 pi->sechdrs = NULL;
466
467 /* See if architecture has anything to cleanup post load */
468 arch_kimage_file_post_load_cleanup(image);
469
327 /* 470 /*
328 * Find a location for the control code buffer, and add 471 * Above call should have called into bootloader to free up
329 * the vector of segments so that it's pages will also be 472 * any data stored in kimage->image_loader_data. It should
330 * counted as destination pages. 473 * be ok now to free it up.
331 */ 474 */
332 result = -ENOMEM; 475 kfree(image->image_loader_data);
476 image->image_loader_data = NULL;
477}
478
479/*
480 * In file mode list of segments is prepared by kernel. Copy relevant
481 * data from user space, do error checking, prepare segment list
482 */
483static int
484kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
485 const char __user *cmdline_ptr,
486 unsigned long cmdline_len, unsigned flags)
487{
488 int ret = 0;
489 void *ldata;
490
491 ret = copy_file_from_fd(kernel_fd, &image->kernel_buf,
492 &image->kernel_buf_len);
493 if (ret)
494 return ret;
495
496 /* Call arch image probe handlers */
497 ret = arch_kexec_kernel_image_probe(image, image->kernel_buf,
498 image->kernel_buf_len);
499
500 if (ret)
501 goto out;
502
503#ifdef CONFIG_KEXEC_VERIFY_SIG
504 ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf,
505 image->kernel_buf_len);
506 if (ret) {
507 pr_debug("kernel signature verification failed.\n");
508 goto out;
509 }
510 pr_debug("kernel signature verification successful.\n");
511#endif
512	/* It is possible that no initramfs is being loaded */
513 if (!(flags & KEXEC_FILE_NO_INITRAMFS)) {
514 ret = copy_file_from_fd(initrd_fd, &image->initrd_buf,
515 &image->initrd_buf_len);
516 if (ret)
517 goto out;
518 }
519
520 if (cmdline_len) {
521 image->cmdline_buf = kzalloc(cmdline_len, GFP_KERNEL);
522 if (!image->cmdline_buf) {
523 ret = -ENOMEM;
524 goto out;
525 }
526
527 ret = copy_from_user(image->cmdline_buf, cmdline_ptr,
528 cmdline_len);
529 if (ret) {
530 ret = -EFAULT;
531 goto out;
532 }
533
534 image->cmdline_buf_len = cmdline_len;
535
536 /* command line should be a string with last byte null */
537 if (image->cmdline_buf[cmdline_len - 1] != '\0') {
538 ret = -EINVAL;
539 goto out;
540 }
541 }
542
543 /* Call arch image load handlers */
544 ldata = arch_kexec_kernel_image_load(image);
545
546 if (IS_ERR(ldata)) {
547 ret = PTR_ERR(ldata);
548 goto out;
549 }
550
551 image->image_loader_data = ldata;
552out:
553 /* In case of error, free up all allocated memory in this function */
554 if (ret)
555 kimage_file_post_load_cleanup(image);
556 return ret;
557}
558
559static int
560kimage_file_alloc_init(struct kimage **rimage, int kernel_fd,
561 int initrd_fd, const char __user *cmdline_ptr,
562 unsigned long cmdline_len, unsigned long flags)
563{
564 int ret;
565 struct kimage *image;
566 bool kexec_on_panic = flags & KEXEC_FILE_ON_CRASH;
567
568 image = do_kimage_alloc_init();
569 if (!image)
570 return -ENOMEM;
571
572 image->file_mode = 1;
573
574 if (kexec_on_panic) {
575 /* Enable special crash kernel control page alloc policy. */
576 image->control_page = crashk_res.start;
577 image->type = KEXEC_TYPE_CRASH;
578 }
579
580 ret = kimage_file_prepare_segments(image, kernel_fd, initrd_fd,
581 cmdline_ptr, cmdline_len, flags);
582 if (ret)
583 goto out_free_image;
584
585 ret = sanity_check_segment_list(image);
586 if (ret)
587 goto out_free_post_load_bufs;
588
589 ret = -ENOMEM;
333 image->control_code_page = kimage_alloc_control_pages(image, 590 image->control_code_page = kimage_alloc_control_pages(image,
334 get_order(KEXEC_CONTROL_PAGE_SIZE)); 591 get_order(KEXEC_CONTROL_PAGE_SIZE));
335 if (!image->control_code_page) { 592 if (!image->control_code_page) {
336 pr_err("Could not allocate control_code_buffer\n"); 593 pr_err("Could not allocate control_code_buffer\n");
337 goto out_free; 594 goto out_free_post_load_bufs;
595 }
596
597 if (!kexec_on_panic) {
598 image->swap_page = kimage_alloc_control_pages(image, 0);
599 if (!image->swap_page) {
600			pr_err("Could not allocate swap buffer\n");
601 goto out_free_control_pages;
602 }
338 } 603 }
339 604
340 *rimage = image; 605 *rimage = image;
341 return 0; 606 return 0;
342 607out_free_control_pages:
343out_free: 608 kimage_free_page_list(&image->control_pages);
609out_free_post_load_bufs:
610 kimage_file_post_load_cleanup(image);
611out_free_image:
344 kfree(image); 612 kfree(image);
345out: 613 return ret;
346 return result;
347} 614}
348 615
349static int kimage_is_destination_range(struct kimage *image, 616static int kimage_is_destination_range(struct kimage *image,
@@ -609,7 +876,7 @@ static void kimage_free_extra_pages(struct kimage *image)
609 kimage_free_page_list(&image->dest_pages); 876 kimage_free_page_list(&image->dest_pages);
610 877
611 /* Walk through and free any unusable pages I have cached */ 878 /* Walk through and free any unusable pages I have cached */
612 kimage_free_page_list(&image->unuseable_pages); 879 kimage_free_page_list(&image->unusable_pages);
613 880
614} 881}
615static void kimage_terminate(struct kimage *image) 882static void kimage_terminate(struct kimage *image)
@@ -663,6 +930,14 @@ static void kimage_free(struct kimage *image)
663 930
664 /* Free the kexec control pages... */ 931 /* Free the kexec control pages... */
665 kimage_free_page_list(&image->control_pages); 932 kimage_free_page_list(&image->control_pages);
933
934 /*
935 * Free up any temporary buffers allocated. This might hit if
936 * error occurred much later after buffer allocation.
937 */
938 if (image->file_mode)
939 kimage_file_post_load_cleanup(image);
940
666 kfree(image); 941 kfree(image);
667} 942}
668 943
@@ -732,7 +1007,7 @@ static struct page *kimage_alloc_page(struct kimage *image,
732 /* If the page cannot be used file it away */ 1007 /* If the page cannot be used file it away */
733 if (page_to_pfn(page) > 1008 if (page_to_pfn(page) >
734 (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) { 1009 (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
735 list_add(&page->lru, &image->unuseable_pages); 1010 list_add(&page->lru, &image->unusable_pages);
736 continue; 1011 continue;
737 } 1012 }
738 addr = page_to_pfn(page) << PAGE_SHIFT; 1013 addr = page_to_pfn(page) << PAGE_SHIFT;
@@ -791,10 +1066,14 @@ static int kimage_load_normal_segment(struct kimage *image,
791 unsigned long maddr; 1066 unsigned long maddr;
792 size_t ubytes, mbytes; 1067 size_t ubytes, mbytes;
793 int result; 1068 int result;
794 unsigned char __user *buf; 1069 unsigned char __user *buf = NULL;
1070 unsigned char *kbuf = NULL;
795 1071
796 result = 0; 1072 result = 0;
797 buf = segment->buf; 1073 if (image->file_mode)
1074 kbuf = segment->kbuf;
1075 else
1076 buf = segment->buf;
798 ubytes = segment->bufsz; 1077 ubytes = segment->bufsz;
799 mbytes = segment->memsz; 1078 mbytes = segment->memsz;
800 maddr = segment->mem; 1079 maddr = segment->mem;
@@ -826,7 +1105,11 @@ static int kimage_load_normal_segment(struct kimage *image,
826 PAGE_SIZE - (maddr & ~PAGE_MASK)); 1105 PAGE_SIZE - (maddr & ~PAGE_MASK));
827 uchunk = min(ubytes, mchunk); 1106 uchunk = min(ubytes, mchunk);
828 1107
829 result = copy_from_user(ptr, buf, uchunk); 1108 /* For file based kexec, source pages are in kernel memory */
1109 if (image->file_mode)
1110 memcpy(ptr, kbuf, uchunk);
1111 else
1112 result = copy_from_user(ptr, buf, uchunk);
830 kunmap(page); 1113 kunmap(page);
831 if (result) { 1114 if (result) {
832 result = -EFAULT; 1115 result = -EFAULT;
@@ -834,7 +1117,10 @@ static int kimage_load_normal_segment(struct kimage *image,
834 } 1117 }
835 ubytes -= uchunk; 1118 ubytes -= uchunk;
836 maddr += mchunk; 1119 maddr += mchunk;
837 buf += mchunk; 1120 if (image->file_mode)
1121 kbuf += mchunk;
1122 else
1123 buf += mchunk;
838 mbytes -= mchunk; 1124 mbytes -= mchunk;
839 } 1125 }
840out: 1126out:
@@ -851,10 +1137,14 @@ static int kimage_load_crash_segment(struct kimage *image,
851 unsigned long maddr; 1137 unsigned long maddr;
852 size_t ubytes, mbytes; 1138 size_t ubytes, mbytes;
853 int result; 1139 int result;
854 unsigned char __user *buf; 1140 unsigned char __user *buf = NULL;
1141 unsigned char *kbuf = NULL;
855 1142
856 result = 0; 1143 result = 0;
857 buf = segment->buf; 1144 if (image->file_mode)
1145 kbuf = segment->kbuf;
1146 else
1147 buf = segment->buf;
858 ubytes = segment->bufsz; 1148 ubytes = segment->bufsz;
859 mbytes = segment->memsz; 1149 mbytes = segment->memsz;
860 maddr = segment->mem; 1150 maddr = segment->mem;
@@ -877,7 +1167,12 @@ static int kimage_load_crash_segment(struct kimage *image,
877 /* Zero the trailing part of the page */ 1167 /* Zero the trailing part of the page */
878 memset(ptr + uchunk, 0, mchunk - uchunk); 1168 memset(ptr + uchunk, 0, mchunk - uchunk);
879 } 1169 }
880 result = copy_from_user(ptr, buf, uchunk); 1170
1171 /* For file based kexec, source pages are in kernel memory */
1172 if (image->file_mode)
1173 memcpy(ptr, kbuf, uchunk);
1174 else
1175 result = copy_from_user(ptr, buf, uchunk);
881 kexec_flush_icache_page(page); 1176 kexec_flush_icache_page(page);
882 kunmap(page); 1177 kunmap(page);
883 if (result) { 1178 if (result) {
@@ -886,7 +1181,10 @@ static int kimage_load_crash_segment(struct kimage *image,
886 } 1181 }
887 ubytes -= uchunk; 1182 ubytes -= uchunk;
888 maddr += mchunk; 1183 maddr += mchunk;
889 buf += mchunk; 1184 if (image->file_mode)
1185 kbuf += mchunk;
1186 else
1187 buf += mchunk;
890 mbytes -= mchunk; 1188 mbytes -= mchunk;
891 } 1189 }
892out: 1190out:
@@ -986,16 +1284,16 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
986 1284
987 /* Loading another kernel to reboot into */ 1285 /* Loading another kernel to reboot into */
988 if ((flags & KEXEC_ON_CRASH) == 0) 1286 if ((flags & KEXEC_ON_CRASH) == 0)
989 result = kimage_normal_alloc(&image, entry, 1287 result = kimage_alloc_init(&image, entry, nr_segments,
990 nr_segments, segments); 1288 segments, flags);
991 /* Loading another kernel to switch to if this one crashes */ 1289 /* Loading another kernel to switch to if this one crashes */
992 else if (flags & KEXEC_ON_CRASH) { 1290 else if (flags & KEXEC_ON_CRASH) {
993 /* Free any current crash dump kernel before 1291 /* Free any current crash dump kernel before
994 * we corrupt it. 1292 * we corrupt it.
995 */ 1293 */
996 kimage_free(xchg(&kexec_crash_image, NULL)); 1294 kimage_free(xchg(&kexec_crash_image, NULL));
997 result = kimage_crash_alloc(&image, entry, 1295 result = kimage_alloc_init(&image, entry, nr_segments,
998 nr_segments, segments); 1296 segments, flags);
999 crash_map_reserved_pages(); 1297 crash_map_reserved_pages();
1000 } 1298 }
1001 if (result) 1299 if (result)
@@ -1077,6 +1375,82 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry,
1077} 1375}
1078#endif 1376#endif
1079 1377
1378SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
1379 unsigned long, cmdline_len, const char __user *, cmdline_ptr,
1380 unsigned long, flags)
1381{
1382 int ret = 0, i;
1383 struct kimage **dest_image, *image;
1384
1385 /* We only trust the superuser with rebooting the system. */
1386 if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
1387 return -EPERM;
1388
1389 /* Make sure we have a legal set of flags */
1390 if (flags != (flags & KEXEC_FILE_FLAGS))
1391 return -EINVAL;
1392
1393 image = NULL;
1394
1395 if (!mutex_trylock(&kexec_mutex))
1396 return -EBUSY;
1397
1398 dest_image = &kexec_image;
1399 if (flags & KEXEC_FILE_ON_CRASH)
1400 dest_image = &kexec_crash_image;
1401
1402 if (flags & KEXEC_FILE_UNLOAD)
1403 goto exchange;
1404
1405 /*
1406 * In case of crash, new kernel gets loaded in reserved region. It is
1407 * same memory where old crash kernel might be loaded. Free any
1408 * current crash dump kernel before we corrupt it.
1409 */
1410 if (flags & KEXEC_FILE_ON_CRASH)
1411 kimage_free(xchg(&kexec_crash_image, NULL));
1412
1413 ret = kimage_file_alloc_init(&image, kernel_fd, initrd_fd, cmdline_ptr,
1414 cmdline_len, flags);
1415 if (ret)
1416 goto out;
1417
1418 ret = machine_kexec_prepare(image);
1419 if (ret)
1420 goto out;
1421
1422 ret = kexec_calculate_store_digests(image);
1423 if (ret)
1424 goto out;
1425
1426 for (i = 0; i < image->nr_segments; i++) {
1427 struct kexec_segment *ksegment;
1428
1429 ksegment = &image->segment[i];
1430 pr_debug("Loading segment %d: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n",
1431 i, ksegment->buf, ksegment->bufsz, ksegment->mem,
1432 ksegment->memsz);
1433
1434 ret = kimage_load_segment(image, &image->segment[i]);
1435 if (ret)
1436 goto out;
1437 }
1438
1439 kimage_terminate(image);
1440
1441 /*
1442 * Free up any temporary buffers allocated which are not needed
1443 * after image has been loaded
1444 */
1445 kimage_file_post_load_cleanup(image);
1446exchange:
1447 image = xchg(dest_image, image);
1448out:
1449 mutex_unlock(&kexec_mutex);
1450 kimage_free(image);
1451 return ret;
1452}
1453
1080void crash_kexec(struct pt_regs *regs) 1454void crash_kexec(struct pt_regs *regs)
1081{ 1455{
1082 /* Take the kexec_mutex here to prevent sys_kexec_load 1456 /* Take the kexec_mutex here to prevent sys_kexec_load
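
kexec_file_load(2), added above, takes file descriptors instead of user-assembled segments, which is what allows the kernel itself to read, verify and hash the image. A sketch of a caller (there is no glibc wrapper, so syscall(2) is used directly; __NR_kexec_file_load and the file paths are assumptions for illustration, and CAP_SYS_BOOT is required):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

int main(void)
{
	const char cmdline[] = "console=ttyS0 root=/dev/sda1";
	int kernel_fd = open("/boot/vmlinuz", O_RDONLY);
	int initrd_fd = open("/boot/initrd.img", O_RDONLY);

	if (kernel_fd < 0 || initrd_fd < 0)
		return 1;

	/* args: kernel fd, initrd fd, cmdline length (incl. NUL), cmdline, flags */
	if (syscall(__NR_kexec_file_load, kernel_fd, initrd_fd,
		    sizeof(cmdline), cmdline, 0UL)) {
		perror("kexec_file_load");
		return 1;
	}
	return 0;
}
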
@@ -1632,6 +2006,683 @@ static int __init crash_save_vmcoreinfo_init(void)
1632 2006
1633subsys_initcall(crash_save_vmcoreinfo_init); 2007subsys_initcall(crash_save_vmcoreinfo_init);
1634 2008
2009static int __kexec_add_segment(struct kimage *image, char *buf,
2010 unsigned long bufsz, unsigned long mem,
2011 unsigned long memsz)
2012{
2013 struct kexec_segment *ksegment;
2014
2015 ksegment = &image->segment[image->nr_segments];
2016 ksegment->kbuf = buf;
2017 ksegment->bufsz = bufsz;
2018 ksegment->mem = mem;
2019 ksegment->memsz = memsz;
2020 image->nr_segments++;
2021
2022 return 0;
2023}
2024
2025static int locate_mem_hole_top_down(unsigned long start, unsigned long end,
2026 struct kexec_buf *kbuf)
2027{
2028 struct kimage *image = kbuf->image;
2029 unsigned long temp_start, temp_end;
2030
2031 temp_end = min(end, kbuf->buf_max);
2032 temp_start = temp_end - kbuf->memsz;
2033
2034 do {
2035 /* align down start */
2036 temp_start = temp_start & (~(kbuf->buf_align - 1));
2037
2038 if (temp_start < start || temp_start < kbuf->buf_min)
2039 return 0;
2040
2041 temp_end = temp_start + kbuf->memsz - 1;
2042
2043 /*
2044 * Make sure this does not conflict with any of existing
2045 * segments
2046 */
2047 if (kimage_is_destination_range(image, temp_start, temp_end)) {
2048 temp_start = temp_start - PAGE_SIZE;
2049 continue;
2050 }
2051
2052 /* We found a suitable memory range */
2053 break;
2054 } while (1);
2055
2056 /* If we are here, we found a suitable memory range */
2057 __kexec_add_segment(image, kbuf->buffer, kbuf->bufsz, temp_start,
2058 kbuf->memsz);
2059
2060 /* Success, stop navigating through remaining System RAM ranges */
2061 return 1;
2062}
2063
2064static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end,
2065 struct kexec_buf *kbuf)
2066{
2067 struct kimage *image = kbuf->image;
2068 unsigned long temp_start, temp_end;
2069
2070 temp_start = max(start, kbuf->buf_min);
2071
2072 do {
2073 temp_start = ALIGN(temp_start, kbuf->buf_align);
2074 temp_end = temp_start + kbuf->memsz - 1;
2075
2076 if (temp_end > end || temp_end > kbuf->buf_max)
2077 return 0;
2078 /*
2079 * Make sure this does not conflict with any of existing
2080 * segments
2081 */
2082 if (kimage_is_destination_range(image, temp_start, temp_end)) {
2083 temp_start = temp_start + PAGE_SIZE;
2084 continue;
2085 }
2086
2087 /* We found a suitable memory range */
2088 break;
2089 } while (1);
2090
2091 /* If we are here, we found a suitable memory range */
2092 __kexec_add_segment(image, kbuf->buffer, kbuf->bufsz, temp_start,
2093 kbuf->memsz);
2094
2095 /* Success, stop navigating through remaining System RAM ranges */
2096 return 1;
2097}
2098
2099static int locate_mem_hole_callback(u64 start, u64 end, void *arg)
2100{
2101 struct kexec_buf *kbuf = (struct kexec_buf *)arg;
2102 unsigned long sz = end - start + 1;
2103
2104	/* Returning 0 moves on to the next memory range */
2105 if (sz < kbuf->memsz)
2106 return 0;
2107
2108 if (end < kbuf->buf_min || start > kbuf->buf_max)
2109 return 0;
2110
2111 /*
2112	 * Allocate memory top down within the RAM range. Otherwise bottom up
2113 * allocation.
2114 */
2115 if (kbuf->top_down)
2116 return locate_mem_hole_top_down(start, end, kbuf);
2117 return locate_mem_hole_bottom_up(start, end, kbuf);
2118}
2119
2120/*
2121 * Helper function for placing a buffer in a kexec segment. This assumes
2122 * that kexec_mutex is held.
2123 */
2124int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz,
2125 unsigned long memsz, unsigned long buf_align,
2126 unsigned long buf_min, unsigned long buf_max,
2127 bool top_down, unsigned long *load_addr)
2128{
2129
2130 struct kexec_segment *ksegment;
2131 struct kexec_buf buf, *kbuf;
2132 int ret;
2133
2134 /* Currently adding segment this way is allowed only in file mode */
2135 if (!image->file_mode)
2136 return -EINVAL;
2137
2138 if (image->nr_segments >= KEXEC_SEGMENT_MAX)
2139 return -EINVAL;
2140
2141 /*
2142 * Make sure we are not trying to add buffer after allocating
2143 * control pages. All segments need to be placed first before
2144 * any control pages are allocated. As control page allocation
2145 * logic goes through list of segments to make sure there are
2146 * no destination overlaps.
2147 */
2148 if (!list_empty(&image->control_pages)) {
2149 WARN_ON(1);
2150 return -EINVAL;
2151 }
2152
2153 memset(&buf, 0, sizeof(struct kexec_buf));
2154 kbuf = &buf;
2155 kbuf->image = image;
2156 kbuf->buffer = buffer;
2157 kbuf->bufsz = bufsz;
2158
2159 kbuf->memsz = ALIGN(memsz, PAGE_SIZE);
2160 kbuf->buf_align = max(buf_align, PAGE_SIZE);
2161 kbuf->buf_min = buf_min;
2162 kbuf->buf_max = buf_max;
2163 kbuf->top_down = top_down;
2164
2165 /* Walk the RAM ranges and allocate a suitable range for the buffer */
2166 if (image->type == KEXEC_TYPE_CRASH)
2167 ret = walk_iomem_res("Crash kernel",
2168 IORESOURCE_MEM | IORESOURCE_BUSY,
2169 crashk_res.start, crashk_res.end, kbuf,
2170 locate_mem_hole_callback);
2171 else
2172 ret = walk_system_ram_res(0, -1, kbuf,
2173 locate_mem_hole_callback);
2174 if (ret != 1) {
2175 /* A suitable memory range could not be found for buffer */
2176 return -EADDRNOTAVAIL;
2177 }
2178
2179 /* Found a suitable memory range */
2180 ksegment = &image->segment[image->nr_segments - 1];
2181 *load_addr = ksegment->mem;
2182 return 0;
2183}
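
kexec_add_buffer() above is the hook an architecture's image loader uses to place a kernel-memory blob into the segment list; the hole search, alignment and overlap checks are handled by the callbacks just defined. A sketch of a caller, with the blob assumed to come from an arch_kexec_kernel_image_load() implementation (placement limits are illustrative):

static int example_place_blob(struct kimage *image, char *blob,
			      unsigned long blob_len)
{
	unsigned long load_addr;
	int ret;

	ret = kexec_add_buffer(image, blob, blob_len,
			       blob_len,	/* memsz: no trailing zero fill */
			       PAGE_SIZE,	/* buf_align */
			       0,		/* buf_min */
			       ULONG_MAX,	/* buf_max: no upper limit */
			       true,		/* search memory top down */
			       &load_addr);
	if (ret)
		return ret;

	pr_debug("blob placed at 0x%lx\n", load_addr);
	return 0;
}
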
2184
2185/* Calculate and store the digest of segments */
2186static int kexec_calculate_store_digests(struct kimage *image)
2187{
2188 struct crypto_shash *tfm;
2189 struct shash_desc *desc;
2190 int ret = 0, i, j, zero_buf_sz, sha_region_sz;
2191 size_t desc_size, nullsz;
2192 char *digest;
2193 void *zero_buf;
2194 struct kexec_sha_region *sha_regions;
2195 struct purgatory_info *pi = &image->purgatory_info;
2196
2197 zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT);
2198 zero_buf_sz = PAGE_SIZE;
2199
2200 tfm = crypto_alloc_shash("sha256", 0, 0);
2201 if (IS_ERR(tfm)) {
2202 ret = PTR_ERR(tfm);
2203 goto out;
2204 }
2205
2206 desc_size = crypto_shash_descsize(tfm) + sizeof(*desc);
2207 desc = kzalloc(desc_size, GFP_KERNEL);
2208 if (!desc) {
2209 ret = -ENOMEM;
2210 goto out_free_tfm;
2211 }
2212
2213 sha_region_sz = KEXEC_SEGMENT_MAX * sizeof(struct kexec_sha_region);
2214 sha_regions = vzalloc(sha_region_sz);
2215 if (!sha_regions)
2216 goto out_free_desc;
2217
2218 desc->tfm = tfm;
2219 desc->flags = 0;
2220
2221 ret = crypto_shash_init(desc);
2222 if (ret < 0)
2223 goto out_free_sha_regions;
2224
2225 digest = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL);
2226 if (!digest) {
2227 ret = -ENOMEM;
2228 goto out_free_sha_regions;
2229 }
2230
2231 for (j = i = 0; i < image->nr_segments; i++) {
2232 struct kexec_segment *ksegment;
2233
2234 ksegment = &image->segment[i];
2235 /*
2236 * Skip purgatory as it will be modified once we put digest
2237 * info in purgatory.
2238 */
2239 if (ksegment->kbuf == pi->purgatory_buf)
2240 continue;
2241
2242 ret = crypto_shash_update(desc, ksegment->kbuf,
2243 ksegment->bufsz);
2244 if (ret)
2245 break;
2246
2247 /*
2248 * Assume rest of the buffer is filled with zero and
2249 * update digest accordingly.
2250 */
2251 nullsz = ksegment->memsz - ksegment->bufsz;
2252 while (nullsz) {
2253 unsigned long bytes = nullsz;
2254
2255 if (bytes > zero_buf_sz)
2256 bytes = zero_buf_sz;
2257 ret = crypto_shash_update(desc, zero_buf, bytes);
2258 if (ret)
2259 break;
2260 nullsz -= bytes;
2261 }
2262
2263 if (ret)
2264 break;
2265
2266 sha_regions[j].start = ksegment->mem;
2267 sha_regions[j].len = ksegment->memsz;
2268 j++;
2269 }
2270
2271 if (!ret) {
2272 ret = crypto_shash_final(desc, digest);
2273 if (ret)
2274 goto out_free_digest;
2275 ret = kexec_purgatory_get_set_symbol(image, "sha_regions",
2276 sha_regions, sha_region_sz, 0);
2277 if (ret)
2278 goto out_free_digest;
2279
2280 ret = kexec_purgatory_get_set_symbol(image, "sha256_digest",
2281 digest, SHA256_DIGEST_SIZE, 0);
2282 if (ret)
2283 goto out_free_digest;
2284 }
2285
2286out_free_digest:
2287 kfree(digest);
2288out_free_sha_regions:
2289 vfree(sha_regions);
2290out_free_desc:
2291 kfree(desc);
2292out_free_tfm:
2293 kfree(tfm);
2294out:
2295 return ret;
2296}
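
The digest code follows the usual synchronous-hash ("shash") pattern of the kernel crypto API: allocate a transform, allocate a descriptor sized by crypto_shash_descsize(), then init/update/final. Note that it also hashes ZERO_PAGE data for the memsz - bufsz tail of every segment, so the recorded digest covers the segment as it will exist in memory, not just the bytes copied from the buffer. A stripped-down sketch of the same shash pattern for a single buffer (assumes "sha256" is available and @out holds SHA256_DIGEST_SIZE bytes):

	#include <crypto/hash.h>
	#include <crypto/sha.h>
	#include <linux/err.h>
	#include <linux/slab.h>

	static int sha256_one_buffer(const void *data, unsigned int len,
				     u8 out[SHA256_DIGEST_SIZE])
	{
		struct crypto_shash *tfm;
		struct shash_desc *desc;
		int ret;

		tfm = crypto_alloc_shash("sha256", 0, 0);
		if (IS_ERR(tfm))
			return PTR_ERR(tfm);

		desc = kzalloc(sizeof(*desc) + crypto_shash_descsize(tfm),
			       GFP_KERNEL);
		if (!desc) {
			crypto_free_shash(tfm);
			return -ENOMEM;
		}
		desc->tfm = tfm;	/* kzalloc() already cleared the rest */

		ret = crypto_shash_init(desc);
		if (!ret)
			ret = crypto_shash_update(desc, data, len);
		if (!ret)
			ret = crypto_shash_final(desc, out);

		kfree(desc);
		crypto_free_shash(tfm);
		return ret;
	}
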
2297
2298/* Actually load purgatory. Lot of code taken from kexec-tools */
2299static int __kexec_load_purgatory(struct kimage *image, unsigned long min,
2300 unsigned long max, int top_down)
2301{
2302 struct purgatory_info *pi = &image->purgatory_info;
2303 unsigned long align, buf_align, bss_align, buf_sz, bss_sz, bss_pad;
2304 unsigned long memsz, entry, load_addr, curr_load_addr, bss_addr, offset;
2305 unsigned char *buf_addr, *src;
2306 int i, ret = 0, entry_sidx = -1;
2307 const Elf_Shdr *sechdrs_c;
2308 Elf_Shdr *sechdrs = NULL;
2309 void *purgatory_buf = NULL;
2310
2311 /*
 2312	 * sechdrs_c points to the section headers in purgatory, which are
 2313	 * read-only. No modifications allowed.
2314 */
2315 sechdrs_c = (void *)pi->ehdr + pi->ehdr->e_shoff;
2316
2317 /*
2318 * We can not modify sechdrs_c[] and its fields. It is read only.
2319 * Copy it over to a local copy where one can store some temporary
2320 * data and free it at the end. We need to modify ->sh_addr and
2321 * ->sh_offset fields to keep track of permanent and temporary
2322 * locations of sections.
2323 */
2324 sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr));
2325 if (!sechdrs)
2326 return -ENOMEM;
2327
2328 memcpy(sechdrs, sechdrs_c, pi->ehdr->e_shnum * sizeof(Elf_Shdr));
2329
2330 /*
 2331	 * We seem to have multiple copies of sections. The first copy is the one
 2332	 * embedded in the kernel in a read-only section. Some of these sections
2333 * will be copied to a temporary buffer and relocated. And these
2334 * sections will finally be copied to their final destination at
2335 * segment load time.
2336 *
2337 * Use ->sh_offset to reflect section address in memory. It will
2338 * point to original read only copy if section is not allocatable.
2339 * Otherwise it will point to temporary copy which will be relocated.
2340 *
2341 * Use ->sh_addr to contain final address of the section where it
2342 * will go during execution time.
2343 */
2344 for (i = 0; i < pi->ehdr->e_shnum; i++) {
2345 if (sechdrs[i].sh_type == SHT_NOBITS)
2346 continue;
2347
2348 sechdrs[i].sh_offset = (unsigned long)pi->ehdr +
2349 sechdrs[i].sh_offset;
2350 }
2351
2352 /*
2353 * Identify entry point section and make entry relative to section
2354 * start.
2355 */
2356 entry = pi->ehdr->e_entry;
2357 for (i = 0; i < pi->ehdr->e_shnum; i++) {
2358 if (!(sechdrs[i].sh_flags & SHF_ALLOC))
2359 continue;
2360
2361 if (!(sechdrs[i].sh_flags & SHF_EXECINSTR))
2362 continue;
2363
2364 /* Make entry section relative */
2365 if (sechdrs[i].sh_addr <= pi->ehdr->e_entry &&
2366 ((sechdrs[i].sh_addr + sechdrs[i].sh_size) >
2367 pi->ehdr->e_entry)) {
2368 entry_sidx = i;
2369 entry -= sechdrs[i].sh_addr;
2370 break;
2371 }
2372 }
2373
2374 /* Determine how much memory is needed to load relocatable object. */
2375 buf_align = 1;
2376 bss_align = 1;
2377 buf_sz = 0;
2378 bss_sz = 0;
2379
2380 for (i = 0; i < pi->ehdr->e_shnum; i++) {
2381 if (!(sechdrs[i].sh_flags & SHF_ALLOC))
2382 continue;
2383
2384 align = sechdrs[i].sh_addralign;
2385 if (sechdrs[i].sh_type != SHT_NOBITS) {
2386 if (buf_align < align)
2387 buf_align = align;
2388 buf_sz = ALIGN(buf_sz, align);
2389 buf_sz += sechdrs[i].sh_size;
2390 } else {
2391 /* bss section */
2392 if (bss_align < align)
2393 bss_align = align;
2394 bss_sz = ALIGN(bss_sz, align);
2395 bss_sz += sechdrs[i].sh_size;
2396 }
2397 }
2398
2399 /* Determine the bss padding required to align bss properly */
2400 bss_pad = 0;
2401 if (buf_sz & (bss_align - 1))
2402 bss_pad = bss_align - (buf_sz & (bss_align - 1));
2403
2404 memsz = buf_sz + bss_pad + bss_sz;
2405
2406 /* Allocate buffer for purgatory */
2407 purgatory_buf = vzalloc(buf_sz);
2408 if (!purgatory_buf) {
2409 ret = -ENOMEM;
2410 goto out;
2411 }
2412
2413 if (buf_align < bss_align)
2414 buf_align = bss_align;
2415
2416 /* Add buffer to segment list */
2417 ret = kexec_add_buffer(image, purgatory_buf, buf_sz, memsz,
2418 buf_align, min, max, top_down,
2419 &pi->purgatory_load_addr);
2420 if (ret)
2421 goto out;
2422
2423 /* Load SHF_ALLOC sections */
2424 buf_addr = purgatory_buf;
2425 load_addr = curr_load_addr = pi->purgatory_load_addr;
2426 bss_addr = load_addr + buf_sz + bss_pad;
2427
2428 for (i = 0; i < pi->ehdr->e_shnum; i++) {
2429 if (!(sechdrs[i].sh_flags & SHF_ALLOC))
2430 continue;
2431
2432 align = sechdrs[i].sh_addralign;
2433 if (sechdrs[i].sh_type != SHT_NOBITS) {
2434 curr_load_addr = ALIGN(curr_load_addr, align);
2435 offset = curr_load_addr - load_addr;
 2436			/* We already modified ->sh_offset to keep src addr */
2437 src = (char *) sechdrs[i].sh_offset;
2438 memcpy(buf_addr + offset, src, sechdrs[i].sh_size);
2439
2440 /* Store load address and source address of section */
2441 sechdrs[i].sh_addr = curr_load_addr;
2442
2443 /*
2444 * This section got copied to temporary buffer. Update
2445 * ->sh_offset accordingly.
2446 */
2447 sechdrs[i].sh_offset = (unsigned long)(buf_addr + offset);
2448
2449 /* Advance to the next address */
2450 curr_load_addr += sechdrs[i].sh_size;
2451 } else {
2452 bss_addr = ALIGN(bss_addr, align);
2453 sechdrs[i].sh_addr = bss_addr;
2454 bss_addr += sechdrs[i].sh_size;
2455 }
2456 }
2457
2458 /* Update entry point based on load address of text section */
2459 if (entry_sidx >= 0)
2460 entry += sechdrs[entry_sidx].sh_addr;
2461
2462 /* Make kernel jump to purgatory after shutdown */
2463 image->start = entry;
2464
2465 /* Used later to get/set symbol values */
2466 pi->sechdrs = sechdrs;
2467
2468 /*
2469 * Used later to identify which section is purgatory and skip it
2470 * from checksumming.
2471 */
2472 pi->purgatory_buf = purgatory_buf;
2473 return ret;
2474out:
2475 vfree(sechdrs);
2476 vfree(purgatory_buf);
2477 return ret;
2478}
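
The loaded purgatory image ends up laid out as progbits sections, then bss_pad, then bss; bss_pad exists only to round the end of the copied data up to bss_align before kexec_add_buffer() rounds the whole allocation to page size. With made-up numbers the padding arithmetic looks like this:

	/* Hypothetical sizes, purely to illustrate the padding computed above. */
	unsigned long buf_sz    = 0x1230;	/* bytes of SHF_ALLOC progbits */
	unsigned long bss_align = 0x100;
	unsigned long bss_sz    = 0x480;
	unsigned long bss_pad   = 0;

	if (buf_sz & (bss_align - 1))		/* 0x1230 & 0xff = 0x30        */
		bss_pad = bss_align - (buf_sz & (bss_align - 1));	/* 0xd0 */

	/* memsz = 0x1230 + 0xd0 + 0x480 = 0x1780; bss begins at load_addr + 0x1300 */

Note also the double bookkeeping that the later helpers rely on: after this function, sechdrs[i].sh_offset points into the temporary purgatory_buf copy while sh_addr holds the section's final run-time address.
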
2479
2480static int kexec_apply_relocations(struct kimage *image)
2481{
2482 int i, ret;
2483 struct purgatory_info *pi = &image->purgatory_info;
2484 Elf_Shdr *sechdrs = pi->sechdrs;
2485
2486 /* Apply relocations */
2487 for (i = 0; i < pi->ehdr->e_shnum; i++) {
2488 Elf_Shdr *section, *symtab;
2489
2490 if (sechdrs[i].sh_type != SHT_RELA &&
2491 sechdrs[i].sh_type != SHT_REL)
2492 continue;
2493
2494 /*
2495 * For section of type SHT_RELA/SHT_REL,
2496 * ->sh_link contains section header index of associated
2497 * symbol table. And ->sh_info contains section header
2498 * index of section to which relocations apply.
2499 */
2500 if (sechdrs[i].sh_info >= pi->ehdr->e_shnum ||
2501 sechdrs[i].sh_link >= pi->ehdr->e_shnum)
2502 return -ENOEXEC;
2503
2504 section = &sechdrs[sechdrs[i].sh_info];
2505 symtab = &sechdrs[sechdrs[i].sh_link];
2506
2507 if (!(section->sh_flags & SHF_ALLOC))
2508 continue;
2509
2510 /*
 2511		 * symtab->sh_link contains the section header index of the associated
2512 * string table.
2513 */
2514 if (symtab->sh_link >= pi->ehdr->e_shnum)
2515 /* Invalid section number? */
2516 continue;
2517
2518 /*
 2519		 * The respective architecture needs to provide support for applying
2520 * relocations of type SHT_RELA/SHT_REL.
2521 */
2522 if (sechdrs[i].sh_type == SHT_RELA)
2523 ret = arch_kexec_apply_relocations_add(pi->ehdr,
2524 sechdrs, i);
2525 else if (sechdrs[i].sh_type == SHT_REL)
2526 ret = arch_kexec_apply_relocations(pi->ehdr,
2527 sechdrs, i);
2528 if (ret)
2529 return ret;
2530 }
2531
2532 return 0;
2533}
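
kexec_apply_relocations() only sanity-checks the section indices and then defers to the architecture, which receives the ELF header, the adjusted section header copies and the index of the relocation section. A rough, purely illustrative shape of such a hook for RELA entries is sketched below; the function name is invented, and a real implementation (x86_64, for instance) handles each relocation type explicitly rather than rejecting it:

	/* Illustrative sketch only -- not an in-tree implementation. */
	static int example_apply_relocations_add(const Elf_Ehdr *ehdr,
						 Elf_Shdr *sechdrs,
						 unsigned int relsec)
	{
		Elf64_Rela *rel = (void *)sechdrs[relsec].sh_offset;
		Elf_Shdr *target = &sechdrs[sechdrs[relsec].sh_info];
		Elf_Sym *symtab = (void *)sechdrs[sechdrs[relsec].sh_link].sh_offset;
		unsigned long i, nr = sechdrs[relsec].sh_size / sizeof(*rel);

		for (i = 0; i < nr; i++) {
			/* Patch location in the temporary copy (sh_offset)... */
			void *loc = (void *)(target->sh_offset + rel[i].r_offset);
			/* ...with the symbol's final run-time address (sh_addr). */
			Elf_Sym *sym = &symtab[ELF64_R_SYM(rel[i].r_info)];
			unsigned long val = sechdrs[sym->st_shndx].sh_addr +
					    sym->st_value + rel[i].r_addend;

			switch (ELF64_R_TYPE(rel[i].r_info)) {
			/* e.g. case R_X86_64_64: *(u64 *)loc = val; break;   */
			default:
				pr_err("unsupported relocation %lu (loc %p val %lx)\n",
				       (unsigned long)ELF64_R_TYPE(rel[i].r_info),
				       loc, val);
				return -ENOEXEC;
			}
		}
		return 0;
	}
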
2534
2535/* Load relocatable purgatory object and relocate it appropriately */
2536int kexec_load_purgatory(struct kimage *image, unsigned long min,
2537 unsigned long max, int top_down,
2538 unsigned long *load_addr)
2539{
2540 struct purgatory_info *pi = &image->purgatory_info;
2541 int ret;
2542
2543 if (kexec_purgatory_size <= 0)
2544 return -EINVAL;
2545
2546 if (kexec_purgatory_size < sizeof(Elf_Ehdr))
2547 return -ENOEXEC;
2548
2549 pi->ehdr = (Elf_Ehdr *)kexec_purgatory;
2550
2551 if (memcmp(pi->ehdr->e_ident, ELFMAG, SELFMAG) != 0
2552 || pi->ehdr->e_type != ET_REL
2553 || !elf_check_arch(pi->ehdr)
2554 || pi->ehdr->e_shentsize != sizeof(Elf_Shdr))
2555 return -ENOEXEC;
2556
2557 if (pi->ehdr->e_shoff >= kexec_purgatory_size
2558 || (pi->ehdr->e_shnum * sizeof(Elf_Shdr) >
2559 kexec_purgatory_size - pi->ehdr->e_shoff))
2560 return -ENOEXEC;
2561
2562 ret = __kexec_load_purgatory(image, min, max, top_down);
2563 if (ret)
2564 return ret;
2565
2566 ret = kexec_apply_relocations(image);
2567 if (ret)
2568 goto out;
2569
2570 *load_addr = pi->purgatory_load_addr;
2571 return 0;
2572out:
2573 vfree(pi->sechdrs);
2574 vfree(pi->purgatory_buf);
2575 return ret;
2576}
2577
2578static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi,
2579 const char *name)
2580{
2581 Elf_Sym *syms;
2582 Elf_Shdr *sechdrs;
2583 Elf_Ehdr *ehdr;
2584 int i, k;
2585 const char *strtab;
2586
2587 if (!pi->sechdrs || !pi->ehdr)
2588 return NULL;
2589
2590 sechdrs = pi->sechdrs;
2591 ehdr = pi->ehdr;
2592
2593 for (i = 0; i < ehdr->e_shnum; i++) {
2594 if (sechdrs[i].sh_type != SHT_SYMTAB)
2595 continue;
2596
2597 if (sechdrs[i].sh_link >= ehdr->e_shnum)
2598 /* Invalid strtab section number */
2599 continue;
2600 strtab = (char *)sechdrs[sechdrs[i].sh_link].sh_offset;
2601 syms = (Elf_Sym *)sechdrs[i].sh_offset;
2602
2603 /* Go through symbols for a match */
2604 for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) {
2605 if (ELF_ST_BIND(syms[k].st_info) != STB_GLOBAL)
2606 continue;
2607
2608 if (strcmp(strtab + syms[k].st_name, name) != 0)
2609 continue;
2610
2611 if (syms[k].st_shndx == SHN_UNDEF ||
2612 syms[k].st_shndx >= ehdr->e_shnum) {
2613 pr_debug("Symbol: %s has bad section index %d.\n",
2614 name, syms[k].st_shndx);
2615 return NULL;
2616 }
2617
2618 /* Found the symbol we are looking for */
2619 return &syms[k];
2620 }
2621 }
2622
2623 return NULL;
2624}
2625
2626void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name)
2627{
2628 struct purgatory_info *pi = &image->purgatory_info;
2629 Elf_Sym *sym;
2630 Elf_Shdr *sechdr;
2631
2632 sym = kexec_purgatory_find_symbol(pi, name);
2633 if (!sym)
2634 return ERR_PTR(-EINVAL);
2635
2636 sechdr = &pi->sechdrs[sym->st_shndx];
2637
2638 /*
2639 * Returns the address where symbol will finally be loaded after
2640 * kexec_load_segment()
2641 */
2642 return (void *)(sechdr->sh_addr + sym->st_value);
2643}
2644
2645/*
2646 * Get or set value of a symbol. If "get_value" is true, symbol value is
2647 * returned in buf otherwise symbol value is set based on value in buf.
2648 */
2649int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name,
2650 void *buf, unsigned int size, bool get_value)
2651{
2652 Elf_Sym *sym;
2653 Elf_Shdr *sechdrs;
2654 struct purgatory_info *pi = &image->purgatory_info;
2655 char *sym_buf;
2656
2657 sym = kexec_purgatory_find_symbol(pi, name);
2658 if (!sym)
2659 return -EINVAL;
2660
2661 if (sym->st_size != size) {
2662 pr_err("symbol %s size mismatch: expected %lu actual %u\n",
2663 name, (unsigned long)sym->st_size, size);
2664 return -EINVAL;
2665 }
2666
2667 sechdrs = pi->sechdrs;
2668
2669 if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) {
2670 pr_err("symbol %s is in a bss section. Cannot %s\n", name,
2671 get_value ? "get" : "set");
2672 return -EINVAL;
2673 }
2674
2675 sym_buf = (unsigned char *)sechdrs[sym->st_shndx].sh_offset +
2676 sym->st_value;
2677
2678 if (get_value)
2679 memcpy((void *)buf, sym_buf, size);
2680 else
2681 memcpy((void *)sym_buf, buf, size);
2682
2683 return 0;
2684}
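
Putting the helpers together, an architecture's kexec_file loader is expected to load purgatory into a given physical window first and then patch purgatory's global variables by name, roughly as in the fragment below. The symbol "entry_point", the variables and the address window are hypothetical; the digest code above uses the very same helper for "sha_regions" and "sha256_digest":

	/* Fragment of a hypothetical arch loader; image, min_addr and max_addr
	 * are assumed to come from the surrounding code. */
	unsigned long purgatory_load_addr;
	u64 next_entry = 0;			/* value to patch in */
	int ret;

	ret = kexec_load_purgatory(image, min_addr, max_addr, 1 /* top_down */,
				   &purgatory_load_addr);
	if (ret)
		return ret;

	/* get_value == false: copy @next_entry into purgatory's "entry_point". */
	ret = kexec_purgatory_get_set_symbol(image, "entry_point", &next_entry,
					     sizeof(next_entry), false);
	if (ret)
		return ret;

	/* get_value == true would copy the symbol's current value back instead. */
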
2685
1635/* 2686/*
1636 * Move into place and start executing a preloaded standalone 2687 * Move into place and start executing a preloaded standalone
1637 * executable. If nothing was preloaded return an error. 2688 * executable. If nothing was preloaded return an error.
diff --git a/kernel/module.c b/kernel/module.c
index ae79ce615cb9..03214bd288e9 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3304,6 +3304,11 @@ static int load_module(struct load_info *info, const char __user *uargs,
3304 mutex_lock(&module_mutex); 3304 mutex_lock(&module_mutex);
3305 module_bug_cleanup(mod); 3305 module_bug_cleanup(mod);
3306 mutex_unlock(&module_mutex); 3306 mutex_unlock(&module_mutex);
3307
3308 /* we can't deallocate the module until we clear memory protection */
3309 unset_module_init_ro_nx(mod);
3310 unset_module_core_ro_nx(mod);
3311
3307 ddebug_cleanup: 3312 ddebug_cleanup:
3308 dynamic_debug_remove(info->debug); 3313 dynamic_debug_remove(info->debug);
3309 synchronize_sched(); 3314 synchronize_sched();
@@ -3381,6 +3386,8 @@ static inline int within(unsigned long addr, void *start, unsigned long size)
3381 */ 3386 */
3382static inline int is_arm_mapping_symbol(const char *str) 3387static inline int is_arm_mapping_symbol(const char *str)
3383{ 3388{
3389 if (str[0] == '.' && str[1] == 'L')
3390 return true;
3384 return str[0] == '$' && strchr("atd", str[1]) 3391 return str[0] == '$' && strchr("atd", str[1])
3385 && (str[2] == '\0' || str[2] == '.'); 3392 && (str[2] == '\0' || str[2] == '.');
3386} 3393}
@@ -3444,8 +3451,7 @@ const char *module_address_lookup(unsigned long addr,
3444 list_for_each_entry_rcu(mod, &modules, list) { 3451 list_for_each_entry_rcu(mod, &modules, list) {
3445 if (mod->state == MODULE_STATE_UNFORMED) 3452 if (mod->state == MODULE_STATE_UNFORMED)
3446 continue; 3453 continue;
3447 if (within_module_init(addr, mod) || 3454 if (within_module(addr, mod)) {
3448 within_module_core(addr, mod)) {
3449 if (modname) 3455 if (modname)
3450 *modname = mod->name; 3456 *modname = mod->name;
3451 ret = get_ksymbol(mod, addr, size, offset); 3457 ret = get_ksymbol(mod, addr, size, offset);
@@ -3469,8 +3475,7 @@ int lookup_module_symbol_name(unsigned long addr, char *symname)
3469 list_for_each_entry_rcu(mod, &modules, list) { 3475 list_for_each_entry_rcu(mod, &modules, list) {
3470 if (mod->state == MODULE_STATE_UNFORMED) 3476 if (mod->state == MODULE_STATE_UNFORMED)
3471 continue; 3477 continue;
3472 if (within_module_init(addr, mod) || 3478 if (within_module(addr, mod)) {
3473 within_module_core(addr, mod)) {
3474 const char *sym; 3479 const char *sym;
3475 3480
3476 sym = get_ksymbol(mod, addr, NULL, NULL); 3481 sym = get_ksymbol(mod, addr, NULL, NULL);
@@ -3495,8 +3500,7 @@ int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size,
3495 list_for_each_entry_rcu(mod, &modules, list) { 3500 list_for_each_entry_rcu(mod, &modules, list) {
3496 if (mod->state == MODULE_STATE_UNFORMED) 3501 if (mod->state == MODULE_STATE_UNFORMED)
3497 continue; 3502 continue;
3498 if (within_module_init(addr, mod) || 3503 if (within_module(addr, mod)) {
3499 within_module_core(addr, mod)) {
3500 const char *sym; 3504 const char *sym;
3501 3505
3502 sym = get_ksymbol(mod, addr, size, offset); 3506 sym = get_ksymbol(mod, addr, size, offset);
@@ -3760,8 +3764,7 @@ struct module *__module_address(unsigned long addr)
3760 list_for_each_entry_rcu(mod, &modules, list) { 3764 list_for_each_entry_rcu(mod, &modules, list) {
3761 if (mod->state == MODULE_STATE_UNFORMED) 3765 if (mod->state == MODULE_STATE_UNFORMED)
3762 continue; 3766 continue;
3763 if (within_module_core(addr, mod) 3767 if (within_module(addr, mod))
3764 || within_module_init(addr, mod))
3765 return mod; 3768 return mod;
3766 } 3769 }
3767 return NULL; 3770 return NULL;
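
All four hunks replace the open-coded pair of range checks with a single within_module() test; the helper is presumably introduced alongside this change in include/linux/module.h as nothing more than:

	static inline bool within_module(unsigned long addr,
					 const struct module *mod)
	{
		return within_module_init(addr, mod) ||
		       within_module_core(addr, mod);
	}
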
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 8e7811086b82..ef42d0ab3115 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -204,20 +204,13 @@ void switch_task_namespaces(struct task_struct *p, struct nsproxy *new)
204 204
205 might_sleep(); 205 might_sleep();
206 206
207 task_lock(p);
207 ns = p->nsproxy; 208 ns = p->nsproxy;
209 p->nsproxy = new;
210 task_unlock(p);
208 211
209 rcu_assign_pointer(p->nsproxy, new); 212 if (ns && atomic_dec_and_test(&ns->count))
210
211 if (ns && atomic_dec_and_test(&ns->count)) {
212 /*
213 * wait for others to get what they want from this nsproxy.
214 *
215 * cannot release this nsproxy via the call_rcu() since
216 * put_mnt_ns() will want to sleep
217 */
218 synchronize_rcu();
219 free_nsproxy(ns); 213 free_nsproxy(ns);
220 }
221} 214}
222 215
223void exit_task_namespaces(struct task_struct *p) 216void exit_task_namespaces(struct task_struct *p)
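
After this hunk the function reads roughly as below: the nsproxy pointer is swapped under task_lock() instead of with rcu_assign_pointer(), and since readers are now expected to take the task lock rather than rely on RCU here, the synchronize_rcu() before free_nsproxy() can be dropped and the last reference is freed directly:

	void switch_task_namespaces(struct task_struct *p, struct nsproxy *new)
	{
		struct nsproxy *ns;

		might_sleep();

		task_lock(p);
		ns = p->nsproxy;
		p->nsproxy = new;
		task_unlock(p);

		if (ns && atomic_dec_and_test(&ns->count))
			free_nsproxy(ns);
	}
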
diff --git a/kernel/panic.c b/kernel/panic.c
index 62e16cef9cc2..d09dc5c32c67 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -224,6 +224,7 @@ static const struct tnt tnts[] = {
224 { TAINT_FIRMWARE_WORKAROUND, 'I', ' ' }, 224 { TAINT_FIRMWARE_WORKAROUND, 'I', ' ' },
225 { TAINT_OOT_MODULE, 'O', ' ' }, 225 { TAINT_OOT_MODULE, 'O', ' ' },
226 { TAINT_UNSIGNED_MODULE, 'E', ' ' }, 226 { TAINT_UNSIGNED_MODULE, 'E', ' ' },
227 { TAINT_SOFTLOCKUP, 'L', ' ' },
227}; 228};
228 229
229/** 230/**
diff --git a/kernel/params.c b/kernel/params.c
index 1e52ca233fd9..34f527023794 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -256,6 +256,7 @@ STANDARD_PARAM_DEF(int, int, "%i", kstrtoint);
256STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint); 256STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint);
257STANDARD_PARAM_DEF(long, long, "%li", kstrtol); 257STANDARD_PARAM_DEF(long, long, "%li", kstrtol);
258STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul); 258STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul);
259STANDARD_PARAM_DEF(ullong, unsigned long long, "%llu", kstrtoull);
259 260
260int param_set_charp(const char *val, const struct kernel_param *kp) 261int param_set_charp(const char *val, const struct kernel_param *kp)
261{ 262{
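
STANDARD_PARAM_DEF(ullong, ...) generates param_set_ullong(), param_get_ullong() and param_ops_ullong, so a 64-bit tunable can now be declared directly as a module parameter; a made-up example:

	/* Hypothetical module parameter using the new "ullong" type. */
	static unsigned long long my_threshold = 1ULL << 32;
	module_param(my_threshold, ullong, 0644);
	MODULE_PARM_DESC(my_threshold, "example 64-bit tunable, parsed with kstrtoull()");
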
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 9a83d780facd..e4e4121fa327 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -253,9 +253,6 @@ config APM_EMULATION
253 anything, try disabling/enabling this option (or disabling/enabling 253 anything, try disabling/enabling this option (or disabling/enabling
254 APM in your BIOS). 254 APM in your BIOS).
255 255
256config ARCH_HAS_OPP
257 bool
258
259config PM_OPP 256config PM_OPP
260 bool 257 bool
261 ---help--- 258 ---help---
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 8e90f330f139..9a59d042ea84 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -296,8 +296,8 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
296 suspend_state_t i; 296 suspend_state_t i;
297 297
298 for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) 298 for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++)
299 if (pm_states[i].state) 299 if (pm_states[i])
300 s += sprintf(s,"%s ", pm_states[i].label); 300 s += sprintf(s,"%s ", pm_states[i]);
301 301
302#endif 302#endif
303 if (hibernation_available()) 303 if (hibernation_available())
@@ -311,8 +311,7 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
311static suspend_state_t decode_state(const char *buf, size_t n) 311static suspend_state_t decode_state(const char *buf, size_t n)
312{ 312{
313#ifdef CONFIG_SUSPEND 313#ifdef CONFIG_SUSPEND
314 suspend_state_t state = PM_SUSPEND_MIN; 314 suspend_state_t state;
315 struct pm_sleep_state *s;
316#endif 315#endif
317 char *p; 316 char *p;
318 int len; 317 int len;
@@ -325,10 +324,12 @@ static suspend_state_t decode_state(const char *buf, size_t n)
325 return PM_SUSPEND_MAX; 324 return PM_SUSPEND_MAX;
326 325
327#ifdef CONFIG_SUSPEND 326#ifdef CONFIG_SUSPEND
328 for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) 327 for (state = PM_SUSPEND_MIN; state < PM_SUSPEND_MAX; state++) {
329 if (s->state && len == strlen(s->label) 328 const char *label = pm_states[state];
330 && !strncmp(buf, s->label, len)) 329
331 return s->state; 330 if (label && len == strlen(label) && !strncmp(buf, label, len))
331 return state;
332 }
332#endif 333#endif
333 334
334 return PM_SUSPEND_ON; 335 return PM_SUSPEND_ON;
@@ -446,8 +447,8 @@ static ssize_t autosleep_show(struct kobject *kobj,
446 447
447#ifdef CONFIG_SUSPEND 448#ifdef CONFIG_SUSPEND
448 if (state < PM_SUSPEND_MAX) 449 if (state < PM_SUSPEND_MAX)
449 return sprintf(buf, "%s\n", pm_states[state].state ? 450 return sprintf(buf, "%s\n", pm_states[state] ?
450 pm_states[state].label : "error"); 451 pm_states[state] : "error");
451#endif 452#endif
452#ifdef CONFIG_HIBERNATION 453#ifdef CONFIG_HIBERNATION
453 return sprintf(buf, "disk\n"); 454 return sprintf(buf, "disk\n");
@@ -615,7 +616,6 @@ static struct attribute_group attr_group = {
615 .attrs = g, 616 .attrs = g,
616}; 617};
617 618
618#ifdef CONFIG_PM_RUNTIME
619struct workqueue_struct *pm_wq; 619struct workqueue_struct *pm_wq;
620EXPORT_SYMBOL_GPL(pm_wq); 620EXPORT_SYMBOL_GPL(pm_wq);
621 621
@@ -625,9 +625,6 @@ static int __init pm_start_workqueue(void)
625 625
626 return pm_wq ? 0 : -ENOMEM; 626 return pm_wq ? 0 : -ENOMEM;
627} 627}
628#else
629static inline int pm_start_workqueue(void) { return 0; }
630#endif
631 628
632static int __init pm_init(void) 629static int __init pm_init(void)
633{ 630{
diff --git a/kernel/power/power.h b/kernel/power/power.h
index c60f13b5270a..5d49dcac2537 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -178,13 +178,8 @@ extern void swsusp_show_speed(struct timeval *, struct timeval *,
178 unsigned int, char *); 178 unsigned int, char *);
179 179
180#ifdef CONFIG_SUSPEND 180#ifdef CONFIG_SUSPEND
181struct pm_sleep_state {
182 const char *label;
183 suspend_state_t state;
184};
185
186/* kernel/power/suspend.c */ 181/* kernel/power/suspend.c */
187extern struct pm_sleep_state pm_states[]; 182extern const char *pm_states[];
188 183
189extern int suspend_devices_and_enter(suspend_state_t state); 184extern int suspend_devices_and_enter(suspend_state_t state);
190#else /* !CONFIG_SUSPEND */ 185#else /* !CONFIG_SUSPEND */
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 1ea328aafdc9..c4b8093c80b3 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -248,33 +248,61 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
248 * information is stored (in the form of a block of bitmap) 248 * information is stored (in the form of a block of bitmap)
249 * It also contains the pfns that correspond to the start and end of 249 * It also contains the pfns that correspond to the start and end of
250 * the represented memory area. 250 * the represented memory area.
251 *
252 * The memory bitmap is organized as a radix tree to guarantee fast random
253 * access to the bits. There is one radix tree for each zone (as returned
254 * from create_mem_extents).
255 *
256 * One radix tree is represented by one struct mem_zone_bm_rtree. There are
257 * two linked lists for the nodes of the tree, one for the inner nodes and
 258 * one for the leaf nodes. The linked leaf nodes are used for fast linear
259 * access of the memory bitmap.
260 *
261 * The struct rtree_node represents one node of the radix tree.
251 */ 262 */
252 263
253#define BM_END_OF_MAP (~0UL) 264#define BM_END_OF_MAP (~0UL)
254 265
255#define BM_BITS_PER_BLOCK (PAGE_SIZE * BITS_PER_BYTE) 266#define BM_BITS_PER_BLOCK (PAGE_SIZE * BITS_PER_BYTE)
267#define BM_BLOCK_SHIFT (PAGE_SHIFT + 3)
268#define BM_BLOCK_MASK ((1UL << BM_BLOCK_SHIFT) - 1)
256 269
257struct bm_block { 270/*
258 struct list_head hook; /* hook into a list of bitmap blocks */ 271 * struct rtree_node is a wrapper struct to link the nodes
259 unsigned long start_pfn; /* pfn represented by the first bit */ 272 * of the rtree together for easy linear iteration over
260 unsigned long end_pfn; /* pfn represented by the last bit plus 1 */ 273 * bits and easy freeing
261 unsigned long *data; /* bitmap representing pages */ 274 */
275struct rtree_node {
276 struct list_head list;
277 unsigned long *data;
262}; 278};
263 279
264static inline unsigned long bm_block_bits(struct bm_block *bb) 280/*
265{ 281 * struct mem_zone_bm_rtree represents a bitmap used for one
266 return bb->end_pfn - bb->start_pfn; 282 * populated memory zone.
267} 283 */
284struct mem_zone_bm_rtree {
285 struct list_head list; /* Link Zones together */
286 struct list_head nodes; /* Radix Tree inner nodes */
287 struct list_head leaves; /* Radix Tree leaves */
288 unsigned long start_pfn; /* Zone start page frame */
289 unsigned long end_pfn; /* Zone end page frame + 1 */
290 struct rtree_node *rtree; /* Radix Tree Root */
291 int levels; /* Number of Radix Tree Levels */
292 unsigned int blocks; /* Number of Bitmap Blocks */
293};
268 294
269/* strcut bm_position is used for browsing memory bitmaps */ 295/* strcut bm_position is used for browsing memory bitmaps */
270 296
271struct bm_position { 297struct bm_position {
272 struct bm_block *block; 298 struct mem_zone_bm_rtree *zone;
273 int bit; 299 struct rtree_node *node;
300 unsigned long node_pfn;
301 int node_bit;
274}; 302};
275 303
276struct memory_bitmap { 304struct memory_bitmap {
277 struct list_head blocks; /* list of bitmap blocks */ 305 struct list_head zones;
278 struct linked_page *p_list; /* list of pages used to store zone 306 struct linked_page *p_list; /* list of pages used to store zone
279 * bitmap objects and bitmap block 307 * bitmap objects and bitmap block
280 * objects 308 * objects
@@ -284,38 +312,178 @@ struct memory_bitmap {
284 312
285/* Functions that operate on memory bitmaps */ 313/* Functions that operate on memory bitmaps */
286 314
287static void memory_bm_position_reset(struct memory_bitmap *bm) 315#define BM_ENTRIES_PER_LEVEL (PAGE_SIZE / sizeof(unsigned long))
316#if BITS_PER_LONG == 32
317#define BM_RTREE_LEVEL_SHIFT (PAGE_SHIFT - 2)
318#else
319#define BM_RTREE_LEVEL_SHIFT (PAGE_SHIFT - 3)
320#endif
321#define BM_RTREE_LEVEL_MASK ((1UL << BM_RTREE_LEVEL_SHIFT) - 1)
322
323/*
324 * alloc_rtree_node - Allocate a new node and add it to the radix tree.
325 *
326 * This function is used to allocate inner nodes as well as the
 327 * leaf nodes of the radix tree. It also adds the node to the
328 * corresponding linked list passed in by the *list parameter.
329 */
330static struct rtree_node *alloc_rtree_node(gfp_t gfp_mask, int safe_needed,
331 struct chain_allocator *ca,
332 struct list_head *list)
288{ 333{
289 bm->cur.block = list_entry(bm->blocks.next, struct bm_block, hook); 334 struct rtree_node *node;
290 bm->cur.bit = 0;
291}
292 335
293static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free); 336 node = chain_alloc(ca, sizeof(struct rtree_node));
337 if (!node)
338 return NULL;
294 339
295/** 340 node->data = get_image_page(gfp_mask, safe_needed);
296 * create_bm_block_list - create a list of block bitmap objects 341 if (!node->data)
297 * @pages - number of pages to track 342 return NULL;
298 * @list - list to put the allocated blocks into 343
299 * @ca - chain allocator to be used for allocating memory 344 list_add_tail(&node->list, list);
345
346 return node;
347}
348
349/*
 350 * add_rtree_block - Add a new leaf node to the radix tree
351 *
 352 * The leaf nodes need to be allocated in order to keep the leaves
353 * linked list in order. This is guaranteed by the zone->blocks
354 * counter.
300 */ 355 */
301static int create_bm_block_list(unsigned long pages, 356static int add_rtree_block(struct mem_zone_bm_rtree *zone, gfp_t gfp_mask,
302 struct list_head *list, 357 int safe_needed, struct chain_allocator *ca)
303 struct chain_allocator *ca)
304{ 358{
305 unsigned int nr_blocks = DIV_ROUND_UP(pages, BM_BITS_PER_BLOCK); 359 struct rtree_node *node, *block, **dst;
360 unsigned int levels_needed, block_nr;
361 int i;
306 362
307 while (nr_blocks-- > 0) { 363 block_nr = zone->blocks;
308 struct bm_block *bb; 364 levels_needed = 0;
309 365
310 bb = chain_alloc(ca, sizeof(struct bm_block)); 366 /* How many levels do we need for this block nr? */
311 if (!bb) 367 while (block_nr) {
368 levels_needed += 1;
369 block_nr >>= BM_RTREE_LEVEL_SHIFT;
370 }
371
372 /* Make sure the rtree has enough levels */
373 for (i = zone->levels; i < levels_needed; i++) {
374 node = alloc_rtree_node(gfp_mask, safe_needed, ca,
375 &zone->nodes);
376 if (!node)
312 return -ENOMEM; 377 return -ENOMEM;
313 list_add(&bb->hook, list); 378
379 node->data[0] = (unsigned long)zone->rtree;
380 zone->rtree = node;
381 zone->levels += 1;
314 } 382 }
315 383
384 /* Allocate new block */
385 block = alloc_rtree_node(gfp_mask, safe_needed, ca, &zone->leaves);
386 if (!block)
387 return -ENOMEM;
388
389 /* Now walk the rtree to insert the block */
390 node = zone->rtree;
391 dst = &zone->rtree;
392 block_nr = zone->blocks;
393 for (i = zone->levels; i > 0; i--) {
394 int index;
395
396 if (!node) {
397 node = alloc_rtree_node(gfp_mask, safe_needed, ca,
398 &zone->nodes);
399 if (!node)
400 return -ENOMEM;
401 *dst = node;
402 }
403
404 index = block_nr >> ((i - 1) * BM_RTREE_LEVEL_SHIFT);
405 index &= BM_RTREE_LEVEL_MASK;
406 dst = (struct rtree_node **)&((*dst)->data[index]);
407 node = *dst;
408 }
409
410 zone->blocks += 1;
411 *dst = block;
412
316 return 0; 413 return 0;
317} 414}
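
Each leaf page of the bitmap covers BM_BITS_PER_BLOCK page frames, and every inner level of the radix tree consumes BM_RTREE_LEVEL_SHIFT bits of the leaf block number (9 bits per level on 64-bit with 4 KiB pages, since one page holds 512 unsigned longs). A worked example of the index arithmetic used here and in memory_bm_find_bit() further down, under those assumptions:

	/*
	 * Assumptions: 64-bit, PAGE_SIZE = 4096, so
	 *   BM_BLOCK_SHIFT       = 12 + 3 = 15   (32768 pfns per leaf page)
	 *   BM_RTREE_LEVEL_SHIFT = 12 - 3 = 9    (512 slots per inner node)
	 *
	 * For a pfn that lies 0x123456 pages into its zone:
	 *   block_nr = 0x123456 >> 15            = 0x24   (leaf page #36)
	 *   bit      = 0x123456 & BM_BLOCK_MASK  = 0x3456 (bit inside that page)
	 *
	 * Because 36 < 512 a single tree level suffices, and the lookup walk
	 * ends up indexing node->data[0x24 & BM_RTREE_LEVEL_MASK].
	 */
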
318 415
416static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone,
417 int clear_nosave_free);
418
419/*
420 * create_zone_bm_rtree - create a radix tree for one zone
421 *
 422 * Allocates the mem_zone_bm_rtree structure and initializes it.
 423 * This function also allocates and builds the radix tree for the
424 * zone.
425 */
426static struct mem_zone_bm_rtree *
427create_zone_bm_rtree(gfp_t gfp_mask, int safe_needed,
428 struct chain_allocator *ca,
429 unsigned long start, unsigned long end)
430{
431 struct mem_zone_bm_rtree *zone;
432 unsigned int i, nr_blocks;
433 unsigned long pages;
434
435 pages = end - start;
436 zone = chain_alloc(ca, sizeof(struct mem_zone_bm_rtree));
437 if (!zone)
438 return NULL;
439
440 INIT_LIST_HEAD(&zone->nodes);
441 INIT_LIST_HEAD(&zone->leaves);
442 zone->start_pfn = start;
443 zone->end_pfn = end;
444 nr_blocks = DIV_ROUND_UP(pages, BM_BITS_PER_BLOCK);
445
446 for (i = 0; i < nr_blocks; i++) {
447 if (add_rtree_block(zone, gfp_mask, safe_needed, ca)) {
448 free_zone_bm_rtree(zone, PG_UNSAFE_CLEAR);
449 return NULL;
450 }
451 }
452
453 return zone;
454}
455
456/*
457 * free_zone_bm_rtree - Free the memory of the radix tree
458 *
459 * Free all node pages of the radix tree. The mem_zone_bm_rtree
460 * structure itself is not freed here nor are the rtree_node
461 * structs.
462 */
463static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone,
464 int clear_nosave_free)
465{
466 struct rtree_node *node;
467
468 list_for_each_entry(node, &zone->nodes, list)
469 free_image_page(node->data, clear_nosave_free);
470
471 list_for_each_entry(node, &zone->leaves, list)
472 free_image_page(node->data, clear_nosave_free);
473}
474
475static void memory_bm_position_reset(struct memory_bitmap *bm)
476{
477 bm->cur.zone = list_entry(bm->zones.next, struct mem_zone_bm_rtree,
478 list);
479 bm->cur.node = list_entry(bm->cur.zone->leaves.next,
480 struct rtree_node, list);
481 bm->cur.node_pfn = 0;
482 bm->cur.node_bit = 0;
483}
484
485static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free);
486
319struct mem_extent { 487struct mem_extent {
320 struct list_head hook; 488 struct list_head hook;
321 unsigned long start; 489 unsigned long start;
@@ -407,40 +575,22 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
407 int error; 575 int error;
408 576
409 chain_init(&ca, gfp_mask, safe_needed); 577 chain_init(&ca, gfp_mask, safe_needed);
410 INIT_LIST_HEAD(&bm->blocks); 578 INIT_LIST_HEAD(&bm->zones);
411 579
412 error = create_mem_extents(&mem_extents, gfp_mask); 580 error = create_mem_extents(&mem_extents, gfp_mask);
413 if (error) 581 if (error)
414 return error; 582 return error;
415 583
416 list_for_each_entry(ext, &mem_extents, hook) { 584 list_for_each_entry(ext, &mem_extents, hook) {
417 struct bm_block *bb; 585 struct mem_zone_bm_rtree *zone;
418 unsigned long pfn = ext->start;
419 unsigned long pages = ext->end - ext->start;
420 586
421 bb = list_entry(bm->blocks.prev, struct bm_block, hook); 587 zone = create_zone_bm_rtree(gfp_mask, safe_needed, &ca,
422 588 ext->start, ext->end);
423 error = create_bm_block_list(pages, bm->blocks.prev, &ca); 589 if (!zone) {
424 if (error) 590 error = -ENOMEM;
425 goto Error; 591 goto Error;
426
427 list_for_each_entry_continue(bb, &bm->blocks, hook) {
428 bb->data = get_image_page(gfp_mask, safe_needed);
429 if (!bb->data) {
430 error = -ENOMEM;
431 goto Error;
432 }
433
434 bb->start_pfn = pfn;
435 if (pages >= BM_BITS_PER_BLOCK) {
436 pfn += BM_BITS_PER_BLOCK;
437 pages -= BM_BITS_PER_BLOCK;
438 } else {
439 /* This is executed only once in the loop */
440 pfn += pages;
441 }
442 bb->end_pfn = pfn;
443 } 592 }
593 list_add_tail(&zone->list, &bm->zones);
444 } 594 }
445 595
446 bm->p_list = ca.chain; 596 bm->p_list = ca.chain;
@@ -460,51 +610,83 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
460 */ 610 */
461static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free) 611static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free)
462{ 612{
463 struct bm_block *bb; 613 struct mem_zone_bm_rtree *zone;
464 614
465 list_for_each_entry(bb, &bm->blocks, hook) 615 list_for_each_entry(zone, &bm->zones, list)
466 if (bb->data) 616 free_zone_bm_rtree(zone, clear_nosave_free);
467 free_image_page(bb->data, clear_nosave_free);
468 617
469 free_list_of_pages(bm->p_list, clear_nosave_free); 618 free_list_of_pages(bm->p_list, clear_nosave_free);
470 619
471 INIT_LIST_HEAD(&bm->blocks); 620 INIT_LIST_HEAD(&bm->zones);
472} 621}
473 622
474/** 623/**
475 * memory_bm_find_bit - find the bit in the bitmap @bm that corresponds 624 * memory_bm_find_bit - Find the bit for pfn in the memory
476 * to given pfn. The cur_zone_bm member of @bm and the cur_block member 625 * bitmap
477 * of @bm->cur_zone_bm are updated. 626 *
627 * Find the bit in the bitmap @bm that corresponds to given pfn.
628 * The cur.zone, cur.block and cur.node_pfn member of @bm are
629 * updated.
630 * It walks the radix tree to find the page which contains the bit for
631 * pfn and returns the bit position in **addr and *bit_nr.
478 */ 632 */
479static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, 633static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
480 void **addr, unsigned int *bit_nr) 634 void **addr, unsigned int *bit_nr)
481{ 635{
482 struct bm_block *bb; 636 struct mem_zone_bm_rtree *curr, *zone;
637 struct rtree_node *node;
638 int i, block_nr;
639
640 zone = bm->cur.zone;
641
642 if (pfn >= zone->start_pfn && pfn < zone->end_pfn)
643 goto zone_found;
644
645 zone = NULL;
646
647 /* Find the right zone */
648 list_for_each_entry(curr, &bm->zones, list) {
649 if (pfn >= curr->start_pfn && pfn < curr->end_pfn) {
650 zone = curr;
651 break;
652 }
653 }
483 654
655 if (!zone)
656 return -EFAULT;
657
658zone_found:
484 /* 659 /*
485 * Check if the pfn corresponds to the current bitmap block and find 660 * We have a zone. Now walk the radix tree to find the leave
486 * the block where it fits if this is not the case. 661 * node for our pfn.
487 */ 662 */
488 bb = bm->cur.block;
489 if (pfn < bb->start_pfn)
490 list_for_each_entry_continue_reverse(bb, &bm->blocks, hook)
491 if (pfn >= bb->start_pfn)
492 break;
493 663
494 if (pfn >= bb->end_pfn) 664 node = bm->cur.node;
495 list_for_each_entry_continue(bb, &bm->blocks, hook) 665 if (((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur.node_pfn)
496 if (pfn >= bb->start_pfn && pfn < bb->end_pfn) 666 goto node_found;
497 break;
498 667
499 if (&bb->hook == &bm->blocks) 668 node = zone->rtree;
500 return -EFAULT; 669 block_nr = (pfn - zone->start_pfn) >> BM_BLOCK_SHIFT;
670
671 for (i = zone->levels; i > 0; i--) {
672 int index;
673
674 index = block_nr >> ((i - 1) * BM_RTREE_LEVEL_SHIFT);
675 index &= BM_RTREE_LEVEL_MASK;
676 BUG_ON(node->data[index] == 0);
677 node = (struct rtree_node *)node->data[index];
678 }
679
680node_found:
681 /* Update last position */
682 bm->cur.zone = zone;
683 bm->cur.node = node;
684 bm->cur.node_pfn = (pfn - zone->start_pfn) & ~BM_BLOCK_MASK;
685
686 /* Set return values */
687 *addr = node->data;
688 *bit_nr = (pfn - zone->start_pfn) & BM_BLOCK_MASK;
501 689
502 /* The block has been found */
503 bm->cur.block = bb;
504 pfn -= bb->start_pfn;
505 bm->cur.bit = pfn + 1;
506 *bit_nr = pfn;
507 *addr = bb->data;
508 return 0; 690 return 0;
509} 691}
510 692
@@ -528,6 +710,7 @@ static int mem_bm_set_bit_check(struct memory_bitmap *bm, unsigned long pfn)
528 error = memory_bm_find_bit(bm, pfn, &addr, &bit); 710 error = memory_bm_find_bit(bm, pfn, &addr, &bit);
529 if (!error) 711 if (!error)
530 set_bit(bit, addr); 712 set_bit(bit, addr);
713
531 return error; 714 return error;
532} 715}
533 716
@@ -542,6 +725,14 @@ static void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn)
542 clear_bit(bit, addr); 725 clear_bit(bit, addr);
543} 726}
544 727
728static void memory_bm_clear_current(struct memory_bitmap *bm)
729{
730 int bit;
731
732 bit = max(bm->cur.node_bit - 1, 0);
733 clear_bit(bit, bm->cur.node->data);
734}
735
545static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn) 736static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn)
546{ 737{
547 void *addr; 738 void *addr;
@@ -561,38 +752,70 @@ static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn)
561 return !memory_bm_find_bit(bm, pfn, &addr, &bit); 752 return !memory_bm_find_bit(bm, pfn, &addr, &bit);
562} 753}
563 754
564/** 755/*
565 * memory_bm_next_pfn - find the pfn that corresponds to the next set bit 756 * rtree_next_node - Jumps to the next leave node
566 * in the bitmap @bm. If the pfn cannot be found, BM_END_OF_MAP is
567 * returned.
568 * 757 *
569 * It is required to run memory_bm_position_reset() before the first call to 758 * Sets the position to the beginning of the next node in the
570 * this function. 759 * memory bitmap. This is either the next node in the current
760 * zone's radix tree or the first node in the radix tree of the
761 * next zone.
762 *
763 * Returns true if there is a next node, false otherwise.
571 */ 764 */
765static bool rtree_next_node(struct memory_bitmap *bm)
766{
767 bm->cur.node = list_entry(bm->cur.node->list.next,
768 struct rtree_node, list);
769 if (&bm->cur.node->list != &bm->cur.zone->leaves) {
770 bm->cur.node_pfn += BM_BITS_PER_BLOCK;
771 bm->cur.node_bit = 0;
772 touch_softlockup_watchdog();
773 return true;
774 }
775
776 /* No more nodes, goto next zone */
777 bm->cur.zone = list_entry(bm->cur.zone->list.next,
778 struct mem_zone_bm_rtree, list);
779 if (&bm->cur.zone->list != &bm->zones) {
780 bm->cur.node = list_entry(bm->cur.zone->leaves.next,
781 struct rtree_node, list);
782 bm->cur.node_pfn = 0;
783 bm->cur.node_bit = 0;
784 return true;
785 }
786
787 /* No more zones */
788 return false;
789}
572 790
791/**
792 * memory_bm_rtree_next_pfn - Find the next set bit in the bitmap @bm
793 *
794 * Starting from the last returned position this function searches
795 * for the next set bit in the memory bitmap and returns its
796 * number. If no more bit is set BM_END_OF_MAP is returned.
797 *
798 * It is required to run memory_bm_position_reset() before the
799 * first call to this function.
800 */
573static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm) 801static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)
574{ 802{
575 struct bm_block *bb; 803 unsigned long bits, pfn, pages;
576 int bit; 804 int bit;
577 805
578 bb = bm->cur.block;
579 do { 806 do {
580 bit = bm->cur.bit; 807 pages = bm->cur.zone->end_pfn - bm->cur.zone->start_pfn;
581 bit = find_next_bit(bb->data, bm_block_bits(bb), bit); 808 bits = min(pages - bm->cur.node_pfn, BM_BITS_PER_BLOCK);
582 if (bit < bm_block_bits(bb)) 809 bit = find_next_bit(bm->cur.node->data, bits,
583 goto Return_pfn; 810 bm->cur.node_bit);
584 811 if (bit < bits) {
585 bb = list_entry(bb->hook.next, struct bm_block, hook); 812 pfn = bm->cur.zone->start_pfn + bm->cur.node_pfn + bit;
586 bm->cur.block = bb; 813 bm->cur.node_bit = bit + 1;
587 bm->cur.bit = 0; 814 return pfn;
588 } while (&bb->hook != &bm->blocks); 815 }
816 } while (rtree_next_node(bm));
589 817
590 memory_bm_position_reset(bm);
591 return BM_END_OF_MAP; 818 return BM_END_OF_MAP;
592
593 Return_pfn:
594 bm->cur.bit = bit + 1;
595 return bb->start_pfn + bit;
596} 819}
597 820
598/** 821/**
@@ -731,6 +954,25 @@ static void mark_nosave_pages(struct memory_bitmap *bm)
731 } 954 }
732} 955}
733 956
957static bool is_nosave_page(unsigned long pfn)
958{
959 struct nosave_region *region;
960
961 list_for_each_entry(region, &nosave_regions, list) {
962 if (pfn >= region->start_pfn && pfn < region->end_pfn) {
963 pr_err("PM: %#010llx in e820 nosave region: "
964 "[mem %#010llx-%#010llx]\n",
965 (unsigned long long) pfn << PAGE_SHIFT,
966 (unsigned long long) region->start_pfn << PAGE_SHIFT,
967 ((unsigned long long) region->end_pfn << PAGE_SHIFT)
968 - 1);
969 return true;
970 }
971 }
972
973 return false;
974}
975
734/** 976/**
735 * create_basic_memory_bitmaps - create bitmaps needed for marking page 977 * create_basic_memory_bitmaps - create bitmaps needed for marking page
736 * frames that should not be saved and free page frames. The pointers 978 * frames that should not be saved and free page frames. The pointers
@@ -816,12 +1058,17 @@ void free_basic_memory_bitmaps(void)
816 1058
817unsigned int snapshot_additional_pages(struct zone *zone) 1059unsigned int snapshot_additional_pages(struct zone *zone)
818{ 1060{
819 unsigned int res; 1061 unsigned int rtree, nodes;
820 1062
821 res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK); 1063 rtree = nodes = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK);
822 res += DIV_ROUND_UP(res * sizeof(struct bm_block), 1064 rtree += DIV_ROUND_UP(rtree * sizeof(struct rtree_node),
823 LINKED_PAGE_DATA_SIZE); 1065 LINKED_PAGE_DATA_SIZE);
824 return 2 * res; 1066 while (nodes > 1) {
1067 nodes = DIV_ROUND_UP(nodes, BM_ENTRIES_PER_LEVEL);
1068 rtree += nodes;
1069 }
1070
1071 return 2 * rtree;
825} 1072}
826 1073
827#ifdef CONFIG_HIGHMEM 1074#ifdef CONFIG_HIGHMEM
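
The new estimate charges one bitmap page per BM_BITS_PER_BLOCK page frames, adds space for the rtree_node bookkeeping, and then adds the inner nodes level by level. A worked example, assuming 4 KiB pages, a 64-bit build and a zone spanning 2^20 page frames:

	/*
	 *   rtree = nodes = DIV_ROUND_UP(1048576, 32768) = 32 leaf bitmap pages
	 *   rtree += 1     (the 32 rtree_node structs fit in one linked page)
	 *   loop:  nodes = DIV_ROUND_UP(32, 512) = 1  ->  rtree = 34, then stop
	 * so snapshot_additional_pages() reports 2 * 34 = 68 pages for this zone.
	 */
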
@@ -1094,23 +1341,35 @@ static struct memory_bitmap copy_bm;
1094 1341
1095void swsusp_free(void) 1342void swsusp_free(void)
1096{ 1343{
1097 struct zone *zone; 1344 unsigned long fb_pfn, fr_pfn;
1098 unsigned long pfn, max_zone_pfn;
1099 1345
1100 for_each_populated_zone(zone) { 1346 memory_bm_position_reset(forbidden_pages_map);
1101 max_zone_pfn = zone_end_pfn(zone); 1347 memory_bm_position_reset(free_pages_map);
1102 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 1348
1103 if (pfn_valid(pfn)) { 1349loop:
1104 struct page *page = pfn_to_page(pfn); 1350 fr_pfn = memory_bm_next_pfn(free_pages_map);
1105 1351 fb_pfn = memory_bm_next_pfn(forbidden_pages_map);
1106 if (swsusp_page_is_forbidden(page) && 1352
1107 swsusp_page_is_free(page)) { 1353 /*
1108 swsusp_unset_page_forbidden(page); 1354 * Find the next bit set in both bitmaps. This is guaranteed to
1109 swsusp_unset_page_free(page); 1355 * terminate when fb_pfn == fr_pfn == BM_END_OF_MAP.
1110 __free_page(page); 1356 */
1111 } 1357 do {
1112 } 1358 if (fb_pfn < fr_pfn)
1359 fb_pfn = memory_bm_next_pfn(forbidden_pages_map);
1360 if (fr_pfn < fb_pfn)
1361 fr_pfn = memory_bm_next_pfn(free_pages_map);
1362 } while (fb_pfn != fr_pfn);
1363
1364 if (fr_pfn != BM_END_OF_MAP && pfn_valid(fr_pfn)) {
1365 struct page *page = pfn_to_page(fr_pfn);
1366
1367 memory_bm_clear_current(forbidden_pages_map);
1368 memory_bm_clear_current(free_pages_map);
1369 __free_page(page);
1370 goto loop;
1113 } 1371 }
1372
1114 nr_copy_pages = 0; 1373 nr_copy_pages = 0;
1115 nr_meta_pages = 0; 1374 nr_meta_pages = 0;
1116 restore_pblist = NULL; 1375 restore_pblist = NULL;
@@ -1775,7 +2034,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm)
1775 do { 2034 do {
1776 pfn = memory_bm_next_pfn(bm); 2035 pfn = memory_bm_next_pfn(bm);
1777 if (likely(pfn != BM_END_OF_MAP)) { 2036 if (likely(pfn != BM_END_OF_MAP)) {
1778 if (likely(pfn_valid(pfn))) 2037 if (likely(pfn_valid(pfn)) && !is_nosave_page(pfn))
1779 swsusp_set_page_free(pfn_to_page(pfn)); 2038 swsusp_set_page_free(pfn_to_page(pfn));
1780 else 2039 else
1781 return -EFAULT; 2040 return -EFAULT;
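
The rewritten swsusp_free() no longer scans every valid pfn of every zone; it walks the forbidden and free bitmaps in lockstep, always advancing whichever stream is behind, and only frees pages whose bit is set in both. The same merge-style intersection reduced to its skeleton, over two plain sorted arrays (illustrative only):

	#include <linux/types.h>

	/* Visit every value present in both sorted arrays, the way swsusp_free()
	 * now visits pfns that are set in both bitmaps. */
	static void for_each_common_pfn(const unsigned long *a, size_t na,
					const unsigned long *b, size_t nb,
					void (*fn)(unsigned long pfn))
	{
		size_t i = 0, j = 0;

		while (i < na && j < nb) {
			if (a[i] < b[j]) {
				i++;		/* advance the stream that is behind */
			} else if (b[j] < a[i]) {
				j++;
			} else {
				fn(a[i]);	/* set in both: handle it ...        */
				i++;		/* ... and advance both streams      */
				j++;
			}
		}
	}
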
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 4b736b4dfa96..6dadb25cb0d8 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -31,20 +31,11 @@
31 31
32#include "power.h" 32#include "power.h"
33 33
34struct pm_sleep_state pm_states[PM_SUSPEND_MAX] = { 34static const char *pm_labels[] = { "mem", "standby", "freeze", };
35 [PM_SUSPEND_FREEZE] = { .label = "freeze", .state = PM_SUSPEND_FREEZE }, 35const char *pm_states[PM_SUSPEND_MAX];
36 [PM_SUSPEND_STANDBY] = { .label = "standby", },
37 [PM_SUSPEND_MEM] = { .label = "mem", },
38};
39 36
40static const struct platform_suspend_ops *suspend_ops; 37static const struct platform_suspend_ops *suspend_ops;
41static const struct platform_freeze_ops *freeze_ops; 38static const struct platform_freeze_ops *freeze_ops;
42
43static bool need_suspend_ops(suspend_state_t state)
44{
45 return state > PM_SUSPEND_FREEZE;
46}
47
48static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head); 39static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head);
49static bool suspend_freeze_wake; 40static bool suspend_freeze_wake;
50 41
@@ -97,10 +88,7 @@ static bool relative_states;
97static int __init sleep_states_setup(char *str) 88static int __init sleep_states_setup(char *str)
98{ 89{
99 relative_states = !strncmp(str, "1", 1); 90 relative_states = !strncmp(str, "1", 1);
100 if (relative_states) { 91 pm_states[PM_SUSPEND_FREEZE] = pm_labels[relative_states ? 0 : 2];
101 pm_states[PM_SUSPEND_MEM].state = PM_SUSPEND_FREEZE;
102 pm_states[PM_SUSPEND_FREEZE].state = 0;
103 }
104 return 1; 92 return 1;
105} 93}
106 94
@@ -113,20 +101,20 @@ __setup("relative_sleep_states=", sleep_states_setup);
113void suspend_set_ops(const struct platform_suspend_ops *ops) 101void suspend_set_ops(const struct platform_suspend_ops *ops)
114{ 102{
115 suspend_state_t i; 103 suspend_state_t i;
116 int j = PM_SUSPEND_MAX - 1; 104 int j = 0;
117 105
118 lock_system_sleep(); 106 lock_system_sleep();
119 107
120 suspend_ops = ops; 108 suspend_ops = ops;
121 for (i = PM_SUSPEND_MEM; i >= PM_SUSPEND_STANDBY; i--) 109 for (i = PM_SUSPEND_MEM; i >= PM_SUSPEND_STANDBY; i--)
122 if (valid_state(i)) 110 if (valid_state(i)) {
123 pm_states[j--].state = i; 111 pm_states[i] = pm_labels[j++];
124 else if (!relative_states) 112 } else if (!relative_states) {
125 pm_states[j--].state = 0; 113 pm_states[i] = NULL;
114 j++;
115 }
126 116
127 pm_states[j--].state = PM_SUSPEND_FREEZE; 117 pm_states[PM_SUSPEND_FREEZE] = pm_labels[j];
128 while (j >= PM_SUSPEND_MIN)
129 pm_states[j--].state = 0;
130 118
131 unlock_system_sleep(); 119 unlock_system_sleep();
132} 120}
@@ -145,6 +133,65 @@ int suspend_valid_only_mem(suspend_state_t state)
145} 133}
146EXPORT_SYMBOL_GPL(suspend_valid_only_mem); 134EXPORT_SYMBOL_GPL(suspend_valid_only_mem);
147 135
136static bool sleep_state_supported(suspend_state_t state)
137{
138 return state == PM_SUSPEND_FREEZE || (suspend_ops && suspend_ops->enter);
139}
140
141static int platform_suspend_prepare(suspend_state_t state)
142{
143 return state != PM_SUSPEND_FREEZE && suspend_ops->prepare ?
144 suspend_ops->prepare() : 0;
145}
146
147static int platform_suspend_prepare_late(suspend_state_t state)
148{
149 return state != PM_SUSPEND_FREEZE && suspend_ops->prepare_late ?
150 suspend_ops->prepare_late() : 0;
151}
152
153static void platform_suspend_wake(suspend_state_t state)
154{
155 if (state != PM_SUSPEND_FREEZE && suspend_ops->wake)
156 suspend_ops->wake();
157}
158
159static void platform_suspend_finish(suspend_state_t state)
160{
161 if (state != PM_SUSPEND_FREEZE && suspend_ops->finish)
162 suspend_ops->finish();
163}
164
165static int platform_suspend_begin(suspend_state_t state)
166{
167 if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->begin)
168 return freeze_ops->begin();
169 else if (suspend_ops->begin)
170 return suspend_ops->begin(state);
171 else
172 return 0;
173}
174
175static void platform_suspend_end(suspend_state_t state)
176{
177 if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end)
178 freeze_ops->end();
179 else if (suspend_ops->end)
180 suspend_ops->end();
181}
182
183static void platform_suspend_recover(suspend_state_t state)
184{
185 if (state != PM_SUSPEND_FREEZE && suspend_ops->recover)
186 suspend_ops->recover();
187}
188
189static bool platform_suspend_again(suspend_state_t state)
190{
191 return state != PM_SUSPEND_FREEZE && suspend_ops->suspend_again ?
192 suspend_ops->suspend_again() : false;
193}
194
148static int suspend_test(int level) 195static int suspend_test(int level)
149{ 196{
150#ifdef CONFIG_PM_DEBUG 197#ifdef CONFIG_PM_DEBUG
@@ -168,7 +215,7 @@ static int suspend_prepare(suspend_state_t state)
168{ 215{
169 int error; 216 int error;
170 217
171 if (need_suspend_ops(state) && (!suspend_ops || !suspend_ops->enter)) 218 if (!sleep_state_supported(state))
172 return -EPERM; 219 return -EPERM;
173 220
174 pm_prepare_console(); 221 pm_prepare_console();
@@ -214,23 +261,18 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
214{ 261{
215 int error; 262 int error;
216 263
217 if (need_suspend_ops(state) && suspend_ops->prepare) { 264 error = platform_suspend_prepare(state);
218 error = suspend_ops->prepare(); 265 if (error)
219 if (error) 266 goto Platform_finish;
220 goto Platform_finish;
221 }
222 267
223 error = dpm_suspend_end(PMSG_SUSPEND); 268 error = dpm_suspend_end(PMSG_SUSPEND);
224 if (error) { 269 if (error) {
225 printk(KERN_ERR "PM: Some devices failed to power down\n"); 270 printk(KERN_ERR "PM: Some devices failed to power down\n");
226 goto Platform_finish; 271 goto Platform_finish;
227 } 272 }
228 273 error = platform_suspend_prepare_late(state);
229 if (need_suspend_ops(state) && suspend_ops->prepare_late) { 274 if (error)
230 error = suspend_ops->prepare_late(); 275 goto Platform_wake;
231 if (error)
232 goto Platform_wake;
233 }
234 276
235 if (suspend_test(TEST_PLATFORM)) 277 if (suspend_test(TEST_PLATFORM))
236 goto Platform_wake; 278 goto Platform_wake;
@@ -276,15 +318,11 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
276 enable_nonboot_cpus(); 318 enable_nonboot_cpus();
277 319
278 Platform_wake: 320 Platform_wake:
279 if (need_suspend_ops(state) && suspend_ops->wake) 321 platform_suspend_wake(state);
280 suspend_ops->wake();
281
282 dpm_resume_start(PMSG_RESUME); 322 dpm_resume_start(PMSG_RESUME);
283 323
284 Platform_finish: 324 Platform_finish:
285 if (need_suspend_ops(state) && suspend_ops->finish) 325 platform_suspend_finish(state);
286 suspend_ops->finish();
287
288 return error; 326 return error;
289} 327}
290 328
@@ -297,18 +335,13 @@ int suspend_devices_and_enter(suspend_state_t state)
297 int error; 335 int error;
298 bool wakeup = false; 336 bool wakeup = false;
299 337
300 if (need_suspend_ops(state) && !suspend_ops) 338 if (!sleep_state_supported(state))
301 return -ENOSYS; 339 return -ENOSYS;
302 340
303 if (need_suspend_ops(state) && suspend_ops->begin) { 341 error = platform_suspend_begin(state);
304 error = suspend_ops->begin(state); 342 if (error)
305 if (error) 343 goto Close;
306 goto Close; 344
307 } else if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->begin) {
308 error = freeze_ops->begin();
309 if (error)
310 goto Close;
311 }
312 suspend_console(); 345 suspend_console();
313 suspend_test_start(); 346 suspend_test_start();
314 error = dpm_suspend_start(PMSG_SUSPEND); 347 error = dpm_suspend_start(PMSG_SUSPEND);
@@ -322,25 +355,20 @@ int suspend_devices_and_enter(suspend_state_t state)
322 355
323 do { 356 do {
324 error = suspend_enter(state, &wakeup); 357 error = suspend_enter(state, &wakeup);
325 } while (!error && !wakeup && need_suspend_ops(state) 358 } while (!error && !wakeup && platform_suspend_again(state));
326 && suspend_ops->suspend_again && suspend_ops->suspend_again());
327 359
328 Resume_devices: 360 Resume_devices:
329 suspend_test_start(); 361 suspend_test_start();
330 dpm_resume_end(PMSG_RESUME); 362 dpm_resume_end(PMSG_RESUME);
331 suspend_test_finish("resume devices"); 363 suspend_test_finish("resume devices");
332 resume_console(); 364 resume_console();
333 Close:
334 if (need_suspend_ops(state) && suspend_ops->end)
335 suspend_ops->end();
336 else if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end)
337 freeze_ops->end();
338 365
366 Close:
367 platform_suspend_end(state);
339 return error; 368 return error;
340 369
341 Recover_platform: 370 Recover_platform:
342 if (need_suspend_ops(state) && suspend_ops->recover) 371 platform_suspend_recover(state);
343 suspend_ops->recover();
344 goto Resume_devices; 372 goto Resume_devices;
345} 373}
346 374
@@ -393,7 +421,7 @@ static int enter_state(suspend_state_t state)
393 printk("done.\n"); 421 printk("done.\n");
394 trace_suspend_resume(TPS("sync_filesystems"), 0, false); 422 trace_suspend_resume(TPS("sync_filesystems"), 0, false);
395 423
396 pr_debug("PM: Preparing system for %s sleep\n", pm_states[state].label); 424 pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
397 error = suspend_prepare(state); 425 error = suspend_prepare(state);
398 if (error) 426 if (error)
399 goto Unlock; 427 goto Unlock;
@@ -402,7 +430,7 @@ static int enter_state(suspend_state_t state)
402 goto Finish; 430 goto Finish;
403 431
404 trace_suspend_resume(TPS("suspend_enter"), state, false); 432 trace_suspend_resume(TPS("suspend_enter"), state, false);
405 pr_debug("PM: Entering %s sleep\n", pm_states[state].label); 433 pr_debug("PM: Entering %s sleep\n", pm_states[state]);
406 pm_restrict_gfp_mask(); 434 pm_restrict_gfp_mask();
407 error = suspend_devices_and_enter(state); 435 error = suspend_devices_and_enter(state);
408 pm_restore_gfp_mask(); 436 pm_restore_gfp_mask();
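
With pm_states[] reduced to an array of labels, suspend_set_ops() now only decides which slots get a name. Tracing the new loop for a platform whose ->valid() accepts PM_SUSPEND_MEM but not PM_SUSPEND_STANDBY, with the default relative_sleep_states=0:

	/*
	 * pm_labels[] = { "mem", "standby", "freeze" }, j starts at 0:
	 *   i = PM_SUSPEND_MEM:     valid   -> pm_states[MEM]     = "mem",  j = 1
	 *   i = PM_SUSPEND_STANDBY: invalid -> pm_states[STANDBY] = NULL,   j = 2
	 * and finally pm_states[PM_SUSPEND_FREEZE] = pm_labels[2] = "freeze".
	 * state_show() therefore advertises "freeze mem" (plus "disk" when
	 * hibernation is available), and decode_state() matches writes against
	 * exactly the same strings.
	 */
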
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index 269b097e78ea..2f524928b6aa 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -92,13 +92,13 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
92 } 92 }
93 93
94 if (state == PM_SUSPEND_MEM) { 94 if (state == PM_SUSPEND_MEM) {
95 printk(info_test, pm_states[state].label); 95 printk(info_test, pm_states[state]);
96 status = pm_suspend(state); 96 status = pm_suspend(state);
97 if (status == -ENODEV) 97 if (status == -ENODEV)
98 state = PM_SUSPEND_STANDBY; 98 state = PM_SUSPEND_STANDBY;
99 } 99 }
100 if (state == PM_SUSPEND_STANDBY) { 100 if (state == PM_SUSPEND_STANDBY) {
101 printk(info_test, pm_states[state].label); 101 printk(info_test, pm_states[state]);
102 status = pm_suspend(state); 102 status = pm_suspend(state);
103 } 103 }
104 if (status < 0) 104 if (status < 0)
@@ -141,8 +141,8 @@ static int __init setup_test_suspend(char *value)
141 /* "=mem" ==> "mem" */ 141 /* "=mem" ==> "mem" */
142 value++; 142 value++;
143 for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) 143 for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++)
144 if (!strcmp(pm_states[i].label, value)) { 144 if (!strcmp(pm_states[i], value)) {
145 test_state = pm_states[i].state; 145 test_state = i;
146 return 0; 146 return 0;
147 } 147 }
148 148
@@ -162,8 +162,8 @@ static int __init test_suspend(void)
162 /* PM is initialized by now; is that state testable? */ 162 /* PM is initialized by now; is that state testable? */
163 if (test_state == PM_SUSPEND_ON) 163 if (test_state == PM_SUSPEND_ON)
164 goto done; 164 goto done;
165 if (!pm_states[test_state].state) { 165 if (!pm_states[test_state]) {
166 printk(warn_bad_state, pm_states[test_state].label); 166 printk(warn_bad_state, pm_states[test_state]);
167 goto done; 167 goto done;
168 } 168 }
169 169
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 13e839dbca07..e04c455a0e38 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -45,6 +45,7 @@
45#include <linux/poll.h> 45#include <linux/poll.h>
46#include <linux/irq_work.h> 46#include <linux/irq_work.h>
47#include <linux/utsname.h> 47#include <linux/utsname.h>
48#include <linux/ctype.h>
48 49
49#include <asm/uaccess.h> 50#include <asm/uaccess.h>
50 51
@@ -56,7 +57,7 @@
56 57
57int console_printk[4] = { 58int console_printk[4] = {
58 CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */ 59 CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */
59 DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */ 60 MESSAGE_LOGLEVEL_DEFAULT, /* default_message_loglevel */
60 CONSOLE_LOGLEVEL_MIN, /* minimum_console_loglevel */ 61 CONSOLE_LOGLEVEL_MIN, /* minimum_console_loglevel */
61 CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */ 62 CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */
62}; 63};
@@ -113,9 +114,9 @@ static int __down_trylock_console_sem(unsigned long ip)
113 * This is used for debugging the mess that is the VT code by 114 * This is used for debugging the mess that is the VT code by
114 * keeping track if we have the console semaphore held. It's 115 * keeping track if we have the console semaphore held. It's
115 * definitely not the perfect debug tool (we don't know if _WE_ 116 * definitely not the perfect debug tool (we don't know if _WE_
116 * hold it are racing, but it helps tracking those weird code 117 * hold it and are racing, but it helps tracking those weird code
117 * path in the console code where we end up in places I want 118 * paths in the console code where we end up in places I want
118 * locked without the console sempahore held 119 * locked without the console sempahore held).
119 */ 120 */
120static int console_locked, console_suspended; 121static int console_locked, console_suspended;
121 122
@@ -146,8 +147,8 @@ static int console_may_schedule;
146 * the overall length of the record. 147 * the overall length of the record.
147 * 148 *
148 * The heads to the first and last entry in the buffer, as well as the 149 * The heads to the first and last entry in the buffer, as well as the
149 * sequence numbers of these both entries are maintained when messages 150 * sequence numbers of these entries are maintained when messages are
150 * are stored.. 151 * stored.
151 * 152 *
152 * If the heads indicate available messages, the length in the header 153 * If the heads indicate available messages, the length in the header
153 * tells the start next message. A length == 0 for the next message 154 * tells the start next message. A length == 0 for the next message
@@ -257,7 +258,7 @@ static u64 clear_seq;
257static u32 clear_idx; 258static u32 clear_idx;
258 259
259#define PREFIX_MAX 32 260#define PREFIX_MAX 32
260#define LOG_LINE_MAX 1024 - PREFIX_MAX 261#define LOG_LINE_MAX (1024 - PREFIX_MAX)
261 262
262/* record buffer */ 263/* record buffer */
263#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) 264#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
@@ -266,10 +267,23 @@ static u32 clear_idx;
266#define LOG_ALIGN __alignof__(struct printk_log) 267#define LOG_ALIGN __alignof__(struct printk_log)
267#endif 268#endif
268#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) 269#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
270#define __LOG_CPU_MAX_BUF_LEN (1 << CONFIG_LOG_CPU_MAX_BUF_SHIFT)
269static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); 271static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
270static char *log_buf = __log_buf; 272static char *log_buf = __log_buf;
271static u32 log_buf_len = __LOG_BUF_LEN; 273static u32 log_buf_len = __LOG_BUF_LEN;
272 274
275/* Return log buffer address */
276char *log_buf_addr_get(void)
277{
278 return log_buf;
279}
280
281/* Return log buffer size */
282u32 log_buf_len_get(void)
283{
284 return log_buf_len;
285}
286
273/* human readable text of the record */ 287/* human readable text of the record */
274static char *log_text(const struct printk_log *msg) 288static char *log_text(const struct printk_log *msg)
275{ 289{
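
log_buf_addr_get() and log_buf_len_get() do nothing more than expose the (possibly reallocated) ring buffer to other subsystems; in this merge the consumer is the kexec/crash-dump side, which needs the buffer's location and size. A hypothetical in-kernel caller, assuming the declarations exported alongside these helpers:

    /* illustrative only: record where the printk ring buffer lives */
    static void note_log_buf_location(void)
    {
            char *buf = log_buf_addr_get();
            u32 len = log_buf_len_get();

            pr_debug("printk log_buf at %p, %u bytes\n", buf, len);
    }
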
@@ -344,7 +358,7 @@ static int log_make_free_space(u32 msg_size)
344 while (log_first_seq < log_next_seq) { 358 while (log_first_seq < log_next_seq) {
345 if (logbuf_has_space(msg_size, false)) 359 if (logbuf_has_space(msg_size, false))
346 return 0; 360 return 0;
347 /* drop old messages until we have enough continuous space */ 361 /* drop old messages until we have enough contiguous space */
348 log_first_idx = log_next(log_first_idx); 362 log_first_idx = log_next(log_first_idx);
349 log_first_seq++; 363 log_first_seq++;
350 } 364 }
@@ -453,11 +467,7 @@ static int log_store(int facility, int level,
453 return msg->text_len; 467 return msg->text_len;
454} 468}
455 469
456#ifdef CONFIG_SECURITY_DMESG_RESTRICT 470int dmesg_restrict = IS_ENABLED(CONFIG_SECURITY_DMESG_RESTRICT);
457int dmesg_restrict = 1;
458#else
459int dmesg_restrict;
460#endif
461 471
462static int syslog_action_restricted(int type) 472static int syslog_action_restricted(int type)
463{ 473{
@@ -828,34 +838,74 @@ void log_buf_kexec_setup(void)
828/* requested log_buf_len from kernel cmdline */ 838/* requested log_buf_len from kernel cmdline */
829static unsigned long __initdata new_log_buf_len; 839static unsigned long __initdata new_log_buf_len;
830 840
831/* save requested log_buf_len since it's too early to process it */ 841/* we practice scaling the ring buffer by powers of 2 */
832static int __init log_buf_len_setup(char *str) 842static void __init log_buf_len_update(unsigned size)
833{ 843{
834 unsigned size = memparse(str, &str);
835
836 if (size) 844 if (size)
837 size = roundup_pow_of_two(size); 845 size = roundup_pow_of_two(size);
838 if (size > log_buf_len) 846 if (size > log_buf_len)
839 new_log_buf_len = size; 847 new_log_buf_len = size;
848}
849
850/* save requested log_buf_len since it's too early to process it */
851static int __init log_buf_len_setup(char *str)
852{
853 unsigned size = memparse(str, &str);
854
855 log_buf_len_update(size);
840 856
841 return 0; 857 return 0;
842} 858}
843early_param("log_buf_len", log_buf_len_setup); 859early_param("log_buf_len", log_buf_len_setup);
844 860
861static void __init log_buf_add_cpu(void)
862{
863 unsigned int cpu_extra;
864
865 /*
866 * archs should set up cpu_possible_bits properly with
867 * set_cpu_possible() after setup_arch() but just in
868 * case lets ensure this is valid.
869 */
870 if (num_possible_cpus() == 1)
871 return;
872
873 cpu_extra = (num_possible_cpus() - 1) * __LOG_CPU_MAX_BUF_LEN;
874
875 /* by default this will only continue through for large > 64 CPUs */
876 if (cpu_extra <= __LOG_BUF_LEN / 2)
877 return;
878
879 pr_info("log_buf_len individual max cpu contribution: %d bytes\n",
880 __LOG_CPU_MAX_BUF_LEN);
881 pr_info("log_buf_len total cpu_extra contributions: %d bytes\n",
882 cpu_extra);
883 pr_info("log_buf_len min size: %d bytes\n", __LOG_BUF_LEN);
884
885 log_buf_len_update(cpu_extra + __LOG_BUF_LEN);
886}
887
845void __init setup_log_buf(int early) 888void __init setup_log_buf(int early)
846{ 889{
847 unsigned long flags; 890 unsigned long flags;
848 char *new_log_buf; 891 char *new_log_buf;
849 int free; 892 int free;
850 893
894 if (log_buf != __log_buf)
895 return;
896
897 if (!early && !new_log_buf_len)
898 log_buf_add_cpu();
899
851 if (!new_log_buf_len) 900 if (!new_log_buf_len)
852 return; 901 return;
853 902
854 if (early) { 903 if (early) {
855 new_log_buf = 904 new_log_buf =
856 memblock_virt_alloc(new_log_buf_len, PAGE_SIZE); 905 memblock_virt_alloc(new_log_buf_len, LOG_ALIGN);
857 } else { 906 } else {
858 new_log_buf = memblock_virt_alloc_nopanic(new_log_buf_len, 0); 907 new_log_buf = memblock_virt_alloc_nopanic(new_log_buf_len,
908 LOG_ALIGN);
859 } 909 }
860 910
861 if (unlikely(!new_log_buf)) { 911 if (unlikely(!new_log_buf)) {
@@ -872,7 +922,7 @@ void __init setup_log_buf(int early)
872 memcpy(log_buf, __log_buf, __LOG_BUF_LEN); 922 memcpy(log_buf, __log_buf, __LOG_BUF_LEN);
873 raw_spin_unlock_irqrestore(&logbuf_lock, flags); 923 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
874 924
875 pr_info("log_buf_len: %d\n", log_buf_len); 925 pr_info("log_buf_len: %d bytes\n", log_buf_len);
876 pr_info("early log buf free: %d(%d%%)\n", 926 pr_info("early log buf free: %d(%d%%)\n",
877 free, (free * 100) / __LOG_BUF_LEN); 927 free, (free * 100) / __LOG_BUF_LEN);
878} 928}
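
For a sense of the scaling above: assuming a 256 KiB static buffer (CONFIG_LOG_BUF_SHIFT=18) and a 4 KiB per-CPU contribution (CONFIG_LOG_CPU_MAX_BUF_SHIFT=12), a 64-CPU machine computes cpu_extra = 63 * 4 KiB = 252 KiB, which exceeds half of 256 KiB, so setup_log_buf() allocates roundup_pow_of_two(252 KiB + 256 KiB) = 512 KiB; a 16-CPU machine gets cpu_extra = 60 KiB, stays under the 128 KiB threshold, and keeps the static __log_buf. An explicit log_buf_len= on the command line always wins, since log_buf_add_cpu() is skipped whenever new_log_buf_len is already set.
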
@@ -881,7 +931,7 @@ static bool __read_mostly ignore_loglevel;
881 931
882static int __init ignore_loglevel_setup(char *str) 932static int __init ignore_loglevel_setup(char *str)
883{ 933{
884 ignore_loglevel = 1; 934 ignore_loglevel = true;
885 pr_info("debug: ignoring loglevel setting.\n"); 935 pr_info("debug: ignoring loglevel setting.\n");
886 936
887 return 0; 937 return 0;
@@ -947,11 +997,7 @@ static inline void boot_delay_msec(int level)
947} 997}
948#endif 998#endif
949 999
950#if defined(CONFIG_PRINTK_TIME) 1000static bool printk_time = IS_ENABLED(CONFIG_PRINTK_TIME);
951static bool printk_time = 1;
952#else
953static bool printk_time;
954#endif
955module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); 1001module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR);
956 1002
957static size_t print_time(u64 ts, char *buf) 1003static size_t print_time(u64 ts, char *buf)
@@ -1310,7 +1356,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
1310 * for pending data, not the size; return the count of 1356 * for pending data, not the size; return the count of
1311 * records, not the length. 1357 * records, not the length.
1312 */ 1358 */
1313 error = log_next_idx - syslog_idx; 1359 error = log_next_seq - syslog_seq;
1314 } else { 1360 } else {
1315 u64 seq = syslog_seq; 1361 u64 seq = syslog_seq;
1316 u32 idx = syslog_idx; 1362 u32 idx = syslog_idx;
@@ -1416,10 +1462,9 @@ static int have_callable_console(void)
1416/* 1462/*
1417 * Can we actually use the console at this time on this cpu? 1463 * Can we actually use the console at this time on this cpu?
1418 * 1464 *
1419 * Console drivers may assume that per-cpu resources have 1465 * Console drivers may assume that per-cpu resources have been allocated. So
1420 * been allocated. So unless they're explicitly marked as 1466 * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't
1421 * being able to cope (CON_ANYTIME) don't call them until 1467 * call them until this CPU is officially up.
1422 * this CPU is officially up.
1423 */ 1468 */
1424static inline int can_use_console(unsigned int cpu) 1469static inline int can_use_console(unsigned int cpu)
1425{ 1470{
@@ -1432,8 +1477,10 @@ static inline int can_use_console(unsigned int cpu)
1432 * console_lock held, and 'console_locked' set) if it 1477 * console_lock held, and 'console_locked' set) if it
1433 * is successful, false otherwise. 1478 * is successful, false otherwise.
1434 */ 1479 */
1435static int console_trylock_for_printk(unsigned int cpu) 1480static int console_trylock_for_printk(void)
1436{ 1481{
1482 unsigned int cpu = smp_processor_id();
1483
1437 if (!console_trylock()) 1484 if (!console_trylock())
1438 return 0; 1485 return 0;
1439 /* 1486 /*
@@ -1476,7 +1523,7 @@ static struct cont {
1476 struct task_struct *owner; /* task of first print*/ 1523 struct task_struct *owner; /* task of first print*/
1477 u64 ts_nsec; /* time of first print */ 1524 u64 ts_nsec; /* time of first print */
1478 u8 level; /* log level of first message */ 1525 u8 level; /* log level of first message */
1479 u8 facility; /* log level of first message */ 1526 u8 facility; /* log facility of first message */
1480 enum log_flags flags; /* prefix, newline flags */ 1527 enum log_flags flags; /* prefix, newline flags */
1481 bool flushed:1; /* buffer sealed and committed */ 1528 bool flushed:1; /* buffer sealed and committed */
1482} cont; 1529} cont;
@@ -1608,7 +1655,8 @@ asmlinkage int vprintk_emit(int facility, int level,
1608 */ 1655 */
1609 if (!oops_in_progress && !lockdep_recursing(current)) { 1656 if (!oops_in_progress && !lockdep_recursing(current)) {
1610 recursion_bug = 1; 1657 recursion_bug = 1;
1611 goto out_restore_irqs; 1658 local_irq_restore(flags);
1659 return 0;
1612 } 1660 }
1613 zap_locks(); 1661 zap_locks();
1614 } 1662 }
@@ -1716,21 +1764,30 @@ asmlinkage int vprintk_emit(int facility, int level,
1716 1764
1717 logbuf_cpu = UINT_MAX; 1765 logbuf_cpu = UINT_MAX;
1718 raw_spin_unlock(&logbuf_lock); 1766 raw_spin_unlock(&logbuf_lock);
1767 lockdep_on();
1768 local_irq_restore(flags);
1719 1769
1720 /* If called from the scheduler, we can not call up(). */ 1770 /* If called from the scheduler, we can not call up(). */
1721 if (!in_sched) { 1771 if (!in_sched) {
1772 lockdep_off();
1773 /*
1774 * Disable preemption to avoid being preempted while holding
1775 * console_sem which would prevent anyone from printing to
1776 * console
1777 */
1778 preempt_disable();
1779
1722 /* 1780 /*
1723 * Try to acquire and then immediately release the console 1781 * Try to acquire and then immediately release the console
1724 * semaphore. The release will print out buffers and wake up 1782 * semaphore. The release will print out buffers and wake up
1725 * /dev/kmsg and syslog() users. 1783 * /dev/kmsg and syslog() users.
1726 */ 1784 */
1727 if (console_trylock_for_printk(this_cpu)) 1785 if (console_trylock_for_printk())
1728 console_unlock(); 1786 console_unlock();
1787 preempt_enable();
1788 lockdep_on();
1729 } 1789 }
1730 1790
1731 lockdep_on();
1732out_restore_irqs:
1733 local_irq_restore(flags);
1734 return printed_len; 1791 return printed_len;
1735} 1792}
1736EXPORT_SYMBOL(vprintk_emit); 1793EXPORT_SYMBOL(vprintk_emit);
@@ -1802,7 +1859,7 @@ EXPORT_SYMBOL(printk);
1802 1859
1803#define LOG_LINE_MAX 0 1860#define LOG_LINE_MAX 0
1804#define PREFIX_MAX 0 1861#define PREFIX_MAX 0
1805#define LOG_LINE_MAX 0 1862
1806static u64 syslog_seq; 1863static u64 syslog_seq;
1807static u32 syslog_idx; 1864static u32 syslog_idx;
1808static u64 console_seq; 1865static u64 console_seq;
@@ -1881,11 +1938,12 @@ static int __add_preferred_console(char *name, int idx, char *options,
1881 return 0; 1938 return 0;
1882} 1939}
1883/* 1940/*
1884 * Set up a list of consoles. Called from init/main.c 1941 * Set up a console. Called via do_early_param() in init/main.c
1942 * for each "console=" parameter in the boot command line.
1885 */ 1943 */
1886static int __init console_setup(char *str) 1944static int __init console_setup(char *str)
1887{ 1945{
1888 char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for index */ 1946 char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for "ttyS" */
1889 char *s, *options, *brl_options = NULL; 1947 char *s, *options, *brl_options = NULL;
1890 int idx; 1948 int idx;
1891 1949
@@ -1902,7 +1960,8 @@ static int __init console_setup(char *str)
1902 strncpy(buf, str, sizeof(buf) - 1); 1960 strncpy(buf, str, sizeof(buf) - 1);
1903 } 1961 }
1904 buf[sizeof(buf) - 1] = 0; 1962 buf[sizeof(buf) - 1] = 0;
1905 if ((options = strchr(str, ',')) != NULL) 1963 options = strchr(str, ',');
1964 if (options)
1906 *(options++) = 0; 1965 *(options++) = 0;
1907#ifdef __sparc__ 1966#ifdef __sparc__
1908 if (!strcmp(str, "ttya")) 1967 if (!strcmp(str, "ttya"))
@@ -1911,7 +1970,7 @@ static int __init console_setup(char *str)
1911 strcpy(buf, "ttyS1"); 1970 strcpy(buf, "ttyS1");
1912#endif 1971#endif
1913 for (s = buf; *s; s++) 1972 for (s = buf; *s; s++)
1914 if ((*s >= '0' && *s <= '9') || *s == ',') 1973 if (isdigit(*s) || *s == ',')
1915 break; 1974 break;
1916 idx = simple_strtoul(s, NULL, 10); 1975 idx = simple_strtoul(s, NULL, 10);
1917 *s = 0; 1976 *s = 0;
@@ -1950,7 +2009,6 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha
1950 i++, c++) 2009 i++, c++)
1951 if (strcmp(c->name, name) == 0 && c->index == idx) { 2010 if (strcmp(c->name, name) == 0 && c->index == idx) {
1952 strlcpy(c->name, name_new, sizeof(c->name)); 2011 strlcpy(c->name, name_new, sizeof(c->name));
1953 c->name[sizeof(c->name) - 1] = 0;
1954 c->options = options; 2012 c->options = options;
1955 c->index = idx_new; 2013 c->index = idx_new;
1956 return i; 2014 return i;
@@ -1959,12 +2017,12 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha
1959 return -1; 2017 return -1;
1960} 2018}
1961 2019
1962bool console_suspend_enabled = 1; 2020bool console_suspend_enabled = true;
1963EXPORT_SYMBOL(console_suspend_enabled); 2021EXPORT_SYMBOL(console_suspend_enabled);
1964 2022
1965static int __init console_suspend_disable(char *str) 2023static int __init console_suspend_disable(char *str)
1966{ 2024{
1967 console_suspend_enabled = 0; 2025 console_suspend_enabled = false;
1968 return 1; 2026 return 1;
1969} 2027}
1970__setup("no_console_suspend", console_suspend_disable); 2028__setup("no_console_suspend", console_suspend_disable);
@@ -2045,8 +2103,8 @@ EXPORT_SYMBOL(console_lock);
2045/** 2103/**
2046 * console_trylock - try to lock the console system for exclusive use. 2104 * console_trylock - try to lock the console system for exclusive use.
2047 * 2105 *
2048 * Tried to acquire a lock which guarantees that the caller has 2106 * Try to acquire a lock which guarantees that the caller has exclusive
2049 * exclusive access to the console system and the console_drivers list. 2107 * access to the console system and the console_drivers list.
2050 * 2108 *
2051 * returns 1 on success, and 0 on failure to acquire the lock. 2109 * returns 1 on success, and 0 on failure to acquire the lock.
2052 */ 2110 */
@@ -2618,14 +2676,13 @@ EXPORT_SYMBOL(__printk_ratelimit);
2618bool printk_timed_ratelimit(unsigned long *caller_jiffies, 2676bool printk_timed_ratelimit(unsigned long *caller_jiffies,
2619 unsigned int interval_msecs) 2677 unsigned int interval_msecs)
2620{ 2678{
2621 if (*caller_jiffies == 0 2679 unsigned long elapsed = jiffies - *caller_jiffies;
2622 || !time_in_range(jiffies, *caller_jiffies, 2680
2623 *caller_jiffies 2681 if (*caller_jiffies && elapsed <= msecs_to_jiffies(interval_msecs))
2624 + msecs_to_jiffies(interval_msecs))) { 2682 return false;
2625 *caller_jiffies = jiffies; 2683
2626 return true; 2684 *caller_jiffies = jiffies;
2627 } 2685 return true;
2628 return false;
2629} 2686}
2630EXPORT_SYMBOL(printk_timed_ratelimit); 2687EXPORT_SYMBOL(printk_timed_ratelimit);
2631 2688
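
The rewrite keeps the contract: the helper returns true on first use or once the interval has elapsed, storing the current jiffies in the caller-supplied variable, and false otherwise. A typical (illustrative) call site:

    static unsigned long last_complaint;    /* must persist between calls */

    if (printk_timed_ratelimit(&last_complaint, 5 * MSEC_PER_SEC))
            pr_warn("device still not responding, retrying\n");
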
diff --git a/kernel/resource.c b/kernel/resource.c
index 3c2237ac32db..da14b8d09296 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -59,10 +59,12 @@ static DEFINE_RWLOCK(resource_lock);
59static struct resource *bootmem_resource_free; 59static struct resource *bootmem_resource_free;
60static DEFINE_SPINLOCK(bootmem_resource_lock); 60static DEFINE_SPINLOCK(bootmem_resource_lock);
61 61
62static void *r_next(struct seq_file *m, void *v, loff_t *pos) 62static struct resource *next_resource(struct resource *p, bool sibling_only)
63{ 63{
64 struct resource *p = v; 64 /* Caller wants to traverse through siblings only */
65 (*pos)++; 65 if (sibling_only)
66 return p->sibling;
67
66 if (p->child) 68 if (p->child)
67 return p->child; 69 return p->child;
68 while (!p->sibling && p->parent) 70 while (!p->sibling && p->parent)
@@ -70,6 +72,13 @@ static void *r_next(struct seq_file *m, void *v, loff_t *pos)
70 return p->sibling; 72 return p->sibling;
71} 73}
72 74
75static void *r_next(struct seq_file *m, void *v, loff_t *pos)
76{
77 struct resource *p = v;
78 (*pos)++;
79 return (void *)next_resource(p, false);
80}
81
73#ifdef CONFIG_PROC_FS 82#ifdef CONFIG_PROC_FS
74 83
75enum { MAX_IORES_LEVEL = 5 }; 84enum { MAX_IORES_LEVEL = 5 };
@@ -322,16 +331,19 @@ int release_resource(struct resource *old)
322 331
323EXPORT_SYMBOL(release_resource); 332EXPORT_SYMBOL(release_resource);
324 333
325#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY)
326/* 334/*
327 * Finds the lowest memory reosurce exists within [res->start.res->end) 335 * Finds the lowest iomem reosurce exists with-in [res->start.res->end)
328 * the caller must specify res->start, res->end, res->flags and "name". 336 * the caller must specify res->start, res->end, res->flags and "name".
329 * If found, returns 0, res is overwritten, if not found, returns -1. 337 * If found, returns 0, res is overwritten, if not found, returns -1.
338 * This walks through whole tree and not just first level children
339 * until and unless first_level_children_only is true.
330 */ 340 */
331static int find_next_system_ram(struct resource *res, char *name) 341static int find_next_iomem_res(struct resource *res, char *name,
342 bool first_level_children_only)
332{ 343{
333 resource_size_t start, end; 344 resource_size_t start, end;
334 struct resource *p; 345 struct resource *p;
346 bool sibling_only = false;
335 347
336 BUG_ON(!res); 348 BUG_ON(!res);
337 349
@@ -340,8 +352,14 @@ static int find_next_system_ram(struct resource *res, char *name)
340 BUG_ON(start >= end); 352 BUG_ON(start >= end);
341 353
342 read_lock(&resource_lock); 354 read_lock(&resource_lock);
343 for (p = iomem_resource.child; p ; p = p->sibling) { 355
344 /* system ram is just marked as IORESOURCE_MEM */ 356 if (first_level_children_only) {
357 p = iomem_resource.child;
358 sibling_only = true;
359 } else
360 p = &iomem_resource;
361
362 while ((p = next_resource(p, sibling_only))) {
345 if (p->flags != res->flags) 363 if (p->flags != res->flags)
346 continue; 364 continue;
347 if (name && strcmp(p->name, name)) 365 if (name && strcmp(p->name, name))
@@ -353,6 +371,7 @@ static int find_next_system_ram(struct resource *res, char *name)
353 if ((p->end >= start) && (p->start < end)) 371 if ((p->end >= start) && (p->start < end))
354 break; 372 break;
355 } 373 }
374
356 read_unlock(&resource_lock); 375 read_unlock(&resource_lock);
357 if (!p) 376 if (!p)
358 return -1; 377 return -1;
@@ -365,6 +384,70 @@ static int find_next_system_ram(struct resource *res, char *name)
365} 384}
366 385
367/* 386/*
387 * Walks through iomem resources and calls func() with matching resource
388 * ranges. This walks through whole tree and not just first level children.
389 * All the memory ranges which overlap start,end and also match flags and
390 * name are valid candidates.
391 *
392 * @name: name of resource
393 * @flags: resource flags
394 * @start: start addr
395 * @end: end addr
396 */
397int walk_iomem_res(char *name, unsigned long flags, u64 start, u64 end,
398 void *arg, int (*func)(u64, u64, void *))
399{
400 struct resource res;
401 u64 orig_end;
402 int ret = -1;
403
404 res.start = start;
405 res.end = end;
406 res.flags = flags;
407 orig_end = res.end;
408 while ((res.start < res.end) &&
409 (!find_next_iomem_res(&res, name, false))) {
410 ret = (*func)(res.start, res.end, arg);
411 if (ret)
412 break;
413 res.start = res.end + 1;
414 res.end = orig_end;
415 }
416 return ret;
417}
418
419/*
420 * This function calls callback against all memory range of "System RAM"
421 * which are marked as IORESOURCE_MEM and IORESOUCE_BUSY.
422 * Now, this function is only for "System RAM". This function deals with
423 * full ranges and not pfn. If resources are not pfn aligned, dealing
424 * with pfn can truncate ranges.
425 */
426int walk_system_ram_res(u64 start, u64 end, void *arg,
427 int (*func)(u64, u64, void *))
428{
429 struct resource res;
430 u64 orig_end;
431 int ret = -1;
432
433 res.start = start;
434 res.end = end;
435 res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
436 orig_end = res.end;
437 while ((res.start < res.end) &&
438 (!find_next_iomem_res(&res, "System RAM", true))) {
439 ret = (*func)(res.start, res.end, arg);
440 if (ret)
441 break;
442 res.start = res.end + 1;
443 res.end = orig_end;
444 }
445 return ret;
446}
447
448#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY)
449
450/*
368 * This function calls callback against all memory range of "System RAM" 451 * This function calls callback against all memory range of "System RAM"
369 * which are marked as IORESOURCE_MEM and IORESOUCE_BUSY. 452 * which are marked as IORESOURCE_MEM and IORESOUCE_BUSY.
370 * Now, this function is only for "System RAM". 453 * Now, this function is only for "System RAM".
@@ -382,7 +465,7 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
382 res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; 465 res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
383 orig_end = res.end; 466 orig_end = res.end;
384 while ((res.start < res.end) && 467 while ((res.start < res.end) &&
385 (find_next_system_ram(&res, "System RAM") >= 0)) { 468 (find_next_iomem_res(&res, "System RAM", true) >= 0)) {
386 pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT; 469 pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT;
387 end_pfn = (res.end + 1) >> PAGE_SHIFT; 470 end_pfn = (res.end + 1) >> PAGE_SHIFT;
388 if (end_pfn > pfn) 471 if (end_pfn > pfn)
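
walk_iomem_res() and walk_system_ram_res() exist for callers (kexec_file_load in this merge) that want full byte ranges instead of pfns. A hypothetical user that just totals up System RAM, to show the callback contract (resource ranges are inclusive, and a non-zero return stops the walk):

    static int add_ram(u64 start, u64 end, void *arg)
    {
            u64 *total = arg;

            *total += end - start + 1;      /* [start, end] is inclusive */
            return 0;                       /* keep walking */
    }

    static u64 total_system_ram(void)
    {
            u64 total = 0;

            walk_system_ram_res(0, ULLONG_MAX, &total, add_ram);
            return total;
    }
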
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1211575a2208..ec1a286684a5 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2393,6 +2393,13 @@ unsigned long nr_iowait_cpu(int cpu)
2393 return atomic_read(&this->nr_iowait); 2393 return atomic_read(&this->nr_iowait);
2394} 2394}
2395 2395
2396void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
2397{
2398 struct rq *this = this_rq();
2399 *nr_waiters = atomic_read(&this->nr_iowait);
2400 *load = this->cpu_load[0];
2401}
2402
2396#ifdef CONFIG_SMP 2403#ifdef CONFIG_SMP
2397 2404
2398/* 2405/*
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 9f1608f99819..11e7bc434f43 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -147,8 +147,6 @@ use_default:
147 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu)) 147 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu))
148 goto use_default; 148 goto use_default;
149 149
150 trace_cpu_idle_rcuidle(next_state, dev->cpu);
151
152 /* 150 /*
153 * Enter the idle state previously returned by the governor decision. 151 * Enter the idle state previously returned by the governor decision.
154 * This function will block until an interrupt occurs and will take 152 * This function will block until an interrupt occurs and will take
@@ -156,8 +154,6 @@ use_default:
156 */ 154 */
157 entered_state = cpuidle_enter(drv, dev, next_state); 155 entered_state = cpuidle_enter(drv, dev, next_state);
158 156
159 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu);
160
161 if (broadcast) 157 if (broadcast)
162 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu); 158 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu);
163 159
diff --git a/kernel/sched/proc.c b/kernel/sched/proc.c
index 16f5a30f9c88..8ecd552fe4f2 100644
--- a/kernel/sched/proc.c
+++ b/kernel/sched/proc.c
@@ -8,13 +8,6 @@
8 8
9#include "sched.h" 9#include "sched.h"
10 10
11unsigned long this_cpu_load(void)
12{
13 struct rq *this = this_rq();
14 return this->cpu_load[0];
15}
16
17
18/* 11/*
19 * Global load-average calculations 12 * Global load-average calculations
20 * 13 *
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 301bbc24739c..44eb005c6695 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -18,15 +18,17 @@
18#include <linux/compat.h> 18#include <linux/compat.h>
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/seccomp.h> 20#include <linux/seccomp.h>
21#include <linux/slab.h>
22#include <linux/syscalls.h>
21 23
22/* #define SECCOMP_DEBUG 1 */ 24/* #define SECCOMP_DEBUG 1 */
23 25
24#ifdef CONFIG_SECCOMP_FILTER 26#ifdef CONFIG_SECCOMP_FILTER
25#include <asm/syscall.h> 27#include <asm/syscall.h>
26#include <linux/filter.h> 28#include <linux/filter.h>
29#include <linux/pid.h>
27#include <linux/ptrace.h> 30#include <linux/ptrace.h>
28#include <linux/security.h> 31#include <linux/security.h>
29#include <linux/slab.h>
30#include <linux/tracehook.h> 32#include <linux/tracehook.h>
31#include <linux/uaccess.h> 33#include <linux/uaccess.h>
32 34
@@ -54,7 +56,7 @@
54struct seccomp_filter { 56struct seccomp_filter {
55 atomic_t usage; 57 atomic_t usage;
56 struct seccomp_filter *prev; 58 struct seccomp_filter *prev;
57 struct sk_filter *prog; 59 struct bpf_prog *prog;
58}; 60};
59 61
60/* Limit any path through the tree to 256KB worth of instructions. */ 62/* Limit any path through the tree to 256KB worth of instructions. */
@@ -87,7 +89,7 @@ static void populate_seccomp_data(struct seccomp_data *sd)
87 * @filter: filter to verify 89 * @filter: filter to verify
88 * @flen: length of filter 90 * @flen: length of filter
89 * 91 *
90 * Takes a previously checked filter (by sk_chk_filter) and 92 * Takes a previously checked filter (by bpf_check_classic) and
91 * redirects all filter code that loads struct sk_buff data 93 * redirects all filter code that loads struct sk_buff data
92 * and related data through seccomp_bpf_load. It also 94 * and related data through seccomp_bpf_load. It also
93 * enforces length and alignment checking of those loads. 95 * enforces length and alignment checking of those loads.
@@ -172,51 +174,184 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
172 */ 174 */
173static u32 seccomp_run_filters(int syscall) 175static u32 seccomp_run_filters(int syscall)
174{ 176{
175 struct seccomp_filter *f; 177 struct seccomp_filter *f = ACCESS_ONCE(current->seccomp.filter);
176 struct seccomp_data sd; 178 struct seccomp_data sd;
177 u32 ret = SECCOMP_RET_ALLOW; 179 u32 ret = SECCOMP_RET_ALLOW;
178 180
179 /* Ensure unexpected behavior doesn't result in failing open. */ 181 /* Ensure unexpected behavior doesn't result in failing open. */
180 if (WARN_ON(current->seccomp.filter == NULL)) 182 if (unlikely(WARN_ON(f == NULL)))
181 return SECCOMP_RET_KILL; 183 return SECCOMP_RET_KILL;
182 184
185 /* Make sure cross-thread synced filter points somewhere sane. */
186 smp_read_barrier_depends();
187
183 populate_seccomp_data(&sd); 188 populate_seccomp_data(&sd);
184 189
185 /* 190 /*
186 * All filters in the list are evaluated and the lowest BPF return 191 * All filters in the list are evaluated and the lowest BPF return
187 * value always takes priority (ignoring the DATA). 192 * value always takes priority (ignoring the DATA).
188 */ 193 */
189 for (f = current->seccomp.filter; f; f = f->prev) { 194 for (; f; f = f->prev) {
190 u32 cur_ret = SK_RUN_FILTER(f->prog, (void *)&sd); 195 u32 cur_ret = BPF_PROG_RUN(f->prog, (void *)&sd);
191 196
192 if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) 197 if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
193 ret = cur_ret; 198 ret = cur_ret;
194 } 199 }
195 return ret; 200 return ret;
196} 201}
202#endif /* CONFIG_SECCOMP_FILTER */
203
204static inline bool seccomp_may_assign_mode(unsigned long seccomp_mode)
205{
206 assert_spin_locked(&current->sighand->siglock);
207
208 if (current->seccomp.mode && current->seccomp.mode != seccomp_mode)
209 return false;
210
211 return true;
212}
213
214static inline void seccomp_assign_mode(struct task_struct *task,
215 unsigned long seccomp_mode)
216{
217 assert_spin_locked(&task->sighand->siglock);
218
219 task->seccomp.mode = seccomp_mode;
220 /*
221 * Make sure TIF_SECCOMP cannot be set before the mode (and
222 * filter) is set.
223 */
224 smp_mb__before_atomic();
225 set_tsk_thread_flag(task, TIF_SECCOMP);
226}
227
228#ifdef CONFIG_SECCOMP_FILTER
229/* Returns 1 if the parent is an ancestor of the child. */
230static int is_ancestor(struct seccomp_filter *parent,
231 struct seccomp_filter *child)
232{
233 /* NULL is the root ancestor. */
234 if (parent == NULL)
235 return 1;
236 for (; child; child = child->prev)
237 if (child == parent)
238 return 1;
239 return 0;
240}
197 241
198/** 242/**
199 * seccomp_attach_filter: Attaches a seccomp filter to current. 243 * seccomp_can_sync_threads: checks if all threads can be synchronized
244 *
245 * Expects sighand and cred_guard_mutex locks to be held.
246 *
247 * Returns 0 on success, -ve on error, or the pid of a thread which was
248 * either not in the correct seccomp mode or it did not have an ancestral
249 * seccomp filter.
250 */
251static inline pid_t seccomp_can_sync_threads(void)
252{
253 struct task_struct *thread, *caller;
254
255 BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex));
256 assert_spin_locked(&current->sighand->siglock);
257
258 /* Validate all threads being eligible for synchronization. */
259 caller = current;
260 for_each_thread(caller, thread) {
261 pid_t failed;
262
263 /* Skip current, since it is initiating the sync. */
264 if (thread == caller)
265 continue;
266
267 if (thread->seccomp.mode == SECCOMP_MODE_DISABLED ||
268 (thread->seccomp.mode == SECCOMP_MODE_FILTER &&
269 is_ancestor(thread->seccomp.filter,
270 caller->seccomp.filter)))
271 continue;
272
273 /* Return the first thread that cannot be synchronized. */
274 failed = task_pid_vnr(thread);
275 /* If the pid cannot be resolved, then return -ESRCH */
276 if (unlikely(WARN_ON(failed == 0)))
277 failed = -ESRCH;
278 return failed;
279 }
280
281 return 0;
282}
283
284/**
285 * seccomp_sync_threads: sets all threads to use current's filter
286 *
287 * Expects sighand and cred_guard_mutex locks to be held, and for
288 * seccomp_can_sync_threads() to have returned success already
289 * without dropping the locks.
290 *
291 */
292static inline void seccomp_sync_threads(void)
293{
294 struct task_struct *thread, *caller;
295
296 BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex));
297 assert_spin_locked(&current->sighand->siglock);
298
299 /* Synchronize all threads. */
300 caller = current;
301 for_each_thread(caller, thread) {
302 /* Skip current, since it needs no changes. */
303 if (thread == caller)
304 continue;
305
306 /* Get a task reference for the new leaf node. */
307 get_seccomp_filter(caller);
308 /*
309 * Drop the task reference to the shared ancestor since
310 * current's path will hold a reference. (This also
311 * allows a put before the assignment.)
312 */
313 put_seccomp_filter(thread);
314 smp_store_release(&thread->seccomp.filter,
315 caller->seccomp.filter);
316 /*
317 * Opt the other thread into seccomp if needed.
318 * As threads are considered to be trust-realm
319 * equivalent (see ptrace_may_access), it is safe to
320 * allow one thread to transition the other.
321 */
322 if (thread->seccomp.mode == SECCOMP_MODE_DISABLED) {
323 /*
324 * Don't let an unprivileged task work around
325 * the no_new_privs restriction by creating
326 * a thread that sets it up, enters seccomp,
327 * then dies.
328 */
329 if (task_no_new_privs(caller))
330 task_set_no_new_privs(thread);
331
332 seccomp_assign_mode(thread, SECCOMP_MODE_FILTER);
333 }
334 }
335}
336
337/**
338 * seccomp_prepare_filter: Prepares a seccomp filter for use.
200 * @fprog: BPF program to install 339 * @fprog: BPF program to install
201 * 340 *
202 * Returns 0 on success or an errno on failure. 341 * Returns filter on success or an ERR_PTR on failure.
203 */ 342 */
204static long seccomp_attach_filter(struct sock_fprog *fprog) 343static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
205{ 344{
206 struct seccomp_filter *filter; 345 struct seccomp_filter *filter;
207 unsigned long fp_size = fprog->len * sizeof(struct sock_filter); 346 unsigned long fp_size;
208 unsigned long total_insns = fprog->len;
209 struct sock_filter *fp; 347 struct sock_filter *fp;
210 int new_len; 348 int new_len;
211 long ret; 349 long ret;
212 350
213 if (fprog->len == 0 || fprog->len > BPF_MAXINSNS) 351 if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
214 return -EINVAL; 352 return ERR_PTR(-EINVAL);
215 353 BUG_ON(INT_MAX / fprog->len < sizeof(struct sock_filter));
216 for (filter = current->seccomp.filter; filter; filter = filter->prev) 354 fp_size = fprog->len * sizeof(struct sock_filter);
217 total_insns += filter->prog->len + 4; /* include a 4 instr penalty */
218 if (total_insns > MAX_INSNS_PER_PATH)
219 return -ENOMEM;
220 355
221 /* 356 /*
222 * Installing a seccomp filter requires that the task has 357 * Installing a seccomp filter requires that the task has
@@ -224,14 +359,14 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
224 * This avoids scenarios where unprivileged tasks can affect the 359 * This avoids scenarios where unprivileged tasks can affect the
225 * behavior of privileged children. 360 * behavior of privileged children.
226 */ 361 */
227 if (!current->no_new_privs && 362 if (!task_no_new_privs(current) &&
228 security_capable_noaudit(current_cred(), current_user_ns(), 363 security_capable_noaudit(current_cred(), current_user_ns(),
229 CAP_SYS_ADMIN) != 0) 364 CAP_SYS_ADMIN) != 0)
230 return -EACCES; 365 return ERR_PTR(-EACCES);
231 366
232 fp = kzalloc(fp_size, GFP_KERNEL|__GFP_NOWARN); 367 fp = kzalloc(fp_size, GFP_KERNEL|__GFP_NOWARN);
233 if (!fp) 368 if (!fp)
234 return -ENOMEM; 369 return ERR_PTR(-ENOMEM);
235 370
236 /* Copy the instructions from fprog. */ 371 /* Copy the instructions from fprog. */
237 ret = -EFAULT; 372 ret = -EFAULT;
@@ -239,7 +374,7 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
239 goto free_prog; 374 goto free_prog;
240 375
241 /* Check and rewrite the fprog via the skb checker */ 376 /* Check and rewrite the fprog via the skb checker */
242 ret = sk_chk_filter(fp, fprog->len); 377 ret = bpf_check_classic(fp, fprog->len);
243 if (ret) 378 if (ret)
244 goto free_prog; 379 goto free_prog;
245 380
@@ -248,8 +383,8 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
248 if (ret) 383 if (ret)
249 goto free_prog; 384 goto free_prog;
250 385
251 /* Convert 'sock_filter' insns to 'sock_filter_int' insns */ 386 /* Convert 'sock_filter' insns to 'bpf_insn' insns */
252 ret = sk_convert_filter(fp, fprog->len, NULL, &new_len); 387 ret = bpf_convert_filter(fp, fprog->len, NULL, &new_len);
253 if (ret) 388 if (ret)
254 goto free_prog; 389 goto free_prog;
255 390
@@ -260,12 +395,12 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
260 if (!filter) 395 if (!filter)
261 goto free_prog; 396 goto free_prog;
262 397
263 filter->prog = kzalloc(sk_filter_size(new_len), 398 filter->prog = kzalloc(bpf_prog_size(new_len),
264 GFP_KERNEL|__GFP_NOWARN); 399 GFP_KERNEL|__GFP_NOWARN);
265 if (!filter->prog) 400 if (!filter->prog)
266 goto free_filter; 401 goto free_filter;
267 402
268 ret = sk_convert_filter(fp, fprog->len, filter->prog->insnsi, &new_len); 403 ret = bpf_convert_filter(fp, fprog->len, filter->prog->insnsi, &new_len);
269 if (ret) 404 if (ret)
270 goto free_filter_prog; 405 goto free_filter_prog;
271 kfree(fp); 406 kfree(fp);
@@ -273,15 +408,9 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
273 atomic_set(&filter->usage, 1); 408 atomic_set(&filter->usage, 1);
274 filter->prog->len = new_len; 409 filter->prog->len = new_len;
275 410
276 sk_filter_select_runtime(filter->prog); 411 bpf_prog_select_runtime(filter->prog);
277 412
278 /* 413 return filter;
279 * If there is an existing filter, make it the prev and don't drop its
280 * task reference.
281 */
282 filter->prev = current->seccomp.filter;
283 current->seccomp.filter = filter;
284 return 0;
285 414
286free_filter_prog: 415free_filter_prog:
287 kfree(filter->prog); 416 kfree(filter->prog);
@@ -289,19 +418,20 @@ free_filter:
289 kfree(filter); 418 kfree(filter);
290free_prog: 419free_prog:
291 kfree(fp); 420 kfree(fp);
292 return ret; 421 return ERR_PTR(ret);
293} 422}
294 423
295/** 424/**
296 * seccomp_attach_user_filter - attaches a user-supplied sock_fprog 425 * seccomp_prepare_user_filter - prepares a user-supplied sock_fprog
297 * @user_filter: pointer to the user data containing a sock_fprog. 426 * @user_filter: pointer to the user data containing a sock_fprog.
298 * 427 *
299 * Returns 0 on success and non-zero otherwise. 428 * Returns 0 on success and non-zero otherwise.
300 */ 429 */
301static long seccomp_attach_user_filter(char __user *user_filter) 430static struct seccomp_filter *
431seccomp_prepare_user_filter(const char __user *user_filter)
302{ 432{
303 struct sock_fprog fprog; 433 struct sock_fprog fprog;
304 long ret = -EFAULT; 434 struct seccomp_filter *filter = ERR_PTR(-EFAULT);
305 435
306#ifdef CONFIG_COMPAT 436#ifdef CONFIG_COMPAT
307 if (is_compat_task()) { 437 if (is_compat_task()) {
@@ -314,9 +444,56 @@ static long seccomp_attach_user_filter(char __user *user_filter)
314#endif 444#endif
315 if (copy_from_user(&fprog, user_filter, sizeof(fprog))) 445 if (copy_from_user(&fprog, user_filter, sizeof(fprog)))
316 goto out; 446 goto out;
317 ret = seccomp_attach_filter(&fprog); 447 filter = seccomp_prepare_filter(&fprog);
318out: 448out:
319 return ret; 449 return filter;
450}
451
452/**
453 * seccomp_attach_filter: validate and attach filter
454 * @flags: flags to change filter behavior
455 * @filter: seccomp filter to add to the current process
456 *
457 * Caller must be holding current->sighand->siglock lock.
458 *
459 * Returns 0 on success, -ve on error.
460 */
461static long seccomp_attach_filter(unsigned int flags,
462 struct seccomp_filter *filter)
463{
464 unsigned long total_insns;
465 struct seccomp_filter *walker;
466
467 assert_spin_locked(&current->sighand->siglock);
468
469 /* Validate resulting filter length. */
470 total_insns = filter->prog->len;
471 for (walker = current->seccomp.filter; walker; walker = walker->prev)
472 total_insns += walker->prog->len + 4; /* 4 instr penalty */
473 if (total_insns > MAX_INSNS_PER_PATH)
474 return -ENOMEM;
475
476 /* If thread sync has been requested, check that it is possible. */
477 if (flags & SECCOMP_FILTER_FLAG_TSYNC) {
478 int ret;
479
480 ret = seccomp_can_sync_threads();
481 if (ret)
482 return ret;
483 }
484
485 /*
486 * If there is an existing filter, make it the prev and don't drop its
487 * task reference.
488 */
489 filter->prev = current->seccomp.filter;
490 current->seccomp.filter = filter;
491
492 /* Now that the new filter is in place, synchronize to all threads. */
493 if (flags & SECCOMP_FILTER_FLAG_TSYNC)
494 seccomp_sync_threads();
495
496 return 0;
320} 497}
321 498
322/* get_seccomp_filter - increments the reference count of the filter on @tsk */ 499/* get_seccomp_filter - increments the reference count of the filter on @tsk */
@@ -329,6 +506,14 @@ void get_seccomp_filter(struct task_struct *tsk)
329 atomic_inc(&orig->usage); 506 atomic_inc(&orig->usage);
330} 507}
331 508
509static inline void seccomp_filter_free(struct seccomp_filter *filter)
510{
511 if (filter) {
512 bpf_prog_free(filter->prog);
513 kfree(filter);
514 }
515}
516
332/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */ 517/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */
333void put_seccomp_filter(struct task_struct *tsk) 518void put_seccomp_filter(struct task_struct *tsk)
334{ 519{
@@ -337,8 +522,7 @@ void put_seccomp_filter(struct task_struct *tsk)
337 while (orig && atomic_dec_and_test(&orig->usage)) { 522 while (orig && atomic_dec_and_test(&orig->usage)) {
338 struct seccomp_filter *freeme = orig; 523 struct seccomp_filter *freeme = orig;
339 orig = orig->prev; 524 orig = orig->prev;
340 sk_filter_free(freeme->prog); 525 seccomp_filter_free(freeme);
341 kfree(freeme);
342 } 526 }
343} 527}
344 528
@@ -382,12 +566,17 @@ static int mode1_syscalls_32[] = {
382 566
383int __secure_computing(int this_syscall) 567int __secure_computing(int this_syscall)
384{ 568{
385 int mode = current->seccomp.mode;
386 int exit_sig = 0; 569 int exit_sig = 0;
387 int *syscall; 570 int *syscall;
388 u32 ret; 571 u32 ret;
389 572
390 switch (mode) { 573 /*
574 * Make sure that any changes to mode from another thread have
575 * been seen after TIF_SECCOMP was seen.
576 */
577 rmb();
578
579 switch (current->seccomp.mode) {
391 case SECCOMP_MODE_STRICT: 580 case SECCOMP_MODE_STRICT:
392 syscall = mode1_syscalls; 581 syscall = mode1_syscalls;
393#ifdef CONFIG_COMPAT 582#ifdef CONFIG_COMPAT
@@ -473,47 +662,152 @@ long prctl_get_seccomp(void)
473} 662}
474 663
475/** 664/**
476 * prctl_set_seccomp: configures current->seccomp.mode 665 * seccomp_set_mode_strict: internal function for setting strict seccomp
477 * @seccomp_mode: requested mode to use
478 * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER
479 * 666 *
480 * This function may be called repeatedly with a @seccomp_mode of 667 * Once current->seccomp.mode is non-zero, it may not be changed.
481 * SECCOMP_MODE_FILTER to install additional filters. Every filter 668 *
482 * successfully installed will be evaluated (in reverse order) for each system 669 * Returns 0 on success or -EINVAL on failure.
483 * call the task makes. 670 */
671static long seccomp_set_mode_strict(void)
672{
673 const unsigned long seccomp_mode = SECCOMP_MODE_STRICT;
674 long ret = -EINVAL;
675
676 spin_lock_irq(&current->sighand->siglock);
677
678 if (!seccomp_may_assign_mode(seccomp_mode))
679 goto out;
680
681#ifdef TIF_NOTSC
682 disable_TSC();
683#endif
684 seccomp_assign_mode(current, seccomp_mode);
685 ret = 0;
686
687out:
688 spin_unlock_irq(&current->sighand->siglock);
689
690 return ret;
691}
692
693#ifdef CONFIG_SECCOMP_FILTER
694/**
695 * seccomp_set_mode_filter: internal function for setting seccomp filter
696 * @flags: flags to change filter behavior
697 * @filter: struct sock_fprog containing filter
698 *
699 * This function may be called repeatedly to install additional filters.
700 * Every filter successfully installed will be evaluated (in reverse order)
701 * for each system call the task makes.
484 * 702 *
485 * Once current->seccomp.mode is non-zero, it may not be changed. 703 * Once current->seccomp.mode is non-zero, it may not be changed.
486 * 704 *
487 * Returns 0 on success or -EINVAL on failure. 705 * Returns 0 on success or -EINVAL on failure.
488 */ 706 */
489long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter) 707static long seccomp_set_mode_filter(unsigned int flags,
708 const char __user *filter)
490{ 709{
710 const unsigned long seccomp_mode = SECCOMP_MODE_FILTER;
711 struct seccomp_filter *prepared = NULL;
491 long ret = -EINVAL; 712 long ret = -EINVAL;
492 713
493 if (current->seccomp.mode && 714 /* Validate flags. */
494 current->seccomp.mode != seccomp_mode) 715 if (flags & ~SECCOMP_FILTER_FLAG_MASK)
716 return -EINVAL;
717
718 /* Prepare the new filter before holding any locks. */
719 prepared = seccomp_prepare_user_filter(filter);
720 if (IS_ERR(prepared))
721 return PTR_ERR(prepared);
722
723 /*
724 * Make sure we cannot change seccomp or nnp state via TSYNC
725 * while another thread is in the middle of calling exec.
726 */
727 if (flags & SECCOMP_FILTER_FLAG_TSYNC &&
728 mutex_lock_killable(&current->signal->cred_guard_mutex))
729 goto out_free;
730
731 spin_lock_irq(&current->sighand->siglock);
732
733 if (!seccomp_may_assign_mode(seccomp_mode))
734 goto out;
735
736 ret = seccomp_attach_filter(flags, prepared);
737 if (ret)
495 goto out; 738 goto out;
739 /* Do not free the successfully attached filter. */
740 prepared = NULL;
741
742 seccomp_assign_mode(current, seccomp_mode);
743out:
744 spin_unlock_irq(&current->sighand->siglock);
745 if (flags & SECCOMP_FILTER_FLAG_TSYNC)
746 mutex_unlock(&current->signal->cred_guard_mutex);
747out_free:
748 seccomp_filter_free(prepared);
749 return ret;
750}
751#else
752static inline long seccomp_set_mode_filter(unsigned int flags,
753 const char __user *filter)
754{
755 return -EINVAL;
756}
757#endif
758
759/* Common entry point for both prctl and syscall. */
760static long do_seccomp(unsigned int op, unsigned int flags,
761 const char __user *uargs)
762{
763 switch (op) {
764 case SECCOMP_SET_MODE_STRICT:
765 if (flags != 0 || uargs != NULL)
766 return -EINVAL;
767 return seccomp_set_mode_strict();
768 case SECCOMP_SET_MODE_FILTER:
769 return seccomp_set_mode_filter(flags, uargs);
770 default:
771 return -EINVAL;
772 }
773}
774
775SYSCALL_DEFINE3(seccomp, unsigned int, op, unsigned int, flags,
776 const char __user *, uargs)
777{
778 return do_seccomp(op, flags, uargs);
779}
780
781/**
782 * prctl_set_seccomp: configures current->seccomp.mode
783 * @seccomp_mode: requested mode to use
784 * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER
785 *
786 * Returns 0 on success or -EINVAL on failure.
787 */
788long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
789{
790 unsigned int op;
791 char __user *uargs;
496 792
497 switch (seccomp_mode) { 793 switch (seccomp_mode) {
498 case SECCOMP_MODE_STRICT: 794 case SECCOMP_MODE_STRICT:
499 ret = 0; 795 op = SECCOMP_SET_MODE_STRICT;
500#ifdef TIF_NOTSC 796 /*
501 disable_TSC(); 797 * Setting strict mode through prctl always ignored filter,
502#endif 798 * so make sure it is always NULL here to pass the internal
799 * check in do_seccomp().
800 */
801 uargs = NULL;
503 break; 802 break;
504#ifdef CONFIG_SECCOMP_FILTER
505 case SECCOMP_MODE_FILTER: 803 case SECCOMP_MODE_FILTER:
506 ret = seccomp_attach_user_filter(filter); 804 op = SECCOMP_SET_MODE_FILTER;
507 if (ret) 805 uargs = filter;
508 goto out;
509 break; 806 break;
510#endif
511 default: 807 default:
512 goto out; 808 return -EINVAL;
513 } 809 }
514 810
515 current->seccomp.mode = seccomp_mode; 811 /* prctl interface doesn't have flags, so they are always zero. */
516 set_thread_flag(TIF_SECCOMP); 812 return do_seccomp(op, 0, uargs);
517out:
518 return ret;
519} 813}
diff --git a/kernel/signal.c b/kernel/signal.c
index 40b76e351e64..8f0876f9f6dd 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2170,8 +2170,7 @@ static int ptrace_signal(int signr, siginfo_t *info)
2170 return signr; 2170 return signr;
2171} 2171}
2172 2172
2173int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, 2173int get_signal(struct ksignal *ksig)
2174 struct pt_regs *regs, void *cookie)
2175{ 2174{
2176 struct sighand_struct *sighand = current->sighand; 2175 struct sighand_struct *sighand = current->sighand;
2177 struct signal_struct *signal = current->signal; 2176 struct signal_struct *signal = current->signal;
@@ -2241,13 +2240,13 @@ relock:
2241 goto relock; 2240 goto relock;
2242 } 2241 }
2243 2242
2244 signr = dequeue_signal(current, &current->blocked, info); 2243 signr = dequeue_signal(current, &current->blocked, &ksig->info);
2245 2244
2246 if (!signr) 2245 if (!signr)
2247 break; /* will return 0 */ 2246 break; /* will return 0 */
2248 2247
2249 if (unlikely(current->ptrace) && signr != SIGKILL) { 2248 if (unlikely(current->ptrace) && signr != SIGKILL) {
2250 signr = ptrace_signal(signr, info); 2249 signr = ptrace_signal(signr, &ksig->info);
2251 if (!signr) 2250 if (!signr)
2252 continue; 2251 continue;
2253 } 2252 }
@@ -2255,13 +2254,13 @@ relock:
2255 ka = &sighand->action[signr-1]; 2254 ka = &sighand->action[signr-1];
2256 2255
2257 /* Trace actually delivered signals. */ 2256 /* Trace actually delivered signals. */
2258 trace_signal_deliver(signr, info, ka); 2257 trace_signal_deliver(signr, &ksig->info, ka);
2259 2258
2260 if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */ 2259 if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */
2261 continue; 2260 continue;
2262 if (ka->sa.sa_handler != SIG_DFL) { 2261 if (ka->sa.sa_handler != SIG_DFL) {
2263 /* Run the handler. */ 2262 /* Run the handler. */
2264 *return_ka = *ka; 2263 ksig->ka = *ka;
2265 2264
2266 if (ka->sa.sa_flags & SA_ONESHOT) 2265 if (ka->sa.sa_flags & SA_ONESHOT)
2267 ka->sa.sa_handler = SIG_DFL; 2266 ka->sa.sa_handler = SIG_DFL;
@@ -2311,7 +2310,7 @@ relock:
2311 spin_lock_irq(&sighand->siglock); 2310 spin_lock_irq(&sighand->siglock);
2312 } 2311 }
2313 2312
2314 if (likely(do_signal_stop(info->si_signo))) { 2313 if (likely(do_signal_stop(ksig->info.si_signo))) {
2315 /* It released the siglock. */ 2314 /* It released the siglock. */
2316 goto relock; 2315 goto relock;
2317 } 2316 }
@@ -2332,7 +2331,7 @@ relock:
2332 2331
2333 if (sig_kernel_coredump(signr)) { 2332 if (sig_kernel_coredump(signr)) {
2334 if (print_fatal_signals) 2333 if (print_fatal_signals)
2335 print_fatal_signal(info->si_signo); 2334 print_fatal_signal(ksig->info.si_signo);
2336 proc_coredump_connector(current); 2335 proc_coredump_connector(current);
2337 /* 2336 /*
2338 * If it was able to dump core, this kills all 2337 * If it was able to dump core, this kills all
@@ -2342,34 +2341,32 @@ relock:
2342 * first and our do_group_exit call below will use 2341 * first and our do_group_exit call below will use
2343 * that value and ignore the one we pass it. 2342 * that value and ignore the one we pass it.
2344 */ 2343 */
2345 do_coredump(info); 2344 do_coredump(&ksig->info);
2346 } 2345 }
2347 2346
2348 /* 2347 /*
2349 * Death signals, no core dump. 2348 * Death signals, no core dump.
2350 */ 2349 */
2351 do_group_exit(info->si_signo); 2350 do_group_exit(ksig->info.si_signo);
2352 /* NOTREACHED */ 2351 /* NOTREACHED */
2353 } 2352 }
2354 spin_unlock_irq(&sighand->siglock); 2353 spin_unlock_irq(&sighand->siglock);
2355 return signr; 2354
2355 ksig->sig = signr;
2356 return ksig->sig > 0;
2356} 2357}
2357 2358
2358/** 2359/**
2359 * signal_delivered - 2360 * signal_delivered -
2360 * @sig: number of signal being delivered 2361 * @ksig: kernel signal struct
2361 * @info: siginfo_t of signal being delivered
2362 * @ka: sigaction setting that chose the handler
2363 * @regs: user register state
2364 * @stepping: nonzero if debugger single-step or block-step in use 2362 * @stepping: nonzero if debugger single-step or block-step in use
2365 * 2363 *
2366 * This function should be called when a signal has successfully been 2364 * This function should be called when a signal has successfully been
2367 * delivered. It updates the blocked signals accordingly (@ka->sa.sa_mask 2365 * delivered. It updates the blocked signals accordingly (@ksig->ka.sa.sa_mask
2368 * is always blocked, and the signal itself is blocked unless %SA_NODEFER 2366 * is always blocked, and the signal itself is blocked unless %SA_NODEFER
2369 * is set in @ka->sa.sa_flags. Tracing is notified. 2367 * is set in @ksig->ka.sa.sa_flags. Tracing is notified.
2370 */ 2368 */
2371void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka, 2369static void signal_delivered(struct ksignal *ksig, int stepping)
2372 struct pt_regs *regs, int stepping)
2373{ 2370{
2374 sigset_t blocked; 2371 sigset_t blocked;
2375 2372
@@ -2379,11 +2376,11 @@ void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka,
2379 simply clear the restore sigmask flag. */ 2376 simply clear the restore sigmask flag. */
2380 clear_restore_sigmask(); 2377 clear_restore_sigmask();
2381 2378
2382 sigorsets(&blocked, &current->blocked, &ka->sa.sa_mask); 2379 sigorsets(&blocked, &current->blocked, &ksig->ka.sa.sa_mask);
2383 if (!(ka->sa.sa_flags & SA_NODEFER)) 2380 if (!(ksig->ka.sa.sa_flags & SA_NODEFER))
2384 sigaddset(&blocked, sig); 2381 sigaddset(&blocked, ksig->sig);
2385 set_current_blocked(&blocked); 2382 set_current_blocked(&blocked);
2386 tracehook_signal_handler(sig, info, ka, regs, stepping); 2383 tracehook_signal_handler(stepping);
2387} 2384}
2388 2385
2389void signal_setup_done(int failed, struct ksignal *ksig, int stepping) 2386void signal_setup_done(int failed, struct ksignal *ksig, int stepping)
@@ -2391,8 +2388,7 @@ void signal_setup_done(int failed, struct ksignal *ksig, int stepping)
2391 if (failed) 2388 if (failed)
2392 force_sigsegv(ksig->sig, current); 2389 force_sigsegv(ksig->sig, current);
2393 else 2390 else
2394 signal_delivered(ksig->sig, &ksig->info, &ksig->ka, 2391 signal_delivered(ksig, stepping);
2395 signal_pt_regs(), stepping);
2396} 2392}
2397 2393
2398/* 2394/*
diff --git a/kernel/smp.c b/kernel/smp.c
index 487653b5844f..aff8aa14f547 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -670,7 +670,7 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
670 if (cond_func(cpu, info)) { 670 if (cond_func(cpu, info)) {
671 ret = smp_call_function_single(cpu, func, 671 ret = smp_call_function_single(cpu, func,
672 info, wait); 672 info, wait);
673 WARN_ON_ONCE(!ret); 673 WARN_ON_ONCE(ret);
674 } 674 }
675 preempt_enable(); 675 preempt_enable();
676 } 676 }
diff --git a/kernel/sys.c b/kernel/sys.c
index 66a751ebf9d9..ce8129192a26 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1990,12 +1990,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
1990 if (arg2 != 1 || arg3 || arg4 || arg5) 1990 if (arg2 != 1 || arg3 || arg4 || arg5)
1991 return -EINVAL; 1991 return -EINVAL;
1992 1992
1993 current->no_new_privs = 1; 1993 task_set_no_new_privs(current);
1994 break; 1994 break;
1995 case PR_GET_NO_NEW_PRIVS: 1995 case PR_GET_NO_NEW_PRIVS:
1996 if (arg2 || arg3 || arg4 || arg5) 1996 if (arg2 || arg3 || arg4 || arg5)
1997 return -EINVAL; 1997 return -EINVAL;
1998 return current->no_new_privs ? 1 : 0; 1998 return task_no_new_privs(current) ? 1 : 0;
1999 case PR_GET_THP_DISABLE: 1999 case PR_GET_THP_DISABLE:
2000 if (arg2 || arg3 || arg4 || arg5) 2000 if (arg2 || arg3 || arg4 || arg5)
2001 return -EINVAL; 2001 return -EINVAL;
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 36441b51b5df..391d4ddb6f4b 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -25,6 +25,7 @@ cond_syscall(sys_swapon);
25cond_syscall(sys_swapoff); 25cond_syscall(sys_swapoff);
26cond_syscall(sys_kexec_load); 26cond_syscall(sys_kexec_load);
27cond_syscall(compat_sys_kexec_load); 27cond_syscall(compat_sys_kexec_load);
28cond_syscall(sys_kexec_file_load);
28cond_syscall(sys_init_module); 29cond_syscall(sys_init_module);
29cond_syscall(sys_finit_module); 30cond_syscall(sys_finit_module);
30cond_syscall(sys_delete_module); 31cond_syscall(sys_delete_module);
@@ -197,6 +198,7 @@ cond_syscall(compat_sys_timerfd_settime);
197cond_syscall(compat_sys_timerfd_gettime); 198cond_syscall(compat_sys_timerfd_gettime);
198cond_syscall(sys_eventfd); 199cond_syscall(sys_eventfd);
199cond_syscall(sys_eventfd2); 200cond_syscall(sys_eventfd2);
201cond_syscall(sys_memfd_create);
200 202
201/* performance counters: */ 203/* performance counters: */
202cond_syscall(sys_perf_event_open); 204cond_syscall(sys_perf_event_open);
@@ -213,3 +215,6 @@ cond_syscall(compat_sys_open_by_handle_at);
213 215
214/* compare kernel pointers */ 216/* compare kernel pointers */
215cond_syscall(sys_kcmp); 217cond_syscall(sys_kcmp);
218
219/* operate on Secure Computing state */
220cond_syscall(sys_seccomp);
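The sys_ni.c additions register sys_kexec_file_load, sys_memfd_create and sys_seccomp as conditional syscalls: when the implementing code is not built in, the symbol resolves to the "not implemented" stub so the syscall table still links and the call returns -ENOSYS. A small sketch of the underlying weak-symbol idea; the kernel does this with a weak alias to sys_ni_syscall, and the function name below is only an example:

    #include <errno.h>
    #include <stdio.h>

    /* Weak fallback: a strong definition elsewhere (the real syscall,
     * when its subsystem is compiled in) overrides this at link time. */
    long __attribute__((weak)) sys_seccomp(void)
    {
            return -ENOSYS;
    }

    int main(void)
    {
            printf("%ld\n", sys_seccomp());  /* -ENOSYS unless overridden */
            return 0;
    }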
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 75b22e22a72c..75875a741b5e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1240,8 +1240,7 @@ static struct ctl_table vm_table[] = {
1240 .maxlen = sizeof(unsigned long), 1240 .maxlen = sizeof(unsigned long),
1241 .mode = 0644, 1241 .mode = 0644,
1242 .proc_handler = hugetlb_sysctl_handler, 1242 .proc_handler = hugetlb_sysctl_handler,
1243 .extra1 = (void *)&hugetlb_zero, 1243 .extra1 = &zero,
1244 .extra2 = (void *)&hugetlb_infinity,
1245 }, 1244 },
1246#ifdef CONFIG_NUMA 1245#ifdef CONFIG_NUMA
1247 { 1246 {
@@ -1250,8 +1249,7 @@ static struct ctl_table vm_table[] = {
1250 .maxlen = sizeof(unsigned long), 1249 .maxlen = sizeof(unsigned long),
1251 .mode = 0644, 1250 .mode = 0644,
1252 .proc_handler = &hugetlb_mempolicy_sysctl_handler, 1251 .proc_handler = &hugetlb_mempolicy_sysctl_handler,
1253 .extra1 = (void *)&hugetlb_zero, 1252 .extra1 = &zero,
1254 .extra2 = (void *)&hugetlb_infinity,
1255 }, 1253 },
1256#endif 1254#endif
1257 { 1255 {
@@ -1274,8 +1272,7 @@ static struct ctl_table vm_table[] = {
1274 .maxlen = sizeof(unsigned long), 1272 .maxlen = sizeof(unsigned long),
1275 .mode = 0644, 1273 .mode = 0644,
1276 .proc_handler = hugetlb_overcommit_handler, 1274 .proc_handler = hugetlb_overcommit_handler,
1277 .extra1 = (void *)&hugetlb_zero, 1275 .extra1 = &zero,
1278 .extra2 = (void *)&hugetlb_infinity,
1279 }, 1276 },
1280#endif 1277#endif
1281 { 1278 {
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 653cbbd9e7ad..e4ba9a5a5ccb 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -522,6 +522,7 @@ static const struct bin_table bin_net_ipv6_conf_var_table[] = {
522 { CTL_INT, NET_IPV6_ACCEPT_RA_RT_INFO_MAX_PLEN, "accept_ra_rt_info_max_plen" }, 522 { CTL_INT, NET_IPV6_ACCEPT_RA_RT_INFO_MAX_PLEN, "accept_ra_rt_info_max_plen" },
523 { CTL_INT, NET_IPV6_PROXY_NDP, "proxy_ndp" }, 523 { CTL_INT, NET_IPV6_PROXY_NDP, "proxy_ndp" },
524 { CTL_INT, NET_IPV6_ACCEPT_SOURCE_ROUTE, "accept_source_route" }, 524 { CTL_INT, NET_IPV6_ACCEPT_SOURCE_ROUTE, "accept_source_route" },
525 { CTL_INT, NET_IPV6_ACCEPT_RA_FROM_LOCAL, "accept_ra_from_local" },
525 {} 526 {}
526}; 527};
527 528
diff --git a/kernel/system_keyring.c b/kernel/system_keyring.c
index 52ebc70263f4..875f64e8935b 100644
--- a/kernel/system_keyring.c
+++ b/kernel/system_keyring.c
@@ -89,6 +89,7 @@ static __init int load_system_certificate_list(void)
89 pr_err("Problem loading in-kernel X.509 certificate (%ld)\n", 89 pr_err("Problem loading in-kernel X.509 certificate (%ld)\n",
90 PTR_ERR(key)); 90 PTR_ERR(key));
91 } else { 91 } else {
92 set_bit(KEY_FLAG_BUILTIN, &key_ref_to_ptr(key)->flags);
92 pr_notice("Loaded X.509 cert '%s'\n", 93 pr_notice("Loaded X.509 cert '%s'\n",
93 key_ref_to_ptr(key)->description); 94 key_ref_to_ptr(key)->description);
94 key_ref_put(key); 95 key_ref_put(key);
diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c
index 12d6ebbfdd83..0dbab6d1acb4 100644
--- a/kernel/test_kprobes.c
+++ b/kernel/test_kprobes.c
@@ -14,6 +14,8 @@
14 * the GNU General Public License for more details. 14 * the GNU General Public License for more details.
15 */ 15 */
16 16
17#define pr_fmt(fmt) "Kprobe smoke test: " fmt
18
17#include <linux/kernel.h> 19#include <linux/kernel.h>
18#include <linux/kprobes.h> 20#include <linux/kprobes.h>
19#include <linux/random.h> 21#include <linux/random.h>
@@ -41,8 +43,7 @@ static void kp_post_handler(struct kprobe *p, struct pt_regs *regs,
41{ 43{
42 if (preh_val != (rand1 / div_factor)) { 44 if (preh_val != (rand1 / div_factor)) {
43 handler_errors++; 45 handler_errors++;
44 printk(KERN_ERR "Kprobe smoke test failed: " 46 pr_err("incorrect value in post_handler\n");
45 "incorrect value in post_handler\n");
46 } 47 }
47 posth_val = preh_val + div_factor; 48 posth_val = preh_val + div_factor;
48} 49}
@@ -59,8 +60,7 @@ static int test_kprobe(void)
59 60
60 ret = register_kprobe(&kp); 61 ret = register_kprobe(&kp);
61 if (ret < 0) { 62 if (ret < 0) {
62 printk(KERN_ERR "Kprobe smoke test failed: " 63 pr_err("register_kprobe returned %d\n", ret);
63 "register_kprobe returned %d\n", ret);
64 return ret; 64 return ret;
65 } 65 }
66 66
@@ -68,14 +68,12 @@ static int test_kprobe(void)
68 unregister_kprobe(&kp); 68 unregister_kprobe(&kp);
69 69
70 if (preh_val == 0) { 70 if (preh_val == 0) {
71 printk(KERN_ERR "Kprobe smoke test failed: " 71 pr_err("kprobe pre_handler not called\n");
72 "kprobe pre_handler not called\n");
73 handler_errors++; 72 handler_errors++;
74 } 73 }
75 74
76 if (posth_val == 0) { 75 if (posth_val == 0) {
77 printk(KERN_ERR "Kprobe smoke test failed: " 76 pr_err("kprobe post_handler not called\n");
78 "kprobe post_handler not called\n");
79 handler_errors++; 77 handler_errors++;
80 } 78 }
81 79
@@ -98,8 +96,7 @@ static void kp_post_handler2(struct kprobe *p, struct pt_regs *regs,
98{ 96{
99 if (preh_val != (rand1 / div_factor) + 1) { 97 if (preh_val != (rand1 / div_factor) + 1) {
100 handler_errors++; 98 handler_errors++;
101 printk(KERN_ERR "Kprobe smoke test failed: " 99 pr_err("incorrect value in post_handler2\n");
102 "incorrect value in post_handler2\n");
103 } 100 }
104 posth_val = preh_val + div_factor; 101 posth_val = preh_val + div_factor;
105} 102}
@@ -120,8 +117,7 @@ static int test_kprobes(void)
120 kp.flags = 0; 117 kp.flags = 0;
121 ret = register_kprobes(kps, 2); 118 ret = register_kprobes(kps, 2);
122 if (ret < 0) { 119 if (ret < 0) {
123 printk(KERN_ERR "Kprobe smoke test failed: " 120 pr_err("register_kprobes returned %d\n", ret);
124 "register_kprobes returned %d\n", ret);
125 return ret; 121 return ret;
126 } 122 }
127 123
@@ -130,14 +126,12 @@ static int test_kprobes(void)
130 ret = target(rand1); 126 ret = target(rand1);
131 127
132 if (preh_val == 0) { 128 if (preh_val == 0) {
133 printk(KERN_ERR "Kprobe smoke test failed: " 129 pr_err("kprobe pre_handler not called\n");
134 "kprobe pre_handler not called\n");
135 handler_errors++; 130 handler_errors++;
136 } 131 }
137 132
138 if (posth_val == 0) { 133 if (posth_val == 0) {
139 printk(KERN_ERR "Kprobe smoke test failed: " 134 pr_err("kprobe post_handler not called\n");
140 "kprobe post_handler not called\n");
141 handler_errors++; 135 handler_errors++;
142 } 136 }
143 137
@@ -146,14 +140,12 @@ static int test_kprobes(void)
146 ret = target2(rand1); 140 ret = target2(rand1);
147 141
148 if (preh_val == 0) { 142 if (preh_val == 0) {
149 printk(KERN_ERR "Kprobe smoke test failed: " 143 pr_err("kprobe pre_handler2 not called\n");
150 "kprobe pre_handler2 not called\n");
151 handler_errors++; 144 handler_errors++;
152 } 145 }
153 146
154 if (posth_val == 0) { 147 if (posth_val == 0) {
155 printk(KERN_ERR "Kprobe smoke test failed: " 148 pr_err("kprobe post_handler2 not called\n");
156 "kprobe post_handler2 not called\n");
157 handler_errors++; 149 handler_errors++;
158 } 150 }
159 151
@@ -166,8 +158,7 @@ static u32 j_kprobe_target(u32 value)
166{ 158{
167 if (value != rand1) { 159 if (value != rand1) {
168 handler_errors++; 160 handler_errors++;
169 printk(KERN_ERR "Kprobe smoke test failed: " 161 pr_err("incorrect value in jprobe handler\n");
170 "incorrect value in jprobe handler\n");
171 } 162 }
172 163
173 jph_val = rand1; 164 jph_val = rand1;
@@ -186,16 +177,14 @@ static int test_jprobe(void)
186 177
187 ret = register_jprobe(&jp); 178 ret = register_jprobe(&jp);
188 if (ret < 0) { 179 if (ret < 0) {
189 printk(KERN_ERR "Kprobe smoke test failed: " 180 pr_err("register_jprobe returned %d\n", ret);
190 "register_jprobe returned %d\n", ret);
191 return ret; 181 return ret;
192 } 182 }
193 183
194 ret = target(rand1); 184 ret = target(rand1);
195 unregister_jprobe(&jp); 185 unregister_jprobe(&jp);
196 if (jph_val == 0) { 186 if (jph_val == 0) {
197 printk(KERN_ERR "Kprobe smoke test failed: " 187 pr_err("jprobe handler not called\n");
198 "jprobe handler not called\n");
199 handler_errors++; 188 handler_errors++;
200 } 189 }
201 190
@@ -217,24 +206,21 @@ static int test_jprobes(void)
217 jp.kp.flags = 0; 206 jp.kp.flags = 0;
218 ret = register_jprobes(jps, 2); 207 ret = register_jprobes(jps, 2);
219 if (ret < 0) { 208 if (ret < 0) {
220 printk(KERN_ERR "Kprobe smoke test failed: " 209 pr_err("register_jprobes returned %d\n", ret);
221 "register_jprobes returned %d\n", ret);
222 return ret; 210 return ret;
223 } 211 }
224 212
225 jph_val = 0; 213 jph_val = 0;
226 ret = target(rand1); 214 ret = target(rand1);
227 if (jph_val == 0) { 215 if (jph_val == 0) {
228 printk(KERN_ERR "Kprobe smoke test failed: " 216 pr_err("jprobe handler not called\n");
229 "jprobe handler not called\n");
230 handler_errors++; 217 handler_errors++;
231 } 218 }
232 219
233 jph_val = 0; 220 jph_val = 0;
234 ret = target2(rand1); 221 ret = target2(rand1);
235 if (jph_val == 0) { 222 if (jph_val == 0) {
236 printk(KERN_ERR "Kprobe smoke test failed: " 223 pr_err("jprobe handler2 not called\n");
237 "jprobe handler2 not called\n");
238 handler_errors++; 224 handler_errors++;
239 } 225 }
240 unregister_jprobes(jps, 2); 226 unregister_jprobes(jps, 2);
@@ -256,13 +242,11 @@ static int return_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
256 242
257 if (ret != (rand1 / div_factor)) { 243 if (ret != (rand1 / div_factor)) {
258 handler_errors++; 244 handler_errors++;
259 printk(KERN_ERR "Kprobe smoke test failed: " 245 pr_err("incorrect value in kretprobe handler\n");
260 "incorrect value in kretprobe handler\n");
261 } 246 }
262 if (krph_val == 0) { 247 if (krph_val == 0) {
263 handler_errors++; 248 handler_errors++;
264 printk(KERN_ERR "Kprobe smoke test failed: " 249 pr_err("call to kretprobe entry handler failed\n");
265 "call to kretprobe entry handler failed\n");
266 } 250 }
267 251
268 krph_val = rand1; 252 krph_val = rand1;
@@ -281,16 +265,14 @@ static int test_kretprobe(void)
281 265
282 ret = register_kretprobe(&rp); 266 ret = register_kretprobe(&rp);
283 if (ret < 0) { 267 if (ret < 0) {
284 printk(KERN_ERR "Kprobe smoke test failed: " 268 pr_err("register_kretprobe returned %d\n", ret);
285 "register_kretprobe returned %d\n", ret);
286 return ret; 269 return ret;
287 } 270 }
288 271
289 ret = target(rand1); 272 ret = target(rand1);
290 unregister_kretprobe(&rp); 273 unregister_kretprobe(&rp);
291 if (krph_val != rand1) { 274 if (krph_val != rand1) {
292 printk(KERN_ERR "Kprobe smoke test failed: " 275 pr_err("kretprobe handler not called\n");
293 "kretprobe handler not called\n");
294 handler_errors++; 276 handler_errors++;
295 } 277 }
296 278
@@ -303,13 +285,11 @@ static int return_handler2(struct kretprobe_instance *ri, struct pt_regs *regs)
303 285
304 if (ret != (rand1 / div_factor) + 1) { 286 if (ret != (rand1 / div_factor) + 1) {
305 handler_errors++; 287 handler_errors++;
306 printk(KERN_ERR "Kprobe smoke test failed: " 288 pr_err("incorrect value in kretprobe handler2\n");
307 "incorrect value in kretprobe handler2\n");
308 } 289 }
309 if (krph_val == 0) { 290 if (krph_val == 0) {
310 handler_errors++; 291 handler_errors++;
311 printk(KERN_ERR "Kprobe smoke test failed: " 292 pr_err("call to kretprobe entry handler failed\n");
312 "call to kretprobe entry handler failed\n");
313 } 293 }
314 294
315 krph_val = rand1; 295 krph_val = rand1;
@@ -332,24 +312,21 @@ static int test_kretprobes(void)
332 rp.kp.flags = 0; 312 rp.kp.flags = 0;
333 ret = register_kretprobes(rps, 2); 313 ret = register_kretprobes(rps, 2);
334 if (ret < 0) { 314 if (ret < 0) {
335 printk(KERN_ERR "Kprobe smoke test failed: " 315 pr_err("register_kretprobe returned %d\n", ret);
336 "register_kretprobe returned %d\n", ret);
337 return ret; 316 return ret;
338 } 317 }
339 318
340 krph_val = 0; 319 krph_val = 0;
341 ret = target(rand1); 320 ret = target(rand1);
342 if (krph_val != rand1) { 321 if (krph_val != rand1) {
343 printk(KERN_ERR "Kprobe smoke test failed: " 322 pr_err("kretprobe handler not called\n");
344 "kretprobe handler not called\n");
345 handler_errors++; 323 handler_errors++;
346 } 324 }
347 325
348 krph_val = 0; 326 krph_val = 0;
349 ret = target2(rand1); 327 ret = target2(rand1);
350 if (krph_val != rand1) { 328 if (krph_val != rand1) {
351 printk(KERN_ERR "Kprobe smoke test failed: " 329 pr_err("kretprobe handler2 not called\n");
352 "kretprobe handler2 not called\n");
353 handler_errors++; 330 handler_errors++;
354 } 331 }
355 unregister_kretprobes(rps, 2); 332 unregister_kretprobes(rps, 2);
@@ -368,7 +345,7 @@ int init_test_probes(void)
368 rand1 = prandom_u32(); 345 rand1 = prandom_u32();
369 } while (rand1 <= div_factor); 346 } while (rand1 <= div_factor);
370 347
371 printk(KERN_INFO "Kprobe smoke test started\n"); 348 pr_info("started\n");
372 num_tests++; 349 num_tests++;
373 ret = test_kprobe(); 350 ret = test_kprobe();
374 if (ret < 0) 351 if (ret < 0)
@@ -402,13 +379,11 @@ int init_test_probes(void)
402#endif /* CONFIG_KRETPROBES */ 379#endif /* CONFIG_KRETPROBES */
403 380
404 if (errors) 381 if (errors)
405 printk(KERN_ERR "BUG: Kprobe smoke test: %d out of " 382 pr_err("BUG: %d out of %d tests failed\n", errors, num_tests);
406 "%d tests failed\n", errors, num_tests);
407 else if (handler_errors) 383 else if (handler_errors)
408 printk(KERN_ERR "BUG: Kprobe smoke test: %d error(s) " 384 pr_err("BUG: %d error(s) running handlers\n", handler_errors);
409 "running handlers\n", handler_errors);
410 else 385 else
411 printk(KERN_INFO "Kprobe smoke test passed successfully\n"); 386 pr_info("passed successfully\n");
412 387
413 return 0; 388 return 0;
414} 389}
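The test_kprobes.c conversion defines pr_fmt() once at the top of the file and then relies on pr_err()/pr_info() to prepend the "Kprobe smoke test: " prefix, which is why the repeated prefix strings could be deleted from every message. A hedged userspace sketch of how that composition works; printf/fprintf stand in for printk here:

    #include <stdio.h>

    /* Must be defined before the helpers that expand it. */
    #define pr_fmt(fmt) "Kprobe smoke test: " fmt

    /* Userspace stand-ins; in the kernel these wrap printk(KERN_ERR
     * pr_fmt(fmt), ...) and printk(KERN_INFO pr_fmt(fmt), ...). */
    #define pr_err(fmt, ...)  fprintf(stderr, pr_fmt(fmt), ##__VA_ARGS__)
    #define pr_info(fmt, ...) printf(pr_fmt(fmt), ##__VA_ARGS__)

    int main(void)
    {
            pr_info("started\n");   /* prints "Kprobe smoke test: started" */
            pr_err("register_kprobe returned %d\n", -22);
            return 0;
    }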
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index f448513a45ed..d626dc98e8df 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -12,6 +12,11 @@ config CLOCKSOURCE_WATCHDOG
12config ARCH_CLOCKSOURCE_DATA 12config ARCH_CLOCKSOURCE_DATA
13 bool 13 bool
14 14
15# Clocksources require validation of the clocksource against the last
16# cycle update - x86/TSC misfeature
17config CLOCKSOURCE_VALIDATE_LAST_CYCLE
18 bool
19
15# Timekeeping vsyscall support 20# Timekeeping vsyscall support
16config GENERIC_TIME_VSYSCALL 21config GENERIC_TIME_VSYSCALL
17 bool 22 bool
@@ -20,10 +25,6 @@ config GENERIC_TIME_VSYSCALL
20config GENERIC_TIME_VSYSCALL_OLD 25config GENERIC_TIME_VSYSCALL_OLD
21 bool 26 bool
22 27
23# ktime_t scalar 64bit nsec representation
24config KTIME_SCALAR
25 bool
26
27# Old style timekeeping 28# Old style timekeeping
28config ARCH_USES_GETTIMEOFFSET 29config ARCH_USES_GETTIMEOFFSET
29 bool 30 bool
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index 57a413fd0ebf..7347426fa68d 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1,3 +1,4 @@
1obj-y += time.o timer.o hrtimer.o itimer.o posix-timers.o posix-cpu-timers.o
1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o 2obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o
2obj-y += timeconv.o posix-clock.o alarmtimer.o 3obj-y += timeconv.o posix-clock.o alarmtimer.o
3 4
@@ -12,3 +13,21 @@ obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o
12obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o 13obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o
13obj-$(CONFIG_TIMER_STATS) += timer_stats.o 14obj-$(CONFIG_TIMER_STATS) += timer_stats.o
14obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o 15obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o
16obj-$(CONFIG_TEST_UDELAY) += udelay_test.o
17
18$(obj)/time.o: $(obj)/timeconst.h
19
20quiet_cmd_hzfile = HZFILE $@
21 cmd_hzfile = echo "hz=$(CONFIG_HZ)" > $@
22
23targets += hz.bc
24$(obj)/hz.bc: $(objtree)/include/config/hz.h FORCE
25 $(call if_changed,hzfile)
26
27quiet_cmd_bc = BC $@
28 cmd_bc = bc -q $(filter-out FORCE,$^) > $@
29
30targets += timeconst.h
31$(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE
32 $(call if_changed,bc)
33
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index fe75444ae7ec..4aec4a457431 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -71,7 +71,7 @@ struct rtc_device *alarmtimer_get_rtcdev(void)
71 71
72 return ret; 72 return ret;
73} 73}
74 74EXPORT_SYMBOL_GPL(alarmtimer_get_rtcdev);
75 75
76static int alarmtimer_rtc_add_device(struct device *dev, 76static int alarmtimer_rtc_add_device(struct device *dev,
77 struct class_interface *class_intf) 77 struct class_interface *class_intf)
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index ba3e502c955a..2e949cc9c9f1 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -32,6 +32,7 @@
32#include <linux/kthread.h> 32#include <linux/kthread.h>
33 33
34#include "tick-internal.h" 34#include "tick-internal.h"
35#include "timekeeping_internal.h"
35 36
36void timecounter_init(struct timecounter *tc, 37void timecounter_init(struct timecounter *tc,
37 const struct cyclecounter *cc, 38 const struct cyclecounter *cc,
@@ -249,7 +250,7 @@ void clocksource_mark_unstable(struct clocksource *cs)
249static void clocksource_watchdog(unsigned long data) 250static void clocksource_watchdog(unsigned long data)
250{ 251{
251 struct clocksource *cs; 252 struct clocksource *cs;
252 cycle_t csnow, wdnow; 253 cycle_t csnow, wdnow, delta;
253 int64_t wd_nsec, cs_nsec; 254 int64_t wd_nsec, cs_nsec;
254 int next_cpu, reset_pending; 255 int next_cpu, reset_pending;
255 256
@@ -282,11 +283,12 @@ static void clocksource_watchdog(unsigned long data)
282 continue; 283 continue;
283 } 284 }
284 285
285 wd_nsec = clocksource_cyc2ns((wdnow - cs->wd_last) & watchdog->mask, 286 delta = clocksource_delta(wdnow, cs->wd_last, watchdog->mask);
286 watchdog->mult, watchdog->shift); 287 wd_nsec = clocksource_cyc2ns(delta, watchdog->mult,
288 watchdog->shift);
287 289
288 cs_nsec = clocksource_cyc2ns((csnow - cs->cs_last) & 290 delta = clocksource_delta(csnow, cs->cs_last, cs->mask);
289 cs->mask, cs->mult, cs->shift); 291 cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift);
290 cs->cs_last = csnow; 292 cs->cs_last = csnow;
291 cs->wd_last = wdnow; 293 cs->wd_last = wdnow;
292 294
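The clocksource.c hunk replaces the two open-coded "(now - last) & mask" computations in the watchdog with the clocksource_delta() helper now provided by timekeeping_internal.h. A sketch of the masked-delta idea, showing why the mask makes wrap-around of a narrow hardware counter harmless; the 24-bit mask is just an example:

    #include <stdint.h>
    #include <stdio.h>

    typedef uint64_t cycle_t;

    /* Sketch of clocksource_delta(): masking keeps the delta correct for
     * counters narrower than 64 bits, even across a counter wrap. */
    static cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask)
    {
            return (now - last) & mask;
    }

    int main(void)
    {
            cycle_t mask = 0xffffff;               /* e.g. a 24-bit counter */
            cycle_t last = 0xfffff0, now = 0x10;   /* counter wrapped */

            printf("delta=%llu\n",
                   (unsigned long long)clocksource_delta(now, last, mask)); /* 32 */
            return 0;
    }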
diff --git a/kernel/hrtimer.c b/kernel/time/hrtimer.c
index 3ab28993f6e0..1c2fe7de2842 100644
--- a/kernel/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -54,6 +54,8 @@
54 54
55#include <trace/events/timer.h> 55#include <trace/events/timer.h>
56 56
57#include "timekeeping.h"
58
57/* 59/*
58 * The timer bases: 60 * The timer bases:
59 * 61 *
@@ -114,21 +116,18 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id)
114 */ 116 */
115static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) 117static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
116{ 118{
117 ktime_t xtim, mono, boot; 119 ktime_t xtim, mono, boot, tai;
118 struct timespec xts, tom, slp; 120 ktime_t off_real, off_boot, off_tai;
119 s32 tai_offset;
120 121
121 get_xtime_and_monotonic_and_sleep_offset(&xts, &tom, &slp); 122 mono = ktime_get_update_offsets_tick(&off_real, &off_boot, &off_tai);
122 tai_offset = timekeeping_get_tai_offset(); 123 boot = ktime_add(mono, off_boot);
124 xtim = ktime_add(mono, off_real);
125 tai = ktime_add(xtim, off_tai);
123 126
124 xtim = timespec_to_ktime(xts);
125 mono = ktime_add(xtim, timespec_to_ktime(tom));
126 boot = ktime_add(mono, timespec_to_ktime(slp));
127 base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim; 127 base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim;
128 base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono; 128 base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono;
129 base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot; 129 base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot;
130 base->clock_base[HRTIMER_BASE_TAI].softirq_time = 130 base->clock_base[HRTIMER_BASE_TAI].softirq_time = tai;
131 ktime_add(xtim, ktime_set(tai_offset, 0));
132} 131}
133 132
134/* 133/*
@@ -264,60 +263,6 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
264 * too large for inlining: 263 * too large for inlining:
265 */ 264 */
266#if BITS_PER_LONG < 64 265#if BITS_PER_LONG < 64
267# ifndef CONFIG_KTIME_SCALAR
268/**
269 * ktime_add_ns - Add a scalar nanoseconds value to a ktime_t variable
270 * @kt: addend
271 * @nsec: the scalar nsec value to add
272 *
273 * Returns the sum of kt and nsec in ktime_t format
274 */
275ktime_t ktime_add_ns(const ktime_t kt, u64 nsec)
276{
277 ktime_t tmp;
278
279 if (likely(nsec < NSEC_PER_SEC)) {
280 tmp.tv64 = nsec;
281 } else {
282 unsigned long rem = do_div(nsec, NSEC_PER_SEC);
283
284 /* Make sure nsec fits into long */
285 if (unlikely(nsec > KTIME_SEC_MAX))
286 return (ktime_t){ .tv64 = KTIME_MAX };
287
288 tmp = ktime_set((long)nsec, rem);
289 }
290
291 return ktime_add(kt, tmp);
292}
293
294EXPORT_SYMBOL_GPL(ktime_add_ns);
295
296/**
297 * ktime_sub_ns - Subtract a scalar nanoseconds value from a ktime_t variable
298 * @kt: minuend
299 * @nsec: the scalar nsec value to subtract
300 *
301 * Returns the subtraction of @nsec from @kt in ktime_t format
302 */
303ktime_t ktime_sub_ns(const ktime_t kt, u64 nsec)
304{
305 ktime_t tmp;
306
307 if (likely(nsec < NSEC_PER_SEC)) {
308 tmp.tv64 = nsec;
309 } else {
310 unsigned long rem = do_div(nsec, NSEC_PER_SEC);
311
312 tmp = ktime_set((long)nsec, rem);
313 }
314
315 return ktime_sub(kt, tmp);
316}
317
318EXPORT_SYMBOL_GPL(ktime_sub_ns);
319# endif /* !CONFIG_KTIME_SCALAR */
320
321/* 266/*
322 * Divide a ktime value by a nanosecond value 267 * Divide a ktime value by a nanosecond value
323 */ 268 */
@@ -337,6 +282,7 @@ u64 ktime_divns(const ktime_t kt, s64 div)
337 282
338 return dclc; 283 return dclc;
339} 284}
285EXPORT_SYMBOL_GPL(ktime_divns);
340#endif /* BITS_PER_LONG >= 64 */ 286#endif /* BITS_PER_LONG >= 64 */
341 287
342/* 288/*
@@ -602,6 +548,11 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
602 * timers, we have to check, whether it expires earlier than the timer for 548 * timers, we have to check, whether it expires earlier than the timer for
603 * which the clock event device was armed. 549 * which the clock event device was armed.
604 * 550 *
551 * Note, that in case the state has HRTIMER_STATE_CALLBACK set, no reprogramming
552 * and no expiry check happens. The timer gets enqueued into the rbtree. The
553 * reprogramming and expiry check is done in the hrtimer_interrupt or in the
554 * softirq.
555 *
605 * Called with interrupts disabled and base->cpu_base.lock held 556 * Called with interrupts disabled and base->cpu_base.lock held
606 */ 557 */
607static int hrtimer_reprogram(struct hrtimer *timer, 558static int hrtimer_reprogram(struct hrtimer *timer,
@@ -662,25 +613,13 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
662 base->hres_active = 0; 613 base->hres_active = 0;
663} 614}
664 615
665/*
666 * When High resolution timers are active, try to reprogram. Note, that in case
667 * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry
668 * check happens. The timer gets enqueued into the rbtree. The reprogramming
669 * and expiry check is done in the hrtimer_interrupt or in the softirq.
670 */
671static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
672 struct hrtimer_clock_base *base)
673{
674 return base->cpu_base->hres_active && hrtimer_reprogram(timer, base);
675}
676
677static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) 616static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
678{ 617{
679 ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; 618 ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset;
680 ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; 619 ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
681 ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset; 620 ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset;
682 621
683 return ktime_get_update_offsets(offs_real, offs_boot, offs_tai); 622 return ktime_get_update_offsets_now(offs_real, offs_boot, offs_tai);
684} 623}
685 624
686/* 625/*
@@ -755,8 +694,8 @@ static inline int hrtimer_is_hres_enabled(void) { return 0; }
755static inline int hrtimer_switch_to_hres(void) { return 0; } 694static inline int hrtimer_switch_to_hres(void) { return 0; }
756static inline void 695static inline void
757hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { } 696hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { }
758static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, 697static inline int hrtimer_reprogram(struct hrtimer *timer,
759 struct hrtimer_clock_base *base) 698 struct hrtimer_clock_base *base)
760{ 699{
761 return 0; 700 return 0;
762} 701}
@@ -1013,14 +952,25 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
1013 952
1014 leftmost = enqueue_hrtimer(timer, new_base); 953 leftmost = enqueue_hrtimer(timer, new_base);
1015 954
1016 /* 955 if (!leftmost) {
1017 * Only allow reprogramming if the new base is on this CPU. 956 unlock_hrtimer_base(timer, &flags);
1018 * (it might still be on another CPU if the timer was pending) 957 return ret;
1019 * 958 }
1020 * XXX send_remote_softirq() ? 959
1021 */ 960 if (!hrtimer_is_hres_active(timer)) {
1022 if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases) 961 /*
1023 && hrtimer_enqueue_reprogram(timer, new_base)) { 962 * Kick to reschedule the next tick to handle the new timer
963 * on dynticks target.
964 */
965 wake_up_nohz_cpu(new_base->cpu_base->cpu);
966 } else if (new_base->cpu_base == &__get_cpu_var(hrtimer_bases) &&
967 hrtimer_reprogram(timer, new_base)) {
968 /*
969 * Only allow reprogramming if the new base is on this CPU.
970 * (it might still be on another CPU if the timer was pending)
971 *
972 * XXX send_remote_softirq() ?
973 */
1024 if (wakeup) { 974 if (wakeup) {
1025 /* 975 /*
1026 * We need to drop cpu_base->lock to avoid a 976 * We need to drop cpu_base->lock to avoid a
@@ -1680,6 +1630,7 @@ static void init_hrtimers_cpu(int cpu)
1680 timerqueue_init_head(&cpu_base->clock_base[i].active); 1630 timerqueue_init_head(&cpu_base->clock_base[i].active);
1681 } 1631 }
1682 1632
1633 cpu_base->cpu = cpu;
1683 hrtimer_init_hres(cpu_base); 1634 hrtimer_init_hres(cpu_base);
1684} 1635}
1685 1636
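With CONFIG_KTIME_SCALAR removed from kernel/time/Kconfig, the 32-bit fallback definitions of ktime_add_ns()/ktime_sub_ns() deleted above are no longer needed: ktime_t is a plain 64-bit nanosecond count on every architecture, so adding a scalar nanosecond value is ordinary integer arithmetic. A minimal sketch of that representation; the type and helper names are illustrative, not the <linux/ktime.h> definitions:

    #include <stdint.h>
    #include <stdio.h>

    /* ktime_t as a 64-bit nanosecond scalar (sketch). */
    typedef int64_t ktime_sketch_t;

    static ktime_sketch_t ktime_add_ns_sketch(ktime_sketch_t kt, uint64_t nsec)
    {
            return kt + (int64_t)nsec;
    }

    int main(void)
    {
            ktime_sketch_t t = 1500000000;           /* 1.5 s in ns */

            t = ktime_add_ns_sketch(t, 250000000);   /* + 0.25 s    */
            printf("%lld ns\n", (long long)t);       /* 1750000000  */
            return 0;
    }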
diff --git a/kernel/itimer.c b/kernel/time/itimer.c
index 8d262b467573..8d262b467573 100644
--- a/kernel/itimer.c
+++ b/kernel/time/itimer.c
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 33db43a39515..87a346fd6d61 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -466,7 +466,8 @@ static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock);
466 466
467static void sync_cmos_clock(struct work_struct *work) 467static void sync_cmos_clock(struct work_struct *work)
468{ 468{
469 struct timespec now, next; 469 struct timespec64 now;
470 struct timespec next;
470 int fail = 1; 471 int fail = 1;
471 472
472 /* 473 /*
@@ -485,9 +486,9 @@ static void sync_cmos_clock(struct work_struct *work)
485 return; 486 return;
486 } 487 }
487 488
488 getnstimeofday(&now); 489 getnstimeofday64(&now);
489 if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec * 5) { 490 if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec * 5) {
490 struct timespec adjust = now; 491 struct timespec adjust = timespec64_to_timespec(now);
491 492
492 fail = -ENODEV; 493 fail = -ENODEV;
493 if (persistent_clock_is_local) 494 if (persistent_clock_is_local)
@@ -531,7 +532,7 @@ void ntp_notify_cmos_timer(void) { }
531/* 532/*
532 * Propagate a new txc->status value into the NTP state: 533 * Propagate a new txc->status value into the NTP state:
533 */ 534 */
534static inline void process_adj_status(struct timex *txc, struct timespec *ts) 535static inline void process_adj_status(struct timex *txc, struct timespec64 *ts)
535{ 536{
536 if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) { 537 if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) {
537 time_state = TIME_OK; 538 time_state = TIME_OK;
@@ -554,7 +555,7 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts)
554 555
555 556
556static inline void process_adjtimex_modes(struct timex *txc, 557static inline void process_adjtimex_modes(struct timex *txc,
557 struct timespec *ts, 558 struct timespec64 *ts,
558 s32 *time_tai) 559 s32 *time_tai)
559{ 560{
560 if (txc->modes & ADJ_STATUS) 561 if (txc->modes & ADJ_STATUS)
@@ -640,7 +641,7 @@ int ntp_validate_timex(struct timex *txc)
640 * adjtimex mainly allows reading (and writing, if superuser) of 641 * adjtimex mainly allows reading (and writing, if superuser) of
641 * kernel time-keeping variables. used by xntpd. 642 * kernel time-keeping variables. used by xntpd.
642 */ 643 */
643int __do_adjtimex(struct timex *txc, struct timespec *ts, s32 *time_tai) 644int __do_adjtimex(struct timex *txc, struct timespec64 *ts, s32 *time_tai)
644{ 645{
645 int result; 646 int result;
646 647
@@ -684,7 +685,7 @@ int __do_adjtimex(struct timex *txc, struct timespec *ts, s32 *time_tai)
684 /* fill PPS status fields */ 685 /* fill PPS status fields */
685 pps_fill_timex(txc); 686 pps_fill_timex(txc);
686 687
687 txc->time.tv_sec = ts->tv_sec; 688 txc->time.tv_sec = (time_t)ts->tv_sec;
688 txc->time.tv_usec = ts->tv_nsec; 689 txc->time.tv_usec = ts->tv_nsec;
689 if (!(time_status & STA_NANO)) 690 if (!(time_status & STA_NANO))
690 txc->time.tv_usec /= NSEC_PER_USEC; 691 txc->time.tv_usec /= NSEC_PER_USEC;
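sync_cmos_clock() now reads the time with getnstimeofday64() into a struct timespec64 and converts to the legacy struct timespec only where the RTC path still wants it, part of the wider y2038 conversion in this merge. A rough sketch of the two layouts and of what the narrowing conversion does on a 32-bit architecture; the _sketch/_legacy names are placeholders, not the kernel definitions:

    #include <stdint.h>
    #include <stdio.h>

    /* timespec64 always carries 64-bit seconds; the legacy struct keeps
     * the platform 'long' (32 bits on 32-bit architectures). */
    struct timespec_legacy   { long    tv_sec; long tv_nsec; };
    struct timespec64_sketch { int64_t tv_sec; long tv_nsec; };

    static struct timespec_legacy
    timespec64_to_timespec_sketch(struct timespec64_sketch ts64)
    {
            struct timespec_legacy ts;

            ts.tv_sec  = (long)ts64.tv_sec;  /* truncates past 2038 on 32-bit */
            ts.tv_nsec = ts64.tv_nsec;
            return ts;
    }

    int main(void)
    {
            struct timespec64_sketch now = { .tv_sec = 2147483648LL, .tv_nsec = 0 };
            struct timespec_legacy legacy = timespec64_to_timespec_sketch(now);

            printf("%lld -> %ld\n", (long long)now.tv_sec, legacy.tv_sec);
            return 0;
    }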
diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h
index 1950cb4ca2a4..bbd102ad9df7 100644
--- a/kernel/time/ntp_internal.h
+++ b/kernel/time/ntp_internal.h
@@ -7,6 +7,6 @@ extern void ntp_clear(void);
7extern u64 ntp_tick_length(void); 7extern u64 ntp_tick_length(void);
8extern int second_overflow(unsigned long secs); 8extern int second_overflow(unsigned long secs);
9extern int ntp_validate_timex(struct timex *); 9extern int ntp_validate_timex(struct timex *);
10extern int __do_adjtimex(struct timex *, struct timespec *, s32 *); 10extern int __do_adjtimex(struct timex *, struct timespec64 *, s32 *);
11extern void __hardpps(const struct timespec *, const struct timespec *); 11extern void __hardpps(const struct timespec *, const struct timespec *);
12#endif /* _LINUX_NTP_INTERNAL_H */ 12#endif /* _LINUX_NTP_INTERNAL_H */
diff --git a/kernel/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 3b8946416a5f..3b8946416a5f 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
diff --git a/kernel/posix-timers.c b/kernel/time/posix-timers.c
index 424c2d4265c9..42b463ad90f2 100644
--- a/kernel/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -49,6 +49,8 @@
49#include <linux/export.h> 49#include <linux/export.h>
50#include <linux/hashtable.h> 50#include <linux/hashtable.h>
51 51
52#include "timekeeping.h"
53
52/* 54/*
53 * Management arrays for POSIX timers. Timers are now kept in static hash table 55 * Management arrays for POSIX timers. Timers are now kept in static hash table
54 * with 512 entries. 56 * with 512 entries.
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 7ab92b19965a..c19c1d84b6f3 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -4,6 +4,8 @@
4#include <linux/hrtimer.h> 4#include <linux/hrtimer.h>
5#include <linux/tick.h> 5#include <linux/tick.h>
6 6
7#include "timekeeping.h"
8
7extern seqlock_t jiffies_lock; 9extern seqlock_t jiffies_lock;
8 10
9#define CS_NAME_LEN 32 11#define CS_NAME_LEN 32
diff --git a/kernel/time.c b/kernel/time/time.c
index 7c7964c33ae7..f0294ba14634 100644
--- a/kernel/time.c
+++ b/kernel/time/time.c
@@ -42,6 +42,7 @@
42#include <asm/unistd.h> 42#include <asm/unistd.h>
43 43
44#include "timeconst.h" 44#include "timeconst.h"
45#include "timekeeping.h"
45 46
46/* 47/*
47 * The timezone where the local system is located. Used as a default by some 48 * The timezone where the local system is located. Used as a default by some
@@ -420,6 +421,68 @@ struct timeval ns_to_timeval(const s64 nsec)
420} 421}
421EXPORT_SYMBOL(ns_to_timeval); 422EXPORT_SYMBOL(ns_to_timeval);
422 423
424#if BITS_PER_LONG == 32
425/**
426 * set_normalized_timespec - set timespec sec and nsec parts and normalize
427 *
428 * @ts: pointer to timespec variable to be set
429 * @sec: seconds to set
430 * @nsec: nanoseconds to set
431 *
432 * Set seconds and nanoseconds field of a timespec variable and
433 * normalize to the timespec storage format
434 *
435 * Note: The tv_nsec part is always in the range of
436 * 0 <= tv_nsec < NSEC_PER_SEC
437 * For negative values only the tv_sec field is negative !
438 */
439void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 nsec)
440{
441 while (nsec >= NSEC_PER_SEC) {
442 /*
443 * The following asm() prevents the compiler from
444 * optimising this loop into a modulo operation. See
445 * also __iter_div_u64_rem() in include/linux/time.h
446 */
447 asm("" : "+rm"(nsec));
448 nsec -= NSEC_PER_SEC;
449 ++sec;
450 }
451 while (nsec < 0) {
452 asm("" : "+rm"(nsec));
453 nsec += NSEC_PER_SEC;
454 --sec;
455 }
456 ts->tv_sec = sec;
457 ts->tv_nsec = nsec;
458}
459EXPORT_SYMBOL(set_normalized_timespec64);
460
461/**
462 * ns_to_timespec64 - Convert nanoseconds to timespec64
463 * @nsec: the nanoseconds value to be converted
464 *
465 * Returns the timespec64 representation of the nsec parameter.
466 */
467struct timespec64 ns_to_timespec64(const s64 nsec)
468{
469 struct timespec64 ts;
470 s32 rem;
471
472 if (!nsec)
473 return (struct timespec64) {0, 0};
474
475 ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem);
476 if (unlikely(rem < 0)) {
477 ts.tv_sec--;
478 rem += NSEC_PER_SEC;
479 }
480 ts.tv_nsec = rem;
481
482 return ts;
483}
484EXPORT_SYMBOL(ns_to_timespec64);
485#endif
423/* 486/*
424 * When we convert to jiffies then we interpret incoming values 487 * When we convert to jiffies then we interpret incoming values
425 * the following way: 488 * the following way:
@@ -694,6 +757,7 @@ unsigned long nsecs_to_jiffies(u64 n)
694{ 757{
695 return (unsigned long)nsecs_to_jiffies64(n); 758 return (unsigned long)nsecs_to_jiffies64(n);
696} 759}
760EXPORT_SYMBOL_GPL(nsecs_to_jiffies);
697 761
698/* 762/*
699 * Add two timespec values and do a safety check for overflow. 763 * Add two timespec values and do a safety check for overflow.
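Worked example for the ns_to_timespec64() helper added above: for nsec = -1, div_s64_rem(-1, NSEC_PER_SEC, &rem) yields tv_sec = 0 with rem = -1; because rem is negative the function adjusts to tv_sec = -1 and rem = 999999999, so tv_nsec always stays within [0, NSEC_PER_SEC) and only tv_sec carries the sign, matching the set_normalized_timespec64() convention documented a few lines earlier. (The numbers are worked by hand for illustration.)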
diff --git a/kernel/timeconst.bc b/kernel/time/timeconst.bc
index 511bdf2cafda..511bdf2cafda 100644
--- a/kernel/timeconst.bc
+++ b/kernel/time/timeconst.bc
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 32d8d6aaedb8..fb4a9c2cf8d9 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -32,11 +32,34 @@
32#define TK_MIRROR (1 << 1) 32#define TK_MIRROR (1 << 1)
33#define TK_CLOCK_WAS_SET (1 << 2) 33#define TK_CLOCK_WAS_SET (1 << 2)
34 34
35static struct timekeeper timekeeper; 35/*
36 * The most important data for readout fits into a single 64 byte
37 * cache line.
38 */
39static struct {
40 seqcount_t seq;
41 struct timekeeper timekeeper;
42} tk_core ____cacheline_aligned;
43
36static DEFINE_RAW_SPINLOCK(timekeeper_lock); 44static DEFINE_RAW_SPINLOCK(timekeeper_lock);
37static seqcount_t timekeeper_seq;
38static struct timekeeper shadow_timekeeper; 45static struct timekeeper shadow_timekeeper;
39 46
47/**
48 * struct tk_fast - NMI safe timekeeper
49 * @seq: Sequence counter for protecting updates. The lowest bit
50 * is the index for the tk_read_base array
51 * @base: tk_read_base array. Access is indexed by the lowest bit of
52 * @seq.
53 *
54 * See @update_fast_timekeeper() below.
55 */
56struct tk_fast {
57 seqcount_t seq;
58 struct tk_read_base base[2];
59};
60
61static struct tk_fast tk_fast_mono ____cacheline_aligned;
62
40/* flag for if timekeeping is suspended */ 63/* flag for if timekeeping is suspended */
41int __read_mostly timekeeping_suspended; 64int __read_mostly timekeeping_suspended;
42 65
@@ -45,49 +68,54 @@ bool __read_mostly persistent_clock_exist = false;
45 68
46static inline void tk_normalize_xtime(struct timekeeper *tk) 69static inline void tk_normalize_xtime(struct timekeeper *tk)
47{ 70{
48 while (tk->xtime_nsec >= ((u64)NSEC_PER_SEC << tk->shift)) { 71 while (tk->tkr.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr.shift)) {
49 tk->xtime_nsec -= (u64)NSEC_PER_SEC << tk->shift; 72 tk->tkr.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr.shift;
50 tk->xtime_sec++; 73 tk->xtime_sec++;
51 } 74 }
52} 75}
53 76
54static void tk_set_xtime(struct timekeeper *tk, const struct timespec *ts) 77static inline struct timespec64 tk_xtime(struct timekeeper *tk)
78{
79 struct timespec64 ts;
80
81 ts.tv_sec = tk->xtime_sec;
82 ts.tv_nsec = (long)(tk->tkr.xtime_nsec >> tk->tkr.shift);
83 return ts;
84}
85
86static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts)
55{ 87{
56 tk->xtime_sec = ts->tv_sec; 88 tk->xtime_sec = ts->tv_sec;
57 tk->xtime_nsec = (u64)ts->tv_nsec << tk->shift; 89 tk->tkr.xtime_nsec = (u64)ts->tv_nsec << tk->tkr.shift;
58} 90}
59 91
60static void tk_xtime_add(struct timekeeper *tk, const struct timespec *ts) 92static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts)
61{ 93{
62 tk->xtime_sec += ts->tv_sec; 94 tk->xtime_sec += ts->tv_sec;
63 tk->xtime_nsec += (u64)ts->tv_nsec << tk->shift; 95 tk->tkr.xtime_nsec += (u64)ts->tv_nsec << tk->tkr.shift;
64 tk_normalize_xtime(tk); 96 tk_normalize_xtime(tk);
65} 97}
66 98
67static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec wtm) 99static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm)
68{ 100{
69 struct timespec tmp; 101 struct timespec64 tmp;
70 102
71 /* 103 /*
72 * Verify consistency of: offset_real = -wall_to_monotonic 104 * Verify consistency of: offset_real = -wall_to_monotonic
73 * before modifying anything 105 * before modifying anything
74 */ 106 */
75 set_normalized_timespec(&tmp, -tk->wall_to_monotonic.tv_sec, 107 set_normalized_timespec64(&tmp, -tk->wall_to_monotonic.tv_sec,
76 -tk->wall_to_monotonic.tv_nsec); 108 -tk->wall_to_monotonic.tv_nsec);
77 WARN_ON_ONCE(tk->offs_real.tv64 != timespec_to_ktime(tmp).tv64); 109 WARN_ON_ONCE(tk->offs_real.tv64 != timespec64_to_ktime(tmp).tv64);
78 tk->wall_to_monotonic = wtm; 110 tk->wall_to_monotonic = wtm;
79 set_normalized_timespec(&tmp, -wtm.tv_sec, -wtm.tv_nsec); 111 set_normalized_timespec64(&tmp, -wtm.tv_sec, -wtm.tv_nsec);
80 tk->offs_real = timespec_to_ktime(tmp); 112 tk->offs_real = timespec64_to_ktime(tmp);
81 tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tk->tai_offset, 0)); 113 tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tk->tai_offset, 0));
82} 114}
83 115
84static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t) 116static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
85{ 117{
86 /* Verify consistency before modifying */ 118 tk->offs_boot = ktime_add(tk->offs_boot, delta);
87 WARN_ON_ONCE(tk->offs_boot.tv64 != timespec_to_ktime(tk->total_sleep_time).tv64);
88
89 tk->total_sleep_time = t;
90 tk->offs_boot = timespec_to_ktime(t);
91} 119}
92 120
93/** 121/**
@@ -107,9 +135,11 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
107 u64 tmp, ntpinterval; 135 u64 tmp, ntpinterval;
108 struct clocksource *old_clock; 136 struct clocksource *old_clock;
109 137
110 old_clock = tk->clock; 138 old_clock = tk->tkr.clock;
111 tk->clock = clock; 139 tk->tkr.clock = clock;
112 tk->cycle_last = clock->cycle_last = clock->read(clock); 140 tk->tkr.read = clock->read;
141 tk->tkr.mask = clock->mask;
142 tk->tkr.cycle_last = tk->tkr.read(clock);
113 143
114 /* Do the ns -> cycle conversion first, using original mult */ 144 /* Do the ns -> cycle conversion first, using original mult */
115 tmp = NTP_INTERVAL_LENGTH; 145 tmp = NTP_INTERVAL_LENGTH;
@@ -133,78 +163,213 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
133 if (old_clock) { 163 if (old_clock) {
134 int shift_change = clock->shift - old_clock->shift; 164 int shift_change = clock->shift - old_clock->shift;
135 if (shift_change < 0) 165 if (shift_change < 0)
136 tk->xtime_nsec >>= -shift_change; 166 tk->tkr.xtime_nsec >>= -shift_change;
137 else 167 else
138 tk->xtime_nsec <<= shift_change; 168 tk->tkr.xtime_nsec <<= shift_change;
139 } 169 }
140 tk->shift = clock->shift; 170 tk->tkr.shift = clock->shift;
141 171
142 tk->ntp_error = 0; 172 tk->ntp_error = 0;
143 tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift; 173 tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift;
174 tk->ntp_tick = ntpinterval << tk->ntp_error_shift;
144 175
145 /* 176 /*
146 * The timekeeper keeps its own mult values for the currently 177 * The timekeeper keeps its own mult values for the currently
147 * active clocksource. These values will be adjusted via NTP 178 * active clocksource. These values will be adjusted via NTP

148 * to counteract clock drifting. 179 * to counteract clock drifting.
149 */ 180 */
150 tk->mult = clock->mult; 181 tk->tkr.mult = clock->mult;
182 tk->ntp_err_mult = 0;
151} 183}
152 184
153/* Timekeeper helper functions. */ 185/* Timekeeper helper functions. */
154 186
155#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET 187#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
156u32 (*arch_gettimeoffset)(void); 188static u32 default_arch_gettimeoffset(void) { return 0; }
157 189u32 (*arch_gettimeoffset)(void) = default_arch_gettimeoffset;
158u32 get_arch_timeoffset(void)
159{
160 if (likely(arch_gettimeoffset))
161 return arch_gettimeoffset();
162 return 0;
163}
164#else 190#else
165static inline u32 get_arch_timeoffset(void) { return 0; } 191static inline u32 arch_gettimeoffset(void) { return 0; }
166#endif 192#endif
167 193
168static inline s64 timekeeping_get_ns(struct timekeeper *tk) 194static inline s64 timekeeping_get_ns(struct tk_read_base *tkr)
169{ 195{
170 cycle_t cycle_now, cycle_delta; 196 cycle_t cycle_now, delta;
171 struct clocksource *clock;
172 s64 nsec; 197 s64 nsec;
173 198
174 /* read clocksource: */ 199 /* read clocksource: */
175 clock = tk->clock; 200 cycle_now = tkr->read(tkr->clock);
176 cycle_now = clock->read(clock);
177 201
178 /* calculate the delta since the last update_wall_time: */ 202 /* calculate the delta since the last update_wall_time: */
179 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; 203 delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask);
180 204
181 nsec = cycle_delta * tk->mult + tk->xtime_nsec; 205 nsec = delta * tkr->mult + tkr->xtime_nsec;
182 nsec >>= tk->shift; 206 nsec >>= tkr->shift;
183 207
184 /* If arch requires, add in get_arch_timeoffset() */ 208 /* If arch requires, add in get_arch_timeoffset() */
185 return nsec + get_arch_timeoffset(); 209 return nsec + arch_gettimeoffset();
186} 210}
187 211
188static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) 212static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
189{ 213{
190 cycle_t cycle_now, cycle_delta; 214 struct clocksource *clock = tk->tkr.clock;
191 struct clocksource *clock; 215 cycle_t cycle_now, delta;
192 s64 nsec; 216 s64 nsec;
193 217
194 /* read clocksource: */ 218 /* read clocksource: */
195 clock = tk->clock; 219 cycle_now = tk->tkr.read(clock);
196 cycle_now = clock->read(clock);
197 220
198 /* calculate the delta since the last update_wall_time: */ 221 /* calculate the delta since the last update_wall_time: */
199 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; 222 delta = clocksource_delta(cycle_now, tk->tkr.cycle_last, tk->tkr.mask);
200 223
201 /* convert delta to nanoseconds. */ 224 /* convert delta to nanoseconds. */
202 nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); 225 nsec = clocksource_cyc2ns(delta, clock->mult, clock->shift);
203 226
204 /* If arch requires, add in get_arch_timeoffset() */ 227 /* If arch requires, add in get_arch_timeoffset() */
205 return nsec + get_arch_timeoffset(); 228 return nsec + arch_gettimeoffset();
229}
230
231/**
232 * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper.
233 * @tk: The timekeeper from which we take the update
234 * @tkf: The fast timekeeper to update
235 * @tbase: The time base for the fast timekeeper (mono/raw)
236 *
237 * We want to use this from any context including NMI and tracing /
238 * instrumenting the timekeeping code itself.
239 *
240 * So we handle this differently than the other timekeeping accessor
241 * functions which retry when the sequence count has changed. The
242 * update side does:
243 *
244 * smp_wmb(); <- Ensure that the last base[1] update is visible
245 * tkf->seq++;
246 * smp_wmb(); <- Ensure that the seqcount update is visible
247 * update(tkf->base[0], tk);
248 * smp_wmb(); <- Ensure that the base[0] update is visible
249 * tkf->seq++;
250 * smp_wmb(); <- Ensure that the seqcount update is visible
251 * update(tkf->base[1], tk);
252 *
253 * The reader side does:
254 *
255 * do {
256 * seq = tkf->seq;
257 * smp_rmb();
258 * idx = seq & 0x01;
259 * now = now(tkf->base[idx]);
260 * smp_rmb();
261 * } while (seq != tkf->seq)
262 *
263 * As long as we update base[0] readers are forced off to
264 * base[1]. Once base[0] is updated readers are redirected to base[0]
265 * and the base[1] update takes place.
266 *
 267 * So if an NMI hits the update of base[0] then it will use base[1]
 268 * which is still consistent. In the worst case this can result in a
269 * slightly wrong timestamp (a few nanoseconds). See
270 * @ktime_get_mono_fast_ns.
271 */
272static void update_fast_timekeeper(struct timekeeper *tk)
273{
274 struct tk_read_base *base = tk_fast_mono.base;
275
276 /* Force readers off to base[1] */
277 raw_write_seqcount_latch(&tk_fast_mono.seq);
278
279 /* Update base[0] */
280 memcpy(base, &tk->tkr, sizeof(*base));
281
282 /* Force readers back to base[0] */
283 raw_write_seqcount_latch(&tk_fast_mono.seq);
284
285 /* Update base[1] */
286 memcpy(base + 1, base, sizeof(*base));
206} 287}
207 288
289/**
290 * ktime_get_mono_fast_ns - Fast NMI safe access to clock monotonic
291 *
292 * This timestamp is not guaranteed to be monotonic across an update.
293 * The timestamp is calculated by:
294 *
295 * now = base_mono + clock_delta * slope
296 *
297 * So if the update lowers the slope, readers who are forced to the
298 * not yet updated second array are still using the old steeper slope.
299 *
300 * tmono
301 * ^
302 * | o n
303 * | o n
304 * | u
305 * | o
306 * |o
307 * |12345678---> reader order
308 *
309 * o = old slope
310 * u = update
311 * n = new slope
312 *
313 * So reader 6 will observe time going backwards versus reader 5.
314 *
 315 * While other CPUs are likely to be able to observe that, the only way
316 * for a CPU local observation is when an NMI hits in the middle of
317 * the update. Timestamps taken from that NMI context might be ahead
318 * of the following timestamps. Callers need to be aware of that and
319 * deal with it.
320 */
321u64 notrace ktime_get_mono_fast_ns(void)
322{
323 struct tk_read_base *tkr;
324 unsigned int seq;
325 u64 now;
326
327 do {
328 seq = raw_read_seqcount(&tk_fast_mono.seq);
329 tkr = tk_fast_mono.base + (seq & 0x01);
330 now = ktime_to_ns(tkr->base_mono) + timekeeping_get_ns(tkr);
331
332 } while (read_seqcount_retry(&tk_fast_mono.seq, seq));
333 return now;
334}
335EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns);
336
337#ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD
338
339static inline void update_vsyscall(struct timekeeper *tk)
340{
341 struct timespec xt, wm;
342
343 xt = timespec64_to_timespec(tk_xtime(tk));
344 wm = timespec64_to_timespec(tk->wall_to_monotonic);
345 update_vsyscall_old(&xt, &wm, tk->tkr.clock, tk->tkr.mult,
346 tk->tkr.cycle_last);
347}
348
349static inline void old_vsyscall_fixup(struct timekeeper *tk)
350{
351 s64 remainder;
352
353 /*
354 * Store only full nanoseconds into xtime_nsec after rounding
355 * it up and add the remainder to the error difference.
 356 * XXX - This is necessary to avoid small 1ns inconsistencies caused
357 * by truncating the remainder in vsyscalls. However, it causes
358 * additional work to be done in timekeeping_adjust(). Once
359 * the vsyscall implementations are converted to use xtime_nsec
360 * (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD
361 * users are removed, this can be killed.
362 */
363 remainder = tk->tkr.xtime_nsec & ((1ULL << tk->tkr.shift) - 1);
364 tk->tkr.xtime_nsec -= remainder;
365 tk->tkr.xtime_nsec += 1ULL << tk->tkr.shift;
366 tk->ntp_error += remainder << tk->ntp_error_shift;
367 tk->ntp_error -= (1ULL << tk->tkr.shift) << tk->ntp_error_shift;
368}
369#else
370#define old_vsyscall_fixup(tk)
371#endif
372
208static RAW_NOTIFIER_HEAD(pvclock_gtod_chain); 373static RAW_NOTIFIER_HEAD(pvclock_gtod_chain);
209 374
210static void update_pvclock_gtod(struct timekeeper *tk, bool was_set) 375static void update_pvclock_gtod(struct timekeeper *tk, bool was_set)
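The comment block above describes the seqcount-latch protocol behind update_fast_timekeeper() and ktime_get_mono_fast_ns(): the writer bumps the sequence count around each half of a two-entry base array, and readers index the array with the low bit of the count, so an NMI that interrupts the update still finds a consistent (if slightly stale) entry. A compressed userspace sketch of that protocol follows; it uses C11 atomics and deliberately glosses over the exact barrier placement and data-race handling that raw_write_seqcount_latch() provides in the kernel:

    #include <stdatomic.h>
    #include <stdio.h>

    struct base_sketch { long mult; long offset; };

    static _Atomic unsigned int latch_seq;
    static struct base_sketch bases[2];

    /* Writer: bump seq (readers move to the other half), update this half,
     * bump seq again, update the second half. */
    static void writer_update(struct base_sketch next)
    {
            atomic_fetch_add(&latch_seq, 1);   /* readers now use bases[1] */
            bases[0] = next;
            atomic_fetch_add(&latch_seq, 1);   /* readers now use bases[0] */
            bases[1] = next;
    }

    /* Reader: pick the half selected by the low bit of seq; retry if the
     * writer moved on underneath us. */
    static struct base_sketch reader_snapshot(void)
    {
            struct base_sketch b;
            unsigned int seq;

            do {
                    seq = atomic_load(&latch_seq);
                    b = bases[seq & 1];
            } while (seq != atomic_load(&latch_seq));
            return b;
    }

    int main(void)
    {
            writer_update((struct base_sketch){ .mult = 3, .offset = 42 });
            printf("%ld %ld\n", reader_snapshot().mult, reader_snapshot().offset);
            return 0;
    }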
@@ -217,7 +382,7 @@ static void update_pvclock_gtod(struct timekeeper *tk, bool was_set)
217 */ 382 */
218int pvclock_gtod_register_notifier(struct notifier_block *nb) 383int pvclock_gtod_register_notifier(struct notifier_block *nb)
219{ 384{
220 struct timekeeper *tk = &timekeeper; 385 struct timekeeper *tk = &tk_core.timekeeper;
221 unsigned long flags; 386 unsigned long flags;
222 int ret; 387 int ret;
223 388
@@ -247,6 +412,29 @@ int pvclock_gtod_unregister_notifier(struct notifier_block *nb)
247} 412}
248EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); 413EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier);
249 414
415/*
416 * Update the ktime_t based scalar nsec members of the timekeeper
417 */
418static inline void tk_update_ktime_data(struct timekeeper *tk)
419{
420 s64 nsec;
421
422 /*
423 * The xtime based monotonic readout is:
424 * nsec = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec + now();
425 * The ktime based monotonic readout is:
426 * nsec = base_mono + now();
427 * ==> base_mono = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec
428 */
429 nsec = (s64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec);
430 nsec *= NSEC_PER_SEC;
431 nsec += tk->wall_to_monotonic.tv_nsec;
432 tk->tkr.base_mono = ns_to_ktime(nsec);
433
434 /* Update the monotonic raw base */
435 tk->base_raw = timespec64_to_ktime(tk->raw_time);
436}
437
250/* must hold timekeeper_lock */ 438/* must hold timekeeper_lock */
251static void timekeeping_update(struct timekeeper *tk, unsigned int action) 439static void timekeeping_update(struct timekeeper *tk, unsigned int action)
252{ 440{
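A quick worked check of the identity in tk_update_ktime_data() above: with xtime_sec = 1000 and wall_to_monotonic = { -300, 0 }, base_mono becomes (1000 - 300) * NSEC_PER_SEC = 700 * 10^9 ns, so a monotonic readout is simply base_mono plus the current clocksource delta instead of re-adding wall_to_monotonic on every call; this is what lets ktime_get() further down collapse to ktime_add_ns(tk->tkr.base_mono, nsecs). (The figures are illustrative, not taken from a running system.)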
@@ -257,8 +445,13 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
257 update_vsyscall(tk); 445 update_vsyscall(tk);
258 update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); 446 update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET);
259 447
448 tk_update_ktime_data(tk);
449
260 if (action & TK_MIRROR) 450 if (action & TK_MIRROR)
261 memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper)); 451 memcpy(&shadow_timekeeper, &tk_core.timekeeper,
452 sizeof(tk_core.timekeeper));
453
454 update_fast_timekeeper(tk);
262} 455}
263 456
264/** 457/**
@@ -270,49 +463,48 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
270 */ 463 */
271static void timekeeping_forward_now(struct timekeeper *tk) 464static void timekeeping_forward_now(struct timekeeper *tk)
272{ 465{
273 cycle_t cycle_now, cycle_delta; 466 struct clocksource *clock = tk->tkr.clock;
274 struct clocksource *clock; 467 cycle_t cycle_now, delta;
275 s64 nsec; 468 s64 nsec;
276 469
277 clock = tk->clock; 470 cycle_now = tk->tkr.read(clock);
278 cycle_now = clock->read(clock); 471 delta = clocksource_delta(cycle_now, tk->tkr.cycle_last, tk->tkr.mask);
279 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; 472 tk->tkr.cycle_last = cycle_now;
280 tk->cycle_last = clock->cycle_last = cycle_now;
281 473
282 tk->xtime_nsec += cycle_delta * tk->mult; 474 tk->tkr.xtime_nsec += delta * tk->tkr.mult;
283 475
284 /* If arch requires, add in get_arch_timeoffset() */ 476 /* If arch requires, add in get_arch_timeoffset() */
285 tk->xtime_nsec += (u64)get_arch_timeoffset() << tk->shift; 477 tk->tkr.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr.shift;
286 478
287 tk_normalize_xtime(tk); 479 tk_normalize_xtime(tk);
288 480
289 nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); 481 nsec = clocksource_cyc2ns(delta, clock->mult, clock->shift);
290 timespec_add_ns(&tk->raw_time, nsec); 482 timespec64_add_ns(&tk->raw_time, nsec);
291} 483}
292 484
293/** 485/**
294 * __getnstimeofday - Returns the time of day in a timespec. 486 * __getnstimeofday64 - Returns the time of day in a timespec64.
295 * @ts: pointer to the timespec to be set 487 * @ts: pointer to the timespec to be set
296 * 488 *
297 * Updates the time of day in the timespec. 489 * Updates the time of day in the timespec.
298 * Returns 0 on success, or -ve when suspended (timespec will be undefined). 490 * Returns 0 on success, or -ve when suspended (timespec will be undefined).
299 */ 491 */
300int __getnstimeofday(struct timespec *ts) 492int __getnstimeofday64(struct timespec64 *ts)
301{ 493{
302 struct timekeeper *tk = &timekeeper; 494 struct timekeeper *tk = &tk_core.timekeeper;
303 unsigned long seq; 495 unsigned long seq;
304 s64 nsecs = 0; 496 s64 nsecs = 0;
305 497
306 do { 498 do {
307 seq = read_seqcount_begin(&timekeeper_seq); 499 seq = read_seqcount_begin(&tk_core.seq);
308 500
309 ts->tv_sec = tk->xtime_sec; 501 ts->tv_sec = tk->xtime_sec;
310 nsecs = timekeeping_get_ns(tk); 502 nsecs = timekeeping_get_ns(&tk->tkr);
311 503
312 } while (read_seqcount_retry(&timekeeper_seq, seq)); 504 } while (read_seqcount_retry(&tk_core.seq, seq));
313 505
314 ts->tv_nsec = 0; 506 ts->tv_nsec = 0;
315 timespec_add_ns(ts, nsecs); 507 timespec64_add_ns(ts, nsecs);
316 508
317 /* 509 /*
318 * Do not bail out early, in case there were callers still using 510 * Do not bail out early, in case there were callers still using
@@ -322,116 +514,138 @@ int __getnstimeofday(struct timespec *ts)
322 return -EAGAIN; 514 return -EAGAIN;
323 return 0; 515 return 0;
324} 516}
325EXPORT_SYMBOL(__getnstimeofday); 517EXPORT_SYMBOL(__getnstimeofday64);
326 518
327/** 519/**
328 * getnstimeofday - Returns the time of day in a timespec. 520 * getnstimeofday64 - Returns the time of day in a timespec64.
329 * @ts: pointer to the timespec to be set 521 * @ts: pointer to the timespec to be set
330 * 522 *
331 * Returns the time of day in a timespec (WARN if suspended). 523 * Returns the time of day in a timespec (WARN if suspended).
332 */ 524 */
333void getnstimeofday(struct timespec *ts) 525void getnstimeofday64(struct timespec64 *ts)
334{ 526{
335 WARN_ON(__getnstimeofday(ts)); 527 WARN_ON(__getnstimeofday64(ts));
336} 528}
337EXPORT_SYMBOL(getnstimeofday); 529EXPORT_SYMBOL(getnstimeofday64);
338 530
339ktime_t ktime_get(void) 531ktime_t ktime_get(void)
340{ 532{
341 struct timekeeper *tk = &timekeeper; 533 struct timekeeper *tk = &tk_core.timekeeper;
342 unsigned int seq; 534 unsigned int seq;
343 s64 secs, nsecs; 535 ktime_t base;
536 s64 nsecs;
344 537
345 WARN_ON(timekeeping_suspended); 538 WARN_ON(timekeeping_suspended);
346 539
347 do { 540 do {
348 seq = read_seqcount_begin(&timekeeper_seq); 541 seq = read_seqcount_begin(&tk_core.seq);
349 secs = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; 542 base = tk->tkr.base_mono;
350 nsecs = timekeeping_get_ns(tk) + tk->wall_to_monotonic.tv_nsec; 543 nsecs = timekeeping_get_ns(&tk->tkr);
351 544
352 } while (read_seqcount_retry(&timekeeper_seq, seq)); 545 } while (read_seqcount_retry(&tk_core.seq, seq));
353 /* 546
354 * Use ktime_set/ktime_add_ns to create a proper ktime on 547 return ktime_add_ns(base, nsecs);
355 * 32-bit architectures without CONFIG_KTIME_SCALAR.
356 */
357 return ktime_add_ns(ktime_set(secs, 0), nsecs);
358} 548}
359EXPORT_SYMBOL_GPL(ktime_get); 549EXPORT_SYMBOL_GPL(ktime_get);
360 550
361/** 551static ktime_t *offsets[TK_OFFS_MAX] = {
362 * ktime_get_ts - get the monotonic clock in timespec format 552 [TK_OFFS_REAL] = &tk_core.timekeeper.offs_real,
363 * @ts: pointer to timespec variable 553 [TK_OFFS_BOOT] = &tk_core.timekeeper.offs_boot,
364 * 554 [TK_OFFS_TAI] = &tk_core.timekeeper.offs_tai,
365 * The function calculates the monotonic clock from the realtime 555};
366 * clock and the wall_to_monotonic offset and stores the result 556
367 * in normalized timespec format in the variable pointed to by @ts. 557ktime_t ktime_get_with_offset(enum tk_offsets offs)
368 */
369void ktime_get_ts(struct timespec *ts)
370{ 558{
371 struct timekeeper *tk = &timekeeper; 559 struct timekeeper *tk = &tk_core.timekeeper;
372 struct timespec tomono;
373 s64 nsec;
374 unsigned int seq; 560 unsigned int seq;
561 ktime_t base, *offset = offsets[offs];
562 s64 nsecs;
375 563
376 WARN_ON(timekeeping_suspended); 564 WARN_ON(timekeeping_suspended);
377 565
378 do { 566 do {
379 seq = read_seqcount_begin(&timekeeper_seq); 567 seq = read_seqcount_begin(&tk_core.seq);
380 ts->tv_sec = tk->xtime_sec; 568 base = ktime_add(tk->tkr.base_mono, *offset);
381 nsec = timekeeping_get_ns(tk); 569 nsecs = timekeeping_get_ns(&tk->tkr);
382 tomono = tk->wall_to_monotonic;
383 570
384 } while (read_seqcount_retry(&timekeeper_seq, seq)); 571 } while (read_seqcount_retry(&tk_core.seq, seq));
385 572
386 ts->tv_sec += tomono.tv_sec; 573 return ktime_add_ns(base, nsecs);
387 ts->tv_nsec = 0;
388 timespec_add_ns(ts, nsec + tomono.tv_nsec);
389}
390EXPORT_SYMBOL_GPL(ktime_get_ts);
391 574
575}
576EXPORT_SYMBOL_GPL(ktime_get_with_offset);
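
ktime_get_with_offset() makes the other clock flavours one-liners: each is the monotonic base plus the matching entry in the offsets[] array above. A sketch of how such wrappers could look (the my_ prefix marks these as illustrations, not the interface this series actually exports):

static inline ktime_t my_ktime_get_real(void)
{
        return ktime_get_with_offset(TK_OFFS_REAL);     /* wall-clock time */
}

static inline ktime_t my_ktime_get_boottime(void)
{
        return ktime_get_with_offset(TK_OFFS_BOOT);     /* monotonic incl. suspend */
}

static inline ktime_t my_ktime_get_clocktai(void)
{
        return ktime_get_with_offset(TK_OFFS_TAI);      /* TAI */
}
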
392 577
393/** 578/**
394 * timekeeping_clocktai - Returns the TAI time of day in a timespec 579 * ktime_mono_to_any() - convert monotonic time to any other time
395 * @ts: pointer to the timespec to be set 580 * @tmono: time to convert.
396 * 581 * @offs: which offset to use
397 * Returns the time of day in a timespec.
398 */ 582 */
399void timekeeping_clocktai(struct timespec *ts) 583ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs)
400{ 584{
401 struct timekeeper *tk = &timekeeper; 585 ktime_t *offset = offsets[offs];
402 unsigned long seq; 586 unsigned long seq;
403 u64 nsecs; 587 ktime_t tconv;
404
405 WARN_ON(timekeeping_suspended);
406 588
407 do { 589 do {
408 seq = read_seqcount_begin(&timekeeper_seq); 590 seq = read_seqcount_begin(&tk_core.seq);
591 tconv = ktime_add(tmono, *offset);
592 } while (read_seqcount_retry(&tk_core.seq, seq));
409 593
410 ts->tv_sec = tk->xtime_sec + tk->tai_offset; 594 return tconv;
411 nsecs = timekeeping_get_ns(tk); 595}
596EXPORT_SYMBOL_GPL(ktime_mono_to_any);
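
ktime_mono_to_any() is the conversion helper for timestamps taken on the monotonic clock that need to be reported on another clock; the offset is sampled under the seqcount, so the conversion is consistent. An illustrative helper, not part of this patch:

/* Convert a stored CLOCK_MONOTONIC stamp to CLOCK_REALTIME for logging. */
static ktime_t event_stamp_to_realtime(ktime_t mono_stamp)
{
        return ktime_mono_to_any(mono_stamp, TK_OFFS_REAL);
}
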
412 597
413 } while (read_seqcount_retry(&timekeeper_seq, seq)); 598/**
599 * ktime_get_raw - Returns the raw monotonic time in ktime_t format
600 */
601ktime_t ktime_get_raw(void)
602{
603 struct timekeeper *tk = &tk_core.timekeeper;
604 unsigned int seq;
605 ktime_t base;
606 s64 nsecs;
414 607
415 ts->tv_nsec = 0; 608 do {
416 timespec_add_ns(ts, nsecs); 609 seq = read_seqcount_begin(&tk_core.seq);
610 base = tk->base_raw;
611 nsecs = timekeeping_get_ns_raw(tk);
417 612
418} 613 } while (read_seqcount_retry(&tk_core.seq, seq));
419EXPORT_SYMBOL(timekeeping_clocktai);
420 614
615 return ktime_add_ns(base, nsecs);
616}
617EXPORT_SYMBOL_GPL(ktime_get_raw);
421 618
422/** 619/**
423 * ktime_get_clocktai - Returns the TAI time of day in a ktime 620 * ktime_get_ts64 - get the monotonic clock in timespec64 format
621 * @ts: pointer to timespec variable
424 * 622 *
425 * Returns the time of day in a ktime. 623 * The function calculates the monotonic clock from the realtime
624 * clock and the wall_to_monotonic offset and stores the result
625 * in normalized timespec format in the variable pointed to by @ts.
426 */ 626 */
427ktime_t ktime_get_clocktai(void) 627void ktime_get_ts64(struct timespec64 *ts)
428{ 628{
429 struct timespec ts; 629 struct timekeeper *tk = &tk_core.timekeeper;
630 struct timespec64 tomono;
631 s64 nsec;
632 unsigned int seq;
633
634 WARN_ON(timekeeping_suspended);
430 635
431 timekeeping_clocktai(&ts); 636 do {
432 return timespec_to_ktime(ts); 637 seq = read_seqcount_begin(&tk_core.seq);
638 ts->tv_sec = tk->xtime_sec;
639 nsec = timekeeping_get_ns(&tk->tkr);
640 tomono = tk->wall_to_monotonic;
641
642 } while (read_seqcount_retry(&tk_core.seq, seq));
643
644 ts->tv_sec += tomono.tv_sec;
645 ts->tv_nsec = 0;
646 timespec64_add_ns(ts, nsec + tomono.tv_nsec);
433} 647}
434EXPORT_SYMBOL(ktime_get_clocktai); 648EXPORT_SYMBOL_GPL(ktime_get_ts64);
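
ktime_get_ts64() is the timespec64 counterpart of ktime_get(); timespec64 keeps a 64-bit tv_sec so 32-bit kernels survive 2038, and the old timespec interfaces become conversion shims at the boundary. A sketch of such a shim, assuming the timespec64_to_timespec() helper used elsewhere in this diff:

/* Legacy-style wrapper kept at the boundary (illustrative only). */
static void my_get_monotonic_legacy(struct timespec *ts)
{
        struct timespec64 ts64;

        ktime_get_ts64(&ts64);
        *ts = timespec64_to_timespec(ts64);     /* tv_sec may truncate on 32-bit */
}
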
435 649
436#ifdef CONFIG_NTP_PPS 650#ifdef CONFIG_NTP_PPS
437 651
@@ -446,23 +660,23 @@ EXPORT_SYMBOL(ktime_get_clocktai);
446 */ 660 */
447void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) 661void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
448{ 662{
449 struct timekeeper *tk = &timekeeper; 663 struct timekeeper *tk = &tk_core.timekeeper;
450 unsigned long seq; 664 unsigned long seq;
451 s64 nsecs_raw, nsecs_real; 665 s64 nsecs_raw, nsecs_real;
452 666
453 WARN_ON_ONCE(timekeeping_suspended); 667 WARN_ON_ONCE(timekeeping_suspended);
454 668
455 do { 669 do {
456 seq = read_seqcount_begin(&timekeeper_seq); 670 seq = read_seqcount_begin(&tk_core.seq);
457 671
458 *ts_raw = tk->raw_time; 672 *ts_raw = timespec64_to_timespec(tk->raw_time);
459 ts_real->tv_sec = tk->xtime_sec; 673 ts_real->tv_sec = tk->xtime_sec;
460 ts_real->tv_nsec = 0; 674 ts_real->tv_nsec = 0;
461 675
462 nsecs_raw = timekeeping_get_ns_raw(tk); 676 nsecs_raw = timekeeping_get_ns_raw(tk);
463 nsecs_real = timekeeping_get_ns(tk); 677 nsecs_real = timekeeping_get_ns(&tk->tkr);
464 678
465 } while (read_seqcount_retry(&timekeeper_seq, seq)); 679 } while (read_seqcount_retry(&tk_core.seq, seq));
466 680
467 timespec_add_ns(ts_raw, nsecs_raw); 681 timespec_add_ns(ts_raw, nsecs_raw);
468 timespec_add_ns(ts_real, nsecs_real); 682 timespec_add_ns(ts_real, nsecs_real);
@@ -479,9 +693,9 @@ EXPORT_SYMBOL(getnstime_raw_and_real);
479 */ 693 */
480void do_gettimeofday(struct timeval *tv) 694void do_gettimeofday(struct timeval *tv)
481{ 695{
482 struct timespec now; 696 struct timespec64 now;
483 697
484 getnstimeofday(&now); 698 getnstimeofday64(&now);
485 tv->tv_sec = now.tv_sec; 699 tv->tv_sec = now.tv_sec;
486 tv->tv_usec = now.tv_nsec/1000; 700 tv->tv_usec = now.tv_nsec/1000;
487} 701}
@@ -495,15 +709,15 @@ EXPORT_SYMBOL(do_gettimeofday);
495 */ 709 */
496int do_settimeofday(const struct timespec *tv) 710int do_settimeofday(const struct timespec *tv)
497{ 711{
498 struct timekeeper *tk = &timekeeper; 712 struct timekeeper *tk = &tk_core.timekeeper;
499 struct timespec ts_delta, xt; 713 struct timespec64 ts_delta, xt, tmp;
500 unsigned long flags; 714 unsigned long flags;
501 715
502 if (!timespec_valid_strict(tv)) 716 if (!timespec_valid_strict(tv))
503 return -EINVAL; 717 return -EINVAL;
504 718
505 raw_spin_lock_irqsave(&timekeeper_lock, flags); 719 raw_spin_lock_irqsave(&timekeeper_lock, flags);
506 write_seqcount_begin(&timekeeper_seq); 720 write_seqcount_begin(&tk_core.seq);
507 721
508 timekeeping_forward_now(tk); 722 timekeeping_forward_now(tk);
509 723
@@ -511,13 +725,14 @@ int do_settimeofday(const struct timespec *tv)
511 ts_delta.tv_sec = tv->tv_sec - xt.tv_sec; 725 ts_delta.tv_sec = tv->tv_sec - xt.tv_sec;
512 ts_delta.tv_nsec = tv->tv_nsec - xt.tv_nsec; 726 ts_delta.tv_nsec = tv->tv_nsec - xt.tv_nsec;
513 727
514 tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, ts_delta)); 728 tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts_delta));
515 729
516 tk_set_xtime(tk, tv); 730 tmp = timespec_to_timespec64(*tv);
731 tk_set_xtime(tk, &tmp);
517 732
518 timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); 733 timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
519 734
520 write_seqcount_end(&timekeeper_seq); 735 write_seqcount_end(&tk_core.seq);
521 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 736 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
522 737
523 /* signal hrtimers about time change */ 738 /* signal hrtimers about time change */
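
Every setter in this file follows the write-side discipline seen here: take timekeeper_lock with interrupts disabled, bump the seqcount, update, then release in reverse order so concurrent readers see either the old or the new state, never a mix. A minimal sketch with illustrative names:

static DEFINE_RAW_SPINLOCK(my_lock);
static seqcount_t my_seq = SEQCNT_ZERO(my_seq);
static u64 my_data;

static void update_my_data(u64 new_val)
{
        unsigned long flags;

        raw_spin_lock_irqsave(&my_lock, flags); /* serialize writers, block IRQs */
        write_seqcount_begin(&my_seq);          /* readers retry from here */

        my_data = new_val;

        write_seqcount_end(&my_seq);
        raw_spin_unlock_irqrestore(&my_lock, flags);
}
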
@@ -535,33 +750,35 @@ EXPORT_SYMBOL(do_settimeofday);
535 */ 750 */
536int timekeeping_inject_offset(struct timespec *ts) 751int timekeeping_inject_offset(struct timespec *ts)
537{ 752{
538 struct timekeeper *tk = &timekeeper; 753 struct timekeeper *tk = &tk_core.timekeeper;
539 unsigned long flags; 754 unsigned long flags;
540 struct timespec tmp; 755 struct timespec64 ts64, tmp;
541 int ret = 0; 756 int ret = 0;
542 757
543 if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) 758 if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC)
544 return -EINVAL; 759 return -EINVAL;
545 760
761 ts64 = timespec_to_timespec64(*ts);
762
546 raw_spin_lock_irqsave(&timekeeper_lock, flags); 763 raw_spin_lock_irqsave(&timekeeper_lock, flags);
547 write_seqcount_begin(&timekeeper_seq); 764 write_seqcount_begin(&tk_core.seq);
548 765
549 timekeeping_forward_now(tk); 766 timekeeping_forward_now(tk);
550 767
551 /* Make sure the proposed value is valid */ 768 /* Make sure the proposed value is valid */
552 tmp = timespec_add(tk_xtime(tk), *ts); 769 tmp = timespec64_add(tk_xtime(tk), ts64);
553 if (!timespec_valid_strict(&tmp)) { 770 if (!timespec64_valid_strict(&tmp)) {
554 ret = -EINVAL; 771 ret = -EINVAL;
555 goto error; 772 goto error;
556 } 773 }
557 774
558 tk_xtime_add(tk, ts); 775 tk_xtime_add(tk, &ts64);
559 tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts)); 776 tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts64));
560 777
561error: /* even if we error out, we forwarded the time, so call update */ 778error: /* even if we error out, we forwarded the time, so call update */
562 timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); 779 timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
563 780
564 write_seqcount_end(&timekeeper_seq); 781 write_seqcount_end(&tk_core.seq);
565 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 782 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
566 783
567 /* signal hrtimers about time change */ 784 /* signal hrtimers about time change */
@@ -578,14 +795,14 @@ EXPORT_SYMBOL(timekeeping_inject_offset);
578 */ 795 */
579s32 timekeeping_get_tai_offset(void) 796s32 timekeeping_get_tai_offset(void)
580{ 797{
581 struct timekeeper *tk = &timekeeper; 798 struct timekeeper *tk = &tk_core.timekeeper;
582 unsigned int seq; 799 unsigned int seq;
583 s32 ret; 800 s32 ret;
584 801
585 do { 802 do {
586 seq = read_seqcount_begin(&timekeeper_seq); 803 seq = read_seqcount_begin(&tk_core.seq);
587 ret = tk->tai_offset; 804 ret = tk->tai_offset;
588 } while (read_seqcount_retry(&timekeeper_seq, seq)); 805 } while (read_seqcount_retry(&tk_core.seq, seq));
589 806
590 return ret; 807 return ret;
591} 808}
@@ -606,14 +823,14 @@ static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset)
606 */ 823 */
607void timekeeping_set_tai_offset(s32 tai_offset) 824void timekeeping_set_tai_offset(s32 tai_offset)
608{ 825{
609 struct timekeeper *tk = &timekeeper; 826 struct timekeeper *tk = &tk_core.timekeeper;
610 unsigned long flags; 827 unsigned long flags;
611 828
612 raw_spin_lock_irqsave(&timekeeper_lock, flags); 829 raw_spin_lock_irqsave(&timekeeper_lock, flags);
613 write_seqcount_begin(&timekeeper_seq); 830 write_seqcount_begin(&tk_core.seq);
614 __timekeeping_set_tai_offset(tk, tai_offset); 831 __timekeeping_set_tai_offset(tk, tai_offset);
615 timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); 832 timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
616 write_seqcount_end(&timekeeper_seq); 833 write_seqcount_end(&tk_core.seq);
617 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 834 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
618 clock_was_set(); 835 clock_was_set();
619} 836}
@@ -625,14 +842,14 @@ void timekeeping_set_tai_offset(s32 tai_offset)
625 */ 842 */
626static int change_clocksource(void *data) 843static int change_clocksource(void *data)
627{ 844{
628 struct timekeeper *tk = &timekeeper; 845 struct timekeeper *tk = &tk_core.timekeeper;
629 struct clocksource *new, *old; 846 struct clocksource *new, *old;
630 unsigned long flags; 847 unsigned long flags;
631 848
632 new = (struct clocksource *) data; 849 new = (struct clocksource *) data;
633 850
634 raw_spin_lock_irqsave(&timekeeper_lock, flags); 851 raw_spin_lock_irqsave(&timekeeper_lock, flags);
635 write_seqcount_begin(&timekeeper_seq); 852 write_seqcount_begin(&tk_core.seq);
636 853
637 timekeeping_forward_now(tk); 854 timekeeping_forward_now(tk);
638 /* 855 /*
@@ -641,7 +858,7 @@ static int change_clocksource(void *data)
641 */ 858 */
642 if (try_module_get(new->owner)) { 859 if (try_module_get(new->owner)) {
643 if (!new->enable || new->enable(new) == 0) { 860 if (!new->enable || new->enable(new) == 0) {
644 old = tk->clock; 861 old = tk->tkr.clock;
645 tk_setup_internals(tk, new); 862 tk_setup_internals(tk, new);
646 if (old->disable) 863 if (old->disable)
647 old->disable(old); 864 old->disable(old);
@@ -652,7 +869,7 @@ static int change_clocksource(void *data)
652 } 869 }
653 timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); 870 timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
654 871
655 write_seqcount_end(&timekeeper_seq); 872 write_seqcount_end(&tk_core.seq);
656 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 873 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
657 874
658 return 0; 875 return 0;
@@ -667,29 +884,14 @@ static int change_clocksource(void *data)
667 */ 884 */
668int timekeeping_notify(struct clocksource *clock) 885int timekeeping_notify(struct clocksource *clock)
669{ 886{
670 struct timekeeper *tk = &timekeeper; 887 struct timekeeper *tk = &tk_core.timekeeper;
671 888
672 if (tk->clock == clock) 889 if (tk->tkr.clock == clock)
673 return 0; 890 return 0;
674 stop_machine(change_clocksource, clock, NULL); 891 stop_machine(change_clocksource, clock, NULL);
675 tick_clock_notify(); 892 tick_clock_notify();
676 return tk->clock == clock ? 0 : -1; 893 return tk->tkr.clock == clock ? 0 : -1;
677}
678
679/**
680 * ktime_get_real - get the real (wall-) time in ktime_t format
681 *
682 * returns the time in ktime_t format
683 */
684ktime_t ktime_get_real(void)
685{
686 struct timespec now;
687
688 getnstimeofday(&now);
689
690 return timespec_to_ktime(now);
691} 894}
692EXPORT_SYMBOL_GPL(ktime_get_real);
693 895
694/** 896/**
695 * getrawmonotonic - Returns the raw monotonic time in a timespec 897 * getrawmonotonic - Returns the raw monotonic time in a timespec
@@ -699,18 +901,20 @@ EXPORT_SYMBOL_GPL(ktime_get_real);
699 */ 901 */
700void getrawmonotonic(struct timespec *ts) 902void getrawmonotonic(struct timespec *ts)
701{ 903{
702 struct timekeeper *tk = &timekeeper; 904 struct timekeeper *tk = &tk_core.timekeeper;
905 struct timespec64 ts64;
703 unsigned long seq; 906 unsigned long seq;
704 s64 nsecs; 907 s64 nsecs;
705 908
706 do { 909 do {
707 seq = read_seqcount_begin(&timekeeper_seq); 910 seq = read_seqcount_begin(&tk_core.seq);
708 nsecs = timekeeping_get_ns_raw(tk); 911 nsecs = timekeeping_get_ns_raw(tk);
709 *ts = tk->raw_time; 912 ts64 = tk->raw_time;
710 913
711 } while (read_seqcount_retry(&timekeeper_seq, seq)); 914 } while (read_seqcount_retry(&tk_core.seq, seq));
712 915
713 timespec_add_ns(ts, nsecs); 916 timespec64_add_ns(&ts64, nsecs);
917 *ts = timespec64_to_timespec(ts64);
714} 918}
715EXPORT_SYMBOL(getrawmonotonic); 919EXPORT_SYMBOL(getrawmonotonic);
716 920
@@ -719,16 +923,16 @@ EXPORT_SYMBOL(getrawmonotonic);
719 */ 923 */
720int timekeeping_valid_for_hres(void) 924int timekeeping_valid_for_hres(void)
721{ 925{
722 struct timekeeper *tk = &timekeeper; 926 struct timekeeper *tk = &tk_core.timekeeper;
723 unsigned long seq; 927 unsigned long seq;
724 int ret; 928 int ret;
725 929
726 do { 930 do {
727 seq = read_seqcount_begin(&timekeeper_seq); 931 seq = read_seqcount_begin(&tk_core.seq);
728 932
729 ret = tk->clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; 933 ret = tk->tkr.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
730 934
731 } while (read_seqcount_retry(&timekeeper_seq, seq)); 935 } while (read_seqcount_retry(&tk_core.seq, seq));
732 936
733 return ret; 937 return ret;
734} 938}
@@ -738,16 +942,16 @@ int timekeeping_valid_for_hres(void)
738 */ 942 */
739u64 timekeeping_max_deferment(void) 943u64 timekeeping_max_deferment(void)
740{ 944{
741 struct timekeeper *tk = &timekeeper; 945 struct timekeeper *tk = &tk_core.timekeeper;
742 unsigned long seq; 946 unsigned long seq;
743 u64 ret; 947 u64 ret;
744 948
745 do { 949 do {
746 seq = read_seqcount_begin(&timekeeper_seq); 950 seq = read_seqcount_begin(&tk_core.seq);
747 951
748 ret = tk->clock->max_idle_ns; 952 ret = tk->tkr.clock->max_idle_ns;
749 953
750 } while (read_seqcount_retry(&timekeeper_seq, seq)); 954 } while (read_seqcount_retry(&tk_core.seq, seq));
751 955
752 return ret; 956 return ret;
753} 957}
@@ -787,14 +991,15 @@ void __weak read_boot_clock(struct timespec *ts)
787 */ 991 */
788void __init timekeeping_init(void) 992void __init timekeeping_init(void)
789{ 993{
790 struct timekeeper *tk = &timekeeper; 994 struct timekeeper *tk = &tk_core.timekeeper;
791 struct clocksource *clock; 995 struct clocksource *clock;
792 unsigned long flags; 996 unsigned long flags;
793 struct timespec now, boot, tmp; 997 struct timespec64 now, boot, tmp;
794 998 struct timespec ts;
795 read_persistent_clock(&now);
796 999
797 if (!timespec_valid_strict(&now)) { 1000 read_persistent_clock(&ts);
1001 now = timespec_to_timespec64(ts);
1002 if (!timespec64_valid_strict(&now)) {
798 pr_warn("WARNING: Persistent clock returned invalid value!\n" 1003 pr_warn("WARNING: Persistent clock returned invalid value!\n"
799 " Check your CMOS/BIOS settings.\n"); 1004 " Check your CMOS/BIOS settings.\n");
800 now.tv_sec = 0; 1005 now.tv_sec = 0;
@@ -802,8 +1007,9 @@ void __init timekeeping_init(void)
802 } else if (now.tv_sec || now.tv_nsec) 1007 } else if (now.tv_sec || now.tv_nsec)
803 persistent_clock_exist = true; 1008 persistent_clock_exist = true;
804 1009
805 read_boot_clock(&boot); 1010 read_boot_clock(&ts);
806 if (!timespec_valid_strict(&boot)) { 1011 boot = timespec_to_timespec64(ts);
1012 if (!timespec64_valid_strict(&boot)) {
807 pr_warn("WARNING: Boot clock returned invalid value!\n" 1013 pr_warn("WARNING: Boot clock returned invalid value!\n"
808 " Check your CMOS/BIOS settings.\n"); 1014 " Check your CMOS/BIOS settings.\n");
809 boot.tv_sec = 0; 1015 boot.tv_sec = 0;
@@ -811,7 +1017,7 @@ void __init timekeeping_init(void)
811 } 1017 }
812 1018
813 raw_spin_lock_irqsave(&timekeeper_lock, flags); 1019 raw_spin_lock_irqsave(&timekeeper_lock, flags);
814 write_seqcount_begin(&timekeeper_seq); 1020 write_seqcount_begin(&tk_core.seq);
815 ntp_init(); 1021 ntp_init();
816 1022
817 clock = clocksource_default_clock(); 1023 clock = clocksource_default_clock();
@@ -822,24 +1028,21 @@ void __init timekeeping_init(void)
822 tk_set_xtime(tk, &now); 1028 tk_set_xtime(tk, &now);
823 tk->raw_time.tv_sec = 0; 1029 tk->raw_time.tv_sec = 0;
824 tk->raw_time.tv_nsec = 0; 1030 tk->raw_time.tv_nsec = 0;
1031 tk->base_raw.tv64 = 0;
825 if (boot.tv_sec == 0 && boot.tv_nsec == 0) 1032 if (boot.tv_sec == 0 && boot.tv_nsec == 0)
826 boot = tk_xtime(tk); 1033 boot = tk_xtime(tk);
827 1034
828 set_normalized_timespec(&tmp, -boot.tv_sec, -boot.tv_nsec); 1035 set_normalized_timespec64(&tmp, -boot.tv_sec, -boot.tv_nsec);
829 tk_set_wall_to_mono(tk, tmp); 1036 tk_set_wall_to_mono(tk, tmp);
830 1037
831 tmp.tv_sec = 0; 1038 timekeeping_update(tk, TK_MIRROR);
832 tmp.tv_nsec = 0;
833 tk_set_sleep_time(tk, tmp);
834
835 memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper));
836 1039
837 write_seqcount_end(&timekeeper_seq); 1040 write_seqcount_end(&tk_core.seq);
838 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 1041 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
839} 1042}
840 1043
841/* time in seconds when suspend began */ 1044/* time in seconds when suspend began */
842static struct timespec timekeeping_suspend_time; 1045static struct timespec64 timekeeping_suspend_time;
843 1046
844/** 1047/**
845 * __timekeeping_inject_sleeptime - Internal function to add sleep interval 1048 * __timekeeping_inject_sleeptime - Internal function to add sleep interval
@@ -849,17 +1052,17 @@ static struct timespec timekeeping_suspend_time;
849 * adds the sleep offset to the timekeeping variables. 1052 * adds the sleep offset to the timekeeping variables.
850 */ 1053 */
851static void __timekeeping_inject_sleeptime(struct timekeeper *tk, 1054static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
852 struct timespec *delta) 1055 struct timespec64 *delta)
853{ 1056{
854 if (!timespec_valid_strict(delta)) { 1057 if (!timespec64_valid_strict(delta)) {
855 printk_deferred(KERN_WARNING 1058 printk_deferred(KERN_WARNING
856 "__timekeeping_inject_sleeptime: Invalid " 1059 "__timekeeping_inject_sleeptime: Invalid "
857 "sleep delta value!\n"); 1060 "sleep delta value!\n");
858 return; 1061 return;
859 } 1062 }
860 tk_xtime_add(tk, delta); 1063 tk_xtime_add(tk, delta);
861 tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *delta)); 1064 tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, *delta));
862 tk_set_sleep_time(tk, timespec_add(tk->total_sleep_time, *delta)); 1065 tk_update_sleep_time(tk, timespec64_to_ktime(*delta));
863 tk_debug_account_sleep_time(delta); 1066 tk_debug_account_sleep_time(delta);
864} 1067}
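
The bookkeeping above is easiest to read as three signed adjustments: the wall clock moves forward by the time spent asleep, wall_to_monotonic moves back by the same amount so CLOCK_MONOTONIC does not jump, and the boot offset grows so CLOCK_BOOTTIME does account for the sleep. A toy model with simplified field names, purely illustrative:

struct toy_timekeeper {
        s64 xtime_ns;           /* wall time */
        s64 wall_to_mono_ns;    /* wall -> monotonic offset */
        s64 offs_boot_ns;       /* monotonic -> boottime offset */
};

static void toy_inject_sleeptime(struct toy_timekeeper *tk, s64 delta_ns)
{
        tk->xtime_ns        += delta_ns;        /* wall clock advances */
        tk->wall_to_mono_ns -= delta_ns;        /* monotonic stays put */
        tk->offs_boot_ns    += delta_ns;        /* boottime includes the sleep */
}
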
865 1068
@@ -875,7 +1078,8 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
875 */ 1078 */
876void timekeeping_inject_sleeptime(struct timespec *delta) 1079void timekeeping_inject_sleeptime(struct timespec *delta)
877{ 1080{
878 struct timekeeper *tk = &timekeeper; 1081 struct timekeeper *tk = &tk_core.timekeeper;
1082 struct timespec64 tmp;
879 unsigned long flags; 1083 unsigned long flags;
880 1084
881 /* 1085 /*
@@ -886,15 +1090,16 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
886 return; 1090 return;
887 1091
888 raw_spin_lock_irqsave(&timekeeper_lock, flags); 1092 raw_spin_lock_irqsave(&timekeeper_lock, flags);
889 write_seqcount_begin(&timekeeper_seq); 1093 write_seqcount_begin(&tk_core.seq);
890 1094
891 timekeeping_forward_now(tk); 1095 timekeeping_forward_now(tk);
892 1096
893 __timekeeping_inject_sleeptime(tk, delta); 1097 tmp = timespec_to_timespec64(*delta);
1098 __timekeeping_inject_sleeptime(tk, &tmp);
894 1099
895 timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); 1100 timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
896 1101
897 write_seqcount_end(&timekeeper_seq); 1102 write_seqcount_end(&tk_core.seq);
898 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 1103 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
899 1104
900 /* signal hrtimers about time change */ 1105 /* signal hrtimers about time change */
@@ -910,20 +1115,22 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
910 */ 1115 */
911static void timekeeping_resume(void) 1116static void timekeeping_resume(void)
912{ 1117{
913 struct timekeeper *tk = &timekeeper; 1118 struct timekeeper *tk = &tk_core.timekeeper;
914 struct clocksource *clock = tk->clock; 1119 struct clocksource *clock = tk->tkr.clock;
915 unsigned long flags; 1120 unsigned long flags;
916 struct timespec ts_new, ts_delta; 1121 struct timespec64 ts_new, ts_delta;
1122 struct timespec tmp;
917 cycle_t cycle_now, cycle_delta; 1123 cycle_t cycle_now, cycle_delta;
918 bool suspendtime_found = false; 1124 bool suspendtime_found = false;
919 1125
920 read_persistent_clock(&ts_new); 1126 read_persistent_clock(&tmp);
1127 ts_new = timespec_to_timespec64(tmp);
921 1128
922 clockevents_resume(); 1129 clockevents_resume();
923 clocksource_resume(); 1130 clocksource_resume();
924 1131
925 raw_spin_lock_irqsave(&timekeeper_lock, flags); 1132 raw_spin_lock_irqsave(&timekeeper_lock, flags);
926 write_seqcount_begin(&timekeeper_seq); 1133 write_seqcount_begin(&tk_core.seq);
927 1134
928 /* 1135 /*
929 * After system resumes, we need to calculate the suspended time and 1136 * After system resumes, we need to calculate the suspended time and
@@ -937,15 +1144,16 @@ static void timekeeping_resume(void)
937 * The less preferred source will only be tried if there is no better 1144 * The less preferred source will only be tried if there is no better
938 * usable source. The rtc part is handled separately in rtc core code. 1145 * usable source. The rtc part is handled separately in rtc core code.
939 */ 1146 */
940 cycle_now = clock->read(clock); 1147 cycle_now = tk->tkr.read(clock);
941 if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) && 1148 if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) &&
942 cycle_now > clock->cycle_last) { 1149 cycle_now > tk->tkr.cycle_last) {
943 u64 num, max = ULLONG_MAX; 1150 u64 num, max = ULLONG_MAX;
944 u32 mult = clock->mult; 1151 u32 mult = clock->mult;
945 u32 shift = clock->shift; 1152 u32 shift = clock->shift;
946 s64 nsec = 0; 1153 s64 nsec = 0;
947 1154
948 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; 1155 cycle_delta = clocksource_delta(cycle_now, tk->tkr.cycle_last,
1156 tk->tkr.mask);
949 1157
950 /* 1158 /*
951 * "cycle_delta * mutl" may cause 64 bits overflow, if the 1159 * "cycle_delta * mutl" may cause 64 bits overflow, if the
@@ -960,10 +1168,10 @@ static void timekeeping_resume(void)
960 } 1168 }
961 nsec += ((u64) cycle_delta * mult) >> shift; 1169 nsec += ((u64) cycle_delta * mult) >> shift;
962 1170
963 ts_delta = ns_to_timespec(nsec); 1171 ts_delta = ns_to_timespec64(nsec);
964 suspendtime_found = true; 1172 suspendtime_found = true;
965 } else if (timespec_compare(&ts_new, &timekeeping_suspend_time) > 0) { 1173 } else if (timespec64_compare(&ts_new, &timekeeping_suspend_time) > 0) {
966 ts_delta = timespec_sub(ts_new, timekeeping_suspend_time); 1174 ts_delta = timespec64_sub(ts_new, timekeeping_suspend_time);
967 suspendtime_found = true; 1175 suspendtime_found = true;
968 } 1176 }
969 1177
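
The branch above converts the raw cycle delta to nanoseconds with the clocksource's mult/shift pair, chunking the multiplication so a long suspend interval cannot overflow 64 bits. The same idea as a standalone sketch (illustrative; it mirrors the loop above rather than replacing it):

#include <linux/math64.h>

static u64 toy_suspend_cycles_to_ns(u64 delta, u32 mult, u32 shift)
{
        u64 max = div_u64(ULLONG_MAX, mult);    /* largest chunk that cannot overflow */
        u64 nsec = 0;

        while (delta > max) {
                nsec += (max * mult) >> shift;
                delta -= max;
        }
        return nsec + ((delta * mult) >> shift);
}
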
@@ -971,11 +1179,11 @@ static void timekeeping_resume(void)
971 __timekeeping_inject_sleeptime(tk, &ts_delta); 1179 __timekeeping_inject_sleeptime(tk, &ts_delta);
972 1180
973 /* Re-base the last cycle value */ 1181 /* Re-base the last cycle value */
974 tk->cycle_last = clock->cycle_last = cycle_now; 1182 tk->tkr.cycle_last = cycle_now;
975 tk->ntp_error = 0; 1183 tk->ntp_error = 0;
976 timekeeping_suspended = 0; 1184 timekeeping_suspended = 0;
977 timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); 1185 timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
978 write_seqcount_end(&timekeeper_seq); 1186 write_seqcount_end(&tk_core.seq);
979 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 1187 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
980 1188
981 touch_softlockup_watchdog(); 1189 touch_softlockup_watchdog();
@@ -988,12 +1196,14 @@ static void timekeeping_resume(void)
988 1196
989static int timekeeping_suspend(void) 1197static int timekeeping_suspend(void)
990{ 1198{
991 struct timekeeper *tk = &timekeeper; 1199 struct timekeeper *tk = &tk_core.timekeeper;
992 unsigned long flags; 1200 unsigned long flags;
993 struct timespec delta, delta_delta; 1201 struct timespec64 delta, delta_delta;
994 static struct timespec old_delta; 1202 static struct timespec64 old_delta;
1203 struct timespec tmp;
995 1204
996 read_persistent_clock(&timekeeping_suspend_time); 1205 read_persistent_clock(&tmp);
1206 timekeeping_suspend_time = timespec_to_timespec64(tmp);
997 1207
998 /* 1208 /*
999 * On some systems the persistent_clock can not be detected at 1209 * On some systems the persistent_clock can not be detected at
@@ -1004,7 +1214,7 @@ static int timekeeping_suspend(void)
1004 persistent_clock_exist = true; 1214 persistent_clock_exist = true;
1005 1215
1006 raw_spin_lock_irqsave(&timekeeper_lock, flags); 1216 raw_spin_lock_irqsave(&timekeeper_lock, flags);
1007 write_seqcount_begin(&timekeeper_seq); 1217 write_seqcount_begin(&tk_core.seq);
1008 timekeeping_forward_now(tk); 1218 timekeeping_forward_now(tk);
1009 timekeeping_suspended = 1; 1219 timekeeping_suspended = 1;
1010 1220
@@ -1014,8 +1224,8 @@ static int timekeeping_suspend(void)
1014 * try to compensate so the difference in system time 1224 * try to compensate so the difference in system time
1015 * and persistent_clock time stays close to constant. 1225 * and persistent_clock time stays close to constant.
1016 */ 1226 */
1017 delta = timespec_sub(tk_xtime(tk), timekeeping_suspend_time); 1227 delta = timespec64_sub(tk_xtime(tk), timekeeping_suspend_time);
1018 delta_delta = timespec_sub(delta, old_delta); 1228 delta_delta = timespec64_sub(delta, old_delta);
1019 if (abs(delta_delta.tv_sec) >= 2) { 1229 if (abs(delta_delta.tv_sec) >= 2) {
1020 /* 1230 /*
1021 * if delta_delta is too large, assume time correction 1231 * if delta_delta is too large, assume time correction
@@ -1025,11 +1235,11 @@ static int timekeeping_suspend(void)
1025 } else { 1235 } else {
1026 /* Otherwise try to adjust old_system to compensate */ 1236 /* Otherwise try to adjust old_system to compensate */
1027 timekeeping_suspend_time = 1237 timekeeping_suspend_time =
1028 timespec_add(timekeeping_suspend_time, delta_delta); 1238 timespec64_add(timekeeping_suspend_time, delta_delta);
1029 } 1239 }
1030 1240
1031 timekeeping_update(tk, TK_MIRROR); 1241 timekeeping_update(tk, TK_MIRROR);
1032 write_seqcount_end(&timekeeper_seq); 1242 write_seqcount_end(&tk_core.seq);
1033 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 1243 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
1034 1244
1035 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); 1245 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
@@ -1050,125 +1260,34 @@ static int __init timekeeping_init_ops(void)
1050 register_syscore_ops(&timekeeping_syscore_ops); 1260 register_syscore_ops(&timekeeping_syscore_ops);
1051 return 0; 1261 return 0;
1052} 1262}
1053
1054device_initcall(timekeeping_init_ops); 1263device_initcall(timekeeping_init_ops);
1055 1264
1056/* 1265/*
1057 * If the error is already larger, we look ahead even further 1266 * Apply a multiplier adjustment to the timekeeper
1058 * to compensate for late or lost adjustments.
1059 */ 1267 */
1060static __always_inline int timekeeping_bigadjust(struct timekeeper *tk, 1268static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk,
1061 s64 error, s64 *interval, 1269 s64 offset,
1062 s64 *offset) 1270 bool negative,
1271 int adj_scale)
1063{ 1272{
1064 s64 tick_error, i; 1273 s64 interval = tk->cycle_interval;
1065 u32 look_ahead, adj; 1274 s32 mult_adj = 1;
1066 s32 error2, mult;
1067
1068 /*
1069 * Use the current error value to determine how much to look ahead.
1070 * The larger the error the slower we adjust for it to avoid problems
1071 * with losing too many ticks, otherwise we would overadjust and
1072 * produce an even larger error. The smaller the adjustment the
1073 * faster we try to adjust for it, as lost ticks can do less harm
1074 * here. This is tuned so that an error of about 1 msec is adjusted
1075 * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks).
1076 */
1077 error2 = tk->ntp_error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ);
1078 error2 = abs(error2);
1079 for (look_ahead = 0; error2 > 0; look_ahead++)
1080 error2 >>= 2;
1081 1275
1082 /* 1276 if (negative) {
1083 * Now calculate the error in (1 << look_ahead) ticks, but first 1277 mult_adj = -mult_adj;
1084 * remove the single look ahead already included in the error. 1278 interval = -interval;
1085 */ 1279 offset = -offset;
1086 tick_error = ntp_tick_length() >> (tk->ntp_error_shift + 1);
1087 tick_error -= tk->xtime_interval >> 1;
1088 error = ((error - tick_error) >> look_ahead) + tick_error;
1089
1090 /* Finally calculate the adjustment shift value. */
1091 i = *interval;
1092 mult = 1;
1093 if (error < 0) {
1094 error = -error;
1095 *interval = -*interval;
1096 *offset = -*offset;
1097 mult = -1;
1098 } 1280 }
1099 for (adj = 0; error > i; adj++) 1281 mult_adj <<= adj_scale;
1100 error >>= 1; 1282 interval <<= adj_scale;
1101 1283 offset <<= adj_scale;
1102 *interval <<= adj;
1103 *offset <<= adj;
1104 return mult << adj;
1105}
1106
1107/*
1108 * Adjust the multiplier to reduce the error value,
1109 * this is optimized for the most common adjustments of -1,0,1,
1110 * for other values we can do a bit more work.
1111 */
1112static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
1113{
1114 s64 error, interval = tk->cycle_interval;
1115 int adj;
1116 1284
1117 /* 1285 /*
1118 * The point of this is to check if the error is greater than half
1119 * an interval.
1120 *
1121 * First we shift it down from NTP_SHIFT to clocksource->shifted nsecs.
1122 *
1123 * Note we subtract one in the shift, so that error is really error*2.
1124 * This "saves" dividing(shifting) interval twice, but keeps the
1125 * (error > interval) comparison as still measuring if error is
1126 * larger than half an interval.
1127 *
1128 * Note: It does not "save" on aggravation when reading the code.
1129 */
1130 error = tk->ntp_error >> (tk->ntp_error_shift - 1);
1131 if (error > interval) {
1132 /*
1133 * We now divide error by 4(via shift), which checks if
1134 * the error is greater than twice the interval.
1135 * If it is greater, we need a bigadjust, if its smaller,
1136 * we can adjust by 1.
1137 */
1138 error >>= 2;
1139 if (likely(error <= interval))
1140 adj = 1;
1141 else
1142 adj = timekeeping_bigadjust(tk, error, &interval, &offset);
1143 } else {
1144 if (error < -interval) {
1145 /* See comment above, this is just switched for the negative */
1146 error >>= 2;
1147 if (likely(error >= -interval)) {
1148 adj = -1;
1149 interval = -interval;
1150 offset = -offset;
1151 } else {
1152 adj = timekeeping_bigadjust(tk, error, &interval, &offset);
1153 }
1154 } else {
1155 goto out_adjust;
1156 }
1157 }
1158
1159 if (unlikely(tk->clock->maxadj &&
1160 (tk->mult + adj > tk->clock->mult + tk->clock->maxadj))) {
1161 printk_deferred_once(KERN_WARNING
1162 "Adjusting %s more than 11%% (%ld vs %ld)\n",
1163 tk->clock->name, (long)tk->mult + adj,
1164 (long)tk->clock->mult + tk->clock->maxadj);
1165 }
1166 /*
1167 * So the following can be confusing. 1286 * So the following can be confusing.
1168 * 1287 *
1169 * To keep things simple, lets assume adj == 1 for now. 1288 * To keep things simple, lets assume mult_adj == 1 for now.
1170 * 1289 *
1171 * When adj != 1, remember that the interval and offset values 1290 * When mult_adj != 1, remember that the interval and offset values
1172 * have been appropriately scaled so the math is the same. 1291 * have been appropriately scaled so the math is the same.
1173 * 1292 *
1174 * The basic idea here is that we're increasing the multiplier 1293 * The basic idea here is that we're increasing the multiplier
@@ -1212,12 +1331,78 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
1212 * 1331 *
1213 * XXX - TODO: Doc ntp_error calculation. 1332 * XXX - TODO: Doc ntp_error calculation.
1214 */ 1333 */
1215 tk->mult += adj; 1334 tk->tkr.mult += mult_adj;
1216 tk->xtime_interval += interval; 1335 tk->xtime_interval += interval;
1217 tk->xtime_nsec -= offset; 1336 tk->tkr.xtime_nsec -= offset;
1218 tk->ntp_error -= (interval - offset) << tk->ntp_error_shift; 1337 tk->ntp_error -= (interval - offset) << tk->ntp_error_shift;
1338}
1339
1340/*
1341 * Calculate the multiplier adjustment needed to match the frequency
1342 * specified by NTP
1343 */
1344static __always_inline void timekeeping_freqadjust(struct timekeeper *tk,
1345 s64 offset)
1346{
1347 s64 interval = tk->cycle_interval;
1348 s64 xinterval = tk->xtime_interval;
1349 s64 tick_error;
1350 bool negative;
1351 u32 adj;
1352
1353 /* Remove any current error adj from freq calculation */
1354 if (tk->ntp_err_mult)
1355 xinterval -= tk->cycle_interval;
1356
1357 tk->ntp_tick = ntp_tick_length();
1358
1359 /* Calculate current error per tick */
1360 tick_error = ntp_tick_length() >> tk->ntp_error_shift;
1361 tick_error -= (xinterval + tk->xtime_remainder);
1362
1363 /* Don't worry about correcting it if it's small */
1364 if (likely((tick_error >= 0) && (tick_error <= interval)))
1365 return;
1366
1367 /* preserve the direction of correction */
1368 negative = (tick_error < 0);
1369
1370 /* Sort out the magnitude of the correction */
1371 tick_error = abs(tick_error);
1372 for (adj = 0; tick_error > interval; adj++)
1373 tick_error >>= 1;
1374
1375 /* scale the corrections */
1376 timekeeping_apply_adjustment(tk, offset, negative, adj);
1377}
1378
1379/*
1380 * Adjust the timekeeper's multiplier to the correct frequency
1381 * and also to reduce the accumulated error value.
1382 */
1383static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
1384{
1385 /* Correct for the current frequency error */
1386 timekeeping_freqadjust(tk, offset);
1387
1388 /* Next make a small adjustment to fix any cumulative error */
1389 if (!tk->ntp_err_mult && (tk->ntp_error > 0)) {
1390 tk->ntp_err_mult = 1;
1391 timekeeping_apply_adjustment(tk, offset, 0, 0);
1392 } else if (tk->ntp_err_mult && (tk->ntp_error <= 0)) {
1393 /* Undo any existing error adjustment */
1394 timekeeping_apply_adjustment(tk, offset, 1, 0);
1395 tk->ntp_err_mult = 0;
1396 }
1397
1398 if (unlikely(tk->tkr.clock->maxadj &&
1399 (tk->tkr.mult > tk->tkr.clock->mult + tk->tkr.clock->maxadj))) {
1400 printk_once(KERN_WARNING
1401 "Adjusting %s more than 11%% (%ld vs %ld)\n",
1402 tk->tkr.clock->name, (long)tk->tkr.mult,
1403 (long)tk->tkr.clock->mult + tk->tkr.clock->maxadj);
1404 }
1219 1405
1220out_adjust:
1221 /* 1406 /*
1222 * It may be possible that when we entered this function, xtime_nsec 1407 * It may be possible that when we entered this function, xtime_nsec
1223 * was very small. Further, if we're slightly speeding the clocksource 1408 * was very small. Further, if we're slightly speeding the clocksource
@@ -1232,12 +1417,11 @@ out_adjust:
1232 * We'll correct this error next time through this function, when 1417 * We'll correct this error next time through this function, when
1233 * xtime_nsec is not as small. 1418 * xtime_nsec is not as small.
1234 */ 1419 */
1235 if (unlikely((s64)tk->xtime_nsec < 0)) { 1420 if (unlikely((s64)tk->tkr.xtime_nsec < 0)) {
1236 s64 neg = -(s64)tk->xtime_nsec; 1421 s64 neg = -(s64)tk->tkr.xtime_nsec;
1237 tk->xtime_nsec = 0; 1422 tk->tkr.xtime_nsec = 0;
1238 tk->ntp_error += neg << tk->ntp_error_shift; 1423 tk->ntp_error += neg << tk->ntp_error_shift;
1239 } 1424 }
1240
1241} 1425}
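
timekeeping_freqadjust() picks the adjustment magnitude by halving the per-tick error until it fits within one cycle interval; the resulting shift count then scales a +/-1 multiplier step in timekeeping_apply_adjustment(). The magnitude search on its own, as an illustrative helper:

static u32 toy_adj_scale(s64 tick_error, s64 interval)
{
        u32 adj = 0;

        tick_error = abs(tick_error);           /* direction handled separately */
        while (tick_error > interval) {
                tick_error >>= 1;               /* halve until it fits one interval */
                adj++;
        }
        return adj;                             /* mult changes by +/- (1 << adj) */
}
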
1242 1426
1243/** 1427/**
@@ -1250,26 +1434,26 @@ out_adjust:
1250 */ 1434 */
1251static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) 1435static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk)
1252{ 1436{
1253 u64 nsecps = (u64)NSEC_PER_SEC << tk->shift; 1437 u64 nsecps = (u64)NSEC_PER_SEC << tk->tkr.shift;
1254 unsigned int clock_set = 0; 1438 unsigned int clock_set = 0;
1255 1439
1256 while (tk->xtime_nsec >= nsecps) { 1440 while (tk->tkr.xtime_nsec >= nsecps) {
1257 int leap; 1441 int leap;
1258 1442
1259 tk->xtime_nsec -= nsecps; 1443 tk->tkr.xtime_nsec -= nsecps;
1260 tk->xtime_sec++; 1444 tk->xtime_sec++;
1261 1445
1262 /* Figure out if its a leap sec and apply if needed */ 1446 /* Figure out if its a leap sec and apply if needed */
1263 leap = second_overflow(tk->xtime_sec); 1447 leap = second_overflow(tk->xtime_sec);
1264 if (unlikely(leap)) { 1448 if (unlikely(leap)) {
1265 struct timespec ts; 1449 struct timespec64 ts;
1266 1450
1267 tk->xtime_sec += leap; 1451 tk->xtime_sec += leap;
1268 1452
1269 ts.tv_sec = leap; 1453 ts.tv_sec = leap;
1270 ts.tv_nsec = 0; 1454 ts.tv_nsec = 0;
1271 tk_set_wall_to_mono(tk, 1455 tk_set_wall_to_mono(tk,
1272 timespec_sub(tk->wall_to_monotonic, ts)); 1456 timespec64_sub(tk->wall_to_monotonic, ts));
1273 1457
1274 __timekeeping_set_tai_offset(tk, tk->tai_offset - leap); 1458 __timekeeping_set_tai_offset(tk, tk->tai_offset - leap);
1275 1459
@@ -1301,9 +1485,9 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
1301 1485
1302 /* Accumulate one shifted interval */ 1486 /* Accumulate one shifted interval */
1303 offset -= interval; 1487 offset -= interval;
1304 tk->cycle_last += interval; 1488 tk->tkr.cycle_last += interval;
1305 1489
1306 tk->xtime_nsec += tk->xtime_interval << shift; 1490 tk->tkr.xtime_nsec += tk->xtime_interval << shift;
1307 *clock_set |= accumulate_nsecs_to_secs(tk); 1491 *clock_set |= accumulate_nsecs_to_secs(tk);
1308 1492
1309 /* Accumulate raw time */ 1493 /* Accumulate raw time */
@@ -1317,48 +1501,20 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
1317 tk->raw_time.tv_nsec = raw_nsecs; 1501 tk->raw_time.tv_nsec = raw_nsecs;
1318 1502
1319 /* Accumulate error between NTP and clock interval */ 1503 /* Accumulate error between NTP and clock interval */
1320 tk->ntp_error += ntp_tick_length() << shift; 1504 tk->ntp_error += tk->ntp_tick << shift;
1321 tk->ntp_error -= (tk->xtime_interval + tk->xtime_remainder) << 1505 tk->ntp_error -= (tk->xtime_interval + tk->xtime_remainder) <<
1322 (tk->ntp_error_shift + shift); 1506 (tk->ntp_error_shift + shift);
1323 1507
1324 return offset; 1508 return offset;
1325} 1509}
1326 1510
1327#ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD
1328static inline void old_vsyscall_fixup(struct timekeeper *tk)
1329{
1330 s64 remainder;
1331
1332 /*
1333 * Store only full nanoseconds into xtime_nsec after rounding
1334 * it up and add the remainder to the error difference.
1335 * XXX - This is necessary to avoid small 1ns inconsistencies caused
1336 * by truncating the remainder in vsyscalls. However, it causes
1337 * additional work to be done in timekeeping_adjust(). Once
1338 * the vsyscall implementations are converted to use xtime_nsec
1339 * (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD
1340 * users are removed, this can be killed.
1341 */
1342 remainder = tk->xtime_nsec & ((1ULL << tk->shift) - 1);
1343 tk->xtime_nsec -= remainder;
1344 tk->xtime_nsec += 1ULL << tk->shift;
1345 tk->ntp_error += remainder << tk->ntp_error_shift;
1346 tk->ntp_error -= (1ULL << tk->shift) << tk->ntp_error_shift;
1347}
1348#else
1349#define old_vsyscall_fixup(tk)
1350#endif
1351
1352
1353
1354/** 1511/**
1355 * update_wall_time - Uses the current clocksource to increment the wall time 1512 * update_wall_time - Uses the current clocksource to increment the wall time
1356 * 1513 *
1357 */ 1514 */
1358void update_wall_time(void) 1515void update_wall_time(void)
1359{ 1516{
1360 struct clocksource *clock; 1517 struct timekeeper *real_tk = &tk_core.timekeeper;
1361 struct timekeeper *real_tk = &timekeeper;
1362 struct timekeeper *tk = &shadow_timekeeper; 1518 struct timekeeper *tk = &shadow_timekeeper;
1363 cycle_t offset; 1519 cycle_t offset;
1364 int shift = 0, maxshift; 1520 int shift = 0, maxshift;
@@ -1371,12 +1527,11 @@ void update_wall_time(void)
1371 if (unlikely(timekeeping_suspended)) 1527 if (unlikely(timekeeping_suspended))
1372 goto out; 1528 goto out;
1373 1529
1374 clock = real_tk->clock;
1375
1376#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET 1530#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
1377 offset = real_tk->cycle_interval; 1531 offset = real_tk->cycle_interval;
1378#else 1532#else
1379 offset = (clock->read(clock) - clock->cycle_last) & clock->mask; 1533 offset = clocksource_delta(tk->tkr.read(tk->tkr.clock),
1534 tk->tkr.cycle_last, tk->tkr.mask);
1380#endif 1535#endif
1381 1536
1382 /* Check if there's really nothing to do */ 1537 /* Check if there's really nothing to do */
@@ -1418,9 +1573,7 @@ void update_wall_time(void)
1418 */ 1573 */
1419 clock_set |= accumulate_nsecs_to_secs(tk); 1574 clock_set |= accumulate_nsecs_to_secs(tk);
1420 1575
1421 write_seqcount_begin(&timekeeper_seq); 1576 write_seqcount_begin(&tk_core.seq);
1422 /* Update clock->cycle_last with the new value */
1423 clock->cycle_last = tk->cycle_last;
1424 /* 1577 /*
1425 * Update the real timekeeper. 1578 * Update the real timekeeper.
1426 * 1579 *
@@ -1428,12 +1581,12 @@ void update_wall_time(void)
1428 * requires changes to all other timekeeper usage sites as 1581 * requires changes to all other timekeeper usage sites as
1429 * well, i.e. move the timekeeper pointer getter into the 1582 * well, i.e. move the timekeeper pointer getter into the
1430 * spinlocked/seqcount protected sections. And we trade this 1583 * spinlocked/seqcount protected sections. And we trade this
1431 * memcpy under the timekeeper_seq against one before we start 1584 * memcpy under the tk_core.seq against one before we start
1432 * updating. 1585 * updating.
1433 */ 1586 */
1434 memcpy(real_tk, tk, sizeof(*tk)); 1587 memcpy(real_tk, tk, sizeof(*tk));
1435 timekeeping_update(real_tk, clock_set); 1588 timekeeping_update(real_tk, clock_set);
1436 write_seqcount_end(&timekeeper_seq); 1589 write_seqcount_end(&tk_core.seq);
1437out: 1590out:
1438 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 1591 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
1439 if (clock_set) 1592 if (clock_set)
@@ -1454,83 +1607,16 @@ out:
1454 */ 1607 */
1455void getboottime(struct timespec *ts) 1608void getboottime(struct timespec *ts)
1456{ 1609{
1457 struct timekeeper *tk = &timekeeper; 1610 struct timekeeper *tk = &tk_core.timekeeper;
1458 struct timespec boottime = { 1611 ktime_t t = ktime_sub(tk->offs_real, tk->offs_boot);
1459 .tv_sec = tk->wall_to_monotonic.tv_sec +
1460 tk->total_sleep_time.tv_sec,
1461 .tv_nsec = tk->wall_to_monotonic.tv_nsec +
1462 tk->total_sleep_time.tv_nsec
1463 };
1464
1465 set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec);
1466}
1467EXPORT_SYMBOL_GPL(getboottime);
1468
1469/**
1470 * get_monotonic_boottime - Returns monotonic time since boot
1471 * @ts: pointer to the timespec to be set
1472 *
1473 * Returns the monotonic time since boot in a timespec.
1474 *
1475 * This is similar to CLOCK_MONOTONIC/ktime_get_ts, but also
1476 * includes the time spent in suspend.
1477 */
1478void get_monotonic_boottime(struct timespec *ts)
1479{
1480 struct timekeeper *tk = &timekeeper;
1481 struct timespec tomono, sleep;
1482 s64 nsec;
1483 unsigned int seq;
1484
1485 WARN_ON(timekeeping_suspended);
1486
1487 do {
1488 seq = read_seqcount_begin(&timekeeper_seq);
1489 ts->tv_sec = tk->xtime_sec;
1490 nsec = timekeeping_get_ns(tk);
1491 tomono = tk->wall_to_monotonic;
1492 sleep = tk->total_sleep_time;
1493
1494 } while (read_seqcount_retry(&timekeeper_seq, seq));
1495
1496 ts->tv_sec += tomono.tv_sec + sleep.tv_sec;
1497 ts->tv_nsec = 0;
1498 timespec_add_ns(ts, nsec + tomono.tv_nsec + sleep.tv_nsec);
1499}
1500EXPORT_SYMBOL_GPL(get_monotonic_boottime);
1501
1502/**
1503 * ktime_get_boottime - Returns monotonic time since boot in a ktime
1504 *
1505 * Returns the monotonic time since boot in a ktime
1506 *
1507 * This is similar to CLOCK_MONOTONIC/ktime_get, but also
1508 * includes the time spent in suspend.
1509 */
1510ktime_t ktime_get_boottime(void)
1511{
1512 struct timespec ts;
1513
1514 get_monotonic_boottime(&ts);
1515 return timespec_to_ktime(ts);
1516}
1517EXPORT_SYMBOL_GPL(ktime_get_boottime);
1518
1519/**
1520 * monotonic_to_bootbased - Convert the monotonic time to boot based.
1521 * @ts: pointer to the timespec to be converted
1522 */
1523void monotonic_to_bootbased(struct timespec *ts)
1524{
1525 struct timekeeper *tk = &timekeeper;
1526 1612
1527 *ts = timespec_add(*ts, tk->total_sleep_time); 1613 *ts = ktime_to_timespec(t);
1528} 1614}
1529EXPORT_SYMBOL_GPL(monotonic_to_bootbased); 1615EXPORT_SYMBOL_GPL(getboottime);
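
The rewritten getboottime() leans on a simple identity: for any monotonic instant t, wall = t + offs_real and boot = t + offs_boot, so the wall-clock value of the boot instant (where boot == 0) is just offs_real - offs_boot, independent of the current monotonic reading. A one-line restatement, as an illustration:

/* Wall-clock time of the boot instant, given the two offsets (sketch). */
static ktime_t toy_boot_epoch(ktime_t offs_real, ktime_t offs_boot)
{
        return ktime_sub(offs_real, offs_boot);
}
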
1530 1616
1531unsigned long get_seconds(void) 1617unsigned long get_seconds(void)
1532{ 1618{
1533 struct timekeeper *tk = &timekeeper; 1619 struct timekeeper *tk = &tk_core.timekeeper;
1534 1620
1535 return tk->xtime_sec; 1621 return tk->xtime_sec;
1536} 1622}
@@ -1538,43 +1624,44 @@ EXPORT_SYMBOL(get_seconds);
1538 1624
1539struct timespec __current_kernel_time(void) 1625struct timespec __current_kernel_time(void)
1540{ 1626{
1541 struct timekeeper *tk = &timekeeper; 1627 struct timekeeper *tk = &tk_core.timekeeper;
1542 1628
1543 return tk_xtime(tk); 1629 return timespec64_to_timespec(tk_xtime(tk));
1544} 1630}
1545 1631
1546struct timespec current_kernel_time(void) 1632struct timespec current_kernel_time(void)
1547{ 1633{
1548 struct timekeeper *tk = &timekeeper; 1634 struct timekeeper *tk = &tk_core.timekeeper;
1549 struct timespec now; 1635 struct timespec64 now;
1550 unsigned long seq; 1636 unsigned long seq;
1551 1637
1552 do { 1638 do {
1553 seq = read_seqcount_begin(&timekeeper_seq); 1639 seq = read_seqcount_begin(&tk_core.seq);
1554 1640
1555 now = tk_xtime(tk); 1641 now = tk_xtime(tk);
1556 } while (read_seqcount_retry(&timekeeper_seq, seq)); 1642 } while (read_seqcount_retry(&tk_core.seq, seq));
1557 1643
1558 return now; 1644 return timespec64_to_timespec(now);
1559} 1645}
1560EXPORT_SYMBOL(current_kernel_time); 1646EXPORT_SYMBOL(current_kernel_time);
1561 1647
1562struct timespec get_monotonic_coarse(void) 1648struct timespec get_monotonic_coarse(void)
1563{ 1649{
1564 struct timekeeper *tk = &timekeeper; 1650 struct timekeeper *tk = &tk_core.timekeeper;
1565 struct timespec now, mono; 1651 struct timespec64 now, mono;
1566 unsigned long seq; 1652 unsigned long seq;
1567 1653
1568 do { 1654 do {
1569 seq = read_seqcount_begin(&timekeeper_seq); 1655 seq = read_seqcount_begin(&tk_core.seq);
1570 1656
1571 now = tk_xtime(tk); 1657 now = tk_xtime(tk);
1572 mono = tk->wall_to_monotonic; 1658 mono = tk->wall_to_monotonic;
1573 } while (read_seqcount_retry(&timekeeper_seq, seq)); 1659 } while (read_seqcount_retry(&tk_core.seq, seq));
1574 1660
1575 set_normalized_timespec(&now, now.tv_sec + mono.tv_sec, 1661 set_normalized_timespec64(&now, now.tv_sec + mono.tv_sec,
1576 now.tv_nsec + mono.tv_nsec); 1662 now.tv_nsec + mono.tv_nsec);
1577 return now; 1663
1664 return timespec64_to_timespec(now);
1578} 1665}
1579 1666
1580/* 1667/*
@@ -1587,29 +1674,38 @@ void do_timer(unsigned long ticks)
1587} 1674}
1588 1675
1589/** 1676/**
1590 * get_xtime_and_monotonic_and_sleep_offset() - get xtime, wall_to_monotonic, 1677 * ktime_get_update_offsets_tick - hrtimer helper
1591 * and sleep offsets. 1678 * @offs_real: pointer to storage for monotonic -> realtime offset
1592 * @xtim: pointer to timespec to be set with xtime 1679 * @offs_boot: pointer to storage for monotonic -> boottime offset
1593 * @wtom: pointer to timespec to be set with wall_to_monotonic 1680 * @offs_tai: pointer to storage for monotonic -> clock tai offset
1594 * @sleep: pointer to timespec to be set with time in suspend 1681 *
1682 * Returns monotonic time at last tick and various offsets
1595 */ 1683 */
1596void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, 1684ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real, ktime_t *offs_boot,
1597 struct timespec *wtom, struct timespec *sleep) 1685 ktime_t *offs_tai)
1598{ 1686{
1599 struct timekeeper *tk = &timekeeper; 1687 struct timekeeper *tk = &tk_core.timekeeper;
1600 unsigned long seq; 1688 unsigned int seq;
1689 ktime_t base;
1690 u64 nsecs;
1601 1691
1602 do { 1692 do {
1603 seq = read_seqcount_begin(&timekeeper_seq); 1693 seq = read_seqcount_begin(&tk_core.seq);
1604 *xtim = tk_xtime(tk); 1694
1605 *wtom = tk->wall_to_monotonic; 1695 base = tk->tkr.base_mono;
1606 *sleep = tk->total_sleep_time; 1696 nsecs = tk->tkr.xtime_nsec >> tk->tkr.shift;
1607 } while (read_seqcount_retry(&timekeeper_seq, seq)); 1697
1698 *offs_real = tk->offs_real;
1699 *offs_boot = tk->offs_boot;
1700 *offs_tai = tk->offs_tai;
1701 } while (read_seqcount_retry(&tk_core.seq, seq));
1702
1703 return ktime_add_ns(base, nsecs);
1608} 1704}
1609 1705
1610#ifdef CONFIG_HIGH_RES_TIMERS 1706#ifdef CONFIG_HIGH_RES_TIMERS
1611/** 1707/**
1612 * ktime_get_update_offsets - hrtimer helper 1708 * ktime_get_update_offsets_now - hrtimer helper
1613 * @offs_real: pointer to storage for monotonic -> realtime offset 1709 * @offs_real: pointer to storage for monotonic -> realtime offset
1614 * @offs_boot: pointer to storage for monotonic -> boottime offset 1710 * @offs_boot: pointer to storage for monotonic -> boottime offset
1615 * @offs_tai: pointer to storage for monotonic -> clock tai offset 1711 * @offs_tai: pointer to storage for monotonic -> clock tai offset
@@ -1617,57 +1713,37 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
1617 * Returns current monotonic time and updates the offsets 1713 * Returns current monotonic time and updates the offsets
1618 * Called from hrtimer_interrupt() or retrigger_next_event() 1714 * Called from hrtimer_interrupt() or retrigger_next_event()
1619 */ 1715 */
1620ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot, 1716ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, ktime_t *offs_boot,
1621 ktime_t *offs_tai) 1717 ktime_t *offs_tai)
1622{ 1718{
1623 struct timekeeper *tk = &timekeeper; 1719 struct timekeeper *tk = &tk_core.timekeeper;
1624 ktime_t now;
1625 unsigned int seq; 1720 unsigned int seq;
1626 u64 secs, nsecs; 1721 ktime_t base;
1722 u64 nsecs;
1627 1723
1628 do { 1724 do {
1629 seq = read_seqcount_begin(&timekeeper_seq); 1725 seq = read_seqcount_begin(&tk_core.seq);
1630 1726
1631 secs = tk->xtime_sec; 1727 base = tk->tkr.base_mono;
1632 nsecs = timekeeping_get_ns(tk); 1728 nsecs = timekeeping_get_ns(&tk->tkr);
1633 1729
1634 *offs_real = tk->offs_real; 1730 *offs_real = tk->offs_real;
1635 *offs_boot = tk->offs_boot; 1731 *offs_boot = tk->offs_boot;
1636 *offs_tai = tk->offs_tai; 1732 *offs_tai = tk->offs_tai;
1637 } while (read_seqcount_retry(&timekeeper_seq, seq)); 1733 } while (read_seqcount_retry(&tk_core.seq, seq));
1638 1734
1639 now = ktime_add_ns(ktime_set(secs, 0), nsecs); 1735 return ktime_add_ns(base, nsecs);
1640 now = ktime_sub(now, *offs_real);
1641 return now;
1642} 1736}
1643#endif 1737#endif
1644 1738
1645/** 1739/**
1646 * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format
1647 */
1648ktime_t ktime_get_monotonic_offset(void)
1649{
1650 struct timekeeper *tk = &timekeeper;
1651 unsigned long seq;
1652 struct timespec wtom;
1653
1654 do {
1655 seq = read_seqcount_begin(&timekeeper_seq);
1656 wtom = tk->wall_to_monotonic;
1657 } while (read_seqcount_retry(&timekeeper_seq, seq));
1658
1659 return timespec_to_ktime(wtom);
1660}
1661EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset);
1662
1663/**
1664 * do_adjtimex() - Accessor function to NTP __do_adjtimex function 1740 * do_adjtimex() - Accessor function to NTP __do_adjtimex function
1665 */ 1741 */
1666int do_adjtimex(struct timex *txc) 1742int do_adjtimex(struct timex *txc)
1667{ 1743{
1668 struct timekeeper *tk = &timekeeper; 1744 struct timekeeper *tk = &tk_core.timekeeper;
1669 unsigned long flags; 1745 unsigned long flags;
1670 struct timespec ts; 1746 struct timespec64 ts;
1671 s32 orig_tai, tai; 1747 s32 orig_tai, tai;
1672 int ret; 1748 int ret;
1673 1749
@@ -1687,10 +1763,10 @@ int do_adjtimex(struct timex *txc)
1687 return ret; 1763 return ret;
1688 } 1764 }
1689 1765
1690 getnstimeofday(&ts); 1766 getnstimeofday64(&ts);
1691 1767
1692 raw_spin_lock_irqsave(&timekeeper_lock, flags); 1768 raw_spin_lock_irqsave(&timekeeper_lock, flags);
1693 write_seqcount_begin(&timekeeper_seq); 1769 write_seqcount_begin(&tk_core.seq);
1694 1770
1695 orig_tai = tai = tk->tai_offset; 1771 orig_tai = tai = tk->tai_offset;
1696 ret = __do_adjtimex(txc, &ts, &tai); 1772 ret = __do_adjtimex(txc, &ts, &tai);
@@ -1699,7 +1775,7 @@ int do_adjtimex(struct timex *txc)
1699 __timekeeping_set_tai_offset(tk, tai); 1775 __timekeeping_set_tai_offset(tk, tai);
1700 timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); 1776 timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
1701 } 1777 }
1702 write_seqcount_end(&timekeeper_seq); 1778 write_seqcount_end(&tk_core.seq);
1703 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 1779 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
1704 1780
1705 if (tai != orig_tai) 1781 if (tai != orig_tai)
@@ -1719,11 +1795,11 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
1719 unsigned long flags; 1795 unsigned long flags;
1720 1796
1721 raw_spin_lock_irqsave(&timekeeper_lock, flags); 1797 raw_spin_lock_irqsave(&timekeeper_lock, flags);
1722 write_seqcount_begin(&timekeeper_seq); 1798 write_seqcount_begin(&tk_core.seq);
1723 1799
1724 __hardpps(phase_ts, raw_ts); 1800 __hardpps(phase_ts, raw_ts);
1725 1801
1726 write_seqcount_end(&timekeeper_seq); 1802 write_seqcount_end(&tk_core.seq);
1727 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 1803 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
1728} 1804}
1729EXPORT_SYMBOL(hardpps); 1805EXPORT_SYMBOL(hardpps);
diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h
new file mode 100644
index 000000000000..adc1fc98bde3
--- /dev/null
+++ b/kernel/time/timekeeping.h
@@ -0,0 +1,20 @@
1#ifndef _KERNEL_TIME_TIMEKEEPING_H
2#define _KERNEL_TIME_TIMEKEEPING_H
3/*
4 * Internal interfaces for kernel/time/
5 */
6extern ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real,
7 ktime_t *offs_boot,
8 ktime_t *offs_tai);
9extern ktime_t ktime_get_update_offsets_now(ktime_t *offs_real,
10 ktime_t *offs_boot,
11 ktime_t *offs_tai);
12
13extern int timekeeping_valid_for_hres(void);
14extern u64 timekeeping_max_deferment(void);
15extern int timekeeping_inject_offset(struct timespec *ts);
16extern s32 timekeeping_get_tai_offset(void);
17extern void timekeeping_set_tai_offset(s32 tai_offset);
18extern void timekeeping_clocktai(struct timespec *ts);
19
20#endif
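These prototypes are intended for in-tree consumers such as the hrtimer and tick code. The sketch below shows one plausible way ktime_get_update_offsets_now() could be consumed; the surrounding function and variable names are invented for illustration, not taken from this patch.

/* Illustrative consumer: fetch monotonic "now" plus the offsets needed
 * to derive CLOCK_REALTIME / CLOCK_BOOTTIME / CLOCK_TAI expiry times. */
static void clock_event_sketch(void)
{
	ktime_t offs_real, offs_boot, offs_tai;
	ktime_t now_mono, now_real, now_boot;

	now_mono = ktime_get_update_offsets_now(&offs_real, &offs_boot, &offs_tai);
	now_real = ktime_add(now_mono, offs_real);	/* wall clock */
	now_boot = ktime_add(now_mono, offs_boot);	/* boot-based clock */

	/* ... expire MONOTONIC timers against now_mono, REALTIME timers
	 * against now_real, and so on ... */
}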
diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c
index 4d54f97558df..f6bd65236712 100644
--- a/kernel/time/timekeeping_debug.c
+++ b/kernel/time/timekeeping_debug.c
@@ -67,7 +67,7 @@ static int __init tk_debug_sleep_time_init(void)
67} 67}
68late_initcall(tk_debug_sleep_time_init); 68late_initcall(tk_debug_sleep_time_init);
69 69
70void tk_debug_account_sleep_time(struct timespec *t) 70void tk_debug_account_sleep_time(struct timespec64 *t)
71{ 71{
72 sleep_time_bin[fls(t->tv_sec)]++; 72 sleep_time_bin[fls(t->tv_sec)]++;
73} 73}
diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h
index 13323ea08ffa..4ea005a7f9da 100644
--- a/kernel/time/timekeeping_internal.h
+++ b/kernel/time/timekeeping_internal.h
@@ -3,12 +3,27 @@
3/* 3/*
4 * timekeeping debug functions 4 * timekeeping debug functions
5 */ 5 */
6#include <linux/clocksource.h>
6#include <linux/time.h> 7#include <linux/time.h>
7 8
8#ifdef CONFIG_DEBUG_FS 9#ifdef CONFIG_DEBUG_FS
9extern void tk_debug_account_sleep_time(struct timespec *t); 10extern void tk_debug_account_sleep_time(struct timespec64 *t);
10#else 11#else
11#define tk_debug_account_sleep_time(x) 12#define tk_debug_account_sleep_time(x)
12#endif 13#endif
13 14
15#ifdef CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE
16static inline cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask)
17{
18 cycle_t ret = (now - last) & mask;
19
20 return (s64) ret > 0 ? ret : 0;
21}
22#else
23static inline cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask)
24{
25 return (now - last) & mask;
26}
27#endif
28
14#endif /* _TIMEKEEPING_INTERNAL_H */ 29#endif /* _TIMEKEEPING_INTERNAL_H */
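The new clocksource_delta() helper masks the raw counter difference and, when CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE is set, clamps an apparently backward-moving counter to zero instead of letting the subtraction wrap into a huge forward jump. A stand-alone user-space sketch of the same arithmetic, with cycle_t replaced by uint64_t and a full 64-bit mask (as a TSC-like clocksource would use):

#include <stdint.h>
#include <stdio.h>

/* Same arithmetic as the validating clocksource_delta() above. */
static uint64_t demo_delta(uint64_t now, uint64_t last, uint64_t mask)
{
	uint64_t ret = (now - last) & mask;

	/* With a full-width mask a backward step sets the top bit,
	 * so treat it as "no time elapsed" rather than ~2^64 cycles. */
	return (int64_t)ret > 0 ? ret : 0;
}

int main(void)
{
	uint64_t mask = ~0ULL;	/* 64-bit counter, e.g. TSC */

	printf("%llu\n", (unsigned long long)demo_delta(1010, 1000, mask));	/* 10 */
	printf("%llu\n", (unsigned long long)demo_delta(1000, 1010, mask));	/* 0  */
	return 0;
}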
diff --git a/kernel/timer.c b/kernel/time/timer.c
index 3bb01a323b2a..aca5dfe2fa3d 100644
--- a/kernel/timer.c
+++ b/kernel/time/timer.c
@@ -82,6 +82,7 @@ struct tvec_base {
82 unsigned long next_timer; 82 unsigned long next_timer;
83 unsigned long active_timers; 83 unsigned long active_timers;
84 unsigned long all_timers; 84 unsigned long all_timers;
85 int cpu;
85 struct tvec_root tv1; 86 struct tvec_root tv1;
86 struct tvec tv2; 87 struct tvec tv2;
87 struct tvec tv3; 88 struct tvec tv3;
@@ -409,6 +410,22 @@ static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
409 base->next_timer = timer->expires; 410 base->next_timer = timer->expires;
410 } 411 }
411 base->all_timers++; 412 base->all_timers++;
413
414 /*
415 * Check whether the other CPU is in dynticks mode and needs
416 * to be triggered to reevaluate the timer wheel.
417 * We are protected against the other CPU fiddling
418 * with the timer by holding the timer base lock. This also
419 * makes sure that a CPU on the way to stop its tick can not
420 * evaluate the timer wheel.
421 *
422 * Spare the IPI for deferrable timers on idle targets though.
423 * The next busy ticks will take care of it. Except full dynticks
424 * require special care against races with idle_cpu(), lets deal
425 * with that later.
426 */
427 if (!tbase_get_deferrable(base) || tick_nohz_full_cpu(base->cpu))
428 wake_up_nohz_cpu(base->cpu);
412} 429}
413 430
414#ifdef CONFIG_TIMER_STATS 431#ifdef CONFIG_TIMER_STATS
@@ -948,22 +965,6 @@ void add_timer_on(struct timer_list *timer, int cpu)
948 timer_set_base(timer, base); 965 timer_set_base(timer, base);
949 debug_activate(timer, timer->expires); 966 debug_activate(timer, timer->expires);
950 internal_add_timer(base, timer); 967 internal_add_timer(base, timer);
951 /*
952 * Check whether the other CPU is in dynticks mode and needs
953 * to be triggered to reevaluate the timer wheel.
954 * We are protected against the other CPU fiddling
955 * with the timer by holding the timer base lock. This also
956 * makes sure that a CPU on the way to stop its tick can not
957 * evaluate the timer wheel.
958 *
959 * Spare the IPI for deferrable timers on idle targets though.
960 * The next busy ticks will take care of it. Except full dynticks
961 * require special care against races with idle_cpu(), lets deal
962 * with that later.
963 */
964 if (!tbase_get_deferrable(timer->base) || tick_nohz_full_cpu(cpu))
965 wake_up_nohz_cpu(cpu);
966
967 spin_unlock_irqrestore(&base->lock, flags); 968 spin_unlock_irqrestore(&base->lock, flags);
968} 969}
969EXPORT_SYMBOL_GPL(add_timer_on); 970EXPORT_SYMBOL_GPL(add_timer_on);
@@ -1568,6 +1569,7 @@ static int init_timers_cpu(int cpu)
1568 } 1569 }
1569 spin_lock_init(&base->lock); 1570 spin_lock_init(&base->lock);
1570 tvec_base_done[cpu] = 1; 1571 tvec_base_done[cpu] = 1;
1572 base->cpu = cpu;
1571 } else { 1573 } else {
1572 base = per_cpu(tvec_bases, cpu); 1574 base = per_cpu(tvec_bases, cpu);
1573 } 1575 }
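With base->cpu recorded at init time, the dynticks wake-up moves from add_timer_on() into internal_add_timer(), so every enqueue path, not only add_timer_on(), can kick an idle nohz target. A throwaway test module in the spirit of the change is sketched below; it assumes CPU 1 exists and is online, and uses the 3.x-era timer API from this file.

#include <linux/module.h>
#include <linux/timer.h>
#include <linux/smp.h>

static struct timer_list demo_timer;

static void demo_timer_fn(unsigned long data)
{
	pr_info("demo timer fired on CPU %d\n", smp_processor_id());
}

static int __init demo_init(void)
{
	setup_timer(&demo_timer, demo_timer_fn, 0);
	demo_timer.expires = jiffies + HZ;
	/* If CPU 1 sits idle with its tick stopped, the wake-up IPI is
	 * now issued from internal_add_timer() common code. */
	add_timer_on(&demo_timer, 1);
	return 0;
}

static void __exit demo_exit(void)
{
	del_timer_sync(&demo_timer);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");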
diff --git a/kernel/time/udelay_test.c b/kernel/time/udelay_test.c
new file mode 100644
index 000000000000..e622ba365a13
--- /dev/null
+++ b/kernel/time/udelay_test.c
@@ -0,0 +1,168 @@
1/*
2 * udelay() test kernel module
3 *
4 * Test is executed by writing and reading to /sys/kernel/debug/udelay_test
5 * Tests are configured by writing: USECS ITERATIONS
6 * Tests are executed by reading from the same file.
7 * Specifying usecs of 0 or negative values will run multiple tests.
8 *
9 * Copyright (C) 2014 Google, Inc.
10 *
11 * This software is licensed under the terms of the GNU General Public
12 * License version 2, as published by the Free Software Foundation, and
13 * may be copied, distributed, and modified under those terms.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 */
20
21#include <linux/debugfs.h>
22#include <linux/delay.h>
23#include <linux/ktime.h>
24#include <linux/module.h>
25#include <linux/uaccess.h>
26
27#define DEFAULT_ITERATIONS 100
28
29#define DEBUGFS_FILENAME "udelay_test"
30
31static DEFINE_MUTEX(udelay_test_lock);
32static struct dentry *udelay_test_debugfs_file;
33static int udelay_test_usecs;
34static int udelay_test_iterations = DEFAULT_ITERATIONS;
35
36static int udelay_test_single(struct seq_file *s, int usecs, uint32_t iters)
37{
38 int min = 0, max = 0, fail_count = 0;
39 uint64_t sum = 0;
40 uint64_t avg;
41 int i;
42 /* Allow udelay to be up to 0.5% fast */
43 int allowed_error_ns = usecs * 5;
44
45 for (i = 0; i < iters; ++i) {
46 struct timespec ts1, ts2;
47 int time_passed;
48
49 ktime_get_ts(&ts1);
50 udelay(usecs);
51 ktime_get_ts(&ts2);
52 time_passed = timespec_to_ns(&ts2) - timespec_to_ns(&ts1);
53
54 if (i == 0 || time_passed < min)
55 min = time_passed;
56 if (i == 0 || time_passed > max)
57 max = time_passed;
58 if ((time_passed + allowed_error_ns) / 1000 < usecs)
59 ++fail_count;
60 WARN_ON(time_passed < 0);
61 sum += time_passed;
62 }
63
64 avg = sum;
65 do_div(avg, iters);
66 seq_printf(s, "%d usecs x %d: exp=%d allowed=%d min=%d avg=%lld max=%d",
67 usecs, iters, usecs * 1000,
68 (usecs * 1000) - allowed_error_ns, min, avg, max);
69 if (fail_count)
70 seq_printf(s, " FAIL=%d", fail_count);
71 seq_puts(s, "\n");
72
73 return 0;
74}
75
76static int udelay_test_show(struct seq_file *s, void *v)
77{
78 int usecs;
79 int iters;
80 int ret = 0;
81
82 mutex_lock(&udelay_test_lock);
83 usecs = udelay_test_usecs;
84 iters = udelay_test_iterations;
85 mutex_unlock(&udelay_test_lock);
86
87 if (usecs > 0 && iters > 0) {
88 return udelay_test_single(s, usecs, iters);
89 } else if (usecs == 0) {
90 struct timespec ts;
91
92 ktime_get_ts(&ts);
93 seq_printf(s, "udelay() test (lpj=%ld kt=%ld.%09ld)\n",
94 loops_per_jiffy, ts.tv_sec, ts.tv_nsec);
95 seq_puts(s, "usage:\n");
96 seq_puts(s, "echo USECS [ITERS] > " DEBUGFS_FILENAME "\n");
97 seq_puts(s, "cat " DEBUGFS_FILENAME "\n");
98 }
99
100 return ret;
101}
102
103static int udelay_test_open(struct inode *inode, struct file *file)
104{
105 return single_open(file, udelay_test_show, inode->i_private);
106}
107
108static ssize_t udelay_test_write(struct file *file, const char __user *buf,
109 size_t count, loff_t *pos)
110{
111 char lbuf[32];
112 int ret;
113 int usecs;
114 int iters;
115
116 if (count >= sizeof(lbuf))
117 return -EINVAL;
118
119 if (copy_from_user(lbuf, buf, count))
120 return -EFAULT;
121 lbuf[count] = '\0';
122
123 ret = sscanf(lbuf, "%d %d", &usecs, &iters);
124 if (ret < 1)
125 return -EINVAL;
126 else if (ret < 2)
127 iters = DEFAULT_ITERATIONS;
128
129 mutex_lock(&udelay_test_lock);
130 udelay_test_usecs = usecs;
131 udelay_test_iterations = iters;
132 mutex_unlock(&udelay_test_lock);
133
134 return count;
135}
136
137static const struct file_operations udelay_test_debugfs_ops = {
138 .owner = THIS_MODULE,
139 .open = udelay_test_open,
140 .read = seq_read,
141 .write = udelay_test_write,
142 .llseek = seq_lseek,
143 .release = single_release,
144};
145
146static int __init udelay_test_init(void)
147{
148 mutex_lock(&udelay_test_lock);
149 udelay_test_debugfs_file = debugfs_create_file(DEBUGFS_FILENAME,
150 S_IRUSR, NULL, NULL, &udelay_test_debugfs_ops);
151 mutex_unlock(&udelay_test_lock);
152
153 return 0;
154}
155
156module_init(udelay_test_init);
157
158static void __exit udelay_test_exit(void)
159{
160 mutex_lock(&udelay_test_lock);
161 debugfs_remove(udelay_test_debugfs_file);
162 mutex_unlock(&udelay_test_lock);
163}
164
165module_exit(udelay_test_exit);
166
167MODULE_AUTHOR("David Riley <davidriley@chromium.org>");
168MODULE_LICENSE("GPL");
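The interface documented in the header comment is plain debugfs I/O: write "USECS ITERATIONS", then read the same file to run the test. A small user-space driver might look like the sketch below; it assumes the module is loaded (or built in), debugfs is mounted at /sys/kernel/debug, and it is run as root.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/sys/kernel/debug/udelay_test";
	char result[4096];
	ssize_t n;
	int fd;

	fd = open(path, O_WRONLY);
	if (fd < 0) { perror("open for write"); return 1; }
	if (write(fd, "100 1000", 8) < 0) {	/* 100 us delays, 1000 iterations */
		perror("write");
		return 1;
	}
	close(fd);

	fd = open(path, O_RDONLY);
	if (fd < 0) { perror("open for read"); return 1; }
	n = read(fd, result, sizeof(result) - 1);
	if (n < 0) { perror("read"); return 1; }
	result[n] = '\0';
	fputs(result, stdout);	/* "100 usecs x 1000: exp=... min=... avg=... max=..." */
	close(fd);
	return 0;
}

Writing a usecs value of 0 and then reading prints the usage text instead of running a test, per udelay_test_show() above.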
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 925f629658d6..afb04b9b818a 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1968,7 +1968,7 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
1968 1968
1969/** 1969/**
1970 * rb_update_event - update event type and data 1970 * rb_update_event - update event type and data
1971 * @event: the even to update 1971 * @event: the event to update
1972 * @type: the type of event 1972 * @type: the type of event
1973 * @length: the size of the event field in the ring buffer 1973 * @length: the size of the event field in the ring buffer
1974 * 1974 *
@@ -3341,21 +3341,16 @@ static void rb_iter_reset(struct ring_buffer_iter *iter)
3341 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; 3341 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
3342 3342
3343 /* Iterator usage is expected to have record disabled */ 3343 /* Iterator usage is expected to have record disabled */
3344 if (list_empty(&cpu_buffer->reader_page->list)) { 3344 iter->head_page = cpu_buffer->reader_page;
3345 iter->head_page = rb_set_head_page(cpu_buffer); 3345 iter->head = cpu_buffer->reader_page->read;
3346 if (unlikely(!iter->head_page)) 3346
3347 return; 3347 iter->cache_reader_page = iter->head_page;
3348 iter->head = iter->head_page->read; 3348 iter->cache_read = iter->head;
3349 } else { 3349
3350 iter->head_page = cpu_buffer->reader_page;
3351 iter->head = cpu_buffer->reader_page->read;
3352 }
3353 if (iter->head) 3350 if (iter->head)
3354 iter->read_stamp = cpu_buffer->read_stamp; 3351 iter->read_stamp = cpu_buffer->read_stamp;
3355 else 3352 else
3356 iter->read_stamp = iter->head_page->page->time_stamp; 3353 iter->read_stamp = iter->head_page->page->time_stamp;
3357 iter->cache_reader_page = cpu_buffer->reader_page;
3358 iter->cache_read = cpu_buffer->read;
3359} 3354}
3360 3355
3361/** 3356/**
@@ -3748,12 +3743,14 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
3748 return NULL; 3743 return NULL;
3749 3744
3750 /* 3745 /*
3751 * We repeat when a time extend is encountered. 3746 * We repeat when a time extend is encountered or we hit
3752 * Since the time extend is always attached to a data event, 3747 * the end of the page. Since the time extend is always attached
3753 * we should never loop more than once. 3748 * to a data event, we should never loop more than three times.
3754 * (We never hit the following condition more than twice). 3749 * Once for going to next page, once on time extend, and
3750 * finally once to get the event.
3751 * (We never hit the following condition more than thrice).
3755 */ 3752 */
3756 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2)) 3753 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 3))
3757 return NULL; 3754 return NULL;
3758 3755
3759 if (rb_per_cpu_empty(cpu_buffer)) 3756 if (rb_per_cpu_empty(cpu_buffer))
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8bb80fe08767..8a528392b1f4 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -820,11 +820,12 @@ static struct {
820 const char *name; 820 const char *name;
821 int in_ns; /* is this clock in nanoseconds? */ 821 int in_ns; /* is this clock in nanoseconds? */
822} trace_clocks[] = { 822} trace_clocks[] = {
823 { trace_clock_local, "local", 1 }, 823 { trace_clock_local, "local", 1 },
824 { trace_clock_global, "global", 1 }, 824 { trace_clock_global, "global", 1 },
825 { trace_clock_counter, "counter", 0 }, 825 { trace_clock_counter, "counter", 0 },
826 { trace_clock_jiffies, "uptime", 0 }, 826 { trace_clock_jiffies, "uptime", 0 },
827 { trace_clock, "perf", 1 }, 827 { trace_clock, "perf", 1 },
828 { ktime_get_mono_fast_ns, "mono", 1 },
828 ARCH_TRACE_CLOCKS 829 ARCH_TRACE_CLOCKS
829}; 830};
830 831
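The new table entry exposes ktime_get_mono_fast_ns() to the tracing core as a clock named "mono", selectable like any other trace_clocks[] entry through the trace_clock file. A minimal user-space selector, assuming debugfs is mounted at /sys/kernel/debug and the program is run as root:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/sys/kernel/debug/tracing/trace_clock", O_WRONLY);

	if (fd < 0) {
		perror("open trace_clock");
		return 1;
	}
	if (write(fd, "mono", 4) < 0) {		/* select the new clock */
		perror("write");
		close(fd);
		return 1;
	}
	close(fd);
	return 0;
}

Reading trace_clock back lists the available clocks with the active one in brackets.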
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index a1dd9a1b1327..975cb49e32bf 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -31,20 +31,19 @@ void bacct_add_tsk(struct user_namespace *user_ns,
31 struct taskstats *stats, struct task_struct *tsk) 31 struct taskstats *stats, struct task_struct *tsk)
32{ 32{
33 const struct cred *tcred; 33 const struct cred *tcred;
34 struct timespec uptime, ts;
35 cputime_t utime, stime, utimescaled, stimescaled; 34 cputime_t utime, stime, utimescaled, stimescaled;
36 u64 ac_etime; 35 u64 delta;
37 36
38 BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN); 37 BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN);
39 38
40 /* calculate task elapsed time in timespec */ 39 /* calculate task elapsed time in nsec */
41 do_posix_clock_monotonic_gettime(&uptime); 40 delta = ktime_get_ns() - tsk->start_time;
42 ts = timespec_sub(uptime, tsk->start_time); 41 /* Convert to micro seconds */
43 /* rebase elapsed time to usec (should never be negative) */ 42 do_div(delta, NSEC_PER_USEC);
44 ac_etime = timespec_to_ns(&ts); 43 stats->ac_etime = delta;
45 do_div(ac_etime, NSEC_PER_USEC); 44 /* Convert to seconds for btime */
46 stats->ac_etime = ac_etime; 45 do_div(delta, USEC_PER_SEC);
47 stats->ac_btime = get_seconds() - ts.tv_sec; 46 stats->ac_btime = get_seconds() - delta;
48 if (thread_group_leader(tsk)) { 47 if (thread_group_leader(tsk)) {
49 stats->ac_exitcode = tsk->exit_code; 48 stats->ac_exitcode = tsk->exit_code;
50 if (tsk->flags & PF_FORKNOEXEC) 49 if (tsk->flags & PF_FORKNOEXEC)
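The rewritten bacct_add_tsk() hunk derives everything from a single nanosecond delta: ktime_get_ns() minus tsk->start_time, scaled once to microseconds for ac_etime and once more to seconds for the ac_btime offset. The user-space sketch below redoes that arithmetic with made-up input values, using plain 64-bit division where the kernel uses do_div():

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t now_ns   = 7200123456789ULL;	/* hypothetical ktime_get_ns() */
	uint64_t start_ns = 5400000000000ULL;	/* hypothetical tsk->start_time */
	uint64_t delta = now_ns - start_ns;

	delta /= 1000;		/* NSEC_PER_USEC: do_div(delta, NSEC_PER_USEC) */
	printf("ac_etime = %llu us\n", (unsigned long long)delta);

	delta /= 1000000;	/* USEC_PER_SEC: do_div(delta, USEC_PER_SEC) */
	printf("elapsed  = %llu s (subtracted from get_seconds() for ac_btime)\n",
	       (unsigned long long)delta);
	return 0;
}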
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index fcc02560fd6b..aa312b0dc3ec 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -526,21 +526,21 @@ static void m_stop(struct seq_file *seq, void *v)
526 return; 526 return;
527} 527}
528 528
529struct seq_operations proc_uid_seq_operations = { 529const struct seq_operations proc_uid_seq_operations = {
530 .start = uid_m_start, 530 .start = uid_m_start,
531 .stop = m_stop, 531 .stop = m_stop,
532 .next = m_next, 532 .next = m_next,
533 .show = uid_m_show, 533 .show = uid_m_show,
534}; 534};
535 535
536struct seq_operations proc_gid_seq_operations = { 536const struct seq_operations proc_gid_seq_operations = {
537 .start = gid_m_start, 537 .start = gid_m_start,
538 .stop = m_stop, 538 .stop = m_stop,
539 .next = m_next, 539 .next = m_next,
540 .show = gid_m_show, 540 .show = gid_m_show,
541}; 541};
542 542
543struct seq_operations proc_projid_seq_operations = { 543const struct seq_operations proc_projid_seq_operations = {
544 .start = projid_m_start, 544 .start = projid_m_start,
545 .stop = m_stop, 545 .stop = m_stop,
546 .next = m_next, 546 .next = m_next,
diff --git a/kernel/utsname.c b/kernel/utsname.c
index fd393124e507..883aaaa7de8a 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -93,13 +93,13 @@ static void *utsns_get(struct task_struct *task)
93 struct uts_namespace *ns = NULL; 93 struct uts_namespace *ns = NULL;
94 struct nsproxy *nsproxy; 94 struct nsproxy *nsproxy;
95 95
96 rcu_read_lock(); 96 task_lock(task);
97 nsproxy = task_nsproxy(task); 97 nsproxy = task->nsproxy;
98 if (nsproxy) { 98 if (nsproxy) {
99 ns = nsproxy->uts_ns; 99 ns = nsproxy->uts_ns;
100 get_uts_ns(ns); 100 get_uts_ns(ns);
101 } 101 }
102 rcu_read_unlock(); 102 task_unlock(task);
103 103
104 return ns; 104 return ns;
105} 105}
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index c3319bd1b040..a8d6914030fe 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -260,9 +260,11 @@ static void watchdog_overflow_callback(struct perf_event *event,
260 return; 260 return;
261 261
262 if (hardlockup_panic) 262 if (hardlockup_panic)
263 panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu); 263 panic("Watchdog detected hard LOCKUP on cpu %d",
264 this_cpu);
264 else 265 else
265 WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu); 266 WARN(1, "Watchdog detected hard LOCKUP on cpu %d",
267 this_cpu);
266 268
267 __this_cpu_write(hard_watchdog_warn, true); 269 __this_cpu_write(hard_watchdog_warn, true);
268 return; 270 return;
@@ -345,7 +347,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
345 } 347 }
346 } 348 }
347 349
348 printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", 350 pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
349 smp_processor_id(), duration, 351 smp_processor_id(), duration,
350 current->comm, task_pid_nr(current)); 352 current->comm, task_pid_nr(current));
351 print_modules(); 353 print_modules();
@@ -366,6 +368,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
366 smp_mb__after_atomic(); 368 smp_mb__after_atomic();
367 } 369 }
368 370
371 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
369 if (softlockup_panic) 372 if (softlockup_panic)
370 panic("softlockup: hung tasks"); 373 panic("softlockup: hung tasks");
371 __this_cpu_write(soft_watchdog_warn, true); 374 __this_cpu_write(soft_watchdog_warn, true);
@@ -484,7 +487,7 @@ static int watchdog_nmi_enable(unsigned int cpu)
484 if (PTR_ERR(event) == -EOPNOTSUPP) 487 if (PTR_ERR(event) == -EOPNOTSUPP)
485 pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu); 488 pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu);
486 else if (PTR_ERR(event) == -ENOENT) 489 else if (PTR_ERR(event) == -ENOENT)
487 pr_warning("disabled (cpu%i): hardware events not enabled\n", 490 pr_warn("disabled (cpu%i): hardware events not enabled\n",
488 cpu); 491 cpu);
489 else 492 else
490 pr_err("disabled (cpu%i): unable to create perf event: %ld\n", 493 pr_err("disabled (cpu%i): unable to create perf event: %ld\n",