author    Thomas Gleixner <tglx@linutronix.de>  2009-07-28 18:00:16 -0400
committer Thomas Gleixner <tglx@linutronix.de>  2009-07-28 18:00:16 -0400
commit    ba36d1d9dd11b98a0bdee1d15ef2a11148905805 (patch)
tree      7749d3ba1d71aaa62a8dab72cca8820e27af7069 /mm
parent    55f9e9a3b3a3229f0ee73c1c2f990785bbf2ff88 (diff)
parent    104f75cb1a751a023beddacf56ca6c19ed90ce6c (diff)
Merge branch 'rt/mm' into rt/base
Conflicts:
	include/linux/percpu.h

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Diffstat (limited to 'mm')
-rw-r--r--  mm/memcontrol.c    7
-rw-r--r--  mm/memory.c        7
-rw-r--r--  mm/page_alloc.c  205
-rw-r--r--  mm/quicklist.c    15
-rw-r--r--  mm/slab.c        581
-rw-r--r--  mm/swap.c        107
-rw-r--r--  mm/vmscan.c       10
-rw-r--r--  mm/vmstat.c       23
8 files changed, 718 insertions, 237 deletions
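
The diffs below lean heavily on the -rt per-CPU "locked" accessors (DEFINE_PER_CPU_LOCKED, get_cpu_var_locked(), put_cpu_var_locked(), __get_cpu_lock()): instead of serializing per-CPU state by disabling interrupts or preemption, each CPU's instance carries its own spinlock, so the critical sections stay preemptible on PREEMPT_RT and a remote CPU's instance can be reached simply by taking its lock. The user-space model below is only a sketch of that pattern under that assumption; it is not the kernel implementation, and all names in it (percpu_locked, get_var_locked, put_var_locked) are invented for illustration.

/*
 * User-space model of the "per-CPU data protected by a per-CPU lock"
 * pattern used throughout this patch. Illustrative only; none of these
 * names exist in the kernel.
 */
#include <pthread.h>
#include <stdio.h>

#define NR_CPUS 4

struct percpu_locked {
	pthread_spinlock_t lock;	/* stands in for __get_cpu_lock(var, cpu) */
	long value;			/* stands in for the per-CPU variable */
};

static struct percpu_locked pcp[NR_CPUS];

/* get_cpu_var_locked()-style accessor: lock the data, not the CPU. */
static long *get_var_locked(int cpu)
{
	pthread_spin_lock(&pcp[cpu].lock);
	return &pcp[cpu].value;
}

/* put_cpu_var_locked()-style release. */
static void put_var_locked(int cpu)
{
	pthread_spin_unlock(&pcp[cpu].lock);
}

int main(void)
{
	int cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		pthread_spin_init(&pcp[cpu].lock, PTHREAD_PROCESS_PRIVATE);

	/* Any CPU's instance, including a remote one, is reached via its lock. */
	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		long *v = get_var_locked(cpu);
		*v += cpu;
		put_var_locked(cpu);
	}

	printf("cpu0 value: %ld\n", pcp[0].value);
	return 0;
}

The trade-off, as the slab.c comment further down puts it, is a possible remote memory access if the task migrates, in exchange for keeping the code preemptible.
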
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e717964cb5a0..e5159e2ff807 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -948,13 +948,14 @@ void mem_cgroup_update_mapped_file_stat(struct page *page, int val)
948 goto done; 948 goto done;
949 949
950 /* 950 /*
951 * Preemption is already disabled, we don't need get_cpu() 951 * Preemption is already disabled, we don't need get_cpu(), but
952 * that's not true for RT :)
952 */ 953 */
953 cpu = smp_processor_id(); 954 cpu = get_cpu();
954 stat = &mem->stat; 955 stat = &mem->stat;
955 cpustat = &stat->cpustat[cpu]; 956 cpustat = &stat->cpustat[cpu];
956
957 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, val); 957 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, val);
958 put_cpu();
958done: 959done:
959 unlock_page_cgroup(pc); 960 unlock_page_cgroup(pc);
960} 961}
diff --git a/mm/memory.c b/mm/memory.c
index 2d2fc7a3db52..f5579956fa4c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -922,10 +922,13 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
922 return addr; 922 return addr;
923} 923}
924 924
925#ifdef CONFIG_PREEMPT 925#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_RT)
926# define ZAP_BLOCK_SIZE (8 * PAGE_SIZE) 926# define ZAP_BLOCK_SIZE (8 * PAGE_SIZE)
927#else 927#else
928/* No preempt: go for improved straight-line efficiency */ 928/*
929 * No preempt: go for improved straight-line efficiency
930 * on PREEMPT_RT this is not a critical latency-path.
931 */
929# define ZAP_BLOCK_SIZE (1024 * PAGE_SIZE) 932# define ZAP_BLOCK_SIZE (1024 * PAGE_SIZE)
930#endif 933#endif
931 934
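
The hunk above only changes which configurations get the small block: with PREEMPT_RT the zap path is preemptible anyway, so the large, more efficient block size is acceptable. The point of ZAP_BLOCK_SIZE in general is to bound how much teardown work happens between rescheduling opportunities. The self-contained sketch below shows that chunking shape only; the block size, the names, and the sched_yield() stand-in for cond_resched() are illustrative, not taken from mm/memory.c.

#include <sched.h>
#include <stddef.h>
#include <stdio.h>

#define BLOCK_SIZE 8	/* stands in for a ZAP_BLOCK_SIZE worth of pages */

/*
 * Process [start, end) in bounded blocks, yielding between blocks so the
 * worst-case latency is set by BLOCK_SIZE, not by the size of the range.
 */
static void process_range(long *data, size_t start, size_t end)
{
	while (start < end) {
		size_t stop = start + BLOCK_SIZE;
		size_t i;

		if (stop > end)
			stop = end;
		for (i = start; i < stop; i++)
			data[i] = 0;		/* the actual "zap" work */
		start = stop;
		sched_yield();			/* stands in for cond_resched() */
	}
}

int main(void)
{
	long data[100];
	size_t i;

	for (i = 0; i < 100; i++)
		data[i] = (long)i;
	process_range(data, 0, 100);
	printf("data[99] = %ld\n", data[99]);
	return 0;
}
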
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index caa92689aac9..910b62810a1e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -161,6 +161,53 @@ static unsigned long __meminitdata dma_reserve;
161 EXPORT_SYMBOL(movable_zone); 161 EXPORT_SYMBOL(movable_zone);
162#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ 162#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
163 163
164#ifdef CONFIG_PREEMPT_RT
165static DEFINE_PER_CPU_LOCKED(int, pcp_locks);
166#endif
167
168static inline void __lock_cpu_pcp(unsigned long *flags, int cpu)
169{
170#ifdef CONFIG_PREEMPT_RT
171 spin_lock(&__get_cpu_lock(pcp_locks, cpu));
172 flags = 0;
173#else
174 local_irq_save(*flags);
175#endif
176}
177
178static inline void lock_cpu_pcp(unsigned long *flags, int *this_cpu)
179{
180#ifdef CONFIG_PREEMPT_RT
181 (void)get_cpu_var_locked(pcp_locks, this_cpu);
182 flags = 0;
183#else
184 local_irq_save(*flags);
185 *this_cpu = smp_processor_id();
186#endif
187}
188
189static inline void unlock_cpu_pcp(unsigned long flags, int this_cpu)
190{
191#ifdef CONFIG_PREEMPT_RT
192 put_cpu_var_locked(pcp_locks, this_cpu);
193#else
194 local_irq_restore(flags);
195#endif
196}
197
198static struct per_cpu_pageset *
199get_zone_pcp(struct zone *zone, unsigned long *flags, int *this_cpu)
200{
201 lock_cpu_pcp(flags, this_cpu);
202 return zone_pcp(zone, *this_cpu);
203}
204
205static void
206put_zone_pcp(struct zone *zone, unsigned long flags, int this_cpu)
207{
208 unlock_cpu_pcp(flags, this_cpu);
209}
210
164#if MAX_NUMNODES > 1 211#if MAX_NUMNODES > 1
165int nr_node_ids __read_mostly = MAX_NUMNODES; 212int nr_node_ids __read_mostly = MAX_NUMNODES;
166int nr_online_nodes __read_mostly = 1; 213int nr_online_nodes __read_mostly = 1;
@@ -523,7 +570,9 @@ static inline int free_pages_check(struct page *page)
523static void free_pages_bulk(struct zone *zone, int count, 570static void free_pages_bulk(struct zone *zone, int count,
524 struct list_head *list, int order) 571 struct list_head *list, int order)
525{ 572{
526 spin_lock(&zone->lock); 573 unsigned long flags;
574
575 spin_lock_irqsave(&zone->lock, flags);
527 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); 576 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
528 zone->pages_scanned = 0; 577 zone->pages_scanned = 0;
529 578
@@ -536,27 +585,31 @@ static void free_pages_bulk(struct zone *zone, int count,
536 /* have to delete it as __free_one_page list manipulates */ 585 /* have to delete it as __free_one_page list manipulates */
537 list_del(&page->lru); 586 list_del(&page->lru);
538 __free_one_page(page, zone, order, page_private(page)); 587 __free_one_page(page, zone, order, page_private(page));
588#ifdef CONFIG_PREEMPT_RT
589 cond_resched_lock(&zone->lock);
590#endif
539 } 591 }
540 spin_unlock(&zone->lock); 592 spin_unlock_irqrestore(&zone->lock, flags);
541} 593}
542 594
543static void free_one_page(struct zone *zone, struct page *page, int order, 595static void free_one_page(struct zone *zone, struct page *page, int order,
544 int migratetype) 596 int migratetype)
545{ 597{
546 spin_lock(&zone->lock); 598 unsigned long flags;
599
600 spin_lock_irqsave(&zone->lock, flags);
547 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); 601 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
548 zone->pages_scanned = 0; 602 zone->pages_scanned = 0;
549 603
550 __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); 604 __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
551 __free_one_page(page, zone, order, migratetype); 605 __free_one_page(page, zone, order, migratetype);
552 spin_unlock(&zone->lock); 606 spin_unlock_irqrestore(&zone->lock, flags);
553} 607}
554 608
555static void __free_pages_ok(struct page *page, unsigned int order) 609static void __free_pages_ok(struct page *page, unsigned int order)
556{ 610{
557 unsigned long flags; 611 unsigned long flags;
558 int i; 612 int i, this_cpu, bad = 0;
559 int bad = 0;
560 int wasMlocked = TestClearPageMlocked(page); 613 int wasMlocked = TestClearPageMlocked(page);
561 614
562 kmemcheck_free_shadow(page, order); 615 kmemcheck_free_shadow(page, order);
@@ -574,13 +627,13 @@ static void __free_pages_ok(struct page *page, unsigned int order)
574 arch_free_page(page, order); 627 arch_free_page(page, order);
575 kernel_map_pages(page, 1 << order, 0); 628 kernel_map_pages(page, 1 << order, 0);
576 629
577 local_irq_save(flags); 630 lock_cpu_pcp(&flags, &this_cpu);
578 if (unlikely(wasMlocked)) 631 if (unlikely(wasMlocked))
579 free_page_mlock(page); 632 free_page_mlock(page);
580 __count_vm_events(PGFREE, 1 << order); 633 count_vm_events(PGFREE, 1 << order);
634 unlock_cpu_pcp(flags, this_cpu);
581 free_one_page(page_zone(page), page, order, 635 free_one_page(page_zone(page), page, order,
582 get_pageblock_migratetype(page)); 636 get_pageblock_migratetype(page));
583 local_irq_restore(flags);
584} 637}
585 638
586/* 639/*
@@ -910,6 +963,16 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
910 return i; 963 return i;
911} 964}
912 965
966static void
967isolate_pcp_pages(int count, struct list_head *src, struct list_head *dst)
968{
969 while (count--) {
970 struct page *page = list_last_entry(src, struct page, lru);
971 list_move(&page->lru, dst);
972 }
973}
974
975
913#ifdef CONFIG_NUMA 976#ifdef CONFIG_NUMA
914/* 977/*
915 * Called from the vmstat counter updater to drain pagesets of this 978 * Called from the vmstat counter updater to drain pagesets of this
@@ -921,17 +984,20 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
921 */ 984 */
922void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) 985void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
923{ 986{
987 LIST_HEAD(free_list);
924 unsigned long flags; 988 unsigned long flags;
925 int to_drain; 989 int to_drain;
990 int this_cpu;
926 991
927 local_irq_save(flags); 992 lock_cpu_pcp(&flags, &this_cpu);
928 if (pcp->count >= pcp->batch) 993 if (pcp->count >= pcp->batch)
929 to_drain = pcp->batch; 994 to_drain = pcp->batch;
930 else 995 else
931 to_drain = pcp->count; 996 to_drain = pcp->count;
932 free_pages_bulk(zone, to_drain, &pcp->list, 0); 997 isolate_pcp_pages(to_drain, &pcp->list, &free_list);
933 pcp->count -= to_drain; 998 pcp->count -= to_drain;
934 local_irq_restore(flags); 999 unlock_cpu_pcp(flags, this_cpu);
1000 free_pages_bulk(zone, to_drain, &free_list, 0);
935} 1001}
936#endif 1002#endif
937 1003
@@ -950,14 +1016,22 @@ static void drain_pages(unsigned int cpu)
950 for_each_populated_zone(zone) { 1016 for_each_populated_zone(zone) {
951 struct per_cpu_pageset *pset; 1017 struct per_cpu_pageset *pset;
952 struct per_cpu_pages *pcp; 1018 struct per_cpu_pages *pcp;
1019 LIST_HEAD(free_list);
1020 int count;
953 1021
1022 __lock_cpu_pcp(&flags, cpu);
954 pset = zone_pcp(zone, cpu); 1023 pset = zone_pcp(zone, cpu);
955 1024 if (!pset) {
1025 unlock_cpu_pcp(flags, cpu);
1026 WARN_ON(1);
1027 continue;
1028 }
956 pcp = &pset->pcp; 1029 pcp = &pset->pcp;
957 local_irq_save(flags); 1030 isolate_pcp_pages(pcp->count, &pcp->list, &free_list);
958 free_pages_bulk(zone, pcp->count, &pcp->list, 0); 1031 count = pcp->count;
959 pcp->count = 0; 1032 pcp->count = 0;
960 local_irq_restore(flags); 1033 unlock_cpu_pcp(flags, cpu);
1034 free_pages_bulk(zone, count, &free_list, 0);
961 } 1035 }
962} 1036}
963 1037
@@ -969,12 +1043,52 @@ void drain_local_pages(void *arg)
969 drain_pages(smp_processor_id()); 1043 drain_pages(smp_processor_id());
970} 1044}
971 1045
1046#ifdef CONFIG_PREEMPT_RT
1047static void drain_local_pages_work(struct work_struct *wrk)
1048{
1049 drain_pages(smp_processor_id());
1050}
1051#endif
1052
972/* 1053/*
973 * Spill all the per-cpu pages from all CPUs back into the buddy allocator 1054 * Spill all the per-cpu pages from all CPUs back into the buddy allocator
974 */ 1055 */
975void drain_all_pages(void) 1056void drain_all_pages(void)
976{ 1057{
1058#ifdef CONFIG_PREEMPT_RT
1059 /*
1060 * HACK!!!!!
1061 * For RT we can't use IPIs to run drain_local_pages, since
1062 * that code will call spin_locks that will now sleep.
1063 * But, schedule_on_each_cpu will call kzalloc, which will
1064 * call page_alloc which was what calls this.
1065 *
1066 * Luckily, there's a condition to get here, and that is if
1067 * the order passed in to alloc_pages is greater than 0
1068 * (alloced more than a page size). The slabs only allocate
1069 * what is needed, and the allocation made by schedule_on_each_cpu
1070 * does an alloc of "sizeof(void *)*nr_cpu_ids".
1071 *
1072 * So we can safely call schedule_on_each_cpu if that number
1073 * is less than a page. Otherwise don't bother. At least warn of
1074 * this issue.
1075 *
1076 * And yes, this is one big hack. Please fix ;-)
1077 */
1078 if (sizeof(void *)*nr_cpu_ids < PAGE_SIZE)
1079 schedule_on_each_cpu(drain_local_pages_work);
1080 else {
1081 static int once;
1082 if (!once) {
1083 printk(KERN_ERR "Can't drain all CPUS due to possible recursion\n");
1084 once = 1;
1085 }
1086 drain_local_pages(NULL);
1087 }
1088
1089#else
977 on_each_cpu(drain_local_pages, NULL, 1); 1090 on_each_cpu(drain_local_pages, NULL, 1);
1091#endif
978} 1092}
979 1093
980#ifdef CONFIG_HIBERNATION 1094#ifdef CONFIG_HIBERNATION
@@ -1019,9 +1133,10 @@ void mark_free_pages(struct zone *zone)
1019static void free_hot_cold_page(struct page *page, int cold) 1133static void free_hot_cold_page(struct page *page, int cold)
1020{ 1134{
1021 struct zone *zone = page_zone(page); 1135 struct zone *zone = page_zone(page);
1136 struct per_cpu_pageset *pset;
1022 struct per_cpu_pages *pcp; 1137 struct per_cpu_pages *pcp;
1023 unsigned long flags; 1138 unsigned long flags;
1024 int wasMlocked = TestClearPageMlocked(page); 1139 int count, this_cpu, wasMlocked = TestClearPageMlocked(page);
1025 1140
1026 kmemcheck_free_shadow(page, 0); 1141 kmemcheck_free_shadow(page, 0);
1027 1142
@@ -1037,12 +1152,12 @@ static void free_hot_cold_page(struct page *page, int cold)
1037 arch_free_page(page, 0); 1152 arch_free_page(page, 0);
1038 kernel_map_pages(page, 1, 0); 1153 kernel_map_pages(page, 1, 0);
1039 1154
1040 pcp = &zone_pcp(zone, get_cpu())->pcp; 1155 pset = get_zone_pcp(zone, &flags, &this_cpu);
1156 pcp = &pset->pcp;
1041 set_page_private(page, get_pageblock_migratetype(page)); 1157 set_page_private(page, get_pageblock_migratetype(page));
1042 local_irq_save(flags);
1043 if (unlikely(wasMlocked)) 1158 if (unlikely(wasMlocked))
1044 free_page_mlock(page); 1159 free_page_mlock(page);
1045 __count_vm_event(PGFREE); 1160 count_vm_event(PGFREE);
1046 1161
1047 if (cold) 1162 if (cold)
1048 list_add_tail(&page->lru, &pcp->list); 1163 list_add_tail(&page->lru, &pcp->list);
@@ -1050,11 +1165,15 @@ static void free_hot_cold_page(struct page *page, int cold)
1050 list_add(&page->lru, &pcp->list); 1165 list_add(&page->lru, &pcp->list);
1051 pcp->count++; 1166 pcp->count++;
1052 if (pcp->count >= pcp->high) { 1167 if (pcp->count >= pcp->high) {
1053 free_pages_bulk(zone, pcp->batch, &pcp->list, 0); 1168 LIST_HEAD(free_list);
1169
1170 isolate_pcp_pages(pcp->batch, &pcp->list, &free_list);
1054 pcp->count -= pcp->batch; 1171 pcp->count -= pcp->batch;
1055 } 1172 count = pcp->batch;
1056 local_irq_restore(flags); 1173 put_zone_pcp(zone, flags, this_cpu);
1057 put_cpu(); 1174 free_pages_bulk(zone, count, &free_list, 0);
1175 } else
1176 put_zone_pcp(zone, flags, this_cpu);
1058} 1177}
1059 1178
1060void free_hot_page(struct page *page) 1179void free_hot_page(struct page *page)
@@ -1108,15 +1227,15 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
1108 unsigned long flags; 1227 unsigned long flags;
1109 struct page *page; 1228 struct page *page;
1110 int cold = !!(gfp_flags & __GFP_COLD); 1229 int cold = !!(gfp_flags & __GFP_COLD);
1111 int cpu; 1230 struct per_cpu_pageset *pset;
1231 int this_cpu;
1112 1232
1113again: 1233again:
1114 cpu = get_cpu(); 1234 pset = get_zone_pcp(zone, &flags, &this_cpu);
1235
1115 if (likely(order == 0)) { 1236 if (likely(order == 0)) {
1116 struct per_cpu_pages *pcp; 1237 struct per_cpu_pages *pcp = &pset->pcp;
1117 1238
1118 pcp = &zone_pcp(zone, cpu)->pcp;
1119 local_irq_save(flags);
1120 if (!pcp->count) { 1239 if (!pcp->count) {
1121 pcp->count = rmqueue_bulk(zone, 0, 1240 pcp->count = rmqueue_bulk(zone, 0,
1122 pcp->batch, &pcp->list, migratetype); 1241 pcp->batch, &pcp->list, migratetype);
@@ -1158,7 +1277,7 @@ again:
1158 */ 1277 */
1159 WARN_ON_ONCE(order > 1); 1278 WARN_ON_ONCE(order > 1);
1160 } 1279 }
1161 spin_lock_irqsave(&zone->lock, flags); 1280 spin_lock(&zone->lock);
1162 page = __rmqueue(zone, order, migratetype); 1281 page = __rmqueue(zone, order, migratetype);
1163 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order)); 1282 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
1164 spin_unlock(&zone->lock); 1283 spin_unlock(&zone->lock);
@@ -1168,8 +1287,7 @@ again:
1168 1287
1169 __count_zone_vm_events(PGALLOC, zone, 1 << order); 1288 __count_zone_vm_events(PGALLOC, zone, 1 << order);
1170 zone_statistics(preferred_zone, zone); 1289 zone_statistics(preferred_zone, zone);
1171 local_irq_restore(flags); 1290 put_zone_pcp(zone, flags, this_cpu);
1172 put_cpu();
1173 1291
1174 VM_BUG_ON(bad_range(zone, page)); 1292 VM_BUG_ON(bad_range(zone, page));
1175 if (prep_new_page(page, order, gfp_flags)) 1293 if (prep_new_page(page, order, gfp_flags))
@@ -1177,8 +1295,7 @@ again:
1177 return page; 1295 return page;
1178 1296
1179failed: 1297failed:
1180 local_irq_restore(flags); 1298 put_zone_pcp(zone, flags, this_cpu);
1181 put_cpu();
1182 return NULL; 1299 return NULL;
1183} 1300}
1184 1301
@@ -3036,7 +3153,23 @@ static inline void free_zone_pagesets(int cpu)
3036 struct zone *zone; 3153 struct zone *zone;
3037 3154
3038 for_each_zone(zone) { 3155 for_each_zone(zone) {
3039 struct per_cpu_pageset *pset = zone_pcp(zone, cpu); 3156 unsigned long flags;
3157 struct per_cpu_pageset *pset;
3158
3159 /*
3160 * On PREEMPT_RT the allocator is preemptible, therefore
3161 * kstopmachine can preempt a process in the middle of an
3162 * allocation, freeing the pset underneath such a process
3163 * isn't a good idea.
3164 *
3165 * Take the per-cpu pcp lock to allow the task to complete
3166 * before we free it. New tasks will be held off by the
3167 * cpu_online() check in get_cpu_var_locked().
3168 */
3169 __lock_cpu_pcp(&flags, cpu);
3170 pset = zone_pcp(zone, cpu);
3171 zone_pcp(zone, cpu) = NULL;
3172 unlock_cpu_pcp(flags, cpu);
3040 3173
3041 /* Free per_cpu_pageset if it is slab allocated */ 3174 /* Free per_cpu_pageset if it is slab allocated */
3042 if (pset != &boot_pageset[cpu]) 3175 if (pset != &boot_pageset[cpu])
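
A recurring shape in the page_alloc.c changes above: pages are first moved off the per-CPU list into a private list while the pcp lock is held (isolate_pcp_pages()), and only afterwards handed to free_pages_bulk(), which now takes the zone lock with irqsave itself. The pcp critical section stays short and the two locks never nest. The user-space sketch below models only that "isolate under lock A, then process under lock B" shape; the types, names, and printf output are invented for illustration.

#include <pthread.h>
#include <stdio.h>

struct node { struct node *next; int id; };

static pthread_mutex_t pcp_lock = PTHREAD_MUTEX_INITIALIZER;	/* per-CPU list lock */
static pthread_mutex_t zone_lock = PTHREAD_MUTEX_INITIALIZER;	/* buddy/zone lock   */
static struct node *pcp_list;					/* per-CPU free list */

/*
 * Move up to 'count' nodes from the pcp list onto a private list
 * (isolate_pcp_pages() analogue). Caller holds pcp_lock.
 */
static struct node *isolate(int count)
{
	struct node *head = NULL;

	while (count-- && pcp_list) {
		struct node *n = pcp_list;

		pcp_list = n->next;
		n->next = head;
		head = n;
	}
	return head;
}

/* free_pages_bulk() analogue: runs under the zone lock only. */
static void free_bulk(struct node *head)
{
	struct node *n;

	pthread_mutex_lock(&zone_lock);
	for (n = head; n; n = n->next)
		printf("freeing node %d\n", n->id);
	pthread_mutex_unlock(&zone_lock);
}

static void drain(int batch)
{
	struct node *batch_list;

	pthread_mutex_lock(&pcp_lock);
	batch_list = isolate(batch);
	pthread_mutex_unlock(&pcp_lock);	/* pcp lock dropped ...              */
	free_bulk(batch_list);			/* ... before the zone lock is taken */
}

int main(void)
{
	static struct node nodes[3] = { { NULL, 0 }, { &nodes[0], 1 }, { &nodes[1], 2 } };

	pcp_list = &nodes[2];
	drain(2);
	return 0;
}
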
diff --git a/mm/quicklist.c b/mm/quicklist.c
index e66d07d1b4ff..03341b014c2b 100644
--- a/mm/quicklist.c
+++ b/mm/quicklist.c
@@ -19,7 +19,7 @@
19#include <linux/module.h> 19#include <linux/module.h>
20#include <linux/quicklist.h> 20#include <linux/quicklist.h>
21 21
22DEFINE_PER_CPU(struct quicklist, quicklist)[CONFIG_NR_QUICK]; 22DEFINE_PER_CPU_LOCKED(struct quicklist, quicklist)[CONFIG_NR_QUICK];
23 23
24#define FRACTION_OF_NODE_MEM 16 24#define FRACTION_OF_NODE_MEM 16
25 25
@@ -66,17 +66,14 @@ void quicklist_trim(int nr, void (*dtor)(void *),
66{ 66{
67 long pages_to_free; 67 long pages_to_free;
68 struct quicklist *q; 68 struct quicklist *q;
69 int cpu;
69 70
70 q = &get_cpu_var(quicklist)[nr]; 71 q = &get_cpu_var_locked(quicklist, &cpu)[nr];
71 if (q->nr_pages > min_pages) { 72 if (q->nr_pages > min_pages) {
72 pages_to_free = min_pages_to_free(q, min_pages, max_free); 73 pages_to_free = min_pages_to_free(q, min_pages, max_free);
73 74
74 while (pages_to_free > 0) { 75 while (pages_to_free > 0) {
75 /* 76 void *p = __quicklist_alloc(q);
76 * We pass a gfp_t of 0 to quicklist_alloc here
77 * because we will never call into the page allocator.
78 */
79 void *p = quicklist_alloc(nr, 0, NULL);
80 77
81 if (dtor) 78 if (dtor)
82 dtor(p); 79 dtor(p);
@@ -84,7 +81,7 @@ void quicklist_trim(int nr, void (*dtor)(void *),
84 pages_to_free--; 81 pages_to_free--;
85 } 82 }
86 } 83 }
87 put_cpu_var(quicklist); 84 put_cpu_var_locked(quicklist, cpu);
88} 85}
89 86
90unsigned long quicklist_total_size(void) 87unsigned long quicklist_total_size(void)
@@ -94,7 +91,7 @@ unsigned long quicklist_total_size(void)
94 struct quicklist *ql, *q; 91 struct quicklist *ql, *q;
95 92
96 for_each_online_cpu(cpu) { 93 for_each_online_cpu(cpu) {
97 ql = per_cpu(quicklist, cpu); 94 ql = per_cpu_var_locked(quicklist, cpu);
98 for (q = ql; q < ql + CONFIG_NR_QUICK; q++) 95 for (q = ql; q < ql + CONFIG_NR_QUICK; q++)
99 count += q->nr_pages; 96 count += q->nr_pages;
100 } 97 }
diff --git a/mm/slab.c b/mm/slab.c
index 7b5d4deacfcd..a4bd9068c557 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -121,6 +121,138 @@
121#include <asm/page.h> 121#include <asm/page.h>
122 122
123/* 123/*
124 * On !PREEMPT_RT, raw irq flags are used as a per-CPU locking
125 * mechanism.
126 *
127 * On PREEMPT_RT, we use per-CPU locks for this. That's why the
128 * calling convention is changed slightly: a new 'flags' argument
129 * is passed to 'irq disable/enable' - the PREEMPT_RT code stores
130 * the CPU number of the lock there.
131 */
132#ifndef CONFIG_PREEMPT_RT
133
134# define slab_irq_disable(cpu) \
135 do { local_irq_disable(); (cpu) = smp_processor_id(); } while (0)
136# define slab_irq_enable(cpu) local_irq_enable()
137
138static inline void slab_irq_disable_this_rt(int cpu)
139{
140}
141
142static inline void slab_irq_enable_rt(int cpu)
143{
144}
145
146# define slab_irq_save(flags, cpu) \
147 do { local_irq_save(flags); (cpu) = smp_processor_id(); } while (0)
148# define slab_irq_restore(flags, cpu) local_irq_restore(flags)
149
150/*
151 * In the __GFP_WAIT case we enable/disable interrupts on !PREEMPT_RT,
152 * which has no per-CPU locking effect since we are holding the cache
153 * lock in that case already.
154 */
155static void slab_irq_enable_GFP_WAIT(gfp_t flags, int *cpu)
156{
157 if (flags & __GFP_WAIT)
158 local_irq_enable();
159}
160
161static void slab_irq_disable_GFP_WAIT(gfp_t flags, int *cpu)
162{
163 if (flags & __GFP_WAIT)
164 local_irq_disable();
165}
166
167# define slab_spin_lock_irq(lock, cpu) \
168 do { spin_lock_irq(lock); (cpu) = smp_processor_id(); } while (0)
169# define slab_spin_unlock_irq(lock, cpu) spin_unlock_irq(lock)
170
171# define slab_spin_lock_irqsave(lock, flags, cpu) \
172 do { spin_lock_irqsave(lock, flags); (cpu) = smp_processor_id(); } while (0)
173# define slab_spin_unlock_irqrestore(lock, flags, cpu) \
174 do { spin_unlock_irqrestore(lock, flags); } while (0)
175
176#else /* CONFIG_PREEMPT_RT */
177
178/*
179 * Instead of serializing the per-cpu state by disabling interrupts we do so
180 * by a lock. This keeps the code preemptable - albeit at the cost of remote
181 * memory access when the task does get migrated away.
182 */
183DEFINE_PER_CPU_LOCKED(struct list_head, slab) = { 0, };
184
185static void _slab_irq_disable(int *cpu)
186{
187 (void)get_cpu_var_locked(slab, cpu);
188}
189
190#define slab_irq_disable(cpu) _slab_irq_disable(&(cpu))
191
192static inline void slab_irq_enable(int cpu)
193{
194 LIST_HEAD(list);
195
196 list_splice_init(&__get_cpu_var_locked(slab, cpu), &list);
197 put_cpu_var_locked(slab, cpu);
198
199 while (!list_empty(&list)) {
200 struct page *page = list_first_entry(&list, struct page, lru);
201 list_del(&page->lru);
202 __free_pages(page, page->index);
203 }
204}
205
206static inline void slab_irq_disable_this_rt(int cpu)
207{
208 spin_lock(&__get_cpu_lock(slab, cpu));
209}
210
211static inline void slab_irq_enable_rt(int cpu)
212{
213 LIST_HEAD(list);
214
215 list_splice_init(&__get_cpu_var_locked(slab, cpu), &list);
216 spin_unlock(&__get_cpu_lock(slab, cpu));
217
218 while (!list_empty(&list)) {
219 struct page *page = list_first_entry(&list, struct page, lru);
220 list_del(&page->lru);
221 __free_pages(page, page->index);
222 }
223}
224
225# define slab_irq_save(flags, cpu) \
226 do { slab_irq_disable(cpu); (void) (flags); } while (0)
227# define slab_irq_restore(flags, cpu) \
228 do { slab_irq_enable(cpu); (void) (flags); } while (0)
229
230/*
231 * On PREEMPT_RT we have to drop the locks unconditionally to avoid lock
232 * recursion on the cache_grow()->alloc_slabmgmt() path.
233 */
234static void slab_irq_enable_GFP_WAIT(gfp_t flags, int *cpu)
235{
236 slab_irq_enable(*cpu);
237}
238
239static void slab_irq_disable_GFP_WAIT(gfp_t flags, int *cpu)
240{
241 slab_irq_disable(*cpu);
242}
243
244# define slab_spin_lock_irq(lock, cpu) \
245 do { slab_irq_disable(cpu); spin_lock(lock); } while (0)
246# define slab_spin_unlock_irq(lock, cpu) \
247 do { spin_unlock(lock); slab_irq_enable(cpu); } while (0)
248# define slab_spin_lock_irqsave(lock, flags, cpu) \
249 do { slab_irq_disable(cpu); spin_lock_irqsave(lock, flags); } while (0)
250# define slab_spin_unlock_irqrestore(lock, flags, cpu) \
251 do { spin_unlock_irqrestore(lock, flags); slab_irq_enable(cpu); } while (0)
252
253#endif /* CONFIG_PREEMPT_RT */
254
255/*
124 * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON. 256 * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON.
125 * 0 for faster, smaller code (especially in the critical paths). 257 * 0 for faster, smaller code (especially in the critical paths).
126 * 258 *
@@ -316,7 +448,7 @@ struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
316static int drain_freelist(struct kmem_cache *cache, 448static int drain_freelist(struct kmem_cache *cache,
317 struct kmem_list3 *l3, int tofree); 449 struct kmem_list3 *l3, int tofree);
318static void free_block(struct kmem_cache *cachep, void **objpp, int len, 450static void free_block(struct kmem_cache *cachep, void **objpp, int len,
319 int node); 451 int node, int *this_cpu);
320static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp); 452static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp);
321static void cache_reap(struct work_struct *unused); 453static void cache_reap(struct work_struct *unused);
322 454
@@ -687,9 +819,10 @@ int slab_is_available(void)
687 819
688static DEFINE_PER_CPU(struct delayed_work, reap_work); 820static DEFINE_PER_CPU(struct delayed_work, reap_work);
689 821
690static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) 822static inline struct array_cache *
823cpu_cache_get(struct kmem_cache *cachep, int this_cpu)
691{ 824{
692 return cachep->array[smp_processor_id()]; 825 return cachep->array[this_cpu];
693} 826}
694 827
695static inline struct kmem_cache *__find_general_cachep(size_t size, 828static inline struct kmem_cache *__find_general_cachep(size_t size,
@@ -930,7 +1063,7 @@ static int transfer_objects(struct array_cache *to,
930#ifndef CONFIG_NUMA 1063#ifndef CONFIG_NUMA
931 1064
932#define drain_alien_cache(cachep, alien) do { } while (0) 1065#define drain_alien_cache(cachep, alien) do { } while (0)
933#define reap_alien(cachep, l3) do { } while (0) 1066#define reap_alien(cachep, l3, this_cpu) 0
934 1067
935static inline struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) 1068static inline struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
936{ 1069{
@@ -941,27 +1074,28 @@ static inline void free_alien_cache(struct array_cache **ac_ptr)
941{ 1074{
942} 1075}
943 1076
944static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) 1077static inline int
1078cache_free_alien(struct kmem_cache *cachep, void *objp, int *this_cpu)
945{ 1079{
946 return 0; 1080 return 0;
947} 1081}
948 1082
949static inline void *alternate_node_alloc(struct kmem_cache *cachep, 1083static inline void *alternate_node_alloc(struct kmem_cache *cachep,
950 gfp_t flags) 1084 gfp_t flags, int *this_cpu)
951{ 1085{
952 return NULL; 1086 return NULL;
953} 1087}
954 1088
955static inline void *____cache_alloc_node(struct kmem_cache *cachep, 1089static inline void *____cache_alloc_node(struct kmem_cache *cachep,
956 gfp_t flags, int nodeid) 1090 gfp_t flags, int nodeid, int *this_cpu)
957{ 1091{
958 return NULL; 1092 return NULL;
959} 1093}
960 1094
961#else /* CONFIG_NUMA */ 1095#else /* CONFIG_NUMA */
962 1096
963static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int); 1097static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int, int *);
964static void *alternate_node_alloc(struct kmem_cache *, gfp_t); 1098static void *alternate_node_alloc(struct kmem_cache *, gfp_t, int *);
965 1099
966static struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) 1100static struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
967{ 1101{
@@ -1002,7 +1136,8 @@ static void free_alien_cache(struct array_cache **ac_ptr)
1002} 1136}
1003 1137
1004static void __drain_alien_cache(struct kmem_cache *cachep, 1138static void __drain_alien_cache(struct kmem_cache *cachep,
1005 struct array_cache *ac, int node) 1139 struct array_cache *ac, int node,
1140 int *this_cpu)
1006{ 1141{
1007 struct kmem_list3 *rl3 = cachep->nodelists[node]; 1142 struct kmem_list3 *rl3 = cachep->nodelists[node];
1008 1143
@@ -1016,7 +1151,7 @@ static void __drain_alien_cache(struct kmem_cache *cachep,
1016 if (rl3->shared) 1151 if (rl3->shared)
1017 transfer_objects(rl3->shared, ac, ac->limit); 1152 transfer_objects(rl3->shared, ac, ac->limit);
1018 1153
1019 free_block(cachep, ac->entry, ac->avail, node); 1154 free_block(cachep, ac->entry, ac->avail, node, this_cpu);
1020 ac->avail = 0; 1155 ac->avail = 0;
1021 spin_unlock(&rl3->list_lock); 1156 spin_unlock(&rl3->list_lock);
1022 } 1157 }
@@ -1025,38 +1160,42 @@ static void __drain_alien_cache(struct kmem_cache *cachep,
1025/* 1160/*
1026 * Called from cache_reap() to regularly drain alien caches round robin. 1161 * Called from cache_reap() to regularly drain alien caches round robin.
1027 */ 1162 */
1028static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3) 1163static int
1164reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3, int *this_cpu)
1029{ 1165{
1030 int node = __get_cpu_var(reap_node); 1166 int node = per_cpu(reap_node, *this_cpu);
1031 1167
1032 if (l3->alien) { 1168 if (l3->alien) {
1033 struct array_cache *ac = l3->alien[node]; 1169 struct array_cache *ac = l3->alien[node];
1034 1170
1035 if (ac && ac->avail && spin_trylock_irq(&ac->lock)) { 1171 if (ac && ac->avail && spin_trylock_irq(&ac->lock)) {
1036 __drain_alien_cache(cachep, ac, node); 1172 __drain_alien_cache(cachep, ac, node, this_cpu);
1037 spin_unlock_irq(&ac->lock); 1173 spin_unlock_irq(&ac->lock);
1174 return 1;
1038 } 1175 }
1039 } 1176 }
1177 return 0;
1040} 1178}
1041 1179
1042static void drain_alien_cache(struct kmem_cache *cachep, 1180static void drain_alien_cache(struct kmem_cache *cachep,
1043 struct array_cache **alien) 1181 struct array_cache **alien)
1044{ 1182{
1045 int i = 0; 1183 int i = 0, this_cpu;
1046 struct array_cache *ac; 1184 struct array_cache *ac;
1047 unsigned long flags; 1185 unsigned long flags;
1048 1186
1049 for_each_online_node(i) { 1187 for_each_online_node(i) {
1050 ac = alien[i]; 1188 ac = alien[i];
1051 if (ac) { 1189 if (ac) {
1052 spin_lock_irqsave(&ac->lock, flags); 1190 slab_spin_lock_irqsave(&ac->lock, flags, this_cpu);
1053 __drain_alien_cache(cachep, ac, i); 1191 __drain_alien_cache(cachep, ac, i, &this_cpu);
1054 spin_unlock_irqrestore(&ac->lock, flags); 1192 slab_spin_unlock_irqrestore(&ac->lock, flags, this_cpu);
1055 } 1193 }
1056 } 1194 }
1057} 1195}
1058 1196
1059static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) 1197static inline int
1198cache_free_alien(struct kmem_cache *cachep, void *objp, int *this_cpu)
1060{ 1199{
1061 struct slab *slabp = virt_to_slab(objp); 1200 struct slab *slabp = virt_to_slab(objp);
1062 int nodeid = slabp->nodeid; 1201 int nodeid = slabp->nodeid;
@@ -1064,7 +1203,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1064 struct array_cache *alien = NULL; 1203 struct array_cache *alien = NULL;
1065 int node; 1204 int node;
1066 1205
1067 node = numa_node_id(); 1206 node = cpu_to_node(*this_cpu);
1068 1207
1069 /* 1208 /*
1070 * Make sure we are not freeing a object from another node to the array 1209 * Make sure we are not freeing a object from another node to the array
@@ -1080,20 +1219,20 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1080 spin_lock(&alien->lock); 1219 spin_lock(&alien->lock);
1081 if (unlikely(alien->avail == alien->limit)) { 1220 if (unlikely(alien->avail == alien->limit)) {
1082 STATS_INC_ACOVERFLOW(cachep); 1221 STATS_INC_ACOVERFLOW(cachep);
1083 __drain_alien_cache(cachep, alien, nodeid); 1222 __drain_alien_cache(cachep, alien, nodeid, this_cpu);
1084 } 1223 }
1085 alien->entry[alien->avail++] = objp; 1224 alien->entry[alien->avail++] = objp;
1086 spin_unlock(&alien->lock); 1225 spin_unlock(&alien->lock);
1087 } else { 1226 } else {
1088 spin_lock(&(cachep->nodelists[nodeid])->list_lock); 1227 spin_lock(&(cachep->nodelists[nodeid])->list_lock);
1089 free_block(cachep, &objp, 1, nodeid); 1228 free_block(cachep, &objp, 1, nodeid, this_cpu);
1090 spin_unlock(&(cachep->nodelists[nodeid])->list_lock); 1229 spin_unlock(&(cachep->nodelists[nodeid])->list_lock);
1091 } 1230 }
1092 return 1; 1231 return 1;
1093} 1232}
1094#endif 1233#endif
1095 1234
1096static void __cpuinit cpuup_canceled(long cpu) 1235static void __cpuinit cpuup_canceled(int cpu)
1097{ 1236{
1098 struct kmem_cache *cachep; 1237 struct kmem_cache *cachep;
1099 struct kmem_list3 *l3 = NULL; 1238 struct kmem_list3 *l3 = NULL;
@@ -1104,6 +1243,7 @@ static void __cpuinit cpuup_canceled(long cpu)
1104 struct array_cache *nc; 1243 struct array_cache *nc;
1105 struct array_cache *shared; 1244 struct array_cache *shared;
1106 struct array_cache **alien; 1245 struct array_cache **alien;
1246 int orig_cpu = cpu;
1107 1247
1108 /* cpu is dead; no one can alloc from it. */ 1248 /* cpu is dead; no one can alloc from it. */
1109 nc = cachep->array[cpu]; 1249 nc = cachep->array[cpu];
@@ -1118,7 +1258,8 @@ static void __cpuinit cpuup_canceled(long cpu)
1118 /* Free limit for this kmem_list3 */ 1258 /* Free limit for this kmem_list3 */
1119 l3->free_limit -= cachep->batchcount; 1259 l3->free_limit -= cachep->batchcount;
1120 if (nc) 1260 if (nc)
1121 free_block(cachep, nc->entry, nc->avail, node); 1261 free_block(cachep, nc->entry, nc->avail, node,
1262 &cpu);
1122 1263
1123 if (!cpus_empty(*mask)) { 1264 if (!cpus_empty(*mask)) {
1124 spin_unlock_irq(&l3->list_lock); 1265 spin_unlock_irq(&l3->list_lock);
@@ -1128,7 +1269,7 @@ static void __cpuinit cpuup_canceled(long cpu)
1128 shared = l3->shared; 1269 shared = l3->shared;
1129 if (shared) { 1270 if (shared) {
1130 free_block(cachep, shared->entry, 1271 free_block(cachep, shared->entry,
1131 shared->avail, node); 1272 shared->avail, node, &cpu);
1132 l3->shared = NULL; 1273 l3->shared = NULL;
1133 } 1274 }
1134 1275
@@ -1144,6 +1285,7 @@ static void __cpuinit cpuup_canceled(long cpu)
1144 } 1285 }
1145free_array_cache: 1286free_array_cache:
1146 kfree(nc); 1287 kfree(nc);
1288 BUG_ON(cpu != orig_cpu);
1147 } 1289 }
1148 /* 1290 /*
1149 * In the previous loop, all the objects were freed to 1291 * In the previous loop, all the objects were freed to
@@ -1158,7 +1300,7 @@ free_array_cache:
1158 } 1300 }
1159} 1301}
1160 1302
1161static int __cpuinit cpuup_prepare(long cpu) 1303static int __cpuinit cpuup_prepare(int cpu)
1162{ 1304{
1163 struct kmem_cache *cachep; 1305 struct kmem_cache *cachep;
1164 struct kmem_list3 *l3 = NULL; 1306 struct kmem_list3 *l3 = NULL;
@@ -1266,10 +1408,19 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
1266 long cpu = (long)hcpu; 1408 long cpu = (long)hcpu;
1267 int err = 0; 1409 int err = 0;
1268 1410
1411
1269 switch (action) { 1412 switch (action) {
1270 case CPU_UP_PREPARE: 1413 case CPU_UP_PREPARE:
1271 case CPU_UP_PREPARE_FROZEN: 1414 case CPU_UP_PREPARE_FROZEN:
1272 mutex_lock(&cache_chain_mutex); 1415 mutex_lock(&cache_chain_mutex);
1416 /*
1417 * lock/unlock cycle to push any holders away -- no new ones
1418 * can come in due to the cpu still being offline.
1419 *
1420 * XXX -- weird case anyway, can it happen?
1421 */
1422 slab_irq_disable_this_rt(cpu);
1423 slab_irq_enable_rt(cpu);
1273 err = cpuup_prepare(cpu); 1424 err = cpuup_prepare(cpu);
1274 mutex_unlock(&cache_chain_mutex); 1425 mutex_unlock(&cache_chain_mutex);
1275 break; 1426 break;
@@ -1309,10 +1460,14 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
1309 case CPU_UP_CANCELED: 1460 case CPU_UP_CANCELED:
1310 case CPU_UP_CANCELED_FROZEN: 1461 case CPU_UP_CANCELED_FROZEN:
1311 mutex_lock(&cache_chain_mutex); 1462 mutex_lock(&cache_chain_mutex);
1463 slab_irq_disable_this_rt(cpu);
1312 cpuup_canceled(cpu); 1464 cpuup_canceled(cpu);
1465 slab_irq_enable_rt(cpu);
1313 mutex_unlock(&cache_chain_mutex); 1466 mutex_unlock(&cache_chain_mutex);
1314 break; 1467 break;
1315 } 1468 }
1469
1470
1316 return err ? NOTIFY_BAD : NOTIFY_OK; 1471 return err ? NOTIFY_BAD : NOTIFY_OK;
1317} 1472}
1318 1473
@@ -1370,6 +1525,12 @@ void __init kmem_cache_init(void)
1370 int order; 1525 int order;
1371 int node; 1526 int node;
1372 1527
1528#ifdef CONFIG_PREEMPT_RT
1529 for_each_possible_cpu(i) {
1530 INIT_LIST_HEAD(&__get_cpu_var_locked(slab, i));
1531 }
1532#endif
1533
1373 if (num_possible_nodes() == 1) 1534 if (num_possible_nodes() == 1)
1374 use_alien_caches = 0; 1535 use_alien_caches = 0;
1375 1536
@@ -1499,32 +1660,34 @@ void __init kmem_cache_init(void)
1499 /* 4) Replace the bootstrap head arrays */ 1660 /* 4) Replace the bootstrap head arrays */
1500 { 1661 {
1501 struct array_cache *ptr; 1662 struct array_cache *ptr;
1663 int cpu = smp_processor_id();
1502 1664
1503 ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); 1665 ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
1504 1666
1505 BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache); 1667 BUG_ON(cpu_cache_get(&cache_cache, cpu) !=
1506 memcpy(ptr, cpu_cache_get(&cache_cache), 1668 &initarray_cache.cache);
1669 memcpy(ptr, cpu_cache_get(&cache_cache, cpu),
1507 sizeof(struct arraycache_init)); 1670 sizeof(struct arraycache_init));
1508 /* 1671 /*
1509 * Do not assume that spinlocks can be initialized via memcpy: 1672 * Do not assume that spinlocks can be initialized via memcpy:
1510 */ 1673 */
1511 spin_lock_init(&ptr->lock); 1674 spin_lock_init(&ptr->lock);
1512 1675
1513 cache_cache.array[smp_processor_id()] = ptr; 1676 cache_cache.array[cpu] = ptr;
1514 1677
1515 ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); 1678 ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
1516 1679
1517 BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep) 1680 BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep, cpu)
1518 != &initarray_generic.cache); 1681 != &initarray_generic.cache);
1519 memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep), 1682 memcpy(ptr,
1683 cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep, cpu),
1520 sizeof(struct arraycache_init)); 1684 sizeof(struct arraycache_init));
1521 /* 1685 /*
1522 * Do not assume that spinlocks can be initialized via memcpy: 1686 * Do not assume that spinlocks can be initialized via memcpy:
1523 */ 1687 */
1524 spin_lock_init(&ptr->lock); 1688 spin_lock_init(&ptr->lock);
1525 1689
1526 malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] = 1690 malloc_sizes[INDEX_AC].cs_cachep->array[cpu] = ptr;
1527 ptr;
1528 } 1691 }
1529 /* 5) Replace the bootstrap kmem_list3's */ 1692 /* 5) Replace the bootstrap kmem_list3's */
1530 { 1693 {
@@ -1642,12 +1805,14 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1642/* 1805/*
1643 * Interface to system's page release. 1806 * Interface to system's page release.
1644 */ 1807 */
1645static void kmem_freepages(struct kmem_cache *cachep, void *addr) 1808static void kmem_freepages(struct kmem_cache *cachep, void *addr, int cpu)
1646{ 1809{
1647 unsigned long i = (1 << cachep->gfporder); 1810 unsigned long i = (1 << cachep->gfporder);
1648 struct page *page = virt_to_page(addr); 1811 struct page *page, *basepage = virt_to_page(addr);
1649 const unsigned long nr_freed = i; 1812 const unsigned long nr_freed = i;
1650 1813
1814 page = basepage;
1815
1651 kmemcheck_free_shadow(page, cachep->gfporder); 1816 kmemcheck_free_shadow(page, cachep->gfporder);
1652 1817
1653 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1818 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
@@ -1656,6 +1821,7 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr)
1656 else 1821 else
1657 sub_zone_page_state(page_zone(page), 1822 sub_zone_page_state(page_zone(page),
1658 NR_SLAB_UNRECLAIMABLE, nr_freed); 1823 NR_SLAB_UNRECLAIMABLE, nr_freed);
1824
1659 while (i--) { 1825 while (i--) {
1660 BUG_ON(!PageSlab(page)); 1826 BUG_ON(!PageSlab(page));
1661 __ClearPageSlab(page); 1827 __ClearPageSlab(page);
@@ -1663,6 +1829,13 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr)
1663 } 1829 }
1664 if (current->reclaim_state) 1830 if (current->reclaim_state)
1665 current->reclaim_state->reclaimed_slab += nr_freed; 1831 current->reclaim_state->reclaimed_slab += nr_freed;
1832
1833#ifdef CONFIG_PREEMPT_RT
1834 if (cpu >= 0) {
1835 basepage->index = cachep->gfporder;
1836 list_add(&basepage->lru, &__get_cpu_var_locked(slab, cpu));
1837 } else
1838#endif
1666 free_pages((unsigned long)addr, cachep->gfporder); 1839 free_pages((unsigned long)addr, cachep->gfporder);
1667} 1840}
1668 1841
@@ -1671,7 +1844,7 @@ static void kmem_rcu_free(struct rcu_head *head)
1671 struct slab_rcu *slab_rcu = (struct slab_rcu *)head; 1844 struct slab_rcu *slab_rcu = (struct slab_rcu *)head;
1672 struct kmem_cache *cachep = slab_rcu->cachep; 1845 struct kmem_cache *cachep = slab_rcu->cachep;
1673 1846
1674 kmem_freepages(cachep, slab_rcu->addr); 1847 kmem_freepages(cachep, slab_rcu->addr, -1);
1675 if (OFF_SLAB(cachep)) 1848 if (OFF_SLAB(cachep))
1676 kmem_cache_free(cachep->slabp_cache, slab_rcu); 1849 kmem_cache_free(cachep->slabp_cache, slab_rcu);
1677} 1850}
@@ -1691,7 +1864,7 @@ static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr,
1691 1864
1692 *addr++ = 0x12345678; 1865 *addr++ = 0x12345678;
1693 *addr++ = caller; 1866 *addr++ = caller;
1694 *addr++ = smp_processor_id(); 1867 *addr++ = raw_smp_processor_id();
1695 size -= 3 * sizeof(unsigned long); 1868 size -= 3 * sizeof(unsigned long);
1696 { 1869 {
1697 unsigned long *sptr = &caller; 1870 unsigned long *sptr = &caller;
@@ -1881,6 +2054,10 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slab
1881} 2054}
1882#endif 2055#endif
1883 2056
2057static void
2058__cache_free(struct kmem_cache *cachep, void *objp, int *this_cpu);
2059
2060
1884/** 2061/**
1885 * slab_destroy - destroy and release all objects in a slab 2062 * slab_destroy - destroy and release all objects in a slab
1886 * @cachep: cache pointer being destroyed 2063 * @cachep: cache pointer being destroyed
@@ -1890,7 +2067,8 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slab
1890 * Before calling the slab must have been unlinked from the cache. The 2067 * Before calling the slab must have been unlinked from the cache. The
1891 * cache-lock is not held/needed. 2068 * cache-lock is not held/needed.
1892 */ 2069 */
1893static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) 2070static void
2071slab_destroy(struct kmem_cache *cachep, struct slab *slabp, int *this_cpu)
1894{ 2072{
1895 void *addr = slabp->s_mem - slabp->colouroff; 2073 void *addr = slabp->s_mem - slabp->colouroff;
1896 2074
@@ -1903,9 +2081,13 @@ static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp)
1903 slab_rcu->addr = addr; 2081 slab_rcu->addr = addr;
1904 call_rcu(&slab_rcu->head, kmem_rcu_free); 2082 call_rcu(&slab_rcu->head, kmem_rcu_free);
1905 } else { 2083 } else {
1906 kmem_freepages(cachep, addr); 2084 kmem_freepages(cachep, addr, *this_cpu);
1907 if (OFF_SLAB(cachep)) 2085 if (OFF_SLAB(cachep)) {
1908 kmem_cache_free(cachep->slabp_cache, slabp); 2086 if (this_cpu)
2087 __cache_free(cachep->slabp_cache, slabp, this_cpu);
2088 else
2089 kmem_cache_free(cachep->slabp_cache, slabp);
2090 }
1909 } 2091 }
1910} 2092}
1911 2093
@@ -2002,6 +2184,8 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
2002 2184
2003static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) 2185static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2004{ 2186{
2187 int this_cpu;
2188
2005 if (g_cpucache_up == FULL) 2189 if (g_cpucache_up == FULL)
2006 return enable_cpucache(cachep, gfp); 2190 return enable_cpucache(cachep, gfp);
2007 2191
@@ -2045,10 +2229,12 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2045 jiffies + REAPTIMEOUT_LIST3 + 2229 jiffies + REAPTIMEOUT_LIST3 +
2046 ((unsigned long)cachep) % REAPTIMEOUT_LIST3; 2230 ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
2047 2231
2048 cpu_cache_get(cachep)->avail = 0; 2232 this_cpu = raw_smp_processor_id();
2049 cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES; 2233
2050 cpu_cache_get(cachep)->batchcount = 1; 2234 cpu_cache_get(cachep, this_cpu)->avail = 0;
2051 cpu_cache_get(cachep)->touched = 0; 2235 cpu_cache_get(cachep, this_cpu)->limit = BOOT_CPUCACHE_ENTRIES;
2236 cpu_cache_get(cachep, this_cpu)->batchcount = 1;
2237 cpu_cache_get(cachep, this_cpu)->touched = 0;
2052 cachep->batchcount = 1; 2238 cachep->batchcount = 1;
2053 cachep->limit = BOOT_CPUCACHE_ENTRIES; 2239 cachep->limit = BOOT_CPUCACHE_ENTRIES;
2054 return 0; 2240 return 0;
@@ -2358,19 +2544,19 @@ EXPORT_SYMBOL(kmem_cache_create);
2358#if DEBUG 2544#if DEBUG
2359static void check_irq_off(void) 2545static void check_irq_off(void)
2360{ 2546{
2547/*
2548 * On PREEMPT_RT we use locks to protect the per-CPU lists,
2549 * and keep interrupts enabled.
2550 */
2551#ifndef CONFIG_PREEMPT_RT
2361 BUG_ON(!irqs_disabled()); 2552 BUG_ON(!irqs_disabled());
2553#endif
2362} 2554}
2363 2555
2364static void check_irq_on(void) 2556static void check_irq_on(void)
2365{ 2557{
2558#ifndef CONFIG_PREEMPT_RT
2366 BUG_ON(irqs_disabled()); 2559 BUG_ON(irqs_disabled());
2367}
2368
2369static void check_spinlock_acquired(struct kmem_cache *cachep)
2370{
2371#ifdef CONFIG_SMP
2372 check_irq_off();
2373 assert_spin_locked(&cachep->nodelists[numa_node_id()]->list_lock);
2374#endif 2560#endif
2375} 2561}
2376 2562
@@ -2385,34 +2571,67 @@ static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)
2385#else 2571#else
2386#define check_irq_off() do { } while(0) 2572#define check_irq_off() do { } while(0)
2387#define check_irq_on() do { } while(0) 2573#define check_irq_on() do { } while(0)
2388#define check_spinlock_acquired(x) do { } while(0)
2389#define check_spinlock_acquired_node(x, y) do { } while(0) 2574#define check_spinlock_acquired_node(x, y) do { } while(0)
2390#endif 2575#endif
2391 2576
2392static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, 2577static int drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
2393 struct array_cache *ac, 2578 struct array_cache *ac,
2394 int force, int node); 2579 int force, int node);
2395 2580
2396static void do_drain(void *arg) 2581static void __do_drain(void *arg, int this_cpu)
2397{ 2582{
2398 struct kmem_cache *cachep = arg; 2583 struct kmem_cache *cachep = arg;
2584 int node = cpu_to_node(this_cpu);
2399 struct array_cache *ac; 2585 struct array_cache *ac;
2400 int node = numa_node_id();
2401 2586
2402 check_irq_off(); 2587 check_irq_off();
2403 ac = cpu_cache_get(cachep); 2588 ac = cpu_cache_get(cachep, this_cpu);
2404 spin_lock(&cachep->nodelists[node]->list_lock); 2589 spin_lock(&cachep->nodelists[node]->list_lock);
2405 free_block(cachep, ac->entry, ac->avail, node); 2590 free_block(cachep, ac->entry, ac->avail, node, &this_cpu);
2406 spin_unlock(&cachep->nodelists[node]->list_lock); 2591 spin_unlock(&cachep->nodelists[node]->list_lock);
2407 ac->avail = 0; 2592 ac->avail = 0;
2408} 2593}
2409 2594
2595#ifdef CONFIG_PREEMPT_RT
2596static void do_drain(void *arg, int this_cpu)
2597{
2598 __do_drain(arg, this_cpu);
2599}
2600#else
2601static void do_drain(void *arg)
2602{
2603 __do_drain(arg, smp_processor_id());
2604}
2605#endif
2606
2607#ifdef CONFIG_PREEMPT_RT
2608/*
2609 * execute func() for all CPUs. On PREEMPT_RT we dont actually have
2610 * to run on the remote CPUs - we only have to take their CPU-locks.
2611 * (This is a rare operation, so cacheline bouncing is not an issue.)
2612 */
2613static void
2614slab_on_each_cpu(void (*func)(void *arg, int this_cpu), void *arg)
2615{
2616 unsigned int i;
2617
2618 check_irq_on();
2619 for_each_online_cpu(i) {
2620 spin_lock(&__get_cpu_lock(slab, i));
2621 func(arg, i);
2622 spin_unlock(&__get_cpu_lock(slab, i));
2623 }
2624}
2625#else
2626# define slab_on_each_cpu(func, cachep) on_each_cpu(func, cachep, 1)
2627#endif
2628
2410static void drain_cpu_caches(struct kmem_cache *cachep) 2629static void drain_cpu_caches(struct kmem_cache *cachep)
2411{ 2630{
2412 struct kmem_list3 *l3; 2631 struct kmem_list3 *l3;
2413 int node; 2632 int node;
2414 2633
2415 on_each_cpu(do_drain, cachep, 1); 2634 slab_on_each_cpu(do_drain, cachep);
2416 check_irq_on(); 2635 check_irq_on();
2417 for_each_online_node(node) { 2636 for_each_online_node(node) {
2418 l3 = cachep->nodelists[node]; 2637 l3 = cachep->nodelists[node];
@@ -2437,16 +2656,16 @@ static int drain_freelist(struct kmem_cache *cache,
2437 struct kmem_list3 *l3, int tofree) 2656 struct kmem_list3 *l3, int tofree)
2438{ 2657{
2439 struct list_head *p; 2658 struct list_head *p;
2440 int nr_freed; 2659 int nr_freed, this_cpu;
2441 struct slab *slabp; 2660 struct slab *slabp;
2442 2661
2443 nr_freed = 0; 2662 nr_freed = 0;
2444 while (nr_freed < tofree && !list_empty(&l3->slabs_free)) { 2663 while (nr_freed < tofree && !list_empty(&l3->slabs_free)) {
2445 2664
2446 spin_lock_irq(&l3->list_lock); 2665 slab_spin_lock_irq(&l3->list_lock, this_cpu);
2447 p = l3->slabs_free.prev; 2666 p = l3->slabs_free.prev;
2448 if (p == &l3->slabs_free) { 2667 if (p == &l3->slabs_free) {
2449 spin_unlock_irq(&l3->list_lock); 2668 slab_spin_unlock_irq(&l3->list_lock, this_cpu);
2450 goto out; 2669 goto out;
2451 } 2670 }
2452 2671
@@ -2455,13 +2674,9 @@ static int drain_freelist(struct kmem_cache *cache,
2455 BUG_ON(slabp->inuse); 2674 BUG_ON(slabp->inuse);
2456#endif 2675#endif
2457 list_del(&slabp->list); 2676 list_del(&slabp->list);
2458 /*
2459 * Safe to drop the lock. The slab is no longer linked
2460 * to the cache.
2461 */
2462 l3->free_objects -= cache->num; 2677 l3->free_objects -= cache->num;
2463 spin_unlock_irq(&l3->list_lock); 2678 slab_destroy(cache, slabp, &this_cpu);
2464 slab_destroy(cache, slabp); 2679 slab_spin_unlock_irq(&l3->list_lock, this_cpu);
2465 nr_freed++; 2680 nr_freed++;
2466 } 2681 }
2467out: 2682out:
@@ -2725,8 +2940,8 @@ static void slab_map_pages(struct kmem_cache *cache, struct slab *slab,
2725 * Grow (by 1) the number of slabs within a cache. This is called by 2940 * Grow (by 1) the number of slabs within a cache. This is called by
2726 * kmem_cache_alloc() when there are no active objs left in a cache. 2941 * kmem_cache_alloc() when there are no active objs left in a cache.
2727 */ 2942 */
2728static int cache_grow(struct kmem_cache *cachep, 2943static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid,
2729 gfp_t flags, int nodeid, void *objp) 2944 void *objp, int *this_cpu)
2730{ 2945{
2731 struct slab *slabp; 2946 struct slab *slabp;
2732 size_t offset; 2947 size_t offset;
@@ -2754,8 +2969,7 @@ static int cache_grow(struct kmem_cache *cachep,
2754 2969
2755 offset *= cachep->colour_off; 2970 offset *= cachep->colour_off;
2756 2971
2757 if (local_flags & __GFP_WAIT) 2972 slab_irq_enable_GFP_WAIT(local_flags, this_cpu);
2758 local_irq_enable();
2759 2973
2760 /* 2974 /*
2761 * The test for missing atomic flag is performed here, rather than 2975 * The test for missing atomic flag is performed here, rather than
@@ -2784,8 +2998,8 @@ static int cache_grow(struct kmem_cache *cachep,
2784 2998
2785 cache_init_objs(cachep, slabp); 2999 cache_init_objs(cachep, slabp);
2786 3000
2787 if (local_flags & __GFP_WAIT) 3001 slab_irq_disable_GFP_WAIT(local_flags, this_cpu);
2788 local_irq_disable(); 3002
2789 check_irq_off(); 3003 check_irq_off();
2790 spin_lock(&l3->list_lock); 3004 spin_lock(&l3->list_lock);
2791 3005
@@ -2796,10 +3010,9 @@ static int cache_grow(struct kmem_cache *cachep,
2796 spin_unlock(&l3->list_lock); 3010 spin_unlock(&l3->list_lock);
2797 return 1; 3011 return 1;
2798opps1: 3012opps1:
2799 kmem_freepages(cachep, objp); 3013 kmem_freepages(cachep, objp, -1);
2800failed: 3014failed:
2801 if (local_flags & __GFP_WAIT) 3015 slab_irq_disable_GFP_WAIT(local_flags, this_cpu);
2802 local_irq_disable();
2803 return 0; 3016 return 0;
2804} 3017}
2805 3018
@@ -2921,7 +3134,8 @@ bad:
2921#define check_slabp(x,y) do { } while(0) 3134#define check_slabp(x,y) do { } while(0)
2922#endif 3135#endif
2923 3136
2924static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) 3137static void *
3138cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags, int *this_cpu)
2925{ 3139{
2926 int batchcount; 3140 int batchcount;
2927 struct kmem_list3 *l3; 3141 struct kmem_list3 *l3;
@@ -2931,7 +3145,7 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
2931retry: 3145retry:
2932 check_irq_off(); 3146 check_irq_off();
2933 node = numa_node_id(); 3147 node = numa_node_id();
2934 ac = cpu_cache_get(cachep); 3148 ac = cpu_cache_get(cachep, *this_cpu);
2935 batchcount = ac->batchcount; 3149 batchcount = ac->batchcount;
2936 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { 3150 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
2937 /* 3151 /*
@@ -2941,7 +3155,7 @@ retry:
2941 */ 3155 */
2942 batchcount = BATCHREFILL_LIMIT; 3156 batchcount = BATCHREFILL_LIMIT;
2943 } 3157 }
2944 l3 = cachep->nodelists[node]; 3158 l3 = cachep->nodelists[cpu_to_node(*this_cpu)];
2945 3159
2946 BUG_ON(ac->avail > 0 || !l3); 3160 BUG_ON(ac->avail > 0 || !l3);
2947 spin_lock(&l3->list_lock); 3161 spin_lock(&l3->list_lock);
@@ -2964,7 +3178,7 @@ retry:
2964 3178
2965 slabp = list_entry(entry, struct slab, list); 3179 slabp = list_entry(entry, struct slab, list);
2966 check_slabp(cachep, slabp); 3180 check_slabp(cachep, slabp);
2967 check_spinlock_acquired(cachep); 3181 check_spinlock_acquired_node(cachep, cpu_to_node(*this_cpu));
2968 3182
2969 /* 3183 /*
2970 * The slab was either on partial or free list so 3184 * The slab was either on partial or free list so
@@ -2978,8 +3192,9 @@ retry:
2978 STATS_INC_ACTIVE(cachep); 3192 STATS_INC_ACTIVE(cachep);
2979 STATS_SET_HIGH(cachep); 3193 STATS_SET_HIGH(cachep);
2980 3194
2981 ac->entry[ac->avail++] = slab_get_obj(cachep, slabp, 3195 ac->entry[ac->avail++] =
2982 node); 3196 slab_get_obj(cachep, slabp,
3197 cpu_to_node(*this_cpu));
2983 } 3198 }
2984 check_slabp(cachep, slabp); 3199 check_slabp(cachep, slabp);
2985 3200
@@ -2998,10 +3213,10 @@ alloc_done:
2998 3213
2999 if (unlikely(!ac->avail)) { 3214 if (unlikely(!ac->avail)) {
3000 int x; 3215 int x;
3001 x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL); 3216 x = cache_grow(cachep, flags | GFP_THISNODE, cpu_to_node(*this_cpu), NULL, this_cpu);
3002 3217
3003 /* cache_grow can reenable interrupts, then ac could change. */ 3218 /* cache_grow can reenable interrupts, then ac could change. */
3004 ac = cpu_cache_get(cachep); 3219 ac = cpu_cache_get(cachep, *this_cpu);
3005 if (!x && ac->avail == 0) /* no objects in sight? abort */ 3220 if (!x && ac->avail == 0) /* no objects in sight? abort */
3006 return NULL; 3221 return NULL;
3007 3222
@@ -3088,21 +3303,22 @@ static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags)
3088 return should_failslab(obj_size(cachep), flags); 3303 return should_failslab(obj_size(cachep), flags);
3089} 3304}
3090 3305
3091static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) 3306static inline void *
3307____cache_alloc(struct kmem_cache *cachep, gfp_t flags, int *this_cpu)
3092{ 3308{
3093 void *objp; 3309 void *objp;
3094 struct array_cache *ac; 3310 struct array_cache *ac;
3095 3311
3096 check_irq_off(); 3312 check_irq_off();
3097 3313
3098 ac = cpu_cache_get(cachep); 3314 ac = cpu_cache_get(cachep, *this_cpu);
3099 if (likely(ac->avail)) { 3315 if (likely(ac->avail)) {
3100 STATS_INC_ALLOCHIT(cachep); 3316 STATS_INC_ALLOCHIT(cachep);
3101 ac->touched = 1; 3317 ac->touched = 1;
3102 objp = ac->entry[--ac->avail]; 3318 objp = ac->entry[--ac->avail];
3103 } else { 3319 } else {
3104 STATS_INC_ALLOCMISS(cachep); 3320 STATS_INC_ALLOCMISS(cachep);
3105 objp = cache_alloc_refill(cachep, flags); 3321 objp = cache_alloc_refill(cachep, flags, this_cpu);
3106 } 3322 }
3107 /* 3323 /*
3108 * To avoid a false negative, if an object that is in one of the 3324 * To avoid a false negative, if an object that is in one of the
@@ -3120,7 +3336,8 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3120 * If we are in_interrupt, then process context, including cpusets and 3336 * If we are in_interrupt, then process context, including cpusets and
3121 * mempolicy, may not apply and should not be used for allocation policy. 3337 * mempolicy, may not apply and should not be used for allocation policy.
3122 */ 3338 */
3123static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) 3339static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags,
3340 int *this_cpu)
3124{ 3341{
3125 int nid_alloc, nid_here; 3342 int nid_alloc, nid_here;
3126 3343
@@ -3132,7 +3349,7 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
3132 else if (current->mempolicy) 3349 else if (current->mempolicy)
3133 nid_alloc = slab_node(current->mempolicy); 3350 nid_alloc = slab_node(current->mempolicy);
3134 if (nid_alloc != nid_here) 3351 if (nid_alloc != nid_here)
3135 return ____cache_alloc_node(cachep, flags, nid_alloc); 3352 return ____cache_alloc_node(cachep, flags, nid_alloc, this_cpu);
3136 return NULL; 3353 return NULL;
3137} 3354}
3138 3355
@@ -3144,7 +3361,7 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
3144 * allocator to do its reclaim / fallback magic. We then insert the 3361 * allocator to do its reclaim / fallback magic. We then insert the
3145 * slab into the proper nodelist and then allocate from it. 3362 * slab into the proper nodelist and then allocate from it.
3146 */ 3363 */
3147static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) 3364static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags, int *this_cpu)
3148{ 3365{
3149 struct zonelist *zonelist; 3366 struct zonelist *zonelist;
3150 gfp_t local_flags; 3367 gfp_t local_flags;
@@ -3172,7 +3389,8 @@ retry:
3172 cache->nodelists[nid] && 3389 cache->nodelists[nid] &&
3173 cache->nodelists[nid]->free_objects) { 3390 cache->nodelists[nid]->free_objects) {
3174 obj = ____cache_alloc_node(cache, 3391 obj = ____cache_alloc_node(cache,
3175 flags | GFP_THISNODE, nid); 3392 flags | GFP_THISNODE, nid,
3393 this_cpu);
3176 if (obj) 3394 if (obj)
3177 break; 3395 break;
3178 } 3396 }
@@ -3185,20 +3403,21 @@ retry:
3185 * We may trigger various forms of reclaim on the allowed 3403 * We may trigger various forms of reclaim on the allowed
3186 * set and go into memory reserves if necessary. 3404 * set and go into memory reserves if necessary.
3187 */ 3405 */
3188 if (local_flags & __GFP_WAIT) 3406 slab_irq_enable_GFP_WAIT(local_flags, this_cpu);
3189 local_irq_enable(); 3407
3190 kmem_flagcheck(cache, flags); 3408 kmem_flagcheck(cache, flags);
3191 obj = kmem_getpages(cache, local_flags, numa_node_id()); 3409 obj = kmem_getpages(cache, local_flags, cpu_to_node(*this_cpu));
3192 if (local_flags & __GFP_WAIT) 3410
3193 local_irq_disable(); 3411 slab_irq_disable_GFP_WAIT(local_flags, this_cpu);
3412
3194 if (obj) { 3413 if (obj) {
3195 /* 3414 /*
3196 * Insert into the appropriate per node queues 3415 * Insert into the appropriate per node queues
3197 */ 3416 */
3198 nid = page_to_nid(virt_to_page(obj)); 3417 nid = page_to_nid(virt_to_page(obj));
3199 if (cache_grow(cache, flags, nid, obj)) { 3418 if (cache_grow(cache, flags, nid, obj, this_cpu)) {
3200 obj = ____cache_alloc_node(cache, 3419 obj = ____cache_alloc_node(cache,
3201 flags | GFP_THISNODE, nid); 3420 flags | GFP_THISNODE, nid, this_cpu);
3202 if (!obj) 3421 if (!obj)
3203 /* 3422 /*
3204 * Another processor may allocate the 3423 * Another processor may allocate the
@@ -3219,7 +3438,7 @@ retry:
3219 * An interface to enable slab creation on nodeid 3438 * An interface to enable slab creation on nodeid
3220 */ 3439 */
3221static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, 3440static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
3222 int nodeid) 3441 int nodeid, int *this_cpu)
3223{ 3442{
3224 struct list_head *entry; 3443 struct list_head *entry;
3225 struct slab *slabp; 3444 struct slab *slabp;
@@ -3267,11 +3486,11 @@ retry:
3267 3486
3268must_grow: 3487must_grow:
3269 spin_unlock(&l3->list_lock); 3488 spin_unlock(&l3->list_lock);
3270 x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL); 3489 x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL, this_cpu);
3271 if (x) 3490 if (x)
3272 goto retry; 3491 goto retry;
3273 3492
3274 return fallback_alloc(cachep, flags); 3493 return fallback_alloc(cachep, flags, this_cpu);
3275 3494
3276done: 3495done:
3277 return obj; 3496 return obj;
@@ -3294,6 +3513,7 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
3294 void *caller) 3513 void *caller)
3295{ 3514{
3296 unsigned long save_flags; 3515 unsigned long save_flags;
3516 int this_cpu, this_node;
3297 void *ptr; 3517 void *ptr;
3298 3518
3299 flags &= gfp_allowed_mask; 3519 flags &= gfp_allowed_mask;
@@ -3304,32 +3524,34 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
3304 return NULL; 3524 return NULL;
3305 3525
3306 cache_alloc_debugcheck_before(cachep, flags); 3526 cache_alloc_debugcheck_before(cachep, flags);
3307 local_irq_save(save_flags);
3308 3527
3528 slab_irq_save(save_flags, this_cpu);
3529
3530 this_node = cpu_to_node(this_cpu);
3309 if (unlikely(nodeid == -1)) 3531 if (unlikely(nodeid == -1))
3310 nodeid = numa_node_id(); 3532 nodeid = this_node;
3311 3533
3312 if (unlikely(!cachep->nodelists[nodeid])) { 3534 if (unlikely(!cachep->nodelists[nodeid])) {
3313 /* Node not bootstrapped yet */ 3535 /* Node not bootstrapped yet */
3314 ptr = fallback_alloc(cachep, flags); 3536 ptr = fallback_alloc(cachep, flags, &this_cpu);
3315 goto out; 3537 goto out;
3316 } 3538 }
3317 3539
3318 if (nodeid == numa_node_id()) { 3540 if (nodeid == this_node) {
3319 /* 3541 /*
3320 * Use the locally cached objects if possible. 3542 * Use the locally cached objects if possible.
3321 * However ____cache_alloc does not allow fallback 3543 * However ____cache_alloc does not allow fallback
3322 * to other nodes. It may fail while we still have 3544 * to other nodes. It may fail while we still have
3323 * objects on other nodes available. 3545 * objects on other nodes available.
3324 */ 3546 */
3325 ptr = ____cache_alloc(cachep, flags); 3547 ptr = ____cache_alloc(cachep, flags, &this_cpu);
3326 if (ptr) 3548 if (ptr)
3327 goto out; 3549 goto out;
3328 } 3550 }
3329 /* ___cache_alloc_node can fall back to other nodes */ 3551 /* ___cache_alloc_node can fall back to other nodes */
3330 ptr = ____cache_alloc_node(cachep, flags, nodeid); 3552 ptr = ____cache_alloc_node(cachep, flags, nodeid, &this_cpu);
3331 out: 3553 out:
3332 local_irq_restore(save_flags); 3554 slab_irq_restore(save_flags, this_cpu);
3333 ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); 3555 ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
3334 kmemleak_alloc_recursive(ptr, obj_size(cachep), 1, cachep->flags, 3556 kmemleak_alloc_recursive(ptr, obj_size(cachep), 1, cachep->flags,
3335 flags); 3557 flags);
@@ -3344,33 +3566,33 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
3344} 3566}
3345 3567
3346static __always_inline void * 3568static __always_inline void *
3347__do_cache_alloc(struct kmem_cache *cache, gfp_t flags) 3569__do_cache_alloc(struct kmem_cache *cache, gfp_t flags, int *this_cpu)
3348{ 3570{
3349 void *objp; 3571 void *objp;
3350 3572
3351 if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) { 3573 if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) {
3352 objp = alternate_node_alloc(cache, flags); 3574 objp = alternate_node_alloc(cache, flags, this_cpu);
3353 if (objp) 3575 if (objp)
3354 goto out; 3576 goto out;
3355 } 3577 }
3356 objp = ____cache_alloc(cache, flags);
3357 3578
3579 objp = ____cache_alloc(cache, flags, this_cpu);
3358 /* 3580 /*
3359 * We may just have run out of memory on the local node. 3581 * We may just have run out of memory on the local node.
3360 * ____cache_alloc_node() knows how to locate memory on other nodes 3582 * ____cache_alloc_node() knows how to locate memory on other nodes
3361 */ 3583 */
3362 if (!objp) 3584 if (!objp)
3363 objp = ____cache_alloc_node(cache, flags, numa_node_id()); 3585 objp = ____cache_alloc_node(cache, flags,
3364 3586 cpu_to_node(*this_cpu), this_cpu);
3365 out: 3587 out:
3366 return objp; 3588 return objp;
3367} 3589}
3368#else 3590#else
3369 3591
3370static __always_inline void * 3592static __always_inline void *
3371__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags) 3593__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags, int *this_cpu)
3372{ 3594{
3373 return ____cache_alloc(cachep, flags); 3595 return ____cache_alloc(cachep, flags, this_cpu);
3374} 3596}
3375 3597
3376#endif /* CONFIG_NUMA */ 3598#endif /* CONFIG_NUMA */
@@ -3379,6 +3601,7 @@ static __always_inline void *
3379__cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) 3601__cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
3380{ 3602{
3381 unsigned long save_flags; 3603 unsigned long save_flags;
3604 int this_cpu;
3382 void *objp; 3605 void *objp;
3383 3606
3384 flags &= gfp_allowed_mask; 3607 flags &= gfp_allowed_mask;
@@ -3389,9 +3612,9 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
3389 return NULL; 3612 return NULL;
3390 3613
3391 cache_alloc_debugcheck_before(cachep, flags); 3614 cache_alloc_debugcheck_before(cachep, flags);
3392 local_irq_save(save_flags); 3615 slab_irq_save(save_flags, this_cpu);
3393 objp = __do_cache_alloc(cachep, flags); 3616 objp = __do_cache_alloc(cachep, flags, &this_cpu);
3394 local_irq_restore(save_flags); 3617 slab_irq_restore(save_flags, this_cpu);
3395 objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); 3618 objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
3396 kmemleak_alloc_recursive(objp, obj_size(cachep), 1, cachep->flags, 3619 kmemleak_alloc_recursive(objp, obj_size(cachep), 1, cachep->flags,
3397 flags); 3620 flags);
@@ -3410,7 +3633,7 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
3410 * Caller needs to acquire correct kmem_list's list_lock 3633 * Caller needs to acquire correct kmem_list's list_lock
3411 */ 3634 */
3412static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, 3635static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
3413 int node) 3636 int node, int *this_cpu)
3414{ 3637{
3415 int i; 3638 int i;
3416 struct kmem_list3 *l3; 3639 struct kmem_list3 *l3;
@@ -3439,7 +3662,7 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
3439 * a different cache, refer to comments before 3662 * a different cache, refer to comments before
3440 * alloc_slabmgmt. 3663 * alloc_slabmgmt.
3441 */ 3664 */
3442 slab_destroy(cachep, slabp); 3665 slab_destroy(cachep, slabp, this_cpu);
3443 } else { 3666 } else {
3444 list_add(&slabp->list, &l3->slabs_free); 3667 list_add(&slabp->list, &l3->slabs_free);
3445 } 3668 }
@@ -3453,11 +3676,12 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
3453 } 3676 }
3454} 3677}
3455 3678
3456static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) 3679static void
3680cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac, int *this_cpu)
3457{ 3681{
3458 int batchcount; 3682 int batchcount;
3459 struct kmem_list3 *l3; 3683 struct kmem_list3 *l3;
3460 int node = numa_node_id(); 3684 int node = cpu_to_node(*this_cpu);
3461 3685
3462 batchcount = ac->batchcount; 3686 batchcount = ac->batchcount;
3463#if DEBUG 3687#if DEBUG
@@ -3479,7 +3703,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
3479 } 3703 }
3480 } 3704 }
3481 3705
3482 free_block(cachep, ac->entry, batchcount, node); 3706 free_block(cachep, ac->entry, batchcount, node, this_cpu);
3483free_done: 3707free_done:
3484#if STATS 3708#if STATS
3485 { 3709 {
@@ -3508,9 +3732,10 @@ free_done:
3508 * Release an obj back to its cache. If the obj has a constructed state, it must 3732 * Release an obj back to its cache. If the obj has a constructed state, it must
3509 * be in this state _before_ it is released. Called with disabled ints. 3733 * be in this state _before_ it is released. Called with disabled ints.
3510 */ 3734 */
3511static inline void __cache_free(struct kmem_cache *cachep, void *objp) 3735static inline void
3736__cache_free(struct kmem_cache *cachep, void *objp, int *this_cpu)
3512{ 3737{
3513 struct array_cache *ac = cpu_cache_get(cachep); 3738 struct array_cache *ac = cpu_cache_get(cachep, *this_cpu);
3514 3739
3515 check_irq_off(); 3740 check_irq_off();
3516 kmemleak_free_recursive(objp, cachep->flags); 3741 kmemleak_free_recursive(objp, cachep->flags);
@@ -3525,7 +3750,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp)
3525 * variable to skip the call, which is most likely to be present in 3750 * variable to skip the call, which is most likely to be present in
3526 * the cache. 3751 * the cache.
3527 */ 3752 */
3528 if (nr_online_nodes > 1 && cache_free_alien(cachep, objp)) 3753 if (nr_online_nodes > 1 && cache_free_alien(cachep, objp, this_cpu))
3529 return; 3754 return;
3530 3755
3531 if (likely(ac->avail < ac->limit)) { 3756 if (likely(ac->avail < ac->limit)) {
@@ -3534,7 +3759,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp)
3534 return; 3759 return;
3535 } else { 3760 } else {
3536 STATS_INC_FREEMISS(cachep); 3761 STATS_INC_FREEMISS(cachep);
3537 cache_flusharray(cachep, ac); 3762 cache_flusharray(cachep, ac, this_cpu);
3538 ac->entry[ac->avail++] = objp; 3763 ac->entry[ac->avail++] = objp;
3539 } 3764 }
3540} 3765}
@@ -3733,13 +3958,14 @@ EXPORT_SYMBOL(__kmalloc);
3733void kmem_cache_free(struct kmem_cache *cachep, void *objp) 3958void kmem_cache_free(struct kmem_cache *cachep, void *objp)
3734{ 3959{
3735 unsigned long flags; 3960 unsigned long flags;
3961 int this_cpu;
3736 3962
3737 local_irq_save(flags); 3963 slab_irq_save(flags, this_cpu);
3738 debug_check_no_locks_freed(objp, obj_size(cachep)); 3964 debug_check_no_locks_freed(objp, obj_size(cachep));
3739 if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) 3965 if (!(cachep->flags & SLAB_DEBUG_OBJECTS))
3740 debug_check_no_obj_freed(objp, obj_size(cachep)); 3966 debug_check_no_obj_freed(objp, obj_size(cachep));
3741 __cache_free(cachep, objp); 3967 __cache_free(cachep, objp, &this_cpu);
3742 local_irq_restore(flags); 3968 slab_irq_restore(flags, this_cpu);
3743 3969
3744 trace_kmem_cache_free(_RET_IP_, objp); 3970 trace_kmem_cache_free(_RET_IP_, objp);
3745} 3971}
@@ -3758,18 +3984,19 @@ void kfree(const void *objp)
3758{ 3984{
3759 struct kmem_cache *c; 3985 struct kmem_cache *c;
3760 unsigned long flags; 3986 unsigned long flags;
3987 int this_cpu;
3761 3988
3762 trace_kfree(_RET_IP_, objp); 3989 trace_kfree(_RET_IP_, objp);
3763 3990
3764 if (unlikely(ZERO_OR_NULL_PTR(objp))) 3991 if (unlikely(ZERO_OR_NULL_PTR(objp)))
3765 return; 3992 return;
3766 local_irq_save(flags); 3993 slab_irq_save(flags, this_cpu);
3767 kfree_debugcheck(objp); 3994 kfree_debugcheck(objp);
3768 c = virt_to_cache(objp); 3995 c = virt_to_cache(objp);
3769 debug_check_no_locks_freed(objp, obj_size(c)); 3996 debug_check_no_locks_freed(objp, obj_size(c));
3770 debug_check_no_obj_freed(objp, obj_size(c)); 3997 debug_check_no_obj_freed(objp, obj_size(c));
3771 __cache_free(c, (void *)objp); 3998 __cache_free(c, (void *)objp, &this_cpu);
3772 local_irq_restore(flags); 3999 slab_irq_restore(flags, this_cpu);
3773} 4000}
3774EXPORT_SYMBOL(kfree); 4001EXPORT_SYMBOL(kfree);
3775 4002
@@ -3790,7 +4017,7 @@ EXPORT_SYMBOL_GPL(kmem_cache_name);
3790 */ 4017 */
3791static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp) 4018static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp)
3792{ 4019{
3793 int node; 4020 int node, this_cpu;
3794 struct kmem_list3 *l3; 4021 struct kmem_list3 *l3;
3795 struct array_cache *new_shared; 4022 struct array_cache *new_shared;
3796 struct array_cache **new_alien = NULL; 4023 struct array_cache **new_alien = NULL;
@@ -3818,11 +4045,11 @@ static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp)
3818 if (l3) { 4045 if (l3) {
3819 struct array_cache *shared = l3->shared; 4046 struct array_cache *shared = l3->shared;
3820 4047
3821 spin_lock_irq(&l3->list_lock); 4048 slab_spin_lock_irq(&l3->list_lock, this_cpu);
3822 4049
3823 if (shared) 4050 if (shared)
3824 free_block(cachep, shared->entry, 4051 free_block(cachep, shared->entry,
3825 shared->avail, node); 4052 shared->avail, node, &this_cpu);
3826 4053
3827 l3->shared = new_shared; 4054 l3->shared = new_shared;
3828 if (!l3->alien) { 4055 if (!l3->alien) {
@@ -3831,7 +4058,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp)
3831 } 4058 }
3832 l3->free_limit = (1 + nr_cpus_node(node)) * 4059 l3->free_limit = (1 + nr_cpus_node(node)) *
3833 cachep->batchcount + cachep->num; 4060 cachep->batchcount + cachep->num;
3834 spin_unlock_irq(&l3->list_lock); 4061 slab_spin_unlock_irq(&l3->list_lock, this_cpu);
3835 kfree(shared); 4062 kfree(shared);
3836 free_alien_cache(new_alien); 4063 free_alien_cache(new_alien);
3837 continue; 4064 continue;
@@ -3878,24 +4105,36 @@ struct ccupdate_struct {
3878 struct array_cache *new[NR_CPUS]; 4105 struct array_cache *new[NR_CPUS];
3879}; 4106};
3880 4107
3881static void do_ccupdate_local(void *info) 4108static void __do_ccupdate_local(void *info, int this_cpu)
3882{ 4109{
3883 struct ccupdate_struct *new = info; 4110 struct ccupdate_struct *new = info;
3884 struct array_cache *old; 4111 struct array_cache *old;
3885 4112
3886 check_irq_off(); 4113 check_irq_off();
3887 old = cpu_cache_get(new->cachep); 4114 old = cpu_cache_get(new->cachep, this_cpu);
3888 4115
3889 new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()]; 4116 new->cachep->array[this_cpu] = new->new[this_cpu];
3890 new->new[smp_processor_id()] = old; 4117 new->new[this_cpu] = old;
3891} 4118}
3892 4119
4120#ifdef CONFIG_PREEMPT_RT
4121static void do_ccupdate_local(void *arg, int this_cpu)
4122{
4123 __do_ccupdate_local(arg, this_cpu);
4124}
4125#else
4126static void do_ccupdate_local(void *arg)
4127{
4128 __do_ccupdate_local(arg, smp_processor_id());
4129}
4130#endif
4131
3893/* Always called with the cache_chain_mutex held */ 4132/* Always called with the cache_chain_mutex held */
3894static int do_tune_cpucache(struct kmem_cache *cachep, int limit, 4133static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
3895 int batchcount, int shared, gfp_t gfp) 4134 int batchcount, int shared, gfp_t gfp)
3896{ 4135{
3897 struct ccupdate_struct *new; 4136 struct ccupdate_struct *new;
3898 int i; 4137 int i, this_cpu;
3899 4138
3900 new = kzalloc(sizeof(*new), gfp); 4139 new = kzalloc(sizeof(*new), gfp);
3901 if (!new) 4140 if (!new)
@@ -3913,7 +4152,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
3913 } 4152 }
3914 new->cachep = cachep; 4153 new->cachep = cachep;
3915 4154
3916 on_each_cpu(do_ccupdate_local, (void *)new, 1); 4155 slab_on_each_cpu(do_ccupdate_local, (void *)new);
3917 4156
3918 check_irq_on(); 4157 check_irq_on();
3919 cachep->batchcount = batchcount; 4158 cachep->batchcount = batchcount;
@@ -3924,9 +4163,12 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
3924 struct array_cache *ccold = new->new[i]; 4163 struct array_cache *ccold = new->new[i];
3925 if (!ccold) 4164 if (!ccold)
3926 continue; 4165 continue;
3927 spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock); 4166 slab_spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock,
3928 free_block(cachep, ccold->entry, ccold->avail, cpu_to_node(i)); 4167 this_cpu);
3929 spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock); 4168 free_block(cachep, ccold->entry, ccold->avail, cpu_to_node(i),
4169 &this_cpu);
4170 slab_spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock,
4171 this_cpu);
3930 kfree(ccold); 4172 kfree(ccold);
3931 } 4173 }
3932 kfree(new); 4174 kfree(new);
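
The hunk above also swaps on_each_cpu() for slab_on_each_cpu() when redistributing the per-CPU array caches. On PREEMPT_RT the callback cannot run from IPI context with interrupts hard-disabled, because it needs the per-CPU slab lock, which is a sleeping lock there; hence the extra do_ccupdate_local() variant that receives an explicit this_cpu. The sketch below shows the shape such a helper would plausibly take -- its real definition appears earlier in the mm/slab.c diff and is not visible in this excerpt, so the body and the lock name are assumptions:

#ifndef CONFIG_PREEMPT_RT
/* mainline: keep the old IPI-based broadcast */
static void slab_on_each_cpu(void (*func)(void *), void *arg)
{
	on_each_cpu(func, arg, 1);
}
#else
/*
 * PREEMPT_RT sketch (assumed): walk the online CPUs and run the
 * callback locally while holding that CPU's slab lock, so the callback
 * may take sleeping locks.  "slab_irq_locks" is a hypothetical name
 * for the per-CPU locked variable guarding the slab fast path.
 */
static void slab_on_each_cpu(void (*func)(void *, int), void *arg)
{
	int cpu;

	for_each_online_cpu(cpu) {
		spin_lock(&__get_cpu_lock(slab_irq_locks, cpu));
		func(arg, cpu);
		spin_unlock(&__get_cpu_lock(slab_irq_locks, cpu));
	}
}
#endif
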
@@ -3991,29 +4233,31 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
3991 * Drain an array if it contains any elements taking the l3 lock only if 4233 * Drain an array if it contains any elements taking the l3 lock only if
3992 * necessary. Note that the l3 listlock also protects the array_cache 4234 * necessary. Note that the l3 listlock also protects the array_cache
3993 * if drain_array() is used on the shared array. 4235 * if drain_array() is used on the shared array.
 4236 * Returns non-zero if some work was done.
3994 */ 4237 */
3995void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, 4238int drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
3996 struct array_cache *ac, int force, int node) 4239 struct array_cache *ac, int force, int node)
3997{ 4240{
3998 int tofree; 4241 int tofree, this_cpu;
3999 4242
4000 if (!ac || !ac->avail) 4243 if (!ac || !ac->avail)
4001 return; 4244 return 0;
4002 if (ac->touched && !force) { 4245 if (ac->touched && !force) {
4003 ac->touched = 0; 4246 ac->touched = 0;
4004 } else { 4247 } else {
4005 spin_lock_irq(&l3->list_lock); 4248 slab_spin_lock_irq(&l3->list_lock, this_cpu);
4006 if (ac->avail) { 4249 if (ac->avail) {
4007 tofree = force ? ac->avail : (ac->limit + 4) / 5; 4250 tofree = force ? ac->avail : (ac->limit + 4) / 5;
4008 if (tofree > ac->avail) 4251 if (tofree > ac->avail)
4009 tofree = (ac->avail + 1) / 2; 4252 tofree = (ac->avail + 1) / 2;
4010 free_block(cachep, ac->entry, tofree, node); 4253 free_block(cachep, ac->entry, tofree, node, &this_cpu);
4011 ac->avail -= tofree; 4254 ac->avail -= tofree;
4012 memmove(ac->entry, &(ac->entry[tofree]), 4255 memmove(ac->entry, &(ac->entry[tofree]),
4013 sizeof(void *) * ac->avail); 4256 sizeof(void *) * ac->avail);
4014 } 4257 }
4015 spin_unlock_irq(&l3->list_lock); 4258 slab_spin_unlock_irq(&l3->list_lock, this_cpu);
4016 } 4259 }
4260 return 1;
4017} 4261}
4018 4262
4019/** 4263/**
@@ -4030,10 +4274,11 @@ void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
4030 */ 4274 */
4031static void cache_reap(struct work_struct *w) 4275static void cache_reap(struct work_struct *w)
4032{ 4276{
4277 int this_cpu = raw_smp_processor_id(), node = cpu_to_node(this_cpu);
4033 struct kmem_cache *searchp; 4278 struct kmem_cache *searchp;
4034 struct kmem_list3 *l3; 4279 struct kmem_list3 *l3;
4035 int node = numa_node_id();
4036 struct delayed_work *work = to_delayed_work(w); 4280 struct delayed_work *work = to_delayed_work(w);
4281 int work_done = 0;
4037 4282
4038 if (!mutex_trylock(&cache_chain_mutex)) 4283 if (!mutex_trylock(&cache_chain_mutex))
4039 /* Give up. Setup the next iteration. */ 4284 /* Give up. Setup the next iteration. */
@@ -4049,9 +4294,12 @@ static void cache_reap(struct work_struct *w)
4049 */ 4294 */
4050 l3 = searchp->nodelists[node]; 4295 l3 = searchp->nodelists[node];
4051 4296
4052 reap_alien(searchp, l3); 4297 work_done += reap_alien(searchp, l3, &this_cpu);
4298
4299 node = cpu_to_node(this_cpu);
4053 4300
4054 drain_array(searchp, l3, cpu_cache_get(searchp), 0, node); 4301 work_done += drain_array(searchp, l3,
4302 cpu_cache_get(searchp, this_cpu), 0, node);
4055 4303
4056 /* 4304 /*
4057 * These are racy checks but it does not matter 4305 * These are racy checks but it does not matter
@@ -4062,7 +4310,7 @@ static void cache_reap(struct work_struct *w)
4062 4310
4063 l3->next_reap = jiffies + REAPTIMEOUT_LIST3; 4311 l3->next_reap = jiffies + REAPTIMEOUT_LIST3;
4064 4312
4065 drain_array(searchp, l3, l3->shared, 0, node); 4313 work_done += drain_array(searchp, l3, l3->shared, 0, node);
4066 4314
4067 if (l3->free_touched) 4315 if (l3->free_touched)
4068 l3->free_touched = 0; 4316 l3->free_touched = 0;
@@ -4081,7 +4329,8 @@ next:
4081 next_reap_node(); 4329 next_reap_node();
4082out: 4330out:
4083 /* Set up the next iteration */ 4331 /* Set up the next iteration */
4084 schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_CPUC)); 4332 schedule_delayed_work(work,
4333 round_jiffies_relative((1+!work_done) * REAPTIMEOUT_CPUC));
4085} 4334}
4086 4335
4087#ifdef CONFIG_SLABINFO 4336#ifdef CONFIG_SLABINFO
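
cache_reap() now accumulates a work_done count from reap_alien() and drain_array() and feeds it into the rescheduling decision: (1 + !work_done) evaluates to 1 when anything was freed and to 2 when the pass was a no-op, so an idle cache is only reaped half as often. The same arithmetic in isolation (the helper name below is purely illustrative, not part of the patch):

/* Illustration only -- not part of the patch. */
static unsigned long next_reap_delay(int work_done)
{
	/*
	 * work_done != 0: something was freed, keep the normal interval.
	 * work_done == 0: nothing to do, back off to twice the interval.
	 */
	return round_jiffies_relative((1 + !work_done) * REAPTIMEOUT_CPUC);
}
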
@@ -4140,7 +4389,7 @@ static int s_show(struct seq_file *m, void *p)
4140 unsigned long num_slabs, free_objects = 0, shared_avail = 0; 4389 unsigned long num_slabs, free_objects = 0, shared_avail = 0;
4141 const char *name; 4390 const char *name;
4142 char *error = NULL; 4391 char *error = NULL;
4143 int node; 4392 int this_cpu, node;
4144 struct kmem_list3 *l3; 4393 struct kmem_list3 *l3;
4145 4394
4146 active_objs = 0; 4395 active_objs = 0;
@@ -4151,7 +4400,7 @@ static int s_show(struct seq_file *m, void *p)
4151 continue; 4400 continue;
4152 4401
4153 check_irq_on(); 4402 check_irq_on();
4154 spin_lock_irq(&l3->list_lock); 4403 slab_spin_lock_irq(&l3->list_lock, this_cpu);
4155 4404
4156 list_for_each_entry(slabp, &l3->slabs_full, list) { 4405 list_for_each_entry(slabp, &l3->slabs_full, list) {
4157 if (slabp->inuse != cachep->num && !error) 4406 if (slabp->inuse != cachep->num && !error)
@@ -4176,7 +4425,7 @@ static int s_show(struct seq_file *m, void *p)
4176 if (l3->shared) 4425 if (l3->shared)
4177 shared_avail += l3->shared->avail; 4426 shared_avail += l3->shared->avail;
4178 4427
4179 spin_unlock_irq(&l3->list_lock); 4428 slab_spin_unlock_irq(&l3->list_lock, this_cpu);
4180 } 4429 }
4181 num_slabs += active_slabs; 4430 num_slabs += active_slabs;
4182 num_objs = num_slabs * cachep->num; 4431 num_objs = num_slabs * cachep->num;
@@ -4386,7 +4635,7 @@ static int leaks_show(struct seq_file *m, void *p)
4386 struct kmem_list3 *l3; 4635 struct kmem_list3 *l3;
4387 const char *name; 4636 const char *name;
4388 unsigned long *n = m->private; 4637 unsigned long *n = m->private;
4389 int node; 4638 int node, this_cpu;
4390 int i; 4639 int i;
4391 4640
4392 if (!(cachep->flags & SLAB_STORE_USER)) 4641 if (!(cachep->flags & SLAB_STORE_USER))
@@ -4404,13 +4653,13 @@ static int leaks_show(struct seq_file *m, void *p)
4404 continue; 4653 continue;
4405 4654
4406 check_irq_on(); 4655 check_irq_on();
4407 spin_lock_irq(&l3->list_lock); 4656 slab_spin_lock_irq(&l3->list_lock, this_cpu);
4408 4657
4409 list_for_each_entry(slabp, &l3->slabs_full, list) 4658 list_for_each_entry(slabp, &l3->slabs_full, list)
4410 handle_slab(n, cachep, slabp); 4659 handle_slab(n, cachep, slabp);
4411 list_for_each_entry(slabp, &l3->slabs_partial, list) 4660 list_for_each_entry(slabp, &l3->slabs_partial, list)
4412 handle_slab(n, cachep, slabp); 4661 handle_slab(n, cachep, slabp);
4413 spin_unlock_irq(&l3->list_lock); 4662 slab_spin_unlock_irq(&l3->list_lock, this_cpu);
4414 } 4663 }
4415 name = cachep->name; 4664 name = cachep->name;
4416 if (n[0] == n[1]) { 4665 if (n[0] == n[1]) {
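
All of the mm/slab.c hunks above rely on a small family of slab_* wrappers (slab_irq_save/restore, slab_spin_lock_irq/unlock_irq, slab_irq_enable_GFP_WAIT/disable_GFP_WAIT) whose definitions sit earlier in this file's diff and are not shown in this excerpt. Their job is to replace "disable interrupts and implicitly stay on this CPU" with "take a per-CPU sleeping lock and report which CPU was chosen", which is why nearly every internal slab function now threads an explicit this_cpu parameter. A hedged sketch of the expected semantics, mirroring the lock_cpu_pcp() helpers in mm/page_alloc.c; the exact spellings and the lock name are assumptions:

#ifndef CONFIG_PREEMPT_RT
/* mainline behaviour: really disable interrupts, CPU is implicit */
# define slab_irq_save(flags, this_cpu) \
	do { local_irq_save(flags); (this_cpu) = smp_processor_id(); } while (0)
# define slab_irq_restore(flags, this_cpu)	local_irq_restore(flags)

/* the old "__GFP_WAIT => briefly reenable interrupts" dance */
# define slab_irq_enable_GFP_WAIT(gfp, this_cpu) \
	do { if ((gfp) & __GFP_WAIT) local_irq_enable(); } while (0)
# define slab_irq_disable_GFP_WAIT(gfp, this_cpu) \
	do { if ((gfp) & __GFP_WAIT) local_irq_disable(); } while (0)
#else
/* assumed name for the per-CPU locked variable guarding the slab path */
static DEFINE_PER_CPU_LOCKED(int, slab_irq_locks);

# define slab_irq_save(flags, this_cpu) \
	do { (void)(flags); (void)get_cpu_var_locked(slab_irq_locks, &(this_cpu)); } while (0)
# define slab_irq_restore(flags, this_cpu) \
	do { (void)(flags); put_cpu_var_locked(slab_irq_locks, this_cpu); } while (0)

/*
 * Interrupts were never disabled, so for __GFP_WAIT allocations the
 * per-CPU lock is dropped and retaken around the page allocator call
 * instead.  The task may land on a different CPU afterwards, which is
 * why callers re-read *this_cpu (e.g. cpu_to_node(*this_cpu)) after
 * slab_irq_disable_GFP_WAIT().
 */
# define slab_irq_enable_GFP_WAIT(gfp, this_cpu) \
	do { if ((gfp) & __GFP_WAIT) put_cpu_var_locked(slab_irq_locks, *(this_cpu)); } while (0)
# define slab_irq_disable_GFP_WAIT(gfp, this_cpu) \
	do { if ((gfp) & __GFP_WAIT) (void)get_cpu_var_locked(slab_irq_locks, (this_cpu)); } while (0)
#endif
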
diff --git a/mm/swap.c b/mm/swap.c
index cb29ae5d33ab..a981acde8554 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -30,15 +30,92 @@
30#include <linux/notifier.h> 30#include <linux/notifier.h>
31#include <linux/backing-dev.h> 31#include <linux/backing-dev.h>
32#include <linux/memcontrol.h> 32#include <linux/memcontrol.h>
33#include <linux/interrupt.h>
33 34
34#include "internal.h" 35#include "internal.h"
35 36
36/* How many pages do we try to swap or page in/out together? */ 37/* How many pages do we try to swap or page in/out together? */
37int page_cluster; 38int page_cluster;
38 39
40#ifdef CONFIG_PREEMPT_RT
41/*
42 * On PREEMPT_RT we don't want to disable preemption for cpu variables.
43 * We grab a cpu and then use that cpu to lock the variables accordingly.
44 *
45 * (On !PREEMPT_RT this turns into normal preempt-off sections, as before.)
46 */
47static DEFINE_PER_CPU_LOCKED(struct pagevec[NR_LRU_LISTS], lru_add_pvecs);
48static DEFINE_PER_CPU_LOCKED(struct pagevec, lru_rotate_pvecs);
49
50#define swap_get_cpu_var_irq_save(var, flags, cpu) \
51 ({ \
52 (void)flags; \
53 &get_cpu_var_locked(var, &cpu); \
54 })
55
56#define swap_put_cpu_var_irq_restore(var, flags, cpu) \
57 put_cpu_var_locked(var, cpu)
58
59#define swap_get_cpu_var(var, cpu) \
60 &get_cpu_var_locked(var, &cpu)
61
62#define swap_put_cpu_var(var, cpu) \
63 put_cpu_var_locked(var, cpu)
64
65#define swap_per_cpu_lock(var, cpu) \
66 ({ \
67 spin_lock(&__get_cpu_lock(var, cpu)); \
68 &__get_cpu_var_locked(var, cpu); \
69 })
70
71#define swap_per_cpu_unlock(var, cpu) \
72 spin_unlock(&__get_cpu_lock(var, cpu));
73
74#define swap_get_cpu() raw_smp_processor_id()
75
76#define swap_put_cpu() do { } while (0)
77
78#define swap_irq_save(flags) do { (void)flags; } while (0)
79
80#define swap_irq_restore(flags) do { (void)flags; } while (0)
81
82#else
83
39static DEFINE_PER_CPU(struct pagevec[NR_LRU_LISTS], lru_add_pvecs); 84static DEFINE_PER_CPU(struct pagevec[NR_LRU_LISTS], lru_add_pvecs);
40static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); 85static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
41 86
87#define swap_get_cpu_var_irq_save(var, flags, cpu) \
88 ({ \
89 (void)cpu; \
90 local_irq_save(flags); \
91 &__get_cpu_var(var); \
92 })
93
94#define swap_put_cpu_var_irq_restore(var, flags, cpu) \
95 local_irq_restore(flags)
96
97#define swap_get_cpu_var(var, cpu) \
98 ({ \
99 (void)cpu; \
100 &get_cpu_var(var); \
101 })
102
103#define swap_put_cpu_var(var, cpu) put_cpu_var(var)
104
105#define swap_per_cpu_lock(var, cpu) &per_cpu(var, cpu)
106
107#define swap_per_cpu_unlock(var, cpu) do { } while (0)
108
109#define swap_get_cpu() get_cpu()
110
111#define swap_put_cpu() put_cpu()
112
113#define swap_irq_save(flags) local_irq_save(flags)
114
115#define swap_irq_restore(flags) local_irq_restore(flags)
116
117#endif
118
42/* 119/*
43 * This path almost never happens for VM activity - pages are normally 120 * This path almost never happens for VM activity - pages are normally
44 * freed via pagevecs. But it gets used by networking. 121 * freed via pagevecs. But it gets used by networking.
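
The swap_*() wrappers above (like the pcp_locks and slab paths in the other files) sit on top of the -rt "locked per-CPU variable" primitives from include/linux/percpu.h -- the file flagged as a merge conflict in this commit. A hedged sketch of what those primitives amount to; the bodies below are illustrative assumptions, not the header's actual text:

/* one spinlock (a sleeping lock on -rt) next to each per-CPU instance */
#define DEFINE_PER_CPU_LOCKED(type, name)				\
	DEFINE_PER_CPU(spinlock_t, __lock_##name);			\
	DEFINE_PER_CPU(type, __var_##name)

#define __get_cpu_lock(name, cpu)	per_cpu(__lock_##name, cpu)
#define __get_cpu_var_locked(name, cpu)	per_cpu(__var_##name, cpu)

/*
 * Pick the current CPU, take its lock and report which CPU was chosen.
 * Preemption stays enabled; if the task migrates afterwards it simply
 * keeps operating on (and eventually unlocks) the CPU it locked.
 */
#define get_cpu_var_locked(name, cpuptr)				\
(*({									\
	int __cpu = raw_smp_processor_id();				\
	*(cpuptr) = __cpu;						\
	spin_lock(&__get_cpu_lock(name, __cpu));			\
	&__get_cpu_var_locked(name, __cpu);				\
}))

#define put_cpu_var_locked(name, cpu)					\
	spin_unlock(&__get_cpu_lock(name, cpu))
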
@@ -141,13 +218,13 @@ void rotate_reclaimable_page(struct page *page)
141 !PageUnevictable(page) && PageLRU(page)) { 218 !PageUnevictable(page) && PageLRU(page)) {
142 struct pagevec *pvec; 219 struct pagevec *pvec;
143 unsigned long flags; 220 unsigned long flags;
221 int cpu;
144 222
145 page_cache_get(page); 223 page_cache_get(page);
146 local_irq_save(flags); 224 pvec = swap_get_cpu_var_irq_save(lru_rotate_pvecs, flags, cpu);
147 pvec = &__get_cpu_var(lru_rotate_pvecs);
148 if (!pagevec_add(pvec, page)) 225 if (!pagevec_add(pvec, page))
149 pagevec_move_tail(pvec); 226 pagevec_move_tail(pvec);
150 local_irq_restore(flags); 227 swap_put_cpu_var_irq_restore(lru_rotate_pvecs, flags, cpu);
151 } 228 }
152} 229}
153 230
@@ -216,12 +293,14 @@ EXPORT_SYMBOL(mark_page_accessed);
216 293
217void __lru_cache_add(struct page *page, enum lru_list lru) 294void __lru_cache_add(struct page *page, enum lru_list lru)
218{ 295{
219 struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru]; 296 struct pagevec *pvec;
297 int cpu;
220 298
299 pvec = swap_get_cpu_var(lru_add_pvecs, cpu)[lru];
221 page_cache_get(page); 300 page_cache_get(page);
222 if (!pagevec_add(pvec, page)) 301 if (!pagevec_add(pvec, page))
223 ____pagevec_lru_add(pvec, lru); 302 ____pagevec_lru_add(pvec, lru);
224 put_cpu_var(lru_add_pvecs); 303 swap_put_cpu_var(lru_add_pvecs, cpu);
225} 304}
226 305
227/** 306/**
@@ -271,31 +350,33 @@ void add_page_to_unevictable_list(struct page *page)
271 */ 350 */
272static void drain_cpu_pagevecs(int cpu) 351static void drain_cpu_pagevecs(int cpu)
273{ 352{
274 struct pagevec *pvecs = per_cpu(lru_add_pvecs, cpu); 353 struct pagevec *pvecs, *pvec;
275 struct pagevec *pvec;
276 int lru; 354 int lru;
277 355
356 pvecs = swap_per_cpu_lock(lru_add_pvecs, cpu)[0];
278 for_each_lru(lru) { 357 for_each_lru(lru) {
279 pvec = &pvecs[lru - LRU_BASE]; 358 pvec = &pvecs[lru - LRU_BASE];
280 if (pagevec_count(pvec)) 359 if (pagevec_count(pvec))
281 ____pagevec_lru_add(pvec, lru); 360 ____pagevec_lru_add(pvec, lru);
282 } 361 }
362 swap_per_cpu_unlock(lru_add_pvecs, cpu);
283 363
284 pvec = &per_cpu(lru_rotate_pvecs, cpu); 364 pvec = swap_per_cpu_lock(lru_rotate_pvecs, cpu);
285 if (pagevec_count(pvec)) { 365 if (pagevec_count(pvec)) {
286 unsigned long flags; 366 unsigned long flags;
287 367
288 /* No harm done if a racing interrupt already did this */ 368 /* No harm done if a racing interrupt already did this */
289 local_irq_save(flags); 369 swap_irq_save(flags);
290 pagevec_move_tail(pvec); 370 pagevec_move_tail(pvec);
291 local_irq_restore(flags); 371 swap_irq_restore(flags);
292 } 372 }
373 swap_per_cpu_unlock(lru_rotate_pvecs, cpu);
293} 374}
294 375
295void lru_add_drain(void) 376void lru_add_drain(void)
296{ 377{
297 drain_cpu_pagevecs(get_cpu()); 378 drain_cpu_pagevecs(swap_get_cpu());
298 put_cpu(); 379 swap_put_cpu();
299} 380}
300 381
301static void lru_add_drain_per_cpu(struct work_struct *dummy) 382static void lru_add_drain_per_cpu(struct work_struct *dummy)
@@ -369,7 +450,7 @@ void release_pages(struct page **pages, int nr, int cold)
369 } 450 }
370 __pagevec_free(&pages_to_free); 451 __pagevec_free(&pages_to_free);
371 pagevec_reinit(&pages_to_free); 452 pagevec_reinit(&pages_to_free);
372 } 453 }
373 } 454 }
374 if (zone) 455 if (zone)
375 spin_unlock_irqrestore(&zone->lru_lock, flags); 456 spin_unlock_irqrestore(&zone->lru_lock, flags);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index dea7abd31098..6911d54ff9c8 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -23,6 +23,7 @@
23#include <linux/file.h> 23#include <linux/file.h>
24#include <linux/writeback.h> 24#include <linux/writeback.h>
25#include <linux/blkdev.h> 25#include <linux/blkdev.h>
26#include <linux/interrupt.h>
26#include <linux/buffer_head.h> /* for try_to_release_page(), 27#include <linux/buffer_head.h> /* for try_to_release_page(),
27 buffer_heads_over_limit */ 28 buffer_heads_over_limit */
28#include <linux/mm_inline.h> 29#include <linux/mm_inline.h>
@@ -1118,7 +1119,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1118 } 1119 }
1119 1120
1120 nr_reclaimed += nr_freed; 1121 nr_reclaimed += nr_freed;
1121 local_irq_disable(); 1122 local_irq_disable_nort();
1122 if (current_is_kswapd()) { 1123 if (current_is_kswapd()) {
1123 __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan); 1124 __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan);
1124 __count_vm_events(KSWAPD_STEAL, nr_freed); 1125 __count_vm_events(KSWAPD_STEAL, nr_freed);
@@ -1159,9 +1160,14 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1159 } 1160 }
1160 } 1161 }
1161 } while (nr_scanned < max_scan); 1162 } while (nr_scanned < max_scan);
1163 /*
1164 * Non-PREEMPT_RT relies on IRQs-off protecting the page_states
1165 * per-CPU data. PREEMPT_RT has that data protected even in
1166 * __mod_page_state(), so no need to keep IRQs disabled.
1167 */
1162 spin_unlock(&zone->lru_lock); 1168 spin_unlock(&zone->lru_lock);
1163done: 1169done:
1164 local_irq_enable(); 1170 local_irq_enable_nort();
1165 pagevec_release(&pvec); 1171 pagevec_release(&pvec);
1166 return nr_reclaimed; 1172 return nr_reclaimed;
1167} 1173}
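
local_irq_disable_nort()/local_irq_enable_nort() are the usual -rt annotations: on mainline they are plain local_irq_disable()/local_irq_enable(), while on PREEMPT_RT they compile away because, as the new comment notes, the per-CPU statistics touched here are already protected inside __mod_page_state() and friends. Roughly (exact spelling assumed, the real definitions live in the -rt headers):

/* Hedged sketch of the -rt "_nort" helpers. */
#ifdef CONFIG_PREEMPT_RT
# define local_irq_disable_nort()	do { } while (0)
# define local_irq_enable_nort()	do { } while (0)
#else
# define local_irq_disable_nort()	local_irq_disable()
# define local_irq_enable_nort()	local_irq_enable()
#endif
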
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 138bed53706e..9f7c001f1820 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -149,17 +149,16 @@ static void refresh_zone_stat_thresholds(void)
149void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, 149void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
150 int delta) 150 int delta)
151{ 151{
152 struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); 152 struct per_cpu_pageset *pcp = zone_pcp(zone, get_cpu());
153 s8 *p = pcp->vm_stat_diff + item; 153 s8 *p = pcp->vm_stat_diff + item;
154 long x; 154 long x = delta + *p;
155
156 x = delta + *p;
157 155
158 if (unlikely(x > pcp->stat_threshold || x < -pcp->stat_threshold)) { 156 if (unlikely(x > pcp->stat_threshold || x < -pcp->stat_threshold)) {
159 zone_page_state_add(x, zone, item); 157 zone_page_state_add(x, zone, item);
160 x = 0; 158 x = 0;
161 } 159 }
162 *p = x; 160 *p = x;
161 put_cpu();
163} 162}
164EXPORT_SYMBOL(__mod_zone_page_state); 163EXPORT_SYMBOL(__mod_zone_page_state);
165 164
@@ -202,7 +201,7 @@ EXPORT_SYMBOL(mod_zone_page_state);
202 */ 201 */
203void __inc_zone_state(struct zone *zone, enum zone_stat_item item) 202void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
204{ 203{
205 struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); 204 struct per_cpu_pageset *pcp = zone_pcp(zone, get_cpu());
206 s8 *p = pcp->vm_stat_diff + item; 205 s8 *p = pcp->vm_stat_diff + item;
207 206
208 (*p)++; 207 (*p)++;
@@ -213,17 +212,28 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
213 zone_page_state_add(*p + overstep, zone, item); 212 zone_page_state_add(*p + overstep, zone, item);
214 *p = -overstep; 213 *p = -overstep;
215 } 214 }
215 put_cpu();
216} 216}
217 217
218void __inc_zone_page_state(struct page *page, enum zone_stat_item item) 218void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
219{ 219{
220#ifdef CONFIG_PREEMPT_RT
221 unsigned long flags;
222 struct zone *zone;
223
224 zone = page_zone(page);
225 local_irq_save(flags);
226 __inc_zone_state(zone, item);
227 local_irq_restore(flags);
228#else
220 __inc_zone_state(page_zone(page), item); 229 __inc_zone_state(page_zone(page), item);
230#endif
221} 231}
222EXPORT_SYMBOL(__inc_zone_page_state); 232EXPORT_SYMBOL(__inc_zone_page_state);
223 233
224void __dec_zone_state(struct zone *zone, enum zone_stat_item item) 234void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
225{ 235{
226 struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); 236 struct per_cpu_pageset *pcp = zone_pcp(zone, get_cpu());
227 s8 *p = pcp->vm_stat_diff + item; 237 s8 *p = pcp->vm_stat_diff + item;
228 238
229 (*p)--; 239 (*p)--;
@@ -234,6 +244,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
234 zone_page_state_add(*p - overstep, zone, item); 244 zone_page_state_add(*p - overstep, zone, item);
235 *p = overstep; 245 *p = overstep;
236 } 246 }
247 put_cpu();
237} 248}
238 249
239void __dec_zone_page_state(struct page *page, enum zone_stat_item item) 250void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
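
The vmstat hunks all apply one pattern: bracket the per-CPU counter update with get_cpu()/put_cpu() instead of assuming the caller already disabled preemption, and give __inc_zone_page_state() an explicit irq-save wrapper on PREEMPT_RT since it can now be reached with interrupts enabled. The same shape in isolation (a generic illustration with a made-up counter, not kernel code):

/* Generic illustration of the get_cpu()/put_cpu() pattern used above. */
static DEFINE_PER_CPU(long, demo_counter);	/* hypothetical */

static void demo_counter_add(long delta)
{
	int cpu = get_cpu();			/* pins us to this CPU */

	per_cpu(demo_counter, cpu) += delta;	/* no migration possible here */

	put_cpu();				/* re-enables preemption */
}
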