Diffstat (limited to 'mm')
-rw-r--r--   mm/filemap.c       3
-rw-r--r--   mm/madvise.c       2
-rw-r--r--   mm/memcontrol.c  140
-rw-r--r--   mm/memory.c        1
-rw-r--r--   mm/nommu.c        12
-rw-r--r--   mm/oom_kill.c      3
-rw-r--r--   mm/page_alloc.c   54
-rw-r--r--   mm/rmap.c          7
-rw-r--r--   mm/slab.c         17
-rw-r--r--   mm/slob.c          6
-rw-r--r--   mm/slub.c        105
-rw-r--r--   mm/swapfile.c     29
-rw-r--r--   mm/truncate.c      3
-rw-r--r--   mm/vmalloc.c      18
-rw-r--r--   mm/vmscan.c      133
15 files changed, 392 insertions, 141 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index a8251a8d3457..f820e600f1ad 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -78,9 +78,6 @@
  * ->i_mutex (generic_file_buffered_write)
  * ->mmap_sem (fault_in_pages_readable->do_page_fault)
  *
- * ->i_mutex
- *   ->i_alloc_sem (various)
- *
  * inode_wb_list_lock
  *   sb_lock (fs/fs-writeback.c)
  *   ->mapping->tree_lock (__sync_single_inode)
diff --git a/mm/madvise.c b/mm/madvise.c
index 2221491ed503..74bf193eff04 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -218,7 +218,7 @@ static long madvise_remove(struct vm_area_struct *vma,
 	endoff = (loff_t)(end - vma->vm_start - 1)
 			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
 
-	/* vmtruncate_range needs to take i_mutex and i_alloc_sem */
+	/* vmtruncate_range needs to take i_mutex */
 	up_read(&current->mm->mmap_sem);
 	error = vmtruncate_range(mapping->host, offset, endoff);
 	down_read(&current->mm->mmap_sem);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ddffc74cdebe..e013b8e57d25 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -108,10 +108,12 @@ enum mem_cgroup_events_index {
 enum mem_cgroup_events_target {
 	MEM_CGROUP_TARGET_THRESH,
 	MEM_CGROUP_TARGET_SOFTLIMIT,
+	MEM_CGROUP_TARGET_NUMAINFO,
 	MEM_CGROUP_NTARGETS,
 };
 #define THRESHOLDS_EVENTS_TARGET (128)
 #define SOFTLIMIT_EVENTS_TARGET (1024)
+#define NUMAINFO_EVENTS_TARGET (1024)
 
 struct mem_cgroup_stat_cpu {
 	long count[MEM_CGROUP_STAT_NSTATS];
@@ -237,7 +239,8 @@ struct mem_cgroup {
 	int last_scanned_node;
 #if MAX_NUMNODES > 1
 	nodemask_t scan_nodes;
-	unsigned long next_scan_node_update;
+	atomic_t numainfo_events;
+	atomic_t numainfo_updating;
 #endif
 	/*
 	 * Should the accounting and control be hierarchical, per subtree?
@@ -577,15 +580,6 @@ static long mem_cgroup_read_stat(struct mem_cgroup *mem,
 	return val;
 }
 
-static long mem_cgroup_local_usage(struct mem_cgroup *mem)
-{
-	long ret;
-
-	ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
-	ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
-	return ret;
-}
-
 static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
 					 bool charge)
 {
@@ -689,6 +683,9 @@ static void __mem_cgroup_target_update(struct mem_cgroup *mem, int target)
 	case MEM_CGROUP_TARGET_SOFTLIMIT:
 		next = val + SOFTLIMIT_EVENTS_TARGET;
 		break;
+	case MEM_CGROUP_TARGET_NUMAINFO:
+		next = val + NUMAINFO_EVENTS_TARGET;
+		break;
 	default:
 		return;
 	}
@@ -707,11 +704,19 @@ static void memcg_check_events(struct mem_cgroup *mem, struct page *page)
 		mem_cgroup_threshold(mem);
 		__mem_cgroup_target_update(mem, MEM_CGROUP_TARGET_THRESH);
 		if (unlikely(__memcg_event_check(mem,
-			MEM_CGROUP_TARGET_SOFTLIMIT))){
+			MEM_CGROUP_TARGET_SOFTLIMIT))) {
 			mem_cgroup_update_tree(mem, page);
 			__mem_cgroup_target_update(mem,
 				MEM_CGROUP_TARGET_SOFTLIMIT);
+		}
+#if MAX_NUMNODES > 1
+		if (unlikely(__memcg_event_check(mem,
+			MEM_CGROUP_TARGET_NUMAINFO))) {
+			atomic_inc(&mem->numainfo_events);
+			__mem_cgroup_target_update(mem,
+				MEM_CGROUP_TARGET_NUMAINFO);
 		}
+#endif
 	}
 }
 
@@ -1129,7 +1134,6 @@ unsigned long mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg,
 	return MEM_CGROUP_ZSTAT(mz, lru);
 }
 
-#ifdef CONFIG_NUMA
 static unsigned long mem_cgroup_node_nr_file_lru_pages(struct mem_cgroup *memcg,
 							int nid)
 {
@@ -1141,6 +1145,17 @@ static unsigned long mem_cgroup_node_nr_file_lru_pages(struct mem_cgroup *memcg,
 	return ret;
 }
 
+static unsigned long mem_cgroup_node_nr_anon_lru_pages(struct mem_cgroup *memcg,
+							int nid)
+{
+	unsigned long ret;
+
+	ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_ANON) +
+		mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_ANON);
+	return ret;
+}
+
+#if MAX_NUMNODES > 1
 static unsigned long mem_cgroup_nr_file_lru_pages(struct mem_cgroup *memcg)
 {
 	u64 total = 0;
@@ -1152,17 +1167,6 @@ static unsigned long mem_cgroup_nr_file_lru_pages(struct mem_cgroup *memcg)
 	return total;
 }
 
-static unsigned long mem_cgroup_node_nr_anon_lru_pages(struct mem_cgroup *memcg,
-							int nid)
-{
-	unsigned long ret;
-
-	ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_ANON) +
-		mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_ANON);
-
-	return ret;
-}
-
 static unsigned long mem_cgroup_nr_anon_lru_pages(struct mem_cgroup *memcg)
 {
 	u64 total = 0;
@@ -1559,6 +1563,28 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
 	return ret;
 }
 
+/**
+ * test_mem_cgroup_node_reclaimable
+ * @mem: the target memcg
+ * @nid: the node ID to be checked.
+ * @noswap : specify true here if the user wants flle only information.
+ *
+ * This function returns whether the specified memcg contains any
+ * reclaimable pages on a node. Returns true if there are any reclaimable
+ * pages in the node.
+ */
+static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem,
+		int nid, bool noswap)
+{
+	if (mem_cgroup_node_nr_file_lru_pages(mem, nid))
+		return true;
+	if (noswap || !total_swap_pages)
+		return false;
+	if (mem_cgroup_node_nr_anon_lru_pages(mem, nid))
+		return true;
+	return false;
+
+}
 #if MAX_NUMNODES > 1
 
 /*
@@ -1570,26 +1596,26 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
 static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)
 {
 	int nid;
-
-	if (time_after(mem->next_scan_node_update, jiffies))
+	/*
+	 * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
+	 * pagein/pageout changes since the last update.
+	 */
+	if (!atomic_read(&mem->numainfo_events))
+		return;
+	if (atomic_inc_return(&mem->numainfo_updating) > 1)
 		return;
 
-	mem->next_scan_node_update = jiffies + 10*HZ;
 	/* make a nodemask where this memcg uses memory from */
 	mem->scan_nodes = node_states[N_HIGH_MEMORY];
 
 	for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
 
-		if (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_FILE) ||
-		    mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_FILE))
-			continue;
-
-		if (total_swap_pages &&
-		    (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_ANON) ||
-		     mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_ANON)))
-			continue;
-		node_clear(nid, mem->scan_nodes);
+		if (!test_mem_cgroup_node_reclaimable(mem, nid, false))
+			node_clear(nid, mem->scan_nodes);
 	}
+
+	atomic_set(&mem->numainfo_events, 0);
+	atomic_set(&mem->numainfo_updating, 0);
 }
 
 /*
@@ -1627,11 +1653,51 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
 	return node;
 }
 
+/*
+ * Check all nodes whether it contains reclaimable pages or not.
+ * For quick scan, we make use of scan_nodes. This will allow us to skip
+ * unused nodes. But scan_nodes is lazily updated and may not cotain
+ * enough new information. We need to do double check.
+ */
+bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
+{
+	int nid;
+
+	/*
+	 * quick check...making use of scan_node.
+	 * We can skip unused nodes.
+	 */
+	if (!nodes_empty(mem->scan_nodes)) {
+		for (nid = first_node(mem->scan_nodes);
+		     nid < MAX_NUMNODES;
+		     nid = next_node(nid, mem->scan_nodes)) {
+
+			if (test_mem_cgroup_node_reclaimable(mem, nid, noswap))
+				return true;
+		}
+	}
+	/*
+	 * Check rest of nodes.
+	 */
+	for_each_node_state(nid, N_HIGH_MEMORY) {
+		if (node_isset(nid, mem->scan_nodes))
+			continue;
+		if (test_mem_cgroup_node_reclaimable(mem, nid, noswap))
+			return true;
+	}
+	return false;
+}
+
 #else
 int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
 {
 	return 0;
 }
+
+bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
+{
+	return test_mem_cgroup_node_reclaimable(mem, 0, noswap);
+}
 #endif
 
 /*
@@ -1702,7 +1768,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
 				}
 			}
 		}
-		if (!mem_cgroup_local_usage(victim)) {
+		if (!mem_cgroup_reclaimable(victim, noswap)) {
 			/* this cgroup's local usage == 0 */
 			css_put(&victim->css);
 			continue;
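
The memcontrol.c hunks above replace the jiffies-based throttle (next_scan_node_update) with an event counter (numainfo_events) plus a single-updater flag (numainfo_updating), so the per-memcg scan nodemask is only rebuilt after enough page events and by at most one CPU at a time. Below is a minimal userspace sketch of that pattern using C11 atomics; apart from the two counter names, everything (the recompute function and the main() harness) is invented for illustration and is not kernel code.

/* Sketch only: event counter + single-updater guard, as in the patch above. */
#include <stdatomic.h>
#include <stdio.h>

static atomic_int numainfo_events;	/* bumped when an event target fires */
static atomic_int numainfo_updating;	/* non-zero while someone recomputes */

static void expensive_recompute(void)
{
	/* stands in for rebuilding mem->scan_nodes */
	puts("recomputing node mask");
}

static void may_update(void)
{
	/* nothing happened since the last rebuild: skip */
	if (!atomic_load(&numainfo_events))
		return;
	/* someone else is already rebuilding: skip */
	if (atomic_fetch_add(&numainfo_updating, 1) > 0)
		return;

	expensive_recompute();

	atomic_store(&numainfo_events, 0);
	atomic_store(&numainfo_updating, 0);
}

int main(void)
{
	may_update();				/* no events yet: skipped */
	atomic_fetch_add(&numainfo_events, 1);	/* memcg_check_events() analog */
	may_update();				/* recomputes once */
	may_update();				/* counter cleared: skipped */
	return 0;
}

atomic_fetch_add() returning the previous value plays the role of the kernel's atomic_inc_return(...) > 1 test: only the first caller to raise the flag does the rebuild, and the two stores at the end re-arm the mechanism.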
diff --git a/mm/memory.c b/mm/memory.c
index 40b7531ee8ba..9b8a01d941cb 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -305,6 +305,7 @@ int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
 	if (batch->nr == batch->max) {
 		if (!tlb_next_batch(tlb))
 			return 0;
+		batch = tlb->active;
 	}
 	VM_BUG_ON(batch->nr > batch->max);
 
diff --git a/mm/nommu.c b/mm/nommu.c
index 1fd0c51b10a6..5c5c2d4b1807 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -22,7 +22,6 @@
 #include <linux/pagemap.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
-#include <linux/tracehook.h>
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
 #include <linux/mount.h>
@@ -1087,7 +1086,7 @@ static unsigned long determine_vm_flags(struct file *file,
 	 * it's being traced - otherwise breakpoints set in it may interfere
 	 * with another untraced process
 	 */
-	if ((flags & MAP_PRIVATE) && tracehook_expect_breakpoints(current))
+	if ((flags & MAP_PRIVATE) && current->ptrace)
 		vm_flags &= ~VM_MAYSHARE;
 
 	return vm_flags;
@@ -1813,10 +1812,13 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
 	return NULL;
 }
 
-int remap_pfn_range(struct vm_area_struct *vma, unsigned long from,
-		unsigned long to, unsigned long size, pgprot_t prot)
+int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
+		unsigned long pfn, unsigned long size, pgprot_t prot)
 {
-	vma->vm_start = vma->vm_pgoff << PAGE_SHIFT;
+	if (addr != (pfn << PAGE_SHIFT))
+		return -EINVAL;
+
+	vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
 	return 0;
 }
 EXPORT_SYMBOL(remap_pfn_range);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index e4b0991ca351..b0be989d4365 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -339,8 +339,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
 			 * then wait for it to finish before killing
 			 * some other task unnecessarily.
 			 */
-			if (!(task_ptrace(p->group_leader) &
-						PT_TRACE_EXIT))
+			if (!(p->group_leader->ptrace & PT_TRACE_EXIT))
 				return ERR_PTR(-1UL);
 		}
 	}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4e8985acdab8..9119faae6e6a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4585,6 +4585,60 @@ void __init sort_node_map(void)
 			cmp_node_active_region, NULL);
 }
 
+/**
+ * node_map_pfn_alignment - determine the maximum internode alignment
+ *
+ * This function should be called after node map is populated and sorted.
+ * It calculates the maximum power of two alignment which can distinguish
+ * all the nodes.
+ *
+ * For example, if all nodes are 1GiB and aligned to 1GiB, the return value
+ * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)).  If the
+ * nodes are shifted by 256MiB, 256MiB.  Note that if only the last node is
+ * shifted, 1GiB is enough and this function will indicate so.
+ *
+ * This is used to test whether pfn -> nid mapping of the chosen memory
+ * model has fine enough granularity to avoid incorrect mapping for the
+ * populated node map.
+ *
+ * Returns the determined alignment in pfn's.  0 if there is no alignment
+ * requirement (single node).
+ */
+unsigned long __init node_map_pfn_alignment(void)
+{
+	unsigned long accl_mask = 0, last_end = 0;
+	int last_nid = -1;
+	int i;
+
+	for_each_active_range_index_in_nid(i, MAX_NUMNODES) {
+		int nid = early_node_map[i].nid;
+		unsigned long start = early_node_map[i].start_pfn;
+		unsigned long end = early_node_map[i].end_pfn;
+		unsigned long mask;
+
+		if (!start || last_nid < 0 || last_nid == nid) {
+			last_nid = nid;
+			last_end = end;
+			continue;
+		}
+
+		/*
+		 * Start with a mask granular enough to pin-point to the
+		 * start pfn and tick off bits one-by-one until it becomes
+		 * too coarse to separate the current node from the last.
+		 */
+		mask = ~((1 << __ffs(start)) - 1);
+		while (mask && last_end <= (start & (mask << 1)))
+			mask <<= 1;
+
+		/* accumulate all internode masks */
+		accl_mask |= mask;
+	}
+
+	/* convert mask to number of pages */
+	return ~accl_mask + 1;
+}
+
 /* Find the lowest pfn for a node */
 static unsigned long __init find_min_pfn_for_node(int nid)
 {
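
node_map_pfn_alignment(), added above, derives the coarsest power-of-two pfn alignment that still distinguishes every pair of adjacent nodes. The standalone sketch below reproduces the same mask-accumulation loop over a made-up two-node region table; the struct region type, the node_map[] contents and main() are assumptions for the example, and __builtin_ctzl() stands in for the kernel's __ffs().

/* Sketch only: internode alignment over a hard-coded region table. */
#include <stdio.h>

struct region { int nid; unsigned long start_pfn, end_pfn; };

/* two fake 1GiB nodes with 4KiB pages, the second starting at 1GiB */
static const struct region node_map[] = {
	{ 0, 0x00000, 0x40000 },
	{ 1, 0x40000, 0x80000 },
};

static unsigned long node_map_pfn_alignment(const struct region *map, int n)
{
	unsigned long accl_mask = 0, last_end = 0;
	int last_nid = -1;

	for (int i = 0; i < n; i++) {
		unsigned long start = map[i].start_pfn;
		unsigned long mask;

		if (!start || last_nid < 0 || last_nid == map[i].nid) {
			last_nid = map[i].nid;
			last_end = map[i].end_pfn;
			continue;
		}

		/* widen the mask until it no longer separates the nodes */
		mask = ~((1UL << __builtin_ctzl(start)) - 1);
		while (mask && last_end <= (start & (mask << 1)))
			mask <<= 1;

		/* accumulate all internode masks */
		accl_mask |= mask;
	}

	/* convert mask to number of pages; 0 means no requirement */
	return ~accl_mask + 1;
}

int main(void)
{
	printf("alignment = %#lx pfns\n",
	       node_map_pfn_alignment(node_map, 2));
	return 0;
}

With the two 1GiB nodes above it prints alignment = 0x40000 pfns, which with 4KiB pages is the 1GiB alignment the kernel-doc example describes.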
diff --git a/mm/rmap.c b/mm/rmap.c
index 23295f65ae43..9701574bb67a 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -21,7 +21,6 @@
  * Lock ordering in mm:
  *
  * inode->i_mutex (while writing or truncating, not reading or faulting)
- *   inode->i_alloc_sem (vmtruncate_range)
  *   mm->mmap_sem
  *     page->flags PG_locked (lock_page)
  *       mapping->i_mmap_mutex
@@ -870,11 +869,11 @@ int page_referenced(struct page *page,
 						vm_flags);
 		if (we_locked)
 			unlock_page(page);
+
+		if (page_test_and_clear_young(page_to_pfn(page)))
+			referenced++;
 	}
 out:
-	if (page_test_and_clear_young(page_to_pfn(page)))
-		referenced++;
-
 	return referenced;
 }
 
diff --git a/mm/slab.c b/mm/slab.c
index d96e223de775..1e523ed47c61 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -574,7 +574,9 @@ static struct arraycache_init initarray_generic =
     { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
 
 /* internal cache of cache description objs */
+static struct kmem_list3 *cache_cache_nodelists[MAX_NUMNODES];
 static struct kmem_cache cache_cache = {
+	.nodelists = cache_cache_nodelists,
 	.batchcount = 1,
 	.limit = BOOT_CPUCACHE_ENTRIES,
 	.shared = 1,
@@ -1492,11 +1494,10 @@ void __init kmem_cache_init(void)
 		cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node];
 
 	/*
-	 * struct kmem_cache size depends on nr_node_ids, which
-	 * can be less than MAX_NUMNODES.
+	 * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids
 	 */
-	cache_cache.buffer_size = offsetof(struct kmem_cache, nodelists) +
+	cache_cache.buffer_size = offsetof(struct kmem_cache, array[nr_cpu_ids]) +
 				  nr_node_ids * sizeof(struct kmem_list3 *);
 #if DEBUG
 	cache_cache.obj_size = cache_cache.buffer_size;
 #endif
@@ -2308,6 +2309,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 	if (!cachep)
 		goto oops;
 
+	cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids];
 #if DEBUG
 	cachep->obj_size = size;
 
@@ -3153,12 +3155,11 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
 	objp += obj_offset(cachep);
 	if (cachep->ctor && cachep->flags & SLAB_POISON)
 		cachep->ctor(objp);
-#if ARCH_SLAB_MINALIGN
-	if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) {
+	if (ARCH_SLAB_MINALIGN &&
+	    ((unsigned long)objp & (ARCH_SLAB_MINALIGN-1))) {
 		printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n",
-		       objp, ARCH_SLAB_MINALIGN);
+		       objp, (int)ARCH_SLAB_MINALIGN);
 	}
-#endif
 	return objp;
 }
 #else
diff --git a/mm/slob.c b/mm/slob.c
index 46e0aee33a23..0ae881831ae2 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -482,6 +482,8 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node)
 	int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
 	void *ret;
 
+	gfp &= gfp_allowed_mask;
+
 	lockdep_trace_alloc(gfp);
 
 	if (size < PAGE_SIZE - align) {
@@ -608,6 +610,10 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
 {
 	void *b;
 
+	flags &= gfp_allowed_mask;
+
+	lockdep_trace_alloc(flags);
+
 	if (c->size < PAGE_SIZE) {
 		b = slob_alloc(c->size, flags, c->align, node);
 		trace_kmem_cache_alloc_node(_RET_IP_, b, c->size,
diff --git a/mm/slub.c b/mm/slub.c
index 35f351f26193..ba83f3fd0757 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -27,6 +27,7 @@
 #include <linux/memory.h>
 #include <linux/math64.h>
 #include <linux/fault-inject.h>
+#include <linux/stacktrace.h>
 
 #include <trace/events/kmem.h>
 
@@ -191,8 +192,12 @@ static LIST_HEAD(slab_caches);
 /*
  * Tracking user of a slab.
  */
+#define TRACK_ADDRS_COUNT 16
 struct track {
 	unsigned long addr;	/* Called from address */
+#ifdef CONFIG_STACKTRACE
+	unsigned long addrs[TRACK_ADDRS_COUNT];	/* Called from address */
+#endif
 	int cpu;		/* Was running on cpu */
 	int pid;		/* Pid context */
 	unsigned long when;	/* When did the operation occur */
@@ -420,6 +425,24 @@ static void set_track(struct kmem_cache *s, void *object,
 	struct track *p = get_track(s, object, alloc);
 
 	if (addr) {
+#ifdef CONFIG_STACKTRACE
+		struct stack_trace trace;
+		int i;
+
+		trace.nr_entries = 0;
+		trace.max_entries = TRACK_ADDRS_COUNT;
+		trace.entries = p->addrs;
+		trace.skip = 3;
+		save_stack_trace(&trace);
+
+		/* See rant in lockdep.c */
+		if (trace.nr_entries != 0 &&
+		    trace.entries[trace.nr_entries - 1] == ULONG_MAX)
+			trace.nr_entries--;
+
+		for (i = trace.nr_entries; i < TRACK_ADDRS_COUNT; i++)
+			p->addrs[i] = 0;
+#endif
 		p->addr = addr;
 		p->cpu = smp_processor_id();
 		p->pid = current->pid;
@@ -444,6 +467,16 @@ static void print_track(const char *s, struct track *t)
 
 	printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n",
 		s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid);
+#ifdef CONFIG_STACKTRACE
+	{
+		int i;
+		for (i = 0; i < TRACK_ADDRS_COUNT; i++)
+			if (t->addrs[i])
+				printk(KERN_ERR "\t%pS\n", (void *)t->addrs[i]);
+			else
+				break;
+	}
+#endif
 }
 
 static void print_tracking(struct kmem_cache *s, void *object)
@@ -557,10 +590,10 @@ static void init_object(struct kmem_cache *s, void *object, u8 val)
 		memset(p + s->objsize, val, s->inuse - s->objsize);
 }
 
-static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes)
+static u8 *check_bytes8(u8 *start, u8 value, unsigned int bytes)
 {
 	while (bytes) {
-		if (*start != (u8)value)
+		if (*start != value)
 			return start;
 		start++;
 		bytes--;
@@ -568,6 +601,38 @@ static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes)
 	return NULL;
 }
 
+static u8 *check_bytes(u8 *start, u8 value, unsigned int bytes)
+{
+	u64 value64;
+	unsigned int words, prefix;
+
+	if (bytes <= 16)
+		return check_bytes8(start, value, bytes);
+
+	value64 = value | value << 8 | value << 16 | value << 24;
+	value64 = value64 | value64 << 32;
+	prefix = 8 - ((unsigned long)start) % 8;
+
+	if (prefix) {
+		u8 *r = check_bytes8(start, value, prefix);
+		if (r)
+			return r;
+		start += prefix;
+		bytes -= prefix;
+	}
+
+	words = bytes / 8;
+
+	while (words) {
+		if (*(u64 *)start != value64)
+			return check_bytes8(start, value, 8);
+		start += 8;
+		words--;
+	}
+
+	return check_bytes8(start, value, bytes % 8);
+}
+
 static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
 				void *from, void *to)
 {
@@ -2928,6 +2993,42 @@ size_t ksize(const void *object)
 }
 EXPORT_SYMBOL(ksize);
 
+#ifdef CONFIG_SLUB_DEBUG
+bool verify_mem_not_deleted(const void *x)
+{
+	struct page *page;
+	void *object = (void *)x;
+	unsigned long flags;
+	bool rv;
+
+	if (unlikely(ZERO_OR_NULL_PTR(x)))
+		return false;
+
+	local_irq_save(flags);
+
+	page = virt_to_head_page(x);
+	if (unlikely(!PageSlab(page))) {
+		/* maybe it was from stack? */
+		rv = true;
+		goto out_unlock;
+	}
+
+	slab_lock(page);
+	if (on_freelist(page->slab, page, object)) {
+		object_err(page->slab, page, object, "Object is on free-list");
+		rv = false;
+	} else {
+		rv = true;
+	}
+	slab_unlock(page);
+
+out_unlock:
+	local_irq_restore(flags);
+	return rv;
+}
+EXPORT_SYMBOL(verify_mem_not_deleted);
+#endif
+
 void kfree(const void *x)
 {
 	struct page *page;
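
The slub.c hunks above split the old byte-at-a-time check_bytes() into check_bytes8() plus a word-at-a-time wrapper that compares eight poison bytes per iteration once the pointer is 8-byte aligned. Here is a self-contained userspace rewrite of that idea which can be compiled and run directly; it is not the kernel function itself (a uint64_t multiplication replaces the shift-based byte replication, and the buffer in main() is invented for the example).

/* Sketch only: word-at-a-time scan for the first non-matching byte. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint8_t *check_bytes8(uint8_t *start, uint8_t value, unsigned int bytes)
{
	while (bytes) {
		if (*start != value)
			return start;	/* first mismatching byte */
		start++;
		bytes--;
	}
	return NULL;
}

static uint8_t *check_bytes(uint8_t *start, uint8_t value, unsigned int bytes)
{
	uint64_t value64;
	unsigned int words, prefix;

	if (bytes <= 16)
		return check_bytes8(start, value, bytes);

	/* replicate the byte into all eight lanes of a 64-bit word */
	value64 = value * 0x0101010101010101ULL;

	/* byte-check the unaligned prefix, then compare whole words */
	prefix = (unsigned int)(8 - (uintptr_t)start % 8) % 8;
	if (prefix) {
		uint8_t *r = check_bytes8(start, value, prefix);
		if (r)
			return r;
		start += prefix;
		bytes -= prefix;
	}

	words = bytes / 8;
	while (words) {
		if (*(uint64_t *)start != value64)
			return check_bytes8(start, value, 8);
		start += 8;
		words--;
	}

	return check_bytes8(start, value, bytes % 8);
}

int main(void)
{
	uint8_t buf[64];

	memset(buf, 0x5a, sizeof(buf));
	buf[41] = 0x00;		/* simulate a corrupted poison byte */
	printf("mismatch at offset %td\n",
	       check_bytes(buf, 0x5a, sizeof(buf)) - buf);
	return 0;
}

Running it reports mismatch at offset 41, the byte deliberately corrupted in main().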
diff --git a/mm/swapfile.c b/mm/swapfile.c
index ff8dc1a18cb4..1b8c33907242 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1681,19 +1681,14 @@ out:
 }
 
 #ifdef CONFIG_PROC_FS
-struct proc_swaps {
-	struct seq_file seq;
-	int event;
-};
-
 static unsigned swaps_poll(struct file *file, poll_table *wait)
 {
-	struct proc_swaps *s = file->private_data;
+	struct seq_file *seq = file->private_data;
 
 	poll_wait(file, &proc_poll_wait, wait);
 
-	if (s->event != atomic_read(&proc_poll_event)) {
-		s->event = atomic_read(&proc_poll_event);
+	if (seq->poll_event != atomic_read(&proc_poll_event)) {
+		seq->poll_event = atomic_read(&proc_poll_event);
 		return POLLIN | POLLRDNORM | POLLERR | POLLPRI;
 	}
 
@@ -1783,24 +1778,16 @@ static const struct seq_operations swaps_op = {
 
 static int swaps_open(struct inode *inode, struct file *file)
 {
-	struct proc_swaps *s;
+	struct seq_file *seq;
 	int ret;
 
-	s = kmalloc(sizeof(struct proc_swaps), GFP_KERNEL);
-	if (!s)
-		return -ENOMEM;
-
-	file->private_data = s;
-
 	ret = seq_open(file, &swaps_op);
-	if (ret) {
-		kfree(s);
+	if (ret)
 		return ret;
-	}
 
-	s->seq.private = s;
-	s->event = atomic_read(&proc_poll_event);
-	return ret;
+	seq = file->private_data;
+	seq->poll_event = atomic_read(&proc_poll_event);
+	return 0;
 }
 
 static const struct file_operations proc_swaps_operations = {
diff --git a/mm/truncate.c b/mm/truncate.c
index e13f22efaad7..003c6c685fc8 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -622,12 +622,11 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
 		return -ENOSYS;
 
 	mutex_lock(&inode->i_mutex);
-	down_write(&inode->i_alloc_sem);
+	inode_dio_wait(inode);
 	unmap_mapping_range(mapping, offset, (end - offset), 1);
 	inode->i_op->truncate_range(inode, offset, end);
 	/* unmap again to remove racily COWed private pages */
 	unmap_mapping_range(mapping, offset, (end - offset), 1);
-	up_write(&inode->i_alloc_sem);
 	mutex_unlock(&inode->i_mutex);
 
 	return 0;
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 1d34d75366a7..ab8494cde007 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -452,13 +452,6 @@ overflow:
 	return ERR_PTR(-EBUSY);
 }
 
-static void rcu_free_va(struct rcu_head *head)
-{
-	struct vmap_area *va = container_of(head, struct vmap_area, rcu_head);
-
-	kfree(va);
-}
-
 static void __free_vmap_area(struct vmap_area *va)
 {
 	BUG_ON(RB_EMPTY_NODE(&va->rb_node));
@@ -491,7 +484,7 @@ static void __free_vmap_area(struct vmap_area *va)
 	if (va->va_end > VMALLOC_START && va->va_end <= VMALLOC_END)
 		vmap_area_pcpu_hole = max(vmap_area_pcpu_hole, va->va_end);
 
-	call_rcu(&va->rcu_head, rcu_free_va);
+	kfree_rcu(va, rcu_head);
 }
 
 /*
@@ -837,13 +830,6 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
 	return vb;
 }
 
-static void rcu_free_vb(struct rcu_head *head)
-{
-	struct vmap_block *vb = container_of(head, struct vmap_block, rcu_head);
-
-	kfree(vb);
-}
-
 static void free_vmap_block(struct vmap_block *vb)
 {
 	struct vmap_block *tmp;
@@ -856,7 +842,7 @@ static void free_vmap_block(struct vmap_block *vb)
 	BUG_ON(tmp != vb);
 
 	free_vmap_area_noflush(vb->va);
-	call_rcu(&vb->rcu_head, rcu_free_vb);
+	kfree_rcu(vb, rcu_head);
 }
 
 static void purge_fragmented_blocks(int cpu)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 4f49535d4cd3..febbc044e792 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -250,49 +250,90 @@ unsigned long shrink_slab(struct shrink_control *shrink,
 		unsigned long long delta;
 		unsigned long total_scan;
 		unsigned long max_pass;
+		int shrink_ret = 0;
+		long nr;
+		long new_nr;
+		long batch_size = shrinker->batch ? shrinker->batch
+						  : SHRINK_BATCH;
 
+		/*
+		 * copy the current shrinker scan count into a local variable
+		 * and zero it so that other concurrent shrinker invocations
+		 * don't also do this scanning work.
+		 */
+		do {
+			nr = shrinker->nr;
+		} while (cmpxchg(&shrinker->nr, nr, 0) != nr);
+
+		total_scan = nr;
 		max_pass = do_shrinker_shrink(shrinker, shrink, 0);
 		delta = (4 * nr_pages_scanned) / shrinker->seeks;
 		delta *= max_pass;
 		do_div(delta, lru_pages + 1);
-		shrinker->nr += delta;
-		if (shrinker->nr < 0) {
+		total_scan += delta;
+		if (total_scan < 0) {
 			printk(KERN_ERR "shrink_slab: %pF negative objects to "
 			       "delete nr=%ld\n",
-			       shrinker->shrink, shrinker->nr);
-			shrinker->nr = max_pass;
+			       shrinker->shrink, total_scan);
+			total_scan = max_pass;
 		}
 
 		/*
+		 * We need to avoid excessive windup on filesystem shrinkers
+		 * due to large numbers of GFP_NOFS allocations causing the
+		 * shrinkers to return -1 all the time. This results in a large
+		 * nr being built up so when a shrink that can do some work
+		 * comes along it empties the entire cache due to nr >>>
+		 * max_pass.  This is bad for sustaining a working set in
+		 * memory.
+		 *
+		 * Hence only allow the shrinker to scan the entire cache when
+		 * a large delta change is calculated directly.
+		 */
+		if (delta < max_pass / 4)
+			total_scan = min(total_scan, max_pass / 2);
+
+		/*
 		 * Avoid risking looping forever due to too large nr value:
 		 * never try to free more than twice the estimate number of
 		 * freeable entries.
 		 */
-		if (shrinker->nr > max_pass * 2)
-			shrinker->nr = max_pass * 2;
+		if (total_scan > max_pass * 2)
+			total_scan = max_pass * 2;
 
-		total_scan = shrinker->nr;
-		shrinker->nr = 0;
+		trace_mm_shrink_slab_start(shrinker, shrink, nr,
+					nr_pages_scanned, lru_pages,
+					max_pass, delta, total_scan);
 
-		while (total_scan >= SHRINK_BATCH) {
-			long this_scan = SHRINK_BATCH;
-			int shrink_ret;
+		while (total_scan >= batch_size) {
 			int nr_before;
 
 			nr_before = do_shrinker_shrink(shrinker, shrink, 0);
 			shrink_ret = do_shrinker_shrink(shrinker, shrink,
-							this_scan);
+							batch_size);
 			if (shrink_ret == -1)
 				break;
 			if (shrink_ret < nr_before)
 				ret += nr_before - shrink_ret;
-			count_vm_events(SLABS_SCANNED, this_scan);
-			total_scan -= this_scan;
+			count_vm_events(SLABS_SCANNED, batch_size);
+			total_scan -= batch_size;
 
 			cond_resched();
 		}
 
-		shrinker->nr += total_scan;
+		/*
+		 * move the unused scan count back into the shrinker in a
+		 * manner that handles concurrent updates. If we exhausted the
+		 * scan, there is no need to do an update.
+		 */
+		do {
+			nr = shrinker->nr;
+			new_nr = total_scan + nr;
+			if (total_scan <= 0)
+				break;
+		} while (cmpxchg(&shrinker->nr, nr, new_nr) != nr);
+
+		trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr);
 	}
 	up_read(&shrinker_rwsem);
 out:
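
The shrink_slab() rework above stops mutating shrinker->nr in place: the deferred count is claimed with a cmpxchg() loop, worked off in batch_size chunks, and whatever remains is pushed back with a second cmpxchg() loop, so concurrent reclaimers neither lose nor double-count work. The sketch below models just that bookkeeping in plain C11; the 200-object cache, the shrink() callback and the delta/batch numbers in main() are invented, and the windup clamping from the patch is left out.

/* Sketch only: claim-and-return of a shared deferred scan count. */
#include <stdatomic.h>
#include <stdio.h>

static atomic_long deferred_nr;		/* analogous to shrinker->nr */

static long shrink(long nr_to_scan, long *cache_objects)
{
	long freed = nr_to_scan < *cache_objects ? nr_to_scan : *cache_objects;

	*cache_objects -= freed;
	return freed;
}

static long shrink_slab_once(long batch_size, long delta, long *cache_objects)
{
	long nr, new_nr, total_scan, freed = 0;

	/* take ownership of the deferred count: swap it to zero */
	do {
		nr = atomic_load(&deferred_nr);
	} while (!atomic_compare_exchange_weak(&deferred_nr, &nr, 0));

	total_scan = nr + delta;

	while (total_scan >= batch_size) {
		freed += shrink(batch_size, cache_objects);
		total_scan -= batch_size;
	}

	/* return the unscanned remainder for the next caller */
	do {
		nr = atomic_load(&deferred_nr);
		new_nr = nr + total_scan;
		if (total_scan <= 0)
			break;
	} while (!atomic_compare_exchange_weak(&deferred_nr, &nr, new_nr));

	return freed;
}

int main(void)
{
	long cache_objects = 200;
	long freed = shrink_slab_once(128, 300, &cache_objects);

	printf("freed %ld, deferred %ld, cache left %ld\n",
	       freed, atomic_load(&deferred_nr), cache_objects);
	return 0;
}

For a delta of 300 scannable objects and a batch of 128 it frees the whole 200-object cache and leaves the 44 unscanned objects deferred for the next call.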
@@ -2310,7 +2351,8 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
 	for (i = 0; i <= classzone_idx; i++)
 		present_pages += pgdat->node_zones[i].present_pages;
 
-	return balanced_pages > (present_pages >> 2);
+	/* A special case here: if zone has no page, we think it's balanced */
+	return balanced_pages >= (present_pages >> 2);
 }
 
 /* is kswapd sleeping prematurely? */
@@ -2326,7 +2368,7 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
 		return true;
 
 	/* Check the watermark levels */
-	for (i = 0; i < pgdat->nr_zones; i++) {
+	for (i = 0; i <= classzone_idx; i++) {
 		struct zone *zone = pgdat->node_zones + i;
 
 		if (!populated_zone(zone))
@@ -2344,7 +2386,7 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
 		}
 
 		if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone),
-							classzone_idx, 0))
+							i, 0))
 			all_zones_ok = false;
 		else
 			balanced += zone->present_pages;
@@ -2451,7 +2493,6 @@ loop_again:
 			if (!zone_watermark_ok_safe(zone, order,
 					high_wmark_pages(zone), 0, 0)) {
 				end_zone = i;
-				*classzone_idx = i;
 				break;
 			}
 		}
@@ -2510,18 +2551,18 @@ loop_again:
 					KSWAPD_ZONE_BALANCE_GAP_RATIO);
 			if (!zone_watermark_ok_safe(zone, order,
 					high_wmark_pages(zone) + balance_gap,
-					end_zone, 0))
+					end_zone, 0)) {
 				shrink_zone(priority, zone, &sc);
-			reclaim_state->reclaimed_slab = 0;
-			nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages);
-			sc.nr_reclaimed += reclaim_state->reclaimed_slab;
-			total_scanned += sc.nr_scanned;
 
-			if (zone->all_unreclaimable)
-				continue;
-			if (nr_slab == 0 &&
-			    !zone_reclaimable(zone))
-				zone->all_unreclaimable = 1;
+				reclaim_state->reclaimed_slab = 0;
+				nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages);
+				sc.nr_reclaimed += reclaim_state->reclaimed_slab;
+				total_scanned += sc.nr_scanned;
+
+				if (nr_slab == 0 && !zone_reclaimable(zone))
+					zone->all_unreclaimable = 1;
+			}
+
 			/*
 			 * If we've done a decent amount of scanning and
 			 * the reclaim ratio is low, start doing writepage
@@ -2531,6 +2572,12 @@ loop_again:
 			    total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
 				sc.may_writepage = 1;
 
+			if (zone->all_unreclaimable) {
+				if (end_zone && end_zone == i)
+					end_zone--;
+				continue;
+			}
+
 			if (!zone_watermark_ok_safe(zone, order,
 					high_wmark_pages(zone), end_zone, 0)) {
 				all_zones_ok = 0;
@@ -2709,8 +2756,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
  */
 static int kswapd(void *p)
 {
-	unsigned long order;
-	int classzone_idx;
+	unsigned long order, new_order;
+	int classzone_idx, new_classzone_idx;
 	pg_data_t *pgdat = (pg_data_t*)p;
 	struct task_struct *tsk = current;
 
@@ -2740,17 +2787,23 @@ static int kswapd(void *p)
 	tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
 	set_freezable();
 
-	order = 0;
-	classzone_idx = MAX_NR_ZONES - 1;
+	order = new_order = 0;
+	classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
 	for ( ; ; ) {
-		unsigned long new_order;
-		int new_classzone_idx;
 		int ret;
 
-		new_order = pgdat->kswapd_max_order;
-		new_classzone_idx = pgdat->classzone_idx;
-		pgdat->kswapd_max_order = 0;
-		pgdat->classzone_idx = MAX_NR_ZONES - 1;
+		/*
+		 * If the last balance_pgdat was unsuccessful it's unlikely a
+		 * new request of a similar or harder type will succeed soon
+		 * so consider going to sleep on the basis we reclaimed at
+		 */
+		if (classzone_idx >= new_classzone_idx && order == new_order) {
+			new_order = pgdat->kswapd_max_order;
+			new_classzone_idx = pgdat->classzone_idx;
+			pgdat->kswapd_max_order = 0;
+			pgdat->classzone_idx = pgdat->nr_zones - 1;
+		}
+
 		if (order < new_order || classzone_idx > new_classzone_idx) {
 			/*
 			 * Don't sleep if someone wants a larger 'order'
@@ -2763,7 +2816,7 @@ static int kswapd(void *p)
 			order = pgdat->kswapd_max_order;
 			classzone_idx = pgdat->classzone_idx;
 			pgdat->kswapd_max_order = 0;
-			pgdat->classzone_idx = MAX_NR_ZONES - 1;
+			pgdat->classzone_idx = pgdat->nr_zones - 1;
 		}
 
 		ret = try_to_freeze();