Diffstat (limited to 'mm')
-rw-r--r--  mm/filemap.c    |   3
-rw-r--r--  mm/madvise.c    |   2
-rw-r--r--  mm/memcontrol.c | 140
-rw-r--r--  mm/memory.c     |   1
-rw-r--r--  mm/nommu.c      |  12
-rw-r--r--  mm/oom_kill.c   |   3
-rw-r--r--  mm/page_alloc.c |  54
-rw-r--r--  mm/rmap.c       |   7
-rw-r--r--  mm/slab.c       |  17
-rw-r--r--  mm/slob.c       |   6
-rw-r--r--  mm/slub.c       | 105
-rw-r--r--  mm/swapfile.c   |  29
-rw-r--r--  mm/truncate.c   |   3
-rw-r--r--  mm/vmalloc.c    |  18
-rw-r--r--  mm/vmscan.c     | 133

15 files changed, 392 insertions, 141 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index a8251a8d3457..f820e600f1ad 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -78,9 +78,6 @@
  *  ->i_mutex  (generic_file_buffered_write)
  *    ->mmap_sem  (fault_in_pages_readable->do_page_fault)
  *
- *  ->i_mutex
- *    ->i_alloc_sem  (various)
- *
  *  inode_wb_list_lock
  *    sb_lock  (fs/fs-writeback.c)
  *    ->mapping->tree_lock  (__sync_single_inode)
diff --git a/mm/madvise.c b/mm/madvise.c
index 2221491ed503..74bf193eff04 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -218,7 +218,7 @@ static long madvise_remove(struct vm_area_struct *vma,
     endoff = (loff_t)(end - vma->vm_start - 1)
             + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
 
-    /* vmtruncate_range needs to take i_mutex and i_alloc_sem */
+    /* vmtruncate_range needs to take i_mutex */
     up_read(&current->mm->mmap_sem);
     error = vmtruncate_range(mapping->host, offset, endoff);
     down_read(&current->mm->mmap_sem);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ddffc74cdebe..e013b8e57d25 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -108,10 +108,12 @@ enum mem_cgroup_events_index {
 enum mem_cgroup_events_target {
     MEM_CGROUP_TARGET_THRESH,
     MEM_CGROUP_TARGET_SOFTLIMIT,
+    MEM_CGROUP_TARGET_NUMAINFO,
     MEM_CGROUP_NTARGETS,
 };
 #define THRESHOLDS_EVENTS_TARGET (128)
 #define SOFTLIMIT_EVENTS_TARGET (1024)
+#define NUMAINFO_EVENTS_TARGET (1024)
 
 struct mem_cgroup_stat_cpu {
     long count[MEM_CGROUP_STAT_NSTATS];
@@ -237,7 +239,8 @@ struct mem_cgroup {
     int last_scanned_node;
 #if MAX_NUMNODES > 1
     nodemask_t scan_nodes;
-    unsigned long next_scan_node_update;
+    atomic_t numainfo_events;
+    atomic_t numainfo_updating;
 #endif
     /*
      * Should the accounting and control be hierarchical, per subtree?
@@ -577,15 +580,6 @@ static long mem_cgroup_read_stat(struct mem_cgroup *mem,
     return val;
 }
 
-static long mem_cgroup_local_usage(struct mem_cgroup *mem)
-{
-    long ret;
-
-    ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
-    ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
-    return ret;
-}
-
 static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
                                        bool charge)
 {
@@ -689,6 +683,9 @@ static void __mem_cgroup_target_update(struct mem_cgroup *mem, int target)
     case MEM_CGROUP_TARGET_SOFTLIMIT:
         next = val + SOFTLIMIT_EVENTS_TARGET;
         break;
+    case MEM_CGROUP_TARGET_NUMAINFO:
+        next = val + NUMAINFO_EVENTS_TARGET;
+        break;
     default:
         return;
     }
@@ -707,11 +704,19 @@ static void memcg_check_events(struct mem_cgroup *mem, struct page *page)
         mem_cgroup_threshold(mem);
         __mem_cgroup_target_update(mem, MEM_CGROUP_TARGET_THRESH);
         if (unlikely(__memcg_event_check(mem,
-            MEM_CGROUP_TARGET_SOFTLIMIT))){
+            MEM_CGROUP_TARGET_SOFTLIMIT))) {
             mem_cgroup_update_tree(mem, page);
             __mem_cgroup_target_update(mem,
                 MEM_CGROUP_TARGET_SOFTLIMIT);
+        }
+#if MAX_NUMNODES > 1
+        if (unlikely(__memcg_event_check(mem,
+            MEM_CGROUP_TARGET_NUMAINFO))) {
+            atomic_inc(&mem->numainfo_events);
+            __mem_cgroup_target_update(mem,
+                MEM_CGROUP_TARGET_NUMAINFO);
         }
+#endif
     }
 }
 
@@ -1129,7 +1134,6 @@ unsigned long mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg,
     return MEM_CGROUP_ZSTAT(mz, lru);
 }
 
-#ifdef CONFIG_NUMA
 static unsigned long mem_cgroup_node_nr_file_lru_pages(struct mem_cgroup *memcg,
                                                        int nid)
 {
@@ -1141,6 +1145,17 @@ static unsigned long mem_cgroup_node_nr_file_lru_pages(struct mem_cgroup *memcg,
     return ret;
 }
 
+static unsigned long mem_cgroup_node_nr_anon_lru_pages(struct mem_cgroup *memcg,
+                                                       int nid)
+{
+    unsigned long ret;
+
+    ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_ANON) +
+          mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_ANON);
+    return ret;
+}
+
+#if MAX_NUMNODES > 1
 static unsigned long mem_cgroup_nr_file_lru_pages(struct mem_cgroup *memcg)
 {
     u64 total = 0;
@@ -1152,17 +1167,6 @@ static unsigned long mem_cgroup_nr_file_lru_pages(struct mem_cgroup *memcg)
     return total;
 }
 
-static unsigned long mem_cgroup_node_nr_anon_lru_pages(struct mem_cgroup *memcg,
-                                                       int nid)
-{
-    unsigned long ret;
-
-    ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_ANON) +
-          mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_ANON);
-
-    return ret;
-}
-
 static unsigned long mem_cgroup_nr_anon_lru_pages(struct mem_cgroup *memcg)
 {
     u64 total = 0;
@@ -1559,6 +1563,28 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
     return ret;
 }
 
+/**
+ * test_mem_cgroup_node_reclaimable
+ * @mem: the target memcg
+ * @nid: the node ID to be checked.
+ * @noswap : specify true here if the user wants file only information.
+ *
+ * This function returns whether the specified memcg contains any
+ * reclaimable pages on a node. Returns true if there are any reclaimable
+ * pages in the node.
+ */
+static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem,
+        int nid, bool noswap)
+{
+    if (mem_cgroup_node_nr_file_lru_pages(mem, nid))
+        return true;
+    if (noswap || !total_swap_pages)
+        return false;
+    if (mem_cgroup_node_nr_anon_lru_pages(mem, nid))
+        return true;
+    return false;
+
+}
 #if MAX_NUMNODES > 1
 
 /*
@@ -1570,26 +1596,26 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
 static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)
 {
     int nid;
-
-    if (time_after(mem->next_scan_node_update, jiffies))
+    /*
+     * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
+     * pagein/pageout changes since the last update.
+     */
+    if (!atomic_read(&mem->numainfo_events))
+        return;
+    if (atomic_inc_return(&mem->numainfo_updating) > 1)
         return;
 
-    mem->next_scan_node_update = jiffies + 10*HZ;
     /* make a nodemask where this memcg uses memory from */
     mem->scan_nodes = node_states[N_HIGH_MEMORY];
 
     for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
 
-        if (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_FILE) ||
-            mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_FILE))
-            continue;
-
-        if (total_swap_pages &&
-            (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_ANON) ||
-             mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_ANON)))
-            continue;
-        node_clear(nid, mem->scan_nodes);
+        if (!test_mem_cgroup_node_reclaimable(mem, nid, false))
+            node_clear(nid, mem->scan_nodes);
     }
+
+    atomic_set(&mem->numainfo_events, 0);
+    atomic_set(&mem->numainfo_updating, 0);
 }
 
 /*
@@ -1627,11 +1653,51 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
     return node;
 }
 
+/*
+ * Check all nodes whether it contains reclaimable pages or not.
+ * For quick scan, we make use of scan_nodes. This will allow us to skip
+ * unused nodes. But scan_nodes is lazily updated and may not contain
+ * enough new information. We need to do double check.
+ */
+bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
+{
+    int nid;
+
+    /*
+     * quick check...making use of scan_node.
+     * We can skip unused nodes.
+     */
+    if (!nodes_empty(mem->scan_nodes)) {
+        for (nid = first_node(mem->scan_nodes);
+             nid < MAX_NUMNODES;
+             nid = next_node(nid, mem->scan_nodes)) {
+
+            if (test_mem_cgroup_node_reclaimable(mem, nid, noswap))
+                return true;
+        }
+    }
+    /*
+     * Check rest of nodes.
+     */
+    for_each_node_state(nid, N_HIGH_MEMORY) {
+        if (node_isset(nid, mem->scan_nodes))
+            continue;
+        if (test_mem_cgroup_node_reclaimable(mem, nid, noswap))
+            return true;
+    }
+    return false;
+}
+
 #else
 int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
 {
     return 0;
 }
+
+bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
+{
+    return test_mem_cgroup_node_reclaimable(mem, 0, noswap);
+}
 #endif
 
 /*
@@ -1702,7 +1768,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
             }
         }
     }
-    if (!mem_cgroup_local_usage(victim)) {
+    if (!mem_cgroup_reclaimable(victim, noswap)) {
         /* this cgroup's local usage == 0 */
         css_put(&victim->css);
         continue;
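The numainfo_events/numainfo_updating pair above replaces the old jiffies-based throttle: memcg_check_events() bumps the event counter once enough page-in/page-out activity has accumulated, and scan_nodes is then rebuilt by at most one caller at a time. A minimal userspace sketch of the same pattern with C11 atomics; every name here is illustrative, not the kernel's:

#include <stdatomic.h>
#include <stdio.h>

static atomic_int numainfo_events;   /* bumped after "enough" page activity */
static atomic_int numainfo_updating; /* keeps concurrent rebuilds out */

/* stand-in for rebuilding mem->scan_nodes from per-node LRU counts */
static void rebuild_nodemask(void)
{
    printf("rebuilding scan_nodes\n");
}

static void may_update_nodemask(void)
{
    /* nothing happened since the last rebuild: skip the expensive scan */
    if (!atomic_load(&numainfo_events))
        return;
    /* only the first caller wins; everyone else backs off immediately */
    if (atomic_fetch_add(&numainfo_updating, 1) > 0)
        return;

    rebuild_nodemask();

    atomic_store(&numainfo_events, 0);
    atomic_store(&numainfo_updating, 0);
}

int main(void)
{
    may_update_nodemask();                 /* no events yet: skipped */
    atomic_fetch_add(&numainfo_events, 1); /* memcg_check_events() analogue */
    may_update_nodemask();                 /* rebuild happens exactly once */
    return 0;
}

atomic_fetch_add() returning a previous value of 0 plays the role of atomic_inc_return() == 1 in the patch.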
diff --git a/mm/memory.c b/mm/memory.c
index 40b7531ee8ba..9b8a01d941cb 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -305,6 +305,7 @@ int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
     if (batch->nr == batch->max) {
         if (!tlb_next_batch(tlb))
             return 0;
+        batch = tlb->active;
     }
     VM_BUG_ON(batch->nr > batch->max);
 
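The single added line matters because `batch` was read from tlb->active before tlb_next_batch() may switch the gather to a freshly allocated batch; without re-reading it, pages keep being appended past the end of the already-full batch. A small self-contained sketch of that stale-pointer pattern, using hypothetical types rather than the kernel's mmu_gather:

#include <stdbool.h>
#include <stdio.h>

#define BATCH_MAX 2

struct batch { int nr, max; const void *pages[BATCH_MAX]; };

struct gather {
    struct batch *active;        /* currently filled batch */
    struct batch first, second;
    bool used_second;
};

static bool next_batch(struct gather *g)
{
    if (g->used_second)
        return false;
    g->second = (struct batch){ .max = BATCH_MAX };
    g->active = &g->second;      /* gather now points at the new batch */
    g->used_second = true;
    return true;
}

static int remove_page(struct gather *g, const void *page)
{
    struct batch *batch = g->active;

    if (batch->nr == batch->max) {
        if (!next_batch(g))
            return 0;
        batch = g->active;       /* the fix: re-read after switching;
                                    otherwise we would overflow the old batch */
    }
    batch->pages[batch->nr++] = page;
    return 1;
}

int main(void)
{
    struct gather g = { .first = { .max = BATCH_MAX } };
    int dummy[3];

    g.active = &g.first;
    for (int i = 0; i < 3; i++)
        printf("queued page %d: %d\n", i, remove_page(&g, &dummy[i]));
    return 0;
}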
diff --git a/mm/nommu.c b/mm/nommu.c
index 1fd0c51b10a6..5c5c2d4b1807 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -22,7 +22,6 @@
 #include <linux/pagemap.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
-#include <linux/tracehook.h>
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
 #include <linux/mount.h>
@@ -1087,7 +1086,7 @@ static unsigned long determine_vm_flags(struct file *file,
      * it's being traced - otherwise breakpoints set in it may interfere
      * with another untraced process
      */
-    if ((flags & MAP_PRIVATE) && tracehook_expect_breakpoints(current))
+    if ((flags & MAP_PRIVATE) && current->ptrace)
         vm_flags &= ~VM_MAYSHARE;
 
     return vm_flags;
@@ -1813,10 +1812,13 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
     return NULL;
 }
 
-int remap_pfn_range(struct vm_area_struct *vma, unsigned long from,
-        unsigned long to, unsigned long size, pgprot_t prot)
+int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
+        unsigned long pfn, unsigned long size, pgprot_t prot)
 {
-    vma->vm_start = vma->vm_pgoff << PAGE_SHIFT;
+    if (addr != (pfn << PAGE_SHIFT))
+        return -EINVAL;
+
+    vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
     return 0;
 }
 EXPORT_SYMBOL(remap_pfn_range);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index e4b0991ca351..b0be989d4365 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -339,8 +339,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
              * then wait for it to finish before killing
              * some other task unnecessarily.
              */
-            if (!(task_ptrace(p->group_leader) &
-                        PT_TRACE_EXIT))
+            if (!(p->group_leader->ptrace & PT_TRACE_EXIT))
                 return ERR_PTR(-1UL);
         }
     }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4e8985acdab8..9119faae6e6a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4585,6 +4585,60 @@ void __init sort_node_map(void)
             cmp_node_active_region, NULL);
 }
 
+/**
+ * node_map_pfn_alignment - determine the maximum internode alignment
+ *
+ * This function should be called after node map is populated and sorted.
+ * It calculates the maximum power of two alignment which can distinguish
+ * all the nodes.
+ *
+ * For example, if all nodes are 1GiB and aligned to 1GiB, the return value
+ * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)).  If the
+ * nodes are shifted by 256MiB, 256MiB.  Note that if only the last node is
+ * shifted, 1GiB is enough and this function will indicate so.
+ *
+ * This is used to test whether pfn -> nid mapping of the chosen memory
+ * model has fine enough granularity to avoid incorrect mapping for the
+ * populated node map.
+ *
+ * Returns the determined alignment in pfn's.  0 if there is no alignment
+ * requirement (single node).
+ */
+unsigned long __init node_map_pfn_alignment(void)
+{
+    unsigned long accl_mask = 0, last_end = 0;
+    int last_nid = -1;
+    int i;
+
+    for_each_active_range_index_in_nid(i, MAX_NUMNODES) {
+        int nid = early_node_map[i].nid;
+        unsigned long start = early_node_map[i].start_pfn;
+        unsigned long end = early_node_map[i].end_pfn;
+        unsigned long mask;
+
+        if (!start || last_nid < 0 || last_nid == nid) {
+            last_nid = nid;
+            last_end = end;
+            continue;
+        }
+
+        /*
+         * Start with a mask granular enough to pin-point to the
+         * start pfn and tick off bits one-by-one until it becomes
+         * too coarse to separate the current node from the last.
+         */
+        mask = ~((1 << __ffs(start)) - 1);
+        while (mask && last_end <= (start & (mask << 1)))
+            mask <<= 1;
+
+        /* accumulate all internode masks */
+        accl_mask |= mask;
+    }
+
+    /* convert mask to number of pages */
+    return ~accl_mask + 1;
+}
+
 /* Find the lowest pfn for a node */
 static unsigned long __init find_min_pfn_for_node(int nid)
 {
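The loop in node_map_pfn_alignment() only needs the (nid, start_pfn, end_pfn) ranges, so it can be exercised outside the kernel. Below is a hedged userspace transcription with a made-up two-node map, using __builtin_ctzl in place of __ffs and 1UL to keep the shift in unsigned long; the sample data is purely illustrative:

#include <stdio.h>

struct range { int nid; unsigned long start_pfn, end_pfn; };

static unsigned long node_map_pfn_alignment(const struct range *map, int n)
{
    unsigned long accl_mask = 0, last_end = 0;
    int last_nid = -1;

    for (int i = 0; i < n; i++) {
        int nid = map[i].nid;
        unsigned long start = map[i].start_pfn;
        unsigned long end = map[i].end_pfn;
        unsigned long mask;

        if (!start || last_nid < 0 || last_nid == nid) {
            last_nid = nid;
            last_end = end;
            continue;
        }

        /* finest mask that still pin-points 'start', then coarsen it
         * until it can no longer separate this node from the last one */
        mask = ~((1UL << __builtin_ctzl(start)) - 1);
        while (mask && last_end <= (start & (mask << 1)))
            mask <<= 1;

        accl_mask |= mask;   /* accumulate all internode masks */
    }

    return ~accl_mask + 1;   /* convert mask to a number of pages */
}

int main(void)
{
    /* two 256MiB nodes (4KiB pages); node 1 starts at pfn 0x10000 */
    struct range map[] = {
        { 0, 0x00000, 0x10000 },
        { 1, 0x10000, 0x20000 },
    };

    printf("alignment = %#lx pfns\n", node_map_pfn_alignment(map, 2));
    return 0;
}

For this map it prints 0x10000 pfns, i.e. a pfn->nid mapping with 256MiB granularity is fine enough.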
diff --git a/mm/rmap.c b/mm/rmap.c
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -21,7 +21,6 @@
  * Lock ordering in mm:
  *
  * inode->i_mutex  (while writing or truncating, not reading or faulting)
- *   inode->i_alloc_sem  (vmtruncate_range)
  *   mm->mmap_sem
  *     page->flags PG_locked (lock_page)
  *       mapping->i_mmap_mutex
@@ -870,11 +869,11 @@ int page_referenced(struct page *page,
                                 vm_flags);
         if (we_locked)
             unlock_page(page);
+
+        if (page_test_and_clear_young(page_to_pfn(page)))
+            referenced++;
     }
 out:
-    if (page_test_and_clear_young(page_to_pfn(page)))
-        referenced++;
-
     return referenced;
 }
 
diff --git a/mm/slab.c b/mm/slab.c
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -574,7 +574,9 @@ static struct arraycache_init initarray_generic =
     { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
 
 /* internal cache of cache description objs */
+static struct kmem_list3 *cache_cache_nodelists[MAX_NUMNODES];
 static struct kmem_cache cache_cache = {
+    .nodelists = cache_cache_nodelists,
     .batchcount = 1,
     .limit = BOOT_CPUCACHE_ENTRIES,
     .shared = 1,
@@ -1492,11 +1494,10 @@ void __init kmem_cache_init(void)
         cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node];
 
     /*
-     * struct kmem_cache size depends on nr_node_ids, which
-     * can be less than MAX_NUMNODES.
+     * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids
      */
-    cache_cache.buffer_size = offsetof(struct kmem_cache, nodelists) +
+    cache_cache.buffer_size = offsetof(struct kmem_cache, array[nr_cpu_ids]) +
                               nr_node_ids * sizeof(struct kmem_list3 *);
 #if DEBUG
     cache_cache.obj_size = cache_cache.buffer_size;
 #endif
@@ -2308,6 +2309,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
     if (!cachep)
         goto oops;
 
+    cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids];
 #if DEBUG
     cachep->obj_size = size;
 
@@ -3153,12 +3155,11 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
     objp += obj_offset(cachep);
     if (cachep->ctor && cachep->flags & SLAB_POISON)
         cachep->ctor(objp);
-#if ARCH_SLAB_MINALIGN
-    if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) {
+    if (ARCH_SLAB_MINALIGN &&
+        ((unsigned long)objp & (ARCH_SLAB_MINALIGN-1))) {
         printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n",
-               objp, ARCH_SLAB_MINALIGN);
+               objp, (int)ARCH_SLAB_MINALIGN);
     }
-#endif
     return objp;
 }
 #else
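The slab.c sizing change places the per-node nodelists pointers in the same allocation as the per-cpu array[] tail of struct kmem_cache, using offsetof(..., array[nr_cpu_ids]) to find where the second trailing array begins. A small userspace sketch of that two-trailing-arrays-in-one-allocation layout, written with the portable offsetof(..., array) + n*sizeof(...) form and illustrative names:

#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>

/* loosely mirrors struct kmem_cache: a flexible per-cpu pointer array at the
 * end, with the per-node pointer array placed right behind it */
struct cache {
    const char *name;
    void **nodelists;  /* points into the tail of this same allocation */
    void *array[];     /* nr_cpus entries */
};

static struct cache *cache_create(const char *name, int nr_cpus, int nr_nodes)
{
    size_t size = offsetof(struct cache, array) +
                  nr_cpus * sizeof(void *) +
                  nr_nodes * sizeof(void *);
    struct cache *c = calloc(1, size);

    if (!c)
        return NULL;
    c->name = name;
    /* per-node pointers live directly after the per-cpu pointers */
    c->nodelists = (void **)&c->array[nr_cpus];
    return c;
}

int main(void)
{
    struct cache *c = cache_create("demo", 4, 2);

    if (!c)
        return 1;
    printf("array at %p, nodelists at %p\n",
           (void *)c->array, (void *)c->nodelists);
    free(c);
    return 0;
}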
diff --git a/mm/slob.c b/mm/slob.c
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -482,6 +482,8 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node)
     int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
     void *ret;
 
+    gfp &= gfp_allowed_mask;
+
     lockdep_trace_alloc(gfp);
 
     if (size < PAGE_SIZE - align) {
@@ -608,6 +610,10 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
 {
     void *b;
 
+    flags &= gfp_allowed_mask;
+
+    lockdep_trace_alloc(flags);
+
     if (c->size < PAGE_SIZE) {
         b = slob_alloc(c->size, flags, c->align, node);
         trace_kmem_cache_alloc_node(_RET_IP_, b, c->size,
diff --git a/mm/slub.c b/mm/slub.c
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -27,6 +27,7 @@
 #include <linux/memory.h>
 #include <linux/math64.h>
 #include <linux/fault-inject.h>
+#include <linux/stacktrace.h>
 
 #include <trace/events/kmem.h>
 
@@ -191,8 +192,12 @@ static LIST_HEAD(slab_caches);
 /*
  * Tracking user of a slab.
  */
+#define TRACK_ADDRS_COUNT 16
 struct track {
     unsigned long addr;  /* Called from address */
+#ifdef CONFIG_STACKTRACE
+    unsigned long addrs[TRACK_ADDRS_COUNT];  /* Called from address */
+#endif
     int cpu;             /* Was running on cpu */
     int pid;             /* Pid context */
     unsigned long when;  /* When did the operation occur */
@@ -420,6 +425,24 @@ static void set_track(struct kmem_cache *s, void *object,
     struct track *p = get_track(s, object, alloc);
 
     if (addr) {
+#ifdef CONFIG_STACKTRACE
+        struct stack_trace trace;
+        int i;
+
+        trace.nr_entries = 0;
+        trace.max_entries = TRACK_ADDRS_COUNT;
+        trace.entries = p->addrs;
+        trace.skip = 3;
+        save_stack_trace(&trace);
+
+        /* See rant in lockdep.c */
+        if (trace.nr_entries != 0 &&
+            trace.entries[trace.nr_entries - 1] == ULONG_MAX)
+            trace.nr_entries--;
+
+        for (i = trace.nr_entries; i < TRACK_ADDRS_COUNT; i++)
+            p->addrs[i] = 0;
+#endif
         p->addr = addr;
         p->cpu = smp_processor_id();
         p->pid = current->pid;
@@ -444,6 +467,16 @@ static void print_track(const char *s, struct track *t)
 
     printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n",
            s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid);
+#ifdef CONFIG_STACKTRACE
+    {
+        int i;
+        for (i = 0; i < TRACK_ADDRS_COUNT; i++)
+            if (t->addrs[i])
+                printk(KERN_ERR "\t%pS\n", (void *)t->addrs[i]);
+            else
+                break;
+    }
+#endif
 }
 
 static void print_tracking(struct kmem_cache *s, void *object)
@@ -557,10 +590,10 @@ static void init_object(struct kmem_cache *s, void *object, u8 val)
         memset(p + s->objsize, val, s->inuse - s->objsize);
 }
 
-static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes)
+static u8 *check_bytes8(u8 *start, u8 value, unsigned int bytes)
 {
     while (bytes) {
-        if (*start != (u8)value)
+        if (*start != value)
             return start;
         start++;
         bytes--;
@@ -568,6 +601,38 @@ static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes)
     return NULL;
 }
 
+static u8 *check_bytes(u8 *start, u8 value, unsigned int bytes)
+{
+    u64 value64;
+    unsigned int words, prefix;
+
+    if (bytes <= 16)
+        return check_bytes8(start, value, bytes);
+
+    value64 = value | value << 8 | value << 16 | value << 24;
+    value64 = value64 | value64 << 32;
+    prefix = 8 - ((unsigned long)start) % 8;
+
+    if (prefix) {
+        u8 *r = check_bytes8(start, value, prefix);
+        if (r)
+            return r;
+        start += prefix;
+        bytes -= prefix;
+    }
+
+    words = bytes / 8;
+
+    while (words) {
+        if (*(u64 *)start != value64)
+            return check_bytes8(start, value, 8);
+        start += 8;
+        words--;
+    }
+
+    return check_bytes8(start, value, bytes % 8);
+}
+
 static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
                           void *from, void *to)
 {
@@ -2928,6 +2993,42 @@ size_t ksize(const void *object)
 }
 EXPORT_SYMBOL(ksize);
 
+#ifdef CONFIG_SLUB_DEBUG
+bool verify_mem_not_deleted(const void *x)
+{
+    struct page *page;
+    void *object = (void *)x;
+    unsigned long flags;
+    bool rv;
+
+    if (unlikely(ZERO_OR_NULL_PTR(x)))
+        return false;
+
+    local_irq_save(flags);
+
+    page = virt_to_head_page(x);
+    if (unlikely(!PageSlab(page))) {
+        /* maybe it was from stack? */
+        rv = true;
+        goto out_unlock;
+    }
+
+    slab_lock(page);
+    if (on_freelist(page->slab, page, object)) {
+        object_err(page->slab, page, object, "Object is on free-list");
+        rv = false;
+    } else {
+        rv = true;
+    }
+    slab_unlock(page);
+
+out_unlock:
+    local_irq_restore(flags);
+    return rv;
+}
+EXPORT_SYMBOL(verify_mem_not_deleted);
+#endif
+
 void kfree(const void *x)
 {
     struct page *page;
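check_bytes() above speeds up SLUB's poison checking by comparing eight bytes at a time once the pointer is aligned, falling back to the byte-wise check_bytes8() for the unaligned head, the tail, and the word that mismatches. A lightly adapted, runnable userspace version is below; the byte replication is written as a multiplication instead of the shift sequence used in the patch, and the buffer in main() is made up for the demonstration:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* return a pointer to the first byte that differs from 'value', else NULL */
static uint8_t *check_bytes8(uint8_t *start, uint8_t value, unsigned int bytes)
{
    while (bytes) {
        if (*start != value)
            return start;
        start++;
        bytes--;
    }
    return NULL;
}

static uint8_t *check_bytes(uint8_t *start, uint8_t value, unsigned int bytes)
{
    uint64_t value64;
    unsigned int words, prefix;

    if (bytes <= 16)
        return check_bytes8(start, value, bytes);

    value64 = value * 0x0101010101010101ULL;   /* replicate byte into a word */
    prefix = 8 - ((uintptr_t)start) % 8;       /* bytes until 8-byte alignment */

    if (prefix) {
        uint8_t *r = check_bytes8(start, value, prefix);
        if (r)
            return r;
        start += prefix;
        bytes -= prefix;
    }

    words = bytes / 8;
    while (words) {
        if (*(uint64_t *)start != value64)     /* whole word mismatch: narrow it */
            return check_bytes8(start, value, 8);
        start += 8;
        words--;
    }

    return check_bytes8(start, value, bytes % 8);
}

int main(void)
{
    uint8_t buf[64];

    memset(buf, 0x5a, sizeof(buf));
    buf[41] = 0;    /* plant one "corrupted" byte */
    printf("mismatch at offset %td\n",
           check_bytes(buf, 0x5a, sizeof(buf)) - buf);
    return 0;
}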
diff --git a/mm/swapfile.c b/mm/swapfile.c
index ff8dc1a18cb4..1b8c33907242 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1681,19 +1681,14 @@ out:
 }
 
 #ifdef CONFIG_PROC_FS
-struct proc_swaps {
-    struct seq_file seq;
-    int event;
-};
-
 static unsigned swaps_poll(struct file *file, poll_table *wait)
 {
-    struct proc_swaps *s = file->private_data;
+    struct seq_file *seq = file->private_data;
 
     poll_wait(file, &proc_poll_wait, wait);
 
-    if (s->event != atomic_read(&proc_poll_event)) {
-        s->event = atomic_read(&proc_poll_event);
+    if (seq->poll_event != atomic_read(&proc_poll_event)) {
+        seq->poll_event = atomic_read(&proc_poll_event);
         return POLLIN | POLLRDNORM | POLLERR | POLLPRI;
     }
 
@@ -1783,24 +1778,16 @@ static const struct seq_operations swaps_op = {
 
 static int swaps_open(struct inode *inode, struct file *file)
 {
-    struct proc_swaps *s;
+    struct seq_file *seq;
     int ret;
 
-    s = kmalloc(sizeof(struct proc_swaps), GFP_KERNEL);
-    if (!s)
-        return -ENOMEM;
-
-    file->private_data = s;
-
     ret = seq_open(file, &swaps_op);
-    if (ret) {
-        kfree(s);
+    if (ret)
         return ret;
-    }
 
-    s->seq.private = s;
-    s->event = atomic_read(&proc_poll_event);
-    return ret;
+    seq = file->private_data;
+    seq->poll_event = atomic_read(&proc_poll_event);
+    return 0;
 }
 
 static const struct file_operations proc_swaps_operations = {
diff --git a/mm/truncate.c b/mm/truncate.c
index e13f22efaad7..003c6c685fc8 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -622,12 +622,11 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
         return -ENOSYS;
 
     mutex_lock(&inode->i_mutex);
-    down_write(&inode->i_alloc_sem);
+    inode_dio_wait(inode);
     unmap_mapping_range(mapping, offset, (end - offset), 1);
     inode->i_op->truncate_range(inode, offset, end);
     /* unmap again to remove racily COWed private pages */
     unmap_mapping_range(mapping, offset, (end - offset), 1);
-    up_write(&inode->i_alloc_sem);
     mutex_unlock(&inode->i_mutex);
 
     return 0;
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 1d34d75366a7..ab8494cde007 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -452,13 +452,6 @@ overflow:
     return ERR_PTR(-EBUSY);
 }
 
-static void rcu_free_va(struct rcu_head *head)
-{
-    struct vmap_area *va = container_of(head, struct vmap_area, rcu_head);
-
-    kfree(va);
-}
-
 static void __free_vmap_area(struct vmap_area *va)
 {
     BUG_ON(RB_EMPTY_NODE(&va->rb_node));
@@ -491,7 +484,7 @@ static void __free_vmap_area(struct vmap_area *va)
     if (va->va_end > VMALLOC_START && va->va_end <= VMALLOC_END)
         vmap_area_pcpu_hole = max(vmap_area_pcpu_hole, va->va_end);
 
-    call_rcu(&va->rcu_head, rcu_free_va);
+    kfree_rcu(va, rcu_head);
 }
 
 /*
@@ -837,13 +830,6 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
     return vb;
 }
 
-static void rcu_free_vb(struct rcu_head *head)
-{
-    struct vmap_block *vb = container_of(head, struct vmap_block, rcu_head);
-
-    kfree(vb);
-}
-
 static void free_vmap_block(struct vmap_block *vb)
 {
     struct vmap_block *tmp;
@@ -856,7 +842,7 @@ static void free_vmap_block(struct vmap_block *vb)
     BUG_ON(tmp != vb);
 
     free_vmap_area_noflush(vb->va);
-    call_rcu(&vb->rcu_head, rcu_free_vb);
+    kfree_rcu(vb, rcu_head);
 }
 
 static void purge_fragmented_blocks(int cpu)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 4f49535d4cd3..febbc044e792 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -250,49 +250,90 @@ unsigned long shrink_slab(struct shrink_control *shrink,
         unsigned long long delta;
         unsigned long total_scan;
         unsigned long max_pass;
+        int shrink_ret = 0;
+        long nr;
+        long new_nr;
+        long batch_size = shrinker->batch ? shrinker->batch
+                                          : SHRINK_BATCH;
 
+        /*
+         * copy the current shrinker scan count into a local variable
+         * and zero it so that other concurrent shrinker invocations
+         * don't also do this scanning work.
+         */
+        do {
+            nr = shrinker->nr;
+        } while (cmpxchg(&shrinker->nr, nr, 0) != nr);
+
+        total_scan = nr;
         max_pass = do_shrinker_shrink(shrinker, shrink, 0);
         delta = (4 * nr_pages_scanned) / shrinker->seeks;
         delta *= max_pass;
         do_div(delta, lru_pages + 1);
-        shrinker->nr += delta;
-        if (shrinker->nr < 0) {
+        total_scan += delta;
+        if (total_scan < 0) {
             printk(KERN_ERR "shrink_slab: %pF negative objects to "
                    "delete nr=%ld\n",
-                   shrinker->shrink, shrinker->nr);
-            shrinker->nr = max_pass;
+                   shrinker->shrink, total_scan);
+            total_scan = max_pass;
         }
 
         /*
+         * We need to avoid excessive windup on filesystem shrinkers
+         * due to large numbers of GFP_NOFS allocations causing the
+         * shrinkers to return -1 all the time. This results in a large
+         * nr being built up so when a shrink that can do some work
+         * comes along it empties the entire cache due to nr >>>
+         * max_pass.  This is bad for sustaining a working set in
+         * memory.
+         *
+         * Hence only allow the shrinker to scan the entire cache when
+         * a large delta change is calculated directly.
+         */
+        if (delta < max_pass / 4)
+            total_scan = min(total_scan, max_pass / 2);
+
+        /*
          * Avoid risking looping forever due to too large nr value:
          * never try to free more than twice the estimate number of
          * freeable entries.
          */
-        if (shrinker->nr > max_pass * 2)
-            shrinker->nr = max_pass * 2;
+        if (total_scan > max_pass * 2)
+            total_scan = max_pass * 2;
 
-        total_scan = shrinker->nr;
-        shrinker->nr = 0;
+        trace_mm_shrink_slab_start(shrinker, shrink, nr,
+                                   nr_pages_scanned, lru_pages,
+                                   max_pass, delta, total_scan);
 
-        while (total_scan >= SHRINK_BATCH) {
-            long this_scan = SHRINK_BATCH;
-            int shrink_ret;
+        while (total_scan >= batch_size) {
             int nr_before;
 
             nr_before = do_shrinker_shrink(shrinker, shrink, 0);
             shrink_ret = do_shrinker_shrink(shrinker, shrink,
-                                            this_scan);
+                                            batch_size);
             if (shrink_ret == -1)
                 break;
             if (shrink_ret < nr_before)
                 ret += nr_before - shrink_ret;
-            count_vm_events(SLABS_SCANNED, this_scan);
-            total_scan -= this_scan;
+            count_vm_events(SLABS_SCANNED, batch_size);
+            total_scan -= batch_size;
 
             cond_resched();
         }
 
-        shrinker->nr += total_scan;
+        /*
+         * move the unused scan count back into the shrinker in a
+         * manner that handles concurrent updates. If we exhausted the
+         * scan, there is no need to do an update.
+         */
+        do {
+            nr = shrinker->nr;
+            new_nr = total_scan + nr;
+            if (total_scan <= 0)
+                break;
+        } while (cmpxchg(&shrinker->nr, nr, new_nr) != nr);
+
+        trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr);
     }
     up_read(&shrinker_rwsem);
 out:
@@ -2310,7 +2351,8 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
     for (i = 0; i <= classzone_idx; i++)
         present_pages += pgdat->node_zones[i].present_pages;
 
-    return balanced_pages > (present_pages >> 2);
+    /* A special case here: if zone has no page, we think it's balanced */
+    return balanced_pages >= (present_pages >> 2);
 }
 
 /* is kswapd sleeping prematurely? */
@@ -2326,7 +2368,7 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
         return true;
 
     /* Check the watermark levels */
-    for (i = 0; i < pgdat->nr_zones; i++) {
+    for (i = 0; i <= classzone_idx; i++) {
         struct zone *zone = pgdat->node_zones + i;
 
         if (!populated_zone(zone))
@@ -2344,7 +2386,7 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
         }
 
         if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone),
-                                    classzone_idx, 0))
+                                    i, 0))
             all_zones_ok = false;
         else
             balanced += zone->present_pages;
@@ -2451,7 +2493,6 @@ loop_again:
             if (!zone_watermark_ok_safe(zone, order,
                     high_wmark_pages(zone), 0, 0)) {
                 end_zone = i;
-                *classzone_idx = i;
                 break;
             }
         }
@@ -2510,18 +2551,18 @@ loop_again:
                     KSWAPD_ZONE_BALANCE_GAP_RATIO);
             if (!zone_watermark_ok_safe(zone, order,
                     high_wmark_pages(zone) + balance_gap,
-                    end_zone, 0))
+                    end_zone, 0)) {
                 shrink_zone(priority, zone, &sc);
-            reclaim_state->reclaimed_slab = 0;
-            nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages);
-            sc.nr_reclaimed += reclaim_state->reclaimed_slab;
-            total_scanned += sc.nr_scanned;
 
-            if (zone->all_unreclaimable)
-                continue;
-            if (nr_slab == 0 &&
-                !zone_reclaimable(zone))
-                zone->all_unreclaimable = 1;
+                reclaim_state->reclaimed_slab = 0;
+                nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages);
+                sc.nr_reclaimed += reclaim_state->reclaimed_slab;
+                total_scanned += sc.nr_scanned;
+
+                if (nr_slab == 0 && !zone_reclaimable(zone))
+                    zone->all_unreclaimable = 1;
+            }
+
             /*
              * If we've done a decent amount of scanning and
              * the reclaim ratio is low, start doing writepage
@@ -2531,6 +2572,12 @@ loop_again:
                 total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
                 sc.may_writepage = 1;
 
+            if (zone->all_unreclaimable) {
+                if (end_zone && end_zone == i)
+                    end_zone--;
+                continue;
+            }
+
             if (!zone_watermark_ok_safe(zone, order,
                     high_wmark_pages(zone), end_zone, 0)) {
                 all_zones_ok = 0;
@@ -2709,8 +2756,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
  */
 static int kswapd(void *p)
 {
-    unsigned long order;
-    int classzone_idx;
+    unsigned long order, new_order;
+    int classzone_idx, new_classzone_idx;
     pg_data_t *pgdat = (pg_data_t*)p;
     struct task_struct *tsk = current;
 
@@ -2740,17 +2787,23 @@ static int kswapd(void *p)
     tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
     set_freezable();
 
-    order = 0;
-    classzone_idx = MAX_NR_ZONES - 1;
+    order = new_order = 0;
+    classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
     for ( ; ; ) {
-        unsigned long new_order;
-        int new_classzone_idx;
         int ret;
 
-        new_order = pgdat->kswapd_max_order;
-        new_classzone_idx = pgdat->classzone_idx;
-        pgdat->kswapd_max_order = 0;
-        pgdat->classzone_idx = MAX_NR_ZONES - 1;
+        /*
+         * If the last balance_pgdat was unsuccessful it's unlikely a
+         * new request of a similar or harder type will succeed soon
+         * so consider going to sleep on the basis we reclaimed at
+         */
+        if (classzone_idx >= new_classzone_idx && order == new_order) {
+            new_order = pgdat->kswapd_max_order;
+            new_classzone_idx = pgdat->classzone_idx;
+            pgdat->kswapd_max_order = 0;
+            pgdat->classzone_idx = pgdat->nr_zones - 1;
+        }
+
         if (order < new_order || classzone_idx > new_classzone_idx) {
             /*
              * Don't sleep if someone wants a larger 'order'
@@ -2763,7 +2816,7 @@ static int kswapd(void *p)
             order = pgdat->kswapd_max_order;
             classzone_idx = pgdat->classzone_idx;
             pgdat->kswapd_max_order = 0;
-            pgdat->classzone_idx = MAX_NR_ZONES - 1;
+            pgdat->classzone_idx = pgdat->nr_zones - 1;
         }
 
         ret = try_to_freeze();
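The shrink_slab() rework at the top of the vmscan.c diff stops using shrinker->nr as a shared scratch variable: a caller claims the whole deferred count with cmpxchg(), works on a private total_scan, and merges whatever it did not scan back the same way, so concurrent callers neither lose nor double-count work. A hedged userspace sketch of that claim/return pattern with C11 atomics; the names and numbers are illustrative only:

#include <stdatomic.h>
#include <stdio.h>

static _Atomic long deferred_nr = 1000;  /* stand-in for shrinker->nr */

/* claim the current deferred count and zero it in one atomic step */
static long take_deferred(void)
{
    long nr;

    do {
        nr = atomic_load(&deferred_nr);
    } while (!atomic_compare_exchange_weak(&deferred_nr, &nr, 0));
    return nr;
}

/* merge the unused remainder back, tolerating concurrent increments */
static void return_unused(long unused)
{
    long nr, new_nr;

    if (unused <= 0)
        return;
    do {
        nr = atomic_load(&deferred_nr);
        new_nr = nr + unused;
    } while (!atomic_compare_exchange_weak(&deferred_nr, &nr, new_nr));
}

int main(void)
{
    long total_scan = take_deferred();

    total_scan -= 700;           /* pretend we scanned 700 objects */
    return_unused(total_scan);   /* 300 go back into the deferred count */
    printf("deferred now %ld\n", atomic_load(&deferred_nr));
    return 0;
}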