diff options
author | Hugh Dickins <hugh.dickins@tiscali.co.uk> | 2009-12-14 20:59:29 -0500 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2009-12-15 11:53:19 -0500 |
commit | 80e148226028257ec0a1909d99b2c40d0ffe17f2 (patch) | |
tree | 1e3cae42144f5c80e215ba254e01bd6847ba1b36 /mm | |
parent | 4035c07a895974d0ac06a56fe870ad293fc451a7 (diff) |
ksm: share anon page without allocating
When ksm pages were unswappable, it made no sense to include them in mem
cgroup accounting; but now that they are swappable (although I see no
strict logical connection) the principle of least surprise implies that
they should be accounted (with the usual dissatisfaction, that a shared
page is accounted to only one of the cgroups using it).
This patch was intended to add mem cgroup accounting where necessary; but
turned inside out, it now avoids allocating a ksm page, instead upgrading
an anon page to ksm - which brings its existing mem cgroup accounting with
it. Thus mem cgroups don't appear in the patch at all.
This upgrade from PageAnon to PageKsm takes place under page lock (via a
somewhat hacky NULL kpage interface), and audit showed only one place
which needed to cope with the race - page_referenced() is sometimes used
without page lock, so page_lock_anon_vma() needs an ACCESS_ONCE() to be
sure of getting anon_vma and flags together (no problem if the page goes
ksm an instant after, the integrity of that anon_vma list is unaffected).
Signed-off-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Cc: Izik Eidus <ieidus@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Chris Wright <chrisw@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r-- | mm/ksm.c | 67 | ||||
-rw-r--r-- | mm/rmap.c | 6 |
2 files changed, 25 insertions, 48 deletions
@@ -831,7 +831,8 @@ out: | |||
831 | * try_to_merge_one_page - take two pages and merge them into one | 831 | * try_to_merge_one_page - take two pages and merge them into one |
832 | * @vma: the vma that holds the pte pointing to page | 832 | * @vma: the vma that holds the pte pointing to page |
833 | * @page: the PageAnon page that we want to replace with kpage | 833 | * @page: the PageAnon page that we want to replace with kpage |
834 | * @kpage: the PageKsm page that we want to map instead of page | 834 | * @kpage: the PageKsm page that we want to map instead of page, |
835 | * or NULL the first time when we want to use page as kpage. | ||
835 | * | 836 | * |
836 | * This function returns 0 if the pages were merged, -EFAULT otherwise. | 837 | * This function returns 0 if the pages were merged, -EFAULT otherwise. |
837 | */ | 838 | */ |
@@ -864,15 +865,24 @@ static int try_to_merge_one_page(struct vm_area_struct *vma, | |||
864 | * ptes are necessarily already write-protected. But in either | 865 | * ptes are necessarily already write-protected. But in either |
865 | * case, we need to lock and check page_count is not raised. | 866 | * case, we need to lock and check page_count is not raised. |
866 | */ | 867 | */ |
867 | if (write_protect_page(vma, page, &orig_pte) == 0 && | 868 | if (write_protect_page(vma, page, &orig_pte) == 0) { |
868 | pages_identical(page, kpage)) | 869 | if (!kpage) { |
869 | err = replace_page(vma, page, kpage, orig_pte); | 870 | /* |
871 | * While we hold page lock, upgrade page from | ||
872 | * PageAnon+anon_vma to PageKsm+NULL stable_node: | ||
873 | * stable_tree_insert() will update stable_node. | ||
874 | */ | ||
875 | set_page_stable_node(page, NULL); | ||
876 | mark_page_accessed(page); | ||
877 | err = 0; | ||
878 | } else if (pages_identical(page, kpage)) | ||
879 | err = replace_page(vma, page, kpage, orig_pte); | ||
880 | } | ||
870 | 881 | ||
871 | if ((vma->vm_flags & VM_LOCKED) && !err) { | 882 | if ((vma->vm_flags & VM_LOCKED) && kpage && !err) { |
872 | munlock_vma_page(page); | 883 | munlock_vma_page(page); |
873 | if (!PageMlocked(kpage)) { | 884 | if (!PageMlocked(kpage)) { |
874 | unlock_page(page); | 885 | unlock_page(page); |
875 | lru_add_drain(); | ||
876 | lock_page(kpage); | 886 | lock_page(kpage); |
877 | mlock_vma_page(kpage); | 887 | mlock_vma_page(kpage); |
878 | page = kpage; /* for final unlock */ | 888 | page = kpage; /* for final unlock */ |
@@ -922,7 +932,7 @@ out: | |||
922 | * This function returns the kpage if we successfully merged two identical | 932 | * This function returns the kpage if we successfully merged two identical |
923 | * pages into one ksm page, NULL otherwise. | 933 | * pages into one ksm page, NULL otherwise. |
924 | * | 934 | * |
925 | * Note that this function allocates a new kernel page: if one of the pages | 935 | * Note that this function upgrades page to ksm page: if one of the pages |
926 | * is already a ksm page, try_to_merge_with_ksm_page should be used. | 936 | * is already a ksm page, try_to_merge_with_ksm_page should be used. |
927 | */ | 937 | */ |
928 | static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item, | 938 | static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item, |
@@ -930,10 +940,7 @@ static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item, | |||
930 | struct rmap_item *tree_rmap_item, | 940 | struct rmap_item *tree_rmap_item, |
931 | struct page *tree_page) | 941 | struct page *tree_page) |
932 | { | 942 | { |
933 | struct mm_struct *mm = rmap_item->mm; | 943 | int err; |
934 | struct vm_area_struct *vma; | ||
935 | struct page *kpage; | ||
936 | int err = -EFAULT; | ||
937 | 944 | ||
938 | /* | 945 | /* |
939 | * The number of nodes in the stable tree | 946 | * The number of nodes in the stable tree |
@@ -943,37 +950,10 @@ static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item, | |||
943 | ksm_max_kernel_pages <= ksm_pages_shared) | 950 | ksm_max_kernel_pages <= ksm_pages_shared) |
944 | return NULL; | 951 | return NULL; |
945 | 952 | ||
946 | kpage = alloc_page(GFP_HIGHUSER); | 953 | err = try_to_merge_with_ksm_page(rmap_item, page, NULL); |
947 | if (!kpage) | ||
948 | return NULL; | ||
949 | |||
950 | down_read(&mm->mmap_sem); | ||
951 | if (ksm_test_exit(mm)) | ||
952 | goto up; | ||
953 | vma = find_vma(mm, rmap_item->address); | ||
954 | if (!vma || vma->vm_start > rmap_item->address) | ||
955 | goto up; | ||
956 | |||
957 | copy_user_highpage(kpage, page, rmap_item->address, vma); | ||
958 | |||
959 | SetPageDirty(kpage); | ||
960 | __SetPageUptodate(kpage); | ||
961 | SetPageSwapBacked(kpage); | ||
962 | set_page_stable_node(kpage, NULL); /* mark it PageKsm */ | ||
963 | lru_cache_add_lru(kpage, LRU_ACTIVE_ANON); | ||
964 | |||
965 | err = try_to_merge_one_page(vma, page, kpage); | ||
966 | if (err) | ||
967 | goto up; | ||
968 | |||
969 | /* Must get reference to anon_vma while still holding mmap_sem */ | ||
970 | hold_anon_vma(rmap_item, vma->anon_vma); | ||
971 | up: | ||
972 | up_read(&mm->mmap_sem); | ||
973 | |||
974 | if (!err) { | 954 | if (!err) { |
975 | err = try_to_merge_with_ksm_page(tree_rmap_item, | 955 | err = try_to_merge_with_ksm_page(tree_rmap_item, |
976 | tree_page, kpage); | 956 | tree_page, page); |
977 | /* | 957 | /* |
978 | * If that fails, we have a ksm page with only one pte | 958 | * If that fails, we have a ksm page with only one pte |
979 | * pointing to it: so break it. | 959 | * pointing to it: so break it. |
@@ -981,11 +961,7 @@ up: | |||
981 | if (err) | 961 | if (err) |
982 | break_cow(rmap_item); | 962 | break_cow(rmap_item); |
983 | } | 963 | } |
984 | if (err) { | 964 | return err ? NULL : page; |
985 | put_page(kpage); | ||
986 | kpage = NULL; | ||
987 | } | ||
988 | return kpage; | ||
989 | } | 965 | } |
990 | 966 | ||
991 | /* | 967 | /* |
@@ -1244,7 +1220,6 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) | |||
1244 | stable_tree_append(rmap_item, stable_node); | 1220 | stable_tree_append(rmap_item, stable_node); |
1245 | } | 1221 | } |
1246 | unlock_page(kpage); | 1222 | unlock_page(kpage); |
1247 | put_page(kpage); | ||
1248 | 1223 | ||
1249 | /* | 1224 | /* |
1250 | * If we fail to insert the page into the stable tree, | 1225 | * If we fail to insert the page into the stable tree, |
@@ -204,7 +204,7 @@ struct anon_vma *page_lock_anon_vma(struct page *page) | |||
204 | unsigned long anon_mapping; | 204 | unsigned long anon_mapping; |
205 | 205 | ||
206 | rcu_read_lock(); | 206 | rcu_read_lock(); |
207 | anon_mapping = (unsigned long) page->mapping; | 207 | anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping); |
208 | if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) | 208 | if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) |
209 | goto out; | 209 | goto out; |
210 | if (!page_mapped(page)) | 210 | if (!page_mapped(page)) |
@@ -666,7 +666,9 @@ static void __page_check_anon_rmap(struct page *page, | |||
666 | * @address: the user virtual address mapped | 666 | * @address: the user virtual address mapped |
667 | * | 667 | * |
668 | * The caller needs to hold the pte lock, and the page must be locked in | 668 | * The caller needs to hold the pte lock, and the page must be locked in |
669 | * the anon_vma case: to serialize mapping,index checking after setting. | 669 | * the anon_vma case: to serialize mapping,index checking after setting, |
670 | * and to ensure that PageAnon is not being upgraded racily to PageKsm | ||
671 | * (but PageKsm is never downgraded to PageAnon). | ||
670 | */ | 672 | */ |
671 | void page_add_anon_rmap(struct page *page, | 673 | void page_add_anon_rmap(struct page *page, |
672 | struct vm_area_struct *vma, unsigned long address) | 674 | struct vm_area_struct *vma, unsigned long address) |