diff options
Diffstat (limited to 'mm/madvise.c')
-rw-r--r-- | mm/madvise.c | 242 |
1 files changed, 242 insertions, 0 deletions
diff --git a/mm/madvise.c b/mm/madvise.c new file mode 100644 index 000000000000..944b5e52d812 --- /dev/null +++ b/mm/madvise.c | |||
@@ -0,0 +1,242 @@ | |||
1 | /* | ||
2 | * linux/mm/madvise.c | ||
3 | * | ||
4 | * Copyright (C) 1999 Linus Torvalds | ||
5 | * Copyright (C) 2002 Christoph Hellwig | ||
6 | */ | ||
7 | |||
8 | #include <linux/mman.h> | ||
9 | #include <linux/pagemap.h> | ||
10 | #include <linux/syscalls.h> | ||
11 | #include <linux/hugetlb.h> | ||
12 | |||
13 | /* | ||
14 | * We can potentially split a vm area into separate | ||
15 | * areas, each area with its own behavior. | ||
16 | */ | ||
17 | static long madvise_behavior(struct vm_area_struct * vma, unsigned long start, | ||
18 | unsigned long end, int behavior) | ||
19 | { | ||
20 | struct mm_struct * mm = vma->vm_mm; | ||
21 | int error = 0; | ||
22 | |||
23 | if (start != vma->vm_start) { | ||
24 | error = split_vma(mm, vma, start, 1); | ||
25 | if (error) | ||
26 | goto out; | ||
27 | } | ||
28 | |||
29 | if (end != vma->vm_end) { | ||
30 | error = split_vma(mm, vma, end, 0); | ||
31 | if (error) | ||
32 | goto out; | ||
33 | } | ||
34 | |||
35 | /* | ||
36 | * vm_flags is protected by the mmap_sem held in write mode. | ||
37 | */ | ||
38 | VM_ClearReadHint(vma); | ||
39 | |||
40 | switch (behavior) { | ||
41 | case MADV_SEQUENTIAL: | ||
42 | vma->vm_flags |= VM_SEQ_READ; | ||
43 | break; | ||
44 | case MADV_RANDOM: | ||
45 | vma->vm_flags |= VM_RAND_READ; | ||
46 | break; | ||
47 | default: | ||
48 | break; | ||
49 | } | ||
50 | |||
51 | out: | ||
52 | if (error == -ENOMEM) | ||
53 | error = -EAGAIN; | ||
54 | return error; | ||
55 | } | ||
56 | |||
57 | /* | ||
58 | * Schedule all required I/O operations. Do not wait for completion. | ||
59 | */ | ||
60 | static long madvise_willneed(struct vm_area_struct * vma, | ||
61 | unsigned long start, unsigned long end) | ||
62 | { | ||
63 | struct file *file = vma->vm_file; | ||
64 | |||
65 | if (!file) | ||
66 | return -EBADF; | ||
67 | |||
68 | start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; | ||
69 | if (end > vma->vm_end) | ||
70 | end = vma->vm_end; | ||
71 | end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; | ||
72 | |||
73 | force_page_cache_readahead(file->f_mapping, | ||
74 | file, start, max_sane_readahead(end - start)); | ||
75 | return 0; | ||
76 | } | ||
77 | |||
78 | /* | ||
79 | * Application no longer needs these pages. If the pages are dirty, | ||
80 | * it's OK to just throw them away. The app will be more careful about | ||
81 | * data it wants to keep. Be sure to free swap resources too. The | ||
82 | * zap_page_range call sets things up for refill_inactive to actually free | ||
83 | * these pages later if no one else has touched them in the meantime, | ||
84 | * although we could add these pages to a global reuse list for | ||
85 | * refill_inactive to pick up before reclaiming other pages. | ||
86 | * | ||
87 | * NB: This interface discards data rather than pushes it out to swap, | ||
88 | * as some implementations do. This has performance implications for | ||
89 | * applications like large transactional databases which want to discard | ||
90 | * pages in anonymous maps after committing to backing store the data | ||
91 | * that was kept in them. There is no reason to write this data out to | ||
92 | * the swap area if the application is discarding it. | ||
93 | * | ||
94 | * An interface that causes the system to free clean pages and flush | ||
95 | * dirty pages is already available as msync(MS_INVALIDATE). | ||
96 | */ | ||
97 | static long madvise_dontneed(struct vm_area_struct * vma, | ||
98 | unsigned long start, unsigned long end) | ||
99 | { | ||
100 | if ((vma->vm_flags & VM_LOCKED) || is_vm_hugetlb_page(vma)) | ||
101 | return -EINVAL; | ||
102 | |||
103 | if (unlikely(vma->vm_flags & VM_NONLINEAR)) { | ||
104 | struct zap_details details = { | ||
105 | .nonlinear_vma = vma, | ||
106 | .last_index = ULONG_MAX, | ||
107 | }; | ||
108 | zap_page_range(vma, start, end - start, &details); | ||
109 | } else | ||
110 | zap_page_range(vma, start, end - start, NULL); | ||
111 | return 0; | ||
112 | } | ||
113 | |||
114 | static long madvise_vma(struct vm_area_struct * vma, unsigned long start, | ||
115 | unsigned long end, int behavior) | ||
116 | { | ||
117 | long error = -EBADF; | ||
118 | |||
119 | switch (behavior) { | ||
120 | case MADV_NORMAL: | ||
121 | case MADV_SEQUENTIAL: | ||
122 | case MADV_RANDOM: | ||
123 | error = madvise_behavior(vma, start, end, behavior); | ||
124 | break; | ||
125 | |||
126 | case MADV_WILLNEED: | ||
127 | error = madvise_willneed(vma, start, end); | ||
128 | break; | ||
129 | |||
130 | case MADV_DONTNEED: | ||
131 | error = madvise_dontneed(vma, start, end); | ||
132 | break; | ||
133 | |||
134 | default: | ||
135 | error = -EINVAL; | ||
136 | break; | ||
137 | } | ||
138 | |||
139 | return error; | ||
140 | } | ||
141 | |||
142 | /* | ||
143 | * The madvise(2) system call. | ||
144 | * | ||
145 | * Applications can use madvise() to advise the kernel how it should | ||
146 | * handle paging I/O in this VM area. The idea is to help the kernel | ||
147 | * use appropriate read-ahead and caching techniques. The information | ||
148 | * provided is advisory only, and can be safely disregarded by the | ||
149 | * kernel without affecting the correct operation of the application. | ||
150 | * | ||
151 | * behavior values: | ||
152 | * MADV_NORMAL - the default behavior is to read clusters. This | ||
153 | * results in some read-ahead and read-behind. | ||
154 | * MADV_RANDOM - the system should read the minimum amount of data | ||
155 | * on any access, since it is unlikely that the appli- | ||
156 | * cation will need more than what it asks for. | ||
157 | * MADV_SEQUENTIAL - pages in the given range will probably be accessed | ||
158 | * once, so they can be aggressively read ahead, and | ||
159 | * can be freed soon after they are accessed. | ||
160 | * MADV_WILLNEED - the application is notifying the system to read | ||
161 | * some pages ahead. | ||
162 | * MADV_DONTNEED - the application is finished with the given range, | ||
163 | * so the kernel can free resources associated with it. | ||
164 | * | ||
165 | * return values: | ||
166 | * zero - success | ||
167 | * -EINVAL - start + len < 0, start is not page-aligned, | ||
168 | * "behavior" is not a valid value, or application | ||
169 | * is attempting to release locked or shared pages. | ||
170 | * -ENOMEM - addresses in the specified range are not currently | ||
171 | * mapped, or are outside the AS of the process. | ||
172 | * -EIO - an I/O error occurred while paging in data. | ||
173 | * -EBADF - map exists, but area maps something that isn't a file. | ||
174 | * -EAGAIN - a kernel resource was temporarily unavailable. | ||
175 | */ | ||
176 | asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior) | ||
177 | { | ||
178 | unsigned long end; | ||
179 | struct vm_area_struct * vma; | ||
180 | int unmapped_error = 0; | ||
181 | int error = -EINVAL; | ||
182 | size_t len; | ||
183 | |||
184 | down_write(¤t->mm->mmap_sem); | ||
185 | |||
186 | if (start & ~PAGE_MASK) | ||
187 | goto out; | ||
188 | len = (len_in + ~PAGE_MASK) & PAGE_MASK; | ||
189 | |||
190 | /* Check to see whether len was rounded up from small -ve to zero */ | ||
191 | if (len_in && !len) | ||
192 | goto out; | ||
193 | |||
194 | end = start + len; | ||
195 | if (end < start) | ||
196 | goto out; | ||
197 | |||
198 | error = 0; | ||
199 | if (end == start) | ||
200 | goto out; | ||
201 | |||
202 | /* | ||
203 | * If the interval [start,end) covers some unmapped address | ||
204 | * ranges, just ignore them, but return -ENOMEM at the end. | ||
205 | */ | ||
206 | vma = find_vma(current->mm, start); | ||
207 | for (;;) { | ||
208 | /* Still start < end. */ | ||
209 | error = -ENOMEM; | ||
210 | if (!vma) | ||
211 | goto out; | ||
212 | |||
213 | /* Here start < vma->vm_end. */ | ||
214 | if (start < vma->vm_start) { | ||
215 | unmapped_error = -ENOMEM; | ||
216 | start = vma->vm_start; | ||
217 | } | ||
218 | |||
219 | /* Here vma->vm_start <= start < vma->vm_end. */ | ||
220 | if (end <= vma->vm_end) { | ||
221 | if (start < end) { | ||
222 | error = madvise_vma(vma, start, end, | ||
223 | behavior); | ||
224 | if (error) | ||
225 | goto out; | ||
226 | } | ||
227 | error = unmapped_error; | ||
228 | goto out; | ||
229 | } | ||
230 | |||
231 | /* Here vma->vm_start <= start < vma->vm_end < end. */ | ||
232 | error = madvise_vma(vma, start, vma->vm_end, behavior); | ||
233 | if (error) | ||
234 | goto out; | ||
235 | start = vma->vm_end; | ||
236 | vma = vma->vm_next; | ||
237 | } | ||
238 | |||
239 | out: | ||
240 | up_write(¤t->mm->mmap_sem); | ||
241 | return error; | ||
242 | } | ||