summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorVasiliy Kulikov <segoon@openwall.com>2011-07-26 19:08:48 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2011-07-26 19:49:44 -0400
commitb34a6b1da371ed8af1221459a18c67970f7e3d53 (patch)
tree5addc850de13623b172395b9d0d7d670930fa6b3
parentd40dcdb0172a1ba853464983a059fb45e0aaf61a (diff)
ipc: introduce shm_rmid_forced sysctl
Add support for the shm_rmid_forced sysctl. If set to 1, all shared memory objects in the current ipc namespace will be automatically forced to use IPC_RMID. The POSIX way of handling shmem allows one to create shm objects and call shmdt(), leaving the shm object associated with no process, thus consuming memory not counted via rlimits. With shm_rmid_forced=1 the shared memory object is counted at least for one process, so the OOM killer may effectively kill the fat process holding the shared memory. It obviously breaks POSIX - some programs relying on the feature would stop working. So set shm_rmid_forced=1 only if you're sure nobody uses "orphaned" memory. Use shm_rmid_forced=0 by default for compatibility reasons. The feature was previously implemented in -ow as a configure option. [akpm@linux-foundation.org: fix documentation, per Randy] [akpm@linux-foundation.org: fix warning] [akpm@linux-foundation.org: readability/conventionality tweaks] [akpm@linux-foundation.org: fix shm_rmid_forced/shm_forced_rmid confusion, use standard comment layout] Signed-off-by: Vasiliy Kulikov <segoon@openwall.com> Cc: Randy Dunlap <rdunlap@xenotime.net> Cc: "Eric W. Biederman" <ebiederm@xmission.com> Cc: "Serge E. Hallyn" <serge.hallyn@canonical.com> Cc: Daniel Lezcano <daniel.lezcano@free.fr> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Tejun Heo <tj@kernel.org> Cc: Ingo Molnar <mingo@elte.hu> Cc: Alan Cox <alan@lxorguk.ukuu.org.uk> Cc: Solar Designer <solar@openwall.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--Documentation/sysctl/kernel.txt22
-rw-r--r--include/linux/ipc_namespace.h7
-rw-r--r--include/linux/shm.h4
-rw-r--r--ipc/ipc_sysctl.c36
-rw-r--r--ipc/shm.c97
-rw-r--r--kernel/exit.c1
6 files changed, 163 insertions, 4 deletions
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 1c7fb0a94e28..704e474a93df 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -61,6 +61,7 @@ show up in /proc/sys/kernel:
61- rtsig-nr 61- rtsig-nr
62- sem 62- sem
63- sg-big-buff [ generic SCSI device (sg) ] 63- sg-big-buff [ generic SCSI device (sg) ]
64- shm_rmid_forced
64- shmall 65- shmall
65- shmmax [ sysv ipc ] 66- shmmax [ sysv ipc ]
66- shmmni 67- shmmni
@@ -518,6 +519,27 @@ kernel. This value defaults to SHMMAX.
518 519
519============================================================== 520==============================================================
520 521
522shm_rmid_forced:
523
524Linux lets you set resource limits, including how much memory one
525process can consume, via setrlimit(2). Unfortunately, shared memory
526segments are allowed to exist without association with any process, and
527thus might not be counted against any resource limits. If enabled,
528shared memory segments are automatically destroyed when their attach
529count becomes zero after a detach or a process termination. It will
530also destroy segments that were created, but never attached to, on exit
531from the process. The only use left for IPC_RMID is to immediately
532destroy an unattached segment. Of course, this breaks the way things are
533defined, so some applications might stop working. Note that this
534feature will do you no good unless you also configure your resource
535limits (in particular, RLIMIT_AS and RLIMIT_NPROC). Most systems don't
536need this.
537
538Note that if you change this from 0 to 1, already created segments
539without users and with a dead originating process will be destroyed.
540
541==============================================================
542
521softlockup_thresh: 543softlockup_thresh:
522 544
523This value can be used to lower the softlockup tolerance threshold. The 545This value can be used to lower the softlockup tolerance threshold. The
diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h
index a6d1655f9607..8a297a5e794c 100644
--- a/include/linux/ipc_namespace.h
+++ b/include/linux/ipc_namespace.h
@@ -44,6 +44,11 @@ struct ipc_namespace {
44 size_t shm_ctlall; 44 size_t shm_ctlall;
45 int shm_ctlmni; 45 int shm_ctlmni;
46 int shm_tot; 46 int shm_tot;
47 /*
48 * Defines whether IPC_RMID is forced for _all_ shm segments regardless
49 * of shmctl()
50 */
51 int shm_rmid_forced;
47 52
48 struct notifier_block ipcns_nb; 53 struct notifier_block ipcns_nb;
49 54
@@ -72,6 +77,7 @@ extern int register_ipcns_notifier(struct ipc_namespace *);
72extern int cond_register_ipcns_notifier(struct ipc_namespace *); 77extern int cond_register_ipcns_notifier(struct ipc_namespace *);
73extern void unregister_ipcns_notifier(struct ipc_namespace *); 78extern void unregister_ipcns_notifier(struct ipc_namespace *);
74extern int ipcns_notify(unsigned long); 79extern int ipcns_notify(unsigned long);
80extern void shm_destroy_orphaned(struct ipc_namespace *ns);
75#else /* CONFIG_SYSVIPC */ 81#else /* CONFIG_SYSVIPC */
76static inline int register_ipcns_notifier(struct ipc_namespace *ns) 82static inline int register_ipcns_notifier(struct ipc_namespace *ns)
77{ return 0; } 83{ return 0; }
@@ -79,6 +85,7 @@ static inline int cond_register_ipcns_notifier(struct ipc_namespace *ns)
79{ return 0; } 85{ return 0; }
80static inline void unregister_ipcns_notifier(struct ipc_namespace *ns) { } 86static inline void unregister_ipcns_notifier(struct ipc_namespace *ns) { }
81static inline int ipcns_notify(unsigned long l) { return 0; } 87static inline int ipcns_notify(unsigned long l) { return 0; }
88static inline void shm_destroy_orphaned(struct ipc_namespace *ns) {}
82#endif /* CONFIG_SYSVIPC */ 89#endif /* CONFIG_SYSVIPC */
83 90
84#ifdef CONFIG_POSIX_MQUEUE 91#ifdef CONFIG_POSIX_MQUEUE
diff --git a/include/linux/shm.h b/include/linux/shm.h
index eca6235a46c0..7d27ffde0190 100644
--- a/include/linux/shm.h
+++ b/include/linux/shm.h
@@ -106,6 +106,7 @@ struct shmid_kernel /* private to the kernel */
106#ifdef CONFIG_SYSVIPC 106#ifdef CONFIG_SYSVIPC
107long do_shmat(int shmid, char __user *shmaddr, int shmflg, unsigned long *addr); 107long do_shmat(int shmid, char __user *shmaddr, int shmflg, unsigned long *addr);
108extern int is_file_shm_hugepages(struct file *file); 108extern int is_file_shm_hugepages(struct file *file);
109extern void exit_shm(struct task_struct *task);
109#else 110#else
110static inline long do_shmat(int shmid, char __user *shmaddr, 111static inline long do_shmat(int shmid, char __user *shmaddr,
111 int shmflg, unsigned long *addr) 112 int shmflg, unsigned long *addr)
@@ -116,6 +117,9 @@ static inline int is_file_shm_hugepages(struct file *file)
116{ 117{
117 return 0; 118 return 0;
118} 119}
120static inline void exit_shm(struct task_struct *task)
121{
122}
119#endif 123#endif
120 124
121#endif /* __KERNEL__ */ 125#endif /* __KERNEL__ */
diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c
index 56410faa4550..00fba2bab87d 100644
--- a/ipc/ipc_sysctl.c
+++ b/ipc/ipc_sysctl.c
@@ -31,12 +31,37 @@ static int proc_ipc_dointvec(ctl_table *table, int write,
31 void __user *buffer, size_t *lenp, loff_t *ppos) 31 void __user *buffer, size_t *lenp, loff_t *ppos)
32{ 32{
33 struct ctl_table ipc_table; 33 struct ctl_table ipc_table;
34
34 memcpy(&ipc_table, table, sizeof(ipc_table)); 35 memcpy(&ipc_table, table, sizeof(ipc_table));
35 ipc_table.data = get_ipc(table); 36 ipc_table.data = get_ipc(table);
36 37
37 return proc_dointvec(&ipc_table, write, buffer, lenp, ppos); 38 return proc_dointvec(&ipc_table, write, buffer, lenp, ppos);
38} 39}
39 40
41static int proc_ipc_dointvec_minmax(ctl_table *table, int write,
42 void __user *buffer, size_t *lenp, loff_t *ppos)
43{
44 struct ctl_table ipc_table;
45
46 memcpy(&ipc_table, table, sizeof(ipc_table));
47 ipc_table.data = get_ipc(table);
48
49 return proc_dointvec_minmax(&ipc_table, write, buffer, lenp, ppos);
50}
51
52static int proc_ipc_dointvec_minmax_orphans(ctl_table *table, int write,
53 void __user *buffer, size_t *lenp, loff_t *ppos)
54{
55 struct ipc_namespace *ns = current->nsproxy->ipc_ns;
56 int err = proc_ipc_dointvec_minmax(table, write, buffer, lenp, ppos);
57
58 if (err < 0)
59 return err;
60 if (ns->shm_rmid_forced)
61 shm_destroy_orphaned(ns);
62 return err;
63}
64
40static int proc_ipc_callback_dointvec(ctl_table *table, int write, 65static int proc_ipc_callback_dointvec(ctl_table *table, int write,
41 void __user *buffer, size_t *lenp, loff_t *ppos) 66 void __user *buffer, size_t *lenp, loff_t *ppos)
42{ 67{
@@ -125,6 +150,8 @@ static int proc_ipcauto_dointvec_minmax(ctl_table *table, int write,
125#else 150#else
126#define proc_ipc_doulongvec_minmax NULL 151#define proc_ipc_doulongvec_minmax NULL
127#define proc_ipc_dointvec NULL 152#define proc_ipc_dointvec NULL
153#define proc_ipc_dointvec_minmax NULL
154#define proc_ipc_dointvec_minmax_orphans NULL
128#define proc_ipc_callback_dointvec NULL 155#define proc_ipc_callback_dointvec NULL
129#define proc_ipcauto_dointvec_minmax NULL 156#define proc_ipcauto_dointvec_minmax NULL
130#endif 157#endif
@@ -155,6 +182,15 @@ static struct ctl_table ipc_kern_table[] = {
155 .proc_handler = proc_ipc_dointvec, 182 .proc_handler = proc_ipc_dointvec,
156 }, 183 },
157 { 184 {
185 .procname = "shm_rmid_forced",
186 .data = &init_ipc_ns.shm_rmid_forced,
187 .maxlen = sizeof(init_ipc_ns.shm_rmid_forced),
188 .mode = 0644,
189 .proc_handler = proc_ipc_dointvec_minmax_orphans,
190 .extra1 = &zero,
191 .extra2 = &one,
192 },
193 {
158 .procname = "msgmax", 194 .procname = "msgmax",
159 .data = &init_ipc_ns.msg_ctlmax, 195 .data = &init_ipc_ns.msg_ctlmax,
160 .maxlen = sizeof (init_ipc_ns.msg_ctlmax), 196 .maxlen = sizeof (init_ipc_ns.msg_ctlmax),
diff --git a/ipc/shm.c b/ipc/shm.c
index 27884adb1a90..3f5b14365f33 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -74,6 +74,7 @@ void shm_init_ns(struct ipc_namespace *ns)
74 ns->shm_ctlmax = SHMMAX; 74 ns->shm_ctlmax = SHMMAX;
75 ns->shm_ctlall = SHMALL; 75 ns->shm_ctlall = SHMALL;
76 ns->shm_ctlmni = SHMMNI; 76 ns->shm_ctlmni = SHMMNI;
77 ns->shm_rmid_forced = 0;
77 ns->shm_tot = 0; 78 ns->shm_tot = 0;
78 ipc_init_ids(&shm_ids(ns)); 79 ipc_init_ids(&shm_ids(ns));
79} 80}
@@ -187,6 +188,23 @@ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
187} 188}
188 189
189/* 190/*
191 * shm_may_destroy - identifies whether shm segment should be destroyed now
192 *
193 * Returns true if and only if there are no active users of the segment and
194 * one of the following is true:
195 *
196 * 1) shmctl(id, IPC_RMID, NULL) was called for this shp
197 *
198 * 2) sysctl kernel.shm_rmid_forced is set to 1.
199 */
200static bool shm_may_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
201{
202 return (shp->shm_nattch == 0) &&
203 (ns->shm_rmid_forced ||
204 (shp->shm_perm.mode & SHM_DEST));
205}
206
207/*
190 * remove the attach descriptor vma. 208 * remove the attach descriptor vma.
191 * free memory for segment if it is marked destroyed. 209 * free memory for segment if it is marked destroyed.
192 * The descriptor has already been removed from the current->mm->mmap list 210 * The descriptor has already been removed from the current->mm->mmap list
@@ -206,11 +224,83 @@ static void shm_close(struct vm_area_struct *vma)
206 shp->shm_lprid = task_tgid_vnr(current); 224 shp->shm_lprid = task_tgid_vnr(current);
207 shp->shm_dtim = get_seconds(); 225 shp->shm_dtim = get_seconds();
208 shp->shm_nattch--; 226 shp->shm_nattch--;
209 if(shp->shm_nattch == 0 && 227 if (shm_may_destroy(ns, shp))
210 shp->shm_perm.mode & SHM_DEST) 228 shm_destroy(ns, shp);
229 else
230 shm_unlock(shp);
231 up_write(&shm_ids(ns).rw_mutex);
232}
233
234static int shm_try_destroy_current(int id, void *p, void *data)
235{
236 struct ipc_namespace *ns = data;
237 struct shmid_kernel *shp = shm_lock(ns, id);
238
239 if (IS_ERR(shp))
240 return 0;
241
242 if (shp->shm_cprid != task_tgid_vnr(current)) {
243 shm_unlock(shp);
244 return 0;
245 }
246
247 if (shm_may_destroy(ns, shp))
248 shm_destroy(ns, shp);
249 else
250 shm_unlock(shp);
251 return 0;
252}
253
254static int shm_try_destroy_orphaned(int id, void *p, void *data)
255{
256 struct ipc_namespace *ns = data;
257 struct shmid_kernel *shp = shm_lock(ns, id);
258 struct task_struct *task;
259
260 if (IS_ERR(shp))
261 return 0;
262
263 /*
264 * We want to destroy segments without users and with already
265 * exit'ed originating process.
266 *
267 * XXX: the originating process may exist in another pid namespace.
268 */
269 task = find_task_by_vpid(shp->shm_cprid);
270 if (task != NULL) {
271 shm_unlock(shp);
272 return 0;
273 }
274
275 if (shm_may_destroy(ns, shp))
211 shm_destroy(ns, shp); 276 shm_destroy(ns, shp);
212 else 277 else
213 shm_unlock(shp); 278 shm_unlock(shp);
279 return 0;
280}
281
282void shm_destroy_orphaned(struct ipc_namespace *ns)
283{
284 down_write(&shm_ids(ns).rw_mutex);
285 idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_orphaned, ns);
286 up_write(&shm_ids(ns).rw_mutex);
287}
288
289
290void exit_shm(struct task_struct *task)
291{
292 struct nsproxy *nsp = task->nsproxy;
293 struct ipc_namespace *ns;
294
295 if (!nsp)
296 return;
297 ns = nsp->ipc_ns;
298 if (!ns || !ns->shm_rmid_forced)
299 return;
300
301 /* Destroy all already created segments, but not mapped yet */
302 down_write(&shm_ids(ns).rw_mutex);
303 idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_current, ns);
214 up_write(&shm_ids(ns).rw_mutex); 304 up_write(&shm_ids(ns).rw_mutex);
215} 305}
216 306
@@ -950,8 +1040,7 @@ out_nattch:
950 shp = shm_lock(ns, shmid); 1040 shp = shm_lock(ns, shmid);
951 BUG_ON(IS_ERR(shp)); 1041 BUG_ON(IS_ERR(shp));
952 shp->shm_nattch--; 1042 shp->shm_nattch--;
953 if(shp->shm_nattch == 0 && 1043 if (shm_may_destroy(ns, shp))
954 shp->shm_perm.mode & SHM_DEST)
955 shm_destroy(ns, shp); 1044 shm_destroy(ns, shp);
956 else 1045 else
957 shm_unlock(shp); 1046 shm_unlock(shp);
diff --git a/kernel/exit.c b/kernel/exit.c
index 9ee58bb9e60f..2913b3509d42 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -980,6 +980,7 @@ NORET_TYPE void do_exit(long code)
980 trace_sched_process_exit(tsk); 980 trace_sched_process_exit(tsk);
981 981
982 exit_sem(tsk); 982 exit_sem(tsk);
983 exit_shm(tsk);
983 exit_files(tsk); 984 exit_files(tsk);
984 exit_fs(tsk); 985 exit_fs(tsk);
985 check_stack_usage(); 986 check_stack_usage();