aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
author: Eric W. Biederman <ebiederm@xmission.com> 2006-03-31 05:31:42 -0500
committer: Linus Torvalds <torvalds@g5.osdl.org> 2006-03-31 15:19:00 -0500
commit: 92476d7fc0326a409ab1d3864a04093a6be9aca7 (patch)
tree: ea50a5a31522492d9915e0763a7adc6ac87c4fbc /kernel
parent: 8c7904a00b06d2ee51149794b619e07369fcf9d4 (diff)
[PATCH] pidhash: Refactor the pid hash table
Simplifies the code, reduces the need for 4 pid hash tables, and makes the code more capable. In the discussions I had with Oleg it was felt that to a large extent the cleanup itself justified the work. Having struct pid dynamically allocated meant we could create the hash table entry when the pid was allocated and free the hash table entry when the pid was freed, instead of playing with the hash lists whenever a process would attach or detach to a process. For myself the fact that it gave what my previous task_ref patch gave for free with simpler code was a big win. The problem is that if you hold a reference to struct task_struct you lock in 10K of low memory. If you do that in a user controllable way like /proc does, with an unprivileged but hostile user space application with typical resource limits of 1000 fds and 100 processes I can trigger the OOM killer by consuming all of low memory with task structs, on a machine with 1GB of low memory. If I instead hold a reference to struct pid which holds a pointer to my task_struct, I don't suffer from that problem because struct pid is 2 orders of magnitude smaller. In fact struct pid is small enough that most other kernel data structures dwarf it, so simply limiting the number of referring data structures is enough to prevent exhaustion of low memory. This splits the current struct pid into two structures, struct pid and struct pid_link, and reduces our number of hash tables from PIDTYPE_MAX to just one. struct pid_link is the per process linkage into the hash tables and lives in struct task_struct. struct pid is given an independent lifetime, and holds pointers to each of the pid types. The independent life of struct pid simplifies attach_pid, and detach_pid, because we are always manipulating the list of pids and not the hash table. In addition, giving struct pid an independent life makes the concept much more powerful.
Kernel data structures can now embed a struct pid * instead of a pid_t and not suffer from pid wrap around problems or from keeping unnecessarily large amounts of memory allocated. Signed-off-by: Eric W. Biederman <ebiederm@xmission.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'kernel')
-rw-r--r-- kernel/fork.c 16
-rw-r--r-- kernel/pid.c 212
2 files changed, 155 insertions, 73 deletions
diff --git a/kernel/fork.c b/kernel/fork.c
index b1341205be27..03975d0467f9 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1315,17 +1315,19 @@ long do_fork(unsigned long clone_flags,
1315{ 1315{
1316 struct task_struct *p; 1316 struct task_struct *p;
1317 int trace = 0; 1317 int trace = 0;
1318 long pid = alloc_pidmap(); 1318 struct pid *pid = alloc_pid();
1319 long nr;
1319 1320
1320 if (pid < 0) 1321 if (!pid)
1321 return -EAGAIN; 1322 return -EAGAIN;
1323 nr = pid->nr;
1322 if (unlikely(current->ptrace)) { 1324 if (unlikely(current->ptrace)) {
1323 trace = fork_traceflag (clone_flags); 1325 trace = fork_traceflag (clone_flags);
1324 if (trace) 1326 if (trace)
1325 clone_flags |= CLONE_PTRACE; 1327 clone_flags |= CLONE_PTRACE;
1326 } 1328 }
1327 1329
1328 p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, pid); 1330 p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, nr);
1329 /* 1331 /*
1330 * Do this prior waking up the new thread - the thread pointer 1332 * Do this prior waking up the new thread - the thread pointer
1331 * might get invalid after that point, if the thread exits quickly. 1333 * might get invalid after that point, if the thread exits quickly.
@@ -1352,7 +1354,7 @@ long do_fork(unsigned long clone_flags,
1352 p->state = TASK_STOPPED; 1354 p->state = TASK_STOPPED;
1353 1355
1354 if (unlikely (trace)) { 1356 if (unlikely (trace)) {
1355 current->ptrace_message = pid; 1357 current->ptrace_message = nr;
1356 ptrace_notify ((trace << 8) | SIGTRAP); 1358 ptrace_notify ((trace << 8) | SIGTRAP);
1357 } 1359 }
1358 1360
@@ -1362,10 +1364,10 @@ long do_fork(unsigned long clone_flags,
1362 ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP); 1364 ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP);
1363 } 1365 }
1364 } else { 1366 } else {
1365 free_pidmap(pid); 1367 free_pid(pid);
1366 pid = PTR_ERR(p); 1368 nr = PTR_ERR(p);
1367 } 1369 }
1368 return pid; 1370 return nr;
1369} 1371}
1370 1372
1371#ifndef ARCH_MIN_MMSTRUCT_ALIGN 1373#ifndef ARCH_MIN_MMSTRUCT_ALIGN
diff --git a/kernel/pid.c b/kernel/pid.c
index a9f2dfd006d2..eeb836b65ca4 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -28,8 +28,9 @@
28#include <linux/hash.h> 28#include <linux/hash.h>
29 29
30#define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift) 30#define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift)
31static struct hlist_head *pid_hash[PIDTYPE_MAX]; 31static struct hlist_head *pid_hash;
32static int pidhash_shift; 32static int pidhash_shift;
33static kmem_cache_t *pid_cachep;
33 34
34int pid_max = PID_MAX_DEFAULT; 35int pid_max = PID_MAX_DEFAULT;
35int last_pid; 36int last_pid;
@@ -60,9 +61,22 @@ typedef struct pidmap {
60static pidmap_t pidmap_array[PIDMAP_ENTRIES] = 61static pidmap_t pidmap_array[PIDMAP_ENTRIES] =
61 { [ 0 ... PIDMAP_ENTRIES-1 ] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } }; 62 { [ 0 ... PIDMAP_ENTRIES-1 ] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } };
62 63
64/*
65 * Note: disable interrupts while the pidmap_lock is held as an
66 * interrupt might come in and do read_lock(&tasklist_lock).
67 *
68 * If we don't disable interrupts there is a nasty deadlock between
69 * detach_pid()->free_pid() and another cpu that does
70 * spin_lock(&pidmap_lock) followed by an interrupt routine that does
71 * read_lock(&tasklist_lock);
72 *
73 * After we clean up the tasklist_lock and know there are no
74 * irq handlers that take it we can leave the interrupts enabled.
75 * For now it is easier to be safe than to prove it can't happen.
76 */
63static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); 77static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
64 78
65fastcall void free_pidmap(int pid) 79static fastcall void free_pidmap(int pid)
66{ 80{
67 pidmap_t *map = pidmap_array + pid / BITS_PER_PAGE; 81 pidmap_t *map = pidmap_array + pid / BITS_PER_PAGE;
68 int offset = pid & BITS_PER_PAGE_MASK; 82 int offset = pid & BITS_PER_PAGE_MASK;
@@ -71,7 +85,7 @@ fastcall void free_pidmap(int pid)
71 atomic_inc(&map->nr_free); 85 atomic_inc(&map->nr_free);
72} 86}
73 87
74int alloc_pidmap(void) 88static int alloc_pidmap(void)
75{ 89{
76 int i, offset, max_scan, pid, last = last_pid; 90 int i, offset, max_scan, pid, last = last_pid;
77 pidmap_t *map; 91 pidmap_t *map;
@@ -89,12 +103,12 @@ int alloc_pidmap(void)
89 * Free the page if someone raced with us 103 * Free the page if someone raced with us
90 * installing it: 104 * installing it:
91 */ 105 */
92 spin_lock(&pidmap_lock); 106 spin_lock_irq(&pidmap_lock);
93 if (map->page) 107 if (map->page)
94 free_page(page); 108 free_page(page);
95 else 109 else
96 map->page = (void *)page; 110 map->page = (void *)page;
97 spin_unlock(&pidmap_lock); 111 spin_unlock_irq(&pidmap_lock);
98 if (unlikely(!map->page)) 112 if (unlikely(!map->page))
99 break; 113 break;
100 } 114 }
@@ -131,13 +145,73 @@ int alloc_pidmap(void)
131 return -1; 145 return -1;
132} 146}
133 147
134struct pid * fastcall find_pid(enum pid_type type, int nr) 148fastcall void put_pid(struct pid *pid)
149{
150 if (!pid)
151 return;
152 if ((atomic_read(&pid->count) == 1) ||
153 atomic_dec_and_test(&pid->count))
154 kmem_cache_free(pid_cachep, pid);
155}
156
157static void delayed_put_pid(struct rcu_head *rhp)
158{
159 struct pid *pid = container_of(rhp, struct pid, rcu);
160 put_pid(pid);
161}
162
163fastcall void free_pid(struct pid *pid)
164{
165 /* We can be called with write_lock_irq(&tasklist_lock) held */
166 unsigned long flags;
167
168 spin_lock_irqsave(&pidmap_lock, flags);
169 hlist_del_rcu(&pid->pid_chain);
170 spin_unlock_irqrestore(&pidmap_lock, flags);
171
172 free_pidmap(pid->nr);
173 call_rcu(&pid->rcu, delayed_put_pid);
174}
175
176struct pid *alloc_pid(void)
177{
178 struct pid *pid;
179 enum pid_type type;
180 int nr = -1;
181
182 pid = kmem_cache_alloc(pid_cachep, GFP_KERNEL);
183 if (!pid)
184 goto out;
185
186 nr = alloc_pidmap();
187 if (nr < 0)
188 goto out_free;
189
190 atomic_set(&pid->count, 1);
191 pid->nr = nr;
192 for (type = 0; type < PIDTYPE_MAX; ++type)
193 INIT_HLIST_HEAD(&pid->tasks[type]);
194
195 spin_lock_irq(&pidmap_lock);
196 hlist_add_head_rcu(&pid->pid_chain, &pid_hash[pid_hashfn(pid->nr)]);
197 spin_unlock_irq(&pidmap_lock);
198
199out:
200 return pid;
201
202out_free:
203 kmem_cache_free(pid_cachep, pid);
204 pid = NULL;
205 goto out;
206}
207
208struct pid * fastcall find_pid(int nr)
135{ 209{
136 struct hlist_node *elem; 210 struct hlist_node *elem;
137 struct pid *pid; 211 struct pid *pid;
138 212
139 hlist_for_each_entry_rcu(pid, elem, 213 hlist_for_each_entry_rcu(pid, elem,
140 &pid_hash[type][pid_hashfn(nr)], pid_chain) { 214 &pid_hash[pid_hashfn(nr)], pid_chain) {
141 if (pid->nr == nr) 215 if (pid->nr == nr)
142 return pid; 216 return pid;
143 } 217 }
@@ -146,77 +220,82 @@ struct pid * fastcall find_pid(enum pid_type type, int nr)
146 220
147int fastcall attach_pid(task_t *task, enum pid_type type, int nr) 221int fastcall attach_pid(task_t *task, enum pid_type type, int nr)
148{ 222{
149 struct pid *pid, *task_pid; 223 struct pid_link *link;
150 224 struct pid *pid;
151 task_pid = &task->pids[type]; 225
152 pid = find_pid(type, nr); 226 WARN_ON(!task->pid); /* to be removed soon */
153 task_pid->nr = nr; 227 WARN_ON(!nr); /* to be removed soon */
154 if (pid == NULL) { 228
155 INIT_LIST_HEAD(&task_pid->pid_list); 229 link = &task->pids[type];
156 hlist_add_head_rcu(&task_pid->pid_chain, 230 link->pid = pid = find_pid(nr);
157 &pid_hash[type][pid_hashfn(nr)]); 231 hlist_add_head_rcu(&link->node, &pid->tasks[type]);
158 } else {
159 INIT_HLIST_NODE(&task_pid->pid_chain);
160 list_add_tail_rcu(&task_pid->pid_list, &pid->pid_list);
161 }
162 232
163 return 0; 233 return 0;
164} 234}
165 235
166static fastcall int __detach_pid(task_t *task, enum pid_type type) 236void fastcall detach_pid(task_t *task, enum pid_type type)
167{ 237{
168 struct pid *pid, *pid_next; 238 struct pid_link *link;
169 int nr = 0; 239 struct pid *pid;
240 int tmp;
170 241
171 pid = &task->pids[type]; 242 link = &task->pids[type];
172 if (!hlist_unhashed(&pid->pid_chain)) { 243 pid = link->pid;
173 244
174 if (list_empty(&pid->pid_list)) { 245 hlist_del_rcu(&link->node);
175 nr = pid->nr; 246 link->pid = NULL;
176 hlist_del_rcu(&pid->pid_chain);
177 } else {
178 pid_next = list_entry(pid->pid_list.next,
179 struct pid, pid_list);
180 /* insert next pid from pid_list to hash */
181 hlist_replace_rcu(&pid->pid_chain,
182 &pid_next->pid_chain);
183 }
184 }
185 247
186 list_del_rcu(&pid->pid_list); 248 for (tmp = PIDTYPE_MAX; --tmp >= 0; )
187 pid->nr = 0; 249 if (!hlist_empty(&pid->tasks[tmp]))
250 return;
188 251
189 return nr; 252 free_pid(pid);
190} 253}
191 254
192void fastcall detach_pid(task_t *task, enum pid_type type) 255struct task_struct * fastcall pid_task(struct pid *pid, enum pid_type type)
193{ 256{
194 int tmp, nr; 257 struct task_struct *result = NULL;
258 if (pid) {
259 struct hlist_node *first;
260 first = rcu_dereference(pid->tasks[type].first);
261 if (first)
262 result = hlist_entry(first, struct task_struct, pids[(type)].node);
263 }
264 return result;
265}
195 266
196 nr = __detach_pid(task, type); 267/*
197 if (!nr) 268 * Must be called under rcu_read_lock() or with tasklist_lock read-held.
198 return; 269 */
270task_t *find_task_by_pid_type(int type, int nr)
271{
272 return pid_task(find_pid(nr), type);
273}
199 274
200 for (tmp = PIDTYPE_MAX; --tmp >= 0; ) 275EXPORT_SYMBOL(find_task_by_pid_type);
201 if (tmp != type && find_pid(tmp, nr))
202 return;
203 276
204 free_pidmap(nr); 277struct task_struct *fastcall get_pid_task(struct pid *pid, enum pid_type type)
278{
279 struct task_struct *result;
280 rcu_read_lock();
281 result = pid_task(pid, type);
282 if (result)
283 get_task_struct(result);
284 rcu_read_unlock();
285 return result;
205} 286}
206 287
207task_t *find_task_by_pid_type(int type, int nr) 288struct pid *find_get_pid(pid_t nr)
208{ 289{
209 struct pid *pid; 290 struct pid *pid;
210 291
211 pid = find_pid(type, nr); 292 rcu_read_lock();
212 if (!pid) 293 pid = get_pid(find_pid(nr));
213 return NULL; 294 rcu_read_unlock();
214 295
215 return pid_task(&pid->pid_list, type); 296 return pid;
216} 297}
217 298
218EXPORT_SYMBOL(find_task_by_pid_type);
219
220/* 299/*
221 * The pid hash table is scaled according to the amount of memory in the 300 * The pid hash table is scaled according to the amount of memory in the
222 * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or 301 * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or
@@ -224,7 +303,7 @@ EXPORT_SYMBOL(find_task_by_pid_type);
224 */ 303 */
225void __init pidhash_init(void) 304void __init pidhash_init(void)
226{ 305{
227 int i, j, pidhash_size; 306 int i, pidhash_size;
228 unsigned long megabytes = nr_kernel_pages >> (20 - PAGE_SHIFT); 307 unsigned long megabytes = nr_kernel_pages >> (20 - PAGE_SHIFT);
229 308
230 pidhash_shift = max(4, fls(megabytes * 4)); 309 pidhash_shift = max(4, fls(megabytes * 4));
@@ -233,16 +312,13 @@ void __init pidhash_init(void)
233 312
234 printk("PID hash table entries: %d (order: %d, %Zd bytes)\n", 313 printk("PID hash table entries: %d (order: %d, %Zd bytes)\n",
235 pidhash_size, pidhash_shift, 314 pidhash_size, pidhash_shift,
236 PIDTYPE_MAX * pidhash_size * sizeof(struct hlist_head)); 315 pidhash_size * sizeof(struct hlist_head));
237 316
238 for (i = 0; i < PIDTYPE_MAX; i++) { 317 pid_hash = alloc_bootmem(pidhash_size * sizeof(*(pid_hash)));
239 pid_hash[i] = alloc_bootmem(pidhash_size * 318 if (!pid_hash)
240 sizeof(*(pid_hash[i]))); 319 panic("Could not alloc pidhash!\n");
241 if (!pid_hash[i]) 320 for (i = 0; i < pidhash_size; i++)
242 panic("Could not alloc pidhash!\n"); 321 INIT_HLIST_HEAD(&pid_hash[i]);
243 for (j = 0; j < pidhash_size; j++)
244 INIT_HLIST_HEAD(&pid_hash[i][j]);
245 }
246} 322}
247 323
248void __init pidmap_init(void) 324void __init pidmap_init(void)
@@ -251,4 +327,8 @@ void __init pidmap_init(void)
251 /* Reserve PID 0. We never call free_pidmap(0) */ 327 /* Reserve PID 0. We never call free_pidmap(0) */
252 set_bit(0, pidmap_array->page); 328 set_bit(0, pidmap_array->page);
253 atomic_dec(&pidmap_array->nr_free); 329 atomic_dec(&pidmap_array->nr_free);
330
331 pid_cachep = kmem_cache_create("pid", sizeof(struct pid),
332 __alignof__(struct pid),
333 SLAB_PANIC, NULL, NULL);
254} 334}