-rw-r--r--   include/linux/pid.h     96
-rw-r--r--   include/linux/sched.h    4
-rw-r--r--   kernel/fork.c           16
-rw-r--r--   kernel/pid.c           212
4 files changed, 238 insertions(+), 90 deletions(-)
diff --git a/include/linux/pid.h b/include/linux/pid.h
index 5b9082cc600f..29960b03bef7 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -1,6 +1,8 @@
 #ifndef _LINUX_PID_H
 #define _LINUX_PID_H
 
+#include <linux/rcupdate.h>
+
 enum pid_type
 {
 	PIDTYPE_PID,
@@ -9,45 +11,109 @@ enum pid_type
 	PIDTYPE_MAX
 };
 
+/*
+ * What is struct pid?
+ *
+ * A struct pid is the kernel's internal notion of a process identifier.
+ * It refers to individual tasks, process groups, and sessions.  While
+ * there are processes attached to it the struct pid lives in a hash
+ * table, so it and then the processes that it refers to can be found
+ * quickly from the numeric pid value.  The attached processes may be
+ * quickly accessed by following pointers from struct pid.
+ *
+ * Storing pid_t values in the kernel and referring to them later has a
+ * problem.  The process originally with that pid may have exited and the
+ * pid allocator wrapped, and another process could have come along
+ * and been assigned that pid.
+ *
+ * Referring to user space processes by holding a reference to struct
+ * task_struct has a problem.  When the user space process exits
+ * the now useless task_struct is still kept.  A task_struct plus a
+ * stack consumes around 10K of low kernel memory.  More precisely
+ * this is THREAD_SIZE + sizeof(struct task_struct).  By comparison
+ * a struct pid is about 64 bytes.
+ *
+ * Holding a reference to struct pid solves both of these problems.
+ * It is small so holding a reference does not consume a lot of
+ * resources, and since a new struct pid is allocated when the numeric
+ * pid value is reused, we don't mistakenly refer to new processes.
+ */
+
 struct pid
 {
+	atomic_t count;
 	/* Try to keep pid_chain in the same cacheline as nr for find_pid */
 	int nr;
 	struct hlist_node pid_chain;
-	/* list of pids with the same nr, only one of them is in the hash */
-	struct list_head pid_list;
+	/* lists of tasks that use this pid */
+	struct hlist_head tasks[PIDTYPE_MAX];
+	struct rcu_head rcu;
 };
 
-#define pid_task(elem, type) \
-	list_entry(elem, struct task_struct, pids[type].pid_list)
+struct pid_link
+{
+	struct hlist_node node;
+	struct pid *pid;
+};
+
+static inline struct pid *get_pid(struct pid *pid)
+{
+	if (pid)
+		atomic_inc(&pid->count);
+	return pid;
+}
+
+extern void FASTCALL(put_pid(struct pid *pid));
+extern struct task_struct *FASTCALL(pid_task(struct pid *pid, enum pid_type));
+extern struct task_struct *FASTCALL(get_pid_task(struct pid *pid,
+						enum pid_type));
 
 /*
  * attach_pid() and detach_pid() must be called with the tasklist_lock
  * write-held.
  */
-extern int FASTCALL(attach_pid(struct task_struct *task, enum pid_type type, int nr));
+extern int FASTCALL(attach_pid(struct task_struct *task,
+				enum pid_type type, int nr));
 
 extern void FASTCALL(detach_pid(struct task_struct *task, enum pid_type));
 
 /*
  * look up a PID in the hash table. Must be called with the tasklist_lock
- * held.
+ * or rcu_read_lock() held.
+ */
+extern struct pid *FASTCALL(find_pid(int nr));
+
+/*
+ * Look up a PID in the hash table, and return with its count elevated.
  */
-extern struct pid *FASTCALL(find_pid(enum pid_type, int));
+extern struct pid *find_get_pid(int nr);
 
-extern int alloc_pidmap(void);
-extern void FASTCALL(free_pidmap(int));
+extern struct pid *alloc_pid(void);
+extern void FASTCALL(free_pid(struct pid *pid));
 
+#define pid_next(task, type)					\
+	((task)->pids[(type)].node.next)
+
+#define pid_next_task(task, type)				\
+	hlist_entry(pid_next(task, type), struct task_struct,	\
+			pids[(type)].node)
+
+
+/* We could use hlist_for_each_entry_rcu here but it takes more arguments
+ * than do_each_task_pid/while_each_task_pid, so we roll our own
+ * to preserve the existing interface.
+ */
 #define do_each_task_pid(who, type, task)				\
 	if ((task = find_task_by_pid_type(type, who))) {		\
-		prefetch((task)->pids[type].pid_list.next);		\
+		prefetch(pid_next(task, type));				\
 		do {
 
 #define while_each_task_pid(who, type, task)				\
-	} while (task = pid_task((task)->pids[type].pid_list.next,	\
-				type),					\
-		prefetch((task)->pids[type].pid_list.next),		\
-		hlist_unhashed(&(task)->pids[type].pid_chain));		\
-	}								\
+	} while (pid_next(task, type) && ({				\
+		task = pid_next_task(task, type);			\
+		rcu_dereference(task);					\
+		prefetch(pid_next(task, type));				\
+		1; }) );						\
+	}
 
 #endif /* _LINUX_PID_H */
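
Usage sketch (not part of this patch): the reworked do_each_task_pid/while_each_task_pid pair keeps its old calling convention, so existing iteration sites need no source changes. A minimal example of the intended pattern, with a hypothetical example_count_pgrp() helper; per the comments above, the walk must run under tasklist_lock or rcu_read_lock():

/* Sketch: count the members of a process group (hypothetical helper). */
static int example_count_pgrp(int pgrp_nr)
{
	struct task_struct *p;
	int count = 0;

	read_lock(&tasklist_lock);
	do_each_task_pid(pgrp_nr, PIDTYPE_PGID, p) {
		count++;		/* p is a live group member here */
	} while_each_task_pid(pgrp_nr, PIDTYPE_PGID, p);
	read_unlock(&tasklist_lock);

	return count;
}
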
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 7e0ff5dba986..541f4828f5e7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -760,7 +760,7 @@ struct task_struct {
 	struct task_struct *group_leader;	/* threadgroup leader */
 
 	/* PID/PID hash table linkage. */
-	struct pid pids[PIDTYPE_MAX];
+	struct pid_link pids[PIDTYPE_MAX];
 	struct list_head thread_group;
 
 	struct completion *vfork_done;		/* for vfork() */
@@ -899,7 +899,7 @@ static inline pid_t process_group(struct task_struct *tsk)
  */
 static inline int pid_alive(struct task_struct *p)
 {
-	return p->pids[PIDTYPE_PID].nr != 0;
+	return p->pids[PIDTYPE_PID].pid != NULL;
 }
 
 extern void free_task(struct task_struct *tsk);
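
Usage sketch (not part of this patch; the example_* names are hypothetical): with pids[] now holding struct pid_link entries, code that previously cached a pid_t can pin the struct pid itself and never confuse a recycled number with the original task:

/* Sketch: pin and later release a task's session identity. */
static struct pid *example_session;	/* hypothetical module state */

static void example_pin_session(struct task_struct *task)
{
	/*
	 * Caller must prevent a concurrent detach, e.g. task == current
	 * or tasklist_lock held; get_pid() is NULL-safe.
	 */
	example_session = get_pid(task->pids[PIDTYPE_SID].pid);
}

static void example_unpin_session(void)
{
	put_pid(example_session);	/* NULL-safe */
	example_session = NULL;
}
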
diff --git a/kernel/fork.c b/kernel/fork.c
index b1341205be27..03975d0467f9 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1315,17 +1315,19 @@ long do_fork(unsigned long clone_flags,
 {
 	struct task_struct *p;
 	int trace = 0;
-	long pid = alloc_pidmap();
+	struct pid *pid = alloc_pid();
+	long nr;
 
-	if (pid < 0)
+	if (!pid)
 		return -EAGAIN;
+	nr = pid->nr;
 	if (unlikely(current->ptrace)) {
 		trace = fork_traceflag (clone_flags);
 		if (trace)
 			clone_flags |= CLONE_PTRACE;
 	}
 
-	p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, pid);
+	p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, nr);
 	/*
 	 * Do this prior waking up the new thread - the thread pointer
 	 * might get invalid after that point, if the thread exits quickly.
@@ -1352,7 +1354,7 @@ long do_fork(unsigned long clone_flags,
 		p->state = TASK_STOPPED;
 
 	if (unlikely (trace)) {
-		current->ptrace_message = pid;
+		current->ptrace_message = nr;
 		ptrace_notify ((trace << 8) | SIGTRAP);
 	}
 
@@ -1362,10 +1364,10 @@ long do_fork(unsigned long clone_flags,
 			ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP);
 		}
 	} else {
-		free_pidmap(pid);
-		pid = PTR_ERR(p);
+		free_pid(pid);
+		nr = PTR_ERR(p);
 	}
-	return pid;
+	return nr;
 }
 
 #ifndef ARCH_MIN_MMSTRUCT_ALIGN
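
Ownership sketch (not part of this patch): alloc_pid() returns a hashed struct pid with count == 1. Until a task is attached, the allocating path owns it and must call free_pid() on failure, exactly as the error branch above does; once attached, the final detach_pid() frees it instead. Compressed into a hypothetical example_spawn_nr():

/* Sketch of do_fork()'s new pid ownership rule (hypothetical helper). */
static long example_spawn_nr(int fail)
{
	struct pid *pid = alloc_pid();
	long nr;

	if (!pid)
		return -EAGAIN;
	nr = pid->nr;		/* the numeric id handed to copy_process() */

	if (fail) {		/* stands in for copy_process() failing */
		free_pid(pid);	/* nothing attached yet: unhash and recycle */
		return -ECHILD;
	}
	return nr;		/* on success the last detach_pid() frees it */
}
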
diff --git a/kernel/pid.c b/kernel/pid.c
index a9f2dfd006d2..eeb836b65ca4 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -28,8 +28,9 @@
 #include <linux/hash.h>
 
 #define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift)
-static struct hlist_head *pid_hash[PIDTYPE_MAX];
+static struct hlist_head *pid_hash;
 static int pidhash_shift;
+static kmem_cache_t *pid_cachep;
 
 int pid_max = PID_MAX_DEFAULT;
 int last_pid;
@@ -60,9 +61,22 @@ typedef struct pidmap {
 static pidmap_t pidmap_array[PIDMAP_ENTRIES] =
 	 { [ 0 ... PIDMAP_ENTRIES-1 ] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } };
 
+/*
+ * Note: disable interrupts while the pidmap_lock is held as an
+ * interrupt might come in and do read_lock(&tasklist_lock).
+ *
+ * If we don't disable interrupts there is a nasty deadlock between
+ * detach_pid()->free_pid() and another cpu that does
+ * spin_lock(&pidmap_lock) followed by an interrupt routine that does
+ * read_lock(&tasklist_lock);
+ *
+ * After we clean up the tasklist_lock and know there are no
+ * irq handlers that take it we can leave the interrupts enabled.
+ * For now it is easier to be safe than to prove it can't happen.
+ */
 static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
 
-fastcall void free_pidmap(int pid)
+static fastcall void free_pidmap(int pid)
 {
 	pidmap_t *map = pidmap_array + pid / BITS_PER_PAGE;
 	int offset = pid & BITS_PER_PAGE_MASK;
@@ -71,7 +85,7 @@ fastcall void free_pidmap(int pid)
 	atomic_inc(&map->nr_free);
 }
 
-int alloc_pidmap(void)
+static int alloc_pidmap(void)
 {
 	int i, offset, max_scan, pid, last = last_pid;
 	pidmap_t *map;
@@ -89,12 +103,12 @@ int alloc_pidmap(void)
 			 * Free the page if someone raced with us
 			 * installing it:
 			 */
-			spin_lock(&pidmap_lock);
+			spin_lock_irq(&pidmap_lock);
 			if (map->page)
 				free_page(page);
 			else
 				map->page = (void *)page;
-			spin_unlock(&pidmap_lock);
+			spin_unlock_irq(&pidmap_lock);
 			if (unlikely(!map->page))
 				break;
 		}
@@ -131,13 +145,73 @@ int alloc_pidmap(void)
 	return -1;
 }
 
-struct pid * fastcall find_pid(enum pid_type type, int nr)
+fastcall void put_pid(struct pid *pid)
+{
+	if (!pid)
+		return;
+	if ((atomic_read(&pid->count) == 1) ||
+	     atomic_dec_and_test(&pid->count))
+		kmem_cache_free(pid_cachep, pid);
+}
+
+static void delayed_put_pid(struct rcu_head *rhp)
+{
+	struct pid *pid = container_of(rhp, struct pid, rcu);
+	put_pid(pid);
+}
+
+fastcall void free_pid(struct pid *pid)
+{
+	/* We can be called with write_lock_irq(&tasklist_lock) held */
+	unsigned long flags;
+
+	spin_lock_irqsave(&pidmap_lock, flags);
+	hlist_del_rcu(&pid->pid_chain);
+	spin_unlock_irqrestore(&pidmap_lock, flags);
+
+	free_pidmap(pid->nr);
+	call_rcu(&pid->rcu, delayed_put_pid);
+}
+
+struct pid *alloc_pid(void)
+{
+	struct pid *pid;
+	enum pid_type type;
+	int nr = -1;
+
+	pid = kmem_cache_alloc(pid_cachep, GFP_KERNEL);
+	if (!pid)
+		goto out;
+
+	nr = alloc_pidmap();
+	if (nr < 0)
+		goto out_free;
+
+	atomic_set(&pid->count, 1);
+	pid->nr = nr;
+	for (type = 0; type < PIDTYPE_MAX; ++type)
+		INIT_HLIST_HEAD(&pid->tasks[type]);
+
+	spin_lock_irq(&pidmap_lock);
+	hlist_add_head_rcu(&pid->pid_chain, &pid_hash[pid_hashfn(pid->nr)]);
+	spin_unlock_irq(&pidmap_lock);
+
+out:
+	return pid;
+
+out_free:
+	kmem_cache_free(pid_cachep, pid);
+	pid = NULL;
+	goto out;
+}
+
+struct pid * fastcall find_pid(int nr)
 {
 	struct hlist_node *elem;
 	struct pid *pid;
 
 	hlist_for_each_entry_rcu(pid, elem,
-			&pid_hash[type][pid_hashfn(nr)], pid_chain) {
+			&pid_hash[pid_hashfn(nr)], pid_chain) {
 		if (pid->nr == nr)
 			return pid;
 	}
@@ -146,77 +220,82 @@ struct pid * fastcall find_pid(enum pid_type type, int nr)
 
 int fastcall attach_pid(task_t *task, enum pid_type type, int nr)
 {
-	struct pid *pid, *task_pid;
-
-	task_pid = &task->pids[type];
-	pid = find_pid(type, nr);
-	task_pid->nr = nr;
-	if (pid == NULL) {
-		INIT_LIST_HEAD(&task_pid->pid_list);
-		hlist_add_head_rcu(&task_pid->pid_chain,
-				&pid_hash[type][pid_hashfn(nr)]);
-	} else {
-		INIT_HLIST_NODE(&task_pid->pid_chain);
-		list_add_tail_rcu(&task_pid->pid_list, &pid->pid_list);
-	}
+	struct pid_link *link;
+	struct pid *pid;
+
+	WARN_ON(!task->pid); /* to be removed soon */
+	WARN_ON(!nr); /* to be removed soon */
+
+	link = &task->pids[type];
+	link->pid = pid = find_pid(nr);
+	hlist_add_head_rcu(&link->node, &pid->tasks[type]);
 
 	return 0;
 }
 
-static fastcall int __detach_pid(task_t *task, enum pid_type type)
+void fastcall detach_pid(task_t *task, enum pid_type type)
 {
-	struct pid *pid, *pid_next;
-	int nr = 0;
+	struct pid_link *link;
+	struct pid *pid;
+	int tmp;
 
-	pid = &task->pids[type];
-	if (!hlist_unhashed(&pid->pid_chain)) {
+	link = &task->pids[type];
+	pid = link->pid;
 
-		if (list_empty(&pid->pid_list)) {
-			nr = pid->nr;
-			hlist_del_rcu(&pid->pid_chain);
-		} else {
-			pid_next = list_entry(pid->pid_list.next,
-						struct pid, pid_list);
-			/* insert next pid from pid_list to hash */
-			hlist_replace_rcu(&pid->pid_chain,
-					&pid_next->pid_chain);
-		}
-	}
+	hlist_del_rcu(&link->node);
+	link->pid = NULL;
 
-	list_del_rcu(&pid->pid_list);
-	pid->nr = 0;
+	for (tmp = PIDTYPE_MAX; --tmp >= 0; )
+		if (!hlist_empty(&pid->tasks[tmp]))
+			return;
 
-	return nr;
+	free_pid(pid);
 }
 
-void fastcall detach_pid(task_t *task, enum pid_type type)
+struct task_struct * fastcall pid_task(struct pid *pid, enum pid_type type)
 {
-	int tmp, nr;
+	struct task_struct *result = NULL;
+	if (pid) {
+		struct hlist_node *first;
+		first = rcu_dereference(pid->tasks[type].first);
+		if (first)
+			result = hlist_entry(first, struct task_struct, pids[(type)].node);
+	}
+	return result;
+}
 
-	nr = __detach_pid(task, type);
-	if (!nr)
-		return;
+/*
+ * Must be called under rcu_read_lock() or with tasklist_lock read-held.
+ */
+task_t *find_task_by_pid_type(int type, int nr)
+{
+	return pid_task(find_pid(nr), type);
+}
 
-	for (tmp = PIDTYPE_MAX; --tmp >= 0; )
-		if (tmp != type && find_pid(tmp, nr))
-			return;
+EXPORT_SYMBOL(find_task_by_pid_type);
 
-	free_pidmap(nr);
+struct task_struct *fastcall get_pid_task(struct pid *pid, enum pid_type type)
+{
+	struct task_struct *result;
+	rcu_read_lock();
+	result = pid_task(pid, type);
+	if (result)
+		get_task_struct(result);
+	rcu_read_unlock();
+	return result;
 }
 
-task_t *find_task_by_pid_type(int type, int nr)
+struct pid *find_get_pid(pid_t nr)
 {
 	struct pid *pid;
 
-	pid = find_pid(type, nr);
-	if (!pid)
-		return NULL;
+	rcu_read_lock();
+	pid = get_pid(find_pid(nr));
+	rcu_read_unlock();
 
-	return pid_task(&pid->pid_list, type);
+	return pid;
 }
 
-EXPORT_SYMBOL(find_task_by_pid_type);
-
 /*
  * The pid hash table is scaled according to the amount of memory in the
  * machine.  From a minimum of 16 slots up to 4096 slots at one gigabyte or
@@ -224,7 +303,7 @@ EXPORT_SYMBOL(find_task_by_pid_type);
  */
 void __init pidhash_init(void)
 {
-	int i, j, pidhash_size;
+	int i, pidhash_size;
 	unsigned long megabytes = nr_kernel_pages >> (20 - PAGE_SHIFT);
 
 	pidhash_shift = max(4, fls(megabytes * 4));
@@ -233,16 +312,13 @@ void __init pidhash_init(void)
 
 	printk("PID hash table entries: %d (order: %d, %Zd bytes)\n",
 		pidhash_size, pidhash_shift,
-		PIDTYPE_MAX * pidhash_size * sizeof(struct hlist_head));
+		pidhash_size * sizeof(struct hlist_head));
 
-	for (i = 0; i < PIDTYPE_MAX; i++) {
-		pid_hash[i] = alloc_bootmem(pidhash_size *
-					sizeof(*(pid_hash[i])));
-		if (!pid_hash[i])
-			panic("Could not alloc pidhash!\n");
-		for (j = 0; j < pidhash_size; j++)
-			INIT_HLIST_HEAD(&pid_hash[i][j]);
-	}
+	pid_hash = alloc_bootmem(pidhash_size * sizeof(*(pid_hash)));
+	if (!pid_hash)
+		panic("Could not alloc pidhash!\n");
+	for (i = 0; i < pidhash_size; i++)
+		INIT_HLIST_HEAD(&pid_hash[i]);
 }
 
 void __init pidmap_init(void)
@@ -251,4 +327,8 @@ void __init pidmap_init(void)
 	/* Reserve PID 0. We never call free_pidmap(0) */
 	set_bit(0, pidmap_array->page);
 	atomic_dec(&pidmap_array->nr_free);
+
+	pid_cachep = kmem_cache_create("pid", sizeof(struct pid),
+					__alignof__(struct pid),
+					SLAB_PANIC, NULL, NULL);
 }
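
Lookup sketch (not part of this patch; example_lookup() is hypothetical): a bare find_pid() result is only stable inside rcu_read_lock(), while find_get_pid() and get_pid_task() return counted references that remain valid after the RCU section ends:

/* Sketch: from a numeric pid to references that outlive rcu_read_unlock(). */
static void example_lookup(int nr)
{
	struct pid *pid;
	struct task_struct *task;

	pid = find_get_pid(nr);			/* counted; may be NULL */
	task = get_pid_task(pid, PIDTYPE_PID);	/* counted task; may be NULL */
	if (task) {
		/* ... use task; it cannot be freed under us ... */
		put_task_struct(task);
	}
	put_pid(pid);				/* NULL-safe */
}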