diff options
author | Eric Dumazet <eric.dumazet@gmail.com> | 2010-10-05 03:32:55 -0400 |
---|---|---|
committer | Al Viro <viro@zeniv.linux.org.uk> | 2010-10-25 21:18:20 -0400 |
commit | 7e360c38abe2c70eae3ba5a8a17f17671d8b77c5 (patch) | |
tree | 319034360c667ac704bce87b1a0856657bf67e4b | |
parent | fde214d414218fb6cace35708730986bcc94fb53 (diff) |
fs: allow for more than 2^31 files
Andrew,
Could you please review this patch? You are probably the right guy to
take it, because it crosses the fs and net trees.
Note: /proc/sys/fs/file-nr is a read-only file, so this patch doesn't
depend on the previous patch (sysctl: fix min/max handling in
__do_proc_doulongvec_minmax())
Thanks !
[PATCH V4] fs: allow for more than 2^31 files
Robin Holt tried to boot a 16TB system and found af_unix was overflowing
a 32bit value :
<quote>
We were seeing a failure which prevented boot. The kernel was incapable
of creating either a named pipe or unix domain socket. This comes down
to a common kernel function called unix_create1() which does:
atomic_inc(&unix_nr_socks);
if (atomic_read(&unix_nr_socks) > 2 * get_max_files())
goto out;
The function get_max_files() is a simple return of files_stat.max_files.
files_stat.max_files is a signed integer and is computed in
fs/file_table.c's files_init().
n = (mempages * (PAGE_SIZE / 1024)) / 10;
files_stat.max_files = n;
In our case, mempages (total_ram_pages) is approx 3,758,096,384
(0xe0000000). That leaves max_files at approximately 1,503,238,553.
This causes 2 * get_max_files() to integer overflow.
</quote>
The fix is to let /proc/sys/fs/file-nr & /proc/sys/fs/file-max use long
integers, and change af_unix to use an atomic_long_t instead of
atomic_t.
get_max_files() is changed to return an unsigned long.
get_nr_files() is changed to return a long.
unix_nr_socks is changed from atomic_t to atomic_long_t, although this is
not strictly needed to address Robin's problem.
Before patch (on a 64bit kernel) :
# echo 2147483648 >/proc/sys/fs/file-max
# cat /proc/sys/fs/file-max
-18446744071562067968
After patch:
# echo 2147483648 >/proc/sys/fs/file-max
# cat /proc/sys/fs/file-max
2147483648
# cat /proc/sys/fs/file-nr
704 0 2147483648
Reported-by: Robin Holt <holt@sgi.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Acked-by: David Miller <davem@davemloft.net>
Reviewed-by: Robin Holt <holt@sgi.com>
Tested-by: Robin Holt <holt@sgi.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
-rw-r--r-- | fs/file_table.c | 17 | ||||
-rw-r--r-- | include/linux/fs.h | 8 | ||||
-rw-r--r-- | kernel/sysctl.c | 6 | ||||
-rw-r--r-- | net/unix/af_unix.c | 14 |
4 files changed, 21 insertions, 24 deletions
diff --git a/fs/file_table.c b/fs/file_table.c index a04bdd81c11..c3dee381f1b 100644 --- a/fs/file_table.c +++ b/fs/file_table.c | |||
@@ -60,7 +60,7 @@ static inline void file_free(struct file *f) | |||
60 | /* | 60 | /* |
61 | * Return the total number of open files in the system | 61 | * Return the total number of open files in the system |
62 | */ | 62 | */ |
63 | static int get_nr_files(void) | 63 | static long get_nr_files(void) |
64 | { | 64 | { |
65 | return percpu_counter_read_positive(&nr_files); | 65 | return percpu_counter_read_positive(&nr_files); |
66 | } | 66 | } |
@@ -68,7 +68,7 @@ static int get_nr_files(void) | |||
68 | /* | 68 | /* |
69 | * Return the maximum number of open files in the system | 69 | * Return the maximum number of open files in the system |
70 | */ | 70 | */ |
71 | int get_max_files(void) | 71 | unsigned long get_max_files(void) |
72 | { | 72 | { |
73 | return files_stat.max_files; | 73 | return files_stat.max_files; |
74 | } | 74 | } |
@@ -82,7 +82,7 @@ int proc_nr_files(ctl_table *table, int write, | |||
82 | void __user *buffer, size_t *lenp, loff_t *ppos) | 82 | void __user *buffer, size_t *lenp, loff_t *ppos) |
83 | { | 83 | { |
84 | files_stat.nr_files = get_nr_files(); | 84 | files_stat.nr_files = get_nr_files(); |
85 | return proc_dointvec(table, write, buffer, lenp, ppos); | 85 | return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); |
86 | } | 86 | } |
87 | #else | 87 | #else |
88 | int proc_nr_files(ctl_table *table, int write, | 88 | int proc_nr_files(ctl_table *table, int write, |
@@ -105,7 +105,7 @@ int proc_nr_files(ctl_table *table, int write, | |||
105 | struct file *get_empty_filp(void) | 105 | struct file *get_empty_filp(void) |
106 | { | 106 | { |
107 | const struct cred *cred = current_cred(); | 107 | const struct cred *cred = current_cred(); |
108 | static int old_max; | 108 | static long old_max; |
109 | struct file * f; | 109 | struct file * f; |
110 | 110 | ||
111 | /* | 111 | /* |
@@ -140,8 +140,7 @@ struct file *get_empty_filp(void) | |||
140 | over: | 140 | over: |
141 | /* Ran out of filps - report that */ | 141 | /* Ran out of filps - report that */ |
142 | if (get_nr_files() > old_max) { | 142 | if (get_nr_files() > old_max) { |
143 | printk(KERN_INFO "VFS: file-max limit %d reached\n", | 143 | pr_info("VFS: file-max limit %lu reached\n", get_max_files()); |
144 | get_max_files()); | ||
145 | old_max = get_nr_files(); | 144 | old_max = get_nr_files(); |
146 | } | 145 | } |
147 | goto fail; | 146 | goto fail; |
@@ -487,7 +486,7 @@ retry: | |||
487 | 486 | ||
488 | void __init files_init(unsigned long mempages) | 487 | void __init files_init(unsigned long mempages) |
489 | { | 488 | { |
490 | int n; | 489 | unsigned long n; |
491 | 490 | ||
492 | filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, | 491 | filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, |
493 | SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); | 492 | SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); |
@@ -498,9 +497,7 @@ void __init files_init(unsigned long mempages) | |||
498 | */ | 497 | */ |
499 | 498 | ||
500 | n = (mempages * (PAGE_SIZE / 1024)) / 10; | 499 | n = (mempages * (PAGE_SIZE / 1024)) / 10; |
501 | files_stat.max_files = n; | 500 | files_stat.max_files = max_t(unsigned long, n, NR_FILE); |
502 | if (files_stat.max_files < NR_FILE) | ||
503 | files_stat.max_files = NR_FILE; | ||
504 | files_defer_init(); | 501 | files_defer_init(); |
505 | lg_lock_init(files_lglock); | 502 | lg_lock_init(files_lglock); |
506 | percpu_counter_init(&nr_files, 0); | 503 | percpu_counter_init(&nr_files, 0); |
diff --git a/include/linux/fs.h b/include/linux/fs.h index 0a5d8363388..0cd6821013a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h | |||
@@ -34,9 +34,9 @@ | |||
34 | 34 | ||
35 | /* And dynamically-tunable limits and defaults: */ | 35 | /* And dynamically-tunable limits and defaults: */ |
36 | struct files_stat_struct { | 36 | struct files_stat_struct { |
37 | int nr_files; /* read only */ | 37 | unsigned long nr_files; /* read only */ |
38 | int nr_free_files; /* read only */ | 38 | unsigned long nr_free_files; /* read only */ |
39 | int max_files; /* tunable */ | 39 | unsigned long max_files; /* tunable */ |
40 | }; | 40 | }; |
41 | 41 | ||
42 | struct inodes_stat_t { | 42 | struct inodes_stat_t { |
@@ -400,7 +400,7 @@ extern void __init inode_init_early(void); | |||
400 | extern void __init files_init(unsigned long); | 400 | extern void __init files_init(unsigned long); |
401 | 401 | ||
402 | extern struct files_stat_struct files_stat; | 402 | extern struct files_stat_struct files_stat; |
403 | extern int get_max_files(void); | 403 | extern unsigned long get_max_files(void); |
404 | extern int sysctl_nr_open; | 404 | extern int sysctl_nr_open; |
405 | extern struct inodes_stat_t inodes_stat; | 405 | extern struct inodes_stat_t inodes_stat; |
406 | extern int leases_enable, lease_break_time; | 406 | extern int leases_enable, lease_break_time; |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 3a45c224770..694b140852c 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -1352,16 +1352,16 @@ static struct ctl_table fs_table[] = { | |||
1352 | { | 1352 | { |
1353 | .procname = "file-nr", | 1353 | .procname = "file-nr", |
1354 | .data = &files_stat, | 1354 | .data = &files_stat, |
1355 | .maxlen = 3*sizeof(int), | 1355 | .maxlen = sizeof(files_stat), |
1356 | .mode = 0444, | 1356 | .mode = 0444, |
1357 | .proc_handler = proc_nr_files, | 1357 | .proc_handler = proc_nr_files, |
1358 | }, | 1358 | }, |
1359 | { | 1359 | { |
1360 | .procname = "file-max", | 1360 | .procname = "file-max", |
1361 | .data = &files_stat.max_files, | 1361 | .data = &files_stat.max_files, |
1362 | .maxlen = sizeof(int), | 1362 | .maxlen = sizeof(files_stat.max_files), |
1363 | .mode = 0644, | 1363 | .mode = 0644, |
1364 | .proc_handler = proc_dointvec, | 1364 | .proc_handler = proc_doulongvec_minmax, |
1365 | }, | 1365 | }, |
1366 | { | 1366 | { |
1367 | .procname = "nr_open", | 1367 | .procname = "nr_open", |
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 0ebc777a666..3c95304a081 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c | |||
@@ -117,7 +117,7 @@ | |||
117 | 117 | ||
118 | static struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1]; | 118 | static struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1]; |
119 | static DEFINE_SPINLOCK(unix_table_lock); | 119 | static DEFINE_SPINLOCK(unix_table_lock); |
120 | static atomic_t unix_nr_socks = ATOMIC_INIT(0); | 120 | static atomic_long_t unix_nr_socks; |
121 | 121 | ||
122 | #define unix_sockets_unbound (&unix_socket_table[UNIX_HASH_SIZE]) | 122 | #define unix_sockets_unbound (&unix_socket_table[UNIX_HASH_SIZE]) |
123 | 123 | ||
@@ -360,13 +360,13 @@ static void unix_sock_destructor(struct sock *sk) | |||
360 | if (u->addr) | 360 | if (u->addr) |
361 | unix_release_addr(u->addr); | 361 | unix_release_addr(u->addr); |
362 | 362 | ||
363 | atomic_dec(&unix_nr_socks); | 363 | atomic_long_dec(&unix_nr_socks); |
364 | local_bh_disable(); | 364 | local_bh_disable(); |
365 | sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); | 365 | sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); |
366 | local_bh_enable(); | 366 | local_bh_enable(); |
367 | #ifdef UNIX_REFCNT_DEBUG | 367 | #ifdef UNIX_REFCNT_DEBUG |
368 | printk(KERN_DEBUG "UNIX %p is destroyed, %d are still alive.\n", sk, | 368 | printk(KERN_DEBUG "UNIX %p is destroyed, %ld are still alive.\n", sk, |
369 | atomic_read(&unix_nr_socks)); | 369 | atomic_long_read(&unix_nr_socks)); |
370 | #endif | 370 | #endif |
371 | } | 371 | } |
372 | 372 | ||
@@ -606,8 +606,8 @@ static struct sock *unix_create1(struct net *net, struct socket *sock) | |||
606 | struct sock *sk = NULL; | 606 | struct sock *sk = NULL; |
607 | struct unix_sock *u; | 607 | struct unix_sock *u; |
608 | 608 | ||
609 | atomic_inc(&unix_nr_socks); | 609 | atomic_long_inc(&unix_nr_socks); |
610 | if (atomic_read(&unix_nr_socks) > 2 * get_max_files()) | 610 | if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) |
611 | goto out; | 611 | goto out; |
612 | 612 | ||
613 | sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto); | 613 | sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto); |
@@ -632,7 +632,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock) | |||
632 | unix_insert_socket(unix_sockets_unbound, sk); | 632 | unix_insert_socket(unix_sockets_unbound, sk); |
633 | out: | 633 | out: |
634 | if (sk == NULL) | 634 | if (sk == NULL) |
635 | atomic_dec(&unix_nr_socks); | 635 | atomic_long_dec(&unix_nr_socks); |
636 | else { | 636 | else { |
637 | local_bh_disable(); | 637 | local_bh_disable(); |
638 | sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); | 638 | sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); |