aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAndrew Morton <akpm@osdl.org>2006-12-10 05:19:19 -0500
committerLinus Torvalds <torvalds@woody.osdl.org>2006-12-10 12:55:41 -0500
commit7c3ab7381e79dfc7db14a67c6f4f3285664e1ec2 (patch)
treede5d63d17e400eb06b26c88adfd2ef2cf290898e
parent47694bb86af3648d4ec34c7afd46653cefc9b359 (diff)
[PATCH] io-accounting: core statistics
The present per-task IO accounting isn't very useful. It simply counts the number of bytes passed into read() and write(). So if a process reads 1MB from an already-cached file, it is accused of having performed 1MB of I/O, which is wrong. (David Wright had some comments on the applicability of the present logical IO accounting: For billing purposes it is useless but for workload analysis it is very useful read_bytes/read_calls average read request size write_bytes/write_calls average write request size read_bytes/read_blocks ie logical/physical can indicate hit rate or thrashing write_bytes/write_blocks ie logical/physical guess since pdflush writes can be missed I often look for logical larger than physical to see filesystem cache problems. And the bytes/cpusec can help find applications that are dominating the cache and causing slow interactive response from page cache contention. I want to find the IO intensive applications and make sure they are doing efficient IO. Thus the acctcms(sysV) or csacms command would give the high IO commands). This patchset adds new accounting which tries to be more accurate. We account for three things: reads: attempt to count the number of bytes which this process really did cause to be fetched from the storage layer. Done at the submit_bio() level, so it is accurate for block-backed filesystems. I also attempt to wire up NFS and CIFS. writes: attempt to count the number of bytes which this process caused to be sent to the storage layer. This is done at page-dirtying time. The big inaccuracy here is truncate. If a process writes 1MB to a file and then deletes the file, it will in fact perform no writeout. But it will have been accounted as having caused 1MB of write. So... cancelled_writes: account the number of bytes which this process caused to not happen, by truncating pagecache. We _could_ just subtract this from the process's `write' accounting. But that means that some processes would be reported to have done negative amounts of write IO, which is silly. So we just report the raw number and punt this decision up to userspace. Now, we _could_ account for writes at the physical I/O level. But - This would require that we track memory-dirtying tasks at the per-page level (would require a new pointer in struct page). - It would mean that IO statistics for a process are usually only available long after that process has exitted. Which means that we probably cannot communicate this info via taskstats. This patch: Wire up the kernel-private data structures and the accessor functions to manipulate them. Cc: Jay Lan <jlan@sgi.com> Cc: Shailabh Nagar <nagar@watson.ibm.com> Cc: Balbir Singh <balbir@in.ibm.com> Cc: Chris Sturtivant <csturtiv@sgi.com> Cc: Tony Ernst <tee@sgi.com> Cc: Guillaume Thouvenin <guillaume.thouvenin@bull.net> Cc: David Wright <daw@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r--include/linux/sched.h2
-rw-r--r--include/linux/task_io_accounting.h37
-rw-r--r--include/linux/task_io_accounting_ops.h47
-rw-r--r--init/Kconfig9
-rw-r--r--kernel/fork.c2
5 files changed, 97 insertions, 0 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ad9c46071ff8..1208feab46e0 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -82,6 +82,7 @@ struct sched_param {
82#include <linux/resource.h> 82#include <linux/resource.h>
83#include <linux/timer.h> 83#include <linux/timer.h>
84#include <linux/hrtimer.h> 84#include <linux/hrtimer.h>
85#include <linux/task_io_accounting.h>
85 86
86#include <asm/processor.h> 87#include <asm/processor.h>
87 88
@@ -1013,6 +1014,7 @@ struct task_struct {
1013 wait_queue_t *io_wait; 1014 wait_queue_t *io_wait;
1014/* i/o counters(bytes read/written, #syscalls */ 1015/* i/o counters(bytes read/written, #syscalls */
1015 u64 rchar, wchar, syscr, syscw; 1016 u64 rchar, wchar, syscr, syscw;
1017 struct task_io_accounting ioac;
1016#if defined(CONFIG_TASK_XACCT) 1018#if defined(CONFIG_TASK_XACCT)
1017 u64 acct_rss_mem1; /* accumulated rss usage */ 1019 u64 acct_rss_mem1; /* accumulated rss usage */
1018 u64 acct_vm_mem1; /* accumulated virtual memory usage */ 1020 u64 acct_vm_mem1; /* accumulated virtual memory usage */
diff --git a/include/linux/task_io_accounting.h b/include/linux/task_io_accounting.h
new file mode 100644
index 000000000000..44d00e9cceea
--- /dev/null
+++ b/include/linux/task_io_accounting.h
@@ -0,0 +1,37 @@
1/*
2 * task_io_accounting: a structure which is used for recording a single task's
3 * IO statistics.
4 *
5 * Don't include this header file directly - it is designed to be dragged in via
6 * sched.h.
7 *
8 * Blame akpm@osdl.org for all this.
9 */
10
11#ifdef CONFIG_TASK_IO_ACCOUNTING
12struct task_io_accounting {
13 /*
14 * The number of bytes which this task has caused to be read from
15 * storage.
16 */
17 u64 read_bytes;
18
19 /*
20 * The number of bytes which this task has caused, or shall cause to be
21 * written to disk.
22 */
23 u64 write_bytes;
24
25 /*
26 * A task can cause "negative" IO too. If this task truncates some
27 * dirty pagecache, some IO which another task has been accounted for
28 * (in its write_bytes) will not be happening. We _could_ just
29 * subtract that from the truncating task's write_bytes, but there is
30 * information loss in doing that.
31 */
32 u64 cancelled_write_bytes;
33};
34#else
35struct task_io_accounting {
36};
37#endif
diff --git a/include/linux/task_io_accounting_ops.h b/include/linux/task_io_accounting_ops.h
new file mode 100644
index 000000000000..df2a319106b2
--- /dev/null
+++ b/include/linux/task_io_accounting_ops.h
@@ -0,0 +1,47 @@
1/*
2 * Task I/O accounting operations
3 */
4#ifndef __TASK_IO_ACCOUNTING_OPS_INCLUDED
5#define __TASK_IO_ACCOUNTING_OPS_INCLUDED
6
7#ifdef CONFIG_TASK_IO_ACCOUNTING
8static inline void task_io_account_read(size_t bytes)
9{
10 current->ioac.read_bytes += bytes;
11}
12
13static inline void task_io_account_write(size_t bytes)
14{
15 current->ioac.write_bytes += bytes;
16}
17
18static inline void task_io_account_cancelled_write(size_t bytes)
19{
20 current->ioac.cancelled_write_bytes += bytes;
21}
22
23static inline void task_io_accounting_init(struct task_struct *tsk)
24{
25 memset(&tsk->ioac, 0, sizeof(tsk->ioac));
26}
27
28#else
29
30static inline void task_io_account_read(size_t bytes)
31{
32}
33
34static inline void task_io_account_write(size_t bytes)
35{
36}
37
38static inline void task_io_account_cancelled_write(size_t bytes)
39{
40}
41
42static inline void task_io_accounting_init(struct task_struct *tsk)
43{
44}
45
46#endif /* CONFIG_TASK_IO_ACCOUNTING */
47#endif /* __TASK_IO_ACCOUNTING_OPS_INCLUDED */
diff --git a/init/Kconfig b/init/Kconfig
index 14d484606fab..9edf103b3ec3 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -304,6 +304,15 @@ config TASK_XACCT
304 304
305 Say N if unsure. 305 Say N if unsure.
306 306
307config TASK_IO_ACCOUNTING
308 bool "Enable per-task storage I/O accounting (EXPERIMENTAL)"
309 depends on TASK_XACCT
310 help
311 Collect information on the number of bytes of storage I/O which this
312 task has caused.
313
314 Say N if unsure.
315
307config SYSCTL 316config SYSCTL
308 bool 317 bool
309 318
diff --git a/kernel/fork.c b/kernel/fork.c
index 8c859eef8e6a..086e172d0d3d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -36,6 +36,7 @@
36#include <linux/syscalls.h> 36#include <linux/syscalls.h>
37#include <linux/jiffies.h> 37#include <linux/jiffies.h>
38#include <linux/futex.h> 38#include <linux/futex.h>
39#include <linux/task_io_accounting_ops.h>
39#include <linux/rcupdate.h> 40#include <linux/rcupdate.h>
40#include <linux/ptrace.h> 41#include <linux/ptrace.h>
41#include <linux/mount.h> 42#include <linux/mount.h>
@@ -1055,6 +1056,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1055 p->wchar = 0; /* I/O counter: bytes written */ 1056 p->wchar = 0; /* I/O counter: bytes written */
1056 p->syscr = 0; /* I/O counter: read syscalls */ 1057 p->syscr = 0; /* I/O counter: read syscalls */
1057 p->syscw = 0; /* I/O counter: write syscalls */ 1058 p->syscw = 0; /* I/O counter: write syscalls */
1059 task_io_accounting_init(p);
1058 acct_clear_integrals(p); 1060 acct_clear_integrals(p);
1059 1061
1060 p->it_virt_expires = cputime_zero; 1062 p->it_virt_expires = cputime_zero;