From 1cc5f7142eca352109895fe20b1fc6405dd17727 Mon Sep 17 00:00:00 2001 From: Ed Swierk Date: Mon, 25 Sep 2006 16:25:36 -0700 Subject: [PATCH] load_module: no BUG if module_subsys uninitialized Invoking load_module() before param_sysfs_init() is called crashes in mod_sysfs_setup(), since the kset in module_subsys is not initialized yet. In my case, net-pf-1 is getting modprobed as a result of hotplug trying to create a UNIX socket. Calls to hotplug begin after the topology_init initcall. Another patch for the same symptom (module_subsys-initialize-earlier.patch) moves param_sysfs_init() to the subsys initcalls, but this is still not early enough in the boot process in some cases. In particular, topology_init() causes /sbin/hotplug to run, which requests net-pf-1 (the UNIX socket protocol) which can be compiled as a module. Moving param_sysfs_init() to the postcore initcalls fixes this particular race, but there might well be other cases where a usermodehelper causes a module to load earlier still. The patch makes load_module() return an error rather than crashing the kernel if invoked before module_subsys is initialized. Cc: Mark Huang Cc: Greg KH Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/module.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 2a19cd47c046..b7fe6e840963 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1054,6 +1054,12 @@ static int mod_sysfs_setup(struct module *mod, { int err; + if (!module_subsys.kset.subsys) { + printk(KERN_ERR "%s: module_subsys not initialized\n", + mod->name); + err = -EINVAL; + goto out; + } memset(&mod->mkobj.kobj, 0, sizeof(mod->mkobj.kobj)); err = kobject_set_name(&mod->mkobj.kobj, "%s", mod->name); if (err) -- cgit v1.2.2 From 0a2966b48fb784e437520e400ddc94874ddbd4e8 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 25 Sep 2006 23:30:51 -0700 Subject: [PATCH] Fix longstanding load balancing bug in the scheduler The scheduler will stop load balancing if the most busy processor contains processes pinned via processor affinity. The scheduler currently only does one search for busiest cpu. If it cannot pull any tasks away from the busiest cpu because they were pinned then the scheduler goes into a corner and sulks leaving the idle processors idle. F.e. If you have processor 0 busy running four tasks pinned via taskset, there are none on processor 1 and one just started two processes on processor 2 then the scheduler will not move one of the two processes away from processor 2. This patch fixes that issue by forcing the scheduler to come out of its corner and retrying the load balancing by considering other processors for load balancing. This patch was originally developed by John Hawkes and discussed at http://marc.theaimsgroup.com/?l=linux-kernel&m=113901368523205&w=2. I have removed extraneous material and gone back to equipping struct rq with the cpu the queue is associated with since this makes the patch much easier and it is likely that others in the future will have the same difficulty of figuring out which processor owns which runqueue. The overhead added through these patches is a single word on the stack if the kernel is configured to support 32 cpus or less (32 bit). For 32 bit environments the maximum number of cpus that can be configued is 255 which would result in the use of 32 bytes additional on the stack. On IA64 up to 1k cpus can be configured which will result in the use of 128 additional bytes on the stack. The maximum additional cache footprint is one cacheline. Typically memory use will be much less than a cacheline and the additional cpumask will be placed on the stack in a cacheline that already contains other local variable. Signed-off-by: Christoph Lameter Cc: John Hawkes Cc: "Siddha, Suresh B" Cc: Ingo Molnar Cc: Nick Piggin Cc: Peter Williams Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 46 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index a234fbee1238..5c848fd4e461 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -238,6 +238,7 @@ struct rq { /* For active balancing */ int active_balance; int push_cpu; + int cpu; /* cpu of this runqueue */ struct task_struct *migration_thread; struct list_head migration_queue; @@ -267,6 +268,15 @@ struct rq { static DEFINE_PER_CPU(struct rq, runqueues); +static inline int cpu_of(struct rq *rq) +{ +#ifdef CONFIG_SMP + return rq->cpu; +#else + return 0; +#endif +} + /* * The domain tree (rq->sd) is protected by RCU's quiescent state transition. * See detach_destroy_domains: synchronize_sched for details. @@ -2211,7 +2221,8 @@ out: */ static struct sched_group * find_busiest_group(struct sched_domain *sd, int this_cpu, - unsigned long *imbalance, enum idle_type idle, int *sd_idle) + unsigned long *imbalance, enum idle_type idle, int *sd_idle, + cpumask_t *cpus) { struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; unsigned long max_load, avg_load, total_load, this_load, total_pwr; @@ -2248,7 +2259,12 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, sum_weighted_load = sum_nr_running = avg_load = 0; for_each_cpu_mask(i, group->cpumask) { - struct rq *rq = cpu_rq(i); + struct rq *rq; + + if (!cpu_isset(i, *cpus)) + continue; + + rq = cpu_rq(i); if (*sd_idle && !idle_cpu(i)) *sd_idle = 0; @@ -2466,13 +2482,17 @@ ret: */ static struct rq * find_busiest_queue(struct sched_group *group, enum idle_type idle, - unsigned long imbalance) + unsigned long imbalance, cpumask_t *cpus) { struct rq *busiest = NULL, *rq; unsigned long max_load = 0; int i; for_each_cpu_mask(i, group->cpumask) { + + if (!cpu_isset(i, *cpus)) + continue; + rq = cpu_rq(i); if (rq->nr_running == 1 && rq->raw_weighted_load > imbalance) @@ -2511,6 +2531,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_group *group; unsigned long imbalance; struct rq *busiest; + cpumask_t cpus = CPU_MASK_ALL; if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings) @@ -2518,13 +2539,15 @@ static int load_balance(int this_cpu, struct rq *this_rq, schedstat_inc(sd, lb_cnt[idle]); - group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle); +redo: + group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, + &cpus); if (!group) { schedstat_inc(sd, lb_nobusyg[idle]); goto out_balanced; } - busiest = find_busiest_queue(group, idle, imbalance); + busiest = find_busiest_queue(group, idle, imbalance, &cpus); if (!busiest) { schedstat_inc(sd, lb_nobusyq[idle]); goto out_balanced; @@ -2549,8 +2572,12 @@ static int load_balance(int this_cpu, struct rq *this_rq, double_rq_unlock(this_rq, busiest); /* All tasks on this runqueue were pinned by CPU affinity */ - if (unlikely(all_pinned)) + if (unlikely(all_pinned)) { + cpu_clear(cpu_of(busiest), cpus); + if (!cpus_empty(cpus)) + goto redo; goto out_balanced; + } } if (!nr_moved) { @@ -2639,18 +2666,22 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) unsigned long imbalance; int nr_moved = 0; int sd_idle = 0; + cpumask_t cpus = CPU_MASK_ALL; if (sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings) sd_idle = 1; schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); - group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, &sd_idle); +redo: + group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, + &sd_idle, &cpus); if (!group) { schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); goto out_balanced; } - busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance); + busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance, + &cpus); if (!busiest) { schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); goto out_balanced; @@ -2668,6 +2699,12 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) minus_1_or_zero(busiest->nr_running), imbalance, sd, NEWLY_IDLE, NULL); spin_unlock(&busiest->lock); + + if (!nr_moved) { + cpu_clear(cpu_of(busiest), cpus); + if (!cpus_empty(cpus)) + goto redo; + } } if (!nr_moved) { @@ -6747,6 +6784,7 @@ void __init sched_init(void) rq->cpu_load[j] = 0; rq->active_balance = 0; rq->push_cpu = 0; + rq->cpu = i; rq->migration_thread = NULL; INIT_LIST_HEAD(&rq->migration_queue); #endif -- cgit v1.2.2 From 9b819d204cf602eab1a53a9ec4b8d2ca51e02a1d Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 25 Sep 2006 23:31:40 -0700 Subject: [PATCH] Add __GFP_THISNODE to avoid fallback to other nodes and ignore cpuset/memory policy restrictions Add a new gfp flag __GFP_THISNODE to avoid fallback to other nodes. This flag is essential if a kernel component requires memory to be located on a certain node. It will be needed for alloc_pages_node() to force allocation on the indicated node and for alloc_pages() to force allocation on the current node. Signed-off-by: Christoph Lameter Cc: Andy Whitcroft Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 4ea6f0dc2fc5..76940361273e 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2316,7 +2316,7 @@ int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) const struct cpuset *cs; /* current cpuset ancestors */ int allowed; /* is allocation in zone z allowed? */ - if (in_interrupt()) + if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) return 1; node = z->zone_pgdat->node_id; might_sleep_if(!(gfp_mask & __GFP_HARDWALL)); -- cgit v1.2.2 From fbd98167e653535c5816be154f2149c0efa7757d Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 25 Sep 2006 23:31:45 -0700 Subject: [PATCH] Profiling: require buffer allocation on the correct node Profiling really suffers with off node buffers. Fail if no memory is available on the nodes. The profiling code can deal with these failures should they occur. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/profile.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/profile.c b/kernel/profile.c index d5bd75e7501c..fb660c7d35ba 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -309,13 +309,17 @@ static int __devinit profile_cpu_callback(struct notifier_block *info, node = cpu_to_node(cpu); per_cpu(cpu_profile_flip, cpu) = 0; if (!per_cpu(cpu_profile_hits, cpu)[1]) { - page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); + page = alloc_pages_node(node, + GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, + 0); if (!page) return NOTIFY_BAD; per_cpu(cpu_profile_hits, cpu)[1] = page_address(page); } if (!per_cpu(cpu_profile_hits, cpu)[0]) { - page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); + page = alloc_pages_node(node, + GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, + 0); if (!page) goto out_free; per_cpu(cpu_profile_hits, cpu)[0] = page_address(page); @@ -491,12 +495,16 @@ static int __init create_hash_tables(void) int node = cpu_to_node(cpu); struct page *page; - page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); + page = alloc_pages_node(node, + GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, + 0); if (!page) goto out_cleanup; per_cpu(cpu_profile_hits, cpu)[1] = (struct profile_hit *)page_address(page); - page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); + page = alloc_pages_node(node, + GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, + 0); if (!page) goto out_cleanup; per_cpu(cpu_profile_hits, cpu)[0] -- cgit v1.2.2 From 0ff38490c836dc379ff7ec45b10a15a662f4e5f6 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 25 Sep 2006 23:31:52 -0700 Subject: [PATCH] zone_reclaim: dynamic slab reclaim Currently one can enable slab reclaim by setting an explicit option in /proc/sys/vm/zone_reclaim_mode. Slab reclaim is then used as a final option if the freeing of unmapped file backed pages is not enough to free enough pages to allow a local allocation. However, that means that the slab can grow excessively and that most memory of a node may be used by slabs. We have had a case where a machine with 46GB of memory was using 40-42GB for slab. Zone reclaim was effective in dealing with pagecache pages. However, slab reclaim was only done during global reclaim (which is a bit rare on NUMA systems). This patch implements slab reclaim during zone reclaim. Zone reclaim occurs if there is a danger of an off node allocation. At that point we 1. Shrink the per node page cache if the number of pagecache pages is more than min_unmapped_ratio percent of pages in a zone. 2. Shrink the slab cache if the number of the nodes reclaimable slab pages (patch depends on earlier one that implements that counter) are more than min_slab_ratio (a new /proc/sys/vm tunable). The shrinking of the slab cache is a bit problematic since it is not node specific. So we simply calculate what point in the slab we want to reach (current per node slab use minus the number of pages that neeed to be allocated) and then repeately run the global reclaim until that is unsuccessful or we have reached the limit. I hope we will have zone based slab reclaim at some point which will make that easier. The default for the min_slab_ratio is 5% Also remove the slab option from /proc/sys/vm/zone_reclaim_mode. [akpm@osdl.org: cleanups] Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 362a0cc37138..fd43c3e6786b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -943,6 +943,17 @@ static ctl_table vm_table[] = { .extra1 = &zero, .extra2 = &one_hundred, }, + { + .ctl_name = VM_MIN_SLAB, + .procname = "min_slab_ratio", + .data = &sysctl_min_slab_ratio, + .maxlen = sizeof(sysctl_min_slab_ratio), + .mode = 0644, + .proc_handler = &sysctl_min_slab_ratio_sysctl_handler, + .strategy = &sysctl_intvec, + .extra1 = &zero, + .extra2 = &one_hundred, + }, #endif #ifdef CONFIG_X86_32 { -- cgit v1.2.2 From 89fa30242facca249aead2aac03c4c69764f911c Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 25 Sep 2006 23:31:55 -0700 Subject: [PATCH] NUMA: Add zone_to_nid function There are many places where we need to determine the node of a zone. Currently we use a difficult to read sequence of pointer dereferencing. Put that into an inline function and use throughout VM. Maybe we can find a way to optimize the lookup in the future. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 76940361273e..cff41511269f 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2245,7 +2245,7 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl) int i; for (i = 0; zl->zones[i]; i++) { - int nid = zl->zones[i]->zone_pgdat->node_id; + int nid = zone_to_nid(zl->zones[i]); if (node_isset(nid, current->mems_allowed)) return 1; @@ -2318,7 +2318,7 @@ int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) return 1; - node = z->zone_pgdat->node_id; + node = zone_to_nid(z); might_sleep_if(!(gfp_mask & __GFP_HARDWALL)); if (node_isset(node, current->mems_allowed)) return 1; -- cgit v1.2.2 From 62bac0185ad3dfef11d9602980445c54d45199c6 Mon Sep 17 00:00:00 2001 From: Stephen Smalley Date: Mon, 25 Sep 2006 23:31:56 -0700 Subject: [PATCH] selinux: eliminate selinux_task_ctxid Eliminate selinux_task_ctxid since it duplicates selinux_task_get_sid. Signed-off-by: Stephen Smalley Acked-by: James Morris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/auditsc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 1bd8827a0102..331e17010393 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -385,7 +385,7 @@ static int audit_filter_rules(struct task_struct *tsk, logged upon error */ if (f->se_rule) { if (need_sid) { - selinux_task_ctxid(tsk, &sid); + selinux_get_task_sid(tsk, &sid); need_sid = 0; } result = selinux_audit_rule_match(sid, f->type, -- cgit v1.2.2 From 1a70cd40cb291c25b67ec0da715a49d76719329d Mon Sep 17 00:00:00 2001 From: Stephen Smalley Date: Mon, 25 Sep 2006 23:31:57 -0700 Subject: [PATCH] selinux: rename selinux_ctxid_to_string Rename selinux_ctxid_to_string to selinux_sid_to_string to be consistent with other interfaces. Signed-off-by: Stephen Smalley Acked-by: James Morris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/audit.c | 14 +++++++------- kernel/auditfilter.c | 2 +- kernel/auditsc.c | 4 ++-- 3 files changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 963fd15c9621..f9889ee77825 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -244,7 +244,7 @@ static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sid) char *ctx = NULL; u32 len; int rc; - if ((rc = selinux_ctxid_to_string(sid, &ctx, &len))) + if ((rc = selinux_sid_to_string(sid, &ctx, &len))) return rc; else audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, @@ -267,7 +267,7 @@ static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sid) char *ctx = NULL; u32 len; int rc; - if ((rc = selinux_ctxid_to_string(sid, &ctx, &len))) + if ((rc = selinux_sid_to_string(sid, &ctx, &len))) return rc; else audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, @@ -293,7 +293,7 @@ static int audit_set_enabled(int state, uid_t loginuid, u32 sid) char *ctx = NULL; u32 len; int rc; - if ((rc = selinux_ctxid_to_string(sid, &ctx, &len))) + if ((rc = selinux_sid_to_string(sid, &ctx, &len))) return rc; else audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, @@ -321,7 +321,7 @@ static int audit_set_failure(int state, uid_t loginuid, u32 sid) char *ctx = NULL; u32 len; int rc; - if ((rc = selinux_ctxid_to_string(sid, &ctx, &len))) + if ((rc = selinux_sid_to_string(sid, &ctx, &len))) return rc; else audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, @@ -538,7 +538,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (status_get->mask & AUDIT_STATUS_PID) { int old = audit_pid; if (sid) { - if ((err = selinux_ctxid_to_string( + if ((err = selinux_sid_to_string( sid, &ctx, &len))) return err; else @@ -576,7 +576,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) "user pid=%d uid=%u auid=%u", pid, uid, loginuid); if (sid) { - if (selinux_ctxid_to_string( + if (selinux_sid_to_string( sid, &ctx, &len)) { audit_log_format(ab, " ssid=%u", sid); @@ -614,7 +614,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) loginuid, sid); break; case AUDIT_SIGNAL_INFO: - err = selinux_ctxid_to_string(audit_sig_sid, &ctx, &len); + err = selinux_sid_to_string(audit_sig_sid, &ctx, &len); if (err) return err; sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index a44879b0c72f..1a58a81fb09d 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1398,7 +1398,7 @@ static void audit_log_rule_change(uid_t loginuid, u32 sid, char *action, if (sid) { char *ctx = NULL; u32 len; - if (selinux_ctxid_to_string(sid, &ctx, &len)) + if (selinux_sid_to_string(sid, &ctx, &len)) audit_log_format(ab, " ssid=%u", sid); else audit_log_format(ab, " subj=%s", ctx); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 331e17010393..fb83c5cb8c32 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -898,7 +898,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts if (axi->osid != 0) { char *ctx = NULL; u32 len; - if (selinux_ctxid_to_string( + if (selinux_sid_to_string( axi->osid, &ctx, &len)) { audit_log_format(ab, " osid=%u", axi->osid); @@ -1005,7 +1005,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts if (n->osid != 0) { char *ctx = NULL; u32 len; - if (selinux_ctxid_to_string( + if (selinux_sid_to_string( n->osid, &ctx, &len)) { audit_log_format(ab, " osid=%u", n->osid); call_panic = 2; -- cgit v1.2.2 From af8c65b57aaa4ae321af34dbfc5ca7f5625263fe Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 25 Sep 2006 23:32:07 -0700 Subject: [PATCH] FRV: permit __do_IRQ() to be dispensed with Permit __do_IRQ() to be dispensed with based on a configuration option. Signed-off-by: David Howells Cc: Benjamin Herrenschmidt Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/handle.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 48a53f68af96..4c6cdbaed661 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -154,6 +154,7 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs, return retval; } +#ifndef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ /** * __do_IRQ - original all in one highlevel IRQ handler * @irq: the interrupt number @@ -253,6 +254,7 @@ out: return 1; } +#endif #ifdef CONFIG_TRACE_IRQFLAGS -- cgit v1.2.2 From 3a4f7577c9ef393ca80c783f02ffbc125de771c7 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 25 Sep 2006 23:32:41 -0700 Subject: [PATCH] swsusp: add write-speed instrumentation Add some instrumentation to the swsusp writeout code to show what bandwidth we're achieving. Cc: Pavel Machek Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/swap.c | 32 +++++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index f1dd146bd64d..79b66e734bdb 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -146,6 +146,26 @@ static void release_swap_writer(struct swap_map_handle *handle) handle->bitmap = NULL; } +static void show_speed(struct timeval *start, struct timeval *stop, + unsigned nr_pages, char *msg) +{ + s64 elapsed_centisecs64; + int centisecs; + int k; + int kps; + + elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start); + do_div(elapsed_centisecs64, NSEC_PER_SEC / 100); + centisecs = elapsed_centisecs64; + if (centisecs == 0) + centisecs = 1; /* avoid div-by-zero */ + k = nr_pages * (PAGE_SIZE / 1024); + kps = (k * 100) / centisecs; + printk("%s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", msg, k, + centisecs / 100, centisecs % 100, + kps / 1000, (kps % 1000) / 10); +} + static int get_swap_writer(struct swap_map_handle *handle) { handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL); @@ -206,17 +226,21 @@ static int flush_swap_writer(struct swap_map_handle *handle) static int save_image(struct swap_map_handle *handle, struct snapshot_handle *snapshot, - unsigned int nr_pages) + unsigned int nr_to_write) { unsigned int m; int ret; int error = 0; + int nr_pages; + struct timeval start; + struct timeval stop; - printk("Saving image data pages (%u pages) ... ", nr_pages); - m = nr_pages / 100; + printk("Saving image data pages (%u pages) ... ", nr_to_write); + m = nr_to_write / 100; if (!m) m = 1; nr_pages = 0; + do_gettimeofday(&start); do { ret = snapshot_read_next(snapshot, PAGE_SIZE); if (ret > 0) { @@ -228,8 +252,10 @@ static int save_image(struct swap_map_handle *handle, nr_pages++; } } while (ret > 0); + do_gettimeofday(&stop); if (!error) printk("\b\b\b\bdone\n"); + show_speed(&start, &stop, nr_to_write, "Wrote"); return error; } -- cgit v1.2.2 From ab954160350c91c77ae03740ef90458c3ad5412c Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 25 Sep 2006 23:32:42 -0700 Subject: [PATCH] swsusp: write speedup Switch the swsusp writeout code from 4k-at-a-time to 4MB-at-a-time. Crufty old PIII testbox: 12.9 MB/s -> 20.9 MB/s Sony Vaio: 14.7 MB/s -> 26.5 MB/s The implementation is crude. A better one would use larger BIOs, but wouldn't gain any performance. The memcpys will be mostly pipelined with the IO and basically come for free. The ENOMEM path has not been tested. It should be. Cc: Pavel Machek Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/swap.c | 93 ++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 75 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 79b66e734bdb..2dc883d361d5 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -49,18 +49,16 @@ static int mark_swapfiles(swp_entry_t start) { int error; - rw_swap_page_sync(READ, - swp_entry(root_swap, 0), - virt_to_page((unsigned long)&swsusp_header)); + rw_swap_page_sync(READ, swp_entry(root_swap, 0), + virt_to_page((unsigned long)&swsusp_header), NULL); if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) || !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) { memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10); memcpy(swsusp_header.sig,SWSUSP_SIG, 10); swsusp_header.image = start; - error = rw_swap_page_sync(WRITE, - swp_entry(root_swap, 0), - virt_to_page((unsigned long) - &swsusp_header)); + error = rw_swap_page_sync(WRITE, swp_entry(root_swap, 0), + virt_to_page((unsigned long)&swsusp_header), + NULL); } else { pr_debug("swsusp: Partition is not swap space.\n"); error = -ENODEV; @@ -88,16 +86,37 @@ static int swsusp_swap_check(void) /* This is called before saving image */ * write_page - Write one page to given swap location. * @buf: Address we're writing. * @offset: Offset of the swap page we're writing to. + * @bio_chain: Link the next write BIO here */ -static int write_page(void *buf, unsigned long offset) +static int write_page(void *buf, unsigned long offset, struct bio **bio_chain) { swp_entry_t entry; int error = -ENOSPC; if (offset) { + struct page *page = virt_to_page(buf); + + if (bio_chain) { + /* + * Whether or not we successfully allocated a copy page, + * we take a ref on the page here. It gets undone in + * wait_on_bio_chain(). + */ + struct page *page_copy; + page_copy = alloc_page(GFP_ATOMIC); + if (page_copy == NULL) { + WARN_ON_ONCE(1); + bio_chain = NULL; /* Go synchronous */ + get_page(page); + } else { + memcpy(page_address(page_copy), + page_address(page), PAGE_SIZE); + page = page_copy; + } + } entry = swp_entry(root_swap, offset); - error = rw_swap_page_sync(WRITE, entry, virt_to_page(buf)); + error = rw_swap_page_sync(WRITE, entry, page, bio_chain); } return error; } @@ -185,37 +204,68 @@ static int get_swap_writer(struct swap_map_handle *handle) return 0; } -static int swap_write_page(struct swap_map_handle *handle, void *buf) +static int wait_on_bio_chain(struct bio **bio_chain) { - int error; + struct bio *bio; + struct bio *next_bio; + int ret = 0; + + if (bio_chain == NULL) + return 0; + + bio = *bio_chain; + while (bio) { + struct page *page; + + next_bio = bio->bi_private; + page = bio->bi_io_vec[0].bv_page; + wait_on_page_locked(page); + if (!PageUptodate(page) || PageError(page)) + ret = -EIO; + put_page(page); + bio_put(bio); + bio = next_bio; + } + *bio_chain = NULL; + return ret; +} + +static int swap_write_page(struct swap_map_handle *handle, void *buf, + struct bio **bio_chain) +{ + int error = 0; unsigned long offset; if (!handle->cur) return -EINVAL; offset = alloc_swap_page(root_swap, handle->bitmap); - error = write_page(buf, offset); + error = write_page(buf, offset, bio_chain); if (error) return error; handle->cur->entries[handle->k++] = offset; if (handle->k >= MAP_PAGE_ENTRIES) { + error = wait_on_bio_chain(bio_chain); + if (error) + goto out; offset = alloc_swap_page(root_swap, handle->bitmap); if (!offset) return -ENOSPC; handle->cur->next_swap = offset; - error = write_page(handle->cur, handle->cur_swap); + error = write_page(handle->cur, handle->cur_swap, NULL); if (error) - return error; + goto out; memset(handle->cur, 0, PAGE_SIZE); handle->cur_swap = offset; handle->k = 0; } - return 0; +out: + return error; } static int flush_swap_writer(struct swap_map_handle *handle) { if (handle->cur && handle->cur_swap) - return write_page(handle->cur, handle->cur_swap); + return write_page(handle->cur, handle->cur_swap, NULL); else return -EINVAL; } @@ -232,6 +282,8 @@ static int save_image(struct swap_map_handle *handle, int ret; int error = 0; int nr_pages; + int err2; + struct bio *bio; struct timeval start; struct timeval stop; @@ -240,11 +292,13 @@ static int save_image(struct swap_map_handle *handle, if (!m) m = 1; nr_pages = 0; + bio = NULL; do_gettimeofday(&start); do { ret = snapshot_read_next(snapshot, PAGE_SIZE); if (ret > 0) { - error = swap_write_page(handle, data_of(*snapshot)); + error = swap_write_page(handle, data_of(*snapshot), + &bio); if (error) break; if (!(nr_pages % m)) @@ -252,7 +306,10 @@ static int save_image(struct swap_map_handle *handle, nr_pages++; } } while (ret > 0); + err2 = wait_on_bio_chain(&bio); do_gettimeofday(&stop); + if (!error) + error = err2; if (!error) printk("\b\b\b\bdone\n"); show_speed(&start, &stop, nr_to_write, "Wrote"); @@ -307,7 +364,7 @@ int swsusp_write(void) error = get_swap_writer(&handle); if (!error) { unsigned long start = handle.cur_swap; - error = swap_write_page(&handle, header); + error = swap_write_page(&handle, header, NULL); if (!error) error = save_image(&handle, &snapshot, header->pages - 1); -- cgit v1.2.2 From 8c002494b55119a3fd1dddee83b4fb75cfda47e5 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 25 Sep 2006 23:32:43 -0700 Subject: [PATCH] swsusp: add read-speed instrumentation Add some instrumentation to the swsusp readin code to show what bandwidth we're achieving. Cc: Pavel Machek Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/swap.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 2dc883d361d5..9ab989572164 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -522,12 +522,15 @@ static int load_image(struct swap_map_handle *handle, unsigned int m; int ret; int error = 0; + struct timeval start; + struct timeval stop; printk("Loading image data pages (%u pages) ... ", nr_pages); m = nr_pages / 100; if (!m) m = 1; nr_pages = 0; + do_gettimeofday(&start); do { ret = snapshot_write_next(snapshot, PAGE_SIZE); if (ret > 0) { @@ -539,11 +542,13 @@ static int load_image(struct swap_map_handle *handle, nr_pages++; } } while (ret > 0); + do_gettimeofday(&stop); if (!error) { printk("\b\b\b\bdone\n"); if (!snapshot_image_loaded(snapshot)) error = -ENODATA; } + show_speed(&start, &stop, nr_pages, "Read"); return error; } -- cgit v1.2.2 From 546e0d271941dd1ff6961e2a1f7eac75f1fc277e Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 25 Sep 2006 23:32:44 -0700 Subject: [PATCH] swsusp: read speedup Implement async reads for swsusp resuming. Crufty old PIII testbox: 15.7 MB/s -> 20.3 MB/s Sony Vaio: 14.6 MB/s -> 33.3 MB/s I didn't implement the post-resume bio_set_pages_dirty(). I don't really understand why resume needs to run set_page_dirty() against these pages. It might be a worry that this code modifies PG_Uptodate, PG_Error and PG_Locked against the image pages. Can this possibly affect the resumed-into kernel? Hopefully not, if we're atomically restoring its mem_map? Cc: Pavel Machek Cc: "Rafael J. Wysocki" Cc: Jens Axboe Cc: Laurent Riffard Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/power.h | 1 + kernel/power/snapshot.c | 11 ++-- kernel/power/swap.c | 138 ++++++++++++++++++++++++------------------------ 3 files changed, 79 insertions(+), 71 deletions(-) (limited to 'kernel') diff --git a/kernel/power/power.h b/kernel/power/power.h index 57a792982fb9..59ce712f082d 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -58,6 +58,7 @@ struct snapshot_handle { struct pbe *pbe, *last_pbe; void *buffer; unsigned int buf_offset; + int sync_read; }; #define data_of(handle) ((handle).buffer + (handle).buf_offset) diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 75d4886e648e..591301ae8b7d 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -314,7 +314,7 @@ static unsigned int unsafe_pages; * and we count them using unsafe_pages */ -static inline void *alloc_image_page(gfp_t gfp_mask, int safe_needed) +static void *alloc_image_page(gfp_t gfp_mask, int safe_needed) { void *res; @@ -828,13 +828,16 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count) } if (!handle->offset) handle->buffer = buffer; + handle->sync_read = 1; if (handle->prev < handle->page) { if (!handle->prev) { - error = load_header(handle, (struct swsusp_info *)buffer); + error = load_header(handle, + (struct swsusp_info *)buffer); if (error) return error; } else if (handle->prev <= nr_meta_pages) { - handle->pbe = unpack_orig_addresses(buffer, handle->pbe); + handle->pbe = unpack_orig_addresses(buffer, + handle->pbe); if (!handle->pbe) { error = prepare_image(handle); if (error) @@ -842,10 +845,12 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count) handle->pbe = pagedir_nosave; handle->last_pbe = NULL; handle->buffer = get_buffer(handle); + handle->sync_read = 0; } } else { handle->pbe = handle->pbe->next; handle->buffer = get_buffer(handle); + handle->sync_read = 0; } handle->prev = handle->page; } diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 9ab989572164..8309d20b2563 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -214,6 +215,8 @@ static int wait_on_bio_chain(struct bio **bio_chain) return 0; bio = *bio_chain; + if (bio == NULL) + return 0; while (bio) { struct page *page; @@ -349,7 +352,8 @@ int swsusp_write(void) int error; if ((error = swsusp_swap_check())) { - printk(KERN_ERR "swsusp: Cannot find swap device, try swapon -a.\n"); + printk(KERN_ERR "swsusp: Cannot find swap device, try " + "swapon -a.\n"); return error; } memset(&snapshot, 0, sizeof(struct snapshot_handle)); @@ -381,27 +385,6 @@ int swsusp_write(void) return error; } -/* - * Using bio to read from swap. - * This code requires a bit more work than just using buffer heads - * but, it is the recommended way for 2.5/2.6. - * The following are to signal the beginning and end of I/O. Bios - * finish asynchronously, while we want them to happen synchronously. - * A simple atomic_t, and a wait loop take care of this problem. - */ - -static atomic_t io_done = ATOMIC_INIT(0); - -static int end_io(struct bio *bio, unsigned int num, int err) -{ - if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { - printk(KERN_ERR "I/O error reading swsusp image.\n"); - return -EIO; - } - atomic_set(&io_done, 0); - return 0; -} - static struct block_device *resume_bdev; /** @@ -409,15 +392,15 @@ static struct block_device *resume_bdev; * @rw: READ or WRITE. * @off physical offset of page. * @page: page we're reading or writing. + * @bio_chain: list of pending biod (for async reading) * * Straight from the textbook - allocate and initialize the bio. - * If we're writing, make sure the page is marked as dirty. - * Then submit it and wait. + * If we're reading, make sure the page is marked as dirty. + * Then submit it and, if @bio_chain == NULL, wait. */ - -static int submit(int rw, pgoff_t page_off, void *page) +static int submit(int rw, pgoff_t page_off, struct page *page, + struct bio **bio_chain) { - int error = 0; struct bio *bio; bio = bio_alloc(GFP_ATOMIC, 1); @@ -425,33 +408,40 @@ static int submit(int rw, pgoff_t page_off, void *page) return -ENOMEM; bio->bi_sector = page_off * (PAGE_SIZE >> 9); bio->bi_bdev = resume_bdev; - bio->bi_end_io = end_io; + bio->bi_end_io = end_swap_bio_read; - if (bio_add_page(bio, virt_to_page(page), PAGE_SIZE, 0) < PAGE_SIZE) { - printk("swsusp: ERROR: adding page to bio at %ld\n",page_off); - error = -EFAULT; - goto Done; + if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { + printk("swsusp: ERROR: adding page to bio at %ld\n", page_off); + bio_put(bio); + return -EFAULT; } - atomic_set(&io_done, 1); - submit_bio(rw | (1 << BIO_RW_SYNC), bio); - while (atomic_read(&io_done)) - yield(); - if (rw == READ) - bio_set_pages_dirty(bio); - Done: - bio_put(bio); - return error; + lock_page(page); + bio_get(bio); + + if (bio_chain == NULL) { + submit_bio(rw | (1 << BIO_RW_SYNC), bio); + wait_on_page_locked(page); + if (rw == READ) + bio_set_pages_dirty(bio); + bio_put(bio); + } else { + get_page(page); + bio->bi_private = *bio_chain; + *bio_chain = bio; + submit_bio(rw | (1 << BIO_RW_SYNC), bio); + } + return 0; } -static int bio_read_page(pgoff_t page_off, void *page) +static int bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain) { - return submit(READ, page_off, page); + return submit(READ, page_off, virt_to_page(addr), bio_chain); } -static int bio_write_page(pgoff_t page_off, void *page) +static int bio_write_page(pgoff_t page_off, void *addr) { - return submit(WRITE, page_off, page); + return submit(WRITE, page_off, virt_to_page(addr), NULL); } /** @@ -476,7 +466,7 @@ static int get_swap_reader(struct swap_map_handle *handle, handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC); if (!handle->cur) return -ENOMEM; - error = bio_read_page(swp_offset(start), handle->cur); + error = bio_read_page(swp_offset(start), handle->cur, NULL); if (error) { release_swap_reader(handle); return error; @@ -485,7 +475,8 @@ static int get_swap_reader(struct swap_map_handle *handle, return 0; } -static int swap_read_page(struct swap_map_handle *handle, void *buf) +static int swap_read_page(struct swap_map_handle *handle, void *buf, + struct bio **bio_chain) { unsigned long offset; int error; @@ -495,16 +486,17 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf) offset = handle->cur->entries[handle->k]; if (!offset) return -EFAULT; - error = bio_read_page(offset, buf); + error = bio_read_page(offset, buf, bio_chain); if (error) return error; if (++handle->k >= MAP_PAGE_ENTRIES) { + error = wait_on_bio_chain(bio_chain); handle->k = 0; offset = handle->cur->next_swap; if (!offset) release_swap_reader(handle); - else - error = bio_read_page(offset, handle->cur); + else if (!error) + error = bio_read_page(offset, handle->cur, NULL); } return error; } @@ -517,38 +509,48 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf) static int load_image(struct swap_map_handle *handle, struct snapshot_handle *snapshot, - unsigned int nr_pages) + unsigned int nr_to_read) { unsigned int m; - int ret; int error = 0; struct timeval start; struct timeval stop; + struct bio *bio; + int err2; + unsigned nr_pages; - printk("Loading image data pages (%u pages) ... ", nr_pages); - m = nr_pages / 100; + printk("Loading image data pages (%u pages) ... ", nr_to_read); + m = nr_to_read / 100; if (!m) m = 1; nr_pages = 0; + bio = NULL; do_gettimeofday(&start); - do { - ret = snapshot_write_next(snapshot, PAGE_SIZE); - if (ret > 0) { - error = swap_read_page(handle, data_of(*snapshot)); - if (error) - break; - if (!(nr_pages % m)) - printk("\b\b\b\b%3d%%", nr_pages / m); - nr_pages++; - } - } while (ret > 0); + for ( ; ; ) { + error = snapshot_write_next(snapshot, PAGE_SIZE); + if (error <= 0) + break; + error = swap_read_page(handle, data_of(*snapshot), &bio); + if (error) + break; + if (snapshot->sync_read) + error = wait_on_bio_chain(&bio); + if (error) + break; + if (!(nr_pages % m)) + printk("\b\b\b\b%3d%%", nr_pages / m); + nr_pages++; + } + err2 = wait_on_bio_chain(&bio); do_gettimeofday(&stop); + if (!error) + error = err2; if (!error) { printk("\b\b\b\bdone\n"); if (!snapshot_image_loaded(snapshot)) error = -ENODATA; } - show_speed(&start, &stop, nr_pages, "Read"); + show_speed(&start, &stop, nr_to_read, "Read"); return error; } @@ -571,7 +573,7 @@ int swsusp_read(void) header = (struct swsusp_info *)data_of(snapshot); error = get_swap_reader(&handle, swsusp_header.image); if (!error) - error = swap_read_page(&handle, header); + error = swap_read_page(&handle, header, NULL); if (!error) error = load_image(&handle, &snapshot, header->pages - 1); release_swap_reader(&handle); @@ -597,7 +599,7 @@ int swsusp_check(void) if (!IS_ERR(resume_bdev)) { set_blocksize(resume_bdev, PAGE_SIZE); memset(&swsusp_header, 0, sizeof(swsusp_header)); - if ((error = bio_read_page(0, &swsusp_header))) + if ((error = bio_read_page(0, &swsusp_header, NULL))) return error; if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) { memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10); -- cgit v1.2.2 From ae83c5eef59ffe6eb61110b8c8fd1ea3e0881712 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 25 Sep 2006 23:32:45 -0700 Subject: [PATCH] swsusp: clean up browsing of pfns Clean up some loops over pfns for each zone in snapshot.c: reduce the number of additions to perform, rework detection of saveable pages and make the code a bit less difficult to understand, hopefully. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/snapshot.c | 62 +++++++++++++++++++++++++------------------------ 1 file changed, 32 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 591301ae8b7d..979096c27773 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -156,7 +156,7 @@ static inline int save_highmem(void) {return 0;} static inline int restore_highmem(void) {return 0;} #endif -static int pfn_is_nosave(unsigned long pfn) +static inline int pfn_is_nosave(unsigned long pfn) { unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT; unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT; @@ -167,43 +167,43 @@ static int pfn_is_nosave(unsigned long pfn) * saveable - Determine whether a page should be cloned or not. * @pfn: The page * - * We save a page if it's Reserved, and not in the range of pages - * statically defined as 'unsaveable', or if it isn't reserved, and - * isn't part of a free chunk of pages. + * We save a page if it isn't Nosave, and is not in the range of pages + * statically defined as 'unsaveable', and it + * isn't a part of a free chunk of pages. */ -static int saveable(struct zone *zone, unsigned long *zone_pfn) +static struct page *saveable_page(unsigned long pfn) { - unsigned long pfn = *zone_pfn + zone->zone_start_pfn; struct page *page; if (!pfn_valid(pfn)) - return 0; + return NULL; page = pfn_to_page(pfn); - BUG_ON(PageReserved(page) && PageNosave(page)); + if (PageNosave(page)) - return 0; + return NULL; if (PageReserved(page) && pfn_is_nosave(pfn)) - return 0; + return NULL; if (PageNosaveFree(page)) - return 0; + return NULL; - return 1; + return page; } unsigned int count_data_pages(void) { struct zone *zone; - unsigned long zone_pfn; + unsigned long pfn, max_zone_pfn; unsigned int n = 0; for_each_zone (zone) { if (is_highmem(zone)) continue; mark_free_pages(zone); - for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) - n += saveable(zone, &zone_pfn); + max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; + for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) + n += !!saveable_page(pfn); } return n; } @@ -211,7 +211,7 @@ unsigned int count_data_pages(void) static void copy_data_pages(struct pbe *pblist) { struct zone *zone; - unsigned long zone_pfn; + unsigned long pfn, max_zone_pfn; struct pbe *pbe, *p; pbe = pblist; @@ -224,13 +224,14 @@ static void copy_data_pages(struct pbe *pblist) SetPageNosaveFree(virt_to_page(p)); for_each_pbe (p, pblist) SetPageNosaveFree(virt_to_page(p->address)); - for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) { - if (saveable(zone, &zone_pfn)) { - struct page *page; + max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; + for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) { + struct page *page = saveable_page(pfn); + + if (page) { long *src, *dst; int n; - page = pfn_to_page(zone_pfn + zone->zone_start_pfn); BUG_ON(!pbe); pbe->orig_address = (unsigned long)page_address(page); /* copy_page and memcpy are not usable for copying task structs. */ @@ -383,13 +384,14 @@ static struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, void swsusp_free(void) { struct zone *zone; - unsigned long zone_pfn; + unsigned long pfn, max_zone_pfn; for_each_zone(zone) { - for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) - if (pfn_valid(zone_pfn + zone->zone_start_pfn)) { - struct page *page; - page = pfn_to_page(zone_pfn + zone->zone_start_pfn); + max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; + for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) + if (pfn_valid(pfn)) { + struct page *page = pfn_to_page(pfn); + if (PageNosave(page) && PageNosaveFree(page)) { ClearPageNosave(page); ClearPageNosaveFree(page); @@ -598,7 +600,7 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count) static int mark_unsafe_pages(struct pbe *pblist) { struct zone *zone; - unsigned long zone_pfn; + unsigned long pfn, max_zone_pfn; struct pbe *p; if (!pblist) /* a sanity check */ @@ -606,10 +608,10 @@ static int mark_unsafe_pages(struct pbe *pblist) /* Clear page flags */ for_each_zone (zone) { - for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) - if (pfn_valid(zone_pfn + zone->zone_start_pfn)) - ClearPageNosaveFree(pfn_to_page(zone_pfn + - zone->zone_start_pfn)); + max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; + for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) + if (pfn_valid(pfn)) + ClearPageNosaveFree(pfn_to_page(pfn)); } /* Mark orig addresses */ -- cgit v1.2.2 From fb13a28b0f5ada60861868c4fa48a12bd0cb8dea Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 25 Sep 2006 23:32:46 -0700 Subject: [PATCH] swsusp: struct snapshot_handle cleanup Add comments describing struct snapshot_handle and its members, change the confusing name of its member 'page' to 'cur'. Signed-off-by: Rafael J. Wysocki Cc: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/power.h | 64 ++++++++++++++++++++++++++++++++++++++++++------- kernel/power/snapshot.c | 40 +++++++++++++++---------------- 2 files changed, 76 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/power/power.h b/kernel/power/power.h index 59ce712f082d..1cefcf87a694 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -50,17 +50,65 @@ extern asmlinkage int swsusp_arch_resume(void); extern unsigned int count_data_pages(void); +/** + * Auxiliary structure used for reading the snapshot image data and + * metadata from and writing them to the list of page backup entries + * (PBEs) which is the main data structure of swsusp. + * + * Using struct snapshot_handle we can transfer the image, including its + * metadata, as a continuous sequence of bytes with the help of + * snapshot_read_next() and snapshot_write_next(). + * + * The code that writes the image to a storage or transfers it to + * the user land is required to use snapshot_read_next() for this + * purpose and it should not make any assumptions regarding the internal + * structure of the image. Similarly, the code that reads the image from + * a storage or transfers it from the user land is required to use + * snapshot_write_next(). + * + * This may allow us to change the internal structure of the image + * in the future with considerably less effort. + */ + struct snapshot_handle { - loff_t offset; - unsigned int page; - unsigned int page_offset; - unsigned int prev; - struct pbe *pbe, *last_pbe; - void *buffer; - unsigned int buf_offset; - int sync_read; + loff_t offset; /* number of the last byte ready for reading + * or writing in the sequence + */ + unsigned int cur; /* number of the block of PAGE_SIZE bytes the + * next operation will refer to (ie. current) + */ + unsigned int cur_offset; /* offset with respect to the current + * block (for the next operation) + */ + unsigned int prev; /* number of the block of PAGE_SIZE bytes that + * was the current one previously + */ + struct pbe *pbe; /* PBE that corresponds to 'buffer' */ + struct pbe *last_pbe; /* When the image is restored (eg. read + * from disk) we can store some image + * data directly in the page frames + * in which they were before suspend. + * In such a case the PBEs that + * correspond to them will be unused. + * This is the last PBE, so far, that + * does not correspond to such data. + */ + void *buffer; /* address of the block to read from + * or write to + */ + unsigned int buf_offset; /* location to read from or write to, + * given as a displacement from 'buffer' + */ + int sync_read; /* Set to one to notify the caller of + * snapshot_write_next() that it may + * need to call wait_on_bio_chain() + */ }; +/* This macro returns the address from/to which the caller of + * snapshot_read_next()/snapshot_write_next() is allowed to + * read/write data after the function returns + */ #define data_of(handle) ((handle).buffer + (handle).buf_offset) extern int snapshot_read_next(struct snapshot_handle *handle, size_t count); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 979096c27773..81fe8de9e604 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -555,7 +555,7 @@ static inline struct pbe *pack_orig_addresses(unsigned long *buf, struct pbe *pb int snapshot_read_next(struct snapshot_handle *handle, size_t count) { - if (handle->page > nr_meta_pages + nr_copy_pages) + if (handle->cur > nr_meta_pages + nr_copy_pages) return 0; if (!buffer) { /* This makes the buffer be freed by swsusp_free() */ @@ -568,8 +568,8 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count) handle->buffer = buffer; handle->pbe = pagedir_nosave; } - if (handle->prev < handle->page) { - if (handle->page <= nr_meta_pages) { + if (handle->prev < handle->cur) { + if (handle->cur <= nr_meta_pages) { handle->pbe = pack_orig_addresses(buffer, handle->pbe); if (!handle->pbe) handle->pbe = pagedir_nosave; @@ -577,15 +577,15 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count) handle->buffer = (void *)handle->pbe->address; handle->pbe = handle->pbe->next; } - handle->prev = handle->page; + handle->prev = handle->cur; } - handle->buf_offset = handle->page_offset; - if (handle->page_offset + count >= PAGE_SIZE) { - count = PAGE_SIZE - handle->page_offset; - handle->page_offset = 0; - handle->page++; + handle->buf_offset = handle->cur_offset; + if (handle->cur_offset + count >= PAGE_SIZE) { + count = PAGE_SIZE - handle->cur_offset; + handle->cur_offset = 0; + handle->cur++; } else { - handle->page_offset += count; + handle->cur_offset += count; } handle->offset += count; return count; @@ -820,7 +820,7 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count) { int error = 0; - if (handle->prev && handle->page > nr_meta_pages + nr_copy_pages) + if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) return 0; if (!buffer) { /* This makes the buffer be freed by swsusp_free() */ @@ -831,7 +831,7 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count) if (!handle->offset) handle->buffer = buffer; handle->sync_read = 1; - if (handle->prev < handle->page) { + if (handle->prev < handle->cur) { if (!handle->prev) { error = load_header(handle, (struct swsusp_info *)buffer); @@ -854,15 +854,15 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count) handle->buffer = get_buffer(handle); handle->sync_read = 0; } - handle->prev = handle->page; + handle->prev = handle->cur; } - handle->buf_offset = handle->page_offset; - if (handle->page_offset + count >= PAGE_SIZE) { - count = PAGE_SIZE - handle->page_offset; - handle->page_offset = 0; - handle->page++; + handle->buf_offset = handle->cur_offset; + if (handle->cur_offset + count >= PAGE_SIZE) { + count = PAGE_SIZE - handle->cur_offset; + handle->cur_offset = 0; + handle->cur++; } else { - handle->page_offset += count; + handle->cur_offset += count; } handle->offset += count; return count; @@ -871,5 +871,5 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count) int snapshot_image_loaded(struct snapshot_handle *handle) { return !(!handle->pbe || handle->pbe->next || !nr_copy_pages || - handle->page <= nr_meta_pages + nr_copy_pages); + handle->cur <= nr_meta_pages + nr_copy_pages); } -- cgit v1.2.2 From e3920fb42c8ddfe63befb54d95c0e13eabacea9b Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 25 Sep 2006 23:32:48 -0700 Subject: [PATCH] Disable CPU hotplug during suspend The current suspend code has to be run on one CPU, so we use the CPU hotplug to take the non-boot CPUs offline on SMP machines. However, we should also make sure that these CPUs will not be enabled by someone else after we have disabled them. The functions disable_nonboot_cpus() and enable_nonboot_cpus() are moved to kernel/cpu.c, because they now refer to some stuff in there that should better be static. Also it's better if disable_nonboot_cpus() returns an error instead of panicking if something goes wrong, and enable_nonboot_cpus() has no reason to panic(), because the CPUs may have been enabled by the userland before it tries to take them online. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpu.c | 138 ++++++++++++++++++++++++++++++++++++++++++-------- kernel/power/Makefile | 2 - kernel/power/disk.c | 7 ++- kernel/power/main.c | 10 ++-- kernel/power/smp.c | 62 ----------------------- kernel/power/user.c | 14 +++-- 6 files changed, 137 insertions(+), 96 deletions(-) delete mode 100644 kernel/power/smp.c (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index f230f9ae01c2..32c96628463e 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -21,6 +21,11 @@ static DEFINE_MUTEX(cpu_bitmask_lock); static __cpuinitdata BLOCKING_NOTIFIER_HEAD(cpu_chain); +/* If set, cpu_up and cpu_down will return -EBUSY and do nothing. + * Should always be manipulated under cpu_add_remove_lock + */ +static int cpu_hotplug_disabled; + #ifdef CONFIG_HOTPLUG_CPU /* Crappy recursive lock-takers in cpufreq! Complain loudly about idiots */ @@ -108,30 +113,25 @@ static int take_cpu_down(void *unused) return 0; } -int cpu_down(unsigned int cpu) +/* Requires cpu_add_remove_lock to be held */ +static int _cpu_down(unsigned int cpu) { int err; struct task_struct *p; cpumask_t old_allowed, tmp; - mutex_lock(&cpu_add_remove_lock); - if (num_online_cpus() == 1) { - err = -EBUSY; - goto out; - } + if (num_online_cpus() == 1) + return -EBUSY; - if (!cpu_online(cpu)) { - err = -EINVAL; - goto out; - } + if (!cpu_online(cpu)) + return -EINVAL; err = blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE, (void *)(long)cpu); if (err == NOTIFY_BAD) { printk("%s: attempt to take down CPU %u failed\n", __FUNCTION__, cpu); - err = -EINVAL; - goto out; + return -EINVAL; } /* Ensure that we are not runnable on dying cpu */ @@ -179,22 +179,32 @@ out_thread: err = kthread_stop(p); out_allowed: set_cpus_allowed(current, old_allowed); -out: + return err; +} + +int cpu_down(unsigned int cpu) +{ + int err = 0; + + mutex_lock(&cpu_add_remove_lock); + if (cpu_hotplug_disabled) + err = -EBUSY; + else + err = _cpu_down(cpu); + mutex_unlock(&cpu_add_remove_lock); return err; } #endif /*CONFIG_HOTPLUG_CPU*/ -int __devinit cpu_up(unsigned int cpu) +/* Requires cpu_add_remove_lock to be held */ +static int __devinit _cpu_up(unsigned int cpu) { int ret; void *hcpu = (void *)(long)cpu; - mutex_lock(&cpu_add_remove_lock); - if (cpu_online(cpu) || !cpu_present(cpu)) { - ret = -EINVAL; - goto out; - } + if (cpu_online(cpu) || !cpu_present(cpu)) + return -EINVAL; ret = blocking_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu); if (ret == NOTIFY_BAD) { @@ -219,7 +229,95 @@ out_notify: if (ret != 0) blocking_notifier_call_chain(&cpu_chain, CPU_UP_CANCELED, hcpu); + + return ret; +} + +int __devinit cpu_up(unsigned int cpu) +{ + int err = 0; + + mutex_lock(&cpu_add_remove_lock); + if (cpu_hotplug_disabled) + err = -EBUSY; + else + err = _cpu_up(cpu); + + mutex_unlock(&cpu_add_remove_lock); + return err; +} + +#ifdef CONFIG_SUSPEND_SMP +static cpumask_t frozen_cpus; + +int disable_nonboot_cpus(void) +{ + int cpu, first_cpu, error; + + mutex_lock(&cpu_add_remove_lock); + first_cpu = first_cpu(cpu_present_map); + if (!cpu_online(first_cpu)) { + error = _cpu_up(first_cpu); + if (error) { + printk(KERN_ERR "Could not bring CPU%d up.\n", + first_cpu); + goto out; + } + } + error = set_cpus_allowed(current, cpumask_of_cpu(first_cpu)); + if (error) { + printk(KERN_ERR "Could not run on CPU%d\n", first_cpu); + goto out; + } + /* We take down all of the non-boot CPUs in one shot to avoid races + * with the userspace trying to use the CPU hotplug at the same time + */ + cpus_clear(frozen_cpus); + printk("Disabling non-boot CPUs ...\n"); + for_each_online_cpu(cpu) { + if (cpu == first_cpu) + continue; + error = _cpu_down(cpu); + if (!error) { + cpu_set(cpu, frozen_cpus); + printk("CPU%d is down\n", cpu); + } else { + printk(KERN_ERR "Error taking CPU%d down: %d\n", + cpu, error); + break; + } + } + if (!error) { + BUG_ON(num_online_cpus() > 1); + /* Make sure the CPUs won't be enabled by someone else */ + cpu_hotplug_disabled = 1; + } else { + printk(KERN_ERR "Non-boot CPUs are not disabled"); + } out: mutex_unlock(&cpu_add_remove_lock); - return ret; + return error; +} + +void enable_nonboot_cpus(void) +{ + int cpu, error; + + /* Allow everyone to use the CPU hotplug again */ + mutex_lock(&cpu_add_remove_lock); + cpu_hotplug_disabled = 0; + mutex_unlock(&cpu_add_remove_lock); + + printk("Enabling non-boot CPUs ...\n"); + for_each_cpu_mask(cpu, frozen_cpus) { + error = cpu_up(cpu); + if (!error) { + printk("CPU%d is up\n", cpu); + continue; + } + printk(KERN_WARNING "Error taking CPU%d up: %d\n", + cpu, error); + } + cpus_clear(frozen_cpus); } +#endif diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 8d0af3d37a4b..38725f526afc 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -7,6 +7,4 @@ obj-y := main.o process.o console.o obj-$(CONFIG_PM_LEGACY) += pm.o obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o snapshot.o swap.o user.o -obj-$(CONFIG_SUSPEND_SMP) += smp.o - obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o diff --git a/kernel/power/disk.c b/kernel/power/disk.c index e13e74067845..7c7b9b65e365 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "power.h" @@ -72,7 +73,10 @@ static int prepare_processes(void) int error; pm_prepare_console(); - disable_nonboot_cpus(); + + error = disable_nonboot_cpus(); + if (error) + goto enable_cpus; if (freeze_processes()) { error = -EBUSY; @@ -84,6 +88,7 @@ static int prepare_processes(void) return 0; thaw: thaw_processes(); +enable_cpus: enable_nonboot_cpus(); pm_restore_console(); return error; diff --git a/kernel/power/main.c b/kernel/power/main.c index 6d295c776794..4d403323a7bb 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "power.h" @@ -51,7 +52,7 @@ void pm_set_ops(struct pm_ops * ops) static int suspend_prepare(suspend_state_t state) { - int error = 0; + int error; unsigned int free_pages; if (!pm_ops || !pm_ops->enter) @@ -59,12 +60,9 @@ static int suspend_prepare(suspend_state_t state) pm_prepare_console(); - disable_nonboot_cpus(); - - if (num_online_cpus() != 1) { - error = -EPERM; + error = disable_nonboot_cpus(); + if (error) goto Enable_cpu; - } if (freeze_processes()) { error = -EAGAIN; diff --git a/kernel/power/smp.c b/kernel/power/smp.c deleted file mode 100644 index 5957312b2d68..000000000000 --- a/kernel/power/smp.c +++ /dev/null @@ -1,62 +0,0 @@ -/* - * drivers/power/smp.c - Functions for stopping other CPUs. - * - * Copyright 2004 Pavel Machek - * Copyright (C) 2002-2003 Nigel Cunningham - * - * This file is released under the GPLv2. - */ - -#undef DEBUG - -#include -#include -#include -#include -#include -#include -#include - -/* This is protected by pm_sem semaphore */ -static cpumask_t frozen_cpus; - -void disable_nonboot_cpus(void) -{ - int cpu, error; - - error = 0; - cpus_clear(frozen_cpus); - printk("Freezing cpus ...\n"); - for_each_online_cpu(cpu) { - if (cpu == 0) - continue; - error = cpu_down(cpu); - if (!error) { - cpu_set(cpu, frozen_cpus); - printk("CPU%d is down\n", cpu); - continue; - } - printk("Error taking cpu %d down: %d\n", cpu, error); - } - BUG_ON(raw_smp_processor_id() != 0); - if (error) - panic("cpus not sleeping"); -} - -void enable_nonboot_cpus(void) -{ - int cpu, error; - - printk("Thawing cpus ...\n"); - for_each_cpu_mask(cpu, frozen_cpus) { - error = cpu_up(cpu); - if (!error) { - printk("CPU%d is up\n", cpu); - continue; - } - printk("Error taking cpu %d up: %d\n", cpu, error); - panic("Not enough cpus"); - } - cpus_clear(frozen_cpus); -} - diff --git a/kernel/power/user.c b/kernel/power/user.c index 3f1539fbe48a..0ef5e4ba39e5 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -19,6 +19,7 @@ #include #include #include +#include #include @@ -139,12 +140,15 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, if (data->frozen) break; down(&pm_sem); - disable_nonboot_cpus(); - if (freeze_processes()) { - thaw_processes(); - enable_nonboot_cpus(); - error = -EBUSY; + error = disable_nonboot_cpus(); + if (!error) { + error = freeze_processes(); + if (error) { + thaw_processes(); + error = -EBUSY; + } } + enable_nonboot_cpus(); up(&pm_sem); if (!error) data->frozen = 1; -- cgit v1.2.2 From f623f0db8e6aa86a37be86167e4ff478821a9f4f Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 25 Sep 2006 23:32:49 -0700 Subject: [PATCH] swsusp: Fix mark_free_pages Clean up mm/page_alloc.c#mark_free_pages() and make it avoid clearing PageNosaveFree for PageNosave pages. This allows us to get rid of an ugly hack in kernel/power/snapshot.c#copy_data_pages(). Additionally, the page-copying loop in copy_data_pages() is moved to an inline function. Signed-off-by: Rafael J. Wysocki Cc: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/snapshot.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 81fe8de9e604..4afb7900002b 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -208,37 +208,36 @@ unsigned int count_data_pages(void) return n; } +static inline void copy_data_page(long *dst, long *src) +{ + int n; + + /* copy_page and memcpy are not usable for copying task structs. */ + for (n = PAGE_SIZE / sizeof(long); n; n--) + *dst++ = *src++; +} + static void copy_data_pages(struct pbe *pblist) { struct zone *zone; unsigned long pfn, max_zone_pfn; - struct pbe *pbe, *p; + struct pbe *pbe; pbe = pblist; for_each_zone (zone) { if (is_highmem(zone)) continue; mark_free_pages(zone); - /* This is necessary for swsusp_free() */ - for_each_pb_page (p, pblist) - SetPageNosaveFree(virt_to_page(p)); - for_each_pbe (p, pblist) - SetPageNosaveFree(virt_to_page(p->address)); max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) { struct page *page = saveable_page(pfn); if (page) { - long *src, *dst; - int n; + void *ptr = page_address(page); BUG_ON(!pbe); - pbe->orig_address = (unsigned long)page_address(page); - /* copy_page and memcpy are not usable for copying task structs. */ - dst = (long *)pbe->address; - src = (long *)pbe->orig_address; - for (n = PAGE_SIZE / sizeof(long); n; n--) - *dst++ = *src++; + copy_data_page((void *)pbe->address, ptr); + pbe->orig_address = (unsigned long)ptr; pbe = pbe->next; } } -- cgit v1.2.2 From f6143aa60ed71e58578bc92cc64d98158a694d99 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 25 Sep 2006 23:32:50 -0700 Subject: [PATCH] swsusp: Reorder memory-allocating functions Move some functions in kernel/power/snapshot.c to a better place (in the same file) and introduce free_image_page() (will be necessary in the future). Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/snapshot.c | 93 ++++++++++++++++++++++++++++--------------------- 1 file changed, 53 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 4afb7900002b..7ad0c0465524 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -156,6 +156,58 @@ static inline int save_highmem(void) {return 0;} static inline int restore_highmem(void) {return 0;} #endif +/** + * @safe_needed - on resume, for storing the PBE list and the image, + * we can only use memory pages that do not conflict with the pages + * used before suspend. + * + * The unsafe pages are marked with the PG_nosave_free flag + * and we count them using unsafe_pages + */ + +static unsigned int unsafe_pages; + +static void *alloc_image_page(gfp_t gfp_mask, int safe_needed) +{ + void *res; + + res = (void *)get_zeroed_page(gfp_mask); + if (safe_needed) + while (res && PageNosaveFree(virt_to_page(res))) { + /* The page is unsafe, mark it for swsusp_free() */ + SetPageNosave(virt_to_page(res)); + unsafe_pages++; + res = (void *)get_zeroed_page(gfp_mask); + } + if (res) { + SetPageNosave(virt_to_page(res)); + SetPageNosaveFree(virt_to_page(res)); + } + return res; +} + +unsigned long get_safe_page(gfp_t gfp_mask) +{ + return (unsigned long)alloc_image_page(gfp_mask, 1); +} + +/** + * free_image_page - free page represented by @addr, allocated with + * alloc_image_page (page flags set by it must be cleared) + */ + +static inline void free_image_page(void *addr, int clear_nosave_free) +{ + ClearPageNosave(virt_to_page(addr)); + if (clear_nosave_free) + ClearPageNosaveFree(virt_to_page(addr)); + free_page((unsigned long)addr); +} + +/** + * pfn_is_nosave - check if given pfn is in the 'nosave' section + */ + static inline int pfn_is_nosave(unsigned long pfn) { unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT; @@ -245,7 +297,6 @@ static void copy_data_pages(struct pbe *pblist) BUG_ON(pbe); } - /** * free_pagedir - free pages allocated with alloc_pagedir() */ @@ -256,10 +307,7 @@ static void free_pagedir(struct pbe *pblist, int clear_nosave_free) while (pblist) { pbe = (pblist + PB_PAGE_SKIP)->next; - ClearPageNosave(virt_to_page(pblist)); - if (clear_nosave_free) - ClearPageNosaveFree(virt_to_page(pblist)); - free_page((unsigned long)pblist); + free_image_page(pblist, clear_nosave_free); pblist = pbe; } } @@ -303,41 +351,6 @@ static inline void create_pbe_list(struct pbe *pblist, unsigned int nr_pages) } } -static unsigned int unsafe_pages; - -/** - * @safe_needed - on resume, for storing the PBE list and the image, - * we can only use memory pages that do not conflict with the pages - * used before suspend. - * - * The unsafe pages are marked with the PG_nosave_free flag - * and we count them using unsafe_pages - */ - -static void *alloc_image_page(gfp_t gfp_mask, int safe_needed) -{ - void *res; - - res = (void *)get_zeroed_page(gfp_mask); - if (safe_needed) - while (res && PageNosaveFree(virt_to_page(res))) { - /* The page is unsafe, mark it for swsusp_free() */ - SetPageNosave(virt_to_page(res)); - unsafe_pages++; - res = (void *)get_zeroed_page(gfp_mask); - } - if (res) { - SetPageNosave(virt_to_page(res)); - SetPageNosaveFree(virt_to_page(res)); - } - return res; -} - -unsigned long get_safe_page(gfp_t gfp_mask) -{ - return (unsigned long)alloc_image_page(gfp_mask, 1); -} - /** * alloc_pagedir - Allocate the page directory. * -- cgit v1.2.2 From cd560bb2f9e2cd451bb3942af43da19632ba4a8e Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 25 Sep 2006 23:32:50 -0700 Subject: [PATCH] swsusp: Fix alloc_pagedir Get rid of the FIXME in kernel/power/snapshot.c#alloc_pagedir() and simplify the functions called by it. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/snapshot.c | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 7ad0c0465524..4ca372f2bc14 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -316,12 +316,12 @@ static void free_pagedir(struct pbe *pblist, int clear_nosave_free) * fill_pb_page - Create a list of PBEs on a given memory page */ -static inline void fill_pb_page(struct pbe *pbpage) +static inline void fill_pb_page(struct pbe *pbpage, unsigned int n) { struct pbe *p; p = pbpage; - pbpage += PB_PAGE_SKIP; + pbpage += n - 1; do p->next = p + 1; while (++p < pbpage); @@ -330,24 +330,26 @@ static inline void fill_pb_page(struct pbe *pbpage) /** * create_pbe_list - Create a list of PBEs on top of a given chain * of memory pages allocated with alloc_pagedir() + * + * This function assumes that pages allocated by alloc_image_page() will + * always be zeroed. */ static inline void create_pbe_list(struct pbe *pblist, unsigned int nr_pages) { - struct pbe *pbpage, *p; + struct pbe *pbpage; unsigned int num = PBES_PER_PAGE; for_each_pb_page (pbpage, pblist) { if (num >= nr_pages) break; - fill_pb_page(pbpage); + fill_pb_page(pbpage, PBES_PER_PAGE); num += PBES_PER_PAGE; } if (pbpage) { - for (num -= PBES_PER_PAGE - 1, p = pbpage; num < nr_pages; p++, num++) - p->next = p + 1; - p->next = NULL; + num -= PBES_PER_PAGE; + fill_pb_page(pbpage, nr_pages - num); } } @@ -374,17 +376,17 @@ static struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, return NULL; pblist = alloc_image_page(gfp_mask, safe_needed); - /* FIXME: rewrite this ugly loop */ - for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages; - pbe = pbe->next, num += PBES_PER_PAGE) { + pbe = pblist; + for (num = PBES_PER_PAGE; num < nr_pages; num += PBES_PER_PAGE) { + if (!pbe) { + free_pagedir(pblist, 1); + return NULL; + } pbe += PB_PAGE_SKIP; pbe->next = alloc_image_page(gfp_mask, safe_needed); + pbe = pbe->next; } - if (!pbe) { /* get_zeroed_page() failed */ - free_pagedir(pblist, 1); - pblist = NULL; - } else - create_pbe_list(pblist, nr_pages); + create_pbe_list(pblist, nr_pages); return pblist; } -- cgit v1.2.2 From 75534b50cc658e951bcb213c2763c81e9f7b0b48 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 25 Sep 2006 23:32:52 -0700 Subject: [PATCH] Change the name of pagedir_nosave The name of the pagedir_nosave variable does not make sense any more, so it seems reasonable to change it to something more meaningful. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/power.h | 2 -- kernel/power/snapshot.c | 26 ++++++++++++++------------ 2 files changed, 14 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/power/power.h b/kernel/power/power.h index 1cefcf87a694..e18ba207e784 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -38,8 +38,6 @@ extern struct subsystem power_subsys; /* References to section boundaries */ extern const void __nosave_begin, __nosave_end; -extern struct pbe *pagedir_nosave; - /* Preferred image size in bytes (default 500 MB) */ extern unsigned long image_size; extern int in_suspend; diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 4ca372f2bc14..1d276b3ae152 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -34,7 +34,9 @@ #include "power.h" -struct pbe *pagedir_nosave; +/* List of PBEs used for creating and restoring the suspend image */ +struct pbe *restore_pblist; + static unsigned int nr_copy_pages; static unsigned int nr_meta_pages; static unsigned long *buffer; @@ -415,7 +417,7 @@ void swsusp_free(void) } nr_copy_pages = 0; nr_meta_pages = 0; - pagedir_nosave = NULL; + restore_pblist = NULL; buffer = NULL; } @@ -490,15 +492,15 @@ asmlinkage int swsusp_save(void) return -ENOMEM; } - pagedir_nosave = swsusp_alloc(nr_pages); - if (!pagedir_nosave) + restore_pblist = swsusp_alloc(nr_pages); + if (!restore_pblist) return -ENOMEM; /* During allocating of suspend pagedir, new cold pages may appear. * Kill them. */ drain_local_pages(); - copy_data_pages(pagedir_nosave); + copy_data_pages(restore_pblist); /* * End of critical section. From now on, we can write to memory, @@ -580,13 +582,13 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count) if (!handle->offset) { init_header((struct swsusp_info *)buffer); handle->buffer = buffer; - handle->pbe = pagedir_nosave; + handle->pbe = restore_pblist; } if (handle->prev < handle->cur) { if (handle->cur <= nr_meta_pages) { handle->pbe = pack_orig_addresses(buffer, handle->pbe); if (!handle->pbe) - handle->pbe = pagedir_nosave; + handle->pbe = restore_pblist; } else { handle->buffer = (void *)handle->pbe->address; handle->pbe = handle->pbe->next; @@ -689,7 +691,7 @@ static int load_header(struct snapshot_handle *handle, pblist = alloc_pagedir(info->image_pages, GFP_ATOMIC, 0); if (!pblist) return -ENOMEM; - pagedir_nosave = pblist; + restore_pblist = pblist; handle->pbe = pblist; nr_copy_pages = info->image_pages; nr_meta_pages = info->pages - info->image_pages - 1; @@ -716,7 +718,7 @@ static inline struct pbe *unpack_orig_addresses(unsigned long *buf, /** * prepare_image - use metadata contained in the PBE list - * pointed to by pagedir_nosave to mark the pages that will + * pointed to by restore_pblist to mark the pages that will * be overwritten in the process of restoring the system * memory state from the image ("unsafe" pages) and allocate * memory for the image @@ -741,7 +743,7 @@ static int prepare_image(struct snapshot_handle *handle) unsigned int nr_pages = nr_copy_pages; struct pbe *p, *pblist = NULL; - p = pagedir_nosave; + p = restore_pblist; error = mark_unsafe_pages(p); if (!error) { pblist = alloc_pagedir(nr_pages, GFP_ATOMIC, 1); @@ -773,7 +775,7 @@ static int prepare_image(struct snapshot_handle *handle) } } if (!error) { - pagedir_nosave = pblist; + restore_pblist = pblist; } else { handle->pbe = NULL; swsusp_free(); @@ -858,7 +860,7 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count) error = prepare_image(handle); if (error) return error; - handle->pbe = pagedir_nosave; + handle->pbe = restore_pblist; handle->last_pbe = NULL; handle->buffer = get_buffer(handle); handle->sync_read = 0; -- cgit v1.2.2 From 0bcd888d64684f896ffa70c1d16a42b00753c184 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 25 Sep 2006 23:32:52 -0700 Subject: [PATCH] swsusp: Introduce some helpful constants Introduce some constants that hopefully will help improve the readability of code in kernel/power/snapshot.c. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/snapshot.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 1d276b3ae152..d0d691f976d8 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -167,6 +167,11 @@ static inline int restore_highmem(void) {return 0;} * and we count them using unsafe_pages */ +#define PG_ANY 0 +#define PG_SAFE 1 +#define PG_UNSAFE_CLEAR 1 +#define PG_UNSAFE_KEEP 0 + static unsigned int unsafe_pages; static void *alloc_image_page(gfp_t gfp_mask, int safe_needed) @@ -190,7 +195,7 @@ static void *alloc_image_page(gfp_t gfp_mask, int safe_needed) unsigned long get_safe_page(gfp_t gfp_mask) { - return (unsigned long)alloc_image_page(gfp_mask, 1); + return (unsigned long)alloc_image_page(gfp_mask, PG_SAFE); } /** @@ -381,7 +386,7 @@ static struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, pbe = pblist; for (num = PBES_PER_PAGE; num < nr_pages; num += PBES_PER_PAGE) { if (!pbe) { - free_pagedir(pblist, 1); + free_pagedir(pblist, PG_UNSAFE_CLEAR); return NULL; } pbe += PB_PAGE_SKIP; @@ -458,12 +463,13 @@ static struct pbe *swsusp_alloc(unsigned int nr_pages) { struct pbe *pblist; - if (!(pblist = alloc_pagedir(nr_pages, GFP_ATOMIC | __GFP_COLD, 0))) { + pblist = alloc_pagedir(nr_pages, GFP_ATOMIC | __GFP_COLD, PG_ANY); + if (!pblist) { printk(KERN_ERR "suspend: Allocating pagedir failed.\n"); return NULL; } - if (alloc_data_pages(pblist, GFP_ATOMIC | __GFP_COLD, 0)) { + if (alloc_data_pages(pblist, GFP_ATOMIC | __GFP_COLD, PG_ANY)) { printk(KERN_ERR "suspend: Allocating image pages failed.\n"); swsusp_free(); return NULL; @@ -575,7 +581,7 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count) return 0; if (!buffer) { /* This makes the buffer be freed by swsusp_free() */ - buffer = alloc_image_page(GFP_ATOMIC, 0); + buffer = alloc_image_page(GFP_ATOMIC, PG_ANY); if (!buffer) return -ENOMEM; } @@ -688,7 +694,7 @@ static int load_header(struct snapshot_handle *handle, error = check_header(info); if (!error) { - pblist = alloc_pagedir(info->image_pages, GFP_ATOMIC, 0); + pblist = alloc_pagedir(info->image_pages, GFP_ATOMIC, PG_ANY); if (!pblist) return -ENOMEM; restore_pblist = pblist; @@ -746,10 +752,10 @@ static int prepare_image(struct snapshot_handle *handle) p = restore_pblist; error = mark_unsafe_pages(p); if (!error) { - pblist = alloc_pagedir(nr_pages, GFP_ATOMIC, 1); + pblist = alloc_pagedir(nr_pages, GFP_ATOMIC, PG_SAFE); if (pblist) copy_page_backup_list(pblist, p); - free_pagedir(p, 0); + free_pagedir(p, PG_UNSAFE_KEEP); if (!pblist) error = -ENOMEM; } @@ -840,7 +846,7 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count) return 0; if (!buffer) { /* This makes the buffer be freed by swsusp_free() */ - buffer = alloc_image_page(GFP_ATOMIC, 0); + buffer = alloc_image_page(GFP_ATOMIC, PG_ANY); if (!buffer) return -ENOMEM; } -- cgit v1.2.2 From b788db79896ef2a5817b9395ad63573b254a6d93 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 25 Sep 2006 23:32:54 -0700 Subject: [PATCH] swsusp: Introduce memory bitmaps Introduce the memory bitmap data structure and make swsusp use in the suspend phase. The current swsusp's internal data structure is not very efficient from the memory usage point of view, so it seems reasonable to replace it with a data structure that will require less memory, such as a pair of bitmaps. The idea is to use bitmaps that may be allocated as sets of individual pages, so that we can avoid making allocations of order greater than 0. For this reason the memory bitmap structure consists of several linked lists of objects that contain pointers to memory pages with the actual bitmap data. Still, for a typical system all of these lists fit in a single page, so it's reasonable to introduce an additional mechanism allowing us to allocate all of them efficiently without sacrificing the generality of the design. This is done with the help of the chain_allocator structure and associated functions. We need to use two memory bitmaps during the suspend phase of the suspend-resume cycle. One of them is necessary for marking the saveable pages, and the second is used to mark the pages in which to store the copies of them (aka image pages). First, the bitmaps are created and we allocate as many image pages as needed (the corresponding bits in the second bitmap are set as soon as the pages are allocated). Second, the bits corresponding to the saveable pages are set in the first bitmap and the saveable pages are copied to the image pages. Finally, the first bitmap is used to save the kernel virtual addresses of the saveable pages and the second one is used to save the contents of the image pages. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/power.h | 3 +- kernel/power/snapshot.c | 602 ++++++++++++++++++++++++++++++++++++++++++------ kernel/power/swsusp.c | 5 +- 3 files changed, 540 insertions(+), 70 deletions(-) (limited to 'kernel') diff --git a/kernel/power/power.h b/kernel/power/power.h index e18ba207e784..6e9e2acc34f8 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -109,9 +109,10 @@ struct snapshot_handle { */ #define data_of(handle) ((handle).buffer + (handle).buf_offset) +extern unsigned int snapshot_additional_pages(struct zone *zone); extern int snapshot_read_next(struct snapshot_handle *handle, size_t count); extern int snapshot_write_next(struct snapshot_handle *handle, size_t count); -int snapshot_image_loaded(struct snapshot_handle *handle); +extern int snapshot_image_loaded(struct snapshot_handle *handle); #define SNAPSHOT_IOC_MAGIC '3' #define SNAPSHOT_FREEZE _IO(SNAPSHOT_IOC_MAGIC, 1) diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index d0d691f976d8..852e0df41719 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -211,6 +211,467 @@ static inline void free_image_page(void *addr, int clear_nosave_free) free_page((unsigned long)addr); } +/* struct linked_page is used to build chains of pages */ + +#define LINKED_PAGE_DATA_SIZE (PAGE_SIZE - sizeof(void *)) + +struct linked_page { + struct linked_page *next; + char data[LINKED_PAGE_DATA_SIZE]; +} __attribute__((packed)); + +static inline void +free_list_of_pages(struct linked_page *list, int clear_page_nosave) +{ + while (list) { + struct linked_page *lp = list->next; + + free_image_page(list, clear_page_nosave); + list = lp; + } +} + +/** + * struct chain_allocator is used for allocating small objects out of + * a linked list of pages called 'the chain'. + * + * The chain grows each time when there is no room for a new object in + * the current page. The allocated objects cannot be freed individually. + * It is only possible to free them all at once, by freeing the entire + * chain. + * + * NOTE: The chain allocator may be inefficient if the allocated objects + * are not much smaller than PAGE_SIZE. + */ + +struct chain_allocator { + struct linked_page *chain; /* the chain */ + unsigned int used_space; /* total size of objects allocated out + * of the current page + */ + gfp_t gfp_mask; /* mask for allocating pages */ + int safe_needed; /* if set, only "safe" pages are allocated */ +}; + +static void +chain_init(struct chain_allocator *ca, gfp_t gfp_mask, int safe_needed) +{ + ca->chain = NULL; + ca->used_space = LINKED_PAGE_DATA_SIZE; + ca->gfp_mask = gfp_mask; + ca->safe_needed = safe_needed; +} + +static void *chain_alloc(struct chain_allocator *ca, unsigned int size) +{ + void *ret; + + if (LINKED_PAGE_DATA_SIZE - ca->used_space < size) { + struct linked_page *lp; + + lp = alloc_image_page(ca->gfp_mask, ca->safe_needed); + if (!lp) + return NULL; + + lp->next = ca->chain; + ca->chain = lp; + ca->used_space = 0; + } + ret = ca->chain->data + ca->used_space; + ca->used_space += size; + return ret; +} + +static void chain_free(struct chain_allocator *ca, int clear_page_nosave) +{ + free_list_of_pages(ca->chain, clear_page_nosave); + memset(ca, 0, sizeof(struct chain_allocator)); +} + +/** + * Data types related to memory bitmaps. + * + * Memory bitmap is a structure consiting of many linked lists of + * objects. The main list's elements are of type struct zone_bitmap + * and each of them corresonds to one zone. For each zone bitmap + * object there is a list of objects of type struct bm_block that + * represent each blocks of bit chunks in which information is + * stored. + * + * struct memory_bitmap contains a pointer to the main list of zone + * bitmap objects, a struct bm_position used for browsing the bitmap, + * and a pointer to the list of pages used for allocating all of the + * zone bitmap objects and bitmap block objects. + * + * NOTE: It has to be possible to lay out the bitmap in memory + * using only allocations of order 0. Additionally, the bitmap is + * designed to work with arbitrary number of zones (this is over the + * top for now, but let's avoid making unnecessary assumptions ;-). + * + * struct zone_bitmap contains a pointer to a list of bitmap block + * objects and a pointer to the bitmap block object that has been + * most recently used for setting bits. Additionally, it contains the + * pfns that correspond to the start and end of the represented zone. + * + * struct bm_block contains a pointer to the memory page in which + * information is stored (in the form of a block of bit chunks + * of type unsigned long each). It also contains the pfns that + * correspond to the start and end of the represented memory area and + * the number of bit chunks in the block. + * + * NOTE: Memory bitmaps are used for two types of operations only: + * "set a bit" and "find the next bit set". Moreover, the searching + * is always carried out after all of the "set a bit" operations + * on given bitmap. + */ + +#define BM_END_OF_MAP (~0UL) + +#define BM_CHUNKS_PER_BLOCK (PAGE_SIZE / sizeof(long)) +#define BM_BITS_PER_CHUNK (sizeof(long) << 3) +#define BM_BITS_PER_BLOCK (PAGE_SIZE << 3) + +struct bm_block { + struct bm_block *next; /* next element of the list */ + unsigned long start_pfn; /* pfn represented by the first bit */ + unsigned long end_pfn; /* pfn represented by the last bit plus 1 */ + unsigned int size; /* number of bit chunks */ + unsigned long *data; /* chunks of bits representing pages */ +}; + +struct zone_bitmap { + struct zone_bitmap *next; /* next element of the list */ + unsigned long start_pfn; /* minimal pfn in this zone */ + unsigned long end_pfn; /* maximal pfn in this zone plus 1 */ + struct bm_block *bm_blocks; /* list of bitmap blocks */ + struct bm_block *cur_block; /* recently used bitmap block */ +}; + +/* strcut bm_position is used for browsing memory bitmaps */ + +struct bm_position { + struct zone_bitmap *zone_bm; + struct bm_block *block; + int chunk; + int bit; +}; + +struct memory_bitmap { + struct zone_bitmap *zone_bm_list; /* list of zone bitmaps */ + struct linked_page *p_list; /* list of pages used to store zone + * bitmap objects and bitmap block + * objects + */ + struct bm_position cur; /* most recently used bit position */ +}; + +/* Functions that operate on memory bitmaps */ + +static inline void memory_bm_reset_chunk(struct memory_bitmap *bm) +{ + bm->cur.chunk = 0; + bm->cur.bit = -1; +} + +static void memory_bm_position_reset(struct memory_bitmap *bm) +{ + struct zone_bitmap *zone_bm; + + zone_bm = bm->zone_bm_list; + bm->cur.zone_bm = zone_bm; + bm->cur.block = zone_bm->bm_blocks; + memory_bm_reset_chunk(bm); +} + +static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free); + +/** + * create_bm_block_list - create a list of block bitmap objects + */ + +static inline struct bm_block * +create_bm_block_list(unsigned int nr_blocks, struct chain_allocator *ca) +{ + struct bm_block *bblist = NULL; + + while (nr_blocks-- > 0) { + struct bm_block *bb; + + bb = chain_alloc(ca, sizeof(struct bm_block)); + if (!bb) + return NULL; + + bb->next = bblist; + bblist = bb; + } + return bblist; +} + +/** + * create_zone_bm_list - create a list of zone bitmap objects + */ + +static inline struct zone_bitmap * +create_zone_bm_list(unsigned int nr_zones, struct chain_allocator *ca) +{ + struct zone_bitmap *zbmlist = NULL; + + while (nr_zones-- > 0) { + struct zone_bitmap *zbm; + + zbm = chain_alloc(ca, sizeof(struct zone_bitmap)); + if (!zbm) + return NULL; + + zbm->next = zbmlist; + zbmlist = zbm; + } + return zbmlist; +} + +/** + * memory_bm_create - allocate memory for a memory bitmap + */ + +static int +memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) +{ + struct chain_allocator ca; + struct zone *zone; + struct zone_bitmap *zone_bm; + struct bm_block *bb; + unsigned int nr; + + chain_init(&ca, gfp_mask, safe_needed); + + /* Compute the number of zones */ + nr = 0; + for_each_zone (zone) + if (populated_zone(zone) && !is_highmem(zone)) + nr++; + + /* Allocate the list of zones bitmap objects */ + zone_bm = create_zone_bm_list(nr, &ca); + bm->zone_bm_list = zone_bm; + if (!zone_bm) { + chain_free(&ca, PG_UNSAFE_CLEAR); + return -ENOMEM; + } + + /* Initialize the zone bitmap objects */ + for_each_zone (zone) { + unsigned long pfn; + + if (!populated_zone(zone) || is_highmem(zone)) + continue; + + zone_bm->start_pfn = zone->zone_start_pfn; + zone_bm->end_pfn = zone->zone_start_pfn + zone->spanned_pages; + /* Allocate the list of bitmap block objects */ + nr = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK); + bb = create_bm_block_list(nr, &ca); + zone_bm->bm_blocks = bb; + zone_bm->cur_block = bb; + if (!bb) + goto Free; + + nr = zone->spanned_pages; + pfn = zone->zone_start_pfn; + /* Initialize the bitmap block objects */ + while (bb) { + unsigned long *ptr; + + ptr = alloc_image_page(gfp_mask, safe_needed); + bb->data = ptr; + if (!ptr) + goto Free; + + bb->start_pfn = pfn; + if (nr >= BM_BITS_PER_BLOCK) { + pfn += BM_BITS_PER_BLOCK; + bb->size = BM_CHUNKS_PER_BLOCK; + nr -= BM_BITS_PER_BLOCK; + } else { + /* This is executed only once in the loop */ + pfn += nr; + bb->size = DIV_ROUND_UP(nr, BM_BITS_PER_CHUNK); + } + bb->end_pfn = pfn; + bb = bb->next; + } + zone_bm = zone_bm->next; + } + bm->p_list = ca.chain; + memory_bm_position_reset(bm); + return 0; + +Free: + bm->p_list = ca.chain; + memory_bm_free(bm, PG_UNSAFE_CLEAR); + return -ENOMEM; +} + +/** + * memory_bm_free - free memory occupied by the memory bitmap @bm + */ + +static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free) +{ + struct zone_bitmap *zone_bm; + + /* Free the list of bit blocks for each zone_bitmap object */ + zone_bm = bm->zone_bm_list; + while (zone_bm) { + struct bm_block *bb; + + bb = zone_bm->bm_blocks; + while (bb) { + if (bb->data) + free_image_page(bb->data, clear_nosave_free); + bb = bb->next; + } + zone_bm = zone_bm->next; + } + free_list_of_pages(bm->p_list, clear_nosave_free); + bm->zone_bm_list = NULL; +} + +/** + * memory_bm_set_bit - set the bit in the bitmap @bm that corresponds + * to given pfn. The cur_zone_bm member of @bm and the cur_block member + * of @bm->cur_zone_bm are updated. + * + * If the bit cannot be set, the function returns -EINVAL . + */ + +static int +memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn) +{ + struct zone_bitmap *zone_bm; + struct bm_block *bb; + + /* Check if the pfn is from the current zone */ + zone_bm = bm->cur.zone_bm; + if (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) { + zone_bm = bm->zone_bm_list; + /* We don't assume that the zones are sorted by pfns */ + while (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) { + zone_bm = zone_bm->next; + if (unlikely(!zone_bm)) + return -EINVAL; + } + bm->cur.zone_bm = zone_bm; + } + /* Check if the pfn corresponds to the current bitmap block */ + bb = zone_bm->cur_block; + if (pfn < bb->start_pfn) + bb = zone_bm->bm_blocks; + + while (pfn >= bb->end_pfn) { + bb = bb->next; + if (unlikely(!bb)) + return -EINVAL; + } + zone_bm->cur_block = bb; + pfn -= bb->start_pfn; + set_bit(pfn % BM_BITS_PER_CHUNK, bb->data + pfn / BM_BITS_PER_CHUNK); + return 0; +} + +/* Two auxiliary functions for memory_bm_next_pfn */ + +/* Find the first set bit in the given chunk, if there is one */ + +static inline int next_bit_in_chunk(int bit, unsigned long *chunk_p) +{ + bit++; + while (bit < BM_BITS_PER_CHUNK) { + if (test_bit(bit, chunk_p)) + return bit; + + bit++; + } + return -1; +} + +/* Find a chunk containing some bits set in given block of bits */ + +static inline int next_chunk_in_block(int n, struct bm_block *bb) +{ + n++; + while (n < bb->size) { + if (bb->data[n]) + return n; + + n++; + } + return -1; +} + +/** + * memory_bm_next_pfn - find the pfn that corresponds to the next set bit + * in the bitmap @bm. If the pfn cannot be found, BM_END_OF_MAP is + * returned. + * + * It is required to run memory_bm_position_reset() before the first call to + * this function. + */ + +static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm) +{ + struct zone_bitmap *zone_bm; + struct bm_block *bb; + int chunk; + int bit; + + do { + bb = bm->cur.block; + do { + chunk = bm->cur.chunk; + bit = bm->cur.bit; + do { + bit = next_bit_in_chunk(bit, bb->data + chunk); + if (bit >= 0) + goto Return_pfn; + + chunk = next_chunk_in_block(chunk, bb); + bit = -1; + } while (chunk >= 0); + bb = bb->next; + bm->cur.block = bb; + memory_bm_reset_chunk(bm); + } while (bb); + zone_bm = bm->cur.zone_bm->next; + if (zone_bm) { + bm->cur.zone_bm = zone_bm; + bm->cur.block = zone_bm->bm_blocks; + memory_bm_reset_chunk(bm); + } + } while (zone_bm); + memory_bm_position_reset(bm); + return BM_END_OF_MAP; + +Return_pfn: + bm->cur.chunk = chunk; + bm->cur.bit = bit; + return bb->start_pfn + chunk * BM_BITS_PER_CHUNK + bit; +} + +/** + * snapshot_additional_pages - estimate the number of additional pages + * be needed for setting up the suspend image data structures for given + * zone (usually the returned value is greater than the exact number) + */ + +unsigned int snapshot_additional_pages(struct zone *zone) +{ + unsigned int res; + + res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK); + res += DIV_ROUND_UP(res * sizeof(struct bm_block), PAGE_SIZE); + return res; +} + /** * pfn_is_nosave - check if given pfn is in the 'nosave' section */ @@ -276,32 +737,38 @@ static inline void copy_data_page(long *dst, long *src) *dst++ = *src++; } -static void copy_data_pages(struct pbe *pblist) +static void +copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm) { struct zone *zone; - unsigned long pfn, max_zone_pfn; - struct pbe *pbe; + unsigned long pfn; - pbe = pblist; for_each_zone (zone) { + unsigned long max_zone_pfn; + if (is_highmem(zone)) continue; + mark_free_pages(zone); max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; - for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) { - struct page *page = saveable_page(pfn); - - if (page) { - void *ptr = page_address(page); - - BUG_ON(!pbe); - copy_data_page((void *)pbe->address, ptr); - pbe->orig_address = (unsigned long)ptr; - pbe = pbe->next; - } - } + for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) + if (saveable_page(pfn)) + memory_bm_set_bit(orig_bm, pfn); } - BUG_ON(pbe); + memory_bm_position_reset(orig_bm); + memory_bm_position_reset(copy_bm); + do { + pfn = memory_bm_next_pfn(orig_bm); + if (likely(pfn != BM_END_OF_MAP)) { + struct page *page; + void *src; + + page = pfn_to_page(pfn); + src = page_address(page); + page = pfn_to_page(memory_bm_next_pfn(copy_bm)); + copy_data_page(page_address(page), src); + } + } while (pfn != BM_END_OF_MAP); } /** @@ -447,37 +914,43 @@ static int enough_free_mem(unsigned int nr_pages) (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE); } -static int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed) +static int +swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, + unsigned int nr_pages) { - struct pbe *p; + int error; - for_each_pbe (p, pblist) { - p->address = (unsigned long)alloc_image_page(gfp_mask, safe_needed); - if (!p->address) - return -ENOMEM; - } - return 0; -} + error = memory_bm_create(orig_bm, GFP_ATOMIC | __GFP_COLD, PG_ANY); + if (error) + goto Free; -static struct pbe *swsusp_alloc(unsigned int nr_pages) -{ - struct pbe *pblist; + error = memory_bm_create(copy_bm, GFP_ATOMIC | __GFP_COLD, PG_ANY); + if (error) + goto Free; - pblist = alloc_pagedir(nr_pages, GFP_ATOMIC | __GFP_COLD, PG_ANY); - if (!pblist) { - printk(KERN_ERR "suspend: Allocating pagedir failed.\n"); - return NULL; - } + while (nr_pages-- > 0) { + struct page *page = alloc_page(GFP_ATOMIC | __GFP_COLD); + if (!page) + goto Free; - if (alloc_data_pages(pblist, GFP_ATOMIC | __GFP_COLD, PG_ANY)) { - printk(KERN_ERR "suspend: Allocating image pages failed.\n"); - swsusp_free(); - return NULL; + SetPageNosave(page); + SetPageNosaveFree(page); + memory_bm_set_bit(copy_bm, page_to_pfn(page)); } + return 0; - return pblist; +Free: + swsusp_free(); + return -ENOMEM; } +/* Memory bitmap used for marking saveable pages */ +static struct memory_bitmap orig_bm; +/* Memory bitmap used for marking allocated pages that will contain the copies + * of saveable pages + */ +static struct memory_bitmap copy_bm; + asmlinkage int swsusp_save(void) { unsigned int nr_pages; @@ -498,15 +971,14 @@ asmlinkage int swsusp_save(void) return -ENOMEM; } - restore_pblist = swsusp_alloc(nr_pages); - if (!restore_pblist) + if (swsusp_alloc(&orig_bm, ©_bm, nr_pages)) return -ENOMEM; /* During allocating of suspend pagedir, new cold pages may appear. * Kill them. */ drain_local_pages(); - copy_data_pages(restore_pblist); + copy_data_pages(©_bm, &orig_bm); /* * End of critical section. From now on, we can write to memory, @@ -535,22 +1007,23 @@ static void init_header(struct swsusp_info *info) } /** - * pack_orig_addresses - the .orig_address fields of the PBEs from the - * list starting at @pbe are stored in the array @buf[] (1 page) + * pack_addresses - the addresses corresponding to pfns found in the + * bitmap @bm are stored in the array @buf[] (1 page) */ -static inline struct pbe *pack_orig_addresses(unsigned long *buf, struct pbe *pbe) +static inline void +pack_addresses(unsigned long *buf, struct memory_bitmap *bm) { int j; - for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) { - buf[j] = pbe->orig_address; - pbe = pbe->next; + for (j = 0; j < PAGE_SIZE / sizeof(long); j++) { + unsigned long pfn = memory_bm_next_pfn(bm); + + if (unlikely(pfn == BM_END_OF_MAP)) + break; + + buf[j] = (unsigned long)page_address(pfn_to_page(pfn)); } - if (!pbe) - for (; j < PAGE_SIZE / sizeof(long); j++) - buf[j] = 0; - return pbe; } /** @@ -579,6 +1052,7 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count) { if (handle->cur > nr_meta_pages + nr_copy_pages) return 0; + if (!buffer) { /* This makes the buffer be freed by swsusp_free() */ buffer = alloc_image_page(GFP_ATOMIC, PG_ANY); @@ -588,16 +1062,17 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count) if (!handle->offset) { init_header((struct swsusp_info *)buffer); handle->buffer = buffer; - handle->pbe = restore_pblist; + memory_bm_position_reset(&orig_bm); + memory_bm_position_reset(©_bm); } if (handle->prev < handle->cur) { if (handle->cur <= nr_meta_pages) { - handle->pbe = pack_orig_addresses(buffer, handle->pbe); - if (!handle->pbe) - handle->pbe = restore_pblist; + memset(buffer, 0, PAGE_SIZE); + pack_addresses(buffer, &orig_bm); } else { - handle->buffer = (void *)handle->pbe->address; - handle->pbe = handle->pbe->next; + unsigned long pfn = memory_bm_next_pfn(©_bm); + + handle->buffer = page_address(pfn_to_page(pfn)); } handle->prev = handle->cur; } @@ -736,12 +1211,7 @@ static inline struct pbe *unpack_orig_addresses(unsigned long *buf, * of "safe" which will be used later */ -struct safe_page { - struct safe_page *next; - char padding[PAGE_SIZE - sizeof(void *)]; -}; - -static struct safe_page *safe_pages; +static struct linked_page *safe_pages; static int prepare_image(struct snapshot_handle *handle) { @@ -763,9 +1233,9 @@ static int prepare_image(struct snapshot_handle *handle) if (!error && nr_pages > unsafe_pages) { nr_pages -= unsafe_pages; while (nr_pages--) { - struct safe_page *ptr; + struct linked_page *ptr; - ptr = (struct safe_page *)get_zeroed_page(GFP_ATOMIC); + ptr = (void *)get_zeroed_page(GFP_ATOMIC); if (!ptr) { error = -ENOMEM; break; diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index 17f669c83012..8ef677ea0cea 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -193,14 +193,13 @@ int swsusp_shrink_memory(void) printk("Shrinking memory... "); do { size = 2 * count_highmem_pages(); - size += size / 50 + count_data_pages(); - size += (size + PBES_PER_PAGE - 1) / PBES_PER_PAGE + - PAGES_FOR_IO; + size += size / 50 + count_data_pages() + PAGES_FOR_IO; tmp = size; for_each_zone (zone) if (!is_highmem(zone) && populated_zone(zone)) { tmp -= zone->free_pages; tmp += zone->lowmem_reserve[ZONE_NORMAL]; + tmp += snapshot_additional_pages(zone); } if (tmp > 0) { tmp = __shrink_memory(tmp); -- cgit v1.2.2 From 940864ddabdb180e02041c4dcd46ba6f9eee732f Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 25 Sep 2006 23:32:55 -0700 Subject: [PATCH] swsusp: Use memory bitmaps during resume Make swsusp use memory bitmaps to store its internal information during the resume phase of the suspend-resume cycle. If the pfns of saveable pages are saved during the suspend phase instead of the kernel virtual addresses of these pages, we can use them during the resume phase directly to set the corresponding bits in a memory bitmap. Then, this bitmap is used to mark the page frames corresponding to the pages that were saveable before the suspend (aka "unsafe" page frames). Next, we allocate as many page frames as needed to store the entire suspend image and make sure that there will be some extra free "safe" page frames for the list of PBEs constructed later. Subsequently, the image is loaded and, if possible, the data loaded from it are written into their "original" page frames (ie. the ones they had occupied before the suspend). The image data that cannot be written into their "original" page frames are loaded into "safe" page frames and their "original" kernel virtual addresses, as well as the addresses of the "safe" pages containing their copies, are stored in a list of PBEs. Finally, the list of PBEs is used to copy the remaining image data into their "original" page frames (this is done atomically, by the architecture-dependent parts of swsusp). Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/power.h | 11 +- kernel/power/snapshot.c | 416 +++++++++++++++++++++--------------------------- kernel/power/swap.c | 4 +- kernel/power/user.c | 1 + 4 files changed, 186 insertions(+), 246 deletions(-) (limited to 'kernel') diff --git a/kernel/power/power.h b/kernel/power/power.h index 6e9e2acc34f8..bfe999f7b272 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -81,16 +81,6 @@ struct snapshot_handle { unsigned int prev; /* number of the block of PAGE_SIZE bytes that * was the current one previously */ - struct pbe *pbe; /* PBE that corresponds to 'buffer' */ - struct pbe *last_pbe; /* When the image is restored (eg. read - * from disk) we can store some image - * data directly in the page frames - * in which they were before suspend. - * In such a case the PBEs that - * correspond to them will be unused. - * This is the last PBE, so far, that - * does not correspond to such data. - */ void *buffer; /* address of the block to read from * or write to */ @@ -113,6 +103,7 @@ extern unsigned int snapshot_additional_pages(struct zone *zone); extern int snapshot_read_next(struct snapshot_handle *handle, size_t count); extern int snapshot_write_next(struct snapshot_handle *handle, size_t count); extern int snapshot_image_loaded(struct snapshot_handle *handle); +extern void snapshot_free_unused_memory(struct snapshot_handle *handle); #define SNAPSHOT_IOC_MAGIC '3' #define SNAPSHOT_FREEZE _IO(SNAPSHOT_IOC_MAGIC, 1) diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 852e0df41719..1b84313cbab5 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -39,7 +39,7 @@ struct pbe *restore_pblist; static unsigned int nr_copy_pages; static unsigned int nr_meta_pages; -static unsigned long *buffer; +static void *buffer; #ifdef CONFIG_HIGHMEM unsigned int count_highmem_pages(void) @@ -172,7 +172,7 @@ static inline int restore_highmem(void) {return 0;} #define PG_UNSAFE_CLEAR 1 #define PG_UNSAFE_KEEP 0 -static unsigned int unsafe_pages; +static unsigned int allocated_unsafe_pages; static void *alloc_image_page(gfp_t gfp_mask, int safe_needed) { @@ -183,7 +183,7 @@ static void *alloc_image_page(gfp_t gfp_mask, int safe_needed) while (res && PageNosaveFree(virt_to_page(res))) { /* The page is unsafe, mark it for swsusp_free() */ SetPageNosave(virt_to_page(res)); - unsafe_pages++; + allocated_unsafe_pages++; res = (void *)get_zeroed_page(gfp_mask); } if (res) { @@ -772,101 +772,10 @@ copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm) } /** - * free_pagedir - free pages allocated with alloc_pagedir() - */ - -static void free_pagedir(struct pbe *pblist, int clear_nosave_free) -{ - struct pbe *pbe; - - while (pblist) { - pbe = (pblist + PB_PAGE_SKIP)->next; - free_image_page(pblist, clear_nosave_free); - pblist = pbe; - } -} - -/** - * fill_pb_page - Create a list of PBEs on a given memory page - */ - -static inline void fill_pb_page(struct pbe *pbpage, unsigned int n) -{ - struct pbe *p; - - p = pbpage; - pbpage += n - 1; - do - p->next = p + 1; - while (++p < pbpage); -} - -/** - * create_pbe_list - Create a list of PBEs on top of a given chain - * of memory pages allocated with alloc_pagedir() + * swsusp_free - free pages allocated for the suspend. * - * This function assumes that pages allocated by alloc_image_page() will - * always be zeroed. - */ - -static inline void create_pbe_list(struct pbe *pblist, unsigned int nr_pages) -{ - struct pbe *pbpage; - unsigned int num = PBES_PER_PAGE; - - for_each_pb_page (pbpage, pblist) { - if (num >= nr_pages) - break; - - fill_pb_page(pbpage, PBES_PER_PAGE); - num += PBES_PER_PAGE; - } - if (pbpage) { - num -= PBES_PER_PAGE; - fill_pb_page(pbpage, nr_pages - num); - } -} - -/** - * alloc_pagedir - Allocate the page directory. - * - * First, determine exactly how many pages we need and - * allocate them. - * - * We arrange the pages in a chain: each page is an array of PBES_PER_PAGE - * struct pbe elements (pbes) and the last element in the page points - * to the next page. - * - * On each page we set up a list of struct_pbe elements. - */ - -static struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, - int safe_needed) -{ - unsigned int num; - struct pbe *pblist, *pbe; - - if (!nr_pages) - return NULL; - - pblist = alloc_image_page(gfp_mask, safe_needed); - pbe = pblist; - for (num = PBES_PER_PAGE; num < nr_pages; num += PBES_PER_PAGE) { - if (!pbe) { - free_pagedir(pblist, PG_UNSAFE_CLEAR); - return NULL; - } - pbe += PB_PAGE_SKIP; - pbe->next = alloc_image_page(gfp_mask, safe_needed); - pbe = pbe->next; - } - create_pbe_list(pblist, nr_pages); - return pblist; -} - -/** - * Free pages we allocated for suspend. Suspend pages are alocated - * before atomic copy, so we need to free them after resume. + * Suspend pages are alocated before the atomic copy is made, so we + * need to release them after the resume. */ void swsusp_free(void) @@ -904,14 +813,18 @@ void swsusp_free(void) static int enough_free_mem(unsigned int nr_pages) { struct zone *zone; - unsigned int n = 0; + unsigned int free = 0, meta = 0; for_each_zone (zone) - if (!is_highmem(zone)) - n += zone->free_pages; - pr_debug("swsusp: available memory: %u pages\n", n); - return n > (nr_pages + PAGES_FOR_IO + - (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE); + if (!is_highmem(zone)) { + free += zone->free_pages; + meta += snapshot_additional_pages(zone); + } + + pr_debug("swsusp: pages needed: %u + %u + %u, available pages: %u\n", + nr_pages, PAGES_FOR_IO, meta, free); + + return free > nr_pages + PAGES_FOR_IO + meta; } static int @@ -961,11 +874,6 @@ asmlinkage int swsusp_save(void) nr_pages = count_data_pages(); printk("swsusp: Need to copy %u pages\n", nr_pages); - pr_debug("swsusp: pages needed: %u + %lu + %u, free: %u\n", - nr_pages, - (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE, - PAGES_FOR_IO, nr_free_pages()); - if (!enough_free_mem(nr_pages)) { printk(KERN_ERR "swsusp: Not enough free memory\n"); return -ENOMEM; @@ -1007,22 +915,19 @@ static void init_header(struct swsusp_info *info) } /** - * pack_addresses - the addresses corresponding to pfns found in the - * bitmap @bm are stored in the array @buf[] (1 page) + * pack_pfns - pfns corresponding to the set bits found in the bitmap @bm + * are stored in the array @buf[] (1 page at a time) */ static inline void -pack_addresses(unsigned long *buf, struct memory_bitmap *bm) +pack_pfns(unsigned long *buf, struct memory_bitmap *bm) { int j; for (j = 0; j < PAGE_SIZE / sizeof(long); j++) { - unsigned long pfn = memory_bm_next_pfn(bm); - - if (unlikely(pfn == BM_END_OF_MAP)) + buf[j] = memory_bm_next_pfn(bm); + if (unlikely(buf[j] == BM_END_OF_MAP)) break; - - buf[j] = (unsigned long)page_address(pfn_to_page(pfn)); } } @@ -1068,7 +973,7 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count) if (handle->prev < handle->cur) { if (handle->cur <= nr_meta_pages) { memset(buffer, 0, PAGE_SIZE); - pack_addresses(buffer, &orig_bm); + pack_pfns(buffer, &orig_bm); } else { unsigned long pfn = memory_bm_next_pfn(©_bm); @@ -1094,14 +999,10 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count) * had been used before suspend */ -static int mark_unsafe_pages(struct pbe *pblist) +static int mark_unsafe_pages(struct memory_bitmap *bm) { struct zone *zone; unsigned long pfn, max_zone_pfn; - struct pbe *p; - - if (!pblist) /* a sanity check */ - return -EINVAL; /* Clear page flags */ for_each_zone (zone) { @@ -1111,30 +1012,37 @@ static int mark_unsafe_pages(struct pbe *pblist) ClearPageNosaveFree(pfn_to_page(pfn)); } - /* Mark orig addresses */ - for_each_pbe (p, pblist) { - if (virt_addr_valid(p->orig_address)) - SetPageNosaveFree(virt_to_page(p->orig_address)); - else - return -EFAULT; - } + /* Mark pages that correspond to the "original" pfns as "unsafe" */ + memory_bm_position_reset(bm); + do { + pfn = memory_bm_next_pfn(bm); + if (likely(pfn != BM_END_OF_MAP)) { + if (likely(pfn_valid(pfn))) + SetPageNosaveFree(pfn_to_page(pfn)); + else + return -EFAULT; + } + } while (pfn != BM_END_OF_MAP); - unsafe_pages = 0; + allocated_unsafe_pages = 0; return 0; } -static void copy_page_backup_list(struct pbe *dst, struct pbe *src) +static void +duplicate_memory_bitmap(struct memory_bitmap *dst, struct memory_bitmap *src) { - /* We assume both lists contain the same number of elements */ - while (src) { - dst->orig_address = src->orig_address; - dst = dst->next; - src = src->next; + unsigned long pfn; + + memory_bm_position_reset(src); + pfn = memory_bm_next_pfn(src); + while (pfn != BM_END_OF_MAP) { + memory_bm_set_bit(dst, pfn); + pfn = memory_bm_next_pfn(src); } } -static int check_header(struct swsusp_info *info) +static inline int check_header(struct swsusp_info *info) { char *reason = NULL; @@ -1161,19 +1069,14 @@ static int check_header(struct swsusp_info *info) * load header - check the image header and copy data from it */ -static int load_header(struct snapshot_handle *handle, - struct swsusp_info *info) +static int +load_header(struct swsusp_info *info) { int error; - struct pbe *pblist; + restore_pblist = NULL; error = check_header(info); if (!error) { - pblist = alloc_pagedir(info->image_pages, GFP_ATOMIC, PG_ANY); - if (!pblist) - return -ENOMEM; - restore_pblist = pblist; - handle->pbe = pblist; nr_copy_pages = info->image_pages; nr_meta_pages = info->pages - info->image_pages - 1; } @@ -1181,108 +1084,137 @@ static int load_header(struct snapshot_handle *handle, } /** - * unpack_orig_addresses - copy the elements of @buf[] (1 page) to - * the PBEs in the list starting at @pbe + * unpack_orig_pfns - for each element of @buf[] (1 page at a time) set + * the corresponding bit in the memory bitmap @bm */ -static inline struct pbe *unpack_orig_addresses(unsigned long *buf, - struct pbe *pbe) +static inline void +unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm) { int j; - for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) { - pbe->orig_address = buf[j]; - pbe = pbe->next; + for (j = 0; j < PAGE_SIZE / sizeof(long); j++) { + if (unlikely(buf[j] == BM_END_OF_MAP)) + break; + + memory_bm_set_bit(bm, buf[j]); } - return pbe; } /** - * prepare_image - use metadata contained in the PBE list - * pointed to by restore_pblist to mark the pages that will - * be overwritten in the process of restoring the system - * memory state from the image ("unsafe" pages) and allocate - * memory for the image + * prepare_image - use the memory bitmap @bm to mark the pages that will + * be overwritten in the process of restoring the system memory state + * from the suspend image ("unsafe" pages) and allocate memory for the + * image. * - * The idea is to allocate the PBE list first and then - * allocate as many pages as it's needed for the image data, - * but not to assign these pages to the PBEs initially. - * Instead, we just mark them as allocated and create a list - * of "safe" which will be used later + * The idea is to allocate a new memory bitmap first and then allocate + * as many pages as needed for the image data, but not to assign these + * pages to specific tasks initially. Instead, we just mark them as + * allocated and create a list of "safe" pages that will be used later. */ -static struct linked_page *safe_pages; +#define PBES_PER_LINKED_PAGE (LINKED_PAGE_DATA_SIZE / sizeof(struct pbe)) + +static struct linked_page *safe_pages_list; -static int prepare_image(struct snapshot_handle *handle) +static int +prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) { - int error = 0; - unsigned int nr_pages = nr_copy_pages; - struct pbe *p, *pblist = NULL; + unsigned int nr_pages; + struct linked_page *sp_list, *lp; + int error; - p = restore_pblist; - error = mark_unsafe_pages(p); - if (!error) { - pblist = alloc_pagedir(nr_pages, GFP_ATOMIC, PG_SAFE); - if (pblist) - copy_page_backup_list(pblist, p); - free_pagedir(p, PG_UNSAFE_KEEP); - if (!pblist) + error = mark_unsafe_pages(bm); + if (error) + goto Free; + + error = memory_bm_create(new_bm, GFP_ATOMIC, PG_SAFE); + if (error) + goto Free; + + duplicate_memory_bitmap(new_bm, bm); + memory_bm_free(bm, PG_UNSAFE_KEEP); + /* Reserve some safe pages for potential later use. + * + * NOTE: This way we make sure there will be enough safe pages for the + * chain_alloc() in get_buffer(). It is a bit wasteful, but + * nr_copy_pages cannot be greater than 50% of the memory anyway. + */ + sp_list = NULL; + /* nr_copy_pages cannot be lesser than allocated_unsafe_pages */ + nr_pages = nr_copy_pages - allocated_unsafe_pages; + nr_pages = DIV_ROUND_UP(nr_pages, PBES_PER_LINKED_PAGE); + while (nr_pages > 0) { + lp = alloc_image_page(GFP_ATOMIC, PG_SAFE); + if (!lp) { error = -ENOMEM; + goto Free; + } + lp->next = sp_list; + sp_list = lp; + nr_pages--; } - safe_pages = NULL; - if (!error && nr_pages > unsafe_pages) { - nr_pages -= unsafe_pages; - while (nr_pages--) { - struct linked_page *ptr; - - ptr = (void *)get_zeroed_page(GFP_ATOMIC); - if (!ptr) { - error = -ENOMEM; - break; - } - if (!PageNosaveFree(virt_to_page(ptr))) { - /* The page is "safe", add it to the list */ - ptr->next = safe_pages; - safe_pages = ptr; - } - /* Mark the page as allocated */ - SetPageNosave(virt_to_page(ptr)); - SetPageNosaveFree(virt_to_page(ptr)); + /* Preallocate memory for the image */ + safe_pages_list = NULL; + nr_pages = nr_copy_pages - allocated_unsafe_pages; + while (nr_pages > 0) { + lp = (struct linked_page *)get_zeroed_page(GFP_ATOMIC); + if (!lp) { + error = -ENOMEM; + goto Free; + } + if (!PageNosaveFree(virt_to_page(lp))) { + /* The page is "safe", add it to the list */ + lp->next = safe_pages_list; + safe_pages_list = lp; } + /* Mark the page as allocated */ + SetPageNosave(virt_to_page(lp)); + SetPageNosaveFree(virt_to_page(lp)); + nr_pages--; } - if (!error) { - restore_pblist = pblist; - } else { - handle->pbe = NULL; - swsusp_free(); + /* Free the reserved safe pages so that chain_alloc() can use them */ + while (sp_list) { + lp = sp_list->next; + free_image_page(sp_list, PG_UNSAFE_CLEAR); + sp_list = lp; } + return 0; + +Free: + swsusp_free(); return error; } -static void *get_buffer(struct snapshot_handle *handle) +/** + * get_buffer - compute the address that snapshot_write_next() should + * set for its caller to write to. + */ + +static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) { - struct pbe *pbe = handle->pbe, *last = handle->last_pbe; - struct page *page = virt_to_page(pbe->orig_address); + struct pbe *pbe; + struct page *page = pfn_to_page(memory_bm_next_pfn(bm)); - if (PageNosave(page) && PageNosaveFree(page)) { - /* - * We have allocated the "original" page frame and we can - * use it directly to store the read page + if (PageNosave(page) && PageNosaveFree(page)) + /* We have allocated the "original" page frame and we can + * use it directly to store the loaded page. */ - pbe->address = 0; - if (last && last->next) - last->next = NULL; - return (void *)pbe->orig_address; - } - /* - * The "original" page frame has not been allocated and we have to - * use a "safe" page frame to store the read page + return page_address(page); + + /* The "original" page frame has not been allocated and we have to + * use a "safe" page frame to store the loaded page. */ - pbe->address = (unsigned long)safe_pages; - safe_pages = safe_pages->next; - if (last) - last->next = pbe; - handle->last_pbe = pbe; + pbe = chain_alloc(ca, sizeof(struct pbe)); + if (!pbe) { + swsusp_free(); + return NULL; + } + pbe->orig_address = (unsigned long)page_address(page); + pbe->address = (unsigned long)safe_pages_list; + safe_pages_list = safe_pages_list->next; + pbe->next = restore_pblist; + restore_pblist = pbe; return (void *)pbe->address; } @@ -1310,10 +1242,13 @@ static void *get_buffer(struct snapshot_handle *handle) int snapshot_write_next(struct snapshot_handle *handle, size_t count) { + static struct chain_allocator ca; int error = 0; + /* Check if we have already loaded the entire image */ if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) return 0; + if (!buffer) { /* This makes the buffer be freed by swsusp_free() */ buffer = alloc_image_page(GFP_ATOMIC, PG_ANY); @@ -1324,26 +1259,32 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count) handle->buffer = buffer; handle->sync_read = 1; if (handle->prev < handle->cur) { - if (!handle->prev) { - error = load_header(handle, - (struct swsusp_info *)buffer); + if (handle->prev == 0) { + error = load_header(buffer); + if (error) + return error; + + error = memory_bm_create(©_bm, GFP_ATOMIC, PG_ANY); if (error) return error; + } else if (handle->prev <= nr_meta_pages) { - handle->pbe = unpack_orig_addresses(buffer, - handle->pbe); - if (!handle->pbe) { - error = prepare_image(handle); + unpack_orig_pfns(buffer, ©_bm); + if (handle->prev == nr_meta_pages) { + error = prepare_image(&orig_bm, ©_bm); if (error) return error; - handle->pbe = restore_pblist; - handle->last_pbe = NULL; - handle->buffer = get_buffer(handle); + + chain_init(&ca, GFP_ATOMIC, PG_SAFE); + memory_bm_position_reset(&orig_bm); + restore_pblist = NULL; + handle->buffer = get_buffer(&orig_bm, &ca); handle->sync_read = 0; + if (!handle->buffer) + return -ENOMEM; } } else { - handle->pbe = handle->pbe->next; - handle->buffer = get_buffer(handle); + handle->buffer = get_buffer(&orig_bm, &ca); handle->sync_read = 0; } handle->prev = handle->cur; @@ -1362,6 +1303,13 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count) int snapshot_image_loaded(struct snapshot_handle *handle) { - return !(!handle->pbe || handle->pbe->next || !nr_copy_pages || - handle->cur <= nr_meta_pages + nr_copy_pages); + return !(!nr_copy_pages || + handle->cur <= nr_meta_pages + nr_copy_pages); +} + +void snapshot_free_unused_memory(struct snapshot_handle *handle) +{ + /* Free only if we have loaded the image entirely */ + if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) + memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR); } diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 8309d20b2563..9b2ee5344dee 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -331,8 +331,7 @@ static int enough_swap(unsigned int nr_pages) unsigned int free_swap = count_swap_pages(root_swap, 1); pr_debug("swsusp: free swap pages: %u\n", free_swap); - return free_swap > (nr_pages + PAGES_FOR_IO + - (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE); + return free_swap > nr_pages + PAGES_FOR_IO; } /** @@ -547,6 +546,7 @@ static int load_image(struct swap_map_handle *handle, error = err2; if (!error) { printk("\b\b\b\bdone\n"); + snapshot_free_unused_memory(snapshot); if (!snapshot_image_loaded(snapshot)) error = -ENODATA; } diff --git a/kernel/power/user.c b/kernel/power/user.c index 0ef5e4ba39e5..2e4499f3e4d9 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -193,6 +193,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, error = -EPERM; break; } + snapshot_free_unused_memory(&data->handle); down(&pm_sem); pm_prepare_console(); error = device_suspend(PMSG_FREEZE); -- cgit v1.2.2 From c8eb8b4025175f967af0ba8e933f23aa9954dc35 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 25 Sep 2006 23:32:56 -0700 Subject: [PATCH] PM: make it possible to disable console suspending Change suspend_console() so that it waits for all consoles to flush the remaining messages and make it possible to switch the console suspending off with the help of a Kconfig option. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Cc: Stefan Seyfried Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/Kconfig | 11 +++++++++++ kernel/printk.c | 3 +++ 2 files changed, 14 insertions(+) (limited to 'kernel') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 619ecabf7c58..4b6e2f18e056 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -36,6 +36,17 @@ config PM_DEBUG code. This is helpful when debugging and reporting various PM bugs, like suspend support. +config DISABLE_CONSOLE_SUSPEND + bool "Keep console(s) enabled during suspend/resume (DANGEROUS)" + depends on PM && PM_DEBUG + default n + ---help--- + This option turns off the console suspend mechanism that prevents + debug messages from reaching the console during the suspend/resume + operations. This may be helpful when debugging device drivers' + suspend/resume routines, but may itself lead to problems, for example + if netconsole is used. + config PM_TRACE bool "Suspend/resume event tracing" depends on PM && PM_DEBUG && X86_32 && EXPERIMENTAL diff --git a/kernel/printk.c b/kernel/printk.c index 1149365e989e..771f5e861bcd 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -721,6 +721,7 @@ int __init add_preferred_console(char *name, int idx, char *options) return 0; } +#ifndef CONFIG_DISABLE_CONSOLE_SUSPEND /** * suspend_console - suspend the console subsystem * @@ -728,6 +729,7 @@ int __init add_preferred_console(char *name, int idx, char *options) */ void suspend_console(void) { + printk("Suspending console(s)\n"); acquire_console_sem(); console_suspended = 1; } @@ -737,6 +739,7 @@ void resume_console(void) console_suspended = 0; release_console_sem(); } +#endif /* CONFIG_DISABLE_CONSOLE_SUSPEND */ /** * acquire_console_sem - lock the console system for exclusive use. -- cgit v1.2.2 From c5c6ba4e08ab9c9e390a0f3a7d9a5c332f5cc6ef Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 25 Sep 2006 23:32:58 -0700 Subject: [PATCH] PM: Add pm_trace switch Add the pm_trace attribute in /sys/power which has to be explicitly set to one to really enable the "PM tracing" code compiled in when CONFIG_PM_TRACE is set (which modifies the machine's CMOS clock in unpredictable ways). Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/main.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index 4d403323a7bb..873228c71dab 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -17,6 +17,7 @@ #include #include #include +#include #include "power.h" @@ -281,10 +282,39 @@ static ssize_t state_store(struct subsystem * subsys, const char * buf, size_t n power_attr(state); +#ifdef CONFIG_PM_TRACE +int pm_trace_enabled; + +static ssize_t pm_trace_show(struct subsystem * subsys, char * buf) +{ + return sprintf(buf, "%d\n", pm_trace_enabled); +} + +static ssize_t +pm_trace_store(struct subsystem * subsys, const char * buf, size_t n) +{ + int val; + + if (sscanf(buf, "%d", &val) == 1) { + pm_trace_enabled = !!val; + return n; + } + return -EINVAL; +} + +power_attr(pm_trace); + +static struct attribute * g[] = { + &state_attr.attr, + &pm_trace_attr.attr, + NULL, +}; +#else static struct attribute * g[] = { &state_attr.attr, NULL, }; +#endif /* CONFIG_PM_TRACE */ static struct attribute_group attr_group = { .attrs = g, -- cgit v1.2.2