aboutsummaryrefslogtreecommitdiffstats
path: root/tools/perf/scripts/python/sched-migration.py
blob: 74d55ec08aed5ec27867b1d74682a5a0bb320748 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
#!/usr/bin/python
#
# Cpu task migration overview toy
#
# Copyright (C) 2010 Frederic Weisbecker <fweisbec@gmail.com>
#
# perf script event handlers have been generated by perf script -g python
#
# This software is distributed under the terms of the GNU General
# Public License ("GPL") version 2 as published by the Free Software
# Foundation.


import os
import sys

from collections import defaultdict
from UserList import UserList

sys.path.append(os.environ['PERF_EXEC_PATH'] + \
	'/scripts/python/Perf-Trace-Util/lib/Perf/Trace')
sys.path.append('scripts/python/Perf-Trace-Util/lib/Perf/Trace')

from perf_trace_context import *
from Core import *
from SchedGui import *


threads = { 0 : "idle"}

def thread_name(pid):
	return "%s:%d" % (threads[pid], pid)

class RunqueueEventUnknown:
	@staticmethod
	def color():
		return None

	def __repr__(self):
		return "unknown"

class RunqueueEventSleep:
	@staticmethod
	def color():
		return (0, 0, 0xff)

	def __init__(self, sleeper):
		self.sleeper = sleeper

	def __repr__(self):
		return "%s gone to sleep" % thread_name(self.sleeper)

class RunqueueEventWakeup:
	@staticmethod
	def color():
		return (0xff, 0xff, 0)

	def __init__(self, wakee):
		self.wakee = wakee

	def __repr__(self):
		return "%s woke up" % thread_name(self.wakee)

class RunqueueEventFork:
	@staticmethod
	def color():
		return (0, 0xff, 0)

	def __init__(self, child):
		self.child = child

	def __repr__(self):
		return "new forked task %s" % thread_name(self.child)

class RunqueueMigrateIn:
	@staticmethod
	def color():
		return (0, 0xf0, 0xff)

	def __init__(self, new):
		self.new = new

	def __repr__(self):
		return "task migrated in %s" % thread_name(self.new)

class RunqueueMigrateOut:
	@staticmethod
	def color():
		return (0xff, 0, 0xff)

	def __init__(self, old):
		self.old = old

	def __repr__(self):
		return "task migrated out %s" % thread_name(self.old)

class RunqueueSnapshot:
	def __init__(self, tasks = [0], event = RunqueueEventUnknown()):
		self.tasks = tuple(tasks)
		self.event = event

	def sched_switch(self, prev, prev_state, next):
		event = RunqueueEventUnknown()

		if taskState(prev_state) == "R" and next in self.tasks \
			and prev in self.tasks:
			return self

		if taskState(prev_state) != "R":
			event = RunqueueEventSleep(prev)

		next_tasks = list(self.tasks[:])
		if prev in self.tasks:
			if taskState(prev_state) != "R":
				next_tasks.remove(prev)
		elif taskState(prev_state) == "R":
			next_tasks.append(prev)

		if next not in next_tasks:
			next_tasks.append(next)

		return RunqueueSnapshot(next_tasks, event)

	def migrate_out(self, old):
		if old not in self.tasks:
			return self
		next_tasks = [task for task in self.tasks if task != old]

		return RunqueueSnapshot(next_tasks, RunqueueMigrateOut(old))

	def __migrate_in(self, new, event):
		if new in self.tasks:
			self.event = event
			return self
		next_tasks = self.tasks[:] + tuple([new])

		return RunqueueSnapshot(next_tasks, event)

	def migrate_in(self, new):
		return self.__migrate_in(new, RunqueueMigrateIn(new))

	def wake_up(self, new):
		return self.__migrate_in(new, RunqueueEventWakeup(new))

	def wake_up_new(self, new):
		return self.__migrate_in(new, RunqueueEventFork(new))

	def load(self):
		""" Provide the number of tasks on the runqueue.
		    Don't count idle"""
		return len(self.tasks) - 1

	def __repr__(self):
		ret = self.tasks.__repr__()
		ret += self.origin_tostring()

		return ret

class TimeSlice:
	def __init__(self, start, prev):
		self.start = start
		self.prev = prev
		self.end = start
		# cpus that triggered the event
		self.event_cpus = []
		if prev is not None:
			self.total_load = prev.total_load
			self.rqs = prev.rqs.copy()
		else:
			self.rqs = defaultdict(RunqueueSnapshot)
			self.total_load = 0

	def __update_total_load(self, old_rq, new_rq):
		diff = new_rq.load() - old_rq.load()
		self.total_load += diff

	def sched_switch(self, ts_list, prev, prev_state, next, cpu):
		old_rq = self.prev.rqs[cpu]
		new_rq = old_rq.sched_switch(prev, prev_state, next)

		if old_rq is new_rq:
			return

		self.rqs[cpu] = new_rq
		self.__update_total_load(old_rq, new_rq)
		ts_list.append(self)
		self.event_cpus = [cpu]

	def migrate(self, ts_list, new, old_cpu, new_cpu):
		if old_cpu == new_cpu:
			return
		old_rq = self.prev.rqs[old_cpu]
		out_rq = old_rq.migrate_out(new)
		self.rqs[old_cpu] = out_rq
		self.__update_total_load(old_rq, out_rq)

		new_rq = self.prev.rqs[new_cpu]
		in_rq = new_rq.migrate_in(new)
		self.rqs[new_cpu] = in_rq
		self.__update_total_load(new_rq, in_rq)

		ts_list.append(self)

		if old_rq is not out_rq:
			self.event_cpus.append(old_cpu)
		self.event_cpus.append(new_cpu)

	def wake_up(self, ts_list, pid, cpu, fork):
		old_rq = self.prev.rqs[cpu]
		if fork:
			new_rq = old_rq.wake_up_new(pid)
		else:
			new_rq = old_rq.wake_up(pid)

		if new_rq is old_rq:
			return
		self.rqs[cpu] = new_rq
		self.__update_total_load(old_rq, new_rq)
		ts_list.append(self)
		self.event_cpus = [cpu]

	def next(self, t):
		self.end = t
		return TimeSlice(t, self)

class TimeSliceList(UserList):
	def __init__(self, arg = []):
		self.data = arg

	def get_time_slice(self, ts):
		if len(self.data) == 0:
			slice = TimeSlice(ts, TimeSlice(-1, None))
		else:
			slice = self.data[-1].next(ts)
		return slice

	def find_time_slice(self, ts):
		start = 0
		end = len(self.data)
		found = -1
		searching = True
		while searching:
			if start == end or start == end - 1:
				searching = False

			i = (end + start) / 2
			if self.data[i].start <= ts and self.data[i].end >= ts:
				found = i
				end = i
				continue

			if self.data[i].end < ts:
				start = i

			elif self.data[i].start > ts:
				end = i

		return found

	def set_root_win(self, win):
		self.root_win = win

	def mouse_down(self, cpu, t):
		idx = self.find_time_slice(t)
		if idx == -1:
			return

		ts = self[idx]
		rq = ts.rqs[cpu]
		raw = "CPU: %d\n" % cpu
		raw += "Last event : %s\n" % rq.event.__repr__()
		raw += "Timestamp : %d.%06d\n" % (ts.start / (10 ** 9), (ts.start % (10 ** 9)) / 1000)
		raw += "Duration : %6d us\n" % ((ts.end - ts.start) / (10 ** 6))
		raw += "Load = %d\n" % rq.load()
		for t in rq.tasks:
			raw += "%s \n" % thread_name(t)

		self.root_win.update_summary(raw)

	def update_rectangle_cpu(self, slice, cpu):
		rq = slice.rqs[cpu]

		if slice.total_load != 0:
			load_rate = rq.load() / float(slice.total_load)
		else:
			load_rate = 0

		red_power = int(0xff - (0xff * load_rate))
		color = (0xff, red_power, red_power)

		top_color = None

		if cpu in slice.event_cpus:
			top_color = rq.event.color()

		self.root_win.paint_rectangle_zone(cpu, color, top_color, slice.start, slice.end)

	def fill_zone(self, start, end):
		i = self.find_time_slice(start)
		if i == -1:
			return

		for i in xrange(i, len(self.data)):
			timeslice = self.data[i]
			if timeslice.start > end:
				return

			for cpu in timeslice.rqs:
				self.update_rectangle_cpu(timeslice, cpu)

	def interval(self):
		if len(self.data) == 0:
			return (0, 0)

		return (self.data[0].start, self.data[-1].end)

	def nr_rectangles(self):
		last_ts = self.data[-1]
		max_cpu = 0
		for cpu in last_ts.rqs:
			if cpu > max_cpu:
				max_cpu = cpu
		return max_cpu


class SchedEventProxy:
	def __init__(self):
		self.current_tsk = defaultdict(lambda : -1)
		self.timeslices = TimeSliceList()

	def sched_switch(self, headers, prev_comm, prev_pid, prev_prio, prev_state,
			 next_comm, next_pid, next_prio):
		""" Ensure the task we sched out this cpu is really the one
		    we logged. Otherwise we may have missed traces """

		on_cpu_task = self.current_tsk[headers.cpu]

		if on_cpu_task != -1 and on_cpu_task != prev_pid:
			print "Sched switch event rejected ts: %s cpu: %d prev: %s(%d) next: %s(%d)" % \
				(headers.ts_format(), headers.cpu, prev_comm, prev_pid, next_comm, next_pid)

		threads[prev_pid] = prev_comm
		threads[next_pid] = next_comm
		self.current_tsk[headers.cpu] = next_pid

		ts = self.timeslices.get_time_slice(headers.ts())
		ts.sched_switch(self.timeslices, prev_pid, prev_state, next_pid, headers.cpu)

	def migrate(self, headers, pid, prio, orig_cpu, dest_cpu):
		ts = self.timeslices.get_time_slice(headers.ts())
		ts.migrate(self.timeslices, pid, orig_cpu, dest_cpu)

	def wake_up(self, headers, comm, pid, success, target_cpu, fork):
		if success == 0:
			return
		ts = self.timeslices.get_time_slice(headers.ts())
		ts.wake_up(self.timeslices, pid, target_cpu, fork)


def trace_begin():
	global parser
	parser = SchedEventProxy()

def trace_end():
	app = wx.App(False)
	timeslices = parser.timeslices
	frame = RootFrame(timeslices, "Migration")
	app.MainLoop()

def sched__sched_stat_runtime(event_name, context, common_cpu,
	common_secs, common_nsecs, common_pid, common_comm,
	comm, pid, runtime, vruntime):
	pass

def sched__sched_stat_iowait(event_name, context, common_cpu,
	common_secs, common_nsecs, common_pid, common_comm,
	comm, pid, delay):
	pass

def sched__sched_stat_sleep(event_name, context, common_cpu,
	common_secs, common_nsecs, common_pid, common_comm,
	comm, pid, delay):
	pass

def sched__sched_stat_wait(event_name, context, common_cpu,
	common_secs, common_nsecs, common_pid, common_comm,
	comm, pid, delay):
	pass

def sched__sched_process_fork(event_name, context, common_cpu,
	common_secs, common_nsecs, common_pid, common_comm,
	parent_comm, parent_pid, child_comm, child_pid):
	pass

def sched__sched_process_wait(event_name, context, common_cpu,
	common_secs, common_nsecs, common_pid, common_comm,
	comm, pid, prio):
	pass

def sched__sched_process_exit(event_name, context, common_cpu,
	common_secs, common_nsecs, common_pid, common_comm,
	comm, pid, prio):
	pass

def sched__sched_process_free(event_name, context, common_cpu,
	common_secs, common_nsecs, common_pid, common_comm,
	comm, pid, prio):
	pass

def sched__sched_migrate_task(event_name, context, common_cpu,
	common_secs, common_nsecs, common_pid, common_comm,
	comm, pid, prio, orig_cpu,
	dest_cpu):
	headers = EventHeaders(common_cpu, common_secs, common_nsecs,
				common_pid, common_comm)
	parser.migrate(headers, pid, prio, orig_cpu, dest_cpu)

def sched__sched_switch(event_name, context, common_cpu,
	common_secs, common_nsecs, common_pid, common_comm,
	prev_comm, prev_pid, prev_prio, prev_state,
	next_comm, next_pid, next_prio):

	headers = EventHeaders(common_cpu, common_secs, common_nsecs,
				common_pid, common_comm)
	parser.sched_switch(headers, prev_comm, prev_pid, prev_prio, prev_state,
			 next_comm, next_pid, next_prio)

def sched__sched_wakeup_new(event_name, context, common_cpu,
	common_secs, common_nsecs, common_pid, common_comm,
	comm, pid, prio, success,
	target_cpu):
	headers = EventHeaders(common_cpu, common_secs, common_nsecs,
				common_pid, common_comm)
	parser.wake_up(headers, comm, pid, success, target_cpu, 1)

def sched__sched_wakeup(event_name, context, common_cpu,
	common_secs, common_nsecs, common_pid, common_comm,
	comm, pid, prio, success,
	target_cpu):
	headers = EventHeaders(common_cpu, common_secs, common_nsecs,
				common_pid, common_comm)
	parser.wake_up(headers, comm, pid, success, target_cpu, 0)

def sched__sched_wait_task(event_name, context, common_cpu,
	common_secs, common_nsecs, common_pid, common_comm,
	comm, pid, prio):
	pass

def sched__sched_kthread_stop_ret(event_name, context, common_cpu,
	common_secs, common_nsecs, common_pid, common_comm,
	ret):
	pass

def sched__sched_kthread_stop(event_name, context, common_cpu,
	common_secs, common_nsecs, common_pid, common_comm,
	comm, pid):
	pass

def trace_unhandled(event_name, context, common_cpu, common_secs, common_nsecs,
		common_pid, common_comm):
	pass
struct buffer_head *cbh = NULL; /* For transactional checksums */ __u32 crc32_sum = ~0; /* * First job: lock down the current transaction and wait for * all outstanding updates to complete. */ #ifdef COMMIT_STATS spin_lock(&journal->j_list_lock); summarise_journal_usage(journal); spin_unlock(&journal->j_list_lock); #endif /* Do we need to erase the effects of a prior jbd2_journal_flush? */ if (journal->j_flags & JBD2_FLUSHED) { jbd_debug(3, "super block updated\n"); jbd2_journal_update_superblock(journal, 1); } else { jbd_debug(3, "superblock not updated\n"); } J_ASSERT(journal->j_running_transaction != NULL); J_ASSERT(journal->j_committing_transaction == NULL); commit_transaction = journal->j_running_transaction; J_ASSERT(commit_transaction->t_state == T_RUNNING); trace_mark(jbd2_start_commit, "dev %s transaction %d", journal->j_devname, commit_transaction->t_tid); jbd_debug(1, "JBD: starting commit of transaction %d\n", commit_transaction->t_tid); spin_lock(&journal->j_state_lock); commit_transaction->t_state = T_LOCKED; stats.u.run.rs_wait = commit_transaction->t_max_wait; stats.u.run.rs_locked = jiffies; stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start, stats.u.run.rs_locked); spin_lock(&commit_transaction->t_handle_lock); while (commit_transaction->t_updates) { DEFINE_WAIT(wait); prepare_to_wait(&journal->j_wait_updates, &wait, TASK_UNINTERRUPTIBLE); if (commit_transaction->t_updates) { spin_unlock(&commit_transaction->t_handle_lock); spin_unlock(&journal->j_state_lock); schedule(); spin_lock(&journal->j_state_lock); spin_lock(&commit_transaction->t_handle_lock); } finish_wait(&journal->j_wait_updates, &wait); } spin_unlock(&commit_transaction->t_handle_lock); J_ASSERT (commit_transaction->t_outstanding_credits <= journal->j_max_transaction_buffers); /* * First thing we are allowed to do is to discard any remaining * BJ_Reserved buffers. Note, it is _not_ permissible to assume * that there are no such buffers: if a large filesystem * operation like a truncate needs to split itself over multiple * transactions, then it may try to do a jbd2_journal_restart() while * there are still BJ_Reserved buffers outstanding. These must * be released cleanly from the current transaction. * * In this case, the filesystem must still reserve write access * again before modifying the buffer in the new transaction, but * we do not require it to remember exactly which old buffers it * has reserved. This is consistent with the existing behaviour * that multiple jbd2_journal_get_write_access() calls to the same * buffer are perfectly permissable. */ while (commit_transaction->t_reserved_list) { jh = commit_transaction->t_reserved_list; JBUFFER_TRACE(jh, "reserved, unused: refile"); /* * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may * leave undo-committed data. */ if (jh->b_committed_data) { struct buffer_head *bh = jh2bh(jh); jbd_lock_bh_state(bh); jbd2_free(jh->b_committed_data, bh->b_size); jh->b_committed_data = NULL; jbd_unlock_bh_state(bh); } jbd2_journal_refile_buffer(journal, jh); } /* * Now try to drop any written-back buffers from the journal's * checkpoint lists. We do this *before* commit because it potentially * frees some memory */ spin_lock(&journal->j_list_lock); __jbd2_journal_clean_checkpoint_list(journal); spin_unlock(&journal->j_list_lock); jbd_debug (3, "JBD: commit phase 1\n"); /* * Switch to a new revoke table. */ jbd2_journal_switch_revoke_table(journal); stats.u.run.rs_flushing = jiffies; stats.u.run.rs_locked = jbd2_time_diff(stats.u.run.rs_locked, stats.u.run.rs_flushing); commit_transaction->t_state = T_FLUSH; journal->j_committing_transaction = commit_transaction; journal->j_running_transaction = NULL; start_time = ktime_get(); commit_transaction->t_log_start = journal->j_head; wake_up(&journal->j_wait_transaction_locked); spin_unlock(&journal->j_state_lock); jbd_debug (3, "JBD: commit phase 2\n"); /* * Now start flushing things to disk, in the order they appear * on the transaction lists. Data blocks go first. */ err = journal_submit_data_buffers(journal, commit_transaction); if (err) jbd2_journal_abort(journal, err); jbd2_journal_write_revoke_records(journal, commit_transaction); jbd_debug(3, "JBD: commit phase 2\n"); /* * Way to go: we have now written out all of the data for a * transaction! Now comes the tricky part: we need to write out * metadata. Loop over the transaction's entire buffer list: */ spin_lock(&journal->j_state_lock); commit_transaction->t_state = T_COMMIT; spin_unlock(&journal->j_state_lock); stats.u.run.rs_logging = jiffies; stats.u.run.rs_flushing = jbd2_time_diff(stats.u.run.rs_flushing, stats.u.run.rs_logging); stats.u.run.rs_blocks = commit_transaction->t_outstanding_credits; stats.u.run.rs_blocks_logged = 0; J_ASSERT(commit_transaction->t_nr_buffers <= commit_transaction->t_outstanding_credits); err = 0; descriptor = NULL; bufs = 0; while (commit_transaction->t_buffers) { /* Find the next buffer to be journaled... */ jh = commit_transaction->t_buffers; /* If we're in abort mode, we just un-journal the buffer and release it. */ if (is_journal_aborted(journal)) { clear_buffer_jbddirty(jh2bh(jh)); JBUFFER_TRACE(jh, "journal is aborting: refile"); jbd2_buffer_abort_trigger(jh, jh->b_frozen_data ? jh->b_frozen_triggers : jh->b_triggers); jbd2_journal_refile_buffer(journal, jh); /* If that was the last one, we need to clean up * any descriptor buffers which may have been * already allocated, even if we are now * aborting. */ if (!commit_transaction->t_buffers) goto start_journal_io; continue; } /* Make sure we have a descriptor block in which to record the metadata buffer. */ if (!descriptor) { struct buffer_head *bh; J_ASSERT (bufs == 0); jbd_debug(4, "JBD: get descriptor\n"); descriptor = jbd2_journal_get_descriptor_buffer(journal); if (!descriptor) { jbd2_journal_abort(journal, -EIO); continue; } bh = jh2bh(descriptor); jbd_debug(4, "JBD: got buffer %llu (%p)\n", (unsigned long long)bh->b_blocknr, bh->b_data); header = (journal_header_t *)&bh->b_data[0]; header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK); header->h_sequence = cpu_to_be32(commit_transaction->t_tid); tagp = &bh->b_data[sizeof(journal_header_t)]; space_left = bh->b_size - sizeof(journal_header_t); first_tag = 1; set_buffer_jwrite(bh); set_buffer_dirty(bh); wbuf[bufs++] = bh; /* Record it so that we can wait for IO completion later */ BUFFER_TRACE(bh, "ph3: file as descriptor"); jbd2_journal_file_buffer(descriptor, commit_transaction, BJ_LogCtl); } /* Where is the buffer to be written? */ err = jbd2_journal_next_log_block(journal, &blocknr); /* If the block mapping failed, just abandon the buffer and repeat this loop: we'll fall into the refile-on-abort condition above. */ if (err) { jbd2_journal_abort(journal, err); continue; } /* * start_this_handle() uses t_outstanding_credits to determine * the free space in the log, but this counter is changed * by jbd2_journal_next_log_block() also. */ commit_transaction->t_outstanding_credits--; /* Bump b_count to prevent truncate from stumbling over the shadowed buffer! @@@ This can go if we ever get rid of the BJ_IO/BJ_Shadow pairing of buffers. */ atomic_inc(&jh2bh(jh)->b_count); /* Make a temporary IO buffer with which to write it out (this will requeue both the metadata buffer and the temporary IO buffer). new_bh goes on BJ_IO*/ set_bit(BH_JWrite, &jh2bh(jh)->b_state); /* * akpm: jbd2_journal_write_metadata_buffer() sets * new_bh->b_transaction to commit_transaction. * We need to clean this up before we release new_bh * (which is of type BJ_IO) */ JBUFFER_TRACE(jh, "ph3: write metadata"); flags = jbd2_journal_write_metadata_buffer(commit_transaction, jh, &new_jh, blocknr); set_bit(BH_JWrite, &jh2bh(new_jh)->b_state); wbuf[bufs++] = jh2bh(new_jh); /* Record the new block's tag in the current descriptor buffer */ tag_flag = 0; if (flags & 1) tag_flag |= JBD2_FLAG_ESCAPE; if (!first_tag) tag_flag |= JBD2_FLAG_SAME_UUID; tag = (journal_block_tag_t *) tagp; write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr); tag->t_flags = cpu_to_be32(tag_flag); tagp += tag_bytes; space_left -= tag_bytes; if (first_tag) { memcpy (tagp, journal->j_uuid, 16); tagp += 16; space_left -= 16; first_tag = 0; } /* If there's no more to do, or if the descriptor is full, let the IO rip! */ if (bufs == journal->j_wbufsize || commit_transaction->t_buffers == NULL || space_left < tag_bytes + 16) { jbd_debug(4, "JBD: Submit %d IOs\n", bufs); /* Write an end-of-descriptor marker before submitting the IOs. "tag" still points to the last tag we set up. */ tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG); start_journal_io: for (i = 0; i < bufs; i++) { struct buffer_head *bh = wbuf[i]; /* * Compute checksum. */ if (JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM)) { crc32_sum = jbd2_checksum_data(crc32_sum, bh); } lock_buffer(bh); clear_buffer_dirty(bh); set_buffer_uptodate(bh); bh->b_end_io = journal_end_buffer_io_sync; submit_bh(WRITE, bh); } cond_resched(); stats.u.run.rs_blocks_logged += bufs; /* Force a new descriptor to be generated next time round the loop. */ descriptor = NULL; bufs = 0; } } /* Done it all: now write the commit record asynchronously. */ if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { err = journal_submit_commit_record(journal, commit_transaction, &cbh, crc32_sum); if (err) __jbd2_journal_abort_hard(journal); } /* * This is the right place to wait for data buffers both for ASYNC * and !ASYNC commit. If commit is ASYNC, we need to wait only after * the commit block went to disk (which happens above). If commit is * SYNC, we need to wait for data buffers before we start writing * commit block, which happens below in such setting. */ err = journal_finish_inode_data_buffers(journal, commit_transaction); if (err) { printk(KERN_WARNING "JBD2: Detected IO errors while flushing file data " "on %s\n", journal->j_devname); if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR) jbd2_journal_abort(journal, err); err = 0; } /* Lo and behold: we have just managed to send a transaction to the log. Before we can commit it, wait for the IO so far to complete. Control buffers being written are on the transaction's t_log_list queue, and metadata buffers are on the t_iobuf_list queue. Wait for the buffers in reverse order. That way we are less likely to be woken up until all IOs have completed, and so we incur less scheduling load. */ jbd_debug(3, "JBD: commit phase 3\n"); /* * akpm: these are BJ_IO, and j_list_lock is not needed. * See __journal_try_to_free_buffer. */ wait_for_iobuf: while (commit_transaction->t_iobuf_list != NULL) { struct buffer_head *bh; jh = commit_transaction->t_iobuf_list->b_tprev; bh = jh2bh(jh); if (buffer_locked(bh)) { wait_on_buffer(bh); goto wait_for_iobuf; } if (cond_resched()) goto wait_for_iobuf; if (unlikely(!buffer_uptodate(bh))) err = -EIO; clear_buffer_jwrite(bh); JBUFFER_TRACE(jh, "ph4: unfile after journal write"); jbd2_journal_unfile_buffer(journal, jh); /* * ->t_iobuf_list should contain only dummy buffer_heads * which were created by jbd2_journal_write_metadata_buffer(). */ BUFFER_TRACE(bh, "dumping temporary bh"); jbd2_journal_put_journal_head(jh); __brelse(bh); J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0); free_buffer_head(bh); /* We also have to unlock and free the corresponding shadowed buffer */ jh = commit_transaction->t_shadow_list->b_tprev; bh = jh2bh(jh); clear_bit(BH_JWrite, &bh->b_state); J_ASSERT_BH(bh, buffer_jbddirty(bh)); /* The metadata is now released for reuse, but we need to remember it against this transaction so that when we finally commit, we can do any checkpointing required. */ JBUFFER_TRACE(jh, "file as BJ_Forget"); jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget); /* Wake up any transactions which were waiting for this IO to complete */ wake_up_bit(&bh->b_state, BH_Unshadow); JBUFFER_TRACE(jh, "brelse shadowed buffer"); __brelse(bh); } J_ASSERT (commit_transaction->t_shadow_list == NULL); jbd_debug(3, "JBD: commit phase 4\n"); /* Here we wait for the revoke record and descriptor record buffers */ wait_for_ctlbuf: while (commit_transaction->t_log_list != NULL) { struct buffer_head *bh; jh = commit_transaction->t_log_list->b_tprev; bh = jh2bh(jh); if (buffer_locked(bh)) { wait_on_buffer(bh); goto wait_for_ctlbuf; } if (cond_resched()) goto wait_for_ctlbuf; if (unlikely(!buffer_uptodate(bh))) err = -EIO; BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile"); clear_buffer_jwrite(bh); jbd2_journal_unfile_buffer(journal, jh); jbd2_journal_put_journal_head(jh); __brelse(bh); /* One for getblk */ /* AKPM: bforget here */ } if (err) jbd2_journal_abort(journal, err); jbd_debug(3, "JBD: commit phase 5\n"); if (!JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { err = journal_submit_commit_record(journal, commit_transaction, &cbh, crc32_sum); if (err) __jbd2_journal_abort_hard(journal); } if (!err && !is_journal_aborted(journal)) err = journal_wait_on_commit_record(journal, cbh); if (err) jbd2_journal_abort(journal, err); /* End of a transaction! Finally, we can do checkpoint processing: any buffers committed as a result of this transaction can be removed from any checkpoint list it was on before. */ jbd_debug(3, "JBD: commit phase 6\n"); J_ASSERT(list_empty(&commit_transaction->t_inode_list)); J_ASSERT(commit_transaction->t_buffers == NULL); J_ASSERT(commit_transaction->t_checkpoint_list == NULL); J_ASSERT(commit_transaction->t_iobuf_list == NULL); J_ASSERT(commit_transaction->t_shadow_list == NULL); J_ASSERT(commit_transaction->t_log_list == NULL); restart_loop: /* * As there are other places (journal_unmap_buffer()) adding buffers * to this list we have to be careful and hold the j_list_lock. */ spin_lock(&journal->j_list_lock); while (commit_transaction->t_forget) { transaction_t *cp_transaction; struct buffer_head *bh; jh = commit_transaction->t_forget; spin_unlock(&journal->j_list_lock); bh = jh2bh(jh); jbd_lock_bh_state(bh); J_ASSERT_JH(jh, jh->b_transaction == commit_transaction || jh->b_transaction == journal->j_running_transaction); /* * If there is undo-protected committed data against * this buffer, then we can remove it now. If it is a * buffer needing such protection, the old frozen_data * field now points to a committed version of the * buffer, so rotate that field to the new committed * data. * * Otherwise, we can just throw away the frozen data now. * * We also know that the frozen data has already fired * its triggers if they exist, so we can clear that too. */ if (jh->b_committed_data) { jbd2_free(jh->b_committed_data, bh->b_size); jh->b_committed_data = NULL; if (jh->b_frozen_data) { jh->b_committed_data = jh->b_frozen_data; jh->b_frozen_data = NULL; jh->b_frozen_triggers = NULL; } } else if (jh->b_frozen_data) { jbd2_free(jh->b_frozen_data, bh->b_size); jh->b_frozen_data = NULL; jh->b_frozen_triggers = NULL; } spin_lock(&journal->j_list_lock); cp_transaction = jh->b_cp_transaction; if (cp_transaction) { JBUFFER_TRACE(jh, "remove from old cp transaction"); cp_transaction->t_chp_stats.cs_dropped++; __jbd2_journal_remove_checkpoint(jh); } /* Only re-checkpoint the buffer_head if it is marked * dirty. If the buffer was added to the BJ_Forget list * by jbd2_journal_forget, it may no longer be dirty and * there's no point in keeping a checkpoint record for * it. */ /* A buffer which has been freed while still being * journaled by a previous transaction may end up still * being dirty here, but we want to avoid writing back * that buffer in the future now that the last use has * been committed. That's not only a performance gain, * it also stops aliasing problems if the buffer is left * behind for writeback and gets reallocated for another * use in a different page. */ if (buffer_freed(bh)) { clear_buffer_freed(bh); clear_buffer_jbddirty(bh); } if (buffer_jbddirty(bh)) { JBUFFER_TRACE(jh, "add to new checkpointing trans"); __jbd2_journal_insert_checkpoint(jh, commit_transaction); if (is_journal_aborted(journal)) clear_buffer_jbddirty(bh); JBUFFER_TRACE(jh, "refile for checkpoint writeback"); __jbd2_journal_refile_buffer(jh); jbd_unlock_bh_state(bh); } else { J_ASSERT_BH(bh, !buffer_dirty(bh)); /* The buffer on BJ_Forget list and not jbddirty means * it has been freed by this transaction and hence it * could not have been reallocated until this * transaction has committed. *BUT* it could be * reallocated once we have written all the data to * disk and before we process the buffer on BJ_Forget * list. */ JBUFFER_TRACE(jh, "refile or unfile freed buffer"); __jbd2_journal_refile_buffer(jh); if (!jh->b_transaction) { jbd_unlock_bh_state(bh); /* needs a brelse */ jbd2_journal_remove_journal_head(bh); release_buffer_page(bh); } else jbd_unlock_bh_state(bh); } cond_resched_lock(&journal->j_list_lock); } spin_unlock(&journal->j_list_lock); /* * This is a bit sleazy. We use j_list_lock to protect transition * of a transaction into T_FINISHED state and calling * __jbd2_journal_drop_transaction(). Otherwise we could race with * other checkpointing code processing the transaction... */ spin_lock(&journal->j_state_lock); spin_lock(&journal->j_list_lock); /* * Now recheck if some buffers did not get attached to the transaction * while the lock was dropped... */ if (commit_transaction->t_forget) { spin_unlock(&journal->j_list_lock); spin_unlock(&journal->j_state_lock); goto restart_loop; } /* Done with this transaction! */ jbd_debug(3, "JBD: commit phase 7\n"); J_ASSERT(commit_transaction->t_state == T_COMMIT); commit_transaction->t_start = jiffies; stats.u.run.rs_logging = jbd2_time_diff(stats.u.run.rs_logging, commit_transaction->t_start); /* * File the transaction for history */ stats.ts_type = JBD2_STATS_RUN; stats.ts_tid = commit_transaction->t_tid; stats.u.run.rs_handle_count = commit_transaction->t_handle_count; spin_lock(&journal->j_history_lock); memcpy(journal->j_history + journal->j_history_cur, &stats, sizeof(stats)); if (++journal->j_history_cur == journal->j_history_max) journal->j_history_cur = 0; /* * Calculate overall stats */ journal->j_stats.ts_tid++; journal->j_stats.u.run.rs_wait += stats.u.run.rs_wait; journal->j_stats.u.run.rs_running += stats.u.run.rs_running; journal->j_stats.u.run.rs_locked += stats.u.run.rs_locked; journal->j_stats.u.run.rs_flushing += stats.u.run.rs_flushing; journal->j_stats.u.run.rs_logging += stats.u.run.rs_logging; journal->j_stats.u.run.rs_handle_count += stats.u.run.rs_handle_count; journal->j_stats.u.run.rs_blocks += stats.u.run.rs_blocks; journal->j_stats.u.run.rs_blocks_logged += stats.u.run.rs_blocks_logged; spin_unlock(&journal->j_history_lock); commit_transaction->t_state = T_FINISHED; J_ASSERT(commit_transaction == journal->j_committing_transaction); journal->j_commit_sequence = commit_transaction->t_tid; journal->j_committing_transaction = NULL; commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); /* * weight the commit time higher than the average time so we don't * react too strongly to vast changes in the commit time */ if (likely(journal->j_average_commit_time)) journal->j_average_commit_time = (commit_time + journal->j_average_commit_time*3) / 4; else journal->j_average_commit_time = commit_time; spin_unlock(&journal->j_state_lock); if (commit_transaction->t_checkpoint_list == NULL && commit_transaction->t_checkpoint_io_list == NULL) { __jbd2_journal_drop_transaction(journal, commit_transaction); to_free = 1; } else { if (journal->j_checkpoint_transactions == NULL) { journal->j_checkpoint_transactions = commit_transaction; commit_transaction->t_cpnext = commit_transaction; commit_transaction->t_cpprev = commit_transaction; } else { commit_transaction->t_cpnext = journal->j_checkpoint_transactions; commit_transaction->t_cpprev = commit_transaction->t_cpnext->t_cpprev; commit_transaction->t_cpnext->t_cpprev = commit_transaction; commit_transaction->t_cpprev->t_cpnext = commit_transaction; } } spin_unlock(&journal->j_list_lock); if (journal->j_commit_callback) journal->j_commit_callback(journal, commit_transaction); trace_mark(jbd2_end_commit, "dev %s transaction %d head %d", journal->j_devname, commit_transaction->t_tid, journal->j_tail_sequence); jbd_debug(1, "JBD: commit %d complete, head %d\n", journal->j_commit_sequence, journal->j_tail_sequence); if (to_free) kfree(commit_transaction); wake_up(&journal->j_wait_done_commit); }