aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Björn B. Brandenburg <bbb@cs.unc.edu> 2010-10-12 17:52:30 -0400
committer: Björn B. Brandenburg <bbb@cs.unc.edu> 2010-10-12 17:52:51 -0400
commit: 3da42149996ac964b2f6fd6a6673e1b59522a5c1 (patch)
tree: c8f1867a8bfc09c781f64e5313cfa72fc30eed4b
parent: 55d915be53a20c416d8d83c55889c29e9c788a7c (diff)
add tool for merging multiple data files into one
-rwxr-xr-x merge.py 150
1 files changed, 150 insertions, 0 deletions
diff --git a/merge.py b/merge.py
new file mode 100755
index 0000000..153c0c2
--- /dev/null
+++ b/merge.py
@@ -0,0 +1,150 @@
1#!/usr/bin/env python
2
3import defapp
4from optparse import make_option as o
5from os.path import splitext, basename, split
6
7import os
8
9import sys
10
11from util import load_csv_file
12
13from wsched import split_rows, group, filter_comments, print_row
14
15
def data_from(fname, keycol):
    """Load the CSV file *fname* and group its rows by column *keycol*.

    The file is read with ``load_csv_file`` and the result is passed to
    ``group`` together with *keycol*; whatever ``group`` produces is returned
    unchanged.  The callers in this module iterate over the result as
    ``(key, rows)`` pairs.
    """
    return group(load_csv_file(fname), keycol)
19
def merged_data(fnames, keycol, replace=True):
    """Yield the rows of all files in *fnames*, merged by their key column.

    Each file is loaded and grouped with ``data_from(fname, keycol)``, and
    the files are processed in the order given.

    fnames  -- iterable of file names to merge.
    keycol  -- column index passed through to ``data_from``.
    replace -- if true (the default), the rows a later file provides for a
               key replace the rows stored for that key from earlier files;
               if false, they are appended to the rows already stored.

    Rows are yielded key by key in sorted key order.  A file that raises
    IOError is reported on stderr and skipped; keys already stored from it
    before the error are kept.

    This is a generator, so nothing is read until it is iterated.
    """
    all_data = {}
    for fname in fnames:
        try:
            for (key, rows) in data_from(fname, keycol):
                if key in all_data and not replace:
                    all_data[key].extend(rows)
                else:
                    all_data[key] = rows
        except IOError as ioe:
            # Write to stderr directly instead of using the Python-2-only
            # ``print >>`` statement, so this handler also works on Python 3.
            sys.stderr.write("[!!] Skipping %s; %s.\n" % (fname, str(ioe)))
    for key in sorted(all_data):
        for row in all_data[key]:
            yield row
34
def substituted_data(origname, files, keycol):
    """Yield the rows of *origname* with selected columns replaced.

    origname -- file providing the base rows; it is loaded with
                ``load_csv_file`` and split by key with ``split_rows``.
    files    -- iterable of ``(idx, fname)`` pairs: column ``idx`` of the
                base rows is overwritten with column ``idx`` of the rows in
                ``fname``.
    keycol   -- column index used as the key for both the base file and the
                substitution files.

    For every key of a substitution file that also occurs in the base file,
    rows are paired positionally; surplus rows on either side are left
    untouched or ignored (``zip`` stops at the shorter sequence).  Keys that
    occur only in the substitution file are ignored.

    A substitution file that raises IOError is reported on stderr and
    skipped.  An IOError while loading *origname* is not caught here.

    Rows are yielded in the key order returned by ``split_rows``.  This is a
    generator, so nothing is read until it is iterated.
    """
    data = load_csv_file(origname)
    (by_key, order) = split_rows(data, keycol)
    for (idx, fname) in files:
        try:
            for (key, rows) in data_from(fname, keycol):
                if key in by_key:
                    for orow, nrow in zip(by_key[key], rows):
                        # substitute
                        orow[idx] = nrow[idx]
        except IOError as ioe:
            # Write to stderr directly instead of using the Python-2-only
            # ``print >>`` statement, so this handler also works on Python 3.
            sys.stderr.write("[!!] Skipping %s; %s.\n" % (fname, str(ioe)))
    for key in order:
        for row in by_key[key]:
            yield row
50
51
# Command-line options of the merge tool.  Every option carries a help
# string so that the usage text generated by optparse explains the switch.
options = [
    # output options
    o('-o', '--output-prefix', action='store', dest='prefix',
      help='prefix prepended to the name of every output file (required)'),
    o('-k', '--key-column', action='store', type='int', dest='keycol',
      help='index of the column whose value is used as the row key'),
    o(None, '--no-comments', action='store_true', dest='no_comments',
      help='omit the comments of the input files from the output'),
    o('-w', '--col-width', action='store', type='int', dest='colwidth',
      help='field width used when formatting values'),
    o('-d', '--dir', action='append', dest='merge_dirs',
      help='additional directory holding a file with the same name as the '
           'input file; may be given several times'),
    o('-s', '--sub', action='append', type='int', dest='sub_col',
      help='switch to column substitution: column to take from the file in '
           'the matching -d directory (N-th -s pairs with N-th -d)'),
    ]

# Default values for the option destinations above.  'precision' and
# 'indent' have no command-line switch in this file: 'precision' is read when
# the number format is built, 'indent' is not read anywhere in this file.
defaults = {
    'prefix'      : None,
    'keycol'      : 0,
    'no_comments' : False,
    'colwidth'    : 10,
    'precision'   : 3,
    'indent'      : 2,
    'merge_dirs'  : [],
    'sub_col'     : [],
    }
72
73
class DataFileMerger(defapp.App):
    """Command-line tool that combines same-named data files.

    For every input file given on the command line, files with the same base
    name are looked up in the directories given with ``-d`` and combined with
    it.  The result is written to ``<prefix><basename>``.  Two modes exist:

    * merge (default): the rows of all files are merged by key
      (see ``merged_data``);
    * substitution (when ``-s`` is given): selected columns of the input
      file are replaced by columns of the other files
      (see ``substituted_data``).

    ``self.options`` and ``self.args`` are expected to be provided by
    ``defapp.App`` -- NOTE(review): confirm against defapp.
    """

    def __init__(self):
        defapp.App.__init__(self, options, defaults, no_std_opts=True)
        self.out = sys.stdout

    def _copy_comments(self, files, out):
        """Run ``filter_comments`` on each of *files*, writing to *out*.

        Does nothing when ``--no-comments`` was given.
        """
        if not self.options.no_comments:
            for src in files:
                filter_comments(src, out=out)

    def merge_file(self, bname, dirs, fout):
        """Merge ``bname`` from every directory in *dirs* into file *fout*.

        Raises IOError if *fout* cannot be opened or if the comment pass
        fails.  The output file is closed even when an error occurs.
        """
        files = [os.path.join(d, bname) for d in dirs]
        # 'with' guarantees the handle is released on the IOError paths that
        # merge() catches; the original left the file open in that case.
        with open(fout, 'w') as out:
            self._copy_comments(files, out)
            for r in merged_data(files, self.options.keycol):
                print_row(r, out, self.fmt)

    def sub_column(self, bname, dirs, fout):
        """Write ``dirs[0]/bname`` to *fout* with substituted columns.

        The N-th column index given with ``-s`` is paired with the file in
        the N-th directory after the first one; surplus indices or
        directories are ignored.

        Raises IOError if *fout* cannot be opened, if the comment pass
        fails, or if the base file cannot be loaded.  The output file is
        closed even when an error occurs.
        """
        files = [os.path.join(d, bname) for d in dirs]
        with open(fout, 'w') as out:
            self._copy_comments(files, out)
            # list(...) so the pairs print and iterate identically on
            # Python 2 and Python 3 (where zip returns a one-shot iterator).
            srcs = list(zip(self.options.sub_col, files[1:]))
            # NOTE(review): looks like a progress/debug trace; kept as is.
            print("%s <- %s + %s" % (fout, files[0], srcs))
            for r in substituted_data(files[0], srcs,
                                      self.options.keycol):
                print_row(r, out, self.fmt)

    def target_name(self, fname, extra=None):
        """Return ``(output file, base name, directories)`` for *fname*.

        fname -- path of an input file.
        extra -- optional string inserted between the base name and its
                 extension.

        The output file name is the output prefix followed by the (possibly
        extended) base name.  The directory list starts with the directory
        of *fname*, followed by all ``-d`` directories.  The output prefix
        must be set; callers check this before calling.
        """
        path, bname = os.path.split(fname)
        if extra:
            bname, ext = os.path.splitext(bname)
            bname = "%s%s%s" % (bname, extra, ext)
        return (self.options.prefix + bname, bname,
                [path] + self.options.merge_dirs)

    def _process_inputs(self, label, handler):
        """Apply *handler* to every input file, reporting progress.

        label   -- verb shown in the progress line.
        handler -- callable taking ``(bname, dirs, fout)``.

        An IOError raised by *handler* is reported on stderr and processing
        continues with the next input file.  Without an output prefix only a
        hint is printed.
        """
        if not self.options.prefix:
            print("Requires output prefix (-o).")
            return
        total = len(self.args)
        for i, datafile in enumerate(self.args):
            (fout, bname, dirs) = self.target_name(datafile)
            print("[%d/%d] %s {%s}/%s -> %s" %
                  (i + 1, total, label, ",".join(dirs), bname, fout))
            try:
                handler(bname, dirs, fout)
            except IOError as ioe:
                sys.stderr.write("[!!] %s\n" % str(ioe))

    def merge(self):
        """Merge every input file with its counterparts in the -d dirs."""
        self._process_inputs("Merging", self.merge_file)

    def sub(self):
        """Substitute columns in every input file from the -d dirs."""
        self._process_inputs("Column-merging", self.sub_column)

    def default(self, _):
        """Entry point: build the number format and pick the mode.

        Substitution mode is used when at least one ``-s`` was given,
        otherwise the files are merged.
        """
        # e.g. colwidth=10, precision=3 gives "%10.3f"
        self.fmt = "%%%d.%df" % (self.options.colwidth,
                                 self.options.precision)
        # NOTE(review): prints the -s list on every run (looks like a
        # leftover debug trace); kept to preserve the tool's output.
        print(self.options.sub_col)
        if self.options.sub_col:
            self.sub()
        else:
            self.merge()
147
if __name__ == '__main__':
    # Run the tool only when executed as a script, not when imported.
    app = DataFileMerger()
    app.launch()
150