merge.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150

#!/usr/bin/env python

import defapp
from optparse import make_option as o
from os.path  import splitext, basename, split

import os

import sys

from util import load_csv_file

from wsched import split_rows, group, filter_comments, print_row


def data_from(fname, keycol):
    """Load the CSV file `fname` and group its contents by column `keycol`.

    The result is whatever `group` returns; the callers in this file
    iterate over it as a sequence of ``(key, rows)`` pairs.  Errors raised
    by `load_csv_file` (the callers expect IOError) are not handled here.
    """
    return group(load_csv_file(fname), keycol)

def merged_data(fnames, keycol, replace=True):
    """Yield the rows of all files in `fnames`, merged by key.

    Each file is loaded with `data_from` and its rows are collected per key.
    Files are processed in the given order:

    * with `replace` true (the default), the rows a later file provides
      for a key replace the rows collected from earlier files;
    * with `replace` false, they are appended to the rows already
      collected for that key.

    A file that raises IOError is reported on stderr and skipped; keys
    already taken from it before the error are kept.  Rows are finally
    yielded key by key in sorted key order.

    This is a generator: no file is read until iteration starts.
    """
    all_data = {}
    for fname in fnames:
        try:
            for (key, rows) in data_from(fname, keycol):
                if key in all_data and not replace:
                    all_data[key].extend(rows)
                else:
                    # Store a private copy so that a later extend() never
                    # mutates the object handed out by group(), and so that
                    # any iterable of rows is accepted.
                    all_data[key] = list(rows)
        except IOError as ioe:
            # sys.stderr.write instead of the Python-2-only
            # `print >>sys.stderr`; same text, same stream.
            sys.stderr.write("[!!] Skipping %s; %s.\n" % (fname, str(ioe)))
    for key in sorted(all_data):
        for row in all_data[key]:
            yield row

def substituted_data(origname, files, keycol):
    """Yield the rows of `origname` with selected columns replaced.

    `files` is an iterable of ``(idx, fname)`` pairs.  For every pair, the
    rows of `fname` are grouped by `keycol`; for each key that also occurs
    in `origname`, column `idx` of the original rows is overwritten with
    column `idx` of the corresponding rows of `fname`.  Rows are paired up
    positionally with zip(), so surplus rows on either side are left
    untouched / ignored.  Keys that only occur in `fname` are ignored.

    A substitution file that raises IOError is reported on stderr and
    skipped.  An IOError while loading `origname` itself is not caught.

    Rows are yielded in the key order returned by `split_rows`.  This is a
    generator: nothing is loaded until iteration starts.
    """
    data = load_csv_file(origname)
    (by_key, order) = split_rows(data, keycol)
    for (idx, fname) in files:
        try:
            for (key, rows) in data_from(fname, keycol):
                if key not in by_key:
                    continue
                for orow, nrow in zip(by_key[key], rows):
                    # substitute in place; by_key holds the rows we yield
                    orow[idx] = nrow[idx]
        except IOError as ioe:
            # sys.stderr.write instead of the Python-2-only
            # `print >>sys.stderr`; same text, same stream.
            sys.stderr.write("[!!] Skipping %s; %s.\n" % (fname, str(ioe)))
    for key in order:
        for row in by_key[key]:
            yield row


# Command-line options of DataFileMerger.  Every `dest` used here has a
# fallback value in `defaults`.
options = [
    # Prefix prepended to each output file name; merge()/sub() refuse to
    # run without it.
    o('-o', '--output-prefix', dest='prefix', action='store'),

    # Index of the column that rows are grouped by.
    o('-k', '--key-column', dest='keycol', type='int', action='store'),

    # Skip the filter_comments() pass over the source files.
    o(None, '--no-comments', dest='no_comments', action='store_true'),

    # Field width used in the float format for printed rows.
    o('-w', '--col-width', dest='colwidth', type='int', action='store'),

    # Additional directory to look for equally named files (repeatable).
    o('-d', '--dir', dest='merge_dirs', action='append'),

    # Column index to substitute (repeatable); the i-th -s is paired with
    # the i-th -d directory.  Giving any -s selects substitution mode.
    o('-s', '--sub', dest='sub_col', type='int', action='append'),
]

# Fallback values for the option destinations declared in `options`.
# 'precision' and 'indent' have no command-line switch in this file;
# 'precision' feeds the row format, 'indent' is not read here.
defaults = dict(
    prefix=None,        # no output prefix -> merge()/sub() print an error
    keycol=0,           # group rows by the first column
    no_comments=False,  # run filter_comments on the sources by default
    colwidth=10,
    precision=3,
    indent=2,
    merge_dirs=[],      # extra source directories (-d)
    sub_col=[],         # empty -> plain merge, non-empty -> substitution
)


class DataFileMerger(defapp.App):
    """Merge equally named data files that live in several directories.

    For every data file given on the command line, files with the same
    base name are looked up in the file's own directory and in every
    directory given with -d.  Depending on whether -s was given, the files
    are either merged row-wise by key (`merge`) or selected columns of the
    first file are replaced with columns from the others (`sub`).  The
    result is written to ``<prefix><basename>``.
    """

    def __init__(self):
        defapp.App.__init__(self, options, defaults, no_std_opts=True)
        self.out = sys.stdout

    def _copy_comments(self, files, out):
        """Run filter_comments on every source file unless --no-comments."""
        if self.options.no_comments:
            return
        for src in files:
            filter_comments(src, out=out)

    def _for_each_datafile(self, action, handler):
        """Apply `handler(bname, dirs, fout)` to every command-line file.

        `action` is the verb shown in the progress line.  An IOError raised
        while handling one file is reported on stderr and processing
        continues with the next file.  Without an output prefix nothing is
        processed and a hint is printed instead.
        """
        if not self.options.prefix:
            sys.stdout.write("Requires output prefix (-o).\n")
            return
        total = len(self.args)
        for i, datafile in enumerate(self.args):
            (fout, bname, dirs) = self.target_name(datafile)
            sys.stdout.write("[%d/%d] %s {%s}/%s -> %s\n" %
                             (i + 1, total, action, ",".join(dirs), bname, fout))
            try:
                handler(bname, dirs, fout)
            except IOError as ioe:
                sys.stderr.write("[!!] %s\n" % (str(ioe),))

    def merge_file(self, bname, dirs, fout):
        """Merge ``<dir>/<bname>`` for every dir in `dirs` into `fout`.

        Comments of all sources are passed through first (unless disabled),
        followed by the rows produced by `merged_data`.  Requires
        `self.fmt`, which `default` sets.  Exceptions (e.g. IOError)
        propagate to the caller; the output file is closed in any case.
        """
        # `with` guarantees the handle is released even if a source file
        # cannot be read half-way through.
        with open(fout, 'w') as out:
            files = [os.path.join(d, bname) for d in dirs]
            self._copy_comments(files, out)
            for r in merged_data(files, self.options.keycol):
                print_row(r, out, self.fmt)

    def sub_column(self, bname, dirs, fout):
        """Write ``<dirs[0]>/<bname>`` to `fout` with columns substituted.

        The i-th column index given with -s is taken from the file in the
        i-th additional directory (``dirs[1:]``); surplus indices or
        directories are ignored by zip().  Requires `self.fmt`, which
        `default` sets.  Exceptions propagate to the caller; the output
        file is closed in any case.
        """
        with open(fout, 'w') as out:
            files = [os.path.join(d, bname) for d in dirs]
            self._copy_comments(files, out)
            # Materialize the pairs: they are printed below and must still
            # be iterable afterwards (zip() is lazy on Python 3).
            srcs = list(zip(self.options.sub_col, files[1:]))
            sys.stdout.write("%s <- %s + %s\n" % (fout, files[0], srcs))
            for r in substituted_data(files[0], srcs,
                                      self.options.keycol):
                print_row(r, out, self.fmt)

    def target_name(self, fname, extra=None):
        """Return ``(output_path, base_name, source_dirs)`` for `fname`.

        `output_path` is the output prefix concatenated directly with the
        base name (no path separator is inserted).  `source_dirs` is the
        directory part of `fname` followed by all -d directories.  If
        `extra` is given it is inserted between the base name's stem and
        its extension; note that this changed base name is also the one
        returned, i.e. the one later looked up in the source directories.
        """
        path, bname = os.path.split(fname)
        if extra:
            stem, ext = os.path.splitext(bname)
            bname = "%s%s%s" % (stem, extra, ext)
        return (self.options.prefix + bname, bname,
                [path] + self.options.merge_dirs)

    def merge(self):
        """Row-merge every data file given on the command line."""
        self._for_each_datafile("Merging", self.merge_file)

    def sub(self):
        """Column-substitute every data file given on the command line."""
        self._for_each_datafile("Column-merging", self.sub_column)

    def default(self, _):
        """Entry point: build the row format and pick the operating mode."""
        # printf-style float format, e.g. "%10.3f" with the defaults.
        self.fmt = "%%%d.%df" % (self.options.colwidth, self.options.precision)
        # Echo the requested substitution columns (kept from the original).
        sys.stdout.write("%s\n" % (self.options.sub_col,))
        if self.options.sub_col:
            self.sub()
        else:
            self.merge()

# Script entry point: build the application object and hand control to
# the launch() method it inherits from defapp.App.
if __name__ == '__main__':
    app = DataFileMerger()
    app.launch()