aboutsummaryrefslogtreecommitdiffstats
path: root/csv_tool
blob: 455037c8ff8fff078a2655058a0fd9643e1fdc3d (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#!/usr/bin/env python

"""
Do stuff with csv files.

Command-line tool that groups the rows of each input CSV file by the
value in a key column (selected with -c/--column) and reduces every
group with an aggregate command: avg, min or max.
"""

import optparse
import defapp

import csv
import operator
from collections import defaultdict as defdict

# Shorthand for building optparse Option objects in the opts table below.
o = optparse.make_option

# Option table handed to defapp.App; the commented-out entries are
# templates illustrating other optparse option kinds.
opts = [

#    o('-t', '--two', action='store', dest='double_val',  nargs=2, type='int',
#      help='A two-parameter option.'),

    o('-c', '--column', action='store', dest='col', type='int',
      help='The column on which to operate.'),

#     o(None, '--true', action='store_true', dest='truth',
#       help='A boolean flag value.'),

#     o(None, '--degree', action='store', type='float', dest='thruthiness',
#       help='Not quite absolut truth.'),
    ]

# Defaults for absent options: operate on the first column.
defaults = {
    'col'          : 0,
    }

def make_vector_op(op):
    """Lift a binary scalar function *op* to an element-wise row operation.

    The returned vector_op(a, b, defvalue=0) combines two rows element by
    element.  When the rows differ in length, the missing elements of the
    shorter row are taken to be *defvalue*; the result always has the
    length of the longer row.  *op* is applied as op(longer_elem,
    shorter_elem).
    """
    def vector_op(a, b, defvalue=0):
        if len(a) > len(b):
            shorter = b
            longer  = a
        else:
            shorter = a
            longer  = b
        # Start from a copy of the longer row, then fold in the shorter one.
        c = list(longer)
        for i, x in enumerate(shorter):
            c[i] = op(longer[i], x)
        # Tail positions pair the longer row with the default value.
        # (range instead of Python-2-only xrange: identical behavior, and
        # the function becomes version-portable; rows are small.)
        for i in range(len(shorter), len(longer)):
            c[i] = op(longer[i], defvalue)
        return c
    return vector_op

def make_scalar_op(op):
    """Lift binary *op* into a row operation against a fixed scalar.

    The returned scalar_op(scalar, a) applies op(element, scalar) to every
    element of row *a* and returns the results as a new list.
    """
    def scalar_op(scalar, a):
        out = []
        for x in a:
            out.append(op(x, scalar))
        return out
    return scalar_op

# Element-wise row combinators built from the vector-op factory; used as
# the fold operations for the avg/min/max commands.
row_add = make_vector_op(operator.add)
row_min = make_vector_op(min)
row_max = make_vector_op(max)

def row_reduce(row_op, fixup=lambda key, rows, res: res):
    """Build a generator factory that folds each group of rows with *row_op*.

    The returned _reduce(order, by_key) yields, for each distinct key in
    *order* (at its first occurrence), fixup(key, rows, folded_result)
    where rows is by_key[key].  Entries are deleted from *by_key* as they
    are consumed, which is what makes duplicate keys in *order* yield
    only once.
    """
    def _reduce(order, by_key):
        for key in order:
            if key in by_key:
                rows  = by_key[key]
                # Explicit left fold instead of the reduce() builtin,
                # which is Python-2-only as a builtin; behavior is
                # identical (groups are never empty here).
                res = rows[0]
                for row in rows[1:]:
                    res = row_op(res, row)
                del by_key[key]
                yield fixup(key, rows, res)
    return _reduce

        
# Scalar/row combinators: multiply or divide every element of a row by a
# scalar.  NOTE(review): operator.div exists only on Python 2 (Python 3
# split it into truediv/floordiv), consistent with this file's Python 2
# syntax elsewhere.
row_mul = make_scalar_op(operator.mul)
row_div = make_scalar_op(operator.div)

def select_by_key(rows, col, cast=None):
    """Group *rows* by the value found in column *col*.

    Returns (order, by_key): *order* lists the key of every input row in
    encounter order (duplicates included); *by_key* maps each key to the
    list of its rows.  When *cast* is given, every stored element is
    passed through it — but the key itself stays uncast (it is read from
    the raw row).
    """
    by_key = defdict(list)
    order  = []
    for row in rows:
        key = row[col]
        stored = [cast(x) for x in row] if cast else row
        by_key[key].append(stored)
        order.append(key)
    return (order, by_key)

class CsvApp(defapp.App):
    def __init__(self):
        defapp.App.__init__(self, opts, defaults)

    def ordered_transform(self, make_iterator):
        """Average all rows with the same key in a given column."""
        files = list(self.args)
        del files[0]
        try:
            for fn in files:
                # read in content
                (order, by_key) = select_by_key(csv.reader(open(fn, 'r')),
                                                self.options.col, float)
                # write out
                csv.writer(self.outfile()).writerows(make_iterator(order, by_key))
        except IOError, ex:
            print "Error:", ex
        
    def do_avg(self, _):
        def fixup_avg(key, rows, res):
            res = row_div(len(rows), res)
            res[self.options.col] = key
            return res
        self.ordered_transform(row_reduce(row_add, fixup_avg))
                
    def do_max(self, _):
        self.ordered_transform(row_reduce(row_max))

    def do_min(self, _):
        self.ordered_transform(row_reduce(row_min))

# Script entry point; defapp.App.launch presumably parses argv and
# dispatches to the matching do_* command — confirm against defapp.
if __name__ == '__main__':
    CsvApp().launch()