diff options
Diffstat (limited to 'csv_tool')
| -rwxr-xr-x | csv_tool | 214 |
1 files changed, 0 insertions, 214 deletions
diff --git a/csv_tool b/csv_tool deleted file mode 100755 index 17ad949..0000000 --- a/csv_tool +++ /dev/null | |||
| @@ -1,214 +0,0 @@ | |||
| 1 | #!/usr/bin/env python | ||
| 2 | |||
| 3 | """ | ||
| 4 | Transform CSV files: avg, max, min, transpose, reorder, select and lstsqrs (least-squares fit). | ||
| 5 | """ | ||
| 6 | |||
| 7 | import optparse | ||
| 8 | import defapp | ||
| 9 | |||
| 10 | import csv | ||
| 11 | import operator | ||
| 12 | import os.path | ||
| 13 | from collections import defaultdict as defdict | ||
| 14 | from itertools import izip | ||
| 15 | |||
# Short alias used to declare the command-line options below.
o = optparse.make_option

# Command-line options understood by the tool.
opts = [
    # Repeatable; every occurrence appends one (1-based) column number.
    o('-c', '--column',
      dest='col', action='append', type='int',
      help='The column(s) on which to operate.'),

    # Long-only flag: send results to a per-input output file.
    o(None, '--write-to-file',
      dest='write_to_file', action='store_true',
      help='Write the output of operation xyz on file abc.csv to xyz_abc.csv.'),

    # Only consulted by the least-squares operation.
    o('-u', '--upper-bound',
      dest='upper_bound', action='store_true',
      help="The linear regression is modified to represent an upper bound."),

    # Disabled option, kept for reference:
    # o(None, '--degree', action='store', type='float', dest='thruthiness',
    #   help='Not quite absolute truth.'),
]

# Values used for options that are not given on the command line.
defaults = dict(
    col=[],
    write_to_file=False,
    upper_bound=False,
)
| 38 | |||
def pair_iter(it):
    """Yield consecutive, non-overlapping pairs from an iterable.

    For the input ``a, b, c, d`` this yields ``(a, b)`` and ``(c, d)``.
    A trailing element that has no partner is silently dropped.
    """
    it = iter(it)
    while True:
        try:
            first = next(it)
            second = next(it)
        except StopIteration:
            # End the generator explicitly: a StopIteration escaping from a
            # generator body is turned into a RuntimeError by PEP 479.
            return
        yield (first, second)
| 43 | |||
def make_vector_op(op):
    """Lift a binary operator to an element-wise operation on two rows.

    The returned function ``vector_op(a, b, defvalue=0)`` computes
    ``op(a[i], b[i])`` for every index both rows share. If the rows differ
    in length, the missing elements of the shorter row are replaced by
    ``defvalue``. The result is a new list as long as the longer row.

    Operands are always passed to ``op`` in (a, b) order, so
    non-commutative operators behave the same regardless of which row
    happens to be longer.
    """
    def vector_op(a, b, defvalue=0):
        # Positions present in both rows.
        result = [op(x, y) for (x, y) in zip(a, b)]
        # At most one of the two tails below is non-empty.
        result += [op(x, defvalue) for x in a[len(b):]]
        result += [op(defvalue, y) for y in b[len(a):]]
        return result
    return vector_op
| 59 | |||
def make_scalar_op(op):
    """Lift a binary operator to a row-by-scalar operation.

    The returned function ``scalar_op(scalar, a)`` builds a new list that
    holds ``op(element, scalar)`` for each element of ``a``, in order.
    """
    def scalar_op(scalar, a):
        out = []
        for element in a:
            out.append(op(element, scalar))
        return out
    return scalar_op
| 64 | |||
# Element-wise row combinators built with make_vector_op. When the two rows
# differ in length, the missing elements are filled with the default value 0.
row_add = make_vector_op(operator.add)
row_min = make_vector_op(min)
row_max = make_vector_op(max)
| 68 | |||
def row_reduce(row_op, fixup=lambda key, rows, res: res):
    """Build a generator function that folds each group of rows into one row.

    ``row_op`` is a binary function combining two rows into one.
    ``fixup(key, rows, res)`` may post-process the folded row ``res`` of a
    group; by default the folded row is returned unchanged.

    The returned function ``_reduce(order, by_key)`` walks the keys in
    ``order`` and yields one result per distinct key, in order of first
    appearance. ``by_key`` maps each key to its list of rows. Processed
    keys are removed from ``by_key``, which is how repeated keys in
    ``order`` are skipped, so the caller's mapping is consumed.
    """
    # reduce() is only a builtin on Python 2; functools provides it on both.
    from functools import reduce

    def _reduce(order, by_key):
        for key in order:
            if key in by_key:
                rows = by_key[key]
                res = reduce(row_op, rows)
                # Drop the group so later occurrences of the key are ignored.
                del by_key[key]
                yield fixup(key, rows, res)
    return _reduce
| 78 | |||
| 79 | |||
# Row-by-scalar helpers: apply `element <op> scalar` to every element of a row.
row_mul = make_scalar_op(operator.mul)
# operator.div only exists on Python 2. Fall back to true division where it
# is missing; for float rows both give the same result.
row_div = make_scalar_op(getattr(operator, 'div', operator.truediv))
| 82 | |||
def transpose(rows):
    """Yield the columns of ``rows`` as lists (matrix transpose).

    ``rows`` may be any iterable of indexable rows; it is read completely
    before the first column is produced. Rows shorter than the longest row
    are padded with 0. Nothing is yielded for empty input.
    """
    rows = list(rows)
    if not rows:
        return
    width = max(len(row) for row in rows)
    for col in range(width):
        # Pad rows that do not reach this column.
        yield [row[col] if col < len(row) else 0 for row in rows]
| 95 | |||
def reorder_columns(rows, xchg_pairs):
    """Swap columns within each row and yield the row.

    ``xchg_pairs`` is a sequence of ``(i, j)`` index pairs. The swaps are
    applied in order, in place, to every row, so the rows passed in are
    modified.
    """
    for row in rows:
        for pair in xchg_pairs:
            i, j = pair
            held = row[i]
            row[i] = row[j]
            row[j] = held
        yield row
| 101 | |||
def select_columns(rows, cols):
    """Yield, for each row, a new list holding only the columns in ``cols``.

    Columns appear in the order given by ``cols``; an index may be repeated.
    """
    for row in rows:
        picked = []
        for idx in cols:
            picked.append(row[idx])
        yield picked
| 105 | |||
def numpy_lstsq(x, y):
    """Fit the line ``y = slope * x + intercept`` by least squares.

    ``x`` and ``y`` are equally long sequences of numbers or of numeric
    strings (as produced by the csv reader). Returns the numpy solution
    array ``[slope, intercept]``.
    """
    # numpy is imported lazily so that only this operation depends on it.
    from numpy import ones, array
    from numpy.linalg import lstsq
    # Design matrix: first column holds x, second column stays all ones.
    A = ones((len(y), 2), dtype=float)
    # Convert explicitly; CSV cells arrive as strings and lstsq needs numbers.
    A[:, 0] = array(x, dtype=float)
    b = array(y, dtype=float)
    # rcond=-1 keeps the historical singular-value cutoff and avoids the
    # FutureWarning some numpy versions emit when rcond is left unset.
    return lstsq(A, b, rcond=-1)[0]
| 113 | |||
def max_delta(c0, c1, x, y):
    """Return the largest amount by which a point lies above a line.

    The line is ``c0 + c1 * x``. ``x`` and ``y`` are parallel sequences of
    numbers or numeric strings; each value is converted with ``float``.
    The result is negative when every point lies below the line. Empty
    input raises ``ValueError`` (from ``max``).
    """
    # Distinct loop names keep the parameters x and y from being shadowed.
    return max([float(yi) - (c0 + c1 * float(xi)) for (xi, yi) in zip(x, y)])
| 116 | |||
def least_squares(rows, xy_pairs, upper_bound):
    """Yield one linear fit per (x, y) column pair.

    ``rows`` is an iterable of CSV rows and ``xy_pairs`` a list of 0-based
    ``(x_column, y_column)`` index pairs. For each pair a row
    ``[x + 1, y + 1, c0, c1]`` is yielded: the 1-based column numbers
    followed by the intercept ``c0`` and slope ``c1`` of the fitted line.

    If ``upper_bound`` is true, the intercept is raised by the largest
    residual so that no data point lies above the line.
    """
    cols = []
    for (x, y) in xy_pairs:
        cols += [x, y]
    # After selection and transposition the columns come out in the order
    # x0, y0, x1, y1, ... and are consumed two at a time below.
    selected = select_columns(rows, cols)
    columns = transpose(selected)
    # xy_pairs is the first zip argument, so iteration ends with it.
    for ((x, y), (xval, yval)) in zip(xy_pairs, pair_iter(columns)):
        c1, c0 = numpy_lstsq(xval, yval)
        if upper_bound:
            c0 += max_delta(c0, c1, xval, yval)
        yield [x + 1, y + 1, c0, c1]
| 128 | |||
def select_by_key(rows, col, cast=None):
    """Group rows by the value found in column ``col``.

    Returns ``(order, by_key)``. ``order`` lists the key of every row in
    input order, so keys repeat. ``by_key`` is a defaultdict mapping each
    key to the list of its rows. If ``cast`` is given, every cell of a
    stored row is converted with it; the key itself is left unconverted.
    """
    groups = defdict(list)
    key_sequence = []
    for row in rows:
        k = row[col]
        groups[k].append([cast(cell) for cell in row] if cast else row)
        key_sequence.append(k)
    return (key_sequence, groups)
| 140 | |||
class CsvApp(defapp.App):
    """Command-line front end for the CSV helpers in this module.

    Each ``do_*`` method runs one operation over the files named on the
    command line. NOTE(review): presumably ``defapp.App`` dispatches the
    first positional argument to the matching ``do_<name>`` method and
    provides ``options``, ``args``, ``outfile()`` and ``err()`` -- confirm
    against defapp.
    """

    def __init__(self):
        defapp.App.__init__(self, opts, defaults)
        # Columns are 1-based on the command line: default to the first
        # column and convert everything to 0-based indices.
        if not self.options.col:
            self.options.col = [1]
        self.options.col = [x - 1 for x in self.options.col]
        # Consecutive columns are grouped into (first, second) pairs.
        self.options.col_pairs = list(pair_iter(self.options.col))

    def transform(self, make_iterator, ordered=True):
        """
        Read a file, pass the rows in the file to an iterator factory, and
        write out the output of the iterator. The iterator performs the
        desired transformation.

        With ``ordered`` true, rows are first grouped by the first selected
        column (cells converted to float) and the factory is called as
        ``make_iterator(order, by_key)``; otherwise it is called as
        ``make_iterator(rows)``. I/O errors and out-of-range column indices
        are reported through ``self.err`` and the next file is processed.
        """
        # args[0] is the operation name; the remaining arguments are files.
        # Slicing (unlike `del files[0]`) tolerates an empty argument list.
        files = list(self.args)[1:]
        for fn in files:
            try:
                # The transformation may be lazy, so the input must stay
                # open until all output has been written.
                with open(fn, 'r') as infile:
                    # read in content
                    rows = csv.reader(infile)
                    # set up transformation
                    if ordered:
                        (order, by_key) = select_by_key(
                            rows, self.options.col[0], float)
                        rows = make_iterator(order, by_key)
                    else:
                        rows = make_iterator(rows)
                    # write out
                    outfile = self.outfile()
                    if self.options.write_to_file:
                        (dirname, basename) = os.path.split(fn)
                        # From here on fn names the output file, which is
                        # also what later error messages report.
                        fn = os.path.join(dirname,
                                          self.args[0] + '_' + basename)
                        outfile = open(fn, 'w')
                    try:
                        csv.writer(outfile).writerows(rows)
                    finally:
                        # Only close files opened here, never the default
                        # output stream.
                        if self.options.write_to_file:
                            outfile.close()
            except IOError as ex:
                self.err("%s:%s" % (fn, str(ex)))
            except IndexError:
                self.err("%s: Sorry, index out of range." % fn)

    def do_avg(self, _):
        """Average the rows of each key group, column by column."""
        def fixup_avg(key, rows, res):
            # res holds the column sums; divide by the group size.
            res = row_div(len(rows), res)
            # Put the original key back into the key column.
            res[self.options.col[0]] = key
            return res
        self.transform(row_reduce(row_add, fixup_avg))

    def do_max(self, _):
        """Take the column-wise maximum over the rows of each key group."""
        self.transform(row_reduce(row_max))

    def do_min(self, _):
        """Take the column-wise minimum over the rows of each key group."""
        self.transform(row_reduce(row_min))

    def do_transpose(self, _):
        """Swap rows and columns of each file."""
        self.transform(transpose, ordered=False)

    def do_reorder(self, _):
        """Exchange the columns given as pairs of --column options."""
        self.transform(lambda rows: reorder_columns(
            rows, self.options.col_pairs), ordered=False)

    def do_select(self, _):
        """Keep only the columns given by --column, in that order."""
        self.transform(lambda rows: select_columns(
            rows, self.options.col), ordered=False)

    def do_lstsqrs(self, _):
        """Fit a line to each (x, y) pair of --column options."""
        self.transform(
            lambda rows: least_squares(rows, self.options.col_pairs,
                                       self.options.upper_bound),
            ordered=False)
| 212 | |||
# Run the tool only when this file is executed as a script, not on import.
if __name__ == '__main__':
    CsvApp().launch()
