#!/usr/bin/env python

"""
Do stuff with csv files.

Small command-line tool built on top of ``defapp.App``.  The ``do_*`` methods
of ``CsvApp`` implement the individual operations (avg, max, min, transpose,
reorder, select, lstsqrs); every operation is applied to each input file in
turn by ``CsvApp.transform``.

NOTE: this module targets Python 2 (>= 2.6): it relies on ``izip``,
``xrange``, the builtin ``reduce`` and ``operator.div``.
"""

import optparse
import defapp

import csv
import operator
import os.path
from collections import defaultdict as defdict
from itertools import izip

o = optparse.make_option

opts = [
    o('-c', '--column', action='append', dest='col', type='int',
      help='The column(s) on which to operate.'),

    o(None, '--write-to-file', action='store_true', dest='write_to_file',
      help='Write the output of operation xyz on file abc.csv to xyz_abc.csv.'),

    o('-u', '--upper-bound', action='store_true', dest='upper_bound',
      help="The linear regression is modified to represent an upper bound."),

#    o(None, '--degree', action='store', type='float', dest='thruthiness',
#      help='Not quite absolut truth.'),
    ]

defaults = {
    'col' : [],
    'write_to_file' : False,
    'upper_bound' : False,
    }


def pair_iter(it):
    """
    Yield consecutive, non-overlapping pairs (it[0], it[1]), (it[2], it[3]),
    ... from the iterable `it`.

    If `it` has an odd number of items, the trailing unpaired item is
    silently dropped.
    """
    it = iter(it)
    for first in it:
        try:
            second = next(it)
        except StopIteration:
            # Odd number of items: stop explicitly instead of letting
            # StopIteration leak out of the generator body.
            return
        yield (first, second)


def make_vector_op(op):
    """
    Lift the binary function `op` to an element-wise operation on two rows.

    The returned function `vector_op(a, b, defvalue=0)` produces a new list
    as long as the longer of the two rows; positions missing in the shorter
    row are combined with `defvalue`.

    NOTE: `op` is always called as op(<element of the longer row>, <element
    of the shorter row or defvalue>), i.e. not necessarily in (a, b) order.
    If both rows have the same length, `b` is treated as the longer one.
    This is irrelevant for the commutative operations used in this file.
    """
    def vector_op(a, b, defvalue=0):
        if len(a) > len(b):
            longer, shorter = a, b
        else:
            longer, shorter = b, a
        overlap = len(shorter)
        # izip stops at the end of the shorter row.
        result = [op(p, q) for (p, q) in izip(longer, shorter)]
        # Pad the part that only exists in the longer row.
        result.extend([op(p, defvalue) for p in longer[overlap:]])
        return result
    return vector_op


def make_scalar_op(op):
    """
    Lift the binary function `op` to `scalar_op(scalar, a)`, which returns
    the list [op(x, scalar) for each x in the row `a`].
    """
    def scalar_op(scalar, a):
        return [op(x, scalar) for x in a]
    return scalar_op


row_add = make_vector_op(operator.add)
row_min = make_vector_op(min)
row_max = make_vector_op(max)


def row_reduce(row_op, fixup=lambda key, rows, res: res):
    """
    Build an iterator factory that folds all rows sharing a key into one row.

    The returned function takes `(order, by_key)` as produced by
    `select_by_key` and yields, for each key in order of first appearance,
    `fixup(key, rows, reduce(row_op, rows))`.

    Side effect: processed keys are deleted from `by_key`; this is what
    makes repeated keys in `order` yield only once.
    """
    def _reduce(order, by_key):
        for key in order:
            if key not in by_key:
                # Key was already handled (order lists it once per row).
                continue
            rows = by_key[key]
            res = reduce(row_op, rows)
            del by_key[key]
            yield fixup(key, rows, res)
    return _reduce


row_mul = make_scalar_op(operator.mul)
row_div = make_scalar_op(operator.div)


def transpose(rows):
    """
    Yield the columns of `rows` as lists.

    Rows may have different lengths; cells missing from short rows are
    filled with 0.  Nothing is yielded for empty input.  All of `rows` is
    read into memory first.
    """
    rows = list(rows)
    if not rows:
        return
    width = max([len(row) for row in rows])
    for col in xrange(width):
        yield [row[col] if col < len(row) else 0 for row in rows]


def reorder_columns(rows, xchg_pairs):
    """
    Yield each row after swapping the cells at every (x, y) index pair in
    `xchg_pairs`, applied in the given order.

    The rows are modified in place and the same row objects are yielded.
    """
    for row in rows:
        for (x, y) in xchg_pairs:
            row[x], row[y] = row[y], row[x]
        yield row


def select_columns(rows, cols):
    """
    Yield, for each row, a new list containing only the cells at the
    indices in `cols` (in that order; indices may repeat).

    Raises IndexError (on iteration) if a row is too short.
    """
    for row in rows:
        yield [row[idx] for idx in cols]


def numpy_lstsq(x, y):
    """
    Fit the line y = c1 * x + c0 by least squares and return the solution
    vector [c1, c0] as computed by numpy.linalg.lstsq.

    `x` and `y` are equally long sequences of numbers or numeric strings
    (as read from a CSV file).
    """
    # numpy is imported lazily so the other operations work without it.
    from numpy import ones, array
    from numpy.linalg import lstsq
    # Coefficient matrix [x, 1]: column 0 holds x, column 1 stays all ones.
    A = ones((len(y), 2), dtype=float)
    # Convert explicitly: CSV cells arrive as strings, and lstsq does not
    # accept string-typed arrays in current numpy versions.
    A[:, 0] = array(x, dtype=float)
    b = array(y, dtype=float)
    return lstsq(A, b)[0]


def max_delta(c0, c1, x, y):
    """
    Return the largest absolute deviation |c0 + c1 * x_i - y_i| of the
    points (x_i, y_i) from the line with offset `c0` and slope `c1`.

    Values in `x` and `y` are converted with float().  Raises ValueError
    for empty input (max of an empty list).
    """
    return max([abs(c0 + c1 * float(xv) - float(yv))
                for (xv, yv) in izip(x, y)])


def least_squares(rows, xy_pairs, upper_bound):
    """
    For each (x, y) column-index pair in `xy_pairs`, fit a line to the data
    in those two columns of `rows` and yield [x + 1, y + 1, c0, c1], where
    the column numbers are reported one-based, c0 is the offset and c1 the
    slope.

    If `upper_bound` is true, c0 is increased by the maximum deviation of
    the data from the fitted line (see `max_delta`).
    """
    cols = []
    for (x, y) in xy_pairs:
        cols.extend([x, y])
    # After selecting and transposing, the data columns come out in the
    # order x0, y0, x1, y1, ..., so pairing them up again matches xy_pairs.
    columns = transpose(select_columns(rows, cols))
    for ((x, y), (xval, yval)) in izip(xy_pairs, pair_iter(columns)):
        c1, c0 = numpy_lstsq(xval, yval)
        if upper_bound:
            c0 += max_delta(c0, c1, xval, yval)
        yield [x + 1, y + 1, c0, c1]


def select_by_key(rows, col, cast=None):
    """
    Group `rows` by the (uncast) value found in column `col`.

    Returns `(order, by_key)`: `by_key` maps each key to the list of its
    rows, with every cell converted by `cast` if one is given; `order`
    lists the key of every row in input order, so keys shared by several
    rows appear more than once.
    """
    by_key = defdict(list)
    order = []
    for row in rows:
        key = row[col]
        if cast:
            by_key[key].append([cast(cell) for cell in row])
        else:
            by_key[key].append(row)
        order.append(key)
    return (order, by_key)


class CsvApp(defapp.App):
    """
    Command-line application offering simple operations on CSV files.

    NOTE(review): the do_* methods are presumably dispatched by defapp.App
    based on the first positional argument -- confirm against defapp.
    """

    def __init__(self):
        defapp.App.__init__(self, opts, defaults)
        # fixup human-friendly offsets: columns are given one-based on the
        # command line; default to the first column.
        if not self.options.col:
            self.options.col = [1]
        self.options.col = [x - 1 for x in self.options.col]
        # Consecutive columns form (x, y) pairs; an unpaired trailing
        # column is ignored.
        self.options.col_pairs = list(pair_iter(self.options.col))

    def transform(self, make_iterator, ordered=True):
        """
        Read a file, pass the rows in the file to an iterator factory,
        and write out the output of the iterator. The iterator performs
        the desired transformation.

        All arguments after the first one are treated as input files.  If
        `ordered` is true, the rows are first grouped by the first selected
        column (all cells converted to float) and `make_iterator` is called
        as make_iterator(order, by_key); otherwise it is called as
        make_iterator(rows).

        With --write-to-file, the result for <dir>/<file> is written to
        <dir>/<first argument>_<file>; otherwise it goes to self.outfile().

        IOError and IndexError are reported via self.err() and processing
        continues with the next file.
        """
        files = list(self.args)
        del files[0]
        for fn in files:
            try:
                # The input must stay open until the output is written,
                # because the row iterators may be lazy.
                with open(fn, 'r') as infile:
                    # read in content
                    rows = csv.reader(infile)
                    # set up transformation
                    if ordered:
                        (order, by_key) = select_by_key(
                            rows, self.options.col[0], float)
                        rows = make_iterator(order, by_key)
                    else:
                        rows = make_iterator(rows)
                    # write out
                    # NOTE(review): self.outfile() is still called in
                    # --write-to-file mode, as before, in case it has
                    # side effects -- confirm against defapp.
                    outfile = self.outfile()
                    if self.options.write_to_file:
                        (dirname, basename) = os.path.split(fn)
                        # Keep fn untouched so that error messages below
                        # always name the input file.
                        out_fn = os.path.join(
                            dirname, self.args[0] + '_' + basename)
                        with open(out_fn, 'w') as outfile:
                            csv.writer(outfile).writerows(rows)
                    else:
                        csv.writer(outfile).writerows(rows)
            except IOError as ex:
                self.err("%s:%s" % (fn, str(ex)))
            except IndexError:
                self.err("%s: Sorry, index out of range." % fn)

    def do_avg(self, _):
        """Average all rows that share the same value in the key column."""
        def fixup_avg(key, rows, res):
            # res holds the column-wise sums; divide by the group size.
            res = row_div(len(rows), res)
            # The key column was summed too: restore the original key.
            res[self.options.col[0]] = key
            return res
        self.transform(row_reduce(row_add, fixup_avg))

    def do_max(self, _):
        """Column-wise maximum of all rows sharing the same key."""
        self.transform(row_reduce(row_max))

    def do_min(self, _):
        """Column-wise minimum of all rows sharing the same key."""
        self.transform(row_reduce(row_min))

    def do_transpose(self, _):
        """Swap rows and columns."""
        self.transform(transpose, ordered=False)

    def do_reorder(self, _):
        """Exchange the selected columns pairwise."""
        self.transform(lambda rows: reorder_columns(
                rows, self.options.col_pairs),
                       ordered=False)

    def do_select(self, _):
        """Output only the selected columns."""
        self.transform(lambda rows: select_columns(
                rows, self.options.col),
                       ordered=False)

    def do_lstsqrs(self, _):
        """Fit a line to each selected (x, y) column pair."""
        self.transform(
            lambda rows: least_squares(rows, self.options.col_pairs,
                                       self.options.upper_bound),
            ordered=False)


if __name__ == '__main__':
    CsvApp().launch()