diff options
| author | Bjoern Brandenburg <bbb@bbb1-cs.cs.unc.edu> | 2008-09-04 20:48:46 -0400 |
|---|---|---|
| committer | Bjoern Brandenburg <bbb@bbb1-cs.cs.unc.edu> | 2008-09-04 20:48:46 -0400 |
| commit | 8a32b55ce1c25580da379555b2c4a5f149cfd43b (patch) | |
| tree | 82af205c47bbba9a6bd3e0bc1404101c09f1b8ce | |
| parent | 924e8a861c6fc0ce73efa6caf1525404fedb4494 (diff) | |
started work on a csv transformation tool
| -rwxr-xr-x | csv_tool | 118 | ||||
| -rw-r--r-- | defapp.py | 95 |
2 files changed, 213 insertions, 0 deletions
diff --git a/csv_tool b/csv_tool new file mode 100755 index 0000000..455037c --- /dev/null +++ b/csv_tool | |||
| @@ -0,0 +1,118 @@ | |||
| 1 | #!/usr/bin/env python | ||
| 2 | |||
| 3 | """ | ||
| 4 | Aggregate the rows of CSV files (average, minimum, maximum) grouped by a key column. | ||
| 5 | """ | ||
| 6 | |||
| 7 | import optparse | ||
| 8 | import defapp | ||
| 9 | |||
| 10 | import csv | ||
| 11 | import operator | ||
| 12 | from collections import defaultdict as defdict | ||
| 13 | |||
# Shorthand for declaring command line options.
o = optparse.make_option

# Command line options understood by the tool.  The commented-out entries
# are templates for other kinds of options.
opts = [

    # o('-t', '--two', action='store', dest='double_val', nargs=2, type='int',
    #   help='A two-parameter option.'),

    o('-c', '--column', type='int', dest='col', action='store',
      help='The column on which to operate.'),

    # o(None, '--true', action='store_true', dest='truth',
    #   help='A boolean flag value.'),

    # o(None, '--degree', action='store', type='float', dest='truthiness',
    #   help='Not quite absolute truth.'),
]

# Default values, keyed by option destination.
defaults = dict(col=0)
| 34 | |||
def make_vector_op(op):
    """Lift the binary function `op` to an element-wise operation on rows.

    The returned function vector_op(a, b, defvalue=0) combines two sequences
    position by position and returns a new list as long as the longer one.
    Positions missing from the shorter sequence are filled with `defvalue`.

    `op` is always called as op(element_of_a, element_of_b), so the result
    is also correct for non-commutative operations such as subtraction.
    """
    def vector_op(a, b, defvalue=0):
        common = min(len(a), len(b))
        result = [op(x, y) for x, y in zip(a, b)]
        # At most one of the two tails is non-empty; the missing side is
        # substituted by defvalue while keeping the operand order intact.
        result.extend(op(x, defvalue) for x in a[common:])
        result.extend(op(defvalue, y) for y in b[common:])
        return result
    return vector_op
| 50 | |||
def make_scalar_op(op):
    """Lift the binary function `op` to a row-by-scalar operation.

    The returned function scalar_op(scalar, a) builds a new list holding
    op(element, scalar) for every element of `a`.
    """
    def scalar_op(scalar, a):
        result = []
        for element in a:
            result.append(op(element, scalar))
        return result
    return scalar_op
| 55 | |||
# Element-wise sum, minimum and maximum of two rows.
row_add, row_min, row_max = map(make_vector_op, (operator.add, min, max))
| 59 | |||
def row_reduce(row_op, fixup=lambda key, rows, res: res):
    """Build a generator function that folds each group of rows into one row.

    row_op -- binary function combining two rows into one
    fixup  -- called as fixup(key, rows, res) on every folded group; its
              return value is what gets yielded (by default, `res` itself)

    The returned function takes (order, by_key), walks the keys in `order`
    and yields one result per key still present in `by_key`.  A key is
    removed from `by_key` once it has been folded, so keys repeated in
    `order` are handled only once and `by_key` is consumed along the way.
    """
    def _reduce(order, by_key):
        for key in order:
            if key not in by_key:
                # Already folded on an earlier occurrence of this key.
                continue
            combined = reduce(row_op, by_key[key])
            group = by_key.pop(key)
            yield fixup(key, group, combined)
    return _reduce
| 69 | |||
| 70 | |||
# Multiply / divide every element of a row by a scalar; both are called as
# row_op(scalar, row).
row_mul = make_scalar_op(operator.mul)
# True division: the result is the same for float rows, but integer rows are
# no longer floor-divided (which truncated averages), and unlike operator.div
# this function also exists on Python 3.
row_div = make_scalar_op(operator.truediv)
| 73 | |||
def select_by_key(rows, col, cast=None):
    """Group rows by the value found in column `col`.

    rows -- iterable of rows (sequences of fields), e.g. a csv.reader
    col  -- index of the key column
    cast -- optional callable applied to every field of a stored row; the
            key itself is always the unconverted field

    Returns a tuple (order, by_key).  `order` lists the key of every row in
    input order, so a key appears once per row carrying it.  `by_key` is a
    defaultdict mapping each key to the list of its rows.  Empty rows, which
    csv.reader produces for blank lines, are skipped instead of raising
    IndexError.
    """
    by_key = defdict(list)
    order = []
    for r in rows:
        if not r:
            # Blank input line: there is no key column to look at.
            continue
        key = r[col]
        if cast:
            r = [cast(x) for x in r]
        by_key[key].append(r)
        order.append(key)
    return (order, by_key)
| 85 | |||
class CsvApp(defapp.App):
    """Command line tool that aggregates the rows of CSV files by key.

    The first positional argument selects the command (avg, max or min);
    the remaining positional arguments are the CSV files to process.  Rows
    are grouped by the column given with -c/--column.
    """

    def __init__(self):
        defapp.App.__init__(self, opts, defaults)

    def ordered_transform(self, make_iterator):
        """Group the rows of every input file by key and write the result.

        make_iterator is called as make_iterator(order, by_key) with the
        values returned by select_by_key() and must return an iterable of
        output rows, which are written as CSV to self.outfile().  All
        fields are converted to float while reading.

        I/O errors are reported on stderr and end the processing of any
        remaining files.
        """
        # Local import: sys is only needed to fetch the active exception in
        # a way that works on every Python version.
        import sys

        # The first argument is the command name, not a file; slicing also
        # copes with an empty argument list.
        files = list(self.args)[1:]
        try:
            for fn in files:
                # read in content
                infile = open(fn, 'r')
                try:
                    (order, by_key) = select_by_key(csv.reader(infile),
                                                    self.options.col, float)
                finally:
                    # select_by_key() has consumed the reader completely.
                    infile.close()
                # write out
                writer = csv.writer(self.outfile())
                writer.writerows(make_iterator(order, by_key))
        except IOError:
            # Report on stderr: stdout may be carrying the CSV output.
            self.err("Error:", sys.exc_info()[1])

    def do_avg(self, _):
        """Average all rows with the same key in a given column."""
        def fixup_avg(key, rows, res):
            # res is the element-wise sum of the group; turn it into the
            # mean and restore the key, which was summed up as well.
            res = row_div(len(rows), res)
            res[self.options.col] = key
            return res
        self.ordered_transform(row_reduce(row_add, fixup_avg))

    def do_max(self, _):
        """Compute the element-wise maximum of all rows with the same key."""
        self.ordered_transform(row_reduce(row_max))

    def do_min(self, _):
        """Compute the element-wise minimum of all rows with the same key."""
        self.ordered_transform(row_reduce(row_min))


if __name__ == '__main__':
    CsvApp().launch()
diff --git a/defapp.py b/defapp.py new file mode 100644 index 0000000..e459084 --- /dev/null +++ b/defapp.py | |||
| @@ -0,0 +1,95 @@ | |||
| 1 | #!/usr/bin/env python | ||
| 2 | |||
| 3 | """ | ||
| 4 | A basic Python application shell, for copy&paste development. | ||
| 5 | """ | ||
| 6 | |||
| 7 | import optparse | ||
| 8 | import cmd | ||
| 9 | import sys | ||
| 10 | |||
# Shorthand for declaring command line options.
o = optparse.make_option


class App(cmd.Cmd):
    """Minimal command line application shell built on cmd.Cmd.

    The positional command line arguments are joined into one command line
    and dispatched to the matching do_<command> method.  Unless no_std_opts
    is set, a -o/--output FILE option is added; when given, outfile() and
    out() use that file instead of standard output.
    """

    def __init__(self, opts=None, defaults=None, no_std_opts=False,
                 stdout=sys.stdout, stderr=sys.stderr, default_cmd=None):
        """Parse the command line (sys.argv) and set up the shell.

        opts        -- list of optparse options understood by the application
        defaults    -- dict mapping option destinations to default values
        no_std_opts -- if true, do not add the standard -o/--output option
        stdout      -- output stream handed to cmd.Cmd
        stderr      -- accepted for backward compatibility only: cmd.Cmd has
                       no error stream and err() writes to sys.stderr
        default_cmd -- command line executed when no command is given
        """
        # cmd.Cmd's positional parameters are (completekey, stdin, stdout);
        # pass stdout by keyword so it is not taken for the input stream.
        cmd.Cmd.__init__(self, completekey=None, stdout=stdout)
        self.default_cmd = default_cmd
        # Work on copies: the caller's list and dict must not be modified,
        # otherwise a second instance would register -o/--output twice.
        opts = list(opts or [])
        defaults = dict(defaults or {})
        defaults["_App_file"] = None
        self.f = None
        if not no_std_opts:
            opts.append(o('-o', '--output', action='store', dest='_App_file',
                          help='store output in FILE', metavar='FILE'))
        (self.options, self.args) = self.__parse(opts, defaults)

    def __parse(self, opts, defaults):
        """Parse sys.argv and return the (options, args) pair."""
        parser = optparse.OptionParser(option_list=opts)
        parser.set_defaults(**defaults)
        return parser.parse_args()

    def launch(self, args=None):
        """Run the command given by the positional arguments.

        args, if given and non-empty, replaces the arguments taken from the
        command line.  The output file selected with -o/--output is opened
        before the command runs and is closed afterwards, even if the
        command raises.
        """
        if args:
            self.args = args
        try:
            if self.options._App_file:
                self.f = open(self.options._App_file, 'w')
            self.onecmd(' '.join(self.args))
        except IOError:
            # sys.exc_info() avoids version-specific 'except' syntax.
            self.err("I/O Error:", sys.exc_info()[1])
        except KeyboardInterrupt:
            self.err("Interrupted.")
        finally:
            if self.f:
                self.f.close()

    def outfile(self):
        """Return the stream output should go to (output file or stdout)."""
        if self.f:
            return self.f
        else:
            return sys.stdout

    def emptyline(self):
        """Run the default command, if any, when no command was given."""
        if self.default_cmd:
            self.onecmd(self.default_cmd)

    def default(self, line):
        """Report a command that has no do_<command> handler."""
        self.err("%s: Command not recognized." % line)

    def do_dump_config(self, key):
        """Display the configuration as parsed on the console."""
        settings = self.options.__dict__

        def is_private(k):
            return k.startswith('_')

        def show(k):
            self.msg("%20s : %10s" % (k, str(settings[k])))

        if not key:
            for x in sorted(settings.keys()):
                if not is_private(x):
                    show(x)
        elif not is_private(key) and key in settings:
            show(key)
        else:
            self.err("%s: unknown option." % key)

    @staticmethod
    def __write(stream, *args, **kargs):
        """Write the space-separated args to stream and flush it.

        A newline is appended unless the keyword omit_newline is true.
        """
        stream.write(" ".join([str(a) for a in args]))
        if not kargs.get('omit_newline'):
            stream.write("\n")
        stream.flush()

    def err(self, *args, **kargs):
        """Write a message to standard error."""
        self.__write(sys.stderr, *args, **kargs)

    def msg(self, *args, **kargs):
        """Write a message to standard output."""
        self.__write(sys.stdout, *args, **kargs)

    def out(self, *args, **kargs):
        """Write to the output file if one is open, else to standard output."""
        if self.f:
            self.__write(self.f, *args, **kargs)
        else:
            self.__write(sys.stdout, *args, **kargs)


if __name__ == "__main__":
    a = App()
    a.launch()
