import scipy.stats as s
import numpy as np

from util import select


def iqr(vect):
    """Return the inter-quartile range of a vector.

    Returns a 3-tuple ``(spread, q25, q75)`` where ``q25`` and ``q75`` are
    the 25th and 75th percentile scores of ``vect`` and ``spread`` is
    ``q75 - q25``.
    """
    q25 = s.scoreatpercentile(vect, 25)
    q75 = s.scoreatpercentile(vect, 75)
    return (q75 - q25, q25, q75)


def cutoff_max(vect, percentile=99):
    """Return the score of ``vect`` at ``percentile`` (99th by default)."""
    return s.scoreatpercentile(vect, percentile)


def iqr_cutoff(vect, extend=1.5):
    """Return the ``(min_val, max_val)`` IQR fences for ``vect``.

    The fences are ``q25 - extend * spread`` and ``q75 + extend * spread``,
    where ``spread`` is the inter-quartile range.  ``extend`` defaults to
    1.5, matching the default used by the other helpers in this module.
    """
    spread, low, high = iqr(vect)
    min_val = low - extend * spread
    max_val = high + extend * spread
    return min_val, max_val


def iqr_is_not_outlier(table, col=1, extend=1.5):
    """Create a row predicate that accepts non-outliers.

    The fences are computed once, from column ``col`` of ``table`` (indexed
    as ``table[:, col]``, so ``table`` must support 2-D slicing).  The
    returned function takes a row and returns True when ``row[col]`` lies
    within the inclusive ``[min_val, max_val]`` window, False otherwise.
    """
    # Reuse the shared fence computation instead of duplicating it here.
    min_val, max_val = iqr_cutoff(table[:, col], extend)

    def within_fences(row):
        return min_val <= row[col] <= max_val

    return within_fences


def iqr_remove_outliers(table, col=1, extend=1.5):
    """Return only the rows that fall within the IQR-based window.

    Builds the predicate with ``iqr_is_not_outlier`` and hands it, together
    with ``table``, to ``util.select``.
    """
    valid = iqr_is_not_outlier(table, col, extend)
    # NOTE(review): presumably `select` behaves like a filter and returns a
    # copy of the accepted rows -- confirm against util.select.
    return select(valid, table)