#!/usr/bin/env python
# Origin: commit 3b21f9d15822453117d1e908ab97cacd8f7f39be by Andrea Bastoni
# (2010-04-12), "Add IQR outliers removal", data_analysis/statanalyzer.py.

import numpy as np
from scipy import stats


class InterQuartileRange:
    """Percentile-based outlier removal for one-dimensional samples.

    The filter sorts the samples and keeps only those lying between two
    percentile scores (``low`` and ``high``), optionally widening that
    window by 1.5 times its width on each side.
    """

    def __init__(self, low, high, extend=False):
        """Store the filter configuration.

        Args:
            low: Lower percentile, passed as ``per`` to
                ``scipy.stats.scoreatpercentile`` (e.g. ``25``).
            high: Upper percentile, passed the same way (e.g. ``75``).
            extend: When true, the window is widened by 1.5 times the
                inter-quartile distance on both sides before filtering.
        """
        self.low = low
        self.high = high
        # extend is 1.5 extension of IQR
        self.extend = extend

    def remOutliers(self, vector):
        """Return the sorted samples of *vector* with outliers removed.

        The kept samples are those in the half-open interval
        ``(lower, upper]``: strictly greater than the lower bound and not
        greater than the upper bound.  Without ``extend`` the bounds are the
        ``low`` and ``high`` percentile scores.  With ``extend`` they are
        ``a - 1.5 * (b - a)`` and ``b + 1.5 * (b - a)``, where ``a`` and ``b``
        are the first samples strictly above the two percentile scores (or
        the largest sample when there is none).

        Args:
            vector: One-dimensional array-like of numeric samples.

        Returns:
            A sorted ``numpy`` array holding the retained samples; it is
            empty when *vector* is empty or nothing falls in the interval.
        """
        svect = np.sort(vector)
        count = len(svect)
        if count == 0:
            # Nothing to filter; also avoids indexing an empty array below.
            return svect

        q1 = stats.scoreatpercentile(svect, self.low)
        q3 = stats.scoreatpercentile(svect, self.high)

        # side='right' yields the index of the first sample strictly greater
        # than the bound, or ``count`` when no such sample exists.  Using
        # ``count`` (rather than a -1 sentinel) keeps the final slice from
        # silently dropping the largest sample.
        first = int(np.searchsorted(svect, q1, side='right'))
        last = int(np.searchsorted(svect, q3, side='right'))

        if self.extend:
            # 1.5 IQR outliers elimination.  The IQR is measured between
            # actual samples next to q1/q3; indices are capped so that a
            # bound above every sample falls back to the largest sample.
            near_q1 = svect[min(first, count - 1)]
            near_q3 = svect[min(last, count - 1)]
            eiqr = (near_q3 - near_q1) * 1.5

            # The fences are deliberately NOT clamped to the data range:
            # a fence lying outside the data simply keeps that whole side.
            first = int(np.searchsorted(svect, near_q1 - eiqr, side='right'))
            last = int(np.searchsorted(svect, near_q3 + eiqr, side='right'))

        return svect[first:last]